LLVM 23.0.0git
VPlanTransforms.cpp
Go to the documentation of this file.
1//===-- VPlanTransforms.cpp - Utility VPlan to VPlan transforms -----------===//
2//
3// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4// See https://llvm.org/LICENSE.txt for license information.
5// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6//
7//===----------------------------------------------------------------------===//
8///
9/// \file
10/// This file implements a set of utility VPlan to VPlan transformations.
11///
12//===----------------------------------------------------------------------===//
13
14#include "VPlanTransforms.h"
15#include "VPRecipeBuilder.h"
16#include "VPlan.h"
17#include "VPlanAnalysis.h"
18#include "VPlanCFG.h"
19#include "VPlanDominatorTree.h"
20#include "VPlanHelpers.h"
21#include "VPlanPatternMatch.h"
22#include "VPlanUtils.h"
23#include "VPlanVerifier.h"
24#include "llvm/ADT/APInt.h"
26#include "llvm/ADT/STLExtras.h"
28#include "llvm/ADT/SetVector.h"
30#include "llvm/ADT/TypeSwitch.h"
38#include "llvm/IR/Intrinsics.h"
39#include "llvm/IR/MDBuilder.h"
40#include "llvm/IR/Metadata.h"
45
46using namespace llvm;
47using namespace VPlanPatternMatch;
48using namespace SCEVPatternMatch;
49
// NOTE(review): Doxygen-extracted listing; the embedded line numbers jump
// (50, 53, 55, 69, 90, 110 are absent), so the function's signature line and
// several statements are missing from this capture — do not treat this text
// as compilable. Visible behavior: walks VPBasicBlocks of the vector loop
// region and replaces each VPInstruction that wraps an underlying IR value
// with a dedicated widened recipe (phi/load/store/GEP/intrinsic-call/cast/
// generic widen). Returns false if a call has no vectorizable intrinsic ID,
// true otherwise.
51 VPlan &Plan, const TargetLibraryInfo &TLI) {
52
54 Plan.getVectorLoopRegion());
56 // Skip blocks outside region
57 if (!VPBB->getParent())
58 break;
59 VPRecipeBase *Term = VPBB->getTerminator();
60 auto EndIter = Term ? Term->getIterator() : VPBB->end();
61 // Introduce each ingredient into VPlan.
// Early-inc range: recipes are erased while iterating (see eraseFromParent
// below), so the iterator must be advanced before the current element dies.
62 for (VPRecipeBase &Ingredient :
63 make_early_inc_range(make_range(VPBB->begin(), EndIter))) {
64
65 VPValue *VPV = Ingredient.getVPSingleValue();
// Only recipes tied to an underlying IR value are converted.
66 if (!VPV->getUnderlyingValue())
67 continue;
68
// NOTE(review): line 69 missing here — presumably the declaration of
// `Inst` (used below); confirm against upstream source.
70
71 VPRecipeBase *NewRecipe = nullptr;
72 if (auto *PhiR = dyn_cast<VPPhi>(&Ingredient)) {
73 auto *Phi = cast<PHINode>(PhiR->getUnderlyingValue());
74 NewRecipe = new VPWidenPHIRecipe(Phi, nullptr, PhiR->getDebugLoc());
75 for (VPValue *Op : PhiR->operands())
76 NewRecipe->addOperand(Op);
77 } else if (auto *VPI = dyn_cast<VPInstruction>(&Ingredient)) {
78 assert(!isa<PHINode>(Inst) && "phis should be handled above");
79 // Create VPWidenMemoryRecipe for loads and stores.
80 if (LoadInst *Load = dyn_cast<LoadInst>(Inst)) {
81 NewRecipe = new VPWidenLoadRecipe(
82 *Load, Ingredient.getOperand(0), nullptr /*Mask*/,
83 false /*Consecutive*/, false /*Reverse*/, *VPI,
84 Ingredient.getDebugLoc());
85 } else if (StoreInst *Store = dyn_cast<StoreInst>(Inst)) {
// For stores, operand 1 is the address and operand 0 the stored value.
86 NewRecipe = new VPWidenStoreRecipe(
87 *Store, Ingredient.getOperand(1), Ingredient.getOperand(0),
88 nullptr /*Mask*/, false /*Consecutive*/, false /*Reverse*/, *VPI,
89 Ingredient.getDebugLoc());
// NOTE(review): line 90 missing — likely the `else if (GEP = dyn_cast<
// GetElementPtrInst>(Inst))` condition guarding the branch below.
91 NewRecipe = new VPWidenGEPRecipe(GEP, Ingredient.operands(), *VPI,
92 Ingredient.getDebugLoc());
93 } else if (CallInst *CI = dyn_cast<CallInst>(Inst)) {
94 Intrinsic::ID VectorID = getVectorIntrinsicIDForCall(CI, &TLI);
// Bail out of the whole conversion if the call cannot be widened.
95 if (VectorID == Intrinsic::not_intrinsic)
96 return false;
// drop_end: the last operand of a call ingredient is the callee, not an
// argument, so it is not passed to the intrinsic recipe.
97 NewRecipe = new VPWidenIntrinsicRecipe(
98 *CI, getVectorIntrinsicIDForCall(CI, &TLI),
99 drop_end(Ingredient.operands()), CI->getType(), VPIRFlags(*CI),
100 *VPI, CI->getDebugLoc());
101 } else if (auto *CI = dyn_cast<CastInst>(Inst)) {
102 NewRecipe = new VPWidenCastRecipe(
103 CI->getOpcode(), Ingredient.getOperand(0), CI->getType(), CI,
104 VPIRFlags(*CI), VPIRMetadata(*CI));
105 } else {
106 NewRecipe = new VPWidenRecipe(*Inst, Ingredient.operands(), *VPI,
107 *VPI, Ingredient.getDebugLoc());
108 }
109 } else {
// NOTE(review): line 110 missing — presumably an assert whose message
// continues on the next line.
111 "inductions must be created earlier");
112 continue;
113 }
114
// Splice the replacement in place of the original ingredient and reroute
// uses before erasing it.
115 NewRecipe->insertBefore(&Ingredient);
116 if (NewRecipe->getNumDefinedValues() == 1)
117 VPV->replaceAllUsesWith(NewRecipe->getVPSingleValue());
118 else
119 assert(NewRecipe->getNumDefinedValues() == 0 &&
120 "Only recpies with zero or one defined values expected");
121 Ingredient.eraseFromParent();
122 }
123 }
124 return true;
125}
126
// NOTE(review): extraction gaps here (inner lines 128, 131, 146, 166 and the
// constructor's first lines 174-175 are missing) — the class keyword/name and
// some members (e.g. the PSE member, a could-not-compute check, the MaxVF
// computation) are not visible in this capture.
127/// Helper for extra no-alias checks via known-safe recipe and SCEV.
129 const SmallPtrSetImpl<VPRecipeBase *> &ExcludeRecipes;
130 VPReplicateRecipe &GroupLeader;
132 const Loop &L;
133 VPTypeAnalysis &TypeInfo;
134
135 // Return true if \p A and \p B are known to not alias for all VFs in the
136 // plan, checked via the distance between the accesses
137 bool isNoAliasViaDistance(VPReplicateRecipe *A, VPReplicateRecipe *B) const {
// Only store/store pairs are handled by the distance check.
138 if (A->getOpcode() != Instruction::Store ||
139 B->getOpcode() != Instruction::Store)
140 return false;
141
// Operand 1 of a replicated store is the address.
142 VPValue *AddrA = A->getOperand(1);
143 const SCEV *SCEVA = vputils::getSCEVExprForVPValue(AddrA, PSE, &L);
144 VPValue *AddrB = B->getOperand(1);
145 const SCEV *SCEVB = vputils::getSCEVExprForVPValue(AddrB, PSE, &L);
// NOTE(review): line 146 missing — presumably the SCEVCouldNotCompute
// guard for SCEVA/SCEVB that this `return false` belongs to.
147 return false;
148
// The distance between the two addresses must be a compile-time constant.
149 const APInt *Distance;
150 ScalarEvolution &SE = *PSE.getSE();
151 if (!match(SE.getMinusSCEV(SCEVA, SCEVB), m_scev_APInt(Distance)))
152 return false;
153
154 const DataLayout &DL = SE.getDataLayout();
155 Type *TyA = TypeInfo.inferScalarType(A->getOperand(0));
156 uint64_t SizeA = DL.getTypeStoreSize(TyA);
157 Type *TyB = TypeInfo.inferScalarType(B->getOperand(0));
158 uint64_t SizeB = DL.getTypeStoreSize(TyB);
159
160 // Use the maximum store size to ensure no overlap from either direction.
161 // Currently only handles fixed sizes, as it is only used for
162 // replicating VPReplicateRecipes.
163 uint64_t MaxStoreSize = std::max(SizeA, SizeB);
164
165 auto VFs = B->getParent()->getPlan()->vectorFactors();
// NOTE(review): line 166 missing — presumably the computation of MaxVF
// from VFs.
167 if (MaxVF.isScalable())
168 return false;
// No-alias holds if |distance| >= MaxVF * max store size, i.e. the widest
// vectorized access group cannot reach the other address.
169 return Distance->abs().uge(
170 MaxVF.multiplyCoefficientBy(MaxStoreSize).getFixedValue());
171 }
172
173public:
// Constructor: stores references to the caller-owned analysis state; the
// lifetime of all referenced objects must outlive this helper.
176 const Loop &L, VPTypeAnalysis &TypeInfo)
177 : ExcludeRecipes(ExcludeRecipes), GroupLeader(GroupLeader), PSE(PSE),
178 L(L), TypeInfo(TypeInfo) {}
179
180 /// Return true if \p R should be skipped during alias checking, either
181 /// because it's in the exclude set or because no-alias can be proven via
182 /// SCEV.
183 bool shouldSkip(VPRecipeBase &R) const {
184 auto *Store = dyn_cast<VPReplicateRecipe>(&R);
185 return ExcludeRecipes.contains(&R) ||
186 (Store && isNoAliasViaDistance(Store, &GroupLeader));
187 }
188};
189
190/// Check if a memory operation doesn't alias with memory operations using
191/// scoped noalias metadata, in blocks in the single-successor chain between \p
192/// FirstBB and \p LastBB. If \p SinkInfo is std::nullopt, only recipes that may
193/// write to memory are checked (for load hoisting). Otherwise recipes that both
194/// read and write memory are checked, and SCEV is used to prove no-alias
195/// between the group leader and other replicate recipes (for store sinking).
// NOTE(review): the function's name line (197) and several statements (205,
// 214, 220) are missing from this extraction; the preceding doc comment
// (lines 190-195 above) describes the intended contract. Returns true only
// when no potentially-conflicting memory op is found.
196static bool
198 VPBasicBlock *FirstBB, VPBasicBlock *LastBB,
199 std::optional<SinkStoreInfo> SinkInfo = {}) {
// Reads are only checked in store-sinking mode (SinkInfo present).
200 bool CheckReads = SinkInfo.has_value();
// Without noalias scope metadata nothing can be proven — be conservative.
201 if (!MemLoc.AATags.Scope)
202 return false;
203
204 for (VPBasicBlock *VPBB :
206 for (VPRecipeBase &R : *VPBB) {
// Recipes proven safe via the exclude set or SCEV distance are skipped.
207 if (SinkInfo && SinkInfo->shouldSkip(R))
208 continue;
209
210 // Skip recipes that don't need checking.
211 if (!R.mayWriteToMemory() && !(CheckReads && R.mayReadFromMemory()))
212 continue;
213
// NOTE(review): line 214 missing — presumably computes `Loc` for R.
215 if (!Loc)
216 // Conservatively assume aliasing for memory operations without
217 // location.
218 return false;
219
// NOTE(review): line 220 missing — presumably the scoped-noalias
// metadata comparison guarding this `return false`.
221 return false;
222 }
223 }
224 return true;
225}
226
227/// Collect either replicated Loads or Stores grouped by their address SCEV, in
228/// a deep-traversal of the vector loop region in \p Plan.
// NOTE(review): extraction gaps — the function name/return-type lines
// (230-231), the map type of RecipesByAddress (237), the traversal loop
// headers (239-240) and the sort callable's opening line (257) are missing.
229template <unsigned Opcode>
232 VPlan &Plan, PredicatedScalarEvolution &PSE, const Loop *L,
233 function_ref<bool(VPReplicateRecipe *)> FilterFn) {
234 static_assert(Opcode == Instruction::Load || Opcode == Instruction::Store,
235 "Only Load and Store opcodes supported");
236 constexpr bool IsLoad = (Opcode == Instruction::Load);
238 RecipesByAddress;
241 for (VPRecipeBase &R : *VPBB) {
// Only replicate recipes of the requested opcode that pass the caller's
// filter are collected.
242 auto *RepR = dyn_cast<VPReplicateRecipe>(&R);
243 if (!RepR || RepR->getOpcode() != Opcode || !FilterFn(RepR))
244 continue;
245
246 // For loads, operand 0 is address; for stores, operand 1 is address.
247 VPValue *Addr = RepR->getOperand(IsLoad ? 0 : 1);
248 const SCEV *AddrSCEV = vputils::getSCEVExprForVPValue(Addr, PSE, L);
// Addresses without a computable SCEV cannot be grouped and are dropped.
249 if (!isa<SCEVCouldNotCompute>(AddrSCEV))
250 RecipesByAddress[AddrSCEV].push_back(RepR);
251 }
252 }
253 auto Groups = to_vector(RecipesByAddress.values());
254 VPDominatorTree VPDT(Plan);
255 for (auto &Group : Groups) {
256 // Sort mem ops by dominance order, with earliest (most dominating) first.
258 return VPDT.properlyDominates(A, B);
259 });
260 }
261 return Groups;
262}
263
264/// Return true if we do not know how to (mechanically) hoist or sink \p R out
265/// of a loop region.
// NOTE(review): the signature line (266) and the assume-intrinsic check
// guarding the first early return (269) are missing from this extraction.
267 // Assumes don't alias anything or throw; as long as they're guaranteed to
268 // execute, they're safe to hoist.
270 return false;
271
272 // TODO: Relax checks in the future, e.g. we could also hoist reads, if their
273 // memory location is not modified in the vector loop.
274 if (R.mayHaveSideEffects() || R.mayReadFromMemory() || R.isPhi())
275 return true;
276
277 // Allocas cannot be hoisted.
278 auto *RepR = dyn_cast<VPReplicateRecipe>(&R);
279 return RepR && RepR->getOpcode() == Instruction::Alloca;
280}
281
// Sink scalar operand-defining recipes into the replicate ("then") blocks
// that use them, duplicating a recipe when it still has first-lane-only
// users elsewhere. Returns true if anything was moved.
// NOTE(review): extraction gaps — lines 287 (WorkList declaration), 297 (the
// VPReplicateRecipe/VPScalarIVStepsRecipe kind check) and 312 (the region
// loop header) are missing from this capture.
282static bool sinkScalarOperands(VPlan &Plan) {
283 auto Iter = vp_depth_first_deep(Plan.getEntry());
284 bool ScalarVFOnly = Plan.hasScalarVFOnly();
285 bool Changed = false;
286
// Helper: queue (SinkTo, defining-recipe) pairs that are legal to sink.
288 auto InsertIfValidSinkCandidate = [ScalarVFOnly, &WorkList](
289 VPBasicBlock *SinkTo, VPValue *Op) {
290 auto *Candidate =
291 dyn_cast_or_null<VPSingleDefRecipe>(Op->getDefiningRecipe());
292 if (!Candidate)
293 return;
294
295 // We only know how to sink VPReplicateRecipes and VPScalarIVStepsRecipes
296 // for now.
298 return;
299
300 if (Candidate->getParent() == SinkTo || cannotHoistOrSinkRecipe(*Candidate))
301 return;
302
// Single-scalar replicates are not sunk unless only scalar VFs exist.
303 if (auto *RepR = dyn_cast<VPReplicateRecipe>(Candidate))
304 if (!ScalarVFOnly && RepR->isSingleScalar())
305 return;
306
307 WorkList.insert({SinkTo, Candidate});
308 };
309
310 // First, collect the operands of all recipes in replicate blocks as seeds for
311 // sinking.
313 VPBasicBlock *EntryVPBB = VPR->getEntryBasicBlock();
314 if (!VPR->isReplicator() || EntryVPBB->getSuccessors().size() != 2)
315 continue;
316 VPBasicBlock *VPBB = cast<VPBasicBlock>(EntryVPBB->getSuccessors().front());
317 if (VPBB->getSingleSuccessor() != VPR->getExitingBasicBlock())
318 continue;
319 for (auto &Recipe : *VPBB)
320 for (VPValue *Op : Recipe.operands())
321 InsertIfValidSinkCandidate(VPBB, Op);
322 }
323
324 // Try to sink each replicate or scalar IV steps recipe in the worklist.
// Index-based loop: InsertIfValidSinkCandidate appends to WorkList while
// we iterate, so size() is re-read each pass.
325 for (unsigned I = 0; I != WorkList.size(); ++I) {
326 VPBasicBlock *SinkTo;
327 VPSingleDefRecipe *SinkCandidate;
328 std::tie(SinkTo, SinkCandidate) = WorkList[I];
329
330 // All recipe users of SinkCandidate must be in the same block SinkTo or all
331 // users outside of SinkTo must only use the first lane of SinkCandidate. In
332 // the latter case, we need to duplicate SinkCandidate.
333 auto UsersOutsideSinkTo =
334 make_filter_range(SinkCandidate->users(), [SinkTo](VPUser *U) {
335 return cast<VPRecipeBase>(U)->getParent() != SinkTo;
336 });
337 if (any_of(UsersOutsideSinkTo, [SinkCandidate](VPUser *U) {
338 return !U->usesFirstLaneOnly(SinkCandidate);
339 }))
340 continue;
341 bool NeedsDuplicating = !UsersOutsideSinkTo.empty();
342
343 if (NeedsDuplicating) {
344 if (ScalarVFOnly)
345 continue;
346 VPSingleDefRecipe *Clone;
347 if (auto *SinkCandidateRepR =
348 dyn_cast<VPReplicateRecipe>(SinkCandidate)) {
349 // TODO: Handle converting to uniform recipes as separate transform,
350 // then cloning should be sufficient here.
// The clone is created uniform (IsSingleScalar = true) so outside
// first-lane-only users keep a scalar definition.
351 Instruction *I = SinkCandidate->getUnderlyingInstr();
352 Clone = new VPReplicateRecipe(I, SinkCandidate->operands(), true,
353 nullptr /*Mask*/, *SinkCandidateRepR,
354 *SinkCandidateRepR);
355 // TODO: add ".cloned" suffix to name of Clone's VPValue.
356 } else {
357 Clone = SinkCandidate->clone();
358 }
359
360 Clone->insertBefore(SinkCandidate);
361 SinkCandidate->replaceUsesWithIf(Clone, [SinkTo](VPUser &U, unsigned) {
362 return cast<VPRecipeBase>(&U)->getParent() != SinkTo;
363 });
364 }
// Move the candidate and then reconsider its own operands for sinking.
365 SinkCandidate->moveBefore(*SinkTo, SinkTo->getFirstNonPhi());
366 for (VPValue *Op : SinkCandidate->operands())
367 InsertIfValidSinkCandidate(SinkTo, Op);
368 Changed = true;
369 }
370 return Changed;
371}
372
373/// If \p R is a region with a VPBranchOnMaskRecipe in the entry block, return
374/// the mask.
// NOTE(review): signature line (375) missing from this extraction; based on
// the body, it takes a region `R` and returns a VPValue* (nullptr when the
// entry block is not exactly one VPBranchOnMaskRecipe).
376 auto *EntryBB = dyn_cast<VPBasicBlock>(R->getEntry());
377 if (!EntryBB || EntryBB->size() != 1 ||
378 !isa<VPBranchOnMaskRecipe>(EntryBB->begin()))
379 return nullptr;
380
381 return cast<VPBranchOnMaskRecipe>(&*EntryBB->begin())->getOperand(0);
382}
383
384/// If \p R is a triangle region, return the 'then' block of the triangle.
// NOTE(review): signature line (385) missing from this extraction. A
// "triangle" here is: entry with two successors, exactly one of which
// falls through to the other (the merge block).
386 auto *EntryBB = cast<VPBasicBlock>(R->getEntry());
387 if (EntryBB->getNumSuccessors() != 2)
388 return nullptr;
389
390 auto *Succ0 = dyn_cast<VPBasicBlock>(EntryBB->getSuccessors()[0]);
391 auto *Succ1 = dyn_cast<VPBasicBlock>(EntryBB->getSuccessors()[1]);
392 if (!Succ0 || !Succ1)
393 return nullptr;
394
// Exactly one successor may itself have a successor (the 'then' block
// jumping to the merge); sum of successor counts must therefore be 1.
395 if (Succ0->getNumSuccessors() + Succ1->getNumSuccessors() != 1)
396 return nullptr;
397 if (Succ0->getSingleSuccessor() == Succ1)
398 return Succ0;
399 if (Succ1->getSingleSuccessor() == Succ0)
400 return Succ1;
401 return nullptr;
}
403
404// Merge replicate regions in their successor region, if a replicate region
405// is connected to a successor replicate region with the same predicate by a
406// single, empty VPBasicBlock.
// NOTE(review): the function signature (407) and the worklist declaration /
// loop header (413-414) are missing from this extraction. Returns true when
// at least one region was merged.
408 SmallPtrSet<VPRegionBlock *, 4> TransformedRegions;
409
410 // Collect replicate regions followed by an empty block, followed by another
411 // replicate region with matching masks to process front. This is to avoid
412 // iterator invalidation issues while merging regions.
415 vp_depth_first_deep(Plan.getEntry()))) {
416 if (!Region1->isReplicator())
417 continue;
418 auto *MiddleBasicBlock =
419 dyn_cast_or_null<VPBasicBlock>(Region1->getSingleSuccessor());
420 if (!MiddleBasicBlock || !MiddleBasicBlock->empty())
421 continue;
422
423 auto *Region2 =
424 dyn_cast_or_null<VPRegionBlock>(MiddleBasicBlock->getSingleSuccessor());
425 if (!Region2 || !Region2->isReplicator())
426 continue;
427
// Both regions must be guarded by the same mask value to fuse.
428 VPValue *Mask1 = getPredicatedMask(Region1);
429 VPValue *Mask2 = getPredicatedMask(Region2);
430 if (!Mask1 || Mask1 != Mask2)
431 continue;
432
433 assert(Mask1 && Mask2 && "both region must have conditions");
434 WorkList.push_back(Region1);
435 }
436
437 // Move recipes from Region1 to its successor region, if both are triangles.
438 for (VPRegionBlock *Region1 : WorkList) {
439 if (TransformedRegions.contains(Region1))
440 continue;
441 auto *MiddleBasicBlock = cast<VPBasicBlock>(Region1->getSingleSuccessor());
442 auto *Region2 = cast<VPRegionBlock>(MiddleBasicBlock->getSingleSuccessor());
443
444 VPBasicBlock *Then1 = getPredicatedThenBlock(Region1);
445 VPBasicBlock *Then2 = getPredicatedThenBlock(Region2);
446 if (!Then1 || !Then2)
447 continue;
448
449 // Note: No fusion-preventing memory dependencies are expected in either
450 // region. Such dependencies should be rejected during earlier dependence
451 // checks, which guarantee accesses can be re-ordered for vectorization.
452 //
453 // Move recipes to the successor region.
// Iterate in reverse with early-inc so each recipe can be moved (removing
// it from Then1) while keeping original relative order in Then2.
454 for (VPRecipeBase &ToMove : make_early_inc_range(reverse(*Then1)))
455 ToMove.moveBefore(*Then2, Then2->getFirstNonPhi());
456
457 auto *Merge1 = cast<VPBasicBlock>(Then1->getSingleSuccessor());
458 auto *Merge2 = cast<VPBasicBlock>(Then2->getSingleSuccessor());
459
460 // Move VPPredInstPHIRecipes from the merge block to the successor region's
461 // merge block. Update all users inside the successor region to use the
462 // original values.
463 for (VPRecipeBase &Phi1ToMove : make_early_inc_range(reverse(*Merge1))) {
464 VPValue *PredInst1 =
465 cast<VPPredInstPHIRecipe>(&Phi1ToMove)->getOperand(0);
466 VPValue *Phi1ToMoveV = Phi1ToMove.getVPSingleValue();
467 Phi1ToMoveV->replaceUsesWithIf(PredInst1, [Then2](VPUser &U, unsigned) {
468 return cast<VPRecipeBase>(&U)->getParent() == Then2;
469 });
470
471 // Remove phi recipes that are unused after merging the regions.
472 if (Phi1ToMove.getVPSingleValue()->getNumUsers() == 0) {
473 Phi1ToMove.eraseFromParent();
474 continue;
475 }
476 Phi1ToMove.moveBefore(*Merge2, Merge2->begin());
477 }
478
479 // Remove the dead recipes in Region1's entry block.
480 for (VPRecipeBase &R :
481 make_early_inc_range(reverse(*Region1->getEntryBasicBlock())))
482 R.eraseFromParent();
483
484 // Finally, remove the first region.
485 for (VPBlockBase *Pred : make_early_inc_range(Region1->getPredecessors())) {
486 VPBlockUtils::disconnectBlocks(Pred, Region1);
487 VPBlockUtils::connectBlocks(Pred, MiddleBasicBlock);
488 }
489 VPBlockUtils::disconnectBlocks(Region1, MiddleBasicBlock);
490 TransformedRegions.insert(Region1);
491 }
492
493 return !TransformedRegions.empty();
494}
495
// Build a triangular if-then replicate region around a predicated
// VPReplicateRecipe: entry (branch-on-mask) -> "<name>.if" (unmasked
// replicate) -> "<name>.continue" (optional VPPredInstPHIRecipe).
// NOTE(review): the signature's first line (496) and the line declaring
// `Region` (528) are missing from this extraction.
497 VPlan &Plan) {
498 Instruction *Instr = PredRecipe->getUnderlyingInstr();
499 // Build the triangular if-then region.
500 std::string RegionName = (Twine("pred.") + Instr->getOpcodeName()).str();
501 assert(Instr->getParent() && "Predicated instruction not in any basic block");
502 auto *BlockInMask = PredRecipe->getMask();
503 auto *MaskDef = BlockInMask->getDefiningRecipe();
504 auto *BOMRecipe = new VPBranchOnMaskRecipe(
505 BlockInMask, MaskDef ? MaskDef->getDebugLoc() : DebugLoc::getUnknown());
506 auto *Entry =
507 Plan.createVPBasicBlock(Twine(RegionName) + ".entry", BOMRecipe);
508
509 // Replace predicated replicate recipe with a replicate recipe without a
510 // mask but in the replicate region.
// drop_end: the mask is the last operand of the predicated recipe and is
// omitted from the unmasked clone.
511 auto *RecipeWithoutMask = new VPReplicateRecipe(
512 PredRecipe->getUnderlyingInstr(), drop_end(PredRecipe->operands()),
513 PredRecipe->isSingleScalar(), nullptr /*Mask*/, *PredRecipe, *PredRecipe,
514 PredRecipe->getDebugLoc());
515 auto *Pred =
516 Plan.createVPBasicBlock(Twine(RegionName) + ".if", RecipeWithoutMask);
517
// Only materialize a merge phi when the predicated value has users.
518 VPPredInstPHIRecipe *PHIRecipe = nullptr;
519 if (PredRecipe->getNumUsers() != 0) {
520 PHIRecipe = new VPPredInstPHIRecipe(RecipeWithoutMask,
521 RecipeWithoutMask->getDebugLoc());
522 PredRecipe->replaceAllUsesWith(PHIRecipe);
523 PHIRecipe->setOperand(0, RecipeWithoutMask);
524 }
525 PredRecipe->eraseFromParent();
526 auto *Exiting =
527 Plan.createVPBasicBlock(Twine(RegionName) + ".continue", PHIRecipe);
529 Plan.createReplicateRegion(Entry, Exiting, RegionName);
530
531 // Note: first set Entry as region entry and then connect successors starting
532 // from it in order, to propagate the "parent" of each VPBasicBlock.
533 VPBlockUtils::insertTwoBlocksAfter(Pred, Exiting, Entry);
534 VPBlockUtils::connectBlocks(Pred, Exiting);
535
536 return Region;
537}
538
// Wrap each predicated VPReplicateRecipe in its own if-then replicate
// region (splitting its block at the recipe).
// NOTE(review): extraction gaps — the worklist declaration / traversal
// header (540-541) and the createReplicateRegion call plus block splicing
// (559, 561) are missing from this capture.
539static void addReplicateRegions(VPlan &Plan) {
542 vp_depth_first_deep(Plan.getEntry()))) {
543 for (VPRecipeBase &R : *VPBB)
544 if (auto *RepR = dyn_cast<VPReplicateRecipe>(&R)) {
545 if (RepR->isPredicated())
546 WorkList.push_back(RepR);
547 }
548 }
549
550 unsigned BBNum = 0;
551 for (VPReplicateRecipe *RepR : WorkList) {
552 VPBasicBlock *CurrentBlock = RepR->getParent();
// Split so the region can be inserted between CurrentBlock and SplitBlock.
553 VPBasicBlock *SplitBlock = CurrentBlock->splitAt(RepR->getIterator());
554
// Name the continuation after the original IR block, numbered uniquely.
555 BasicBlock *OrigBB = RepR->getUnderlyingInstr()->getParent();
556 SplitBlock->setName(
557 OrigBB->hasName() ? OrigBB->getName() + "." + Twine(BBNum++) : "");
558 // Record predicated instructions for above packing optimizations.
560 Region->setParent(CurrentBlock->getParent());
562
// If CurrentBlock was the parent region's exiting block, the split moved
// the exit point to SplitBlock.
563 VPRegionBlock *ParentRegion = Region->getParent();
564 if (ParentRegion && ParentRegion->getExiting() == CurrentBlock)
565 ParentRegion->setExiting(SplitBlock);
566 }
567}
568
// Fold VPBasicBlocks into their single predecessor where legal; returns
// true if any block was merged.
// NOTE(review): the function signature and worklist declaration (inner
// lines 569-571) are missing from this extraction.
572 vp_depth_first_deep(Plan.getEntry()))) {
573 // Don't fold the blocks in the skeleton of the Plan into their single
574 // predecessors for now.
575 // TODO: Remove restriction once more of the skeleton is modeled in VPlan.
576 if (!VPBB->getParent())
577 continue;
// Merge only into a plain VPBasicBlock with exactly one successor.
578 auto *PredVPBB =
579 dyn_cast_or_null<VPBasicBlock>(VPBB->getSinglePredecessor());
580 if (!PredVPBB || PredVPBB->getNumSuccessors() != 1 ||
581 isa<VPIRBasicBlock>(PredVPBB))
582 continue;
583 WorkList.push_back(VPBB);
584 }
585
586 for (VPBasicBlock *VPBB : WorkList) {
587 VPBasicBlock *PredVPBB = cast<VPBasicBlock>(VPBB->getSinglePredecessor());
588 for (VPRecipeBase &R : make_early_inc_range(*VPBB))
589 R.moveBefore(*PredVPBB, PredVPBB->end());
590 VPBlockUtils::disconnectBlocks(PredVPBB, VPBB);
// Keep the enclosing region's exiting-block pointer valid after the merge.
591 auto *ParentRegion = VPBB->getParent();
592 if (ParentRegion && ParentRegion->getExiting() == VPBB)
593 ParentRegion->setExiting(PredVPBB);
594 VPBlockUtils::transferSuccessors(VPBB, PredVPBB);
595 // VPBB is now dead and will be cleaned up when the plan gets destroyed.
596 }
597 return !WorkList.empty();
598}
599
// Driver: create replicate regions, then iterate sinking/merging to a fixed
// point (each pass may enable the others).
// NOTE(review): the function signature (600) and the addReplicateRegions
// call (602) are missing from this extraction.
601 // Convert masked VPReplicateRecipes to if-then region blocks.
603
604 bool ShouldSimplify = true;
605 while (ShouldSimplify) {
606 ShouldSimplify = sinkScalarOperands(Plan);
607 ShouldSimplify |= mergeReplicateRegionsIntoSuccessors(Plan);
608 ShouldSimplify |= mergeBlocksIntoPredecessors(Plan);
609 }
610}
611
612/// Remove redundant casts of inductions.
613///
614/// Such redundant casts are casts of induction variables that can be ignored,
615/// because we already proved that the casted phi is equal to the uncasted phi
616/// in the vectorized loop. There is no need to vectorize the cast - the same
617/// value can be used for both the phi and casts in the vector loop.
// NOTE(review): the function signature (618) and the line computing `IV`
// from Phi (620) are missing from this extraction.
619 for (auto &Phi : Plan.getVectorLoopRegion()->getEntryBasicBlock()->phis()) {
621 if (!IV || IV->getTruncInst())
622 continue;
623
624 // A sequence of IR Casts has potentially been recorded for IV, which
625 // *must be bypassed* when the IV is vectorized, because the vectorized IV
626 // will produce the desired casted value. This sequence forms a def-use
627 // chain and is provided in reverse order, ending with the cast that uses
628 // the IV phi. Search for the recipe of the last cast in the chain and
629 // replace it with the original IV. Note that only the final cast is
630 // expected to have users outside the cast-chain and the dead casts left
631 // over will be cleaned up later.
632 ArrayRef<Instruction *> Casts = IV->getInductionDescriptor().getCastInsts();
633 VPValue *FindMyCast = IV;
// Walk the recorded cast chain, matching each IR cast to the recipe that
// wraps it among the users of the previous link.
634 for (Instruction *IRCast : reverse(Casts)) {
635 VPSingleDefRecipe *FoundUserCast = nullptr;
636 for (auto *U : FindMyCast->users()) {
637 auto *UserCast = dyn_cast<VPSingleDefRecipe>(U);
638 if (UserCast && UserCast->getUnderlyingValue() == IRCast) {
639 FoundUserCast = UserCast;
640 break;
641 }
642 }
643 FindMyCast = FoundUserCast;
644 }
// Reroute users of the final cast to the IV itself.
645 FindMyCast->replaceAllUsesWith(IV);
646 }
647}
648
649/// Try to replace VPWidenCanonicalIVRecipes with a widened canonical IV
650/// recipe, if it exists.
// NOTE(review): the function signature (651) and the dyn_cast assigning
// WidenNewIV inside the first loop (656) are missing from this extraction.
652 VPRegionBlock *LoopRegion = Plan.getVectorLoopRegion();
653 VPCanonicalIVPHIRecipe *CanonicalIV = LoopRegion->getCanonicalIV();
654 VPWidenCanonicalIVRecipe *WidenNewIV = nullptr;
// Find a VPWidenCanonicalIVRecipe among the canonical IV's users.
655 for (VPUser *U : CanonicalIV->users()) {
657 if (WidenNewIV)
658 break;
659 }
660
661 if (!WidenNewIV)
662 return;
663
// Look for a canonical widened int/fp induction in the header phis that
// can stand in for the widened canonical IV.
664 VPBasicBlock *HeaderVPBB = LoopRegion->getEntryBasicBlock();
665 for (VPRecipeBase &Phi : HeaderVPBB->phis()) {
666 auto *WidenOriginalIV = dyn_cast<VPWidenIntOrFpInductionRecipe>(&Phi);
667
668 if (!WidenOriginalIV || !WidenOriginalIV->isCanonical())
669 continue;
670
671 // Replace WidenNewIV with WidenOriginalIV if WidenOriginalIV provides
672 // everything WidenNewIV's users need. That is, WidenOriginalIV will
673 // generate a vector phi or all users of WidenNewIV demand the first lane
674 // only.
675 if (Plan.hasScalarVFOnly() ||
676 !vputils::onlyScalarValuesUsed(WidenOriginalIV) ||
677 vputils::onlyFirstLaneUsed(WidenNewIV)) {
678 // We are replacing a wide canonical iv with a suitable wide induction.
679 // This is used to compute header mask, hence all lanes will be used and
680 // we need to drop wrap flags only applying to lanes guranteed to execute
681 // in the original scalar loop.
682 WidenOriginalIV->dropPoisonGeneratingFlags();
683 WidenNewIV->replaceAllUsesWith(WidenOriginalIV);
684 WidenNewIV->eraseFromParent();
685 return;
686 }
687 }
688}
689
690/// Returns true if \p R is dead and can be removed.
// NOTE(review): inner line 696 missing — the continuation of the
// IsConditionalAssume condition (presumably matching an assume intrinsic).
691static bool isDeadRecipe(VPRecipeBase &R) {
692 // Do remove conditional assume instructions as their conditions may be
693 // flattened.
694 auto *RepR = dyn_cast<VPReplicateRecipe>(&R);
695 bool IsConditionalAssume = RepR && RepR->isPredicated() &&
697 if (IsConditionalAssume)
698 return true;
699
// Side-effecting recipes are never dead (except the assume case above).
700 if (R.mayHaveSideEffects())
701 return false;
702
703 // Recipe is dead if no user keeps the recipe alive.
704 return all_of(R.definedValues(),
705 [](VPValue *V) { return V->getNumUsers() == 0; });
706}
707
// Erase dead recipes plan-wide, including dead phi<->update cycles.
// NOTE(review): the function signature and traversal-loop header (inner
// lines 708-709) are missing from this extraction; the post-order deep
// traversal argument below is the tail of that loop header.
710 vp_post_order_deep(Plan.getEntry()))) {
711 // The recipes in the block are processed in reverse order, to catch chains
712 // of dead recipes.
713 for (VPRecipeBase &R : make_early_inc_range(reverse(*VPBB))) {
714 if (isDeadRecipe(R)) {
715 R.eraseFromParent();
716 continue;
717 }
718
719 // Check if R is a dead VPPhi <-> update cycle and remove it.
720 VPValue *Start, *Incoming;
721 if (!match(&R, m_VPPhi(m_VPValue(Start), m_VPValue(Incoming))))
722 continue;
723 auto *PhiR = cast<VPPhi>(&R);
724 VPUser *PhiUser = PhiR->getSingleUser();
725 if (!PhiUser)
726 continue;
// The cycle is dead only if the phi's single user is the increment and
// the increment's only user is the phi.
727 if (PhiUser != Incoming->getDefiningRecipe() ||
728 Incoming->getNumUsers() != 1)
729 continue;
730 PhiR->replaceAllUsesWith(Start);
731 PhiR->eraseFromParent();
732 Incoming->getDefiningRecipe()->eraseFromParent();
733 }
734 }
735}
736
// Build scalar IV steps derived from the canonical IV: creates a derived IV
// ("offset.idx"), truncates it and/or the step when a narrower result type
// is required, and returns a scalar-IV-steps recipe scaled by VF.
// NOTE(review): the signature's leading lines (737-738) and the line
// fetching the vector preheader (768) are missing from this extraction.
739 Instruction::BinaryOps InductionOpcode,
740 FPMathOperator *FPBinOp, Instruction *TruncI,
741 VPIRValue *StartV, VPValue *Step, DebugLoc DL,
742 VPBuilder &Builder) {
743 VPRegionBlock *LoopRegion = Plan.getVectorLoopRegion();
744 VPBasicBlock *HeaderVPBB = LoopRegion->getEntryBasicBlock();
745 VPCanonicalIVPHIRecipe *CanonicalIV = LoopRegion->getCanonicalIV();
746 VPSingleDefRecipe *BaseIV = Builder.createDerivedIV(
747 Kind, FPBinOp, StartV, CanonicalIV, Step, "offset.idx");
748
749 // Truncate base induction if needed.
750 VPTypeAnalysis TypeInfo(Plan);
751 Type *ResultTy = TypeInfo.inferScalarType(BaseIV);
752 if (TruncI) {
753 Type *TruncTy = TruncI->getType();
754 assert(ResultTy->getScalarSizeInBits() > TruncTy->getScalarSizeInBits() &&
755 "Not truncating.");
756 assert(ResultTy->isIntegerTy() && "Truncation requires an integer type");
757 BaseIV = Builder.createScalarCast(Instruction::Trunc, BaseIV, TruncTy, DL);
758 ResultTy = TruncTy;
759 }
760
761 // Truncate step if needed.
762 Type *StepTy = TypeInfo.inferScalarType(Step);
763 if (ResultTy != StepTy) {
764 assert(StepTy->getScalarSizeInBits() > ResultTy->getScalarSizeInBits() &&
765 "Not truncating.");
766 assert(StepTy->isIntegerTy() && "Truncation requires an integer type");
// The step cast is loop-invariant, so it is emitted in the vector
// preheader; the insert-point guard restores the builder afterwards.
767 auto *VecPreheader =
769 VPBuilder::InsertPointGuard Guard(Builder);
770 Builder.setInsertPoint(VecPreheader);
771 Step = Builder.createScalarCast(Instruction::Trunc, Step, ResultTy, DL);
772 }
773 return Builder.createScalarIVSteps(InductionOpcode, FPBinOp, BaseIV, Step,
774 &Plan.getVF(), DL);
775}
776
// Worklist walk collecting the transitive users of a recipe's defined
// values, stopping at header phis (to avoid cycling around the loop).
// NOTE(review): the function signature / Users seed (777-778) and the line
// binding `Cur` to Users[I] (780) are missing from this extraction.
// Index-based loop because insert_range grows Users while iterating.
779 for (unsigned I = 0; I != Users.size(); ++I) {
781 if (isa<VPHeaderPHIRecipe>(Cur))
782 continue;
783 for (VPValue *V : Cur->definedValues())
784 Users.insert_range(V->users());
785 }
786 return Users.takeVector();
787}
788
789/// Scalarize a VPWidenPointerInductionRecipe by replacing it with a PtrAdd
790/// (IndStart, ScalarIVSteps (0, Step)). This is used when the recipe only
791/// generates scalar values.
// NOTE(review): extraction gaps — the parameter line naming the function
// (793), the line computing `ID` (795) and the createScalarIVSteps call
// producing `Steps` (798) are missing from this capture.
792static VPValue *
794 VPlan &Plan, VPBuilder &Builder) {
// Integer steps start at zero; the pointer start value is added afterwards.
796 VPIRValue *StartV = Plan.getZero(ID.getStep()->getType());
797 VPValue *StepV = PtrIV->getOperand(1);
799 Plan, InductionDescriptor::IK_IntInduction, Instruction::Add, nullptr,
800 nullptr, StartV, StepV, PtrIV->getDebugLoc(), Builder);
801
802 return Builder.createPtrAdd(PtrIV->getStartValue(), Steps,
803 PtrIV->getDebugLoc(), "next.gep");
804}
805
806/// Legalize VPWidenPointerInductionRecipe, by replacing it with a PtrAdd
807/// (IndStart, ScalarIVSteps (0, Step)) if only its scalar values are used, as
808/// VPWidenPointerInductionRecipe will generate vectors only. If some users
809/// require vectors while other require scalars, the scalar uses need to extract
810/// the scalars from the generated vectors (Note that this is different to how
811/// int/fp inductions are handled). Legalize extract-from-ends using uniform
812/// VPReplicateRecipe of wide inductions to use regular VPReplicateRecipe, so
813/// the correct end value is available. Also optimize
814/// VPWidenIntOrFpInductionRecipe, if any of its users needs scalar values, by
815/// providing them scalar steps built on the canonical scalar IV and update the
816/// original IV's users. This is an optional optimization to reduce the needs of
817/// vector extracts.
// NOTE(review): extraction gaps — the function signature and HeaderVPBB
// initialization (818-819), the first-lane-only check (842) and the
// createScalarIVSteps call producing `Steps` (873) are missing.
820 bool HasOnlyVectorVFs = !Plan.hasScalarVFOnly();
821 VPBuilder Builder(HeaderVPBB, HeaderVPBB->getFirstNonPhi());
822 for (VPRecipeBase &Phi : HeaderVPBB->phis()) {
823 auto *PhiR = dyn_cast<VPWidenInductionRecipe>(&Phi);
824 if (!PhiR)
825 continue;
826
827 // Try to narrow wide and replicating recipes to uniform recipes, based on
828 // VPlan analysis.
829 // TODO: Apply to all recipes in the future, to replace legacy uniformity
830 // analysis.
831 auto Users = collectUsersRecursively(PhiR);
832 for (VPUser *U : reverse(Users)) {
833 auto *Def = dyn_cast<VPRecipeWithIRFlags>(U);
834 auto *RepR = dyn_cast<VPReplicateRecipe>(U);
835 // Skip recipes that shouldn't be narrowed.
836 if (!Def || !isa<VPReplicateRecipe, VPWidenRecipe>(Def) ||
837 Def->getNumUsers() == 0 || !Def->getUnderlyingValue() ||
838 (RepR && (RepR->isSingleScalar() || RepR->isPredicated())))
839 continue;
840
841 // Skip recipes that may have other lanes than their first used.
843 continue;
844
// Replace the wide/replicating recipe with a uniform replicate clone.
845 auto *Clone = new VPReplicateRecipe(Def->getUnderlyingInstr(),
846 Def->operands(), /*IsUniform*/ true,
847 /*Mask*/ nullptr, /*Flags*/ *Def);
848 Clone->insertAfter(Def);
849 Def->replaceAllUsesWith(Clone);
850 }
851
852 // Replace wide pointer inductions which have only their scalars used by
853 // PtrAdd(IndStart, ScalarIVSteps (0, Step)).
854 if (auto *PtrIV = dyn_cast<VPWidenPointerInductionRecipe>(&Phi)) {
855 if (!Plan.hasScalarVFOnly() &&
856 !PtrIV->onlyScalarsGenerated(Plan.hasScalableVF()))
857 continue;
858
859 VPValue *PtrAdd = scalarizeVPWidenPointerInduction(PtrIV, Plan, Builder);
860 PtrIV->replaceAllUsesWith(PtrAdd);
861 continue;
862 }
863
864 // Replace widened induction with scalar steps for users that only use
865 // scalars.
866 auto *WideIV = cast<VPWidenIntOrFpInductionRecipe>(&Phi);
// With only vector VFs, skip if no user consumes scalar values.
867 if (HasOnlyVectorVFs && none_of(WideIV->users(), [WideIV](VPUser *U) {
868 return U->usesScalars(WideIV);
869 }))
870 continue;
871
872 const InductionDescriptor &ID = WideIV->getInductionDescriptor();
874 Plan, ID.getKind(), ID.getInductionOpcode(),
875 dyn_cast_or_null<FPMathOperator>(ID.getInductionBinOp()),
876 WideIV->getTruncInst(), WideIV->getStartValue(), WideIV->getStepValue(),
877 WideIV->getDebugLoc(), Builder);
878
879 // Update scalar users of IV to use Step instead.
880 if (!HasOnlyVectorVFs) {
881 assert(!Plan.hasScalableVF() &&
882 "plans containing a scalar VF cannot also include scalable VFs");
883 WideIV->replaceAllUsesWith(Steps);
884 } else {
// For scalable VFs only first-lane users can be redirected; otherwise
// all scalar users switch to the scalar steps.
885 bool HasScalableVF = Plan.hasScalableVF();
886 WideIV->replaceUsesWithIf(Steps,
887 [WideIV, HasScalableVF](VPUser &U, unsigned) {
888 if (HasScalableVF)
889 return U.usesFirstLaneOnly(WideIV);
890 return U.usesScalars(WideIV);
891 });
892 }
893 }
894}
895
896/// Check if \p VPV is an untruncated wide induction, either before or after the
897/// increment. If so return the header IV (before the increment), otherwise
898/// return null.
// NOTE(review): the function signature (899-900) is missing from this
// extraction; `PSE` used below is presumably a parameter — confirm
// against upstream source.
901 auto *WideIV = dyn_cast<VPWidenInductionRecipe>(VPV);
902 if (WideIV) {
903 // VPV itself is a wide induction, separately compute the end value for exit
904 // users if it is not a truncated IV.
905 auto *IntOrFpIV = dyn_cast<VPWidenIntOrFpInductionRecipe>(WideIV);
906 return (IntOrFpIV && IntOrFpIV->getTruncInst()) ? nullptr : WideIV;
907 }
908
909 // Check if VPV is an optimizable induction increment.
910 VPRecipeBase *Def = VPV->getDefiningRecipe();
911 if (!Def || Def->getNumOperands() != 2)
912 return nullptr;
// The wide IV may be either operand of the increment.
913 WideIV = dyn_cast<VPWidenInductionRecipe>(Def->getOperand(0));
914 if (!WideIV)
915 WideIV = dyn_cast<VPWidenInductionRecipe>(Def->getOperand(1));
916 if (!WideIV)
917 return nullptr;
918
919 auto IsWideIVInc = [&]() {
920 auto &ID = WideIV->getInductionDescriptor();
921
922 // Check if VPV increments the induction by the induction step.
923 VPValue *IVStep = WideIV->getStepValue();
924 switch (ID.getInductionOpcode()) {
925 case Instruction::Add:
926 return match(VPV, m_c_Add(m_Specific(WideIV), m_Specific(IVStep)));
927 case Instruction::FAdd:
928 return match(VPV, m_c_FAdd(m_Specific(WideIV), m_Specific(IVStep)));
929 case Instruction::FSub:
930 return match(VPV, m_Binary<Instruction::FSub>(m_Specific(WideIV),
931 m_Specific(IVStep)));
932 case Instruction::Sub: {
933 // IVStep will be the negated step of the subtraction. Check if Step == -1
934 // * IVStep.
935 VPValue *Step;
936 if (!match(VPV, m_Sub(m_VPValue(), m_VPValue(Step))))
937 return false;
// Prove via SCEV that the subtracted step is exactly the negation of
// the induction step.
938 const SCEV *IVStepSCEV = vputils::getSCEVExprForVPValue(IVStep, PSE);
939 const SCEV *StepSCEV = vputils::getSCEVExprForVPValue(Step, PSE);
940 ScalarEvolution &SE = *PSE.getSE();
941 return !isa<SCEVCouldNotCompute>(IVStepSCEV) &&
942 !isa<SCEVCouldNotCompute>(StepSCEV) &&
943 IVStepSCEV == SE.getNegativeSCEV(StepSCEV);
944 }
945 default:
// Pointer inductions are incremented via GEP of the IV by its step.
946 return ID.getKind() == InductionDescriptor::IK_PtrInduction &&
947 match(VPV, m_GetElementPtr(m_Specific(WideIV),
948 m_Specific(WideIV->getStepValue())));
949 }
950 llvm_unreachable("should have been covered by switch above");
951 };
952 return IsWideIVInc() ? WideIV : nullptr;
953}
954
955 /// Attempts to optimize the induction variable exit values for users in the
956 /// early exit block.
// NOTE(review): scraped listing with dropped lines. The numbering gaps
// 956 -> 958, 960 -> 962 and 962 -> 965 show that the function name/first
// parameter (orig 957), the PSE parameter line (orig 961) and the match()
// that binds Incoming and Mask (orig 963-964) were dropped by the extractor.
// Restore from upstream before compiling; Incoming/Mask below otherwise have
// no visible definition.
958 VPTypeAnalysis &TypeInfo,
959 VPBlockBase *PredVPBB,
960 VPValue *Op,
962 VPValue *Incoming, *Mask;
965 return nullptr;
966
967 auto *WideIV = getOptimizableIVOf(Incoming, PSE);
968 if (!WideIV)
969 return nullptr;
970
971 auto *WideIntOrFp = dyn_cast<VPWidenIntOrFpInductionRecipe>(WideIV);
972 if (WideIntOrFp && WideIntOrFp->getTruncInst())
973 return nullptr;
974
975 // Calculate the final index.
976 VPRegionBlock *LoopRegion = Plan.getVectorLoopRegion();
977 auto *CanonicalIV = LoopRegion->getCanonicalIV();
978 Type *CanonicalIVType = LoopRegion->getCanonicalIVType();
979 VPBuilder B(cast<VPBasicBlock>(PredVPBB));
980
981 DebugLoc DL = cast<VPInstruction>(Op)->getDebugLoc();
// The first active lane of the exit mask gives the iteration (within the
// vector iteration) at which the early exit was taken.
982 VPValue *FirstActiveLane =
983 B.createNaryOp(VPInstruction::FirstActiveLane, Mask, DL);
984 Type *FirstActiveLaneType = TypeInfo.inferScalarType(FirstActiveLane);
985 FirstActiveLane = B.createScalarZExtOrTrunc(FirstActiveLane, CanonicalIVType,
986 FirstActiveLaneType, DL);
987 VPValue *EndValue = B.createAdd(CanonicalIV, FirstActiveLane, DL);
988
989 // `getOptimizableIVOf()` always returns the pre-incremented IV, so if it
990 // changed it means the exit is using the incremented value, so we need to
991 // add the step.
992 if (Incoming != WideIV) {
993 VPValue *One = Plan.getConstantInt(CanonicalIVType, 1);
994 EndValue = B.createAdd(EndValue, One, DL);
995 }
996
// Non-canonical inductions: map the canonical index onto the actual IV via
// start + index * step.
997 if (!WideIntOrFp || !WideIntOrFp->isCanonical()) {
998 const InductionDescriptor &ID = WideIV->getInductionDescriptor();
999 VPIRValue *Start = WideIV->getStartValue();
1000 VPValue *Step = WideIV->getStepValue();
1001 EndValue = B.createDerivedIV(
1002 ID.getKind(), dyn_cast_or_null<FPMathOperator>(ID.getInductionBinOp()),
1003 Start, EndValue, Step);
1004 }
1005
1006 return EndValue;
1007}
1008
1009 /// Compute the end value for \p WideIV, unless it is truncated. Creates a
1010 /// VPDerivedIVRecipe for non-canonical inductions.
// NOTE(review): scraped listing with dropped lines. The gap 1010 -> 1012 shows
// the function's name and first parameter (orig 1011, presumably
// "static VPValue *...(VPWidenInductionRecipe *WideIV," given the uses below)
// were dropped; the gap 1022 -> 1024 shows the declaration of ID (orig 1023,
// used at 1027) was dropped. Restore from upstream before compiling.
1012 VPBuilder &VectorPHBuilder,
1013 VPTypeAnalysis &TypeInfo,
1014 VPValue *VectorTC) {
1015 auto *WideIntOrFp = dyn_cast<VPWidenIntOrFpInductionRecipe>(WideIV);
1016 // Truncated wide inductions resume from the last lane of their vector value
1017 // in the last vector iteration which is handled elsewhere.
1018 if (WideIntOrFp && WideIntOrFp->getTruncInst())
1019 return nullptr;
1020
1021 VPIRValue *Start = WideIV->getStartValue();
1022 VPValue *Step = WideIV->getStepValue();
1024 VPValue *EndValue = VectorTC;
// Canonical inductions end exactly at the vector trip count; others are
// derived as start + VectorTC * step.
1025 if (!WideIntOrFp || !WideIntOrFp->isCanonical()) {
1026 EndValue = VectorPHBuilder.createDerivedIV(
1027 ID.getKind(), dyn_cast_or_null<FPMathOperator>(ID.getInductionBinOp()),
1028 Start, VectorTC, Step);
1029 }
1030
1031 // EndValue is derived from the vector trip count (which has the same type as
1032 // the widest induction) and thus may be wider than the induction here.
1033 Type *ScalarTypeOfWideIV = TypeInfo.inferScalarType(WideIV);
1034 if (ScalarTypeOfWideIV != TypeInfo.inferScalarType(EndValue)) {
1035 EndValue = VectorPHBuilder.createScalarCast(Instruction::Trunc, EndValue,
1036 ScalarTypeOfWideIV,
1037 WideIV->getDebugLoc());
1038 }
1039
1040 return EndValue;
1041}
1042
1043 /// Attempts to optimize the induction variable exit values for users in the
1044 /// exit block coming from the latch in the original scalar loop.
// NOTE(review): scraped listing with dropped lines. The gaps 1044 -> 1046,
// 1046 -> 1049 and 1049 -> 1051 show the extractor dropped the function name
// (orig 1045), the trailing parameters (orig 1047-1048 -- presumably the
// EndValues map and PSE, both used below) and the match() that binds Incoming
// (orig 1050). Restore from upstream before compiling.
1046 VPlan &Plan, VPTypeAnalysis &TypeInfo, VPBlockBase *PredVPBB, VPValue *Op,
1049 VPWidenInductionRecipe *WideIV = nullptr;
1051 WideIV = getOptimizableIVOf(Incoming, PSE);
1052
1053 if (!WideIV)
1054 return nullptr;
1055
1056 VPValue *EndValue = EndValues.lookup(WideIV);
1057 assert(EndValue && "Must have computed the end value up front");
1058
1059 // `getOptimizableIVOf()` always returns the pre-incremented IV, so if it
1060 // changed it means the exit is using the incremented value, so we don't
1061 // need to subtract the step.
1062 if (Incoming != WideIV)
1063 return EndValue;
1064
1065 // Otherwise, subtract the step from the EndValue.
1066 VPBuilder B(cast<VPBasicBlock>(PredVPBB)->getTerminator());
1067 VPValue *Step = WideIV->getStepValue();
1068 Type *ScalarTy = TypeInfo.inferScalarType(WideIV);
1069 if (ScalarTy->isIntegerTy())
1070 return B.createSub(EndValue, Step, DebugLoc::getUnknown(), "ind.escape");
1071 if (ScalarTy->isPointerTy()) {
// Pointers have no subtraction; add the negated step via ptradd instead.
1072 Type *StepTy = TypeInfo.inferScalarType(Step);
1073 auto *Zero = Plan.getZero(StepTy);
1074 return B.createPtrAdd(EndValue, B.createSub(Zero, Step),
1075 DebugLoc::getUnknown(), "ind.escape");
1076 }
1077 if (ScalarTy->isFloatingPointTy()) {
// Invert the induction's own FP opcode (FAdd <-> FSub) and keep its flags.
1078 const auto &ID = WideIV->getInductionDescriptor();
1079 return B.createNaryOp(
1080 ID.getInductionBinOp()->getOpcode() == Instruction::FAdd
1081 ? Instruction::FSub
1082 : Instruction::FAdd,
1083 {EndValue, Step}, {ID.getInductionBinOp()->getFastMathFlags()});
1084 }
1085 llvm_unreachable("all possible induction types must be handled");
1086 return nullptr;
1087}
1088
// NOTE(review): scraped listing with dropped lines. The gap 1088 -> 1090 shows
// the function header (orig 1089, presumably
// "void VPlanTransforms::optimizeInductionExitUsers(") was dropped, as were:
// the declaration of the EndValues map (orig 1096, used at 1105), the start of
// the end-value computation call whose arguments are at 1104 (orig 1103), and
// the start of the early-exit call whose arguments are at 1133 (orig 1132).
// Restore from upstream before compiling.
1090 VPlan &Plan, PredicatedScalarEvolution &PSE, bool FoldTail) {
1091 // Compute end values for all inductions.
1092 VPTypeAnalysis TypeInfo(Plan);
1093 VPRegionBlock *VectorRegion = Plan.getVectorLoopRegion();
1094 auto *VectorPH = cast<VPBasicBlock>(VectorRegion->getSinglePredecessor());
1095 VPBuilder VectorPHBuilder(VectorPH, VectorPH->begin());
// When tail folding, the loop runs for the full trip count, not the vector
// trip count.
1097 VPValue *ResumeTC =
1098 FoldTail ? Plan.getTripCount() : &Plan.getVectorTripCount();
1099 for (auto &Phi : VectorRegion->getEntryBasicBlock()->phis()) {
1100 auto *WideIV = dyn_cast<VPWidenInductionRecipe>(&Phi);
1101 if (!WideIV)
1102 continue;
1104 WideIV, VectorPHBuilder, TypeInfo, ResumeTC))
1105 EndValues[WideIV] = EndValue;
1106 }
1107
// Replace ExitingIVValue recipes in the middle block with the precomputed end
// values.
1108 VPBasicBlock *MiddleVPBB = Plan.getMiddleBlock();
1109 for (VPRecipeBase &R : make_early_inc_range(*MiddleVPBB)) {
1110 VPValue *Op;
1111 if (!match(&R, m_ExitingIVValue(m_VPValue(Op))))
1112 continue;
1113 auto *WideIV = cast<VPWidenInductionRecipe>(Op);
1114 if (VPValue *EndValue = EndValues.lookup(WideIV)) {
1115 R.getVPSingleValue()->replaceAllUsesWith(EndValue);
1116 R.eraseFromParent();
1117 }
1118 }
1119
1120 // Then, optimize exit block users.
1121 for (VPIRBasicBlock *ExitVPBB : Plan.getExitBlocks()) {
1122 for (VPRecipeBase &R : ExitVPBB->phis()) {
1123 auto *ExitIRI = cast<VPIRPhi>(&R);
1124
1125 for (auto [Idx, PredVPBB] : enumerate(ExitVPBB->getPredecessors())) {
1126 VPValue *Escape = nullptr;
1127 if (PredVPBB == MiddleVPBB)
1128 Escape = optimizeLatchExitInductionUser(Plan, TypeInfo, PredVPBB,
1129 ExitIRI->getOperand(Idx),
1130 EndValues, PSE);
1131 else
1133 Plan, TypeInfo, PredVPBB, ExitIRI->getOperand(Idx), PSE);
1134 if (Escape)
1135 ExitIRI->setOperand(Idx, Escape);
1136 }
1137 }
1138 }
1139}
1140
1141 /// Remove redundant ExpandSCEVRecipes in \p Plan's entry block by replacing
1142 /// them with already existing recipes expanding the same SCEV expression.
// NOTE(review): scraped listing. The gap 1142 -> 1145 shows the function
// signature and the declaration of the SCEV2VPV map (orig 1143-1144; the map
// is used at 1152) were dropped, as was the iterated range of the for-loop
// (orig 1147). Restore from upstream before compiling.
1145
1146 for (VPRecipeBase &R :
1148 auto *ExpR = dyn_cast<VPExpandSCEVRecipe>(&R);
1149 if (!ExpR)
1150 continue;
1151
// First expansion of a SCEV is kept; later ones are folded into it.
1152 const auto &[V, Inserted] = SCEV2VPV.try_emplace(ExpR->getSCEV(), ExpR);
1153 if (Inserted)
1154 continue;
1155 ExpR->replaceAllUsesWith(V->second);
1156 ExpR->eraseFromParent();
1157 }
1158}
1159
// NOTE(review): scraped listing. The signature of this function (orig 1160)
// and the loop/initialisation that yields V and declares Seen (orig 1162)
// were dropped by the extractor -- V and Seen are used below without a visible
// definition. Restore from upstream before compiling.
// Worklist-driven deletion: erase a dead recipe, then reconsider its operands,
// which may have become dead in turn.
1161 SmallVector<VPValue *> WorkList;
1163 WorkList.push_back(V);
1164
1165 while (!WorkList.empty()) {
1166 VPValue *Cur = WorkList.pop_back_val();
1167 if (!Seen.insert(Cur).second)
1168 continue;
1169 VPRecipeBase *R = Cur->getDefiningRecipe();
1170 if (!R)
1171 continue;
1172 if (!isDeadRecipe(*R))
1173 continue;
1174 append_range(WorkList, R->operands());
1175 R->eraseFromParent();
1176 }
1177}
1178
1179 /// Get any instruction opcode or intrinsic ID data embedded in recipe \p R.
1180 /// Returns an optional pair, where the first element indicates whether it is
1181 /// an intrinsic ID.
// NOTE(review): scraped listing. The gaps 1182 -> 1184 and 1185 -> 1188 show
// the parameter list (orig 1183) and the template argument list of the first
// .Case<...> (orig 1186-1187) were dropped. Restore from upstream before
// compiling.
1182 static std::optional<std::pair<bool, unsigned>>
1184 return TypeSwitch<const VPSingleDefRecipe *,
1185 std::optional<std::pair<bool, unsigned>>>(R)
1188 [](auto *I) { return std::make_pair(false, I->getOpcode()); })
1189 .Case([](const VPWidenIntrinsicRecipe *I) {
1190 return std::make_pair(true, I->getVectorIntrinsicID());
1191 })
1192 .Case<VPVectorPointerRecipe, VPPredInstPHIRecipe>([](auto *I) {
1193 // For recipes that do not directly map to LLVM IR instructions,
1194 // assign opcodes after the last VPInstruction opcode (which is also
1195 // after the last IR Instruction opcode), based on the VPRecipeID.
1196 return std::make_pair(false,
1197 VPInstruction::OpsEnd + 1 + I->getVPRecipeID());
1198 })
1199 .Default([](auto *) { return std::nullopt; });
1200}
1201
1202 /// Try to fold \p R using InstSimplifyFolder. Will succeed and return a
1203 /// non-nullptr VPValue for a handled opcode or intrinsic ID if corresponding \p
1204 /// Operands are foldable live-ins.
// NOTE(review): scraped listing. Dropped lines (visible as numbering gaps):
// the first signature line (orig 1205), the declaration of Ops (orig 1213,
// used below), case labels/arguments in the switch (orig 1240, 1242, 1245)
// and the case labels preceding the i8 GEP fold (orig 1258-1259). Restore
// from upstream before compiling.
1206 ArrayRef<VPValue *> Operands,
1207 const DataLayout &DL,
1208 VPTypeAnalysis &TypeInfo) {
1209 auto OpcodeOrIID = getOpcodeOrIntrinsicID(&R);
1210 if (!OpcodeOrIID)
1211 return nullptr;
1212
// Collect the underlying IR values; bail out unless every operand is a
// live-in with an IR value.
1214 for (VPValue *Op : Operands) {
1215 if (!match(Op, m_LiveIn()))
1216 return nullptr;
1217 Value *V = Op->getUnderlyingValue();
1218 if (!V)
1219 return nullptr;
1220 Ops.push_back(V);
1221 }
1222
1223 auto FoldToIRValue = [&]() -> Value * {
1224 InstSimplifyFolder Folder(DL);
1225 if (OpcodeOrIID->first) {
1226 if (R.getNumOperands() != 2)
1227 return nullptr;
1228 unsigned ID = OpcodeOrIID->second;
1229 return Folder.FoldBinaryIntrinsic(ID, Ops[0], Ops[1],
1230 TypeInfo.inferScalarType(&R));
1231 }
1232 unsigned Opcode = OpcodeOrIID->second;
1233 if (Instruction::isBinaryOp(Opcode))
1234 return Folder.FoldBinOp(static_cast<Instruction::BinaryOps>(Opcode),
1235 Ops[0], Ops[1]);
1236 if (Instruction::isCast(Opcode))
1237 return Folder.FoldCast(static_cast<Instruction::CastOps>(Opcode), Ops[0],
1238 TypeInfo.inferScalarType(R.getVPSingleValue()));
1239 switch (Opcode) {
1241 return Folder.FoldSelect(Ops[0], Ops[1],
1243 case VPInstruction::Not:
1244 return Folder.FoldBinOp(Instruction::BinaryOps::Xor, Ops[0],
1246 case Instruction::Select:
1247 return Folder.FoldSelect(Ops[0], Ops[1], Ops[2]);
1248 case Instruction::ICmp:
1249 case Instruction::FCmp:
1250 return Folder.FoldCmp(cast<VPRecipeWithIRFlags>(R).getPredicate(), Ops[0],
1251 Ops[1]);
1252 case Instruction::GetElementPtr: {
1253 auto &RFlags = cast<VPRecipeWithIRFlags>(R);
1254 auto *GEP = cast<GetElementPtrInst>(RFlags.getUnderlyingInstr());
1255 return Folder.FoldGEP(GEP->getSourceElementType(), Ops[0],
1256 drop_begin(Ops), RFlags.getGEPNoWrapFlags());
1257 }
// Folds a pointer add as an i8 GEP (byte offset).
1260 return Folder.FoldGEP(IntegerType::getInt8Ty(TypeInfo.getContext()),
1261 Ops[0], Ops[1],
1262 cast<VPRecipeWithIRFlags>(R).getGEPNoWrapFlags());
1263 // An extract of a live-in is an extract of a broadcast, so return the
1264 // broadcasted element.
1265 case Instruction::ExtractElement:
1266 assert(!Ops[0]->getType()->isVectorTy() && "Live-ins should be scalar");
1267 return Ops[0];
1268 }
1269 return nullptr;
1270 };
1271
// A successful IR-level fold is re-imported into the plan as a live-in.
1272 if (Value *V = FoldToIRValue())
1273 return R.getParent()->getPlan()->getOrAddLiveIn(V);
1274 return nullptr;
1275}
1276
1277 /// Try to simplify VPSingleDefRecipe \p Def.
// NOTE(review): scraped listing. The signature (orig 1278, presumably
// "static void simplifyRecipe(VPSingleDefRecipe *Def, VPTypeAnalysis
// &TypeInfo) {" given the uses below) was dropped, and many interior lines
// are missing wherever the numbering jumps (e.g. 1352 -> 1355, 1369 -> 1371,
// 1392 -> 1395, 1412 -> 1414, 1444 -> 1446, 1479 -> 1481, 1506 -> 1508,
// 1511 -> 1513, 1536 -> 1538, 1551 -> 1553, 1562 -> 1564, 1572 -> 1574,
// 1589 -> 1591, 1597 -> 1599, 1612 -> 1615, 1619 -> 1621, 1633 -> 1635,
// 1646 -> 1648, 1662 -> 1664, 1669 -> 1672, 1697 -> 1699, 1706 -> 1708,
// 1712 -> 1715). Restore from upstream before compiling.
1279 VPlan *Plan = Def->getParent()->getPlan();
1280
1281 // Simplification of live-in IR values for SingleDef recipes using
1282 // InstSimplifyFolder.
1283 const DataLayout &DL = Plan->getDataLayout();
1284 if (VPValue *V = tryToFoldLiveIns(*Def, Def->operands(), DL, TypeInfo))
1285 return Def->replaceAllUsesWith(V);
1286
1287 // Fold PredPHI LiveIn -> LiveIn.
1288 if (auto *PredPHI = dyn_cast<VPPredInstPHIRecipe>(Def)) {
1289 VPValue *Op = PredPHI->getOperand(0);
1290 if (isa<VPIRValue>(Op))
1291 PredPHI->replaceAllUsesWith(Op);
1292 }
1293
1294 VPBuilder Builder(Def);
1295
1296 // Avoid replacing VPInstructions with underlying values with new
1297 // VPInstructions, as we would fail to create widen/replicate recipes from the
1298 // new VPInstructions without an underlying value, and miss out on some
1299 // transformations that only apply to widened/replicated recipes later, by
1300 // doing so.
1301 // TODO: We should also not replace non-VPInstructions like VPWidenRecipe with
1302 // VPInstructions without underlying values, as those will get skipped during
1303 // cost computation.
1304 bool CanCreateNewRecipe =
1305 !isa<VPInstruction>(Def) || !Def->getUnderlyingValue();
1306
1307 VPValue *A;
// trunc(ext(A)): drop the cast pair, or replace it by a single narrower cast.
1308 if (match(Def, m_Trunc(m_ZExtOrSExt(m_VPValue(A))))) {
1309 Type *TruncTy = TypeInfo.inferScalarType(Def);
1310 Type *ATy = TypeInfo.inferScalarType(A);
1311 if (TruncTy == ATy) {
1312 Def->replaceAllUsesWith(A);
1313 } else {
1314 // Don't replace a non-widened cast recipe with a widened cast.
1315 if (!isa<VPWidenCastRecipe>(Def))
1316 return;
1317 if (ATy->getScalarSizeInBits() < TruncTy->getScalarSizeInBits()) {
1318
1319 unsigned ExtOpcode = match(Def->getOperand(0), m_SExt(m_VPValue()))
1320 ? Instruction::SExt
1321 : Instruction::ZExt;
1322 auto *Ext = Builder.createWidenCast(Instruction::CastOps(ExtOpcode), A,
1323 TruncTy);
1324 if (auto *UnderlyingExt = Def->getOperand(0)->getUnderlyingValue()) {
1325 // UnderlyingExt has distinct return type, used to retain legacy cost.
1326 Ext->setUnderlyingValue(UnderlyingExt);
1327 }
1328 Def->replaceAllUsesWith(Ext);
1329 } else if (ATy->getScalarSizeInBits() > TruncTy->getScalarSizeInBits()) {
1330 auto *Trunc = Builder.createWidenCast(Instruction::Trunc, A, TruncTy);
1331 Def->replaceAllUsesWith(Trunc);
1332 }
1333 }
1334#ifndef NDEBUG
1335 // Verify that the cached type info for both A and its users is still
1336 // accurate by comparing it to freshly computed types.
1337 VPTypeAnalysis TypeInfo2(*Plan);
1338 assert(TypeInfo.inferScalarType(A) == TypeInfo2.inferScalarType(A));
1339 for (VPUser *U : A->users()) {
1340 auto *R = cast<VPRecipeBase>(U);
1341 for (VPValue *VPV : R->definedValues())
1342 assert(TypeInfo.inferScalarType(VPV) == TypeInfo2.inferScalarType(VPV));
1343 }
1344#endif
1345 }
1346
1347 // Simplify (X && Y) | (X && !Y) -> X.
1348 // TODO: Split up into simpler, modular combines: (X && Y) | (X && Z) into X
1349 // && (Y | Z) and (X | !X) into true. This requires queuing newly created
1350 // recipes to be visited during simplification.
1351 VPValue *X, *Y, *Z;
1352 if (match(Def,
1355 Def->replaceAllUsesWith(X);
1356 Def->eraseFromParent();
1357 return;
1358 }
1359
1360 // x | AllOnes -> AllOnes
1361 if (match(Def, m_c_BinaryOr(m_VPValue(X), m_AllOnes())))
1362 return Def->replaceAllUsesWith(
1363 Plan->getAllOnesValue(TypeInfo.inferScalarType(Def)));
1364
1365 // x | 0 -> x
1366 if (match(Def, m_c_BinaryOr(m_VPValue(X), m_ZeroInt())))
1367 return Def->replaceAllUsesWith(X);
1368
1369 // x | !x -> AllOnes
1371 return Def->replaceAllUsesWith(
1372 Plan->getAllOnesValue(TypeInfo.inferScalarType(Def)));
1373
1374 // x & 0 -> 0
1375 if (match(Def, m_c_BinaryAnd(m_VPValue(X), m_ZeroInt())))
1376 return Def->replaceAllUsesWith(
1377 Plan->getZero(TypeInfo.inferScalarType(Def)));
1378
1379 // x & AllOnes -> x
1380 if (match(Def, m_c_BinaryAnd(m_VPValue(X), m_AllOnes())))
1381 return Def->replaceAllUsesWith(X);
1382
1383 // x && false -> false
1384 if (match(Def, m_c_LogicalAnd(m_VPValue(X), m_False())))
1385 return Def->replaceAllUsesWith(Plan->getFalse());
1386
1387 // x && true -> x
1388 if (match(Def, m_c_LogicalAnd(m_VPValue(X), m_True())))
1389 return Def->replaceAllUsesWith(X);
1390
1391 // (x && y) | (x && z) -> x && (y | z)
1392 if (CanCreateNewRecipe &&
1395 // Simplify only if one of the operands has one use to avoid creating an
1396 // extra recipe.
1397 (!Def->getOperand(0)->hasMoreThanOneUniqueUser() ||
1398 !Def->getOperand(1)->hasMoreThanOneUniqueUser()))
1399 return Def->replaceAllUsesWith(
1400 Builder.createLogicalAnd(X, Builder.createOr(Y, Z)));
1401
1402 // x && (x && y) -> x && y
1403 if (match(Def, m_LogicalAnd(m_VPValue(X),
1405 return Def->replaceAllUsesWith(Def->getOperand(1));
1406
1407 // x && (y && x) -> x && y
1408 if (match(Def, m_LogicalAnd(m_VPValue(X),
1410 return Def->replaceAllUsesWith(Builder.createLogicalAnd(X, Y));
1411
1412 // x && !x -> 0
1414 return Def->replaceAllUsesWith(Plan->getFalse());
1415
1416 if (match(Def, m_Select(m_VPValue(), m_VPValue(X), m_Deferred(X))))
1417 return Def->replaceAllUsesWith(X);
1418
1419 // select c, false, true -> not c
1420 VPValue *C;
1421 if (CanCreateNewRecipe &&
1422 match(Def, m_Select(m_VPValue(C), m_False(), m_True())))
1423 return Def->replaceAllUsesWith(Builder.createNot(C));
1424
1425 // select !c, x, y -> select c, y, x
1426 if (match(Def, m_Select(m_Not(m_VPValue(C)), m_VPValue(X), m_VPValue(Y)))) {
1427 Def->setOperand(0, C);
1428 Def->setOperand(1, Y);
1429 Def->setOperand(2, X);
1430 return;
1431 }
1432
1433 if (match(Def, m_c_Add(m_VPValue(A), m_ZeroInt())))
1434 return Def->replaceAllUsesWith(A);
1435
1436 if (match(Def, m_c_Mul(m_VPValue(A), m_One())))
1437 return Def->replaceAllUsesWith(A);
1438
1439 if (match(Def, m_c_Mul(m_VPValue(A), m_ZeroInt())))
1440 return Def->replaceAllUsesWith(
1441 Plan->getZero(TypeInfo.inferScalarType(Def)));
1442
// mul A, -1 -> sub 0, A (negation).
1443 if (CanCreateNewRecipe && match(Def, m_c_Mul(m_VPValue(A), m_AllOnes()))) {
1444 // Preserve nsw from the Mul on the new Sub.
1446 false, cast<VPRecipeWithIRFlags>(Def)->hasNoSignedWrap()};
1447 return Def->replaceAllUsesWith(
1448 Builder.createSub(Plan->getZero(TypeInfo.inferScalarType(A)), A,
1449 Def->getDebugLoc(), "", NW));
1450 }
1451
// mul A, 2^k -> shl A, k.
1452 const APInt *APC;
1453 if (CanCreateNewRecipe && match(Def, m_c_Mul(m_VPValue(A), m_APInt(APC))) &&
1454 APC->isPowerOf2())
1455 return Def->replaceAllUsesWith(Builder.createNaryOp(
1456 Instruction::Shl,
1457 {A, Plan->getConstantInt(APC->getBitWidth(), APC->exactLogBase2())},
1458 *cast<VPRecipeWithIRFlags>(Def), Def->getDebugLoc()));
1459
1460 // Don't convert udiv to lshr inside a replicate region, as VPInstructions are
1461 // not allowed in them.
1462 const VPRegionBlock *ParentRegion = Def->getParent()->getParent();
1463 bool IsInReplicateRegion = ParentRegion && ParentRegion->isReplicator();
1464 if (CanCreateNewRecipe && !IsInReplicateRegion &&
1465 match(Def, m_UDiv(m_VPValue(A), m_APInt(APC))) && APC->isPowerOf2())
1466 return Def->replaceAllUsesWith(Builder.createNaryOp(
1467 Instruction::LShr,
1468 {A, Plan->getConstantInt(APC->getBitWidth(), APC->exactLogBase2())},
1469 *cast<VPRecipeWithIRFlags>(Def), Def->getDebugLoc()));
1470
1471 if (match(Def, m_Not(m_VPValue(A)))) {
// not (not A) -> A.
1472 if (match(A, m_Not(m_VPValue(A))))
1473 return Def->replaceAllUsesWith(A);
1474
1475 // Try to fold Not into compares by adjusting the predicate in-place.
1476 CmpPredicate Pred;
1477 if (match(A, m_Cmp(Pred, m_VPValue(), m_VPValue()))) {
1478 auto *Cmp = cast<VPRecipeWithIRFlags>(A);
// Only legal if every user of the compare is either a negation of it or a
// select on it; both can absorb the inverted predicate.
1479 if (all_of(Cmp->users(),
1481 m_Not(m_Specific(Cmp)),
1482 m_Select(m_Specific(Cmp), m_VPValue(), m_VPValue()))))) {
1483 Cmp->setPredicate(CmpInst::getInversePredicate(Pred));
1484 for (VPUser *U : to_vector(Cmp->users())) {
1485 auto *R = cast<VPSingleDefRecipe>(U);
1486 if (match(R, m_Select(m_Specific(Cmp), m_VPValue(X), m_VPValue(Y)))) {
1487 // select (cmp pred), x, y -> select (cmp inv_pred), y, x
1488 R->setOperand(1, Y);
1489 R->setOperand(2, X);
1490 } else {
1491 // not (cmp pred) -> cmp inv_pred
1492 assert(match(R, m_Not(m_Specific(Cmp))) && "Unexpected user");
1493 R->replaceAllUsesWith(Cmp);
1494 }
1495 }
1496 // If Cmp doesn't have a debug location, use the one from the negation,
1497 // to preserve the location.
1498 if (!Cmp->getDebugLoc() && Def->getDebugLoc())
1499 Cmp->setDebugLoc(Def->getDebugLoc());
1500 }
1501 }
1502 }
1503
1504 // Fold any-of (fcmp uno %A, %A), (fcmp uno %B, %B), ... ->
1505 // any-of (fcmp uno %A, %B), ...
1506 if (match(Def, m_AnyOf())) {
1508 VPRecipeBase *UnpairedCmp = nullptr;
1509 for (VPValue *Op : Def->operands()) {
1510 VPValue *X;
1511 if (Op->getNumUsers() > 1 ||
1513 m_Deferred(X)))) {
1514 NewOps.push_back(Op);
1515 } else if (!UnpairedCmp) {
1516 UnpairedCmp = Op->getDefiningRecipe();
1517 } else {
// Pair two single-operand NaN checks into one two-operand check.
1518 NewOps.push_back(Builder.createFCmp(CmpInst::FCMP_UNO,
1519 UnpairedCmp->getOperand(0), X));
1520 UnpairedCmp = nullptr;
1521 }
1522 }
1523
1524 if (UnpairedCmp)
1525 NewOps.push_back(UnpairedCmp->getVPSingleValue());
1526
1527 if (NewOps.size() < Def->getNumOperands()) {
1528 VPValue *NewAnyOf = Builder.createNaryOp(VPInstruction::AnyOf, NewOps);
1529 return Def->replaceAllUsesWith(NewAnyOf);
1530 }
1531 }
1532
1533 // Fold (fcmp uno %X, %X) or (fcmp uno %Y, %Y) -> fcmp uno %X, %Y
1534 // This is useful for fmax/fmin without fast-math flags, where we need to
1535 // check if any operand is NaN.
1536 if (CanCreateNewRecipe &&
1538 m_Deferred(X)),
1540 m_Deferred(Y))))) {
1541 VPValue *NewCmp = Builder.createFCmp(CmpInst::FCMP_UNO, X, Y);
1542 return Def->replaceAllUsesWith(NewCmp);
1543 }
1544
1545 // Remove redundant DerivedIVs, that is 0 + A * 1 -> A and 0 + 0 * x -> 0.
1546 if ((match(Def, m_DerivedIV(m_ZeroInt(), m_VPValue(A), m_One())) ||
1547 match(Def, m_DerivedIV(m_ZeroInt(), m_ZeroInt(), m_VPValue()))) &&
1548 TypeInfo.inferScalarType(Def->getOperand(1)) ==
1549 TypeInfo.inferScalarType(Def))
1550 return Def->replaceAllUsesWith(Def->getOperand(1));
1551
1553 m_One()))) {
1554 Type *WideStepTy = TypeInfo.inferScalarType(Def);
1555 if (TypeInfo.inferScalarType(X) != WideStepTy)
1556 X = Builder.createWidenCast(Instruction::Trunc, X, WideStepTy);
1557 Def->replaceAllUsesWith(X);
1558 return;
1559 }
1560
1561 // For i1 vp.merges produced by AnyOf reductions:
1562 // vp.merge true, (or x, y), x, evl -> vp.merge y, true, x, evl
1564 m_VPValue(X), m_VPValue())) &&
1566 TypeInfo.inferScalarType(Def)->isIntegerTy(1)) {
1567 Def->setOperand(1, Def->getOperand(0));
1568 Def->setOperand(0, Y);
1569 return;
1570 }
1571
1572 // Simplify MaskedCond with no block mask to its single operand.
1574 !cast<VPInstruction>(Def)->isMasked())
1575 return Def->replaceAllUsesWith(Def->getOperand(0));
1576
1577 // Look through ExtractLastLane.
1578 if (match(Def, m_ExtractLastLane(m_VPValue(A)))) {
1579 if (match(A, m_BuildVector())) {
// Last lane of a build-vector is simply its last operand.
1580 auto *BuildVector = cast<VPInstruction>(A);
1581 Def->replaceAllUsesWith(
1582 BuildVector->getOperand(BuildVector->getNumOperands() - 1));
1583 return;
1584 }
1585 if (Plan->hasScalarVFOnly())
1586 return Def->replaceAllUsesWith(A);
1587 }
1588
1589 // Look through ExtractPenultimateElement (BuildVector ....).
1591 auto *BuildVector = cast<VPInstruction>(Def->getOperand(0));
1592 Def->replaceAllUsesWith(
1593 BuildVector->getOperand(BuildVector->getNumOperands() - 2));
1594 return;
1595 }
1596
1597 uint64_t Idx;
1599 auto *BuildVector = cast<VPInstruction>(Def->getOperand(0));
1600 Def->replaceAllUsesWith(BuildVector->getOperand(Idx));
1601 return;
1602 }
1603
// A build-vector of identical operands is just a broadcast of that operand.
1604 if (match(Def, m_BuildVector()) && all_equal(Def->operands())) {
1605 Def->replaceAllUsesWith(
1606 Builder.createNaryOp(VPInstruction::Broadcast, Def->getOperand(0)));
1607 return;
1608 }
1609
1610 // Look through broadcast of single-scalar when used as select conditions; in
1611 // that case the scalar condition can be used directly.
1612 if (match(Def,
1615 "broadcast operand must be single-scalar");
1616 Def->setOperand(0, C);
1617 return;
1618 }
1619
1621 if (Def->getNumOperands() == 1) {
1622 Def->replaceAllUsesWith(Def->getOperand(0));
1623 return;
1624 }
1625 if (auto *Phi = dyn_cast<VPFirstOrderRecurrencePHIRecipe>(Def)) {
1626 if (all_equal(Phi->incoming_values()))
1627 Phi->replaceAllUsesWith(Phi->getOperand(0));
1628 }
1629 return;
1630 }
1631
1632 VPIRValue *IRV;
1633 if (Def->getNumOperands() == 1 &&
1635 return Def->replaceAllUsesWith(IRV);
1636
1637 // Some simplifications can only be applied after unrolling. Perform them
1638 // below.
1639 if (!Plan->isUnrolled())
1640 return;
1641
1642 // After unrolling, extract-lane may be used to extract values from multiple
1643 // scalar sources. Only simplify when extracting from a single scalar source.
1644 VPValue *LaneToExtract;
1645 if (match(Def, m_ExtractLane(m_VPValue(LaneToExtract), m_VPValue(A)))) {
1646 // Simplify extract-lane(%lane_num, %scalar_val) -> %scalar_val.
1648 return Def->replaceAllUsesWith(A);
1649
1650 // Simplify extract-lane with single source to extract-element.
1651 Def->replaceAllUsesWith(Builder.createNaryOp(
1652 Instruction::ExtractElement, {A, LaneToExtract}, Def->getDebugLoc()));
1653 return;
1654 }
1655
1656 // Look for cycles where Def is of the form:
1657 // X = phi(0, IVInc) ; used only by IVInc, or by IVInc and Inc = X + Y
1658 // IVInc = X + Step ; used by X and Def
1659 // Def = IVInc + Y
1660 // Fold the increment Y into the phi's start value, replace Def with IVInc,
1661 // and if Inc exists, replace it with X.
1662 if (match(Def, m_Add(m_Add(m_VPValue(X), m_VPValue()), m_VPValue(Y))) &&
1664 match(X, m_VPPhi(m_ZeroInt(), m_Specific(Def->getOperand(0))))) {
1665 auto *Phi = cast<VPPhi>(X);
1666 auto *IVInc = Def->getOperand(0);
1667 if (IVInc->getNumUsers() == 2) {
1668 // If Phi has a second user (besides IVInc's defining recipe), it must
1669 // be Inc = Phi + Y for the fold to apply.
1672 if (Phi->getNumUsers() == 1 || (Phi->getNumUsers() == 2 && Inc)) {
1673 Def->replaceAllUsesWith(IVInc);
1674 if (Inc)
1675 Inc->replaceAllUsesWith(Phi);
1676 Phi->setOperand(0, Y);
1677 return;
1678 }
1679 }
1680 }
1681
1682 // Simplify unrolled VectorPointer without offset, or with zero offset, to
1683 // just the pointer operand.
1684 if (auto *VPR = dyn_cast<VPVectorPointerRecipe>(Def))
1685 if (!VPR->getOffset() || match(VPR->getOffset(), m_ZeroInt()))
1686 return VPR->replaceAllUsesWith(VPR->getOperand(0));
1687
1688 // VPScalarIVSteps after unrolling can be replaced by their start value, if
1689 // the start index is zero and only the first lane 0 is demanded.
1690 if (auto *Steps = dyn_cast<VPScalarIVStepsRecipe>(Def)) {
1691 if (!Steps->getStartIndex() && vputils::onlyFirstLaneUsed(Steps)) {
1692 Steps->replaceAllUsesWith(Steps->getOperand(0));
1693 return;
1694 }
1695 }
1696 // Simplify redundant ReductionStartVector recipes after unrolling.
1697 VPValue *StartV;
1699 m_VPValue(StartV), m_VPValue(), m_VPValue()))) {
// Only in-loop reduction phis can consume the plain start value directly.
1700 Def->replaceUsesWithIf(StartV, [](const VPUser &U, unsigned Idx) {
1701 auto *PhiR = dyn_cast<VPReductionPHIRecipe>(&U);
1702 return PhiR && PhiR->isInLoop();
1703 });
1704 return;
1705 }
1706
1708 Def->replaceAllUsesWith(A);
1709 return;
1710 }
1711
1712 if (match(Def, m_ExtractLastLane(m_VPValue(A))) &&
1715 cast<VPReplicateRecipe>(A)->isSingleScalar())) &&
1716 all_of(A->users(),
1717 [Def, A](VPUser *U) { return U->usesScalars(A) || Def == U; })) {
1718 return Def->replaceAllUsesWith(A);
1719 }
1720
1721 if (Plan->getConcreteUF() == 1 && match(Def, m_ExtractLastPart(m_VPValue(A))))
1722 return Def->replaceAllUsesWith(A);
1723}
1724
// NOTE(review): scraped listing. The function header and the start of the
// reverse-post-order traversal expression (orig 1725-1726, presumably
// "void VPlanTransforms::simplifyRecipes(VPlan &Plan) { ... RPOT("), and the
// loop header over the traversal (orig 1729), were dropped by the extractor.
// Visible behavior: walks every block, applying simplifyRecipe to each
// single-def recipe.
1727 Plan.getEntry());
1728 VPTypeAnalysis TypeInfo(Plan);
1730 for (VPRecipeBase &R : make_early_inc_range(*VPBB))
1731 if (auto *Def = dyn_cast<VPSingleDefRecipe>(&R))
1732 simplifyRecipe(Def, TypeInfo);
1733 }
1734}
1735
1736 /// Reassociate (headermask && x) && y -> headermask && (x && y) to allow the
1737 /// header mask to be simplified further when tail folding, e.g. in
1738 /// optimizeEVLMasks.
// NOTE(review): scraped listing. The gap 1746 -> 1749 shows the loop body that
// seeds the worklist (orig 1747-1748, presumably "Worklist.push_back(U);")
// was dropped by the extractor; without it the seeding loop has no visible
// body. Restore from upstream before compiling.
1739 static void reassociateHeaderMask(VPlan &Plan) {
1740 VPValue *HeaderMask = vputils::findHeaderMask(Plan);
1741 if (!HeaderMask)
1742 return;
1743
1744 SmallVector<VPUser *> Worklist;
1745 for (VPUser *U : HeaderMask->users())
1746 if (match(U, m_LogicalAnd(m_Specific(HeaderMask), m_VPValue())))
1748
// Repeatedly rewrite (headermask && x) && y and requeue the users of the
// result, so chains of ANDs are fully reassociated.
1749 while (!Worklist.empty()) {
1750 auto *R = dyn_cast<VPSingleDefRecipe>(Worklist.pop_back_val());
1751 VPValue *X, *Y;
1752 if (!R || !match(R, m_LogicalAnd(
1753 m_LogicalAnd(m_Specific(HeaderMask), m_VPValue(X)),
1754 m_VPValue(Y))))
1755 continue;
1756 append_range(Worklist, R->users());
1757 VPBuilder Builder(R);
1758 R->replaceAllUsesWith(
1759 Builder.createLogicalAnd(HeaderMask, Builder.createLogicalAnd(X, Y)));
1760 }
1761}
1762
// NOTE(review): scraped listing. The function header (orig 1763) was dropped,
// as were several interior lines wherever the numbering jumps: the outer loop
// over blocks (orig 1771-1772), the start of the isa<...> filter whose tail is
// at 1775 (orig 1774), the creation of Extract used at 1800 (orig 1798), and
// the opcode list inside IntroducesBCastOf (orig 1840-1842). Restore from
// upstream before compiling.
1764 if (Plan.hasScalarVFOnly())
1765 return;
1766
1767 // Try to narrow wide and replicating recipes to single scalar recipes,
1768 // based on VPlan analysis. Only process blocks in the loop region for now,
1769 // without traversing into nested regions, as recipes in replicate regions
1770 // cannot be converted yet.
1773 for (VPRecipeBase &R : make_early_inc_range(reverse(*VPBB))) {
1775 VPWidenStoreRecipe>(&R))
1776 continue;
1777 auto *RepR = dyn_cast<VPReplicateRecipe>(&R);
1778 if (RepR && (RepR->isSingleScalar() || RepR->isPredicated()))
1779 continue;
1780
1781 // Convert an unmasked scatter with an uniform address into
1782 // extract-last-lane + scalar store.
1783 // TODO: Add a profitability check comparing the cost of a scatter vs.
1784 // extract + scalar store.
1785 auto *WidenStoreR = dyn_cast<VPWidenStoreRecipe>(&R);
1786 if (WidenStoreR && vputils::isSingleScalar(WidenStoreR->getAddr()) &&
1787 !WidenStoreR->isConsecutive()) {
1788 assert(!WidenStoreR->isReverse() &&
1789 "Not consecutive memory recipes shouldn't be reversed");
1790 VPValue *Mask = WidenStoreR->getMask();
1791
1792 // Only convert the scatter to a scalar store if it is unmasked.
1793 // TODO: Support converting scatter masked by the header mask to scalar
1794 // store.
1795 if (Mask)
1796 continue;
1797
1799 {WidenStoreR->getOperand(1)});
1800 Extract->insertBefore(WidenStoreR);
1801
1802 // TODO: Sink the scalar store recipe to middle block if possible.
1803 auto *ScalarStore = new VPReplicateRecipe(
1804 &WidenStoreR->getIngredient(), {Extract, WidenStoreR->getAddr()},
1805 true /*IsSingleScalar*/, nullptr /*Mask*/, {},
1806 *WidenStoreR /*Metadata*/);
1807 ScalarStore->insertBefore(WidenStoreR);
1808 WidenStoreR->eraseFromParent();
1809 continue;
1810 }
1811
1812 auto *RepOrWidenR = dyn_cast<VPRecipeWithIRFlags>(&R);
// Replicated stores to a single-scalar address: store only the last lane.
1813 if (RepR && isa<StoreInst>(RepR->getUnderlyingInstr()) &&
1814 vputils::isSingleScalar(RepR->getOperand(1))) {
1815 auto *Clone = new VPReplicateRecipe(
1816 RepOrWidenR->getUnderlyingInstr(), RepOrWidenR->operands(),
1817 true /*IsSingleScalar*/, nullptr /*Mask*/, *RepR /*Flags*/,
1818 *RepR /*Metadata*/, RepR->getDebugLoc());
1819 Clone->insertBefore(RepOrWidenR);
1820 VPBuilder Builder(Clone);
1821 VPValue *ExtractOp = Clone->getOperand(0);
1822 if (vputils::isUniformAcrossVFsAndUFs(RepR->getOperand(1)))
1823 ExtractOp =
1824 Builder.createNaryOp(VPInstruction::ExtractLastPart, ExtractOp);
1825 ExtractOp =
1826 Builder.createNaryOp(VPInstruction::ExtractLastLane, ExtractOp);
1827 Clone->setOperand(0, ExtractOp);
1828 RepR->eraseFromParent();
1829 continue;
1830 }
1831
1832 // Skip recipes that aren't single scalars.
1833 if (!RepOrWidenR || !vputils::isSingleScalar(RepOrWidenR))
1834 continue;
1835
1836 // Predicate to check if a user of Op introduces extra broadcasts.
1837 auto IntroducesBCastOf = [](const VPValue *Op) {
1838 return [Op](const VPUser *U) {
1839 if (auto *VPI = dyn_cast<VPInstruction>(U)) {
1843 VPI->getOpcode()))
1844 return false;
1845 }
1846 return !U->usesScalars(Op);
1847 };
1848 };
1849
// Narrow only when doing so does not merely shift broadcasts from the result
// onto the operands.
1850 if (any_of(RepOrWidenR->users(), IntroducesBCastOf(RepOrWidenR)) &&
1851 none_of(RepOrWidenR->operands(), [&](VPValue *Op) {
1852 if (any_of(
1853 make_filter_range(Op->users(), not_equal_to(RepOrWidenR)),
1854 IntroducesBCastOf(Op)))
1855 return false;
1856 // Non-constant live-ins require broadcasts, while constants do not
1857 // need explicit broadcasts.
1858 auto *IRV = dyn_cast<VPIRValue>(Op);
1859 bool LiveInNeedsBroadcast = IRV && !isa<Constant>(IRV->getValue());
1860 auto *OpR = dyn_cast<VPReplicateRecipe>(Op);
1861 return LiveInNeedsBroadcast || (OpR && OpR->isSingleScalar());
1862 }))
1863 continue;
1864
1865 auto *Clone = new VPReplicateRecipe(
1866 RepOrWidenR->getUnderlyingInstr(), RepOrWidenR->operands(),
1867 true /*IsSingleScalar*/, nullptr, *RepOrWidenR);
1868 Clone->insertBefore(RepOrWidenR);
1869 RepOrWidenR->replaceAllUsesWith(Clone);
1870 if (isDeadRecipe(*RepOrWidenR))
1871 RepOrWidenR->eraseFromParent();
1872 }
1873 }
1874}
1875
1876/// Try to see if all of \p Blend's masks share a common value logically and'ed
1877/// and remove it from the masks.
1879  if (Blend->isNormalized())
1880    return;
1881  VPValue *CommonEdgeMask;
1882  if (!match(Blend->getMask(0),
1883             m_LogicalAnd(m_VPValue(CommonEdgeMask), m_VPValue())))
1884    return;
  // Every mask must be a logical-and with the same common LHS operand;
  // bail out before mutating any mask if one does not match.
1885  for (unsigned I = 0; I < Blend->getNumIncomingValues(); I++)
1886    if (!match(Blend->getMask(I),
1887               m_LogicalAnd(m_Specific(CommonEdgeMask), m_VPValue())))
1888      return;
  // Strip the common operand, keeping only the RHS of each logical-and.
1889  for (unsigned I = 0; I < Blend->getNumIncomingValues(); I++)
1890    Blend->setMask(I, Blend->getMask(I)->getDefiningRecipe()->getOperand(1));
1891}
1892
1893/// Normalize and simplify VPBlendRecipes. Should be run after simplifyRecipes
1894/// to make sure the masks are simplified.
1895static void simplifyBlends(VPlan &Plan) {
1898    for (VPRecipeBase &R : make_early_inc_range(*VPBB)) {
1899      auto *Blend = dyn_cast<VPBlendRecipe>(&R);
1900      if (!Blend)
1901        continue;
1902
1903      removeCommonBlendMask(Blend);
1904
1905      // Try to remove redundant blend recipes.
1906      SmallPtrSet<VPValue *, 4> UniqueValues;
1907      if (Blend->isNormalized() || !match(Blend->getMask(0), m_False()))
1908        UniqueValues.insert(Blend->getIncomingValue(0));
1909      for (unsigned I = 1; I != Blend->getNumIncomingValues(); ++I)
1910        if (!match(Blend->getMask(I), m_False()))
1911          UniqueValues.insert(Blend->getIncomingValue(I));
1912
      // A blend whose reachable (non-false-masked) incoming values are all the
      // same single value is just a copy of that value.
1913      if (UniqueValues.size() == 1) {
1914        Blend->replaceAllUsesWith(*UniqueValues.begin());
1915        Blend->eraseFromParent();
1916        continue;
1917      }
1918
1919      if (Blend->isNormalized())
1920        continue;
1921
1922      // Normalize the blend so its first incoming value is used as the initial
1923      // value with the others blended into it.
1924
1925      unsigned StartIndex = 0;
1926      for (unsigned I = 0; I != Blend->getNumIncomingValues(); ++I) {
1927        // If a value's mask is used only by the blend then is can be deadcoded.
1928        // TODO: Find the most expensive mask that can be deadcoded, or a mask
1929        // that's used by multiple blends where it can be removed from them all.
1930        VPValue *Mask = Blend->getMask(I);
1931        if (Mask->getNumUsers() == 1 && !match(Mask, m_False())) {
1932          StartIndex = I;
1933          break;
1934        }
1935      }
1936
1937      SmallVector<VPValue *, 4> OperandsWithMask;
1938      OperandsWithMask.push_back(Blend->getIncomingValue(StartIndex));
1939
1940      for (unsigned I = 0; I != Blend->getNumIncomingValues(); ++I) {
1941        if (I == StartIndex)
1942          continue;
1943        OperandsWithMask.push_back(Blend->getIncomingValue(I));
1944        OperandsWithMask.push_back(Blend->getMask(I));
1945      }
1946
1947      auto *NewBlend =
1948          new VPBlendRecipe(cast_or_null<PHINode>(Blend->getUnderlyingValue()),
1949                            OperandsWithMask, *Blend, Blend->getDebugLoc());
1950      NewBlend->insertBefore(&R);
1951
      // NOTE(review): DeadMask (the start value's now-unneeded mask) is
      // presumably cleaned up on the line elided from this listing — confirm
      // against the full source.
1952      VPValue *DeadMask = Blend->getMask(StartIndex);
1953      Blend->replaceAllUsesWith(NewBlend);
1954      Blend->eraseFromParent();
1956
1957      /// Simplify BLEND %a, %b, Not(%mask) -> BLEND %b, %a, %mask.
1958      VPValue *NewMask;
1959      if (NewBlend->getNumOperands() == 3 &&
1960          match(NewBlend->getMask(1), m_Not(m_VPValue(NewMask)))) {
1961        VPValue *Inc0 = NewBlend->getOperand(0);
1962        VPValue *Inc1 = NewBlend->getOperand(1);
1963        VPValue *OldMask = NewBlend->getOperand(2);
1964        NewBlend->setOperand(0, Inc1);
1965        NewBlend->setOperand(1, Inc0);
1966        NewBlend->setOperand(2, NewMask);
        // The replaced Not-mask may now be dead; erase it to keep the plan tidy.
1967        if (OldMask->getNumUsers() == 0)
1968          cast<VPInstruction>(OldMask)->eraseFromParent();
1969      }
1970    }
1971  }
1972}
1973
1974/// Optimize the width of vector induction variables in \p Plan based on a known
1975/// constant Trip Count, \p BestVF and \p BestUF.
1977                                                     ElementCount BestVF,
1978                                                     unsigned BestUF) {
1979  // Only proceed if we have not completely removed the vector region.
1980  if (!Plan.getVectorLoopRegion())
1981    return false;
1982
  // Narrowing is only attempted for fixed VFs with a compile-time-constant
  // trip count; otherwise the required bit width cannot be computed.
1983  const APInt *TC;
1984  if (!BestVF.isFixed() || !match(Plan.getTripCount(), m_APInt(TC)))
1985    return false;
1986
1987  // Calculate the minimum power-of-2 bit width that can fit the known TC, VF
1988  // and UF. Returns at least 8.
1989  auto ComputeBitWidth = [](APInt TC, uint64_t Align) {
1990    APInt AlignedTC =
1993    APInt MaxVal = AlignedTC - 1;
1994    return std::max<unsigned>(PowerOf2Ceil(MaxVal.getActiveBits()), 8);
1995  };
1996  unsigned NewBitWidth =
1997      ComputeBitWidth(*TC, BestVF.getKnownMinValue() * BestUF);
1998
1999  LLVMContext &Ctx = Plan.getContext();
2000  auto *NewIVTy = IntegerType::get(Ctx, NewBitWidth);
2001
2002  bool MadeChange = false;
2003
2004  VPBasicBlock *HeaderVPBB = Plan.getVectorLoopRegion()->getEntryBasicBlock();
2005  for (VPRecipeBase &Phi : HeaderVPBB->phis()) {
2006    auto *WideIV = dyn_cast<VPWidenIntOrFpInductionRecipe>(&Phi);
2007
2008    // Currently only handle canonical IVs as it is trivial to replace the start
2009    // and stop values, and we currently only perform the optimization when the
2010    // IV has a single use.
2011    if (!WideIV || !WideIV->isCanonical() ||
2012        WideIV->hasMoreThanOneUniqueUser() ||
2013        NewIVTy == WideIV->getScalarType())
2014      continue;
2015
2016    // Currently only handle cases where the single user is a header-mask
2017    // comparison with the backedge-taken-count.
2018    VPUser *SingleUser = WideIV->getSingleUser();
2019    if (!SingleUser ||
2020        !match(SingleUser, m_ICmp(m_Specific(WideIV),
2023      continue;
2024
2025    // Update IV operands and comparison bound to use new narrower type.
2026    auto *NewStart = Plan.getZero(NewIVTy);
2027    WideIV->setStartValue(NewStart);
2028    auto *NewStep = Plan.getConstantInt(NewIVTy, 1);
2029    WideIV->setStepValue(NewStep);
2030
    // Truncate the backedge-taken count in the preheader so the header-mask
    // compare operates in the narrow IV type.
2031    auto *NewBTC = new VPWidenCastRecipe(
2032        Instruction::Trunc, Plan.getOrCreateBackedgeTakenCount(), NewIVTy,
2033        nullptr, VPIRFlags::getDefaultFlags(Instruction::Trunc));
2034    Plan.getVectorPreheader()->appendRecipe(NewBTC);
2035    auto *Cmp = cast<VPInstruction>(WideIV->getSingleUser());
2036    Cmp->setOperand(1, NewBTC);
2037
2038    MadeChange = true;
2039  }
2040
2041  return MadeChange;
2042}
2043
2044/// Return true if \p Cond is known to be true for given \p BestVF and \p
2045/// BestUF.
2047                                     ElementCount BestVF, unsigned BestUF,
  // NOTE(review): the guard on the elided lines matches an OR/any-of-style
  // condition — each operand is then checked recursively. Confirm against the
  // full source.
2050    return any_of(Cond->getDefiningRecipe()->operands(), [&Plan, BestVF, BestUF,
2051                                                          &PSE](VPValue *C) {
2052      return isConditionTrueViaVFAndUF(C, Plan, BestVF, BestUF, PSE);
2053    });
2054
2055  auto *CanIV = Plan.getVectorLoopRegion()->getCanonicalIV();
2057                     m_Specific(CanIV->getBackedgeValue()),
2058                     m_Specific(&Plan.getVectorTripCount()))))
2059    return false;
2060
2061  // The compare checks CanIV + VFxUF == vector trip count. The vector trip
2062  // count is not conveniently available as SCEV so far, so we compare directly
2063  // against the original trip count. This is stricter than necessary, as we
2064  // will only return true if the trip count == vector trip count.
2065  const SCEV *VectorTripCount =
2067  if (isa<SCEVCouldNotCompute>(VectorTripCount))
2068    VectorTripCount = vputils::getSCEVExprForVPValue(Plan.getTripCount(), PSE);
2069  assert(!isa<SCEVCouldNotCompute>(VectorTripCount) &&
2070         "Trip count SCEV must be computable");
2071  ScalarEvolution &SE = *PSE.getSE();
2072  ElementCount NumElements = BestVF.multiplyCoefficientBy(BestUF);
2073  const SCEV *C = SE.getElementCount(VectorTripCount->getType(), NumElements);
2074  return SE.isKnownPredicate(CmpInst::ICMP_EQ, VectorTripCount, C);
2075}
2076
2077/// Try to replace multiple active lane masks used for control flow with
2078/// a single, wide active lane mask instruction followed by multiple
2079/// extract subvector intrinsics. This applies to the active lane mask
2080/// instructions both in the loop and in the preheader.
2081/// Incoming values of all ActiveLaneMaskPHIs are updated to use the
2082/// new extracts from the first active lane mask, which has it's last
2083/// operand (multiplier) set to UF.
2085                                       unsigned UF) {
  // The transform only pays off for vector VFs with UF > 1 and must be
  // explicitly enabled.
2086  if (!EnableWideActiveLaneMask || !VF.isVector() || UF == 1)
2087    return false;
2088
2089  VPRegionBlock *VectorRegion = Plan.getVectorLoopRegion();
2090  VPBasicBlock *ExitingVPBB = VectorRegion->getExitingBasicBlock();
2091  auto *Term = &ExitingVPBB->back();
2092
2093  using namespace llvm::VPlanPatternMatch;
  // NOTE(review): the terminator match begins on an elided line; the visible
  // operands indicate an ActiveLaneMask-based latch branch — confirm against
  // the full source.
2095                         m_VPValue(), m_VPValue(), m_VPValue())))))
2096    return false;
2097
2098  auto *Header = cast<VPBasicBlock>(VectorRegion->getEntry());
2099  LLVMContext &Ctx = Plan.getContext();
2100
  // Helper: emit one vector.extract per unroll part, reading VF-sized slices
  // out of the wide lane mask ALM, inserted immediately after it.
2101  auto ExtractFromALM = [&](VPInstruction *ALM,
2102                            SmallVectorImpl<VPValue *> &Extracts) {
2103    DebugLoc DL = ALM->getDebugLoc();
2104    for (unsigned Part = 0; Part < UF; ++Part) {
2106      Ops.append({ALM, Plan.getConstantInt(64, VF.getKnownMinValue() * Part)});
2107      auto *Ext =
2108          new VPWidenIntrinsicRecipe(Intrinsic::vector_extract, Ops,
2109                                     IntegerType::getInt1Ty(Ctx), {}, {}, DL);
2110      Extracts[Part] = Ext;
2111      Ext->insertAfter(ALM);
2112    }
2113  };
2114
2115  // Create a list of each active lane mask phi, ordered by unroll part.
2117  for (VPRecipeBase &R : Header->phis()) {
2119    if (!Phi)
2120      continue;
2121    VPValue *Index = nullptr;
2122    match(Phi->getBackedgeValue(),
2124    assert(Index && "Expected index from ActiveLaneMask instruction");
2125
2126    uint64_t Part;
2127    if (match(Index,
2129                  m_VPValue(), m_Mul(m_VPValue(), m_ConstantInt(Part)))))
2130      Phis[Part] = Phi;
2131    else {
2132      // Anything other than a CanonicalIVIncrementForPart is part 0
2133      assert(!match(
2134          Index,
2136      Phis[0] = Phi;
2137    }
2138  }
2139
2140  assert(all_of(Phis, [](VPActiveLaneMaskPHIRecipe *Phi) { return Phi; }) &&
2141         "Expected one VPActiveLaneMaskPHIRecipe for each unroll part");
2142
2143  auto *EntryALM = cast<VPInstruction>(Phis[0]->getStartValue());
2144  auto *LoopALM = cast<VPInstruction>(Phis[0]->getBackedgeValue());
2145
2146  assert((EntryALM->getOpcode() == VPInstruction::ActiveLaneMask &&
2147          LoopALM->getOpcode() == VPInstruction::ActiveLaneMask) &&
2148         "Expected incoming values of Phi to be ActiveLaneMasks");
2149
2150  // When using wide lane masks, the return type of the get.active.lane.mask
2151  // intrinsic is VF x UF (last operand).
2152  VPValue *ALMMultiplier = Plan.getConstantInt(64, UF);
2153  EntryALM->setOperand(2, ALMMultiplier);
2154  LoopALM->setOperand(2, ALMMultiplier);
2155
2156  // Create UF x extract vectors and insert into preheader.
2157  SmallVector<VPValue *> EntryExtracts(UF);
2158  ExtractFromALM(EntryALM, EntryExtracts);
2159
2160  // Create UF x extract vectors and insert before the loop compare & branch,
2161  // updating the compare to use the first extract.
2162  SmallVector<VPValue *> LoopExtracts(UF);
2163  ExtractFromALM(LoopALM, LoopExtracts);
2164  VPInstruction *Not = cast<VPInstruction>(Term->getOperand(0));
2165  Not->setOperand(0, LoopExtracts[0]);
2166
2167  // Update the incoming values of active lane mask phis.
2168  for (unsigned Part = 0; Part < UF; ++Part) {
2169    Phis[Part]->setStartValue(EntryExtracts[Part]);
2170    Phis[Part]->setBackedgeValue(LoopExtracts[Part]);
2171  }
2172
2173  return true;
2174}
2175
2176/// Try to simplify the branch condition of \p Plan. This may restrict the
2177/// resulting plan to \p BestVF and \p BestUF.
2179                                              unsigned BestUF,
2181  VPRegionBlock *VectorRegion = Plan.getVectorLoopRegion();
2182  VPBasicBlock *ExitingVPBB = VectorRegion->getExitingBasicBlock();
2183  auto *Term = &ExitingVPBB->back();
2184  VPValue *Cond;
2185  auto m_CanIVInc = m_Add(m_VPValue(), m_Specific(&Plan.getVFxUF()));
2186  // Check if the branch condition compares the canonical IV increment (for main
2187  // loop), or the canonical IV increment plus an offset (for epilog loop).
2188  if (match(Term, m_BranchOnCount(
2189                      m_CombineOr(m_CanIVInc, m_c_Add(m_CanIVInc, m_LiveIn())),
2190                      m_VPValue())) ||
2192                             m_VPValue(), m_VPValue(), m_VPValue()))))) {
2193    // Try to simplify the branch condition if VectorTC <= VF * UF when the
2194    // latch terminator is BranchOnCount or BranchOnCond(Not(ActiveLaneMask)).
2195    const SCEV *VectorTripCount =
2197    if (isa<SCEVCouldNotCompute>(VectorTripCount))
2198      VectorTripCount =
2200    assert(!isa<SCEVCouldNotCompute>(VectorTripCount) &&
2201           "Trip count SCEV must be computable");
2202    ScalarEvolution &SE = *PSE.getSE();
2203    ElementCount NumElements = BestVF.multiplyCoefficientBy(BestUF);
2204    const SCEV *C = SE.getElementCount(VectorTripCount->getType(), NumElements);
2205    if (!SE.isKnownPredicate(CmpInst::ICMP_ULE, VectorTripCount, C))
2206      return false;
2207  } else if (match(Term, m_BranchOnCond(m_VPValue(Cond))) ||
2209    // For BranchOnCond, check if we can prove the condition to be true using VF
2210    // and UF.
2211    if (!isConditionTrueViaVFAndUF(Cond, Plan, BestVF, BestUF, PSE))
2212      return false;
2213  } else {
2214    return false;
2215  }
2216
2217  // The vector loop region only executes once. Convert terminator of the
2218  // exiting block to exit in the first iteration.
2219  if (match(Term, m_BranchOnTwoConds())) {
2220    Term->setOperand(1, Plan.getTrue());
2221    return true;
2222  }
2223
  // Replace the terminator with BranchOnCond(true) so the region exits on the
  // first iteration; later cleanup can remove the now-dead backedge.
2224  auto *BOC = new VPInstruction(VPInstruction::BranchOnCond, Plan.getTrue(), {},
2225                                {}, Term->getDebugLoc());
2226  ExitingVPBB->appendRecipe(BOC);
2227  Term->eraseFromParent();
2228
2229  return true;
2230}
2231
2232/// From the definition of llvm.experimental.get.vector.length,
2233/// VPInstruction::ExplicitVectorLength(%AVL) = %AVL when %AVL <= VF.
2237           vp_depth_first_deep(Plan.getEntry()))) {
2238    for (VPRecipeBase &R : *VPBB) {
2239      VPValue *AVL;
2240      if (!match(&R, m_EVL(m_VPValue(AVL))))
2241        continue;
2242
      // Use SCEV to prove AVL <= VF; only then is EVL(%AVL) == %AVL.
2243      const SCEV *AVLSCEV = vputils::getSCEVExprForVPValue(AVL, PSE);
2244      if (isa<SCEVCouldNotCompute>(AVLSCEV))
2245        continue;
2246      ScalarEvolution &SE = *PSE.getSE();
2247      const SCEV *VFSCEV = SE.getElementCount(AVLSCEV->getType(), VF);
2248      if (!SE.isKnownPredicate(CmpInst::ICMP_ULE, AVLSCEV, VFSCEV))
2249        continue;
2250
2252          AVL, Type::getInt32Ty(Plan.getContext()), AVLSCEV->getType(),
2253          R.getDebugLoc());
2254      if (Trunc != AVL) {
2255        auto *TruncR = cast<VPSingleDefRecipe>(Trunc);
2256        const DataLayout &DL = Plan.getDataLayout();
2257        VPTypeAnalysis TypeInfo(Plan);
2258        if (VPValue *Folded =
2259                tryToFoldLiveIns(*TruncR, TruncR->operands(), DL, TypeInfo))
2260          Trunc = Folded;
2261      }
      // Only one EVL recipe is simplified per invocation; return true to
      // signal that a change was made.
2262      R.getVPSingleValue()->replaceAllUsesWith(Trunc);
2263      return true;
2264    }
2265  }
2266  return false;
2267}
2268
2270                                         unsigned BestUF,
2272  assert(Plan.hasVF(BestVF) && "BestVF is not available in Plan");
2273  assert(Plan.hasUF(BestUF) && "BestUF is not available in Plan");
2274
  // Apply the VF/UF-specific rewrites in sequence; each returns whether it
  // changed the plan.
2275  bool MadeChange = tryToReplaceALMWithWideALM(Plan, BestVF, BestUF);
2276  MadeChange |= simplifyBranchConditionForVFAndUF(Plan, BestVF, BestUF, PSE);
2277  MadeChange |= optimizeVectorInductionWidthForTCAndVFUF(Plan, BestVF, BestUF);
2278
  // Once any VF/UF-specific rewrite has been applied, the plan is only valid
  // for the chosen VF, so pin it.
2279  if (MadeChange) {
2280    Plan.setVF(BestVF);
2281    assert(Plan.getConcreteUF() == BestUF && "BestUF must match the Plan's UF");
2282  }
2283}
2284
2285/// Sink users of \p FOR after the recipe defining the previous value \p
2286/// Previous of the recurrence. \returns true if all users of \p FOR could be
2287/// re-arranged as needed or false if it is not possible.
2288static bool
2290                                 VPRecipeBase *Previous,
2291                                 VPDominatorTree &VPDT) {
2292  // If Previous is a live-in (no defining recipe), it naturally dominates all
2293  // recipes in the loop, so no sinking is needed.
2294  if (!Previous)
2295    return true;
2296
2297  // Collect recipes that need sinking.
2300  Seen.insert(Previous);
2301  auto TryToPushSinkCandidate = [&](VPRecipeBase *SinkCandidate) {
2302    // The previous value must not depend on the users of the recurrence phi. In
2303    // that case, FOR is not a fixed order recurrence.
2304    if (SinkCandidate == Previous)
2305      return false;
2306
    // Header phis, already-seen recipes, and recipes already dominated by
    // Previous need no sinking.
2307    if (isa<VPHeaderPHIRecipe>(SinkCandidate) ||
2308        !Seen.insert(SinkCandidate).second ||
2309        VPDT.properlyDominates(Previous, SinkCandidate))
2310      return true;
2311
2312    if (cannotHoistOrSinkRecipe(*SinkCandidate))
2313      return false;
2314
2315    WorkList.push_back(SinkCandidate);
2316    return true;
2317  };
2318
2319  // Recursively sink users of FOR after Previous.
  // Worklist-style BFS over transitive users; WorkList grows as new sink
  // candidates are discovered.
2320  WorkList.push_back(FOR);
2321  for (unsigned I = 0; I != WorkList.size(); ++I) {
2322    VPRecipeBase *Current = WorkList[I];
2323    assert(Current->getNumDefinedValues() == 1 &&
2324           "only recipes with a single defined value expected");
2325
2326    for (VPUser *User : Current->getVPSingleValue()->users()) {
2327      if (!TryToPushSinkCandidate(cast<VPRecipeBase>(User)))
2328        return false;
2329    }
2330  }
2331
2332  // Keep recipes to sink ordered by dominance so earlier instructions are
2333  // processed first.
2334  sort(WorkList, [&VPDT](const VPRecipeBase *A, const VPRecipeBase *B) {
2335    return VPDT.properlyDominates(A, B);
2336  });
2337
2338  for (VPRecipeBase *SinkCandidate : WorkList) {
2339    if (SinkCandidate == FOR)
2340      continue;
2341
2342    SinkCandidate->moveAfter(Previous);
2343    Previous = SinkCandidate;
2344  }
2345  return true;
2346}
2347
2348/// Try to hoist \p Previous and its operands before all users of \p FOR.
2350                                        VPRecipeBase *Previous,
2351                                        VPDominatorTree &VPDT) {
2352  if (cannotHoistOrSinkRecipe(*Previous))
2353    return false;
2354
2355  // Collect recipes that need hoisting.
2356  SmallVector<VPRecipeBase *> HoistCandidates;
2358  VPRecipeBase *HoistPoint = nullptr;
2359  // Find the closest hoist point by looking at all users of FOR and selecting
2360  // the recipe dominating all other users.
2361  for (VPUser *U : FOR->users()) {
2362    auto *R = cast<VPRecipeBase>(U);
2363    if (!HoistPoint || VPDT.properlyDominates(R, HoistPoint))
2364      HoistPoint = R;
2365  }
2366  assert(all_of(FOR->users(),
2367                [&VPDT, HoistPoint](VPUser *U) {
2368                  auto *R = cast<VPRecipeBase>(U);
2369                  return HoistPoint == R ||
2370                         VPDT.properlyDominates(HoistPoint, R);
2371                }) &&
2372         "HoistPoint must dominate all users of FOR");
2373
  // Returns the defining recipe if \p HoistCandidateV still needs hoisting
  // above HoistPoint, or nullptr if no hoisting is required.
2374  auto NeedsHoisting = [HoistPoint, &VPDT,
2375                        &Visited](VPValue *HoistCandidateV) -> VPRecipeBase * {
2376    VPRecipeBase *HoistCandidate = HoistCandidateV->getDefiningRecipe();
2377    if (!HoistCandidate)
2378      return nullptr;
2379    VPRegionBlock *EnclosingLoopRegion =
2380        HoistCandidate->getParent()->getEnclosingLoopRegion();
2381    assert((!HoistCandidate->getRegion() ||
2382            HoistCandidate->getRegion() == EnclosingLoopRegion) &&
2383           "CFG in VPlan should still be flat, without replicate regions");
2384    // Hoist candidate was already visited, no need to hoist.
2385    if (!Visited.insert(HoistCandidate).second)
2386      return nullptr;
2387
2388    // Candidate is outside loop region or a header phi, dominates FOR users w/o
2389    // hoisting.
2390    if (!EnclosingLoopRegion || isa<VPHeaderPHIRecipe>(HoistCandidate))
2391      return nullptr;
2392
2393    // If we reached a recipe that dominates HoistPoint, we don't need to
2394    // hoist the recipe.
2395    if (VPDT.properlyDominates(HoistCandidate, HoistPoint))
2396      return nullptr;
2397    return HoistCandidate;
2398  };
2399
2400  if (!NeedsHoisting(Previous->getVPSingleValue()))
2401    return true;
2402
2403  // Recursively try to hoist Previous and its operands before all users of FOR.
  // Worklist-style BFS over operands of the hoist candidates.
2404  HoistCandidates.push_back(Previous);
2405
2406  for (unsigned I = 0; I != HoistCandidates.size(); ++I) {
2407    VPRecipeBase *Current = HoistCandidates[I];
2408    assert(Current->getNumDefinedValues() == 1 &&
2409           "only recipes with a single defined value expected");
2410    if (cannotHoistOrSinkRecipe(*Current))
2411      return false;
2412
2413    for (VPValue *Op : Current->operands()) {
2414      // If we reach FOR, it means the original Previous depends on some other
2415      // recurrence that in turn depends on FOR. If that is the case, we would
2416      // also need to hoist recipes involving the other FOR, which may break
2417      // dependencies.
2418      if (Op == FOR)
2419        return false;
2420
2421      if (auto *R = NeedsHoisting(Op)) {
2422        // Bail out if the recipe defines multiple values.
2423        // TODO: Hoisting such recipes requires additional handling.
2424        if (R->getNumDefinedValues() != 1)
2425          return false;
2426        HoistCandidates.push_back(R);
2427      }
2428    }
2429  }
2430
2431  // Order recipes to hoist by dominance so earlier instructions are processed
2432  // first.
2433  sort(HoistCandidates, [&VPDT](const VPRecipeBase *A, const VPRecipeBase *B) {
2434    return VPDT.properlyDominates(A, B);
2435  });
2436
2437  for (VPRecipeBase *HoistCandidate : HoistCandidates) {
2438    HoistCandidate->moveBefore(*HoistPoint->getParent(),
2439                               HoistPoint->getIterator());
2440  }
2441
2442  return true;
2443}
2444
2446                                        VPBuilder &LoopBuilder) {
2447  VPDominatorTree VPDT(Plan);
2448  VPTypeAnalysis TypeInfo(Plan);
2449
2451  for (VPRecipeBase &R :
2454      RecurrencePhis.push_back(FOR);
2455
2456  for (VPFirstOrderRecurrencePHIRecipe *FOR : RecurrencePhis) {
2458    VPRecipeBase *Previous = FOR->getBackedgeValue()->getDefiningRecipe();
2459    // Fixed-order recurrences do not contain cycles, so this loop is guaranteed
2460    // to terminate.
2461    while (auto *PrevPhi =
2463      assert(PrevPhi->getParent() == FOR->getParent());
2464      assert(SeenPhis.insert(PrevPhi).second);
2465      Previous = PrevPhi->getBackedgeValue()->getDefiningRecipe();
2466    }
2467
    // Try sinking the users after Previous first; fall back to hoisting
    // Previous before the users. If neither works the recurrence cannot be
    // handled.
2468    if (!sinkRecurrenceUsersAfterPrevious(FOR, Previous, VPDT) &&
2469        !hoistPreviousBeforeFORUsers(FOR, Previous, VPDT))
2470      return false;
2471
2472    // Introduce a recipe to combine the incoming and previous values of a
2473    // fixed-order recurrence.
2474    VPBasicBlock *InsertBlock =
2475        Previous ? Previous->getParent() : FOR->getParent();
2476    if (!Previous || isa<VPHeaderPHIRecipe>(Previous))
2477      LoopBuilder.setInsertPoint(InsertBlock, InsertBlock->getFirstNonPhi());
2478    else
2479      LoopBuilder.setInsertPoint(InsertBlock,
2480                                 std::next(Previous->getIterator()));
2481
    // NOTE(review): the splice opcode is on an elided line — presumably
    // FirstOrderRecurrenceSplice; confirm against the full source.
2482    auto *RecurSplice =
2484                                 {FOR, FOR->getBackedgeValue()});
2485
2486    FOR->replaceAllUsesWith(RecurSplice);
2487    // Set the first operand of RecurSplice to FOR again, after replacing
2488    // all users.
2489    RecurSplice->setOperand(0, FOR);
2490
2491    // Check for users extracting at the penultimate active lane of the FOR.
2492    // If only a single lane is active in the current iteration, we need to
2493    // select the last element from the previous iteration (from the FOR phi
2494    // directly).
2495    for (VPUser *U : RecurSplice->users()) {
2497                        m_Specific(RecurSplice))))
2498        continue;
2499
2501      VPValue *LastActiveLane = cast<VPInstruction>(U)->getOperand(0);
2502      Type *Ty = TypeInfo.inferScalarType(LastActiveLane);
2503      VPValue *Zero = Plan.getConstantInt(Ty, 0);
2504      VPValue *One = Plan.getConstantInt(Ty, 1);
      // Penultimate lane of the current iteration, vs. the last lane of the
      // previous iteration when only the first lane is active.
2505      VPValue *PenultimateIndex = B.createSub(LastActiveLane, One);
2506      VPValue *PenultimateLastIter =
2507          B.createNaryOp(VPInstruction::ExtractLane,
2508                         {PenultimateIndex, FOR->getBackedgeValue()});
2509      VPValue *LastPrevIter =
2510          B.createNaryOp(VPInstruction::ExtractLastLane, FOR);
2511
2512      VPValue *Cmp = B.createICmp(CmpInst::ICMP_EQ, LastActiveLane, Zero);
2513      VPValue *Sel = B.createSelect(Cmp, LastPrevIter, PenultimateLastIter);
2514      cast<VPInstruction>(U)->replaceAllUsesWith(Sel);
2515    }
2516  }
2517  return true;
2518}
2519
  // Drop poison-generating flags from all (transitive) users of reduction
  // phis of the visible arithmetic kinds (Add, Mul, Sub; the kind condition
  // continues on a line elided from this listing — confirm against the full
  // source).
2521  for (VPRecipeBase &R :
2523    auto *PhiR = dyn_cast<VPReductionPHIRecipe>(&R);
2524    if (!PhiR)
2525      continue;
2526    RecurKind RK = PhiR->getRecurrenceKind();
2527    if (RK != RecurKind::Add && RK != RecurKind::Mul && RK != RecurKind::Sub &&
2529      continue;
2530
2531    for (VPUser *U : collectUsersRecursively(PhiR))
2532      if (auto *RecWithFlags = dyn_cast<VPRecipeWithIRFlags>(U)) {
2533        RecWithFlags->dropPoisonGeneratingFlags();
2534      }
2535  }
2536}
2537
2538namespace {
// DenseMap traits used by the CSE pass below: two recipes hash/compare equal
// when they would produce the same value.
2539struct VPCSEDenseMapInfo : public DenseMapInfo<VPSingleDefRecipe *> {
  // Sentinels (empty/tombstone keys) only compare equal to themselves.
2540  static bool isSentinel(const VPSingleDefRecipe *Def) {
2541    return Def == getEmptyKey() || Def == getTombstoneKey();
2542  }
2543
2544  /// If recipe \p R will lower to a GEP with a non-i8 source element type,
2545  /// return that source element type.
2546  static Type *getGEPSourceElementType(const VPSingleDefRecipe *R) {
2547    // All VPInstructions that lower to GEPs must have the i8 source element
2548    // type (as they are PtrAdds), so we omit it.
2550        .Case([](const VPReplicateRecipe *I) -> Type * {
2551          if (auto *GEP = dyn_cast<GetElementPtrInst>(I->getUnderlyingValue()))
2552            return GEP->getSourceElementType();
2553          return nullptr;
2554        })
2555        .Case<VPVectorPointerRecipe, VPWidenGEPRecipe>(
2556            [](auto *I) { return I->getSourceElementType(); })
2557        .Default([](auto *) { return nullptr; });
2558  }
2559
2560  /// Returns true if recipe \p Def can be safely handed for CSE.
2561  static bool canHandle(const VPSingleDefRecipe *Def) {
2562    // We can extend the list of handled recipes in the future,
2563    // provided we account for the data embedded in them while checking for
2564    // equality or hashing.
2565    auto C = getOpcodeOrIntrinsicID(Def);
2566
2567    // The issue with (Insert|Extract)Value is that the index of the
2568    // insert/extract is not a proper operand in LLVM IR, and hence also not in
2569    // VPlan.
2570    if (!C || (!C->first && (C->second == Instruction::InsertValue ||
2571                             C->second == Instruction::ExtractValue)))
2572      return false;
2573
2574    // During CSE, we can only handle recipes that don't read from memory: if
2575    // they read from memory, there could be an intervening write to memory
2576    // before the next instance is CSE'd, leading to an incorrect result.
2577    return !Def->mayReadFromMemory();
2578  }
2579
2580  /// Hash the underlying data of \p Def.
2581  static unsigned getHashValue(const VPSingleDefRecipe *Def) {
2582    const VPlan *Plan = Def->getParent()->getPlan();
2583    VPTypeAnalysis TypeInfo(*Plan);
2584    hash_code Result = hash_combine(
2585        Def->getVPRecipeID(), getOpcodeOrIntrinsicID(Def),
2586        getGEPSourceElementType(Def), TypeInfo.inferScalarType(Def),
    // Mix in the predicate where present, so compares with different
    // predicates never collide as equal.
2588    if (auto *RFlags = dyn_cast<VPRecipeWithIRFlags>(Def))
2589      if (RFlags->hasPredicate())
2590        return hash_combine(Result, RFlags->getPredicate());
2591    return Result;
2592  }
2593
2594  /// Check equality of underlying data of \p L and \p R.
2595  static bool isEqual(const VPSingleDefRecipe *L, const VPSingleDefRecipe *R) {
2596    if (isSentinel(L) || isSentinel(R))
2597      return L == R;
2598    if (L->getVPRecipeID() != R->getVPRecipeID() ||
2600        getGEPSourceElementType(L) != getGEPSourceElementType(R) ||
2602        !equal(L->operands(), R->operands()))
2603      return false;
2605           "must have valid opcode info for both recipes");
2606    if (auto *LFlags = dyn_cast<VPRecipeWithIRFlags>(L))
2607      if (LFlags->hasPredicate() &&
2608          LFlags->getPredicate() !=
2609              cast<VPRecipeWithIRFlags>(R)->getPredicate())
2610        return false;
2611    // Recipes in replicate regions implicitly depend on predicate. If either
2612    // recipe is in a replicate region, only consider them equal if both have
2613    // the same parent.
2614    const VPRegionBlock *RegionL = L->getRegion();
2615    const VPRegionBlock *RegionR = R->getRegion();
2616    if (((RegionL && RegionL->isReplicator()) ||
2617         (RegionR && RegionR->isReplicator())) &&
2618        L->getParent() != R->getParent())
2619      return false;
2620    const VPlan *Plan = L->getParent()->getPlan();
2621    VPTypeAnalysis TypeInfo(*Plan);
2622    return TypeInfo.inferScalarType(L) == TypeInfo.inferScalarType(R);
2623  }
2624};
2625} // end anonymous namespace
2626
2627/// Perform a common-subexpression-elimination of VPSingleDefRecipes on the \p
2628/// Plan.
2630  VPDominatorTree VPDT(Plan);
2632
2634      Plan.getEntry());
2636    for (VPRecipeBase &R : *VPBB) {
2637      auto *Def = dyn_cast<VPSingleDefRecipe>(&R);
2638      if (!Def || !VPCSEDenseMapInfo::canHandle(Def))
2639        continue;
2640      if (VPSingleDefRecipe *V = CSEMap.lookup(Def)) {
2641        // V must dominate Def for a valid replacement.
2642        if (!VPDT.dominates(V->getParent(), VPBB))
2643          continue;
2644        // Only keep flags present on both V and Def.
2645        if (auto *RFlags = dyn_cast<VPRecipeWithIRFlags>(V))
2646          RFlags->intersectFlags(*cast<VPRecipeWithIRFlags>(Def));
2647        Def->replaceAllUsesWith(V);
2648        continue;
2649      }
      // First occurrence becomes the canonical recipe for later lookups.
2650      CSEMap[Def] = Def;
2651    }
2652  }
2653}
2654
2655/// Move loop-invariant recipes out of the vector loop region in \p Plan.
2656static void licm(VPlan &Plan) {
2657  VPBasicBlock *Preheader = Plan.getVectorPreheader();
2658
2659  // Hoist any loop invariant recipes from the vector loop region to the
2660  // preheader. Preform a shallow traversal of the vector loop region, to
2661  // exclude recipes in replicate regions. Since the top-level blocks in the
2662  // vector loop region are guaranteed to execute if the vector pre-header is,
2663  // we don't need to check speculation safety.
2664  VPRegionBlock *LoopRegion = Plan.getVectorLoopRegion();
2665  assert(Preheader->getSingleSuccessor() == LoopRegion &&
2666         "Expected vector prehader's successor to be the vector loop region");
2668           vp_depth_first_shallow(LoopRegion->getEntry()))) {
2669    for (VPRecipeBase &R : make_early_inc_range(*VPBB)) {
2671        continue;
      // A recipe is hoistable only if all its operands are defined outside
      // the loop regions.
2672      if (any_of(R.operands(), [](VPValue *Op) {
2673            return !Op->isDefinedOutsideLoopRegions();
2674          }))
2675        continue;
2676      R.moveBefore(*Preheader, Preheader->end());
2677    }
2678  }
2679
2680#ifndef NDEBUG
2681  VPDominatorTree VPDT(Plan);
2682#endif
2683  // Sink recipes with no users inside the vector loop region if all users are
2684  // in the same exit block of the region.
2685  // TODO: Extend to sink recipes from inner loops.
  // Blocks are visited in post-order and recipes bottom-up, so a recipe's
  // users inside the same block are considered before the recipe itself.
2687           vp_post_order_shallow(LoopRegion->getEntry()))) {
2688    for (VPRecipeBase &R : make_early_inc_range(reverse(*VPBB))) {
2690        continue;
2691
2692      if (auto *RepR = dyn_cast<VPReplicateRecipe>(&R)) {
2693        assert(!RepR->isPredicated() &&
2694               "Expected prior transformation of predicated replicates to "
2695               "replicate regions");
2696        // narrowToSingleScalarRecipes should have already maximally narrowed
2697        // replicates to single-scalar replicates.
2698        // TODO: When unrolling, replicateByVF doesn't handle sunk
2699        // non-single-scalar replicates correctly.
2700        if (!RepR->isSingleScalar())
2701          continue;
2702      }
2703
2704      // TODO: Use R.definedValues() instead of casting to VPSingleDefRecipe to
2705      // support recipes with multiple defined values (e.g., interleaved loads).
2706      auto *Def = cast<VPSingleDefRecipe>(&R);
2707      // Skip recipes without users as we cannot determine a sink block.
2708      // TODO: Clone sinkable recipes without users to all exit blocks to reduce
2709      // their execution frequency.
2710      if (Def->getNumUsers() == 0)
2711        continue;
2712
2713      VPBasicBlock *SinkBB = nullptr;
2714      // Cannot sink the recipe if any user
2715      // * is defined in any loop region, or
2716      // * is a phi, or
2717      // * multiple users in different blocks.
2718      if (any_of(Def->users(), [&SinkBB](VPUser *U) {
2719            auto *UserR = cast<VPRecipeBase>(U);
2720            VPBasicBlock *Parent = UserR->getParent();
2721            // TODO: If the user is a PHI node, we should check the block of
2722            // incoming value. Support PHI node users if needed.
2723            if (UserR->isPhi() || Parent->getEnclosingLoopRegion())
2724              return true;
2725            // TODO: Support sinking when users are in multiple blocks.
2726            if (SinkBB && SinkBB != Parent)
2727              return true;
2728            SinkBB = Parent;
2729            return false;
2730          }))
2731        continue;
2732
2733      // Only sink to dedicated exit blocks of the loop region.
2734      if (SinkBB->getSinglePredecessor() != LoopRegion)
2735        continue;
2736
2737      // TODO: This will need to be a check instead of a assert after
2738      // conditional branches in vectorized loops are supported.
2739      assert(VPDT.properlyDominates(VPBB, SinkBB) &&
2740             "Defining block must dominate sink block");
2741      // TODO: Clone the recipe if users are on multiple exit paths, instead of
2742      // just moving.
2743      Def->moveBefore(*SinkBB, SinkBB->getFirstNonPhi());
2744    }
2745  }
2746}
2747
2749 VPlan &Plan, const MapVector<Instruction *, uint64_t> &MinBWs) {
2750 if (Plan.hasScalarVFOnly())
2751 return;
2752 // Keep track of created truncates, so they can be re-used. Note that we
2753 // cannot use RAUW after creating a new truncate, as this could make
2754 // other uses have different types for their operands, making them invalidly
2755 // typed.
2757 VPTypeAnalysis TypeInfo(Plan);
2758 VPBasicBlock *PH = Plan.getVectorPreheader();
2761 for (VPRecipeBase &R : make_early_inc_range(*VPBB)) {
2764 continue;
2765
2766 VPValue *ResultVPV = R.getVPSingleValue();
2767 auto *UI = cast_or_null<Instruction>(ResultVPV->getUnderlyingValue());
// A missing MinBWs entry (lookup returns 0) means this result cannot be
// shrunk; leave the recipe untouched.
2768 unsigned NewResSizeInBits = MinBWs.lookup(UI);
2769 if (!NewResSizeInBits)
2770 continue;
2771
2772 // If the value wasn't vectorized, we must maintain the original scalar
2773 // type. Skip those here, after incrementing NumProcessedRecipes. Also
2774 // skip casts which do not need to be handled explicitly here, as
2775 // redundant casts will be removed during recipe simplification.
2777 continue;
2778
2779 Type *OldResTy = TypeInfo.inferScalarType(ResultVPV);
2780 unsigned OldResSizeInBits = OldResTy->getScalarSizeInBits();
2781 assert(OldResTy->isIntegerTy() && "only integer types supported");
2782 (void)OldResSizeInBits;
2783
2784 auto *NewResTy = IntegerType::get(Plan.getContext(), NewResSizeInBits);
2785
2786 // Any wrapping introduced by shrinking this operation shouldn't be
2787 // considered undefined behavior. So, we can't unconditionally copy
2788 // arithmetic wrapping flags to VPW.
2789 if (auto *VPW = dyn_cast<VPRecipeWithIRFlags>(&R))
2790 VPW->dropPoisonGeneratingFlags();
2791
2792 if (OldResSizeInBits != NewResSizeInBits &&
2793 !match(&R, m_ICmp(m_VPValue(), m_VPValue()))) {
2794 // Extend result to original width.
2795 auto *Ext = new VPWidenCastRecipe(
2796 Instruction::ZExt, ResultVPV, OldResTy, nullptr,
2797 VPIRFlags::getDefaultFlags(Instruction::ZExt));
2798 Ext->insertAfter(&R);
// Rewrite all existing users to consume the extended value, then point the
// extend's own operand back at the narrowed result (plain RAUW would have
// rewritten the extend's operand as well).
2799 ResultVPV->replaceAllUsesWith(Ext);
2800 Ext->setOperand(0, ResultVPV);
2801 assert(OldResSizeInBits > NewResSizeInBits && "Nothing to shrink?");
2802 } else {
2803 assert(match(&R, m_ICmp(m_VPValue(), m_VPValue())) &&
2804 "Only ICmps should not need extending the result.");
2805 }
2806
2807 assert(!isa<VPWidenStoreRecipe>(&R) && "stores cannot be narrowed");
2809 continue;
2810
2811 // Shrink operands by introducing truncates as needed.
// For selects, operand 0 is the i1 condition and must keep its type, so
// start truncating at operand 1 in that case.
2812 unsigned StartIdx =
2813 match(&R, m_Select(m_VPValue(), m_VPValue(), m_VPValue())) ? 1 : 0;
2814 for (unsigned Idx = StartIdx; Idx != R.getNumOperands(); ++Idx) {
2815 auto *Op = R.getOperand(Idx);
2816 unsigned OpSizeInBits =
2818 if (OpSizeInBits == NewResSizeInBits)
2819 continue;
2820 assert(OpSizeInBits > NewResSizeInBits && "nothing to truncate");
// Re-use a previously created truncate of the same operand, if any.
2821 auto [ProcessedIter, IterIsEmpty] = ProcessedTruncs.try_emplace(Op);
2822 if (!IterIsEmpty) {
2823 R.setOperand(Idx, ProcessedIter->second);
2824 continue;
2825 }
2826
// Truncates of live-in values go in the preheader; truncates of recipe
// results are inserted directly before the current recipe.
2827 VPBuilder Builder;
2828 if (isa<VPIRValue>(Op))
2829 Builder.setInsertPoint(PH);
2830 else
2831 Builder.setInsertPoint(&R);
2832 VPWidenCastRecipe *NewOp =
2833 Builder.createWidenCast(Instruction::Trunc, Op, NewResTy);
2834 ProcessedIter->second = NewOp;
2835 R.setOperand(Idx, NewOp);
2836 }
2837
2838 }
2839 }
2840 }
2841
// Fold away BranchOnCond terminators whose condition is a known constant:
// disconnect the never-taken successor and erase the branch. When
// \p OnlyLatches is set, only latch terminators are considered.
2842 void VPlanTransforms::removeBranchOnConst(VPlan &Plan, bool OnlyLatches) {
// The dominator tree is only needed to identify latch blocks.
2843 std::optional<VPDominatorTree> VPDT;
2844 if (OnlyLatches)
2845 VPDT.emplace(Plan);
2846
2849 VPValue *Cond;
2850 // Skip blocks that are not terminated by BranchOnCond.
2851 if (VPBB->empty() || !match(&VPBB->back(), m_BranchOnCond(m_VPValue(Cond))))
2852 continue;
2853
2854 if (OnlyLatches && !VPBlockUtils::isLatch(VPBB, *VPDT))
2855 continue;
2856
2857 assert(VPBB->getNumSuccessors() == 2 &&
2858 "Two successors expected for BranchOnCond");
// Successor 0 is taken on a true condition, successor 1 on false; a
// constant condition makes the opposite edge dead.
2859 unsigned RemovedIdx;
2860 if (match(Cond, m_True()))
2861 RemovedIdx = 1;
2862 else if (match(Cond, m_False()))
2863 RemovedIdx = 0;
2864 else
2865 continue;
2866
2867 VPBasicBlock *RemovedSucc =
2868 cast<VPBasicBlock>(VPBB->getSuccessors()[RemovedIdx]);
2869 assert(count(RemovedSucc->getPredecessors(), VPBB) == 1 &&
2870 "There must be a single edge between VPBB and its successor");
2871 // Values coming from VPBB into phi recipes of RemoveSucc are removed from
2872 // these recipes.
2873 for (VPRecipeBase &R : RemovedSucc->phis())
2874 cast<VPPhiAccessors>(&R)->removeIncomingValueFor(VPBB);
2875
2876 // Disconnect blocks and remove the terminator. RemovedSucc will be deleted
2877 // automatically on VPlan destruction if it becomes unreachable.
2878 VPBlockUtils::disconnectBlocks(VPBB, RemovedSucc);
2879 VPBB->back().eraseFromParent();
2880 }
2881 }
2882
2904
2905// Add a VPActiveLaneMaskPHIRecipe and related recipes to \p Plan and replace
2906// the loop terminator with a branch-on-cond recipe with the negated
2907// active-lane-mask as operand. Note that this turns the loop into an
2908// uncountable one. Only the existing terminator is replaced, all other existing
2909// recipes/users remain unchanged, except for poison-generating flags being
2910// dropped from the canonical IV increment. Return the created
2911// VPActiveLaneMaskPHIRecipe.
2912//
2913// The function adds the following recipes:
2914//
2915// vector.ph:
2916// %EntryInc = canonical-iv-increment-for-part CanonicalIVStart
2917// %EntryALM = active-lane-mask %EntryInc, TC
2918//
2919// vector.body:
2920// ...
2921// %P = active-lane-mask-phi [ %EntryALM, %vector.ph ], [ %ALM, %vector.body ]
2922// ...
2923// %InLoopInc = canonical-iv-increment-for-part CanonicalIVIncrement
2924// %ALM = active-lane-mask %InLoopInc, TC
2925// %Negated = Not %ALM
2926// branch-on-cond %Negated
2927//
2930 VPRegionBlock *TopRegion = Plan.getVectorLoopRegion();
2931 VPBasicBlock *EB = TopRegion->getExitingBasicBlock();
2932 auto *CanonicalIVPHI = TopRegion->getCanonicalIV();
2933 VPValue *StartV = CanonicalIVPHI->getStartValue();
2934
// The canonical IV increment also feeds the in-loop lane mask created below.
2935 auto *CanonicalIVIncrement =
2936 cast<VPInstruction>(CanonicalIVPHI->getBackedgeValue());
2937 // TODO: Check if dropping the flags is needed.
2938 CanonicalIVIncrement->dropPoisonGeneratingFlags();
2939 DebugLoc DL = CanonicalIVIncrement->getDebugLoc();
2940 // We can't use StartV directly in the ActiveLaneMask VPInstruction, since
2941 // we have to take unrolling into account. Each part needs to start at
2942 // Part * VF
2943 auto *VecPreheader = Plan.getVectorPreheader();
2944 VPBuilder Builder(VecPreheader);
2945
2946 // Create the ActiveLaneMask instruction using the correct start values.
2947 VPValue *TC = Plan.getTripCount();
2948 VPValue *VF = &Plan.getVF();
2949
2950 auto *EntryIncrement = Builder.createOverflowingOp(
2951 VPInstruction::CanonicalIVIncrementForPart, {StartV, VF}, {false, false},
2952 DL, "index.part.next");
2953
2954 // Create the active lane mask instruction in the VPlan preheader.
2955 VPValue *ALMMultiplier =
2956 Plan.getConstantInt(TopRegion->getCanonicalIVType(), 1)
2957 auto *EntryALM = Builder.createNaryOp(VPInstruction::ActiveLaneMask,
2958 {EntryIncrement, TC, ALMMultiplier}, DL,
2959 "active.lane.mask.entry");
2960
2961 // Now create the ActiveLaneMaskPhi recipe in the main loop using the
2962 // preheader ActiveLaneMask instruction.
2963 auto *LaneMaskPhi =
2965 LaneMaskPhi->insertAfter(CanonicalIVPHI);
2966
2967 // Create the active lane mask for the next iteration of the loop before the
2968 // original terminator.
2969 VPRecipeBase *OriginalTerminator = EB->getTerminator();
2970 Builder.setInsertPoint(OriginalTerminator);
2971 auto *InLoopIncrement = Builder.createOverflowingOp(
2973 {CanonicalIVIncrement, &Plan.getVF()}, {false, false}, DL);
2974 auto *ALM = Builder.createNaryOp(VPInstruction::ActiveLaneMask,
2975 {InLoopIncrement, TC, ALMMultiplier}, DL,
2976 "active.lane.mask.next");
// Complete the phi: [ EntryALM, preheader ], [ ALM, loop latch ].
2977 LaneMaskPhi->addOperand(ALM);
2978
2979 // Replace the original terminator with BranchOnCond. We have to invert the
2980 // mask here because a true condition means jumping to the exit block.
2981 auto *NotMask = Builder.createNot(ALM, DL);
2982 Builder.createNaryOp(VPInstruction::BranchOnCond, {NotMask}, DL);
2983 OriginalTerminator->eraseFromParent();
2984 return LaneMaskPhi;
2985 }
2986
2988 bool UseActiveLaneMaskForControlFlow) {
2989 VPRegionBlock *LoopRegion = Plan.getVectorLoopRegion();
2990 auto *FoundWidenCanonicalIVUser = find_if(
2992 assert(FoundWidenCanonicalIVUser &&
2993 "Must have widened canonical IV when tail folding!");
2994 VPSingleDefRecipe *HeaderMask = vputils::findHeaderMask(Plan);
2995 auto *WideCanonicalIV =
2996 cast<VPWidenCanonicalIVRecipe>(*FoundWidenCanonicalIVUser);
2997 VPSingleDefRecipe *LaneMask;
2998 if (UseActiveLaneMaskForControlFlow) {
2999 LaneMask = addVPLaneMaskPhiAndUpdateExitBranch(Plan);
3000 } else {
3001 VPBuilder B = VPBuilder::getToInsertAfter(WideCanonicalIV);
3002 VPValue *ALMMultiplier =
3003 Plan.getConstantInt(LoopRegion->getCanonicalIVType(), 1);
3004 LaneMask =
3005 B.createNaryOp(VPInstruction::ActiveLaneMask,
3006 {WideCanonicalIV, Plan.getTripCount(), ALMMultiplier},
3007 nullptr, "active.lane.mask");
3008 }
3009
3010 // Walk users of WideCanonicalIV and replace the header mask of the form
3011 // (ICMP_ULE, WideCanonicalIV, backedge-taken-count) with an active-lane-mask,
3012 // removing the old one to ensure there is always only a single header mask.
3013 HeaderMask->replaceAllUsesWith(LaneMask);
3014 HeaderMask->eraseFromParent();
3015}
3016
// Matcher that strips a known mask \p In from a mask expression: matches
// either \p In itself (setting \p Out to nullptr) or (logical-and \p In,
// \p Out), capturing the remaining mask in \p Out.
3017 template <typename Op0_t, typename Op1_t> struct RemoveMask_match {
3018 Op0_t In;
3020
3021 RemoveMask_match(const Op0_t &In, Op1_t &Out) : In(In), Out(Out) {}
3022
3023 template <typename OpTy> bool match(OpTy *V) const {
// Exact match of the mask alone: no remainder.
3024 if (m_Specific(In).match(V)) {
3025 Out = nullptr;
3026 return true;
3027 }
3028 return m_LogicalAnd(m_Specific(In), m_VPValue(Out)).match(V);
3029 }
3030 };
3031
3032/// Match a specific mask \p In, or a combination of it (logical-and In, Out).
3033/// Returns the remaining part \p Out if so, or nullptr otherwise.
3034template <typename Op0_t, typename Op1_t>
3035static inline RemoveMask_match<Op0_t, Op1_t> m_RemoveMask(const Op0_t &In,
3036 Op1_t &Out) {
3037 return RemoveMask_match<Op0_t, Op1_t>(In, Out);
3038}
3039
3040/// Try to optimize a \p CurRecipe masked by \p HeaderMask to a corresponding
3041/// EVL-based recipe without the header mask. Returns nullptr if no EVL-based
3042/// recipe could be created.
3043/// \p HeaderMask Header Mask.
3044/// \p CurRecipe Recipe to be transform.
3045/// \p TypeInfo VPlan-based type analysis.
3046/// \p EVL The explicit vector length parameter of vector-predication
3047/// intrinsics.
3049 VPRecipeBase &CurRecipe,
3050 VPTypeAnalysis &TypeInfo, VPValue &EVL) {
3051 VPlan *Plan = CurRecipe.getParent()->getPlan();
3052 DebugLoc DL = CurRecipe.getDebugLoc();
3053 VPValue *Addr, *Mask, *EndPtr;
3054
3055 /// Adjust any end pointers so that they point to the end of EVL lanes not VF.
3056 auto AdjustEndPtr = [&CurRecipe, &EVL](VPValue *EndPtr) {
3057 auto *EVLEndPtr = cast<VPVectorEndPointerRecipe>(EndPtr)->clone();
3058 EVLEndPtr->insertBefore(&CurRecipe);
3059 EVLEndPtr->setOperand(1, &EVL);
3060 return EVLEndPtr;
3061 };
3062
// Non-reverse masked load -> EVL-based widened load.
3063 if (match(&CurRecipe,
3064 m_MaskedLoad(m_VPValue(Addr), m_RemoveMask(HeaderMask, Mask))) &&
3065 !cast<VPWidenLoadRecipe>(CurRecipe).isReverse())
3066 return new VPWidenLoadEVLRecipe(cast<VPWidenLoadRecipe>(CurRecipe), Addr,
3067 EVL, Mask);
3068
3069 VPValue *ReversedVal;
// Reverse masked load: load through the EVL-adjusted end pointer, then
// reverse the loaded value with vp.reverse over EVL lanes.
3070 if (match(&CurRecipe, m_Reverse(m_VPValue(ReversedVal))) &&
3071 match(ReversedVal,
3072 m_MaskedLoad(m_VPValue(EndPtr), m_RemoveMask(HeaderMask, Mask))) &&
3073 match(EndPtr, m_VecEndPtr(m_VPValue(Addr), m_Specific(&Plan->getVF()))) &&
3074 cast<VPWidenLoadRecipe>(ReversedVal)->isReverse()) {
3075 auto *LoadR = new VPWidenLoadEVLRecipe(
3076 *cast<VPWidenLoadRecipe>(ReversedVal), AdjustEndPtr(EndPtr), EVL, Mask);
3077 LoadR->insertBefore(&CurRecipe);
3078 return new VPWidenIntrinsicRecipe(
3079 Intrinsic::experimental_vp_reverse, {LoadR, Plan->getTrue(), &EVL},
3080 TypeInfo.inferScalarType(LoadR), {}, {}, DL);
3081 }
3082
3083 VPValue *StoredVal;
// Non-reverse masked store -> EVL-based widened store.
3084 if (match(&CurRecipe, m_MaskedStore(m_VPValue(Addr), m_VPValue(StoredVal),
3085 m_RemoveMask(HeaderMask, Mask))) &&
3086 !cast<VPWidenStoreRecipe>(CurRecipe).isReverse())
3087 return new VPWidenStoreEVLRecipe(cast<VPWidenStoreRecipe>(CurRecipe), Addr,
3088 StoredVal, EVL, Mask);
3089
// Reverse masked store: vp.reverse the value over EVL lanes first, then
// store through the EVL-adjusted end pointer.
3090 if (match(&CurRecipe,
3091 m_MaskedStore(m_VPValue(EndPtr), m_Reverse(m_VPValue(ReversedVal)),
3092 m_RemoveMask(HeaderMask, Mask))) &&
3093 match(EndPtr, m_VecEndPtr(m_VPValue(Addr), m_Specific(&Plan->getVF()))) &&
3094 cast<VPWidenStoreRecipe>(CurRecipe).isReverse()) {
3095 auto *NewReverse = new VPWidenIntrinsicRecipe(
3096 Intrinsic::experimental_vp_reverse,
3097 {ReversedVal, Plan->getTrue(), &EVL},
3098 TypeInfo.inferScalarType(ReversedVal), {}, {}, DL);
3099 NewReverse->insertBefore(&CurRecipe);
3100 return new VPWidenStoreEVLRecipe(cast<VPWidenStoreRecipe>(CurRecipe),
3101 AdjustEndPtr(EndPtr), NewReverse, EVL,
3102 Mask);
3103 }
3104
// Conditional reduction whose condition contains the header mask.
3105 if (auto *Rdx = dyn_cast<VPReductionRecipe>(&CurRecipe))
3106 if (Rdx->isConditional() &&
3107 match(Rdx->getCondOp(), m_RemoveMask(HeaderMask, Mask)))
3108 return new VPReductionEVLRecipe(*Rdx, EVL, Mask);
3109
// Masked interleave group whose mask contains the header mask.
3110 if (auto *Interleave = dyn_cast<VPInterleaveRecipe>(&CurRecipe))
3111 if (Interleave->getMask() &&
3112 match(Interleave->getMask(), m_RemoveMask(HeaderMask, Mask)))
3113 return new VPInterleaveEVLRecipe(*Interleave, EVL, Mask);
3114
3115 VPValue *LHS, *RHS;
// select(header-mask, LHS, RHS) -> vp.merge with an all-true mask.
3116 if (match(&CurRecipe,
3117 m_Select(m_Specific(HeaderMask), m_VPValue(LHS), m_VPValue(RHS))))
3118 return new VPWidenIntrinsicRecipe(
3119 Intrinsic::vp_merge, {Plan->getTrue(), LHS, RHS, &EVL},
3120 TypeInfo.inferScalarType(LHS), {}, {}, DL);
3121
// select(header-mask && Mask, LHS, RHS) -> vp.merge on the remaining Mask.
3122 if (match(&CurRecipe, m_Select(m_RemoveMask(HeaderMask, Mask), m_VPValue(LHS),
3123 m_VPValue(RHS))))
3124 return new VPWidenIntrinsicRecipe(
3125 Intrinsic::vp_merge, {Mask, LHS, RHS, &EVL},
3126 TypeInfo.inferScalarType(LHS), {}, {}, DL);
3127
// last-active-lane(header-mask) is simply EVL - 1, with EVL converted to
// the result type first.
3128 if (match(&CurRecipe, m_LastActiveLane(m_Specific(HeaderMask)))) {
3129 Type *Ty = TypeInfo.inferScalarType(CurRecipe.getVPSingleValue());
3130 VPValue *ZExt = VPBuilder(&CurRecipe)
3132 &EVL, Ty, TypeInfo.inferScalarType(&EVL), DL);
3133 return new VPInstruction(
3134 Instruction::Sub, {ZExt, Plan->getConstantInt(Ty, 1)},
3135 VPIRFlags::getDefaultFlags(Instruction::Sub), {}, DL);
3136 }
3137
// No EVL-based equivalent for this recipe.
3138 return nullptr;
3139 }
3140
3141/// Optimize away any EVL-based header masks to VP intrinsic based recipes.
3142/// The transforms here need to preserve the original semantics.
3144 // Find the EVL-based header mask if it exists: icmp ult step-vector, EVL
3145 VPValue *HeaderMask = nullptr, *EVL = nullptr;
3148 m_VPValue(EVL))) &&
3149 match(EVL, m_EVL(m_VPValue()))) {
3150 HeaderMask = R.getVPSingleValue();
3151 break;
3152 }
3153 }
3154 if (!HeaderMask)
3155 return;
3156
3157 VPTypeAnalysis TypeInfo(Plan);
3158 SmallVector<VPRecipeBase *> OldRecipes;
// First pass: convert users of the header mask into EVL-based recipes where
// a direct equivalent exists; the replaced recipes are erased later.
3159 for (VPUser *U : collectUsersRecursively(HeaderMask)) {
3161 if (auto *NewR = optimizeMaskToEVL(HeaderMask, *R, TypeInfo, *EVL)) {
3162 NewR->insertBefore(R);
3163 for (auto [Old, New] :
3164 zip_equal(R->definedValues(), NewR->definedValues()))
3165 Old->replaceAllUsesWith(New);
3166 OldRecipes.push_back(R);
3167 }
3168 }
3169
3170 // Replace remaining (HeaderMask && Mask) with vp.merge (True, Mask,
3171 // False, EVL)
3172 for (VPUser *U : collectUsersRecursively(HeaderMask)) {
3173 VPValue *Mask;
3174 if (match(U, m_LogicalAnd(m_Specific(HeaderMask), m_VPValue(Mask)))) {
3175 auto *LogicalAnd = cast<VPInstruction>(U);
3176 auto *Merge = new VPWidenIntrinsicRecipe(
3177 Intrinsic::vp_merge, {Plan.getTrue(), Mask, Plan.getFalse(), EVL},
3178 TypeInfo.inferScalarType(Mask), {}, {}, LogicalAnd->getDebugLoc());
3179 Merge->insertBefore(LogicalAnd);
3180 LogicalAnd->replaceAllUsesWith(Merge);
3181 OldRecipes.push_back(LogicalAnd);
3182 }
3183 }
3184
3185 // Erase old recipes at the end so we don't invalidate TypeInfo.
3186 for (VPRecipeBase *R : reverse(OldRecipes)) {
// Snapshot operands before erasing so now-unused inputs can be cleaned up.
3187 SmallVector<VPValue *> PossiblyDead(R->operands());
3188 R->eraseFromParent();
3190 for (VPValue *Op : PossiblyDead)
3191 }
3192 }
3193
3194/// After replacing the canonical IV with a EVL-based IV, fixup recipes that use
3195/// VF to use the EVL instead to avoid incorrect updates on the penultimate
3196/// iteration.
3197 static void fixupVFUsersForEVL(VPlan &Plan, VPValue &EVL) {
3198 VPTypeAnalysis TypeInfo(Plan);
3199 VPRegionBlock *LoopRegion = Plan.getVectorLoopRegion();
3200 VPBasicBlock *Header = LoopRegion->getEntryBasicBlock();
3201
3202 assert(all_of(Plan.getVF().users(),
3205 "User of VF that we can't transform to EVL.");
3206 Plan.getVF().replaceUsesWithIf(&EVL, [](VPUser &U, unsigned Idx) {
3208 });
3209
3210 assert(all_of(Plan.getVFxUF().users(),
3211 [&LoopRegion, &Plan](VPUser *U) {
3212 return match(U,
3213 m_c_Add(m_Specific(LoopRegion->getCanonicalIV()),
3214 m_Specific(&Plan.getVFxUF()))) ||
3215 isa<VPWidenPointerInductionRecipe>(U);
3216 }) &&
3217 "Only users of VFxUF should be VPWidenPointerInductionRecipe and the "
3218 "increment of the canonical induction.");
3219 Plan.getVFxUF().replaceUsesWithIf(&EVL, [](VPUser &U, unsigned Idx) {
3220 // Only replace uses in VPWidenPointerInductionRecipe; The increment of the
3221 // canonical induction must not be updated.
3223 });
3224
3225 // Create a scalar phi to track the previous EVL if fixed-order recurrence is
3226 // contained.
3227 bool ContainsFORs =
3229 if (ContainsFORs) {
3230 // TODO: Use VPInstruction::ExplicitVectorLength to get maximum EVL.
3231 VPValue *MaxEVL = &Plan.getVF();
3232 // Emit VPScalarCastRecipe in preheader if VF is not a 32 bits integer.
3233 VPBuilder Builder(LoopRegion->getPreheaderVPBB());
3234 MaxEVL = Builder.createScalarZExtOrTrunc(
3235 MaxEVL, Type::getInt32Ty(Plan.getContext()),
3236 TypeInfo.inferScalarType(MaxEVL), DebugLoc::getUnknown());
3237
// prev.evl starts at the maximum EVL and carries last iteration's EVL
// around the backedge.
3238 Builder.setInsertPoint(Header, Header->getFirstNonPhi());
3239 VPValue *PrevEVL = Builder.createScalarPhi(
3240 {MaxEVL, &EVL}, DebugLoc::getUnknown(), "prev.evl");
3241
// Rewrite first-order-recurrence splices into vp.splice using the previous
// and current EVL.
3244 for (VPRecipeBase &R : *VPBB) {
3245 VPValue *V1, *V2;
3246 if (!match(&R,
3248 m_VPValue(V1), m_VPValue(V2))))
3249 continue;
3250 VPValue *Imm = Plan.getOrAddLiveIn(
3253 Intrinsic::experimental_vp_splice,
3254 {V1, V2, Imm, Plan.getTrue(), PrevEVL, &EVL},
3255 TypeInfo.inferScalarType(R.getVPSingleValue()), {}, {},
3256 R.getDebugLoc());
3257 VPSplice->insertBefore(&R);
3258 R.getVPSingleValue()->replaceAllUsesWith(VPSplice);
3259 }
3260 }
3261 }
3262
3263 VPValue *HeaderMask = vputils::findHeaderMask(Plan);
3264 if (!HeaderMask)
3265 return;
3266
3267 // Replace header masks with a mask equivalent to predicating by EVL:
3268 //
3269 // icmp ule widen-canonical-iv backedge-taken-count
3270 // ->
3271 // icmp ult step-vector, EVL
3272 VPRecipeBase *EVLR = EVL.getDefiningRecipe();
3273 VPBuilder Builder(EVLR->getParent(), std::next(EVLR->getIterator()));
3274 Type *EVLType = TypeInfo.inferScalarType(&EVL);
3275 VPValue *EVLMask = Builder.createICmp(
3277 Builder.createNaryOp(VPInstruction::StepVector, {}, EVLType), &EVL);
3278 HeaderMask->replaceAllUsesWith(EVLMask);
3279 }
3280
3281/// Converts a tail folded vector loop region to step by
3282/// VPInstruction::ExplicitVectorLength elements instead of VF elements each
3283/// iteration.
3284///
3285/// - Add a VPCurrentIterationPHIRecipe and related recipes to \p Plan and
3286/// replaces all uses except the canonical IV increment of
3287/// VPCanonicalIVPHIRecipe with a VPCurrentIterationPHIRecipe.
3288/// VPCanonicalIVPHIRecipe is used only for loop iterations counting after
3289/// this transformation.
3290///
3291/// - The header mask is replaced with a header mask based on the EVL.
3292///
3293/// - Plans with FORs have a new phi added to keep track of the EVL of the
3294/// previous iteration, and VPFirstOrderRecurrencePHIRecipes are replaced with
3295/// @llvm.vp.splice.
3296///
3297/// The function uses the following definitions:
3298/// %StartV is the canonical induction start value.
3299///
3300/// The function adds the following recipes:
3301///
3302/// vector.ph:
3303/// ...
3304///
3305/// vector.body:
3306/// ...
3307/// %CurrentIter = CURRENT-ITERATION-PHI [ %StartV, %vector.ph ],
3308/// [ %NextIter, %vector.body ]
3309/// %AVL = phi [ trip-count, %vector.ph ], [ %NextAVL, %vector.body ]
3310/// %VPEVL = EXPLICIT-VECTOR-LENGTH %AVL
3311/// ...
3312/// %OpEVL = cast i32 %VPEVL to IVSize
3313/// %NextIter = add IVSize %OpEVL, %CurrentIter
3314/// %NextAVL = sub IVSize nuw %AVL, %OpEVL
3315/// ...
3316///
3317/// If MaxSafeElements is provided, the function adds the following recipes:
3318/// vector.ph:
3319/// ...
3320///
3321/// vector.body:
3322/// ...
3323/// %CurrentIter = CURRENT-ITERATION-PHI [ %StartV, %vector.ph ],
3324/// [ %NextIter, %vector.body ]
3325/// %AVL = phi [ trip-count, %vector.ph ], [ %NextAVL, %vector.body ]
3326/// %cmp = cmp ult %AVL, MaxSafeElements
3327/// %SAFE_AVL = select %cmp, %AVL, MaxSafeElements
3328/// %VPEVL = EXPLICIT-VECTOR-LENGTH %SAFE_AVL
3329/// ...
3330/// %OpEVL = cast i32 %VPEVL to IVSize
3331/// %NextIter = add IVSize %OpEVL, %CurrentIter
3332/// %NextAVL = sub IVSize nuw %AVL, %OpEVL
3333/// ...
3334///
3336 VPlan &Plan, const std::optional<unsigned> &MaxSafeElements) {
3337 if (Plan.hasScalarVFOnly())
3338 return;
3339 VPRegionBlock *LoopRegion = Plan.getVectorLoopRegion();
3340 VPBasicBlock *Header = LoopRegion->getEntryBasicBlock();
3341
3342 auto *CanonicalIVPHI = LoopRegion->getCanonicalIV();
3343 auto *CanIVTy = LoopRegion->getCanonicalIVType();
3344 VPValue *StartV = CanonicalIVPHI->getStartValue();
3345
3346 // Create the CurrentIteration recipe in the vector loop.
3347 auto *CurrentIteration =
3349 CurrentIteration->insertAfter(CanonicalIVPHI);
3350 VPBuilder Builder(Header, Header->getFirstNonPhi());
3351 // Create the AVL (application vector length), starting from TC -> 0 in steps
3352 // of EVL.
3353 VPPhi *AVLPhi = Builder.createScalarPhi(
3354 {Plan.getTripCount()}, DebugLoc::getCompilerGenerated(), "avl");
3355 VPValue *AVL = AVLPhi;
3356
3357 if (MaxSafeElements) {
3358 // Support for MaxSafeDist for correct loop emission.
// Clamp so the EVL is computed from min(AVL, MaxSafeElements) and a single
// step never exceeds the maximum safe distance.
3359 VPValue *AVLSafe = Plan.getConstantInt(CanIVTy, *MaxSafeElements);
3360 VPValue *Cmp = Builder.createICmp(ICmpInst::ICMP_ULT, AVL, AVLSafe);
3361 AVL = Builder.createSelect(Cmp, AVL, AVLSafe, DebugLoc::getUnknown(),
3362 "safe_avl");
3363 }
3364 auto *VPEVL = Builder.createNaryOp(VPInstruction::ExplicitVectorLength, AVL,
3365 DebugLoc::getUnknown(), "evl");
3366
3367 auto *CanonicalIVIncrement =
3368 cast<VPInstruction>(CanonicalIVPHI->getBackedgeValue());
3369 Builder.setInsertPoint(CanonicalIVIncrement);
3370 VPValue *OpVPEVL = VPEVL;
3371
// The EVL is an i32 (see the "%OpEVL = cast i32 %VPEVL to IVSize" step in
// the function comment); convert it to the canonical IV type for the
// arithmetic below.
3372 auto *I32Ty = Type::getInt32Ty(Plan.getContext());
3373 OpVPEVL = Builder.createScalarZExtOrTrunc(
3374 OpVPEVL, CanIVTy, I32Ty, CanonicalIVIncrement->getDebugLoc());
3375
3376 auto *NextIter = Builder.createAdd(
3377 OpVPEVL, CurrentIteration, CanonicalIVIncrement->getDebugLoc(),
3378 "current.iteration.next", CanonicalIVIncrement->getNoWrapFlags());
3379 CurrentIteration->addOperand(NextIter);
3380
// AVL counts down from the trip count; it cannot underflow, hence NUW.
3381 VPValue *NextAVL =
3382 Builder.createSub(AVLPhi, OpVPEVL, DebugLoc::getCompilerGenerated(),
3383 "avl.next", {/*NUW=*/true, /*NSW=*/false});
3384 AVLPhi->addOperand(NextAVL);
3385
3386 fixupVFUsersForEVL(Plan, *VPEVL);
3387 removeDeadRecipes(Plan);
3388
3389 // Replace all uses of VPCanonicalIVPHIRecipe by
3390 // VPCurrentIterationPHIRecipe except for the canonical IV increment.
3391 CanonicalIVPHI->replaceAllUsesWith(CurrentIteration);
3392 CanonicalIVIncrement->setOperand(0, CanonicalIVPHI);
3393 // TODO: support unroll factor > 1.
3394 Plan.setUF(1);
3395 }
3396
3398 // Find the vector loop entry by locating VPCurrentIterationPHIRecipe.
3399 // There should be only one VPCurrentIteration in the entire plan.
3400 VPCurrentIterationPHIRecipe *CurrentIteration = nullptr;
3401
3404 for (VPRecipeBase &R : VPBB->phis())
3405 if (auto *PhiR = dyn_cast<VPCurrentIterationPHIRecipe>(&R)) {
3406 assert(!CurrentIteration &&
3407 "Found multiple CurrentIteration. Only one expected");
3408 CurrentIteration = PhiR;
3409 }
3410
3411 // Early return if it is not variable-length stepping.
3412 if (!CurrentIteration)
3413 return;
3414
3415 VPBasicBlock *HeaderVPBB = CurrentIteration->getParent();
3416 VPValue *CurrentIterationIncr = CurrentIteration->getBackedgeValue();
3417
3418 // Convert CurrentIteration to concrete recipe.
3419 auto *ScalarR =
3420 VPBuilder(CurrentIteration)
3422 {CurrentIteration->getStartValue(), CurrentIterationIncr},
3423 CurrentIteration->getDebugLoc(), "current.iteration.iv");
3424 CurrentIteration->replaceAllUsesWith(ScalarR);
3425 CurrentIteration->eraseFromParent();
3426
3427 // Replace CanonicalIVInc with CurrentIteration increment.
// The canonical IV is the first phi in the header; its backedge value must
// be the IV plus VFxUF.
3428 auto *CanonicalIV = cast<VPPhi>(&*HeaderVPBB->begin());
3429 VPValue *Backedge = CanonicalIV->getIncomingValue(1);
3430 assert(match(Backedge, m_c_Add(m_Specific(CanonicalIV),
3431 m_Specific(&Plan.getVFxUF()))) &&
3432 "Unexpected canonical iv");
3433 Backedge->replaceAllUsesWith(CurrentIterationIncr);
3434
3435 // Remove unused phi and increment.
3436 VPRecipeBase *CanonicalIVIncrement = Backedge->getDefiningRecipe();
3437 CanonicalIVIncrement->eraseFromParent();
3438 CanonicalIV->eraseFromParent();
3439 }
3440
3442 VPRegionBlock *LoopRegion = Plan.getVectorLoopRegion();
3443 // The canonical IV may not exist at this stage.
3444 if (!LoopRegion ||
3446 return;
3447 VPCanonicalIVPHIRecipe *CanIV = LoopRegion->getCanonicalIV();
3448 if (std::next(CanIV->getIterator()) == CanIV->getParent()->end())
3449 return;
3450 // The EVL IV is always immediately after the canonical IV.
3452 std::next(CanIV->getIterator()));
3453 if (!EVLPhi)
3454 return;
3455
3456 // Bail if not an EVL tail folded loop.
3457 VPValue *AVL;
3458 if (!match(EVLPhi->getBackedgeValue(),
3459 m_c_Add(m_ZExtOrSelf(m_EVL(m_VPValue(AVL))), m_Specific(EVLPhi))))
3460 return;
3461
3462 // The AVL may be capped to a safe distance.
// If so, look through the clamp and use the uncapped AVL below.
3463 VPValue *SafeAVL, *UnsafeAVL;
3464 if (match(AVL,
3466 m_VPValue(SafeAVL)),
3467 m_Deferred(UnsafeAVL), m_Deferred(SafeAVL))))
3468 AVL = UnsafeAVL;
3469
3470 VPValue *AVLNext;
3471 [[maybe_unused]] bool FoundAVLNext =
3473 m_Specific(Plan.getTripCount()), m_VPValue(AVLNext)));
3474 assert(FoundAVLNext && "Didn't find AVL backedge?");
3475
3476 VPBasicBlock *Latch = LoopRegion->getExitingBasicBlock();
3477 auto *LatchBr = cast<VPInstruction>(Latch->getTerminator());
// An unconditional (always-true) latch branch needs no rewriting.
3478 if (match(LatchBr, m_BranchOnCond(m_True())))
3479 return;
3480
3481 assert(
3482 match(LatchBr,
3485 m_Specific(&Plan.getVectorTripCount())))) &&
3486 "Expected BranchOnCond with ICmp comparing CanIV increment with vector "
3487 "trip count");
3488
// Exit the loop when the remaining AVL reaches zero, instead of comparing
// the IV increment against the vector trip count.
3489 Type *AVLTy = VPTypeAnalysis(Plan).inferScalarType(AVLNext);
3490 VPBuilder Builder(LatchBr);
3491 LatchBr->setOperand(
3492 0, Builder.createICmp(CmpInst::ICMP_EQ, AVLNext, Plan.getZero(AVLTy)));
3493 }
3494
3496 VPlan &Plan, PredicatedScalarEvolution &PSE,
3497 const DenseMap<Value *, const SCEV *> &StridesMap) {
3498 // Replace VPValues for known constant strides guaranteed by predicate scalar
3499 // evolution.
// Versioned strides may only be substituted where the stride predicate is
// known to hold: inside a region, or in the single block preceding the
// vector loop region.
3500 auto CanUseVersionedStride = [&Plan](VPUser &U, unsigned) {
3501 auto *R = cast<VPRecipeBase>(&U);
3502 return R->getRegion() ||
3503 R->getParent() == Plan.getVectorLoopRegion()->getSinglePredecessor();
3504 };
3505 ValueToSCEVMapTy RewriteMap;
3506 for (const SCEV *Stride : StridesMap.values()) {
3507 using namespace SCEVPatternMatch;
3508 auto *StrideV = cast<SCEVUnknown>(Stride)->getValue();
3509 const APInt *StrideConst;
3510 if (!match(PSE.getSCEV(StrideV), m_scev_APInt(StrideConst)))
3511 // Only handle constant strides for now.
3512 continue;
3513
3514 auto *CI = Plan.getConstantInt(*StrideConst);
3515 if (VPValue *StrideVPV = Plan.getLiveIn(StrideV))
3516 StrideVPV->replaceUsesWithIf(CI, CanUseVersionedStride);
3517
3518 // The versioned value may not be used in the loop directly but through a
3519 // sext/zext. Add new live-ins in those cases.
3520 for (Value *U : StrideV->users()) {
3522 continue;
3523 VPValue *StrideVPV = Plan.getLiveIn(U);
3524 if (!StrideVPV)
3525 continue;
// Fold the cast into the constant by extending it to the user's width,
// matching sext vs. zext semantics.
3526 unsigned BW = U->getType()->getScalarSizeInBits();
3527 APInt C =
3528 isa<SExtInst>(U) ? StrideConst->sext(BW) : StrideConst->zext(BW);
3529 VPValue *CI = Plan.getConstantInt(C);
3530 StrideVPV->replaceUsesWithIf(CI, CanUseVersionedStride);
3531 }
3532 RewriteMap[StrideV] = PSE.getSCEV(StrideV);
3533 }
3534
// Rewrite SCEV expansions in the entry block using the versioned strides,
// and keep the plan's trip count in sync if it was one of them.
3535 for (VPRecipeBase &R : *Plan.getEntry()) {
3536 auto *ExpSCEV = dyn_cast<VPExpandSCEVRecipe>(&R);
3537 if (!ExpSCEV)
3538 continue;
3539 const SCEV *ScevExpr = ExpSCEV->getSCEV();
3540 auto *NewSCEV =
3541 SCEVParameterRewriter::rewrite(ScevExpr, *PSE.getSE(), RewriteMap);
3542 if (NewSCEV != ScevExpr) {
3543 VPValue *NewExp = vputils::getOrCreateVPValueForSCEVExpr(Plan, NewSCEV);
3544 ExpSCEV->replaceAllUsesWith(NewExp);
3545 if (Plan.getTripCount() == ExpSCEV)
3546 Plan.resetTripCount(NewExp);
3547 }
3548 }
3549 }
3550
    VPlan &Plan,
    const std::function<bool(BasicBlock *)> &BlockNeedsPredication) {
  // Collect recipes in the backward slice of `Root` that may generate a poison
  // value that is used after vectorization.
  auto CollectPoisonGeneratingInstrsInBackwardSlice([&](VPRecipeBase *Root) {
    Worklist.push_back(Root);

    // Traverse the backward slice of Root through its use-def chain.
    while (!Worklist.empty()) {
      VPRecipeBase *CurRec = Worklist.pop_back_val();

      // Visit each recipe at most once; the use-def slice may form a DAG.
      if (!Visited.insert(CurRec).second)
        continue;

      // Prune search if we find another recipe generating a widen memory
      // instruction. Widen memory instructions involved in address computation
      // will lead to gather/scatter instructions, which don't need to be
      // handled.
              VPHeaderPHIRecipe>(CurRec))
        continue;

      // This recipe contributes to the address computation of a widen
      // load/store. If the underlying instruction has poison-generating flags,
      // drop them directly.
      if (auto *RecWithFlags = dyn_cast<VPRecipeWithIRFlags>(CurRec)) {
        VPValue *A, *B;
        // Dropping disjoint from an OR may yield incorrect results, as some
        // analysis may have converted it to an Add implicitly (e.g. SCEV used
        // for dependence analysis). Instead, replace it with an equivalent Add.
        // This is possible as all users of the disjoint OR only access lanes
        // where the operands are disjoint or poison otherwise.
        if (match(RecWithFlags, m_BinaryOr(m_VPValue(A), m_VPValue(B))) &&
            RecWithFlags->isDisjoint()) {
          VPBuilder Builder(RecWithFlags);
          VPInstruction *New =
              Builder.createAdd(A, B, RecWithFlags->getDebugLoc());
          // Preserve the mapping to the underlying IR value on the new Add.
          New->setUnderlyingValue(RecWithFlags->getUnderlyingValue());
          RecWithFlags->replaceAllUsesWith(New);
          RecWithFlags->eraseFromParent();
          // Continue traversing the slice from the replacement recipe.
          CurRec = New;
        } else
          RecWithFlags->dropPoisonGeneratingFlags();
      } else {
        (void)Instr;
        assert((!Instr || !Instr->hasPoisonGeneratingFlags()) &&
               "found instruction with poison generating flags not covered by "
               "VPRecipeWithIRFlags");
      }

      // Add new definitions to the worklist.
      for (VPValue *Operand : CurRec->operands())
        if (VPRecipeBase *OpDef = Operand->getDefiningRecipe())
          Worklist.push_back(OpDef);
    }
  });

  // Traverse all the recipes in the VPlan and collect the poison-generating
  // recipes in the backward slice starting at the address of a VPWidenRecipe or
  // VPInterleaveRecipe.
  auto Iter = vp_depth_first_deep(Plan.getEntry());
    for (VPRecipeBase &Recipe : *VPBB) {
      if (auto *WidenRec = dyn_cast<VPWidenMemoryRecipe>(&Recipe)) {
        Instruction &UnderlyingInstr = WidenRec->getIngredient();
        VPRecipeBase *AddrDef = WidenRec->getAddr()->getDefiningRecipe();
        // Only consecutive accesses in blocks that need predication are
        // relevant here; see the pruning comment above for gathers/scatters.
        if (AddrDef && WidenRec->isConsecutive() &&
            BlockNeedsPredication(UnderlyingInstr.getParent()))
          CollectPoisonGeneratingInstrsInBackwardSlice(AddrDef);
      } else if (auto *InterleaveRec = dyn_cast<VPInterleaveRecipe>(&Recipe)) {
        VPRecipeBase *AddrDef = InterleaveRec->getAddr()->getDefiningRecipe();
        if (AddrDef) {
          // Check if any member of the interleave group needs predication.
          const InterleaveGroup<Instruction> *InterGroup =
              InterleaveRec->getInterleaveGroup();
          bool NeedPredication = false;
          for (int I = 0, NumMembers = InterGroup->getNumMembers();
               I < NumMembers; ++I) {
            Instruction *Member = InterGroup->getMember(I);
            if (Member)
              NeedPredication |= BlockNeedsPredication(Member->getParent());
          }

          if (NeedPredication)
            CollectPoisonGeneratingInstrsInBackwardSlice(AddrDef);
        }
      }
    }
  }
}
3646
    VPlan &Plan,
        &InterleaveGroups,
    VPRecipeBuilder &RecipeBuilder, const bool &ScalarEpilogueAllowed) {
  if (InterleaveGroups.empty())
    return;

  // Interleave memory: for each Interleave Group we marked earlier as relevant
  // for this VPlan, replace the Recipes widening its memory instructions with a
  // single VPInterleaveRecipe at its insertion point.
  VPDominatorTree VPDT(Plan);
  for (const auto *IG : InterleaveGroups) {
    // Member 0 seeds the group's address and metadata.
    auto *Start =
        cast<VPWidenMemoryRecipe>(RecipeBuilder.getRecipe(IG->getMember(0)));
    VPIRMetadata InterleaveMD(*Start);
    SmallVector<VPValue *, 4> StoredValues;
    if (auto *StoreR = dyn_cast<VPWidenStoreRecipe>(Start))
      StoredValues.push_back(StoreR->getStoredValue());
    // Collect stored values from the remaining members and intersect their
    // metadata so only metadata common to all members is kept.
    for (unsigned I = 1; I < IG->getFactor(); ++I) {
      Instruction *MemberI = IG->getMember(I);
      if (!MemberI)
        continue;
      VPWidenMemoryRecipe *MemoryR =
          cast<VPWidenMemoryRecipe>(RecipeBuilder.getRecipe(MemberI));
      if (auto *StoreR = dyn_cast<VPWidenStoreRecipe>(MemoryR))
        StoredValues.push_back(StoreR->getStoredValue());
      InterleaveMD.intersect(*MemoryR);
    }

    // A mask for gaps is needed when the group has gaps and either a scalar
    // epilogue is not allowed or the group stores values.
    bool NeedsMaskForGaps =
        (IG->requiresScalarEpilogue() && !ScalarEpilogueAllowed) ||
        (!StoredValues.empty() && !IG->isFull());

    Instruction *IRInsertPos = IG->getInsertPos();
    auto *InsertPos =
        cast<VPWidenMemoryRecipe>(RecipeBuilder.getRecipe(IRInsertPos));

    if (auto *Gep = dyn_cast<GetElementPtrInst>(
            getLoadStorePointerOperand(IRInsertPos)->stripPointerCasts()))
      NW = Gep->getNoWrapFlags().withoutNoUnsignedWrap();

    // Get or create the start address for the interleave group.
    VPValue *Addr = Start->getAddr();
    VPRecipeBase *AddrDef = Addr->getDefiningRecipe();
    if (AddrDef && !VPDT.properlyDominates(AddrDef, InsertPos)) {
      // We cannot re-use the address of member zero because it does not
      // dominate the insert position. Instead, use the address of the insert
      // position and create a PtrAdd adjusting it to the address of member
      // zero.
      // TODO: Hoist Addr's defining recipe (and any operands as needed) to
      // InsertPos or sink loads above zero members to join it.
      assert(IG->getIndex(IRInsertPos) != 0 &&
             "index of insert position shouldn't be zero");
      auto &DL = IRInsertPos->getDataLayout();
      APInt Offset(32,
                   DL.getTypeAllocSize(getLoadStoreType(IRInsertPos)) *
                       IG->getIndex(IRInsertPos),
                   /*IsSigned=*/true);
      // Negative offset: step back from the insert position to member zero.
      VPValue *OffsetVPV = Plan.getConstantInt(-Offset);
      VPBuilder B(InsertPos);
      Addr = B.createNoWrapPtrAdd(InsertPos->getAddr(), OffsetVPV, NW);
    }
    // If the group is reverse, adjust the index to refer to the last vector
    // lane instead of the first. We adjust the index from the first vector
    // lane, rather than directly getting the pointer for lane VF - 1, because
    // the pointer operand of the interleaved access is supposed to be uniform.
    if (IG->isReverse()) {
      auto *ReversePtr = new VPVectorEndPointerRecipe(
          Addr, &Plan.getVF(), getLoadStoreType(IRInsertPos),
          -(int64_t)IG->getFactor(), NW, InsertPos->getDebugLoc());
      ReversePtr->insertBefore(InsertPos);
      Addr = ReversePtr;
    }
    auto *VPIG = new VPInterleaveRecipe(IG, Addr, StoredValues,
                                        InsertPos->getMask(), NeedsMaskForGaps,
                                        InterleaveMD, InsertPos->getDebugLoc());
    VPIG->insertBefore(InsertPos);

    // Replace each member recipe's uses with the corresponding value produced
    // by the interleave recipe, then erase the member recipes.
    unsigned J = 0;
    for (unsigned i = 0; i < IG->getFactor(); ++i)
      if (Instruction *Member = IG->getMember(i)) {
        VPRecipeBase *MemberR = RecipeBuilder.getRecipe(Member);
        if (!Member->getType()->isVoidTy()) {
          VPValue *OriginalV = MemberR->getVPSingleValue();
          OriginalV->replaceAllUsesWith(VPIG->getVPValue(J));
          J++;
        }
        MemberR->eraseFromParent();
      }
  }
}
3740
/// Expand a VPWidenIntOrFpInduction into executable recipes, for the initial
/// value, phi and backedge value. In the following example:
///
/// vector.ph:
/// Successor(s): vector loop
///
/// <x1> vector loop: {
///   vector.body:
///     WIDEN-INDUCTION %i = phi %start, %step, %vf
///     ...
///     EMIT branch-on-count ...
///   No successors
/// }
///
/// WIDEN-INDUCTION will get expanded to:
///
/// vector.ph:
///   ...
///   vp<%induction.start> = ...
///   vp<%induction.increment> = ...
///
/// Successor(s): vector loop
///
/// <x1> vector loop: {
///   vector.body:
///     ir<%i> = WIDEN-PHI vp<%induction.start>, vp<%vec.ind.next>
///     ...
///     vp<%vec.ind.next> = add ir<%i>, vp<%induction.increment>
///     EMIT branch-on-count ...
///   No successors
/// }
static void
                              VPTypeAnalysis &TypeInfo) {
  VPlan *Plan = WidenIVR->getParent()->getPlan();
  VPValue *Start = WidenIVR->getStartValue();
  VPValue *Step = WidenIVR->getStepValue();
  VPValue *VF = WidenIVR->getVFValue();
  DebugLoc DL = WidenIVR->getDebugLoc();

  // The value from the original loop to which we are mapping the new induction
  // variable.
  Type *Ty = TypeInfo.inferScalarType(WidenIVR);

  const InductionDescriptor &ID = WidenIVR->getInductionDescriptor();
  VPIRFlags Flags = *WidenIVR;
  // Integer inductions use add/mul; otherwise use the descriptor's FP add
  // opcode together with fmul.
  if (ID.getKind() == InductionDescriptor::IK_IntInduction) {
    AddOp = Instruction::Add;
    MulOp = Instruction::Mul;
  } else {
    AddOp = ID.getInductionOpcode();
    MulOp = Instruction::FMul;
  }

  // If the phi is truncated, truncate the start and step values.
  VPBuilder Builder(Plan->getVectorPreheader());
  Type *StepTy = TypeInfo.inferScalarType(Step);
  if (Ty->getScalarSizeInBits() < StepTy->getScalarSizeInBits()) {
    assert(StepTy->isIntegerTy() && "Truncation requires an integer type");
    Step = Builder.createScalarCast(Instruction::Trunc, Step, Ty, DL);
    Start = Builder.createScalarCast(Instruction::Trunc, Start, Ty, DL);
    StepTy = Ty;
  }

  // Construct the initial value of the vector IV in the vector loop preheader.
  Type *IVIntTy =
  VPValue *Init = Builder.createNaryOp(VPInstruction::StepVector, {}, IVIntTy);
  if (StepTy->isFloatingPointTy())
    Init = Builder.createWidenCast(Instruction::UIToFP, Init, StepTy);

  VPValue *SplatStart = Builder.createNaryOp(VPInstruction::Broadcast, Start);
  VPValue *SplatStep = Builder.createNaryOp(VPInstruction::Broadcast, Step);

  // Init = splat(start) + step-vector * splat(step).
  Init = Builder.createNaryOp(MulOp, {Init, SplatStep}, Flags);
  Init = Builder.createNaryOp(AddOp, {SplatStart, Init}, Flags,
                              DebugLoc::getUnknown(), "induction");

  // Create the widened phi of the vector IV.
  auto *WidePHI = new VPWidenPHIRecipe(WidenIVR->getPHINode(), Init,
                                       WidenIVR->getDebugLoc(), "vec.ind");
  WidePHI->insertBefore(WidenIVR);

  // Create the backedge value for the vector IV.
  VPValue *Inc;
  VPValue *Prev;
  // If unrolled, use the increment and prev value from the operands.
  if (auto *SplatVF = WidenIVR->getSplatVFValue()) {
    Inc = SplatVF;
    Prev = WidenIVR->getLastUnrolledPartOperand();
  } else {
    // Insert right after VF's defining recipe, if it has one.
    if (VPRecipeBase *R = VF->getDefiningRecipe())
      Builder.setInsertPoint(R->getParent(), std::next(R->getIterator()));
    // Multiply the vectorization factor by the step using integer or
    // floating-point arithmetic as appropriate.
    if (StepTy->isFloatingPointTy())
      VF = Builder.createScalarCast(Instruction::CastOps::UIToFP, VF, StepTy,
                                    DL);
    else
      VF = Builder.createScalarZExtOrTrunc(VF, StepTy,
                                           TypeInfo.inferScalarType(VF), DL);

    Inc = Builder.createNaryOp(MulOp, {Step, VF}, Flags);
    Inc = Builder.createNaryOp(VPInstruction::Broadcast, Inc);
    Prev = WidePHI;
  }

  Builder.setInsertPoint(ExitingBB, ExitingBB->getTerminator()->getIterator());
  auto *Next = Builder.createNaryOp(AddOp, {Prev, Inc}, Flags,
                                    WidenIVR->getDebugLoc(), "vec.ind.next");

  // The phi's second operand is the backedge value computed above.
  WidePHI->addOperand(Next);

  WidenIVR->replaceAllUsesWith(WidePHI);
}
3859
/// Expand a VPWidenPointerInductionRecipe into executable recipes, for the
/// initial value, phi and backedge value. In the following example:
///
/// <x1> vector loop: {
///   vector.body:
///     EMIT ir<%ptr.iv> = WIDEN-POINTER-INDUCTION %start, %step, %vf
///     ...
///     EMIT branch-on-count ...
/// }
///
/// WIDEN-POINTER-INDUCTION will get expanded to:
///
/// <x1> vector loop: {
///   vector.body:
///     EMIT-SCALAR %pointer.phi = phi %start, %ptr.ind
///     EMIT %mul = mul %stepvector, %step
///     EMIT %vector.gep = wide-ptradd %pointer.phi, %mul
///     ...
///     EMIT %ptr.ind = ptradd %pointer.phi, %vf
///     EMIT branch-on-count ...
/// }
                                          VPTypeAnalysis &TypeInfo) {
  VPlan *Plan = R->getParent()->getPlan();
  VPValue *Start = R->getStartValue();
  VPValue *Step = R->getStepValue();
  VPValue *VF = R->getVFValue();

  assert(R->getInductionDescriptor().getKind() ==
         "Not a pointer induction according to InductionDescriptor!");
  assert(TypeInfo.inferScalarType(R)->isPointerTy() && "Unexpected type.");
  assert(!R->onlyScalarsGenerated(Plan->hasScalableVF()) &&
         "Recipe should have been replaced");

  VPBuilder Builder(R);
  DebugLoc DL = R->getDebugLoc();

  // Build a scalar pointer phi.
  VPPhi *ScalarPtrPhi = Builder.createScalarPhi(Start, DL, "pointer.phi");

  // Create actual address geps that use the pointer phi as base and a
  // vectorized version of the step value (<step*0, ..., step*N>) as offset.
  Builder.setInsertPoint(R->getParent(), R->getParent()->getFirstNonPhi());
  Type *StepTy = TypeInfo.inferScalarType(Step);
  VPValue *Offset = Builder.createNaryOp(VPInstruction::StepVector, {}, StepTy);
  Offset = Builder.createOverflowingOp(Instruction::Mul, {Offset, Step});
  VPValue *PtrAdd =
      Builder.createWidePtrAdd(ScalarPtrPhi, Offset, DL, "vector.gep");
  R->replaceAllUsesWith(PtrAdd);

  // Create the backedge value for the scalar pointer phi, advancing it by
  // VF * Step each iteration.
  Builder.setInsertPoint(ExitingBB, ExitingBB->getTerminator()->getIterator());
  VF = Builder.createScalarZExtOrTrunc(VF, StepTy, TypeInfo.inferScalarType(VF),
                                       DL);
  VPValue *Inc = Builder.createOverflowingOp(Instruction::Mul, {Step, VF});

  VPValue *InductionGEP =
      Builder.createPtrAdd(ScalarPtrPhi, Inc, DL, "ptr.ind");
  ScalarPtrPhi->addOperand(InductionGEP);
}
3922
  // Replace loop regions with explicit CFG.
  SmallVector<VPRegionBlock *> LoopRegions;
      vp_depth_first_deep(Plan.getEntry()))) {
    // Only non-replicator (loop) regions are dissolved here.
    if (!R->isReplicator())
      LoopRegions.push_back(R);
  }
  // Dissolve after collecting, to avoid mutating the CFG mid-traversal.
  for (VPRegionBlock *R : LoopRegions)
    R->dissolveToCFGLoop();
}
3934
  // The transform runs after dissolving loop regions, so all VPBasicBlocks
  // terminated with BranchOnTwoConds are reached via a shallow traversal.
    if (!VPBB->empty() && match(&VPBB->back(), m_BranchOnTwoConds()))
      WorkList.push_back(cast<VPInstruction>(&VPBB->back()));
  }

  // Expand BranchOnTwoConds instructions into explicit CFG with two new
  // single-condition branches:
  // 1. A branch that replaces BranchOnTwoConds, jumps to the first successor if
  //    the first condition is true, and otherwise jumps to a new interim block.
  // 2. A branch that ends the interim block, jumps to the second successor if
  //    the second condition is true, and otherwise jumps to the third
  //    successor.
  for (VPInstruction *Br : WorkList) {
    assert(Br->getNumOperands() == 2 &&
           "BranchOnTwoConds must have exactly 2 conditions");
    DebugLoc DL = Br->getDebugLoc();
    VPBasicBlock *BrOnTwoCondsBB = Br->getParent();
    // Snapshot the successors before disconnecting; they are re-wired below.
    const auto Successors = to_vector(BrOnTwoCondsBB->getSuccessors());
    assert(Successors.size() == 3 &&
           "BranchOnTwoConds must have exactly 3 successors");

    for (VPBlockBase *Succ : Successors)
      VPBlockUtils::disconnectBlocks(BrOnTwoCondsBB, Succ);

    VPValue *Cond0 = Br->getOperand(0);
    VPValue *Cond1 = Br->getOperand(1);
    VPBlockBase *Succ0 = Successors[0];
    VPBlockBase *Succ1 = Successors[1];
    VPBlockBase *Succ2 = Successors[2];
    assert(!Succ0->getParent() && !Succ1->getParent() && !Succ2->getParent() &&
           !BrOnTwoCondsBB->getParent() && "regions must already be dissolved");

    VPBasicBlock *InterimBB =
        Plan.createVPBasicBlock(BrOnTwoCondsBB->getName() + ".interim");

    VPBuilder(BrOnTwoCondsBB)
    VPBlockUtils::connectBlocks(BrOnTwoCondsBB, Succ0);
    VPBlockUtils::connectBlocks(BrOnTwoCondsBB, InterimBB);

    VPBlockUtils::connectBlocks(InterimBB, Succ1);
    VPBlockUtils::connectBlocks(InterimBB, Succ2);
    // The original two-condition branch is now fully replaced.
    Br->eraseFromParent();
  }
}
3986
  VPTypeAnalysis TypeInfo(Plan);
      vp_depth_first_deep(Plan.getEntry()))) {
    for (VPRecipeBase &R : make_early_inc_range(*VPBB)) {
      // Expand widened int/fp inductions into explicit phi + increment.
      if (auto *WidenIVR = dyn_cast<VPWidenIntOrFpInductionRecipe>(&R)) {
        expandVPWidenIntOrFpInduction(WidenIVR, TypeInfo);
        ToRemove.push_back(WidenIVR);
        continue;
      }

      if (auto *WidenIVR = dyn_cast<VPWidenPointerInductionRecipe>(&R)) {
        // If the recipe only generates scalars, scalarize it instead of
        // expanding it.
        if (WidenIVR->onlyScalarsGenerated(Plan.hasScalableVF())) {
          VPBuilder Builder(WidenIVR);
          VPValue *PtrAdd =
              scalarizeVPWidenPointerInduction(WidenIVR, Plan, Builder);
          WidenIVR->replaceAllUsesWith(PtrAdd);
          ToRemove.push_back(WidenIVR);
          continue;
        }
        expandVPWidenPointerInduction(WidenIVR, TypeInfo);
        ToRemove.push_back(WidenIVR);
        continue;
      }

      // Expand VPBlendRecipe into VPInstruction::Select.
      VPBuilder Builder(&R);
      if (auto *Blend = dyn_cast<VPBlendRecipe>(&R)) {
        // Fold incoming values into a select chain; later incoming values
        // take precedence when their mask is true.
        VPValue *Select = Blend->getIncomingValue(0);
        for (unsigned I = 1; I != Blend->getNumIncomingValues(); ++I)
          Select = Builder.createSelect(Blend->getMask(I),
                                        Blend->getIncomingValue(I), Select,
                                        R.getDebugLoc(), "predphi", *Blend);
        Blend->replaceAllUsesWith(Select);
        ToRemove.push_back(Blend);
      }

      if (auto *VEPR = dyn_cast<VPVectorEndPointerRecipe>(&R)) {
        if (!VEPR->getOffset()) {
          assert(Plan.getConcreteUF() == 1 &&
                 "Expected unroller to have materialized offset for UF != 1");
          VEPR->materializeOffset();
        }
      }

      // Decompose expression recipes back into their constituent recipes.
      if (auto *Expr = dyn_cast<VPExpressionRecipe>(&R)) {
        Expr->decompose();
        ToRemove.push_back(Expr);
      }

      // Expand LastActiveLane into Not + FirstActiveLane + Sub.
      auto *LastActiveL = dyn_cast<VPInstruction>(&R);
      if (LastActiveL &&
          LastActiveL->getOpcode() == VPInstruction::LastActiveLane) {
        // Create Not(Mask) for all operands.
        for (VPValue *Op : LastActiveL->operands()) {
          VPValue *NotMask = Builder.createNot(Op, LastActiveL->getDebugLoc());
          NotMasks.push_back(NotMask);
        }

        // Create FirstActiveLane on the inverted masks.
        VPValue *FirstInactiveLane = Builder.createNaryOp(
            LastActiveL->getDebugLoc(), "first.inactive.lane");

        // Subtract 1 to get the last active lane.
        VPValue *One =
            Plan.getConstantInt(TypeInfo.inferScalarType(FirstInactiveLane), 1);
        VPValue *LastLane =
            Builder.createSub(FirstInactiveLane, One,
                              LastActiveL->getDebugLoc(), "last.active.lane");

        LastActiveL->replaceAllUsesWith(LastLane);
        ToRemove.push_back(LastActiveL);
        continue;
      }

      // Lower MaskedCond with block mask to LogicalAnd.
        auto *VPI = cast<VPInstruction>(&R);
        assert(VPI->isMasked() &&
               "Unmasked MaskedCond should be simplified earlier");
        VPI->replaceAllUsesWith(Builder.createNaryOp(
            VPInstruction::LogicalAnd, {VPI->getMask(), VPI->getOperand(0)}));
        ToRemove.push_back(VPI);
        continue;
      }

      // Lower BranchOnCount to ICmp + BranchOnCond.
      VPValue *IV, *TC;
      if (match(&R, m_BranchOnCount(m_VPValue(IV), m_VPValue(TC)))) {
        auto *BranchOnCountInst = cast<VPInstruction>(&R);
        DebugLoc DL = BranchOnCountInst->getDebugLoc();
        VPValue *Cond = Builder.createICmp(CmpInst::ICMP_EQ, IV, TC, DL);
        Builder.createNaryOp(VPInstruction::BranchOnCond, Cond, DL);
        ToRemove.push_back(BranchOnCountInst);
        continue;
      }

      VPValue *VectorStep;
      VPValue *ScalarStep;
                    m_VPValue(VectorStep), m_VPValue(ScalarStep))))
        continue;

      // Expand WideIVStep.
      auto *VPI = cast<VPInstruction>(&R);
      Type *IVTy = TypeInfo.inferScalarType(VPI);
      // Cast the vector step to the IV type if the types differ.
      if (TypeInfo.inferScalarType(VectorStep) != IVTy) {
                         ? Instruction::UIToFP
                         : Instruction::Trunc;
        VectorStep = Builder.createWidenCast(CastOp, VectorStep, IVTy);
      }

      assert(!match(ScalarStep, m_One()) && "Expected non-unit scalar-step");
      if (TypeInfo.inferScalarType(ScalarStep) != IVTy) {
        ScalarStep =
            Builder.createWidenCast(Instruction::Trunc, ScalarStep, IVTy);
      }

      VPIRFlags Flags;
      unsigned MulOpc;
      // FP steps multiply with the instruction's fast-math flags; integer
      // steps use the default flags for Mul.
      if (IVTy->isFloatingPointTy()) {
        MulOpc = Instruction::FMul;
        Flags = VPI->getFastMathFlags();
      } else {
        MulOpc = Instruction::Mul;
        Flags = VPIRFlags::getDefaultFlags(MulOpc);
      }

      VPInstruction *Mul = Builder.createNaryOp(
          MulOpc, {VectorStep, ScalarStep}, Flags, R.getDebugLoc());
      VectorStep = Mul;
      VPI->replaceAllUsesWith(VectorStep);
      ToRemove.push_back(VPI);
    }
  }

  // Erase replaced recipes only after the traversal, to avoid invalidating
  // the iteration above.
  for (VPRecipeBase *R : ToRemove)
    R->eraseFromParent();
}
4133
                                 VPBasicBlock *HeaderVPBB,
                                 VPBasicBlock *LatchVPBB,
                                 VPBasicBlock *MiddleVPBB,
                                 UncountableExitStyle Style) {
  // Per-exit bookkeeping: the exiting block inside the loop, the exit
  // (destination) block, and the condition under which the exit is taken.
  struct EarlyExitInfo {
    VPBasicBlock *EarlyExitingVPBB;
    VPIRBasicBlock *EarlyExitVPBB;
    VPValue *CondToExit;
  };

  VPDominatorTree VPDT(Plan);
  VPBuilder Builder(LatchVPBB->getTerminator());
  for (VPIRBasicBlock *ExitBlock : Plan.getExitBlocks()) {
    for (VPBlockBase *Pred : to_vector(ExitBlock->getPredecessors())) {
      // The middle block handles the countable (latch) exit; skip it.
      if (Pred == MiddleVPBB)
        continue;
      // Collect condition for this early exit.
      auto *EarlyExitingVPBB = cast<VPBasicBlock>(Pred);
      VPBlockBase *TrueSucc = EarlyExitingVPBB->getSuccessors()[0];
      VPValue *CondOfEarlyExitingVPBB;
      [[maybe_unused]] bool Matched =
          match(EarlyExitingVPBB->getTerminator(),
                m_BranchOnCond(m_VPValue(CondOfEarlyExitingVPBB)));
      assert(Matched && "Terminator must be BranchOnCond");

      // Insert the MaskedCond in the EarlyExitingVPBB so the predicator adds
      // the correct block mask. Negate the condition when the exit is the
      // false successor of the branch.
      VPBuilder EarlyExitingBuilder(EarlyExitingVPBB->getTerminator());
      auto *CondToEarlyExit = EarlyExitingBuilder.createNaryOp(
          TrueSucc == ExitBlock
              ? CondOfEarlyExitingVPBB
              : EarlyExitingBuilder.createNot(CondOfEarlyExitingVPBB));
      assert((isa<VPIRValue>(CondOfEarlyExitingVPBB) ||
              !VPDT.properlyDominates(EarlyExitingVPBB, LatchVPBB) ||
              VPDT.properlyDominates(
                  CondOfEarlyExitingVPBB->getDefiningRecipe()->getParent(),
                  LatchVPBB)) &&
             "exit condition must dominate the latch");
      Exits.push_back({
          EarlyExitingVPBB,
          ExitBlock,
          CondToEarlyExit,
      });
    }
  }

  assert(!Exits.empty() && "must have at least one early exit");
  // Sort exits by RPO order to get correct program order. RPO gives a
  // topological ordering of the CFG, ensuring upstream exits are checked
  // before downstream exits in the dispatch chain.
                HeaderVPBB);
  for (const auto &[Num, VPB] : enumerate(RPOT))
    RPOIdx[VPB] = Num;
  llvm::sort(Exits, [&RPOIdx](const EarlyExitInfo &A, const EarlyExitInfo &B) {
    return RPOIdx[A.EarlyExitingVPBB] < RPOIdx[B.EarlyExitingVPBB];
  });
#ifndef NDEBUG
  // After RPO sorting, verify that for any pair where one exit dominates
  // another, the dominating exit comes first. This is guaranteed by RPO
  // (topological order) and is required for the dispatch chain correctness.
  for (unsigned I = 0; I + 1 < Exits.size(); ++I)
    for (unsigned J = I + 1; J < Exits.size(); ++J)
      assert(!VPDT.properlyDominates(Exits[J].EarlyExitingVPBB,
                                     Exits[I].EarlyExitingVPBB) &&
             "RPO sort must place dominating exits before dominated ones");
#endif

  // Build the AnyOf condition for the latch terminator using logical OR
  // to avoid poison propagation from later exit conditions when an earlier
  // exit is taken.
  VPValue *Combined = Exits[0].CondToExit;
  for (const EarlyExitInfo &Info : drop_begin(Exits))
    Combined = Builder.createLogicalOr(Combined, Info.CondToExit);

  VPValue *IsAnyExitTaken =
      Builder.createNaryOp(VPInstruction::AnyOf, {Combined});

         "Early exit store masking not implemented");

  // Create the vector.early.exit blocks, one per collected exit.
  SmallVector<VPBasicBlock *> VectorEarlyExitVPBBs(Exits.size());
  for (unsigned Idx = 0; Idx != Exits.size(); ++Idx) {
    // Only number the blocks when there is more than one exit.
    Twine BlockSuffix = Exits.size() == 1 ? "" : Twine(".") + Twine(Idx);
    VPBasicBlock *VectorEarlyExitVPBB =
        Plan.createVPBasicBlock("vector.early.exit" + BlockSuffix);
    VectorEarlyExitVPBBs[Idx] = VectorEarlyExitVPBB;
  }

  // Create the dispatch block (or reuse the single exit block if only one
  // exit). The dispatch block computes the first active lane of the combined
  // condition and, for multiple exits, chains through conditions to determine
  // which exit to take.
  VPBasicBlock *DispatchVPBB =
      Exits.size() == 1 ? VectorEarlyExitVPBBs[0]
                        : Plan.createVPBasicBlock("vector.early.exit.check");
  VPBuilder DispatchBuilder(DispatchVPBB, DispatchVPBB->begin());
  VPValue *FirstActiveLane =
      DispatchBuilder.createNaryOp(VPInstruction::FirstActiveLane, {Combined},
                                   DebugLoc::getUnknown(), "first.active.lane");

  // For each early exit, disconnect the original exiting block
  // (early.exiting.I) from the exit block (ir-bb<exit.I>) and route through a
  // new vector.early.exit block. Update ir-bb<exit.I>'s phis to extract their
  // values at the first active lane:
  //
  // Input:
  // early.exiting.I:
  //   ...
  //   EMIT branch-on-cond vp<%cond.I>
  // Successor(s): in.loop.succ, ir-bb<exit.I>
  //
  // ir-bb<exit.I>:
  //   IR %phi = phi [ vp<%incoming.I>, early.exiting.I ], ...
  //
  // Output:
  // early.exiting.I:
  //   ...
  // Successor(s): in.loop.succ
  //
  // vector.early.exit.I:
  //   EMIT vp<%exit.val> = extract-lane vp<%first.lane>, vp<%incoming.I>
  // Successor(s): ir-bb<exit.I>
  //
  // ir-bb<exit.I>:
  //   IR %phi = phi ... (extra operand: vp<%exit.val> from
  //   vector.early.exit.I)
  //
  for (auto [Exit, VectorEarlyExitVPBB] :
       zip_equal(Exits, VectorEarlyExitVPBBs)) {
    auto &[EarlyExitingVPBB, EarlyExitVPBB, _] = Exit;
    // Adjust the phi nodes in EarlyExitVPBB.
    // 1. remove incoming values from EarlyExitingVPBB,
    // 2. extract the incoming value at FirstActiveLane
    // 3. add back the extracts as last operands for the phis
    // Then adjust the CFG, removing the edge between EarlyExitingVPBB and
    // EarlyExitVPBB and adding a new edge between VectorEarlyExitVPBB and
    // EarlyExitVPBB. The extracts at FirstActiveLane are now the incoming
    // values from VectorEarlyExitVPBB.
    for (VPRecipeBase &R : EarlyExitVPBB->phis()) {
      auto *ExitIRI = cast<VPIRPhi>(&R);
      VPValue *IncomingVal =
          ExitIRI->getIncomingValueForBlock(EarlyExitingVPBB);
      VPValue *NewIncoming = IncomingVal;
      // IR values (live-ins) need no extract; only recipe-defined values are
      // extracted at the first active lane.
      if (!isa<VPIRValue>(IncomingVal)) {
        VPBuilder EarlyExitBuilder(VectorEarlyExitVPBB);
        NewIncoming = EarlyExitBuilder.createNaryOp(
            VPInstruction::ExtractLane, {FirstActiveLane, IncomingVal},
            DebugLoc::getUnknown(), "early.exit.value");
      }
      ExitIRI->removeIncomingValueFor(EarlyExitingVPBB);
      ExitIRI->addOperand(NewIncoming);
    }

    EarlyExitingVPBB->getTerminator()->eraseFromParent();
    VPBlockUtils::disconnectBlocks(EarlyExitingVPBB, EarlyExitVPBB);
    VPBlockUtils::connectBlocks(VectorEarlyExitVPBB, EarlyExitVPBB);
  }

  // Chain through exits: for each exit, check if its condition is true at
  // the first active lane. If so, take that exit; otherwise, try the next.
  // The last exit needs no check since it must be taken if all others fail.
  //
  // For 3 exits (cond.0, cond.1, cond.2), this creates:
  //
  // latch:
  //   ...
  //   EMIT vp<%combined> = logical-or vp<%cond.0>, vp<%cond.1>, vp<%cond.2>
  //   ...
  //
  // vector.early.exit.check:
  //   EMIT vp<%first.lane> = first-active-lane vp<%combined>
  //   EMIT vp<%at.cond.0> = extract-lane vp<%first.lane>, vp<%cond.0>
  //   EMIT branch-on-cond vp<%at.cond.0>
  // Successor(s): vector.early.exit.0, vector.early.exit.check.0
  //
  // vector.early.exit.check.0:
  //   EMIT vp<%at.cond.1> = extract-lane vp<%first.lane>, vp<%cond.1>
  //   EMIT branch-on-cond vp<%at.cond.1>
  // Successor(s): vector.early.exit.1, vector.early.exit.2
  VPBasicBlock *CurrentBB = DispatchVPBB;
  for (auto [I, Exit] : enumerate(ArrayRef(Exits).drop_back())) {
    VPValue *LaneVal = DispatchBuilder.createNaryOp(
        VPInstruction::ExtractLane, {FirstActiveLane, Exit.CondToExit},
        DebugLoc::getUnknown(), "exit.cond.at.lane");

    // For the last dispatch, branch directly to the last exit on false;
    // otherwise, create a new check block.
    bool IsLastDispatch = (I + 2 == Exits.size());
    VPBasicBlock *FalseBB =
        IsLastDispatch ? VectorEarlyExitVPBBs.back()
                       : Plan.createVPBasicBlock(
                             Twine("vector.early.exit.check.") + Twine(I));

    DispatchBuilder.createNaryOp(VPInstruction::BranchOnCond, {LaneVal});
    CurrentBB->setSuccessors({VectorEarlyExitVPBBs[I], FalseBB});
    VectorEarlyExitVPBBs[I]->setPredecessors({CurrentBB});
    FalseBB->setPredecessors({CurrentBB});

    CurrentBB = FalseBB;
    DispatchBuilder.setInsertPoint(CurrentBB);
  }

  // Replace the latch terminator with the new branching logic.
  auto *LatchExitingBranch = cast<VPInstruction>(LatchVPBB->getTerminator());
  assert(LatchExitingBranch->getOpcode() == VPInstruction::BranchOnCount &&
         "Unexpected terminator");
  auto *IsLatchExitTaken =
      Builder.createICmp(CmpInst::ICMP_EQ, LatchExitingBranch->getOperand(0),
                         LatchExitingBranch->getOperand(1));

  DebugLoc LatchDL = LatchExitingBranch->getDebugLoc();
  LatchExitingBranch->eraseFromParent();
  Builder.setInsertPoint(LatchVPBB);
  // BranchOnTwoConds: any-exit-taken -> dispatch, latch-exit -> middle,
  // otherwise continue with the next vector iteration (header).
  Builder.createNaryOp(VPInstruction::BranchOnTwoConds,
                       {IsAnyExitTaken, IsLatchExitTaken}, LatchDL);
  LatchVPBB->clearSuccessors();
  LatchVPBB->setSuccessors({DispatchVPBB, MiddleVPBB, HeaderVPBB});
  DispatchVPBB->setPredecessors({LatchVPBB});
}
4359
/// This function tries to convert extended in-loop reductions to
/// VPExpressionRecipe and clamp the \p Range if it is beneficial and
/// valid. The created recipe must be decomposed to its constituent
/// recipes before execution.
static VPExpressionRecipe *
                                     VFRange &Range) {
  Type *RedTy = Ctx.Types.inferScalarType(Red);
  VPValue *VecOp = Red->getVecOp();

  // Clamp the range if using extended-reduction is profitable.
  auto IsExtendedRedValidAndClampRange =
      [&](unsigned Opcode, Instruction::CastOps ExtOpc, Type *SrcTy) -> bool {
        [&](ElementCount VF) {
          auto *SrcVecTy = cast<VectorType>(toVectorTy(SrcTy, VF));

          InstructionCost ExtCost =
              cast<VPWidenCastRecipe>(VecOp)->computeCost(VF, Ctx);
          InstructionCost RedCost = Red->computeCost(VF, Ctx);

          if (Red->isPartialReduction()) {
            // FIXME: Move partial reduction creation, costing and clamping
            // here from LoopVectorize.cpp.
            ExtRedCost = Ctx.TTI.getPartialReductionCost(
                Opcode, SrcTy, nullptr, RedTy, VF, ExtKind,
                llvm::TargetTransformInfo::PR_None, std::nullopt, Ctx.CostKind,
                RedTy->isFloatingPointTy()
                    ? std::optional{Red->getFastMathFlags()}
                    : std::nullopt);
          } else if (!RedTy->isFloatingPointTy()) {
            // TTI::getExtendedReductionCost only supports integer types.
            ExtRedCost = Ctx.TTI.getExtendedReductionCost(
                Opcode, ExtOpc == Instruction::CastOps::ZExt, RedTy, SrcVecTy,
                Red->getFastMathFlags(), CostKind);
          }
          // Profitable only if the fused cost beats extend + reduce.
          return ExtRedCost.isValid() && ExtRedCost < ExtCost + RedCost;
        },
        Range);
  };

  VPValue *A;
  // Match reduce(ext(A)).
  if (isa<VPWidenCastRecipe>(VecOp) &&
      (match(VecOp, m_ZExtOrSExt(m_VPValue(A))) ||
       match(VecOp, m_FPExt(m_VPValue(A)))) &&
      IsExtendedRedValidAndClampRange(
          RecurrenceDescriptor::getOpcode(Red->getRecurrenceKind()),
          cast<VPWidenCastRecipe>(VecOp)->getOpcode(),
          Ctx.Types.inferScalarType(A)))
    return new VPExpressionRecipe(cast<VPWidenCastRecipe>(VecOp), Red);

  return nullptr;
}
4418
/// This function tries to convert extended in-loop reductions to
4420/// VPExpressionRecipe and clamp the \p Range if it is beneficial
4421/// and valid. The created VPExpressionRecipe must be decomposed to its
4422/// constituent recipes before execution. Patterns of the
4423/// VPExpressionRecipe:
4424/// reduce.add(mul(...)),
4425/// reduce.add(mul(ext(A), ext(B))),
4426/// reduce.add(ext(mul(ext(A), ext(B)))).
4427/// reduce.fadd(fmul(ext(A), ext(B)))
4428static VPExpressionRecipe *
4430 VPCostContext &Ctx, VFRange &Range) {
4431 unsigned Opcode = RecurrenceDescriptor::getOpcode(Red->getRecurrenceKind());
4432 if (Opcode != Instruction::Add && Opcode != Instruction::Sub &&
4433 Opcode != Instruction::FAdd)
4434 return nullptr;
4435
4436 Type *RedTy = Ctx.Types.inferScalarType(Red);
4437
4438 // Clamp the range if using multiply-accumulate-reduction is profitable.
4439 auto IsMulAccValidAndClampRange =
4441 VPWidenCastRecipe *OuterExt) -> bool {
4443 [&](ElementCount VF) {
4445 Type *SrcTy =
4446 Ext0 ? Ctx.Types.inferScalarType(Ext0->getOperand(0)) : RedTy;
4447 InstructionCost MulAccCost;
4448
4449 if (Red->isPartialReduction()) {
4450 Type *SrcTy2 =
4451 Ext1 ? Ctx.Types.inferScalarType(Ext1->getOperand(0)) : nullptr;
4452 // FIXME: Move partial reduction creation, costing and clamping
4453 // here from LoopVectorize.cpp.
4454 MulAccCost = Ctx.TTI.getPartialReductionCost(
4455 Opcode, SrcTy, SrcTy2, RedTy, VF,
4457 Ext0->getOpcode())
4460 Ext1->getOpcode())
4462 Mul->getOpcode(), CostKind,
4463 RedTy->isFloatingPointTy()
4464 ? std::optional{Red->getFastMathFlags()}
4465 : std::nullopt);
4466 } else {
4467 // Only partial reductions support mixed or floating-point extends
4468 // at the moment.
4469 if (Ext0 && Ext1 &&
4470 (Ext0->getOpcode() != Ext1->getOpcode() ||
4471 Ext0->getOpcode() == Instruction::CastOps::FPExt))
4472 return false;
4473
4474 bool IsZExt =
4475 !Ext0 || Ext0->getOpcode() == Instruction::CastOps::ZExt;
4476 auto *SrcVecTy = cast<VectorType>(toVectorTy(SrcTy, VF));
4477 MulAccCost = Ctx.TTI.getMulAccReductionCost(IsZExt, Opcode, RedTy,
4478 SrcVecTy, CostKind);
4479 }
4480
4481 InstructionCost MulCost = Mul->computeCost(VF, Ctx);
4482 InstructionCost RedCost = Red->computeCost(VF, Ctx);
4483 InstructionCost ExtCost = 0;
4484 if (Ext0)
4485 ExtCost += Ext0->computeCost(VF, Ctx);
4486 if (Ext1)
4487 ExtCost += Ext1->computeCost(VF, Ctx);
4488 if (OuterExt)
4489 ExtCost += OuterExt->computeCost(VF, Ctx);
4490
4491 return MulAccCost.isValid() &&
4492 MulAccCost < ExtCost + MulCost + RedCost;
4493 },
4494 Range);
4495 };
4496
4497 VPValue *VecOp = Red->getVecOp();
4498 VPRecipeBase *Sub = nullptr;
4499 VPValue *A, *B;
4500 VPValue *Tmp = nullptr;
4501
4502 // Try to match reduce.fadd(fmul(fpext(...), fpext(...))).
4503 if (match(VecOp, m_FMul(m_FPExt(m_VPValue()), m_FPExt(m_VPValue())))) {
4504 assert(Opcode == Instruction::FAdd &&
4505 "MulAccumulateReduction from an FMul must accumulate into an FAdd "
4506 "instruction");
4507 auto *FMul = dyn_cast<VPWidenRecipe>(VecOp);
4508 if (!FMul)
4509 return nullptr;
4510
4511 auto *RecipeA = dyn_cast<VPWidenCastRecipe>(FMul->getOperand(0));
4512 auto *RecipeB = dyn_cast<VPWidenCastRecipe>(FMul->getOperand(1));
4513
4514 if (RecipeA && RecipeB &&
4515 IsMulAccValidAndClampRange(FMul, RecipeA, RecipeB, nullptr)) {
4516 return new VPExpressionRecipe(RecipeA, RecipeB, FMul, Red);
4517 }
4518 }
4519 if (RedTy->isFloatingPointTy())
4520 return nullptr;
4521
4522 // Sub reductions could have a sub between the add reduction and vec op.
4523 if (match(VecOp, m_Sub(m_ZeroInt(), m_VPValue(Tmp)))) {
4524 Sub = VecOp->getDefiningRecipe();
4525 VecOp = Tmp;
4526 }
4527
4528 // If ValB is a constant and can be safely extended, truncate it to the same
4529 // type as ExtA's operand, then extend it to the same type as ExtA. This
4530 // creates two uniform extends that can more easily be matched by the rest of
4531 // the bundling code. The ExtB reference, ValB and operand 1 of Mul are all
4532 // replaced with the new extend of the constant.
4533 auto ExtendAndReplaceConstantOp = [&Ctx, &Red](VPWidenCastRecipe *ExtA,
4534 VPWidenCastRecipe *&ExtB,
4535 VPValue *&ValB,
4536 VPWidenRecipe *Mul) {
4537 if (!ExtA || ExtB || !isa<VPIRValue>(ValB) || Red->isPartialReduction())
4538 return;
4539 Type *NarrowTy = Ctx.Types.inferScalarType(ExtA->getOperand(0));
4540 Instruction::CastOps ExtOpc = ExtA->getOpcode();
4541 const APInt *Const;
4542 if (!match(ValB, m_APInt(Const)) ||
4544 Const, NarrowTy, TTI::getPartialReductionExtendKind(ExtOpc)))
4545 return;
4546 // The truncate ensures that the type of each extended operand is the
4547 // same, and it's been proven that the constant can be extended from
4548 // NarrowTy safely. Necessary since ExtA's extended operand would be
4549 // e.g. an i8, while the const will likely be an i32. This will be
4550 // elided by later optimisations.
4551 VPBuilder Builder(Mul);
4552 auto *Trunc =
4553 Builder.createWidenCast(Instruction::CastOps::Trunc, ValB, NarrowTy);
4554 Type *WideTy = Ctx.Types.inferScalarType(ExtA);
4555 ValB = ExtB = Builder.createWidenCast(ExtOpc, Trunc, WideTy);
4556 Mul->setOperand(1, ExtB);
4557 };
4558
4559 // Try to match reduce.add(mul(...)).
4560 if (match(VecOp, m_Mul(m_VPValue(A), m_VPValue(B)))) {
4563 auto *Mul = cast<VPWidenRecipe>(VecOp);
4564
4565 // Convert reduce.add(mul(ext, const)) to reduce.add(mul(ext, ext(const)))
4566 ExtendAndReplaceConstantOp(RecipeA, RecipeB, B, Mul);
4567
4568 // Match reduce.add/sub(mul(ext, ext)).
4569 if (RecipeA && RecipeB && match(RecipeA, m_ZExtOrSExt(m_VPValue())) &&
4570 match(RecipeB, m_ZExtOrSExt(m_VPValue())) &&
4571 IsMulAccValidAndClampRange(Mul, RecipeA, RecipeB, nullptr)) {
4572 if (Sub)
4573 return new VPExpressionRecipe(RecipeA, RecipeB, Mul,
4574 cast<VPWidenRecipe>(Sub), Red);
4575 return new VPExpressionRecipe(RecipeA, RecipeB, Mul, Red);
4576 }
4577 // TODO: Add an expression type for this variant with a negated mul
4578 if (!Sub && IsMulAccValidAndClampRange(Mul, nullptr, nullptr, nullptr))
4579 return new VPExpressionRecipe(Mul, Red);
4580 }
4581 // TODO: Add an expression type for negated versions of other expression
4582 // variants.
4583 if (Sub)
4584 return nullptr;
4585
4586 // Match reduce.add(ext(mul(A, B))).
4587 if (!Red->isPartialReduction() &&
4588 match(VecOp, m_ZExtOrSExt(m_Mul(m_VPValue(A), m_VPValue(B))))) {
4589 auto *Ext = cast<VPWidenCastRecipe>(VecOp);
4590 auto *Mul = cast<VPWidenRecipe>(Ext->getOperand(0));
4593
4594 // reduce.add(ext(mul(ext, const)))
4595 // -> reduce.add(ext(mul(ext, ext(const))))
4596 ExtendAndReplaceConstantOp(Ext0, Ext1, B, Mul);
4597
4598 // reduce.add(ext(mul(ext(A), ext(B))))
4599 // -> reduce.add(mul(wider_ext(A), wider_ext(B)))
4600 // The inner extends must either have the same opcode as the outer extend or
4601 // be the same, in which case the multiply can never result in a negative
4602 // value and the outer extend can be folded away by doing wider
4603 // extends for the operands of the mul.
4604 if (Ext0 && Ext1 &&
4605 (Ext->getOpcode() == Ext0->getOpcode() || Ext0 == Ext1) &&
4606 Ext0->getOpcode() == Ext1->getOpcode() &&
4607 IsMulAccValidAndClampRange(Mul, Ext0, Ext1, Ext) && Mul->hasOneUse()) {
4608 auto *NewExt0 = new VPWidenCastRecipe(
4609 Ext0->getOpcode(), Ext0->getOperand(0), Ext->getResultType(), nullptr,
4610 *Ext0, *Ext0, Ext0->getDebugLoc());
4611 NewExt0->insertBefore(Ext0);
4612
4613 VPWidenCastRecipe *NewExt1 = NewExt0;
4614 if (Ext0 != Ext1) {
4615 NewExt1 = new VPWidenCastRecipe(Ext1->getOpcode(), Ext1->getOperand(0),
4616 Ext->getResultType(), nullptr, *Ext1,
4617 *Ext1, Ext1->getDebugLoc());
4618 NewExt1->insertBefore(Ext1);
4619 }
4620 Mul->setOperand(0, NewExt0);
4621 Mul->setOperand(1, NewExt1);
4622 Red->setOperand(1, Mul);
4623 return new VPExpressionRecipe(NewExt0, NewExt1, Mul, Red);
4624 }
4625 }
4626 return nullptr;
4627}
4628
4629 /// This function tries to create abstract recipes from the reduction recipe for
4630 /// following optimizations and cost estimation.
// NOTE(review): line 4631 (the function-name/first-parameter line, presumably
// `tryToCreateAbstractReductionRecipe(VPReductionRecipe *Red,`) was dropped
// by the doxygen extraction.
4632 VPCostContext &Ctx,
4633 VFRange &Range) {
4634 VPExpressionRecipe *AbstractR = nullptr;
// Remember the insertion point before matching, since the matchers may
// insert new recipes around Red.
4635 auto IP = std::next(Red->getIterator());
4636 auto *VPBB = Red->getParent();
// Prefer the mul-accumulate form; fall back to a plain extended reduction.
4637 if (auto *MulAcc = tryToMatchAndCreateMulAccumulateReduction(Red, Ctx, Range))
4638 AbstractR = MulAcc;
4639 else if (auto *ExtRed = tryToMatchAndCreateExtendedReduction(Red, Ctx, Range))
4640 AbstractR = ExtRed;
4641 // Cannot create abstract inloop reduction recipes.
4642 if (!AbstractR)
4643 return;
4644
4645 AbstractR->insertBefore(*VPBB, IP);
4646 Red->replaceAllUsesWith(AbstractR);
4647 }
4648
4659 
// NOTE(review): line 4660 (the function signature, presumably
// `void VPlanTransforms::materializeBroadcasts(VPlan &Plan) {`) was dropped
// by the doxygen extraction. The body adds explicit Broadcast
// VPInstructions for values defined outside the loop that have vector
// users.
4661 if (Plan.hasScalarVFOnly())
4662 return;
4663 
4664 #ifndef NDEBUG
4665 VPDominatorTree VPDT(Plan);
4666 #endif
4667 
// Candidates: live-ins plus everything defined in the entry block.
// NOTE(review): lines 4669-4670 (a comment and/or reserve call) are missing.
4668 SmallVector<VPValue *> VPValues;
4671 append_range(VPValues, Plan.getLiveIns());
4672 for (VPRecipeBase &R : *Plan.getEntry())
4673 append_range(VPValues, R.definedValues());
4674 
4675 auto *VectorPreheader = Plan.getVectorPreheader();
4676 for (VPValue *VPV : VPValues) {
// NOTE(review): line 4677 (the first half of this skip-condition) is
// missing; the visible half skips constant live-in IR values.
4678 (isa<VPIRValue>(VPV) && isa<Constant>(VPV->getLiveInIRValue())))
4679 continue;
4680 
4681 // Add explicit broadcast at the insert point that dominates all users.
4682 VPBasicBlock *HoistBlock = VectorPreheader;
4683 VPBasicBlock::iterator HoistPoint = VectorPreheader->end();
4684 for (VPUser *User : VPV->users()) {
4685 if (User->usesScalars(VPV))
4686 continue;
4687 if (cast<VPRecipeBase>(User)->getParent() == VectorPreheader)
4688 HoistPoint = HoistBlock->begin();
4689 else
4690 assert(VPDT.dominates(VectorPreheader,
4691 cast<VPRecipeBase>(User)->getParent()) &&
4692 "All users must be in the vector preheader or dominated by it");
4693 }
4694 
4695 VPBuilder Builder(cast<VPBasicBlock>(HoistBlock), HoistPoint);
4696 auto *Broadcast = Builder.createNaryOp(VPInstruction::Broadcast, {VPV});
// Rewrite only vector users (and not the broadcast itself) to use the
// explicit broadcast; scalar users keep the original value.
4697 VPV->replaceUsesWithIf(Broadcast,
4698 [VPV, Broadcast](VPUser &U, unsigned Idx) {
4699 return Broadcast != &U && !U.usesScalars(VPV);
4700 });
4701 }
4702 }
4703
// NOTE(review): line 4704 (the function signature; by its behavior this is
// presumably a VPlanTransforms load-hoisting entry point taking `VPlan
// &Plan`) was dropped by the doxygen extraction. The body hoists
// single-scalar loads with loop-invariant addresses to the vector
// preheader when noalias scope metadata proves they cannot alias any store
// in the loop.
4705 VPRegionBlock *LoopRegion = Plan.getVectorLoopRegion();
4706 
4707 // Collect candidate loads with invariant addresses and noalias scopes
4708 // metadata and memory-writing recipes with noalias metadata.
// NOTE(review): lines 4709-4711 (declarations of CandidateLoads/Stores and
// the loop header over blocks) are partly missing from the extraction.
4712 vp_depth_first_shallow(LoopRegion->getEntry()))) {
4713 for (VPRecipeBase &R : *VPBB) {
4714 // Only handle single-scalar replicated loads with invariant addresses.
4715 if (auto *RepR = dyn_cast<VPReplicateRecipe>(&R)) {
4716 if (RepR->isPredicated() || !RepR->isSingleScalar() ||
4717 RepR->getOpcode() != Instruction::Load)
4718 continue;
4719 
4720 VPValue *Addr = RepR->getOperand(0);
4721 if (Addr->isDefinedOutsideLoopRegions()) {
// NOTE(review): line 4722 (obtaining `Loc` for the load, presumably via
// vputils::getMemoryLocation) is missing from the extraction.
4723 if (!Loc.AATags.Scope)
4724 continue;
4725 CandidateLoads.push_back({RepR, Loc});
4726 }
4727 }
4728 if (R.mayWriteToMemory()) {
// A writing recipe without full noalias metadata defeats the whole
// analysis, so give up entirely (note: `return`, not `continue`).
4730 if (!Loc || !Loc->AATags.Scope || !Loc->AATags.NoAlias)
4731 return;
4732 Stores.push_back(*Loc);
4733 }
4734 }
4735 }
4736 
4737 VPBasicBlock *Preheader = Plan.getVectorPreheader();
4738 for (auto &[LoadRecipe, LoadLoc] : CandidateLoads) {
4739 // Hoist the load to the preheader if it doesn't alias with any stores
4740 // according to the noalias metadata. Other loads should have been hoisted
4741 // by other passes
4742 const AAMDNodes &LoadAA = LoadLoc.AATags;
4743 if (all_of(Stores, [&](const MemoryLocation &StoreLoc) {
// NOTE(review): line 4744 (the scope/noalias disjointness predicate call)
// is missing from the extraction.
4745 LoadAA.Scope, StoreLoc.AATags.NoAlias);
4746 })) {
4747 LoadRecipe->moveBefore(*Preheader, Preheader->getFirstNonPhi());
4748 }
4749 }
4750 }
4751
4752 // Collect common metadata from a group of replicate recipes by intersecting
4753 // metadata from all recipes in the group.
// NOTE(review): line 4754 (the signature, presumably
// `static VPIRMetadata getCommonMetadata(ArrayRef<VPReplicateRecipe *>
// Recipes) {`) was dropped by the doxygen extraction. Assumes Recipes is
// non-empty (front() is dereferenced unconditionally).
4755 VPIRMetadata CommonMetadata = *Recipes.front();
4756 for (VPReplicateRecipe *Recipe : drop_begin(Recipes))
4757 CommonMetadata.intersect(*Recipe);
4758 return CommonMetadata;
4759 }
4760
4761 template <unsigned Opcode>
// NOTE(review): lines 4762-4764 (return type, function name and leading
// parameters) were dropped by the doxygen extraction; from the callers this
// returns the per-address groups of predicated load/store replicate recipes
// whose masks are equal or complementary, for Opcode in {Load, Store}.
4765 const Loop *L) {
4766 static_assert(Opcode == Instruction::Load || Opcode == Instruction::Store,
4767 "Only Load and Store opcodes supported");
4768 constexpr bool IsLoad = (Opcode == Instruction::Load);
4769 VPTypeAnalysis TypeInfo(Plan);
4770 
4771 // For each address, collect operations with the same or complementary masks.
// For a load the accessed type is the recipe's own result; for a store it
// is the type of the stored value (operand 0).
4773 auto GetLoadStoreValueType = [&](VPReplicateRecipe *Recipe) {
4774 return TypeInfo.inferScalarType(IsLoad ? Recipe : Recipe->getOperand(0));
4775 };
// NOTE(review): line 4776 (the call producing `Groups`, grouped by address)
// is missing from the extraction.
4777 Plan, PSE, L,
4778 [](VPReplicateRecipe *RepR) { return RepR->isPredicated(); });
4779 for (auto Recipes : Groups) {
4780 if (Recipes.size() < 2)
4781 continue;
4782 
4783 // Collect groups with the same or complementary masks.
// Entries are nulled out as they are consumed, so each recipe lands in at
// most one group.
4784 for (VPReplicateRecipe *&RecipeI : Recipes) {
4785 if (!RecipeI)
4786 continue;
4787 
4788 VPValue *MaskI = RecipeI->getMask();
4789 Type *TypeI = GetLoadStoreValueType(RecipeI);
4791 Group.push_back(RecipeI);
4792 RecipeI = nullptr;
4793 
4794 // Find all operations with the same or complementary masks.
4795 bool HasComplementaryMask = false;
4796 for (VPReplicateRecipe *&RecipeJ : Recipes) {
4797 if (!RecipeJ)
4798 continue;
4799 
4800 VPValue *MaskJ = RecipeJ->getMask();
4801 Type *TypeJ = GetLoadStoreValueType(RecipeJ);
4802 if (TypeI == TypeJ) {
4803 // Check if any operation in the group has a complementary mask with
4804 // another, that is M1 == NOT(M2) or M2 == NOT(M1).
4805 HasComplementaryMask |= match(MaskI, m_Not(m_Specific(MaskJ))) ||
4806 match(MaskJ, m_Not(m_Specific(MaskI)));
4807 Group.push_back(RecipeJ);
4808 RecipeJ = nullptr;
4809 }
4810 }
4811 
// Only keep groups where at least one complementary pair exists; such
// groups are known to execute unconditionally as a whole.
4812 if (HasComplementaryMask) {
4813 assert(Group.size() >= 2 && "must have at least 2 entries");
4814 AllGroups.push_back(std::move(Group));
4815 }
4816 }
4817 }
4818 
4819 return AllGroups;
4820 }
4821
4822 // Find the recipe with minimum alignment in the group.
// Used so a merged load/store conservatively adopts the weakest alignment
// of any member. InstType is LoadInst or StoreInst (must match the
// underlying instructions).
4823 template <typename InstType>
4824 static VPReplicateRecipe *
// NOTE(review): line 4825 (the function-name line,
// `findRecipeWithMinAlign(ArrayRef<VPReplicateRecipe *> Group) {` per the
// call sites) was dropped by the doxygen extraction.
4826 return *min_element(Group, [](VPReplicateRecipe *A, VPReplicateRecipe *B) {
4827 return cast<InstType>(A->getUnderlyingInstr())->getAlign() <
4828 cast<InstType>(B->getUnderlyingInstr())->getAlign();
4829 });
4830 }
4831
// NOTE(review): lines 4832-4833 (the signature; presumably a
// VPlanTransforms entry point taking `VPlan &Plan, PredicatedScalarEvolution
// &PSE,`) were dropped by the doxygen extraction. The body replaces groups
// of predicated loads with same/complementary masks by one unpredicated
// load.
4834 const Loop *L) {
4835 auto Groups =
// NOTE(review): line 4836 (the call collecting the load groups) is missing
// from the extraction.
4837 if (Groups.empty())
4838 return;
4839 
4840 // Process each group of loads.
4841 for (auto &Group : Groups) {
4842 // Try to use the earliest (most dominating) load to replace all others.
4843 VPReplicateRecipe *EarliestLoad = Group[0];
4844 VPBasicBlock *FirstBB = EarliestLoad->getParent();
4845 VPBasicBlock *LastBB = Group.back()->getParent();
4846 
4847 // Check that the load doesn't alias with stores between first and last.
4848 auto LoadLoc = vputils::getMemoryLocation(*EarliestLoad);
4849 if (!LoadLoc || !canHoistOrSinkWithNoAliasCheck(*LoadLoc, FirstBB, LastBB))
4850 continue;
4851 
4852 // Collect common metadata from all loads in the group.
4853 VPIRMetadata CommonMetadata = getCommonMetadata(Group);
4854 
4855 // Find the load with minimum alignment to use.
4856 auto *LoadWithMinAlign = findRecipeWithMinAlign<LoadInst>(Group);
4857 
4858 bool IsSingleScalar = EarliestLoad->isSingleScalar();
4859 assert(all_of(Group,
4860 [IsSingleScalar](VPReplicateRecipe *R) {
4861 return R->isSingleScalar() == IsSingleScalar;
4862 }) &&
4863 "all members in group must agree on IsSingleScalar");
4864 
4865 // Create an unpredicated version of the earliest load with common
4866 // metadata.
// Underlying instruction comes from the min-alignment member; address and
// flags come from the earliest member.
4867 auto *UnpredicatedLoad = new VPReplicateRecipe(
4868 LoadWithMinAlign->getUnderlyingInstr(), {EarliestLoad->getOperand(0)},
4869 IsSingleScalar, /*Mask=*/nullptr, *EarliestLoad, CommonMetadata);
4870 
4871 UnpredicatedLoad->insertBefore(EarliestLoad);
4872 
4873 // Replace all loads in the group with the unpredicated load.
4874 for (VPReplicateRecipe *Load : Group) {
4875 Load->replaceAllUsesWith(UnpredicatedLoad);
4876 Load->eraseFromParent();
4877 }
4878 }
4879 }
4880
// Returns true if the group of predicated stores in StoresToSink can be
// sunk and merged into a single unconditional store, based on noalias scope
// metadata between the first and last member's blocks.
4881 static bool
// NOTE(review): line 4882 (the function-name line,
// `canSinkStoreWithNoAliasCheck(ArrayRef<VPReplicateRecipe *> StoresToSink,`
// per the caller below) was dropped by the doxygen extraction.
4883 PredicatedScalarEvolution &PSE, const Loop &L,
4884 VPTypeAnalysis &TypeInfo) {
// Scope metadata is required; without it no disambiguation is possible.
4885 auto StoreLoc = vputils::getMemoryLocation(*StoresToSink.front());
4886 if (!StoreLoc || !StoreLoc->AATags.Scope)
4887 return false;
4888 
4889 // When sinking a group of stores, all members of the group alias each other.
4890 // Skip them during the alias checks.
4891 SmallPtrSet<VPRecipeBase *, 4> StoresToSinkSet(StoresToSink.begin(),
4892 StoresToSink.end());
4893 
4894 VPBasicBlock *FirstBB = StoresToSink.front()->getParent();
4895 VPBasicBlock *LastBB = StoresToSink.back()->getParent();
4896 SinkStoreInfo SinkInfo(StoresToSinkSet, *StoresToSink[0], PSE, L, TypeInfo);
4897 return canHoistOrSinkWithNoAliasCheck(*StoreLoc, FirstBB, LastBB, SinkInfo);
4898 }
4899
// NOTE(review): lines 4900-4901 (the signature; presumably a
// VPlanTransforms entry point taking `VPlan &Plan, PredicatedScalarEvolution
// &PSE,`) were dropped by the doxygen extraction. The body merges groups of
// predicated stores with same/complementary masks into one unconditional
// store of a mask-selected value.
4902 const Loop *L) {
4903 auto Groups =
// NOTE(review): line 4904 (the call collecting the store groups) is missing
// from the extraction.
4905 if (Groups.empty())
4906 return;
4907 
4908 VPTypeAnalysis TypeInfo(Plan);
4909 
4910 for (auto &Group : Groups) {
4911 if (!canSinkStoreWithNoAliasCheck(Group, PSE, *L, TypeInfo))
4912 continue;
4913 
4914 // Use the last (most dominated) store's location for the unconditional
4915 // store.
4916 VPReplicateRecipe *LastStore = Group.back();
4917 VPBasicBlock *InsertBB = LastStore->getParent();
4918 
4919 // Collect common alias metadata from all stores in the group.
4920 VPIRMetadata CommonMetadata = getCommonMetadata(Group);
4921 
4922 // Build select chain for stored values.
// Later (more dominated) members take priority: each select prefers the
// current member's value when its mask is true.
4923 VPValue *SelectedValue = Group[0]->getOperand(0);
4924 VPBuilder Builder(InsertBB, LastStore->getIterator());
4925 
4926 bool IsSingleScalar = Group[0]->isSingleScalar();
4927 for (unsigned I = 1; I < Group.size(); ++I) {
4928 assert(IsSingleScalar == Group[I]->isSingleScalar() &&
4929 "all members in group must agree on IsSingleScalar");
4930 VPValue *Mask = Group[I]->getMask();
4931 VPValue *Value = Group[I]->getOperand(0);
4932 SelectedValue = Builder.createSelect(Mask, Value, SelectedValue,
4933 Group[I]->getDebugLoc());
4934 }
4935 
4936 // Find the store with minimum alignment to use.
4937 auto *StoreWithMinAlign = findRecipeWithMinAlign<StoreInst>(Group);
4938 
4939 // Create unconditional store with selected value and common metadata.
4940 auto *UnpredicatedStore = new VPReplicateRecipe(
4941 StoreWithMinAlign->getUnderlyingInstr(),
4942 {SelectedValue, LastStore->getOperand(1)}, IsSingleScalar,
4943 /*Mask=*/nullptr, *LastStore, CommonMetadata);
4944 UnpredicatedStore->insertBefore(*InsertBB, LastStore->getIterator());
4945 
4946 // Remove all predicated stores from the group.
4947 for (VPReplicateRecipe *Store : Group)
4948 Store->eraseFromParent();
4949 }
4950 }
4951
// NOTE(review): line 4952 (the function-name line of this VPlanTransforms
// entry point) was dropped by the doxygen extraction. The body eagerly sets
// the vector trip count when the scalar trip count is a SCEV constant, as
// (TC udiv (VF*UF)) * (VF*UF).
4953 VPlan &Plan, ElementCount BestVF, unsigned BestUF,
// NOTE(review): line 4954 (trailing parameter(s), presumably
// `PredicatedScalarEvolution &PSE) {`) is missing from the extraction.
4955 assert(Plan.hasVF(BestVF) && "BestVF is not available in Plan");
4956 assert(Plan.hasUF(BestUF) && "BestUF is not available in Plan");
4957 
4958 VPValue *TC = Plan.getTripCount();
4959 if (TC->getNumUsers() == 0)
4960 return;
4961 
4962 // Skip cases for which the trip count may be non-trivial to materialize.
4963 // I.e., when a scalar tail is absent - due to tail folding, or when a scalar
4964 // tail is required.
4965 if (!Plan.hasScalarTail() ||
// NOTE(review): line 4966 (the middle-block successor comparison against
// the scalar preheader) is missing from the extraction.
4967 Plan.getScalarPreheader() ||
4968 !isa<VPIRValue>(TC))
4969 return;
4970 
4971 // Materialize vector trip counts for constants early if it can simply
4972 // be computed as (Original TC / VF * UF) * VF * UF.
4973 // TODO: Compute vector trip counts for loops requiring a scalar epilogue and
4974 // tail-folded loops.
4975 ScalarEvolution &SE = *PSE.getSE();
4976 auto *TCScev = SE.getSCEV(TC->getLiveInIRValue());
4977 if (!isa<SCEVConstant>(TCScev))
4978 return;
4979 const SCEV *VFxUF = SE.getElementCount(TCScev->getType(), BestVF * BestUF);
4980 auto VecTCScev = SE.getMulExpr(SE.getUDivExpr(TCScev, VFxUF), VFxUF);
// Only commit the value if the whole expression folded to a constant.
4981 if (auto *ConstVecTC = dyn_cast<SCEVConstant>(VecTCScev))
4982 Plan.getVectorTripCount().setUnderlyingValue(ConstVecTC->getValue());
4983 }
4984
// NOTE(review): line 4985 (the function-name line of this VPlanTransforms
// entry point taking `VPlan &Plan,`) was dropped by the doxygen extraction.
// Materializes the backedge-taken count as TripCount - 1 in the vector
// preheader.
4986 VPBasicBlock *VectorPH) {
// NOTE(review): line 4987 (obtaining `BTC`, presumably
// `Plan.getBackedgeTakenCount()`) is missing from the extraction.
4988 if (BTC->getNumUsers() == 0)
4989 return;
4990 
4991 VPBuilder Builder(VectorPH, VectorPH->begin());
4992 auto *TCTy = VPTypeAnalysis(Plan).inferScalarType(Plan.getTripCount());
4993 auto *TCMO =
4994 Builder.createSub(Plan.getTripCount(), Plan.getConstantInt(TCTy, 1),
4995 DebugLoc::getCompilerGenerated(), "trip.count.minus.1");
4996 BTC->replaceAllUsesWith(TCMO);
4997 }
4998
// NOTE(review): line 4999 (the function signature of this VPlanTransforms
// entry point taking `VPlan &Plan`) was dropped by the doxygen extraction.
// The body materializes explicit Build(Struct)Vector packs for replicating
// recipes with vector users, and explicit Unpack instructions for vector
// defs with scalar users.
5000 if (Plan.hasScalarVFOnly())
5001 return;
5002 
5003 VPTypeAnalysis TypeInfo(Plan);
5004 VPRegionBlock *LoopRegion = Plan.getVectorLoopRegion();
5005 auto VPBBsOutsideLoopRegion = VPBlockUtils::blocksOnly<VPBasicBlock>(
// NOTE(review): line 5006 (the traversal argument for the outside-region
// blocks) is missing from the extraction.
5007 auto VPBBsInsideLoopRegion = VPBlockUtils::blocksOnly<VPBasicBlock>(
5008 vp_depth_first_shallow(LoopRegion->getEntry()));
5009 // Materialize Build(Struct)Vector for all replicating VPReplicateRecipes,
5010 // VPScalarIVStepsRecipe and VPInstructions, excluding ones in replicate
5011 // regions. Those are not materialized explicitly yet. Those vector users are
5012 // still handled in VPReplicateRegion::execute(), via shouldPack().
5013 // TODO: materialize build vectors for replicating recipes in replicating
5014 // regions.
5015 for (VPBasicBlock *VPBB :
5016 concat<VPBasicBlock *>(VPBBsOutsideLoopRegion, VPBBsInsideLoopRegion)) {
5017 for (VPRecipeBase &R : make_early_inc_range(*VPBB)) {
// NOTE(review): line 5018 (the recipe-kind filter guarding this continue)
// is missing from the extraction.
5019 continue;
5020 auto *DefR = cast<VPSingleDefRecipe>(&R);
5021 auto UsesVectorOrInsideReplicateRegion = [DefR, LoopRegion](VPUser *U) {
5022 VPRegionBlock *ParentRegion = cast<VPRecipeBase>(U)->getRegion();
5023 return !U->usesScalars(DefR) || ParentRegion != LoopRegion;
5024 };
// Skip single-scalar replicates, lane-independent VPInstructions, and
// defs with no vector/replicate-region users - no pack is needed.
5025 if ((isa<VPReplicateRecipe>(DefR) &&
5026 cast<VPReplicateRecipe>(DefR)->isSingleScalar()) ||
5027 (isa<VPInstruction>(DefR) &&
5029 !cast<VPInstruction>(DefR)->doesGeneratePerAllLanes())) ||
5030 none_of(DefR->users(), UsesVectorOrInsideReplicateRegion))
5031 continue;
5032 
// Struct-typed results need BuildStructVector; plain scalars BuildVector.
5033 Type *ScalarTy = TypeInfo.inferScalarType(DefR);
5034 unsigned Opcode = ScalarTy->isStructTy()
// NOTE(review): lines 5035-5036 (the two opcode alternatives of this
// conditional) are missing from the extraction.
5037 auto *BuildVector = new VPInstruction(Opcode, {DefR});
5038 BuildVector->insertAfter(DefR);
5039 
5040 DefR->replaceUsesWithIf(
5041 BuildVector, [BuildVector, &UsesVectorOrInsideReplicateRegion](
5042 VPUser &U, unsigned) {
5043 return &U != BuildVector && UsesVectorOrInsideReplicateRegion(&U);
5044 });
5045 }
5046 }
5047 
5048 // Create explicit VPInstructions to convert vectors to scalars. The current
5049 // implementation is conservative - it may miss some cases that may or may not
5050 // be vector values. TODO: introduce Unpacks speculatively - remove them later
5051 // if they are known to operate on scalar values.
5052 for (VPBasicBlock *VPBB : VPBBsInsideLoopRegion) {
5053 for (VPRecipeBase &R : make_early_inc_range(*VPBB)) {
// NOTE(review): lines 5054-5055 (the recipe filter guarding this continue)
// are missing from the extraction.
5056 continue;
5057 for (VPValue *Def : R.definedValues()) {
5058 // Skip recipes that are single-scalar or only have their first lane
5059 // used.
5060 // TODO: The Defs skipped here may or may not be vector values.
5061 // Introduce Unpacks, and remove them later, if they are guaranteed to
5062 // produce scalar values.
// NOTE(review): line 5063 (the single-scalar/first-lane-only predicate)
// is missing from the extraction.
5064 continue;
5065 
5066 // At the moment, we create unpacks only for scalar users outside
5067 // replicate regions. Recipes inside replicate regions still extract the
5068 // required lanes implicitly.
5069 // TODO: Remove once replicate regions are unrolled completely.
5070 auto IsCandidateUnpackUser = [Def](VPUser *U) {
5071 VPRegionBlock *ParentRegion = cast<VPRecipeBase>(U)->getRegion();
5072 return U->usesScalars(Def) &&
5073 (!ParentRegion || !ParentRegion->isReplicator());
5074 };
5075 if (none_of(Def->users(), IsCandidateUnpackUser))
5076 continue;
5077 
5078 auto *Unpack = new VPInstruction(VPInstruction::Unpack, {Def});
// Unpacks of phi results must come after the phi section of the block.
5079 if (R.isPhi())
5080 Unpack->insertBefore(*VPBB, VPBB->getFirstNonPhi());
5081 else
5082 Unpack->insertAfter(&R);
5083 Def->replaceUsesWithIf(Unpack,
5084 [&IsCandidateUnpackUser](VPUser &U, unsigned) {
5085 return IsCandidateUnpackUser(&U);
5086 });
5087 }
5088 }
5089 }
5090 }
5091
// NOTE(review): line 5092 (the function-name line, presumably
// `void VPlanTransforms::materializeVectorTripCount(VPlan &Plan,`) was
// dropped by the doxygen extraction. Materializes the vector trip count as
// an explicit expression in the vector preheader.
5093 VPBasicBlock *VectorPHVPBB,
5094 bool TailByMasking,
5095 bool RequiresScalarEpilogue,
5096 VPValue *Step) {
5097 VPSymbolicValue &VectorTC = Plan.getVectorTripCount();
5098 // There's nothing to do if there are no users of the vector trip count or its
5099 // IR value has already been set.
5100 if (VectorTC.getNumUsers() == 0 || VectorTC.getUnderlyingValue())
5101 return;
5102 
5103 VPValue *TC = Plan.getTripCount();
5104 Type *TCTy = VPTypeAnalysis(Plan).inferScalarType(TC);
5105 VPBasicBlock::iterator InsertPt = VectorPHVPBB->begin();
5106 if (auto *StepR = Step->getDefiningRecipe()) {
5107 assert(StepR->getParent() == VectorPHVPBB &&
5108 "Step must be defined in VectorPHVPBB");
5109 // Insert after Step's definition to maintain valid def-use ordering.
5110 InsertPt = std::next(StepR->getIterator());
5111 }
5112 VPBuilder Builder(VectorPHVPBB, InsertPt);
5113 
5114 // If the tail is to be folded by masking, round the number of iterations N
5115 // up to a multiple of Step instead of rounding down. This is done by first
5116 // adding Step-1 and then rounding down. Note that it's ok if this addition
5117 // overflows: the vector induction variable will eventually wrap to zero given
5118 // that it starts at zero and its Step is a power of two; the loop will then
5119 // exit, with the last early-exit vector comparison also producing all-true.
5120 if (TailByMasking) {
5121 TC = Builder.createAdd(
5122 TC, Builder.createSub(Step, Plan.getConstantInt(TCTy, 1)),
5123 DebugLoc::getCompilerGenerated(), "n.rnd.up");
5124 }
5125 
5126 // Now we need to generate the expression for the part of the loop that the
5127 // vectorized body will execute. This is equal to N - (N % Step) if scalar
5128 // iterations are not required for correctness, or N - Step, otherwise. Step
5129 // is equal to the vectorization factor (number of SIMD elements) times the
5130 // unroll factor (number of SIMD instructions).
5131 VPValue *R =
5132 Builder.createNaryOp(Instruction::URem, {TC, Step},
5133 DebugLoc::getCompilerGenerated(), "n.mod.vf");
5134 
5135 // There are cases where we *must* run at least one iteration in the remainder
5136 // loop. See the cost model for when this can happen. If the step evenly
5137 // divides the trip count, we set the remainder to be equal to the step. If
5138 // the step does not evenly divide the trip count, no adjustment is necessary
5139 // since there will already be scalar iterations. Note that the minimum
5140 // iterations check ensures that N >= Step.
5141 if (RequiresScalarEpilogue) {
// NOTE(review): the assert string says "fail folding"; presumably a typo
// for "tail folding" - fix upstream (not changeable in a comment-only
// edit).
5142 assert(!TailByMasking &&
5143 "requiring scalar epilogue is not supported with fail folding");
5144 VPValue *IsZero =
5145 Builder.createICmp(CmpInst::ICMP_EQ, R, Plan.getZero(TCTy));
5146 R = Builder.createSelect(IsZero, Step, R);
5147 }
5148 
5149 VPValue *Res =
5150 Builder.createSub(TC, R, DebugLoc::getCompilerGenerated(), "n.vec");
5151 VectorTC.replaceAllUsesWith(Res);
5152 }
5153
// NOTE(review): line 5154 (the function-name line and leading parameters,
// presumably `void VPlanTransforms::materializeVFAndVFxUF(VPlan &Plan,
// VPBasicBlock *VectorPH,`) was dropped by the doxygen extraction.
// Materializes the symbolic VF and VFxUF values in the vector preheader.
5155 ElementCount VFEC) {
5156 // If VF and VFxUF have already been materialized (no remaining users),
5157 // there's nothing more to do.
5158 if (Plan.getVF().isMaterialized()) {
5159 assert(Plan.getVFxUF().isMaterialized() &&
5160 "VF and VFxUF must be materialized together");
5161 return;
5162 }
5163 
5164 VPBuilder Builder(VectorPH, VectorPH->begin());
5165 Type *TCTy = VPTypeAnalysis(Plan).inferScalarType(Plan.getTripCount());
5166 VPValue &VF = Plan.getVF();
5167 VPValue &VFxUF = Plan.getVFxUF();
5168 // If there are no users of the runtime VF, compute VFxUF by constant folding
5169 // the multiplication of VF and UF.
5170 if (VF.getNumUsers() == 0) {
5171 VPValue *RuntimeVFxUF =
5172 Builder.createElementCount(TCTy, VFEC * Plan.getConcreteUF());
5173 VFxUF.replaceAllUsesWith(RuntimeVFxUF);
5174 return;
5175 }
5176 
5177 // For users of the runtime VF, compute it as VF * vscale, and VFxUF as (VF *
5178 // vscale) * UF.
5179 VPValue *RuntimeVF = Builder.createElementCount(TCTy, VFEC);
// NOTE(review): line 5180 (the condition guarding this broadcast, likely
// checking for vector users of VF) and line 5182 (the replaceUsesWithIf
// call head) are missing from the extraction.
5181 VPValue *BC = Builder.createNaryOp(VPInstruction::Broadcast, RuntimeVF);
5183 BC, [&VF](VPUser &U, unsigned) { return !U.usesScalars(&VF); });
5184 }
5185 VF.replaceAllUsesWith(RuntimeVF);
5186 
// VF * UF cannot overflow unsigned (nuw), may be signed-wrapping (no nsw).
5187 VPValue *MulByUF = Builder.createOverflowingOp(
5188 Instruction::Mul,
5189 {RuntimeVF, Plan.getConstantInt(TCTy, Plan.getConcreteUF())},
5190 {true, false});
5191 VFxUF.replaceAllUsesWith(MulByUF);
5192 }
5193
// NOTE(review): lines 5194-5195 (the signature, presumably a
// VPlanTransforms entry point taking `VPlan &Plan, ScalarEvolution &SE`
// and returning the expanded-SCEV map) were dropped by the doxygen
// extraction. Expands all VPExpandSCEVRecipes in the entry block to IR and
// replaces them with live-ins.
5196 SCEVExpander Expander(SE, "induction", /*PreserveLCSSA=*/false);
5197 
5198 auto *Entry = cast<VPIRBasicBlock>(Plan.getEntry());
5199 BasicBlock *EntryBB = Entry->getIRBasicBlock();
5200 DenseMap<const SCEV *, Value *> ExpandedSCEVs;
5201 for (VPRecipeBase &R : make_early_inc_range(*Entry)) {
// NOTE(review): line 5202 (a skip condition before the dyn_cast) is
// missing from the extraction.
5203 continue;
// Expansion recipes are clustered at the start; stop at the first
// non-expansion recipe.
5204 auto *ExpSCEV = dyn_cast<VPExpandSCEVRecipe>(&R);
5205 if (!ExpSCEV)
5206 break;
5207 const SCEV *Expr = ExpSCEV->getSCEV();
5208 Value *Res =
5209 Expander.expandCodeFor(Expr, Expr->getType(), EntryBB->getTerminator());
5210 ExpandedSCEVs[ExpSCEV->getSCEV()] = Res;
5211 VPValue *Exp = Plan.getOrAddLiveIn(Res);
5212 ExpSCEV->replaceAllUsesWith(Exp);
// Keep the plan's trip count pointing at a live value if it was expanded.
5213 if (Plan.getTripCount() == ExpSCEV)
5214 Plan.resetTripCount(Exp);
5215 ExpSCEV->eraseFromParent();
5216 }
// NOTE(review): line 5217 (the assert condition line) is missing from the
// extraction.
5218 "VPExpandSCEVRecipes must be at the beginning of the entry block, "
5219 "before any VPIRInstructions");
5220 // Add IR instructions in the entry basic block but not in the VPIRBasicBlock
5221 // to the VPIRBasicBlock.
5222 auto EI = Entry->begin();
5223 for (Instruction &I : drop_end(*EntryBB)) {
5224 if (EI != Entry->end() && isa<VPIRInstruction>(*EI) &&
5225 &cast<VPIRInstruction>(&*EI)->getInstruction() == &I) {
5226 EI++;
5227 continue;
5228 }
// NOTE(review): line 5229 (wrapping I into the VPIRBasicBlock) is missing
// from the extraction.
5230 }
5231 
5232 return ExpandedSCEVs;
5233 }
5234
5235/// Returns true if \p V is VPWidenLoadRecipe or VPInterleaveRecipe that can be
5236/// converted to a narrower recipe. \p V is used by a wide recipe that feeds a
5237/// store interleave group at index \p Idx, \p WideMember0 is the recipe feeding
5238/// the same interleave group at index 0. A VPWidenLoadRecipe can be narrowed to
5239/// an index-independent load if it feeds all wide ops at all indices (\p OpV
5240/// must be the operand at index \p OpIdx for both the recipe at lane 0, \p
5241/// WideMember0). A VPInterleaveRecipe can be narrowed to a wide load, if \p V
5242/// is defined at \p Idx of a load interleave group.
5243static bool canNarrowLoad(VPSingleDefRecipe *WideMember0, unsigned OpIdx,
5244 VPValue *OpV, unsigned Idx, bool IsScalable) {
5245 VPValue *Member0Op = WideMember0->getOperand(OpIdx);
5246 VPRecipeBase *Member0OpR = Member0Op->getDefiningRecipe();
5247 if (!Member0OpR)
5248 return Member0Op == OpV;
5249 if (auto *W = dyn_cast<VPWidenLoadRecipe>(Member0OpR))
5250 // For scalable VFs, the narrowed plan processes vscale iterations at once,
5251 // so a shared wide load cannot be narrowed to a uniform scalar; bail out.
5252 return !IsScalable && !W->getMask() && W->isConsecutive() &&
5253 Member0Op == OpV;
5254 if (auto *IR = dyn_cast<VPInterleaveRecipe>(Member0OpR))
5255 return IR->getInterleaveGroup()->isFull() && IR->getVPValue(Idx) == OpV;
5256 return false;
5257}
5258
// Returns true if all recipes in Ops (the members feeding a store
// interleave group) perform the same operation and each of their operands
// can be narrowed (recursively, or via canNarrowLoad).
5259 static bool canNarrowOps(ArrayRef<VPValue *> Ops, bool IsScalable) {
// NOTE(review): line 5260 (a guard before the dyn_cast, presumably
// checking Ops[0] has a defining single-def recipe) is missing from the
// extraction.
5261 auto *WideMember0 = dyn_cast<VPSingleDefRecipe>(Ops[0]);
5262 if (!WideMember0)
5263 return false;
5264 for (VPValue *V : Ops) {
// NOTE(review): line 5265 (the per-member filter guarding this return) is
// missing from the extraction.
5266 return false;
5267 auto *R = cast<VPSingleDefRecipe>(V);
// All members must perform the identical operation.
5268 if (getOpcodeOrIntrinsicID(R) != getOpcodeOrIntrinsicID(WideMember0))
5269 return false;
5270 }
5271 
// Check each operand position: either the whole column of operands can be
// narrowed recursively, or each element passes canNarrowLoad against the
// index-0 member.
5272 for (unsigned Idx = 0; Idx != WideMember0->getNumOperands(); ++Idx) {
// NOTE(review): line 5273 (the declaration of OpsI) is missing from the
// extraction.
5274 for (VPValue *Op : Ops)
5275 OpsI.push_back(Op->getDefiningRecipe()->getOperand(Idx));
5276 
5277 if (canNarrowOps(OpsI, IsScalable))
5278 continue;
5279 
5280 if (any_of(enumerate(OpsI), [WideMember0, Idx, IsScalable](const auto &P) {
5281 const auto &[OpIdx, OpV] = P;
5282 return !canNarrowLoad(WideMember0, Idx, OpV, OpIdx, IsScalable);
5283 }))
5284 return false;
5285 }
5286 
5287 return true;
5288 }
5289
5290 /// Returns VF from \p VFs if \p IR is a full interleave group with factor and
5291 /// number of members both equal to VF. The interleave group must also access
5292 /// the full vector width.
5293 static std::optional<ElementCount> isConsecutiveInterleaveGroup(
// NOTE(review): line 5294 (the leading parameters, presumably
// `VPInterleaveRecipe *InterleaveR, ArrayRef<ElementCount> VFs,`) was
// dropped by the doxygen extraction.
5295 VPTypeAnalysis &TypeInfo, const TargetTransformInfo &TTI) {
5296 if (!InterleaveR || InterleaveR->getMask())
5297 return std::nullopt;
5298 
// All members must share one element type: defined values for a load
// group, stored values for a store group.
5299 Type *GroupElementTy = nullptr;
5300 if (InterleaveR->getStoredValues().empty()) {
5301 GroupElementTy = TypeInfo.inferScalarType(InterleaveR->getVPValue(0));
5302 if (!all_of(InterleaveR->definedValues(),
5303 [&TypeInfo, GroupElementTy](VPValue *Op) {
5304 return TypeInfo.inferScalarType(Op) == GroupElementTy;
5305 }))
5306 return std::nullopt;
5307 } else {
5308 GroupElementTy =
5309 TypeInfo.inferScalarType(InterleaveR->getStoredValues()[0]);
5310 if (!all_of(InterleaveR->getStoredValues(),
5311 [&TypeInfo, GroupElementTy](VPValue *Op) {
5312 return TypeInfo.inferScalarType(Op) == GroupElementTy;
5313 }))
5314 return std::nullopt;
5315 }
5316 
5317 auto IG = InterleaveR->getInterleaveGroup();
5318 if (IG->getFactor() != IG->getNumMembers())
5319 return std::nullopt;
5320 
5321 auto GetVectorBitWidthForVF = [&TTI](ElementCount VF) {
// NOTE(review): lines 5323-5324 (the register-kind arguments of
// getRegisterBitWidth) are missing from the extraction.
5322 TypeSize Size = TTI.getRegisterBitWidth(
5325 assert(Size.isScalable() == VF.isScalable() &&
5326 "if Size is scalable, VF must be scalable and vice versa");
5327 return Size.getKnownMinValue();
5328 };
5329 
// Accept the first VF where the factor equals VF and the group exactly
// fills a vector register.
5330 for (ElementCount VF : VFs) {
5331 unsigned MinVal = VF.getKnownMinValue();
5332 unsigned GroupSize = GroupElementTy->getScalarSizeInBits() * MinVal;
5333 if (IG->getFactor() == MinVal && GroupSize == GetVectorBitWidthForVF(VF))
5334 return {VF};
5335 }
5336 return std::nullopt;
5337 }
5338
5339/// Returns true if \p VPValue is a narrow VPValue.
5340static bool isAlreadyNarrow(VPValue *VPV) {
5341 if (isa<VPIRValue>(VPV))
5342 return true;
5343 auto *RepR = dyn_cast<VPReplicateRecipe>(VPV);
5344 return RepR && RepR->isSingleScalar();
5345}
5346
5347// Convert a wide recipe defining a VPValue \p V feeding an interleave group to
5348// a narrow variant.
// Already-narrowed values are tracked in NarrowedOps so each definition is
// converted at most once.
// NOTE(review): dropped line 5350 — the signature line naming the parameters
// (V and the NarrowedOps set); restore from upstream before modifying.
5349static VPValue *
5351  auto *R = V->getDefiningRecipe();
5352  if (!R || NarrowedOps.contains(V))
5353    return V;
5354
5355  if (isAlreadyNarrow(V))
5356    return V;
5357
// NOTE(review): dropped line 5358 — the guard selecting this branch
// (presumably for wide single-def recipes whose operands are narrowed
// recursively in place); confirm upstream.
5359    auto *WideMember0 = cast<VPSingleDefRecipe>(R);
5360    for (unsigned Idx = 0, E = WideMember0->getNumOperands(); Idx != E; ++Idx)
5361      WideMember0->setOperand(
5362          Idx,
5363          narrowInterleaveGroupOp(WideMember0->getOperand(Idx), NarrowedOps));
5364    return V;
5365  }
5366
5367  if (auto *LoadGroup = dyn_cast<VPInterleaveRecipe>(R)) {
5368    // Narrow interleave group to wide load, as transformed VPlan will only
5369    // process one original iteration.
5370    auto *LI = cast<LoadInst>(LoadGroup->getInterleaveGroup()->getInsertPos());
5371    auto *L = new VPWidenLoadRecipe(
5372        *LI, LoadGroup->getAddr(), LoadGroup->getMask(), /*Consecutive=*/true,
5373        /*Reverse=*/false, {}, LoadGroup->getDebugLoc());
5374    L->insertBefore(LoadGroup);
5375    NarrowedOps.insert(L);
5376    return L;
5377  }
5378
// Single-scalar replicated loads are already narrow; just record them.
5379  if (auto *RepR = dyn_cast<VPReplicateRecipe>(R)) {
5380    assert(RepR->isSingleScalar() &&
5381           isa<LoadInst>(RepR->getUnderlyingInstr()) &&
5382           "must be a single scalar load");
5383    NarrowedOps.insert(RepR);
5384    return RepR;
5385  }
5386
// Remaining case: a wide load. Strip any vector-pointer wrapper to get the
// plain scalar address.
5387  auto *WideLoad = cast<VPWidenLoadRecipe>(R);
5388  VPValue *PtrOp = WideLoad->getAddr();
5389  if (auto *VecPtr = dyn_cast<VPVectorPointerRecipe>(PtrOp))
5390    PtrOp = VecPtr->getOperand(0);
5391  // Narrow wide load to uniform scalar load, as transformed VPlan will only
5392  // process one original iteration.
5393  auto *N = new VPReplicateRecipe(&WideLoad->getIngredient(), {PtrOp},
5394                                  /*IsUniform*/ true,
5395                                  /*Mask*/ nullptr, {}, *WideLoad);
5396  N->insertBefore(WideLoad);
5397  NarrowedOps.insert(N);
5398  return N;
5399}
5400
// Narrow all interleave groups in the plan so the transformed VPlan processes
// one original iteration per vector step, splitting off a clone that keeps the
// remaining VFs. Returns the clone (or nullptr if the transform does not
// apply).
// NOTE(review): this export dropped several lines (5402 — the function name
// and first parameters; 5419, 5424, 5427, 5430-5431, 5454, 5546, 5552, 5564);
// restore from upstream before modifying code.
5401std::unique_ptr<VPlan>
5403                                           const TargetTransformInfo &TTI) {
5404  VPRegionBlock *VectorLoop = Plan.getVectorLoopRegion();
5405
5406  if (!VectorLoop)
5407    return nullptr;
5408
5409  // Only handle single-block loops for now.
5410  if (VectorLoop->getEntryBasicBlock() != VectorLoop->getExitingBasicBlock())
5411    return nullptr;
5412
5413  // Skip plans when we may not be able to properly narrow.
5414  VPBasicBlock *Exiting = VectorLoop->getExitingBasicBlock();
5415  if (!match(&Exiting->back(), m_BranchOnCount()))
5416    return nullptr;
5417
5418  assert(match(&Exiting->back(),
5420                                m_Specific(&Plan.getVectorTripCount()))) &&
5421         "unexpected branch-on-count");
5422
5423  VPTypeAnalysis TypeInfo(Plan);
// NOTE(review): dropped line 5424 — the declaration of StoreGroups
// (a small vector of store interleave recipes collected below).
5425  std::optional<ElementCount> VFToOptimize;
5426  for (auto &R : *VectorLoop->getEntryBasicBlock()) {
// NOTE(review): dropped lines 5427 and 5430-5431 — the guards whose failure
// triggers these two 'continue's (likely skipping the canonical IV and
// memory-neutral recipes); confirm upstream.
5428      continue;
5429
5432      continue;
5433
5434    // Bail out on recipes not supported at the moment:
5435    // * phi recipes other than the canonical induction
5436    // * recipes writing to memory except interleave groups
5437    // Only support plans with a canonical induction phi.
5438    if (R.isPhi())
5439      return nullptr;
5440
5441    auto *InterleaveR = dyn_cast<VPInterleaveRecipe>(&R);
5442    if (R.mayWriteToMemory() && !InterleaveR)
5443      return nullptr;
5444
5445    // All other ops are allowed, but we reject uses that cannot be converted
5446    // when checking all allowed consumers (store interleave groups) below.
5447    if (!InterleaveR)
5448      continue;
5449
5450    // Try to find a single VF, where all interleave groups are consecutive and
5451    // saturate the full vector width. If we already have a candidate VF, check
5452    // if it is applicable for the current InterleaveR, otherwise look for a
5453    // suitable VF across the Plan's VFs.
// NOTE(review): dropped line 5454 — the declaration of VFs initialized by
// the conditional expression below.
5455        VFToOptimize ? SmallVector<ElementCount>({*VFToOptimize})
5456                     : to_vector(Plan.vectorFactors());
5457    std::optional<ElementCount> NarrowedVF =
5458        isConsecutiveInterleaveGroup(InterleaveR, VFs, TypeInfo, TTI);
5459    if (!NarrowedVF || (VFToOptimize && NarrowedVF != VFToOptimize))
5460      return nullptr;
5461    VFToOptimize = NarrowedVF;
5462
5463    // Skip read interleave groups.
5464    if (InterleaveR->getStoredValues().empty())
5465      continue;
5466
5467    // Narrow interleave groups, if all operands are already matching narrow
5468    // ops.
5469    auto *Member0 = InterleaveR->getStoredValues()[0];
5470    if (isAlreadyNarrow(Member0) &&
5471        all_of(InterleaveR->getStoredValues(), equal_to(Member0))) {
5472      StoreGroups.push_back(InterleaveR);
5473      continue;
5474    }
5475
5476    // For now, we only support full interleave groups storing load interleave
5477    // groups.
5478    if (all_of(enumerate(InterleaveR->getStoredValues()), [](auto Op) {
5479          VPRecipeBase *DefR = Op.value()->getDefiningRecipe();
5480          if (!DefR)
5481            return false;
5482          auto *IR = dyn_cast<VPInterleaveRecipe>(DefR);
5483          return IR && IR->getInterleaveGroup()->isFull() &&
5484                 IR->getVPValue(Op.index()) == Op.value();
5485        })) {
5486      StoreGroups.push_back(InterleaveR);
5487      continue;
5488    }
5489
5490    // Check if all values feeding InterleaveR are matching wide recipes, which
5491    // operands that can be narrowed.
5492    if (!canNarrowOps(InterleaveR->getStoredValues(),
5493                      VFToOptimize->isScalable()))
5494      return nullptr;
5495    StoreGroups.push_back(InterleaveR);
5496  }
5497
5498  if (StoreGroups.empty())
5499    return nullptr;
5500
5501  VPBasicBlock *MiddleVPBB = Plan.getMiddleBlock();
5502  bool RequiresScalarEpilogue =
5503      MiddleVPBB->getNumSuccessors() == 1 &&
5504      MiddleVPBB->getSingleSuccessor() == Plan.getScalarPreheader();
5505  // Bail out for tail-folding (middle block with a single successor to exit).
5506  if (MiddleVPBB->getNumSuccessors() != 2 && !RequiresScalarEpilogue)
5507    return nullptr;
5508
5509  // All interleave groups in Plan can be narrowed for VFToOptimize. Split the
5510  // original Plan into 2: a) a new clone which contains all VFs of Plan, except
5511  // VFToOptimize, and b) the original Plan with VFToOptimize as single VF.
5512  // TODO: Handle cases where only some interleave groups can be narrowed.
5513  std::unique_ptr<VPlan> NewPlan;
5514  if (size(Plan.vectorFactors()) != 1) {
5515    NewPlan = std::unique_ptr<VPlan>(Plan.duplicate());
5516    Plan.setVF(*VFToOptimize);
5517    NewPlan->removeVF(*VFToOptimize);
5518  }
5519
5520  // Convert InterleaveGroup \p R to a single VPWidenLoadRecipe.
5521  SmallPtrSet<VPValue *, 4> NarrowedOps;
5522  // Narrow operation tree rooted at store groups.
5523  for (auto *StoreGroup : StoreGroups) {
5524    VPValue *Res =
5525        narrowInterleaveGroupOp(StoreGroup->getStoredValues()[0], NarrowedOps);
5526    auto *SI =
5527        cast<StoreInst>(StoreGroup->getInterleaveGroup()->getInsertPos());
5528    auto *S = new VPWidenStoreRecipe(
5529        *SI, StoreGroup->getAddr(), Res, nullptr, /*Consecutive=*/true,
5530        /*Reverse=*/false, {}, StoreGroup->getDebugLoc());
5531    S->insertBefore(StoreGroup);
5532    StoreGroup->eraseFromParent();
5533  }
5534
5535  // Adjust induction to reflect that the transformed plan only processes one
5536  // original iteration.
5537  auto *CanIV = VectorLoop->getCanonicalIV();
5538  auto *Inc = cast<VPInstruction>(CanIV->getBackedgeValue());
5539  VPBasicBlock *VectorPH = Plan.getVectorPreheader();
5540  VPBuilder PHBuilder(VectorPH, VectorPH->begin());
5541
// The IV now steps by vscale * UF (scalable) or UF (fixed) instead of VF*UF.
5542  VPValue *UF = &Plan.getUF();
5543  VPValue *Step;
5544  if (VFToOptimize->isScalable()) {
// NOTE(review): dropped line 5546 — the continuation of the
// createElementCount call (its type/element-count arguments).
5545    VPValue *VScale = PHBuilder.createElementCount(
5547    Step = PHBuilder.createOverflowingOp(Instruction::Mul, {VScale, UF},
5548                                         {true, false});
5549    Plan.getVF().replaceAllUsesWith(VScale);
5550  } else {
5551    Step = UF;
// NOTE(review): dropped line 5552 — presumably replacing all uses of
// Plan.getVF() with the constant 1 built below; confirm upstream.
5553        Plan.getConstantInt(CanIV->getScalarType(), 1));
5554  }
5555  // Materialize vector trip count with the narrowed step.
5556  materializeVectorTripCount(Plan, VectorPH, /*TailByMasking=*/false,
5557                             RequiresScalarEpilogue, Step);
5558
5559  Inc->setOperand(1, Step);
5560  Plan.getVFxUF().replaceAllUsesWith(Step);
5561
5562  removeDeadRecipes(Plan);
// NOTE(review): dropped line 5564 — the predicate of this none_of assert.
5563  assert(none_of(*VectorLoop->getEntryBasicBlock(),
5565         "All VPVectorPointerRecipes should have been removed");
5566  return NewPlan;
5567}
5568
5569/// Add branch weight metadata, if the \p Plan's middle block is terminated by a
5570/// BranchOnCond recipe.
/// The weights model the assumption that the scalar epilogue is taken for
/// TripCount % VectorStep of the VectorStep possible remainders, i.e. a
/// 1 : (VectorStep - 1) split.
/// NOTE(review): dropped lines 5571 (function name line) and 5575 (the
/// dyn_cast initializing MiddleTerm); restore from upstream before modifying.
5572    VPlan &Plan, ElementCount VF, std::optional<unsigned> VScaleForTuning) {
5573  VPBasicBlock *MiddleVPBB = Plan.getMiddleBlock();
5574  auto *MiddleTerm =
5576  // Only add branch metadata if there is a (conditional) terminator.
5577  if (!MiddleTerm)
5578    return;
5579
5580  assert(MiddleTerm->getOpcode() == VPInstruction::BranchOnCond &&
5581         "must have a BranchOnCond");
5582  // Assume that `TripCount % VectorStep ` is equally distributed.
5583  unsigned VectorStep = Plan.getConcreteUF() * VF.getKnownMinValue();
// For scalable VFs, scale the per-iteration step by the tuning vscale.
5584  if (VF.isScalable() && VScaleForTuning.has_value())
5585    VectorStep *= *VScaleForTuning;
5586  assert(VectorStep > 0 && "trip count should not be zero");
5587  MDBuilder MDB(Plan.getContext());
5588  MDNode *BranchWeights =
5589      MDB.createBranchWeights({1, VectorStep - 1}, /*IsExpected=*/false);
5590  MiddleTerm->setMetadata(LLVMContext::MD_prof, BranchWeights);
5591}
5592
// Second phase of first-order-recurrence vectorization: create extracts in the
// middle block for recurrence users outside the loop (see the worked example
// in the comment below).
// NOTE(review): dropped line 5593 — the line carrying the function's name and
// first parameters (a VPlan is clearly among them); also dropped are lines
// 5681/5683 (the loop header and guard in the middle-block scan) and 5691
// (the VF-range condition). Restore from upstream before modifying code.
5594                                                        VFRange &Range) {
5595  VPRegionBlock *VectorRegion = Plan.getVectorLoopRegion();
5596  auto *MiddleVPBB = Plan.getMiddleBlock();
5597  VPBuilder MiddleBuilder(MiddleVPBB, MiddleVPBB->getFirstNonPhi());
5598
5599  auto IsScalableOne = [](ElementCount VF) -> bool {
5600    return VF == ElementCount::getScalable(1);
5601  };
5602
5603  for (auto &HeaderPhi : VectorRegion->getEntryBasicBlock()->phis()) {
5604    auto *FOR = dyn_cast<VPFirstOrderRecurrencePHIRecipe>(&HeaderPhi);
5605    if (!FOR)
5606      continue;
5607
5608    assert(VectorRegion->getSingleSuccessor() == Plan.getMiddleBlock() &&
5609           "Cannot handle loops with uncountable early exits");
5610
5611    // This is the second phase of vectorizing first-order recurrences, creating
5612    // extract for users outside the loop. An overview of the transformation is
5613    // described below. Suppose we have the following loop with some use after
5614    // the loop of the last a[i-1],
5615    //
5616    //   for (int i = 0; i < n; ++i) {
5617    //     t = a[i - 1];
5618    //     b[i] = a[i] - t;
5619    //   }
5620    //   use t;
5621    //
5622    // There is a first-order recurrence on "a". For this loop, the shorthand
5623    // scalar IR looks like:
5624    //
5625    //   scalar.ph:
5626    //     s.init = a[-1]
5627    //     br scalar.body
5628    //
5629    //   scalar.body:
5630    //     i = phi [0, scalar.ph], [i+1, scalar.body]
5631    //     s1 = phi [s.init, scalar.ph], [s2, scalar.body]
5632    //     s2 = a[i]
5633    //     b[i] = s2 - s1
5634    //     br cond, scalar.body, exit.block
5635    //
5636    //   exit.block:
5637    //     use = lcssa.phi [s1, scalar.body]
5638    //
5639    // In this example, s1 is a recurrence because it's value depends on the
5640    // previous iteration. In the first phase of vectorization, we created a
5641    // VPFirstOrderRecurrencePHIRecipe v1 for s1. Now we create the extracts
5642    // for users in the scalar preheader and exit block.
5643    //
5644    //   vector.ph:
5645    //     v_init = vector(..., ..., ..., a[-1])
5646    //     br vector.body
5647    //
5648    //   vector.body
5649    //     i = phi [0, vector.ph], [i+4, vector.body]
5650    //     v1 = phi [v_init, vector.ph], [v2, vector.body]
5651    //     v2 = a[i, i+1, i+2, i+3]
5652    //     b[i] = v2 - v1
5653    //     // Next, third phase will introduce v1' = splice(v1(3), v2(0, 1, 2))
5654    //     b[i, i+1, i+2, i+3] = v2 - v1
5655    //     br cond, vector.body, middle.block
5656    //
5657    //   middle.block:
5658    //     vector.recur.extract.for.phi = v2(2)
5659    //     vector.recur.extract = v2(3)
5660    //     br cond, scalar.ph, exit.block
5661    //
5662    //   scalar.ph:
5663    //     scalar.recur.init = phi [vector.recur.extract, middle.block],
5664    //                             [s.init, otherwise]
5665    //     br scalar.body
5666    //
5667    //   scalar.body:
5668    //     i = phi [0, scalar.ph], [i+1, scalar.body]
5669    //     s1 = phi [scalar.recur.init, scalar.ph], [s2, scalar.body]
5670    //     s2 = a[i]
5671    //     b[i] = s2 - s1
5672    //     br cond, scalar.body, exit.block
5673    //
5674    //   exit.block:
5675    //     lo = lcssa.phi [s1, scalar.body],
5676    //                    [vector.recur.extract.for.phi, middle.block]
5677    //
5678    // Now update VPIRInstructions modeling LCSSA phis in the exit block.
5679    // Extract the penultimate value of the recurrence and use it as operand for
5680    // the VPIRInstruction modeling the phi.
// NOTE(review): dropped lines 5681 and 5683 — the loop header iterating the
// middle block's non-phi recipes (binding R) and the guard whose failure
// triggers this 'continue'.
5682             make_range(MiddleVPBB->getFirstNonPhi(), MiddleVPBB->end()))) {
5684        continue;
5685
5686      // For VF vscale x 1, if vscale = 1, we are unable to extract the
5687      // penultimate value of the recurrence. Instead we rely on the existing
5688      // extract of the last element from the result of
5689      // VPInstruction::FirstOrderRecurrenceSplice.
5690      // TODO: Consider vscale_range info and UF.
// NOTE(review): dropped line 5691 — the condition (using IsScalableOne over
// Range) that guards this early return.
5692                                     Range))
5693        return;
5694      VPValue *PenultimateElement = MiddleBuilder.createNaryOp(
5695          VPInstruction::ExtractPenultimateElement, FOR->getBackedgeValue(), {},
5696          "vector.recur.extract.for.phi");
// Rewire every exit-block phi that used the recurrence value to the new
// penultimate-element extract.
5697      for (VPUser *U : to_vector(cast<VPInstruction>(&R)->users())) {
5698        auto *ExitPhi = dyn_cast<VPIRPhi>(U);
5699        if (!ExitPhi)
5700          continue;
5701        ExitPhi->replaceUsesOfWith(cast<VPInstruction>(&R), PenultimateElement);
5702      }
5703    }
5704  }
5705}
5706
// Transform conditional-select reductions over an induction (find-last-IV
// style) into min/max reductions, either via a sentinel value outside the IV's
// range or, failing that, via an additional boolean AnyOf reduction.
// NOTE(review): dropped lines 5707-5708 — the function name and its first
// parameters (a VPlan and a PredicatedScalarEvolution are clearly among
// them); also dropped: 5733 (PhiR guard), 5794/5796 (RdxResult lookup),
// 5806 (reduction createNaryOp line), 5837 (ComputeFindIVResult line).
// Restore from upstream before modifying code.
5709                                               Loop &L) {
5710  ScalarEvolution &SE = *PSE.getSE();
5711  VPRegionBlock *VectorLoopRegion = Plan.getVectorLoopRegion();
5712
5713  // Helper lambda to check if the IV range excludes the sentinel value.
5714  auto CheckSentinel = [&SE](const SCEV *IVSCEV, bool UseMax,
5715                             bool Signed) -> std::optional<APInt> {
5716    unsigned BW = IVSCEV->getType()->getScalarSizeInBits();
// NOTE(review): dropped lines 5719-5720 — the two arms of this conditional
// (presumably the min/max APInt values for BW bits).
5717    APInt Sentinel =
5718        UseMax
5721
5722    ConstantRange IVRange =
5723        Signed ? SE.getSignedRange(IVSCEV) : SE.getUnsignedRange(IVSCEV);
// The sentinel is only usable if the IV can never take its value.
5724    if (!IVRange.contains(Sentinel))
5725      return Sentinel;
5726    return std::nullopt;
5727  };
5728
5729  VPValue *HeaderMask = vputils::findHeaderMask(Plan);
5730  for (VPRecipeBase &Phi :
5731       make_early_inc_range(VectorLoopRegion->getEntryBasicBlock()->phis())) {
5732    auto *PhiR = dyn_cast<VPReductionPHIRecipe>(&Phi);
// NOTE(review): dropped line 5733 — the guard (null PhiR / recurrence-kind
// check) whose failure triggers this 'continue'.
5734            PhiR->getRecurrenceKind()))
5735      continue;
5736
5737    Type *PhiTy = VPTypeAnalysis(Plan).inferScalarType(PhiR);
5738    if (PhiTy->isPointerTy() || PhiTy->isFloatingPointTy())
5739      continue;
5740
5741    // If there's a header mask, the backedge select will not be the find-last
5742    // select.
5743    VPValue *BackedgeVal = PhiR->getBackedgeValue();
5744    VPValue *CondSelect = BackedgeVal;
5745    if (HeaderMask &&
5746        !match(BackedgeVal, m_Select(m_Specific(HeaderMask),
5747                                     m_VPValue(CondSelect), m_Specific(PhiR))))
5748      llvm_unreachable("expected header mask select");
5749
5750    // Get the IV from the conditional select of the reduction phi.
5751    // The conditional select should be a select between the phi and the IV.
5752    VPValue *Cond, *TrueVal, *FalseVal;
5753    if (!match(CondSelect, m_Select(m_VPValue(Cond), m_VPValue(TrueVal),
5754                                    m_VPValue(FalseVal))))
5755      continue;
5756
5757    // The non-phi operand of the select is the IV.
5758    assert(is_contained(CondSelect->getDefiningRecipe()->operands(), PhiR));
5759    VPValue *IV = TrueVal == PhiR ? FalseVal : TrueVal;
5760
5761    const SCEV *IVSCEV = vputils::getSCEVExprForVPValue(IV, PSE, &L);
5762    const SCEV *Step;
5763    if (!match(IVSCEV, m_scev_AffineAddRec(m_SCEV(), m_SCEV(Step))))
5764      continue;
5765
5766    // Determine direction from SCEV step.
5767    if (!SE.isKnownNonZero(Step))
5768      continue;
5769
5770    // Positive step means we need UMax/SMax to find the last IV value, and
5771    // UMin/SMin otherwise.
5772    bool UseMax = SE.isKnownPositive(Step);
// Prefer a signed sentinel; fall back to unsigned if the signed one is in
// the IV's range.
5773    bool UseSigned = true;
5774    std::optional<APInt> SentinelVal =
5775        CheckSentinel(IVSCEV, UseMax, /*IsSigned=*/true);
5776    if (!SentinelVal) {
5777      SentinelVal = CheckSentinel(IVSCEV, UseMax, /*IsSigned=*/false);
5778      UseSigned = false;
5779    }
5780
5781    // If no sentinel was found, fall back to a boolean AnyOf reduction to track
5782    // if the condition was ever true. Requires the IV to not wrap, otherwise we
5783    // cannot use min/max.
5784    if (!SentinelVal) {
5785      auto *AR = cast<SCEVAddRecExpr>(IVSCEV);
5786      if (AR->hasNoSignedWrap())
5787        UseSigned = true;
5788      else if (AR->hasNoUnsignedWrap())
5789        UseSigned = false;
5790      else
5791        continue;
5792    }
5793
// NOTE(review): dropped lines 5794 and 5796 — the statement initializing
// RdxResult from BackedgeVal (used below).
5795        BackedgeVal,
5797
5798    RecurKind MinMaxKind =
5799        UseMax ? (UseSigned ? RecurKind::SMax : RecurKind::UMax)
5800               : (UseSigned ? RecurKind::SMin : RecurKind::UMin);
5801    VPIRFlags Flags(MinMaxKind, /*IsOrdered=*/false, /*IsInLoop=*/false,
5802                    FastMathFlags());
5803    DebugLoc ExitDL = RdxResult->getDebugLoc();
5804    VPBuilder MiddleBuilder(RdxResult);
// NOTE(review): dropped line 5806 — the call producing ReducedIV (a
// reduction over the IV values with the min/max flags above).
5805    VPValue *ReducedIV =
5807                                    RdxResult->getOperand(0), Flags, ExitDL);
5808
5809    VPValue *NewRdxResult;
5810    VPValue *StartVPV = PhiR->getStartValue();
5811    if (SentinelVal) {
5812      // Sentinel-based approach: reduce IVs with min/max, compare against
5813      // sentinel to detect if condition was ever true, select accordingly.
5814      VPValue *Sentinel = Plan.getConstantInt(*SentinelVal);
5815      auto *Cmp = MiddleBuilder.createICmp(CmpInst::ICMP_NE, ReducedIV,
5816                                           Sentinel, ExitDL);
5817      NewRdxResult =
5818          MiddleBuilder.createSelect(Cmp, ReducedIV, StartVPV, ExitDL);
5819      StartVPV = Sentinel;
5820    } else {
5821      // Introduce a boolean AnyOf reduction to track if the condition was ever
5822      // true in the loop. Use it to select the initial start value, if it was
5823      // never true.
5824      auto *AnyOfPhi = new VPReductionPHIRecipe(
5825          /*Phi=*/nullptr, RecurKind::Or, *Plan.getFalse(), *Plan.getFalse(),
5826          RdxUnordered{1}, {}, /*HasUsesOutsideReductionChain=*/false);
5827      AnyOfPhi->insertAfter(PhiR);
5828
5829      VPBuilder LoopBuilder(BackedgeVal->getDefiningRecipe());
// Invert the condition if the select keeps the phi on its true arm.
5830      VPValue *AnyOfCond = Cond;
5831      if (TrueVal == PhiR)
5832        AnyOfCond = LoopBuilder.createNot(Cond);
5833      VPValue *OrVal = LoopBuilder.createOr(AnyOfPhi, AnyOfCond);
5834      AnyOfPhi->setOperand(1, OrVal);
5835
// NOTE(review): dropped line 5837 — the middle-block op computing the final
// result from {StartVPV, ReducedIV, OrVal}.
5836      NewRdxResult =
5838                                      {StartVPV, ReducedIV, OrVal}, {}, ExitDL);
5839
5840      // Initialize the IV reduction phi with the neutral element, not the
5841      // original start value, to ensure correct min/max reduction results.
5842      StartVPV = Plan.getOrAddLiveIn(
5843          getRecurrenceIdentity(MinMaxKind, IVSCEV->getType(), {}));
5844    }
5845    RdxResult->replaceAllUsesWith(NewRdxResult);
5846    RdxResult->eraseFromParent();
5847
// Replace the original reduction phi with a FindIV reduction phi keeping the
// same underlying scalar PHI and conditional select.
5848    auto *NewPhiR = new VPReductionPHIRecipe(
5849        cast<PHINode>(PhiR->getUnderlyingInstr()), RecurKind::FindIV, *StartVPV,
5850        *CondSelect, RdxUnordered{1}, {}, PhiR->hasUsesOutsideReductionChain());
5851    NewPhiR->insertBefore(PhiR);
5852    PhiR->replaceAllUsesWith(NewPhiR);
5853    PhiR->eraseFromParent();
5854  }
5855}
5856
5857namespace {
5858
5859/// Holds the binary operation used to compute the extended operand and the
5860/// casts that feed into it.
// Note: member order matters — this struct is built with aggregate
// initialization (e.g. ExtendedReductionOperand{UpdateR, {Cast, nullptr}}).
5861struct ExtendedReductionOperand {
5862  VPWidenRecipe *BinOp = nullptr;
5863  // Note: The second cast recipe may be null.
5864  std::array<VPWidenCastRecipe *, 2> CastRecipes = {};
5865};
5866
5867/// A chain of recipes that form a partial reduction. Matches either
5868/// reduction_bin_op (extend (A), accumulator), or
5869/// reduction_bin_op (bin_op (extend (A), (extend (B))), accumulator).
5870struct VPPartialReductionChain {
5871  /// The top-level binary operation that forms the reduction to a scalar
5872  /// after the loop body.
5873  VPWidenRecipe *ReductionBinOp;
5874  /// The user of the extends that is then reduced.
5875  ExtendedReductionOperand ExtendedOp;
  /// VF scale factor for the partial reduction: applied to the reduction PHI
  /// via setVFScaleFactor in transformToPartialReduction.
5876  unsigned ScaleFactor;
5877  /// The recurrence kind for the entire partial reduction chain.
5878  /// This allows distinguishing between Sub and AddWithSub recurrences,
5879  /// when the ReductionBinOp is a Instruction::Sub.
5880  RecurKind RK;
5881};
5882
/// Rewrite the extend structure feeding a partial reduction into forms the
/// target can fold (see the two patterns documented inline). Returns the
/// (possibly replaced) recipe to use as the partial reduction's input.
/// NOTE(review): dropped lines 5894 (the second disjunct of the first guard,
/// presumably a TTI canConstantBeExtended-style query) and 5909 (the match()
/// head of the second pattern); restore from upstream before modifying code.
5883static VPSingleDefRecipe *
5884optimizeExtendsForPartialReduction(VPSingleDefRecipe *BinOp,
5885                                   VPTypeAnalysis &TypeInfo) {
5886  // reduce.add(mul(ext(A), C))
5887  //   -> reduce.add(mul(ext(A), ext(trunc(C))))
5888  const APInt *Const;
5889  if (match(BinOp, m_Mul(m_ZExtOrSExt(m_VPValue()), m_APInt(Const)))) {
5890    auto *ExtA = cast<VPWidenCastRecipe>(BinOp->getOperand(0));
5891    Instruction::CastOps ExtOpc = ExtA->getOpcode();
5892    Type *NarrowTy = TypeInfo.inferScalarType(ExtA->getOperand(0));
5893    if (!BinOp->hasOneUse() ||
5895            Const, NarrowTy, TTI::getPartialReductionExtendKind(ExtOpc)))
5896      return BinOp;
5897
// The constant is losslessly representable in NarrowTy: materialize it as
// ext(trunc(C)) so both mul operands share the same extend kind.
5898    VPBuilder Builder(BinOp);
5899    auto *Trunc = Builder.createWidenCast(Instruction::CastOps::Trunc,
5900                                          BinOp->getOperand(1), NarrowTy);
5901    Type *WideTy = TypeInfo.inferScalarType(ExtA);
5902    BinOp->setOperand(1, Builder.createWidenCast(ExtOpc, Trunc, WideTy));
5903    return BinOp;
5904  }
5905
5906  // reduce.add(ext(mul(ext(A), ext(B))))
5907  //   -> reduce.add(mul(wider_ext(A), wider_ext(B)))
5908  // TODO: Support this optimization for float types.
5910                             m_ZExtOrSExt(m_VPValue()))))) {
5911    auto *Ext = cast<VPWidenCastRecipe>(BinOp);
5912    auto *Mul = cast<VPWidenRecipe>(Ext->getOperand(0));
5913    auto *MulLHS = cast<VPWidenCastRecipe>(Mul->getOperand(0));
5914    auto *MulRHS = cast<VPWidenCastRecipe>(Mul->getOperand(1));
// Fold only when all extend kinds agree (or LHS and RHS are the same value)
// and the mul has no other users.
5915    if (!Mul->hasOneUse() ||
5916        (Ext->getOpcode() != MulLHS->getOpcode() && MulLHS != MulRHS) ||
5917        MulLHS->getOpcode() != MulRHS->getOpcode())
5918      return BinOp;
5919    VPBuilder Builder(Mul);
5920    Mul->setOperand(0, Builder.createWidenCast(MulLHS->getOpcode(),
5921                                               MulLHS->getOperand(0),
5922                                               Ext->getResultType()));
5923    Mul->setOperand(1, MulLHS == MulRHS
5924                           ? Mul->getOperand(0)
5925                           : Builder.createWidenCast(MulRHS->getOpcode(),
5926                                                     MulRHS->getOperand(0),
5927                                                     Ext->getResultType()));
5928    return Mul;
5929  }
5930
5931  return BinOp;
5932}
5933
5934// Helper to transform a partial reduction chain into a partial reduction
5935// recipe. Assumes profitability has been checked.
// NOTE(review): dropped lines 5946, 5974, 5985, 5995, 6032 in this export;
// restore from upstream before modifying code.
5936static void transformToPartialReduction(const VPPartialReductionChain &Chain,
5937                                        VPTypeAnalysis &TypeInfo, VPlan &Plan,
5938                                        VPReductionPHIRecipe *RdxPhi) {
5939  VPWidenRecipe *WidenRecipe = Chain.ReductionBinOp;
5940  assert(WidenRecipe->getNumOperands() == 2 && "Expected binary operation");
5941
5942  VPValue *BinOpVal = WidenRecipe->getOperand(0);
5943  VPValue *Accumulator = WidenRecipe->getOperand(1);
5944
5945  // Swap if needed to ensure Accumulator is the PHI or partial reduction.
// NOTE(review): dropped line 5946 — the first part of this swap condition.
5947      isa<VPExpressionRecipe>(BinOpVal))
5948    std::swap(BinOpVal, Accumulator);
5949  auto *BinOp = cast<VPSingleDefRecipe>(BinOpVal->getDefiningRecipe());
5950
5951  // Sub-reductions can be implemented in two ways:
5952  // (1) negate the operand in the vector loop (the default way).
5953  // (2) subtract the reduced value from the init value in the middle block.
5954  // Both ways keep the reduction itself as an 'add' reduction.
5955  //
5956  // The ISD nodes for partial reductions don't support folding the
5957  // sub/negation into its operands because the following is not a valid
5958  // transformation:
5959  //    sub(0, mul(ext(a), ext(b)))
5960  // -> mul(ext(a), ext(sub(0, b)))
5961  //
5962  // It's therefore better to choose option (2) such that the partial
5963  // reduction is always positive (starting at '0') and to do a final
5964  // subtract in the middle block.
5965  if (WidenRecipe->getOpcode() == Instruction::Sub &&
5966      Chain.RK != RecurKind::Sub) {
// Option (1): negate the operand in-loop for mixed add/sub chains.
5967    VPBuilder Builder(WidenRecipe);
5968    Type *ElemTy = TypeInfo.inferScalarType(BinOp);
5969    auto *Zero = Plan.getZero(ElemTy);
5970    VPIRFlags Flags = WidenRecipe->getUnderlyingInstr()
5971                          ? VPIRFlags(*WidenRecipe->getUnderlyingInstr())
5972                          : VPIRFlags();
// NOTE(review): dropped line 5974 — the trailing constructor argument(s) of
// the negate recipe (likely metadata/debug-loc).
5973    auto *NegRecipe = new VPWidenRecipe(Instruction::Sub, {Zero, BinOp}, Flags,
5975    Builder.insert(NegRecipe);
5976    BinOp = NegRecipe;
5977  }
5978
5979  // FIXME: Do these transforms before invoking the cost-model.
5980  BinOp = optimizeExtendsForPartialReduction(BinOp, TypeInfo);
5981
5982  // Check if WidenRecipe is the final result of the reduction. If so look
5983  // through selects for predicated reductions.
5984  VPValue *Cond = nullptr;
// NOTE(review): dropped line 5985 — the statement initializing ExitValue
// from the match below.
5986      WidenRecipe,
5987      m_Select(m_VPValue(Cond), m_Specific(WidenRecipe), m_Specific(RdxPhi))));
5988  bool IsLastInChain = RdxPhi->getBackedgeValue() == WidenRecipe ||
5989                       RdxPhi->getBackedgeValue() == ExitValue;
5990  assert((!ExitValue || IsLastInChain) &&
5991         "if we found ExitValue, it must match RdxPhi's backedge value");
5992
5993  Type *PhiType = TypeInfo.inferScalarType(RdxPhi);
// NOTE(review): dropped line 5995 — the expression choosing RdxKind from
// PhiType (integer vs floating point).
5994  RecurKind RdxKind =
5996  auto *PartialRed = new VPReductionRecipe(
5997      RdxKind,
5998      RdxKind == RecurKind::FAdd ? WidenRecipe->getFastMathFlags()
5999                                 : FastMathFlags(),
6000      WidenRecipe->getUnderlyingInstr(), Accumulator, BinOp, Cond,
6001      RdxUnordered{/*VFScaleFactor=*/Chain.ScaleFactor});
6002  PartialRed->insertBefore(WidenRecipe);
6003
6004  if (Cond)
6005    ExitValue->replaceAllUsesWith(PartialRed);
6006  WidenRecipe->replaceAllUsesWith(PartialRed);
6007
6008  // We only need to update the PHI node once, which is when we find the
6009  // last reduction in the chain.
6010  if (!IsLastInChain)
6011    return;
6012
6013  // Scale the PHI and ReductionStartVector by the VFScaleFactor
6014  assert(RdxPhi->getVFScaleFactor() == 1 && "scale factor must not be set");
6015  RdxPhi->setVFScaleFactor(Chain.ScaleFactor);
6016
6017  auto *StartInst = cast<VPInstruction>(RdxPhi->getStartValue());
6018  assert(StartInst->getOpcode() == VPInstruction::ReductionStartVector);
6019  auto *NewScaleFactor = Plan.getConstantInt(32, Chain.ScaleFactor);
6020  StartInst->setOperand(2, NewScaleFactor);
6021
6022  // If this is the last value in a sub-reduction chain, then update the PHI
6023  // node to start at `0` and update the reduction-result to subtract from
6024  // the PHI's start value.
6025  if (Chain.RK != RecurKind::Sub)
6026    return;
6027
6028  VPValue *OldStartValue = StartInst->getOperand(0);
6029  StartInst->setOperand(0, StartInst->getOperand(1));
6030
6031  // Replace reduction_result by 'sub (startval, reductionresult)'.
// NOTE(review): dropped line 6032 — the statement initializing RdxResult
// (likely via vputils::findComputeReductionResult(RdxPhi)).
6033  assert(RdxResult && "Could not find reduction result");
6034
6035  VPBuilder Builder = VPBuilder::getToInsertAfter(RdxResult);
6036  constexpr unsigned SubOpc = Instruction::BinaryOps::Sub;
6037  VPInstruction *NewResult = Builder.createNaryOp(
6038      SubOpc, {OldStartValue, RdxResult}, VPIRFlags::getDefaultFlags(SubOpc),
6039      RdxPhi->getDebugLoc());
// Rewrite all users of the old result except the new subtract itself.
6040  RdxResult->replaceUsesWithIf(
6041      NewResult,
6042      [&NewResult](VPUser &U, unsigned Idx) { return &U != NewResult; });
6043}
6044
6045/// Check if a partial reduction chain is is supported by the target (i.e. does
6046/// not have an invalid cost) for the given VF range. Clamps the range and
6047/// returns true if profitable for any VF.
/// NOTE(review): dropped lines 6056, 6065, 6089 and 6092 in this export
/// (the ExtKind initialization, the ExtKindA/ExtKindB declarations, the
/// clampRange-style call head, and the TTI cost query name); restore from
/// upstream before modifying code.
6048static bool isValidPartialReduction(const VPPartialReductionChain &Chain,
6049                                    Type *PhiType, VPCostContext &CostCtx,
6050                                    VFRange &Range) {
// Extract (source type, extend kind) for a cast recipe; PR_None if absent.
6051  auto GetExtInfo = [&CostCtx](VPWidenCastRecipe *Ext)
6052      -> std::pair<Type *, TargetTransformInfo::PartialReductionExtendKind> {
6053    if (!Ext)
6054      return {nullptr, TargetTransformInfo::PR_None};
6055    Type *ExtOpType = CostCtx.Types.inferScalarType(Ext->getOperand(0));
6057        static_cast<Instruction::CastOps>(Ext->getOpcode()));
6058    return {ExtOpType, ExtKind};
6059  };
6060  ExtendedReductionOperand ExtendedOp = Chain.ExtendedOp;
6061  VPWidenCastRecipe *ExtendA = ExtendedOp.CastRecipes[0];
6062  VPWidenCastRecipe *ExtendB = ExtendedOp.CastRecipes[1];
6063
6064  Type *ExtOpTypeA, *ExtOpTypeB;
6066  std::tie(ExtOpTypeA, ExtKindA) = GetExtInfo(ExtendA);
6067  std::tie(ExtOpTypeB, ExtKindB) = GetExtInfo(ExtendB);
6068
6069  // If ExtendB is nullptr but there's a separate BinOp, the second operand
6070  // was a constant that can use the same extend kind as the first.
6071  if (!ExtendB && ExtendedOp.BinOp &&
6072      ExtendedOp.BinOp != Chain.ReductionBinOp) {
6073    const APInt *Const = nullptr;
6074    for (VPValue *Op : ExtendedOp.BinOp->operands()) {
6075      if (match(Op, m_APInt(Const)))
6076        break;
6077    }
6078    if (!Const || !canConstantBeExtended(Const, ExtOpTypeA, ExtKindA))
6079      return false;
6080    ExtOpTypeB = ExtOpTypeA;
6081    ExtKindB = ExtKindA;
6082  }
6083
// Pass the inner binary opcode to the cost query only when it is distinct
// from the top-level reduction op.
6084  std::optional<unsigned> BinOpc;
6085  if (ExtendedOp.BinOp && ExtendedOp.BinOp != Chain.ReductionBinOp)
6086    BinOpc = ExtendedOp.BinOp->getOpcode();
6087
6088  VPWidenRecipe *WidenRecipe = Chain.ReductionBinOp;
6090      [&](ElementCount VF) {
6091        return CostCtx.TTI
6093                WidenRecipe->getOpcode(), ExtOpTypeA, ExtOpTypeB, PhiType, VF,
6094                ExtKindA, ExtKindB, BinOpc, CostCtx.CostKind,
6095                PhiType->isFloatingPointTy()
6096                    ? std::optional{WidenRecipe->getFastMathFlags()}
6097                    : std::nullopt)
6098            .isValid();
6099      },
6100      Range);
6101}
6102
/// Map \p Cast's opcode to the corresponding
/// TargetTransformInfo::PartialReductionExtendKind.
/// NOTE(review): dropped lines 6103 (the return-type line) and 6105 (the
/// function body) in this export; restore from upstream before modifying.
6104getPartialReductionExtendKind(VPWidenCastRecipe *Cast) {
6106}
6107
6108/// Checks if \p Op (which is an operand of \p UpdateR) is an extended reduction
6109/// operand. This is an operand where the source of the value (e.g. a load) has
6110/// been extended (sext, zext, or fpext) before it is used in the reduction.
6111///
6112/// Possible forms matched by this function:
6113///   - UpdateR(PrevValue, ext(...))
6114///   - UpdateR(PrevValue, BinOp(ext(...), ext(...)))
6115///   - UpdateR(PrevValue, BinOp(ext(...), Constant))
6116///   - UpdateR(PrevValue, neg(BinOp(ext(...), ext(...))))
6117///   - UpdateR(PrevValue, neg(BinOp(ext(...), Constant)))
6118///   - UpdateR(PrevValue, ext(mul(ext(...), ext(...))))
6119///   - UpdateR(PrevValue, ext(mul(ext(...), Constant)))
6120///
6121/// Note: The second operand of UpdateR corresponds to \p Op in the examples.
/// NOTE(review): dropped lines 6128, 6157, 6168 and 6176 in this export (the
/// outer-cast guard, the BinOp initialization, the LHS-extend match, and the
/// RHS-cast guard); restore from upstream before modifying code.
6122static std::optional<ExtendedReductionOperand>
6123matchExtendedReductionOperand(VPWidenRecipe *UpdateR, VPValue *Op) {
6124  assert(is_contained(UpdateR->operands(), Op) &&
6125         "Op should be operand of UpdateR");
6126
6127  std::optional<TTI::PartialReductionExtendKind> OuterExtKind;
// NOTE(review): dropped line 6128 — the guard entering this branch
// (presumably testing that Op is a VPWidenCastRecipe).
6129    auto *CastRecipe = cast<VPWidenCastRecipe>(Op);
6130    VPValue *CastSource = CastRecipe->getOperand(0);
6131    if (match(CastSource, m_Mul(m_VPValue(), m_VPValue())) ||
6132        match(CastSource, m_FMul(m_VPValue(), m_VPValue()))) {
6133      // Match: ext(mul(...))
6134      // Record the outer extend kind and set `Op` to the mul. We can then match
6135      // this as a binary operation. Note: We can optimize out the outer extend
6136      // by widening the inner extends to match it. See
6137      // optimizeExtendsForPartialReduction.
6138      Op = CastSource;
6139      OuterExtKind = getPartialReductionExtendKind(CastRecipe);
6140    } else if (UpdateR->getOpcode() == Instruction::Add ||
6141               UpdateR->getOpcode() == Instruction::FAdd) {
6142      // Match: UpdateR(PrevValue, ext(...))
6143      // TODO: Remove the add/fadd restriction (we should be able to handle this
6144      // case for sub reductions too).
6145      return ExtendedReductionOperand{UpdateR, {CastRecipe, nullptr}};
6146    }
6147  }
6148
// The extended operand must feed only the reduction update.
6149  if (!Op->hasOneUse())
6150    return std::nullopt;
6151
6152  // Handle neg(...) pattern (aka sub(0, ...)).
6153  VPValue *NegatedOp = nullptr;
6154  if (match(Op, m_Sub(m_ZeroInt(), m_VPValue(NegatedOp))))
6155    Op = NegatedOp;
6156
// NOTE(review): dropped line 6157 — the statement initializing BinOp from Op
// (presumably dyn_cast<VPWidenRecipe>(Op->getDefiningRecipe()) or similar).
6158  if (!BinOp || !Instruction::isBinaryOp(BinOp->getOpcode()))
6159    return std::nullopt;
6160
6161  // The rest of the matching assumes `Op` is a (possibly extended/negated)
6162  // binary operation.
6163
6164  VPValue *LHS = BinOp->getOperand(0);
6165  VPValue *RHS = BinOp->getOperand(1);
6166
6167  // The LHS of the operation must always be an extend.
// NOTE(review): dropped line 6168 — the condition (an extend match on LHS)
// guarding this early return.
6169    return std::nullopt;
6170
6171  auto *LHSCast = cast<VPWidenCastRecipe>(LHS);
6172
6173  // The RHS of the operation can be an extend or a constant integer.
6174  // The constant will be validated in isValidPartialReduction.
6175  VPWidenCastRecipe *RHSCast = nullptr;
// NOTE(review): dropped line 6176 — the guard testing RHS for a cast recipe.
6177    RHSCast = cast<VPWidenCastRecipe>(RHS);
6178  else if (!isa<VPConstantInt>(RHS))
6179    return std::nullopt;
6180
6181  // The outer extend kind must match the inner extends for folding.
6182  for (VPWidenCastRecipe *Cast : {LHSCast, RHSCast})
6183    if (Cast && OuterExtKind &&
6184        getPartialReductionExtendKind(Cast) != OuterExtKind)
6185      return std::nullopt;
6186
6187  return ExtendedReductionOperand{BinOp, {LHSCast, RHSCast}};
6188}
6189
6190/// Examines each operation in the reduction chain corresponding to \p RedPhiR,
6191/// and determines if the target can use a cheaper operation with a wider
6192/// per-iteration input VF and narrower PHI VF. If successful, returns the chain
6193/// of operations in the reduction.
6194static std::optional<SmallVector<VPPartialReductionChain>>
6195getScaledReductions(VPReductionPHIRecipe *RedPhiR, VPCostContext &CostCtx,
6196 VFRange &Range) {
6197 // Get the backedge value from the reduction PHI and find the
6198 // ComputeReductionResult that uses it (directly or through a select for
6199 // predicated reductions).
6200 auto *RdxResult = vputils::findComputeReductionResult(RedPhiR);
6201 if (!RdxResult)
6202 return std::nullopt;
6203 VPValue *ExitValue = RdxResult->getOperand(0);
6204 match(ExitValue, m_Select(m_VPValue(), m_VPValue(ExitValue), m_VPValue()));
6205
6207 RecurKind RK = RedPhiR->getRecurrenceKind();
6208 Type *PhiType = CostCtx.Types.inferScalarType(RedPhiR);
6209 TypeSize PHISize = PhiType->getPrimitiveSizeInBits();
6210
6211 // Work backwards from the ExitValue examining each reduction operation.
6212 VPValue *CurrentValue = ExitValue;
6213 while (CurrentValue != RedPhiR) {
6214 auto *UpdateR = dyn_cast<VPWidenRecipe>(CurrentValue);
6215 if (!UpdateR || !Instruction::isBinaryOp(UpdateR->getOpcode()))
6216 return std::nullopt;
6217
6218 VPValue *Op = UpdateR->getOperand(1);
6219 VPValue *PrevValue = UpdateR->getOperand(0);
6220
6221 // Find the extended operand. The other operand (PrevValue) is the next link
6222 // in the reduction chain.
6223 std::optional<ExtendedReductionOperand> ExtendedOp =
6224 matchExtendedReductionOperand(UpdateR, Op);
6225 if (!ExtendedOp) {
6226 ExtendedOp = matchExtendedReductionOperand(UpdateR, PrevValue);
6227 if (!ExtendedOp)
6228 return std::nullopt;
6229 std::swap(Op, PrevValue);
6230 }
6231
6232 Type *ExtSrcType = CostCtx.Types.inferScalarType(
6233 ExtendedOp->CastRecipes[0]->getOperand(0));
6234 TypeSize ExtSrcSize = ExtSrcType->getPrimitiveSizeInBits();
6235 if (!PHISize.hasKnownScalarFactor(ExtSrcSize))
6236 return std::nullopt;
6237
6238 VPPartialReductionChain Chain(
6239 {UpdateR, *ExtendedOp,
6240 static_cast<unsigned>(PHISize.getKnownScalarFactor(ExtSrcSize)), RK});
6241 if (!isValidPartialReduction(Chain, PhiType, CostCtx, Range))
6242 return std::nullopt;
6243
6244 Chains.push_back(Chain);
6245 CurrentValue = PrevValue;
6246 }
6247
6248 // The chains were collected by traversing backwards from the exit value.
6249 // Reverse the chains so they are in program order.
6250 std::reverse(Chains.begin(), Chains.end());
6251 return Chains;
6252}
6253} // namespace
6254
6256 VPCostContext &CostCtx,
6257 VFRange &Range) {
6258 // Find all possible valid partial reductions, grouping chains by their PHI.
6259 // This grouping allows invalidating the whole chain, if any link is not a
6260 // valid partial reduction.
6262 ChainsByPhi;
6263 VPBasicBlock *HeaderVPBB = Plan.getVectorLoopRegion()->getEntryBasicBlock();
6264 for (VPRecipeBase &R : HeaderVPBB->phis()) {
6265 auto *RedPhiR = dyn_cast<VPReductionPHIRecipe>(&R);
6266 if (!RedPhiR)
6267 continue;
6268
6269 if (auto Chains = getScaledReductions(RedPhiR, CostCtx, Range))
6270 ChainsByPhi.try_emplace(RedPhiR, std::move(*Chains));
6271 }
6272
6273 if (ChainsByPhi.empty())
6274 return;
6275
6276 // Build set of partial reduction operations for extend user validation and
6277 // a map of reduction bin ops to their scale factors for scale validation.
6278 SmallPtrSet<VPRecipeBase *, 4> PartialReductionOps;
6279 DenseMap<VPSingleDefRecipe *, unsigned> ScaledReductionMap;
6280 for (const auto &[_, Chains] : ChainsByPhi)
6281 for (const VPPartialReductionChain &Chain : Chains) {
6282 PartialReductionOps.insert(Chain.ExtendedOp.BinOp);
6283 ScaledReductionMap[Chain.ReductionBinOp] = Chain.ScaleFactor;
6284 }
6285
6286 // A partial reduction is invalid if any of its extends are used by
6287 // something that isn't another partial reduction. This is because the
6288 // extends are intended to be lowered along with the reduction itself.
6289 auto ExtendUsersValid = [&](VPWidenCastRecipe *Ext) {
6290 return !Ext || all_of(Ext->users(), [&](VPUser *U) {
6291 return PartialReductionOps.contains(cast<VPRecipeBase>(U));
6292 });
6293 };
6294
6295 // Validate chains: check that extends are only used by partial reductions,
6296 // and that reduction bin ops are only used by other partial reductions with
6297 // matching scale factors, are outside the loop region or the select
6298 // introduced by tail-folding. Otherwise we would create users of scaled
6299 // reductions where the types of the other operands don't match.
6300 for (auto &[RedPhiR, Chains] : ChainsByPhi) {
6301 for (const VPPartialReductionChain &Chain : Chains) {
6302 if (!all_of(Chain.ExtendedOp.CastRecipes, ExtendUsersValid)) {
6303 Chains.clear();
6304 break;
6305 }
6306 auto UseIsValid = [&, RedPhiR = RedPhiR](VPUser *U) {
6307 if (auto *PhiR = dyn_cast<VPReductionPHIRecipe>(U))
6308 return PhiR == RedPhiR;
6309 auto *R = cast<VPSingleDefRecipe>(U);
6310 return Chain.ScaleFactor == ScaledReductionMap.lookup_or(R, 0) ||
6312 m_Specific(Chain.ReductionBinOp))) ||
6313 match(R, m_Select(m_VPValue(), m_Specific(Chain.ReductionBinOp),
6314 m_Specific(RedPhiR)));
6315 };
6316 if (!all_of(Chain.ReductionBinOp->users(), UseIsValid)) {
6317 Chains.clear();
6318 break;
6319 }
6320
6321 // Check if the compute-reduction-result is used by a sunk store.
6322 // TODO: Also form partial reductions in those cases.
6323 if (auto *RdxResult = vputils::findComputeReductionResult(RedPhiR)) {
6324 if (any_of(RdxResult->users(), [](VPUser *U) {
6325 auto *RepR = dyn_cast<VPReplicateRecipe>(U);
6326 return RepR && isa<StoreInst>(RepR->getUnderlyingInstr());
6327 })) {
6328 Chains.clear();
6329 break;
6330 }
6331 }
6332 }
6333 }
6334
6335 for (auto &[Phi, Chains] : ChainsByPhi)
6336 for (const VPPartialReductionChain &Chain : Chains)
6337 transformToPartialReduction(Chain, CostCtx.Types, Plan, Phi);
6338}
assert(UImm &&(UImm !=~static_cast< T >(0)) &&"Invalid immediate!")
AMDGPU Register Bank Select
This file implements a class to represent arbitrary precision integral constant values and operations...
ReachingDefInfo InstSet & ToRemove
MachineBasicBlock MachineBasicBlock::iterator DebugLoc DL
static bool isEqual(const Function &Caller, const Function &Callee)
static const Function * getParent(const Value *V)
#define X(NUM, ENUM, NAME)
Definition ELF.h:849
static GCRegistry::Add< ErlangGC > A("erlang", "erlang-compatible garbage collector")
static GCRegistry::Add< CoreCLRGC > E("coreclr", "CoreCLR-compatible GC")
static GCRegistry::Add< OcamlGC > B("ocaml", "ocaml 3.10-compatible GC")
static cl::opt< OutputCostKind > CostKind("cost-kind", cl::desc("Target cost kind"), cl::init(OutputCostKind::RecipThroughput), cl::values(clEnumValN(OutputCostKind::RecipThroughput, "throughput", "Reciprocal throughput"), clEnumValN(OutputCostKind::Latency, "latency", "Instruction latency"), clEnumValN(OutputCostKind::CodeSize, "code-size", "Code size"), clEnumValN(OutputCostKind::SizeAndLatency, "size-latency", "Code size and latency"), clEnumValN(OutputCostKind::All, "all", "Print all cost kinds")))
static bool isSentinel(const DWARFDebugNames::AttributeEncoding &AE)
@ Default
Hexagon Common GEP
#define _
iv Induction Variable Users
Definition IVUsers.cpp:48
iv users
Definition IVUsers.cpp:48
const AbstractManglingParser< Derived, Alloc >::OperatorInfo AbstractManglingParser< Derived, Alloc >::Ops[]
licm
Definition LICM.cpp:383
Legalize the Machine IR a function s Machine IR
Definition Legalizer.cpp:81
#define I(x, y, z)
Definition MD5.cpp:57
static DebugLoc getDebugLoc(MachineBasicBlock::instr_iterator FirstMI, MachineBasicBlock::instr_iterator LastMI)
Return the first DebugLoc that has line number information, given a range of instructions.
This file provides utility analysis objects describing memory locations.
This file contains the declarations for metadata subclasses.
MachineInstr unsigned OpIdx
ConstantRange Range(APInt(BitWidth, Low), APInt(BitWidth, High))
#define P(N)
This file builds on the ADT/GraphTraits.h file to build a generic graph post order iterator.
R600 Clause Merge
const SmallVectorImpl< MachineOperand > & Cond
This file contains some templates that are useful if you are working with the STL at all.
This is the interface for a metadata-based scoped no-alias analysis.
This file defines generic set operations that may be used on set's of different types,...
This file implements a set that has insertion order iteration characteristics.
This file defines the SmallPtrSet class.
static TableGen::Emitter::Opt Y("gen-skeleton-entry", EmitSkeleton, "Generate example skeleton entry")
static SymbolRef::Type getType(const Symbol *Sym)
Definition TapiFile.cpp:39
This file implements the TypeSwitch template, which mimics a switch() statement whose cases are type ...
This file implements dominator tree analysis for a single level of a VPlan's H-CFG.
This file contains the declarations of different VPlan-related auxiliary helpers.
static SmallVector< SmallVector< VPReplicateRecipe *, 4 > > collectComplementaryPredicatedMemOps(VPlan &Plan, PredicatedScalarEvolution &PSE, const Loop *L)
static void removeCommonBlendMask(VPBlendRecipe *Blend)
Try to see if all of Blend's masks share a common value logically and'ed and remove it from the masks...
static void tryToCreateAbstractReductionRecipe(VPReductionRecipe *Red, VPCostContext &Ctx, VFRange &Range)
This function tries to create abstract recipes from the reduction recipe for following optimizations ...
static VPReplicateRecipe * findRecipeWithMinAlign(ArrayRef< VPReplicateRecipe * > Group)
static bool sinkScalarOperands(VPlan &Plan)
static bool cannotHoistOrSinkRecipe(const VPRecipeBase &R)
Return true if we do not know how to (mechanically) hoist or sink R out of a loop region.
static bool simplifyBranchConditionForVFAndUF(VPlan &Plan, ElementCount BestVF, unsigned BestUF, PredicatedScalarEvolution &PSE)
Try to simplify the branch condition of Plan.
static void simplifyRecipe(VPSingleDefRecipe *Def, VPTypeAnalysis &TypeInfo)
Try to simplify VPSingleDefRecipe Def.
static void removeRedundantInductionCasts(VPlan &Plan)
Remove redundant casts of inductions.
static bool isConditionTrueViaVFAndUF(VPValue *Cond, VPlan &Plan, ElementCount BestVF, unsigned BestUF, PredicatedScalarEvolution &PSE)
Return true if Cond is known to be true for given BestVF and BestUF.
static bool tryToReplaceALMWithWideALM(VPlan &Plan, ElementCount VF, unsigned UF)
Try to replace multiple active lane masks used for control flow with a single, wide active lane mask ...
static std::optional< std::pair< bool, unsigned > > getOpcodeOrIntrinsicID(const VPSingleDefRecipe *R)
Get any instruction opcode or intrinsic ID data embedded in recipe R.
static VPExpressionRecipe * tryToMatchAndCreateExtendedReduction(VPReductionRecipe *Red, VPCostContext &Ctx, VFRange &Range)
This function tries convert extended in-loop reductions to VPExpressionRecipe and clamp the Range if ...
static RemoveMask_match< Op0_t, Op1_t > m_RemoveMask(const Op0_t &In, Op1_t &Out)
Match a specific mask In, or a combination of it (logical-and In, Out).
static VPIRMetadata getCommonMetadata(ArrayRef< VPReplicateRecipe * > Recipes)
static VPValue * getPredicatedMask(VPRegionBlock *R)
If R is a region with a VPBranchOnMaskRecipe in the entry block, return the mask.
static bool sinkRecurrenceUsersAfterPrevious(VPFirstOrderRecurrencePHIRecipe *FOR, VPRecipeBase *Previous, VPDominatorTree &VPDT)
Sink users of FOR after the recipe defining the previous value Previous of the recurrence.
static bool mergeReplicateRegionsIntoSuccessors(VPlan &Plan)
static VPScalarIVStepsRecipe * createScalarIVSteps(VPlan &Plan, InductionDescriptor::InductionKind Kind, Instruction::BinaryOps InductionOpcode, FPMathOperator *FPBinOp, Instruction *TruncI, VPIRValue *StartV, VPValue *Step, DebugLoc DL, VPBuilder &Builder)
static VPWidenInductionRecipe * getOptimizableIVOf(VPValue *VPV, PredicatedScalarEvolution &PSE)
Check if VPV is an untruncated wide induction, either before or after the increment.
static void fixupVFUsersForEVL(VPlan &Plan, VPValue &EVL)
After replacing the canonical IV with a EVL-based IV, fixup recipes that use VF to use the EVL instea...
static bool canNarrowLoad(VPSingleDefRecipe *WideMember0, unsigned OpIdx, VPValue *OpV, unsigned Idx, bool IsScalable)
Returns true if V is VPWidenLoadRecipe or VPInterleaveRecipe that can be converted to a narrower reci...
static void expandVPWidenPointerInduction(VPWidenPointerInductionRecipe *R, VPTypeAnalysis &TypeInfo)
Expand a VPWidenPointerInductionRecipe into executable recipes, for the initial value,...
static std::optional< ElementCount > isConsecutiveInterleaveGroup(VPInterleaveRecipe *InterleaveR, ArrayRef< ElementCount > VFs, VPTypeAnalysis &TypeInfo, const TargetTransformInfo &TTI)
Returns VF from VFs if IR is a full interleave group with factor and number of members both equal to ...
static bool isDeadRecipe(VPRecipeBase &R)
Returns true if R is dead and can be removed.
static void legalizeAndOptimizeInductions(VPlan &Plan)
Legalize VPWidenPointerInductionRecipe, by replacing it with a PtrAdd (IndStart, ScalarIVSteps (0,...
static void addReplicateRegions(VPlan &Plan)
static SmallVector< SmallVector< VPReplicateRecipe *, 4 > > collectGroupedReplicateMemOps(VPlan &Plan, PredicatedScalarEvolution &PSE, const Loop *L, function_ref< bool(VPReplicateRecipe *)> FilterFn)
Collect either replicated Loads or Stores grouped by their address SCEV, in a deep-traversal of the v...
static VPIRValue * tryToFoldLiveIns(VPSingleDefRecipe &R, ArrayRef< VPValue * > Operands, const DataLayout &DL, VPTypeAnalysis &TypeInfo)
Try to fold R using InstSimplifyFolder.
static VPValue * tryToComputeEndValueForInduction(VPWidenInductionRecipe *WideIV, VPBuilder &VectorPHBuilder, VPTypeAnalysis &TypeInfo, VPValue *VectorTC)
Compute the end value for WideIV, unless it is truncated.
static void removeRedundantExpandSCEVRecipes(VPlan &Plan)
Remove redundant EpxandSCEVRecipes in Plan's entry block by replacing them with already existing reci...
static bool hoistPreviousBeforeFORUsers(VPFirstOrderRecurrencePHIRecipe *FOR, VPRecipeBase *Previous, VPDominatorTree &VPDT)
Try to hoist Previous and its operands before all users of FOR.
static VPValue * scalarizeVPWidenPointerInduction(VPWidenPointerInductionRecipe *PtrIV, VPlan &Plan, VPBuilder &Builder)
Scalarize a VPWidenPointerInductionRecipe by replacing it with a PtrAdd (IndStart,...
static SmallVector< VPUser * > collectUsersRecursively(VPValue *V)
static VPValue * optimizeEarlyExitInductionUser(VPlan &Plan, VPTypeAnalysis &TypeInfo, VPBlockBase *PredVPBB, VPValue *Op, PredicatedScalarEvolution &PSE)
Attempts to optimize the induction variable exit values for users in the early exit block.
static void recursivelyDeleteDeadRecipes(VPValue *V)
static void reassociateHeaderMask(VPlan &Plan)
Reassociate (headermask && x) && y -> headermask && (x && y) to allow the header mask to be simplifie...
static bool canSinkStoreWithNoAliasCheck(ArrayRef< VPReplicateRecipe * > StoresToSink, PredicatedScalarEvolution &PSE, const Loop &L, VPTypeAnalysis &TypeInfo)
static VPActiveLaneMaskPHIRecipe * addVPLaneMaskPhiAndUpdateExitBranch(VPlan &Plan)
static VPRegionBlock * createReplicateRegion(VPReplicateRecipe *PredRecipe, VPlan &Plan)
static VPBasicBlock * getPredicatedThenBlock(VPRegionBlock *R)
If R is a triangle region, return the 'then' block of the triangle.
static VPValue * narrowInterleaveGroupOp(VPValue *V, SmallPtrSetImpl< VPValue * > &NarrowedOps)
static bool canHoistOrSinkWithNoAliasCheck(const MemoryLocation &MemLoc, VPBasicBlock *FirstBB, VPBasicBlock *LastBB, std::optional< SinkStoreInfo > SinkInfo={})
Check if a memory operation doesn't alias with memory operations using scoped noalias metadata,...
static void simplifyBlends(VPlan &Plan)
Normalize and simplify VPBlendRecipes.
static VPRecipeBase * optimizeMaskToEVL(VPValue *HeaderMask, VPRecipeBase &CurRecipe, VPTypeAnalysis &TypeInfo, VPValue &EVL)
Try to optimize a CurRecipe masked by HeaderMask to a corresponding EVL-based recipe without the head...
static bool isAlreadyNarrow(VPValue *VPV)
Returns true if VPValue is a narrow VPValue.
static bool canNarrowOps(ArrayRef< VPValue * > Ops, bool IsScalable)
static bool optimizeVectorInductionWidthForTCAndVFUF(VPlan &Plan, ElementCount BestVF, unsigned BestUF)
Optimize the width of vector induction variables in Plan based on a known constant Trip Count,...
static VPExpressionRecipe * tryToMatchAndCreateMulAccumulateReduction(VPReductionRecipe *Red, VPCostContext &Ctx, VFRange &Range)
This function tries convert extended in-loop reductions to VPExpressionRecipe and clamp the Range if ...
static void expandVPWidenIntOrFpInduction(VPWidenIntOrFpInductionRecipe *WidenIVR, VPTypeAnalysis &TypeInfo)
Expand a VPWidenIntOrFpInduction into executable recipes, for the initial value, phi and backedge val...
static void removeRedundantCanonicalIVs(VPlan &Plan)
Try to replace VPWidenCanonicalIVRecipes with a widened canonical IV recipe, if it exists.
static void narrowToSingleScalarRecipes(VPlan &Plan)
static VPValue * optimizeLatchExitInductionUser(VPlan &Plan, VPTypeAnalysis &TypeInfo, VPBlockBase *PredVPBB, VPValue *Op, DenseMap< VPValue *, VPValue * > &EndValues, PredicatedScalarEvolution &PSE)
Attempts to optimize the induction variable exit values for users in the exit block coming from the l...
This file provides utility VPlan to VPlan transformations.
#define RUN_VPLAN_PASS(PASS,...)
This file declares the class VPlanVerifier, which contains utility functions to check the consistency...
This file contains the declarations of the Vectorization Plan base classes:
static const X86InstrFMA3Group Groups[]
Value * RHS
Value * LHS
BinaryOperator * Mul
static const uint32_t IV[8]
Definition blake3_impl.h:83
Helper for extra no-alias checks via known-safe recipe and SCEV.
SinkStoreInfo(const SmallPtrSetImpl< VPRecipeBase * > &ExcludeRecipes, VPReplicateRecipe &GroupLeader, PredicatedScalarEvolution &PSE, const Loop &L, VPTypeAnalysis &TypeInfo)
bool shouldSkip(VPRecipeBase &R) const
Return true if R should be skipped during alias checking, either because it's in the exclude set or b...
Class for arbitrary precision integers.
Definition APInt.h:78
LLVM_ABI APInt zext(unsigned width) const
Zero extend to a new width.
Definition APInt.cpp:1043
unsigned getActiveBits() const
Compute the number of active bits in the value.
Definition APInt.h:1527
static APInt getMaxValue(unsigned numBits)
Gets maximum unsigned value of APInt for specific bit width.
Definition APInt.h:207
APInt abs() const
Get the absolute value.
Definition APInt.h:1810
unsigned getBitWidth() const
Return the number of bits in the APInt.
Definition APInt.h:1503
static APInt getSignedMaxValue(unsigned numBits)
Gets maximum signed value of APInt for a specific bit width.
Definition APInt.h:210
static APInt getMinValue(unsigned numBits)
Gets minimum unsigned value of APInt for a specific bit width.
Definition APInt.h:217
static APInt getSignedMinValue(unsigned numBits)
Gets minimum signed value of APInt for a specific bit width.
Definition APInt.h:220
LLVM_ABI APInt sext(unsigned width) const
Sign extend to a new width.
Definition APInt.cpp:1016
bool isPowerOf2() const
Check if this APInt's value is a power of two greater than zero.
Definition APInt.h:441
bool uge(const APInt &RHS) const
Unsigned greater or equal comparison.
Definition APInt.h:1228
@ NoAlias
The two locations do not alias at all.
ArrayRef - Represent a constant reference to an array (0 or more elements consecutively in memory),...
Definition ArrayRef.h:40
const T & back() const
back - Get the last element.
Definition ArrayRef.h:151
const T & front() const
front - Get the first element.
Definition ArrayRef.h:145
iterator end() const
Definition ArrayRef.h:131
iterator begin() const
Definition ArrayRef.h:130
LLVM Basic Block Representation.
Definition BasicBlock.h:62
const Function * getParent() const
Return the enclosing method, or null if none.
Definition BasicBlock.h:213
const Instruction * getTerminator() const LLVM_READONLY
Returns the terminator instruction; assumes that the block is well-formed.
Definition BasicBlock.h:237
This class represents a function call, abstracting a target machine's calling convention.
@ ICMP_ULT
unsigned less than
Definition InstrTypes.h:701
@ ICMP_NE
not equal
Definition InstrTypes.h:698
@ ICMP_ULE
unsigned less or equal
Definition InstrTypes.h:702
@ FCMP_UNO
1 0 0 0 True if unordered: isnan(X) | isnan(Y)
Definition InstrTypes.h:686
Predicate getInversePredicate() const
For example, EQ -> NE, UGT -> ULE, SLT -> SGE, OEQ -> UNE, UGT -> OLE, OLT -> UGE,...
Definition InstrTypes.h:789
An abstraction over a floating-point predicate, and a pack of an integer predicate with samesign info...
static ConstantInt * getSigned(IntegerType *Ty, int64_t V, bool ImplicitTrunc=false)
Return a ConstantInt with the specified value for the specified type.
Definition Constants.h:135
This class represents a range of values.
LLVM_ABI bool contains(const APInt &Val) const
Return true if the specified value is in the set.
static LLVM_ABI Constant * getAllOnesValue(Type *Ty)
static LLVM_ABI Constant * getNullValue(Type *Ty)
Constructor to create a '0' constant of arbitrary type.
A parsed version of the target data layout string in and methods for querying it.
Definition DataLayout.h:64
A debug info location.
Definition DebugLoc.h:123
static DebugLoc getCompilerGenerated()
Definition DebugLoc.h:162
static DebugLoc getUnknown()
Definition DebugLoc.h:161
ValueT lookup(const_arg_type_t< KeyT > Val) const
lookup - Return the entry for the specified key, or a default constructed value if no such entry exis...
Definition DenseMap.h:205
std::pair< iterator, bool > try_emplace(KeyT &&Key, Ts &&...Args)
Definition DenseMap.h:256
ValueT lookup_or(const_arg_type_t< KeyT > Val, U &&Default) const
Definition DenseMap.h:215
bool dominates(const DomTreeNodeBase< NodeT > *A, const DomTreeNodeBase< NodeT > *B) const
dominates - Returns true iff A dominates B.
constexpr bool isVector() const
One or more elements.
Definition TypeSize.h:324
static constexpr ElementCount getScalable(ScalarTy MinVal)
Definition TypeSize.h:312
Utility class for floating point operations which can have information about relaxed accuracy require...
Definition Operator.h:200
Convenience struct for specifying and reasoning about fast-math flags.
Definition FMF.h:23
Represents flags for the getelementptr instruction/expression.
GEPNoWrapFlags withoutNoUnsignedWrap() const
static GEPNoWrapFlags none()
an instruction for type-safe pointer arithmetic to access elements of arrays and structs
A struct for saving information about induction variables.
InductionKind
This enum represents the kinds of inductions that we support.
@ IK_PtrInduction
Pointer induction var. Step = C.
@ IK_IntInduction
Integer induction variable. Step = C.
InstSimplifyFolder - Use InstructionSimplify to fold operations to existing values.
static InstructionCost getInvalid(CostType Val=0)
bool isCast() const
bool isBinaryOp() const
LLVM_ABI const DataLayout & getDataLayout() const
Get the data layout of the module this instruction belongs to.
static LLVM_ABI IntegerType * get(LLVMContext &C, unsigned NumBits)
This static method is the primary way of constructing an IntegerType.
Definition Type.cpp:354
The group of interleaved loads/stores sharing the same stride and close to each other.
InstTy * getMember(uint32_t Index) const
Get the member with the given index Index.
uint32_t getNumMembers() const
This is an important class for using LLVM in a threaded context.
Definition LLVMContext.h:68
An instruction for reading from memory.
static bool getDecisionAndClampRange(const std::function< bool(ElementCount)> &Predicate, VFRange &Range)
Test a Predicate on a Range of VF's.
Definition VPlan.cpp:1597
Represents a single loop in the control flow graph.
Definition LoopInfo.h:40
LLVM_ABI MDNode * createBranchWeights(uint32_t TrueWeight, uint32_t FalseWeight, bool IsExpected=false)
Return metadata containing two branch weights.
Definition MDBuilder.cpp:38
Metadata node.
Definition Metadata.h:1080
This class implements a map that also provides access to all stored values in a deterministic order.
Definition MapVector.h:36
bool empty() const
Definition MapVector.h:77
std::pair< iterator, bool > try_emplace(const KeyT &Key, Ts &&...Args)
Definition MapVector.h:116
ValueT lookup(const KeyT &Key) const
Definition MapVector.h:108
Representation for a specific memory location.
AAMDNodes AATags
The metadata nodes which describes the aliasing of the location (each member is null if that kind of ...
An interface layer with SCEV used to manage how we see SCEV expressions for values in the context of ...
ScalarEvolution * getSE() const
Returns the ScalarEvolution analysis used.
LLVM_ABI const SCEV * getSCEV(Value *V)
Returns the SCEV expression of V, in the context of the current SCEV predicate.
static LLVM_ABI unsigned getOpcode(RecurKind Kind)
Returns the opcode corresponding to the RecurrenceKind.
static bool isFindLastRecurrenceKind(RecurKind Kind)
Returns true if the recurrence kind is of the form select(cmp(),x,y) where one of (x,...
RegionT * getParent() const
Get the parent of the Region.
Definition RegionInfo.h:362
This class uses information about analyze scalars to rewrite expressions in canonical form.
LLVM_ABI Value * expandCodeFor(const SCEV *SH, Type *Ty, BasicBlock::iterator I)
Insert code to directly compute the specified SCEV expression into the program.
static const SCEV * rewrite(const SCEV *Scev, ScalarEvolution &SE, ValueToSCEVMapTy &Map)
This class represents an analyzed expression in the program.
LLVM_ABI Type * getType() const
Return the LLVM type of this SCEV expression.
The main scalar evolution driver.
LLVM_ABI const SCEV * getUDivExpr(SCEVUse LHS, SCEVUse RHS)
Get a canonical unsigned division expression, or something simpler if possible.
const DataLayout & getDataLayout() const
Return the DataLayout associated with the module this SCEV instance is operating on.
LLVM_ABI const SCEV * getNegativeSCEV(const SCEV *V, SCEV::NoWrapFlags Flags=SCEV::FlagAnyWrap)
Return the SCEV object corresponding to -V.
LLVM_ABI bool isKnownNonZero(const SCEV *S)
Test if the given expression is known to be non-zero.
LLVM_ABI const SCEV * getSCEV(Value *V)
Return a SCEV expression for the full generality of the specified expression.
LLVM_ABI const SCEV * getMinusSCEV(SCEVUse LHS, SCEVUse RHS, SCEV::NoWrapFlags Flags=SCEV::FlagAnyWrap, unsigned Depth=0)
Return LHS-RHS.
ConstantRange getSignedRange(const SCEV *S)
Determine the signed range for a particular SCEV.
LLVM_ABI bool isKnownPositive(const SCEV *S)
Test if the given expression is known to be positive.
LLVM_ABI const SCEV * getElementCount(Type *Ty, ElementCount EC, SCEV::NoWrapFlags Flags=SCEV::FlagAnyWrap)
ConstantRange getUnsignedRange(const SCEV *S)
Determine the unsigned range for a particular SCEV.
LLVM_ABI const SCEV * getMulExpr(SmallVectorImpl< SCEVUse > &Ops, SCEV::NoWrapFlags Flags=SCEV::FlagAnyWrap, unsigned Depth=0)
Get a canonical multiply expression, or something simpler if possible.
LLVM_ABI bool isKnownPredicate(CmpPredicate Pred, SCEVUse LHS, SCEVUse RHS)
Test if the given expression is known to satisfy the condition described by Pred, LHS,...
static LLVM_ABI bool mayAliasInScopes(const MDNode *Scopes, const MDNode *NoAlias)
static LLVM_ABI AliasResult alias(const MemoryLocation &LocA, const MemoryLocation &LocB)
A vector that has set insertion semantics.
Definition SetVector.h:57
size_type size() const
Determine the number of elements in the SetVector.
Definition SetVector.h:103
bool insert(const value_type &X)
Insert a new element into the SetVector.
Definition SetVector.h:151
size_type size() const
Definition SmallPtrSet.h:99
A templated base class for SmallPtrSet which provides the typesafe interface that is common across al...
std::pair< iterator, bool > insert(PtrType Ptr)
Inserts Ptr if and only if there is no element in the container equal to Ptr.
iterator begin() const
bool contains(ConstPtrType Ptr) const
SmallPtrSet - This class implements a set which is optimized for holding SmallSize or less elements.
This class consists of common code factored out of the SmallVector class to reduce code duplication b...
void push_back(const T &Elt)
This is a 'vector' (really, a variable-sized array), optimized for the case when the array is small.
An instruction for storing to memory.
Provides information about what library functions are available for the current target.
This pass provides access to the codegen interfaces that are needed for IR-level transformations.
static LLVM_ABI PartialReductionExtendKind getPartialReductionExtendKind(Instruction *I)
Get the kind of extension that an instruction represents.
TargetCostKind
The kind of cost model.
@ TCK_RecipThroughput
Reciprocal throughput.
LLVM_ABI InstructionCost getPartialReductionCost(unsigned Opcode, Type *InputTypeA, Type *InputTypeB, Type *AccumType, ElementCount VF, PartialReductionExtendKind OpAExtend, PartialReductionExtendKind OpBExtend, std::optional< unsigned > BinOp, TTI::TargetCostKind CostKind, std::optional< FastMathFlags > FMF) const
Twine - A lightweight data structure for efficiently representing the concatenation of temporary valu...
Definition Twine.h:82
This class implements a switch-like dispatch statement for a value of 'T' using dyn_cast functionalit...
Definition TypeSwitch.h:89
TypeSwitch< T, ResultT > & Case(CallableT &&caseFn)
Add a case on the given type.
Definition TypeSwitch.h:98
The instances of the Type class are immutable: once they are created, they are never changed.
Definition Type.h:46
static LLVM_ABI IntegerType * getInt32Ty(LLVMContext &C)
Definition Type.cpp:313
bool isPointerTy() const
True if this is an instance of PointerType.
Definition Type.h:284
static LLVM_ABI IntegerType * getInt8Ty(LLVMContext &C)
Definition Type.cpp:311
bool isStructTy() const
True if this is an instance of StructType.
Definition Type.h:278
LLVM_ABI TypeSize getPrimitiveSizeInBits() const LLVM_READONLY
Return the basic size of this type if it is a primitive type.
Definition Type.cpp:201
LLVM_ABI unsigned getScalarSizeInBits() const LLVM_READONLY
If this is a vector type, return the getPrimitiveSizeInBits value for the element type.
Definition Type.cpp:236
static LLVM_ABI IntegerType * getInt1Ty(LLVMContext &C)
Definition Type.cpp:310
bool isFloatingPointTy() const
Return true if this is one of the floating-point types.
Definition Type.h:186
bool isIntegerTy() const
True if this is an instance of IntegerType.
Definition Type.h:257
op_range operands()
Definition User.h:267
A recipe for generating the active lane mask for the vector loop that is used to predicate the vector...
Definition VPlan.h:3889
VPBasicBlock serves as the leaf of the Hierarchical Control-Flow Graph.
Definition VPlan.h:4269
void appendRecipe(VPRecipeBase *Recipe)
Augment the existing recipes of a VPBasicBlock with an additional Recipe as the last recipe.
Definition VPlan.h:4344
RecipeListTy::iterator iterator
Instruction iterators...
Definition VPlan.h:4296
iterator end()
Definition VPlan.h:4306
iterator begin()
Recipe iterator methods.
Definition VPlan.h:4304
iterator_range< iterator > phis()
Returns an iterator range over the PHI-like recipes in the block.
Definition VPlan.h:4357
iterator getFirstNonPhi()
Return the position of the first non-phi node recipe in the block.
Definition VPlan.cpp:232
VPRegionBlock * getEnclosingLoopRegion()
Definition VPlan.cpp:598
VPBasicBlock * splitAt(iterator SplitAt)
Split current block at SplitAt by inserting a new block between the current block and its successors ...
Definition VPlan.cpp:565
const VPRecipeBase & front() const
Definition VPlan.h:4316
VPRecipeBase * getTerminator()
If the block has multiple successors, return the branch recipe terminating the block.
Definition VPlan.cpp:644
const VPRecipeBase & back() const
Definition VPlan.h:4318
A recipe for vectorizing a phi-node as a sequence of mask-based select instructions.
Definition VPlan.h:2794
VPValue * getMask(unsigned Idx) const
Return mask number Idx.
Definition VPlan.h:2830
unsigned getNumIncomingValues() const
Return the number of incoming values, taking into account when normalized the first incoming value wi...
Definition VPlan.h:2820
void setMask(unsigned Idx, VPValue *V)
Set mask number Idx to V.
Definition VPlan.h:2836
bool isNormalized() const
A normalized blend is one that has an odd number of operands, whereby the first operand does not have...
Definition VPlan.h:2816
VPBlockBase is the building block of the Hierarchical Control-Flow Graph.
Definition VPlan.h:98
void setSuccessors(ArrayRef< VPBlockBase * > NewSuccs)
Set each VPBasicBlock in NewSuccs as successor of this VPBlockBase.
Definition VPlan.h:319
VPRegionBlock * getParent()
Definition VPlan.h:190
const VPBasicBlock * getExitingBasicBlock() const
Definition VPlan.cpp:202
size_t getNumSuccessors() const
Definition VPlan.h:241
void setPredecessors(ArrayRef< VPBlockBase * > NewPreds)
Set each VPBasicBlock in NewPreds as predecessor of this VPBlockBase.
Definition VPlan.h:310
const VPBlocksTy & getPredecessors() const
Definition VPlan.h:226
VPlan * getPlan()
Definition VPlan.cpp:177
const std::string & getName() const
Definition VPlan.h:181
void clearSuccessors()
Remove all the successors of this block.
Definition VPlan.h:329
VPBlockBase * getSinglePredecessor() const
Definition VPlan.h:237
const VPBasicBlock * getEntryBasicBlock() const
Definition VPlan.cpp:182
VPBlockBase * getSingleHierarchicalPredecessor()
Definition VPlan.h:283
VPBlockBase * getSingleSuccessor() const
Definition VPlan.h:231
const VPBlocksTy & getSuccessors() const
Definition VPlan.h:215
static auto blocksOnly(const T &Range)
Return an iterator range over Range which only includes BlockTy blocks.
Definition VPlanUtils.h:273
static void insertOnEdge(VPBlockBase *From, VPBlockBase *To, VPBlockBase *BlockPtr)
Inserts BlockPtr on the edge between From and To.
Definition VPlanUtils.h:300
static bool isLatch(const VPBlockBase *VPB, const VPDominatorTree &VPDT)
Returns true if VPB is a loop latch, using isHeader().
static void insertTwoBlocksAfter(VPBlockBase *IfTrue, VPBlockBase *IfFalse, VPBlockBase *BlockPtr)
Insert disconnected VPBlockBases IfTrue and IfFalse after BlockPtr.
Definition VPlanUtils.h:200
static void connectBlocks(VPBlockBase *From, VPBlockBase *To, unsigned PredIdx=-1u, unsigned SuccIdx=-1u)
Connect VPBlockBases From and To bi-directionally.
Definition VPlanUtils.h:218
static void disconnectBlocks(VPBlockBase *From, VPBlockBase *To)
Disconnect VPBlockBases From and To bi-directionally.
Definition VPlanUtils.h:236
static void transferSuccessors(VPBlockBase *Old, VPBlockBase *New)
Transfer successors from Old to New. New must have no successors.
Definition VPlanUtils.h:256
static SmallVector< VPBasicBlock * > blocksInSingleSuccessorChainBetween(VPBasicBlock *FirstBB, VPBasicBlock *LastBB)
Returns the blocks between FirstBB and LastBB, where FirstBB to LastBB forms a single-successor chain.
A recipe for generating conditional branches on the bits of a mask.
Definition VPlan.h:3298
RAII object that stores the current insertion point and restores it when the object is destroyed.
VPlan-based builder utility analogous to IRBuilder.
VPInstruction * createOr(VPValue *LHS, VPValue *RHS, DebugLoc DL=DebugLoc::getUnknown(), const Twine &Name="")
VPValue * createScalarZExtOrTrunc(VPValue *Op, Type *ResultTy, Type *SrcTy, DebugLoc DL)
VPValue * createElementCount(Type *Ty, ElementCount EC)
VPInstruction * createNot(VPValue *Operand, DebugLoc DL=DebugLoc::getUnknown(), const Twine &Name="")
VPInstruction * createScalarCast(Instruction::CastOps Opcode, VPValue *Op, Type *ResultTy, DebugLoc DL, const VPIRMetadata &Metadata={})
static VPBuilder getToInsertAfter(VPRecipeBase *R)
Create a VPBuilder to insert after R.
VPInstruction * createOverflowingOp(unsigned Opcode, ArrayRef< VPValue * > Operands, VPRecipeWithIRFlags::WrapFlagsTy WrapFlags={false, false}, DebugLoc DL=DebugLoc::getUnknown(), const Twine &Name="")
VPPhi * createScalarPhi(ArrayRef< VPValue * > IncomingValues, DebugLoc DL=DebugLoc::getUnknown(), const Twine &Name="", const VPIRFlags &Flags={})
VPDerivedIVRecipe * createDerivedIV(InductionDescriptor::InductionKind Kind, FPMathOperator *FPBinOp, VPIRValue *Start, VPValue *Current, VPValue *Step, const Twine &Name="")
Convert the input value Current to the corresponding value of an induction with Start and Step values...
VPInstruction * createICmp(CmpInst::Predicate Pred, VPValue *A, VPValue *B, DebugLoc DL=DebugLoc::getUnknown(), const Twine &Name="")
Create a new ICmp VPInstruction with predicate Pred and operands A and B.
VPInstruction * createSelect(VPValue *Cond, VPValue *TrueVal, VPValue *FalseVal, DebugLoc DL=DebugLoc::getUnknown(), const Twine &Name="", const VPIRFlags &Flags={})
void setInsertPoint(VPBasicBlock *TheBB)
This specifies that created VPInstructions should be appended to the end of the specified block.
VPInstruction * createNaryOp(unsigned Opcode, ArrayRef< VPValue * > Operands, Instruction *Inst=nullptr, const VPIRFlags &Flags={}, const VPIRMetadata &MD={}, DebugLoc DL=DebugLoc::getUnknown(), const Twine &Name="")
Create an N-ary operation with Opcode, Operands and set Inst as its underlying Instruction.
Canonical scalar induction phi of the vector loop.
Definition VPlan.h:3831
A recipe for generating the phi node tracking the current scalar iteration index.
Definition VPlan.h:3921
unsigned getNumDefinedValues() const
Returns the number of values defined by the VPDef.
Definition VPlanValue.h:465
VPValue * getVPSingleValue()
Returns the only VPValue defined by the VPDef.
Definition VPlanValue.h:438
VPValue * getVPValue(unsigned I)
Returns the VPValue with index I defined by the VPDef.
Definition VPlanValue.h:450
ArrayRef< VPRecipeValue * > definedValues()
Returns an ArrayRef of the values defined by the VPDef.
Definition VPlanValue.h:460
A recipe for converting the input value IV value to the corresponding value of an IV with different s...
Definition VPlan.h:4001
Template specialization of the standard LLVM dominator tree utility for VPBlockBases.
bool properlyDominates(const VPRecipeBase *A, const VPRecipeBase *B)
A recipe to combine multiple recipes into a single 'expression' recipe, which should be considered a ...
Definition VPlan.h:3343
A pure virtual base class for all recipes modeling header phis, including phis for first order recurr...
Definition VPlan.h:2306
virtual VPValue * getBackedgeValue()
Returns the incoming value from the loop backedge.
Definition VPlan.h:2348
VPValue * getStartValue()
Returns the start value of the phi, if one is set.
Definition VPlan.h:2337
A special type of VPBasicBlock that wraps an existing IR basic block.
Definition VPlan.h:4422
Class to record and manage LLVM IR flags.
Definition VPlan.h:690
static VPIRFlags getDefaultFlags(unsigned Opcode)
Returns default flags for Opcode for opcodes that support it, asserts otherwise.
LLVM_ABI_FOR_TEST FastMathFlags getFastMathFlags() const
static LLVM_ABI_FOR_TEST VPIRInstruction * create(Instruction &I)
Create a new VPIRPhi for I, if it is a PHINode, otherwise create a VPIRInstruction.
Helper to manage IR metadata for recipes.
Definition VPlan.h:1170
void intersect(const VPIRMetadata &MD)
Intersect this VPIRMetadata object with MD, keeping only metadata nodes that are common to both.
This is a concrete Recipe that models a single VPlan-level instruction.
Definition VPlan.h:1225
@ ExtractLane
Extracts a single lane (first operand) from a set of vector operands.
Definition VPlan.h:1327
@ ComputeAnyOfResult
Compute the final result of a AnyOf reduction with select(cmp(),x,y), where one of (x,...
Definition VPlan.h:1272
@ Unpack
Extracts all lanes from its (non-scalable) vector operand.
Definition VPlan.h:1269
@ ReductionStartVector
Start vector for reductions with 3 operands: the original start value, the identity value for the red...
Definition VPlan.h:1321
@ BuildVector
Creates a fixed-width vector containing all operands.
Definition VPlan.h:1264
@ BuildStructVector
Given operands of (the same) struct type, creates a struct of fixed- width vectors each containing a ...
Definition VPlan.h:1261
@ CanonicalIVIncrementForPart
Definition VPlan.h:1245
const InterleaveGroup< Instruction > * getInterleaveGroup() const
Definition VPlan.h:2939
VPValue * getMask() const
Return the mask used by this recipe.
Definition VPlan.h:2931
ArrayRef< VPValue * > getStoredValues() const
Return the VPValues stored by this interleave group.
Definition VPlan.h:2960
A recipe for interleaved memory operations with vector-predication intrinsics.
Definition VPlan.h:3012
VPInterleaveRecipe is a recipe for transforming an interleave group of load or stores into one wide l...
Definition VPlan.h:2970
VPValue * getIncomingValue(unsigned Idx) const
Returns the incoming VPValue with index Idx.
Definition VPlan.h:1593
VPPredInstPHIRecipe is a recipe for generating the phi nodes needed when control converges back from ...
Definition VPlan.h:3485
VPRecipeBase is a base class modeling a sequence of one or more output IR instructions.
Definition VPlan.h:406
VPRegionBlock * getRegion()
Definition VPlan.h:4574
VPBasicBlock * getParent()
Definition VPlan.h:481
DebugLoc getDebugLoc() const
Returns the debug location of the recipe.
Definition VPlan.h:555
void moveBefore(VPBasicBlock &BB, iplist< VPRecipeBase >::iterator I)
Unlink this recipe and insert into BB before I.
void insertBefore(VPRecipeBase *InsertPos)
Insert an unlinked recipe into a basic block immediately before the specified recipe.
void insertAfter(VPRecipeBase *InsertPos)
Insert an unlinked Recipe into a basic block immediately after the specified Recipe.
iplist< VPRecipeBase >::iterator eraseFromParent()
This method unlinks 'this' from the containing basic block and deletes it.
Helper class to create VPRecipes from IR instructions.
VPRecipeBase * getRecipe(Instruction *I)
Return the recipe created for given ingredient.
A recipe to represent inloop reduction operations with vector-predication intrinsics,...
Definition VPlan.h:3172
A recipe for handling reduction phis.
Definition VPlan.h:2700
void setVFScaleFactor(unsigned ScaleFactor)
Set the VFScaleFactor for this reduction phi.
Definition VPlan.h:2747
unsigned getVFScaleFactor() const
Get the factor that the VF of this recipe's output should be scaled by, or 1 if it isn't scaled.
Definition VPlan.h:2740
RecurKind getRecurrenceKind() const
Returns the recurrence kind of the reduction.
Definition VPlan.h:2758
A recipe to represent inloop, ordered or partial reduction operations.
Definition VPlan.h:3063
VPRegionBlock represents a collection of VPBasicBlocks and VPRegionBlocks which form a Single-Entry-S...
Definition VPlan.h:4457
const VPBlockBase * getEntry() const
Definition VPlan.h:4493
Type * getCanonicalIVType()
Return the type of the canonical IV for loop regions.
Definition VPlan.h:4568
bool isReplicator() const
An indicator whether this region is to generate multiple replicated instances of output IR correspond...
Definition VPlan.h:4525
void setExiting(VPBlockBase *ExitingBlock)
Set ExitingBlock as the exiting VPBlockBase of this VPRegionBlock.
Definition VPlan.h:4510
VPCanonicalIVPHIRecipe * getCanonicalIV()
Returns the canonical induction recipe of the region.
Definition VPlan.h:4555
const VPBlockBase * getExiting() const
Definition VPlan.h:4505
VPBasicBlock * getPreheaderVPBB()
Returns the pre-header VPBasicBlock of the loop region.
Definition VPlan.h:4518
VPReplicateRecipe replicates a given instruction producing multiple scalar copies of the original sca...
Definition VPlan.h:3217
bool isSingleScalar() const
Definition VPlan.h:3258
bool isPredicated() const
Definition VPlan.h:3260
VPValue * getMask()
Return the mask of a predicated VPReplicateRecipe.
Definition VPlan.h:3282
A recipe for handling phi nodes of integer and floating-point inductions, producing their scalar valu...
Definition VPlan.h:4073
VPSingleDef is a base class for recipes for modeling a sequence of one or more output IR that define ...
Definition VPlan.h:607
Instruction * getUnderlyingInstr()
Returns the underlying instruction.
Definition VPlan.h:675
VPSingleDefRecipe * clone() override=0
Clone the current recipe.
An analysis for type-inference for VPValues.
LLVMContext & getContext()
Return the LLVMContext used by the analysis.
Type * inferScalarType(const VPValue *V)
Infer the type of V. Returns the scalar type of V.
This class augments VPValue with operands which provide the inverse def-use edges from VPValue's user...
Definition VPlanValue.h:296
operand_range operands()
Definition VPlanValue.h:364
void setOperand(unsigned I, VPValue *New)
Definition VPlanValue.h:340
unsigned getNumOperands() const
Definition VPlanValue.h:334
VPValue * getOperand(unsigned N) const
Definition VPlanValue.h:335
void addOperand(VPValue *Operand)
Definition VPlanValue.h:329
This is the base class of the VPlan Def/Use graph, used for modeling the data flow into,...
Definition VPlanValue.h:46
Value * getLiveInIRValue() const
Return the underlying IR value for a VPIRValue.
Definition VPlan.cpp:137
bool isDefinedOutsideLoopRegions() const
Returns true if the VPValue is defined outside any loop.
Definition VPlan.cpp:1425
VPRecipeBase * getDefiningRecipe()
Returns the recipe defining this VPValue or nullptr if it is not defined by a recipe,...
Definition VPlan.cpp:127
Value * getUnderlyingValue() const
Return the underlying Value attached to this VPValue.
Definition VPlanValue.h:70
bool hasOneUse() const
Definition VPlanValue.h:166
void setUnderlyingValue(Value *Val)
Definition VPlanValue.h:196
void replaceAllUsesWith(VPValue *New)
Definition VPlan.cpp:1428
unsigned getNumUsers() const
Definition VPlanValue.h:107
void replaceUsesWithIf(VPValue *New, llvm::function_ref< bool(VPUser &U, unsigned Idx)> ShouldReplace)
Go through the uses list for this VPValue and make each use point to New if the callback ShouldReplac...
Definition VPlan.cpp:1434
user_range users()
Definition VPlanValue.h:149
A recipe to compute a pointer to the last element of each part of a widened memory access for widened...
Definition VPlan.h:2154
A Recipe for widening the canonical induction variable of the vector loop.
Definition VPlan.h:3964
VPWidenCastRecipe is a recipe to create vector cast instructions.
Definition VPlan.h:1840
Instruction::CastOps getOpcode() const
Definition VPlan.h:1878
A recipe for handling GEP instructions.
Definition VPlan.h:2090
Base class for widened induction (VPWidenIntOrFpInductionRecipe and VPWidenPointerInductionRecipe),...
Definition VPlan.h:2372
VPIRValue * getStartValue() const
Returns the start value of the induction.
Definition VPlan.h:2400
PHINode * getPHINode() const
Returns the underlying PHINode if one exists, or null otherwise.
Definition VPlan.h:2418
VPValue * getStepValue()
Returns the step value of the induction.
Definition VPlan.h:2403
const InductionDescriptor & getInductionDescriptor() const
Returns the induction descriptor for the recipe.
Definition VPlan.h:2423
A recipe for handling phi nodes of integer and floating-point inductions, producing their vector valu...
Definition VPlan.h:2454
VPIRValue * getStartValue() const
Returns the start value of the induction.
Definition VPlan.h:2501
VPValue * getSplatVFValue() const
If the recipe has been unrolled, return the VPValue for the induction increment, otherwise return nul...
Definition VPlan.h:2505
VPValue * getLastUnrolledPartOperand()
Returns the VPValue representing the value of this induction at the last unrolled part,...
Definition VPlan.h:2532
A recipe for widening vector intrinsics.
Definition VPlan.h:1892
A common base class for widening memory operations.
Definition VPlan.h:3528
A recipe for widened phis.
Definition VPlan.h:2590
VPWidenRecipe is a recipe for producing a widened instruction using the opcode and operands of the re...
Definition VPlan.h:1784
unsigned getOpcode() const
Definition VPlan.h:1821
VPlan models a candidate for vectorization, encoding various decisions taken to produce efficient outp...
Definition VPlan.h:4587
VPIRValue * getLiveIn(Value *V) const
Return the live-in VPIRValue for V, if there is one or nullptr otherwise.
Definition VPlan.h:4895
bool hasVF(ElementCount VF) const
Definition VPlan.h:4800
const DataLayout & getDataLayout() const
Definition VPlan.h:4782
LLVMContext & getContext() const
Definition VPlan.h:4778
VPBasicBlock * getEntry()
Definition VPlan.h:4679
bool hasScalableVF() const
Definition VPlan.h:4801
VPValue * getTripCount() const
The trip count of the original loop.
Definition VPlan.h:4737
VPValue * getOrCreateBackedgeTakenCount()
The backedge taken count of the original loop.
Definition VPlan.h:4758
iterator_range< SmallSetVector< ElementCount, 2 >::iterator > vectorFactors() const
Returns an iterator range over all VFs of the plan.
Definition VPlan.h:4807
VPIRValue * getFalse()
Return a VPIRValue wrapping i1 false.
Definition VPlan.h:4866
VPSymbolicValue & getVFxUF()
Returns VF * UF of the vector loop region.
Definition VPlan.h:4776
VPIRValue * getAllOnesValue(Type *Ty)
Return a VPIRValue wrapping the AllOnes value of type Ty.
Definition VPlan.h:4872
VPRegionBlock * createReplicateRegion(VPBlockBase *Entry, VPBlockBase *Exiting, const std::string &Name="")
Create a new replicate region with Entry, Exiting and Name.
Definition VPlan.h:4942
auto getLiveIns() const
Return the list of live-in VPValues available in the VPlan.
Definition VPlan.h:4898
bool hasUF(unsigned UF) const
Definition VPlan.h:4818
ArrayRef< VPIRBasicBlock * > getExitBlocks() const
Return an ArrayRef containing VPIRBasicBlocks wrapping the exit blocks of the original scalar loop.
Definition VPlan.h:4727
VPSymbolicValue & getVectorTripCount()
The vector trip count.
Definition VPlan.h:4766
VPIRValue * getOrAddLiveIn(Value *V)
Gets the live-in VPIRValue for V or adds a new live-in (if none exists yet) for V.
Definition VPlan.h:4843
VPIRValue * getZero(Type *Ty)
Return a VPIRValue wrapping the null value of type Ty.
Definition VPlan.h:4869
void setVF(ElementCount VF)
Definition VPlan.h:4788
bool isUnrolled() const
Returns true if the VPlan already has been unrolled, i.e.
Definition VPlan.h:4834
LLVM_ABI_FOR_TEST VPRegionBlock * getVectorLoopRegion()
Returns the VPRegionBlock of the vector loop.
Definition VPlan.cpp:1058
unsigned getConcreteUF() const
Returns the concrete UF of the plan, after unrolling.
Definition VPlan.h:4821
void resetTripCount(VPValue *NewTripCount)
Resets the trip count for the VPlan.
Definition VPlan.h:4751
VPBasicBlock * getMiddleBlock()
Returns the 'middle' block of the plan, that is the block that selects whether to execute the scalar ...
Definition VPlan.h:4704
VPBasicBlock * createVPBasicBlock(const Twine &Name, VPRecipeBase *Recipe=nullptr)
Create a new VPBasicBlock with Name and containing Recipe if present.
Definition VPlan.h:4921
VPIRValue * getTrue()
Return a VPIRValue wrapping i1 true.
Definition VPlan.h:4863
VPSymbolicValue & getUF()
Returns the UF of the vector loop region.
Definition VPlan.h:4773
bool hasScalarVFOnly() const
Definition VPlan.h:4811
VPBasicBlock * getScalarPreheader() const
Return the VPBasicBlock for the preheader of the scalar loop.
Definition VPlan.h:4718
VPBasicBlock * getVectorPreheader()
Returns the preheader of the vector loop region, if one exists, or null otherwise.
Definition VPlan.h:4684
VPSymbolicValue & getVF()
Returns the VF of the vector loop region.
Definition VPlan.h:4769
void setUF(unsigned UF)
Definition VPlan.h:4826
bool hasScalarTail() const
Returns true if the scalar tail may execute after the vector loop.
Definition VPlan.h:4974
LLVM_ABI_FOR_TEST VPlan * duplicate()
Clone the current VPlan, update all VPValues of the new VPlan and cloned recipes to refer to the clon...
Definition VPlan.cpp:1206
VPIRValue * getConstantInt(Type *Ty, uint64_t Val, bool IsSigned=false)
Return a VPIRValue wrapping a ConstantInt with the given type and value.
Definition VPlan.h:4877
LLVM Value Representation.
Definition Value.h:75
Type * getType() const
All values are typed, get the type of this value.
Definition Value.h:256
iterator_range< user_iterator > users()
Definition Value.h:427
bool hasName() const
Definition Value.h:262
LLVM_ABI StringRef getName() const
Return a constant reference to the value's name.
Definition Value.cpp:322
constexpr bool hasKnownScalarFactor(const FixedOrScalableQuantity &RHS) const
Returns true if there exists a value X where RHS.multiplyCoefficientBy(X) will result in a value whos...
Definition TypeSize.h:269
constexpr ScalarTy getFixedValue() const
Definition TypeSize.h:200
constexpr ScalarTy getKnownScalarFactor(const FixedOrScalableQuantity &RHS) const
Returns a value X where RHS.multiplyCoefficientBy(X) will result in a value whose quantity matches ou...
Definition TypeSize.h:277
static constexpr bool isKnownLT(const FixedOrScalableQuantity &LHS, const FixedOrScalableQuantity &RHS)
Definition TypeSize.h:216
constexpr bool isScalable() const
Returns whether the quantity is scaled by a runtime quantity (vscale).
Definition TypeSize.h:168
constexpr LeafTy multiplyCoefficientBy(ScalarTy RHS) const
Definition TypeSize.h:256
constexpr bool isFixed() const
Returns true if the quantity is not scaled by vscale.
Definition TypeSize.h:171
constexpr ScalarTy getKnownMinValue() const
Returns the minimum value this quantity can represent.
Definition TypeSize.h:165
An efficient, type-erasing, non-owning reference to a callable.
const ParentTy * getParent() const
Definition ilist_node.h:34
self_iterator getIterator()
Definition ilist_node.h:123
Changed
#define llvm_unreachable(msg)
Marks that the current location is not supposed to be reachable.
LLVM_ABI APInt RoundingUDiv(const APInt &A, const APInt &B, APInt::Rounding RM)
Return A unsign-divided by B, rounded by the given rounding mode.
Definition APInt.cpp:2803
unsigned ID
LLVM IR allows to use arbitrary numbers as calling convention identifiers.
Definition CallingConv.h:24
@ C
The default llvm calling convention, compatible with C.
Definition CallingConv.h:34
SpecificConstantMatch m_ZeroInt()
Convenience matchers for specific integer values.
BinaryOp_match< SrcTy, SpecificConstantMatch, TargetOpcode::G_XOR, true > m_Not(const SrcTy &&Src)
Matches a register not-ed by a G_XOR.
cst_pred_ty< is_all_ones > m_AllOnes()
Match an integer or vector with all bits set.
BinaryOp_match< LHS, RHS, Instruction::Add > m_Add(const LHS &L, const RHS &R)
m_Intrinsic_Ty< Opnd0, Opnd1, Opnd2 >::Ty m_MaskedStore(const Opnd0 &Op0, const Opnd1 &Op1, const Opnd2 &Op2)
Matches MaskedStore Intrinsic.
ap_match< APInt > m_APInt(const APInt *&Res)
Match a ConstantInt or splatted ConstantVector, binding the specified pointer to the contained APInt.
CastInst_match< OpTy, TruncInst > m_Trunc(const OpTy &Op)
Matches Trunc.
LogicalOp_match< LHS, RHS, Instruction::And > m_LogicalAnd(const LHS &L, const RHS &R)
Matches L && R either in the form of L & R or L ?
BinaryOp_match< LHS, RHS, Instruction::FMul > m_FMul(const LHS &L, const RHS &R)
match_combine_or< CastInst_match< OpTy, ZExtInst >, OpTy > m_ZExtOrSelf(const OpTy &Op)
bool match(Val *V, const Pattern &P)
specificval_ty m_Specific(const Value *V)
Match if we have a specific specified value.
auto match_fn(const Pattern &P)
A match functor that can be used as a UnaryPredicate in functional algorithms like all_of.
m_Intrinsic_Ty< Opnd0, Opnd1, Opnd2 >::Ty m_MaskedLoad(const Opnd0 &Op0, const Opnd1 &Op1, const Opnd2 &Op2)
Matches MaskedLoad Intrinsic.
class_match< ConstantInt > m_ConstantInt()
Match an arbitrary ConstantInt and ignore it.
cst_pred_ty< is_one > m_One()
Match an integer 1 or a vector with all elements equal to 1.
IntrinsicID_match m_Intrinsic()
Match intrinsic calls like this: m_Intrinsic<Intrinsic::fabs>(m_Value(X))
ThreeOps_match< Cond, LHS, RHS, Instruction::Select > m_Select(const Cond &C, const LHS &L, const RHS &R)
Matches SelectInst.
SpecificCmpClass_match< LHS, RHS, CmpInst > m_SpecificCmp(CmpPredicate MatchPred, const LHS &L, const RHS &R)
BinaryOp_match< LHS, RHS, Instruction::Mul > m_Mul(const LHS &L, const RHS &R)
deferredval_ty< Value > m_Deferred(Value *const &V)
Like m_Specific(), but works if the specific value to match is determined as part of the same match()...
CastInst_match< OpTy, FPExtInst > m_FPExt(const OpTy &Op)
SpecificCmpClass_match< LHS, RHS, ICmpInst > m_SpecificICmp(CmpPredicate MatchPred, const LHS &L, const RHS &R)
BinaryOp_match< LHS, RHS, Instruction::UDiv > m_UDiv(const LHS &L, const RHS &R)
class_match< CmpInst > m_Cmp()
Matches any compare instruction and ignore it.
BinaryOp_match< LHS, RHS, Instruction::Add, true > m_c_Add(const LHS &L, const RHS &R)
Matches a Add with LHS and RHS in either order.
CmpClass_match< LHS, RHS, ICmpInst > m_ICmp(CmpPredicate &Pred, const LHS &L, const RHS &R)
match_combine_or< CastInst_match< OpTy, ZExtInst >, CastInst_match< OpTy, SExtInst > > m_ZExtOrSExt(const OpTy &Op)
BinaryOp_match< LHS, RHS, Instruction::FAdd, true > m_c_FAdd(const LHS &L, const RHS &R)
Matches FAdd with LHS and RHS in either order.
LogicalOp_match< LHS, RHS, Instruction::And, true > m_c_LogicalAnd(const LHS &L, const RHS &R)
Matches L && R with LHS and RHS in either order.
auto m_LogicalAnd()
Matches L && R where L and R are arbitrary values.
CastInst_match< OpTy, SExtInst > m_SExt(const OpTy &Op)
Matches SExt.
BinaryOp_match< LHS, RHS, Instruction::Mul, true > m_c_Mul(const LHS &L, const RHS &R)
Matches a Mul with LHS and RHS in either order.
BinaryOp_match< LHS, RHS, Instruction::Sub > m_Sub(const LHS &L, const RHS &R)
match_combine_or< LTy, RTy > m_CombineOr(const LTy &L, const RTy &R)
Combine two pattern matchers matching L || R.
bind_cst_ty m_scev_APInt(const APInt *&C)
Match an SCEV constant and bind it to an APInt.
SCEVAffineAddRec_match< Op0_t, Op1_t, class_match< const Loop > > m_scev_AffineAddRec(const Op0_t &Op0, const Op1_t &Op1)
bool match(const SCEV *S, const Pattern &P)
class_match< const SCEV > m_SCEV()
VPInstruction_match< VPInstruction::ExtractLastLane, VPInstruction_match< VPInstruction::ExtractLastPart, Op0_t > > m_ExtractLastLaneOfLastPart(const Op0_t &Op0)
AllRecipe_commutative_match< Instruction::And, Op0_t, Op1_t > m_c_BinaryAnd(const Op0_t &Op0, const Op1_t &Op1)
Match a binary AND operation.
AllRecipe_match< Instruction::Or, Op0_t, Op1_t > m_BinaryOr(const Op0_t &Op0, const Op1_t &Op1)
Match a binary OR operation.
VPInstruction_match< VPInstruction::AnyOf > m_AnyOf()
AllRecipe_commutative_match< Instruction::Or, Op0_t, Op1_t > m_c_BinaryOr(const Op0_t &Op0, const Op1_t &Op1)
VPInstruction_match< VPInstruction::ComputeReductionResult, Op0_t > m_ComputeReductionResult(const Op0_t &Op0)
auto m_WidenAnyExtend(const Op0_t &Op0)
VPInstruction_match< VPInstruction::StepVector > m_StepVector()
auto m_VPPhi(const Op0_t &Op0, const Op1_t &Op1)
VPInstruction_match< VPInstruction::BranchOnTwoConds > m_BranchOnTwoConds()
AllRecipe_match< Opcode, Op0_t, Op1_t > m_Binary(const Op0_t &Op0, const Op1_t &Op1)
VPInstruction_match< VPInstruction::LastActiveLane, Op0_t > m_LastActiveLane(const Op0_t &Op0)
VPInstruction_match< VPInstruction::ExitingIVValue, Op0_t > m_ExitingIVValue(const Op0_t &Op0)
VPInstruction_match< Instruction::ExtractElement, Op0_t, Op1_t > m_ExtractElement(const Op0_t &Op0, const Op1_t &Op1)
specific_intval< 1 > m_False()
VPInstruction_match< VPInstruction::ExtractLastLane, Op0_t > m_ExtractLastLane(const Op0_t &Op0)
VPInstruction_match< VPInstruction::ActiveLaneMask, Op0_t, Op1_t, Op2_t > m_ActiveLaneMask(const Op0_t &Op0, const Op1_t &Op1, const Op2_t &Op2)
VPInstruction_match< VPInstruction::BranchOnCount > m_BranchOnCount()
bind_ty< VPIRValue > m_VPIRValue(VPIRValue *&V)
Match a VPIRValue.
auto m_GetElementPtr(const Op0_t &Op0, const Op1_t &Op1)
specific_intval< 1 > m_True()
VectorEndPointerRecipe_match< Op0_t, Op1_t > m_VecEndPtr(const Op0_t &Op0, const Op1_t &Op1)
VPInstruction_match< VPInstruction::ExtractLastPart, Op0_t > m_ExtractLastPart(const Op0_t &Op0)
VPInstruction_match< VPInstruction::Broadcast, Op0_t > m_Broadcast(const Op0_t &Op0)
class_match< VPValue > m_VPValue()
Match an arbitrary VPValue and ignore it.
VPInstruction_match< VPInstruction::ExplicitVectorLength, Op0_t > m_EVL(const Op0_t &Op0)
VPInstruction_match< VPInstruction::BuildVector > m_BuildVector()
BuildVector is matches only its opcode, w/o matching its operands as the number of operands is not fi...
VPInstruction_match< VPInstruction::ExtractPenultimateElement, Op0_t > m_ExtractPenultimateElement(const Op0_t &Op0)
VPInstruction_match< VPInstruction::FirstActiveLane, Op0_t > m_FirstActiveLane(const Op0_t &Op0)
bind_ty< VPInstruction > m_VPInstruction(VPInstruction *&V)
Match a VPInstruction, capturing if we match.
auto m_DerivedIV(const Op0_t &Op0, const Op1_t &Op1, const Op2_t &Op2)
VPInstruction_match< VPInstruction::BranchOnCond > m_BranchOnCond()
VPInstruction_match< VPInstruction::ExtractLane, Op0_t, Op1_t > m_ExtractLane(const Op0_t &Op0, const Op1_t &Op1)
VPInstruction_match< VPInstruction::Reverse, Op0_t > m_Reverse(const Op0_t &Op0)
NodeAddr< DefNode * > Def
Definition RDFGraph.h:384
bool isSingleScalar(const VPValue *VPV)
Returns true if VPV is a single scalar, either because it produces the same value for all lanes or on...
bool isUniformAcrossVFsAndUFs(VPValue *V)
Checks if V is uniform across all VF lanes and UF parts.
VPValue * getOrCreateVPValueForSCEVExpr(VPlan &Plan, const SCEV *Expr)
Get or create a VPValue that corresponds to the expansion of Expr.
VPInstruction * findComputeReductionResult(VPReductionPHIRecipe *PhiR)
Find the ComputeReductionResult recipe for PhiR, looking through selects inserted for predicated redu...
std::optional< MemoryLocation > getMemoryLocation(const VPRecipeBase &R)
Return a MemoryLocation for R with noalias metadata populated from R, if the recipe is supported and ...
bool onlyFirstLaneUsed(const VPValue *Def)
Returns true if only the first lane of Def is used.
VPRecipeBase * findRecipe(VPValue *Start, PredT Pred)
Search Start's users for a recipe satisfying Pred, looking through recipes with definitions.
Definition VPlanUtils.h:111
VPSingleDefRecipe * findHeaderMask(VPlan &Plan)
Collect the header mask with the pattern: (ICMP_ULE, WideCanonicalIV, backedge-taken-count) TODO: Int...
bool onlyScalarValuesUsed(const VPValue *Def)
Returns true if only scalar values of Def are used by all users.
static VPRecipeBase * findUserOf(VPValue *V, const MatchT &P)
If V is used by a recipe matching pattern P, return it.
Definition VPlanUtils.h:132
const SCEV * getSCEVExprForVPValue(const VPValue *V, PredicatedScalarEvolution &PSE, const Loop *L=nullptr)
Return the SCEV expression for V.
This is an optimization pass for GlobalISel generic memory operations.
auto drop_begin(T &&RangeOrContainer, size_t N=1)
Return a range covering RangeOrContainer with the first N elements excluded.
Definition STLExtras.h:316
@ Offset
Definition DWP.cpp:532
void stable_sort(R &&Range)
Definition STLExtras.h:2116
auto min_element(R &&Range)
Provide wrappers to std::min_element which take ranges instead of having to pass begin/end explicitly...
Definition STLExtras.h:2078
bool all_of(R &&range, UnaryPredicate P)
Provide wrappers to std::all_of which take ranges instead of having to pass begin/end explicitly.
Definition STLExtras.h:1739
auto size(R &&Range, std::enable_if_t< std::is_base_of< std::random_access_iterator_tag, typename std::iterator_traits< decltype(Range.begin())>::iterator_category >::value, void > *=nullptr)
Get the size of a range.
Definition STLExtras.h:1669
LLVM_ABI Intrinsic::ID getVectorIntrinsicIDForCall(const CallInst *CI, const TargetLibraryInfo *TLI)
Returns intrinsic ID for call.
detail::zippy< detail::zip_first, T, U, Args... > zip_equal(T &&t, U &&u, Args &&...args)
zip iterator that assumes that all iteratees have the same length.
Definition STLExtras.h:841
DenseMap< const Value *, const SCEV * > ValueToSCEVMapTy
auto enumerate(FirstRange &&First, RestRanges &&...Rest)
Given two or more input ranges, returns a new range whose values are tuples (A, B,...
Definition STLExtras.h:2554
decltype(auto) dyn_cast(const From &Val)
dyn_cast<X> - Return the argument parameter cast to the specified type.
Definition Casting.h:643
const Value * getLoadStorePointerOperand(const Value *V)
A helper function that returns the pointer operand of a load or store instruction.
constexpr from_range_t from_range
auto dyn_cast_if_present(const Y &Val)
dyn_cast_if_present<X> - Functionally identical to dyn_cast, except that a null (or none in the case ...
Definition Casting.h:732
iterator_range< T > make_range(T x, T y)
Convenience function for iterating over sub-ranges.
void append_range(Container &C, Range &&R)
Wrapper function to append range R to container C.
Definition STLExtras.h:2208
iterator_range< early_inc_iterator_impl< detail::IterOfRange< RangeT > > > make_early_inc_range(RangeT &&Range)
Make a range that does early increment to allow mutation of the underlying range without disrupting i...
Definition STLExtras.h:634
auto cast_or_null(const Y &Val)
Definition Casting.h:714
iterator_range< df_iterator< VPBlockShallowTraversalWrapper< VPBlockBase * > > > vp_depth_first_shallow(VPBlockBase *G)
Returns an iterator range to traverse the graph starting at G in depth-first order.
Definition VPlanCFG.h:253
iterator_range< df_iterator< VPBlockDeepTraversalWrapper< VPBlockBase * > > > vp_depth_first_deep(VPBlockBase *G)
Returns an iterator range to traverse the graph starting at G in depth-first order while traversing t...
Definition VPlanCFG.h:280
constexpr auto equal_to(T &&Arg)
Functor variant of std::equal_to that can be used as a UnaryPredicate in functional algorithms like a...
Definition STLExtras.h:2173
detail::concat_range< ValueT, RangeTs... > concat(RangeTs &&...Ranges)
Returns a concatenated range across two or more ranges.
Definition STLExtras.h:1152
uint64_t PowerOf2Ceil(uint64_t A)
Returns the power of two which is greater than or equal to the given value.
Definition MathExtras.h:385
auto dyn_cast_or_null(const Y &Val)
Definition Casting.h:753
bool any_of(R &&range, UnaryPredicate P)
Provide wrappers to std::any_of which take ranges instead of having to pass begin/end explicitly.
Definition STLExtras.h:1746
auto reverse(ContainerTy &&C)
Definition STLExtras.h:408
iterator_range< po_iterator< VPBlockDeepTraversalWrapper< VPBlockBase * > > > vp_post_order_deep(VPBlockBase *G)
Returns an iterator range to traverse the graph starting at G in post order while traversing through ...
Definition VPlanCFG.h:273
void sort(IteratorTy Start, IteratorTy End)
Definition STLExtras.h:1636
LLVM_ABI_FOR_TEST cl::opt< bool > EnableWideActiveLaneMask
UncountableExitStyle
Different methods of handling early exits.
Definition VPlan.h:83
@ ReadOnly
No side effects to worry about, so we can process any uncountable exits in the loop and branch either...
Definition VPlan.h:88
bool none_of(R &&Range, UnaryPredicate P)
Provide wrappers to std::none_of which take ranges instead of having to pass begin/end explicitly.
Definition STLExtras.h:1753
SmallVector< ValueTypeFromRangeType< R >, Size > to_vector(R &&Range)
Given a range of type R, iterate the entire range and return a SmallVector with elements of the vecto...
iterator_range< filter_iterator< detail::IterOfRange< RangeT >, PredicateT > > make_filter_range(RangeT &&Range, PredicateT Pred)
Convenience function that takes a range of elements and a predicate, and return a new filter_iterator...
Definition STLExtras.h:552
bool canConstantBeExtended(const APInt *C, Type *NarrowType, TTI::PartialReductionExtendKind ExtKind)
Check if a constant CI can be safely treated as having been extended from a narrower type with the gi...
Definition VPlan.cpp:1790
class LLVM_GSL_OWNER SmallVector
Forward declaration of SmallVector so that calculateSmallVectorDefaultInlinedElements can reference s...
iterator_range< po_iterator< VPBlockShallowTraversalWrapper< VPBlockBase * > > > vp_post_order_shallow(VPBlockBase *G)
Returns an iterator range to traverse the graph starting at G in post order.
Definition VPlanCFG.h:266
bool isa(const From &Val)
isa<X> - Return true if the parameter to the template is an instance of one of the template type argu...
Definition Casting.h:547
auto drop_end(T &&RangeOrContainer, size_t N=1)
Return a range covering RangeOrContainer with the last N elements excluded.
Definition STLExtras.h:323
TargetTransformInfo TTI
RecurKind
These are the kinds of recurrences that we support.
@ UMin
Unsigned integer min implemented in terms of select(cmp()).
@ FindIV
FindIV reduction with select(icmp(),x,y) where one of (x,y) is a loop induction variable (increasing ...
@ Or
Bitwise or logical OR of integers.
@ Mul
Product of integers.
@ FMul
Product of floats.
@ SMax
Signed integer max implemented in terms of select(cmp()).
@ SMin
Signed integer min implemented in terms of select(cmp()).
@ Sub
Subtraction of integers.
@ Add
Sum of integers.
@ AddChainWithSubs
A chain of adds and subs.
@ FAdd
Sum of floats.
@ UMax
Unsigned integer max implemented in terms of select(cmp()).
LLVM_ABI Value * getRecurrenceIdentity(RecurKind K, Type *Tp, FastMathFlags FMF)
Given information about a recurrence kind, return the identity for the @llvm.vector....
LLVM_ABI BasicBlock * SplitBlock(BasicBlock *Old, BasicBlock::iterator SplitPt, DominatorTree *DT, LoopInfo *LI=nullptr, MemorySSAUpdater *MSSAU=nullptr, const Twine &BBName="")
Split the specified block at the specified instruction.
FunctionAddr VTableAddr Next
Definition InstrProf.h:141
auto count(R &&Range, const E &Element)
Wrapper function around std::count to count the number of times an element Element occurs in the give...
Definition STLExtras.h:2012
DWARFExpression::Operation Op
auto max_element(R &&Range)
Provide wrappers to std::max_element which take ranges instead of having to pass begin/end explicitly...
Definition STLExtras.h:2088
ArrayRef(const T &OneElt) -> ArrayRef< T >
decltype(auto) cast(const From &Val)
cast<X> - Return the argument parameter cast to the specified type.
Definition Casting.h:559
auto find_if(R &&Range, UnaryPredicate P)
Provide wrappers to std::find_if which take ranges instead of having to pass begin/end explicitly.
Definition STLExtras.h:1772
bool is_contained(R &&Range, const E &Element)
Returns true if Element is found in Range.
Definition STLExtras.h:1947
Type * getLoadStoreType(const Value *I)
A helper function that returns the type of a load or store instruction.
bool all_equal(std::initializer_list< T > Values)
Returns true if all Values in the initializer lists are equal or the list is empty.
Definition STLExtras.h:2166
hash_code hash_combine(const Ts &...args)
Combine values into a single hash_code.
Definition Hashing.h:592
bool equal(L &&LRange, R &&RRange)
Wrapper function around std::equal to detect if pair-wise elements between two ranges are the same.
Definition STLExtras.h:2146
Type * toVectorTy(Type *Scalar, ElementCount EC)
A helper function for converting Scalar types to vector types.
@ Default
The result values are uniform if and only if all operands are uniform.
Definition Uniformity.h:20
constexpr detail::IsaCheckPredicate< Types... > IsaPred
Function object wrapper for the llvm::isa type check.
Definition Casting.h:866
hash_code hash_combine_range(InputIteratorT first, InputIteratorT last)
Compute a hash_code for a sequence of values.
Definition Hashing.h:466
void swap(llvm::BitVector &LHS, llvm::BitVector &RHS)
Implement std::swap in terms of BitVector swap.
Definition BitVector.h:872
#define N
RemoveMask_match(const Op0_t &In, Op1_t &Out)
bool match(OpTy *V) const
A collection of metadata nodes that might be associated with a memory access used by the alias-analys...
Definition Metadata.h:763
MDNode * Scope
The tag for alias scope specification (used with noalias).
Definition Metadata.h:786
MDNode * NoAlias
The tag specifying the noalias scope.
Definition Metadata.h:789
This struct is a compact representation of a valid (non-zero power of two) alignment.
Definition Alignment.h:39
An information struct used to provide DenseMap with the various necessary components for a given valu...
Incoming for lane mask phi as machine instruction, incoming register Reg and incoming block Block are...
This reduction is unordered with the partial result scaled down by some factor.
Definition VPlan.h:2682
A range of powers-of-2 vectorization factors with fixed start and adjustable end.
Struct to hold various analysis needed for cost computations.
TargetTransformInfo::TargetCostKind CostKind
VPTypeAnalysis Types
const TargetTransformInfo & TTI
A recipe for handling first-order recurrence phis.
Definition VPlan.h:2638
A VPValue representing a live-in from the input IR or a constant.
Definition VPlanValue.h:207
Type * getType() const
Returns the type of the underlying IR value.
Definition VPlan.cpp:141
A symbolic live-in VPValue, used for values like vector trip count, VF, and VFxUF.
Definition VPlanValue.h:247
bool isMaterialized() const
Returns true if this symbolic value has been materialized.
Definition VPlanValue.h:255
A recipe for widening load operations with vector-predication intrinsics, using the address to load f...
Definition VPlan.h:3661
A recipe for widening load operations, using the address to load from and an optional mask.
Definition VPlan.h:3619
A recipe for widening store operations with vector-predication intrinsics, using the value to store,...
Definition VPlan.h:3746
A recipe for widening store operations, using the stored value, the address to store to and an option...
Definition VPlan.h:3702
static void handleUncountableEarlyExits(VPlan &Plan, VPBasicBlock *HeaderVPBB, VPBasicBlock *LatchVPBB, VPBasicBlock *MiddleVPBB, UncountableExitStyle Style)
Update Plan to account for uncountable early exits by introducing appropriate branching logic in the ...
static LLVM_ABI_FOR_TEST bool tryToConvertVPInstructionsToVPRecipes(VPlan &Plan, const TargetLibraryInfo &TLI)
Replaces the VPInstructions in Plan with corresponding widen recipes.
static void materializeBroadcasts(VPlan &Plan)
Add explicit broadcasts for live-ins and VPValues defined in Plan's entry block if they are used as v...
static void materializePacksAndUnpacks(VPlan &Plan)
Add explicit Build[Struct]Vector recipes to Pack multiple scalar values into vectors and Unpack recip...
static bool simplifyKnownEVL(VPlan &Plan, ElementCount VF, PredicatedScalarEvolution &PSE)
Try to simplify VPInstruction::ExplicitVectorLength recipes when the AVL is known to be <= VF,...
static void removeBranchOnConst(VPlan &Plan, bool OnlyLatches=false)
Remove BranchOnCond recipes with true or false conditions together with removing dead edges to their ...
static void materializeFactors(VPlan &Plan, VPBasicBlock *VectorPH, ElementCount VF)
Materialize UF, VF and VFxUF to be computed explicitly using VPInstructions.
static void materializeBackedgeTakenCount(VPlan &Plan, VPBasicBlock *VectorPH)
Materialize the backedge-taken count to be computed explicitly using VPInstructions.
static void hoistInvariantLoads(VPlan &Plan)
Hoist single-scalar loads with invariant addresses out of the vector loop to the preheader,...
static void addActiveLaneMask(VPlan &Plan, bool UseActiveLaneMaskForControlFlow)
Replace (ICMP_ULE, wide canonical IV, backedge-taken-count) checks with an (active-lane-mask recipe,...
static void dropPoisonGeneratingRecipes(VPlan &Plan, const std::function< bool(BasicBlock *)> &BlockNeedsPredication)
Drop poison flags from recipes that may generate a poison value that is used after vectorization,...
static void createAndOptimizeReplicateRegions(VPlan &Plan)
Wrap predicated VPReplicateRecipes with a mask operand in an if-then region block and remove the mask...
static void convertToVariableLengthStep(VPlan &Plan)
Transform loops with variable-length stepping after region dissolution.
static void createInterleaveGroups(VPlan &Plan, const SmallPtrSetImpl< const InterleaveGroup< Instruction > * > &InterleaveGroups, VPRecipeBuilder &RecipeBuilder, const bool &ScalarEpilogueAllowed)
static void addBranchWeightToMiddleTerminator(VPlan &Plan, ElementCount VF, std::optional< unsigned > VScaleForTuning)
Add branch weight metadata, if the Plan's middle block is terminated by a BranchOnCond recipe.
static std::unique_ptr< VPlan > narrowInterleaveGroups(VPlan &Plan, const TargetTransformInfo &TTI)
Try to find a single VF among Plan's VFs for which all interleave groups (with known minimum VF eleme...
static DenseMap< const SCEV *, Value * > expandSCEVs(VPlan &Plan, ScalarEvolution &SE)
Expand VPExpandSCEVRecipes in Plan's entry block.
static void convertToConcreteRecipes(VPlan &Plan)
Lower abstract recipes to concrete ones, that can be codegen'd.
static void expandBranchOnTwoConds(VPlan &Plan)
Expand BranchOnTwoConds instructions into explicit CFG with BranchOnCond instructions.
static void hoistPredicatedLoads(VPlan &Plan, PredicatedScalarEvolution &PSE, const Loop *L)
Hoist predicated loads from the same address to the loop entry block, if they are guaranteed to execu...
static bool mergeBlocksIntoPredecessors(VPlan &Plan)
Remove redundant VPBasicBlocks by merging them into their single predecessor if the latter has a sing...
static void optimizeFindIVReductions(VPlan &Plan, PredicatedScalarEvolution &PSE, Loop &L)
Optimize FindLast reductions selecting IVs (or expressions of IVs) by converting them to FindIV reduc...
static void convertToAbstractRecipes(VPlan &Plan, VPCostContext &Ctx, VFRange &Range)
This function converts initial recipes to the abstract recipes and clamps Range based on cost model f...
static void materializeConstantVectorTripCount(VPlan &Plan, ElementCount BestVF, unsigned BestUF, PredicatedScalarEvolution &PSE)
static void addExitUsersForFirstOrderRecurrences(VPlan &Plan, VFRange &Range)
Handle users in the exit block for first order reductions in the original exit block.
static void addExplicitVectorLength(VPlan &Plan, const std::optional< unsigned > &MaxEVLSafeElements)
Add a VPCurrentIterationPHIRecipe and related recipes to Plan and replaces all uses except the canoni...
static void optimizeEVLMasks(VPlan &Plan)
Optimize recipes which use an EVL-based header mask to VP intrinsics, for example:
static void replaceSymbolicStrides(VPlan &Plan, PredicatedScalarEvolution &PSE, const DenseMap< Value *, const SCEV * > &StridesMap)
Replace symbolic strides from StridesMap in Plan with constants when possible.
static void removeDeadRecipes(VPlan &Plan)
Remove dead recipes from Plan.
static void simplifyRecipes(VPlan &Plan)
Perform instcombine-like simplifications on recipes in Plan.
static void sinkPredicatedStores(VPlan &Plan, PredicatedScalarEvolution &PSE, const Loop *L)
Sink predicated stores to the same address with complementary predicates (P and NOT P) to an uncondit...
static void clearReductionWrapFlags(VPlan &Plan)
Clear NSW/NUW flags from reduction instructions if necessary.
static void optimizeInductionLiveOutUsers(VPlan &Plan, PredicatedScalarEvolution &PSE, bool FoldTail)
If there's a single exit block, optimize its phi recipes that use exiting IV values by feeding them p...
static void createPartialReductions(VPlan &Plan, VPCostContext &CostCtx, VFRange &Range)
Detect and create partial reduction recipes for scaled reductions in Plan.
static void cse(VPlan &Plan)
Perform common-subexpression-elimination on Plan.
static void materializeVectorTripCount(VPlan &Plan, VPBasicBlock *VectorPHVPBB, bool TailByMasking, bool RequiresScalarEpilogue, VPValue *Step)
Materialize vector trip count computations to a set of VPInstructions.
static LLVM_ABI_FOR_TEST void optimize(VPlan &Plan)
Apply VPlan-to-VPlan optimizations to Plan, including induction recipe optimizations,...
static void dissolveLoopRegions(VPlan &Plan)
Replace loop regions with explicit CFG.
static void truncateToMinimalBitwidths(VPlan &Plan, const MapVector< Instruction *, uint64_t > &MinBWs)
Insert truncates and extends for any truncated recipe.
static bool adjustFixedOrderRecurrences(VPlan &Plan, VPBuilder &Builder)
Try to have all users of fixed-order recurrences appear after the recipe defining their previous valu...
static void optimizeForVFAndUF(VPlan &Plan, ElementCount BestVF, unsigned BestUF, PredicatedScalarEvolution &PSE)
Optimize Plan based on BestVF and BestUF.
static void convertEVLExitCond(VPlan &Plan)
Replaces the exit condition from (branch-on-cond eq CanonicalIVInc, VectorTripCount) to (branch-on-co...