1//===-- VPlanTransforms.cpp - Utility VPlan to VPlan transforms -----------===//
2//
3// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4// See https://llvm.org/LICENSE.txt for license information.
5// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6//
7//===----------------------------------------------------------------------===//
8///
9/// \file
10/// This file implements a set of utility VPlan to VPlan transformations.
11///
12//===----------------------------------------------------------------------===//
13
14#include "VPlanTransforms.h"
15#include "VPRecipeBuilder.h"
16#include "VPlan.h"
17#include "VPlanAnalysis.h"
18#include "VPlanCFG.h"
19#include "VPlanDominatorTree.h"
20#include "VPlanHelpers.h"
21#include "VPlanPatternMatch.h"
22#include "VPlanUtils.h"
23#include "VPlanVerifier.h"
24#include "llvm/ADT/APInt.h"
26#include "llvm/ADT/STLExtras.h"
28#include "llvm/ADT/SetVector.h"
30#include "llvm/ADT/TypeSwitch.h"
38#include "llvm/IR/Intrinsics.h"
39#include "llvm/IR/MDBuilder.h"
40#include "llvm/IR/Metadata.h"
44
45using namespace llvm;
46using namespace VPlanPatternMatch;
47using namespace SCEVPatternMatch;
48
49bool VPlanTransforms::tryToConvertVPInstructionsToVPRecipes(
50    VPlan &Plan,
51    function_ref<const InductionDescriptor *(PHINode *)>
52        GetIntOrFpInductionDescriptor,
53    const TargetLibraryInfo &TLI) {
54
55  ReversePostOrderTraversal<VPBlockDeepTraversalWrapper<VPBlockBase *>> RPOT(
56      Plan.getVectorLoopRegion());
57  for (VPBasicBlock *VPBB : VPBlockUtils::blocksOnly<VPBasicBlock>(RPOT)) {
58 // Skip blocks outside region
59 if (!VPBB->getParent())
60 break;
61 VPRecipeBase *Term = VPBB->getTerminator();
62 auto EndIter = Term ? Term->getIterator() : VPBB->end();
63 // Introduce each ingredient into VPlan.
64 for (VPRecipeBase &Ingredient :
65 make_early_inc_range(make_range(VPBB->begin(), EndIter))) {
66
67 VPValue *VPV = Ingredient.getVPSingleValue();
68 if (!VPV->getUnderlyingValue())
69 continue;
70
71      Instruction *Inst = cast<Instruction>(VPV->getUnderlyingValue());
72
73 VPRecipeBase *NewRecipe = nullptr;
74 if (auto *PhiR = dyn_cast<VPPhi>(&Ingredient)) {
75 auto *Phi = cast<PHINode>(PhiR->getUnderlyingValue());
76 const auto *II = GetIntOrFpInductionDescriptor(Phi);
77 if (!II) {
78 NewRecipe = new VPWidenPHIRecipe(Phi, nullptr, PhiR->getDebugLoc());
79 for (VPValue *Op : PhiR->operands())
80 NewRecipe->addOperand(Op);
81 } else {
82 VPValue *Start = Plan.getOrAddLiveIn(II->getStartValue());
83        VPValue *Step =
84            vputils::getOrCreateVPValueForSCEVExpr(Plan, II->getStep());
85 // It is always safe to copy over the NoWrap and FastMath flags. In
86 // particular, when folding tail by masking, the masked-off lanes are
87 // never used, so it is safe.
88        VPIRFlags Flags = vputils::getFlagsFromIndDesc(*II);
89        NewRecipe = new VPWidenIntOrFpInductionRecipe(
90 Phi, Start, Step, &Plan.getVF(), *II, Flags,
91 Ingredient.getDebugLoc());
92 }
93 } else {
94 auto *VPI = cast<VPInstruction>(&Ingredient);
95 assert(!isa<PHINode>(Inst) && "phis should be handled above");
96 // Create VPWidenMemoryRecipe for loads and stores.
97 if (LoadInst *Load = dyn_cast<LoadInst>(Inst)) {
98 NewRecipe = new VPWidenLoadRecipe(
99 *Load, Ingredient.getOperand(0), nullptr /*Mask*/,
100 false /*Consecutive*/, false /*Reverse*/, *VPI,
101 Ingredient.getDebugLoc());
102 } else if (StoreInst *Store = dyn_cast<StoreInst>(Inst)) {
103 NewRecipe = new VPWidenStoreRecipe(
104 *Store, Ingredient.getOperand(1), Ingredient.getOperand(0),
105 nullptr /*Mask*/, false /*Consecutive*/, false /*Reverse*/, *VPI,
106 Ingredient.getDebugLoc());
107      } else if (auto *GEP = dyn_cast<GetElementPtrInst>(Inst)) {
108        NewRecipe = new VPWidenGEPRecipe(GEP, Ingredient.operands(), *VPI,
109 Ingredient.getDebugLoc());
110 } else if (CallInst *CI = dyn_cast<CallInst>(Inst)) {
111 Intrinsic::ID VectorID = getVectorIntrinsicIDForCall(CI, &TLI);
112 if (VectorID == Intrinsic::not_intrinsic)
113 return false;
114 NewRecipe = new VPWidenIntrinsicRecipe(
115 *CI, getVectorIntrinsicIDForCall(CI, &TLI),
116 drop_end(Ingredient.operands()), CI->getType(), VPIRFlags(*CI),
117 *VPI, CI->getDebugLoc());
118 } else if (SelectInst *SI = dyn_cast<SelectInst>(Inst)) {
119 NewRecipe = new VPWidenSelectRecipe(SI, Ingredient.operands(), *VPI,
120 *VPI, Ingredient.getDebugLoc());
121 } else if (auto *CI = dyn_cast<CastInst>(Inst)) {
122 NewRecipe = new VPWidenCastRecipe(
123 CI->getOpcode(), Ingredient.getOperand(0), CI->getType(), CI,
124 VPIRFlags(*CI), VPIRMetadata(*CI));
125 } else {
126 NewRecipe = new VPWidenRecipe(*Inst, Ingredient.operands(), *VPI,
127 *VPI, Ingredient.getDebugLoc());
128 }
129 }
130
131 NewRecipe->insertBefore(&Ingredient);
132 if (NewRecipe->getNumDefinedValues() == 1)
133 VPV->replaceAllUsesWith(NewRecipe->getVPSingleValue());
134 else
135 assert(NewRecipe->getNumDefinedValues() == 0 &&
136                 "Only recipes with zero or one defined values expected");
137 Ingredient.eraseFromParent();
138 }
139 }
140 return true;
141}
142
143/// Helper for extra no-alias checks via known-safe recipe and SCEV.
144class SinkStoreInfo {
145  const SmallPtrSetImpl<VPRecipeBase *> &ExcludeRecipes;
146 VPReplicateRecipe &GroupLeader;
147  PredicatedScalarEvolution &PSE;
148  const Loop &L;
149 VPTypeAnalysis &TypeInfo;
150
151 // Return true if \p A and \p B are known to not alias for all VFs in the
152 // plan, checked via the distance between the accesses
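  // For example (illustrative): for two scalarized i32 stores whose address
  // SCEVs differ by a constant 64 bytes, the check succeeds as long as
  // 64 >= MaxVF * max(store size); e.g. MaxVF = 4 with 4-byte stores only
  // needs a distance of 16 bytes.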
153 bool isNoAliasViaDistance(VPReplicateRecipe *A, VPReplicateRecipe *B) const {
154 if (A->getOpcode() != Instruction::Store ||
155 B->getOpcode() != Instruction::Store)
156 return false;
157
158 VPValue *AddrA = A->getOperand(1);
159 const SCEV *SCEVA = vputils::getSCEVExprForVPValue(AddrA, PSE, &L);
160 VPValue *AddrB = B->getOperand(1);
161 const SCEV *SCEVB = vputils::getSCEVExprForVPValue(AddrB, PSE, &L);
162    if (isa<SCEVCouldNotCompute>(SCEVA) || isa<SCEVCouldNotCompute>(SCEVB))
163      return false;
164
165 const APInt *Distance;
166 ScalarEvolution &SE = *PSE.getSE();
167 if (!match(SE.getMinusSCEV(SCEVA, SCEVB), m_scev_APInt(Distance)))
168 return false;
169
170 const DataLayout &DL = SE.getDataLayout();
171 Type *TyA = TypeInfo.inferScalarType(A->getOperand(0));
172 uint64_t SizeA = DL.getTypeStoreSize(TyA);
173 Type *TyB = TypeInfo.inferScalarType(B->getOperand(0));
174 uint64_t SizeB = DL.getTypeStoreSize(TyB);
175
176 // Use the maximum store size to ensure no overlap from either direction.
177 // Currently only handles fixed sizes, as it is only used for
178 // replicating VPReplicateRecipes.
179 uint64_t MaxStoreSize = std::max(SizeA, SizeB);
180
181 auto VFs = B->getParent()->getPlan()->vectorFactors();
182    ElementCount MaxVF = *max_element(VFs, ElementCount::isKnownLT);
183    return Distance->abs().uge(
184 MaxVF.multiplyCoefficientBy(MaxStoreSize).getFixedValue());
185 }
186
187public:
188  SinkStoreInfo(const SmallPtrSetImpl<VPRecipeBase *> &ExcludeRecipes,
189                VPReplicateRecipe &GroupLeader, PredicatedScalarEvolution &PSE,
190                const Loop &L, VPTypeAnalysis &TypeInfo)
191 : ExcludeRecipes(ExcludeRecipes), GroupLeader(GroupLeader), PSE(PSE),
192 L(L), TypeInfo(TypeInfo) {}
193
194 /// Return true if \p R should be skipped during alias checking, either
195 /// because it's in the exclude set or because no-alias can be proven via
196 /// SCEV.
197 bool shouldSkip(VPRecipeBase &R) const {
198 auto *Store = dyn_cast<VPReplicateRecipe>(&R);
199 return ExcludeRecipes.contains(&R) ||
200 (Store && isNoAliasViaDistance(Store, &GroupLeader));
201 }
202};
203
204/// Check if a memory operation doesn't alias with memory operations in blocks
205/// between \p FirstBB and \p LastBB using scoped noalias metadata. If
206/// \p SinkInfo is std::nullopt, only recipes that may write to memory are
207/// checked (for load hoisting). Otherwise recipes that both read and write
208/// memory are checked, and SCEV is used to prove no-alias between the group
209/// leader and other replicate recipes (for store sinking).
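/// For example (illustrative): hoisting a load is considered safe if every
/// store between \p FirstBB and \p LastBB lists all of the load's
/// !alias.scope scopes in its !noalias metadata; any recipe without a known
/// memory location makes the check fail conservatively.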
210static bool
211canHoistOrSinkWithNoAliasCheck(const MemoryLocation &MemLoc,
212                               VPBasicBlock *FirstBB, VPBasicBlock *LastBB,
213 std::optional<SinkStoreInfo> SinkInfo = {}) {
214 bool CheckReads = SinkInfo.has_value();
215 if (!MemLoc.AATags.Scope)
216 return false;
217
218 const AAMDNodes &MemAA = MemLoc.AATags;
219
220 for (VPBlockBase *Block = FirstBB; Block;
221 Block = Block->getSingleSuccessor()) {
222 assert(Block->getNumSuccessors() <= 1 &&
223 "Expected at most one successor in block chain");
224 auto *VPBB = cast<VPBasicBlock>(Block);
225 for (VPRecipeBase &R : *VPBB) {
226 if (SinkInfo && SinkInfo->shouldSkip(R))
227 continue;
228
229 // Skip recipes that don't need checking.
230 if (!R.mayWriteToMemory() && !(CheckReads && R.mayReadFromMemory()))
231 continue;
232
233      auto Loc = vputils::getMemoryLocation(R);
234      if (!Loc)
235 // Conservatively assume aliasing for memory operations without
236 // location.
237 return false;
238
239 // For reads, check if they don't alias in the reverse direction and
240 // skip if so.
241 if (CheckReads && R.mayReadFromMemory() &&
242          !ScopedNoAliasAAResult::mayAliasInScopes(Loc->AATags.Scope,
243                                                   MemAA.NoAlias))
244 continue;
245
246 // Check if the memory operations may alias in the forward direction.
247      if (ScopedNoAliasAAResult::mayAliasInScopes(MemAA.Scope,
248                                                  Loc->AATags.NoAlias))
249 return false;
250 }
251
252 if (Block == LastBB)
253 break;
254 }
255 return true;
256}
257
258/// Return true if we do not know how to (mechanically) hoist or sink \p R out
259/// of a loop region.
260static bool cannotHoistOrSinkRecipe(const VPRecipeBase &R) {
261  // Assumes don't alias anything or throw; as long as they're guaranteed to
262 // execute, they're safe to hoist.
263  if (match(&R, m_Intrinsic<Intrinsic::assume>()))
264    return false;
265
266 // TODO: Relax checks in the future, e.g. we could also hoist reads, if their
267 // memory location is not modified in the vector loop.
268 if (R.mayHaveSideEffects() || R.mayReadFromMemory() || R.isPhi())
269 return true;
270
271 // Allocas cannot be hoisted.
272 auto *RepR = dyn_cast<VPReplicateRecipe>(&R);
273 return RepR && RepR->getOpcode() == Instruction::Alloca;
274}
275
276static bool sinkScalarOperands(VPlan &Plan) {
277 auto Iter = vp_depth_first_deep(Plan.getEntry());
278 bool ScalarVFOnly = Plan.hasScalarVFOnly();
279 bool Changed = false;
280
281  SetVector<std::pair<VPBasicBlock *, VPSingleDefRecipe *>> WorkList;
282  auto InsertIfValidSinkCandidate = [ScalarVFOnly, &WorkList](
283 VPBasicBlock *SinkTo, VPValue *Op) {
284 auto *Candidate =
285 dyn_cast_or_null<VPSingleDefRecipe>(Op->getDefiningRecipe());
286 if (!Candidate)
287 return;
288
289 // We only know how to sink VPReplicateRecipes and VPScalarIVStepsRecipes
290 // for now.
291    if (!isa<VPReplicateRecipe, VPScalarIVStepsRecipe>(Candidate))
292      return;
293
294 if (Candidate->getParent() == SinkTo || cannotHoistOrSinkRecipe(*Candidate))
295 return;
296
297 if (auto *RepR = dyn_cast<VPReplicateRecipe>(Candidate))
298 if (!ScalarVFOnly && RepR->isSingleScalar())
299 return;
300
301 WorkList.insert({SinkTo, Candidate});
302 };
303
304 // First, collect the operands of all recipes in replicate blocks as seeds for
305 // sinking.
306  for (VPRegionBlock *VPR : VPBlockUtils::blocksOnly<VPRegionBlock>(Iter)) {
307    VPBasicBlock *EntryVPBB = VPR->getEntryBasicBlock();
308 if (!VPR->isReplicator() || EntryVPBB->getSuccessors().size() != 2)
309 continue;
310 VPBasicBlock *VPBB = cast<VPBasicBlock>(EntryVPBB->getSuccessors().front());
311 if (VPBB->getSingleSuccessor() != VPR->getExitingBasicBlock())
312 continue;
313 for (auto &Recipe : *VPBB)
314 for (VPValue *Op : Recipe.operands())
315 InsertIfValidSinkCandidate(VPBB, Op);
316 }
317
318 // Try to sink each replicate or scalar IV steps recipe in the worklist.
319 for (unsigned I = 0; I != WorkList.size(); ++I) {
320 VPBasicBlock *SinkTo;
321 VPSingleDefRecipe *SinkCandidate;
322 std::tie(SinkTo, SinkCandidate) = WorkList[I];
323
324 // All recipe users of SinkCandidate must be in the same block SinkTo or all
325 // users outside of SinkTo must only use the first lane of SinkCandidate. In
326 // the latter case, we need to duplicate SinkCandidate.
327 auto UsersOutsideSinkTo =
328 make_filter_range(SinkCandidate->users(), [SinkTo](VPUser *U) {
329 return cast<VPRecipeBase>(U)->getParent() != SinkTo;
330 });
331 if (any_of(UsersOutsideSinkTo, [SinkCandidate](VPUser *U) {
332 return !U->usesFirstLaneOnly(SinkCandidate);
333 }))
334 continue;
335 bool NeedsDuplicating = !UsersOutsideSinkTo.empty();
336
337 if (NeedsDuplicating) {
338 if (ScalarVFOnly)
339 continue;
340 VPSingleDefRecipe *Clone;
341 if (auto *SinkCandidateRepR =
342 dyn_cast<VPReplicateRecipe>(SinkCandidate)) {
343 // TODO: Handle converting to uniform recipes as separate transform,
344 // then cloning should be sufficient here.
345 Instruction *I = SinkCandidate->getUnderlyingInstr();
346 Clone = new VPReplicateRecipe(I, SinkCandidate->operands(), true,
347 nullptr /*Mask*/, *SinkCandidateRepR,
348 *SinkCandidateRepR);
349 // TODO: add ".cloned" suffix to name of Clone's VPValue.
350 } else {
351 Clone = SinkCandidate->clone();
352 }
353
354 Clone->insertBefore(SinkCandidate);
355 SinkCandidate->replaceUsesWithIf(Clone, [SinkTo](VPUser &U, unsigned) {
356 return cast<VPRecipeBase>(&U)->getParent() != SinkTo;
357 });
358 }
359 SinkCandidate->moveBefore(*SinkTo, SinkTo->getFirstNonPhi());
360 for (VPValue *Op : SinkCandidate->operands())
361 InsertIfValidSinkCandidate(SinkTo, Op);
362 Changed = true;
363 }
364 return Changed;
365}
366
367/// If \p R is a region with a VPBranchOnMaskRecipe in the entry block, return
368/// the mask.
369static VPValue *getPredicatedMask(VPRegionBlock *R) {
370  auto *EntryBB = dyn_cast<VPBasicBlock>(R->getEntry());
371 if (!EntryBB || EntryBB->size() != 1 ||
372 !isa<VPBranchOnMaskRecipe>(EntryBB->begin()))
373 return nullptr;
374
375 return cast<VPBranchOnMaskRecipe>(&*EntryBB->begin())->getOperand(0);
376}
377
378/// If \p R is a triangle region, return the 'then' block of the triangle.
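/// That is, a region shaped like (illustrative)
///   Entry -> Then -> Exit  and  Entry -> Exit,
/// where Entry has exactly two successors and Then's only successor is Exit.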
379static VPBasicBlock *getPredicatedThenBlock(VPRegionBlock *R) {
380  auto *EntryBB = cast<VPBasicBlock>(R->getEntry());
381 if (EntryBB->getNumSuccessors() != 2)
382 return nullptr;
383
384 auto *Succ0 = dyn_cast<VPBasicBlock>(EntryBB->getSuccessors()[0]);
385 auto *Succ1 = dyn_cast<VPBasicBlock>(EntryBB->getSuccessors()[1]);
386 if (!Succ0 || !Succ1)
387 return nullptr;
388
389 if (Succ0->getNumSuccessors() + Succ1->getNumSuccessors() != 1)
390 return nullptr;
391 if (Succ0->getSingleSuccessor() == Succ1)
392 return Succ0;
393 if (Succ1->getSingleSuccessor() == Succ0)
394 return Succ1;
395 return nullptr;
396}
397
398// Merge replicate regions in their successor region, if a replicate region
399// is connected to a successor replicate region with the same predicate by a
400// single, empty VPBasicBlock.
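// For example (illustrative): two if-then replicate regions both guarded by
// mask %m and separated only by an empty block can be combined, so the
// recipes of the first region execute under the second region's single
// branch-on-mask %m.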
401static bool mergeReplicateRegionsIntoSuccessors(VPlan &Plan) {
402  SmallPtrSet<VPRegionBlock *, 4> TransformedRegions;
403
404 // Collect replicate regions followed by an empty block, followed by another
405 // replicate region with matching masks to process front. This is to avoid
406 // iterator invalidation issues while merging regions.
407  SmallVector<VPRegionBlock *, 8> WorkList;
408  for (VPRegionBlock *Region1 : VPBlockUtils::blocksOnly<VPRegionBlock>(
409           vp_depth_first_deep(Plan.getEntry()))) {
410 if (!Region1->isReplicator())
411 continue;
412 auto *MiddleBasicBlock =
413 dyn_cast_or_null<VPBasicBlock>(Region1->getSingleSuccessor());
414 if (!MiddleBasicBlock || !MiddleBasicBlock->empty())
415 continue;
416
417 auto *Region2 =
418 dyn_cast_or_null<VPRegionBlock>(MiddleBasicBlock->getSingleSuccessor());
419 if (!Region2 || !Region2->isReplicator())
420 continue;
421
422 VPValue *Mask1 = getPredicatedMask(Region1);
423 VPValue *Mask2 = getPredicatedMask(Region2);
424 if (!Mask1 || Mask1 != Mask2)
425 continue;
426
427    assert(Mask1 && Mask2 && "both regions must have conditions");
428 WorkList.push_back(Region1);
429 }
430
431 // Move recipes from Region1 to its successor region, if both are triangles.
432 for (VPRegionBlock *Region1 : WorkList) {
433 if (TransformedRegions.contains(Region1))
434 continue;
435 auto *MiddleBasicBlock = cast<VPBasicBlock>(Region1->getSingleSuccessor());
436 auto *Region2 = cast<VPRegionBlock>(MiddleBasicBlock->getSingleSuccessor());
437
438 VPBasicBlock *Then1 = getPredicatedThenBlock(Region1);
439 VPBasicBlock *Then2 = getPredicatedThenBlock(Region2);
440 if (!Then1 || !Then2)
441 continue;
442
443 // Note: No fusion-preventing memory dependencies are expected in either
444 // region. Such dependencies should be rejected during earlier dependence
445 // checks, which guarantee accesses can be re-ordered for vectorization.
446 //
447 // Move recipes to the successor region.
448 for (VPRecipeBase &ToMove : make_early_inc_range(reverse(*Then1)))
449 ToMove.moveBefore(*Then2, Then2->getFirstNonPhi());
450
451 auto *Merge1 = cast<VPBasicBlock>(Then1->getSingleSuccessor());
452 auto *Merge2 = cast<VPBasicBlock>(Then2->getSingleSuccessor());
453
454 // Move VPPredInstPHIRecipes from the merge block to the successor region's
455 // merge block. Update all users inside the successor region to use the
456 // original values.
457 for (VPRecipeBase &Phi1ToMove : make_early_inc_range(reverse(*Merge1))) {
458 VPValue *PredInst1 =
459 cast<VPPredInstPHIRecipe>(&Phi1ToMove)->getOperand(0);
460 VPValue *Phi1ToMoveV = Phi1ToMove.getVPSingleValue();
461 Phi1ToMoveV->replaceUsesWithIf(PredInst1, [Then2](VPUser &U, unsigned) {
462 return cast<VPRecipeBase>(&U)->getParent() == Then2;
463 });
464
465 // Remove phi recipes that are unused after merging the regions.
466 if (Phi1ToMove.getVPSingleValue()->getNumUsers() == 0) {
467 Phi1ToMove.eraseFromParent();
468 continue;
469 }
470 Phi1ToMove.moveBefore(*Merge2, Merge2->begin());
471 }
472
473 // Remove the dead recipes in Region1's entry block.
474 for (VPRecipeBase &R :
475 make_early_inc_range(reverse(*Region1->getEntryBasicBlock())))
476 R.eraseFromParent();
477
478 // Finally, remove the first region.
479 for (VPBlockBase *Pred : make_early_inc_range(Region1->getPredecessors())) {
480 VPBlockUtils::disconnectBlocks(Pred, Region1);
481 VPBlockUtils::connectBlocks(Pred, MiddleBasicBlock);
482 }
483 VPBlockUtils::disconnectBlocks(Region1, MiddleBasicBlock);
484 TransformedRegions.insert(Region1);
485 }
486
487 return !TransformedRegions.empty();
488}
489
490static VPRegionBlock *createReplicateRegion(VPReplicateRecipe *PredRecipe,
491                                            VPlan &Plan) {
492 Instruction *Instr = PredRecipe->getUnderlyingInstr();
493 // Build the triangular if-then region.
494 std::string RegionName = (Twine("pred.") + Instr->getOpcodeName()).str();
495 assert(Instr->getParent() && "Predicated instruction not in any basic block");
496 auto *BlockInMask = PredRecipe->getMask();
497 auto *MaskDef = BlockInMask->getDefiningRecipe();
498 auto *BOMRecipe = new VPBranchOnMaskRecipe(
499 BlockInMask, MaskDef ? MaskDef->getDebugLoc() : DebugLoc::getUnknown());
500 auto *Entry =
501 Plan.createVPBasicBlock(Twine(RegionName) + ".entry", BOMRecipe);
502
503 // Replace predicated replicate recipe with a replicate recipe without a
504 // mask but in the replicate region.
505 auto *RecipeWithoutMask = new VPReplicateRecipe(
506 PredRecipe->getUnderlyingInstr(), drop_end(PredRecipe->operands()),
507 PredRecipe->isSingleScalar(), nullptr /*Mask*/, *PredRecipe, *PredRecipe,
508 PredRecipe->getDebugLoc());
509 auto *Pred =
510 Plan.createVPBasicBlock(Twine(RegionName) + ".if", RecipeWithoutMask);
511
512 VPPredInstPHIRecipe *PHIRecipe = nullptr;
513 if (PredRecipe->getNumUsers() != 0) {
514 PHIRecipe = new VPPredInstPHIRecipe(RecipeWithoutMask,
515 RecipeWithoutMask->getDebugLoc());
516 PredRecipe->replaceAllUsesWith(PHIRecipe);
517 PHIRecipe->setOperand(0, RecipeWithoutMask);
518 }
519 PredRecipe->eraseFromParent();
520 auto *Exiting =
521 Plan.createVPBasicBlock(Twine(RegionName) + ".continue", PHIRecipe);
522  VPRegionBlock *Region =
523      Plan.createReplicateRegion(Entry, Exiting, RegionName);
524
525 // Note: first set Entry as region entry and then connect successors starting
526 // from it in order, to propagate the "parent" of each VPBasicBlock.
527 VPBlockUtils::insertTwoBlocksAfter(Pred, Exiting, Entry);
528 VPBlockUtils::connectBlocks(Pred, Exiting);
529
530 return Region;
531}
532
533static void addReplicateRegions(VPlan &Plan) {
536 vp_depth_first_deep(Plan.getEntry()))) {
537 for (VPRecipeBase &R : *VPBB)
538 if (auto *RepR = dyn_cast<VPReplicateRecipe>(&R)) {
539 if (RepR->isPredicated())
540 WorkList.push_back(RepR);
541 }
542 }
543
544 unsigned BBNum = 0;
545 for (VPReplicateRecipe *RepR : WorkList) {
546 VPBasicBlock *CurrentBlock = RepR->getParent();
547 VPBasicBlock *SplitBlock = CurrentBlock->splitAt(RepR->getIterator());
548
549 BasicBlock *OrigBB = RepR->getUnderlyingInstr()->getParent();
550 SplitBlock->setName(
551 OrigBB->hasName() ? OrigBB->getName() + "." + Twine(BBNum++) : "");
552 // Record predicated instructions for above packing optimizations.
553    VPRegionBlock *Region = createReplicateRegion(RepR, Plan);
554    Region->setParent(CurrentBlock->getParent());
555    VPBlockUtils::insertOnEdge(CurrentBlock, SplitBlock, Region);
556
557 VPRegionBlock *ParentRegion = Region->getParent();
558 if (ParentRegion && ParentRegion->getExiting() == CurrentBlock)
559 ParentRegion->setExiting(SplitBlock);
560 }
561}
562
563/// Remove redundant VPBasicBlocks by merging them into their predecessor if
564/// the predecessor has a single successor.
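/// For example (illustrative): in a chain A -> B where B is A's only
/// successor and A is B's only predecessor, B's recipes are moved into A and
/// B becomes unreachable.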
565static bool mergeBlocksIntoPredecessors(VPlan &Plan) {
566  SmallVector<VPBasicBlock *> WorkList;
567  for (VPBasicBlock *VPBB : VPBlockUtils::blocksOnly<VPBasicBlock>(
568           vp_depth_first_deep(Plan.getEntry()))) {
569 // Don't fold the blocks in the skeleton of the Plan into their single
570 // predecessors for now.
571 // TODO: Remove restriction once more of the skeleton is modeled in VPlan.
572 if (!VPBB->getParent())
573 continue;
574 auto *PredVPBB =
575 dyn_cast_or_null<VPBasicBlock>(VPBB->getSinglePredecessor());
576 if (!PredVPBB || PredVPBB->getNumSuccessors() != 1 ||
577 isa<VPIRBasicBlock>(PredVPBB))
578 continue;
579 WorkList.push_back(VPBB);
580 }
581
582 for (VPBasicBlock *VPBB : WorkList) {
583 VPBasicBlock *PredVPBB = cast<VPBasicBlock>(VPBB->getSinglePredecessor());
584 for (VPRecipeBase &R : make_early_inc_range(*VPBB))
585 R.moveBefore(*PredVPBB, PredVPBB->end());
586 VPBlockUtils::disconnectBlocks(PredVPBB, VPBB);
587 auto *ParentRegion = VPBB->getParent();
588 if (ParentRegion && ParentRegion->getExiting() == VPBB)
589 ParentRegion->setExiting(PredVPBB);
590 for (auto *Succ : to_vector(VPBB->successors())) {
591      VPBlockUtils::disconnectBlocks(VPBB, Succ);
592      VPBlockUtils::connectBlocks(PredVPBB, Succ);
593 }
594 // VPBB is now dead and will be cleaned up when the plan gets destroyed.
595 }
596 return !WorkList.empty();
597}
598
599void VPlanTransforms::createAndOptimizeReplicateRegions(VPlan &Plan) {
600  // Convert masked VPReplicateRecipes to if-then region blocks.
601  addReplicateRegions(Plan);
602
603 bool ShouldSimplify = true;
604 while (ShouldSimplify) {
605 ShouldSimplify = sinkScalarOperands(Plan);
606 ShouldSimplify |= mergeReplicateRegionsIntoSuccessors(Plan);
607 ShouldSimplify |= mergeBlocksIntoPredecessors(Plan);
608 }
609}
610
611/// Remove redundant casts of inductions.
612///
613/// Such redundant casts are casts of induction variables that can be ignored,
614/// because we already proved that the casted phi is equal to the uncasted phi
615/// in the vectorized loop. There is no need to vectorize the cast - the same
616/// value can be used for both the phi and casts in the vector loop.
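/// For example (illustrative): if a cast chain (zext %iv) was recorded because
/// the casted phi was proven equal to %iv itself, the recipe produced for the
/// zext is replaced by the widened IV and later cleaned up as dead.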
617static void removeRedundantInductionCasts(VPlan &Plan) {
618  for (auto &Phi : Plan.getVectorLoopRegion()->getEntryBasicBlock()->phis()) {
619    auto *IV = dyn_cast<VPWidenIntOrFpInductionRecipe>(&Phi);
620    if (!IV || IV->getTruncInst())
621 continue;
622
623 // A sequence of IR Casts has potentially been recorded for IV, which
624 // *must be bypassed* when the IV is vectorized, because the vectorized IV
625 // will produce the desired casted value. This sequence forms a def-use
626 // chain and is provided in reverse order, ending with the cast that uses
627 // the IV phi. Search for the recipe of the last cast in the chain and
628 // replace it with the original IV. Note that only the final cast is
629 // expected to have users outside the cast-chain and the dead casts left
630 // over will be cleaned up later.
631 ArrayRef<Instruction *> Casts = IV->getInductionDescriptor().getCastInsts();
632 VPValue *FindMyCast = IV;
633 for (Instruction *IRCast : reverse(Casts)) {
634 VPSingleDefRecipe *FoundUserCast = nullptr;
635 for (auto *U : FindMyCast->users()) {
636 auto *UserCast = dyn_cast<VPSingleDefRecipe>(U);
637 if (UserCast && UserCast->getUnderlyingValue() == IRCast) {
638 FoundUserCast = UserCast;
639 break;
640 }
641 }
642 FindMyCast = FoundUserCast;
643 }
644 FindMyCast->replaceAllUsesWith(IV);
645 }
646}
647
648/// Try to replace VPWidenCanonicalIVRecipes with a widened canonical IV
649/// recipe, if it exists.
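/// For example (illustrative): a VPWidenCanonicalIVRecipe that only feeds the
/// header-mask comparison can be replaced by an existing widened IV that
/// starts at 0 and steps by 1, avoiding a second vector induction.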
650static void removeRedundantCanonicalIVs(VPlan &Plan) {
651  VPRegionBlock *LoopRegion = Plan.getVectorLoopRegion();
652 VPCanonicalIVPHIRecipe *CanonicalIV = LoopRegion->getCanonicalIV();
653 VPWidenCanonicalIVRecipe *WidenNewIV = nullptr;
654 for (VPUser *U : CanonicalIV->users()) {
655    WidenNewIV = dyn_cast<VPWidenCanonicalIVRecipe>(U);
656    if (WidenNewIV)
657 break;
658 }
659
660 if (!WidenNewIV)
661 return;
662
663 VPBasicBlock *HeaderVPBB = LoopRegion->getEntryBasicBlock();
664 for (VPRecipeBase &Phi : HeaderVPBB->phis()) {
665 auto *WidenOriginalIV = dyn_cast<VPWidenIntOrFpInductionRecipe>(&Phi);
666
667 if (!WidenOriginalIV || !WidenOriginalIV->isCanonical())
668 continue;
669
670 // Replace WidenNewIV with WidenOriginalIV if WidenOriginalIV provides
671 // everything WidenNewIV's users need. That is, WidenOriginalIV will
672 // generate a vector phi or all users of WidenNewIV demand the first lane
673 // only.
674 if (!vputils::onlyScalarValuesUsed(WidenOriginalIV) ||
675 vputils::onlyFirstLaneUsed(WidenNewIV)) {
676      // We are replacing a wide canonical IV with a suitable wide induction.
677      // The replacement is used to compute the header mask, hence all lanes
678      // will be used and we need to drop wrap flags that only apply to lanes
679      // guaranteed to execute in the original scalar loop.
680 WidenOriginalIV->dropPoisonGeneratingFlags();
681 WidenNewIV->replaceAllUsesWith(WidenOriginalIV);
682 WidenNewIV->eraseFromParent();
683 return;
684 }
685 }
686}
687
688/// Returns true if \p R is dead and can be removed.
689static bool isDeadRecipe(VPRecipeBase &R) {
690 // Do remove conditional assume instructions as their conditions may be
691 // flattened.
692 auto *RepR = dyn_cast<VPReplicateRecipe>(&R);
693 bool IsConditionalAssume = RepR && RepR->isPredicated() &&
694                             match(RepR, m_Intrinsic<Intrinsic::assume>());
695  if (IsConditionalAssume)
696 return true;
697
698 if (R.mayHaveSideEffects())
699 return false;
700
701 // Recipe is dead if no user keeps the recipe alive.
702 return all_of(R.definedValues(),
703 [](VPValue *V) { return V->getNumUsers() == 0; });
704}
705
706void VPlanTransforms::removeDeadRecipes(VPlan &Plan) {
707  for (VPBasicBlock *VPBB : VPBlockUtils::blocksOnly<VPBasicBlock>(
708           vp_post_order_deep(Plan.getEntry()))) {
709 // The recipes in the block are processed in reverse order, to catch chains
710 // of dead recipes.
711 for (VPRecipeBase &R : make_early_inc_range(reverse(*VPBB))) {
712 if (isDeadRecipe(R)) {
713 R.eraseFromParent();
714 continue;
715 }
716
717 // Check if R is a dead VPPhi <-> update cycle and remove it.
718 auto *PhiR = dyn_cast<VPPhi>(&R);
719 if (!PhiR || PhiR->getNumOperands() != 2)
720 continue;
721 VPUser *PhiUser = PhiR->getSingleUser();
722 if (!PhiUser)
723 continue;
724 VPValue *Incoming = PhiR->getOperand(1);
725 if (PhiUser != Incoming->getDefiningRecipe() ||
726 Incoming->getNumUsers() != 1)
727 continue;
728 PhiR->replaceAllUsesWith(PhiR->getOperand(0));
729 PhiR->eraseFromParent();
730 Incoming->getDefiningRecipe()->eraseFromParent();
731 }
732 }
733}
734
737 Instruction::BinaryOps InductionOpcode,
738 FPMathOperator *FPBinOp, Instruction *TruncI,
739 VPValue *StartV, VPValue *Step, DebugLoc DL,
740 VPBuilder &Builder) {
741 VPRegionBlock *LoopRegion = Plan.getVectorLoopRegion();
742 VPBasicBlock *HeaderVPBB = LoopRegion->getEntryBasicBlock();
743 VPCanonicalIVPHIRecipe *CanonicalIV = LoopRegion->getCanonicalIV();
744 VPSingleDefRecipe *BaseIV = Builder.createDerivedIV(
745 Kind, FPBinOp, StartV, CanonicalIV, Step, "offset.idx");
746
747 // Truncate base induction if needed.
748 VPTypeAnalysis TypeInfo(Plan);
749 Type *ResultTy = TypeInfo.inferScalarType(BaseIV);
750 if (TruncI) {
751 Type *TruncTy = TruncI->getType();
752 assert(ResultTy->getScalarSizeInBits() > TruncTy->getScalarSizeInBits() &&
753 "Not truncating.");
754 assert(ResultTy->isIntegerTy() && "Truncation requires an integer type");
755 BaseIV = Builder.createScalarCast(Instruction::Trunc, BaseIV, TruncTy, DL);
756 ResultTy = TruncTy;
757 }
758
759 // Truncate step if needed.
760 Type *StepTy = TypeInfo.inferScalarType(Step);
761 if (ResultTy != StepTy) {
762 assert(StepTy->getScalarSizeInBits() > ResultTy->getScalarSizeInBits() &&
763 "Not truncating.");
764 assert(StepTy->isIntegerTy() && "Truncation requires an integer type");
765 auto *VecPreheader =
767 VPBuilder::InsertPointGuard Guard(Builder);
768 Builder.setInsertPoint(VecPreheader);
769 Step = Builder.createScalarCast(Instruction::Trunc, Step, ResultTy, DL);
770 }
771 return Builder.createScalarIVSteps(InductionOpcode, FPBinOp, BaseIV, Step,
772 &Plan.getVF(), DL);
773}
774
775static SmallVector<VPUser *> collectUsersRecursively(VPValue *V) {
776  SetVector<VPUser *> Users(from_range, V->users());
777  for (unsigned I = 0; I != Users.size(); ++I) {
778    auto *Cur = cast<VPRecipeBase>(Users[I]);
779    if (isa<VPHeaderPHIRecipe>(Cur))
780 continue;
781 for (VPValue *V : Cur->definedValues())
782 Users.insert_range(V->users());
783 }
784 return Users.takeVector();
785}
786
787/// Scalarize a VPWidenPointerInductionRecipe by replacing it with a PtrAdd
788/// (IndStart, ScalarIVSteps (0, Step)). This is used when the recipe only
789/// generates scalar values.
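/// For example (illustrative): a pointer IV with start %p and step %s becomes
/// ptradd %p, scalar-iv-steps(0, %s), yielding one scalar address per lane
/// instead of a vector of pointers.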
790static VPValue *
791scalarizeVPWidenPointerInduction(VPWidenPointerInductionRecipe *PtrIV,
792                                 VPlan &Plan, VPBuilder &Builder) {
793  const InductionDescriptor &ID = PtrIV->getInductionDescriptor();
794  VPValue *StartV = Plan.getConstantInt(ID.getStep()->getType(), 0);
795  VPValue *StepV = PtrIV->getOperand(1);
796  VPValue *Steps = createScalarIVSteps(
797      Plan, InductionDescriptor::IK_IntInduction, Instruction::Add, nullptr,
798 nullptr, StartV, StepV, PtrIV->getDebugLoc(), Builder);
799
800 return Builder.createPtrAdd(PtrIV->getStartValue(), Steps,
801 PtrIV->getDebugLoc(), "next.gep");
802}
803
804/// Legalize VPWidenPointerInductionRecipe, by replacing it with a PtrAdd
805/// (IndStart, ScalarIVSteps (0, Step)) if only its scalar values are used, as
806/// VPWidenPointerInductionRecipe will generate vectors only. If some users
807/// require vectors while others require scalars, the scalar uses need to extract
808/// the scalars from the generated vectors (Note that this is different to how
809/// int/fp inductions are handled). Legalize extract-from-ends using uniform
810/// VPReplicateRecipe of wide inductions to use regular VPReplicateRecipe, so
811/// the correct end value is available. Also optimize
812/// VPWidenIntOrFpInductionRecipe, if any of its users needs scalar values, by
813/// providing them scalar steps built on the canonical scalar IV and update the
814/// original IV's users. This is an optional optimization to reduce the need for
815/// vector extracts.
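/// For example (illustrative): a wide i64 IV whose only users need scalar
/// values (such as address computations) is rewired to scalar IV steps built
/// on the canonical IV, so no wide induction vector has to be materialized.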
816void VPlanTransforms::legalizeAndOptimizeInductions(VPlan &Plan) {
817  VPBasicBlock *HeaderVPBB = Plan.getVectorLoopRegion()->getEntryBasicBlock();
818  bool HasOnlyVectorVFs = !Plan.hasScalarVFOnly();
819 VPBuilder Builder(HeaderVPBB, HeaderVPBB->getFirstNonPhi());
820 for (VPRecipeBase &Phi : HeaderVPBB->phis()) {
821 auto *PhiR = dyn_cast<VPWidenInductionRecipe>(&Phi);
822 if (!PhiR)
823 continue;
824
825 // Try to narrow wide and replicating recipes to uniform recipes, based on
826 // VPlan analysis.
827 // TODO: Apply to all recipes in the future, to replace legacy uniformity
828 // analysis.
829 auto Users = collectUsersRecursively(PhiR);
830 for (VPUser *U : reverse(Users)) {
831 auto *Def = dyn_cast<VPRecipeWithIRFlags>(U);
832 auto *RepR = dyn_cast<VPReplicateRecipe>(U);
833 // Skip recipes that shouldn't be narrowed.
834 if (!Def || !isa<VPReplicateRecipe, VPWidenRecipe>(Def) ||
835 Def->getNumUsers() == 0 || !Def->getUnderlyingValue() ||
836 (RepR && (RepR->isSingleScalar() || RepR->isPredicated())))
837 continue;
838
839      // Skip recipes that may have lanes other than their first lane used.
840      if (!vputils::isSingleScalar(Def) && !vputils::onlyFirstLaneUsed(Def))
841        continue;
842
843 auto *Clone = new VPReplicateRecipe(Def->getUnderlyingInstr(),
844 Def->operands(), /*IsUniform*/ true,
845 /*Mask*/ nullptr, /*Flags*/ *Def);
846 Clone->insertAfter(Def);
847 Def->replaceAllUsesWith(Clone);
848 }
849
850 // Replace wide pointer inductions which have only their scalars used by
851 // PtrAdd(IndStart, ScalarIVSteps (0, Step)).
852 if (auto *PtrIV = dyn_cast<VPWidenPointerInductionRecipe>(&Phi)) {
853 if (!Plan.hasScalarVFOnly() &&
854 !PtrIV->onlyScalarsGenerated(Plan.hasScalableVF()))
855 continue;
856
857 VPValue *PtrAdd = scalarizeVPWidenPointerInduction(PtrIV, Plan, Builder);
858 PtrIV->replaceAllUsesWith(PtrAdd);
859 continue;
860 }
861
862 // Replace widened induction with scalar steps for users that only use
863 // scalars.
864 auto *WideIV = cast<VPWidenIntOrFpInductionRecipe>(&Phi);
865 if (HasOnlyVectorVFs && none_of(WideIV->users(), [WideIV](VPUser *U) {
866 return U->usesScalars(WideIV);
867 }))
868 continue;
869
870 const InductionDescriptor &ID = WideIV->getInductionDescriptor();
871    VPValue *Steps = createScalarIVSteps(
872        Plan, ID.getKind(), ID.getInductionOpcode(),
873 dyn_cast_or_null<FPMathOperator>(ID.getInductionBinOp()),
874 WideIV->getTruncInst(), WideIV->getStartValue(), WideIV->getStepValue(),
875 WideIV->getDebugLoc(), Builder);
876
877 // Update scalar users of IV to use Step instead.
878 if (!HasOnlyVectorVFs) {
879 assert(!Plan.hasScalableVF() &&
880 "plans containing a scalar VF cannot also include scalable VFs");
881 WideIV->replaceAllUsesWith(Steps);
882 } else {
883 bool HasScalableVF = Plan.hasScalableVF();
884 WideIV->replaceUsesWithIf(Steps,
885 [WideIV, HasScalableVF](VPUser &U, unsigned) {
886 if (HasScalableVF)
887 return U.usesFirstLaneOnly(WideIV);
888 return U.usesScalars(WideIV);
889 });
890 }
891 }
892}
893
894/// Check if \p VPV is an untruncated wide induction, either before or after the
895/// increment. If so return the header IV (before the increment), otherwise
896/// return null.
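/// For example (illustrative): for a wide IV %iv with step %s, both %iv itself
/// and its increment %iv.next = add %iv, %s map back to the header recipe for
/// %iv; a truncated IV returns null.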
897static VPWidenInductionRecipe *
898getOptimizableIVOf(VPValue *VPV, PredicatedScalarEvolution &PSE) {
899  auto *WideIV = dyn_cast<VPWidenInductionRecipe>(VPV);
900 if (WideIV) {
901 // VPV itself is a wide induction, separately compute the end value for exit
902 // users if it is not a truncated IV.
903 auto *IntOrFpIV = dyn_cast<VPWidenIntOrFpInductionRecipe>(WideIV);
904 return (IntOrFpIV && IntOrFpIV->getTruncInst()) ? nullptr : WideIV;
905 }
906
907 // Check if VPV is an optimizable induction increment.
908 VPRecipeBase *Def = VPV->getDefiningRecipe();
909 if (!Def || Def->getNumOperands() != 2)
910 return nullptr;
911 WideIV = dyn_cast<VPWidenInductionRecipe>(Def->getOperand(0));
912 if (!WideIV)
913 WideIV = dyn_cast<VPWidenInductionRecipe>(Def->getOperand(1));
914 if (!WideIV)
915 return nullptr;
916
917 auto IsWideIVInc = [&]() {
918 auto &ID = WideIV->getInductionDescriptor();
919
920 // Check if VPV increments the induction by the induction step.
921 VPValue *IVStep = WideIV->getStepValue();
922 switch (ID.getInductionOpcode()) {
923 case Instruction::Add:
924 return match(VPV, m_c_Add(m_Specific(WideIV), m_Specific(IVStep)));
925 case Instruction::FAdd:
926      return match(VPV, m_c_BinaryFAdd(m_Specific(WideIV),
927                                       m_Specific(IVStep)));
928 case Instruction::FSub:
929 return match(VPV, m_Binary<Instruction::FSub>(m_Specific(WideIV),
930 m_Specific(IVStep)));
931 case Instruction::Sub: {
932 // IVStep will be the negated step of the subtraction. Check if Step == -1
933 // * IVStep.
934 VPValue *Step;
935 if (!match(VPV, m_Sub(m_VPValue(), m_VPValue(Step))))
936 return false;
937 const SCEV *IVStepSCEV = vputils::getSCEVExprForVPValue(IVStep, PSE);
938 const SCEV *StepSCEV = vputils::getSCEVExprForVPValue(Step, PSE);
939 ScalarEvolution &SE = *PSE.getSE();
940 return !isa<SCEVCouldNotCompute>(IVStepSCEV) &&
941 !isa<SCEVCouldNotCompute>(StepSCEV) &&
942 IVStepSCEV == SE.getNegativeSCEV(StepSCEV);
943 }
944 default:
945 return ID.getKind() == InductionDescriptor::IK_PtrInduction &&
946 match(VPV, m_GetElementPtr(m_Specific(WideIV),
947 m_Specific(WideIV->getStepValue())));
948 }
949 llvm_unreachable("should have been covered by switch above");
950 };
951 return IsWideIVInc() ? WideIV : nullptr;
952}
953
954/// Attempts to optimize the induction variable exit values for users in the
955/// early exit block.
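/// Roughly (illustrative), the exit value is recomputed as
///   EndValue = CanonicalIV + FirstActiveLane(Mask)
/// (plus one if the user sees the incremented IV), and then mapped through the
/// induction's start and step if the IV is not the canonical one.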
956static VPValue *optimizeEarlyExitInductionUser(VPlan &Plan,
957                                               VPTypeAnalysis &TypeInfo,
958 VPBlockBase *PredVPBB,
959                                               VPValue *Op,
960                                               PredicatedScalarEvolution &PSE) {
961  VPValue *Incoming, *Mask;
962  if (!match(Op, m_VPInstruction<VPInstruction::ExtractLane>(
963                     m_FirstActiveLane(m_VPValue(Mask)), m_VPValue(Incoming))))
964    return nullptr;
965
966 auto *WideIV = getOptimizableIVOf(Incoming, PSE);
967 if (!WideIV)
968 return nullptr;
969
970 auto *WideIntOrFp = dyn_cast<VPWidenIntOrFpInductionRecipe>(WideIV);
971 if (WideIntOrFp && WideIntOrFp->getTruncInst())
972 return nullptr;
973
974 // Calculate the final index.
975 VPRegionBlock *LoopRegion = Plan.getVectorLoopRegion();
976 auto *CanonicalIV = LoopRegion->getCanonicalIV();
977 Type *CanonicalIVType = LoopRegion->getCanonicalIVType();
978 VPBuilder B(cast<VPBasicBlock>(PredVPBB));
979
980 DebugLoc DL = cast<VPInstruction>(Op)->getDebugLoc();
981 VPValue *FirstActiveLane =
982 B.createNaryOp(VPInstruction::FirstActiveLane, Mask, DL);
983 Type *FirstActiveLaneType = TypeInfo.inferScalarType(FirstActiveLane);
984 FirstActiveLane = B.createScalarZExtOrTrunc(FirstActiveLane, CanonicalIVType,
985 FirstActiveLaneType, DL);
986 VPValue *EndValue =
987 B.createNaryOp(Instruction::Add, {CanonicalIV, FirstActiveLane}, DL);
988
989 // `getOptimizableIVOf()` always returns the pre-incremented IV, so if it
990 // changed it means the exit is using the incremented value, so we need to
991 // add the step.
992 if (Incoming != WideIV) {
993 VPValue *One = Plan.getConstantInt(CanonicalIVType, 1);
994 EndValue = B.createNaryOp(Instruction::Add, {EndValue, One}, DL);
995 }
996
997 if (!WideIntOrFp || !WideIntOrFp->isCanonical()) {
998 const InductionDescriptor &ID = WideIV->getInductionDescriptor();
999 VPValue *Start = WideIV->getStartValue();
1000 VPValue *Step = WideIV->getStepValue();
1001 EndValue = B.createDerivedIV(
1002 ID.getKind(), dyn_cast_or_null<FPMathOperator>(ID.getInductionBinOp()),
1003 Start, EndValue, Step);
1004 }
1005
1006 return EndValue;
1007}
1008
1009/// Attempts to optimize the induction variable exit values for users in the
1010/// exit block coming from the latch in the original scalar loop.
1011static VPValue *optimizeLatchExitInductionUser(
1012    VPlan &Plan, VPTypeAnalysis &TypeInfo, VPBlockBase *PredVPBB, VPValue *Op,
1013    DenseMap<VPValue *, VPValue *> &EndValues, PredicatedScalarEvolution &PSE) {
1014  VPValue *Incoming;
1015  if (!match(Op, m_ExtractLastLane(m_VPValue(Incoming))))
1016    return nullptr;
1017
1018 auto *WideIV = getOptimizableIVOf(Incoming, PSE);
1019 if (!WideIV)
1020 return nullptr;
1021
1022 VPValue *EndValue = EndValues.lookup(WideIV);
1023 assert(EndValue && "end value must have been pre-computed");
1024
1025 // `getOptimizableIVOf()` always returns the pre-incremented IV, so if it
1026 // changed it means the exit is using the incremented value, so we don't
1027 // need to subtract the step.
1028 if (Incoming != WideIV)
1029 return EndValue;
1030
1031 // Otherwise, subtract the step from the EndValue.
1032 VPBuilder B(cast<VPBasicBlock>(PredVPBB)->getTerminator());
1033 VPValue *Step = WideIV->getStepValue();
1034 Type *ScalarTy = TypeInfo.inferScalarType(WideIV);
1035 if (ScalarTy->isIntegerTy())
1036 return B.createNaryOp(Instruction::Sub, {EndValue, Step},
1037 DebugLoc::getUnknown(), "ind.escape");
1038 if (ScalarTy->isPointerTy()) {
1039 Type *StepTy = TypeInfo.inferScalarType(Step);
1040 auto *Zero = Plan.getConstantInt(StepTy, 0);
1041 return B.createPtrAdd(EndValue,
1042 B.createNaryOp(Instruction::Sub, {Zero, Step}),
1043 DebugLoc::getUnknown(), "ind.escape");
1044 }
1045 if (ScalarTy->isFloatingPointTy()) {
1046 const auto &ID = WideIV->getInductionDescriptor();
1047 return B.createNaryOp(
1048 ID.getInductionBinOp()->getOpcode() == Instruction::FAdd
1049 ? Instruction::FSub
1050 : Instruction::FAdd,
1051 {EndValue, Step}, {ID.getInductionBinOp()->getFastMathFlags()});
1052 }
1053 llvm_unreachable("all possible induction types must be handled");
1054 return nullptr;
1055}
1056
1057void VPlanTransforms::optimizeInductionExitUsers(
1058    VPlan &Plan, DenseMap<VPValue *, VPValue *> &EndValues,
1059    PredicatedScalarEvolution &PSE) {
1060  VPBlockBase *MiddleVPBB = Plan.getMiddleBlock();
1061 VPTypeAnalysis TypeInfo(Plan);
1062 for (VPIRBasicBlock *ExitVPBB : Plan.getExitBlocks()) {
1063 for (VPRecipeBase &R : ExitVPBB->phis()) {
1064 auto *ExitIRI = cast<VPIRPhi>(&R);
1065
1066 for (auto [Idx, PredVPBB] : enumerate(ExitVPBB->getPredecessors())) {
1067 VPValue *Escape = nullptr;
1068 if (PredVPBB == MiddleVPBB)
1069 Escape = optimizeLatchExitInductionUser(Plan, TypeInfo, PredVPBB,
1070 ExitIRI->getOperand(Idx),
1071 EndValues, PSE);
1072 else
1073          Escape = optimizeEarlyExitInductionUser(
1074              Plan, TypeInfo, PredVPBB, ExitIRI->getOperand(Idx), PSE);
1075 if (Escape)
1076 ExitIRI->setOperand(Idx, Escape);
1077 }
1078 }
1079 }
1080}
1081
1082/// Remove redundant ExpandSCEVRecipes in \p Plan's entry block by replacing
1083/// them with already existing recipes expanding the same SCEV expression.
1084static void removeRedundantExpandSCEVRecipes(VPlan &Plan) {
1085  DenseMap<const SCEV *, VPValue *> SCEV2VPV;
1086
1087 for (VPRecipeBase &R :
1088       make_early_inc_range(*Plan.getEntry())) {
1089    auto *ExpR = dyn_cast<VPExpandSCEVRecipe>(&R);
1090 if (!ExpR)
1091 continue;
1092
1093 const auto &[V, Inserted] = SCEV2VPV.try_emplace(ExpR->getSCEV(), ExpR);
1094 if (Inserted)
1095 continue;
1096 ExpR->replaceAllUsesWith(V->second);
1097 ExpR->eraseFromParent();
1098 }
1099}
1100
1102 SmallVector<VPValue *> WorkList;
1104 WorkList.push_back(V);
1105
1106 while (!WorkList.empty()) {
1107 VPValue *Cur = WorkList.pop_back_val();
1108 if (!Seen.insert(Cur).second)
1109 continue;
1110 VPRecipeBase *R = Cur->getDefiningRecipe();
1111 if (!R)
1112 continue;
1113 if (!isDeadRecipe(*R))
1114 continue;
1115 append_range(WorkList, R->operands());
1116 R->eraseFromParent();
1117 }
1118}
1119
1120/// Get any instruction opcode or intrinsic ID data embedded in recipe \p R.
1121/// Returns an optional pair, where the first element indicates whether it is
1122/// an intrinsic ID.
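/// For example (illustrative): a VPWidenRecipe for an add yields
/// {false, Instruction::Add}, while a VPWidenIntrinsicRecipe yields
/// {true, <vector intrinsic ID>}.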
1123static std::optional<std::pair<bool, unsigned>>
1124getOpcodeOrIntrinsicID(const VPSingleDefRecipe *R) {
1125  return TypeSwitch<const VPSingleDefRecipe *,
1126 std::optional<std::pair<bool, unsigned>>>(R)
1127      .Case<VPInstruction, VPWidenRecipe, VPWidenCastRecipe,
1128            VPReplicateRecipe>(
1129          [](auto *I) { return std::make_pair(false, I->getOpcode()); })
1130 .Case<VPWidenIntrinsicRecipe>([](auto *I) {
1131 return std::make_pair(true, I->getVectorIntrinsicID());
1132 })
1133 .Case<VPVectorPointerRecipe, VPPredInstPHIRecipe>([](auto *I) {
1134 // For recipes that do not directly map to LLVM IR instructions,
1135 // assign opcodes after the last VPInstruction opcode (which is also
1136 // after the last IR Instruction opcode), based on the VPDefID.
1137 return std::make_pair(false,
1138 VPInstruction::OpsEnd + 1 + I->getVPDefID());
1139 })
1140 .Default([](auto *) { return std::nullopt; });
1141}
1142
1143/// Try to fold \p R using InstSimplifyFolder. Will succeed and return a
1144/// non-nullptr VPValue for a handled opcode or intrinsic ID if corresponding \p
1145/// Operands are foldable live-ins.
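/// For example (illustrative): an add whose operands are both constant
/// live-ins folds to a constant live-in; a single non-live-in operand makes
/// the fold bail out and return nullptr.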
1146static VPValue *tryToFoldLiveIns(const VPRecipeBase &R,
1147                                 ArrayRef<VPValue *> Operands,
1148 const DataLayout &DL,
1149 VPTypeAnalysis &TypeInfo) {
1150 auto OpcodeOrIID = getOpcodeOrIntrinsicID(&R);
1151 if (!OpcodeOrIID)
1152 return nullptr;
1153
1155 for (VPValue *Op : Operands) {
1156 if (!Op->isLiveIn() || !Op->getLiveInIRValue())
1157 return nullptr;
1158 Ops.push_back(Op->getLiveInIRValue());
1159 }
1160
1161 auto FoldToIRValue = [&]() -> Value * {
1162 InstSimplifyFolder Folder(DL);
1163 if (OpcodeOrIID->first) {
1164 if (R.getNumOperands() != 2)
1165 return nullptr;
1166 unsigned ID = OpcodeOrIID->second;
1167 return Folder.FoldBinaryIntrinsic(ID, Ops[0], Ops[1],
1168 TypeInfo.inferScalarType(&R));
1169 }
1170 unsigned Opcode = OpcodeOrIID->second;
1171 if (Instruction::isBinaryOp(Opcode))
1172 return Folder.FoldBinOp(static_cast<Instruction::BinaryOps>(Opcode),
1173 Ops[0], Ops[1]);
1174 if (Instruction::isCast(Opcode))
1175 return Folder.FoldCast(static_cast<Instruction::CastOps>(Opcode), Ops[0],
1176 TypeInfo.inferScalarType(R.getVPSingleValue()));
1177 switch (Opcode) {
1178    case VPInstruction::LogicalAnd:
1179      return Folder.FoldSelect(Ops[0], Ops[1],
1180                               ConstantInt::getFalse(Ops[1]->getType()));
1181    case VPInstruction::Not:
1182      return Folder.FoldBinOp(Instruction::BinaryOps::Xor, Ops[0],
1183                              Constant::getAllOnesValue(Ops[0]->getType()));
1184 case Instruction::Select:
1185 return Folder.FoldSelect(Ops[0], Ops[1], Ops[2]);
1186 case Instruction::ICmp:
1187 case Instruction::FCmp:
1188 return Folder.FoldCmp(cast<VPRecipeWithIRFlags>(R).getPredicate(), Ops[0],
1189 Ops[1]);
1190 case Instruction::GetElementPtr: {
1191 auto &RFlags = cast<VPRecipeWithIRFlags>(R);
1192 auto *GEP = cast<GetElementPtrInst>(RFlags.getUnderlyingInstr());
1193 return Folder.FoldGEP(GEP->getSourceElementType(), Ops[0],
1194 drop_begin(Ops), RFlags.getGEPNoWrapFlags());
1195 }
1196    case VPInstruction::PtrAdd:
1197    case VPInstruction::WidePtrAdd:
1198      return Folder.FoldGEP(IntegerType::getInt8Ty(TypeInfo.getContext()),
1199 Ops[0], Ops[1],
1200 cast<VPRecipeWithIRFlags>(R).getGEPNoWrapFlags());
1201 // An extract of a live-in is an extract of a broadcast, so return the
1202 // broadcasted element.
1203 case Instruction::ExtractElement:
1204 assert(!Ops[0]->getType()->isVectorTy() && "Live-ins should be scalar");
1205 return Ops[0];
1206 }
1207 return nullptr;
1208 };
1209
1210 if (Value *V = FoldToIRValue())
1211 return R.getParent()->getPlan()->getOrAddLiveIn(V);
1212 return nullptr;
1213}
1214
1215/// Try to simplify VPSingleDefRecipe \p Def.
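/// For example (illustrative): trunc(zext(%x)) with matching source and result
/// types simplifies to %x, and (%x && %y) || (%x && !%y) simplifies to %x, as
/// handled below.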
1216static void simplifyRecipe(VPSingleDefRecipe *Def, VPTypeAnalysis &TypeInfo) {
1217  VPlan *Plan = Def->getParent()->getPlan();
1218
1219 // Simplification of live-in IR values for SingleDef recipes using
1220 // InstSimplifyFolder.
1221 const DataLayout &DL =
1222      Plan->getScalarHeader()->getIRBasicBlock()->getDataLayout();
1223  if (VPValue *V = tryToFoldLiveIns(*Def, Def->operands(), DL, TypeInfo))
1224 return Def->replaceAllUsesWith(V);
1225
1226 // Fold PredPHI LiveIn -> LiveIn.
1227 if (auto *PredPHI = dyn_cast<VPPredInstPHIRecipe>(Def)) {
1228 VPValue *Op = PredPHI->getOperand(0);
1229 if (Op->isLiveIn())
1230 PredPHI->replaceAllUsesWith(Op);
1231 }
1232
1233 VPBuilder Builder(Def);
1234 VPValue *A;
1235 if (match(Def, m_Trunc(m_ZExtOrSExt(m_VPValue(A))))) {
1236 Type *TruncTy = TypeInfo.inferScalarType(Def);
1237 Type *ATy = TypeInfo.inferScalarType(A);
1238 if (TruncTy == ATy) {
1239 Def->replaceAllUsesWith(A);
1240 } else {
1241 // Don't replace a scalarizing recipe with a widened cast.
1242 if (isa<VPReplicateRecipe>(Def))
1243 return;
1244 if (ATy->getScalarSizeInBits() < TruncTy->getScalarSizeInBits()) {
1245
1246 unsigned ExtOpcode = match(Def->getOperand(0), m_SExt(m_VPValue()))
1247 ? Instruction::SExt
1248 : Instruction::ZExt;
1249 auto *Ext = Builder.createWidenCast(Instruction::CastOps(ExtOpcode), A,
1250 TruncTy);
1251 if (auto *UnderlyingExt = Def->getOperand(0)->getUnderlyingValue()) {
1252 // UnderlyingExt has distinct return type, used to retain legacy cost.
1253 Ext->setUnderlyingValue(UnderlyingExt);
1254 }
1255 Def->replaceAllUsesWith(Ext);
1256 } else if (ATy->getScalarSizeInBits() > TruncTy->getScalarSizeInBits()) {
1257 auto *Trunc = Builder.createWidenCast(Instruction::Trunc, A, TruncTy);
1258 Def->replaceAllUsesWith(Trunc);
1259 }
1260 }
1261#ifndef NDEBUG
1262    // Verify that the cached type info for both A and its users is still
1263 // accurate by comparing it to freshly computed types.
1264 VPTypeAnalysis TypeInfo2(*Plan);
1265 assert(TypeInfo.inferScalarType(A) == TypeInfo2.inferScalarType(A));
1266 for (VPUser *U : A->users()) {
1267 auto *R = cast<VPRecipeBase>(U);
1268 for (VPValue *VPV : R->definedValues())
1269 assert(TypeInfo.inferScalarType(VPV) == TypeInfo2.inferScalarType(VPV));
1270 }
1271#endif
1272 }
1273
1274 // Simplify (X && Y) || (X && !Y) -> X.
1275 // TODO: Split up into simpler, modular combines: (X && Y) || (X && Z) into X
1276 // && (Y || Z) and (X || !X) into true. This requires queuing newly created
1277 // recipes to be visited during simplification.
1278 VPValue *X, *Y, *Z;
1279 if (match(Def,
1280            m_c_BinaryOr(m_LogicalAnd(m_VPValue(X), m_VPValue(Y)),
1281                         m_LogicalAnd(m_Deferred(X), m_Not(m_Deferred(Y)))))) {
1282    Def->replaceAllUsesWith(X);
1283 Def->eraseFromParent();
1284 return;
1285 }
1286
1287 // x | 1 -> 1
1288 if (match(Def, m_c_BinaryOr(m_VPValue(X), m_AllOnes())))
1289 return Def->replaceAllUsesWith(Def->getOperand(Def->getOperand(0) == X));
1290
1291 // x | 0 -> x
1292 if (match(Def, m_c_BinaryOr(m_VPValue(X), m_ZeroInt())))
1293 return Def->replaceAllUsesWith(X);
1294
1295 // x & 0 -> 0
1296 if (match(Def, m_c_BinaryAnd(m_VPValue(X), m_ZeroInt())))
1297 return Def->replaceAllUsesWith(Def->getOperand(Def->getOperand(0) == X));
1298
1299 // x && false -> false
1300 if (match(Def, m_LogicalAnd(m_VPValue(X), m_False())))
1301 return Def->replaceAllUsesWith(Def->getOperand(1));
1302
1303 // (x && y) || (x && z) -> x && (y || z)
1304  if (match(Def, m_c_BinaryOr(m_LogicalAnd(m_VPValue(X), m_VPValue(Y)),
1305                              m_LogicalAnd(m_Deferred(X), m_VPValue(Z)))) &&
1306      // Simplify only if one of the operands has one use to avoid creating an
1307 // extra recipe.
1308 (!Def->getOperand(0)->hasMoreThanOneUniqueUser() ||
1309 !Def->getOperand(1)->hasMoreThanOneUniqueUser()))
1310 return Def->replaceAllUsesWith(
1311 Builder.createLogicalAnd(X, Builder.createOr(Y, Z)));
1312
1313 // x && !x -> 0
1314  if (match(Def, m_LogicalAnd(m_VPValue(X), m_Not(m_Deferred(X)))))
1315    return Def->replaceAllUsesWith(Plan->getFalse());
1316
1317 if (match(Def, m_Select(m_VPValue(), m_VPValue(X), m_Deferred(X))))
1318 return Def->replaceAllUsesWith(X);
1319
1320 // select !c, x, y -> select c, y, x
1321 VPValue *C;
1322 if (match(Def, m_Select(m_Not(m_VPValue(C)), m_VPValue(X), m_VPValue(Y)))) {
1323 Def->setOperand(0, C);
1324 Def->setOperand(1, Y);
1325 Def->setOperand(2, X);
1326 return;
1327 }
1328
1329 // Reassociate (x && y) && z -> x && (y && z) if x has multiple users. With
1330 // tail folding it is likely that x is a header mask and can be simplified
1331 // further.
1332  if (match(Def, m_LogicalAnd(m_LogicalAnd(m_VPValue(X), m_VPValue(Y)),
1333                   m_VPValue(Z))) &&
1334 X->hasMoreThanOneUniqueUser())
1335 return Def->replaceAllUsesWith(
1336 Builder.createLogicalAnd(X, Builder.createLogicalAnd(Y, Z)));
1337
1338 if (match(Def, m_c_Add(m_VPValue(A), m_ZeroInt())))
1339 return Def->replaceAllUsesWith(A);
1340
1341 if (match(Def, m_c_Mul(m_VPValue(A), m_One())))
1342 return Def->replaceAllUsesWith(A);
1343
1344 if (match(Def, m_c_Mul(m_VPValue(A), m_ZeroInt())))
1345 return Def->replaceAllUsesWith(
1346 Def->getOperand(0) == A ? Def->getOperand(1) : Def->getOperand(0));
1347
1348 if (match(Def, m_Not(m_VPValue(A)))) {
1349 if (match(A, m_Not(m_VPValue(A))))
1350 return Def->replaceAllUsesWith(A);
1351
1352 // Try to fold Not into compares by adjusting the predicate in-place.
1353 CmpPredicate Pred;
1354 if (match(A, m_Cmp(Pred, m_VPValue(), m_VPValue()))) {
1355 auto *Cmp = cast<VPRecipeWithIRFlags>(A);
1356 if (all_of(Cmp->users(),
1357                 match_fn(m_CombineOr(
1358                     m_Not(m_Specific(Cmp)),
1359 m_Select(m_Specific(Cmp), m_VPValue(), m_VPValue()))))) {
1360 Cmp->setPredicate(CmpInst::getInversePredicate(Pred));
1361 for (VPUser *U : to_vector(Cmp->users())) {
1362 auto *R = cast<VPSingleDefRecipe>(U);
1363 if (match(R, m_Select(m_Specific(Cmp), m_VPValue(X), m_VPValue(Y)))) {
1364 // select (cmp pred), x, y -> select (cmp inv_pred), y, x
1365 R->setOperand(1, Y);
1366 R->setOperand(2, X);
1367 } else {
1368 // not (cmp pred) -> cmp inv_pred
1369 assert(match(R, m_Not(m_Specific(Cmp))) && "Unexpected user");
1370 R->replaceAllUsesWith(Cmp);
1371 }
1372 }
1373 // If Cmp doesn't have a debug location, use the one from the negation,
1374 // to preserve the location.
1375 if (!Cmp->getDebugLoc() && Def->getDebugLoc())
1376 Cmp->setDebugLoc(Def->getDebugLoc());
1377 }
1378 }
1379 }
1380
1381 // Fold any-of (fcmp uno %A, %A), (fcmp uno %B, %B), ... ->
1382 // any-of (fcmp uno %A, %B), ...
1383 if (match(Def, m_AnyOf())) {
1384    SmallVector<VPValue *> NewOps;
1385    VPRecipeBase *UnpairedCmp = nullptr;
1386 for (VPValue *Op : Def->operands()) {
1387 VPValue *X;
1388 if (Op->getNumUsers() > 1 ||
1389          !match(Op, m_SpecificCmp(CmpInst::FCMP_UNO, m_VPValue(X),
1390                      m_Deferred(X)))) {
1391 NewOps.push_back(Op);
1392 } else if (!UnpairedCmp) {
1393 UnpairedCmp = Op->getDefiningRecipe();
1394 } else {
1395 NewOps.push_back(Builder.createFCmp(CmpInst::FCMP_UNO,
1396 UnpairedCmp->getOperand(0), X));
1397 UnpairedCmp = nullptr;
1398 }
1399 }
1400
1401 if (UnpairedCmp)
1402 NewOps.push_back(UnpairedCmp->getVPSingleValue());
1403
1404 if (NewOps.size() < Def->getNumOperands()) {
1405 VPValue *NewAnyOf = Builder.createNaryOp(VPInstruction::AnyOf, NewOps);
1406 return Def->replaceAllUsesWith(NewAnyOf);
1407 }
1408 }
1409
1410 // Fold (fcmp uno %X, %X) or (fcmp uno %Y, %Y) -> fcmp uno %X, %Y
1411 // This is useful for fmax/fmin without fast-math flags, where we need to
1412 // check if any operand is NaN.
1413  if (match(Def, m_c_BinaryOr(m_SpecificCmp(CmpInst::FCMP_UNO, m_VPValue(X),
1414                     m_Deferred(X)),
1415                              m_SpecificCmp(CmpInst::FCMP_UNO, m_VPValue(Y),
1416                     m_Deferred(Y))))) {
1417 VPValue *NewCmp = Builder.createFCmp(CmpInst::FCMP_UNO, X, Y);
1418 return Def->replaceAllUsesWith(NewCmp);
1419 }
1420
1421  // Remove redundant DerivedIVs, that is 0 + A * 1 -> A and 0 + 0 * x -> 0.
1422 if ((match(Def, m_DerivedIV(m_ZeroInt(), m_VPValue(A), m_One())) ||
1423 match(Def, m_DerivedIV(m_ZeroInt(), m_ZeroInt(), m_VPValue()))) &&
1424 TypeInfo.inferScalarType(Def->getOperand(1)) ==
1425 TypeInfo.inferScalarType(Def))
1426 return Def->replaceAllUsesWith(Def->getOperand(1));
1427
1428  if (match(Def, m_VPInstruction<VPInstruction::WideIVStep>(m_VPValue(X),
1429                 m_One()))) {
1430 Type *WideStepTy = TypeInfo.inferScalarType(Def);
1431 if (TypeInfo.inferScalarType(X) != WideStepTy)
1432 X = Builder.createWidenCast(Instruction::Trunc, X, WideStepTy);
1433 Def->replaceAllUsesWith(X);
1434 return;
1435 }
1436
1437 // For i1 vp.merges produced by AnyOf reductions:
1438 // vp.merge true, (or x, y), x, evl -> vp.merge y, true, x, evl
1439  if (match(Def, m_Intrinsic<Intrinsic::vp_merge>(m_True(), m_VPValue(A),
1440                     m_VPValue(X), m_VPValue())) &&
1441      match(A, m_c_BinaryOr(m_VPValue(Y), m_Deferred(X))) &&
1442      TypeInfo.inferScalarType(Def)->isIntegerTy(1)) {
1443 Def->setOperand(1, Def->getOperand(0));
1444 Def->setOperand(0, Y);
1445 return;
1446 }
1447
1448 if (auto *Phi = dyn_cast<VPFirstOrderRecurrencePHIRecipe>(Def)) {
1449 if (Phi->getOperand(0) == Phi->getOperand(1))
1450 Phi->replaceAllUsesWith(Phi->getOperand(0));
1451 return;
1452 }
1453
1454 // Look through ExtractLastLane.
1455 if (match(Def, m_ExtractLastLane(m_VPValue(A)))) {
1456 if (match(A, m_BuildVector())) {
1457 auto *BuildVector = cast<VPInstruction>(A);
1458 Def->replaceAllUsesWith(
1459 BuildVector->getOperand(BuildVector->getNumOperands() - 1));
1460 return;
1461 }
1462 if (Plan->hasScalarVFOnly())
1463 return Def->replaceAllUsesWith(A);
1464 }
1465
1466 // Look through ExtractPenultimateElement (BuildVector ....).
1467  if (match(Def, m_ExtractPenultimateElement(m_BuildVector()))) {
1468    auto *BuildVector = cast<VPInstruction>(Def->getOperand(0));
1469 Def->replaceAllUsesWith(
1470 BuildVector->getOperand(BuildVector->getNumOperands() - 2));
1471 return;
1472 }
1473
1474 uint64_t Idx;
1475  if (match(Def, m_ExtractElement(m_BuildVector(), m_ConstantInt(Idx)))) {
1476    auto *BuildVector = cast<VPInstruction>(Def->getOperand(0));
1477 Def->replaceAllUsesWith(BuildVector->getOperand(Idx));
1478 return;
1479 }
1480
1481 if (match(Def, m_BuildVector()) && all_equal(Def->operands())) {
1482 Def->replaceAllUsesWith(
1483 Builder.createNaryOp(VPInstruction::Broadcast, Def->getOperand(0)));
1484 return;
1485 }
1486
1487 // Look through broadcast of single-scalar when used as select conditions; in
1488 // that case the scalar condition can be used directly.
1489 if (match(Def,
1490            m_Select(m_Broadcast(m_VPValue(C)), m_VPValue(), m_VPValue()))) {
1491    assert(vputils::isSingleScalar(C) &&
1492           "broadcast operand must be single-scalar");
1493 Def->setOperand(0, C);
1494 return;
1495 }
1496
1497 if (auto *Phi = dyn_cast<VPPhi>(Def)) {
1498 if (Phi->getNumOperands() == 1)
1499 Phi->replaceAllUsesWith(Phi->getOperand(0));
1500 return;
1501 }
1502
1503 // Some simplifications can only be applied after unrolling. Perform them
1504 // below.
1505 if (!Plan->isUnrolled())
1506 return;
1507
1508 // Hoist an invariant increment Y of a phi X, by having X start at Y.
1509 if (match(Def, m_c_Add(m_VPValue(X), m_VPValue(Y))) && Y->isLiveIn() &&
1510 isa<VPPhi>(X)) {
1511 auto *Phi = cast<VPPhi>(X);
1512 if (Phi->getOperand(1) != Def && match(Phi->getOperand(0), m_ZeroInt()) &&
1513 Phi->getSingleUser() == Def) {
1514 Phi->setOperand(0, Y);
1515 Def->replaceAllUsesWith(Phi);
1516 return;
1517 }
1518 }
1519
1520 // Simplify unrolled VectorPointer without offset, or with zero offset, to
1521 // just the pointer operand.
1522 if (auto *VPR = dyn_cast<VPVectorPointerRecipe>(Def))
1523 if (!VPR->getOffset() || match(VPR->getOffset(), m_ZeroInt()))
1524 return VPR->replaceAllUsesWith(VPR->getOperand(0));
1525
1526 // VPScalarIVSteps for part 0 can be replaced by their start value, if only
1527 // the first lane is demanded.
1528 if (auto *Steps = dyn_cast<VPScalarIVStepsRecipe>(Def)) {
1529 if (Steps->isPart0() && vputils::onlyFirstLaneUsed(Steps)) {
1530 Steps->replaceAllUsesWith(Steps->getOperand(0));
1531 return;
1532 }
1533 }
1534 // Simplify redundant ReductionStartVector recipes after unrolling.
1535 VPValue *StartV;
1536  if (match(Def, m_VPInstruction<VPInstruction::ReductionStartVector>(
1537                     m_VPValue(StartV), m_VPValue(), m_VPValue()))) {
1538 Def->replaceUsesWithIf(StartV, [](const VPUser &U, unsigned Idx) {
1539 auto *PhiR = dyn_cast<VPReductionPHIRecipe>(&U);
1540 return PhiR && PhiR->isInLoop();
1541 });
1542 return;
1543 }
1544
1546 Def->replaceAllUsesWith(A);
1547 return;
1548 }
1549
1550 if (match(Def, m_ExtractLastLane(m_VPValue(A))) &&
1551      (vputils::isSingleScalar(A) ||
1552       (isa<VPReplicateRecipe>(A) &&
1553        cast<VPReplicateRecipe>(A)->isSingleScalar())) &&
1554 all_of(A->users(),
1555 [Def, A](VPUser *U) { return U->usesScalars(A) || Def == U; })) {
1556 return Def->replaceAllUsesWith(A);
1557 }
1558
1559 if (Plan->getUF() == 1 && match(Def, m_ExtractLastPart(m_VPValue(A))))
1560 return Def->replaceAllUsesWith(A);
1561}
1562
1563void VPlanTransforms::simplifyRecipes(VPlan &Plan) {
1564  ReversePostOrderTraversal<VPBlockDeepTraversalWrapper<VPBlockBase *>> RPOT(
1565      Plan.getEntry());
1566 VPTypeAnalysis TypeInfo(Plan);
1567  for (VPBasicBlock *VPBB : VPBlockUtils::blocksOnly<VPBasicBlock>(RPOT)) {
1568    for (VPRecipeBase &R : make_early_inc_range(*VPBB))
1569 if (auto *Def = dyn_cast<VPSingleDefRecipe>(&R))
1570 simplifyRecipe(Def, TypeInfo);
1571 }
1572}
1573
1574static void narrowToSingleScalarRecipes(VPlan &Plan) {
1575  if (Plan.hasScalarVFOnly())
1576 return;
1577
1578 // Try to narrow wide and replicating recipes to single scalar recipes,
1579 // based on VPlan analysis. Only process blocks in the loop region for now,
1580 // without traversing into nested regions, as recipes in replicate regions
1581 // cannot be converted yet.
1582  for (VPBasicBlock *VPBB : VPBlockUtils::blocksOnly<VPBasicBlock>(
1583           vp_depth_first_shallow(Plan.getVectorLoopRegion()->getEntry()))) {
1584    for (VPRecipeBase &R : make_early_inc_range(reverse(*VPBB))) {
1587 continue;
1588 auto *RepR = dyn_cast<VPReplicateRecipe>(&R);
1589 if (RepR && (RepR->isSingleScalar() || RepR->isPredicated()))
1590 continue;
1591
1592      // Convert an unmasked scatter with a uniform address into
1593 // extract-last-lane + scalar store.
1594 // TODO: Add a profitability check comparing the cost of a scatter vs.
1595 // extract + scalar store.
1596 auto *WidenStoreR = dyn_cast<VPWidenStoreRecipe>(&R);
1597 if (WidenStoreR && vputils::isSingleScalar(WidenStoreR->getAddr()) &&
1598 !WidenStoreR->isConsecutive()) {
1599 assert(!WidenStoreR->isReverse() &&
1600 "Not consecutive memory recipes shouldn't be reversed");
1601 VPValue *Mask = WidenStoreR->getMask();
1602
1603 // Only convert the scatter to a scalar store if it is unmasked.
1604 // TODO: Support converting scatter masked by the header mask to scalar
1605 // store.
1606 if (Mask)
1607 continue;
1608
1609        auto *Extract = new VPInstruction(VPInstruction::ExtractLastLane,
1610                                          {WidenStoreR->getOperand(1)});
1611 Extract->insertBefore(WidenStoreR);
1612
1613 // TODO: Sink the scalar store recipe to middle block if possible.
1614 auto *ScalarStore = new VPReplicateRecipe(
1615 &WidenStoreR->getIngredient(), {Extract, WidenStoreR->getAddr()},
1616 true /*IsSingleScalar*/, nullptr /*Mask*/, {},
1617 *WidenStoreR /*Metadata*/);
1618 ScalarStore->insertBefore(WidenStoreR);
1619 WidenStoreR->eraseFromParent();
1620 continue;
1621 }
1622
1623 auto *RepOrWidenR = dyn_cast<VPRecipeWithIRFlags>(&R);
1624 if (RepR && isa<StoreInst>(RepR->getUnderlyingInstr()) &&
1625 vputils::isSingleScalar(RepR->getOperand(1))) {
1626 auto *Clone = new VPReplicateRecipe(
1627 RepOrWidenR->getUnderlyingInstr(), RepOrWidenR->operands(),
1628 true /*IsSingleScalar*/, nullptr /*Mask*/, *RepR /*Flags*/,
1629 *RepR /*Metadata*/, RepR->getDebugLoc());
1630 Clone->insertBefore(RepOrWidenR);
1631 VPBuilder Builder(Clone);
1632 VPValue *ExtractOp = Clone->getOperand(0);
1633 if (vputils::isUniformAcrossVFsAndUFs(RepR->getOperand(1)))
1634 ExtractOp =
1635 Builder.createNaryOp(VPInstruction::ExtractLastPart, ExtractOp);
1636 ExtractOp =
1637 Builder.createNaryOp(VPInstruction::ExtractLastLane, ExtractOp);
1638 Clone->setOperand(0, ExtractOp);
1639 RepR->eraseFromParent();
1640 continue;
1641 }
1642
1643 // Skip recipes that aren't single scalars.
1644 if (!RepOrWidenR || !vputils::isSingleScalar(RepOrWidenR))
1645 continue;
1646
1647      // Skip recipes for which conversion to single-scalar would introduce
1648      // additional broadcasts. No extra broadcasts are needed if either only
1649      // the scalars of the recipe are used, or at least one of the operands
1650      // would require a broadcast. In the latter case, the single-scalar may
1651      // need to be broadcast, but another broadcast is removed.
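      // Editor's note (illustrative, not part of the original source): e.g. a
      // widen recipe whose only user is extract-last-lane needs no broadcast
      // after narrowing, and if its sole-use operand is a non-constant live-in,
      // narrowing removes the broadcast that live-in would otherwise require.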
1652 if (!all_of(RepOrWidenR->users(),
1653 [RepOrWidenR](const VPUser *U) {
1654 if (auto *VPI = dyn_cast<VPInstruction>(U)) {
1655 unsigned Opcode = VPI->getOpcode();
1656 if (Opcode == VPInstruction::ExtractLastLane ||
1657 Opcode == VPInstruction::ExtractLastPart ||
1658 Opcode == VPInstruction::ExtractPenultimateElement)
1659 return true;
1660 }
1661
1662 return U->usesScalars(RepOrWidenR);
1663 }) &&
1664 none_of(RepOrWidenR->operands(), [RepOrWidenR](VPValue *Op) {
1665 if (Op->getSingleUser() != RepOrWidenR)
1666 return false;
1667 // Non-constant live-ins require broadcasts, while constants do not
1668 // need explicit broadcasts.
1669 bool LiveInNeedsBroadcast =
1670 Op->isLiveIn() && !isa<Constant>(Op->getLiveInIRValue());
1671 auto *OpR = dyn_cast<VPReplicateRecipe>(Op);
1672 return LiveInNeedsBroadcast || (OpR && OpR->isSingleScalar());
1673 }))
1674 continue;
1675
1676 auto *Clone = new VPReplicateRecipe(
1677 RepOrWidenR->getUnderlyingInstr(), RepOrWidenR->operands(),
1678 true /*IsSingleScalar*/, nullptr, *RepOrWidenR);
1679 Clone->insertBefore(RepOrWidenR);
1680 RepOrWidenR->replaceAllUsesWith(Clone);
1681 if (isDeadRecipe(*RepOrWidenR))
1682 RepOrWidenR->eraseFromParent();
1683 }
1684 }
1685}
1686
1687/// Check if all of \p Blend's masks share a common logically and'ed value
1688/// and, if so, remove it from the masks.
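/// For example (editor's illustration, not from the original source): if the
/// masks are (M && A) and (M && B) for a common edge mask M, they are
/// rewritten to A and B.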
1690 if (Blend->isNormalized())
1691 return;
1692 VPValue *CommonEdgeMask;
1693 if (!match(Blend->getMask(0),
1694 m_LogicalAnd(m_VPValue(CommonEdgeMask), m_VPValue())))
1695 return;
1696 for (unsigned I = 0; I < Blend->getNumIncomingValues(); I++)
1697 if (!match(Blend->getMask(I),
1698 m_LogicalAnd(m_Specific(CommonEdgeMask), m_VPValue())))
1699 return;
1700 for (unsigned I = 0; I < Blend->getNumIncomingValues(); I++)
1701 Blend->setMask(I, Blend->getMask(I)->getDefiningRecipe()->getOperand(1));
1702}
1703
1704/// Normalize and simplify VPBlendRecipes. Should be run after simplifyRecipes
1705/// to make sure the masks are simplified.
1706static void simplifyBlends(VPlan &Plan) {
1709 for (VPRecipeBase &R : make_early_inc_range(*VPBB)) {
1710 auto *Blend = dyn_cast<VPBlendRecipe>(&R);
1711 if (!Blend)
1712 continue;
1713
1714 removeCommonBlendMask(Blend);
1715
1716 // Try to remove redundant blend recipes.
1717 SmallPtrSet<VPValue *, 4> UniqueValues;
1718 if (Blend->isNormalized() || !match(Blend->getMask(0), m_False()))
1719 UniqueValues.insert(Blend->getIncomingValue(0));
1720 for (unsigned I = 1; I != Blend->getNumIncomingValues(); ++I)
1721 if (!match(Blend->getMask(I), m_False()))
1722 UniqueValues.insert(Blend->getIncomingValue(I));
1723
1724 if (UniqueValues.size() == 1) {
1725 Blend->replaceAllUsesWith(*UniqueValues.begin());
1726 Blend->eraseFromParent();
1727 continue;
1728 }
1729
1730 if (Blend->isNormalized())
1731 continue;
1732
1733 // Normalize the blend so its first incoming value is used as the initial
1734 // value with the others blended into it.
1735
1736 unsigned StartIndex = 0;
1737 for (unsigned I = 0; I != Blend->getNumIncomingValues(); ++I) {
1738        // If a value's mask is used only by the blend then it can be deadcoded.
1739 // TODO: Find the most expensive mask that can be deadcoded, or a mask
1740 // that's used by multiple blends where it can be removed from them all.
1741 VPValue *Mask = Blend->getMask(I);
1742 if (Mask->getNumUsers() == 1 && !match(Mask, m_False())) {
1743 StartIndex = I;
1744 break;
1745 }
1746 }
1747
1748 SmallVector<VPValue *, 4> OperandsWithMask;
1749 OperandsWithMask.push_back(Blend->getIncomingValue(StartIndex));
1750
1751 for (unsigned I = 0; I != Blend->getNumIncomingValues(); ++I) {
1752 if (I == StartIndex)
1753 continue;
1754 OperandsWithMask.push_back(Blend->getIncomingValue(I));
1755 OperandsWithMask.push_back(Blend->getMask(I));
1756 }
1757
1758 auto *NewBlend =
1759 new VPBlendRecipe(cast_or_null<PHINode>(Blend->getUnderlyingValue()),
1760 OperandsWithMask, Blend->getDebugLoc());
1761 NewBlend->insertBefore(&R);
1762
1763 VPValue *DeadMask = Blend->getMask(StartIndex);
1764 Blend->replaceAllUsesWith(NewBlend);
1765 Blend->eraseFromParent();
1767
1768 /// Simplify BLEND %a, %b, Not(%mask) -> BLEND %b, %a, %mask.
1769 VPValue *NewMask;
1770 if (NewBlend->getNumOperands() == 3 &&
1771 match(NewBlend->getMask(1), m_Not(m_VPValue(NewMask)))) {
1772 VPValue *Inc0 = NewBlend->getOperand(0);
1773 VPValue *Inc1 = NewBlend->getOperand(1);
1774 VPValue *OldMask = NewBlend->getOperand(2);
1775 NewBlend->setOperand(0, Inc1);
1776 NewBlend->setOperand(1, Inc0);
1777 NewBlend->setOperand(2, NewMask);
1778 if (OldMask->getNumUsers() == 0)
1779 cast<VPInstruction>(OldMask)->eraseFromParent();
1780 }
1781 }
1782 }
1783}
1784
1785/// Optimize the width of vector induction variables in \p Plan based on a known
1786/// constant Trip Count, \p BestVF and \p BestUF.
1788 ElementCount BestVF,
1789 unsigned BestUF) {
1790 // Only proceed if we have not completely removed the vector region.
1791 if (!Plan.getVectorLoopRegion())
1792 return false;
1793
1794 const APInt *TC;
1795 if (!BestVF.isFixed() || !match(Plan.getTripCount(), m_APInt(TC)))
1796 return false;
1797
1798 // Calculate the minimum power-of-2 bit width that can fit the known TC, VF
1799 // and UF. Returns at least 8.
1800 auto ComputeBitWidth = [](APInt TC, uint64_t Align) {
1801 APInt AlignedTC =
1804 APInt MaxVal = AlignedTC - 1;
1805 return std::max<unsigned>(PowerOf2Ceil(MaxVal.getActiveBits()), 8);
1806 };
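  // Editor's worked example (illustrative, assuming AlignedTC rounds TC up to
  // a multiple of Align): for TC = 100 and VF * UF = 8, AlignedTC = 104,
  // MaxVal = 103 needs 7 active bits, and max(PowerOf2Ceil(7), 8) = 8, so an
  // i8 induction suffices.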
1807 unsigned NewBitWidth =
1808 ComputeBitWidth(*TC, BestVF.getKnownMinValue() * BestUF);
1809
1810 LLVMContext &Ctx = Plan.getContext();
1811 auto *NewIVTy = IntegerType::get(Ctx, NewBitWidth);
1812
1813 bool MadeChange = false;
1814
1815 VPBasicBlock *HeaderVPBB = Plan.getVectorLoopRegion()->getEntryBasicBlock();
1816 for (VPRecipeBase &Phi : HeaderVPBB->phis()) {
1817 auto *WideIV = dyn_cast<VPWidenIntOrFpInductionRecipe>(&Phi);
1818
1819    // Currently only handle canonical IVs as it is trivial to replace the start
1820    // and stop values, and we only perform the optimization when the
1821    // IV has a single use.
1822 if (!WideIV || !WideIV->isCanonical() ||
1823 WideIV->hasMoreThanOneUniqueUser() ||
1824 NewIVTy == WideIV->getScalarType())
1825 continue;
1826
1827 // Currently only handle cases where the single user is a header-mask
1828 // comparison with the backedge-taken-count.
1829 VPUser *SingleUser = WideIV->getSingleUser();
1830 if (!SingleUser ||
1831 !match(SingleUser, m_ICmp(m_Specific(WideIV),
1834 continue;
1835
1836 // Update IV operands and comparison bound to use new narrower type.
1837 auto *NewStart = Plan.getConstantInt(NewIVTy, 0);
1838 WideIV->setStartValue(NewStart);
1839 auto *NewStep = Plan.getConstantInt(NewIVTy, 1);
1840 WideIV->setStepValue(NewStep);
1841
1842 auto *NewBTC = new VPWidenCastRecipe(
1843 Instruction::Trunc, Plan.getOrCreateBackedgeTakenCount(), NewIVTy);
1844 Plan.getVectorPreheader()->appendRecipe(NewBTC);
1845 auto *Cmp = cast<VPInstruction>(WideIV->getSingleUser());
1846 Cmp->setOperand(1, NewBTC);
1847
1848 MadeChange = true;
1849 }
1850
1851 return MadeChange;
1852}
1853
1854/// Return true if \p Cond is known to be true for the given \p BestVF and
1855/// \p BestUF.
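/// For example (editor's illustration): a latch terminator of the form
/// branch-on-cond (icmp eq CanIV + VFxUF, vector-trip-count) is known true
/// when the vector trip count is known to equal VF * UF.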
1857 ElementCount BestVF, unsigned BestUF,
1860 return any_of(Cond->getDefiningRecipe()->operands(), [&Plan, BestVF, BestUF,
1861 &PSE](VPValue *C) {
1862 return isConditionTrueViaVFAndUF(C, Plan, BestVF, BestUF, PSE);
1863 });
1864
1865 auto *CanIV = Plan.getVectorLoopRegion()->getCanonicalIV();
1867 m_Specific(CanIV->getBackedgeValue()),
1868 m_Specific(&Plan.getVectorTripCount()))))
1869 return false;
1870
1871 // The compare checks CanIV + VFxUF == vector trip count. The vector trip
1872 // count is not conveniently available as SCEV so far, so we compare directly
1873 // against the original trip count. This is stricter than necessary, as we
1874 // will only return true if the trip count == vector trip count.
1875 const SCEV *VectorTripCount =
1877 if (isa<SCEVCouldNotCompute>(VectorTripCount))
1878 VectorTripCount = vputils::getSCEVExprForVPValue(Plan.getTripCount(), PSE);
1879 assert(!isa<SCEVCouldNotCompute>(VectorTripCount) &&
1880 "Trip count SCEV must be computable");
1881 ScalarEvolution &SE = *PSE.getSE();
1882 ElementCount NumElements = BestVF.multiplyCoefficientBy(BestUF);
1883 const SCEV *C = SE.getElementCount(VectorTripCount->getType(), NumElements);
1884 return SE.isKnownPredicate(CmpInst::ICMP_EQ, VectorTripCount, C);
1885}
1886
1887/// Try to replace multiple active lane masks used for control flow with
1888/// a single, wide active lane mask instruction followed by multiple
1889/// extract subvector intrinsics. This applies to the active lane mask
1890/// instructions both in the loop and in the preheader.
1891/// Incoming values of all ActiveLaneMaskPHIs are updated to use the
1892/// new extracts from the first active lane mask, which has its last
1893/// operand (multiplier) set to UF.
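/// Editor's sketch (illustrative): with VF = 4 and UF = 2, a single wide mask
/// covering 8 lanes is created, and the two per-part masks become
/// llvm.vector.extract calls at lane offsets 0 and 4.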
1895 unsigned UF) {
1896 if (!EnableWideActiveLaneMask || !VF.isVector() || UF == 1)
1897 return false;
1898
1899 VPRegionBlock *VectorRegion = Plan.getVectorLoopRegion();
1900 VPBasicBlock *ExitingVPBB = VectorRegion->getExitingBasicBlock();
1901 auto *Term = &ExitingVPBB->back();
1902
1903 using namespace llvm::VPlanPatternMatch;
1905 m_VPValue(), m_VPValue(), m_VPValue())))))
1906 return false;
1907
1908 auto *Header = cast<VPBasicBlock>(VectorRegion->getEntry());
1909 LLVMContext &Ctx = Plan.getContext();
1910
1911 auto ExtractFromALM = [&](VPInstruction *ALM,
1912 SmallVectorImpl<VPValue *> &Extracts) {
1913 DebugLoc DL = ALM->getDebugLoc();
1914 for (unsigned Part = 0; Part < UF; ++Part) {
1916 Ops.append({ALM, Plan.getOrAddLiveIn(
1917 ConstantInt::get(IntegerType::getInt64Ty(Ctx),
1918 VF.getKnownMinValue() * Part))});
1919 auto *Ext =
1920 new VPWidenIntrinsicRecipe(Intrinsic::vector_extract, Ops,
1921 IntegerType::getInt1Ty(Ctx), {}, {}, DL);
1922 Extracts[Part] = Ext;
1923 Ext->insertAfter(ALM);
1924 }
1925 };
1926
1927 // Create a list of each active lane mask phi, ordered by unroll part.
1929 for (VPRecipeBase &R : Header->phis()) {
1931 if (!Phi)
1932 continue;
1933 VPValue *Index = nullptr;
1934 match(Phi->getBackedgeValue(),
1936 assert(Index && "Expected index from ActiveLaneMask instruction");
1937
1938 uint64_t Part;
1939 if (match(Index,
1941 m_VPValue(), m_ConstantInt(Part))))
1942 Phis[Part] = Phi;
1943 else
1944 // Anything other than a CanonicalIVIncrementForPart is part 0
1945 Phis[0] = Phi;
1946 }
1947
1948 assert(all_of(Phis, [](VPActiveLaneMaskPHIRecipe *Phi) { return Phi; }) &&
1949 "Expected one VPActiveLaneMaskPHIRecipe for each unroll part");
1950
1951 auto *EntryALM = cast<VPInstruction>(Phis[0]->getStartValue());
1952 auto *LoopALM = cast<VPInstruction>(Phis[0]->getBackedgeValue());
1953
1954 assert((EntryALM->getOpcode() == VPInstruction::ActiveLaneMask &&
1955 LoopALM->getOpcode() == VPInstruction::ActiveLaneMask) &&
1956 "Expected incoming values of Phi to be ActiveLaneMasks");
1957
1958 // When using wide lane masks, the return type of the get.active.lane.mask
1959 // intrinsic is VF x UF (last operand).
1960 VPValue *ALMMultiplier = Plan.getConstantInt(64, UF);
1961 EntryALM->setOperand(2, ALMMultiplier);
1962 LoopALM->setOperand(2, ALMMultiplier);
1963
1964 // Create UF x extract vectors and insert into preheader.
1965 SmallVector<VPValue *> EntryExtracts(UF);
1966 ExtractFromALM(EntryALM, EntryExtracts);
1967
1968 // Create UF x extract vectors and insert before the loop compare & branch,
1969 // updating the compare to use the first extract.
1970 SmallVector<VPValue *> LoopExtracts(UF);
1971 ExtractFromALM(LoopALM, LoopExtracts);
1972 VPInstruction *Not = cast<VPInstruction>(Term->getOperand(0));
1973 Not->setOperand(0, LoopExtracts[0]);
1974
1975 // Update the incoming values of active lane mask phis.
1976 for (unsigned Part = 0; Part < UF; ++Part) {
1977 Phis[Part]->setStartValue(EntryExtracts[Part]);
1978 Phis[Part]->setBackedgeValue(LoopExtracts[Part]);
1979 }
1980
1981 return true;
1982}
1983
1984/// Try to simplify the branch condition of \p Plan. This may restrict the
1985/// resulting plan to \p BestVF and \p BestUF.
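/// Editor's sketch (illustrative): if the vector trip count is known to be at
/// most VF * UF, the vector loop body executes exactly once; the region is
/// then dissolved when all header phis can be flattened, otherwise the latch
/// terminator is replaced by branch-on-cond true.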
1987 unsigned BestUF,
1989 VPRegionBlock *VectorRegion = Plan.getVectorLoopRegion();
1990 VPBasicBlock *ExitingVPBB = VectorRegion->getExitingBasicBlock();
1991 auto *Term = &ExitingVPBB->back();
1992 VPValue *Cond;
1993 if (match(Term, m_BranchOnCount()) ||
1995 m_VPValue(), m_VPValue(), m_VPValue()))))) {
1996 // Try to simplify the branch condition if VectorTC <= VF * UF when the
1997 // latch terminator is BranchOnCount or BranchOnCond(Not(ActiveLaneMask)).
1998 const SCEV *VectorTripCount =
2000 if (isa<SCEVCouldNotCompute>(VectorTripCount))
2001 VectorTripCount =
2003 assert(!isa<SCEVCouldNotCompute>(VectorTripCount) &&
2004 "Trip count SCEV must be computable");
2005 ScalarEvolution &SE = *PSE.getSE();
2006 ElementCount NumElements = BestVF.multiplyCoefficientBy(BestUF);
2007 const SCEV *C = SE.getElementCount(VectorTripCount->getType(), NumElements);
2008 if (!SE.isKnownPredicate(CmpInst::ICMP_ULE, VectorTripCount, C))
2009 return false;
2010 } else if (match(Term, m_BranchOnCond(m_VPValue(Cond)))) {
2011 // For BranchOnCond, check if we can prove the condition to be true using VF
2012 // and UF.
2013 if (!isConditionTrueViaVFAndUF(Cond, Plan, BestVF, BestUF, PSE))
2014 return false;
2015 } else {
2016 return false;
2017 }
2018
2019 // The vector loop region only executes once. If possible, completely remove
2020 // the region, otherwise replace the terminator controlling the latch with
2021 // (BranchOnCond true).
2022 // TODO: VPWidenIntOrFpInductionRecipe is only partially supported; add
2023 // support for other non-canonical widen induction recipes (e.g.,
2024 // VPWidenPointerInductionRecipe).
2025 auto *Header = cast<VPBasicBlock>(VectorRegion->getEntry());
2026 if (all_of(Header->phis(), [](VPRecipeBase &Phi) {
2027 if (auto *R = dyn_cast<VPWidenIntOrFpInductionRecipe>(&Phi))
2028 return R->isCanonical();
2029 return isa<VPCanonicalIVPHIRecipe, VPEVLBasedIVPHIRecipe,
2030 VPFirstOrderRecurrencePHIRecipe, VPPhi>(&Phi);
2031 })) {
2032 for (VPRecipeBase &HeaderR : make_early_inc_range(Header->phis())) {
2033 if (auto *R = dyn_cast<VPWidenIntOrFpInductionRecipe>(&HeaderR)) {
2034 VPBuilder Builder(Plan.getVectorPreheader());
2035 VPValue *StepV = Builder.createNaryOp(VPInstruction::StepVector, {},
2036 R->getScalarType());
2037 HeaderR.getVPSingleValue()->replaceAllUsesWith(StepV);
2038 HeaderR.eraseFromParent();
2039 continue;
2040 }
2041 auto *Phi = cast<VPPhiAccessors>(&HeaderR);
2042 HeaderR.getVPSingleValue()->replaceAllUsesWith(Phi->getIncomingValue(0));
2043 HeaderR.eraseFromParent();
2044 }
2045
2046 VPBlockBase *Preheader = VectorRegion->getSinglePredecessor();
2047 VPBlockBase *Exit = VectorRegion->getSingleSuccessor();
2048 VPBlockUtils::disconnectBlocks(Preheader, VectorRegion);
2049 VPBlockUtils::disconnectBlocks(VectorRegion, Exit);
2050
2051 for (VPBlockBase *B : vp_depth_first_shallow(VectorRegion->getEntry()))
2052 B->setParent(nullptr);
2053
2054 VPBlockUtils::connectBlocks(Preheader, Header);
2055 VPBlockUtils::connectBlocks(ExitingVPBB, Exit);
2057 } else {
2058 // The vector region contains header phis for which we cannot remove the
2059 // loop region yet.
2060 auto *BOC = new VPInstruction(VPInstruction::BranchOnCond, {Plan.getTrue()},
2061 {}, {}, Term->getDebugLoc());
2062 ExitingVPBB->appendRecipe(BOC);
2063 }
2064
2065 Term->eraseFromParent();
2066
2067 return true;
2068}
2069
2070/// From the definition of llvm.experimental.get.vector.length,
2071/// VPInstruction::ExplicitVectorLength(%AVL) = %AVL when %AVL <= VF.
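/// Editor's sketch (illustrative): when SCEV proves AVL <= VF, the
/// explicit-vector-length recipe is replaced by AVL itself, cast to the
/// recipe's i32 result type.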
2075 vp_depth_first_deep(Plan.getEntry()))) {
2076 for (VPRecipeBase &R : *VPBB) {
2077 VPValue *AVL;
2078 if (!match(&R, m_EVL(m_VPValue(AVL))))
2079 continue;
2080
2081 const SCEV *AVLSCEV = vputils::getSCEVExprForVPValue(AVL, PSE);
2082 if (isa<SCEVCouldNotCompute>(AVLSCEV))
2083 continue;
2084 ScalarEvolution &SE = *PSE.getSE();
2085 const SCEV *VFSCEV = SE.getElementCount(AVLSCEV->getType(), VF);
2086 if (!SE.isKnownPredicate(CmpInst::ICMP_ULE, AVLSCEV, VFSCEV))
2087 continue;
2088
2090 AVL, Type::getInt32Ty(Plan.getContext()), AVLSCEV->getType(),
2091 R.getDebugLoc());
2092 R.getVPSingleValue()->replaceAllUsesWith(Trunc);
2093 return true;
2094 }
2095 }
2096 return false;
2097}
2098
2100 unsigned BestUF,
2102 assert(Plan.hasVF(BestVF) && "BestVF is not available in Plan");
2103 assert(Plan.hasUF(BestUF) && "BestUF is not available in Plan");
2104
2105 bool MadeChange = tryToReplaceALMWithWideALM(Plan, BestVF, BestUF);
2106 MadeChange |= simplifyBranchConditionForVFAndUF(Plan, BestVF, BestUF, PSE);
2107 MadeChange |= optimizeVectorInductionWidthForTCAndVFUF(Plan, BestVF, BestUF);
2108 MadeChange |= simplifyKnownEVL(Plan, BestVF, PSE);
2109
2110 if (MadeChange) {
2111 Plan.setVF(BestVF);
2112 assert(Plan.getUF() == BestUF && "BestUF must match the Plan's UF");
2113 }
2114}
2115
2116/// Sink users of \p FOR after the recipe defining the previous value \p
2117/// Previous of the recurrence. \returns true if all users of \p FOR could be
2118/// re-arranged as needed or false if it is not possible.
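/// Editor's sketch (illustrative): given
///   %for  = first-order-recurrence-phi [ %init ], [ %prev ]
///   %user = add %for, 1
///   %prev = load ...
/// %user is sunk after %prev so that a recurrence splice can later be
/// inserted after %prev.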
2119static bool
2121 VPRecipeBase *Previous,
2122 VPDominatorTree &VPDT) {
2123 // Collect recipes that need sinking.
2126 Seen.insert(Previous);
2127 auto TryToPushSinkCandidate = [&](VPRecipeBase *SinkCandidate) {
2128    // The previous value must not depend on the users of the recurrence phi;
2129    // if it does, FOR is not a fixed-order recurrence.
2130 if (SinkCandidate == Previous)
2131 return false;
2132
2133 if (isa<VPHeaderPHIRecipe>(SinkCandidate) ||
2134 !Seen.insert(SinkCandidate).second ||
2135 VPDT.properlyDominates(Previous, SinkCandidate))
2136 return true;
2137
2138 if (cannotHoistOrSinkRecipe(*SinkCandidate))
2139 return false;
2140
2141 WorkList.push_back(SinkCandidate);
2142 return true;
2143 };
2144
2145 // Recursively sink users of FOR after Previous.
2146 WorkList.push_back(FOR);
2147 for (unsigned I = 0; I != WorkList.size(); ++I) {
2148 VPRecipeBase *Current = WorkList[I];
2149 assert(Current->getNumDefinedValues() == 1 &&
2150 "only recipes with a single defined value expected");
2151
2152 for (VPUser *User : Current->getVPSingleValue()->users()) {
2153 if (!TryToPushSinkCandidate(cast<VPRecipeBase>(User)))
2154 return false;
2155 }
2156 }
2157
2158 // Keep recipes to sink ordered by dominance so earlier instructions are
2159 // processed first.
2160 sort(WorkList, [&VPDT](const VPRecipeBase *A, const VPRecipeBase *B) {
2161 return VPDT.properlyDominates(A, B);
2162 });
2163
2164 for (VPRecipeBase *SinkCandidate : WorkList) {
2165 if (SinkCandidate == FOR)
2166 continue;
2167
2168 SinkCandidate->moveAfter(Previous);
2169 Previous = SinkCandidate;
2170 }
2171 return true;
2172}
2173
2174/// Try to hoist \p Previous and its operands before all users of \p FOR.
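/// Editor's sketch (illustrative): if sinking fails, e.g. because a user of
/// \p FOR has side effects and cannot be moved, \p Previous and the operands
/// it depends on are hoisted before the earliest user of \p FOR instead, as
/// long as none of them depends on \p FOR.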
2176 VPRecipeBase *Previous,
2177 VPDominatorTree &VPDT) {
2178 if (cannotHoistOrSinkRecipe(*Previous))
2179 return false;
2180
2181 // Collect recipes that need hoisting.
2182 SmallVector<VPRecipeBase *> HoistCandidates;
2184 VPRecipeBase *HoistPoint = nullptr;
2185 // Find the closest hoist point by looking at all users of FOR and selecting
2186 // the recipe dominating all other users.
2187 for (VPUser *U : FOR->users()) {
2188 auto *R = cast<VPRecipeBase>(U);
2189 if (!HoistPoint || VPDT.properlyDominates(R, HoistPoint))
2190 HoistPoint = R;
2191 }
2192 assert(all_of(FOR->users(),
2193 [&VPDT, HoistPoint](VPUser *U) {
2194 auto *R = cast<VPRecipeBase>(U);
2195 return HoistPoint == R ||
2196 VPDT.properlyDominates(HoistPoint, R);
2197 }) &&
2198 "HoistPoint must dominate all users of FOR");
2199
2200 auto NeedsHoisting = [HoistPoint, &VPDT,
2201 &Visited](VPValue *HoistCandidateV) -> VPRecipeBase * {
2202 VPRecipeBase *HoistCandidate = HoistCandidateV->getDefiningRecipe();
2203 if (!HoistCandidate)
2204 return nullptr;
2205 VPRegionBlock *EnclosingLoopRegion =
2206 HoistCandidate->getParent()->getEnclosingLoopRegion();
2207 assert((!HoistCandidate->getRegion() ||
2208 HoistCandidate->getRegion() == EnclosingLoopRegion) &&
2209 "CFG in VPlan should still be flat, without replicate regions");
2210 // Hoist candidate was already visited, no need to hoist.
2211 if (!Visited.insert(HoistCandidate).second)
2212 return nullptr;
2213
2214    // The candidate is outside the loop region or is a header phi; it dominates
2215    // FOR's users without hoisting.
2216 if (!EnclosingLoopRegion || isa<VPHeaderPHIRecipe>(HoistCandidate))
2217 return nullptr;
2218
2219 // If we reached a recipe that dominates HoistPoint, we don't need to
2220 // hoist the recipe.
2221 if (VPDT.properlyDominates(HoistCandidate, HoistPoint))
2222 return nullptr;
2223 return HoistCandidate;
2224 };
2225
2226 if (!NeedsHoisting(Previous->getVPSingleValue()))
2227 return true;
2228
2229 // Recursively try to hoist Previous and its operands before all users of FOR.
2230 HoistCandidates.push_back(Previous);
2231
2232 for (unsigned I = 0; I != HoistCandidates.size(); ++I) {
2233 VPRecipeBase *Current = HoistCandidates[I];
2234 assert(Current->getNumDefinedValues() == 1 &&
2235 "only recipes with a single defined value expected");
2236 if (cannotHoistOrSinkRecipe(*Current))
2237 return false;
2238
2239 for (VPValue *Op : Current->operands()) {
2240 // If we reach FOR, it means the original Previous depends on some other
2241 // recurrence that in turn depends on FOR. If that is the case, we would
2242 // also need to hoist recipes involving the other FOR, which may break
2243 // dependencies.
2244 if (Op == FOR)
2245 return false;
2246
2247 if (auto *R = NeedsHoisting(Op)) {
2248 // Bail out if the recipe defines multiple values.
2249 // TODO: Hoisting such recipes requires additional handling.
2250 if (R->getNumDefinedValues() != 1)
2251 return false;
2252 HoistCandidates.push_back(R);
2253 }
2254 }
2255 }
2256
2257 // Order recipes to hoist by dominance so earlier instructions are processed
2258 // first.
2259 sort(HoistCandidates, [&VPDT](const VPRecipeBase *A, const VPRecipeBase *B) {
2260 return VPDT.properlyDominates(A, B);
2261 });
2262
2263 for (VPRecipeBase *HoistCandidate : HoistCandidates) {
2264 HoistCandidate->moveBefore(*HoistPoint->getParent(),
2265 HoistPoint->getIterator());
2266 }
2267
2268 return true;
2269}
2270
2272 VPBuilder &LoopBuilder) {
2273 VPDominatorTree VPDT(Plan);
2274
2276 for (VPRecipeBase &R :
2279 RecurrencePhis.push_back(FOR);
2280
2281 for (VPFirstOrderRecurrencePHIRecipe *FOR : RecurrencePhis) {
2283 VPRecipeBase *Previous = FOR->getBackedgeValue()->getDefiningRecipe();
2284 // Fixed-order recurrences do not contain cycles, so this loop is guaranteed
2285 // to terminate.
2286 while (auto *PrevPhi =
2288 assert(PrevPhi->getParent() == FOR->getParent());
2289 assert(SeenPhis.insert(PrevPhi).second);
2290 Previous = PrevPhi->getBackedgeValue()->getDefiningRecipe();
2291 }
2292
2293 if (!sinkRecurrenceUsersAfterPrevious(FOR, Previous, VPDT) &&
2294 !hoistPreviousBeforeFORUsers(FOR, Previous, VPDT))
2295 return false;
2296
2297 // Introduce a recipe to combine the incoming and previous values of a
2298 // fixed-order recurrence.
2299 VPBasicBlock *InsertBlock = Previous->getParent();
2300 if (isa<VPHeaderPHIRecipe>(Previous))
2301 LoopBuilder.setInsertPoint(InsertBlock, InsertBlock->getFirstNonPhi());
2302 else
2303 LoopBuilder.setInsertPoint(InsertBlock,
2304 std::next(Previous->getIterator()));
2305
2306 auto *RecurSplice =
2308 {FOR, FOR->getBackedgeValue()});
2309
2310 FOR->replaceAllUsesWith(RecurSplice);
2311 // Set the first operand of RecurSplice to FOR again, after replacing
2312 // all users.
2313 RecurSplice->setOperand(0, FOR);
2314
2315 // Check for users extracting at the penultimate active lane of the FOR.
2316 // If only a single lane is active in the current iteration, we need to
2317 // select the last element from the previous iteration (from the FOR phi
2318 // directly).
2319 for (VPUser *U : RecurSplice->users()) {
2321 m_Specific(RecurSplice))))
2322 continue;
2323
2325 VPValue *LastActiveLane = cast<VPInstruction>(U)->getOperand(0);
2326 Type *I64Ty = Type::getInt64Ty(Plan.getContext());
2327 VPValue *Zero = Plan.getOrAddLiveIn(ConstantInt::get(I64Ty, 0));
2328 VPValue *One = Plan.getOrAddLiveIn(ConstantInt::get(I64Ty, 1));
2329 VPValue *PenultimateIndex =
2330 B.createNaryOp(Instruction::Sub, {LastActiveLane, One});
2331 VPValue *PenultimateLastIter =
2332 B.createNaryOp(VPInstruction::ExtractLane,
2333 {PenultimateIndex, FOR->getBackedgeValue()});
2334 VPValue *LastPrevIter =
2335 B.createNaryOp(VPInstruction::ExtractLastLane, FOR);
2336
2337 VPValue *Cmp = B.createICmp(CmpInst::ICMP_EQ, LastActiveLane, Zero);
2338 VPValue *Sel = B.createSelect(Cmp, LastPrevIter, PenultimateLastIter);
2339 cast<VPInstruction>(U)->replaceAllUsesWith(Sel);
2340 }
2341 }
2342 return true;
2343}
2344
2346 for (VPRecipeBase &R :
2348 auto *PhiR = dyn_cast<VPReductionPHIRecipe>(&R);
2349 if (!PhiR)
2350 continue;
2351 RecurKind RK = PhiR->getRecurrenceKind();
2352 if (RK != RecurKind::Add && RK != RecurKind::Mul && RK != RecurKind::Sub &&
2354 continue;
2355
2356 for (VPUser *U : collectUsersRecursively(PhiR))
2357 if (auto *RecWithFlags = dyn_cast<VPRecipeWithIRFlags>(U)) {
2358 RecWithFlags->dropPoisonGeneratingFlags();
2359 }
2360 }
2361}
2362
2363namespace {
2364struct VPCSEDenseMapInfo : public DenseMapInfo<VPSingleDefRecipe *> {
2365 static bool isSentinel(const VPSingleDefRecipe *Def) {
2366 return Def == getEmptyKey() || Def == getTombstoneKey();
2367 }
2368
2369 /// If recipe \p R will lower to a GEP with a non-i8 source element type,
2370 /// return that source element type.
2371 static Type *getGEPSourceElementType(const VPSingleDefRecipe *R) {
2372 // All VPInstructions that lower to GEPs must have the i8 source element
2373 // type (as they are PtrAdds), so we omit it.
2375 .Case<VPReplicateRecipe>([](auto *I) -> Type * {
2376 if (auto *GEP = dyn_cast<GetElementPtrInst>(I->getUnderlyingValue()))
2377 return GEP->getSourceElementType();
2378 return nullptr;
2379 })
2380 .Case<VPVectorPointerRecipe, VPWidenGEPRecipe>(
2381 [](auto *I) { return I->getSourceElementType(); })
2382 .Default([](auto *) { return nullptr; });
2383 }
2384
2385  /// Returns true if recipe \p Def can be safely handled for CSE.
2386 static bool canHandle(const VPSingleDefRecipe *Def) {
2387 // We can extend the list of handled recipes in the future,
2388 // provided we account for the data embedded in them while checking for
2389 // equality or hashing.
2390 auto C = getOpcodeOrIntrinsicID(Def);
2391
2392 // The issue with (Insert|Extract)Value is that the index of the
2393 // insert/extract is not a proper operand in LLVM IR, and hence also not in
2394 // VPlan.
2395 if (!C || (!C->first && (C->second == Instruction::InsertValue ||
2396 C->second == Instruction::ExtractValue)))
2397 return false;
2398
2399 // During CSE, we can only handle recipes that don't read from memory: if
2400 // they read from memory, there could be an intervening write to memory
2401 // before the next instance is CSE'd, leading to an incorrect result.
2402 return !Def->mayReadFromMemory();
2403 }
2404
2405 /// Hash the underlying data of \p Def.
2406 static unsigned getHashValue(const VPSingleDefRecipe *Def) {
2407 const VPlan *Plan = Def->getParent()->getPlan();
2408 VPTypeAnalysis TypeInfo(*Plan);
2409 hash_code Result = hash_combine(
2410 Def->getVPDefID(), getOpcodeOrIntrinsicID(Def),
2411 getGEPSourceElementType(Def), TypeInfo.inferScalarType(Def),
2413 if (auto *RFlags = dyn_cast<VPRecipeWithIRFlags>(Def))
2414 if (RFlags->hasPredicate())
2415 return hash_combine(Result, RFlags->getPredicate());
2416 return Result;
2417 }
2418
2419 /// Check equality of underlying data of \p L and \p R.
2420 static bool isEqual(const VPSingleDefRecipe *L, const VPSingleDefRecipe *R) {
2421 if (isSentinel(L) || isSentinel(R))
2422 return L == R;
2423 if (L->getVPDefID() != R->getVPDefID() ||
2425 getGEPSourceElementType(L) != getGEPSourceElementType(R) ||
2427 !equal(L->operands(), R->operands()))
2428 return false;
2430 "must have valid opcode info for both recipes");
2431 if (auto *LFlags = dyn_cast<VPRecipeWithIRFlags>(L))
2432 if (LFlags->hasPredicate() &&
2433 LFlags->getPredicate() !=
2434 cast<VPRecipeWithIRFlags>(R)->getPredicate())
2435 return false;
2436 // Recipes in replicate regions implicitly depend on predicate. If either
2437 // recipe is in a replicate region, only consider them equal if both have
2438 // the same parent.
2439 const VPRegionBlock *RegionL = L->getRegion();
2440 const VPRegionBlock *RegionR = R->getRegion();
2441 if (((RegionL && RegionL->isReplicator()) ||
2442 (RegionR && RegionR->isReplicator())) &&
2443 L->getParent() != R->getParent())
2444 return false;
2445 const VPlan *Plan = L->getParent()->getPlan();
2446 VPTypeAnalysis TypeInfo(*Plan);
2447 return TypeInfo.inferScalarType(L) == TypeInfo.inferScalarType(R);
2448 }
2449};
2450} // end anonymous namespace
2451
2452/// Perform a common-subexpression-elimination of VPSingleDefRecipes on the \p
2453/// Plan.
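/// Editor's sketch (illustrative): two recipes with the same opcode, operands
/// and inferred scalar type are deduplicated by replacing the later one with
/// the earlier, dominating one, intersecting their IR flags.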
2455 VPDominatorTree VPDT(Plan);
2457
2459 vp_depth_first_deep(Plan.getEntry()))) {
2460 for (VPRecipeBase &R : *VPBB) {
2461 auto *Def = dyn_cast<VPSingleDefRecipe>(&R);
2462 if (!Def || !VPCSEDenseMapInfo::canHandle(Def))
2463 continue;
2464 if (VPSingleDefRecipe *V = CSEMap.lookup(Def)) {
2465 // V must dominate Def for a valid replacement.
2466 if (!VPDT.dominates(V->getParent(), VPBB))
2467 continue;
2468 // Only keep flags present on both V and Def.
2469 if (auto *RFlags = dyn_cast<VPRecipeWithIRFlags>(V))
2470 RFlags->intersectFlags(*cast<VPRecipeWithIRFlags>(Def));
2471 Def->replaceAllUsesWith(V);
2472 continue;
2473 }
2474 CSEMap[Def] = Def;
2475 }
2476 }
2477}
2478
2479/// Move loop-invariant recipes out of the vector loop region in \p Plan.
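/// Editor's sketch (illustrative): a recipe whose operands are all defined
/// outside the loop region is moved to the vector preheader, provided it is
/// otherwise safe to hoist.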
2480static void licm(VPlan &Plan) {
2481 VPBasicBlock *Preheader = Plan.getVectorPreheader();
2482
2483  // Hoist any loop-invariant recipes from the vector loop region to the
2484  // preheader. Perform a shallow traversal of the vector loop region to
2485 // exclude recipes in replicate regions. Since the top-level blocks in the
2486 // vector loop region are guaranteed to execute if the vector pre-header is,
2487 // we don't need to check speculation safety.
2488 VPRegionBlock *LoopRegion = Plan.getVectorLoopRegion();
2489 assert(Preheader->getSingleSuccessor() == LoopRegion &&
2490         "Expected vector preheader's successor to be the vector loop region");
2492 vp_depth_first_shallow(LoopRegion->getEntry()))) {
2493 for (VPRecipeBase &R : make_early_inc_range(*VPBB)) {
2495 continue;
2496 if (any_of(R.operands(), [](VPValue *Op) {
2497 return !Op->isDefinedOutsideLoopRegions();
2498 }))
2499 continue;
2500 R.moveBefore(*Preheader, Preheader->end());
2501 }
2502 }
2503}
2504
2506 VPlan &Plan, const MapVector<Instruction *, uint64_t> &MinBWs) {
2507 if (Plan.hasScalarVFOnly())
2508 return;
2509 // Keep track of created truncates, so they can be re-used. Note that we
2510  // cannot use RAUW after creating a new truncate, as this could make
2511 // other uses have different types for their operands, making them invalidly
2512 // typed.
2514 VPTypeAnalysis TypeInfo(Plan);
2515 VPBasicBlock *PH = Plan.getVectorPreheader();
2518 for (VPRecipeBase &R : make_early_inc_range(*VPBB)) {
2521 &R))
2522 continue;
2523
2524 VPValue *ResultVPV = R.getVPSingleValue();
2525 auto *UI = cast_or_null<Instruction>(ResultVPV->getUnderlyingValue());
2526 unsigned NewResSizeInBits = MinBWs.lookup(UI);
2527 if (!NewResSizeInBits)
2528 continue;
2529
2530 // If the value wasn't vectorized, we must maintain the original scalar
2531 // type. Skip those here, after incrementing NumProcessedRecipes. Also
2532 // skip casts which do not need to be handled explicitly here, as
2533 // redundant casts will be removed during recipe simplification.
2535 continue;
2536
2537 Type *OldResTy = TypeInfo.inferScalarType(ResultVPV);
2538 unsigned OldResSizeInBits = OldResTy->getScalarSizeInBits();
2539 assert(OldResTy->isIntegerTy() && "only integer types supported");
2540 (void)OldResSizeInBits;
2541
2542 auto *NewResTy = IntegerType::get(Plan.getContext(), NewResSizeInBits);
2543
2544 // Any wrapping introduced by shrinking this operation shouldn't be
2545 // considered undefined behavior. So, we can't unconditionally copy
2546 // arithmetic wrapping flags to VPW.
2547 if (auto *VPW = dyn_cast<VPRecipeWithIRFlags>(&R))
2548 VPW->dropPoisonGeneratingFlags();
2549
2550 if (OldResSizeInBits != NewResSizeInBits &&
2551 !match(&R, m_ICmp(m_VPValue(), m_VPValue()))) {
2552 // Extend result to original width.
2553 auto *Ext =
2554 new VPWidenCastRecipe(Instruction::ZExt, ResultVPV, OldResTy);
2555 Ext->insertAfter(&R);
2556 ResultVPV->replaceAllUsesWith(Ext);
2557 Ext->setOperand(0, ResultVPV);
2558 assert(OldResSizeInBits > NewResSizeInBits && "Nothing to shrink?");
2559 } else {
2560 assert(match(&R, m_ICmp(m_VPValue(), m_VPValue())) &&
2561 "Only ICmps should not need extending the result.");
2562 }
2563
2564 assert(!isa<VPWidenStoreRecipe>(&R) && "stores cannot be narrowed");
2566 continue;
2567
2568 // Shrink operands by introducing truncates as needed.
2569 unsigned StartIdx = isa<VPWidenSelectRecipe>(&R) ? 1 : 0;
2570 for (unsigned Idx = StartIdx; Idx != R.getNumOperands(); ++Idx) {
2571 auto *Op = R.getOperand(Idx);
2572 unsigned OpSizeInBits =
2574 if (OpSizeInBits == NewResSizeInBits)
2575 continue;
2576 assert(OpSizeInBits > NewResSizeInBits && "nothing to truncate");
2577 auto [ProcessedIter, IterIsEmpty] = ProcessedTruncs.try_emplace(Op);
2578 if (!IterIsEmpty) {
2579 R.setOperand(Idx, ProcessedIter->second);
2580 continue;
2581 }
2582
2583 VPBuilder Builder;
2584 if (Op->isLiveIn())
2585 Builder.setInsertPoint(PH);
2586 else
2587 Builder.setInsertPoint(&R);
2588 VPWidenCastRecipe *NewOp =
2589 Builder.createWidenCast(Instruction::Trunc, Op, NewResTy);
2590 ProcessedIter->second = NewOp;
2591 R.setOperand(Idx, NewOp);
2592 }
2593
2594 }
2595 }
2596}
2597
2601 VPValue *Cond;
2602 // Skip blocks that are not terminated by BranchOnCond.
2603 if (VPBB->empty() || !match(&VPBB->back(), m_BranchOnCond(m_VPValue(Cond))))
2604 continue;
2605
2606 assert(VPBB->getNumSuccessors() == 2 &&
2607 "Two successors expected for BranchOnCond");
2608 unsigned RemovedIdx;
2609 if (match(Cond, m_True()))
2610 RemovedIdx = 1;
2611 else if (match(Cond, m_False()))
2612 RemovedIdx = 0;
2613 else
2614 continue;
2615
2616 VPBasicBlock *RemovedSucc =
2617 cast<VPBasicBlock>(VPBB->getSuccessors()[RemovedIdx]);
2618 assert(count(RemovedSucc->getPredecessors(), VPBB) == 1 &&
2619 "There must be a single edge between VPBB and its successor");
2620    // Values coming from VPBB into phi recipes of RemovedSucc are removed from
2621 // these recipes.
2622 for (VPRecipeBase &R : RemovedSucc->phis())
2623 cast<VPPhiAccessors>(&R)->removeIncomingValueFor(VPBB);
2624
2625 // Disconnect blocks and remove the terminator. RemovedSucc will be deleted
2626 // automatically on VPlan destruction if it becomes unreachable.
2627 VPBlockUtils::disconnectBlocks(VPBB, RemovedSucc);
2628 VPBB->back().eraseFromParent();
2629 }
2630}
2631
2651
2652// Add a VPActiveLaneMaskPHIRecipe and related recipes to \p Plan and replace
2653// the loop terminator with a branch-on-cond recipe with the negated
2654// active-lane-mask as operand. Note that this turns the loop into an
2655// uncountable one. Only the existing terminator is replaced; all other existing
2656// recipes/users remain unchanged, except for poison-generating flags being
2657// dropped from the canonical IV increment. Return the created
2658// VPActiveLaneMaskPHIRecipe.
2659//
2660// The function uses the following definitions:
2661//
2662// %TripCount = DataWithControlFlowWithoutRuntimeCheck ?
2663// calculate-trip-count-minus-VF (original TC) : original TC
2664// %IncrementValue = DataWithControlFlowWithoutRuntimeCheck ?
2665// CanonicalIVPhi : CanonicalIVIncrement
2666// %StartV is the canonical induction start value.
2667//
2668// The function adds the following recipes:
2669//
2670// vector.ph:
2671// %TripCount = calculate-trip-count-minus-VF (original TC)
2672// [if DataWithControlFlowWithoutRuntimeCheck]
2673// %EntryInc = canonical-iv-increment-for-part %StartV
2674// %EntryALM = active-lane-mask %EntryInc, %TripCount
2675//
2676// vector.body:
2677// ...
2678// %P = active-lane-mask-phi [ %EntryALM, %vector.ph ], [ %ALM, %vector.body ]
2679// ...
2680// %InLoopInc = canonical-iv-increment-for-part %IncrementValue
2681// %ALM = active-lane-mask %InLoopInc, %TripCount
2682// %Negated = Not %ALM
2683// branch-on-cond %Negated
2684//
2687 VPRegionBlock *TopRegion = Plan.getVectorLoopRegion();
2688 VPBasicBlock *EB = TopRegion->getExitingBasicBlock();
2689 auto *CanonicalIVPHI = TopRegion->getCanonicalIV();
2690 VPValue *StartV = CanonicalIVPHI->getStartValue();
2691
2692 auto *CanonicalIVIncrement =
2693 cast<VPInstruction>(CanonicalIVPHI->getBackedgeValue());
2694 // TODO: Check if dropping the flags is needed if
2695 // !DataAndControlFlowWithoutRuntimeCheck.
2696 CanonicalIVIncrement->dropPoisonGeneratingFlags();
2697 DebugLoc DL = CanonicalIVIncrement->getDebugLoc();
2698 // We can't use StartV directly in the ActiveLaneMask VPInstruction, since
2699 // we have to take unrolling into account. Each part needs to start at
2700 // Part * VF
2701 auto *VecPreheader = Plan.getVectorPreheader();
2702 VPBuilder Builder(VecPreheader);
2703
2704 // Create the ActiveLaneMask instruction using the correct start values.
2705 VPValue *TC = Plan.getTripCount();
2706
2707 VPValue *TripCount, *IncrementValue;
2709 // When the loop is guarded by a runtime overflow check for the loop
2710 // induction variable increment by VF, we can increment the value before
2711    // the get.active.lane.mask and use the unmodified trip count.
2712 IncrementValue = CanonicalIVIncrement;
2713 TripCount = TC;
2714 } else {
2715 // When avoiding a runtime check, the active.lane.mask inside the loop
2716 // uses a modified trip count and the induction variable increment is
2717 // done after the active.lane.mask intrinsic is called.
2718 IncrementValue = CanonicalIVPHI;
2719 TripCount = Builder.createNaryOp(VPInstruction::CalculateTripCountMinusVF,
2720 {TC}, DL);
2721 }
2722 auto *EntryIncrement = Builder.createOverflowingOp(
2723 VPInstruction::CanonicalIVIncrementForPart, {StartV}, {false, false}, DL,
2724 "index.part.next");
2725
2726 // Create the active lane mask instruction in the VPlan preheader.
2727 VPValue *ALMMultiplier =
2728 Plan.getConstantInt(TopRegion->getCanonicalIVType(), 1);
2729 auto *EntryALM = Builder.createNaryOp(VPInstruction::ActiveLaneMask,
2730 {EntryIncrement, TC, ALMMultiplier}, DL,
2731 "active.lane.mask.entry");
2732
2733 // Now create the ActiveLaneMaskPhi recipe in the main loop using the
2734 // preheader ActiveLaneMask instruction.
2735 auto *LaneMaskPhi =
2737 LaneMaskPhi->insertAfter(CanonicalIVPHI);
2738
2739 // Create the active lane mask for the next iteration of the loop before the
2740 // original terminator.
2741 VPRecipeBase *OriginalTerminator = EB->getTerminator();
2742 Builder.setInsertPoint(OriginalTerminator);
2743 auto *InLoopIncrement =
2744 Builder.createOverflowingOp(VPInstruction::CanonicalIVIncrementForPart,
2745 {IncrementValue}, {false, false}, DL);
2746 auto *ALM = Builder.createNaryOp(VPInstruction::ActiveLaneMask,
2747 {InLoopIncrement, TripCount, ALMMultiplier},
2748 DL, "active.lane.mask.next");
2749 LaneMaskPhi->addOperand(ALM);
2750
2751 // Replace the original terminator with BranchOnCond. We have to invert the
2752 // mask here because a true condition means jumping to the exit block.
2753 auto *NotMask = Builder.createNot(ALM, DL);
2754 Builder.createNaryOp(VPInstruction::BranchOnCond, {NotMask}, DL);
2755 OriginalTerminator->eraseFromParent();
2756 return LaneMaskPhi;
2757}
2758
2759/// Collect the header mask with the pattern:
2760/// (ICMP_ULE, WideCanonicalIV, backedge-taken-count)
2761/// TODO: Introduce explicit recipe for header-mask instead of searching
2762/// for the header-mask pattern manually.
2764 VPRegionBlock *LoopRegion = Plan.getVectorLoopRegion();
2765 SmallVector<VPValue *> WideCanonicalIVs;
2766 auto *FoundWidenCanonicalIVUser = find_if(
2768 assert(count_if(LoopRegion->getCanonicalIV()->users(),
2770 "Must have at most one VPWideCanonicalIVRecipe");
2771 if (FoundWidenCanonicalIVUser !=
2772 LoopRegion->getCanonicalIV()->users().end()) {
2773 auto *WideCanonicalIV =
2774 cast<VPWidenCanonicalIVRecipe>(*FoundWidenCanonicalIVUser);
2775 WideCanonicalIVs.push_back(WideCanonicalIV);
2776 }
2777
2778 // Also include VPWidenIntOrFpInductionRecipes that represent a widened
2779 // version of the canonical induction.
2780 VPBasicBlock *HeaderVPBB = LoopRegion->getEntryBasicBlock();
2781 for (VPRecipeBase &Phi : HeaderVPBB->phis()) {
2782 auto *WidenOriginalIV = dyn_cast<VPWidenIntOrFpInductionRecipe>(&Phi);
2783 if (WidenOriginalIV && WidenOriginalIV->isCanonical())
2784 WideCanonicalIVs.push_back(WidenOriginalIV);
2785 }
2786
2787 // Walk users of wide canonical IVs and find the single compare of the form
2788 // (ICMP_ULE, WideCanonicalIV, backedge-taken-count).
2789 VPSingleDefRecipe *HeaderMask = nullptr;
2790 for (auto *Wide : WideCanonicalIVs) {
2791 for (VPUser *U : SmallVector<VPUser *>(Wide->users())) {
2792 auto *VPI = dyn_cast<VPInstruction>(U);
2793 if (!VPI || !vputils::isHeaderMask(VPI, Plan))
2794 continue;
2795
2796 assert(VPI->getOperand(0) == Wide &&
2797 "WidenCanonicalIV must be the first operand of the compare");
2798 assert(!HeaderMask && "Multiple header masks found?");
2799 HeaderMask = VPI;
2800 }
2801 }
2802 return HeaderMask;
2803}
2804
2806 VPlan &Plan, bool UseActiveLaneMaskForControlFlow,
2809 UseActiveLaneMaskForControlFlow) &&
2810 "DataAndControlFlowWithoutRuntimeCheck implies "
2811 "UseActiveLaneMaskForControlFlow");
2812
2813 VPRegionBlock *LoopRegion = Plan.getVectorLoopRegion();
2814 auto *FoundWidenCanonicalIVUser = find_if(
2816 assert(FoundWidenCanonicalIVUser &&
2817 "Must have widened canonical IV when tail folding!");
2818 VPSingleDefRecipe *HeaderMask = findHeaderMask(Plan);
2819 auto *WideCanonicalIV =
2820 cast<VPWidenCanonicalIVRecipe>(*FoundWidenCanonicalIVUser);
2821 VPSingleDefRecipe *LaneMask;
2822 if (UseActiveLaneMaskForControlFlow) {
2825 } else {
2826 VPBuilder B = VPBuilder::getToInsertAfter(WideCanonicalIV);
2827 VPValue *ALMMultiplier = Plan.getOrAddLiveIn(
2828 ConstantInt::get(LoopRegion->getCanonicalIVType(), 1));
2829 LaneMask =
2830 B.createNaryOp(VPInstruction::ActiveLaneMask,
2831 {WideCanonicalIV, Plan.getTripCount(), ALMMultiplier},
2832 nullptr, "active.lane.mask");
2833 }
2834
2835 // Walk users of WideCanonicalIV and replace the header mask of the form
2836 // (ICMP_ULE, WideCanonicalIV, backedge-taken-count) with an active-lane-mask,
2837 // removing the old one to ensure there is always only a single header mask.
2838 HeaderMask->replaceAllUsesWith(LaneMask);
2839 HeaderMask->eraseFromParent();
2840}
2841
2842template <typename Op0_t, typename Op1_t> struct RemoveMask_match {
2843 Op0_t In;
2845
2846 RemoveMask_match(const Op0_t &In, Op1_t &Out) : In(In), Out(Out) {}
2847
2848 template <typename OpTy> bool match(OpTy *V) const {
2849 if (m_Specific(In).match(V)) {
2850 Out = nullptr;
2851 return true;
2852 }
2853 return m_LogicalAnd(m_Specific(In), m_VPValue(Out)).match(V);
2854 }
2855};
2856
2857/// Match a specific mask \p In, or a logical-and of \p In with another mask.
2858/// Sets \p Out to the remaining mask, or nullptr if the value is exactly \p In.
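/// For example (editor's illustration): m_RemoveMask(HM, Out) matches both HM
/// itself (setting Out to nullptr) and (logical-and HM, M) (setting Out to M).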
2859template <typename Op0_t, typename Op1_t>
2860static inline RemoveMask_match<Op0_t, Op1_t> m_RemoveMask(const Op0_t &In,
2861 Op1_t &Out) {
2862 return RemoveMask_match<Op0_t, Op1_t>(In, Out);
2863}
2864
2865/// Try to optimize a \p CurRecipe masked by \p HeaderMask to a corresponding
2866/// EVL-based recipe without the header mask. Returns nullptr if no EVL-based
2867/// recipe could be created.
2868/// \p HeaderMask Header Mask.
2869/// \p CurRecipe Recipe to be transformed.
2870/// \p TypeInfo VPlan-based type analysis.
2871/// \p EVL The explicit vector length parameter of vector-predication
2872/// intrinsics.
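/// Editor's sketch (illustrative): a widen load masked only by the header mask
/// becomes a VPWidenLoadEVLRecipe without a mask, and a select on the header
/// mask becomes a vp.merge on EVL.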
2874 VPRecipeBase &CurRecipe,
2875 VPTypeAnalysis &TypeInfo, VPValue &EVL) {
2876 VPlan *Plan = CurRecipe.getParent()->getPlan();
2877 DebugLoc DL = CurRecipe.getDebugLoc();
2878 VPValue *Addr, *Mask, *EndPtr;
2879
2880  /// Adjust any end pointers so that they point to the end of EVL lanes, not VF.
2881 auto AdjustEndPtr = [&CurRecipe, &EVL](VPValue *EndPtr) {
2882 auto *EVLEndPtr = cast<VPVectorEndPointerRecipe>(EndPtr)->clone();
2883 EVLEndPtr->insertBefore(&CurRecipe);
2884 EVLEndPtr->setOperand(1, &EVL);
2885 return EVLEndPtr;
2886 };
2887
2888 if (match(&CurRecipe,
2889 m_MaskedLoad(m_VPValue(Addr), m_RemoveMask(HeaderMask, Mask))) &&
2890 !cast<VPWidenLoadRecipe>(CurRecipe).isReverse())
2891 return new VPWidenLoadEVLRecipe(cast<VPWidenLoadRecipe>(CurRecipe), Addr,
2892 EVL, Mask);
2893
2894 VPValue *ReversedVal;
2895 if (match(&CurRecipe, m_Reverse(m_VPValue(ReversedVal))) &&
2896 match(ReversedVal,
2897 m_MaskedLoad(m_VPValue(EndPtr), m_RemoveMask(HeaderMask, Mask))) &&
2898 match(EndPtr, m_VecEndPtr(m_VPValue(Addr), m_Specific(&Plan->getVF()))) &&
2899 cast<VPWidenLoadRecipe>(ReversedVal)->isReverse()) {
2900 auto *LoadR = new VPWidenLoadEVLRecipe(
2901 *cast<VPWidenLoadRecipe>(ReversedVal), AdjustEndPtr(EndPtr), EVL, Mask);
2902 LoadR->insertBefore(&CurRecipe);
2903 return new VPWidenIntrinsicRecipe(
2904 Intrinsic::experimental_vp_reverse, {LoadR, Plan->getTrue(), &EVL},
2905 TypeInfo.inferScalarType(LoadR), {}, {}, DL);
2906 }
2907
2908 VPValue *StoredVal;
2909 if (match(&CurRecipe, m_MaskedStore(m_VPValue(Addr), m_VPValue(StoredVal),
2910 m_RemoveMask(HeaderMask, Mask))) &&
2911 !cast<VPWidenStoreRecipe>(CurRecipe).isReverse())
2912 return new VPWidenStoreEVLRecipe(cast<VPWidenStoreRecipe>(CurRecipe), Addr,
2913 StoredVal, EVL, Mask);
2914
2915 if (match(&CurRecipe,
2916 m_MaskedStore(m_VPValue(EndPtr), m_Reverse(m_VPValue(ReversedVal)),
2917 m_RemoveMask(HeaderMask, Mask))) &&
2918 match(EndPtr, m_VecEndPtr(m_VPValue(Addr), m_Specific(&Plan->getVF()))) &&
2919 cast<VPWidenStoreRecipe>(CurRecipe).isReverse()) {
2920 auto *NewReverse = new VPWidenIntrinsicRecipe(
2921 Intrinsic::experimental_vp_reverse,
2922 {ReversedVal, Plan->getTrue(), &EVL},
2923 TypeInfo.inferScalarType(ReversedVal), {}, {}, DL);
2924 NewReverse->insertBefore(&CurRecipe);
2925 return new VPWidenStoreEVLRecipe(cast<VPWidenStoreRecipe>(CurRecipe),
2926 AdjustEndPtr(EndPtr), NewReverse, EVL,
2927 Mask);
2928 }
2929
2930 if (auto *Rdx = dyn_cast<VPReductionRecipe>(&CurRecipe))
2931 if (Rdx->isConditional() &&
2932 match(Rdx->getCondOp(), m_RemoveMask(HeaderMask, Mask)))
2933 return new VPReductionEVLRecipe(*Rdx, EVL, Mask);
2934
2935 if (auto *Interleave = dyn_cast<VPInterleaveRecipe>(&CurRecipe))
2936 if (Interleave->getMask() &&
2937 match(Interleave->getMask(), m_RemoveMask(HeaderMask, Mask)))
2938 return new VPInterleaveEVLRecipe(*Interleave, EVL, Mask);
2939
2940 VPValue *LHS, *RHS;
2941 if (match(&CurRecipe,
2942 m_Select(m_Specific(HeaderMask), m_VPValue(LHS), m_VPValue(RHS))))
2943 return new VPWidenIntrinsicRecipe(
2944 Intrinsic::vp_merge, {Plan->getTrue(), LHS, RHS, &EVL},
2945 TypeInfo.inferScalarType(LHS), {}, {}, DL);
2946
2947 if (match(&CurRecipe, m_Select(m_RemoveMask(HeaderMask, Mask), m_VPValue(LHS),
2948 m_VPValue(RHS))))
2949 return new VPWidenIntrinsicRecipe(
2950 Intrinsic::vp_merge, {Mask, LHS, RHS, &EVL},
2951 TypeInfo.inferScalarType(LHS), {}, {}, DL);
2952
2953 if (match(&CurRecipe, m_LastActiveLane(m_Specific(HeaderMask)))) {
2954 Type *Ty = TypeInfo.inferScalarType(CurRecipe.getVPSingleValue());
2955 VPValue *ZExt =
2956 VPBuilder(&CurRecipe).createScalarCast(Instruction::ZExt, &EVL, Ty, DL);
2957 return new VPInstruction(Instruction::Sub,
2958 {ZExt, Plan->getConstantInt(Ty, 1)}, {}, {}, DL);
2959 }
2960
2961 return nullptr;
2962}
2963
2964/// Replace recipes with their EVL variants.
2966 VPTypeAnalysis TypeInfo(Plan);
2967 VPRegionBlock *LoopRegion = Plan.getVectorLoopRegion();
2968 VPBasicBlock *Header = LoopRegion->getEntryBasicBlock();
2969
2970 assert(all_of(Plan.getVF().users(),
2973 "User of VF that we can't transform to EVL.");
2974 Plan.getVF().replaceUsesWithIf(&EVL, [](VPUser &U, unsigned Idx) {
2976 });
2977
2978 assert(all_of(Plan.getVFxUF().users(),
2979 [&LoopRegion, &Plan](VPUser *U) {
2980 return match(U,
2981 m_c_Add(m_Specific(LoopRegion->getCanonicalIV()),
2982 m_Specific(&Plan.getVFxUF()))) ||
2983 isa<VPWidenPointerInductionRecipe>(U);
2984 }) &&
2985 "Only users of VFxUF should be VPWidenPointerInductionRecipe and the "
2986 "increment of the canonical induction.");
2987 Plan.getVFxUF().replaceUsesWithIf(&EVL, [](VPUser &U, unsigned Idx) {
2988    // Only replace uses in VPWidenPointerInductionRecipe; the increment of the
2989 // canonical induction must not be updated.
2991 });
2992
2993 // Defer erasing recipes till the end so that we don't invalidate the
2994 // VPTypeAnalysis cache.
2996
2997  // Create a scalar phi to track the previous EVL if the plan contains a
2998  // fixed-order recurrence.
2999 bool ContainsFORs =
3001 if (ContainsFORs) {
3002 // TODO: Use VPInstruction::ExplicitVectorLength to get maximum EVL.
3003 VPValue *MaxEVL = &Plan.getVF();
3004    // Emit VPScalarCastRecipe in preheader if VF is not a 32-bit integer.
3005 VPBuilder Builder(LoopRegion->getPreheaderVPBB());
3006 MaxEVL = Builder.createScalarZExtOrTrunc(
3007 MaxEVL, Type::getInt32Ty(Plan.getContext()),
3008 TypeInfo.inferScalarType(MaxEVL), DebugLoc::getUnknown());
3009
3010 Builder.setInsertPoint(Header, Header->getFirstNonPhi());
3011 VPValue *PrevEVL = Builder.createScalarPhi(
3012 {MaxEVL, &EVL}, DebugLoc::getUnknown(), "prev.evl");
3013
3016 for (VPRecipeBase &R : *VPBB) {
3017 VPValue *V1, *V2;
3018 if (!match(&R,
3020 m_VPValue(V1), m_VPValue(V2))))
3021 continue;
3022 VPValue *Imm = Plan.getOrAddLiveIn(
3025 Intrinsic::experimental_vp_splice,
3026 {V1, V2, Imm, Plan.getTrue(), PrevEVL, &EVL},
3027 TypeInfo.inferScalarType(R.getVPSingleValue()), {}, {},
3028 R.getDebugLoc());
3029 VPSplice->insertBefore(&R);
3030 R.getVPSingleValue()->replaceAllUsesWith(VPSplice);
3031 ToErase.push_back(&R);
3032 }
3033 }
3034 }
3035
3036 VPValue *HeaderMask = findHeaderMask(Plan);
3037 if (!HeaderMask)
3038 return;
3039
3040 // Replace header masks with a mask equivalent to predicating by EVL:
3041 //
3042 // icmp ule widen-canonical-iv backedge-taken-count
3043 // ->
3044 // icmp ult step-vector, EVL
3045 VPRecipeBase *EVLR = EVL.getDefiningRecipe();
3046 VPBuilder Builder(EVLR->getParent(), std::next(EVLR->getIterator()));
3047 Type *EVLType = TypeInfo.inferScalarType(&EVL);
3048 VPValue *EVLMask = Builder.createICmp(
3050 Builder.createNaryOp(VPInstruction::StepVector, {}, EVLType), &EVL);
3051 HeaderMask->replaceAllUsesWith(EVLMask);
3052 ToErase.push_back(HeaderMask->getDefiningRecipe());
3053
3054 // Try to optimize header mask recipes away to their EVL variants.
3055 // TODO: Split optimizeMaskToEVL out and move into
3056 // VPlanTransforms::optimize. transformRecipestoEVLRecipes should be run in
3057 // tryToBuildVPlanWithVPRecipes beforehand.
3058 for (VPUser *U : collectUsersRecursively(EVLMask)) {
3059 auto *CurRecipe = cast<VPRecipeBase>(U);
3060 VPRecipeBase *EVLRecipe =
3061 optimizeMaskToEVL(EVLMask, *CurRecipe, TypeInfo, EVL);
3062 if (!EVLRecipe)
3063 continue;
3064
3065 unsigned NumDefVal = EVLRecipe->getNumDefinedValues();
3066 assert(NumDefVal == CurRecipe->getNumDefinedValues() &&
3067 "New recipe must define the same number of values as the "
3068 "original.");
3069 EVLRecipe->insertBefore(CurRecipe);
3071 EVLRecipe)) {
3072 for (unsigned I = 0; I < NumDefVal; ++I) {
3073 VPValue *CurVPV = CurRecipe->getVPValue(I);
3074 CurVPV->replaceAllUsesWith(EVLRecipe->getVPValue(I));
3075 }
3076 }
3077 ToErase.push_back(CurRecipe);
3078 }
3079 // Remove dead EVL mask.
3080 if (EVLMask->getNumUsers() == 0)
3081 ToErase.push_back(EVLMask->getDefiningRecipe());
3082
3083 for (VPRecipeBase *R : reverse(ToErase)) {
3084 SmallVector<VPValue *> PossiblyDead(R->operands());
3085 R->eraseFromParent();
3086 for (VPValue *Op : PossiblyDead)
3088 }
3089}
3090
3091/// Add a VPEVLBasedIVPHIRecipe and related recipes to \p Plan and
3092/// replace all uses of VPCanonicalIVPHIRecipe, except the canonical IV
3093/// increment, with a VPEVLBasedIVPHIRecipe. VPCanonicalIVPHIRecipe is used
3094/// only for counting loop iterations after this transformation.
3095///
3096/// The function uses the following definitions:
3097/// %StartV is the canonical induction start value.
3098///
3099/// The function adds the following recipes:
3100///
3101/// vector.ph:
3102/// ...
3103///
3104/// vector.body:
3105/// ...
3106/// %EVLPhi = EXPLICIT-VECTOR-LENGTH-BASED-IV-PHI [ %StartV, %vector.ph ],
3107/// [ %NextEVLIV, %vector.body ]
3108/// %AVL = phi [ trip-count, %vector.ph ], [ %NextAVL, %vector.body ]
3109/// %VPEVL = EXPLICIT-VECTOR-LENGTH %AVL
3110/// ...
3111/// %OpEVL = cast i32 %VPEVL to IVSize
3112/// %NextEVLIV = add IVSize %OpEVL, %EVLPhi
3113/// %NextAVL = sub IVSize nuw %AVL, %OpEVL
3114/// ...
3115///
3116/// If MaxSafeElements is provided, the function adds the following recipes:
3117/// vector.ph:
3118/// ...
3119///
3120/// vector.body:
3121/// ...
3122/// %EVLPhi = EXPLICIT-VECTOR-LENGTH-BASED-IV-PHI [ %StartV, %vector.ph ],
3123/// [ %NextEVLIV, %vector.body ]
3124/// %AVL = phi [ trip-count, %vector.ph ], [ %NextAVL, %vector.body ]
3125/// %cmp = cmp ult %AVL, MaxSafeElements
3126/// %SAFE_AVL = select %cmp, %AVL, MaxSafeElements
3127/// %VPEVL = EXPLICIT-VECTOR-LENGTH %SAFE_AVL
3128/// ...
3129/// %OpEVL = cast i32 %VPEVL to IVSize
3130/// %NextEVLIV = add IVSize %OpEVL, %EVLPhi
3131/// %NextAVL = sub IVSize nuw %AVL, %OpEVL
3132/// ...
3133///
3135 VPlan &Plan, const std::optional<unsigned> &MaxSafeElements) {
3136 if (Plan.hasScalarVFOnly())
3137 return;
3138 VPRegionBlock *LoopRegion = Plan.getVectorLoopRegion();
3139 VPBasicBlock *Header = LoopRegion->getEntryBasicBlock();
3140
3141 auto *CanonicalIVPHI = LoopRegion->getCanonicalIV();
3142 auto *CanIVTy = LoopRegion->getCanonicalIVType();
3143 VPValue *StartV = CanonicalIVPHI->getStartValue();
3144
3145 // Create the ExplicitVectorLengthPhi recipe in the main loop.
3146 auto *EVLPhi = new VPEVLBasedIVPHIRecipe(StartV, DebugLoc::getUnknown());
3147 EVLPhi->insertAfter(CanonicalIVPHI);
3148 VPBuilder Builder(Header, Header->getFirstNonPhi());
3149 // Create the AVL (application vector length), starting from TC -> 0 in steps
3150 // of EVL.
3151 VPPhi *AVLPhi = Builder.createScalarPhi(
3152 {Plan.getTripCount()}, DebugLoc::getCompilerGenerated(), "avl");
3153 VPValue *AVL = AVLPhi;
3154
3155 if (MaxSafeElements) {
3156    // Cap the AVL at MaxSafeElements (MaxSafeDist) for correct loop emission.
3157 VPValue *AVLSafe = Plan.getConstantInt(CanIVTy, *MaxSafeElements);
3158 VPValue *Cmp = Builder.createICmp(ICmpInst::ICMP_ULT, AVL, AVLSafe);
3159 AVL = Builder.createSelect(Cmp, AVL, AVLSafe, DebugLoc::getUnknown(),
3160 "safe_avl");
3161 }
3162 auto *VPEVL = Builder.createNaryOp(VPInstruction::ExplicitVectorLength, AVL,
3164
3165 auto *CanonicalIVIncrement =
3166 cast<VPInstruction>(CanonicalIVPHI->getBackedgeValue());
3167 Builder.setInsertPoint(CanonicalIVIncrement);
3168 VPValue *OpVPEVL = VPEVL;
3169
3170 auto *I32Ty = Type::getInt32Ty(Plan.getContext());
3171 OpVPEVL = Builder.createScalarZExtOrTrunc(
3172 OpVPEVL, CanIVTy, I32Ty, CanonicalIVIncrement->getDebugLoc());
3173
3174 auto *NextEVLIV = Builder.createOverflowingOp(
3175 Instruction::Add, {OpVPEVL, EVLPhi},
3176 {CanonicalIVIncrement->hasNoUnsignedWrap(),
3177 CanonicalIVIncrement->hasNoSignedWrap()},
3178 CanonicalIVIncrement->getDebugLoc(), "index.evl.next");
3179 EVLPhi->addOperand(NextEVLIV);
3180
3181 VPValue *NextAVL = Builder.createOverflowingOp(
3182 Instruction::Sub, {AVLPhi, OpVPEVL}, {/*hasNUW=*/true, /*hasNSW=*/false},
3183 DebugLoc::getCompilerGenerated(), "avl.next");
3184 AVLPhi->addOperand(NextAVL);
3185
3186 transformRecipestoEVLRecipes(Plan, *VPEVL);
3187
3188 // Replace all uses of VPCanonicalIVPHIRecipe by
3189 // VPEVLBasedIVPHIRecipe except for the canonical IV increment.
3190 CanonicalIVPHI->replaceAllUsesWith(EVLPhi);
3191 CanonicalIVIncrement->setOperand(0, CanonicalIVPHI);
3192 // TODO: support unroll factor > 1.
3193 Plan.setUF(1);
3194}
3195
3197 // Find EVL loop entries by locating VPEVLBasedIVPHIRecipe.
3198 // There should be only one EVL PHI in the entire plan.
3199 VPEVLBasedIVPHIRecipe *EVLPhi = nullptr;
3200
3203 for (VPRecipeBase &R : VPBB->phis())
3204 if (auto *PhiR = dyn_cast<VPEVLBasedIVPHIRecipe>(&R)) {
3205 assert(!EVLPhi && "Found multiple EVL PHIs. Only one expected");
3206 EVLPhi = PhiR;
3207 }
3208
3209 // Early return if no EVL PHI is found.
3210 if (!EVLPhi)
3211 return;
3212
3213 VPBasicBlock *HeaderVPBB = EVLPhi->getParent();
3214 VPValue *EVLIncrement = EVLPhi->getBackedgeValue();
3215 VPValue *AVL;
3216 [[maybe_unused]] bool FoundAVL =
3217 match(EVLIncrement,
3218 m_c_Add(m_ZExtOrSelf(m_EVL(m_VPValue(AVL))), m_Specific(EVLPhi)));
3219 assert(FoundAVL && "Didn't find AVL?");
3220
3221 // The AVL may be capped to a safe distance.
3222 VPValue *SafeAVL;
3223 if (match(AVL, m_Select(m_VPValue(), m_VPValue(SafeAVL), m_VPValue())))
3224 AVL = SafeAVL;
3225
3226 VPValue *AVLNext;
3227 [[maybe_unused]] bool FoundAVLNext =
3229 m_Specific(Plan.getTripCount()), m_VPValue(AVLNext)));
3230 assert(FoundAVLNext && "Didn't find AVL backedge?");
3231
3232 // Convert EVLPhi to concrete recipe.
3233 auto *ScalarR =
3234 VPBuilder(EVLPhi).createScalarPhi({EVLPhi->getStartValue(), EVLIncrement},
3235 EVLPhi->getDebugLoc(), "evl.based.iv");
3236 EVLPhi->replaceAllUsesWith(ScalarR);
3237 EVLPhi->eraseFromParent();
3238
3239 // Replace CanonicalIVInc with EVL-PHI increment.
3240 auto *CanonicalIV = cast<VPPhi>(&*HeaderVPBB->begin());
3241 VPValue *Backedge = CanonicalIV->getIncomingValue(1);
3242 assert(match(Backedge, m_c_Add(m_Specific(CanonicalIV),
3243 m_Specific(&Plan.getVFxUF()))) &&
3244 "Unexpected canonical iv");
3245 Backedge->replaceAllUsesWith(EVLIncrement);
3246
3247 // Remove unused phi and increment.
3248 VPRecipeBase *CanonicalIVIncrement = Backedge->getDefiningRecipe();
3249 CanonicalIVIncrement->eraseFromParent();
3250 CanonicalIV->eraseFromParent();
3251
3252 // Replace the use of VectorTripCount in the latch-exiting block.
3253 // Before: (branch-on-cond (icmp eq EVLIVInc, VectorTripCount))
3254  // After: (branch-on-cond (icmp eq AVLNext, 0))
3255 VPBasicBlock *LatchExiting =
3256 HeaderVPBB->getPredecessors()[1]->getEntryBasicBlock();
3257 auto *LatchExitingBr = cast<VPInstruction>(LatchExiting->getTerminator());
3258 if (match(LatchExitingBr, m_BranchOnCond(m_True())))
3259 return;
3260
3261 assert(match(LatchExitingBr, m_BranchOnCond(m_SpecificCmp(
3262 CmpInst::ICMP_EQ, m_VPValue(EVLIncrement),
3263 m_Specific(&Plan.getVectorTripCount())))) &&
3264 "Expected BranchOnCond with ICmp comparing EVL increment with vector "
3265 "trip count");
3266
3267 Type *AVLTy = VPTypeAnalysis(Plan).inferScalarType(AVLNext);
3268 VPBuilder Builder(LatchExitingBr);
3269 LatchExitingBr->setOperand(0,
3270 Builder.createICmp(CmpInst::ICMP_EQ, AVLNext,
3271 Plan.getConstantInt(AVLTy, 0)));
3272}
3273
3275 VPlan &Plan, PredicatedScalarEvolution &PSE,
3276 const DenseMap<Value *, const SCEV *> &StridesMap) {
3277  // Replace VPValues for known constant strides guaranteed by predicated scalar
3278 // evolution.
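  // Illustrative sketch (names hypothetical): if PSE's predicates guarantee
  // %stride == 1, uses of the live-in %stride (and of any sext/zext of it)
  // inside the vector loop region are replaced with a constant 1 of the
  // matching width, and SCEV expansions are rewritten accordingly.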
3279 auto CanUseVersionedStride = [&Plan](VPUser &U, unsigned) {
3280 auto *R = cast<VPRecipeBase>(&U);
3281 return R->getRegion() ||
3282 R->getParent() == Plan.getVectorLoopRegion()->getSinglePredecessor();
3283 };
3284 ValueToSCEVMapTy RewriteMap;
3285 for (const SCEV *Stride : StridesMap.values()) {
3286 using namespace SCEVPatternMatch;
3287 auto *StrideV = cast<SCEVUnknown>(Stride)->getValue();
3288 const APInt *StrideConst;
3289 if (!match(PSE.getSCEV(StrideV), m_scev_APInt(StrideConst)))
3290 // Only handle constant strides for now.
3291 continue;
3292
3293 auto *CI = Plan.getConstantInt(*StrideConst);
3294 if (VPValue *StrideVPV = Plan.getLiveIn(StrideV))
3295 StrideVPV->replaceUsesWithIf(CI, CanUseVersionedStride);
3296
3297 // The versioned value may not be used in the loop directly but through a
3298 // sext/zext. Add new live-ins in those cases.
3299 for (Value *U : StrideV->users()) {
3301 continue;
3302 VPValue *StrideVPV = Plan.getLiveIn(U);
3303 if (!StrideVPV)
3304 continue;
3305 unsigned BW = U->getType()->getScalarSizeInBits();
3306 APInt C =
3307 isa<SExtInst>(U) ? StrideConst->sext(BW) : StrideConst->zext(BW);
3308 VPValue *CI = Plan.getConstantInt(C);
3309 StrideVPV->replaceUsesWithIf(CI, CanUseVersionedStride);
3310 }
3311 RewriteMap[StrideV] = PSE.getSCEV(StrideV);
3312 }
3313
3314 for (VPRecipeBase &R : *Plan.getEntry()) {
3315 auto *ExpSCEV = dyn_cast<VPExpandSCEVRecipe>(&R);
3316 if (!ExpSCEV)
3317 continue;
3318 const SCEV *ScevExpr = ExpSCEV->getSCEV();
3319 auto *NewSCEV =
3320 SCEVParameterRewriter::rewrite(ScevExpr, *PSE.getSE(), RewriteMap);
3321 if (NewSCEV != ScevExpr) {
3322 VPValue *NewExp = vputils::getOrCreateVPValueForSCEVExpr(Plan, NewSCEV);
3323 ExpSCEV->replaceAllUsesWith(NewExp);
3324 if (Plan.getTripCount() == ExpSCEV)
3325 Plan.resetTripCount(NewExp);
3326 }
3327 }
3328}
3329
3331 VPlan &Plan,
3332 const std::function<bool(BasicBlock *)> &BlockNeedsPredication) {
3333 // Collect recipes in the backward slice of `Root` that may generate a poison
3334 // value that is used after vectorization.
3336 auto CollectPoisonGeneratingInstrsInBackwardSlice([&](VPRecipeBase *Root) {
3338 Worklist.push_back(Root);
3339
3340 // Traverse the backward slice of Root through its use-def chain.
3341 while (!Worklist.empty()) {
3342 VPRecipeBase *CurRec = Worklist.pop_back_val();
3343
3344 if (!Visited.insert(CurRec).second)
3345 continue;
3346
3347 // Prune search if we find another recipe generating a widen memory
3348 // instruction. Widen memory instructions involved in address computation
3349 // will lead to gather/scatter instructions, which don't need to be
3350 // handled.
3352 VPHeaderPHIRecipe>(CurRec))
3353 continue;
3354
3355 // This recipe contributes to the address computation of a widen
3356 // load/store. If the underlying instruction has poison-generating flags,
3357 // drop them directly.
3358 if (auto *RecWithFlags = dyn_cast<VPRecipeWithIRFlags>(CurRec)) {
3359 VPValue *A, *B;
3360 // Dropping disjoint from an OR may yield incorrect results, as some
3361 // analysis may have converted it to an Add implicitly (e.g. SCEV used
3362 // for dependence analysis). Instead, replace it with an equivalent Add.
3363 // This is possible as all users of the disjoint OR only access lanes
3364 // where the operands are disjoint or poison otherwise.
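        // Illustrative sketch: an address computation like
        //   %p = or disjoint %base, %off
        // is rewritten to
        //   %p = add %base, %off
        // rather than just dropping the disjoint flag.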
3365 if (match(RecWithFlags, m_BinaryOr(m_VPValue(A), m_VPValue(B))) &&
3366 RecWithFlags->isDisjoint()) {
3367 VPBuilder Builder(RecWithFlags);
3368 VPInstruction *New = Builder.createOverflowingOp(
3369 Instruction::Add, {A, B}, {false, false},
3370 RecWithFlags->getDebugLoc());
3371 New->setUnderlyingValue(RecWithFlags->getUnderlyingValue());
3372 RecWithFlags->replaceAllUsesWith(New);
3373 RecWithFlags->eraseFromParent();
3374 CurRec = New;
3375 } else
3376 RecWithFlags->dropPoisonGeneratingFlags();
3377 } else {
3380 (void)Instr;
3381 assert((!Instr || !Instr->hasPoisonGeneratingFlags()) &&
3382 "found instruction with poison generating flags not covered by "
3383 "VPRecipeWithIRFlags");
3384 }
3385
3386 // Add new definitions to the worklist.
3387 for (VPValue *Operand : CurRec->operands())
3388 if (VPRecipeBase *OpDef = Operand->getDefiningRecipe())
3389 Worklist.push_back(OpDef);
3390 }
3391 });
3392
3393 // Traverse all the recipes in the VPlan and collect the poison-generating
3394 // recipes in the backward slice starting at the address of a VPWidenRecipe or
3395 // VPInterleaveRecipe.
3396 auto Iter = vp_depth_first_deep(Plan.getEntry());
3398 for (VPRecipeBase &Recipe : *VPBB) {
3399 if (auto *WidenRec = dyn_cast<VPWidenMemoryRecipe>(&Recipe)) {
3400 Instruction &UnderlyingInstr = WidenRec->getIngredient();
3401 VPRecipeBase *AddrDef = WidenRec->getAddr()->getDefiningRecipe();
3402 if (AddrDef && WidenRec->isConsecutive() &&
3403 BlockNeedsPredication(UnderlyingInstr.getParent()))
3404 CollectPoisonGeneratingInstrsInBackwardSlice(AddrDef);
3405 } else if (auto *InterleaveRec = dyn_cast<VPInterleaveRecipe>(&Recipe)) {
3406 VPRecipeBase *AddrDef = InterleaveRec->getAddr()->getDefiningRecipe();
3407 if (AddrDef) {
3408 // Check if any member of the interleave group needs predication.
3409 const InterleaveGroup<Instruction> *InterGroup =
3410 InterleaveRec->getInterleaveGroup();
3411 bool NeedPredication = false;
3412 for (int I = 0, NumMembers = InterGroup->getNumMembers();
3413 I < NumMembers; ++I) {
3414 Instruction *Member = InterGroup->getMember(I);
3415 if (Member)
3416 NeedPredication |= BlockNeedsPredication(Member->getParent());
3417 }
3418
3419 if (NeedPredication)
3420 CollectPoisonGeneratingInstrsInBackwardSlice(AddrDef);
3421 }
3422 }
3423 }
3424 }
3425}
3426
3428 VPlan &Plan,
3430 &InterleaveGroups,
3431 VPRecipeBuilder &RecipeBuilder, const bool &ScalarEpilogueAllowed) {
3432 if (InterleaveGroups.empty())
3433 return;
3434
3435 // Interleave memory: for each Interleave Group we marked earlier as relevant
3436 // for this VPlan, replace the Recipes widening its memory instructions with a
3437 // single VPInterleaveRecipe at its insertion point.
3438 VPDominatorTree VPDT(Plan);
3439 for (const auto *IG : InterleaveGroups) {
3440 auto *Start =
3441 cast<VPWidenMemoryRecipe>(RecipeBuilder.getRecipe(IG->getMember(0)));
3442 VPIRMetadata InterleaveMD(*Start);
3443 SmallVector<VPValue *, 4> StoredValues;
3444 if (auto *StoreR = dyn_cast<VPWidenStoreRecipe>(Start))
3445 StoredValues.push_back(StoreR->getStoredValue());
3446 for (unsigned I = 1; I < IG->getFactor(); ++I) {
3447 Instruction *MemberI = IG->getMember(I);
3448 if (!MemberI)
3449 continue;
3450 VPWidenMemoryRecipe *MemoryR =
3451 cast<VPWidenMemoryRecipe>(RecipeBuilder.getRecipe(MemberI));
3452 if (auto *StoreR = dyn_cast<VPWidenStoreRecipe>(MemoryR))
3453 StoredValues.push_back(StoreR->getStoredValue());
3454 InterleaveMD.intersect(*MemoryR);
3455 }
3456
3457 bool NeedsMaskForGaps =
3458 (IG->requiresScalarEpilogue() && !ScalarEpilogueAllowed) ||
3459 (!StoredValues.empty() && !IG->isFull());
3460
3461 Instruction *IRInsertPos = IG->getInsertPos();
3462 auto *InsertPos =
3463 cast<VPWidenMemoryRecipe>(RecipeBuilder.getRecipe(IRInsertPos));
3464
3466 if (auto *Gep = dyn_cast<GetElementPtrInst>(
3467 getLoadStorePointerOperand(IRInsertPos)->stripPointerCasts()))
3468 NW = Gep->getNoWrapFlags().withoutNoUnsignedWrap();
3469
3470 // Get or create the start address for the interleave group.
3471 VPValue *Addr = Start->getAddr();
3472 VPRecipeBase *AddrDef = Addr->getDefiningRecipe();
3473 if (AddrDef && !VPDT.properlyDominates(AddrDef, InsertPos)) {
3474 // We cannot re-use the address of member zero because it does not
3475 // dominate the insert position. Instead, use the address of the insert
3476 // position and create a PtrAdd adjusting it to the address of member
3477 // zero.
3478 // TODO: Hoist Addr's defining recipe (and any operands as needed) to
3479 // InsertPos or sink loads above zero members to join it.
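      // Illustrative sketch: if the insert position is member 1 of a group of
      // i32 accesses, the group's start address is computed from the insert
      // position's address as ptradd(%insert.pos.addr, -4).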
3480 assert(IG->getIndex(IRInsertPos) != 0 &&
3481 "index of insert position shouldn't be zero");
3482 auto &DL = IRInsertPos->getDataLayout();
3483 APInt Offset(32,
3484 DL.getTypeAllocSize(getLoadStoreType(IRInsertPos)) *
3485 IG->getIndex(IRInsertPos),
3486 /*IsSigned=*/true);
3487 VPValue *OffsetVPV = Plan.getConstantInt(-Offset);
3488 VPBuilder B(InsertPos);
3489 Addr = B.createNoWrapPtrAdd(InsertPos->getAddr(), OffsetVPV, NW);
3490 }
3491 // If the group is reverse, adjust the index to refer to the last vector
3492 // lane instead of the first. We adjust the index from the first vector
3493 // lane, rather than directly getting the pointer for lane VF - 1, because
3494 // the pointer operand of the interleaved access is supposed to be uniform.
3495 if (IG->isReverse()) {
3496 auto *ReversePtr = new VPVectorEndPointerRecipe(
3497 Addr, &Plan.getVF(), getLoadStoreType(IRInsertPos),
3498 -(int64_t)IG->getFactor(), NW, InsertPos->getDebugLoc());
3499 ReversePtr->insertBefore(InsertPos);
3500 Addr = ReversePtr;
3501 }
3502 auto *VPIG = new VPInterleaveRecipe(IG, Addr, StoredValues,
3503 InsertPos->getMask(), NeedsMaskForGaps,
3504 InterleaveMD, InsertPos->getDebugLoc());
3505 VPIG->insertBefore(InsertPos);
3506
3507 unsigned J = 0;
3508 for (unsigned i = 0; i < IG->getFactor(); ++i)
3509 if (Instruction *Member = IG->getMember(i)) {
3510 VPRecipeBase *MemberR = RecipeBuilder.getRecipe(Member);
3511 if (!Member->getType()->isVoidTy()) {
3512 VPValue *OriginalV = MemberR->getVPSingleValue();
3513 OriginalV->replaceAllUsesWith(VPIG->getVPValue(J));
3514 J++;
3515 }
3516 MemberR->eraseFromParent();
3517 }
3518 }
3519}
3520
3521/// Expand a VPWidenIntOrFpInduction into executable recipes, for the initial
3522/// value, phi and backedge value. In the following example:
3523///
3524/// vector.ph:
3525/// Successor(s): vector loop
3526///
3527/// <x1> vector loop: {
3528/// vector.body:
3529/// WIDEN-INDUCTION %i = phi %start, %step, %vf
3530/// ...
3531/// EMIT branch-on-count ...
3532/// No successors
3533/// }
3534///
3535/// WIDEN-INDUCTION will get expanded to:
3536///
3537/// vector.ph:
3538/// ...
3539/// vp<%induction.start> = ...
3540/// vp<%induction.increment> = ...
3541///
3542/// Successor(s): vector loop
3543///
3544/// <x1> vector loop: {
3545/// vector.body:
3546/// ir<%i> = WIDEN-PHI vp<%induction.start>, vp<%vec.ind.next>
3547/// ...
3548/// vp<%vec.ind.next> = add ir<%i>, vp<%induction.increment>
3549/// EMIT branch-on-count ...
3550/// No successors
3551/// }
3552static void
3554 VPTypeAnalysis &TypeInfo) {
3555 VPlan *Plan = WidenIVR->getParent()->getPlan();
3556 VPValue *Start = WidenIVR->getStartValue();
3557 VPValue *Step = WidenIVR->getStepValue();
3558 VPValue *VF = WidenIVR->getVFValue();
3559 DebugLoc DL = WidenIVR->getDebugLoc();
3560
3561 // The value from the original loop to which we are mapping the new induction
3562 // variable.
3563 Type *Ty = TypeInfo.inferScalarType(WidenIVR);
3564
3565 const InductionDescriptor &ID = WidenIVR->getInductionDescriptor();
3568 VPIRFlags Flags = *WidenIVR;
3569 if (ID.getKind() == InductionDescriptor::IK_IntInduction) {
3570 AddOp = Instruction::Add;
3571 MulOp = Instruction::Mul;
3572 } else {
3573 AddOp = ID.getInductionOpcode();
3574 MulOp = Instruction::FMul;
3575 }
3576
3577 // If the phi is truncated, truncate the start and step values.
3578 VPBuilder Builder(Plan->getVectorPreheader());
3579 Type *StepTy = TypeInfo.inferScalarType(Step);
3580 if (Ty->getScalarSizeInBits() < StepTy->getScalarSizeInBits()) {
3581 assert(StepTy->isIntegerTy() && "Truncation requires an integer type");
3582 Step = Builder.createScalarCast(Instruction::Trunc, Step, Ty, DL);
3583 Start = Builder.createScalarCast(Instruction::Trunc, Start, Ty, DL);
3584 // Truncation doesn't preserve WrapFlags.
3585 Flags.dropPoisonGeneratingFlags();
3586 StepTy = Ty;
3587 }
3588
3589 // Construct the initial value of the vector IV in the vector loop preheader.
3590 Type *IVIntTy =
3592 VPValue *Init = Builder.createNaryOp(VPInstruction::StepVector, {}, IVIntTy);
3593 if (StepTy->isFloatingPointTy())
3594 Init = Builder.createWidenCast(Instruction::UIToFP, Init, StepTy);
3595
3596 VPValue *SplatStart = Builder.createNaryOp(VPInstruction::Broadcast, Start);
3597 VPValue *SplatStep = Builder.createNaryOp(VPInstruction::Broadcast, Step);
3598
3599 Init = Builder.createNaryOp(MulOp, {Init, SplatStep}, Flags);
3600 Init = Builder.createNaryOp(AddOp, {SplatStart, Init}, Flags,
3601 DebugLoc::getUnknown(), "induction");
3602
3603 // Create the widened phi of the vector IV.
3604 auto *WidePHI = new VPWidenPHIRecipe(WidenIVR->getPHINode(), Init,
3605 WidenIVR->getDebugLoc(), "vec.ind");
3606 WidePHI->insertBefore(WidenIVR);
3607
3608 // Create the backedge value for the vector IV.
3609 VPValue *Inc;
3610 VPValue *Prev;
3611 // If unrolled, use the increment and prev value from the operands.
3612 if (auto *SplatVF = WidenIVR->getSplatVFValue()) {
3613 Inc = SplatVF;
3614 Prev = WidenIVR->getLastUnrolledPartOperand();
3615 } else {
3616 if (VPRecipeBase *R = VF->getDefiningRecipe())
3617 Builder.setInsertPoint(R->getParent(), std::next(R->getIterator()));
3618 // Multiply the vectorization factor by the step using integer or
3619 // floating-point arithmetic as appropriate.
3620 if (StepTy->isFloatingPointTy())
3621 VF = Builder.createScalarCast(Instruction::CastOps::UIToFP, VF, StepTy,
3622 DL);
3623 else
3624 VF = Builder.createScalarZExtOrTrunc(VF, StepTy,
3625 TypeInfo.inferScalarType(VF), DL);
3626
3627 Inc = Builder.createNaryOp(MulOp, {Step, VF}, Flags);
3628 Inc = Builder.createNaryOp(VPInstruction::Broadcast, Inc);
3629 Prev = WidePHI;
3630 }
3631
3633 Builder.setInsertPoint(ExitingBB, ExitingBB->getTerminator()->getIterator());
3634 auto *Next = Builder.createNaryOp(AddOp, {Prev, Inc}, Flags,
3635 WidenIVR->getDebugLoc(), "vec.ind.next");
3636
3637 WidePHI->addOperand(Next);
3638
3639 WidenIVR->replaceAllUsesWith(WidePHI);
3640}
3641
3642/// Expand a VPWidenPointerInductionRecipe into executable recipes, for the
3643/// initial value, phi and backedge value. In the following example:
3644///
3645/// <x1> vector loop: {
3646/// vector.body:
3647/// EMIT ir<%ptr.iv> = WIDEN-POINTER-INDUCTION %start, %step, %vf
3648/// ...
3649/// EMIT branch-on-count ...
3650/// }
3651///
3652/// WIDEN-POINTER-INDUCTION will get expanded to:
3653///
3654/// <x1> vector loop: {
3655/// vector.body:
3656/// EMIT-SCALAR %pointer.phi = phi %start, %ptr.ind
3657/// EMIT %mul = mul %stepvector, %step
3658/// EMIT %vector.gep = wide-ptradd %pointer.phi, %mul
3659/// ...
3660/// EMIT %ptr.ind = ptradd %pointer.phi, %vf
3661/// EMIT branch-on-count ...
3662/// }
3664 VPTypeAnalysis &TypeInfo) {
3665 VPlan *Plan = R->getParent()->getPlan();
3666 VPValue *Start = R->getStartValue();
3667 VPValue *Step = R->getStepValue();
3668 VPValue *VF = R->getVFValue();
3669
3670 assert(R->getInductionDescriptor().getKind() ==
3672 "Not a pointer induction according to InductionDescriptor!");
3673 assert(TypeInfo.inferScalarType(R)->isPointerTy() && "Unexpected type.");
3674 assert(!R->onlyScalarsGenerated(Plan->hasScalableVF()) &&
3675 "Recipe should have been replaced");
3676
3677 VPBuilder Builder(R);
3678 DebugLoc DL = R->getDebugLoc();
3679
3680 // Build a scalar pointer phi.
3681 VPPhi *ScalarPtrPhi = Builder.createScalarPhi(Start, DL, "pointer.phi");
3682
3683 // Create actual address geps that use the pointer phi as base and a
3684 // vectorized version of the step value (<step*0, ..., step*N>) as offset.
3685 Builder.setInsertPoint(R->getParent(), R->getParent()->getFirstNonPhi());
3686 Type *StepTy = TypeInfo.inferScalarType(Step);
3687 VPValue *Offset = Builder.createNaryOp(VPInstruction::StepVector, {}, StepTy);
3688 Offset = Builder.createOverflowingOp(Instruction::Mul, {Offset, Step});
3689 VPValue *PtrAdd = Builder.createNaryOp(
3690 VPInstruction::WidePtrAdd, {ScalarPtrPhi, Offset}, DL, "vector.gep");
3691 R->replaceAllUsesWith(PtrAdd);
3692
3693 // Create the backedge value for the scalar pointer phi.
3695 Builder.setInsertPoint(ExitingBB, ExitingBB->getTerminator()->getIterator());
3696 VF = Builder.createScalarZExtOrTrunc(VF, StepTy, TypeInfo.inferScalarType(VF),
3697 DL);
3698 VPValue *Inc = Builder.createOverflowingOp(Instruction::Mul, {Step, VF});
3699
3700 VPValue *InductionGEP =
3701 Builder.createPtrAdd(ScalarPtrPhi, Inc, DL, "ptr.ind");
3702 ScalarPtrPhi->addOperand(InductionGEP);
3703}
3704
3706  // Replace loop regions with explicit CFG.
3707 SmallVector<VPRegionBlock *> LoopRegions;
3709 vp_depth_first_deep(Plan.getEntry()))) {
3710 if (!R->isReplicator())
3711 LoopRegions.push_back(R);
3712 }
3713 for (VPRegionBlock *R : LoopRegions)
3714 R->dissolveToCFGLoop();
3715}
3716
3718 VPTypeAnalysis TypeInfo(Plan);
3721 vp_depth_first_deep(Plan.getEntry()))) {
3722 for (VPRecipeBase &R : make_early_inc_range(*VPBB)) {
3723 if (auto *WidenIVR = dyn_cast<VPWidenIntOrFpInductionRecipe>(&R)) {
3724 expandVPWidenIntOrFpInduction(WidenIVR, TypeInfo);
3725 ToRemove.push_back(WidenIVR);
3726 continue;
3727 }
3728
3729 if (auto *WidenIVR = dyn_cast<VPWidenPointerInductionRecipe>(&R)) {
3730 // If the recipe only generates scalars, scalarize it instead of
3731 // expanding it.
3732 if (WidenIVR->onlyScalarsGenerated(Plan.hasScalableVF())) {
3733 VPBuilder Builder(WidenIVR);
3734 VPValue *PtrAdd =
3735 scalarizeVPWidenPointerInduction(WidenIVR, Plan, Builder);
3736 WidenIVR->replaceAllUsesWith(PtrAdd);
3737 ToRemove.push_back(WidenIVR);
3738 continue;
3739 }
3740 expandVPWidenPointerInduction(WidenIVR, TypeInfo);
3741 ToRemove.push_back(WidenIVR);
3742 continue;
3743 }
3744
3745 // Expand VPBlendRecipe into VPInstruction::Select.
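      // Illustrative sketch, matching the loop below: a blend of %v0, %v1/%m1,
      // %v2/%m2 becomes
      //   %s1 = select %m1, %v1, %v0
      //   %predphi = select %m2, %v2, %s1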
3746 VPBuilder Builder(&R);
3747 if (auto *Blend = dyn_cast<VPBlendRecipe>(&R)) {
3748 VPValue *Select = Blend->getIncomingValue(0);
3749 for (unsigned I = 1; I != Blend->getNumIncomingValues(); ++I)
3750 Select = Builder.createSelect(Blend->getMask(I),
3751 Blend->getIncomingValue(I), Select,
3752 R.getDebugLoc(), "predphi");
3753 Blend->replaceAllUsesWith(Select);
3754 ToRemove.push_back(Blend);
3755 }
3756
3757 if (auto *Expr = dyn_cast<VPExpressionRecipe>(&R)) {
3758 Expr->decompose();
3759 ToRemove.push_back(Expr);
3760 }
3761
3762 // Expand LastActiveLane into Not + FirstActiveLane + Sub.
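      // Illustrative sketch for a single mask operand %m:
      //   %not.m = not %m
      //   %first.inactive.lane = first-active-lane %not.m
      //   %last.active.lane = sub %first.inactive.lane, 1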
3763 auto *LastActiveL = dyn_cast<VPInstruction>(&R);
3764 if (LastActiveL &&
3765 LastActiveL->getOpcode() == VPInstruction::LastActiveLane) {
3766 // Create Not(Mask) for all operands.
3768 for (VPValue *Op : LastActiveL->operands()) {
3769 VPValue *NotMask = Builder.createNot(Op, LastActiveL->getDebugLoc());
3770 NotMasks.push_back(NotMask);
3771 }
3772
3773 // Create FirstActiveLane on the inverted masks.
3774 VPValue *FirstInactiveLane = Builder.createNaryOp(
3776 LastActiveL->getDebugLoc(), "first.inactive.lane");
3777
3778 // Subtract 1 to get the last active lane.
3779 VPValue *One = Plan.getOrAddLiveIn(
3780 ConstantInt::get(Type::getInt64Ty(Plan.getContext()), 1));
3781 VPValue *LastLane = Builder.createNaryOp(
3782 Instruction::Sub, {FirstInactiveLane, One},
3783 LastActiveL->getDebugLoc(), "last.active.lane");
3784
3785 LastActiveL->replaceAllUsesWith(LastLane);
3786 ToRemove.push_back(LastActiveL);
3787 continue;
3788 }
3789
3790 // Lower BranchOnCount to ICmp + BranchOnCond.
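      // Illustrative sketch: branch-on-count %iv, %tc becomes
      //   %cond = icmp eq %iv, %tc
      //   branch-on-cond %cond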
3791 VPValue *IV, *TC;
3792 if (match(&R, m_BranchOnCount(m_VPValue(IV), m_VPValue(TC)))) {
3793 auto *BranchOnCountInst = cast<VPInstruction>(&R);
3794 DebugLoc DL = BranchOnCountInst->getDebugLoc();
3795 VPValue *Cond = Builder.createICmp(CmpInst::ICMP_EQ, IV, TC, DL);
3796 Builder.createNaryOp(VPInstruction::BranchOnCond, Cond, DL);
3797 ToRemove.push_back(BranchOnCountInst);
3798 continue;
3799 }
3800
3801 VPValue *VectorStep;
3802 VPValue *ScalarStep;
3804 m_VPValue(VectorStep), m_VPValue(ScalarStep))))
3805 continue;
3806
3807 // Expand WideIVStep.
3808 auto *VPI = cast<VPInstruction>(&R);
3809 Type *IVTy = TypeInfo.inferScalarType(VPI);
3810 if (TypeInfo.inferScalarType(VectorStep) != IVTy) {
3812 ? Instruction::UIToFP
3813 : Instruction::Trunc;
3814 VectorStep = Builder.createWidenCast(CastOp, VectorStep, IVTy);
3815 }
3816
3817 assert(!match(ScalarStep, m_One()) && "Expected non-unit scalar-step");
3818 if (TypeInfo.inferScalarType(ScalarStep) != IVTy) {
3819 ScalarStep =
3820 Builder.createWidenCast(Instruction::Trunc, ScalarStep, IVTy);
3821 }
3822
3823 VPIRFlags Flags;
3824 if (IVTy->isFloatingPointTy())
3825 Flags = {VPI->getFastMathFlags()};
3826
3827 unsigned MulOpc =
3828 IVTy->isFloatingPointTy() ? Instruction::FMul : Instruction::Mul;
3829 VPInstruction *Mul = Builder.createNaryOp(
3830 MulOpc, {VectorStep, ScalarStep}, Flags, R.getDebugLoc());
3831 VectorStep = Mul;
3832 VPI->replaceAllUsesWith(VectorStep);
3833 ToRemove.push_back(VPI);
3834 }
3835 }
3836
3837 for (VPRecipeBase *R : ToRemove)
3838 R->eraseFromParent();
3839}
3840
3842 VPBasicBlock *EarlyExitVPBB,
3843 VPlan &Plan,
3844 VPBasicBlock *HeaderVPBB,
3845 VPBasicBlock *LatchVPBB) {
3846 VPBlockBase *MiddleVPBB = LatchVPBB->getSuccessors()[0];
3847 if (!EarlyExitVPBB->getSinglePredecessor() &&
3848 EarlyExitVPBB->getPredecessors()[1] == MiddleVPBB) {
3849 assert(EarlyExitVPBB->getNumPredecessors() == 2 &&
3850 EarlyExitVPBB->getPredecessors()[0] == EarlyExitingVPBB &&
3851 "unsupported early exit VPBB");
3852 // Early exit operand should always be last phi operand. If EarlyExitVPBB
3853 // has two predecessors and EarlyExitingVPBB is the first, swap the operands
3854 // of the phis.
3855 for (VPRecipeBase &R : EarlyExitVPBB->phis())
3856 cast<VPIRPhi>(&R)->swapOperands();
3857 }
3858
3859 VPBuilder Builder(LatchVPBB->getTerminator());
3860 VPBlockBase *TrueSucc = EarlyExitingVPBB->getSuccessors()[0];
3861 assert(match(EarlyExitingVPBB->getTerminator(), m_BranchOnCond()) &&
3862         "Terminator must be BranchOnCond");
3863 VPValue *CondOfEarlyExitingVPBB =
3864 EarlyExitingVPBB->getTerminator()->getOperand(0);
3865 auto *CondToEarlyExit = TrueSucc == EarlyExitVPBB
3866 ? CondOfEarlyExitingVPBB
3867 : Builder.createNot(CondOfEarlyExitingVPBB);
3868
3869 // Split the middle block and have it conditionally branch to the early exit
3870 // block if CondToEarlyExit.
3871 VPValue *IsEarlyExitTaken =
3872 Builder.createNaryOp(VPInstruction::AnyOf, {CondToEarlyExit});
3873 VPBasicBlock *NewMiddle = Plan.createVPBasicBlock("middle.split");
3874 VPBasicBlock *VectorEarlyExitVPBB =
3875 Plan.createVPBasicBlock("vector.early.exit");
3876 VPBlockUtils::insertOnEdge(LatchVPBB, MiddleVPBB, NewMiddle);
3877 VPBlockUtils::connectBlocks(NewMiddle, VectorEarlyExitVPBB);
3878 NewMiddle->swapSuccessors();
3879
3880 VPBlockUtils::connectBlocks(VectorEarlyExitVPBB, EarlyExitVPBB);
3881
3882 // Update the exit phis in the early exit block.
3883 VPBuilder MiddleBuilder(NewMiddle);
3884 VPBuilder EarlyExitB(VectorEarlyExitVPBB);
3885 for (VPRecipeBase &R : EarlyExitVPBB->phis()) {
3886 auto *ExitIRI = cast<VPIRPhi>(&R);
3887 // Early exit operand should always be last, i.e., 0 if EarlyExitVPBB has
3888 // a single predecessor and 1 if it has two.
3889 unsigned EarlyExitIdx = ExitIRI->getNumOperands() - 1;
3890 if (ExitIRI->getNumOperands() != 1) {
3891 // The first of two operands corresponds to the latch exit, via MiddleVPBB
3892 // predecessor. Extract its final lane.
3893 ExitIRI->extractLastLaneOfLastPartOfFirstOperand(MiddleBuilder);
3894 }
3895
3896 VPValue *IncomingFromEarlyExit = ExitIRI->getOperand(EarlyExitIdx);
3897 if (!IncomingFromEarlyExit->isLiveIn()) {
3898 // Update the incoming value from the early exit.
3899 VPValue *FirstActiveLane = EarlyExitB.createNaryOp(
3900 VPInstruction::FirstActiveLane, {CondToEarlyExit},
3901 DebugLoc::getUnknown(), "first.active.lane");
3902 IncomingFromEarlyExit = EarlyExitB.createNaryOp(
3903 VPInstruction::ExtractLane, {FirstActiveLane, IncomingFromEarlyExit},
3904 DebugLoc::getUnknown(), "early.exit.value");
3905 ExitIRI->setOperand(EarlyExitIdx, IncomingFromEarlyExit);
3906 }
3907 }
3908 MiddleBuilder.createNaryOp(VPInstruction::BranchOnCond, {IsEarlyExitTaken});
3909
3910 // Replace the condition controlling the non-early exit from the vector loop
3911 // with one exiting if either the original condition of the vector latch is
3912 // true or the early exit has been taken.
3913 auto *LatchExitingBranch = cast<VPInstruction>(LatchVPBB->getTerminator());
3914 // Skip single-iteration loop region
3915 assert(LatchExitingBranch->getOpcode() == VPInstruction::BranchOnCount &&
3916 "Unexpected terminator");
3917 auto *IsLatchExitTaken =
3918 Builder.createICmp(CmpInst::ICMP_EQ, LatchExitingBranch->getOperand(0),
3919 LatchExitingBranch->getOperand(1));
3920 auto *AnyExitTaken = Builder.createNaryOp(
3921 Instruction::Or, {IsEarlyExitTaken, IsLatchExitTaken});
3922 Builder.createNaryOp(VPInstruction::BranchOnCond, AnyExitTaken);
3923 LatchExitingBranch->eraseFromParent();
3924}
3925
3926/// This function tries to convert extended in-loop reductions to
3927/// VPExpressionRecipe and clamp the \p Range if it is beneficial and
3928/// valid. The created recipe must be decomposed to its constituent
3929/// recipes before execution.
3930static VPExpressionRecipe *
3932 VFRange &Range) {
3933 Type *RedTy = Ctx.Types.inferScalarType(Red);
3934 VPValue *VecOp = Red->getVecOp();
3935
3936 // Clamp the range if using extended-reduction is profitable.
3937 auto IsExtendedRedValidAndClampRange =
3938 [&](unsigned Opcode, Instruction::CastOps ExtOpc, Type *SrcTy) -> bool {
3940 [&](ElementCount VF) {
3941 auto *SrcVecTy = cast<VectorType>(toVectorTy(SrcTy, VF));
3943
3944 InstructionCost ExtRedCost;
3945 InstructionCost ExtCost =
3946 cast<VPWidenCastRecipe>(VecOp)->computeCost(VF, Ctx);
3947 InstructionCost RedCost = Red->computeCost(VF, Ctx);
3948
3949 if (Red->isPartialReduction()) {
3952 // FIXME: Move partial reduction creation, costing and clamping
3953 // here from LoopVectorize.cpp.
3954 ExtRedCost = Ctx.TTI.getPartialReductionCost(
3955 Opcode, SrcTy, nullptr, RedTy, VF, ExtKind,
3956 llvm::TargetTransformInfo::PR_None, std::nullopt, Ctx.CostKind);
3957 } else {
3958 ExtRedCost = Ctx.TTI.getExtendedReductionCost(
3959 Opcode, ExtOpc == Instruction::CastOps::ZExt, RedTy, SrcVecTy,
3960 Red->getFastMathFlags(), CostKind);
3961 }
3962 return ExtRedCost.isValid() && ExtRedCost < ExtCost + RedCost;
3963 },
3964 Range);
3965 };
3966
3967 VPValue *A;
3968  // Match reduce(ext(A)).
3969 if (match(VecOp, m_ZExtOrSExt(m_VPValue(A))) &&
3970 IsExtendedRedValidAndClampRange(
3971 RecurrenceDescriptor::getOpcode(Red->getRecurrenceKind()),
3972 cast<VPWidenCastRecipe>(VecOp)->getOpcode(),
3973 Ctx.Types.inferScalarType(A)))
3974 return new VPExpressionRecipe(cast<VPWidenCastRecipe>(VecOp), Red);
3975
3976 return nullptr;
3977}
3978
3979/// This function tries to convert extended in-loop reductions to
3980/// VPExpressionRecipe and clamp the \p Range if it is beneficial
3981/// and valid. The created VPExpressionRecipe must be decomposed to its
3982/// constituent recipes before execution. Patterns of the
3983/// VPExpressionRecipe:
3984/// reduce.add(mul(...)),
3985/// reduce.add(mul(ext(A), ext(B))),
3986/// reduce.add(ext(mul(ext(A), ext(B)))).
3987static VPExpressionRecipe *
3989 VPCostContext &Ctx, VFRange &Range) {
3990 unsigned Opcode = RecurrenceDescriptor::getOpcode(Red->getRecurrenceKind());
3991 if (Opcode != Instruction::Add && Opcode != Instruction::Sub)
3992 return nullptr;
3993
3994 Type *RedTy = Ctx.Types.inferScalarType(Red);
3995
3996 // Clamp the range if using multiply-accumulate-reduction is profitable.
3997 auto IsMulAccValidAndClampRange =
3999 VPWidenCastRecipe *OuterExt) -> bool {
4001 [&](ElementCount VF) {
4003 Type *SrcTy =
4004 Ext0 ? Ctx.Types.inferScalarType(Ext0->getOperand(0)) : RedTy;
4005 InstructionCost MulAccCost;
4006
4007 if (Red->isPartialReduction()) {
4008 Type *SrcTy2 =
4009 Ext1 ? Ctx.Types.inferScalarType(Ext1->getOperand(0)) : nullptr;
4010 // FIXME: Move partial reduction creation, costing and clamping
4011 // here from LoopVectorize.cpp.
4012 MulAccCost = Ctx.TTI.getPartialReductionCost(
4013 Opcode, SrcTy, SrcTy2, RedTy, VF,
4015 Ext0->getOpcode())
4018 Ext1->getOpcode())
4020 Mul->getOpcode(), CostKind);
4021 } else {
4022 // Only partial reductions support mixed extends at the moment.
4023 if (Ext0 && Ext1 && Ext0->getOpcode() != Ext1->getOpcode())
4024 return false;
4025
4026 bool IsZExt =
4027 !Ext0 || Ext0->getOpcode() == Instruction::CastOps::ZExt;
4028 auto *SrcVecTy = cast<VectorType>(toVectorTy(SrcTy, VF));
4029 MulAccCost = Ctx.TTI.getMulAccReductionCost(IsZExt, Opcode, RedTy,
4030 SrcVecTy, CostKind);
4031 }
4032
4033 InstructionCost MulCost = Mul->computeCost(VF, Ctx);
4034 InstructionCost RedCost = Red->computeCost(VF, Ctx);
4035 InstructionCost ExtCost = 0;
4036 if (Ext0)
4037 ExtCost += Ext0->computeCost(VF, Ctx);
4038 if (Ext1)
4039 ExtCost += Ext1->computeCost(VF, Ctx);
4040 if (OuterExt)
4041 ExtCost += OuterExt->computeCost(VF, Ctx);
4042
4043 return MulAccCost.isValid() &&
4044 MulAccCost < ExtCost + MulCost + RedCost;
4045 },
4046 Range);
4047 };
4048
4049 VPValue *VecOp = Red->getVecOp();
4050 VPRecipeBase *Sub = nullptr;
4051 VPValue *A, *B;
4052 VPValue *Tmp = nullptr;
4053 // Sub reductions could have a sub between the add reduction and vec op.
4054 if (match(VecOp, m_Sub(m_ZeroInt(), m_VPValue(Tmp)))) {
4055 Sub = VecOp->getDefiningRecipe();
4056 VecOp = Tmp;
4057 }
4058
4059 // If ValB is a constant and can be safely extended, truncate it to the same
4060 // type as ExtA's operand, then extend it to the same type as ExtA. This
4061 // creates two uniform extends that can more easily be matched by the rest of
4062 // the bundling code. The ExtB reference, ValB and operand 1 of Mul are all
4063 // replaced with the new extend of the constant.
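  // Illustrative sketch (hypothetical values): reduce.add(mul(zext i8 %a to
  // i32, i32 42)) becomes reduce.add(mul(zext i8 %a to i32,
  // zext (trunc i32 42 to i8) to i32)), so both mul operands are uniform
  // zexts from i8.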
4064 auto ExtendAndReplaceConstantOp = [&Ctx](VPWidenCastRecipe *ExtA,
4065 VPWidenCastRecipe *&ExtB,
4066 VPValue *&ValB, VPWidenRecipe *Mul) {
4067 if (!ExtA || ExtB || !ValB->isLiveIn())
4068 return;
4069 Type *NarrowTy = Ctx.Types.inferScalarType(ExtA->getOperand(0));
4070 Instruction::CastOps ExtOpc = ExtA->getOpcode();
4071 const APInt *Const;
4072 if (!match(ValB, m_APInt(Const)) ||
4074 Const, NarrowTy, TTI::getPartialReductionExtendKind(ExtOpc)))
4075 return;
4076 // The truncate ensures that the type of each extended operand is the
4077 // same, and it's been proven that the constant can be extended from
4078 // NarrowTy safely. Necessary since ExtA's extended operand would be
4079 // e.g. an i8, while the const will likely be an i32. This will be
4080 // elided by later optimisations.
4081 VPBuilder Builder(Mul);
4082 auto *Trunc =
4083 Builder.createWidenCast(Instruction::CastOps::Trunc, ValB, NarrowTy);
4084 Type *WideTy = Ctx.Types.inferScalarType(ExtA);
4085 ValB = ExtB = Builder.createWidenCast(ExtOpc, Trunc, WideTy);
4086 Mul->setOperand(1, ExtB);
4087 };
4088
4089 // Try to match reduce.add(mul(...)).
4090 if (match(VecOp, m_Mul(m_VPValue(A), m_VPValue(B)))) {
4093 auto *Mul = cast<VPWidenRecipe>(VecOp);
4094
4095 // Convert reduce.add(mul(ext, const)) to reduce.add(mul(ext, ext(const)))
4096 ExtendAndReplaceConstantOp(RecipeA, RecipeB, B, Mul);
4097
4098 // Match reduce.add/sub(mul(ext, ext)).
4099 if (RecipeA && RecipeB && match(RecipeA, m_ZExtOrSExt(m_VPValue())) &&
4100 match(RecipeB, m_ZExtOrSExt(m_VPValue())) &&
4101 IsMulAccValidAndClampRange(Mul, RecipeA, RecipeB, nullptr)) {
4102 if (Sub)
4103 return new VPExpressionRecipe(RecipeA, RecipeB, Mul,
4104 cast<VPWidenRecipe>(Sub), Red);
4105 return new VPExpressionRecipe(RecipeA, RecipeB, Mul, Red);
4106 }
4107 // TODO: Add an expression type for this variant with a negated mul
4108 if (!Sub && IsMulAccValidAndClampRange(Mul, nullptr, nullptr, nullptr))
4109 return new VPExpressionRecipe(Mul, Red);
4110 }
4111 // TODO: Add an expression type for negated versions of other expression
4112 // variants.
4113 if (Sub)
4114 return nullptr;
4115
4116 // Match reduce.add(ext(mul(A, B))).
4117 if (match(VecOp, m_ZExtOrSExt(m_Mul(m_VPValue(A), m_VPValue(B))))) {
4118 auto *Ext = cast<VPWidenCastRecipe>(VecOp);
4119 auto *Mul = cast<VPWidenRecipe>(Ext->getOperand(0));
4122
4123 // reduce.add(ext(mul(ext, const)))
4124 // -> reduce.add(ext(mul(ext, ext(const))))
4125 ExtendAndReplaceConstantOp(Ext0, Ext1, B, Mul);
4126
4127 // reduce.add(ext(mul(ext(A), ext(B))))
4128 // -> reduce.add(mul(wider_ext(A), wider_ext(B)))
4129 // The inner extends must either have the same opcode as the outer extend or
4130 // be the same, in which case the multiply can never result in a negative
4131 // value and the outer extend can be folded away by doing wider
4132 // extends for the operands of the mul.
4133 if (Ext0 && Ext1 &&
4134 (Ext->getOpcode() == Ext0->getOpcode() || Ext0 == Ext1) &&
4135 Ext0->getOpcode() == Ext1->getOpcode() &&
4136 IsMulAccValidAndClampRange(Mul, Ext0, Ext1, Ext) && Mul->hasOneUse()) {
4137 auto *NewExt0 = new VPWidenCastRecipe(
4138 Ext0->getOpcode(), Ext0->getOperand(0), Ext->getResultType(), nullptr,
4139 *Ext0, *Ext0, Ext0->getDebugLoc());
4140 NewExt0->insertBefore(Ext0);
4141
4142 VPWidenCastRecipe *NewExt1 = NewExt0;
4143 if (Ext0 != Ext1) {
4144 NewExt1 = new VPWidenCastRecipe(Ext1->getOpcode(), Ext1->getOperand(0),
4145 Ext->getResultType(), nullptr, *Ext1,
4146 *Ext1, Ext1->getDebugLoc());
4147 NewExt1->insertBefore(Ext1);
4148 }
4149 Mul->setOperand(0, NewExt0);
4150 Mul->setOperand(1, NewExt1);
4151 Red->setOperand(1, Mul);
4152 return new VPExpressionRecipe(NewExt0, NewExt1, Mul, Red);
4153 }
4154 }
4155 return nullptr;
4156}
4157
4158/// This function tries to create abstract recipes from the reduction recipe
4159/// for subsequent optimizations and cost estimation.
4161 VPCostContext &Ctx,
4162 VFRange &Range) {
4163 VPExpressionRecipe *AbstractR = nullptr;
4164 auto IP = std::next(Red->getIterator());
4165 auto *VPBB = Red->getParent();
4166 if (auto *MulAcc = tryToMatchAndCreateMulAccumulateReduction(Red, Ctx, Range))
4167 AbstractR = MulAcc;
4168 else if (auto *ExtRed = tryToMatchAndCreateExtendedReduction(Red, Ctx, Range))
4169 AbstractR = ExtRed;
4170  // Cannot create abstract in-loop reduction recipes.
4171 if (!AbstractR)
4172 return;
4173
4174 AbstractR->insertBefore(*VPBB, IP);
4175 Red->replaceAllUsesWith(AbstractR);
4176}
4177
4188
4190 if (Plan.hasScalarVFOnly())
4191 return;
4192
4193#ifndef NDEBUG
4194 VPDominatorTree VPDT(Plan);
4195#endif
4196
4197 SmallVector<VPValue *> VPValues;
4200 append_range(VPValues, Plan.getLiveIns());
4201 for (VPRecipeBase &R : *Plan.getEntry())
4202 append_range(VPValues, R.definedValues());
4203
4204 auto *VectorPreheader = Plan.getVectorPreheader();
4205 for (VPValue *VPV : VPValues) {
4207 (VPV->isLiveIn() && VPV->getLiveInIRValue() &&
4208 isa<Constant>(VPV->getLiveInIRValue())))
4209 continue;
4210
4211 // Add explicit broadcast at the insert point that dominates all users.
4212 VPBasicBlock *HoistBlock = VectorPreheader;
4213 VPBasicBlock::iterator HoistPoint = VectorPreheader->end();
4214 for (VPUser *User : VPV->users()) {
4215 if (User->usesScalars(VPV))
4216 continue;
4217 if (cast<VPRecipeBase>(User)->getParent() == VectorPreheader)
4218 HoistPoint = HoistBlock->begin();
4219 else
4220 assert(VPDT.dominates(VectorPreheader,
4221 cast<VPRecipeBase>(User)->getParent()) &&
4222 "All users must be in the vector preheader or dominated by it");
4223 }
4224
4225 VPBuilder Builder(cast<VPBasicBlock>(HoistBlock), HoistPoint);
4226 auto *Broadcast = Builder.createNaryOp(VPInstruction::Broadcast, {VPV});
4227 VPV->replaceUsesWithIf(Broadcast,
4228 [VPV, Broadcast](VPUser &U, unsigned Idx) {
4229 return Broadcast != &U && !U.usesScalars(VPV);
4230 });
4231 }
4232}
4233
4235 VPRegionBlock *LoopRegion = Plan.getVectorLoopRegion();
4236
4237 // Collect candidate loads with invariant addresses and noalias scopes
4238 // metadata and memory-writing recipes with noalias metadata.
4242 vp_depth_first_shallow(LoopRegion->getEntry()))) {
4243 for (VPRecipeBase &R : *VPBB) {
4244 // Only handle single-scalar replicated loads with invariant addresses.
4245 if (auto *RepR = dyn_cast<VPReplicateRecipe>(&R)) {
4246 if (RepR->isPredicated() || !RepR->isSingleScalar() ||
4247 RepR->getOpcode() != Instruction::Load)
4248 continue;
4249
4250 VPValue *Addr = RepR->getOperand(0);
4251 if (Addr->isDefinedOutsideLoopRegions()) {
4253 if (!Loc.AATags.Scope)
4254 continue;
4255 CandidateLoads.push_back({RepR, Loc});
4256 }
4257 }
4258 if (R.mayWriteToMemory()) {
4260 if (!Loc || !Loc->AATags.Scope || !Loc->AATags.NoAlias)
4261 return;
4262 Stores.push_back(*Loc);
4263 }
4264 }
4265 }
4266
4267 VPBasicBlock *Preheader = Plan.getVectorPreheader();
4268 for (auto &[LoadRecipe, LoadLoc] : CandidateLoads) {
4269 // Hoist the load to the preheader if it doesn't alias with any stores
4270 // according to the noalias metadata. Other loads should have been hoisted
4271    // by other passes.
4272 const AAMDNodes &LoadAA = LoadLoc.AATags;
4273 if (all_of(Stores, [&](const MemoryLocation &StoreLoc) {
4275 LoadAA.Scope, StoreLoc.AATags.NoAlias);
4276 })) {
4277 LoadRecipe->moveBefore(*Preheader, Preheader->getFirstNonPhi());
4278 }
4279 }
4280}
4281
4282// Collect common metadata from a group of replicate recipes by intersecting
4283// metadata from all recipes in the group.
4285 VPIRMetadata CommonMetadata = *Recipes.front();
4286 for (VPReplicateRecipe *Recipe : drop_begin(Recipes))
4287 CommonMetadata.intersect(*Recipe);
4288 return CommonMetadata;
4289}
4290
4291template <unsigned Opcode>
4295 const Loop *L) {
4296 static_assert(Opcode == Instruction::Load || Opcode == Instruction::Store,
4297 "Only Load and Store opcodes supported");
4298 constexpr bool IsLoad = (Opcode == Instruction::Load);
4299 VPRegionBlock *LoopRegion = Plan.getVectorLoopRegion();
4300 VPTypeAnalysis TypeInfo(Plan);
4301
4302 // Group predicated operations by their address SCEV.
4304 for (VPBlockBase *Block : vp_depth_first_shallow(LoopRegion->getEntry())) {
4305 auto *VPBB = cast<VPBasicBlock>(Block);
4306 for (VPRecipeBase &R : *VPBB) {
4307 auto *RepR = dyn_cast<VPReplicateRecipe>(&R);
4308 if (!RepR || RepR->getOpcode() != Opcode || !RepR->isPredicated())
4309 continue;
4310
4311 // For loads, operand 0 is address; for stores, operand 1 is address.
4312 VPValue *Addr = RepR->getOperand(IsLoad ? 0 : 1);
4313 const SCEV *AddrSCEV = vputils::getSCEVExprForVPValue(Addr, PSE, L);
4314 if (!isa<SCEVCouldNotCompute>(AddrSCEV))
4315 RecipesByAddress[AddrSCEV].push_back(RepR);
4316 }
4317 }
4318
4319 // For each address, collect operations with the same or complementary masks.
4321 auto GetLoadStoreValueType = [&](VPReplicateRecipe *Recipe) {
4322 return TypeInfo.inferScalarType(IsLoad ? Recipe : Recipe->getOperand(0));
4323 };
4324 for (auto &[Addr, Recipes] : RecipesByAddress) {
4325 if (Recipes.size() < 2)
4326 continue;
4327
4328 // Collect groups with the same or complementary masks.
4329 for (VPReplicateRecipe *&RecipeI : Recipes) {
4330 if (!RecipeI)
4331 continue;
4332
4333 VPValue *MaskI = RecipeI->getMask();
4334 Type *TypeI = GetLoadStoreValueType(RecipeI);
4336 Group.push_back(RecipeI);
4337 RecipeI = nullptr;
4338
4339 // Find all operations with the same or complementary masks.
4340 bool HasComplementaryMask = false;
4341 for (VPReplicateRecipe *&RecipeJ : Recipes) {
4342 if (!RecipeJ)
4343 continue;
4344
4345 VPValue *MaskJ = RecipeJ->getMask();
4346 Type *TypeJ = GetLoadStoreValueType(RecipeJ);
4347 if (TypeI == TypeJ) {
4348 // Check if any operation in the group has a complementary mask with
4349 // another, that is M1 == NOT(M2) or M2 == NOT(M1).
4350 HasComplementaryMask |= match(MaskI, m_Not(m_Specific(MaskJ))) ||
4351 match(MaskJ, m_Not(m_Specific(MaskI)));
4352 Group.push_back(RecipeJ);
4353 RecipeJ = nullptr;
4354 }
4355 }
4356
4357 if (HasComplementaryMask) {
4358 assert(Group.size() >= 2 && "must have at least 2 entries");
4359 AllGroups.push_back(std::move(Group));
4360 }
4361 }
4362 }
4363
4364 return AllGroups;
4365}
4366
4367// Find the recipe with minimum alignment in the group.
4368template <typename InstType>
4369static VPReplicateRecipe *
4371 return *min_element(Group, [](VPReplicateRecipe *A, VPReplicateRecipe *B) {
4372 return cast<InstType>(A->getUnderlyingInstr())->getAlign() <
4373 cast<InstType>(B->getUnderlyingInstr())->getAlign();
4374 });
4375}
4376
4379 const Loop *L) {
4380 auto Groups =
4382 if (Groups.empty())
4383 return;
4384
4385 VPDominatorTree VPDT(Plan);
4386
4387 // Process each group of loads.
4388 for (auto &Group : Groups) {
4389 // Sort loads by dominance order, with earliest (most dominating) first.
4390 sort(Group, [&VPDT](VPReplicateRecipe *A, VPReplicateRecipe *B) {
4391 return VPDT.properlyDominates(A, B);
4392 });
4393
4394 // Try to use the earliest (most dominating) load to replace all others.
4395 VPReplicateRecipe *EarliestLoad = Group[0];
4396 VPBasicBlock *FirstBB = EarliestLoad->getParent();
4397 VPBasicBlock *LastBB = Group.back()->getParent();
4398
4399 // Check that the load doesn't alias with stores between first and last.
4400 auto LoadLoc = vputils::getMemoryLocation(*EarliestLoad);
4401 if (!LoadLoc || !canHoistOrSinkWithNoAliasCheck(*LoadLoc, FirstBB, LastBB))
4402 continue;
4403
4404 // Collect common metadata from all loads in the group.
4405 VPIRMetadata CommonMetadata = getCommonMetadata(Group);
4406
4407 // Find the load with minimum alignment to use.
4408 auto *LoadWithMinAlign = findRecipeWithMinAlign<LoadInst>(Group);
4409
4410 // Create an unpredicated version of the earliest load with common
4411 // metadata.
4412 auto *UnpredicatedLoad = new VPReplicateRecipe(
4413 LoadWithMinAlign->getUnderlyingInstr(), {EarliestLoad->getOperand(0)},
4414 /*IsSingleScalar=*/false, /*Mask=*/nullptr, *EarliestLoad,
4415 CommonMetadata);
4416
4417 UnpredicatedLoad->insertBefore(EarliestLoad);
4418
4419 // Replace all loads in the group with the unpredicated load.
4420 for (VPReplicateRecipe *Load : Group) {
4421 Load->replaceAllUsesWith(UnpredicatedLoad);
4422 Load->eraseFromParent();
4423 }
4424 }
4425}
4426
4427static bool
4429 PredicatedScalarEvolution &PSE, const Loop &L,
4430 VPTypeAnalysis &TypeInfo) {
4431 auto StoreLoc = vputils::getMemoryLocation(*StoresToSink.front());
4432 if (!StoreLoc || !StoreLoc->AATags.Scope)
4433 return false;
4434
4435 // When sinking a group of stores, all members of the group alias each other.
4436 // Skip them during the alias checks.
4437 SmallPtrSet<VPRecipeBase *, 4> StoresToSinkSet(StoresToSink.begin(),
4438 StoresToSink.end());
4439
4440 VPBasicBlock *FirstBB = StoresToSink.front()->getParent();
4441 VPBasicBlock *LastBB = StoresToSink.back()->getParent();
4442 SinkStoreInfo SinkInfo(StoresToSinkSet, *StoresToSink[0], PSE, L, TypeInfo);
4443 return canHoistOrSinkWithNoAliasCheck(*StoreLoc, FirstBB, LastBB, SinkInfo);
4444}
4445
4448 const Loop *L) {
4449 auto Groups =
4451 if (Groups.empty())
4452 return;
4453
4454 VPDominatorTree VPDT(Plan);
4455 VPTypeAnalysis TypeInfo(Plan);
4456
4457 for (auto &Group : Groups) {
4458 sort(Group, [&VPDT](VPReplicateRecipe *A, VPReplicateRecipe *B) {
4459 return VPDT.properlyDominates(A, B);
4460 });
4461
4462 if (!canSinkStoreWithNoAliasCheck(Group, PSE, *L, TypeInfo))
4463 continue;
4464
4465 // Use the last (most dominated) store's location for the unconditional
4466 // store.
4467 VPReplicateRecipe *LastStore = Group.back();
4468 VPBasicBlock *InsertBB = LastStore->getParent();
4469
4470 // Collect common alias metadata from all stores in the group.
4471 VPIRMetadata CommonMetadata = getCommonMetadata(Group);
4472
4473 // Build select chain for stored values.
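    // Illustrative sketch: for two stores of %v0 and %v1 with complementary
    // masks %m0 and %m1 == not %m0, the chain is a single
    //   %sel = select %m1, %v1, %v0
    // which is then stored unconditionally below.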
4474 VPValue *SelectedValue = Group[0]->getOperand(0);
4475 VPBuilder Builder(InsertBB, LastStore->getIterator());
4476
4477 for (unsigned I = 1; I < Group.size(); ++I) {
4478 VPValue *Mask = Group[I]->getMask();
4479 VPValue *Value = Group[I]->getOperand(0);
4480 SelectedValue = Builder.createSelect(Mask, Value, SelectedValue,
4481 Group[I]->getDebugLoc());
4482 }
4483
4484 // Find the store with minimum alignment to use.
4485 auto *StoreWithMinAlign = findRecipeWithMinAlign<StoreInst>(Group);
4486
4487 // Create unconditional store with selected value and common metadata.
4488 auto *UnpredicatedStore =
4489 new VPReplicateRecipe(StoreWithMinAlign->getUnderlyingInstr(),
4490 {SelectedValue, LastStore->getOperand(1)},
4491 /*IsSingleScalar=*/false,
4492 /*Mask=*/nullptr, *LastStore, CommonMetadata);
4493 UnpredicatedStore->insertBefore(*InsertBB, LastStore->getIterator());
4494
4495 // Remove all predicated stores from the group.
4496 for (VPReplicateRecipe *Store : Group)
4497 Store->eraseFromParent();
4498 }
4499}
4500
4502 VPlan &Plan, ElementCount BestVF, unsigned BestUF,
4504 assert(Plan.hasVF(BestVF) && "BestVF is not available in Plan");
4505 assert(Plan.hasUF(BestUF) && "BestUF is not available in Plan");
4506
4507 VPValue *TC = Plan.getTripCount();
4508 // Skip cases for which the trip count may be non-trivial to materialize.
4509 // I.e., when a scalar tail is absent - due to tail folding, or when a scalar
4510 // tail is required.
4511 if (!Plan.hasScalarTail() ||
4513 Plan.getScalarPreheader() ||
4514 !TC->isLiveIn())
4515 return;
4516
4517 // Materialize vector trip counts for constants early if it can simply
4518  // be computed as (Original TC / (VF * UF)) * (VF * UF).
4519 // TODO: Compute vector trip counts for loops requiring a scalar epilogue and
4520 // tail-folded loops.
4521 ScalarEvolution &SE = *PSE.getSE();
4522 auto *TCScev = SE.getSCEV(TC->getLiveInIRValue());
4523 if (!isa<SCEVConstant>(TCScev))
4524 return;
4525 const SCEV *VFxUF = SE.getElementCount(TCScev->getType(), BestVF * BestUF);
4526 auto VecTCScev = SE.getMulExpr(SE.getUDivExpr(TCScev, VFxUF), VFxUF);
4527 if (auto *ConstVecTC = dyn_cast<SCEVConstant>(VecTCScev))
4528 Plan.getVectorTripCount().setUnderlyingValue(ConstVecTC->getValue());
4529}
4530
4532 VPBasicBlock *VectorPH) {
4534 if (BTC->getNumUsers() == 0)
4535 return;
4536
4537 VPBuilder Builder(VectorPH, VectorPH->begin());
4538 auto *TCTy = VPTypeAnalysis(Plan).inferScalarType(Plan.getTripCount());
4539 auto *TCMO = Builder.createNaryOp(
4540 Instruction::Sub, {Plan.getTripCount(), Plan.getConstantInt(TCTy, 1)},
4541 DebugLoc::getCompilerGenerated(), "trip.count.minus.1");
4542 BTC->replaceAllUsesWith(TCMO);
4543}
4544
4546 if (Plan.hasScalarVFOnly())
4547 return;
4548
4549 VPTypeAnalysis TypeInfo(Plan);
4550 VPRegionBlock *LoopRegion = Plan.getVectorLoopRegion();
4551 auto VPBBsOutsideLoopRegion = VPBlockUtils::blocksOnly<VPBasicBlock>(
4553 auto VPBBsInsideLoopRegion = VPBlockUtils::blocksOnly<VPBasicBlock>(
4554 vp_depth_first_shallow(LoopRegion->getEntry()));
4555 // Materialize Build(Struct)Vector for all replicating VPReplicateRecipes and
4556 // VPInstructions, excluding ones in replicate regions. Those are not
4557 // materialized explicitly yet. Those vector users are still handled in
4558 // VPReplicateRegion::execute(), via shouldPack().
4559 // TODO: materialize build vectors for replicating recipes in replicating
4560 // regions.
4561 for (VPBasicBlock *VPBB :
4562 concat<VPBasicBlock *>(VPBBsOutsideLoopRegion, VPBBsInsideLoopRegion)) {
4563 for (VPRecipeBase &R : make_early_inc_range(*VPBB)) {
4565 continue;
4566 auto *DefR = cast<VPRecipeWithIRFlags>(&R);
4567 auto UsesVectorOrInsideReplicateRegion = [DefR, LoopRegion](VPUser *U) {
4568 VPRegionBlock *ParentRegion = cast<VPRecipeBase>(U)->getRegion();
4569 return !U->usesScalars(DefR) || ParentRegion != LoopRegion;
4570 };
4571 if ((isa<VPReplicateRecipe>(DefR) &&
4572 cast<VPReplicateRecipe>(DefR)->isSingleScalar()) ||
4573 (isa<VPInstruction>(DefR) &&
4575 !cast<VPInstruction>(DefR)->doesGeneratePerAllLanes())) ||
4576 none_of(DefR->users(), UsesVectorOrInsideReplicateRegion))
4577 continue;
4578
4579 Type *ScalarTy = TypeInfo.inferScalarType(DefR);
4580 unsigned Opcode = ScalarTy->isStructTy()
4583 auto *BuildVector = new VPInstruction(Opcode, {DefR});
4584 BuildVector->insertAfter(DefR);
4585
4586 DefR->replaceUsesWithIf(
4587 BuildVector, [BuildVector, &UsesVectorOrInsideReplicateRegion](
4588 VPUser &U, unsigned) {
4589 return &U != BuildVector && UsesVectorOrInsideReplicateRegion(&U);
4590 });
4591 }
4592 }
4593
4594 // Create explicit VPInstructions to convert vectors to scalars. The current
4595 // implementation is conservative - it may miss some cases that may or may not
4596 // be vector values. TODO: introduce Unpacks speculatively - remove them later
4597 // if they are known to operate on scalar values.
4598 for (VPBasicBlock *VPBB : VPBBsInsideLoopRegion) {
4599 for (VPRecipeBase &R : make_early_inc_range(*VPBB)) {
4602 continue;
4603 for (VPValue *Def : R.definedValues()) {
4604 // Skip recipes that are single-scalar or only have their first lane
4605 // used.
4606 // TODO: The Defs skipped here may or may not be vector values.
4607 // Introduce Unpacks, and remove them later, if they are guaranteed to
4608 // produce scalar values.
4610 continue;
4611
4612 // At the moment, we create unpacks only for scalar users outside
4613 // replicate regions. Recipes inside replicate regions still extract the
4614 // required lanes implicitly.
4615 // TODO: Remove once replicate regions are unrolled completely.
4616 auto IsCandidateUnpackUser = [Def](VPUser *U) {
4617 VPRegionBlock *ParentRegion = cast<VPRecipeBase>(U)->getRegion();
4618 return U->usesScalars(Def) &&
4619 (!ParentRegion || !ParentRegion->isReplicator());
4620 };
4621 if (none_of(Def->users(), IsCandidateUnpackUser))
4622 continue;
4623
4624 auto *Unpack = new VPInstruction(VPInstruction::Unpack, {Def});
4625 if (R.isPhi())
4626 Unpack->insertBefore(*VPBB, VPBB->getFirstNonPhi());
4627 else
4628 Unpack->insertAfter(&R);
4629 Def->replaceUsesWithIf(Unpack,
4630 [&IsCandidateUnpackUser](VPUser &U, unsigned) {
4631 return IsCandidateUnpackUser(&U);
4632 });
4633 }
4634 }
4635 }
4636}
4637
4639 VPBasicBlock *VectorPHVPBB,
4640 bool TailByMasking,
4641 bool RequiresScalarEpilogue) {
4642 VPValue &VectorTC = Plan.getVectorTripCount();
4643 assert(VectorTC.isLiveIn() && "vector-trip-count must be a live-in");
4644 // There's nothing to do if there are no users of the vector trip count or its
4645 // IR value has already been set.
4646 if (VectorTC.getNumUsers() == 0 || VectorTC.getLiveInIRValue())
4647 return;
4648
4649 VPValue *TC = Plan.getTripCount();
4650 Type *TCTy = VPTypeAnalysis(Plan).inferScalarType(TC);
4651 VPBuilder Builder(VectorPHVPBB, VectorPHVPBB->begin());
4652 VPValue *Step = &Plan.getVFxUF();
4653
4654 // If the tail is to be folded by masking, round the number of iterations N
4655 // up to a multiple of Step instead of rounding down. This is done by first
4656 // adding Step-1 and then rounding down. Note that it's ok if this addition
4657 // overflows: the vector induction variable will eventually wrap to zero given
4658 // that it starts at zero and its Step is a power of two; the loop will then
4659 // exit, with the last early-exit vector comparison also producing all-true.
4660 // For scalable vectors the VF is not guaranteed to be a power of 2, but this
4661 // is accounted for in emitIterationCountCheck that adds an overflow check.
4662 if (TailByMasking) {
4663 TC = Builder.createNaryOp(
4664 Instruction::Add,
4665 {TC, Builder.createNaryOp(Instruction::Sub,
4666 {Step, Plan.getConstantInt(TCTy, 1)})},
4667 DebugLoc::getCompilerGenerated(), "n.rnd.up");
4668 }
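  // Illustrative example: with an original trip count of 17 and Step = 8,
  // tail folding rounds the count up to 24 and the remainder computed below
  // is 0, so n.vec = 24; without tail folding, n.mod.vf = 1 and n.vec = 16.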
4669
4670 // Now we need to generate the expression for the part of the loop that the
4671 // vectorized body will execute. This is equal to N - (N % Step) if scalar
4672 // iterations are not required for correctness, or N - Step, otherwise. Step
4673 // is equal to the vectorization factor (number of SIMD elements) times the
4674 // unroll factor (number of SIMD instructions).
4675 VPValue *R =
4676 Builder.createNaryOp(Instruction::URem, {TC, Step},
4677 DebugLoc::getCompilerGenerated(), "n.mod.vf");
4678
4679 // There are cases where we *must* run at least one iteration in the remainder
4680 // loop. See the cost model for when this can happen. If the step evenly
4681 // divides the trip count, we set the remainder to be equal to the step. If
4682 // the step does not evenly divide the trip count, no adjustment is necessary
4683 // since there will already be scalar iterations. Note that the minimum
4684 // iterations check ensures that N >= Step.
4685 if (RequiresScalarEpilogue) {
4686 assert(!TailByMasking &&
4687           "requiring scalar epilogue is not supported with tail folding");
4688 VPValue *IsZero =
4689 Builder.createICmp(CmpInst::ICMP_EQ, R, Plan.getConstantInt(TCTy, 0));
4690 R = Builder.createSelect(IsZero, Step, R);
4691 }
4692
4693 VPValue *Res = Builder.createNaryOp(
4694 Instruction::Sub, {TC, R}, DebugLoc::getCompilerGenerated(), "n.vec");
4695 VectorTC.replaceAllUsesWith(Res);
4696}
4697
4699 ElementCount VFEC) {
4700 VPBuilder Builder(VectorPH, VectorPH->begin());
4701 Type *TCTy = VPTypeAnalysis(Plan).inferScalarType(Plan.getTripCount());
4702 VPValue &VF = Plan.getVF();
4703 VPValue &VFxUF = Plan.getVFxUF();
4704 // Note that after the transform, Plan.getVF and Plan.getVFxUF should not be
4705 // used.
4706 // TODO: Assert that they aren't used.
4707
4708 // If there are no users of the runtime VF, compute VFxUF by constant folding
4709 // the multiplication of VF and UF.
4710 if (VF.getNumUsers() == 0) {
4711 VPValue *RuntimeVFxUF =
4712 Builder.createElementCount(TCTy, VFEC * Plan.getUF());
4713 VFxUF.replaceAllUsesWith(RuntimeVFxUF);
4714 return;
4715 }
4716
4717 // For users of the runtime VF, compute it as VF * vscale, and VFxUF as (VF *
4718 // vscale) * UF.
4719 VPValue *RuntimeVF = Builder.createElementCount(TCTy, VFEC);
4721 VPValue *BC = Builder.createNaryOp(VPInstruction::Broadcast, RuntimeVF);
4723 BC, [&VF](VPUser &U, unsigned) { return !U.usesScalars(&VF); });
4724 }
4725 VF.replaceAllUsesWith(RuntimeVF);
4726
4727 VPValue *UF = Plan.getConstantInt(TCTy, Plan.getUF());
4728 VPValue *MulByUF = Builder.createOverflowingOp(
4729 Instruction::Mul, {RuntimeVF, UF}, {true, false});
4730 VFxUF.replaceAllUsesWith(MulByUF);
4731}
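// Editor's note (worked example; the values are hypothetical): with
// VF = vscale x 4 and UF = 2, if nothing uses the runtime VF, VFxUF is replaced
// by the single element count vscale x 8. Otherwise RuntimeVF = 4 * vscale is
// materialized, vector users of VF see a broadcast of it, scalar users use it
// directly, and VFxUF becomes RuntimeVF * 2 (with NUW).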
4732
4735 SCEVExpander Expander(SE, "induction", /*PreserveLCSSA=*/false);
4736
4737 auto *Entry = cast<VPIRBasicBlock>(Plan.getEntry());
4738 BasicBlock *EntryBB = Entry->getIRBasicBlock();
4739 DenseMap<const SCEV *, Value *> ExpandedSCEVs;
4740 for (VPRecipeBase &R : make_early_inc_range(*Entry)) {
4742 continue;
4743 auto *ExpSCEV = dyn_cast<VPExpandSCEVRecipe>(&R);
4744 if (!ExpSCEV)
4745 break;
4746 const SCEV *Expr = ExpSCEV->getSCEV();
4747 Value *Res =
4748 Expander.expandCodeFor(Expr, Expr->getType(), EntryBB->getTerminator());
4749 ExpandedSCEVs[ExpSCEV->getSCEV()] = Res;
4750 VPValue *Exp = Plan.getOrAddLiveIn(Res);
4751 ExpSCEV->replaceAllUsesWith(Exp);
4752 if (Plan.getTripCount() == ExpSCEV)
4753 Plan.resetTripCount(Exp);
4754 ExpSCEV->eraseFromParent();
4755 }
4757 "VPExpandSCEVRecipes must be at the beginning of the entry block, "
4758 "after any VPIRInstructions");
4759 // Add IR instructions in the entry basic block but not in the VPIRBasicBlock
4760 // to the VPIRBasicBlock.
4761 auto EI = Entry->begin();
4762 for (Instruction &I : drop_end(*EntryBB)) {
4763 if (EI != Entry->end() && isa<VPIRInstruction>(*EI) &&
4764 &cast<VPIRInstruction>(&*EI)->getInstruction() == &I) {
4765 EI++;
4766 continue;
4767 }
4769 }
4770
4771 return ExpandedSCEVs;
4772}
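// Editor's note (illustrative sketch; the expression is hypothetical): a
// VPExpandSCEVRecipe for, say, (%n /u 4) is expanded by the SCEVExpander into
// IR just before the entry block's terminator, all users of the recipe are
// redirected to a live-in wrapping the expanded Value (resetting the trip count
// if the recipe was the trip count), and the IR instructions created by the
// expansion are wrapped so the entry VPIRBasicBlock stays in sync with its
// underlying IR block.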
4773
4774/// Returns true if \p OpV is a VPWidenLoadRecipe or VPInterleaveRecipe that can
4775/// be converted to a narrower recipe. \p OpV is the operand at index \p OpIdx of
4776/// a wide recipe that feeds a store interleave group at index \p Idx, and
4777/// \p WideMember0 is the recipe feeding the same interleave group at index 0. A
4778/// VPWidenLoadRecipe can be narrowed to an index-independent load if it feeds
4779/// all wide ops at all indices, i.e. \p OpV must also be the operand at index
4780/// \p OpIdx of \p WideMember0. A VPInterleaveRecipe can be narrowed to a wide
4781/// load if \p OpV is defined at \p Idx of a full load interleave group.
4782static bool canNarrowLoad(VPWidenRecipe *WideMember0, unsigned OpIdx,
4783 VPValue *OpV, unsigned Idx) {
4784 VPValue *Member0Op = WideMember0->getOperand(OpIdx);
4785 VPRecipeBase *Member0OpR = Member0Op->getDefiningRecipe();
4786 if (!Member0OpR)
4787 return Member0Op == OpV;
4788 if (auto *W = dyn_cast<VPWidenLoadRecipe>(Member0OpR))
4789 return !W->getMask() && Member0Op == OpV;
4790 if (auto *IR = dyn_cast<VPInterleaveRecipe>(Member0OpR))
4791 return IR->getInterleaveGroup()->isFull() && IR->getVPValue(Idx) == OpV;
4792 return false;
4793}
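// Editor's note (illustrative example; the recipes are hypothetical): suppose a
// store interleave group's members are the WIDEN adds
//   %m0 = add %a0, %c   ; WideMember0, feeds the store group at index 0
//   %m1 = add %a1, %c   ; feeds the store group at index Idx = 1
// Then %a1 (OpIdx = 0) can be narrowed if %a0 and %a1 are the values defined at
// indices 0 and 1 of the same full load interleave group, and %c (OpIdx = 1)
// can be narrowed if it is the same unmasked wide load (or the same live-in)
// for both members.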
4794
4795/// Returns true if \p InterleaveR is a full interleave group with factor and
4796/// number of members both equal to \p VF. The interleave group must also access
4797/// the full vector width \p VectorRegWidth.
4799 ElementCount VF,
4800 VPTypeAnalysis &TypeInfo,
4801 TypeSize VectorRegWidth) {
4802 if (!InterleaveR || InterleaveR->getMask())
4803 return false;
4804
4805 Type *GroupElementTy = nullptr;
4806 if (InterleaveR->getStoredValues().empty()) {
4807 GroupElementTy = TypeInfo.inferScalarType(InterleaveR->getVPValue(0));
4808 if (!all_of(InterleaveR->definedValues(),
4809 [&TypeInfo, GroupElementTy](VPValue *Op) {
4810 return TypeInfo.inferScalarType(Op) == GroupElementTy;
4811 }))
4812 return false;
4813 } else {
4814 GroupElementTy =
4815 TypeInfo.inferScalarType(InterleaveR->getStoredValues()[0]);
4816 if (!all_of(InterleaveR->getStoredValues(),
4817 [&TypeInfo, GroupElementTy](VPValue *Op) {
4818 return TypeInfo.inferScalarType(Op) == GroupElementTy;
4819 }))
4820 return false;
4821 }
4822
4823 unsigned VFMin = VF.getKnownMinValue();
4824 TypeSize GroupSize = TypeSize::get(
4825 GroupElementTy->getScalarSizeInBits() * VFMin, VF.isScalable());
4826 const auto *IG = InterleaveR->getInterleaveGroup();
4827 return IG->getFactor() == VFMin && IG->getNumMembers() == VFMin &&
4828 GroupSize == VectorRegWidth;
4829}
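// Editor's note (worked example; target details are hypothetical): for VF = 4
// with i32 group members on a target with 128-bit vector registers, an unmasked
// group with factor 4 and 4 members has GroupSize = 4 * 32 = 128 bits, which
// equals VectorRegWidth, so it qualifies; a factor-2 group, or the same group
// with i16 members (GroupSize = 64 bits), does not.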
4830
4831/// Returns true if \p VPV is a narrow VPValue.
4832static bool isAlreadyNarrow(VPValue *VPV) {
4833 if (VPV->isLiveIn())
4834 return true;
4835 auto *RepR = dyn_cast<VPReplicateRecipe>(VPV);
4836 return RepR && RepR->isSingleScalar();
4837}
4838
4839// Convert a wide recipe defining a VPValue \p V feeding an interleave group to
4840// a narrow variant.
4841static VPValue *
4843 auto *R = V->getDefiningRecipe();
4844 if (!R || NarrowedOps.contains(V))
4845 return V;
4846
4847 if (isAlreadyNarrow(V))
4848 return V;
4849
4850 if (auto *WideMember0 = dyn_cast<VPWidenRecipe>(R)) {
4851 for (unsigned Idx = 0, E = WideMember0->getNumOperands(); Idx != E; ++Idx)
4852 WideMember0->setOperand(
4853 Idx,
4854 narrowInterleaveGroupOp(WideMember0->getOperand(Idx), NarrowedOps));
4855 return V;
4856 }
4857
4858 if (auto *LoadGroup = dyn_cast<VPInterleaveRecipe>(R)) {
4859 // Narrow the interleave group to a wide load, as the transformed VPlan
4860 // will only process one original iteration.
4861 auto *LI = cast<LoadInst>(LoadGroup->getInterleaveGroup()->getInsertPos());
4862 auto *L = new VPWidenLoadRecipe(
4863 *LI, LoadGroup->getAddr(), LoadGroup->getMask(), /*Consecutive=*/true,
4864 /*Reverse=*/false, {}, LoadGroup->getDebugLoc());
4865 L->insertBefore(LoadGroup);
4866 NarrowedOps.insert(L);
4867 return L;
4868 }
4869
4870 if (auto *RepR = dyn_cast<VPReplicateRecipe>(R)) {
4871 assert(RepR->isSingleScalar() &&
4872 isa<LoadInst>(RepR->getUnderlyingInstr()) &&
4873 "must be a single scalar load");
4874 NarrowedOps.insert(RepR);
4875 return RepR;
4876 }
4877
4878 auto *WideLoad = cast<VPWidenLoadRecipe>(R);
4879 VPValue *PtrOp = WideLoad->getAddr();
4880 if (auto *VecPtr = dyn_cast<VPVectorPointerRecipe>(PtrOp))
4881 PtrOp = VecPtr->getOperand(0);
4882 // Narrow the wide load to a uniform scalar load, as the transformed VPlan
4883 // will only process one original iteration.
4884 auto *N = new VPReplicateRecipe(&WideLoad->getIngredient(), {PtrOp},
4885 /*IsUniform*/ true,
4886 /*Mask*/ nullptr, {}, *WideLoad);
4887 N->insertBefore(WideLoad);
4888 NarrowedOps.insert(N);
4889 return N;
4890}
4891
4893 TypeSize VectorRegWidth) {
4894 VPRegionBlock *VectorLoop = Plan.getVectorLoopRegion();
4895 if (!VectorLoop || VectorLoop->getEntry()->getNumSuccessors() != 0)
4896 return;
4897
4898 VPTypeAnalysis TypeInfo(Plan);
4899
4901 for (auto &R : *VectorLoop->getEntryBasicBlock()) {
4903 continue;
4904
4907 continue;
4908
4909 // Bail out on recipes not supported at the moment:
4910 // * phi recipes other than the canonical induction
4911 // * recipes writing to memory except interleave groups
4912 // Only support plans with a canonical induction phi.
4913 if (R.isPhi())
4914 return;
4915
4916 auto *InterleaveR = dyn_cast<VPInterleaveRecipe>(&R);
4917 if (R.mayWriteToMemory() && !InterleaveR)
4918 return;
4919
4920 // Do not narrow interleave groups if there are VectorPointer recipes and
4921 // the plan was unrolled. The recipe implicitly uses VF from
4922 // VPTransformState.
4923 // TODO: Remove restriction once the VF for the VectorPointer offset is
4924 // modeled explicitly as operand.
4925 if (isa<VPVectorPointerRecipe>(&R) && Plan.getUF() > 1)
4926 return;
4927
4928 // All other ops are allowed, but we reject uses that cannot be converted
4929 // when checking all allowed consumers (store interleave groups) below.
4930 if (!InterleaveR)
4931 continue;
4932
4933 // Bail out on non-consecutive interleave groups.
4934 if (!isConsecutiveInterleaveGroup(InterleaveR, VF, TypeInfo,
4935 VectorRegWidth))
4936 return;
4937
4938 // Skip read interleave groups.
4939 if (InterleaveR->getStoredValues().empty())
4940 continue;
4941
4942 // Narrow interleave groups, if all operands are already matching narrow
4943 // ops.
4944 auto *Member0 = InterleaveR->getStoredValues()[0];
4945 if (isAlreadyNarrow(Member0) &&
4946 all_of(InterleaveR->getStoredValues(),
4947 [Member0](VPValue *VPV) { return Member0 == VPV; })) {
4948 StoreGroups.push_back(InterleaveR);
4949 continue;
4950 }
4951
4952 // For now, we only support full interleave groups storing load interleave
4953 // groups.
4954 if (all_of(enumerate(InterleaveR->getStoredValues()), [](auto Op) {
4955 VPRecipeBase *DefR = Op.value()->getDefiningRecipe();
4956 if (!DefR)
4957 return false;
4958 auto *IR = dyn_cast<VPInterleaveRecipe>(DefR);
4959 return IR && IR->getInterleaveGroup()->isFull() &&
4960 IR->getVPValue(Op.index()) == Op.value();
4961 })) {
4962 StoreGroups.push_back(InterleaveR);
4963 continue;
4964 }
4965
4966 // Check if all values feeding InterleaveR are matching wide recipes whose
4967 // operands can be narrowed.
4968 auto *WideMember0 =
4969 dyn_cast_or_null<VPWidenRecipe>(InterleaveR->getStoredValues()[0]);
4970 if (!WideMember0)
4971 return;
4972 for (const auto &[I, V] : enumerate(InterleaveR->getStoredValues())) {
4974 if (!R || R->getOpcode() != WideMember0->getOpcode() ||
4975 R->getNumOperands() > 2)
4976 return;
4977 if (any_of(enumerate(R->operands()),
4978 [WideMember0, Idx = I](const auto &P) {
4979 const auto &[OpIdx, OpV] = P;
4980 return !canNarrowLoad(WideMember0, OpIdx, OpV, Idx);
4981 }))
4982 return;
4983 }
4984 StoreGroups.push_back(InterleaveR);
4985 }
4986
4987 if (StoreGroups.empty())
4988 return;
4989
4990 // Convert each store interleave group to a single VPWidenStoreRecipe below.
4991 SmallPtrSet<VPValue *, 4> NarrowedOps;
4992 // Narrow operation tree rooted at store groups.
4993 for (auto *StoreGroup : StoreGroups) {
4994 VPValue *Res =
4995 narrowInterleaveGroupOp(StoreGroup->getStoredValues()[0], NarrowedOps);
4996 auto *SI =
4997 cast<StoreInst>(StoreGroup->getInterleaveGroup()->getInsertPos());
4998 auto *S = new VPWidenStoreRecipe(
4999 *SI, StoreGroup->getAddr(), Res, nullptr, /*Consecutive=*/true,
5000 /*Reverse=*/false, {}, StoreGroup->getDebugLoc());
5001 S->insertBefore(StoreGroup);
5002 StoreGroup->eraseFromParent();
5003 }
5004
5005 // Adjust induction to reflect that the transformed plan only processes one
5006 // original iteration.
5007 auto *CanIV = VectorLoop->getCanonicalIV();
5008 auto *Inc = cast<VPInstruction>(CanIV->getBackedgeValue());
5009 VPBuilder PHBuilder(Plan.getVectorPreheader());
5010
5011 VPValue *UF = Plan.getOrAddLiveIn(
5012 ConstantInt::get(VectorLoop->getCanonicalIVType(), 1 * Plan.getUF()));
5013 if (VF.isScalable()) {
5014 VPValue *VScale = PHBuilder.createElementCount(
5016 VPValue *VScaleUF = PHBuilder.createOverflowingOp(
5017 Instruction::Mul, {VScale, UF}, {true, false});
5018 Inc->setOperand(1, VScaleUF);
5019 Plan.getVF().replaceAllUsesWith(VScale);
5020 } else {
5021 Inc->setOperand(1, UF);
5023 Plan.getConstantInt(CanIV->getScalarType(), 1));
5024 }
5025 removeDeadRecipes(Plan);
5026}
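// Editor's note (illustrative sketch of the narrowing; the loop and recipes are
// hypothetical): consider copying structs with VF consecutive fields, e.g.
//   for (i) { B[i].x = A[i].x; B[i].y = A[i].y; ... }
// where both the loads and the stores form full factor-VF interleave groups
// covering a whole vector register. The groups
//   %g = INTERLEAVE-GROUP load A[i]              ; defines VF member values
//   INTERLEAVE-GROUP store B[i], %g.0, ..., %g.(VF-1)
// are rewritten into a single consecutive wide load and store
//   %w = WIDEN load A[i]   (consecutive)
//   WIDEN store B[i], %w   (consecutive)
// and the canonical IV increment is reduced from VF * UF to UF (scaled by
// vscale for scalable VFs), since each vector iteration of the narrowed plan
// covers one original iteration per unrolled part.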
5027
5028/// Add branch weight metadata, if the \p Plan's middle block is terminated by a
5029/// BranchOnCond recipe.
5031 VPlan &Plan, ElementCount VF, std::optional<unsigned> VScaleForTuning) {
5032 VPBasicBlock *MiddleVPBB = Plan.getMiddleBlock();
5033 auto *MiddleTerm =
5035 // Only add branch metadata if there is a (conditional) terminator.
5036 if (!MiddleTerm)
5037 return;
5038
5039 assert(MiddleTerm->getOpcode() == VPInstruction::BranchOnCond &&
5040 "must have a BranchOnCond");
5041 // Assume that `TripCount % VectorStep` is equally distributed.
5042 unsigned VectorStep = Plan.getUF() * VF.getKnownMinValue();
5043 if (VF.isScalable() && VScaleForTuning.has_value())
5044 VectorStep *= *VScaleForTuning;
5045 assert(VectorStep > 0 && "trip count should not be zero");
5046 MDBuilder MDB(Plan.getContext());
5047 MDNode *BranchWeights =
5048 MDB.createBranchWeights({1, VectorStep - 1}, /*IsExpected=*/false);
5049 MiddleTerm->setMetadata(LLVMContext::MD_prof, BranchWeights);
5050}
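// Editor's note (worked example; the values are hypothetical): with UF = 2,
// VF = vscale x 4 and VScaleForTuning = 2, VectorStep = 2 * 4 * 2 = 16, so the
// middle-block terminator gets branch weights {1, 15}, modelling a uniformly
// distributed TripCount % VectorStep.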
5051
5052/// Compute and return the end value for \p WideIV, unless it is truncated. If
5053/// the induction recipe is not canonical, creates a VPDerivedIVRecipe to
5054/// compute the end value of the induction.
5056 VPBuilder &VectorPHBuilder,
5057 VPTypeAnalysis &TypeInfo,
5058 VPValue *VectorTC) {
5059 auto *WideIntOrFp = dyn_cast<VPWidenIntOrFpInductionRecipe>(WideIV);
5060 // Truncated wide inductions resume from the last lane of their vector value
5061 // in the last vector iteration which is handled elsewhere.
5062 if (WideIntOrFp && WideIntOrFp->getTruncInst())
5063 return nullptr;
5064
5065 VPValue *Start = WideIV->getStartValue();
5066 VPValue *Step = WideIV->getStepValue();
5068 VPValue *EndValue = VectorTC;
5069 if (!WideIntOrFp || !WideIntOrFp->isCanonical()) {
5070 EndValue = VectorPHBuilder.createDerivedIV(
5071 ID.getKind(), dyn_cast_or_null<FPMathOperator>(ID.getInductionBinOp()),
5072 Start, VectorTC, Step);
5073 }
5074
5075 // EndValue is derived from the vector trip count (which has the same type as
5076 // the widest induction) and thus may be wider than the induction here.
5077 Type *ScalarTypeOfWideIV = TypeInfo.inferScalarType(WideIV);
5078 if (ScalarTypeOfWideIV != TypeInfo.inferScalarType(EndValue)) {
5079 EndValue = VectorPHBuilder.createScalarCast(Instruction::Trunc, EndValue,
5080 ScalarTypeOfWideIV,
5081 WideIV->getDebugLoc());
5082 }
5083
5084 return EndValue;
5085}
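// Editor's note (illustrative arithmetic, assuming an integer induction with
// hypothetical values): for a non-canonical induction with Start = 3, Step = 2
// and a vector trip count of 8, the derived IV created above evaluates to
// 3 + 8 * 2 = 19; for the canonical induction the vector trip count is used
// directly, truncated if the induction type is narrower.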
5086
5088 VPlan &Plan, DenseMap<VPValue *, VPValue *> &IVEndValues) {
5089 VPTypeAnalysis TypeInfo(Plan);
5090 auto *ScalarPH = Plan.getScalarPreheader();
5091 auto *MiddleVPBB = cast<VPBasicBlock>(ScalarPH->getPredecessors()[0]);
5092 VPRegionBlock *VectorRegion = Plan.getVectorLoopRegion();
5093 VPBuilder VectorPHBuilder(
5094 cast<VPBasicBlock>(VectorRegion->getSinglePredecessor()));
5095 VPBuilder MiddleBuilder(MiddleVPBB, MiddleVPBB->getFirstNonPhi());
5096 for (VPRecipeBase &PhiR : Plan.getScalarPreheader()->phis()) {
5097 auto *ResumePhiR = cast<VPPhi>(&PhiR);
5098
5099 // TODO: Extract final value from induction recipe initially, optimize to
5100 // pre-computed end value together in optimizeInductionExitUsers.
5101 auto *VectorPhiR = cast<VPHeaderPHIRecipe>(ResumePhiR->getOperand(0));
5102 if (auto *WideIVR = dyn_cast<VPWidenInductionRecipe>(VectorPhiR)) {
5104 WideIVR, VectorPHBuilder, TypeInfo, &Plan.getVectorTripCount())) {
5105 IVEndValues[WideIVR] = EndValue;
5106 ResumePhiR->setOperand(0, EndValue);
5107 ResumePhiR->setName("bc.resume.val");
5108 continue;
5109 }
5110 // TODO: Also handle truncated inductions here. Computing end-values
5111 // separately should be done as VPlan-to-VPlan optimization, after
5112 // legalizing all resume values to use the last lane from the loop.
5113 assert(cast<VPWidenIntOrFpInductionRecipe>(VectorPhiR)->getTruncInst() &&
5114 "should only skip truncated wide inductions");
5115 continue;
5116 }
5117
5118 // The backedge value provides the value to resume coming out of a loop,
5119 // which for FORs is a vector whose last element needs to be extracted. The
5120 // start value provides the value if the loop is bypassed.
5121 bool IsFOR = isa<VPFirstOrderRecurrencePHIRecipe>(VectorPhiR);
5122 auto *ResumeFromVectorLoop = VectorPhiR->getBackedgeValue();
5123 assert(VectorRegion->getSingleSuccessor() == Plan.getMiddleBlock() &&
5124 "Cannot handle loops with uncountable early exits");
5125 if (IsFOR) {
5126 auto *ExtractPart = MiddleBuilder.createNaryOp(
5127 VPInstruction::ExtractLastPart, ResumeFromVectorLoop);
5128 ResumeFromVectorLoop = MiddleBuilder.createNaryOp(
5130 "vector.recur.extract");
5131 }
5132 ResumePhiR->setName(IsFOR ? "scalar.recur.init" : "bc.merge.rdx");
5133 ResumePhiR->setOperand(0, ResumeFromVectorLoop);
5134 }
5135}
5136
5138 VFRange &Range) {
5139 VPRegionBlock *VectorRegion = Plan.getVectorLoopRegion();
5140 auto *ScalarPHVPBB = Plan.getScalarPreheader();
5141 auto *MiddleVPBB = Plan.getMiddleBlock();
5142 VPBuilder ScalarPHBuilder(ScalarPHVPBB);
5143 VPBuilder MiddleBuilder(MiddleVPBB, MiddleVPBB->getFirstNonPhi());
5144
5145 auto IsScalableOne = [](ElementCount VF) -> bool {
5146 return VF == ElementCount::getScalable(1);
5147 };
5148
5149 for (auto &HeaderPhi : VectorRegion->getEntryBasicBlock()->phis()) {
5150 auto *FOR = dyn_cast<VPFirstOrderRecurrencePHIRecipe>(&HeaderPhi);
5151 if (!FOR)
5152 continue;
5153
5154 assert(VectorRegion->getSingleSuccessor() == Plan.getMiddleBlock() &&
5155 "Cannot handle loops with uncountable early exits");
5156
5157 // This is the second phase of vectorizing first-order recurrences, creating
5158 // extracts for users outside the loop. An overview of the transformation is
5159 // described below. Suppose we have the following loop with some use after
5160 // the loop of the last a[i-1],
5161 //
5162 // for (int i = 0; i < n; ++i) {
5163 // t = a[i - 1];
5164 // b[i] = a[i] - t;
5165 // }
5166 // use t;
5167 //
5168 // There is a first-order recurrence on "a". For this loop, the shorthand
5169 // scalar IR looks like:
5170 //
5171 // scalar.ph:
5172 // s.init = a[-1]
5173 // br scalar.body
5174 //
5175 // scalar.body:
5176 // i = phi [0, scalar.ph], [i+1, scalar.body]
5177 // s1 = phi [s.init, scalar.ph], [s2, scalar.body]
5178 // s2 = a[i]
5179 // b[i] = s2 - s1
5180 // br cond, scalar.body, exit.block
5181 //
5182 // exit.block:
5183 // use = lcssa.phi [s1, scalar.body]
5184 //
5185 // In this example, s1 is a recurrence because its value depends on the
5186 // previous iteration. In the first phase of vectorization, we created a
5187 // VPFirstOrderRecurrencePHIRecipe v1 for s1. Now we create the extracts
5188 // for users in the scalar preheader and exit block.
5189 //
5190 // vector.ph:
5191 // v_init = vector(..., ..., ..., a[-1])
5192 // br vector.body
5193 //
5194 // vector.body
5195 // i = phi [0, vector.ph], [i+4, vector.body]
5196 // v1 = phi [v_init, vector.ph], [v2, vector.body]
5197 // v2 = a[i, i+1, i+2, i+3]
5198 // b[i] = v2 - v1
5199 // // Next, third phase will introduce v1' = splice(v1(3), v2(0, 1, 2))
5200 // b[i, i+1, i+2, i+3] = v2 - v1
5201 // br cond, vector.body, middle.block
5202 //
5203 // middle.block:
5204 // vector.recur.extract.for.phi = v2(2)
5205 // vector.recur.extract = v2(3)
5206 // br cond, scalar.ph, exit.block
5207 //
5208 // scalar.ph:
5209 // scalar.recur.init = phi [vector.recur.extract, middle.block],
5210 // [s.init, otherwise]
5211 // br scalar.body
5212 //
5213 // scalar.body:
5214 // i = phi [0, scalar.ph], [i+1, scalar.body]
5215 // s1 = phi [scalar.recur.init, scalar.ph], [s2, scalar.body]
5216 // s2 = a[i]
5217 // b[i] = s2 - s1
5218 // br cond, scalar.body, exit.block
5219 //
5220 // exit.block:
5221 // lo = lcssa.phi [s1, scalar.body],
5222 // [vector.recur.extract.for.phi, middle.block]
5223 //
5224 // Now update VPIRInstructions modeling LCSSA phis in the exit block.
5225 // Extract the penultimate value of the recurrence and use it as operand for
5226 // the VPIRInstruction modeling the phi.
5228 make_range(MiddleVPBB->getFirstNonPhi(), MiddleVPBB->end()))) {
5230 continue;
5231
5232 // For VF vscale x 1, if vscale = 1, we are unable to extract the
5233 // penultimate value of the recurrence. Instead we rely on the existing
5234 // extract of the last element from the result of
5235 // VPInstruction::FirstOrderRecurrenceSplice.
5236 // TODO: Consider vscale_range info and UF.
5238 Range))
5239 return;
5240 VPValue *PenultimateElement = MiddleBuilder.createNaryOp(
5241 VPInstruction::ExtractPenultimateElement, FOR->getBackedgeValue(), {},
5242 "vector.recur.extract.for.phi");
5243 cast<VPInstruction>(&R)->replaceAllUsesWith(PenultimateElement);
5244 }
5245 }
5246}
assert(UImm &&(UImm !=~static_cast< T >(0)) &&"Invalid immediate!")
AMDGPU Register Bank Select
This file implements a class to represent arbitrary precision integral constant values and operations...
ReachingDefInfo InstSet & ToRemove
MachineBasicBlock MachineBasicBlock::iterator DebugLoc DL
static bool isEqual(const Function &Caller, const Function &Callee)
static const Function * getParent(const Value *V)
static GCRegistry::Add< ErlangGC > A("erlang", "erlang-compatible garbage collector")
static GCRegistry::Add< CoreCLRGC > E("coreclr", "CoreCLR-compatible GC")
static GCRegistry::Add< OcamlGC > B("ocaml", "ocaml 3.10-compatible GC")
static cl::opt< OutputCostKind > CostKind("cost-kind", cl::desc("Target cost kind"), cl::init(OutputCostKind::RecipThroughput), cl::values(clEnumValN(OutputCostKind::RecipThroughput, "throughput", "Reciprocal throughput"), clEnumValN(OutputCostKind::Latency, "latency", "Instruction latency"), clEnumValN(OutputCostKind::CodeSize, "code-size", "Code size"), clEnumValN(OutputCostKind::SizeAndLatency, "size-latency", "Code size and latency"), clEnumValN(OutputCostKind::All, "all", "Print all cost kinds")))
static bool isSentinel(const DWARFDebugNames::AttributeEncoding &AE)
@ Default
Hexagon Common GEP
iv Induction Variable Users
Definition IVUsers.cpp:48
const AbstractManglingParser< Derived, Alloc >::OperatorInfo AbstractManglingParser< Derived, Alloc >::Ops[]
licm
Definition LICM.cpp:383
Legalize the Machine IR a function s Machine IR
Definition Legalizer.cpp:80
static bool mergeBlocksIntoPredecessors(Loop &L, DominatorTree &DT, LoopInfo &LI, MemorySSAUpdater *MSSAU, ScalarEvolution &SE)
#define I(x, y, z)
Definition MD5.cpp:57
static DebugLoc getDebugLoc(MachineBasicBlock::instr_iterator FirstMI, MachineBasicBlock::instr_iterator LastMI)
Return the first DebugLoc that has line number information, given a range of instructions.
This file provides utility analysis objects describing memory locations.
This file contains the declarations for metadata subclasses.
MachineInstr unsigned OpIdx
ConstantRange Range(APInt(BitWidth, Low), APInt(BitWidth, High))
uint64_t IntrinsicInst * II
#define P(N)
This file builds on the ADT/GraphTraits.h file to build a generic graph post order iterator.
const SmallVectorImpl< MachineOperand > & Cond
This file contains some templates that are useful if you are working with the STL at all.
This is the interface for a metadata-based scoped no-alias analysis.
This file defines generic set operations that may be used on set's of different types,...
This file implements a set that has insertion order iteration characteristics.
This file defines the SmallPtrSet class.
static TableGen::Emitter::Opt Y("gen-skeleton-entry", EmitSkeleton, "Generate example skeleton entry")
static TableGen::Emitter::OptClass< SkeletonEmitter > X("gen-skeleton-class", "Generate example skeleton class")
static SymbolRef::Type getType(const Symbol *Sym)
Definition TapiFile.cpp:39
This file implements the TypeSwitch template, which mimics a switch() statement whose cases are type ...
This file implements dominator tree analysis for a single level of a VPlan's H-CFG.
This file contains the declarations of different VPlan-related auxiliary helpers.
static SmallVector< SmallVector< VPReplicateRecipe *, 4 > > collectComplementaryPredicatedMemOps(VPlan &Plan, PredicatedScalarEvolution &PSE, const Loop *L)
static void removeCommonBlendMask(VPBlendRecipe *Blend)
Try to see if all of Blend's masks share a common value logically and'ed and remove it from the masks...
static void tryToCreateAbstractReductionRecipe(VPReductionRecipe *Red, VPCostContext &Ctx, VFRange &Range)
This function tries to create abstract recipes from the reduction recipe for following optimizations ...
static VPReplicateRecipe * findRecipeWithMinAlign(ArrayRef< VPReplicateRecipe * > Group)
static bool sinkScalarOperands(VPlan &Plan)
static bool cannotHoistOrSinkRecipe(const VPRecipeBase &R)
Return true if we do not know how to (mechanically) hoist or sink R out of a loop region.
static bool simplifyBranchConditionForVFAndUF(VPlan &Plan, ElementCount BestVF, unsigned BestUF, PredicatedScalarEvolution &PSE)
Try to simplify the branch condition of Plan.
static void simplifyRecipe(VPSingleDefRecipe *Def, VPTypeAnalysis &TypeInfo)
Try to simplify VPSingleDefRecipe Def.
static void removeRedundantInductionCasts(VPlan &Plan)
Remove redundant casts of inductions.
static bool isConditionTrueViaVFAndUF(VPValue *Cond, VPlan &Plan, ElementCount BestVF, unsigned BestUF, PredicatedScalarEvolution &PSE)
Return true if Cond is known to be true for given BestVF and BestUF.
static bool tryToReplaceALMWithWideALM(VPlan &Plan, ElementCount VF, unsigned UF)
Try to replace multiple active lane masks used for control flow with a single, wide active lane mask ...
static std::optional< std::pair< bool, unsigned > > getOpcodeOrIntrinsicID(const VPSingleDefRecipe *R)
Get any instruction opcode or intrinsic ID data embedded in recipe R.
static VPExpressionRecipe * tryToMatchAndCreateExtendedReduction(VPReductionRecipe *Red, VPCostContext &Ctx, VFRange &Range)
This function tries convert extended in-loop reductions to VPExpressionRecipe and clamp the Range if ...
static VPScalarIVStepsRecipe * createScalarIVSteps(VPlan &Plan, InductionDescriptor::InductionKind Kind, Instruction::BinaryOps InductionOpcode, FPMathOperator *FPBinOp, Instruction *TruncI, VPValue *StartV, VPValue *Step, DebugLoc DL, VPBuilder &Builder)
static RemoveMask_match< Op0_t, Op1_t > m_RemoveMask(const Op0_t &In, Op1_t &Out)
Match a specific mask In, or a combination of it (logical-and In, Out).
static VPIRMetadata getCommonMetadata(ArrayRef< VPReplicateRecipe * > Recipes)
static VPValue * getPredicatedMask(VPRegionBlock *R)
If R is a region with a VPBranchOnMaskRecipe in the entry block, return the mask.
static bool sinkRecurrenceUsersAfterPrevious(VPFirstOrderRecurrencePHIRecipe *FOR, VPRecipeBase *Previous, VPDominatorTree &VPDT)
Sink users of FOR after the recipe defining the previous value Previous of the recurrence.
static bool mergeReplicateRegionsIntoSuccessors(VPlan &Plan)
static VPWidenInductionRecipe * getOptimizableIVOf(VPValue *VPV, PredicatedScalarEvolution &PSE)
Check if VPV is an untruncated wide induction, either before or after the increment.
static VPActiveLaneMaskPHIRecipe * addVPLaneMaskPhiAndUpdateExitBranch(VPlan &Plan, bool DataAndControlFlowWithoutRuntimeCheck)
static void expandVPWidenPointerInduction(VPWidenPointerInductionRecipe *R, VPTypeAnalysis &TypeInfo)
Expand a VPWidenPointerInductionRecipe into executable recipes, for the initial value,...
static void transformRecipestoEVLRecipes(VPlan &Plan, VPValue &EVL)
Replace recipes with their EVL variants.
static bool isDeadRecipe(VPRecipeBase &R)
Returns true if R is dead and can be removed.
static void legalizeAndOptimizeInductions(VPlan &Plan)
Legalize VPWidenPointerInductionRecipe, by replacing it with a PtrAdd (IndStart, ScalarIVSteps (0,...
static void addReplicateRegions(VPlan &Plan)
static VPValue * tryToFoldLiveIns(VPSingleDefRecipe &R, ArrayRef< VPValue * > Operands, const DataLayout &DL, VPTypeAnalysis &TypeInfo)
Try to fold R using InstSimplifyFolder.
static VPValue * tryToComputeEndValueForInduction(VPWidenInductionRecipe *WideIV, VPBuilder &VectorPHBuilder, VPTypeAnalysis &TypeInfo, VPValue *VectorTC)
Compute and return the end value for WideIV, unless it is truncated.
static void removeRedundantExpandSCEVRecipes(VPlan &Plan)
Remove redundant EpxandSCEVRecipes in Plan's entry block by replacing them with already existing reci...
static bool simplifyKnownEVL(VPlan &Plan, ElementCount VF, PredicatedScalarEvolution &PSE)
From the definition of llvm.experimental.get.vector.length, VPInstruction::ExplicitVectorLength(AVL) ...
static bool hoistPreviousBeforeFORUsers(VPFirstOrderRecurrencePHIRecipe *FOR, VPRecipeBase *Previous, VPDominatorTree &VPDT)
Try to hoist Previous and its operands before all users of FOR.
static VPValue * scalarizeVPWidenPointerInduction(VPWidenPointerInductionRecipe *PtrIV, VPlan &Plan, VPBuilder &Builder)
Scalarize a VPWidenPointerInductionRecipe by replacing it with a PtrAdd (IndStart,...
static SmallVector< VPUser * > collectUsersRecursively(VPValue *V)
static VPValue * optimizeEarlyExitInductionUser(VPlan &Plan, VPTypeAnalysis &TypeInfo, VPBlockBase *PredVPBB, VPValue *Op, PredicatedScalarEvolution &PSE)
Attempts to optimize the induction variable exit values for users in the early exit block.
static void recursivelyDeleteDeadRecipes(VPValue *V)
static bool canSinkStoreWithNoAliasCheck(ArrayRef< VPReplicateRecipe * > StoresToSink, PredicatedScalarEvolution &PSE, const Loop &L, VPTypeAnalysis &TypeInfo)
static VPRegionBlock * createReplicateRegion(VPReplicateRecipe *PredRecipe, VPlan &Plan)
static VPBasicBlock * getPredicatedThenBlock(VPRegionBlock *R)
If R is a triangle region, return the 'then' block of the triangle.
static VPValue * narrowInterleaveGroupOp(VPValue *V, SmallPtrSetImpl< VPValue * > &NarrowedOps)
static bool canHoistOrSinkWithNoAliasCheck(const MemoryLocation &MemLoc, VPBasicBlock *FirstBB, VPBasicBlock *LastBB, std::optional< SinkStoreInfo > SinkInfo={})
Check if a memory operation doesn't alias with memory operations in blocks between FirstBB and LastBB...
static void simplifyBlends(VPlan &Plan)
Normalize and simplify VPBlendRecipes.
static bool isConsecutiveInterleaveGroup(VPInterleaveRecipe *InterleaveR, ElementCount VF, VPTypeAnalysis &TypeInfo, TypeSize VectorRegWidth)
Returns true if IR is a full interleave group with factor and number of members both equal to VF.
static VPRecipeBase * optimizeMaskToEVL(VPValue *HeaderMask, VPRecipeBase &CurRecipe, VPTypeAnalysis &TypeInfo, VPValue &EVL)
Try to optimize a CurRecipe masked by HeaderMask to a corresponding EVL-based recipe without the head...
static bool isAlreadyNarrow(VPValue *VPV)
Returns true if VPValue is a narrow VPValue.
static bool optimizeVectorInductionWidthForTCAndVFUF(VPlan &Plan, ElementCount BestVF, unsigned BestUF)
Optimize the width of vector induction variables in Plan based on a known constant Trip Count,...
static VPExpressionRecipe * tryToMatchAndCreateMulAccumulateReduction(VPReductionRecipe *Red, VPCostContext &Ctx, VFRange &Range)
This function tries convert extended in-loop reductions to VPExpressionRecipe and clamp the Range if ...
static void expandVPWidenIntOrFpInduction(VPWidenIntOrFpInductionRecipe *WidenIVR, VPTypeAnalysis &TypeInfo)
Expand a VPWidenIntOrFpInduction into executable recipes, for the initial value, phi and backedge val...
static VPSingleDefRecipe * findHeaderMask(VPlan &Plan)
Collect the header mask with the pattern: (ICMP_ULE, WideCanonicalIV, backedge-taken-count) TODO: Int...
static void removeRedundantCanonicalIVs(VPlan &Plan)
Try to replace VPWidenCanonicalIVRecipes with a widened canonical IV recipe, if it exists.
static bool canNarrowLoad(VPWidenRecipe *WideMember0, unsigned OpIdx, VPValue *OpV, unsigned Idx)
Returns true if V is VPWidenLoadRecipe or VPInterleaveRecipe that can be converted to a narrower reci...
static void narrowToSingleScalarRecipes(VPlan &Plan)
static VPValue * optimizeLatchExitInductionUser(VPlan &Plan, VPTypeAnalysis &TypeInfo, VPBlockBase *PredVPBB, VPValue *Op, DenseMap< VPValue *, VPValue * > &EndValues, PredicatedScalarEvolution &PSE)
Attempts to optimize the induction variable exit values for users in the exit block coming from the l...
This file provides utility VPlan to VPlan transformations.
This file declares the class VPlanVerifier, which contains utility functions to check the consistency...
This file contains the declarations of the Vectorization Plan base classes:
static const X86InstrFMA3Group Groups[]
Value * RHS
Value * LHS
BinaryOperator * Mul
static const uint32_t IV[8]
Definition blake3_impl.h:83
Helper for extra no-alias checks via known-safe recipe and SCEV.
SinkStoreInfo(const SmallPtrSetImpl< VPRecipeBase * > &ExcludeRecipes, VPReplicateRecipe &GroupLeader, PredicatedScalarEvolution &PSE, const Loop &L, VPTypeAnalysis &TypeInfo)
bool shouldSkip(VPRecipeBase &R) const
Return true if R should be skipped during alias checking, either because it's in the exclude set or b...
Class for arbitrary precision integers.
Definition APInt.h:78
LLVM_ABI APInt zext(unsigned width) const
Zero extend to a new width.
Definition APInt.cpp:1012
unsigned getActiveBits() const
Compute the number of active bits in the value.
Definition APInt.h:1513
APInt abs() const
Get the absolute value.
Definition APInt.h:1796
unsigned getBitWidth() const
Return the number of bits in the APInt.
Definition APInt.h:1489
LLVM_ABI APInt sext(unsigned width) const
Sign extend to a new width.
Definition APInt.cpp:985
bool uge(const APInt &RHS) const
Unsigned greater or equal comparison.
Definition APInt.h:1222
ArrayRef - Represent a constant reference to an array (0 or more elements consecutively in memory),...
Definition ArrayRef.h:40
const T & back() const
back - Get the last element.
Definition ArrayRef.h:151
const T & front() const
front - Get the first element.
Definition ArrayRef.h:145
iterator end() const
Definition ArrayRef.h:131
iterator begin() const
Definition ArrayRef.h:130
LLVM Basic Block Representation.
Definition BasicBlock.h:62
const Function * getParent() const
Return the enclosing method, or null if none.
Definition BasicBlock.h:213
LLVM_ABI const DataLayout & getDataLayout() const
Get the data layout of the module this basic block belongs to.
const Instruction * getTerminator() const LLVM_READONLY
Returns the terminator instruction if the block is well formed or null if the block is not well forme...
Definition BasicBlock.h:233
This class represents a function call, abstracting a target machine's calling convention.
@ ICMP_ULT
unsigned less than
Definition InstrTypes.h:701
@ ICMP_ULE
unsigned less or equal
Definition InstrTypes.h:702
@ FCMP_UNO
1 0 0 0 True if unordered: isnan(X) | isnan(Y)
Definition InstrTypes.h:686
Predicate getInversePredicate() const
For example, EQ -> NE, UGT -> ULE, SLT -> SGE, OEQ -> UNE, UGT -> OLE, OLT -> UGE,...
Definition InstrTypes.h:789
An abstraction over a floating-point predicate, and a pack of an integer predicate with samesign info...
static ConstantInt * getSigned(IntegerType *Ty, int64_t V, bool ImplicitTrunc=true)
Return a ConstantInt with the specified value for the specified type.
Definition Constants.h:138
static LLVM_ABI Constant * getAllOnesValue(Type *Ty)
static LLVM_ABI Constant * getNullValue(Type *Ty)
Constructor to create a '0' constant of arbitrary type.
A parsed version of the target data layout string in and methods for querying it.
Definition DataLayout.h:64
A debug info location.
Definition DebugLoc.h:123
static DebugLoc getCompilerGenerated()
Definition DebugLoc.h:162
static DebugLoc getUnknown()
Definition DebugLoc.h:161
ValueT lookup(const_arg_type_t< KeyT > Val) const
lookup - Return the entry for the specified key, or a default constructed value if no such entry exis...
Definition DenseMap.h:205
std::pair< iterator, bool > try_emplace(KeyT &&Key, Ts &&...Args)
Definition DenseMap.h:256
bool dominates(const DomTreeNodeBase< NodeT > *A, const DomTreeNodeBase< NodeT > *B) const
dominates - Returns true iff A dominates B.
constexpr bool isVector() const
One or more elements.
Definition TypeSize.h:324
static constexpr ElementCount getScalable(ScalarTy MinVal)
Definition TypeSize.h:312
Utility class for floating point operations which can have information about relaxed accuracy require...
Definition Operator.h:200
Represents flags for the getelementptr instruction/expression.
GEPNoWrapFlags withoutNoUnsignedWrap() const
static GEPNoWrapFlags none()
an instruction for type-safe pointer arithmetic to access elements of arrays and structs
A struct for saving information about induction variables.
InductionKind
This enum represents the kinds of inductions that we support.
@ IK_PtrInduction
Pointer induction var. Step = C.
@ IK_IntInduction
Integer induction variable. Step = C.
InstSimplifyFolder - Use InstructionSimplify to fold operations to existing values.
bool isCast() const
bool isBinaryOp() const
LLVM_ABI const DataLayout & getDataLayout() const
Get the data layout of the module this instruction belongs to.
static LLVM_ABI IntegerType * get(LLVMContext &C, unsigned NumBits)
This static method is the primary way of constructing an IntegerType.
Definition Type.cpp:318
The group of interleaved loads/stores sharing the same stride and close to each other.
InstTy * getMember(uint32_t Index) const
Get the member with the given index Index.
uint32_t getNumMembers() const
This is an important class for using LLVM in a threaded context.
Definition LLVMContext.h:68
An instruction for reading from memory.
static bool getDecisionAndClampRange(const std::function< bool(ElementCount)> &Predicate, VFRange &Range)
Test a Predicate on a Range of VF's.
Definition VPlan.cpp:1540
Represents a single loop in the control flow graph.
Definition LoopInfo.h:40
LLVM_ABI MDNode * createBranchWeights(uint32_t TrueWeight, uint32_t FalseWeight, bool IsExpected=false)
Return metadata containing two branch weights.
Definition MDBuilder.cpp:38
Metadata node.
Definition Metadata.h:1078
This class implements a map that also provides access to all stored values in a deterministic order.
Definition MapVector.h:36
ValueT lookup(const KeyT &Key) const
Definition MapVector.h:108
Representation for a specific memory location.
AAMDNodes AATags
The metadata nodes which describes the aliasing of the location (each member is null if that kind of ...
An interface layer with SCEV used to manage how we see SCEV expressions for values in the context of ...
ScalarEvolution * getSE() const
Returns the ScalarEvolution analysis used.
LLVM_ABI const SCEV * getSCEV(Value *V)
Returns the SCEV expression of V, in the context of the current SCEV predicate.
static LLVM_ABI unsigned getOpcode(RecurKind Kind)
Returns the opcode corresponding to the RecurrenceKind.
RegionT * getParent() const
Get the parent of the Region.
Definition RegionInfo.h:362
This class uses information about analyze scalars to rewrite expressions in canonical form.
LLVM_ABI Value * expandCodeFor(const SCEV *SH, Type *Ty, BasicBlock::iterator I)
Insert code to directly compute the specified SCEV expression into the program.
static const SCEV * rewrite(const SCEV *Scev, ScalarEvolution &SE, ValueToSCEVMapTy &Map)
This class represents an analyzed expression in the program.
LLVM_ABI Type * getType() const
Return the LLVM type of this SCEV expression.
The main scalar evolution driver.
const DataLayout & getDataLayout() const
Return the DataLayout associated with the module this SCEV instance is operating on.
LLVM_ABI const SCEV * getNegativeSCEV(const SCEV *V, SCEV::NoWrapFlags Flags=SCEV::FlagAnyWrap)
Return the SCEV object corresponding to -V.
LLVM_ABI const SCEV * getSCEV(Value *V)
Return a SCEV expression for the full generality of the specified expression.
LLVM_ABI const SCEV * getUDivExpr(const SCEV *LHS, const SCEV *RHS)
Get a canonical unsigned division expression, or something simpler if possible.
LLVM_ABI const SCEV * getElementCount(Type *Ty, ElementCount EC, SCEV::NoWrapFlags Flags=SCEV::FlagAnyWrap)
LLVM_ABI const SCEV * getMinusSCEV(const SCEV *LHS, const SCEV *RHS, SCEV::NoWrapFlags Flags=SCEV::FlagAnyWrap, unsigned Depth=0)
Return LHS-RHS.
LLVM_ABI const SCEV * getMulExpr(SmallVectorImpl< const SCEV * > &Ops, SCEV::NoWrapFlags Flags=SCEV::FlagAnyWrap, unsigned Depth=0)
Get a canonical multiply expression, or something simpler if possible.
LLVM_ABI bool isKnownPredicate(CmpPredicate Pred, const SCEV *LHS, const SCEV *RHS)
Test if the given expression is known to satisfy the condition described by Pred, LHS,...
static LLVM_ABI bool mayAliasInScopes(const MDNode *Scopes, const MDNode *NoAlias)
This class represents the LLVM 'select' instruction.
A vector that has set insertion semantics.
Definition SetVector.h:57
size_type size() const
Determine the number of elements in the SetVector.
Definition SetVector.h:103
bool insert(const value_type &X)
Insert a new element into the SetVector.
Definition SetVector.h:151
size_type size() const
Definition SmallPtrSet.h:99
A templated base class for SmallPtrSet which provides the typesafe interface that is common across al...
std::pair< iterator, bool > insert(PtrType Ptr)
Inserts Ptr if and only if there is no element in the container equal to Ptr.
iterator begin() const
bool contains(ConstPtrType Ptr) const
SmallPtrSet - This class implements a set which is optimized for holding SmallSize or less elements.
This class consists of common code factored out of the SmallVector class to reduce code duplication b...
void push_back(const T &Elt)
This is a 'vector' (really, a variable-sized array), optimized for the case when the array is small.
An instruction for storing to memory.
Provides information about what library functions are available for the current target.
static LLVM_ABI PartialReductionExtendKind getPartialReductionExtendKind(Instruction *I)
Get the kind of extension that an instruction represents.
TargetCostKind
The kind of cost model.
@ TCK_RecipThroughput
Reciprocal throughput.
Twine - A lightweight data structure for efficiently representing the concatenation of temporary valu...
Definition Twine.h:82
static constexpr TypeSize get(ScalarTy Quantity, bool Scalable)
Definition TypeSize.h:340
This class implements a switch-like dispatch statement for a value of 'T' using dyn_cast functionalit...
Definition TypeSwitch.h:88
TypeSwitch< T, ResultT > & Case(CallableT &&caseFn)
Add a case on the given type.
Definition TypeSwitch.h:97
The instances of the Type class are immutable: once they are created, they are never changed.
Definition Type.h:45
static LLVM_ABI IntegerType * getInt64Ty(LLVMContext &C)
Definition Type.cpp:297
static LLVM_ABI IntegerType * getInt32Ty(LLVMContext &C)
Definition Type.cpp:296
bool isPointerTy() const
True if this is an instance of PointerType.
Definition Type.h:267
static LLVM_ABI IntegerType * getInt8Ty(LLVMContext &C)
Definition Type.cpp:294
bool isStructTy() const
True if this is an instance of StructType.
Definition Type.h:261
LLVM_ABI unsigned getScalarSizeInBits() const LLVM_READONLY
If this is a vector type, return the getPrimitiveSizeInBits value for the element type.
Definition Type.cpp:230
static LLVM_ABI IntegerType * getInt1Ty(LLVMContext &C)
Definition Type.cpp:293
bool isFloatingPointTy() const
Return true if this is one of the floating-point types.
Definition Type.h:184
bool isIntegerTy() const
True if this is an instance of IntegerType.
Definition Type.h:240
op_range operands()
Definition User.h:293
A recipe for generating the active lane mask for the vector loop that is used to predicate the vector...
Definition VPlan.h:3621
VPBasicBlock serves as the leaf of the Hierarchical Control-Flow Graph.
Definition VPlan.h:3982
void appendRecipe(VPRecipeBase *Recipe)
Augment the existing recipes of a VPBasicBlock with an additional Recipe as the last recipe.
Definition VPlan.h:4057
RecipeListTy::iterator iterator
Instruction iterators...
Definition VPlan.h:4009
iterator end()
Definition VPlan.h:4019
iterator begin()
Recipe iterator methods.
Definition VPlan.h:4017
iterator_range< iterator > phis()
Returns an iterator range over the PHI-like recipes in the block.
Definition VPlan.h:4070
iterator getFirstNonPhi()
Return the position of the first non-phi node recipe in the block.
Definition VPlan.cpp:216
VPRegionBlock * getEnclosingLoopRegion()
Definition VPlan.cpp:578
VPBasicBlock * splitAt(iterator SplitAt)
Split current block at SplitAt by inserting a new block between the current block and its successors ...
Definition VPlan.cpp:550
VPRecipeBase * getTerminator()
If the block has multiple successors, return the branch recipe terminating the block.
Definition VPlan.cpp:623
const VPRecipeBase & back() const
Definition VPlan.h:4031
A recipe for vectorizing a phi-node as a sequence of mask-based select instructions.
Definition VPlan.h:2530
VPValue * getMask(unsigned Idx) const
Return mask number Idx.
Definition VPlan.h:2564
unsigned getNumIncomingValues() const
Return the number of incoming values, taking into account when normalized the first incoming value wi...
Definition VPlan.h:2554
void setMask(unsigned Idx, VPValue *V)
Set mask number Idx to V.
Definition VPlan.h:2570
bool isNormalized() const
A normalized blend is one that has an odd number of operands, whereby the first operand does not have...
Definition VPlan.h:2550
VPBlockBase is the building block of the Hierarchical Control-Flow Graph.
Definition VPlan.h:81
VPRegionBlock * getParent()
Definition VPlan.h:173
const VPBasicBlock * getExitingBasicBlock() const
Definition VPlan.cpp:186
size_t getNumSuccessors() const
Definition VPlan.h:219
void swapSuccessors()
Swap successors of the block. The block must have exactly 2 successors.
Definition VPlan.h:322
size_t getNumPredecessors() const
Definition VPlan.h:220
const VPBlocksTy & getPredecessors() const
Definition VPlan.h:204
VPlan * getPlan()
Definition VPlan.cpp:161
VPBlockBase * getSinglePredecessor() const
Definition VPlan.h:215
const VPBasicBlock * getEntryBasicBlock() const
Definition VPlan.cpp:166
VPBlockBase * getSingleHierarchicalPredecessor()
Definition VPlan.h:264
VPBlockBase * getSingleSuccessor() const
Definition VPlan.h:209
const VPBlocksTy & getSuccessors() const
Definition VPlan.h:198
static auto blocksOnly(const T &Range)
Return an iterator range over Range which only includes BlockTy blocks.
Definition VPlanUtils.h:222
static void insertOnEdge(VPBlockBase *From, VPBlockBase *To, VPBlockBase *BlockPtr)
Inserts BlockPtr on the edge between From and To.
Definition VPlanUtils.h:243
static void insertTwoBlocksAfter(VPBlockBase *IfTrue, VPBlockBase *IfFalse, VPBlockBase *BlockPtr)
Insert disconnected VPBlockBases IfTrue and IfFalse after BlockPtr.
Definition VPlanUtils.h:154
static void connectBlocks(VPBlockBase *From, VPBlockBase *To, unsigned PredIdx=-1u, unsigned SuccIdx=-1u)
Connect VPBlockBases From and To bi-directionally.
Definition VPlanUtils.h:173
static void disconnectBlocks(VPBlockBase *From, VPBlockBase *To)
Disconnect VPBlockBases From and To bi-directionally.
Definition VPlanUtils.h:192
A recipe for generating conditional branches on the bits of a mask.
Definition VPlan.h:3034
RAII object that stores the current insertion point and restores it when the object is destroyed.
VPlan-based builder utility analogous to IRBuilder.
VPValue * createScalarZExtOrTrunc(VPValue *Op, Type *ResultTy, Type *SrcTy, DebugLoc DL)
VPValue * createElementCount(Type *Ty, ElementCount EC)
VPInstruction * createScalarCast(Instruction::CastOps Opcode, VPValue *Op, Type *ResultTy, DebugLoc DL, const VPIRFlags &Flags={}, const VPIRMetadata &Metadata={})
VPDerivedIVRecipe * createDerivedIV(InductionDescriptor::InductionKind Kind, FPMathOperator *FPBinOp, VPValue *Start, VPValue *Current, VPValue *Step, const Twine &Name="")
Convert the input value Current to the corresponding value of an induction with Start and Step values...
static VPBuilder getToInsertAfter(VPRecipeBase *R)
Create a VPBuilder to insert after R.
VPInstruction * createOverflowingOp(unsigned Opcode, ArrayRef< VPValue * > Operands, VPRecipeWithIRFlags::WrapFlagsTy WrapFlags={false, false}, DebugLoc DL=DebugLoc::getUnknown(), const Twine &Name="")
VPPhi * createScalarPhi(ArrayRef< VPValue * > IncomingValues, DebugLoc DL, const Twine &Name="")
void setInsertPoint(VPBasicBlock *TheBB)
This specifies that created VPInstructions should be appended to the end of the specified block.
VPInstruction * createNaryOp(unsigned Opcode, ArrayRef< VPValue * > Operands, Instruction *Inst=nullptr, const VPIRFlags &Flags={}, const VPIRMetadata &MD={}, DebugLoc DL=DebugLoc::getUnknown(), const Twine &Name="")
Create an N-ary operation with Opcode, Operands and set Inst as its underlying Instruction.
Canonical scalar induction phi of the vector loop.
Definition VPlan.h:3565
unsigned getNumDefinedValues() const
Returns the number of values defined by the VPDef.
Definition VPlanValue.h:426
ArrayRef< VPValue * > definedValues()
Returns an ArrayRef of the values defined by the VPDef.
Definition VPlanValue.h:421
VPValue * getVPSingleValue()
Returns the only VPValue defined by the VPDef.
Definition VPlanValue.h:399
VPValue * getVPValue(unsigned I)
Returns the VPValue with index I defined by the VPDef.
Definition VPlanValue.h:411
A recipe for converting the input value IV value to the corresponding value of an IV with different s...
Definition VPlan.h:3732
Template specialization of the standard LLVM dominator tree utility for VPBlockBases.
bool properlyDominates(const VPRecipeBase *A, const VPRecipeBase *B)
A recipe for generating the phi node for the current index of elements, adjusted in accordance with E...
Definition VPlan.h:3653
A recipe to combine multiple recipes into a single 'expression' recipe, which should be considered a ...
Definition VPlan.h:3079
A pure virtual base class for all recipes modeling header phis, including phis for first order recurr...
Definition VPlan.h:2056
virtual VPValue * getBackedgeValue()
Returns the incoming value from the loop backedge.
Definition VPlan.h:2099
VPValue * getStartValue()
Returns the start value of the phi, if one is set.
Definition VPlan.h:2088
A special type of VPBasicBlock that wraps an existing IR basic block.
Definition VPlan.h:4135
BasicBlock * getIRBasicBlock() const
Definition VPlan.h:4159
Class to record and manage LLVM IR flags.
Definition VPlan.h:609
static LLVM_ABI_FOR_TEST VPIRInstruction * create(Instruction &I)
Create a new VPIRPhi for \I , if it is a PHINode, otherwise create a VPIRInstruction.
Helper to manage IR metadata for recipes.
Definition VPlan.h:982
void intersect(const VPIRMetadata &MD)
Intersect this VPIRMetadata object with MD, keeping only metadata nodes that are common to both.
This is a concrete Recipe that models a single VPlan-level instruction.
Definition VPlan.h:1036
@ ExtractLane
Extracts a single lane (first operand) from a set of vector operands.
Definition VPlan.h:1131
@ Unpack
Extracts all lanes from its (non-scalable) vector operand.
Definition VPlan.h:1073
@ BuildVector
Creates a fixed-width vector containing all operands.
Definition VPlan.h:1068
@ BuildStructVector
Given operands of (the same) struct type, creates a struct of fixed- width vectors each containing a ...
Definition VPlan.h:1065
@ CanonicalIVIncrementForPart
Definition VPlan.h:1056
const InterleaveGroup< Instruction > * getInterleaveGroup() const
Definition VPlan.h:2672
VPValue * getMask() const
Return the mask used by this recipe.
Definition VPlan.h:2664
ArrayRef< VPValue * > getStoredValues() const
Return the VPValues stored by this interleave group.
Definition VPlan.h:2693
A recipe for interleaved memory operations with vector-predication intrinsics.
Definition VPlan.h:2746
VPInterleaveRecipe is a recipe for transforming an interleave group of load or stores into one wide l...
Definition VPlan.h:2704
VPPredInstPHIRecipe is a recipe for generating the phi nodes needed when control converges back from ...
Definition VPlan.h:3221
VPRecipeBase is a base class modeling a sequence of one or more output IR instructions.
Definition VPlan.h:387
VPRegionBlock * getRegion()
Definition VPlan.h:4287
VPBasicBlock * getParent()
Definition VPlan.h:408
DebugLoc getDebugLoc() const
Returns the debug location of the recipe.
Definition VPlan.h:479
void moveBefore(VPBasicBlock &BB, iplist< VPRecipeBase >::iterator I)
Unlink this recipe and insert into BB before I.
void insertBefore(VPRecipeBase *InsertPos)
Insert an unlinked recipe into a basic block immediately before the specified recipe.
void insertAfter(VPRecipeBase *InsertPos)
Insert an unlinked Recipe into a basic block immediately after the specified Recipe.
iplist< VPRecipeBase >::iterator eraseFromParent()
This method unlinks 'this' from the containing basic block and deletes it.
Helper class to create VPRecipies from IR instructions.
VPRecipeBase * getRecipe(Instruction *I)
Return the recipe created for given ingredient.
A recipe to represent inloop reduction operations with vector-predication intrinsics,...
Definition VPlan.h:2908
A recipe to represent inloop, ordered or partial reduction operations.
Definition VPlan.h:2797
VPRegionBlock represents a collection of VPBasicBlocks and VPRegionBlocks which form a Single-Entry-S...
Definition VPlan.h:4170
const VPBlockBase * getEntry() const
Definition VPlan.h:4206
Type * getCanonicalIVType()
Return the type of the canonical IV for loop regions.
Definition VPlan.h:4281
bool isReplicator() const
An indicator whether this region is to generate multiple replicated instances of output IR correspond...
Definition VPlan.h:4238
void setExiting(VPBlockBase *ExitingBlock)
Set ExitingBlock as the exiting VPBlockBase of this VPRegionBlock.
Definition VPlan.h:4223
VPCanonicalIVPHIRecipe * getCanonicalIV()
Returns the canonical induction recipe of the region.
Definition VPlan.h:4268
const VPBlockBase * getExiting() const
Definition VPlan.h:4218
VPBasicBlock * getPreheaderVPBB()
Returns the pre-header VPBasicBlock of the loop region.
Definition VPlan.h:4231
VPReplicateRecipe replicates a given instruction producing multiple scalar copies of the original sca...
Definition VPlan.h:2953
bool isSingleScalar() const
Definition VPlan.h:2994
VPValue * getMask()
Return the mask of a predicated VPReplicateRecipe.
Definition VPlan.h:3018
A recipe for handling phi nodes of integer and floating-point inductions, producing their scalar valu...
Definition VPlan.h:3802
VPSingleDef is a base class for recipes for modeling a sequence of one or more output IR that define ...
Definition VPlan.h:531
Instruction * getUnderlyingInstr()
Returns the underlying instruction.
Definition VPlan.h:595
VPSingleDefRecipe * clone() override=0
Clone the current recipe.
An analysis for type-inference for VPValues.
LLVMContext & getContext()
Return the LLVMContext used by the analysis.
Type * inferScalarType(const VPValue *V)
Infer the type of V. Returns the scalar type of V.
This class augments VPValue with operands which provide the inverse def-use edges from VPValue's user...
Definition VPlanValue.h:202
operand_range operands()
Definition VPlanValue.h:270
void setOperand(unsigned I, VPValue *New)
Definition VPlanValue.h:246
VPValue * getOperand(unsigned N) const
Definition VPlanValue.h:241
void addOperand(VPValue *Operand)
Definition VPlanValue.h:235
This is the base class of the VPlan Def/Use graph, used for modeling the data flow into,...
Definition VPlanValue.h:46
bool isDefinedOutsideLoopRegions() const
Returns true if the VPValue is defined outside any loop.
Definition VPlan.cpp:1373
VPRecipeBase * getDefiningRecipe()
Returns the recipe defining this VPValue or nullptr if it is not defined by a recipe,...
Definition VPlan.cpp:131
Value * getLiveInIRValue() const
Returns the underlying IR value, if this VPValue is defined outside the scope of VPlan.
Definition VPlanValue.h:181
Value * getUnderlyingValue() const
Return the underlying Value attached to this VPValue.
Definition VPlanValue.h:83
void setUnderlyingValue(Value *Val)
Definition VPlanValue.h:191
void replaceAllUsesWith(VPValue *New)
Definition VPlan.cpp:1376
unsigned getNumUsers() const
Definition VPlanValue.h:111
bool isLiveIn() const
Returns true if this VPValue is a live-in, i.e. defined outside the VPlan.
Definition VPlanValue.h:176
void replaceUsesWithIf(VPValue *New, llvm::function_ref< bool(VPUser &U, unsigned Idx)> ShouldReplace)
Go through the uses list for this VPValue and make each use point to New if the callback ShouldReplac...
Definition VPlan.cpp:1380
user_range users()
Definition VPlanValue.h:132
A recipe to compute a pointer to the last element of each part of a widened memory access for widened...
Definition VPlan.h:1915
A Recipe for widening the canonical induction variable of the vector loop.
Definition VPlan.h:3695
VPWidenCastRecipe is a recipe to create vector cast instructions.
Definition VPlan.h:1556
Instruction::CastOps getOpcode() const
Definition VPlan.h:1592
A recipe for handling GEP instructions.
Definition VPlan.h:1852
Base class for widened induction (VPWidenIntOrFpInductionRecipe and VPWidenPointerInductionRecipe),...
Definition VPlan.h:2123
PHINode * getPHINode() const
Definition VPlan.h:2165
VPValue * getStepValue()
Returns the step value of the induction.
Definition VPlan.h:2151
const InductionDescriptor & getInductionDescriptor() const
Returns the induction descriptor for the recipe.
Definition VPlan.h:2168
A recipe for handling phi nodes of integer and floating-point inductions, producing their vector valu...
Definition VPlan.h:2199
VPValue * getLastUnrolledPartOperand()
Returns the VPValue representing the value of this induction at the last unrolled part,...
Definition VPlan.h:2275
A recipe for widening vector intrinsics.
Definition VPlan.h:1606
A common base class for widening memory operations.
Definition VPlan.h:3264
A recipe for widened phis.
Definition VPlan.h:2333
VPWidenRecipe is a recipe for producing a widened instruction using the opcode and operands of the re...
Definition VPlan.h:1516
VPlan models a candidate for vectorization, encoding various decisions taken to produce efficient outp...
Definition VPlan.h:4300
bool hasVF(ElementCount VF) const
Definition VPlan.h:4501
LLVMContext & getContext() const
Definition VPlan.h:4489
VPBasicBlock * getEntry()
Definition VPlan.h:4389
VPValue & getVectorTripCount()
The vector trip count.
Definition VPlan.h:4480
bool hasScalableVF() const
Definition VPlan.h:4502
VPValue & getVFxUF()
Returns VF * UF of the vector loop region.
Definition VPlan.h:4487
VPValue & getVF()
Returns the VF of the vector loop region.
Definition VPlan.h:4483
VPValue * getTripCount() const
The trip count of the original loop.
Definition VPlan.h:4451
VPValue * getTrue()
Return a VPValue wrapping i1 true.
Definition VPlan.h:4557
VPValue * getOrCreateBackedgeTakenCount()
The backedge taken count of the original loop.
Definition VPlan.h:4472
unsigned getUF() const
Definition VPlan.h:4521
VPRegionBlock * createReplicateRegion(VPBlockBase *Entry, VPBlockBase *Exiting, const std::string &Name="")
Create a new replicate region with Entry, Exiting and Name.
Definition VPlan.h:4627
auto getLiveIns() const
Return the list of live-in VPValues available in the VPlan.
Definition VPlan.h:4582
bool hasUF(unsigned UF) const
Definition VPlan.h:4519
ArrayRef< VPIRBasicBlock * > getExitBlocks() const
Return an ArrayRef containing VPIRBasicBlocks wrapping the exit blocks of the original scalar loop.
Definition VPlan.h:4441
VPValue * getConstantInt(Type *Ty, uint64_t Val, bool IsSigned=false)
Return a VPValue wrapping a ConstantInt with the given type and value.
Definition VPlan.h:4563
void setVF(ElementCount VF)
Definition VPlan.h:4495
bool isUnrolled() const
Returns true if the VPlan has already been unrolled.
Definition VPlan.h:4534
LLVM_ABI_FOR_TEST VPRegionBlock * getVectorLoopRegion()
Returns the VPRegionBlock of the vector loop.
Definition VPlan.cpp:1010
void resetTripCount(VPValue *NewTripCount)
Resets the trip count for the VPlan.
Definition VPlan.h:4465
VPBasicBlock * getMiddleBlock()
Returns the 'middle' block of the plan, that is the block that selects whether to execute the scalar ...
Definition VPlan.h:4414
VPBasicBlock * createVPBasicBlock(const Twine &Name, VPRecipeBase *Recipe=nullptr)
Create a new VPBasicBlock with Name and containing Recipe if present.
Definition VPlan.h:4605
VPValue * getFalse()
Return a VPValue wrapping i1 false.
Definition VPlan.h:4560
VPValue * getOrAddLiveIn(Value *V)
Gets the live-in VPValue for V or adds a new live-in (if none exists yet) for V.
Definition VPlan.h:4543
bool hasScalarVFOnly() const
Definition VPlan.h:4512
VPBasicBlock * getScalarPreheader() const
Return the VPBasicBlock for the preheader of the scalar loop.
Definition VPlan.h:4432
VPIRBasicBlock * getScalarHeader() const
Return the VPIRBasicBlock wrapping the header of the scalar loop.
Definition VPlan.h:4437
VPValue * getLiveIn(Value *V) const
Return the live-in VPValue for V, if there is one or nullptr otherwise.
Definition VPlan.h:4579
VPBasicBlock * getVectorPreheader()
Returns the preheader of the vector loop region, if one exists, or null otherwise.
Definition VPlan.h:4394
void setUF(unsigned UF)
Definition VPlan.h:4526
bool hasScalarTail() const
Returns true if the scalar tail may execute after the vector loop.
Definition VPlan.h:4659
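A sketch of typical plan-level queries a transform performs before rewriting anything; the VF value is illustrative and nothing here modifies the plan:
if (Plan.hasVF(ElementCount::getFixed(4)) && !Plan.hasScalarVFOnly()) {
  VPRegionBlock *LoopRegion = Plan.getVectorLoopRegion();
  VPBasicBlock *VectorPH = Plan.getVectorPreheader();
  VPValue *TC = Plan.getTripCount();
  VPValue *One = Plan.getConstantInt(Type::getInt64Ty(Plan.getContext()), 1);
  (void)LoopRegion; (void)VectorPH; (void)TC; (void)One;
}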
LLVM Value Representation.
Definition Value.h:75
Type * getType() const
All values are typed, get the type of this value.
Definition Value.h:256
iterator_range< user_iterator > users()
Definition Value.h:426
bool hasName() const
Definition Value.h:262
LLVM_ABI StringRef getName() const
Return a constant reference to the value's name.
Definition Value.cpp:322
constexpr ScalarTy getFixedValue() const
Definition TypeSize.h:200
static constexpr bool isKnownLT(const FixedOrScalableQuantity &LHS, const FixedOrScalableQuantity &RHS)
Definition TypeSize.h:216
constexpr bool isScalable() const
Returns whether the quantity is scaled by a runtime quantity (vscale).
Definition TypeSize.h:168
constexpr LeafTy multiplyCoefficientBy(ScalarTy RHS) const
Definition TypeSize.h:256
constexpr bool isFixed() const
Returns true if the quantity is not scaled by vscale.
Definition TypeSize.h:171
constexpr ScalarTy getKnownMinValue() const
Returns the minimum value this quantity can represent.
Definition TypeSize.h:165
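A self-contained sketch of the fixed/scalable quantity API; the concrete values are illustrative:
ElementCount VF = ElementCount::getScalable(4);  // "vscale x 4" lanes
if (VF.isScalable())
  VF = VF.multiplyCoefficientBy(2);              // now "vscale x 8"
unsigned MinLanes = VF.getKnownMinValue();       // 8, the known minimum
bool Narrower = ElementCount::isKnownLT(ElementCount::getFixed(4), VF);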
An efficient, type-erasing, non-owning reference to a callable.
const ParentTy * getParent() const
Definition ilist_node.h:34
self_iterator getIterator()
Definition ilist_node.h:123
IteratorT end() const
Changed
#define llvm_unreachable(msg)
Marks that the current location is not supposed to be reachable.
LLVM_ABI APInt RoundingUDiv(const APInt &A, const APInt &B, APInt::Rounding RM)
Return A divided by B using unsigned division, rounded by the given rounding mode.
Definition APInt.cpp:2763
unsigned ID
LLVM IR allows arbitrary numbers to be used as calling convention identifiers.
Definition CallingConv.h:24
@ C
The default llvm calling convention, compatible with C.
Definition CallingConv.h:34
SpecificConstantMatch m_ZeroInt()
Convenience matchers for specific integer values.
BinaryOp_match< SrcTy, SpecificConstantMatch, TargetOpcode::G_XOR, true > m_Not(const SrcTy &&Src)
Matches a register not-ed by a G_XOR.
cst_pred_ty< is_all_ones > m_AllOnes()
Match an integer or vector with all bits set.
m_Intrinsic_Ty< Opnd0, Opnd1, Opnd2 >::Ty m_MaskedStore(const Opnd0 &Op0, const Opnd1 &Op1, const Opnd2 &Op2)
Matches MaskedStore Intrinsic.
ap_match< APInt > m_APInt(const APInt *&Res)
Match a ConstantInt or splatted ConstantVector, binding the specified pointer to the contained APInt.
CastInst_match< OpTy, TruncInst > m_Trunc(const OpTy &Op)
Matches Trunc.
LogicalOp_match< LHS, RHS, Instruction::And > m_LogicalAnd(const LHS &L, const RHS &R)
Matches L && R either in the form of L & R or L ?
match_combine_or< CastInst_match< OpTy, ZExtInst >, OpTy > m_ZExtOrSelf(const OpTy &Op)
bool match(Val *V, const Pattern &P)
specificval_ty m_Specific(const Value *V)
Match if we have a specific specified value.
m_Intrinsic_Ty< Opnd0, Opnd1, Opnd2 >::Ty m_MaskedLoad(const Opnd0 &Op0, const Opnd1 &Op1, const Opnd2 &Op2)
Matches MaskedLoad Intrinsic.
class_match< ConstantInt > m_ConstantInt()
Match an arbitrary ConstantInt and ignore it.
cst_pred_ty< is_one > m_One()
Match an integer 1 or a vector with all elements equal to 1.
IntrinsicID_match m_Intrinsic()
Match intrinsic calls like this: m_Intrinsic<Intrinsic::fabs>(m_Value(X))
ThreeOps_match< Cond, LHS, RHS, Instruction::Select > m_Select(const Cond &C, const LHS &L, const RHS &R)
Matches SelectInst.
SpecificCmpClass_match< LHS, RHS, CmpInst > m_SpecificCmp(CmpPredicate MatchPred, const LHS &L, const RHS &R)
BinaryOp_match< LHS, RHS, Instruction::Mul > m_Mul(const LHS &L, const RHS &R)
deferredval_ty< Value > m_Deferred(Value *const &V)
Like m_Specific(), but works if the specific value to match is determined as part of the same match()...
SpecificCmpClass_match< LHS, RHS, ICmpInst > m_SpecificICmp(CmpPredicate MatchPred, const LHS &L, const RHS &R)
class_match< CmpInst > m_Cmp()
Matches any compare instruction and ignore it.
BinaryOp_match< LHS, RHS, Instruction::Add, true > m_c_Add(const LHS &L, const RHS &R)
Matches a Add with LHS and RHS in either order.
CmpClass_match< LHS, RHS, ICmpInst > m_ICmp(CmpPredicate &Pred, const LHS &L, const RHS &R)
match_combine_or< CastInst_match< OpTy, ZExtInst >, CastInst_match< OpTy, SExtInst > > m_ZExtOrSExt(const OpTy &Op)
auto m_LogicalAnd()
Matches L && R where L and R are arbitrary values.
CastInst_match< OpTy, SExtInst > m_SExt(const OpTy &Op)
Matches SExt.
BinaryOp_match< LHS, RHS, Instruction::Mul, true > m_c_Mul(const LHS &L, const RHS &R)
Matches a Mul with LHS and RHS in either order.
MatchFunctor< Val, Pattern > match_fn(const Pattern &P)
A match functor that can be used as a UnaryPredicate in functional algorithms like all_of.
BinaryOp_match< LHS, RHS, Instruction::Sub > m_Sub(const LHS &L, const RHS &R)
match_combine_or< LTy, RTy > m_CombineOr(const LTy &L, const RTy &R)
Combine two pattern matchers matching L || R.
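A small, self-contained sketch of the llvm::PatternMatch helpers listed above (not code from this file): detect a multiply by a power-of-two constant.
#include "llvm/ADT/APInt.h"
#include "llvm/IR/PatternMatch.h"
#include "llvm/IR/Value.h"
static bool isMulByPowerOf2(llvm::Value *V) {
  using namespace llvm::PatternMatch;
  llvm::Value *X;
  const llvm::APInt *C;
  // m_c_Mul matches the constant on either side; m_APInt binds it to C.
  return match(V, m_c_Mul(m_Value(X), m_APInt(C))) && C->isPowerOf2();
}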
bind_cst_ty m_scev_APInt(const APInt *&C)
Match an SCEV constant and bind it to an APInt.
bool match(const SCEV *S, const Pattern &P)
VPInstruction_match< VPInstruction::ExtractLastLane, VPInstruction_match< VPInstruction::ExtractLastPart, Op0_t > > m_ExtractLastLaneOfLastPart(const Op0_t &Op0)
AllRecipe_commutative_match< Instruction::And, Op0_t, Op1_t > m_c_BinaryAnd(const Op0_t &Op0, const Op1_t &Op1)
Match a binary AND operation.
AllRecipe_match< Instruction::Or, Op0_t, Op1_t > m_BinaryOr(const Op0_t &Op0, const Op1_t &Op1)
Match a binary OR operation.
VPInstruction_match< VPInstruction::AnyOf > m_AnyOf()
AllRecipe_commutative_match< Opcode, Op0_t, Op1_t > m_c_Binary(const Op0_t &Op0, const Op1_t &Op1)
AllRecipe_commutative_match< Instruction::Or, Op0_t, Op1_t > m_c_BinaryOr(const Op0_t &Op0, const Op1_t &Op1)
GEPLikeRecipe_match< Op0_t, Op1_t > m_GetElementPtr(const Op0_t &Op0, const Op1_t &Op1)
AllRecipe_match< Opcode, Op0_t, Op1_t > m_Binary(const Op0_t &Op0, const Op1_t &Op1)
VPInstruction_match< VPInstruction::LastActiveLane, Op0_t > m_LastActiveLane(const Op0_t &Op0)
VPInstruction_match< Instruction::ExtractElement, Op0_t, Op1_t > m_ExtractElement(const Op0_t &Op0, const Op1_t &Op1)
specific_intval< 1 > m_False()
VPDerivedIV_match< Op0_t, Op1_t, Op2_t > m_DerivedIV(const Op0_t &Op0, const Op1_t &Op1, const Op2_t &Op2)
VPInstruction_match< VPInstruction::ExtractLastLane, Op0_t > m_ExtractLastLane(const Op0_t &Op0)
VPInstruction_match< VPInstruction::ActiveLaneMask, Op0_t, Op1_t, Op2_t > m_ActiveLaneMask(const Op0_t &Op0, const Op1_t &Op1, const Op2_t &Op2)
VPInstruction_match< VPInstruction::BranchOnCount > m_BranchOnCount()
specific_intval< 1 > m_True()
VectorEndPointerRecipe_match< Op0_t, Op1_t > m_VecEndPtr(const Op0_t &Op0, const Op1_t &Op1)
VPInstruction_match< VPInstruction::ExtractLastPart, Op0_t > m_ExtractLastPart(const Op0_t &Op0)
VPInstruction_match< VPInstruction::Broadcast, Op0_t > m_Broadcast(const Op0_t &Op0)
class_match< VPValue > m_VPValue()
Match an arbitrary VPValue and ignore it.
VPInstruction_match< VPInstruction::ExplicitVectorLength, Op0_t > m_EVL(const Op0_t &Op0)
VPInstruction_match< VPInstruction::BuildVector > m_BuildVector()
BuildVector matches only its opcode, without matching its operands, as the number of operands is not fi...
VPInstruction_match< VPInstruction::ExtractPenultimateElement, Op0_t > m_ExtractPenultimateElement(const Op0_t &Op0)
VPInstruction_match< VPInstruction::FirstActiveLane, Op0_t > m_FirstActiveLane(const Op0_t &Op0)
bind_ty< VPInstruction > m_VPInstruction(VPInstruction *&V)
Match a VPInstruction, capturing if we match.
VPInstruction_match< VPInstruction::BranchOnCond > m_BranchOnCond()
VPInstruction_match< VPInstruction::ExtractLane, Op0_t, Op1_t > m_ExtractLane(const Op0_t &Op0, const Op1_t &Op1)
VPInstruction_match< VPInstruction::Reverse, Op0_t > m_Reverse(const Op0_t &Op0)
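A hedged sketch of the VPlan-level matchers, in the style this file uses them; V is assumed to be a VPValue defined by a recipe, and the capturing m_VPValue(VPValue *&) overload is assumed alongside the no-argument form listed above:
using namespace llvm::VPlanPatternMatch;
VPValue *X;
// Fold "or X, false" by forwarding X to all users of V.
if (match(V, m_c_BinaryOr(m_VPValue(X), m_False())))
  V->replaceAllUsesWith(X);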
NodeAddr< DefNode * > Def
Definition RDFGraph.h:384
bool isSingleScalar(const VPValue *VPV)
Returns true if VPV is a single scalar, either because it produces the same value for all lanes or on...
bool isUniformAcrossVFsAndUFs(VPValue *V)
Checks if V is uniform across all VF lanes and UF parts.
VPValue * getOrCreateVPValueForSCEVExpr(VPlan &Plan, const SCEV *Expr)
Get or create a VPValue that corresponds to the expansion of Expr.
std::optional< MemoryLocation > getMemoryLocation(const VPRecipeBase &R)
Return a MemoryLocation for R with noalias metadata populated from R, if the recipe is supported and ...
bool onlyFirstLaneUsed(const VPValue *Def)
Returns true if only the first lane of Def is used.
VPIRFlags getFlagsFromIndDesc(const InductionDescriptor &ID)
Extracts and returns NoWrap and FastMath flags from the induction binop in ID.
Definition VPlanUtils.h:93
bool onlyScalarValuesUsed(const VPValue *Def)
Returns true if only scalar values of Def are used by all users.
const SCEV * getSCEVExprForVPValue(const VPValue *V, PredicatedScalarEvolution &PSE, const Loop *L=nullptr)
Return the SCEV expression for V.
bool isHeaderMask(const VPValue *V, const VPlan &Plan)
Return true if V is a header mask in Plan.
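A simplified sketch (not this file's exact logic) of how these vputils queries combine when deciding whether a definition needs an explicit broadcast; assumes "VPlanUtils.h" and "VPlanValue.h" are included:
static bool mayNeedBroadcast(const llvm::VPValue *Def) {
  using namespace llvm;
  // If the value is a single scalar, or every user only consumes lane 0,
  // it can stay scalar and no broadcast recipe is required.
  return !vputils::isSingleScalar(Def) && !vputils::onlyFirstLaneUsed(Def);
}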
This is an optimization pass for GlobalISel generic memory operations.
auto drop_begin(T &&RangeOrContainer, size_t N=1)
Return a range covering RangeOrContainer with the first N elements excluded.
Definition STLExtras.h:316
@ Offset
Definition DWP.cpp:532
auto min_element(R &&Range)
Provide wrappers to std::min_element which take ranges instead of having to pass begin/end explicitly...
Definition STLExtras.h:2068
bool all_of(R &&range, UnaryPredicate P)
Provide wrappers to std::all_of which take ranges instead of having to pass begin/end explicitly.
Definition STLExtras.h:1737
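A one-line sketch of the range-based algorithm wrappers, assuming R is a recipe (a VPUser) in the plan:
bool AllOpsLiveIn =
    all_of(R.operands(), [](VPValue *Op) { return Op->isLiveIn(); });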
LLVM_ABI Intrinsic::ID getVectorIntrinsicIDForCall(const CallInst *CI, const TargetLibraryInfo *TLI)
Returns intrinsic ID for call.
DenseMap< const Value *, const SCEV * > ValueToSCEVMapTy
auto enumerate(FirstRange &&First, RestRanges &&...Rest)
Given two or more input ranges, returns a new range whose values are tuples (A, B,...
Definition STLExtras.h:2530
decltype(auto) dyn_cast(const From &Val)
dyn_cast<X> - Return the argument parameter cast to the specified type.
Definition Casting.h:643
const Value * getLoadStorePointerOperand(const Value *V)
A helper function that returns the pointer operand of a load or store instruction.
constexpr from_range_t from_range
auto dyn_cast_if_present(const Y &Val)
dyn_cast_if_present<X> - Functionally identical to dyn_cast, except that a null (or none in the case ...
Definition Casting.h:732
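A self-contained sketch contrasting dyn_cast with dyn_cast_if_present; the helper name is illustrative:
#include "llvm/IR/Instructions.h"
static llvm::Value *getStoredValueOrNull(llvm::Instruction *I) {
  // dyn_cast asserts that its argument is present; dyn_cast_if_present also
  // accepts null (or an empty optional) and simply yields null in that case.
  if (auto *SI = llvm::dyn_cast_if_present<llvm::StoreInst>(I))
    return SI->getValueOperand();
  return nullptr;
}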
iterator_range< T > make_range(T x, T y)
Convenience function for iterating over sub-ranges.
void append_range(Container &C, Range &&R)
Wrapper function to append range R to container C.
Definition STLExtras.h:2184
iterator_range< early_inc_iterator_impl< detail::IterOfRange< RangeT > > > make_early_inc_range(RangeT &&Range)
Make a range that does early increment to allow mutation of the underlying range without disrupting i...
Definition STLExtras.h:632
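A hedged sketch, in the spirit of removeDeadRecipes, of why the early-increment range matters in VPlan transforms; VPBB is an assumed VPBasicBlock and the dead-ness check is deliberately simplified:
for (VPRecipeBase &R : make_early_inc_range(*VPBB)) {
  // The iterator is advanced before the body runs, so erasing R is safe.
  VPValue *Def = R.getVPSingleValue(); // assumes a single-def recipe
  if (Def->getNumUsers() == 0)         // real code also checks side effects
    R.eraseFromParent();
}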
auto cast_or_null(const Y &Val)
Definition Casting.h:714
iterator_range< df_iterator< VPBlockShallowTraversalWrapper< VPBlockBase * > > > vp_depth_first_shallow(VPBlockBase *G)
Returns an iterator range to traverse the graph starting at G in depth-first order.
Definition VPlanCFG.h:216
iterator_range< df_iterator< VPBlockDeepTraversalWrapper< VPBlockBase * > > > vp_depth_first_deep(VPBlockBase *G)
Returns an iterator range to traverse the graph starting at G in depth-first order while traversing t...
Definition VPlanCFG.h:243
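A sketch contrasting the two traversals, assuming Plan is a VPlan:
// Shallow: regions are visited as single nodes and not descended into.
for (VPBlockBase *VPB : vp_depth_first_shallow(Plan.getEntry()))
  if (auto *VPBB = dyn_cast<VPBasicBlock>(VPB))
    (void)VPBB; // visit top-level basic blocks only
// Deep: also walks the blocks nested inside loop and replicate regions.
for (VPBlockBase *VPB : vp_depth_first_deep(Plan.getEntry()))
  (void)VPB;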
detail::concat_range< ValueT, RangeTs... > concat(RangeTs &&...Ranges)
Returns a concatenated range across two or more ranges.
Definition STLExtras.h:1150
uint64_t PowerOf2Ceil(uint64_t A)
Returns the power of two which is greater than or equal to the given value.
Definition MathExtras.h:385
auto dyn_cast_or_null(const Y &Val)
Definition Casting.h:753
bool any_of(R &&range, UnaryPredicate P)
Provide wrappers to std::any_of which take ranges instead of having to pass begin/end explicitly.
Definition STLExtras.h:1744
auto reverse(ContainerTy &&C)
Definition STLExtras.h:406
iterator_range< po_iterator< VPBlockDeepTraversalWrapper< VPBlockBase * > > > vp_post_order_deep(VPBlockBase *G)
Returns an iterator range to traverse the graph starting at G in post order while traversing through ...
Definition VPlanCFG.h:236
void sort(IteratorTy Start, IteratorTy End)
Definition STLExtras.h:1634
LLVM_ABI_FOR_TEST cl::opt< bool > EnableWideActiveLaneMask
bool none_of(R &&Range, UnaryPredicate P)
Provide wrappers to std::none_of which take ranges instead of having to pass begin/end explicitly.
Definition STLExtras.h:1751
SmallVector< ValueTypeFromRangeType< R >, Size > to_vector(R &&Range)
Given a range of type R, iterate the entire range and return a SmallVector with elements of the vecto...
iterator_range< filter_iterator< detail::IterOfRange< RangeT >, PredicateT > > make_filter_range(RangeT &&Range, PredicateT Pred)
Convenience function that takes a range of elements and a predicate, and return a new filter_iterator...
Definition STLExtras.h:550
bool canConstantBeExtended(const APInt *C, Type *NarrowType, TTI::PartialReductionExtendKind ExtKind)
Check if a constant CI can be safely treated as having been extended from a narrower type with the gi...
Definition VPlan.cpp:1717
bool isa(const From &Val)
isa<X> - Return true if the parameter to the template is an instance of one of the template type argu...
Definition Casting.h:547
auto drop_end(T &&RangeOrContainer, size_t N=1)
Return a range covering RangeOrContainer with the last N elements excluded.
Definition STLExtras.h:323
RecurKind
These are the kinds of recurrences that we support.
@ Mul
Product of integers.
@ Sub
Subtraction of integers.
@ Add
Sum of integers.
@ AddChainWithSubs
A chain of adds and subs.
FunctionAddr VTableAddr Next
Definition InstrProf.h:141
auto count(R &&Range, const E &Element)
Wrapper function around std::count to count the number of times an element Element occurs in the give...
Definition STLExtras.h:2002
DWARFExpression::Operation Op
auto max_element(R &&Range)
Provide wrappers to std::max_element which take ranges instead of having to pass begin/end explicitly...
Definition STLExtras.h:2078
auto count_if(R &&Range, UnaryPredicate P)
Wrapper function around std::count_if to count the number of times an element satisfying a given pred...
Definition STLExtras.h:2009
decltype(auto) cast(const From &Val)
cast<X> - Return the argument parameter cast to the specified type.
Definition Casting.h:559
LLVM_ABI BasicBlock * SplitBlock(BasicBlock *Old, BasicBlock::iterator SplitPt, DominatorTree *DT, LoopInfo *LI=nullptr, MemorySSAUpdater *MSSAU=nullptr, const Twine &BBName="", bool Before=false)
Split the specified block at the specified instruction.
auto find_if(R &&Range, UnaryPredicate P)
Provide wrappers to std::find_if which take ranges instead of having to pass begin/end explicitly.
Definition STLExtras.h:1770
Type * getLoadStoreType(const Value *I)
A helper function that returns the type of a load or store instruction.
bool all_equal(std::initializer_list< T > Values)
Returns true if all Values in the initializer lists are equal or the list is empty.
Definition STLExtras.h:2156
@ DataAndControlFlowWithoutRuntimeCheck
Use predicate to control both data and control flow, but modify the trip count so that a runtime over...
hash_code hash_combine(const Ts &...args)
Combine values into a single hash_code.
Definition Hashing.h:592
bool equal(L &&LRange, R &&RRange)
Wrapper function around std::equal to detect if pair-wise elements between two ranges are the same.
Definition STLExtras.h:2136
Type * toVectorTy(Type *Scalar, ElementCount EC)
A helper function for converting Scalar types to vector types.
@ Default
The result values are uniform if and only if all operands are uniform.
Definition Uniformity.h:20
constexpr detail::IsaCheckPredicate< Types... > IsaPred
Function object wrapper for the llvm::isa type check.
Definition Casting.h:866
hash_code hash_combine_range(InputIteratorT first, InputIteratorT last)
Compute a hash_code for a sequence of values.
Definition Hashing.h:466
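A sketch of the kind of structural hash a CSE-style pass might compute for a recipe R; the opcode key is an illustrative stand-in:
unsigned OpcodeKey = 0;                  // stand-in for the recipe's opcode
auto Operands = to_vector(R.operands()); // hash the operand pointers
hash_code H = hash_combine(
    OpcodeKey, hash_combine_range(Operands.begin(), Operands.end()));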
#define N
RemoveMask_match(const Op0_t &In, Op1_t &Out)
bool match(OpTy *V) const
A collection of metadata nodes that might be associated with a memory access used by the alias-analys...
Definition Metadata.h:761
MDNode * Scope
The tag for alias scope specification (used with noalias).
Definition Metadata.h:784
MDNode * NoAlias
The tag specifying the noalias scope.
Definition Metadata.h:787
This struct is a compact representation of a valid (non-zero power of two) alignment.
Definition Alignment.h:39
An information struct used to provide DenseMap with the various necessary components for a given valu...
Incoming for lane mask phi as machine instruction, incoming register Reg and incoming block Block are...
A range of powers-of-2 vectorization factors with fixed start and adjustable end.
Struct to hold various analysis needed for cost computations.
A recipe for handling first-order recurrence phis.
Definition VPlan.h:2375
A recipe for widening load operations with vector-predication intrinsics, using the address to load f...
Definition VPlan.h:3396
A recipe for widening load operations, using the address to load from and an optional mask.
Definition VPlan.h:3355
A recipe for widening select instructions.
Definition VPlan.h:1805
A recipe for widening store operations with vector-predication intrinsics, using the value to store,...
Definition VPlan.h:3480
A recipe for widening store operations, using the stored value, the address to store to and an option...
Definition VPlan.h:3437
static void materializeBroadcasts(VPlan &Plan)
Add explicit broadcasts for live-ins and VPValues defined in Plan's entry block if they are used as v...
static void materializePacksAndUnpacks(VPlan &Plan)
Add explicit Build[Struct]Vector recipes to Pack multiple scalar values into vectors and Unpack recip...
static void optimizeInductionExitUsers(VPlan &Plan, DenseMap< VPValue *, VPValue * > &EndValues, PredicatedScalarEvolution &PSE)
If there's a single exit block, optimize its phi recipes that use exiting IV values by feeding them p...
static void materializeBackedgeTakenCount(VPlan &Plan, VPBasicBlock *VectorPH)
Materialize the backedge-taken count to be computed explicitly using VPInstructions.
static void hoistInvariantLoads(VPlan &Plan)
Hoist single-scalar loads with invariant addresses out of the vector loop to the preheader,...
static void canonicalizeEVLLoops(VPlan &Plan)
Transform EVL loops to use variable-length stepping after region dissolution.
static void dropPoisonGeneratingRecipes(VPlan &Plan, const std::function< bool(BasicBlock *)> &BlockNeedsPredication)
Drop poison flags from recipes that may generate a poison value that is used after vectorization,...
static void createAndOptimizeReplicateRegions(VPlan &Plan)
Wrap predicated VPReplicateRecipes with a mask operand in an if-then region block and remove the mask...
static void createInterleaveGroups(VPlan &Plan, const SmallPtrSetImpl< const InterleaveGroup< Instruction > * > &InterleaveGroups, VPRecipeBuilder &RecipeBuilder, const bool &ScalarEpilogueAllowed)
static bool runPass(bool(*Transform)(VPlan &, ArgsTy...), VPlan &Plan, typename std::remove_reference< ArgsTy >::type &...Args)
Helper to run a VPlan transform Transform on VPlan, forwarding extra arguments to the transform.
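An illustrative sketch of the helper as it would be called from inside VPlanTransforms, using the bool-returning overload listed above; Builder is assumed to be a VPBuilder set up by the caller:
bool Changed = runPass(adjustFixedOrderRecurrences, Plan, Builder);
(void)Changed;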
static void addBranchWeightToMiddleTerminator(VPlan &Plan, ElementCount VF, std::optional< unsigned > VScaleForTuning)
Add branch weight metadata, if the Plan's middle block is terminated by a BranchOnCond recipe.
static void narrowInterleaveGroups(VPlan &Plan, ElementCount VF, TypeSize VectorRegWidth)
Try to convert a plan with interleave groups with VF elements to a plan with the interleave groups re...
static DenseMap< const SCEV *, Value * > expandSCEVs(VPlan &Plan, ScalarEvolution &SE)
Expand VPExpandSCEVRecipes in Plan's entry block.
static void convertToConcreteRecipes(VPlan &Plan)
Lower abstract recipes to concrete ones, that can be codegen'd.
static void hoistPredicatedLoads(VPlan &Plan, PredicatedScalarEvolution &PSE, const Loop *L)
Hoist predicated loads from the same address to the loop entry block, if they are guaranteed to execu...
static void convertToAbstractRecipes(VPlan &Plan, VPCostContext &Ctx, VFRange &Range)
This function converts initial recipes to the abstract recipes and clamps Range based on cost model f...
static void materializeConstantVectorTripCount(VPlan &Plan, ElementCount BestVF, unsigned BestUF, PredicatedScalarEvolution &PSE)
static LLVM_ABI_FOR_TEST bool tryToConvertVPInstructionsToVPRecipes(VPlan &Plan, function_ref< const InductionDescriptor *(PHINode *)> GetIntOrFpInductionDescriptor, const TargetLibraryInfo &TLI)
Replaces the VPInstructions in Plan with corresponding widen recipes.
static void addExitUsersForFirstOrderRecurrences(VPlan &Plan, VFRange &Range)
Handle users in the original exit block for first-order recurrences.
static void addExplicitVectorLength(VPlan &Plan, const std::optional< unsigned > &MaxEVLSafeElements)
Add a VPEVLBasedIVPHIRecipe and related recipes to Plan and replaces all uses except the canonical IV...
static void replaceSymbolicStrides(VPlan &Plan, PredicatedScalarEvolution &PSE, const DenseMap< Value *, const SCEV * > &StridesMap)
Replace symbolic strides from StridesMap in Plan with constants when possible.
static void removeBranchOnConst(VPlan &Plan)
Remove BranchOnCond recipes with true or false conditions together with removing dead edges to their ...
static void removeDeadRecipes(VPlan &Plan)
Remove dead recipes from Plan.
static void materializeVectorTripCount(VPlan &Plan, VPBasicBlock *VectorPHVPBB, bool TailByMasking, bool RequiresScalarEpilogue)
Materialize vector trip count computations to a set of VPInstructions.
static void simplifyRecipes(VPlan &Plan)
Perform instcombine-like simplifications on recipes in Plan.
static void sinkPredicatedStores(VPlan &Plan, PredicatedScalarEvolution &PSE, const Loop *L)
Sink predicated stores to the same address with complementary predicates (P and NOT P) to an uncondit...
static void handleUncountableEarlyExit(VPBasicBlock *EarlyExitingVPBB, VPBasicBlock *EarlyExitVPBB, VPlan &Plan, VPBasicBlock *HeaderVPBB, VPBasicBlock *LatchVPBB)
Update Plan to account for the uncountable early exit from EarlyExitingVPBB to EarlyExitVPBB.
static void clearReductionWrapFlags(VPlan &Plan)
Clear NSW/NUW flags from reduction instructions if necessary.
static void cse(VPlan &Plan)
Perform common-subexpression-elimination on Plan.
static void addActiveLaneMask(VPlan &Plan, bool UseActiveLaneMaskForControlFlow, bool DataAndControlFlowWithoutRuntimeCheck)
Replace (ICMP_ULE, wide canonical IV, backedge-taken-count) checks with an (active-lane-mask recipe,...
static LLVM_ABI_FOR_TEST void optimize(VPlan &Plan)
Apply VPlan-to-VPlan optimizations to Plan, including induction recipe optimizations,...
static void dissolveLoopRegions(VPlan &Plan)
Replace loop regions with explicit CFG.
static void truncateToMinimalBitwidths(VPlan &Plan, const MapVector< Instruction *, uint64_t > &MinBWs)
Insert truncates and extends for any truncated recipe.
static bool adjustFixedOrderRecurrences(VPlan &Plan, VPBuilder &Builder)
Try to have all users of fixed-order recurrences appear after the recipe defining their previous valu...
static void optimizeForVFAndUF(VPlan &Plan, ElementCount BestVF, unsigned BestUF, PredicatedScalarEvolution &PSE)
Optimize Plan based on BestVF and BestUF.
static void materializeVFAndVFxUF(VPlan &Plan, VPBasicBlock *VectorPH, ElementCount VF)
Materialize VF and VFxUF to be computed explicitly using VPInstructions.
static void updateScalarResumePhis(VPlan &Plan, DenseMap< VPValue *, VPValue * > &IVEndValues)
Update the resume phis in the scalar preheader after creating wide recipes for first-order recurrence...