VPlanTransforms.cpp
1//===-- VPlanTransforms.cpp - Utility VPlan to VPlan transforms -----------===//
2//
3// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4// See https://llvm.org/LICENSE.txt for license information.
5// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6//
7//===----------------------------------------------------------------------===//
8///
9/// \file
10/// This file implements a set of utility VPlan to VPlan transformations.
11///
12//===----------------------------------------------------------------------===//
13
14#include "VPlanTransforms.h"
15#include "VPRecipeBuilder.h"
16#include "VPlan.h"
17#include "VPlanAnalysis.h"
18#include "VPlanCFG.h"
19#include "VPlanDominatorTree.h"
20#include "VPlanHelpers.h"
21#include "VPlanPatternMatch.h"
22#include "VPlanUtils.h"
23#include "VPlanVerifier.h"
24#include "llvm/ADT/APInt.h"
26#include "llvm/ADT/STLExtras.h"
27#include "llvm/ADT/SetVector.h"
28#include "llvm/ADT/TypeSwitch.h"
34#include "llvm/IR/Intrinsics.h"
35#include "llvm/IR/MDBuilder.h"
39
40using namespace llvm;
41using namespace VPlanPatternMatch;
42
44 "enable-wide-lane-mask", cl::init(false), cl::Hidden,
45 cl::desc("Enable use of wide get active lane mask instructions"));
46
47bool VPlanTransforms::tryToConvertVPInstructionsToVPRecipes(
48 VPlan &Plan,
49 function_ref<const InductionDescriptor *(PHINode *)>
50 GetIntOrFpInductionDescriptor,
51 const TargetLibraryInfo &TLI) {
52
54 Plan.getVectorLoopRegion());
56 // Skip blocks outside region
57 if (!VPBB->getParent())
58 break;
59 VPRecipeBase *Term = VPBB->getTerminator();
60 auto EndIter = Term ? Term->getIterator() : VPBB->end();
61 // Introduce each ingredient into VPlan.
62 for (VPRecipeBase &Ingredient :
63 make_early_inc_range(make_range(VPBB->begin(), EndIter))) {
64
65 VPValue *VPV = Ingredient.getVPSingleValue();
66 if (!VPV->getUnderlyingValue())
67 continue;
68
69 Instruction *Inst = cast<Instruction>(VPV->getUnderlyingValue());
70
71 VPRecipeBase *NewRecipe = nullptr;
72 if (auto *PhiR = dyn_cast<VPPhi>(&Ingredient)) {
73 auto *Phi = cast<PHINode>(PhiR->getUnderlyingValue());
74 const auto *II = GetIntOrFpInductionDescriptor(Phi);
75 if (!II) {
76 NewRecipe = new VPWidenPHIRecipe(Phi, nullptr, PhiR->getDebugLoc());
77 for (VPValue *Op : PhiR->operands())
78 NewRecipe->addOperand(Op);
79 } else {
80 VPValue *Start = Plan.getOrAddLiveIn(II->getStartValue());
81 VPValue *Step =
83 NewRecipe = new VPWidenIntOrFpInductionRecipe(
84 Phi, Start, Step, &Plan.getVF(), *II, Ingredient.getDebugLoc());
85 }
86 } else {
87 assert(isa<VPInstruction>(&Ingredient) &&
88 "only VPInstructions expected here");
89 assert(!isa<PHINode>(Inst) && "phis should be handled above");
90 // Create VPWidenMemoryRecipe for loads and stores.
91 if (LoadInst *Load = dyn_cast<LoadInst>(Inst)) {
92 NewRecipe = new VPWidenLoadRecipe(
93 *Load, Ingredient.getOperand(0), nullptr /*Mask*/,
94 false /*Consecutive*/, false /*Reverse*/, VPIRMetadata(*Load),
95 Ingredient.getDebugLoc());
96 } else if (StoreInst *Store = dyn_cast<StoreInst>(Inst)) {
97 NewRecipe = new VPWidenStoreRecipe(
98 *Store, Ingredient.getOperand(1), Ingredient.getOperand(0),
99 nullptr /*Mask*/, false /*Consecutive*/, false /*Reverse*/,
100 VPIRMetadata(*Store), Ingredient.getDebugLoc());
101 } else if (GetElementPtrInst *GEP = dyn_cast<GetElementPtrInst>(Inst)) {
102 NewRecipe = new VPWidenGEPRecipe(GEP, Ingredient.operands());
103 } else if (CallInst *CI = dyn_cast<CallInst>(Inst)) {
104 Intrinsic::ID VectorID = getVectorIntrinsicIDForCall(CI, &TLI);
105 if (VectorID == Intrinsic::not_intrinsic)
106 return false;
107 NewRecipe = new VPWidenIntrinsicRecipe(
108 *CI, getVectorIntrinsicIDForCall(CI, &TLI),
109 drop_end(Ingredient.operands()), CI->getType(),
110 CI->getDebugLoc());
111 } else if (SelectInst *SI = dyn_cast<SelectInst>(Inst)) {
112 NewRecipe = new VPWidenSelectRecipe(*SI, Ingredient.operands());
113 } else if (auto *CI = dyn_cast<CastInst>(Inst)) {
114 NewRecipe = new VPWidenCastRecipe(
115 CI->getOpcode(), Ingredient.getOperand(0), CI->getType(), *CI);
116 } else {
117 NewRecipe = new VPWidenRecipe(*Inst, Ingredient.operands());
118 }
119 }
120
121 NewRecipe->insertBefore(&Ingredient);
122 if (NewRecipe->getNumDefinedValues() == 1)
123 VPV->replaceAllUsesWith(NewRecipe->getVPSingleValue());
124 else
125 assert(NewRecipe->getNumDefinedValues() == 0 &&
126 "Only recpies with zero or one defined values expected");
127 Ingredient.eraseFromParent();
128 }
129 }
130 return true;
131}
132
133/// Return true if we do not know how to (mechanically) hoist or sink \p R out
134/// of a loop region.
136 // Assumes don't alias anything or throw; as long as they're guaranteed to
137 // execute, they're safe to hoist.
139 return false;
140
141 // TODO: Relax checks in the future, e.g. we could also hoist reads, if their
142 // memory location is not modified in the vector loop.
143 if (R.mayHaveSideEffects() || R.mayReadFromMemory() || R.isPhi())
144 return true;
145
146 // Allocas cannot be hoisted.
147 auto *RepR = dyn_cast<VPReplicateRecipe>(&R);
148 return RepR && RepR->getOpcode() == Instruction::Alloca;
149}
150
151static bool sinkScalarOperands(VPlan &Plan) {
152 auto Iter = vp_depth_first_deep(Plan.getEntry());
153 bool ScalarVFOnly = Plan.hasScalarVFOnly();
154 bool Changed = false;
155
157 auto InsertIfValidSinkCandidate = [ScalarVFOnly, &WorkList](
158 VPBasicBlock *SinkTo, VPValue *Op) {
159 auto *Candidate =
160 dyn_cast_or_null<VPSingleDefRecipe>(Op->getDefiningRecipe());
161 if (!Candidate)
162 return;
163
164 // We only know how to sink VPReplicateRecipes and VPScalarIVStepsRecipes
165 // for now.
167 return;
168
169 if (Candidate->getParent() == SinkTo || cannotHoistOrSinkRecipe(*Candidate))
170 return;
171
172 if (auto *RepR = dyn_cast<VPReplicateRecipe>(Candidate))
173 if (!ScalarVFOnly && RepR->isSingleScalar())
174 return;
175
176 WorkList.insert({SinkTo, Candidate});
177 };
178
179 // First, collect the operands of all recipes in replicate blocks as seeds for
180 // sinking.
182 VPBasicBlock *EntryVPBB = VPR->getEntryBasicBlock();
183 if (!VPR->isReplicator() || EntryVPBB->getSuccessors().size() != 2)
184 continue;
185 VPBasicBlock *VPBB = cast<VPBasicBlock>(EntryVPBB->getSuccessors().front());
186 if (VPBB->getSingleSuccessor() != VPR->getExitingBasicBlock())
187 continue;
188 for (auto &Recipe : *VPBB)
189 for (VPValue *Op : Recipe.operands())
190 InsertIfValidSinkCandidate(VPBB, Op);
191 }
192
193 // Try to sink each replicate or scalar IV steps recipe in the worklist.
194 for (unsigned I = 0; I != WorkList.size(); ++I) {
195 VPBasicBlock *SinkTo;
196 VPSingleDefRecipe *SinkCandidate;
197 std::tie(SinkTo, SinkCandidate) = WorkList[I];
198
199 // All recipe users of SinkCandidate must be in the same block SinkTo or all
200 // users outside of SinkTo must only use the first lane of SinkCandidate. In
201 // the latter case, we need to duplicate SinkCandidate.
202 auto UsersOutsideSinkTo =
203 make_filter_range(SinkCandidate->users(), [SinkTo](VPUser *U) {
204 return cast<VPRecipeBase>(U)->getParent() != SinkTo;
205 });
206 if (any_of(UsersOutsideSinkTo, [SinkCandidate](VPUser *U) {
207 return !U->usesFirstLaneOnly(SinkCandidate);
208 }))
209 continue;
210 bool NeedsDuplicating = !UsersOutsideSinkTo.empty();
211
212 if (NeedsDuplicating) {
213 if (ScalarVFOnly)
214 continue;
215 VPSingleDefRecipe *Clone;
216 if (auto *SinkCandidateRepR =
217 dyn_cast<VPReplicateRecipe>(SinkCandidate)) {
218 // TODO: Handle converting to uniform recipes as separate transform,
219 // then cloning should be sufficient here.
220 Instruction *I = SinkCandidate->getUnderlyingInstr();
221 Clone = new VPReplicateRecipe(I, SinkCandidate->operands(), true,
222 nullptr /*Mask*/, *SinkCandidateRepR);
223 // TODO: add ".cloned" suffix to name of Clone's VPValue.
224 } else {
225 Clone = SinkCandidate->clone();
226 }
227
228 Clone->insertBefore(SinkCandidate);
229 SinkCandidate->replaceUsesWithIf(Clone, [SinkTo](VPUser &U, unsigned) {
230 return cast<VPRecipeBase>(&U)->getParent() != SinkTo;
231 });
232 }
233 SinkCandidate->moveBefore(*SinkTo, SinkTo->getFirstNonPhi());
234 for (VPValue *Op : SinkCandidate->operands())
235 InsertIfValidSinkCandidate(SinkTo, Op);
236 Changed = true;
237 }
238 return Changed;
239}
240
241/// If \p R is a region with a VPBranchOnMaskRecipe in the entry block, return
242/// the mask.
244 auto *EntryBB = dyn_cast<VPBasicBlock>(R->getEntry());
245 if (!EntryBB || EntryBB->size() != 1 ||
246 !isa<VPBranchOnMaskRecipe>(EntryBB->begin()))
247 return nullptr;
248
249 return cast<VPBranchOnMaskRecipe>(&*EntryBB->begin())->getOperand(0);
250}
251
252/// If \p R is a triangle region, return the 'then' block of the triangle.
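/// Schematically (illustrative sketch only, not printed VPlan output):
///
///   Entry -> Then -> Merge
///   Entry ---------> Merge
///
/// i.e. either successor of Entry qualifies as the 'then' block, provided its
/// single successor is the other successor of Entry.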
254 auto *EntryBB = cast<VPBasicBlock>(R->getEntry());
255 if (EntryBB->getNumSuccessors() != 2)
256 return nullptr;
257
258 auto *Succ0 = dyn_cast<VPBasicBlock>(EntryBB->getSuccessors()[0]);
259 auto *Succ1 = dyn_cast<VPBasicBlock>(EntryBB->getSuccessors()[1]);
260 if (!Succ0 || !Succ1)
261 return nullptr;
262
263 if (Succ0->getNumSuccessors() + Succ1->getNumSuccessors() != 1)
264 return nullptr;
265 if (Succ0->getSingleSuccessor() == Succ1)
266 return Succ0;
267 if (Succ1->getSingleSuccessor() == Succ0)
268 return Succ1;
269 return nullptr;
270}
271
272// Merge replicate regions into their successor region, if a replicate region
273// is connected to a successor replicate region with the same predicate by a
274// single, empty VPBasicBlock.
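// Schematically (illustrative sketch only), the pattern handled here is
//
//   [region1: BranchOnMask %m] -> [empty VPBB] -> [region2: BranchOnMask %m]
//
// Recipes from region1's 'then' block are moved into region2's 'then' block,
// region1's VPPredInstPHIRecipes are moved into region2's merge block, and
// region1 is then disconnected from the CFG.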
276 SmallPtrSet<VPRegionBlock *, 4> TransformedRegions;
277
278 // Collect replicate regions followed by an empty block, followed by another
279 // replicate region with matching masks to process front. This is to avoid
280 // iterator invalidation issues while merging regions.
283 vp_depth_first_deep(Plan.getEntry()))) {
284 if (!Region1->isReplicator())
285 continue;
286 auto *MiddleBasicBlock =
287 dyn_cast_or_null<VPBasicBlock>(Region1->getSingleSuccessor());
288 if (!MiddleBasicBlock || !MiddleBasicBlock->empty())
289 continue;
290
291 auto *Region2 =
292 dyn_cast_or_null<VPRegionBlock>(MiddleBasicBlock->getSingleSuccessor());
293 if (!Region2 || !Region2->isReplicator())
294 continue;
295
296 VPValue *Mask1 = getPredicatedMask(Region1);
297 VPValue *Mask2 = getPredicatedMask(Region2);
298 if (!Mask1 || Mask1 != Mask2)
299 continue;
300
301 assert(Mask1 && Mask2 && "both regions must have conditions");
302 WorkList.push_back(Region1);
303 }
304
305 // Move recipes from Region1 to its successor region, if both are triangles.
306 for (VPRegionBlock *Region1 : WorkList) {
307 if (TransformedRegions.contains(Region1))
308 continue;
309 auto *MiddleBasicBlock = cast<VPBasicBlock>(Region1->getSingleSuccessor());
310 auto *Region2 = cast<VPRegionBlock>(MiddleBasicBlock->getSingleSuccessor());
311
312 VPBasicBlock *Then1 = getPredicatedThenBlock(Region1);
313 VPBasicBlock *Then2 = getPredicatedThenBlock(Region2);
314 if (!Then1 || !Then2)
315 continue;
316
317 // Note: No fusion-preventing memory dependencies are expected in either
318 // region. Such dependencies should be rejected during earlier dependence
319 // checks, which guarantee accesses can be re-ordered for vectorization.
320 //
321 // Move recipes to the successor region.
322 for (VPRecipeBase &ToMove : make_early_inc_range(reverse(*Then1)))
323 ToMove.moveBefore(*Then2, Then2->getFirstNonPhi());
324
325 auto *Merge1 = cast<VPBasicBlock>(Then1->getSingleSuccessor());
326 auto *Merge2 = cast<VPBasicBlock>(Then2->getSingleSuccessor());
327
328 // Move VPPredInstPHIRecipes from the merge block to the successor region's
329 // merge block. Update all users inside the successor region to use the
330 // original values.
331 for (VPRecipeBase &Phi1ToMove : make_early_inc_range(reverse(*Merge1))) {
332 VPValue *PredInst1 =
333 cast<VPPredInstPHIRecipe>(&Phi1ToMove)->getOperand(0);
334 VPValue *Phi1ToMoveV = Phi1ToMove.getVPSingleValue();
335 Phi1ToMoveV->replaceUsesWithIf(PredInst1, [Then2](VPUser &U, unsigned) {
336 return cast<VPRecipeBase>(&U)->getParent() == Then2;
337 });
338
339 // Remove phi recipes that are unused after merging the regions.
340 if (Phi1ToMove.getVPSingleValue()->getNumUsers() == 0) {
341 Phi1ToMove.eraseFromParent();
342 continue;
343 }
344 Phi1ToMove.moveBefore(*Merge2, Merge2->begin());
345 }
346
347 // Remove the dead recipes in Region1's entry block.
348 for (VPRecipeBase &R :
349 make_early_inc_range(reverse(*Region1->getEntryBasicBlock())))
350 R.eraseFromParent();
351
352 // Finally, remove the first region.
353 for (VPBlockBase *Pred : make_early_inc_range(Region1->getPredecessors())) {
354 VPBlockUtils::disconnectBlocks(Pred, Region1);
355 VPBlockUtils::connectBlocks(Pred, MiddleBasicBlock);
356 }
357 VPBlockUtils::disconnectBlocks(Region1, MiddleBasicBlock);
358 TransformedRegions.insert(Region1);
359 }
360
361 return !TransformedRegions.empty();
362}
363
365 VPlan &Plan) {
366 Instruction *Instr = PredRecipe->getUnderlyingInstr();
367 // Build the triangular if-then region.
368 std::string RegionName = (Twine("pred.") + Instr->getOpcodeName()).str();
369 assert(Instr->getParent() && "Predicated instruction not in any basic block");
370 auto *BlockInMask = PredRecipe->getMask();
371 auto *MaskDef = BlockInMask->getDefiningRecipe();
372 auto *BOMRecipe = new VPBranchOnMaskRecipe(
373 BlockInMask, MaskDef ? MaskDef->getDebugLoc() : DebugLoc::getUnknown());
374 auto *Entry =
375 Plan.createVPBasicBlock(Twine(RegionName) + ".entry", BOMRecipe);
376
377 // Replace predicated replicate recipe with a replicate recipe without a
378 // mask but in the replicate region.
379 auto *RecipeWithoutMask = new VPReplicateRecipe(
380 PredRecipe->getUnderlyingInstr(), drop_end(PredRecipe->operands()),
381 PredRecipe->isSingleScalar(), nullptr /*Mask*/, *PredRecipe);
382 auto *Pred =
383 Plan.createVPBasicBlock(Twine(RegionName) + ".if", RecipeWithoutMask);
384
385 VPPredInstPHIRecipe *PHIRecipe = nullptr;
386 if (PredRecipe->getNumUsers() != 0) {
387 PHIRecipe = new VPPredInstPHIRecipe(RecipeWithoutMask,
388 RecipeWithoutMask->getDebugLoc());
389 PredRecipe->replaceAllUsesWith(PHIRecipe);
390 PHIRecipe->setOperand(0, RecipeWithoutMask);
391 }
392 PredRecipe->eraseFromParent();
393 auto *Exiting =
394 Plan.createVPBasicBlock(Twine(RegionName) + ".continue", PHIRecipe);
396 Plan.createReplicateRegion(Entry, Exiting, RegionName);
397
398 // Note: first set Entry as region entry and then connect successors starting
399 // from it in order, to propagate the "parent" of each VPBasicBlock.
400 VPBlockUtils::insertTwoBlocksAfter(Pred, Exiting, Entry);
401 VPBlockUtils::connectBlocks(Pred, Exiting);
402
403 return Region;
404}
405
406static void addReplicateRegions(VPlan &Plan) {
409 vp_depth_first_deep(Plan.getEntry()))) {
410 for (VPRecipeBase &R : *VPBB)
411 if (auto *RepR = dyn_cast<VPReplicateRecipe>(&R)) {
412 if (RepR->isPredicated())
413 WorkList.push_back(RepR);
414 }
415 }
416
417 unsigned BBNum = 0;
418 for (VPReplicateRecipe *RepR : WorkList) {
419 VPBasicBlock *CurrentBlock = RepR->getParent();
420 VPBasicBlock *SplitBlock = CurrentBlock->splitAt(RepR->getIterator());
421
422 BasicBlock *OrigBB = RepR->getUnderlyingInstr()->getParent();
423 SplitBlock->setName(
424 OrigBB->hasName() ? OrigBB->getName() + "." + Twine(BBNum++) : "");
425 // Record predicated instructions for above packing optimizations.
427 Region->setParent(CurrentBlock->getParent());
429
430 VPRegionBlock *ParentRegion = Region->getParent();
431 if (ParentRegion && ParentRegion->getExiting() == CurrentBlock)
432 ParentRegion->setExiting(SplitBlock);
433 }
434}
435
436/// Remove redundant VPBasicBlocks by merging them into their predecessor if
437/// the predecessor has a single successor.
441 vp_depth_first_deep(Plan.getEntry()))) {
442 // Don't fold the blocks in the skeleton of the Plan into their single
443 // predecessors for now.
444 // TODO: Remove restriction once more of the skeleton is modeled in VPlan.
445 if (!VPBB->getParent())
446 continue;
447 auto *PredVPBB =
448 dyn_cast_or_null<VPBasicBlock>(VPBB->getSinglePredecessor());
449 if (!PredVPBB || PredVPBB->getNumSuccessors() != 1 ||
450 isa<VPIRBasicBlock>(PredVPBB))
451 continue;
452 WorkList.push_back(VPBB);
453 }
454
455 for (VPBasicBlock *VPBB : WorkList) {
456 VPBasicBlock *PredVPBB = cast<VPBasicBlock>(VPBB->getSinglePredecessor());
457 for (VPRecipeBase &R : make_early_inc_range(*VPBB))
458 R.moveBefore(*PredVPBB, PredVPBB->end());
459 VPBlockUtils::disconnectBlocks(PredVPBB, VPBB);
460 auto *ParentRegion = VPBB->getParent();
461 if (ParentRegion && ParentRegion->getExiting() == VPBB)
462 ParentRegion->setExiting(PredVPBB);
463 for (auto *Succ : to_vector(VPBB->successors())) {
465 VPBlockUtils::connectBlocks(PredVPBB, Succ);
466 }
467 // VPBB is now dead and will be cleaned up when the plan gets destroyed.
468 }
469 return !WorkList.empty();
470}
471
473 // Convert masked VPReplicateRecipes to if-then region blocks.
475
476 bool ShouldSimplify = true;
477 while (ShouldSimplify) {
478 ShouldSimplify = sinkScalarOperands(Plan);
479 ShouldSimplify |= mergeReplicateRegionsIntoSuccessors(Plan);
480 ShouldSimplify |= mergeBlocksIntoPredecessors(Plan);
481 }
482}
483
484/// Remove redundant casts of inductions.
485///
486/// Such redundant casts are casts of induction variables that can be ignored,
487/// because we already proved that the casted phi is equal to the uncasted phi
488/// in the vectorized loop. There is no need to vectorize the cast - the same
489/// value can be used for both the phi and casts in the vector loop.
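/// For example (illustrative IR), for an induction
///   %iv   = phi i32 [ 0, %ph ], [ %iv.next, %loop ]
///   %cast = sext i32 %iv to i64
/// where the induction descriptor recorded %cast as redundant, users of the
/// recipe created for %cast are redirected to the IV recipe itself; the dead
/// cast recipes are removed by later cleanup.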
491 for (auto &Phi : Plan.getVectorLoopRegion()->getEntryBasicBlock()->phis()) {
493 if (!IV || IV->getTruncInst())
494 continue;
495
496 // A sequence of IR Casts has potentially been recorded for IV, which
497 // *must be bypassed* when the IV is vectorized, because the vectorized IV
498 // will produce the desired casted value. This sequence forms a def-use
499 // chain and is provided in reverse order, ending with the cast that uses
500 // the IV phi. Search for the recipe of the last cast in the chain and
501 // replace it with the original IV. Note that only the final cast is
502 // expected to have users outside the cast-chain and the dead casts left
503 // over will be cleaned up later.
504 auto &Casts = IV->getInductionDescriptor().getCastInsts();
505 VPValue *FindMyCast = IV;
506 for (Instruction *IRCast : reverse(Casts)) {
507 VPSingleDefRecipe *FoundUserCast = nullptr;
508 for (auto *U : FindMyCast->users()) {
509 auto *UserCast = dyn_cast<VPSingleDefRecipe>(U);
510 if (UserCast && UserCast->getUnderlyingValue() == IRCast) {
511 FoundUserCast = UserCast;
512 break;
513 }
514 }
515 FindMyCast = FoundUserCast;
516 }
517 FindMyCast->replaceAllUsesWith(IV);
518 }
519}
520
521/// Try to replace VPWidenCanonicalIVRecipes with a widened canonical IV
522/// recipe, if it exists.
524 VPRegionBlock *LoopRegion = Plan.getVectorLoopRegion();
525 VPCanonicalIVPHIRecipe *CanonicalIV = LoopRegion->getCanonicalIV();
526 VPWidenCanonicalIVRecipe *WidenNewIV = nullptr;
527 for (VPUser *U : CanonicalIV->users()) {
529 if (WidenNewIV)
530 break;
531 }
532
533 if (!WidenNewIV)
534 return;
535
536 VPBasicBlock *HeaderVPBB = LoopRegion->getEntryBasicBlock();
537 for (VPRecipeBase &Phi : HeaderVPBB->phis()) {
538 auto *WidenOriginalIV = dyn_cast<VPWidenIntOrFpInductionRecipe>(&Phi);
539
540 if (!WidenOriginalIV || !WidenOriginalIV->isCanonical())
541 continue;
542
543 // Replace WidenNewIV with WidenOriginalIV if WidenOriginalIV provides
544 // everything WidenNewIV's users need. That is, WidenOriginalIV will
545 // generate a vector phi or all users of WidenNewIV demand the first lane
546 // only.
547 if (!vputils::onlyScalarValuesUsed(WidenOriginalIV) ||
548 vputils::onlyFirstLaneUsed(WidenNewIV)) {
549 WidenNewIV->replaceAllUsesWith(WidenOriginalIV);
550 WidenNewIV->eraseFromParent();
551 return;
552 }
553 }
554}
555
556/// Returns true if \p R is dead and can be removed.
557static bool isDeadRecipe(VPRecipeBase &R) {
558 // Do remove conditional assume instructions as their conditions may be
559 // flattened.
560 auto *RepR = dyn_cast<VPReplicateRecipe>(&R);
561 bool IsConditionalAssume = RepR && RepR->isPredicated() &&
563 if (IsConditionalAssume)
564 return true;
565
566 if (R.mayHaveSideEffects())
567 return false;
568
569 // Recipe is dead if no user keeps the recipe alive.
570 return all_of(R.definedValues(),
571 [](VPValue *V) { return V->getNumUsers() == 0; });
572}
573
576 vp_post_order_deep(Plan.getEntry()))) {
577 // The recipes in the block are processed in reverse order, to catch chains
578 // of dead recipes.
579 for (VPRecipeBase &R : make_early_inc_range(reverse(*VPBB))) {
580 if (isDeadRecipe(R)) {
581 R.eraseFromParent();
582 continue;
583 }
584
585 // Check if R is a dead VPPhi <-> update cycle and remove it.
586 auto *PhiR = dyn_cast<VPPhi>(&R);
587 if (!PhiR || PhiR->getNumOperands() != 2 || PhiR->getNumUsers() != 1)
588 continue;
589 VPValue *Incoming = PhiR->getOperand(1);
590 if (*PhiR->user_begin() != Incoming->getDefiningRecipe() ||
591 Incoming->getNumUsers() != 1)
592 continue;
593 PhiR->replaceAllUsesWith(PhiR->getOperand(0));
594 PhiR->eraseFromParent();
595 Incoming->getDefiningRecipe()->eraseFromParent();
596 }
597 }
598}
599
602 Instruction::BinaryOps InductionOpcode,
603 FPMathOperator *FPBinOp, Instruction *TruncI,
604 VPValue *StartV, VPValue *Step, DebugLoc DL,
605 VPBuilder &Builder) {
606 VPRegionBlock *LoopRegion = Plan.getVectorLoopRegion();
607 VPBasicBlock *HeaderVPBB = LoopRegion->getEntryBasicBlock();
608 VPCanonicalIVPHIRecipe *CanonicalIV = LoopRegion->getCanonicalIV();
609 VPSingleDefRecipe *BaseIV = Builder.createDerivedIV(
610 Kind, FPBinOp, StartV, CanonicalIV, Step, "offset.idx");
611
612 // Truncate base induction if needed.
613 VPTypeAnalysis TypeInfo(Plan);
614 Type *ResultTy = TypeInfo.inferScalarType(BaseIV);
615 if (TruncI) {
616 Type *TruncTy = TruncI->getType();
617 assert(ResultTy->getScalarSizeInBits() > TruncTy->getScalarSizeInBits() &&
618 "Not truncating.");
619 assert(ResultTy->isIntegerTy() && "Truncation requires an integer type");
620 BaseIV = Builder.createScalarCast(Instruction::Trunc, BaseIV, TruncTy, DL);
621 ResultTy = TruncTy;
622 }
623
624 // Truncate step if needed.
625 Type *StepTy = TypeInfo.inferScalarType(Step);
626 if (ResultTy != StepTy) {
627 assert(StepTy->getScalarSizeInBits() > ResultTy->getScalarSizeInBits() &&
628 "Not truncating.");
629 assert(StepTy->isIntegerTy() && "Truncation requires an integer type");
630 auto *VecPreheader =
632 VPBuilder::InsertPointGuard Guard(Builder);
633 Builder.setInsertPoint(VecPreheader);
634 Step = Builder.createScalarCast(Instruction::Trunc, Step, ResultTy, DL);
635 }
636 return Builder.createScalarIVSteps(InductionOpcode, FPBinOp, BaseIV, Step,
637 &Plan.getVF(), DL);
638}
639
642 for (unsigned I = 0; I != Users.size(); ++I) {
644 if (isa<VPHeaderPHIRecipe>(Cur))
645 continue;
646 for (VPValue *V : Cur->definedValues())
647 Users.insert_range(V->users());
648 }
649 return Users.takeVector();
650}
651
652/// Legalize VPWidenPointerInductionRecipe, by replacing it with a PtrAdd
653/// (IndStart, ScalarIVSteps (0, Step)) if only its scalar values are used, as
654/// VPWidenPointerInductionRecipe will generate vectors only. If some users
655/// require vectors while others require scalars, the scalar uses need to extract
656/// the scalars from the generated vectors (Note that this is different to how
657/// int/fp inductions are handled). Legalize extract-from-ends using uniform
658/// VPReplicateRecipe of wide inductions to use regular VPReplicateRecipe, so
659/// the correct end value is available. Also optimize
660/// VPWidenIntOrFpInductionRecipe, if any of its users needs scalar values, by
661/// providing them scalar steps built on the canonical scalar IV and update the
662/// original IV's users. This is an optional optimization to reduce the needs of
663/// vector extracts.
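/// For example (illustrative), a pointer induction
///   %ptr.iv = phi ptr [ %start, %ph ], [ %ptr.next, %loop ]
/// whose vector value is never used is rewritten as roughly
///   %steps    = scalar-iv-steps 0, %step
///   %next.gep = ptr-add %start, %steps
/// so only the scalar addresses that are actually needed are generated.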
666 bool HasOnlyVectorVFs = !Plan.hasScalarVFOnly();
667 VPBuilder Builder(HeaderVPBB, HeaderVPBB->getFirstNonPhi());
668 for (VPRecipeBase &Phi : HeaderVPBB->phis()) {
669 auto *PhiR = dyn_cast<VPWidenInductionRecipe>(&Phi);
670 if (!PhiR)
671 continue;
672
673 // Try to narrow wide and replicating recipes to uniform recipes, based on
674 // VPlan analysis.
675 // TODO: Apply to all recipes in the future, to replace legacy uniformity
676 // analysis.
677 auto Users = collectUsersRecursively(PhiR);
678 for (VPUser *U : reverse(Users)) {
679 auto *Def = dyn_cast<VPSingleDefRecipe>(U);
680 auto *RepR = dyn_cast<VPReplicateRecipe>(U);
681 // Skip recipes that shouldn't be narrowed.
682 if (!Def || !isa<VPReplicateRecipe, VPWidenRecipe>(Def) ||
683 Def->getNumUsers() == 0 || !Def->getUnderlyingValue() ||
684 (RepR && (RepR->isSingleScalar() || RepR->isPredicated())))
685 continue;
686
687 // Skip recipes that may have lanes other than their first one used.
689 continue;
690
691 auto *Clone = new VPReplicateRecipe(Def->getUnderlyingInstr(),
692 Def->operands(), /*IsUniform*/ true);
693 Clone->insertAfter(Def);
694 Def->replaceAllUsesWith(Clone);
695 }
696
697 // Replace wide pointer inductions which have only their scalars used by
698 // PtrAdd(IndStart, ScalarIVSteps (0, Step)).
699 if (auto *PtrIV = dyn_cast<VPWidenPointerInductionRecipe>(&Phi)) {
700 if (!PtrIV->onlyScalarsGenerated(Plan.hasScalableVF()))
701 continue;
702
703 const InductionDescriptor &ID = PtrIV->getInductionDescriptor();
704 VPValue *StartV = Plan.getConstantInt(ID.getStep()->getType(), 0);
705 VPValue *StepV = PtrIV->getOperand(1);
707 Plan, InductionDescriptor::IK_IntInduction, Instruction::Add, nullptr,
708 nullptr, StartV, StepV, PtrIV->getDebugLoc(), Builder);
709
710 VPValue *PtrAdd = Builder.createPtrAdd(PtrIV->getStartValue(), Steps,
711 PtrIV->getDebugLoc(), "next.gep");
712
713 PtrIV->replaceAllUsesWith(PtrAdd);
714 continue;
715 }
716
717 // Replace widened induction with scalar steps for users that only use
718 // scalars.
719 auto *WideIV = cast<VPWidenIntOrFpInductionRecipe>(&Phi);
720 if (HasOnlyVectorVFs && none_of(WideIV->users(), [WideIV](VPUser *U) {
721 return U->usesScalars(WideIV);
722 }))
723 continue;
724
725 const InductionDescriptor &ID = WideIV->getInductionDescriptor();
727 Plan, ID.getKind(), ID.getInductionOpcode(),
728 dyn_cast_or_null<FPMathOperator>(ID.getInductionBinOp()),
729 WideIV->getTruncInst(), WideIV->getStartValue(), WideIV->getStepValue(),
730 WideIV->getDebugLoc(), Builder);
731
732 // Update scalar users of IV to use Step instead.
733 if (!HasOnlyVectorVFs)
734 WideIV->replaceAllUsesWith(Steps);
735 else
736 WideIV->replaceUsesWithIf(Steps, [WideIV](VPUser &U, unsigned) {
737 return U.usesScalars(WideIV);
738 });
739 }
740}
741
742/// Check if \p VPV is an untruncated wide induction, either before or after the
743/// increment. If so, return the header IV (before the increment); otherwise
744/// return null.
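/// For example (illustrative), given
///   %iv      = WIDEN-INDUCTION %start, %step
///   %iv.next = add %iv, %step
/// both %iv and %iv.next are optimizable, and %iv is returned for either query.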
746 ScalarEvolution &SE) {
747 auto *WideIV = dyn_cast<VPWidenInductionRecipe>(VPV);
748 if (WideIV) {
749 // VPV itself is a wide induction, separately compute the end value for exit
750 // users if it is not a truncated IV.
751 auto *IntOrFpIV = dyn_cast<VPWidenIntOrFpInductionRecipe>(WideIV);
752 return (IntOrFpIV && IntOrFpIV->getTruncInst()) ? nullptr : WideIV;
753 }
754
755 // Check if VPV is an optimizable induction increment.
756 VPRecipeBase *Def = VPV->getDefiningRecipe();
757 if (!Def || Def->getNumOperands() != 2)
758 return nullptr;
759 WideIV = dyn_cast<VPWidenInductionRecipe>(Def->getOperand(0));
760 if (!WideIV)
761 WideIV = dyn_cast<VPWidenInductionRecipe>(Def->getOperand(1));
762 if (!WideIV)
763 return nullptr;
764
765 auto IsWideIVInc = [&]() {
766 auto &ID = WideIV->getInductionDescriptor();
767
768 // Check if VPV increments the induction by the induction step.
769 VPValue *IVStep = WideIV->getStepValue();
770 switch (ID.getInductionOpcode()) {
771 case Instruction::Add:
772 return match(VPV, m_c_Add(m_Specific(WideIV), m_Specific(IVStep)));
773 case Instruction::FAdd:
775 m_Specific(IVStep)));
776 case Instruction::FSub:
777 return match(VPV, m_Binary<Instruction::FSub>(m_Specific(WideIV),
778 m_Specific(IVStep)));
779 case Instruction::Sub: {
780 // IVStep will be the negated step of the subtraction. Check if Step == -1
781 // * IVStep.
782 VPValue *Step;
783 if (!match(VPV, m_Sub(m_VPValue(), m_VPValue(Step))))
784 return false;
785 const SCEV *IVStepSCEV = vputils::getSCEVExprForVPValue(IVStep, SE);
786 const SCEV *StepSCEV = vputils::getSCEVExprForVPValue(Step, SE);
787 return !isa<SCEVCouldNotCompute>(IVStepSCEV) &&
788 !isa<SCEVCouldNotCompute>(StepSCEV) &&
789 IVStepSCEV == SE.getNegativeSCEV(StepSCEV);
790 }
791 default:
792 return ID.getKind() == InductionDescriptor::IK_PtrInduction &&
793 match(VPV, m_GetElementPtr(m_Specific(WideIV),
794 m_Specific(WideIV->getStepValue())));
795 }
796 llvm_unreachable("should have been covered by switch above");
797 };
798 return IsWideIVInc() ? WideIV : nullptr;
799}
800
801/// Attempts to optimize the induction variable exit values for users in the
802/// early exit block.
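/// The exit value is recomputed (illustrative sketch) as
///   EndValue = CanonicalIV + FirstActiveLane(EarlyExitMask)
/// plus one if the exiting user reads the incremented IV, and is then mapped
/// through a derived IV for non-canonical start/step values.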
804 VPTypeAnalysis &TypeInfo,
805 VPBlockBase *PredVPBB,
806 VPValue *Op,
807 ScalarEvolution &SE) {
808 VPValue *Incoming, *Mask;
811 return nullptr;
812
813 auto *WideIV = getOptimizableIVOf(Incoming, SE);
814 if (!WideIV)
815 return nullptr;
816
817 auto *WideIntOrFp = dyn_cast<VPWidenIntOrFpInductionRecipe>(WideIV);
818 if (WideIntOrFp && WideIntOrFp->getTruncInst())
819 return nullptr;
820
821 // Calculate the final index.
822 VPRegionBlock *LoopRegion = Plan.getVectorLoopRegion();
823 auto *CanonicalIV = LoopRegion->getCanonicalIV();
824 Type *CanonicalIVType = LoopRegion->getCanonicalIVType();
825 VPBuilder B(cast<VPBasicBlock>(PredVPBB));
826
827 DebugLoc DL = cast<VPInstruction>(Op)->getDebugLoc();
828 VPValue *FirstActiveLane =
829 B.createNaryOp(VPInstruction::FirstActiveLane, Mask, DL);
830 Type *FirstActiveLaneType = TypeInfo.inferScalarType(FirstActiveLane);
831 FirstActiveLane = B.createScalarZExtOrTrunc(FirstActiveLane, CanonicalIVType,
832 FirstActiveLaneType, DL);
833 VPValue *EndValue =
834 B.createNaryOp(Instruction::Add, {CanonicalIV, FirstActiveLane}, DL);
835
836 // `getOptimizableIVOf()` always returns the pre-incremented IV, so if it
837 // changed it means the exit is using the incremented value, so we need to
838 // add the step.
839 if (Incoming != WideIV) {
840 VPValue *One = Plan.getConstantInt(CanonicalIVType, 1);
841 EndValue = B.createNaryOp(Instruction::Add, {EndValue, One}, DL);
842 }
843
844 if (!WideIntOrFp || !WideIntOrFp->isCanonical()) {
845 const InductionDescriptor &ID = WideIV->getInductionDescriptor();
846 VPValue *Start = WideIV->getStartValue();
847 VPValue *Step = WideIV->getStepValue();
848 EndValue = B.createDerivedIV(
849 ID.getKind(), dyn_cast_or_null<FPMathOperator>(ID.getInductionBinOp()),
850 Start, EndValue, Step);
851 }
852
853 return EndValue;
854}
855
856/// Attempts to optimize the induction variable exit values for users in the
857/// exit block coming from the latch in the original scalar loop.
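/// The pre-computed end value corresponds to the IV after the final iteration
/// (illustrative sketch): a user of the incremented IV can take EndValue
/// directly, while a user of the pre-incremented IV receives EndValue minus
/// one step (using the matching FSub/FAdd or ptradd form for floating-point
/// and pointer inductions).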
859 VPlan &Plan, VPTypeAnalysis &TypeInfo, VPBlockBase *PredVPBB, VPValue *Op,
863 return nullptr;
864
865 auto *WideIV = getOptimizableIVOf(Incoming, SE);
866 if (!WideIV)
867 return nullptr;
868
869 VPValue *EndValue = EndValues.lookup(WideIV);
870 assert(EndValue && "end value must have been pre-computed");
871
872 // `getOptimizableIVOf()` always returns the pre-incremented IV, so if it
873 // changed it means the exit is using the incremented value, so we don't
874 // need to subtract the step.
875 if (Incoming != WideIV)
876 return EndValue;
877
878 // Otherwise, subtract the step from the EndValue.
879 VPBuilder B(cast<VPBasicBlock>(PredVPBB)->getTerminator());
880 VPValue *Step = WideIV->getStepValue();
881 Type *ScalarTy = TypeInfo.inferScalarType(WideIV);
882 if (ScalarTy->isIntegerTy())
883 return B.createNaryOp(Instruction::Sub, {EndValue, Step}, {}, "ind.escape");
884 if (ScalarTy->isPointerTy()) {
885 Type *StepTy = TypeInfo.inferScalarType(Step);
886 auto *Zero = Plan.getConstantInt(StepTy, 0);
887 return B.createPtrAdd(EndValue,
888 B.createNaryOp(Instruction::Sub, {Zero, Step}),
889 DebugLoc::getUnknown(), "ind.escape");
890 }
891 if (ScalarTy->isFloatingPointTy()) {
892 const auto &ID = WideIV->getInductionDescriptor();
893 return B.createNaryOp(
894 ID.getInductionBinOp()->getOpcode() == Instruction::FAdd
895 ? Instruction::FSub
896 : Instruction::FAdd,
897 {EndValue, Step}, {ID.getInductionBinOp()->getFastMathFlags()});
898 }
899 llvm_unreachable("all possible induction types must be handled");
900 return nullptr;
901}
902
904 VPlan &Plan, DenseMap<VPValue *, VPValue *> &EndValues,
905 ScalarEvolution &SE) {
906 VPBlockBase *MiddleVPBB = Plan.getMiddleBlock();
907 VPTypeAnalysis TypeInfo(Plan);
908 for (VPIRBasicBlock *ExitVPBB : Plan.getExitBlocks()) {
909 for (VPRecipeBase &R : ExitVPBB->phis()) {
910 auto *ExitIRI = cast<VPIRPhi>(&R);
911
912 for (auto [Idx, PredVPBB] : enumerate(ExitVPBB->getPredecessors())) {
913 VPValue *Escape = nullptr;
914 if (PredVPBB == MiddleVPBB)
915 Escape = optimizeLatchExitInductionUser(Plan, TypeInfo, PredVPBB,
916 ExitIRI->getOperand(Idx),
917 EndValues, SE);
918 else
919 Escape = optimizeEarlyExitInductionUser(Plan, TypeInfo, PredVPBB,
920 ExitIRI->getOperand(Idx), SE);
921 if (Escape)
922 ExitIRI->setOperand(Idx, Escape);
923 }
924 }
925 }
926}
927
928/// Remove redundant ExpandSCEVRecipes in \p Plan's entry block by replacing
929/// them with already existing recipes expanding the same SCEV expression.
932
933 for (VPRecipeBase &R :
935 auto *ExpR = dyn_cast<VPExpandSCEVRecipe>(&R);
936 if (!ExpR)
937 continue;
938
939 const auto &[V, Inserted] = SCEV2VPV.try_emplace(ExpR->getSCEV(), ExpR);
940 if (Inserted)
941 continue;
942 ExpR->replaceAllUsesWith(V->second);
943 ExpR->eraseFromParent();
944 }
945}
946
948 SmallVector<VPValue *> WorkList;
950 WorkList.push_back(V);
951
952 while (!WorkList.empty()) {
953 VPValue *Cur = WorkList.pop_back_val();
954 if (!Seen.insert(Cur).second)
955 continue;
957 if (!R)
958 continue;
959 if (!isDeadRecipe(*R))
960 continue;
961 append_range(WorkList, R->operands());
962 R->eraseFromParent();
963 }
964}
965
966/// Get any instruction opcode or intrinsic ID data embedded in recipe \p R.
967/// Returns an optional pair, where the first element indicates whether it is
968/// an intrinsic ID.
969static std::optional<std::pair<bool, unsigned>>
971 return TypeSwitch<const VPSingleDefRecipe *,
972 std::optional<std::pair<bool, unsigned>>>(R)
975 [](auto *I) { return std::make_pair(false, I->getOpcode()); })
976 .Case<VPWidenIntrinsicRecipe>([](auto *I) {
977 return std::make_pair(true, I->getVectorIntrinsicID());
978 })
979 .Case<VPVectorPointerRecipe, VPPredInstPHIRecipe>([](auto *I) {
980 // For recipes that do not directly map to LLVM IR instructions,
981 // assign opcodes after the last VPInstruction opcode (which is also
982 // after the last IR Instruction opcode), based on the VPDefID.
983 return std::make_pair(false,
984 VPInstruction::OpsEnd + 1 + I->getVPDefID());
985 })
986 .Default([](auto *) { return std::nullopt; });
987}
988
989/// Try to fold \p R using InstSimplifyFolder. Will succeed and return a
990/// non-nullptr VPValue for a handled opcode or intrinsic ID if corresponding \p
991/// Operands are foldable live-ins.
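/// For example (illustrative), an add whose operands are the live-in constants
/// 3 and 5 folds to the live-in constant 8, and an extractelement of a live-in
/// simply returns that (broadcast) live-in scalar.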
993 ArrayRef<VPValue *> Operands,
994 const DataLayout &DL,
995 VPTypeAnalysis &TypeInfo) {
996 auto OpcodeOrIID = getOpcodeOrIntrinsicID(&R);
997 if (!OpcodeOrIID)
998 return nullptr;
999
1001 for (VPValue *Op : Operands) {
1002 if (!Op->isLiveIn() || !Op->getLiveInIRValue())
1003 return nullptr;
1004 Ops.push_back(Op->getLiveInIRValue());
1005 }
1006
1007 auto FoldToIRValue = [&]() -> Value * {
1008 InstSimplifyFolder Folder(DL);
1009 if (OpcodeOrIID->first) {
1010 if (R.getNumOperands() != 2)
1011 return nullptr;
1012 unsigned ID = OpcodeOrIID->second;
1013 return Folder.FoldBinaryIntrinsic(ID, Ops[0], Ops[1],
1014 TypeInfo.inferScalarType(&R));
1015 }
1016 unsigned Opcode = OpcodeOrIID->second;
1017 if (Instruction::isBinaryOp(Opcode))
1018 return Folder.FoldBinOp(static_cast<Instruction::BinaryOps>(Opcode),
1019 Ops[0], Ops[1]);
1020 if (Instruction::isCast(Opcode))
1021 return Folder.FoldCast(static_cast<Instruction::CastOps>(Opcode), Ops[0],
1022 TypeInfo.inferScalarType(R.getVPSingleValue()));
1023 switch (Opcode) {
1025 return Folder.FoldSelect(Ops[0], Ops[1],
1027 case VPInstruction::Not:
1028 return Folder.FoldBinOp(Instruction::BinaryOps::Xor, Ops[0],
1030 case Instruction::Select:
1031 return Folder.FoldSelect(Ops[0], Ops[1], Ops[2]);
1032 case Instruction::ICmp:
1033 case Instruction::FCmp:
1034 return Folder.FoldCmp(cast<VPRecipeWithIRFlags>(R).getPredicate(), Ops[0],
1035 Ops[1]);
1036 case Instruction::GetElementPtr: {
1037 auto &RFlags = cast<VPRecipeWithIRFlags>(R);
1038 auto *GEP = cast<GetElementPtrInst>(RFlags.getUnderlyingInstr());
1039 return Folder.FoldGEP(GEP->getSourceElementType(), Ops[0],
1040 drop_begin(Ops), RFlags.getGEPNoWrapFlags());
1041 }
1044 return Folder.FoldGEP(IntegerType::getInt8Ty(TypeInfo.getContext()),
1045 Ops[0], Ops[1],
1046 cast<VPRecipeWithIRFlags>(R).getGEPNoWrapFlags());
1047 // An extract of a live-in is an extract of a broadcast, so return the
1048 // broadcasted element.
1049 case Instruction::ExtractElement:
1050 assert(!Ops[0]->getType()->isVectorTy() && "Live-ins should be scalar");
1051 return Ops[0];
1052 }
1053 return nullptr;
1054 };
1055
1056 if (Value *V = FoldToIRValue())
1057 return R.getParent()->getPlan()->getOrAddLiveIn(V);
1058 return nullptr;
1059}
1060
1061/// Try to simplify VPSingleDefRecipe \p Def.
1063 VPlan *Plan = Def->getParent()->getPlan();
1064
1065 // Simplification of live-in IR values for SingleDef recipes using
1066 // InstSimplifyFolder.
1067 const DataLayout &DL =
1069 if (VPValue *V = tryToFoldLiveIns(*Def, Def->operands(), DL, TypeInfo))
1070 return Def->replaceAllUsesWith(V);
1071
1072 // Fold PredPHI LiveIn -> LiveIn.
1073 if (auto *PredPHI = dyn_cast<VPPredInstPHIRecipe>(Def)) {
1074 VPValue *Op = PredPHI->getOperand(0);
1075 if (Op->isLiveIn())
1076 PredPHI->replaceAllUsesWith(Op);
1077 }
1078
1079 VPBuilder Builder(Def);
1080 VPValue *A;
1081 if (match(Def, m_Trunc(m_ZExtOrSExt(m_VPValue(A))))) {
1082 Type *TruncTy = TypeInfo.inferScalarType(Def);
1083 Type *ATy = TypeInfo.inferScalarType(A);
1084 if (TruncTy == ATy) {
1085 Def->replaceAllUsesWith(A);
1086 } else {
1087 // Don't replace a scalarizing recipe with a widened cast.
1088 if (isa<VPReplicateRecipe>(Def))
1089 return;
1090 if (ATy->getScalarSizeInBits() < TruncTy->getScalarSizeInBits()) {
1091
1092 unsigned ExtOpcode = match(Def->getOperand(0), m_SExt(m_VPValue()))
1093 ? Instruction::SExt
1094 : Instruction::ZExt;
1095 auto *Ext = Builder.createWidenCast(Instruction::CastOps(ExtOpcode), A,
1096 TruncTy);
1097 if (auto *UnderlyingExt = Def->getOperand(0)->getUnderlyingValue()) {
1098 // UnderlyingExt has distinct return type, used to retain legacy cost.
1099 Ext->setUnderlyingValue(UnderlyingExt);
1100 }
1101 Def->replaceAllUsesWith(Ext);
1102 } else if (ATy->getScalarSizeInBits() > TruncTy->getScalarSizeInBits()) {
1103 auto *Trunc = Builder.createWidenCast(Instruction::Trunc, A, TruncTy);
1104 Def->replaceAllUsesWith(Trunc);
1105 }
1106 }
1107#ifndef NDEBUG
1108 // Verify that the cached type info for both A and its users is still
1109 // accurate by comparing it to freshly computed types.
1110 VPTypeAnalysis TypeInfo2(*Plan);
1111 assert(TypeInfo.inferScalarType(A) == TypeInfo2.inferScalarType(A));
1112 for (VPUser *U : A->users()) {
1113 auto *R = cast<VPRecipeBase>(U);
1114 for (VPValue *VPV : R->definedValues())
1115 assert(TypeInfo.inferScalarType(VPV) == TypeInfo2.inferScalarType(VPV));
1116 }
1117#endif
1118 }
1119
1120 // Simplify (X && Y) || (X && !Y) -> X.
1121 // TODO: Split up into simpler, modular combines: (X && Y) || (X && Z) into X
1122 // && (Y || Z) and (X || !X) into true. This requires queuing newly created
1123 // recipes to be visited during simplification.
1124 VPValue *X, *Y, *Z;
1125 if (match(Def,
1128 Def->replaceAllUsesWith(X);
1129 Def->eraseFromParent();
1130 return;
1131 }
1132
1133 // x | 1 -> 1
1134 if (match(Def, m_c_BinaryOr(m_VPValue(X), m_AllOnes())))
1135 return Def->replaceAllUsesWith(Def->getOperand(Def->getOperand(0) == X));
1136
1137 // x | 0 -> x
1138 if (match(Def, m_c_BinaryOr(m_VPValue(X), m_ZeroInt())))
1139 return Def->replaceAllUsesWith(X);
1140
1141 // x & 0 -> 0
1142 if (match(Def, m_c_BinaryAnd(m_VPValue(X), m_ZeroInt())))
1143 return Def->replaceAllUsesWith(Def->getOperand(Def->getOperand(0) == X));
1144
1145 // x && false -> false
1146 if (match(Def, m_LogicalAnd(m_VPValue(X), m_False())))
1147 return Def->replaceAllUsesWith(Def->getOperand(1));
1148
1149 // (x && y) || (x && z) -> x && (y || z)
1152 // Simplify only if one of the operands has one use to avoid creating an
1153 // extra recipe.
1154 (!Def->getOperand(0)->hasMoreThanOneUniqueUser() ||
1155 !Def->getOperand(1)->hasMoreThanOneUniqueUser()))
1156 return Def->replaceAllUsesWith(
1157 Builder.createLogicalAnd(X, Builder.createOr(Y, Z)));
1158
1159 // x && !x -> 0
1161 return Def->replaceAllUsesWith(Plan->getFalse());
1162
1163 if (match(Def, m_Select(m_VPValue(), m_VPValue(X), m_Deferred(X))))
1164 return Def->replaceAllUsesWith(X);
1165
1166 // select !c, x, y -> select c, y, x
1167 VPValue *C;
1168 if (match(Def, m_Select(m_Not(m_VPValue(C)), m_VPValue(X), m_VPValue(Y)))) {
1169 Def->setOperand(0, C);
1170 Def->setOperand(1, Y);
1171 Def->setOperand(2, X);
1172 return;
1173 }
1174
1175 // Reassociate (x && y) && z -> x && (y && z) if x has multiple users. With
1176 // tail folding it is likely that x is a header mask and can be simplified
1177 // further.
1179 m_VPValue(Z))) &&
1180 X->hasMoreThanOneUniqueUser())
1181 return Def->replaceAllUsesWith(
1182 Builder.createLogicalAnd(X, Builder.createLogicalAnd(Y, Z)));
1183
1184 if (match(Def, m_c_Mul(m_VPValue(A), m_One())))
1185 return Def->replaceAllUsesWith(A);
1186
1187 if (match(Def, m_c_Mul(m_VPValue(A), m_ZeroInt())))
1188 return Def->replaceAllUsesWith(
1189 Def->getOperand(0) == A ? Def->getOperand(1) : Def->getOperand(0));
1190
1191 if (match(Def, m_Not(m_VPValue(A)))) {
1192 if (match(A, m_Not(m_VPValue(A))))
1193 return Def->replaceAllUsesWith(A);
1194
1195 // Try to fold Not into compares by adjusting the predicate in-place.
1196 CmpPredicate Pred;
1197 if (match(A, m_Cmp(Pred, m_VPValue(), m_VPValue()))) {
1198 auto *Cmp = cast<VPRecipeWithIRFlags>(A);
1199 if (all_of(Cmp->users(),
1201 m_Not(m_Specific(Cmp)),
1202 m_Select(m_Specific(Cmp), m_VPValue(), m_VPValue()))))) {
1203 Cmp->setPredicate(CmpInst::getInversePredicate(Pred));
1204 for (VPUser *U : to_vector(Cmp->users())) {
1205 auto *R = cast<VPSingleDefRecipe>(U);
1206 if (match(R, m_Select(m_Specific(Cmp), m_VPValue(X), m_VPValue(Y)))) {
1207 // select (cmp pred), x, y -> select (cmp inv_pred), y, x
1208 R->setOperand(1, Y);
1209 R->setOperand(2, X);
1210 } else {
1211 // not (cmp pred) -> cmp inv_pred
1212 assert(match(R, m_Not(m_Specific(Cmp))) && "Unexpected user");
1213 R->replaceAllUsesWith(Cmp);
1214 }
1215 }
1216 // If Cmp doesn't have a debug location, use the one from the negation,
1217 // to preserve the location.
1218 if (!Cmp->getDebugLoc() && Def->getDebugLoc())
1219 Cmp->setDebugLoc(Def->getDebugLoc());
1220 }
1221 }
1222 }
1223
1224 // Remove redundant DerivedIVs, that is 0 + A * 1 -> A and 0 + 0 * x -> 0.
1225 if ((match(Def, m_DerivedIV(m_ZeroInt(), m_VPValue(A), m_One())) ||
1226 match(Def, m_DerivedIV(m_ZeroInt(), m_ZeroInt(), m_VPValue()))) &&
1227 TypeInfo.inferScalarType(Def->getOperand(1)) ==
1228 TypeInfo.inferScalarType(Def))
1229 return Def->replaceAllUsesWith(Def->getOperand(1));
1230
1232 m_One()))) {
1233 Type *WideStepTy = TypeInfo.inferScalarType(Def);
1234 if (TypeInfo.inferScalarType(X) != WideStepTy)
1235 X = Builder.createWidenCast(Instruction::Trunc, X, WideStepTy);
1236 Def->replaceAllUsesWith(X);
1237 return;
1238 }
1239
1240 // For i1 vp.merges produced by AnyOf reductions:
1241 // vp.merge true, (or x, y), x, evl -> vp.merge y, true, x, evl
1243 m_VPValue(X), m_VPValue())) &&
1245 TypeInfo.inferScalarType(Def)->isIntegerTy(1)) {
1246 Def->setOperand(1, Def->getOperand(0));
1247 Def->setOperand(0, Y);
1248 return;
1249 }
1250
1251 if (auto *Phi = dyn_cast<VPFirstOrderRecurrencePHIRecipe>(Def)) {
1252 if (Phi->getOperand(0) == Phi->getOperand(1))
1253 Phi->replaceAllUsesWith(Phi->getOperand(0));
1254 return;
1255 }
1256
1257 // Look through ExtractLastElement (BuildVector ....).
1260 auto *BuildVector = cast<VPInstruction>(Def->getOperand(0));
1261 Def->replaceAllUsesWith(
1262 BuildVector->getOperand(BuildVector->getNumOperands() - 1));
1263 return;
1264 }
1265
1266 // Look through ExtractPenultimateElement (BuildVector ....).
1268 m_BuildVector()))) {
1269 auto *BuildVector = cast<VPInstruction>(Def->getOperand(0));
1270 Def->replaceAllUsesWith(
1271 BuildVector->getOperand(BuildVector->getNumOperands() - 2));
1272 return;
1273 }
1274
1275 uint64_t Idx;
1277 auto *BuildVector = cast<VPInstruction>(Def->getOperand(0));
1278 Def->replaceAllUsesWith(BuildVector->getOperand(Idx));
1279 return;
1280 }
1281
1282 if (match(Def, m_BuildVector()) && all_equal(Def->operands())) {
1283 Def->replaceAllUsesWith(
1284 Builder.createNaryOp(VPInstruction::Broadcast, Def->getOperand(0)));
1285 return;
1286 }
1287
1288 // Look through broadcast of single-scalar when used as select conditions; in
1289 // that case the scalar condition can be used directly.
1290 if (match(Def,
1293 "broadcast operand must be single-scalar");
1294 Def->setOperand(0, C);
1295 return;
1296 }
1297
1298 if (auto *Phi = dyn_cast<VPPhi>(Def)) {
1299 if (Phi->getNumOperands() == 1)
1300 Phi->replaceAllUsesWith(Phi->getOperand(0));
1301 return;
1302 }
1303
1304 // Some simplifications can only be applied after unrolling. Perform them
1305 // below.
1306 if (!Plan->isUnrolled())
1307 return;
1308
1309 // Hoist an invariant increment Y of a phi X, by having X start at Y.
1310 if (match(Def, m_c_Add(m_VPValue(X), m_VPValue(Y))) && Y->isLiveIn() &&
1311 isa<VPPhi>(X)) {
1312 auto *Phi = cast<VPPhi>(X);
1313 if (Phi->getOperand(1) != Def && match(Phi->getOperand(0), m_ZeroInt()) &&
1314 Phi->getNumUsers() == 1 && (*Phi->user_begin() == Def)) {
1315 Phi->setOperand(0, Y);
1316 Def->replaceAllUsesWith(Phi);
1317 return;
1318 }
1319 }
1320
1321 // VPVectorPointer for part 0 can be replaced by their start pointer.
1322 if (auto *VecPtr = dyn_cast<VPVectorPointerRecipe>(Def)) {
1323 if (VecPtr->isFirstPart()) {
1324 VecPtr->replaceAllUsesWith(VecPtr->getOperand(0));
1325 return;
1326 }
1327 }
1328
1329 // VPScalarIVSteps for part 0 can be replaced by their start value, if only
1330 // the first lane is demanded.
1331 if (auto *Steps = dyn_cast<VPScalarIVStepsRecipe>(Def)) {
1332 if (Steps->isPart0() && vputils::onlyFirstLaneUsed(Steps)) {
1333 Steps->replaceAllUsesWith(Steps->getOperand(0));
1334 return;
1335 }
1336 }
1337 // Simplify redundant ReductionStartVector recipes after unrolling.
1338 VPValue *StartV;
1340 m_VPValue(StartV), m_VPValue(), m_VPValue()))) {
1341 Def->replaceUsesWithIf(StartV, [](const VPUser &U, unsigned Idx) {
1342 auto *PhiR = dyn_cast<VPReductionPHIRecipe>(&U);
1343 return PhiR && PhiR->isInLoop();
1344 });
1345 return;
1346 }
1347
1348 if (match(Def,
1351 Def->replaceAllUsesWith(A);
1352 return;
1353 }
1354
1359 cast<VPReplicateRecipe>(A)->isSingleScalar())) &&
1360 all_of(A->users(),
1361 [Def, A](VPUser *U) { return U->usesScalars(A) || Def == U; })) {
1362 return Def->replaceAllUsesWith(A);
1363 }
1364
1365 if (Plan->getUF() == 1 &&
1367 return Def->replaceAllUsesWith(
1368 Builder.createNaryOp(VPInstruction::ExtractLastElement, {A}));
1369 }
1370}
1371
1374 Plan.getEntry());
1375 VPTypeAnalysis TypeInfo(Plan);
1377 for (VPRecipeBase &R : make_early_inc_range(*VPBB))
1378 if (auto *Def = dyn_cast<VPSingleDefRecipe>(&R))
1379 simplifyRecipe(Def, TypeInfo);
1380 }
1381}
1382
1384 if (Plan.hasScalarVFOnly())
1385 return;
1386
1387 // Try to narrow wide and replicating recipes to single scalar recipes,
1388 // based on VPlan analysis. Only process blocks in the loop region for now,
1389 // without traversing into nested regions, as recipes in replicate regions
1390 // cannot be converted yet.
1393 for (VPRecipeBase &R : make_early_inc_range(reverse(*VPBB))) {
1395 continue;
1396 auto *RepR = dyn_cast<VPReplicateRecipe>(&R);
1397 if (RepR && (RepR->isSingleScalar() || RepR->isPredicated()))
1398 continue;
1399
1400 auto *RepOrWidenR = cast<VPSingleDefRecipe>(&R);
1401 if (RepR && isa<StoreInst>(RepR->getUnderlyingInstr()) &&
1402 vputils::isSingleScalar(RepR->getOperand(1))) {
1403 auto *Clone = new VPReplicateRecipe(
1404 RepOrWidenR->getUnderlyingInstr(), RepOrWidenR->operands(),
1405 true /*IsSingleScalar*/, nullptr /*Mask*/, *RepR /*Metadata*/);
1406 Clone->insertBefore(RepOrWidenR);
1407 unsigned ExtractOpc =
1408 vputils::isUniformAcrossVFsAndUFs(RepR->getOperand(1))
1411 auto *Ext = new VPInstruction(ExtractOpc, {Clone->getOperand(0)});
1412 Ext->insertBefore(Clone);
1413 Clone->setOperand(0, Ext);
1414 RepR->eraseFromParent();
1415 continue;
1416 }
1417
1418 // Skip recipes that aren't single scalars or don't have only their
1419 // scalar results used. In the latter case, we would introduce extra
1420 // broadcasts.
1421 if (!vputils::isSingleScalar(RepOrWidenR) ||
1422 !all_of(RepOrWidenR->users(), [RepOrWidenR](const VPUser *U) {
1423 return U->usesScalars(RepOrWidenR) ||
1424 match(cast<VPRecipeBase>(U),
1425 m_CombineOr(m_ExtractLastElement(m_VPValue()),
1426 m_ExtractLastLanePerPart(m_VPValue())));
1427 }))
1428 continue;
1429
1430 auto *Clone = new VPReplicateRecipe(RepOrWidenR->getUnderlyingInstr(),
1431 RepOrWidenR->operands(),
1432 true /*IsSingleScalar*/);
1433 Clone->insertBefore(RepOrWidenR);
1434 RepOrWidenR->replaceAllUsesWith(Clone);
1435 if (isDeadRecipe(*RepOrWidenR))
1436 RepOrWidenR->eraseFromParent();
1437 }
1438 }
1439}
1440
1441/// Try to see if all of \p Blend's masks share a common value that is
1442/// logically and'ed in, and if so remove it from the masks.
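/// For example (illustrative), if every incoming mask has the form
/// (logical-and %c, %m_i) for the same %c, the masks are rewritten to just
/// %m_i; the incoming values themselves are left unchanged.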
1444 if (Blend->isNormalized())
1445 return;
1446 VPValue *CommonEdgeMask;
1447 if (!match(Blend->getMask(0),
1448 m_LogicalAnd(m_VPValue(CommonEdgeMask), m_VPValue())))
1449 return;
1450 for (unsigned I = 0; I < Blend->getNumIncomingValues(); I++)
1451 if (!match(Blend->getMask(I),
1452 m_LogicalAnd(m_Specific(CommonEdgeMask), m_VPValue())))
1453 return;
1454 for (unsigned I = 0; I < Blend->getNumIncomingValues(); I++)
1455 Blend->setMask(I, Blend->getMask(I)->getDefiningRecipe()->getOperand(1));
1456}
1457
1458/// Normalize and simplify VPBlendRecipes. Should be run after simplifyRecipes
1459/// to make sure the masks are simplified.
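/// For example (illustrative notation), a non-normalized blend
///   BLEND %a/%m0, %b/%m1
/// is normalized to
///   BLEND %a, %b/%m1
/// where %a becomes the starting value and %b is blended into it under %m1.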
1460static void simplifyBlends(VPlan &Plan) {
1463 for (VPRecipeBase &R : make_early_inc_range(*VPBB)) {
1464 auto *Blend = dyn_cast<VPBlendRecipe>(&R);
1465 if (!Blend)
1466 continue;
1467
1468 removeCommonBlendMask(Blend);
1469
1470 // Try to remove redundant blend recipes.
1471 SmallPtrSet<VPValue *, 4> UniqueValues;
1472 if (Blend->isNormalized() || !match(Blend->getMask(0), m_False()))
1473 UniqueValues.insert(Blend->getIncomingValue(0));
1474 for (unsigned I = 1; I != Blend->getNumIncomingValues(); ++I)
1475 if (!match(Blend->getMask(I), m_False()))
1476 UniqueValues.insert(Blend->getIncomingValue(I));
1477
1478 if (UniqueValues.size() == 1) {
1479 Blend->replaceAllUsesWith(*UniqueValues.begin());
1480 Blend->eraseFromParent();
1481 continue;
1482 }
1483
1484 if (Blend->isNormalized())
1485 continue;
1486
1487 // Normalize the blend so its first incoming value is used as the initial
1488 // value with the others blended into it.
1489
1490 unsigned StartIndex = 0;
1491 for (unsigned I = 0; I != Blend->getNumIncomingValues(); ++I) {
1492 // If a value's mask is used only by the blend then it can be removed as dead code.
1493 // TODO: Find the most expensive mask that can be deadcoded, or a mask
1494 // that's used by multiple blends where it can be removed from them all.
1495 VPValue *Mask = Blend->getMask(I);
1496 if (Mask->getNumUsers() == 1 && !match(Mask, m_False())) {
1497 StartIndex = I;
1498 break;
1499 }
1500 }
1501
1502 SmallVector<VPValue *, 4> OperandsWithMask;
1503 OperandsWithMask.push_back(Blend->getIncomingValue(StartIndex));
1504
1505 for (unsigned I = 0; I != Blend->getNumIncomingValues(); ++I) {
1506 if (I == StartIndex)
1507 continue;
1508 OperandsWithMask.push_back(Blend->getIncomingValue(I));
1509 OperandsWithMask.push_back(Blend->getMask(I));
1510 }
1511
1512 auto *NewBlend =
1513 new VPBlendRecipe(cast_or_null<PHINode>(Blend->getUnderlyingValue()),
1514 OperandsWithMask, Blend->getDebugLoc());
1515 NewBlend->insertBefore(&R);
1516
1517 VPValue *DeadMask = Blend->getMask(StartIndex);
1518 Blend->replaceAllUsesWith(NewBlend);
1519 Blend->eraseFromParent();
1521
1522 /// Simplify BLEND %a, %b, Not(%mask) -> BLEND %b, %a, %mask.
1523 VPValue *NewMask;
1524 if (NewBlend->getNumOperands() == 3 &&
1525 match(NewBlend->getMask(1), m_Not(m_VPValue(NewMask)))) {
1526 VPValue *Inc0 = NewBlend->getOperand(0);
1527 VPValue *Inc1 = NewBlend->getOperand(1);
1528 VPValue *OldMask = NewBlend->getOperand(2);
1529 NewBlend->setOperand(0, Inc1);
1530 NewBlend->setOperand(1, Inc0);
1531 NewBlend->setOperand(2, NewMask);
1532 if (OldMask->getNumUsers() == 0)
1533 cast<VPInstruction>(OldMask)->eraseFromParent();
1534 }
1535 }
1536 }
1537}
1538
1539/// Optimize the width of vector induction variables in \p Plan based on a known
1540/// constant Trip Count, \p BestVF and \p BestUF.
1542 ElementCount BestVF,
1543 unsigned BestUF) {
1544 // Only proceed if we have not completely removed the vector region.
1545 if (!Plan.getVectorLoopRegion())
1546 return false;
1547
1548 const APInt *TC;
1549 if (!BestVF.isFixed() || !match(Plan.getTripCount(), m_APInt(TC)))
1550 return false;
1551
1552 // Calculate the minimum power-of-2 bit width that can fit the known TC, VF
1553 // and UF. Returns at least 8.
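  // Worked example (illustrative): with TC = 20, VF = 4 and UF = 2, the trip
  // count is aligned up to a multiple of 8, giving 24; the largest IV value is
  // then 23, which needs 5 bits, so the width is rounded up to the minimum 8.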
1554 auto ComputeBitWidth = [](APInt TC, uint64_t Align) {
1555 APInt AlignedTC =
1558 APInt MaxVal = AlignedTC - 1;
1559 return std::max<unsigned>(PowerOf2Ceil(MaxVal.getActiveBits()), 8);
1560 };
1561 unsigned NewBitWidth =
1562 ComputeBitWidth(*TC, BestVF.getKnownMinValue() * BestUF);
1563
1564 LLVMContext &Ctx = Plan.getContext();
1565 auto *NewIVTy = IntegerType::get(Ctx, NewBitWidth);
1566
1567 bool MadeChange = false;
1568
1569 VPBasicBlock *HeaderVPBB = Plan.getVectorLoopRegion()->getEntryBasicBlock();
1570 for (VPRecipeBase &Phi : HeaderVPBB->phis()) {
1571 auto *WideIV = dyn_cast<VPWidenIntOrFpInductionRecipe>(&Phi);
1572
1573 // Currently only handle canonical IVs as it is trivial to replace the start
1574 // and stop values, and we currently only perform the optimization when the
1575 // IV has a single use.
1576 if (!WideIV || !WideIV->isCanonical() ||
1577 WideIV->hasMoreThanOneUniqueUser() ||
1578 NewIVTy == WideIV->getScalarType())
1579 continue;
1580
1581 // Currently only handle cases where the single user is a header-mask
1582 // comparison with the backedge-taken-count.
1583 if (!match(*WideIV->user_begin(),
1584 m_ICmp(m_Specific(WideIV),
1587 continue;
1588
1589 // Update IV operands and comparison bound to use new narrower type.
1590 auto *NewStart = Plan.getConstantInt(NewIVTy, 0);
1591 WideIV->setStartValue(NewStart);
1592 auto *NewStep = Plan.getConstantInt(NewIVTy, 1);
1593 WideIV->setStepValue(NewStep);
1594
1595 auto *NewBTC = new VPWidenCastRecipe(
1596 Instruction::Trunc, Plan.getOrCreateBackedgeTakenCount(), NewIVTy);
1597 Plan.getVectorPreheader()->appendRecipe(NewBTC);
1598 auto *Cmp = cast<VPInstruction>(*WideIV->user_begin());
1599 Cmp->setOperand(1, NewBTC);
1600
1601 MadeChange = true;
1602 }
1603
1604 return MadeChange;
1605}
1606
1607/// Return true if \p Cond is known to be true for given \p BestVF and \p
1608/// BestUF.
1610 ElementCount BestVF, unsigned BestUF,
1611 ScalarEvolution &SE) {
1613 return any_of(Cond->getDefiningRecipe()->operands(), [&Plan, BestVF, BestUF,
1614 &SE](VPValue *C) {
1615 return isConditionTrueViaVFAndUF(C, Plan, BestVF, BestUF, SE);
1616 });
1617
1618 auto *CanIV = Plan.getVectorLoopRegion()->getCanonicalIV();
1620 m_Specific(CanIV->getBackedgeValue()),
1621 m_Specific(&Plan.getVectorTripCount()))))
1622 return false;
1623
1624 // The compare checks CanIV + VFxUF == vector trip count. The vector trip
1625 // count is not conveniently available as SCEV so far, so we compare directly
1626 // against the original trip count. This is stricter than necessary, as we
1627 // will only return true if the trip count == vector trip count.
1628 const SCEV *VectorTripCount =
1630 if (isa<SCEVCouldNotCompute>(VectorTripCount))
1631 VectorTripCount = vputils::getSCEVExprForVPValue(Plan.getTripCount(), SE);
1632 assert(!isa<SCEVCouldNotCompute>(VectorTripCount) &&
1633 "Trip count SCEV must be computable");
1634 ElementCount NumElements = BestVF.multiplyCoefficientBy(BestUF);
1635 const SCEV *C = SE.getElementCount(VectorTripCount->getType(), NumElements);
1636 return SE.isKnownPredicate(CmpInst::ICMP_EQ, VectorTripCount, C);
1637}
1638
1639/// Try to replace multiple active lane masks used for control flow with
1640/// a single, wide active lane mask instruction followed by multiple
1641/// extract subvector intrinsics. This applies to the active lane mask
1642/// instructions both in the loop and in the preheader.
1643/// Incoming values of all ActiveLaneMaskPHIs are updated to use the
1644/// new extracts from the first active lane mask, which has its last
1645/// operand (multiplier) set to UF.
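/// For example (illustrative), with VF = vscale x 4 and UF = 2, the wide
/// active lane masks produce VF * UF = vscale x 8 lanes, and each part's
/// mask is recovered with an llvm.vector.extract at index 0 and 4,
/// respectively.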
1647 unsigned UF) {
1648 if (!EnableWideActiveLaneMask || !VF.isVector() || UF == 1)
1649 return false;
1650
1651 VPRegionBlock *VectorRegion = Plan.getVectorLoopRegion();
1652 VPBasicBlock *ExitingVPBB = VectorRegion->getExitingBasicBlock();
1653 auto *Term = &ExitingVPBB->back();
1654
1655 using namespace llvm::VPlanPatternMatch;
1657 m_VPValue(), m_VPValue(), m_VPValue())))))
1658 return false;
1659
1660 auto *Header = cast<VPBasicBlock>(VectorRegion->getEntry());
1661 LLVMContext &Ctx = Plan.getContext();
1662
1663 auto ExtractFromALM = [&](VPInstruction *ALM,
1664 SmallVectorImpl<VPValue *> &Extracts) {
1665 DebugLoc DL = ALM->getDebugLoc();
1666 for (unsigned Part = 0; Part < UF; ++Part) {
1668 Ops.append({ALM, Plan.getOrAddLiveIn(
1669 ConstantInt::get(IntegerType::getInt64Ty(Ctx),
1670 VF.getKnownMinValue() * Part))});
1671 auto *Ext = new VPWidenIntrinsicRecipe(Intrinsic::vector_extract, Ops,
1673 Extracts[Part] = Ext;
1674 Ext->insertAfter(ALM);
1675 }
1676 };
1677
1678 // Create a list of each active lane mask phi, ordered by unroll part.
1680 for (VPRecipeBase &R : Header->phis()) {
1682 if (!Phi)
1683 continue;
1684 VPValue *Index = nullptr;
1685 match(Phi->getBackedgeValue(),
1687 assert(Index && "Expected index from ActiveLaneMask instruction");
1688
1689 uint64_t Part;
1690 if (match(Index,
1692 m_VPValue(), m_ConstantInt(Part))))
1693 Phis[Part] = Phi;
1694 else
1695 // Anything other than a CanonicalIVIncrementForPart is part 0
1696 Phis[0] = Phi;
1697 }
1698
1699 assert(all_of(Phis, [](VPActiveLaneMaskPHIRecipe *Phi) { return Phi; }) &&
1700 "Expected one VPActiveLaneMaskPHIRecipe for each unroll part");
1701
1702 auto *EntryALM = cast<VPInstruction>(Phis[0]->getStartValue());
1703 auto *LoopALM = cast<VPInstruction>(Phis[0]->getBackedgeValue());
1704
1705 assert((EntryALM->getOpcode() == VPInstruction::ActiveLaneMask &&
1706 LoopALM->getOpcode() == VPInstruction::ActiveLaneMask) &&
1707 "Expected incoming values of Phi to be ActiveLaneMasks");
1708
 1709 // When using wide lane masks, the get.active.lane.mask intrinsic produces
 1710 // VF x UF lanes, controlled by its last (multiplier) operand.
1711 VPValue *ALMMultiplier = Plan.getConstantInt(64, UF);
1712 EntryALM->setOperand(2, ALMMultiplier);
1713 LoopALM->setOperand(2, ALMMultiplier);
1714
1715 // Create UF x extract vectors and insert into preheader.
1716 SmallVector<VPValue *> EntryExtracts(UF);
1717 ExtractFromALM(EntryALM, EntryExtracts);
1718
1719 // Create UF x extract vectors and insert before the loop compare & branch,
1720 // updating the compare to use the first extract.
1721 SmallVector<VPValue *> LoopExtracts(UF);
1722 ExtractFromALM(LoopALM, LoopExtracts);
1723 VPInstruction *Not = cast<VPInstruction>(Term->getOperand(0));
1724 Not->setOperand(0, LoopExtracts[0]);
1725
1726 // Update the incoming values of active lane mask phis.
1727 for (unsigned Part = 0; Part < UF; ++Part) {
1728 Phis[Part]->setStartValue(EntryExtracts[Part]);
1729 Phis[Part]->setBackedgeValue(LoopExtracts[Part]);
1730 }
1731
1732 return true;
1733}
1734
1735/// Try to simplify the branch condition of \p Plan. This may restrict the
1736/// resulting plan to \p BestVF and \p BestUF.
1738 unsigned BestUF,
1740 VPRegionBlock *VectorRegion = Plan.getVectorLoopRegion();
1741 VPBasicBlock *ExitingVPBB = VectorRegion->getExitingBasicBlock();
1742 auto *Term = &ExitingVPBB->back();
1743 VPValue *Cond;
1744 ScalarEvolution &SE = *PSE.getSE();
1745 if (match(Term, m_BranchOnCount()) ||
1747 m_VPValue(), m_VPValue(), m_VPValue()))))) {
1748 // Try to simplify the branch condition if TC <= VF * UF when the latch
1749 // terminator is BranchOnCount or BranchOnCond where the input is
1750 // Not(ActiveLaneMask).
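    // For example, a known trip count of 7 with BestVF = 8 and BestUF = 1
    // satisfies TC <= VF * UF, so the single vector iteration covers the
    // whole loop and the latch terminator can be simplified.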
1751 const SCEV *TripCount =
1753 assert(!isa<SCEVCouldNotCompute>(TripCount) &&
1754 "Trip count SCEV must be computable");
1755 ElementCount NumElements = BestVF.multiplyCoefficientBy(BestUF);
1756 const SCEV *C = SE.getElementCount(TripCount->getType(), NumElements);
1757 if (TripCount->isZero() ||
1758 !SE.isKnownPredicate(CmpInst::ICMP_ULE, TripCount, C))
1759 return false;
1760 } else if (match(Term, m_BranchOnCond(m_VPValue(Cond)))) {
1761 // For BranchOnCond, check if we can prove the condition to be true using VF
1762 // and UF.
1763 if (!isConditionTrueViaVFAndUF(Cond, Plan, BestVF, BestUF, SE))
1764 return false;
1765 } else {
1766 return false;
1767 }
1768
1769 // The vector loop region only executes once. If possible, completely remove
1770 // the region, otherwise replace the terminator controlling the latch with
1771 // (BranchOnCond true).
1772 // TODO: VPWidenIntOrFpInductionRecipe is only partially supported; add
1773 // support for other non-canonical widen induction recipes (e.g.,
1774 // VPWidenPointerInductionRecipe).
1775 auto *Header = cast<VPBasicBlock>(VectorRegion->getEntry());
1776 if (all_of(Header->phis(), [](VPRecipeBase &Phi) {
1777 if (auto *R = dyn_cast<VPWidenIntOrFpInductionRecipe>(&Phi))
1778 return R->isCanonical();
1779 return isa<VPCanonicalIVPHIRecipe, VPEVLBasedIVPHIRecipe,
1780 VPFirstOrderRecurrencePHIRecipe, VPPhi>(&Phi);
1781 })) {
1782 for (VPRecipeBase &HeaderR : make_early_inc_range(Header->phis())) {
1783 if (auto *R = dyn_cast<VPWidenIntOrFpInductionRecipe>(&HeaderR)) {
1784 VPBuilder Builder(Plan.getVectorPreheader());
1785 VPValue *StepV = Builder.createNaryOp(VPInstruction::StepVector, {},
1786 R->getScalarType());
1787 HeaderR.getVPSingleValue()->replaceAllUsesWith(StepV);
1788 HeaderR.eraseFromParent();
1789 continue;
1790 }
1791 auto *Phi = cast<VPPhiAccessors>(&HeaderR);
1792 HeaderR.getVPSingleValue()->replaceAllUsesWith(Phi->getIncomingValue(0));
1793 HeaderR.eraseFromParent();
1794 }
1795
1796 VPBlockBase *Preheader = VectorRegion->getSinglePredecessor();
1797 VPBlockBase *Exit = VectorRegion->getSingleSuccessor();
1798 VPBlockUtils::disconnectBlocks(Preheader, VectorRegion);
1799 VPBlockUtils::disconnectBlocks(VectorRegion, Exit);
1800
1801 for (VPBlockBase *B : vp_depth_first_shallow(VectorRegion->getEntry()))
1802 B->setParent(nullptr);
1803
1804 VPBlockUtils::connectBlocks(Preheader, Header);
1805 VPBlockUtils::connectBlocks(ExitingVPBB, Exit);
1807 } else {
1808 // The vector region contains header phis for which we cannot remove the
1809 // loop region yet.
1810 auto *BOC = new VPInstruction(VPInstruction::BranchOnCond, {Plan.getTrue()},
1811 Term->getDebugLoc());
1812 ExitingVPBB->appendRecipe(BOC);
1813 }
1814
1815 Term->eraseFromParent();
1816
1817 return true;
1818}
1819
1821 unsigned BestUF,
1823 assert(Plan.hasVF(BestVF) && "BestVF is not available in Plan");
1824 assert(Plan.hasUF(BestUF) && "BestUF is not available in Plan");
1825
1826 bool MadeChange = tryToReplaceALMWithWideALM(Plan, BestVF, BestUF);
1827 MadeChange |= simplifyBranchConditionForVFAndUF(Plan, BestVF, BestUF, PSE);
1828 MadeChange |= optimizeVectorInductionWidthForTCAndVFUF(Plan, BestVF, BestUF);
1829
1830 if (MadeChange) {
1831 Plan.setVF(BestVF);
1832 assert(Plan.getUF() == BestUF && "BestUF must match the Plan's UF");
1833 }
1834 // TODO: Further simplifications are possible
1835 // 1. Replace inductions with constants.
1836 // 2. Replace vector loop region with VPBasicBlock.
1837}
1838
1839/// Sink users of \p FOR after the recipe defining the previous value \p
1840/// Previous of the recurrence. \returns true if all users of \p FOR could be
1841/// re-arranged as needed or false if it is not possible.
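/// For example (illustrative, shown as scalar IR):
///   %for  = phi [ %init, %ph ], [ %prev, %loop ]
///   %use  = add i32 %for, 1    ; user of the recurrence
///   %prev = load i32, ptr %p   ; previous value, defined after the user
/// Here %use must be sunk after the definition of %prev so that a splice of
/// (%for, %prev) can later be inserted between them.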
1842static bool
1844 VPRecipeBase *Previous,
1845 VPDominatorTree &VPDT) {
1846 // Collect recipes that need sinking.
1849 Seen.insert(Previous);
1850 auto TryToPushSinkCandidate = [&](VPRecipeBase *SinkCandidate) {
 1851 // The previous value must not depend on the users of the recurrence phi. If
 1852 // it did, FOR would not be a fixed-order recurrence.
1853 if (SinkCandidate == Previous)
1854 return false;
1855
1856 if (isa<VPHeaderPHIRecipe>(SinkCandidate) ||
1857 !Seen.insert(SinkCandidate).second ||
1858 VPDT.properlyDominates(Previous, SinkCandidate))
1859 return true;
1860
1861 if (cannotHoistOrSinkRecipe(*SinkCandidate))
1862 return false;
1863
1864 WorkList.push_back(SinkCandidate);
1865 return true;
1866 };
1867
1868 // Recursively sink users of FOR after Previous.
1869 WorkList.push_back(FOR);
1870 for (unsigned I = 0; I != WorkList.size(); ++I) {
1871 VPRecipeBase *Current = WorkList[I];
1872 assert(Current->getNumDefinedValues() == 1 &&
1873 "only recipes with a single defined value expected");
1874
1875 for (VPUser *User : Current->getVPSingleValue()->users()) {
1876 if (!TryToPushSinkCandidate(cast<VPRecipeBase>(User)))
1877 return false;
1878 }
1879 }
1880
1881 // Keep recipes to sink ordered by dominance so earlier instructions are
1882 // processed first.
1883 sort(WorkList, [&VPDT](const VPRecipeBase *A, const VPRecipeBase *B) {
1884 return VPDT.properlyDominates(A, B);
1885 });
1886
1887 for (VPRecipeBase *SinkCandidate : WorkList) {
1888 if (SinkCandidate == FOR)
1889 continue;
1890
1891 SinkCandidate->moveAfter(Previous);
1892 Previous = SinkCandidate;
1893 }
1894 return true;
1895}
1896
1897/// Try to hoist \p Previous and its operands before all users of \p FOR.
1899 VPRecipeBase *Previous,
1900 VPDominatorTree &VPDT) {
1901 if (cannotHoistOrSinkRecipe(*Previous))
1902 return false;
1903
1904 // Collect recipes that need hoisting.
1905 SmallVector<VPRecipeBase *> HoistCandidates;
1907 VPRecipeBase *HoistPoint = nullptr;
1908 // Find the closest hoist point by looking at all users of FOR and selecting
1909 // the recipe dominating all other users.
1910 for (VPUser *U : FOR->users()) {
1911 auto *R = cast<VPRecipeBase>(U);
1912 if (!HoistPoint || VPDT.properlyDominates(R, HoistPoint))
1913 HoistPoint = R;
1914 }
1915 assert(all_of(FOR->users(),
1916 [&VPDT, HoistPoint](VPUser *U) {
1917 auto *R = cast<VPRecipeBase>(U);
1918 return HoistPoint == R ||
1919 VPDT.properlyDominates(HoistPoint, R);
1920 }) &&
1921 "HoistPoint must dominate all users of FOR");
1922
1923 auto NeedsHoisting = [HoistPoint, &VPDT,
1924 &Visited](VPValue *HoistCandidateV) -> VPRecipeBase * {
1925 VPRecipeBase *HoistCandidate = HoistCandidateV->getDefiningRecipe();
1926 if (!HoistCandidate)
1927 return nullptr;
1928 VPRegionBlock *EnclosingLoopRegion =
1929 HoistCandidate->getParent()->getEnclosingLoopRegion();
1930 assert((!HoistCandidate->getRegion() ||
1931 HoistCandidate->getRegion() == EnclosingLoopRegion) &&
1932 "CFG in VPlan should still be flat, without replicate regions");
1933 // Hoist candidate was already visited, no need to hoist.
1934 if (!Visited.insert(HoistCandidate).second)
1935 return nullptr;
1936
 1937 // Candidate is outside the loop region or is a header phi; it already
 1938 // dominates the users of FOR without hoisting.
1939 if (!EnclosingLoopRegion || isa<VPHeaderPHIRecipe>(HoistCandidate))
1940 return nullptr;
1941
1942 // If we reached a recipe that dominates HoistPoint, we don't need to
1943 // hoist the recipe.
1944 if (VPDT.properlyDominates(HoistCandidate, HoistPoint))
1945 return nullptr;
1946 return HoistCandidate;
1947 };
1948
1949 if (!NeedsHoisting(Previous->getVPSingleValue()))
1950 return true;
1951
1952 // Recursively try to hoist Previous and its operands before all users of FOR.
1953 HoistCandidates.push_back(Previous);
1954
1955 for (unsigned I = 0; I != HoistCandidates.size(); ++I) {
1956 VPRecipeBase *Current = HoistCandidates[I];
1957 assert(Current->getNumDefinedValues() == 1 &&
1958 "only recipes with a single defined value expected");
1959 if (cannotHoistOrSinkRecipe(*Current))
1960 return false;
1961
1962 for (VPValue *Op : Current->operands()) {
1963 // If we reach FOR, it means the original Previous depends on some other
1964 // recurrence that in turn depends on FOR. If that is the case, we would
1965 // also need to hoist recipes involving the other FOR, which may break
1966 // dependencies.
1967 if (Op == FOR)
1968 return false;
1969
1970 if (auto *R = NeedsHoisting(Op))
1971 HoistCandidates.push_back(R);
1972 }
1973 }
1974
1975 // Order recipes to hoist by dominance so earlier instructions are processed
1976 // first.
1977 sort(HoistCandidates, [&VPDT](const VPRecipeBase *A, const VPRecipeBase *B) {
1978 return VPDT.properlyDominates(A, B);
1979 });
1980
1981 for (VPRecipeBase *HoistCandidate : HoistCandidates) {
1982 HoistCandidate->moveBefore(*HoistPoint->getParent(),
1983 HoistPoint->getIterator());
1984 }
1985
1986 return true;
1987}
1988
1990 VPBuilder &LoopBuilder) {
1991 VPDominatorTree VPDT(Plan);
1992
1994 for (VPRecipeBase &R :
1997 RecurrencePhis.push_back(FOR);
1998
1999 for (VPFirstOrderRecurrencePHIRecipe *FOR : RecurrencePhis) {
2001 VPRecipeBase *Previous = FOR->getBackedgeValue()->getDefiningRecipe();
2002 // Fixed-order recurrences do not contain cycles, so this loop is guaranteed
2003 // to terminate.
2004 while (auto *PrevPhi =
2006 assert(PrevPhi->getParent() == FOR->getParent());
2007 assert(SeenPhis.insert(PrevPhi).second);
2008 Previous = PrevPhi->getBackedgeValue()->getDefiningRecipe();
2009 }
2010
2011 if (!sinkRecurrenceUsersAfterPrevious(FOR, Previous, VPDT) &&
2012 !hoistPreviousBeforeFORUsers(FOR, Previous, VPDT))
2013 return false;
2014
2015 // Introduce a recipe to combine the incoming and previous values of a
2016 // fixed-order recurrence.
2017 VPBasicBlock *InsertBlock = Previous->getParent();
2018 if (isa<VPHeaderPHIRecipe>(Previous))
2019 LoopBuilder.setInsertPoint(InsertBlock, InsertBlock->getFirstNonPhi());
2020 else
2021 LoopBuilder.setInsertPoint(InsertBlock,
2022 std::next(Previous->getIterator()));
2023
2024 auto *RecurSplice =
2026 {FOR, FOR->getBackedgeValue()});
2027
2028 FOR->replaceAllUsesWith(RecurSplice);
2029 // Set the first operand of RecurSplice to FOR again, after replacing
2030 // all users.
2031 RecurSplice->setOperand(0, FOR);
2032 }
2033 return true;
2034}
2035
2037 for (VPRecipeBase &R :
2039 auto *PhiR = dyn_cast<VPReductionPHIRecipe>(&R);
2040 if (!PhiR)
2041 continue;
2042 RecurKind RK = PhiR->getRecurrenceKind();
2043 if (RK != RecurKind::Add && RK != RecurKind::Mul && RK != RecurKind::Sub &&
2045 continue;
2046
2047 for (VPUser *U : collectUsersRecursively(PhiR))
2048 if (auto *RecWithFlags = dyn_cast<VPRecipeWithIRFlags>(U)) {
2049 RecWithFlags->dropPoisonGeneratingFlags();
2050 }
2051 }
2052}
2053
2054namespace {
2055struct VPCSEDenseMapInfo : public DenseMapInfo<VPSingleDefRecipe *> {
2056 static bool isSentinel(const VPSingleDefRecipe *Def) {
2057 return Def == getEmptyKey() || Def == getTombstoneKey();
2058 }
2059
2060 /// If recipe \p R will lower to a GEP with a non-i8 source element type,
2061 /// return that source element type.
2062 static Type *getGEPSourceElementType(const VPSingleDefRecipe *R) {
2063 // All VPInstructions that lower to GEPs must have the i8 source element
2064 // type (as they are PtrAdds), so we omit it.
2066 .Case<VPReplicateRecipe>([](auto *I) -> Type * {
2067 if (auto *GEP = dyn_cast<GetElementPtrInst>(I->getUnderlyingValue()))
2068 return GEP->getSourceElementType();
2069 return nullptr;
2070 })
2071 .Case<VPVectorPointerRecipe, VPWidenGEPRecipe>(
2072 [](auto *I) { return I->getSourceElementType(); })
2073 .Default([](auto *) { return nullptr; });
2074 }
2075
 2076 /// Returns true if recipe \p Def can be safely handled by CSE.
2077 static bool canHandle(const VPSingleDefRecipe *Def) {
2078 // We can extend the list of handled recipes in the future,
2079 // provided we account for the data embedded in them while checking for
2080 // equality or hashing.
2081 auto C = getOpcodeOrIntrinsicID(Def);
2082
2083 // The issue with (Insert|Extract)Value is that the index of the
2084 // insert/extract is not a proper operand in LLVM IR, and hence also not in
2085 // VPlan.
2086 if (!C || (!C->first && (C->second == Instruction::InsertValue ||
2087 C->second == Instruction::ExtractValue)))
2088 return false;
2089
2090 // During CSE, we can only handle recipes that don't read from memory: if
2091 // they read from memory, there could be an intervening write to memory
2092 // before the next instance is CSE'd, leading to an incorrect result.
2093 return !Def->mayReadFromMemory();
2094 }
2095
2096 /// Hash the underlying data of \p Def.
2097 static unsigned getHashValue(const VPSingleDefRecipe *Def) {
2098 const VPlan *Plan = Def->getParent()->getPlan();
2099 VPTypeAnalysis TypeInfo(*Plan);
2100 hash_code Result = hash_combine(
2101 Def->getVPDefID(), getOpcodeOrIntrinsicID(Def),
2102 getGEPSourceElementType(Def), TypeInfo.inferScalarType(Def),
2104 if (auto *RFlags = dyn_cast<VPRecipeWithIRFlags>(Def))
2105 if (RFlags->hasPredicate())
2106 return hash_combine(Result, RFlags->getPredicate());
2107 return Result;
2108 }
2109
2110 /// Check equality of underlying data of \p L and \p R.
2111 static bool isEqual(const VPSingleDefRecipe *L, const VPSingleDefRecipe *R) {
2112 if (isSentinel(L) || isSentinel(R))
2113 return L == R;
2114 if (L->getVPDefID() != R->getVPDefID() ||
2116 getGEPSourceElementType(L) != getGEPSourceElementType(R) ||
2118 !equal(L->operands(), R->operands()))
2119 return false;
2121 "must have valid opcode info for both recipes");
2122 if (auto *LFlags = dyn_cast<VPRecipeWithIRFlags>(L))
2123 if (LFlags->hasPredicate() &&
2124 LFlags->getPredicate() !=
2125 cast<VPRecipeWithIRFlags>(R)->getPredicate())
2126 return false;
2127 // Recipes in replicate regions implicitly depend on predicate. If either
2128 // recipe is in a replicate region, only consider them equal if both have
2129 // the same parent.
2130 const VPRegionBlock *RegionL = L->getRegion();
2131 const VPRegionBlock *RegionR = R->getRegion();
2132 if (((RegionL && RegionL->isReplicator()) ||
2133 (RegionR && RegionR->isReplicator())) &&
2134 L->getParent() != R->getParent())
2135 return false;
2136 const VPlan *Plan = L->getParent()->getPlan();
2137 VPTypeAnalysis TypeInfo(*Plan);
2138 return TypeInfo.inferScalarType(L) == TypeInfo.inferScalarType(R);
2139 }
2140};
2141} // end anonymous namespace
2142
2143/// Perform a common-subexpression-elimination of VPSingleDefRecipes on the \p
2144/// Plan.
2146 VPDominatorTree VPDT(Plan);
2148
2150 vp_depth_first_deep(Plan.getEntry()))) {
2151 for (VPRecipeBase &R : *VPBB) {
2152 auto *Def = dyn_cast<VPSingleDefRecipe>(&R);
2153 if (!Def || !VPCSEDenseMapInfo::canHandle(Def))
2154 continue;
2155 if (VPSingleDefRecipe *V = CSEMap.lookup(Def)) {
2156 // V must dominate Def for a valid replacement.
2157 if (!VPDT.dominates(V->getParent(), VPBB))
2158 continue;
2159 // Only keep flags present on both V and Def.
2160 if (auto *RFlags = dyn_cast<VPRecipeWithIRFlags>(V))
2161 RFlags->intersectFlags(*cast<VPRecipeWithIRFlags>(Def));
2162 Def->replaceAllUsesWith(V);
2163 continue;
2164 }
2165 CSEMap[Def] = Def;
2166 }
2167 }
2168}
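// A minimal standalone sketch (plain C++, not LLVM API) of the same
// keep-the-earlier-copy idea, where program order within a single block
// stands in for the dominance check above:
//
//   #include <cstdio>
//   #include <map>
//   #include <tuple>
//   #include <vector>
//
//   struct ToyInst {
//     char Opcode;  // e.g. '+', '*'
//     int LHS, RHS; // operand value numbers
//   };
//
//   int main() {
//     std::vector<ToyInst> Insts = {{'+', 0, 1}, {'*', 0, 1}, {'+', 0, 1}};
//     std::map<std::tuple<char, int, int>, size_t> CSEMap;
//     for (size_t I = 0; I < Insts.size(); ++I) {
//       auto Key = std::make_tuple(Insts[I].Opcode, Insts[I].LHS, Insts[I].RHS);
//       auto [It, Inserted] = CSEMap.try_emplace(Key, I);
//       if (!Inserted)
//         std::printf("inst %zu reuses inst %zu\n", I, It->second);
//     }
//   }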
2169
2170/// Move loop-invariant recipes out of the vector loop region in \p Plan.
2171static void licm(VPlan &Plan) {
2172 VPBasicBlock *Preheader = Plan.getVectorPreheader();
2173
2174 // Hoist any loop invariant recipes from the vector loop region to the
 2175 // preheader. Perform a shallow traversal of the vector loop region, to
2176 // exclude recipes in replicate regions. Since the top-level blocks in the
2177 // vector loop region are guaranteed to execute if the vector pre-header is,
2178 // we don't need to check speculation safety.
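  // For example, a recipe computing (mul %n, 4), where %n is defined outside
  // the loop region (e.g. a live-in), is moved to the preheader since its
  // result is the same in every vector iteration.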
2179 VPRegionBlock *LoopRegion = Plan.getVectorLoopRegion();
2180 assert(Preheader->getSingleSuccessor() == LoopRegion &&
2181 "Expected vector prehader's successor to be the vector loop region");
2183 vp_depth_first_shallow(LoopRegion->getEntry()))) {
2184 for (VPRecipeBase &R : make_early_inc_range(*VPBB)) {
2186 continue;
2187 if (any_of(R.operands(), [](VPValue *Op) {
2188 return !Op->isDefinedOutsideLoopRegions();
2189 }))
2190 continue;
2191 R.moveBefore(*Preheader, Preheader->end());
2192 }
2193 }
2194}
2195
2197 VPlan &Plan, const MapVector<Instruction *, uint64_t> &MinBWs) {
2198 if (Plan.hasScalarVFOnly())
2199 return;
2200 // Keep track of created truncates, so they can be re-used. Note that we
 2201 // cannot use RAUW after creating a new truncate, as this could make
2202 // other uses have different types for their operands, making them invalidly
2203 // typed.
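  // For example, if MinBWs records that a widened i32 add only needs 16 bits,
  // its operands are truncated to i16, the add is performed in i16, and the
  // result is zero-extended back to i32 for its existing users.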
2205 VPTypeAnalysis TypeInfo(Plan);
2206 VPBasicBlock *PH = Plan.getVectorPreheader();
2209 for (VPRecipeBase &R : make_early_inc_range(*VPBB)) {
2212 &R))
2213 continue;
2214
2215 VPValue *ResultVPV = R.getVPSingleValue();
2216 auto *UI = cast_or_null<Instruction>(ResultVPV->getUnderlyingValue());
2217 unsigned NewResSizeInBits = MinBWs.lookup(UI);
2218 if (!NewResSizeInBits)
2219 continue;
2220
2221 // If the value wasn't vectorized, we must maintain the original scalar
2222 // type. Skip those here, after incrementing NumProcessedRecipes. Also
2223 // skip casts which do not need to be handled explicitly here, as
2224 // redundant casts will be removed during recipe simplification.
2226 continue;
2227
2228 Type *OldResTy = TypeInfo.inferScalarType(ResultVPV);
2229 unsigned OldResSizeInBits = OldResTy->getScalarSizeInBits();
2230 assert(OldResTy->isIntegerTy() && "only integer types supported");
2231 (void)OldResSizeInBits;
2232
2233 auto *NewResTy = IntegerType::get(Plan.getContext(), NewResSizeInBits);
2234
2235 // Any wrapping introduced by shrinking this operation shouldn't be
2236 // considered undefined behavior. So, we can't unconditionally copy
2237 // arithmetic wrapping flags to VPW.
2238 if (auto *VPW = dyn_cast<VPRecipeWithIRFlags>(&R))
2239 VPW->dropPoisonGeneratingFlags();
2240
2241 if (OldResSizeInBits != NewResSizeInBits &&
2242 !match(&R, m_ICmp(m_VPValue(), m_VPValue()))) {
2243 // Extend result to original width.
2244 auto *Ext =
2245 new VPWidenCastRecipe(Instruction::ZExt, ResultVPV, OldResTy);
2246 Ext->insertAfter(&R);
2247 ResultVPV->replaceAllUsesWith(Ext);
2248 Ext->setOperand(0, ResultVPV);
2249 assert(OldResSizeInBits > NewResSizeInBits && "Nothing to shrink?");
2250 } else {
2251 assert(match(&R, m_ICmp(m_VPValue(), m_VPValue())) &&
2252 "Only ICmps should not need extending the result.");
2253 }
2254
2255 assert(!isa<VPWidenStoreRecipe>(&R) && "stores cannot be narrowed");
2257 continue;
2258
2259 // Shrink operands by introducing truncates as needed.
2260 unsigned StartIdx = isa<VPWidenSelectRecipe>(&R) ? 1 : 0;
2261 for (unsigned Idx = StartIdx; Idx != R.getNumOperands(); ++Idx) {
2262 auto *Op = R.getOperand(Idx);
2263 unsigned OpSizeInBits =
2265 if (OpSizeInBits == NewResSizeInBits)
2266 continue;
2267 assert(OpSizeInBits > NewResSizeInBits && "nothing to truncate");
2268 auto [ProcessedIter, IterIsEmpty] = ProcessedTruncs.try_emplace(Op);
2269 if (!IterIsEmpty) {
2270 R.setOperand(Idx, ProcessedIter->second);
2271 continue;
2272 }
2273
2274 VPBuilder Builder;
2275 if (Op->isLiveIn())
2276 Builder.setInsertPoint(PH);
2277 else
2278 Builder.setInsertPoint(&R);
2279 VPWidenCastRecipe *NewOp =
2280 Builder.createWidenCast(Instruction::Trunc, Op, NewResTy);
2281 ProcessedIter->second = NewOp;
2282 R.setOperand(Idx, NewOp);
2283 }
2284
2285 }
2286 }
2287}
2288
2292 VPValue *Cond;
2293 // Skip blocks that are not terminated by BranchOnCond.
2294 if (VPBB->empty() || !match(&VPBB->back(), m_BranchOnCond(m_VPValue(Cond))))
2295 continue;
2296
2297 assert(VPBB->getNumSuccessors() == 2 &&
2298 "Two successors expected for BranchOnCond");
2299 unsigned RemovedIdx;
2300 if (match(Cond, m_True()))
2301 RemovedIdx = 1;
2302 else if (match(Cond, m_False()))
2303 RemovedIdx = 0;
2304 else
2305 continue;
2306
2307 VPBasicBlock *RemovedSucc =
2308 cast<VPBasicBlock>(VPBB->getSuccessors()[RemovedIdx]);
2309 assert(count(RemovedSucc->getPredecessors(), VPBB) == 1 &&
2310 "There must be a single edge between VPBB and its successor");
 2311 // Values coming from VPBB into phi recipes of RemovedSucc are removed from
2312 // these recipes.
2313 for (VPRecipeBase &R : RemovedSucc->phis())
2314 cast<VPPhiAccessors>(&R)->removeIncomingValueFor(VPBB);
2315
2316 // Disconnect blocks and remove the terminator. RemovedSucc will be deleted
2317 // automatically on VPlan destruction if it becomes unreachable.
2318 VPBlockUtils::disconnectBlocks(VPBB, RemovedSucc);
2319 VPBB->back().eraseFromParent();
2320 }
2321}
2322
2341
2342// Add a VPActiveLaneMaskPHIRecipe and related recipes to \p Plan and replace
2343// the loop terminator with a branch-on-cond recipe with the negated
2344// active-lane-mask as operand. Note that this turns the loop into an
2345// uncountable one. Only the existing terminator is replaced; all other existing
2346// recipes/users remain unchanged, except for poison-generating flags being
2347// dropped from the canonical IV increment. Return the created
2348// VPActiveLaneMaskPHIRecipe.
2349//
2350// The function uses the following definitions:
2351//
2352// %TripCount = DataWithControlFlowWithoutRuntimeCheck ?
2353// calculate-trip-count-minus-VF (original TC) : original TC
2354// %IncrementValue = DataWithControlFlowWithoutRuntimeCheck ?
2355// CanonicalIVPhi : CanonicalIVIncrement
2356// %StartV is the canonical induction start value.
2357//
2358// The function adds the following recipes:
2359//
2360// vector.ph:
2361// %TripCount = calculate-trip-count-minus-VF (original TC)
2362// [if DataWithControlFlowWithoutRuntimeCheck]
2363// %EntryInc = canonical-iv-increment-for-part %StartV
2364// %EntryALM = active-lane-mask %EntryInc, %TripCount
2365//
2366// vector.body:
2367// ...
2368// %P = active-lane-mask-phi [ %EntryALM, %vector.ph ], [ %ALM, %vector.body ]
2369// ...
2370// %InLoopInc = canonical-iv-increment-for-part %IncrementValue
2371// %ALM = active-lane-mask %InLoopInc, TripCount
2372// %Negated = Not %ALM
2373// branch-on-cond %Negated
2374//
2377 VPRegionBlock *TopRegion = Plan.getVectorLoopRegion();
2378 VPBasicBlock *EB = TopRegion->getExitingBasicBlock();
2379 auto *CanonicalIVPHI = TopRegion->getCanonicalIV();
2380 VPValue *StartV = CanonicalIVPHI->getStartValue();
2381
2382 auto *CanonicalIVIncrement =
2383 cast<VPInstruction>(CanonicalIVPHI->getBackedgeValue());
2384 // TODO: Check if dropping the flags is needed if
2385 // !DataAndControlFlowWithoutRuntimeCheck.
2386 CanonicalIVIncrement->dropPoisonGeneratingFlags();
2387 DebugLoc DL = CanonicalIVIncrement->getDebugLoc();
2388 // We can't use StartV directly in the ActiveLaneMask VPInstruction, since
2389 // we have to take unrolling into account. Each part needs to start at
2390 // Part * VF
2391 auto *VecPreheader = Plan.getVectorPreheader();
2392 VPBuilder Builder(VecPreheader);
2393
2394 // Create the ActiveLaneMask instruction using the correct start values.
2395 VPValue *TC = Plan.getTripCount();
2396
2397 VPValue *TripCount, *IncrementValue;
2399 // When the loop is guarded by a runtime overflow check for the loop
2400 // induction variable increment by VF, we can increment the value before
 2401 // the get.active.lane.mask and use the unmodified trip count.
2402 IncrementValue = CanonicalIVIncrement;
2403 TripCount = TC;
2404 } else {
2405 // When avoiding a runtime check, the active.lane.mask inside the loop
2406 // uses a modified trip count and the induction variable increment is
2407 // done after the active.lane.mask intrinsic is called.
2408 IncrementValue = CanonicalIVPHI;
2409 TripCount = Builder.createNaryOp(VPInstruction::CalculateTripCountMinusVF,
2410 {TC}, DL);
2411 }
2412 auto *EntryIncrement = Builder.createOverflowingOp(
2413 VPInstruction::CanonicalIVIncrementForPart, {StartV}, {false, false}, DL,
2414 "index.part.next");
2415
2416 // Create the active lane mask instruction in the VPlan preheader.
2417 VPValue *ALMMultiplier =
2418 Plan.getConstantInt(TopRegion->getCanonicalIVType(), 1);
2419 auto *EntryALM = Builder.createNaryOp(VPInstruction::ActiveLaneMask,
2420 {EntryIncrement, TC, ALMMultiplier}, DL,
2421 "active.lane.mask.entry");
2422
2423 // Now create the ActiveLaneMaskPhi recipe in the main loop using the
2424 // preheader ActiveLaneMask instruction.
2425 auto *LaneMaskPhi =
2427 LaneMaskPhi->insertAfter(CanonicalIVPHI);
2428
2429 // Create the active lane mask for the next iteration of the loop before the
2430 // original terminator.
2431 VPRecipeBase *OriginalTerminator = EB->getTerminator();
2432 Builder.setInsertPoint(OriginalTerminator);
2433 auto *InLoopIncrement =
2434 Builder.createOverflowingOp(VPInstruction::CanonicalIVIncrementForPart,
2435 {IncrementValue}, {false, false}, DL);
2436 auto *ALM = Builder.createNaryOp(VPInstruction::ActiveLaneMask,
2437 {InLoopIncrement, TripCount, ALMMultiplier},
2438 DL, "active.lane.mask.next");
2439 LaneMaskPhi->addOperand(ALM);
2440
2441 // Replace the original terminator with BranchOnCond. We have to invert the
2442 // mask here because a true condition means jumping to the exit block.
2443 auto *NotMask = Builder.createNot(ALM, DL);
2444 Builder.createNaryOp(VPInstruction::BranchOnCond, {NotMask}, DL);
2445 OriginalTerminator->eraseFromParent();
2446 return LaneMaskPhi;
2447}
2448
2449/// Collect the header mask with the pattern:
2450/// (ICMP_ULE, WideCanonicalIV, backedge-taken-count)
2451/// TODO: Introduce explicit recipe for header-mask instead of searching
2452/// for the header-mask pattern manually.
2454 VPRegionBlock *LoopRegion = Plan.getVectorLoopRegion();
2455 SmallVector<VPValue *> WideCanonicalIVs;
2456 auto *FoundWidenCanonicalIVUser = find_if(
2458 assert(count_if(LoopRegion->getCanonicalIV()->users(),
2460 "Must have at most one VPWideCanonicalIVRecipe");
2461 if (FoundWidenCanonicalIVUser !=
2462 LoopRegion->getCanonicalIV()->users().end()) {
2463 auto *WideCanonicalIV =
2464 cast<VPWidenCanonicalIVRecipe>(*FoundWidenCanonicalIVUser);
2465 WideCanonicalIVs.push_back(WideCanonicalIV);
2466 }
2467
2468 // Also include VPWidenIntOrFpInductionRecipes that represent a widened
2469 // version of the canonical induction.
2470 VPBasicBlock *HeaderVPBB = LoopRegion->getEntryBasicBlock();
2471 for (VPRecipeBase &Phi : HeaderVPBB->phis()) {
2472 auto *WidenOriginalIV = dyn_cast<VPWidenIntOrFpInductionRecipe>(&Phi);
2473 if (WidenOriginalIV && WidenOriginalIV->isCanonical())
2474 WideCanonicalIVs.push_back(WidenOriginalIV);
2475 }
2476
2477 // Walk users of wide canonical IVs and find the single compare of the form
2478 // (ICMP_ULE, WideCanonicalIV, backedge-taken-count).
2479 VPSingleDefRecipe *HeaderMask = nullptr;
2480 for (auto *Wide : WideCanonicalIVs) {
2481 for (VPUser *U : SmallVector<VPUser *>(Wide->users())) {
2482 auto *VPI = dyn_cast<VPInstruction>(U);
2483 if (!VPI || !vputils::isHeaderMask(VPI, Plan))
2484 continue;
2485
2486 assert(VPI->getOperand(0) == Wide &&
2487 "WidenCanonicalIV must be the first operand of the compare");
2488 assert(!HeaderMask && "Multiple header masks found?");
2489 HeaderMask = VPI;
2490 }
2491 }
2492 return HeaderMask;
2493}
2494
2496 VPlan &Plan, bool UseActiveLaneMaskForControlFlow,
2499 UseActiveLaneMaskForControlFlow) &&
2500 "DataAndControlFlowWithoutRuntimeCheck implies "
2501 "UseActiveLaneMaskForControlFlow");
2502
2503 VPRegionBlock *LoopRegion = Plan.getVectorLoopRegion();
2504 auto *FoundWidenCanonicalIVUser = find_if(
2506 assert(FoundWidenCanonicalIVUser &&
2507 "Must have widened canonical IV when tail folding!");
2508 VPSingleDefRecipe *HeaderMask = findHeaderMask(Plan);
2509 auto *WideCanonicalIV =
2510 cast<VPWidenCanonicalIVRecipe>(*FoundWidenCanonicalIVUser);
2511 VPSingleDefRecipe *LaneMask;
2512 if (UseActiveLaneMaskForControlFlow) {
2515 } else {
2516 VPBuilder B = VPBuilder::getToInsertAfter(WideCanonicalIV);
2517 VPValue *ALMMultiplier = Plan.getOrAddLiveIn(
2518 ConstantInt::get(LoopRegion->getCanonicalIVType(), 1));
2519 LaneMask =
2520 B.createNaryOp(VPInstruction::ActiveLaneMask,
2521 {WideCanonicalIV, Plan.getTripCount(), ALMMultiplier},
2522 nullptr, "active.lane.mask");
2523 }
2524
2525 // Walk users of WideCanonicalIV and replace the header mask of the form
2526 // (ICMP_ULE, WideCanonicalIV, backedge-taken-count) with an active-lane-mask,
2527 // removing the old one to ensure there is always only a single header mask.
2528 HeaderMask->replaceAllUsesWith(LaneMask);
2529 HeaderMask->eraseFromParent();
2530}
2531
2532template <typename Op0_t, typename Op1_t> struct RemoveMask_match {
2533 Op0_t In;
2535
2536 RemoveMask_match(const Op0_t &In, Op1_t &Out) : In(In), Out(Out) {}
2537
2538 template <typename OpTy> bool match(OpTy *V) const {
2539 if (m_Specific(In).match(V)) {
2540 Out = nullptr;
2541 return true;
2542 }
2544 return true;
2545 return false;
2546 }
2547};
2548
2549/// Match a specific mask \p In, or a combination of it (logical-and In, Out).
2550/// On a match, \p Out is set to the remaining part, or to nullptr if the value is exactly \p In.
2551template <typename Op0_t, typename Op1_t>
2552static inline RemoveMask_match<Op0_t, Op1_t> m_RemoveMask(const Op0_t &In,
2553 Op1_t &Out) {
2554 return RemoveMask_match<Op0_t, Op1_t>(In, Out);
2555}
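// For example, m_RemoveMask(HeaderMask, Mask) matches both HeaderMask itself
// (setting Mask to nullptr) and (logical-and HeaderMask, %m) (setting Mask to
// %m).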
2556
2557/// Try to optimize a \p CurRecipe masked by \p HeaderMask to a corresponding
2558/// EVL-based recipe without the header mask. Returns nullptr if no EVL-based
2559/// recipe could be created.
2560/// \p HeaderMask Header Mask.
2561/// \p CurRecipe Recipe to be transformed.
2562/// \p TypeInfo VPlan-based type analysis.
2563/// \p EVL The explicit vector length parameter of vector-predication
2564/// intrinsics.
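/// For example (illustrative), a widened load masked only by the header mask
/// becomes a VPWidenLoadEVLRecipe with a null mask, while one masked by
/// (logical-and header-mask, %m) keeps %m as its mask and takes \p EVL.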
2566 VPRecipeBase &CurRecipe,
2567 VPTypeAnalysis &TypeInfo, VPValue &EVL) {
2568 VPlan *Plan = CurRecipe.getParent()->getPlan();
2569 VPValue *Addr, *Mask, *EndPtr;
2570
 2571 /// Adjust any end pointers so that they point to the end of EVL lanes, not VF.
2572 auto AdjustEndPtr = [&CurRecipe, &EVL](VPValue *EndPtr) {
2573 auto *EVLEndPtr = cast<VPVectorEndPointerRecipe>(EndPtr)->clone();
2574 EVLEndPtr->insertBefore(&CurRecipe);
2575 EVLEndPtr->setOperand(1, &EVL);
2576 return EVLEndPtr;
2577 };
2578
2579 if (match(&CurRecipe,
2580 m_MaskedLoad(m_VPValue(Addr), m_RemoveMask(HeaderMask, Mask))) &&
2581 !cast<VPWidenLoadRecipe>(CurRecipe).isReverse())
2582 return new VPWidenLoadEVLRecipe(cast<VPWidenLoadRecipe>(CurRecipe), Addr,
2583 EVL, Mask);
2584
2585 if (match(&CurRecipe,
2586 m_MaskedLoad(m_VPValue(EndPtr), m_RemoveMask(HeaderMask, Mask))) &&
2587 match(EndPtr, m_VecEndPtr(m_VPValue(Addr), m_Specific(&Plan->getVF()))) &&
2588 cast<VPWidenLoadRecipe>(CurRecipe).isReverse())
2589 return new VPWidenLoadEVLRecipe(cast<VPWidenLoadRecipe>(CurRecipe),
2590 AdjustEndPtr(EndPtr), EVL, Mask);
2591
2592 if (match(&CurRecipe, m_MaskedStore(m_VPValue(Addr), m_VPValue(),
2593 m_RemoveMask(HeaderMask, Mask))) &&
2594 !cast<VPWidenStoreRecipe>(CurRecipe).isReverse())
2595 return new VPWidenStoreEVLRecipe(cast<VPWidenStoreRecipe>(CurRecipe), Addr,
2596 EVL, Mask);
2597
2598 if (match(&CurRecipe, m_MaskedStore(m_VPValue(EndPtr), m_VPValue(),
2599 m_RemoveMask(HeaderMask, Mask))) &&
2600 match(EndPtr, m_VecEndPtr(m_VPValue(Addr), m_Specific(&Plan->getVF()))) &&
2601 cast<VPWidenStoreRecipe>(CurRecipe).isReverse())
2602 return new VPWidenStoreEVLRecipe(cast<VPWidenStoreRecipe>(CurRecipe),
2603 AdjustEndPtr(EndPtr), EVL, Mask);
2604
2605 if (auto *Rdx = dyn_cast<VPReductionRecipe>(&CurRecipe))
2606 if (Rdx->isConditional() &&
2607 match(Rdx->getCondOp(), m_RemoveMask(HeaderMask, Mask)))
2608 return new VPReductionEVLRecipe(*Rdx, EVL, Mask);
2609
2610 if (auto *Interleave = dyn_cast<VPInterleaveRecipe>(&CurRecipe))
2611 if (Interleave->getMask() &&
2612 match(Interleave->getMask(), m_RemoveMask(HeaderMask, Mask)))
2613 return new VPInterleaveEVLRecipe(*Interleave, EVL, Mask);
2614
2615 VPValue *LHS, *RHS;
2616 if (match(&CurRecipe,
2617 m_Select(m_Specific(HeaderMask), m_VPValue(LHS), m_VPValue(RHS))))
2618 return new VPWidenIntrinsicRecipe(
2619 Intrinsic::vp_merge, {Plan->getTrue(), LHS, RHS, &EVL},
2620 TypeInfo.inferScalarType(LHS), CurRecipe.getDebugLoc());
2621
2622 return nullptr;
2623}
2624
2625/// Replace recipes with their EVL variants.
2627 VPTypeAnalysis TypeInfo(Plan);
2628 VPRegionBlock *LoopRegion = Plan.getVectorLoopRegion();
2629 VPBasicBlock *Header = LoopRegion->getEntryBasicBlock();
2630
2631 assert(all_of(Plan.getVF().users(),
2634 "User of VF that we can't transform to EVL.");
2635 Plan.getVF().replaceUsesWithIf(&EVL, [](VPUser &U, unsigned Idx) {
2637 });
2638
2639 assert(all_of(Plan.getVFxUF().users(),
2640 [&LoopRegion, &Plan](VPUser *U) {
2641 return match(U,
2642 m_c_Add(m_Specific(LoopRegion->getCanonicalIV()),
2643 m_Specific(&Plan.getVFxUF()))) ||
2644 isa<VPWidenPointerInductionRecipe>(U);
2645 }) &&
2646 "Only users of VFxUF should be VPWidenPointerInductionRecipe and the "
2647 "increment of the canonical induction.");
2648 Plan.getVFxUF().replaceUsesWithIf(&EVL, [](VPUser &U, unsigned Idx) {
 2649 // Only replace uses in VPWidenPointerInductionRecipe; the increment of the
2650 // canonical induction must not be updated.
2652 });
2653
2654 // Defer erasing recipes till the end so that we don't invalidate the
2655 // VPTypeAnalysis cache.
2657
 2658 // Create a scalar phi to track the previous EVL if the plan contains a
 2659 // fixed-order recurrence.
2660 bool ContainsFORs =
2662 if (ContainsFORs) {
2663 // TODO: Use VPInstruction::ExplicitVectorLength to get maximum EVL.
2664 VPValue *MaxEVL = &Plan.getVF();
 2665 // Emit VPScalarCastRecipe in preheader if VF is not a 32-bit integer.
2666 VPBuilder Builder(LoopRegion->getPreheaderVPBB());
2667 MaxEVL = Builder.createScalarZExtOrTrunc(
2668 MaxEVL, Type::getInt32Ty(Plan.getContext()),
2669 TypeInfo.inferScalarType(MaxEVL), DebugLoc::getUnknown());
2670
2671 Builder.setInsertPoint(Header, Header->getFirstNonPhi());
2672 VPValue *PrevEVL = Builder.createScalarPhi(
2673 {MaxEVL, &EVL}, DebugLoc::getUnknown(), "prev.evl");
2674
2677 for (VPRecipeBase &R : *VPBB) {
2678 VPValue *V1, *V2;
2679 if (!match(&R,
2681 m_VPValue(V1), m_VPValue(V2))))
2682 continue;
2683 VPValue *Imm = Plan.getOrAddLiveIn(
2686 Intrinsic::experimental_vp_splice,
2687 {V1, V2, Imm, Plan.getTrue(), PrevEVL, &EVL},
2688 TypeInfo.inferScalarType(R.getVPSingleValue()), R.getDebugLoc());
2689 VPSplice->insertBefore(&R);
2690 R.getVPSingleValue()->replaceAllUsesWith(VPSplice);
2691 ToErase.push_back(&R);
2692 }
2693 }
2694 }
2695
2696 VPValue *HeaderMask = findHeaderMask(Plan);
2697 if (!HeaderMask)
2698 return;
2699
2700 // Replace header masks with a mask equivalent to predicating by EVL:
2701 //
2702 // icmp ule widen-canonical-iv backedge-taken-count
2703 // ->
2704 // icmp ult step-vector, EVL
2705 VPRecipeBase *EVLR = EVL.getDefiningRecipe();
2706 VPBuilder Builder(EVLR->getParent(), std::next(EVLR->getIterator()));
2707 Type *EVLType = TypeInfo.inferScalarType(&EVL);
2708 VPValue *EVLMask = Builder.createICmp(
2710 Builder.createNaryOp(VPInstruction::StepVector, {}, EVLType), &EVL);
2711 HeaderMask->replaceAllUsesWith(EVLMask);
2712 ToErase.push_back(HeaderMask->getDefiningRecipe());
2713
2714 // Try to optimize header mask recipes away to their EVL variants.
2715 // TODO: Split optimizeMaskToEVL out and move into
2716 // VPlanTransforms::optimize. transformRecipestoEVLRecipes should be run in
2717 // tryToBuildVPlanWithVPRecipes beforehand.
2718 for (VPUser *U : collectUsersRecursively(EVLMask)) {
2719 auto *CurRecipe = cast<VPRecipeBase>(U);
2720 VPRecipeBase *EVLRecipe =
2721 optimizeMaskToEVL(EVLMask, *CurRecipe, TypeInfo, EVL);
2722 if (!EVLRecipe)
2723 continue;
2724
2725 unsigned NumDefVal = EVLRecipe->getNumDefinedValues();
2726 assert(NumDefVal == CurRecipe->getNumDefinedValues() &&
2727 "New recipe must define the same number of values as the "
2728 "original.");
2729 EVLRecipe->insertBefore(CurRecipe);
2731 EVLRecipe)) {
2732 for (unsigned I = 0; I < NumDefVal; ++I) {
2733 VPValue *CurVPV = CurRecipe->getVPValue(I);
2734 CurVPV->replaceAllUsesWith(EVLRecipe->getVPValue(I));
2735 }
2736 }
2737 ToErase.push_back(CurRecipe);
2738 }
2739 // Remove dead EVL mask.
2740 if (EVLMask->getNumUsers() == 0)
2741 ToErase.push_back(EVLMask->getDefiningRecipe());
2742
2743 for (VPRecipeBase *R : reverse(ToErase)) {
2744 SmallVector<VPValue *> PossiblyDead(R->operands());
2745 R->eraseFromParent();
2746 for (VPValue *Op : PossiblyDead)
2748 }
2749}
2750
2751/// Add a VPEVLBasedIVPHIRecipe and related recipes to \p Plan and
2752/// replace all uses of VPCanonicalIVPHIRecipe, except the canonical IV
2753/// increment, with a VPEVLBasedIVPHIRecipe. After this transformation,
2754/// VPCanonicalIVPHIRecipe is used only for counting loop iterations.
2755///
2756/// The function uses the following definitions:
2757/// %StartV is the canonical induction start value.
2758///
2759/// The function adds the following recipes:
2760///
2761/// vector.ph:
2762/// ...
2763///
2764/// vector.body:
2765/// ...
2766/// %EVLPhi = EXPLICIT-VECTOR-LENGTH-BASED-IV-PHI [ %StartV, %vector.ph ],
2767/// [ %NextEVLIV, %vector.body ]
2768/// %AVL = phi [ trip-count, %vector.ph ], [ %NextAVL, %vector.body ]
2769/// %VPEVL = EXPLICIT-VECTOR-LENGTH %AVL
2770/// ...
2771/// %OpEVL = cast i32 %VPEVL to IVSize
2772/// %NextEVLIV = add IVSize %OpEVL, %EVLPhi
2773/// %NextAVL = sub IVSize nuw %AVL, %OpEVL
2774/// ...
2775///
2776/// If MaxSafeElements is provided, the function adds the following recipes:
2777/// vector.ph:
2778/// ...
2779///
2780/// vector.body:
2781/// ...
2782/// %EVLPhi = EXPLICIT-VECTOR-LENGTH-BASED-IV-PHI [ %StartV, %vector.ph ],
2783/// [ %NextEVLIV, %vector.body ]
2784/// %AVL = phi [ trip-count, %vector.ph ], [ %NextAVL, %vector.body ]
2785/// %cmp = cmp ult %AVL, MaxSafeElements
2786/// %SAFE_AVL = select %cmp, %AVL, MaxSafeElements
2787/// %VPEVL = EXPLICIT-VECTOR-LENGTH %SAFE_AVL
2788/// ...
2789/// %OpEVL = cast i32 %VPEVL to IVSize
2790/// %NextEVLIV = add IVSize %OpEVL, %EVLPhi
2791/// %NextAVL = sub IVSize nuw %AVL, %OpEVL
2792/// ...
2793///
2795 VPlan &Plan, const std::optional<unsigned> &MaxSafeElements) {
2796 if (Plan.hasScalarVFOnly())
2797 return;
2798 VPRegionBlock *LoopRegion = Plan.getVectorLoopRegion();
2799 VPBasicBlock *Header = LoopRegion->getEntryBasicBlock();
2800
2801 auto *CanonicalIVPHI = LoopRegion->getCanonicalIV();
2802 auto *CanIVTy = LoopRegion->getCanonicalIVType();
2803 VPValue *StartV = CanonicalIVPHI->getStartValue();
2804
2805 // Create the ExplicitVectorLengthPhi recipe in the main loop.
2806 auto *EVLPhi = new VPEVLBasedIVPHIRecipe(StartV, DebugLoc::getUnknown());
2807 EVLPhi->insertAfter(CanonicalIVPHI);
2808 VPBuilder Builder(Header, Header->getFirstNonPhi());
 2809 // Create the AVL (application vector length), which counts down from TC to
 2810 // 0 in steps of EVL.
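  // For example, with a trip count of 10 and a fixed VF of 4 (UF = 1), AVL
  // takes the values 10, 6, 2 and EVL the values 4, 4, 2 over the three
  // vector iterations.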
2811 VPPhi *AVLPhi = Builder.createScalarPhi(
2812 {Plan.getTripCount()}, DebugLoc::getCompilerGenerated(), "avl");
2813 VPValue *AVL = AVLPhi;
2814
2815 if (MaxSafeElements) {
 2816 // Cap the AVL at MaxSafeElements (MaxSafeDist support) for correct loop emission.
2817 VPValue *AVLSafe = Plan.getConstantInt(CanIVTy, *MaxSafeElements);
2818 VPValue *Cmp = Builder.createICmp(ICmpInst::ICMP_ULT, AVL, AVLSafe);
2819 AVL = Builder.createSelect(Cmp, AVL, AVLSafe, DebugLoc::getUnknown(),
2820 "safe_avl");
2821 }
2822 auto *VPEVL = Builder.createNaryOp(VPInstruction::ExplicitVectorLength, AVL,
2824
2825 auto *CanonicalIVIncrement =
2826 cast<VPInstruction>(CanonicalIVPHI->getBackedgeValue());
2827 Builder.setInsertPoint(CanonicalIVIncrement);
2828 VPValue *OpVPEVL = VPEVL;
2829
2830 auto *I32Ty = Type::getInt32Ty(Plan.getContext());
2831 OpVPEVL = Builder.createScalarZExtOrTrunc(
2832 OpVPEVL, CanIVTy, I32Ty, CanonicalIVIncrement->getDebugLoc());
2833
2834 auto *NextEVLIV = Builder.createOverflowingOp(
2835 Instruction::Add, {OpVPEVL, EVLPhi},
2836 {CanonicalIVIncrement->hasNoUnsignedWrap(),
2837 CanonicalIVIncrement->hasNoSignedWrap()},
2838 CanonicalIVIncrement->getDebugLoc(), "index.evl.next");
2839 EVLPhi->addOperand(NextEVLIV);
2840
2841 VPValue *NextAVL = Builder.createOverflowingOp(
2842 Instruction::Sub, {AVLPhi, OpVPEVL}, {/*hasNUW=*/true, /*hasNSW=*/false},
2843 DebugLoc::getCompilerGenerated(), "avl.next");
2844 AVLPhi->addOperand(NextAVL);
2845
2846 transformRecipestoEVLRecipes(Plan, *VPEVL);
2847
2848 // Replace all uses of VPCanonicalIVPHIRecipe by
2849 // VPEVLBasedIVPHIRecipe except for the canonical IV increment.
2850 CanonicalIVPHI->replaceAllUsesWith(EVLPhi);
2851 CanonicalIVIncrement->setOperand(0, CanonicalIVPHI);
2852 // TODO: support unroll factor > 1.
2853 Plan.setUF(1);
2854}
2855
2857 // Find EVL loop entries by locating VPEVLBasedIVPHIRecipe.
2858 // There should be only one EVL PHI in the entire plan.
2859 VPEVLBasedIVPHIRecipe *EVLPhi = nullptr;
2860
2863 for (VPRecipeBase &R : VPBB->phis())
2864 if (auto *PhiR = dyn_cast<VPEVLBasedIVPHIRecipe>(&R)) {
2865 assert(!EVLPhi && "Found multiple EVL PHIs. Only one expected");
2866 EVLPhi = PhiR;
2867 }
2868
2869 // Early return if no EVL PHI is found.
2870 if (!EVLPhi)
2871 return;
2872
2873 VPBasicBlock *HeaderVPBB = EVLPhi->getParent();
2874 VPValue *EVLIncrement = EVLPhi->getBackedgeValue();
2875 VPValue *AVL;
2876 [[maybe_unused]] bool FoundAVL =
2877 match(EVLIncrement,
2878 m_c_Add(m_ZExtOrSelf(m_EVL(m_VPValue(AVL))), m_Specific(EVLPhi)));
2879 assert(FoundAVL && "Didn't find AVL?");
2880
2881 // The AVL may be capped to a safe distance.
2882 VPValue *SafeAVL;
2883 if (match(AVL, m_Select(m_VPValue(), m_VPValue(SafeAVL), m_VPValue())))
2884 AVL = SafeAVL;
2885
2886 VPValue *AVLNext;
2887 [[maybe_unused]] bool FoundAVLNext =
2889 m_Specific(Plan.getTripCount()), m_VPValue(AVLNext)));
2890 assert(FoundAVLNext && "Didn't find AVL backedge?");
2891
2892 // Convert EVLPhi to concrete recipe.
2893 auto *ScalarR =
2894 VPBuilder(EVLPhi).createScalarPhi({EVLPhi->getStartValue(), EVLIncrement},
2895 EVLPhi->getDebugLoc(), "evl.based.iv");
2896 EVLPhi->replaceAllUsesWith(ScalarR);
2897 EVLPhi->eraseFromParent();
2898
2899 // Replace CanonicalIVInc with EVL-PHI increment.
2900 auto *CanonicalIV = cast<VPPhi>(&*HeaderVPBB->begin());
2901 VPValue *Backedge = CanonicalIV->getIncomingValue(1);
2902 assert(match(Backedge, m_c_Add(m_Specific(CanonicalIV),
2903 m_Specific(&Plan.getVFxUF()))) &&
2904 "Unexpected canonical iv");
2905 Backedge->replaceAllUsesWith(EVLIncrement);
2906
2907 // Remove unused phi and increment.
2908 VPRecipeBase *CanonicalIVIncrement = Backedge->getDefiningRecipe();
2909 CanonicalIVIncrement->eraseFromParent();
2910 CanonicalIV->eraseFromParent();
2911
2912 // Replace the use of VectorTripCount in the latch-exiting block.
2913 // Before: (branch-on-count EVLIVInc, VectorTripCount)
2914 // After: (branch-on-cond eq AVLNext, 0)
2915
2916 VPBasicBlock *LatchExiting =
2917 HeaderVPBB->getPredecessors()[1]->getEntryBasicBlock();
2918 auto *LatchExitingBr = cast<VPInstruction>(LatchExiting->getTerminator());
2919 // Skip single-iteration loop region
2920 if (match(LatchExitingBr, m_BranchOnCond(m_True())))
2921 return;
2922 assert(LatchExitingBr &&
2923 match(LatchExitingBr,
2924 m_BranchOnCount(m_VPValue(EVLIncrement),
2925 m_Specific(&Plan.getVectorTripCount()))) &&
2926 "Unexpected terminator in EVL loop");
2927
2928 Type *AVLTy = VPTypeAnalysis(Plan).inferScalarType(AVLNext);
2929 VPBuilder Builder(LatchExitingBr);
2930 VPValue *Cmp = Builder.createICmp(CmpInst::ICMP_EQ, AVLNext,
2931 Plan.getConstantInt(AVLTy, 0));
2932 Builder.createNaryOp(VPInstruction::BranchOnCond, Cmp);
2933 LatchExitingBr->eraseFromParent();
2934}
2935
2937 VPlan &Plan, PredicatedScalarEvolution &PSE,
2938 const DenseMap<Value *, const SCEV *> &StridesMap) {
 2939 // Replace VPValues for known constant strides guaranteed by predicated
 2940 // scalar evolution.
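  // For example, if versioning guarantees that %stride == 1, uses of %stride
  // (and of its sext/zext) in the vector loop region and its preheader are
  // replaced by the constant 1, and VPExpandSCEVRecipes are rewritten to use
  // the constant as well.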
2941 auto CanUseVersionedStride = [&Plan](VPUser &U, unsigned) {
2942 auto *R = cast<VPRecipeBase>(&U);
2943 return R->getRegion() ||
2944 R->getParent() == Plan.getVectorLoopRegion()->getSinglePredecessor();
2945 };
2946 ValueToSCEVMapTy RewriteMap;
2947 for (const SCEV *Stride : StridesMap.values()) {
2948 using namespace SCEVPatternMatch;
2949 auto *StrideV = cast<SCEVUnknown>(Stride)->getValue();
2950 const APInt *StrideConst;
2951 if (!match(PSE.getSCEV(StrideV), m_scev_APInt(StrideConst)))
2952 // Only handle constant strides for now.
2953 continue;
2954
2955 auto *CI = Plan.getConstantInt(*StrideConst);
2956 if (VPValue *StrideVPV = Plan.getLiveIn(StrideV))
2957 StrideVPV->replaceUsesWithIf(CI, CanUseVersionedStride);
2958
2959 // The versioned value may not be used in the loop directly but through a
2960 // sext/zext. Add new live-ins in those cases.
2961 for (Value *U : StrideV->users()) {
2963 continue;
2964 VPValue *StrideVPV = Plan.getLiveIn(U);
2965 if (!StrideVPV)
2966 continue;
2967 unsigned BW = U->getType()->getScalarSizeInBits();
2968 APInt C =
2969 isa<SExtInst>(U) ? StrideConst->sext(BW) : StrideConst->zext(BW);
2970 VPValue *CI = Plan.getConstantInt(C);
2971 StrideVPV->replaceUsesWithIf(CI, CanUseVersionedStride);
2972 }
2973 RewriteMap[StrideV] = PSE.getSCEV(StrideV);
2974 }
2975
2976 for (VPRecipeBase &R : *Plan.getEntry()) {
2977 auto *ExpSCEV = dyn_cast<VPExpandSCEVRecipe>(&R);
2978 if (!ExpSCEV)
2979 continue;
2980 const SCEV *ScevExpr = ExpSCEV->getSCEV();
2981 auto *NewSCEV =
2982 SCEVParameterRewriter::rewrite(ScevExpr, *PSE.getSE(), RewriteMap);
2983 if (NewSCEV != ScevExpr) {
2984 VPValue *NewExp = vputils::getOrCreateVPValueForSCEVExpr(Plan, NewSCEV);
2985 ExpSCEV->replaceAllUsesWith(NewExp);
2986 if (Plan.getTripCount() == ExpSCEV)
2987 Plan.resetTripCount(NewExp);
2988 }
2989 }
2990}
2991
2993 VPlan &Plan,
2994 const std::function<bool(BasicBlock *)> &BlockNeedsPredication) {
2995 // Collect recipes in the backward slice of `Root` that may generate a poison
2996 // value that is used after vectorization.
2998 auto CollectPoisonGeneratingInstrsInBackwardSlice([&](VPRecipeBase *Root) {
3000 Worklist.push_back(Root);
3001
3002 // Traverse the backward slice of Root through its use-def chain.
3003 while (!Worklist.empty()) {
3004 VPRecipeBase *CurRec = Worklist.pop_back_val();
3005
3006 if (!Visited.insert(CurRec).second)
3007 continue;
3008
3009 // Prune search if we find another recipe generating a widen memory
3010 // instruction. Widen memory instructions involved in address computation
3011 // will lead to gather/scatter instructions, which don't need to be
3012 // handled.
3014 VPHeaderPHIRecipe>(CurRec))
3015 continue;
3016
3017 // This recipe contributes to the address computation of a widen
3018 // load/store. If the underlying instruction has poison-generating flags,
3019 // drop them directly.
3020 if (auto *RecWithFlags = dyn_cast<VPRecipeWithIRFlags>(CurRec)) {
3021 VPValue *A, *B;
3022 // Dropping disjoint from an OR may yield incorrect results, as some
3023 // analysis may have converted it to an Add implicitly (e.g. SCEV used
3024 // for dependence analysis). Instead, replace it with an equivalent Add.
3025 // This is possible as all users of the disjoint OR only access lanes
3026 // where the operands are disjoint or poison otherwise.
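        // For example, (or disjoint %a, %b) feeding an address computation is
        // rewritten to (add %a, %b).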
3027 if (match(RecWithFlags, m_BinaryOr(m_VPValue(A), m_VPValue(B))) &&
3028 RecWithFlags->isDisjoint()) {
3029 VPBuilder Builder(RecWithFlags);
3030 VPInstruction *New = Builder.createOverflowingOp(
3031 Instruction::Add, {A, B}, {false, false},
3032 RecWithFlags->getDebugLoc());
3033 New->setUnderlyingValue(RecWithFlags->getUnderlyingValue());
3034 RecWithFlags->replaceAllUsesWith(New);
3035 RecWithFlags->eraseFromParent();
3036 CurRec = New;
3037 } else
3038 RecWithFlags->dropPoisonGeneratingFlags();
3039 } else {
3042 (void)Instr;
3043 assert((!Instr || !Instr->hasPoisonGeneratingFlags()) &&
3044 "found instruction with poison generating flags not covered by "
3045 "VPRecipeWithIRFlags");
3046 }
3047
3048 // Add new definitions to the worklist.
3049 for (VPValue *Operand : CurRec->operands())
3050 if (VPRecipeBase *OpDef = Operand->getDefiningRecipe())
3051 Worklist.push_back(OpDef);
3052 }
3053 });
3054
3055 // Traverse all the recipes in the VPlan and collect the poison-generating
3056 // recipes in the backward slice starting at the address of a VPWidenRecipe or
3057 // VPInterleaveRecipe.
3058 auto Iter = vp_depth_first_deep(Plan.getEntry());
3060 for (VPRecipeBase &Recipe : *VPBB) {
3061 if (auto *WidenRec = dyn_cast<VPWidenMemoryRecipe>(&Recipe)) {
3062 Instruction &UnderlyingInstr = WidenRec->getIngredient();
3063 VPRecipeBase *AddrDef = WidenRec->getAddr()->getDefiningRecipe();
3064 if (AddrDef && WidenRec->isConsecutive() &&
3065 BlockNeedsPredication(UnderlyingInstr.getParent()))
3066 CollectPoisonGeneratingInstrsInBackwardSlice(AddrDef);
3067 } else if (auto *InterleaveRec = dyn_cast<VPInterleaveRecipe>(&Recipe)) {
3068 VPRecipeBase *AddrDef = InterleaveRec->getAddr()->getDefiningRecipe();
3069 if (AddrDef) {
3070 // Check if any member of the interleave group needs predication.
3071 const InterleaveGroup<Instruction> *InterGroup =
3072 InterleaveRec->getInterleaveGroup();
3073 bool NeedPredication = false;
3074 for (int I = 0, NumMembers = InterGroup->getNumMembers();
3075 I < NumMembers; ++I) {
3076 Instruction *Member = InterGroup->getMember(I);
3077 if (Member)
3078 NeedPredication |= BlockNeedsPredication(Member->getParent());
3079 }
3080
3081 if (NeedPredication)
3082 CollectPoisonGeneratingInstrsInBackwardSlice(AddrDef);
3083 }
3084 }
3085 }
3086 }
3087}
3088
3090 VPlan &Plan,
3092 &InterleaveGroups,
3093 VPRecipeBuilder &RecipeBuilder, const bool &ScalarEpilogueAllowed) {
3094 if (InterleaveGroups.empty())
3095 return;
3096
3097 // Interleave memory: for each Interleave Group we marked earlier as relevant
3098 // for this VPlan, replace the Recipes widening its memory instructions with a
3099 // single VPInterleaveRecipe at its insertion point.
3100 VPDominatorTree VPDT(Plan);
3101 for (const auto *IG : InterleaveGroups) {
3102 auto *Start =
3103 cast<VPWidenMemoryRecipe>(RecipeBuilder.getRecipe(IG->getMember(0)));
3104 VPIRMetadata InterleaveMD(*Start);
3105 SmallVector<VPValue *, 4> StoredValues;
3106 if (auto *StoreR = dyn_cast<VPWidenStoreRecipe>(Start))
3107 StoredValues.push_back(StoreR->getStoredValue());
3108 for (unsigned I = 1; I < IG->getFactor(); ++I) {
3109 Instruction *MemberI = IG->getMember(I);
3110 if (!MemberI)
3111 continue;
3112 VPWidenMemoryRecipe *MemoryR =
3113 cast<VPWidenMemoryRecipe>(RecipeBuilder.getRecipe(MemberI));
3114 if (auto *StoreR = dyn_cast<VPWidenStoreRecipe>(MemoryR))
3115 StoredValues.push_back(StoreR->getStoredValue());
3116 InterleaveMD.intersect(*MemoryR);
3117 }
3118
3119 bool NeedsMaskForGaps =
3120 (IG->requiresScalarEpilogue() && !ScalarEpilogueAllowed) ||
3121 (!StoredValues.empty() && !IG->isFull());
3122
3123 Instruction *IRInsertPos = IG->getInsertPos();
3124 auto *InsertPos =
3125 cast<VPWidenMemoryRecipe>(RecipeBuilder.getRecipe(IRInsertPos));
3126
3128 if (auto *Gep = dyn_cast<GetElementPtrInst>(
3129 getLoadStorePointerOperand(IRInsertPos)->stripPointerCasts()))
3130 NW = Gep->getNoWrapFlags().withoutNoUnsignedWrap();
3131
3132 // Get or create the start address for the interleave group.
3133 VPValue *Addr = Start->getAddr();
3134 VPRecipeBase *AddrDef = Addr->getDefiningRecipe();
3135 if (AddrDef && !VPDT.properlyDominates(AddrDef, InsertPos)) {
3136 // We cannot re-use the address of member zero because it does not
3137 // dominate the insert position. Instead, use the address of the insert
3138 // position and create a PtrAdd adjusting it to the address of member
3139 // zero.
3140 // TODO: Hoist Addr's defining recipe (and any operands as needed) to
3141 // InsertPos or sink loads above zero members to join it.
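      // For example, if the insert position is member 2 of a group of i32
      // accesses, the group's start address is the insert position's address
      // minus 2 * 4 = 8 bytes.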
3142 assert(IG->getIndex(IRInsertPos) != 0 &&
3143 "index of insert position shouldn't be zero");
3144 auto &DL = IRInsertPos->getDataLayout();
3145 APInt Offset(32,
3146 DL.getTypeAllocSize(getLoadStoreType(IRInsertPos)) *
3147 IG->getIndex(IRInsertPos),
3148 /*IsSigned=*/true);
3149 VPValue *OffsetVPV = Plan.getConstantInt(-Offset);
3150 VPBuilder B(InsertPos);
3151 Addr = B.createNoWrapPtrAdd(InsertPos->getAddr(), OffsetVPV, NW);
3152 }
3153 // If the group is reverse, adjust the index to refer to the last vector
3154 // lane instead of the first. We adjust the index from the first vector
3155 // lane, rather than directly getting the pointer for lane VF - 1, because
3156 // the pointer operand of the interleaved access is supposed to be uniform.
3157 if (IG->isReverse()) {
3158 auto *ReversePtr = new VPVectorEndPointerRecipe(
3159 Addr, &Plan.getVF(), getLoadStoreType(IRInsertPos),
3160 -(int64_t)IG->getFactor(), NW, InsertPos->getDebugLoc());
3161 ReversePtr->insertBefore(InsertPos);
3162 Addr = ReversePtr;
3163 }
3164 auto *VPIG = new VPInterleaveRecipe(IG, Addr, StoredValues,
3165 InsertPos->getMask(), NeedsMaskForGaps,
3166 InterleaveMD, InsertPos->getDebugLoc());
3167 VPIG->insertBefore(InsertPos);
3168
3169 unsigned J = 0;
3170 for (unsigned i = 0; i < IG->getFactor(); ++i)
3171 if (Instruction *Member = IG->getMember(i)) {
3172 VPRecipeBase *MemberR = RecipeBuilder.getRecipe(Member);
3173 if (!Member->getType()->isVoidTy()) {
3174 VPValue *OriginalV = MemberR->getVPSingleValue();
3175 OriginalV->replaceAllUsesWith(VPIG->getVPValue(J));
3176 J++;
3177 }
3178 MemberR->eraseFromParent();
3179 }
3180 }
3181}
3182
3183/// Expand a VPWidenIntOrFpInduction into executable recipes, for the initial
3184/// value, phi and backedge value. In the following example:
3185///
3186/// vector.ph:
3187/// Successor(s): vector loop
3188///
3189/// <x1> vector loop: {
3190/// vector.body:
3191/// WIDEN-INDUCTION %i = phi %start, %step, %vf
3192/// ...
3193/// EMIT branch-on-count ...
3194/// No successors
3195/// }
3196///
3197/// WIDEN-INDUCTION will get expanded to:
3198///
3199/// vector.ph:
3200/// ...
3201/// vp<%induction.start> = ...
3202/// vp<%induction.increment> = ...
3203///
3204/// Successor(s): vector loop
3205///
3206/// <x1> vector loop: {
3207/// vector.body:
3208/// ir<%i> = WIDEN-PHI vp<%induction.start>, vp<%vec.ind.next>
3209/// ...
3210/// vp<%vec.ind.next> = add ir<%i>, vp<%induction.increment>
3211/// EMIT branch-on-count ...
3212/// No successors
3213/// }
3214static void
3215expandVPWidenIntOrFpInduction(VPWidenIntOrFpInductionRecipe *WidenIVR,
3216 VPTypeAnalysis &TypeInfo) {
3217 VPlan *Plan = WidenIVR->getParent()->getPlan();
3218 VPValue *Start = WidenIVR->getStartValue();
3219 VPValue *Step = WidenIVR->getStepValue();
3220 VPValue *VF = WidenIVR->getVFValue();
3221 DebugLoc DL = WidenIVR->getDebugLoc();
3222
3223 // The value from the original loop to which we are mapping the new induction
3224 // variable.
3225 Type *Ty = TypeInfo.inferScalarType(WidenIVR);
3226
3227 const InductionDescriptor &ID = WidenIVR->getInductionDescriptor();
3230 // FIXME: The newly created binary instructions should contain nsw/nuw
3231 // flags, which can be found from the original scalar operations.
3232 VPIRFlags Flags;
3233 if (ID.getKind() == InductionDescriptor::IK_IntInduction) {
3234 AddOp = Instruction::Add;
3235 MulOp = Instruction::Mul;
3236 } else {
3237 AddOp = ID.getInductionOpcode();
3238 MulOp = Instruction::FMul;
3239 Flags = ID.getInductionBinOp()->getFastMathFlags();
3240 }
3241
3242 // If the phi is truncated, truncate the start and step values.
3243 VPBuilder Builder(Plan->getVectorPreheader());
3244 Type *StepTy = TypeInfo.inferScalarType(Step);
3245 if (Ty->getScalarSizeInBits() < StepTy->getScalarSizeInBits()) {
3246 assert(StepTy->isIntegerTy() && "Truncation requires an integer type");
3247 Step = Builder.createScalarCast(Instruction::Trunc, Step, Ty, DL);
3248 Start = Builder.createScalarCast(Instruction::Trunc, Start, Ty, DL);
3249 StepTy = Ty;
3250 }
3251
3252 // Construct the initial value of the vector IV in the vector loop preheader.
3253 Type *IVIntTy =
3255 VPValue *Init = Builder.createNaryOp(VPInstruction::StepVector, {}, IVIntTy);
3256 if (StepTy->isFloatingPointTy())
3257 Init = Builder.createWidenCast(Instruction::UIToFP, Init, StepTy);
3258
3259 VPValue *SplatStart = Builder.createNaryOp(VPInstruction::Broadcast, Start);
3260 VPValue *SplatStep = Builder.createNaryOp(VPInstruction::Broadcast, Step);
3261
3262 Init = Builder.createNaryOp(MulOp, {Init, SplatStep}, Flags);
3263 Init = Builder.createNaryOp(AddOp, {SplatStart, Init}, Flags,
3264 DebugLoc::getUnknown(), "induction");
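 // Illustrative example (hypothetical values): for an integer IV with start 7,
 // step 3 and a fixed VF of 4, the sequence above yields
 // <0,1,2,3> * splat(3) = <0,3,6,9> and splat(7) + <0,3,6,9> = <7,10,13,16>.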
3265
3266 // Create the widened phi of the vector IV.
3267 auto *WidePHI = new VPWidenPHIRecipe(WidenIVR->getPHINode(), Init,
3268 WidenIVR->getDebugLoc(), "vec.ind");
3269 WidePHI->insertBefore(WidenIVR);
3270
3271 // Create the backedge value for the vector IV.
3272 VPValue *Inc;
3273 VPValue *Prev;
3274 // If unrolled, use the increment and prev value from the operands.
3275 if (auto *SplatVF = WidenIVR->getSplatVFValue()) {
3276 Inc = SplatVF;
3277 Prev = WidenIVR->getLastUnrolledPartOperand();
3278 } else {
3279 if (VPRecipeBase *R = VF->getDefiningRecipe())
3280 Builder.setInsertPoint(R->getParent(), std::next(R->getIterator()));
3281 // Multiply the vectorization factor by the step using integer or
3282 // floating-point arithmetic as appropriate.
3283 if (StepTy->isFloatingPointTy())
3284 VF = Builder.createScalarCast(Instruction::CastOps::UIToFP, VF, StepTy,
3285 DL);
3286 else
3287 VF = Builder.createScalarZExtOrTrunc(VF, StepTy,
3288 TypeInfo.inferScalarType(VF), DL);
3289
3290 Inc = Builder.createNaryOp(MulOp, {Step, VF}, Flags);
3291 Inc = Builder.createNaryOp(VPInstruction::Broadcast, Inc);
3292 Prev = WidePHI;
3293 }
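 // Illustrative example (hypothetical values): with step 3 and a runtime VF of
 // 4, the backedge increment becomes broadcast(3 * 4) = splat(12), which is
 // added to the vector IV below.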
3294
3296 Builder.setInsertPoint(ExitingBB, ExitingBB->getTerminator()->getIterator());
3297 auto *Next = Builder.createNaryOp(AddOp, {Prev, Inc}, Flags,
3298 WidenIVR->getDebugLoc(), "vec.ind.next");
3299
3300 WidePHI->addOperand(Next);
3301
3302 WidenIVR->replaceAllUsesWith(WidePHI);
3303}
3304
3305/// Expand a VPWidenPointerInductionRecipe into executable recipes, for the
3306/// initial value, phi and backedge value. In the following example:
3307///
3308/// <x1> vector loop: {
3309/// vector.body:
3310/// EMIT ir<%ptr.iv> = WIDEN-POINTER-INDUCTION %start, %step, %vf
3311/// ...
3312/// EMIT branch-on-count ...
3313/// }
3314///
3315/// WIDEN-POINTER-INDUCTION will get expanded to:
3316///
3317/// <x1> vector loop: {
3318/// vector.body:
3319/// EMIT-SCALAR %pointer.phi = phi %start, %ptr.ind
3320/// EMIT %mul = mul %stepvector, %step
3321/// EMIT %vector.gep = wide-ptradd %pointer.phi, %mul
3322/// ...
3323/// EMIT %ptr.ind = ptradd %pointer.phi, %vf
3324/// EMIT branch-on-count ...
3325/// }
3326static void expandVPWidenPointerInduction(VPWidenPointerInductionRecipe *R,
3327 VPTypeAnalysis &TypeInfo) {
3328 VPlan *Plan = R->getParent()->getPlan();
3329 VPValue *Start = R->getStartValue();
3330 VPValue *Step = R->getStepValue();
3331 VPValue *VF = R->getVFValue();
3332
3333 assert(R->getInductionDescriptor().getKind() ==
3335 "Not a pointer induction according to InductionDescriptor!");
3336 assert(TypeInfo.inferScalarType(R)->isPointerTy() && "Unexpected type.");
3337 assert(!R->onlyScalarsGenerated(Plan->hasScalableVF()) &&
3338 "Recipe should have been replaced");
3339
3340 VPBuilder Builder(R);
3341 DebugLoc DL = R->getDebugLoc();
3342
3343 // Build a scalar pointer phi.
3344 VPPhi *ScalarPtrPhi = Builder.createScalarPhi(Start, DL, "pointer.phi");
3345
3346 // Create actual address geps that use the pointer phi as base and a
3347 // vectorized version of the step value (<step*0, ..., step*N>) as offset.
3348 Builder.setInsertPoint(R->getParent(), R->getParent()->getFirstNonPhi());
3349 Type *StepTy = TypeInfo.inferScalarType(Step);
3350 VPValue *Offset = Builder.createNaryOp(VPInstruction::StepVector, {}, StepTy);
3351 Offset = Builder.createNaryOp(Instruction::Mul, {Offset, Step});
3352 VPValue *PtrAdd = Builder.createNaryOp(
3353 VPInstruction::WidePtrAdd, {ScalarPtrPhi, Offset}, DL, "vector.gep");
3354 R->replaceAllUsesWith(PtrAdd);
3355
3356 // Create the backedge value for the scalar pointer phi.
3358 Builder.setInsertPoint(ExitingBB, ExitingBB->getTerminator()->getIterator());
3359 VF = Builder.createScalarZExtOrTrunc(VF, StepTy, TypeInfo.inferScalarType(VF),
3360 DL);
3361 VPValue *Inc = Builder.createNaryOp(Instruction::Mul, {Step, VF});
3362
3363 VPValue *InductionGEP =
3364 Builder.createPtrAdd(ScalarPtrPhi, Inc, DL, "ptr.ind");
3365 ScalarPtrPhi->addOperand(InductionGEP);
3366}
3367
3369 // Replace loop regions with explicit CFG.
3370 SmallVector<VPRegionBlock *> LoopRegions;
3372 vp_depth_first_deep(Plan.getEntry()))) {
3373 if (!R->isReplicator())
3374 LoopRegions.push_back(R);
3375 }
3376 for (VPRegionBlock *R : LoopRegions)
3377 R->dissolveToCFGLoop();
3378}
3379
3381 VPTypeAnalysis TypeInfo(Plan);
3384 vp_depth_first_deep(Plan.getEntry()))) {
3385 for (VPRecipeBase &R : make_early_inc_range(*VPBB)) {
3386 if (auto *WidenIVR = dyn_cast<VPWidenIntOrFpInductionRecipe>(&R)) {
3387 expandVPWidenIntOrFpInduction(WidenIVR, TypeInfo);
3388 ToRemove.push_back(WidenIVR);
3389 continue;
3390 }
3391
3392 if (auto *WidenIVR = dyn_cast<VPWidenPointerInductionRecipe>(&R)) {
3393 expandVPWidenPointerInduction(WidenIVR, TypeInfo);
3394 ToRemove.push_back(WidenIVR);
3395 continue;
3396 }
3397
3398 // Expand VPBlendRecipe into VPInstruction::Select.
3399 VPBuilder Builder(&R);
3400 if (auto *Blend = dyn_cast<VPBlendRecipe>(&R)) {
3401 VPValue *Select = Blend->getIncomingValue(0);
3402 for (unsigned I = 1; I != Blend->getNumIncomingValues(); ++I)
3403 Select = Builder.createSelect(Blend->getMask(I),
3404 Blend->getIncomingValue(I), Select,
3405 R.getDebugLoc(), "predphi");
3406 Blend->replaceAllUsesWith(Select);
3407 ToRemove.push_back(Blend);
3408 }
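 // Illustrative example: a blend of incoming values v0, v1, v2 with masks m1
 // and m2 is expanded to select(m2, v2, select(m1, v1, v0)).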
3409
3410 if (auto *Expr = dyn_cast<VPExpressionRecipe>(&R)) {
3411 Expr->decompose();
3412 ToRemove.push_back(Expr);
3413 }
3414
3415 VPValue *VectorStep;
3416 VPValue *ScalarStep;
3418 m_VPValue(VectorStep), m_VPValue(ScalarStep))))
3419 continue;
3420
3421 // Expand WideIVStep.
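 // Illustrative example (hypothetical values): for an i32 IV with scalar step
 // 2, a vector step of <0,1,2,3> is conceptually expanded to
 // <0,1,2,3> * 2 = <0,2,4,6> by the multiply emitted below.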
3422 auto *VPI = cast<VPInstruction>(&R);
3423 Type *IVTy = TypeInfo.inferScalarType(VPI);
3424 if (TypeInfo.inferScalarType(VectorStep) != IVTy) {
3426 ? Instruction::UIToFP
3427 : Instruction::Trunc;
3428 VectorStep = Builder.createWidenCast(CastOp, VectorStep, IVTy);
3429 }
3430
3431 assert(!match(ScalarStep, m_One()) && "Expected non-unit scalar-step");
3432 if (TypeInfo.inferScalarType(ScalarStep) != IVTy) {
3433 ScalarStep =
3434 Builder.createWidenCast(Instruction::Trunc, ScalarStep, IVTy);
3435 }
3436
3437 VPIRFlags Flags;
3438 if (IVTy->isFloatingPointTy())
3439 Flags = {VPI->getFastMathFlags()};
3440
3441 unsigned MulOpc =
3442 IVTy->isFloatingPointTy() ? Instruction::FMul : Instruction::Mul;
3443 VPInstruction *Mul = Builder.createNaryOp(
3444 MulOpc, {VectorStep, ScalarStep}, Flags, R.getDebugLoc());
3445 VectorStep = Mul;
3446 VPI->replaceAllUsesWith(VectorStep);
3447 ToRemove.push_back(VPI);
3448 }
3449 }
3450
3451 for (VPRecipeBase *R : ToRemove)
3452 R->eraseFromParent();
3453}
3454
3455void VPlanTransforms::handleUncountableEarlyExit(VPBasicBlock *EarlyExitingVPBB,
3456 VPBasicBlock *EarlyExitVPBB,
3457 VPlan &Plan,
3458 VPBasicBlock *HeaderVPBB,
3459 VPBasicBlock *LatchVPBB) {
3460 VPBlockBase *MiddleVPBB = LatchVPBB->getSuccessors()[0];
3461 if (!EarlyExitVPBB->getSinglePredecessor() &&
3462 EarlyExitVPBB->getPredecessors()[1] == MiddleVPBB) {
3463 assert(EarlyExitVPBB->getNumPredecessors() == 2 &&
3464 EarlyExitVPBB->getPredecessors()[0] == EarlyExitingVPBB &&
3465 "unsupported early exit VPBB");
3466 // Early exit operand should always be last phi operand. If EarlyExitVPBB
3467 // has two predecessors and EarlyExitingVPBB is the first, swap the operands
3468 // of the phis.
3469 for (VPRecipeBase &R : EarlyExitVPBB->phis())
3470 cast<VPIRPhi>(&R)->swapOperands();
3471 }
3472
3473 VPBuilder Builder(LatchVPBB->getTerminator());
3474 VPBlockBase *TrueSucc = EarlyExitingVPBB->getSuccessors()[0];
3475 assert(match(EarlyExitingVPBB->getTerminator(), m_BranchOnCond()) &&
3476 "Terminator must be be BranchOnCond");
3477 VPValue *CondOfEarlyExitingVPBB =
3478 EarlyExitingVPBB->getTerminator()->getOperand(0);
3479 auto *CondToEarlyExit = TrueSucc == EarlyExitVPBB
3480 ? CondOfEarlyExitingVPBB
3481 : Builder.createNot(CondOfEarlyExitingVPBB);
3482
3483 // Split the middle block and have it conditionally branch to the early exit
3484 // block if CondToEarlyExit.
3485 VPValue *IsEarlyExitTaken =
3486 Builder.createNaryOp(VPInstruction::AnyOf, {CondToEarlyExit});
3487 VPBasicBlock *NewMiddle = Plan.createVPBasicBlock("middle.split");
3488 VPBasicBlock *VectorEarlyExitVPBB =
3489 Plan.createVPBasicBlock("vector.early.exit");
3490 VPBlockUtils::insertOnEdge(LatchVPBB, MiddleVPBB, NewMiddle);
3491 VPBlockUtils::connectBlocks(NewMiddle, VectorEarlyExitVPBB);
3492 NewMiddle->swapSuccessors();
3493
3494 VPBlockUtils::connectBlocks(VectorEarlyExitVPBB, EarlyExitVPBB);
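 // The resulting structure is roughly:
 // latch -> middle.split
 // middle.split -> vector.early.exit (if any lane took the early exit)
 // middle.split -> original middle block (otherwise)
 // vector.early.exit -> early exit block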
3495
3496 // Update the exit phis in the early exit block.
3497 VPBuilder MiddleBuilder(NewMiddle);
3498 VPBuilder EarlyExitB(VectorEarlyExitVPBB);
3499 for (VPRecipeBase &R : EarlyExitVPBB->phis()) {
3500 auto *ExitIRI = cast<VPIRPhi>(&R);
3501 // Early exit operand should always be last, i.e., 0 if EarlyExitVPBB has
3502 // a single predecessor and 1 if it has two.
3503 unsigned EarlyExitIdx = ExitIRI->getNumOperands() - 1;
3504 if (ExitIRI->getNumOperands() != 1) {
3505 // The first of two operands corresponds to the latch exit, via MiddleVPBB
3506 // predecessor. Extract its last lane.
3507 ExitIRI->extractLastLaneOfFirstOperand(MiddleBuilder);
3508 }
3509
3510 VPValue *IncomingFromEarlyExit = ExitIRI->getOperand(EarlyExitIdx);
3511 if (!IncomingFromEarlyExit->isLiveIn()) {
3512 // Update the incoming value from the early exit.
3513 VPValue *FirstActiveLane = EarlyExitB.createNaryOp(
3514 VPInstruction::FirstActiveLane, {CondToEarlyExit}, nullptr,
3515 "first.active.lane");
3516 IncomingFromEarlyExit = EarlyExitB.createNaryOp(
3517 VPInstruction::ExtractLane, {FirstActiveLane, IncomingFromEarlyExit},
3518 nullptr, "early.exit.value");
3519 ExitIRI->setOperand(EarlyExitIdx, IncomingFromEarlyExit);
3520 }
3521 }
3522 MiddleBuilder.createNaryOp(VPInstruction::BranchOnCond, {IsEarlyExitTaken});
3523
3524 // Replace the condition controlling the non-early exit from the vector loop
3525 // with one exiting if either the original condition of the vector latch is
3526 // true or the early exit has been taken.
3527 auto *LatchExitingBranch = cast<VPInstruction>(LatchVPBB->getTerminator());
3528 assert(LatchExitingBranch->getOpcode() == VPInstruction::BranchOnCount &&
3529 "Unexpected terminator");
3530 auto *IsLatchExitTaken =
3531 Builder.createICmp(CmpInst::ICMP_EQ, LatchExitingBranch->getOperand(0),
3532 LatchExitingBranch->getOperand(1));
3533 auto *AnyExitTaken = Builder.createNaryOp(
3534 Instruction::Or, {IsEarlyExitTaken, IsLatchExitTaken});
3535 Builder.createNaryOp(VPInstruction::BranchOnCond, AnyExitTaken);
3536 LatchExitingBranch->eraseFromParent();
3537}
3538
3539/// This function tries to convert extended in-loop reductions to
3540/// VPExpressionRecipe and clamp the \p Range if it is beneficial and
3541/// valid. The created recipe must be decomposed to its constituent
3542/// recipes before execution.
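/// For example (illustrative), reduce.add(zext(<16 x i8> %a to <16 x i32>)) can
/// be bundled so it is costed as a single extended reduction instead of a
/// separate extend plus reduction.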
3543static VPExpressionRecipe *
3544tryToMatchAndCreateExtendedReduction(VPReductionRecipe *Red, VPCostContext &Ctx,
3545 VFRange &Range) {
3546 Type *RedTy = Ctx.Types.inferScalarType(Red);
3547 VPValue *VecOp = Red->getVecOp();
3548
3549 // Clamp the range if using extended-reduction is profitable.
3550 auto IsExtendedRedValidAndClampRange =
3551 [&](unsigned Opcode, Instruction::CastOps ExtOpc, Type *SrcTy) -> bool {
3553 [&](ElementCount VF) {
3554 auto *SrcVecTy = cast<VectorType>(toVectorTy(SrcTy, VF));
3556
3557 InstructionCost ExtRedCost;
3558 InstructionCost ExtCost =
3559 cast<VPWidenCastRecipe>(VecOp)->computeCost(VF, Ctx);
3560 InstructionCost RedCost = Red->computeCost(VF, Ctx);
3561
3565 // FIXME: Move partial reduction creation, costing and clamping
3566 // here from LoopVectorize.cpp.
3567 ExtRedCost = Ctx.TTI.getPartialReductionCost(
3568 Opcode, SrcTy, nullptr, RedTy, VF, ExtKind,
3569 llvm::TargetTransformInfo::PR_None, std::nullopt, Ctx.CostKind);
3570 } else {
3571 ExtRedCost = Ctx.TTI.getExtendedReductionCost(
3572 Opcode, ExtOpc == Instruction::CastOps::ZExt, RedTy, SrcVecTy,
3573 Red->getFastMathFlags(), CostKind);
3574 }
3575 return ExtRedCost.isValid() && ExtRedCost < ExtCost + RedCost;
3576 },
3577 Range);
3578 };
3579
3580 VPValue *A;
3581 // Match reduce(ext)).
3582 if (match(VecOp, m_ZExtOrSExt(m_VPValue(A))) &&
3583 IsExtendedRedValidAndClampRange(
3584 RecurrenceDescriptor::getOpcode(Red->getRecurrenceKind()),
3585 cast<VPWidenCastRecipe>(VecOp)->getOpcode(),
3586 Ctx.Types.inferScalarType(A)))
3587 return new VPExpressionRecipe(cast<VPWidenCastRecipe>(VecOp), Red);
3588
3589 return nullptr;
3590}
3591
3592/// This function tries to convert in-loop multiply-accumulate reductions (possibly with extended operands) to
3593/// VPExpressionRecipe and clamp the \p Range if it is beneficial
3594/// and valid. The created VPExpressionRecipe must be decomposed to its
3595/// constituent recipes before execution. Patterns of the
3596/// VPExpressionRecipe:
3597/// reduce.add(mul(...)),
3598/// reduce.add(mul(ext(A), ext(B))),
3599/// reduce.add(ext(mul(ext(A), ext(B)))).
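/// For example (illustrative), a dot-product style
/// reduce.add(mul(zext(<16 x i8> %a), zext(<16 x i8> %b))) may be bundled and
/// costed as a single multiply-accumulate reduction.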
3600static VPExpressionRecipe *
3601tryToMatchAndCreateMulAccumulateReduction(VPReductionRecipe *Red,
3602 VPCostContext &Ctx, VFRange &Range) {
3603 bool IsPartialReduction = isa<VPPartialReductionRecipe>(Red);
3604
3605 unsigned Opcode = RecurrenceDescriptor::getOpcode(Red->getRecurrenceKind());
3606 if (Opcode != Instruction::Add && Opcode != Instruction::Sub)
3607 return nullptr;
3608
3609 Type *RedTy = Ctx.Types.inferScalarType(Red);
3610
3611 // Clamp the range if using multiply-accumulate-reduction is profitable.
3612 auto IsMulAccValidAndClampRange =
3614 VPWidenCastRecipe *OuterExt) -> bool {
3616 [&](ElementCount VF) {
3618 Type *SrcTy =
3619 Ext0 ? Ctx.Types.inferScalarType(Ext0->getOperand(0)) : RedTy;
3620 InstructionCost MulAccCost;
3621
3622 if (IsPartialReduction) {
3623 Type *SrcTy2 =
3624 Ext1 ? Ctx.Types.inferScalarType(Ext1->getOperand(0)) : nullptr;
3625 // FIXME: Move partial reduction creation, costing and clamping
3626 // here from LoopVectorize.cpp.
3627 MulAccCost = Ctx.TTI.getPartialReductionCost(
3628 Opcode, SrcTy, SrcTy2, RedTy, VF,
3630 Ext0->getOpcode())
3633 Ext1->getOpcode())
3635 Mul->getOpcode(), CostKind);
3636 } else {
3637 // Only partial reductions support mixed extends at the moment.
3638 if (Ext0 && Ext1 && Ext0->getOpcode() != Ext1->getOpcode())
3639 return false;
3640
3641 bool IsZExt =
3642 !Ext0 || Ext0->getOpcode() == Instruction::CastOps::ZExt;
3643 auto *SrcVecTy = cast<VectorType>(toVectorTy(SrcTy, VF));
3644 MulAccCost = Ctx.TTI.getMulAccReductionCost(IsZExt, Opcode, RedTy,
3645 SrcVecTy, CostKind);
3646 }
3647
3648 InstructionCost MulCost = Mul->computeCost(VF, Ctx);
3649 InstructionCost RedCost = Red->computeCost(VF, Ctx);
3650 InstructionCost ExtCost = 0;
3651 if (Ext0)
3652 ExtCost += Ext0->computeCost(VF, Ctx);
3653 if (Ext1)
3654 ExtCost += Ext1->computeCost(VF, Ctx);
3655 if (OuterExt)
3656 ExtCost += OuterExt->computeCost(VF, Ctx);
3657
3658 return MulAccCost.isValid() &&
3659 MulAccCost < ExtCost + MulCost + RedCost;
3660 },
3661 Range);
3662 };
3663
3664 VPValue *VecOp = Red->getVecOp();
3665 VPRecipeBase *Sub = nullptr;
3666 VPValue *A, *B;
3667 VPValue *Tmp = nullptr;
3668 // Sub reductions could have a sub between the add reduction and vec op.
3669 if (match(VecOp, m_Sub(m_ZeroInt(), m_VPValue(Tmp)))) {
3670 Sub = VecOp->getDefiningRecipe();
3671 VecOp = Tmp;
3672 }
3673
3674 // If ValB is a constant and can be safely extended, truncate it to the same
3675 // type as ExtA's operand, then extend it to the same type as ExtA. This
3676 // creates two uniform extends that can more easily be matched by the rest of
3677 // the bundling code. The ExtB reference, ValB and operand 1 of Mul are all
3678 // replaced with the new extend of the constant.
3679 auto ExtendAndReplaceConstantOp = [&Ctx](VPWidenCastRecipe *ExtA,
3680 VPWidenCastRecipe *&ExtB,
3681 VPValue *&ValB, VPWidenRecipe *Mul) {
3682 if (!ExtA || ExtB || !ValB->isLiveIn())
3683 return;
3684 Type *NarrowTy = Ctx.Types.inferScalarType(ExtA->getOperand(0));
3685 Instruction::CastOps ExtOpc = ExtA->getOpcode();
3686 const APInt *Const;
3687 if (!match(ValB, m_APInt(Const)) ||
3689 Const, NarrowTy, TTI::getPartialReductionExtendKind(ExtOpc)))
3690 return;
3691 // The truncate ensures that the type of each extended operand is the
3692 // same, and it's been proven that the constant can be extended from
3693 // NarrowTy safely. Necessary since ExtA's extended operand would be
3694 // e.g. an i8, while the const will likely be an i32. This will be
3695 // elided by later optimisations.
3696 VPBuilder Builder(Mul);
3697 auto *Trunc =
3698 Builder.createWidenCast(Instruction::CastOps::Trunc, ValB, NarrowTy);
3699 Type *WideTy = Ctx.Types.inferScalarType(ExtA);
3700 ValB = ExtB = Builder.createWidenCast(ExtOpc, Trunc, WideTy);
3701 Mul->setOperand(1, ExtB);
3702 };
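 // Illustrative example (hypothetical values): reduce.add(mul(zext(i8 %a to
 // i32), i32 42)) becomes reduce.add(mul(zext(%a), zext(trunc(42 to i8) to
 // i32))), provided the constant is proven extendable from the narrow type.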
3703
3704 // Try to match reduce.add(mul(...)).
3705 if (match(VecOp, m_Mul(m_VPValue(A), m_VPValue(B)))) {
3706 auto *RecipeA =
3707 dyn_cast_if_present<VPWidenCastRecipe>(A->getDefiningRecipe());
3708 auto *RecipeB =
3709 dyn_cast_if_present<VPWidenCastRecipe>(B->getDefiningRecipe());
3710 auto *Mul = cast<VPWidenRecipe>(VecOp->getDefiningRecipe());
3711
3712 // Convert reduce.add(mul(ext, const)) to reduce.add(mul(ext, ext(const)))
3713 ExtendAndReplaceConstantOp(RecipeA, RecipeB, B, Mul);
3714
3715 // Match reduce.add/sub(mul(ext, ext)).
3716 if (RecipeA && RecipeB && match(RecipeA, m_ZExtOrSExt(m_VPValue())) &&
3717 match(RecipeB, m_ZExtOrSExt(m_VPValue())) &&
3718 IsMulAccValidAndClampRange(Mul, RecipeA, RecipeB, nullptr)) {
3719 if (Sub)
3720 return new VPExpressionRecipe(RecipeA, RecipeB, Mul,
3721 cast<VPWidenRecipe>(Sub), Red);
3722 return new VPExpressionRecipe(RecipeA, RecipeB, Mul, Red);
3723 }
3724 // TODO: Add an expression type for this variant with a negated mul
3725 if (!Sub && IsMulAccValidAndClampRange(Mul, nullptr, nullptr, nullptr))
3726 return new VPExpressionRecipe(Mul, Red);
3727 }
3728 // TODO: Add an expression type for negated versions of other expression
3729 // variants.
3730 if (Sub)
3731 return nullptr;
3732
3733 // Match reduce.add(ext(mul(A, B))).
3734 if (match(VecOp, m_ZExtOrSExt(m_Mul(m_VPValue(A), m_VPValue(B))))) {
3735 auto *Ext = cast<VPWidenCastRecipe>(VecOp->getDefiningRecipe());
3736 auto *Mul = cast<VPWidenRecipe>(Ext->getOperand(0)->getDefiningRecipe());
3737 auto *Ext0 = dyn_cast_if_present<VPWidenCastRecipe>(A->getDefiningRecipe());
3738 auto *Ext1 = dyn_cast_if_present<VPWidenCastRecipe>(B->getDefiningRecipe());
3739
3740 // reduce.add(ext(mul(ext, const)))
3741 // -> reduce.add(ext(mul(ext, ext(const))))
3742 ExtendAndReplaceConstantOp(Ext0, Ext1, B, Mul);
3743
3744 // reduce.add(ext(mul(ext(A), ext(B))))
3745 // -> reduce.add(mul(wider_ext(A), wider_ext(B)))
3746 // The inner extends must either have the same opcode as the outer extend,
3747 // or be the same recipe (in which case the multiply can never produce a
3748 // negative value); in both cases the outer extend can be folded away by
3749 // extending the operands of the mul directly to the wider type.
3750 if (Ext0 && Ext1 &&
3751 (Ext->getOpcode() == Ext0->getOpcode() || Ext0 == Ext1) &&
3752 Ext0->getOpcode() == Ext1->getOpcode() &&
3753 IsMulAccValidAndClampRange(Mul, Ext0, Ext1, Ext) && Mul->hasOneUse()) {
3754 auto *NewExt0 = new VPWidenCastRecipe(
3755 Ext0->getOpcode(), Ext0->getOperand(0), Ext->getResultType(), *Ext0,
3756 *Ext0, Ext0->getDebugLoc());
3757 NewExt0->insertBefore(Ext0);
3758
3759 VPWidenCastRecipe *NewExt1 = NewExt0;
3760 if (Ext0 != Ext1) {
3761 NewExt1 = new VPWidenCastRecipe(Ext1->getOpcode(), Ext1->getOperand(0),
3762 Ext->getResultType(), *Ext1, *Ext1,
3763 Ext1->getDebugLoc());
3764 NewExt1->insertBefore(Ext1);
3765 }
3766 Mul->setOperand(0, NewExt0);
3767 Mul->setOperand(1, NewExt1);
3768 Red->setOperand(1, Mul);
3769 return new VPExpressionRecipe(NewExt0, NewExt1, Mul, Red);
3770 }
3771 }
3772 return nullptr;
3773}
3774
3775/// This function tries to create abstract recipes from the reduction recipe for
3776/// the following optimizations and cost estimation.
3777static void tryToCreateAbstractReductionRecipe(VPReductionRecipe *Red,
3778 VPCostContext &Ctx,
3779 VFRange &Range) {
3780 VPExpressionRecipe *AbstractR = nullptr;
3781 auto IP = std::next(Red->getIterator());
3782 auto *VPBB = Red->getParent();
3783 if (auto *MulAcc = tryToMatchAndCreateMulAccumulateReduction(Red, Ctx, Range))
3784 AbstractR = MulAcc;
3785 else if (auto *ExtRed = tryToMatchAndCreateExtendedReduction(Red, Ctx, Range))
3786 AbstractR = ExtRed;
3787 // Cannot create abstract inloop reduction recipes.
3788 if (!AbstractR)
3789 return;
3790
3791 AbstractR->insertBefore(*VPBB, IP);
3792 Red->replaceAllUsesWith(AbstractR);
3793}
3794
3805
3806void VPlanTransforms::materializeBroadcasts(VPlan &Plan) {
3807 if (Plan.hasScalarVFOnly())
3808 return;
3809
3810#ifndef NDEBUG
3811 VPDominatorTree VPDT(Plan);
3812#endif
3813
3814 SmallVector<VPValue *> VPValues;
3817 append_range(VPValues, Plan.getLiveIns());
3818 for (VPRecipeBase &R : *Plan.getEntry())
3819 append_range(VPValues, R.definedValues());
3820
3821 auto *VectorPreheader = Plan.getVectorPreheader();
3822 for (VPValue *VPV : VPValues) {
3824 (VPV->isLiveIn() && VPV->getLiveInIRValue() &&
3825 isa<Constant>(VPV->getLiveInIRValue())))
3826 continue;
3827
3828 // Add explicit broadcast at the insert point that dominates all users.
3829 VPBasicBlock *HoistBlock = VectorPreheader;
3830 VPBasicBlock::iterator HoistPoint = VectorPreheader->end();
3831 for (VPUser *User : VPV->users()) {
3832 if (User->usesScalars(VPV))
3833 continue;
3834 if (cast<VPRecipeBase>(User)->getParent() == VectorPreheader)
3835 HoistPoint = HoistBlock->begin();
3836 else
3837 assert(VPDT.dominates(VectorPreheader,
3838 cast<VPRecipeBase>(User)->getParent()) &&
3839 "All users must be in the vector preheader or dominated by it");
3840 }
3841
3842 VPBuilder Builder(cast<VPBasicBlock>(HoistBlock), HoistPoint);
3843 auto *Broadcast = Builder.createNaryOp(VPInstruction::Broadcast, {VPV});
3844 VPV->replaceUsesWithIf(Broadcast,
3845 [VPV, Broadcast](VPUser &U, unsigned Idx) {
3846 return Broadcast != &U && !U.usesScalars(VPV);
3847 });
3848 }
3849}
3850
3852 VPlan &Plan, ElementCount BestVF, unsigned BestUF,
3854 assert(Plan.hasVF(BestVF) && "BestVF is not available in Plan");
3855 assert(Plan.hasUF(BestUF) && "BestUF is not available in Plan");
3856
3857 VPValue *TC = Plan.getTripCount();
3858 // Skip cases for which the trip count may be non-trivial to materialize.
3859 // I.e., when a scalar tail is absent - due to tail folding, or when a scalar
3860 // tail is required.
3861 if (!Plan.hasScalarTail() ||
3863 Plan.getScalarPreheader() ||
3864 !TC->isLiveIn())
3865 return;
3866
3867 // Materialize vector trip counts for constants early if it can simply
3868 // be computed as (Original TC / VF * UF) * VF * UF.
3869 // TODO: Compute vector trip counts for loops requiring a scalar epilogue and
3870 // tail-folded loops.
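 // Illustrative example (hypothetical values): for a constant trip count of
 // 100 with VF 8 and UF 2, VF * UF is 16 and the vector trip count folds to
 // (100 / 16) * 16 = 96.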
3871 ScalarEvolution &SE = *PSE.getSE();
3872 auto *TCScev = SE.getSCEV(TC->getLiveInIRValue());
3873 if (!isa<SCEVConstant>(TCScev))
3874 return;
3875 const SCEV *VFxUF = SE.getElementCount(TCScev->getType(), BestVF * BestUF);
3876 auto VecTCScev = SE.getMulExpr(SE.getUDivExpr(TCScev, VFxUF), VFxUF);
3877 if (auto *ConstVecTC = dyn_cast<SCEVConstant>(VecTCScev))
3878 Plan.getVectorTripCount().setUnderlyingValue(ConstVecTC->getValue());
3879}
3880
3882 VPBasicBlock *VectorPH) {
3884 if (BTC->getNumUsers() == 0)
3885 return;
3886
3887 VPBuilder Builder(VectorPH, VectorPH->begin());
3888 auto *TCTy = VPTypeAnalysis(Plan).inferScalarType(Plan.getTripCount());
3889 auto *TCMO = Builder.createNaryOp(
3890 Instruction::Sub, {Plan.getTripCount(), Plan.getConstantInt(TCTy, 1)},
3891 DebugLoc::getCompilerGenerated(), "trip.count.minus.1");
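 // Illustrative example: for a trip count of 100, the backedge-taken count is
 // materialized as 100 - 1 = 99.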
3892 BTC->replaceAllUsesWith(TCMO);
3893}
3894
3896 if (Plan.hasScalarVFOnly())
3897 return;
3898
3899 VPTypeAnalysis TypeInfo(Plan);
3900 VPRegionBlock *LoopRegion = Plan.getVectorLoopRegion();
3901 auto VPBBsOutsideLoopRegion = VPBlockUtils::blocksOnly<VPBasicBlock>(
3903 auto VPBBsInsideLoopRegion = VPBlockUtils::blocksOnly<VPBasicBlock>(
3904 vp_depth_first_shallow(LoopRegion->getEntry()));
3905 // Materialize Build(Struct)Vector for all replicating VPReplicateRecipes and
3906 // VPInstructions, excluding ones in replicate regions. Those are not
3907 // materialized explicitly yet. Those vector users are still handled in
3908 // VPReplicateRegion::execute(), via shouldPack().
3909 // TODO: materialize build vectors for replicating recipes in replicating
3910 // regions.
3911 for (VPBasicBlock *VPBB :
3912 concat<VPBasicBlock *>(VPBBsOutsideLoopRegion, VPBBsInsideLoopRegion)) {
3913 for (VPRecipeBase &R : make_early_inc_range(*VPBB)) {
3915 continue;
3916 auto *DefR = cast<VPRecipeWithIRFlags>(&R);
3917 auto UsesVectorOrInsideReplicateRegion = [DefR, LoopRegion](VPUser *U) {
3918 VPRegionBlock *ParentRegion = cast<VPRecipeBase>(U)->getRegion();
3919 return !U->usesScalars(DefR) || ParentRegion != LoopRegion;
3920 };
3921 if ((isa<VPReplicateRecipe>(DefR) &&
3922 cast<VPReplicateRecipe>(DefR)->isSingleScalar()) ||
3923 (isa<VPInstruction>(DefR) &&
3925 !cast<VPInstruction>(DefR)->doesGeneratePerAllLanes())) ||
3926 none_of(DefR->users(), UsesVectorOrInsideReplicateRegion))
3927 continue;
3928
3929 Type *ScalarTy = TypeInfo.inferScalarType(DefR);
3930 unsigned Opcode = ScalarTy->isStructTy()
3933 auto *BuildVector = new VPInstruction(Opcode, {DefR});
3934 BuildVector->insertAfter(DefR);
3935
3936 DefR->replaceUsesWithIf(
3937 BuildVector, [BuildVector, &UsesVectorOrInsideReplicateRegion](
3938 VPUser &U, unsigned) {
3939 return &U != BuildVector && UsesVectorOrInsideReplicateRegion(&U);
3940 });
3941 }
3942 }
3943
3944 // Create explicit VPInstructions to convert vectors to scalars. The current
3945 // implementation is conservative - it may miss some cases that may or may not
3946 // be vector values. TODO: introduce Unpacks speculatively - remove them later
3947 // if they are known to operate on scalar values.
3948 for (VPBasicBlock *VPBB : VPBBsInsideLoopRegion) {
3949 for (VPRecipeBase &R : make_early_inc_range(*VPBB)) {
3952 continue;
3953 for (VPValue *Def : R.definedValues()) {
3954 // Skip recipes that are single-scalar or only have their first lane
3955 // used.
3956 // TODO: The Defs skipped here may or may not be vector values.
3957 // Introduce Unpacks, and remove them later, if they are guaranteed to
3958 // produce scalar values.
3960 continue;
3961
3962 // At the moment, we create unpacks only for scalar users outside
3963 // replicate regions. Recipes inside replicate regions still extract the
3964 // required lanes implicitly.
3965 // TODO: Remove once replicate regions are unrolled completely.
3966 auto IsCandidateUnpackUser = [Def](VPUser *U) {
3967 VPRegionBlock *ParentRegion = cast<VPRecipeBase>(U)->getRegion();
3968 return U->usesScalars(Def) &&
3969 (!ParentRegion || !ParentRegion->isReplicator());
3970 };
3971 if (none_of(Def->users(), IsCandidateUnpackUser))
3972 continue;
3973
3974 auto *Unpack = new VPInstruction(VPInstruction::Unpack, {Def});
3975 if (R.isPhi())
3976 Unpack->insertBefore(*VPBB, VPBB->getFirstNonPhi());
3977 else
3978 Unpack->insertAfter(&R);
3979 Def->replaceUsesWithIf(Unpack,
3980 [&IsCandidateUnpackUser](VPUser &U, unsigned) {
3981 return IsCandidateUnpackUser(&U);
3982 });
3983 }
3984 }
3985 }
3986}
3987
3989 VPBasicBlock *VectorPHVPBB,
3990 bool TailByMasking,
3991 bool RequiresScalarEpilogue) {
3992 VPValue &VectorTC = Plan.getVectorTripCount();
3993 assert(VectorTC.isLiveIn() && "vector-trip-count must be a live-in");
3994 // There's nothing to do if there are no users of the vector trip count or its
3995 // IR value has already been set.
3996 if (VectorTC.getNumUsers() == 0 || VectorTC.getLiveInIRValue())
3997 return;
3998
3999 VPValue *TC = Plan.getTripCount();
4000 Type *TCTy = VPTypeAnalysis(Plan).inferScalarType(TC);
4001 VPBuilder Builder(VectorPHVPBB, VectorPHVPBB->begin());
4002 VPValue *Step = &Plan.getVFxUF();
4003
4004 // If the tail is to be folded by masking, round the number of iterations N
4005 // up to a multiple of Step instead of rounding down. This is done by first
4006 // adding Step-1 and then rounding down. Note that it's ok if this addition
4007 // overflows: the vector induction variable will eventually wrap to zero given
4008 // that it starts at zero and its Step is a power of two; the loop will then
4009 // exit, with the last early-exit vector comparison also producing all-true.
4010 // For scalable vectors the VF is not guaranteed to be a power of 2, but this
4011 // is accounted for in emitIterationCountCheck that adds an overflow check.
4012 if (TailByMasking) {
4013 TC = Builder.createNaryOp(
4014 Instruction::Add,
4015 {TC, Builder.createNaryOp(Instruction::Sub,
4016 {Step, Plan.getConstantInt(TCTy, 1)})},
4017 DebugLoc::getCompilerGenerated(), "n.rnd.up");
4018 }
4019
4020 // Now we need to generate the expression for the part of the loop that the
4021 // vectorized body will execute. This is equal to N - (N % Step) if scalar
4022 // iterations are not required for correctness, or N - Step, otherwise. Step
4023 // is equal to the vectorization factor (number of SIMD elements) times the
4024 // unroll factor (number of SIMD instructions).
4025 VPValue *R =
4026 Builder.createNaryOp(Instruction::URem, {TC, Step},
4027 DebugLoc::getCompilerGenerated(), "n.mod.vf");
4028
4029 // There are cases where we *must* run at least one iteration in the remainder
4030 // loop. See the cost model for when this can happen. If the step evenly
4031 // divides the trip count, we set the remainder to be equal to the step. If
4032 // the step does not evenly divide the trip count, no adjustment is necessary
4033 // since there will already be scalar iterations. Note that the minimum
4034 // iterations check ensures that N >= Step.
4035 if (RequiresScalarEpilogue) {
4036 assert(!TailByMasking &&
4037 "requiring scalar epilogue is not supported with fail folding");
4038 VPValue *IsZero =
4039 Builder.createICmp(CmpInst::ICMP_EQ, R, Plan.getConstantInt(TCTy, 0));
4040 R = Builder.createSelect(IsZero, Step, R);
4041 }
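 // Illustrative example (hypothetical values), with Step = VF * UF = 8: for
 // N = 25, n.mod.vf = 1 and n.vec = 24; with tail folding, N is first rounded
 // up to 32, giving n.vec = 32; if a scalar epilogue is required and N = 24,
 // the remainder is bumped to 8, giving n.vec = 16.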
4042
4043 VPValue *Res = Builder.createNaryOp(
4044 Instruction::Sub, {TC, R}, DebugLoc::getCompilerGenerated(), "n.vec");
4045 VectorTC.replaceAllUsesWith(Res);
4046}
4047
4049 ElementCount VFEC) {
4050 VPBuilder Builder(VectorPH, VectorPH->begin());
4051 Type *TCTy = VPTypeAnalysis(Plan).inferScalarType(Plan.getTripCount());
4052 VPValue &VF = Plan.getVF();
4053 VPValue &VFxUF = Plan.getVFxUF();
4054 // Note that after the transform, Plan.getVF and Plan.getVFxUF should not be
4055 // used.
4056 // TODO: Assert that they aren't used.
4057
4058 // If there are no users of the runtime VF, compute VFxUF by constant folding
4059 // the multiplication of VF and UF.
4060 if (VF.getNumUsers() == 0) {
4061 VPValue *RuntimeVFxUF =
4062 Builder.createElementCount(TCTy, VFEC * Plan.getUF());
4063 VFxUF.replaceAllUsesWith(RuntimeVFxUF);
4064 return;
4065 }
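 // Illustrative example (hypothetical values): for VF = vscale x 4 and UF = 2,
 // the early exit above materializes VF * UF directly as vscale * 8; otherwise
 // VF is materialized as vscale * 4 below and then multiplied by UF.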
4066
4067 // For users of the runtime VF, compute it as VF * vscale, and VFxUF as (VF *
4068 // vscale) * UF.
4069 VPValue *RuntimeVF = Builder.createElementCount(TCTy, VFEC);
4071 VPValue *BC = Builder.createNaryOp(VPInstruction::Broadcast, RuntimeVF);
4073 BC, [&VF](VPUser &U, unsigned) { return !U.usesScalars(&VF); });
4074 }
4075 VF.replaceAllUsesWith(RuntimeVF);
4076
4077 VPValue *UF = Plan.getConstantInt(TCTy, Plan.getUF());
4078 VPValue *MulByUF = Builder.createNaryOp(Instruction::Mul, {RuntimeVF, UF});
4079 VFxUF.replaceAllUsesWith(MulByUF);
4080}
4081
4084 const DataLayout &DL = SE.getDataLayout();
4085 SCEVExpander Expander(SE, DL, "induction", /*PreserveLCSSA=*/false);
4086
4087 auto *Entry = cast<VPIRBasicBlock>(Plan.getEntry());
4088 BasicBlock *EntryBB = Entry->getIRBasicBlock();
4089 DenseMap<const SCEV *, Value *> ExpandedSCEVs;
4090 for (VPRecipeBase &R : make_early_inc_range(*Entry)) {
4092 continue;
4093 auto *ExpSCEV = dyn_cast<VPExpandSCEVRecipe>(&R);
4094 if (!ExpSCEV)
4095 break;
4096 const SCEV *Expr = ExpSCEV->getSCEV();
4097 Value *Res =
4098 Expander.expandCodeFor(Expr, Expr->getType(), EntryBB->getTerminator());
4099 ExpandedSCEVs[ExpSCEV->getSCEV()] = Res;
4100 VPValue *Exp = Plan.getOrAddLiveIn(Res);
4101 ExpSCEV->replaceAllUsesWith(Exp);
4102 if (Plan.getTripCount() == ExpSCEV)
4103 Plan.resetTripCount(Exp);
4104 ExpSCEV->eraseFromParent();
4105 }
4107 "VPExpandSCEVRecipes must be at the beginning of the entry block, "
4108 "after any VPIRInstructions");
4109 // Add IR instructions in the entry basic block but not in the VPIRBasicBlock
4110 // to the VPIRBasicBlock.
4111 auto EI = Entry->begin();
4112 for (Instruction &I : drop_end(*EntryBB)) {
4113 if (EI != Entry->end() && isa<VPIRInstruction>(*EI) &&
4114 &cast<VPIRInstruction>(&*EI)->getInstruction() == &I) {
4115 EI++;
4116 continue;
4117 }
4119 }
4120
4121 return ExpandedSCEVs;
4122}
4123
4124/// Returns true if \p OpV is defined by a VPWidenLoadRecipe or VPInterleaveRecipe
4125/// that can be converted to a narrower recipe. \p OpV is the operand at index
4126/// \p OpIdx of a wide recipe feeding a store interleave group at index \p Idx;
4127/// \p WideMember0 is the recipe feeding the same interleave group at index 0.
4128/// A VPWidenLoadRecipe can be narrowed to an index-independent load if it feeds
4129/// all wide ops at all indices, i.e., \p OpV must also be the operand at index
4130/// \p OpIdx of \p WideMember0. A VPInterleaveRecipe can be narrowed to a wide
4131/// load if \p OpV is defined at index \p Idx of a full load interleave group.
4132static bool canNarrowLoad(VPWidenRecipe *WideMember0, unsigned OpIdx,
4133 VPValue *OpV, unsigned Idx) {
4134 auto *DefR = OpV->getDefiningRecipe();
4135 if (!DefR)
4136 return WideMember0->getOperand(OpIdx) == OpV;
4137 if (auto *W = dyn_cast<VPWidenLoadRecipe>(DefR))
4138 return !W->getMask() && WideMember0->getOperand(OpIdx) == OpV;
4139
4140 if (auto *IR = dyn_cast<VPInterleaveRecipe>(DefR))
4141 return IR->getInterleaveGroup()->isFull() && IR->getVPValue(Idx) == OpV;
4142 return false;
4143}
4144
4145/// Returns true if \p IR is a full interleave group with factor and number of
4146/// members both equal to \p VF. The interleave group must also access the full
4147/// vector width \p VectorRegWidth.
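/// For example (illustrative), with VF = 4 and i32 members, a group with factor
/// 4 and 4 members spans 4 * 32 = 128 bits and thus matches a 128-bit vector
/// register.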
4149 unsigned VF, VPTypeAnalysis &TypeInfo,
4150 unsigned VectorRegWidth) {
4151 if (!InterleaveR || InterleaveR->getMask())
4152 return false;
4153
4154 Type *GroupElementTy = nullptr;
4155 if (InterleaveR->getStoredValues().empty()) {
4156 GroupElementTy = TypeInfo.inferScalarType(InterleaveR->getVPValue(0));
4157 if (!all_of(InterleaveR->definedValues(),
4158 [&TypeInfo, GroupElementTy](VPValue *Op) {
4159 return TypeInfo.inferScalarType(Op) == GroupElementTy;
4160 }))
4161 return false;
4162 } else {
4163 GroupElementTy =
4164 TypeInfo.inferScalarType(InterleaveR->getStoredValues()[0]);
4165 if (!all_of(InterleaveR->getStoredValues(),
4166 [&TypeInfo, GroupElementTy](VPValue *Op) {
4167 return TypeInfo.inferScalarType(Op) == GroupElementTy;
4168 }))
4169 return false;
4170 }
4171
4172 unsigned GroupSize = GroupElementTy->getScalarSizeInBits() * VF;
4173 auto IG = InterleaveR->getInterleaveGroup();
4174 return IG->getFactor() == VF && IG->getNumMembers() == VF &&
4175 GroupSize == VectorRegWidth;
4176}
4177
4178/// Returns true if \p VPValue is a narrow VPValue.
4179static bool isAlreadyNarrow(VPValue *VPV) {
4180 if (VPV->isLiveIn())
4181 return true;
4182 auto *RepR = dyn_cast<VPReplicateRecipe>(VPV);
4183 return RepR && RepR->isSingleScalar();
4184}
4185
4186// Convert a wide recipe defining a VPValue \p V feeding an interleave group to
4187// a narrow variant.
4188static VPValue *
4190 auto *R = V->getDefiningRecipe();
4191 if (!R || NarrowedOps.contains(V))
4192 return V;
4193
4194 if (isAlreadyNarrow(V))
4195 return V;
4196
4197 if (auto *WideMember0 = dyn_cast<VPWidenRecipe>(R)) {
4198 for (unsigned Idx = 0, E = WideMember0->getNumOperands(); Idx != E; ++Idx)
4199 WideMember0->setOperand(
4200 Idx,
4201 narrowInterleaveGroupOp(WideMember0->getOperand(Idx), NarrowedOps));
4202 return V;
4203 }
4204
4205 if (auto *LoadGroup = dyn_cast<VPInterleaveRecipe>(R)) {
4206 // Narrow interleave group to wide load, as transformed VPlan will only
4207 // process one original iteration.
4208 auto *LI = cast<LoadInst>(LoadGroup->getInterleaveGroup()->getInsertPos());
4209 auto *L = new VPWidenLoadRecipe(
4210 *LI, LoadGroup->getAddr(), LoadGroup->getMask(), /*Consecutive=*/true,
4211 /*Reverse=*/false, {}, LoadGroup->getDebugLoc());
4212 L->insertBefore(LoadGroup);
4213 NarrowedOps.insert(L);
4214 return L;
4215 }
4216
4217 if (auto *RepR = dyn_cast<VPReplicateRecipe>(R)) {
4218 assert(RepR->isSingleScalar() &&
4219 isa<LoadInst>(RepR->getUnderlyingInstr()) &&
4220 "must be a single scalar load");
4221 NarrowedOps.insert(RepR);
4222 return RepR;
4223 }
4224
4225 auto *WideLoad = cast<VPWidenLoadRecipe>(R);
4226 VPValue *PtrOp = WideLoad->getAddr();
4227 if (auto *VecPtr = dyn_cast<VPVectorPointerRecipe>(PtrOp))
4228 PtrOp = VecPtr->getOperand(0);
4229 // Narrow wide load to uniform scalar load, as transformed VPlan will only
4230 // process one original iteration.
4231 auto *N = new VPReplicateRecipe(&WideLoad->getIngredient(), {PtrOp},
4232 /*IsUniform*/ true,
4233 /*Mask*/ nullptr, *WideLoad);
4234 N->insertBefore(WideLoad);
4235 NarrowedOps.insert(N);
4236 return N;
4237}
4238
4240 unsigned VectorRegWidth) {
4241 VPRegionBlock *VectorLoop = Plan.getVectorLoopRegion();
4242 if (!VectorLoop || VectorLoop->getEntry()->getNumSuccessors() != 0)
4243 return;
4244
4245 VPTypeAnalysis TypeInfo(Plan);
4246
4247 unsigned VFMinVal = VF.getKnownMinValue();
4249 for (auto &R : *VectorLoop->getEntryBasicBlock()) {
4251 continue;
4252
4255 continue;
4256
4257 // Bail out on recipes not supported at the moment:
4258 // * phi recipes other than the canonical induction
4259 // * recipes writing to memory except interleave groups
4260 // Only support plans with a canonical induction phi.
4261 if (R.isPhi())
4262 return;
4263
4264 auto *InterleaveR = dyn_cast<VPInterleaveRecipe>(&R);
4265 if (R.mayWriteToMemory() && !InterleaveR)
4266 return;
4267
4268 // Do not narrow interleave groups if there are VectorPointer recipes and
4269 // the plan was unrolled. The recipe implicitly uses VF from
4270 // VPTransformState.
4271 // TODO: Remove restriction once the VF for the VectorPointer offset is
4272 // modeled explicitly as operand.
4273 if (isa<VPVectorPointerRecipe>(&R) && Plan.getUF() > 1)
4274 return;
4275
4276 // All other ops are allowed, but we reject uses that cannot be converted
4277 // when checking all allowed consumers (store interleave groups) below.
4278 if (!InterleaveR)
4279 continue;
4280
4281 // Bail out on non-consecutive interleave groups.
4282 if (!isConsecutiveInterleaveGroup(InterleaveR, VFMinVal, TypeInfo,
4283 VectorRegWidth))
4284 return;
4285
4286 // Skip read interleave groups.
4287 if (InterleaveR->getStoredValues().empty())
4288 continue;
4289
4290 // Narrow interleave groups, if all operands are already matching narrow
4291 // ops.
4292 auto *Member0 = InterleaveR->getStoredValues()[0];
4293 if (isAlreadyNarrow(Member0) &&
4294 all_of(InterleaveR->getStoredValues(),
4295 [Member0](VPValue *VPV) { return Member0 == VPV; })) {
4296 StoreGroups.push_back(InterleaveR);
4297 continue;
4298 }
4299
4300 // For now, we only support full interleave groups storing load interleave
4301 // groups.
4302 if (all_of(enumerate(InterleaveR->getStoredValues()), [](auto Op) {
4303 VPRecipeBase *DefR = Op.value()->getDefiningRecipe();
4304 if (!DefR)
4305 return false;
4306 auto *IR = dyn_cast<VPInterleaveRecipe>(DefR);
4307 return IR && IR->getInterleaveGroup()->isFull() &&
4308 IR->getVPValue(Op.index()) == Op.value();
4309 })) {
4310 StoreGroups.push_back(InterleaveR);
4311 continue;
4312 }
4313
4314 // Check if all values feeding InterleaveR are matching wide recipes whose
4315 // operands can be narrowed.
4316 auto *WideMember0 = dyn_cast_or_null<VPWidenRecipe>(
4317 InterleaveR->getStoredValues()[0]->getDefiningRecipe());
4318 if (!WideMember0)
4319 return;
4320 for (const auto &[I, V] : enumerate(InterleaveR->getStoredValues())) {
4321 auto *R = dyn_cast_or_null<VPWidenRecipe>(V->getDefiningRecipe());
4322 if (!R || R->getOpcode() != WideMember0->getOpcode() ||
4323 R->getNumOperands() > 2)
4324 return;
4325 if (any_of(enumerate(R->operands()),
4326 [WideMember0, Idx = I](const auto &P) {
4327 const auto &[OpIdx, OpV] = P;
4328 return !canNarrowLoad(WideMember0, OpIdx, OpV, Idx);
4329 }))
4330 return;
4331 }
4332 StoreGroups.push_back(InterleaveR);
4333 }
4334
4335 if (StoreGroups.empty())
4336 return;
4337
4338 // Convert InterleaveGroup \p R to a single VPWidenLoadRecipe.
4339 SmallPtrSet<VPValue *, 4> NarrowedOps;
4340 // Narrow operation tree rooted at store groups.
4341 for (auto *StoreGroup : StoreGroups) {
4342 VPValue *Res =
4343 narrowInterleaveGroupOp(StoreGroup->getStoredValues()[0], NarrowedOps);
4344 auto *SI =
4345 cast<StoreInst>(StoreGroup->getInterleaveGroup()->getInsertPos());
4346 auto *S = new VPWidenStoreRecipe(
4347 *SI, StoreGroup->getAddr(), Res, nullptr, /*Consecutive=*/true,
4348 /*Reverse=*/false, {}, StoreGroup->getDebugLoc());
4349 S->insertBefore(StoreGroup);
4350 StoreGroup->eraseFromParent();
4351 }
4352
4353 // Adjust induction to reflect that the transformed plan only processes one
4354 // original iteration.
4355 auto *CanIV = VectorLoop->getCanonicalIV();
4356 auto *Inc = cast<VPInstruction>(CanIV->getBackedgeValue());
4357 VPBuilder PHBuilder(Plan.getVectorPreheader());
4358
4359 VPValue *UF = Plan.getOrAddLiveIn(
4360 ConstantInt::get(VectorLoop->getCanonicalIVType(), 1 * Plan.getUF()));
4361 if (VF.isScalable()) {
4362 VPValue *VScale = PHBuilder.createElementCount(
4364 VPValue *VScaleUF = PHBuilder.createNaryOp(Instruction::Mul, {VScale, UF});
4365 Inc->setOperand(1, VScaleUF);
4366 Plan.getVF().replaceAllUsesWith(VScale);
4367 } else {
4368 Inc->setOperand(1, UF);
4370 Plan.getConstantInt(CanIV->getScalarType(), 1));
4371 }
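 // Illustrative example: for a fixed VF of 4 with UF 1, the canonical IV now
 // advances by 1 per vector iteration (for scalable VFs, by vscale * UF), since
 // each vector iteration of the narrowed plan covers one original iteration.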
4372 removeDeadRecipes(Plan);
4373}
4374
4375/// Add branch weight metadata, if the \p Plan's middle block is terminated by a
4376/// BranchOnCond recipe.
4378 VPlan &Plan, ElementCount VF, std::optional<unsigned> VScaleForTuning) {
4379 VPBasicBlock *MiddleVPBB = Plan.getMiddleBlock();
4380 auto *MiddleTerm =
4382 // Only add branch metadata if there is a (conditional) terminator.
4383 if (!MiddleTerm)
4384 return;
4385
4386 assert(MiddleTerm->getOpcode() == VPInstruction::BranchOnCond &&
4387 "must have a BranchOnCond");
4388 // Assume that `TripCount % VectorStep` is equally distributed.
4389 unsigned VectorStep = Plan.getUF() * VF.getKnownMinValue();
4390 if (VF.isScalable() && VScaleForTuning.has_value())
4391 VectorStep *= *VScaleForTuning;
4392 assert(VectorStep > 0 && "trip count should not be zero");
4393 MDBuilder MDB(Plan.getContext());
4394 MDNode *BranchWeights =
4395 MDB.createBranchWeights({1, VectorStep - 1}, /*IsExpected=*/false);
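 // Illustrative example (hypothetical values): with VF = 4 and UF = 2,
 // VectorStep is 8 and the weights are {1, 7}, i.e. under the
 // equal-distribution assumption the remainder is zero in 1 of 8 cases.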
4396 MiddleTerm->addMetadata(LLVMContext::MD_prof, BranchWeights);
4397}
4398
4399/// Create and return a ResumePhi for \p WideIV, unless it is truncated. If the
4400/// induction recipe is not canonical, creates a VPDerivedIVRecipe to compute
4401/// the end value of the induction.
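/// For example (illustrative), an induction with start S and step 2 resumes in
/// the scalar loop at roughly S + 2 * VectorTripCount, computed via a
/// VPDerivedIVRecipe when the induction is not the canonical one.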
4402static VPInstruction *addResumePhiRecipeForInduction(
4403 VPWidenInductionRecipe *WideIV, VPBuilder &VectorPHBuilder,
4404 VPBuilder &ScalarPHBuilder, VPTypeAnalysis &TypeInfo, VPValue *VectorTC) {
4405 auto *WideIntOrFp = dyn_cast<VPWidenIntOrFpInductionRecipe>(WideIV);
4406 // Truncated wide inductions resume from the last lane of their vector value
4407 // in the last vector iteration which is handled elsewhere.
4408 if (WideIntOrFp && WideIntOrFp->getTruncInst())
4409 return nullptr;
4410
4411 VPValue *Start = WideIV->getStartValue();
4412 VPValue *Step = WideIV->getStepValue();
4414 VPValue *EndValue = VectorTC;
4415 if (!WideIntOrFp || !WideIntOrFp->isCanonical()) {
4416 EndValue = VectorPHBuilder.createDerivedIV(
4417 ID.getKind(), dyn_cast_or_null<FPMathOperator>(ID.getInductionBinOp()),
4418 Start, VectorTC, Step);
4419 }
4420
4421 // EndValue is derived from the vector trip count (which has the same type as
4422 // the widest induction) and thus may be wider than the induction here.
4423 Type *ScalarTypeOfWideIV = TypeInfo.inferScalarType(WideIV);
4424 if (ScalarTypeOfWideIV != TypeInfo.inferScalarType(EndValue)) {
4425 EndValue = VectorPHBuilder.createScalarCast(Instruction::Trunc, EndValue,
4426 ScalarTypeOfWideIV,
4427 WideIV->getDebugLoc());
4428 }
4429
4430 auto *ResumePhiRecipe = ScalarPHBuilder.createScalarPhi(
4431 {EndValue, Start}, WideIV->getDebugLoc(), "bc.resume.val");
4432 return ResumePhiRecipe;
4433}
4434
4436 VPlan &Plan, VPRecipeBuilder &Builder,
4437 DenseMap<VPValue *, VPValue *> &IVEndValues) {
4438 VPTypeAnalysis TypeInfo(Plan);
4439 auto *ScalarPH = Plan.getScalarPreheader();
4440 auto *MiddleVPBB = cast<VPBasicBlock>(ScalarPH->getPredecessors()[0]);
4441 VPRegionBlock *VectorRegion = Plan.getVectorLoopRegion();
4442 VPBuilder VectorPHBuilder(
4443 cast<VPBasicBlock>(VectorRegion->getSinglePredecessor()));
4444 VPBuilder MiddleBuilder(MiddleVPBB, MiddleVPBB->getFirstNonPhi());
4445 VPBuilder ScalarPHBuilder(ScalarPH);
4446 for (VPRecipeBase &ScalarPhiR : Plan.getScalarHeader()->phis()) {
4447 auto *ScalarPhiIRI = cast<VPIRPhi>(&ScalarPhiR);
4448
4449 // TODO: Extract final value from induction recipe initially, optimize to
4450 // pre-computed end value together in optimizeInductionExitUsers.
4451 auto *VectorPhiR =
4452 cast<VPHeaderPHIRecipe>(Builder.getRecipe(&ScalarPhiIRI->getIRPhi()));
4453 if (auto *WideIVR = dyn_cast<VPWidenInductionRecipe>(VectorPhiR)) {
4455 WideIVR, VectorPHBuilder, ScalarPHBuilder, TypeInfo,
4456 &Plan.getVectorTripCount())) {
4457 assert(isa<VPPhi>(ResumePhi) && "Expected a phi");
4458 IVEndValues[WideIVR] = ResumePhi->getOperand(0);
4459 ScalarPhiIRI->addOperand(ResumePhi);
4460 continue;
4461 }
4462 // TODO: Also handle truncated inductions here. Computing end-values
4463 // separately should be done as VPlan-to-VPlan optimization, after
4464 // legalizing all resume values to use the last lane from the loop.
4465 assert(cast<VPWidenIntOrFpInductionRecipe>(VectorPhiR)->getTruncInst() &&
4466 "should only skip truncated wide inductions");
4467 continue;
4468 }
4469
4470 // The backedge value provides the value to resume coming out of a loop,
4471 // which for FORs is a vector whose last element needs to be extracted. The
4472 // start value provides the value if the loop is bypassed.
4473 bool IsFOR = isa<VPFirstOrderRecurrencePHIRecipe>(VectorPhiR);
4474 auto *ResumeFromVectorLoop = VectorPhiR->getBackedgeValue();
4475 assert(VectorRegion->getSingleSuccessor() == Plan.getMiddleBlock() &&
4476 "Cannot handle loops with uncountable early exits");
4477 if (IsFOR)
4478 ResumeFromVectorLoop = MiddleBuilder.createNaryOp(
4479 VPInstruction::ExtractLastElement, {ResumeFromVectorLoop}, {},
4480 "vector.recur.extract");
4481 StringRef Name = IsFOR ? "scalar.recur.init" : "bc.merge.rdx";
4482 auto *ResumePhiR = ScalarPHBuilder.createScalarPhi(
4483 {ResumeFromVectorLoop, VectorPhiR->getStartValue()}, {}, Name);
4484 ScalarPhiIRI->addOperand(ResumePhiR);
4485 }
4486}
4487
4489 VFRange &Range) {
4490 VPRegionBlock *VectorRegion = Plan.getVectorLoopRegion();
4491 auto *ScalarPHVPBB = Plan.getScalarPreheader();
4492 auto *MiddleVPBB = Plan.getMiddleBlock();
4493 VPBuilder ScalarPHBuilder(ScalarPHVPBB);
4494 VPBuilder MiddleBuilder(MiddleVPBB, MiddleVPBB->getFirstNonPhi());
4495
4496 auto IsScalableOne = [](ElementCount VF) -> bool {
4497 return VF == ElementCount::getScalable(1);
4498 };
4499
4500 for (auto &HeaderPhi : VectorRegion->getEntryBasicBlock()->phis()) {
4501 auto *FOR = dyn_cast<VPFirstOrderRecurrencePHIRecipe>(&HeaderPhi);
4502 if (!FOR)
4503 continue;
4504
4505 assert(VectorRegion->getSingleSuccessor() == Plan.getMiddleBlock() &&
4506 "Cannot handle loops with uncountable early exits");
4507
4508 // This is the second phase of vectorizing first-order recurrences, creating
4509 // extract for users outside the loop. An overview of the transformation is
4510 // described below. Suppose we have the following loop with some use after
4511 // the loop of the last a[i-1],
4512 //
4513 // for (int i = 0; i < n; ++i) {
4514 // t = a[i - 1];
4515 // b[i] = a[i] - t;
4516 // }
4517 // use t;
4518 //
4519 // There is a first-order recurrence on "a". For this loop, the shorthand
4520 // scalar IR looks like:
4521 //
4522 // scalar.ph:
4523 // s.init = a[-1]
4524 // br scalar.body
4525 //
4526 // scalar.body:
4527 // i = phi [0, scalar.ph], [i+1, scalar.body]
4528 // s1 = phi [s.init, scalar.ph], [s2, scalar.body]
4529 // s2 = a[i]
4530 // b[i] = s2 - s1
4531 // br cond, scalar.body, exit.block
4532 //
4533 // exit.block:
4534 // use = lcssa.phi [s1, scalar.body]
4535 //
4536 // In this example, s1 is a recurrence because its value depends on the
4537 // previous iteration. In the first phase of vectorization, we created a
4538 // VPFirstOrderRecurrencePHIRecipe v1 for s1. Now we create the extracts
4539 // for users in the scalar preheader and exit block.
4540 //
4541 // vector.ph:
4542 // v_init = vector(..., ..., ..., a[-1])
4543 // br vector.body
4544 //
4545 // vector.body
4546 // i = phi [0, vector.ph], [i+4, vector.body]
4547 // v1 = phi [v_init, vector.ph], [v2, vector.body]
4548 // v2 = a[i, i+1, i+2, i+3]
4549 // b[i] = v2 - v1
4550 // // Next, third phase will introduce v1' = splice(v1(3), v2(0, 1, 2))
4551 // b[i, i+1, i+2, i+3] = v2 - v1
4552 // br cond, vector.body, middle.block
4553 //
4554 // middle.block:
4555 // vector.recur.extract.for.phi = v2(2)
4556 // vector.recur.extract = v2(3)
4557 // br cond, scalar.ph, exit.block
4558 //
4559 // scalar.ph:
4560 // scalar.recur.init = phi [vector.recur.extract, middle.block],
4561 // [s.init, otherwise]
4562 // br scalar.body
4563 //
4564 // scalar.body:
4565 // i = phi [0, scalar.ph], [i+1, scalar.body]
4566 // s1 = phi [scalar.recur.init, scalar.ph], [s2, scalar.body]
4567 // s2 = a[i]
4568 // b[i] = s2 - s1
4569 // br cond, scalar.body, exit.block
4570 //
4571 // exit.block:
4572 // lo = lcssa.phi [s1, scalar.body],
4573 // [vector.recur.extract.for.phi, middle.block]
4574 //
4575 // Now update VPIRInstructions modeling LCSSA phis in the exit block.
4576 // Extract the penultimate value of the recurrence and use it as operand for
4577 // the VPIRInstruction modeling the phi.
4578 for (VPUser *U : FOR->users()) {
4579 using namespace llvm::VPlanPatternMatch;
4580 if (!match(U, m_ExtractLastElement(m_Specific(FOR))))
4581 continue;
4582 // For VF vscale x 1, if vscale = 1, we are unable to extract the
4583 // penultimate value of the recurrence. Instead we rely on the existing
4584 // extract of the last element from the result of
4585 // VPInstruction::FirstOrderRecurrenceSplice.
4586 // TODO: Consider vscale_range info and UF.
4588 Range))
4589 return;
4590 VPValue *PenultimateElement = MiddleBuilder.createNaryOp(
4591 VPInstruction::ExtractPenultimateElement, {FOR->getBackedgeValue()},
4592 {}, "vector.recur.extract.for.phi");
4593 cast<VPInstruction>(U)->replaceAllUsesWith(PenultimateElement);
4594 }
4595 }
4596}
static void addReplicateRegions(VPlan &Plan)
static VPValue * tryToFoldLiveIns(VPSingleDefRecipe &R, ArrayRef< VPValue * > Operands, const DataLayout &DL, VPTypeAnalysis &TypeInfo)
Try to fold R using InstSimplifyFolder.
static void removeRedundantExpandSCEVRecipes(VPlan &Plan)
Remove redundant EpxandSCEVRecipes in Plan's entry block by replacing them with already existing reci...
static bool isConditionTrueViaVFAndUF(VPValue *Cond, VPlan &Plan, ElementCount BestVF, unsigned BestUF, ScalarEvolution &SE)
Return true if Cond is known to be true for given BestVF and BestUF.
static bool isConsecutiveInterleaveGroup(VPInterleaveRecipe *InterleaveR, unsigned VF, VPTypeAnalysis &TypeInfo, unsigned VectorRegWidth)
Returns true if IR is a full interleave group with factor and number of members both equal to VF.
static bool hoistPreviousBeforeFORUsers(VPFirstOrderRecurrencePHIRecipe *FOR, VPRecipeBase *Previous, VPDominatorTree &VPDT)
Try to hoist Previous and its operands before all users of FOR.
static SmallVector< VPUser * > collectUsersRecursively(VPValue *V)
static void recursivelyDeleteDeadRecipes(VPValue *V)
static VPValue * optimizeEarlyExitInductionUser(VPlan &Plan, VPTypeAnalysis &TypeInfo, VPBlockBase *PredVPBB, VPValue *Op, ScalarEvolution &SE)
Attempts to optimize the induction variable exit values for users in the early exit block.
static cl::opt< bool > EnableWideActiveLaneMask("enable-wide-lane-mask", cl::init(false), cl::Hidden, cl::desc("Enable use of wide get active lane mask instructions"))
static VPWidenInductionRecipe * getOptimizableIVOf(VPValue *VPV, ScalarEvolution &SE)
Check if VPV is an untruncated wide induction, either before or after the increment.
static VPRegionBlock * createReplicateRegion(VPReplicateRecipe *PredRecipe, VPlan &Plan)
static VPBasicBlock * getPredicatedThenBlock(VPRegionBlock *R)
If R is a triangle region, return the 'then' block of the triangle.
static VPValue * narrowInterleaveGroupOp(VPValue *V, SmallPtrSetImpl< VPValue * > &NarrowedOps)
static void simplifyBlends(VPlan &Plan)
Normalize and simplify VPBlendRecipes.
static VPRecipeBase * optimizeMaskToEVL(VPValue *HeaderMask, VPRecipeBase &CurRecipe, VPTypeAnalysis &TypeInfo, VPValue &EVL)
Try to optimize a CurRecipe masked by HeaderMask to a corresponding EVL-based recipe without the head...
static bool isAlreadyNarrow(VPValue *VPV)
Returns true if VPValue is a narrow VPValue.
static bool optimizeVectorInductionWidthForTCAndVFUF(VPlan &Plan, ElementCount BestVF, unsigned BestUF)
Optimize the width of vector induction variables in Plan based on a known constant Trip Count,...
VPValue * getPredicatedMask(VPRegionBlock *R)
If R is a region with a VPBranchOnMaskRecipe in the entry block, return the mask.
static VPExpressionRecipe * tryToMatchAndCreateMulAccumulateReduction(VPReductionRecipe *Red, VPCostContext &Ctx, VFRange &Range)
This function tries convert extended in-loop reductions to VPExpressionRecipe and clamp the Range if ...
static void expandVPWidenIntOrFpInduction(VPWidenIntOrFpInductionRecipe *WidenIVR, VPTypeAnalysis &TypeInfo)
Expand a VPWidenIntOrFpInduction into executable recipes, for the initial value, phi and backedge val...
static VPSingleDefRecipe * findHeaderMask(VPlan &Plan)
Collect the header mask with the pattern: (ICMP_ULE, WideCanonicalIV, backedge-taken-count) TODO: Int...
static void removeRedundantCanonicalIVs(VPlan &Plan)
Try to replace VPWidenCanonicalIVRecipes with a widened canonical IV recipe, if it exists.
static bool canNarrowLoad(VPWidenRecipe *WideMember0, unsigned OpIdx, VPValue *OpV, unsigned Idx)
Returns true if V is VPWidenLoadRecipe or VPInterleaveRecipe that can be converted to a narrower reci...
static void narrowToSingleScalarRecipes(VPlan &Plan)
This file provides utility VPlan to VPlan transformations.
This file declares the class VPlanVerifier, which contains utility functions to check the consistency...
This file contains the declarations of the Vectorization Plan base classes:
Value * RHS
Value * LHS
BinaryOperator * Mul
static const uint32_t IV[8]
Definition blake3_impl.h:83
Class for arbitrary precision integers.
Definition APInt.h:78
LLVM_ABI APInt zext(unsigned width) const
Zero extend to a new width.
Definition APInt.cpp:1012
unsigned getActiveBits() const
Compute the number of active bits in the value.
Definition APInt.h:1513
unsigned getBitWidth() const
Return the number of bits in the APInt.
Definition APInt.h:1489
LLVM_ABI APInt sext(unsigned width) const
Sign extend to a new width.
Definition APInt.cpp:985
ArrayRef - Represent a constant reference to an array (0 or more elements consecutively in memory),...
Definition ArrayRef.h:41
LLVM Basic Block Representation.
Definition BasicBlock.h:62
const Function * getParent() const
Return the enclosing method, or null if none.
Definition BasicBlock.h:213
LLVM_ABI const DataLayout & getDataLayout() const
Get the data layout of the module this basic block belongs to.
const Instruction * getTerminator() const LLVM_READONLY
Returns the terminator instruction if the block is well formed or null if the block is not well forme...
Definition BasicBlock.h:233
This class represents a function call, abstracting a target machine's calling convention.
@ ICMP_ULT
unsigned less than
Definition InstrTypes.h:701
@ ICMP_ULE
unsigned less or equal
Definition InstrTypes.h:702
Predicate getInversePredicate() const
For example, EQ -> NE, UGT -> ULE, SLT -> SGE, OEQ -> UNE, UGT -> OLE, OLT -> UGE,...
Definition InstrTypes.h:789
An abstraction over a floating-point predicate, and a pack of an integer predicate with samesign info...
static ConstantInt * getSigned(IntegerType *Ty, int64_t V)
Return a ConstantInt with the specified value for the specified type.
Definition Constants.h:131
static LLVM_ABI Constant * getAllOnesValue(Type *Ty)
static LLVM_ABI Constant * getNullValue(Type *Ty)
Constructor to create a '0' constant of arbitrary type.
A parsed version of the target data layout string in and methods for querying it.
Definition DataLayout.h:63
A debug info location.
Definition DebugLoc.h:124
static DebugLoc getCompilerGenerated()
Definition DebugLoc.h:163
static DebugLoc getUnknown()
Definition DebugLoc.h:162
ValueT lookup(const_arg_type_t< KeyT > Val) const
lookup - Return the entry for the specified key, or a default constructed value if no such entry exis...
Definition DenseMap.h:205
std::pair< iterator, bool > try_emplace(KeyT &&Key, Ts &&...Args)
Definition DenseMap.h:248
bool dominates(const DomTreeNodeBase< NodeT > *A, const DomTreeNodeBase< NodeT > *B) const
dominates - Returns true iff A dominates B.
constexpr bool isVector() const
One or more elements.
Definition TypeSize.h:325
static constexpr ElementCount getScalable(ScalarTy MinVal)
Definition TypeSize.h:313
Utility class for floating point operations which can have information about relaxed accuracy require...
Definition Operator.h:200
Represents flags for the getelementptr instruction/expression.
GEPNoWrapFlags withoutNoUnsignedWrap() const
static GEPNoWrapFlags none()
an instruction for type-safe pointer arithmetic to access elements of arrays and structs
A struct for saving information about induction variables.
InductionKind
This enum represents the kinds of inductions that we support.
@ IK_PtrInduction
Pointer induction var. Step = C.
@ IK_IntInduction
Integer induction variable. Step = C.
InstSimplifyFolder - Use InstructionSimplify to fold operations to existing values.
bool isCast() const
bool isBinaryOp() const
LLVM_ABI const DataLayout & getDataLayout() const
Get the data layout of the module this instruction belongs to.
static LLVM_ABI IntegerType * get(LLVMContext &C, unsigned NumBits)
This static method is the primary way of constructing an IntegerType.
Definition Type.cpp:319
The group of interleaved loads/stores sharing the same stride and close to each other.
InstTy * getMember(uint32_t Index) const
Get the member with the given index Index.
uint32_t getNumMembers() const
This is an important class for using LLVM in a threaded context.
Definition LLVMContext.h:68
An instruction for reading from memory.
static bool getDecisionAndClampRange(const std::function< bool(ElementCount)> &Predicate, VFRange &Range)
Test a Predicate on a Range of VF's.
Definition VPlan.cpp:1546
LLVM_ABI MDNode * createBranchWeights(uint32_t TrueWeight, uint32_t FalseWeight, bool IsExpected=false)
Return metadata containing two branch weights.
Definition MDBuilder.cpp:38
Metadata node.
Definition Metadata.h:1078
This class implements a map that also provides access to all stored values in a deterministic order.
Definition MapVector.h:36
ValueT lookup(const KeyT &Key) const
Definition MapVector.h:103
An interface layer with SCEV used to manage how we see SCEV expressions for values in the context of ...
ScalarEvolution * getSE() const
Returns the ScalarEvolution analysis used.
LLVM_ABI const SCEV * getSCEV(Value *V)
Returns the SCEV expression of V, in the context of the current SCEV predicate.
static LLVM_ABI unsigned getOpcode(RecurKind Kind)
Returns the opcode corresponding to the RecurrenceKind.
RegionT * getParent() const
Get the parent of the Region.
Definition RegionInfo.h:362
This class uses information about analyze scalars to rewrite expressions in canonical form.
LLVM_ABI Value * expandCodeFor(const SCEV *SH, Type *Ty, BasicBlock::iterator I)
Insert code to directly compute the specified SCEV expression into the program.
static const SCEV * rewrite(const SCEV *Scev, ScalarEvolution &SE, ValueToSCEVMapTy &Map)
This class represents an analyzed expression in the program.
LLVM_ABI bool isZero() const
Return true if the expression is a constant zero.
LLVM_ABI Type * getType() const
Return the LLVM type of this SCEV expression.
The main scalar evolution driver.
const DataLayout & getDataLayout() const
Return the DataLayout associated with the module this SCEV instance is operating on.
LLVM_ABI const SCEV * getNegativeSCEV(const SCEV *V, SCEV::NoWrapFlags Flags=SCEV::FlagAnyWrap)
Return the SCEV object corresponding to -V.
LLVM_ABI const SCEV * getSCEV(Value *V)
Return a SCEV expression for the full generality of the specified expression.
LLVM_ABI const SCEV * getUDivExpr(const SCEV *LHS, const SCEV *RHS)
Get a canonical unsigned division expression, or something simpler if possible.
LLVM_ABI const SCEV * getElementCount(Type *Ty, ElementCount EC, SCEV::NoWrapFlags Flags=SCEV::FlagAnyWrap)
LLVM_ABI const SCEV * getMulExpr(SmallVectorImpl< const SCEV * > &Ops, SCEV::NoWrapFlags Flags=SCEV::FlagAnyWrap, unsigned Depth=0)
Get a canonical multiply expression, or something simpler if possible.
LLVM_ABI bool isKnownPredicate(CmpPredicate Pred, const SCEV *LHS, const SCEV *RHS)
Test if the given expression is known to satisfy the condition described by Pred, LHS,...
This class represents the LLVM 'select' instruction.
A vector that has set insertion semantics.
Definition SetVector.h:59
size_type size() const
Determine the number of elements in the SetVector.
Definition SetVector.h:102
bool insert(const value_type &X)
Insert a new element into the SetVector.
Definition SetVector.h:150
size_type size() const
Definition SmallPtrSet.h:99
A templated base class for SmallPtrSet which provides the typesafe interface that is common across al...
std::pair< iterator, bool > insert(PtrType Ptr)
Inserts Ptr if and only if there is no element in the container equal to Ptr.
iterator begin() const
bool contains(ConstPtrType Ptr) const
SmallPtrSet - This class implements a set which is optimized for holding SmallSize or less elements.
This class consists of common code factored out of the SmallVector class to reduce code duplication b...
void push_back(const T &Elt)
This is a 'vector' (really, a variable-sized array), optimized for the case when the array is small.
An instruction for storing to memory.
StringRef - Represent a constant reference to a string, i.e.
Definition StringRef.h:55
Provides information about what library functions are available for the current target.
static LLVM_ABI PartialReductionExtendKind getPartialReductionExtendKind(Instruction *I)
Get the kind of extension that an instruction represents.
TargetCostKind
The kind of cost model.
@ TCK_RecipThroughput
Reciprocal throughput.
Twine - A lightweight data structure for efficiently representing the concatenation of temporary valu...
Definition Twine.h:82
This class implements a switch-like dispatch statement for a value of 'T' using dyn_cast functionalit...
Definition TypeSwitch.h:88
TypeSwitch< T, ResultT > & Case(CallableT &&caseFn)
Add a case on the given type.
Definition TypeSwitch.h:97
The instances of the Type class are immutable: once they are created, they are never changed.
Definition Type.h:45
static LLVM_ABI IntegerType * getInt64Ty(LLVMContext &C)
Definition Type.cpp:298
static LLVM_ABI IntegerType * getInt32Ty(LLVMContext &C)
Definition Type.cpp:297
bool isPointerTy() const
True if this is an instance of PointerType.
Definition Type.h:267
static LLVM_ABI IntegerType * getInt8Ty(LLVMContext &C)
Definition Type.cpp:295
bool isStructTy() const
True if this is an instance of StructType.
Definition Type.h:261
LLVM_ABI unsigned getScalarSizeInBits() const LLVM_READONLY
If this is a vector type, return the getPrimitiveSizeInBits value for the element type.
Definition Type.cpp:231
static LLVM_ABI IntegerType * getInt1Ty(LLVMContext &C)
Definition Type.cpp:294
bool isFloatingPointTy() const
Return true if this is one of the floating-point types.
Definition Type.h:184
bool isIntegerTy() const
True if this is an instance of IntegerType.
Definition Type.h:240
op_range operands()
Definition User.h:292
A recipe for generating the active lane mask for the vector loop that is used to predicate the vector...
Definition VPlan.h:3532
VPBasicBlock serves as the leaf of the Hierarchical Control-Flow Graph.
Definition VPlan.h:3819
void appendRecipe(VPRecipeBase *Recipe)
Augment the existing recipes of a VPBasicBlock with an additional Recipe as the last recipe.
Definition VPlan.h:3894
RecipeListTy::iterator iterator
Instruction iterators...
Definition VPlan.h:3846
iterator end()
Definition VPlan.h:3856
iterator begin()
Recipe iterator methods.
Definition VPlan.h:3854
iterator_range< iterator > phis()
Returns an iterator range over the PHI-like recipes in the block.
Definition VPlan.h:3907
iterator getFirstNonPhi()
Return the position of the first non-phi node recipe in the block.
Definition VPlan.cpp:220
VPRegionBlock * getEnclosingLoopRegion()
Definition VPlan.cpp:582
VPBasicBlock * splitAt(iterator SplitAt)
Split current block at SplitAt by inserting a new block between the current block and its successors ...
Definition VPlan.cpp:554
VPRecipeBase * getTerminator()
If the block has multiple successors, return the branch recipe terminating the block.
Definition VPlan.cpp:627
const VPRecipeBase & back() const
Definition VPlan.h:3868
A recipe for vectorizing a phi-node as a sequence of mask-based select instructions.
Definition VPlan.h:2407
VPValue * getMask(unsigned Idx) const
Return mask number Idx.
Definition VPlan.h:2441
unsigned getNumIncomingValues() const
Return the number of incoming values, taking into account when normalized the first incoming value wi...
Definition VPlan.h:2431
void setMask(unsigned Idx, VPValue *V)
Set mask number Idx to V.
Definition VPlan.h:2447
bool isNormalized() const
A normalized blend is one that has an odd number of operands, whereby the first operand does not have...
Definition VPlan.h:2427
VPBlockBase is the building block of the Hierarchical Control-Flow Graph.
Definition VPlan.h:80
VPRegionBlock * getParent()
Definition VPlan.h:172
const VPBasicBlock * getExitingBasicBlock() const
Definition VPlan.cpp:190
size_t getNumSuccessors() const
Definition VPlan.h:218
void swapSuccessors()
Swap successors of the block. The block must have exactly 2 successors.
Definition VPlan.h:321
size_t getNumPredecessors() const
Definition VPlan.h:219
const VPBlocksTy & getPredecessors() const
Definition VPlan.h:203
VPlan * getPlan()
Definition VPlan.cpp:165
VPBlockBase * getSinglePredecessor() const
Definition VPlan.h:214
const VPBasicBlock * getEntryBasicBlock() const
Definition VPlan.cpp:170
VPBlockBase * getSingleHierarchicalPredecessor()
Definition VPlan.h:263
VPBlockBase * getSingleSuccessor() const
Definition VPlan.h:208
const VPBlocksTy & getSuccessors() const
Definition VPlan.h:197
static auto blocksOnly(const T &Range)
Return an iterator range over Range which only includes BlockTy blocks.
Definition VPlanUtils.h:187
static void insertOnEdge(VPBlockBase *From, VPBlockBase *To, VPBlockBase *BlockPtr)
Inserts BlockPtr on the edge between From and To.
Definition VPlanUtils.h:208
static void insertTwoBlocksAfter(VPBlockBase *IfTrue, VPBlockBase *IfFalse, VPBlockBase *BlockPtr)
Insert disconnected VPBlockBases IfTrue and IfFalse after BlockPtr.
Definition VPlanUtils.h:127
static void connectBlocks(VPBlockBase *From, VPBlockBase *To, unsigned PredIdx=-1u, unsigned SuccIdx=-1u)
Connect VPBlockBases From and To bi-directionally.
Definition VPlanUtils.h:146
static void disconnectBlocks(VPBlockBase *From, VPBlockBase *To)
Disconnect VPBlockBases From and To bi-directionally.
Definition VPlanUtils.h:165
A recipe for generating conditional branches on the bits of a mask.
Definition VPlan.h:2951
RAII object that stores the current insertion point and restores it when the object is destroyed.
VPlan-based builder utility analogous to IRBuilder.
VPValue * createElementCount(Type *Ty, ElementCount EC)
VPInstruction * createScalarCast(Instruction::CastOps Opcode, VPValue *Op, Type *ResultTy, DebugLoc DL, const VPIRFlags &Flags={}, const VPIRMetadata &Metadata={})
VPDerivedIVRecipe * createDerivedIV(InductionDescriptor::InductionKind Kind, FPMathOperator *FPBinOp, VPValue *Start, VPValue *Current, VPValue *Step, const Twine &Name="")
Convert the input value Current to the corresponding value of an induction with Start and Step values...
static VPBuilder getToInsertAfter(VPRecipeBase *R)
Create a VPBuilder to insert after R.
VPPhi * createScalarPhi(ArrayRef< VPValue * > IncomingValues, DebugLoc DL, const Twine &Name="")
VPInstruction * createNaryOp(unsigned Opcode, ArrayRef< VPValue * > Operands, Instruction *Inst=nullptr, const Twine &Name="")
Create an N-ary operation with Opcode, Operands and set Inst as its underlying Instruction.
void setInsertPoint(VPBasicBlock *TheBB)
This specifies that created VPInstructions should be appended to the end of the specified block.
Canonical scalar induction phi of the vector loop.
Definition VPlan.h:3475
unsigned getNumDefinedValues() const
Returns the number of values defined by the VPDef.
Definition VPlanValue.h:424
ArrayRef< VPValue * > definedValues()
Returns an ArrayRef of the values defined by the VPDef.
Definition VPlanValue.h:419
VPValue * getVPSingleValue()
Returns the only VPValue defined by the VPDef.
Definition VPlanValue.h:397
VPValue * getVPValue(unsigned I)
Returns the VPValue with index I defined by the VPDef.
Definition VPlanValue.h:409
A recipe for converting the input value IV value to the corresponding value of an IV with different s...
Definition VPlan.h:3640
Template specialization of the standard LLVM dominator tree utility for VPBlockBases.
bool properlyDominates(const VPRecipeBase *A, const VPRecipeBase *B)
A recipe for generating the phi node for the current index of elements, adjusted in accordance with E...
Definition VPlan.h:3563
A recipe to combine multiple recipes into a single 'expression' recipe, which should be considered a ...
Definition VPlan.h:2996
A pure virtual base class for all recipes modeling header phis, including phis for first order recurr...
Definition VPlan.h:1982
virtual VPValue * getBackedgeValue()
Returns the incoming value from the loop backedge.
Definition VPlan.h:2030
VPValue * getStartValue()
Returns the start value of the phi, if one is set.
Definition VPlan.h:2019
A special type of VPBasicBlock that wraps an existing IR basic block.
Definition VPlan.h:3972
BasicBlock * getIRBasicBlock() const
Definition VPlan.h:3996
Class to record and manage LLVM IR flags.
Definition VPlan.h:596
static LLVM_ABI_FOR_TEST VPIRInstruction * create(Instruction &I)
Create a new VPIRPhi for \I , if it is a PHINode, otherwise create a VPIRInstruction.
Helper to manage IR metadata for recipes.
Definition VPlan.h:938
void intersect(const VPIRMetadata &MD)
Intersect this VPIRMetada object with MD, keeping only metadata nodes that are common to both.
This is a concrete Recipe that models a single VPlan-level instruction.
Definition VPlan.h:976
@ ExtractLane
Extracts a single lane (first operand) from a set of vector operands.
Definition VPlan.h:1060
@ Unpack
Extracts all lanes from its (non-scalable) vector operand.
Definition VPlan.h:1011
@ FirstOrderRecurrenceSplice
Definition VPlan.h:982
@ BuildVector
Creates a fixed-width vector containing all operands.
Definition VPlan.h:1006
@ BuildStructVector
Given operands of (the same) struct type, creates a struct of fixed- width vectors each containing a ...
Definition VPlan.h:1003
@ CanonicalIVIncrementForPart
Definition VPlan.h:996
@ CalculateTripCountMinusVF
Definition VPlan.h:994
const InterleaveGroup< Instruction > * getInterleaveGroup() const
Definition VPlan.h:2548
VPValue * getMask() const
Return the mask used by this recipe.
Definition VPlan.h:2540
ArrayRef< VPValue * > getStoredValues() const
Return the VPValues stored by this interleave group.
Definition VPlan.h:2569
A recipe for interleaved memory operations with vector-predication intrinsics.
Definition VPlan.h:2621
VPInterleaveRecipe is a recipe for transforming an interleave group of load or stores into one wide l...
Definition VPlan.h:2580
VPPredInstPHIRecipe is a recipe for generating the phi nodes needed when control converges back from ...
Definition VPlan.h:3137
VPRecipeBase is a base class modeling a sequence of one or more output IR instructions.
Definition VPlan.h:386
VPRegionBlock * getRegion()
Definition VPlan.h:4124
VPBasicBlock * getParent()
Definition VPlan.h:407
DebugLoc getDebugLoc() const
Returns the debug location of the recipe.
Definition VPlan.h:478
void moveBefore(VPBasicBlock &BB, iplist< VPRecipeBase >::iterator I)
Unlink this recipe and insert into BB before I.
void insertBefore(VPRecipeBase *InsertPos)
Insert an unlinked recipe into a basic block immediately before the specified recipe.
void insertAfter(VPRecipeBase *InsertPos)
Insert an unlinked Recipe into a basic block immediately after the specified Recipe.
iplist< VPRecipeBase >::iterator eraseFromParent()
This method unlinks 'this' from the containing basic block and deletes it.
Helper class to create VPRecipies from IR instructions.
VPRecipeBase * getRecipe(Instruction *I)
Return the recipe created for given ingredient.
A recipe to represent inloop reduction operations with vector-predication intrinsics,...
Definition VPlan.h:2829
A recipe to represent inloop reduction operations, performing a reduction on a vector operand into a ...
Definition VPlan.h:2670
VPRegionBlock represents a collection of VPBasicBlocks and VPRegionBlocks which form a Single-Entry-S...
Definition VPlan.h:4007
const VPBlockBase * getEntry() const
Definition VPlan.h:4043
Type * getCanonicalIVType()
Return the type of the canonical IV for loop regions.
Definition VPlan.h:4118
bool isReplicator() const
An indicator whether this region is to generate multiple replicated instances of output IR correspond...
Definition VPlan.h:4075
void setExiting(VPBlockBase *ExitingBlock)
Set ExitingBlock as the exiting VPBlockBase of this VPRegionBlock.
Definition VPlan.h:4060
VPCanonicalIVPHIRecipe * getCanonicalIV()
Returns the canonical induction recipe of the region.
Definition VPlan.h:4105
const VPBlockBase * getExiting() const
Definition VPlan.h:4055
VPBasicBlock * getPreheaderVPBB()
Returns the pre-header VPBasicBlock of the loop region.
Definition VPlan.h:4068
VPReplicateRecipe replicates a given instruction producing multiple scalar copies of the original sca...
Definition VPlan.h:2873
bool isSingleScalar() const
Definition VPlan.h:2918
VPValue * getMask()
Return the mask of a predicated VPReplicateRecipe.
Definition VPlan.h:2942
A recipe for handling phi nodes of integer and floating-point inductions, producing their scalar valu...
Definition VPlan.h:3709
VPSingleDef is a base class for recipes for modeling a sequence of one or more output IR that define ...
Definition VPlan.h:517
Instruction * getUnderlyingInstr()
Returns the underlying instruction.
Definition VPlan.h:582
VPSingleDefRecipe * clone() override=0
Clone the current recipe.
An analysis for type-inference for VPValues.
LLVMContext & getContext()
Return the LLVMContext used by the analysis.
Type * inferScalarType(const VPValue *V)
Infer the type of V. Returns the scalar type of V.
This class augments VPValue with operands which provide the inverse def-use edges from VPValue's user...
Definition VPlanValue.h:199
operand_range operands()
Definition VPlanValue.h:267
void setOperand(unsigned I, VPValue *New)
Definition VPlanValue.h:243
VPValue * getOperand(unsigned N) const
Definition VPlanValue.h:238
void addOperand(VPValue *Operand)
Definition VPlanValue.h:232
This is the base class of the VPlan Def/Use graph, used for modeling the data flow into,...
Definition VPlanValue.h:48
VPRecipeBase * getDefiningRecipe()
Returns the recipe defining this VPValue or nullptr if it is not defined by a recipe,...
Definition VPlan.cpp:135
Value * getLiveInIRValue() const
Returns the underlying IR value, if this VPValue is defined outside the scope of VPlan.
Definition VPlanValue.h:176
Value * getUnderlyingValue() const
Return the underlying Value attached to this VPValue.
Definition VPlanValue.h:85
void setUnderlyingValue(Value *Val)
Definition VPlanValue.h:186
void replaceAllUsesWith(VPValue *New)
Definition VPlan.cpp:1382
unsigned getNumUsers() const
Definition VPlanValue.h:113
bool isLiveIn() const
Returns true if this VPValue is a live-in, i.e. defined outside the VPlan.
Definition VPlanValue.h:171
void replaceUsesWithIf(VPValue *New, llvm::function_ref< bool(VPUser &U, unsigned Idx)> ShouldReplace)
Go through the uses list for this VPValue and make each use point to New if the callback ShouldReplac...
Definition VPlan.cpp:1386
user_range users()
Definition VPlanValue.h:134
A recipe to compute a pointer to the last element of each part of a widened memory access for widened...
Definition VPlan.h:1846
A Recipe for widening the canonical induction variable of the vector loop.
Definition VPlan.h:3604
VPWidenCastRecipe is a recipe to create vector cast instructions.
Definition VPlan.h:1489
Instruction::CastOps getOpcode() const
Definition VPlan.h:1540
A recipe for handling GEP instructions.
Definition VPlan.h:1774
Base class for widened induction (VPWidenIntOrFpInductionRecipe and VPWidenPointerInductionRecipe),...
Definition VPlan.h:2047
PHINode * getPHINode() const
Definition VPlan.h:2089
VPValue * getStepValue()
Returns the step value of the induction.
Definition VPlan.h:2075
const InductionDescriptor & getInductionDescriptor() const
Returns the induction descriptor for the recipe.
Definition VPlan.h:2092
A recipe for handling phi nodes of integer and floating-point inductions, producing their vector valu...
Definition VPlan.h:2122
VPValue * getLastUnrolledPartOperand()
Returns the VPValue representing the value of this induction at the last unrolled part,...
Definition VPlan.h:2203
A recipe for widening vector intrinsics.
Definition VPlan.h:1547
A common base class for widening memory operations.
Definition VPlan.h:3179
A recipe for widened phis.
Definition VPlan.h:2258
VPWidenRecipe is a recipe for producing a widened instruction using the opcode and operands of the re...
Definition VPlan.h:1446
VPlan models a candidate for vectorization, encoding various decisions take to produce efficient outp...
Definition VPlan.h:4137
bool hasVF(ElementCount VF) const
Definition VPlan.h:4343
LLVMContext & getContext() const
Definition VPlan.h:4331
VPBasicBlock * getEntry()
Definition VPlan.h:4231
VPValue & getVectorTripCount()
The vector trip count.
Definition VPlan.h:4322
bool hasScalableVF() const
Definition VPlan.h:4344
VPValue & getVFxUF()
Returns VF * UF of the vector loop region.
Definition VPlan.h:4329
VPValue & getVF()
Returns the VF of the vector loop region.
Definition VPlan.h:4325
VPValue * getTripCount() const
The trip count of the original loop.
Definition VPlan.h:4293
VPValue * getTrue()
Return a VPValue wrapping i1 true.
Definition VPlan.h:4400
VPValue * getOrCreateBackedgeTakenCount()
The backedge taken count of the original loop.
Definition VPlan.h:4314
unsigned getUF() const
Definition VPlan.h:4363
VPRegionBlock * createReplicateRegion(VPBlockBase *Entry, VPBlockBase *Exiting, const std::string &Name="")
Create a new replicate region with Entry, Exiting and Name.
Definition VPlan.h:4477
bool hasUF(unsigned UF) const
Definition VPlan.h:4361
ArrayRef< VPIRBasicBlock * > getExitBlocks() const
Return an ArrayRef containing VPIRBasicBlocks wrapping the exit blocks of the original scalar loop.
Definition VPlan.h:4283
VPValue * getConstantInt(Type *Ty, uint64_t Val, bool IsSigned=false)
Return a VPValue wrapping a ConstantInt with the given type and value.
Definition VPlan.h:4406
void setVF(ElementCount VF)
Definition VPlan.h:4337
bool isUnrolled() const
Returns true if the VPlan already has been unrolled, i.e.
Definition VPlan.h:4376
LLVM_ABI_FOR_TEST VPRegionBlock * getVectorLoopRegion()
Returns the VPRegionBlock of the vector loop.
Definition VPlan.cpp:1016
void resetTripCount(VPValue *NewTripCount)
Resets the trip count for the VPlan.
Definition VPlan.h:4307
VPBasicBlock * getMiddleBlock()
Returns the 'middle' block of the plan, that is the block that selects whether to execute the scalar ...
Definition VPlan.h:4256
VPBasicBlock * createVPBasicBlock(const Twine &Name, VPRecipeBase *Recipe=nullptr)
Create a new VPBasicBlock with Name and containing Recipe if present.
Definition VPlan.h:4455
VPValue * getFalse()
Return a VPValue wrapping i1 false.
Definition VPlan.h:4403
VPValue * getOrAddLiveIn(Value *V)
Gets the live-in VPValue for V or adds a new live-in (if none exists yet) for V.
Definition VPlan.h:4385
bool hasScalarVFOnly() const
Definition VPlan.h:4354
VPBasicBlock * getScalarPreheader() const
Return the VPBasicBlock for the preheader of the scalar loop.
Definition VPlan.h:4274
ArrayRef< VPValue * > getLiveIns() const
Return the list of live-in VPValues available in the VPlan.
Definition VPlan.h:4425
VPIRBasicBlock * getScalarHeader() const
Return the VPIRBasicBlock wrapping the header of the scalar loop.
Definition VPlan.h:4279
VPValue * getLiveIn(Value *V) const
Return the live-in VPValue for V, if there is one or nullptr otherwise.
Definition VPlan.h:4422
VPBasicBlock * getVectorPreheader()
Returns the preheader of the vector loop region, if one exists, or null otherwise.
Definition VPlan.h:4236
void setUF(unsigned UF)
Definition VPlan.h:4368
bool hasScalarTail() const
Returns true if the scalar tail may execute after the vector loop.
Definition VPlan.h:4509
LLVM Value Representation.
Definition Value.h:75
Type * getType() const
All values are typed, get the type of this value.
Definition Value.h:256
iterator_range< user_iterator > users()
Definition Value.h:426
bool hasName() const
Definition Value.h:262
LLVM_ABI StringRef getName() const
Return a constant reference to the value's name.
Definition Value.cpp:322
constexpr bool isScalable() const
Returns whether the quantity is scaled by a runtime quantity (vscale).
Definition TypeSize.h:169
constexpr LeafTy multiplyCoefficientBy(ScalarTy RHS) const
Definition TypeSize.h:257
constexpr bool isFixed() const
Returns true if the quantity is not scaled by vscale.
Definition TypeSize.h:172
constexpr ScalarTy getKnownMinValue() const
Returns the minimum value this quantity can represent.
Definition TypeSize.h:166
An efficient, type-erasing, non-owning reference to a callable.
const ParentTy * getParent() const
Definition ilist_node.h:34
self_iterator getIterator()
Definition ilist_node.h:123
IteratorT end() const
Changed
#define llvm_unreachable(msg)
Marks that the current location is not supposed to be reachable.
LLVM_ABI APInt RoundingUDiv(const APInt &A, const APInt &B, APInt::Rounding RM)
Return A unsign-divided by B, rounded by the given rounding mode.
Definition APInt.cpp:2763
unsigned ID
LLVM IR allows to use arbitrary numbers as calling convention identifiers.
Definition CallingConv.h:24
@ C
The default llvm calling convention, compatible with C.
Definition CallingConv.h:34
SpecificConstantMatch m_ZeroInt()
Convenience matchers for specific integer values.
BinaryOp_match< SrcTy, SpecificConstantMatch, TargetOpcode::G_XOR, true > m_Not(const SrcTy &&Src)
Matches a register not-ed by a G_XOR.
cst_pred_ty< is_all_ones > m_AllOnes()
Match an integer or vector with all bits set.
m_Intrinsic_Ty< Opnd0, Opnd1, Opnd2 >::Ty m_MaskedStore(const Opnd0 &Op0, const Opnd1 &Op1, const Opnd2 &Op2)
Matches MaskedStore Intrinsic.
ap_match< APInt > m_APInt(const APInt *&Res)
Match a ConstantInt or splatted ConstantVector, binding the specified pointer to the contained APInt.
CastInst_match< OpTy, TruncInst > m_Trunc(const OpTy &Op)
Matches Trunc.
LogicalOp_match< LHS, RHS, Instruction::And > m_LogicalAnd(const LHS &L, const RHS &R)
Matches L && R either in the form of L & R or L ?
match_combine_or< CastInst_match< OpTy, ZExtInst >, OpTy > m_ZExtOrSelf(const OpTy &Op)
bool match(Val *V, const Pattern &P)
specificval_ty m_Specific(const Value *V)
Match if we have a specific specified value.
m_Intrinsic_Ty< Opnd0, Opnd1, Opnd2 >::Ty m_MaskedLoad(const Opnd0 &Op0, const Opnd1 &Op1, const Opnd2 &Op2)
Matches MaskedLoad Intrinsic.
class_match< ConstantInt > m_ConstantInt()
Match an arbitrary ConstantInt and ignore it.
cst_pred_ty< is_one > m_One()
Match an integer 1 or a vector with all elements equal to 1.
IntrinsicID_match m_Intrinsic()
Match intrinsic calls like this: m_Intrinsic<Intrinsic::fabs>(m_Value(X))
ThreeOps_match< Cond, LHS, RHS, Instruction::Select > m_Select(const Cond &C, const LHS &L, const RHS &R)
Matches SelectInst.
BinaryOp_match< LHS, RHS, Instruction::Mul > m_Mul(const LHS &L, const RHS &R)
deferredval_ty< Value > m_Deferred(Value *const &V)
Like m_Specific(), but works if the specific value to match is determined as part of the same match()...
SpecificCmpClass_match< LHS, RHS, ICmpInst > m_SpecificICmp(CmpPredicate MatchPred, const LHS &L, const RHS &R)
class_match< CmpInst > m_Cmp()
Matches any compare instruction and ignore it.
BinaryOp_match< LHS, RHS, Instruction::Add, true > m_c_Add(const LHS &L, const RHS &R)
Matches a Add with LHS and RHS in either order.
CmpClass_match< LHS, RHS, ICmpInst > m_ICmp(CmpPredicate &Pred, const LHS &L, const RHS &R)
match_combine_or< CastInst_match< OpTy, ZExtInst >, CastInst_match< OpTy, SExtInst > > m_ZExtOrSExt(const OpTy &Op)
auto m_LogicalAnd()
Matches L && R where L and R are arbitrary values.
CastInst_match< OpTy, SExtInst > m_SExt(const OpTy &Op)
Matches SExt.
BinaryOp_match< LHS, RHS, Instruction::Mul, true > m_c_Mul(const LHS &L, const RHS &R)
Matches a Mul with LHS and RHS in either order.
MatchFunctor< Val, Pattern > match_fn(const Pattern &P)
A match functor that can be used as a UnaryPredicate in functional algorithms like all_of.
BinaryOp_match< LHS, RHS, Instruction::Sub > m_Sub(const LHS &L, const RHS &R)
match_combine_or< LTy, RTy > m_CombineOr(const LTy &L, const RTy &R)
Combine two pattern matchers matching L || R.
AllRecipe_commutative_match< Instruction::And, Op0_t, Op1_t > m_c_BinaryAnd(const Op0_t &Op0, const Op1_t &Op1)
Match a binary AND operation.
AllRecipe_match< Instruction::Or, Op0_t, Op1_t > m_BinaryOr(const Op0_t &Op0, const Op1_t &Op1)
Match a binary OR operation.
AllRecipe_commutative_match< Opcode, Op0_t, Op1_t > m_c_Binary(const Op0_t &Op0, const Op1_t &Op1)
AllRecipe_commutative_match< Instruction::Or, Op0_t, Op1_t > m_c_BinaryOr(const Op0_t &Op0, const Op1_t &Op1)
GEPLikeRecipe_match< Op0_t, Op1_t > m_GetElementPtr(const Op0_t &Op0, const Op1_t &Op1)
VPInstruction_match< VPInstruction::ExtractLastLanePerPart, Op0_t > m_ExtractLastLanePerPart(const Op0_t &Op0)
VPInstruction_match< VPInstruction::ExtractLastElement, Op0_t > m_ExtractLastElement(const Op0_t &Op0)
AllRecipe_match< Opcode, Op0_t, Op1_t > m_Binary(const Op0_t &Op0, const Op1_t &Op1)
VPInstruction_match< Instruction::ExtractElement, Op0_t, Op1_t > m_ExtractElement(const Op0_t &Op0, const Op1_t &Op1)
specific_intval< 1 > m_False()
VPDerivedIV_match< Op0_t, Op1_t, Op2_t > m_DerivedIV(const Op0_t &Op0, const Op1_t &Op1, const Op2_t &Op2)
VPInstruction_match< VPInstruction::ActiveLaneMask, Op0_t, Op1_t, Op2_t > m_ActiveLaneMask(const Op0_t &Op0, const Op1_t &Op1, const Op2_t &Op2)
VPInstruction_match< VPInstruction::BranchOnCount > m_BranchOnCount()
specific_intval< 1 > m_True()
VectorEndPointerRecipe_match< Op0_t, Op1_t > m_VecEndPtr(const Op0_t &Op0, const Op1_t &Op1)
VPInstruction_match< VPInstruction::Broadcast, Op0_t > m_Broadcast(const Op0_t &Op0)
class_match< VPValue > m_VPValue()
Match an arbitrary VPValue and ignore it.
VPInstruction_match< VPInstruction::ExplicitVectorLength, Op0_t > m_EVL(const Op0_t &Op0)
VPInstruction_match< VPInstruction::BuildVector > m_BuildVector()
BuildVector is matches only its opcode, w/o matching its operands as the number of operands is not fi...
VPInstruction_match< VPInstruction::FirstActiveLane, Op0_t > m_FirstActiveLane(const Op0_t &Op0)
bind_ty< VPInstruction > m_VPInstruction(VPInstruction *&V)
Match a VPInstruction, capturing if we match.
VPInstruction_match< VPInstruction::BranchOnCond > m_BranchOnCond()
initializer< Ty > init(const Ty &Val)
NodeAddr< DefNode * > Def
Definition RDFGraph.h:384
bool isSingleScalar(const VPValue *VPV)
Returns true if VPV is a single scalar, either because it produces the same value for all lanes or on...
bool isUniformAcrossVFsAndUFs(VPValue *V)
Checks if V is uniform across all VF lanes and UF parts.
VPValue * getOrCreateVPValueForSCEVExpr(VPlan &Plan, const SCEV *Expr)
Get or create a VPValue that corresponds to the expansion of Expr.
bool onlyFirstLaneUsed(const VPValue *Def)
Returns true if only the first lane of Def is used.
bool onlyScalarValuesUsed(const VPValue *Def)
Returns true if only scalar values of Def are used by all users.
bool isHeaderMask(const VPValue *V, const VPlan &Plan)
Return true if V is a header mask in Plan.
const SCEV * getSCEVExprForVPValue(const VPValue *V, ScalarEvolution &SE, const Loop *L=nullptr)
Return the SCEV expression for V.
This is an optimization pass for GlobalISel generic memory operations.
auto drop_begin(T &&RangeOrContainer, size_t N=1)
Return a range covering RangeOrContainer with the first N elements excluded.
Definition STLExtras.h:316
@ Offset
Definition DWP.cpp:477
bool all_of(R &&range, UnaryPredicate P)
Provide wrappers to std::all_of which take ranges instead of having to pass begin/end explicitly.
Definition STLExtras.h:1725
LLVM_ABI Intrinsic::ID getVectorIntrinsicIDForCall(const CallInst *CI, const TargetLibraryInfo *TLI)
Returns intrinsic ID for call.
DenseMap< const Value *, const SCEV * > ValueToSCEVMapTy
auto enumerate(FirstRange &&First, RestRanges &&...Rest)
Given two or more input ranges, returns a new range whose values are tuples (A, B,...
Definition STLExtras.h:2472
decltype(auto) dyn_cast(const From &Val)
dyn_cast<X> - Return the argument parameter cast to the specified type.
Definition Casting.h:643
const Value * getLoadStorePointerOperand(const Value *V)
A helper function that returns the pointer operand of a load or store instruction.
constexpr from_range_t from_range
auto dyn_cast_if_present(const Y &Val)
dyn_cast_if_present<X> - Functionally identical to dyn_cast, except that a null (or none in the case ...
Definition Casting.h:732
iterator_range< T > make_range(T x, T y)
Convenience function for iterating over sub-ranges.
void append_range(Container &C, Range &&R)
Wrapper function to append range R to container C.
Definition STLExtras.h:2136
iterator_range< early_inc_iterator_impl< detail::IterOfRange< RangeT > > > make_early_inc_range(RangeT &&Range)
Make a range that does early increment to allow mutation of the underlying range without disrupting i...
Definition STLExtras.h:632
auto cast_or_null(const Y &Val)
Definition Casting.h:714
iterator_range< df_iterator< VPBlockShallowTraversalWrapper< VPBlockBase * > > > vp_depth_first_shallow(VPBlockBase *G)
Returns an iterator range to traverse the graph starting at G in depth-first order.
Definition VPlanCFG.h:216
iterator_range< df_iterator< VPBlockDeepTraversalWrapper< VPBlockBase * > > > vp_depth_first_deep(VPBlockBase *G)
Returns an iterator range to traverse the graph starting at G in depth-first order while traversing t...
Definition VPlanCFG.h:243
detail::concat_range< ValueT, RangeTs... > concat(RangeTs &&...Ranges)
Returns a concatenated range across two or more ranges.
Definition STLExtras.h:1150
uint64_t PowerOf2Ceil(uint64_t A)
Returns the power of two which is greater than or equal to the given value.
Definition MathExtras.h:385
auto dyn_cast_or_null(const Y &Val)
Definition Casting.h:753
bool any_of(R &&range, UnaryPredicate P)
Provide wrappers to std::any_of which take ranges instead of having to pass begin/end explicitly.
Definition STLExtras.h:1732
auto reverse(ContainerTy &&C)
Definition STLExtras.h:406
iterator_range< po_iterator< VPBlockDeepTraversalWrapper< VPBlockBase * > > > vp_post_order_deep(VPBlockBase *G)
Returns an iterator range to traverse the graph starting at G in post order while traversing through ...
Definition VPlanCFG.h:236
void sort(IteratorTy Start, IteratorTy End)
Definition STLExtras.h:1622
bool none_of(R &&Range, UnaryPredicate P)
Provide wrappers to std::none_of which take ranges instead of having to pass begin/end explicitly.
Definition STLExtras.h:1739
SmallVector< ValueTypeFromRangeType< R >, Size > to_vector(R &&Range)
Given a range of type R, iterate the entire range and return a SmallVector with elements of the vecto...
iterator_range< filter_iterator< detail::IterOfRange< RangeT >, PredicateT > > make_filter_range(RangeT &&Range, PredicateT Pred)
Convenience function that takes a range of elements and a predicate, and return a new filter_iterator...
Definition STLExtras.h:550
bool canConstantBeExtended(const APInt *C, Type *NarrowType, TTI::PartialReductionExtendKind ExtKind)
Check if a constant CI can be safely treated as having been extended from a narrower type with the gi...
Definition VPlan.cpp:1723
bool isa(const From &Val)
isa<X> - Return true if the parameter to the template is an instance of one of the template type argu...
Definition Casting.h:547
auto drop_end(T &&RangeOrContainer, size_t N=1)
Return a range covering RangeOrContainer with the last N elements excluded.
Definition STLExtras.h:323
RecurKind
These are the kinds of recurrences that we support.
@ Mul
Product of integers.
@ Sub
Subtraction of integers.
@ Add
Sum of integers.
@ AddChainWithSubs
A chain of adds and subs.
FunctionAddr VTableAddr Next
Definition InstrProf.h:141
auto count(R &&Range, const E &Element)
Wrapper function around std::count to count the number of times an element Element occurs in the give...
Definition STLExtras.h:1954
DWARFExpression::Operation Op
auto count_if(R &&Range, UnaryPredicate P)
Wrapper function around std::count_if to count the number of times an element satisfying a given pred...
Definition STLExtras.h:1961
decltype(auto) cast(const From &Val)
cast<X> - Return the argument parameter cast to the specified type.
Definition Casting.h:559
LLVM_ABI BasicBlock * SplitBlock(BasicBlock *Old, BasicBlock::iterator SplitPt, DominatorTree *DT, LoopInfo *LI=nullptr, MemorySSAUpdater *MSSAU=nullptr, const Twine &BBName="", bool Before=false)
Split the specified block at the specified instruction.
auto find_if(R &&Range, UnaryPredicate P)
Provide wrappers to std::find_if which take ranges instead of having to pass begin/end explicitly.
Definition STLExtras.h:1758
Type * getLoadStoreType(const Value *I)
A helper function that returns the type of a load or store instruction.
bool all_equal(std::initializer_list< T > Values)
Returns true if all Values in the initializer lists are equal or the list.
Definition STLExtras.h:2108
@ DataAndControlFlowWithoutRuntimeCheck
Use predicate to control both data and control flow, but modify the trip count so that a runtime over...
hash_code hash_combine(const Ts &...args)
Combine values into a single hash_code.
Definition Hashing.h:592
bool equal(L &&LRange, R &&RRange)
Wrapper function around std::equal to detect if pair-wise elements between two ranges are the same.
Definition STLExtras.h:2088
Type * toVectorTy(Type *Scalar, ElementCount EC)
A helper function for converting Scalar types to vector types.
@ Default
The result values are uniform if and only if all operands are uniform.
Definition Uniformity.h:20
constexpr detail::IsaCheckPredicate< Types... > IsaPred
Function object wrapper for the llvm::isa type check.
Definition Casting.h:866
hash_code hash_combine_range(InputIteratorT first, InputIteratorT last)
Compute a hash_code for a sequence of values.
Definition Hashing.h:466
#define N
RemoveMask_match(const Op0_t &In, Op1_t &Out)
bool match(OpTy *V) const
This struct is a compact representation of a valid (non-zero power of two) alignment.
Definition Alignment.h:39
An information struct used to provide DenseMap with the various necessary components for a given valu...
Incoming for lane maks phi as machine instruction, incoming register Reg and incoming block Block are...
A range of powers-of-2 vectorization factors with fixed start and adjustable end.
Struct to hold various analysis needed for cost computations.
A recipe for handling first-order recurrence phis.
Definition VPlan.h:2300
A recipe for widening load operations with vector-predication intrinsics, using the address to load f...
Definition VPlan.h:3310
A recipe for widening load operations, using the address to load from and an optional mask.
Definition VPlan.h:3270
A recipe for widening select instructions.
Definition VPlan.h:1732
A recipe for widening store operations with vector-predication intrinsics, using the value to store,...
Definition VPlan.h:3392
A recipe for widening store operations, using the stored value, the address to store to and an option...
Definition VPlan.h:3350
static void materializeBroadcasts(VPlan &Plan)
Add explicit broadcasts for live-ins and VPValues defined in Plan's entry block if they are used as v...
static void materializePacksAndUnpacks(VPlan &Plan)
Add explicit Build[Struct]Vector recipes to Pack multiple scalar values into vectors and Unpack recip...
static void materializeBackedgeTakenCount(VPlan &Plan, VPBasicBlock *VectorPH)
Materialize the backedge-taken count to be computed explicitly using VPInstructions.
static void optimizeInductionExitUsers(VPlan &Plan, DenseMap< VPValue *, VPValue * > &EndValues, ScalarEvolution &SE)
If there's a single exit block, optimize its phi recipes that use exiting IV values by feeding them p...
static void addScalarResumePhis(VPlan &Plan, VPRecipeBuilder &Builder, DenseMap< VPValue *, VPValue * > &IVEndValues)
Create resume phis in the scalar preheader for first-order recurrences, reductions and inductions,...
static void canonicalizeEVLLoops(VPlan &Plan)
Transform EVL loops to use variable-length stepping after region dissolution.
static void dropPoisonGeneratingRecipes(VPlan &Plan, const std::function< bool(BasicBlock *)> &BlockNeedsPredication)
Drop poison flags from recipes that may generate a poison value that is used after vectorization,...
static void createAndOptimizeReplicateRegions(VPlan &Plan)
Wrap predicated VPReplicateRecipes with a mask operand in an if-then region block and remove the mask...
static void createInterleaveGroups(VPlan &Plan, const SmallPtrSetImpl< const InterleaveGroup< Instruction > * > &InterleaveGroups, VPRecipeBuilder &RecipeBuilder, const bool &ScalarEpilogueAllowed)
static bool runPass(bool(*Transform)(VPlan &, ArgsTy...), VPlan &Plan, typename std::remove_reference< ArgsTy >::type &...Args)
Helper to run a VPlan transform Transform on VPlan, forwarding extra arguments to the transform.
static void addBranchWeightToMiddleTerminator(VPlan &Plan, ElementCount VF, std::optional< unsigned > VScaleForTuning)
Add branch weight metadata, if the Plan's middle block is terminated by a BranchOnCond recipe.
static DenseMap< const SCEV *, Value * > expandSCEVs(VPlan &Plan, ScalarEvolution &SE)
Expand VPExpandSCEVRecipes in Plan's entry block.
static void convertToConcreteRecipes(VPlan &Plan)
Lower abstract recipes to concrete ones, that can be codegen'd.
static void convertToAbstractRecipes(VPlan &Plan, VPCostContext &Ctx, VFRange &Range)
This function converts initial recipes to the abstract recipes and clamps Range based on cost model f...
static void materializeConstantVectorTripCount(VPlan &Plan, ElementCount BestVF, unsigned BestUF, PredicatedScalarEvolution &PSE)
static LLVM_ABI_FOR_TEST bool tryToConvertVPInstructionsToVPRecipes(VPlan &Plan, function_ref< const InductionDescriptor *(PHINode *)> GetIntOrFpInductionDescriptor, const TargetLibraryInfo &TLI)
Replaces the VPInstructions in Plan with corresponding widen recipes.
static void addExitUsersForFirstOrderRecurrences(VPlan &Plan, VFRange &Range)
Handle users in the exit block for first order reductions in the original exit block.
static void addExplicitVectorLength(VPlan &Plan, const std::optional< unsigned > &MaxEVLSafeElements)
Add a VPEVLBasedIVPHIRecipe and related recipes to Plan and replaces all uses except the canonical IV...
static void replaceSymbolicStrides(VPlan &Plan, PredicatedScalarEvolution &PSE, const DenseMap< Value *, const SCEV * > &StridesMap)
Replace symbolic strides from StridesMap in Plan with constants when possible.
static void removeBranchOnConst(VPlan &Plan)
Remove BranchOnCond recipes with true or false conditions together with removing dead edges to their ...
static void removeDeadRecipes(VPlan &Plan)
Remove dead recipes from Plan.
static void materializeVectorTripCount(VPlan &Plan, VPBasicBlock *VectorPHVPBB, bool TailByMasking, bool RequiresScalarEpilogue)
Materialize vector trip count computations to a set of VPInstructions.
static void simplifyRecipes(VPlan &Plan)
Perform instcombine-like simplifications on recipes in Plan.
static void handleUncountableEarlyExit(VPBasicBlock *EarlyExitingVPBB, VPBasicBlock *EarlyExitVPBB, VPlan &Plan, VPBasicBlock *HeaderVPBB, VPBasicBlock *LatchVPBB)
Update Plan to account for the uncountable early exit from EarlyExitingVPBB to EarlyExitVPBB by.
static void clearReductionWrapFlags(VPlan &Plan)
Clear NSW/NUW flags from reduction instructions if necessary.
static void cse(VPlan &Plan)
Perform common-subexpression-elimination on Plan.
static void addActiveLaneMask(VPlan &Plan, bool UseActiveLaneMaskForControlFlow, bool DataAndControlFlowWithoutRuntimeCheck)
Replace (ICMP_ULE, wide canonical IV, backedge-taken-count) checks with an (active-lane-mask recipe,...
static void optimize(VPlan &Plan)
Apply VPlan-to-VPlan optimizations to Plan, including induction recipe optimizations,...
static void dissolveLoopRegions(VPlan &Plan)
Replace loop regions with explicit CFG.
static void narrowInterleaveGroups(VPlan &Plan, ElementCount VF, unsigned VectorRegWidth)
Try to convert a plan with interleave groups with VF elements to a plan with the interleave groups re...
static void truncateToMinimalBitwidths(VPlan &Plan, const MapVector< Instruction *, uint64_t > &MinBWs)
Insert truncates and extends for any truncated recipe.
static bool adjustFixedOrderRecurrences(VPlan &Plan, VPBuilder &Builder)
Try to have all users of fixed-order recurrences appear after the recipe defining their previous valu...
static void optimizeForVFAndUF(VPlan &Plan, ElementCount BestVF, unsigned BestUF, PredicatedScalarEvolution &PSE)
Optimize Plan based on BestVF and BestUF.
static void materializeVFAndVFxUF(VPlan &Plan, VPBasicBlock *VectorPH, ElementCount VF)
Materialize VF and VFxUF to be computed explicitly using VPInstructions.