LLVM 22.0.0git
LoopInterchange.cpp
Go to the documentation of this file.
1//===- LoopInterchange.cpp - Loop interchange pass-------------------------===//
2//
3// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4// See https://llvm.org/LICENSE.txt for license information.
5// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6//
7//===----------------------------------------------------------------------===//
8//
9// This Pass handles loop interchange transform.
10// This pass interchanges loops to provide a more cache-friendly memory access
11// patterns.
12//
13//===----------------------------------------------------------------------===//
14
16#include "llvm/ADT/STLExtras.h"
17#include "llvm/ADT/SmallSet.h"
19#include "llvm/ADT/Statistic.h"
20#include "llvm/ADT/StringMap.h"
21#include "llvm/ADT/StringRef.h"
30#include "llvm/IR/BasicBlock.h"
32#include "llvm/IR/Dominators.h"
33#include "llvm/IR/Function.h"
34#include "llvm/IR/InstrTypes.h"
35#include "llvm/IR/Instruction.h"
37#include "llvm/IR/User.h"
38#include "llvm/IR/Value.h"
41#include "llvm/Support/Debug.h"
48#include <cassert>
49#include <utility>
50#include <vector>
51
52using namespace llvm;
53
54#define DEBUG_TYPE "loop-interchange"
55
56STATISTIC(LoopsInterchanged, "Number of loops interchanged");
57
59 "loop-interchange-threshold", cl::init(0), cl::Hidden,
60 cl::desc("Interchange if you gain more than this number"));
61
62// Maximum number of load-stores that can be handled in the dependency matrix.
64 "loop-interchange-max-meminstr-count", cl::init(64), cl::Hidden,
66 "Maximum number of load-store instructions that should be handled "
67 "in the dependency matrix. Higher value may lead to more interchanges "
68 "at the cost of compile-time"));
69
70namespace {
71
73
74/// A list of direction vectors. Each entry represents a direction vector
75/// corresponding to one or more dependencies existing in the loop nest. The
76/// length of all direction vectors is equal and is N + 1, where N is the depth
77/// of the loop nest. The first N elements correspond to the dependency
78/// direction of each N loops. The last one indicates whether this entry is
79/// forward dependency ('<') or not ('*'). The term "forward" aligns with what
80/// is defined in LoopAccessAnalysis.
81// TODO: Check if we can use a sparse matrix here.
82using CharMatrix = std::vector<std::vector<char>>;
83
84/// Types of rules used in profitability check.
85enum class RuleTy {
86 PerLoopCacheAnalysis,
87 PerInstrOrderCost,
88 ForVectorization,
89 Ignore
90};
91
92} // end anonymous namespace
93
94// Minimum loop depth supported.
96 "loop-interchange-min-loop-nest-depth", cl::init(2), cl::Hidden,
97 cl::desc("Minimum depth of loop nest considered for the transform"));
98
99// Maximum loop depth supported.
101 "loop-interchange-max-loop-nest-depth", cl::init(10), cl::Hidden,
102 cl::desc("Maximum depth of loop nest considered for the transform"));
103
104// We prefer cache cost to vectorization by default.
106 "loop-interchange-profitabilities", cl::ZeroOrMore,
108 cl::desc("List of profitability heuristics to be used. They are applied in "
109 "the given order"),
110 cl::list_init<RuleTy>({RuleTy::PerLoopCacheAnalysis,
111 RuleTy::PerInstrOrderCost,
112 RuleTy::ForVectorization}),
113 cl::values(clEnumValN(RuleTy::PerLoopCacheAnalysis, "cache",
114 "Prioritize loop cache cost"),
115 clEnumValN(RuleTy::PerInstrOrderCost, "instorder",
116 "Prioritize the IVs order of each instruction"),
117 clEnumValN(RuleTy::ForVectorization, "vectorize",
118 "Prioritize vectorization"),
119 clEnumValN(RuleTy::Ignore, "ignore",
120 "Ignore profitability, force interchange (does not "
121 "work with other options)")));
122
123#ifndef NDEBUG
126 for (RuleTy Rule : Rules) {
127 if (!Set.insert(Rule).second)
128 return false;
129 if (Rule == RuleTy::Ignore)
130 return false;
131 }
132 return true;
133}
134
135static void printDepMatrix(CharMatrix &DepMatrix) {
136 for (auto &Row : DepMatrix) {
137 // Drop the last element because it is a flag indicating whether this is
138 // forward dependency or not, which doesn't affect the legality check.
139 for (char D : drop_end(Row))
140 LLVM_DEBUG(dbgs() << D << " ");
141 LLVM_DEBUG(dbgs() << "\n");
142 }
143}
144
145/// Return true if \p Src appears before \p Dst in the same basic block.
146/// Precondition: \p Src and \Dst are distinct instructions within the same
147/// basic block.
148static bool inThisOrder(const Instruction *Src, const Instruction *Dst) {
149 assert(Src->getParent() == Dst->getParent() && Src != Dst &&
150 "Expected Src and Dst to be different instructions in the same BB");
151
152 bool FoundSrc = false;
153 for (const Instruction &I : *(Src->getParent())) {
154 if (&I == Src) {
155 FoundSrc = true;
156 continue;
157 }
158 if (&I == Dst)
159 return FoundSrc;
160 }
161
162 llvm_unreachable("Dst not found");
163}
164#endif
165
166static bool populateDependencyMatrix(CharMatrix &DepMatrix, unsigned Level,
167 Loop *L, DependenceInfo *DI,
168 ScalarEvolution *SE,
171
172 ValueVector MemInstr;
173
174 // For each block.
175 for (BasicBlock *BB : L->blocks()) {
176 // Scan the BB and collect legal loads and stores.
177 for (Instruction &I : *BB) {
178 if (!isa<Instruction>(I))
179 return false;
180 if (auto *Ld = dyn_cast<LoadInst>(&I)) {
181 if (!Ld->isSimple())
182 return false;
183 MemInstr.push_back(&I);
184 } else if (auto *St = dyn_cast<StoreInst>(&I)) {
185 if (!St->isSimple())
186 return false;
187 MemInstr.push_back(&I);
188 }
189 }
190 }
191
192 LLVM_DEBUG(dbgs() << "Found " << MemInstr.size()
193 << " Loads and Stores to analyze\n");
194 if (MemInstr.size() > MaxMemInstrCount) {
195 LLVM_DEBUG(dbgs() << "The transform doesn't support more than "
196 << MaxMemInstrCount << " load/stores in a loop\n");
197 ORE->emit([&]() {
198 return OptimizationRemarkMissed(DEBUG_TYPE, "UnsupportedLoop",
199 L->getStartLoc(), L->getHeader())
200 << "Number of loads/stores exceeded, the supported maximum "
201 "can be increased with option "
202 "-loop-interchange-maxmeminstr-count.";
203 });
204 return false;
205 }
206 ValueVector::iterator I, IE, J, JE;
207
208 // Manage direction vectors that are already seen. Map each direction vector
209 // to an index of DepMatrix at which it is stored.
211
212 for (I = MemInstr.begin(), IE = MemInstr.end(); I != IE; ++I) {
213 for (J = I, JE = MemInstr.end(); J != JE; ++J) {
214 std::vector<char> Dep;
217 // Ignore Input dependencies.
218 if (isa<LoadInst>(Src) && isa<LoadInst>(Dst))
219 continue;
220 // Track Output, Flow, and Anti dependencies.
221 if (auto D = DI->depends(Src, Dst)) {
222 assert(D->isOrdered() && "Expected an output, flow or anti dep.");
223 // If the direction vector is negative, normalize it to
224 // make it non-negative.
225 if (D->normalize(SE))
226 LLVM_DEBUG(dbgs() << "Negative dependence vector normalized.\n");
227 LLVM_DEBUG(StringRef DepType =
228 D->isFlow() ? "flow" : D->isAnti() ? "anti" : "output";
229 dbgs() << "Found " << DepType
230 << " dependency between Src and Dst\n"
231 << " Src:" << *Src << "\n Dst:" << *Dst << '\n');
232 unsigned Levels = D->getLevels();
233 char Direction;
234 for (unsigned II = 1; II <= Levels; ++II) {
235 // `DVEntry::LE` is converted to `*`. This is because `LE` means `<`
236 // or `=`, for which we don't have an equivalent representation, so
237 // that the conservative approximation is necessary. The same goes for
238 // `DVEntry::GE`.
239 // TODO: Use of fine-grained expressions allows for more accurate
240 // analysis.
241 unsigned Dir = D->getDirection(II);
242 if (Dir == Dependence::DVEntry::LT)
243 Direction = '<';
244 else if (Dir == Dependence::DVEntry::GT)
245 Direction = '>';
246 else if (Dir == Dependence::DVEntry::EQ)
247 Direction = '=';
248 else
249 Direction = '*';
250 Dep.push_back(Direction);
251 }
252
253 // If the Dependence object doesn't have any information, fill the
254 // dependency vector with '*'.
255 if (D->isConfused()) {
256 assert(Dep.empty() && "Expected empty dependency vector");
257 Dep.assign(Level, '*');
258 }
259
260 while (Dep.size() != Level) {
261 Dep.push_back('I');
262 }
263
264 // If all the elements of any direction vector have only '*', legality
265 // can't be proven. Exit early to save compile time.
266 if (all_of(Dep, [](char C) { return C == '*'; })) {
267 ORE->emit([&]() {
268 return OptimizationRemarkMissed(DEBUG_TYPE, "Dependence",
269 L->getStartLoc(), L->getHeader())
270 << "All loops have dependencies in all directions.";
271 });
272 return false;
273 }
274
275 // Test whether the dependency is forward or not.
276 bool IsKnownForward = true;
277 if (Src->getParent() != Dst->getParent()) {
278 // In general, when Src and Dst are in different BBs, the execution
279 // order of them within a single iteration is not guaranteed. Treat
280 // conservatively as not-forward dependency in this case.
281 IsKnownForward = false;
282 } else {
283 // Src and Dst are in the same BB. If they are the different
284 // instructions, Src should appear before Dst in the BB as they are
285 // stored to MemInstr in that order.
286 assert((Src == Dst || inThisOrder(Src, Dst)) &&
287 "Unexpected instructions");
288
289 // If the Dependence object is reversed (due to normalization), it
290 // represents the dependency from Dst to Src, meaning it is a backward
291 // dependency. Otherwise it should be a forward dependency.
292 bool IsReversed = D->getSrc() != Src;
293 if (IsReversed)
294 IsKnownForward = false;
295 }
296
297 // Initialize the last element. Assume forward dependencies only; it
298 // will be updated later if there is any non-forward dependency.
299 Dep.push_back('<');
300
301 // The last element should express the "summary" among one or more
302 // direction vectors whose first N elements are the same (where N is
303 // the depth of the loop nest). Hence we exclude the last element from
304 // the Seen map.
305 auto [Ite, Inserted] = Seen.try_emplace(
306 StringRef(Dep.data(), Dep.size() - 1), DepMatrix.size());
307
308 // Make sure we only add unique entries to the dependency matrix.
309 if (Inserted)
310 DepMatrix.push_back(Dep);
311
312 // If we cannot prove that this dependency is forward, change the last
313 // element of the corresponding entry. Since a `[... *]` dependency
314 // includes a `[... <]` dependency, we do not need to keep both and
315 // change the existing entry instead.
316 if (!IsKnownForward)
317 DepMatrix[Ite->second].back() = '*';
318 }
319 }
320 }
321
322 return true;
323}
324
325// A loop is moved from index 'from' to an index 'to'. Update the Dependence
326// matrix by exchanging the two columns.
327static void interChangeDependencies(CharMatrix &DepMatrix, unsigned FromIndx,
328 unsigned ToIndx) {
329 for (auto &Row : DepMatrix)
330 std::swap(Row[ToIndx], Row[FromIndx]);
331}
332
333// Check if a direction vector is lexicographically positive. Return true if it
334// is positive, nullopt if it is "zero", otherwise false.
335// [Theorem] A permutation of the loops in a perfect nest is legal if and only
336// if the direction matrix, after the same permutation is applied to its
337// columns, has no ">" direction as the leftmost non-"=" direction in any row.
338static std::optional<bool>
339isLexicographicallyPositive(ArrayRef<char> DV, unsigned Begin, unsigned End) {
340 for (unsigned char Direction : DV.slice(Begin, End - Begin)) {
341 if (Direction == '<')
342 return true;
343 if (Direction == '>' || Direction == '*')
344 return false;
345 }
346 return std::nullopt;
347}
348
349// Checks if it is legal to interchange 2 loops.
350static bool isLegalToInterChangeLoops(CharMatrix &DepMatrix,
351 unsigned InnerLoopId,
352 unsigned OuterLoopId) {
353 unsigned NumRows = DepMatrix.size();
354 std::vector<char> Cur;
355 // For each row check if it is valid to interchange.
356 for (unsigned Row = 0; Row < NumRows; ++Row) {
357 // Create temporary DepVector check its lexicographical order
358 // before and after swapping OuterLoop vs InnerLoop
359 Cur = DepMatrix[Row];
360
361 // If the surrounding loops already ensure that the direction vector is
362 // lexicographically positive, nothing within the loop will be able to break
363 // the dependence. In such a case we can skip the subsequent check.
364 if (isLexicographicallyPositive(Cur, 0, OuterLoopId) == true)
365 continue;
366
367 // Check if the direction vector is lexicographically positive (or zero)
368 // for both before/after exchanged. Ignore the last element because it
369 // doesn't affect the legality.
370 if (isLexicographicallyPositive(Cur, OuterLoopId, Cur.size() - 1) == false)
371 return false;
372 std::swap(Cur[InnerLoopId], Cur[OuterLoopId]);
373 if (isLexicographicallyPositive(Cur, OuterLoopId, Cur.size() - 1) == false)
374 return false;
375 }
376 return true;
377}
378
379static void populateWorklist(Loop &L, LoopVector &LoopList) {
380 LLVM_DEBUG(dbgs() << "Calling populateWorklist on Func: "
381 << L.getHeader()->getParent()->getName() << " Loop: %"
382 << L.getHeader()->getName() << '\n');
383 assert(LoopList.empty() && "LoopList should initially be empty!");
384 Loop *CurrentLoop = &L;
385 const std::vector<Loop *> *Vec = &CurrentLoop->getSubLoops();
386 while (!Vec->empty()) {
387 // The current loop has multiple subloops in it hence it is not tightly
388 // nested.
389 // Discard all loops above it added into Worklist.
390 if (Vec->size() != 1) {
391 LoopList = {};
392 return;
393 }
394
395 LoopList.push_back(CurrentLoop);
396 CurrentLoop = Vec->front();
397 Vec = &CurrentLoop->getSubLoops();
398 }
399 LoopList.push_back(CurrentLoop);
400}
401
404 unsigned LoopNestDepth = LoopList.size();
405 if (LoopNestDepth < MinLoopNestDepth || LoopNestDepth > MaxLoopNestDepth) {
406 LLVM_DEBUG(dbgs() << "Unsupported depth of loop nest " << LoopNestDepth
407 << ", the supported range is [" << MinLoopNestDepth
408 << ", " << MaxLoopNestDepth << "].\n");
409 Loop *OuterLoop = LoopList.front();
410 ORE.emit([&]() {
411 return OptimizationRemarkMissed(DEBUG_TYPE, "UnsupportedLoopNestDepth",
412 OuterLoop->getStartLoc(),
413 OuterLoop->getHeader())
414 << "Unsupported depth of loop nest, the supported range is ["
415 << std::to_string(MinLoopNestDepth) << ", "
416 << std::to_string(MaxLoopNestDepth) << "].\n";
417 });
418 return false;
419 }
420 return true;
421}
422
424 ArrayRef<Loop *> LoopList) {
425 for (Loop *L : LoopList) {
426 const SCEV *ExitCountOuter = SE->getBackedgeTakenCount(L);
427 if (isa<SCEVCouldNotCompute>(ExitCountOuter)) {
428 LLVM_DEBUG(dbgs() << "Couldn't compute backedge count\n");
429 return false;
430 }
431 if (L->getNumBackEdges() != 1) {
432 LLVM_DEBUG(dbgs() << "NumBackEdges is not equal to 1\n");
433 return false;
434 }
435 if (!L->getExitingBlock()) {
436 LLVM_DEBUG(dbgs() << "Loop doesn't have unique exit block\n");
437 return false;
438 }
439 }
440 return true;
441}
442
443namespace {
444
445/// LoopInterchangeLegality checks if it is legal to interchange the loop.
446class LoopInterchangeLegality {
447public:
448 LoopInterchangeLegality(Loop *Outer, Loop *Inner, ScalarEvolution *SE,
449 OptimizationRemarkEmitter *ORE)
450 : OuterLoop(Outer), InnerLoop(Inner), SE(SE), ORE(ORE) {}
451
452 /// Check if the loops can be interchanged.
453 bool canInterchangeLoops(unsigned InnerLoopId, unsigned OuterLoopId,
454 CharMatrix &DepMatrix);
455
456 /// Discover induction PHIs in the header of \p L. Induction
457 /// PHIs are added to \p Inductions.
458 bool findInductions(Loop *L, SmallVectorImpl<PHINode *> &Inductions);
459
460 /// Check if the loop structure is understood. We do not handle triangular
461 /// loops for now.
462 bool isLoopStructureUnderstood();
463
464 bool currentLimitations();
465
466 const SmallPtrSetImpl<PHINode *> &getOuterInnerReductions() const {
467 return OuterInnerReductions;
468 }
469
470 const ArrayRef<PHINode *> getInnerLoopInductions() const {
471 return InnerLoopInductions;
472 }
473
474 ArrayRef<Instruction *> getHasNoWrapReductions() const {
475 return HasNoWrapReductions;
476 }
477
478private:
479 bool tightlyNested(Loop *Outer, Loop *Inner);
480 bool containsUnsafeInstructions(BasicBlock *BB);
481
482 /// Discover induction and reduction PHIs in the header of \p L. Induction
483 /// PHIs are added to \p Inductions, reductions are added to
484 /// OuterInnerReductions. When the outer loop is passed, the inner loop needs
485 /// to be passed as \p InnerLoop.
486 bool findInductionAndReductions(Loop *L,
487 SmallVector<PHINode *, 8> &Inductions,
488 Loop *InnerLoop);
489
490 Loop *OuterLoop;
491 Loop *InnerLoop;
492
493 ScalarEvolution *SE;
494
495 /// Interface to emit optimization remarks.
496 OptimizationRemarkEmitter *ORE;
497
498 /// Set of reduction PHIs taking part of a reduction across the inner and
499 /// outer loop.
500 SmallPtrSet<PHINode *, 4> OuterInnerReductions;
501
502 /// Set of inner loop induction PHIs
503 SmallVector<PHINode *, 8> InnerLoopInductions;
504
505 /// Hold instructions that have nuw/nsw flags and involved in reductions,
506 /// like integer addition/multiplication. Those flags must be dropped when
507 /// interchanging the loops.
508 SmallVector<Instruction *, 4> HasNoWrapReductions;
509};
510
511/// Manages information utilized by the profitability check for cache. The main
512/// purpose of this class is to delay the computation of CacheCost until it is
513/// actually needed.
514class CacheCostManager {
515 Loop *OutermostLoop;
516 LoopStandardAnalysisResults *AR;
517 DependenceInfo *DI;
518
519 /// CacheCost for \ref OutermostLoop. Once it is computed, it is cached. Note
520 /// that the result can be nullptr.
521 std::optional<std::unique_ptr<CacheCost>> CC;
522
523 /// Maps each loop to an index representing the optimal position within the
524 /// loop-nest, as determined by the cache cost analysis.
525 DenseMap<const Loop *, unsigned> CostMap;
526
527 void computeIfUnitinialized();
528
529public:
530 CacheCostManager(Loop *OutermostLoop, LoopStandardAnalysisResults *AR,
531 DependenceInfo *DI)
532 : OutermostLoop(OutermostLoop), AR(AR), DI(DI) {}
533 CacheCost *getCacheCost();
534 const DenseMap<const Loop *, unsigned> &getCostMap();
535};
536
537/// LoopInterchangeProfitability checks if it is profitable to interchange the
538/// loop.
539class LoopInterchangeProfitability {
540public:
541 LoopInterchangeProfitability(Loop *Outer, Loop *Inner, ScalarEvolution *SE,
542 OptimizationRemarkEmitter *ORE)
543 : OuterLoop(Outer), InnerLoop(Inner), SE(SE), ORE(ORE) {}
544
545 /// Check if the loop interchange is profitable.
546 bool isProfitable(const Loop *InnerLoop, const Loop *OuterLoop,
547 unsigned InnerLoopId, unsigned OuterLoopId,
548 CharMatrix &DepMatrix, CacheCostManager &CCM);
549
550private:
551 int getInstrOrderCost();
552 std::optional<bool> isProfitablePerLoopCacheAnalysis(
553 const DenseMap<const Loop *, unsigned> &CostMap, CacheCost *CC);
554 std::optional<bool> isProfitablePerInstrOrderCost();
555 std::optional<bool> isProfitableForVectorization(unsigned InnerLoopId,
556 unsigned OuterLoopId,
557 CharMatrix &DepMatrix);
558 Loop *OuterLoop;
559 Loop *InnerLoop;
560
561 /// Scev analysis.
562 ScalarEvolution *SE;
563
564 /// Interface to emit optimization remarks.
565 OptimizationRemarkEmitter *ORE;
566};
567
568/// LoopInterchangeTransform interchanges the loop.
569class LoopInterchangeTransform {
570public:
571 LoopInterchangeTransform(Loop *Outer, Loop *Inner, ScalarEvolution *SE,
572 LoopInfo *LI, DominatorTree *DT,
573 const LoopInterchangeLegality &LIL)
574 : OuterLoop(Outer), InnerLoop(Inner), SE(SE), LI(LI), DT(DT), LIL(LIL) {}
575
576 /// Interchange OuterLoop and InnerLoop.
577 bool transform(ArrayRef<Instruction *> DropNoWrapInsts);
578 void restructureLoops(Loop *NewInner, Loop *NewOuter,
579 BasicBlock *OrigInnerPreHeader,
580 BasicBlock *OrigOuterPreHeader);
581 void removeChildLoop(Loop *OuterLoop, Loop *InnerLoop);
582
583private:
584 bool adjustLoopLinks();
585 bool adjustLoopBranches();
586
587 Loop *OuterLoop;
588 Loop *InnerLoop;
589
590 /// Scev analysis.
591 ScalarEvolution *SE;
592
593 LoopInfo *LI;
594 DominatorTree *DT;
595
596 const LoopInterchangeLegality &LIL;
597};
598
599struct LoopInterchange {
600 ScalarEvolution *SE = nullptr;
601 LoopInfo *LI = nullptr;
602 DependenceInfo *DI = nullptr;
603 DominatorTree *DT = nullptr;
604 LoopStandardAnalysisResults *AR = nullptr;
605
606 /// Interface to emit optimization remarks.
607 OptimizationRemarkEmitter *ORE;
608
609 LoopInterchange(ScalarEvolution *SE, LoopInfo *LI, DependenceInfo *DI,
610 DominatorTree *DT, LoopStandardAnalysisResults *AR,
611 OptimizationRemarkEmitter *ORE)
612 : SE(SE), LI(LI), DI(DI), DT(DT), AR(AR), ORE(ORE) {}
613
614 bool run(Loop *L) {
615 if (L->getParentLoop())
616 return false;
617 SmallVector<Loop *, 8> LoopList;
618 populateWorklist(*L, LoopList);
619 return processLoopList(LoopList);
620 }
621
622 bool run(LoopNest &LN) {
623 SmallVector<Loop *, 8> LoopList(LN.getLoops());
624 for (unsigned I = 1; I < LoopList.size(); ++I)
625 if (LoopList[I]->getParentLoop() != LoopList[I - 1])
626 return false;
627 return processLoopList(LoopList);
628 }
629
630 unsigned selectLoopForInterchange(ArrayRef<Loop *> LoopList) {
631 // TODO: Add a better heuristic to select the loop to be interchanged based
632 // on the dependence matrix. Currently we select the innermost loop.
633 return LoopList.size() - 1;
634 }
635
636 bool processLoopList(SmallVectorImpl<Loop *> &LoopList) {
637 bool Changed = false;
638
639 // Ensure proper loop nest depth.
640 assert(hasSupportedLoopDepth(LoopList, *ORE) &&
641 "Unsupported depth of loop nest.");
642
643 unsigned LoopNestDepth = LoopList.size();
644
645 LLVM_DEBUG({
646 dbgs() << "Processing LoopList of size = " << LoopNestDepth
647 << " containing the following loops:\n";
648 for (auto *L : LoopList) {
649 dbgs() << " - ";
650 L->print(dbgs());
651 }
652 });
653
654 CharMatrix DependencyMatrix;
655 Loop *OuterMostLoop = *(LoopList.begin());
656 if (!populateDependencyMatrix(DependencyMatrix, LoopNestDepth,
657 OuterMostLoop, DI, SE, ORE)) {
658 LLVM_DEBUG(dbgs() << "Populating dependency matrix failed\n");
659 return false;
660 }
661
662 LLVM_DEBUG(dbgs() << "Dependency matrix before interchange:\n";
663 printDepMatrix(DependencyMatrix));
664
665 // Get the Outermost loop exit.
666 BasicBlock *LoopNestExit = OuterMostLoop->getExitBlock();
667 if (!LoopNestExit) {
668 LLVM_DEBUG(dbgs() << "OuterMostLoop '" << OuterMostLoop->getName()
669 << "' needs an unique exit block");
670 return false;
671 }
672
673 unsigned SelecLoopId = selectLoopForInterchange(LoopList);
674 CacheCostManager CCM(LoopList[0], AR, DI);
675 // We try to achieve the globally optimal memory access for the loopnest,
676 // and do interchange based on a bubble-sort fasion. We start from
677 // the innermost loop, move it outwards to the best possible position
678 // and repeat this process.
679 for (unsigned j = SelecLoopId; j > 0; j--) {
680 bool ChangedPerIter = false;
681 for (unsigned i = SelecLoopId; i > SelecLoopId - j; i--) {
682 bool Interchanged =
683 processLoop(LoopList, i, i - 1, DependencyMatrix, CCM);
684 ChangedPerIter |= Interchanged;
685 Changed |= Interchanged;
686 }
687 // Early abort if there was no interchange during an entire round of
688 // moving loops outwards.
689 if (!ChangedPerIter)
690 break;
691 }
692 return Changed;
693 }
694
695 bool processLoop(SmallVectorImpl<Loop *> &LoopList, unsigned InnerLoopId,
696 unsigned OuterLoopId,
697 std::vector<std::vector<char>> &DependencyMatrix,
698 CacheCostManager &CCM) {
699 Loop *OuterLoop = LoopList[OuterLoopId];
700 Loop *InnerLoop = LoopList[InnerLoopId];
701 LLVM_DEBUG(dbgs() << "Processing InnerLoopId = " << InnerLoopId
702 << " and OuterLoopId = " << OuterLoopId << "\n");
703 LoopInterchangeLegality LIL(OuterLoop, InnerLoop, SE, ORE);
704 if (!LIL.canInterchangeLoops(InnerLoopId, OuterLoopId, DependencyMatrix)) {
705 LLVM_DEBUG(dbgs() << "Cannot prove legality, not interchanging loops '"
706 << OuterLoop->getName() << "' and '"
707 << InnerLoop->getName() << "'\n");
708 return false;
709 }
710 LLVM_DEBUG(dbgs() << "Loops '" << OuterLoop->getName() << "' and '"
711 << InnerLoop->getName()
712 << "' are legal to interchange\n");
713 LoopInterchangeProfitability LIP(OuterLoop, InnerLoop, SE, ORE);
714 if (!LIP.isProfitable(InnerLoop, OuterLoop, InnerLoopId, OuterLoopId,
715 DependencyMatrix, CCM)) {
716 LLVM_DEBUG(dbgs() << "Interchanging loops '" << OuterLoop->getName()
717 << "' and '" << InnerLoop->getName()
718 << "' not profitable.\n");
719 return false;
720 }
721
722 ORE->emit([&]() {
723 return OptimizationRemark(DEBUG_TYPE, "Interchanged",
724 InnerLoop->getStartLoc(),
725 InnerLoop->getHeader())
726 << "Loop interchanged with enclosing loop.";
727 });
728
729 LoopInterchangeTransform LIT(OuterLoop, InnerLoop, SE, LI, DT, LIL);
730 LIT.transform(LIL.getHasNoWrapReductions());
731 LLVM_DEBUG(dbgs() << "Loops interchanged: outer loop '"
732 << OuterLoop->getName() << "' and inner loop '"
733 << InnerLoop->getName() << "'\n");
734 LoopsInterchanged++;
735
736 llvm::formLCSSARecursively(*OuterLoop, *DT, LI, SE);
737
738 // Loops interchanged, update LoopList accordingly.
739 std::swap(LoopList[OuterLoopId], LoopList[InnerLoopId]);
740 // Update the DependencyMatrix
741 interChangeDependencies(DependencyMatrix, InnerLoopId, OuterLoopId);
742
743 LLVM_DEBUG(dbgs() << "Dependency matrix after interchange:\n";
744 printDepMatrix(DependencyMatrix));
745
746 return true;
747 }
748};
749
750} // end anonymous namespace
751
752bool LoopInterchangeLegality::containsUnsafeInstructions(BasicBlock *BB) {
753 return any_of(*BB, [](const Instruction &I) {
754 return I.mayHaveSideEffects() || I.mayReadFromMemory();
755 });
756}
757
758bool LoopInterchangeLegality::tightlyNested(Loop *OuterLoop, Loop *InnerLoop) {
759 BasicBlock *OuterLoopHeader = OuterLoop->getHeader();
760 BasicBlock *InnerLoopPreHeader = InnerLoop->getLoopPreheader();
761 BasicBlock *OuterLoopLatch = OuterLoop->getLoopLatch();
762
763 LLVM_DEBUG(dbgs() << "Checking if loops '" << OuterLoop->getName()
764 << "' and '" << InnerLoop->getName()
765 << "' are tightly nested\n");
766
767 // A perfectly nested loop will not have any branch in between the outer and
768 // inner block i.e. outer header will branch to either inner preheader and
769 // outerloop latch.
770 BranchInst *OuterLoopHeaderBI =
771 dyn_cast<BranchInst>(OuterLoopHeader->getTerminator());
772 if (!OuterLoopHeaderBI)
773 return false;
774
775 for (BasicBlock *Succ : successors(OuterLoopHeaderBI))
776 if (Succ != InnerLoopPreHeader && Succ != InnerLoop->getHeader() &&
777 Succ != OuterLoopLatch)
778 return false;
779
780 LLVM_DEBUG(dbgs() << "Checking instructions in Loop header and Loop latch\n");
781 // We do not have any basic block in between now make sure the outer header
782 // and outer loop latch doesn't contain any unsafe instructions.
783 if (containsUnsafeInstructions(OuterLoopHeader) ||
784 containsUnsafeInstructions(OuterLoopLatch))
785 return false;
786
787 // Also make sure the inner loop preheader does not contain any unsafe
788 // instructions. Note that all instructions in the preheader will be moved to
789 // the outer loop header when interchanging.
790 if (InnerLoopPreHeader != OuterLoopHeader &&
791 containsUnsafeInstructions(InnerLoopPreHeader))
792 return false;
793
794 BasicBlock *InnerLoopExit = InnerLoop->getExitBlock();
795 // Ensure the inner loop exit block flows to the outer loop latch possibly
796 // through empty blocks.
797 const BasicBlock &SuccInner =
798 LoopNest::skipEmptyBlockUntil(InnerLoopExit, OuterLoopLatch);
799 if (&SuccInner != OuterLoopLatch) {
800 LLVM_DEBUG(dbgs() << "Inner loop exit block " << *InnerLoopExit
801 << " does not lead to the outer loop latch.\n";);
802 return false;
803 }
804 // The inner loop exit block does flow to the outer loop latch and not some
805 // other BBs, now make sure it contains safe instructions, since it will be
806 // moved into the (new) inner loop after interchange.
807 if (containsUnsafeInstructions(InnerLoopExit))
808 return false;
809
810 LLVM_DEBUG(dbgs() << "Loops are perfectly nested\n");
811 // We have a perfect loop nest.
812 return true;
813}
814
815bool LoopInterchangeLegality::isLoopStructureUnderstood() {
816 BasicBlock *InnerLoopPreheader = InnerLoop->getLoopPreheader();
817 for (PHINode *InnerInduction : InnerLoopInductions) {
818 unsigned Num = InnerInduction->getNumOperands();
819 for (unsigned i = 0; i < Num; ++i) {
820 Value *Val = InnerInduction->getOperand(i);
821 if (isa<Constant>(Val))
822 continue;
824 if (!I)
825 return false;
826 // TODO: Handle triangular loops.
827 // e.g. for(int i=0;i<N;i++)
828 // for(int j=i;j<N;j++)
829 unsigned IncomBlockIndx = PHINode::getIncomingValueNumForOperand(i);
830 if (InnerInduction->getIncomingBlock(IncomBlockIndx) ==
831 InnerLoopPreheader &&
832 !OuterLoop->isLoopInvariant(I)) {
833 return false;
834 }
835 }
836 }
837
838 // TODO: Handle triangular loops of another form.
839 // e.g. for(int i=0;i<N;i++)
840 // for(int j=0;j<i;j++)
841 // or,
842 // for(int i=0;i<N;i++)
843 // for(int j=0;j*i<N;j++)
844 BasicBlock *InnerLoopLatch = InnerLoop->getLoopLatch();
845 BranchInst *InnerLoopLatchBI =
846 dyn_cast<BranchInst>(InnerLoopLatch->getTerminator());
847 if (!InnerLoopLatchBI->isConditional())
848 return false;
849 if (CmpInst *InnerLoopCmp =
850 dyn_cast<CmpInst>(InnerLoopLatchBI->getCondition())) {
851 Value *Op0 = InnerLoopCmp->getOperand(0);
852 Value *Op1 = InnerLoopCmp->getOperand(1);
853
854 // LHS and RHS of the inner loop exit condition, e.g.,
855 // in "for(int j=0;j<i;j++)", LHS is j and RHS is i.
856 Value *Left = nullptr;
857 Value *Right = nullptr;
858
859 // Check if V only involves inner loop induction variable.
860 // Return true if V is InnerInduction, or a cast from
861 // InnerInduction, or a binary operator that involves
862 // InnerInduction and a constant.
863 std::function<bool(Value *)> IsPathToInnerIndVar;
864 IsPathToInnerIndVar = [this, &IsPathToInnerIndVar](const Value *V) -> bool {
865 if (llvm::is_contained(InnerLoopInductions, V))
866 return true;
867 if (isa<Constant>(V))
868 return true;
870 if (!I)
871 return false;
872 if (isa<CastInst>(I))
873 return IsPathToInnerIndVar(I->getOperand(0));
875 return IsPathToInnerIndVar(I->getOperand(0)) &&
876 IsPathToInnerIndVar(I->getOperand(1));
877 return false;
878 };
879
880 // In case of multiple inner loop indvars, it is okay if LHS and RHS
881 // are both inner indvar related variables.
882 if (IsPathToInnerIndVar(Op0) && IsPathToInnerIndVar(Op1))
883 return true;
884
885 // Otherwise we check if the cmp instruction compares an inner indvar
886 // related variable (Left) with a outer loop invariant (Right).
887 if (IsPathToInnerIndVar(Op0) && !isa<Constant>(Op0)) {
888 Left = Op0;
889 Right = Op1;
890 } else if (IsPathToInnerIndVar(Op1) && !isa<Constant>(Op1)) {
891 Left = Op1;
892 Right = Op0;
893 }
894
895 if (Left == nullptr)
896 return false;
897
898 const SCEV *S = SE->getSCEV(Right);
899 if (!SE->isLoopInvariant(S, OuterLoop))
900 return false;
901 }
902
903 return true;
904}
905
906// If SV is a LCSSA PHI node with a single incoming value, return the incoming
907// value.
908static Value *followLCSSA(Value *SV) {
910 if (!PHI)
911 return SV;
912
913 if (PHI->getNumIncomingValues() != 1)
914 return SV;
915 return followLCSSA(PHI->getIncomingValue(0));
916}
917
918// Check V's users to see if it is involved in a reduction in L.
919static PHINode *
921 SmallVectorImpl<Instruction *> &HasNoWrapInsts) {
922 // Reduction variables cannot be constants.
923 if (isa<Constant>(V))
924 return nullptr;
925
926 for (Value *User : V->users()) {
928 if (PHI->getNumIncomingValues() == 1)
929 continue;
932 // Detect floating point reduction only when it can be reordered.
933 if (RD.getExactFPMathInst() != nullptr)
934 return nullptr;
935
937 switch (RK) {
938 case RecurKind::Or:
939 case RecurKind::And:
940 case RecurKind::Xor:
941 case RecurKind::SMin:
942 case RecurKind::SMax:
943 case RecurKind::UMin:
944 case RecurKind::UMax:
945 case RecurKind::FAdd:
946 case RecurKind::FMul:
947 case RecurKind::FMin:
948 case RecurKind::FMax:
954 case RecurKind::AnyOf:
955 return PHI;
956
957 // Change the order of integer addition/multiplication may change the
958 // semantics. Consider the following case:
959 //
960 // int A[2][2] = {{ INT_MAX, INT_MAX }, { INT_MIN, INT_MIN }};
961 // int sum = 0;
962 // for (int i = 0; i < 2; i++)
963 // for (int j = 0; j < 2; j++)
964 // sum += A[j][i];
965 //
966 // If the above loops are exchanged, the addition will cause an
967 // overflow. To prevent this, we must drop the nuw/nsw flags from the
968 // addition/multiplication instructions when we actually exchanges the
969 // loops.
970 case RecurKind::Add:
971 case RecurKind::Mul: {
972 unsigned OpCode = RecurrenceDescriptor::getOpcode(RK);
974
975 // Bail out when we fail to collect reduction instructions chain.
976 if (Ops.empty())
977 return nullptr;
978
979 for (Instruction *I : Ops) {
980 assert(I->getOpcode() == OpCode &&
981 "Expected the instruction to be the reduction operation");
982 (void)OpCode;
983
984 // If the instruction has nuw/nsw flags, we must drop them when the
985 // transformation is actually performed.
986 if (I->hasNoSignedWrap() || I->hasNoUnsignedWrap())
987 HasNoWrapInsts.push_back(I);
988 }
989 return PHI;
990 }
991
992 default:
993 return nullptr;
994 }
995 }
996 return nullptr;
997 }
998 }
999
1000 return nullptr;
1001}
1002
1003bool LoopInterchangeLegality::findInductionAndReductions(
1004 Loop *L, SmallVector<PHINode *, 8> &Inductions, Loop *InnerLoop) {
1005 if (!L->getLoopLatch() || !L->getLoopPredecessor())
1006 return false;
1007 for (PHINode &PHI : L->getHeader()->phis()) {
1008 InductionDescriptor ID;
1010 Inductions.push_back(&PHI);
1011 else {
1012 // PHIs in inner loops need to be part of a reduction in the outer loop,
1013 // discovered when checking the PHIs of the outer loop earlier.
1014 if (!InnerLoop) {
1015 if (!OuterInnerReductions.count(&PHI)) {
1016 LLVM_DEBUG(dbgs() << "Inner loop PHI is not part of reductions "
1017 "across the outer loop.\n");
1018 return false;
1019 }
1020 } else {
1021 assert(PHI.getNumIncomingValues() == 2 &&
1022 "Phis in loop header should have exactly 2 incoming values");
1023 // Check if we have a PHI node in the outer loop that has a reduction
1024 // result from the inner loop as an incoming value.
1025 Value *V = followLCSSA(PHI.getIncomingValueForBlock(L->getLoopLatch()));
1026 PHINode *InnerRedPhi =
1027 findInnerReductionPhi(InnerLoop, V, HasNoWrapReductions);
1028 if (!InnerRedPhi ||
1029 !llvm::is_contained(InnerRedPhi->incoming_values(), &PHI)) {
1030 LLVM_DEBUG(
1031 dbgs()
1032 << "Failed to recognize PHI as an induction or reduction.\n");
1033 return false;
1034 }
1035 OuterInnerReductions.insert(&PHI);
1036 OuterInnerReductions.insert(InnerRedPhi);
1037 }
1038 }
1039 }
1040 return true;
1041}
1042
1043// This function indicates the current limitations in the transform as a result
1044// of which we do not proceed.
1045bool LoopInterchangeLegality::currentLimitations() {
1046 BasicBlock *InnerLoopLatch = InnerLoop->getLoopLatch();
1047
1048 // transform currently expects the loop latches to also be the exiting
1049 // blocks.
1050 if (InnerLoop->getExitingBlock() != InnerLoopLatch ||
1051 OuterLoop->getExitingBlock() != OuterLoop->getLoopLatch() ||
1052 !isa<BranchInst>(InnerLoopLatch->getTerminator()) ||
1053 !isa<BranchInst>(OuterLoop->getLoopLatch()->getTerminator())) {
1054 LLVM_DEBUG(
1055 dbgs() << "Loops where the latch is not the exiting block are not"
1056 << " supported currently.\n");
1057 ORE->emit([&]() {
1058 return OptimizationRemarkMissed(DEBUG_TYPE, "ExitingNotLatch",
1059 OuterLoop->getStartLoc(),
1060 OuterLoop->getHeader())
1061 << "Loops where the latch is not the exiting block cannot be"
1062 " interchange currently.";
1063 });
1064 return true;
1065 }
1066
1067 SmallVector<PHINode *, 8> Inductions;
1068 if (!findInductionAndReductions(OuterLoop, Inductions, InnerLoop)) {
1069 LLVM_DEBUG(
1070 dbgs() << "Only outer loops with induction or reduction PHI nodes "
1071 << "are supported currently.\n");
1072 ORE->emit([&]() {
1073 return OptimizationRemarkMissed(DEBUG_TYPE, "UnsupportedPHIOuter",
1074 OuterLoop->getStartLoc(),
1075 OuterLoop->getHeader())
1076 << "Only outer loops with induction or reduction PHI nodes can be"
1077 " interchanged currently.";
1078 });
1079 return true;
1080 }
1081
1082 Inductions.clear();
1083 // For multi-level loop nests, make sure that all phi nodes for inner loops
1084 // at all levels can be recognized as a induction or reduction phi. Bail out
1085 // if a phi node at a certain nesting level cannot be properly recognized.
1086 Loop *CurLevelLoop = OuterLoop;
1087 while (!CurLevelLoop->getSubLoops().empty()) {
1088 // We already made sure that the loop nest is tightly nested.
1089 CurLevelLoop = CurLevelLoop->getSubLoops().front();
1090 if (!findInductionAndReductions(CurLevelLoop, Inductions, nullptr)) {
1091 LLVM_DEBUG(
1092 dbgs() << "Only inner loops with induction or reduction PHI nodes "
1093 << "are supported currently.\n");
1094 ORE->emit([&]() {
1095 return OptimizationRemarkMissed(DEBUG_TYPE, "UnsupportedPHIInner",
1096 CurLevelLoop->getStartLoc(),
1097 CurLevelLoop->getHeader())
1098 << "Only inner loops with induction or reduction PHI nodes can be"
1099 " interchange currently.";
1100 });
1101 return true;
1102 }
1103 }
1104
1105 // TODO: Triangular loops are not handled for now.
1106 if (!isLoopStructureUnderstood()) {
1107 LLVM_DEBUG(dbgs() << "Loop structure not understood by pass\n");
1108 ORE->emit([&]() {
1109 return OptimizationRemarkMissed(DEBUG_TYPE, "UnsupportedStructureInner",
1110 InnerLoop->getStartLoc(),
1111 InnerLoop->getHeader())
1112 << "Inner loop structure not understood currently.";
1113 });
1114 return true;
1115 }
1116
1117 return false;
1118}
1119
1120bool LoopInterchangeLegality::findInductions(
1121 Loop *L, SmallVectorImpl<PHINode *> &Inductions) {
1122 for (PHINode &PHI : L->getHeader()->phis()) {
1123 InductionDescriptor ID;
1125 Inductions.push_back(&PHI);
1126 }
1127 return !Inductions.empty();
1128}
1129
1130// We currently only support LCSSA PHI nodes in the inner loop exit, if their
1131// users are either reduction PHIs or PHIs outside the outer loop (which means
1132// the we are only interested in the final value after the loop).
1133static bool
1136 BasicBlock *InnerExit = OuterL->getUniqueExitBlock();
1137 for (PHINode &PHI : InnerExit->phis()) {
1138 // Reduction lcssa phi will have only 1 incoming block that from loop latch.
1139 if (PHI.getNumIncomingValues() > 1)
1140 return false;
1141 if (any_of(PHI.users(), [&Reductions, OuterL](User *U) {
1142 PHINode *PN = dyn_cast<PHINode>(U);
1143 return !PN ||
1144 (!Reductions.count(PN) && OuterL->contains(PN->getParent()));
1145 })) {
1146 return false;
1147 }
1148 }
1149 return true;
1150}
1151
1152// We currently support LCSSA PHI nodes in the outer loop exit, if their
1153// incoming values do not come from the outer loop latch or if the
1154// outer loop latch has a single predecessor. In that case, the value will
1155// be available if both the inner and outer loop conditions are true, which
1156// will still be true after interchanging. If we have multiple predecessor,
1157// that may not be the case, e.g. because the outer loop latch may be executed
1158// if the inner loop is not executed.
1159static bool areOuterLoopExitPHIsSupported(Loop *OuterLoop, Loop *InnerLoop) {
1160 BasicBlock *LoopNestExit = OuterLoop->getUniqueExitBlock();
1161 for (PHINode &PHI : LoopNestExit->phis()) {
1162 for (Value *Incoming : PHI.incoming_values()) {
1164 if (!IncomingI || IncomingI->getParent() != OuterLoop->getLoopLatch())
1165 continue;
1166
1167 // The incoming value is defined in the outer loop latch. Currently we
1168 // only support that in case the outer loop latch has a single predecessor.
1169 // This guarantees that the outer loop latch is executed if and only if
1170 // the inner loop is executed (because tightlyNested() guarantees that the
1171 // outer loop header only branches to the inner loop or the outer loop
1172 // latch).
1173 // FIXME: We could weaken this logic and allow multiple predecessors,
1174 // if the values are produced outside the loop latch. We would need
1175 // additional logic to update the PHI nodes in the exit block as
1176 // well.
1177 if (OuterLoop->getLoopLatch()->getUniquePredecessor() == nullptr)
1178 return false;
1179 }
1180 }
1181 return true;
1182}
1183
1184// In case of multi-level nested loops, it may occur that lcssa phis exist in
1185// the latch of InnerLoop, i.e., when defs of the incoming values are further
1186// inside the loopnest. Sometimes those incoming values are not available
1187// after interchange, since the original inner latch will become the new outer
1188// latch which may have predecessor paths that do not include those incoming
1189// values.
1190// TODO: Handle transformation of lcssa phis in the InnerLoop latch in case of
1191// multi-level loop nests.
1192static bool areInnerLoopLatchPHIsSupported(Loop *OuterLoop, Loop *InnerLoop) {
1193 if (InnerLoop->getSubLoops().empty())
1194 return true;
1195 // If the original outer latch has only one predecessor, then values defined
1196 // further inside the looploop, e.g., in the innermost loop, will be available
1197 // at the new outer latch after interchange.
1198 if (OuterLoop->getLoopLatch()->getUniquePredecessor() != nullptr)
1199 return true;
1200
1201 // The outer latch has more than one predecessors, i.e., the inner
1202 // exit and the inner header.
1203 // PHI nodes in the inner latch are lcssa phis where the incoming values
1204 // are defined further inside the loopnest. Check if those phis are used
1205 // in the original inner latch. If that is the case then bail out since
1206 // those incoming values may not be available at the new outer latch.
1207 BasicBlock *InnerLoopLatch = InnerLoop->getLoopLatch();
1208 for (PHINode &PHI : InnerLoopLatch->phis()) {
1209 for (auto *U : PHI.users()) {
1211 if (InnerLoopLatch == UI->getParent())
1212 return false;
1213 }
1214 }
1215 return true;
1216}
1217
1218bool LoopInterchangeLegality::canInterchangeLoops(unsigned InnerLoopId,
1219 unsigned OuterLoopId,
1220 CharMatrix &DepMatrix) {
1221 if (!isLegalToInterChangeLoops(DepMatrix, InnerLoopId, OuterLoopId)) {
1222 LLVM_DEBUG(dbgs() << "Failed interchange InnerLoopId = " << InnerLoopId
1223 << " and OuterLoopId = " << OuterLoopId
1224 << " due to dependence\n");
1225 ORE->emit([&]() {
1226 return OptimizationRemarkMissed(DEBUG_TYPE, "Dependence",
1227 InnerLoop->getStartLoc(),
1228 InnerLoop->getHeader())
1229 << "Cannot interchange loops due to dependences.";
1230 });
1231 return false;
1232 }
1233 // Check if outer and inner loop contain legal instructions only.
1234 for (auto *BB : OuterLoop->blocks())
1235 for (Instruction &I : BB->instructionsWithoutDebug())
1236 if (CallInst *CI = dyn_cast<CallInst>(&I)) {
1237 // readnone functions do not prevent interchanging.
1238 if (CI->onlyWritesMemory())
1239 continue;
1240 LLVM_DEBUG(
1241 dbgs() << "Loops with call instructions cannot be interchanged "
1242 << "safely.");
1243 ORE->emit([&]() {
1244 return OptimizationRemarkMissed(DEBUG_TYPE, "CallInst",
1245 CI->getDebugLoc(),
1246 CI->getParent())
1247 << "Cannot interchange loops due to call instruction.";
1248 });
1249
1250 return false;
1251 }
1252
1253 if (!findInductions(InnerLoop, InnerLoopInductions)) {
1254 LLVM_DEBUG(dbgs() << "Could not find inner loop induction variables.\n");
1255 return false;
1256 }
1257
1258 if (!areInnerLoopLatchPHIsSupported(OuterLoop, InnerLoop)) {
1259 LLVM_DEBUG(dbgs() << "Found unsupported PHI nodes in inner loop latch.\n");
1260 ORE->emit([&]() {
1261 return OptimizationRemarkMissed(DEBUG_TYPE, "UnsupportedInnerLatchPHI",
1262 InnerLoop->getStartLoc(),
1263 InnerLoop->getHeader())
1264 << "Cannot interchange loops because unsupported PHI nodes found "
1265 "in inner loop latch.";
1266 });
1267 return false;
1268 }
1269
1270 // TODO: The loops could not be interchanged due to current limitations in the
1271 // transform module.
1272 if (currentLimitations()) {
1273 LLVM_DEBUG(dbgs() << "Not legal because of current transform limitation\n");
1274 return false;
1275 }
1276
1277 // Check if the loops are tightly nested.
1278 if (!tightlyNested(OuterLoop, InnerLoop)) {
1279 LLVM_DEBUG(dbgs() << "Loops not tightly nested\n");
1280 ORE->emit([&]() {
1281 return OptimizationRemarkMissed(DEBUG_TYPE, "NotTightlyNested",
1282 InnerLoop->getStartLoc(),
1283 InnerLoop->getHeader())
1284 << "Cannot interchange loops because they are not tightly "
1285 "nested.";
1286 });
1287 return false;
1288 }
1289
1290 if (!areInnerLoopExitPHIsSupported(OuterLoop, InnerLoop,
1291 OuterInnerReductions)) {
1292 LLVM_DEBUG(dbgs() << "Found unsupported PHI nodes in inner loop exit.\n");
1293 ORE->emit([&]() {
1294 return OptimizationRemarkMissed(DEBUG_TYPE, "UnsupportedExitPHI",
1295 InnerLoop->getStartLoc(),
1296 InnerLoop->getHeader())
1297 << "Found unsupported PHI node in loop exit.";
1298 });
1299 return false;
1300 }
1301
1302 if (!areOuterLoopExitPHIsSupported(OuterLoop, InnerLoop)) {
1303 LLVM_DEBUG(dbgs() << "Found unsupported PHI nodes in outer loop exit.\n");
1304 ORE->emit([&]() {
1305 return OptimizationRemarkMissed(DEBUG_TYPE, "UnsupportedExitPHI",
1306 OuterLoop->getStartLoc(),
1307 OuterLoop->getHeader())
1308 << "Found unsupported PHI node in loop exit.";
1309 });
1310 return false;
1311 }
1312
1313 return true;
1314}
1315
1316void CacheCostManager::computeIfUnitinialized() {
1317 if (CC.has_value())
1318 return;
1319
1320 LLVM_DEBUG(dbgs() << "Compute CacheCost.\n");
1321 CC = CacheCost::getCacheCost(*OutermostLoop, *AR, *DI);
1322 // Obtain the loop vector returned from loop cache analysis beforehand,
1323 // and put each <Loop, index> pair into a map for constant time query
1324 // later. Indices in loop vector reprsent the optimal order of the
1325 // corresponding loop, e.g., given a loopnest with depth N, index 0
1326 // indicates the loop should be placed as the outermost loop and index N
1327 // indicates the loop should be placed as the innermost loop.
1328 //
1329 // For the old pass manager CacheCost would be null.
1330 if (*CC != nullptr)
1331 for (const auto &[Idx, Cost] : enumerate((*CC)->getLoopCosts()))
1332 CostMap[Cost.first] = Idx;
1333}
1334
1335CacheCost *CacheCostManager::getCacheCost() {
1336 computeIfUnitinialized();
1337 return CC->get();
1338}
1339
1340const DenseMap<const Loop *, unsigned> &CacheCostManager::getCostMap() {
1341 computeIfUnitinialized();
1342 return CostMap;
1343}
1344
1345int LoopInterchangeProfitability::getInstrOrderCost() {
1346 unsigned GoodOrder, BadOrder;
1347 BadOrder = GoodOrder = 0;
1348 for (BasicBlock *BB : InnerLoop->blocks()) {
1349 for (Instruction &Ins : *BB) {
1350 if (const GetElementPtrInst *GEP = dyn_cast<GetElementPtrInst>(&Ins)) {
1351 bool FoundInnerInduction = false;
1352 bool FoundOuterInduction = false;
1353 for (Value *Op : GEP->operands()) {
1354 // Skip operands that are not SCEV-able.
1355 if (!SE->isSCEVable(Op->getType()))
1356 continue;
1357
1358 const SCEV *OperandVal = SE->getSCEV(Op);
1359 const SCEVAddRecExpr *AR = dyn_cast<SCEVAddRecExpr>(OperandVal);
1360 if (!AR)
1361 continue;
1362
1363 // If we find the inner induction after an outer induction e.g.
1364 // for(int i=0;i<N;i++)
1365 // for(int j=0;j<N;j++)
1366 // A[i][j] = A[i-1][j-1]+k;
1367 // then it is a good order.
1368 if (AR->getLoop() == InnerLoop) {
1369 // We found an InnerLoop induction after OuterLoop induction. It is
1370 // a good order.
1371 FoundInnerInduction = true;
1372 if (FoundOuterInduction) {
1373 GoodOrder++;
1374 break;
1375 }
1376 }
1377 // If we find the outer induction after an inner induction e.g.
1378 // for(int i=0;i<N;i++)
1379 // for(int j=0;j<N;j++)
1380 // A[j][i] = A[j-1][i-1]+k;
1381 // then it is a bad order.
1382 if (AR->getLoop() == OuterLoop) {
1383 // We found an OuterLoop induction after InnerLoop induction. It is
1384 // a bad order.
1385 FoundOuterInduction = true;
1386 if (FoundInnerInduction) {
1387 BadOrder++;
1388 break;
1389 }
1390 }
1391 }
1392 }
1393 }
1394 }
1395 return GoodOrder - BadOrder;
1396}
1397
1398std::optional<bool>
1399LoopInterchangeProfitability::isProfitablePerLoopCacheAnalysis(
1400 const DenseMap<const Loop *, unsigned> &CostMap, CacheCost *CC) {
1401 // This is the new cost model returned from loop cache analysis.
1402 // A smaller index means the loop should be placed an outer loop, and vice
1403 // versa.
1404 auto InnerLoopIt = CostMap.find(InnerLoop);
1405 if (InnerLoopIt == CostMap.end())
1406 return std::nullopt;
1407 auto OuterLoopIt = CostMap.find(OuterLoop);
1408 if (OuterLoopIt == CostMap.end())
1409 return std::nullopt;
1410
1411 if (CC->getLoopCost(*OuterLoop) == CC->getLoopCost(*InnerLoop))
1412 return std::nullopt;
1413 unsigned InnerIndex = InnerLoopIt->second;
1414 unsigned OuterIndex = OuterLoopIt->second;
1415 LLVM_DEBUG(dbgs() << "InnerIndex = " << InnerIndex
1416 << ", OuterIndex = " << OuterIndex << "\n");
1417 assert(InnerIndex != OuterIndex && "CostMap should assign unique "
1418 "numbers to each loop");
1419 return std::optional<bool>(InnerIndex < OuterIndex);
1420}
1421
1422std::optional<bool>
1423LoopInterchangeProfitability::isProfitablePerInstrOrderCost() {
1424 // Legacy cost model: this is rough cost estimation algorithm. It counts the
1425 // good and bad order of induction variables in the instruction and allows
1426 // reordering if number of bad orders is more than good.
1427 int Cost = getInstrOrderCost();
1428 LLVM_DEBUG(dbgs() << "Cost = " << Cost << "\n");
1430 return std::optional<bool>(true);
1431
1432 return std::nullopt;
1433}
1434
1435/// Return true if we can vectorize the loop specified by \p LoopId.
1436static bool canVectorize(const CharMatrix &DepMatrix, unsigned LoopId) {
1437 for (const auto &Dep : DepMatrix) {
1438 char Dir = Dep[LoopId];
1439 char DepType = Dep.back();
1440 assert((DepType == '<' || DepType == '*') &&
1441 "Unexpected element in dependency vector");
1442
1443 // There are no loop-carried dependencies.
1444 if (Dir == '=' || Dir == 'I')
1445 continue;
1446
1447 // DepType being '<' means that this direction vector represents a forward
1448 // dependency. In principle, a loop with '<' direction can be vectorized in
1449 // this case.
1450 if (Dir == '<' && DepType == '<')
1451 continue;
1452
1453 // We cannot prove that the loop is vectorizable.
1454 return false;
1455 }
1456 return true;
1457}
1458
1459std::optional<bool> LoopInterchangeProfitability::isProfitableForVectorization(
1460 unsigned InnerLoopId, unsigned OuterLoopId, CharMatrix &DepMatrix) {
1461 // If the outer loop cannot be vectorized, it is not profitable to move this
1462 // to inner position.
1463 if (!canVectorize(DepMatrix, OuterLoopId))
1464 return false;
1465
1466 // If the inner loop cannot be vectorized but the outer loop can be, then it
1467 // is profitable to interchange to enable inner loop parallelism.
1468 if (!canVectorize(DepMatrix, InnerLoopId))
1469 return true;
1470
1471 // If both the inner and the outer loop can be vectorized, it is necessary to
1472 // check the cost of each vectorized loop for profitability decision. At this
1473 // time we do not have a cost model to estimate them, so return nullopt.
1474 // TODO: Estimate the cost of vectorized loop when both the outer and the
1475 // inner loop can be vectorized.
1476 return std::nullopt;
1477}
1478
1479bool LoopInterchangeProfitability::isProfitable(
1480 const Loop *InnerLoop, const Loop *OuterLoop, unsigned InnerLoopId,
1481 unsigned OuterLoopId, CharMatrix &DepMatrix, CacheCostManager &CCM) {
1482 // Do not consider loops with a backedge that isn't taken, e.g. an
1483 // unconditional branch true/false, as candidates for interchange.
1484 // TODO: when interchange is forced, we should probably also allow
1485 // interchange for these loops, and thus this logic should be moved just
1486 // below the cost-model ignore check below. But this check is done first
1487 // to avoid the issue in #163954.
1488 const SCEV *InnerBTC = SE->getBackedgeTakenCount(InnerLoop);
1489 const SCEV *OuterBTC = SE->getBackedgeTakenCount(OuterLoop);
1490 if (InnerBTC && InnerBTC->isZero()) {
1491 LLVM_DEBUG(dbgs() << "Inner loop back-edge isn't taken, rejecting "
1492 "single iteration loop\n");
1493 return false;
1494 }
1495 if (OuterBTC && OuterBTC->isZero()) {
1496 LLVM_DEBUG(dbgs() << "Outer loop back-edge isn't taken, rejecting "
1497 "single iteration loop\n");
1498 return false;
1499 }
1500
1501 // Return true if interchange is forced and the cost-model ignored.
1502 if (Profitabilities.size() == 1 && Profitabilities[0] == RuleTy::Ignore)
1503 return true;
1505 "Duplicate rules and option 'ignore' are not allowed");
1506
1507 // isProfitable() is structured to avoid endless loop interchange. If the
1508 // highest priority rule (isProfitablePerLoopCacheAnalysis by default) could
1509 // decide the profitability then, profitability check will stop and return the
1510 // analysis result. If it failed to determine it (e.g., cache analysis failed
1511 // to analyze the loopnest due to delinearization issues) then go ahead the
1512 // second highest priority rule (isProfitablePerInstrOrderCost by default).
1513 // Likewise, if it failed to analysis the profitability then only, the last
1514 // rule (isProfitableForVectorization by default) will decide.
1515 std::optional<bool> shouldInterchange;
1516 for (RuleTy RT : Profitabilities) {
1517 switch (RT) {
1518 case RuleTy::PerLoopCacheAnalysis: {
1519 CacheCost *CC = CCM.getCacheCost();
1520 const DenseMap<const Loop *, unsigned> &CostMap = CCM.getCostMap();
1521 shouldInterchange = isProfitablePerLoopCacheAnalysis(CostMap, CC);
1522 break;
1523 }
1524 case RuleTy::PerInstrOrderCost:
1525 shouldInterchange = isProfitablePerInstrOrderCost();
1526 break;
1527 case RuleTy::ForVectorization:
1528 shouldInterchange =
1529 isProfitableForVectorization(InnerLoopId, OuterLoopId, DepMatrix);
1530 break;
1531 case RuleTy::Ignore:
1532 llvm_unreachable("Option 'ignore' is not supported with other options");
1533 break;
1534 }
1535
1536 // If this rule could determine the profitability, don't call subsequent
1537 // rules.
1538 if (shouldInterchange.has_value())
1539 break;
1540 }
1541
1542 if (!shouldInterchange.has_value()) {
1543 ORE->emit([&]() {
1544 return OptimizationRemarkMissed(DEBUG_TYPE, "InterchangeNotProfitable",
1545 InnerLoop->getStartLoc(),
1546 InnerLoop->getHeader())
1547 << "Insufficient information to calculate the cost of loop for "
1548 "interchange.";
1549 });
1550 return false;
1551 } else if (!shouldInterchange.value()) {
1552 ORE->emit([&]() {
1553 return OptimizationRemarkMissed(DEBUG_TYPE, "InterchangeNotProfitable",
1554 InnerLoop->getStartLoc(),
1555 InnerLoop->getHeader())
1556 << "Interchanging loops is not considered to improve cache "
1557 "locality nor vectorization.";
1558 });
1559 return false;
1560 }
1561 return true;
1562}
1563
1564void LoopInterchangeTransform::removeChildLoop(Loop *OuterLoop,
1565 Loop *InnerLoop) {
1566 for (Loop *L : *OuterLoop)
1567 if (L == InnerLoop) {
1568 OuterLoop->removeChildLoop(L);
1569 return;
1570 }
1571 llvm_unreachable("Couldn't find loop");
1572}
1573
1574/// Update LoopInfo, after interchanging. NewInner and NewOuter refer to the
1575/// new inner and outer loop after interchanging: NewInner is the original
1576/// outer loop and NewOuter is the original inner loop.
1577///
1578/// Before interchanging, we have the following structure
1579/// Outer preheader
1580// Outer header
1581// Inner preheader
1582// Inner header
1583// Inner body
1584// Inner latch
1585// outer bbs
1586// Outer latch
1587//
1588// After interchanging:
1589// Inner preheader
1590// Inner header
1591// Outer preheader
1592// Outer header
1593// Inner body
1594// outer bbs
1595// Outer latch
1596// Inner latch
1597void LoopInterchangeTransform::restructureLoops(
1598 Loop *NewInner, Loop *NewOuter, BasicBlock *OrigInnerPreHeader,
1599 BasicBlock *OrigOuterPreHeader) {
1600 Loop *OuterLoopParent = OuterLoop->getParentLoop();
1601 // The original inner loop preheader moves from the new inner loop to
1602 // the parent loop, if there is one.
1603 NewInner->removeBlockFromLoop(OrigInnerPreHeader);
1604 LI->changeLoopFor(OrigInnerPreHeader, OuterLoopParent);
1605
1606 // Switch the loop levels.
1607 if (OuterLoopParent) {
1608 // Remove the loop from its parent loop.
1609 removeChildLoop(OuterLoopParent, NewInner);
1610 removeChildLoop(NewInner, NewOuter);
1611 OuterLoopParent->addChildLoop(NewOuter);
1612 } else {
1613 removeChildLoop(NewInner, NewOuter);
1614 LI->changeTopLevelLoop(NewInner, NewOuter);
1615 }
1616 while (!NewOuter->isInnermost())
1617 NewInner->addChildLoop(NewOuter->removeChildLoop(NewOuter->begin()));
1618 NewOuter->addChildLoop(NewInner);
1619
1620 // BBs from the original inner loop.
1621 SmallVector<BasicBlock *, 8> OrigInnerBBs(NewOuter->blocks());
1622
1623 // Add BBs from the original outer loop to the original inner loop (excluding
1624 // BBs already in inner loop)
1625 for (BasicBlock *BB : NewInner->blocks())
1626 if (LI->getLoopFor(BB) == NewInner)
1627 NewOuter->addBlockEntry(BB);
1628
1629 // Now remove inner loop header and latch from the new inner loop and move
1630 // other BBs (the loop body) to the new inner loop.
1631 BasicBlock *OuterHeader = NewOuter->getHeader();
1632 BasicBlock *OuterLatch = NewOuter->getLoopLatch();
1633 for (BasicBlock *BB : OrigInnerBBs) {
1634 // Nothing will change for BBs in child loops.
1635 if (LI->getLoopFor(BB) != NewOuter)
1636 continue;
1637 // Remove the new outer loop header and latch from the new inner loop.
1638 if (BB == OuterHeader || BB == OuterLatch)
1639 NewInner->removeBlockFromLoop(BB);
1640 else
1641 LI->changeLoopFor(BB, NewInner);
1642 }
1643
1644 // The preheader of the original outer loop becomes part of the new
1645 // outer loop.
1646 NewOuter->addBlockEntry(OrigOuterPreHeader);
1647 LI->changeLoopFor(OrigOuterPreHeader, NewOuter);
1648
1649 // Tell SE that we move the loops around.
1650 SE->forgetLoop(NewOuter);
1651}
1652
1653bool LoopInterchangeTransform::transform(
1654 ArrayRef<Instruction *> DropNoWrapInsts) {
1655 bool Transformed = false;
1656
1657 if (InnerLoop->getSubLoops().empty()) {
1658 BasicBlock *InnerLoopPreHeader = InnerLoop->getLoopPreheader();
1659 LLVM_DEBUG(dbgs() << "Splitting the inner loop latch\n");
1660 auto &InductionPHIs = LIL.getInnerLoopInductions();
1661 if (InductionPHIs.empty()) {
1662 LLVM_DEBUG(dbgs() << "Failed to find the point to split loop latch \n");
1663 return false;
1664 }
1665
1666 SmallVector<Instruction *, 8> InnerIndexVarList;
1667 for (PHINode *CurInductionPHI : InductionPHIs) {
1668 if (CurInductionPHI->getIncomingBlock(0) == InnerLoopPreHeader)
1669 InnerIndexVarList.push_back(
1670 dyn_cast<Instruction>(CurInductionPHI->getIncomingValue(1)));
1671 else
1672 InnerIndexVarList.push_back(
1673 dyn_cast<Instruction>(CurInductionPHI->getIncomingValue(0)));
1674 }
1675
1676 // Create a new latch block for the inner loop. We split at the
1677 // current latch's terminator and then move the condition and all
1678 // operands that are not either loop-invariant or the induction PHI into the
1679 // new latch block.
1680 BasicBlock *NewLatch =
1681 SplitBlock(InnerLoop->getLoopLatch(),
1682 InnerLoop->getLoopLatch()->getTerminator(), DT, LI);
1683
1684 SmallSetVector<Instruction *, 4> WorkList;
1685 unsigned i = 0;
1686 auto MoveInstructions = [&i, &WorkList, this, &InductionPHIs, NewLatch]() {
1687 for (; i < WorkList.size(); i++) {
1688 // Duplicate instruction and move it the new latch. Update uses that
1689 // have been moved.
1690 Instruction *NewI = WorkList[i]->clone();
1691 NewI->insertBefore(NewLatch->getFirstNonPHIIt());
1692 assert(!NewI->mayHaveSideEffects() &&
1693 "Moving instructions with side-effects may change behavior of "
1694 "the loop nest!");
1695 for (Use &U : llvm::make_early_inc_range(WorkList[i]->uses())) {
1696 Instruction *UserI = cast<Instruction>(U.getUser());
1697 if (!InnerLoop->contains(UserI->getParent()) ||
1698 UserI->getParent() == NewLatch ||
1699 llvm::is_contained(InductionPHIs, UserI))
1700 U.set(NewI);
1701 }
1702 // Add operands of moved instruction to the worklist, except if they are
1703 // outside the inner loop or are the induction PHI.
1704 for (Value *Op : WorkList[i]->operands()) {
1706 if (!OpI ||
1707 this->LI->getLoopFor(OpI->getParent()) != this->InnerLoop ||
1708 llvm::is_contained(InductionPHIs, OpI))
1709 continue;
1710 WorkList.insert(OpI);
1711 }
1712 }
1713 };
1714
1715 // FIXME: Should we interchange when we have a constant condition?
1718 ->getCondition());
1719 if (CondI)
1720 WorkList.insert(CondI);
1721 MoveInstructions();
1722 for (Instruction *InnerIndexVar : InnerIndexVarList)
1723 WorkList.insert(cast<Instruction>(InnerIndexVar));
1724 MoveInstructions();
1725 }
1726
1727 // Ensure the inner loop phi nodes have a separate basic block.
1728 BasicBlock *InnerLoopHeader = InnerLoop->getHeader();
1729 if (&*InnerLoopHeader->getFirstNonPHIIt() !=
1730 InnerLoopHeader->getTerminator()) {
1731 SplitBlock(InnerLoopHeader, InnerLoopHeader->getFirstNonPHIIt(), DT, LI);
1732 LLVM_DEBUG(dbgs() << "splitting InnerLoopHeader done\n");
1733 }
1734
1735 // Instructions in the original inner loop preheader may depend on values
1736 // defined in the outer loop header. Move them there, because the original
1737 // inner loop preheader will become the entry into the interchanged loop nest.
1738 // Currently we move all instructions and rely on LICM to move invariant
1739 // instructions outside the loop nest.
1740 BasicBlock *InnerLoopPreHeader = InnerLoop->getLoopPreheader();
1741 BasicBlock *OuterLoopHeader = OuterLoop->getHeader();
1742 if (InnerLoopPreHeader != OuterLoopHeader) {
1743 for (Instruction &I :
1744 make_early_inc_range(make_range(InnerLoopPreHeader->begin(),
1745 std::prev(InnerLoopPreHeader->end()))))
1746 I.moveBeforePreserving(OuterLoopHeader->getTerminator()->getIterator());
1747 }
1748
1749 Transformed |= adjustLoopLinks();
1750 if (!Transformed) {
1751 LLVM_DEBUG(dbgs() << "adjustLoopLinks failed\n");
1752 return false;
1753 }
1754
1755 // Finally, drop the nsw/nuw flags from the instructions for reduction
1756 // calculations.
1757 for (Instruction *Reduction : DropNoWrapInsts) {
1758 Reduction->setHasNoSignedWrap(false);
1759 Reduction->setHasNoUnsignedWrap(false);
1760 }
1761
1762 return true;
1763}
1764
1765/// \brief Move all instructions except the terminator from FromBB right before
1766/// InsertBefore
1767static void moveBBContents(BasicBlock *FromBB, Instruction *InsertBefore) {
1768 BasicBlock *ToBB = InsertBefore->getParent();
1769
1770 ToBB->splice(InsertBefore->getIterator(), FromBB, FromBB->begin(),
1771 FromBB->getTerminator()->getIterator());
1772}
1773
1774/// Swap instructions between \p BB1 and \p BB2 but keep terminators intact.
1775static void swapBBContents(BasicBlock *BB1, BasicBlock *BB2) {
1776 // Save all non-terminator instructions of BB1 into TempInstrs and unlink them
1777 // from BB1 afterwards.
1778 auto Iter = map_range(*BB1, [](Instruction &I) { return &I; });
1779 SmallVector<Instruction *, 4> TempInstrs(Iter.begin(), std::prev(Iter.end()));
1780 for (Instruction *I : TempInstrs)
1781 I->removeFromParent();
1782
1783 // Move instructions from BB2 to BB1.
1784 moveBBContents(BB2, BB1->getTerminator());
1785
1786 // Move instructions from TempInstrs to BB2.
1787 for (Instruction *I : TempInstrs)
1788 I->insertBefore(BB2->getTerminator()->getIterator());
1789}
1790
1791// Update BI to jump to NewBB instead of OldBB. Records updates to the
1792// dominator tree in DTUpdates. If \p MustUpdateOnce is true, assert that
1793// \p OldBB is exactly once in BI's successor list.
1794static void updateSuccessor(BranchInst *BI, BasicBlock *OldBB,
1795 BasicBlock *NewBB,
1796 std::vector<DominatorTree::UpdateType> &DTUpdates,
1797 bool MustUpdateOnce = true) {
1798 assert((!MustUpdateOnce || llvm::count(successors(BI), OldBB) == 1) &&
1799 "BI must jump to OldBB exactly once.");
1800 bool Changed = false;
1801 for (Use &Op : BI->operands())
1802 if (Op == OldBB) {
1803 Op.set(NewBB);
1804 Changed = true;
1805 }
1806
1807 if (Changed) {
1808 DTUpdates.push_back(
1809 {DominatorTree::UpdateKind::Insert, BI->getParent(), NewBB});
1810 DTUpdates.push_back(
1811 {DominatorTree::UpdateKind::Delete, BI->getParent(), OldBB});
1812 }
1813 assert(Changed && "Expected a successor to be updated");
1814}
1815
1816// Move Lcssa PHIs to the right place.
1817static void moveLCSSAPhis(BasicBlock *InnerExit, BasicBlock *InnerHeader,
1818 BasicBlock *InnerLatch, BasicBlock *OuterHeader,
1819 BasicBlock *OuterLatch, BasicBlock *OuterExit,
1820 Loop *InnerLoop, LoopInfo *LI) {
1821
1822 // Deal with LCSSA PHI nodes in the exit block of the inner loop, that are
1823 // defined either in the header or latch. Those blocks will become header and
1824 // latch of the new outer loop, and the only possible users can PHI nodes
1825 // in the exit block of the loop nest or the outer loop header (reduction
1826 // PHIs, in that case, the incoming value must be defined in the inner loop
1827 // header). We can just substitute the user with the incoming value and remove
1828 // the PHI.
1829 for (PHINode &P : make_early_inc_range(InnerExit->phis())) {
1830 assert(P.getNumIncomingValues() == 1 &&
1831 "Only loops with a single exit are supported!");
1832
1833 // Incoming values are guaranteed be instructions currently.
1834 auto IncI = cast<Instruction>(P.getIncomingValueForBlock(InnerLatch));
1835 // In case of multi-level nested loops, follow LCSSA to find the incoming
1836 // value defined from the innermost loop.
1837 auto IncIInnerMost = cast<Instruction>(followLCSSA(IncI));
1838 // Skip phis with incoming values from the inner loop body, excluding the
1839 // header and latch.
1840 if (IncIInnerMost->getParent() != InnerLatch &&
1841 IncIInnerMost->getParent() != InnerHeader)
1842 continue;
1843
1844 assert(all_of(P.users(),
1845 [OuterHeader, OuterExit, IncI, InnerHeader](User *U) {
1846 return (cast<PHINode>(U)->getParent() == OuterHeader &&
1847 IncI->getParent() == InnerHeader) ||
1848 cast<PHINode>(U)->getParent() == OuterExit;
1849 }) &&
1850 "Can only replace phis iff the uses are in the loop nest exit or "
1851 "the incoming value is defined in the inner header (it will "
1852 "dominate all loop blocks after interchanging)");
1853 P.replaceAllUsesWith(IncI);
1854 P.eraseFromParent();
1855 }
1856
1857 SmallVector<PHINode *, 8> LcssaInnerExit(
1858 llvm::make_pointer_range(InnerExit->phis()));
1859
1860 SmallVector<PHINode *, 8> LcssaInnerLatch(
1861 llvm::make_pointer_range(InnerLatch->phis()));
1862
1863 // Lcssa PHIs for values used outside the inner loop are in InnerExit.
1864 // If a PHI node has users outside of InnerExit, it has a use outside the
1865 // interchanged loop and we have to preserve it. We move these to
1866 // InnerLatch, which will become the new exit block for the innermost
1867 // loop after interchanging.
1868 for (PHINode *P : LcssaInnerExit)
1869 P->moveBefore(InnerLatch->getFirstNonPHIIt());
1870
1871 // If the inner loop latch contains LCSSA PHIs, those come from a child loop
1872 // and we have to move them to the new inner latch.
1873 for (PHINode *P : LcssaInnerLatch)
1874 P->moveBefore(InnerExit->getFirstNonPHIIt());
1875
1876 // Deal with LCSSA PHI nodes in the loop nest exit block. For PHIs that have
1877 // incoming values defined in the outer loop, we have to add a new PHI
1878 // in the inner loop latch, which became the exit block of the outer loop,
1879 // after interchanging.
1880 if (OuterExit) {
1881 for (PHINode &P : OuterExit->phis()) {
1882 if (P.getNumIncomingValues() != 1)
1883 continue;
1884 // Skip Phis with incoming values defined in the inner loop. Those should
1885 // already have been updated.
1886 auto I = dyn_cast<Instruction>(P.getIncomingValue(0));
1887 if (!I || LI->getLoopFor(I->getParent()) == InnerLoop)
1888 continue;
1889
1890 PHINode *NewPhi = dyn_cast<PHINode>(P.clone());
1891 NewPhi->setIncomingValue(0, P.getIncomingValue(0));
1892 NewPhi->setIncomingBlock(0, OuterLatch);
1893 // We might have incoming edges from other BBs, i.e., the original outer
1894 // header.
1895 for (auto *Pred : predecessors(InnerLatch)) {
1896 if (Pred == OuterLatch)
1897 continue;
1898 NewPhi->addIncoming(P.getIncomingValue(0), Pred);
1899 }
1900 NewPhi->insertBefore(InnerLatch->getFirstNonPHIIt());
1901 P.setIncomingValue(0, NewPhi);
1902 }
1903 }
1904
1905 // Now adjust the incoming blocks for the LCSSA PHIs.
1906 // For PHIs moved from Inner's exit block, we need to replace Inner's latch
1907 // with the new latch.
1908 InnerLatch->replacePhiUsesWith(InnerLatch, OuterLatch);
1909}
1910
1911/// This deals with a corner case when a LCSSA phi node appears in a non-exit
1912/// block: the outer loop latch block does not need to be exit block of the
1913/// inner loop. Consider a loop that was in LCSSA form, but then some
1914/// transformation like loop-unswitch comes along and creates an empty block,
1915/// where BB5 in this example is the outer loop latch block:
1916///
1917/// BB4:
1918/// br label %BB5
1919/// BB5:
1920/// %old.cond.lcssa = phi i16 [ %cond, %BB4 ]
1921/// br outer.header
1922///
1923/// Interchange then brings it in LCSSA form again resulting in this chain of
1924/// single-input phi nodes:
1925///
1926/// BB4:
1927/// %new.cond.lcssa = phi i16 [ %cond, %BB3 ]
1928/// br label %BB5
1929/// BB5:
1930/// %old.cond.lcssa = phi i16 [ %new.cond.lcssa, %BB4 ]
1931///
1932/// The problem is that interchange can reoder blocks BB4 and BB5 placing the
1933/// use before the def if we don't check this. The solution is to simplify
1934/// lcssa phi nodes (remove) if they appear in non-exit blocks.
1935///
1936static void simplifyLCSSAPhis(Loop *OuterLoop, Loop *InnerLoop) {
1937 BasicBlock *InnerLoopExit = InnerLoop->getExitBlock();
1938 BasicBlock *OuterLoopLatch = OuterLoop->getLoopLatch();
1939
1940 // Do not modify lcssa phis where they actually belong, i.e. in exit blocks.
1941 if (OuterLoopLatch == InnerLoopExit)
1942 return;
1943
1944 // Collect and remove phis in non-exit blocks if they have 1 input.
1946 llvm::make_pointer_range(OuterLoopLatch->phis()));
1947 for (PHINode *Phi : Phis) {
1948 assert(Phi->getNumIncomingValues() == 1 && "Single input phi expected");
1949 LLVM_DEBUG(dbgs() << "Removing 1-input phi in non-exit block: " << *Phi
1950 << "\n");
1951 Phi->replaceAllUsesWith(Phi->getIncomingValue(0));
1952 Phi->eraseFromParent();
1953 }
1954}
1955
1956bool LoopInterchangeTransform::adjustLoopBranches() {
1957 LLVM_DEBUG(dbgs() << "adjustLoopBranches called\n");
1958 std::vector<DominatorTree::UpdateType> DTUpdates;
1959
1960 BasicBlock *OuterLoopPreHeader = OuterLoop->getLoopPreheader();
1961 BasicBlock *InnerLoopPreHeader = InnerLoop->getLoopPreheader();
1962
1963 assert(OuterLoopPreHeader != OuterLoop->getHeader() &&
1964 InnerLoopPreHeader != InnerLoop->getHeader() && OuterLoopPreHeader &&
1965 InnerLoopPreHeader && "Guaranteed by loop-simplify form");
1966
1967 simplifyLCSSAPhis(OuterLoop, InnerLoop);
1968
1969 // Ensure that both preheaders do not contain PHI nodes and have single
1970 // predecessors. This allows us to move them easily. We use
1971 // InsertPreHeaderForLoop to create an 'extra' preheader, if the existing
1972 // preheaders do not satisfy those conditions.
1973 if (isa<PHINode>(OuterLoopPreHeader->begin()) ||
1974 !OuterLoopPreHeader->getUniquePredecessor())
1975 OuterLoopPreHeader =
1976 InsertPreheaderForLoop(OuterLoop, DT, LI, nullptr, true);
1977 if (InnerLoopPreHeader == OuterLoop->getHeader())
1978 InnerLoopPreHeader =
1979 InsertPreheaderForLoop(InnerLoop, DT, LI, nullptr, true);
1980
1981 // Adjust the loop preheader
1982 BasicBlock *InnerLoopHeader = InnerLoop->getHeader();
1983 BasicBlock *OuterLoopHeader = OuterLoop->getHeader();
1984 BasicBlock *InnerLoopLatch = InnerLoop->getLoopLatch();
1985 BasicBlock *OuterLoopLatch = OuterLoop->getLoopLatch();
1986 BasicBlock *OuterLoopPredecessor = OuterLoopPreHeader->getUniquePredecessor();
1987 BasicBlock *InnerLoopLatchPredecessor =
1988 InnerLoopLatch->getUniquePredecessor();
1989 BasicBlock *InnerLoopLatchSuccessor;
1990 BasicBlock *OuterLoopLatchSuccessor;
1991
1992 BranchInst *OuterLoopLatchBI =
1993 dyn_cast<BranchInst>(OuterLoopLatch->getTerminator());
1994 BranchInst *InnerLoopLatchBI =
1995 dyn_cast<BranchInst>(InnerLoopLatch->getTerminator());
1996 BranchInst *OuterLoopHeaderBI =
1997 dyn_cast<BranchInst>(OuterLoopHeader->getTerminator());
1998 BranchInst *InnerLoopHeaderBI =
1999 dyn_cast<BranchInst>(InnerLoopHeader->getTerminator());
2000
2001 if (!OuterLoopPredecessor || !InnerLoopLatchPredecessor ||
2002 !OuterLoopLatchBI || !InnerLoopLatchBI || !OuterLoopHeaderBI ||
2003 !InnerLoopHeaderBI)
2004 return false;
2005
2006 BranchInst *InnerLoopLatchPredecessorBI =
2007 dyn_cast<BranchInst>(InnerLoopLatchPredecessor->getTerminator());
2008 BranchInst *OuterLoopPredecessorBI =
2009 dyn_cast<BranchInst>(OuterLoopPredecessor->getTerminator());
2010
2011 if (!OuterLoopPredecessorBI || !InnerLoopLatchPredecessorBI)
2012 return false;
2013 BasicBlock *InnerLoopHeaderSuccessor = InnerLoopHeader->getUniqueSuccessor();
2014 if (!InnerLoopHeaderSuccessor)
2015 return false;
2016
2017 // Adjust Loop Preheader and headers.
2018 // The branches in the outer loop predecessor and the outer loop header can
2019 // be unconditional branches or conditional branches with duplicates. Consider
2020 // this when updating the successors.
2021 updateSuccessor(OuterLoopPredecessorBI, OuterLoopPreHeader,
2022 InnerLoopPreHeader, DTUpdates, /*MustUpdateOnce=*/false);
2023 // The outer loop header might or might not branch to the outer latch.
2024 // We are guaranteed to branch to the inner loop preheader.
2025 if (llvm::is_contained(OuterLoopHeaderBI->successors(), OuterLoopLatch)) {
2026 // In this case the outerLoopHeader should branch to the InnerLoopLatch.
2027 updateSuccessor(OuterLoopHeaderBI, OuterLoopLatch, InnerLoopLatch,
2028 DTUpdates,
2029 /*MustUpdateOnce=*/false);
2030 }
2031 updateSuccessor(OuterLoopHeaderBI, InnerLoopPreHeader,
2032 InnerLoopHeaderSuccessor, DTUpdates,
2033 /*MustUpdateOnce=*/false);
2034
2035 // Adjust reduction PHI's now that the incoming block has changed.
2036 InnerLoopHeaderSuccessor->replacePhiUsesWith(InnerLoopHeader,
2037 OuterLoopHeader);
2038
2039 updateSuccessor(InnerLoopHeaderBI, InnerLoopHeaderSuccessor,
2040 OuterLoopPreHeader, DTUpdates);
2041
2042 // -------------Adjust loop latches-----------
2043 if (InnerLoopLatchBI->getSuccessor(0) == InnerLoopHeader)
2044 InnerLoopLatchSuccessor = InnerLoopLatchBI->getSuccessor(1);
2045 else
2046 InnerLoopLatchSuccessor = InnerLoopLatchBI->getSuccessor(0);
2047
2048 updateSuccessor(InnerLoopLatchPredecessorBI, InnerLoopLatch,
2049 InnerLoopLatchSuccessor, DTUpdates);
2050
2051 if (OuterLoopLatchBI->getSuccessor(0) == OuterLoopHeader)
2052 OuterLoopLatchSuccessor = OuterLoopLatchBI->getSuccessor(1);
2053 else
2054 OuterLoopLatchSuccessor = OuterLoopLatchBI->getSuccessor(0);
2055
2056 updateSuccessor(InnerLoopLatchBI, InnerLoopLatchSuccessor,
2057 OuterLoopLatchSuccessor, DTUpdates);
2058 updateSuccessor(OuterLoopLatchBI, OuterLoopLatchSuccessor, InnerLoopLatch,
2059 DTUpdates);
2060
2061 DT->applyUpdates(DTUpdates);
2062 restructureLoops(OuterLoop, InnerLoop, InnerLoopPreHeader,
2063 OuterLoopPreHeader);
2064
2065 moveLCSSAPhis(InnerLoopLatchSuccessor, InnerLoopHeader, InnerLoopLatch,
2066 OuterLoopHeader, OuterLoopLatch, InnerLoop->getExitBlock(),
2067 InnerLoop, LI);
2068 // For PHIs in the exit block of the outer loop, outer's latch has been
2069 // replaced by Inners'.
2070 OuterLoopLatchSuccessor->replacePhiUsesWith(OuterLoopLatch, InnerLoopLatch);
2071
2072 auto &OuterInnerReductions = LIL.getOuterInnerReductions();
2073 // Now update the reduction PHIs in the inner and outer loop headers.
2074 SmallVector<PHINode *, 4> InnerLoopPHIs, OuterLoopPHIs;
2075 for (PHINode &PHI : InnerLoopHeader->phis())
2076 if (OuterInnerReductions.contains(&PHI))
2077 InnerLoopPHIs.push_back(&PHI);
2078
2079 for (PHINode &PHI : OuterLoopHeader->phis())
2080 if (OuterInnerReductions.contains(&PHI))
2081 OuterLoopPHIs.push_back(&PHI);
2082
2083 // Now move the remaining reduction PHIs from outer to inner loop header and
2084 // vice versa. The PHI nodes must be part of a reduction across the inner and
2085 // outer loop and all the remains to do is and updating the incoming blocks.
2086 for (PHINode *PHI : OuterLoopPHIs) {
2087 LLVM_DEBUG(dbgs() << "Outer loop reduction PHIs:\n"; PHI->dump(););
2088 PHI->moveBefore(InnerLoopHeader->getFirstNonPHIIt());
2089 assert(OuterInnerReductions.count(PHI) && "Expected a reduction PHI node");
2090 }
2091 for (PHINode *PHI : InnerLoopPHIs) {
2092 LLVM_DEBUG(dbgs() << "Inner loop reduction PHIs:\n"; PHI->dump(););
2093 PHI->moveBefore(OuterLoopHeader->getFirstNonPHIIt());
2094 assert(OuterInnerReductions.count(PHI) && "Expected a reduction PHI node");
2095 }
2096
2097 // Update the incoming blocks for moved PHI nodes.
2098 OuterLoopHeader->replacePhiUsesWith(InnerLoopPreHeader, OuterLoopPreHeader);
2099 OuterLoopHeader->replacePhiUsesWith(InnerLoopLatch, OuterLoopLatch);
2100 InnerLoopHeader->replacePhiUsesWith(OuterLoopPreHeader, InnerLoopPreHeader);
2101 InnerLoopHeader->replacePhiUsesWith(OuterLoopLatch, InnerLoopLatch);
2102
2103 // Values defined in the outer loop header could be used in the inner loop
2104 // latch. In that case, we need to create LCSSA phis for them, because after
2105 // interchanging they will be defined in the new inner loop and used in the
2106 // new outer loop.
2107 SmallVector<Instruction *, 4> MayNeedLCSSAPhis;
2108 for (Instruction &I :
2109 make_range(OuterLoopHeader->begin(), std::prev(OuterLoopHeader->end())))
2110 MayNeedLCSSAPhis.push_back(&I);
2111 formLCSSAForInstructions(MayNeedLCSSAPhis, *DT, *LI, SE);
2112
2113 return true;
2114}
2115
2116bool LoopInterchangeTransform::adjustLoopLinks() {
2117 // Adjust all branches in the inner and outer loop.
2118 bool Changed = adjustLoopBranches();
2119 if (Changed) {
2120 // We have interchanged the preheaders so we need to interchange the data in
2121 // the preheaders as well. This is because the content of the inner
2122 // preheader was previously executed inside the outer loop.
2123 BasicBlock *OuterLoopPreHeader = OuterLoop->getLoopPreheader();
2124 BasicBlock *InnerLoopPreHeader = InnerLoop->getLoopPreheader();
2125 swapBBContents(OuterLoopPreHeader, InnerLoopPreHeader);
2126 }
2127 return Changed;
2128}
2129
2133 LPMUpdater &U) {
2134 Function &F = *LN.getParent();
2135 SmallVector<Loop *, 8> LoopList(LN.getLoops());
2136
2137 if (MaxMemInstrCount < 1) {
2138 LLVM_DEBUG(dbgs() << "MaxMemInstrCount should be at least 1");
2139 return PreservedAnalyses::all();
2140 }
2142
2143 // Ensure minimum depth of the loop nest to do the interchange.
2144 if (!hasSupportedLoopDepth(LoopList, ORE))
2145 return PreservedAnalyses::all();
2146 // Ensure computable loop nest.
2147 if (!isComputableLoopNest(&AR.SE, LoopList)) {
2148 LLVM_DEBUG(dbgs() << "Not valid loop candidate for interchange\n");
2149 return PreservedAnalyses::all();
2150 }
2151
2152 ORE.emit([&]() {
2153 return OptimizationRemarkAnalysis(DEBUG_TYPE, "Dependence",
2156 << "Computed dependence info, invoking the transform.";
2157 });
2158
2159 DependenceInfo DI(&F, &AR.AA, &AR.SE, &AR.LI);
2160 if (!LoopInterchange(&AR.SE, &AR.LI, &DI, &AR.DT, &AR, &ORE).run(LN))
2161 return PreservedAnalyses::all();
2162 U.markLoopNestChanged(true);
2164}
assert(UImm &&(UImm !=~static_cast< T >(0)) &&"Invalid immediate!")
This file defines the StringMap class.
Rewrite undef for PHI
ReachingDefInfo InstSet InstSet & Ignore
static GCRegistry::Add< StatepointGC > D("statepoint-example", "an example strategy for statepoint")
#define clEnumValN(ENUMVAL, FLAGNAME, DESC)
#define DEBUG_TYPE
Hexagon Common GEP
static void moveBBContents(BasicBlock &SourceBB, BasicBlock &TargetBB)
Move the contents of SourceBB to before the last instruction of TargetBB.
const AbstractManglingParser< Derived, Alloc >::OperatorInfo AbstractManglingParser< Derived, Alloc >::Ops[]
This file defines the interface for the loop cache analysis.
SmallVector< Loop *, 4 > LoopVector
Definition LoopFuse.cpp:382
Loop::LoopBounds::Direction Direction
Definition LoopInfo.cpp:231
static cl::opt< unsigned int > MaxMemInstrCount("loop-interchange-max-meminstr-count", cl::init(64), cl::Hidden, cl::desc("Maximum number of load-store instructions that should be handled " "in the dependency matrix. Higher value may lead to more interchanges " "at the cost of compile-time"))
static cl::opt< int > LoopInterchangeCostThreshold("loop-interchange-threshold", cl::init(0), cl::Hidden, cl::desc("Interchange if you gain more than this number"))
static PHINode * findInnerReductionPhi(Loop *L, Value *V, SmallVectorImpl< Instruction * > &HasNoWrapInsts)
static cl::opt< unsigned int > MinLoopNestDepth("loop-interchange-min-loop-nest-depth", cl::init(2), cl::Hidden, cl::desc("Minimum depth of loop nest considered for the transform"))
static bool isComputableLoopNest(ScalarEvolution *SE, ArrayRef< Loop * > LoopList)
static bool areOuterLoopExitPHIsSupported(Loop *OuterLoop, Loop *InnerLoop)
static void simplifyLCSSAPhis(Loop *OuterLoop, Loop *InnerLoop)
This deals with a corner case when a LCSSA phi node appears in a non-exit block: the outer loop latch...
static void interChangeDependencies(CharMatrix &DepMatrix, unsigned FromIndx, unsigned ToIndx)
static void moveLCSSAPhis(BasicBlock *InnerExit, BasicBlock *InnerHeader, BasicBlock *InnerLatch, BasicBlock *OuterHeader, BasicBlock *OuterLatch, BasicBlock *OuterExit, Loop *InnerLoop, LoopInfo *LI)
static void printDepMatrix(CharMatrix &DepMatrix)
static void swapBBContents(BasicBlock *BB1, BasicBlock *BB2)
Swap instructions between BB1 and BB2 but keep terminators intact.
static cl::opt< unsigned int > MaxLoopNestDepth("loop-interchange-max-loop-nest-depth", cl::init(10), cl::Hidden, cl::desc("Maximum depth of loop nest considered for the transform"))
static bool hasSupportedLoopDepth(ArrayRef< Loop * > LoopList, OptimizationRemarkEmitter &ORE)
static bool inThisOrder(const Instruction *Src, const Instruction *Dst)
Return true if Src appears before Dst in the same basic block.
static bool areInnerLoopLatchPHIsSupported(Loop *OuterLoop, Loop *InnerLoop)
static bool canVectorize(const CharMatrix &DepMatrix, unsigned LoopId)
Return true if we can vectorize the loop specified by LoopId.
static bool isLegalToInterChangeLoops(CharMatrix &DepMatrix, unsigned InnerLoopId, unsigned OuterLoopId)
static bool areInnerLoopExitPHIsSupported(Loop *InnerL, Loop *OuterL, SmallPtrSetImpl< PHINode * > &Reductions)
#define DEBUG_TYPE
static Value * followLCSSA(Value *SV)
static void populateWorklist(Loop &L, LoopVector &LoopList)
static bool populateDependencyMatrix(CharMatrix &DepMatrix, unsigned Level, Loop *L, DependenceInfo *DI, ScalarEvolution *SE, OptimizationRemarkEmitter *ORE)
static void updateSuccessor(BranchInst *BI, BasicBlock *OldBB, BasicBlock *NewBB, std::vector< DominatorTree::UpdateType > &DTUpdates, bool MustUpdateOnce=true)
static std::optional< bool > isLexicographicallyPositive(ArrayRef< char > DV, unsigned Begin, unsigned End)
static cl::list< RuleTy > Profitabilities("loop-interchange-profitabilities", cl::ZeroOrMore, cl::MiscFlags::CommaSeparated, cl::Hidden, cl::desc("List of profitability heuristics to be used. They are applied in " "the given order"), cl::list_init< RuleTy >({RuleTy::PerLoopCacheAnalysis, RuleTy::PerInstrOrderCost, RuleTy::ForVectorization}), cl::values(clEnumValN(RuleTy::PerLoopCacheAnalysis, "cache", "Prioritize loop cache cost"), clEnumValN(RuleTy::PerInstrOrderCost, "instorder", "Prioritize the IVs order of each instruction"), clEnumValN(RuleTy::ForVectorization, "vectorize", "Prioritize vectorization"), clEnumValN(RuleTy::Ignore, "ignore", "Ignore profitability, force interchange (does not " "work with other options)")))
static bool noDuplicateRulesAndIgnore(ArrayRef< RuleTy > Rules)
This file defines the interface for the loop nest analysis.
This header provides classes for managing a pipeline of passes over loops in LLVM IR.
loop Loop Strength Reduction
#define F(x, y, z)
Definition MD5.cpp:54
#define I(x, y, z)
Definition MD5.cpp:57
uint64_t IntrinsicInst * II
#define P(N)
This file contains some templates that are useful if you are working with the STL at all.
SmallVector< Value *, 8 > ValueVector
This file defines the SmallSet class.
This file defines the SmallVector class.
static bool isProfitable(const StableFunctionMap::StableFunctionEntries &SFS)
This file defines the 'Statistic' class, which is designed to be an easy way to expose various metric...
#define STATISTIC(VARNAME, DESC)
Definition Statistic.h:171
#define LLVM_DEBUG(...)
Definition Debug.h:114
ArrayRef - Represent a constant reference to an array (0 or more elements consecutively in memory),...
Definition ArrayRef.h:40
const T & front() const
front - Get the first element.
Definition ArrayRef.h:145
size_t size() const
size - Get the array size.
Definition ArrayRef.h:142
ArrayRef< T > slice(size_t N, size_t M) const
slice(n, m) - Chop off the first N elements of the array, and keep M elements in the array.
Definition ArrayRef.h:186
LLVM Basic Block Representation.
Definition BasicBlock.h:62
iterator end()
Definition BasicBlock.h:472
iterator begin()
Instruction iterator methods.
Definition BasicBlock.h:459
iterator_range< const_phi_iterator > phis() const
Returns a range that iterates over the phis in the basic block.
Definition BasicBlock.h:528
const Function * getParent() const
Return the enclosing method, or null if none.
Definition BasicBlock.h:213
LLVM_ABI iterator_range< filter_iterator< BasicBlock::const_iterator, std::function< bool(const Instruction &)> > > instructionsWithoutDebug(bool SkipPseudoOp=true) const
Return a const iterator range over the instructions in the block, skipping any debug instructions.
LLVM_ABI InstListType::const_iterator getFirstNonPHIIt() const
Returns an iterator to the first instruction in this block that is not a PHINode instruction.
LLVM_ABI const BasicBlock * getUniqueSuccessor() const
Return the successor of this block if it has a unique successor.
LLVM_ABI void replacePhiUsesWith(BasicBlock *Old, BasicBlock *New)
Update all phi nodes in this basic block to refer to basic block New instead of basic block Old.
LLVM_ABI const BasicBlock * getUniquePredecessor() const
Return the predecessor of this block if it has a unique predecessor block.
const Instruction * getTerminator() const LLVM_READONLY
Returns the terminator instruction if the block is well formed or null if the block is not well forme...
Definition BasicBlock.h:233
void splice(BasicBlock::iterator ToIt, BasicBlock *FromBB)
Transfer all instructions from FromBB to this basic block at ToIt.
Definition BasicBlock.h:662
Conditional or Unconditional Branch instruction.
iterator_range< succ_op_iterator > successors()
bool isConditional() const
BasicBlock * getSuccessor(unsigned i) const
Value * getCondition() const
static std::unique_ptr< CacheCost > getCacheCost(Loop &Root, LoopStandardAnalysisResults &AR, DependenceInfo &DI, std::optional< unsigned > TRT=std::nullopt)
Create a CacheCost for the loop nest rooted by Root.
CacheCostTy getLoopCost(const Loop &L) const
Return the estimated cost of loop L if the given loop is part of the loop nest associated with this o...
iterator find(const_arg_type_t< KeyT > Val)
Definition DenseMap.h:178
iterator end()
Definition DenseMap.h:81
DependenceInfo - This class is the main dependence-analysis driver.
LLVM_ABI std::unique_ptr< Dependence > depends(Instruction *Src, Instruction *Dst, bool UnderRuntimeAssumptions=false)
depends - Tests for a dependence between the Src and Dst instructions.
void applyUpdates(ArrayRef< UpdateType > Updates)
Inform the dominator tree about a sequence of CFG edge insertions and deletions and perform a batch u...
static LLVM_ABI bool isInductionPHI(PHINode *Phi, const Loop *L, ScalarEvolution *SE, InductionDescriptor &D, const SCEV *Expr=nullptr, SmallVectorImpl< Instruction * > *CastsToIgnore=nullptr)
Returns true if Phi is an induction in the loop L.
LLVM_ABI void insertBefore(InstListType::iterator InsertPos)
Insert an unlinked instruction into a basic block immediately before the specified position.
LLVM_ABI bool mayHaveSideEffects() const LLVM_READONLY
Return true if the instruction may have side effects.
This class provides an interface for updating the loop pass manager based on mutations to the loop ne...
bool contains(const LoopT *L) const
Return true if the specified loop is contained within in this loop.
BlockT * getLoopLatch() const
If there is a single latch block for this loop, return it.
bool isInnermost() const
Return true if the loop does not contain any (natural) loops.
void removeBlockFromLoop(BlockT *BB)
This removes the specified basic block from the current loop, updating the Blocks as appropriate.
const std::vector< LoopT * > & getSubLoops() const
Return the loops contained entirely within this loop.
BlockT * getHeader() const
iterator_range< block_iterator > blocks() const
void addChildLoop(LoopT *NewChild)
Add the specified loop to be a child of this loop.
void addBlockEntry(BlockT *BB)
This adds a basic block directly to the basic block list.
BlockT * getExitBlock() const
If getExitBlocks would return exactly one block, return that block.
BlockT * getLoopPreheader() const
If there is a preheader for this loop, return it.
BlockT * getExitingBlock() const
If getExitingBlocks would return exactly one block, return that block.
iterator begin() const
BlockT * getUniqueExitBlock() const
If getUniqueExitBlocks would return exactly one block, return that block.
LoopT * removeChildLoop(iterator I)
This removes the specified child from being a subloop of this loop.
void changeTopLevelLoop(LoopT *OldLoop, LoopT *NewLoop)
Replace the specified loop in the top-level loops list with the indicated loop.
void changeLoopFor(BlockT *BB, LoopT *L)
Change the top-level loop that contains BB to the specified loop.
LoopT * getLoopFor(const BlockT *BB) const
Return the inner most loop that BB lives in.
This class represents a loop nest and can be used to query its properties.
static const BasicBlock & skipEmptyBlockUntil(const BasicBlock *From, const BasicBlock *End, bool CheckUniquePred=false)
Recursivelly traverse all empty 'single successor' basic blocks of From (if there are any).
ArrayRef< Loop * > getLoops() const
Get the loops in the nest.
Function * getParent() const
Return the function to which the loop-nest belongs.
Loop & getOutermostLoop() const
Return the outermost loop in the loop nest.
Represents a single loop in the control flow graph.
Definition LoopInfo.h:40
DebugLoc getStartLoc() const
Return the debug location of the start of this loop.
Definition LoopInfo.cpp:632
bool isLoopInvariant(const Value *V) const
Return true if the specified value is loop invariant.
Definition LoopInfo.cpp:61
StringRef getName() const
Definition LoopInfo.h:389
Diagnostic information for optimization analysis remarks.
The optimization diagnostic interface.
LLVM_ABI void emit(DiagnosticInfoOptimizationBase &OptDiag)
Output the remark via the diagnostic handler and to the optimization record file.
Diagnostic information for missed-optimization remarks.
void addIncoming(Value *V, BasicBlock *BB)
Add an incoming value to the end of the PHI list.
op_range incoming_values()
void setIncomingBlock(unsigned i, BasicBlock *BB)
void setIncomingValue(unsigned i, Value *V)
static unsigned getIncomingValueNumForOperand(unsigned i)
A set of analyses that are preserved following a run of a transformation pass.
Definition Analysis.h:112
static PreservedAnalyses all()
Construct a special preserved set that preserves all passes.
Definition Analysis.h:118
The RecurrenceDescriptor is used to identify recurrences variables in a loop.
Instruction * getExactFPMathInst() const
Returns 1st non-reassociative FP instruction in the PHI node's use-chain.
static LLVM_ABI bool isReductionPHI(PHINode *Phi, Loop *TheLoop, RecurrenceDescriptor &RedDes, DemandedBits *DB=nullptr, AssumptionCache *AC=nullptr, DominatorTree *DT=nullptr, ScalarEvolution *SE=nullptr)
Returns true if Phi is a reduction in TheLoop.
LLVM_ABI SmallVector< Instruction *, 4 > getReductionOpChain(PHINode *Phi, Loop *L) const
Attempts to find a chain of operations from Phi to LoopExitInst that can be treated as a set of reduc...
RecurKind getRecurrenceKind() const
This class represents an analyzed expression in the program.
LLVM_ABI bool isZero() const
Return true if the expression is a constant zero.
The main scalar evolution driver.
LLVM_ABI const SCEV * getBackedgeTakenCount(const Loop *L, ExitCountKind Kind=Exact)
If the specified loop has a predictable backedge-taken count, return it, otherwise return a SCEVCould...
LLVM_ABI const SCEV * getSCEV(Value *V)
Return a SCEV expression for the full generality of the specified expression.
LLVM_ABI void forgetLoop(const Loop *L)
This method should be called by the client when it has changed a loop in a way that may effect Scalar...
LLVM_ABI bool isLoopInvariant(const SCEV *S, const Loop *L)
Return true if the value of the given SCEV is unchanging in the specified loop.
LLVM_ABI bool isSCEVable(Type *Ty) const
Test if values of the given type are analyzable within the SCEV framework.
size_type size() const
Determine the number of elements in the SetVector.
Definition SetVector.h:103
bool insert(const value_type &X)
Insert a new element into the SetVector.
Definition SetVector.h:151
A templated base class for SmallPtrSet which provides the typesafe interface that is common across al...
SmallSet - This maintains a set of unique values, optimizing for the case when the set is small (less...
Definition SmallSet.h:133
This class consists of common code factored out of the SmallVector class to reduce code duplication b...
void push_back(const T &Elt)
This is a 'vector' (really, a variable-sized array), optimized for the case when the array is small.
StringMap - This is an unconventional map that is specialized for handling keys that are "strings",...
Definition StringMap.h:133
std::pair< iterator, bool > try_emplace(StringRef Key, ArgsTy &&...Args)
Emplace a new element for the specified key into the map if the key isn't already in the map.
Definition StringMap.h:381
StringRef - Represent a constant reference to a string, i.e.
Definition StringRef.h:55
constexpr size_t size() const
size - Get the string size.
Definition StringRef.h:146
A Use represents the edge between a Value definition and its users.
Definition Use.h:35
op_range operands()
Definition User.h:293
LLVM Value Representation.
Definition Value.h:75
iterator_range< user_iterator > users()
Definition Value.h:426
const ParentTy * getParent() const
Definition ilist_node.h:34
self_iterator getIterator()
Definition ilist_node.h:123
Changed
#define llvm_unreachable(msg)
Marks that the current location is not supposed to be reachable.
unsigned ID
LLVM IR allows to use arbitrary numbers as calling convention identifiers.
Definition CallingConv.h:24
@ C
The default llvm calling convention, compatible with C.
Definition CallingConv.h:34
@ BasicBlock
Various leaf nodes.
Definition ISDOpcodes.h:81
list_initializer< Ty > list_init(ArrayRef< Ty > Vals)
ValuesClass values(OptsTy... Options)
Helper to build a ValuesClass by forwarding a variable number of arguments as an initializer list to ...
initializer< Ty > init(const Ty &Val)
PointerTypeMap run(const Module &M)
Compute the PointerTypeMap for the module M.
friend class Instruction
Iterator for Instructions in a `BasicBlock.
Definition BasicBlock.h:73
This is an optimization pass for GlobalISel generic memory operations.
FunctionAddr VTableAddr Value
Definition InstrProf.h:137
LLVM_ABI BasicBlock * InsertPreheaderForLoop(Loop *L, DominatorTree *DT, LoopInfo *LI, MemorySSAUpdater *MSSAU, bool PreserveLCSSA)
InsertPreheaderForLoop - Once we discover that a loop doesn't have a preheader, this method is called...
bool all_of(R &&range, UnaryPredicate P)
Provide wrappers to std::all_of which take ranges instead of having to pass begin/end explicitly.
Definition STLExtras.h:1737
InstructionCost Cost
auto enumerate(FirstRange &&First, RestRanges &&...Rest)
Given two or more input ranges, returns a new range whose values are tuples (A, B,...
Definition STLExtras.h:2530
decltype(auto) dyn_cast(const From &Val)
dyn_cast<X> - Return the argument parameter cast to the specified type.
Definition Casting.h:643
auto successors(const MachineBasicBlock *BB)
LLVM_ABI bool formLCSSARecursively(Loop &L, const DominatorTree &DT, const LoopInfo *LI, ScalarEvolution *SE)
Put a loop nest into LCSSA form.
Definition LCSSA.cpp:449
iterator_range< T > make_range(T x, T y)
Convenience function for iterating over sub-ranges.
iterator_range< early_inc_iterator_impl< detail::IterOfRange< RangeT > > > make_early_inc_range(RangeT &&Range)
Make a range that does early increment to allow mutation of the underlying range without disrupting i...
Definition STLExtras.h:632
auto map_range(ContainerTy &&C, FuncTy F)
Definition STLExtras.h:364
AnalysisManager< Loop, LoopStandardAnalysisResults & > LoopAnalysisManager
The loop analysis manager.
OutputIt transform(R &&Range, OutputIt d_first, UnaryFunction F)
Wrapper function around std::transform to apply a function to a range and store the result elsewhere.
Definition STLExtras.h:2016
bool any_of(R &&range, UnaryPredicate P)
Provide wrappers to std::any_of which take ranges instead of having to pass begin/end explicitly.
Definition STLExtras.h:1744
LLVM_ABI raw_ostream & dbgs()
dbgs() - This returns a reference to a raw_ostream for debugging messages.
Definition Debug.cpp:207
class LLVM_GSL_OWNER SmallVector
Forward declaration of SmallVector so that calculateSmallVectorDefaultInlinedElements can reference s...
bool isa(const From &Val)
isa<X> - Return true if the parameter to the template is an instance of one of the template type argu...
Definition Casting.h:547
auto drop_end(T &&RangeOrContainer, size_t N=1)
Return a range covering RangeOrContainer with the last N elements excluded.
Definition STLExtras.h:323
RecurKind
These are the kinds of recurrences that we support.
@ UMin
Unsigned integer min implemented in terms of select(cmp()).
@ FMinimumNum
FP min with llvm.minimumnum semantics.
@ Or
Bitwise or logical OR of integers.
@ FMinimum
FP min with llvm.minimum semantics.
@ Mul
Product of integers.
@ AnyOf
AnyOf reduction with select(cmp(),x,y) where one of (x,y) is loop invariant, and both x and y are int...
@ Xor
Bitwise or logical XOR of integers.
@ FMax
FP max implemented in terms of select(cmp()).
@ FMaximum
FP max with llvm.maximum semantics.
@ FMulAdd
Sum of float products with llvm.fmuladd(a * b + sum).
@ FMul
Product of floats.
@ SMax
Signed integer max implemented in terms of select(cmp()).
@ And
Bitwise or logical AND of integers.
@ SMin
Signed integer min implemented in terms of select(cmp()).
@ FMin
FP min implemented in terms of select(cmp()).
@ Add
Sum of integers.
@ FAdd
Sum of floats.
@ FMaximumNum
FP max with llvm.maximumnum semantics.
@ UMax
Unsigned integer max implemented in terms of select(cmp()).
auto count(R &&Range, const E &Element)
Wrapper function around std::count to count the number of times an element Element occurs in the give...
Definition STLExtras.h:2002
DWARFExpression::Operation Op
ArrayRef(const T &OneElt) -> ArrayRef< T >
LLVM_ABI bool formLCSSAForInstructions(SmallVectorImpl< Instruction * > &Worklist, const DominatorTree &DT, const LoopInfo &LI, ScalarEvolution *SE, SmallVectorImpl< PHINode * > *PHIsToRemove=nullptr, SmallVectorImpl< PHINode * > *InsertedPHIs=nullptr)
Ensures LCSSA form for every instruction from the Worklist in the scope of innermost containing loop.
Definition LCSSA.cpp:308
decltype(auto) cast(const From &Val)
cast<X> - Return the argument parameter cast to the specified type.
Definition Casting.h:559
LLVM_ABI BasicBlock * SplitBlock(BasicBlock *Old, BasicBlock::iterator SplitPt, DominatorTree *DT, LoopInfo *LI=nullptr, MemorySSAUpdater *MSSAU=nullptr, const Twine &BBName="", bool Before=false)
Split the specified block at the specified instruction.
LLVM_ABI PreservedAnalyses getLoopPassPreservedAnalyses()
Returns the minimum set of Analyses that all loop passes must preserve.
auto predecessors(const MachineBasicBlock *BB)
iterator_range< pointer_iterator< WrappedIteratorT > > make_pointer_range(RangeT &&Range)
Definition iterator.h:365
bool is_contained(R &&Range, const E &Element)
Returns true if Element is found in Range.
Definition STLExtras.h:1945
void swap(llvm::BitVector &LHS, llvm::BitVector &RHS)
Implement std::swap in terms of BitVector swap.
Definition BitVector.h:872
Incoming for lane maks phi as machine instruction, incoming register Reg and incoming block Block are...
PreservedAnalyses run(LoopNest &L, LoopAnalysisManager &AM, LoopStandardAnalysisResults &AR, LPMUpdater &U)
The adaptor from a function pass to a loop pass computes these analyses and makes them available to t...