LLVM 23.0.0git
MachineBlockPlacement.cpp
Go to the documentation of this file.
1//===- MachineBlockPlacement.cpp - Basic Block Code Layout optimization ---===//
2//
3// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4// See https://llvm.org/LICENSE.txt for license information.
5// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6//
7//===----------------------------------------------------------------------===//
8//
9// This file implements basic block placement transformations using the CFG
10// structure and branch probability estimates.
11//
12// The pass strives to preserve the structure of the CFG (that is, retain
13// a topological ordering of basic blocks) in the absence of a *strong* signal
14// to the contrary from probabilities. However, within the CFG structure, it
15// attempts to choose an ordering which favors placing more likely sequences of
16// blocks adjacent to each other.
17//
18// The algorithm works from the inner-most loop within a function outward, and
19// at each stage walks through the basic blocks, trying to coalesce them into
20// sequential chains where allowed by the CFG (or demanded by heavy
21// probabilities). Finally, it walks the blocks in topological order, and the
22// first time it reaches a chain of basic blocks, it schedules them in the
23// function in-order.
24//
25//===----------------------------------------------------------------------===//
26
28#include "BranchFolding.h"
29#include "llvm/ADT/ArrayRef.h"
30#include "llvm/ADT/DenseMap.h"
31#include "llvm/ADT/STLExtras.h"
32#include "llvm/ADT/SetVector.h"
35#include "llvm/ADT/Statistic.h"
52#include "llvm/IR/DebugLoc.h"
53#include "llvm/IR/Function.h"
54#include "llvm/IR/PrintPasses.h"
56#include "llvm/Pass.h"
63#include "llvm/Support/Debug.h"
67#include <algorithm>
68#include <cassert>
69#include <cstdint>
70#include <iterator>
71#include <memory>
72#include <string>
73#include <tuple>
74#include <utility>
75#include <vector>
76
77using namespace llvm;
78
79#define DEBUG_TYPE "block-placement"
80
81STATISTIC(NumCondBranches, "Number of conditional branches");
82STATISTIC(NumUncondBranches, "Number of unconditional branches");
83STATISTIC(CondBranchTakenFreq,
84 "Potential frequency of taking conditional branches");
85STATISTIC(UncondBranchTakenFreq,
86 "Potential frequency of taking unconditional branches");
87
89 "align-all-blocks",
90 cl::desc("Force the alignment of all blocks in the function in log2 format "
91 "(e.g 4 means align on 16B boundaries)."),
93
95 "align-all-nofallthru-blocks",
96 cl::desc("Force the alignment of all blocks that have no fall-through "
97 "predecessors (i.e. don't add nops that are executed). In log2 "
98 "format (e.g 4 means align on 16B boundaries)."),
100
102 "max-bytes-for-alignment",
103 cl::desc("Forces the maximum bytes allowed to be emitted when padding for "
104 "alignment"),
105 cl::init(0), cl::Hidden);
106
108 "block-placement-predecessor-limit",
109 cl::desc("For blocks with more predecessors, certain layout optimizations"
110 "will be disabled to prevent quadratic compile time."),
111 cl::init(1000), cl::Hidden);
112
113// FIXME: Find a good default for this flag and remove the flag.
115 "block-placement-exit-block-bias",
116 cl::desc("Block frequency percentage a loop exit block needs "
117 "over the original exit to be considered the new exit."),
118 cl::init(0), cl::Hidden);
119
120// Definition:
121// - Outlining: placement of a basic block outside the chain or hot path.
122
124 "loop-to-cold-block-ratio",
125 cl::desc("Outline loop blocks from loop chain if (frequency of loop) / "
126 "(frequency of block) is greater than this ratio"),
127 cl::init(5), cl::Hidden);
128
129static cl::opt<bool>
130 ForceLoopColdBlock("force-loop-cold-block",
131 cl::desc("Force outlining cold blocks from loops."),
132 cl::init(false), cl::Hidden);
133
134static cl::opt<bool>
135 PreciseRotationCost("precise-rotation-cost",
136 cl::desc("Model the cost of loop rotation more "
137 "precisely by using profile data."),
138 cl::init(false), cl::Hidden);
139
140static cl::opt<bool>
141 ForcePreciseRotationCost("force-precise-rotation-cost",
142 cl::desc("Force the use of precise cost "
143 "loop rotation strategy."),
144 cl::init(false), cl::Hidden);
145
147 "misfetch-cost",
148 cl::desc("Cost that models the probabilistic risk of an instruction "
149 "misfetch due to a jump comparing to falling through, whose cost "
150 "is zero."),
151 cl::init(1), cl::Hidden);
152
153static cl::opt<unsigned> JumpInstCost("jump-inst-cost",
154 cl::desc("Cost of jump instructions."),
155 cl::init(1), cl::Hidden);
156static cl::opt<bool>
157 TailDupPlacement("tail-dup-placement",
158 cl::desc("Perform tail duplication during placement. "
159 "Creates more fallthrough opportunities in "
160 "outline branches."),
161 cl::init(true), cl::Hidden);
162
163static cl::opt<bool>
164 BranchFoldPlacement("branch-fold-placement",
165 cl::desc("Perform branch folding during placement. "
166 "Reduces code size."),
167 cl::init(true), cl::Hidden);
168
169// Heuristic for tail duplication.
171 "tail-dup-placement-threshold",
172 cl::desc("Instruction cutoff for tail duplication during layout. "
173 "Tail merging during layout is forced to have a threshold "
174 "that won't conflict."),
175 cl::init(2), cl::Hidden);
176
177// Heuristic for aggressive tail duplication.
179 "tail-dup-placement-aggressive-threshold",
180 cl::desc("Instruction cutoff for aggressive tail duplication during "
181 "layout. Used at -O3. Tail merging during layout is forced to "
182 "have a threshold that won't conflict."),
183 cl::init(4), cl::Hidden);
184
185// Heuristic for tail duplication.
187 "tail-dup-placement-penalty",
188 cl::desc(
189 "Cost penalty for blocks that can avoid breaking CFG by copying. "
190 "Copying can increase fallthrough, but it also increases icache "
191 "pressure. This parameter controls the penalty to account for that. "
192 "Percent as integer."),
193 cl::init(2), cl::Hidden);
194
195// Heuristic for tail duplication if profile count is used in cost model.
197 "tail-dup-profile-percent-threshold",
198 cl::desc("If profile count information is used in tail duplication cost "
199 "model, the gained fall through number from tail duplication "
200 "should be at least this percent of hot count."),
201 cl::init(50), cl::Hidden);
202
203// Heuristic for triangle chains.
205 "triangle-chain-count",
206 cl::desc("Number of triangle-shaped-CFG's that need to be in a row for the "
207 "triangle tail duplication heuristic to kick in. 0 to disable."),
208 cl::init(2), cl::Hidden);
209
210// Use case: When block layout is visualized after MBP pass, the basic blocks
211// are labeled in layout order; meanwhile blocks could be numbered in a
212// different order. It's hard to map between the graph and pass output.
213// With this option on, the basic blocks are renumbered in function layout
214// order. For debugging only.
216 "renumber-blocks-before-view",
217 cl::desc(
218 "If true, basic blocks are re-numbered before MBP layout is printed "
219 "into a dot graph. Only used when a function is being printed."),
220 cl::init(false), cl::Hidden);
221
223 "ext-tsp-block-placement-max-blocks",
224 cl::desc("Maximum number of basic blocks in a function to run ext-TSP "
225 "block placement."),
226 cl::init(UINT_MAX), cl::Hidden);
227
228// Apply the ext-tsp algorithm minimizing the size of a binary.
229static cl::opt<bool>
230 ApplyExtTspForSize("apply-ext-tsp-for-size", cl::init(false), cl::Hidden,
231 cl::desc("Use ext-tsp for size-aware block placement."));
232
233namespace llvm {
238
239// Internal option used to control BFI display only after MBP pass.
240// Defined in CodeGen/MachineBlockFrequencyInfo.cpp:
241// -view-block-layout-with-bfi=
243
244// Command line option to specify the name of the function for CFG dump
245// Defined in Analysis/BlockFrequencyInfo.cpp: -view-bfi-func-name=
247} // namespace llvm
248
249namespace {
250
251class BlockChain;
252
253/// Type for our function-wide basic block -> block chain mapping.
254using BlockToChainMapType = DenseMap<const MachineBasicBlock *, BlockChain *>;
255
256/// A chain of blocks which will be laid out contiguously.
257///
258/// This is the datastructure representing a chain of consecutive blocks that
259/// are profitable to layout together in order to maximize fallthrough
260/// probabilities and code locality. We also can use a block chain to represent
261/// a sequence of basic blocks which have some external (correctness)
262/// requirement for sequential layout.
263///
264/// Chains can be built around a single basic block and can be merged to grow
265/// them. They participate in a block-to-chain mapping, which is updated
266/// automatically as chains are merged together.
267class BlockChain {
268 /// The sequence of blocks belonging to this chain.
269 ///
270 /// This is the sequence of blocks for a particular chain. These will be laid
271 /// out in-order within the function.
273
274 /// A handle to the function-wide basic block to block chain mapping.
275 ///
276 /// This is retained in each block chain to simplify the computation of child
277 /// block chains for SCC-formation and iteration. We store the edges to child
278 /// basic blocks, and map them back to their associated chains using this
279 /// structure.
280 BlockToChainMapType &BlockToChain;
281
282public:
283 /// Construct a new BlockChain.
284 ///
285 /// This builds a new block chain representing a single basic block in the
286 /// function. It also registers itself as the chain that block participates
287 /// in with the BlockToChain mapping.
288 BlockChain(BlockToChainMapType &BlockToChain, MachineBasicBlock *BB)
289 : Blocks(1, BB), BlockToChain(BlockToChain) {
290 assert(BB && "Cannot create a chain with a null basic block");
291 BlockToChain[BB] = this;
292 }
293
294 /// Iterator over blocks within the chain.
297
298 /// Beginning of blocks within the chain.
299 iterator begin() { return Blocks.begin(); }
300 const_iterator begin() const { return Blocks.begin(); }
301
302 /// End of blocks within the chain.
303 iterator end() { return Blocks.end(); }
304 const_iterator end() const { return Blocks.end(); }
305
306 bool remove(MachineBasicBlock *BB) {
307 for (iterator i = begin(); i != end(); ++i) {
308 if (*i == BB) {
309 Blocks.erase(i);
310 return true;
311 }
312 }
313 return false;
314 }
315
316 /// Merge a block chain into this one.
317 ///
318 /// This routine merges a block chain into this one. It takes care of forming
319 /// a contiguous sequence of basic blocks, updating the edge list, and
320 /// updating the block -> chain mapping. It does not free or tear down the
321 /// old chain, but the old chain's block list is no longer valid.
322 void merge(MachineBasicBlock *BB, BlockChain *Chain) {
323 assert(BB && "Can't merge a null block.");
324 assert(!Blocks.empty() && "Can't merge into an empty chain.");
325
326 // Fast path in case we don't have a chain already.
327 if (!Chain) {
328 assert(!BlockToChain[BB] &&
329 "Passed chain is null, but BB has entry in BlockToChain.");
330 Blocks.push_back(BB);
331 BlockToChain[BB] = this;
332 return;
333 }
334
335 assert(BB == *Chain->begin() && "Passed BB is not head of Chain.");
336 assert(Chain->begin() != Chain->end());
337
338 // Update the incoming blocks to point to this chain, and add them to the
339 // chain structure.
340 for (MachineBasicBlock *ChainBB : *Chain) {
341 Blocks.push_back(ChainBB);
342 assert(BlockToChain[ChainBB] == Chain && "Incoming blocks not in chain.");
343 BlockToChain[ChainBB] = this;
344 }
345 }
346
347#ifndef NDEBUG
348 /// Dump the blocks in this chain.
349 LLVM_DUMP_METHOD void dump() {
350 for (MachineBasicBlock *MBB : *this)
351 MBB->dump();
352 }
353#endif // NDEBUG
354
355 /// Count of predecessors of any block within the chain which have not
356 /// yet been scheduled. In general, we will delay scheduling this chain
357 /// until those predecessors are scheduled (or we find a sufficiently good
358 /// reason to override this heuristic.) Note that when forming loop chains,
359 /// blocks outside the loop are ignored and treated as if they were already
360 /// scheduled.
361 ///
362 /// Note: This field is reinitialized multiple times - once for each loop,
363 /// and then once for the function as a whole.
364 unsigned UnscheduledPredecessors = 0;
365};
366
367class MachineBlockPlacement {
368 /// A type for a block filter set.
369 using BlockFilterSet = SmallSetVector<const MachineBasicBlock *, 16>;
370
371 /// Pair struct containing basic block and taildup profitability
372 struct BlockAndTailDupResult {
373 MachineBasicBlock *BB = nullptr;
374 bool ShouldTailDup;
375 };
376
377 /// Triple struct containing edge weight and the edge.
378 struct WeightedEdge {
379 BlockFrequency Weight;
380 MachineBasicBlock *Src = nullptr;
381 MachineBasicBlock *Dest = nullptr;
382 };
383
384 /// work lists of blocks that are ready to be laid out
387
388 /// Edges that have already been computed as optimal.
389 DenseMap<const MachineBasicBlock *, BlockAndTailDupResult> ComputedEdges;
390
391 /// Machine Function
392 MachineFunction *F = nullptr;
393
394 /// A handle to the branch probability pass.
395 const MachineBranchProbabilityInfo *MBPI = nullptr;
396
397 /// A handle to the function-wide block frequency pass.
398 std::unique_ptr<MBFIWrapper> MBFI;
399
400 /// A handle to the loop info.
401 MachineLoopInfo *MLI = nullptr;
402
403 /// Preferred loop exit.
404 /// Member variable for convenience. It may be removed by duplication deep
405 /// in the call stack.
406 MachineBasicBlock *PreferredLoopExit = nullptr;
407
408 /// A handle to the target's instruction info.
409 const TargetInstrInfo *TII = nullptr;
410
411 /// A handle to the target's lowering info.
412 const TargetLoweringBase *TLI = nullptr;
413
414 /// A handle to the post dominator tree.
415 MachinePostDominatorTree *MPDT = nullptr;
416
417 ProfileSummaryInfo *PSI = nullptr;
418
419 // Tail merging is also determined based on
420 // whether structured CFG is required.
421 bool AllowTailMerge;
422
423 CodeGenOptLevel OptLevel;
424
425 /// Duplicator used to duplicate tails during placement.
426 ///
427 /// Placement decisions can open up new tail duplication opportunities, but
428 /// since tail duplication affects placement decisions of later blocks, it
429 /// must be done inline.
430 TailDuplicator TailDup;
431
432 /// Partial tail duplication threshold.
433 BlockFrequency DupThreshold;
434
435 unsigned TailDupSize;
436
437 /// True: use block profile count to compute tail duplication cost.
438 /// False: use block frequency to compute tail duplication cost.
439 bool UseProfileCount = false;
440
441 /// Allocator and owner of BlockChain structures.
442 ///
443 /// We build BlockChains lazily while processing the loop structure of
444 /// a function. To reduce malloc traffic, we allocate them using this
445 /// slab-like allocator, and destroy them after the pass completes. An
446 /// important guarantee is that this allocator produces stable pointers to
447 /// the chains.
448 SpecificBumpPtrAllocator<BlockChain> ChainAllocator;
449
450 /// Function wide BasicBlock to BlockChain mapping.
451 ///
452 /// This mapping allows efficiently moving from any given basic block to the
453 /// BlockChain it participates in, if any. We use it to, among other things,
454 /// allow implicitly defining edges between chains as the existing edges
455 /// between basic blocks.
456 DenseMap<const MachineBasicBlock *, BlockChain *> BlockToChain;
457
458#ifndef NDEBUG
459 /// The set of basic blocks that have terminators that cannot be fully
460 /// analyzed. These basic blocks cannot be re-ordered safely by
461 /// MachineBlockPlacement, and we must preserve physical layout of these
462 /// blocks and their successors through the pass.
463 SmallPtrSet<MachineBasicBlock *, 4> BlocksWithUnanalyzableExits;
464#endif
465
466 /// Get block profile count or frequency according to UseProfileCount.
467 /// The return value is used to model tail duplication cost.
468 BlockFrequency getBlockCountOrFrequency(const MachineBasicBlock *BB) {
469 if (UseProfileCount) {
470 auto Count = MBFI->getBlockProfileCount(BB);
471 if (Count)
472 return BlockFrequency(*Count);
473 else
474 return BlockFrequency(0);
475 } else
476 return MBFI->getBlockFreq(BB);
477 }
478
479 /// Scale the DupThreshold according to basic block size.
480 BlockFrequency scaleThreshold(MachineBasicBlock *BB);
481 void initTailDupThreshold();
482
483 /// Decrease the UnscheduledPredecessors count for all blocks in chain, and
484 /// if the count goes to 0, add them to the appropriate work list.
485 void markChainSuccessors(const BlockChain &Chain,
486 const MachineBasicBlock *LoopHeaderBB,
487 const BlockFilterSet *BlockFilter = nullptr);
488
489 /// Decrease the UnscheduledPredecessors count for a single block, and
490 /// if the count goes to 0, add them to the appropriate work list.
491 void markBlockSuccessors(const BlockChain &Chain, const MachineBasicBlock *BB,
492 const MachineBasicBlock *LoopHeaderBB,
493 const BlockFilterSet *BlockFilter = nullptr);
494
495 BranchProbability
496 collectViableSuccessors(const MachineBasicBlock *BB, const BlockChain &Chain,
497 const BlockFilterSet *BlockFilter,
498 SmallVector<MachineBasicBlock *, 4> &Successors);
499 bool isBestSuccessor(MachineBasicBlock *BB, MachineBasicBlock *Pred,
500 BlockFilterSet *BlockFilter);
501 void findDuplicateCandidates(SmallVectorImpl<MachineBasicBlock *> &Candidates,
502 MachineBasicBlock *BB,
503 BlockFilterSet *BlockFilter);
504 bool repeatedlyTailDuplicateBlock(
505 MachineBasicBlock *BB, MachineBasicBlock *&LPred,
506 const MachineBasicBlock *LoopHeaderBB, BlockChain &Chain,
507 BlockFilterSet *BlockFilter,
508 MachineFunction::iterator &PrevUnplacedBlockIt,
509 BlockFilterSet::iterator &PrevUnplacedBlockInFilterIt);
510 bool
511 maybeTailDuplicateBlock(MachineBasicBlock *BB, MachineBasicBlock *LPred,
512 BlockChain &Chain, BlockFilterSet *BlockFilter,
513 MachineFunction::iterator &PrevUnplacedBlockIt,
514 BlockFilterSet::iterator &PrevUnplacedBlockInFilterIt,
515 bool &DuplicatedToLPred);
516 bool hasBetterLayoutPredecessor(const MachineBasicBlock *BB,
517 const MachineBasicBlock *Succ,
518 const BlockChain &SuccChain,
519 BranchProbability SuccProb,
520 BranchProbability RealSuccProb,
521 const BlockChain &Chain,
522 const BlockFilterSet *BlockFilter);
523 BlockAndTailDupResult selectBestSuccessor(const MachineBasicBlock *BB,
524 const BlockChain &Chain,
525 const BlockFilterSet *BlockFilter);
526 MachineBasicBlock *
527 selectBestCandidateBlock(const BlockChain &Chain,
528 SmallVectorImpl<MachineBasicBlock *> &WorkList);
529 MachineBasicBlock *
530 getFirstUnplacedBlock(const BlockChain &PlacedChain,
531 MachineFunction::iterator &PrevUnplacedBlockIt);
532 MachineBasicBlock *
533 getFirstUnplacedBlock(const BlockChain &PlacedChain,
534 BlockFilterSet::iterator &PrevUnplacedBlockInFilterIt,
535 const BlockFilterSet *BlockFilter);
536
537 /// Add a basic block to the work list if it is appropriate.
538 ///
539 /// If the optional parameter BlockFilter is provided, only MBB
540 /// present in the set will be added to the worklist. If nullptr
541 /// is provided, no filtering occurs.
542 void fillWorkLists(const MachineBasicBlock *MBB,
543 SmallPtrSetImpl<BlockChain *> &UpdatedPreds,
544 const BlockFilterSet *BlockFilter);
545
546 void buildChain(const MachineBasicBlock *BB, BlockChain &Chain,
547 BlockFilterSet *BlockFilter = nullptr);
548 bool canMoveBottomBlockToTop(const MachineBasicBlock *BottomBlock,
549 const MachineBasicBlock *OldTop);
550 bool hasViableTopFallthrough(const MachineBasicBlock *Top,
551 const BlockFilterSet &LoopBlockSet);
552 BlockFrequency TopFallThroughFreq(const MachineBasicBlock *Top,
553 const BlockFilterSet &LoopBlockSet);
554 BlockFrequency FallThroughGains(const MachineBasicBlock *NewTop,
555 const MachineBasicBlock *OldTop,
556 const MachineBasicBlock *ExitBB,
557 const BlockFilterSet &LoopBlockSet);
558 MachineBasicBlock *findBestLoopTopHelper(MachineBasicBlock *OldTop,
559 const MachineLoop &L,
560 const BlockFilterSet &LoopBlockSet);
561 MachineBasicBlock *findBestLoopTop(const MachineLoop &L,
562 const BlockFilterSet &LoopBlockSet);
563 MachineBasicBlock *findBestLoopExit(const MachineLoop &L,
564 const BlockFilterSet &LoopBlockSet,
565 BlockFrequency &ExitFreq);
566 BlockFilterSet collectLoopBlockSet(const MachineLoop &L);
567 void buildLoopChains(const MachineLoop &L);
568 void rotateLoop(BlockChain &LoopChain, const MachineBasicBlock *ExitingBB,
569 BlockFrequency ExitFreq, const BlockFilterSet &LoopBlockSet);
570 void rotateLoopWithProfile(BlockChain &LoopChain, const MachineLoop &L,
571 const BlockFilterSet &LoopBlockSet);
572 void buildCFGChains();
573 void optimizeBranches();
574 void alignBlocks();
575 /// Returns true if a block should be tail-duplicated to increase fallthrough
576 /// opportunities.
577 bool shouldTailDuplicate(MachineBasicBlock *BB);
578 /// Check the edge frequencies to see if tail duplication will increase
579 /// fallthroughs.
580 bool isProfitableToTailDup(const MachineBasicBlock *BB,
581 const MachineBasicBlock *Succ,
582 BranchProbability QProb, const BlockChain &Chain,
583 const BlockFilterSet *BlockFilter);
584
585 /// Check for a trellis layout.
586 bool isTrellis(const MachineBasicBlock *BB,
587 const SmallVectorImpl<MachineBasicBlock *> &ViableSuccs,
588 const BlockChain &Chain, const BlockFilterSet *BlockFilter);
589
590 /// Get the best successor given a trellis layout.
591 BlockAndTailDupResult getBestTrellisSuccessor(
592 const MachineBasicBlock *BB,
593 const SmallVectorImpl<MachineBasicBlock *> &ViableSuccs,
594 BranchProbability AdjustedSumProb, const BlockChain &Chain,
595 const BlockFilterSet *BlockFilter);
596
597 /// Get the best pair of non-conflicting edges.
598 static std::pair<WeightedEdge, WeightedEdge> getBestNonConflictingEdges(
599 const MachineBasicBlock *BB,
601
602 /// Returns true if a block can tail duplicate into all unplaced
603 /// predecessors. Filters based on loop.
604 bool canTailDuplicateUnplacedPreds(const MachineBasicBlock *BB,
605 MachineBasicBlock *Succ,
606 const BlockChain &Chain,
607 const BlockFilterSet *BlockFilter);
608
609 /// Find chains of triangles to tail-duplicate where a global analysis works,
610 /// but a local analysis would not find them.
611 void precomputeTriangleChains();
612
613 /// Apply a post-processing step optimizing block placement.
614 void applyExtTsp(bool OptForSize);
615
616 /// Modify the existing block placement in the function and adjust all jumps.
617 void assignBlockOrder(const std::vector<const MachineBasicBlock *> &NewOrder);
618
619 /// Create a single CFG chain from the current block order.
620 void createCFGChainExtTsp();
621
622public:
623 MachineBlockPlacement(const MachineBranchProbabilityInfo *MBPI,
624 MachineLoopInfo *MLI, ProfileSummaryInfo *PSI,
625 std::unique_ptr<MBFIWrapper> MBFI,
626 MachinePostDominatorTree *MPDT, bool AllowTailMerge)
627 : MBPI(MBPI), MBFI(std::move(MBFI)), MLI(MLI), MPDT(MPDT), PSI(PSI),
628 AllowTailMerge(AllowTailMerge) {};
629
630 bool run(MachineFunction &F);
631
632 static bool allowTailDupPlacement(MachineFunction &MF) {
634 }
635};
636
637class MachineBlockPlacementLegacy : public MachineFunctionPass {
638public:
639 static char ID; // Pass identification, replacement for typeid
640
641 MachineBlockPlacementLegacy() : MachineFunctionPass(ID) {}
642
643 bool runOnMachineFunction(MachineFunction &MF) override {
644 if (skipFunction(MF.getFunction()))
645 return false;
646
647 auto *MBPI =
648 &getAnalysis<MachineBranchProbabilityInfoWrapperPass>().getMBPI();
649 auto MBFI = std::make_unique<MBFIWrapper>(
650 getAnalysis<MachineBlockFrequencyInfoWrapperPass>().getMBFI());
651 auto *MLI = &getAnalysis<MachineLoopInfoWrapperPass>().getLI();
652 auto *MPDT = MachineBlockPlacement::allowTailDupPlacement(MF)
653 ? &getAnalysis<MachinePostDominatorTreeWrapperPass>()
654 .getPostDomTree()
655 : nullptr;
656 auto *PSI = &getAnalysis<ProfileSummaryInfoWrapperPass>().getPSI();
657 auto *PassConfig = &getAnalysis<TargetPassConfig>();
658 bool AllowTailMerge = PassConfig->getEnableTailMerge();
659 return MachineBlockPlacement(MBPI, MLI, PSI, std::move(MBFI), MPDT,
660 AllowTailMerge)
661 .run(MF);
662 }
663
664 void getAnalysisUsage(AnalysisUsage &AU) const override {
665 AU.addRequired<MachineBranchProbabilityInfoWrapperPass>();
666 AU.addRequired<MachineBlockFrequencyInfoWrapperPass>();
668 AU.addRequired<MachinePostDominatorTreeWrapperPass>();
669 AU.addRequired<MachineLoopInfoWrapperPass>();
670 AU.addRequired<ProfileSummaryInfoWrapperPass>();
671 AU.addRequired<TargetPassConfig>();
673 }
674};
675
676} // end anonymous namespace
677
678char MachineBlockPlacementLegacy::ID = 0;
679
680char &llvm::MachineBlockPlacementID = MachineBlockPlacementLegacy::ID;
681
682INITIALIZE_PASS_BEGIN(MachineBlockPlacementLegacy, DEBUG_TYPE,
683 "Branch Probability Basic Block Placement", false, false)
689INITIALIZE_PASS_END(MachineBlockPlacementLegacy, DEBUG_TYPE,
690 "Branch Probability Basic Block Placement", false, false)
691
692#ifndef NDEBUG
693/// Helper to print the name of a MBB.
694///
695/// Only used by debug logging.
696static std::string getBlockName(const MachineBasicBlock *BB) {
697 std::string Result;
698 raw_string_ostream OS(Result);
699 OS << printMBBReference(*BB);
700 OS << " ('" << BB->getName() << "')";
701 return Result;
702}
703#endif
704
705/// Mark a chain's successors as having one fewer preds.
706///
707/// When a chain is being merged into the "placed" chain, this routine will
708/// quickly walk the successors of each block in the chain and mark them as
709/// having one fewer active predecessor. It also adds any successors of this
710/// chain which reach the zero-predecessor state to the appropriate worklist.
711void MachineBlockPlacement::markChainSuccessors(
712 const BlockChain &Chain, const MachineBasicBlock *LoopHeaderBB,
713 const BlockFilterSet *BlockFilter) {
714 // Walk all the blocks in this chain, marking their successors as having
715 // a predecessor placed.
716 for (MachineBasicBlock *MBB : Chain) {
717 markBlockSuccessors(Chain, MBB, LoopHeaderBB, BlockFilter);
718 }
719}
720
721/// Mark a single block's successors as having one fewer preds.
722///
723/// Under normal circumstances, this is only called by markChainSuccessors,
724/// but if a block that was to be placed is completely tail-duplicated away,
725/// and was duplicated into the chain end, we need to redo markBlockSuccessors
726/// for just that block.
727void MachineBlockPlacement::markBlockSuccessors(
728 const BlockChain &Chain, const MachineBasicBlock *MBB,
729 const MachineBasicBlock *LoopHeaderBB, const BlockFilterSet *BlockFilter) {
730 // Add any successors for which this is the only un-placed in-loop
731 // predecessor to the worklist as a viable candidate for CFG-neutral
732 // placement. No subsequent placement of this block will violate the CFG
733 // shape, so we get to use heuristics to choose a favorable placement.
734 for (MachineBasicBlock *Succ : MBB->successors()) {
735 if (BlockFilter && !BlockFilter->count(Succ))
736 continue;
737 BlockChain &SuccChain = *BlockToChain[Succ];
738 // Disregard edges within a fixed chain, or edges to the loop header.
739 if (&Chain == &SuccChain || Succ == LoopHeaderBB)
740 continue;
741
742 // This is a cross-chain edge that is within the loop, so decrement the
743 // loop predecessor count of the destination chain.
744 if (SuccChain.UnscheduledPredecessors == 0 ||
745 --SuccChain.UnscheduledPredecessors > 0)
746 continue;
747
748 auto *NewBB = *SuccChain.begin();
749 if (NewBB->isEHPad())
750 EHPadWorkList.push_back(NewBB);
751 else
752 BlockWorkList.push_back(NewBB);
753 }
754}
755
756/// This helper function collects the set of successors of block
757/// \p BB that are allowed to be its layout successors, and return
758/// the total branch probability of edges from \p BB to those
759/// blocks.
760BranchProbability MachineBlockPlacement::collectViableSuccessors(
761 const MachineBasicBlock *BB, const BlockChain &Chain,
762 const BlockFilterSet *BlockFilter,
763 SmallVector<MachineBasicBlock *, 4> &Successors) {
764 // Adjust edge probabilities by excluding edges pointing to blocks that is
765 // either not in BlockFilter or is already in the current chain. Consider the
766 // following CFG:
767 //
768 // --->A
769 // | / \
770 // | B C
771 // | \ / \
772 // ----D E
773 //
774 // Assume A->C is very hot (>90%), and C->D has a 50% probability, then after
775 // A->C is chosen as a fall-through, D won't be selected as a successor of C
776 // due to CFG constraint (the probability of C->D is not greater than
777 // HotProb to break topo-order). If we exclude E that is not in BlockFilter
778 // when calculating the probability of C->D, D will be selected and we
779 // will get A C D B as the layout of this loop.
780 auto AdjustedSumProb = BranchProbability::getOne();
781 for (MachineBasicBlock *Succ : BB->successors()) {
782 bool SkipSucc = false;
783 if (Succ->isEHPad() || (BlockFilter && !BlockFilter->count(Succ))) {
784 SkipSucc = true;
785 } else {
786 BlockChain *SuccChain = BlockToChain[Succ];
787 if (SuccChain == &Chain) {
788 SkipSucc = true;
789 } else if (Succ != *SuccChain->begin()) {
790 LLVM_DEBUG(dbgs() << " " << getBlockName(Succ)
791 << " -> Mid chain!\n");
792 continue;
793 }
794 }
795 if (SkipSucc)
796 AdjustedSumProb -= MBPI->getEdgeProbability(BB, Succ);
797 else
798 Successors.push_back(Succ);
799 }
800
801 return AdjustedSumProb;
802}
803
804/// The helper function returns the branch probability that is adjusted
805/// or normalized over the new total \p AdjustedSumProb.
806static BranchProbability
808 BranchProbability AdjustedSumProb) {
809 BranchProbability SuccProb;
810 uint32_t SuccProbN = OrigProb.getNumerator();
811 uint32_t SuccProbD = AdjustedSumProb.getNumerator();
812 if (SuccProbN >= SuccProbD)
813 SuccProb = BranchProbability::getOne();
814 else
815 SuccProb = BranchProbability(SuccProbN, SuccProbD);
816
817 return SuccProb;
818}
819
820/// Check if \p BB has exactly the successors in \p Successors.
821static bool
824 if (BB.succ_size() != Successors.size())
825 return false;
826 // We don't want to count self-loops
827 if (Successors.count(&BB))
828 return false;
829 for (MachineBasicBlock *Succ : BB.successors())
830 if (!Successors.count(Succ))
831 return false;
832 return true;
833}
834
835/// Check if a block should be tail duplicated to increase fallthrough
836/// opportunities.
837/// \p BB Block to check.
838bool MachineBlockPlacement::shouldTailDuplicate(MachineBasicBlock *BB) {
839 // Blocks with single successors don't create additional fallthrough
840 // opportunities. Don't duplicate them. TODO: When conditional exits are
841 // analyzable, allow them to be duplicated.
842 bool IsSimple = TailDup.isSimpleBB(BB);
843
844 if (BB->succ_size() == 1)
845 return false;
846 return TailDup.shouldTailDuplicate(IsSimple, *BB);
847}
848
849/// Compare 2 BlockFrequency's with a small penalty for \p A.
850/// In order to be conservative, we apply a X% penalty to account for
851/// increased icache pressure and static heuristics. For small frequencies
852/// we use only the numerators to improve accuracy. For simplicity, we assume
853/// the penalty is less than 100%
854/// TODO(iteratee): Use 64-bit fixed point edge frequencies everywhere.
856 BlockFrequency EntryFreq) {
857 BranchProbability ThresholdProb(TailDupPlacementPenalty, 100);
858 BlockFrequency Gain = A - B;
859 return (Gain / ThresholdProb) >= EntryFreq;
860}
861
862/// Check the edge frequencies to see if tail duplication will increase
863/// fallthroughs. It only makes sense to call this function when
864/// \p Succ would not be chosen otherwise. Tail duplication of \p Succ is
865/// always locally profitable if we would have picked \p Succ without
866/// considering duplication.
867bool MachineBlockPlacement::isProfitableToTailDup(
868 const MachineBasicBlock *BB, const MachineBasicBlock *Succ,
869 BranchProbability QProb, const BlockChain &Chain,
870 const BlockFilterSet *BlockFilter) {
871 // We need to do a probability calculation to make sure this is profitable.
872 // First: does succ have a successor that post-dominates? This affects the
873 // calculation. The 2 relevant cases are:
874 // BB BB
875 // | \Qout | \Qout
876 // P| C |P C
877 // = C' = C'
878 // | /Qin | /Qin
879 // | / | /
880 // Succ Succ
881 // / \ | \ V
882 // U/ =V |U \
883 // / \ = D
884 // D E | /
885 // | /
886 // |/
887 // PDom
888 // '=' : Branch taken for that CFG edge
889 // In the second case, Placing Succ while duplicating it into C prevents the
890 // fallthrough of Succ into either D or PDom, because they now have C as an
891 // unplaced predecessor
892
893 // Start by figuring out which case we fall into
894 MachineBasicBlock *PDom = nullptr;
895 SmallVector<MachineBasicBlock *, 4> SuccSuccs;
896 // Only scan the relevant successors
897 auto AdjustedSuccSumProb =
898 collectViableSuccessors(Succ, Chain, BlockFilter, SuccSuccs);
899 BranchProbability PProb = MBPI->getEdgeProbability(BB, Succ);
900 auto BBFreq = MBFI->getBlockFreq(BB);
901 auto SuccFreq = MBFI->getBlockFreq(Succ);
902 BlockFrequency P = BBFreq * PProb;
903 BlockFrequency Qout = BBFreq * QProb;
904 BlockFrequency EntryFreq = MBFI->getEntryFreq();
905 // If there are no more successors, it is profitable to copy, as it strictly
906 // increases fallthrough.
907 if (SuccSuccs.size() == 0)
908 return greaterWithBias(P, Qout, EntryFreq);
909
910 auto BestSuccSucc = BranchProbability::getZero();
911 // Find the PDom or the best Succ if no PDom exists.
912 for (MachineBasicBlock *SuccSucc : SuccSuccs) {
913 auto Prob = MBPI->getEdgeProbability(Succ, SuccSucc);
914 if (Prob > BestSuccSucc)
915 BestSuccSucc = Prob;
916 if (PDom == nullptr)
917 if (MPDT->dominates(SuccSucc, Succ)) {
918 PDom = SuccSucc;
919 break;
920 }
921 }
922 // For the comparisons, we need to know Succ's best incoming edge that isn't
923 // from BB.
924 auto SuccBestPred = BlockFrequency(0);
925 for (MachineBasicBlock *SuccPred : Succ->predecessors()) {
926 if (SuccPred == Succ || SuccPred == BB ||
927 BlockToChain[SuccPred] == &Chain ||
928 (BlockFilter && !BlockFilter->count(SuccPred)))
929 continue;
930 auto Freq =
931 MBFI->getBlockFreq(SuccPred) * MBPI->getEdgeProbability(SuccPred, Succ);
932 if (Freq > SuccBestPred)
933 SuccBestPred = Freq;
934 }
935 // Qin is Succ's best unplaced incoming edge that isn't BB
936 BlockFrequency Qin = SuccBestPred;
937 // If it doesn't have a post-dominating successor, here is the calculation:
938 // BB BB
939 // | \Qout | \
940 // P| C | =
941 // = C' | C
942 // | /Qin | |
943 // | / | C' (+Succ)
944 // Succ Succ /|
945 // / \ | \/ |
946 // U/ =V | == |
947 // / \ | / \|
948 // D E D E
949 // '=' : Branch taken for that CFG edge
950 // Cost in the first case is: P + V
951 // For this calculation, we always assume P > Qout. If Qout > P
952 // The result of this function will be ignored at the caller.
953 // Let F = SuccFreq - Qin
954 // Cost in the second case is: Qout + min(Qin, F) * U + max(Qin, F) * V
955
956 if (PDom == nullptr || !Succ->isSuccessor(PDom)) {
957 BranchProbability UProb = BestSuccSucc;
958 BranchProbability VProb = AdjustedSuccSumProb - UProb;
959 BlockFrequency F = SuccFreq - Qin;
960 BlockFrequency V = SuccFreq * VProb;
961 BlockFrequency QinU = std::min(Qin, F) * UProb;
962 BlockFrequency BaseCost = P + V;
963 BlockFrequency DupCost = Qout + QinU + std::max(Qin, F) * VProb;
964 return greaterWithBias(BaseCost, DupCost, EntryFreq);
965 }
966 BranchProbability UProb = MBPI->getEdgeProbability(Succ, PDom);
967 BranchProbability VProb = AdjustedSuccSumProb - UProb;
968 BlockFrequency U = SuccFreq * UProb;
969 BlockFrequency V = SuccFreq * VProb;
970 BlockFrequency F = SuccFreq - Qin;
971 // If there is a post-dominating successor, here is the calculation:
972 // BB BB BB BB
973 // | \Qout | \ | \Qout | \
974 // |P C | = |P C | =
975 // = C' |P C = C' |P C
976 // | /Qin | | | /Qin | |
977 // | / | C' (+Succ) | / | C' (+Succ)
978 // Succ Succ /| Succ Succ /|
979 // | \ V | \/ | | \ V | \/ |
980 // |U \ |U /\ =? |U = |U /\ |
981 // = D = = =?| | D | = =|
982 // | / |/ D | / |/ D
983 // | / | / | = | /
984 // |/ | / |/ | =
985 // Dom Dom Dom Dom
986 // '=' : Branch taken for that CFG edge
987 // The cost for taken branches in the first case is P + U
988 // Let F = SuccFreq - Qin
989 // The cost in the second case (assuming independence), given the layout:
990 // BB, Succ, (C+Succ), D, Dom or the layout:
991 // BB, Succ, D, Dom, (C+Succ)
992 // is Qout + max(F, Qin) * U + min(F, Qin)
993 // compare P + U vs Qout + P * U + Qin.
994 //
995 // The 3rd and 4th cases cover when Dom would be chosen to follow Succ.
996 //
997 // For the 3rd case, the cost is P + 2 * V
998 // For the 4th case, the cost is Qout + min(Qin, F) * U + max(Qin, F) * V + V
999 // We choose 4 over 3 when (P + V) > Qout + min(Qin, F) * U + max(Qin, F) * V
1000 if (UProb > AdjustedSuccSumProb / 2 &&
1001 !hasBetterLayoutPredecessor(Succ, PDom, *BlockToChain[PDom], UProb, UProb,
1002 Chain, BlockFilter))
1003 // Cases 3 & 4
1004 return greaterWithBias(
1005 (P + V), (Qout + std::max(Qin, F) * VProb + std::min(Qin, F) * UProb),
1006 EntryFreq);
1007 // Cases 1 & 2
1008 return greaterWithBias((P + U),
1009 (Qout + std::min(Qin, F) * AdjustedSuccSumProb +
1010 std::max(Qin, F) * UProb),
1011 EntryFreq);
1012}
1013
1014/// Check for a trellis layout. \p BB is the upper part of a trellis if its
1015/// successors form the lower part of a trellis. A successor set S forms the
1016/// lower part of a trellis if all of the predecessors of S are either in S or
1017/// have all of S as successors. We ignore trellises where BB doesn't have 2
1018/// successors because for fewer than 2, it's trivial, and for 3 or greater they
1019/// are very uncommon and complex to compute optimally. Allowing edges within S
1020/// is not strictly a trellis, but the same algorithm works, so we allow it.
1021bool MachineBlockPlacement::isTrellis(
1022 const MachineBasicBlock *BB,
1023 const SmallVectorImpl<MachineBasicBlock *> &ViableSuccs,
1024 const BlockChain &Chain, const BlockFilterSet *BlockFilter) {
1025 // Technically BB could form a trellis with branching factor higher than 2.
1026 // But that's extremely uncommon.
1027 if (BB->succ_size() != 2 || ViableSuccs.size() != 2)
1028 return false;
1029
1030 SmallPtrSet<const MachineBasicBlock *, 2> Successors(llvm::from_range,
1031 BB->successors());
1032 // To avoid reviewing the same predecessors twice.
1033 SmallPtrSet<const MachineBasicBlock *, 8> SeenPreds;
1034
1035 for (MachineBasicBlock *Succ : ViableSuccs) {
1036 // Compile-time optimization: runtime is quadratic in the number of
1037 // predecessors. For such uncommon cases, exit early.
1038 if (Succ->pred_size() > PredecessorLimit)
1039 return false;
1040
1041 int PredCount = 0;
1042 for (auto *SuccPred : Succ->predecessors()) {
1043 // Allow triangle successors, but don't count them.
1044 if (Successors.count(SuccPred)) {
1045 // Make sure that it is actually a triangle.
1046 for (MachineBasicBlock *CheckSucc : SuccPred->successors())
1047 if (!Successors.count(CheckSucc))
1048 return false;
1049 continue;
1050 }
1051 const BlockChain *PredChain = BlockToChain[SuccPred];
1052 if (SuccPred == BB || (BlockFilter && !BlockFilter->count(SuccPred)) ||
1053 PredChain == &Chain || PredChain == BlockToChain[Succ])
1054 continue;
1055 ++PredCount;
1056 // Perform the successor check only once.
1057 if (!SeenPreds.insert(SuccPred).second)
1058 continue;
1059 if (!hasSameSuccessors(*SuccPred, Successors))
1060 return false;
1061 }
1062 // If one of the successors has only BB as a predecessor, it is not a
1063 // trellis.
1064 if (PredCount < 1)
1065 return false;
1066 }
1067 return true;
1068}
1069
1070/// Pick the highest total weight pair of edges that can both be laid out.
1071/// The edges in \p Edges[0] are assumed to have a different destination than
1072/// the edges in \p Edges[1]. Simple counting shows that the best pair is either
1073/// the individual highest weight edges to the 2 different destinations, or in
1074/// case of a conflict, one of them should be replaced with a 2nd best edge.
1075std::pair<MachineBlockPlacement::WeightedEdge,
1076 MachineBlockPlacement::WeightedEdge>
1077MachineBlockPlacement::getBestNonConflictingEdges(
1078 const MachineBasicBlock *BB,
1080 Edges) {
1081 // Sort the edges, and then for each successor, find the best incoming
1082 // predecessor. If the best incoming predecessors aren't the same,
1083 // then that is clearly the best layout. If there is a conflict, one of the
1084 // successors will have to fallthrough from the second best predecessor. We
1085 // compare which combination is better overall.
1086
1087 // Sort for highest frequency.
1088 auto Cmp = [](WeightedEdge A, WeightedEdge B) { return A.Weight > B.Weight; };
1089
1090 llvm::stable_sort(Edges[0], Cmp);
1091 llvm::stable_sort(Edges[1], Cmp);
1092 auto BestA = Edges[0].begin();
1093 auto BestB = Edges[1].begin();
1094 // Arrange for the correct answer to be in BestA and BestB
1095 // If the 2 best edges don't conflict, the answer is already there.
1096 if (BestA->Src == BestB->Src) {
1097 // Compare the total fallthrough of (Best + Second Best) for both pairs
1098 auto SecondBestA = std::next(BestA);
1099 auto SecondBestB = std::next(BestB);
1100 BlockFrequency BestAScore = BestA->Weight + SecondBestB->Weight;
1101 BlockFrequency BestBScore = BestB->Weight + SecondBestA->Weight;
1102 if (BestAScore < BestBScore)
1103 BestA = SecondBestA;
1104 else
1105 BestB = SecondBestB;
1106 }
1107 // Arrange for the BB edge to be in BestA if it exists.
1108 if (BestB->Src == BB)
1109 std::swap(BestA, BestB);
1110 return std::make_pair(*BestA, *BestB);
1111}
1112
1113/// Get the best successor from \p BB based on \p BB being part of a trellis.
1114/// We only handle trellises with 2 successors, so the algorithm is
1115/// straightforward: Find the best pair of edges that don't conflict. We find
1116/// the best incoming edge for each successor in the trellis. If those conflict,
1117/// we consider which of them should be replaced with the second best.
1118/// Upon return the two best edges will be in \p BestEdges. If one of the edges
1119/// comes from \p BB, it will be in \p BestEdges[0]
1120MachineBlockPlacement::BlockAndTailDupResult
1121MachineBlockPlacement::getBestTrellisSuccessor(
1122 const MachineBasicBlock *BB,
1123 const SmallVectorImpl<MachineBasicBlock *> &ViableSuccs,
1124 BranchProbability AdjustedSumProb, const BlockChain &Chain,
1125 const BlockFilterSet *BlockFilter) {
1126
1127 BlockAndTailDupResult Result = {nullptr, false};
1128 SmallPtrSet<const MachineBasicBlock *, 4> Successors(llvm::from_range,
1129 BB->successors());
1130
1131 // We assume size 2 because it's common. For general n, we would have to do
1132 // the Hungarian algorithm, but it's not worth the complexity because more
1133 // than 2 successors is fairly uncommon, and a trellis even more so.
1134 if (Successors.size() != 2 || ViableSuccs.size() != 2)
1135 return Result;
1136
1137 // Collect the edge frequencies of all edges that form the trellis.
1139 int SuccIndex = 0;
1140 for (auto *Succ : ViableSuccs) {
1141 for (MachineBasicBlock *SuccPred : Succ->predecessors()) {
1142 // Skip any placed predecessors that are not BB
1143 if (SuccPred != BB) {
1144 if (BlockFilter && !BlockFilter->count(SuccPred))
1145 continue;
1146 const BlockChain *SuccPredChain = BlockToChain[SuccPred];
1147 if (SuccPredChain == &Chain || SuccPredChain == BlockToChain[Succ])
1148 continue;
1149 }
1150 BlockFrequency EdgeFreq = MBFI->getBlockFreq(SuccPred) *
1151 MBPI->getEdgeProbability(SuccPred, Succ);
1152 Edges[SuccIndex].push_back({EdgeFreq, SuccPred, Succ});
1153 }
1154 ++SuccIndex;
1155 }
1156
1157 // Pick the best combination of 2 edges from all the edges in the trellis.
1158 WeightedEdge BestA, BestB;
1159 std::tie(BestA, BestB) = getBestNonConflictingEdges(BB, Edges);
1160
1161 if (BestA.Src != BB) {
1162 // If we have a trellis, and BB doesn't have the best fallthrough edges,
1163 // we shouldn't choose any successor. We've already looked and there's a
1164 // better fallthrough edge for all the successors.
1165 LLVM_DEBUG(dbgs() << "Trellis, but not one of the chosen edges.\n");
1166 return Result;
1167 }
1168
1169 // Did we pick the triangle edge? If tail-duplication is profitable, do
1170 // that instead. Otherwise merge the triangle edge now while we know it is
1171 // optimal.
1172 if (BestA.Dest == BestB.Src) {
1173 // The edges are BB->Succ1->Succ2, and we're looking to see if BB->Succ2
1174 // would be better.
1175 MachineBasicBlock *Succ1 = BestA.Dest;
1176 MachineBasicBlock *Succ2 = BestB.Dest;
1177 // Check to see if tail-duplication would be profitable.
1178 if (allowTailDupPlacement(*F) && shouldTailDuplicate(Succ2) &&
1179 canTailDuplicateUnplacedPreds(BB, Succ2, Chain, BlockFilter) &&
1180 isProfitableToTailDup(BB, Succ2, MBPI->getEdgeProbability(BB, Succ1),
1181 Chain, BlockFilter)) {
1182 LLVM_DEBUG(BranchProbability Succ2Prob = getAdjustedProbability(
1183 MBPI->getEdgeProbability(BB, Succ2), AdjustedSumProb);
1184 dbgs() << " Selected: " << getBlockName(Succ2)
1185 << ", probability: " << Succ2Prob
1186 << " (Tail Duplicate)\n");
1187 Result.BB = Succ2;
1188 Result.ShouldTailDup = true;
1189 return Result;
1190 }
1191 }
1192 // We have already computed the optimal edge for the other side of the
1193 // trellis.
1194 ComputedEdges[BestB.Src] = {BestB.Dest, false};
1195
1196 auto TrellisSucc = BestA.Dest;
1197 LLVM_DEBUG(BranchProbability SuccProb = getAdjustedProbability(
1198 MBPI->getEdgeProbability(BB, TrellisSucc), AdjustedSumProb);
1199 dbgs() << " Selected: " << getBlockName(TrellisSucc)
1200 << ", probability: " << SuccProb << " (Trellis)\n");
1201 Result.BB = TrellisSucc;
1202 return Result;
1203}
1204
1205/// When the option allowTailDupPlacement() is on, this method checks if the
1206/// fallthrough candidate block \p Succ (of block \p BB) can be tail-duplicated
1207/// into all of its unplaced, unfiltered predecessors, that are not BB.
1208bool MachineBlockPlacement::canTailDuplicateUnplacedPreds(
1209 const MachineBasicBlock *BB, MachineBasicBlock *Succ,
1210 const BlockChain &Chain, const BlockFilterSet *BlockFilter) {
1211 if (!shouldTailDuplicate(Succ))
1212 return false;
1213
1214 // The result of canTailDuplicate.
1215 bool Duplicate = true;
1216 // Number of possible duplication.
1217 unsigned int NumDup = 0;
1218
1219 // For CFG checking.
1220 SmallPtrSet<const MachineBasicBlock *, 4> Successors(llvm::from_range,
1221 BB->successors());
1222 for (MachineBasicBlock *Pred : Succ->predecessors()) {
1223 // Make sure all unplaced and unfiltered predecessors can be
1224 // tail-duplicated into.
1225 // Skip any blocks that are already placed or not in this loop.
1226 if (Pred == BB || (BlockFilter && !BlockFilter->count(Pred)) ||
1227 (BlockToChain[Pred] == &Chain && !Succ->succ_empty()))
1228 continue;
1229 if (!TailDup.canTailDuplicate(Succ, Pred)) {
1230 if (Successors.size() > 1 && hasSameSuccessors(*Pred, Successors))
1231 // This will result in a trellis after tail duplication, so we don't
1232 // need to copy Succ into this predecessor. In the presence
1233 // of a trellis tail duplication can continue to be profitable.
1234 // For example:
1235 // A A
1236 // |\ |\
1237 // | \ | \
1238 // | C | C+BB
1239 // | / | |
1240 // |/ | |
1241 // BB => BB |
1242 // |\ |\/|
1243 // | \ |/\|
1244 // | D | D
1245 // | / | /
1246 // |/ |/
1247 // Succ Succ
1248 //
1249 // After BB was duplicated into C, the layout looks like the one on the
1250 // right. BB and C now have the same successors. When considering
1251 // whether Succ can be duplicated into all its unplaced predecessors, we
1252 // ignore C.
1253 // We can do this because C already has a profitable fallthrough, namely
1254 // D. TODO(iteratee): ignore sufficiently cold predecessors for
1255 // duplication and for this test.
1256 //
1257 // This allows trellises to be laid out in 2 separate chains
1258 // (A,B,Succ,...) and later (C,D,...) This is a reasonable heuristic
1259 // because it allows the creation of 2 fallthrough paths with links
1260 // between them, and we correctly identify the best layout for these
1261 // CFGs. We want to extend trellises that the user created in addition
1262 // to trellises created by tail-duplication, so we just look for the
1263 // CFG.
1264 continue;
1265 Duplicate = false;
1266 continue;
1267 }
1268 NumDup++;
1269 }
1270
1271 // No possible duplication in current filter set.
1272 if (NumDup == 0)
1273 return false;
1274
1275 // If profile information is available, findDuplicateCandidates can do more
1276 // precise benefit analysis.
1277 if (F->getFunction().hasProfileData())
1278 return true;
1279
1280 // This is mainly for function exit BB.
1281 // The integrated tail duplication is really designed for increasing
1282 // fallthrough from predecessors from Succ to its successors. We may need
1283 // other machanism to handle different cases.
1284 if (Succ->succ_empty())
1285 return true;
1286
1287 // Plus the already placed predecessor.
1288 NumDup++;
1289
1290 // If the duplication candidate has more unplaced predecessors than
1291 // successors, the extra duplication can't bring more fallthrough.
1292 //
1293 // Pred1 Pred2 Pred3
1294 // \ | /
1295 // \ | /
1296 // \ | /
1297 // Dup
1298 // / \
1299 // / \
1300 // Succ1 Succ2
1301 //
1302 // In this example Dup has 2 successors and 3 predecessors, duplication of Dup
1303 // can increase the fallthrough from Pred1 to Succ1 and from Pred2 to Succ2,
1304 // but the duplication into Pred3 can't increase fallthrough.
1305 //
1306 // A small number of extra duplication may not hurt too much. We need a better
1307 // heuristic to handle it.
1308 if ((NumDup > Succ->succ_size()) || !Duplicate)
1309 return false;
1310
1311 return true;
1312}
1313
1314/// Find chains of triangles where we believe it would be profitable to
1315/// tail-duplicate them all, but a local analysis would not find them.
1316/// There are 3 ways this can be profitable:
1317/// 1) The post-dominators marked 50% are actually taken 55% (This shrinks with
1318/// longer chains)
1319/// 2) The chains are statically correlated. Branch probabilities have a very
1320/// U-shaped distribution.
1321/// [http://nrs.harvard.edu/urn-3:HUL.InstRepos:24015805]
1322/// If the branches in a chain are likely to be from the same side of the
1323/// distribution as their predecessor, but are independent at runtime, this
1324/// transformation is profitable. (Because the cost of being wrong is a small
1325/// fixed cost, unlike the standard triangle layout where the cost of being
1326/// wrong scales with the # of triangles.)
1327/// 3) The chains are dynamically correlated. If the probability that a previous
1328/// branch was taken positively influences whether the next branch will be
1329/// taken
1330/// We believe that 2 and 3 are common enough to justify the small margin in 1.
1331void MachineBlockPlacement::precomputeTriangleChains() {
1332 struct TriangleChain {
1333 std::vector<MachineBasicBlock *> Edges;
1334
1335 TriangleChain(MachineBasicBlock *src, MachineBasicBlock *dst)
1336 : Edges({src, dst}) {}
1337
1338 void append(MachineBasicBlock *dst) {
1339 assert(getKey()->isSuccessor(dst) &&
1340 "Attempting to append a block that is not a successor.");
1341 Edges.push_back(dst);
1342 }
1343
1344 unsigned count() const { return Edges.size() - 1; }
1345
1346 MachineBasicBlock *getKey() const { return Edges.back(); }
1347 };
1348
1349 if (TriangleChainCount == 0)
1350 return;
1351
1352 LLVM_DEBUG(dbgs() << "Pre-computing triangle chains.\n");
1353 // Map from last block to the chain that contains it. This allows us to extend
1354 // chains as we find new triangles.
1355 DenseMap<const MachineBasicBlock *, TriangleChain> TriangleChainMap;
1356 for (MachineBasicBlock &BB : *F) {
1357 // If BB doesn't have 2 successors, it doesn't start a triangle.
1358 if (BB.succ_size() != 2)
1359 continue;
1360 MachineBasicBlock *PDom = nullptr;
1361 for (MachineBasicBlock *Succ : BB.successors()) {
1362 if (!MPDT->dominates(Succ, &BB))
1363 continue;
1364 PDom = Succ;
1365 break;
1366 }
1367 // If BB doesn't have a post-dominating successor, it doesn't form a
1368 // triangle.
1369 if (PDom == nullptr)
1370 continue;
1371 // If PDom has a hint that it is low probability, skip this triangle.
1372 if (MBPI->getEdgeProbability(&BB, PDom) < BranchProbability(50, 100))
1373 continue;
1374 // If PDom isn't eligible for duplication, this isn't the kind of triangle
1375 // we're looking for.
1376 if (!shouldTailDuplicate(PDom))
1377 continue;
1378 bool CanTailDuplicate = true;
1379 // If PDom can't tail-duplicate into it's non-BB predecessors, then this
1380 // isn't the kind of triangle we're looking for.
1381 for (MachineBasicBlock *Pred : PDom->predecessors()) {
1382 if (Pred == &BB)
1383 continue;
1384 if (!TailDup.canTailDuplicate(PDom, Pred)) {
1385 CanTailDuplicate = false;
1386 break;
1387 }
1388 }
1389 // If we can't tail-duplicate PDom to its predecessors, then skip this
1390 // triangle.
1391 if (!CanTailDuplicate)
1392 continue;
1393
1394 // Now we have an interesting triangle. Insert it if it's not part of an
1395 // existing chain.
1396 // Note: This cannot be replaced with a call insert() or emplace() because
1397 // the find key is BB, but the insert/emplace key is PDom.
1398 auto Found = TriangleChainMap.find(&BB);
1399 // If it is, remove the chain from the map, grow it, and put it back in the
1400 // map with the end as the new key.
1401 if (Found != TriangleChainMap.end()) {
1402 TriangleChain Chain = std::move(Found->second);
1403 TriangleChainMap.erase(Found);
1404 Chain.append(PDom);
1405 TriangleChainMap.insert(std::make_pair(Chain.getKey(), std::move(Chain)));
1406 } else {
1407 auto InsertResult = TriangleChainMap.try_emplace(PDom, &BB, PDom);
1408 assert(InsertResult.second && "Block seen twice.");
1409 (void)InsertResult;
1410 }
1411 }
1412
1413 // Iterating over a DenseMap is safe here, because the only thing in the body
1414 // of the loop is inserting into another DenseMap (ComputedEdges).
1415 // ComputedEdges is never iterated, so this doesn't lead to non-determinism.
1416 for (auto &ChainPair : TriangleChainMap) {
1417 TriangleChain &Chain = ChainPair.second;
1418 // Benchmarking has shown that due to branch correlation duplicating 2 or
1419 // more triangles is profitable, despite the calculations assuming
1420 // independence.
1421 if (Chain.count() < TriangleChainCount)
1422 continue;
1423 MachineBasicBlock *dst = Chain.Edges.back();
1424 Chain.Edges.pop_back();
1425 for (MachineBasicBlock *src : reverse(Chain.Edges)) {
1426 LLVM_DEBUG(dbgs() << "Marking edge: " << getBlockName(src) << "->"
1427 << getBlockName(dst)
1428 << " as pre-computed based on triangles.\n");
1429
1430 auto InsertResult = ComputedEdges.insert({src, {dst, true}});
1431 assert(InsertResult.second && "Block seen twice.");
1432 (void)InsertResult;
1433
1434 dst = src;
1435 }
1436 }
1437}
1438
1439// When profile is not present, return the StaticLikelyProb.
1440// When profile is available, we need to handle the triangle-shape CFG.
1441static BranchProbability
1443 if (!BB->getParent()->getFunction().hasProfileData())
1445 if (BB->succ_size() == 2) {
1446 const MachineBasicBlock *Succ1 = *BB->succ_begin();
1447 const MachineBasicBlock *Succ2 = *(BB->succ_begin() + 1);
1448 if (Succ1->isSuccessor(Succ2) || Succ2->isSuccessor(Succ1)) {
1449 /* See case 1 below for the cost analysis. For BB->Succ to
1450 * be taken with smaller cost, the following needs to hold:
1451 * Prob(BB->Succ) > 2 * Prob(BB->Pred)
1452 * So the threshold T in the calculation below
1453 * (1-T) * Prob(BB->Succ) > T * Prob(BB->Pred)
1454 * So T / (1 - T) = 2, Yielding T = 2/3
1455 *
1456 * Then remap the user-controlled ProfileLikelyProb into
1457 * a triangle-specific threshold T.
1458 * T = (2/3) * (ProfileLikelyProb / 50)
1459 * = (2 * ProfileLikelyProb) / 150
1460 * This preserves T = 2/3 at ProfileLikelyProb = 50.
1461 * The result is capped at 1.
1462 */
1463 return BranchProbability(ProfileLikelyProb, 150) * 2;
1464 }
1465 }
1467}
1468
1469/// Checks to see if the layout candidate block \p Succ has a better layout
1470/// predecessor than \c BB. If yes, returns true.
1471/// \p SuccProb: The probability adjusted for only remaining blocks.
1472/// Only used for logging
1473/// \p RealSuccProb: The un-adjusted probability.
1474/// \p Chain: The chain that BB belongs to and Succ is being considered for.
1475/// \p BlockFilter: if non-null, the set of blocks that make up the loop being
1476/// considered
1477bool MachineBlockPlacement::hasBetterLayoutPredecessor(
1478 const MachineBasicBlock *BB, const MachineBasicBlock *Succ,
1479 const BlockChain &SuccChain, BranchProbability SuccProb,
1480 BranchProbability RealSuccProb, const BlockChain &Chain,
1481 const BlockFilterSet *BlockFilter) {
1482
1483 // There isn't a better layout when there are no unscheduled predecessors.
1484 if (SuccChain.UnscheduledPredecessors == 0)
1485 return false;
1486
1487 // Compile-time optimization: runtime is quadratic in the number of
1488 // predecessors. For such uncommon cases, exit early.
1489 if (Succ->pred_size() > PredecessorLimit)
1490 return false;
1491
1492 // There are two basic scenarios here:
1493 // -------------------------------------
1494 // Case 1: triangular shape CFG (if-then):
1495 // BB
1496 // | \
1497 // | \
1498 // | Pred
1499 // | /
1500 // Succ
1501 // In this case, we are evaluating whether to select edge -> Succ, e.g.
1502 // set Succ as the layout successor of BB. Picking Succ as BB's
1503 // successor breaks the CFG constraints (FIXME: define these constraints).
1504 // With this layout, Pred BB
1505 // is forced to be outlined, so the overall cost will be cost of the
1506 // branch taken from BB to Pred, plus the cost of back taken branch
1507 // from Pred to Succ, as well as the additional cost associated
1508 // with the needed unconditional jump instruction from Pred To Succ.
1509
1510 // The cost of the topological order layout is the taken branch cost
1511 // from BB to Succ, so to make BB->Succ a viable candidate, the following
1512 // must hold:
1513 // 2 * freq(BB->Pred) * taken_branch_cost + unconditional_jump_cost
1514 // < freq(BB->Succ) * taken_branch_cost.
1515 // Ignoring unconditional jump cost, we get
1516 // freq(BB->Succ) > 2 * freq(BB->Pred), i.e.,
1517 // prob(BB->Succ) > 2 * prob(BB->Pred)
1518 //
1519 // When real profile data is available, we can precisely compute the
1520 // probability threshold that is needed for edge BB->Succ to be considered.
1521 // Without profile data, the heuristic requires the branch bias to be
1522 // a lot larger to make sure the signal is very strong (e.g. 80% default).
1523 // -----------------------------------------------------------------
1524 // Case 2: diamond like CFG (if-then-else):
1525 // S
1526 // / \
1527 // | \
1528 // BB Pred
1529 // \ /
1530 // Succ
1531 // ..
1532 //
1533 // The current block is BB and edge BB->Succ is now being evaluated.
1534 // Note that edge S->BB was previously already selected because
1535 // prob(S->BB) > prob(S->Pred).
1536 // At this point, 2 blocks can be placed after BB: Pred or Succ. If we
1537 // choose Pred, we will have a topological ordering as shown on the left
1538 // in the picture below. If we choose Succ, we have the solution as shown
1539 // on the right:
1540 //
1541 // topo-order:
1542 //
1543 // S----- ---S
1544 // | | | |
1545 // ---BB | | BB
1546 // | | | |
1547 // | Pred-- | Succ--
1548 // | | | |
1549 // ---Succ ---Pred--
1550 //
1551 // cost = freq(S->Pred) + freq(BB->Succ) cost = 2 * freq (S->Pred)
1552 // = freq(S->Pred) + freq(S->BB)
1553 //
1554 // If we have profile data (i.e, branch probabilities can be trusted), the
1555 // cost (number of taken branches) with layout S->BB->Succ->Pred is 2 *
1556 // freq(S->Pred) while the cost of topo order is freq(S->Pred) + freq(S->BB).
1557 // We know Prob(S->BB) > Prob(S->Pred), so freq(S->BB) > freq(S->Pred), which
1558 // means the cost of topological order is greater.
1559 // When profile data is not available, however, we need to be more
1560 // conservative. If the branch prediction is wrong, breaking the topo-order
1561 // will actually yield a layout with large cost. For this reason, we need
1562 // strong biased branch at block S with Prob(S->BB) in order to select
1563 // BB->Succ. This is equivalent to looking the CFG backward with backward
1564 // edge: Prob(Succ->BB) needs to >= HotProb in order to be selected (without
1565 // profile data).
1566 // --------------------------------------------------------------------------
1567 // Case 3: forked diamond
1568 // S
1569 // / \
1570 // / \
1571 // BB Pred
1572 // | \ / |
1573 // | \ / |
1574 // | X |
1575 // | / \ |
1576 // | / \ |
1577 // S1 S2
1578 //
1579 // The current block is BB and edge BB->S1 is now being evaluated.
1580 // As above S->BB was already selected because
1581 // prob(S->BB) > prob(S->Pred). Assume that prob(BB->S1) >= prob(BB->S2).
1582 //
1583 // topo-order:
1584 //
1585 // S-------| ---S
1586 // | | | |
1587 // ---BB | | BB
1588 // | | | |
1589 // | Pred----| | S1----
1590 // | | | |
1591 // --(S1 or S2) ---Pred--
1592 // |
1593 // S2
1594 //
1595 // topo-cost = freq(S->Pred) + freq(BB->S1) + freq(BB->S2)
1596 // + min(freq(Pred->S1), freq(Pred->S2))
1597 // Non-topo-order cost:
1598 // non-topo-cost = 2 * freq(S->Pred) + freq(BB->S2).
1599 // To be conservative, we can assume that min(freq(Pred->S1), freq(Pred->S2))
1600 // is 0. Then the non topo layout is better when
1601 // freq(S->Pred) < freq(BB->S1).
1602 // This is exactly what is checked below.
1603 // Note there are other shapes that apply (Pred may not be a single block,
1604 // but they all fit this general pattern.)
1605 BranchProbability HotProb = getLayoutSuccessorProbThreshold(BB);
1606
1607 // Make sure that a hot successor doesn't have a globally more
1608 // important predecessor.
1609 BlockFrequency CandidateEdgeFreq = MBFI->getBlockFreq(BB) * RealSuccProb;
1610 bool BadCFGConflict = false;
1611
1612 for (MachineBasicBlock *Pred : Succ->predecessors()) {
1613 BlockChain *PredChain = BlockToChain[Pred];
1614 if (Pred == Succ || PredChain == &SuccChain ||
1615 (BlockFilter && !BlockFilter->count(Pred)) || PredChain == &Chain ||
1616 Pred != *std::prev(PredChain->end()) ||
1617 // This check is redundant except for look ahead. This function is
1618 // called for lookahead by isProfitableToTailDup when BB hasn't been
1619 // placed yet.
1620 (Pred == BB))
1621 continue;
1622 // Do backward checking.
1623 // For all cases above, we need a backward checking to filter out edges that
1624 // are not 'strongly' biased.
1625 // BB Pred
1626 // \ /
1627 // Succ
1628 // We select edge BB->Succ if
1629 // freq(BB->Succ) > freq(Succ) * HotProb
1630 // i.e. freq(BB->Succ) > freq(BB->Succ) * HotProb + freq(Pred->Succ) *
1631 // HotProb
1632 // i.e. freq((BB->Succ) * (1 - HotProb) > freq(Pred->Succ) * HotProb
1633 // Case 1 is covered too, because the first equation reduces to:
1634 // prob(BB->Succ) > HotProb. (freq(Succ) = freq(BB) for a triangle)
1635 BlockFrequency PredEdgeFreq =
1636 MBFI->getBlockFreq(Pred) * MBPI->getEdgeProbability(Pred, Succ);
1637 if (PredEdgeFreq * HotProb >= CandidateEdgeFreq * HotProb.getCompl()) {
1638 BadCFGConflict = true;
1639 break;
1640 }
1641 }
1642
1643 if (BadCFGConflict) {
1644 LLVM_DEBUG(dbgs() << " Not a candidate: " << getBlockName(Succ) << " -> "
1645 << SuccProb << " (prob) (non-cold CFG conflict)\n");
1646 return true;
1647 }
1648
1649 return false;
1650}
1651
1652/// Select the best successor for a block.
1653///
1654/// This looks across all successors of a particular block and attempts to
1655/// select the "best" one to be the layout successor. It only considers direct
1656/// successors which also pass the block filter. It will attempt to avoid
1657/// breaking CFG structure, but cave and break such structures in the case of
1658/// very hot successor edges.
1659///
1660/// \returns The best successor block found, or null if none are viable, along
1661/// with a boolean indicating if tail duplication is necessary.
1662MachineBlockPlacement::BlockAndTailDupResult
1663MachineBlockPlacement::selectBestSuccessor(const MachineBasicBlock *BB,
1664 const BlockChain &Chain,
1665 const BlockFilterSet *BlockFilter) {
1666 const BranchProbability HotProb(StaticLikelyProb, 100);
1667
1668 BlockAndTailDupResult BestSucc = {nullptr, false};
1669 auto BestProb = BranchProbability::getZero();
1670
1671 SmallVector<MachineBasicBlock *, 4> Successors;
1672 auto AdjustedSumProb =
1673 collectViableSuccessors(BB, Chain, BlockFilter, Successors);
1674
1675 LLVM_DEBUG(dbgs() << "Selecting best successor for: " << getBlockName(BB)
1676 << "\n");
1677
1678 // if we already precomputed the best successor for BB, return that if still
1679 // applicable.
1680 auto FoundEdge = ComputedEdges.find(BB);
1681 if (FoundEdge != ComputedEdges.end()) {
1682 BlockAndTailDupResult Result = FoundEdge->second;
1683 ComputedEdges.erase(FoundEdge);
1684 BlockChain *SuccChain = BlockToChain[Result.BB];
1685 if (BB->isSuccessor(Result.BB) &&
1686 (!BlockFilter || BlockFilter->count(Result.BB)) &&
1687 SuccChain != &Chain && Result.BB == *SuccChain->begin())
1688 return Result;
1689 }
1690
1691 // if BB is part of a trellis, Use the trellis to determine the optimal
1692 // fallthrough edges
1693 if (isTrellis(BB, Successors, Chain, BlockFilter))
1694 return getBestTrellisSuccessor(BB, Successors, AdjustedSumProb, Chain,
1695 BlockFilter);
1696
1697 // For blocks with CFG violations, we may be able to lay them out anyway with
1698 // tail-duplication. We keep this vector so we can perform the probability
1699 // calculations the minimum number of times.
1701 DupCandidates;
1702 for (MachineBasicBlock *Succ : Successors) {
1703 auto RealSuccProb = MBPI->getEdgeProbability(BB, Succ);
1704 BranchProbability SuccProb =
1705 getAdjustedProbability(RealSuccProb, AdjustedSumProb);
1706
1707 BlockChain &SuccChain = *BlockToChain[Succ];
1708 // Skip the edge \c BB->Succ if block \c Succ has a better layout
1709 // predecessor that yields lower global cost.
1710 if (hasBetterLayoutPredecessor(BB, Succ, SuccChain, SuccProb, RealSuccProb,
1711 Chain, BlockFilter)) {
1712 // If tail duplication would make Succ profitable, place it.
1713 if (allowTailDupPlacement(*F) && shouldTailDuplicate(Succ))
1714 DupCandidates.emplace_back(SuccProb, Succ);
1715 continue;
1716 }
1717
1718 LLVM_DEBUG(
1719 dbgs() << " Candidate: " << getBlockName(Succ)
1720 << ", probability: " << SuccProb
1721 << (SuccChain.UnscheduledPredecessors != 0 ? " (CFG break)" : "")
1722 << "\n");
1723
1724 if (BestSucc.BB && BestProb >= SuccProb) {
1725 LLVM_DEBUG(dbgs() << " Not the best candidate, continuing\n");
1726 continue;
1727 }
1728
1729 LLVM_DEBUG(dbgs() << " Setting it as best candidate\n");
1730 BestSucc.BB = Succ;
1731 BestProb = SuccProb;
1732 }
1733 // Handle the tail duplication candidates in order of decreasing probability.
1734 // Stop at the first one that is profitable. Also stop if they are less
1735 // profitable than BestSucc. Position is important because we preserve it and
1736 // prefer first best match. Here we aren't comparing in order, so we capture
1737 // the position instead.
1738 llvm::stable_sort(DupCandidates,
1739 [](std::tuple<BranchProbability, MachineBasicBlock *> L,
1740 std::tuple<BranchProbability, MachineBasicBlock *> R) {
1741 return std::get<0>(L) > std::get<0>(R);
1742 });
1743 for (auto &Tup : DupCandidates) {
1744 BranchProbability DupProb;
1745 MachineBasicBlock *Succ;
1746 std::tie(DupProb, Succ) = Tup;
1747 if (DupProb < BestProb)
1748 break;
1749 if (canTailDuplicateUnplacedPreds(BB, Succ, Chain, BlockFilter) &&
1750 (isProfitableToTailDup(BB, Succ, BestProb, Chain, BlockFilter))) {
1751 LLVM_DEBUG(dbgs() << " Candidate: " << getBlockName(Succ)
1752 << ", probability: " << DupProb
1753 << " (Tail Duplicate)\n");
1754 BestSucc.BB = Succ;
1755 BestSucc.ShouldTailDup = true;
1756 break;
1757 }
1758 }
1759
1760 if (BestSucc.BB)
1761 LLVM_DEBUG(dbgs() << " Selected: " << getBlockName(BestSucc.BB) << "\n");
1762
1763 return BestSucc;
1764}
1765
1766/// Select the best block from a worklist.
1767///
1768/// This looks through the provided worklist as a list of candidate basic
1769/// blocks and select the most profitable one to place. The definition of
1770/// profitable only really makes sense in the context of a loop. This returns
1771/// the most frequently visited block in the worklist, which in the case of
1772/// a loop, is the one most desirable to be physically close to the rest of the
1773/// loop body in order to improve i-cache behavior.
1774///
1775/// \returns The best block found, or null if none are viable.
1776MachineBasicBlock *MachineBlockPlacement::selectBestCandidateBlock(
1777 const BlockChain &Chain, SmallVectorImpl<MachineBasicBlock *> &WorkList) {
1778 // Once we need to walk the worklist looking for a candidate, cleanup the
1779 // worklist of already placed entries.
1780 // FIXME: If this shows up on profiles, it could be folded (at the cost of
1781 // some code complexity) into the loop below.
1782 llvm::erase_if(WorkList, [&](MachineBasicBlock *BB) {
1783 return BlockToChain.lookup(BB) == &Chain;
1784 });
1785
1786 if (WorkList.empty())
1787 return nullptr;
1788
1789 bool IsEHPad = WorkList[0]->isEHPad();
1790
1791 MachineBasicBlock *BestBlock = nullptr;
1792 BlockFrequency BestFreq;
1793 for (MachineBasicBlock *MBB : WorkList) {
1794 assert(MBB->isEHPad() == IsEHPad &&
1795 "EHPad mismatch between block and work list.");
1796
1797 BlockChain &SuccChain = *BlockToChain[MBB];
1798 if (&SuccChain == &Chain)
1799 continue;
1800
1801 assert(SuccChain.UnscheduledPredecessors == 0 &&
1802 "Found CFG-violating block");
1803
1804 BlockFrequency CandidateFreq = MBFI->getBlockFreq(MBB);
1805 LLVM_DEBUG(dbgs() << " " << getBlockName(MBB) << " -> "
1806 << printBlockFreq(MBFI->getMBFI(), CandidateFreq)
1807 << " (freq)\n");
1808
1809 // For ehpad, we layout the least probable first as to avoid jumping back
1810 // from least probable landingpads to more probable ones.
1811 //
1812 // FIXME: Using probability is probably (!) not the best way to achieve
1813 // this. We should probably have a more principled approach to layout
1814 // cleanup code.
1815 //
1816 // The goal is to get:
1817 //
1818 // +--------------------------+
1819 // | V
1820 // InnerLp -> InnerCleanup OuterLp -> OuterCleanup -> Resume
1821 //
1822 // Rather than:
1823 //
1824 // +-------------------------------------+
1825 // V |
1826 // OuterLp -> OuterCleanup -> Resume InnerLp -> InnerCleanup
1827 if (BestBlock && (IsEHPad ^ (BestFreq >= CandidateFreq)))
1828 continue;
1829
1830 BestBlock = MBB;
1831 BestFreq = CandidateFreq;
1832 }
1833
1834 return BestBlock;
1835}
1836
1837/// Retrieve the first unplaced basic block in the entire function.
1838///
1839/// This routine is called when we are unable to use the CFG to walk through
1840/// all of the basic blocks and form a chain due to unnatural loops in the CFG.
1841/// We walk through the function's blocks in order, starting from the
1842/// LastUnplacedBlockIt. We update this iterator on each call to avoid
1843/// re-scanning the entire sequence on repeated calls to this routine.
1844MachineBasicBlock *MachineBlockPlacement::getFirstUnplacedBlock(
1845 const BlockChain &PlacedChain,
1846 MachineFunction::iterator &PrevUnplacedBlockIt) {
1847
1848 for (MachineFunction::iterator I = PrevUnplacedBlockIt, E = F->end(); I != E;
1849 ++I) {
1850 if (BlockChain *Chain = BlockToChain[&*I]; Chain != &PlacedChain) {
1851 PrevUnplacedBlockIt = I;
1852 // Now select the head of the chain to which the unplaced block belongs
1853 // as the block to place. This will force the entire chain to be placed,
1854 // and satisfies the requirements of merging chains.
1855 return *Chain->begin();
1856 }
1857 }
1858 return nullptr;
1859}
1860
1861/// Retrieve the first unplaced basic block among the blocks in BlockFilter.
1862///
1863/// This is similar to getFirstUnplacedBlock for the entire function, but since
1864/// the size of BlockFilter is typically far less than the number of blocks in
1865/// the entire function, iterating through the BlockFilter is more efficient.
1866/// When processing the entire funciton, using the version without BlockFilter
1867/// has a complexity of #(loops in function) * #(blocks in function), while this
1868/// version has a complexity of sum(#(loops in block) foreach block in function)
1869/// which is always smaller. For long function mostly sequential in structure,
1870/// the complexity is amortized to 1 * #(blocks in function).
1871MachineBasicBlock *MachineBlockPlacement::getFirstUnplacedBlock(
1872 const BlockChain &PlacedChain,
1873 BlockFilterSet::iterator &PrevUnplacedBlockInFilterIt,
1874 const BlockFilterSet *BlockFilter) {
1875 assert(BlockFilter);
1876 for (; PrevUnplacedBlockInFilterIt != BlockFilter->end();
1877 ++PrevUnplacedBlockInFilterIt) {
1878 BlockChain *C = BlockToChain[*PrevUnplacedBlockInFilterIt];
1879 if (C != &PlacedChain) {
1880 return *C->begin();
1881 }
1882 }
1883 return nullptr;
1884}
1885
1886void MachineBlockPlacement::fillWorkLists(
1887 const MachineBasicBlock *MBB, SmallPtrSetImpl<BlockChain *> &UpdatedPreds,
1888 const BlockFilterSet *BlockFilter = nullptr) {
1889 BlockChain &Chain = *BlockToChain[MBB];
1890 if (!UpdatedPreds.insert(&Chain).second)
1891 return;
1892
1893 assert(
1894 Chain.UnscheduledPredecessors == 0 &&
1895 "Attempting to place block with unscheduled predecessors in worklist.");
1896 for (MachineBasicBlock *ChainBB : Chain) {
1897 assert(BlockToChain[ChainBB] == &Chain &&
1898 "Block in chain doesn't match BlockToChain map.");
1899 for (MachineBasicBlock *Pred : ChainBB->predecessors()) {
1900 if (BlockFilter && !BlockFilter->count(Pred))
1901 continue;
1902 if (BlockToChain[Pred] == &Chain)
1903 continue;
1904 ++Chain.UnscheduledPredecessors;
1905 }
1906 }
1907
1908 if (Chain.UnscheduledPredecessors != 0)
1909 return;
1910
1911 MachineBasicBlock *BB = *Chain.begin();
1912 if (BB->isEHPad())
1913 EHPadWorkList.push_back(BB);
1914 else
1915 BlockWorkList.push_back(BB);
1916}
1917
1918void MachineBlockPlacement::buildChain(const MachineBasicBlock *HeadBB,
1919 BlockChain &Chain,
1920 BlockFilterSet *BlockFilter) {
1921 assert(HeadBB && "BB must not be null.\n");
1922 assert(BlockToChain[HeadBB] == &Chain && "BlockToChainMap mis-match.\n");
1923 MachineFunction::iterator PrevUnplacedBlockIt = F->begin();
1924 BlockFilterSet::iterator PrevUnplacedBlockInFilterIt;
1925 if (BlockFilter)
1926 PrevUnplacedBlockInFilterIt = BlockFilter->begin();
1927
1928 const MachineBasicBlock *LoopHeaderBB = HeadBB;
1929 markChainSuccessors(Chain, LoopHeaderBB, BlockFilter);
1930 MachineBasicBlock *BB = *std::prev(Chain.end());
1931 while (true) {
1932 assert(BB && "null block found at end of chain in loop.");
1933 assert(BlockToChain[BB] == &Chain && "BlockToChainMap mis-match in loop.");
1934 assert(*std::prev(Chain.end()) == BB && "BB Not found at end of chain.");
1935
1936 // Look for the best viable successor if there is one to place immediately
1937 // after this block.
1938 auto Result = selectBestSuccessor(BB, Chain, BlockFilter);
1939 MachineBasicBlock *BestSucc = Result.BB;
1940 bool ShouldTailDup = Result.ShouldTailDup;
1941 if (allowTailDupPlacement(*F))
1942 ShouldTailDup |= (BestSucc && canTailDuplicateUnplacedPreds(
1943 BB, BestSucc, Chain, BlockFilter));
1944
1945 // If an immediate successor isn't available, look for the best viable
1946 // block among those we've identified as not violating the loop's CFG at
1947 // this point. This won't be a fallthrough, but it will increase locality.
1948 if (!BestSucc)
1949 BestSucc = selectBestCandidateBlock(Chain, BlockWorkList);
1950 if (!BestSucc)
1951 BestSucc = selectBestCandidateBlock(Chain, EHPadWorkList);
1952
1953 if (!BestSucc) {
1954 if (BlockFilter)
1955 BestSucc = getFirstUnplacedBlock(Chain, PrevUnplacedBlockInFilterIt,
1956 BlockFilter);
1957 else
1958 BestSucc = getFirstUnplacedBlock(Chain, PrevUnplacedBlockIt);
1959 if (!BestSucc)
1960 break;
1961
1962 LLVM_DEBUG(dbgs() << "Unnatural loop CFG detected, forcibly merging the "
1963 "layout successor until the CFG reduces\n");
1964 }
1965
1966 // Placement may have changed tail duplication opportunities.
1967 // Check for that now.
1968 if (allowTailDupPlacement(*F) && BestSucc && ShouldTailDup) {
1969 repeatedlyTailDuplicateBlock(BestSucc, BB, LoopHeaderBB, Chain,
1970 BlockFilter, PrevUnplacedBlockIt,
1971 PrevUnplacedBlockInFilterIt);
1972 // If the chosen successor was duplicated into BB, don't bother laying
1973 // it out, just go round the loop again with BB as the chain end.
1974 if (!BB->isSuccessor(BestSucc))
1975 continue;
1976 }
1977
1978 // Place this block, updating the datastructures to reflect its placement.
1979 BlockChain &SuccChain = *BlockToChain[BestSucc];
1980 // Zero out UnscheduledPredecessors for the successor we're about to merge
1981 // in case we selected a successor that didn't fit naturally into the CFG.
1982 SuccChain.UnscheduledPredecessors = 0;
1983 LLVM_DEBUG(dbgs() << "Merging from " << getBlockName(BB) << " to "
1984 << getBlockName(BestSucc) << "\n");
1985 markChainSuccessors(SuccChain, LoopHeaderBB, BlockFilter);
1986 Chain.merge(BestSucc, &SuccChain);
1987 BB = *std::prev(Chain.end());
1988 }
1989
1990 LLVM_DEBUG(dbgs() << "Finished forming chain for header block "
1991 << getBlockName(*Chain.begin()) << "\n");
1992}
1993
1994// If bottom of block BB has only one successor OldTop, in most cases it is
1995// profitable to move it before OldTop, except the following case:
1996//
1997// -->OldTop<-
1998// | . |
1999// | . |
2000// | . |
2001// ---Pred |
2002// | |
2003// BB-----
2004//
2005// If BB is moved before OldTop, Pred needs a taken branch to BB, and it can't
2006// layout the other successor below it, so it can't reduce taken branch.
2007// In this case we keep its original layout.
2008bool MachineBlockPlacement::canMoveBottomBlockToTop(
2009 const MachineBasicBlock *BottomBlock, const MachineBasicBlock *OldTop) {
2010 if (BottomBlock->pred_size() != 1)
2011 return true;
2012 MachineBasicBlock *Pred = *BottomBlock->pred_begin();
2013 if (Pred->succ_size() != 2)
2014 return true;
2015
2016 MachineBasicBlock *OtherBB = *Pred->succ_begin();
2017 if (OtherBB == BottomBlock)
2018 OtherBB = *Pred->succ_rbegin();
2019 if (OtherBB == OldTop)
2020 return false;
2021
2022 return true;
2023}
2024
2025// Find out the possible fall through frequence to the top of a loop.
2026BlockFrequency
2027MachineBlockPlacement::TopFallThroughFreq(const MachineBasicBlock *Top,
2028 const BlockFilterSet &LoopBlockSet) {
2029 BlockFrequency MaxFreq = BlockFrequency(0);
2030 for (MachineBasicBlock *Pred : Top->predecessors()) {
2031 BlockChain *PredChain = BlockToChain[Pred];
2032 if (!LoopBlockSet.count(Pred) &&
2033 (!PredChain || Pred == *std::prev(PredChain->end()))) {
2034 // Found a Pred block can be placed before Top.
2035 // Check if Top is the best successor of Pred.
2036 auto TopProb = MBPI->getEdgeProbability(Pred, Top);
2037 bool TopOK = true;
2038 for (MachineBasicBlock *Succ : Pred->successors()) {
2039 auto SuccProb = MBPI->getEdgeProbability(Pred, Succ);
2040 BlockChain *SuccChain = BlockToChain[Succ];
2041 // Check if Succ can be placed after Pred.
2042 // Succ should not be in any chain, or it is the head of some chain.
2043 if (!LoopBlockSet.count(Succ) && (SuccProb > TopProb) &&
2044 (!SuccChain || Succ == *SuccChain->begin())) {
2045 TopOK = false;
2046 break;
2047 }
2048 }
2049 if (TopOK) {
2050 BlockFrequency EdgeFreq =
2051 MBFI->getBlockFreq(Pred) * MBPI->getEdgeProbability(Pred, Top);
2052 if (EdgeFreq > MaxFreq)
2053 MaxFreq = EdgeFreq;
2054 }
2055 }
2056 }
2057 return MaxFreq;
2058}
2059
2060// Compute the fall through gains when move NewTop before OldTop.
2061//
2062// In following diagram, edges marked as "-" are reduced fallthrough, edges
2063// marked as "+" are increased fallthrough, this function computes
2064//
2065// SUM(increased fallthrough) - SUM(decreased fallthrough)
2066//
2067// |
2068// | -
2069// V
2070// --->OldTop
2071// | .
2072// | .
2073// +| . +
2074// | Pred --->
2075// | |-
2076// | V
2077// --- NewTop <---
2078// |-
2079// V
2080//
2081BlockFrequency MachineBlockPlacement::FallThroughGains(
2082 const MachineBasicBlock *NewTop, const MachineBasicBlock *OldTop,
2083 const MachineBasicBlock *ExitBB, const BlockFilterSet &LoopBlockSet) {
2084 BlockFrequency FallThrough2Top = TopFallThroughFreq(OldTop, LoopBlockSet);
2085 BlockFrequency FallThrough2Exit = BlockFrequency(0);
2086 if (ExitBB)
2087 FallThrough2Exit =
2088 MBFI->getBlockFreq(NewTop) * MBPI->getEdgeProbability(NewTop, ExitBB);
2089 BlockFrequency BackEdgeFreq =
2090 MBFI->getBlockFreq(NewTop) * MBPI->getEdgeProbability(NewTop, OldTop);
2091
2092 // Find the best Pred of NewTop.
2093 MachineBasicBlock *BestPred = nullptr;
2094 BlockFrequency FallThroughFromPred = BlockFrequency(0);
2095 for (MachineBasicBlock *Pred : NewTop->predecessors()) {
2096 if (!LoopBlockSet.count(Pred))
2097 continue;
2098 BlockChain *PredChain = BlockToChain[Pred];
2099 if (!PredChain || Pred == *std::prev(PredChain->end())) {
2100 BlockFrequency EdgeFreq =
2101 MBFI->getBlockFreq(Pred) * MBPI->getEdgeProbability(Pred, NewTop);
2102 if (EdgeFreq > FallThroughFromPred) {
2103 FallThroughFromPred = EdgeFreq;
2104 BestPred = Pred;
2105 }
2106 }
2107 }
2108
2109 // If NewTop is not placed after Pred, another successor can be placed
2110 // after Pred.
2111 BlockFrequency NewFreq = BlockFrequency(0);
2112 if (BestPred) {
2113 for (MachineBasicBlock *Succ : BestPred->successors()) {
2114 if ((Succ == NewTop) || (Succ == BestPred) || !LoopBlockSet.count(Succ))
2115 continue;
2116 if (ComputedEdges.contains(Succ))
2117 continue;
2118 BlockChain *SuccChain = BlockToChain[Succ];
2119 if ((SuccChain && (Succ != *SuccChain->begin())) ||
2120 (SuccChain == BlockToChain[BestPred]))
2121 continue;
2122 BlockFrequency EdgeFreq = MBFI->getBlockFreq(BestPred) *
2123 MBPI->getEdgeProbability(BestPred, Succ);
2124 if (EdgeFreq > NewFreq)
2125 NewFreq = EdgeFreq;
2126 }
2127 BlockFrequency OrigEdgeFreq = MBFI->getBlockFreq(BestPred) *
2128 MBPI->getEdgeProbability(BestPred, NewTop);
2129 if (NewFreq > OrigEdgeFreq) {
2130 // If NewTop is not the best successor of Pred, then Pred doesn't
2131 // fallthrough to NewTop. So there is no FallThroughFromPred and
2132 // NewFreq.
2133 NewFreq = BlockFrequency(0);
2134 FallThroughFromPred = BlockFrequency(0);
2135 }
2136 }
2137
2138 BlockFrequency Result = BlockFrequency(0);
2139 BlockFrequency Gains = BackEdgeFreq + NewFreq;
2140 BlockFrequency Lost =
2141 FallThrough2Top + FallThrough2Exit + FallThroughFromPred;
2142 if (Gains > Lost)
2143 Result = Gains - Lost;
2144 return Result;
2145}
2146
2147/// Helper function of findBestLoopTop. Find the best loop top block
2148/// from predecessors of old top.
2149///
2150/// Look for a block which is strictly better than the old top for laying
2151/// out before the old top of the loop. This looks for only two patterns:
2152///
2153/// 1. a block has only one successor, the old loop top
2154///
2155/// Because such a block will always result in an unconditional jump,
2156/// rotating it in front of the old top is always profitable.
2157///
2158/// 2. a block has two successors, one is old top, another is exit
2159/// and it has more than one predecessors
2160///
2161/// If it is below one of its predecessors P, only P can fall through to
2162/// it, all other predecessors need a jump to it, and another conditional
2163/// jump to loop header. If it is moved before loop header, all its
2164/// predecessors jump to it, then fall through to loop header. So all its
2165/// predecessors except P can reduce one taken branch.
2166/// At the same time, move it before old top increases the taken branch
2167/// to loop exit block, so the reduced taken branch will be compared with
2168/// the increased taken branch to the loop exit block.
2169MachineBasicBlock *MachineBlockPlacement::findBestLoopTopHelper(
2170 MachineBasicBlock *OldTop, const MachineLoop &L,
2171 const BlockFilterSet &LoopBlockSet) {
2172 // Check that the header hasn't been fused with a preheader block due to
2173 // crazy branches. If it has, we need to start with the header at the top to
2174 // prevent pulling the preheader into the loop body.
2175 BlockChain &HeaderChain = *BlockToChain[OldTop];
2176 if (!LoopBlockSet.count(*HeaderChain.begin()))
2177 return OldTop;
2178 if (OldTop != *HeaderChain.begin())
2179 return OldTop;
2180
2181 LLVM_DEBUG(dbgs() << "Finding best loop top for: " << getBlockName(OldTop)
2182 << "\n");
2183
2184 BlockFrequency BestGains = BlockFrequency(0);
2185 MachineBasicBlock *BestPred = nullptr;
2186 for (MachineBasicBlock *Pred : OldTop->predecessors()) {
2187 if (!LoopBlockSet.count(Pred))
2188 continue;
2189 if (Pred == L.getHeader())
2190 continue;
2191 LLVM_DEBUG(dbgs() << " old top pred: " << getBlockName(Pred) << ", has "
2192 << Pred->succ_size() << " successors, "
2193 << printBlockFreq(MBFI->getMBFI(), *Pred) << " freq\n");
2194 if (Pred->succ_size() > 2)
2195 continue;
2196
2197 MachineBasicBlock *OtherBB = nullptr;
2198 if (Pred->succ_size() == 2) {
2199 OtherBB = *Pred->succ_begin();
2200 if (OtherBB == OldTop)
2201 OtherBB = *Pred->succ_rbegin();
2202 }
2203
2204 if (!canMoveBottomBlockToTop(Pred, OldTop))
2205 continue;
2206
2207 BlockFrequency Gains =
2208 FallThroughGains(Pred, OldTop, OtherBB, LoopBlockSet);
2209 if ((Gains > BlockFrequency(0)) &&
2210 (Gains > BestGains ||
2211 ((Gains == BestGains) && Pred->isLayoutSuccessor(OldTop)))) {
2212 BestPred = Pred;
2213 BestGains = Gains;
2214 }
2215 }
2216
2217 // If no direct predecessor is fine, just use the loop header.
2218 if (!BestPred) {
2219 LLVM_DEBUG(dbgs() << " final top unchanged\n");
2220 return OldTop;
2221 }
2222
2223 // Walk backwards through any straight line of predecessors.
2224 while (BestPred->pred_size() == 1 &&
2225 (*BestPred->pred_begin())->succ_size() == 1 &&
2226 *BestPred->pred_begin() != L.getHeader())
2227 BestPred = *BestPred->pred_begin();
2228
2229 LLVM_DEBUG(dbgs() << " final top: " << getBlockName(BestPred) << "\n");
2230 return BestPred;
2231}
2232
2233/// Find the best loop top block for layout.
2234///
2235/// This function iteratively calls findBestLoopTopHelper, until no new better
2236/// BB can be found.
2237MachineBasicBlock *
2238MachineBlockPlacement::findBestLoopTop(const MachineLoop &L,
2239 const BlockFilterSet &LoopBlockSet) {
2240 // Placing the latch block before the header may introduce an extra branch
2241 // that skips this block the first time the loop is executed, which we want
2242 // to avoid when optimising for size.
2243 // FIXME: in theory there is a case that does not introduce a new branch,
2244 // i.e. when the layout predecessor does not fallthrough to the loop header.
2245 // In practice this never happens though: there always seems to be a preheader
2246 // that can fallthrough and that is also placed before the header.
2247 if (llvm::shouldOptimizeForSize(L.getHeader(), PSI, MBFI.get()))
2248 return L.getHeader();
2249
2250 MachineBasicBlock *OldTop = nullptr;
2251 MachineBasicBlock *NewTop = L.getHeader();
2252 while (NewTop != OldTop) {
2253 OldTop = NewTop;
2254 NewTop = findBestLoopTopHelper(OldTop, L, LoopBlockSet);
2255 if (NewTop != OldTop)
2256 ComputedEdges[NewTop] = {OldTop, false};
2257 }
2258 return NewTop;
2259}
2260
2261/// Find the best loop exiting block for layout.
2262///
2263/// This routine implements the logic to analyze the loop looking for the best
2264/// block to layout at the top of the loop. Typically this is done to maximize
2265/// fallthrough opportunities.
2266MachineBasicBlock *
2267MachineBlockPlacement::findBestLoopExit(const MachineLoop &L,
2268 const BlockFilterSet &LoopBlockSet,
2269 BlockFrequency &ExitFreq) {
2270 // We don't want to layout the loop linearly in all cases. If the loop header
2271 // is just a normal basic block in the loop, we want to look for what block
2272 // within the loop is the best one to layout at the top. However, if the loop
2273 // header has be pre-merged into a chain due to predecessors not having
2274 // analyzable branches, *and* the predecessor it is merged with is *not* part
2275 // of the loop, rotating the header into the middle of the loop will create
2276 // a non-contiguous range of blocks which is Very Bad. So start with the
2277 // header and only rotate if safe.
2278 BlockChain &HeaderChain = *BlockToChain[L.getHeader()];
2279 if (!LoopBlockSet.count(*HeaderChain.begin()))
2280 return nullptr;
2281
2282 BlockFrequency BestExitEdgeFreq;
2283 unsigned BestExitLoopDepth = 0;
2284 MachineBasicBlock *ExitingBB = nullptr;
2285 // If there are exits to outer loops, loop rotation can severely limit
2286 // fallthrough opportunities unless it selects such an exit. Keep a set of
2287 // blocks where rotating to exit with that block will reach an outer loop.
2288 SmallPtrSet<MachineBasicBlock *, 4> BlocksExitingToOuterLoop;
2289
2290 LLVM_DEBUG(dbgs() << "Finding best loop exit for: "
2291 << getBlockName(L.getHeader()) << "\n");
2292 for (MachineBasicBlock *MBB : L.getBlocks()) {
2293 BlockChain &Chain = *BlockToChain[MBB];
2294 // Ensure that this block is at the end of a chain; otherwise it could be
2295 // mid-way through an inner loop or a successor of an unanalyzable branch.
2296 if (MBB != *std::prev(Chain.end()))
2297 continue;
2298
2299 // Now walk the successors. We need to establish whether this has a viable
2300 // exiting successor and whether it has a viable non-exiting successor.
2301 // We store the old exiting state and restore it if a viable looping
2302 // successor isn't found.
2303 MachineBasicBlock *OldExitingBB = ExitingBB;
2304 BlockFrequency OldBestExitEdgeFreq = BestExitEdgeFreq;
2305 bool HasLoopingSucc = false;
2306 for (MachineBasicBlock *Succ : MBB->successors()) {
2307 if (Succ->isEHPad())
2308 continue;
2309 if (Succ == MBB)
2310 continue;
2311 BlockChain &SuccChain = *BlockToChain[Succ];
2312 // Don't split chains, either this chain or the successor's chain.
2313 if (&Chain == &SuccChain) {
2314 LLVM_DEBUG(dbgs() << " exiting: " << getBlockName(MBB) << " -> "
2315 << getBlockName(Succ) << " (chain conflict)\n");
2316 continue;
2317 }
2318
2319 auto SuccProb = MBPI->getEdgeProbability(MBB, Succ);
2320 if (LoopBlockSet.count(Succ)) {
2321 LLVM_DEBUG(dbgs() << " looping: " << getBlockName(MBB) << " -> "
2322 << getBlockName(Succ) << " (" << SuccProb << ")\n");
2323 HasLoopingSucc = true;
2324 continue;
2325 }
2326
2327 unsigned SuccLoopDepth = 0;
2328 if (MachineLoop *ExitLoop = MLI->getLoopFor(Succ)) {
2329 SuccLoopDepth = ExitLoop->getLoopDepth();
2330 if (ExitLoop->contains(&L))
2331 BlocksExitingToOuterLoop.insert(MBB);
2332 }
2333
2334 BlockFrequency ExitEdgeFreq = MBFI->getBlockFreq(MBB) * SuccProb;
2335 LLVM_DEBUG(
2336 dbgs() << " exiting: " << getBlockName(MBB) << " -> "
2337 << getBlockName(Succ) << " [L:" << SuccLoopDepth << "] ("
2338 << printBlockFreq(MBFI->getMBFI(), ExitEdgeFreq) << ")\n");
2339 // Note that we bias this toward an existing layout successor to retain
2340 // incoming order in the absence of better information. The exit must have
2341 // a frequency higher than the current exit before we consider breaking
2342 // the layout.
2343 BranchProbability Bias(100 - ExitBlockBias, 100);
2344 if (!ExitingBB || SuccLoopDepth > BestExitLoopDepth ||
2345 ExitEdgeFreq > BestExitEdgeFreq ||
2346 (MBB->isLayoutSuccessor(Succ) &&
2347 !(ExitEdgeFreq < BestExitEdgeFreq * Bias))) {
2348 BestExitEdgeFreq = ExitEdgeFreq;
2349 ExitingBB = MBB;
2350 }
2351 }
2352
2353 if (!HasLoopingSucc) {
2354 // Restore the old exiting state, no viable looping successor was found.
2355 ExitingBB = OldExitingBB;
2356 BestExitEdgeFreq = OldBestExitEdgeFreq;
2357 }
2358 }
2359 // Without a candidate exiting block or with only a single block in the
2360 // loop, just use the loop header to layout the loop.
2361 if (!ExitingBB) {
2362 LLVM_DEBUG(
2363 dbgs() << " No other candidate exit blocks, using loop header\n");
2364 return nullptr;
2365 }
2366 if (L.getNumBlocks() == 1) {
2367 LLVM_DEBUG(dbgs() << " Loop has 1 block, using loop header as exit\n");
2368 return nullptr;
2369 }
2370
2371 // Also, if we have exit blocks which lead to outer loops but didn't select
2372 // one of them as the exiting block we are rotating toward, disable loop
2373 // rotation altogether.
2374 if (!BlocksExitingToOuterLoop.empty() &&
2375 !BlocksExitingToOuterLoop.count(ExitingBB))
2376 return nullptr;
2377
2378 LLVM_DEBUG(dbgs() << " Best exiting block: " << getBlockName(ExitingBB)
2379 << "\n");
2380 ExitFreq = BestExitEdgeFreq;
2381 return ExitingBB;
2382}
2383
2384/// Check if there is a fallthrough to loop header Top.
2385///
2386/// 1. Look for a Pred that can be layout before Top.
2387/// 2. Check if Top is the most possible successor of Pred.
2388bool MachineBlockPlacement::hasViableTopFallthrough(
2389 const MachineBasicBlock *Top, const BlockFilterSet &LoopBlockSet) {
2390 for (MachineBasicBlock *Pred : Top->predecessors()) {
2391 BlockChain *PredChain = BlockToChain[Pred];
2392 if (!LoopBlockSet.count(Pred) &&
2393 (!PredChain || Pred == *std::prev(PredChain->end()))) {
2394 // Found a Pred block can be placed before Top.
2395 // Check if Top is the best successor of Pred.
2396 auto TopProb = MBPI->getEdgeProbability(Pred, Top);
2397 bool TopOK = true;
2398 for (MachineBasicBlock *Succ : Pred->successors()) {
2399 auto SuccProb = MBPI->getEdgeProbability(Pred, Succ);
2400 BlockChain *SuccChain = BlockToChain[Succ];
2401 // Check if Succ can be placed after Pred.
2402 // Succ should not be in any chain, or it is the head of some chain.
2403 if ((!SuccChain || Succ == *SuccChain->begin()) && SuccProb > TopProb) {
2404 TopOK = false;
2405 break;
2406 }
2407 }
2408 if (TopOK)
2409 return true;
2410 }
2411 }
2412 return false;
2413}
2414
2415/// Attempt to rotate an exiting block to the bottom of the loop.
2416///
2417/// Once we have built a chain, try to rotate it to line up the hot exit block
2418/// with fallthrough out of the loop if doing so doesn't introduce unnecessary
2419/// branches. For example, if the loop has fallthrough into its header and out
2420/// of its bottom already, don't rotate it.
2421void MachineBlockPlacement::rotateLoop(BlockChain &LoopChain,
2422 const MachineBasicBlock *ExitingBB,
2423 BlockFrequency ExitFreq,
2424 const BlockFilterSet &LoopBlockSet) {
2425 if (!ExitingBB)
2426 return;
2427
2428 MachineBasicBlock *Top = *LoopChain.begin();
2429 MachineBasicBlock *Bottom = *std::prev(LoopChain.end());
2430
2431 // If ExitingBB is already the last one in a chain then nothing to do.
2432 if (Bottom == ExitingBB)
2433 return;
2434
2435 // The entry block should always be the first BB in a function.
2436 if (Top->isEntryBlock())
2437 return;
2438
2439 bool ViableTopFallthrough = hasViableTopFallthrough(Top, LoopBlockSet);
2440
2441 // If the header has viable fallthrough, check whether the current loop
2442 // bottom is a viable exiting block. If so, bail out as rotating will
2443 // introduce an unnecessary branch.
2444 if (ViableTopFallthrough) {
2445 for (MachineBasicBlock *Succ : Bottom->successors()) {
2446 BlockChain *SuccChain = BlockToChain[Succ];
2447 if (!LoopBlockSet.count(Succ) &&
2448 (!SuccChain || Succ == *SuccChain->begin()))
2449 return;
2450 }
2451
2452 // Rotate will destroy the top fallthrough, we need to ensure the new exit
2453 // frequency is larger than top fallthrough.
2454 BlockFrequency FallThrough2Top = TopFallThroughFreq(Top, LoopBlockSet);
2455 if (FallThrough2Top >= ExitFreq)
2456 return;
2457 }
2458
2459 BlockChain::iterator ExitIt = llvm::find(LoopChain, ExitingBB);
2460 if (ExitIt == LoopChain.end())
2461 return;
2462
2463 // Rotating a loop exit to the bottom when there is a fallthrough to top
2464 // trades the entry fallthrough for an exit fallthrough.
2465 // If there is no bottom->top edge, but the chosen exit block does have
2466 // a fallthrough, we break that fallthrough for nothing in return.
2467
2468 // Let's consider an example. We have a built chain of basic blocks
2469 // B1, B2, ..., Bn, where Bk is a ExitingBB - chosen exit block.
2470 // By doing a rotation we get
2471 // Bk+1, ..., Bn, B1, ..., Bk
2472 // Break of fallthrough to B1 is compensated by a fallthrough from Bk.
2473 // If we had a fallthrough Bk -> Bk+1 it is broken now.
2474 // It might be compensated by fallthrough Bn -> B1.
2475 // So we have a condition to avoid creation of extra branch by loop rotation.
2476 // All below must be true to avoid loop rotation:
2477 // If there is a fallthrough to top (B1)
2478 // There was fallthrough from chosen exit block (Bk) to next one (Bk+1)
2479 // There is no fallthrough from bottom (Bn) to top (B1).
2480 // Please note that there is no exit fallthrough from Bn because we checked it
2481 // above.
2482 if (ViableTopFallthrough) {
2483 assert(std::next(ExitIt) != LoopChain.end() &&
2484 "Exit should not be last BB");
2485 MachineBasicBlock *NextBlockInChain = *std::next(ExitIt);
2486 if (ExitingBB->isSuccessor(NextBlockInChain))
2487 if (!Bottom->isSuccessor(Top))
2488 return;
2489 }
2490
2491 LLVM_DEBUG(dbgs() << "Rotating loop to put exit " << getBlockName(ExitingBB)
2492 << " at bottom\n");
2493 std::rotate(LoopChain.begin(), std::next(ExitIt), LoopChain.end());
2494}
2495
2496/// Attempt to rotate a loop based on profile data to reduce branch cost.
2497///
2498/// With profile data, we can determine the cost in terms of missed fall through
2499/// opportunities when rotating a loop chain and select the best rotation.
2500/// Basically, there are three kinds of cost to consider for each rotation:
2501/// 1. The possibly missed fall through edge (if it exists) from BB out of
2502/// the loop to the loop header.
2503/// 2. The possibly missed fall through edges (if they exist) from the loop
2504/// exits to BB out of the loop.
2505/// 3. The missed fall through edge (if it exists) from the last BB to the
2506/// first BB in the loop chain.
2507/// Therefore, the cost for a given rotation is the sum of costs listed above.
2508/// We select the best rotation with the smallest cost.
2509void MachineBlockPlacement::rotateLoopWithProfile(
2510 BlockChain &LoopChain, const MachineLoop &L,
2511 const BlockFilterSet &LoopBlockSet) {
2512 auto RotationPos = LoopChain.end();
2513 MachineBasicBlock *ChainHeaderBB = *LoopChain.begin();
2514
2515 // The entry block should always be the first BB in a function.
2516 if (ChainHeaderBB->isEntryBlock())
2517 return;
2518
2519 BlockFrequency SmallestRotationCost = BlockFrequency::max();
2520
2521 // A utility lambda that scales up a block frequency by dividing it by a
2522 // branch probability which is the reciprocal of the scale.
2523 auto ScaleBlockFrequency = [](BlockFrequency Freq,
2524 unsigned Scale) -> BlockFrequency {
2525 if (Scale == 0)
2526 return BlockFrequency(0);
2527 // Use operator / between BlockFrequency and BranchProbability to implement
2528 // saturating multiplication.
2529 return Freq / BranchProbability(1, Scale);
2530 };
2531
2532 // Compute the cost of the missed fall-through edge to the loop header if the
2533 // chain head is not the loop header. As we only consider natural loops with
2534 // single header, this computation can be done only once.
2535 BlockFrequency HeaderFallThroughCost(0);
2536 for (auto *Pred : ChainHeaderBB->predecessors()) {
2537 BlockChain *PredChain = BlockToChain[Pred];
2538 if (!LoopBlockSet.count(Pred) &&
2539 (!PredChain || Pred == *std::prev(PredChain->end()))) {
2540 auto EdgeFreq = MBFI->getBlockFreq(Pred) *
2541 MBPI->getEdgeProbability(Pred, ChainHeaderBB);
2542 auto FallThruCost = ScaleBlockFrequency(EdgeFreq, MisfetchCost);
2543 // If the predecessor has only an unconditional jump to the header, we
2544 // need to consider the cost of this jump.
2545 if (Pred->succ_size() == 1)
2546 FallThruCost += ScaleBlockFrequency(EdgeFreq, JumpInstCost);
2547 HeaderFallThroughCost = std::max(HeaderFallThroughCost, FallThruCost);
2548 }
2549 }
2550
2551 // Here we collect all exit blocks in the loop, and for each exit we find out
2552 // its hottest exit edge. For each loop rotation, we define the loop exit cost
2553 // as the sum of frequencies of exit edges we collect here, excluding the exit
2554 // edge from the tail of the loop chain.
2556 for (auto *BB : LoopChain) {
2557 auto LargestExitEdgeProb = BranchProbability::getZero();
2558 for (auto *Succ : BB->successors()) {
2559 BlockChain *SuccChain = BlockToChain[Succ];
2560 if (!LoopBlockSet.count(Succ) &&
2561 (!SuccChain || Succ == *SuccChain->begin())) {
2562 auto SuccProb = MBPI->getEdgeProbability(BB, Succ);
2563 LargestExitEdgeProb = std::max(LargestExitEdgeProb, SuccProb);
2564 }
2565 }
2566 if (LargestExitEdgeProb > BranchProbability::getZero()) {
2567 auto ExitFreq = MBFI->getBlockFreq(BB) * LargestExitEdgeProb;
2568 ExitsWithFreq.emplace_back(BB, ExitFreq);
2569 }
2570 }
2571
2572 // In this loop we iterate every block in the loop chain and calculate the
2573 // cost assuming the block is the head of the loop chain. When the loop ends,
2574 // we should have found the best candidate as the loop chain's head.
2575 for (auto Iter = LoopChain.begin(), TailIter = std::prev(LoopChain.end()),
2576 EndIter = LoopChain.end();
2577 Iter != EndIter; Iter++, TailIter++) {
2578 // TailIter is used to track the tail of the loop chain if the block we are
2579 // checking (pointed by Iter) is the head of the chain.
2580 if (TailIter == LoopChain.end())
2581 TailIter = LoopChain.begin();
2582
2583 auto TailBB = *TailIter;
2584
2585 // Calculate the cost by putting this BB to the top.
2586 BlockFrequency Cost = BlockFrequency(0);
2587
2588 // If the current BB is the loop header, we need to take into account the
2589 // cost of the missed fall through edge from outside of the loop to the
2590 // header.
2591 if (Iter != LoopChain.begin())
2592 Cost += HeaderFallThroughCost;
2593
2594 // Collect the loop exit cost by summing up frequencies of all exit edges
2595 // except the one from the chain tail.
2596 for (auto &ExitWithFreq : ExitsWithFreq)
2597 if (TailBB != ExitWithFreq.first)
2598 Cost += ExitWithFreq.second;
2599
2600 // The cost of breaking the once fall-through edge from the tail to the top
2601 // of the loop chain. Here we need to consider three cases:
2602 // 1. If the tail node has only one successor, then we will get an
2603 // additional jmp instruction. So the cost here is (MisfetchCost +
2604 // JumpInstCost) * tail node frequency.
2605 // 2. If the tail node has two successors, then we may still get an
2606 // additional jmp instruction if the layout successor after the loop
2607 // chain is not its CFG successor. Note that the more frequently executed
2608 // jmp instruction will be put ahead of the other one. Assume the
2609 // frequency of those two branches are x and y, where x is the frequency
2610 // of the edge to the chain head, then the cost will be
2611 // (x * MisfetechCost + min(x, y) * JumpInstCost) * tail node frequency.
2612 // 3. If the tail node has more than two successors (this rarely happens),
2613 // we won't consider any additional cost.
2614 if (TailBB->isSuccessor(*Iter)) {
2615 auto TailBBFreq = MBFI->getBlockFreq(TailBB);
2616 if (TailBB->succ_size() == 1)
2617 Cost += ScaleBlockFrequency(TailBBFreq, MisfetchCost + JumpInstCost);
2618 else if (TailBB->succ_size() == 2) {
2619 auto TailToHeadProb = MBPI->getEdgeProbability(TailBB, *Iter);
2620 auto TailToHeadFreq = TailBBFreq * TailToHeadProb;
2621 auto ColderEdgeFreq = TailToHeadProb > BranchProbability(1, 2)
2622 ? TailBBFreq * TailToHeadProb.getCompl()
2623 : TailToHeadFreq;
2624 Cost += ScaleBlockFrequency(TailToHeadFreq, MisfetchCost) +
2625 ScaleBlockFrequency(ColderEdgeFreq, JumpInstCost);
2626 }
2627 }
2628
2629 LLVM_DEBUG(dbgs() << "The cost of loop rotation by making "
2630 << getBlockName(*Iter) << " to the top: "
2631 << printBlockFreq(MBFI->getMBFI(), Cost) << "\n");
2632
2633 if (Cost < SmallestRotationCost) {
2634 SmallestRotationCost = Cost;
2635 RotationPos = Iter;
2636 }
2637 }
2638
2639 if (RotationPos != LoopChain.end()) {
2640 LLVM_DEBUG(dbgs() << "Rotate loop by making " << getBlockName(*RotationPos)
2641 << " to the top\n");
2642 std::rotate(LoopChain.begin(), RotationPos, LoopChain.end());
2643 }
2644}
2645
2646/// Collect blocks in the given loop that are to be placed.
2647///
2648/// When profile data is available, exclude cold blocks from the returned set;
2649/// otherwise, collect all blocks in the loop.
2650MachineBlockPlacement::BlockFilterSet
2651MachineBlockPlacement::collectLoopBlockSet(const MachineLoop &L) {
2652 // Collect the blocks in a set ordered by block number, as this gives the same
2653 // order as they appear in the function.
2654 struct MBBCompare {
2655 bool operator()(const MachineBasicBlock *X,
2656 const MachineBasicBlock *Y) const {
2657 return X->getNumber() < Y->getNumber();
2658 }
2659 };
2660 std::set<const MachineBasicBlock *, MBBCompare> LoopBlockSet;
2661
2662 // Filter cold blocks off from LoopBlockSet when profile data is available.
2663 // Collect the sum of frequencies of incoming edges to the loop header from
2664 // outside. If we treat the loop as a super block, this is the frequency of
2665 // the loop. Then for each block in the loop, we calculate the ratio between
2666 // its frequency and the frequency of the loop block. When it is too small,
2667 // don't add it to the loop chain. If there are outer loops, then this block
2668 // will be merged into the first outer loop chain for which this block is not
2669 // cold anymore. This needs precise profile data and we only do this when
2670 // profile data is available.
2671 if (F->getFunction().hasProfileData() || ForceLoopColdBlock) {
2672 BlockFrequency LoopFreq(0);
2673 for (auto *LoopPred : L.getHeader()->predecessors())
2674 if (!L.contains(LoopPred))
2675 LoopFreq += MBFI->getBlockFreq(LoopPred) *
2676 MBPI->getEdgeProbability(LoopPred, L.getHeader());
2677
2678 for (MachineBasicBlock *LoopBB : L.getBlocks()) {
2679 if (LoopBlockSet.count(LoopBB))
2680 continue;
2681 auto Freq = MBFI->getBlockFreq(LoopBB).getFrequency();
2682 if (Freq == 0 || LoopFreq.getFrequency() / Freq > LoopToColdBlockRatio)
2683 continue;
2684 BlockChain *Chain = BlockToChain[LoopBB];
2685 for (MachineBasicBlock *ChainBB : *Chain)
2686 LoopBlockSet.insert(ChainBB);
2687 }
2688 } else
2689 LoopBlockSet.insert(L.block_begin(), L.block_end());
2690
2691 // Copy the blocks into a BlockFilterSet, as iterating it is faster than
2692 // std::set. We will only remove blocks and never insert them, which will
2693 // preserve the ordering.
2694 BlockFilterSet Ret(LoopBlockSet.begin(), LoopBlockSet.end());
2695 return Ret;
2696}
2697
2698/// Forms basic block chains from the natural loop structures.
2699///
2700/// These chains are designed to preserve the existing *structure* of the code
2701/// as much as possible. We can then stitch the chains together in a way which
2702/// both preserves the topological structure and minimizes taken conditional
2703/// branches.
2704void MachineBlockPlacement::buildLoopChains(const MachineLoop &L) {
2705 // First recurse through any nested loops, building chains for those inner
2706 // loops.
2707 for (const MachineLoop *InnerLoop : L)
2708 buildLoopChains(*InnerLoop);
2709
2710 assert(BlockWorkList.empty() &&
2711 "BlockWorkList not empty when starting to build loop chains.");
2712 assert(EHPadWorkList.empty() &&
2713 "EHPadWorkList not empty when starting to build loop chains.");
2714 BlockFilterSet LoopBlockSet = collectLoopBlockSet(L);
2715
2716 // Check if we have profile data for this function. If yes, we will rotate
2717 // this loop by modeling costs more precisely which requires the profile data
2718 // for better layout.
2719 bool RotateLoopWithProfile =
2721 (PreciseRotationCost && F->getFunction().hasProfileData());
2722
2723 // First check to see if there is an obviously preferable top block for the
2724 // loop. This will default to the header, but may end up as one of the
2725 // predecessors to the header if there is one which will result in strictly
2726 // fewer branches in the loop body.
2727 MachineBasicBlock *LoopTop = findBestLoopTop(L, LoopBlockSet);
2728
2729 // If we selected just the header for the loop top, look for a potentially
2730 // profitable exit block in the event that rotating the loop can eliminate
2731 // branches by placing an exit edge at the bottom.
2732 //
2733 // Loops are processed innermost to uttermost, make sure we clear
2734 // PreferredLoopExit before processing a new loop.
2735 PreferredLoopExit = nullptr;
2736 BlockFrequency ExitFreq;
2737 if (!RotateLoopWithProfile && LoopTop == L.getHeader())
2738 PreferredLoopExit = findBestLoopExit(L, LoopBlockSet, ExitFreq);
2739
2740 BlockChain &LoopChain = *BlockToChain[LoopTop];
2741
2742 // FIXME: This is a really lame way of walking the chains in the loop: we
2743 // walk the blocks, and use a set to prevent visiting a particular chain
2744 // twice.
2745 SmallPtrSet<BlockChain *, 4> UpdatedPreds;
2746 assert(LoopChain.UnscheduledPredecessors == 0 &&
2747 "LoopChain should not have unscheduled predecessors.");
2748 UpdatedPreds.insert(&LoopChain);
2749
2750 for (const MachineBasicBlock *LoopBB : LoopBlockSet)
2751 fillWorkLists(LoopBB, UpdatedPreds, &LoopBlockSet);
2752
2753 buildChain(LoopTop, LoopChain, &LoopBlockSet);
2754
2755 if (RotateLoopWithProfile)
2756 rotateLoopWithProfile(LoopChain, L, LoopBlockSet);
2757 else
2758 rotateLoop(LoopChain, PreferredLoopExit, ExitFreq, LoopBlockSet);
2759
2760 LLVM_DEBUG({
2761 // Crash at the end so we get all of the debugging output first.
2762 bool BadLoop = false;
2763 if (LoopChain.UnscheduledPredecessors) {
2764 BadLoop = true;
2765 dbgs() << "Loop chain contains a block without its preds placed!\n"
2766 << " Loop header: " << getBlockName(*L.block_begin()) << "\n"
2767 << " Chain header: " << getBlockName(*LoopChain.begin()) << "\n";
2768 }
2769 for (MachineBasicBlock *ChainBB : LoopChain) {
2770 dbgs() << " ... " << getBlockName(ChainBB) << "\n";
2771 if (!LoopBlockSet.remove(ChainBB)) {
2772 // We don't mark the loop as bad here because there are real situations
2773 // where this can occur. For example, with an unanalyzable fallthrough
2774 // from a loop block to a non-loop block or vice versa.
2775 dbgs() << "Loop chain contains a block not contained by the loop!\n"
2776 << " Loop header: " << getBlockName(*L.block_begin()) << "\n"
2777 << " Chain header: " << getBlockName(*LoopChain.begin()) << "\n"
2778 << " Bad block: " << getBlockName(ChainBB) << "\n";
2779 }
2780 }
2781
2782 if (!LoopBlockSet.empty()) {
2783 BadLoop = true;
2784 for (const MachineBasicBlock *LoopBB : LoopBlockSet)
2785 dbgs() << "Loop contains blocks never placed into a chain!\n"
2786 << " Loop header: " << getBlockName(*L.block_begin()) << "\n"
2787 << " Chain header: " << getBlockName(*LoopChain.begin()) << "\n"
2788 << " Bad block: " << getBlockName(LoopBB) << "\n";
2789 }
2790 assert(!BadLoop && "Detected problems with the placement of this loop.");
2791 });
2792
2793 BlockWorkList.clear();
2794 EHPadWorkList.clear();
2795}
2796
2797void MachineBlockPlacement::buildCFGChains() {
2798 // Ensure that every BB in the function has an associated chain to simplify
2799 // the assumptions of the remaining algorithm.
2800 SmallVector<MachineOperand, 4> Cond; // For analyzeBranch.
2801 for (MachineFunction::iterator FI = F->begin(), FE = F->end(); FI != FE;
2802 ++FI) {
2803 MachineBasicBlock *BB = &*FI;
2804 BlockChain *Chain =
2805 new (ChainAllocator.Allocate()) BlockChain(BlockToChain, BB);
2806 // Also, merge any blocks which we cannot reason about and must preserve
2807 // the exact fallthrough behavior for.
2808 while (true) {
2809 Cond.clear();
2810 MachineBasicBlock *TBB = nullptr, *FBB = nullptr; // For analyzeBranch.
2811 if (!TII->analyzeBranch(*BB, TBB, FBB, Cond) || !FI->canFallThrough())
2812 break;
2813
2814 MachineFunction::iterator NextFI = std::next(FI);
2815 MachineBasicBlock *NextBB = &*NextFI;
2816 // Ensure that the layout successor is a viable block, as we know that
2817 // fallthrough is a possibility.
2818 assert(NextFI != FE && "Can't fallthrough past the last block.");
2819 LLVM_DEBUG(dbgs() << "Pre-merging due to unanalyzable fallthrough: "
2820 << getBlockName(BB) << " -> " << getBlockName(NextBB)
2821 << "\n");
2822 Chain->merge(NextBB, nullptr);
2823#ifndef NDEBUG
2824 BlocksWithUnanalyzableExits.insert(&*BB);
2825#endif
2826 FI = NextFI;
2827 BB = NextBB;
2828 }
2829 }
2830
2831 // Build any loop-based chains.
2832 PreferredLoopExit = nullptr;
2833 for (MachineLoop *L : *MLI)
2834 buildLoopChains(*L);
2835
2836 assert(BlockWorkList.empty() &&
2837 "BlockWorkList should be empty before building final chain.");
2838 assert(EHPadWorkList.empty() &&
2839 "EHPadWorkList should be empty before building final chain.");
2840
2841 SmallPtrSet<BlockChain *, 4> UpdatedPreds;
2842 for (MachineBasicBlock &MBB : *F)
2843 fillWorkLists(&MBB, UpdatedPreds);
2844
2845 BlockChain &FunctionChain = *BlockToChain[&F->front()];
2846 buildChain(&F->front(), FunctionChain);
2847
2848#ifndef NDEBUG
2849 using FunctionBlockSetType = SmallPtrSet<MachineBasicBlock *, 16>;
2850#endif
2851 LLVM_DEBUG({
2852 // Crash at the end so we get all of the debugging output first.
2853 bool BadFunc = false;
2854 FunctionBlockSetType FunctionBlockSet;
2855 for (MachineBasicBlock &MBB : *F)
2856 FunctionBlockSet.insert(&MBB);
2857
2858 for (MachineBasicBlock *ChainBB : FunctionChain)
2859 if (!FunctionBlockSet.erase(ChainBB)) {
2860 BadFunc = true;
2861 dbgs() << "Function chain contains a block not in the function!\n"
2862 << " Bad block: " << getBlockName(ChainBB) << "\n";
2863 }
2864
2865 if (!FunctionBlockSet.empty()) {
2866 BadFunc = true;
2867 for (MachineBasicBlock *RemainingBB : FunctionBlockSet)
2868 dbgs() << "Function contains blocks never placed into a chain!\n"
2869 << " Bad block: " << getBlockName(RemainingBB) << "\n";
2870 }
2871 assert(!BadFunc && "Detected problems with the block placement.");
2872 });
2873
2874 // Remember original layout ordering, so we can update terminators after
2875 // reordering to point to the original layout successor.
2876 SmallVector<MachineBasicBlock *, 4> OriginalLayoutSuccessors(
2877 F->getNumBlockIDs());
2878 {
2879 MachineBasicBlock *LastMBB = nullptr;
2880 for (auto &MBB : *F) {
2881 if (LastMBB != nullptr)
2882 OriginalLayoutSuccessors[LastMBB->getNumber()] = &MBB;
2883 LastMBB = &MBB;
2884 }
2885 OriginalLayoutSuccessors[F->back().getNumber()] = nullptr;
2886 }
2887
2888 // Splice the blocks into place.
2889 MachineFunction::iterator InsertPos = F->begin();
2890 LLVM_DEBUG(dbgs() << "[MBP] Function: " << F->getName() << "\n");
2891 for (MachineBasicBlock *ChainBB : FunctionChain) {
2892 LLVM_DEBUG(dbgs() << (ChainBB == *FunctionChain.begin() ? "Placing chain "
2893 : " ... ")
2894 << getBlockName(ChainBB) << "\n");
2895 if (InsertPos != MachineFunction::iterator(ChainBB))
2896 F->splice(InsertPos, ChainBB);
2897 else
2898 ++InsertPos;
2899
2900 // Update the terminator of the previous block.
2901 if (ChainBB == *FunctionChain.begin())
2902 continue;
2903 MachineBasicBlock *PrevBB = &*std::prev(MachineFunction::iterator(ChainBB));
2904
2905 // FIXME: It would be awesome of updateTerminator would just return rather
2906 // than assert when the branch cannot be analyzed in order to remove this
2907 // boiler plate.
2908 Cond.clear();
2909 MachineBasicBlock *TBB = nullptr, *FBB = nullptr; // For analyzeBranch.
2910
2911#ifndef NDEBUG
2912 if (!BlocksWithUnanalyzableExits.count(PrevBB)) {
2913 // Given the exact block placement we chose, we may actually not _need_ to
2914 // be able to edit PrevBB's terminator sequence, but not being _able_ to
2915 // do that at this point is a bug.
2916 assert((!TII->analyzeBranch(*PrevBB, TBB, FBB, Cond) ||
2917 !PrevBB->canFallThrough()) &&
2918 "Unexpected block with un-analyzable fallthrough!");
2919 Cond.clear();
2920 TBB = FBB = nullptr;
2921 }
2922#endif
2923
2924 // The "PrevBB" is not yet updated to reflect current code layout, so,
2925 // o. it may fall-through to a block without explicit "goto" instruction
2926 // before layout, and no longer fall-through it after layout; or
2927 // o. just opposite.
2928 //
2929 // analyzeBranch() may return erroneous value for FBB when these two
2930 // situations take place. For the first scenario FBB is mistakenly set NULL;
2931 // for the 2nd scenario, the FBB, which is expected to be NULL, is
2932 // mistakenly pointing to "*BI".
2933 // Thus, if the future change needs to use FBB before the layout is set, it
2934 // has to correct FBB first by using the code similar to the following:
2935 //
2936 // if (!Cond.empty() && (!FBB || FBB == ChainBB)) {
2937 // PrevBB->updateTerminator();
2938 // Cond.clear();
2939 // TBB = FBB = nullptr;
2940 // if (TII->analyzeBranch(*PrevBB, TBB, FBB, Cond)) {
2941 // // FIXME: This should never take place.
2942 // TBB = FBB = nullptr;
2943 // }
2944 // }
2945 if (!TII->analyzeBranch(*PrevBB, TBB, FBB, Cond)) {
2946 PrevBB->updateTerminator(OriginalLayoutSuccessors[PrevBB->getNumber()]);
2947 }
2948 }
2949
2950 // Fixup the last block.
2951 Cond.clear();
2952 MachineBasicBlock *TBB = nullptr, *FBB = nullptr; // For analyzeBranch.
2953 if (!TII->analyzeBranch(F->back(), TBB, FBB, Cond)) {
2954 MachineBasicBlock *PrevBB = &F->back();
2955 PrevBB->updateTerminator(OriginalLayoutSuccessors[PrevBB->getNumber()]);
2956 }
2957
2958 BlockWorkList.clear();
2959 EHPadWorkList.clear();
2960}
2961
2962void MachineBlockPlacement::optimizeBranches() {
2963 BlockChain &FunctionChain = *BlockToChain[&F->front()];
2965
2966 // Now that all the basic blocks in the chain have the proper layout,
2967 // make a final call to analyzeBranch with AllowModify set.
2968 // Indeed, the target may be able to optimize the branches in a way we
2969 // cannot because all branches may not be analyzable.
2970 // E.g., the target may be able to remove an unconditional branch to
2971 // a fallthrough when it occurs after predicated terminators.
2972 for (MachineBasicBlock *ChainBB : FunctionChain) {
2973 Cond.clear();
2974 MachineBasicBlock *TBB = nullptr, *FBB = nullptr;
2975 if (TII->analyzeBranch(*ChainBB, TBB, FBB, Cond, /*AllowModify*/ true))
2976 continue;
2977 if (!TBB || !FBB || Cond.empty())
2978 continue;
2979 // If we are optimizing for size we do not consider the runtime performance.
2980 // Instead, we retain the original branch condition so we have more uniform
2981 // instructions which will benefit ICF.
2982 if (llvm::shouldOptimizeForSize(ChainBB, PSI, MBFI.get()))
2983 continue;
2984 // If ChainBB has a two-way branch, try to re-order the branches
2985 // such that we branch to the successor with higher probability first.
2986 if (MBPI->getEdgeProbability(ChainBB, TBB) >=
2987 MBPI->getEdgeProbability(ChainBB, FBB))
2988 continue;
2990 continue;
2991 LLVM_DEBUG(dbgs() << "Reverse order of the two branches: "
2992 << getBlockName(ChainBB) << "\n");
2993 LLVM_DEBUG(dbgs() << " " << getBlockName(TBB) << " < " << getBlockName(FBB)
2994 << "\n");
2995 auto Dl = ChainBB->findBranchDebugLoc();
2996 TII->removeBranch(*ChainBB);
2997 TII->insertBranch(*ChainBB, FBB, TBB, Cond, Dl);
2998 }
2999}
3000
3001void MachineBlockPlacement::alignBlocks() {
3002 // Walk through the backedges of the function now that we have fully laid out
3003 // the basic blocks and align the destination of each backedge. We don't rely
3004 // exclusively on the loop info here so that we can align backedges in
3005 // unnatural CFGs and backedges that were introduced purely because of the
3006 // loop rotations done during this layout pass.
3008 if (F->getFunction().hasMinSize() ||
3009 (F->getFunction().hasOptSize() && !TLI->alignLoopsWithOptSize()))
3010 return;
3011 }
3012
3013 BlockChain &FunctionChain = *BlockToChain[&F->front()];
3014 // Empty chain.
3015 if (FunctionChain.begin() == FunctionChain.end())
3016 return;
3017
3018 const BranchProbability ColdProb(1, 5); // 20%
3019 BlockFrequency EntryFreq = MBFI->getBlockFreq(&F->front());
3020 BlockFrequency WeightedEntryFreq = EntryFreq * ColdProb;
3021 for (MachineBasicBlock *ChainBB : FunctionChain) {
3022 if (ChainBB == *FunctionChain.begin())
3023 continue;
3024
3025 // Don't align non-looping basic blocks. These are unlikely to execute
3026 // enough times to matter in practice. Note that we'll still handle
3027 // unnatural CFGs inside of a natural outer loop (the common case) and
3028 // rotated loops.
3029 MachineLoop *L = MLI->getLoopFor(ChainBB);
3030 if (!L)
3031 continue;
3032
3033 const Align TLIAlign = TLI->getPrefLoopAlignment(L);
3034 unsigned MDAlign = 1;
3035 MDNode *LoopID = L->getLoopID();
3036 if (LoopID) {
3037 for (const MDOperand &MDO : llvm::drop_begin(LoopID->operands())) {
3038 MDNode *MD = dyn_cast<MDNode>(MDO);
3039 if (MD == nullptr)
3040 continue;
3041 MDString *S = dyn_cast<MDString>(MD->getOperand(0));
3042 if (S == nullptr)
3043 continue;
3044 if (S->getString() == "llvm.loop.align") {
3045 assert(MD->getNumOperands() == 2 &&
3046 "per-loop align metadata should have two operands.");
3047 MDAlign =
3048 mdconst::extract<ConstantInt>(MD->getOperand(1))->getZExtValue();
3049 assert(MDAlign >= 1 && "per-loop align value must be positive.");
3050 }
3051 }
3052 }
3053
3054 // Use max of the TLIAlign and MDAlign
3055 const Align LoopAlign = std::max(TLIAlign, Align(MDAlign));
3056 if (LoopAlign == 1)
3057 continue; // Don't care about loop alignment.
3058
3059 // If the block is cold relative to the function entry don't waste space
3060 // aligning it.
3061 BlockFrequency Freq = MBFI->getBlockFreq(ChainBB);
3062 if (Freq < WeightedEntryFreq)
3063 continue;
3064
3065 // If the block is cold relative to its loop header, don't align it
3066 // regardless of what edges into the block exist.
3067 MachineBasicBlock *LoopHeader = L->getHeader();
3068 BlockFrequency LoopHeaderFreq = MBFI->getBlockFreq(LoopHeader);
3069 if (Freq < (LoopHeaderFreq * ColdProb))
3070 continue;
3071
3072 // If the global profiles indicates so, don't align it.
3073 if (llvm::shouldOptimizeForSize(ChainBB, PSI, MBFI.get()) &&
3074 !TLI->alignLoopsWithOptSize())
3075 continue;
3076
3077 // Check for the existence of a non-layout predecessor which would benefit
3078 // from aligning this block.
3079 MachineBasicBlock *LayoutPred =
3080 &*std::prev(MachineFunction::iterator(ChainBB));
3081
3082 auto DetermineMaxAlignmentPadding = [&]() {
3083 // Set the maximum bytes allowed to be emitted for alignment.
3084 unsigned MaxBytes;
3085 if (MaxBytesForAlignmentOverride.getNumOccurrences() > 0)
3087 else
3088 MaxBytes = TLI->getMaxPermittedBytesForAlignment(ChainBB);
3089 ChainBB->setMaxBytesForAlignment(MaxBytes);
3090 };
3091
3092 // Force alignment if all the predecessors are jumps. We already checked
3093 // that the block isn't cold above.
3094 if (!LayoutPred->isSuccessor(ChainBB)) {
3095 ChainBB->setAlignment(LoopAlign);
3096 DetermineMaxAlignmentPadding();
3097 continue;
3098 }
3099
3100 // Align this block if the layout predecessor's edge into this block is
3101 // cold relative to the block. When this is true, other predecessors make up
3102 // all of the hot entries into the block and thus alignment is likely to be
3103 // important.
3104 BranchProbability LayoutProb =
3105 MBPI->getEdgeProbability(LayoutPred, ChainBB);
3106 BlockFrequency LayoutEdgeFreq = MBFI->getBlockFreq(LayoutPred) * LayoutProb;
3107 if (LayoutEdgeFreq <= (Freq * ColdProb)) {
3108 ChainBB->setAlignment(LoopAlign);
3109 DetermineMaxAlignmentPadding();
3110 }
3111 }
3112
3113 const bool HasMaxBytesOverride =
3114 MaxBytesForAlignmentOverride.getNumOccurrences() > 0;
3115
3116 if (AlignAllBlock)
3117 // Align all of the blocks in the function to a specific alignment.
3118 for (MachineBasicBlock &MBB : *F) {
3119 if (HasMaxBytesOverride)
3122 else
3124 }
3125 else if (AlignAllNonFallThruBlocks) {
3126 // Align all of the blocks that have no fall-through predecessors to a
3127 // specific alignment.
3128 for (auto MBI = std::next(F->begin()), MBE = F->end(); MBI != MBE; ++MBI) {
3129 auto LayoutPred = std::prev(MBI);
3130 if (!LayoutPred->isSuccessor(&*MBI)) {
3131 if (HasMaxBytesOverride)
3132 MBI->setAlignment(Align(1ULL << AlignAllNonFallThruBlocks),
3134 else
3135 MBI->setAlignment(Align(1ULL << AlignAllNonFallThruBlocks));
3136 }
3137 }
3138 }
3139}
3140
3141/// Tail duplicate \p BB into (some) predecessors if profitable, repeating if
3142/// it was duplicated into its chain predecessor and removed.
3143/// \p BB - Basic block that may be duplicated.
3144///
3145/// \p LPred - Chosen layout predecessor of \p BB.
3146/// Updated to be the chain end if LPred is removed.
3147/// \p Chain - Chain to which \p LPred belongs, and \p BB will belong.
3148/// \p BlockFilter - Set of blocks that belong to the loop being laid out.
3149/// Used to identify which blocks to update predecessor
3150/// counts.
3151/// \p PrevUnplacedBlockIt - Iterator pointing to the last block that was
3152/// chosen in the given order due to unnatural CFG
3153/// only needed if \p BB is removed and
3154/// \p PrevUnplacedBlockIt pointed to \p BB.
3155/// @return true if \p BB was removed.
3156bool MachineBlockPlacement::repeatedlyTailDuplicateBlock(
3157 MachineBasicBlock *BB, MachineBasicBlock *&LPred,
3158 const MachineBasicBlock *LoopHeaderBB, BlockChain &Chain,
3159 BlockFilterSet *BlockFilter, MachineFunction::iterator &PrevUnplacedBlockIt,
3160 BlockFilterSet::iterator &PrevUnplacedBlockInFilterIt) {
3161 bool Removed, DuplicatedToLPred;
3162 bool DuplicatedToOriginalLPred;
3163 Removed = maybeTailDuplicateBlock(
3164 BB, LPred, Chain, BlockFilter, PrevUnplacedBlockIt,
3165 PrevUnplacedBlockInFilterIt, DuplicatedToLPred);
3166 if (!Removed)
3167 return false;
3168 DuplicatedToOriginalLPred = DuplicatedToLPred;
3169 // Iteratively try to duplicate again. It can happen that a block that is
3170 // duplicated into is still small enough to be duplicated again.
3171 // No need to call markBlockSuccessors in this case, as the blocks being
3172 // duplicated from here on are already scheduled.
3173 while (DuplicatedToLPred && Removed) {
3174 MachineBasicBlock *DupBB, *DupPred;
3175 // The removal callback causes Chain.end() to be updated when a block is
3176 // removed. On the first pass through the loop, the chain end should be the
3177 // same as it was on function entry. On subsequent passes, because we are
3178 // duplicating the block at the end of the chain, if it is removed the
3179 // chain will have shrunk by one block.
3180 BlockChain::iterator ChainEnd = Chain.end();
3181 DupBB = *(--ChainEnd);
3182 // Now try to duplicate again.
3183 if (ChainEnd == Chain.begin())
3184 break;
3185 DupPred = *std::prev(ChainEnd);
3186 Removed = maybeTailDuplicateBlock(
3187 DupBB, DupPred, Chain, BlockFilter, PrevUnplacedBlockIt,
3188 PrevUnplacedBlockInFilterIt, DuplicatedToLPred);
3189 }
3190 // If BB was duplicated into LPred, it is now scheduled. But because it was
3191 // removed, markChainSuccessors won't be called for its chain. Instead we
3192 // call markBlockSuccessors for LPred to achieve the same effect. This must go
3193 // at the end because repeating the tail duplication can increase the number
3194 // of unscheduled predecessors.
3195 LPred = *std::prev(Chain.end());
3196 if (DuplicatedToOriginalLPred)
3197 markBlockSuccessors(Chain, LPred, LoopHeaderBB, BlockFilter);
3198 return true;
3199}
3200
3201/// Tail duplicate \p BB into (some) predecessors if profitable.
3202/// \p BB - Basic block that may be duplicated
3203/// \p LPred - Chosen layout predecessor of \p BB
3204/// \p Chain - Chain to which \p LPred belongs, and \p BB will belong.
3205/// \p BlockFilter - Set of blocks that belong to the loop being laid out.
3206/// Used to identify which blocks to update predecessor
3207/// counts.
3208/// \p PrevUnplacedBlockIt - Iterator pointing to the last block that was
3209/// chosen in the given order due to unnatural CFG
3210/// only needed if \p BB is removed and
3211/// \p PrevUnplacedBlockIt pointed to \p BB.
3212/// \p DuplicatedToLPred - True if the block was duplicated into LPred.
3213/// \return - True if the block was duplicated into all preds and removed.
3214bool MachineBlockPlacement::maybeTailDuplicateBlock(
3215 MachineBasicBlock *BB, MachineBasicBlock *LPred, BlockChain &Chain,
3216 BlockFilterSet *BlockFilter, MachineFunction::iterator &PrevUnplacedBlockIt,
3217 BlockFilterSet::iterator &PrevUnplacedBlockInFilterIt,
3218 bool &DuplicatedToLPred) {
3219 DuplicatedToLPred = false;
3220 if (!shouldTailDuplicate(BB))
3221 return false;
3222
3223 LLVM_DEBUG(dbgs() << "Redoing tail duplication for Succ#" << BB->getNumber()
3224 << "\n");
3225
3226 // This has to be a callback because none of it can be done after
3227 // BB is deleted.
3228 bool Removed = false;
3229 auto RemovalCallback = [&](MachineBasicBlock *RemBB) {
3230 // Signal to outer function
3231 Removed = true;
3232
3233 // Remove from the Chain and Chain Map
3234 if (auto It = BlockToChain.find(RemBB); It != BlockToChain.end()) {
3235 It->second->remove(RemBB);
3236 BlockToChain.erase(It);
3237 }
3238
3239 // Handle the unplaced block iterator
3240 if (&(*PrevUnplacedBlockIt) == RemBB) {
3241 PrevUnplacedBlockIt++;
3242 }
3243
3244 // Handle the Work Lists
3245 if (RemBB->isEHPad()) {
3246 llvm::erase(EHPadWorkList, RemBB);
3247 } else {
3248 llvm::erase(BlockWorkList, RemBB);
3249 }
3250
3251 // Handle the filter set
3252 if (BlockFilter) {
3253 auto It = llvm::find(*BlockFilter, RemBB);
3254 // Erase RemBB from BlockFilter, and keep PrevUnplacedBlockInFilterIt
3255 // pointing to the same element as before.
3256 if (It != BlockFilter->end()) {
3257 if (It < PrevUnplacedBlockInFilterIt) {
3258 const MachineBasicBlock *PrevBB = *PrevUnplacedBlockInFilterIt;
3259 // BlockFilter is a SmallVector so all elements after RemBB are
3260 // shifted to the front by 1 after its deletion.
3261 auto Distance = PrevUnplacedBlockInFilterIt - It - 1;
3262 PrevUnplacedBlockInFilterIt = BlockFilter->erase(It) + Distance;
3263 assert(*PrevUnplacedBlockInFilterIt == PrevBB);
3264 (void)PrevBB;
3265 } else if (It == PrevUnplacedBlockInFilterIt)
3266 // The block pointed by PrevUnplacedBlockInFilterIt is erased, we
3267 // have to set it to the next element.
3268 PrevUnplacedBlockInFilterIt = BlockFilter->erase(It);
3269 else
3270 BlockFilter->erase(It);
3271 }
3272 }
3273
3274 // Remove the block from loop info.
3275 MLI->removeBlock(RemBB);
3276 if (RemBB == PreferredLoopExit)
3277 PreferredLoopExit = nullptr;
3278
3279 LLVM_DEBUG(dbgs() << "TailDuplicator deleted block: " << getBlockName(RemBB)
3280 << "\n");
3281 };
3282 auto RemovalCallbackRef =
3283 function_ref<void(MachineBasicBlock *)>(RemovalCallback);
3284
3286 bool IsSimple = TailDup.isSimpleBB(BB);
3288 SmallVectorImpl<MachineBasicBlock *> *CandidatePtr = nullptr;
3289 if (F->getFunction().hasProfileData()) {
3290 // We can do partial duplication with precise profile information.
3291 findDuplicateCandidates(CandidatePreds, BB, BlockFilter);
3292 if (CandidatePreds.size() == 0)
3293 return false;
3294 if (CandidatePreds.size() < BB->pred_size())
3295 CandidatePtr = &CandidatePreds;
3296 }
3297 TailDup.tailDuplicateAndUpdate(IsSimple, BB, LPred, &DuplicatedPreds,
3298 &RemovalCallbackRef, CandidatePtr);
3299
3300 // Update UnscheduledPredecessors to reflect tail-duplication.
3301 DuplicatedToLPred = false;
3302 for (MachineBasicBlock *Pred : DuplicatedPreds) {
3303 // We're only looking for unscheduled predecessors that match the filter.
3304 BlockChain *PredChain = BlockToChain[Pred];
3305 if (Pred == LPred)
3306 DuplicatedToLPred = true;
3307 if (Pred == LPred || (BlockFilter && !BlockFilter->count(Pred)) ||
3308 PredChain == &Chain)
3309 continue;
3310 for (MachineBasicBlock *NewSucc : Pred->successors()) {
3311 if (BlockFilter && !BlockFilter->count(NewSucc))
3312 continue;
3313 BlockChain *NewChain = BlockToChain[NewSucc];
3314 if (NewChain != &Chain && NewChain != PredChain)
3315 NewChain->UnscheduledPredecessors++;
3316 }
3317 }
3318 return Removed;
3319}
3320
3321// Count the number of actual machine instructions.
3323 uint64_t InstrCount = 0;
3324 for (MachineInstr &MI : *MBB) {
3325 if (!MI.isPHI() && !MI.isMetaInstruction())
3326 InstrCount += 1;
3327 }
3328 return InstrCount;
3329}
3330
3331// The size cost of duplication is the instruction size of the duplicated block.
3332// So we should scale the threshold accordingly. But the instruction size is not
3333// available on all targets, so we use the number of instructions instead.
3334BlockFrequency MachineBlockPlacement::scaleThreshold(MachineBasicBlock *BB) {
3335 return BlockFrequency(DupThreshold.getFrequency() * countMBBInstruction(BB));
3336}
3337
3338// Returns true if BB is Pred's best successor.
3339bool MachineBlockPlacement::isBestSuccessor(MachineBasicBlock *BB,
3340 MachineBasicBlock *Pred,
3341 BlockFilterSet *BlockFilter) {
3342 if (BB == Pred)
3343 return false;
3344 if (BlockFilter && !BlockFilter->count(Pred))
3345 return false;
3346 BlockChain *PredChain = BlockToChain[Pred];
3347 if (PredChain && (Pred != *std::prev(PredChain->end())))
3348 return false;
3349
3350 // Find the successor with largest probability excluding BB.
3351 BranchProbability BestProb = BranchProbability::getZero();
3352 for (MachineBasicBlock *Succ : Pred->successors())
3353 if (Succ != BB) {
3354 if (BlockFilter && !BlockFilter->count(Succ))
3355 continue;
3356 BlockChain *SuccChain = BlockToChain[Succ];
3357 if (SuccChain && (Succ != *SuccChain->begin()))
3358 continue;
3359 BranchProbability SuccProb = MBPI->getEdgeProbability(Pred, Succ);
3360 if (SuccProb > BestProb)
3361 BestProb = SuccProb;
3362 }
3363
3364 BranchProbability BBProb = MBPI->getEdgeProbability(Pred, BB);
3365 if (BBProb <= BestProb)
3366 return false;
3367
3368 // Compute the number of reduced taken branches if Pred falls through to BB
3369 // instead of another successor. Then compare it with threshold.
3370 BlockFrequency PredFreq = getBlockCountOrFrequency(Pred);
3371 BlockFrequency Gain = PredFreq * (BBProb - BestProb);
3372 return Gain > scaleThreshold(BB);
3373}
3374
3375// Find out the predecessors of BB and BB can be beneficially duplicated into
3376// them.
3377void MachineBlockPlacement::findDuplicateCandidates(
3378 SmallVectorImpl<MachineBasicBlock *> &Candidates, MachineBasicBlock *BB,
3379 BlockFilterSet *BlockFilter) {
3380 MachineBasicBlock *Fallthrough = nullptr;
3381 BranchProbability DefaultBranchProb = BranchProbability::getZero();
3382 BlockFrequency BBDupThreshold(scaleThreshold(BB));
3385
3386 // Sort for highest frequency.
3387 auto CmpSucc = [&](MachineBasicBlock *A, MachineBasicBlock *B) {
3388 return MBPI->getEdgeProbability(BB, A) > MBPI->getEdgeProbability(BB, B);
3389 };
3390 auto CmpPred = [&](MachineBasicBlock *A, MachineBasicBlock *B) {
3391 return MBFI->getBlockFreq(A) > MBFI->getBlockFreq(B);
3392 };
3393 llvm::stable_sort(Succs, CmpSucc);
3394 llvm::stable_sort(Preds, CmpPred);
3395
3396 auto SuccIt = Succs.begin();
3397 if (SuccIt != Succs.end()) {
3398 DefaultBranchProb = MBPI->getEdgeProbability(BB, *SuccIt).getCompl();
3399 }
3400
3401 // For each predecessors of BB, compute the benefit of duplicating BB,
3402 // if it is larger than the threshold, add it into Candidates.
3403 //
3404 // If we have following control flow.
3405 //
3406 // PB1 PB2 PB3 PB4
3407 // \ | / /\
3408 // \ | / / \
3409 // \ |/ / \
3410 // BB----/ OB
3411 // /\
3412 // / \
3413 // SB1 SB2
3414 //
3415 // And it can be partially duplicated as
3416 //
3417 // PB2+BB
3418 // | PB1 PB3 PB4
3419 // | | / /\
3420 // | | / / \
3421 // | |/ / \
3422 // | BB----/ OB
3423 // |\ /|
3424 // | X |
3425 // |/ \|
3426 // SB2 SB1
3427 //
3428 // The benefit of duplicating into a predecessor is defined as
3429 // Orig_taken_branch - Duplicated_taken_branch
3430 //
3431 // The Orig_taken_branch is computed with the assumption that predecessor
3432 // jumps to BB and the most possible successor is laid out after BB.
3433 //
3434 // The Duplicated_taken_branch is computed with the assumption that BB is
3435 // duplicated into PB, and one successor is layout after it (SB1 for PB1 and
3436 // SB2 for PB2 in our case). If there is no available successor, the combined
3437 // block jumps to all BB's successor, like PB3 in this example.
3438 //
3439 // If a predecessor has multiple successors, so BB can't be duplicated into
3440 // it. But it can beneficially fall through to BB, and duplicate BB into other
3441 // predecessors.
3442 for (MachineBasicBlock *Pred : Preds) {
3443 BlockFrequency PredFreq = getBlockCountOrFrequency(Pred);
3444
3445 if (!TailDup.canTailDuplicate(BB, Pred)) {
3446 // BB can't be duplicated into Pred, but it is possible to be layout
3447 // below Pred.
3448 if (!Fallthrough && isBestSuccessor(BB, Pred, BlockFilter)) {
3449 Fallthrough = Pred;
3450 if (SuccIt != Succs.end())
3451 SuccIt++;
3452 }
3453 continue;
3454 }
3455
3456 BlockFrequency OrigCost = PredFreq + PredFreq * DefaultBranchProb;
3457 BlockFrequency DupCost;
3458 if (SuccIt == Succs.end()) {
3459 // Jump to all successors;
3460 if (Succs.size() > 0)
3461 DupCost += PredFreq;
3462 } else {
3463 // Fallthrough to *SuccIt, jump to all other successors;
3464 DupCost += PredFreq;
3465 DupCost -= PredFreq * MBPI->getEdgeProbability(BB, *SuccIt);
3466 }
3467
3468 assert(OrigCost >= DupCost);
3469 OrigCost -= DupCost;
3470 if (OrigCost > BBDupThreshold) {
3471 Candidates.push_back(Pred);
3472 if (SuccIt != Succs.end())
3473 SuccIt++;
3474 }
3475 }
3476
3477 // No predecessors can optimally fallthrough to BB.
3478 // So we can change one duplication into fallthrough.
3479 if (!Fallthrough) {
3480 if ((Candidates.size() < Preds.size()) && (Candidates.size() > 0)) {
3481 Candidates[0] = Candidates.back();
3482 Candidates.pop_back();
3483 }
3484 }
3485}
3486
3487void MachineBlockPlacement::initTailDupThreshold() {
3488 DupThreshold = BlockFrequency(0);
3489 if (F->getFunction().hasProfileData()) {
3490 // We prefer to use prifile count.
3491 uint64_t HotThreshold = PSI->getOrCompHotCountThreshold();
3492 if (HotThreshold != UINT64_MAX) {
3493 UseProfileCount = true;
3494 DupThreshold =
3495 BlockFrequency(HotThreshold * TailDupProfilePercentThreshold / 100);
3496 } else {
3497 // Profile count is not available, we can use block frequency instead.
3498 BlockFrequency MaxFreq = BlockFrequency(0);
3499 for (MachineBasicBlock &MBB : *F) {
3500 BlockFrequency Freq = MBFI->getBlockFreq(&MBB);
3501 if (Freq > MaxFreq)
3502 MaxFreq = Freq;
3503 }
3504
3505 BranchProbability ThresholdProb(TailDupPlacementPenalty, 100);
3506 DupThreshold = BlockFrequency(MaxFreq * ThresholdProb);
3507 UseProfileCount = false;
3508 }
3509 }
3510
3511 TailDupSize = TailDupPlacementThreshold;
3512 // If only the aggressive threshold is explicitly set, use it.
3513 if (TailDupPlacementAggressiveThreshold.getNumOccurrences() != 0 &&
3514 TailDupPlacementThreshold.getNumOccurrences() == 0)
3516
3517 // For aggressive optimization, we can adjust some thresholds to be less
3518 // conservative.
3519 if (OptLevel >= CodeGenOptLevel::Aggressive) {
3520 // At O3 we should be more willing to copy blocks for tail duplication. This
3521 // increases size pressure, so we only do it at O3
3522 // Do this unless only the regular threshold is explicitly set.
3523 if (TailDupPlacementThreshold.getNumOccurrences() == 0 ||
3524 TailDupPlacementAggressiveThreshold.getNumOccurrences() != 0)
3526 }
3527
3528 // If there's no threshold provided through options, query the target
3529 // information for a threshold instead.
3530 if (TailDupPlacementThreshold.getNumOccurrences() == 0 &&
3531 (OptLevel < CodeGenOptLevel::Aggressive ||
3532 TailDupPlacementAggressiveThreshold.getNumOccurrences() == 0))
3533 TailDupSize = TII->getTailDuplicateSize(OptLevel);
3534}
3535
3536PreservedAnalyses
3539 auto *MBPI = &MFAM.getResult<MachineBranchProbabilityAnalysis>(MF);
3540 auto MBFI = std::make_unique<MBFIWrapper>(
3542 auto *MLI = &MFAM.getResult<MachineLoopAnalysis>(MF);
3543 auto *MPDT = MachineBlockPlacement::allowTailDupPlacement(MF)
3545 : nullptr;
3547 .getCachedResult<ProfileSummaryAnalysis>(
3548 *MF.getFunction().getParent());
3549 if (!PSI)
3550 report_fatal_error("MachineBlockPlacement requires ProfileSummaryAnalysis",
3551 false);
3552 MachineBlockPlacement MBP(MBPI, MLI, PSI, std::move(MBFI), MPDT,
3553 AllowTailMerge);
3554
3555 if (MBP.run(MF))
3557
3558 return PreservedAnalyses::all();
3559}
3560
3562 raw_ostream &OS,
3563 function_ref<StringRef(StringRef)> MapClassName2PassName) const {
3564 OS << MapClassName2PassName(name());
3565 if (!AllowTailMerge)
3566 OS << "<no-tail-merge>";
3567}
3568
3569bool MachineBlockPlacement::run(MachineFunction &MF) {
3570
3571 // Check for single-block functions and skip them.
3572 if (std::next(MF.begin()) == MF.end())
3573 return false;
3574
3575 F = &MF;
3576 OptLevel = F->getTarget().getOptLevel();
3577
3578 TII = MF.getSubtarget().getInstrInfo();
3579 TLI = MF.getSubtarget().getTargetLowering();
3580
3581 // Initialize PreferredLoopExit to nullptr here since it may never be set if
3582 // there are no MachineLoops.
3583 PreferredLoopExit = nullptr;
3584
3585 assert(BlockToChain.empty() &&
3586 "BlockToChain map should be empty before starting placement.");
3587 assert(ComputedEdges.empty() &&
3588 "Computed Edge map should be empty before starting placement.");
3589
3590 // Initialize tail duplication thresholds.
3591 initTailDupThreshold();
3592
3593 const bool OptForSize =
3594 llvm::shouldOptimizeForSize(&MF, PSI, &MBFI->getMBFI());
3595 // Determine whether to use ext-tsp for perf/size optimization. The method
3596 // is beneficial only for instances with at least 3 basic blocks and it can be
3597 // disabled for huge functions (exceeding a certain size).
3598 bool UseExtTspForPerf = false;
3599 bool UseExtTspForSize = false;
3600 if (3 <= MF.size() && MF.size() <= ExtTspBlockPlacementMaxBlocks) {
3601 UseExtTspForSize = OptForSize && ApplyExtTspForSize;
3602 UseExtTspForPerf =
3603 !UseExtTspForSize && EnableExtTspBlockPlacement &&
3605 }
3606
3607 // Apply tail duplication.
3608 if (allowTailDupPlacement(*F)) {
3609 if (OptForSize)
3610 TailDupSize = 1;
3611 const bool PreRegAlloc = false;
3612 TailDup.initMF(MF, PreRegAlloc, MBPI, MBFI.get(), PSI,
3613 /* LayoutMode */ true, TailDupSize);
3614 if (!UseExtTspForSize)
3615 precomputeTriangleChains();
3616 }
3617
3618 // Run the main block placement.
3619 if (!UseExtTspForSize)
3620 buildCFGChains();
3621
3622 // Changing the layout can create new tail merging opportunities.
3623 // TailMerge can create jump into if branches that make CFG irreducible for
3624 // HW that requires structured CFG.
3625 const bool EnableTailMerge = !MF.getTarget().requiresStructuredCFG() &&
3626 AllowTailMerge && BranchFoldPlacement &&
3627 MF.size() > 3;
3628 // No tail merging opportunities if the block number is less than four.
3629 if (EnableTailMerge) {
3630 const unsigned TailMergeSize = TailDupSize + 1;
3631 BranchFolder BF(/*DefaultEnableTailMerge=*/true, /*CommonHoist=*/false,
3632 *MBFI, *MBPI, PSI, TailMergeSize);
3633
3634 if (BF.OptimizeFunction(MF, TII, MF.getSubtarget().getRegisterInfo(), MLI,
3635 /*AfterPlacement=*/true)) {
3636 // Must redo the post-dominator tree if blocks were changed.
3637 if (MPDT)
3638 MPDT->recalculate(MF);
3639 if (!UseExtTspForSize) {
3640 // Redo the layout if tail merging creates/removes/moves blocks.
3641 BlockToChain.clear();
3642 ComputedEdges.clear();
3643 ChainAllocator.DestroyAll();
3644 buildCFGChains();
3645 }
3646 }
3647 }
3648
3649 // Apply a post-processing optimizing block placement:
3650 // - find a new placement and modify the layout of the blocks in the function;
3651 // - re-create CFG chains so that we can optimizeBranches and alignBlocks.
3652 if (UseExtTspForPerf || UseExtTspForSize) {
3653 assert(
3654 !(UseExtTspForPerf && UseExtTspForSize) &&
3655 "UseExtTspForPerf and UseExtTspForSize can not be set simultaneously");
3656 applyExtTsp(/*OptForSize=*/UseExtTspForSize);
3657 createCFGChainExtTsp();
3658 }
3659
3660 optimizeBranches();
3661 alignBlocks();
3662
3663 BlockToChain.clear();
3664 ComputedEdges.clear();
3665 ChainAllocator.DestroyAll();
3666
3667 // View the function.
3669 (ViewBlockFreqFuncName.empty() ||
3670 F->getFunction().getName() == ViewBlockFreqFuncName)) {
3672 MF.RenumberBlocks();
3673 MBFI->view("MBP." + MF.getName(), false);
3674 }
3675
3676 // We always return true as we have no way to track whether the final order
3677 // differs from the original order.
3678 return true;
3679}
3680
3681void MachineBlockPlacement::applyExtTsp(bool OptForSize) {
3682 // Prepare data; blocks are indexed by their index in the current ordering.
3683 DenseMap<const MachineBasicBlock *, uint64_t> BlockIndex;
3684 BlockIndex.reserve(F->size());
3685 std::vector<const MachineBasicBlock *> CurrentBlockOrder;
3686 CurrentBlockOrder.reserve(F->size());
3687 size_t NumBlocks = 0;
3688 for (const MachineBasicBlock &MBB : *F) {
3689 BlockIndex[&MBB] = NumBlocks++;
3690 CurrentBlockOrder.push_back(&MBB);
3691 }
3692
3693 SmallVector<uint64_t, 0> BlockCounts(F->size());
3694 SmallVector<uint64_t, 0> BlockSizes(F->size());
3696 SmallVector<MachineOperand, 4> Cond; // For analyzeBranch.
3698 for (MachineBasicBlock &MBB : *F) {
3699 // Getting the block frequency.
3700 BlockFrequency BlockFreq = MBFI->getBlockFreq(&MBB);
3701 BlockCounts[BlockIndex[&MBB]] = OptForSize ? 1 : BlockFreq.getFrequency();
3702 // Getting the block size:
3703 // - approximate the size of an instruction by 4 bytes, and
3704 // - ignore debug instructions.
3705 // Note: getting the exact size of each block is target-dependent and can be
3706 // done by extending the interface of MCCodeEmitter. Experimentally we do
3707 // not see a perf improvement with the exact block sizes.
3708 auto NonDbgInsts =
3710 size_t NumInsts = std::distance(NonDbgInsts.begin(), NonDbgInsts.end());
3711 BlockSizes[BlockIndex[&MBB]] = 4 * NumInsts;
3712
3713 // Getting jump frequencies.
3714 if (OptForSize) {
3715 Cond.clear();
3716 MachineBasicBlock *TBB = nullptr, *FBB = nullptr; // For analyzeBranch.
3717 if (TII->analyzeBranch(MBB, TBB, FBB, Cond))
3718 continue;
3719
3720 const MachineBasicBlock *FTB = MBB.getFallThrough();
3721 // Succs is a collection of distinct destinations of the block reachable
3722 // from MBB via a jump instruction; initialize the list using the three
3723 // (non-necessarily distinct) blocks, FTB, TBB, and FBB.
3724 Succs.clear();
3725 if (TBB && TBB != FTB)
3726 Succs.push_back(TBB);
3727 if (FBB && FBB != FTB)
3728 Succs.push_back(FBB);
3729 if (FTB)
3730 Succs.push_back(FTB);
3731 // Absolute magnitude of non-zero counts does not matter for the
3732 // optimization; prioritize slightly jumps with a single successor, since
3733 // the corresponding jump instruction will be removed from the binary.
3734 const uint64_t Freq = Succs.size() == 1 ? 110 : 100;
3735 for (const MachineBasicBlock *Succ : Succs)
3736 JumpCounts.push_back({BlockIndex[&MBB], BlockIndex[Succ], Freq});
3737 } else {
3738 for (MachineBasicBlock *Succ : MBB.successors()) {
3739 auto EP = MBPI->getEdgeProbability(&MBB, Succ);
3740 BlockFrequency JumpFreq = BlockFreq * EP;
3741 JumpCounts.push_back(
3742 {BlockIndex[&MBB], BlockIndex[Succ], JumpFreq.getFrequency()});
3743 }
3744 }
3745 }
3746
3747 LLVM_DEBUG(dbgs() << "Applying ext-tsp layout for |V| = " << F->size()
3748 << " with profile = " << F->getFunction().hasProfileData()
3749 << " (" << F->getName() << ")" << "\n");
3750
3751 const double OrgScore = calcExtTspScore(BlockSizes, JumpCounts);
3752 LLVM_DEBUG(dbgs() << format(" original layout score: %0.2f\n", OrgScore));
3753
3754 // Run the layout algorithm.
3755 auto NewOrder = computeExtTspLayout(BlockSizes, BlockCounts, JumpCounts);
3756 std::vector<const MachineBasicBlock *> NewBlockOrder;
3757 NewBlockOrder.reserve(F->size());
3758 for (uint64_t Node : NewOrder) {
3759 NewBlockOrder.push_back(CurrentBlockOrder[Node]);
3760 }
3761 const double OptScore = calcExtTspScore(NewOrder, BlockSizes, JumpCounts);
3762 LLVM_DEBUG(dbgs() << format(" optimized layout score: %0.2f\n", OptScore));
3763
3764 // If the optimization is unsuccessful, fall back to the original block order.
3765 if (OptForSize && OrgScore > OptScore)
3766 assignBlockOrder(CurrentBlockOrder);
3767 else
3768 assignBlockOrder(NewBlockOrder);
3769}
3770
3771void MachineBlockPlacement::assignBlockOrder(
3772 const std::vector<const MachineBasicBlock *> &NewBlockOrder) {
3773 assert(F->size() == NewBlockOrder.size() && "Incorrect size of block order");
3774 F->RenumberBlocks();
3775
3776 bool HasChanges = false;
3777 for (size_t I = 0; I < NewBlockOrder.size(); I++) {
3778 if (NewBlockOrder[I] != F->getBlockNumbered(I)) {
3779 HasChanges = true;
3780 break;
3781 }
3782 }
3783 // Stop early if the new block order is identical to the existing one.
3784 if (!HasChanges)
3785 return;
3786
3787 SmallVector<MachineBasicBlock *, 4> PrevFallThroughs(F->getNumBlockIDs());
3788 for (auto &MBB : *F) {
3789 PrevFallThroughs[MBB.getNumber()] = MBB.getFallThrough();
3790 }
3791
3792 // Sort basic blocks in the function according to the computed order.
3793 DenseMap<const MachineBasicBlock *, size_t> NewIndex;
3794 for (const MachineBasicBlock *MBB : NewBlockOrder) {
3795 NewIndex[MBB] = NewIndex.size();
3796 }
3797 F->sort([&](MachineBasicBlock &L, MachineBasicBlock &R) {
3798 return NewIndex[&L] < NewIndex[&R];
3799 });
3800
3801 // Update basic block branches by inserting explicit fallthrough branches
3802 // when required and re-optimize branches when possible.
3803 const TargetInstrInfo *TII = F->getSubtarget().getInstrInfo();
3805 for (auto &MBB : *F) {
3806 MachineFunction::iterator NextMBB = std::next(MBB.getIterator());
3808 auto *FTMBB = PrevFallThroughs[MBB.getNumber()];
3809 // If this block had a fallthrough before we need an explicit unconditional
3810 // branch to that block if the fallthrough block is not adjacent to the
3811 // block in the new order.
3812 if (FTMBB && (NextMBB == EndIt || &*NextMBB != FTMBB)) {
3813 TII->insertUnconditionalBranch(MBB, FTMBB, MBB.findBranchDebugLoc());
3814 }
3815
3816 // It might be possible to optimize branches by flipping the condition.
3817 Cond.clear();
3818 MachineBasicBlock *TBB = nullptr, *FBB = nullptr;
3819 if (TII->analyzeBranch(MBB, TBB, FBB, Cond))
3820 continue;
3821 MBB.updateTerminator(FTMBB);
3822 }
3823}
3824
3825void MachineBlockPlacement::createCFGChainExtTsp() {
3826 BlockToChain.clear();
3827 ComputedEdges.clear();
3828 ChainAllocator.DestroyAll();
3829
3830 MachineBasicBlock *HeadBB = &F->front();
3831 BlockChain *FunctionChain =
3832 new (ChainAllocator.Allocate()) BlockChain(BlockToChain, HeadBB);
3833
3834 for (MachineBasicBlock &MBB : *F) {
3835 if (HeadBB == &MBB)
3836 continue; // Ignore head of the chain
3837 FunctionChain->merge(&MBB, nullptr);
3838 }
3839}
3840
3841namespace {
3842
3843/// A pass to compute block placement statistics.
3844///
3845/// A separate pass to compute interesting statistics for evaluating block
3846/// placement. This is separate from the actual placement pass so that they can
3847/// be computed in the absence of any placement transformations or when using
3848/// alternative placement strategies.
3849class MachineBlockPlacementStats {
3850 /// A handle to the branch probability pass.
3851 const MachineBranchProbabilityInfo *MBPI;
3852
3853 /// A handle to the function-wide block frequency pass.
3854 const MachineBlockFrequencyInfo *MBFI;
3855
3856public:
3857 MachineBlockPlacementStats(const MachineBranchProbabilityInfo *MBPI,
3858 const MachineBlockFrequencyInfo *MBFI)
3859 : MBPI(MBPI), MBFI(MBFI) {}
3860 bool run(MachineFunction &MF);
3861};
3862
3863class MachineBlockPlacementStatsLegacy : public MachineFunctionPass {
3864public:
3865 static char ID; // Pass identification, replacement for typeid
3866
3867 MachineBlockPlacementStatsLegacy() : MachineFunctionPass(ID) {}
3868
3869 bool runOnMachineFunction(MachineFunction &F) override {
3870 auto *MBPI =
3871 &getAnalysis<MachineBranchProbabilityInfoWrapperPass>().getMBPI();
3872 auto *MBFI = &getAnalysis<MachineBlockFrequencyInfoWrapperPass>().getMBFI();
3873 return MachineBlockPlacementStats(MBPI, MBFI).run(F);
3874 }
3875
3876 void getAnalysisUsage(AnalysisUsage &AU) const override {
3877 AU.addRequired<MachineBranchProbabilityInfoWrapperPass>();
3878 AU.addRequired<MachineBlockFrequencyInfoWrapperPass>();
3879 AU.setPreservesAll();
3881 }
3882};
3883
3884} // end anonymous namespace
3885
3886char MachineBlockPlacementStatsLegacy::ID = 0;
3887
3888char &llvm::MachineBlockPlacementStatsID = MachineBlockPlacementStatsLegacy::ID;
3889
3890INITIALIZE_PASS_BEGIN(MachineBlockPlacementStatsLegacy, "block-placement-stats",
3891 "Basic Block Placement Stats", false, false)
3894INITIALIZE_PASS_END(MachineBlockPlacementStatsLegacy, "block-placement-stats",
3895 "Basic Block Placement Stats", false, false)
3896
3900 auto &MBPI = MFAM.getResult<MachineBranchProbabilityAnalysis>(MF);
3901 auto &MBFI = MFAM.getResult<MachineBlockFrequencyAnalysis>(MF);
3902
3903 MachineBlockPlacementStats(&MBPI, &MBFI).run(MF);
3904 return PreservedAnalyses::all();
3905}
3906
3907bool MachineBlockPlacementStats::run(MachineFunction &F) {
3908 // Check for single-block functions and skip them.
3909 if (std::next(F.begin()) == F.end())
3910 return false;
3911
3912 if (!isFunctionInPrintList(F.getName()))
3913 return false;
3914
3915 for (MachineBasicBlock &MBB : F) {
3916 BlockFrequency BlockFreq = MBFI->getBlockFreq(&MBB);
3917 Statistic &NumBranches =
3918 (MBB.succ_size() > 1) ? NumCondBranches : NumUncondBranches;
3919 Statistic &BranchTakenFreq =
3920 (MBB.succ_size() > 1) ? CondBranchTakenFreq : UncondBranchTakenFreq;
3921 for (MachineBasicBlock *Succ : MBB.successors()) {
3922 // Skip if this successor is a fallthrough.
3923 if (MBB.isLayoutSuccessor(Succ))
3924 continue;
3925
3926 BlockFrequency EdgeFreq =
3927 BlockFreq * MBPI->getEdgeProbability(&MBB, Succ);
3928 ++NumBranches;
3929 BranchTakenFreq += EdgeFreq.getFrequency();
3930 }
3931 }
3932
3933 return false;
3934}
assert(UImm &&(UImm !=~static_cast< T >(0)) &&"Invalid immediate!")
MachineBasicBlock & MBB
This file defines the BumpPtrAllocator interface.
#define X(NUM, ENUM, NAME)
Definition ELF.h:853
static cl::opt< unsigned > TailMergeSize("tail-merge-size", cl::desc("Min number of instructions to consider tail merging"), cl::init(3), cl::Hidden)
static GCRegistry::Add< ErlangGC > A("erlang", "erlang-compatible garbage collector")
static GCRegistry::Add< CoreCLRGC > E("coreclr", "CoreCLR-compatible GC")
static GCRegistry::Add< OcamlGC > B("ocaml", "ocaml 3.10-compatible GC")
Declares methods and data structures for code layout algorithms.
#define LLVM_DUMP_METHOD
Mark debug helper function definitions like dump() that should not be stripped from debug builds.
Definition Compiler.h:661
static unsigned InstrCount
This file defines the DenseMap class.
#define DEBUG_TYPE
const HexagonInstrInfo * TII
IRTranslator LLVM IR MI
static LoopDeletionResult merge(LoopDeletionResult A, LoopDeletionResult B)
#define F(x, y, z)
Definition MD5.cpp:54
#define I(x, y, z)
Definition MD5.cpp:57
static BranchProbability getAdjustedProbability(BranchProbability OrigProb, BranchProbability AdjustedSumProb)
The helper function returns the branch probability that is adjusted or normalized over the new total ...
static cl::opt< bool > PreciseRotationCost("precise-rotation-cost", cl::desc("Model the cost of loop rotation more " "precisely by using profile data."), cl::init(false), cl::Hidden)
static cl::opt< unsigned > ExtTspBlockPlacementMaxBlocks("ext-tsp-block-placement-max-blocks", cl::desc("Maximum number of basic blocks in a function to run ext-TSP " "block placement."), cl::init(UINT_MAX), cl::Hidden)
static cl::opt< unsigned > AlignAllBlock("align-all-blocks", cl::desc("Force the alignment of all blocks in the function in log2 format " "(e.g 4 means align on 16B boundaries)."), cl::init(0), cl::Hidden)
static cl::opt< unsigned > PredecessorLimit("block-placement-predecessor-limit", cl::desc("For blocks with more predecessors, certain layout optimizations" "will be disabled to prevent quadratic compile time."), cl::init(1000), cl::Hidden)
static BranchProbability getLayoutSuccessorProbThreshold(const MachineBasicBlock *BB)
static cl::opt< bool > ForceLoopColdBlock("force-loop-cold-block", cl::desc("Force outlining cold blocks from loops."), cl::init(false), cl::Hidden)
static cl::opt< unsigned > ExitBlockBias("block-placement-exit-block-bias", cl::desc("Block frequency percentage a loop exit block needs " "over the original exit to be considered the new exit."), cl::init(0), cl::Hidden)
static cl::opt< unsigned > AlignAllNonFallThruBlocks("align-all-nofallthru-blocks", cl::desc("Force the alignment of all blocks that have no fall-through " "predecessors (i.e. don't add nops that are executed). In log2 " "format (e.g 4 means align on 16B boundaries)."), cl::init(0), cl::Hidden)
static cl::opt< unsigned > TailDupPlacementThreshold("tail-dup-placement-threshold", cl::desc("Instruction cutoff for tail duplication during layout. " "Tail merging during layout is forced to have a threshold " "that won't conflict."), cl::init(2), cl::Hidden)
static cl::opt< unsigned > JumpInstCost("jump-inst-cost", cl::desc("Cost of jump instructions."), cl::init(1), cl::Hidden)
static cl::opt< unsigned > TailDupPlacementPenalty("tail-dup-placement-penalty", cl::desc("Cost penalty for blocks that can avoid breaking CFG by copying. " "Copying can increase fallthrough, but it also increases icache " "pressure. This parameter controls the penalty to account for that. " "Percent as integer."), cl::init(2), cl::Hidden)
static bool greaterWithBias(BlockFrequency A, BlockFrequency B, BlockFrequency EntryFreq)
Compare 2 BlockFrequency's with a small penalty for A.
static cl::opt< unsigned > MisfetchCost("misfetch-cost", cl::desc("Cost that models the probabilistic risk of an instruction " "misfetch due to a jump comparing to falling through, whose cost " "is zero."), cl::init(1), cl::Hidden)
static cl::opt< unsigned > MaxBytesForAlignmentOverride("max-bytes-for-alignment", cl::desc("Forces the maximum bytes allowed to be emitted when padding for " "alignment"), cl::init(0), cl::Hidden)
static cl::opt< bool > BranchFoldPlacement("branch-fold-placement", cl::desc("Perform branch folding during placement. " "Reduces code size."), cl::init(true), cl::Hidden)
static cl::opt< unsigned > TailDupProfilePercentThreshold("tail-dup-profile-percent-threshold", cl::desc("If profile count information is used in tail duplication cost " "model, the gained fall through number from tail duplication " "should be at least this percent of hot count."), cl::init(50), cl::Hidden)
static cl::opt< unsigned > TriangleChainCount("triangle-chain-count", cl::desc("Number of triangle-shaped-CFG's that need to be in a row for the " "triangle tail duplication heuristic to kick in. 0 to disable."), cl::init(2), cl::Hidden)
Branch Probability Basic Block static false std::string getBlockName(const MachineBasicBlock *BB)
Helper to print the name of a MBB.
static cl::opt< bool > ApplyExtTspForSize("apply-ext-tsp-for-size", cl::init(false), cl::Hidden, cl::desc("Use ext-tsp for size-aware block placement."))
static bool hasSameSuccessors(MachineBasicBlock &BB, SmallPtrSetImpl< const MachineBasicBlock * > &Successors)
Check if BB has exactly the successors in Successors.
static cl::opt< bool > TailDupPlacement("tail-dup-placement", cl::desc("Perform tail duplication during placement. " "Creates more fallthrough opportunities in " "outline branches."), cl::init(true), cl::Hidden)
static uint64_t countMBBInstruction(MachineBasicBlock *MBB)
static cl::opt< unsigned > LoopToColdBlockRatio("loop-to-cold-block-ratio", cl::desc("Outline loop blocks from loop chain if (frequency of loop) / " "(frequency of block) is greater than this ratio"), cl::init(5), cl::Hidden)
static cl::opt< bool > RenumberBlocksBeforeView("renumber-blocks-before-view", cl::desc("If true, basic blocks are re-numbered before MBP layout is printed " "into a dot graph. Only used when a function is being printed."), cl::init(false), cl::Hidden)
static cl::opt< unsigned > TailDupPlacementAggressiveThreshold("tail-dup-placement-aggressive-threshold", cl::desc("Instruction cutoff for aggressive tail duplication during " "layout. Used at -O3. Tail merging during layout is forced to " "have a threshold that won't conflict."), cl::init(4), cl::Hidden)
static cl::opt< bool > ForcePreciseRotationCost("force-precise-rotation-cost", cl::desc("Force the use of precise cost " "loop rotation strategy."), cl::init(false), cl::Hidden)
#define P(N)
static bool hasProfileData(const Function &F, const FunctionOutliningInfo &OI)
#define INITIALIZE_PASS_DEPENDENCY(depName)
Definition PassSupport.h:42
#define INITIALIZE_PASS_END(passName, arg, name, cfg, analysis)
Definition PassSupport.h:44
#define INITIALIZE_PASS_BEGIN(passName, arg, name, cfg, analysis)
Definition PassSupport.h:39
const SmallVectorImpl< MachineOperand > MachineBasicBlock * TBB
const SmallVectorImpl< MachineOperand > & Cond
This file contains some templates that are useful if you are working with the STL at all.
static const char * name
This file implements a set that has insertion order iteration characteristics.
This file defines the SmallPtrSet class.
This file defines the SmallVector class.
This file defines the 'Statistic' class, which is designed to be an easy way to expose various metric...
#define STATISTIC(VARNAME, DESC)
Definition Statistic.h:171
#define LLVM_DEBUG(...)
Definition Debug.h:119
static TableGen::Emitter::Opt Y("gen-skeleton-entry", EmitSkeleton, "Generate example skeleton entry")
This file describes how to lower LLVM code to machine code.
Target-Independent Code Generator Pass Configuration Options pass.
PassT::Result & getResult(IRUnitT &IR, ExtraArgTs... ExtraArgs)
Get the result of an analysis pass for a given IR unit.
AnalysisUsage & addRequired()
void setPreservesAll()
Set by analyses that do not transform their input at all.
static BlockFrequency max()
Returns the maximum possible frequency, the saturation value.
uint64_t getFrequency() const
Returns the frequency as a fixpoint number scaled by the entry frequency.
static BranchProbability getOne()
uint32_t getNumerator() const
BranchProbability getCompl() const
static BranchProbability getZero()
ValueT lookup(const_arg_type_t< KeyT > Val) const
Return the entry for the specified key, or a default constructed value if no such entry exists.
Definition DenseMap.h:205
iterator find(const_arg_type_t< KeyT > Val)
Definition DenseMap.h:178
std::pair< iterator, bool > try_emplace(KeyT &&Key, Ts &&...Args)
Definition DenseMap.h:254
bool erase(const KeyT &Val)
Definition DenseMap.h:328
unsigned size() const
Definition DenseMap.h:110
bool empty() const
Definition DenseMap.h:109
iterator end()
Definition DenseMap.h:81
bool contains(const_arg_type_t< KeyT > Val) const
Return true if the specified key is in the map, false otherwise.
Definition DenseMap.h:169
std::pair< iterator, bool > insert(const std::pair< KeyT, ValueT > &KV)
Definition DenseMap.h:239
void reserve(size_type NumEntries)
Grow the densemap so that it can contain at least NumEntries items before resizing again.
Definition DenseMap.h:114
bool dominates(const DomTreeNodeBase< NodeT > *A, const DomTreeNodeBase< NodeT > *B) const
dominates - Returns true iff A dominates B.
void recalculate(ParentType &Func)
recalculate - compute a dominator tree for the given function
bool hasProfileData(bool IncludeSynthetic=false) const
Return true if the function is annotated with profile data.
Definition Function.h:336
Module * getParent()
Get the module that this global value is contained inside of...
unsigned removeBranch(MachineBasicBlock &MBB, int *BytesRemoved=nullptr) const override
Remove the branching code at the end of the specific MBB.
bool analyzeBranch(MachineBasicBlock &MBB, MachineBasicBlock *&TBB, MachineBasicBlock *&FBB, SmallVectorImpl< MachineOperand > &Cond, bool AllowModify) const override
Analyze the branching code at the end of MBB, returning true if it cannot be understood (e....
bool reverseBranchCondition(SmallVectorImpl< MachineOperand > &Cond) const override
Reverses the branch condition of the specified condition list, returning false on success and true if...
unsigned insertBranch(MachineBasicBlock &MBB, MachineBasicBlock *TBB, MachineBasicBlock *FBB, ArrayRef< MachineOperand > Cond, const DebugLoc &DL, int *BytesAdded=nullptr) const override
Insert branch code into the end of the specified MachineBasicBlock.
LoopT * getLoopFor(const BlockT *BB) const
Return the inner most loop that BB lives in.
const MDOperand & getOperand(unsigned I) const
Definition Metadata.h:1444
ArrayRef< MDOperand > operands() const
Definition Metadata.h:1442
unsigned getNumOperands() const
Return number of MDNode operands.
Definition Metadata.h:1450
LLVM_ABI StringRef getString() const
Definition Metadata.cpp:632
bool isEHPad() const
Returns true if the block is a landing pad.
LLVM_ABI MachineBasicBlock * getFallThrough(bool JumpToFallThrough=true)
Return the fallthrough block if the block can implicitly transfer control to the block after it by fa...
int getNumber() const
MachineBasicBlocks are uniquely numbered at the function level, unless they're not in a MachineFuncti...
LLVM_ABI void updateTerminator(MachineBasicBlock *PreviousLayoutSuccessor)
Update the terminator instructions in block to account for changes to block layout which may have bee...
LLVM_ABI bool canFallThrough()
Return true if the block can implicitly transfer control to the block after it by falling off the end...
void setAlignment(Align A)
Set alignment of the basic block.
LLVM_ABI bool isEntryBlock() const
Returns true if this is the entry block of the function.
succ_reverse_iterator succ_rbegin()
LLVM_ABI bool isLayoutSuccessor(const MachineBasicBlock *MBB) const
Return true if the specified MBB will be emitted immediately after this block, such that if this bloc...
const MachineFunction * getParent() const
Return the MachineFunction containing this basic block.
LLVM_ABI instr_iterator erase(instr_iterator I)
Remove an instruction from the instruction list and delete it.
LLVM_ABI DebugLoc findBranchDebugLoc()
Find and return the merged DebugLoc of the branch instructions of the block.
iterator_range< succ_iterator > successors()
LLVM_ABI bool isSuccessor(const MachineBasicBlock *MBB) const
Return true if the specified MBB is a successor of this block.
iterator_range< pred_iterator > predecessors()
LLVM_ABI StringRef getName() const
Return the name of the corresponding LLVM basic block, or an empty string.
LLVM_ABI Result run(MachineFunction &MF, MachineFunctionAnalysisManager &MFAM)
PreservedAnalyses run(MachineFunction &MF, MachineFunctionAnalysisManager &MFAM)
void printPipeline(raw_ostream &OS, function_ref< StringRef(StringRef)> MapClassName2PassName) const
PreservedAnalyses run(MachineFunction &MF, MachineFunctionAnalysisManager &MFAM)
BranchProbability getEdgeProbability(const MachineBasicBlock *Src, const MachineBasicBlock *Dst) const
MachineFunctionPass - This class adapts the FunctionPass interface to allow convenient creation of pa...
void getAnalysisUsage(AnalysisUsage &AU) const override
getAnalysisUsage - Subclasses that override getAnalysisUsage must call this.
const TargetSubtargetInfo & getSubtarget() const
getSubtarget - Return the subtarget for which this machine code is being compiled.
StringRef getName() const
getName - Return the name of the corresponding LLVM function.
Function & getFunction()
Return the LLVM function that this machine code represents.
BasicBlockListType::iterator iterator
void RenumberBlocks(MachineBasicBlock *MBBFrom=nullptr)
RenumberBlocks - This discards all of the MachineBasicBlock numbers and recomputes them.
const TargetMachine & getTarget() const
getTarget - Return the target machine this machine code is compiled with
Representation of each machine instruction.
Analysis pass that exposes the MachineLoopInfo for a machine function.
A set of analyses that are preserved following a run of a transformation pass.
Definition Analysis.h:112
static PreservedAnalyses all()
Construct a special preserved set that preserves all passes.
Definition Analysis.h:118
An analysis pass based on legacy pass manager to deliver ProfileSummaryInfo.
LLVM_ABI uint64_t getOrCompHotCountThreshold() const
Returns HotCountThreshold if set.
typename vector_type::const_iterator iterator
Definition SetVector.h:72
size_type size() const
Definition SmallPtrSet.h:99
A templated base class for SmallPtrSet which provides the typesafe interface that is common across al...
size_type count(ConstPtrType Ptr) const
count - Return 1 if the specified pointer is in the set, 0 otherwise.
std::pair< iterator, bool > insert(PtrType Ptr)
Inserts Ptr if and only if there is no element in the container equal to Ptr.
reference emplace_back(ArgTypes &&... Args)
iterator erase(const_iterator CI)
typename SuperClass::const_iterator const_iterator
typename SuperClass::iterator iterator
void push_back(const T &Elt)
This is a 'vector' (really, a variable-sized array), optimized for the case when the array is small.
T * Allocate(size_t num=1)
Allocate space for an array of objects without constructing them.
Definition Allocator.h:440
void DestroyAll()
Call the destructor of each allocated object and deallocate all but the current slab and reset the cu...
Definition Allocator.h:411
Represent a constant reference to a string, i.e.
Definition StringRef.h:56
void initMF(MachineFunction &MF, bool PreRegAlloc, const MachineBranchProbabilityInfo *MBPI, MBFIWrapper *MBFI, ProfileSummaryInfo *PSI, bool LayoutMode, unsigned TailDupSize=0)
Prepare to run on a specific machine function.
bool tailDuplicateAndUpdate(bool IsSimple, MachineBasicBlock *MBB, MachineBasicBlock *ForcedLayoutPred, SmallVectorImpl< MachineBasicBlock * > *DuplicatedPreds=nullptr, function_ref< void(MachineBasicBlock *)> *RemovalCallback=nullptr, SmallVectorImpl< MachineBasicBlock * > *CandidatePtr=nullptr)
Tail duplicate a single basic block into its predecessors, and then clean up.
static bool isSimpleBB(MachineBasicBlock *TailBB)
True if this BB has only one unconditional jump.
bool canTailDuplicate(MachineBasicBlock *TailBB, MachineBasicBlock *PredBB)
Returns true if TailBB can successfully be duplicated into PredBB.
bool shouldTailDuplicate(bool IsSimple, MachineBasicBlock &TailBB)
Determine if it is profitable to duplicate this block.
virtual unsigned getMaxPermittedBytesForAlignment(MachineBasicBlock *MBB) const
Return the maximum amount of bytes allowed to be emitted when padding for alignment.
virtual Align getPrefLoopAlignment(MachineLoop *ML=nullptr) const
Return the preferred loop alignment.
virtual bool alignLoopsWithOptSize() const
Should loops be aligned even when the function is marked OptSize (but not MinSize).
bool requiresStructuredCFG() const
virtual const TargetInstrInfo * getInstrInfo() const
virtual const TargetRegisterInfo * getRegisterInfo() const =0
Return the target's register information.
virtual const TargetLowering * getTargetLowering() const
An efficient, type-erasing, non-owning reference to a callable.
self_iterator getIterator()
Definition ilist_node.h:123
This class implements an extremely fast bulk output stream that can only output to a stream.
Definition raw_ostream.h:53
A raw_ostream that writes to an std::string.
#define UINT64_MAX
Definition DataTypes.h:77
constexpr char Align[]
Key for Kernel::Arg::Metadata::mAlign.
unsigned ID
LLVM IR allows to use arbitrary numbers as calling convention identifiers.
Definition CallingConv.h:24
@ C
The default llvm calling convention, compatible with C.
Definition CallingConv.h:34
initializer< Ty > init(const Ty &Val)
LLVM_ABI double calcExtTspScore(ArrayRef< uint64_t > Order, ArrayRef< uint64_t > NodeSizes, ArrayRef< EdgeCount > EdgeCounts)
Estimate the "quality" of a given node order in CFG.
LLVM_ABI std::vector< uint64_t > computeExtTspLayout(ArrayRef< uint64_t > NodeSizes, ArrayRef< uint64_t > NodeCounts, ArrayRef< EdgeCount > EdgeCounts)
Find a layout of nodes (basic blocks) of a given CFG optimizing jump locality and thus processor I-ca...
DXILDebugInfoMap run(Module &M)
std::enable_if_t< detail::IsValidPointer< X, Y >::value, X * > extract(Y &&MD)
Extract a Value from Metadata.
Definition Metadata.h:668
LLVM_ABI void append(SmallVectorImpl< char > &path, const Twine &a, const Twine &b="", const Twine &c="", const Twine &d="")
Append to path.
Definition Path.cpp:457
This is an optimization pass for GlobalISel generic memory operations.
auto drop_begin(T &&RangeOrContainer, size_t N=1)
Return a range covering RangeOrContainer with the first N elements excluded.
Definition STLExtras.h:315
void dump(const SparseBitVector< ElementSize > &LHS, raw_ostream &out)
void stable_sort(R &&Range)
Definition STLExtras.h:2115
auto find(R &&Range, const T &Val)
Provide wrappers to std::find which take ranges instead of having to pass begin/end explicitly.
Definition STLExtras.h:1764
OuterAnalysisManagerProxy< ModuleAnalysisManager, MachineFunction > ModuleAnalysisManagerMachineFunctionProxy
Provide the ModuleAnalysisManager to Function proxy.
InstructionCost Cost
decltype(auto) dyn_cast(const From &Val)
dyn_cast<X> - Return the argument parameter cast to the specified type.
Definition Casting.h:643
cl::opt< bool > ApplyExtTspWithoutProfile
constexpr from_range_t from_range
LLVM_ABI bool shouldOptimizeForSize(const MachineFunction *MF, ProfileSummaryInfo *PSI, const MachineBlockFrequencyInfo *BFI, PGSOQueryType QueryType=PGSOQueryType::Other)
Returns true if machine function MF is suggested to be size-optimized based on the profile.
AnalysisManager< MachineFunction > MachineFunctionAnalysisManager
LLVM_ABI PreservedAnalyses getMachineFunctionPassPreservedAnalyses()
Returns the minimum set of Analyses that all machine function passes must preserve.
cl::opt< unsigned > ProfileLikelyProb
void erase(Container &C, ValueType V)
Wrapper function to remove a value from a container:
Definition STLExtras.h:2199
cl::opt< std::string > ViewBlockFreqFuncName("view-bfi-func-name", cl::Hidden, cl::desc("The option to specify " "the name of the function " "whose CFG will be displayed."))
NoopStatistic Statistic
Definition Statistic.h:162
auto reverse(ContainerTy &&C)
Definition STLExtras.h:407
LLVM_ABI raw_ostream & dbgs()
dbgs() - This returns a reference to a raw_ostream for debugging messages.
Definition Debug.cpp:209
cl::opt< GVDAGType > ViewBlockLayoutWithBFI("view-block-layout-with-bfi", cl::Hidden, cl::desc("Pop up a window to show a dag displaying MBP layout and associated " "block frequencies of the CFG."), cl::values(clEnumValN(GVDT_None, "none", "do not display graphs."), clEnumValN(GVDT_Fraction, "fraction", "display a graph using the " "fractional block frequency representation."), clEnumValN(GVDT_Integer, "integer", "display a graph using the raw " "integer fractional block frequency representation."), clEnumValN(GVDT_Count, "count", "display a graph using the real " "profile count if available.")))
bool isFunctionInPrintList(StringRef FunctionName)
LLVM_ABI void report_fatal_error(Error Err, bool gen_crash_diag=true)
Definition Error.cpp:163
FunctionAddr VTableAddr Count
Definition InstrProf.h:139
CodeGenOptLevel
Code generation optimization level.
Definition CodeGen.h:82
class LLVM_GSL_OWNER SmallVector
Forward declaration of SmallVector so that calculateSmallVectorDefaultInlinedElements can reference s...
auto instructionsWithoutDebug(IterT It, IterT End, bool SkipPseudoOp=true)
Construct a range iterator which begins at It and moves forwards until End is reached,...
format_object< Ts... > format(const char *Fmt, const Ts &... Vals)
These are helper functions used to produce formatted output.
Definition Format.h:129
MutableArrayRef(T &OneElt) -> MutableArrayRef< T >
auto count(R &&Range, const E &Element)
Wrapper function around std::count to count the number of times an element Element occurs in the give...
Definition STLExtras.h:2011
OutputIt move(R &&Range, OutputIt Out)
Provide wrappers to std::move which take ranges instead of having to pass begin/end explicitly.
Definition STLExtras.h:1916
cl::opt< unsigned > StaticLikelyProb
void erase_if(Container &C, UnaryPredicate P)
Provide a container algorithm similar to C++ Library Fundamentals v2's erase_if which is equivalent t...
Definition STLExtras.h:2191
LLVM_ABI Printable printBlockFreq(const BlockFrequencyInfo &BFI, BlockFrequency Freq)
Print the block frequency Freq relative to the current functions entry frequency.
LLVM_ABI char & MachineBlockPlacementID
MachineBlockPlacement - This pass places basic blocks based on branch probabilities.
cl::opt< bool > EnableExtTspBlockPlacement
LLVM_ABI char & MachineBlockPlacementStatsID
MachineBlockPlacementStats - This pass collects statistics about the basic block placement using bran...
LLVM_ABI Printable printMBBReference(const MachineBasicBlock &MBB)
Prints a machine basic block reference.
void swap(llvm::BitVector &LHS, llvm::BitVector &RHS)
Implement std::swap in terms of BitVector swap.
Definition BitVector.h:876