1//===- LoadStoreVectorizer.cpp - GPU Load & Store Vectorizer --------------===//
2//
3// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4// See https://llvm.org/LICENSE.txt for license information.
5// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6//
7//===----------------------------------------------------------------------===//
8//
9// This pass merges loads/stores to/from sequential memory addresses into vector
10// loads/stores. Although there's nothing GPU-specific in here, this pass is
11// motivated by the microarchitectural quirks of NVIDIA and AMD GPUs.
12//
13// (For simplicity below we talk about loads only, but everything also applies
14// to stores.)
15//
16// This pass is intended to be run late in the pipeline, after other
17// vectorization opportunities have been exploited. So the assumption here is
18// that immediately following our new vector load we'll need to extract out the
19// individual elements of the load, so we can operate on them individually.
20//
21// On CPUs this transformation is usually not beneficial, because extracting the
22// elements of a vector register is expensive on most architectures. It's
23// usually better just to load each element individually into its own scalar
24// register.
25//
26// However, NVIDIA and AMD GPUs don't have proper vector registers. Instead, a
27// "vector load" loads directly into a series of scalar registers. In effect,
28// extracting the elements of the vector is free. It's therefore always
29// beneficial to vectorize a sequence of loads on these architectures.
30//
31// Vectorizing (perhaps a better name might be "coalescing") loads can have
32// large performance impacts on GPU kernels, and opportunities for vectorizing
33// are common in GPU code. This pass tries very hard to find such
34// opportunities; its runtime is quadratic in the number of loads in a BB.
35//
36// Some CPU architectures, such as ARM, have instructions that load into
37// multiple scalar registers, similar to a GPU vectorized load. In theory ARM
38// could use this pass (with some modifications), but currently it implements
39// its own pass to do something similar to what we do here.
40//
41// Overview of the algorithm and terminology in this pass:
42//
43// - Break up each basic block into pseudo-BBs, composed of instructions which
44// are guaranteed to transfer control to their successors.
45// - Within a single pseudo-BB, find all loads, and group them into
46// "equivalence classes" according to getUnderlyingObject() and loaded
47// element size. Do the same for stores.
48// - For each equivalence class, greedily build "chains". Each chain has a
49// leader instruction, and every other member of the chain has a known
50// constant offset from the first instr in the chain.
51// - Break up chains so that they contain only contiguous accesses of legal
52// size with no intervening may-alias instrs.
53// - Convert each chain to vector instructions.
54//
55// The O(n^2) behavior of this pass comes from initially building the chains.
56// In the worst case we have to compare each new instruction to all of those
57// that came before. To limit this, we only calculate the offset to the leaders
58// of the N most recently-used chains.
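//
// For illustration (an IR sketch, not taken from any particular test; %p.4
// etc. stand for GEPs off %p), four contiguous i32 loads
//
//   %l0 = load i32, ptr %p
//   %l1 = load i32, ptr %p.4
//   %l2 = load i32, ptr %p.8
//   %l3 = load i32, ptr %p.12
//
// would, roughly, become
//
//   %vec = load <4 x i32>, ptr %p
//   %l0 = extractelement <4 x i32> %vec, i32 0
//   %l1 = extractelement <4 x i32> %vec, i32 1
//   %l2 = extractelement <4 x i32> %vec, i32 2
//   %l3 = extractelement <4 x i32> %vec, i32 3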
59
61#include "llvm/ADT/APInt.h"
62#include "llvm/ADT/ArrayRef.h"
63#include "llvm/ADT/DenseMap.h"
64#include "llvm/ADT/MapVector.h"
66#include "llvm/ADT/STLExtras.h"
67#include "llvm/ADT/Sequence.h"
70#include "llvm/ADT/Statistic.h"
79#include "llvm/IR/Attributes.h"
80#include "llvm/IR/BasicBlock.h"
82#include "llvm/IR/Constants.h"
83#include "llvm/IR/DataLayout.h"
85#include "llvm/IR/Dominators.h"
86#include "llvm/IR/Function.h"
88#include "llvm/IR/IRBuilder.h"
89#include "llvm/IR/InstrTypes.h"
90#include "llvm/IR/Instruction.h"
92#include "llvm/IR/LLVMContext.h"
93#include "llvm/IR/Module.h"
94#include "llvm/IR/Type.h"
95#include "llvm/IR/Value.h"
97#include "llvm/Pass.h"
100#include "llvm/Support/Debug.h"
103#include "llvm/Support/ModRef.h"
106#include <algorithm>
107#include <cassert>
108#include <cstdint>
109#include <cstdlib>
110#include <iterator>
111#include <numeric>
112#include <optional>
113#include <tuple>
114#include <type_traits>
115#include <utility>
116#include <vector>
117
118using namespace llvm;
119
120#define DEBUG_TYPE "load-store-vectorizer"
121
122STATISTIC(NumVectorInstructions, "Number of vector accesses generated");
123STATISTIC(NumScalarsVectorized, "Number of scalar accesses vectorized");
124
125namespace {
126
127// Equivalence class key, the initial tuple by which we group loads/stores.
128// Loads/stores with different EqClassKeys are never merged.
129//
130// (We could in theory remove element-size from this tuple. We'd just need
131// to fix up the vector packing/unpacking code.)
132using EqClassKey =
133 std::tuple<const Value * /* result of getUnderlyingObject() */,
134 unsigned /* AddrSpace */,
135 unsigned /* Load/Store element size bits */,
136 char /* IsLoad; char b/c bool can't be a DenseMap key */
137 >;
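// For example (illustrative), an i32 load and a <2 x i32> load through GEPs
// off the same underlying object in the same address space share one
// EqClassKey (both are loads with a 32-bit element size), while an i64 load
// or an i32 store of that object gets a different key.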
138[[maybe_unused]] llvm::raw_ostream &operator<<(llvm::raw_ostream &OS,
139 const EqClassKey &K) {
140 const auto &[UnderlyingObject, AddrSpace, ElementSize, IsLoad] = K;
141 return OS << (IsLoad ? "load" : "store") << " of " << *UnderlyingObject
142 << " of element size " << ElementSize << " bits in addrspace "
143 << AddrSpace;
144}
145
146// A Chain is a set of instructions such that:
147// - All instructions have the same equivalence class, so in particular all are
148// loads, or all are stores.
149// - We know the address accessed by the i'th chain elem relative to the
150// chain's leader instruction, which is the first instr of the chain in BB
151// order.
152//
153// Chains have two canonical orderings:
154// - BB order, sorted by Instr->comesBefore.
155// - Offset order, sorted by OffsetFromLeader.
156// This pass switches back and forth between these orders.
157struct ChainElem {
158 Instruction *Inst;
159 APInt OffsetFromLeader;
160 ChainElem(Instruction *Inst, APInt OffsetFromLeader)
161 : Inst(std::move(Inst)), OffsetFromLeader(std::move(OffsetFromLeader)) {}
162};
163using Chain = SmallVector<ChainElem, 1>;
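// For example (illustrative), stores to %p+4, %p+0, and %p+8 appearing in
// that BB order form a chain whose leader is the store to %p+4 (the first in
// BB order); the elements' OffsetFromLeader values are 0, -4, and +4, so
// offset order (%p+0, %p+4, %p+8) differs from BB order (%p+4, %p+0, %p+8).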
164
165void sortChainInBBOrder(Chain &C) {
166 sort(C, [](auto &A, auto &B) { return A.Inst->comesBefore(B.Inst); });
167}
168
169void sortChainInOffsetOrder(Chain &C) {
170 sort(C, [](const auto &A, const auto &B) {
171 if (A.OffsetFromLeader != B.OffsetFromLeader)
172 return A.OffsetFromLeader.slt(B.OffsetFromLeader);
173 return A.Inst->comesBefore(B.Inst); // stable tiebreaker
174 });
175}
176
177[[maybe_unused]] void dumpChain(ArrayRef<ChainElem> C) {
178 for (const auto &E : C) {
179 dbgs() << " " << *E.Inst << " (offset " << E.OffsetFromLeader << ")\n";
180 }
181}
182
183using EquivalenceClassMap =
185
186// FIXME: Assuming stack alignment of 4 is always good enough
187constexpr unsigned StackAdjustedAlignment = 4;
188
189Instruction *propagateMetadata(Instruction *I, const Chain &C) {
190 SmallVector<Value *, 8> Values;
191 for (const ChainElem &E : C)
192 Values.emplace_back(E.Inst);
193 return propagateMetadata(I, Values);
194}
195
196bool isInvariantLoad(const Instruction *I) {
197 const LoadInst *LI = dyn_cast<LoadInst>(I);
198 return LI != nullptr && LI->hasMetadata(LLVMContext::MD_invariant_load);
199}
200
201/// Reorders the instructions that I depends on (the instructions defining its
202/// operands), to ensure they dominate I.
203void reorder(Instruction *I) {
204 SmallPtrSet<Instruction *, 16> InstructionsToMove;
205 SmallVector<Instruction *, 16> Worklist;
206
207 Worklist.emplace_back(I);
208 while (!Worklist.empty()) {
209 Instruction *IW = Worklist.pop_back_val();
210 int NumOperands = IW->getNumOperands();
211 for (int Idx = 0; Idx < NumOperands; Idx++) {
212 Instruction *IM = dyn_cast<Instruction>(IW->getOperand(Idx));
213 if (!IM || IM->getOpcode() == Instruction::PHI)
214 continue;
215
216 // If IM is in another BB, no need to move it, because this pass only
217 // vectorizes instructions within one BB.
218 if (IM->getParent() != I->getParent())
219 continue;
220
221 assert(IM != I && "Unexpected cycle while re-ordering instructions");
222
223 if (!IM->comesBefore(I)) {
224 InstructionsToMove.insert(IM);
225 Worklist.emplace_back(IM);
226 }
227 }
228 }
229
230 // All instructions to move should follow I. Start from I, not from begin().
231 for (auto BBI = I->getIterator(), E = I->getParent()->end(); BBI != E;) {
232 Instruction *IM = &*(BBI++);
233 if (!InstructionsToMove.contains(IM))
234 continue;
235 IM->moveBefore(I->getIterator());
236 }
237}
238
239class Vectorizer {
240 Function &F;
241 AliasAnalysis &AA;
242 AssumptionCache &AC;
243 DominatorTree &DT;
244 ScalarEvolution &SE;
245 TargetTransformInfo &TTI;
246 const DataLayout &DL;
247 IRBuilder<> Builder;
248
249 /// We could erase instrs right after vectorizing them, but that can mess up
250 /// our BB iterators, and also can make the equivalence class keys point to
251 /// freed memory. This is fixable, but it's simpler just to wait until we're
252 /// done with the BB and erase all at once.
253 SmallVector<Instruction *, 128> ToErase;
254
255 /// We insert load/store instructions and GEPs to fill gaps and extend chains
256 /// to enable vectorization. Keep track and delete them later.
257 DenseSet<Instruction *> ExtraElements;
258
259public:
260 Vectorizer(Function &F, AliasAnalysis &AA, AssumptionCache &AC,
261 DominatorTree &DT, ScalarEvolution &SE, TargetTransformInfo &TTI)
262 : F(F), AA(AA), AC(AC), DT(DT), SE(SE), TTI(TTI),
263 DL(F.getDataLayout()), Builder(SE.getContext()) {}
264
265 bool run();
266
267private:
268 static const unsigned MaxDepth = 3;
269
270 /// Runs the vectorizer on a "pseudo basic block", which is a range of
271 /// instructions [Begin, End) within one BB all of which have
272 /// isGuaranteedToTransferExecutionToSuccessor(I) == true.
273 bool runOnPseudoBB(BasicBlock::iterator Begin, BasicBlock::iterator End);
274
275 /// Runs the vectorizer on one equivalence class, i.e. one set of loads/stores
276 /// in the same BB with the same value for getUnderlyingObject() etc.
277 bool runOnEquivalenceClass(const EqClassKey &EqClassKey,
278 ArrayRef<Instruction *> EqClass);
279
280 /// Runs the vectorizer on one chain, i.e. a subset of an equivalence class
281 /// where all instructions access a known, constant offset from the first
282 /// instruction.
283 bool runOnChain(Chain &C);
284
285 /// Splits the chain into subchains of instructions which read/write a
286 /// contiguous block of memory. Discards any length-1 subchains (because
287 /// there's nothing to vectorize in there). Also attempts to fill gaps with
288 /// "extra" elements to artificially make chains contiguous in some cases.
289 std::vector<Chain> splitChainByContiguity(Chain &C);
290
291 /// Splits the chain into subchains where it's safe to hoist loads up to the
292 /// beginning of the sub-chain and it's safe to sink loads up to the end of
293 /// the sub-chain. Discards any length-1 subchains. Also attempts to extend
294 /// non-power-of-two chains by adding "extra" elements in some cases.
295 std::vector<Chain> splitChainByMayAliasInstrs(Chain &C);
296
297 /// Splits the chain into subchains that make legal, aligned accesses.
298 /// Discards any length-1 subchains.
299 std::vector<Chain> splitChainByAlignment(Chain &C);
300
301 /// Converts the instrs in the chain into a single vectorized load or store.
302 /// Adds the old scalar loads/stores to ToErase.
303 bool vectorizeChain(Chain &C);
304
305 /// Tries to compute the offset in bytes PtrB - PtrA.
306 std::optional<APInt> getConstantOffset(Value *PtrA, Value *PtrB,
307 Instruction *ContextInst,
308 unsigned Depth = 0);
309 std::optional<APInt> getConstantOffsetComplexAddrs(Value *PtrA, Value *PtrB,
310 Instruction *ContextInst,
311 unsigned Depth);
312 std::optional<APInt> getConstantOffsetSelects(Value *PtrA, Value *PtrB,
313 Instruction *ContextInst,
314 unsigned Depth);
315
316 /// Gets the element type of the vector that the chain will load or store.
317 /// This is nontrivial because the chain may contain elements of different
318 /// types; e.g. it's legal to have a chain that contains both i32 and float.
319 Type *getChainElemTy(const Chain &C);
320
321 /// Determines whether ChainElem can be moved up (if IsLoad) or down (if
322 /// !IsLoad) to ChainBegin -- i.e. there are no intervening may-alias
323 /// instructions.
324 ///
325 /// The map ChainElemOffsets must contain all of the elements in
326 /// [ChainBegin, ChainElem] and their offsets from some arbitrary base
327 /// address. It's ok if it contains additional entries.
328 template <bool IsLoadChain>
329 bool isSafeToMove(
330 Instruction *ChainElem, Instruction *ChainBegin,
331 const DenseMap<Instruction *, APInt /*OffsetFromLeader*/> &ChainOffsets,
332 BatchAAResults &BatchAA);
333
334 /// Merges the equivalence classes if they have underlying objects that differ
335 /// by one level of indirection (i.e., one is a getelementptr and the other is
336 /// the base pointer in that getelementptr).
337 void mergeEquivalenceClasses(EquivalenceClassMap &EQClasses) const;
338
339 /// Collects loads and stores grouped by "equivalence class", where:
340 /// - all elements in an eq class are a load or all are a store,
341 /// - they all load/store the same element size (it's OK to have e.g. i8 and
342 /// <4 x i8> in the same class, but not i32 and <4 x i8>), and
343 /// - they all have the same value for getUnderlyingObject().
344 EquivalenceClassMap collectEquivalenceClasses(BasicBlock::iterator Begin,
345 BasicBlock::iterator End);
346
347 /// Partitions Instrs into "chains" where every instruction has a known
348 /// constant offset from the first instr in the chain.
349 ///
350 /// Postcondition: For all i, ret[i][0].OffsetFromLeader == 0, because the
351 /// first instr in the chain is the leader, and an instr is at distance 0 from itself.
352 std::vector<Chain> gatherChains(ArrayRef<Instruction *> Instrs);
353
354 /// Checks if a potential vector load/store with a given alignment is allowed
355 /// and fast. Aligned accesses are always allowed and fast, while misaligned
356 /// accesses depend on TTI checks to determine whether they can and should be
357 /// vectorized or kept as element-wise accesses.
358 bool accessIsAllowedAndFast(unsigned SizeBytes, unsigned AS, Align Alignment,
359 unsigned VecElemBits) const;
360
361 /// Create a new GEP and a new Load/Store instruction such that the GEP
362 /// is pointing at PrevElem + Offset. In the case of stores, store poison.
363 /// Extra elements will either be combined into a masked load/store or
364 /// deleted before the end of the pass.
365 ChainElem createExtraElementAfter(const ChainElem &PrevElem, Type *Ty,
366 APInt Offset, StringRef Prefix,
367 Align Alignment = Align());
368
369 /// Create a mask that masks off the extra elements in the chain, to be used
370 /// for the creation of a masked load/store vector.
371 Value *createMaskForExtraElements(const ArrayRef<ChainElem> C,
372 FixedVectorType *VecTy);
373
374 /// Delete dead GEPs and extra Load/Store instructions created by
375 /// createExtraElementAfter
376 void deleteExtraElements();
377};
378
379class LoadStoreVectorizerLegacyPass : public FunctionPass {
380public:
381 static char ID;
382
383 LoadStoreVectorizerLegacyPass() : FunctionPass(ID) {}
384
385 bool runOnFunction(Function &F) override;
386
387 StringRef getPassName() const override {
388 return "GPU Load and Store Vectorizer";
389 }
390
391 void getAnalysisUsage(AnalysisUsage &AU) const override {
392 AU.addRequired<AAResultsWrapperPass>();
393 AU.addRequired<AssumptionCacheTracker>();
394 AU.addRequired<ScalarEvolutionWrapperPass>();
395 AU.addRequired<DominatorTreeWrapperPass>();
396 AU.addRequired<TargetTransformInfoWrapperPass>();
397 AU.setPreservesCFG();
398 }
399};
400
401} // end anonymous namespace
402
403char LoadStoreVectorizerLegacyPass::ID = 0;
404
405INITIALIZE_PASS_BEGIN(LoadStoreVectorizerLegacyPass, DEBUG_TYPE,
406 "Vectorize load and store instructions", false, false)
413INITIALIZE_PASS_END(LoadStoreVectorizerLegacyPass, DEBUG_TYPE,
414 "Vectorize load and store instructions", false, false)
415
416Pass *llvm::createLoadStoreVectorizerLegacyPass() {
417 return new LoadStoreVectorizerLegacyPass();
418}
419
420bool LoadStoreVectorizerLegacyPass::runOnFunction(Function &F) {
421 // Don't vectorize when the attribute NoImplicitFloat is used.
422 if (skipFunction(F) || F.hasFnAttribute(Attribute::NoImplicitFloat))
423 return false;
424
425 AliasAnalysis &AA = getAnalysis<AAResultsWrapperPass>().getAAResults();
426 DominatorTree &DT = getAnalysis<DominatorTreeWrapperPass>().getDomTree();
427 ScalarEvolution &SE = getAnalysis<ScalarEvolutionWrapperPass>().getSE();
428 TargetTransformInfo &TTI =
429 getAnalysis<TargetTransformInfoWrapperPass>().getTTI(F);
430
431 AssumptionCache &AC =
432 getAnalysis<AssumptionCacheTracker>().getAssumptionCache(F);
433
434 return Vectorizer(F, AA, AC, DT, SE, TTI).run();
435}
436
437PreservedAnalyses LoadStoreVectorizerPass::run(Function &F,
438 FunctionAnalysisManager &AM) {
439 // Don't vectorize when the attribute NoImplicitFloat is used.
440 if (F.hasFnAttribute(Attribute::NoImplicitFloat))
441 return PreservedAnalyses::all();
442
443 AliasAnalysis &AA = AM.getResult<AAManager>(F);
444 DominatorTree &DT = AM.getResult<DominatorTreeAnalysis>(F);
445 ScalarEvolution &SE = AM.getResult<ScalarEvolutionAnalysis>(F);
446 TargetTransformInfo &TTI = AM.getResult<TargetIRAnalysis>(F);
447 AssumptionCache &AC = AM.getResult<AssumptionAnalysis>(F);
448
449 bool Changed = Vectorizer(F, AA, AC, DT, SE, TTI).run();
450 PreservedAnalyses PA;
451 PA.preserveSet<CFGAnalyses>();
452 return Changed ? PA : PreservedAnalyses::all();
453}
454
455bool Vectorizer::run() {
456 bool Changed = false;
457 // Break up the BB if there are any instrs which aren't guaranteed to transfer
458 // execution to their successor.
459 //
460 // Consider, for example:
461 //
462 // def assert_arr_len(int n) { if (n < 2) exit(); }
463 //
464 // load arr[0]
465 // call assert_array_len(arr.length)
466 // load arr[1]
467 //
468 // Even though assert_arr_len does not read or write any memory, we can't
469 // speculate the second load before the call. More info at
470 // https://github.com/llvm/llvm-project/issues/52950.
471 for (BasicBlock *BB : post_order(&F)) {
472 // BB must at least have a terminator.
473 assert(!BB->empty());
474
475 SmallVector<BasicBlock::iterator> Barriers;
476 Barriers.emplace_back(BB->begin());
477 for (Instruction &I : *BB)
478 if (!isGuaranteedToTransferExecutionToSuccessor(&I))
479 Barriers.emplace_back(I.getIterator());
480 Barriers.emplace_back(BB->end());
481
482 for (auto It = Barriers.begin(), End = std::prev(Barriers.end()); It != End;
483 ++It)
484 Changed |= runOnPseudoBB(*It, *std::next(It));
485
486 for (Instruction *I : ToErase) {
487 // These will get deleted in deleteExtraElements.
488 // This is because ExtraElements will include both extra elements
489 // that *were* vectorized and extra elements that *were not*
490 // vectorized. ToErase will only include extra elements that *were*
491 // vectorized, so in order to avoid double deletion we skip them here and
492 // handle them in deleteExtraElements.
493 if (ExtraElements.contains(I))
494 continue;
495 auto *PtrOperand = getLoadStorePointerOperand(I);
496 if (I->use_empty())
497 I->eraseFromParent();
498 RecursivelyDeleteTriviallyDeadInstructions(PtrOperand);
499 }
500 ToErase.clear();
501 deleteExtraElements();
502 }
503
504 return Changed;
505}
506
507bool Vectorizer::runOnPseudoBB(BasicBlock::iterator Begin,
508 BasicBlock::iterator End) {
509 LLVM_DEBUG({
510 dbgs() << "LSV: Running on pseudo-BB [" << *Begin << " ... ";
511 if (End != Begin->getParent()->end())
512 dbgs() << *End;
513 else
514 dbgs() << "<BB end>";
515 dbgs() << ")\n";
516 });
517
518 bool Changed = false;
519 for (const auto &[EqClassKey, EqClass] :
520 collectEquivalenceClasses(Begin, End))
521 Changed |= runOnEquivalenceClass(EqClassKey, EqClass);
522
523 return Changed;
524}
525
526bool Vectorizer::runOnEquivalenceClass(const EqClassKey &EqClassKey,
527 ArrayRef<Instruction *> EqClass) {
528 bool Changed = false;
529
530 LLVM_DEBUG({
531 dbgs() << "LSV: Running on equivalence class of size " << EqClass.size()
532 << " keyed on " << EqClassKey << ":\n";
533 for (Instruction *I : EqClass)
534 dbgs() << " " << *I << "\n";
535 });
536
537 std::vector<Chain> Chains = gatherChains(EqClass);
538 LLVM_DEBUG(dbgs() << "LSV: Got " << Chains.size()
539 << " nontrivial chains.\n";);
540 for (Chain &C : Chains)
541 Changed |= runOnChain(C);
542 return Changed;
543}
544
545bool Vectorizer::runOnChain(Chain &C) {
546 LLVM_DEBUG({
547 dbgs() << "LSV: Running on chain with " << C.size() << " instructions:\n";
548 dumpChain(C);
549 });
550
551 // Split up the chain into increasingly smaller chains, until we can finally
552 // vectorize the chains.
553 //
554 // (Don't be scared by the depth of the loop nest here. These operations are
555 // all at worst O(n lg n) in the number of instructions, and splitting chains
556 // doesn't change the number of instrs. So the whole loop nest is O(n lg n).)
557 bool Changed = false;
558 for (auto &C : splitChainByMayAliasInstrs(C))
559 for (auto &C : splitChainByContiguity(C))
560 for (auto &C : splitChainByAlignment(C))
561 Changed |= vectorizeChain(C);
562 return Changed;
563}
564
565std::vector<Chain> Vectorizer::splitChainByMayAliasInstrs(Chain &C) {
566 if (C.empty())
567 return {};
568
569 sortChainInBBOrder(C);
570
571 LLVM_DEBUG({
572 dbgs() << "LSV: splitChainByMayAliasInstrs considering chain:\n";
573 dumpChain(C);
574 });
575
576 // We know that elements in the chain with non-overlapping offsets can't
577 // alias, but AA may not be smart enough to figure this out. Use a
578 // hashtable.
579 DenseMap<Instruction *, APInt /*OffsetFromLeader*/> ChainOffsets;
580 for (const auto &E : C)
581 ChainOffsets.insert({&*E.Inst, E.OffsetFromLeader});
582
583 // Across a single invocation of this function the IR is not changing, so
584 // using a batched Alias Analysis is safe and can reduce compile time.
585 BatchAAResults BatchAA(AA);
586
587 // Loads get hoisted up to the first load in the chain. Stores get sunk
588 // down to the last store in the chain. Our algorithm for loads is:
589 //
590 // - Take the first element of the chain. This is the start of a new chain.
591 // - Take the next element of `Chain` and check for may-alias instructions
592 // up to the start of NewChain. If no may-alias instrs, add it to
593 // NewChain. Otherwise, start a new NewChain.
594 //
595 // For stores it's the same except in the reverse direction.
596 //
597 // We expect IsLoad to be an std::bool_constant.
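// For example (illustrative), given
//
//   %l0 = load i32, ptr %p
//   store i32 %x, ptr %q      ; may alias %p
//   %l1 = load i32, ptr %p.4
//
// %l1 cannot be hoisted above the store, so the chain {%l0, %l1} is split
// into {%l0} and {%l1}, and both length-1 pieces are discarded.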
598 auto Impl = [&](auto IsLoad) {
599 // MSVC is unhappy if IsLoad is a capture, so pass it as an arg.
600 auto [ChainBegin, ChainEnd] = [&](auto IsLoad) {
601 if constexpr (IsLoad())
602 return std::make_pair(C.begin(), C.end());
603 else
604 return std::make_pair(C.rbegin(), C.rend());
605 }(IsLoad);
606 assert(ChainBegin != ChainEnd);
607
608 std::vector<Chain> Chains;
609 Chain NewChain;
610 NewChain.emplace_back(*ChainBegin);
611 for (auto ChainIt = std::next(ChainBegin); ChainIt != ChainEnd; ++ChainIt) {
612 if (isSafeToMove<IsLoad>(ChainIt->Inst, NewChain.front().Inst,
613 ChainOffsets, BatchAA)) {
614 LLVM_DEBUG(dbgs() << "LSV: No intervening may-alias instrs; can merge "
615 << *ChainIt->Inst << " into " << *ChainBegin->Inst
616 << "\n");
617 NewChain.emplace_back(*ChainIt);
618 } else {
619 LLVM_DEBUG(
620 dbgs() << "LSV: Found intervening may-alias instrs; cannot merge "
621 << *ChainIt->Inst << " into " << *ChainBegin->Inst << "\n");
622 if (NewChain.size() > 1) {
623 LLVM_DEBUG({
624 dbgs() << "LSV: got nontrivial chain without aliasing instrs:\n";
625 dumpChain(NewChain);
626 });
627 Chains.emplace_back(std::move(NewChain));
628 }
629
630 // Start a new chain.
631 NewChain = SmallVector<ChainElem, 1>({*ChainIt});
632 }
633 }
634 if (NewChain.size() > 1) {
635 LLVM_DEBUG({
636 dbgs() << "LSV: got nontrivial chain without aliasing instrs:\n";
637 dumpChain(NewChain);
638 });
639 Chains.emplace_back(std::move(NewChain));
640 }
641 return Chains;
642 };
643
644 if (isa<LoadInst>(C[0].Inst))
645 return Impl(/*IsLoad=*/std::bool_constant<true>());
646
647 assert(isa<StoreInst>(C[0].Inst));
648 return Impl(/*IsLoad=*/std::bool_constant<false>());
649}
650
651std::vector<Chain> Vectorizer::splitChainByContiguity(Chain &C) {
652 if (C.empty())
653 return {};
654
655 sortChainInOffsetOrder(C);
656
657 LLVM_DEBUG({
658 dbgs() << "LSV: splitChainByContiguity considering chain:\n";
659 dumpChain(C);
660 });
661
662 // If the chain is not contiguous, we try to fill the gap with "extra"
663 // elements to artificially make it contiguous, to try to enable
664 // vectorization. We only fill gaps if there is potential to end up with a
665 // legal masked load/store given the target, address space, and element type.
666 // At this point, when querying the TTI, optimistically assume max alignment
667 // and max vector size, as splitChainByAlignment will ensure the final vector
668 // shape passes the legalization check.
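// For example (illustrative), i32 accesses at offsets 0, 4, and 12 leave a
// one-element gap at offset 8; if the target reports a legal masked
// load/store for the optimistic vector shape, an extra "GapFill" element is
// created at offset 8 so the chain becomes contiguous, and that lane is later
// masked off (or the extra instruction is deleted if the chain is never
// vectorized).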
669 unsigned AS = getLoadStoreAddressSpace(C[0].Inst);
670 Type *ElementType = getLoadStoreType(C[0].Inst)->getScalarType();
671 unsigned MaxVecRegBits = TTI.getLoadStoreVecRegBitWidth(AS);
672 Align OptimisticAlign = Align(MaxVecRegBits / 8);
673 unsigned int MaxVectorNumElems =
674 MaxVecRegBits / DL.getTypeSizeInBits(ElementType);
675 // Note: This check decides whether to try to fill gaps based on the masked
676 // legality of the target's maximum vector size (getLoadStoreVecRegBitWidth).
677 // If a target *does not* support a masked load/store with this max vector
678 // size, but *does* support a masked load/store with a *smaller* vector size,
679 // that optimization will be missed. This does not occur in any of the targets
680 // that currently support this API.
681 FixedVectorType *OptimisticVectorType =
682 FixedVectorType::get(ElementType, MaxVectorNumElems);
683 bool TryFillGaps =
684 isa<LoadInst>(C[0].Inst)
685 ? TTI.isLegalMaskedLoad(OptimisticVectorType, OptimisticAlign, AS,
686 TTI::MaskKind::ConstantMask)
687 : TTI.isLegalMaskedStore(OptimisticVectorType, OptimisticAlign, AS,
688 TTI::MaskKind::ConstantMask);
689
690 // Cache the best aligned element in the chain for use when creating extra
691 // elements.
692 Align BestAlignedElemAlign = getLoadStoreAlignment(C[0].Inst);
693 APInt OffsetOfBestAlignedElemFromLeader = C[0].OffsetFromLeader;
694 for (const auto &E : C) {
695 Align ElementAlignment = getLoadStoreAlignment(E.Inst);
696 if (ElementAlignment > BestAlignedElemAlign) {
697 BestAlignedElemAlign = ElementAlignment;
698 OffsetOfBestAlignedElemFromLeader = E.OffsetFromLeader;
699 }
700 }
701
702 auto DeriveAlignFromBestAlignedElem = [&](APInt NewElemOffsetFromLeader) {
703 return commonAlignment(
704 BestAlignedElemAlign,
705 (NewElemOffsetFromLeader - OffsetOfBestAlignedElemFromLeader)
706 .abs()
707 .getLimitedValue());
708 };
709
710 unsigned ASPtrBits = DL.getIndexSizeInBits(AS);
711
712 std::vector<Chain> Ret;
713 Ret.push_back({C.front()});
714
715 unsigned ChainElemTyBits = DL.getTypeSizeInBits(getChainElemTy(C));
716 ChainElem &Prev = C[0];
717 for (auto It = std::next(C.begin()), End = C.end(); It != End; ++It) {
718 auto &CurChain = Ret.back();
719
720 APInt PrevSzBytes =
721 APInt(ASPtrBits, DL.getTypeStoreSize(getLoadStoreType(Prev.Inst)));
722 APInt PrevReadEnd = Prev.OffsetFromLeader + PrevSzBytes;
723 unsigned SzBytes = DL.getTypeStoreSize(getLoadStoreType(It->Inst));
724
725 // Add this instruction to the end of the current chain, or start a new one.
726 assert(
727 8 * SzBytes % ChainElemTyBits == 0 &&
728 "Every chain-element size must be a multiple of the element size after "
729 "vectorization.");
730 APInt ReadEnd = It->OffsetFromLeader + SzBytes;
731 // Allow redundancy: partial or full overlap counts as contiguous.
732 bool AreContiguous = false;
733 if (It->OffsetFromLeader.sle(PrevReadEnd)) {
734 // Check overlap is a multiple of the element size after vectorization.
735 uint64_t Overlap = (PrevReadEnd - It->OffsetFromLeader).getZExtValue();
736 if (8 * Overlap % ChainElemTyBits == 0)
737 AreContiguous = true;
738 }
739
740 LLVM_DEBUG(dbgs() << "LSV: Instruction is "
741 << (AreContiguous ? "contiguous" : "chain-breaker")
742 << *It->Inst << " (starts at offset "
743 << It->OffsetFromLeader << ")\n");
744
745 // If the chain is not contiguous, try to fill in gaps between Prev and
746 // Curr. For now, we aren't filling gaps between load/stores of different
747 // sizes. Additionally, as a conservative heuristic, we only fill gaps of
748 // 1-2 elements. Generating loads/stores with too many unused bytes has a
749 // side effect of increasing register pressure (on NVIDIA targets at least),
750 // which could cancel out the benefits of reducing the number of loads/stores.
751 bool GapFilled = false;
752 if (!AreContiguous && TryFillGaps && PrevSzBytes == SzBytes) {
753 APInt GapSzBytes = It->OffsetFromLeader - PrevReadEnd;
754 if (GapSzBytes == PrevSzBytes) {
755 // There is a single-element gap between Prev and Curr; create one extra element.
756 ChainElem NewElem = createExtraElementAfter(
757 Prev, getLoadStoreType(Prev.Inst), PrevSzBytes, "GapFill",
758 DeriveAlignFromBestAlignedElem(PrevReadEnd));
759 CurChain.push_back(NewElem);
760 GapFilled = true;
761 }
762 // If the gap between Prev and Curr spans two elements, only create two
763 // extra elements if Prev is the first element in a sequence of four.
764 // This has the highest chance of resulting in a beneficial vectorization.
765 if ((GapSzBytes == 2 * PrevSzBytes) && (CurChain.size() % 4 == 1)) {
766 ChainElem NewElem1 = createExtraElementAfter(
767 Prev, getLoadStoreType(Prev.Inst), PrevSzBytes, "GapFill",
768 DeriveAlignFromBestAlignedElem(PrevReadEnd));
769 ChainElem NewElem2 = createExtraElementAfter(
770 NewElem1, getLoadStoreType(Prev.Inst), PrevSzBytes, "GapFill",
771 DeriveAlignFromBestAlignedElem(PrevReadEnd + PrevSzBytes));
772 CurChain.push_back(NewElem1);
773 CurChain.push_back(NewElem2);
774 GapFilled = true;
775 }
776 }
777
778 if (AreContiguous || GapFilled)
779 CurChain.push_back(*It);
780 else
781 Ret.push_back({*It});
782 // In certain cases when handling redundant elements with partial overlaps,
783 // the previous element may still extend beyond the current element. Only
784 // update Prev if the current element is the new end of the chain.
785 if (ReadEnd.sge(PrevReadEnd))
786 Prev = *It;
787 }
788
789 // Filter out length-1 chains, these are uninteresting.
790 llvm::erase_if(Ret, [](const auto &Chain) { return Chain.size() <= 1; });
791 return Ret;
792}
793
794Type *Vectorizer::getChainElemTy(const Chain &C) {
795 assert(!C.empty());
796 // The rules are:
797 // - If there are any pointer types in the chain, use an integer type.
798 // - Prefer an integer type if it appears in the chain.
799 // - Otherwise, use the first type in the chain.
800 //
801 // The rule about pointer types is a simplification when we merge e.g. a load
802 // of a ptr and a double. There's no direct conversion from a ptr to a
803 // double; it requires a ptrtoint followed by a bitcast.
804 //
805 // It's unclear to me if the other rules have any practical effect, but we do
806 // it to match this pass's previous behavior.
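// For example (illustrative): a chain mixing ptr and i64 loads on a target
// with 64-bit pointers yields i64; a chain of {float, i32} yields i32 because
// an integer type in the chain is preferred; and a chain of only floats keeps
// float, the first type in the chain.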
807 if (any_of(C, [](const ChainElem &E) {
808 return getLoadStoreType(E.Inst)->getScalarType()->isPointerTy();
809 })) {
810 return Type::getIntNTy(
811 F.getContext(),
812 DL.getTypeSizeInBits(getLoadStoreType(C[0].Inst)->getScalarType()));
813 }
814
815 for (const ChainElem &E : C)
816 if (Type *T = getLoadStoreType(E.Inst)->getScalarType(); T->isIntegerTy())
817 return T;
818 return getLoadStoreType(C[0].Inst)->getScalarType();
819}
820
821std::vector<Chain> Vectorizer::splitChainByAlignment(Chain &C) {
822 // We use a simple greedy algorithm.
823 // - Given a chain of length N, find all prefixes that
824 // (a) are not longer than the max register length, and
825 // (b) are a power of 2.
826 // - Starting from the longest prefix, try to create a vector of that length.
827 // - If one of them works, great. Repeat the algorithm on any remaining
828 // elements in the chain.
829 // - If none of them work, discard the first element and repeat on a chain
830 // of length N-1.
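// For example (illustrative), with 16-byte vector registers and six
// contiguous i32 elements, candidates starting at element 0 can span at most
// four elements (16 bytes); if the four-element candidate passes the
// alignment and TTI checks it is emitted first and the scan resumes at
// element 4, typically leaving a two-element candidate for the tail.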
831 if (C.empty())
832 return {};
833
834 sortChainInOffsetOrder(C);
835
836 LLVM_DEBUG({
837 dbgs() << "LSV: splitChainByAlignment considering chain:\n";
838 dumpChain(C);
839 });
840
841 bool IsLoadChain = isa<LoadInst>(C[0].Inst);
842 auto GetVectorFactor = [&](unsigned VF, unsigned LoadStoreSize,
843 unsigned ChainSizeBytes, VectorType *VecTy) {
844 return IsLoadChain ? TTI.getLoadVectorFactor(VF, LoadStoreSize,
845 ChainSizeBytes, VecTy)
846 : TTI.getStoreVectorFactor(VF, LoadStoreSize,
847 ChainSizeBytes, VecTy);
848 };
849
850#ifndef NDEBUG
851 for (const auto &E : C) {
852 Type *Ty = getLoadStoreType(E.Inst)->getScalarType();
853 assert(isPowerOf2_32(DL.getTypeSizeInBits(Ty)) &&
854 "Should have filtered out non-power-of-two elements in "
855 "collectEquivalenceClasses.");
856 }
857#endif
858
859 unsigned AS = getLoadStoreAddressSpace(C[0].Inst);
860 unsigned VecRegBytes = TTI.getLoadStoreVecRegBitWidth(AS) / 8;
861
862 // For compile time reasons, we cache whether or not the superset
863 // of all candidate chains contains any extra loads/stores from earlier gap
864 // filling.
865 bool CandidateChainsMayContainExtraLoadsStores = any_of(
866 C, [this](const ChainElem &E) { return ExtraElements.contains(E.Inst); });
867
868 std::vector<Chain> Ret;
869 for (unsigned CBegin = 0; CBegin < C.size(); ++CBegin) {
870 // Find candidate chains of size not greater than the largest vector reg.
871 // These chains are over the closed interval [CBegin, CEnd].
872 SmallVector<std::pair<unsigned /*CEnd*/, unsigned /*SizeBytes*/>, 8>
873 CandidateChains;
874 // Need to compute the size of every candidate chain from its beginning
875 // because of possible overlapping among chain elements.
876 unsigned Sz = DL.getTypeStoreSize(getLoadStoreType(C[CBegin].Inst));
877 APInt PrevReadEnd = C[CBegin].OffsetFromLeader + Sz;
878 for (unsigned CEnd = CBegin + 1, Size = C.size(); CEnd < Size; ++CEnd) {
879 APInt ReadEnd = C[CEnd].OffsetFromLeader +
880 DL.getTypeStoreSize(getLoadStoreType(C[CEnd].Inst));
881 unsigned BytesAdded =
882 PrevReadEnd.sle(ReadEnd) ? (ReadEnd - PrevReadEnd).getSExtValue() : 0;
883 Sz += BytesAdded;
884 if (Sz > VecRegBytes)
885 break;
886 CandidateChains.emplace_back(CEnd, Sz);
887 PrevReadEnd = APIntOps::smax(PrevReadEnd, ReadEnd);
888 }
889
890 // Consider the longest chain first.
891 for (auto It = CandidateChains.rbegin(), End = CandidateChains.rend();
892 It != End; ++It) {
893 auto [CEnd, SizeBytes] = *It;
894 LLVM_DEBUG(
895 dbgs() << "LSV: splitChainByAlignment considering candidate chain ["
896 << *C[CBegin].Inst << " ... " << *C[CEnd].Inst << "]\n");
897
898 Type *VecElemTy = getChainElemTy(C);
899 // Note: the size of VecElemTy is a power of 2 but might be less than one byte. For
900 // example, we can vectorize 2 x <2 x i4> to <4 x i4>, and in this case
901 // VecElemTy would be i4.
902 unsigned VecElemBits = DL.getTypeSizeInBits(VecElemTy);
903
904 // SizeBytes and VecElemBits are powers of 2, so they divide evenly.
905 assert((8 * SizeBytes) % VecElemBits == 0);
906 unsigned NumVecElems = 8 * SizeBytes / VecElemBits;
907 FixedVectorType *VecTy = FixedVectorType::get(VecElemTy, NumVecElems);
908 unsigned VF = 8 * VecRegBytes / VecElemBits;
909
910 // Check that TTI is happy with this vectorization factor.
911 unsigned TargetVF = GetVectorFactor(VF, VecElemBits,
912 VecElemBits * NumVecElems / 8, VecTy);
913 if (TargetVF != VF && TargetVF < NumVecElems) {
914 LLVM_DEBUG(
915 dbgs() << "LSV: splitChainByAlignment discarding candidate chain "
916 "because TargetVF="
917 << TargetVF << " != VF=" << VF
918 << " and TargetVF < NumVecElems=" << NumVecElems << "\n");
919 continue;
920 }
921
922 // If we're loading/storing from an alloca, align it if possible.
923 //
924 // FIXME: We eagerly upgrade the alignment, regardless of whether TTI
925 // tells us this is beneficial. This feels a bit odd, but it matches
926 // existing tests. This isn't *so* bad, because at most we align to 4
927 // bytes (current value of StackAdjustedAlignment).
928 //
929 // FIXME: We will upgrade the alignment of the alloca even if it turns out
930 // we can't vectorize for some other reason.
931 Value *PtrOperand = getLoadStorePointerOperand(C[CBegin].Inst);
932 bool IsAllocaAccess = AS == DL.getAllocaAddrSpace() &&
933 isa<AllocaInst>(PtrOperand->stripPointerCasts());
934 Align Alignment = getLoadStoreAlignment(C[CBegin].Inst);
935 Align PrefAlign = Align(StackAdjustedAlignment);
936 if (IsAllocaAccess && Alignment.value() % SizeBytes != 0 &&
937 accessIsAllowedAndFast(SizeBytes, AS, PrefAlign, VecElemBits)) {
938 Align NewAlign = getOrEnforceKnownAlignment(
939 PtrOperand, PrefAlign, DL, C[CBegin].Inst, nullptr, &DT);
940 if (NewAlign >= Alignment) {
941 LLVM_DEBUG(dbgs()
942 << "LSV: splitByChain upgrading alloca alignment from "
943 << Alignment.value() << " to " << NewAlign.value()
944 << "\n");
945 Alignment = NewAlign;
946 }
947 }
948
949 Chain ExtendingLoadsStores;
950 if (!accessIsAllowedAndFast(SizeBytes, AS, Alignment, VecElemBits)) {
951 // If we have a non-power-of-2 element count, attempt to extend the
952 // chain to the next power-of-2 if it makes the access allowed and
953 // fast.
954 bool AllowedAndFast = false;
955 if (NumVecElems < TargetVF && !isPowerOf2_32(NumVecElems) &&
956 VecElemBits >= 8) {
957 // TargetVF may be a lot higher than NumVecElems,
958 // so only extend to the next power of 2.
959 assert(VecElemBits % 8 == 0);
960 unsigned VecElemBytes = VecElemBits / 8;
961 unsigned NewNumVecElems = PowerOf2Ceil(NumVecElems);
962 unsigned NewSizeBytes = VecElemBytes * NewNumVecElems;
963
964 assert(isPowerOf2_32(TargetVF) &&
965 "TargetVF expected to be a power of 2");
966 assert(NewNumVecElems <= TargetVF &&
967 "Should not extend past TargetVF");
968
969 LLVM_DEBUG(dbgs()
970 << "LSV: attempting to extend chain of " << NumVecElems
971 << " " << (IsLoadChain ? "loads" : "stores") << " to "
972 << NewNumVecElems << " elements\n");
973 bool IsLegalToExtend =
974 IsLoadChain ? TTI.isLegalMaskedLoad(
975 FixedVectorType::get(VecElemTy, NewNumVecElems),
976 Alignment, AS, TTI::MaskKind::ConstantMask)
977 : TTI.isLegalMaskedStore(
978 FixedVectorType::get(VecElemTy, NewNumVecElems),
979 Alignment, AS, TTI::MaskKind::ConstantMask);
980 // Only artificially increase the chain if it would be AllowedAndFast
981 // and if the resulting masked load/store will be legal for the
982 // target.
983 if (IsLegalToExtend &&
984 accessIsAllowedAndFast(NewSizeBytes, AS, Alignment,
985 VecElemBits)) {
986 LLVM_DEBUG(dbgs()
987 << "LSV: extending " << (IsLoadChain ? "load" : "store")
988 << " chain of " << NumVecElems << " "
989 << (IsLoadChain ? "loads" : "stores")
990 << " with total byte size of " << SizeBytes << " to "
991 << NewNumVecElems << " "
992 << (IsLoadChain ? "loads" : "stores")
993 << " with total byte size of " << NewSizeBytes
994 << ", TargetVF=" << TargetVF << " \n");
995
996 // Create (NewNumVecElems - NumVecElems) extra elements.
997 // We are basing each extra element on CBegin, which means the
998 // offsets should be based on SizeBytes, which represents the offset
999 // from CBegin to the current end of the chain.
1000 unsigned ASPtrBits = DL.getIndexSizeInBits(AS);
1001 for (unsigned I = 0; I < (NewNumVecElems - NumVecElems); I++) {
1002 ChainElem NewElem = createExtraElementAfter(
1003 C[CBegin], VecElemTy,
1004 APInt(ASPtrBits, SizeBytes + I * VecElemBytes), "Extend");
1005 ExtendingLoadsStores.push_back(NewElem);
1006 }
1007
1008 // Update the size and number of elements for upcoming checks.
1009 SizeBytes = NewSizeBytes;
1010 NumVecElems = NewNumVecElems;
1011 AllowedAndFast = true;
1012 }
1013 }
1014 if (!AllowedAndFast) {
1015 // We were not able to achieve legality by extending the chain.
1016 LLVM_DEBUG(dbgs()
1017 << "LSV: splitChainByAlignment discarding candidate chain "
1018 "because its alignment is not AllowedAndFast: "
1019 << Alignment.value() << "\n");
1020 continue;
1021 }
1022 }
1023
1024 if ((IsLoadChain &&
1025 !TTI.isLegalToVectorizeLoadChain(SizeBytes, Alignment, AS)) ||
1026 (!IsLoadChain &&
1027 !TTI.isLegalToVectorizeStoreChain(SizeBytes, Alignment, AS))) {
1028 LLVM_DEBUG(
1029 dbgs() << "LSV: splitChainByAlignment discarding candidate chain "
1030 "because !isLegalToVectorizeLoad/StoreChain.");
1031 continue;
1032 }
1033
1034 if (CandidateChainsMayContainExtraLoadsStores) {
1035 // If the candidate chain contains extra loads/stores from an earlier
1036 // optimization, confirm legality now. This filter is essential because
1037 // when filling gaps in splitChainByContiguity, we queried the API to
1038 // check that (for a given element type and address space) there *may*
1039 // have been a legal masked load/store we could possibly create. Now, we
1040 // need to check if the actual chain we ended up with is legal to turn
1041 // into a masked load/store. This is relevant for NVPTX, for example,
1042 // where a masked store is only legal if we have ended up with a 256-bit
1043 // vector.
1044 bool CurrCandContainsExtraLoadsStores = llvm::any_of(
1045 ArrayRef<ChainElem>(C).slice(CBegin, CEnd - CBegin + 1),
1046 [this](const ChainElem &E) {
1047 return ExtraElements.contains(E.Inst);
1048 });
1049
1050 if (CurrCandContainsExtraLoadsStores &&
1051 (IsLoadChain ? !TTI.isLegalMaskedLoad(
1052 FixedVectorType::get(VecElemTy, NumVecElems),
1053 Alignment, AS, TTI::MaskKind::ConstantMask)
1054 : !TTI.isLegalMaskedStore(
1055 FixedVectorType::get(VecElemTy, NumVecElems),
1056 Alignment, AS, TTI::MaskKind::ConstantMask))) {
1057 LLVM_DEBUG(dbgs()
1058 << "LSV: splitChainByAlignment discarding candidate chain "
1059 "because it contains extra loads/stores that we cannot "
1060 "legally vectorize into a masked load/store \n");
1061 continue;
1062 }
1063 }
1064
1065 // Hooray, we can vectorize this chain!
1066 Chain &NewChain = Ret.emplace_back();
1067 for (unsigned I = CBegin; I <= CEnd; ++I)
1068 NewChain.emplace_back(C[I]);
1069 for (ChainElem E : ExtendingLoadsStores)
1070 NewChain.emplace_back(E);
1071 CBegin = CEnd; // Skip over the instructions we've added to the chain.
1072 break;
1073 }
1074 }
1075 return Ret;
1076}
1077
1078bool Vectorizer::vectorizeChain(Chain &C) {
1079 if (C.size() < 2)
1080 return false;
1081
1082 bool ChainContainsExtraLoadsStores = llvm::any_of(
1083 C, [this](const ChainElem &E) { return ExtraElements.contains(E.Inst); });
1084
1085 // If we are left with a two-element chain, and one of the elements is an
1086 // extra element, we don't want to vectorize.
1087 if (C.size() == 2 && ChainContainsExtraLoadsStores)
1088 return false;
1089
1090 sortChainInOffsetOrder(C);
1091
1092 LLVM_DEBUG({
1093 dbgs() << "LSV: Vectorizing chain of " << C.size() << " instructions:\n";
1094 dumpChain(C);
1095 });
1096
1097 Type *VecElemTy = getChainElemTy(C);
1098 bool IsLoadChain = isa<LoadInst>(C[0].Inst);
1099 unsigned AS = getLoadStoreAddressSpace(C[0].Inst);
1100 unsigned BytesAdded = DL.getTypeStoreSize(getLoadStoreType(&*C[0].Inst));
1101 APInt PrevReadEnd = C[0].OffsetFromLeader + BytesAdded;
1102 unsigned ChainBytes = BytesAdded;
1103 for (auto It = std::next(C.begin()), End = C.end(); It != End; ++It) {
1104 unsigned SzBytes = DL.getTypeStoreSize(getLoadStoreType(&*It->Inst));
1105 APInt ReadEnd = It->OffsetFromLeader + SzBytes;
1106 // Update ChainBytes considering possible overlap.
1107 BytesAdded =
1108 PrevReadEnd.sle(ReadEnd) ? (ReadEnd - PrevReadEnd).getSExtValue() : 0;
1109 ChainBytes += BytesAdded;
1110 PrevReadEnd = APIntOps::smax(PrevReadEnd, ReadEnd);
1111 }
1112
1113 assert(8 * ChainBytes % DL.getTypeSizeInBits(VecElemTy) == 0);
1114 // The size of VecTy is a power of 2 and at least 1 byte, but VecElemTy may
1115 // be smaller than 1 byte (e.g. VecTy == <32 x i1>).
1116 unsigned NumElem = 8 * ChainBytes / DL.getTypeSizeInBits(VecElemTy);
1117 Type *VecTy = FixedVectorType::get(VecElemTy, NumElem);
1118
1119 Align Alignment = getLoadStoreAlignment(C[0].Inst);
1120 // If this is a load/store of an alloca, we might have upgraded the alloca's
1121 // alignment earlier. Get the new alignment.
1122 if (AS == DL.getAllocaAddrSpace()) {
1123 Alignment = std::max(
1124 Alignment,
1125 getOrEnforceKnownAlignment(getLoadStorePointerOperand(C[0].Inst),
1126 MaybeAlign(), DL, C[0].Inst, nullptr, &DT));
1127 }
1128
1129 // All elements of the chain must have the same scalar-type size.
1130#ifndef NDEBUG
1131 for (const ChainElem &E : C)
1132 assert(DL.getTypeStoreSize(getLoadStoreType(E.Inst)->getScalarType()) ==
1133 DL.getTypeStoreSize(VecElemTy));
1134#endif
1135
1136 Instruction *VecInst;
1137 if (IsLoadChain) {
1138 // Loads get hoisted to the location of the first load in the chain. We may
1139 // also need to hoist the (transitive) operands of the loads.
1140 Builder.SetInsertPoint(
1141 llvm::min_element(C, [](const auto &A, const auto &B) {
1142 return A.Inst->comesBefore(B.Inst);
1143 })->Inst);
1144
1145 // If the chain contains extra loads, we need to vectorize into a
1146 // masked load.
1147 if (ChainContainsExtraLoadsStores) {
1148 assert(TTI.isLegalMaskedLoad(VecTy, Alignment, AS,
1149 TTI::MaskKind::ConstantMask));
1150 Value *Mask = createMaskForExtraElements(C, cast<FixedVectorType>(VecTy));
1151 VecInst = Builder.CreateMaskedLoad(
1152 VecTy, getLoadStorePointerOperand(C[0].Inst), Alignment, Mask);
1153 } else {
1154 // This can happen due to a chain of redundant loads.
1155 // In this case, just use the element-type, and avoid ExtractElement.
1156 if (NumElem == 1)
1157 VecTy = VecElemTy;
1158 // Chain is in offset order, so C[0] is the instr with the lowest offset,
1159 // i.e. the root of the vector.
1160 VecInst = Builder.CreateAlignedLoad(
1161 VecTy, getLoadStorePointerOperand(C[0].Inst), Alignment);
1162 }
1163
1164 for (const ChainElem &E : C) {
1165 Instruction *I = E.Inst;
1166 Value *V;
1167 Type *T = getLoadStoreType(I);
1168 unsigned EOffset =
1169 (E.OffsetFromLeader - C[0].OffsetFromLeader).getZExtValue();
1170 unsigned VecIdx = 8 * EOffset / DL.getTypeSizeInBits(VecElemTy);
1171 if (!VecTy->isVectorTy()) {
1172 V = VecInst;
1173 } else if (auto *VT = dyn_cast<FixedVectorType>(T)) {
1174 auto Mask = llvm::to_vector<8>(
1175 llvm::seq<int>(VecIdx, VecIdx + VT->getNumElements()));
1176 V = Builder.CreateShuffleVector(VecInst, Mask, I->getName());
1177 } else {
1178 V = Builder.CreateExtractElement(VecInst, Builder.getInt32(VecIdx),
1179 I->getName());
1180 }
1181 if (V->getType() != I->getType())
1182 V = Builder.CreateBitOrPointerCast(V, I->getType());
1183 I->replaceAllUsesWith(V);
1184 }
1185
1186 // Finally, we need to reorder the instrs in the BB so that the (transitive)
1187 // operands of VecInst appear before it. To see why, suppose we have
1188 // vectorized the following code:
1189 //
1190 // ptr1 = gep a, 1
1191 // load1 = load i32 ptr1
1192 // ptr0 = gep a, 0
1193 // load0 = load i32 ptr0
1194 //
1195 // We will put the vectorized load at the location of the earliest load in
1196 // the BB, i.e. load1. We get:
1197 //
1198 // ptr1 = gep a, 1
1199 // loadv = load <2 x i32> ptr0
1200 // load0 = extractelement loadv, 0
1201 // load1 = extractelement loadv, 1
1202 // ptr0 = gep a, 0
1203 //
1204 // Notice that loadv uses ptr0, which is defined *after* it!
1205 reorder(VecInst);
1206 } else {
1207 // Stores get sunk to the location of the last store in the chain.
1208 Builder.SetInsertPoint(llvm::max_element(C, [](auto &A, auto &B) {
1209 return A.Inst->comesBefore(B.Inst);
1210 })->Inst);
1211
1212 // Build the vector to store.
1213 Value *Vec = PoisonValue::get(VecTy);
1214 auto InsertElem = [&](Value *V, unsigned VecIdx) {
1215 if (V->getType() != VecElemTy)
1216 V = Builder.CreateBitOrPointerCast(V, VecElemTy);
1217 Vec = Builder.CreateInsertElement(Vec, V, Builder.getInt32(VecIdx));
1218 };
1219 for (const ChainElem &E : C) {
1220 auto *I = cast<StoreInst>(E.Inst);
1221 unsigned EOffset =
1222 (E.OffsetFromLeader - C[0].OffsetFromLeader).getZExtValue();
1223 unsigned VecIdx = 8 * EOffset / DL.getTypeSizeInBits(VecElemTy);
1224 if (FixedVectorType *VT =
1225 dyn_cast<FixedVectorType>(getLoadStoreType(I))) {
1226 for (int J = 0, JE = VT->getNumElements(); J < JE; ++J) {
1227 InsertElem(Builder.CreateExtractElement(I->getValueOperand(),
1228 Builder.getInt32(J)),
1229 VecIdx++);
1230 }
1231 } else {
1232 InsertElem(I->getValueOperand(), VecIdx);
1233 }
1234 }
1235
1236 // If the chain contains extra stores, we need to vectorize into a
1237 // masked store.
1238 if (ChainContainsExtraLoadsStores) {
1239 assert(TTI.isLegalMaskedStore(Vec->getType(), Alignment, AS,
1240 TTI::MaskKind::ConstantMask));
1241 Value *Mask =
1242 createMaskForExtraElements(C, cast<FixedVectorType>(Vec->getType()));
1243 VecInst = Builder.CreateMaskedStore(
1244 Vec, getLoadStorePointerOperand(C[0].Inst), Alignment, Mask);
1245 } else {
1246 // Chain is in offset order, so C[0] is the instr with the lowest offset,
1247 // i.e. the root of the vector.
1248 VecInst = Builder.CreateAlignedStore(
1249 Vec, getLoadStorePointerOperand(C[0].Inst), Alignment);
1250 }
1251 }
1252
1253 propagateMetadata(VecInst, C);
1254
1255 for (const ChainElem &E : C)
1256 ToErase.emplace_back(E.Inst);
1257
1258 ++NumVectorInstructions;
1259 NumScalarsVectorized += C.size();
1260 return true;
1261}
1262
1263template <bool IsLoadChain>
1264bool Vectorizer::isSafeToMove(
1265 Instruction *ChainElem, Instruction *ChainBegin,
1266 const DenseMap<Instruction *, APInt /*OffsetFromLeader*/> &ChainOffsets,
1267 BatchAAResults &BatchAA) {
1268 LLVM_DEBUG(dbgs() << "LSV: isSafeToMove(" << *ChainElem << " -> "
1269 << *ChainBegin << ")\n");
1270
1271 assert(isa<LoadInst>(ChainElem) == IsLoadChain);
1272 if (ChainElem == ChainBegin)
1273 return true;
1274
1275 // Invariant loads can always be reordered; by definition they are not
1276 // clobbered by stores.
1277 if (isInvariantLoad(ChainElem))
1278 return true;
1279
1280 auto BBIt = std::next([&] {
1281 if constexpr (IsLoadChain)
1282 return BasicBlock::reverse_iterator(ChainElem);
1283 else
1284 return BasicBlock::iterator(ChainElem);
1285 }());
1286 auto BBItEnd = std::next([&] {
1287 if constexpr (IsLoadChain)
1288 return BasicBlock::reverse_iterator(ChainBegin);
1289 else
1290 return BasicBlock::iterator(ChainBegin);
1291 }());
1292
1293 const APInt &ChainElemOffset = ChainOffsets.at(ChainElem);
1294 const unsigned ChainElemSize =
1295 DL.getTypeStoreSize(getLoadStoreType(ChainElem));
1296
1297 for (; BBIt != BBItEnd; ++BBIt) {
1298 Instruction *I = &*BBIt;
1299
1300 if (!I->mayReadOrWriteMemory())
1301 continue;
1302
1303 // Loads can be reordered with other loads.
1304 if (IsLoadChain && isa<LoadInst>(I))
1305 continue;
1306
1307 // Stores can be sunk below invariant loads.
1308 if (!IsLoadChain && isInvariantLoad(I))
1309 continue;
1310
1311 // If I is in the chain, we can tell whether it aliases ChainElem by
1312 // checking what offset I accesses. This may be better than AA is able to do.
1313 //
1314 // We should really only have duplicate offsets for stores (the duplicate
1315 // loads should be CSE'ed), but in case we have a duplicate load, we'll
1316 // split the chain so we don't have to handle this case specially.
1317 if (auto OffsetIt = ChainOffsets.find(I); OffsetIt != ChainOffsets.end()) {
1318 // I and ChainElem overlap if:
1319 // - I and ChainElem have the same offset, OR
1320 // - I's offset is less than ChainElem's, but I touches past the
1321 // beginning of ChainElem, OR
1322 // - ChainElem's offset is less than I's, but ChainElem touches past the
1323 // beginning of I.
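// For example (illustrative), with 4-byte accesses, I at offset 0 and
// ChainElem at offset 2 overlap (0 + 4 > 2), while I at offset 0 and
// ChainElem at offset 4 do not.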
1324 const APInt &IOffset = OffsetIt->second;
1325 unsigned IElemSize = DL.getTypeStoreSize(getLoadStoreType(I));
1326 if (IOffset == ChainElemOffset ||
1327 (IOffset.sle(ChainElemOffset) &&
1328 (IOffset + IElemSize).sgt(ChainElemOffset)) ||
1329 (ChainElemOffset.sle(IOffset) &&
1330 (ChainElemOffset + ChainElemSize).sgt(OffsetIt->second))) {
1331 LLVM_DEBUG({
1332 // Double check that AA also sees this alias. If not, we probably
1333 // have a bug.
1334 ModRefInfo MR =
1335 BatchAA.getModRefInfo(I, MemoryLocation::get(ChainElem));
1336 assert(IsLoadChain ? isModSet(MR) : isModOrRefSet(MR));
1337 dbgs() << "LSV: Found alias in chain: " << *I << "\n";
1338 });
1339 return false; // We found an aliasing instruction; bail.
1340 }
1341
1342 continue; // We're confident there's no alias.
1343 }
1344
1345 LLVM_DEBUG(dbgs() << "LSV: Querying AA for " << *I << "\n");
1346 ModRefInfo MR = BatchAA.getModRefInfo(I, MemoryLocation::get(ChainElem));
1347 if (IsLoadChain ? isModSet(MR) : isModOrRefSet(MR)) {
1348 LLVM_DEBUG(dbgs() << "LSV: Found alias in chain:\n"
1349 << " Aliasing instruction:\n"
1350 << " " << *I << '\n'
1351 << " Aliased instruction and pointer:\n"
1352 << " " << *ChainElem << '\n'
1353 << " " << *getLoadStorePointerOperand(ChainElem)
1354 << '\n');
1355
1356 return false;
1357 }
1358 }
1359 return true;
1360}
1361
1362static bool checkNoWrapFlags(Instruction *I, bool Signed) {
1363 // or disjoint is equivalent to add nuw nsw, so it never wraps.
1364 if (auto *PDI = dyn_cast<PossiblyDisjointInst>(I); PDI && PDI->isDisjoint())
1365 return true;
1366 auto *BinOpI = cast<BinaryOperator>(I);
1367 return (Signed && BinOpI->hasNoSignedWrap()) ||
1368 (!Signed && BinOpI->hasNoUnsignedWrap());
1369}
1370
1371/// Check if instruction is an add or an or-disjoint (which is semantically
1372/// equivalent to add nuw nsw).
1373static bool isAddLike(Instruction *I) {
1374 switch (I->getOpcode()) {
1375 default:
1376 break;
1377 case Instruction::Add:
1378 return true;
1379 case Instruction::Or:
1380 if (auto *PDI = dyn_cast<PossiblyDisjointInst>(I))
1381 return PDI->isDisjoint();
1382 break;
1383 }
1384 return false;
1385}
1386
1387static bool checkIfSafeAddSequence(const APInt &IdxDiff, Instruction *AddOpA,
1388 unsigned MatchingOpIdxA, Instruction *AddOpB,
1389 unsigned MatchingOpIdxB, bool Signed) {
1390 LLVM_DEBUG(dbgs() << "LSV: checkIfSafeAddSequence IdxDiff=" << IdxDiff
1391 << ", AddOpA=" << *AddOpA << ", MatchingOpIdxA="
1392 << MatchingOpIdxA << ", AddOpB=" << *AddOpB
1393 << ", MatchingOpIdxB=" << MatchingOpIdxB
1394 << ", Signed=" << Signed << "\n");
1395 // If both OpA and OpB are adds (or or-disjoint) with NSW/NUW and with one of
1396 // the operands being the same, we can guarantee that the transformation is
1397 // safe if we can prove that OpA won't overflow when IdxDiff is added to the
1398 // other operand of OpA.
1399 // For example:
1400 // %tmp7 = add nsw i32 %tmp2, %v0
1401 // %tmp8 = sext i32 %tmp7 to i64
1402 // ...
1403 // %tmp11 = add nsw i32 %v0, 1
1404 // %tmp12 = add nsw i32 %tmp2, %tmp11
1405 // %tmp13 = sext i32 %tmp12 to i64
1406 //
1407 // Both %tmp7 and %tmp12 have the nsw flag and the first operand is %tmp2.
1408 // It's guaranteed that adding 1 to %tmp7 won't overflow because %tmp11 adds
1409 // 1 to %v0 and both %tmp11 and %tmp12 have the nsw flag.
1410 assert(isAddLike(AddOpA) && isAddLike(AddOpB) &&
1411 checkNoWrapFlags(AddOpA, Signed) && checkNoWrapFlags(AddOpB, Signed));
1412 if (AddOpA->getOperand(MatchingOpIdxA) ==
1413 AddOpB->getOperand(MatchingOpIdxB)) {
1414 Value *OtherOperandA = AddOpA->getOperand(MatchingOpIdxA == 1 ? 0 : 1);
1415 Value *OtherOperandB = AddOpB->getOperand(MatchingOpIdxB == 1 ? 0 : 1);
1416 Instruction *OtherInstrA = dyn_cast<Instruction>(OtherOperandA);
1417 Instruction *OtherInstrB = dyn_cast<Instruction>(OtherOperandB);
1418 // Match `x +nsw/nuw y` and `x +nsw/nuw (y +nsw/nuw IdxDiff)`.
1419 if (OtherInstrB && isAddLike(OtherInstrB) &&
1420 checkNoWrapFlags(OtherInstrB, Signed) &&
1421 isa<ConstantInt>(OtherInstrB->getOperand(1))) {
1422 int64_t CstVal =
1423 cast<ConstantInt>(OtherInstrB->getOperand(1))->getSExtValue();
1424 if (OtherInstrB->getOperand(0) == OtherOperandA &&
1425 IdxDiff.getSExtValue() == CstVal)
1426 return true;
1427 }
1428 // Match `x +nsw/nuw (y +nsw/nuw -IdxDiff)` and `x +nsw/nuw y`.
1429 if (OtherInstrA && isAddLike(OtherInstrA) &&
1430 checkNoWrapFlags(OtherInstrA, Signed) &&
1431 isa<ConstantInt>(OtherInstrA->getOperand(1))) {
1432 int64_t CstVal =
1433 cast<ConstantInt>(OtherInstrA->getOperand(1))->getSExtValue();
1434 if (OtherInstrA->getOperand(0) == OtherOperandB &&
1435 IdxDiff.getSExtValue() == -CstVal)
1436 return true;
1437 }
1438 // Match `x +nsw/nuw (y +nsw/nuw c)` and
1439 // `x +nsw/nuw (y +nsw/nuw (c + IdxDiff))`.
1440 if (OtherInstrA && OtherInstrB && isAddLike(OtherInstrA) &&
1441 isAddLike(OtherInstrB) && checkNoWrapFlags(OtherInstrA, Signed) &&
1442 checkNoWrapFlags(OtherInstrB, Signed) &&
1443 isa<ConstantInt>(OtherInstrA->getOperand(1)) &&
1444 isa<ConstantInt>(OtherInstrB->getOperand(1))) {
1445 int64_t CstValA =
1446 cast<ConstantInt>(OtherInstrA->getOperand(1))->getSExtValue();
1447 int64_t CstValB =
1448 cast<ConstantInt>(OtherInstrB->getOperand(1))->getSExtValue();
1449 if (OtherInstrA->getOperand(0) == OtherInstrB->getOperand(0) &&
1450 IdxDiff.getSExtValue() == (CstValB - CstValA))
1451 return true;
1452 }
1453 }
1454 return false;
1455}
1456
1457std::optional<APInt> Vectorizer::getConstantOffsetComplexAddrs(
1458 Value *PtrA, Value *PtrB, Instruction *ContextInst, unsigned Depth) {
1459 LLVM_DEBUG(dbgs() << "LSV: getConstantOffsetComplexAddrs PtrA=" << *PtrA
1460 << " PtrB=" << *PtrB << " ContextInst=" << *ContextInst
1461 << " Depth=" << Depth << "\n");
1462 auto *GEPA = dyn_cast<GetElementPtrInst>(PtrA);
1463 auto *GEPB = dyn_cast<GetElementPtrInst>(PtrB);
1464 if (!GEPA || !GEPB)
1465 return getConstantOffsetSelects(PtrA, PtrB, ContextInst, Depth);
1466
1467 // Look through GEPs after checking they're the same except for the last
1468 // index.
1469 if (GEPA->getNumOperands() != GEPB->getNumOperands() ||
1470 GEPA->getPointerOperand() != GEPB->getPointerOperand() ||
1471 GEPA->getSourceElementType() != GEPB->getSourceElementType())
1472 return std::nullopt;
1473 gep_type_iterator GTIA = gep_type_begin(GEPA);
1474 gep_type_iterator GTIB = gep_type_begin(GEPB);
1475 for (unsigned I = 0, E = GEPA->getNumIndices() - 1; I < E; ++I) {
1476 if (GTIA.getOperand() != GTIB.getOperand())
1477 return std::nullopt;
1478 ++GTIA;
1479 ++GTIB;
1480 }
1481
1482 Instruction *OpA = dyn_cast<Instruction>(GTIA.getOperand());
1483 Instruction *OpB = dyn_cast<Instruction>(GTIB.getOperand());
1484 if (!OpA || !OpB || OpA->getOpcode() != OpB->getOpcode() ||
1485 OpA->getType() != OpB->getType())
1486 return std::nullopt;
1487
1488 uint64_t Stride = GTIA.getSequentialElementStride(DL);
1489
1490 // Only look through a ZExt/SExt.
1491 if (!isa<SExtInst>(OpA) && !isa<ZExtInst>(OpA))
1492 return std::nullopt;
1493
1494 bool Signed = isa<SExtInst>(OpA);
1495
1496 // At this point A could be a function parameter, i.e. not an instruction
1497 Value *ValA = OpA->getOperand(0);
1498 OpB = dyn_cast<Instruction>(OpB->getOperand(0));
1499 if (!OpB || ValA->getType() != OpB->getType())
1500 return std::nullopt;
1501
1502 const SCEV *OffsetSCEVA = SE.getSCEV(ValA);
1503 const SCEV *OffsetSCEVB = SE.getSCEV(OpB);
1504 const SCEV *IdxDiffSCEV = SE.getMinusSCEV(OffsetSCEVB, OffsetSCEVA);
1505 if (IdxDiffSCEV == SE.getCouldNotCompute())
1506 return std::nullopt;
1507
1508 ConstantRange IdxDiffRange = SE.getSignedRange(IdxDiffSCEV);
1509 if (!IdxDiffRange.isSingleElement())
1510 return std::nullopt;
1511 APInt IdxDiff = *IdxDiffRange.getSingleElement();
1512
1513 LLVM_DEBUG(dbgs() << "LSV: getConstantOffsetComplexAddrs IdxDiff=" << IdxDiff
1514 << "\n");
1515
1516 // Now we need to prove that adding IdxDiff to ValA won't overflow.
1517 bool Safe = false;
1518
1519 // First attempt: if OpB is an add (or or-disjoint) with NSW/NUW, and OpB is
1520 // IdxDiff added to ValA, we're okay.
1521 if (isAddLike(OpB) && isa<ConstantInt>(OpB->getOperand(1)) &&
1522 IdxDiff.sle(cast<ConstantInt>(OpB->getOperand(1))->getSExtValue()) &&
1523      checkNoWrapFlags(OpB, Signed))
1524 Safe = true;
1525
1526 // Second attempt: check if we have eligible add NSW/NUW instruction
1527 // sequences.
1528 OpA = dyn_cast<Instruction>(ValA);
1529 if (!Safe && OpA && isAddLike(OpA) && isAddLike(OpB) &&
1530      checkNoWrapFlags(OpA, Signed) && checkNoWrapFlags(OpB, Signed)) {
1531 // In the checks below a matching operand in OpA and OpB is an operand which
1532 // is the same in those two instructions. Below we account for possible
1533 // orders of the operands of these add instructions.
1534 for (unsigned MatchingOpIdxA : {0, 1})
1535 for (unsigned MatchingOpIdxB : {0, 1})
1536 if (!Safe)
1537 Safe = checkIfSafeAddSequence(IdxDiff, OpA, MatchingOpIdxA, OpB,
1538 MatchingOpIdxB, Signed);
1539 }
1540
1541 unsigned BitWidth = ValA->getType()->getScalarSizeInBits();
1542
1543 // Third attempt:
1544 //
1545 // Assuming IdxDiff is positive: If all set bits of IdxDiff or any higher
1546 // order bit other than the sign bit are known to be zero in ValA, we can add
1547 // Diff to it while guaranteeing no overflow of any sort.
1548 //
1549 // If IdxDiff is negative, do the same, but swap ValA and ValB.
1550 if (!Safe) {
1551 // When computing known bits, use the GEPs as context instructions, since
1552 // they likely are in the same BB as the load/store.
1553 KnownBits Known(BitWidth);
1554 computeKnownBits((IdxDiff.sge(0) ? ValA : OpB), Known, DL, &AC, ContextInst,
1555 &DT);
1556 APInt BitsAllowedToBeSet = Known.Zero.zext(IdxDiff.getBitWidth());
1557 if (Signed)
1558 BitsAllowedToBeSet.clearBit(BitWidth - 1);
1559 Safe = BitsAllowedToBeSet.uge(IdxDiff.abs());
1560 }
1561
1562 // Fourth attempt: use SCEV unsigned range to prove that adding IdxDiff
1563 // to ValA won't cause unsigned overflow (which would make zext produce
1564 // a different difference). This handles cases where KnownBits analysis
1565 // can't determine safety but SCEV has tighter range information.
1566 if (!Safe && !Signed) {
1567 Value *CheckVal = IdxDiff.sge(0) ? ValA : OpB;
1568 ConstantRange CR = SE.getUnsignedRange(SE.getSCEV(CheckVal));
1569 APInt AbsDiff = IdxDiff.abs().zextOrTrunc(BitWidth);
1570 APInt Limit = APInt::getMaxValue(BitWidth) - AbsDiff;
1571 Safe = CR.getUnsignedMax().ule(Limit);
1572 }
1573
1574 if (Safe)
1575 return IdxDiff * Stride;
1576 return std::nullopt;
1577}
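// A hypothetical example of what this routine proves (names invented): two
// GEPs that differ only in a sign-extended last index,
//   %i1 = add nsw i32 %i, 1
//   %sA = sext i32 %i to i64
//   %sB = sext i32 %i1 to i64
//   %pA = getelementptr inbounds i32, ptr %base, i64 %sA
//   %pB = getelementptr inbounds i32, ptr %base, i64 %sB
// SCEV gives IdxDiff == 1, the nsw flag on %i1 satisfies the first attempt
// above, and the returned byte offset is IdxDiff * Stride == 4.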
1578
1579std::optional<APInt> Vectorizer::getConstantOffsetSelects(
1580 Value *PtrA, Value *PtrB, Instruction *ContextInst, unsigned Depth) {
1581 if (Depth++ == MaxDepth)
1582 return std::nullopt;
1583
1584 if (auto *SelectA = dyn_cast<SelectInst>(PtrA)) {
1585 if (auto *SelectB = dyn_cast<SelectInst>(PtrB)) {
1586 if (SelectA->getCondition() != SelectB->getCondition())
1587 return std::nullopt;
1588 LLVM_DEBUG(dbgs() << "LSV: getConstantOffsetSelects, PtrA=" << *PtrA
1589 << ", PtrB=" << *PtrB << ", ContextInst="
1590 << *ContextInst << ", Depth=" << Depth << "\n");
1591 std::optional<APInt> TrueDiff = getConstantOffset(
1592 SelectA->getTrueValue(), SelectB->getTrueValue(), ContextInst, Depth);
1593 if (!TrueDiff)
1594 return std::nullopt;
1595 std::optional<APInt> FalseDiff =
1596 getConstantOffset(SelectA->getFalseValue(), SelectB->getFalseValue(),
1597 ContextInst, Depth);
1598 if (TrueDiff == FalseDiff)
1599 return TrueDiff;
1600 }
1601 }
1602 return std::nullopt;
1603}
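// Hypothetical example (names invented): selects over the same condition whose
// corresponding arms differ by the same constant,
//   %pA = select i1 %c, ptr %p, ptr %q
//   %p4 = getelementptr inbounds i8, ptr %p, i64 4
//   %q4 = getelementptr inbounds i8, ptr %q, i64 4
//   %pB = select i1 %c, ptr %p4, ptr %q4
// Both the true arms and the false arms differ by 4 bytes, so TrueDiff ==
// FalseDiff and the selects themselves are reported as 4 bytes apart.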
1604
1605void Vectorizer::mergeEquivalenceClasses(EquivalenceClassMap &EQClasses) const {
1606 if (EQClasses.size() < 2) // There is nothing to merge.
1607 return;
1608
1609  // The reduced key has all elements of the EqClassKey except the underlying
1610  // object. Check that EqClassKey has 4 elements and define the reduced key.
1611 static_assert(std::tuple_size_v<EqClassKey> == 4,
1612 "EqClassKey has changed - EqClassReducedKey needs changes too");
1613 using EqClassReducedKey =
1614 std::tuple<std::tuple_element_t<1, EqClassKey> /* AddrSpace */,
1615 std::tuple_element_t<2, EqClassKey> /* Element size */,
1616 std::tuple_element_t<3, EqClassKey> /* IsLoad; */>;
1617 using ECReducedKeyToUnderlyingObjectMap =
1618 MapVector<EqClassReducedKey,
1619 SmallPtrSet<std::tuple_element_t<0, EqClassKey>, 4>>;
1620
1621 // Form a map from the reduced key (without the underlying object) to the
1622 // underlying objects: 1 reduced key to many underlying objects, to form
1623 // groups of potentially merge-able equivalence classes.
1624 ECReducedKeyToUnderlyingObjectMap RedKeyToUOMap;
1625 bool FoundPotentiallyOptimizableEC = false;
1626 for (const auto &EC : EQClasses) {
1627 const auto &Key = EC.first;
1628 EqClassReducedKey RedKey{std::get<1>(Key), std::get<2>(Key),
1629 std::get<3>(Key)};
1630 auto &UOMap = RedKeyToUOMap[RedKey];
1631 UOMap.insert(std::get<0>(Key));
1632 if (UOMap.size() > 1)
1633 FoundPotentiallyOptimizableEC = true;
1634 }
1635 if (!FoundPotentiallyOptimizableEC)
1636 return;
1637
1638 LLVM_DEBUG({
1639 dbgs() << "LSV: mergeEquivalenceClasses: before merging:\n";
1640 for (const auto &EC : EQClasses) {
1641 dbgs() << " Key: {" << EC.first << "}\n";
1642 for (const auto &Inst : EC.second)
1643 dbgs() << " Inst: " << *Inst << '\n';
1644 }
1645 });
1646 LLVM_DEBUG({
1647 dbgs() << "LSV: mergeEquivalenceClasses: RedKeyToUOMap:\n";
1648 for (const auto &RedKeyToUO : RedKeyToUOMap) {
1649 dbgs() << " Reduced key: {" << std::get<0>(RedKeyToUO.first) << ", "
1650 << std::get<1>(RedKeyToUO.first) << ", "
1651 << static_cast<int>(std::get<2>(RedKeyToUO.first)) << "} --> "
1652 << RedKeyToUO.second.size() << " underlying objects:\n";
1653 for (auto UObject : RedKeyToUO.second)
1654 dbgs() << " " << *UObject << '\n';
1655 }
1656 });
1657
1658 using UObjectToUObjectMap = DenseMap<const Value *, const Value *>;
1659
1660 // Compute the ultimate targets for a set of underlying objects.
1661 auto GetUltimateTargets =
1662 [](SmallPtrSetImpl<const Value *> &UObjects) -> UObjectToUObjectMap {
1663 UObjectToUObjectMap IndirectionMap;
1664 for (const auto *UObject : UObjects) {
1665 const unsigned MaxLookupDepth = 1; // look for 1-level indirections only
1666 const auto *UltimateTarget = getUnderlyingObject(UObject, MaxLookupDepth);
1667 if (UltimateTarget != UObject)
1668 IndirectionMap[UObject] = UltimateTarget;
1669 }
1670 UObjectToUObjectMap UltimateTargetsMap;
1671 for (const auto *UObject : UObjects) {
1672 auto Target = UObject;
1673 auto It = IndirectionMap.find(Target);
1674 for (; It != IndirectionMap.end(); It = IndirectionMap.find(Target))
1675 Target = It->second;
1676 UltimateTargetsMap[UObject] = Target;
1677 }
1678 return UltimateTargetsMap;
1679 };
1680
1681 // For each item in RedKeyToUOMap, if it has more than one underlying object,
1682 // try to merge the equivalence classes.
1683 for (auto &[RedKey, UObjects] : RedKeyToUOMap) {
1684 if (UObjects.size() < 2)
1685 continue;
1686 auto UTMap = GetUltimateTargets(UObjects);
1687 for (const auto &[UObject, UltimateTarget] : UTMap) {
1688 if (UObject == UltimateTarget)
1689 continue;
1690
1691 EqClassKey KeyFrom{UObject, std::get<0>(RedKey), std::get<1>(RedKey),
1692 std::get<2>(RedKey)};
1693 EqClassKey KeyTo{UltimateTarget, std::get<0>(RedKey), std::get<1>(RedKey),
1694 std::get<2>(RedKey)};
1695      // The entry for KeyFrom is guaranteed to exist, unlike KeyTo. Thus,
1696 // request the reference to the instructions vector for KeyTo first.
1697 const auto &VecTo = EQClasses[KeyTo];
1698 const auto &VecFrom = EQClasses[KeyFrom];
1699 SmallVector<Instruction *, 8> MergedVec;
1700 std::merge(VecFrom.begin(), VecFrom.end(), VecTo.begin(), VecTo.end(),
1701 std::back_inserter(MergedVec),
1702 [](Instruction *A, Instruction *B) {
1703 return A && B && A->comesBefore(B);
1704 });
1705 EQClasses[KeyTo] = std::move(MergedVec);
1706 EQClasses.erase(KeyFrom);
1707 }
1708 }
1709 LLVM_DEBUG({
1710 dbgs() << "LSV: mergeEquivalenceClasses: after merging:\n";
1711 for (const auto &EC : EQClasses) {
1712 dbgs() << " Key: {" << EC.first << "}\n";
1713 for (const auto &Inst : EC.second)
1714 dbgs() << " Inst: " << *Inst << '\n';
1715 }
1716 });
1717}
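// Hypothetical scenario this merging helps with: when a pointer chain is
// deeper than getUnderlyingObject()'s default lookup depth, one class can end
// up keyed on an intermediate pointer %mid while another class with the same
// address space, element size, and load/store kind is keyed on %mid's own
// underlying object %obj. GetUltimateTargets maps %mid to %obj (chasing the
// one-level indirections transitively), so the %mid class is folded into the
// %obj class with the instructions kept in program order.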
1718
1719EquivalenceClassMap
1720Vectorizer::collectEquivalenceClasses(BasicBlock::iterator Begin,
1721                                      BasicBlock::iterator End) {
1722 EquivalenceClassMap Ret;
1723
1724 auto GetUnderlyingObject = [](const Value *Ptr) -> const Value * {
1725 const Value *ObjPtr = llvm::getUnderlyingObject(Ptr);
1726 if (const auto *Sel = dyn_cast<SelectInst>(ObjPtr)) {
1727      // The selects themselves are distinct instructions even if they share
1728      // the same condition and evaluate to consecutive pointers for the true
1729      // and false values of the condition. Therefore using the selects
1730      // themselves for grouping instructions would put consecutive accesses
1731      // into different lists and they wouldn't even be checked for being
1732      // consecutive, and wouldn't be vectorized.
1733 return Sel->getCondition();
1734 }
1735 return ObjPtr;
1736 };
1737
1738 for (Instruction &I : make_range(Begin, End)) {
1739 auto *LI = dyn_cast<LoadInst>(&I);
1740 auto *SI = dyn_cast<StoreInst>(&I);
1741 if (!LI && !SI)
1742 continue;
1743
1744 if ((LI && !LI->isSimple()) || (SI && !SI->isSimple()))
1745 continue;
1746
1747 if ((LI && !TTI.isLegalToVectorizeLoad(LI)) ||
1748 (SI && !TTI.isLegalToVectorizeStore(SI)))
1749 continue;
1750
1751 Type *Ty = getLoadStoreType(&I);
1752 if (!VectorType::isValidElementType(Ty->getScalarType()))
1753 continue;
1754
1755 // Skip weird non-byte sizes. They probably aren't worth the effort of
1756 // handling correctly.
1757 unsigned TySize = DL.getTypeSizeInBits(Ty);
1758 if ((TySize % 8) != 0)
1759 continue;
1760
1761    // Skip vectors of pointers. The vectorizeLoadChain/vectorizeStoreChain
1762    // functions currently use an integer type for the vectorized load/store
1763    // and do not support casting between the integer type and a vector of
1764    // pointers (e.g. i64 to <2 x i16*>).
1765 if (Ty->isVectorTy() && Ty->isPtrOrPtrVectorTy())
1766 continue;
1767
1768    Value *Ptr = getLoadStorePointerOperand(&I);
1769 unsigned AS = Ptr->getType()->getPointerAddressSpace();
1770 unsigned VecRegSize = TTI.getLoadStoreVecRegBitWidth(AS);
1771
1772 unsigned VF = VecRegSize / TySize;
1773 VectorType *VecTy = dyn_cast<VectorType>(Ty);
1774
1775 // Only handle power-of-two sized elements.
1776 if ((!VecTy && !isPowerOf2_32(DL.getTypeSizeInBits(Ty))) ||
1777 (VecTy && !isPowerOf2_32(DL.getTypeSizeInBits(VecTy->getScalarType()))))
1778 continue;
1779
1780 // No point in looking at these if they're too big to vectorize.
1781 if (TySize > VecRegSize / 2 ||
1782 (VecTy && TTI.getLoadVectorFactor(VF, TySize, TySize / 8, VecTy) == 0))
1783 continue;
1784
1785 Ret[{GetUnderlyingObject(Ptr), AS,
1786 DL.getTypeSizeInBits(getLoadStoreType(&I)->getScalarType()),
1787 /*IsLoad=*/LI != nullptr}]
1788 .emplace_back(&I);
1789 }
1790
1791 mergeEquivalenceClasses(Ret);
1792 return Ret;
1793}
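// Hypothetical example (names invented): for
//   %a = load i32, ptr addrspace(1) %gep0
//   %b = load i32, ptr addrspace(1) %gep1
// where %gep0 and %gep1 are GEPs off the same argument %arg, both loads land
// in the class keyed {%arg, /*AddrSpace=*/1, /*ElemBits=*/32, /*IsLoad=*/true}
// and are later considered together by gatherChains().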
1794
1795std::vector<Chain> Vectorizer::gatherChains(ArrayRef<Instruction *> Instrs) {
1796 if (Instrs.empty())
1797 return {};
1798
1799 unsigned AS = getLoadStoreAddressSpace(Instrs[0]);
1800 unsigned ASPtrBits = DL.getIndexSizeInBits(AS);
1801
1802#ifndef NDEBUG
1803 // Check that Instrs is in BB order and all have the same addr space.
1804 for (size_t I = 1; I < Instrs.size(); ++I) {
1805 assert(Instrs[I - 1]->comesBefore(Instrs[I]));
1806 assert(getLoadStoreAddressSpace(Instrs[I]) == AS);
1807 }
1808#endif
1809
1810 // Machinery to build an MRU-hashtable of Chains.
1811 //
1812 // (Ideally this could be done with MapVector, but as currently implemented,
1813 // moving an element to the front of a MapVector is O(n).)
1814 struct InstrListElem : ilist_node<InstrListElem>,
1815 std::pair<Instruction *, Chain> {
1816 explicit InstrListElem(Instruction *I)
1817 : std::pair<Instruction *, Chain>(I, {}) {}
1818 };
1819 struct InstrListElemDenseMapInfo {
1820 using PtrInfo = DenseMapInfo<InstrListElem *>;
1821 using IInfo = DenseMapInfo<Instruction *>;
1822 static InstrListElem *getEmptyKey() { return PtrInfo::getEmptyKey(); }
1823 static InstrListElem *getTombstoneKey() {
1824 return PtrInfo::getTombstoneKey();
1825 }
1826 static unsigned getHashValue(const InstrListElem *E) {
1827 return IInfo::getHashValue(E->first);
1828 }
1829 static bool isEqual(const InstrListElem *A, const InstrListElem *B) {
1830 if (A == getEmptyKey() || B == getEmptyKey())
1831 return A == getEmptyKey() && B == getEmptyKey();
1832 if (A == getTombstoneKey() || B == getTombstoneKey())
1833 return A == getTombstoneKey() && B == getTombstoneKey();
1834 return IInfo::isEqual(A->first, B->first);
1835 }
1836 };
1837 SpecificBumpPtrAllocator<InstrListElem> Allocator;
1838 simple_ilist<InstrListElem> MRU;
1839 DenseSet<InstrListElem *, InstrListElemDenseMapInfo> Chains;
1840
1841  // Compare each instruction in `Instrs` to the leaders of the N most
1842  // recently-used chains. This limits the O(n^2) behavior while also allowing
1843 // us to build arbitrarily long chains.
1844 for (Instruction *I : Instrs) {
1845 constexpr int MaxChainsToTry = 64;
1846
1847 bool MatchFound = false;
1848 auto ChainIter = MRU.begin();
1849 for (size_t J = 0; J < MaxChainsToTry && ChainIter != MRU.end();
1850 ++J, ++ChainIter) {
1851 if (std::optional<APInt> Offset = getConstantOffset(
1852 getLoadStorePointerOperand(ChainIter->first),
1853            getLoadStorePointerOperand(I),
1854 /*ContextInst=*/
1855 (ChainIter->first->comesBefore(I) ? I : ChainIter->first))) {
1856        // `Offset` might not have the expected number of bits if, e.g., the
1857        // index width of AS differs from the pointer width.
1858 ChainIter->second.emplace_back(I, Offset.value());
1859 // Move ChainIter to the front of the MRU list.
1860 MRU.remove(*ChainIter);
1861 MRU.push_front(*ChainIter);
1862 MatchFound = true;
1863 break;
1864 }
1865 }
1866
1867 if (!MatchFound) {
1868 APInt ZeroOffset(ASPtrBits, 0);
1869 InstrListElem *E = new (Allocator.Allocate()) InstrListElem(I);
1870 E->second.emplace_back(I, ZeroOffset);
1871 MRU.push_front(*E);
1872 Chains.insert(E);
1873 }
1874 }
1875
1876 std::vector<Chain> Ret;
1877 Ret.reserve(Chains.size());
1878 // Iterate over MRU rather than Chains so the order is deterministic.
1879 for (auto &E : MRU)
1880 if (E.second.size() > 1)
1881 Ret.emplace_back(std::move(E.second));
1882 return Ret;
1883}
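// Hypothetical example: for four loads whose pointers are the leader's pointer
// plus byte offsets 0, 4, 64, and 8, the first load starts a chain with offset
// 0 and each later load joins it once getConstantOffset() succeeds, giving
// offsets 4, 64, and 8. Gaps such as the jump to 64 are permitted here; chains
// are only split into contiguous, legally sized pieces in a later step.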
1884
1885std::optional<APInt> Vectorizer::getConstantOffset(Value *PtrA, Value *PtrB,
1886 Instruction *ContextInst,
1887 unsigned Depth) {
1888 LLVM_DEBUG(dbgs() << "LSV: getConstantOffset, PtrA=" << *PtrA
1889 << ", PtrB=" << *PtrB << ", ContextInst= " << *ContextInst
1890 << ", Depth=" << Depth << "\n");
1891 // We'll ultimately return a value of this bit width, even if computations
1892 // happen in a different width.
1893 unsigned OrigBitWidth = DL.getIndexTypeSizeInBits(PtrA->getType());
1894 APInt OffsetA(OrigBitWidth, 0);
1895 APInt OffsetB(OrigBitWidth, 0);
1896 PtrA = PtrA->stripAndAccumulateInBoundsConstantOffsets(DL, OffsetA);
1897 PtrB = PtrB->stripAndAccumulateInBoundsConstantOffsets(DL, OffsetB);
1898 unsigned NewPtrBitWidth = DL.getTypeStoreSizeInBits(PtrA->getType());
1899 if (NewPtrBitWidth != DL.getTypeStoreSizeInBits(PtrB->getType()))
1900 return std::nullopt;
1901
1902 // If we have to shrink the pointer, stripAndAccumulateInBoundsConstantOffsets
1903 // should properly handle a possible overflow and the value should fit into
1904 // the smallest data type used in the cast/gep chain.
1905 assert(OffsetA.getSignificantBits() <= NewPtrBitWidth &&
1906 OffsetB.getSignificantBits() <= NewPtrBitWidth);
1907
1908 OffsetA = OffsetA.sextOrTrunc(NewPtrBitWidth);
1909 OffsetB = OffsetB.sextOrTrunc(NewPtrBitWidth);
1910 if (PtrA == PtrB)
1911 return (OffsetB - OffsetA).sextOrTrunc(OrigBitWidth);
1912
1913 // Try to compute B - A.
1914 const SCEV *DistScev = SE.getMinusSCEV(SE.getSCEV(PtrB), SE.getSCEV(PtrA));
1915 if (DistScev != SE.getCouldNotCompute()) {
1916 LLVM_DEBUG(dbgs() << "LSV: SCEV PtrB - PtrA =" << *DistScev << "\n");
1917 ConstantRange DistRange = SE.getSignedRange(DistScev);
1918 if (DistRange.isSingleElement()) {
1919 // Handle index width (the width of Dist) != pointer width (the width of
1920 // the Offset*s at this point).
1921 APInt Dist = DistRange.getSingleElement()->sextOrTrunc(NewPtrBitWidth);
1922 return (OffsetB - OffsetA + Dist).sextOrTrunc(OrigBitWidth);
1923 }
1924 }
1925 if (std::optional<APInt> Diff =
1926 getConstantOffsetComplexAddrs(PtrA, PtrB, ContextInst, Depth))
1927 return (OffsetB - OffsetA + Diff->sext(OffsetB.getBitWidth()))
1928 .sextOrTrunc(OrigBitWidth);
1929 return std::nullopt;
1930}
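// Hypothetical worked example (names invented): with
//   %pA = getelementptr inbounds i8, ptr %p, i64 8
//   %pB = getelementptr inbounds i8, ptr %p, i64 20
// stripAndAccumulateInBoundsConstantOffsets() yields OffsetA == 8 and
// OffsetB == 20 with both pointers stripping to %p, so the function returns
// OffsetB - OffsetA == 12 in the index width of the address space.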
1931
1932bool Vectorizer::accessIsAllowedAndFast(unsigned SizeBytes, unsigned AS,
1933 Align Alignment,
1934 unsigned VecElemBits) const {
1935 // Aligned vector accesses are ALWAYS faster than element-wise accesses.
1936 if (Alignment.value() % SizeBytes == 0)
1937 return true;
1938
1939 // Ask TTI whether misaligned accesses are faster as vector or element-wise.
1940 unsigned VectorizedSpeed = 0;
1941 bool AllowsMisaligned = TTI.allowsMisalignedMemoryAccesses(
1942 F.getContext(), SizeBytes * 8, AS, Alignment, &VectorizedSpeed);
1943 if (!AllowsMisaligned) {
1944 LLVM_DEBUG(
1945 dbgs() << "LSV: Access of " << SizeBytes << "B in addrspace " << AS
1946 << " with alignment " << Alignment.value()
1947 << " is misaligned, and therefore can't be vectorized.\n");
1948 return false;
1949 }
1950
1951 unsigned ElementwiseSpeed = 0;
1952  TTI.allowsMisalignedMemoryAccesses(F.getContext(), VecElemBits, AS,
1953                                     Alignment, &ElementwiseSpeed);
1954 if (VectorizedSpeed < ElementwiseSpeed) {
1955 LLVM_DEBUG(dbgs() << "LSV: Access of " << SizeBytes << "B in addrspace "
1956 << AS << " with alignment " << Alignment.value()
1957 << " has relative speed " << VectorizedSpeed
1958 << ", which is lower than the elementwise speed of "
1959 << ElementwiseSpeed
1960 << ". Therefore this access won't be vectorized.\n");
1961 return false;
1962 }
1963 return true;
1964}
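// Hypothetical example: a 16-byte access with 16-byte alignment returns true
// immediately (16 % 16 == 0). The same access with 4-byte alignment falls
// through to TTI: it is vectorized only if the target allows the misaligned
// 128-bit access and reports it at least as fast as the element-wise accesses.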
1965
1966ChainElem Vectorizer::createExtraElementAfter(const ChainElem &Prev, Type *Ty,
1967 APInt Offset, StringRef Prefix,
1968 Align Alignment) {
1969 Instruction *NewElement = nullptr;
1970 Builder.SetInsertPoint(Prev.Inst->getNextNode());
1971 if (LoadInst *PrevLoad = dyn_cast<LoadInst>(Prev.Inst)) {
1972 Value *NewGep = Builder.CreatePtrAdd(
1973 PrevLoad->getPointerOperand(), Builder.getInt(Offset), Prefix + "GEP");
1974 LLVM_DEBUG(dbgs() << "LSV: Extra GEP Created: \n" << *NewGep << "\n");
1975 NewElement = Builder.CreateAlignedLoad(Ty, NewGep, Alignment, Prefix);
1976 } else {
1977 StoreInst *PrevStore = cast<StoreInst>(Prev.Inst);
1978
1979 Value *NewGep = Builder.CreatePtrAdd(
1980 PrevStore->getPointerOperand(), Builder.getInt(Offset), Prefix + "GEP");
1981 LLVM_DEBUG(dbgs() << "LSV: Extra GEP Created: \n" << *NewGep << "\n");
1982 NewElement =
1983 Builder.CreateAlignedStore(PoisonValue::get(Ty), NewGep, Alignment);
1984 }
1985
1986 // Attach all metadata to the new element.
1987 // propagateMetadata will fold it into the final vector when applicable.
1988 NewElement->copyMetadata(*Prev.Inst);
1989
1990 // Cache created elements for tracking and cleanup
1991 ExtraElements.insert(NewElement);
1992
1993 APInt NewOffsetFromLeader = Prev.OffsetFromLeader + Offset;
1994 LLVM_DEBUG(dbgs() << "LSV: Extra Element Created: \n"
1995 << *NewElement
1996 << " OffsetFromLeader: " << NewOffsetFromLeader << "\n");
1997 return ChainElem{NewElement, NewOffsetFromLeader};
1998}
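// Hypothetical example: given Prev at OffsetFromLeader == 8 and Offset == 4,
// this emits a "<Prefix>GEP" pointer 4 bytes past Prev's pointer operand and a
// load (or a store of poison) through it immediately after Prev.Inst, records
// the new instruction in ExtraElements, and returns a ChainElem whose
// OffsetFromLeader is 12.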
1999
2000Value *Vectorizer::createMaskForExtraElements(const ArrayRef<ChainElem> C,
2001 FixedVectorType *VecTy) {
2002 // Start each mask element as false
2003  SmallVector<Constant *> MaskElts(VecTy->getNumElements(),
2004 Builder.getInt1(false));
2005 // Iterate over the chain and set the corresponding mask element to true for
2006 // each element that is not an extra element.
2007 for (const ChainElem &E : C) {
2008 if (ExtraElements.contains(E.Inst))
2009 continue;
2010 unsigned EOffset =
2011 (E.OffsetFromLeader - C[0].OffsetFromLeader).getZExtValue();
2012 unsigned VecIdx =
2013 8 * EOffset / DL.getTypeSizeInBits(VecTy->getScalarType());
2014 if (FixedVectorType *VT =
2015            dyn_cast<FixedVectorType>(getLoadStoreType(E.Inst)))
2016 for (unsigned J = 0; J < VT->getNumElements(); ++J)
2017 MaskElts[VecIdx + J] = Builder.getInt1(true);
2018 else
2019 MaskElts[VecIdx] = Builder.getInt1(true);
2020 }
2021 return ConstantVector::get(MaskElts);
2022}
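// Hypothetical example: for a <4 x i32> vector built from a chain of scalar
// i32 elements at offsets 0, 4, 8, and 12 from the leader, where the element
// at offset 8 was created by createExtraElementAfter(), the resulting mask is
// <i1 true, i1 true, i1 false, i1 true>.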
2023
2024void Vectorizer::deleteExtraElements() {
2025 for (auto *ExtraElement : ExtraElements) {
2026 if (isa<LoadInst>(ExtraElement)) {
2027 [[maybe_unused]] bool Deleted =
2028          RecursivelyDeleteTriviallyDeadInstructions(ExtraElement);
2029 assert(Deleted && "Extra Load should always be trivially dead");
2030 } else {
2031 // Unlike Extra Loads, Extra Stores won't be "dead", but should all be
2032 // deleted regardless. They will have either been combined into a masked
2033 // store, or will be left behind and need to be cleaned up.
2034 auto *PtrOperand = getLoadStorePointerOperand(ExtraElement);
2035 ExtraElement->eraseFromParent();
2036      RecursivelyDeleteTriviallyDeadInstructions(PtrOperand);
2037 }
2038 }
2039
2040 ExtraElements.clear();
2041}