LLVM 23.0.0git
AMDGPUPromoteAlloca.cpp
Go to the documentation of this file.
1//===-- AMDGPUPromoteAlloca.cpp - Promote Allocas -------------------------===//
2//
3// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4// See https://llvm.org/LICENSE.txt for license information.
5// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6//
7//===----------------------------------------------------------------------===//
8//
9// Eliminates allocas by either converting them into vectors or by migrating
10// them to local address space.
11//
12// Two passes are exposed by this file:
13// - "promote-alloca-to-vector", which runs early in the pipeline and only
14// promotes to vector. Promotion to vector is almost always profitable
15// except when the alloca is too big and the promotion would result in
16// very high register pressure.
17// - "promote-alloca", which does both promotion to vector and LDS and runs
18// much later in the pipeline. This runs after SROA because promoting to
19// LDS is of course less profitable than getting rid of the alloca or
20// vectorizing it, thus we only want to do it when the only alternative is
21// lowering the alloca to stack.
22//
23// Note that both of them exist for the old and new PMs. The new PM passes are
24// declared in AMDGPU.h and the legacy PM ones are declared here.
25//
26//===----------------------------------------------------------------------===//
27
28#include "AMDGPU.h"
29#include "GCNSubtarget.h"
31#include "llvm/ADT/STLExtras.h"
38#include "llvm/IR/IRBuilder.h"
40#include "llvm/IR/IntrinsicsAMDGPU.h"
41#include "llvm/IR/IntrinsicsR600.h"
44#include "llvm/Pass.h"
48
49#define DEBUG_TYPE "amdgpu-promote-alloca"
50
51using namespace llvm;
52
53namespace {
54
// Escape hatch to turn off the promote-to-vector strategy entirely.
static cl::opt<bool>
    DisablePromoteAllocaToVector("disable-promote-alloca-to-vector",
                                 cl::desc("Disable promote alloca to vector"),
                                 cl::init(false));

// Escape hatch to turn off the promote-to-LDS strategy entirely.
static cl::opt<bool>
    DisablePromoteAllocaToLDS("disable-promote-alloca-to-lds",
                              cl::desc("Disable promote alloca to LDS"),
                              cl::init(false));

// 0 (the default) means "derive the limit from the VGPR budget"; see the
// VectorizationBudget computation in run().
static cl::opt<unsigned> PromoteAllocaToVectorLimit(
    "amdgpu-promote-alloca-to-vector-limit",
    cl::desc("Maximum byte size to consider promote alloca to vector"),
    cl::init(0));
69
70static cl::opt<unsigned> PromoteAllocaToVectorMaxRegs(
71 "amdgpu-promote-alloca-to-vector-max-regs",
73 "Maximum vector size (in 32b registers) to use when promoting alloca"),
74 cl::init(32));
75
76// Use up to 1/4 of available register budget for vectorization.
77// FIXME: Increase the limit for whole function budgets? Perhaps x2?
78static cl::opt<unsigned> PromoteAllocaToVectorVGPRRatio(
79 "amdgpu-promote-alloca-to-vector-vgpr-ratio",
80 cl::desc("Ratio of VGPRs to budget for promoting alloca to vectors"),
81 cl::init(4));
82
84 LoopUserWeight("promote-alloca-vector-loop-user-weight",
85 cl::desc("The bonus weight of users of allocas within loop "
86 "when sorting profitable allocas"),
87 cl::init(4));
88
// We support vector indices of the form ((A * stride) >> shift) + B
// VarIndex is A, VarMul is stride, VarShift is shift and ConstIndex is B. All
// parts are optional.
struct GEPToVectorIndex {
  // A: the variable index value. Held as a WeakTrackingVH so it follows RAUW
  // and nulls out if the underlying value is deleted.
  WeakTrackingVH VarIndex = nullptr; // defaults to 0
  ConstantInt *VarMul = nullptr; // defaults to 1
  ConstantInt *VarShift = nullptr; // defaults to 0
  ConstantInt *ConstIndex = nullptr; // defaults to 0
  // Cache of the fully materialized index expression; built lazily the first
  // time calculateVectorIndex() is asked for this GEP.
  Value *Full = nullptr;
};
99
// Constant element indices of a mem-transfer intrinsic over a vectorized
// alloca; consulted when the intrinsic is rewritten as a shufflevector.
struct MemTransferInfo {
  ConstantInt *SrcIndex = nullptr;
  ConstantInt *DestIndex = nullptr;
};
104
105// Analysis for planning the different strategies of alloca promotion.
106struct AllocaAnalysis {
107 AllocaInst *Alloca = nullptr;
108 DenseSet<Value *> Pointers;
110 unsigned Score = 0;
111 bool HaveSelectOrPHI = false;
112 struct {
113 FixedVectorType *Ty = nullptr;
115 SmallVector<Instruction *> UsersToRemove;
118 } Vector;
119 struct {
120 bool Enable = false;
121 SmallVector<User *> Worklist;
122 } LDS;
123
124 explicit AllocaAnalysis(AllocaInst *Alloca) : Alloca(Alloca) {}
125};
126
// Shared implementation which can do both promotion to vector and to LDS.
class AMDGPUPromoteAllocaImpl {
private:
  const TargetMachine &TM;
  LoopInfo &LI;
  Module *Mod = nullptr;
  const DataLayout *DL = nullptr;

  // FIXME: This should be per-kernel.
  uint32_t LocalMemLimit = 0;
  uint32_t CurrentLocalMemUsage = 0;

  // Per-function limits, computed in run() before any promotion is attempted.
  unsigned MaxVGPRs;        // VGPR count used to derive the vectorization budget.
  unsigned VGPRBudgetRatio; // Spend 1/VGPRBudgetRatio of the VGPRs on vectors.
  unsigned MaxVectorRegs;   // Max size (in 32-bit regs) of one promoted vector.

  bool IsAMDGCN = false;
  bool IsAMDHSA = false;

  std::pair<Value *, Value *> getLocalSizeYZ(IRBuilder<> &Builder);
  Value *getWorkitemID(IRBuilder<> &Builder, unsigned N);

  // Walks all transitive uses of AA.Alloca, recording them in AA. Returns
  // false if any use blocks promotion (e.g. the pointer escapes).
  bool collectAllocaUses(AllocaAnalysis &AA) const;

  /// Val is a derived pointer from Alloca. OpIdx0/OpIdx1 are the operand
  /// indices to an instruction with 2 pointer inputs (e.g. select, icmp).
  /// Returns true if both operands are derived from the same alloca. Val should
  /// be the same value as one of the input operands of UseInst.
  bool binaryOpIsDerivedFromSameAlloca(Value *Alloca, Value *Val,
                                       Instruction *UseInst, int OpIdx0,
                                       int OpIdx1) const;

  /// Check whether we have enough local memory for promotion.
  bool hasSufficientLocalMem(const Function &F);

  // Returns the fixed vector type the alloca would be promoted to, or null if
  // the allocated type cannot be represented as one.
  FixedVectorType *getVectorTypeForAlloca(Type *AllocaTy) const;
  void analyzePromoteToVector(AllocaAnalysis &AA) const;
  void promoteAllocaToVector(AllocaAnalysis &AA);
  void analyzePromoteToLDS(AllocaAnalysis &AA) const;
  bool tryPromoteAllocaToLDS(AllocaAnalysis &AA, bool SufficientLDS,
                             SetVector<IntrinsicInst *> &DeferredIntrs);
  void
  finishDeferredAllocaToLDSPromotion(SetVector<IntrinsicInst *> &DeferredIntrs);

  // Computes AA.Score: one point per memory-accessing user, weighted up for
  // users inside loops (see LoopUserWeight).
  void scoreAlloca(AllocaAnalysis &AA) const;

  // Loads MaxVectorRegs/VGPRBudgetRatio from function attributes, with
  // command-line options taking precedence.
  void setFunctionLimits(const Function &F);

public:
  AMDGPUPromoteAllocaImpl(TargetMachine &TM, LoopInfo &LI) : TM(TM), LI(LI) {

    const Triple &TT = TM.getTargetTriple();
    IsAMDGCN = TT.isAMDGCN();
    IsAMDHSA = TT.getOS() == Triple::AMDHSA;
  }

  // Entry point; returns true if the function was modified.
  bool run(Function &F, bool PromoteToLDS);
};
184
185// FIXME: This can create globals so should be a module pass.
186class AMDGPUPromoteAlloca : public FunctionPass {
187public:
188 static char ID;
189
190 AMDGPUPromoteAlloca() : FunctionPass(ID) {}
191
192 bool runOnFunction(Function &F) override {
193 if (skipFunction(F))
194 return false;
195 if (auto *TPC = getAnalysisIfAvailable<TargetPassConfig>())
196 return AMDGPUPromoteAllocaImpl(
197 TPC->getTM<TargetMachine>(),
198 getAnalysis<LoopInfoWrapperPass>().getLoopInfo())
199 .run(F, /*PromoteToLDS*/ true);
200 return false;
201 }
202
203 StringRef getPassName() const override { return "AMDGPU Promote Alloca"; }
204
205 void getAnalysisUsage(AnalysisUsage &AU) const override {
206 AU.setPreservesCFG();
209 }
210};
211
212static unsigned getMaxVGPRs(unsigned LDSBytes, const TargetMachine &TM,
213 const Function &F) {
214 if (!TM.getTargetTriple().isAMDGCN())
215 return 128;
216
217 const GCNSubtarget &ST = TM.getSubtarget<GCNSubtarget>(F);
218
219 unsigned DynamicVGPRBlockSize = AMDGPU::getDynamicVGPRBlockSize(F);
220 // Temporarily check both the attribute and the subtarget feature, until the
221 // latter is removed.
222 if (DynamicVGPRBlockSize == 0 && ST.isDynamicVGPREnabled())
223 DynamicVGPRBlockSize = ST.getDynamicVGPRBlockSize();
224
225 unsigned MaxVGPRs = ST.getMaxNumVGPRs(
226 ST.getWavesPerEU(ST.getFlatWorkGroupSizes(F), LDSBytes, F).first,
227 DynamicVGPRBlockSize);
228
229 // A non-entry function has only 32 caller preserved registers.
230 // Do not promote alloca which will force spilling unless we know the function
231 // will be inlined.
232 if (!F.hasFnAttribute(Attribute::AlwaysInline) &&
233 !AMDGPU::isEntryFunctionCC(F.getCallingConv()))
234 MaxVGPRs = std::min(MaxVGPRs, 32u);
235 return MaxVGPRs;
236}
237
238} // end anonymous namespace
239
// Unique ID of the legacy pass; its address identifies the pass type.
char AMDGPUPromoteAlloca::ID = 0;
241
243 "AMDGPU promote alloca to vector or LDS", false, false)
244// Move LDS uses from functions to kernels before promote alloca for accurate
245// estimation of LDS available
246INITIALIZE_PASS_DEPENDENCY(AMDGPULowerModuleLDSLegacy)
248INITIALIZE_PASS_END(AMDGPUPromoteAlloca, DEBUG_TYPE,
249 "AMDGPU promote alloca to vector or LDS", false, false)
250
// Exported reference so other code can identify this legacy pass by ID.
char &llvm::AMDGPUPromoteAllocaID = AMDGPUPromoteAlloca::ID;
252
255 auto &LI = AM.getResult<LoopAnalysis>(F);
256 bool Changed = AMDGPUPromoteAllocaImpl(TM, LI).run(F, /*PromoteToLDS=*/true);
257 if (Changed) {
260 return PA;
261 }
262 return PreservedAnalyses::all();
263}
264
267 auto &LI = AM.getResult<LoopAnalysis>(F);
268 bool Changed = AMDGPUPromoteAllocaImpl(TM, LI).run(F, /*PromoteToLDS=*/false);
269 if (Changed) {
272 return PA;
273 }
274 return PreservedAnalyses::all();
275}
276
278 return new AMDGPUPromoteAlloca();
279}
280
281bool AMDGPUPromoteAllocaImpl::collectAllocaUses(AllocaAnalysis &AA) const {
282 const auto RejectUser = [&](Instruction *Inst, Twine Msg) {
283 LLVM_DEBUG(dbgs() << " Cannot promote alloca: " << Msg << "\n"
284 << " " << *Inst << "\n");
285 return false;
286 };
287
288 SmallVector<Instruction *, 4> WorkList({AA.Alloca});
289 while (!WorkList.empty()) {
290 auto *Cur = WorkList.pop_back_val();
291 if (find(AA.Pointers, Cur) != AA.Pointers.end())
292 continue;
293 AA.Pointers.insert(Cur);
294 for (auto &U : Cur->uses()) {
295 auto *Inst = cast<Instruction>(U.getUser());
296 if (isa<StoreInst>(Inst)) {
297 if (U.getOperandNo() != StoreInst::getPointerOperandIndex()) {
298 return RejectUser(Inst, "pointer escapes via store");
299 }
300 }
301 AA.Uses.push_back(&U);
302
303 if (isa<GetElementPtrInst>(U.getUser())) {
304 WorkList.push_back(Inst);
305 } else if (auto *SI = dyn_cast<SelectInst>(Inst)) {
306 // Only promote a select if we know that the other select operand is
307 // from another pointer that will also be promoted.
308 if (!binaryOpIsDerivedFromSameAlloca(AA.Alloca, Cur, SI, 1, 2))
309 return RejectUser(Inst, "select from mixed objects");
310 WorkList.push_back(Inst);
311 AA.HaveSelectOrPHI = true;
312 } else if (auto *Phi = dyn_cast<PHINode>(Inst)) {
313 // Repeat for phis.
314
315 // TODO: Handle more complex cases. We should be able to replace loops
316 // over arrays.
317 switch (Phi->getNumIncomingValues()) {
318 case 1:
319 break;
320 case 2:
321 if (!binaryOpIsDerivedFromSameAlloca(AA.Alloca, Cur, Phi, 0, 1))
322 return RejectUser(Inst, "phi from mixed objects");
323 break;
324 default:
325 return RejectUser(Inst, "phi with too many operands");
326 }
327
328 WorkList.push_back(Inst);
329 AA.HaveSelectOrPHI = true;
330 }
331 }
332 }
333 return true;
334}
335
336void AMDGPUPromoteAllocaImpl::scoreAlloca(AllocaAnalysis &AA) const {
337 LLVM_DEBUG(dbgs() << "Scoring: " << *AA.Alloca << "\n");
338 unsigned Score = 0;
339 // Increment score by one for each user + a bonus for users within loops.
340 for (auto *U : AA.Uses) {
341 Instruction *Inst = cast<Instruction>(U->getUser());
342 if (isa<GetElementPtrInst>(Inst) || isa<SelectInst>(Inst) ||
343 isa<PHINode>(Inst))
344 continue;
345 unsigned UserScore =
346 1 + (LoopUserWeight * LI.getLoopDepth(Inst->getParent()));
347 LLVM_DEBUG(dbgs() << " [+" << UserScore << "]:\t" << *Inst << "\n");
348 Score += UserScore;
349 }
350 LLVM_DEBUG(dbgs() << " => Final Score:" << Score << "\n");
351 AA.Score = Score;
352}
353
354void AMDGPUPromoteAllocaImpl::setFunctionLimits(const Function &F) {
355 // Load per function limits, overriding with global options where appropriate.
356 // R600 register tuples/aliasing are fragile with large vector promotions so
357 // apply architecture specific limit here.
358 const int R600MaxVectorRegs = 16;
359 MaxVectorRegs = F.getFnAttributeAsParsedInteger(
360 "amdgpu-promote-alloca-to-vector-max-regs",
361 IsAMDGCN ? PromoteAllocaToVectorMaxRegs : R600MaxVectorRegs);
362 if (PromoteAllocaToVectorMaxRegs.getNumOccurrences())
363 MaxVectorRegs = PromoteAllocaToVectorMaxRegs;
364 VGPRBudgetRatio = F.getFnAttributeAsParsedInteger(
365 "amdgpu-promote-alloca-to-vector-vgpr-ratio",
366 PromoteAllocaToVectorVGPRRatio);
367 if (PromoteAllocaToVectorVGPRRatio.getNumOccurrences())
368 VGPRBudgetRatio = PromoteAllocaToVectorVGPRRatio;
369}
370
// Main driver: analyzes every static entry-block alloca, scores the promotable
// ones, and promotes them (vector first, LDS as fallback) within a register
// budget. Returns true if the function was changed.
bool AMDGPUPromoteAllocaImpl::run(Function &F, bool PromoteToLDS) {
  // Nothing to do if both strategies are disabled.
  if (DisablePromoteAllocaToLDS && DisablePromoteAllocaToVector)
    return false;

  Mod = F.getParent();
  DL = &Mod->getDataLayout();

  // hasSufficientLocalMem also computes CurrentLocalMemUsage, which feeds the
  // VGPR budget below.
  bool SufficientLDS = PromoteToLDS && hasSufficientLocalMem(F);
  MaxVGPRs = getMaxVGPRs(CurrentLocalMemUsage, TM, F);
  setFunctionLimits(F);

  // The budget is in bits: either the explicit byte limit * 8, or a
  // 1/VGPRBudgetRatio share of the available VGPRs (32 bits each).
  unsigned VectorizationBudget =
      (PromoteAllocaToVectorLimit ? PromoteAllocaToVectorLimit * 8
                                  : (MaxVGPRs * 32)) /
      VGPRBudgetRatio;

  // Collect and analyze candidate allocas.
  std::vector<AllocaAnalysis> Allocas;
  for (Instruction &I : F.getEntryBlock()) {
    if (AllocaInst *AI = dyn_cast<AllocaInst>(&I)) {
      // Array allocations are probably not worth handling, since an allocation
      // of the array type is the canonical form.
      if (!AI->isStaticAlloca() || AI->isArrayAllocation())
        continue;

      LLVM_DEBUG(dbgs() << "Analyzing: " << *AI << '\n');

      AllocaAnalysis AA{AI};
      if (collectAllocaUses(AA)) {
        analyzePromoteToVector(AA);
        if (PromoteToLDS)
          analyzePromoteToLDS(AA);
        // Keep only the allocas at least one strategy can handle.
        if (AA.Vector.Ty || AA.LDS.Enable) {
          scoreAlloca(AA);
          Allocas.push_back(std::move(AA));
        }
      }
    }
  }

  // Promote the highest-scoring allocas first so the budget is spent on the
  // most profitable ones.
  stable_sort(Allocas,
              [](const auto &A, const auto &B) { return A.Score > B.Score; });

  // clang-format off
  // NOTE(review): an LLVM_DEBUG( opener appears to be missing before this
  // debug dump -- verify against the upstream source.
    dbgs() << "Sorted Worklist:\n";
    for (const auto &AA : Allocas)
      dbgs() << " " << *AA.Alloca << "\n";
  );
  // clang-format on

  bool Changed = false;
  SetVector<IntrinsicInst *> DeferredIntrs;
  for (AllocaAnalysis &AA : Allocas) {
    if (AA.Vector.Ty) {
      std::optional<TypeSize> Size = AA.Alloca->getAllocationSize(*DL);
      assert(Size); // Expected to succeed on non-array alloca.
      const unsigned AllocaCost = Size->getFixedValue() * 8;
      // First, check if we have enough budget to vectorize this alloca.
      if (AllocaCost <= VectorizationBudget) {
        promoteAllocaToVector(AA);
        Changed = true;
        assert((VectorizationBudget - AllocaCost) < VectorizationBudget &&
               "Underflow!");
        VectorizationBudget -= AllocaCost;
        LLVM_DEBUG(dbgs() << " Remaining vectorization budget:"
                          << VectorizationBudget << "\n");
        continue;
      } else {
        LLVM_DEBUG(dbgs() << "Alloca too big for vectorization (size:"
                          << AllocaCost << ", budget:" << VectorizationBudget
                          << "): " << *AA.Alloca << "\n");
      }
    }

    // Fall back to LDS when vectorization was not possible or too expensive.
    if (AA.LDS.Enable &&
        tryPromoteAllocaToLDS(AA, SufficientLDS, DeferredIntrs))
      Changed = true;
  }
  finishDeferredAllocaToLDSPromotion(DeferredIntrs);

  // NOTE: tryPromoteAllocaToVector removes the alloca, so Allocas contains
  // dangling pointers. If we want to reuse it past this point, the loop above
  // would need to be updated to remove successfully promoted allocas.

  return Changed;
}
457
458// Checks if the instruction I is a memset user of the alloca AI that we can
459// deal with. Currently, only non-volatile memsets that affect the whole alloca
460// are handled.
462 const DataLayout &DL) {
463 using namespace PatternMatch;
464 // For now we only care about non-volatile memsets that affect the whole type
465 // (start at index 0 and fill the whole alloca).
466 //
467 // TODO: Now that we moved to PromoteAlloca we could handle any memsets
468 // (except maybe volatile ones?) - we just need to use shufflevector if it
469 // only affects a subset of the vector.
470 const unsigned Size = DL.getTypeStoreSize(AI->getAllocatedType());
471 return I->getOperand(0) == AI &&
472 match(I->getOperand(2), m_SpecificInt(Size)) && !I->isVolatile();
473}
474
475static Value *calculateVectorIndex(Value *Ptr, AllocaAnalysis &AA) {
476 IRBuilder<> B(Ptr->getContext());
477
478 Ptr = Ptr->stripPointerCasts();
479 if (Ptr == AA.Alloca)
480 return B.getInt32(0);
481
482 auto *GEP = cast<GetElementPtrInst>(Ptr);
483 auto I = AA.Vector.GEPVectorIdx.find(GEP);
484 assert(I != AA.Vector.GEPVectorIdx.end() && "Must have entry for GEP!");
485
486 if (!I->second.Full) {
487 Value *Result = nullptr;
488 B.SetInsertPoint(GEP);
489
490 if (I->second.VarIndex) {
491 Result = I->second.VarIndex;
492 Result = B.CreateSExtOrTrunc(Result, B.getInt32Ty());
493
494 if (I->second.VarMul)
495 Result = B.CreateMul(Result, I->second.VarMul);
496
497 if (I->second.VarShift)
498 Result = B.CreateAShr(Result, I->second.VarShift, "", /*isExact*/ true);
499 }
500
501 if (I->second.ConstIndex) {
502 if (Result)
503 Result = B.CreateAdd(Result, I->second.ConstIndex);
504 else
505 Result = I->second.ConstIndex;
506 }
507
508 if (!Result)
509 Result = B.getInt32(0);
510
511 I->second.Full = Result;
512 }
513
514 return I->second.Full;
515}
516
517static std::optional<GEPToVectorIndex>
519 Type *VecElemTy, const DataLayout &DL) {
520 // TODO: Extracting a "multiple of X" from a GEP might be a useful generic
521 // helper.
522 LLVMContext &Ctx = GEP->getContext();
523 unsigned BW = DL.getIndexTypeSizeInBits(GEP->getType());
525 APInt ConstOffset(BW, 0);
526
527 // Walk backwards through nested GEPs to collect both constant and variable
528 // offsets, so that nested vector GEP chains can be lowered in one step.
529 //
530 // Given this IR fragment as input:
531 //
532 // %0 = alloca [10 x <2 x i32>], align 8, addrspace(5)
533 // %1 = getelementptr [10 x <2 x i32>], ptr addrspace(5) %0, i32 0, i32 %j
534 // %2 = getelementptr i8, ptr addrspace(5) %1, i32 4
535 // %3 = load i32, ptr addrspace(5) %2, align 4
536 //
537 // Combine both GEP operations in a single pass, producing:
538 // BasePtr = %0
539 // ConstOffset = 4
540 // VarOffsets = { %j -> element_size(<2 x i32>) }
541 //
542 // That lets us emit a single buffer_load directly into a VGPR, without ever
543 // allocating scratch memory for the intermediate pointer.
544 Value *CurPtr = GEP;
545 while (auto *CurGEP = dyn_cast<GetElementPtrInst>(CurPtr)) {
546 if (!CurGEP->collectOffset(DL, BW, VarOffsets, ConstOffset))
547 return {};
548
549 // Move to the next outer pointer.
550 CurPtr = CurGEP->getPointerOperand();
551 }
552
553 assert(CurPtr == Alloca && "GEP not based on alloca");
554
555 int64_t VecElemSize = DL.getTypeAllocSize(VecElemTy);
556 if (VarOffsets.size() > 1)
557 return {};
558
559 // We support vector indices of the form ((VarIndex * stride) >> shift) + B.
560 // IndexQuot represents B. Check that the constant offset is a multiple
561 // of the vector element size.
562 if (ConstOffset.srem(VecElemSize) != 0)
563 return {};
564 APInt IndexQuot = ConstOffset.sdiv(VecElemSize);
565
566 GEPToVectorIndex Result;
567
568 if (!ConstOffset.isZero())
569 Result.ConstIndex = ConstantInt::get(Ctx, IndexQuot.sextOrTrunc(BW));
570
571 // If there are no variable offsets, only a constant offset, then we're done.
572 if (VarOffsets.empty())
573 return Result;
574
575 // Scale is the stride in the (A * stride) part. Check that there is only one
576 // variable offset and extract the scale factor.
577 const auto &VarOffset = VarOffsets.front();
578 auto ScaleOpt = VarOffset.second.tryZExtValue();
579 if (!ScaleOpt || *ScaleOpt == 0)
580 return {};
581
582 uint64_t Scale = *ScaleOpt;
583 Result.VarIndex = VarOffset.first;
584 auto *OffsetType = dyn_cast<IntegerType>(Result.VarIndex->getType());
585 if (!OffsetType)
586 return {};
587
588 // The vector index for the variable part is: VarIndex * Scale / VecElemSize.
589 if (Scale >= (uint64_t)VecElemSize) {
590 if (Scale % VecElemSize != 0)
591 return {};
592
593 // Scale is a multiple of VecElemSize, so the index is just: VarIndex *
594 // (Scale / VecElemSize).
595 uint64_t VarMul = Scale / VecElemSize;
596 // Only the multiplier is needed.
597 if (VarMul != 1)
598 Result.VarMul = ConstantInt::get(Ctx, APInt(BW, VarMul));
599 } else {
600 if ((uint64_t)VecElemSize % Scale != 0)
601 return {};
602
603 // VecElemSize is a multiple of Scale, so the index is just: VarIndex /
604 // (VecElemSize / Scale).
605 uint64_t Divisor = VecElemSize / Scale;
606 // The divisor must be a power of 2 so we can use a right shift.
607 if (!isPowerOf2_64(Divisor))
608 return {};
609
610 // VarIndex must be known to be divisible by that divisor.
611 KnownBits KB = computeKnownBits(VarOffset.first, DL);
612 if (KB.countMinTrailingZeros() < Log2_64(Divisor))
613 return {};
614
615 Result.VarShift = ConstantInt::get(Ctx, APInt(BW, Log2_64(Divisor)));
616 }
617
618 return Result;
619}
620
621/// Promotes a single user of the alloca to a vector form.
622///
623/// \param Inst Instruction to be promoted.
624/// \param DL Module Data Layout.
625/// \param AA Alloca Analysis.
626/// \param VecStoreSize Size of \p VectorTy in bytes.
627/// \param ElementSize Size of \p VectorTy element type in bytes.
628/// \param CurVal Current value of the vector (e.g. last stored value)
629/// \param[out] DeferredLoads \p Inst is added to this vector if it can't
630/// be promoted now. This happens when promoting requires \p
631/// CurVal, but \p CurVal is nullptr.
632/// \return the stored value if \p Inst would have written to the alloca, or
633/// nullptr otherwise.
635 AllocaAnalysis &AA,
636 unsigned VecStoreSize,
637 unsigned ElementSize,
638 function_ref<Value *()> GetCurVal) {
639 // Note: we use InstSimplifyFolder because it can leverage the DataLayout
640 // to do more folding, especially in the case of vector splats.
643 Builder.SetInsertPoint(Inst);
644
645 Type *VecEltTy = AA.Vector.Ty->getElementType();
646
647 switch (Inst->getOpcode()) {
648 case Instruction::Load: {
649 Value *CurVal = GetCurVal();
650 Value *Index =
652
653 // We're loading the full vector.
654 Type *AccessTy = Inst->getType();
655 TypeSize AccessSize = DL.getTypeStoreSize(AccessTy);
656 if (Constant *CI = dyn_cast<Constant>(Index)) {
657 if (CI->isNullValue() && AccessSize == VecStoreSize) {
658 Inst->replaceAllUsesWith(
659 Builder.CreateBitPreservingCastChain(DL, CurVal, AccessTy));
660 return nullptr;
661 }
662 }
663
664 // Loading a subvector.
665 if (isa<FixedVectorType>(AccessTy)) {
666 assert(AccessSize.isKnownMultipleOf(DL.getTypeStoreSize(VecEltTy)));
667 const unsigned NumLoadedElts = AccessSize / DL.getTypeStoreSize(VecEltTy);
668 auto *SubVecTy = FixedVectorType::get(VecEltTy, NumLoadedElts);
669 assert(DL.getTypeStoreSize(SubVecTy) == DL.getTypeStoreSize(AccessTy));
670
671 // If idx is dynamic, then sandwich load with bitcasts.
672 // ie. VectorTy SubVecTy AccessTy
673 // <64 x i8> -> <16 x i8> <8 x i16>
674 // <64 x i8> -> <4 x i128> -> i128 -> <8 x i16>
675 // Extracting subvector with dynamic index has very large expansion in
676 // the amdgpu backend. Limit to pow2.
677 FixedVectorType *VectorTy = AA.Vector.Ty;
678 TypeSize NumBits = DL.getTypeStoreSize(SubVecTy) * 8u;
679 uint64_t LoadAlign = cast<LoadInst>(Inst)->getAlign().value();
680 bool IsAlignedLoad = NumBits <= (LoadAlign * 8u);
681 unsigned TotalNumElts = VectorTy->getNumElements();
682 bool IsProperlyDivisible = TotalNumElts % NumLoadedElts == 0;
683 if (!isa<ConstantInt>(Index) &&
684 llvm::isPowerOf2_32(SubVecTy->getNumElements()) &&
685 IsProperlyDivisible && IsAlignedLoad) {
686 IntegerType *NewElemTy = Builder.getIntNTy(NumBits);
687 const unsigned NewNumElts =
688 DL.getTypeStoreSize(VectorTy) * 8u / NumBits;
689 const unsigned LShrAmt = llvm::Log2_32(SubVecTy->getNumElements());
690 FixedVectorType *BitCastTy =
691 FixedVectorType::get(NewElemTy, NewNumElts);
692 Value *BCVal = Builder.CreateBitCast(CurVal, BitCastTy);
693 Value *NewIdx = Builder.CreateLShr(
694 Index, ConstantInt::get(Index->getType(), LShrAmt));
695 Value *ExtVal = Builder.CreateExtractElement(BCVal, NewIdx);
696 Value *BCOut = Builder.CreateBitCast(ExtVal, AccessTy);
697 Inst->replaceAllUsesWith(BCOut);
698 return nullptr;
699 }
700
701 Value *SubVec = PoisonValue::get(SubVecTy);
702 for (unsigned K = 0; K < NumLoadedElts; ++K) {
703 Value *CurIdx =
704 Builder.CreateAdd(Index, ConstantInt::get(Index->getType(), K));
705 SubVec = Builder.CreateInsertElement(
706 SubVec, Builder.CreateExtractElement(CurVal, CurIdx), K);
707 }
708
709 Inst->replaceAllUsesWith(
710 Builder.CreateBitPreservingCastChain(DL, SubVec, AccessTy));
711 return nullptr;
712 }
713
714 // We're loading one element.
715 Value *ExtractElement = Builder.CreateExtractElement(CurVal, Index);
716 if (AccessTy != VecEltTy)
717 ExtractElement = Builder.CreateBitOrPointerCast(ExtractElement, AccessTy);
718
719 Inst->replaceAllUsesWith(ExtractElement);
720 return nullptr;
721 }
722 case Instruction::Store: {
723 // For stores, it's a bit trickier and it depends on whether we're storing
724 // the full vector or not. If we're storing the full vector, we don't need
725 // to know the current value. If this is a store of a single element, we
726 // need to know the value.
728 Value *Index = calculateVectorIndex(SI->getPointerOperand(), AA);
729 Value *Val = SI->getValueOperand();
730
731 // We're storing the full vector, we can handle this without knowing CurVal.
732 Type *AccessTy = Val->getType();
733 TypeSize AccessSize = DL.getTypeStoreSize(AccessTy);
734 if (Constant *CI = dyn_cast<Constant>(Index))
735 if (CI->isNullValue() && AccessSize == VecStoreSize)
736 return Builder.CreateBitPreservingCastChain(DL, Val, AA.Vector.Ty);
737
738 // Storing a subvector.
739 if (isa<FixedVectorType>(AccessTy)) {
740 assert(AccessSize.isKnownMultipleOf(DL.getTypeStoreSize(VecEltTy)));
741 const unsigned NumWrittenElts =
742 AccessSize / DL.getTypeStoreSize(VecEltTy);
743 const unsigned NumVecElts = AA.Vector.Ty->getNumElements();
744 auto *SubVecTy = FixedVectorType::get(VecEltTy, NumWrittenElts);
745 assert(DL.getTypeStoreSize(SubVecTy) == DL.getTypeStoreSize(AccessTy));
746
747 Val = Builder.CreateBitPreservingCastChain(DL, Val, SubVecTy);
748 Value *CurVec = GetCurVal();
749 for (unsigned K = 0, NumElts = std::min(NumWrittenElts, NumVecElts);
750 K < NumElts; ++K) {
751 Value *CurIdx =
752 Builder.CreateAdd(Index, ConstantInt::get(Index->getType(), K));
753 CurVec = Builder.CreateInsertElement(
754 CurVec, Builder.CreateExtractElement(Val, K), CurIdx);
755 }
756 return CurVec;
757 }
758
759 if (Val->getType() != VecEltTy)
760 Val = Builder.CreateBitOrPointerCast(Val, VecEltTy);
761 return Builder.CreateInsertElement(GetCurVal(), Val, Index);
762 }
763 case Instruction::Call: {
764 if (auto *MTI = dyn_cast<MemTransferInst>(Inst)) {
765 // For memcpy, we need to know curval.
766 ConstantInt *Length = cast<ConstantInt>(MTI->getLength());
767 unsigned NumCopied = Length->getZExtValue() / ElementSize;
768 MemTransferInfo *TI = &AA.Vector.TransferInfo[MTI];
769 unsigned SrcBegin = TI->SrcIndex->getZExtValue();
770 unsigned DestBegin = TI->DestIndex->getZExtValue();
771
772 SmallVector<int> Mask;
773 for (unsigned Idx = 0; Idx < AA.Vector.Ty->getNumElements(); ++Idx) {
774 if (Idx >= DestBegin && Idx < DestBegin + NumCopied) {
775 Mask.push_back(SrcBegin < AA.Vector.Ty->getNumElements()
776 ? SrcBegin++
778 } else {
779 Mask.push_back(Idx);
780 }
781 }
782
783 return Builder.CreateShuffleVector(GetCurVal(), Mask);
784 }
785
786 if (auto *MSI = dyn_cast<MemSetInst>(Inst)) {
787 // For memset, we don't need to know the previous value because we
788 // currently only allow memsets that cover the whole alloca.
789 Value *Elt = MSI->getOperand(1);
790 const unsigned BytesPerElt = DL.getTypeStoreSize(VecEltTy);
791 if (BytesPerElt > 1) {
792 Value *EltBytes = Builder.CreateVectorSplat(BytesPerElt, Elt);
793
794 // If the element type of the vector is a pointer, we need to first cast
795 // to an integer, then use a PtrCast.
796 if (VecEltTy->isPointerTy()) {
797 Type *PtrInt = Builder.getIntNTy(BytesPerElt * 8);
798 Elt = Builder.CreateBitCast(EltBytes, PtrInt);
799 Elt = Builder.CreateIntToPtr(Elt, VecEltTy);
800 } else
801 Elt = Builder.CreateBitCast(EltBytes, VecEltTy);
802 }
803
804 return Builder.CreateVectorSplat(AA.Vector.Ty->getElementCount(), Elt);
805 }
806
807 if (auto *Intr = dyn_cast<IntrinsicInst>(Inst)) {
808 if (Intr->getIntrinsicID() == Intrinsic::objectsize) {
809 Intr->replaceAllUsesWith(
810 Builder.getIntN(Intr->getType()->getIntegerBitWidth(),
811 DL.getTypeAllocSize(AA.Vector.Ty)));
812 return nullptr;
813 }
814 }
815
816 llvm_unreachable("Unsupported call when promoting alloca to vector");
817 }
818
819 default:
820 llvm_unreachable("Inconsistency in instructions promotable to vector");
821 }
822
823 llvm_unreachable("Did not return after promoting instruction!");
824}
825
826static bool isSupportedAccessType(FixedVectorType *VecTy, Type *AccessTy,
827 const DataLayout &DL) {
828 // Access as a vector type can work if the size of the access vector is a
829 // multiple of the size of the alloca's vector element type.
830 //
831 // Examples:
832 // - VecTy = <8 x float>, AccessTy = <4 x float> -> OK
833 // - VecTy = <4 x double>, AccessTy = <2 x float> -> OK
834 // - VecTy = <4 x double>, AccessTy = <3 x float> -> NOT OK
835 // - 3*32 is not a multiple of 64
836 //
837 // We could handle more complicated cases, but it'd make things a lot more
838 // complicated.
839 if (isa<FixedVectorType>(AccessTy)) {
840 TypeSize AccTS = DL.getTypeStoreSize(AccessTy);
841 // If the type size and the store size don't match, we would need to do more
842 // than just bitcast to translate between an extracted/insertable subvectors
843 // and the accessed value.
844 if (AccTS * 8 != DL.getTypeSizeInBits(AccessTy))
845 return false;
846 TypeSize VecTS = DL.getTypeStoreSize(VecTy->getElementType());
847 return AccTS.isKnownMultipleOf(VecTS);
848 }
849
851 DL);
852}
853
854/// Iterates over an instruction worklist that may contain multiple instructions
855/// from the same basic block, but in a different order.
856template <typename InstContainer>
857static void forEachWorkListItem(const InstContainer &WorkList,
858 std::function<void(Instruction *)> Fn) {
859 // Bucket up uses of the alloca by the block they occur in.
860 // This is important because we have to handle multiple defs/uses in a block
861 // ourselves: SSAUpdater is purely for cross-block references.
863 for (Instruction *User : WorkList)
864 UsesByBlock[User->getParent()].insert(User);
865
866 for (Instruction *User : WorkList) {
867 BasicBlock *BB = User->getParent();
868 auto &BlockUses = UsesByBlock[BB];
869
870 // Already processed, skip.
871 if (BlockUses.empty())
872 continue;
873
874 // Only user in the block, directly process it.
875 if (BlockUses.size() == 1) {
876 Fn(User);
877 continue;
878 }
879
880 // Multiple users in the block, do a linear scan to see users in order.
881 for (Instruction &Inst : *BB) {
882 if (!BlockUses.contains(&Inst))
883 continue;
884
885 Fn(&Inst);
886 }
887
888 // Clear the block so we know it's been processed.
889 BlockUses.clear();
890 }
891}
892
893/// Find an insert point after an alloca, after all other allocas clustered at
894/// the start of the block.
897 for (BasicBlock::iterator E = BB.end(); I != E && isa<AllocaInst>(*I); ++I)
898 ;
899 return I;
900}
901
903AMDGPUPromoteAllocaImpl::getVectorTypeForAlloca(Type *AllocaTy) const {
904 if (DisablePromoteAllocaToVector) {
905 LLVM_DEBUG(dbgs() << " Promote alloca to vectors is disabled\n");
906 return nullptr;
907 }
908
909 auto *VectorTy = dyn_cast<FixedVectorType>(AllocaTy);
910 if (auto *ArrayTy = dyn_cast<ArrayType>(AllocaTy)) {
911 uint64_t NumElems = 1;
912 Type *ElemTy;
913 do {
914 NumElems *= ArrayTy->getNumElements();
915 ElemTy = ArrayTy->getElementType();
916 } while ((ArrayTy = dyn_cast<ArrayType>(ElemTy)));
917
918 // Check for array of vectors
919 auto *InnerVectorTy = dyn_cast<FixedVectorType>(ElemTy);
920 if (InnerVectorTy) {
921 NumElems *= InnerVectorTy->getNumElements();
922 ElemTy = InnerVectorTy->getElementType();
923 }
924
925 if (VectorType::isValidElementType(ElemTy) && NumElems > 0) {
926 unsigned ElementSize = DL->getTypeSizeInBits(ElemTy) / 8;
927 if (ElementSize > 0) {
928 unsigned AllocaSize = DL->getTypeStoreSize(AllocaTy);
929 // Expand vector if required to match padding of inner type,
930 // i.e. odd size subvectors.
931 // Storage size of new vector must match that of alloca for correct
932 // behaviour of byte offsets and GEP computation.
933 if (NumElems * ElementSize != AllocaSize)
934 NumElems = AllocaSize / ElementSize;
935 if (NumElems > 0 && (AllocaSize % ElementSize) == 0)
936 VectorTy = FixedVectorType::get(ElemTy, NumElems);
937 }
938 }
939 }
940 if (!VectorTy) {
941 LLVM_DEBUG(dbgs() << " Cannot convert type to vector\n");
942 return nullptr;
943 }
944
945 const unsigned MaxElements =
946 (MaxVectorRegs * 32) / DL->getTypeSizeInBits(VectorTy->getElementType());
947
948 if (VectorTy->getNumElements() > MaxElements ||
949 VectorTy->getNumElements() < 2) {
950 LLVM_DEBUG(dbgs() << " " << *VectorTy
951 << " has an unsupported number of elements\n");
952 return nullptr;
953 }
954
955 Type *VecEltTy = VectorTy->getElementType();
956 unsigned ElementSizeInBits = DL->getTypeSizeInBits(VecEltTy);
957 if (ElementSizeInBits != DL->getTypeAllocSizeInBits(VecEltTy)) {
958 LLVM_DEBUG(dbgs() << " Cannot convert to vector if the allocation size "
959 "does not match the type's size\n");
960 return nullptr;
961 }
962
963 return VectorTy;
964}
965
966void AMDGPUPromoteAllocaImpl::analyzePromoteToVector(AllocaAnalysis &AA) const {
967 if (AA.HaveSelectOrPHI) {
968 LLVM_DEBUG(dbgs() << " Cannot convert to vector due to select or phi\n");
969 return;
970 }
971
972 Type *AllocaTy = AA.Alloca->getAllocatedType();
973 AA.Vector.Ty = getVectorTypeForAlloca(AllocaTy);
974 if (!AA.Vector.Ty)
975 return;
976
977 const auto RejectUser = [&](Instruction *Inst, Twine Msg) {
978 LLVM_DEBUG(dbgs() << " Cannot promote alloca to vector: " << Msg << "\n"
979 << " " << *Inst << "\n");
980 AA.Vector.Ty = nullptr;
981 };
982
983 Type *VecEltTy = AA.Vector.Ty->getElementType();
984 unsigned ElementSize = DL->getTypeSizeInBits(VecEltTy) / 8;
985 assert(ElementSize > 0);
986 for (auto *U : AA.Uses) {
987 Instruction *Inst = cast<Instruction>(U->getUser());
988
989 if (Value *Ptr = getLoadStorePointerOperand(Inst)) {
990 assert(!isa<StoreInst>(Inst) ||
991 U->getOperandNo() == StoreInst::getPointerOperandIndex());
992
993 Type *AccessTy = getLoadStoreType(Inst);
994 if (AccessTy->isAggregateType())
995 return RejectUser(Inst, "unsupported load/store as aggregate");
996 assert(!AccessTy->isAggregateType() || AccessTy->isArrayTy());
997
998 // Check that this is a simple access of a vector element.
999 bool IsSimple = isa<LoadInst>(Inst) ? cast<LoadInst>(Inst)->isSimple()
1000 : cast<StoreInst>(Inst)->isSimple();
1001 if (!IsSimple)
1002 return RejectUser(Inst, "not a simple load or store");
1003
1004 Ptr = Ptr->stripPointerCasts();
1005
1006 // Alloca already accessed as vector.
1007 if (Ptr == AA.Alloca &&
1008 DL->getTypeStoreSize(AA.Alloca->getAllocatedType()) ==
1009 DL->getTypeStoreSize(AccessTy)) {
1010 AA.Vector.Worklist.push_back(Inst);
1011 continue;
1012 }
1013
1014 if (!isSupportedAccessType(AA.Vector.Ty, AccessTy, *DL))
1015 return RejectUser(Inst, "not a supported access type");
1016
1017 AA.Vector.Worklist.push_back(Inst);
1018 continue;
1019 }
1020
1021 if (auto *GEP = dyn_cast<GetElementPtrInst>(Inst)) {
1022 // If we can't compute a vector index from this GEP, then we can't
1023 // promote this alloca to vector.
1024 auto Index = computeGEPToVectorIndex(GEP, AA.Alloca, VecEltTy, *DL);
1025 if (!Index)
1026 return RejectUser(Inst, "cannot compute vector index for GEP");
1027
1028 AA.Vector.GEPVectorIdx[GEP] = std::move(Index.value());
1029 AA.Vector.UsersToRemove.push_back(Inst);
1030 continue;
1031 }
1032
1033 if (MemSetInst *MSI = dyn_cast<MemSetInst>(Inst);
1034 MSI && isSupportedMemset(MSI, AA.Alloca, *DL)) {
1035 AA.Vector.Worklist.push_back(Inst);
1036 continue;
1037 }
1038
1039 if (MemTransferInst *TransferInst = dyn_cast<MemTransferInst>(Inst)) {
1040 if (TransferInst->isVolatile())
1041 return RejectUser(Inst, "mem transfer inst is volatile");
1042
1043 ConstantInt *Len = dyn_cast<ConstantInt>(TransferInst->getLength());
1044 if (!Len || (Len->getZExtValue() % ElementSize))
1045 return RejectUser(Inst, "mem transfer inst length is non-constant or "
1046 "not a multiple of the vector element size");
1047
1048 auto getConstIndexIntoAlloca = [&](Value *Ptr) -> ConstantInt * {
1049 if (Ptr == AA.Alloca)
1050 return ConstantInt::get(Ptr->getContext(), APInt(32, 0));
1051
1053 const auto &GEPI = AA.Vector.GEPVectorIdx.find(GEP)->second;
1054 if (GEPI.VarIndex)
1055 return nullptr;
1056 if (GEPI.ConstIndex)
1057 return GEPI.ConstIndex;
1058 return ConstantInt::get(Ptr->getContext(), APInt(32, 0));
1059 };
1060
1061 MemTransferInfo *TI =
1062 &AA.Vector.TransferInfo.try_emplace(TransferInst).first->second;
1063 unsigned OpNum = U->getOperandNo();
1064 if (OpNum == 0) {
1065 Value *Dest = TransferInst->getDest();
1066 ConstantInt *Index = getConstIndexIntoAlloca(Dest);
1067 if (!Index)
1068 return RejectUser(Inst, "could not calculate constant dest index");
1069 TI->DestIndex = Index;
1070 } else {
1071 assert(OpNum == 1);
1072 Value *Src = TransferInst->getSource();
1073 ConstantInt *Index = getConstIndexIntoAlloca(Src);
1074 if (!Index)
1075 return RejectUser(Inst, "could not calculate constant src index");
1076 TI->SrcIndex = Index;
1077 }
1078 continue;
1079 }
1080
1081 if (auto *Intr = dyn_cast<IntrinsicInst>(Inst)) {
1082 if (Intr->getIntrinsicID() == Intrinsic::objectsize) {
1083 AA.Vector.Worklist.push_back(Inst);
1084 continue;
1085 }
1086 }
1087
1088 // Ignore assume-like intrinsics and comparisons used in assumes.
1089 if (isAssumeLikeIntrinsic(Inst)) {
1090 if (!Inst->use_empty())
1091 return RejectUser(Inst, "assume-like intrinsic cannot have any users");
1092 AA.Vector.UsersToRemove.push_back(Inst);
1093 continue;
1094 }
1095
1096 if (isa<ICmpInst>(Inst) && all_of(Inst->users(), [](User *U) {
1097 return isAssumeLikeIntrinsic(cast<Instruction>(U));
1098 })) {
1099 AA.Vector.UsersToRemove.push_back(Inst);
1100 continue;
1101 }
1102
1103 return RejectUser(Inst, "unhandled alloca user");
1104 }
1105
1106 // Follow-up check to ensure we've seen both sides of all transfer insts.
1107 for (const auto &Entry : AA.Vector.TransferInfo) {
1108 const MemTransferInfo &TI = Entry.second;
1109 if (!TI.SrcIndex || !TI.DestIndex)
1110 return RejectUser(Entry.first,
1111 "mem transfer inst between different objects");
1112 AA.Vector.Worklist.push_back(Entry.first);
1113 }
1114}
1115
1116void AMDGPUPromoteAllocaImpl::promoteAllocaToVector(AllocaAnalysis &AA) {
1117 LLVM_DEBUG(dbgs() << "Promoting to vectors: " << *AA.Alloca << '\n');
1118 LLVM_DEBUG(dbgs() << " type conversion: " << *AA.Alloca->getAllocatedType()
1119 << " -> " << *AA.Vector.Ty << '\n');
1120 const unsigned VecStoreSize = DL->getTypeStoreSize(AA.Vector.Ty);
1121
1122 Type *VecEltTy = AA.Vector.Ty->getElementType();
1123 const unsigned ElementSize = DL->getTypeSizeInBits(VecEltTy) / 8;
1124
1125 // Alloca is uninitialized memory. Imitate that by making the first value
1126 // undef.
1127 SSAUpdater Updater;
1128 Updater.Initialize(AA.Vector.Ty, "promotealloca");
1129
1130 BasicBlock *EntryBB = AA.Alloca->getParent();
1131 BasicBlock::iterator InitInsertPos =
1132 skipToNonAllocaInsertPt(*EntryBB, AA.Alloca->getIterator());
1133 IRBuilder<> Builder(&*InitInsertPos);
1134 Value *AllocaInitValue = Builder.CreateFreeze(PoisonValue::get(AA.Vector.Ty));
1135 AllocaInitValue->takeName(AA.Alloca);
1136
1137 Updater.AddAvailableValue(AA.Alloca->getParent(), AllocaInitValue);
1138
1139 // First handle the initial worklist, in basic block order.
1140 //
1141 // Insert a placeholder whenever we need the vector value at the top of a
1142 // basic block.
1144 forEachWorkListItem(AA.Vector.Worklist, [&](Instruction *I) {
1145 BasicBlock *BB = I->getParent();
1146 auto GetCurVal = [&]() -> Value * {
1147 if (Value *CurVal = Updater.FindValueForBlock(BB))
1148 return CurVal;
1149
1150 if (!Placeholders.empty() && Placeholders.back()->getParent() == BB)
1151 return Placeholders.back();
1152
1153 // If the current value in the basic block is not yet known, insert a
1154 // placeholder that we will replace later.
1155 IRBuilder<> Builder(I);
1156 auto *Placeholder = cast<Instruction>(Builder.CreateFreeze(
1157 PoisonValue::get(AA.Vector.Ty), "promotealloca.placeholder"));
1158 Placeholders.insert(Placeholder);
1159 return Placeholders.back();
1160 };
1161
1162 Value *Result = promoteAllocaUserToVector(I, *DL, AA, VecStoreSize,
1163 ElementSize, GetCurVal);
1164 // If the returned result is a placeholder, it means the instruction does
1165 // not really modify the alloca. So no need to make it being available value
1166 // to SSAUpdater.
1167 // This will stop placeholder being cached in SSAUpdater. The cached
1168 // placeholder may cause stale pointer being referenced when doing
1169 // placeholder replacement.
1170 if (Result && (!isa<Instruction>(Result) ||
1171 !Placeholders.contains(cast<Instruction>(Result))))
1172 Updater.AddAvailableValue(BB, Result);
1173 });
1174
1175 // Now fixup the placeholders.
1176 for (Instruction *Placeholder : Placeholders) {
1177 Placeholder->replaceAllUsesWith(
1178 Updater.GetValueInMiddleOfBlock(Placeholder->getParent()));
1179 Placeholder->eraseFromParent();
1180 }
1181
1182 // Delete all instructions.
1183 for (Instruction *I : AA.Vector.Worklist) {
1184 assert(I->use_empty());
1185 I->eraseFromParent();
1186 }
1187
1188 // Delete all the users that are known to be removeable.
1189 for (Instruction *I : reverse(AA.Vector.UsersToRemove)) {
1190 I->dropDroppableUses();
1191 assert(I->use_empty());
1192 I->eraseFromParent();
1193 }
1194
1195 // Alloca should now be dead too.
1196 assert(AA.Alloca->use_empty());
1197 AA.Alloca->eraseFromParent();
1198}
1199
1200std::pair<Value *, Value *>
1201AMDGPUPromoteAllocaImpl::getLocalSizeYZ(IRBuilder<> &Builder) {
1202 Function &F = *Builder.GetInsertBlock()->getParent();
1204
1205 if (!IsAMDHSA) {
1206 CallInst *LocalSizeY =
1207 Builder.CreateIntrinsic(Intrinsic::r600_read_local_size_y, {});
1208 CallInst *LocalSizeZ =
1209 Builder.CreateIntrinsic(Intrinsic::r600_read_local_size_z, {});
1210
1211 ST.makeLIDRangeMetadata(LocalSizeY);
1212 ST.makeLIDRangeMetadata(LocalSizeZ);
1213
1214 return std::pair(LocalSizeY, LocalSizeZ);
1215 }
1216
1217 // We must read the size out of the dispatch pointer.
1218 assert(IsAMDGCN);
1219
1220 // We are indexing into this struct, and want to extract the workgroup_size_*
1221 // fields.
1222 //
1223 // typedef struct hsa_kernel_dispatch_packet_s {
1224 // uint16_t header;
1225 // uint16_t setup;
1226 // uint16_t workgroup_size_x ;
1227 // uint16_t workgroup_size_y;
1228 // uint16_t workgroup_size_z;
1229 // uint16_t reserved0;
1230 // uint32_t grid_size_x ;
1231 // uint32_t grid_size_y ;
1232 // uint32_t grid_size_z;
1233 //
1234 // uint32_t private_segment_size;
1235 // uint32_t group_segment_size;
1236 // uint64_t kernel_object;
1237 //
1238 // #ifdef HSA_LARGE_MODEL
1239 // void *kernarg_address;
1240 // #elif defined HSA_LITTLE_ENDIAN
1241 // void *kernarg_address;
1242 // uint32_t reserved1;
1243 // #else
1244 // uint32_t reserved1;
1245 // void *kernarg_address;
1246 // #endif
1247 // uint64_t reserved2;
1248 // hsa_signal_t completion_signal; // uint64_t wrapper
1249 // } hsa_kernel_dispatch_packet_t
1250 //
1251 CallInst *DispatchPtr =
1252 Builder.CreateIntrinsic(Intrinsic::amdgcn_dispatch_ptr, {});
1253 DispatchPtr->addRetAttr(Attribute::NoAlias);
1254 DispatchPtr->addRetAttr(Attribute::NonNull);
1255 F.removeFnAttr("amdgpu-no-dispatch-ptr");
1256
1257 // Size of the dispatch packet struct.
1258 DispatchPtr->addDereferenceableRetAttr(64);
1259
1260 Type *I32Ty = Type::getInt32Ty(Mod->getContext());
1261
1262 // We could do a single 64-bit load here, but it's likely that the basic
1263 // 32-bit and extract sequence is already present, and it is probably easier
1264 // to CSE this. The loads should be mergeable later anyway.
1265 Value *GEPXY = Builder.CreateConstInBoundsGEP1_64(I32Ty, DispatchPtr, 1);
1266 LoadInst *LoadXY = Builder.CreateAlignedLoad(I32Ty, GEPXY, Align(4));
1267
1268 Value *GEPZU = Builder.CreateConstInBoundsGEP1_64(I32Ty, DispatchPtr, 2);
1269 LoadInst *LoadZU = Builder.CreateAlignedLoad(I32Ty, GEPZU, Align(4));
1270
1271 MDNode *MD = MDNode::get(Mod->getContext(), {});
1272 LoadXY->setMetadata(LLVMContext::MD_invariant_load, MD);
1273 LoadZU->setMetadata(LLVMContext::MD_invariant_load, MD);
1274 ST.makeLIDRangeMetadata(LoadZU);
1275
1276 // Extract y component. Upper half of LoadZU should be zero already.
1277 Value *Y = Builder.CreateLShr(LoadXY, 16);
1278
1279 return std::pair(Y, LoadZU);
1280}
1281
1282Value *AMDGPUPromoteAllocaImpl::getWorkitemID(IRBuilder<> &Builder,
1283 unsigned N) {
1284 Function *F = Builder.GetInsertBlock()->getParent();
1287 StringRef AttrName;
1288
1289 switch (N) {
1290 case 0:
1291 IntrID = IsAMDGCN ? (Intrinsic::ID)Intrinsic::amdgcn_workitem_id_x
1292 : (Intrinsic::ID)Intrinsic::r600_read_tidig_x;
1293 AttrName = "amdgpu-no-workitem-id-x";
1294 break;
1295 case 1:
1296 IntrID = IsAMDGCN ? (Intrinsic::ID)Intrinsic::amdgcn_workitem_id_y
1297 : (Intrinsic::ID)Intrinsic::r600_read_tidig_y;
1298 AttrName = "amdgpu-no-workitem-id-y";
1299 break;
1300
1301 case 2:
1302 IntrID = IsAMDGCN ? (Intrinsic::ID)Intrinsic::amdgcn_workitem_id_z
1303 : (Intrinsic::ID)Intrinsic::r600_read_tidig_z;
1304 AttrName = "amdgpu-no-workitem-id-z";
1305 break;
1306 default:
1307 llvm_unreachable("invalid dimension");
1308 }
1309
1310 Function *WorkitemIdFn = Intrinsic::getOrInsertDeclaration(Mod, IntrID);
1311 CallInst *CI = Builder.CreateCall(WorkitemIdFn);
1312 ST.makeLIDRangeMetadata(CI);
1313 F->removeFnAttr(AttrName);
1314
1315 return CI;
1316}
1317
1318static bool isCallPromotable(CallInst *CI) {
1320 if (!II)
1321 return false;
1322
1323 switch (II->getIntrinsicID()) {
1324 case Intrinsic::memcpy:
1325 case Intrinsic::memmove:
1326 case Intrinsic::memset:
1327 case Intrinsic::lifetime_start:
1328 case Intrinsic::lifetime_end:
1329 case Intrinsic::invariant_start:
1330 case Intrinsic::invariant_end:
1331 case Intrinsic::launder_invariant_group:
1332 case Intrinsic::strip_invariant_group:
1333 case Intrinsic::objectsize:
1334 return true;
1335 default:
1336 return false;
1337 }
1338}
1339
1340bool AMDGPUPromoteAllocaImpl::binaryOpIsDerivedFromSameAlloca(
1341 Value *BaseAlloca, Value *Val, Instruction *Inst, int OpIdx0,
1342 int OpIdx1) const {
1343 // Figure out which operand is the one we might not be promoting.
1344 Value *OtherOp = Inst->getOperand(OpIdx0);
1345 if (Val == OtherOp)
1346 OtherOp = Inst->getOperand(OpIdx1);
1347
1349 return true;
1350
1351 // TODO: getUnderlyingObject will not work on a vector getelementptr
1352 Value *OtherObj = getUnderlyingObject(OtherOp);
1353 if (!isa<AllocaInst>(OtherObj))
1354 return false;
1355
1356 // TODO: We should be able to replace undefs with the right pointer type.
1357
1358 // TODO: If we know the other base object is another promotable
1359 // alloca, not necessarily this alloca, we can do this. The
1360 // important part is both must have the same address space at
1361 // the end.
1362 if (OtherObj != BaseAlloca) {
1363 LLVM_DEBUG(
1364 dbgs() << "Found a binary instruction with another alloca object\n");
1365 return false;
1366 }
1367
1368 return true;
1369}
1370
// Decide whether the alloca's users permit promotion to LDS (local memory).
// On success sets AA.LDS.Enable and records the users that will need
// rewriting in AA.LDS.Worklist; returns early (leaving Enable false) on the
// first unsupported use.
void AMDGPUPromoteAllocaImpl::analyzePromoteToLDS(AllocaAnalysis &AA) const {
  if (DisablePromoteAllocaToLDS) {
    LLVM_DEBUG(dbgs() << " Promote alloca to LDS is disabled\n");
    return;
  }

  // Don't promote the alloca to LDS for shader calling conventions as the work
  // item ID intrinsics are not supported for these calling conventions.
  // Furthermore not all LDS is available for some of the stages.
  const Function &ContainingFunction = *AA.Alloca->getFunction();
  CallingConv::ID CC = ContainingFunction.getCallingConv();

  switch (CC) {
  // NOTE(review): the accepted case labels (presumably the compute-kernel
  // calling conventions, e.g. AMDGPU_KERNEL / SPIR_KERNEL) were dropped by
  // the extraction that produced this file -- verify against upstream.
    break;
  default:
    LLVM_DEBUG(
        dbgs()
        << " promote alloca to LDS not supported with calling convention.\n");
    return;
  }

  for (Use *Use : AA.Uses) {
    auto *User = Use->getUser();

    // Calls are only tolerated when they are promotable intrinsics
    // (memcpy/memset/lifetime markers/etc., see isCallPromotable).
    if (CallInst *CI = dyn_cast<CallInst>(User)) {
      if (!isCallPromotable(CI))
        return;

      if (find(AA.LDS.Worklist, User) == AA.LDS.Worklist.end())
        AA.LDS.Worklist.push_back(User);
      continue;
    }

    // NOTE(review): the cast of User to an Instruction (UseInst) was dropped
    // by extraction here.
    // Converting the address to an integer defeats pointer rewriting.
    if (UseInst->getOpcode() == Instruction::PtrToInt)
      return;

    // Plain non-volatile memory accesses need no rewriting beyond the
    // alloca's own type mutation, so they are not put on the worklist.
    if (LoadInst *LI = dyn_cast<LoadInst>(UseInst)) {
      if (LI->isVolatile())
        return;
      continue;
    }

    if (StoreInst *SI = dyn_cast<StoreInst>(UseInst)) {
      if (SI->isVolatile())
        return;
      continue;
    }

    if (AtomicRMWInst *RMW = dyn_cast<AtomicRMWInst>(UseInst)) {
      if (RMW->isVolatile())
        return;
      continue;
    }

    if (AtomicCmpXchgInst *CAS = dyn_cast<AtomicCmpXchgInst>(UseInst)) {
      if (CAS->isVolatile())
        return;
      continue;
    }

    // Only promote a select if we know that the other select operand
    // is from another pointer that will also be promoted.
    if (ICmpInst *ICmp = dyn_cast<ICmpInst>(UseInst)) {
      if (!binaryOpIsDerivedFromSameAlloca(AA.Alloca, Use->get(), ICmp, 0, 1))
        return;

      // May need to rewrite constant operands.
      if (find(AA.LDS.Worklist, User) == AA.LDS.Worklist.end())
        AA.LDS.Worklist.push_back(ICmp);
      continue;
    }

    // NOTE(review): the GEP dispatch line (likely
    // `if (auto *GEP = dyn_cast<GetElementPtrInst>(UseInst)) {`) was dropped
    // by extraction here.
      // Be conservative if an address could be computed outside the bounds of
      // the alloca.
      if (!GEP->isInBounds())
        return;
    // NOTE(review): an `else if`/brace line was dropped by extraction here;
    // without it the unconditional `return` below would make the worklist
    // push unreachable -- verify against upstream.
      // Do not promote vector/aggregate type instructions. It is hard to track
      // their users.

      // Do not promote addrspacecast.
      //
      // TODO: If we know the address is only observed through flat pointers, we
      // could still promote.
      return;
    }

    if (find(AA.LDS.Worklist, User) == AA.LDS.Worklist.end())
      AA.LDS.Worklist.push_back(User);
  }

  AA.LDS.Enable = true;
}
1468
1469bool AMDGPUPromoteAllocaImpl::hasSufficientLocalMem(const Function &F) {
1470
1471 FunctionType *FTy = F.getFunctionType();
1473
1474 // If the function has any arguments in the local address space, then it's
1475 // possible these arguments require the entire local memory space, so
1476 // we cannot use local memory in the pass.
1477 for (Type *ParamTy : FTy->params()) {
1478 PointerType *PtrTy = dyn_cast<PointerType>(ParamTy);
1479 if (PtrTy && PtrTy->getAddressSpace() == AMDGPUAS::LOCAL_ADDRESS) {
1480 LocalMemLimit = 0;
1481 LLVM_DEBUG(dbgs() << "Function has local memory argument. Promoting to "
1482 "local memory disabled.\n");
1483 return false;
1484 }
1485 }
1486
1487 LocalMemLimit = ST.getAddressableLocalMemorySize();
1488 if (LocalMemLimit == 0)
1489 return false;
1490
1492 SmallPtrSet<const Constant *, 8> VisitedConstants;
1494
1495 auto visitUsers = [&](const GlobalVariable *GV, const Constant *Val) -> bool {
1496 for (const User *U : Val->users()) {
1497 if (const Instruction *Use = dyn_cast<Instruction>(U)) {
1498 if (Use->getFunction() == &F)
1499 return true;
1500 } else {
1501 const Constant *C = cast<Constant>(U);
1502 if (VisitedConstants.insert(C).second)
1503 Stack.push_back(C);
1504 }
1505 }
1506
1507 return false;
1508 };
1509
1510 for (GlobalVariable &GV : Mod->globals()) {
1512 continue;
1513
1514 if (visitUsers(&GV, &GV)) {
1515 UsedLDS.insert(&GV);
1516 Stack.clear();
1517 continue;
1518 }
1519
1520 // For any ConstantExpr uses, we need to recursively search the users until
1521 // we see a function.
1522 while (!Stack.empty()) {
1523 const Constant *C = Stack.pop_back_val();
1524 if (visitUsers(&GV, C)) {
1525 UsedLDS.insert(&GV);
1526 Stack.clear();
1527 break;
1528 }
1529 }
1530 }
1531
1532 const DataLayout &DL = Mod->getDataLayout();
1533 SmallVector<std::pair<uint64_t, Align>, 16> AllocatedSizes;
1534 AllocatedSizes.reserve(UsedLDS.size());
1535
1536 for (const GlobalVariable *GV : UsedLDS) {
1537 Align Alignment =
1538 DL.getValueOrABITypeAlignment(GV->getAlign(), GV->getValueType());
1539 uint64_t AllocSize = GV->getGlobalSize(DL);
1540
1541 // HIP uses an extern unsized array in local address space for dynamically
1542 // allocated shared memory. In that case, we have to disable the promotion.
1543 if (GV->hasExternalLinkage() && AllocSize == 0) {
1544 LocalMemLimit = 0;
1545 LLVM_DEBUG(dbgs() << "Function has a reference to externally allocated "
1546 "local memory. Promoting to local memory "
1547 "disabled.\n");
1548 return false;
1549 }
1550
1551 AllocatedSizes.emplace_back(AllocSize, Alignment);
1552 }
1553
1554 // Sort to try to estimate the worst case alignment padding
1555 //
1556 // FIXME: We should really do something to fix the addresses to a more optimal
1557 // value instead
1558 llvm::sort(AllocatedSizes, llvm::less_second());
1559
1560 // Check how much local memory is being used by global objects
1561 CurrentLocalMemUsage = 0;
1562
1563 // FIXME: Try to account for padding here. The real padding and address is
1564 // currently determined from the inverse order of uses in the function when
1565 // legalizing, which could also potentially change. We try to estimate the
1566 // worst case here, but we probably should fix the addresses earlier.
1567 for (auto Alloc : AllocatedSizes) {
1568 CurrentLocalMemUsage = alignTo(CurrentLocalMemUsage, Alloc.second);
1569 CurrentLocalMemUsage += Alloc.first;
1570 }
1571
1572 unsigned MaxOccupancy =
1573 ST.getWavesPerEU(ST.getFlatWorkGroupSizes(F), CurrentLocalMemUsage, F)
1574 .second;
1575
1576 // Round up to the next tier of usage.
1577 unsigned MaxSizeWithWaveCount =
1578 ST.getMaxLocalMemSizeWithWaveCount(MaxOccupancy, F);
1579
1580 // Program may already use more LDS than is usable at maximum occupancy.
1581 if (CurrentLocalMemUsage > MaxSizeWithWaveCount)
1582 return false;
1583
1584 LocalMemLimit = MaxSizeWithWaveCount;
1585
1586 LLVM_DEBUG(dbgs() << F.getName() << " uses " << CurrentLocalMemUsage
1587 << " bytes of LDS\n"
1588 << " Rounding size to " << MaxSizeWithWaveCount
1589 << " with a maximum occupancy of " << MaxOccupancy << '\n'
1590 << " and " << (LocalMemLimit - CurrentLocalMemUsage)
1591 << " available for promotion\n");
1592
1593 return true;
1594}
1595
// FIXME: Should try to pick the most likely to be profitable allocas first.
//
// Rewrite the alloca as one per-workitem slice of a module-level LDS array
// sized WorkGroupSize * alloca-size, indexed by the flattened workitem id.
// Returns false (leaving CurrentLocalMemUsage untouched) when the LDS budget
// would be exceeded. memcpy/memmove users are pushed to DeferredIntrs since
// both of their pointer operands may still need rewriting.
bool AMDGPUPromoteAllocaImpl::tryPromoteAllocaToLDS(
    AllocaAnalysis &AA, bool SufficientLDS,
    SetVector<IntrinsicInst *> &DeferredIntrs) {
  LLVM_DEBUG(dbgs() << "Trying to promote to LDS: " << *AA.Alloca << '\n');

  // Not likely to have sufficient local memory for promotion.
  if (!SufficientLDS)
    return false;

  const DataLayout &DL = Mod->getDataLayout();
  IRBuilder<> Builder(AA.Alloca);

  const Function &ContainingFunction = *AA.Alloca->getParent()->getParent();
  const AMDGPUSubtarget &ST = AMDGPUSubtarget::get(TM, ContainingFunction);
  // Worst case: every lane of the largest permitted workgroup needs a slice.
  unsigned WorkGroupSize = ST.getFlatWorkGroupSizes(ContainingFunction).second;

  Align Alignment = AA.Alloca->getAlign();

  // FIXME: This computed padding is likely wrong since it depends on inverse
  // usage order.
  //
  // FIXME: It is also possible that if we're allowed to use all of the memory
  // could end up using more than the maximum due to alignment padding.

  uint32_t NewSize = alignTo(CurrentLocalMemUsage, Alignment);
  std::optional<TypeSize> ElemSize = AA.Alloca->getAllocationSize(DL);
  // Unknown or scalable alloca sizes cannot be given a fixed LDS slice.
  if (!ElemSize || ElemSize->isScalable())
    return false;
  TypeSize AllocSize = WorkGroupSize * *ElemSize;
  NewSize += AllocSize.getFixedValue();

  if (NewSize > LocalMemLimit) {
    LLVM_DEBUG(dbgs() << " " << AllocSize
                      << " bytes of local memory not available to promote\n");
    return false;
  }

  CurrentLocalMemUsage = NewSize;

  LLVM_DEBUG(dbgs() << "Promoting alloca to local memory\n");

  Function *F = AA.Alloca->getFunction();

  Type *GVTy = ArrayType::get(AA.Alloca->getAllocatedType(), WorkGroupSize);
  // NOTE(review): the GlobalVariable construction lines were dropped by the
  // extraction that produced this file (presumably `new GlobalVariable(*Mod,
  // GVTy, ...)` in the local address space) -- verify against upstream.
      Twine(F->getName()) + Twine('.') + AA.Alloca->getName(), nullptr,
  // NOTE(review): trailing GlobalVariable constructor arguments (and likely
  // a setUnnamedAddr call) were dropped by extraction here.
  GV->setAlignment(AA.Alloca->getAlign());

  Value *TCntY, *TCntZ;

  // Flattened workitem id: tid = tidX*cntY*cntZ + tidY*cntZ + tidZ.
  std::tie(TCntY, TCntZ) = getLocalSizeYZ(Builder);
  Value *TIdX = getWorkitemID(Builder, 0);
  Value *TIdY = getWorkitemID(Builder, 1);
  Value *TIdZ = getWorkitemID(Builder, 2);

  Value *Tmp0 = Builder.CreateMul(TCntY, TCntZ, "", true, true);
  Tmp0 = Builder.CreateMul(Tmp0, TIdX);
  Value *Tmp1 = Builder.CreateMul(TIdY, TCntZ, "", true, true);
  Value *TID = Builder.CreateAdd(Tmp0, Tmp1);
  TID = Builder.CreateAdd(TID, TIdZ);

  LLVMContext &Context = Mod->getContext();
  // NOTE(review): the GEP index array declaration (Indices, indexing the LDS
  // array by TID) was dropped by extraction here.

  Value *Offset = Builder.CreateInBoundsGEP(GVTy, GV, Indices);
  // Redirect every user of the alloca to its per-workitem LDS slice.
  AA.Alloca->mutateType(Offset->getType());
  AA.Alloca->replaceAllUsesWith(Offset);
  AA.Alloca->eraseFromParent();

  // NOTE(review): the declaration of NewPtrTy (the LDS pointer type used for
  // the operand rewrites below) was dropped by extraction here.

  for (Value *V : AA.LDS.Worklist) {
    // NOTE(review): the declaration of Call (dyn_cast<CallInst>(V)) was
    // dropped by extraction here.
    if (!Call) {
      if (ICmpInst *CI = dyn_cast<ICmpInst>(V)) {
        Value *LHS = CI->getOperand(0);
        Value *RHS = CI->getOperand(1);

        // Rewrite constant-null compare operands to the new address space.
        Type *NewTy = LHS->getType()->getWithNewType(NewPtrTy);
        // NOTE(review): the guard conditions (null-constant checks) around
        // the two setOperand calls were dropped by extraction.
        CI->setOperand(0, Constant::getNullValue(NewTy));

        CI->setOperand(1, Constant::getNullValue(NewTy));

        continue;
      }

      // The operand's value should be corrected on its own and we don't want to
      // touch the users.
      // NOTE(review): the condition guarding this `continue` was dropped by
      // extraction.
        continue;

      assert(V->getType()->isPtrOrPtrVectorTy());

      Type *NewTy = V->getType()->getWithNewType(NewPtrTy);
      V->mutateType(NewTy);

      // Adjust the types of any constant operands.
      // NOTE(review): the select dispatch (dyn_cast<SelectInst>) and its
      // null-operand guards were dropped by extraction around the two
      // setOperand calls below.
        SI->setOperand(1, Constant::getNullValue(NewTy));

        SI->setOperand(2, Constant::getNullValue(NewTy));
      } else if (PHINode *Phi = dyn_cast<PHINode>(V)) {
        for (unsigned I = 0, E = Phi->getNumIncomingValues(); I != E; ++I) {
          // NOTE(review): the null-constant check on the incoming value was
          // dropped by extraction here.
              Phi->getIncomingValue(I)))
            Phi->setIncomingValue(I, Constant::getNullValue(NewTy));
        }
      }

      continue;
    }

    // NOTE(review): the cast of Call to IntrinsicInst (Intr) was dropped by
    // extraction here.
    Builder.SetInsertPoint(Intr);
    switch (Intr->getIntrinsicID()) {
    case Intrinsic::lifetime_start:
    case Intrinsic::lifetime_end:
      // These intrinsics are for address space 0 only
      Intr->eraseFromParent();
      continue;
    case Intrinsic::memcpy:
    case Intrinsic::memmove:
      // These have 2 pointer operands. In case if second pointer also needs
      // to be replaced we defer processing of these intrinsics until all
      // other values are processed.
      DeferredIntrs.insert(Intr);
      continue;
    case Intrinsic::memset: {
      MemSetInst *MemSet = cast<MemSetInst>(Intr);
      // Re-emit the memset so the destination pointer type matches the new
      // address space.
      Builder.CreateMemSet(MemSet->getRawDest(), MemSet->getValue(),
                           MemSet->getLength(), MemSet->getDestAlign(),
                           MemSet->isVolatile());
      Intr->eraseFromParent();
      continue;
    }
    case Intrinsic::invariant_start:
    case Intrinsic::invariant_end:
    case Intrinsic::launder_invariant_group:
    case Intrinsic::strip_invariant_group: {
      // NOTE(review): the argument-vector declaration (Args) was dropped by
      // extraction here.
      if (Intr->getIntrinsicID() == Intrinsic::invariant_start) {
        Args.emplace_back(Intr->getArgOperand(0));
      } else if (Intr->getIntrinsicID() == Intrinsic::invariant_end) {
        Args.emplace_back(Intr->getArgOperand(0));
        Args.emplace_back(Intr->getArgOperand(1));
      }
      Args.emplace_back(Offset);
      // NOTE(review): the start of the intrinsic re-declaration call was
      // dropped by extraction here.
          Intr->getModule(), Intr->getIntrinsicID(), Offset->getType());
      CallInst *NewIntr =
          CallInst::Create(F, Args, Intr->getName(), Intr->getIterator());
      Intr->mutateType(NewIntr->getType());
      Intr->replaceAllUsesWith(NewIntr);
      Intr->eraseFromParent();
      continue;
    }
    case Intrinsic::objectsize: {
      Value *Src = Intr->getOperand(0);

      CallInst *NewCall = Builder.CreateIntrinsic(
          Intrinsic::objectsize,
          // NOTE(review): the overload-type list argument was dropped by
          // extraction here.
          {Src, Intr->getOperand(1), Intr->getOperand(2), Intr->getOperand(3)});
      Intr->replaceAllUsesWith(NewCall);
      Intr->eraseFromParent();
      continue;
    }
    default:
      Intr->print(errs());
      llvm_unreachable("Don't know how to promote alloca intrinsic use.");
    }
  }

  return true;
}
1779
1780void AMDGPUPromoteAllocaImpl::finishDeferredAllocaToLDSPromotion(
1781 SetVector<IntrinsicInst *> &DeferredIntrs) {
1782
1783 for (IntrinsicInst *Intr : DeferredIntrs) {
1784 IRBuilder<> Builder(Intr);
1785 Builder.SetInsertPoint(Intr);
1787 assert(ID == Intrinsic::memcpy || ID == Intrinsic::memmove);
1788
1790 auto *B = Builder.CreateMemTransferInst(
1791 ID, MI->getRawDest(), MI->getDestAlign(), MI->getRawSource(),
1792 MI->getSourceAlign(), MI->getLength(), MI->isVolatile());
1793
1794 for (unsigned I = 0; I != 2; ++I) {
1795 if (uint64_t Bytes = Intr->getParamDereferenceableBytes(I)) {
1796 B->addDereferenceableParamAttr(I, Bytes);
1797 }
1798 }
1799
1800 Intr->eraseFromParent();
1801 }
1802}
assert(UImm &&(UImm !=~static_cast< T >(0)) &&"Invalid immediate!")
static Value * promoteAllocaUserToVector(Instruction *Inst, const DataLayout &DL, AllocaAnalysis &AA, unsigned VecStoreSize, unsigned ElementSize, function_ref< Value *()> GetCurVal)
Promotes a single user of the alloca to a vector form.
AMDGPU promote alloca to vector or LDS
static bool isSupportedAccessType(FixedVectorType *VecTy, Type *AccessTy, const DataLayout &DL)
static void forEachWorkListItem(const InstContainer &WorkList, std::function< void(Instruction *)> Fn)
Iterates over an instruction worklist that may contain multiple instructions from the same basic bloc...
static std::optional< GEPToVectorIndex > computeGEPToVectorIndex(GetElementPtrInst *GEP, AllocaInst *Alloca, Type *VecElemTy, const DataLayout &DL)
static bool isSupportedMemset(MemSetInst *I, AllocaInst *AI, const DataLayout &DL)
static BasicBlock::iterator skipToNonAllocaInsertPt(BasicBlock &BB, BasicBlock::iterator I)
Find an insert point after an alloca, after all other allocas clustered at the start of the block.
static bool isCallPromotable(CallInst *CI)
static Value * calculateVectorIndex(Value *Ptr, AllocaAnalysis &AA)
MachineBasicBlock MachineBasicBlock::iterator DebugLoc DL
static GCRegistry::Add< ErlangGC > A("erlang", "erlang-compatible garbage collector")
static GCRegistry::Add< CoreCLRGC > E("coreclr", "CoreCLR-compatible GC")
static GCRegistry::Add< OcamlGC > B("ocaml", "ocaml 3.10-compatible GC")
@ Enable
static bool runOnFunction(Function &F, bool PostInlining)
AMD GCN specific subclass of TargetSubtarget.
#define DEBUG_TYPE
Hexagon Common GEP
IRTranslator LLVM IR MI
#define F(x, y, z)
Definition MD5.cpp:54
#define I(x, y, z)
Definition MD5.cpp:57
uint64_t IntrinsicInst * II
if(auto Err=PB.parsePassPipeline(MPM, Passes)) return wrap(std MPM run * Mod
#define INITIALIZE_PASS_DEPENDENCY(depName)
Definition PassSupport.h:42
#define INITIALIZE_PASS_END(passName, arg, name, cfg, analysis)
Definition PassSupport.h:44
#define INITIALIZE_PASS_BEGIN(passName, arg, name, cfg, analysis)
Definition PassSupport.h:39
Remove Loads Into Fake Uses
static unsigned getNumElements(Type *Ty)
This file contains some templates that are useful if you are working with the STL at all.
#define LLVM_DEBUG(...)
Definition Debug.h:114
static TableGen::Emitter::Opt Y("gen-skeleton-entry", EmitSkeleton, "Generate example skeleton entry")
Target-Independent Code Generator Pass Configuration Options pass.
Value * RHS
Value * LHS
static const AMDGPUSubtarget & get(const MachineFunction &MF)
Class for arbitrary precision integers.
Definition APInt.h:78
bool isZero() const
Determine if this value is zero, i.e. all bits are clear.
Definition APInt.h:381
LLVM_ABI APInt sdiv(const APInt &RHS) const
Signed division function for APInt.
Definition APInt.cpp:1675
LLVM_ABI APInt sextOrTrunc(unsigned width) const
Sign extend or truncate to width.
Definition APInt.cpp:1072
LLVM_ABI APInt srem(const APInt &RHS) const
Function for signed remainder operation.
Definition APInt.cpp:1776
an instruction to allocate memory on the stack
Type * getAllocatedType() const
Return the type that is being allocated by the instruction.
PassT::Result & getResult(IRUnitT &IR, ExtraArgTs... ExtraArgs)
Get the result of an analysis pass for a given IR unit.
Represent the analysis usage information of a pass.
AnalysisUsage & addRequired()
LLVM_ABI void setPreservesCFG()
This function should be called by the pass, iff they do not:
Definition Pass.cpp:270
static LLVM_ABI ArrayType * get(Type *ElementType, uint64_t NumElements)
This static method is the primary way to construct an ArrayType.
An instruction that atomically checks whether a specified value is in a memory location,...
an instruction that atomically reads a memory location, combines it with another value,...
LLVM Basic Block Representation.
Definition BasicBlock.h:62
iterator end()
Definition BasicBlock.h:474
const Function * getParent() const
Return the enclosing method, or null if none.
Definition BasicBlock.h:213
InstListType::iterator iterator
Instruction iterators...
Definition BasicBlock.h:170
Represents analyses that only rely on functions' control flow.
Definition Analysis.h:73
uint64_t getParamDereferenceableBytes(unsigned i) const
Extract the number of dereferenceable bytes for a call or parameter (0=unknown).
void addDereferenceableRetAttr(uint64_t Bytes)
adds the dereferenceable attribute to the list of attributes.
void addRetAttr(Attribute::AttrKind Kind)
Adds the attribute to the return value.
Value * getArgOperand(unsigned i) const
This class represents a function call, abstracting a target machine's calling convention.
static CallInst * Create(FunctionType *Ty, Value *F, const Twine &NameStr="", InsertPosition InsertBefore=nullptr)
static LLVM_ABI bool isBitOrNoopPointerCastable(Type *SrcTy, Type *DestTy, const DataLayout &DL)
Check whether a bitcast, inttoptr, or ptrtoint cast between these types is valid and a no-op.
This is the shared class of boolean and integer constants.
Definition Constants.h:87
uint64_t getZExtValue() const
Return the constant as a 64-bit unsigned integer value after it has been zero extended as appropriate...
Definition Constants.h:168
This is an important base class in LLVM.
Definition Constant.h:43
static LLVM_ABI Constant * getNullValue(Type *Ty)
Constructor to create a '0' constant of arbitrary type.
A parsed version of the target data layout string in and methods for querying it.
Definition DataLayout.h:64
std::pair< iterator, bool > insert(const std::pair< KeyT, ValueT > &KV)
Definition DenseMap.h:241
Implements a dense probed hash-table based set.
Definition DenseSet.h:279
Class to represent fixed width SIMD vectors.
unsigned getNumElements() const
static LLVM_ABI FixedVectorType * get(Type *ElementType, unsigned NumElts)
Definition Type.cpp:873
FunctionPass class - This class is used to implement most global optimizations.
Definition Pass.h:314
Class to represent function types.
CallingConv::ID getCallingConv() const
getCallingConv()/setCallingConv(CC) - These method get and set the calling convention of this functio...
Definition Function.h:272
an instruction for type-safe pointer arithmetic to access elements of arrays and structs
bool hasExternalLinkage() const
void setUnnamedAddr(UnnamedAddr Val)
unsigned getAddressSpace() const
@ InternalLinkage
Rename collisions when linking (static functions).
Definition GlobalValue.h:60
Type * getValueType() const
MaybeAlign getAlign() const
Returns the alignment of the given variable.
LLVM_ABI uint64_t getGlobalSize(const DataLayout &DL) const
Get the size of this global variable in bytes.
Definition Globals.cpp:563
void setAlignment(Align Align)
Sets the alignment attribute of the GlobalVariable.
This instruction compares its operands according to the predicate given to the constructor.
LoadInst * CreateAlignedLoad(Type *Ty, Value *Ptr, MaybeAlign Align, const char *Name)
Definition IRBuilder.h:1894
Value * CreateLShr(Value *LHS, Value *RHS, const Twine &Name="", bool isExact=false)
Definition IRBuilder.h:1539
BasicBlock * GetInsertBlock() const
Definition IRBuilder.h:201
Value * CreateInBoundsGEP(Type *Ty, Value *Ptr, ArrayRef< Value * > IdxList, const Twine &Name="")
Definition IRBuilder.h:1975
LLVM_ABI CallInst * CreateIntrinsic(Intrinsic::ID ID, ArrayRef< Type * > Types, ArrayRef< Value * > Args, FMFSource FMFSource={}, const Twine &Name="")
Create a call to intrinsic ID with Args, mangled using Types.
CallInst * CreateMemSet(Value *Ptr, Value *Val, uint64_t Size, MaybeAlign Align, bool isVolatile=false, const AAMDNodes &AAInfo=AAMDNodes())
Create and insert a memset to the specified pointer and the specified value.
Definition IRBuilder.h:653
Value * CreateAdd(Value *LHS, Value *RHS, const Twine &Name="", bool HasNUW=false, bool HasNSW=false)
Definition IRBuilder.h:1429
CallInst * CreateCall(FunctionType *FTy, Value *Callee, ArrayRef< Value * > Args={}, const Twine &Name="", MDNode *FPMathTag=nullptr)
Definition IRBuilder.h:2510
Value * CreateConstInBoundsGEP1_64(Type *Ty, Value *Ptr, uint64_t Idx0, const Twine &Name="")
Definition IRBuilder.h:2017
void SetInsertPoint(BasicBlock *TheBB)
This specifies that created instructions should be appended to the end of the specified block.
Definition IRBuilder.h:207
LLVM_ABI CallInst * CreateMemTransferInst(Intrinsic::ID IntrID, Value *Dst, MaybeAlign DstAlign, Value *Src, MaybeAlign SrcAlign, Value *Size, bool isVolatile=false, const AAMDNodes &AAInfo=AAMDNodes())
Value * CreateMul(Value *LHS, Value *RHS, const Twine &Name="", bool HasNUW=false, bool HasNSW=false)
Definition IRBuilder.h:1463
This provides a uniform API for creating instructions and inserting them into a basic block: either a...
Definition IRBuilder.h:2811
InstSimplifyFolder - Use InstructionSimplify to fold operations to existing values.
LLVM_ABI const Module * getModule() const
Return the module owning the function this instruction belongs to or nullptr it the function does not...
LLVM_ABI InstListType::iterator eraseFromParent()
This method unlinks 'this' from the containing basic block and deletes it.
LLVM_ABI void setMetadata(unsigned KindID, MDNode *Node)
Set the metadata of the specified kind to the specified node.
unsigned getOpcode() const
Returns a member of one of the enums like Instruction::Add.
Class to represent integer types.
A wrapper class for inspecting calls to intrinsic functions.
Intrinsic::ID getIntrinsicID() const
Return the intrinsic ID of this intrinsic.
This is an important class for using LLVM in a threaded context.
Definition LLVMContext.h:68
An instruction for reading from memory.
Analysis pass that exposes the LoopInfo for a function.
Definition LoopInfo.h:569
The legacy pass manager's analysis pass to compute loop information.
Definition LoopInfo.h:596
Metadata node.
Definition Metadata.h:1080
static MDTuple * get(LLVMContext &Context, ArrayRef< Metadata * > MDs)
Definition Metadata.h:1572
This class implements a map that also provides access to all stored values in a deterministic order.
Definition MapVector.h:36
bool empty() const
Definition MapVector.h:77
size_type size() const
Definition MapVector.h:56
std::pair< KeyT, ValueT > & front()
Definition MapVector.h:79
Value * getLength() const
Value * getRawDest() const
MaybeAlign getDestAlign() const
bool isVolatile() const
Value * getValue() const
This class wraps the llvm.memset and llvm.memset.inline intrinsics.
This class wraps the llvm.memcpy/memmove intrinsics.
A Module instance is used to store all the information related to an LLVM module.
Definition Module.h:67
virtual void getAnalysisUsage(AnalysisUsage &) const
getAnalysisUsage - This function should be overriden by passes that need analysis information to do t...
Definition Pass.cpp:112
Class to represent pointers.
static LLVM_ABI PointerType * get(Type *ElementType, unsigned AddressSpace)
This constructs a pointer to an object of the specified type in a numbered address space.
static LLVM_ABI PoisonValue * get(Type *T)
Static factory methods - Return an 'poison' object of the specified type.
A set of analyses that are preserved following a run of a transformation pass.
Definition Analysis.h:112
static PreservedAnalyses all()
Construct a special preserved set that preserves all passes.
Definition Analysis.h:118
PreservedAnalyses & preserveSet()
Mark an analysis set as preserved.
Definition Analysis.h:151
Helper class for SSA formation on a set of values defined in multiple blocks.
Definition SSAUpdater.h:39
void Initialize(Type *Ty, StringRef Name)
Reset this object to get ready for a new set of SSA updates with type 'Ty'.
Value * GetValueInMiddleOfBlock(BasicBlock *BB)
Construct SSA form, materializing a value that is live in the middle of the specified block.
void AddAvailableValue(BasicBlock *BB, Value *V)
Indicate that a rewritten value is available in the specified block with the specified value.
This class represents the LLVM 'select' instruction.
A vector that has set insertion semantics.
Definition SetVector.h:57
bool contains(const_arg_type key) const
Check if the SetVector contains the given key.
Definition SetVector.h:252
bool insert(const value_type &X)
Insert a new element into the SetVector.
Definition SetVector.h:151
size_type size() const
Definition SmallPtrSet.h:99
std::pair< iterator, bool > insert(PtrType Ptr)
Inserts Ptr if and only if there is no element in the container equal to Ptr.
SmallPtrSet - This class implements a set which is optimized for holding SmallSize or less elements.
A SetVector that performs no allocations if smaller than a certain size.
Definition SetVector.h:339
reference emplace_back(ArgTypes &&... Args)
void reserve(size_type N)
This is a 'vector' (really, a variable-sized array), optimized for the case when the array is small.
An instruction for storing to memory.
static unsigned getPointerOperandIndex()
StringRef - Represent a constant reference to a string, i.e.
Definition StringRef.h:55
Primary interface to the complete machine description for the target machine.
const Triple & getTargetTriple() const
const STC & getSubtarget(const Function &F) const
This method returns a pointer to the specified type of TargetSubtargetInfo.
Triple - Helper class for working with autoconf configuration names.
Definition Triple.h:47
bool isAMDGCN() const
Tests whether the target is AMDGCN.
Definition Triple.h:954
Twine - A lightweight data structure for efficiently representing the concatenation of temporary valu...
Definition Twine.h:82
The instances of the Type class are immutable: once they are created, they are never changed.
Definition Type.h:46
bool isArrayTy() const
True if this is an instance of ArrayType.
Definition Type.h:281
static LLVM_ABI IntegerType * getInt32Ty(LLVMContext &C)
Definition Type.cpp:313
bool isPointerTy() const
True if this is an instance of PointerType.
Definition Type.h:284
bool isAggregateType() const
Return true if the type is an aggregate type.
Definition Type.h:321
LLVM_ABI Type * getWithNewType(Type *EltTy) const
Given vector type, change the element type, whilst keeping the old number of elements.
static LLVM_ABI IntegerType * getIntNTy(LLVMContext &C, unsigned N)
Definition Type.cpp:317
A Use represents the edge between a Value definition and its users.
Definition Use.h:35
void setOperand(unsigned i, Value *Val)
Definition User.h:212
Value * getOperand(unsigned i) const
Definition User.h:207
LLVM Value Representation.
Definition Value.h:75
Type * getType() const
All values are typed, get the type of this value.
Definition Value.h:256
LLVM_ABI void print(raw_ostream &O, bool IsForDebug=false) const
Implement operator<< on Value.
LLVM_ABI void replaceAllUsesWith(Value *V)
Change all uses of this to point to a new Value.
Definition Value.cpp:553
LLVMContext & getContext() const
All values hold a context through their type.
Definition Value.h:259
iterator_range< user_iterator > users()
Definition Value.h:427
LLVM_ABI const Value * stripPointerCasts() const
Strip off pointer casts, all-zero GEPs and address space casts.
Definition Value.cpp:713
bool use_empty() const
Definition Value.h:347
void mutateType(Type *Ty)
Mutate the type of this Value to be of the specified type.
Definition Value.h:845
LLVM_ABI StringRef getName() const
Return a constant reference to the value's name.
Definition Value.cpp:322
LLVM_ABI void takeName(Value *V)
Transfer the name from V to this value.
Definition Value.cpp:403
static LLVM_ABI bool isValidElementType(Type *ElemTy)
Return true if the specified type is valid as a element type.
Type * getElementType() const
Value handle that is nullable, but tries to track the Value.
constexpr bool isKnownMultipleOf(ScalarTy RHS) const
This function tells the caller whether the element count is known at compile time to be a multiple of...
Definition TypeSize.h:180
constexpr ScalarTy getFixedValue() const
Definition TypeSize.h:200
An efficient, type-erasing, non-owning reference to a callable.
const ParentTy * getParent() const
Definition ilist_node.h:34
self_iterator getIterator()
Definition ilist_node.h:123
CallInst * Call
Changed
#define llvm_unreachable(msg)
Marks that the current location is not supposed to be reachable.
Abstract Attribute helper functions.
Definition Attributor.h:165
@ LOCAL_ADDRESS
Address space for local memory.
constexpr char Args[]
Key for Kernel::Metadata::mArgs.
LLVM_READNONE constexpr bool isEntryFunctionCC(CallingConv::ID CC)
unsigned getDynamicVGPRBlockSize(const Function &F)
@ Entry
Definition COFF.h:862
unsigned ID
LLVM IR allows to use arbitrary numbers as calling convention identifiers.
Definition CallingConv.h:24
@ AMDGPU_KERNEL
Used for AMDGPU code object kernels.
@ SPIR_KERNEL
Used for SPIR kernel functions.
@ C
The default llvm calling convention, compatible with C.
Definition CallingConv.h:34
This namespace contains an enum with a value for every intrinsic/builtin function known by LLVM.
LLVM_ABI Function * getOrInsertDeclaration(Module *M, ID id, ArrayRef< Type * > Tys={})
Look up the Function declaration of the intrinsic id in the Module M.
specific_intval< false > m_SpecificInt(const APInt &V)
Match a specific integer value or vector with all elements equal to the value.
bool match(Val *V, const Pattern &P)
initializer< Ty > init(const Ty &Val)
NodeAddr< PhiNode * > Phi
Definition RDFGraph.h:390
This is an optimization pass for GlobalISel generic memory operations.
@ Offset
Definition DWP.cpp:532
@ Length
Definition DWP.cpp:532
void stable_sort(R &&Range)
Definition STLExtras.h:2116
auto find(R &&Range, const T &Val)
Provide wrappers to std::find which take ranges instead of having to pass begin/end explicitly.
Definition STLExtras.h:1765
bool all_of(R &&range, UnaryPredicate P)
Provide wrappers to std::all_of which take ranges instead of having to pass begin/end explicitly.
Definition STLExtras.h:1739
LLVM_ABI bool isAssumeLikeIntrinsic(const Instruction *I)
Return true if it is an intrinsic that cannot be speculated but also cannot trap.
decltype(auto) dyn_cast(const From &Val)
dyn_cast<X> - Return the argument parameter cast to the specified type.
Definition Casting.h:643
const Value * getLoadStorePointerOperand(const Value *V)
A helper function that returns the pointer operand of a load or store instruction.
constexpr bool isPowerOf2_64(uint64_t Value)
Return true if the argument is a power of two > 0 (64 bit edition.)
Definition MathExtras.h:284
unsigned Log2_64(uint64_t Value)
Return the floor log base 2 of the specified value, -1 if the value is zero.
Definition MathExtras.h:337
const Value * getPointerOperand(const Value *V)
A helper function that returns the pointer operand of a load, store or GEP instruction.
unsigned Log2_32(uint32_t Value)
Return the floor log base 2 of the specified value, -1 if the value is zero.
Definition MathExtras.h:331
auto reverse(ContainerTy &&C)
Definition STLExtras.h:408
constexpr bool isPowerOf2_32(uint32_t Value)
Return true if the argument is a power of two > 0.
Definition MathExtras.h:279
void sort(IteratorTy Start, IteratorTy End)
Definition STLExtras.h:1636
LLVM_ABI void computeKnownBits(const Value *V, KnownBits &Known, const DataLayout &DL, AssumptionCache *AC=nullptr, const Instruction *CxtI=nullptr, const DominatorTree *DT=nullptr, bool UseInstrInfo=true, unsigned Depth=0)
Determine which bits of V are known to be either zero or one and return them in the KnownZero/KnownOn...
LLVM_ABI raw_ostream & dbgs()
dbgs() - This returns a reference to a raw_ostream for debugging messages.
Definition Debug.cpp:207
constexpr uint64_t alignTo(uint64_t Size, Align A)
Returns a multiple of A needed to store Size bytes.
Definition Alignment.h:144
bool isa(const From &Val)
isa<X> - Return true if the parameter to the template is an instance of one of the template type argu...
Definition Casting.h:547
constexpr int PoisonMaskElem
LLVM_ABI raw_fd_ostream & errs()
This returns a reference to a raw_ostream for standard error.
FunctionPass * createAMDGPUPromoteAlloca()
@ Mod
The access may modify the value stored in memory.
Definition ModRef.h:34
decltype(auto) cast(const From &Val)
cast<X> - Return the argument parameter cast to the specified type.
Definition Casting.h:559
Type * getLoadStoreType(const Value *I)
A helper function that returns the type of a load or store instruction.
char & AMDGPUPromoteAllocaID
AnalysisManager< Function > FunctionAnalysisManager
Convenience typedef for the Function analysis manager.
LLVM_ABI const Value * getUnderlyingObject(const Value *V, unsigned MaxLookup=MaxLookupSearchDepth)
This method strips off any GEP address adjustments, pointer casts or llvm.threadlocal....
#define N
AMDGPUPromoteAllocaPass(TargetMachine &TM)
Definition AMDGPU.h:259
PreservedAnalyses run(Function &F, FunctionAnalysisManager &AM)
PreservedAnalyses run(Function &F, FunctionAnalysisManager &AM)
This struct is a compact representation of a valid (non-zero power of two) alignment.
Definition Alignment.h:39
unsigned countMinTrailingZeros() const
Returns the minimum number of trailing zero bits.
Definition KnownBits.h:258
A MapVector that performs no allocations if smaller than a certain size.
Definition MapVector.h:276
Function object to check whether the second component of a container supported by std::get (like std:...
Definition STLExtras.h:1448