LLVM 23.0.0git
AMDGPUPromoteAlloca.cpp
Go to the documentation of this file.
1//===-- AMDGPUPromoteAlloca.cpp - Promote Allocas -------------------------===//
2//
3// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4// See https://llvm.org/LICENSE.txt for license information.
5// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6//
7//===----------------------------------------------------------------------===//
8//
9// Eliminates allocas by either converting them into vectors or by migrating
10// them to local address space.
11//
12// Two passes are exposed by this file:
13// - "promote-alloca-to-vector", which runs early in the pipeline and only
14// promotes to vector. Promotion to vector is almost always profitable
15// except when the alloca is too big and the promotion would result in
16// very high register pressure.
17// - "promote-alloca", which does both promotion to vector and LDS and runs
18// much later in the pipeline. This runs after SROA because promoting to
19// LDS is of course less profitable than getting rid of the alloca or
20// vectorizing it, thus we only want to do it when the only alternative is
21// lowering the alloca to stack.
22//
23// Note that both of them exist for the old and new PMs. The new PM passes are
24// declared in AMDGPU.h and the legacy PM ones are declared here.s
25//
26//===----------------------------------------------------------------------===//
27
28#include "AMDGPU.h"
29#include "GCNSubtarget.h"
31#include "llvm/ADT/STLExtras.h"
38#include "llvm/IR/IRBuilder.h"
40#include "llvm/IR/IntrinsicsAMDGPU.h"
41#include "llvm/IR/IntrinsicsR600.h"
44#include "llvm/Pass.h"
48
49#define DEBUG_TYPE "amdgpu-promote-alloca"
50
51using namespace llvm;
52
53namespace {
54
55static cl::opt<bool>
56 DisablePromoteAllocaToVector("disable-promote-alloca-to-vector",
57 cl::desc("Disable promote alloca to vector"),
58 cl::init(false));
59
60static cl::opt<bool>
61 DisablePromoteAllocaToLDS("disable-promote-alloca-to-lds",
62 cl::desc("Disable promote alloca to LDS"),
63 cl::init(false));
64
65static cl::opt<unsigned> PromoteAllocaToVectorLimit(
66 "amdgpu-promote-alloca-to-vector-limit",
67 cl::desc("Maximum byte size to consider promote alloca to vector"),
68 cl::init(0));
69
70static cl::opt<unsigned> PromoteAllocaToVectorMaxRegs(
71 "amdgpu-promote-alloca-to-vector-max-regs",
73 "Maximum vector size (in 32b registers) to use when promoting alloca"),
74 cl::init(32));
75
76// Use up to 1/4 of available register budget for vectorization.
77// FIXME: Increase the limit for whole function budgets? Perhaps x2?
78static cl::opt<unsigned> PromoteAllocaToVectorVGPRRatio(
79 "amdgpu-promote-alloca-to-vector-vgpr-ratio",
80 cl::desc("Ratio of VGPRs to budget for promoting alloca to vectors"),
81 cl::init(4));
82
84 LoopUserWeight("promote-alloca-vector-loop-user-weight",
85 cl::desc("The bonus weight of users of allocas within loop "
86 "when sorting profitable allocas"),
87 cl::init(4));
88
89// We support vector indices of the form ((A * stride) >> shift) + B
90// VarIndex is A, VarMul is stride, VarShift is shift and ConstIndex is B. All
91// parts are optional.
92struct GEPToVectorIndex {
93 WeakTrackingVH VarIndex = nullptr; // defaults to 0
94 ConstantInt *VarMul = nullptr; // defaults to 1
95 ConstantInt *VarShift = nullptr; // defaults to 0
96 ConstantInt *ConstIndex = nullptr; // defaults to 0
97 Value *Full = nullptr;
98};
99
100struct MemTransferInfo {
101 ConstantInt *SrcIndex = nullptr;
102 ConstantInt *DestIndex = nullptr;
103};
104
105// Analysis for planning the different strategies of alloca promotion.
106struct AllocaAnalysis {
107 AllocaInst *Alloca = nullptr;
108 DenseSet<Value *> Pointers;
110 unsigned Score = 0;
111 bool HaveSelectOrPHI = false;
112 struct {
113 FixedVectorType *Ty = nullptr;
115 SmallVector<Instruction *> UsersToRemove;
118 } Vector;
119 struct {
120 bool Enable = false;
121 SmallVector<User *> Worklist;
122 } LDS;
123
124 explicit AllocaAnalysis(AllocaInst *Alloca) : Alloca(Alloca) {}
125};
126
127// Shared implementation which can do both promotion to vector and to LDS.
128class AMDGPUPromoteAllocaImpl {
129private:
130 const TargetMachine &TM;
131 LoopInfo &LI;
132 Module &Mod;
133 const DataLayout &DL;
134
135 // FIXME: This should be per-kernel.
136 uint32_t LocalMemLimit = 0;
137 uint32_t CurrentLocalMemUsage = 0;
138 unsigned MaxVGPRs;
139 unsigned VGPRBudgetRatio;
140 unsigned MaxVectorRegs;
141
142 bool IsAMDGCN = false;
143 bool IsAMDHSA = false;
144
145 std::pair<Value *, Value *> getLocalSizeYZ(IRBuilder<> &Builder);
146 Value *getWorkitemID(IRBuilder<> &Builder, unsigned N);
147
148 bool collectAllocaUses(AllocaAnalysis &AA) const;
149
150 /// Val is a derived pointer from Alloca. OpIdx0/OpIdx1 are the operand
151 /// indices to an instruction with 2 pointer inputs (e.g. select, icmp).
152 /// Returns true if both operands are derived from the same alloca. Val should
153 /// be the same value as one of the input operands of UseInst.
154 bool binaryOpIsDerivedFromSameAlloca(Value *Alloca, Value *Val,
155 Instruction *UseInst, int OpIdx0,
156 int OpIdx1) const;
157
158 /// Check whether we have enough local memory for promotion.
159 bool hasSufficientLocalMem(const Function &F);
160
161 FixedVectorType *getVectorTypeForAlloca(Type *AllocaTy) const;
162 void analyzePromoteToVector(AllocaAnalysis &AA) const;
163 void promoteAllocaToVector(AllocaAnalysis &AA);
164 void analyzePromoteToLDS(AllocaAnalysis &AA) const;
165 bool tryPromoteAllocaToLDS(AllocaAnalysis &AA, bool SufficientLDS,
166 SetVector<IntrinsicInst *> &DeferredIntrs);
167 void
168 finishDeferredAllocaToLDSPromotion(SetVector<IntrinsicInst *> &DeferredIntrs);
169
170 void scoreAlloca(AllocaAnalysis &AA) const;
171
172 void setFunctionLimits(const Function &F);
173
174public:
175 AMDGPUPromoteAllocaImpl(TargetMachine &TM, Module &M, LoopInfo &LI)
176 : TM(TM), LI(LI), Mod(M), DL(M.getDataLayout()) {
177 const Triple &TT = M.getTargetTriple();
178 IsAMDGCN = TT.isAMDGCN();
179 IsAMDHSA = TT.getOS() == Triple::AMDHSA;
180 }
181
182 bool run(Function &F, bool PromoteToLDS);
183};
184
185// FIXME: This can create globals so should be a module pass.
186class AMDGPUPromoteAlloca : public FunctionPass {
187public:
188 static char ID;
189
190 AMDGPUPromoteAlloca() : FunctionPass(ID) {}
191
192 bool runOnFunction(Function &F) override {
193 if (skipFunction(F))
194 return false;
195 if (auto *TPC = getAnalysisIfAvailable<TargetPassConfig>())
196 return AMDGPUPromoteAllocaImpl(
197 TPC->getTM<TargetMachine>(), *F.getParent(),
198 getAnalysis<LoopInfoWrapperPass>().getLoopInfo())
199 .run(F, /*PromoteToLDS*/ true);
200 return false;
201 }
202
203 StringRef getPassName() const override { return "AMDGPU Promote Alloca"; }
204
205 void getAnalysisUsage(AnalysisUsage &AU) const override {
206 AU.setPreservesCFG();
209 }
210};
211
212static unsigned getMaxVGPRs(unsigned LDSBytes, const TargetMachine &TM,
213 const Function &F) {
214 const GCNSubtarget &ST = TM.getSubtarget<GCNSubtarget>(F);
215
216 unsigned DynamicVGPRBlockSize = AMDGPU::getDynamicVGPRBlockSize(F);
217 // Temporarily check both the attribute and the subtarget feature, until the
218 // latter is removed.
219 if (DynamicVGPRBlockSize == 0 && ST.isDynamicVGPREnabled())
220 DynamicVGPRBlockSize = ST.getDynamicVGPRBlockSize();
221
222 unsigned MaxVGPRs = ST.getMaxNumVGPRs(
223 ST.getWavesPerEU(ST.getFlatWorkGroupSizes(F), LDSBytes, F).first,
224 DynamicVGPRBlockSize);
225
226 // A non-entry function has only 32 caller preserved registers.
227 // Do not promote alloca which will force spilling unless we know the function
228 // will be inlined.
229 if (!F.hasFnAttribute(Attribute::AlwaysInline) &&
230 !AMDGPU::isEntryFunctionCC(F.getCallingConv()))
231 MaxVGPRs = std::min(MaxVGPRs, 32u);
232 return MaxVGPRs;
233}
234
235} // end anonymous namespace
236
237char AMDGPUPromoteAlloca::ID = 0;
238
240 "AMDGPU promote alloca to vector or LDS", false, false)
241// Move LDS uses from functions to kernels before promote alloca for accurate
242// estimation of LDS available
243INITIALIZE_PASS_DEPENDENCY(AMDGPULowerModuleLDSLegacy)
245INITIALIZE_PASS_END(AMDGPUPromoteAlloca, DEBUG_TYPE,
246 "AMDGPU promote alloca to vector or LDS", false, false)
247
248char &llvm::AMDGPUPromoteAllocaID = AMDGPUPromoteAlloca::ID;
249
252 auto &LI = AM.getResult<LoopAnalysis>(F);
253 bool Changed = AMDGPUPromoteAllocaImpl(TM, *F.getParent(), LI)
254 .run(F, /*PromoteToLDS=*/true);
255 if (Changed) {
258 return PA;
259 }
260 return PreservedAnalyses::all();
261}
262
265 auto &LI = AM.getResult<LoopAnalysis>(F);
266 bool Changed = AMDGPUPromoteAllocaImpl(TM, *F.getParent(), LI)
267 .run(F, /*PromoteToLDS=*/false);
268 if (Changed) {
271 return PA;
272 }
273 return PreservedAnalyses::all();
274}
275
277 return new AMDGPUPromoteAlloca();
278}
279
280bool AMDGPUPromoteAllocaImpl::collectAllocaUses(AllocaAnalysis &AA) const {
281 const auto RejectUser = [&](Instruction *Inst, Twine Msg) {
282 LLVM_DEBUG(dbgs() << " Cannot promote alloca: " << Msg << "\n"
283 << " " << *Inst << "\n");
284 return false;
285 };
286
287 SmallVector<Instruction *, 4> WorkList({AA.Alloca});
288 while (!WorkList.empty()) {
289 auto *Cur = WorkList.pop_back_val();
290 if (find(AA.Pointers, Cur) != AA.Pointers.end())
291 continue;
292 AA.Pointers.insert(Cur);
293 for (auto &U : Cur->uses()) {
294 auto *Inst = cast<Instruction>(U.getUser());
295 if (isa<StoreInst>(Inst)) {
296 if (U.getOperandNo() != StoreInst::getPointerOperandIndex()) {
297 return RejectUser(Inst, "pointer escapes via store");
298 }
299 }
300 AA.Uses.push_back(&U);
301
302 if (isa<GetElementPtrInst>(U.getUser())) {
303 WorkList.push_back(Inst);
304 } else if (auto *SI = dyn_cast<SelectInst>(Inst)) {
305 // Only promote a select if we know that the other select operand is
306 // from another pointer that will also be promoted.
307 if (!binaryOpIsDerivedFromSameAlloca(AA.Alloca, Cur, SI, 1, 2))
308 return RejectUser(Inst, "select from mixed objects");
309 WorkList.push_back(Inst);
310 AA.HaveSelectOrPHI = true;
311 } else if (auto *Phi = dyn_cast<PHINode>(Inst)) {
312 // Repeat for phis.
313
314 // TODO: Handle more complex cases. We should be able to replace loops
315 // over arrays.
316 switch (Phi->getNumIncomingValues()) {
317 case 1:
318 break;
319 case 2:
320 if (!binaryOpIsDerivedFromSameAlloca(AA.Alloca, Cur, Phi, 0, 1))
321 return RejectUser(Inst, "phi from mixed objects");
322 break;
323 default:
324 return RejectUser(Inst, "phi with too many operands");
325 }
326
327 WorkList.push_back(Inst);
328 AA.HaveSelectOrPHI = true;
329 }
330 }
331 }
332 return true;
333}
334
335void AMDGPUPromoteAllocaImpl::scoreAlloca(AllocaAnalysis &AA) const {
336 LLVM_DEBUG(dbgs() << "Scoring: " << *AA.Alloca << "\n");
337 unsigned Score = 0;
338 // Increment score by one for each user + a bonus for users within loops.
339 for (auto *U : AA.Uses) {
340 Instruction *Inst = cast<Instruction>(U->getUser());
341 if (isa<GetElementPtrInst>(Inst) || isa<SelectInst>(Inst) ||
342 isa<PHINode>(Inst))
343 continue;
344 unsigned UserScore =
345 1 + (LoopUserWeight * LI.getLoopDepth(Inst->getParent()));
346 LLVM_DEBUG(dbgs() << " [+" << UserScore << "]:\t" << *Inst << "\n");
347 Score += UserScore;
348 }
349 LLVM_DEBUG(dbgs() << " => Final Score:" << Score << "\n");
350 AA.Score = Score;
351}
352
353void AMDGPUPromoteAllocaImpl::setFunctionLimits(const Function &F) {
354 // Load per function limits, overriding with global options where appropriate.
355 // R600 register tuples/aliasing are fragile with large vector promotions so
356 // apply architecture specific limit here.
357 const int R600MaxVectorRegs = 16;
358 MaxVectorRegs = F.getFnAttributeAsParsedInteger(
359 "amdgpu-promote-alloca-to-vector-max-regs",
360 IsAMDGCN ? PromoteAllocaToVectorMaxRegs : R600MaxVectorRegs);
361 if (PromoteAllocaToVectorMaxRegs.getNumOccurrences())
362 MaxVectorRegs = PromoteAllocaToVectorMaxRegs;
363 VGPRBudgetRatio = F.getFnAttributeAsParsedInteger(
364 "amdgpu-promote-alloca-to-vector-vgpr-ratio",
365 PromoteAllocaToVectorVGPRRatio);
366 if (PromoteAllocaToVectorVGPRRatio.getNumOccurrences())
367 VGPRBudgetRatio = PromoteAllocaToVectorVGPRRatio;
368}
369
370bool AMDGPUPromoteAllocaImpl::run(Function &F, bool PromoteToLDS) {
371 if (DisablePromoteAllocaToLDS && DisablePromoteAllocaToVector)
372 return false;
373
374 bool SufficientLDS = PromoteToLDS && hasSufficientLocalMem(F);
375 MaxVGPRs = IsAMDGCN ? getMaxVGPRs(CurrentLocalMemUsage, TM, F) : 128;
376 setFunctionLimits(F);
377
378 unsigned VectorizationBudget =
379 (PromoteAllocaToVectorLimit ? PromoteAllocaToVectorLimit * 8
380 : (MaxVGPRs * 32)) /
381 VGPRBudgetRatio;
382
383 std::vector<AllocaAnalysis> Allocas;
384 for (Instruction &I : F.getEntryBlock()) {
385 if (AllocaInst *AI = dyn_cast<AllocaInst>(&I)) {
386 // Array allocations are probably not worth handling, since an allocation
387 // of the array type is the canonical form.
388 if (!AI->isStaticAlloca() || AI->isArrayAllocation())
389 continue;
390
391 LLVM_DEBUG(dbgs() << "Analyzing: " << *AI << '\n');
392
393 AllocaAnalysis AA{AI};
394 if (collectAllocaUses(AA)) {
395 analyzePromoteToVector(AA);
396 if (PromoteToLDS)
397 analyzePromoteToLDS(AA);
398 if (AA.Vector.Ty || AA.LDS.Enable) {
399 scoreAlloca(AA);
400 Allocas.push_back(std::move(AA));
401 }
402 }
403 }
404 }
405
406 stable_sort(Allocas,
407 [](const auto &A, const auto &B) { return A.Score > B.Score; });
408
409 // clang-format off
411 dbgs() << "Sorted Worklist:\n";
412 for (const auto &AA : Allocas)
413 dbgs() << " " << *AA.Alloca << "\n";
414 );
415 // clang-format on
416
417 bool Changed = false;
418 SetVector<IntrinsicInst *> DeferredIntrs;
419 for (AllocaAnalysis &AA : Allocas) {
420 if (AA.Vector.Ty) {
421 std::optional<TypeSize> Size = AA.Alloca->getAllocationSize(DL);
422 assert(Size); // Expected to succeed on non-array alloca.
423 const unsigned AllocaCost = Size->getFixedValue() * 8;
424 // First, check if we have enough budget to vectorize this alloca.
425 if (AllocaCost <= VectorizationBudget) {
426 promoteAllocaToVector(AA);
427 Changed = true;
428 assert((VectorizationBudget - AllocaCost) < VectorizationBudget &&
429 "Underflow!");
430 VectorizationBudget -= AllocaCost;
431 LLVM_DEBUG(dbgs() << " Remaining vectorization budget:"
432 << VectorizationBudget << "\n");
433 continue;
434 } else {
435 LLVM_DEBUG(dbgs() << "Alloca too big for vectorization (size:"
436 << AllocaCost << ", budget:" << VectorizationBudget
437 << "): " << *AA.Alloca << "\n");
438 }
439 }
440
441 if (AA.LDS.Enable &&
442 tryPromoteAllocaToLDS(AA, SufficientLDS, DeferredIntrs))
443 Changed = true;
444 }
445 finishDeferredAllocaToLDSPromotion(DeferredIntrs);
446
447 // NOTE: tryPromoteAllocaToVector removes the alloca, so Allocas contains
448 // dangling pointers. If we want to reuse it past this point, the loop above
449 // would need to be updated to remove successfully promoted allocas.
450
451 return Changed;
452}
453
454// Checks if the instruction I is a memset user of the alloca AI that we can
455// deal with. Currently, only non-volatile memsets that affect the whole alloca
456// are handled.
458 const DataLayout &DL) {
459 using namespace PatternMatch;
460 // For now we only care about non-volatile memsets that affect the whole type
461 // (start at index 0 and fill the whole alloca).
462 //
463 // TODO: Now that we moved to PromoteAlloca we could handle any memsets
464 // (except maybe volatile ones?) - we just need to use shufflevector if it
465 // only affects a subset of the vector.
466 const unsigned Size = DL.getTypeStoreSize(AI->getAllocatedType());
467 return I->getOperand(0) == AI &&
468 match(I->getOperand(2), m_SpecificInt(Size)) && !I->isVolatile();
469}
470
471static Value *calculateVectorIndex(Value *Ptr, AllocaAnalysis &AA) {
472 IRBuilder<> B(Ptr->getContext());
473
474 Ptr = Ptr->stripPointerCasts();
475 if (Ptr == AA.Alloca)
476 return B.getInt32(0);
477
478 auto *GEP = cast<GetElementPtrInst>(Ptr);
479 auto I = AA.Vector.GEPVectorIdx.find(GEP);
480 assert(I != AA.Vector.GEPVectorIdx.end() && "Must have entry for GEP!");
481
482 if (!I->second.Full) {
483 Value *Result = nullptr;
484 B.SetInsertPoint(GEP);
485
486 if (I->second.VarIndex) {
487 Result = I->second.VarIndex;
488 Result = B.CreateSExtOrTrunc(Result, B.getInt32Ty());
489
490 if (I->second.VarMul)
491 Result = B.CreateMul(Result, I->second.VarMul);
492
493 if (I->second.VarShift)
494 Result = B.CreateAShr(Result, I->second.VarShift, "", /*isExact*/ true);
495 }
496
497 if (I->second.ConstIndex) {
498 if (Result)
499 Result = B.CreateAdd(Result, I->second.ConstIndex);
500 else
501 Result = I->second.ConstIndex;
502 }
503
504 if (!Result)
505 Result = B.getInt32(0);
506
507 I->second.Full = Result;
508 }
509
510 return I->second.Full;
511}
512
513static std::optional<GEPToVectorIndex>
515 Type *VecElemTy, const DataLayout &DL) {
516 // TODO: Extracting a "multiple of X" from a GEP might be a useful generic
517 // helper.
518 LLVMContext &Ctx = GEP->getContext();
519 unsigned BW = DL.getIndexTypeSizeInBits(GEP->getType());
521 APInt ConstOffset(BW, 0);
522
523 // Walk backwards through nested GEPs to collect both constant and variable
524 // offsets, so that nested vector GEP chains can be lowered in one step.
525 //
526 // Given this IR fragment as input:
527 //
528 // %0 = alloca [10 x <2 x i32>], align 8, addrspace(5)
529 // %1 = getelementptr [10 x <2 x i32>], ptr addrspace(5) %0, i32 0, i32 %j
530 // %2 = getelementptr i8, ptr addrspace(5) %1, i32 4
531 // %3 = load i32, ptr addrspace(5) %2, align 4
532 //
533 // Combine both GEP operations in a single pass, producing:
534 // BasePtr = %0
535 // ConstOffset = 4
536 // VarOffsets = { %j -> element_size(<2 x i32>) }
537 //
538 // That lets us emit a single buffer_load directly into a VGPR, without ever
539 // allocating scratch memory for the intermediate pointer.
540 Value *CurPtr = GEP;
541 while (auto *CurGEP = dyn_cast<GetElementPtrInst>(CurPtr)) {
542 if (!CurGEP->collectOffset(DL, BW, VarOffsets, ConstOffset))
543 return {};
544
545 // Move to the next outer pointer.
546 CurPtr = CurGEP->getPointerOperand();
547 }
548
549 assert(CurPtr == Alloca && "GEP not based on alloca");
550
551 int64_t VecElemSize = DL.getTypeAllocSize(VecElemTy);
552 if (VarOffsets.size() > 1)
553 return {};
554
555 // We support vector indices of the form ((VarIndex * stride) >> shift) + B.
556 // IndexQuot represents B. Check that the constant offset is a multiple
557 // of the vector element size.
558 if (ConstOffset.srem(VecElemSize) != 0)
559 return {};
560 APInt IndexQuot = ConstOffset.sdiv(VecElemSize);
561
562 GEPToVectorIndex Result;
563
564 if (!ConstOffset.isZero())
565 Result.ConstIndex = ConstantInt::get(Ctx, IndexQuot.sextOrTrunc(BW));
566
567 // If there are no variable offsets, only a constant offset, then we're done.
568 if (VarOffsets.empty())
569 return Result;
570
571 // Scale is the stride in the (A * stride) part. Check that there is only one
572 // variable offset and extract the scale factor.
573 const auto &VarOffset = VarOffsets.front();
574 auto ScaleOpt = VarOffset.second.tryZExtValue();
575 if (!ScaleOpt || *ScaleOpt == 0)
576 return {};
577
578 uint64_t Scale = *ScaleOpt;
579 Result.VarIndex = VarOffset.first;
580 auto *OffsetType = dyn_cast<IntegerType>(Result.VarIndex->getType());
581 if (!OffsetType)
582 return {};
583
584 // The vector index for the variable part is: VarIndex * Scale / VecElemSize.
585 if (Scale >= (uint64_t)VecElemSize) {
586 if (Scale % VecElemSize != 0)
587 return {};
588
589 // Scale is a multiple of VecElemSize, so the index is just: VarIndex *
590 // (Scale / VecElemSize).
591 uint64_t VarMul = Scale / VecElemSize;
592 // Only the multiplier is needed.
593 if (VarMul != 1)
594 Result.VarMul = ConstantInt::get(Ctx, APInt(BW, VarMul));
595 } else {
596 if ((uint64_t)VecElemSize % Scale != 0)
597 return {};
598
599 // VecElemSize is a multiple of Scale, so the index is just: VarIndex /
600 // (VecElemSize / Scale).
601 uint64_t Divisor = VecElemSize / Scale;
602 // The divisor must be a power of 2 so we can use a right shift.
603 if (!isPowerOf2_64(Divisor))
604 return {};
605
606 // VarIndex must be known to be divisible by that divisor.
607 KnownBits KB = computeKnownBits(VarOffset.first, DL);
608 if (KB.countMinTrailingZeros() < Log2_64(Divisor))
609 return {};
610
611 Result.VarShift = ConstantInt::get(Ctx, APInt(BW, Log2_64(Divisor)));
612 }
613
614 return Result;
615}
616
617/// Promotes a single user of the alloca to a vector form.
618///
619/// \param Inst Instruction to be promoted.
620/// \param DL Module Data Layout.
621/// \param AA Alloca Analysis.
622/// \param VecStoreSize Size of \p VectorTy in bytes.
623/// \param ElementSize Size of \p VectorTy element type in bytes.
624/// \param CurVal Current value of the vector (e.g. last stored value)
625/// \param[out] DeferredLoads \p Inst is added to this vector if it can't
626/// be promoted now. This happens when promoting requires \p
627/// CurVal, but \p CurVal is nullptr.
628/// \return the stored value if \p Inst would have written to the alloca, or
629/// nullptr otherwise.
631 AllocaAnalysis &AA,
632 unsigned VecStoreSize,
633 unsigned ElementSize,
634 function_ref<Value *()> GetCurVal) {
635 // Note: we use InstSimplifyFolder because it can leverage the DataLayout
636 // to do more folding, especially in the case of vector splats.
639 Builder.SetInsertPoint(Inst);
640
641 Type *VecEltTy = AA.Vector.Ty->getElementType();
642
643 switch (Inst->getOpcode()) {
644 case Instruction::Load: {
645 Value *CurVal = GetCurVal();
646 Value *Index =
648
649 // We're loading the full vector.
650 Type *AccessTy = Inst->getType();
651 TypeSize AccessSize = DL.getTypeStoreSize(AccessTy);
652 if (Constant *CI = dyn_cast<Constant>(Index)) {
653 if (CI->isNullValue() && AccessSize == VecStoreSize) {
654 Inst->replaceAllUsesWith(
655 Builder.CreateBitPreservingCastChain(DL, CurVal, AccessTy));
656 return nullptr;
657 }
658 }
659
660 // Loading a subvector.
661 if (isa<FixedVectorType>(AccessTy)) {
662 assert(AccessSize.isKnownMultipleOf(DL.getTypeStoreSize(VecEltTy)));
663 const unsigned NumLoadedElts = AccessSize / DL.getTypeStoreSize(VecEltTy);
664 auto *SubVecTy = FixedVectorType::get(VecEltTy, NumLoadedElts);
665 assert(DL.getTypeStoreSize(SubVecTy) == DL.getTypeStoreSize(AccessTy));
666
667 // If idx is dynamic, then sandwich load with bitcasts.
668 // ie. VectorTy SubVecTy AccessTy
669 // <64 x i8> -> <16 x i8> <8 x i16>
670 // <64 x i8> -> <4 x i128> -> i128 -> <8 x i16>
671 // Extracting subvector with dynamic index has very large expansion in
672 // the amdgpu backend. Limit to pow2.
673 FixedVectorType *VectorTy = AA.Vector.Ty;
674 TypeSize NumBits = DL.getTypeStoreSize(SubVecTy) * 8u;
675 uint64_t LoadAlign = cast<LoadInst>(Inst)->getAlign().value();
676 bool IsAlignedLoad = NumBits <= (LoadAlign * 8u);
677 unsigned TotalNumElts = VectorTy->getNumElements();
678 bool IsProperlyDivisible = TotalNumElts % NumLoadedElts == 0;
679 if (!isa<ConstantInt>(Index) &&
680 llvm::isPowerOf2_32(SubVecTy->getNumElements()) &&
681 IsProperlyDivisible && IsAlignedLoad) {
682 IntegerType *NewElemTy = Builder.getIntNTy(NumBits);
683 const unsigned NewNumElts =
684 DL.getTypeStoreSize(VectorTy) * 8u / NumBits;
685 const unsigned LShrAmt = llvm::Log2_32(SubVecTy->getNumElements());
686 FixedVectorType *BitCastTy =
687 FixedVectorType::get(NewElemTy, NewNumElts);
688 Value *BCVal = Builder.CreateBitCast(CurVal, BitCastTy);
689 Value *NewIdx = Builder.CreateLShr(
690 Index, ConstantInt::get(Index->getType(), LShrAmt));
691 Value *ExtVal = Builder.CreateExtractElement(BCVal, NewIdx);
692 Value *BCOut = Builder.CreateBitCast(ExtVal, AccessTy);
693 Inst->replaceAllUsesWith(BCOut);
694 return nullptr;
695 }
696
697 Value *SubVec = PoisonValue::get(SubVecTy);
698 for (unsigned K = 0; K < NumLoadedElts; ++K) {
699 Value *CurIdx =
700 Builder.CreateAdd(Index, ConstantInt::get(Index->getType(), K));
701 SubVec = Builder.CreateInsertElement(
702 SubVec, Builder.CreateExtractElement(CurVal, CurIdx), K);
703 }
704
705 Inst->replaceAllUsesWith(
706 Builder.CreateBitPreservingCastChain(DL, SubVec, AccessTy));
707 return nullptr;
708 }
709
710 // We're loading one element.
711 Value *ExtractElement = Builder.CreateExtractElement(CurVal, Index);
712 if (AccessTy != VecEltTy)
713 ExtractElement = Builder.CreateBitOrPointerCast(ExtractElement, AccessTy);
714
715 Inst->replaceAllUsesWith(ExtractElement);
716 return nullptr;
717 }
718 case Instruction::Store: {
719 // For stores, it's a bit trickier and it depends on whether we're storing
720 // the full vector or not. If we're storing the full vector, we don't need
721 // to know the current value. If this is a store of a single element, we
722 // need to know the value.
724 Value *Index = calculateVectorIndex(SI->getPointerOperand(), AA);
725 Value *Val = SI->getValueOperand();
726
727 // We're storing the full vector, we can handle this without knowing CurVal.
728 Type *AccessTy = Val->getType();
729 TypeSize AccessSize = DL.getTypeStoreSize(AccessTy);
730 if (Constant *CI = dyn_cast<Constant>(Index))
731 if (CI->isNullValue() && AccessSize == VecStoreSize)
732 return Builder.CreateBitPreservingCastChain(DL, Val, AA.Vector.Ty);
733
734 // Storing a subvector.
735 if (isa<FixedVectorType>(AccessTy)) {
736 assert(AccessSize.isKnownMultipleOf(DL.getTypeStoreSize(VecEltTy)));
737 const unsigned NumWrittenElts =
738 AccessSize / DL.getTypeStoreSize(VecEltTy);
739 const unsigned NumVecElts = AA.Vector.Ty->getNumElements();
740 auto *SubVecTy = FixedVectorType::get(VecEltTy, NumWrittenElts);
741 assert(DL.getTypeStoreSize(SubVecTy) == DL.getTypeStoreSize(AccessTy));
742
743 Val = Builder.CreateBitPreservingCastChain(DL, Val, SubVecTy);
744 Value *CurVec = GetCurVal();
745 for (unsigned K = 0, NumElts = std::min(NumWrittenElts, NumVecElts);
746 K < NumElts; ++K) {
747 Value *CurIdx =
748 Builder.CreateAdd(Index, ConstantInt::get(Index->getType(), K));
749 CurVec = Builder.CreateInsertElement(
750 CurVec, Builder.CreateExtractElement(Val, K), CurIdx);
751 }
752 return CurVec;
753 }
754
755 if (Val->getType() != VecEltTy)
756 Val = Builder.CreateBitOrPointerCast(Val, VecEltTy);
757 return Builder.CreateInsertElement(GetCurVal(), Val, Index);
758 }
759 case Instruction::Call: {
760 if (auto *MTI = dyn_cast<MemTransferInst>(Inst)) {
761 // For memcpy, we need to know curval.
762 ConstantInt *Length = cast<ConstantInt>(MTI->getLength());
763 unsigned NumCopied = Length->getZExtValue() / ElementSize;
764 MemTransferInfo *TI = &AA.Vector.TransferInfo[MTI];
765 unsigned SrcBegin = TI->SrcIndex->getZExtValue();
766 unsigned DestBegin = TI->DestIndex->getZExtValue();
767
768 SmallVector<int> Mask;
769 for (unsigned Idx = 0; Idx < AA.Vector.Ty->getNumElements(); ++Idx) {
770 if (Idx >= DestBegin && Idx < DestBegin + NumCopied) {
771 Mask.push_back(SrcBegin < AA.Vector.Ty->getNumElements()
772 ? SrcBegin++
774 } else {
775 Mask.push_back(Idx);
776 }
777 }
778
779 return Builder.CreateShuffleVector(GetCurVal(), Mask);
780 }
781
782 if (auto *MSI = dyn_cast<MemSetInst>(Inst)) {
783 // For memset, we don't need to know the previous value because we
784 // currently only allow memsets that cover the whole alloca.
785 Value *Elt = MSI->getOperand(1);
786 const unsigned BytesPerElt = DL.getTypeStoreSize(VecEltTy);
787 if (BytesPerElt > 1) {
788 Value *EltBytes = Builder.CreateVectorSplat(BytesPerElt, Elt);
789
790 // If the element type of the vector is a pointer, we need to first cast
791 // to an integer, then use a PtrCast.
792 if (VecEltTy->isPointerTy()) {
793 Type *PtrInt = Builder.getIntNTy(BytesPerElt * 8);
794 Elt = Builder.CreateBitCast(EltBytes, PtrInt);
795 Elt = Builder.CreateIntToPtr(Elt, VecEltTy);
796 } else
797 Elt = Builder.CreateBitCast(EltBytes, VecEltTy);
798 }
799
800 return Builder.CreateVectorSplat(AA.Vector.Ty->getElementCount(), Elt);
801 }
802
803 if (auto *Intr = dyn_cast<IntrinsicInst>(Inst)) {
804 if (Intr->getIntrinsicID() == Intrinsic::objectsize) {
805 Intr->replaceAllUsesWith(
806 Builder.getIntN(Intr->getType()->getIntegerBitWidth(),
807 DL.getTypeAllocSize(AA.Vector.Ty)));
808 return nullptr;
809 }
810 }
811
812 llvm_unreachable("Unsupported call when promoting alloca to vector");
813 }
814
815 default:
816 llvm_unreachable("Inconsistency in instructions promotable to vector");
817 }
818
819 llvm_unreachable("Did not return after promoting instruction!");
820}
821
822static bool isSupportedAccessType(FixedVectorType *VecTy, Type *AccessTy,
823 const DataLayout &DL) {
824 // Access as a vector type can work if the size of the access vector is a
825 // multiple of the size of the alloca's vector element type.
826 //
827 // Examples:
828 // - VecTy = <8 x float>, AccessTy = <4 x float> -> OK
829 // - VecTy = <4 x double>, AccessTy = <2 x float> -> OK
830 // - VecTy = <4 x double>, AccessTy = <3 x float> -> NOT OK
831 // - 3*32 is not a multiple of 64
832 //
833 // We could handle more complicated cases, but it'd make things a lot more
834 // complicated.
835 if (isa<FixedVectorType>(AccessTy)) {
836 TypeSize AccTS = DL.getTypeStoreSize(AccessTy);
837 // If the type size and the store size don't match, we would need to do more
838 // than just bitcast to translate between an extracted/insertable subvectors
839 // and the accessed value.
840 if (AccTS * 8 != DL.getTypeSizeInBits(AccessTy))
841 return false;
842 TypeSize VecTS = DL.getTypeStoreSize(VecTy->getElementType());
843 return AccTS.isKnownMultipleOf(VecTS);
844 }
845
847 DL);
848}
849
850/// Iterates over an instruction worklist that may contain multiple instructions
851/// from the same basic block, but in a different order.
852template <typename InstContainer>
853static void forEachWorkListItem(const InstContainer &WorkList,
854 std::function<void(Instruction *)> Fn) {
855 // Bucket up uses of the alloca by the block they occur in.
856 // This is important because we have to handle multiple defs/uses in a block
857 // ourselves: SSAUpdater is purely for cross-block references.
859 for (Instruction *User : WorkList)
860 UsesByBlock[User->getParent()].insert(User);
861
862 for (Instruction *User : WorkList) {
863 BasicBlock *BB = User->getParent();
864 auto &BlockUses = UsesByBlock[BB];
865
866 // Already processed, skip.
867 if (BlockUses.empty())
868 continue;
869
870 // Only user in the block, directly process it.
871 if (BlockUses.size() == 1) {
872 Fn(User);
873 continue;
874 }
875
876 // Multiple users in the block, do a linear scan to see users in order.
877 for (Instruction &Inst : *BB) {
878 if (!BlockUses.contains(&Inst))
879 continue;
880
881 Fn(&Inst);
882 }
883
884 // Clear the block so we know it's been processed.
885 BlockUses.clear();
886 }
887}
888
889/// Find an insert point after an alloca, after all other allocas clustered at
890/// the start of the block.
893 for (BasicBlock::iterator E = BB.end(); I != E && isa<AllocaInst>(*I); ++I)
894 ;
895 return I;
896}
897
899AMDGPUPromoteAllocaImpl::getVectorTypeForAlloca(Type *AllocaTy) const {
900 if (DisablePromoteAllocaToVector) {
901 LLVM_DEBUG(dbgs() << " Promote alloca to vectors is disabled\n");
902 return nullptr;
903 }
904
905 auto *VectorTy = dyn_cast<FixedVectorType>(AllocaTy);
906 if (auto *ArrayTy = dyn_cast<ArrayType>(AllocaTy)) {
907 uint64_t NumElems = 1;
908 Type *ElemTy;
909 do {
910 NumElems *= ArrayTy->getNumElements();
911 ElemTy = ArrayTy->getElementType();
912 } while ((ArrayTy = dyn_cast<ArrayType>(ElemTy)));
913
914 // Check for array of vectors
915 auto *InnerVectorTy = dyn_cast<FixedVectorType>(ElemTy);
916 if (InnerVectorTy) {
917 NumElems *= InnerVectorTy->getNumElements();
918 ElemTy = InnerVectorTy->getElementType();
919 }
920
921 if (VectorType::isValidElementType(ElemTy) && NumElems > 0) {
922 unsigned ElementSize = DL.getTypeSizeInBits(ElemTy) / 8;
923 if (ElementSize > 0) {
924 unsigned AllocaSize = DL.getTypeStoreSize(AllocaTy);
925 // Expand vector if required to match padding of inner type,
926 // i.e. odd size subvectors.
927 // Storage size of new vector must match that of alloca for correct
928 // behaviour of byte offsets and GEP computation.
929 if (NumElems * ElementSize != AllocaSize)
930 NumElems = AllocaSize / ElementSize;
931 if (NumElems > 0 && (AllocaSize % ElementSize) == 0)
932 VectorTy = FixedVectorType::get(ElemTy, NumElems);
933 }
934 }
935 }
936 if (!VectorTy) {
937 LLVM_DEBUG(dbgs() << " Cannot convert type to vector\n");
938 return nullptr;
939 }
940
941 const unsigned MaxElements =
942 (MaxVectorRegs * 32) / DL.getTypeSizeInBits(VectorTy->getElementType());
943
944 if (VectorTy->getNumElements() > MaxElements ||
945 VectorTy->getNumElements() < 2) {
946 LLVM_DEBUG(dbgs() << " " << *VectorTy
947 << " has an unsupported number of elements\n");
948 return nullptr;
949 }
950
951 Type *VecEltTy = VectorTy->getElementType();
952 unsigned ElementSizeInBits = DL.getTypeSizeInBits(VecEltTy);
953 if (ElementSizeInBits != DL.getTypeAllocSizeInBits(VecEltTy)) {
954 LLVM_DEBUG(dbgs() << " Cannot convert to vector if the allocation size "
955 "does not match the type's size\n");
956 return nullptr;
957 }
958
959 return VectorTy;
960}
961
962void AMDGPUPromoteAllocaImpl::analyzePromoteToVector(AllocaAnalysis &AA) const {
963 if (AA.HaveSelectOrPHI) {
964 LLVM_DEBUG(dbgs() << " Cannot convert to vector due to select or phi\n");
965 return;
966 }
967
968 Type *AllocaTy = AA.Alloca->getAllocatedType();
969 AA.Vector.Ty = getVectorTypeForAlloca(AllocaTy);
970 if (!AA.Vector.Ty)
971 return;
972
973 const auto RejectUser = [&](Instruction *Inst, Twine Msg) {
974 LLVM_DEBUG(dbgs() << " Cannot promote alloca to vector: " << Msg << "\n"
975 << " " << *Inst << "\n");
976 AA.Vector.Ty = nullptr;
977 };
978
979 Type *VecEltTy = AA.Vector.Ty->getElementType();
980 unsigned ElementSize = DL.getTypeSizeInBits(VecEltTy) / 8;
981 assert(ElementSize > 0);
982 for (auto *U : AA.Uses) {
983 Instruction *Inst = cast<Instruction>(U->getUser());
984
985 if (Value *Ptr = getLoadStorePointerOperand(Inst)) {
986 assert(!isa<StoreInst>(Inst) ||
987 U->getOperandNo() == StoreInst::getPointerOperandIndex());
988
989 Type *AccessTy = getLoadStoreType(Inst);
990 if (AccessTy->isAggregateType())
991 return RejectUser(Inst, "unsupported load/store as aggregate");
992 assert(!AccessTy->isAggregateType() || AccessTy->isArrayTy());
993
994 // Check that this is a simple access of a vector element.
995 bool IsSimple = isa<LoadInst>(Inst) ? cast<LoadInst>(Inst)->isSimple()
996 : cast<StoreInst>(Inst)->isSimple();
997 if (!IsSimple)
998 return RejectUser(Inst, "not a simple load or store");
999
1000 Ptr = Ptr->stripPointerCasts();
1001
1002 // Alloca already accessed as vector.
1003 if (Ptr == AA.Alloca &&
1004 DL.getTypeStoreSize(AA.Alloca->getAllocatedType()) ==
1005 DL.getTypeStoreSize(AccessTy)) {
1006 AA.Vector.Worklist.push_back(Inst);
1007 continue;
1008 }
1009
1010 if (!isSupportedAccessType(AA.Vector.Ty, AccessTy, DL))
1011 return RejectUser(Inst, "not a supported access type");
1012
1013 AA.Vector.Worklist.push_back(Inst);
1014 continue;
1015 }
1016
1017 if (auto *GEP = dyn_cast<GetElementPtrInst>(Inst)) {
1018 // If we can't compute a vector index from this GEP, then we can't
1019 // promote this alloca to vector.
1020 auto Index = computeGEPToVectorIndex(GEP, AA.Alloca, VecEltTy, DL);
1021 if (!Index)
1022 return RejectUser(Inst, "cannot compute vector index for GEP");
1023
1024 AA.Vector.GEPVectorIdx[GEP] = std::move(Index.value());
1025 AA.Vector.UsersToRemove.push_back(Inst);
1026 continue;
1027 }
1028
1029 if (MemSetInst *MSI = dyn_cast<MemSetInst>(Inst);
1030 MSI && isSupportedMemset(MSI, AA.Alloca, DL)) {
1031 AA.Vector.Worklist.push_back(Inst);
1032 continue;
1033 }
1034
1035 if (MemTransferInst *TransferInst = dyn_cast<MemTransferInst>(Inst)) {
1036 if (TransferInst->isVolatile())
1037 return RejectUser(Inst, "mem transfer inst is volatile");
1038
1039 ConstantInt *Len = dyn_cast<ConstantInt>(TransferInst->getLength());
1040 if (!Len || (Len->getZExtValue() % ElementSize))
1041 return RejectUser(Inst, "mem transfer inst length is non-constant or "
1042 "not a multiple of the vector element size");
1043
1044 auto getConstIndexIntoAlloca = [&](Value *Ptr) -> ConstantInt * {
1045 if (Ptr == AA.Alloca)
1046 return ConstantInt::get(Ptr->getContext(), APInt(32, 0));
1047
1049 const auto &GEPI = AA.Vector.GEPVectorIdx.find(GEP)->second;
1050 if (GEPI.VarIndex)
1051 return nullptr;
1052 if (GEPI.ConstIndex)
1053 return GEPI.ConstIndex;
1054 return ConstantInt::get(Ptr->getContext(), APInt(32, 0));
1055 };
1056
1057 MemTransferInfo *TI =
1058 &AA.Vector.TransferInfo.try_emplace(TransferInst).first->second;
1059 unsigned OpNum = U->getOperandNo();
1060 if (OpNum == 0) {
1061 Value *Dest = TransferInst->getDest();
1062 ConstantInt *Index = getConstIndexIntoAlloca(Dest);
1063 if (!Index)
1064 return RejectUser(Inst, "could not calculate constant dest index");
1065 TI->DestIndex = Index;
1066 } else {
1067 assert(OpNum == 1);
1068 Value *Src = TransferInst->getSource();
1069 ConstantInt *Index = getConstIndexIntoAlloca(Src);
1070 if (!Index)
1071 return RejectUser(Inst, "could not calculate constant src index");
1072 TI->SrcIndex = Index;
1073 }
1074 continue;
1075 }
1076
1077 if (auto *Intr = dyn_cast<IntrinsicInst>(Inst)) {
1078 if (Intr->getIntrinsicID() == Intrinsic::objectsize) {
1079 AA.Vector.Worklist.push_back(Inst);
1080 continue;
1081 }
1082 }
1083
1084 // Ignore assume-like intrinsics and comparisons used in assumes.
1085 if (isAssumeLikeIntrinsic(Inst)) {
1086 if (!Inst->use_empty())
1087 return RejectUser(Inst, "assume-like intrinsic cannot have any users");
1088 AA.Vector.UsersToRemove.push_back(Inst);
1089 continue;
1090 }
1091
1092 if (isa<ICmpInst>(Inst) && all_of(Inst->users(), [](User *U) {
1093 return isAssumeLikeIntrinsic(cast<Instruction>(U));
1094 })) {
1095 AA.Vector.UsersToRemove.push_back(Inst);
1096 continue;
1097 }
1098
1099 return RejectUser(Inst, "unhandled alloca user");
1100 }
1101
1102 // Follow-up check to ensure we've seen both sides of all transfer insts.
1103 for (const auto &Entry : AA.Vector.TransferInfo) {
1104 const MemTransferInfo &TI = Entry.second;
1105 if (!TI.SrcIndex || !TI.DestIndex)
1106 return RejectUser(Entry.first,
1107 "mem transfer inst between different objects");
1108 AA.Vector.Worklist.push_back(Entry.first);
1109 }
1110}
1111
1112void AMDGPUPromoteAllocaImpl::promoteAllocaToVector(AllocaAnalysis &AA) {
1113 LLVM_DEBUG(dbgs() << "Promoting to vectors: " << *AA.Alloca << '\n');
1114 LLVM_DEBUG(dbgs() << " type conversion: " << *AA.Alloca->getAllocatedType()
1115 << " -> " << *AA.Vector.Ty << '\n');
1116 const unsigned VecStoreSize = DL.getTypeStoreSize(AA.Vector.Ty);
1117
1118 Type *VecEltTy = AA.Vector.Ty->getElementType();
1119 const unsigned ElementSize = DL.getTypeSizeInBits(VecEltTy) / 8;
1120
1121 // Alloca is uninitialized memory. Imitate that by making the first value
1122 // undef.
1123 SSAUpdater Updater;
1124 Updater.Initialize(AA.Vector.Ty, "promotealloca");
1125
1126 BasicBlock *EntryBB = AA.Alloca->getParent();
1127 BasicBlock::iterator InitInsertPos =
1128 skipToNonAllocaInsertPt(*EntryBB, AA.Alloca->getIterator());
1129 IRBuilder<> Builder(&*InitInsertPos);
1130 Value *AllocaInitValue = Builder.CreateFreeze(PoisonValue::get(AA.Vector.Ty));
1131 AllocaInitValue->takeName(AA.Alloca);
1132
1133 Updater.AddAvailableValue(AA.Alloca->getParent(), AllocaInitValue);
1134
1135 // First handle the initial worklist, in basic block order.
1136 //
1137 // Insert a placeholder whenever we need the vector value at the top of a
1138 // basic block.
1140 forEachWorkListItem(AA.Vector.Worklist, [&](Instruction *I) {
1141 BasicBlock *BB = I->getParent();
1142 auto GetCurVal = [&]() -> Value * {
1143 if (Value *CurVal = Updater.FindValueForBlock(BB))
1144 return CurVal;
1145
1146 if (!Placeholders.empty() && Placeholders.back()->getParent() == BB)
1147 return Placeholders.back();
1148
1149 // If the current value in the basic block is not yet known, insert a
1150 // placeholder that we will replace later.
1151 IRBuilder<> Builder(I);
1152 auto *Placeholder = cast<Instruction>(Builder.CreateFreeze(
1153 PoisonValue::get(AA.Vector.Ty), "promotealloca.placeholder"));
1154 Placeholders.insert(Placeholder);
1155 return Placeholders.back();
1156 };
1157
1158 Value *Result = promoteAllocaUserToVector(I, DL, AA, VecStoreSize,
1159 ElementSize, GetCurVal);
1160 // If the returned result is a placeholder, it means the instruction does
1161 // not really modify the alloca. So no need to make it being available value
1162 // to SSAUpdater.
1163 // This will stop placeholder being cached in SSAUpdater. The cached
1164 // placeholder may cause stale pointer being referenced when doing
1165 // placeholder replacement.
1166 if (Result && (!isa<Instruction>(Result) ||
1167 !Placeholders.contains(cast<Instruction>(Result))))
1168 Updater.AddAvailableValue(BB, Result);
1169 });
1170
1171 // Now fixup the placeholders.
1172 for (Instruction *Placeholder : Placeholders) {
1173 Placeholder->replaceAllUsesWith(
1174 Updater.GetValueInMiddleOfBlock(Placeholder->getParent()));
1175 Placeholder->eraseFromParent();
1176 }
1177
1178 // Delete all instructions.
1179 for (Instruction *I : AA.Vector.Worklist) {
1180 assert(I->use_empty());
1181 I->eraseFromParent();
1182 }
1183
1184 // Delete all the users that are known to be removeable.
1185 for (Instruction *I : reverse(AA.Vector.UsersToRemove)) {
1186 I->dropDroppableUses();
1187 assert(I->use_empty());
1188 I->eraseFromParent();
1189 }
1190
1191 // Alloca should now be dead too.
1192 assert(AA.Alloca->use_empty());
1193 AA.Alloca->eraseFromParent();
1194}
1195
1196std::pair<Value *, Value *>
1197AMDGPUPromoteAllocaImpl::getLocalSizeYZ(IRBuilder<> &Builder) {
1198 Function &F = *Builder.GetInsertBlock()->getParent();
1200
1201 if (!IsAMDHSA) {
1202 CallInst *LocalSizeY = Builder.CreateIntrinsicWithoutFolding(
1203 Intrinsic::r600_read_local_size_y, {});
1204 CallInst *LocalSizeZ = Builder.CreateIntrinsicWithoutFolding(
1205 Intrinsic::r600_read_local_size_z, {});
1206
1207 ST.makeLIDRangeMetadata(LocalSizeY);
1208 ST.makeLIDRangeMetadata(LocalSizeZ);
1209
1210 return std::pair(LocalSizeY, LocalSizeZ);
1211 }
1212
1213 // We must read the size out of the dispatch pointer.
1214 assert(IsAMDGCN);
1215
1216 // We are indexing into this struct, and want to extract the workgroup_size_*
1217 // fields.
1218 //
1219 // typedef struct hsa_kernel_dispatch_packet_s {
1220 // uint16_t header;
1221 // uint16_t setup;
1222 // uint16_t workgroup_size_x ;
1223 // uint16_t workgroup_size_y;
1224 // uint16_t workgroup_size_z;
1225 // uint16_t reserved0;
1226 // uint32_t grid_size_x ;
1227 // uint32_t grid_size_y ;
1228 // uint32_t grid_size_z;
1229 //
1230 // uint32_t private_segment_size;
1231 // uint32_t group_segment_size;
1232 // uint64_t kernel_object;
1233 //
1234 // #ifdef HSA_LARGE_MODEL
1235 // void *kernarg_address;
1236 // #elif defined HSA_LITTLE_ENDIAN
1237 // void *kernarg_address;
1238 // uint32_t reserved1;
1239 // #else
1240 // uint32_t reserved1;
1241 // void *kernarg_address;
1242 // #endif
1243 // uint64_t reserved2;
1244 // hsa_signal_t completion_signal; // uint64_t wrapper
1245 // } hsa_kernel_dispatch_packet_t
1246 //
1247 CallInst *DispatchPtr =
1248 Builder.CreateIntrinsicWithoutFolding(Intrinsic::amdgcn_dispatch_ptr, {});
1249 DispatchPtr->addRetAttr(Attribute::NoAlias);
1250 DispatchPtr->addRetAttr(Attribute::NonNull);
1251 F.removeFnAttr("amdgpu-no-dispatch-ptr");
1252
1253 // Size of the dispatch packet struct.
1254 DispatchPtr->addDereferenceableRetAttr(64);
1255
1256 Type *I32Ty = Type::getInt32Ty(Mod.getContext());
1257
1258 // We could do a single 64-bit load here, but it's likely that the basic
1259 // 32-bit and extract sequence is already present, and it is probably easier
1260 // to CSE this. The loads should be mergeable later anyway.
1261 Value *GEPXY = Builder.CreateConstInBoundsGEP1_64(I32Ty, DispatchPtr, 1);
1262 LoadInst *LoadXY = Builder.CreateAlignedLoad(I32Ty, GEPXY, Align(4));
1263
1264 Value *GEPZU = Builder.CreateConstInBoundsGEP1_64(I32Ty, DispatchPtr, 2);
1265 LoadInst *LoadZU = Builder.CreateAlignedLoad(I32Ty, GEPZU, Align(4));
1266
1267 MDNode *MD = MDNode::get(Mod.getContext(), {});
1268 LoadXY->setMetadata(LLVMContext::MD_invariant_load, MD);
1269 LoadZU->setMetadata(LLVMContext::MD_invariant_load, MD);
1270 ST.makeLIDRangeMetadata(LoadZU);
1271
1272 // Extract y component. Upper half of LoadZU should be zero already.
1273 Value *Y = Builder.CreateLShr(LoadXY, 16);
1274
1275 return std::pair(Y, LoadZU);
1276}
1277
1278Value *AMDGPUPromoteAllocaImpl::getWorkitemID(IRBuilder<> &Builder,
1279 unsigned N) {
1280 Function *F = Builder.GetInsertBlock()->getParent();
1283 StringRef AttrName;
1284
1285 switch (N) {
1286 case 0:
1287 IntrID = IsAMDGCN ? (Intrinsic::ID)Intrinsic::amdgcn_workitem_id_x
1288 : (Intrinsic::ID)Intrinsic::r600_read_tidig_x;
1289 AttrName = "amdgpu-no-workitem-id-x";
1290 break;
1291 case 1:
1292 IntrID = IsAMDGCN ? (Intrinsic::ID)Intrinsic::amdgcn_workitem_id_y
1293 : (Intrinsic::ID)Intrinsic::r600_read_tidig_y;
1294 AttrName = "amdgpu-no-workitem-id-y";
1295 break;
1296
1297 case 2:
1298 IntrID = IsAMDGCN ? (Intrinsic::ID)Intrinsic::amdgcn_workitem_id_z
1299 : (Intrinsic::ID)Intrinsic::r600_read_tidig_z;
1300 AttrName = "amdgpu-no-workitem-id-z";
1301 break;
1302 default:
1303 llvm_unreachable("invalid dimension");
1304 }
1305
1306 Function *WorkitemIdFn = Intrinsic::getOrInsertDeclaration(&Mod, IntrID);
1307 CallInst *CI = Builder.CreateCall(WorkitemIdFn);
1308 ST.makeLIDRangeMetadata(CI);
1309 F->removeFnAttr(AttrName);
1310
1311 return CI;
1312}
1313
1314static bool isCallPromotable(CallInst *CI) {
1316 if (!II)
1317 return false;
1318
1319 switch (II->getIntrinsicID()) {
1320 case Intrinsic::memcpy:
1321 case Intrinsic::memmove:
1322 case Intrinsic::memset:
1323 case Intrinsic::lifetime_start:
1324 case Intrinsic::lifetime_end:
1325 case Intrinsic::invariant_start:
1326 case Intrinsic::invariant_end:
1327 case Intrinsic::launder_invariant_group:
1328 case Intrinsic::strip_invariant_group:
1329 case Intrinsic::objectsize:
1330 return true;
1331 default:
1332 return false;
1333 }
1334}
1335
1336bool AMDGPUPromoteAllocaImpl::binaryOpIsDerivedFromSameAlloca(
1337 Value *BaseAlloca, Value *Val, Instruction *Inst, int OpIdx0,
1338 int OpIdx1) const {
1339 // Figure out which operand is the one we might not be promoting.
1340 Value *OtherOp = Inst->getOperand(OpIdx0);
1341 if (Val == OtherOp)
1342 OtherOp = Inst->getOperand(OpIdx1);
1343
1345 return true;
1346
1347 // TODO: getUnderlyingObject will not work on a vector getelementptr
1348 Value *OtherObj = getUnderlyingObject(OtherOp);
1349 if (!isa<AllocaInst>(OtherObj))
1350 return false;
1351
1352 // TODO: We should be able to replace undefs with the right pointer type.
1353
1354 // TODO: If we know the other base object is another promotable
1355 // alloca, not necessarily this alloca, we can do this. The
1356 // important part is both must have the same address space at
1357 // the end.
1358 if (OtherObj != BaseAlloca) {
1359 LLVM_DEBUG(
1360 dbgs() << "Found a binary instruction with another alloca object\n");
1361 return false;
1362 }
1363
1364 return true;
1365}
1366
1367void AMDGPUPromoteAllocaImpl::analyzePromoteToLDS(AllocaAnalysis &AA) const {
1368 if (DisablePromoteAllocaToLDS) {
1369 LLVM_DEBUG(dbgs() << " Promote alloca to LDS is disabled\n");
1370 return;
1371 }
1372
1373 // Don't promote the alloca to LDS for shader calling conventions as the work
1374 // item ID intrinsics are not supported for these calling conventions.
1375 // Furthermore not all LDS is available for some of the stages.
1376 const Function &ContainingFunction = *AA.Alloca->getFunction();
1377 CallingConv::ID CC = ContainingFunction.getCallingConv();
1378
1379 switch (CC) {
1382 break;
1383 default:
1384 LLVM_DEBUG(
1385 dbgs()
1386 << " promote alloca to LDS not supported with calling convention.\n");
1387 return;
1388 }
1389
1390 for (Use *Use : AA.Uses) {
1391 auto *User = Use->getUser();
1392
1393 if (CallInst *CI = dyn_cast<CallInst>(User)) {
1394 if (!isCallPromotable(CI))
1395 return;
1396
1397 if (find(AA.LDS.Worklist, User) == AA.LDS.Worklist.end())
1398 AA.LDS.Worklist.push_back(User);
1399 continue;
1400 }
1401
1403 if (UseInst->getOpcode() == Instruction::PtrToInt)
1404 return;
1405
1406 if (LoadInst *LI = dyn_cast<LoadInst>(UseInst)) {
1407 if (LI->isVolatile())
1408 return;
1409 continue;
1410 }
1411
1412 if (StoreInst *SI = dyn_cast<StoreInst>(UseInst)) {
1413 if (SI->isVolatile())
1414 return;
1415 continue;
1416 }
1417
1418 if (AtomicRMWInst *RMW = dyn_cast<AtomicRMWInst>(UseInst)) {
1419 if (RMW->isVolatile())
1420 return;
1421 continue;
1422 }
1423
1424 if (AtomicCmpXchgInst *CAS = dyn_cast<AtomicCmpXchgInst>(UseInst)) {
1425 if (CAS->isVolatile())
1426 return;
1427 continue;
1428 }
1429
1430 // Only promote a select if we know that the other select operand
1431 // is from another pointer that will also be promoted.
1432 if (ICmpInst *ICmp = dyn_cast<ICmpInst>(UseInst)) {
1433 if (!binaryOpIsDerivedFromSameAlloca(AA.Alloca, Use->get(), ICmp, 0, 1))
1434 return;
1435
1436 // May need to rewrite constant operands.
1437 if (find(AA.LDS.Worklist, User) == AA.LDS.Worklist.end())
1438 AA.LDS.Worklist.push_back(ICmp);
1439 continue;
1440 }
1441
1443 // Be conservative if an address could be computed outside the bounds of
1444 // the alloca.
1445 if (!GEP->isInBounds())
1446 return;
1448 // Do not promote vector/aggregate type instructions. It is hard to track
1449 // their users.
1450
1451 // Do not promote addrspacecast.
1452 //
1453 // TODO: If we know the address is only observed through flat pointers, we
1454 // could still promote.
1455 return;
1456 }
1457
1458 if (find(AA.LDS.Worklist, User) == AA.LDS.Worklist.end())
1459 AA.LDS.Worklist.push_back(User);
1460 }
1461
1462 AA.LDS.Enable = true;
1463}
1464
1465bool AMDGPUPromoteAllocaImpl::hasSufficientLocalMem(const Function &F) {
1466
1467 FunctionType *FTy = F.getFunctionType();
1469
1470 // If the function has any arguments in the local address space, then it's
1471 // possible these arguments require the entire local memory space, so
1472 // we cannot use local memory in the pass.
1473 for (Type *ParamTy : FTy->params()) {
1474 PointerType *PtrTy = dyn_cast<PointerType>(ParamTy);
1475 if (PtrTy && PtrTy->getAddressSpace() == AMDGPUAS::LOCAL_ADDRESS) {
1476 LocalMemLimit = 0;
1477 LLVM_DEBUG(dbgs() << "Function has local memory argument. Promoting to "
1478 "local memory disabled.\n");
1479 return false;
1480 }
1481 }
1482
1483 LocalMemLimit = ST.getAddressableLocalMemorySize();
1484 if (LocalMemLimit == 0)
1485 return false;
1486
1488 SmallPtrSet<const Constant *, 8> VisitedConstants;
1490
1491 auto visitUsers = [&](const GlobalVariable *GV, const Constant *Val) -> bool {
1492 for (const User *U : Val->users()) {
1493 if (const Instruction *Use = dyn_cast<Instruction>(U)) {
1494 if (Use->getFunction() == &F)
1495 return true;
1496 } else {
1497 const Constant *C = cast<Constant>(U);
1498 if (VisitedConstants.insert(C).second)
1499 Stack.push_back(C);
1500 }
1501 }
1502
1503 return false;
1504 };
1505
1506 for (GlobalVariable &GV : Mod.globals()) {
1508 continue;
1509
1510 if (visitUsers(&GV, &GV)) {
1511 UsedLDS.insert(&GV);
1512 Stack.clear();
1513 continue;
1514 }
1515
1516 // For any ConstantExpr uses, we need to recursively search the users until
1517 // we see a function.
1518 while (!Stack.empty()) {
1519 const Constant *C = Stack.pop_back_val();
1520 if (visitUsers(&GV, C)) {
1521 UsedLDS.insert(&GV);
1522 Stack.clear();
1523 break;
1524 }
1525 }
1526 }
1527
1528 SmallVector<std::pair<uint64_t, Align>, 16> AllocatedSizes;
1529 AllocatedSizes.reserve(UsedLDS.size());
1530
1531 for (const GlobalVariable *GV : UsedLDS) {
1532 Align Alignment =
1533 DL.getValueOrABITypeAlignment(GV->getAlign(), GV->getValueType());
1534 uint64_t AllocSize = GV->getGlobalSize(DL);
1535
1536 // HIP uses an extern unsized array in local address space for dynamically
1537 // allocated shared memory. In that case, we have to disable the promotion.
1538 if (GV->hasExternalLinkage() && AllocSize == 0) {
1539 LocalMemLimit = 0;
1540 LLVM_DEBUG(dbgs() << "Function has a reference to externally allocated "
1541 "local memory. Promoting to local memory "
1542 "disabled.\n");
1543 return false;
1544 }
1545
1546 AllocatedSizes.emplace_back(AllocSize, Alignment);
1547 }
1548
1549 // Sort to try to estimate the worst case alignment padding
1550 //
1551 // FIXME: We should really do something to fix the addresses to a more optimal
1552 // value instead
1553 llvm::sort(AllocatedSizes, llvm::less_second());
1554
1555 // Check how much local memory is being used by global objects
1556 CurrentLocalMemUsage = 0;
1557
1558 // FIXME: Try to account for padding here. The real padding and address is
1559 // currently determined from the inverse order of uses in the function when
1560 // legalizing, which could also potentially change. We try to estimate the
1561 // worst case here, but we probably should fix the addresses earlier.
1562 for (auto Alloc : AllocatedSizes) {
1563 CurrentLocalMemUsage = alignTo(CurrentLocalMemUsage, Alloc.second);
1564 CurrentLocalMemUsage += Alloc.first;
1565 }
1566
1567 unsigned MaxOccupancy =
1568 ST.getWavesPerEU(ST.getFlatWorkGroupSizes(F), CurrentLocalMemUsage, F)
1569 .second;
1570
1571 // Round up to the next tier of usage.
1572 unsigned MaxSizeWithWaveCount =
1573 ST.getMaxLocalMemSizeWithWaveCount(MaxOccupancy, F);
1574
1575 // Program may already use more LDS than is usable at maximum occupancy.
1576 if (CurrentLocalMemUsage > MaxSizeWithWaveCount)
1577 return false;
1578
1579 LocalMemLimit = MaxSizeWithWaveCount;
1580
1581 LLVM_DEBUG(dbgs() << F.getName() << " uses " << CurrentLocalMemUsage
1582 << " bytes of LDS\n"
1583 << " Rounding size to " << MaxSizeWithWaveCount
1584 << " with a maximum occupancy of " << MaxOccupancy << '\n'
1585 << " and " << (LocalMemLimit - CurrentLocalMemUsage)
1586 << " available for promotion\n");
1587
1588 return true;
1589}
1590
1591// FIXME: Should try to pick the most likely to be profitable allocas first.
1592bool AMDGPUPromoteAllocaImpl::tryPromoteAllocaToLDS(
1593 AllocaAnalysis &AA, bool SufficientLDS,
1594 SetVector<IntrinsicInst *> &DeferredIntrs) {
1595 LLVM_DEBUG(dbgs() << "Trying to promote to LDS: " << *AA.Alloca << '\n');
1596
1597 // Not likely to have sufficient local memory for promotion.
1598 if (!SufficientLDS)
1599 return false;
1600
1601 IRBuilder<> Builder(AA.Alloca);
1602
1603 const Function &ContainingFunction = *AA.Alloca->getParent()->getParent();
1604 const AMDGPUSubtarget &ST = AMDGPUSubtarget::get(TM, ContainingFunction);
1605 unsigned WorkGroupSize = ST.getFlatWorkGroupSizes(ContainingFunction).second;
1606
1607 Align Alignment = AA.Alloca->getAlign();
1608
1609 // FIXME: This computed padding is likely wrong since it depends on inverse
1610 // usage order.
1611 //
1612 // FIXME: It is also possible that if we're allowed to use all of the memory
1613 // could end up using more than the maximum due to alignment padding.
1614
1615 uint32_t NewSize = alignTo(CurrentLocalMemUsage, Alignment);
1616 std::optional<TypeSize> ElemSize = AA.Alloca->getAllocationSize(DL);
1617 if (!ElemSize || ElemSize->isScalable())
1618 return false;
1619 TypeSize AllocSize = WorkGroupSize * *ElemSize;
1620 NewSize += AllocSize.getFixedValue();
1621
1622 if (NewSize > LocalMemLimit) {
1623 LLVM_DEBUG(dbgs() << " " << AllocSize
1624 << " bytes of local memory not available to promote\n");
1625 return false;
1626 }
1627
1628 CurrentLocalMemUsage = NewSize;
1629
1630 LLVM_DEBUG(dbgs() << "Promoting alloca to local memory\n");
1631
1632 Function *F = AA.Alloca->getFunction();
1633
1634 Type *GVTy = ArrayType::get(AA.Alloca->getAllocatedType(), WorkGroupSize);
1637 Twine(F->getName()) + Twine('.') + AA.Alloca->getName(), nullptr,
1640 GV->setAlignment(AA.Alloca->getAlign());
1641
1642 Value *TCntY, *TCntZ;
1643
1644 std::tie(TCntY, TCntZ) = getLocalSizeYZ(Builder);
1645 Value *TIdX = getWorkitemID(Builder, 0);
1646 Value *TIdY = getWorkitemID(Builder, 1);
1647 Value *TIdZ = getWorkitemID(Builder, 2);
1648
1649 Value *Tmp0 = Builder.CreateMul(TCntY, TCntZ, "", true, true);
1650 Tmp0 = Builder.CreateMul(Tmp0, TIdX);
1651 Value *Tmp1 = Builder.CreateMul(TIdY, TCntZ, "", true, true);
1652 Value *TID = Builder.CreateAdd(Tmp0, Tmp1);
1653 TID = Builder.CreateAdd(TID, TIdZ);
1654
1655 LLVMContext &Context = Mod.getContext();
1657
1658 Value *Offset = Builder.CreateInBoundsGEP(GVTy, GV, Indices);
1659 AA.Alloca->mutateType(Offset->getType());
1660 AA.Alloca->replaceAllUsesWith(Offset);
1661 AA.Alloca->eraseFromParent();
1662
1664
1665 for (Value *V : AA.LDS.Worklist) {
1667 if (!Call) {
1668 if (ICmpInst *CI = dyn_cast<ICmpInst>(V)) {
1669 Value *LHS = CI->getOperand(0);
1670 Value *RHS = CI->getOperand(1);
1671
1672 Type *NewTy = LHS->getType()->getWithNewType(NewPtrTy);
1674 CI->setOperand(0, Constant::getNullValue(NewTy));
1675
1677 CI->setOperand(1, Constant::getNullValue(NewTy));
1678
1679 continue;
1680 }
1681
1682 // The operand's value should be corrected on its own and we don't want to
1683 // touch the users.
1685 continue;
1686
1687 assert(V->getType()->isPtrOrPtrVectorTy());
1688
1689 Type *NewTy = V->getType()->getWithNewType(NewPtrTy);
1690 V->mutateType(NewTy);
1691
1692 // Adjust the types of any constant operands.
1695 SI->setOperand(1, Constant::getNullValue(NewTy));
1696
1698 SI->setOperand(2, Constant::getNullValue(NewTy));
1699 } else if (PHINode *Phi = dyn_cast<PHINode>(V)) {
1700 for (unsigned I = 0, E = Phi->getNumIncomingValues(); I != E; ++I) {
1702 Phi->getIncomingValue(I)))
1703 Phi->setIncomingValue(I, Constant::getNullValue(NewTy));
1704 }
1705 }
1706
1707 continue;
1708 }
1709
1711 Builder.SetInsertPoint(Intr);
1712 switch (Intr->getIntrinsicID()) {
1713 case Intrinsic::lifetime_start:
1714 case Intrinsic::lifetime_end:
1715 // These intrinsics are for address space 0 only
1716 Intr->eraseFromParent();
1717 continue;
1718 case Intrinsic::memcpy:
1719 case Intrinsic::memmove:
1720 // These have 2 pointer operands. In case if second pointer also needs
1721 // to be replaced we defer processing of these intrinsics until all
1722 // other values are processed.
1723 DeferredIntrs.insert(Intr);
1724 continue;
1725 case Intrinsic::memset: {
1726 MemSetInst *MemSet = cast<MemSetInst>(Intr);
1727 Builder.CreateMemSet(MemSet->getRawDest(), MemSet->getValue(),
1728 MemSet->getLength(), MemSet->getDestAlign(),
1729 MemSet->isVolatile());
1730 Intr->eraseFromParent();
1731 continue;
1732 }
1733 case Intrinsic::invariant_start:
1734 case Intrinsic::invariant_end:
1735 case Intrinsic::launder_invariant_group:
1736 case Intrinsic::strip_invariant_group: {
1738 if (Intr->getIntrinsicID() == Intrinsic::invariant_start) {
1739 Args.emplace_back(Intr->getArgOperand(0));
1740 } else if (Intr->getIntrinsicID() == Intrinsic::invariant_end) {
1741 Args.emplace_back(Intr->getArgOperand(0));
1742 Args.emplace_back(Intr->getArgOperand(1));
1743 }
1744 Args.emplace_back(Offset);
1746 Intr->getModule(), Intr->getIntrinsicID(), Offset->getType());
1747 CallInst *NewIntr =
1748 CallInst::Create(F, Args, Intr->getName(), Intr->getIterator());
1749 Intr->mutateType(NewIntr->getType());
1750 Intr->replaceAllUsesWith(NewIntr);
1751 Intr->eraseFromParent();
1752 continue;
1753 }
1754 case Intrinsic::objectsize: {
1755 Value *Src = Intr->getOperand(0);
1756
1757 Value *NewCall = Builder.CreateIntrinsic(
1758 Intrinsic::objectsize,
1760 {Src, Intr->getOperand(1), Intr->getOperand(2), Intr->getOperand(3)});
1761 Intr->replaceAllUsesWith(NewCall);
1762 Intr->eraseFromParent();
1763 continue;
1764 }
1765 default:
1766 Intr->print(errs());
1767 llvm_unreachable("Don't know how to promote alloca intrinsic use.");
1768 }
1769 }
1770
1771 return true;
1772}
1773
1774void AMDGPUPromoteAllocaImpl::finishDeferredAllocaToLDSPromotion(
1775 SetVector<IntrinsicInst *> &DeferredIntrs) {
1776
1777 for (IntrinsicInst *Intr : DeferredIntrs) {
1778 IRBuilder<> Builder(Intr);
1779 Builder.SetInsertPoint(Intr);
1781 assert(ID == Intrinsic::memcpy || ID == Intrinsic::memmove);
1782
1784 auto *B = Builder.CreateMemTransferInst(
1785 ID, MI->getRawDest(), MI->getDestAlign(), MI->getRawSource(),
1786 MI->getSourceAlign(), MI->getLength(), MI->isVolatile());
1787
1788 for (unsigned I = 0; I != 2; ++I) {
1789 if (uint64_t Bytes = Intr->getParamDereferenceableBytes(I)) {
1790 B->addDereferenceableParamAttr(I, Bytes);
1791 }
1792 }
1793
1794 Intr->eraseFromParent();
1795 }
1796}
assert(UImm &&(UImm !=~static_cast< T >(0)) &&"Invalid immediate!")
static Value * promoteAllocaUserToVector(Instruction *Inst, const DataLayout &DL, AllocaAnalysis &AA, unsigned VecStoreSize, unsigned ElementSize, function_ref< Value *()> GetCurVal)
Promotes a single user of the alloca to a vector form.
AMDGPU promote alloca to vector or LDS
static bool isSupportedAccessType(FixedVectorType *VecTy, Type *AccessTy, const DataLayout &DL)
static void forEachWorkListItem(const InstContainer &WorkList, std::function< void(Instruction *)> Fn)
Iterates over an instruction worklist that may contain multiple instructions from the same basic bloc...
static std::optional< GEPToVectorIndex > computeGEPToVectorIndex(GetElementPtrInst *GEP, AllocaInst *Alloca, Type *VecElemTy, const DataLayout &DL)
static bool isSupportedMemset(MemSetInst *I, AllocaInst *AI, const DataLayout &DL)
static BasicBlock::iterator skipToNonAllocaInsertPt(BasicBlock &BB, BasicBlock::iterator I)
Find an insert point after an alloca, after all other allocas clustered at the start of the block.
static bool isCallPromotable(CallInst *CI)
static Value * calculateVectorIndex(Value *Ptr, AllocaAnalysis &AA)
MachineBasicBlock MachineBasicBlock::iterator DebugLoc DL
static GCRegistry::Add< ErlangGC > A("erlang", "erlang-compatible garbage collector")
static GCRegistry::Add< CoreCLRGC > E("coreclr", "CoreCLR-compatible GC")
static GCRegistry::Add< OcamlGC > B("ocaml", "ocaml 3.10-compatible GC")
@ Enable
static bool runOnFunction(Function &F, bool PostInlining)
AMD GCN specific subclass of TargetSubtarget.
#define DEBUG_TYPE
Hexagon Common GEP
IRTranslator LLVM IR MI
#define F(x, y, z)
Definition MD5.cpp:54
#define I(x, y, z)
Definition MD5.cpp:57
uint64_t IntrinsicInst * II
if(auto Err=PB.parsePassPipeline(MPM, Passes)) return wrap(std MPM run * Mod
#define INITIALIZE_PASS_DEPENDENCY(depName)
Definition PassSupport.h:42
#define INITIALIZE_PASS_END(passName, arg, name, cfg, analysis)
Definition PassSupport.h:44
#define INITIALIZE_PASS_BEGIN(passName, arg, name, cfg, analysis)
Definition PassSupport.h:39
Remove Loads Into Fake Uses
static unsigned getNumElements(Type *Ty)
This file contains some templates that are useful if you are working with the STL at all.
#define LLVM_DEBUG(...)
Definition Debug.h:119
static TableGen::Emitter::Opt Y("gen-skeleton-entry", EmitSkeleton, "Generate example skeleton entry")
Target-Independent Code Generator Pass Configuration Options pass.
Value * RHS
Value * LHS
static const AMDGPUSubtarget & get(const MachineFunction &MF)
Class for arbitrary precision integers.
Definition APInt.h:78
bool isZero() const
Determine if this value is zero, i.e. all bits are clear.
Definition APInt.h:381
LLVM_ABI APInt sdiv(const APInt &RHS) const
Signed division function for APInt.
Definition APInt.cpp:1670
LLVM_ABI APInt sextOrTrunc(unsigned width) const
Sign extend or truncate to width.
Definition APInt.cpp:1084
LLVM_ABI APInt srem(const APInt &RHS) const
Function for signed remainder operation.
Definition APInt.cpp:1771
an instruction to allocate memory on the stack
Type * getAllocatedType() const
Return the type that is being allocated by the instruction.
PassT::Result & getResult(IRUnitT &IR, ExtraArgTs... ExtraArgs)
Get the result of an analysis pass for a given IR unit.
Represent the analysis usage information of a pass.
AnalysisUsage & addRequired()
LLVM_ABI void setPreservesCFG()
This function should be called by the pass, iff they do not:
Definition Pass.cpp:275
static LLVM_ABI ArrayType * get(Type *ElementType, uint64_t NumElements)
This static method is the primary way to construct an ArrayType.
An instruction that atomically checks whether a specified value is in a memory location,...
an instruction that atomically reads a memory location, combines it with another value,...
LLVM Basic Block Representation.
Definition BasicBlock.h:62
iterator end()
Definition BasicBlock.h:474
const Function * getParent() const
Return the enclosing method, or null if none.
Definition BasicBlock.h:213
InstListType::iterator iterator
Instruction iterators...
Definition BasicBlock.h:170
Represents analyses that only rely on functions' control flow.
Definition Analysis.h:73
uint64_t getParamDereferenceableBytes(unsigned i) const
Extract the number of dereferenceable bytes for a call or parameter (0=unknown).
void addDereferenceableRetAttr(uint64_t Bytes)
adds the dereferenceable attribute to the list of attributes.
void addRetAttr(Attribute::AttrKind Kind)
Adds the attribute to the return value.
Value * getArgOperand(unsigned i) const
This class represents a function call, abstracting a target machine's calling convention.
static CallInst * Create(FunctionType *Ty, Value *F, const Twine &NameStr="", InsertPosition InsertBefore=nullptr)
static LLVM_ABI bool isBitOrNoopPointerCastable(Type *SrcTy, Type *DestTy, const DataLayout &DL)
Check whether a bitcast, inttoptr, or ptrtoint cast between these types is valid and a no-op.
This is the shared class of boolean and integer constants.
Definition Constants.h:87
uint64_t getZExtValue() const
Return the constant as a 64-bit unsigned integer value after it has been zero extended as appropriate...
Definition Constants.h:168
This is an important base class in LLVM.
Definition Constant.h:43
static LLVM_ABI Constant * getNullValue(Type *Ty)
Constructor to create a '0' constant of arbitrary type.
A parsed version of the target data layout string in and methods for querying it.
Definition DataLayout.h:64
std::pair< iterator, bool > insert(const std::pair< KeyT, ValueT > &KV)
Definition DenseMap.h:286
Implements a dense probed hash-table based set.
Definition DenseSet.h:289
Class to represent fixed width SIMD vectors.
unsigned getNumElements() const
static LLVM_ABI FixedVectorType * get(Type *ElementType, unsigned NumElts)
Definition Type.cpp:867
FunctionPass class - This class is used to implement most global optimizations.
Definition Pass.h:314
Class to represent function types.
CallingConv::ID getCallingConv() const
getCallingConv()/setCallingConv(CC) - These method get and set the calling convention of this functio...
Definition Function.h:272
an instruction for type-safe pointer arithmetic to access elements of arrays and structs
bool hasExternalLinkage() const
void setUnnamedAddr(UnnamedAddr Val)
unsigned getAddressSpace() const
@ InternalLinkage
Rename collisions when linking (static functions).
Definition GlobalValue.h:60
Type * getValueType() const
MaybeAlign getAlign() const
Returns the alignment of the given variable.
LLVM_ABI uint64_t getGlobalSize(const DataLayout &DL) const
Get the size of this global variable in bytes.
Definition Globals.cpp:578
void setAlignment(Align Align)
Sets the alignment attribute of the GlobalVariable.
This instruction compares its operands according to the predicate given to the constructor.
LLVM_ABI CallInst * CreateIntrinsicWithoutFolding(Intrinsic::ID ID, ArrayRef< Type * > OverloadTypes, ArrayRef< Value * > Args, FMFSource FMFSource={}, const Twine &Name="", ArrayRef< OperandBundleDef > OpBundles={})
Create a call to intrinsic ID with Args, mangled using OverloadTypes.
LoadInst * CreateAlignedLoad(Type *Ty, Value *Ptr, MaybeAlign Align, const char *Name)
Definition IRBuilder.h:1923
Value * CreateLShr(Value *LHS, Value *RHS, const Twine &Name="", bool isExact=false)
Definition IRBuilder.h:1532
BasicBlock * GetInsertBlock() const
Definition IRBuilder.h:175
Value * CreateInBoundsGEP(Type *Ty, Value *Ptr, ArrayRef< Value * > IdxList, const Twine &Name="")
Definition IRBuilder.h:2008
CallInst * CreateMemSet(Value *Ptr, Value *Val, uint64_t Size, MaybeAlign Align, bool isVolatile=false, const AAMDNodes &AAInfo=AAMDNodes())
Create and insert a memset to the specified pointer and the specified value.
Definition IRBuilder.h:608
LLVM_ABI Value * CreateIntrinsic(Intrinsic::ID ID, ArrayRef< Type * > OverloadTypes, ArrayRef< Value * > Args, FMFSource FMFSource={}, const Twine &Name="", ArrayRef< OperandBundleDef > OpBundles={}, function_ref< void(CallInst *)> SetFn=[](CallInst *) {})
Variant to create a possibly constant-folded intrinsic.
Value * CreateAdd(Value *LHS, Value *RHS, const Twine &Name="", bool HasNUW=false, bool HasNSW=false)
Definition IRBuilder.h:1422
CallInst * CreateCall(FunctionType *FTy, Value *Callee, ArrayRef< Value * > Args={}, const Twine &Name="", MDNode *FPMathTag=nullptr)
Definition IRBuilder.h:2543
Value * CreateConstInBoundsGEP1_64(Type *Ty, Value *Ptr, uint64_t Idx0, const Twine &Name="")
Definition IRBuilder.h:2050
void SetInsertPoint(BasicBlock *TheBB)
This specifies that created instructions should be appended to the end of the specified block.
Definition IRBuilder.h:181
LLVM_ABI CallInst * CreateMemTransferInst(Intrinsic::ID IntrID, Value *Dst, MaybeAlign DstAlign, Value *Src, MaybeAlign SrcAlign, Value *Size, bool isVolatile=false, const AAMDNodes &AAInfo=AAMDNodes())
Value * CreateMul(Value *LHS, Value *RHS, const Twine &Name="", bool HasNUW=false, bool HasNSW=false)
Definition IRBuilder.h:1456
This provides a uniform API for creating instructions and inserting them into a basic block: either a...
Definition IRBuilder.h:2848
InstSimplifyFolder - Use InstructionSimplify to fold operations to existing values.
LLVM_ABI const Module * getModule() const
Return the module owning the function this instruction belongs to or nullptr it the function does not...
LLVM_ABI InstListType::iterator eraseFromParent()
This method unlinks 'this' from the containing basic block and deletes it.
LLVM_ABI void setMetadata(unsigned KindID, MDNode *Node)
Set the metadata of the specified kind to the specified node.
unsigned getOpcode() const
Returns a member of one of the enums like Instruction::Add.
Class to represent integer types.
A wrapper class for inspecting calls to intrinsic functions.
Intrinsic::ID getIntrinsicID() const
Return the intrinsic ID of this intrinsic.
This is an important class for using LLVM in a threaded context.
Definition LLVMContext.h:68
An instruction for reading from memory.
Analysis pass that exposes the LoopInfo for a function.
Definition LoopInfo.h:587
The legacy pass manager's analysis pass to compute loop information.
Definition LoopInfo.h:612
Metadata node.
Definition Metadata.h:1069
static MDTuple * get(LLVMContext &Context, ArrayRef< Metadata * > MDs)
Definition Metadata.h:1554
This class implements a map that also provides access to all stored values in a deterministic order.
Definition MapVector.h:38
bool empty() const
Definition MapVector.h:79
size_type size() const
Definition MapVector.h:58
std::pair< KeyT, ValueT > & front()
Definition MapVector.h:81
Value * getLength() const
Value * getRawDest() const
MaybeAlign getDestAlign() const
bool isVolatile() const
Value * getValue() const
This class wraps the llvm.memset and llvm.memset.inline intrinsics.
This class wraps the llvm.memcpy/memmove intrinsics.
A Module instance is used to store all the information related to an LLVM module.
Definition Module.h:67
virtual void getAnalysisUsage(AnalysisUsage &) const
getAnalysisUsage - This function should be overriden by passes that need analysis information to do t...
Definition Pass.cpp:112
Class to represent pointers.
static LLVM_ABI PointerType * get(Type *ElementType, unsigned AddressSpace)
This constructs a pointer to an object of the specified type in a numbered address space.
static LLVM_ABI PoisonValue * get(Type *T)
Static factory methods - Return an 'poison' object of the specified type.
A set of analyses that are preserved following a run of a transformation pass.
Definition Analysis.h:112
static PreservedAnalyses all()
Construct a special preserved set that preserves all passes.
Definition Analysis.h:118
PreservedAnalyses & preserveSet()
Mark an analysis set as preserved.
Definition Analysis.h:151
Helper class for SSA formation on a set of values defined in multiple blocks.
Definition SSAUpdater.h:39
LLVM_ABI void Initialize(Type *Ty, StringRef Name)
Reset this object to get ready for a new set of SSA updates with type 'Ty'.
LLVM_ABI Value * GetValueInMiddleOfBlock(BasicBlock *BB)
Construct SSA form, materializing a value that is live in the middle of the specified block.
LLVM_ABI void AddAvailableValue(BasicBlock *BB, Value *V)
Indicate that a rewritten value is available in the specified block with the specified value.
This class represents the LLVM 'select' instruction.
A vector that has set insertion semantics.
Definition SetVector.h:57
bool contains(const_arg_type key) const
Check if the SetVector contains the given key.
Definition SetVector.h:252
bool insert(const value_type &X)
Insert a new element into the SetVector.
Definition SetVector.h:151
size_type size() const
Definition SmallPtrSet.h:99
std::pair< iterator, bool > insert(PtrType Ptr)
Inserts Ptr if and only if there is no element in the container equal to Ptr.
SmallPtrSet - This class implements a set which is optimized for holding SmallSize or less elements.
A SetVector that performs no allocations if smaller than a certain size.
Definition SetVector.h:339
reference emplace_back(ArgTypes &&... Args)
void reserve(size_type N)
This is a 'vector' (really, a variable-sized array), optimized for the case when the array is small.
An instruction for storing to memory.
static unsigned getPointerOperandIndex()
Represent a constant reference to a string, i.e.
Definition StringRef.h:56
Primary interface to the complete machine description for the target machine.
const STC & getSubtarget(const Function &F) const
This method returns a pointer to the specified type of TargetSubtargetInfo.
Triple - Helper class for working with autoconf configuration names.
Definition Triple.h:47
Twine - A lightweight data structure for efficiently representing the concatenation of temporary valu...
Definition Twine.h:82
The instances of the Type class are immutable: once they are created, they are never changed.
Definition Type.h:46
bool isArrayTy() const
True if this is an instance of ArrayType.
Definition Type.h:279
static LLVM_ABI IntegerType * getInt32Ty(LLVMContext &C)
Definition Type.cpp:309
bool isPointerTy() const
True if this is an instance of PointerType.
Definition Type.h:282
bool isAggregateType() const
Return true if the type is an aggregate type.
Definition Type.h:319
LLVM_ABI Type * getWithNewType(Type *EltTy) const
Given vector type, change the element type, whilst keeping the old number of elements.
static LLVM_ABI IntegerType * getIntNTy(LLVMContext &C, unsigned N)
Definition Type.cpp:313
A Use represents the edge between a Value definition and its users.
Definition Use.h:35
void setOperand(unsigned i, Value *Val)
Definition User.h:212
Value * getOperand(unsigned i) const
Definition User.h:207
LLVM Value Representation.
Definition Value.h:75
Type * getType() const
All values are typed, get the type of this value.
Definition Value.h:255
LLVM_ABI void print(raw_ostream &O, bool IsForDebug=false) const
Implement operator<< on Value.
LLVM_ABI void replaceAllUsesWith(Value *V)
Change all uses of this to point to a new Value.
Definition Value.cpp:553
LLVMContext & getContext() const
All values hold a context through their type.
Definition Value.h:258
iterator_range< user_iterator > users()
Definition Value.h:426
LLVM_ABI const Value * stripPointerCasts() const
Strip off pointer casts, all-zero GEPs and address space casts.
Definition Value.cpp:713
bool use_empty() const
Definition Value.h:346
void mutateType(Type *Ty)
Mutate the type of this Value to be of the specified type.
Definition Value.h:807
LLVM_ABI StringRef getName() const
Return a constant reference to the value's name.
Definition Value.cpp:319
LLVM_ABI void takeName(Value *V)
Transfer the name from V to this value.
Definition Value.cpp:400
static LLVM_ABI bool isValidElementType(Type *ElemTy)
Return true if the specified type is valid as a element type.
Type * getElementType() const
Value handle that is nullable, but tries to track the Value.
constexpr bool isKnownMultipleOf(ScalarTy RHS) const
This function tells the caller whether the element count is known at compile time to be a multiple of...
Definition TypeSize.h:180
constexpr ScalarTy getFixedValue() const
Definition TypeSize.h:200
An efficient, type-erasing, non-owning reference to a callable.
const ParentTy * getParent() const
Definition ilist_node.h:34
self_iterator getIterator()
Definition ilist_node.h:123
CallInst * Call
Changed
#define llvm_unreachable(msg)
Marks that the current location is not supposed to be reachable.
Abstract Attribute helper functions.
Definition Attributor.h:165
@ LOCAL_ADDRESS
Address space for local memory.
constexpr char Args[]
Key for Kernel::Metadata::mArgs.
LLVM_READNONE constexpr bool isEntryFunctionCC(CallingConv::ID CC)
unsigned getDynamicVGPRBlockSize(const Function &F)
@ Entry
Definition COFF.h:862
unsigned ID
LLVM IR allows to use arbitrary numbers as calling convention identifiers.
Definition CallingConv.h:24
@ AMDGPU_KERNEL
Used for AMDGPU code object kernels.
@ SPIR_KERNEL
Used for SPIR kernel functions.
@ C
The default llvm calling convention, compatible with C.
Definition CallingConv.h:34
This namespace contains an enum with a value for every intrinsic/builtin function known by LLVM.
LLVM_ABI Function * getOrInsertDeclaration(Module *M, ID id, ArrayRef< Type * > OverloadTys={})
Look up the Function declaration of the intrinsic id in the Module M.
specific_intval< false > m_SpecificInt(const APInt &V)
Match a specific integer value or vector with all elements equal to the value.
bool match(Val *V, const Pattern &P)
initializer< Ty > init(const Ty &Val)
NodeAddr< PhiNode * > Phi
Definition RDFGraph.h:390
This is an optimization pass for GlobalISel generic memory operations.
@ Offset
Definition DWP.cpp:573
@ Length
Definition DWP.cpp:573
void stable_sort(R &&Range)
Definition STLExtras.h:2116
auto find(R &&Range, const T &Val)
Provide wrappers to std::find which take ranges instead of having to pass begin/end explicitly.
Definition STLExtras.h:1765
bool all_of(R &&range, UnaryPredicate P)
Provide wrappers to std::all_of which take ranges instead of having to pass begin/end explicitly.
Definition STLExtras.h:1739
LLVM_ABI bool isAssumeLikeIntrinsic(const Instruction *I)
Return true if it is an intrinsic that cannot be speculated but also cannot trap.
decltype(auto) dyn_cast(const From &Val)
dyn_cast<X> - Return the argument parameter cast to the specified type.
Definition Casting.h:643
const Value * getLoadStorePointerOperand(const Value *V)
A helper function that returns the pointer operand of a load or store instruction.
constexpr bool isPowerOf2_64(uint64_t Value)
Return true if the argument is a power of two > 0 (64 bit edition.)
Definition MathExtras.h:284
unsigned Log2_64(uint64_t Value)
Return the floor log base 2 of the specified value, -1 if the value is zero.
Definition MathExtras.h:337
const Value * getPointerOperand(const Value *V)
A helper function that returns the pointer operand of a load, store or GEP instruction.
unsigned Log2_32(uint32_t Value)
Return the floor log base 2 of the specified value, -1 if the value is zero.
Definition MathExtras.h:331
auto reverse(ContainerTy &&C)
Definition STLExtras.h:407
constexpr bool isPowerOf2_32(uint32_t Value)
Return true if the argument is a power of two > 0.
Definition MathExtras.h:279
void sort(IteratorTy Start, IteratorTy End)
Definition STLExtras.h:1636
LLVM_ABI void computeKnownBits(const Value *V, KnownBits &Known, const DataLayout &DL, AssumptionCache *AC=nullptr, const Instruction *CxtI=nullptr, const DominatorTree *DT=nullptr, bool UseInstrInfo=true, unsigned Depth=0)
Determine which bits of V are known to be either zero or one and return them in the KnownZero/KnownOn...
LLVM_ABI raw_ostream & dbgs()
dbgs() - This returns a reference to a raw_ostream for debugging messages.
Definition Debug.cpp:209
constexpr uint64_t alignTo(uint64_t Size, Align A)
Returns a multiple of A needed to store Size bytes.
Definition Alignment.h:144
bool isa(const From &Val)
isa<X> - Return true if the parameter to the template is an instance of one of the template type argu...
Definition Casting.h:547
constexpr int PoisonMaskElem
LLVM_ABI raw_fd_ostream & errs()
This returns a reference to a raw_ostream for standard error.
FunctionPass * createAMDGPUPromoteAlloca()
@ Mod
The access may modify the value stored in memory.
Definition ModRef.h:34
decltype(auto) cast(const From &Val)
cast<X> - Return the argument parameter cast to the specified type.
Definition Casting.h:559
Type * getLoadStoreType(const Value *I)
A helper function that returns the type of a load or store instruction.
char & AMDGPUPromoteAllocaID
AnalysisManager< Function > FunctionAnalysisManager
Convenience typedef for the Function analysis manager.
LLVM_ABI const Value * getUnderlyingObject(const Value *V, unsigned MaxLookup=MaxLookupSearchDepth)
This method strips off any GEP address adjustments, pointer casts or llvm.threadlocal....
#define N
AMDGPUPromoteAllocaPass(TargetMachine &TM)
Definition AMDGPU.h:272
PreservedAnalyses run(Function &F, FunctionAnalysisManager &AM)
PreservedAnalyses run(Function &F, FunctionAnalysisManager &AM)
This struct is a compact representation of a valid (non-zero power of two) alignment.
Definition Alignment.h:39
unsigned countMinTrailingZeros() const
Returns the minimum number of trailing zero bits.
Definition KnownBits.h:256
A MapVector that performs no allocations if smaller than a certain size.
Definition MapVector.h:342
Function object to check whether the second component of a container supported by std::get (like std:...
Definition STLExtras.h:1448