40#include "llvm/IR/IntrinsicsAMDGPU.h"
41#include "llvm/IR/IntrinsicsR600.h"
49#define DEBUG_TYPE "amdgpu-promote-alloca"
56 DisablePromoteAllocaToVector(
"disable-promote-alloca-to-vector",
57 cl::desc(
"Disable promote alloca to vector"),
61 DisablePromoteAllocaToLDS(
"disable-promote-alloca-to-lds",
62 cl::desc(
"Disable promote alloca to LDS"),
66 "amdgpu-promote-alloca-to-vector-limit",
67 cl::desc(
"Maximum byte size to consider promote alloca to vector"),
71 "amdgpu-promote-alloca-to-vector-max-regs",
73 "Maximum vector size (in 32b registers) to use when promoting alloca"),
79 "amdgpu-promote-alloca-to-vector-vgpr-ratio",
80 cl::desc(
"Ratio of VGPRs to budget for promoting alloca to vectors"),
84 LoopUserWeight(
"promote-alloca-vector-loop-user-weight",
85 cl::desc(
"The bonus weight of users of allocas within loop "
86 "when sorting profitable allocas"),
92struct GEPToVectorIndex {
100struct MemTransferInfo {
106struct AllocaAnalysis {
111 bool HaveSelectOrPHI =
false;
124 explicit AllocaAnalysis(
AllocaInst *Alloca) : Alloca(Alloca) {}
128class AMDGPUPromoteAllocaImpl {
139 unsigned VGPRBudgetRatio;
140 unsigned MaxVectorRegs;
142 bool IsAMDGCN =
false;
143 bool IsAMDHSA =
false;
145 std::pair<Value *, Value *> getLocalSizeYZ(
IRBuilder<> &Builder);
148 bool collectAllocaUses(AllocaAnalysis &
AA)
const;
154 bool binaryOpIsDerivedFromSameAlloca(
Value *Alloca,
Value *Val,
159 bool hasSufficientLocalMem(
const Function &
F);
162 void analyzePromoteToVector(AllocaAnalysis &
AA)
const;
163 void promoteAllocaToVector(AllocaAnalysis &
AA);
164 void analyzePromoteToLDS(AllocaAnalysis &
AA)
const;
165 bool tryPromoteAllocaToLDS(AllocaAnalysis &
AA,
bool SufficientLDS,
170 void scoreAlloca(AllocaAnalysis &
AA)
const;
172 void setFunctionLimits(
const Function &
F);
176 : TM(TM), LI(LI),
Mod(M),
DL(M.getDataLayout()) {
177 const Triple &TT = M.getTargetTriple();
178 IsAMDGCN = TT.isAMDGCN();
182 bool run(
Function &
F,
bool PromoteToLDS);
195 if (
auto *TPC = getAnalysisIfAvailable<TargetPassConfig>())
196 return AMDGPUPromoteAllocaImpl(
198 getAnalysis<LoopInfoWrapperPass>().getLoopInfo())
203 StringRef getPassName()
const override {
return "AMDGPU Promote Alloca"; }
212static unsigned getMaxVGPRs(
unsigned LDSBytes,
const TargetMachine &TM,
219 if (DynamicVGPRBlockSize == 0 && ST.isDynamicVGPREnabled())
220 DynamicVGPRBlockSize = ST.getDynamicVGPRBlockSize();
222 unsigned MaxVGPRs = ST.getMaxNumVGPRs(
223 ST.getWavesPerEU(ST.getFlatWorkGroupSizes(
F), LDSBytes,
F).first,
224 DynamicVGPRBlockSize);
229 if (!
F.hasFnAttribute(Attribute::AlwaysInline) &&
231 MaxVGPRs = std::min(MaxVGPRs, 32u);
237char AMDGPUPromoteAlloca::ID = 0;
240 "AMDGPU promote alloca to vector or LDS",
false,
false)
253 bool Changed = AMDGPUPromoteAllocaImpl(TM, *
F.getParent(), LI)
266 bool Changed = AMDGPUPromoteAllocaImpl(TM, *
F.getParent(), LI)
277 return new AMDGPUPromoteAlloca();
280bool AMDGPUPromoteAllocaImpl::collectAllocaUses(AllocaAnalysis &
AA)
const {
283 <<
" " << *Inst <<
"\n");
288 while (!WorkList.empty()) {
289 auto *Cur = WorkList.pop_back_val();
290 if (
find(
AA.Pointers, Cur) !=
AA.Pointers.end())
292 AA.Pointers.insert(Cur);
293 for (
auto &U : Cur->uses()) {
297 return RejectUser(Inst,
"pointer escapes via store");
300 AA.Uses.push_back(&U);
303 WorkList.push_back(Inst);
307 if (!binaryOpIsDerivedFromSameAlloca(
AA.Alloca, Cur,
SI, 1, 2))
308 return RejectUser(Inst,
"select from mixed objects");
309 WorkList.push_back(Inst);
310 AA.HaveSelectOrPHI =
true;
316 switch (
Phi->getNumIncomingValues()) {
320 if (!binaryOpIsDerivedFromSameAlloca(
AA.Alloca, Cur, Phi, 0, 1))
321 return RejectUser(Inst,
"phi from mixed objects");
324 return RejectUser(Inst,
"phi with too many operands");
327 WorkList.push_back(Inst);
328 AA.HaveSelectOrPHI =
true;
335void AMDGPUPromoteAllocaImpl::scoreAlloca(AllocaAnalysis &
AA)
const {
339 for (
auto *U :
AA.Uses) {
345 1 + (LoopUserWeight * LI.getLoopDepth(Inst->
getParent()));
346 LLVM_DEBUG(
dbgs() <<
" [+" << UserScore <<
"]:\t" << *Inst <<
"\n");
353void AMDGPUPromoteAllocaImpl::setFunctionLimits(
const Function &
F) {
357 const int R600MaxVectorRegs = 16;
358 MaxVectorRegs =
F.getFnAttributeAsParsedInteger(
359 "amdgpu-promote-alloca-to-vector-max-regs",
360 IsAMDGCN ? PromoteAllocaToVectorMaxRegs : R600MaxVectorRegs);
361 if (PromoteAllocaToVectorMaxRegs.getNumOccurrences())
362 MaxVectorRegs = PromoteAllocaToVectorMaxRegs;
363 VGPRBudgetRatio =
F.getFnAttributeAsParsedInteger(
364 "amdgpu-promote-alloca-to-vector-vgpr-ratio",
365 PromoteAllocaToVectorVGPRRatio);
366 if (PromoteAllocaToVectorVGPRRatio.getNumOccurrences())
367 VGPRBudgetRatio = PromoteAllocaToVectorVGPRRatio;
370bool AMDGPUPromoteAllocaImpl::run(
Function &
F,
bool PromoteToLDS) {
371 if (DisablePromoteAllocaToLDS && DisablePromoteAllocaToVector)
374 bool SufficientLDS = PromoteToLDS && hasSufficientLocalMem(
F);
375 MaxVGPRs = IsAMDGCN ? getMaxVGPRs(CurrentLocalMemUsage, TM,
F) : 128;
376 setFunctionLimits(
F);
378 unsigned VectorizationBudget =
379 (PromoteAllocaToVectorLimit ? PromoteAllocaToVectorLimit * 8
383 std::vector<AllocaAnalysis> Allocas;
388 if (!AI->isStaticAlloca() || AI->isArrayAllocation())
393 AllocaAnalysis
AA{AI};
394 if (collectAllocaUses(
AA)) {
395 analyzePromoteToVector(
AA);
397 analyzePromoteToLDS(
AA);
398 if (
AA.Vector.Ty ||
AA.LDS.Enable) {
400 Allocas.push_back(std::move(
AA));
407 [](
const auto &
A,
const auto &
B) {
return A.Score >
B.Score; });
411 dbgs() <<
"Sorted Worklist:\n";
412 for (
const auto &
AA : Allocas)
413 dbgs() <<
" " << *
AA.Alloca <<
"\n";
419 for (AllocaAnalysis &
AA : Allocas) {
421 std::optional<TypeSize>
Size =
AA.Alloca->getAllocationSize(
DL);
423 const unsigned AllocaCost =
Size->getFixedValue() * 8;
425 if (AllocaCost <= VectorizationBudget) {
426 promoteAllocaToVector(
AA);
428 assert((VectorizationBudget - AllocaCost) < VectorizationBudget &&
430 VectorizationBudget -= AllocaCost;
432 << VectorizationBudget <<
"\n");
436 << AllocaCost <<
", budget:" << VectorizationBudget
437 <<
"): " << *
AA.Alloca <<
"\n");
442 tryPromoteAllocaToLDS(
AA, SufficientLDS, DeferredIntrs))
445 finishDeferredAllocaToLDSPromotion(DeferredIntrs);
467 return I->getOperand(0) == AI &&
475 if (Ptr ==
AA.Alloca)
476 return B.getInt32(0);
479 auto I =
AA.Vector.GEPVectorIdx.find(
GEP);
480 assert(
I !=
AA.Vector.GEPVectorIdx.end() &&
"Must have entry for GEP!");
482 if (!
I->second.Full) {
483 Value *Result =
nullptr;
484 B.SetInsertPoint(
GEP);
486 if (
I->second.VarIndex) {
487 Result =
I->second.VarIndex;
488 Result =
B.CreateSExtOrTrunc(Result,
B.getInt32Ty());
490 if (
I->second.VarMul)
491 Result =
B.CreateMul(Result,
I->second.VarMul);
493 if (
I->second.VarShift)
494 Result =
B.CreateAShr(Result,
I->second.VarShift,
"",
true);
497 if (
I->second.ConstIndex) {
499 Result =
B.CreateAdd(Result,
I->second.ConstIndex);
501 Result =
I->second.ConstIndex;
505 Result =
B.getInt32(0);
507 I->second.Full = Result;
510 return I->second.Full;
513static std::optional<GEPToVectorIndex>
519 unsigned BW =
DL.getIndexTypeSizeInBits(
GEP->getType());
521 APInt ConstOffset(BW, 0);
542 if (!CurGEP->collectOffset(
DL, BW, VarOffsets, ConstOffset))
546 CurPtr = CurGEP->getPointerOperand();
549 assert(CurPtr == Alloca &&
"GEP not based on alloca");
551 int64_t VecElemSize =
DL.getTypeAllocSize(VecElemTy);
552 if (VarOffsets.
size() > 1)
558 if (ConstOffset.
srem(VecElemSize) != 0)
560 APInt IndexQuot = ConstOffset.
sdiv(VecElemSize);
562 GEPToVectorIndex Result;
564 if (!ConstOffset.
isZero())
565 Result.ConstIndex = ConstantInt::get(Ctx, IndexQuot.
sextOrTrunc(BW));
568 if (VarOffsets.
empty())
573 const auto &VarOffset = VarOffsets.
front();
574 auto ScaleOpt = VarOffset.second.tryZExtValue();
575 if (!ScaleOpt || *ScaleOpt == 0)
579 Result.VarIndex = VarOffset.first;
585 if (Scale >= (
uint64_t)VecElemSize) {
586 if (Scale % VecElemSize != 0)
591 uint64_t VarMul = Scale / VecElemSize;
594 Result.VarMul = ConstantInt::get(Ctx,
APInt(BW, VarMul));
596 if ((
uint64_t)VecElemSize % Scale != 0)
601 uint64_t Divisor = VecElemSize / Scale;
611 Result.VarShift = ConstantInt::get(Ctx,
APInt(BW,
Log2_64(Divisor)));
632 unsigned VecStoreSize,
633 unsigned ElementSize,
639 Builder.SetInsertPoint(Inst);
641 Type *VecEltTy =
AA.Vector.Ty->getElementType();
644 case Instruction::Load: {
645 Value *CurVal = GetCurVal();
651 TypeSize AccessSize =
DL.getTypeStoreSize(AccessTy);
653 if (CI->isNullValue() && AccessSize == VecStoreSize) {
655 Builder.CreateBitPreservingCastChain(
DL, CurVal, AccessTy));
663 const unsigned NumLoadedElts = AccessSize /
DL.getTypeStoreSize(VecEltTy);
665 assert(
DL.getTypeStoreSize(SubVecTy) ==
DL.getTypeStoreSize(AccessTy));
674 TypeSize NumBits =
DL.getTypeStoreSize(SubVecTy) * 8u;
676 bool IsAlignedLoad = NumBits <= (LoadAlign * 8u);
678 bool IsProperlyDivisible = TotalNumElts % NumLoadedElts == 0;
681 IsProperlyDivisible && IsAlignedLoad) {
683 const unsigned NewNumElts =
684 DL.getTypeStoreSize(VectorTy) * 8u / NumBits;
685 const unsigned LShrAmt =
llvm::Log2_32(SubVecTy->getNumElements());
688 Value *BCVal = Builder.CreateBitCast(CurVal, BitCastTy);
689 Value *NewIdx = Builder.CreateLShr(
690 Index, ConstantInt::get(Index->getType(), LShrAmt));
691 Value *ExtVal = Builder.CreateExtractElement(BCVal, NewIdx);
692 Value *BCOut = Builder.CreateBitCast(ExtVal, AccessTy);
698 for (
unsigned K = 0; K < NumLoadedElts; ++K) {
700 Builder.CreateAdd(Index, ConstantInt::get(Index->getType(), K));
701 SubVec = Builder.CreateInsertElement(
702 SubVec, Builder.CreateExtractElement(CurVal, CurIdx), K);
706 Builder.CreateBitPreservingCastChain(
DL, SubVec, AccessTy));
711 Value *ExtractElement = Builder.CreateExtractElement(CurVal, Index);
712 if (AccessTy != VecEltTy)
713 ExtractElement = Builder.CreateBitOrPointerCast(ExtractElement, AccessTy);
718 case Instruction::Store: {
725 Value *Val =
SI->getValueOperand();
729 TypeSize AccessSize =
DL.getTypeStoreSize(AccessTy);
731 if (CI->isNullValue() && AccessSize == VecStoreSize)
732 return Builder.CreateBitPreservingCastChain(
DL, Val,
AA.Vector.Ty);
737 const unsigned NumWrittenElts =
738 AccessSize /
DL.getTypeStoreSize(VecEltTy);
739 const unsigned NumVecElts =
AA.Vector.Ty->getNumElements();
741 assert(
DL.getTypeStoreSize(SubVecTy) ==
DL.getTypeStoreSize(AccessTy));
743 Val = Builder.CreateBitPreservingCastChain(
DL, Val, SubVecTy);
744 Value *CurVec = GetCurVal();
745 for (
unsigned K = 0, NumElts = std::min(NumWrittenElts, NumVecElts);
748 Builder.CreateAdd(Index, ConstantInt::get(Index->getType(), K));
749 CurVec = Builder.CreateInsertElement(
750 CurVec, Builder.CreateExtractElement(Val, K), CurIdx);
755 if (Val->
getType() != VecEltTy)
756 Val = Builder.CreateBitOrPointerCast(Val, VecEltTy);
757 return Builder.CreateInsertElement(GetCurVal(), Val, Index);
759 case Instruction::Call: {
763 unsigned NumCopied =
Length->getZExtValue() / ElementSize;
764 MemTransferInfo *TI = &
AA.Vector.TransferInfo[MTI];
769 for (
unsigned Idx = 0; Idx <
AA.Vector.Ty->getNumElements(); ++Idx) {
770 if (Idx >= DestBegin && Idx < DestBegin + NumCopied) {
779 return Builder.CreateShuffleVector(GetCurVal(), Mask);
785 Value *Elt = MSI->getOperand(1);
786 const unsigned BytesPerElt =
DL.getTypeStoreSize(VecEltTy);
787 if (BytesPerElt > 1) {
788 Value *EltBytes = Builder.CreateVectorSplat(BytesPerElt, Elt);
794 Elt = Builder.CreateBitCast(EltBytes, PtrInt);
795 Elt = Builder.CreateIntToPtr(Elt, VecEltTy);
797 Elt = Builder.CreateBitCast(EltBytes, VecEltTy);
800 return Builder.CreateVectorSplat(
AA.Vector.Ty->getElementCount(), Elt);
804 if (Intr->getIntrinsicID() == Intrinsic::objectsize) {
805 Intr->replaceAllUsesWith(
806 Builder.getIntN(Intr->getType()->getIntegerBitWidth(),
807 DL.getTypeAllocSize(
AA.Vector.Ty)));
836 TypeSize AccTS =
DL.getTypeStoreSize(AccessTy);
840 if (AccTS * 8 !=
DL.getTypeSizeInBits(AccessTy))
852template <
typename InstContainer>
864 auto &BlockUses = UsesByBlock[BB];
867 if (BlockUses.empty())
871 if (BlockUses.size() == 1) {
878 if (!BlockUses.contains(&Inst))
899AMDGPUPromoteAllocaImpl::getVectorTypeForAlloca(
Type *AllocaTy)
const {
900 if (DisablePromoteAllocaToVector) {
907 uint64_t NumElems = 1;
910 NumElems *= ArrayTy->getNumElements();
911 ElemTy = ArrayTy->getElementType();
917 NumElems *= InnerVectorTy->getNumElements();
918 ElemTy = InnerVectorTy->getElementType();
922 unsigned ElementSize =
DL.getTypeSizeInBits(ElemTy) / 8;
923 if (ElementSize > 0) {
924 unsigned AllocaSize =
DL.getTypeStoreSize(AllocaTy);
929 if (NumElems * ElementSize != AllocaSize)
930 NumElems = AllocaSize / ElementSize;
931 if (NumElems > 0 && (AllocaSize % ElementSize) == 0)
941 const unsigned MaxElements =
942 (MaxVectorRegs * 32) /
DL.getTypeSizeInBits(VectorTy->getElementType());
944 if (VectorTy->getNumElements() > MaxElements ||
945 VectorTy->getNumElements() < 2) {
947 <<
" has an unsupported number of elements\n");
951 Type *VecEltTy = VectorTy->getElementType();
952 unsigned ElementSizeInBits =
DL.getTypeSizeInBits(VecEltTy);
953 if (ElementSizeInBits !=
DL.getTypeAllocSizeInBits(VecEltTy)) {
954 LLVM_DEBUG(
dbgs() <<
" Cannot convert to vector if the allocation size "
955 "does not match the type's size\n");
962void AMDGPUPromoteAllocaImpl::analyzePromoteToVector(AllocaAnalysis &
AA)
const {
963 if (
AA.HaveSelectOrPHI) {
964 LLVM_DEBUG(
dbgs() <<
" Cannot convert to vector due to select or phi\n");
968 Type *AllocaTy =
AA.Alloca->getAllocatedType();
969 AA.Vector.Ty = getVectorTypeForAlloca(AllocaTy);
974 LLVM_DEBUG(
dbgs() <<
" Cannot promote alloca to vector: " << Msg <<
"\n"
975 <<
" " << *Inst <<
"\n");
976 AA.Vector.Ty =
nullptr;
979 Type *VecEltTy =
AA.Vector.Ty->getElementType();
980 unsigned ElementSize =
DL.getTypeSizeInBits(VecEltTy) / 8;
982 for (
auto *U :
AA.Uses) {
991 return RejectUser(Inst,
"unsupported load/store as aggregate");
998 return RejectUser(Inst,
"not a simple load or store");
1000 Ptr = Ptr->stripPointerCasts();
1003 if (Ptr ==
AA.Alloca &&
1004 DL.getTypeStoreSize(
AA.Alloca->getAllocatedType()) ==
1005 DL.getTypeStoreSize(AccessTy)) {
1006 AA.Vector.Worklist.push_back(Inst);
1011 return RejectUser(Inst,
"not a supported access type");
1013 AA.Vector.Worklist.push_back(Inst);
1022 return RejectUser(Inst,
"cannot compute vector index for GEP");
1024 AA.Vector.GEPVectorIdx[
GEP] = std::move(
Index.value());
1025 AA.Vector.UsersToRemove.push_back(Inst);
1031 AA.Vector.Worklist.push_back(Inst);
1036 if (TransferInst->isVolatile())
1037 return RejectUser(Inst,
"mem transfer inst is volatile");
1040 if (!Len || (
Len->getZExtValue() % ElementSize))
1041 return RejectUser(Inst,
"mem transfer inst length is non-constant or "
1042 "not a multiple of the vector element size");
1045 if (Ptr ==
AA.Alloca)
1046 return ConstantInt::get(Ptr->getContext(),
APInt(32, 0));
1049 const auto &GEPI =
AA.Vector.GEPVectorIdx.find(
GEP)->second;
1052 if (GEPI.ConstIndex)
1053 return GEPI.ConstIndex;
1054 return ConstantInt::get(Ptr->getContext(),
APInt(32, 0));
1057 MemTransferInfo *TI =
1058 &
AA.Vector.TransferInfo.try_emplace(TransferInst).first->second;
1059 unsigned OpNum =
U->getOperandNo();
1061 Value *Dest = TransferInst->getDest();
1064 return RejectUser(Inst,
"could not calculate constant dest index");
1065 TI->DestIndex =
Index;
1068 Value *Src = TransferInst->getSource();
1071 return RejectUser(Inst,
"could not calculate constant src index");
1072 TI->SrcIndex =
Index;
1078 if (Intr->getIntrinsicID() == Intrinsic::objectsize) {
1079 AA.Vector.Worklist.push_back(Inst);
1087 return RejectUser(Inst,
"assume-like intrinsic cannot have any users");
1088 AA.Vector.UsersToRemove.push_back(Inst);
1093 return isAssumeLikeIntrinsic(cast<Instruction>(U));
1095 AA.Vector.UsersToRemove.push_back(Inst);
1099 return RejectUser(Inst,
"unhandled alloca user");
1103 for (
const auto &Entry :
AA.Vector.TransferInfo) {
1104 const MemTransferInfo &TI =
Entry.second;
1105 if (!TI.SrcIndex || !TI.DestIndex)
1106 return RejectUser(
Entry.first,
1107 "mem transfer inst between different objects");
1108 AA.Vector.Worklist.push_back(
Entry.first);
1112void AMDGPUPromoteAllocaImpl::promoteAllocaToVector(AllocaAnalysis &
AA) {
1114 LLVM_DEBUG(
dbgs() <<
" type conversion: " << *
AA.Alloca->getAllocatedType()
1115 <<
" -> " << *
AA.Vector.Ty <<
'\n');
1116 const unsigned VecStoreSize =
DL.getTypeStoreSize(
AA.Vector.Ty);
1118 Type *VecEltTy =
AA.Vector.Ty->getElementType();
1119 const unsigned ElementSize =
DL.getTypeSizeInBits(VecEltTy) / 8;
1141 BasicBlock *BB = I->getParent();
1142 auto GetCurVal = [&]() -> Value * {
1143 if (Value *CurVal = Updater.FindValueForBlock(BB))
1146 if (!Placeholders.empty() && Placeholders.back()->getParent() == BB)
1147 return Placeholders.back();
1151 IRBuilder<> Builder(I);
1152 auto *Placeholder = cast<Instruction>(Builder.CreateFreeze(
1153 PoisonValue::get(AA.Vector.Ty),
"promotealloca.placeholder"));
1154 Placeholders.insert(Placeholder);
1155 return Placeholders.back();
1159 ElementSize, GetCurVal);
1173 Placeholder->replaceAllUsesWith(
1175 Placeholder->eraseFromParent();
1181 I->eraseFromParent();
1186 I->dropDroppableUses();
1188 I->eraseFromParent();
1193 AA.Alloca->eraseFromParent();
1196std::pair<Value *, Value *>
1197AMDGPUPromoteAllocaImpl::getLocalSizeYZ(
IRBuilder<> &Builder) {
1203 Intrinsic::r600_read_local_size_y, {});
1205 Intrinsic::r600_read_local_size_z, {});
1207 ST.makeLIDRangeMetadata(LocalSizeY);
1208 ST.makeLIDRangeMetadata(LocalSizeZ);
1210 return std::pair(LocalSizeY, LocalSizeZ);
1251 F.removeFnAttr(
"amdgpu-no-dispatch-ptr");
1268 LoadXY->
setMetadata(LLVMContext::MD_invariant_load, MD);
1269 LoadZU->
setMetadata(LLVMContext::MD_invariant_load, MD);
1270 ST.makeLIDRangeMetadata(LoadZU);
1275 return std::pair(
Y, LoadZU);
1287 IntrID = IsAMDGCN ? (
Intrinsic::ID)Intrinsic::amdgcn_workitem_id_x
1289 AttrName =
"amdgpu-no-workitem-id-x";
1292 IntrID = IsAMDGCN ? (
Intrinsic::ID)Intrinsic::amdgcn_workitem_id_y
1294 AttrName =
"amdgpu-no-workitem-id-y";
1298 IntrID = IsAMDGCN ? (
Intrinsic::ID)Intrinsic::amdgcn_workitem_id_z
1300 AttrName =
"amdgpu-no-workitem-id-z";
1308 ST.makeLIDRangeMetadata(CI);
1309 F->removeFnAttr(AttrName);
1319 switch (
II->getIntrinsicID()) {
1320 case Intrinsic::memcpy:
1321 case Intrinsic::memmove:
1322 case Intrinsic::memset:
1323 case Intrinsic::lifetime_start:
1324 case Intrinsic::lifetime_end:
1325 case Intrinsic::invariant_start:
1326 case Intrinsic::invariant_end:
1327 case Intrinsic::launder_invariant_group:
1328 case Intrinsic::strip_invariant_group:
1329 case Intrinsic::objectsize:
1336bool AMDGPUPromoteAllocaImpl::binaryOpIsDerivedFromSameAlloca(
1358 if (OtherObj != BaseAlloca) {
1360 dbgs() <<
"Found a binary instruction with another alloca object\n");
1367void AMDGPUPromoteAllocaImpl::analyzePromoteToLDS(AllocaAnalysis &
AA)
const {
1368 if (DisablePromoteAllocaToLDS) {
1376 const Function &ContainingFunction = *
AA.Alloca->getFunction();
1386 <<
" promote alloca to LDS not supported with calling convention.\n");
1397 if (
find(
AA.LDS.Worklist,
User) ==
AA.LDS.Worklist.end())
1398 AA.LDS.Worklist.push_back(
User);
1403 if (UseInst->
getOpcode() == Instruction::PtrToInt)
1407 if (LI->isVolatile())
1413 if (
SI->isVolatile())
1419 if (RMW->isVolatile())
1425 if (CAS->isVolatile())
1433 if (!binaryOpIsDerivedFromSameAlloca(
AA.Alloca,
Use->get(), ICmp, 0, 1))
1437 if (
find(
AA.LDS.Worklist,
User) ==
AA.LDS.Worklist.end())
1438 AA.LDS.Worklist.push_back(ICmp);
1445 if (!
GEP->isInBounds())
1458 if (
find(
AA.LDS.Worklist,
User) ==
AA.LDS.Worklist.end())
1459 AA.LDS.Worklist.push_back(
User);
1462 AA.LDS.Enable =
true;
1465bool AMDGPUPromoteAllocaImpl::hasSufficientLocalMem(
const Function &
F) {
1473 for (
Type *ParamTy : FTy->params()) {
1477 LLVM_DEBUG(
dbgs() <<
"Function has local memory argument. Promoting to "
1478 "local memory disabled.\n");
1483 LocalMemLimit =
ST.getAddressableLocalMemorySize();
1484 if (LocalMemLimit == 0)
1494 if (
Use->getFunction() == &
F)
1498 if (VisitedConstants.
insert(
C).second)
1510 if (visitUsers(&GV, &GV)) {
1518 while (!
Stack.empty()) {
1520 if (visitUsers(&GV,
C)) {
1540 LLVM_DEBUG(
dbgs() <<
"Function has a reference to externally allocated "
1541 "local memory. Promoting to local memory "
1556 CurrentLocalMemUsage = 0;
1562 for (
auto Alloc : AllocatedSizes) {
1563 CurrentLocalMemUsage =
alignTo(CurrentLocalMemUsage,
Alloc.second);
1564 CurrentLocalMemUsage +=
Alloc.first;
1567 unsigned MaxOccupancy =
1568 ST.getWavesPerEU(
ST.getFlatWorkGroupSizes(
F), CurrentLocalMemUsage,
F)
1572 unsigned MaxSizeWithWaveCount =
1573 ST.getMaxLocalMemSizeWithWaveCount(MaxOccupancy,
F);
1576 if (CurrentLocalMemUsage > MaxSizeWithWaveCount)
1579 LocalMemLimit = MaxSizeWithWaveCount;
1582 <<
" bytes of LDS\n"
1583 <<
" Rounding size to " << MaxSizeWithWaveCount
1584 <<
" with a maximum occupancy of " << MaxOccupancy <<
'\n'
1585 <<
" and " << (LocalMemLimit - CurrentLocalMemUsage)
1586 <<
" available for promotion\n");
1592bool AMDGPUPromoteAllocaImpl::tryPromoteAllocaToLDS(
1593 AllocaAnalysis &
AA,
bool SufficientLDS,
1603 const Function &ContainingFunction = *
AA.Alloca->getParent()->getParent();
1605 unsigned WorkGroupSize =
ST.getFlatWorkGroupSizes(ContainingFunction).second;
1607 Align Alignment =
AA.Alloca->getAlign();
1615 uint32_t NewSize =
alignTo(CurrentLocalMemUsage, Alignment);
1616 std::optional<TypeSize> ElemSize =
AA.Alloca->getAllocationSize(
DL);
1617 if (!ElemSize || ElemSize->isScalable())
1619 TypeSize AllocSize = WorkGroupSize * *ElemSize;
1622 if (NewSize > LocalMemLimit) {
1624 <<
" bytes of local memory not available to promote\n");
1628 CurrentLocalMemUsage = NewSize;
1637 Twine(
F->getName()) +
Twine(
'.') +
AA.Alloca->getName(),
nullptr,
1642 Value *TCntY, *TCntZ;
1644 std::tie(TCntY, TCntZ) = getLocalSizeYZ(Builder);
1645 Value *TIdX = getWorkitemID(Builder, 0);
1646 Value *TIdY = getWorkitemID(Builder, 1);
1647 Value *TIdZ = getWorkitemID(Builder, 2);
1659 AA.Alloca->mutateType(
Offset->getType());
1660 AA.Alloca->replaceAllUsesWith(
Offset);
1661 AA.Alloca->eraseFromParent();
1665 for (
Value *V :
AA.LDS.Worklist) {
1687 assert(
V->getType()->isPtrOrPtrVectorTy());
1689 Type *NewTy =
V->getType()->getWithNewType(NewPtrTy);
1690 V->mutateType(NewTy);
1700 for (
unsigned I = 0,
E =
Phi->getNumIncomingValues();
I !=
E; ++
I) {
1702 Phi->getIncomingValue(
I)))
1713 case Intrinsic::lifetime_start:
1714 case Intrinsic::lifetime_end:
1718 case Intrinsic::memcpy:
1719 case Intrinsic::memmove:
1723 DeferredIntrs.
insert(Intr);
1725 case Intrinsic::memset: {
1733 case Intrinsic::invariant_start:
1734 case Intrinsic::invariant_end:
1735 case Intrinsic::launder_invariant_group:
1736 case Intrinsic::strip_invariant_group: {
1754 case Intrinsic::objectsize: {
1758 Intrinsic::objectsize,
1774void AMDGPUPromoteAllocaImpl::finishDeferredAllocaToLDSPromotion(
1781 assert(
ID == Intrinsic::memcpy ||
ID == Intrinsic::memmove);
1785 ID,
MI->getRawDest(),
MI->getDestAlign(),
MI->getRawSource(),
1786 MI->getSourceAlign(),
MI->getLength(),
MI->isVolatile());
1788 for (
unsigned I = 0;
I != 2; ++
I) {
1790 B->addDereferenceableParamAttr(
I, Bytes);
assert(UImm &&(UImm !=~static_cast< T >(0)) &&"Invalid immediate!")
MachineBasicBlock MachineBasicBlock::iterator DebugLoc DL
static GCRegistry::Add< ErlangGC > A("erlang", "erlang-compatible garbage collector")
static GCRegistry::Add< CoreCLRGC > E("coreclr", "CoreCLR-compatible GC")
static GCRegistry::Add< OcamlGC > B("ocaml", "ocaml 3.10-compatible GC")
static bool runOnFunction(Function &F, bool PostInlining)
AMD GCN specific subclass of TargetSubtarget.
uint64_t IntrinsicInst * II
if(auto Err=PB.parsePassPipeline(MPM, Passes)) return wrap(std MPM run * Mod
#define INITIALIZE_PASS_DEPENDENCY(depName)
#define INITIALIZE_PASS_END(passName, arg, name, cfg, analysis)
#define INITIALIZE_PASS_BEGIN(passName, arg, name, cfg, analysis)
Remove Loads Into Fake Uses
static unsigned getNumElements(Type *Ty)
static TableGen::Emitter::Opt Y("gen-skeleton-entry", EmitSkeleton, "Generate example skeleton entry")
Target-Independent Code Generator Pass Configuration Options pass.
static const AMDGPUSubtarget & get(const MachineFunction &MF)
Class for arbitrary precision integers.
bool isZero() const
Determine if this value is zero, i.e. all bits are clear.
LLVM_ABI APInt sdiv(const APInt &RHS) const
Signed division function for APInt.
LLVM_ABI APInt sextOrTrunc(unsigned width) const
Sign extend or truncate to width.
LLVM_ABI APInt srem(const APInt &RHS) const
Function for signed remainder operation.
an instruction to allocate memory on the stack
Type * getAllocatedType() const
Return the type that is being allocated by the instruction.
PassT::Result & getResult(IRUnitT &IR, ExtraArgTs... ExtraArgs)
Get the result of an analysis pass for a given IR unit.
Represent the analysis usage information of a pass.
AnalysisUsage & addRequired()
LLVM_ABI void setPreservesCFG()
This function should be called by the pass, iff they do not:
static LLVM_ABI ArrayType * get(Type *ElementType, uint64_t NumElements)
This static method is the primary way to construct an ArrayType.
An instruction that atomically checks whether a specified value is in a memory location,...
an instruction that atomically reads a memory location, combines it with another value,...
LLVM Basic Block Representation.
const Function * getParent() const
Return the enclosing method, or null if none.
InstListType::iterator iterator
Instruction iterators...
Represents analyses that only rely on functions' control flow.
uint64_t getParamDereferenceableBytes(unsigned i) const
Extract the number of dereferenceable bytes for a call or parameter (0=unknown).
void addDereferenceableRetAttr(uint64_t Bytes)
adds the dereferenceable attribute to the list of attributes.
void addRetAttr(Attribute::AttrKind Kind)
Adds the attribute to the return value.
Value * getArgOperand(unsigned i) const
This class represents a function call, abstracting a target machine's calling convention.
static CallInst * Create(FunctionType *Ty, Value *F, const Twine &NameStr="", InsertPosition InsertBefore=nullptr)
static LLVM_ABI bool isBitOrNoopPointerCastable(Type *SrcTy, Type *DestTy, const DataLayout &DL)
Check whether a bitcast, inttoptr, or ptrtoint cast between these types is valid and a no-op.
This is the shared class of boolean and integer constants.
uint64_t getZExtValue() const
Return the constant as a 64-bit unsigned integer value after it has been zero extended as appropriate...
This is an important base class in LLVM.
static LLVM_ABI Constant * getNullValue(Type *Ty)
Constructor to create a '0' constant of arbitrary type.
A parsed version of the target data layout string in and methods for querying it.
std::pair< iterator, bool > insert(const std::pair< KeyT, ValueT > &KV)
Implements a dense probed hash-table based set.
Class to represent fixed width SIMD vectors.
unsigned getNumElements() const
static LLVM_ABI FixedVectorType * get(Type *ElementType, unsigned NumElts)
FunctionPass class - This class is used to implement most global optimizations.
Class to represent function types.
CallingConv::ID getCallingConv() const
getCallingConv()/setCallingConv(CC) - These method get and set the calling convention of this functio...
an instruction for type-safe pointer arithmetic to access elements of arrays and structs
bool hasExternalLinkage() const
void setUnnamedAddr(UnnamedAddr Val)
unsigned getAddressSpace() const
@ InternalLinkage
Rename collisions when linking (static functions).
Type * getValueType() const
MaybeAlign getAlign() const
Returns the alignment of the given variable.
LLVM_ABI uint64_t getGlobalSize(const DataLayout &DL) const
Get the size of this global variable in bytes.
void setAlignment(Align Align)
Sets the alignment attribute of the GlobalVariable.
This instruction compares its operands according to the predicate given to the constructor.
LLVM_ABI CallInst * CreateIntrinsicWithoutFolding(Intrinsic::ID ID, ArrayRef< Type * > OverloadTypes, ArrayRef< Value * > Args, FMFSource FMFSource={}, const Twine &Name="", ArrayRef< OperandBundleDef > OpBundles={})
Create a call to intrinsic ID with Args, mangled using OverloadTypes.
LoadInst * CreateAlignedLoad(Type *Ty, Value *Ptr, MaybeAlign Align, const char *Name)
Value * CreateLShr(Value *LHS, Value *RHS, const Twine &Name="", bool isExact=false)
BasicBlock * GetInsertBlock() const
Value * CreateInBoundsGEP(Type *Ty, Value *Ptr, ArrayRef< Value * > IdxList, const Twine &Name="")
CallInst * CreateMemSet(Value *Ptr, Value *Val, uint64_t Size, MaybeAlign Align, bool isVolatile=false, const AAMDNodes &AAInfo=AAMDNodes())
Create and insert a memset to the specified pointer and the specified value.
LLVM_ABI Value * CreateIntrinsic(Intrinsic::ID ID, ArrayRef< Type * > OverloadTypes, ArrayRef< Value * > Args, FMFSource FMFSource={}, const Twine &Name="", ArrayRef< OperandBundleDef > OpBundles={}, function_ref< void(CallInst *)> SetFn=[](CallInst *) {})
Variant to create a possibly constant-folded intrinsic.
Value * CreateAdd(Value *LHS, Value *RHS, const Twine &Name="", bool HasNUW=false, bool HasNSW=false)
CallInst * CreateCall(FunctionType *FTy, Value *Callee, ArrayRef< Value * > Args={}, const Twine &Name="", MDNode *FPMathTag=nullptr)
Value * CreateConstInBoundsGEP1_64(Type *Ty, Value *Ptr, uint64_t Idx0, const Twine &Name="")
void SetInsertPoint(BasicBlock *TheBB)
This specifies that created instructions should be appended to the end of the specified block.
LLVM_ABI CallInst * CreateMemTransferInst(Intrinsic::ID IntrID, Value *Dst, MaybeAlign DstAlign, Value *Src, MaybeAlign SrcAlign, Value *Size, bool isVolatile=false, const AAMDNodes &AAInfo=AAMDNodes())
Value * CreateMul(Value *LHS, Value *RHS, const Twine &Name="", bool HasNUW=false, bool HasNSW=false)
This provides a uniform API for creating instructions and inserting them into a basic block: either a...
InstSimplifyFolder - Use InstructionSimplify to fold operations to existing values.
LLVM_ABI const Module * getModule() const
Return the module owning the function this instruction belongs to or nullptr it the function does not...
LLVM_ABI InstListType::iterator eraseFromParent()
This method unlinks 'this' from the containing basic block and deletes it.
LLVM_ABI void setMetadata(unsigned KindID, MDNode *Node)
Set the metadata of the specified kind to the specified node.
unsigned getOpcode() const
Returns a member of one of the enums like Instruction::Add.
Class to represent integer types.
A wrapper class for inspecting calls to intrinsic functions.
Intrinsic::ID getIntrinsicID() const
Return the intrinsic ID of this intrinsic.
This is an important class for using LLVM in a threaded context.
An instruction for reading from memory.
Analysis pass that exposes the LoopInfo for a function.
The legacy pass manager's analysis pass to compute loop information.
static MDTuple * get(LLVMContext &Context, ArrayRef< Metadata * > MDs)
This class implements a map that also provides access to all stored values in a deterministic order.
std::pair< KeyT, ValueT > & front()
Value * getLength() const
Value * getRawDest() const
MaybeAlign getDestAlign() const
This class wraps the llvm.memset and llvm.memset.inline intrinsics.
This class wraps the llvm.memcpy/memmove intrinsics.
A Module instance is used to store all the information related to an LLVM module.
virtual void getAnalysisUsage(AnalysisUsage &) const
getAnalysisUsage - This function should be overriden by passes that need analysis information to do t...
Class to represent pointers.
static LLVM_ABI PointerType * get(Type *ElementType, unsigned AddressSpace)
This constructs a pointer to an object of the specified type in a numbered address space.
static LLVM_ABI PoisonValue * get(Type *T)
Static factory methods - Return an 'poison' object of the specified type.
A set of analyses that are preserved following a run of a transformation pass.
static PreservedAnalyses all()
Construct a special preserved set that preserves all passes.
PreservedAnalyses & preserveSet()
Mark an analysis set as preserved.
Helper class for SSA formation on a set of values defined in multiple blocks.
LLVM_ABI void Initialize(Type *Ty, StringRef Name)
Reset this object to get ready for a new set of SSA updates with type 'Ty'.
LLVM_ABI Value * GetValueInMiddleOfBlock(BasicBlock *BB)
Construct SSA form, materializing a value that is live in the middle of the specified block.
LLVM_ABI void AddAvailableValue(BasicBlock *BB, Value *V)
Indicate that a rewritten value is available in the specified block with the specified value.
This class represents the LLVM 'select' instruction.
A vector that has set insertion semantics.
bool contains(const_arg_type key) const
Check if the SetVector contains the given key.
bool insert(const value_type &X)
Insert a new element into the SetVector.
std::pair< iterator, bool > insert(PtrType Ptr)
Inserts Ptr if and only if there is no element in the container equal to Ptr.
SmallPtrSet - This class implements a set which is optimized for holding SmallSize or fewer elements.
A SetVector that performs no allocations if smaller than a certain size.
reference emplace_back(ArgTypes &&... Args)
void reserve(size_type N)
This is a 'vector' (really, a variable-sized array), optimized for the case when the array is small.
An instruction for storing to memory.
static unsigned getPointerOperandIndex()
Represent a constant reference to a string, i.e. a character array and a length, which need not be null terminated.
Primary interface to the complete machine description for the target machine.
const STC & getSubtarget(const Function &F) const
This method returns a pointer to the specified type of TargetSubtargetInfo.
Triple - Helper class for working with autoconf configuration names.
Twine - A lightweight data structure for efficiently representing the concatenation of temporary valu...
The instances of the Type class are immutable: once they are created, they are never changed.
bool isArrayTy() const
True if this is an instance of ArrayType.
static LLVM_ABI IntegerType * getInt32Ty(LLVMContext &C)
bool isPointerTy() const
True if this is an instance of PointerType.
bool isAggregateType() const
Return true if the type is an aggregate type.
LLVM_ABI Type * getWithNewType(Type *EltTy) const
Given vector type, change the element type, whilst keeping the old number of elements.
static LLVM_ABI IntegerType * getIntNTy(LLVMContext &C, unsigned N)
A Use represents the edge between a Value definition and its users.
void setOperand(unsigned i, Value *Val)
Value * getOperand(unsigned i) const
LLVM Value Representation.
Type * getType() const
All values are typed, get the type of this value.
LLVM_ABI void print(raw_ostream &O, bool IsForDebug=false) const
Implement operator<< on Value.
LLVM_ABI void replaceAllUsesWith(Value *V)
Change all uses of this to point to a new Value.
LLVMContext & getContext() const
All values hold a context through their type.
iterator_range< user_iterator > users()
LLVM_ABI const Value * stripPointerCasts() const
Strip off pointer casts, all-zero GEPs and address space casts.
void mutateType(Type *Ty)
Mutate the type of this Value to be of the specified type.
LLVM_ABI StringRef getName() const
Return a constant reference to the value's name.
LLVM_ABI void takeName(Value *V)
Transfer the name from V to this value.
static LLVM_ABI bool isValidElementType(Type *ElemTy)
Return true if the specified type is valid as an element type.
Type * getElementType() const
Value handle that is nullable, but tries to track the Value.
constexpr bool isKnownMultipleOf(ScalarTy RHS) const
This function tells the caller whether the element count is known at compile time to be a multiple of...
constexpr ScalarTy getFixedValue() const
An efficient, type-erasing, non-owning reference to a callable.
const ParentTy * getParent() const
self_iterator getIterator()
#define llvm_unreachable(msg)
Marks that the current location is not supposed to be reachable.
Abstract Attribute helper functions.
@ LOCAL_ADDRESS
Address space for local memory.
constexpr char Args[]
Key for Kernel::Metadata::mArgs.
LLVM_READNONE constexpr bool isEntryFunctionCC(CallingConv::ID CC)
unsigned getDynamicVGPRBlockSize(const Function &F)
unsigned ID
LLVM IR allows arbitrary numbers to be used as calling convention identifiers.
@ AMDGPU_KERNEL
Used for AMDGPU code object kernels.
@ SPIR_KERNEL
Used for SPIR kernel functions.
@ C
The default llvm calling convention, compatible with C.
This namespace contains an enum with a value for every intrinsic/builtin function known by LLVM.
LLVM_ABI Function * getOrInsertDeclaration(Module *M, ID id, ArrayRef< Type * > OverloadTys={})
Look up the Function declaration of the intrinsic id in the Module M.
specific_intval< false > m_SpecificInt(const APInt &V)
Match a specific integer value or vector with all elements equal to the value.
bool match(Val *V, const Pattern &P)
initializer< Ty > init(const Ty &Val)
NodeAddr< PhiNode * > Phi
This is an optimization pass for GlobalISel generic memory operations.
void stable_sort(R &&Range)
auto find(R &&Range, const T &Val)
Provide wrappers to std::find which take ranges instead of having to pass begin/end explicitly.
bool all_of(R &&range, UnaryPredicate P)
Provide wrappers to std::all_of which take ranges instead of having to pass begin/end explicitly.
LLVM_ABI bool isAssumeLikeIntrinsic(const Instruction *I)
Return true if it is an intrinsic that cannot be speculated but also cannot trap.
decltype(auto) dyn_cast(const From &Val)
dyn_cast<X> - Return the argument parameter cast to the specified type.
const Value * getLoadStorePointerOperand(const Value *V)
A helper function that returns the pointer operand of a load or store instruction.
constexpr bool isPowerOf2_64(uint64_t Value)
Return true if the argument is a power of two > 0 (64-bit edition).
unsigned Log2_64(uint64_t Value)
Return the floor log base 2 of the specified value, -1 if the value is zero.
const Value * getPointerOperand(const Value *V)
A helper function that returns the pointer operand of a load, store or GEP instruction.
unsigned Log2_32(uint32_t Value)
Return the floor log base 2 of the specified value, -1 if the value is zero.
auto reverse(ContainerTy &&C)
constexpr bool isPowerOf2_32(uint32_t Value)
Return true if the argument is a power of two > 0.
void sort(IteratorTy Start, IteratorTy End)
LLVM_ABI void computeKnownBits(const Value *V, KnownBits &Known, const DataLayout &DL, AssumptionCache *AC=nullptr, const Instruction *CxtI=nullptr, const DominatorTree *DT=nullptr, bool UseInstrInfo=true, unsigned Depth=0)
Determine which bits of V are known to be either zero or one and return them in the KnownZero/KnownOn...
LLVM_ABI raw_ostream & dbgs()
dbgs() - This returns a reference to a raw_ostream for debugging messages.
constexpr uint64_t alignTo(uint64_t Size, Align A)
Returns a multiple of A needed to store Size bytes.
bool isa(const From &Val)
isa<X> - Return true if the parameter to the template is an instance of one of the template type argu...
constexpr int PoisonMaskElem
LLVM_ABI raw_fd_ostream & errs()
This returns a reference to a raw_ostream for standard error.
FunctionPass * createAMDGPUPromoteAlloca()
@ Mod
The access may modify the value stored in memory.
decltype(auto) cast(const From &Val)
cast<X> - Return the argument parameter cast to the specified type.
Type * getLoadStoreType(const Value *I)
A helper function that returns the type of a load or store instruction.
char & AMDGPUPromoteAllocaID
AnalysisManager< Function > FunctionAnalysisManager
Convenience typedef for the Function analysis manager.
LLVM_ABI const Value * getUnderlyingObject(const Value *V, unsigned MaxLookup=MaxLookupSearchDepth)
This method strips off any GEP address adjustments, pointer casts or llvm.threadlocal....
This struct is a compact representation of a valid (non-zero power of two) alignment.
unsigned countMinTrailingZeros() const
Returns the minimum number of trailing zero bits.
A MapVector that performs no allocations if smaller than a certain size.
Function object to check whether the second component of a container supported by std::get (like std:...