#include "llvm/IR/IntrinsicsAMDGPU.h"
#include "llvm/IR/IntrinsicsR600.h"

#define DEBUG_TYPE "amdgpu-promote-alloca"
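// Tuning knobs for this pass: each option below either disables one of the
// two promotion strategies (alloca -> vector, alloca -> LDS) or bounds how
// much of the register budget vector promotion may consume.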
static cl::opt<bool> DisablePromoteAllocaToVector(
    "disable-promote-alloca-to-vector",
    cl::desc("Disable promote alloca to vector"));

static cl::opt<bool> DisablePromoteAllocaToLDS(
    "disable-promote-alloca-to-lds",
    cl::desc("Disable promote alloca to LDS"));

static cl::opt<unsigned> PromoteAllocaToVectorLimit(
    "amdgpu-promote-alloca-to-vector-limit",
    cl::desc("Maximum byte size to consider promote alloca to vector"));

static cl::opt<unsigned> PromoteAllocaToVectorMaxRegs(
    "amdgpu-promote-alloca-to-vector-max-regs",
    cl::desc(
        "Maximum vector size (in 32b registers) to use when promoting alloca"));

static cl::opt<unsigned> PromoteAllocaToVectorVGPRRatio(
    "amdgpu-promote-alloca-to-vector-vgpr-ratio",
    cl::desc("Ratio of VGPRs to budget for promoting alloca to vectors"));

static cl::opt<unsigned> LoopUserWeight(
    "promote-alloca-vector-loop-user-weight",
    cl::desc("The bonus weight of users of allocas within loop "
             "when sorting profitable allocas"));
struct GEPToVectorIndex {
  Value *VarIndex = nullptr;
  // ...
};

struct MemTransferInfo {
  // ...
};

struct AllocaAnalysis {
  // ...
  bool HaveSelectOrPHI = false;
  // ...
  explicit AllocaAnalysis(AllocaInst *Alloca) : Alloca(Alloca) {}
};
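// The pass implementation proper. One instance is built per function run; it
// caches the per-function limits (MaxVectorRegs, VGPRBudgetRatio) and the
// triple-derived IsAMDGCN/IsAMDHSA flags before visiting allocas.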
class AMDGPUPromoteAllocaImpl {
  // ...
  unsigned VGPRBudgetRatio;
  unsigned MaxVectorRegs;

  bool IsAMDGCN = false;
  bool IsAMDHSA = false;

  std::pair<Value *, Value *> getLocalSizeYZ(IRBuilder<> &Builder);

  bool collectAllocaUses(AllocaAnalysis &AA) const;

  bool binaryOpIsDerivedFromSameAlloca(Value *Alloca, Value *Val,
                                       Instruction *Inst, int OpIdx0,
                                       int OpIdx1) const;

  bool hasSufficientLocalMem(const Function &F);

  void analyzePromoteToVector(AllocaAnalysis &AA) const;
  void promoteAllocaToVector(AllocaAnalysis &AA);
  void analyzePromoteToLDS(AllocaAnalysis &AA) const;
  // The DeferredIntrs container type is assumed from its later .insert() use.
  bool tryPromoteAllocaToLDS(AllocaAnalysis &AA, bool SufficientLDS,
                             SetVector<IntrinsicInst *> &DeferredIntrs);

  void scoreAlloca(AllocaAnalysis &AA) const;

  void setFunctionLimits(const Function &F);

public:
  // ...
    IsAMDGCN = TT.isAMDGCN();
  // ...

  bool run(Function &F, bool PromoteToLDS);
};
    if (auto *TPC = getAnalysisIfAvailable<TargetPassConfig>())
      return AMDGPUPromoteAllocaImpl(
                 TPC->getTM<TargetMachine>(),
                 getAnalysis<LoopInfoWrapperPass>().getLoopInfo())
          .run(F, /*PromoteToLDS=*/true);
    // ...

  StringRef getPassName() const override { return "AMDGPU Promote Alloca"; }
static unsigned getMaxVGPRs(unsigned LDSBytes, const TargetMachine &TM,
                            const Function &F) {
  // ...
  if (DynamicVGPRBlockSize == 0 && ST.isDynamicVGPREnabled())
    DynamicVGPRBlockSize = ST.getDynamicVGPRBlockSize();

  unsigned MaxVGPRs = ST.getMaxNumVGPRs(
      ST.getWavesPerEU(ST.getFlatWorkGroupSizes(F), LDSBytes, F).first,
      DynamicVGPRBlockSize);

  // ...
  if (!F.hasFnAttribute(Attribute::AlwaysInline) &&
      !AMDGPU::isEntryFunctionCC(F.getCallingConv()))
    MaxVGPRs = std::min(MaxVGPRs, 32u);
  return MaxVGPRs;
}
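// Note: the clamp above pins non-inlined, non-entry functions to a 32-VGPR
// promotion budget, since a callable function's real budget is not known
// until its callers are compiled.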
char AMDGPUPromoteAlloca::ID = 0;

INITIALIZE_PASS_BEGIN(AMDGPUPromoteAlloca, DEBUG_TYPE,
                      "AMDGPU promote alloca to vector or LDS", false, false)
// ...

  bool Changed = AMDGPUPromoteAllocaImpl(TM, LI).run(F, /*PromoteToLDS=*/true);
// ...

  bool Changed = AMDGPUPromoteAllocaImpl(TM, LI).run(F, /*PromoteToLDS=*/false);
// ...

FunctionPass *createAMDGPUPromoteAlloca() { return new AMDGPUPromoteAlloca(); }
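// collectAllocaUses: worklist walk over everything reachable from the
// alloca's pointer. Pointer-producing users (GEPs, selects, phis) are pushed
// back onto the worklist; any use that lets the pointer escape (for example,
// being stored somewhere) rejects the whole alloca.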
bool AMDGPUPromoteAllocaImpl::collectAllocaUses(AllocaAnalysis &AA) const {
  // ...
                      << "  " << *Inst << "\n");
  // ...
  while (!WorkList.empty()) {
    auto *Cur = WorkList.pop_back_val();
    if (find(AA.Pointers, Cur) != AA.Pointers.end())
      continue;
    AA.Pointers.insert(Cur);
    for (auto &U : Cur->uses()) {
      // ...
        return RejectUser(Inst, "pointer escapes via store");
      // ...
      AA.Uses.push_back(&U);
      // ...
        WorkList.push_back(Inst);
      // ...
        if (!binaryOpIsDerivedFromSameAlloca(AA.Alloca, Cur, SI, 1, 2))
          return RejectUser(Inst, "select from mixed objects");
        WorkList.push_back(Inst);
        AA.HaveSelectOrPHI = true;
      // ...
        switch (Phi->getNumIncomingValues()) {
        // ...
          if (!binaryOpIsDerivedFromSameAlloca(AA.Alloca, Cur, Phi, 0, 1))
            return RejectUser(Inst, "phi from mixed objects");
          // ...
          return RejectUser(Inst, "phi with too many operands");
        }
        WorkList.push_back(Inst);
        AA.HaveSelectOrPHI = true;
      // ...
    }
  }
  return true;
}
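// scoreAlloca: rank allocas by expected payoff before spending the limited
// vectorization budget. Each use contributes a base score of 1, plus
// LoopUserWeight per level of loop nesting around it.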
void AMDGPUPromoteAllocaImpl::scoreAlloca(AllocaAnalysis &AA) const {
  // ...
  for (auto *U : AA.Uses) {
    // ...
    const unsigned UserScore =
        1 + (LoopUserWeight * LI.getLoopDepth(Inst->getParent()));
    LLVM_DEBUG(dbgs() << "  [+" << UserScore << "]:\t" << *Inst << "\n");
    // ...
  }
  // ...
}
void AMDGPUPromoteAllocaImpl::setFunctionLimits(const Function &F) {
  // ...
  const int R600MaxVectorRegs = 16;
  MaxVectorRegs = F.getFnAttributeAsParsedInteger(
      "amdgpu-promote-alloca-to-vector-max-regs",
      IsAMDGCN ? PromoteAllocaToVectorMaxRegs : R600MaxVectorRegs);
  if (PromoteAllocaToVectorMaxRegs.getNumOccurrences())
    MaxVectorRegs = PromoteAllocaToVectorMaxRegs;
  VGPRBudgetRatio = F.getFnAttributeAsParsedInteger(
      "amdgpu-promote-alloca-to-vector-vgpr-ratio",
      PromoteAllocaToVectorVGPRRatio);
  if (PromoteAllocaToVectorVGPRRatio.getNumOccurrences())
    VGPRBudgetRatio = PromoteAllocaToVectorVGPRRatio;
}
bool AMDGPUPromoteAllocaImpl::run(Function &F, bool PromoteToLDS) {
  // ...
  DL = &Mod->getDataLayout();
  // ...
  if (!ST.isPromoteAllocaEnabled())
    return false;

  bool SufficientLDS = PromoteToLDS && hasSufficientLocalMem(F);
  MaxVGPRs = getMaxVGPRs(CurrentLocalMemUsage, TM, F);
  setFunctionLimits(F);

  unsigned VectorizationBudget =
      (PromoteAllocaToVectorLimit ? PromoteAllocaToVectorLimit * 8
                                  : (MaxVGPRs * 32) / VGPRBudgetRatio);
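  // The budget is tracked in bits: the explicit cl::opt limit is given in
  // bytes (hence the * 8). The default branch is assumed from the surrounding
  // variables: a 1/VGPRBudgetRatio share of the VGPR budget at 32 bits per
  // register.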
  std::vector<AllocaAnalysis> Allocas;
  // ...
    if (!AI->isStaticAlloca() || AI->isArrayAllocation())
      continue;
    // ...
    AllocaAnalysis AA{AI};
    if (collectAllocaUses(AA)) {
      analyzePromoteToVector(AA);
      // ...
        analyzePromoteToLDS(AA);
      if (AA.Vector.Ty || AA.LDS.Enable) {
        // ...
        Allocas.push_back(std::move(AA));
      }
    }
  // ...

  stable_sort(Allocas,
              [](const auto &A, const auto &B) { return A.Score > B.Score; });

  LLVM_DEBUG({
    dbgs() << "Sorted Worklist:\n";
    for (const auto &AA : Allocas)
      dbgs() << "  " << *AA.Alloca << "\n";
  });
  for (AllocaAnalysis &AA : Allocas) {
    // ...
    const unsigned AllocaCost =
        DL->getTypeSizeInBits(AA.Alloca->getAllocatedType());
    // ...
    if (AllocaCost <= VectorizationBudget) {
      promoteAllocaToVector(AA);
      // ...
      assert((VectorizationBudget - AllocaCost) < VectorizationBudget &&
             "Underflow!");
      VectorizationBudget -= AllocaCost;
      // ...
                        << VectorizationBudget << "\n");
    // ...
                        << AllocaCost << ", budget:" << VectorizationBudget
                        << "): " << *AA.Alloca << "\n");
    // ...
    if (tryPromoteAllocaToLDS(AA, SufficientLDS, DeferredIntrs))
      // ...
  }

  finishDeferredAllocaToLDSPromotion(DeferredIntrs);
  // ...
}
  // ...
  return I->getOperand(0) == AI &&
  // ...

  if (Ptr == AA.Alloca)
    return B.getInt32(0);
  // ...
  auto I = AA.Vector.GEPVectorIdx.find(GEP);
  assert(I != AA.Vector.GEPVectorIdx.end() && "Must have entry for GEP!");
  if (!I->second.Full) {
    Value *Result = nullptr;
    B.SetInsertPoint(GEP);
    // ...
    if (I->second.VarIndex) {
      Result = I->second.VarIndex;
      Result = B.CreateSExtOrTrunc(Result, B.getInt32Ty());
      // ...
      if (I->second.VarMul)
        Result = B.CreateMul(Result, I->second.VarMul);
    }
    // ...
    if (I->second.ConstIndex) {
      if (Result)
        Result = B.CreateAdd(Result, I->second.ConstIndex);
      else
        Result = I->second.ConstIndex;
    }
    // ...
    if (!Result)
      Result = B.getInt32(0);
    I->second.Full = Result;
  }
  return I->second.Full;
}
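// Net effect: the cached Full index is ConstIndex + VarIndex * VarMul with
// absent terms dropped, materialized as an i32 at the GEP. Illustrative
// example: a GEP equivalent to &alloca[i + 3] becomes
// add(sext_or_trunc(i), 3).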
static std::optional<GEPToVectorIndex>
// ...
  unsigned BW = DL.getIndexTypeSizeInBits(GEP->getType());
  // ...
  APInt ConstOffset(BW, 0);
  // ...
    if (!CurGEP->collectOffset(DL, BW, VarOffsets, ConstOffset))
      return std::nullopt;
    // ...
    CurPtr = CurGEP->getPointerOperand();
  // ...
  assert(CurPtr == Alloca && "GEP not based on alloca");

  int64_t VecElemSize = DL.getTypeAllocSize(VecElemTy);
  if (VarOffsets.size() > 1)
    return std::nullopt;
  // ...
  GEPToVectorIndex Result;
  // ...
  if (!ConstOffset.isZero())
    Result.ConstIndex = ConstantInt::get(Ctx, IndexQuot.sextOrTrunc(BW));

  if (VarOffsets.empty())
    return Result;

  const auto &VarOffset = VarOffsets.front();
  // ...
  if (Rem != 0 || OffsetQuot.isZero())
    return std::nullopt;

  Result.VarIndex = VarOffset.first;
  // ...
  if (!OffsetQuot.isOne())
    Result.VarMul = ConstantInt::get(Ctx, OffsetQuot.sextOrTrunc(BW));
  return Result;
}
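// The decomposition divides both the constant and the variable byte offset
// by the element size. Illustrative example: for i32 elements (size 4) and a
// GEP at byte offset 8 + 12*i, ConstIndex = 2, VarIndex = i and VarMul = 3;
// any non-zero remainder in either division rejects the GEP.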
// ...
                                         unsigned VecStoreSize,
                                         unsigned ElementSize,
// ...
  Builder.SetInsertPoint(Inst);

  const auto CreateTempPtrIntCast = [&Builder, DL](Value *Val,
                                                   Type *PtrTy) -> Value * {
    assert(DL.getTypeStoreSize(Val->getType()) == DL.getTypeStoreSize(PtrTy));
    const unsigned Size = DL.getTypeStoreSizeInBits(PtrTy);
    if (!PtrTy->isVectorTy())
      return Builder.CreateBitOrPointerCast(Val, Builder.getIntNTy(Size));
    // ...
    assert((Size % NumPtrElts == 0) && "Vector size not divisible");
    // ...
    return Builder.CreateBitOrPointerCast(
        Val, FixedVectorType::get(Builder.getIntNTy(Size / NumPtrElts),
                                  NumPtrElts));
  };

  Type *VecEltTy = AA.Vector.Ty->getElementType();
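  // CreateTempPtrIntCast (above) bridges pointer and non-pointer payloads:
  // pointers cannot be bitcast directly to arbitrary types, so values are
  // round-tripped through an integer, or a vector of equal-width integers,
  // of the same total store size.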
  switch (Inst->getOpcode()) {
  case Instruction::Load: {
    Value *CurVal = GetCurVal();
    // ...
    TypeSize AccessSize = DL.getTypeStoreSize(AccessTy);
    // ...
    if (CI->isZeroValue() && AccessSize == VecStoreSize) {
      // ...
        CurVal = CreateTempPtrIntCast(CurVal, AccessTy);
      // ...
        CurVal = CreateTempPtrIntCast(CurVal, CurVal->getType());
      Value *NewVal = Builder.CreateBitOrPointerCast(CurVal, AccessTy);
      // ...
    }
    // ...
    const unsigned NumLoadedElts = AccessSize / DL.getTypeStoreSize(VecEltTy);
    // ...
    assert(DL.getTypeStoreSize(SubVecTy) == DL.getTypeStoreSize(AccessTy));
    // ...
    TypeSize NumBits = DL.getTypeStoreSize(SubVecTy) * 8u;
    // ...
    bool IsAlignedLoad = NumBits <= (LoadAlign * 8u);
    // ...
    bool IsProperlyDivisible = TotalNumElts % NumLoadedElts == 0;
    if (/* ... */ IsProperlyDivisible && IsAlignedLoad) {
      // ...
      const unsigned NewNumElts =
          DL.getTypeStoreSize(VectorTy) * 8u / NumBits;
      const unsigned LShrAmt = llvm::Log2_32(SubVecTy->getNumElements());
      // ...
      Value *BCVal = Builder.CreateBitCast(CurVal, BitCastTy);
      Value *NewIdx = Builder.CreateLShr(
          Index, ConstantInt::get(Index->getType(), LShrAmt));
      Value *ExtVal = Builder.CreateExtractElement(BCVal, NewIdx);
      Value *BCOut = Builder.CreateBitCast(ExtVal, AccessTy);
      // ...
    }
    // ...
    for (unsigned K = 0; K < NumLoadedElts; ++K) {
      Value *CurIdx =
          Builder.CreateAdd(Index, ConstantInt::get(Index->getType(), K));
      SubVec = Builder.CreateInsertElement(
          SubVec, Builder.CreateExtractElement(CurVal, CurIdx), K);
    }
    // ...
      SubVec = CreateTempPtrIntCast(SubVec, AccessTy);
    else if (SubVecTy->isPtrOrPtrVectorTy())
      SubVec = CreateTempPtrIntCast(SubVec, SubVecTy);
    // ...
      SubVec = Builder.CreateBitOrPointerCast(SubVec, AccessTy);
    // ...

    Value *ExtractElement = Builder.CreateExtractElement(CurVal, Index);
    if (AccessTy != VecEltTy)
      ExtractElement = Builder.CreateBitOrPointerCast(ExtractElement, AccessTy);
    // ...
  }
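  // Fast path above: when the loaded subvector evenly tiles the promoted
  // vector and the load is aligned enough, the vector is bitcast to fewer,
  // wider lanes and a single lane is extracted, the lane index being the
  // element index shifted right by log2(subvector length). The fallback loop
  // instead gathers NumLoadedElts consecutive elements one extract/insert at
  // a time.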
  case Instruction::Store: {
    // ...
    Value *Val = SI->getValueOperand();
    // ...
    TypeSize AccessSize = DL.getTypeStoreSize(AccessTy);
    // ...
    if (CI->isZeroValue() && AccessSize == VecStoreSize) {
      // ...
        Val = CreateTempPtrIntCast(Val, AccessTy);
      else if (AA.Vector.Ty->isPtrOrPtrVectorTy())
        Val = CreateTempPtrIntCast(Val, AA.Vector.Ty);
      return Builder.CreateBitOrPointerCast(Val, AA.Vector.Ty);
    }
    // ...
    const unsigned NumWrittenElts =
        AccessSize / DL.getTypeStoreSize(VecEltTy);
    const unsigned NumVecElts = AA.Vector.Ty->getNumElements();
    // ...
    assert(DL.getTypeStoreSize(SubVecTy) == DL.getTypeStoreSize(AccessTy));
    // ...
    if (SubVecTy->isPtrOrPtrVectorTy())
      Val = CreateTempPtrIntCast(Val, SubVecTy);
    // ...
      Val = CreateTempPtrIntCast(Val, AccessTy);
    // ...
      Val = Builder.CreateBitOrPointerCast(Val, SubVecTy);

    Value *CurVec = GetCurVal();
    for (unsigned K = 0, NumElts = std::min(NumWrittenElts, NumVecElts);
         K < NumElts; ++K) {
      Value *CurIdx =
          Builder.CreateAdd(Index, ConstantInt::get(Index->getType(), K));
      CurVec = Builder.CreateInsertElement(
          CurVec, Builder.CreateExtractElement(Val, K), CurIdx);
    }
    // ...

    if (Val->getType() != VecEltTy)
      Val = Builder.CreateBitOrPointerCast(Val, VecEltTy);
    return Builder.CreateInsertElement(GetCurVal(), Val, Index);
  }
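  // The store path above is a read-modify-write on the SSA vector value: the
  // incoming vector is updated via insertelement (one lane for a scalar
  // store, a loop of lanes for a subvector store) and the updated vector is
  // returned so it can be registered as the block's current value.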
  case Instruction::Call: {
    // ...
      unsigned NumCopied = Length->getZExtValue() / ElementSize;
      MemTransferInfo *TI = &AA.Vector.TransferInfo[MTI];
      // ...
      for (unsigned Idx = 0; Idx < AA.Vector.Ty->getNumElements(); ++Idx) {
        if (Idx >= DestBegin && Idx < DestBegin + NumCopied) {
          // ...
        }
        // ...
      }
      // ...
      return Builder.CreateShuffleVector(GetCurVal(), Mask);
    // ...
      Value *Elt = MSI->getOperand(1);
      const unsigned BytesPerElt = DL.getTypeStoreSize(VecEltTy);
      if (BytesPerElt > 1) {
        Value *EltBytes = Builder.CreateVectorSplat(BytesPerElt, Elt);
        // ...
          Elt = Builder.CreateBitCast(EltBytes, PtrInt);
          Elt = Builder.CreateIntToPtr(Elt, VecEltTy);
        // ...
          Elt = Builder.CreateBitCast(EltBytes, VecEltTy);
      }
      return Builder.CreateVectorSplat(AA.Vector.Ty->getElementCount(), Elt);
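    // Memset lowering above: the byte operand is splatted BytesPerElt times
    // to form one element's worth of bytes, bitcast to the element type (via
    // an integer plus inttoptr for pointer elements), then splatted across
    // the whole vector. A memcpy/memmove inside the same alloca becomes a
    // single shufflevector whose mask remaps the copied lanes.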
    if (Intr->getIntrinsicID() == Intrinsic::objectsize) {
      Intr->replaceAllUsesWith(
          Builder.getIntN(Intr->getType()->getIntegerBitWidth(),
                          DL.getTypeAllocSize(AA.Vector.Ty)));
      // ...
    }
  // ...

  TypeSize AccTS = DL.getTypeStoreSize(AccessTy);
  // ...
  if (AccTS * 8 != DL.getTypeSizeInBits(AccessTy))
    return false;
template <typename InstContainer>
// ...
    auto &BlockUses = UsesByBlock[BB];
    // ...
    if (BlockUses.empty())
      continue;
    // ...
    if (BlockUses.size() == 1) {
      // ...
    }
    // ...
    if (!BlockUses.contains(&Inst))
      continue;
FixedVectorType *
AMDGPUPromoteAllocaImpl::getVectorTypeForAlloca(Type *AllocaTy) const {
  if (DisablePromoteAllocaToVector) {
    // ...
    return nullptr;
  }
  // ...
  uint64_t NumElems = 1;
  // ...
    NumElems *= ArrayTy->getNumElements();
    ElemTy = ArrayTy->getElementType();
  // ...
    NumElems *= InnerVectorTy->getNumElements();
    ElemTy = InnerVectorTy->getElementType();
  // ...
  unsigned ElementSize = DL->getTypeSizeInBits(ElemTy) / 8;
  if (ElementSize > 0) {
    unsigned AllocaSize = DL->getTypeStoreSize(AllocaTy);
    // ...
    if (NumElems * ElementSize != AllocaSize)
      NumElems = AllocaSize / ElementSize;
    if (NumElems > 0 && (AllocaSize % ElementSize) == 0)
      // ...
  }
  // ...
  const unsigned MaxElements =
      (MaxVectorRegs * 32) / DL->getTypeSizeInBits(VectorTy->getElementType());

  if (VectorTy->getNumElements() > MaxElements ||
      VectorTy->getNumElements() < 2) {
    // ...
                      << " has an unsupported number of elements\n");
    return nullptr;
  }

  Type *VecEltTy = VectorTy->getElementType();
  unsigned ElementSizeInBits = DL->getTypeSizeInBits(VecEltTy);
  if (ElementSizeInBits != DL->getTypeAllocSizeInBits(VecEltTy)) {
    LLVM_DEBUG(dbgs() << "  Cannot convert to vector if the allocation size "
                         "does not match the type's size\n");
    return nullptr;
  }
  // ...
}
void AMDGPUPromoteAllocaImpl::analyzePromoteToVector(AllocaAnalysis &AA) const {
  if (AA.HaveSelectOrPHI) {
    LLVM_DEBUG(dbgs() << "  Cannot convert to vector due to select or phi\n");
    return;
  }
  // ...
  Type *AllocaTy = AA.Alloca->getAllocatedType();
  AA.Vector.Ty = getVectorTypeForAlloca(AllocaTy);
  // ...
    LLVM_DEBUG(dbgs() << "  Cannot promote alloca to vector: " << Msg << "\n"
                      << "  " << *Inst << "\n");
    AA.Vector.Ty = nullptr;
  };
  // ...
  Type *VecEltTy = AA.Vector.Ty->getElementType();
  unsigned ElementSize = DL->getTypeSizeInBits(VecEltTy) / 8;
  // ...
  for (auto *U : AA.Uses) {
    // ...
      return RejectUser(Inst, "unsupported load/store as aggregate");
    // ...
      return RejectUser(Inst, "not a simple load or store");
    Ptr = Ptr->stripPointerCasts();
    // ...
    if (Ptr == AA.Alloca &&
        DL->getTypeStoreSize(AA.Alloca->getAllocatedType()) ==
            DL->getTypeStoreSize(AccessTy)) {
      AA.Vector.Worklist.push_back(Inst);
      continue;
    }
    // ...
      return RejectUser(Inst, "not a supported access type");

    AA.Vector.Worklist.push_back(Inst);
    // ...
      return RejectUser(Inst, "cannot compute vector index for GEP");

    AA.Vector.GEPVectorIdx[GEP] = std::move(Index.value());
    AA.Vector.UsersToRemove.push_back(Inst);
    // ...
      AA.Vector.Worklist.push_back(Inst);
    // ...
    if (TransferInst->isVolatile())
      return RejectUser(Inst, "mem transfer inst is volatile");
    // ...
    if (!Len || (Len->getZExtValue() % ElementSize))
      return RejectUser(Inst, "mem transfer inst length is non-constant or "
                              "not a multiple of the vector element size");
    // ...
      if (Ptr == AA.Alloca)
        return ConstantInt::get(Ptr->getContext(), APInt(32, 0));
      // ...
      const auto &GEPI = AA.Vector.GEPVectorIdx.find(GEP)->second;
      // ...
      if (GEPI.ConstIndex)
        return GEPI.ConstIndex;
      return ConstantInt::get(Ptr->getContext(), APInt(32, 0));
    // ...
    MemTransferInfo *TI =
        &AA.Vector.TransferInfo.try_emplace(TransferInst).first->second;
    unsigned OpNum = U->getOperandNo();
    // ...
      Value *Dest = TransferInst->getDest();
      // ...
        return RejectUser(Inst, "could not calculate constant dest index");
      TI->DestIndex = Index;
    // ...
      Value *Src = TransferInst->getSource();
      // ...
        return RejectUser(Inst, "could not calculate constant src index");
      TI->SrcIndex = Index;
    // ...
    if (Intr->getIntrinsicID() == Intrinsic::objectsize) {
      AA.Vector.Worklist.push_back(Inst);
      continue;
    }
    // ...
      return RejectUser(Inst, "assume-like intrinsic cannot have any users");
    AA.Vector.UsersToRemove.push_back(Inst);
    // ...
      return isAssumeLikeIntrinsic(cast<Instruction>(U));
    // ...
    AA.Vector.UsersToRemove.push_back(Inst);
    // ...
    return RejectUser(Inst, "unhandled alloca user");
  }

  for (const auto &Entry : AA.Vector.TransferInfo) {
    const MemTransferInfo &TI = Entry.second;
    if (!TI.SrcIndex || !TI.DestIndex)
      return RejectUser(Entry.first,
                        "mem transfer inst between different objects");
    AA.Vector.Worklist.push_back(Entry.first);
  }
}
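// A memcpy/memmove is only vectorizable when both of its pointer operands
// resolve to constant lane indices in this same alloca. SrcIndex and
// DestIndex are filled by separate visits (one per operand), so any
// half-filled entry means the other side was a different object, and it is
// rejected in the loop above.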
void AMDGPUPromoteAllocaImpl::promoteAllocaToVector(AllocaAnalysis &AA) {
  // ...
  LLVM_DEBUG(dbgs() << "  type conversion: " << *AA.Alloca->getAllocatedType()
                    << " -> " << *AA.Vector.Ty << '\n');
  const unsigned VecStoreSize = DL->getTypeStoreSize(AA.Vector.Ty);
  // ...
  Type *VecEltTy = AA.Vector.Ty->getElementType();
  const unsigned ElementSize = DL->getTypeSizeInBits(VecEltTy) / 8;
  // ...
    BasicBlock *BB = I->getParent();
    auto GetCurVal = [&]() -> Value * {
      if (Value *CurVal = Updater.FindValueForBlock(BB))
        return CurVal;
      // ...
      if (!Placeholders.empty() && Placeholders.back()->getParent() == BB)
        return Placeholders.back();
      // ...
      IRBuilder<> Builder(I);
      auto *Placeholder = cast<Instruction>(Builder.CreateFreeze(
          PoisonValue::get(AA.Vector.Ty), "promotealloca.placeholder"));
      Placeholders.push_back(Placeholder);
      return Placeholders.back();
    };
    // ...
                                 ElementSize, GetCurVal);
    // ...

    Placeholder->replaceAllUsesWith(
        Updater.GetValueInMiddleOfBlock(Placeholder->getParent()));
    Placeholder->eraseFromParent();
  // ...
    I->eraseFromParent();
  // ...
    I->dropDroppableUses();
    // ...
    I->eraseFromParent();
  // ...
  AA.Alloca->eraseFromParent();
}
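// SSA-construction trick used above: uses are rewritten in program order,
// but a block's incoming vector value may not be known yet, so a
// freeze(poison) "promotealloca.placeholder" instruction stands in for it.
// After all rewrites, each placeholder is replaced with the value the
// SSAUpdater computes for its block (GetValueInMiddleOfBlock) and erased.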
std::pair<Value *, Value *>
AMDGPUPromoteAllocaImpl::getLocalSizeYZ(IRBuilder<> &Builder) {
  // ...
    ST.makeLIDRangeMetadata(LocalSizeY);
    ST.makeLIDRangeMetadata(LocalSizeZ);
    return std::pair(LocalSizeY, LocalSizeZ);
  // ...
  F.removeFnAttr("amdgpu-no-dispatch-ptr");
  // ...
  LoadXY->setMetadata(LLVMContext::MD_invariant_load, MD);
  LoadZU->setMetadata(LLVMContext::MD_invariant_load, MD);
  ST.makeLIDRangeMetadata(LoadZU);
  // ...
  return std::pair(Y, LoadZU);
}

// ...
    IntrID = IsAMDGCN ? (Intrinsic::ID)Intrinsic::amdgcn_workitem_id_x
                      : (Intrinsic::ID)Intrinsic::r600_read_tidig_x;
    AttrName = "amdgpu-no-workitem-id-x";
    break;
  // ...
    IntrID = IsAMDGCN ? (Intrinsic::ID)Intrinsic::amdgcn_workitem_id_y
                      : (Intrinsic::ID)Intrinsic::r600_read_tidig_y;
    AttrName = "amdgpu-no-workitem-id-y";
    break;
  // ...
    IntrID = IsAMDGCN ? (Intrinsic::ID)Intrinsic::amdgcn_workitem_id_z
                      : (Intrinsic::ID)Intrinsic::r600_read_tidig_z;
    AttrName = "amdgpu-no-workitem-id-z";
    break;
  // ...
  ST.makeLIDRangeMetadata(CI);
  F->removeFnAttr(AttrName);
  switch (II->getIntrinsicID()) {
  case Intrinsic::memcpy:
  case Intrinsic::memmove:
  case Intrinsic::memset:
  case Intrinsic::lifetime_start:
  case Intrinsic::lifetime_end:
  case Intrinsic::invariant_start:
  case Intrinsic::invariant_end:
  case Intrinsic::launder_invariant_group:
  case Intrinsic::strip_invariant_group:
  case Intrinsic::objectsize:
    return true;
  default:
    return false;
  }

bool AMDGPUPromoteAllocaImpl::binaryOpIsDerivedFromSameAlloca(
    // ...
  if (OtherObj != BaseAlloca) {
    LLVM_DEBUG(
        dbgs() << "Found a binary instruction with another alloca object\n");
    return false;
  }
  // ...
}
void AMDGPUPromoteAllocaImpl::analyzePromoteToLDS(AllocaAnalysis &AA) const {
  if (DisablePromoteAllocaToLDS) {
    // ...
    return;
  }
  // ...
  const Function &ContainingFunction = *AA.Alloca->getFunction();
  // ...
    // ...
        << " promote alloca to LDS not supported with calling convention.\n");
    return;
  // ...
      if (find(AA.LDS.Worklist, User) == AA.LDS.Worklist.end())
        AA.LDS.Worklist.push_back(User);
    // ...
    if (UseInst->getOpcode() == Instruction::PtrToInt)
      return;
    // ...
    if (LI->isVolatile())
      return;
    // ...
    if (SI->isVolatile())
      return;
    // ...
    if (RMW->isVolatile())
      return;
    // ...
    if (CAS->isVolatile())
      return;
    // ...
      if (!binaryOpIsDerivedFromSameAlloca(AA.Alloca, Use->get(), ICmp, 0, 1))
        return;
      // ...
      if (find(AA.LDS.Worklist, User) == AA.LDS.Worklist.end())
        AA.LDS.Worklist.push_back(ICmp);
    // ...
    if (!GEP->isInBounds())
      return;
    // ...
    if (find(AA.LDS.Worklist, User) == AA.LDS.Worklist.end())
      AA.LDS.Worklist.push_back(User);
  // ...
  AA.LDS.Enable = true;
}
bool AMDGPUPromoteAllocaImpl::hasSufficientLocalMem(const Function &F) {
  // ...
  for (Type *ParamTy : FTy->params()) {
    // ...
      LLVM_DEBUG(dbgs() << "Function has local memory argument. Promoting to "
                           "local memory disabled.\n");
      return false;
    // ...
  }

  LocalMemLimit = ST.getAddressableLocalMemorySize();
  if (LocalMemLimit == 0)
    return false;
  // ...
    if (Use->getFunction() == &F)
      // ...
    if (VisitedConstants.insert(C).second)
      // ...
  if (visitUsers(&GV, &GV)) {
    // ...
  }
  while (!Stack.empty()) {
    // ...
    if (visitUsers(&GV, C)) {
      // ...
    }
  }
  // ...
  LLVM_DEBUG(dbgs() << "Function has a reference to externally allocated "
                       "local memory. Promoting to local memory "
  // ...

  CurrentLocalMemUsage = 0;
  // ...
  for (auto Alloc : AllocatedSizes) {
    CurrentLocalMemUsage = alignTo(CurrentLocalMemUsage, Alloc.second);
    CurrentLocalMemUsage += Alloc.first;
  }

  unsigned MaxOccupancy =
      ST.getWavesPerEU(ST.getFlatWorkGroupSizes(F), CurrentLocalMemUsage, F)
          .second;
  // ...
  unsigned MaxSizeWithWaveCount =
      ST.getMaxLocalMemSizeWithWaveCount(MaxOccupancy, F);

  if (CurrentLocalMemUsage > MaxSizeWithWaveCount)
    return false;

  LocalMemLimit = MaxSizeWithWaveCount;

  // ...
                    << " bytes of LDS\n"
                    << "  Rounding size to " << MaxSizeWithWaveCount
                    << " with a maximum occupancy of " << MaxOccupancy << '\n'
                    << " and " << (LocalMemLimit - CurrentLocalMemUsage)
                    << " available for promotion\n");

  return true;
}
bool AMDGPUPromoteAllocaImpl::tryPromoteAllocaToLDS(
    AllocaAnalysis &AA, bool SufficientLDS,
    SetVector<IntrinsicInst *> &DeferredIntrs) {
  // ...
  const Function &ContainingFunction = *AA.Alloca->getParent()->getParent();
  // ...
  unsigned WorkGroupSize = ST.getFlatWorkGroupSizes(ContainingFunction).second;
  // ...
  Align Alignment = DL.getValueOrABITypeAlignment(
      AA.Alloca->getAlign(), AA.Alloca->getAllocatedType());
  // ...
  uint32_t NewSize = alignTo(CurrentLocalMemUsage, Alignment);
  uint32_t AllocSize =
      WorkGroupSize * DL.getTypeAllocSize(AA.Alloca->getAllocatedType());
  NewSize += AllocSize;

  if (NewSize > LocalMemLimit) {
    // ...
        << " bytes of local memory not available to promote\n");
    return false;
  }

  CurrentLocalMemUsage = NewSize;
  // ...
      Twine(F->getName()) + Twine('.') + AA.Alloca->getName(), nullptr,
  // ...

  Value *TCntY, *TCntZ;
  std::tie(TCntY, TCntZ) = getLocalSizeYZ(Builder);
  Value *TIdX = getWorkitemID(Builder, 0);
  Value *TIdY = getWorkitemID(Builder, 1);
  Value *TIdZ = getWorkitemID(Builder, 2);
  // ...
  AA.Alloca->mutateType(Offset->getType());
  AA.Alloca->replaceAllUsesWith(Offset);
  AA.Alloca->eraseFromParent();
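  // Every workitem gets a private slice of the new LDS global, which is
  // sized as WorkGroupSize * sizeof(alloca). Each lane locates its slice
  // through a linearized workitem id built from TIdX/TIdY/TIdZ and the local
  // sizes TCntY/TCntZ, and the alloca's uses are redirected to the resulting
  // Offset pointer.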
  for (Value *V : AA.LDS.Worklist) {
    // ...
    assert(V->getType()->isPtrOrPtrVectorTy());
    // ...
    Type *NewTy = V->getType()->getWithNewType(NewPtrTy);
    V->mutateType(NewTy);
    // ...
      for (unsigned I = 0, E = Phi->getNumIncomingValues(); I != E; ++I) {
        // ...
            Phi->getIncomingValue(I)))
          // ...
      }
    // ...
  }
  // ...
  switch (Intr->getIntrinsicID()) {
  case Intrinsic::lifetime_start:
  case Intrinsic::lifetime_end:
    // ...
  case Intrinsic::memcpy:
  case Intrinsic::memmove:
    // ...
    DeferredIntrs.insert(Intr);
    continue;
  case Intrinsic::memset: {
    // ...
  }
  case Intrinsic::invariant_start:
  case Intrinsic::invariant_end:
  case Intrinsic::launder_invariant_group:
  case Intrinsic::strip_invariant_group: {
    // ...
  }
  // ...
  case Intrinsic::objectsize: {
    // ...
        Intrinsic::objectsize,
    // ...
  }
  }
  // ...

void AMDGPUPromoteAllocaImpl::finishDeferredAllocaToLDSPromotion(
    SetVector<IntrinsicInst *> &DeferredIntrs) {
  // ...
    assert(ID == Intrinsic::memcpy || ID == Intrinsic::memmove);

    CallInst *B = Builder.CreateMemTransferInst(
        ID, MI->getRawDest(), MI->getDestAlign(), MI->getRawSource(),
        MI->getSourceAlign(), MI->getLength(), MI->isVolatile());

    for (unsigned I = 0; I != 2; ++I) {
      // ...
        B->addDereferenceableParamAttr(I, Bytes);
      // ...
    }
  // ...
}