17#include "llvm/IR/IntrinsicsAMDGPU.h"
18#include "llvm/IR/IntrinsicsR600.h"
23#define DEBUG_TYPE "amdgpu-attributor"
28 "amdgpu-indirect-call-specialization-threshold",
30 "A threshold controls whether an indirect call will be specialized"),
33#define AMDGPU_ATTRIBUTE(Name, Str) Name##_POS,
36#include "AMDGPUAttributes.def"
40#define AMDGPU_ATTRIBUTE(Name, Str) Name = 1 << Name##_POS,
44#include "AMDGPUAttributes.def"
49#define AMDGPU_ATTRIBUTE(Name, Str) {Name, Str},
50static constexpr std::pair<ImplicitArgumentMask, StringLiteral>
52#include "AMDGPUAttributes.def"
62 bool HasApertureRegs,
bool SupportsGetDoorBellID,
63 unsigned CodeObjectVersion) {
65 case Intrinsic::amdgcn_workitem_id_x:
68 case Intrinsic::amdgcn_workgroup_id_x:
70 return WORKGROUP_ID_X;
71 case Intrinsic::amdgcn_workitem_id_y:
72 case Intrinsic::r600_read_tidig_y:
74 case Intrinsic::amdgcn_workitem_id_z:
75 case Intrinsic::r600_read_tidig_z:
77 case Intrinsic::amdgcn_workgroup_id_y:
78 case Intrinsic::r600_read_tgid_y:
79 return WORKGROUP_ID_Y;
80 case Intrinsic::amdgcn_workgroup_id_z:
81 case Intrinsic::r600_read_tgid_z:
82 return WORKGROUP_ID_Z;
83 case Intrinsic::amdgcn_cluster_id_x:
86 case Intrinsic::amdgcn_cluster_id_y:
88 case Intrinsic::amdgcn_cluster_id_z:
90 case Intrinsic::amdgcn_lds_kernel_id:
92 case Intrinsic::amdgcn_dispatch_ptr:
94 case Intrinsic::amdgcn_dispatch_id:
96 case Intrinsic::amdgcn_implicitarg_ptr:
97 return IMPLICIT_ARG_PTR;
100 case Intrinsic::amdgcn_queue_ptr:
103 case Intrinsic::amdgcn_is_shared:
104 case Intrinsic::amdgcn_is_private:
112 case Intrinsic::amdgcn_wwm:
113 case Intrinsic::amdgcn_strict_wwm:
114 return WHOLE_WAVE_MODE;
115 case Intrinsic::trap:
116 case Intrinsic::debugtrap:
117 case Intrinsic::ubsantrap:
118 if (SupportsGetDoorBellID)
142 return F.hasFnAttribute(Attribute::SanitizeAddress) ||
143 F.hasFnAttribute(Attribute::SanitizeThread) ||
144 F.hasFnAttribute(Attribute::SanitizeMemory) ||
145 F.hasFnAttribute(Attribute::SanitizeHWAddress) ||
146 F.hasFnAttribute(Attribute::SanitizeMemTag);
152 AMDGPUInformationCache(
const Module &M, AnalysisGetter &AG,
154 SetVector<Function *> *
CGSCC, TargetMachine &TM)
160 enum ConstantStatus : uint8_t {
163 ADDR_SPACE_CAST_PRIVATE_TO_FLAT = 1 << 1,
164 ADDR_SPACE_CAST_LOCAL_TO_FLAT = 1 << 2,
165 ADDR_SPACE_CAST_BOTH_TO_FLAT =
166 ADDR_SPACE_CAST_PRIVATE_TO_FLAT | ADDR_SPACE_CAST_LOCAL_TO_FLAT,
167 CS_WORST = DS_GLOBAL | ADDR_SPACE_CAST_BOTH_TO_FLAT,
171 bool hasApertureRegs(Function &
F) {
172 const GCNSubtarget &
ST = TM.getSubtarget<GCNSubtarget>(
F);
173 return ST.hasApertureRegs();
177 bool supportsGetDoorbellID(Function &
F) {
178 const GCNSubtarget &
ST = TM.getSubtarget<GCNSubtarget>(
F);
179 return ST.supportsGetDoorbellID();
182 std::optional<std::pair<unsigned, unsigned>>
183 getFlatWorkGroupSizeAttr(
const Function &
F)
const {
187 return std::make_pair(
R->first, *(
R->second));
190 std::pair<unsigned, unsigned>
191 getDefaultFlatWorkGroupSize(
const Function &
F)
const {
192 const GCNSubtarget &
ST = TM.getSubtarget<GCNSubtarget>(
F);
193 return ST.getDefaultFlatWorkGroupSize(
F.getCallingConv());
196 std::pair<unsigned, unsigned>
197 getMaximumFlatWorkGroupRange(
const Function &
F) {
198 const GCNSubtarget &
ST = TM.getSubtarget<GCNSubtarget>(
F);
199 return {
ST.getMinFlatWorkGroupSize(),
ST.getMaxFlatWorkGroupSize()};
/// Accessor for the code object version held in this cache's
/// CodeObjectVersion member. Does not modify the cache.
203 unsigned getCodeObjectVersion()
const {
return CodeObjectVersion; }
205 std::optional<std::pair<unsigned, unsigned>>
206 getWavesPerEUAttr(
const Function &
F) {
212 const GCNSubtarget &
ST = TM.getSubtarget<GCNSubtarget>(
F);
213 Val->second =
ST.getMaxWavesPerEU();
215 return std::make_pair(Val->first, *(Val->second));
219 const GCNSubtarget &
ST = TM.getSubtarget<GCNSubtarget>(
F);
220 return ST.getMaxWavesPerEU();
223 unsigned getMaxAddrSpace()
const override {
230 static uint8_t visitConstExpr(
const ConstantExpr *CE) {
231 uint8_t Status = NONE;
233 if (
CE->getOpcode() == Instruction::AddrSpaceCast) {
234 unsigned SrcAS =
CE->getOperand(0)->getType()->getPointerAddressSpace();
236 Status |= ADDR_SPACE_CAST_PRIVATE_TO_FLAT;
238 Status |= ADDR_SPACE_CAST_LOCAL_TO_FLAT;
245 uint8_t getConstantAccess(
const Constant *
C) {
246 const auto &It = ConstantStatus.find(
C);
247 if (It != ConstantStatus.end())
248 return It->second.value();
250 SmallPtrSet<const Constant *, 8> Visited;
256 while (Result != CS_WORST && !Worklist.
empty()) {
259 std::optional<uint8_t> &CurCResultOrNone = ConstantStatus[CurC];
260 if (CurCResultOrNone) {
261 Result |= CurCResultOrNone.value();
264 uint8_t CurCResult = 0;
267 CurCResult |= DS_GLOBAL;
270 CurCResult |= visitConstExpr(CE);
272 for (
const Use &U : CurC->
operands()) {
274 if (Visited.
insert(OpC).second)
279 CurCResultOrNone = CurCResult;
289 bool needsQueuePtr(
const Constant *
C, Function &Fn) {
291 bool HasAperture = hasApertureRegs(Fn);
294 if (!IsNonEntryFunc && HasAperture)
297 uint8_t
Access = getConstantAccess(
C);
300 if (IsNonEntryFunc && (
Access & DS_GLOBAL))
303 return !HasAperture && (
Access & ADDR_SPACE_CAST_BOTH_TO_FLAT);
306 bool checkConstForAddrSpaceCastFromPrivate(
const Constant *
C) {
307 uint8_t
Access = getConstantAccess(
C);
308 return Access & ADDR_SPACE_CAST_PRIVATE_TO_FLAT;
313 DenseMap<const Constant *, std::optional<uint8_t>> ConstantStatus;
314 const unsigned CodeObjectVersion;
317struct AAAMDAttributes
318 :
public StateWrapper<BitIntegerState<uint32_t, ALL_ARGUMENT_MASK, 0>,
320 using Base = StateWrapper<BitIntegerState<uint32_t, ALL_ARGUMENT_MASK, 0>,
/// Construct the attribute for position \p IRP by forwarding it to Base.
/// The Attributor argument \p A is accepted but not used here.
323 AAAMDAttributes(
const IRPosition &IRP, Attributor &
A) : Base(IRP) {}
326 static AAAMDAttributes &createForPosition(
const IRPosition &IRP,
/// Name of this abstract attribute; always the literal "AAAMDAttributes".
330 StringRef
getName()
const override {
return "AAAMDAttributes"; }
/// Returns the address of the class-wide static ID member.
/// NOTE(review): presumably this address is what identifies the attribute
/// kind (see classof) -- confirm against the AbstractAttribute interface.
333 const char *getIdAddr()
const override {
return &ID; }
337 static bool classof(
const AbstractAttribute *AA) {
342 static const char ID;
344const char AAAMDAttributes::ID = 0;
346struct AAUniformWorkGroupSize
347 :
public StateWrapper<BooleanState, AbstractAttribute> {
348 using Base = StateWrapper<BooleanState, AbstractAttribute>;
349 AAUniformWorkGroupSize(
const IRPosition &IRP, Attributor &
A) : Base(IRP) {}
352 static AAUniformWorkGroupSize &createForPosition(
const IRPosition &IRP,
356 StringRef
getName()
const override {
return "AAUniformWorkGroupSize"; }
359 const char *getIdAddr()
const override {
return &ID; }
363 static bool classof(
const AbstractAttribute *AA) {
368 static const char ID;
370const char AAUniformWorkGroupSize::ID = 0;
372struct AAUniformWorkGroupSizeFunction :
public AAUniformWorkGroupSize {
373 AAUniformWorkGroupSizeFunction(
const IRPosition &IRP, Attributor &
A)
374 : AAUniformWorkGroupSize(IRP,
A) {}
378 CallingConv::ID CC =
F->getCallingConv();
380 if (CC != CallingConv::AMDGPU_KERNEL)
383 bool InitialValue =
F->hasFnAttribute(
"uniform-work-group-size");
386 indicateOptimisticFixpoint();
388 indicatePessimisticFixpoint();
394 auto CheckCallSite = [&](AbstractCallSite CS) {
397 <<
"->" << getAssociatedFunction()->
getName() <<
"\n");
399 const auto *CallerInfo =
A.getAAFor<AAUniformWorkGroupSize>(
401 if (!CallerInfo || !CallerInfo->isValidState())
405 CallerInfo->getState());
410 bool AllCallSitesKnown =
true;
411 if (!
A.checkForAllCallSites(CheckCallSite, *
this,
true, AllCallSitesKnown))
412 return indicatePessimisticFixpoint();
419 return ChangeStatus::UNCHANGED;
421 LLVMContext &Ctx = getAssociatedFunction()->getContext();
422 return A.manifestAttrs(getIRPosition(),
423 {Attribute::get(Ctx,
"uniform-work-group-size")},
427 bool isValidState()
const override {
432 const std::string getAsStr(Attributor *)
const override {
433 return "AMDWorkGroupSize[" + std::to_string(getAssumed()) +
"]";
/// Empty override: this attribute records no statistics.
437 void trackStatistics()
const override {}
440AAUniformWorkGroupSize &
441AAUniformWorkGroupSize::createForPosition(
const IRPosition &IRP,
444 return *
new (
A.Allocator) AAUniformWorkGroupSizeFunction(IRP,
A);
446 "AAUniformWorkGroupSize is only valid for function position");
449struct AAAMDAttributesFunction :
public AAAMDAttributes {
450 AAAMDAttributesFunction(
const IRPosition &IRP, Attributor &
A)
451 : AAAMDAttributes(IRP,
A) {}
463 if (HasSanitizerAttrs) {
464 removeAssumedBits(IMPLICIT_ARG_PTR);
465 removeAssumedBits(HOSTCALL_PTR);
466 removeAssumedBits(FLAT_SCRATCH_INIT);
470 if (HasSanitizerAttrs &&
471 (Attr.first == IMPLICIT_ARG_PTR || Attr.first == HOSTCALL_PTR ||
472 Attr.first == FLAT_SCRATCH_INIT))
475 if (
F->hasFnAttribute(Attr.second))
476 addKnownBits(Attr.first);
479 if (
F->isDeclaration())
485 indicatePessimisticFixpoint();
493 auto OrigAssumed = getAssumed();
496 const AACallEdges *AAEdges =
A.getAAFor<AACallEdges>(
497 *
this, this->getIRPosition(), DepClassTy::REQUIRED);
500 return indicatePessimisticFixpoint();
504 bool NeedsImplicit =
false;
505 auto &InfoCache =
static_cast<AMDGPUInformationCache &
>(
A.getInfoCache());
506 bool HasApertureRegs = InfoCache.hasApertureRegs(*
F);
507 bool SupportsGetDoorbellID = InfoCache.supportsGetDoorbellID(*
F);
508 unsigned COV = InfoCache.getCodeObjectVersion();
513 const AAAMDAttributes *AAAMD =
A.getAAFor<AAAMDAttributes>(
515 if (!AAAMD || !AAAMD->isValidState())
516 return indicatePessimisticFixpoint();
521 bool NonKernelOnly =
false;
524 HasApertureRegs, SupportsGetDoorbellID, COV);
535 if (!
Callee->hasFnAttribute(Attribute::NoCallback))
536 return indicatePessimisticFixpoint();
541 if ((IsNonEntryFunc || !NonKernelOnly))
542 removeAssumedBits(AttrMask);
548 removeAssumedBits(IMPLICIT_ARG_PTR);
550 if (isAssumed(QUEUE_PTR) && checkForQueuePtr(
A)) {
554 removeAssumedBits(IMPLICIT_ARG_PTR);
556 removeAssumedBits(QUEUE_PTR);
559 if (funcRetrievesMultigridSyncArg(
A, COV)) {
560 assert(!isAssumed(IMPLICIT_ARG_PTR) &&
561 "multigrid_sync_arg needs implicitarg_ptr");
562 removeAssumedBits(MULTIGRID_SYNC_ARG);
565 if (funcRetrievesHostcallPtr(
A, COV)) {
566 assert(!isAssumed(IMPLICIT_ARG_PTR) &&
"hostcall needs implicitarg_ptr");
567 removeAssumedBits(HOSTCALL_PTR);
570 if (funcRetrievesHeapPtr(
A, COV)) {
571 assert(!isAssumed(IMPLICIT_ARG_PTR) &&
"heap_ptr needs implicitarg_ptr");
572 removeAssumedBits(HEAP_PTR);
575 if (isAssumed(QUEUE_PTR) && funcRetrievesQueuePtr(
A, COV)) {
576 assert(!isAssumed(IMPLICIT_ARG_PTR) &&
"queue_ptr needs implicitarg_ptr");
577 removeAssumedBits(QUEUE_PTR);
580 if (isAssumed(LDS_KERNEL_ID) && funcRetrievesLDSKernelId(
A)) {
581 removeAssumedBits(LDS_KERNEL_ID);
584 if (isAssumed(DEFAULT_QUEUE) && funcRetrievesDefaultQueue(
A, COV))
585 removeAssumedBits(DEFAULT_QUEUE);
587 if (isAssumed(COMPLETION_ACTION) && funcRetrievesCompletionAction(
A, COV))
588 removeAssumedBits(COMPLETION_ACTION);
590 if (isAssumed(FLAT_SCRATCH_INIT) && needFlatScratchInit(
A))
591 removeAssumedBits(FLAT_SCRATCH_INIT);
593 return getAssumed() != OrigAssumed ? ChangeStatus::CHANGED
594 : ChangeStatus::UNCHANGED;
599 LLVMContext &Ctx = getAssociatedFunction()->getContext();
602 if (isKnown(Attr.first))
603 AttrList.
push_back(Attribute::get(Ctx, Attr.second));
606 return A.manifestAttrs(getIRPosition(), AttrList,
610 const std::string getAsStr(Attributor *)
const override {
612 raw_string_ostream OS(Str);
615 if (isAssumed(Attr.first))
616 OS <<
' ' << Attr.second;
622 void trackStatistics()
const override {}
625 bool checkForQueuePtr(Attributor &
A) {
629 auto &InfoCache =
static_cast<AMDGPUInformationCache &
>(
A.getInfoCache());
631 bool NeedsQueuePtr =
false;
634 unsigned SrcAS =
static_cast<AddrSpaceCastInst &
>(
I).getSrcAddressSpace();
636 NeedsQueuePtr =
true;
642 bool HasApertureRegs = InfoCache.hasApertureRegs(*
F);
648 if (!HasApertureRegs) {
649 bool UsedAssumedInformation =
false;
650 A.checkForAllInstructions(CheckAddrSpaceCasts, *
this,
651 {Instruction::AddrSpaceCast},
652 UsedAssumedInformation);
659 if (!IsNonEntryFunc && HasApertureRegs)
662 for (BasicBlock &BB : *
F) {
663 for (Instruction &
I : BB) {
664 for (
const Use &U :
I.operands()) {
666 if (InfoCache.needsQueuePtr(
C, *
F))
676 bool funcRetrievesMultigridSyncArg(Attributor &
A,
unsigned COV) {
678 AA::RangeTy
Range(Pos, 8);
679 return funcRetrievesImplicitKernelArg(
A,
Range);
682 bool funcRetrievesHostcallPtr(Attributor &
A,
unsigned COV) {
684 AA::RangeTy
Range(Pos, 8);
685 return funcRetrievesImplicitKernelArg(
A,
Range);
688 bool funcRetrievesDefaultQueue(Attributor &
A,
unsigned COV) {
690 AA::RangeTy
Range(Pos, 8);
691 return funcRetrievesImplicitKernelArg(
A,
Range);
694 bool funcRetrievesCompletionAction(Attributor &
A,
unsigned COV) {
696 AA::RangeTy
Range(Pos, 8);
697 return funcRetrievesImplicitKernelArg(
A,
Range);
700 bool funcRetrievesHeapPtr(Attributor &
A,
unsigned COV) {
704 return funcRetrievesImplicitKernelArg(
A,
Range);
707 bool funcRetrievesQueuePtr(Attributor &
A,
unsigned COV) {
711 return funcRetrievesImplicitKernelArg(
A,
Range);
714 bool funcRetrievesImplicitKernelArg(Attributor &
A, AA::RangeTy
Range) {
726 const auto *PointerInfoAA =
A.getAAFor<AAPointerInfo>(
728 if (!PointerInfoAA || !PointerInfoAA->getState().isValidState())
731 return PointerInfoAA->forallInterferingAccesses(
732 Range, [](
const AAPointerInfo::Access &Acc,
bool IsExact) {
737 bool UsedAssumedInformation =
false;
738 return !
A.checkForAllCallLikeInstructions(DoesNotLeadToKernelArgLoc, *
this,
739 UsedAssumedInformation);
742 bool funcRetrievesLDSKernelId(Attributor &
A) {
747 bool UsedAssumedInformation =
false;
748 return !
A.checkForAllCallLikeInstructions(DoesNotRetrieve, *
this,
749 UsedAssumedInformation);
754 bool needFlatScratchInit(Attributor &
A) {
755 assert(isAssumed(FLAT_SCRATCH_INIT));
764 bool UsedAssumedInformation =
false;
765 if (!
A.checkForAllInstructions(AddrSpaceCastNotFromPrivate, *
this,
766 {Instruction::AddrSpaceCast},
767 UsedAssumedInformation))
771 auto &InfoCache =
static_cast<AMDGPUInformationCache &
>(
A.getInfoCache());
775 for (
const Use &U :
I.operands()) {
777 if (InfoCache.checkConstForAddrSpaceCastFromPrivate(
C))
799 return Callee->getIntrinsicID() !=
800 Intrinsic::amdgcn_addrspacecast_nonnull;
803 UsedAssumedInformation =
false;
807 return !
A.checkForAllCallLikeInstructions(CheckForNoFlatScratchInit, *
this,
808 UsedAssumedInformation);
812AAAMDAttributes &AAAMDAttributes::createForPosition(
const IRPosition &IRP,
815 return *
new (
A.Allocator) AAAMDAttributesFunction(IRP,
A);
820struct AAAMDSizeRangeAttribute
821 :
public StateWrapper<IntegerRangeState, AbstractAttribute, uint32_t> {
822 using Base = StateWrapper<IntegerRangeState, AbstractAttribute, uint32_t>;
826 AAAMDSizeRangeAttribute(
const IRPosition &IRP, Attributor &
A,
828 :
Base(IRP, 32), AttrName(AttrName) {}
831 void trackStatistics()
const override {}
833 template <
class AttributeImpl>
ChangeStatus updateImplImpl(Attributor &
A) {
836 auto CheckCallSite = [&](AbstractCallSite CS) {
839 <<
"->" << getAssociatedFunction()->
getName() <<
'\n');
841 const auto *CallerInfo =
A.getAAFor<AttributeImpl>(
843 if (!CallerInfo || !CallerInfo->isValidState())
852 bool AllCallSitesKnown =
true;
853 if (!
A.checkForAllCallSites(CheckCallSite, *
this,
856 return indicatePessimisticFixpoint();
864 emitAttributeIfNotDefaultAfterClamp(Attributor &
A,
865 std::pair<unsigned, unsigned>
Default) {
867 unsigned Lower = getAssumed().getLower().getZExtValue();
868 unsigned Upper = getAssumed().getUpper().getZExtValue();
878 return ChangeStatus::UNCHANGED;
881 LLVMContext &Ctx =
F->getContext();
882 SmallString<10> Buffer;
883 raw_svector_ostream OS(Buffer);
885 return A.manifestAttrs(getIRPosition(),
886 {Attribute::get(Ctx, AttrName, OS.str())},
890 const std::string getAsStr(Attributor *)
const override {
892 raw_string_ostream OS(Str);
894 OS << getAssumed().getLower() <<
',' << getAssumed().getUpper() - 1;
901struct AAAMDFlatWorkGroupSize :
public AAAMDSizeRangeAttribute {
902 AAAMDFlatWorkGroupSize(
const IRPosition &IRP, Attributor &
A)
903 : AAAMDSizeRangeAttribute(IRP,
A,
"amdgpu-flat-work-group-size") {}
907 auto &InfoCache =
static_cast<AMDGPUInformationCache &
>(
A.getInfoCache());
909 bool HasAttr =
false;
910 auto Range = InfoCache.getDefaultFlatWorkGroupSize(*
F);
911 auto MaxRange = InfoCache.getMaximumFlatWorkGroupRange(*
F);
913 if (
auto Attr = InfoCache.getFlatWorkGroupSizeAttr(*
F)) {
917 if (*Attr != MaxRange) {
925 if (
Range == MaxRange)
929 ConstantRange CR(APInt(32, Min), APInt(32, Max + 1));
930 IntegerRangeState IRS(CR);
934 indicateOptimisticFixpoint();
938 return updateImplImpl<AAAMDFlatWorkGroupSize>(
A);
942 static AAAMDFlatWorkGroupSize &createForPosition(
const IRPosition &IRP,
947 auto &InfoCache =
static_cast<AMDGPUInformationCache &
>(
A.getInfoCache());
948 return emitAttributeIfNotDefaultAfterClamp(
949 A, InfoCache.getMaximumFlatWorkGroupRange(*
F));
953 StringRef
getName()
const override {
return "AAAMDFlatWorkGroupSize"; }
956 const char *getIdAddr()
const override {
return &
ID; }
960 static bool classof(
const AbstractAttribute *AA) {
965 static const char ID;
968const char AAAMDFlatWorkGroupSize::ID = 0;
970AAAMDFlatWorkGroupSize &
971AAAMDFlatWorkGroupSize::createForPosition(
const IRPosition &IRP,
974 return *
new (
A.Allocator) AAAMDFlatWorkGroupSize(IRP,
A);
976 "AAAMDFlatWorkGroupSize is only valid for function position");
979struct TupleDecIntegerRangeState :
public AbstractState {
980 DecIntegerState<uint32_t>
X,
Y, Z;
982 bool isValidState()
const override {
983 return X.isValidState() &&
Y.isValidState() &&
Z.isValidState();
986 bool isAtFixpoint()
const override {
987 return X.isAtFixpoint() &&
Y.isAtFixpoint() &&
Z.isAtFixpoint();
991 return X.indicateOptimisticFixpoint() |
Y.indicateOptimisticFixpoint() |
992 Z.indicateOptimisticFixpoint();
996 return X.indicatePessimisticFixpoint() |
Y.indicatePessimisticFixpoint() |
997 Z.indicatePessimisticFixpoint();
1000 TupleDecIntegerRangeState
operator^=(
const TupleDecIntegerRangeState &
Other) {
/// The assumed state is the tuple object itself: both overloads simply
/// return *this (mutable and const flavours respectively).
1011 TupleDecIntegerRangeState &getAssumed() {
return *
this; }
1012 const TupleDecIntegerRangeState &getAssumed()
const {
return *
this; }
1015using AAAMDMaxNumWorkgroupsState =
1016 StateWrapper<TupleDecIntegerRangeState, AbstractAttribute, uint32_t>;
1019struct AAAMDMaxNumWorkgroups
1020 :
public StateWrapper<TupleDecIntegerRangeState, AbstractAttribute> {
1021 using Base = StateWrapper<TupleDecIntegerRangeState, AbstractAttribute>;
1023 AAAMDMaxNumWorkgroups(
const IRPosition &IRP, Attributor &
A) :
Base(IRP) {}
1030 X.takeKnownMinimum(MaxNumWorkgroups[0]);
1031 Y.takeKnownMinimum(MaxNumWorkgroups[1]);
1032 Z.takeKnownMinimum(MaxNumWorkgroups[2]);
1035 indicatePessimisticFixpoint();
1041 auto CheckCallSite = [&](AbstractCallSite CS) {
1044 <<
"->" << getAssociatedFunction()->
getName() <<
'\n');
1046 const auto *CallerInfo =
A.getAAFor<AAAMDMaxNumWorkgroups>(
1048 if (!CallerInfo || !CallerInfo->isValidState())
1056 bool AllCallSitesKnown =
true;
1057 if (!
A.checkForAllCallSites(CheckCallSite, *
this,
1060 return indicatePessimisticFixpoint();
1066 static AAAMDMaxNumWorkgroups &createForPosition(
const IRPosition &IRP,
1071 LLVMContext &Ctx =
F->getContext();
1072 SmallString<32> Buffer;
1073 raw_svector_ostream OS(Buffer);
1074 OS <<
X.getAssumed() <<
',' <<
Y.getAssumed() <<
',' <<
Z.getAssumed();
1078 return A.manifestAttrs(
1080 {Attribute::get(Ctx,
"amdgpu-max-num-workgroups", OS.str())},
1084 StringRef
getName()
const override {
return "AAAMDMaxNumWorkgroups"; }
1086 const std::string getAsStr(Attributor *)
const override {
1087 std::string Buffer =
"AAAMDMaxNumWorkgroupsState[";
1088 raw_string_ostream OS(Buffer);
1089 OS <<
X.getAssumed() <<
',' <<
Y.getAssumed() <<
',' <<
Z.getAssumed()
1094 const char *getIdAddr()
const override {
return &
ID; }
1098 static bool classof(
const AbstractAttribute *AA) {
1102 void trackStatistics()
const override {}
1105 static const char ID;
1108const char AAAMDMaxNumWorkgroups::ID = 0;
1110AAAMDMaxNumWorkgroups &
1111AAAMDMaxNumWorkgroups::createForPosition(
const IRPosition &IRP, Attributor &
A) {
1113 return *
new (
A.Allocator) AAAMDMaxNumWorkgroups(IRP,
A);
1114 llvm_unreachable(
"AAAMDMaxNumWorkgroups is only valid for function position");
1118struct AAAMDWavesPerEU :
public AAAMDSizeRangeAttribute {
1119 AAAMDWavesPerEU(
const IRPosition &IRP, Attributor &
A)
1120 : AAAMDSizeRangeAttribute(IRP,
A,
"amdgpu-waves-per-eu") {}
1124 auto &InfoCache =
static_cast<AMDGPUInformationCache &
>(
A.getInfoCache());
1127 if (
auto Attr = InfoCache.getWavesPerEUAttr(*
F)) {
1128 std::pair<unsigned, unsigned> MaxWavesPerEURange{
1129 1U, InfoCache.getMaxWavesPerEU(*
F)};
1130 if (*Attr != MaxWavesPerEURange) {
1131 auto [Min,
Max] = *Attr;
1132 ConstantRange
Range(APInt(32, Min), APInt(32, Max + 1));
1133 IntegerRangeState RangeState(
Range);
1134 this->getState() = RangeState;
1135 indicateOptimisticFixpoint();
1141 indicatePessimisticFixpoint();
1147 auto CheckCallSite = [&](AbstractCallSite CS) {
1151 <<
"->" <<
Func->getName() <<
'\n');
1154 const auto *CallerAA =
A.getAAFor<AAAMDWavesPerEU>(
1156 if (!CallerAA || !CallerAA->isValidState())
1159 ConstantRange Assumed = getAssumed();
1161 CallerAA->getAssumed().getLower().getZExtValue());
1163 CallerAA->getAssumed().getUpper().getZExtValue());
1164 ConstantRange
Range(APInt(32, Min), APInt(32, Max));
1165 IntegerRangeState RangeState(
Range);
1166 getState() = RangeState;
1167 Change |= getState() == Assumed ? ChangeStatus::UNCHANGED
1168 : ChangeStatus::CHANGED;
1173 bool AllCallSitesKnown =
true;
1174 if (!
A.checkForAllCallSites(CheckCallSite, *
this,
true, AllCallSitesKnown))
1175 return indicatePessimisticFixpoint();
1181 static AAAMDWavesPerEU &createForPosition(
const IRPosition &IRP,
1186 auto &InfoCache =
static_cast<AMDGPUInformationCache &
>(
A.getInfoCache());
1187 return emitAttributeIfNotDefaultAfterClamp(
1188 A, {1U, InfoCache.getMaxWavesPerEU(*
F)});
1192 StringRef
getName()
const override {
return "AAAMDWavesPerEU"; }
1195 const char *getIdAddr()
const override {
return &
ID; }
1199 static bool classof(
const AbstractAttribute *AA) {
1204 static const char ID;
1207const char AAAMDWavesPerEU::ID = 0;
1209AAAMDWavesPerEU &AAAMDWavesPerEU::createForPosition(
const IRPosition &IRP,
1212 return *
new (
A.Allocator) AAAMDWavesPerEU(IRP,
A);
1217static unsigned inlineAsmGetNumRequiredAGPRs(
const InlineAsm *IA,
1218 const CallBase &
Call) {
1221 unsigned AGPRDefCount = 0;
1222 unsigned AGPRUseCount = 0;
1223 unsigned MaxPhysReg = 0;
1227 for (
const InlineAsm::ConstraintInfo &CI :
IA->ParseConstraints()) {
1233 Ty = STy->getElementType(ResNo);
1248 for (StringRef Code : CI.Codes) {
1249 unsigned RegCount = 0;
1250 if (
Code.starts_with(
"a")) {
1261 MaxPhysReg = std::max(MaxPhysReg, std::min(RegIdx + NumRegs, 256u));
1271 AGPRDefCount =
alignTo(AGPRDefCount, RegCount);
1273 AGPRDefCount += RegCount;
1274 if (CI.isEarlyClobber) {
1275 AGPRUseCount =
alignTo(AGPRUseCount, RegCount);
1276 AGPRUseCount += RegCount;
1279 AGPRUseCount =
alignTo(AGPRUseCount, RegCount);
1280 AGPRUseCount += RegCount;
1285 unsigned MaxVirtReg = std::max(AGPRUseCount, AGPRDefCount);
1290 return std::min(MaxVirtReg + MaxPhysReg, 256u);
1293struct AAAMDGPUMinAGPRAlloc
1294 :
public StateWrapper<DecIntegerState<>, AbstractAttribute> {
1295 using Base = StateWrapper<DecIntegerState<>, AbstractAttribute>;
1296 AAAMDGPUMinAGPRAlloc(
const IRPosition &IRP, Attributor &
A) :
Base(IRP) {}
1298 static AAAMDGPUMinAGPRAlloc &createForPosition(
const IRPosition &IRP,
1301 return *
new (
A.Allocator) AAAMDGPUMinAGPRAlloc(IRP,
A);
1303 "AAAMDGPUMinAGPRAlloc is only valid for function position");
1308 auto [MinNumAGPR, MaxNumAGPR] =
1311 if (MinNumAGPR == 0) {
1312 indicateOptimisticFixpoint();
1317 indicatePessimisticFixpoint();
1320 const std::string getAsStr(Attributor *
A)
const override {
1321 std::string Str =
"amdgpu-agpr-alloc=";
1322 raw_string_ostream OS(Str);
1327 void trackStatistics()
const override {}
1330 DecIntegerState<> Maximum;
1337 const Value *CalleeOp = CB.getCalledOperand();
1342 unsigned NumRegs = inlineAsmGetNumRequiredAGPRs(IA, CB);
1346 switch (CB.getIntrinsicID()) {
1349 case Intrinsic::write_register:
1350 case Intrinsic::read_register:
1351 case Intrinsic::read_volatile_register: {
1356 auto [
Kind, RegIdx, NumRegs] =
1370 case Intrinsic::trap:
1371 case Intrinsic::debugtrap:
1372 case Intrinsic::ubsantrap:
1373 return CB.hasFnAttr(Attribute::NoCallback) ||
1374 !CB.hasFnAttr(
"trap-func-name");
1380 return CB.hasFnAttr(Attribute::NoCallback);
1384 auto *CBEdges =
A.getAAFor<AACallEdges>(
1386 if (!CBEdges || CBEdges->hasUnknownCallee()) {
1391 for (
const Function *PossibleCallee : CBEdges->getOptimisticEdges()) {
1392 const auto *CalleeInfo =
A.getAAFor<AAAMDGPUMinAGPRAlloc>(
1394 if (!CalleeInfo || !CalleeInfo->isValidState()) {
1405 bool UsedAssumedInformation =
false;
1406 if (!
A.checkForAllCallLikeInstructions(CheckForMinAGPRAllocs, *
this,
1407 UsedAssumedInformation))
1408 return indicatePessimisticFixpoint();
1414 LLVMContext &Ctx = getAssociatedFunction()->getContext();
1415 SmallString<4> Buffer;
1416 raw_svector_ostream OS(Buffer);
1419 return A.manifestAttrs(
1420 getIRPosition(), {Attribute::get(Ctx,
"amdgpu-agpr-alloc", OS.str())});
1423 StringRef
getName()
const override {
return "AAAMDGPUMinAGPRAlloc"; }
1424 const char *getIdAddr()
const override {
return &
ID; }
1428 static bool classof(
const AbstractAttribute *AA) {
1432 static const char ID;
1435const char AAAMDGPUMinAGPRAlloc::ID = 0;
1439struct AAAMDGPUClusterDims
1440 :
public StateWrapper<BooleanState, AbstractAttribute> {
1441 using Base = StateWrapper<BooleanState, AbstractAttribute>;
1442 AAAMDGPUClusterDims(
const IRPosition &IRP, Attributor &
A) :
Base(IRP) {}
1445 static AAAMDGPUClusterDims &createForPosition(
const IRPosition &IRP,
1449 StringRef
getName()
const override {
return "AAAMDGPUClusterDims"; }
1452 const char *getIdAddr()
const override {
return &
ID; }
1456 static bool classof(
const AbstractAttribute *AA) {
/// Pure virtual accessor returning a const reference to an
/// AMDGPU::ClusterDimsAttr; concrete subclasses must provide it.
1460 virtual const AMDGPU::ClusterDimsAttr &getClusterDims()
const = 0;
1463 static const char ID;
1466const char AAAMDGPUClusterDims::ID = 0;
1468struct AAAMDGPUClusterDimsFunction :
public AAAMDGPUClusterDims {
1469 AAAMDGPUClusterDimsFunction(
const IRPosition &IRP, Attributor &
A)
1470 : AAAMDGPUClusterDims(IRP,
A) {}
1474 assert(
F &&
"empty associated function");
1481 indicatePessimisticFixpoint();
1483 indicateOptimisticFixpoint();
1487 const std::string getAsStr(Attributor *
A)
const override {
1497 void trackStatistics()
const override {}
1500 auto OldState = Attr;
1502 auto CheckCallSite = [&](AbstractCallSite CS) {
1503 const auto *CallerAA =
A.getAAFor<AAAMDGPUClusterDims>(
1505 DepClassTy::REQUIRED);
1506 if (!CallerAA || !CallerAA->isValidState())
1509 return merge(CallerAA->getClusterDims());
1512 bool UsedAssumedInformation =
false;
1513 if (!
A.checkForAllCallSites(CheckCallSite, *
this,
1515 UsedAssumedInformation))
1516 return indicatePessimisticFixpoint();
1518 return OldState == Attr ? ChangeStatus::UNCHANGED : ChangeStatus::CHANGED;
1523 return ChangeStatus::UNCHANGED;
1524 return A.manifestAttrs(
1526 {Attribute::get(getAssociatedFunction()->
getContext(), AttrName,
1531 const AMDGPU::ClusterDimsAttr &getClusterDims()
const override {
1536 bool merge(
const AMDGPU::ClusterDimsAttr &
Other) {
1551 if (
Other.isUnknown())
1576 AMDGPU::ClusterDimsAttr Attr;
1578 static constexpr char AttrName[] =
"amdgpu-cluster-dims";
1581AAAMDGPUClusterDims &
1582AAAMDGPUClusterDims::createForPosition(
const IRPosition &IRP, Attributor &
A) {
1584 return *
new (
A.Allocator) AAAMDGPUClusterDimsFunction(IRP,
A);
1585 llvm_unreachable(
"AAAMDGPUClusterDims is only valid for function position");
1588static bool runImpl(SetVector<Function *> &Functions,
bool IsModulePass,
1589 bool DeleteFns,
Module &M, AnalysisGetter &AG,
1590 TargetMachine &TM, AMDGPUAttributorOptions
Options,
1593 CallGraphUpdater CGUpdater;
1595 AMDGPUInformationCache InfoCache(M, AG,
Allocator,
nullptr, TM);
1596 DenseSet<const char *>
Allowed(
1597 {&AAAMDAttributes::ID, &AAUniformWorkGroupSize::ID,
1599 &AAAMDMaxNumWorkgroups::ID, &AAAMDWavesPerEU::ID,
1605 AttributorConfig AC(CGUpdater);
1606 AC.IsClosedWorldModule =
Options.IsClosedWorld;
1608 AC.IsModulePass = IsModulePass;
1609 AC.DeleteFns = DeleteFns;
1610 AC.DefaultInitializeLiveInternals =
false;
1611 AC.IndirectCalleeSpecializationCallback =
1612 [](Attributor &
A,
const AbstractAttribute &AA, CallBase &CB,
1617 AC.IPOAmendableCB = [](
const Function &
F) {
1618 return F.getCallingConv() == CallingConv::AMDGPU_KERNEL;
1621 Attributor
A(Functions, InfoCache, AC);
1624 StringRef LTOPhaseStr =
to_string(LTOPhase);
1625 dbgs() <<
"[AMDGPUAttributor] Running at phase " << LTOPhaseStr <<
'\n'
1626 <<
"[AMDGPUAttributor] Module " <<
M.getName() <<
" is "
1627 << (AC.IsClosedWorldModule ?
"" :
"not ")
1628 <<
"assumed to be a closed world.\n";
1631 for (
auto *
F : Functions) {
1635 CallingConv::ID CC =
F->getCallingConv();
1642 if (!
F->isDeclaration() &&
ST.hasClusters())
1645 if (
ST.hasGFX90AInsts())
1649 Value *Ptr =
nullptr;
1651 Ptr = LI->getPointerOperand();
1653 Ptr =
SI->getPointerOperand();
1655 Ptr = RMW->getPointerOperand();
1657 Ptr = CmpX->getPointerOperand();
1663 if (
II->getIntrinsicID() == Intrinsic::amdgcn_make_buffer_rsrc)
1670 return A.run() == ChangeStatus::CHANGED;
1683 if (!
F.isDeclaration())
1684 Functions.insert(&
F);
1688 return runImpl(Functions,
true,
true, M, AG,
1689 TM, Options, LTOPhase)
1706 if (!
F->isIntrinsic())
1707 Functions.insert(
F);
1711 Module *M =
C.begin()->getFunction().getParent();
1714 return runImpl(Functions,
false,
false, *M, AG,
assert(UImm &&(UImm !=~static_cast< T >(0)) &&"Invalid immediate!")
static bool isDSAddress(const Constant *C)
static constexpr std::pair< ImplicitArgumentMask, StringLiteral > ImplicitAttrs[]
static cl::opt< unsigned > IndirectCallSpecializationThreshold("amdgpu-indirect-call-specialization-threshold", cl::desc("A threshold controls whether an indirect call will be specialized"), cl::init(3))
static ImplicitArgumentMask intrinsicToAttrMask(Intrinsic::ID ID, bool &NonKernelOnly, bool &NeedsImplicit, bool HasApertureRegs, bool SupportsGetDoorBellID, unsigned CodeObjectVersion)
static bool hasSanitizerAttributes(const Function &F)
Returns true if sanitizer attributes are present on a function.
ImplicitArgumentPositions
static bool castRequiresQueuePtr(unsigned SrcAS)
The AMDGPU TargetMachine interface definition for hw codegen targets.
MachineBasicBlock MachineBasicBlock::iterator DebugLoc DL
Expand Atomic instructions
static GCRegistry::Add< ErlangGC > A("erlang", "erlang-compatible garbage collector")
static bool runImpl(MachineFunction &MF)
AMD GCN specific subclass of TargetSubtarget.
static LoopDeletionResult merge(LoopDeletionResult A, LoopDeletionResult B)
Machine Check Debug Module
ConstantRange Range(APInt(BitWidth, Low), APInt(BitWidth, High))
uint64_t IntrinsicInst * II
FunctionAnalysisManager FAM
static StringRef getName(Value *V)
static TableGen::Emitter::Opt Y("gen-skeleton-entry", EmitSkeleton, "Generate example skeleton entry")
static void initialize(TargetLibraryInfoImpl &TLI, const Triple &T, const llvm::StringTable &StandardNames, VectorLibrary VecLib)
Initialize the set of available library functions based on the specified target triple.
PreservedAnalyses run(LazyCallGraph::SCC &C, CGSCCAnalysisManager &AM, LazyCallGraph &CG, CGSCCUpdateResult &UR)
PreservedAnalyses run(Module &M, ModuleAnalysisManager &AM)
static ClusterDimsAttr get(const Function &F)
std::string to_string() const
bool isVariableDims() const
uint64_t getZExtValue() const
Get zero extended value.
PassT::Result & getResult(IRUnitT &IR, ExtraArgTs... ExtraArgs)
Get the result of an analysis pass for a given IR unit.
Value * getArgOperand(unsigned i) const
LLVM_ABI Intrinsic::ID getIntrinsicID() const
Returns the intrinsic ID of the intrinsic called or Intrinsic::not_intrinsic if the called function is not an intrinsic, or if this is an indirect call.
const APInt & getLower() const
Return the lower value for this range.
const APInt & getUpper() const
Return the upper value for this range.
This is an important base class in LLVM.
A proxy from a FunctionAnalysisManager to an SCC.
CallingConv::ID getCallingConv() const
getCallingConv()/setCallingConv(CC) - These methods get and set the calling convention of this functio...
unsigned getAddressSpace() const
Module * getParent()
Get the module that this global value is contained inside of...
LLVM_ABI const Function * getFunction() const
Return the function this instruction belongs to.
A node in the call graph.
An SCC of the call graph.
A lazily constructed view of the call graph of a module.
A Module instance is used to store all the information related to an LLVM module.
const DataLayout & getDataLayout() const
Get the data layout for the module's target platform.
A set of analyses that are preserved following a run of a transformation pass.
static PreservedAnalyses none()
Convenience factory function for the empty preserved set.
static PreservedAnalyses all()
Construct a special preserved set that preserves all passes.
A vector that has set insertion semantics.
std::pair< iterator, bool > insert(PtrType Ptr)
Inserts Ptr if and only if there is no element in the container equal to Ptr.
void push_back(const T &Elt)
std::string str() const
Get the contents as an std::string.
const STC & getSubtarget(const Function &F) const
This method returns a pointer to the specified type of TargetSubtargetInfo.
LLVM_ABI bool isDroppable() const
A droppable user is a user for which uses can be dropped without affecting correctness and should be ...
Type * getType() const
All values are typed, get the type of this value.
#define llvm_unreachable(msg)
Marks that the current location is not supposed to be reachable.
@ REGION_ADDRESS
Address space for region memory. (GDS)
@ LOCAL_ADDRESS
Address space for local memory.
@ PRIVATE_ADDRESS
Address space for private memory.
unsigned getMaxWavesPerEU(const MCSubtargetInfo &STI)
unsigned getAMDHSACodeObjectVersion(const Module &M)
unsigned getDefaultQueueImplicitArgPosition(unsigned CodeObjectVersion)
std::tuple< char, unsigned, unsigned > parseAsmPhysRegName(StringRef RegName)
Returns a valid charcode or 0 in the first entry if this is a valid physical register name.
LLVM_READNONE constexpr bool isEntryFunctionCC(CallingConv::ID CC)
std::tuple< char, unsigned, unsigned > parseAsmConstraintPhysReg(StringRef Constraint)
Returns a valid charcode or 0 in the first entry if this is a valid physical register constraint.
unsigned getHostcallImplicitArgPosition(unsigned CodeObjectVersion)
SmallVector< unsigned > getMaxNumWorkGroups(const Function &F)
unsigned getCompletionActionImplicitArgPosition(unsigned CodeObjectVersion)
std::pair< unsigned, unsigned > getIntegerPairAttribute(const Function &F, StringRef Name, std::pair< unsigned, unsigned > Default, bool OnlyFirstRequired)
LLVM_READNONE constexpr bool isGraphics(CallingConv::ID CC)
unsigned getMultigridSyncArgImplicitArgPosition(unsigned CodeObjectVersion)
E & operator^=(E &LHS, E RHS)
unsigned ID
LLVM IR allows arbitrary numbers to be used as calling convention identifiers.
@ C
The default llvm calling convention, compatible with C.
@ CE
Windows NT (Windows on ARM)
initializer< Ty > init(const Ty &Val)
NodeAddr< CodeNode * > Code
NodeAddr< FuncNode * > Func
Context & getContext() const
friend class Instruction
Iterator for Instructions in a `BasicBlock`.
This is an optimization pass for GlobalISel generic memory operations.
decltype(auto) dyn_cast(const From &Val)
dyn_cast<X> - Return the argument parameter cast to the specified type.
InnerAnalysisManagerProxy< FunctionAnalysisManager, Module > FunctionAnalysisManagerModuleProxy
Provide the FunctionAnalysisManager to Module proxy.
bool operator==(const AddressRangeValuePair &LHS, const AddressRangeValuePair &RHS)
RelativeUniformCounterPtr ValuesPtrExpr VTableAddr Value
AnalysisManager< LazyCallGraph::SCC, LazyCallGraph & > CGSCCAnalysisManager
The CGSCC analysis manager.
ThinOrFullLTOPhase
This enumerates the LLVM full LTO or ThinLTO optimization phases.
@ None
No LTO/ThinLTO behavior needed.
LLVM_ABI raw_ostream & dbgs()
dbgs() - This returns a reference to a raw_ostream for debugging messages.
constexpr uint64_t alignTo(uint64_t Size, Align A)
Returns a multiple of A needed to store Size bytes.
class LLVM_GSL_OWNER SmallVector
Forward declaration of SmallVector so that calculateSmallVectorDefaultInlinedElements can reference s...
const char * to_string(ThinOrFullLTOPhase Phase)
constexpr T divideCeil(U Numerator, V Denominator)
Returns the integer ceil(Numerator / Denominator).
ChangeStatus clampStateAndIndicateChange(StateType &S, const StateType &R)
Helper function to clamp a state S of type StateType with the information in R and indicate/return if...
decltype(auto) cast(const From &Val)
cast<X> - Return the argument parameter cast to the specified type.
AnalysisManager< Function > FunctionAnalysisManager
Convenience typedef for the Function analysis manager.
BumpPtrAllocatorImpl<> BumpPtrAllocator
The standard BumpPtrAllocator which just uses the default template parameters.
AnalysisManager< Module > ModuleAnalysisManager
Convenience typedef for the Module analysis manager.
static LLVM_ABI const char ID
Unique ID (due to the unique address)
static LLVM_ABI const char ID
Unique ID (due to the unique address)
virtual const SetVector< Function * > & getOptimisticEdges() const =0
Get the optimistic edges.
static LLVM_ABI const char ID
Unique ID (due to the unique address)
virtual bool hasNonAsmUnknownCallee() const =0
Is there any call with an unknown callee, excluding any inline asm.
static LLVM_ABI const char ID
Unique ID (due to the unique address)
static LLVM_ABI const char ID
Unique ID (due to the unique address)
Instruction * getRemoteInst() const
Return the actual instruction that causes the access.
static LLVM_ABI const char ID
Unique ID (due to the unique address)
static LLVM_ABI const char ID
Unique ID (due to the unique address)
static LLVM_ABI const char ID
Unique ID (due to the unique address)
static LLVM_ABI const char ID
Unique ID (due to the unique address)
virtual const char * getIdAddr() const =0
This function should return the address of the ID of the AbstractAttribute.
Wrapper for FunctionAnalysisManager.
The fixpoint analysis framework that orchestrates the attribute deduction.
Support structure for SCC passes to communicate updates to the call graph back to the CGSCC pass manager...
DecIntegerState & takeAssumedMaximum(base_t Value)
Take maximum of assumed and Value.
Helper to describe and deal with positions in the LLVM-IR.
static const IRPosition callsite_returned(const CallBase &CB)
Create a position describing the returned value of CB.
static const IRPosition value(const Value &V, const CallBaseContext *CBContext=nullptr)
Create a position describing the value of V.
@ IRP_FUNCTION
An attribute for a function (scope).
static const IRPosition function(const Function &F, const CallBaseContext *CBContext=nullptr)
Create a position describing the function scope of F.
Kind getPositionKind() const
Return the associated position kind.
static const IRPosition callsite_function(const CallBase &CB)
Create a position describing the function scope of CB.
bool isValidState() const override
See AbstractState::isValidState() NOTE: For now we simply pretend that the worst possible state is in...
ChangeStatus indicatePessimisticFixpoint() override
See AbstractState::indicatePessimisticFixpoint(...)
Helper to tie an abstract state implementation to an abstract attribute.