#define DEBUG_TYPE "gcn-hazard-recognizer"

STATISTIC(NumWMMANopsHoisted,
          "Number of WMMA hazard V_NOPs hoisted from loops");
STATISTIC(NumWMMAHoistingBailed,
          "Number of WMMA hazards where V_NOP hoisting was not possible");
struct MFMAPaddingRatioParser : public cl::parser<unsigned> {
  MFMAPaddingRatioParser(cl::Option &O) : cl::parser<unsigned>(O) {}

  bool parse(cl::Option &O, StringRef ArgName, StringRef Arg, unsigned &Value) {
    if (Arg.getAsInteger(0, Value))
      return O.error("'" + Arg + "' value invalid for uint argument!");

    if (Value > 100)
      return O.error("'" + Arg + "' value must be in the range [0, 100]!");

    return false;
  }
};
static cl::opt<unsigned, false, MFMAPaddingRatioParser>
    MFMAPaddingRatio("amdgpu-mfma-padding-ratio", cl::init(0), cl::Hidden,
                     cl::desc("Fill a percentage of the latency between "
                              "neighboring MFMA with s_nops."));

    cl::desc("Insert a s_nop x before every instruction"));

    cl::desc("Hoist WMMA hazard V_NOPs from loops to preheaders"));
GCNHazardRecognizer::GCNHazardRecognizer(const MachineFunction &MF,
                                         const MachineLoopInfo *MLI)
    : IsHazardRecognizerMode(false), CurrCycleInstr(nullptr), MF(MF),
      ST(MF.getSubtarget<GCNSubtarget>()), TII(*ST.getInstrInfo()),
      TRI(TII.getRegisterInfo()), TSchedModel(TII.getSchedModel()), MLI(MLI),
      ClauseUses(TRI.getNumRegUnits()), ClauseDefs(TRI.getNumRegUnits()) {
  MaxLookAhead = MF.getRegInfo().isPhysRegUsed(AMDGPU::AGPR0) ? 19 : 5;
}

void GCNHazardRecognizer::Reset() {
  EmittedInstrs.clear();
}
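// MaxLookAhead is sized for the deepest hazard window we may need to see:
// when AGPRs are in use (AGPR0 is a cheap proxy), MFMA hazards can require
// looking back up to 19 wait states; otherwise 5 suffices.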
static bool isDivFMas(unsigned Opcode) {
  return Opcode == AMDGPU::V_DIV_FMAS_F32_e64 ||
         Opcode == AMDGPU::V_DIV_FMAS_F64_e64;
}

static bool isSGetReg(unsigned Opcode) {
  return Opcode == AMDGPU::S_GETREG_B32 || Opcode == AMDGPU::S_GETREG_B32_const;
}

static bool isSSetReg(unsigned Opcode) {
  switch (Opcode) {
  case AMDGPU::S_SETREG_B32:
  case AMDGPU::S_SETREG_B32_mode:
  case AMDGPU::S_SETREG_IMM32_B32:
  case AMDGPU::S_SETREG_IMM32_B32_mode:
    return true;
  }
  return false;
}

static bool isRWLane(unsigned Opcode) {
  return Opcode == AMDGPU::V_READLANE_B32 || Opcode == AMDGPU::V_WRITELANE_B32;
}

static bool isRFE(unsigned Opcode) {
  return Opcode == AMDGPU::S_RFE_B64;
}

static bool isSMovRel(unsigned Opcode) {
  switch (Opcode) {
  case AMDGPU::S_MOVRELS_B32:
  case AMDGPU::S_MOVRELS_B64:
  case AMDGPU::S_MOVRELD_B32:
  case AMDGPU::S_MOVRELD_B64:
    return true;
  default:
    return false;
  }
}
static bool isSendMsgTraceDataOrGDS(const SIInstrInfo &TII,
                                    const MachineInstr &MI) {
  if (TII.isAlwaysGDS(MI.getOpcode()))
    return true;

  switch (MI.getOpcode()) {
  case AMDGPU::S_SENDMSG:
  case AMDGPU::S_SENDMSGHALT:
  case AMDGPU::S_TTRACEDATA:
    return true;
  // These DS opcodes don't support GDS.
  case AMDGPU::DS_PERMUTE_B32:
  case AMDGPU::DS_BPERMUTE_B32:
    return false;
  default:
    if (TII.isDS(MI.getOpcode())) {
      int GDS = AMDGPU::getNamedOperandIdx(MI.getOpcode(),
                                           AMDGPU::OpName::gds);
      if (MI.getOperand(GDS).getImm())
        return true;
    }
    return false;
  }
}
static bool isPermlane(const MachineInstr &MI) {
  unsigned Opcode = MI.getOpcode();
  return Opcode == AMDGPU::V_PERMLANE16_B32_e64 ||
         Opcode == AMDGPU::V_PERMLANE64_B32 ||
         Opcode == AMDGPU::V_PERMLANEX16_B32_e64 ||
         Opcode == AMDGPU::V_PERMLANE16_VAR_B32_e64 ||
         Opcode == AMDGPU::V_PERMLANEX16_VAR_B32_e64 ||
         Opcode == AMDGPU::V_PERMLANE16_SWAP_B32_e32 ||
         Opcode == AMDGPU::V_PERMLANE16_SWAP_B32_e64 ||
         Opcode == AMDGPU::V_PERMLANE32_SWAP_B32_e32 ||
         Opcode == AMDGPU::V_PERMLANE32_SWAP_B32_e64 ||
         Opcode == AMDGPU::V_PERMLANE_BCAST_B32_e64 ||
         Opcode == AMDGPU::V_PERMLANE_UP_B32_e64 ||
         Opcode == AMDGPU::V_PERMLANE_DOWN_B32_e64 ||
         Opcode == AMDGPU::V_PERMLANE_XOR_B32_e64 ||
         Opcode == AMDGPU::V_PERMLANE_IDX_GEN_B32_e64;
}
static unsigned getHWReg(const SIInstrInfo *TII, const MachineInstr &RegInstr) {
  const MachineOperand *RegOp = TII->getNamedOperand(RegInstr,
                                                     AMDGPU::OpName::simm16);
ScheduleHazardRecognizer::HazardType
GCNHazardRecognizer::getHazardType(SUnit *SU, int Stalls) {
  MachineInstr *MI = SU->getInstr();
  // If we are not in "HazardRecognizerMode" and therefore not being run from
  // the scheduler, track possible stalls from hazards but don't insert noops.
  auto HazardType = IsHazardRecognizerMode ? NoopHazard : Hazard;

  if (ST.hasNSAtoVMEMBug() && checkNSAtoVMEMHazard(MI) > 0)
    return HazardType;

  if (checkFPAtomicToDenormModeHazard(MI) > 0)
    return HazardType;

  if (!IsHazardRecognizerMode) {
    if (checkWMMACoexecutionHazards(MI) > 0)
      return HazardType;
  }

  if (ST.hasNoDataDepHazard())
    return NoHazard;

  if (isDivFMas(MI->getOpcode()) && checkDivFMasHazards(MI) > 0)
    return HazardType;

  if (isRWLane(MI->getOpcode()) && checkRWLaneHazards(MI) > 0)
    return HazardType;

  if ((SIInstrInfo::isVALU(*MI) || SIInstrInfo::isVMEM(*MI) ||
       SIInstrInfo::isFLAT(*MI) || SIInstrInfo::isDS(*MI) ||
       SIInstrInfo::isEXP(*MI)) &&
      checkMAIVALUHazards(MI) > 0)
    return HazardType;

  if (isSGetReg(MI->getOpcode()) && checkGetRegHazards(MI) > 0)
    return HazardType;

  if (isSSetReg(MI->getOpcode()) && checkSetRegHazards(MI) > 0)
    return HazardType;

  if (isRFE(MI->getOpcode()) && checkRFEHazards(MI) > 0)
    return HazardType;

  if (((ST.hasReadM0MovRelInterpHazard() &&
        (isSMovRel(MI->getOpcode()) ||
         MI->getOpcode() == AMDGPU::DS_WRITE_ADDTID_B32 ||
         MI->getOpcode() == AMDGPU::DS_READ_ADDTID_B32)) ||
       (ST.hasReadM0SendMsgHazard() && isSendMsgTraceDataOrGDS(TII, *MI)) ||
       (ST.hasReadM0LdsDmaHazard() && isLdsDma(*MI)) ||
       (ST.hasReadM0LdsDirectHazard() &&
        MI->readsRegister(AMDGPU::LDS_DIRECT, /*TRI=*/nullptr))) &&
      checkReadM0Hazards(MI) > 0)
    return HazardType;

  if (SIInstrInfo::isMAI(*MI) && checkMAIHazards(MI) > 0)
    return HazardType;

  if ((SIInstrInfo::isVMEM(*MI) || SIInstrInfo::isFLAT(*MI) ||
       SIInstrInfo::isDS(*MI)) &&
      checkMAILdStHazards(MI) > 0)
    return HazardType;

  if (MI->isInlineAsm() && checkInlineAsmHazards(MI) > 0)
    return HazardType;

  return NoHazard;
}
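// getHazardType() only classifies; PreEmitNoopsCommon() below computes the
// concrete wait-state counts, and fixHazards() rewrites code for the cases
// where inserting s_nops alone is not the right mitigation.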
static void insertNoopsInBundle(MachineInstr *MI, const SIInstrInfo &TII,
                                unsigned Quantity) {
  while (Quantity > 0) {
    unsigned Arg = std::min(Quantity, 8u);
    Quantity -= Arg;
    BuildMI(*MI->getParent(), MI, MI->getDebugLoc(), TII.get(AMDGPU::S_NOP))
        .addImm(Arg - 1);
  }
}
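// Note that s_nop's immediate encodes "idle for N + 1 cycles", which is why
// the loop above emits ceil(Quantity / 8) instructions with addImm(Arg - 1).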
unsigned
GCNHazardRecognizer::getMFMAPipelineWaitStates(const MachineInstr &MI) const {
  const MCSchedClassDesc *SC = TSchedModel.resolveSchedClass(&MI);
  assert(TSchedModel.getWriteProcResBegin(SC) !=
         TSchedModel.getWriteProcResEnd(SC));
  return TSchedModel.getWriteProcResBegin(SC)->ReleaseAtCycle;
}
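// ReleaseAtCycle of the first write resource is used here as a proxy for the
// number of passes an MFMA spends in the pipeline; the checkMAI*Hazards()
// tables below are keyed off this pass count (2/4/8/16).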
void GCNHazardRecognizer::processBundle() {
  MachineBasicBlock::instr_iterator MI =
      std::next(CurrCycleInstr->getIterator());
  MachineBasicBlock::instr_iterator E =
      CurrCycleInstr->getParent()->instr_end();
  // Check bundled MachineInstr's for hazards.
  for (; MI != E && MI->isInsideBundle(); ++MI) {
    CurrCycleInstr = &*MI;
    unsigned WaitStates = PreEmitNoopsCommon(CurrCycleInstr);

    if (IsHazardRecognizerMode) {
      fixHazards(CurrCycleInstr);
      insertNoopsInBundle(CurrCycleInstr, TII, WaitStates);
    }

    // It's unnecessary to track more than MaxLookAhead instructions. Since we
    // include the bundled MI directly after, only add a maximum of
    // (MaxLookAhead - 1) noops to EmittedInstrs.
    for (unsigned i = 0, e = std::min(WaitStates, MaxLookAhead - 1); i < e; ++i)
      EmittedInstrs.push_front(nullptr);

    EmittedInstrs.push_front(CurrCycleInstr);
    EmittedInstrs.resize(MaxLookAhead);
  }
  CurrCycleInstr = nullptr;
}
void GCNHazardRecognizer::runOnInstruction(MachineInstr *MI) {
  assert(IsHazardRecognizerMode);

  unsigned NumPreNoops = PreEmitNoops(MI);
  if (MI->isInsideBundle())
    insertNoopsInBundle(MI, TII, NumPreNoops);
  else
    TII.insertNoops(*MI->getParent(), MachineBasicBlock::iterator(MI),
                    NumPreNoops);
  EmitInstruction(MI);
  AdvanceCycle();
}

unsigned GCNHazardRecognizer::PreEmitNoops(MachineInstr *MI) {
  IsHazardRecognizerMode = true;
  CurrCycleInstr = MI;
  unsigned W = PreEmitNoopsCommon(MI);
  fixHazards(MI);
  CurrCycleInstr = nullptr;
  return W;
}
unsigned GCNHazardRecognizer::PreEmitNoopsCommon(MachineInstr *MI) {
  if (MI->isBundle())
    return 0;

  int WaitStates = 0;

  if (SIInstrInfo::isSMRD(*MI))
    return std::max(WaitStates, checkSMRDHazards(MI));

  if (ST.hasNSAtoVMEMBug())
    WaitStates = std::max(WaitStates, checkNSAtoVMEMHazard(MI));

  WaitStates = std::max(WaitStates, checkFPAtomicToDenormModeHazard(MI));

  if (ST.hasNoDataDepHazard())
    return WaitStates;

  if (SIInstrInfo::isVMEM(*MI))
    WaitStates = std::max(WaitStates, checkVMEMHazards(MI));

  if (SIInstrInfo::isVALU(*MI))
    WaitStates = std::max(WaitStates, checkVALUHazards(MI));

  if (SIInstrInfo::isDPP(*MI))
    WaitStates = std::max(WaitStates, checkDPPHazards(MI));

  if (isDivFMas(MI->getOpcode()))
    WaitStates = std::max(WaitStates, checkDivFMasHazards(MI));

  if (isRWLane(MI->getOpcode()))
    WaitStates = std::max(WaitStates, checkRWLaneHazards(MI));

  if ((SIInstrInfo::isVALU(*MI) || SIInstrInfo::isVMEM(*MI) ||
       SIInstrInfo::isFLAT(*MI) || SIInstrInfo::isDS(*MI) ||
       SIInstrInfo::isEXP(*MI)) &&
      checkMAIVALUHazards(MI) > 0)
    WaitStates = std::max(WaitStates, checkMAIVALUHazards(MI));

  if (MI->isInlineAsm())
    return std::max(WaitStates, checkInlineAsmHazards(MI));

  if (isSGetReg(MI->getOpcode()))
    return std::max(WaitStates, checkGetRegHazards(MI));

  if (isSSetReg(MI->getOpcode()))
    return std::max(WaitStates, checkSetRegHazards(MI));

  if (isRFE(MI->getOpcode()))
    return std::max(WaitStates, checkRFEHazards(MI));

  if ((ST.hasReadM0MovRelInterpHazard() &&
       (isSMovRel(MI->getOpcode()) ||
        MI->getOpcode() == AMDGPU::DS_WRITE_ADDTID_B32 ||
        MI->getOpcode() == AMDGPU::DS_READ_ADDTID_B32)) ||
      (ST.hasReadM0SendMsgHazard() && isSendMsgTraceDataOrGDS(TII, *MI)) ||
      (ST.hasReadM0LdsDmaHazard() && isLdsDma(*MI)) ||
      (ST.hasReadM0LdsDirectHazard() &&
       MI->readsRegister(AMDGPU::LDS_DIRECT, /*TRI=*/nullptr)))
    return std::max(WaitStates, checkReadM0Hazards(MI));

  if (SIInstrInfo::isMAI(*MI))
    return std::max(WaitStates, checkMAIHazards(MI));

  if (SIInstrInfo::isVMEM(*MI) || SIInstrInfo::isFLAT(*MI) ||
      SIInstrInfo::isDS(*MI))
    return std::max(WaitStates, checkMAILdStHazards(MI));

  if (ST.hasGFX950Insts() && isPermlane(*MI))
    return std::max(WaitStates, checkPermlaneHazards(MI));

  return WaitStates;
}
void GCNHazardRecognizer::EmitNoop() {
  EmittedInstrs.push_front(nullptr);
}

void GCNHazardRecognizer::AdvanceCycle() {
  // When the scheduler detects a stall, it will call AdvanceCycle() without
  // emitting any instructions.
  if (!CurrCycleInstr) {
    EmittedInstrs.push_front(nullptr);
    return;
  }

  if (CurrCycleInstr->isBundle()) {
    processBundle();
    return;
  }

  unsigned NumWaitStates = TII.getNumWaitStates(*CurrCycleInstr);
  if (!NumWaitStates) {
    CurrCycleInstr = nullptr;
    return;
  }

  // Keep track of emitted instructions.
  EmittedInstrs.push_front(CurrCycleInstr);

  // Add a nullptr for each additional wait state after the first. Make sure
  // not to add more than getMaxLookAhead() items to the list, since we
  // truncate the list to that size right after this loop.
  for (unsigned i = 1, e = std::min(NumWaitStates, getMaxLookAhead()); i < e;
       ++i)
    EmittedInstrs.push_front(nullptr);

  // getMaxLookAhead() is the largest number of wait states we will ever need
  // to insert, so there is no point in keeping track of more than that many
  // wait states.
  EmittedInstrs.resize(getMaxLookAhead());

  CurrCycleInstr = nullptr;
}
void GCNHazardRecognizer::RecedeCycle() {
  assert(!IsHazardRecognizerMode &&
         "Bottom-up scheduling shouldn't run in hazard recognizer mode");
}
template <typename StateT> struct StateMapKey {
  const SmallVectorImpl<StateT> *States = nullptr;
  unsigned Idx = 0;

  static bool isEqual(const StateMapKey &LHS, const StateMapKey &RHS) {
    return LHS.States == RHS.States && LHS.Idx == RHS.Idx;
  }

  static inline StateMapKey getEmptyKey() {
    return {nullptr, DenseMapInfo<unsigned>::getEmptyKey()};
  }

  static inline StateMapKey getTombstoneKey() {
    return {nullptr, DenseMapInfo<unsigned>::getTombstoneKey()};
  }
};

// DenseMap traits: hash and compare keys by the state they refer to, while
// handling the empty/tombstone sentinels by identity.
template <typename StateT> struct StateMapKeyInfo {
  using KeyT = StateMapKey<StateT>;

  static inline KeyT getEmptyKey() { return KeyT::getEmptyKey(); }
  static inline KeyT getTombstoneKey() { return KeyT::getTombstoneKey(); }

  static unsigned getHashValue(const KeyT &Key) {
    return StateT::getHashValue((*Key.States)[Key.Idx]);
  }

  static unsigned getHashValue(const StateT &State) {
    return StateT::getHashValue(State);
  }

  static bool isEqual(const KeyT &LHS, const KeyT &RHS) {
    const auto EKey = getEmptyKey();
    const auto TKey = getTombstoneKey();
    if (KeyT::isEqual(LHS, EKey) || KeyT::isEqual(RHS, EKey) ||
        KeyT::isEqual(LHS, TKey) || KeyT::isEqual(RHS, TKey))
      return KeyT::isEqual(LHS, RHS);
    return StateT::isEqual((*LHS.States)[LHS.Idx], (*RHS.States)[RHS.Idx]);
  }

  static bool isEqual(const StateT &LHS, const KeyT &RHS) {
    if (KeyT::isEqual(RHS, getEmptyKey()) ||
        KeyT::isEqual(RHS, getTombstoneKey()))
      return false;
    return StateT::isEqual(LHS, (*RHS.States)[RHS.Idx]);
  }
};
  StateT State = InitialState;

  unsigned StateIdx = 0;
  unsigned WorkIdx = 0;
  for (;;) {
    bool Expired = false;
    for (auto E = MBB->instr_rend(); I != E; ++I) {
      auto Result = IsHazard(State, *I);
      if (Result == HazardFound)
        return true;
      if (Result == HazardExpired) {
        Expired = true;
        break;
      }

      // Inline asm and meta instructions do not advance the hazard state.
      if (I->isInlineAsm() || I->isMetaInstruction())
        continue;

      UpdateState(State, *I);
    }

    if (!Expired) {
      // Deduplicate the resulting state so each (block, state) pair is
      // queued and searched at most once.
      unsigned StateIdx = States.size();
      StateMapKey Key = {&States, StateIdx};
      auto Insertion = StateMap.insert_as(std::pair(Key, StateIdx), State);
      if (Insertion.second) {
        States.push_back(State);
      } else {
        StateIdx = Insertion.first->second;
      }
      for (MachineBasicBlock *Pred : MBB->predecessors())
        Worklist.insert(std::pair(Pred, StateIdx));
    }

    if (WorkIdx == Worklist.size())
      break;

    std::tie(MBB, StateIdx) = Worklist[WorkIdx++];
    State = States[StateIdx];
    I = MBB->instr_rbegin();
  }
  return false;
static int getWaitStatesSince(GCNHazardRecognizer::IsHazardFn IsHazard,
                              const MachineBasicBlock *MBB,
                              MachineBasicBlock::const_reverse_instr_iterator I,
                              int WaitStates, IsExpiredFn IsExpired,
                              DenseSet<const MachineBasicBlock *> &Visited,
                              GetNumWaitStatesFn GetNumWaitStates) {
  for (auto E = MBB->instr_rend(); I != E; ++I) {
    // Don't add WaitStates for parent BUNDLE instructions.
    if (I->isBundle())
      continue;

    if (IsHazard(*I))
      return WaitStates;

    if (I->isInlineAsm())
      continue;

    WaitStates += GetNumWaitStates(*I);

    if (IsExpired(*I, WaitStates))
      return std::numeric_limits<int>::max();
  }

  int MinWaitStates = std::numeric_limits<int>::max();
  for (MachineBasicBlock *Pred : MBB->predecessors()) {
    if (!Visited.insert(Pred).second)
      continue;

    int W = getWaitStatesSince(IsHazard, Pred, Pred->instr_rbegin(), WaitStates,
                               IsExpired, Visited, GetNumWaitStates);

    MinWaitStates = std::min(MinWaitStates, W);
  }

  return MinWaitStates;
}

static int getWaitStatesSince(GCNHazardRecognizer::IsHazardFn IsHazard,
                              const MachineInstr *MI, IsExpiredFn IsExpired,
                              GetNumWaitStatesFn GetNumWaitStates =
                                  SIInstrInfo::getNumWaitStates) {
  DenseSet<const MachineBasicBlock *> Visited;
  return getWaitStatesSince(IsHazard, MI->getParent(),
                            std::next(MI->getReverseIterator()), 0, IsExpired,
                            Visited, GetNumWaitStates);
}
int GCNHazardRecognizer::getWaitStatesSince(
    IsHazardFn IsHazard, int Limit, GetNumWaitStatesFn GetNumWaitStates) const {
  if (IsHazardRecognizerMode) {
    auto IsExpiredFn = [Limit](const MachineInstr &, int WaitStates) {
      return WaitStates >= Limit;
    };
    return ::getWaitStatesSince(IsHazard, CurrCycleInstr, IsExpiredFn,
                                GetNumWaitStates);
  }

  int WaitStates = 0;
  for (MachineInstr *MI : EmittedInstrs) {
    if (MI) {
      if (IsHazard(*MI))
        return WaitStates;

      if (MI->isInlineAsm())
        continue;
    }
    WaitStates += MI ? GetNumWaitStates(*MI) : 1;

    if (WaitStates >= Limit)
      break;
  }
  return std::numeric_limits<int>::max();
}
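// Two lookup modes: in hazard recognizer mode the query walks the CFG
// backwards from the current instruction; during scheduling it only scans
// EmittedInstrs, where each nullptr entry stands for one already-accounted
// wait state.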
int GCNHazardRecognizer::getWaitStatesSince(IsHazardFn IsHazard,
                                            int Limit) const {
  return getWaitStatesSince(IsHazard, Limit, SIInstrInfo::getNumWaitStates);
}

int GCNHazardRecognizer::getWaitStatesSinceDef(unsigned Reg,
                                               IsHazardFn IsHazardDef,
                                               int Limit) const {
  const SIRegisterInfo *TRI = ST.getRegisterInfo();

  auto IsHazardFn = [IsHazardDef, TRI, Reg](const MachineInstr &MI) {
    return IsHazardDef(MI) && MI.modifiesRegister(Reg, TRI);
  };

  return getWaitStatesSince(IsHazardFn, Limit);
}

int GCNHazardRecognizer::getWaitStatesSinceSetReg(IsHazardFn IsHazard,
                                                  int Limit) const {
  auto IsHazardFn = [IsHazard](const MachineInstr &MI) {
    return isSSetReg(MI.getOpcode()) && IsHazard(MI);
  };

  return getWaitStatesSince(IsHazardFn, Limit);
}

static void addRegUnits(const SIRegisterInfo &TRI, BitVector &BV,
                        MCRegister Reg) {
  for (MCRegUnit Unit : TRI.regunits(Reg))
    BV.set(static_cast<unsigned>(Unit));
}
void GCNHazardRecognizer::addClauseInst(const MachineInstr &MI) const {
  addRegsToSet(TRI, MI.operands(), ClauseUses, ClauseDefs);
}

int GCNHazardRecognizer::checkSoftClauseHazards(MachineInstr *MEM) const {
  // SMEM soft clauses are only present on VI+, and only matter if XNACK is
  // enabled.
  if (!ST.isXNACKEnabled())
    return 0;

  bool IsSMRD = TII.isSMRD(*MEM);

  // A soft-clause is any group of consecutive SMEM instructions. The hazard
  // is that a register defined by an earlier clause member is used by a
  // later one.
  for (MachineInstr *MI : EmittedInstrs) {
    // When we hit a non-SMEM instruction, the clause is broken.
    if (!MI)
      break;

    if (IsSMRD != SIInstrInfo::isSMRD(*MI))
      break;

    addClauseInst(*MI);
  }

  if (ClauseDefs.none())
    return 0;

  return ClauseDefs.anyCommon(ClauseUses) ? 1 : 0;
}
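// The result is 1 whenever an earlier instruction of the clause defines a
// register unit that a later one uses; a single intervening wait state (or
// any non-SMEM instruction) is enough to split the clause.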
int GCNHazardRecognizer::checkSMRDHazards(MachineInstr *SMRD) const {
  int WaitStatesNeeded = 0;

  WaitStatesNeeded = checkSoftClauseHazards(SMRD);

  // This SMRD hazard only affects SI.
  if (!ST.hasSMRDReadVALUDefHazard())
    return WaitStatesNeeded;

  // A read of an SGPR by SMRD instruction requires 4 wait states when the SGPR
  // was written by a VALU instruction.
  int SmrdSgprWaitStates = 4;
  auto IsHazardDefFn = [this](const MachineInstr &MI) {
    return TII.isVALU(MI);
  };
  auto IsBufferHazardDefFn = [this](const MachineInstr &MI) {
    return TII.isSALU(MI);
  };

  bool IsBufferSMRD = TII.isBufferSMRD(*SMRD);

  for (const MachineOperand &Use : SMRD->uses()) {
    if (!Use.isReg())
      continue;
    int WaitStatesNeededForUse =
        SmrdSgprWaitStates - getWaitStatesSinceDef(Use.getReg(), IsHazardDefFn,
                                                   SmrdSgprWaitStates);
    WaitStatesNeeded = std::max(WaitStatesNeeded, WaitStatesNeededForUse);

    // This fixes what appears to be undocumented hardware behavior in SI where
    // an SMRD with a buffer descriptor written by an SALU needs padding.
    if (IsBufferSMRD) {
      int WaitStatesNeededForUse =
          SmrdSgprWaitStates - getWaitStatesSinceDef(Use.getReg(),
                                                     IsBufferHazardDefFn,
                                                     SmrdSgprWaitStates);
      WaitStatesNeeded = std::max(WaitStatesNeeded, WaitStatesNeededForUse);
    }
  }

  return WaitStatesNeeded;
}
int GCNHazardRecognizer::checkVMEMHazards(MachineInstr *VMEM) const {
  if (!ST.hasVMEMReadSGPRVALUDefHazard())
    return 0;

  int WaitStatesNeeded = checkSoftClauseHazards(VMEM);

  // A read of an SGPR by a VMEM instruction requires 5 wait states when the
  // SGPR was written by a VALU instruction.
  const int VmemSgprWaitStates = 5;
  auto IsHazardDefFn = [this](const MachineInstr &MI) {
    return TII.isVALU(MI);
  };
  for (const MachineOperand &Use : VMEM->uses()) {
    if (!Use.isReg() || TRI.isVectorRegister(MF.getRegInfo(), Use.getReg()))
      continue;

    int WaitStatesNeededForUse =
        VmemSgprWaitStates - getWaitStatesSinceDef(Use.getReg(), IsHazardDefFn,
                                                   VmemSgprWaitStates);
    WaitStatesNeeded = std::max(WaitStatesNeeded, WaitStatesNeededForUse);
  }
  return WaitStatesNeeded;
}
int GCNHazardRecognizer::checkDPPHazards(MachineInstr *DPP) const {
  const SIRegisterInfo *TRI = ST.getRegisterInfo();
  const SIInstrInfo *TII = ST.getInstrInfo();

  // Check for DPP VGPR read after any VGPR write (2 wait states), and for DPP
  // after a VALU write of EXEC (5 wait states).
  int DppVgprWaitStates = 2;
  int DppExecWaitStates = 5;
  int WaitStatesNeeded = 0;
  auto IsHazardDefFn = [TII](const MachineInstr &MI) {
    return TII->isVALU(MI);
  };

  for (const MachineOperand &Use : DPP->uses()) {
    if (!Use.isReg() || !TRI->isVGPR(MF.getRegInfo(), Use.getReg()))
      continue;
    int WaitStatesNeededForUse =
        DppVgprWaitStates - getWaitStatesSinceDef(
                                Use.getReg(),
                                [](const MachineInstr &) { return true; },
                                DppVgprWaitStates);
    WaitStatesNeeded = std::max(WaitStatesNeeded, WaitStatesNeededForUse);
  }

  WaitStatesNeeded = std::max(
      WaitStatesNeeded,
      DppExecWaitStates - getWaitStatesSinceDef(AMDGPU::EXEC, IsHazardDefFn,
                                                DppExecWaitStates));

  return WaitStatesNeeded;
}
int GCNHazardRecognizer::checkDivFMasHazards(MachineInstr *DivFMas) const {
  const SIInstrInfo *TII = ST.getInstrInfo();

  // v_div_fmas requires 4 wait states after a write to VCC from a VALU
  // instruction.
  const int DivFMasWaitStates = 4;
  auto IsHazardDefFn = [TII](const MachineInstr &MI) {
    return TII->isVALU(MI);
  };
  int WaitStatesNeeded = getWaitStatesSinceDef(AMDGPU::VCC, IsHazardDefFn,
                                               DivFMasWaitStates);

  return DivFMasWaitStates - WaitStatesNeeded;
}
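// As with the other check*Hazards() helpers, the returned value can be
// negative (or very negative when no producer is found within the search
// limit); callers only act on results greater than zero.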
int GCNHazardRecognizer::checkGetRegHazards(MachineInstr *GetRegInstr) const {
  const SIInstrInfo *TII = ST.getInstrInfo();
  unsigned GetRegHWReg = getHWReg(TII, *GetRegInstr);

  const int GetRegWaitStates = 2;
  auto IsHazardFn = [TII, GetRegHWReg](const MachineInstr &MI) {
    return GetRegHWReg == getHWReg(TII, MI);
  };
  int WaitStatesNeeded = getWaitStatesSinceSetReg(IsHazardFn, GetRegWaitStates);

  return GetRegWaitStates - WaitStatesNeeded;
}

int GCNHazardRecognizer::checkSetRegHazards(MachineInstr *SetRegInstr) const {
  const SIInstrInfo *TII = ST.getInstrInfo();
  unsigned HWReg = getHWReg(TII, *SetRegInstr);

  const int SetRegWaitStates = ST.getSetRegWaitStates();
  auto IsHazardFn = [TII, HWReg](const MachineInstr &MI) {
    return HWReg == getHWReg(TII, MI);
  };
  int WaitStatesNeeded = getWaitStatesSinceSetReg(IsHazardFn, SetRegWaitStates);
  return SetRegWaitStates - WaitStatesNeeded;
}
int GCNHazardRecognizer::createsVALUHazard(const MachineInstr &MI) const {
  if (!MI.mayStore())
    return -1;

  const SIInstrInfo *TII = ST.getInstrInfo();
  unsigned Opcode = MI.getOpcode();
  const MCInstrDesc &Desc = MI.getDesc();

  int VDataIdx = AMDGPU::getNamedOperandIdx(Opcode, AMDGPU::OpName::vdata);
  int VDataRCID = -1;
  if (VDataIdx != -1)
    VDataRCID = TII->getOpRegClassID(Desc.operands()[VDataIdx]);

  if (TII->isMUBUF(MI) || TII->isMTBUF(MI)) {
    // There is no hazard if the instruction does not use vector regs.
    if (VDataIdx == -1)
      return -1;
    // For MUBUF/MTBUF instructions this hazard only exists if the instruction
    // is not using a register in the soffset field.
    const MachineOperand *SOffset =
        TII->getNamedOperand(MI, AMDGPU::OpName::soffset);
    // If we have no soffset operand, then assume this field has been hardcoded
    // to zero.
    if (AMDGPU::getRegBitWidth(VDataRCID) > 64 &&
        (!SOffset || !SOffset->isReg()))
      return VDataIdx;
  }

  // MIMG instructions create a hazard if they don't use a 256-bit T#. All our
  // MIMG definitions use a 256-bit T#, so we can skip checking for them.
  if (TII->isMIMG(MI)) {
    int SRsrcIdx = AMDGPU::getNamedOperandIdx(Opcode, AMDGPU::OpName::srsrc);
    assert(SRsrcIdx != -1 &&
           AMDGPU::getRegBitWidth(TII->getOpRegClassID(
               Desc.operands()[SRsrcIdx])) == 256);
    (void)SRsrcIdx;
  }

  if (TII->isFLAT(MI)) {
    if (AMDGPU::getRegBitWidth(VDataRCID) > 64)
      return VDataIdx;
  }

  return -1;
}
int GCNHazardRecognizer::checkVALUHazardsHelper(
    const MachineOperand &Def, const MachineRegisterInfo &MRI) const {
  const SIRegisterInfo *TRI = ST.getRegisterInfo();

  const int VALUWaitStates = ST.hasGFX940Insts() ? 2 : 1;
  int WaitStatesNeeded = 0;

  if (!TRI->isVectorRegister(MRI, Def.getReg()))
    return WaitStatesNeeded;
  Register Reg = Def.getReg();
  auto IsHazardFn = [this, Reg, TRI](const MachineInstr &MI) {
    int DataIdx = createsVALUHazard(MI);
    return DataIdx >= 0 &&
           TRI->regsOverlap(MI.getOperand(DataIdx).getReg(), Reg);
  };

  int WaitStatesNeededForDef =
      VALUWaitStates - getWaitStatesSince(IsHazardFn, VALUWaitStates);
  WaitStatesNeeded = std::max(WaitStatesNeeded, WaitStatesNeededForDef);

  return WaitStatesNeeded;
}
static const MachineOperand *
getDstSelForwardingOperand(const MachineInstr &MI, const GCNSubtarget &ST) {
  const SIInstrInfo *TII = ST.getInstrInfo();
  unsigned Opcode = MI.getOpcode();

  // SDWA with dst_sel != DWORD forwards its dest.
  if (SIInstrInfo::isSDWA(MI)) {
    if (auto *DstSel = TII->getNamedOperand(MI, AMDGPU::OpName::dst_sel))
      if (DstSel->getImm() != AMDGPU::SDWA::DWORD)
        return TII->getNamedOperand(MI, AMDGPU::OpName::vdst);
    return nullptr;
  }

  // VOP3 writing the hi half of the dest (op_sel on the dest) forwards it.
  if (TII->getNamedImmOperand(MI, AMDGPU::OpName::src0_modifiers) &
      SISrcMods::DST_OP_SEL)
    return TII->getNamedOperand(MI, AMDGPU::OpName::vdst);

  // FP8/BF8 conversions with op_sel bits set also forward the dest.
  if ((TII->getNamedImmOperand(MI, AMDGPU::OpName::src2_modifiers) &
       SISrcMods::OP_SEL_0))
    return TII->getNamedOperand(MI, AMDGPU::OpName::vdst);

  return TII->getNamedOperand(MI, AMDGPU::OpName::vdst);
  // Does the VALU consume the forwarded dest of a prior instruction?
  for (auto &Operand : VALU->operands()) {
    if (Operand.isReg() &&
        TRI->regsOverlap(Dst->getReg(), Operand.getReg())) {
      return true;
    }
  }
  return false;
int GCNHazardRecognizer::checkVALUHazards(MachineInstr *VALU) const {
  int WaitStatesNeeded = 0;

  if (ST.hasTransForwardingHazard() && !SIInstrInfo::isTRANS(*VALU)) {
    const int TransDefWaitstates = 1;

    auto IsTransDefFn = [this, VALU](const MachineInstr &MI) {
      if (!SIInstrInfo::isTRANS(MI))
        return false;
      const SIRegisterInfo *TRI = ST.getRegisterInfo();
      const SIInstrInfo *TII = ST.getInstrInfo();
      Register Def = TII->getNamedOperand(MI, AMDGPU::OpName::vdst)->getReg();

      for (const MachineOperand &Use : VALU->explicit_uses()) {
        if (Use.isReg() && TRI->regsOverlap(Def, Use.getReg()))
          return true;
      }

      return false;
    };

    int WaitStatesNeededForDef =
        TransDefWaitstates -
        getWaitStatesSince(IsTransDefFn, TransDefWaitstates);
    WaitStatesNeeded = std::max(WaitStatesNeeded, WaitStatesNeededForDef);
  }

  if (ST.hasDstSelForwardingHazard() || ST.hasCvtScaleForwardingHazard()) {
    const int Shift16DefWaitstates = 1;

    auto IsShift16BitDefFn = [this, VALU](const MachineInstr &ProducerMI) {
      const SIRegisterInfo *TRI = ST.getRegisterInfo();
      const MachineOperand *ForwardedDst =
          getDstSelForwardingOperand(ProducerMI, ST);
      if (ForwardedDst)
        return consumesDstSelForwardingOperand(VALU, ForwardedDst, TRI);

      if (ProducerMI.isInlineAsm()) {
        // Assume inline asm has a dst forwarding hazard.
        for (auto &Def : ProducerMI.all_defs()) {
          if (consumesDstSelForwardingOperand(VALU, &Def, TRI))
            return true;
        }
      }

      return false;
    };

    int WaitStatesNeededForDef =
        Shift16DefWaitstates -
        getWaitStatesSince(IsShift16BitDefFn, Shift16DefWaitstates);
    WaitStatesNeeded = std::max(WaitStatesNeeded, WaitStatesNeededForDef);
  }

  if (ST.hasVDecCoExecHazard()) {
    const int VALUWriteSGPRVALUReadWaitstates = 2;
    const int VALUWriteEXECRWLane = 4;
    const int VALUWriteVGPRReadlaneRead = 1;

    const SIRegisterInfo *TRI = ST.getRegisterInfo();
    const MachineRegisterInfo &MRI = MF.getRegInfo();
    Register UseReg;
    auto IsVALUDefSGPRFn = [&UseReg, TRI](const MachineInstr &MI) {
      if (!SIInstrInfo::isVALU(MI))
        return false;
      return MI.modifiesRegister(UseReg, TRI);
    };

    for (const MachineOperand &Use : VALU->explicit_uses()) {
      if (!Use.isReg())
        continue;

      UseReg = Use.getReg();
      if (TRI->isSGPRReg(MRI, UseReg)) {
        int WaitStatesNeededForDef =
            VALUWriteSGPRVALUReadWaitstates -
            getWaitStatesSince(IsVALUDefSGPRFn,
                               VALUWriteSGPRVALUReadWaitstates);
        WaitStatesNeeded = std::max(WaitStatesNeeded, WaitStatesNeededForDef);
      }
    }

    if (VALU->readsRegister(AMDGPU::VCC, TRI)) {
      UseReg = AMDGPU::VCC;
      int WaitStatesNeededForDef =
          VALUWriteSGPRVALUReadWaitstates -
          getWaitStatesSince(IsVALUDefSGPRFn, VALUWriteSGPRVALUReadWaitstates);
      WaitStatesNeeded = std::max(WaitStatesNeeded, WaitStatesNeededForDef);
    }

    switch (VALU->getOpcode()) {
    case AMDGPU::V_READLANE_B32:
    case AMDGPU::V_READFIRSTLANE_B32: {
      MachineOperand *Src = TII.getNamedOperand(*VALU, AMDGPU::OpName::src0);
      UseReg = Src->getReg();
      int WaitStatesNeededForDef =
          VALUWriteVGPRReadlaneRead -
          getWaitStatesSince(IsVALUDefSGPRFn, VALUWriteVGPRReadlaneRead);
      WaitStatesNeeded = std::max(WaitStatesNeeded, WaitStatesNeededForDef);
    }
      [[fallthrough]];
    case AMDGPU::V_WRITELANE_B32: {
      UseReg = AMDGPU::EXEC;
      int WaitStatesNeededForDef =
          VALUWriteEXECRWLane -
          getWaitStatesSince(IsVALUDefSGPRFn, VALUWriteEXECRWLane);
      WaitStatesNeeded = std::max(WaitStatesNeeded, WaitStatesNeededForDef);
      break;
    }
    default:
      break;
    }
  }

  // This checks for the hazard where VMEM instructions that store more than
  // 8 bytes can have their store data overwritten by the next instruction.
  if (!ST.has12DWordStoreHazard())
    return WaitStatesNeeded;

  const MachineRegisterInfo &MRI = MF.getRegInfo();

  for (const MachineOperand &Def : VALU->defs()) {
    WaitStatesNeeded =
        std::max(WaitStatesNeeded, checkVALUHazardsHelper(Def, MRI));
  }

  return WaitStatesNeeded;
}
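// Example of the dst_sel forwarding hazard handled above: an SDWA op writing
// only the high half of v0 (dst_sel:WORD_1) immediately followed by a VALU
// reading v0 needs one intervening wait state on the affected targets.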
int GCNHazardRecognizer::checkInlineAsmHazards(MachineInstr *IA) const {
  // Since inline asm can contain just about anything, leverage the other
  // check*Hazards routines for the hazards that have been problematic so far.

  // See checkVALUHazards().
  if (!ST.has12DWordStoreHazard() && !ST.hasDstSelForwardingHazard() &&
      !ST.hasCvtScaleForwardingHazard())
    return 0;

  const MachineRegisterInfo &MRI = MF.getRegInfo();
  int WaitStatesNeeded = 0;

  for (const MachineOperand &Op :
       llvm::drop_begin(IA->operands(), InlineAsm::MIOp_FirstOperand)) {
    if (Op.isReg() && Op.isDef()) {
      if (!TRI.isVectorRegister(MRI, Op.getReg()))
        continue;

      if (ST.has12DWordStoreHazard()) {
        WaitStatesNeeded =
            std::max(WaitStatesNeeded, checkVALUHazardsHelper(Op, MRI));
      }
    }
  }

  if (ST.hasDstSelForwardingHazard()) {
    const int Shift16DefWaitstates = 1;

    auto IsShift16BitDefFn = [this, &IA](const MachineInstr &ProducerMI) {
      const MachineOperand *Dst = getDstSelForwardingOperand(ProducerMI, ST);
      // Assume inline asm reads the dst.
      if (Dst)
        return IA->modifiesRegister(Dst->getReg(), &TRI) ||
               IA->readsRegister(Dst->getReg(), &TRI);

      if (ProducerMI.isInlineAsm()) {
        // If the producer is inline asm, assume it has a dst forwarding
        // hazard on any def that IA touches.
        for (auto &Def : ProducerMI.all_defs()) {
          if (IA->modifiesRegister(Def.getReg(), &TRI) ||
              IA->readsRegister(Def.getReg(), &TRI)) {
            return true;
          }
        }
      }

      return false;
    };

    int WaitStatesNeededForDef =
        Shift16DefWaitstates -
        getWaitStatesSince(IsShift16BitDefFn, Shift16DefWaitstates);
    WaitStatesNeeded = std::max(WaitStatesNeeded, WaitStatesNeededForDef);
  }

  return WaitStatesNeeded;
}
int GCNHazardRecognizer::checkRWLaneHazards(MachineInstr *RWLane) const {
  const SIInstrInfo *TII = ST.getInstrInfo();
  const SIRegisterInfo *TRI = ST.getRegisterInfo();
  const MachineRegisterInfo &MRI = MF.getRegInfo();

  const MachineOperand *LaneSelectOp =
      TII->getNamedOperand(*RWLane, AMDGPU::OpName::src1);

  if (!LaneSelectOp->isReg() || !TRI->isSGPRReg(MRI, LaneSelectOp->getReg()))
    return 0;

  Register LaneSelectReg = LaneSelectOp->getReg();
  auto IsHazardFn = [TII](const MachineInstr &MI) { return TII->isVALU(MI); };

  const int RWLaneWaitStates = 4;
  int WaitStatesSince = getWaitStatesSinceDef(LaneSelectReg, IsHazardFn,
                                              RWLaneWaitStates);
  return RWLaneWaitStates - WaitStatesSince;
}

int GCNHazardRecognizer::checkRFEHazards(MachineInstr *RFE) const {
  if (!ST.hasRFEHazards())
    return 0;

  const SIInstrInfo *TII = ST.getInstrInfo();

  const int RFEWaitStates = 1;

  auto IsHazardFn = [TII](const MachineInstr &MI) {
    return getHWReg(TII, MI) == AMDGPU::Hwreg::ID_TRAPSTS;
  };
  int WaitStatesNeeded = getWaitStatesSinceSetReg(IsHazardFn, RFEWaitStates);
  return RFEWaitStates - WaitStatesNeeded;
}
int GCNHazardRecognizer::checkReadM0Hazards(MachineInstr *MI) const {
  const SIInstrInfo *TII = ST.getInstrInfo();
  const int ReadM0WaitStates = 1;
  auto IsHazardFn = [TII](const MachineInstr &MI) { return TII->isSALU(MI); };
  return ReadM0WaitStates -
         getWaitStatesSinceDef(AMDGPU::M0, IsHazardFn, ReadM0WaitStates);
}
void GCNHazardRecognizer::emitVNops(MachineBasicBlock &MBB,
                                    MachineBasicBlock::iterator InsertPt,
                                    int WaitStatesNeeded, bool IsHoisting) {
  DebugLoc DL;
  for (int I = 0; I < WaitStatesNeeded; ++I)
    BuildMI(MBB, InsertPt, DL, TII.get(AMDGPU::V_NOP_e32));
}
void GCNHazardRecognizer::fixHazards(MachineInstr *MI) {
  fixVMEMtoScalarWriteHazards(MI);
  fixVcmpxPermlaneHazards(MI);
  fixSMEMtoVectorWriteHazards(MI);
  fixVcmpxExecWARHazard(MI);
  fixLdsBranchVmemWARHazard(MI);
  if (ST.hasLdsDirect()) {
    fixLdsDirectVALUHazard(MI);
    fixLdsDirectVMEMHazard(MI);
  }
  fixVALUPartialForwardingHazard(MI);
  fixVALUTransUseHazard(MI);
  fixVALUTransCoexecutionHazards(MI);
  fixWMMAHazards(MI);
  fixWMMACoexecutionHazards(MI);
  fixShift64HighRegBug(MI);
  fixVALUMaskWriteHazard(MI);
  fixRequiredExportPriority(MI);
  if (ST.requiresWaitIdleBeforeGetReg())
    fixGetRegWaitIdle(MI);
  if (ST.hasDsAtomicAsyncBarrierArriveB64PipeBug())
    fixDsAtomicAsyncBarrierArriveB64(MI);
  if (ST.hasScratchBaseForwardingHazard())
    fixScratchBaseForwardingHazard(MI);
  if (ST.setRegModeNeedsVNOPs())
static bool isVCmpXWritesExec(const SIInstrInfo &TII, const SIRegisterInfo &TRI,
                              const MachineInstr &MI) {
  return (TII.isVOPC(MI) ||
          (MI.isCompare() && (TII.isVOP3(MI) || TII.isSDWA(MI)))) &&
         MI.modifiesRegister(AMDGPU::EXEC, &TRI);
}
bool GCNHazardRecognizer::fixVcmpxPermlaneHazards(MachineInstr *MI) {
  if (!ST.hasVcmpxPermlaneHazard() || !isPermlane(*MI))
    return false;

  const SIInstrInfo *TII = ST.getInstrInfo();
  const SIRegisterInfo *TRI = ST.getRegisterInfo();

  auto IsHazardFn = [TII, TRI](const MachineInstr &MI) {
    return isVCmpXWritesExec(*TII, *TRI, MI);
  };

  auto IsExpiredFn = [](const MachineInstr &MI, int) {
    unsigned Opc = MI.getOpcode();
    return SIInstrInfo::isVALU(MI) && Opc != AMDGPU::V_NOP_e32 &&
           Opc != AMDGPU::V_NOP_e64 && Opc != AMDGPU::V_NOP_sdwa;
  };

  if (::getWaitStatesSince(IsHazardFn, MI, IsExpiredFn) ==
      std::numeric_limits<int>::max())
    return false;

  // V_NOP will be discarded by SQ. Use V_MOV_B32 v?, v?. Register must be
  // alive, so use src0 of V_PERMLANE*, which is always a VGPR.
  auto *Src0 = TII->getNamedOperand(*MI, AMDGPU::OpName::src0);
  Register Reg = Src0->getReg();
  bool IsUndef = Src0->isUndef();
  BuildMI(*MI->getParent(), MI, MI->getDebugLoc(),
          TII->get(AMDGPU::V_MOV_B32_e32))
      .addReg(Reg, RegState::Define | (IsUndef ? RegState::Dead : 0))
      .addReg(Reg, IsUndef ? RegState::Undef : RegState::Kill);

  return true;
}
bool GCNHazardRecognizer::fixVMEMtoScalarWriteHazards(MachineInstr *MI) {
  if (!ST.hasVMEMtoScalarWriteHazard())
    return false;
  assert(!ST.hasExtendedWaitCounts());

  if (!SIInstrInfo::isSALU(*MI) && !SIInstrInfo::isSMRD(*MI))
    return false;

  if (MI->getNumDefs() == 0)
    return false;

  const SIRegisterInfo *TRI = ST.getRegisterInfo();

  auto IsHazardFn = [TRI, MI](const MachineInstr &I) {
    if (!SIInstrInfo::isVMEM(I) && !SIInstrInfo::isDS(I))
      return false;

    for (const MachineOperand &Def : MI->defs()) {
      const MachineOperand *Op =
          I.findRegisterUseOperand(Def.getReg(), TRI, false);
      if (!Op)
        continue;
      return true;
    }
    return false;
  };

  auto IsExpiredFn = [](const MachineInstr &MI, int) {
    return SIInstrInfo::isVALU(MI) ||
           (MI.getOpcode() == AMDGPU::S_WAITCNT &&
            !MI.getOperand(0).getImm()) ||
           (MI.getOpcode() == AMDGPU::S_WAITCNT_DEPCTR &&
            AMDGPU::DepCtr::decodeFieldVmVsrc(MI.getOperand(0).getImm()) == 0);
  };

  if (::getWaitStatesSince(IsHazardFn, MI, IsExpiredFn) ==
      std::numeric_limits<int>::max())
    return false;

  const SIInstrInfo *TII = ST.getInstrInfo();
  BuildMI(*MI->getParent(), MI, MI->getDebugLoc(),
          TII->get(AMDGPU::S_WAITCNT_DEPCTR))
      .addImm(AMDGPU::DepCtr::encodeFieldVmVsrc(0));
  return true;
}
bool GCNHazardRecognizer::fixSMEMtoVectorWriteHazards(MachineInstr *MI) {
  if (!ST.hasSMEMtoVectorWriteHazard())
    return false;
  assert(!ST.hasExtendedWaitCounts());

  if (!SIInstrInfo::isVALU(*MI))
    return false;

  AMDGPU::OpName SDSTName;
  switch (MI->getOpcode()) {
  case AMDGPU::V_READLANE_B32:
  case AMDGPU::V_READFIRSTLANE_B32:
    SDSTName = AMDGPU::OpName::vdst;
    break;
  default:
    SDSTName = AMDGPU::OpName::sdst;
    break;
  }

  const SIInstrInfo *TII = ST.getInstrInfo();
  const SIRegisterInfo *TRI = ST.getRegisterInfo();
  const AMDGPU::IsaVersion IV = AMDGPU::getIsaVersion(ST.getCPU());
  const MachineOperand *SDST = TII->getNamedOperand(*MI, SDSTName);
  if (!SDST) {
    for (const auto &MO : MI->implicit_operands()) {
      if (MO.isDef() &&
          TRI->isSGPRClass(TRI->getPhysRegBaseClass(MO.getReg()))) {
        SDST = &MO;
        break;
      }
    }
  }

  if (!SDST)
    return false;

  const Register SDSTReg = SDST->getReg();
  auto IsHazardFn = [SDSTReg, TRI](const MachineInstr &I) {
    return SIInstrInfo::isSMRD(I) && I.readsRegister(SDSTReg, TRI);
  };

  auto IsExpiredFn = [TII, IV](const MachineInstr &MI, int) {
    if (TII->isSALU(MI)) {
      switch (MI.getOpcode()) {
      case AMDGPU::S_SETVSKIP:
      case AMDGPU::S_VERSION:
      case AMDGPU::S_WAITCNT_VSCNT:
      case AMDGPU::S_WAITCNT_VMCNT:
      case AMDGPU::S_WAITCNT_EXPCNT:
        // These instructions cannot mitigate the hazard.
        return false;
      case AMDGPU::S_WAITCNT_LGKMCNT:
        // Reducing lgkmcnt count to 0 always mitigates the hazard.
        return (MI.getOperand(1).getImm() == 0) &&
               (MI.getOperand(0).getReg() == AMDGPU::SGPR_NULL);
      case AMDGPU::S_WAITCNT: {
        const int64_t Imm = MI.getOperand(0).getImm();
        AMDGPU::Waitcnt Decoded = AMDGPU::decodeWaitcnt(IV, Imm);
        // DsCnt corresponds to LGKMCnt here.
        return (Decoded.DsCnt == 0);
      }
      default:
        assert((MI.getOpcode() != AMDGPU::S_WAITCNT_soft ||
                MI.getOpcode() == AMDGPU::S_WAIT_IDLE) &&
               "unexpected wait count instruction");
        // SOPP instructions cannot mitigate the hazard.
        if (TII->isSOPP(MI))
          return false;
        // Any other SALU either breaks the dependency chain or must itself be
        // ordered after the at-risk SMEM by an s_waitcnt lgkmcnt.
        return true;
      }
    }
    return false;
  };

  if (::getWaitStatesSince(IsHazardFn, MI, IsExpiredFn) ==
      std::numeric_limits<int>::max())
    return false;

  BuildMI(*MI->getParent(), MI, MI->getDebugLoc(),
          TII->get(AMDGPU::S_MOV_B32), AMDGPU::SGPR_NULL)
      .addImm(0);
  return true;
}
bool GCNHazardRecognizer::fixVcmpxExecWARHazard(MachineInstr *MI) {
  if (!ST.hasVcmpxExecWARHazard())
    return false;
  assert(!ST.hasExtendedWaitCounts());

  if (!SIInstrInfo::isVALU(*MI))
    return false;

  const SIRegisterInfo *TRI = ST.getRegisterInfo();
  if (!MI->modifiesRegister(AMDGPU::EXEC, TRI))
    return false;

  auto IsHazardFn = [TRI](const MachineInstr &I) {
    if (SIInstrInfo::isVALU(I))
      return false;
    return I.readsRegister(AMDGPU::EXEC, TRI);
  };

  const SIInstrInfo *TII = ST.getInstrInfo();
  auto IsExpiredFn = [TII, TRI](const MachineInstr &MI, int) {
    if (SIInstrInfo::isVALU(MI)) {
      if (TII->getNamedOperand(MI, AMDGPU::OpName::sdst))
        return true;
      for (auto MO : MI.implicit_operands())
        if (MO.isDef() &&
            TRI->isSGPRClass(TRI->getPhysRegBaseClass(MO.getReg())))
          return true;
    }
    if (MI.getOpcode() == AMDGPU::S_WAITCNT_DEPCTR &&
        AMDGPU::DepCtr::decodeFieldSaSdst(MI.getOperand(0).getImm()) == 0)
      return true;
    return false;
  };

  if (::getWaitStatesSince(IsHazardFn, MI, IsExpiredFn) ==
      std::numeric_limits<int>::max())
    return false;

  BuildMI(*MI->getParent(), MI, MI->getDebugLoc(),
          TII->get(AMDGPU::S_WAITCNT_DEPCTR))
      .addImm(AMDGPU::DepCtr::encodeFieldSaSdst(0));
  return true;
}
static bool shouldRunLdsBranchVmemWARHazardFixup(const MachineFunction &MF,
                                                 const GCNSubtarget &ST) {
  if (!ST.hasLdsBranchVmemWARHazard())
    return false;

  // Only run the fixup if the function actually mixes LDS and VMEM
  // operations; otherwise the hazard cannot occur.
  bool HasLds = false;
  bool HasVmem = false;
  for (auto &MBB : MF) {
    for (auto &MI : MBB) {
      HasLds |= SIInstrInfo::isDS(MI);
      HasVmem |= SIInstrInfo::isVMEM(MI);
      if (HasLds && HasVmem)
        return true;
    }
  }
  return false;
}

static bool isStoreCountWaitZero(const MachineInstr &I) {
  return I.getOpcode() == AMDGPU::S_WAITCNT_VSCNT &&
         I.getOperand(0).getReg() == AMDGPU::SGPR_NULL &&
         !I.getOperand(1).getImm();
}
bool GCNHazardRecognizer::fixLdsBranchVmemWARHazard(MachineInstr *MI) {
  if (!RunLdsBranchVmemWARHazardFixup)
    return false;

  assert(ST.hasLdsBranchVmemWARHazard());
  assert(!ST.hasExtendedWaitCounts());

  auto IsHazardInst = [](const MachineInstr &MI) {
    if (SIInstrInfo::isDS(MI))
      return 1; // LDS
    if (SIInstrInfo::isVMEM(MI))
      return 2; // VMEM
    return 0;
  };

  auto InstType = IsHazardInst(*MI);
  if (!InstType)
    return false;

  auto IsExpiredFn = [&IsHazardInst](const MachineInstr &I, int) {
    return IsHazardInst(I) || isStoreCountWaitZero(I);
  };

  auto IsHazardFn = [InstType, &IsHazardInst](const MachineInstr &I) {
    if (!I.isBranch())
      return false;

    auto IsHazardFn = [InstType, IsHazardInst](const MachineInstr &I) {
      auto InstType2 = IsHazardInst(I);
      return InstType2 && InstType != InstType2;
    };

    auto IsExpiredFn = [InstType, &IsHazardInst](const MachineInstr &I, int) {
      auto InstType2 = IsHazardInst(I);
      if (InstType == InstType2)
        return true;

      return isStoreCountWaitZero(I);
    };

    return ::getWaitStatesSince(IsHazardFn, &I, IsExpiredFn) !=
           std::numeric_limits<int>::max();
  };

  if (::getWaitStatesSince(IsHazardFn, MI, IsExpiredFn) ==
      std::numeric_limits<int>::max())
    return false;

  const SIInstrInfo *TII = ST.getInstrInfo();
  BuildMI(*MI->getParent(), MI, MI->getDebugLoc(),
          TII->get(AMDGPU::S_WAITCNT_VSCNT))
      .addReg(AMDGPU::SGPR_NULL, RegState::Undef)
      .addImm(0);
  return true;
}
bool GCNHazardRecognizer::fixLdsDirectVALUHazard(MachineInstr *MI) {
  if (!SIInstrInfo::isLDSDIR(*MI))
    return false;

  const int NoHazardWaitStates = 15;
  const MachineOperand *VDST = TII.getNamedOperand(*MI, AMDGPU::OpName::vdst);
  const Register VDSTReg = VDST->getReg();

  bool VisitedTrans = false;
  auto IsHazardFn = [this, VDSTReg, &VisitedTrans](const MachineInstr &I) {
    if (!SIInstrInfo::isVALU(I))
      return false;
    VisitedTrans = VisitedTrans || SIInstrInfo::isTRANS(I);
    // Cover both WAR and WAW.
    return I.readsRegister(VDSTReg, &TRI) || I.modifiesRegister(VDSTReg, &TRI);
  };
  auto IsExpiredFn = [&](const MachineInstr &I, int WaitStates) {
    if (WaitStates >= NoHazardWaitStates)
      return true;
    // Instructions which cause va_vdst==0 expire the hazard.
    return SIInstrInfo::isVMEM(I) || SIInstrInfo::isDS(I) ||
           SIInstrInfo::isEXP(I);
  };
  auto GetWaitStatesFn = [](const MachineInstr &MI) {
    return SIInstrInfo::isVALU(MI) ? 1 : 0;
  };

  DenseSet<const MachineBasicBlock *> Visited;
  auto Count = ::getWaitStatesSince(IsHazardFn, MI->getParent(),
                                    std::next(MI->getReverseIterator()), 0,
                                    IsExpiredFn, Visited, GetWaitStatesFn);

  // Transcendentals can execute in parallel to other VALUs, which makes the
  // va_vdst count unusable with a mixture of VALU and TRANS.
  if (VisitedTrans)
    Count = 0;

  MachineOperand *WaitVdstOp =
      TII.getNamedOperand(*MI, AMDGPU::OpName::waitvdst);
  WaitVdstOp->setImm(std::min(Count, NoHazardWaitStates));

  return true;
}
bool GCNHazardRecognizer::fixLdsDirectVMEMHazard(MachineInstr *MI) {
  if (!SIInstrInfo::isLDSDIR(*MI))
    return false;

  const MachineOperand *VDST = TII.getNamedOperand(*MI, AMDGPU::OpName::vdst);
  const Register VDSTReg = VDST->getReg();

  auto IsHazardFn = [this, VDSTReg](const MachineInstr &I) {
    if (!SIInstrInfo::isVMEM(I) && !SIInstrInfo::isDS(I))
      return false;
    return I.readsRegister(VDSTReg, &TRI) || I.modifiesRegister(VDSTReg, &TRI);
  };
  bool LdsdirCanWait = ST.hasLdsWaitVMSRC();
  auto IsExpiredFn = [this, LdsdirCanWait](const MachineInstr &I, int) {
    return SIInstrInfo::isVALU(I) || SIInstrInfo::isEXP(I) ||
           (I.getOpcode() == AMDGPU::S_WAITCNT && !I.getOperand(0).getImm()) ||
           (I.getOpcode() == AMDGPU::S_WAITCNT_DEPCTR &&
            AMDGPU::DepCtr::decodeFieldVmVsrc(I.getOperand(0).getImm()) == 0) ||
           (LdsdirCanWait && SIInstrInfo::isLDSDIR(I) &&
            !TII.getNamedOperand(I, AMDGPU::OpName::waitvsrc)->getImm());
  };

  if (::getWaitStatesSince(IsHazardFn, MI, IsExpiredFn) ==
      std::numeric_limits<int>::max())
    return false;

  if (LdsdirCanWait) {
    TII.getNamedOperand(*MI, AMDGPU::OpName::waitvsrc)->setImm(0);
  } else {
    BuildMI(*MI->getParent(), MI, MI->getDebugLoc(),
            TII.get(AMDGPU::S_WAITCNT_DEPCTR))
        .addImm(AMDGPU::DepCtr::encodeFieldVmVsrc(0));
  }

  return true;
}
bool GCNHazardRecognizer::fixVALUPartialForwardingHazard(MachineInstr *MI) {
  if (!ST.hasVALUPartialForwardingHazard())
    return false;
  assert(!ST.hasExtendedWaitCounts());

  if (!ST.isWave64() || !SIInstrInfo::isVALU(*MI))
    return false;

  SmallSetVector<Register, 4> SrcVGPRs;

  for (const MachineOperand &Use : MI->explicit_uses()) {
    if (Use.isReg() && TRI.isVGPR(MF.getRegInfo(), Use.getReg()))
      SrcVGPRs.insert(Use.getReg());
  }

  // Only applies with >= 2 unique VGPR sources.
  if (SrcVGPRs.size() <= 1)
    return false;

  // Look for the following pattern:
  //   Va <- VALU [PreExecPos]
  //   intv1
  //   EXEC <- SALU [ExecPos]
  //   intv2
  //   Vb <- VALU [PostExecPos]
  //   intv3
  //   MI Va, Vb (WaitState = 0)
  // where intv1 + intv2 <= 2 VALUs and intv3 <= 4 VALUs.
  // If found, insert an appropriate S_WAITCNT_DEPCTR before MI.

  const int Intv1plus2MaxVALUs = 2;
  const int Intv3MaxVALUs = 4;
  const int IntvMaxVALUs = 6;
  const int NoHazardVALUWaitStates = IntvMaxVALUs + 2;

  struct StateType {
    SmallDenseMap<Register, int, 4> DefPos;
    int ExecPos = std::numeric_limits<int>::max();
    int VALUs = 0;

    static unsigned getHashValue(const StateType &State) {
      return hash_combine(State.ExecPos, State.VALUs);
    }
    static bool isEqual(const StateType &LHS, const StateType &RHS) {
      return LHS.DefPos == RHS.DefPos && LHS.ExecPos == RHS.ExecPos &&
             LHS.VALUs == RHS.VALUs;
    }
  };

  StateType State;

  // This overloads expiry testing with all the hazard detection.
  auto IsHazardFn = [&, this](StateType &State, const MachineInstr &I) {
    // Too many VALU states have passed.
    if (State.VALUs > NoHazardVALUWaitStates)
      return HazardExpired;

    // Instructions which cause va_vdst==0 expire the hazard.
    if (SIInstrInfo::isVMEM(I) || SIInstrInfo::isDS(I) ||
        SIInstrInfo::isEXP(I) ||
        (I.getOpcode() == AMDGPU::S_WAITCNT_DEPCTR &&
         AMDGPU::DepCtr::decodeFieldVaVdst(I.getOperand(0).getImm()) == 0))
      return HazardExpired;

    // Track register writes.
    bool Changed = false;
    if (SIInstrInfo::isVALU(I)) {
      for (Register Src : SrcVGPRs) {
        if (!State.DefPos.count(Src) && I.modifiesRegister(Src, &TRI)) {
          State.DefPos[Src] = State.VALUs;
          Changed = true;
        }
      }
    } else if (SIInstrInfo::isSALU(I)) {
      if (State.ExecPos == std::numeric_limits<int>::max()) {
        if (!State.DefPos.empty() && I.modifiesRegister(AMDGPU::EXEC, &TRI)) {
          State.ExecPos = State.VALUs;
          Changed = true;
        }
      }
    }

    // Early expiration: too many VALUs in intv3.
    if (State.VALUs > Intv3MaxVALUs && State.DefPos.empty())
      return HazardExpired;

    // Only evaluate the state if something changed.
    if (!Changed)
      return NoHazardFound;

    // Determine positions of VALUs pre/post exec change.
    if (State.ExecPos == std::numeric_limits<int>::max())
      return NoHazardFound;

    int PreExecPos = std::numeric_limits<int>::max();
    int PostExecPos = std::numeric_limits<int>::max();

    for (auto Entry : State.DefPos) {
      int DefVALUs = Entry.second;
      if (DefVALUs != std::numeric_limits<int>::max()) {
        if (DefVALUs >= State.ExecPos)
          PreExecPos = std::min(PreExecPos, DefVALUs);
        else
          PostExecPos = std::min(PostExecPos, DefVALUs);
      }
    }

    // Need a VALU after the exec change.
    if (PostExecPos == std::numeric_limits<int>::max())
      return NoHazardFound;

    // Too many VALUs in intv3?
    int Intv3VALUs = PostExecPos;
    if (Intv3VALUs > Intv3MaxVALUs)
      return HazardExpired;

    // Too many VALUs in intv2?
    int Intv2VALUs = (State.ExecPos - PostExecPos) - 1;
    if (Intv2VALUs > Intv1plus2MaxVALUs)
      return HazardExpired;

    // Need a VALU before the exec change.
    if (PreExecPos == std::numeric_limits<int>::max())
      return NoHazardFound;

    // Too many VALUs in intv1?
    int Intv1VALUs = PreExecPos - State.ExecPos;
    if (Intv1VALUs > Intv1plus2MaxVALUs)
      return HazardExpired;

    // Too many VALUs in intv1 + intv2?
    if (Intv1VALUs + Intv2VALUs > Intv1plus2MaxVALUs)
      return HazardExpired;

    return HazardFound;
  };
  auto UpdateStateFn = [](StateType &State, const MachineInstr &MI) {
    if (SIInstrInfo::isVALU(MI))
      State.VALUs += 1;
  };

  if (!hasHazard<StateType>(State, IsHazardFn, UpdateStateFn, MI->getParent(),
                            std::next(MI->getReverseIterator())))
    return false;

  BuildMI(*MI->getParent(), MI, MI->getDebugLoc(),
          TII.get(AMDGPU::S_WAITCNT_DEPCTR))
      .addImm(0x0fff);
  return true;
}
bool GCNHazardRecognizer::fixVALUTransUseHazard(MachineInstr *MI) {
  if (!ST.hasVALUTransUseHazard())
    return false;
  assert(!ST.hasExtendedWaitCounts());

  if (!SIInstrInfo::isVALU(*MI))
    return false;

  SmallSet<Register, 4> SrcVGPRs;

  for (const MachineOperand &Use : MI->explicit_uses()) {
    if (Use.isReg() && TRI.isVGPR(MF.getRegInfo(), Use.getReg()))
      SrcVGPRs.insert(Use.getReg());
  }

  // Look for the following pattern:
  //   Va <- TRANS VALU
  //   intv
  //   MI Va (WaitState = 0)
  // where intv <= 5 VALUs / 1 TRANS.
  // If found, insert an appropriate S_WAITCNT_DEPCTR before MI.

  const int IntvMaxVALUs = 5;
  const int IntvMaxTRANS = 1;

  struct StateType {
    int VALUs = 0;
    int TRANS = 0;

    static unsigned getHashValue(const StateType &State) {
      return hash_combine(State.VALUs, State.TRANS);
    }
    static bool isEqual(const StateType &LHS, const StateType &RHS) {
      return LHS.VALUs == RHS.VALUs && LHS.TRANS == RHS.TRANS;
    }
  };

  StateType State;

  // This overloads expiry testing with all the hazard detection.
  auto IsHazardFn = [&, this](StateType &State, const MachineInstr &I) {
    // Too many VALU states have passed.
    if (State.VALUs > IntvMaxVALUs || State.TRANS > IntvMaxTRANS)
      return HazardExpired;

    // Instructions which cause va_vdst==0 expire the hazard.
    if (SIInstrInfo::isVMEM(I) || SIInstrInfo::isDS(I) ||
        SIInstrInfo::isEXP(I) ||
        (I.getOpcode() == AMDGPU::S_WAITCNT_DEPCTR &&
         AMDGPU::DepCtr::decodeFieldVaVdst(I.getOperand(0).getImm()) == 0))
      return HazardExpired;

    // Track register writes.
    if (SIInstrInfo::isTRANS(I)) {
      for (Register Src : SrcVGPRs) {
        if (I.modifiesRegister(Src, &TRI)) {
          return HazardFound;
        }
      }
    }

    return NoHazardFound;
  };
  auto UpdateStateFn = [](StateType &State, const MachineInstr &MI) {
    if (SIInstrInfo::isVALU(MI))
      State.VALUs += 1;
    if (SIInstrInfo::isTRANS(MI))
      State.TRANS += 1;
  };

  if (!hasHazard<StateType>(State, IsHazardFn, UpdateStateFn, MI->getParent(),
                            std::next(MI->getReverseIterator())))
    return false;

  // Hazard is observed - insert a wait on the va_vdst counter.
  BuildMI(*MI->getParent(), MI, MI->getDebugLoc(),
          TII.get(AMDGPU::S_WAITCNT_DEPCTR))
      .addImm(AMDGPU::DepCtr::encodeFieldVaVdst(0));
  return true;
}
bool GCNHazardRecognizer::fixVALUTransCoexecutionHazards(MachineInstr *MI) {
  if (!ST.hasGFX1250Insts() ||
      !SIInstrInfo::isVALU(*MI) || SIInstrInfo::isTRANS(*MI))
    return false;

  const SIInstrInfo *TII = ST.getInstrInfo();
  const SIRegisterInfo *TRI = ST.getRegisterInfo();

  auto IsTransHazardFn = [MI, TII, TRI](const MachineInstr &I) {
    if (!SIInstrInfo::isTRANS(I))
      return false;

    // RAW: TRANS(I) writes, VALU(MI) reads.
    Register TransDef = TII->getNamedOperand(I, AMDGPU::OpName::vdst)->getReg();
    for (const MachineOperand &ValuUse : MI->explicit_uses()) {
      if (ValuUse.isReg() && TRI->regsOverlap(TransDef, ValuUse.getReg()))
        return true;
    }

    auto *ValuDst = TII->getNamedOperand(*MI, AMDGPU::OpName::vdst);
    if (!ValuDst || !ValuDst->isReg())
      return false;

    // WAR: TRANS(I) reads, VALU(MI) writes.
    Register ValuDef = ValuDst->getReg();
    for (const MachineOperand &TransUse : I.explicit_uses()) {
      if (TransUse.isReg() && TRI->regsOverlap(ValuDef, TransUse.getReg()))
        return true;
    }

    return false;
  };

  auto IsExpiredFn = [](const MachineInstr &I, int) {
    return SIInstrInfo::isVALU(I);
  };

  const int HasVALU = std::numeric_limits<int>::max();
  if (::getWaitStatesSince(IsTransHazardFn, MI, IsExpiredFn) == HasVALU)
    return false;

  BuildMI(*MI->getParent(), MI, MI->getDebugLoc(), TII->get(AMDGPU::V_NOP_e32));
  return true;
}
bool GCNHazardRecognizer::fixWMMAHazards(MachineInstr *MI) {
  if (!SIInstrInfo::isWMMA(*MI) && !SIInstrInfo::isSWMMAC(*MI))
    return false;

  const SIInstrInfo *TII = ST.getInstrInfo();
  const SIRegisterInfo *TRI = ST.getRegisterInfo();

  auto IsHazardFn = [MI, TII, TRI, this](const MachineInstr &I) {
    if (!SIInstrInfo::isWMMA(I) && !SIInstrInfo::isSWMMAC(I))
      return false;

    // Src0 (matrix A) or src1 (matrix B) of the current WMMA overlapping with
    // the dest (matrix D) of the previous WMMA is a hazard.
    const Register CurSrc0Reg =
        TII->getNamedOperand(*MI, AMDGPU::OpName::src0)->getReg();
    const Register CurSrc1Reg =
        TII->getNamedOperand(*MI, AMDGPU::OpName::src1)->getReg();

    const Register PrevDstReg =
        TII->getNamedOperand(I, AMDGPU::OpName::vdst)->getReg();

    if (TRI->regsOverlap(PrevDstReg, CurSrc0Reg) ||
        TRI->regsOverlap(PrevDstReg, CurSrc1Reg)) {
      return true;
    }

    // For SWMMAC, the index operand must not overlap the previous dest either.
    if (SIInstrInfo::isSWMMAC(*MI)) {
      const Register CurIndex =
          TII->getNamedOperand(*MI, AMDGPU::OpName::src2)->getReg();
      if (TRI->regsOverlap(PrevDstReg, CurIndex))
        return true;
    }

    return false;
  };

  auto IsExpiredFn = [](const MachineInstr &I, int) {
    return SIInstrInfo::isVALU(I);
  };

  if (::getWaitStatesSince(IsHazardFn, MI, IsExpiredFn) ==
      std::numeric_limits<int>::max())
    return false;

  BuildMI(*MI->getParent(), MI, MI->getDebugLoc(), TII->get(AMDGPU::V_NOP_e32));
  return true;
}
static bool isWMMAHazardInstInCategory(unsigned Latency, const SIInstrInfo *TII,
                                       unsigned Category) {
  assert(Latency <= 16 &&
         "Handle me if the xdl wmma instruction latency changes");
int GCNHazardRecognizer::checkWMMACoexecutionHazards(MachineInstr *MI) const {
  if (!ST.hasGFX1250Insts())
    return 0;

  const SIInstrInfo *TII = ST.getInstrInfo();
  if (!TII->isXDLWMMA(*MI) && !SIInstrInfo::isVALU(*MI))
    return 0;

  // WMMA-to-WMMA and WMMA-to-VALU wait states, indexed by hazard category.
  const int WMMAWaitStates[] = {5, 9, 3, 5};
  const int VALUWaitStates[] = {4, 8, 2, 4};
  unsigned Category = 0;

  auto IsWMMAHazardFn = [MI, TII, &Category, this](const MachineInstr &I) {
    if (!TII->isXDLWMMA(I))
      return false;

    unsigned Latency = TSchedModel.computeInstrLatency(&I);
    if (!isWMMAHazardInstInCategory(Latency, TII, Category))
      return false;

    return hasWMMAToWMMARegOverlap(I, *MI);
  };

  auto IsVALUHazardFn = [MI, TII, &Category, this](const MachineInstr &I) {
    if (!TII->isXDLWMMA(I))
      return false;

    unsigned Latency = TSchedModel.computeInstrLatency(&I);
    if (!isWMMAHazardInstInCategory(Latency, TII, Category))
      return false;

    return hasWMMAToVALURegOverlap(I, *MI);
  };

  auto GetWaitStatesFn = [](const MachineInstr &I) {
    return SIInstrInfo::isVALU(I) ? 1 : 0;
  };

  int Limit = 0;
  int WaitStatesNeeded = -1;
  if (TII->isXDLWMMA(*MI)) {
    for (Category = 0; WaitStatesNeeded < 0 && Category < 4; Category++) {
      Limit = WMMAWaitStates[Category];
      WaitStatesNeeded =
          Limit - getWaitStatesSince(IsWMMAHazardFn, Limit, GetWaitStatesFn);
    }
  } else { // VALU consumer.
    for (Category = 0; WaitStatesNeeded < 0 && Category < 4; Category++) {
      Limit = VALUWaitStates[Category];
      WaitStatesNeeded =
          Limit - getWaitStatesSince(IsVALUHazardFn, Limit, GetWaitStatesFn);
    }
  }

  return WaitStatesNeeded;
}
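// The categories partition XDL WMMA producers by latency class, and the two
// tables give the required gap per category. A negative final result means
// no overlapping producer was found within any category's limit.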
bool GCNHazardRecognizer::hasWMMAToWMMARegOverlap(
    const MachineInstr &WMMA, const MachineInstr &MI) const {
  Register D0 = TII.getNamedOperand(WMMA, AMDGPU::OpName::vdst)->getReg();
  Register A1 = TII.getNamedOperand(MI, AMDGPU::OpName::src0)->getReg();
  Register B1 = TII.getNamedOperand(MI, AMDGPU::OpName::src1)->getReg();

  // The WMMA dest overlapping with the next WMMA's A or B matrix is a hazard.
  if (TRI.regsOverlap(D0, A1) || TRI.regsOverlap(D0, B1))
    return true;

  // For SWMMAC, the index operand must not overlap either.
  if (SIInstrInfo::isSWMMAC(MI)) {
    Register Idx1 = TII.getNamedOperand(MI, AMDGPU::OpName::src2)->getReg();
    if (TRI.regsOverlap(D0, Idx1))
      return true;
  }

  return false;
}

bool GCNHazardRecognizer::hasWMMAToVALURegOverlap(
    const MachineInstr &WMMA, const MachineInstr &MI) const {
  Register D0 = TII.getNamedOperand(WMMA, AMDGPU::OpName::vdst)->getReg();
  // A VALU reading the WMMA dest is a hazard.
  for (const MachineOperand &ValuUse : MI.explicit_uses()) {
    if (ValuUse.isReg() && TRI.regsOverlap(D0, ValuUse.getReg()))
      return true;
  }

  SmallVector<Register, 3> WMMARegs;
  Register A0 = TII.getNamedOperand(WMMA, AMDGPU::OpName::src0)->getReg();
  Register B0 = TII.getNamedOperand(WMMA, AMDGPU::OpName::src1)->getReg();
  WMMARegs.push_back(A0);
  WMMARegs.push_back(B0);
  if (SIInstrInfo::isSWMMAC(WMMA)) {
    Register Idx0 = TII.getNamedOperand(WMMA, AMDGPU::OpName::src2)->getReg();
    WMMARegs.push_back(Idx0);
  }

  // A VALU writing any WMMA source is a hazard.
  for (const MachineOperand &ValuDef : MI.defs()) {
    Register VDstReg = ValuDef.getReg();
    for (Register WMMAReg : WMMARegs) {
      if (TRI.regsOverlap(VDstReg, WMMAReg))
        return true;
    }
  }

  return false;
}

bool GCNHazardRecognizer::isCoexecutionHazardFor(const MachineInstr &I,
                                                 const MachineInstr &MI) const {
  if (!TII.isXDLWMMA(I))
    return false;

  if (TII.isXDLWMMA(MI))
    return hasWMMAToWMMARegOverlap(I, MI);

  return hasWMMAToVALURegOverlap(I, MI);
}
bool GCNHazardRecognizer::hasWMMAHazardInLoop(const MachineLoop *L,
                                              const MachineInstr *MI,
                                              bool IncludeSubloops) {
  for (MachineBasicBlock *MBB : L->getBlocks()) {
    if (!IncludeSubloops && MLI->getLoopFor(MBB) != L)
      continue;
    for (MachineInstr &I : *MBB) {
      if (isCoexecutionHazardFor(I, *MI))
        return true;
    }
  }
  return false;
}

bool GCNHazardRecognizer::tryHoistWMMAVnopsFromLoop(MachineInstr *MI,
                                                    int WaitStatesNeeded) {
  MachineLoop *L = MLI->getLoopFor(MI->getParent());
  if (!L) {
    ++NumWMMAHoistingBailed;
    return false;
  }

  // A hazard-producing WMMA inside the loop itself makes hoisting unsound.
  if (hasWMMAHazardInLoop(L, MI)) {
    ++NumWMMAHoistingBailed;
    return false;
  }

  // Walk outwards to the outermost loop that is still hazard-free.
  MachineLoop *TargetLoop = L;
  for (MachineLoop *Parent = L->getParentLoop(); Parent;
       Parent = Parent->getParentLoop()) {
    if (hasWMMAHazardInLoop(Parent, MI, false))
      break;
    TargetLoop = Parent;
  }

  MachineBasicBlock *Preheader = TargetLoop->getLoopPreheader();
  if (!Preheader) {
    ++NumWMMAHoistingBailed;
    return false;
  }

  LLVM_DEBUG(dbgs() << "WMMA V_NOP Hoisting: Moving " << WaitStatesNeeded
                    << " V_NOPs to the preheader\n");
  emitVNops(*Preheader, Preheader->getFirstTerminator(), WaitStatesNeeded,
            /*IsHoisting=*/true);
  NumWMMANopsHoisted += WaitStatesNeeded;
  return true;
}
bool GCNHazardRecognizer::fixWMMACoexecutionHazards(MachineInstr *MI) {
  int WaitStatesNeeded = checkWMMACoexecutionHazards(MI);
  if (WaitStatesNeeded <= 0)
    return false;

  if (tryHoistWMMAVnopsFromLoop(MI, WaitStatesNeeded))
    return true;

  emitVNops(*MI->getParent(), MI->getIterator(), WaitStatesNeeded);
  return true;
}
bool GCNHazardRecognizer::fixShift64HighRegBug(MachineInstr *MI) {
  if (!ST.hasShift64HighRegBug())
    return false;
  assert(!ST.hasExtendedWaitCounts());

  switch (MI->getOpcode()) {
  default:
    return false;
  case AMDGPU::V_LSHLREV_B64_e64:
  case AMDGPU::V_LSHRREV_B64_e64:
  case AMDGPU::V_ASHRREV_I64_e64:
    break;
  }

  MachineOperand *Amt = TII.getNamedOperand(*MI, AMDGPU::OpName::src0);
  if (!Amt->isReg())
    return false;

  Register AmtReg = Amt->getReg();
  const MachineRegisterInfo &MRI = MF.getRegInfo();
  // Check if this is the last VGPR in the allocation block.
  if (!TRI.isVGPR(MRI, AmtReg) || ((AmtReg - AMDGPU::VGPR0) & 7) != 7)
    return false;

  if (AmtReg != AMDGPU::VGPR255 && MRI.isPhysRegUsed(AmtReg + 1))
    return false;

  assert(ST.needsAlignedVGPRs());
  static_assert(AMDGPU::VGPR0 + 1 == AMDGPU::VGPR1);

  MachineBasicBlock *MBB = MI->getParent();
  MachineOperand *Src1 = TII.getNamedOperand(*MI, AMDGPU::OpName::src1);

  Register DstReg = MI->getOperand(0).getReg();
  Register DstLo = TRI.getSubReg(DstReg, AMDGPU::sub0);
  (void)Src1;
  (void)DstLo;

  bool Overlapped = MI->modifiesRegister(AmtReg, &TRI);

  Register NewReg;
  for (MCRegister Reg : Overlapped ? AMDGPU::VReg_64_Align2RegClass
                                   : AMDGPU::VGPR_32RegClass) {
    if (!MI->modifiesRegister(Reg, &TRI) && !MI->readsRegister(Reg, &TRI)) {
      NewReg = Reg;
      break;
    }
  }

  Register NewAmt = Overlapped ? (Register)TRI.getSubReg(NewReg, AMDGPU::sub1)
                               : NewReg;
  Register NewAmtLo;

  if (Overlapped)
    NewAmtLo = TRI.getSubReg(NewReg, AMDGPU::sub0);

  DebugLoc DL = MI->getDebugLoc();

  // Insert V_SWAP_B32 instruction(s) and run hazard recognizer on them.
  if (Overlapped)
    runOnInstruction(
        BuildMI(*MBB, MI, DL, TII.get(AMDGPU::V_SWAP_B32), NewAmtLo)
            .addDef(AmtReg - 1)
            .addReg(AmtReg - 1, RegState::Undef)
            .addReg(NewAmtLo, RegState::Undef));
  runOnInstruction(BuildMI(*MBB, MI, DL, TII.get(AMDGPU::V_SWAP_B32), NewAmt)
                       .addDef(AmtReg)
                       .addReg(AmtReg, RegState::Undef)
                       .addReg(NewAmt, RegState::Undef));

  // Instructions emitted after the current instruction will be processed by
  // the parent loop of the hazard recognizer in a natural way.
  BuildMI(*MBB, std::next(MI->getIterator()), DL, TII.get(AMDGPU::V_SWAP_B32),
          AmtReg)
      .addDef(NewAmt)
      .addReg(NewAmt)
      .addReg(AmtReg);
  if (Overlapped)
    BuildMI(*MBB, std::next(MI->getIterator()), DL,
            TII.get(AMDGPU::V_SWAP_B32), AmtReg - 1)
        .addDef(NewAmtLo)
        .addReg(NewAmtLo)
        .addReg(AmtReg - 1);

  // Re-running the hazard recognizer on the modified instruction is not
  // necessary: the inserted V_SWAP_B32 has already both read and written the
  // new registers.
  Amt->setReg(NewAmt);
  Amt->setIsKill(false);
  // We do not update liveness, so the verifier may see it as undef.
  Amt->setIsUndef();
  if (Overlapped)
    MI->getOperand(0).setReg(NewReg);

  return true;
}
int GCNHazardRecognizer::checkNSAtoVMEMHazard(MachineInstr *MI) const {
  int NSAtoVMEMWaitStates = 1;

  if (!ST.hasNSAtoVMEMBug())
    return 0;

  if (!SIInstrInfo::isMUBUF(*MI) && !SIInstrInfo::isMTBUF(*MI))
    return 0;

  const SIInstrInfo *TII = ST.getInstrInfo();
  const auto *Offset = TII->getNamedOperand(*MI, AMDGPU::OpName::offset);
  if (!Offset || (Offset->getImm() & 6) == 0)
    return 0;

  auto IsHazardFn = [TII](const MachineInstr &I) {
    if (!SIInstrInfo::isMIMG(I))
      return false;
    const AMDGPU::MIMGInfo *Info = AMDGPU::getMIMGInfo(I.getOpcode());
    return Info->MIMGEncoding == AMDGPU::MIMGEncGfx10NSA &&
           TII->getInstSizeInBytes(I) >= 16;
  };

  return NSAtoVMEMWaitStates - getWaitStatesSince(IsHazardFn, 1);
}

int GCNHazardRecognizer::checkFPAtomicToDenormModeHazard(
    MachineInstr *MI) const {
  int FPAtomicToDenormModeWaitStates = 3;

  if (!ST.hasFPAtomicToDenormModeHazard())
    return 0;
  assert(!ST.hasExtendedWaitCounts());

  if (MI->getOpcode() != AMDGPU::S_DENORM_MODE)
    return 0;

  auto IsHazardFn = [](const MachineInstr &I) {
    return SIInstrInfo::isVMEM(I) && SIInstrInfo::isFPAtomic(I);
  };

  auto IsExpiredFn = [](const MachineInstr &MI, int WaitStates) {
    if (WaitStates >= 3 || SIInstrInfo::isVALU(MI))
      return true;

    switch (MI.getOpcode()) {
    case AMDGPU::S_WAITCNT:
    case AMDGPU::S_WAITCNT_VSCNT:
    case AMDGPU::S_WAITCNT_VMCNT:
    case AMDGPU::S_WAITCNT_EXPCNT:
    case AMDGPU::S_WAITCNT_LGKMCNT:
    case AMDGPU::S_WAIT_IDLE:
      return true;
    default:
      break;
    }

    return false;
  };

  return FPAtomicToDenormModeWaitStates -
         ::getWaitStatesSince(IsHazardFn, MI, IsExpiredFn);
}
int GCNHazardRecognizer::checkMAIHazards(MachineInstr *MI) const {
  return ST.hasGFX90AInsts() ? checkMAIHazards90A(MI) : checkMAIHazards908(MI);
}

int GCNHazardRecognizer::checkMFMAPadding(MachineInstr *MI) const {
  // Early exit if no padding is requested.
  if (MFMAPaddingRatio == 0)
    return 0;

  const SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();
  if (!SIInstrInfo::isMFMA(*MI) || MFI->getOccupancy() < 2)
    return 0;

  int NeighborMFMALatency = 0;
  auto IsNeighboringMFMA = [&NeighborMFMALatency,
                            this](const MachineInstr &MI) {
    if (!SIInstrInfo::isMFMA(MI))
      return false;

    NeighborMFMALatency = this->getMFMAPipelineWaitStates(MI);
    return true;
  };

  const int MaxMFMAPipelineWaitStates = 16;
  int WaitStatesSinceNeighborMFMA =
      getWaitStatesSince(IsNeighboringMFMA, MaxMFMAPipelineWaitStates);

  int NeighborMFMAPaddingNeeded =
      (NeighborMFMALatency * MFMAPaddingRatio / 100) -
      WaitStatesSinceNeighborMFMA;

  return std::max(0, NeighborMFMAPaddingNeeded);
}
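// Worked example: a 16-pass neighboring MFMA with
// -amdgpu-mfma-padding-ratio=75 targets a gap of 16 * 75 / 100 = 12 wait
// states; if 5 have already elapsed, 7 s_nops get inserted.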
int GCNHazardRecognizer::checkMAIHazards908(MachineInstr *MI) const {
  int WaitStatesNeeded = 0;
  unsigned Opc = MI->getOpcode();

  auto IsVALUFn = [](const MachineInstr &MI) {
    return SIInstrInfo::isVALU(MI);
  };

  if (Opc != AMDGPU::V_ACCVGPR_READ_B32_e64) { // MFMA or v_accvgpr_write
    const int LegacyVALUWritesVGPRWaitStates = 2;
    const int VALUWritesExecWaitStates = 4;
    const int MaxWaitStates = 4;

    int WaitStatesNeededForUse = VALUWritesExecWaitStates -
      getWaitStatesSinceDef(AMDGPU::EXEC, IsVALUFn, MaxWaitStates);
    WaitStatesNeeded = std::max(WaitStatesNeeded, WaitStatesNeededForUse);

    if (WaitStatesNeeded < MaxWaitStates) {
      for (const MachineOperand &Use : MI->explicit_uses()) {
        const int MaxWaitStates = 2;

        if (!Use.isReg() || !TRI.isVGPR(MF.getRegInfo(), Use.getReg()))
          continue;

        int WaitStatesNeededForUse = LegacyVALUWritesVGPRWaitStates -
          getWaitStatesSinceDef(Use.getReg(), IsVALUFn, MaxWaitStates);
        WaitStatesNeeded = std::max(WaitStatesNeeded, WaitStatesNeededForUse);

        if (WaitStatesNeeded == MaxWaitStates)
          break;
      }
    }
  }

  for (const MachineOperand &Op : MI->explicit_operands()) {
    if (!Op.isReg() || !TRI.isAGPR(MF.getRegInfo(), Op.getReg()))
      continue;

    if (Op.isDef() && Opc != AMDGPU::V_ACCVGPR_WRITE_B32_e64)
      continue;

    const int MFMAWritesAGPROverlappedSrcABWaitStates = 4;
    const int MFMAWritesAGPROverlappedSrcCWaitStates = 2;
    const int MFMA4x4WritesAGPRAccVgprReadWaitStates = 4;
    const int MFMA16x16WritesAGPRAccVgprReadWaitStates = 10;
    const int MFMA32x32WritesAGPRAccVgprReadWaitStates = 18;
    const int MFMA4x4WritesAGPRAccVgprWriteWaitStates = 1;
    const int MFMA16x16WritesAGPRAccVgprWriteWaitStates = 7;
    const int MFMA32x32WritesAGPRAccVgprWriteWaitStates = 15;
    const int MaxWaitStates = 18;
    Register Reg = Op.getReg();
    unsigned HazardDefLatency = 0;

    auto IsOverlappedMFMAFn = [Reg, &HazardDefLatency,
                               this](const MachineInstr &MI) {
      if (!SIInstrInfo::isMFMA(MI))
        return false;
      Register DstReg = MI.getOperand(0).getReg();
      if (DstReg == Reg)
        return false;
      HazardDefLatency =
          std::max(HazardDefLatency, TSchedModel.computeInstrLatency(&MI));
      return TRI.regsOverlap(DstReg, Reg);
    };

    int WaitStatesSinceDef = getWaitStatesSinceDef(Reg, IsOverlappedMFMAFn,
                                                   MaxWaitStates);
    int NeedWaitStates = MFMAWritesAGPROverlappedSrcABWaitStates;
    int SrcCIdx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::src2);
    int OpNo = Op.getOperandNo();
    if (OpNo == SrcCIdx) {
      NeedWaitStates = MFMAWritesAGPROverlappedSrcCWaitStates;
    } else if (Opc == AMDGPU::V_ACCVGPR_READ_B32_e64) {
      switch (HazardDefLatency) {
      case 2:  NeedWaitStates = MFMA4x4WritesAGPRAccVgprReadWaitStates;
               break;
      case 8:  NeedWaitStates = MFMA16x16WritesAGPRAccVgprReadWaitStates;
               break;
      case 16: [[fallthrough]];
      default: NeedWaitStates = MFMA32x32WritesAGPRAccVgprReadWaitStates;
               break;
      }
    } else if (Opc == AMDGPU::V_ACCVGPR_WRITE_B32_e64) {
      switch (HazardDefLatency) {
      case 2:  NeedWaitStates = MFMA4x4WritesAGPRAccVgprWriteWaitStates;
               break;
      case 8:  NeedWaitStates = MFMA16x16WritesAGPRAccVgprWriteWaitStates;
               break;
      case 16: [[fallthrough]];
      default: NeedWaitStates = MFMA32x32WritesAGPRAccVgprWriteWaitStates;
               break;
      }
    }

    int WaitStatesNeededForUse = NeedWaitStates - WaitStatesSinceDef;
    WaitStatesNeeded = std::max(WaitStatesNeeded, WaitStatesNeededForUse);

    if (WaitStatesNeeded == MaxWaitStates)
      return WaitStatesNeeded; // Early exit.

    auto IsAccVgprWriteFn = [Reg, this](const MachineInstr &MI) {
      if (MI.getOpcode() != AMDGPU::V_ACCVGPR_WRITE_B32_e64)
        return false;
      Register DstReg = MI.getOperand(0).getReg();
      return TRI.regsOverlap(Reg, DstReg);
    };

    const int AccVGPRWriteMFMAReadSrcCWaitStates = 1;
    const int AccVGPRWriteMFMAReadSrcABWaitStates = 3;
    const int AccVGPRWriteAccVgprReadWaitStates = 3;
    NeedWaitStates = AccVGPRWriteMFMAReadSrcABWaitStates;
    if (OpNo == SrcCIdx)
      NeedWaitStates = AccVGPRWriteMFMAReadSrcCWaitStates;
    else if (Opc == AMDGPU::V_ACCVGPR_READ_B32_e64)
      NeedWaitStates = AccVGPRWriteAccVgprReadWaitStates;

    WaitStatesNeededForUse = NeedWaitStates -
      getWaitStatesSinceDef(Reg, IsAccVgprWriteFn, MaxWaitStates);
    WaitStatesNeeded = std::max(WaitStatesNeeded, WaitStatesNeededForUse);

    if (WaitStatesNeeded == MaxWaitStates)
      return WaitStatesNeeded; // Early exit.
  }

  if (Opc == AMDGPU::V_ACCVGPR_WRITE_B32_e64) {
    const int MFMA4x4ReadSrcCAccVgprWriteWaitStates = 0;
    const int MFMA16x16ReadSrcCAccVgprWriteWaitStates = 5;
    const int MFMA32x32ReadSrcCAccVgprWriteWaitStates = 13;
    const int MaxWaitStates = 13;
    Register DstReg = MI->getOperand(0).getReg();
    unsigned HazardDefLatency = 0;

    auto IsSrcCMFMAFn = [DstReg, &HazardDefLatency,
                         this](const MachineInstr &MI) {
      if (!SIInstrInfo::isMFMA(MI))
        return false;
      Register Reg = TII.getNamedOperand(MI, AMDGPU::OpName::src2)->getReg();
      HazardDefLatency =
          std::max(HazardDefLatency, TSchedModel.computeInstrLatency(&MI));
      return TRI.regsOverlap(Reg, DstReg);
    };

    int WaitStatesSince = getWaitStatesSince(IsSrcCMFMAFn, MaxWaitStates);
    int NeedWaitStates;
    switch (HazardDefLatency) {
    case 2:  NeedWaitStates = MFMA4x4ReadSrcCAccVgprWriteWaitStates;
             break;
    case 8:  NeedWaitStates = MFMA16x16ReadSrcCAccVgprWriteWaitStates;
             break;
    case 16: [[fallthrough]];
    default: NeedWaitStates = MFMA32x32ReadSrcCAccVgprWriteWaitStates;
             break;
    }

    int WaitStatesNeededForUse = NeedWaitStates - WaitStatesSince;
    WaitStatesNeeded = std::max(WaitStatesNeeded, WaitStatesNeededForUse);
  }

  // Pad neighboring MFMA with noops for better inter-wave performance.
  WaitStatesNeeded = std::max(WaitStatesNeeded, checkMFMAPadding(MI));

  return WaitStatesNeeded;
}
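// The 2/8/16-pass latencies in the switches above correspond to the 4x4,
// 16x16 and 32x32 MFMA shapes; the deeper the producing MFMA's pipeline,
// the larger the required gap before the consumer.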
static int GFX940_XDL_N_PassWritesVGPROverlappedXDLSrcCWaitStates(
    int NumPasses, bool IsGFX950) {
  return NumPasses + 1 + IsGFX950;
}

static int GFX940_XDL_N_PassWritesVGPROverlappedSGEMMDGEMMSrcCWaitStates(
    int NumPasses, bool IsGFX950) {
  return NumPasses + 1 + (NumPasses != 2 && IsGFX950);
}

static int
GFX940_SMFMA_N_PassWritesVGPROverlappedSMFMASrcCWaitStates(int NumPasses) {
  return NumPasses + 2;
}

static int GFX940_XDL_N_PassWritesVGPROverlappedSrcABWaitStates(
    int NumPasses, bool IsGFX950) {
  return NumPasses + 3 + (NumPasses != 2 && IsGFX950);
}
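// These helpers fold the hardware tables into closed forms: an N-pass
// producer needs roughly N + k wait states before the consumer, where k
// depends on which operand is read and gfx950 usually adds one more cycle.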
int GCNHazardRecognizer::checkMAIHazards90A(MachineInstr *MI) const {
  int WaitStatesNeeded = 0;
  unsigned Opc = MI->getOpcode();

  auto IsLegacyVALUFn = [](const MachineInstr &MI) {
    return SIInstrInfo::isVALU(MI) && !SIInstrInfo::isMFMA(MI);
  };

  auto IsLegacyVALUNotDotFn = [](const MachineInstr &MI) {
    return SIInstrInfo::isVALU(MI) && !SIInstrInfo::isMFMA(MI) &&
           !SIInstrInfo::isDOT(MI);
  };

  if (!SIInstrInfo::isMFMA(*MI))
    return WaitStatesNeeded;

  const int VALUWritesExecWaitStates = 4;
  int WaitStatesNeededForUse = VALUWritesExecWaitStates -
    getWaitStatesSinceDef(AMDGPU::EXEC, IsLegacyVALUFn,
                          VALUWritesExecWaitStates);
  WaitStatesNeeded = std::max(WaitStatesNeeded, WaitStatesNeededForUse);

  int SrcCIdx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::src2);

  // Loop for both DGEMM and S/HGEMM 2nd instruction.
  for (const MachineOperand &Use : MI->explicit_uses()) {
    const int LegacyVALUNotDotWritesVGPRWaitStates = 2;
    const int SMFMA4x4WritesVGPROverlappedSMFMASrcCWaitStates = 2;
    const int SMFMA16x16WritesVGPROverlappedSMFMASrcCWaitStates = 8;
    const int SMFMA32x32WritesVGPROverlappedSMFMASrcCWaitStates = 16;
    const int SMFMA4x4WritesVGPROverlappedDMFMASrcCWaitStates = 3;
    const int SMFMA16x16WritesVGPROverlappedDMFMASrcCWaitStates = 9;
    const int SMFMA32x32WritesVGPROverlappedDMFMASrcCWaitStates = 17;
    const int DMFMA16x16WritesVGPROverlappedSrcCWaitStates = 9;
    const int GFX950_DMFMA16x16WritesVGPROverlappedSrcCWaitStates = 17;
    const int DMFMA4x4WritesVGPROverlappedSrcCWaitStates = 4;
    const int SMFMA4x4WritesVGPROverlappedSrcABWaitStates = 5;
    const int SMFMA16x16WritesVGPROverlappedSrcABWaitStates = 11;
    const int SMFMA32x32WritesVGPROverlappedSrcABWaitStates = 19;
    const int DMFMA4x4WritesVGPROverlappedMFMASrcABWaitStates = 6;
    const int DMFMA16x16WritesVGPROverlappedMFMASrcABWaitStates = 11;
    const int GFX950_DMFMA16x16WritesVGPROverlappedMFMASrcABWaitStates = 19;
    const int DMFMA4x4WritesVGPRFullSrcCWaitStates = 4;
    const int GFX940_SMFMA4x4WritesVGPRFullSrcCWaitStates = 2;
    const int MaxWaitStates = 19;

    if (!Use.isReg())
      continue;
    Register Reg = Use.getReg();
    bool FullReg;
    const MachineInstr *MI1;

    auto IsOverlappedMFMAFn = [Reg, &FullReg, &MI1,
                               this](const MachineInstr &MI) {
      if (!SIInstrInfo::isMFMA(MI))
        return false;
      Register DstReg = MI.getOperand(0).getReg();
      FullReg = (DstReg == Reg);
      MI1 = &MI;
      return TRI.regsOverlap(DstReg, Reg);
    };

    WaitStatesNeededForUse = LegacyVALUNotDotWritesVGPRWaitStates -
      getWaitStatesSinceDef(Reg, IsLegacyVALUNotDotFn, MaxWaitStates);
    WaitStatesNeeded = std::max(WaitStatesNeeded, WaitStatesNeededForUse);

    int NumWaitStates =
        getWaitStatesSinceDef(Reg, IsOverlappedMFMAFn, MaxWaitStates);
    if (NumWaitStates == std::numeric_limits<int>::max())
      continue;

    int OpNo = Use.getOperandNo();
    unsigned Opc1 = MI1->getOpcode();
    int NeedWaitStates = 0;
    if (OpNo == SrcCIdx) {
      if (FullReg) {
        if ((Opc == AMDGPU::V_MFMA_F64_4X4X4F64_e64 ||
             Opc == AMDGPU::V_MFMA_F64_4X4X4F64_vgprcd_e64) &&
            (Opc1 == AMDGPU::V_MFMA_F64_4X4X4F64_e64 ||
             Opc1 == AMDGPU::V_MFMA_F64_4X4X4F64_vgprcd_e64))
          NeedWaitStates = DMFMA4x4WritesVGPRFullSrcCWaitStates;
        else if (ST.hasGFX940Insts() &&
                 TSchedModel.computeInstrLatency(MI1) == 2)
          NeedWaitStates = GFX940_SMFMA4x4WritesVGPRFullSrcCWaitStates;
      } else {
        switch (Opc1) {
        case AMDGPU::V_MFMA_F64_16X16X4F64_e64:
        case AMDGPU::V_MFMA_F64_16X16X4F64_vgprcd_e64:
        case AMDGPU::V_MFMA_F64_16X16X4F64_mac_e64:
        case AMDGPU::V_MFMA_F64_16X16X4F64_mac_vgprcd_e64:
          if (!TII.isXDL(*MI))
            NeedWaitStates =
                ST.hasGFX950Insts()
                    ? GFX950_DMFMA16x16WritesVGPROverlappedSrcCWaitStates
                    : DMFMA16x16WritesVGPROverlappedSrcCWaitStates;
          break;
        case AMDGPU::V_MFMA_F64_4X4X4F64_e64:
        case AMDGPU::V_MFMA_F64_4X4X4F64_vgprcd_e64:
          if (!TII.isXDL(*MI))
            NeedWaitStates = DMFMA4x4WritesVGPROverlappedSrcCWaitStates;
          break;
        default: {
          int NumPasses = TSchedModel.computeInstrLatency(MI1);
          if (ST.hasGFX940Insts()) {
            if (TII.isXDL(*MI) && !TII.isXDL(*MI1))
              break;

            NeedWaitStates =
                TII.isXDL(*MI1)
                    ? GFX940_XDL_N_PassWritesVGPROverlappedXDLSrcCWaitStates(
                          NumPasses, ST.hasGFX950Insts())
                    : GFX940_XDL_N_PassWritesVGPROverlappedSGEMMDGEMMSrcCWaitStates(
                          NumPasses, ST.hasGFX950Insts());
            break;
          }

          switch (NumPasses) {
          case 2:
            NeedWaitStates =
                SIInstrInfo::isDGEMM(Opc)
                    ? SMFMA4x4WritesVGPROverlappedDMFMASrcCWaitStates
                    : SMFMA4x4WritesVGPROverlappedSMFMASrcCWaitStates;
            break;
          case 8:
            NeedWaitStates =
                SIInstrInfo::isDGEMM(Opc)
                    ? SMFMA16x16WritesVGPROverlappedDMFMASrcCWaitStates
                    : SMFMA16x16WritesVGPROverlappedSMFMASrcCWaitStates;
            break;
          case 16:
            NeedWaitStates =
                SIInstrInfo::isDGEMM(Opc)
                    ? SMFMA32x32WritesVGPROverlappedDMFMASrcCWaitStates
                    : SMFMA32x32WritesVGPROverlappedSMFMASrcCWaitStates;
            break;
          default:
            llvm_unreachable("unexpected number of passes");
          }
        }
        }
      }
    } else {
      switch (Opc1) {
      case AMDGPU::V_MFMA_F64_16X16X4F64_e64:
      case AMDGPU::V_MFMA_F64_16X16X4F64_vgprcd_e64:
      case AMDGPU::V_MFMA_F64_16X16X4F64_mac_e64:
      case AMDGPU::V_MFMA_F64_16X16X4F64_mac_vgprcd_e64:
        NeedWaitStates =
            ST.hasGFX950Insts()
                ? GFX950_DMFMA16x16WritesVGPROverlappedMFMASrcABWaitStates
                : DMFMA16x16WritesVGPROverlappedMFMASrcABWaitStates;
        break;
      case AMDGPU::V_MFMA_F64_4X4X4F64_e64:
      case AMDGPU::V_MFMA_F64_4X4X4F64_vgprcd_e64:
        NeedWaitStates = DMFMA4x4WritesVGPROverlappedMFMASrcABWaitStates;
        break;
      default: {
        int NumPasses = TSchedModel.computeInstrLatency(MI1);

        if (ST.hasGFX940Insts()) {
          NeedWaitStates = GFX940_XDL_N_PassWritesVGPROverlappedSrcABWaitStates(
              NumPasses, ST.hasGFX950Insts());
          break;
        }

        switch (NumPasses) {
        case 2:
          NeedWaitStates = SMFMA4x4WritesVGPROverlappedSrcABWaitStates;
          break;
        case 8:
          NeedWaitStates = SMFMA16x16WritesVGPROverlappedSrcABWaitStates;
          break;
        case 16:
          NeedWaitStates = SMFMA32x32WritesVGPROverlappedSrcABWaitStates;
          break;
        default:
          llvm_unreachable("unexpected number of passes");
        }
      }
      }
    }

    if (WaitStatesNeeded >= NeedWaitStates)
      continue;

    WaitStatesNeededForUse = NeedWaitStates - NumWaitStates;
    WaitStatesNeeded = std::max(WaitStatesNeeded, WaitStatesNeededForUse);

    if (WaitStatesNeeded == MaxWaitStates)
      break;
  }

  // Pad neighboring MFMA with noops for better inter-wave performance.
  WaitStatesNeeded = std::max(WaitStatesNeeded, checkMFMAPadding(MI));

  return WaitStatesNeeded;
}
int GCNHazardRecognizer::checkMAILdStHazards(MachineInstr *MI) const {
  // On GFX90A+ the relevant hazards are checked in checkMAIVALUHazards().
  if (!ST.hasMAIInsts() || ST.hasGFX90AInsts())
    return 0;

  int WaitStatesNeeded = 0;

  auto IsAccVgprReadFn = [](const MachineInstr &MI) {
    return MI.getOpcode() == AMDGPU::V_ACCVGPR_READ_B32_e64;
  };

  for (const MachineOperand &Op : MI->explicit_uses()) {
    if (!Op.isReg() || !TRI.isVGPR(MF.getRegInfo(), Op.getReg()))
      continue;

    Register Reg = Op.getReg();

    const int AccVgprReadLdStWaitStates = 2;
    const int VALUWriteAccVgprRdWrLdStDepVALUWaitStates = 1;
    const int MaxWaitStates = 2;

    int WaitStatesNeededForUse = AccVgprReadLdStWaitStates -
      getWaitStatesSinceDef(Reg, IsAccVgprReadFn, MaxWaitStates);
    WaitStatesNeeded = std::max(WaitStatesNeeded, WaitStatesNeededForUse);

    if (WaitStatesNeeded == MaxWaitStates)
      return WaitStatesNeeded; // Early exit.

    auto IsVALUAccVgprRdWrCheckFn = [Reg, this](const MachineInstr &MI) {
      if (MI.getOpcode() != AMDGPU::V_ACCVGPR_READ_B32_e64 &&
          MI.getOpcode() != AMDGPU::V_ACCVGPR_WRITE_B32_e64)
        return false;
      auto IsVALUFn = [](const MachineInstr &MI) {
        return SIInstrInfo::isVALU(MI) && !SIInstrInfo::isMAI(MI);
      };
      return getWaitStatesSinceDef(Reg, IsVALUFn, 2 /*MaxWaitStates*/) <
             std::numeric_limits<int>::max();
    };

    WaitStatesNeededForUse = VALUWriteAccVgprRdWrLdStDepVALUWaitStates -
      getWaitStatesSince(IsVALUAccVgprRdWrCheckFn, MaxWaitStates);
    WaitStatesNeeded = std::max(WaitStatesNeeded, WaitStatesNeededForUse);
  }

  return WaitStatesNeeded;
}
int GCNHazardRecognizer::checkPermlaneHazards(MachineInstr *MI) const {
  assert(!ST.hasVcmpxPermlaneHazard() &&
         "this is a different vcmpx+permlane hazard");
  const SIRegisterInfo *TRI = ST.getRegisterInfo();
  const SIInstrInfo *TII = ST.getInstrInfo();

  auto IsVCmpXWritesExecFn = [TII, TRI](const MachineInstr &MI) {
    return isVCmpXWritesExec(*TII, *TRI, MI);
  };

  auto IsVALUFn = [](const MachineInstr &MI) {
    return SIInstrInfo::isVALU(MI);
  };

  const int VCmpXWritesExecWaitStates = 4;
  const int VALUWritesVDstWaitStates = 2;
  int WaitStatesNeeded = 0;

  for (const MachineOperand &Op : MI->explicit_uses()) {
    if (!Op.isReg() || !TRI->isVGPR(MF.getRegInfo(), Op.getReg()))
      continue;
    Register Reg = Op.getReg();

    int WaitStatesSinceDef =
        VALUWritesVDstWaitStates -
        getWaitStatesSinceDef(Reg, IsVALUFn,
                              VALUWritesVDstWaitStates);
    WaitStatesNeeded = std::max(WaitStatesNeeded, WaitStatesSinceDef);
    if (WaitStatesNeeded >= VALUWritesVDstWaitStates)
      break;
  }

  int VCmpXHazardWaits =
      VCmpXWritesExecWaitStates -
      getWaitStatesSince(IsVCmpXWritesExecFn, VCmpXWritesExecWaitStates);

  WaitStatesNeeded = std::max(WaitStatesNeeded, VCmpXHazardWaits);
  return WaitStatesNeeded;
}
static int GFX940_SMFMA_N_PassWriteVgprVALUWawWaitStates(int NumPasses) {
  return NumPasses + 2;
}

static int GFX940_XDL_N_PassWriteVgprVALUWawWaitStates(int NumPasses,
                                                       bool IsGFX950) {
  return NumPasses + 3 + (NumPasses != 2 && IsGFX950);
}

static int GFX940_XDL_N_PassWriteVgprVALUMemExpReadWaitStates(int NumPasses,
                                                              bool IsGFX950) {
  return NumPasses + 3 + (NumPasses != 2 && IsGFX950);
}

static int
GFX940_SMFMA_N_PassWriteVgprVALUMemExpReadWaitStates(int NumPasses) {
  return NumPasses + 2;
}
3044int GCNHazardRecognizer::checkMAIVALUHazards(
MachineInstr *
MI)
const {
3045 if (!ST.hasGFX90AInsts())
3048 auto IsDGEMMFn = [](
const MachineInstr &
MI) ->
bool {
3056 const MachineRegisterInfo &MRI = MF.getRegInfo();
3058 int WaitStatesNeeded = 0;
3064 const MachineInstr *
MFMA =
nullptr;
3066 auto IsMFMAWriteFn = [&
Reg, &
MFMA,
this](
const MachineInstr &
MI) {
3068 !TRI.regsOverlap(
MI.getOperand(0).getReg(),
Reg))
3074 const MachineInstr *
DOT =
nullptr;
3075 auto IsDotWriteFn = [&
Reg, &
DOT,
this](
const MachineInstr &
MI) {
3077 !TRI.regsOverlap(
MI.getOperand(0).getReg(),
Reg))
3083 bool DGEMMAfterVALUWrite =
false;
3084 auto IsDGEMMHazard = [&DGEMMAfterVALUWrite,
this](
const MachineInstr &
MI) {
3087 DGEMMAfterVALUWrite =
true;
3091 if (!TII.isVALU(
MI) || !DGEMMAfterVALUWrite)
3097 int SrcCIdx = AMDGPU::getNamedOperandIdx(
MI->getOpcode(),
3098 AMDGPU::OpName::src2);
3100 if (IsMemOrExport || IsVALU) {
3101 const int SMFMA4x4WriteVgprVALUMemExpReadWaitStates = 5;
3102 const int SMFMA16x16WriteVgprVALUMemExpReadWaitStates = 11;
3103 const int SMFMA32x32WriteVgprVALUMemExpReadWaitStates = 19;
3104 const int DMFMA4x4WriteVgprMemExpReadWaitStates = 9;
3105 const int DMFMA16x16WriteVgprMemExpReadWaitStates = 18;
3106 const int DMFMA4x4WriteVgprVALUReadWaitStates = 6;
3107 const int DMFMA16x16WriteVgprVALUReadWaitStates = 11;
3108 const int GFX950_DMFMA16x16WriteVgprVALUReadWaitStates = 19;
3109 const int DotWriteSameDotReadSrcAB = 3;
3110 const int DotWriteDifferentVALURead = 3;
3111 const int DMFMABetweenVALUWriteVMEMRead = 2;
3112 const int MaxWaitStates = 19;
3114 for (
const MachineOperand &Use :
MI->explicit_uses()) {
3120 int WaitStatesSinceDef = getWaitStatesSinceDef(
Reg, IsDotWriteFn,
3123 int NeedWaitStates = 0;
3124 if (
DOT->getOpcode() ==
MI->getOpcode()) {
3125 if (&Use - &
MI->getOperand(0) != SrcCIdx)
3126 NeedWaitStates = DotWriteSameDotReadSrcAB;
3128 NeedWaitStates = DotWriteDifferentVALURead;
3131 int WaitStatesNeededForUse = NeedWaitStates - WaitStatesSinceDef;
3132 WaitStatesNeeded = std::max(WaitStatesNeeded, WaitStatesNeededForUse);
3139 if (IsMem && ST.hasGFX90AInsts() && !ST.hasGFX940Insts()) {
3140 DGEMMAfterVALUWrite =
false;
3141 if (TRI.isVectorRegister(MRI,
Reg)) {
3142 int WaitStatesNeededForUse =
3143 DMFMABetweenVALUWriteVMEMRead -
3144 getWaitStatesSinceDef(
Reg, IsDGEMMHazard,
3145 DMFMABetweenVALUWriteVMEMRead);
3147 WaitStatesNeeded = std::max(WaitStatesNeeded, WaitStatesNeededForUse);
      MFMA = nullptr;
      WaitStatesSinceDef =
          getWaitStatesSinceDef(Reg, IsMFMAWriteFn, MaxWaitStates);
      if (!MFMA)
        continue;

      unsigned HazardDefLatency = TSchedModel.computeInstrLatency(MFMA);
      int NumPasses = HazardDefLatency;
      int NeedWaitStates = MaxWaitStates;

      if (SIInstrInfo::isDGEMM(MFMA->getOpcode())) {
        switch (HazardDefLatency) {
        case 4:
          NeedWaitStates = IsMemOrExport ? DMFMA4x4WriteVgprMemExpReadWaitStates
                                         : DMFMA4x4WriteVgprVALUReadWaitStates;
          break;
        case 8:
        case 16:
          NeedWaitStates = IsMemOrExport
              ? DMFMA16x16WriteVgprMemExpReadWaitStates
              : (ST.hasGFX950Insts()
                     ? GFX950_DMFMA16x16WriteVgprVALUReadWaitStates
                     : DMFMA16x16WriteVgprVALUReadWaitStates);
          break;
        default:
          llvm_unreachable("unexpected dgemm instruction");
        }
      } else if (ST.hasGFX940Insts()) {
        NeedWaitStates = TII.isXDL(*MFMA)
            ? GFX940_XDL_N_PassWriteVgprVALUMemExpReadWaitStates(
                  NumPasses, ST.hasGFX950Insts())
            : GFX940_SMFMA_N_PassWriteVgprVALUMemExpReadWaitStates(NumPasses);
      } else {
        switch (HazardDefLatency) {
        case 2:  NeedWaitStates = SMFMA4x4WriteVgprVALUMemExpReadWaitStates; break;
        case 8:  NeedWaitStates = SMFMA16x16WriteVgprVALUMemExpReadWaitStates; break;
        case 16: [[fallthrough]];
        default: NeedWaitStates = SMFMA32x32WriteVgprVALUMemExpReadWaitStates;
        }
      }

      int WaitStatesNeededForUse = NeedWaitStates - WaitStatesSinceDef;
      WaitStatesNeeded = std::max(WaitStatesNeeded, WaitStatesNeededForUse);

      if (WaitStatesNeeded == MaxWaitStates)
        break;
    }
  }
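  // Worked example (illustrative, using the constants above): an 8-pass DGEMM
  // writing a VGPR that a later VMEM or export instruction reads selects
  // DMFMA16x16WriteVgprMemExpReadWaitStates = 18; if 13 wait states have
  // already passed since that def, 18 - 13 = 5 more must be inserted.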
  unsigned Opc = MI->getOpcode();
  const int DMFMAToFMA64WaitStates = 2;
  if ((Opc == AMDGPU::V_FMA_F64_e64 || Opc == AMDGPU::V_FMAC_F64_e32 ||
       Opc == AMDGPU::V_FMAC_F64_e64 || Opc == AMDGPU::V_FMAC_F64_dpp) &&
      WaitStatesNeeded < DMFMAToFMA64WaitStates) {
    int WaitStatesNeededForUse = DMFMAToFMA64WaitStates -
        getWaitStatesSince(IsDGEMMFn, DMFMAToFMA64WaitStates);
    WaitStatesNeeded = std::max(WaitStatesNeeded, WaitStatesNeededForUse);
  }

  if (!IsVALU && !IsMemOrExport)
    return WaitStatesNeeded;
  for (const MachineOperand &Def : MI->defs()) {
    const int SMFMA4x4WriteVgprVALUWawWaitStates = 5;
    const int SMFMA16x16WriteVgprVALUWawWaitStates = 11;
    const int SMFMA32x32WriteVgprVALUWawWaitStates = 19;
    const int SMFMA4x4ReadVgprVALUWarWaitStates = 1;
    const int GFX940_XDL4PassReadVgprVALUWarWaitStates = 3;
    const int SMFMA16x16ReadVgprVALUWarWaitStates = 7;
    const int SMFMA32x32ReadVgprVALUWarWaitStates = 15;
    const int DMFMA4x4WriteVgprVALUWriteWaitStates = 6;
    const int DMFMA16x16WriteVgprVALUWriteWaitStates = 11;
    const int DotWriteDifferentVALUWrite = 3;
    const int MaxWaitStates = 19;
    const int MaxWarWaitStates = 15;

    Reg = Def.getReg();

    DOT = nullptr;
    int WaitStatesSinceDef = getWaitStatesSinceDef(Reg, IsDotWriteFn,
                                                   MaxWaitStates);
    if (DOT && DOT->getOpcode() != MI->getOpcode())
      WaitStatesNeeded = std::max(WaitStatesNeeded, DotWriteDifferentVALUWrite -
                                                    WaitStatesSinceDef);
    MFMA = nullptr;
    WaitStatesSinceDef =
        getWaitStatesSinceDef(Reg, IsMFMAWriteFn, MaxWaitStates);
    if (MFMA) {
      int NeedWaitStates = MaxWaitStates;
      int NumPasses = TSchedModel.computeInstrLatency(MFMA);

      if (SIInstrInfo::isDGEMM(MFMA->getOpcode())) {
        switch (NumPasses) {
        case 4:  NeedWaitStates = DMFMA4x4WriteVgprVALUWriteWaitStates; break;
        case 8:
        case 16: NeedWaitStates = DMFMA16x16WriteVgprVALUWriteWaitStates; break;
        default: llvm_unreachable("unexpected dgemm instruction");
        }
      } else if (ST.hasGFX940Insts()) {
        NeedWaitStates = TII.isXDL(*MFMA)
            ? GFX940_XDL_N_PassWriteVgprVALUWawWaitStates(NumPasses,
                                                          ST.hasGFX950Insts())
            : GFX940_SMFMA_N_PassWriteVgprVALUWawWaitStates(NumPasses);
      } else {
        switch (NumPasses) {
        case 2:  NeedWaitStates = SMFMA4x4WriteVgprVALUWawWaitStates; break;
        case 8:  NeedWaitStates = SMFMA16x16WriteVgprVALUWawWaitStates; break;
        case 16: [[fallthrough]];
        default: NeedWaitStates = SMFMA32x32WriteVgprVALUWawWaitStates;
        }
      }

      int WaitStatesNeededForUse = NeedWaitStates - WaitStatesSinceDef;
      WaitStatesNeeded = std::max(WaitStatesNeeded, WaitStatesNeededForUse);

      if (WaitStatesNeeded == MaxWaitStates)
        break;
    }
    auto IsSMFMAReadAsCFn = [&Reg, &MFMA, this](const MachineInstr &MI) {
      if (!SIInstrInfo::isMFMA(MI) ||
          SIInstrInfo::isDGEMM(MI.getOpcode()) ||
          !MI.readsRegister(Reg, &TRI))
        return false;

      if (ST.hasGFX940Insts() && !TII.isXDL(MI))
        return false;

      const MachineOperand *SrcC =
          TII.getNamedOperand(MI, AMDGPU::OpName::src2);
      if (!SrcC->isReg() || !TRI.regsOverlap(SrcC->getReg(), Reg))
        return false;

      MFMA = &MI;
      return true;
    };

    MFMA = nullptr;
    int WaitStatesSinceUse = getWaitStatesSince(IsSMFMAReadAsCFn,
                                                MaxWarWaitStates);
    if (!MFMA)
      continue;

    unsigned HazardDefLatency = TSchedModel.computeInstrLatency(MFMA);
    int NeedWaitStates = MaxWaitStates;
    switch (HazardDefLatency) {
    case 2:  NeedWaitStates = SMFMA4x4ReadVgprVALUWarWaitStates;
      break;
    case 4:  assert(ST.hasGFX940Insts());
      NeedWaitStates = GFX940_XDL4PassReadVgprVALUWarWaitStates;
      break;
    case 8:  NeedWaitStates = SMFMA16x16ReadVgprVALUWarWaitStates;
      break;
    case 16: [[fallthrough]];
    default: NeedWaitStates = SMFMA32x32ReadVgprVALUWarWaitStates;
    }

    int WaitStatesNeededForUse = NeedWaitStates - WaitStatesSinceUse;
    WaitStatesNeeded = std::max(WaitStatesNeeded, WaitStatesNeededForUse);
  }

  return WaitStatesNeeded;
}
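// Reading guide for the function above (descriptive note, not original
// source): checkMAIVALUHazards folds three interactions into one result --
// RAW (an MFMA/DOT wrote a VGPR this instruction reads), WAW (an MFMA/DOT
// wrote a VGPR this instruction redefines), and WAR (an in-flight SMFMA is
// still reading this instruction's destination as its src2 accumulator) --
// and returns the maximum wait states required by any of them.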
bool GCNHazardRecognizer::ShouldPreferAnother(SUnit *SU) const {
  if (!SU->isInstr())
    return false;
  const MachineInstr *MAI = nullptr;
  auto IsMFMAFn = [&MAI](const MachineInstr &MI) {
    MAI = SIInstrInfo::isMFMA(MI) ? &MI : nullptr;
    return MAI != nullptr;
  };
  MachineInstr *MI = SU->getInstr();
  if (IsMFMAFn(*MI)) {
    int W = getWaitStatesSince(IsMFMAFn, 16);
    if (MAI)
      return W < (int)TSchedModel.computeInstrLatency(MAI);
  }
  return false;
}
static void updateGetPCBundle(MachineInstr *NewMI) {
  if (!NewMI->isBundled())
    return;
  // Walk to the start of the bundle.
  auto I = NewMI->getIterator();
  while (I->isBundledWithPred())
    --I;
  if (I->isBundle())
    ++I;
  // Only S_GETPC bundles encode PC-relative offsets that must be updated.
  if (I->getOpcode() != AMDGPU::S_GETPC_B64)
    return;
  const unsigned NewBytes = 4;
  assert(NewMI->getOpcode() == AMDGPU::S_WAITCNT_DEPCTR &&
         "Unexpected instruction insertion in bundle");
  auto NextMI = std::next(NewMI->getIterator());
  auto End = NewMI->getParent()->instr_end();
  while (NextMI != End && NextMI->isBundledWithPred()) {
    for (auto &Operand : NextMI->operands()) {
      if (Operand.isGlobal())
        Operand.setOffset(Operand.getOffset() + NewBytes);
    }
    ++NextMI;
  }
}
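// Illustrative example (assumed 4-byte encodings): an S_GETPC_B64 bundle
// materializes PC-relative addresses, so a wait inserted inside the bundle
// shifts every later global reference by NewBytes:
//   s_getpc_b64      s[0:1]
//   s_waitcnt_depctr 0xfffe         ; newly inserted, 4 bytes
//   s_add_u32        s0, s0, sym@lo ; offset must grow by 4
//   s_addc_u32       s1, s1, sym@hi ; offset must grow by 4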
bool GCNHazardRecognizer::fixVALUMaskWriteHazard(MachineInstr *MI) {
  if (!ST.hasVALUMaskWriteHazard())
    return false;
  assert(!ST.hasExtendedWaitCounts());

  bool IsSALU = SIInstrInfo::isSALU(*MI);
  bool IsVALU = SIInstrInfo::isVALU(*MI);
  if (!IsSALU && !IsVALU)
    return false;

  const SIRegisterInfo *TRI = ST.getRegisterInfo();
  const MachineRegisterInfo &MRI = MF.getRegInfo();

  // SGPRs whose reads never participate in this hazard.
  auto IgnoreableSGPR = [](const Register Reg) {
    switch (Reg) {
    case AMDGPU::EXEC_LO:
    case AMDGPU::EXEC_HI:
    case AMDGPU::SGPR_NULL:
    case AMDGPU::SGPR_NULL64:
      return true;
    default:
      return false;
    }
  };
  auto IsVCC = [](const Register Reg) {
    return Reg == AMDGPU::VCC || Reg == AMDGPU::VCC_LO ||
           Reg == AMDGPU::VCC_HI;
  };
  struct StateType {
    SmallSet<Register, 2> HazardSGPRs;

    static unsigned getHashValue(const StateType &State) {
      return hash_combine_range(State.HazardSGPRs.begin(),
                                State.HazardSGPRs.end());
    }
    static bool isEqual(const StateType &LHS, const StateType &RHS) {
      return LHS.HazardSGPRs == RHS.HazardSGPRs;
    }
  };

  SmallVector<const MachineInstr *> WaitInstrs;
  bool HasSGPRRead = false;
  StateType InitialState;

  // Identify the SGPR mask this instruction writes; note any SGPR reads.
  MachineOperand *HazardDef = nullptr;
  for (MachineOperand &Op : MI->operands()) {
    if (!Op.isReg())
      continue;
    Register Reg = Op.getReg();
    if (Op.isDef() && HazardDef)
      return false;
    if (IgnoreableSGPR(Reg))
      continue;
    if (Op.isImplicit() && !IsVCC(Reg))
      continue;
    if (!TRI->isSGPRReg(MRI, Reg))
      continue;
    if (Op.isDef())
      HazardDef = &Op;
    else
      HasSGPRRead = true;
  }
  if (!HazardDef)
    return false;

  Register HazardReg = HazardDef->getReg();
  if (AMDGPU::SReg_32RegClass.contains(HazardReg)) {
    InitialState.HazardSGPRs.insert(HazardReg);
  } else {
    InitialState.HazardSGPRs.insert(TRI->getSubReg(HazardReg, AMDGPU::sub0));
    InitialState.HazardSGPRs.insert(TRI->getSubReg(HazardReg, AMDGPU::sub1));
  }
  auto IsHazardFn = [&](StateType &State, const MachineInstr &I) {
    if (State.HazardSGPRs.empty())
      return HazardExpired;

    switch (I.getOpcode()) {
    // VALUs that consume the mask implicitly through VCC.
    case AMDGPU::V_ADDC_U32_e32:
    case AMDGPU::V_ADDC_U32_dpp:
    case AMDGPU::V_CNDMASK_B16_t16_e32:
    case AMDGPU::V_CNDMASK_B16_fake16_e32:
    case AMDGPU::V_CNDMASK_B16_t16_dpp:
    case AMDGPU::V_CNDMASK_B16_fake16_dpp:
    case AMDGPU::V_CNDMASK_B32_e32:
    case AMDGPU::V_CNDMASK_B32_dpp:
    case AMDGPU::V_DIV_FMAS_F32_e64:
    case AMDGPU::V_DIV_FMAS_F64_e64:
    case AMDGPU::V_SUBB_U32_e32:
    case AMDGPU::V_SUBB_U32_dpp:
    case AMDGPU::V_SUBBREV_U32_e32:
    case AMDGPU::V_SUBBREV_U32_dpp: {
      bool Result = State.HazardSGPRs.contains(AMDGPU::VCC_LO) ||
                    State.HazardSGPRs.contains(AMDGPU::VCC_HI);
      return Result ? HazardFound : NoHazardFound;
    }
    // VALUs that consume the mask through an explicit SGPR source.
    case AMDGPU::V_ADDC_U32_e64:
    case AMDGPU::V_ADDC_U32_e64_dpp:
    case AMDGPU::V_CNDMASK_B16_t16_e64:
    case AMDGPU::V_CNDMASK_B16_fake16_e64:
    case AMDGPU::V_CNDMASK_B16_t16_e64_dpp:
    case AMDGPU::V_CNDMASK_B16_fake16_e64_dpp:
    case AMDGPU::V_CNDMASK_B32_e64:
    case AMDGPU::V_CNDMASK_B32_e64_dpp:
    case AMDGPU::V_SUBB_U32_e64:
    case AMDGPU::V_SUBB_U32_e64_dpp:
    case AMDGPU::V_SUBBREV_U32_e64:
    case AMDGPU::V_SUBBREV_U32_e64_dpp: {
      const MachineOperand *SSRCOp =
          TII.getNamedOperand(I, AMDGPU::OpName::src2);
      bool Result = TRI->regsOverlap(SSRCOp->getReg(), HazardReg);
      return Result ? HazardFound : NoHazardFound;
    }
    default:
      return NoHazardFound;
    }
  };
  // ConstantMaskBits: the depctr fields relevant to this hazard, all zeroed
  // (its definition is elided in this listing).
  auto UpdateStateFn = [&](StateType &State, const MachineInstr &I) {
    switch (I.getOpcode()) {
    case AMDGPU::S_WAITCNT_DEPCTR:
      // An existing wait on the same fields can absorb this hazard's wait.
      if (!HasSGPRRead && I.getParent() == MI->getParent() && !I.isBundled() &&
          (I.getOperand(0).getImm() & ConstantMaskBits) == ConstantMaskBits)
        WaitInstrs.push_back(&I);
      break;
    default:
      for (auto &Op : I.operands()) {
        if (!Op.isReg() || !Op.isDef())
          continue;
        Register Reg = Op.getReg();
        if (IgnoreableSGPR(Reg))
          continue;
        if (Op.isImplicit())
          continue;
        if (!TRI->isSGPRReg(MRI, Reg))
          continue;
        // A new def of a tracked SGPR means the original mask value can no
        // longer reach a trailing VALU.
        for (Register SGPR : State.HazardSGPRs) {
          if (Reg == SGPR || TRI->regsOverlap(Reg, SGPR)) {
            State.HazardSGPRs.erase(SGPR);
            break;
          }
        }
      }
      break;
    }
  };
  if (!hasHazard<StateType>(InitialState, IsHazardFn, UpdateStateFn,
                            MI->getParent(),
                            std::next(MI->getReverseIterator())))
    return false;

  // ... (selection of the required DepCtr wait mask elided in this listing)

  // Try to merge with (and then remove) existing waits on the same counters.
  if (!WaitInstrs.empty()) {
    unsigned Found = 0;
    SmallVector<MachineInstr *> ToErase;
    for (MachineBasicBlock::reverse_iterator It = MI->getReverseIterator(),
                                             End = MI->getParent()->rend();
         Found < WaitInstrs.size() && It != End; ++It) {
      MachineInstr *WaitMI = &*It;
      if (std::as_const(WaitMI) != WaitInstrs[Found])
        continue;
      ++Found;
      unsigned WaitMask = WaitMI->getOperand(0).getImm();
      assert((WaitMask & ConstantMaskBits) == ConstantMaskBits);
      DepCtr = AMDGPU::DepCtr::encodeFieldSaSdst(
          DepCtr, std::min(AMDGPU::DepCtr::decodeFieldSaSdst(WaitMask),
                           AMDGPU::DepCtr::decodeFieldSaSdst(DepCtr)));
      DepCtr = AMDGPU::DepCtr::encodeFieldVaSdst(
          DepCtr, std::min(AMDGPU::DepCtr::decodeFieldVaSdst(WaitMask),
                           AMDGPU::DepCtr::decodeFieldVaSdst(DepCtr)));
      DepCtr = AMDGPU::DepCtr::encodeFieldVaVcc(
          DepCtr, std::min(AMDGPU::DepCtr::decodeFieldVaVcc(WaitMask),
                           AMDGPU::DepCtr::decodeFieldVaVcc(DepCtr)));
      ToErase.push_back(WaitMI);
    }
    for (MachineInstr *WaitMI : ToErase)
      WaitMI->eraseFromParent();
  }
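  // Worked example (illustrative): merging an existing wait of
  // sa_sdst(0) va_sdst(1) va_vcc(1) with a required sa_sdst(1) va_sdst(0)
  // va_vcc(1) yields sa_sdst(0) va_sdst(0) va_vcc(1): each field bounds the
  // outstanding writes still allowed, so the stricter (smaller) value wins.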
  // Insert the (possibly merged) wait after MI.
  auto NextMI = std::next(MI->getIterator());
  auto NewMI = BuildMI(*MI->getParent(), NextMI, MI->getDebugLoc(),
                       TII.get(AMDGPU::S_WAITCNT_DEPCTR))
                   .addImm(DepCtr);

  // Fix up any S_GETPC_B64 bundle the wait may have landed in.
  updateGetPCBundle(NewMI);

  return true;
}
static bool ensureEntrySetPrio(MachineFunction *MF, int Priority,
                               const SIInstrInfo &TII) {
  MachineBasicBlock &EntryMBB = MF->front();
  if (EntryMBB.begin() != EntryMBB.end()) {
    auto &EntryMI = *EntryMBB.begin();
    if (EntryMI.getOpcode() == AMDGPU::S_SETPRIO &&
        EntryMI.getOperand(0).getImm() >= Priority)
      return false;
  }

  BuildMI(EntryMBB, EntryMBB.begin(), DebugLoc(), TII.get(AMDGPU::S_SETPRIO))
      .addImm(Priority);
  return true;
}
bool GCNHazardRecognizer::fixRequiredExportPriority(MachineInstr *MI) {
  if (!ST.hasRequiredExportPriority())
    return false;

  // Assume the following shader types never have exports, and avoid adding
  // or adjusting any priority for them.
  MachineBasicBlock *MBB = MI->getParent();
  MachineFunction *MF = MBB->getParent();
  auto CC = MF->getFunction().getCallingConv();
  switch (CC) {
  case CallingConv::AMDGPU_CS:
  case CallingConv::AMDGPU_CS_Chain:
  case CallingConv::AMDGPU_CS_ChainPreserve:
  case CallingConv::AMDGPU_KERNEL:
    return false;
  default:
    break;
  }

  const int MaxPriority = 3;
  const int NormalPriority = 2;
  const int PostExportPriority = 0;

  auto It = MI->getIterator();
  switch (MI->getOpcode()) {
  case AMDGPU::S_ENDPGM:
  case AMDGPU::S_ENDPGM_SAVED:
  case AMDGPU::S_ENDPGM_ORDERED_PS_DONE:
  case AMDGPU::SI_RETURN_TO_EPILOG:
    // Ensure a shader with calls raises priority at entry, so that the
    // priority is correct if exports exist in a callee.
    if (MF->getFrameInfo().hasCalls())
      return ensureEntrySetPrio(MF, NormalPriority, TII);
    return false;
  case AMDGPU::S_SETPRIO: {
    // Raise the minimum priority unless we are inside the workaround itself.
    auto &PrioOp = MI->getOperand(0);
    int Prio = PrioOp.getImm();
    bool InWA = (Prio == PostExportPriority) &&
                (It != MBB->begin() && TII.isEXP(*std::prev(It)));
    if (InWA || Prio >= NormalPriority)
      return false;
    PrioOp.setImm(std::min(Prio + NormalPriority, MaxPriority));
    return true;
  }
  default:
    if (!TII.isEXP(*MI))
      return false;
    break;
  }

  // Check entry priority at each export (as head of a block of exports).
  // Note: amdgpu_gfx can only be a callee, so defer to the caller's setprio.
  bool Changed = false;
  if (CC != CallingConv::AMDGPU_Gfx)
    Changed = ensureEntrySetPrio(MF, NormalPriority, TII);

  auto NextMI = std::next(It);
  bool EndOfShader = false;
  if (NextMI != MBB->end()) {
    // The workaround is only needed at the end of a sequence of exports.
    if (TII.isEXP(*NextMI))
      return Changed;
    // An appropriate S_SETPRIO after the export means it was already applied.
    if (NextMI->getOpcode() == AMDGPU::S_SETPRIO &&
        NextMI->getOperand(0).getImm() == PostExportPriority)
      return Changed;
    EndOfShader = NextMI->getOpcode() == AMDGPU::S_ENDPGM;
  }

  const DebugLoc &DL = MI->getDebugLoc();

  // Lower priority after the export clause.
  BuildMI(*MBB, NextMI, DL, TII.get(AMDGPU::S_SETPRIO))
      .addImm(PostExportPriority);

  if (!EndOfShader) {
    // Wait for the exports to complete.
    BuildMI(*MBB, NextMI, DL, TII.get(AMDGPU::S_WAITCNT_EXPCNT))
        .addReg(AMDGPU::SGPR_NULL)
        .addImm(0);
  }

  BuildMI(*MBB, NextMI, DL, TII.get(AMDGPU::S_NOP)).addImm(0);
  BuildMI(*MBB, NextMI, DL, TII.get(AMDGPU::S_NOP)).addImm(0);

  if (!EndOfShader) {
    // Return to normal (higher) priority.
    BuildMI(*MBB, NextMI, DL, TII.get(AMDGPU::S_SETPRIO))
        .addImm(NormalPriority);
  }

  return true;
}
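// Illustrative shape (assumed mnemonics) of the sequence the fix above emits
// after the last export of a clause that does not end the shader:
//   exp ...                    ; last export in the clause
//   s_setprio 0                ; PostExportPriority
//   s_waitcnt_expcnt null, 0   ; drain outstanding exports
//   s_nop 0
//   s_nop 0
//   s_setprio 2                ; back to NormalPriority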
  const SIInstrInfo *TII = ST.getInstrInfo();
  // ... (surrounding fix function elided in this listing; it emits a wait)
  BuildMI(*MI->getParent(), MI, MI->getDebugLoc(),
          TII->get(AMDGPU::S_WAITCNT_DEPCTR));
  // ... (wait-mask operand and remaining logic elided in this listing)
bool GCNHazardRecognizer::fixDsAtomicAsyncBarrierArriveB64(MachineInstr *MI) {
  if (MI->getOpcode() != AMDGPU::DS_ATOMIC_ASYNC_BARRIER_ARRIVE_B64)
    return false;

  const SIInstrInfo *TII = ST.getInstrInfo();
  // Bracket the barrier-arrive with dependency-counter waits; the wait-mask
  // immediates are elided in this listing.
  BuildMI(*MI->getParent(), MI, MI->getDebugLoc(),
          TII->get(AMDGPU::S_WAITCNT_DEPCTR));
  BuildMI(*MI->getParent(), std::next(MI->getIterator()), MI->getDebugLoc(),
          TII->get(AMDGPU::S_WAITCNT_DEPCTR));
  return true;
}
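// Net effect (a reading of the two BuildMI calls above): the barrier-arrive
// is bracketed by dependency-counter waits,
//   s_waitcnt_depctr <mask>
//   ds_atomic_async_barrier_arrive_b64 ...
//   s_waitcnt_depctr <mask>
// with the actual masks elided in this listing.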
bool GCNHazardRecognizer::fixScratchBaseForwardingHazard(MachineInstr *MI) {
  if (!IsHazardRecognizerMode)
    return false;

  const SIRegisterInfo *TRI = ST.getRegisterInfo();
  const SIInstrInfo *TII = ST.getInstrInfo();

  const int FlatScrBaseWaitStates = 10;

  bool ReadsFlatScrLo =
      MI->readsRegister(AMDGPU::SRC_FLAT_SCRATCH_BASE_LO, TRI);
  bool ReadsFlatScrHi =
      MI->readsRegister(AMDGPU::SRC_FLAT_SCRATCH_BASE_HI, TRI);
  // Some instructions read the scratch base implicitly; the exact checks are
  // elided in this listing:
  //   ReadsFlatScrLo = true;
  //   ReadsFlatScrHi = true;
  if (!ReadsFlatScrLo && !ReadsFlatScrHi)
    return false;

  const MachineRegisterInfo &MRI = MF.getRegInfo();

  DenseSet<const MachineBasicBlock *> Visited;
  auto IsRegDefHazard = [&](Register Reg) {
    auto IsHazardFn = [Reg, TRI](const MachineInstr &MI) {
      return MI.modifiesRegister(Reg, TRI);
    };

    // Count only SALU/VALU instructions that write an SGPR.
    auto IsSGPRDef = [TII, TRI, &MRI](const MachineInstr &MI) -> unsigned {
      if (!TII->isSALU(MI) && !TII->isVALU(MI))
        return 0;
      for (const MachineOperand &MO : MI.all_defs()) {
        if (TRI->isSGPRReg(MRI, MO.getReg()))
          return 1;
      }
      return 0;
    };

    auto IsExpiredFn = [=](const MachineInstr &MI, int SgprWrites) {
      if (MI.getOpcode() == AMDGPU::S_WAITCNT_DEPCTR) {
        unsigned Wait = MI.getOperand(0).getImm();
        // An existing wait with sa_sdst = 0 already breaks the forwarding.
        if (AMDGPU::DepCtr::decodeFieldSaSdst(Wait) == 0)
          return true;
      }
      return SgprWrites >= FlatScrBaseWaitStates;
    };

    return ::getWaitStatesSince(IsHazardFn, MI->getParent(),
                                std::next(MI->getReverseIterator()), 0,
                                IsExpiredFn, Visited,
                                IsSGPRDef) < FlatScrBaseWaitStates;
  };

  if ((!ReadsFlatScrLo || !IsRegDefHazard(AMDGPU::SGPR102)) &&
      (!ReadsFlatScrHi || !IsRegDefHazard(AMDGPU::SGPR103)))
    return false;

  BuildMI(*MI->getParent(), MI, MI->getDebugLoc(),
          TII->get(AMDGPU::S_WAITCNT_DEPCTR));
  // (wait-mask immediate elided in this listing)
  return true;
}
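// Hedged illustration (the exact depctr field is an assumption): the inserted
// wait forces prior scalar writes to the flat-scratch base SGPRs to land
// before this instruction reads SRC_FLAT_SCRATCH_BASE_LO/HI, e.g.:
//   s_mov_b32        s102, ...       ; redefine scratch base lo
//   s_waitcnt_depctr sa_sdst(0)      ; inserted guard
//   scratch_load_b32 ...             ; reads the forwarded base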
  // Insert two V_NOPs ahead of MI.
  BuildMI(*MI->getParent(), MI, MI->getDebugLoc(), TII.get(AMDGPU::V_NOP_e32));
  BuildMI(*MI->getParent(), MI, MI->getDebugLoc(), TII.get(AMDGPU::V_NOP_e32));