#define DEBUG_TYPE "si-fold-operands"

  unsigned DefSubReg = AMDGPU::NoSubRegister;

  FoldableDef() = delete;

  FoldableDef(const MachineOperand &FoldOp, const TargetRegisterClass *DefRC,
              unsigned DefSubReg = AMDGPU::NoSubRegister)
      : DefRC(DefRC), DefSubReg(DefSubReg), Kind(FoldOp.getType()) {
    if (FoldOp.isImm()) {
      ImmToFold = FoldOp.getImm();
    } else if (FoldOp.isFI()) {
      FrameIndexToFold = FoldOp.getIndex();
    }
    // ...
  }

  FoldableDef(int64_t FoldImm, const TargetRegisterClass *DefRC,
              unsigned DefSubReg = AMDGPU::NoSubRegister)
      : ImmToFold(FoldImm), DefRC(DefRC), DefSubReg(DefSubReg),
        Kind(MachineOperand::MO_Immediate) {}

  FoldableDef getWithSubReg(const SIRegisterInfo &TRI, unsigned SubReg) const {
    FoldableDef Copy(*this);
    Copy.DefSubReg = TRI.composeSubRegIndices(DefSubReg, SubReg);
    return Copy;
  }

  // ...
    return OpToFold->getReg();

  unsigned getSubReg() const {
    return OpToFold->getSubReg();
  }

  // ...
    return FrameIndexToFold;

  std::optional<int64_t> getEffectiveImmVal() const {
    // ...
  }

  bool isOperandLegal(const SIInstrInfo &TII, MachineInstr &MI,
                      unsigned OpIdx) const {
    // ...
    std::optional<int64_t> ImmToFold = getEffectiveImmVal();
    // ...
    if (DefSubReg != AMDGPU::NoSubRegister)
      return false;
    // ...
    if (DefSubReg != AMDGPU::NoSubRegister)
      return false;
    return TII.isOperandLegal(MI, OpIdx, OpToFold);
  }
};

struct FoldCandidate {
  FoldCandidate(MachineInstr *MI, unsigned OpNo, FoldableDef Def,
                bool Commuted = false, int ShrinkOp = -1)
      : UseMI(MI), Def(Def), ShrinkOpcode(ShrinkOp), UseOpNo(OpNo),
        Commuted(Commuted) {}

  bool isFI() const { return Def.isFI(); }

  // ...
    return Def.FrameIndexToFold;

  bool isImm() const { return Def.isImm(); }

  bool isReg() const { return Def.isReg(); }

  bool isGlobal() const { return Def.isGlobal(); }

  bool needsShrink() const { return ShrinkOpcode != -1; }
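// SIFoldOperandsImpl: the pass implementation. FoldableDef above describes a
// value (immediate, frame index, global, or register) that may be folded into
// a use; FoldCandidate records a pending fold for one use operand.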
class SIFoldOperandsImpl {
  // ...

  bool frameIndexMayFold(const MachineInstr &UseMI, int OpNo,
                         const FoldableDef &OpToFold) const;

  unsigned convertToVALUOp(unsigned Opc, bool UseVOP3 = false) const {
    switch (Opc) {
    case AMDGPU::S_ADD_I32: {
      if (ST->hasAddNoCarry())
        return UseVOP3 ? AMDGPU::V_ADD_U32_e64 : AMDGPU::V_ADD_U32_e32;
      return UseVOP3 ? AMDGPU::V_ADD_CO_U32_e64 : AMDGPU::V_ADD_CO_U32_e32;
    }
    case AMDGPU::S_OR_B32:
      return UseVOP3 ? AMDGPU::V_OR_B32_e64 : AMDGPU::V_OR_B32_e32;
    case AMDGPU::S_AND_B32:
      return UseVOP3 ? AMDGPU::V_AND_B32_e64 : AMDGPU::V_AND_B32_e32;
    case AMDGPU::S_MUL_I32:
      return AMDGPU::V_MUL_LO_U32_e64;
    default:
      return AMDGPU::INSTRUCTION_LIST_END;
    }
  }

  bool foldCopyToVGPROfScalarAddOfFrameIndex(Register DstReg, Register SrcReg,
                                             MachineInstr &MI) const;

  bool canUseImmWithOpSel(const MachineInstr *MI, unsigned UseOpNo,
                          int64_t ImmVal) const;

  bool tryFoldImmWithOpSel(MachineInstr *MI, unsigned UseOpNo,
                           int64_t ImmVal) const;

  bool tryAddToFoldList(SmallVectorImpl<FoldCandidate> &FoldList,
                        MachineInstr *MI, unsigned OpNo,
                        const FoldableDef &OpToFold) const;

  const TargetRegisterClass *
  getRegSeqInit(SmallVectorImpl<std::pair<MachineOperand *, unsigned>> &Defs,
                Register UseReg) const;

  std::pair<int64_t, const TargetRegisterClass *>
  isRegSeqSplat(MachineInstr &RegSeq) const;

  bool foldInstOperand(MachineInstr &MI, const FoldableDef &OpToFold) const;

  bool foldCopyToAGPRRegSequence(MachineInstr *CopyMI) const;

  std::pair<const MachineOperand *, int> isOMod(const MachineInstr &MI) const;

public:
  SIFoldOperandsImpl() = default;
  // ...
};

// ...
    return SIFoldOperandsImpl().run(MF);
// ...

  StringRef getPassName() const override { return "SI Fold Operands"; }

char SIFoldOperandsLegacy::ID = 0;
          TRI.getSubRegisterClass(RC, MO.getSubReg()))
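// Map a MAC/FMAC opcode to its three-address MAD/FMA equivalent, or
// INSTRUCTION_LIST_END if there is none.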
static unsigned macToMad(unsigned Opc) {
  switch (Opc) {
  case AMDGPU::V_MAC_F32_e64:
    return AMDGPU::V_MAD_F32_e64;
  case AMDGPU::V_MAC_F16_e64:
    return AMDGPU::V_MAD_F16_e64;
  case AMDGPU::V_FMAC_F32_e64:
    return AMDGPU::V_FMA_F32_e64;
  case AMDGPU::V_FMAC_F16_e64:
    return AMDGPU::V_FMA_F16_gfx9_e64;
  case AMDGPU::V_FMAC_F16_t16_e64:
    return AMDGPU::V_FMA_F16_gfx9_t16_e64;
  case AMDGPU::V_FMAC_F16_fake16_e64:
    return AMDGPU::V_FMA_F16_gfx9_fake16_e64;
  case AMDGPU::V_FMAC_LEGACY_F32_e64:
    return AMDGPU::V_FMA_LEGACY_F32_e64;
  case AMDGPU::V_FMAC_F64_e64:
    return AMDGPU::V_FMA_F64_e64;
  }
  return AMDGPU::INSTRUCTION_LIST_END;
}
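// frameIndexMayFold: return true if a frame index can be folded directly into
// operand OpNo of UseMI (scalar/vector adds, MUBUF vaddr, or flat-scratch
// saddr/vaddr).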
bool SIFoldOperandsImpl::frameIndexMayFold(const MachineInstr &UseMI, int OpNo,
                                           const FoldableDef &OpToFold) const {
  if (!OpToFold.isFI())
    return false;

  const unsigned Opc = UseMI.getOpcode();
  switch (Opc) {
  case AMDGPU::S_ADD_I32:
  case AMDGPU::S_ADD_U32:
  case AMDGPU::V_ADD_U32_e32:
  case AMDGPU::V_ADD_CO_U32_e32:
    return UseMI.getOperand(OpNo == 1 ? 2 : 1).isImm() &&
           MRI->hasOneNonDBGUse(UseMI.getOperand(OpNo).getReg());
  case AMDGPU::V_ADD_U32_e64:
  case AMDGPU::V_ADD_CO_U32_e64:
    return UseMI.getOperand(OpNo == 2 ? 3 : 2).isImm() &&
           MRI->hasOneNonDBGUse(UseMI.getOperand(OpNo).getReg());
  default:
    break;
  }

  // ...
    return OpNo == AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::vaddr);
  // ...

  int SIdx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::saddr);
  // ...

  int VIdx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::vaddr);
  return OpNo == VIdx && SIdx == -1;
}
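// Rewrite a VGPR copy of an SGPR scalar add of a frame index into the VALU
// add selected by convertToVALUOp, deleting the original add and copy.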
bool SIFoldOperandsImpl::foldCopyToVGPROfScalarAddOfFrameIndex(
    Register DstReg, Register SrcReg, MachineInstr &MI) const {
  if (TRI->isVGPR(*MRI, DstReg) && TRI->isSGPRReg(*MRI, SrcReg) &&
      MRI->hasOneNonDBGUse(SrcReg)) {
    MachineInstr *Def = MRI->getVRegDef(SrcReg);
    if (!Def || Def->getNumOperands() != 4)
      return false;

    MachineOperand *Src0 = &Def->getOperand(1);
    MachineOperand *Src1 = &Def->getOperand(2);
    // ...

    const bool UseVOP3 = !Src0->isImm() || TII->isInlineConstant(*Src0);
    unsigned NewOp = convertToVALUOp(Def->getOpcode(), UseVOP3);
    if (NewOp == AMDGPU::INSTRUCTION_LIST_END ||
        !Def->getOperand(3).isDead())
      return false;

    MachineBasicBlock *MBB = Def->getParent();
    // ...
    if (NewOp != AMDGPU::V_ADD_CO_U32_e32) {
      MachineInstrBuilder Add =
          // ...
      if (Add->getDesc().getNumDefs() == 2) {
        Register CarryOutReg = MRI->createVirtualRegister(TRI->getBoolRC());
        // ...
        MRI->setRegAllocationHint(CarryOutReg, 0, TRI->getVCC());
      }

      Add.add(*Src0).add(*Src1).setMIFlags(Def->getFlags());
      // ...
      Def->eraseFromParent();
      MI.eraseFromParent();
      // ...
    }

    assert(NewOp == AMDGPU::V_ADD_CO_U32_e32);
    // ...
      Def->eraseFromParent();
      MI.eraseFromParent();
    // ...
  }
  // ...
}

// ...
  return new SIFoldOperandsLegacy();
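// canUseImmWithOpSel / tryFoldImmWithOpSel: handle packed (op_sel) operands,
// trying to rewrite a 32-bit literal into an inline constant, possibly by
// swapping halves or by negating a packed unsigned add/sub.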
bool SIFoldOperandsImpl::canUseImmWithOpSel(const MachineInstr *MI,
                                            unsigned UseOpNo,
                                            int64_t ImmVal) const {
  const uint64_t TSFlags = MI->getDesc().TSFlags;
  // ...
  int OpNo = MI->getOperandNo(&Old);
  // ...
  unsigned Opcode = MI->getOpcode();
  uint8_t OpType = TII->get(Opcode).operands()[OpNo].OperandType;
  // ...
}

bool SIFoldOperandsImpl::tryFoldImmWithOpSel(MachineInstr *MI, unsigned UseOpNo,
                                             int64_t ImmVal) const {
  MachineOperand &Old = MI->getOperand(UseOpNo);
  unsigned Opcode = MI->getOpcode();
  int OpNo = MI->getOperandNo(&Old);
  uint8_t OpType = TII->get(Opcode).operands()[OpNo].OperandType;
  // ...

  AMDGPU::OpName ModName = AMDGPU::OpName::NUM_OPERAND_NAMES;
  unsigned SrcIdx = ~0;
  if (OpNo == AMDGPU::getNamedOperandIdx(Opcode, AMDGPU::OpName::src0)) {
    ModName = AMDGPU::OpName::src0_modifiers;
    SrcIdx = 0;
  } else if (OpNo == AMDGPU::getNamedOperandIdx(Opcode, AMDGPU::OpName::src1)) {
    ModName = AMDGPU::OpName::src1_modifiers;
    SrcIdx = 1;
  } else if (OpNo == AMDGPU::getNamedOperandIdx(Opcode, AMDGPU::OpName::src2)) {
    ModName = AMDGPU::OpName::src2_modifiers;
    SrcIdx = 2;
  }
  assert(ModName != AMDGPU::OpName::NUM_OPERAND_NAMES);
  int ModIdx = AMDGPU::getNamedOperandIdx(Opcode, ModName);
  MachineOperand &Mod = MI->getOperand(ModIdx);
  unsigned ModVal = Mod.getImm();
  // ...
  uint32_t Imm = (static_cast<uint32_t>(ImmHi) << 16) | ImmLo;
  // ...
  auto tryFoldToInline = [&](uint32_t Imm) -> bool {
    // ...
    uint16_t Lo = static_cast<uint16_t>(Imm);
    uint16_t Hi = static_cast<uint16_t>(Imm >> 16);
    // ...
      Mod.setImm(NewModVal);
    // ...
    if (static_cast<int16_t>(Lo) < 0) {
      int32_t SExt = static_cast<int16_t>(Lo);
      // ...
        Mod.setImm(NewModVal);
      // ...
    }

    // ...
    uint32_t Swapped = (static_cast<uint32_t>(Lo) << 16) | Hi;
    // ...
  };

  if (tryFoldToInline(Imm))
    return true;

  // ...
  bool IsUAdd = Opcode == AMDGPU::V_PK_ADD_U16;
  bool IsUSub = Opcode == AMDGPU::V_PK_SUB_U16;
  if (SrcIdx == 1 && (IsUAdd || IsUSub)) {
    int ClampIdx =
        AMDGPU::getNamedOperandIdx(Opcode, AMDGPU::OpName::clamp);
    bool Clamp = MI->getOperand(ClampIdx).getImm() != 0;
    // ...
    uint16_t NegLo = -static_cast<uint16_t>(Imm);
    uint16_t NegHi = -static_cast<uint16_t>(Imm >> 16);
    uint32_t NegImm = (static_cast<uint32_t>(NegHi) << 16) | NegLo;

    if (tryFoldToInline(NegImm)) {
      unsigned NegOpcode =
          IsUAdd ? AMDGPU::V_PK_SUB_U16 : AMDGPU::V_PK_ADD_U16;
      MI->setDesc(TII->get(NegOpcode));
      // ...
    }
  }
  // ...
}
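// updateOperand: apply a recorded FoldCandidate, including the special cases
// for op_sel immediates, shrinking a VOP3 add/sub to its e32 form, MFMA
// early-clobber variants, and global-address operands.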
bool SIFoldOperandsImpl::updateOperand(FoldCandidate &Fold) const {
  MachineInstr *MI = Fold.UseMI;
  MachineOperand &Old = MI->getOperand(Fold.UseOpNo);
  // ...
  std::optional<int64_t> ImmVal;
  // ...
    ImmVal = Fold.Def.getEffectiveImmVal();

  if (ImmVal && canUseImmWithOpSel(Fold.UseMI, Fold.UseOpNo, *ImmVal)) {
    if (tryFoldImmWithOpSel(Fold.UseMI, Fold.UseOpNo, *ImmVal))
      return true;
    // ...
    int OpNo = MI->getOperandNo(&Old);
    if (!TII->isOperandLegal(*MI, OpNo, &New))
      return false;
    // ...
  }

  if ((Fold.isImm() || Fold.isFI() || Fold.isGlobal()) && Fold.needsShrink()) {
    MachineBasicBlock *MBB = MI->getParent();
    // ...
    int Op32 = Fold.ShrinkOpcode;
    MachineOperand &Dst0 = MI->getOperand(0);
    MachineOperand &Dst1 = MI->getOperand(1);
    // ...
    bool HaveNonDbgCarryUse = !MRI->use_nodbg_empty(Dst1.getReg());

    const TargetRegisterClass *Dst0RC = MRI->getRegClass(Dst0.getReg());
    Register NewReg0 = MRI->createVirtualRegister(Dst0RC);

    MachineInstr *Inst32 = TII->buildShrunkInst(*MI, Op32);

    if (HaveNonDbgCarryUse) {
      // ...
    }

    // ...
    for (unsigned I = MI->getNumOperands() - 1; I > 0; --I)
      MI->removeOperand(I);
    MI->setDesc(TII->get(AMDGPU::IMPLICIT_DEF));

    // ...
      TII->commuteInstruction(*Inst32, false);
    // ...
  }

  assert(!Fold.needsShrink() && "not handled");

  // ...
    if (NewMFMAOpc == -1)
      return false;
    MI->setDesc(TII->get(NewMFMAOpc));
    MI->untieRegOperand(0);
    const MCInstrDesc &MCID = MI->getDesc();
    for (unsigned I = 0; I < MI->getNumDefs(); ++I)
      // ...
        MI->getOperand(I).setIsEarlyClobber(true);
  // ...

    int OpNo = MI->getOperandNo(&Old);
    if (!TII->isOperandLegal(*MI, OpNo, &New))
      return false;
  // ...

  if (Fold.isGlobal()) {
    Old.ChangeToGA(Fold.Def.OpToFold->getGlobal(),
                   Fold.Def.OpToFold->getOffset(),
                   Fold.Def.OpToFold->getTargetFlags());
    // ...
  }
  // ...

  MachineOperand *New = Fold.Def.OpToFold;
  // ...
  if (const TargetRegisterClass *OpRC =
          TII->getRegClass(MI->getDesc(), Fold.UseOpNo, TRI)) {
    const TargetRegisterClass *NewRC =
        TRI->getRegClassForReg(*MRI, New->getReg());
    // ...
    const TargetRegisterClass *ConstrainRC = OpRC;
    if (New->getSubReg()) {
      // ...
          TRI->getMatchingSuperRegClass(NewRC, OpRC, New->getSubReg());
      // ...
    }

    if (New->getReg().isVirtual() &&
        !MRI->constrainRegClass(New->getReg(), ConstrainRC)) {
      // ...
                 << TRI->getRegClassName(ConstrainRC) << '\n');
      return false;
    }
  }
  // ...

  if (New->getReg().isPhysical()) {
    // ...
  }
  // ...
}
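// appendFoldCandidate / tryAddToFoldList: record a fold candidate for a use
// operand, attempting opcode changes (MAC->MAD, S_FMAC->S_FMAAK/S_FMAMK,
// S_SETREG->immediate form) and operand commutation when the operand is not
// legal in place.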
static void appendFoldCandidate(SmallVectorImpl<FoldCandidate> &FoldList,
                                FoldCandidate &&Entry) {
  // ...
  for (FoldCandidate &Fold : FoldList)
    if (Fold.UseMI == Entry.UseMI && Fold.UseOpNo == Entry.UseOpNo)
      return;
  LLVM_DEBUG(dbgs() << "Append " << (Entry.Commuted ? "commuted" : "normal")
                    << " operand " << Entry.UseOpNo << "\n " << *Entry.UseMI);
  // ...
}

static void appendFoldCandidate(SmallVectorImpl<FoldCandidate> &FoldList,
                                MachineInstr *MI, unsigned OpNo,
                                const FoldableDef &FoldOp,
                                bool Commuted = false, int ShrinkOp = -1) {
  appendFoldCandidate(FoldList,
                      FoldCandidate(MI, OpNo, FoldOp, Commuted, ShrinkOp));
}

bool SIFoldOperandsImpl::tryAddToFoldList(
    SmallVectorImpl<FoldCandidate> &FoldList, MachineInstr *MI, unsigned OpNo,
    const FoldableDef &OpToFold) const {
  const unsigned Opc = MI->getOpcode();

  auto tryToFoldAsFMAAKorMK = [&]() {
    if (!OpToFold.isImm())
      return false;

    const bool TryAK = OpNo == 3;
    const unsigned NewOpc = TryAK ? AMDGPU::S_FMAAK_F32 : AMDGPU::S_FMAMK_F32;
    MI->setDesc(TII->get(NewOpc));

    // ...
    bool FoldAsFMAAKorMK =
        tryAddToFoldList(FoldList, MI, TryAK ? 3 : 2, OpToFold);
    if (FoldAsFMAAKorMK) {
      // ...
      MI->untieRegOperand(3);
      // ...
      MachineOperand &Op1 = MI->getOperand(1);
      MachineOperand &Op2 = MI->getOperand(2);
      // ...
    }
    // ...
  };

  bool IsLegal = OpToFold.isOperandLegal(*TII, *MI, OpNo);
  if (!IsLegal && OpToFold.isImm()) {
    if (std::optional<int64_t> ImmVal = OpToFold.getEffectiveImmVal())
      IsLegal = canUseImmWithOpSel(MI, OpNo, *ImmVal);
  }

  // ...
    if (NewOpc != AMDGPU::INSTRUCTION_LIST_END) {
      // ...
      MI->setDesc(TII->get(NewOpc));
      // ...
      bool FoldAsMAD = tryAddToFoldList(FoldList, MI, OpNo, OpToFold);
      // ...
      MI->untieRegOperand(OpNo);
      // ...
      MI->removeOperand(MI->getNumExplicitOperands() - 1);
      // ...
    }

    // ...
    if (Opc == AMDGPU::S_FMAC_F32 && OpNo == 3) {
      if (tryToFoldAsFMAAKorMK())
        return true;
    }

    // ...
    if (OpToFold.isImm()) {
      // ...
      if (Opc == AMDGPU::S_SETREG_B32)
        ImmOpc = AMDGPU::S_SETREG_IMM32_B32;
      else if (Opc == AMDGPU::S_SETREG_B32_mode)
        ImmOpc = AMDGPU::S_SETREG_IMM32_B32_mode;
      // ...
        MI->setDesc(TII->get(ImmOpc));
      // ...
    }

  // ...
  bool CanCommute = TII->findCommutedOpIndices(*MI, OpNo, CommuteOpNo);
  // ...
  MachineOperand &Op = MI->getOperand(OpNo);
  MachineOperand &CommutedOp = MI->getOperand(CommuteOpNo);

  // ...
  if (!Op.isReg() || !CommutedOp.isReg())
    return false;

  // ...
  if (Op.isReg() && CommutedOp.isReg() &&
      (Op.getReg() == CommutedOp.getReg() &&
       // ...
    return false;

  if (!TII->commuteInstruction(*MI, false, OpNo, CommuteOpNo))
    return false;

  if (!OpToFold.isOperandLegal(*TII, *MI, CommuteOpNo)) {
    if ((Opc != AMDGPU::V_ADD_CO_U32_e64 && Opc != AMDGPU::V_SUB_CO_U32_e64 &&
         Opc != AMDGPU::V_SUBREV_CO_U32_e64) ||
        (!OpToFold.isImm() && !OpToFold.isFI() && !OpToFold.isGlobal())) {
      TII->commuteInstruction(*MI, false, OpNo, CommuteOpNo);
      return false;
    }

    // ...
    MachineOperand &OtherOp = MI->getOperand(OpNo);
    if (!OtherOp.isReg() ||
        // ...
      return false;

    // ...
    unsigned MaybeCommutedOpc = MI->getOpcode();
    // ...
  }

  // ...
  if (Opc == AMDGPU::S_FMAC_F32 &&
      (OpNo != 1 || !MI->getOperand(1).isIdenticalTo(MI->getOperand(2)))) {
    if (tryToFoldAsFMAAKorMK())
      return true;
  }
  // ...
}
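// isUseSafeToFold rejects SDWA uses. lookUpCopyChain walks through foldable
// copies to find the original immediate or register source, and the two
// getRegSeqInit overloads collect the (operand, subregister) inputs of a
// REG_SEQUENCE definition.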
bool SIFoldOperandsImpl::isUseSafeToFold(const MachineInstr &MI,
                                         const MachineOperand &UseMO) const {
  // ...
  return !TII->isSDWA(MI);
}

// ...
  for (MachineInstr *SubDef = MRI.getVRegDef(SrcReg);
       SubDef && TII.isFoldableCopy(*SubDef);
       SubDef = MRI.getVRegDef(Sub->getReg())) {
    unsigned SrcIdx = TII.getFoldableCopySrcIdx(*SubDef);
    // ...
    if (SrcOp.getSubReg())
      break;
    // ...
  }
// ...

const TargetRegisterClass *SIFoldOperandsImpl::getRegSeqInit(
    MachineInstr &RegSeq,
    SmallVectorImpl<std::pair<MachineOperand *, unsigned>> &Defs) const {
  // ...
  const TargetRegisterClass *RC = nullptr;
  // ...
    else if (!TRI->getCommonSubClass(RC, OpRC))
      // ...

    // ...
      Defs.emplace_back(&SrcOp, SubRegIdx);
    // ...
    if (DefSrc && (DefSrc->isReg() || DefSrc->isImm())) {
      Defs.emplace_back(DefSrc, SubRegIdx);
      // ...
    }

    Defs.emplace_back(&SrcOp, SubRegIdx);
  // ...
}

const TargetRegisterClass *SIFoldOperandsImpl::getRegSeqInit(
    SmallVectorImpl<std::pair<MachineOperand *, unsigned>> &Defs,
    Register UseReg) const {
  // ...
  if (!Def || !Def->isRegSequence())
    return nullptr;

  return getRegSeqInit(*Def, Defs);
}
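// isRegSeqSplat: if a REG_SEQUENCE is built entirely from the same immediate,
// return the splat value and its register class, also matching 64-bit splats
// formed from adjacent 32-bit halves. tryFoldRegSeqSplat checks whether that
// splat value is usable at a given operand of a user instruction.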
std::pair<int64_t, const TargetRegisterClass *>
SIFoldOperandsImpl::isRegSeqSplat(MachineInstr &RegSeq) const {
  // ...
  const TargetRegisterClass *SrcRC = getRegSeqInit(RegSeq, Defs);
  // ...
  bool TryToMatchSplat64 = false;

  // ...
  for (unsigned I = 0, E = Defs.size(); I != E; ++I) {
    const MachineOperand *Op = Defs[I].first;
    // ...
    int64_t SubImm = Op->getImm();
    // ...
    if (Imm != SubImm) {
      if (I == 1 && (E & 1) == 0) {
        // ...
        TryToMatchSplat64 = true;
        // ...
      }
      // ...
    }
  }

  if (!TryToMatchSplat64)
    return {Defs[0].first->getImm(), SrcRC};

  // ...
  for (unsigned I = 0, E = Defs.size(); I != E; I += 2) {
    const MachineOperand *Op0 = Defs[I].first;
    const MachineOperand *Op1 = Defs[I + 1].first;
    // ...
    unsigned SubReg0 = Defs[I].second;
    unsigned SubReg1 = Defs[I + 1].second;

    // ...
    if (TRI->getChannelFromSubReg(SubReg0) + 1 !=
        TRI->getChannelFromSubReg(SubReg1))
      // ...

    // ...
      SplatVal64 = MergedVal;
    else if (SplatVal64 != MergedVal)
      // ...
  }

  const TargetRegisterClass *RC64 = TRI->getSubRegisterClass(
      // ...

  return {SplatVal64, RC64};
}

bool SIFoldOperandsImpl::tryFoldRegSeqSplat(
    MachineInstr *UseMI, unsigned UseOpIdx, int64_t SplatVal,
    const TargetRegisterClass *SplatRC) const {
  // ...
  if (UseOpIdx >= Desc.getNumOperands())
    return false;

  // ...
  int16_t RCID = TII->getOpRegClassID(Desc.operands()[UseOpIdx]);
  // ...
  const TargetRegisterClass *OpRC = TRI->getRegClass(RCID);

  // ...
  if (SplatVal != 0 && SplatVal != -1) {
    // ...
    uint8_t OpTy = Desc.operands()[UseOpIdx].OperandType;
    // ...
      OpRC = TRI->getSubRegisterClass(OpRC, AMDGPU::sub0);
    // ...
      OpRC = TRI->getSubRegisterClass(OpRC, AMDGPU::sub0_sub1);
    // ...
  }

  if (!TRI->getCommonSubClass(OpRC, SplatRC))
    return false;

  // ...
  if (!TII->isOperandLegal(*UseMI, UseOpIdx, &TmpOp))
    return false;
  // ...
}
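// tryToFoldACImm records an immediate fold when it is already legal at the
// use. foldOperand is the core per-use folding routine: it handles
// REG_SEQUENCE users (including splat immediates), frame indexes in
// MUBUF/flat-scratch addressing, rewriting copies into move-immediates, and
// V_READFIRSTLANE/V_READLANE of constants, otherwise deferring to
// tryAddToFoldList.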
bool SIFoldOperandsImpl::tryToFoldACImm(
    const FoldableDef &OpToFold, MachineInstr *UseMI, unsigned UseOpIdx,
    SmallVectorImpl<FoldCandidate> &FoldList) const {
  // ...
  if (UseOpIdx >= Desc.getNumOperands())
    return false;

  // ...
  if (OpToFold.isImm() && OpToFold.isOperandLegal(*TII, *UseMI, UseOpIdx)) {
    // ...
  }
  // ...
}

void SIFoldOperandsImpl::foldOperand(
    FoldableDef OpToFold, MachineInstr *UseMI, int UseOpIdx,
    SmallVectorImpl<FoldCandidate> &FoldList,
    SmallVectorImpl<MachineInstr *> &CopiesToReplace) const {
  // ...
  if (!isUseSafeToFold(*UseMI, *UseOp))
    return;

  // ...
  if (UseOp->isReg() && OpToFold.isReg()) {
    // ...
    if (UseOp->getSubReg() != AMDGPU::NoSubRegister &&
        // ...
         !TRI->isSGPRReg(*MRI, OpToFold.getReg())))
      return;
    // ...
  }

  // ...
    const TargetRegisterClass *SplatRC;
    std::tie(SplatVal, SplatRC) = isRegSeqSplat(*UseMI);

    // ...
    for (unsigned I = 0; I != UsesToProcess.size(); ++I) {
      MachineOperand *RSUse = UsesToProcess[I];
      MachineInstr *RSUseMI = RSUse->getParent();
      // ...
      if (tryFoldRegSeqSplat(RSUseMI, OpNo, SplatVal, SplatRC)) {
        FoldableDef SplatDef(SplatVal, SplatRC);
        // ...
      }

      // ...
      if (RSUse->getSubReg() != RegSeqDstSubReg)
        continue;

      foldOperand(OpToFold, RSUseMI, RSUseMI->getOperandNo(RSUse), FoldList,
                  CopiesToReplace);
    }
    // ...

  if (tryToFoldACImm(OpToFold, UseMI, UseOpIdx, FoldList))
    return;

  if (frameIndexMayFold(*UseMI, UseOpIdx, OpToFold)) {
    // ...
      if (TII->getNamedOperand(*UseMI, AMDGPU::OpName::srsrc)->getReg() !=
          // ...

      // ...
      MachineOperand &SOff =
          *TII->getNamedOperand(*UseMI, AMDGPU::OpName::soffset);
      // ...
          TII->getNamedOperand(*UseMI, AMDGPU::OpName::cpol)->getImm();
    // ...
  }

  bool FoldingImmLike =
      OpToFold.isImm() || OpToFold.isFI() || OpToFold.isGlobal();

  // ...
    const TargetRegisterClass *SrcRC = MRI->getRegClass(SrcReg);
    // ...
    const TargetRegisterClass *DestRC = TRI->getRegClassForReg(*MRI, DestReg);
    // ...
    for (unsigned MovOp :
         {AMDGPU::S_MOV_B32, AMDGPU::V_MOV_B32_e32, AMDGPU::S_MOV_B64,
          AMDGPU::V_MOV_B64_PSEUDO, AMDGPU::V_MOV_B16_t16_e64,
          AMDGPU::V_ACCVGPR_WRITE_B32_e64, AMDGPU::AV_MOV_B32_IMM_PSEUDO,
          AMDGPU::AV_MOV_B64_IMM_PSEUDO}) {
      const MCInstrDesc &MovDesc = TII->get(MovOp);
      const TargetRegisterClass *MovDstRC =
          // ...

      const int SrcIdx = MovOp == AMDGPU::V_MOV_B16_t16_e64 ? 2 : 1;
      const TargetRegisterClass *MovSrcRC =
          TRI->getRegClass(TII->getOpRegClassID(MovDesc.operands()[SrcIdx]));
      // ...
        MovSrcRC = TRI->getMatchingSuperRegClass(SrcRC, MovSrcRC, UseSubReg);
      // ...
      if (MovOp == AMDGPU::AV_MOV_B32_IMM_PSEUDO &&
          (!OpToFold.isImm() ||
           !TII->isImmOperandLegal(MovDesc, SrcIdx,
                                   *OpToFold.getEffectiveImmVal())))
        // ...

      if (!MRI->constrainRegClass(SrcReg, MovSrcRC))
        // ...
    }

    // ...
      if (!OpToFold.isImm() ||
          !TII->isImmOperandLegal(MovDesc, 1, *OpToFold.getEffectiveImmVal()))
        // ...

    // ...
    while (ImpOpI != ImpOpE) {
      // ...
    }

    // ...
    if (MovOp == AMDGPU::V_MOV_B16_t16_e64) {
      // ...
      MachineOperand NewSrcOp(SrcOp);
      // ...
    }
    // ...

    LLVM_DEBUG(dbgs() << "Folding " << OpToFold.OpToFold << "\n into "
                      // ...

    // ...
    unsigned SubRegIdx = OpToFold.getSubReg();
    // ...
    static_assert(AMDGPU::sub1_hi16 == 12, "Subregister layout has changed");

    // ...
    if (SubRegIdx > AMDGPU::sub1) {
      LaneBitmask M = TRI->getSubRegIndexLaneMask(SubRegIdx);
      M |= M.getLane(M.getHighestLane() - 1);
      SmallVector<unsigned, 4> Indexes;
      TRI->getCoveringSubRegIndexes(TRI->getRegClassForReg(*MRI, UseReg), M,
                                    Indexes);
      assert(Indexes.size() == 1 && "Expected one 32-bit subreg to cover");
      SubRegIdx = Indexes[0];
    } else if (TII->getOpSize(*UseMI, 1) == 4)
      SubRegIdx = AMDGPU::sub0;
    // ...

    OpToFold.OpToFold->setIsKill(false);
    // ...

    if (foldCopyToAGPRRegSequence(UseMI))
      return;
  // ...

  if (UseOpc == AMDGPU::V_READFIRSTLANE_B32 ||
      (UseOpc == AMDGPU::V_READLANE_B32 &&
       // ...
           AMDGPU::getNamedOperandIdx(UseOpc, AMDGPU::OpName::src0))) {
    // ...
    if (FoldingImmLike) {
      // ...
              *OpToFold.DefMI, *UseMI))
        // ...

      if (OpToFold.isImm()) {
        // ...
            *OpToFold.getEffectiveImmVal());
      } else if (OpToFold.isFI())
        // ...
      else {
        assert(OpToFold.isGlobal());
        // ...
                 OpToFold.OpToFold->getOffset(),
                 OpToFold.OpToFold->getTargetFlags());
      }
      // ...
    }

    if (OpToFold.isReg() && TRI->isSGPRReg(*MRI, OpToFold.getReg())) {
      // ...
              *OpToFold.DefMI, *UseMI))
        // ...
    }
    // ...
  }

  // ...
      UseDesc.operands()[UseOpIdx].RegClass == -1)
    // ...

  // ...
  tryAddToFoldList(FoldList, UseMI, UseOpIdx, OpToFold);
  // ...
}
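// evalBinaryInstruction: constant-fold a 32-bit binary opcode on two known
// immediates. getMovOpc picks S_MOV_B32 or V_MOV_B32_e32, and mutateCopyOp
// rewrites an instruction into a move, dropping operands beyond the new
// descriptor.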
  case AMDGPU::V_AND_B32_e64:
  case AMDGPU::V_AND_B32_e32:
  case AMDGPU::S_AND_B32:
    Result = LHS & RHS;
    return true;
  case AMDGPU::V_OR_B32_e64:
  case AMDGPU::V_OR_B32_e32:
  case AMDGPU::S_OR_B32:
    Result = LHS | RHS;
    return true;
  case AMDGPU::V_XOR_B32_e64:
  case AMDGPU::V_XOR_B32_e32:
  case AMDGPU::S_XOR_B32:
    Result = LHS ^ RHS;
    return true;
  case AMDGPU::S_XNOR_B32:
    Result = ~(LHS ^ RHS);
    return true;
  case AMDGPU::S_NAND_B32:
    Result = ~(LHS & RHS);
    return true;
  case AMDGPU::S_NOR_B32:
    Result = ~(LHS | RHS);
    return true;
  case AMDGPU::S_ANDN2_B32:
    Result = LHS & ~RHS;
    return true;
  case AMDGPU::S_ORN2_B32:
    Result = LHS | ~RHS;
    return true;
  case AMDGPU::V_LSHL_B32_e64:
  case AMDGPU::V_LSHL_B32_e32:
  case AMDGPU::S_LSHL_B32:
    Result = LHS << (RHS & 31);
    return true;
  case AMDGPU::V_LSHLREV_B32_e64:
  case AMDGPU::V_LSHLREV_B32_e32:
    Result = RHS << (LHS & 31);
    return true;
  case AMDGPU::V_LSHR_B32_e64:
  case AMDGPU::V_LSHR_B32_e32:
  case AMDGPU::S_LSHR_B32:
    Result = LHS >> (RHS & 31);
    return true;
  case AMDGPU::V_LSHRREV_B32_e64:
  case AMDGPU::V_LSHRREV_B32_e32:
    Result = RHS >> (LHS & 31);
    return true;
  case AMDGPU::V_ASHR_I32_e64:
  case AMDGPU::V_ASHR_I32_e32:
  case AMDGPU::S_ASHR_I32:
    Result = static_cast<int32_t>(LHS) >> (RHS & 31);
    return true;
  case AMDGPU::V_ASHRREV_I32_e64:
  case AMDGPU::V_ASHRREV_I32_e32:
    Result = static_cast<int32_t>(RHS) >> (LHS & 31);
    return true;
  // ...

// ...
  return IsScalar ? AMDGPU::S_MOV_B32 : AMDGPU::V_MOV_B32_e32;

// ...
  MI.setDesc(NewDesc);

  // ...
  unsigned NumOps = Desc.getNumOperands() + Desc.implicit_uses().size() +
                    Desc.implicit_defs().size();
  // ...
  for (unsigned I = MI.getNumOperands() - 1; I >= NumOps; --I)
    MI.removeOperand(I);
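// getImmOrMaterializedImm: return an operand's immediate value, looking
// through a move-immediate definition of a virtual register.
// tryConstantFoldOp constant-folds NOT and binary ops with known operands and
// simplifies OR/AND/XOR with 0 or -1.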
std::optional<int64_t>
SIFoldOperandsImpl::getImmOrMaterializedImm(MachineOperand &Op) const {
  // ...
  if (!Op.isReg() || !Op.getReg().isVirtual())
    return std::nullopt;

  const MachineInstr *Def = MRI->getVRegDef(Op.getReg());
  if (Def && Def->isMoveImmediate()) {
    const MachineOperand &ImmSrc = Def->getOperand(1);
    // ...
      return TII->extractSubregFromImm(ImmSrc.getImm(), Op.getSubReg());
  }

  return std::nullopt;
}

bool SIFoldOperandsImpl::tryConstantFoldOp(MachineInstr *MI) const {
  if (!MI->allImplicitDefsAreDead())
    return false;

  unsigned Opc = MI->getOpcode();
  // ...
  int Src0Idx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::src0);
  // ...
  MachineOperand *Src0 = &MI->getOperand(Src0Idx);
  std::optional<int64_t> Src0Imm = getImmOrMaterializedImm(*Src0);

  if ((Opc == AMDGPU::V_NOT_B32_e64 || Opc == AMDGPU::V_NOT_B32_e32 ||
       Opc == AMDGPU::S_NOT_B32) &&
      // ...
    MI->getOperand(1).ChangeToImmediate(~*Src0Imm);
    // ...
  }

  // ...
  int Src1Idx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::src1);
  // ...
  MachineOperand *Src1 = &MI->getOperand(Src1Idx);
  std::optional<int64_t> Src1Imm = getImmOrMaterializedImm(*Src1);

  if (!Src0Imm && !Src1Imm)
    return false;

  // ...
  if (Src0Imm && Src1Imm) {
    // ...
    bool IsSGPR = TRI->isSGPRReg(*MRI, MI->getOperand(0).getReg());
    // ...
    MI->getOperand(Src0Idx).ChangeToImmediate(NewImm);
    MI->removeOperand(Src1Idx);
    // ...
  }

  if (!MI->isCommutable())
    return false;

  if (Src0Imm && !Src1Imm) {
    // ...
  }

  int32_t Src1Val = static_cast<int32_t>(*Src1Imm);
  if (Opc == AMDGPU::V_OR_B32_e64 ||
      Opc == AMDGPU::V_OR_B32_e32 ||
      Opc == AMDGPU::S_OR_B32) {
    // ...
      MI->removeOperand(Src1Idx);
      // ...
    } else if (Src1Val == -1) {
      // ...
      MI->removeOperand(Src1Idx);
      // ...
    }
    // ...
  }

  if (Opc == AMDGPU::V_AND_B32_e64 || Opc == AMDGPU::V_AND_B32_e32 ||
      Opc == AMDGPU::S_AND_B32) {
    // ...
      MI->removeOperand(Src0Idx);
      // ...
    } else if (Src1Val == -1) {
      // ...
      MI->removeOperand(Src1Idx);
      // ...
    }
    // ...
  }

  if (Opc == AMDGPU::V_XOR_B32_e64 || Opc == AMDGPU::V_XOR_B32_e32 ||
      Opc == AMDGPU::S_XOR_B32) {
    // ...
      MI->removeOperand(Src1Idx);
      // ...
  }
  // ...
}
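// tryFoldCndMask: if both sources of a V_CNDMASK are the same materialized
// immediate (with no source modifiers), drop the select and keep a plain
// move of that value.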
bool SIFoldOperandsImpl::tryFoldCndMask(MachineInstr &MI) const {
  unsigned Opc = MI.getOpcode();
  if (Opc != AMDGPU::V_CNDMASK_B32_e32 && Opc != AMDGPU::V_CNDMASK_B32_e64 &&
      Opc != AMDGPU::V_CNDMASK_B64_PSEUDO)
    return false;

  MachineOperand *Src0 = TII->getNamedOperand(MI, AMDGPU::OpName::src0);
  MachineOperand *Src1 = TII->getNamedOperand(MI, AMDGPU::OpName::src1);
  // ...
  std::optional<int64_t> Src1Imm = getImmOrMaterializedImm(*Src1);
  // ...
  std::optional<int64_t> Src0Imm = getImmOrMaterializedImm(*Src0);
  if (!Src0Imm || *Src0Imm != *Src1Imm)
    return false;

  // ...
  int Src1ModIdx =
      AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::src1_modifiers);
  int Src0ModIdx =
      AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::src0_modifiers);
  if ((Src1ModIdx != -1 && MI.getOperand(Src1ModIdx).getImm() != 0) ||
      (Src0ModIdx != -1 && MI.getOperand(Src0ModIdx).getImm() != 0))
    return false;

  // ...
  int Src2Idx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::src2);
  // ...
    MI.removeOperand(Src2Idx);
  MI.removeOperand(AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::src1));
  if (Src1ModIdx != -1)
    MI.removeOperand(Src1ModIdx);
  if (Src0ModIdx != -1)
    MI.removeOperand(Src0ModIdx);
  // ...
}
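// tryFoldZeroHighBits: remove a V_AND with 0xffff when the other source is
// produced by an instruction that already zeroes the high 16 bits of its
// result.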
bool SIFoldOperandsImpl::tryFoldZeroHighBits(MachineInstr &MI) const {
  if (MI.getOpcode() != AMDGPU::V_AND_B32_e64 &&
      MI.getOpcode() != AMDGPU::V_AND_B32_e32)
    return false;

  std::optional<int64_t> Src0Imm = getImmOrMaterializedImm(MI.getOperand(1));
  if (!Src0Imm || *Src0Imm != 0xffff || !MI.getOperand(2).isReg())
    return false;

  // ...
  MachineInstr *SrcDef = MRI->getVRegDef(Src1);
  // ...

  MRI->replaceRegWith(Dst, Src1);
  if (!MI.getOperand(2).isKill())
    MRI->clearKillFlags(Src1);
  MI.eraseFromParent();
  // ...
}
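// foldInstOperand: collect every use of a foldable def, build the FoldList
// via foldOperand, apply the candidates with updateOperand, and then try
// constant-folding the users that received immediates.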
bool SIFoldOperandsImpl::foldInstOperand(MachineInstr &MI,
                                         const FoldableDef &OpToFold) const {
  // ...
  SmallVector<MachineInstr *, 4> CopiesToReplace;
  // ...
  MachineOperand &Dst = MI.getOperand(0);
  // ...
  if (OpToFold.isImm()) {
    // ...
      if (tryConstantFoldOp(&UseMI)) {
        // ...
      }
    // ...
  }

  // ...
  for (auto *U : UsesToProcess) {
    MachineInstr *UseMI = U->getParent();
    // ...
    FoldableDef SubOpToFold = OpToFold.getWithSubReg(*TRI, U->getSubReg());
    // ...
  }

  if (CopiesToReplace.empty() && FoldList.empty())
    return false;

  MachineFunction *MF = MI.getParent()->getParent();
  // ...
  for (MachineInstr *Copy : CopiesToReplace)
    Copy->addImplicitDefUseOperands(*MF);

  SetVector<MachineInstr *> ConstantFoldCandidates;
  for (FoldCandidate &Fold : FoldList) {
    assert(!Fold.isReg() || Fold.Def.OpToFold);
    if (Fold.isReg() && Fold.getReg().isVirtual()) {
      // ...
      const MachineInstr *DefMI = Fold.Def.DefMI;
      // ...
    }
    // ...
      assert(Fold.Def.OpToFold && Fold.isReg());
      // ...
      MRI->clearKillFlags(Fold.getReg());
      // ...
                 << static_cast<int>(Fold.UseOpNo) << " of "
      // ...
        ConstantFoldCandidates.insert(Fold.UseMI);
      // ...
    } else if (Fold.Commuted) {
      // ...
      TII->commuteInstruction(*Fold.UseMI, false);
    }
  }

  for (MachineInstr *MI : ConstantFoldCandidates) {
    if (tryConstantFoldOp(MI)) {
      // ...
    }
  }
  // ...
}
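// foldCopyToAGPRRegSequence: turn a COPY of a REG_SEQUENCE into an AGPR class
// into a REG_SEQUENCE of AGPR_32 pieces, materializing foldable inputs with
// V_ACCVGPR_WRITE and reusing VGPR copies where needed.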
bool SIFoldOperandsImpl::foldCopyToAGPRRegSequence(MachineInstr *CopyMI) const {
  // ...
  const TargetRegisterClass *DefRC =
      // ...
  if (!TRI->isAGPRClass(DefRC))
    return false;

  // ...
  MachineInstr *RegSeq = MRI->getVRegDef(UseReg);
  // ...
  DenseMap<TargetInstrInfo::RegSubRegPair, Register> VGPRCopies;
  // ...
  const TargetRegisterClass *UseRC =
      // ...

  unsigned NumFoldable = 0;

  for (unsigned I = 1; I != NumRegSeqOperands; I += 2) {
    // ...
    const TargetRegisterClass *DestSuperRC = TRI->getMatchingSuperRegClass(
        DefRC, &AMDGPU::AGPR_32RegClass, SubRegIdx);
    // ...
    const TargetRegisterClass *InputRC =
        // ...
    const TargetRegisterClass *MatchRC =
        TRI->getMatchingSuperRegClass(DefRC, InputRC, SubRegIdx);
    // ...
  }

  if (NumFoldable == 0)
    return false;

  CopyMI->setDesc(TII->get(AMDGPU::REG_SEQUENCE));
  // ...
  for (auto [Def, DestSubIdx] : NewDefs) {
    if (!Def->isReg()) {
      // ...
      Register Tmp = MRI->createVirtualRegister(&AMDGPU::AGPR_32RegClass);
      BuildMI(MBB, CopyMI, DL, TII->get(AMDGPU::V_ACCVGPR_WRITE_B32_e64), Tmp)
          // ...
    } else {
      // ...
      Def->setIsKill(false);
      // ...
      Register &VGPRCopy = VGPRCopies[Src];
      // ...
        const TargetRegisterClass *VGPRUseSubRC =
            TRI->getSubRegisterClass(UseRC, DestSubIdx);
        // ...
        const TargetRegisterClass *SubRC =
            TRI->getSubRegisterClass(MRI->getRegClass(Src.Reg), Src.SubReg);
        // ...
        VGPRCopy = MRI->createVirtualRegister(VGPRUseSubRC);
        // ...
    }
    // ...
    B.addImm(DestSubIdx);
  }
  // ...
}
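// tryFoldFoldableCopy: entry point for each foldable copy/move. Tracks a
// known M0 value to delete redundant M0 writes, builds a FoldableDef for the
// source operand, runs foldInstOperand, and cleans up dead copy chains
// afterwards.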
bool SIFoldOperandsImpl::tryFoldFoldableCopy(
    MachineInstr &MI, MachineOperand *&CurrentKnownM0Val) const {
  // ...
  if (DstReg == AMDGPU::M0) {
    MachineOperand &NewM0Val = MI.getOperand(1);
    if (CurrentKnownM0Val && CurrentKnownM0Val->isIdenticalTo(NewM0Val)) {
      MI.eraseFromParent();
      return true;
    }
    // ...
  }

  MachineOperand *OpToFoldPtr;
  if (MI.getOpcode() == AMDGPU::V_MOV_B16_t16_e64) {
    // ...
    if (TII->hasAnyModifiersSet(MI))
      return false;
    OpToFoldPtr = &MI.getOperand(2);
  } else {
    OpToFoldPtr = &MI.getOperand(1);
  }
  MachineOperand &OpToFold = *OpToFoldPtr;
  // ...
  if (!FoldingImm && !OpToFold.isReg())
    return false;
  // ...
      !TRI->isConstantPhysReg(OpToFold.getReg()))
    return false;
  // ...
  const TargetRegisterClass *DstRC =
      MRI->getRegClass(MI.getOperand(0).getReg());
  // ...
  if (MI.getOpcode() == AMDGPU::COPY && OpToFold.isReg() &&
      // ...
    if (DstRC == &AMDGPU::SReg_32RegClass &&
        DstRC == MRI->getRegClass(OpToFold.getReg())) {
      // ...
    }
  // ...

  if (OpToFold.isReg() && MI.isCopy() && !MI.getOperand(1).getSubReg()) {
    if (foldCopyToAGPRRegSequence(&MI))
      return true;
  }

  FoldableDef Def(OpToFold, DstRC);
  bool Changed = foldInstOperand(MI, Def);
  // ...
  auto *InstToErase = &MI;
  while (MRI->use_nodbg_empty(InstToErase->getOperand(0).getReg())) {
    auto &SrcOp = InstToErase->getOperand(1);
    // ...
    InstToErase->eraseFromParent();
    // ...
    InstToErase = nullptr;
    // ...
    InstToErase = MRI->getVRegDef(SrcReg);
    if (!InstToErase || !TII->isFoldableCopy(*InstToErase))
      break;
  }

  if (InstToErase && InstToErase->isRegSequence() &&
      MRI->use_nodbg_empty(InstToErase->getOperand(0).getReg())) {
    InstToErase->eraseFromParent();
    // ...
  }
  // ...
  return OpToFold.isReg() &&
         foldCopyToVGPROfScalarAddOfFrameIndex(DstReg, OpToFold.getReg(), MI);
}
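// isClamp / tryFoldClamp: recognize a clamping max(x, x) with the clamp bit
// set and fold the clamp into the defining instruction.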
const MachineOperand *
SIFoldOperandsImpl::isClamp(const MachineInstr &MI) const {
  unsigned Op = MI.getOpcode();
  switch (Op) {
  case AMDGPU::V_MAX_F32_e64:
  case AMDGPU::V_MAX_F16_e64:
  case AMDGPU::V_MAX_F16_t16_e64:
  case AMDGPU::V_MAX_F16_fake16_e64:
  case AMDGPU::V_MAX_F64_e64:
  case AMDGPU::V_MAX_NUM_F64_e64:
  case AMDGPU::V_PK_MAX_F16:
  case AMDGPU::V_MAX_BF16_PSEUDO_e64:
  case AMDGPU::V_PK_MAX_NUM_BF16: {
    if (MI.mayRaiseFPException())
      return nullptr;

    if (!TII->getNamedOperand(MI, AMDGPU::OpName::clamp)->getImm())
      return nullptr;

    // ...
    const MachineOperand *Src0 = TII->getNamedOperand(MI, AMDGPU::OpName::src0);
    const MachineOperand *Src1 = TII->getNamedOperand(MI, AMDGPU::OpName::src1);
    // ...
        Src0->getSubReg() != AMDGPU::NoSubRegister)
      return nullptr;

    // ...
    if (TII->hasModifiersSet(MI, AMDGPU::OpName::omod))
      return nullptr;

    // ...
        = TII->getNamedOperand(MI, AMDGPU::OpName::src0_modifiers)->getImm();
    // ...
        = TII->getNamedOperand(MI, AMDGPU::OpName::src1_modifiers)->getImm();

    // ...
    unsigned UnsetMods =
        (Op == AMDGPU::V_PK_MAX_F16 || Op == AMDGPU::V_PK_MAX_NUM_BF16)
            // ...
    if (Src0Mods != UnsetMods && Src1Mods != UnsetMods)
      return nullptr;
    // ...
  }
  // ...
  }
}

bool SIFoldOperandsImpl::tryFoldClamp(MachineInstr &MI) {
  const MachineOperand *ClampSrc = isClamp(MI);
  if (!ClampSrc || !MRI->hasOneNonDBGUser(ClampSrc->getReg()))
    return false;

  // ...
  if (TII->getClampMask(*Def) != TII->getClampMask(MI))
    return false;

  if (Def->mayRaiseFPException())
    return false;

  MachineOperand *DefClamp = TII->getNamedOperand(*Def, AMDGPU::OpName::clamp);
  // ...
  LLVM_DEBUG(dbgs() << "Folding clamp " << *DefClamp << " into " << *Def);
  // ...
  Register MIDstReg = MI.getOperand(0).getReg();
  if (TRI->isSGPRReg(*MRI, DefReg)) {
    // ...
  }
  // ...
  MRI->replaceRegWith(MIDstReg, DefReg);
  // ...
  MI.eraseFromParent();
  // ...
  if (TII->convertToThreeAddress(*Def, nullptr, nullptr))
    Def->eraseFromParent();
  // ...
}
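// getOModValue / isOMod / tryFoldOMod: match a multiply or add by 0.5, 2.0 or
// 4.0 (with no other modifiers and with safe denormal/exception modes) and
// fold it into the defining instruction's output-modifier (omod) field.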
  case AMDGPU::V_MUL_F64_e64:
  case AMDGPU::V_MUL_F64_pseudo_e64: {
    switch (Val) {
    case 0x3fe0000000000000:
      // ...
    case 0x4000000000000000:
      // ...
    case 0x4010000000000000:
      // ...
    }
  }
  case AMDGPU::V_MUL_F32_e64: {
    switch (static_cast<uint32_t>(Val)) {
    // ...
    }
  }
  case AMDGPU::V_MUL_F16_e64:
  case AMDGPU::V_MUL_F16_t16_e64:
  case AMDGPU::V_MUL_F16_fake16_e64: {
    switch (static_cast<uint16_t>(Val)) {
    // ...
    }
  }
  // ...

std::pair<const MachineOperand *, int>
SIFoldOperandsImpl::isOMod(const MachineInstr &MI) const {
  unsigned Op = MI.getOpcode();
  switch (Op) {
  case AMDGPU::V_MUL_F64_e64:
  case AMDGPU::V_MUL_F64_pseudo_e64:
  case AMDGPU::V_MUL_F32_e64:
  case AMDGPU::V_MUL_F16_t16_e64:
  case AMDGPU::V_MUL_F16_fake16_e64:
  case AMDGPU::V_MUL_F16_e64: {
    // ...
    if ((Op == AMDGPU::V_MUL_F32_e64 &&
         // ...
        ((Op == AMDGPU::V_MUL_F64_e64 || Op == AMDGPU::V_MUL_F64_pseudo_e64 ||
          Op == AMDGPU::V_MUL_F16_e64 || Op == AMDGPU::V_MUL_F16_t16_e64 ||
          Op == AMDGPU::V_MUL_F16_fake16_e64) &&
         // ...
        MI.mayRaiseFPException())
      // ...

    const MachineOperand *RegOp = nullptr;
    const MachineOperand *ImmOp = nullptr;
    const MachineOperand *Src0 = TII->getNamedOperand(MI, AMDGPU::OpName::src0);
    const MachineOperand *Src1 = TII->getNamedOperand(MI, AMDGPU::OpName::src1);
    if (Src0->isImm()) {
      // ...
    } else if (Src1->isImm()) {
      // ...
    }
    // ...
        TII->hasModifiersSet(MI, AMDGPU::OpName::src0_modifiers) ||
        TII->hasModifiersSet(MI, AMDGPU::OpName::src1_modifiers) ||
        TII->hasModifiersSet(MI, AMDGPU::OpName::omod) ||
        TII->hasModifiersSet(MI, AMDGPU::OpName::clamp))
      // ...

    return std::pair(RegOp, OMod);
  }
  case AMDGPU::V_ADD_F64_e64:
  case AMDGPU::V_ADD_F64_pseudo_e64:
  case AMDGPU::V_ADD_F32_e64:
  case AMDGPU::V_ADD_F16_e64:
  case AMDGPU::V_ADD_F16_t16_e64:
  case AMDGPU::V_ADD_F16_fake16_e64: {
    // ...
    if ((Op == AMDGPU::V_ADD_F32_e64 &&
         // ...
        ((Op == AMDGPU::V_ADD_F64_e64 || Op == AMDGPU::V_ADD_F64_pseudo_e64 ||
          Op == AMDGPU::V_ADD_F16_e64 || Op == AMDGPU::V_ADD_F16_t16_e64 ||
          Op == AMDGPU::V_ADD_F16_fake16_e64) &&
         // ...
      // ...

    const MachineOperand *Src0 = TII->getNamedOperand(MI, AMDGPU::OpName::src0);
    const MachineOperand *Src1 = TII->getNamedOperand(MI, AMDGPU::OpName::src1);
    // ...
        !TII->hasModifiersSet(MI, AMDGPU::OpName::src0_modifiers) &&
        !TII->hasModifiersSet(MI, AMDGPU::OpName::src1_modifiers) &&
        !TII->hasModifiersSet(MI, AMDGPU::OpName::clamp) &&
        !TII->hasModifiersSet(MI, AMDGPU::OpName::omod))
      // ...
    // ...
  }
  // ...
  }
}

bool SIFoldOperandsImpl::tryFoldOMod(MachineInstr &MI) {
  const MachineOperand *RegOp;
  // ...
  std::tie(RegOp, OMod) = isOMod(MI);
  // ...
      RegOp->getSubReg() != AMDGPU::NoSubRegister ||
      !MRI->hasOneNonDBGUser(RegOp->getReg()))
    return false;

  // ...
  MachineOperand *DefOMod = TII->getNamedOperand(*Def, AMDGPU::OpName::omod);
  // ...
  if (Def->mayRaiseFPException())
    return false;

  // ...
  if (TII->hasModifiersSet(*Def, AMDGPU::OpName::clamp))
    return false;

  // ...
  MRI->replaceRegWith(MI.getOperand(0).getReg(), Def->getOperand(0).getReg());
  // ...
  MRI->clearKillFlags(Def->getOperand(0).getReg());
  MI.eraseFromParent();
  // ...
  if (TII->convertToThreeAddress(*Def, nullptr, nullptr))
    Def->eraseFromParent();
  // ...
}
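// tryFoldRegSequence: if a REG_SEQUENCE built from AGPR inputs has a single
// use by an instruction that accepts an AV super-class operand, rebuild it
// directly as an AGPR REG_SEQUENCE and drop the intermediate copy.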
bool SIFoldOperandsImpl::tryFoldRegSequence(MachineInstr &MI) {
  // ...
  auto Reg = MI.getOperand(0).getReg();
  // ...
      !MRI->hasOneNonDBGUse(Reg))
    return false;

  // ...
  if (!getRegSeqInit(Defs, Reg))
    return false;

  for (auto &[Op, SubIdx] : Defs) {
    // ...
    if (TRI->isAGPR(*MRI, Op->getReg()))
      continue;
    // ...
    const MachineInstr *SubDef = MRI->getVRegDef(Op->getReg());
    // ...
  }

  MachineOperand *Op = &*MRI->use_nodbg_begin(Reg);
  MachineInstr *UseMI = Op->getParent();
  // ...
  if (Op->getSubReg())
    return false;

  // ...
  const TargetRegisterClass *OpRC = TII->getRegClass(InstDesc, OpIdx, TRI);
  if (!OpRC || !TRI->isVectorSuperClass(OpRC))
    return false;

  const auto *NewDstRC = TRI->getEquivalentAGPRClass(MRI->getRegClass(Reg));
  auto Dst = MRI->createVirtualRegister(NewDstRC);
  // ...
                     TII->get(AMDGPU::REG_SEQUENCE), Dst);

  for (auto &[Def, SubIdx] : Defs) {
    Def->setIsKill(false);
    // ...
    MachineInstr *SubDef = MRI->getVRegDef(Def->getReg());
    // ...
  }
  // ...
    RS->eraseFromParent();
  // ...
  if (MRI->use_nodbg_empty(MI.getOperand(0).getReg()))
    MI.eraseFromParent();
  // ...
}
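// isAGPRCopy checks whether a copy, possibly through one intermediate VGPR
// copy, originates from an AGPR. tryFoldPhiAGPR rewrites a VGPR PHI whose
// incoming values are AGPR copies into an AGPR PHI, inserting
// V_ACCVGPR_WRITE copies for the incoming values where needed.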
static bool isAGPRCopy(const SIRegisterInfo &TRI,
                       const MachineRegisterInfo &MRI, const MachineInstr &Copy,
                       Register &OutReg, unsigned &OutSubReg) {
  // ...
  if (TRI.isAGPR(MRI, CopySrcReg)) {
    OutReg = CopySrcReg;
    // ...
  }

  // ...
  if (!CopySrcDef || !CopySrcDef->isCopy())
    return false;

  // ...
      OtherCopySrc.getSubReg() != AMDGPU::NoSubRegister ||
      !TRI.isAGPR(MRI, OtherCopySrcReg))
    return false;

  OutReg = OtherCopySrcReg;
  // ...
}

bool SIFoldOperandsImpl::tryFoldPhiAGPR(MachineInstr &PHI) {
  // ...
  if (!TRI->isVGPR(*MRI, PhiOut))
    return false;

  // ...
  const TargetRegisterClass *ARC = nullptr;
  for (unsigned K = 1; K < PHI.getNumExplicitOperands(); K += 2) {
    MachineOperand &MO = PHI.getOperand(K);
    // ...
    if (!Copy || !Copy->isCopy())
      continue;

    // ...
    unsigned AGPRRegMask = AMDGPU::NoSubRegister;
    // ...
    const TargetRegisterClass *CopyInRC = MRI->getRegClass(AGPRSrc);
    if (const auto *SubRC = TRI->getSubRegisterClass(CopyInRC, AGPRRegMask))
      // ...
  }
  // ...

  bool IsAGPR32 = (ARC == &AMDGPU::AGPR_32RegClass);

  // ...
  for (unsigned K = 1; K < PHI.getNumExplicitOperands(); K += 2) {
    MachineOperand &MO = PHI.getOperand(K);
    // ...
    MachineBasicBlock *InsertMBB = nullptr;

    // ...
    unsigned CopyOpc = AMDGPU::COPY;
    if (MachineInstr *Def = MRI->getVRegDef(Reg)) {
      // ...
      if (Def->isCopy()) {
        // ...
        unsigned AGPRSubReg = AMDGPU::NoSubRegister;
        // ...
      }

      // ...
      MachineOperand &CopyIn = Def->getOperand(1);
      // ...
        CopyOpc = AMDGPU::V_ACCVGPR_WRITE_B32_e64;
      // ...
      InsertMBB = Def->getParent();
      // ...
    }
    // ...

    Register NewReg = MRI->createVirtualRegister(ARC);
    MachineInstr *MI = BuildMI(*InsertMBB, InsertPt, PHI.getDebugLoc(),
                               TII->get(CopyOpc), NewReg)
                           // ...
    // ...
  }

  // ...
  Register NewReg = MRI->createVirtualRegister(ARC);
  PHI.getOperand(0).setReg(NewReg);
  // ...
          TII->get(AMDGPU::COPY), PhiOut)
      // ...
}
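// tryFoldLoad: if a load's result is only copied (possibly through
// REG_SEQUENCEs) into AGPRs, switch the load's destination register class to
// AGPR so the intermediate VGPR copies can be removed.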
bool SIFoldOperandsImpl::tryFoldLoad(MachineInstr &MI) {
  // ...
  MachineOperand &Def = MI.getOperand(0);
  // ...

  while (!Users.empty()) {
    const MachineInstr *I = Users.pop_back_val();
    if (!I->isCopy() && !I->isRegSequence())
      return false;
    Register DstReg = I->getOperand(0).getReg();
    // ...
    if (TRI->isAGPR(*MRI, DstReg))
      continue;
    // ...
    for (const MachineInstr &U : MRI->use_nodbg_instructions(DstReg))
      Users.push_back(&U);
  }

  const TargetRegisterClass *RC = MRI->getRegClass(DefReg);
  MRI->setRegClass(DefReg, TRI->getEquivalentAGPRClass(RC));
  if (!TII->isOperandLegal(MI, 0, &Def)) {
    MRI->setRegClass(DefReg, RC);
    return false;
  }

  while (!MoveRegs.empty()) {
    // ...
    MRI->setRegClass(Reg, TRI->getEquivalentAGPRClass(MRI->getRegClass(Reg)));
  }
  // ...
}
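// tryOptimizeAGPRPhis: when several AGPR PHI operands in a block use the same
// register (and subregister), read it once into a VGPR with V_ACCVGPR_READ,
// copy that back into a fresh AGPR, and rewrite those PHI operands to use the
// new register, avoiding repeated expensive copies.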
bool SIFoldOperandsImpl::tryOptimizeAGPRPhis(MachineBasicBlock &MBB) {
  // ...
  DenseMap<std::pair<Register, unsigned>, std::vector<MachineOperand *>>
      RegToMO;

  for (auto &MI : MBB) {
    // ...
    if (!TRI->isAGPR(*MRI, MI.getOperand(0).getReg()))
      continue;

    for (unsigned K = 1; K < MI.getNumOperands(); K += 2) {
      MachineOperand &PhiMO = MI.getOperand(K);
      // ...
    }
  }

  // ...
  for (const auto &[Entry, MOs] : RegToMO) {
    if (MOs.size() == 1)
      continue;

    // ...
    MachineInstr *Def = MRI->getVRegDef(Reg);
    MachineBasicBlock *DefMBB = Def->getParent();

    // ...
    Register TempVGPR =
        MRI->createVirtualRegister(TRI->getEquivalentVGPRClass(ARC));
    MachineInstr *VGPRCopy =
        // ...
                TII->get(AMDGPU::V_ACCVGPR_READ_B32_e64), TempVGPR)
            // ...

    // ...
    Register TempAGPR = MRI->createVirtualRegister(ARC);
    // ...
            TII->get(AMDGPU::COPY), TempAGPR)
        // ...

    // ...
    for (MachineOperand *MO : MOs) {
      // ...
    }
  }
  // ...
}
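// run: the per-function driver. Walks every instruction, applying
// tryFoldZeroHighBits, tryFoldRegSequence, tryFoldPhiAGPR, tryFoldLoad and
// tryFoldFoldableCopy, and invalidates the tracked M0 value when M0 is
// clobbered.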
bool SIFoldOperandsImpl::run(MachineFunction &MF) {
  // ...
  TRI = &TII->getRegisterInfo();
  MFI = MF.getInfo<SIMachineFunctionInfo>();
  // ...
    MachineOperand *CurrentKnownM0Val = nullptr;
    // ...
      if (tryFoldZeroHighBits(MI)) {
        // ...
      }

      if (MI.isRegSequence() && tryFoldRegSequence(MI)) {
        // ...
      }

      if (MI.isPHI() && tryFoldPhiAGPR(MI)) {
        // ...
      }

      if (MI.mayLoad() && tryFoldLoad(MI)) {
        // ...
      }

      if (TII->isFoldableCopy(MI)) {
        Changed |= tryFoldFoldableCopy(MI, CurrentKnownM0Val);
        // ...
      }

      // ...
      if (CurrentKnownM0Val && MI.modifiesRegister(AMDGPU::M0, TRI))
        CurrentKnownM0Val = nullptr;
      // ...
  // ...
}

// ...
  bool Changed = SIFoldOperandsImpl().run(MF);