25#include "llvm/IR/IntrinsicsAMDGPU.h"
27#define DEBUG_TYPE "amdgpu-regbanklegalize"
35 : MF(B.getMF()), ST(MF.getSubtarget<
GCNSubtarget>()), B(B),
36 MRI(*B.getMRI()), MUI(MUI), RBI(RBI), MORE(MF, nullptr),
37 RBLRules(RBLRules), IsWave32(ST.isWave32()),
38 SgprRB(&RBI.getRegBank(
AMDGPU::SGPRRegBankID)),
39 VgprRB(&RBI.getRegBank(
AMDGPU::VGPRRegBankID)),
40 VccRB(&RBI.getRegBank(
AMDGPU::VCCRegBankID)) {}
46 "No AMDGPU RegBankLegalize rules defined for opcode",
54 "AMDGPU RegBankLegalize: none of the rules defined with "
55 "'Any' for MI's opcode matched MI",
63 B.setInsertPt(*
MI.getParent(), std::next(
MI.getIterator()));
73 if (!lower(
MI, *Mapping, WaterfallSgprs))
79bool RegBankLegalizeHelper::executeInWaterfallLoop(
91 unsigned MovExecOpc, MovExecTermOpc, XorTermOpc, AndSaveExecOpc, ExecReg;
93 MovExecOpc = AMDGPU::S_MOV_B32;
94 MovExecTermOpc = AMDGPU::S_MOV_B32_term;
95 XorTermOpc = AMDGPU::S_XOR_B32_term;
96 AndSaveExecOpc = AMDGPU::S_AND_SAVEEXEC_B32;
97 ExecReg = AMDGPU::EXEC_LO;
99 MovExecOpc = AMDGPU::S_MOV_B64;
100 MovExecTermOpc = AMDGPU::S_MOV_B64_term;
101 XorTermOpc = AMDGPU::S_XOR_B64_term;
102 AndSaveExecOpc = AMDGPU::S_AND_SAVEEXEC_B64;
103 ExecReg = AMDGPU::EXEC;
107 const int OrigRangeSize = std::distance(
Range.begin(),
Range.end());
111 Register SaveExecReg =
MRI.createVirtualRegister(WaveRC);
112 Register InitSaveExecReg =
MRI.createVirtualRegister(WaveRC);
115 B.buildInstr(TargetOpcode::IMPLICIT_DEF).addDef(InitSaveExecReg);
117 Register SavedExec =
MRI.createVirtualRegister(WaveRC);
141 MBB.addSuccessor(LoopBB);
144 B.setInsertPt(*LoopBB, LoopBB->
end());
195 auto NewEnd = BodyBB->
end();
196 assert(std::distance(NewBegin, NewEnd) == OrigRangeSize);
204 if (!SGPROperandRegs.
count(OldReg))
209 auto OldVal = WaterfalledRegMap.
find(OldReg);
210 if (OldVal != WaterfalledRegMap.
end()) {
211 Op.setReg(OldVal->second);
216 LLT OpTy = MRI.getType(OpReg);
219 assert(MRI.getRegBank(OpReg) == VgprRB);
220 Register CurrentLaneReg = MRI.createVirtualRegister({SgprRB, OpTy});
225 unsigned PartSize = (OpSize % 64 == 0) ? 64 : 32;
227 unsigned NumParts = OpSize / PartSize;
233 CurrentLaneParts.
push_back(CurrentLaneReg);
235 auto UnmergeOp = B.buildUnmerge({VgprRB, PartTy}, OpReg);
236 auto UnmergeCurrLane = B.buildUnmerge({SgprRB, PartTy}, CurrentLaneReg);
237 for (
unsigned i = 0; i < NumParts; ++i) {
239 CurrentLaneParts.
push_back(UnmergeCurrLane.getReg(i));
243 for (
unsigned i = 0; i < NumParts; ++i) {
244 Register CmpReg = MRI.createVirtualRegister(VccRB_S1);
250 CondReg = B.buildAnd(VccRB_S1, CondReg, CmpReg).getReg(0);
253 Op.setReg(CurrentLaneReg);
256 WaterfalledRegMap.
insert(std::pair(OldReg,
Op.getReg()));
262 MRI.createVirtualRegister({WaveRC,
LLT::scalar(IsWave32 ? 32 : 64)});
263 B.buildIntrinsic(Intrinsic::amdgcn_ballot, CondRegLM).addReg(CondReg);
266 B.buildInstr(AndSaveExecOpc)
269 MRI.setSimpleHint(SavedExec, CondRegLM);
271 B.setInsertPt(*BodyBB, BodyBB->
end());
274 B.buildInstr(XorTermOpc).addDef(ExecReg).addReg(ExecReg).addReg(SavedExec);
280 B.buildInstr(AMDGPU::SI_WATERFALL_LOOP).addMBB(LoopBB);
284 B.buildInstr(MovExecOpc).addDef(SaveExecReg).addReg(ExecReg);
287 B.setInsertPt(*RestoreExecBB, RestoreExecBB->
begin());
288 B.buildInstr(MovExecTermOpc).addDef(ExecReg).addReg(SaveExecReg);
292 B.setInsertPt(*RemainderBB, RemainderBB->
begin());
297bool RegBankLegalizeHelper::splitLoad(MachineInstr &
MI,
299 MachineFunction &MF = B.getMF();
300 assert(
MI.getNumMemOperands() == 1);
301 MachineMemOperand &BaseMMO = **
MI.memoperands_begin();
303 const RegisterBank *DstRB = MRI.getRegBankOrNull(Dst);
305 LLT PtrTy = MRI.getType(
Base);
306 const RegisterBank *PtrRB = MRI.getRegBankOrNull(
Base);
310 unsigned ByteOffset = 0;
311 for (LLT PartTy : LLTBreakdown) {
313 if (ByteOffset == 0) {
314 BasePlusOffset =
Base;
316 auto Offset = B.buildConstant({PtrRB, OffsetTy}, ByteOffset);
320 auto *OffsetMMO = MF.getMachineMemOperand(&BaseMMO, ByteOffset, PartTy);
321 auto LoadPart = B.buildLoad({DstRB, PartTy}, BasePlusOffset, *OffsetMMO);
322 LoadPartRegs.
push_back(LoadPart.getReg(0));
328 B.buildMergeLikeInstr(Dst, LoadPartRegs);
334 if (MRI.getType(
Reg) == MergeTy) {
337 auto Unmerge = B.buildUnmerge({DstRB, MergeTy},
Reg);
338 for (
unsigned i = 0; i < Unmerge->getNumOperands() - 1; ++i)
339 MergeTyParts.
push_back(Unmerge.getReg(i));
342 B.buildMergeLikeInstr(Dst, MergeTyParts);
344 MI.eraseFromParent();
348bool RegBankLegalizeHelper::widenLoad(MachineInstr &
MI, LLT WideTy,
350 MachineFunction &MF = B.getMF();
351 assert(
MI.getNumMemOperands() == 1);
352 MachineMemOperand &BaseMMO = **
MI.memoperands_begin();
354 const RegisterBank *DstRB = MRI.getRegBankOrNull(Dst);
357 MachineMemOperand *WideMMO = MF.getMachineMemOperand(&BaseMMO, 0, WideTy);
358 auto WideLoad = B.buildLoad({DstRB, WideTy},
Base, *WideMMO);
361 B.buildTrunc(Dst, WideLoad);
364 auto Unmerge = B.buildUnmerge({DstRB, MergeTy}, WideLoad);
366 LLT DstTy = MRI.getType(Dst);
368 for (
unsigned i = 0; i < NumElts; ++i) {
369 MergeTyParts.
push_back(Unmerge.getReg(i));
371 B.buildMergeLikeInstr(Dst, MergeTyParts);
373 MI.eraseFromParent();
377bool RegBankLegalizeHelper::widenMMOToS32(GAnyLoad &
MI)
const {
380 MachineMemOperand &MMO =
MI.getMMO();
383 MachineMemOperand *WideMMO = B.getMF().getMachineMemOperand(&MMO, 0, S32);
385 if (
MI.getOpcode() == G_LOAD) {
386 B.buildLoad(Dst, Ptr, *WideMMO);
388 auto Load = B.buildLoad(SgprRB_S32, Ptr, *WideMMO);
390 if (
MI.getOpcode() == G_ZEXTLOAD) {
392 auto MaskCst = B.buildConstant(SgprRB_S32, Mask);
393 B.buildAnd(Dst, Load, MaskCst);
395 assert(
MI.getOpcode() == G_SEXTLOAD);
396 B.buildSExtInReg(Dst, Load, MemSize);
400 MI.eraseFromParent();
404bool RegBankLegalizeHelper::lowerVccExtToSel(MachineInstr &
MI) {
406 LLT Ty = MRI.getType(Dst);
408 unsigned Opc =
MI.getOpcode();
409 int TrueExtCst =
Opc == G_SEXT ? -1 : 1;
410 if (Ty == S32 || Ty == S16) {
411 auto True = B.buildConstant({VgprRB, Ty}, TrueExtCst);
412 auto False = B.buildConstant({VgprRB, Ty}, 0);
413 B.buildSelect(Dst, Src, True, False);
414 }
else if (Ty == S64) {
415 auto True = B.buildConstant({VgprRB_S32}, TrueExtCst);
416 auto False = B.buildConstant({VgprRB_S32}, 0);
417 auto Lo = B.buildSelect({VgprRB_S32}, Src, True, False);
418 MachineInstrBuilder
Hi;
427 Hi = B.buildUndef({VgprRB_S32});
431 MF, MORE,
"amdgpu-regbanklegalize",
432 "AMDGPU RegBankLegalize: lowerVccExtToSel, Opcode not supported",
MI);
436 B.buildMergeValues(Dst, {
Lo.getReg(0),
Hi.getReg(0)});
439 MF, MORE,
"amdgpu-regbanklegalize",
440 "AMDGPU RegBankLegalize: lowerVccExtToSel, Type not supported",
MI);
444 MI.eraseFromParent();
448std::pair<Register, Register> RegBankLegalizeHelper::unpackZExt(
Register Reg) {
449 auto PackedS32 = B.buildBitcast(SgprRB_S32,
Reg);
450 auto Mask = B.buildConstant(SgprRB_S32, 0x0000ffff);
451 auto Lo = B.buildAnd(SgprRB_S32, PackedS32, Mask);
452 auto Hi = B.buildLShr(SgprRB_S32, PackedS32, B.buildConstant(SgprRB_S32, 16));
453 return {
Lo.getReg(0),
Hi.getReg(0)};
456std::pair<Register, Register> RegBankLegalizeHelper::unpackSExt(
Register Reg) {
457 auto PackedS32 = B.buildBitcast(SgprRB_S32,
Reg);
458 auto Lo = B.buildSExtInReg(SgprRB_S32, PackedS32, 16);
459 auto Hi = B.buildAShr(SgprRB_S32, PackedS32, B.buildConstant(SgprRB_S32, 16));
460 return {
Lo.getReg(0),
Hi.getReg(0)};
463std::pair<Register, Register> RegBankLegalizeHelper::unpackAExt(
Register Reg) {
464 auto PackedS32 = B.buildBitcast(SgprRB_S32,
Reg);
466 auto Hi = B.buildLShr(SgprRB_S32, PackedS32, B.buildConstant(SgprRB_S32, 16));
467 return {
Lo.getReg(0),
Hi.getReg(0)};
470std::pair<Register, Register>
471RegBankLegalizeHelper::unpackAExtTruncS16(
Register Reg) {
472 auto [Lo32, Hi32] = unpackAExt(
Reg);
473 return {B.buildTrunc(SgprRB_S16, Lo32).getReg(0),
474 B.buildTrunc(SgprRB_S16, Hi32).getReg(0)};
477bool RegBankLegalizeHelper::lowerUnpackBitShift(MachineInstr &
MI) {
479 switch (
MI.getOpcode()) {
480 case AMDGPU::G_SHL: {
481 auto [Val0, Val1] = unpackAExt(
MI.getOperand(1).getReg());
482 auto [Amt0, Amt1] = unpackAExt(
MI.getOperand(2).getReg());
483 Lo = B.buildInstr(
MI.getOpcode(), {SgprRB_S32}, {Val0, Amt0}).getReg(0);
484 Hi = B.buildInstr(
MI.getOpcode(), {SgprRB_S32}, {Val1, Amt1}).getReg(0);
487 case AMDGPU::G_LSHR: {
488 auto [Val0, Val1] = unpackZExt(
MI.getOperand(1).getReg());
489 auto [Amt0, Amt1] = unpackZExt(
MI.getOperand(2).getReg());
490 Lo = B.buildInstr(
MI.getOpcode(), {SgprRB_S32}, {Val0, Amt0}).getReg(0);
491 Hi = B.buildInstr(
MI.getOpcode(), {SgprRB_S32}, {Val1, Amt1}).getReg(0);
494 case AMDGPU::G_ASHR: {
495 auto [Val0, Val1] = unpackSExt(
MI.getOperand(1).getReg());
496 auto [Amt0, Amt1] = unpackSExt(
MI.getOperand(2).getReg());
497 Lo = B.buildAShr(SgprRB_S32, Val0, Amt0).getReg(0);
498 Hi = B.buildAShr(SgprRB_S32, Val1, Amt1).getReg(0);
503 MF, MORE,
"amdgpu-regbanklegalize",
504 "AMDGPU RegBankLegalize: lowerUnpackBitShift, case not implemented",
508 B.buildBuildVectorTrunc(
MI.getOperand(0).getReg(), {Lo, Hi});
509 MI.eraseFromParent();
513bool RegBankLegalizeHelper::lowerUnpackMinMax(MachineInstr &
MI) {
515 switch (
MI.getOpcode()) {
517 case AMDGPU::G_SMAX: {
519 auto [Val0_Lo, Val0_Hi] = unpackSExt(
MI.getOperand(1).getReg());
520 auto [Val1_Lo, Val1_Hi] = unpackSExt(
MI.getOperand(2).getReg());
521 Lo = B.buildInstr(
MI.getOpcode(), {SgprRB_S32}, {Val0_Lo, Val1_Lo})
523 Hi = B.buildInstr(
MI.getOpcode(), {SgprRB_S32}, {Val0_Hi, Val1_Hi})
528 case AMDGPU::G_UMAX: {
530 auto [Val0_Lo, Val0_Hi] = unpackZExt(
MI.getOperand(1).getReg());
531 auto [Val1_Lo, Val1_Hi] = unpackZExt(
MI.getOperand(2).getReg());
532 Lo = B.buildInstr(
MI.getOpcode(), {SgprRB_S32}, {Val0_Lo, Val1_Lo})
534 Hi = B.buildInstr(
MI.getOpcode(), {SgprRB_S32}, {Val0_Hi, Val1_Hi})
540 MF, MORE,
"amdgpu-regbanklegalize",
541 "AMDGPU RegBankLegalize: lowerUnpackMinMax, case not implemented",
MI);
544 B.buildBuildVectorTrunc(
MI.getOperand(0).getReg(), {Lo, Hi});
545 MI.eraseFromParent();
549bool RegBankLegalizeHelper::lowerUnpackAExt(MachineInstr &
MI) {
550 auto [Op1Lo, Op1Hi] = unpackAExt(
MI.getOperand(1).getReg());
551 auto [Op2Lo, Op2Hi] = unpackAExt(
MI.getOperand(2).getReg());
552 auto ResLo = B.buildInstr(
MI.getOpcode(), {SgprRB_S32}, {Op1Lo, Op2Lo});
553 auto ResHi = B.buildInstr(
MI.getOpcode(), {SgprRB_S32}, {Op1Hi, Op2Hi});
554 B.buildBuildVectorTrunc(
MI.getOperand(0).getReg(),
555 {ResLo.getReg(0), ResHi.getReg(0)});
556 MI.eraseFromParent();
562 return (GI->is(Intrinsic::amdgcn_sbfe));
564 return MI.getOpcode() == AMDGPU::G_SBFX;
567bool RegBankLegalizeHelper::lowerV_BFE(MachineInstr &
MI) {
574 Register Src =
MI.getOperand(FirstOpnd).getReg();
575 Register LSBit =
MI.getOperand(FirstOpnd + 1).getReg();
576 Register Width =
MI.getOperand(FirstOpnd + 2).getReg();
581 unsigned SHROpc =
Signed ? AMDGPU::G_ASHR : AMDGPU::G_LSHR;
582 auto SHRSrc = B.buildInstr(SHROpc, {{VgprRB, S64}}, {Src, LSBit});
590 auto Amt = B.buildSub(VgprRB_S32, B.buildConstant(SgprRB_S32, 64), Width);
591 auto SignBit = B.buildShl({VgprRB, S64}, SHRSrc, Amt);
592 B.buildInstr(SHROpc, {Dst}, {SignBit, Amt});
593 MI.eraseFromParent();
597 uint64_t WidthImm = ConstWidth->Value.getZExtValue();
598 auto UnmergeSHRSrc = B.buildUnmerge(VgprRB_S32, SHRSrc);
599 Register SHRSrcLo = UnmergeSHRSrc.getReg(0);
600 Register SHRSrcHi = UnmergeSHRSrc.getReg(1);
601 auto Zero = B.buildConstant({VgprRB, S32}, 0);
602 unsigned BFXOpc =
Signed ? AMDGPU::G_SBFX : AMDGPU::G_UBFX;
604 if (WidthImm <= 32) {
606 auto Lo = B.buildInstr(BFXOpc, {VgprRB_S32}, {SHRSrcLo,
Zero, Width});
607 MachineInstrBuilder
Hi;
610 Hi = B.buildAShr(VgprRB_S32,
Lo, B.buildConstant(VgprRB_S32, 31));
615 B.buildMergeLikeInstr(Dst, {
Lo,
Hi});
617 auto Amt = B.buildConstant(VgprRB_S32, WidthImm - 32);
619 auto Hi = B.buildInstr(BFXOpc, {VgprRB_S32}, {SHRSrcHi,
Zero, Amt});
620 B.buildMergeLikeInstr(Dst, {SHRSrcLo,
Hi});
623 MI.eraseFromParent();
627bool RegBankLegalizeHelper::lowerS_BFE(MachineInstr &
MI) {
629 LLT Ty = MRI.getType(DstReg);
632 Register Src =
MI.getOperand(FirstOpnd).getReg();
633 Register LSBit =
MI.getOperand(FirstOpnd + 1).getReg();
634 Register Width =
MI.getOperand(FirstOpnd + 2).getReg();
641 auto FieldOffset = B.buildAnd(SgprRB_S32, LSBit, Mask);
642 auto Size = B.buildShl(SgprRB_S32, Width, B.buildConstant(SgprRB_S32, 16));
643 auto Src1 = B.buildOr(SgprRB_S32, FieldOffset,
Size);
644 unsigned Opc32 =
Signed ? AMDGPU::S_BFE_I32 : AMDGPU::S_BFE_U32;
645 unsigned Opc64 =
Signed ? AMDGPU::S_BFE_I64 : AMDGPU::S_BFE_U64;
646 unsigned Opc = Ty == S32 ? Opc32 : Opc64;
650 auto S_BFE = B.buildInstr(
Opc, {{SgprRB, Ty}},
651 {B.buildCopy(Ty, Src), B.buildCopy(S32, Src1)});
653 *ST.getRegisterInfo(), RBI)) {
655 MF, MORE,
"amdgpu-regbanklegalize",
656 "AMDGPU RegBankLegalize: lowerS_BFE, failed to constrain BFE",
MI);
660 B.buildCopy(DstReg,
S_BFE->getOperand(0).getReg());
661 MI.eraseFromParent();
665bool RegBankLegalizeHelper::lowerSplitTo32(MachineInstr &
MI) {
667 LLT DstTy = MRI.getType(Dst);
668 assert(DstTy == V4S16 || DstTy == V2S32 || DstTy == S64);
669 LLT Ty = DstTy == V4S16 ? V2S16 : S32;
670 auto Op1 = B.buildUnmerge({VgprRB, Ty},
MI.getOperand(1).
getReg());
671 auto Op2 = B.buildUnmerge({VgprRB, Ty},
MI.getOperand(2).
getReg());
672 unsigned Opc =
MI.getOpcode();
675 B.buildInstr(
Opc, {{VgprRB, Ty}}, {Op1.getReg(0), Op2.getReg(0)},
Flags);
677 B.buildInstr(
Opc, {{VgprRB, Ty}}, {Op1.getReg(1), Op2.getReg(1)},
Flags);
678 B.buildMergeLikeInstr(Dst, {
Lo,
Hi});
679 MI.eraseFromParent();
683bool RegBankLegalizeHelper::lowerSplitTo16(MachineInstr &
MI) {
685 assert(MRI.getType(Dst) == V2S16);
686 unsigned Opc =
MI.getOpcode();
687 unsigned NumOps =
MI.getNumOperands();
690 auto [Op1Lo, Op1Hi] = unpackAExtTruncS16(
MI.getOperand(1).getReg());
693 auto Lo = B.buildInstr(
Opc, {SgprRB_S16}, {Op1Lo},
Flags);
694 auto Hi = B.buildInstr(
Opc, {SgprRB_S16}, {Op1Hi},
Flags);
695 B.buildMergeLikeInstr(Dst, {
Lo,
Hi});
696 MI.eraseFromParent();
700 auto [Op2Lo, Op2Hi] = unpackAExtTruncS16(
MI.getOperand(2).getReg());
703 auto Lo = B.buildInstr(
Opc, {SgprRB_S16}, {Op1Lo, Op2Lo},
Flags);
704 auto Hi = B.buildInstr(
Opc, {SgprRB_S16}, {Op1Hi, Op2Hi},
Flags);
705 B.buildMergeLikeInstr(Dst, {
Lo,
Hi});
706 MI.eraseFromParent();
711 auto [Op3Lo, Op3Hi] = unpackAExtTruncS16(
MI.getOperand(3).getReg());
712 auto Lo = B.buildInstr(
Opc, {SgprRB_S16}, {Op1Lo, Op2Lo, Op3Lo},
Flags);
713 auto Hi = B.buildInstr(
Opc, {SgprRB_S16}, {Op1Hi, Op2Hi, Op3Hi},
Flags);
714 B.buildMergeLikeInstr(Dst, {
Lo,
Hi});
715 MI.eraseFromParent();
719bool RegBankLegalizeHelper::lowerSplitTo32Select(MachineInstr &
MI) {
721 LLT DstTy = MRI.getType(Dst);
722 assert(DstTy == V4S16 || DstTy == V2S32 || DstTy == S64 ||
724 LLT Ty = DstTy == V4S16 ? V2S16 : S32;
725 auto Op2 = B.buildUnmerge({VgprRB, Ty},
MI.getOperand(2).
getReg());
726 auto Op3 = B.buildUnmerge({VgprRB, Ty},
MI.getOperand(3).
getReg());
730 B.buildSelect({VgprRB, Ty},
Cond, Op2.getReg(0), Op3.getReg(0), Flags);
732 B.buildSelect({VgprRB, Ty},
Cond, Op2.getReg(1), Op3.getReg(1), Flags);
734 B.buildMergeLikeInstr(Dst, {
Lo,
Hi});
735 MI.eraseFromParent();
739bool RegBankLegalizeHelper::lowerSplitTo32SExtInReg(MachineInstr &
MI) {
740 auto Op1 = B.buildUnmerge(VgprRB_S32,
MI.getOperand(1).getReg());
741 int Amt =
MI.getOperand(2).getImm();
745 auto Freeze = B.buildFreeze(VgprRB_S32, Op1.getReg(0));
748 Lo = Freeze.getReg(0);
751 Lo = B.buildSExtInReg(VgprRB_S32, Freeze, Amt).getReg(0);
754 auto SignExtCst = B.buildConstant(SgprRB_S32, 31);
755 Hi = B.buildAShr(VgprRB_S32,
Lo, SignExtCst).getReg(0);
759 Hi = B.buildSExtInReg(VgprRB_S32, Op1.getReg(1), Amt - 32).getReg(0);
762 B.buildMergeLikeInstr(
MI.getOperand(0).getReg(), {Lo, Hi});
763 MI.eraseFromParent();
767bool RegBankLegalizeHelper::lower(MachineInstr &
MI,
769 SmallSet<Register, 4> &WaterfallSgprs) {
775 return lowerVccExtToSel(
MI);
777 LLT Ty = MRI.getType(
MI.getOperand(0).getReg());
778 auto True = B.buildConstant({SgprRB, Ty},
779 MI.getOpcode() == AMDGPU::G_SEXT ? -1 : 1);
780 auto False = B.buildConstant({SgprRB, Ty}, 0);
784 B.buildSelect(
MI.getOperand(0).getReg(),
MI.getOperand(1).getReg(), True,
786 MI.eraseFromParent();
790 return lowerUnpackBitShift(
MI);
792 return lowerUnpackMinMax(
MI);
794 return lowerSplitTo16(
MI);
796 const RegisterBank *RB = MRI.getRegBank(
MI.getOperand(0).getReg());
797 MachineInstrBuilder
Hi;
798 switch (
MI.getOpcode()) {
799 case AMDGPU::G_ZEXT: {
800 Hi = B.buildConstant({RB, S32}, 0);
803 case AMDGPU::G_SEXT: {
805 auto ShiftAmt = B.buildConstant({RB, S32}, 31);
806 Hi = B.buildAShr({RB, S32},
MI.getOperand(1).
getReg(), ShiftAmt);
809 case AMDGPU::G_ANYEXT: {
810 Hi = B.buildUndef({RB, S32});
815 "AMDGPU RegBankLegalize: Ext32To64, unsuported opcode",
820 B.buildMergeLikeInstr(
MI.getOperand(0).getReg(),
821 {MI.getOperand(1).getReg(), Hi});
822 MI.eraseFromParent();
826 uint64_t ConstVal =
MI.getOperand(1).getCImm()->getZExtValue();
827 B.buildConstant(
MI.getOperand(0).getReg(), ConstVal);
829 MI.eraseFromParent();
834 LLT Ty = MRI.getType(Src);
838 Register BoolSrc = MRI.createVirtualRegister({VgprRB, Ty});
840 auto Src64 = B.buildUnmerge(VgprRB_S32, Src);
841 auto One = B.buildConstant(VgprRB_S32, 1);
842 auto AndLo = B.buildAnd(VgprRB_S32, Src64.getReg(0), One);
843 auto Zero = B.buildConstant(VgprRB_S32, 0);
844 auto AndHi = B.buildAnd(VgprRB_S32, Src64.getReg(1), Zero);
845 B.buildMergeLikeInstr(BoolSrc, {AndLo, AndHi});
847 assert(Ty == S32 || Ty == S16);
848 auto One = B.buildConstant({VgprRB, Ty}, 1);
849 B.buildAnd(BoolSrc, Src, One);
851 auto Zero = B.buildConstant({VgprRB, Ty}, 0);
853 MI.eraseFromParent();
857 return lowerV_BFE(
MI);
859 return lowerS_BFE(
MI);
861 return lowerSplitTo32(
MI);
863 return lowerSplitTo32Select(
MI);
865 return lowerSplitTo32SExtInReg(
MI);
867 LLT DstTy = MRI.getType(
MI.getOperand(0).getReg());
880 else if (
Size / 128 == 4)
884 "AMDGPU RegBankLegalize: SplitLoad, unsuported type",
890 else if (DstTy == S96)
891 splitLoad(
MI, {S64, S32}, S32);
892 else if (DstTy == V3S32)
893 splitLoad(
MI, {V2S32, S32}, S32);
894 else if (DstTy == V6S16)
895 splitLoad(
MI, {V4S16, V2S16}, V2S16);
898 "AMDGPU RegBankLegalize: SplitLoad, unsuported type",
905 LLT DstTy = MRI.getType(
MI.getOperand(0).getReg());
908 else if (DstTy == V3S32)
909 widenLoad(
MI, V4S32, S32);
910 else if (DstTy == V6S16)
911 widenLoad(
MI, V8S16, V2S16);
914 "AMDGPU RegBankLegalize: WidenLoad, unsuported type",
921 return lowerUnpackAExt(
MI);
926 if (!WaterfallSgprs.
empty()) {
928 if (!executeInWaterfallLoop(B,
make_range(
I, std::next(
I)), WaterfallSgprs))
1008 return isAnyPtr(Ty, 32) ? Ty : LLT();
1011 return isAnyPtr(Ty, 64) ? Ty : LLT();
1014 return isAnyPtr(Ty, 128) ? Ty : LLT();
1132bool RegBankLegalizeHelper::applyMappingDst(
1133 MachineInstr &
MI,
unsigned &
OpIdx,
1134 const SmallVectorImpl<RegBankLLTMappingApplyID> &MethodIDs) {
1139 MachineOperand &
Op =
MI.getOperand(
OpIdx);
1141 LLT Ty = MRI.getType(
Reg);
1142 [[maybe_unused]]
const RegisterBank *RB = MRI.getRegBank(
Reg);
1144 switch (MethodIDs[
OpIdx]) {
1203 Register NewDst = MRI.createVirtualRegister(VccRB_S1);
1206 B.buildInstr(AMDGPU::G_AMDGPU_COPY_SCC_VCC, {SgprRB_S32}, {NewDst});
1207 B.buildTrunc(
Reg, CopyS32_Vcc);
1213 Register NewVgprDstS16 = MRI.createVirtualRegister({VgprRB, S16});
1214 Register NewVgprDstS32 = MRI.createVirtualRegister({VgprRB, S32});
1215 Register NewSgprDstS32 = MRI.createVirtualRegister({SgprRB, S32});
1216 Op.setReg(NewVgprDstS16);
1217 B.buildAnyExt(NewVgprDstS32, NewVgprDstS16);
1219 B.buildTrunc(
Reg, NewSgprDstS32);
1229 Register NewVgprDst = MRI.createVirtualRegister({VgprRB, Ty});
1230 Op.setReg(NewVgprDst);
1242 Register NewVgprDst = MRI.createVirtualRegister({VgprRB, Ty});
1243 Op.setReg(NewVgprDst);
1251 Register NewDst = MRI.createVirtualRegister(SgprRB_S32);
1253 if (!MRI.use_empty(
Reg))
1254 B.buildTrunc(
Reg, NewDst);
1259 MF, MORE,
"amdgpu-regbanklegalize",
1260 "AMDGPU RegBankLegalize: missing fast rule ('Div' or 'Uni') for",
MI);
1265 MF, MORE,
"amdgpu-regbanklegalize",
1266 "AMDGPU RegBankLegalize: applyMappingDst, ID not supported",
MI);
1274bool RegBankLegalizeHelper::applyMappingSrc(
1275 MachineInstr &
MI,
unsigned &
OpIdx,
1276 const SmallVectorImpl<RegBankLLTMappingApplyID> &MethodIDs,
1277 SmallSet<Register, 4> &SgprWaterfallOperandRegs) {
1278 for (
unsigned i = 0; i < MethodIDs.
size(); ++
OpIdx, ++i) {
1279 if (MethodIDs[i] ==
None || MethodIDs[i] ==
IntrId || MethodIDs[i] ==
Imm)
1282 MachineOperand &
Op =
MI.getOperand(
OpIdx);
1284 LLT Ty = MRI.getType(
Reg);
1285 const RegisterBank *RB = MRI.getRegBank(
Reg);
1287 switch (MethodIDs[i]) {
1290 assert(RB == VccRB || RB == SgprRB);
1292 auto Aext = B.buildAnyExt(SgprRB_S32,
Reg);
1294 B.buildInstr(AMDGPU::G_AMDGPU_COPY_VCC_SCC, {VccRB_S1}, {Aext});
1295 Op.setReg(CopyVcc_Scc.getReg(0));
1313 assert(Ty == getTyFromID(MethodIDs[i]));
1314 assert(RB == getRegBankFromID(MethodIDs[i]));
1327 assert(Ty == getBTyFromID(MethodIDs[i], Ty));
1328 assert(RB == getRegBankFromID(MethodIDs[i]));
1344 assert(Ty == getTyFromID(MethodIDs[i]));
1346 auto CopyToVgpr = B.buildCopy({VgprRB, Ty},
Reg);
1347 Op.setReg(CopyToVgpr.getReg(0));
1361 assert(Ty == getBTyFromID(MethodIDs[i], Ty));
1363 auto CopyToVgpr = B.buildCopy({VgprRB, Ty},
Reg);
1364 Op.setReg(CopyToVgpr.getReg(0));
1371 assert(Ty == getTyFromID(MethodIDs[i]));
1381 auto Aext = B.buildAnyExt(SgprRB_S32,
Reg);
1382 Op.setReg(Aext.getReg(0));
1389 auto Aext = B.buildAnyExt(SgprRB_S32,
Reg);
1392 auto Cst1 = B.buildConstant(SgprRB_S32, 1);
1393 auto BoolInReg = B.buildAnd(SgprRB_S32, Aext, Cst1);
1394 Op.setReg(BoolInReg.getReg(0));
1400 auto Sext = B.buildSExt(SgprRB_S32,
Reg);
1401 Op.setReg(Sext.getReg(0));
1407 auto Zext = B.buildZExt({SgprRB, S32},
Reg);
1408 Op.setReg(Zext.getReg(0));
1415 auto Sext = B.buildSExt({VgprRB, S32},
Reg);
1416 Op.setReg(Sext.getReg(0));
1423 auto Zext = B.buildZExt({VgprRB, S32},
Reg);
1424 Op.setReg(Zext.getReg(0));
1429 MF, MORE,
"amdgpu-regbanklegalize",
1430 "AMDGPU RegBankLegalize: applyMappingSrc, ID not supported",
MI);
1439 LLT Ty = MRI.getType(Dst);
1442 B.setInsertPt(*
MI.getParent(),
MI.getParent()->getFirstNonPHI());
1444 Register NewDst = MRI.createVirtualRegister(SgprRB_S32);
1445 MI.getOperand(0).setReg(NewDst);
1446 B.buildTrunc(Dst, NewDst);
1448 for (
unsigned i = 1; i <
MI.getNumOperands(); i += 2) {
1451 auto DefMI = MRI.getVRegDef(
UseReg)->getIterator();
1456 auto NewUse = B.buildAnyExt(SgprRB_S32,
UseReg);
1457 MI.getOperand(i).setReg(NewUse.getReg(0));
1466 if (Ty ==
LLT::scalar(1) && MUI.isDivergent(Dst)) {
1468 "AMDGPU RegBankLegalize: Can't lower divergent S1 G_PHI",
1482 "AMDGPU RegBankLegalize: type not supported for G_PHI",
1490 unsigned StartOpIdx,
1491 unsigned EndOpIdx) {
1492 for (
unsigned i = StartOpIdx; i <= EndOpIdx; ++i) {
1493 if (
MRI.getRegBankOrNull(
MI.getOperand(i).getReg()) != RB)
1500 const RegisterBank *RB = MRI.getRegBank(
MI.getOperand(0).getReg());
1502 unsigned NumDefs =
MI.getNumDefs();
1503 unsigned NumOperands =
MI.getNumOperands();
1511 for (
unsigned i = NumDefs; i < NumOperands; ++i) {
1513 if (MRI.getRegBank(Reg) != RB) {
1514 auto Copy = B.buildCopy({VgprRB, MRI.getType(Reg)}, Reg);
1515 MI.getOperand(i).setReg(Copy.getReg(0));
unsigned const MachineRegisterInfo * MRI
MachineInstrBuilder MachineInstrBuilder & DefMI
assert(UImm &&(UImm !=~static_cast< T >(0)) &&"Invalid immediate!")
Contains the definition of a TargetInstrInfo class that is common to all AMD GPUs.
Provides AMDGPU specific target descriptions.
static bool isSignedBFE(MachineInstr &MI)
static bool verifyRegBankOnOperands(MachineInstr &MI, const RegisterBank *RB, MachineRegisterInfo &MRI, unsigned StartOpIdx, unsigned EndOpIdx)
This file declares the targeting of the RegisterBankInfo class for AMDGPU.
MachineBasicBlock MachineBasicBlock::iterator MBBI
static GCRegistry::Add< OcamlGC > B("ocaml", "ocaml 3.10-compatible GC")
AMD GCN specific subclass of TargetSubtarget.
Declares convenience wrapper classes for interpreting MachineInstr instances as specific generic oper...
static Register UseReg(const MachineOperand &MO)
const size_t AbstractManglingParser< Derived, Alloc >::NumOps
This file declares the MachineIRBuilder class.
Register const TargetRegisterInfo * TRI
Promote Memory to Register
static MCRegister getReg(const MCDisassembler *D, unsigned RC, unsigned RegNo)
MachineInstr unsigned OpIdx
ConstantRange Range(APInt(BitWidth, Low), APInt(BitWidth, High))
const SmallVectorImpl< MachineOperand > & Cond
bool findRuleAndApplyMapping(MachineInstr &MI)
bool applyMappingPHI(MachineInstr &MI)
void applyMappingTrivial(MachineInstr &MI)
RegBankLegalizeHelper(MachineIRBuilder &B, const MachineUniformityInfo &MUI, const RegisterBankInfo &RBI, const RegBankLegalizeRules &RBLRules)
const RegBankLLTMapping * findMappingForMI(const MachineInstr &MI, const MachineRegisterInfo &MRI, const MachineUniformityInfo &MUI) const
static APInt getLowBitsSet(unsigned numBits, unsigned loBitsSet)
Constructs an APInt value that has the bottom loBitsSet bits set.
iterator find(const_arg_type_t< KeyT > Val)
std::pair< iterator, bool > insert(const std::pair< KeyT, ValueT > &KV)
const SIRegisterInfo * getRegisterInfo() const override
Represents a call to an intrinsic.
constexpr bool isScalar() const
static constexpr LLT scalar(unsigned SizeInBits)
Get a low-level scalar or aggregate "bag of bits".
constexpr bool isValid() const
constexpr bool isVector() const
static constexpr LLT pointer(unsigned AddressSpace, unsigned SizeInBits)
Get a low-level pointer in the given address space.
constexpr TypeSize getSizeInBits() const
Returns the total size of the type. Must only be called on sized types.
constexpr bool isPointer() const
constexpr LLT getElementType() const
Returns the vector's element type. Only valid for vector types.
static constexpr LLT fixed_vector(unsigned NumElements, unsigned ScalarSizeInBits)
Get a low-level fixed-width vector of some number of elements and element width.
constexpr TypeSize getSizeInBytes() const
Returns the total size of the type in bytes, i.e.
TypeSize getValue() const
LLVM_ABI void transferSuccessorsAndUpdatePHIs(MachineBasicBlock *FromMBB)
Transfers all the successors, as in transferSuccessors, and update PHI operands in the successor bloc...
LLVM_ABI iterator SkipPHIsAndLabels(iterator I)
Return the first instruction in MBB after I that is not a PHI or a label.
LLVM_ABI void addSuccessor(MachineBasicBlock *Succ, BranchProbability Prob=BranchProbability::getUnknown())
Add Succ as a successor of this MachineBasicBlock.
void splice(iterator Where, MachineBasicBlock *Other, iterator From)
Take an instruction from MBB 'Other' at the position From, and insert it into this MBB right before '...
MachineInstrBundleIterator< MachineInstr > iterator
BasicBlockListType::iterator iterator
MachineBasicBlock * CreateMachineBasicBlock(const BasicBlock *BB=nullptr, std::optional< UniqueBBID > BBID=std::nullopt)
CreateMachineInstr - Allocate a new MachineInstr.
void insert(iterator MBBI, MachineBasicBlock *MBB)
Helper class to build MachineInstr.
Representation of each machine instruction.
LocationSize getSize() const
Return the size in bytes of the memory reference.
MachineOperand class - Representation of each machine instruction operand.
MachineRegisterInfo - Keep track of information for virtual and physical registers,...
Holds all the information related to register banks.
This class implements the register bank concept.
Wrapper class representing virtual and physical registers.
SmallSet - This maintains a set of unique values, optimizing for the case when the set is small (less...
size_type count(const T &V) const
count - Return 1 if the element is in the set, 0 otherwise.
std::pair< const_iterator, bool > insert(const T &V)
insert - Insert an element into the set if it isn't already there.
void push_back(const T &Elt)
A range adaptor for a pair of iterators.
bool isAnyPtr(LLT Ty, unsigned Width)
void buildReadAnyLane(MachineIRBuilder &B, Register SgprDst, Register VgprSrc, const RegisterBankInfo &RBI)
void buildReadFirstLane(MachineIRBuilder &B, Register SgprDst, Register VgprSrc, const RegisterBankInfo &RBI)
constexpr std::underlying_type_t< E > Mask()
Get a bitmask with 1s in all places up to the high-order bit of E's largest value.
unsigned ID
LLVM IR allows to use arbitrary numbers as calling convention identifiers.
@ Kill
The last use of a register.
This is an optimization pass for GlobalISel generic memory operations.
GenericUniformityInfo< MachineSSAContext > MachineUniformityInfo
decltype(auto) dyn_cast(const From &Val)
dyn_cast<X> - Return the argument parameter cast to the specified type.
LLVM_ABI bool constrainSelectedInstRegOperands(MachineInstr &I, const TargetInstrInfo &TII, const TargetRegisterInfo &TRI, const RegisterBankInfo &RBI)
Mutate the newly-selected instruction I to constrain its (possibly generic) virtual register operands...
iterator_range< T > make_range(T x, T y)
Convenience function for iterating over sub-ranges.
LLVM_ABI void reportGISelFailure(MachineFunction &MF, MachineOptimizationRemarkEmitter &MORE, MachineOptimizationRemarkMissed &R)
Report an ISel error as a missed optimization remark to the LLVMContext's diagnostic stream.
class LLVM_GSL_OWNER SmallVector
Forward declaration of SmallVector so that calculateSmallVectorDefaultInlinedElements can reference s...
bool isa(const From &Val)
isa<X> - Return true if the parameter to the template is an instance of one of the template type argu...
DWARFExpression::Operation Op
ArrayRef(const T &OneElt) -> ArrayRef< T >
decltype(auto) cast(const From &Val)
cast<X> - Return the argument parameter cast to the specified type.
LLVM_ABI std::optional< ValueAndVReg > getIConstantVRegValWithLookThrough(Register VReg, const MachineRegisterInfo &MRI, bool LookThroughInstrs=true)
If VReg is defined by a statically evaluable chain of instructions rooted on a G_CONSTANT returns its...
constexpr T maskTrailingOnes(unsigned N)
Create a bitmask with the N right-most bits set to 1, and all other bits set to 0.
LoweringMethodID LoweringMethod
SmallVector< RegBankLLTMappingApplyID, 2 > DstOpMapping
SmallVector< RegBankLLTMappingApplyID, 4 > SrcOpMapping