//===-- AMDGPURegBankLegalizeHelper.cpp -----------------------------------===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
/// Implements actual lowering algorithms for each ID that can be used in
/// Rule.OperandMapping. Similar to legalizer helper but with register banks.
//
//===----------------------------------------------------------------------===//

#include "AMDGPURegBankLegalizeHelper.h"
#include "AMDGPUGlobalISelUtils.h"
#include "AMDGPUInstrInfo.h"
#include "AMDGPURegBankLegalizeRules.h"
#include "AMDGPURegisterBankInfo.h"
#include "GCNSubtarget.h"
#include "llvm/CodeGen/GlobalISel/GenericMachineInstrs.h"
#include "llvm/CodeGen/GlobalISel/MIPatternMatch.h"
#include "llvm/CodeGen/GlobalISel/MachineIRBuilder.h"
#include "llvm/CodeGen/MachineUniformityAnalysis.h"
#include "llvm/IR/IntrinsicsAMDGPU.h"

#define DEBUG_TYPE "amdgpu-regbanklegalize"

using namespace llvm;
using namespace AMDGPU;

RegBankLegalizeHelper::RegBankLegalizeHelper(
    MachineIRBuilder &B, const MachineUniformityInfo &MUI,
    const RegisterBankInfo &RBI, const RegBankLegalizeRules &RBLRules)
    : MF(B.getMF()), ST(MF.getSubtarget<GCNSubtarget>()), B(B),
      MRI(*B.getMRI()), MUI(MUI), RBI(RBI), MORE(MF, nullptr),
      RBLRules(RBLRules), IsWave32(ST.isWave32()),
      SgprRB(&RBI.getRegBank(AMDGPU::SGPRRegBankID)),
      VgprRB(&RBI.getRegBank(AMDGPU::VGPRRegBankID)),
      VccRB(&RBI.getRegBank(AMDGPU::VCCRegBankID)) {}

bool RegBankLegalizeHelper::findRuleAndApplyMapping(MachineInstr &MI) {
  const SetOfRulesForOpcode *RuleSet = RBLRules.getRulesForOpc(MI);
  if (!RuleSet) {
    reportGISelFailure(MF, MORE, "amdgpu-regbanklegalize",
                       "No AMDGPU RegBankLegalize rules defined for opcode",
                       MI);
    return false;
  }

  const RegBankLLTMapping *Mapping = RuleSet->findMappingForMI(MI, MRI, MUI);
  if (!Mapping) {
    reportGISelFailure(MF, MORE, "amdgpu-regbanklegalize",
                       "AMDGPU RegBankLegalize: none of the rules defined with "
                       "'Any' for MI's opcode matched MI",
                       MI);
    return false;
  }

  WaterfallInfo WFI;
  unsigned OpIdx = 0;
  if (!Mapping->DstOpMapping.empty()) {
    B.setInsertPt(*MI.getParent(), std::next(MI.getIterator()));
    if (!applyMappingDst(MI, OpIdx, Mapping->DstOpMapping))
      return false;
  }
  if (!Mapping->SrcOpMapping.empty()) {
    B.setInstr(MI);
    if (!applyMappingSrc(MI, OpIdx, Mapping->SrcOpMapping, WFI))
      return false;
  }

  if (!lower(MI, *Mapping, WFI))
    return false;

  return true;
}

bool RegBankLegalizeHelper::executeInWaterfallLoop(MachineIRBuilder &B,
                                                   const WaterfallInfo &WFI) {
  assert(WFI.Start.isValid() && WFI.End.isValid() &&
         "Waterfall range not initialized");

  // Track use registers which have already been expanded with a readfirstlane
  // sequence. This may have multiple uses if moving a sequence.
  DenseMap<Register, Register> WaterfalledRegMap;

  MachineBasicBlock &MBB = B.getMBB();
  MachineFunction &MF = B.getMF();

  MachineBasicBlock::iterator BeginIt = WFI.Start;
  MachineBasicBlock::iterator EndIt = WFI.End;

  const SIRegisterInfo *TRI = ST.getRegisterInfo();
  const TargetRegisterClass *WaveRC = TRI->getWaveMaskRegClass();
  unsigned MovExecOpc, MovExecTermOpc, XorTermOpc, AndSaveExecOpc, ExecReg;
  if (IsWave32) {
    MovExecOpc = AMDGPU::S_MOV_B32;
    MovExecTermOpc = AMDGPU::S_MOV_B32_term;
    XorTermOpc = AMDGPU::S_XOR_B32_term;
    AndSaveExecOpc = AMDGPU::S_AND_SAVEEXEC_B32;
    ExecReg = AMDGPU::EXEC_LO;
  } else {
    MovExecOpc = AMDGPU::S_MOV_B64;
    MovExecTermOpc = AMDGPU::S_MOV_B64_term;
    XorTermOpc = AMDGPU::S_XOR_B64_term;
    AndSaveExecOpc = AMDGPU::S_AND_SAVEEXEC_B64;
    ExecReg = AMDGPU::EXEC;
  }

#ifndef NDEBUG
  const int OrigRangeSize = std::distance(BeginIt, EndIt);
#endif

  MachineRegisterInfo &MRI = *B.getMRI();
  Register SaveExecReg = MRI.createVirtualRegister(WaveRC);
  Register InitSaveExecReg = MRI.createVirtualRegister(WaveRC);

  // Don't bother using generic instructions/registers for the exec mask.
  B.buildInstr(TargetOpcode::IMPLICIT_DEF).addDef(InitSaveExecReg);

  Register SavedExec = MRI.createVirtualRegister(WaveRC);

  // To insert the loop we need to split the block. Move everything before
  // this point to a new block, and insert a new empty block before this
  // instruction.
  MachineBasicBlock *LoopBB = MF.CreateMachineBasicBlock();
  MachineBasicBlock *BodyBB = MF.CreateMachineBasicBlock();
  MachineBasicBlock *RestoreExecBB = MF.CreateMachineBasicBlock();
  MachineBasicBlock *RemainderBB = MF.CreateMachineBasicBlock();
  MachineFunction::iterator MBBI(MBB);
  ++MBBI;
  MF.insert(MBBI, LoopBB);
  MF.insert(MBBI, BodyBB);
  MF.insert(MBBI, RestoreExecBB);
  MF.insert(MBBI, RemainderBB);

  LoopBB->addSuccessor(BodyBB);
  BodyBB->addSuccessor(RestoreExecBB);
  BodyBB->addSuccessor(LoopBB);

  // Move the rest of the block into a new block.
  RemainderBB->transferSuccessorsAndUpdatePHIs(&MBB);
  RemainderBB->splice(RemainderBB->begin(), &MBB, EndIt, MBB.end());

  MBB.addSuccessor(LoopBB);
  RestoreExecBB->addSuccessor(RemainderBB);

  B.setInsertPt(*LoopBB, LoopBB->end());

  // +-MBB:------------+
  // | ...             |
  // | %0 = G_INST_1   |
  // | %Dst = MI %Vgpr |
  // | %1 = G_INST_2   |
  // | ...             |
  // +-----------------+
  // ->
  // +-MBB-------------------------------+
  // | ...                               |
  // | %0 = G_INST_1                     |
  // | %SaveExecReg = S_MOV_B32 $exec_lo |
  // +----------------|------------------+
  //                  | /------------------------------------------------------|
  //                  V V                                                      |
  // +-LoopBB----------------------------------------------------------------+ |
  // | %CurrentLaneReg:sgpr(s32) = READFIRSTLANE %Vgpr                        | |
  // | instead of executing for each lane, see if other lanes had             | |
  // | same value for %Vgpr and execute for them also.                        | |
  // | %CondReg:vcc(s1) = G_ICMP eq %CurrentLaneReg, %Vgpr                    | |
  // | %CondRegLM:sreg_32 = ballot %CondReg // copy vcc to sreg32 lane mask   | |
  // | %SavedExec = S_AND_SAVEEXEC_B32 %CondRegLM                             | |
  // | exec is active for lanes with the same "CurrentLane value" in Vgpr     | |
  // +----------------|------------------------------------------------------+ |
  //                  V                                                        |
  // +-BodyBB-----------------------------------------------------------+     |
  // | %Dst = MI %CurrentLaneReg:sgpr(s32)                               |     |
  // | executed only for active lanes and written to Dst                 |     |
  // | $exec = S_XOR_B32 $exec, %SavedExec                               |     |
  // | set active lanes to 0 in SavedExec, lanes that did not write to   |     |
  // | Dst yet, and set this as new exec (for READFIRSTLANE and ICMP)    |     |
  // | SI_WATERFALL_LOOP LoopBB                                          |-----|
  // +----------------|--------------------------------------------------+
  //                  V
  // +-RestoreExecBB--------------------------+
  // | $exec_lo = S_MOV_B32_term %SaveExecReg |
  // +----------------|-----------------------+
  //                  V
  // +-RemainderBB:----------------------+
  // | %1 = G_INST_2                     |
  // | ...                               |
  // +-----------------------------------+

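  // Example (a sketch; exact MIR shapes depend on the rule that requested
  // the waterfall): an instruction that needs a uniform (sgpr) input but was
  // given a divergent one,
  //   %desc:vgpr(<4 x s32>) = ...
  //   %val:sgpr(s32) = MI %desc
  // runs the loop once per distinct %desc value across the wave; each
  // iteration handles the lanes whose %desc matches readfirstlane(%desc).
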
  // Move the instruction into the loop body. Note we moved everything after
  // Range.end() already into a new block, so Range.end() is no longer valid.
  BodyBB->splice(BodyBB->end(), &MBB, BeginIt, MBB.end());

  // Figure out the iterator range after splicing the instructions.
  MachineBasicBlock::iterator NewBegin = BeginIt;
  auto NewEnd = BodyBB->end();
  assert(std::distance(NewBegin, NewEnd) == OrigRangeSize);

  B.setMBB(*LoopBB);
  Register CondReg;

  for (MachineInstr &MI : make_range(NewBegin, NewEnd)) {
    for (MachineOperand &Op : MI.all_uses()) {
      Register OldReg = Op.getReg();
      if (!WFI.SgprWaterfallOperandRegs.count(OldReg))
        continue;

      // See if we already processed this register in another instruction in
      // the sequence.
      auto OldVal = WaterfalledRegMap.find(OldReg);
      if (OldVal != WaterfalledRegMap.end()) {
        Op.setReg(OldVal->second);
        continue;
      }

      Register OpReg = Op.getReg();
      LLT OpTy = MRI.getType(OpReg);

      // TODO: support for agpr
      assert(MRI.getRegBank(OpReg) == VgprRB);
      Register CurrentLaneReg = MRI.createVirtualRegister({SgprRB, OpTy});
      buildReadFirstLane(B, CurrentLaneReg, OpReg, RBI);

      // Build the comparison(s), CurrentLaneReg == OpReg.
      unsigned OpSize = OpTy.getSizeInBits();
      unsigned PartSize = (OpSize % 64 == 0) ? 64 : 32;
      LLT PartTy = LLT::scalar(PartSize);
      unsigned NumParts = OpSize / PartSize;
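      // E.g. a <4 x s32> resource operand (128 bits) is compared as two s64
      // parts, while an s32 operand is a single part compared directly.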
      SmallVector<Register, 8> OpParts;
      SmallVector<Register, 8> CurrentLaneParts;

      if (NumParts == 1) {
        OpParts.push_back(OpReg);
        CurrentLaneParts.push_back(CurrentLaneReg);
      } else {
        auto UnmergeOp = B.buildUnmerge({VgprRB, PartTy}, OpReg);
        auto UnmergeCurrLane = B.buildUnmerge({SgprRB, PartTy}, CurrentLaneReg);
        for (unsigned i = 0; i < NumParts; ++i) {
          OpParts.push_back(UnmergeOp.getReg(i));
          CurrentLaneParts.push_back(UnmergeCurrLane.getReg(i));
        }
      }

      for (unsigned i = 0; i < NumParts; ++i) {
        Register CmpReg = MRI.createVirtualRegister(VccRB_S1);
        B.buildICmp(CmpInst::ICMP_EQ, CmpReg, CurrentLaneParts[i], OpParts[i]);

        if (!CondReg)
          CondReg = CmpReg;
        else
          CondReg = B.buildAnd(VccRB_S1, CondReg, CmpReg).getReg(0);
      }

      Op.setReg(CurrentLaneReg);

      // Make sure we don't re-process this register again.
      WaterfalledRegMap.insert(std::pair(OldReg, Op.getReg()));
    }
  }

  // Copy vcc to sgpr32/64, ballot becomes a no-op during instruction selection.
  Register CondRegLM =
      MRI.createVirtualRegister({WaveRC, LLT::scalar(IsWave32 ? 32 : 64)});
  B.buildIntrinsic(Intrinsic::amdgcn_ballot, CondRegLM).addReg(CondReg);

  // Update EXEC, save the original EXEC value to SavedExec.
  B.buildInstr(AndSaveExecOpc)
      .addDef(SavedExec)
      .addReg(CondRegLM, RegState::Kill);
  MRI.setSimpleHint(SavedExec, CondRegLM);

  B.setInsertPt(*BodyBB, BodyBB->end());

  // Update EXEC, switch all done bits to 0 and all todo bits to 1.
  B.buildInstr(XorTermOpc).addDef(ExecReg).addReg(ExecReg).addReg(SavedExec);

  // XXX - s_xor_b64 sets scc to 1 if the result is nonzero, so can we use
  // s_cbranch_scc0?

  // Loop back to V_READFIRSTLANE_B32 if there are still variants to cover.
  B.buildInstr(AMDGPU::SI_WATERFALL_LOOP).addMBB(LoopBB);

  // Save the EXEC mask before the loop.
  B.setInsertPt(MBB, MBB.end());
  B.buildInstr(MovExecOpc).addDef(SaveExecReg).addReg(ExecReg);

  // Restore the EXEC mask after the loop.
  B.setInsertPt(*RestoreExecBB, RestoreExecBB->begin());
  B.buildInstr(MovExecTermOpc).addDef(ExecReg).addReg(SaveExecReg);

  // Set the insert point after the original instruction, so any new
  // instructions will be in the remainder.
  B.setInsertPt(*RemainderBB, RemainderBB->begin());

  return true;
}

bool RegBankLegalizeHelper::splitLoad(MachineInstr &MI,
                                      ArrayRef<LLT> LLTBreakdown, LLT MergeTy) {
  MachineFunction &MF = B.getMF();
  assert(MI.getNumMemOperands() == 1);
  MachineMemOperand &BaseMMO = **MI.memoperands_begin();
  Register Dst = MI.getOperand(0).getReg();
  const RegisterBank *DstRB = MRI.getRegBankOrNull(Dst);
  Register Base = MI.getOperand(1).getReg();
  LLT PtrTy = MRI.getType(Base);
  const RegisterBank *PtrRB = MRI.getRegBankOrNull(Base);
  LLT OffsetTy = LLT::scalar(PtrTy.getSizeInBits());
  SmallVector<Register, 4> LoadPartRegs;

  unsigned ByteOffset = 0;
  for (LLT PartTy : LLTBreakdown) {
    Register BasePlusOffset;
    if (ByteOffset == 0) {
      BasePlusOffset = Base;
    } else {
      auto Offset = B.buildConstant({PtrRB, OffsetTy}, ByteOffset);
      BasePlusOffset =
          B.buildObjectPtrOffset({PtrRB, PtrTy}, Base, Offset).getReg(0);
    }
    auto *OffsetMMO = MF.getMachineMemOperand(&BaseMMO, ByteOffset, PartTy);
    auto LoadPart = B.buildLoad({DstRB, PartTy}, BasePlusOffset, *OffsetMMO);
    LoadPartRegs.push_back(LoadPart.getReg(0));
    ByteOffset += PartTy.getSizeInBytes();
  }

  if (!MergeTy.isValid()) {
    // Loads are of same size, concat or merge them together.
    B.buildMergeLikeInstr(Dst, LoadPartRegs);
  } else {
    // Loads are not all of same size, need to unmerge them to smaller pieces
    // of MergeTy type, then merge pieces to Dst.
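    // E.g. an S96 load split as {S64, S32} with MergeTy == S32: the S64 part
    // is unmerged into two S32 pieces and the three S32 pieces are merged
    // back into the S96 Dst.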
    SmallVector<Register, 4> MergeTyParts;
    for (Register Reg : LoadPartRegs) {
      if (MRI.getType(Reg) == MergeTy) {
        MergeTyParts.push_back(Reg);
      } else {
        auto Unmerge = B.buildUnmerge({DstRB, MergeTy}, Reg);
        for (unsigned i = 0; i < Unmerge->getNumOperands() - 1; ++i)
          MergeTyParts.push_back(Unmerge.getReg(i));
      }
    }
    B.buildMergeLikeInstr(Dst, MergeTyParts);
  }
  MI.eraseFromParent();
  return true;
}

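// Sketch of the intent (inferred from the uses in lower() below): widen a
// load whose result type is not fully covered by register sizes (S96, V3S32,
// V6S16) to the next covered type (S128, V4S32, V8S16), then drop the extra
// loaded bits with a trunc or an unmerge that keeps only the low parts.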
bool RegBankLegalizeHelper::widenLoad(MachineInstr &MI, LLT WideTy,
                                      LLT MergeTy) {
  MachineFunction &MF = B.getMF();
  assert(MI.getNumMemOperands() == 1);
  MachineMemOperand &BaseMMO = **MI.memoperands_begin();
  Register Dst = MI.getOperand(0).getReg();
  const RegisterBank *DstRB = MRI.getRegBankOrNull(Dst);
  Register Base = MI.getOperand(1).getReg();

  MachineMemOperand *WideMMO = MF.getMachineMemOperand(&BaseMMO, 0, WideTy);
  auto WideLoad = B.buildLoad({DstRB, WideTy}, Base, *WideMMO);

  if (WideTy.isScalar()) {
    B.buildTrunc(Dst, WideLoad);
  } else {
    SmallVector<Register, 4> MergeTyParts;
    auto Unmerge = B.buildUnmerge({DstRB, MergeTy}, WideLoad);

    LLT DstTy = MRI.getType(Dst);
    unsigned NumElts = DstTy.getSizeInBits() / MergeTy.getSizeInBits();
    for (unsigned i = 0; i < NumElts; ++i) {
      MergeTyParts.push_back(Unmerge.getReg(i));
    }
    B.buildMergeLikeInstr(Dst, MergeTyParts);
  }
  MI.eraseFromParent();
  return true;
}

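// For example (a sketch): a uniform G_ZEXTLOAD of s8 becomes an s32 load
// followed by G_AND with 0xff, and a G_SEXTLOAD of s16 becomes an s32 load
// followed by G_SEXT_INREG to 16 bits; a plain G_LOAD just gets the wider
// MMO.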
bool RegBankLegalizeHelper::widenMMOToS32(GAnyLoad &MI) const {
  Register Dst = MI.getDstReg();
  Register Ptr = MI.getPointerReg();
  MachineMemOperand &MMO = MI.getMMO();
  unsigned MemSize = 8 * MMO.getSize().getValue();

  MachineMemOperand *WideMMO = B.getMF().getMachineMemOperand(&MMO, 0, S32);

  if (MI.getOpcode() == G_LOAD) {
    B.buildLoad(Dst, Ptr, *WideMMO);
  } else {
    auto Load = B.buildLoad(SgprRB_S32, Ptr, *WideMMO);

    if (MI.getOpcode() == G_ZEXTLOAD) {
      APInt Mask = APInt::getLowBitsSet(S32.getSizeInBits(), MemSize);
      auto MaskCst = B.buildConstant(SgprRB_S32, Mask);
      B.buildAnd(Dst, Load, MaskCst);
    } else {
      assert(MI.getOpcode() == G_SEXTLOAD);
      B.buildSExtInReg(Dst, Load, MemSize);
    }
  }

  MI.eraseFromParent();
  return true;
}

bool RegBankLegalizeHelper::lowerVccExtToSel(MachineInstr &MI) {
  Register Dst = MI.getOperand(0).getReg();
  LLT Ty = MRI.getType(Dst);
  Register Src = MI.getOperand(1).getReg();
  unsigned Opc = MI.getOpcode();
  int TrueExtCst = Opc == G_SEXT ? -1 : 1;
  if (Ty == S32 || Ty == S16) {
    auto True = B.buildConstant({VgprRB, Ty}, TrueExtCst);
    auto False = B.buildConstant({VgprRB, Ty}, 0);
    B.buildSelect(Dst, Src, True, False);
  } else if (Ty == S64) {
    auto True = B.buildConstant({VgprRB_S32}, TrueExtCst);
    auto False = B.buildConstant({VgprRB_S32}, 0);
    auto Lo = B.buildSelect({VgprRB_S32}, Src, True, False);
    MachineInstrBuilder Hi;
    switch (Opc) {
    case G_SEXT:
      Hi = Lo;
      break;
    case G_ZEXT:
      Hi = False;
      break;
    case G_ANYEXT:
      Hi = B.buildUndef({VgprRB_S32});
      break;
    default:
      reportGISelFailure(
          MF, MORE, "amdgpu-regbanklegalize",
          "AMDGPU RegBankLegalize: lowerVccExtToSel, Opcode not supported", MI);
      return false;
    }

    B.buildMergeValues(Dst, {Lo.getReg(0), Hi.getReg(0)});
  } else {
    reportGISelFailure(
        MF, MORE, "amdgpu-regbanklegalize",
        "AMDGPU RegBankLegalize: lowerVccExtToSel, Type not supported", MI);
    return false;
  }

  MI.eraseFromParent();
  return true;
}

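// The unpack* helpers below split a packed sgpr <2 x s16> into two s32
// halves. E.g. for %x:sgpr(<2 x s16>), unpackZExt yields
//   Lo = bitcast(%x) & 0xffff,  Hi = bitcast(%x) lshr 16,
// unpackSExt sign-extends bit 15 into the high bits of Lo instead, and
// unpackAExt returns the raw bitcast as Lo (high bits unspecified).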
std::pair<Register, Register> RegBankLegalizeHelper::unpackZExt(Register Reg) {
  auto PackedS32 = B.buildBitcast(SgprRB_S32, Reg);
  auto Mask = B.buildConstant(SgprRB_S32, 0x0000ffff);
  auto Lo = B.buildAnd(SgprRB_S32, PackedS32, Mask);
  auto Hi = B.buildLShr(SgprRB_S32, PackedS32, B.buildConstant(SgprRB_S32, 16));
  return {Lo.getReg(0), Hi.getReg(0)};
}

std::pair<Register, Register> RegBankLegalizeHelper::unpackSExt(Register Reg) {
  auto PackedS32 = B.buildBitcast(SgprRB_S32, Reg);
  auto Lo = B.buildSExtInReg(SgprRB_S32, PackedS32, 16);
  auto Hi = B.buildAShr(SgprRB_S32, PackedS32, B.buildConstant(SgprRB_S32, 16));
  return {Lo.getReg(0), Hi.getReg(0)};
}

std::pair<Register, Register> RegBankLegalizeHelper::unpackAExt(Register Reg) {
  auto PackedS32 = B.buildBitcast(SgprRB_S32, Reg);
  auto Lo = PackedS32;
  auto Hi = B.buildLShr(SgprRB_S32, PackedS32, B.buildConstant(SgprRB_S32, 16));
  return {Lo.getReg(0), Hi.getReg(0)};
}

std::pair<Register, Register>
RegBankLegalizeHelper::unpackAExtTruncS16(Register Reg) {
  auto [Lo32, Hi32] = unpackAExt(Reg);
  return {B.buildTrunc(SgprRB_S16, Lo32).getReg(0),
          B.buildTrunc(SgprRB_S16, Hi32).getReg(0)};
}

bool RegBankLegalizeHelper::lowerUnpackBitShift(MachineInstr &MI) {
  Register Lo, Hi;
  switch (MI.getOpcode()) {
  case AMDGPU::G_SHL: {
    auto [Val0, Val1] = unpackAExt(MI.getOperand(1).getReg());
    auto [Amt0, Amt1] = unpackAExt(MI.getOperand(2).getReg());
    Lo = B.buildInstr(MI.getOpcode(), {SgprRB_S32}, {Val0, Amt0}).getReg(0);
    Hi = B.buildInstr(MI.getOpcode(), {SgprRB_S32}, {Val1, Amt1}).getReg(0);
    break;
  }
  case AMDGPU::G_LSHR: {
    auto [Val0, Val1] = unpackZExt(MI.getOperand(1).getReg());
    auto [Amt0, Amt1] = unpackZExt(MI.getOperand(2).getReg());
    Lo = B.buildInstr(MI.getOpcode(), {SgprRB_S32}, {Val0, Amt0}).getReg(0);
    Hi = B.buildInstr(MI.getOpcode(), {SgprRB_S32}, {Val1, Amt1}).getReg(0);
    break;
  }
  case AMDGPU::G_ASHR: {
    auto [Val0, Val1] = unpackSExt(MI.getOperand(1).getReg());
    auto [Amt0, Amt1] = unpackSExt(MI.getOperand(2).getReg());
    Lo = B.buildAShr(SgprRB_S32, Val0, Amt0).getReg(0);
    Hi = B.buildAShr(SgprRB_S32, Val1, Amt1).getReg(0);
    break;
  }
  default:
    reportGISelFailure(
        MF, MORE, "amdgpu-regbanklegalize",
        "AMDGPU RegBankLegalize: lowerUnpackBitShift, case not implemented",
        MI);
    return false;
  }
  B.buildBuildVectorTrunc(MI.getOperand(0).getReg(), {Lo, Hi});
  MI.eraseFromParent();
  return true;
}

bool RegBankLegalizeHelper::lowerUnpackMinMax(MachineInstr &MI) {
  Register Lo, Hi;
  switch (MI.getOpcode()) {
  case AMDGPU::G_SMIN:
  case AMDGPU::G_SMAX: {
    // For signed operations, use sign extension.
    auto [Val0_Lo, Val0_Hi] = unpackSExt(MI.getOperand(1).getReg());
    auto [Val1_Lo, Val1_Hi] = unpackSExt(MI.getOperand(2).getReg());
    Lo = B.buildInstr(MI.getOpcode(), {SgprRB_S32}, {Val0_Lo, Val1_Lo})
             .getReg(0);
    Hi = B.buildInstr(MI.getOpcode(), {SgprRB_S32}, {Val0_Hi, Val1_Hi})
             .getReg(0);
    break;
  }
  case AMDGPU::G_UMIN:
  case AMDGPU::G_UMAX: {
    // For unsigned operations, use zero extension.
    auto [Val0_Lo, Val0_Hi] = unpackZExt(MI.getOperand(1).getReg());
    auto [Val1_Lo, Val1_Hi] = unpackZExt(MI.getOperand(2).getReg());
    Lo = B.buildInstr(MI.getOpcode(), {SgprRB_S32}, {Val0_Lo, Val1_Lo})
             .getReg(0);
    Hi = B.buildInstr(MI.getOpcode(), {SgprRB_S32}, {Val0_Hi, Val1_Hi})
             .getReg(0);
    break;
  }
  default:
    reportGISelFailure(
        MF, MORE, "amdgpu-regbanklegalize",
        "AMDGPU RegBankLegalize: lowerUnpackMinMax, case not implemented", MI);
    return false;
  }
  B.buildBuildVectorTrunc(MI.getOperand(0).getReg(), {Lo, Hi});
  MI.eraseFromParent();
  return true;
}

bool RegBankLegalizeHelper::lowerUnpackAExt(MachineInstr &MI) {
  auto [Op1Lo, Op1Hi] = unpackAExt(MI.getOperand(1).getReg());
  auto [Op2Lo, Op2Hi] = unpackAExt(MI.getOperand(2).getReg());
  auto ResLo = B.buildInstr(MI.getOpcode(), {SgprRB_S32}, {Op1Lo, Op2Lo});
  auto ResHi = B.buildInstr(MI.getOpcode(), {SgprRB_S32}, {Op1Hi, Op2Hi});
  B.buildBuildVectorTrunc(MI.getOperand(0).getReg(),
                          {ResLo.getReg(0), ResHi.getReg(0)});
  MI.eraseFromParent();
  return true;
}

static bool isSignedBFE(MachineInstr &MI) {
  if (GIntrinsic *GI = dyn_cast<GIntrinsic>(&MI))
    return (GI->is(Intrinsic::amdgcn_sbfe));

  return MI.getOpcode() == AMDGPU::G_SBFX;
}

bool RegBankLegalizeHelper::lowerV_BFE(MachineInstr &MI) {
  Register Dst = MI.getOperand(0).getReg();
  assert(MRI.getType(Dst) == LLT::scalar(64));
  bool Signed = isSignedBFE(MI);
  unsigned FirstOpnd = isa<GIntrinsic>(MI) ? 2 : 1;
  // Extract bitfield from Src, LSBit is the least-significant bit for the
  // extraction (field offset) and Width is size of bitfield.
  Register Src = MI.getOperand(FirstOpnd).getReg();
  Register LSBit = MI.getOperand(FirstOpnd + 1).getReg();
  Register Width = MI.getOperand(FirstOpnd + 2).getReg();
  // Comments are for signed bitfield extract, similar for unsigned. x is sign
  // bit. s is sign, l is LSB and y are remaining bits of bitfield to extract.

  // Src >> LSBit Hi|Lo: x?????syyyyyyl??? -> xxxx?????syyyyyyl
  unsigned SHROpc = Signed ? AMDGPU::G_ASHR : AMDGPU::G_LSHR;
  auto SHRSrc = B.buildInstr(SHROpc, {{VgprRB, S64}}, {Src, LSBit});

  auto ConstWidth = getIConstantVRegValWithLookThrough(Width, MRI);

  // Expand to Src >> LSBit << (64 - Width) >> (64 - Width)
  // << (64 - Width): Hi|Lo: xxxx?????syyyyyyl -> syyyyyyl000000000
  // >> (64 - Width): Hi|Lo: syyyyyyl000000000 -> ssssssssssyyyyyyl
  if (!ConstWidth) {
    auto Amt = B.buildSub(VgprRB_S32, B.buildConstant(SgprRB_S32, 64), Width);
    auto SignBit = B.buildShl({VgprRB, S64}, SHRSrc, Amt);
    B.buildInstr(SHROpc, {Dst}, {SignBit, Amt});
    MI.eraseFromParent();
    return true;
  }

  uint64_t WidthImm = ConstWidth->Value.getZExtValue();
  auto UnmergeSHRSrc = B.buildUnmerge(VgprRB_S32, SHRSrc);
  Register SHRSrcLo = UnmergeSHRSrc.getReg(0);
  Register SHRSrcHi = UnmergeSHRSrc.getReg(1);
  auto Zero = B.buildConstant({VgprRB, S32}, 0);
  unsigned BFXOpc = Signed ? AMDGPU::G_SBFX : AMDGPU::G_UBFX;

  if (WidthImm <= 32) {
    // SHRSrc Hi|Lo: ????????|???syyyl -> ????????|ssssyyyl
    auto Lo = B.buildInstr(BFXOpc, {VgprRB_S32}, {SHRSrcLo, Zero, Width});
    MachineInstrBuilder Hi;
    if (Signed) {
      // SHRSrc Hi|Lo: ????????|ssssyyyl -> ssssssss|ssssyyyl
      Hi = B.buildAShr(VgprRB_S32, Lo, B.buildConstant(VgprRB_S32, 31));
    } else {
      // SHRSrc Hi|Lo: ????????|000syyyl -> 00000000|000syyyl
      Hi = Zero;
    }
    B.buildMergeLikeInstr(Dst, {Lo, Hi});
  } else {
    auto Amt = B.buildConstant(VgprRB_S32, WidthImm - 32);
    // SHRSrc Hi|Lo: ??????sy|yyyyyyyl -> sssssssy|yyyyyyyl
    auto Hi = B.buildInstr(BFXOpc, {VgprRB_S32}, {SHRSrcHi, Zero, Amt});
    B.buildMergeLikeInstr(Dst, {SHRSrcLo, Hi});
  }

  MI.eraseFromParent();
  return true;
}

bool RegBankLegalizeHelper::lowerS_BFE(MachineInstr &MI) {
  Register DstReg = MI.getOperand(0).getReg();
  LLT Ty = MRI.getType(DstReg);
  bool Signed = isSignedBFE(MI);
  unsigned FirstOpnd = isa<GIntrinsic>(MI) ? 2 : 1;
  Register Src = MI.getOperand(FirstOpnd).getReg();
  Register LSBit = MI.getOperand(FirstOpnd + 1).getReg();
  Register Width = MI.getOperand(FirstOpnd + 2).getReg();
  // For uniform bit field extract there are 4 available instructions, but
  // LSBit(field offset) and Width(size of bitfield) need to be packed in S32,
  // field offset in low and size in high 16 bits.

  // Src1 Hi16|Lo16 = Size|FieldOffset
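  // E.g. field offset 8 with width 5 packs as ((5 << 16) | (8 & 0x3f)),
  // i.e. Src1 = 0x00050008.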
  auto Mask = B.buildConstant(SgprRB_S32, maskTrailingOnes<unsigned>(6));
  auto FieldOffset = B.buildAnd(SgprRB_S32, LSBit, Mask);
  auto Size = B.buildShl(SgprRB_S32, Width, B.buildConstant(SgprRB_S32, 16));
  auto Src1 = B.buildOr(SgprRB_S32, FieldOffset, Size);
  unsigned Opc32 = Signed ? AMDGPU::S_BFE_I32 : AMDGPU::S_BFE_U32;
  unsigned Opc64 = Signed ? AMDGPU::S_BFE_I64 : AMDGPU::S_BFE_U64;
  unsigned Opc = Ty == S32 ? Opc32 : Opc64;

  // Select machine instruction, because of reg class constraining, insert
  // copies from reg class to reg bank.
  auto S_BFE = B.buildInstr(Opc, {{SgprRB, Ty}},
                            {B.buildCopy(Ty, Src), B.buildCopy(S32, Src1)});
  constrainSelectedInstRegOperands(*S_BFE, *ST.getInstrInfo(),
                                   *ST.getRegisterInfo(), RBI);

  B.buildCopy(DstReg, S_BFE->getOperand(0).getReg());
  MI.eraseFromParent();
  return true;
}

bool RegBankLegalizeHelper::lowerSplitTo32(MachineInstr &MI) {
  Register Dst = MI.getOperand(0).getReg();
  LLT DstTy = MRI.getType(Dst);
  assert(DstTy == V4S16 || DstTy == V2S32 || DstTy == S64);
  LLT Ty = DstTy == V4S16 ? V2S16 : S32;
  auto Op1 = B.buildUnmerge({VgprRB, Ty}, MI.getOperand(1).getReg());
  auto Op2 = B.buildUnmerge({VgprRB, Ty}, MI.getOperand(2).getReg());
  unsigned Opc = MI.getOpcode();
  auto Flags = MI.getFlags();
  auto Lo =
      B.buildInstr(Opc, {{VgprRB, Ty}}, {Op1.getReg(0), Op2.getReg(0)}, Flags);
  auto Hi =
      B.buildInstr(Opc, {{VgprRB, Ty}}, {Op1.getReg(1), Op2.getReg(1)}, Flags);
  B.buildMergeLikeInstr(Dst, {Lo, Hi});
  MI.eraseFromParent();
  return true;
}

bool RegBankLegalizeHelper::lowerSplitTo32Mul(MachineInstr &MI) {
  Register Dst = MI.getOperand(0).getReg();
  assert(MRI.getType(Dst) == S64);
  auto Op1 = B.buildUnmerge({VgprRB_S32}, MI.getOperand(1).getReg());
  auto Op2 = B.buildUnmerge({VgprRB_S32}, MI.getOperand(2).getReg());

  // TODO: G_AMDGPU_MAD_* optimizations for G_MUL divergent S64 operation to
  // match GlobalISel with old regbankselect.
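  // Schoolbook expansion of the low 64 bits of (Hi1:Lo1) * (Hi2:Lo2):
  //   Lo = Lo1 * Lo2
  //   Hi = umulh(Lo1, Lo2) + Lo1 * Hi2 + Hi1 * Lo2
  // Terms of weight 2^64 and above are discarded.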
  auto Lo = B.buildMul(VgprRB_S32, Op1.getReg(0), Op2.getReg(0));
  auto Carry = B.buildUMulH(VgprRB_S32, Op1.getReg(0), Op2.getReg(0));
  auto MulLo0Hi1 = B.buildMul(VgprRB_S32, Op1.getReg(0), Op2.getReg(1));
  auto MulHi0Lo1 = B.buildMul(VgprRB_S32, Op1.getReg(1), Op2.getReg(0));
  auto Sum = B.buildAdd(VgprRB_S32, MulLo0Hi1, MulHi0Lo1);
  auto Hi = B.buildAdd(VgprRB_S32, Sum, Carry);

  B.buildMergeLikeInstr(Dst, {Lo, Hi});
  MI.eraseFromParent();
  return true;
}

bool RegBankLegalizeHelper::lowerSplitTo16(MachineInstr &MI) {
  Register Dst = MI.getOperand(0).getReg();
  assert(MRI.getType(Dst) == V2S16);
  unsigned Opc = MI.getOpcode();
  unsigned NumOps = MI.getNumOperands();
  auto Flags = MI.getFlags();

  auto [Op1Lo, Op1Hi] = unpackAExtTruncS16(MI.getOperand(1).getReg());

  if (NumOps == 2) {
    auto Lo = B.buildInstr(Opc, {SgprRB_S16}, {Op1Lo}, Flags);
    auto Hi = B.buildInstr(Opc, {SgprRB_S16}, {Op1Hi}, Flags);
    B.buildMergeLikeInstr(Dst, {Lo, Hi});
    MI.eraseFromParent();
    return true;
  }

  auto [Op2Lo, Op2Hi] = unpackAExtTruncS16(MI.getOperand(2).getReg());

  if (NumOps == 3) {
    auto Lo = B.buildInstr(Opc, {SgprRB_S16}, {Op1Lo, Op2Lo}, Flags);
    auto Hi = B.buildInstr(Opc, {SgprRB_S16}, {Op1Hi, Op2Hi}, Flags);
    B.buildMergeLikeInstr(Dst, {Lo, Hi});
    MI.eraseFromParent();
    return true;
  }

  assert(NumOps == 4);
  auto [Op3Lo, Op3Hi] = unpackAExtTruncS16(MI.getOperand(3).getReg());
  auto Lo = B.buildInstr(Opc, {SgprRB_S16}, {Op1Lo, Op2Lo, Op3Lo}, Flags);
  auto Hi = B.buildInstr(Opc, {SgprRB_S16}, {Op1Hi, Op2Hi, Op3Hi}, Flags);
  B.buildMergeLikeInstr(Dst, {Lo, Hi});
  MI.eraseFromParent();
  return true;
}

bool RegBankLegalizeHelper::lowerUniMAD64(MachineInstr &MI) {
  Register Dst0 = MI.getOperand(0).getReg();
  Register Dst1 = MI.getOperand(1).getReg();
  Register Src0 = MI.getOperand(2).getReg();
  Register Src1 = MI.getOperand(3).getReg();
  Register Src2 = MI.getOperand(4).getReg();

  const GCNSubtarget &ST = B.getMF().getSubtarget<GCNSubtarget>();

  // Keep the multiplication on the SALU.
  Register DstLo = B.buildMul(SgprRB_S32, Src0, Src1).getReg(0);
  Register DstHi = MRI.createVirtualRegister(SgprRB_S32);
  if (ST.hasScalarMulHiInsts()) {
    B.buildInstr(AMDGPU::G_UMULH, {{DstHi}}, {Src0, Src1});
  } else {
    auto VSrc0 = B.buildCopy(VgprRB_S32, Src0);
    auto VSrc1 = B.buildCopy(VgprRB_S32, Src1);
    auto MulHi = B.buildInstr(AMDGPU::G_UMULH, {VgprRB_S32}, {VSrc0, VSrc1});
    buildReadAnyLane(B, DstHi, MulHi.getReg(0), RBI);
  }

  // Accumulate and produce the "carry-out" bit.

  // The "carry-out" is defined as bit 64 of the result when computed as a
  // big integer. For unsigned multiply-add, this matches the usual
  // definition of carry-out.
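  // I.e. {Dst1:Dst0} = Src0 * Src1 + Src2 evaluated in 65 bits, with Dst1
  // holding bit 64 of that sum.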
  if (mi_match(Src2, MRI, MIPatternMatch::m_ZeroInt())) {
    // No accumulate: result is just the multiplication, carry is 0.
    B.buildMergeLikeInstr(Dst0, {DstLo, DstHi});
    B.buildConstant(Dst1, 0);
  } else {
    // Accumulate: add Src2 to the multiplication result with carry chain.
    Register Src2Lo = MRI.createVirtualRegister(SgprRB_S32);
    Register Src2Hi = MRI.createVirtualRegister(SgprRB_S32);
    B.buildUnmerge({Src2Lo, Src2Hi}, Src2);

    auto AddLo = B.buildUAddo(SgprRB_S32, SgprRB_S32, DstLo, Src2Lo);
    auto AddHi =
        B.buildUAdde(SgprRB_S32, SgprRB_S32, DstHi, Src2Hi, AddLo.getReg(1));
    B.buildMergeLikeInstr(Dst0, {AddLo.getReg(0), AddHi.getReg(0)});
    B.buildCopy(Dst1, AddHi.getReg(1));
  }

  MI.eraseFromParent();
  return true;
}

bool RegBankLegalizeHelper::lowerSplitTo32Select(MachineInstr &MI) {
  Register Dst = MI.getOperand(0).getReg();
  LLT DstTy = MRI.getType(Dst);
  assert(DstTy == V4S16 || DstTy == V2S32 || DstTy == S64 ||
         (DstTy.isPointer() && DstTy.getSizeInBits() == 64));
  LLT Ty = DstTy == V4S16 ? V2S16 : S32;
  auto Op2 = B.buildUnmerge({VgprRB, Ty}, MI.getOperand(2).getReg());
  auto Op3 = B.buildUnmerge({VgprRB, Ty}, MI.getOperand(3).getReg());
  Register Cond = MI.getOperand(1).getReg();
  auto Flags = MI.getFlags();
  auto Lo =
      B.buildSelect({VgprRB, Ty}, Cond, Op2.getReg(0), Op3.getReg(0), Flags);
  auto Hi =
      B.buildSelect({VgprRB, Ty}, Cond, Op2.getReg(1), Op3.getReg(1), Flags);

  B.buildMergeLikeInstr(Dst, {Lo, Hi});
  MI.eraseFromParent();
  return true;
}

bool RegBankLegalizeHelper::lowerSplitTo32SExtInReg(MachineInstr &MI) {
  auto Op1 = B.buildUnmerge(VgprRB_S32, MI.getOperand(1).getReg());
  int Amt = MI.getOperand(2).getImm();
  Register Lo, Hi;
  // Hi|Lo: s sign bit, ?/x bits changed/not changed by sign-extend
  if (Amt <= 32) {
    auto Freeze = B.buildFreeze(VgprRB_S32, Op1.getReg(0));
    if (Amt == 32) {
      // Hi|Lo: ????????|sxxxxxxx -> ssssssss|sxxxxxxx
      Lo = Freeze.getReg(0);
    } else {
      // Hi|Lo: ????????|???sxxxx -> ssssssss|ssssxxxx
      Lo = B.buildSExtInReg(VgprRB_S32, Freeze, Amt).getReg(0);
    }

    auto SignExtCst = B.buildConstant(SgprRB_S32, 31);
    Hi = B.buildAShr(VgprRB_S32, Lo, SignExtCst).getReg(0);
  } else {
    // Hi|Lo: ?????sxx|xxxxxxxx -> ssssssxx|xxxxxxxx
    Lo = Op1.getReg(0);
    Hi = B.buildSExtInReg(VgprRB_S32, Op1.getReg(1), Amt - 32).getReg(0);
  }

  B.buildMergeLikeInstr(MI.getOperand(0).getReg(), {Lo, Hi});
  MI.eraseFromParent();
  return true;
}

bool RegBankLegalizeHelper::lowerSplitBitCount64To32(MachineInstr &MI) {
  // Split 64-bit find-first-bit operations into 32-bit halves:
  // (ffbh hi:lo) -> umin(ffbh(hi), uaddsat(ffbh(lo), 32))
  // (ffbl hi:lo) -> umin(ffbl(lo), uaddsat(ffbl(hi), 32))
  // (ctlz_zero_undef hi:lo) -> umin(ffbh(hi), add(ffbh(lo), 32))
  // (cttz_zero_undef hi:lo) -> umin(ffbl(lo), add(ffbl(hi), 32))
  unsigned Opc = MI.getOpcode();

  // FFBH/FFBL return 0xFFFFFFFF on zero input, so use uaddsat to avoid
  // wrapping. CTLZ/CTTZ guarantee non-zero input (zero_undef), so plain add
  // is fine.
  unsigned FFBOpc;
  unsigned AddOpc;
  bool SearchFromMSB;
  switch (Opc) {
  case AMDGPU::G_AMDGPU_FFBH_U32:
    FFBOpc = Opc;
    AddOpc = AMDGPU::G_UADDSAT;
    SearchFromMSB = true;
    break;
  case AMDGPU::G_AMDGPU_FFBL_B32:
    FFBOpc = Opc;
    AddOpc = AMDGPU::G_UADDSAT;
    SearchFromMSB = false;
    break;
  case AMDGPU::G_CTLZ_ZERO_UNDEF:
    FFBOpc = AMDGPU::G_AMDGPU_FFBH_U32;
    AddOpc = AMDGPU::G_ADD;
    SearchFromMSB = true;
    break;
  case AMDGPU::G_CTTZ_ZERO_UNDEF:
    FFBOpc = AMDGPU::G_AMDGPU_FFBL_B32;
    AddOpc = AMDGPU::G_ADD;
    SearchFromMSB = false;
    break;
  default:
    llvm_unreachable("unexpected opcode in lowerSplitBitCount64To32");
  }

  auto Unmerge = B.buildUnmerge(VgprRB_S32, MI.getOperand(1).getReg());
  Register Lo = Unmerge.getReg(0);
  Register Hi = Unmerge.getReg(1);

  // MSB-first (FFBH/CTLZ) searches hi first; LSB-first (FFBL/CTTZ) searches
  // lo first. The secondary half adds 32 to account for the primary half's
  // width.
  auto Primary = B.buildInstr(FFBOpc, {VgprRB_S32}, {SearchFromMSB ? Hi : Lo});
  auto Secondary =
      B.buildInstr(FFBOpc, {VgprRB_S32}, {SearchFromMSB ? Lo : Hi});

  auto Adjusted = B.buildInstr(AddOpc, {VgprRB_S32},
                               {Secondary, B.buildConstant(VgprRB_S32, 32)});
  B.buildUMin(MI.getOperand(0).getReg(), Primary, Adjusted);

  MI.eraseFromParent();
  return true;
}

bool RegBankLegalizeHelper::lower(MachineInstr &MI,
                                  const RegBankLLTMapping &Mapping,
                                  WaterfallInfo &WFI) {

  switch (Mapping.LoweringMethod) {
  case DoNotLower:
    break;
  case VccExtToSel:
    return lowerVccExtToSel(MI);
  case UniExtToSel: {
    LLT Ty = MRI.getType(MI.getOperand(0).getReg());
    auto True = B.buildConstant({SgprRB, Ty},
                                MI.getOpcode() == AMDGPU::G_SEXT ? -1 : 1);
    auto False = B.buildConstant({SgprRB, Ty}, 0);
    // Input to G_{Z|S}EXT is 'Legalizer legal' S1. Most common case is
    // compare. We are making a select here. The S1 cond was already
    // 'any-extended to S32' + 'AND with 1 to clean high bits' by
    // Sgpr32AExtBoolInReg.
    B.buildSelect(MI.getOperand(0).getReg(), MI.getOperand(1).getReg(), True,
                  False);
    MI.eraseFromParent();
    return true;
  }
  case UnpackBitShift:
    return lowerUnpackBitShift(MI);
  case UnpackMinMax:
    return lowerUnpackMinMax(MI);
  case ScalarizeToS16:
    return lowerSplitTo16(MI);
  case Ext32To64: {
    const RegisterBank *RB = MRI.getRegBank(MI.getOperand(0).getReg());
    MachineInstrBuilder Hi;
    switch (MI.getOpcode()) {
    case AMDGPU::G_ZEXT: {
      Hi = B.buildConstant({RB, S32}, 0);
      break;
    }
    case AMDGPU::G_SEXT: {
      // Replicate sign bit from 32-bit extended part.
      auto ShiftAmt = B.buildConstant({RB, S32}, 31);
      Hi = B.buildAShr({RB, S32}, MI.getOperand(1).getReg(), ShiftAmt);
      break;
    }
    case AMDGPU::G_ANYEXT: {
      Hi = B.buildUndef({RB, S32});
      break;
    }
    default:
      reportGISelFailure(MF, MORE, "amdgpu-regbanklegalize",
                         "AMDGPU RegBankLegalize: Ext32To64, unsupported opcode",
                         MI);
      return false;
    }

    B.buildMergeLikeInstr(MI.getOperand(0).getReg(),
                          {MI.getOperand(1).getReg(), Hi});
    MI.eraseFromParent();
    return true;
  }
  case UniCstExt: {
    uint64_t ConstVal = MI.getOperand(1).getCImm()->getZExtValue();
    B.buildConstant(MI.getOperand(0).getReg(), ConstVal);

    MI.eraseFromParent();
    return true;
  }
  case VgprToVccCopy: {
    Register Src = MI.getOperand(1).getReg();
    LLT Ty = MRI.getType(Src);
    // Take the lowest bit from each lane and put it in a lane mask. Lower via
    // compare, but clean the high bits first since compare checks all bits in
    // the register.
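    // E.g. for an s32 %src this builds:
    //   %bool:vgpr(s32) = G_AND %src, 1
    //   %dst:vcc(s1) = G_ICMP intpred(ne), %bool, 0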
    Register BoolSrc = MRI.createVirtualRegister({VgprRB, Ty});
    if (Ty == S64) {
      auto Src64 = B.buildUnmerge(VgprRB_S32, Src);
      auto One = B.buildConstant(VgprRB_S32, 1);
      auto AndLo = B.buildAnd(VgprRB_S32, Src64.getReg(0), One);
      auto Zero = B.buildConstant(VgprRB_S32, 0);
      auto AndHi = B.buildAnd(VgprRB_S32, Src64.getReg(1), Zero);
      B.buildMergeLikeInstr(BoolSrc, {AndLo, AndHi});
    } else {
      assert(Ty == S32 || Ty == S16);
      auto One = B.buildConstant({VgprRB, Ty}, 1);
      B.buildAnd(BoolSrc, Src, One);
    }
    auto Zero = B.buildConstant({VgprRB, Ty}, 0);
    B.buildICmp(CmpInst::ICMP_NE, MI.getOperand(0).getReg(), BoolSrc, Zero);
    MI.eraseFromParent();
    return true;
  }
  case V_BFE:
    return lowerV_BFE(MI);
  case S_BFE:
    return lowerS_BFE(MI);
  case UniMAD64:
    return lowerUniMAD64(MI);
  case UniMul64: {
    B.buildMul(MI.getOperand(0), MI.getOperand(1), MI.getOperand(2));
    MI.eraseFromParent();
    return true;
  }
  case DivSMulToMAD: {
    auto Op1 = B.buildTrunc(VgprRB_S32, MI.getOperand(1));
    auto Op2 = B.buildTrunc(VgprRB_S32, MI.getOperand(2));
    auto Zero = B.buildConstant({VgprRB, S64}, 0);

    unsigned NewOpc = MI.getOpcode() == AMDGPU::G_AMDGPU_S_MUL_U64_U32
                          ? AMDGPU::G_AMDGPU_MAD_U64_U32
                          : AMDGPU::G_AMDGPU_MAD_I64_I32;

    B.buildInstr(NewOpc, {MI.getOperand(0).getReg(), {SgprRB, S32}},
                 {Op1, Op2, Zero});
    MI.eraseFromParent();
    return true;
  }
  case SplitTo32:
    return lowerSplitTo32(MI);
  case SplitTo32Mul:
    return lowerSplitTo32Mul(MI);
  case SplitTo32Select:
    return lowerSplitTo32Select(MI);
  case SplitTo32SExtInReg:
    return lowerSplitTo32SExtInReg(MI);
  case SplitLoad: {
    LLT DstTy = MRI.getType(MI.getOperand(0).getReg());
    unsigned Size = DstTy.getSizeInBits();
    // Even split to 128-bit loads
    if (Size > 128) {
      LLT B128;
      if (DstTy.isVector()) {
        LLT EltTy = DstTy.getElementType();
        B128 = LLT::fixed_vector(128 / EltTy.getSizeInBits(), EltTy);
      } else {
        B128 = LLT::scalar(128);
      }
      if (Size / 128 == 2)
        splitLoad(MI, {B128, B128});
      else if (Size / 128 == 4)
        splitLoad(MI, {B128, B128, B128, B128});
      else {
        reportGISelFailure(MF, MORE, "amdgpu-regbanklegalize",
                           "AMDGPU RegBankLegalize: SplitLoad, unsupported type",
                           MI);
        return false;
      }
    }
    // 64 and 32 bit load
    else if (DstTy == S96)
      splitLoad(MI, {S64, S32}, S32);
    else if (DstTy == V3S32)
      splitLoad(MI, {V2S32, S32}, S32);
    else if (DstTy == V6S16)
      splitLoad(MI, {V4S16, V2S16}, V2S16);
    else {
      reportGISelFailure(MF, MORE, "amdgpu-regbanklegalize",
                         "AMDGPU RegBankLegalize: SplitLoad, unsupported type",
                         MI);
      return false;
    }
    return true;
  }
  case WidenLoad: {
    LLT DstTy = MRI.getType(MI.getOperand(0).getReg());
    if (DstTy == S96)
      widenLoad(MI, S128);
    else if (DstTy == V3S32)
      widenLoad(MI, V4S32, S32);
    else if (DstTy == V6S16)
      widenLoad(MI, V8S16, V2S16);
    else {
      reportGISelFailure(MF, MORE, "amdgpu-regbanklegalize",
                         "AMDGPU RegBankLegalize: WidenLoad, unsupported type",
                         MI);
      return false;
    }
    return true;
  }
  case UnpackAExt:
    return lowerUnpackAExt(MI);
  case WidenMMOToS32:
    return widenMMOToS32(cast<GAnyLoad>(MI));
  case VerifyAllSgpr: {
    assert(llvm::all_of(MI.operands(), [&](const MachineOperand &Op) {
      return MRI.getRegBankOrNull(Op.getReg()) == SgprRB;
    }));
    return true;
  }
  case ApplyAllVgpr: {
    assert(llvm::all_of(MI.defs(), [&](const MachineOperand &Op) {
      return MRI.getRegBankOrNull(Op.getReg()) == VgprRB;
    }));
    B.setInstrAndDebugLoc(MI);
    for (unsigned i = MI.getNumDefs(); i < MI.getNumOperands(); ++i) {
      Register Reg = MI.getOperand(i).getReg();
      if (MRI.getRegBank(Reg) != VgprRB) {
        auto Copy = B.buildCopy({VgprRB, MRI.getType(Reg)}, Reg);
        MI.getOperand(i).setReg(Copy.getReg(0));
      }
    }
    return true;
  }
  case UnmergeToShiftTrunc: {
    GUnmerge *Unmerge = dyn_cast<GUnmerge>(&MI);
    LLT Ty = MRI.getType(Unmerge->getSourceReg());
    if (Ty.getSizeInBits() % 32 != 0) {
      reportGISelFailure(MF, MORE, "amdgpu-regbanklegalize",
                         "AMDGPU RegBankLegalize: unmerge not multiple of 32",
                         MI);
      return false;
    }

    B.setInstrAndDebugLoc(MI);
    if (Ty.getSizeInBits() > 32) {
      auto UnmergeV2S16 =
          B.buildUnmerge({SgprRB, V2S16}, Unmerge->getSourceReg());
      for (unsigned i = 0; i < UnmergeV2S16->getNumDefs(); ++i) {
        auto [Dst0S32, Dst1S32] =
            unpackAExt(UnmergeV2S16->getOperand(i).getReg());
        B.buildTrunc(MI.getOperand(i * 2).getReg(), Dst0S32);
        B.buildTrunc(MI.getOperand(i * 2 + 1).getReg(), Dst1S32);
      }
    } else {
      auto [Dst0S32, Dst1S32] = unpackAExt(MI.getOperand(2).getReg());
      B.buildTrunc(MI.getOperand(0).getReg(), Dst0S32);
      B.buildTrunc(MI.getOperand(1).getReg(), Dst1S32);
    }

    MI.eraseFromParent();
    return true;
  }
  // Widen a small uniform PHI to S32: trunc the def after the PHI and
  // any-extend each incoming value right after it is defined.
  case PHIToS32: { // NOTE: assumed case ID; the exact identifier was lost.
    Register Dst = MI.getOperand(0).getReg();
    Register NewDst = MRI.createVirtualRegister(SgprRB_S32);
    B.setInsertPt(*MI.getParent(), MI.getParent()->getFirstNonPHI());
    MI.getOperand(0).setReg(NewDst);
    B.buildTrunc(Dst, NewDst);

    for (unsigned i = 1; i < MI.getNumOperands(); i += 2) {
      Register UseReg = MI.getOperand(i).getReg();

      auto DefMI = MRI.getVRegDef(UseReg)->getIterator();
      MachineBasicBlock *DefMBB = DefMI->getParent();

      B.setInsertPt(*DefMBB, DefMBB->SkipPHIsAndLabels(std::next(DefMI)));

      auto NewUse = B.buildAnyExt(SgprRB_S32, UseReg);
      MI.getOperand(i).setReg(NewUse.getReg(0));
    }
    break;
  }
  case VerifyAllSgprGPHI: {
    assert(llvm::all_of(MI.operands(), [&](const MachineOperand &Op) {
      if (Op.isMBB())
        return true;
      return MRI.getRegBankOrNull(Op.getReg()) == SgprRB;
    }));
    return true;
  }
  case VerifyVgprGPHI: { // NOTE: assumed case ID; the exact identifier was lost.
    assert(MRI.getRegBankOrNull(MI.getOperand(0).getReg()) == VgprRB);
    assert(llvm::all_of(MI.operands(), [&](const MachineOperand &Op) {
      if (Op.isMBB())
        return true;
      const RegisterBank *RB = MRI.getRegBankOrNull(Op.getReg());
      return RB == VgprRB || RB == SgprRB;
    }));
    return true;
  }
  case ApplyINTRIN_IMAGE:
    return applyRegisterBanksINTRIN_IMAGE(MI);
  case SplitBitCount64To32:
    return lowerSplitBitCount64To32(MI);
  }

  if (!WFI.SgprWaterfallOperandRegs.empty()) {
    if (!executeInWaterfallLoop(B, WFI))
      return false;
  }
  return true;
}

LLT RegBankLegalizeHelper::getTyFromID(RegBankLLTMappingApplyID ID) {
  switch (ID) {
  case Vcc:
  case UniInVcc:
    return LLT::scalar(1);
  case Sgpr16:
  case Vgpr16:
  case UniInVgprS16:
    return LLT::scalar(16);
  case Sgpr32:
  case Sgpr32_WF:
  case Sgpr32Trunc:
  case Sgpr32AExt:
  case Sgpr32AExtBoolInReg:
  case Sgpr32SExt:
  case Sgpr32ZExt:
  case UniInVgprS32:
  case Sgpr32ToVgprDst:
  case Vgpr32:
  case Vgpr32AExt:
  case Vgpr32SExt:
  case Vgpr32ZExt:
    return LLT::scalar(32);
  case Sgpr64:
  case Vgpr64:
  case UniInVgprS64:
  case Sgpr64ToVgprDst:
    return LLT::scalar(64);
  case Sgpr128:
  case Vgpr128:
    return LLT::scalar(128);
  case SgprP0:
  case SgprP0Call_WF:
  case VgprP0:
    return LLT::pointer(0, 64);
  case SgprP1:
  case VgprP1:
    return LLT::pointer(1, 64);
  case SgprP2:
  case VgprP2:
    return LLT::pointer(2, 32);
  case SgprP3:
  case VgprP3:
    return LLT::pointer(3, 32);
  case SgprP4:
  case SgprP4Call_WF:
  case VgprP4:
    return LLT::pointer(4, 64);
  case SgprP5:
  case VgprP5:
    return LLT::pointer(5, 32);
  case SgprP8:
    return LLT::pointer(8, 128);
  case SgprV2S16:
  case VgprV2S16:
  case UniInVgprV2S16:
    return LLT::fixed_vector(2, 16);
  case SgprV2S32:
  case VgprV2S32:
  case UniInVgprV2S32:
    return LLT::fixed_vector(2, 32);
  case VgprV3S32:
    return LLT::fixed_vector(3, 32);
  case VgprV4S16:
    return LLT::fixed_vector(4, 16);
  case SgprV4S32:
  case SgprV4S32_WF:
  case VgprV4S32:
  case UniInVgprV4S32:
    return LLT::fixed_vector(4, 32);
  case VgprV8S32:
    return LLT::fixed_vector(8, 32);
  case VgprV2S64:
  case UniInVgprV2S64:
    return LLT::fixed_vector(2, 64);
  default:
    return LLT();
  }
}

LLT RegBankLegalizeHelper::getBTyFromID(RegBankLLTMappingApplyID ID, LLT Ty) {
  switch (ID) {
  case SgprB32:
  case VgprB32:
  case SgprB32_M0:
  case SgprB32_ReadFirstLane:
  case UniInVgprB32:
    if (Ty == LLT::scalar(32) || Ty == LLT::fixed_vector(2, 16) ||
        isAnyPtr(Ty, 32))
      return Ty;
    return LLT();
  case SgprPtr32:
  case VgprPtr32:
    return isAnyPtr(Ty, 32) ? Ty : LLT();
  case SgprPtr64:
  case VgprPtr64:
    return isAnyPtr(Ty, 64) ? Ty : LLT();
  case SgprPtr128:
  case VgprPtr128:
    return isAnyPtr(Ty, 128) ? Ty : LLT();
  case SgprB64:
  case VgprB64:
  case SgprB64_ReadFirstLane:
  case UniInVgprB64:
    if (Ty == LLT::scalar(64) || Ty == LLT::fixed_vector(2, 32) ||
        Ty == LLT::fixed_vector(4, 16) || isAnyPtr(Ty, 64))
      return Ty;
    return LLT();
  case SgprB96:
  case VgprB96:
  case UniInVgprB96:
    if (Ty == LLT::scalar(96) || Ty == LLT::fixed_vector(3, 32) ||
        Ty == LLT::fixed_vector(6, 16))
      return Ty;
    return LLT();
  case SgprB128:
  case VgprB128:
  case UniInVgprB128:
    if (Ty == LLT::scalar(128) || Ty == LLT::fixed_vector(4, 32) ||
        Ty == LLT::fixed_vector(2, 64) || Ty == LLT::fixed_vector(8, 16) ||
        isAnyPtr(Ty, 128))
      return Ty;
    return LLT();
  case VgprB160:
  case UniInVgprB160:
    if (Ty.getSizeInBits() == 160)
      return Ty;
    return LLT();
  case SgprB256:
  case VgprB256:
  case UniInVgprB256:
    if (Ty == LLT::scalar(256) || Ty == LLT::fixed_vector(8, 32) ||
        Ty == LLT::fixed_vector(4, 64) || Ty == LLT::fixed_vector(16, 16))
      return Ty;
    return LLT();
  case SgprB512:
  case VgprB512:
  case UniInVgprB512:
    if (Ty == LLT::scalar(512) || Ty == LLT::fixed_vector(16, 32) ||
        Ty == LLT::fixed_vector(8, 64))
      return Ty;
    return LLT();
  case SgprBRC: {
    const SIRegisterInfo *TRI =
        static_cast<const SIRegisterInfo *>(MRI.getTargetRegisterInfo());
    unsigned LLTSize = Ty.getSizeInBits();
    if (LLTSize >= 32 && TRI->getSGPRClassForBitWidth(LLTSize))
      return Ty;
    return LLT();
  }
  case VgprBRC: {
    const SIRegisterInfo *TRI =
        static_cast<const SIRegisterInfo *>(MRI.getTargetRegisterInfo());
    if (TRI->getSGPRClassForBitWidth(Ty.getSizeInBits()))
      return Ty;
    return LLT();
  }
  default:
    return LLT();
  }
}

const RegisterBank *
RegBankLegalizeHelper::getRegBankFromID(RegBankLLTMappingApplyID ID) {
  switch (ID) {
  case Vcc:
    return VccRB;
  case Sgpr16:
  case Sgpr32:
  case Sgpr32_WF:
  case Sgpr64:
  case Sgpr128:
  case SgprP0:
  case SgprP0Call_WF:
  case SgprP1:
  case SgprP2:
  case SgprP3:
  case SgprP4:
  case SgprP4Call_WF:
  case SgprP5:
  case SgprP8:
  case SgprPtr32:
  case SgprPtr64:
  case SgprPtr128:
  case SgprV2S16:
  case SgprV2S32:
  case SgprV4S32:
  case SgprV4S32_WF:
  case SgprB32:
  case SgprB64:
  case SgprB96:
  case SgprB128:
  case SgprB256:
  case SgprB512:
  case SgprBRC:
  case UniInVcc:
  case UniInVgprS16:
  case UniInVgprS32:
  case UniInVgprS64:
  case UniInVgprV2S16:
  case UniInVgprV2S32:
  case UniInVgprV4S32:
  case UniInVgprV2S64:
  case UniInVgprB32:
  case UniInVgprB64:
  case UniInVgprB96:
  case UniInVgprB128:
  case UniInVgprB160:
  case UniInVgprB256:
  case UniInVgprB512:
  case Sgpr32Trunc:
  case Sgpr32AExt:
  case Sgpr32AExtBoolInReg:
  case Sgpr32SExt:
  case Sgpr32ZExt:
    return SgprRB;
  case Vgpr16:
  case Vgpr32:
  case Vgpr64:
  case Vgpr128:
  case VgprP0:
  case VgprP1:
  case VgprP2:
  case VgprP3:
  case VgprP4:
  case VgprP5:
  case VgprPtr32:
  case VgprPtr64:
  case VgprPtr128:
  case VgprV2S16:
  case VgprV2S32:
  case VgprV2S64:
  case VgprV3S32:
  case VgprV4S16:
  case VgprV4S32:
  case VgprV8S32:
  case VgprB32:
  case VgprB64:
  case VgprB96:
  case VgprB128:
  case VgprB160:
  case VgprB256:
  case VgprB512:
  case VgprBRC:
  case Vgpr32AExt:
  case Vgpr32SExt:
  case Vgpr32ZExt:
  case Sgpr32ToVgprDst:
  case Sgpr64ToVgprDst:
    return VgprRB;
  default:
    return nullptr;
  }
}

bool RegBankLegalizeHelper::applyMappingDst(
    MachineInstr &MI, unsigned &OpIdx,
    const SmallVectorImpl<RegBankLLTMappingApplyID> &MethodIDs) {
  // Defs start from operand 0.
  for (; OpIdx < MethodIDs.size(); ++OpIdx) {
    if (MethodIDs[OpIdx] == None)
      continue;
    MachineOperand &Op = MI.getOperand(OpIdx);
    Register Reg = Op.getReg();
    LLT Ty = MRI.getType(Reg);
    [[maybe_unused]] const RegisterBank *RB = MRI.getRegBank(Reg);

    switch (MethodIDs[OpIdx]) {
    // vcc, sgpr and vgpr scalars, pointers and vectors
    case Vcc:
    case Sgpr16:
    case Sgpr32:
    case Sgpr64:
    case Sgpr128:
    case SgprP0:
    case SgprP1:
    case SgprP3:
    case SgprP4:
    case SgprP5:
    case SgprP8:
    case SgprV2S16:
    case SgprV2S32:
    case SgprV4S32:
    case Vgpr16:
    case Vgpr32:
    case Vgpr64:
    case Vgpr128:
    case VgprP0:
    case VgprP1:
    case VgprP2:
    case VgprP3:
    case VgprP4:
    case VgprP5:
    case VgprV2S16:
    case VgprV2S32:
    case VgprV2S64:
    case VgprV3S32:
    case VgprV4S16:
    case VgprV4S32:
    case VgprV8S32: {
      assert(Ty == getTyFromID(MethodIDs[OpIdx]));
      assert(RB == getRegBankFromID(MethodIDs[OpIdx]));
      break;
    }
    // sgpr and vgpr B-types
    case SgprB32:
    case SgprB64:
    case SgprB96:
    case SgprB128:
    case SgprB256:
    case SgprB512:
    case SgprBRC:
    case SgprPtr32:
    case SgprPtr64:
    case SgprPtr128:
    case VgprB32:
    case VgprB64:
    case VgprB96:
    case VgprB128:
    case VgprB160:
    case VgprB256:
    case VgprB512:
    case VgprBRC:
    case VgprPtr32:
    case VgprPtr64:
    case VgprPtr128: {
      assert(Ty == getBTyFromID(MethodIDs[OpIdx], Ty));
      assert(RB == getRegBankFromID(MethodIDs[OpIdx]));
      break;
    }
    // uniform in vcc/vgpr: scalars, vectors and B-types
    case UniInVcc: {
      assert(Ty == S1);
      assert(RB == SgprRB);
      Register NewDst = MRI.createVirtualRegister(VccRB_S1);
      Op.setReg(NewDst);
      if (!MRI.use_empty(Reg)) {
        auto CopyS32_Vcc =
            B.buildInstr(AMDGPU::G_AMDGPU_COPY_SCC_VCC, {SgprRB_S32}, {NewDst});
        B.buildTrunc(Reg, CopyS32_Vcc);
      }
      break;
    }
    case UniInVgprS16: {
      assert(Ty == getTyFromID(MethodIDs[OpIdx]));
      assert(RB == SgprRB);
      Register NewVgprDstS16 = MRI.createVirtualRegister({VgprRB, S16});
      Register NewVgprDstS32 = MRI.createVirtualRegister({VgprRB, S32});
      Register NewSgprDstS32 = MRI.createVirtualRegister({SgprRB, S32});
      Op.setReg(NewVgprDstS16);
      B.buildAnyExt(NewVgprDstS32, NewVgprDstS16);
      buildReadAnyLane(B, NewSgprDstS32, NewVgprDstS32, RBI);
      B.buildTrunc(Reg, NewSgprDstS32);
      break;
    }
    case UniInVgprS32:
    case UniInVgprS64:
    case UniInVgprV2S16:
    case UniInVgprV2S32:
    case UniInVgprV4S32:
    case UniInVgprV2S64: {
      assert(Ty == getTyFromID(MethodIDs[OpIdx]));
      assert(RB == SgprRB);
      Register NewVgprDst = MRI.createVirtualRegister({VgprRB, Ty});
      Op.setReg(NewVgprDst);
      buildReadAnyLane(B, Reg, NewVgprDst, RBI);
      break;
    }
    case UniInVgprB32:
    case UniInVgprB64:
    case UniInVgprB96:
    case UniInVgprB128:
    case UniInVgprB160:
    case UniInVgprB256:
    case UniInVgprB512: {
      assert(Ty == getBTyFromID(MethodIDs[OpIdx], Ty));
      assert(RB == SgprRB);
      Register NewVgprDst = MRI.createVirtualRegister({VgprRB, Ty});
      Op.setReg(NewVgprDst);
      AMDGPU::buildReadAnyLane(B, Reg, NewVgprDst, RBI);
      break;
    }
    // sgpr trunc
    case Sgpr32Trunc: {
      assert(Ty.getSizeInBits() < 32);
      assert(RB == SgprRB);
      Register NewDst = MRI.createVirtualRegister(SgprRB_S32);
      Op.setReg(NewDst);
      if (!MRI.use_empty(Reg))
        B.buildTrunc(Reg, NewDst);
      break;
    }
    case Sgpr32ToVgprDst:
    case Sgpr64ToVgprDst: {
      assert(Ty == getTyFromID(MethodIDs[OpIdx]));
      assert(RB == VgprRB);
      Op.setReg(MRI.createVirtualRegister({SgprRB, Ty}));
      B.buildCopy(Reg, Op.getReg());
      break;
    }
    case InvalidMapping: {
      reportGISelFailure(
          MF, MORE, "amdgpu-regbanklegalize",
          "AMDGPU RegBankLegalize: missing fast rule ('Div' or 'Uni') for", MI);
      return false;
    }
    default:
      reportGISelFailure(
          MF, MORE, "amdgpu-regbanklegalize",
          "AMDGPU RegBankLegalize: applyMappingDst, ID not supported", MI);
      return false;
    }
  }

  return true;
}

bool RegBankLegalizeHelper::applyMappingSrc(
    MachineInstr &MI, unsigned &OpIdx,
    const SmallVectorImpl<RegBankLLTMappingApplyID> &MethodIDs,
    WaterfallInfo &WFI) {
  for (unsigned i = 0; i < MethodIDs.size(); ++OpIdx, ++i) {
    if (MethodIDs[i] == None || MethodIDs[i] == IntrId || MethodIDs[i] == Imm)
      continue;

    MachineOperand &Op = MI.getOperand(OpIdx);
    Register Reg = Op.getReg();
    LLT Ty = MRI.getType(Reg);
    const RegisterBank *RB = MRI.getRegBank(Reg);

    switch (MethodIDs[i]) {
    case Vcc: {
      assert(Ty == S1);
      assert(RB == VccRB || RB == SgprRB);
      if (RB == SgprRB) {
        auto Aext = B.buildAnyExt(SgprRB_S32, Reg);
        auto CopyVcc_Scc =
            B.buildInstr(AMDGPU::G_AMDGPU_COPY_VCC_SCC, {VccRB_S1}, {Aext});
        Op.setReg(CopyVcc_Scc.getReg(0));
      }
      break;
    }
    // sgpr scalars, pointers and vectors
    case Sgpr16:
    case Sgpr32:
    case Sgpr64:
    case Sgpr128:
    case SgprP0:
    case SgprP1:
    case SgprP3:
    case SgprP4:
    case SgprP5:
    case SgprP8:
    case SgprV2S16:
    case SgprV2S32:
    case SgprV4S32: {
      assert(Ty == getTyFromID(MethodIDs[i]));
      assert(RB == getRegBankFromID(MethodIDs[i]));
      break;
    }
    // sgpr B-types
    case SgprB32:
    case SgprB64:
    case SgprB96:
    case SgprB128:
    case SgprB256:
    case SgprB512:
    case SgprBRC:
    case SgprPtr32:
    case SgprPtr64:
    case SgprPtr128: {
      assert(Ty == getBTyFromID(MethodIDs[i], Ty));
      assert(RB == getRegBankFromID(MethodIDs[i]));
      break;
    }
    // vgpr scalars, pointers and vectors
    case Vgpr16:
    case Vgpr32:
    case Vgpr64:
    case Vgpr128:
    case VgprP0:
    case VgprP1:
    case VgprP2:
    case VgprP3:
    case VgprP4:
    case VgprP5:
    case VgprV2S16:
    case VgprV2S32:
    case VgprV2S64:
    case VgprV3S32:
    case VgprV4S16:
    case VgprV4S32:
    case VgprV8S32: {
      assert(Ty == getTyFromID(MethodIDs[i]));
      if (RB != VgprRB) {
        auto CopyToVgpr = B.buildCopy({VgprRB, Ty}, Reg);
        Op.setReg(CopyToVgpr.getReg(0));
      }
      break;
    }
    // vgpr B-types
    case VgprB32:
    case VgprB64:
    case VgprB96:
    case VgprB128:
    case VgprB160:
    case VgprB256:
    case VgprB512:
    case VgprBRC:
    case VgprPtr32:
    case VgprPtr64:
    case VgprPtr128: {
      assert(Ty == getBTyFromID(MethodIDs[i], Ty));
      if (RB != VgprRB) {
        auto CopyToVgpr = B.buildCopy({VgprRB, Ty}, Reg);
        Op.setReg(CopyToVgpr.getReg(0));
      }
      break;
    }
    // sgpr waterfall, scalars, and vectors
    case Sgpr32_WF:
    case SgprV4S32_WF: {
      assert(Ty == getTyFromID(MethodIDs[i]));
      if (RB != SgprRB) {
        WFI.SgprWaterfallOperandRegs.insert(Reg);
        if (!WFI.Start.isValid()) {
          WFI.Start = MI.getIterator();
          WFI.End = std::next(MI.getIterator());
        }
      }
      break;
    }
    case SgprP0Call_WF:
    case SgprP4Call_WF: {
      assert(Ty == getTyFromID(MethodIDs[i]));
      if (RB != SgprRB) {
        WFI.SgprWaterfallOperandRegs.insert(Reg);

        // Find the ADJCALLSTACKUP before the call.
        MachineBasicBlock::iterator Start = MI.getIterator();
        while (Start->getOpcode() != AMDGPU::ADJCALLSTACKUP)
          --Start;

        // Find the ADJCALLSTACKDOWN after the call (include it in range).
        MachineBasicBlock::iterator End = MI.getIterator();
        while (End->getOpcode() != AMDGPU::ADJCALLSTACKDOWN)
          ++End;
        ++End;

        B.setInsertPt(*MI.getParent(), Start);
        WFI.Start = Start;
        WFI.End = End;
      }
      break;
    }
    case SgprB32_M0:
    case SgprB32_ReadFirstLane:
    case SgprB64_ReadFirstLane: {
      assert(Ty == getBTyFromID(MethodIDs[i], Ty));
      if (RB == SgprRB)
        break;
      assert(RB == VgprRB);
      Register NewSGPR = MRI.createVirtualRegister({SgprRB, Ty});
      buildReadFirstLane(B, NewSGPR, Op.getReg(), RBI);
      Op.setReg(NewSGPR);
      break;
    }
    // sgpr and vgpr scalars with extend
    case Sgpr32AExt: {
      // Note: this ext allows S1, and it is meant to be combined away.
      assert(Ty.getSizeInBits() < 32);
      assert(RB == SgprRB);
      auto Aext = B.buildAnyExt(SgprRB_S32, Reg);
      Op.setReg(Aext.getReg(0));
      break;
    }
1749 case Sgpr32AExtBoolInReg: {
1750 // Note: this ext allows S1, and it is meant to be combined away.
1751 assert(Ty.getSizeInBits() == 1);
1752 assert(RB == SgprRB);
1753 auto Aext = B.buildAnyExt(SgprRB_S32, Reg);
1754 // Zext of SgprS1 is not legal; use an AND with 1 instead. This instruction
1755 // is most of the time meant to be combined away in AMDGPURegBankCombiner.
1756 auto Cst1 = B.buildConstant(SgprRB_S32, 1);
1757 auto BoolInReg = B.buildAnd(SgprRB_S32, Aext, Cst1);
1758 Op.setReg(BoolInReg.getReg(0));
1759 break;
1760 }
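// For example (illustrative MIR, placeholder names), an S1 condition consumed
// as a bool-in-reg becomes:
//   %ext:sgpr(s32) = G_ANYEXT %cond:sgpr(s1)
//   %one:sgpr(s32) = G_CONSTANT i32 1
//   %bool:sgpr(s32) = G_AND %ext, %one
// so bit 0 is well-defined no matter what G_ANYEXT left in the high bits.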
1761 case Sgpr32SExt: {
1762 assert(1 < Ty.getSizeInBits() && Ty.getSizeInBits() < 32);
1763 assert(RB == SgprRB);
1764 auto Sext = B.buildSExt(SgprRB_S32, Reg);
1765 Op.setReg(Sext.getReg(0));
1766 break;
1767 }
1768 case Sgpr32ZExt: {
1769 assert(1 < Ty.getSizeInBits() && Ty.getSizeInBits() < 32);
1770 assert(RB == SgprRB);
1771 auto Zext = B.buildZExt({SgprRB, S32}, Reg);
1772 Op.setReg(Zext.getReg(0));
1773 break;
1774 }
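// Illustrative contrast for an s16 source: Sgpr32SExt replicates the sign bit
// into bits 16..31 (G_SEXT to s32) while Sgpr32ZExt clears them (G_ZEXT to
// s32); neither accepts S1, which must instead take the AExt or
// AExtBoolInReg path above.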
1775 case Vgpr32AExt: {
1776 assert(Ty.getSizeInBits() < 32);
1777 assert(RB == VgprRB);
1778 auto Aext = B.buildAnyExt({VgprRB, S32}, Reg);
1779 Op.setReg(Aext.getReg(0));
1780 break;
1781 }
1782 case Vgpr32SExt: {
1783 // Note: this ext allows S1, and it is meant to be combined away.
1784 assert(Ty.getSizeInBits() < 32);
1785 assert(RB == VgprRB);
1786 auto Sext = B.buildSExt({VgprRB, S32}, Reg);
1787 Op.setReg(Sext.getReg(0));
1788 break;
1789 }
1790 case Vgpr32ZExt: {
1791 // Note: this ext allows S1, and it is meant to be combined away.
1792 assert(Ty.getSizeInBits() < 32);
1793 assert(RB == VgprRB);
1794 auto Zext = B.buildZExt({VgprRB, S32}, Reg);
1795 Op.setReg(Zext.getReg(0));
1796 break;
1797 }
1798 default:
1799 reportGISelFailure(
1800 MF, MORE, "amdgpu-regbanklegalize",
1801 "AMDGPU RegBankLegalize: applyMappingSrc, ID not supported", MI);
1802 return false;
1803 }
1804 }
1805 return true;
1806}
1807
1808[[maybe_unused]] static bool verifyRegBankOnOperands(MachineInstr &MI,
1809 const RegisterBank *RB,
1810 MachineRegisterInfo &MRI,
1811 unsigned StartOpIdx,
1812 unsigned EndOpIdx) {
1813 for (unsigned i = StartOpIdx; i <= EndOpIdx; ++i) {
1814 if (MRI.getRegBankOrNull(MI.getOperand(i).getReg()) != RB)
1815 return false;
1816 }
1817 return true;
1818}
1819
1820void RegBankLegalizeHelper::applyMappingTrivial(MachineInstr &MI) {
1821 const RegisterBank *RB = MRI.getRegBank(MI.getOperand(0).getReg());
1822 // Put RB on all registers
1823 unsigned NumDefs = MI.getNumDefs();
1824 unsigned NumOperands = MI.getNumOperands();
1825
1826 assert(verifyRegBankOnOperands(MI, RB, MRI, 0, NumDefs - 1));
1827 if (RB == SgprRB)
1828 assert(verifyRegBankOnOperands(MI, RB, MRI, NumDefs, NumOperands - 1));
1829
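// Note: only the VGPR direction is repaired with plain copies below; an
// SGPR-mapped instruction must already have all-SGPR operands (asserted
// above), since fixing a VGPR operand there would require a readfirstlane,
// which is not generally valid for divergent values.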
1830 if (RB == VgprRB) {
1831 B.setInstr(MI);
1832 for (unsigned i = NumDefs; i < NumOperands; ++i) {
1833 Register Reg = MI.getOperand(i).getReg();
1834 if (MRI.getRegBank(Reg) != RB) {
1835 auto Copy = B.buildCopy({VgprRB, MRI.getType(Reg)}, Reg);
1836 MI.getOperand(i).setReg(Copy.getReg(0));
1837 }
1838 }
1839 }
1840}
1841
1842bool RegBankLegalizeHelper::applyRegisterBanksINTRIN_IMAGE(MachineInstr &MI) {
1843 const AMDGPU::RsrcIntrinsic *RSrcIntrin =
1844 AMDGPU::lookupRsrcIntrinsic(AMDGPU::getIntrinsicID(MI));
1845 assert(RSrcIntrin && RSrcIntrin->IsImage);
1846
1847 unsigned RsrcIdx = RSrcIntrin->RsrcArg;
1848 const unsigned NumDefs = MI.getNumExplicitDefs();
1849
1850 // The reported argument index is relative to the IR intrinsic call arguments,
1851 // so we need to shift by the number of defs and the intrinsic ID.
1852 RsrcIdx += NumDefs + 1;
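// Illustrative operand layout for an image intrinsic with one def:
//   operand 0: result vreg (def)
//   operand 1: intrinsic ID
//   operand 2+: IR call arguments, so IR argument N lives at machine
//               operand NumDefs + 1 + N.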
1853
1854 MachineBasicBlock *MBB = MI.getParent();
1855 B.setInsertPt(*MBB, MBB->SkipPHIsAndLabels(std::next(MI.getIterator())));
1856
1857 // Defs (for image loads with return) are vgpr.
1858 for (unsigned i = 0; i < NumDefs; ++i) {
1859 const RegisterBank *RB = MRI.getRegBank(MI.getOperand(i).getReg());
1860 if (RB == VgprRB)
1861 continue;
1862
1863 Register Reg = MI.getOperand(i).getReg();
1864 Register NewVgprDst = MRI.createVirtualRegister({VgprRB, MRI.getType(Reg)});
1865 MI.getOperand(i).setReg(NewVgprDst);
1866 buildReadAnyLane(B, Reg, NewVgprDst, RBI);
1867 }
1868
1869 B.setInstrAndDebugLoc(MI);
1870
1871 // Register uses (before RsrcIdx) are vgpr.
1872 for (unsigned i = 1; i < RsrcIdx; ++i) {
1873 MachineOperand &Op = MI.getOperand(i);
1874 if (!Op.isReg())
1875 continue;
1876
1877 Register Reg = Op.getReg();
1878 if (!Reg.isVirtual())
1879 continue;
1880
1881 if (MRI.getRegBank(Reg) == VgprRB)
1882 continue;
1883
1884 auto Copy = B.buildCopy({VgprRB, MRI.getType(Reg)}, Reg);
1885 Op.setReg(Copy.getReg(0));
1886 }
1887
1888 SmallSet<Register, 4> OpsToWaterfall;
1889
1890 // Register use RsrcIdx (and, in some cases, RsrcIdx+1) is sgpr.
1891 for (unsigned i = RsrcIdx; i < MI.getNumOperands(); ++i) {
1892 MachineOperand &Op = MI.getOperand(i);
1893 if (!Op.isReg())
1894 continue;
1895
1896 Register Reg = Op.getReg();
1897 if (MRI.getRegBank(Reg) != SgprRB)
1898 OpsToWaterfall.insert(Reg);
1899 }
1900
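// Illustrative: when the rsrc (or sampler) descriptor is divergent, the image
// instruction itself is wrapped in a waterfall loop over the
// single-instruction range [MI, next(MI)), which readfirstlanes a candidate
// descriptor and executes the instruction for the matching subset of EXEC
// until all lanes are covered.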
1901 if (!OpsToWaterfall.empty()) {
1902 MachineBasicBlock::iterator MII = MI.getIterator();
1903 executeInWaterfallLoop(B, {OpsToWaterfall, MII, std::next(MII)});
1904 }
1905
1906 return true;
1907}