LLVM 23.0.0git
AMDGPUInstructionSelector.cpp
Go to the documentation of this file.
1//===- AMDGPUInstructionSelector.cpp ----------------------------*- C++ -*-==//
2//
3// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4// See https://llvm.org/LICENSE.txt for license information.
5// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6//
7//===----------------------------------------------------------------------===//
8/// \file
9/// This file implements the targeting of the InstructionSelector class for
10/// AMDGPU.
11/// \todo This should be generated by TableGen.
12//===----------------------------------------------------------------------===//
13
15#include "AMDGPU.h"
17#include "AMDGPUInstrInfo.h"
19#include "AMDGPUTargetMachine.h"
29#include "llvm/IR/IntrinsicsAMDGPU.h"
30#include <optional>
31
32#define DEBUG_TYPE "amdgpu-isel"
33
34using namespace llvm;
35using namespace MIPatternMatch;
36
37#define GET_GLOBALISEL_IMPL
38#define AMDGPUSubtarget GCNSubtarget
39#include "AMDGPUGenGlobalISel.inc"
40#undef GET_GLOBALISEL_IMPL
41#undef AMDGPUSubtarget
42
44 const GCNSubtarget &STI, const AMDGPURegisterBankInfo &RBI,
45 const AMDGPUTargetMachine &TM)
46 : TII(*STI.getInstrInfo()), TRI(*STI.getRegisterInfo()), RBI(RBI), TM(TM),
47 STI(STI),
49#include "AMDGPUGenGlobalISel.inc"
52#include "AMDGPUGenGlobalISel.inc"
54{
55}
56
57const char *AMDGPUInstructionSelector::getName() { return DEBUG_TYPE; }
58
69
70// Return the wave level SGPR base address if this is a wave address.
72 return Def->getOpcode() == AMDGPU::G_AMDGPU_WAVE_ADDRESS
73 ? Def->getOperand(1).getReg()
74 : Register();
75}
76
77bool AMDGPUInstructionSelector::isVCC(Register Reg,
78 const MachineRegisterInfo &MRI) const {
79 // The verifier is oblivious to s1 being a valid value for wavesize registers.
80 if (Reg.isPhysical())
81 return false;
82
83 auto &RegClassOrBank = MRI.getRegClassOrRegBank(Reg);
84 const TargetRegisterClass *RC =
86 if (RC) {
87 const LLT Ty = MRI.getType(Reg);
88 if (!Ty.isValid() || Ty.getSizeInBits() != 1)
89 return false;
90 // G_TRUNC s1 result is never vcc.
91 return MRI.getVRegDef(Reg)->getOpcode() != AMDGPU::G_TRUNC &&
92 RC->hasSuperClassEq(TRI.getBoolRC());
93 }
94
95 const RegisterBank *RB = cast<const RegisterBank *>(RegClassOrBank);
96 return RB->getID() == AMDGPU::VCCRegBankID;
97}
98
// Rewrite a copy-like intrinsic in place to the concrete machine opcode
// \p NewOpc: drop the intrinsic-ID operand, add an implicit EXEC use, and
// constrain source and destination to a single common register class.
// Returns false (selection failure) on s1 types or class mismatch.
bool AMDGPUInstructionSelector::constrainCopyLikeIntrin(MachineInstr &MI,
                                                        unsigned NewOpc) const {
  MI.setDesc(TII.get(NewOpc));
  MI.removeOperand(1); // Remove intrinsic ID.
  // Implicit use of EXEC (CreateReg: isDef = false, isImp = true).
  MI.addOperand(*MF, MachineOperand::CreateReg(AMDGPU::EXEC, false, true));

  // After removing the intrinsic ID, operand 1 is the source value.
  MachineOperand &Dst = MI.getOperand(0);
  MachineOperand &Src = MI.getOperand(1);

  // TODO: This should be legalized to s32 if needed
  if (MRI->getType(Dst.getReg()) == LLT::scalar(1))
    return false;

  // Both operands must resolve to the same register class.
  const TargetRegisterClass *DstRC
      = TRI.getConstrainedRegClassForOperand(Dst, *MRI);
  const TargetRegisterClass *SrcRC
      = TRI.getConstrainedRegClassForOperand(Src, *MRI);
  if (!DstRC || DstRC != SrcRC)
    return false;

  if (!RBI.constrainGenericRegister(Dst.getReg(), *DstRC, *MRI) ||
      !RBI.constrainGenericRegister(Src.getReg(), *SrcRC, *MRI))
    return false;
  // If the new opcode's operand description marks the result early-clobber,
  // propagate that onto the destination operand.
  const MCInstrDesc &MCID = MI.getDesc();
  if (MCID.getOperandConstraint(0, MCOI::EARLY_CLOBBER) != -1) {
    MI.getOperand(0).setIsEarlyClobber(true);
  }
  return true;
}
128
// Select a generic copy. The interesting cases are copies into a wave-mask
// (vcc) destination: SCC sources just get class-constrained, while non-vcc
// s1 sources must be masked to bit 0 and converted to a lane mask with a
// compare. All other copies only need their operands constrained.
bool AMDGPUInstructionSelector::selectCOPY(MachineInstr &I) const {
  const DebugLoc &DL = I.getDebugLoc();
  MachineBasicBlock *BB = I.getParent();
  I.setDesc(TII.get(TargetOpcode::COPY));

  const MachineOperand &Src = I.getOperand(1);
  MachineOperand &Dst = I.getOperand(0);
  Register DstReg = Dst.getReg();
  Register SrcReg = Src.getReg();

  if (isVCC(DstReg, *MRI)) {
    if (SrcReg == AMDGPU::SCC) {
      // SCC -> vcc: leave the COPY, just constrain the destination.
      const TargetRegisterClass *RC
          = TRI.getConstrainedRegClassForOperand(Dst, *MRI);
      if (!RC)
        return true;
      return RBI.constrainGenericRegister(DstReg, *RC, *MRI);
    }

    if (!isVCC(SrcReg, *MRI)) {
      // Non-vcc s1 -> vcc: must materialize a full wave mask.
      // TODO: Should probably leave the copy and let copyPhysReg expand it.
      if (!RBI.constrainGenericRegister(DstReg, *TRI.getBoolRC(), *MRI))
        return false;

      const TargetRegisterClass *SrcRC
          = TRI.getConstrainedRegClassForOperand(Src, *MRI);

      std::optional<ValueAndVReg> ConstVal =
          getIConstantVRegValWithLookThrough(SrcReg, *MRI, true);
      if (ConstVal) {
        // Constant source: move all-ones (true) or zero (false) directly
        // into the wave-sized destination.
        unsigned MovOpc =
            STI.isWave64() ? AMDGPU::S_MOV_B64 : AMDGPU::S_MOV_B32;
        BuildMI(*BB, &I, DL, TII.get(MovOpc), DstReg)
            .addImm(ConstVal->Value.getBoolValue() ? -1 : 0);
      } else {
        Register MaskedReg = MRI->createVirtualRegister(SrcRC);

        // We can't trust the high bits at this point, so clear them.

        // TODO: Skip masking high bits if def is known boolean.

        if (AMDGPU::getRegBitWidth(SrcRC->getID()) == 16) {
          // 16-bit source class: use the true16 AND/CMP forms (the extra
          // immediates are the src/op modifier operands, all zero).
          assert(Subtarget->useRealTrue16Insts());
          const int64_t NoMods = 0;
          BuildMI(*BB, &I, DL, TII.get(AMDGPU::V_AND_B16_t16_e64), MaskedReg)
              .addImm(NoMods)
              .addImm(1)
              .addImm(NoMods)
              .addReg(SrcReg)
              .addImm(NoMods);
          BuildMI(*BB, &I, DL, TII.get(AMDGPU::V_CMP_NE_U16_t16_e64), DstReg)
              .addImm(NoMods)
              .addImm(0)
              .addImm(NoMods)
              .addReg(MaskedReg)
              .addImm(NoMods);
        } else {
          // Mask to bit 0, then compare != 0 to form the lane mask.
          bool IsSGPR = TRI.isSGPRClass(SrcRC);
          unsigned AndOpc = IsSGPR ? AMDGPU::S_AND_B32 : AMDGPU::V_AND_B32_e32;
          auto And = BuildMI(*BB, &I, DL, TII.get(AndOpc), MaskedReg)
              .addImm(1)
              .addReg(SrcReg);
          if (IsSGPR)
            And.setOperandDead(3); // Dead scc

          BuildMI(*BB, &I, DL, TII.get(AMDGPU::V_CMP_NE_U32_e64), DstReg)
              .addImm(0)
              .addReg(MaskedReg);
        }
      }

      if (!MRI->getRegClassOrNull(SrcReg))
        MRI->setRegClass(SrcReg, SrcRC);
      I.eraseFromParent();
      return true;
    }

    // vcc -> vcc: constrain the destination if a class can be derived.
    const TargetRegisterClass *RC =
        TRI.getConstrainedRegClassForOperand(Dst, *MRI);
    if (RC && !RBI.constrainGenericRegister(DstReg, *RC, *MRI))
      return false;

    return true;
  }

  // Ordinary copy: constrain every virtual operand to the class implied by
  // its bank/use; physical registers are left alone.
  for (const MachineOperand &MO : I.operands()) {
    if (MO.getReg().isPhysical())
      continue;

    const TargetRegisterClass *RC =
        TRI.getConstrainedRegClassForOperand(MO, *MRI);
    if (!RC)
      continue;
    RBI.constrainGenericRegister(MO.getReg(), *RC, *MRI);
  }
  return true;
}
226
227bool AMDGPUInstructionSelector::selectCOPY_SCC_VCC(MachineInstr &I) const {
228 const DebugLoc &DL = I.getDebugLoc();
229 MachineBasicBlock *BB = I.getParent();
230 Register VCCReg = I.getOperand(1).getReg();
231 MachineInstr *Cmp;
232
233 // Set SCC as a side effect with S_CMP or S_OR.
234 if (STI.hasScalarCompareEq64()) {
235 unsigned CmpOpc =
236 STI.isWave64() ? AMDGPU::S_CMP_LG_U64 : AMDGPU::S_CMP_LG_U32;
237 Cmp = BuildMI(*BB, &I, DL, TII.get(CmpOpc)).addReg(VCCReg).addImm(0);
238 } else {
239 Register DeadDst = MRI->createVirtualRegister(&AMDGPU::SReg_64RegClass);
240 Cmp = BuildMI(*BB, &I, DL, TII.get(AMDGPU::S_OR_B64), DeadDst)
241 .addReg(VCCReg)
242 .addReg(VCCReg);
243 }
244
245 constrainSelectedInstRegOperands(*Cmp, TII, TRI, RBI);
246
247 Register DstReg = I.getOperand(0).getReg();
248 BuildMI(*BB, &I, DL, TII.get(AMDGPU::COPY), DstReg).addReg(AMDGPU::SCC);
249
250 I.eraseFromParent();
251 return RBI.constrainGenericRegister(DstReg, AMDGPU::SReg_32RegClass, *MRI);
252}
253
254bool AMDGPUInstructionSelector::selectCOPY_VCC_SCC(MachineInstr &I) const {
255 const DebugLoc &DL = I.getDebugLoc();
256 MachineBasicBlock *BB = I.getParent();
257
258 Register DstReg = I.getOperand(0).getReg();
259 Register SrcReg = I.getOperand(1).getReg();
260 std::optional<ValueAndVReg> Arg =
261 getIConstantVRegValWithLookThrough(I.getOperand(1).getReg(), *MRI);
262
263 if (Arg) {
264 const int64_t Value = Arg->Value.getZExtValue();
265 if (Value == 0) {
266 unsigned Opcode = STI.isWave64() ? AMDGPU::S_MOV_B64 : AMDGPU::S_MOV_B32;
267 BuildMI(*BB, &I, DL, TII.get(Opcode), DstReg).addImm(0);
268 } else {
269 assert(Value == 1);
270 BuildMI(*BB, &I, DL, TII.get(AMDGPU::COPY), DstReg).addReg(TRI.getExec());
271 }
272 I.eraseFromParent();
273 return RBI.constrainGenericRegister(DstReg, *TRI.getBoolRC(), *MRI);
274 }
275
276 // RegBankLegalize ensures that SrcReg is bool in reg (high bits are 0).
277 BuildMI(*BB, &I, DL, TII.get(AMDGPU::COPY), AMDGPU::SCC).addReg(SrcReg);
278
279 unsigned SelectOpcode =
280 STI.isWave64() ? AMDGPU::S_CSELECT_B64 : AMDGPU::S_CSELECT_B32;
281 MachineInstr *Select = BuildMI(*BB, &I, DL, TII.get(SelectOpcode), DstReg)
282 .addReg(TRI.getExec())
283 .addImm(0);
284
285 I.eraseFromParent();
287 return true;
288}
289
290bool AMDGPUInstructionSelector::selectReadAnyLane(MachineInstr &I) const {
291 Register DstReg = I.getOperand(0).getReg();
292 Register SrcReg = I.getOperand(1).getReg();
293
294 const DebugLoc &DL = I.getDebugLoc();
295 MachineBasicBlock *BB = I.getParent();
296
297 auto RFL = BuildMI(*BB, &I, DL, TII.get(AMDGPU::V_READFIRSTLANE_B32), DstReg)
298 .addReg(SrcReg);
299
300 I.eraseFromParent();
301 constrainSelectedInstRegOperands(*RFL, TII, TRI, RBI);
302 return true;
303}
304
305bool AMDGPUInstructionSelector::selectPHI(MachineInstr &I) const {
306 const Register DefReg = I.getOperand(0).getReg();
307 const LLT DefTy = MRI->getType(DefReg);
308
309 // S1 G_PHIs should not be selected in instruction-select, instead:
310 // - divergent S1 G_PHI should go through lane mask merging algorithm
311 // and be fully inst-selected in AMDGPUGlobalISelDivergenceLowering
312 // - uniform S1 G_PHI should be lowered into S32 G_PHI in AMDGPURegBankSelect
313 if (DefTy == LLT::scalar(1))
314 return false;
315
316 // TODO: Verify this doesn't have insane operands (i.e. VGPR to SGPR copy)
317
318 const RegClassOrRegBank &RegClassOrBank =
319 MRI->getRegClassOrRegBank(DefReg);
320
321 const TargetRegisterClass *DefRC =
323 if (!DefRC) {
324 if (!DefTy.isValid()) {
325 LLVM_DEBUG(dbgs() << "PHI operand has no type, not a gvreg?\n");
326 return false;
327 }
328
329 const RegisterBank &RB = *cast<const RegisterBank *>(RegClassOrBank);
330 DefRC = TRI.getRegClassForTypeOnBank(DefTy, RB);
331 if (!DefRC) {
332 LLVM_DEBUG(dbgs() << "PHI operand has unexpected size/bank\n");
333 return false;
334 }
335 }
336
337 // If inputs have register bank, assign corresponding reg class.
338 // Note: registers don't need to have the same reg bank.
339 for (unsigned i = 1; i != I.getNumOperands(); i += 2) {
340 const Register SrcReg = I.getOperand(i).getReg();
341
342 const RegisterBank *RB = MRI->getRegBankOrNull(SrcReg);
343 if (RB) {
344 const LLT SrcTy = MRI->getType(SrcReg);
345 const TargetRegisterClass *SrcRC =
346 TRI.getRegClassForTypeOnBank(SrcTy, *RB);
347 if (!RBI.constrainGenericRegister(SrcReg, *SrcRC, *MRI))
348 return false;
349 }
350 }
351
352 I.setDesc(TII.get(TargetOpcode::PHI));
353 return RBI.constrainGenericRegister(DefReg, *DefRC, *MRI);
354}
355
357AMDGPUInstructionSelector::getSubOperand64(MachineOperand &MO,
358 const TargetRegisterClass &SubRC,
359 unsigned SubIdx) const {
360
361 MachineInstr *MI = MO.getParent();
362 MachineBasicBlock *BB = MO.getParent()->getParent();
363 Register DstReg = MRI->createVirtualRegister(&SubRC);
364
365 if (MO.isReg()) {
366 unsigned ComposedSubIdx = TRI.composeSubRegIndices(MO.getSubReg(), SubIdx);
367 Register Reg = MO.getReg();
368 BuildMI(*BB, MI, MI->getDebugLoc(), TII.get(AMDGPU::COPY), DstReg)
369 .addReg(Reg, {}, ComposedSubIdx);
370
371 return MachineOperand::CreateReg(DstReg, MO.isDef(), MO.isImplicit(),
372 MO.isKill(), MO.isDead(), MO.isUndef(),
373 MO.isEarlyClobber(), 0, MO.isDebug(),
374 MO.isInternalRead());
375 }
376
377 assert(MO.isImm());
378
379 APInt Imm(64, MO.getImm());
380
381 switch (SubIdx) {
382 default:
383 llvm_unreachable("do not know to split immediate with this sub index.");
384 case AMDGPU::sub0:
385 return MachineOperand::CreateImm(Imm.getLoBits(32).getSExtValue());
386 case AMDGPU::sub1:
387 return MachineOperand::CreateImm(Imm.getHiBits(32).getSExtValue());
388 }
389}
390
391static unsigned getLogicalBitOpcode(unsigned Opc, bool Is64) {
392 switch (Opc) {
393 case AMDGPU::G_AND:
394 return Is64 ? AMDGPU::S_AND_B64 : AMDGPU::S_AND_B32;
395 case AMDGPU::G_OR:
396 return Is64 ? AMDGPU::S_OR_B64 : AMDGPU::S_OR_B32;
397 case AMDGPU::G_XOR:
398 return Is64 ? AMDGPU::S_XOR_B64 : AMDGPU::S_XOR_B32;
399 default:
400 llvm_unreachable("not a bit op");
401 }
402}
403
404bool AMDGPUInstructionSelector::selectG_AND_OR_XOR(MachineInstr &I) const {
405 Register DstReg = I.getOperand(0).getReg();
406 unsigned Size = RBI.getSizeInBits(DstReg, *MRI, TRI);
407
408 const RegisterBank *DstRB = RBI.getRegBank(DstReg, *MRI, TRI);
409 if (DstRB->getID() != AMDGPU::SGPRRegBankID &&
410 DstRB->getID() != AMDGPU::VCCRegBankID)
411 return false;
412
413 bool Is64 = Size > 32 || (DstRB->getID() == AMDGPU::VCCRegBankID &&
414 STI.isWave64());
415 I.setDesc(TII.get(getLogicalBitOpcode(I.getOpcode(), Is64)));
416
417 // Dead implicit-def of scc
418 I.addOperand(MachineOperand::CreateReg(AMDGPU::SCC, true, // isDef
419 true, // isImp
420 false, // isKill
421 true)); // isDead
422 constrainSelectedInstRegOperands(I, TII, TRI, RBI);
423 return true;
424}
425
// Select scalar G_ADD/G_SUB. 32-bit cases map onto a single SALU or VALU op;
// 64-bit adds are split into 32-bit halves chained through a carry and then
// recombined with REG_SEQUENCE. Vector types are rejected.
bool AMDGPUInstructionSelector::selectG_ADD_SUB(MachineInstr &I) const {
  MachineBasicBlock *BB = I.getParent();
  MachineFunction *MF = BB->getParent();
  Register DstReg = I.getOperand(0).getReg();
  const DebugLoc &DL = I.getDebugLoc();
  LLT Ty = MRI->getType(DstReg);
  if (Ty.isVector())
    return false;

  unsigned Size = Ty.getSizeInBits();
  const RegisterBank *DstRB = RBI.getRegBank(DstReg, *MRI, TRI);
  const bool IsSALU = DstRB->getID() == AMDGPU::SGPRRegBankID;
  const bool Sub = I.getOpcode() == TargetOpcode::G_SUB;

  if (Size == 32) {
    if (IsSALU) {
      // Single scalar op; SCC def is unused.
      const unsigned Opc = Sub ? AMDGPU::S_SUB_U32 : AMDGPU::S_ADD_U32;
      MachineInstr *Add =
          BuildMI(*BB, &I, DL, TII.get(Opc), DstReg)
              .add(I.getOperand(1))
              .add(I.getOperand(2))
              .setOperandDead(3); // Dead scc
      I.eraseFromParent();
      constrainSelectedInstRegOperands(*Add, TII, TRI, RBI);
      return true;
    }

    if (STI.hasAddNoCarryInsts()) {
      // VALU add/sub with no carry-out: mutate in place and append the
      // trailing immediate plus an implicit EXEC use.
      const unsigned Opc = Sub ? AMDGPU::V_SUB_U32_e64 : AMDGPU::V_ADD_U32_e64;
      I.setDesc(TII.get(Opc));
      I.addOperand(*MF, MachineOperand::CreateImm(0));
      I.addOperand(*MF, MachineOperand::CreateReg(AMDGPU::EXEC, false, true));
      constrainSelectedInstRegOperands(I, TII, TRI, RBI);
      return true;
    }

    // Only the carry-out forms exist here; the carry def is dead.
    const unsigned Opc = Sub ? AMDGPU::V_SUB_CO_U32_e64 : AMDGPU::V_ADD_CO_U32_e64;

    Register UnusedCarry = MRI->createVirtualRegister(TRI.getWaveMaskRegClass());
    MachineInstr *Add
        = BuildMI(*BB, &I, DL, TII.get(Opc), DstReg)
              .addDef(UnusedCarry, RegState::Dead)
              .add(I.getOperand(1))
              .add(I.getOperand(2))
              .addImm(0);
    I.eraseFromParent();
    constrainSelectedInstRegOperands(*Add, TII, TRI, RBI);
    return true;
  }

  assert(!Sub && "illegal sub should not reach here");

  const TargetRegisterClass &RC
      = IsSALU ? AMDGPU::SReg_64_XEXECRegClass : AMDGPU::VReg_64RegClass;
  const TargetRegisterClass &HalfRC
      = IsSALU ? AMDGPU::SReg_32RegClass : AMDGPU::VGPR_32RegClass;

  // Split both 64-bit sources into low/high 32-bit halves.
  MachineOperand Lo1(getSubOperand64(I.getOperand(1), HalfRC, AMDGPU::sub0));
  MachineOperand Lo2(getSubOperand64(I.getOperand(2), HalfRC, AMDGPU::sub0));
  MachineOperand Hi1(getSubOperand64(I.getOperand(1), HalfRC, AMDGPU::sub1));
  MachineOperand Hi2(getSubOperand64(I.getOperand(2), HalfRC, AMDGPU::sub1));

  Register DstLo = MRI->createVirtualRegister(&HalfRC);
  Register DstHi = MRI->createVirtualRegister(&HalfRC);

  if (IsSALU) {
    // Scalar chain: low add, then add-with-carry for the high half; the
    // final SCC def is dead.
    BuildMI(*BB, &I, DL, TII.get(AMDGPU::S_ADD_U32), DstLo)
        .add(Lo1)
        .add(Lo2);
    BuildMI(*BB, &I, DL, TII.get(AMDGPU::S_ADDC_U32), DstHi)
        .add(Hi1)
        .add(Hi2)
        .setOperandDead(3); // Dead scc
  } else {
    // Vector chain: carry flows through a wave-mask vreg; the high op's own
    // carry-out is dead.
    const TargetRegisterClass *CarryRC = TRI.getWaveMaskRegClass();
    Register CarryReg = MRI->createVirtualRegister(CarryRC);
    BuildMI(*BB, &I, DL, TII.get(AMDGPU::V_ADD_CO_U32_e64), DstLo)
        .addDef(CarryReg)
        .add(Lo1)
        .add(Lo2)
        .addImm(0);
    MachineInstr *Addc = BuildMI(*BB, &I, DL, TII.get(AMDGPU::V_ADDC_U32_e64), DstHi)
        .addDef(MRI->createVirtualRegister(CarryRC), RegState::Dead)
        .add(Hi1)
        .add(Hi2)
        .addReg(CarryReg, RegState::Kill)
        .addImm(0);

    constrainSelectedInstRegOperands(*Addc, TII, TRI, RBI);
  }

  // Reassemble the 64-bit result from the two halves.
  BuildMI(*BB, &I, DL, TII.get(AMDGPU::REG_SEQUENCE), DstReg)
      .addReg(DstLo)
      .addImm(AMDGPU::sub0)
      .addReg(DstHi)
      .addImm(AMDGPU::sub1);


  if (!RBI.constrainGenericRegister(DstReg, RC, *MRI))
    return false;

  I.eraseFromParent();
  return true;
}
530
// Select G_UADDO/G_USUBO/G_UADDE/G_USUBE. If the carry-out register is on
// the VCC bank, the instruction is mutated in place to the VALU carry forms;
// otherwise the scalar forms are used with carries routed through SCC via
// explicit COPYs.
bool AMDGPUInstructionSelector::selectG_UADDO_USUBO_UADDE_USUBE(
    MachineInstr &I) const {
  MachineBasicBlock *BB = I.getParent();
  MachineFunction *MF = BB->getParent();
  const DebugLoc &DL = I.getDebugLoc();
  Register Dst0Reg = I.getOperand(0).getReg(); // arithmetic result
  Register Dst1Reg = I.getOperand(1).getReg(); // carry-out
  const bool IsAdd = I.getOpcode() == AMDGPU::G_UADDO ||
                     I.getOpcode() == AMDGPU::G_UADDE;
  const bool HasCarryIn = I.getOpcode() == AMDGPU::G_UADDE ||
                          I.getOpcode() == AMDGPU::G_USUBE;

  if (isVCC(Dst1Reg, *MRI)) {
    // VALU path: mutate to the carry ops, appending an implicit EXEC use
    // and the trailing immediate operand.
    unsigned NoCarryOpc =
        IsAdd ? AMDGPU::V_ADD_CO_U32_e64 : AMDGPU::V_SUB_CO_U32_e64;
    unsigned CarryOpc = IsAdd ? AMDGPU::V_ADDC_U32_e64 : AMDGPU::V_SUBB_U32_e64;
    I.setDesc(TII.get(HasCarryIn ? CarryOpc : NoCarryOpc));
    I.addOperand(*MF, MachineOperand::CreateReg(AMDGPU::EXEC, false, true));
    I.addOperand(*MF, MachineOperand::CreateImm(0));
    constrainSelectedInstRegOperands(I, TII, TRI, RBI);
    return true;
  }

  Register Src0Reg = I.getOperand(2).getReg();
  Register Src1Reg = I.getOperand(3).getReg();

  if (HasCarryIn) {
    // Place the incoming carry into SCC ahead of the scalar op.
    BuildMI(*BB, &I, DL, TII.get(AMDGPU::COPY), AMDGPU::SCC)
        .addReg(I.getOperand(4).getReg());
  }

  unsigned NoCarryOpc = IsAdd ? AMDGPU::S_ADD_U32 : AMDGPU::S_SUB_U32;
  unsigned CarryOpc = IsAdd ? AMDGPU::S_ADDC_U32 : AMDGPU::S_SUBB_U32;

  auto CarryInst = BuildMI(*BB, &I, DL, TII.get(HasCarryIn ? CarryOpc : NoCarryOpc), Dst0Reg)
      .add(I.getOperand(2))
      .add(I.getOperand(3));

  if (MRI->use_nodbg_empty(Dst1Reg)) {
    // Nobody reads the carry-out; mark SCC dead.
    CarryInst.setOperandDead(3); // Dead scc
  } else {
    // Carry-out is used: copy SCC into it, defaulting its class to SReg_32.
    BuildMI(*BB, &I, DL, TII.get(AMDGPU::COPY), Dst1Reg)
        .addReg(AMDGPU::SCC);
    if (!MRI->getRegClassOrNull(Dst1Reg))
      MRI->setRegClass(Dst1Reg, &AMDGPU::SReg_32RegClass);
  }

  if (!RBI.constrainGenericRegister(Dst0Reg, AMDGPU::SReg_32RegClass, *MRI) ||
      !RBI.constrainGenericRegister(Src0Reg, AMDGPU::SReg_32RegClass, *MRI) ||
      !RBI.constrainGenericRegister(Src1Reg, AMDGPU::SReg_32RegClass, *MRI))
    return false;

  if (HasCarryIn &&
      !RBI.constrainGenericRegister(I.getOperand(4).getReg(),
                                    AMDGPU::SReg_32RegClass, *MRI))
    return false;

  I.eraseFromParent();
  return true;
}
591
// Select G_AMDGPU_MAD_U64_U32 / G_AMDGPU_MAD_I64_I32 onto the matching
// V_MAD_*64_*32 variant. When the subtarget has the no-carry forms and the
// carry-out def is unused, the carry operand is dropped and the NC opcode
// used.
bool AMDGPUInstructionSelector::selectG_AMDGPU_MAD_64_32(
    MachineInstr &I) const {
  MachineBasicBlock *BB = I.getParent();
  MachineFunction *MF = BB->getParent();
  const bool IsUnsigned = I.getOpcode() == AMDGPU::G_AMDGPU_MAD_U64_U32;
  bool UseNoCarry = Subtarget->hasMadU64U32NoCarry() &&
                    MRI->use_nodbg_empty(I.getOperand(1).getReg());

  unsigned Opc;
  if (Subtarget->hasMADIntraFwdBug())
    // NOTE(review): presumably hasMADIntraFwdBug() and
    // hasMadU64U32NoCarry() never both hold — otherwise UseNoCarry would
    // strip the carry def from a gfx11 opcode that still expects it; confirm.
    Opc = IsUnsigned ? AMDGPU::V_MAD_U64_U32_gfx11_e64
                     : AMDGPU::V_MAD_I64_I32_gfx11_e64;
  else if (UseNoCarry)
    Opc = IsUnsigned ? AMDGPU::V_MAD_NC_U64_U32_e64
                     : AMDGPU::V_MAD_NC_I64_I32_e64;
  else
    Opc = IsUnsigned ? AMDGPU::V_MAD_U64_U32_e64 : AMDGPU::V_MAD_I64_I32_e64;

  if (UseNoCarry)
    I.removeOperand(1); // Drop the unused carry-out def.

  I.setDesc(TII.get(Opc));
  I.addOperand(*MF, MachineOperand::CreateImm(0));
  I.addImplicitDefUseOperands(*MF);
  I.getOperand(0).setIsEarlyClobber(true);
  constrainSelectedInstRegOperands(I, TII, TRI, RBI);
  return true;
}
620
621// TODO: We should probably legalize these to only using 32-bit results.
// TODO: We should probably legalize these to only using 32-bit results.
// Select G_EXTRACT as a subregister COPY. Only 32-bit-aligned offsets with
// results up to 128 bits are handled; anything else fails selection.
bool AMDGPUInstructionSelector::selectG_EXTRACT(MachineInstr &I) const {
  MachineBasicBlock *BB = I.getParent();
  Register DstReg = I.getOperand(0).getReg();
  Register SrcReg = I.getOperand(1).getReg();
  LLT DstTy = MRI->getType(DstReg);
  LLT SrcTy = MRI->getType(SrcReg);
  const unsigned SrcSize = SrcTy.getSizeInBits();
  unsigned DstSize = DstTy.getSizeInBits();

  // TODO: Should handle any multiple of 32 offset.
  unsigned Offset = I.getOperand(2).getImm();
  if (Offset % 32 != 0 || DstSize > 128)
    return false;

  // 16-bit operations really use 32-bit registers.
  // FIXME: Probably should not allow 16-bit G_EXTRACT results.
  if (DstSize == 16)
    DstSize = 32;

  const TargetRegisterClass *DstRC =
      TRI.getConstrainedRegClassForOperand(I.getOperand(0), *MRI);
  if (!DstRC || !RBI.constrainGenericRegister(DstReg, *DstRC, *MRI))
    return false;

  const RegisterBank *SrcBank = RBI.getRegBank(SrcReg, *MRI, TRI);
  const TargetRegisterClass *SrcRC =
      TRI.getRegClassForSizeOnBank(SrcSize, *SrcBank);
  if (!SrcRC)
    return false;
  // Map (offset, width) in units of 32-bit channels to a subregister index.
  unsigned SubReg = SIRegisterInfo::getSubRegFromChannel(Offset / 32,
                                                         DstSize / 32);
  // The source class must actually support this subregister index.
  SrcRC = TRI.getSubClassWithSubReg(SrcRC, SubReg);
  if (!SrcRC)
    return false;

  SrcReg = constrainOperandRegClass(*MF, TRI, *MRI, TII, RBI, I,
                                    *SrcRC, I.getOperand(1));
  const DebugLoc &DL = I.getDebugLoc();
  BuildMI(*BB, &I, DL, TII.get(TargetOpcode::COPY), DstReg)
      .addReg(SrcReg, {}, SubReg);

  I.eraseFromParent();
  return true;
}
666
// Select G_MERGE_VALUES (also reached for wide-element G_BUILD_VECTOR) as a
// single REG_SEQUENCE over the sources. Sub-32-bit sources fall back to the
// TableGen-generated patterns.
bool AMDGPUInstructionSelector::selectG_MERGE_VALUES(MachineInstr &MI) const {
  MachineBasicBlock *BB = MI.getParent();
  Register DstReg = MI.getOperand(0).getReg();
  LLT DstTy = MRI->getType(DstReg);
  LLT SrcTy = MRI->getType(MI.getOperand(1).getReg());

  const unsigned SrcSize = SrcTy.getSizeInBits();
  if (SrcSize < 32)
    return selectImpl(MI, *CoverageInfo);

  const DebugLoc &DL = MI.getDebugLoc();
  const RegisterBank *DstBank = RBI.getRegBank(DstReg, *MRI, TRI);
  const unsigned DstSize = DstTy.getSizeInBits();
  const TargetRegisterClass *DstRC =
      TRI.getRegClassForSizeOnBank(DstSize, *DstBank);
  if (!DstRC)
    return false;

  // One subregister index per SrcSize-sized chunk of the destination.
  ArrayRef<int16_t> SubRegs = TRI.getRegSplitParts(DstRC, SrcSize / 8);
  MachineInstrBuilder MIB =
      BuildMI(*BB, &MI, DL, TII.get(TargetOpcode::REG_SEQUENCE), DstReg);
  for (int I = 0, E = MI.getNumOperands() - 1; I != E; ++I) {
    MachineOperand &Src = MI.getOperand(I + 1);
    MIB.addReg(Src.getReg(), getUndefRegState(Src.isUndef()));
    MIB.addImm(SubRegs[I]);

    const TargetRegisterClass *SrcRC
        = TRI.getConstrainedRegClassForOperand(Src, *MRI);
    if (SrcRC && !RBI.constrainGenericRegister(Src.getReg(), *SrcRC, *MRI))
      return false;
  }

  if (!RBI.constrainGenericRegister(DstReg, *DstRC, *MRI))
    return false;

  MI.eraseFromParent();
  return true;
}
705
// Select G_UNMERGE_VALUES as one subregister COPY per destination, reading
// successive DstSize-sized chunks of the wide source register.
bool AMDGPUInstructionSelector::selectG_UNMERGE_VALUES(MachineInstr &MI) const {
  MachineBasicBlock *BB = MI.getParent();
  const int NumDst = MI.getNumOperands() - 1;

  // The source is the last operand; all preceding operands are defs.
  MachineOperand &Src = MI.getOperand(NumDst);

  Register SrcReg = Src.getReg();
  Register DstReg0 = MI.getOperand(0).getReg();
  LLT DstTy = MRI->getType(DstReg0);
  LLT SrcTy = MRI->getType(SrcReg);

  const unsigned DstSize = DstTy.getSizeInBits();
  const unsigned SrcSize = SrcTy.getSizeInBits();
  const DebugLoc &DL = MI.getDebugLoc();
  const RegisterBank *SrcBank = RBI.getRegBank(SrcReg, *MRI, TRI);

  const TargetRegisterClass *SrcRC =
      TRI.getRegClassForSizeOnBank(SrcSize, *SrcBank);
  if (!SrcRC || !RBI.constrainGenericRegister(SrcReg, *SrcRC, *MRI))
    return false;

  // Note we could have mixed SGPR and VGPR destination banks for an SGPR
  // source, and this relies on the fact that the same subregister indices are
  // used for both.
  ArrayRef<int16_t> SubRegs = TRI.getRegSplitParts(SrcRC, DstSize / 8);
  for (int I = 0, E = NumDst; I != E; ++I) {
    MachineOperand &Dst = MI.getOperand(I);
    BuildMI(*BB, &MI, DL, TII.get(TargetOpcode::COPY), Dst.getReg())
        .addReg(SrcReg, {}, SubRegs[I]);

    // Make sure the subregister index is valid for the source register.
    SrcRC = TRI.getSubClassWithSubReg(SrcRC, SubRegs[I]);
    if (!SrcRC || !RBI.constrainGenericRegister(SrcReg, *SrcRC, *MRI))
      return false;

    const TargetRegisterClass *DstRC =
        TRI.getConstrainedRegClassForOperand(Dst, *MRI);
    if (DstRC && !RBI.constrainGenericRegister(Dst.getReg(), *DstRC, *MRI))
      return false;
  }

  MI.eraseFromParent();
  return true;
}
750
// Select G_BUILD_VECTOR / G_BUILD_VECTOR_TRUNC. Wide-element build_vectors
// become REG_SEQUENCEs (via selectG_MERGE_VALUES). The remaining v2s16
// cases try, in order: folding two constant sources into one move, the
// TableGen patterns, a copy when the high half is undef, and finally manual
// packing (V_AND + V_LSHL_OR on VALU, S_PACK_* on SALU).
bool AMDGPUInstructionSelector::selectG_BUILD_VECTOR(MachineInstr &MI) const {
  assert(MI.getOpcode() == AMDGPU::G_BUILD_VECTOR_TRUNC ||
         MI.getOpcode() == AMDGPU::G_BUILD_VECTOR);

  Register Src0 = MI.getOperand(1).getReg();
  Register Src1 = MI.getOperand(2).getReg();
  LLT SrcTy = MRI->getType(Src0);
  const unsigned SrcSize = SrcTy.getSizeInBits();

  // BUILD_VECTOR with >=32 bits source is handled by MERGE_VALUE.
  if (MI.getOpcode() == AMDGPU::G_BUILD_VECTOR && SrcSize >= 32) {
    return selectG_MERGE_VALUES(MI);
  }

  // Selection logic below is for V2S16 only.
  // For G_BUILD_VECTOR_TRUNC, additionally check that the operands are s32.
  Register Dst = MI.getOperand(0).getReg();
  if (MRI->getType(Dst) != LLT::fixed_vector(2, 16) ||
      (MI.getOpcode() == AMDGPU::G_BUILD_VECTOR_TRUNC &&
       SrcTy != LLT::scalar(32)))
    return selectImpl(MI, *CoverageInfo);

  const RegisterBank *DstBank = RBI.getRegBank(Dst, *MRI, TRI);
  if (DstBank->getID() == AMDGPU::AGPRRegBankID)
    return false;

  assert(DstBank->getID() == AMDGPU::SGPRRegBankID ||
         DstBank->getID() == AMDGPU::VGPRRegBankID);
  const bool IsVector = DstBank->getID() == AMDGPU::VGPRRegBankID;

  const DebugLoc &DL = MI.getDebugLoc();
  MachineBasicBlock *BB = MI.getParent();

  // First, before trying TableGen patterns, check if both sources are
  // constants. In those cases, we can trivially compute the final constant
  // and emit a simple move.
  auto ConstSrc1 = getAnyConstantVRegValWithLookThrough(Src1, *MRI, true, true);
  if (ConstSrc1) {
    auto ConstSrc0 =
        getAnyConstantVRegValWithLookThrough(Src0, *MRI, true, true);
    if (ConstSrc0) {
      // Pack the two 16-bit constants into one 32-bit immediate.
      const int64_t K0 = ConstSrc0->Value.getSExtValue();
      const int64_t K1 = ConstSrc1->Value.getSExtValue();
      uint32_t Lo16 = static_cast<uint32_t>(K0) & 0xffff;
      uint32_t Hi16 = static_cast<uint32_t>(K1) & 0xffff;
      uint32_t Imm = Lo16 | (Hi16 << 16);

      // VALU
      if (IsVector) {
        BuildMI(*BB, &MI, DL, TII.get(AMDGPU::V_MOV_B32_e32), Dst).addImm(Imm);
        MI.eraseFromParent();
        return RBI.constrainGenericRegister(Dst, AMDGPU::VGPR_32RegClass, *MRI);
      }

      // SALU
      BuildMI(*BB, &MI, DL, TII.get(AMDGPU::S_MOV_B32), Dst).addImm(Imm);
      MI.eraseFromParent();
      return RBI.constrainGenericRegister(Dst, AMDGPU::SReg_32RegClass, *MRI);
    }
  }

  // Now try TableGen patterns.
  if (selectImpl(MI, *CoverageInfo))
    return true;

  // TODO: This should probably be a combine somewhere
  // (build_vector $src0, undef)  -> copy $src0
  MachineInstr *Src1Def = getDefIgnoringCopies(Src1, *MRI);
  if (Src1Def->getOpcode() == AMDGPU::G_IMPLICIT_DEF) {
    MI.setDesc(TII.get(AMDGPU::COPY));
    MI.removeOperand(2);
    const auto &RC =
        IsVector ? AMDGPU::VGPR_32RegClass : AMDGPU::SReg_32RegClass;
    return RBI.constrainGenericRegister(Dst, RC, *MRI) &&
           RBI.constrainGenericRegister(Src0, RC, *MRI);
  }

  // TODO: Can be improved?
  if (IsVector) {
    // VALU packing: mask src0 to its low 16 bits, then OR in src1 shifted
    // into the high half.
    Register TmpReg = MRI->createVirtualRegister(&AMDGPU::VGPR_32RegClass);
    auto MIB = BuildMI(*BB, MI, DL, TII.get(AMDGPU::V_AND_B32_e32), TmpReg)
                   .addImm(0xFFFF)
                   .addReg(Src0);
    constrainSelectedInstRegOperands(*MIB, TII, TRI, RBI);

    MIB = BuildMI(*BB, MI, DL, TII.get(AMDGPU::V_LSHL_OR_B32_e64), Dst)
              .addReg(Src1)
              .addImm(16)
              .addReg(TmpReg);
    constrainSelectedInstRegOperands(*MIB, TII, TRI, RBI);

    MI.eraseFromParent();
    return true;
  }

  Register ShiftSrc0;
  Register ShiftSrc1;

  // With multiple uses of the shift, this will duplicate the shift and
  // increase register pressure.
  //
  // (build_vector (lshr_oneuse $src0, 16), (lshr_oneuse $src1, 16)
  //  => (S_PACK_HH_B32_B16 $src0, $src1)
  // (build_vector (lshr_oneuse SReg_32:$src0, 16), $src1)
  //  => (S_PACK_HL_B32_B16 $src0, $src1)
  // (build_vector $src0, (lshr_oneuse SReg_32:$src1, 16))
  //  => (S_PACK_LH_B32_B16 $src0, $src1)
  // (build_vector $src0, $src1)
  //  => (S_PACK_LL_B32_B16 $src0, $src1)

  bool Shift0 = mi_match(
      Src0, *MRI, m_OneUse(m_GLShr(m_Reg(ShiftSrc0), m_SpecificICst(16))));

  bool Shift1 = mi_match(
      Src1, *MRI, m_OneUse(m_GLShr(m_Reg(ShiftSrc1), m_SpecificICst(16))));

  unsigned Opc = AMDGPU::S_PACK_LL_B32_B16;
  if (Shift0 && Shift1) {
    Opc = AMDGPU::S_PACK_HH_B32_B16;
    MI.getOperand(1).setReg(ShiftSrc0);
    MI.getOperand(2).setReg(ShiftSrc1);
  } else if (Shift1) {
    Opc = AMDGPU::S_PACK_LH_B32_B16;
    MI.getOperand(2).setReg(ShiftSrc1);
  } else if (Shift0) {
    auto ConstSrc1 =
        getAnyConstantVRegValWithLookThrough(Src1, *MRI, true, true);
    if (ConstSrc1 && ConstSrc1->Value == 0) {
      // build_vector_trunc (lshr $src0, 16), 0 -> s_lshr_b32 $src0, 16
      auto MIB = BuildMI(*BB, &MI, DL, TII.get(AMDGPU::S_LSHR_B32), Dst)
                     .addReg(ShiftSrc0)
                     .addImm(16)
                     .setOperandDead(3); // Dead scc

      MI.eraseFromParent();
      constrainSelectedInstRegOperands(*MIB, TII, TRI, RBI);
      return true;
    }
    if (STI.hasSPackHL()) {
      Opc = AMDGPU::S_PACK_HL_B32_B16;
      MI.getOperand(1).setReg(ShiftSrc0);
    }
  }

  MI.setDesc(TII.get(Opc));
  constrainSelectedInstRegOperands(MI, TII, TRI, RBI);
  return true;
}
899
900bool AMDGPUInstructionSelector::selectG_IMPLICIT_DEF(MachineInstr &I) const {
901 const MachineOperand &MO = I.getOperand(0);
902
903 // FIXME: Interface for getConstrainedRegClassForOperand needs work. The
904 // regbank check here is to know why getConstrainedRegClassForOperand failed.
905 const TargetRegisterClass *RC = TRI.getConstrainedRegClassForOperand(MO, *MRI);
906 if ((!RC && !MRI->getRegBankOrNull(MO.getReg())) ||
907 (RC && RBI.constrainGenericRegister(MO.getReg(), *RC, *MRI))) {
908 I.setDesc(TII.get(TargetOpcode::IMPLICIT_DEF));
909 return true;
910 }
911
912 return false;
913}
914
// Select G_INSERT as INSERT_SUBREG. Only 32-bit-aligned offsets with insert
// sizes that are multiples of 32 (up to 128 bits) are handled.
bool AMDGPUInstructionSelector::selectG_INSERT(MachineInstr &I) const {
  MachineBasicBlock *BB = I.getParent();

  Register DstReg = I.getOperand(0).getReg();
  Register Src0Reg = I.getOperand(1).getReg();  // value being inserted into
  Register Src1Reg = I.getOperand(2).getReg();  // value being inserted
  LLT Src1Ty = MRI->getType(Src1Reg);

  unsigned DstSize = MRI->getType(DstReg).getSizeInBits();
  unsigned InsSize = Src1Ty.getSizeInBits();

  int64_t Offset = I.getOperand(3).getImm();

  // FIXME: These cases should have been illegal and unnecessary to check here.
  if (Offset % 32 != 0 || InsSize % 32 != 0)
    return false;

  // Currently not handled by getSubRegFromChannel.
  if (InsSize > 128)
    return false;

  // Map (offset, width) in 32-bit channels to the target subregister index.
  unsigned SubReg = TRI.getSubRegFromChannel(Offset / 32, InsSize / 32);
  if (SubReg == AMDGPU::NoSubRegister)
    return false;

  const RegisterBank *DstBank = RBI.getRegBank(DstReg, *MRI, TRI);
  const TargetRegisterClass *DstRC =
      TRI.getRegClassForSizeOnBank(DstSize, *DstBank);
  if (!DstRC)
    return false;

  const RegisterBank *Src0Bank = RBI.getRegBank(Src0Reg, *MRI, TRI);
  const RegisterBank *Src1Bank = RBI.getRegBank(Src1Reg, *MRI, TRI);
  const TargetRegisterClass *Src0RC =
      TRI.getRegClassForSizeOnBank(DstSize, *Src0Bank);
  const TargetRegisterClass *Src1RC =
      TRI.getRegClassForSizeOnBank(InsSize, *Src1Bank);

  // Deal with weird cases where the class only partially supports the subreg
  // index.
  Src0RC = TRI.getSubClassWithSubReg(Src0RC, SubReg);
  if (!Src0RC || !Src1RC)
    return false;

  if (!RBI.constrainGenericRegister(DstReg, *DstRC, *MRI) ||
      !RBI.constrainGenericRegister(Src0Reg, *Src0RC, *MRI) ||
      !RBI.constrainGenericRegister(Src1Reg, *Src1RC, *MRI))
    return false;

  const DebugLoc &DL = I.getDebugLoc();
  BuildMI(*BB, &I, DL, TII.get(TargetOpcode::INSERT_SUBREG), DstReg)
      .addReg(Src0Reg)
      .addReg(Src1Reg)
      .addImm(SubReg);

  I.eraseFromParent();
  return true;
}
973
974bool AMDGPUInstructionSelector::selectG_SBFX_UBFX(MachineInstr &MI) const {
975 Register DstReg = MI.getOperand(0).getReg();
976 Register SrcReg = MI.getOperand(1).getReg();
977 Register OffsetReg = MI.getOperand(2).getReg();
978 Register WidthReg = MI.getOperand(3).getReg();
979
980 assert(RBI.getRegBank(DstReg, *MRI, TRI)->getID() == AMDGPU::VGPRRegBankID &&
981 "scalar BFX instructions are expanded in regbankselect");
982 assert(MRI->getType(MI.getOperand(0).getReg()).getSizeInBits() == 32 &&
983 "64-bit vector BFX instructions are expanded in regbankselect");
984
985 const DebugLoc &DL = MI.getDebugLoc();
986 MachineBasicBlock *MBB = MI.getParent();
987
988 bool IsSigned = MI.getOpcode() == TargetOpcode::G_SBFX;
989 unsigned Opc = IsSigned ? AMDGPU::V_BFE_I32_e64 : AMDGPU::V_BFE_U32_e64;
990 auto MIB = BuildMI(*MBB, &MI, DL, TII.get(Opc), DstReg)
991 .addReg(SrcReg)
992 .addReg(OffsetReg)
993 .addReg(WidthReg);
994 MI.eraseFromParent();
995 constrainSelectedInstRegOperands(*MIB, TII, TRI, RBI);
996 return true;
997}
998
// Select llvm.amdgcn.interp.p1.f16 for subtargets with a 16-bank LDS, where
// the operation must be emitted as a V_INTERP_MOV + V_INTERP_P1LV pair.
// Other subtargets go through the generated pattern matcher.
bool AMDGPUInstructionSelector::selectInterpP1F16(MachineInstr &MI) const {
  if (STI.getLDSBankCount() != 16)
    return selectImpl(MI, *CoverageInfo);

  Register Dst = MI.getOperand(0).getReg();
  Register Src0 = MI.getOperand(2).getReg();
  Register M0Val = MI.getOperand(6).getReg();
  if (!RBI.constrainGenericRegister(M0Val, AMDGPU::SReg_32RegClass, *MRI) ||
      !RBI.constrainGenericRegister(Dst, AMDGPU::VGPR_32RegClass, *MRI) ||
      !RBI.constrainGenericRegister(Src0, AMDGPU::VGPR_32RegClass, *MRI))
    return false;

  // This requires 2 instructions. It is possible to write a pattern to support
  // this, but the generated isel emitter doesn't correctly deal with multiple
  // output instructions using the same physical register input. The copy to m0
  // is incorrectly placed before the second instruction.
  //
  // TODO: Match source modifiers.

  Register InterpMov = MRI->createVirtualRegister(&AMDGPU::VGPR_32RegClass);
  const DebugLoc &DL = MI.getDebugLoc();
  MachineBasicBlock *MBB = MI.getParent();

  // Both interp instructions implicitly read m0; the copy must precede them.
  BuildMI(*MBB, &MI, DL, TII.get(AMDGPU::COPY), AMDGPU::M0)
    .addReg(M0Val);
  BuildMI(*MBB, &MI, DL, TII.get(AMDGPU::V_INTERP_MOV_F32), InterpMov)
    .addImm(2)
    .addImm(MI.getOperand(4).getImm())  // $attr
    .addImm(MI.getOperand(3).getImm()); // $attrchan

  BuildMI(*MBB, &MI, DL, TII.get(AMDGPU::V_INTERP_P1LV_F16), Dst)
    .addImm(0)                          // $src0_modifiers
    .addReg(Src0)                       // $src0
    .addImm(MI.getOperand(4).getImm())  // $attr
    .addImm(MI.getOperand(3).getImm())  // $attrchan
    .addImm(0)                          // $src2_modifiers
    .addReg(InterpMov)                  // $src2 - 2 f16 values selected by high
    .addImm(MI.getOperand(5).getImm())  // $high
    .addImm(0)                          // $clamp
    .addImm(0);                         // $omod

  MI.eraseFromParent();
  return true;
}
1043
1044// Writelane is special in that it can use SGPR and M0 (which would normally
1045// count as using the constant bus twice - but in this case it is allowed since
1046// the lane selector doesn't count as a use of the constant bus). However, it is
1047// still required to abide by the 1 SGPR rule. Fix this up if we might have
1048// multiple SGPRs.
1049bool AMDGPUInstructionSelector::selectWritelane(MachineInstr &MI) const {
1050 // With a constant bus limit of at least 2, there's no issue.
1051 if (STI.getConstantBusLimit(AMDGPU::V_WRITELANE_B32) > 1)
1052 return selectImpl(MI, *CoverageInfo);
1053
1054 MachineBasicBlock *MBB = MI.getParent();
1055 const DebugLoc &DL = MI.getDebugLoc();
1056 Register VDst = MI.getOperand(0).getReg();
1057 Register Val = MI.getOperand(2).getReg();
1058 Register LaneSelect = MI.getOperand(3).getReg();
1059 Register VDstIn = MI.getOperand(4).getReg();
1060
1061 auto MIB = BuildMI(*MBB, &MI, DL, TII.get(AMDGPU::V_WRITELANE_B32), VDst);
1062
1063 std::optional<ValueAndVReg> ConstSelect =
1064 getIConstantVRegValWithLookThrough(LaneSelect, *MRI);
1065 if (ConstSelect) {
1066 // The selector has to be an inline immediate, so we can use whatever for
1067 // the other operands.
1068 MIB.addReg(Val);
1069 MIB.addImm(ConstSelect->Value.getSExtValue() &
1070 maskTrailingOnes<uint64_t>(STI.getWavefrontSizeLog2()));
1071 } else {
1072 std::optional<ValueAndVReg> ConstVal =
1074
1075 // If the value written is an inline immediate, we can get away without a
1076 // copy to m0.
1077 if (ConstVal && AMDGPU::isInlinableLiteral32(ConstVal->Value.getSExtValue(),
1078 STI.hasInv2PiInlineImm())) {
1079 MIB.addImm(ConstVal->Value.getSExtValue());
1080 MIB.addReg(LaneSelect);
1081 } else {
1082 MIB.addReg(Val);
1083
1084 // If the lane selector was originally in a VGPR and copied with
1085 // readfirstlane, there's a hazard to read the same SGPR from the
1086 // VALU. Constrain to a different SGPR to help avoid needing a nop later.
1087 RBI.constrainGenericRegister(LaneSelect, AMDGPU::SReg_32_XM0RegClass, *MRI);
1088
1089 BuildMI(*MBB, *MIB, DL, TII.get(AMDGPU::COPY), AMDGPU::M0)
1090 .addReg(LaneSelect);
1091 MIB.addReg(AMDGPU::M0);
1092 }
1093 }
1094
1095 MIB.addReg(VDstIn);
1096
1097 MI.eraseFromParent();
1098 constrainSelectedInstRegOperands(*MIB, TII, TRI, RBI);
1099 return true;
1100}
1101
1102// We need to handle this here because tablegen doesn't support matching
1103// instructions with multiple outputs.
1104bool AMDGPUInstructionSelector::selectDivScale(MachineInstr &MI) const {
1105 Register Dst0 = MI.getOperand(0).getReg();
1106 Register Dst1 = MI.getOperand(1).getReg();
1107
1108 LLT Ty = MRI->getType(Dst0);
1109 unsigned Opc;
1110 if (Ty == LLT::scalar(32))
1111 Opc = AMDGPU::V_DIV_SCALE_F32_e64;
1112 else if (Ty == LLT::scalar(64))
1113 Opc = AMDGPU::V_DIV_SCALE_F64_e64;
1114 else
1115 return false;
1116
1117 // TODO: Match source modifiers.
1118
1119 const DebugLoc &DL = MI.getDebugLoc();
1120 MachineBasicBlock *MBB = MI.getParent();
1121
1122 Register Numer = MI.getOperand(3).getReg();
1123 Register Denom = MI.getOperand(4).getReg();
1124 unsigned ChooseDenom = MI.getOperand(5).getImm();
1125
1126 Register Src0 = ChooseDenom != 0 ? Numer : Denom;
1127
1128 auto MIB = BuildMI(*MBB, &MI, DL, TII.get(Opc), Dst0)
1129 .addDef(Dst1)
1130 .addImm(0) // $src0_modifiers
1131 .addUse(Src0) // $src0
1132 .addImm(0) // $src1_modifiers
1133 .addUse(Denom) // $src1
1134 .addImm(0) // $src2_modifiers
1135 .addUse(Numer) // $src2
1136 .addImm(0) // $clamp
1137 .addImm(0); // $omod
1138
1139 MI.eraseFromParent();
1140 constrainSelectedInstRegOperands(*MIB, TII, TRI, RBI);
1141 return true;
1142}
1143
// Dispatch side-effect-free G_INTRINSIC instructions to their manual
// selection routines; anything not special-cased falls through to the
// TableGen-generated selector.
bool AMDGPUInstructionSelector::selectG_INTRINSIC(MachineInstr &I) const {
  Intrinsic::ID IntrinsicID = cast<GIntrinsic>(I).getIntrinsicID();
  switch (IntrinsicID) {
  case Intrinsic::amdgcn_if_break: {
    MachineBasicBlock *BB = I.getParent();

    // FIXME: Manually selecting to avoid dealing with the SReg_1 trick
    // SelectionDAG uses for wave32 vs wave64.
    BuildMI(*BB, &I, I.getDebugLoc(), TII.get(AMDGPU::SI_IF_BREAK))
      .add(I.getOperand(0))
      .add(I.getOperand(2))
      .add(I.getOperand(3));

    Register DstReg = I.getOperand(0).getReg();
    Register Src0Reg = I.getOperand(2).getReg();
    Register Src1Reg = I.getOperand(3).getReg();

    I.eraseFromParent();

    // All operands are lane masks; pin them to the wave-mask class.
    for (Register Reg : { DstReg, Src0Reg, Src1Reg })
      MRI->setRegClass(Reg, TRI.getWaveMaskRegClass());

    return true;
  }
  case Intrinsic::amdgcn_interp_p1_f16:
    return selectInterpP1F16(I);
  // WQM/WWM variants are all copy-like pseudos.
  case Intrinsic::amdgcn_wqm:
    return constrainCopyLikeIntrin(I, AMDGPU::WQM);
  case Intrinsic::amdgcn_softwqm:
    return constrainCopyLikeIntrin(I, AMDGPU::SOFT_WQM);
  case Intrinsic::amdgcn_strict_wwm:
  case Intrinsic::amdgcn_wwm:
    return constrainCopyLikeIntrin(I, AMDGPU::STRICT_WWM);
  case Intrinsic::amdgcn_strict_wqm:
    return constrainCopyLikeIntrin(I, AMDGPU::STRICT_WQM);
  case Intrinsic::amdgcn_writelane:
    return selectWritelane(I);
  case Intrinsic::amdgcn_div_scale:
    return selectDivScale(I);
  case Intrinsic::amdgcn_icmp:
  case Intrinsic::amdgcn_fcmp:
    // Try patterns first; fall back to the manual compare selection.
    if (selectImpl(I, *CoverageInfo))
      return true;
    return selectIntrinsicCmp(I);
  case Intrinsic::amdgcn_ballot:
    return selectBallot(I);
  case Intrinsic::amdgcn_reloc_constant:
    return selectRelocConstant(I);
  case Intrinsic::amdgcn_groupstaticsize:
    return selectGroupStaticSize(I);
  case Intrinsic::returnaddress:
    return selectReturnAddress(I);
  // All sparse-MFMA (smfmac) variants share one selection routine.
  case Intrinsic::amdgcn_smfmac_f32_16x16x32_f16:
  case Intrinsic::amdgcn_smfmac_f32_32x32x16_f16:
  case Intrinsic::amdgcn_smfmac_f32_16x16x32_bf16:
  case Intrinsic::amdgcn_smfmac_f32_32x32x16_bf16:
  case Intrinsic::amdgcn_smfmac_i32_16x16x64_i8:
  case Intrinsic::amdgcn_smfmac_i32_32x32x32_i8:
  case Intrinsic::amdgcn_smfmac_f32_16x16x64_bf8_bf8:
  case Intrinsic::amdgcn_smfmac_f32_16x16x64_bf8_fp8:
  case Intrinsic::amdgcn_smfmac_f32_16x16x64_fp8_bf8:
  case Intrinsic::amdgcn_smfmac_f32_16x16x64_fp8_fp8:
  case Intrinsic::amdgcn_smfmac_f32_32x32x32_bf8_bf8:
  case Intrinsic::amdgcn_smfmac_f32_32x32x32_bf8_fp8:
  case Intrinsic::amdgcn_smfmac_f32_32x32x32_fp8_bf8:
  case Intrinsic::amdgcn_smfmac_f32_32x32x32_fp8_fp8:
  case Intrinsic::amdgcn_smfmac_f32_16x16x64_f16:
  case Intrinsic::amdgcn_smfmac_f32_32x32x32_f16:
  case Intrinsic::amdgcn_smfmac_f32_16x16x64_bf16:
  case Intrinsic::amdgcn_smfmac_f32_32x32x32_bf16:
  case Intrinsic::amdgcn_smfmac_i32_16x16x128_i8:
  case Intrinsic::amdgcn_smfmac_i32_32x32x64_i8:
  case Intrinsic::amdgcn_smfmac_f32_16x16x128_bf8_bf8:
  case Intrinsic::amdgcn_smfmac_f32_16x16x128_bf8_fp8:
  case Intrinsic::amdgcn_smfmac_f32_16x16x128_fp8_bf8:
  case Intrinsic::amdgcn_smfmac_f32_16x16x128_fp8_fp8:
  case Intrinsic::amdgcn_smfmac_f32_32x32x64_bf8_bf8:
  case Intrinsic::amdgcn_smfmac_f32_32x32x64_bf8_fp8:
  case Intrinsic::amdgcn_smfmac_f32_32x32x64_fp8_bf8:
  case Intrinsic::amdgcn_smfmac_f32_32x32x64_fp8_fp8:
    return selectSMFMACIntrin(I);
  case Intrinsic::amdgcn_permlane16_swap:
  case Intrinsic::amdgcn_permlane32_swap:
    return selectPermlaneSwapIntrin(I, IntrinsicID);
  case Intrinsic::amdgcn_wave_shuffle:
    return selectWaveShuffleIntrin(I);
  default:
    return selectImpl(I, *CoverageInfo);
  }
}
1234
1236 const GCNSubtarget &ST) {
1237 if (Size != 16 && Size != 32 && Size != 64)
1238 return -1;
1239
1240 if (Size == 16 && !ST.has16BitInsts())
1241 return -1;
1242
1243 const auto Select = [&](unsigned S16Opc, unsigned TrueS16Opc,
1244 unsigned FakeS16Opc, unsigned S32Opc,
1245 unsigned S64Opc) {
1246 if (Size == 16)
1247 return ST.hasTrue16BitInsts()
1248 ? ST.useRealTrue16Insts() ? TrueS16Opc : FakeS16Opc
1249 : S16Opc;
1250 if (Size == 32)
1251 return S32Opc;
1252 return S64Opc;
1253 };
1254
1255 switch (P) {
1256 default:
1257 llvm_unreachable("Unknown condition code!");
1258 case CmpInst::ICMP_NE:
1259 return Select(AMDGPU::V_CMP_NE_U16_e64, AMDGPU::V_CMP_NE_U16_t16_e64,
1260 AMDGPU::V_CMP_NE_U16_fake16_e64, AMDGPU::V_CMP_NE_U32_e64,
1261 AMDGPU::V_CMP_NE_U64_e64);
1262 case CmpInst::ICMP_EQ:
1263 return Select(AMDGPU::V_CMP_EQ_U16_e64, AMDGPU::V_CMP_EQ_U16_t16_e64,
1264 AMDGPU::V_CMP_EQ_U16_fake16_e64, AMDGPU::V_CMP_EQ_U32_e64,
1265 AMDGPU::V_CMP_EQ_U64_e64);
1266 case CmpInst::ICMP_SGT:
1267 return Select(AMDGPU::V_CMP_GT_I16_e64, AMDGPU::V_CMP_GT_I16_t16_e64,
1268 AMDGPU::V_CMP_GT_I16_fake16_e64, AMDGPU::V_CMP_GT_I32_e64,
1269 AMDGPU::V_CMP_GT_I64_e64);
1270 case CmpInst::ICMP_SGE:
1271 return Select(AMDGPU::V_CMP_GE_I16_e64, AMDGPU::V_CMP_GE_I16_t16_e64,
1272 AMDGPU::V_CMP_GE_I16_fake16_e64, AMDGPU::V_CMP_GE_I32_e64,
1273 AMDGPU::V_CMP_GE_I64_e64);
1274 case CmpInst::ICMP_SLT:
1275 return Select(AMDGPU::V_CMP_LT_I16_e64, AMDGPU::V_CMP_LT_I16_t16_e64,
1276 AMDGPU::V_CMP_LT_I16_fake16_e64, AMDGPU::V_CMP_LT_I32_e64,
1277 AMDGPU::V_CMP_LT_I64_e64);
1278 case CmpInst::ICMP_SLE:
1279 return Select(AMDGPU::V_CMP_LE_I16_e64, AMDGPU::V_CMP_LE_I16_t16_e64,
1280 AMDGPU::V_CMP_LE_I16_fake16_e64, AMDGPU::V_CMP_LE_I32_e64,
1281 AMDGPU::V_CMP_LE_I64_e64);
1282 case CmpInst::ICMP_UGT:
1283 return Select(AMDGPU::V_CMP_GT_U16_e64, AMDGPU::V_CMP_GT_U16_t16_e64,
1284 AMDGPU::V_CMP_GT_U16_fake16_e64, AMDGPU::V_CMP_GT_U32_e64,
1285 AMDGPU::V_CMP_GT_U64_e64);
1286 case CmpInst::ICMP_UGE:
1287 return Select(AMDGPU::V_CMP_GE_U16_e64, AMDGPU::V_CMP_GE_U16_t16_e64,
1288 AMDGPU::V_CMP_GE_U16_fake16_e64, AMDGPU::V_CMP_GE_U32_e64,
1289 AMDGPU::V_CMP_GE_U64_e64);
1290 case CmpInst::ICMP_ULT:
1291 return Select(AMDGPU::V_CMP_LT_U16_e64, AMDGPU::V_CMP_LT_U16_t16_e64,
1292 AMDGPU::V_CMP_LT_U16_fake16_e64, AMDGPU::V_CMP_LT_U32_e64,
1293 AMDGPU::V_CMP_LT_U64_e64);
1294 case CmpInst::ICMP_ULE:
1295 return Select(AMDGPU::V_CMP_LE_U16_e64, AMDGPU::V_CMP_LE_U16_t16_e64,
1296 AMDGPU::V_CMP_LE_U16_fake16_e64, AMDGPU::V_CMP_LE_U32_e64,
1297 AMDGPU::V_CMP_LE_U64_e64);
1298
1299 case CmpInst::FCMP_OEQ:
1300 return Select(AMDGPU::V_CMP_EQ_F16_e64, AMDGPU::V_CMP_EQ_F16_t16_e64,
1301 AMDGPU::V_CMP_EQ_F16_fake16_e64, AMDGPU::V_CMP_EQ_F32_e64,
1302 AMDGPU::V_CMP_EQ_F64_e64);
1303 case CmpInst::FCMP_OGT:
1304 return Select(AMDGPU::V_CMP_GT_F16_e64, AMDGPU::V_CMP_GT_F16_t16_e64,
1305 AMDGPU::V_CMP_GT_F16_fake16_e64, AMDGPU::V_CMP_GT_F32_e64,
1306 AMDGPU::V_CMP_GT_F64_e64);
1307 case CmpInst::FCMP_OGE:
1308 return Select(AMDGPU::V_CMP_GE_F16_e64, AMDGPU::V_CMP_GE_F16_t16_e64,
1309 AMDGPU::V_CMP_GE_F16_fake16_e64, AMDGPU::V_CMP_GE_F32_e64,
1310 AMDGPU::V_CMP_GE_F64_e64);
1311 case CmpInst::FCMP_OLT:
1312 return Select(AMDGPU::V_CMP_LT_F16_e64, AMDGPU::V_CMP_LT_F16_t16_e64,
1313 AMDGPU::V_CMP_LT_F16_fake16_e64, AMDGPU::V_CMP_LT_F32_e64,
1314 AMDGPU::V_CMP_LT_F64_e64);
1315 case CmpInst::FCMP_OLE:
1316 return Select(AMDGPU::V_CMP_LE_F16_e64, AMDGPU::V_CMP_LE_F16_t16_e64,
1317 AMDGPU::V_CMP_LE_F16_fake16_e64, AMDGPU::V_CMP_LE_F32_e64,
1318 AMDGPU::V_CMP_LE_F64_e64);
1319 case CmpInst::FCMP_ONE:
1320 return Select(AMDGPU::V_CMP_NEQ_F16_e64, AMDGPU::V_CMP_NEQ_F16_t16_e64,
1321 AMDGPU::V_CMP_NEQ_F16_fake16_e64, AMDGPU::V_CMP_NEQ_F32_e64,
1322 AMDGPU::V_CMP_NEQ_F64_e64);
1323 case CmpInst::FCMP_ORD:
1324 return Select(AMDGPU::V_CMP_O_F16_e64, AMDGPU::V_CMP_O_F16_t16_e64,
1325 AMDGPU::V_CMP_O_F16_fake16_e64, AMDGPU::V_CMP_O_F32_e64,
1326 AMDGPU::V_CMP_O_F64_e64);
1327 case CmpInst::FCMP_UNO:
1328 return Select(AMDGPU::V_CMP_U_F16_e64, AMDGPU::V_CMP_U_F16_t16_e64,
1329 AMDGPU::V_CMP_U_F16_fake16_e64, AMDGPU::V_CMP_U_F32_e64,
1330 AMDGPU::V_CMP_U_F64_e64);
1331 case CmpInst::FCMP_UEQ:
1332 return Select(AMDGPU::V_CMP_NLG_F16_e64, AMDGPU::V_CMP_NLG_F16_t16_e64,
1333 AMDGPU::V_CMP_NLG_F16_fake16_e64, AMDGPU::V_CMP_NLG_F32_e64,
1334 AMDGPU::V_CMP_NLG_F64_e64);
1335 case CmpInst::FCMP_UGT:
1336 return Select(AMDGPU::V_CMP_NLE_F16_e64, AMDGPU::V_CMP_NLE_F16_t16_e64,
1337 AMDGPU::V_CMP_NLE_F16_fake16_e64, AMDGPU::V_CMP_NLE_F32_e64,
1338 AMDGPU::V_CMP_NLE_F64_e64);
1339 case CmpInst::FCMP_UGE:
1340 return Select(AMDGPU::V_CMP_NLT_F16_e64, AMDGPU::V_CMP_NLT_F16_t16_e64,
1341 AMDGPU::V_CMP_NLT_F16_fake16_e64, AMDGPU::V_CMP_NLT_F32_e64,
1342 AMDGPU::V_CMP_NLT_F64_e64);
1343 case CmpInst::FCMP_ULT:
1344 return Select(AMDGPU::V_CMP_NGE_F16_e64, AMDGPU::V_CMP_NGE_F16_t16_e64,
1345 AMDGPU::V_CMP_NGE_F16_fake16_e64, AMDGPU::V_CMP_NGE_F32_e64,
1346 AMDGPU::V_CMP_NGE_F64_e64);
1347 case CmpInst::FCMP_ULE:
1348 return Select(AMDGPU::V_CMP_NGT_F16_e64, AMDGPU::V_CMP_NGT_F16_t16_e64,
1349 AMDGPU::V_CMP_NGT_F16_fake16_e64, AMDGPU::V_CMP_NGT_F32_e64,
1350 AMDGPU::V_CMP_NGT_F64_e64);
1351 case CmpInst::FCMP_UNE:
1352 return Select(AMDGPU::V_CMP_NEQ_F16_e64, AMDGPU::V_CMP_NEQ_F16_t16_e64,
1353 AMDGPU::V_CMP_NEQ_F16_fake16_e64, AMDGPU::V_CMP_NEQ_F32_e64,
1354 AMDGPU::V_CMP_NEQ_F64_e64);
1355 case CmpInst::FCMP_TRUE:
1356 return Select(AMDGPU::V_CMP_TRU_F16_e64, AMDGPU::V_CMP_TRU_F16_t16_e64,
1357 AMDGPU::V_CMP_TRU_F16_fake16_e64, AMDGPU::V_CMP_TRU_F32_e64,
1358 AMDGPU::V_CMP_TRU_F64_e64);
1360 return Select(AMDGPU::V_CMP_F_F16_e64, AMDGPU::V_CMP_F_F16_t16_e64,
1361 AMDGPU::V_CMP_F_F16_fake16_e64, AMDGPU::V_CMP_F_F32_e64,
1362 AMDGPU::V_CMP_F_F64_e64);
1363 }
1364}
1365
// Map a generic compare predicate and operand size to the corresponding SALU
// S_CMP opcode, or -1 when no scalar compare exists for this combination:
// 64-bit only supports (in)equality, and 16-bit requires SALU float support.
int AMDGPUInstructionSelector::getS_CMPOpcode(CmpInst::Predicate P,
                                              unsigned Size) const {
  if (Size == 64) {
    if (!STI.hasScalarCompareEq64())
      return -1;

    // Only equality compares exist for scalar 64-bit.
    switch (P) {
    case CmpInst::ICMP_NE:
      return AMDGPU::S_CMP_LG_U64;
    case CmpInst::ICMP_EQ:
      return AMDGPU::S_CMP_EQ_U64;
    default:
      return -1;
    }
  }

  if (Size == 32) {
    switch (P) {
    case CmpInst::ICMP_NE:
      return AMDGPU::S_CMP_LG_U32;
    case CmpInst::ICMP_EQ:
      return AMDGPU::S_CMP_EQ_U32;
    case CmpInst::ICMP_SGT:
      return AMDGPU::S_CMP_GT_I32;
    case CmpInst::ICMP_SGE:
      return AMDGPU::S_CMP_GE_I32;
    case CmpInst::ICMP_SLT:
      return AMDGPU::S_CMP_LT_I32;
    case CmpInst::ICMP_SLE:
      return AMDGPU::S_CMP_LE_I32;
    case CmpInst::ICMP_UGT:
      return AMDGPU::S_CMP_GT_U32;
    case CmpInst::ICMP_UGE:
      return AMDGPU::S_CMP_GE_U32;
    case CmpInst::ICMP_ULT:
      return AMDGPU::S_CMP_LT_U32;
    case CmpInst::ICMP_ULE:
      return AMDGPU::S_CMP_LE_U32;
    case CmpInst::FCMP_OEQ:
      return AMDGPU::S_CMP_EQ_F32;
    case CmpInst::FCMP_OGT:
      return AMDGPU::S_CMP_GT_F32;
    case CmpInst::FCMP_OGE:
      return AMDGPU::S_CMP_GE_F32;
    case CmpInst::FCMP_OLT:
      return AMDGPU::S_CMP_LT_F32;
    case CmpInst::FCMP_OLE:
      return AMDGPU::S_CMP_LE_F32;
    case CmpInst::FCMP_ONE:
      return AMDGPU::S_CMP_LG_F32;
    case CmpInst::FCMP_ORD:
      return AMDGPU::S_CMP_O_F32;
    case CmpInst::FCMP_UNO:
      return AMDGPU::S_CMP_U_F32;
    // Unordered predicates map to the negated ("N*") compares.
    case CmpInst::FCMP_UEQ:
      return AMDGPU::S_CMP_NLG_F32;
    case CmpInst::FCMP_UGT:
      return AMDGPU::S_CMP_NLE_F32;
    case CmpInst::FCMP_UGE:
      return AMDGPU::S_CMP_NLT_F32;
    case CmpInst::FCMP_ULT:
      return AMDGPU::S_CMP_NGE_F32;
    case CmpInst::FCMP_ULE:
      return AMDGPU::S_CMP_NGT_F32;
    case CmpInst::FCMP_UNE:
      return AMDGPU::S_CMP_NEQ_F32;
    default:
      llvm_unreachable("Unknown condition code!");
    }
  }

  if (Size == 16) {
    // Only FP compares exist for scalar 16-bit, and only with SALU floats.
    if (!STI.hasSALUFloatInsts())
      return -1;

    switch (P) {
    case CmpInst::FCMP_OEQ:
      return AMDGPU::S_CMP_EQ_F16;
    case CmpInst::FCMP_OGT:
      return AMDGPU::S_CMP_GT_F16;
    case CmpInst::FCMP_OGE:
      return AMDGPU::S_CMP_GE_F16;
    case CmpInst::FCMP_OLT:
      return AMDGPU::S_CMP_LT_F16;
    case CmpInst::FCMP_OLE:
      return AMDGPU::S_CMP_LE_F16;
    case CmpInst::FCMP_ONE:
      return AMDGPU::S_CMP_LG_F16;
    case CmpInst::FCMP_ORD:
      return AMDGPU::S_CMP_O_F16;
    case CmpInst::FCMP_UNO:
      return AMDGPU::S_CMP_U_F16;
    case CmpInst::FCMP_UEQ:
      return AMDGPU::S_CMP_NLG_F16;
    case CmpInst::FCMP_UGT:
      return AMDGPU::S_CMP_NLE_F16;
    case CmpInst::FCMP_UGE:
      return AMDGPU::S_CMP_NLT_F16;
    case CmpInst::FCMP_ULT:
      return AMDGPU::S_CMP_NGE_F16;
    case CmpInst::FCMP_ULE:
      return AMDGPU::S_CMP_NGT_F16;
    case CmpInst::FCMP_UNE:
      return AMDGPU::S_CMP_NEQ_F16;
    default:
      llvm_unreachable("Unknown condition code!");
    }
  }

  return -1;
}
1477
// Select G_ICMP/G_FCMP. A non-VCC (scalar/SCC) result uses an S_CMP plus a
// copy from SCC; a VCC result uses the matching V_CMP. Scalar G_FCMP falls
// back to other selection paths.
bool AMDGPUInstructionSelector::selectG_ICMP_or_FCMP(MachineInstr &I) const {

  MachineBasicBlock *BB = I.getParent();
  const DebugLoc &DL = I.getDebugLoc();

  Register SrcReg = I.getOperand(2).getReg();
  unsigned Size = RBI.getSizeInBits(SrcReg, *MRI, TRI);

  auto Pred = (CmpInst::Predicate)I.getOperand(1).getPredicate();

  Register CCReg = I.getOperand(0).getReg();
  if (!isVCC(CCReg, *MRI)) {
    // Scalar result: compare writes SCC, then copy SCC into the dest.
    int Opcode = getS_CMPOpcode(Pred, Size);
    if (Opcode == -1)
      return false;
    MachineInstr *ICmp = BuildMI(*BB, &I, DL, TII.get(Opcode))
            .add(I.getOperand(2))
            .add(I.getOperand(3));
    BuildMI(*BB, &I, DL, TII.get(AMDGPU::COPY), CCReg)
      .addReg(AMDGPU::SCC);
    constrainSelectedInstRegOperands(*ICmp, TII, TRI, RBI);
    bool Ret =
        RBI.constrainGenericRegister(CCReg, AMDGPU::SReg_32RegClass, *MRI);
    I.eraseFromParent();
    return Ret;
  }

  // VCC-result FP compares are not handled here.
  if (I.getOpcode() == AMDGPU::G_FCMP)
    return false;

  int Opcode = getV_CMPOpcode(Pred, Size, *Subtarget);
  if (Opcode == -1)
    return false;

  MachineInstrBuilder ICmp;
  // t16 instructions carry source-modifier and op_sel operands.
  if (AMDGPU::hasNamedOperand(Opcode, AMDGPU::OpName::src0_modifiers)) {
    ICmp = BuildMI(*BB, &I, DL, TII.get(Opcode), I.getOperand(0).getReg())
               .addImm(0)
               .add(I.getOperand(2))
               .addImm(0)
               .add(I.getOperand(3))
               .addImm(0); // op_sel
  } else {
    ICmp = BuildMI(*BB, &I, DL, TII.get(Opcode), I.getOperand(0).getReg())
               .add(I.getOperand(2))
               .add(I.getOperand(3));
  }

  RBI.constrainGenericRegister(ICmp->getOperand(0).getReg(),
                               *TRI.getBoolRC(), *MRI);
  constrainSelectedInstRegOperands(*ICmp, TII, TRI, RBI);
  I.eraseFromParent();
  return true;
}
1533
// Select llvm.amdgcn.icmp/fcmp: a VALU compare whose result is a full
// wave-sized lane mask (not a VCC-bank boolean). Source modifiers are folded
// and sources forced into VGPRs so the V_CMP is legal.
bool AMDGPUInstructionSelector::selectIntrinsicCmp(MachineInstr &I) const {
  Register Dst = I.getOperand(0).getReg();
  if (isVCC(Dst, *MRI))
    return false;

  // The intrinsic returns the lane mask in a register of wave-size width.
  LLT DstTy = MRI->getType(Dst);
  if (DstTy.getSizeInBits() != STI.getWavefrontSize())
    return false;

  MachineBasicBlock *BB = I.getParent();
  const DebugLoc &DL = I.getDebugLoc();
  Register SrcReg = I.getOperand(2).getReg();
  unsigned Size = RBI.getSizeInBits(SrcReg, *MRI, TRI);

  // i1 inputs are not supported in GlobalISel.
  if (Size == 1)
    return false;

  auto Pred = static_cast<CmpInst::Predicate>(I.getOperand(4).getImm());
  if (!CmpInst::isIntPredicate(Pred) && !CmpInst::isFPPredicate(Pred)) {
    // Invalid predicate: the result is undefined, emit an IMPLICIT_DEF.
    BuildMI(*BB, &I, DL, TII.get(AMDGPU::IMPLICIT_DEF), Dst);
    I.eraseFromParent();
    return RBI.constrainGenericRegister(Dst, *TRI.getBoolRC(), *MRI);
  }

  const int Opcode = getV_CMPOpcode(Pred, Size, *Subtarget);
  if (Opcode == -1)
    return false;

  MachineInstrBuilder SelectedMI;
  MachineOperand &LHS = I.getOperand(2);
  MachineOperand &RHS = I.getOperand(3);
  // Fold source modifiers (abs/neg) into the compare where possible.
  auto [Src0, Src0Mods] = selectVOP3ModsImpl(LHS.getReg());
  auto [Src1, Src1Mods] = selectVOP3ModsImpl(RHS.getReg());
  Register Src0Reg =
      copyToVGPRIfSrcFolded(Src0, Src0Mods, LHS, &I, /*ForceVGPR*/ true);
  Register Src1Reg =
      copyToVGPRIfSrcFolded(Src1, Src1Mods, RHS, &I, /*ForceVGPR*/ true);
  SelectedMI = BuildMI(*BB, &I, DL, TII.get(Opcode), Dst);
  // Only add the optional operands the chosen encoding actually has.
  if (AMDGPU::hasNamedOperand(Opcode, AMDGPU::OpName::src0_modifiers))
    SelectedMI.addImm(Src0Mods);
  SelectedMI.addReg(Src0Reg);
  if (AMDGPU::hasNamedOperand(Opcode, AMDGPU::OpName::src1_modifiers))
    SelectedMI.addImm(Src1Mods);
  SelectedMI.addReg(Src1Reg);
  if (AMDGPU::hasNamedOperand(Opcode, AMDGPU::OpName::clamp))
    SelectedMI.addImm(0); // clamp
  if (AMDGPU::hasNamedOperand(Opcode, AMDGPU::OpName::op_sel))
    SelectedMI.addImm(0); // op_sel

  RBI.constrainGenericRegister(Dst, *TRI.getBoolRC(), *MRI);
  constrainSelectedInstRegOperands(*SelectedMI, TII, TRI, RBI);

  I.eraseFromParent();
  return true;
}
1590
1591// Ballot has to zero bits in input lane-mask that are zero in current exec,
1592// Done as AND with exec. For inputs that are results of instruction that
1593// implicitly use same exec, for example compares in same basic block or SCC to
1594// VCC copy, use copy.
1597 MachineInstr *MI = MRI.getVRegDef(Reg);
1598 if (MI->getParent() != MBB)
1599 return false;
1600
1601 // Lane mask generated by SCC to VCC copy.
1602 if (MI->getOpcode() == AMDGPU::COPY) {
1603 auto DstRB = MRI.getRegBankOrNull(MI->getOperand(0).getReg());
1604 auto SrcRB = MRI.getRegBankOrNull(MI->getOperand(1).getReg());
1605 if (DstRB && SrcRB && DstRB->getID() == AMDGPU::VCCRegBankID &&
1606 SrcRB->getID() == AMDGPU::SGPRRegBankID)
1607 return true;
1608 }
1609
1610 // Lane mask generated using compare with same exec.
1611 if (isa<GAnyCmp>(MI))
1612 return true;
1613
1614 Register LHS, RHS;
1615 // Look through AND.
1616 if (mi_match(Reg, MRI, m_GAnd(m_Reg(LHS), m_Reg(RHS))))
1617 return isLaneMaskFromSameBlock(LHS, MRI, MBB) ||
1619
1620 return false;
1621}
1622
1623bool AMDGPUInstructionSelector::selectBallot(MachineInstr &I) const {
1624 MachineBasicBlock *BB = I.getParent();
1625 const DebugLoc &DL = I.getDebugLoc();
1626 Register DstReg = I.getOperand(0).getReg();
1627 Register SrcReg = I.getOperand(2).getReg();
1628 const unsigned BallotSize = MRI->getType(DstReg).getSizeInBits();
1629 const unsigned WaveSize = STI.getWavefrontSize();
1630
1631 // In the common case, the return type matches the wave size.
1632 // However we also support emitting i64 ballots in wave32 mode.
1633 if (BallotSize != WaveSize && (BallotSize != 64 || WaveSize != 32))
1634 return false;
1635
1636 std::optional<ValueAndVReg> Arg =
1638
1639 Register Dst = DstReg;
1640 // i64 ballot on Wave32: new Dst(i32) for WaveSize ballot.
1641 if (BallotSize != WaveSize) {
1642 Dst = MRI->createVirtualRegister(TRI.getBoolRC());
1643 }
1644
1645 if (Arg) {
1646 const int64_t Value = Arg->Value.getZExtValue();
1647 if (Value == 0) {
1648 // Dst = S_MOV 0
1649 unsigned Opcode = WaveSize == 64 ? AMDGPU::S_MOV_B64 : AMDGPU::S_MOV_B32;
1650 BuildMI(*BB, &I, DL, TII.get(Opcode), Dst).addImm(0);
1651 } else {
1652 // Dst = COPY EXEC
1653 assert(Value == 1);
1654 BuildMI(*BB, &I, DL, TII.get(AMDGPU::COPY), Dst).addReg(TRI.getExec());
1655 }
1656 if (!RBI.constrainGenericRegister(Dst, *TRI.getBoolRC(), *MRI))
1657 return false;
1658 } else {
1659 if (isLaneMaskFromSameBlock(SrcReg, *MRI, BB)) {
1660 // Dst = COPY SrcReg
1661 BuildMI(*BB, &I, DL, TII.get(AMDGPU::COPY), Dst).addReg(SrcReg);
1662 if (!RBI.constrainGenericRegister(Dst, *TRI.getBoolRC(), *MRI))
1663 return false;
1664 } else {
1665 // Dst = S_AND SrcReg, EXEC
1666 unsigned AndOpc = WaveSize == 64 ? AMDGPU::S_AND_B64 : AMDGPU::S_AND_B32;
1667 auto And = BuildMI(*BB, &I, DL, TII.get(AndOpc), Dst)
1668 .addReg(SrcReg)
1669 .addReg(TRI.getExec())
1670 .setOperandDead(3); // Dead scc
1671 constrainSelectedInstRegOperands(*And, TII, TRI, RBI);
1672 }
1673 }
1674
1675 // i64 ballot on Wave32: zero-extend i32 ballot to i64.
1676 if (BallotSize != WaveSize) {
1677 Register HiReg = MRI->createVirtualRegister(&AMDGPU::SReg_32RegClass);
1678 BuildMI(*BB, &I, DL, TII.get(AMDGPU::S_MOV_B32), HiReg).addImm(0);
1679 BuildMI(*BB, &I, DL, TII.get(AMDGPU::REG_SEQUENCE), DstReg)
1680 .addReg(Dst)
1681 .addImm(AMDGPU::sub0)
1682 .addReg(HiReg)
1683 .addImm(AMDGPU::sub1);
1684 }
1685
1686 I.eraseFromParent();
1687 return true;
1688}
1689
1690bool AMDGPUInstructionSelector::selectRelocConstant(MachineInstr &I) const {
1691 Register DstReg = I.getOperand(0).getReg();
1692 const RegisterBank *DstBank = RBI.getRegBank(DstReg, *MRI, TRI);
1693 const TargetRegisterClass *DstRC = TRI.getRegClassForSizeOnBank(32, *DstBank);
1694 if (!DstRC || !RBI.constrainGenericRegister(DstReg, *DstRC, *MRI))
1695 return false;
1696
1697 const bool IsVALU = DstBank->getID() == AMDGPU::VGPRRegBankID;
1698
1699 Module *M = MF->getFunction().getParent();
1700 const MDNode *Metadata = I.getOperand(2).getMetadata();
1701 auto SymbolName = cast<MDString>(Metadata->getOperand(0))->getString();
1702 auto *RelocSymbol = cast<GlobalVariable>(
1703 M->getOrInsertGlobal(SymbolName, Type::getInt32Ty(M->getContext())));
1704
1705 MachineBasicBlock *BB = I.getParent();
1706 BuildMI(*BB, &I, I.getDebugLoc(),
1707 TII.get(IsVALU ? AMDGPU::V_MOV_B32_e32 : AMDGPU::S_MOV_B32), DstReg)
1709
1710 I.eraseFromParent();
1711 return true;
1712}
1713
1714bool AMDGPUInstructionSelector::selectGroupStaticSize(MachineInstr &I) const {
1715 Triple::OSType OS = MF->getTarget().getTargetTriple().getOS();
1716
1717 Register DstReg = I.getOperand(0).getReg();
1718 const RegisterBank *DstRB = RBI.getRegBank(DstReg, *MRI, TRI);
1719 unsigned Mov = DstRB->getID() == AMDGPU::SGPRRegBankID ?
1720 AMDGPU::S_MOV_B32 : AMDGPU::V_MOV_B32_e32;
1721
1722 MachineBasicBlock *MBB = I.getParent();
1723 const DebugLoc &DL = I.getDebugLoc();
1724
1725 auto MIB = BuildMI(*MBB, &I, DL, TII.get(Mov), DstReg);
1726
1727 if (OS == Triple::AMDHSA || OS == Triple::AMDPAL) {
1728 const SIMachineFunctionInfo *MFI = MF->getInfo<SIMachineFunctionInfo>();
1729 MIB.addImm(MFI->getLDSSize());
1730 } else {
1731 Module *M = MF->getFunction().getParent();
1732 const GlobalValue *GV =
1733 Intrinsic::getOrInsertDeclaration(M, Intrinsic::amdgcn_groupstaticsize);
1735 }
1736
1737 I.eraseFromParent();
1738 constrainSelectedInstRegOperands(*MIB, TII, TRI, RBI);
1739 return true;
1740}
1741
// Select llvm.returnaddress. Entry functions and non-zero depths have no
// return address and produce 0; otherwise the return-address SGPR pair is
// marked live-in and copied to the destination.
bool AMDGPUInstructionSelector::selectReturnAddress(MachineInstr &I) const {
  MachineBasicBlock *MBB = I.getParent();
  MachineFunction &MF = *MBB->getParent();
  const DebugLoc &DL = I.getDebugLoc();

  MachineOperand &Dst = I.getOperand(0);
  Register DstReg = Dst.getReg();
  unsigned Depth = I.getOperand(2).getImm();

  // The result must live in an SGPR_64-compatible class.
  const TargetRegisterClass *RC
    = TRI.getConstrainedRegClassForOperand(Dst, *MRI);
  if (!RC->hasSubClassEq(&AMDGPU::SGPR_64RegClass) ||
      !RBI.constrainGenericRegister(DstReg, *RC, *MRI))
    return false;

  // Check for kernel and shader functions
  if (Depth != 0 ||
      MF.getInfo<SIMachineFunctionInfo>()->isEntryFunction()) {
    BuildMI(*MBB, &I, DL, TII.get(AMDGPU::S_MOV_B64), DstReg)
      .addImm(0);
    I.eraseFromParent();
    return true;
  }

  MachineFrameInfo &MFI = MF.getFrameInfo();
  // There is a call to @llvm.returnaddress in this function
  MFI.setReturnAddressIsTaken(true);

  // Get the return address reg and mark it as an implicit live-in
  Register ReturnAddrReg = TRI.getReturnAddressReg(MF);
  Register LiveIn = getFunctionLiveInPhysReg(MF, TII, ReturnAddrReg,
                                             AMDGPU::SReg_64RegClass, DL);
  BuildMI(*MBB, &I, DL, TII.get(AMDGPU::COPY), DstReg)
    .addReg(LiveIn);
  I.eraseFromParent();
  return true;
}
1779
1780bool AMDGPUInstructionSelector::selectEndCfIntrinsic(MachineInstr &MI) const {
1781 // FIXME: Manually selecting to avoid dealing with the SReg_1 trick
1782 // SelectionDAG uses for wave32 vs wave64.
1783 MachineBasicBlock *BB = MI.getParent();
1784 BuildMI(*BB, &MI, MI.getDebugLoc(), TII.get(AMDGPU::SI_END_CF))
1785 .add(MI.getOperand(1));
1786
1787 Register Reg = MI.getOperand(1).getReg();
1788 MI.eraseFromParent();
1789
1790 if (!MRI->getRegClassOrNull(Reg))
1791 MRI->setRegClass(Reg, TRI.getWaveMaskRegClass());
1792 return true;
1793}
1794
// Manually select llvm.amdgcn.ds.ordered.{add,swap} to DS_ORDERED_COUNT,
// packing the instruction's immediate offset fields by hand.
bool AMDGPUInstructionSelector::selectDSOrderedIntrinsic(
    MachineInstr &MI, Intrinsic::ID IntrID) const {
  MachineBasicBlock *MBB = MI.getParent();
  MachineFunction *MF = MBB->getParent();
  const DebugLoc &DL = MI.getDebugLoc();

  // Immediate arguments of the intrinsic; the register arguments (m0 value,
  // data value) are read from operands 2 and 3 below.
  unsigned IndexOperand = MI.getOperand(7).getImm();
  bool WaveRelease = MI.getOperand(8).getImm() != 0;
  bool WaveDone = MI.getOperand(9).getImm() != 0;

  if (WaveDone && !WaveRelease) {
    // TODO: Move this to IR verifier
    const Function &Fn = MF->getFunction();
    Fn.getContext().diagnose(DiagnosticInfoUnsupported(
        Fn, "ds_ordered_count: wave_done requires wave_release", DL));
  }

  // Low 6 bits of the index operand select the ordered-count index; the
  // consumed bits are cleared so leftovers can be diagnosed below.
  unsigned OrderedCountIndex = IndexOperand & 0x3f;
  IndexOperand &= ~0x3f;
  unsigned CountDw = 0;

  if (STI.getGeneration() >= AMDGPUSubtarget::GFX10) {
    // GFX10+ additionally encodes a dword count in bits [27:24].
    CountDw = (IndexOperand >> 24) & 0xf;
    IndexOperand &= ~(0xf << 24);

    if (CountDw < 1 || CountDw > 4) {
      const Function &Fn = MF->getFunction();
      Fn.getContext().diagnose(DiagnosticInfoUnsupported(
          Fn, "ds_ordered_count: dword count must be between 1 and 4", DL));
      // Recover with a legal count so selection can continue after the
      // diagnostic.
      CountDw = 1;
    }
  }

  // Any bits still set are not understood by this encoding.
  if (IndexOperand) {
    const Function &Fn = MF->getFunction();
    Fn.getContext().diagnose(DiagnosticInfoUnsupported(
        Fn, "ds_ordered_count: bad index operand", DL));
  }

  unsigned Instruction = IntrID == Intrinsic::amdgcn_ds_ordered_add ? 0 : 1;
  unsigned ShaderType = SIInstrInfo::getDSShaderTypeValue(*MF);

  // Pack the DS offset field: offset0 carries the ordered-count index,
  // offset1 carries release/done/shader-type/instruction/count bits.
  unsigned Offset0 = OrderedCountIndex << 2;
  unsigned Offset1 = WaveRelease | (WaveDone << 1) | (Instruction << 4);

  if (STI.getGeneration() >= AMDGPUSubtarget::GFX10)
    Offset1 |= (CountDw - 1) << 6;

  // Pre-GFX11 encodes the shader type in the offset as well.
  if (STI.getGeneration() < AMDGPUSubtarget::GFX11)
    Offset1 |= ShaderType << 2;

  unsigned Offset = Offset0 | (Offset1 << 8);

  // The m0 input (operand 2) must be live in M0 for DS_ORDERED_COUNT.
  Register M0Val = MI.getOperand(2).getReg();
  BuildMI(*MBB, &MI, DL, TII.get(AMDGPU::COPY), AMDGPU::M0)
    .addReg(M0Val);

  Register DstReg = MI.getOperand(0).getReg();
  Register ValReg = MI.getOperand(3).getReg();
  MachineInstrBuilder DS =
    BuildMI(*MBB, &MI, DL, TII.get(AMDGPU::DS_ORDERED_COUNT), DstReg)
      .addReg(ValReg)
      .addImm(Offset)
      .cloneMemRefs(MI);

  if (!RBI.constrainGenericRegister(M0Val, AMDGPU::SReg_32RegClass, *MRI))
    return false;

  constrainSelectedInstRegOperands(*DS, TII, TRI, RBI);
  MI.eraseFromParent();
  return true;
}
1867
// Map a GWS (global wave sync) intrinsic ID to the corresponding DS machine
// opcode. Must only be called with one of the listed intrinsics.
static unsigned gwsIntrinToOpcode(unsigned IntrID) {
  switch (IntrID) {
  case Intrinsic::amdgcn_ds_gws_init:
    return AMDGPU::DS_GWS_INIT;
  case Intrinsic::amdgcn_ds_gws_barrier:
    return AMDGPU::DS_GWS_BARRIER;
  case Intrinsic::amdgcn_ds_gws_sema_v:
    return AMDGPU::DS_GWS_SEMA_V;
  case Intrinsic::amdgcn_ds_gws_sema_br:
    return AMDGPU::DS_GWS_SEMA_BR;
  case Intrinsic::amdgcn_ds_gws_sema_p:
    return AMDGPU::DS_GWS_SEMA_P;
  case Intrinsic::amdgcn_ds_gws_sema_release_all:
    return AMDGPU::DS_GWS_SEMA_RELEASE_ALL;
  default:
    llvm_unreachable("not a gws intrinsic");
  }
}
1886
// Select the ds_gws_* family of intrinsics. The variable part of the offset
// is placed in M0[21:16]; the constant part goes in the instruction's
// immediate offset field.
bool AMDGPUInstructionSelector::selectDSGWSIntrinsic(MachineInstr &MI,
                                                     Intrinsic::ID IID) const {
  // Reject GWS ops the subtarget can't do; sema_release_all additionally
  // requires its own feature.
  if (!STI.hasGWS() || (IID == Intrinsic::amdgcn_ds_gws_sema_release_all &&
                        !STI.hasGWSSemaReleaseAll()))
    return false;

  // intrinsic ID, vsrc, offset
  const bool HasVSrc = MI.getNumOperands() == 3;
  assert(HasVSrc || MI.getNumOperands() == 2);

  // The offset ends up in M0, so it must be uniform (SGPR bank).
  Register BaseOffset = MI.getOperand(HasVSrc ? 2 : 1).getReg();
  const RegisterBank *OffsetRB = RBI.getRegBank(BaseOffset, *MRI, TRI);
  if (OffsetRB->getID() != AMDGPU::SGPRRegBankID)
    return false;

  MachineInstr *OffsetDef = getDefIgnoringCopies(BaseOffset, *MRI);
  unsigned ImmOffset;

  MachineBasicBlock *MBB = MI.getParent();
  const DebugLoc &DL = MI.getDebugLoc();

  MachineInstr *Readfirstlane = nullptr;

  // If we legalized the VGPR input, strip out the readfirstlane to analyze the
  // incoming offset, in case there's an add of a constant. We'll have to put it
  // back later.
  if (OffsetDef->getOpcode() == AMDGPU::V_READFIRSTLANE_B32) {
    Readfirstlane = OffsetDef;
    BaseOffset = OffsetDef->getOperand(1).getReg();
    OffsetDef = getDefIgnoringCopies(BaseOffset, *MRI);
  }

  if (OffsetDef->getOpcode() == AMDGPU::G_CONSTANT) {
    // If we have a constant offset, try to use the 0 in m0 as the base.
    // TODO: Look into changing the default m0 initialization value. If the
    // default -1 only set the low 16-bits, we could leave it as-is and add 1 to
    // the immediate offset.

    ImmOffset = OffsetDef->getOperand(1).getCImm()->getZExtValue();
    BuildMI(*MBB, &MI, DL, TII.get(AMDGPU::S_MOV_B32), AMDGPU::M0)
      .addImm(0);
  } else {
    // Split into a variable base plus whatever constant part can be folded
    // into the immediate offset field.
    std::tie(BaseOffset, ImmOffset) =
        AMDGPU::getBaseWithConstantOffset(*MRI, BaseOffset, VT);

    if (Readfirstlane) {
      // We have the constant offset now, so put the readfirstlane back on the
      // variable component.
      if (!RBI.constrainGenericRegister(BaseOffset, AMDGPU::VGPR_32RegClass, *MRI))
        return false;

      Readfirstlane->getOperand(1).setReg(BaseOffset);
      BaseOffset = Readfirstlane->getOperand(0).getReg();
    } else {
      if (!RBI.constrainGenericRegister(BaseOffset,
                                        AMDGPU::SReg_32RegClass, *MRI))
        return false;
    }

    // The hardware reads the resource id from M0[21:16], so shift the base
    // into position before copying it to M0.
    Register M0Base = MRI->createVirtualRegister(&AMDGPU::SReg_32RegClass);
    BuildMI(*MBB, &MI, DL, TII.get(AMDGPU::S_LSHL_B32), M0Base)
        .addReg(BaseOffset)
        .addImm(16)
        .setOperandDead(3); // Dead scc

    BuildMI(*MBB, &MI, DL, TII.get(AMDGPU::COPY), AMDGPU::M0)
      .addReg(M0Base);
  }

  // The resource id offset is computed as (<isa opaque base> + M0[21:16] +
  // offset field) % 64. Some versions of the programming guide omit the m0
  // part, or claim it's from offset 0.

  unsigned Opc = gwsIntrinToOpcode(IID);
  const MCInstrDesc &InstrDesc = TII.get(Opc);

  if (HasVSrc) {
    Register VSrc = MI.getOperand(1).getReg();

    int Data0Idx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::data0);
    const TargetRegisterClass *DataRC = TII.getRegClass(InstrDesc, Data0Idx);
    const TargetRegisterClass *SubRC =
        TRI.getSubRegisterClass(DataRC, AMDGPU::sub0);

    if (!SubRC) {
      // 32-bit normal case.
      if (!RBI.constrainGenericRegister(VSrc, *DataRC, *MRI))
        return false;

      BuildMI(*MBB, &MI, DL, InstrDesc)
        .addReg(VSrc)
        .addImm(ImmOffset)
        .cloneMemRefs(MI);
    } else {
      // Requires even register alignment, so create 64-bit value and pad the
      // top half with undef.
      Register DataReg = MRI->createVirtualRegister(DataRC);
      if (!RBI.constrainGenericRegister(VSrc, *SubRC, *MRI))
        return false;

      Register UndefReg = MRI->createVirtualRegister(SubRC);
      BuildMI(*MBB, &MI, DL, TII.get(AMDGPU::IMPLICIT_DEF), UndefReg);
      BuildMI(*MBB, &MI, DL, TII.get(AMDGPU::REG_SEQUENCE), DataReg)
        .addReg(VSrc)
        .addImm(AMDGPU::sub0)
        .addReg(UndefReg)
        .addImm(AMDGPU::sub1);

      BuildMI(*MBB, &MI, DL, InstrDesc)
        .addReg(DataReg)
        .addImm(ImmOffset)
        .cloneMemRefs(MI);
    }
  } else {
    // No data operand (e.g. the semaphore ops without a value).
    BuildMI(*MBB, &MI, DL, InstrDesc)
      .addImm(ImmOffset)
      .cloneMemRefs(MI);
  }

  MI.eraseFromParent();
  return true;
}
2009
// Select llvm.amdgcn.ds.append/consume. The pointer base is copied into M0
// and the instruction itself only carries an immediate offset.
bool AMDGPUInstructionSelector::selectDSAppendConsume(MachineInstr &MI,
                                                      bool IsAppend) const {
  Register PtrBase = MI.getOperand(2).getReg();
  LLT PtrTy = MRI->getType(PtrBase);
  // A region-address-space pointer selects the GDS form of the instruction.
  bool IsGDS = PtrTy.getAddressSpace() == AMDGPUAS::REGION_ADDRESS;

  // Try to split the pointer operand into base + constant offset.
  unsigned Offset;
  std::tie(PtrBase, Offset) = selectDS1Addr1OffsetImpl(MI.getOperand(2));

  // TODO: Should this try to look through readfirstlane like GWS?
  if (!isDSOffsetLegal(PtrBase, Offset)) {
    // Fall back to putting the whole pointer in M0 with a zero offset.
    PtrBase = MI.getOperand(2).getReg();
    Offset = 0;
  }

  MachineBasicBlock *MBB = MI.getParent();
  const DebugLoc &DL = MI.getDebugLoc();
  const unsigned Opc = IsAppend ? AMDGPU::DS_APPEND : AMDGPU::DS_CONSUME;

  BuildMI(*MBB, &MI, DL, TII.get(AMDGPU::COPY), AMDGPU::M0)
    .addReg(PtrBase);
  if (!RBI.constrainGenericRegister(PtrBase, AMDGPU::SReg_32RegClass, *MRI))
    return false;

  auto MIB = BuildMI(*MBB, &MI, DL, TII.get(Opc), MI.getOperand(0).getReg())
                 .addImm(Offset)
                 .addImm(IsGDS ? -1 : 0) // gds flag
                 .cloneMemRefs(MI);
  MI.eraseFromParent();
  constrainSelectedInstRegOperands(*MIB, TII, TRI, RBI);
  return true;
}
2042
2043bool AMDGPUInstructionSelector::selectInitWholeWave(MachineInstr &MI) const {
2044 MachineFunction *MF = MI.getMF();
2045 SIMachineFunctionInfo *MFInfo = MF->getInfo<SIMachineFunctionInfo>();
2046
2047 MFInfo->setInitWholeWave();
2048 return selectImpl(MI, *CoverageInfo);
2049}
2050
// Decode a texture-fail-control immediate. Sets IsTexFail when any control
// bit is requested (never clears a caller-set value), extracts the TFE and
// LWE bits, and returns true iff no unknown bits remain.
static bool parseTexFail(uint64_t TexFailCtrl, bool &TFE, bool &LWE,
                         bool &IsTexFail) {
  if (TexFailCtrl != 0)
    IsTexFail = true;

  TFE = (TexFailCtrl & 0x1) != 0;
  LWE = (TexFailCtrl & 0x2) != 0;

  // Only bits 0 (TFE) and 1 (LWE) are defined; reject anything else.
  return (TexFailCtrl & ~uint64_t(0x3)) == 0;
}
2063
2064bool AMDGPUInstructionSelector::selectImageIntrinsic(
2065 MachineInstr &MI, const AMDGPU::ImageDimIntrinsicInfo *Intr) const {
2066 MachineBasicBlock *MBB = MI.getParent();
2067 const DebugLoc &DL = MI.getDebugLoc();
2068 unsigned IntrOpcode = Intr->BaseOpcode;
2069
2070 // For image atomic: use no-return opcode if result is unused.
2071 if (Intr->AtomicNoRetBaseOpcode != Intr->BaseOpcode) {
2072 Register ResultDef = MI.getOperand(0).getReg();
2073 if (MRI->use_nodbg_empty(ResultDef))
2074 IntrOpcode = Intr->AtomicNoRetBaseOpcode;
2075 }
2076
2077 const AMDGPU::MIMGBaseOpcodeInfo *BaseOpcode =
2079
2080 const AMDGPU::MIMGDimInfo *DimInfo = AMDGPU::getMIMGDimInfo(Intr->Dim);
2081 const bool IsGFX10Plus = AMDGPU::isGFX10Plus(STI);
2082 const bool IsGFX11Plus = AMDGPU::isGFX11Plus(STI);
2083 const bool IsGFX12Plus = AMDGPU::isGFX12Plus(STI);
2084
2085 const unsigned ArgOffset = MI.getNumExplicitDefs() + 1;
2086
2087 Register VDataIn = AMDGPU::NoRegister;
2088 Register VDataOut = AMDGPU::NoRegister;
2089 LLT VDataTy;
2090 int NumVDataDwords = -1;
2091 bool IsD16 = MI.getOpcode() == AMDGPU::G_AMDGPU_INTRIN_IMAGE_LOAD_D16 ||
2092 MI.getOpcode() == AMDGPU::G_AMDGPU_INTRIN_IMAGE_STORE_D16;
2093
2094 bool Unorm;
2095 if (!BaseOpcode->Sampler)
2096 Unorm = true;
2097 else
2098 Unorm = MI.getOperand(ArgOffset + Intr->UnormIndex).getImm() != 0;
2099
2100 bool TFE;
2101 bool LWE;
2102 bool IsTexFail = false;
2103 if (!parseTexFail(MI.getOperand(ArgOffset + Intr->TexFailCtrlIndex).getImm(),
2104 TFE, LWE, IsTexFail))
2105 return false;
2106
2107 const int Flags = MI.getOperand(ArgOffset + Intr->NumArgs).getImm();
2108 const bool IsA16 = (Flags & 1) != 0;
2109 const bool IsG16 = (Flags & 2) != 0;
2110
2111 // A16 implies 16 bit gradients if subtarget doesn't support G16
2112 if (IsA16 && !STI.hasG16() && !IsG16)
2113 return false;
2114
2115 unsigned DMask = 0;
2116 unsigned DMaskLanes = 0;
2117
2118 if (BaseOpcode->Atomic) {
2119 if (!BaseOpcode->NoReturn)
2120 VDataOut = MI.getOperand(0).getReg();
2121 VDataIn = MI.getOperand(2).getReg();
2122 LLT Ty = MRI->getType(VDataIn);
2123
2124 // Be careful to allow atomic swap on 16-bit element vectors.
2125 const bool Is64Bit = BaseOpcode->AtomicX2 ?
2126 Ty.getSizeInBits() == 128 :
2127 Ty.getSizeInBits() == 64;
2128
2129 if (BaseOpcode->AtomicX2) {
2130 assert(MI.getOperand(3).getReg() == AMDGPU::NoRegister);
2131
2132 DMask = Is64Bit ? 0xf : 0x3;
2133 NumVDataDwords = Is64Bit ? 4 : 2;
2134 } else {
2135 DMask = Is64Bit ? 0x3 : 0x1;
2136 NumVDataDwords = Is64Bit ? 2 : 1;
2137 }
2138 } else {
2139 DMask = MI.getOperand(ArgOffset + Intr->DMaskIndex).getImm();
2140 DMaskLanes = BaseOpcode->Gather4 ? 4 : llvm::popcount(DMask);
2141
2142 if (BaseOpcode->Store) {
2143 VDataIn = MI.getOperand(1).getReg();
2144 VDataTy = MRI->getType(VDataIn);
2145 NumVDataDwords = (VDataTy.getSizeInBits() + 31) / 32;
2146 } else if (BaseOpcode->NoReturn) {
2147 NumVDataDwords = 0;
2148 } else {
2149 VDataOut = MI.getOperand(0).getReg();
2150 VDataTy = MRI->getType(VDataOut);
2151 NumVDataDwords = DMaskLanes;
2152
2153 if (IsD16 && !STI.hasUnpackedD16VMem())
2154 NumVDataDwords = (DMaskLanes + 1) / 2;
2155 }
2156 }
2157
2158 // Set G16 opcode
2159 if (Subtarget->hasG16() && IsG16) {
2160 const AMDGPU::MIMGG16MappingInfo *G16MappingInfo =
2162 assert(G16MappingInfo);
2163 IntrOpcode = G16MappingInfo->G16; // set opcode to variant with _g16
2164 }
2165
2166 // TODO: Check this in verifier.
2167 assert((!IsTexFail || DMaskLanes >= 1) && "should have legalized this");
2168
2169 unsigned CPol = MI.getOperand(ArgOffset + Intr->CachePolicyIndex).getImm();
2170 // Keep GLC only when the atomic's result is actually used.
2171 if (BaseOpcode->Atomic && !BaseOpcode->NoReturn)
2173 if (CPol & ~((IsGFX12Plus ? AMDGPU::CPol::ALL : AMDGPU::CPol::ALL_pregfx12) |
2175 return false;
2176
2177 int NumVAddrRegs = 0;
2178 int NumVAddrDwords = 0;
2179 for (unsigned I = Intr->VAddrStart; I < Intr->VAddrEnd; I++) {
2180 // Skip the $noregs and 0s inserted during legalization.
2181 MachineOperand &AddrOp = MI.getOperand(ArgOffset + I);
2182 if (!AddrOp.isReg())
2183 continue; // XXX - Break?
2184
2185 Register Addr = AddrOp.getReg();
2186 if (!Addr)
2187 break;
2188
2189 ++NumVAddrRegs;
2190 NumVAddrDwords += (MRI->getType(Addr).getSizeInBits() + 31) / 32;
2191 }
2192
2193 // The legalizer preprocessed the intrinsic arguments. If we aren't using
2194 // NSA, these should have been packed into a single value in the first
2195 // address register
2196 const bool UseNSA =
2197 NumVAddrRegs != 1 &&
2198 (STI.hasPartialNSAEncoding() ? NumVAddrDwords >= NumVAddrRegs
2199 : NumVAddrDwords == NumVAddrRegs);
2200 if (UseNSA && !STI.hasFeature(AMDGPU::FeatureNSAEncoding)) {
2201 LLVM_DEBUG(dbgs() << "Trying to use NSA on non-NSA target\n");
2202 return false;
2203 }
2204
2205 if (IsTexFail)
2206 ++NumVDataDwords;
2207
2208 int Opcode = -1;
2209 if (IsGFX12Plus) {
2210 Opcode = AMDGPU::getMIMGOpcode(IntrOpcode, AMDGPU::MIMGEncGfx12,
2211 NumVDataDwords, NumVAddrDwords);
2212 } else if (IsGFX11Plus) {
2213 Opcode = AMDGPU::getMIMGOpcode(IntrOpcode,
2214 UseNSA ? AMDGPU::MIMGEncGfx11NSA
2215 : AMDGPU::MIMGEncGfx11Default,
2216 NumVDataDwords, NumVAddrDwords);
2217 } else if (IsGFX10Plus) {
2218 Opcode = AMDGPU::getMIMGOpcode(IntrOpcode,
2219 UseNSA ? AMDGPU::MIMGEncGfx10NSA
2220 : AMDGPU::MIMGEncGfx10Default,
2221 NumVDataDwords, NumVAddrDwords);
2222 } else {
2223 if (Subtarget->hasGFX90AInsts()) {
2224 Opcode = AMDGPU::getMIMGOpcode(IntrOpcode, AMDGPU::MIMGEncGfx90a,
2225 NumVDataDwords, NumVAddrDwords);
2226 if (Opcode == -1) {
2227 LLVM_DEBUG(
2228 dbgs()
2229 << "requested image instruction is not supported on this GPU\n");
2230 return false;
2231 }
2232 }
2233 if (Opcode == -1 &&
2234 STI.getGeneration() >= AMDGPUSubtarget::VOLCANIC_ISLANDS)
2235 Opcode = AMDGPU::getMIMGOpcode(IntrOpcode, AMDGPU::MIMGEncGfx8,
2236 NumVDataDwords, NumVAddrDwords);
2237 if (Opcode == -1)
2238 Opcode = AMDGPU::getMIMGOpcode(IntrOpcode, AMDGPU::MIMGEncGfx6,
2239 NumVDataDwords, NumVAddrDwords);
2240 }
2241 if (Opcode == -1)
2242 return false;
2243
2244 auto MIB = BuildMI(*MBB, &MI, DL, TII.get(Opcode))
2245 .cloneMemRefs(MI);
2246
2247 if (VDataOut) {
2248 if (BaseOpcode->AtomicX2) {
2249 const bool Is64 = MRI->getType(VDataOut).getSizeInBits() == 64;
2250
2251 Register TmpReg = MRI->createVirtualRegister(
2252 Is64 ? &AMDGPU::VReg_128RegClass : &AMDGPU::VReg_64RegClass);
2253 unsigned SubReg = Is64 ? AMDGPU::sub0_sub1 : AMDGPU::sub0;
2254
2255 MIB.addDef(TmpReg);
2256 if (!MRI->use_empty(VDataOut)) {
2257 BuildMI(*MBB, &MI, DL, TII.get(AMDGPU::COPY), VDataOut)
2258 .addReg(TmpReg, RegState::Kill, SubReg);
2259 }
2260
2261 } else {
2262 MIB.addDef(VDataOut); // vdata output
2263 }
2264 }
2265
2266 if (VDataIn)
2267 MIB.addReg(VDataIn); // vdata input
2268
2269 for (int I = 0; I != NumVAddrRegs; ++I) {
2270 MachineOperand &SrcOp = MI.getOperand(ArgOffset + Intr->VAddrStart + I);
2271 if (SrcOp.isReg()) {
2272 assert(SrcOp.getReg() != 0);
2273 MIB.addReg(SrcOp.getReg());
2274 }
2275 }
2276
2277 MIB.addReg(MI.getOperand(ArgOffset + Intr->RsrcIndex).getReg());
2278 if (BaseOpcode->Sampler)
2279 MIB.addReg(MI.getOperand(ArgOffset + Intr->SampIndex).getReg());
2280
2281 MIB.addImm(DMask); // dmask
2282
2283 if (IsGFX10Plus)
2284 MIB.addImm(DimInfo->Encoding);
2285 if (AMDGPU::hasNamedOperand(Opcode, AMDGPU::OpName::unorm))
2286 MIB.addImm(Unorm);
2287
2288 MIB.addImm(CPol);
2289 MIB.addImm(IsA16 && // a16 or r128
2290 STI.hasFeature(AMDGPU::FeatureR128A16) ? -1 : 0);
2291 if (IsGFX10Plus)
2292 MIB.addImm(IsA16 ? -1 : 0);
2293
2294 if (!Subtarget->hasGFX90AInsts()) {
2295 MIB.addImm(TFE); // tfe
2296 } else if (TFE) {
2297 LLVM_DEBUG(dbgs() << "TFE is not supported on this GPU\n");
2298 return false;
2299 }
2300
2301 if (AMDGPU::hasNamedOperand(Opcode, AMDGPU::OpName::lwe))
2302 MIB.addImm(LWE); // lwe
2303 if (!IsGFX10Plus)
2304 MIB.addImm(DimInfo->DA ? -1 : 0);
2305 if (BaseOpcode->HasD16)
2306 MIB.addImm(IsD16 ? -1 : 0);
2307
2308 MI.eraseFromParent();
2309 constrainSelectedInstRegOperands(*MIB, TII, TRI, RBI);
2310 TII.enforceOperandRCAlignment(*MIB, AMDGPU::OpName::vaddr);
2311 return true;
2312}
2313
2314// We need to handle this here because tablegen doesn't support matching
2315// instructions with multiple outputs.
2316bool AMDGPUInstructionSelector::selectDSBvhStackIntrinsic(
2317 MachineInstr &MI) const {
2318 Register Dst0 = MI.getOperand(0).getReg();
2319 Register Dst1 = MI.getOperand(1).getReg();
2320
2321 const DebugLoc &DL = MI.getDebugLoc();
2322 MachineBasicBlock *MBB = MI.getParent();
2323
2324 Register Addr = MI.getOperand(3).getReg();
2325 Register Data0 = MI.getOperand(4).getReg();
2326 Register Data1 = MI.getOperand(5).getReg();
2327 unsigned Offset = MI.getOperand(6).getImm();
2328
2329 unsigned Opc;
2330 switch (cast<GIntrinsic>(MI).getIntrinsicID()) {
2331 case Intrinsic::amdgcn_ds_bvh_stack_rtn:
2332 case Intrinsic::amdgcn_ds_bvh_stack_push4_pop1_rtn:
2333 Opc = AMDGPU::DS_BVH_STACK_RTN_B32;
2334 break;
2335 case Intrinsic::amdgcn_ds_bvh_stack_push8_pop1_rtn:
2336 Opc = AMDGPU::DS_BVH_STACK_PUSH8_POP1_RTN_B32;
2337 break;
2338 case Intrinsic::amdgcn_ds_bvh_stack_push8_pop2_rtn:
2339 Opc = AMDGPU::DS_BVH_STACK_PUSH8_POP2_RTN_B64;
2340 break;
2341 }
2342
2343 auto MIB = BuildMI(*MBB, &MI, DL, TII.get(Opc), Dst0)
2344 .addDef(Dst1)
2345 .addUse(Addr)
2346 .addUse(Data0)
2347 .addUse(Data1)
2348 .addImm(Offset)
2349 .cloneMemRefs(MI);
2350
2351 MI.eraseFromParent();
2352 constrainSelectedInstRegOperands(*MIB, TII, TRI, RBI);
2353 return true;
2354}
2355
// Dispatch side-effecting intrinsics that need manual selection; anything
// not handled here falls through to the TableGen'erated matcher at the end.
bool AMDGPUInstructionSelector::selectG_INTRINSIC_W_SIDE_EFFECTS(
    MachineInstr &I) const {
  Intrinsic::ID IntrinsicID = cast<GIntrinsic>(I).getIntrinsicID();
  switch (IntrinsicID) {
  case Intrinsic::amdgcn_end_cf:
    return selectEndCfIntrinsic(I);
  case Intrinsic::amdgcn_ds_ordered_add:
  case Intrinsic::amdgcn_ds_ordered_swap:
    return selectDSOrderedIntrinsic(I, IntrinsicID);
  case Intrinsic::amdgcn_ds_gws_init:
  case Intrinsic::amdgcn_ds_gws_barrier:
  case Intrinsic::amdgcn_ds_gws_sema_v:
  case Intrinsic::amdgcn_ds_gws_sema_br:
  case Intrinsic::amdgcn_ds_gws_sema_p:
  case Intrinsic::amdgcn_ds_gws_sema_release_all:
    return selectDSGWSIntrinsic(I, IntrinsicID);
  case Intrinsic::amdgcn_ds_append:
    return selectDSAppendConsume(I, true);
  case Intrinsic::amdgcn_ds_consume:
    return selectDSAppendConsume(I, false);
  case Intrinsic::amdgcn_init_whole_wave:
    return selectInitWholeWave(I);
  case Intrinsic::amdgcn_raw_buffer_load_lds:
  case Intrinsic::amdgcn_raw_buffer_load_async_lds:
  case Intrinsic::amdgcn_raw_ptr_buffer_load_lds:
  case Intrinsic::amdgcn_raw_ptr_buffer_load_async_lds:
  case Intrinsic::amdgcn_struct_buffer_load_lds:
  case Intrinsic::amdgcn_struct_buffer_load_async_lds:
  case Intrinsic::amdgcn_struct_ptr_buffer_load_lds:
  case Intrinsic::amdgcn_struct_ptr_buffer_load_async_lds:
    return selectBufferLoadLds(I);
  // Until we can store both the address space of the global and the LDS
  // arguments by having two MachineMemOperands on an intrinsic, we just trust
  // that the argument is a global pointer (buffer pointers have been handled by
  // a LLVM IR-level lowering).
  case Intrinsic::amdgcn_load_to_lds:
  case Intrinsic::amdgcn_load_async_to_lds:
  case Intrinsic::amdgcn_global_load_lds:
  case Intrinsic::amdgcn_global_load_async_lds:
    return selectGlobalLoadLds(I);
  case Intrinsic::amdgcn_tensor_load_to_lds:
  case Intrinsic::amdgcn_tensor_store_from_lds:
    return selectTensorLoadStore(I, IntrinsicID);
  case Intrinsic::amdgcn_asyncmark:
  case Intrinsic::amdgcn_wait_asyncmark:
    // FIXME: Not supported on GFX12 yet. Will need a new feature when we do.
    if (!Subtarget->hasVMemToLDSLoad())
      return false;
    break;
  case Intrinsic::amdgcn_exp_compr:
    // Diagnose use of compressed exports on subtargets without them.
    if (!STI.hasCompressedExport()) {
      Function &F = I.getMF()->getFunction();
      F.getContext().diagnose(
          DiagnosticInfoUnsupported(F, "intrinsic not supported on subtarget",
                                    I.getDebugLoc(), DS_Error));
      return false;
    }
    break;
  case Intrinsic::amdgcn_ds_bvh_stack_rtn:
  case Intrinsic::amdgcn_ds_bvh_stack_push4_pop1_rtn:
  case Intrinsic::amdgcn_ds_bvh_stack_push8_pop1_rtn:
  case Intrinsic::amdgcn_ds_bvh_stack_push8_pop2_rtn:
    return selectDSBvhStackIntrinsic(I);
  case Intrinsic::amdgcn_s_alloc_vgpr: {
    // S_ALLOC_VGPR doesn't have a destination register, it just implicitly sets
    // SCC. We then need to COPY it into the result vreg.
    MachineBasicBlock *MBB = I.getParent();
    const DebugLoc &DL = I.getDebugLoc();

    Register ResReg = I.getOperand(0).getReg();

    MachineInstr *AllocMI = BuildMI(*MBB, &I, DL, TII.get(AMDGPU::S_ALLOC_VGPR))
                                .add(I.getOperand(2));
    (void)BuildMI(*MBB, &I, DL, TII.get(AMDGPU::COPY), ResReg)
        .addReg(AMDGPU::SCC);
    I.eraseFromParent();
    constrainSelectedInstRegOperands(*AllocMI, TII, TRI, RBI);
    return RBI.constrainGenericRegister(ResReg, AMDGPU::SReg_32RegClass, *MRI);
  }
  case Intrinsic::amdgcn_s_barrier_init:
  case Intrinsic::amdgcn_s_barrier_signal_var:
    return selectNamedBarrierInit(I, IntrinsicID);
  case Intrinsic::amdgcn_s_wakeup_barrier: {
    // Diagnose use of s_wakeup_barrier on subtargets without it.
    if (!STI.hasSWakeupBarrier()) {
      Function &F = I.getMF()->getFunction();
      F.getContext().diagnose(
          DiagnosticInfoUnsupported(F, "intrinsic not supported on subtarget",
                                    I.getDebugLoc(), DS_Error));
      return false;
    }
    return selectNamedBarrierInst(I, IntrinsicID);
  }
  case Intrinsic::amdgcn_s_barrier_join:
  case Intrinsic::amdgcn_s_get_named_barrier_state:
    return selectNamedBarrierInst(I, IntrinsicID);
  case Intrinsic::amdgcn_s_get_barrier_state:
    return selectSGetBarrierState(I, IntrinsicID);
  case Intrinsic::amdgcn_s_barrier_signal_isfirst:
    return selectSBarrierSignalIsfirst(I, IntrinsicID);
  }
  return selectImpl(I, *CoverageInfo);
}
2458
2459bool AMDGPUInstructionSelector::selectG_SELECT(MachineInstr &I) const {
2460 if (selectImpl(I, *CoverageInfo))
2461 return true;
2462
2463 MachineBasicBlock *BB = I.getParent();
2464 const DebugLoc &DL = I.getDebugLoc();
2465
2466 Register DstReg = I.getOperand(0).getReg();
2467 unsigned Size = RBI.getSizeInBits(DstReg, *MRI, TRI);
2468 assert(Size <= 32 || Size == 64);
2469 const MachineOperand &CCOp = I.getOperand(1);
2470 Register CCReg = CCOp.getReg();
2471 if (!isVCC(CCReg, *MRI)) {
2472 unsigned SelectOpcode = Size == 64 ? AMDGPU::S_CSELECT_B64 :
2473 AMDGPU::S_CSELECT_B32;
2474 MachineInstr *CopySCC = BuildMI(*BB, &I, DL, TII.get(AMDGPU::COPY), AMDGPU::SCC)
2475 .addReg(CCReg);
2476
2477 // The generic constrainSelectedInstRegOperands doesn't work for the scc register
2478 // bank, because it does not cover the register class that we used to represent
2479 // for it. So we need to manually set the register class here.
2480 if (!MRI->getRegClassOrNull(CCReg))
2481 MRI->setRegClass(CCReg, TRI.getConstrainedRegClassForOperand(CCOp, *MRI));
2482 MachineInstr *Select = BuildMI(*BB, &I, DL, TII.get(SelectOpcode), DstReg)
2483 .add(I.getOperand(2))
2484 .add(I.getOperand(3));
2485
2487 constrainSelectedInstRegOperands(*CopySCC, TII, TRI, RBI);
2488 I.eraseFromParent();
2489 return true;
2490 }
2491
2492 // Wide VGPR select should have been split in RegBankSelect.
2493 if (Size > 32)
2494 return false;
2495
2496 MachineInstr *Select =
2497 BuildMI(*BB, &I, DL, TII.get(AMDGPU::V_CNDMASK_B32_e64), DstReg)
2498 .addImm(0)
2499 .add(I.getOperand(3))
2500 .addImm(0)
2501 .add(I.getOperand(2))
2502 .add(I.getOperand(1));
2503
2505 I.eraseFromParent();
2506 return true;
2507}
2508
// Select G_TRUNC. Handles the s1 artifact case, true16 32->16 copies, the
// v2s32 -> v2s16 pack, and the general subregister-copy truncation.
bool AMDGPUInstructionSelector::selectG_TRUNC(MachineInstr &I) const {
  Register DstReg = I.getOperand(0).getReg();
  Register SrcReg = I.getOperand(1).getReg();
  const LLT DstTy = MRI->getType(DstReg);
  const LLT SrcTy = MRI->getType(SrcReg);
  const LLT S1 = LLT::scalar(1);

  const RegisterBank *SrcRB = RBI.getRegBank(SrcReg, *MRI, TRI);
  const RegisterBank *DstRB;
  if (DstTy == S1) {
    // This is a special case. We don't treat s1 for legalization artifacts as
    // vcc booleans.
    DstRB = SrcRB;
  } else {
    DstRB = RBI.getRegBank(DstReg, *MRI, TRI);
    // Cross-bank truncation is not handled here.
    if (SrcRB != DstRB)
      return false;
  }

  const bool IsVALU = DstRB->getID() == AMDGPU::VGPRRegBankID;

  unsigned DstSize = DstTy.getSizeInBits();
  unsigned SrcSize = SrcTy.getSizeInBits();

  const TargetRegisterClass *SrcRC =
      TRI.getRegClassForSizeOnBank(SrcSize, *SrcRB);
  const TargetRegisterClass *DstRC =
      TRI.getRegClassForSizeOnBank(DstSize, *DstRB);
  if (!SrcRC || !DstRC)
    return false;

  if (!RBI.constrainGenericRegister(SrcReg, *SrcRC, *MRI) ||
      !RBI.constrainGenericRegister(DstReg, *DstRC, *MRI)) {
    LLVM_DEBUG(dbgs() << "Failed to constrain G_TRUNC\n");
    return false;
  }

  // True16: a 32 -> 16 bit truncation is a copy of the low half.
  if (DstRC == &AMDGPU::VGPR_16RegClass && SrcSize == 32) {
    assert(STI.useRealTrue16Insts());
    const DebugLoc &DL = I.getDebugLoc();
    MachineBasicBlock *MBB = I.getParent();
    BuildMI(*MBB, I, DL, TII.get(AMDGPU::COPY), DstReg)
        .addReg(SrcReg, {}, AMDGPU::lo16);
    I.eraseFromParent();
    return true;
  }

  // v2s32 -> v2s16: pack the low 16 bits of both elements into one register.
  if (DstTy == LLT::fixed_vector(2, 16) && SrcTy == LLT::fixed_vector(2, 32)) {
    MachineBasicBlock *MBB = I.getParent();
    const DebugLoc &DL = I.getDebugLoc();

    Register LoReg = MRI->createVirtualRegister(DstRC);
    Register HiReg = MRI->createVirtualRegister(DstRC);
    BuildMI(*MBB, I, DL, TII.get(AMDGPU::COPY), LoReg)
      .addReg(SrcReg, {}, AMDGPU::sub0);
    BuildMI(*MBB, I, DL, TII.get(AMDGPU::COPY), HiReg)
      .addReg(SrcReg, {}, AMDGPU::sub1);

    if (IsVALU && STI.hasSDWA()) {
      // Write the low 16-bits of the high element into the high 16-bits of the
      // low element.
      MachineInstr *MovSDWA =
        BuildMI(*MBB, I, DL, TII.get(AMDGPU::V_MOV_B32_sdwa), DstReg)
        .addImm(0)                             // $src0_modifiers
        .addReg(HiReg)                         // $src0
        .addImm(0)                             // $clamp
        .addImm(AMDGPU::SDWA::WORD_1)          // $dst_sel
        .addImm(AMDGPU::SDWA::UNUSED_PRESERVE) // $dst_unused
        .addImm(AMDGPU::SDWA::WORD_0)          // $src0_sel
        .addReg(LoReg, RegState::Implicit);
      MovSDWA->tieOperands(0, MovSDWA->getNumOperands() - 1);
    } else {
      // No SDWA: (Hi << 16) | (Lo & 0xffff) with the bank-appropriate ops.
      Register TmpReg0 = MRI->createVirtualRegister(DstRC);
      Register TmpReg1 = MRI->createVirtualRegister(DstRC);
      Register ImmReg = MRI->createVirtualRegister(DstRC);
      if (IsVALU) {
        BuildMI(*MBB, I, DL, TII.get(AMDGPU::V_LSHLREV_B32_e64), TmpReg0)
          .addImm(16)
          .addReg(HiReg);
      } else {
        BuildMI(*MBB, I, DL, TII.get(AMDGPU::S_LSHL_B32), TmpReg0)
            .addReg(HiReg)
            .addImm(16)
            .setOperandDead(3); // Dead scc
      }

      unsigned MovOpc = IsVALU ? AMDGPU::V_MOV_B32_e32 : AMDGPU::S_MOV_B32;
      unsigned AndOpc = IsVALU ? AMDGPU::V_AND_B32_e64 : AMDGPU::S_AND_B32;
      unsigned OrOpc = IsVALU ? AMDGPU::V_OR_B32_e64 : AMDGPU::S_OR_B32;

      BuildMI(*MBB, I, DL, TII.get(MovOpc), ImmReg)
        .addImm(0xffff);
      auto And = BuildMI(*MBB, I, DL, TII.get(AndOpc), TmpReg1)
        .addReg(LoReg)
        .addReg(ImmReg);
      auto Or = BuildMI(*MBB, I, DL, TII.get(OrOpc), DstReg)
        .addReg(TmpReg0)
        .addReg(TmpReg1);

      if (!IsVALU) {
        And.setOperandDead(3); // Dead scc
        Or.setOperandDead(3); // Dead scc
      }
    }

    I.eraseFromParent();
    return true;
  }

  if (!DstTy.isScalar())
    return false;

  // General case: a truncation is a copy from a subregister of the source.
  if (SrcSize > 32) {
    unsigned SubRegIdx = DstSize < 32
                             ? static_cast<unsigned>(AMDGPU::sub0)
                             : TRI.getSubRegFromChannel(0, DstSize / 32);
    if (SubRegIdx == AMDGPU::NoSubRegister)
      return false;

    // Deal with weird cases where the class only partially supports the subreg
    // index.
    const TargetRegisterClass *SrcWithSubRC
      = TRI.getSubClassWithSubReg(SrcRC, SubRegIdx);
    if (!SrcWithSubRC)
      return false;

    if (SrcWithSubRC != SrcRC) {
      if (!RBI.constrainGenericRegister(SrcReg, *SrcWithSubRC, *MRI))
        return false;
    }

    I.getOperand(1).setSubReg(SubRegIdx);
  }

  // The truncation itself becomes a plain COPY.
  I.setDesc(TII.get(TargetOpcode::COPY));
  return true;
}
2646
/// \returns true if a bitmask for \p Size bits will be an inline immediate.
/// On return, \p Mask holds a mask of \p Size trailing one bits.
static bool shouldUseAndMask(unsigned Size, unsigned &Mask) {
  // Restore the mask computation lost in extraction: previously Mask was
  // read while uninitialized and Size was ignored. Size is expected to be
  // in [1, 32] (a register-width or smaller bit count).
  Mask = Size < 32 ? (1u << Size) - 1 : ~0u;
  // SALU/VALU inline immediates cover the signed range [-16, 64].
  int SignedMask = static_cast<int>(Mask);
  return SignedMask >= -16 && SignedMask <= 64;
}
2653
2654// Like RegisterBankInfo::getRegBank, but don't assume vcc for s1.
2655const RegisterBank *AMDGPUInstructionSelector::getArtifactRegBank(
2656 Register Reg, const MachineRegisterInfo &MRI,
2657 const TargetRegisterInfo &TRI) const {
2658 const RegClassOrRegBank &RegClassOrBank = MRI.getRegClassOrRegBank(Reg);
2659 if (auto *RB = dyn_cast<const RegisterBank *>(RegClassOrBank))
2660 return RB;
2661
2662 // Ignore the type, since we don't use vcc in artifacts.
2663 if (auto *RC = dyn_cast<const TargetRegisterClass *>(RegClassOrBank))
2664 return &RBI.getRegBankFromRegClass(*RC, LLT());
2665 return nullptr;
2666}
2667
// Select G_SEXT / G_ZEXT / G_ANYEXT / G_SEXT_INREG. The lowering depends on
// the source's register bank (SALU vs. VALU path) and on the source and
// destination bit widths.
2668bool AMDGPUInstructionSelector::selectG_SZA_EXT(MachineInstr &I) const {
2669  bool InReg = I.getOpcode() == AMDGPU::G_SEXT_INREG;
2670  bool Signed = I.getOpcode() == AMDGPU::G_SEXT || InReg;
2671  const DebugLoc &DL = I.getDebugLoc();
2672  MachineBasicBlock &MBB = *I.getParent();
2673  const Register DstReg = I.getOperand(0).getReg();
2674  const Register SrcReg = I.getOperand(1).getReg();
2675
2676  const LLT DstTy = MRI->getType(DstReg);
2677  const LLT SrcTy = MRI->getType(SrcReg);
  // For G_SEXT_INREG the effective source width is the immediate operand,
  // not the width of the source register type.
2678  const unsigned SrcSize = I.getOpcode() == AMDGPU::G_SEXT_INREG ?
2679    I.getOperand(2).getImm() : SrcTy.getSizeInBits();
2680  const unsigned DstSize = DstTy.getSizeInBits();
2681  if (!DstTy.isScalar())
2682    return false;
2683
2684  // Artifact casts should never use vcc.
2685  const RegisterBank *SrcBank = getArtifactRegBank(SrcReg, *MRI, TRI);
2686
2687  // FIXME: This should probably be illegal and split earlier.
2688  if (I.getOpcode() == AMDGPU::G_ANYEXT) {
2689    if (DstSize <= 32)
2690      return selectCOPY(I);
2691
    // Wide anyext: place the source in the low half and leave the high half
    // undefined via IMPLICIT_DEF.
2692    const TargetRegisterClass *SrcRC =
2693        TRI.getRegClassForTypeOnBank(SrcTy, *SrcBank);
2694    const RegisterBank *DstBank = RBI.getRegBank(DstReg, *MRI, TRI);
2695    const TargetRegisterClass *DstRC =
2696        TRI.getRegClassForSizeOnBank(DstSize, *DstBank);
2697
2698    Register UndefReg = MRI->createVirtualRegister(SrcRC);
2699    BuildMI(MBB, I, DL, TII.get(AMDGPU::IMPLICIT_DEF), UndefReg);
2700    BuildMI(MBB, I, DL, TII.get(AMDGPU::REG_SEQUENCE), DstReg)
2701      .addReg(SrcReg)
2702      .addImm(AMDGPU::sub0)
2703      .addReg(UndefReg)
2704      .addImm(AMDGPU::sub1);
2705    I.eraseFromParent();
2706
2707    return RBI.constrainGenericRegister(DstReg, *DstRC, *MRI) &&
2708           RBI.constrainGenericRegister(SrcReg, *SrcRC, *MRI);
2709  }
2710
2711  if (SrcBank->getID() == AMDGPU::VGPRRegBankID && DstSize <= 32) {
2712    // 64-bit should have been split up in RegBankSelect
2713
2714    // Try to use an and with a mask if it will save code size.
2715    unsigned Mask;
2716    if (!Signed && shouldUseAndMask(SrcSize, Mask)) {
2717      MachineInstr *ExtI =
2718      BuildMI(MBB, I, DL, TII.get(AMDGPU::V_AND_B32_e32), DstReg)
2719        .addImm(Mask)
2720        .addReg(SrcReg);
2721      I.eraseFromParent();
2722      constrainSelectedInstRegOperands(*ExtI, TII, TRI, RBI);
2723      return true;
2724    }
2725
    // Otherwise use a vector bitfield extract; the signed/unsigned variant
    // implements sext vs. zext respectively.
2726    const unsigned BFE = Signed ? AMDGPU::V_BFE_I32_e64 : AMDGPU::V_BFE_U32_e64;
2727    MachineInstr *ExtI =
2728      BuildMI(MBB, I, DL, TII.get(BFE), DstReg)
2729      .addReg(SrcReg)
2730      .addImm(0) // Offset
2731      .addImm(SrcSize); // Width
2732    I.eraseFromParent();
2733    constrainSelectedInstRegOperands(*ExtI, TII, TRI, RBI);
2734    return true;
2735  }
2736
2737  if (SrcBank->getID() == AMDGPU::SGPRRegBankID && DstSize <= 64) {
2738    const TargetRegisterClass &SrcRC = InReg && DstSize > 32 ?
2739      AMDGPU::SReg_64RegClass : AMDGPU::SReg_32RegClass;
2740    if (!RBI.constrainGenericRegister(SrcReg, SrcRC, *MRI))
2741      return false;
2742
    // Prefer the dedicated scalar byte/halfword sign-extend instructions.
2743    if (Signed && DstSize == 32 && (SrcSize == 8 || SrcSize == 16)) {
2744      const unsigned SextOpc = SrcSize == 8 ?
2745        AMDGPU::S_SEXT_I32_I8 : AMDGPU::S_SEXT_I32_I16;
2746      BuildMI(MBB, I, DL, TII.get(SextOpc), DstReg)
2747        .addReg(SrcReg);
2748      I.eraseFromParent();
2749      return RBI.constrainGenericRegister(DstReg, AMDGPU::SReg_32RegClass, *MRI);
2750    }
2751
2752    // Using a single 32-bit SALU to calculate the high half is smaller than
2753    // S_BFE with a literal constant operand.
2754    if (DstSize > 32 && SrcSize == 32) {
2755      Register HiReg = MRI->createVirtualRegister(&AMDGPU::SReg_32RegClass);
2756      unsigned SubReg = InReg ? AMDGPU::sub0 : AMDGPU::NoSubRegister;
2757      if (Signed) {
        // High half = low half >> 31 (sign replication).
2758        BuildMI(MBB, I, DL, TII.get(AMDGPU::S_ASHR_I32), HiReg)
2759          .addReg(SrcReg, {}, SubReg)
2760          .addImm(31)
2761          .setOperandDead(3); // Dead scc
2762      } else {
2763        BuildMI(MBB, I, DL, TII.get(AMDGPU::S_MOV_B32), HiReg)
2764          .addImm(0);
2765      }
2766      BuildMI(MBB, I, DL, TII.get(AMDGPU::REG_SEQUENCE), DstReg)
2767        .addReg(SrcReg, {}, SubReg)
2768        .addImm(AMDGPU::sub0)
2769        .addReg(HiReg)
2770        .addImm(AMDGPU::sub1);
2771      I.eraseFromParent();
2772      return RBI.constrainGenericRegister(DstReg, AMDGPU::SReg_64RegClass,
2773                                          *MRI);
2774    }
2775
2776    const unsigned BFE64 = Signed ? AMDGPU::S_BFE_I64 : AMDGPU::S_BFE_U64;
2777    const unsigned BFE32 = Signed ? AMDGPU::S_BFE_I32 : AMDGPU::S_BFE_U32;
2778
2779    // Scalar BFE is encoded as S1[5:0] = offset, S1[22:16]= width.
2780    if (DstSize > 32 && (SrcSize <= 32 || InReg)) {
2781      // We need a 64-bit register source, but the high bits don't matter.
2782      Register ExtReg = MRI->createVirtualRegister(&AMDGPU::SReg_64RegClass);
2783      Register UndefReg = MRI->createVirtualRegister(&AMDGPU::SReg_32RegClass);
2784      unsigned SubReg = InReg ? AMDGPU::sub0 : AMDGPU::NoSubRegister;
2785
2786      BuildMI(MBB, I, DL, TII.get(AMDGPU::IMPLICIT_DEF), UndefReg);
2787      BuildMI(MBB, I, DL, TII.get(AMDGPU::REG_SEQUENCE), ExtReg)
2788        .addReg(SrcReg, {}, SubReg)
2789        .addImm(AMDGPU::sub0)
2790        .addReg(UndefReg)
2791        .addImm(AMDGPU::sub1);
2792
2793      BuildMI(MBB, I, DL, TII.get(BFE64), DstReg)
2794        .addReg(ExtReg)
2795        .addImm(SrcSize << 16);
2796
2797      I.eraseFromParent();
2798      return RBI.constrainGenericRegister(DstReg, AMDGPU::SReg_64RegClass, *MRI);
2799    }
2800
    // 32-bit scalar result: mask if it saves encoding size, else S_BFE.
2801    unsigned Mask;
2802    if (!Signed && shouldUseAndMask(SrcSize, Mask)) {
2803      BuildMI(MBB, I, DL, TII.get(AMDGPU::S_AND_B32), DstReg)
2804        .addReg(SrcReg)
2805        .addImm(Mask)
2806        .setOperandDead(3); // Dead scc
2807    } else {
2808      BuildMI(MBB, I, DL, TII.get(BFE32), DstReg)
2809        .addReg(SrcReg)
2810        .addImm(SrcSize << 16);
2811    }
2812
2813    I.eraseFromParent();
2814    return RBI.constrainGenericRegister(DstReg, AMDGPU::SReg_32RegClass, *MRI);
2815  }
2816
2817  return false;
2818}
2819
// NOTE(review): the signature line of this helper (doc line 2824) was lost in
// extraction — presumably `static Register stripBitCast(Register, MRI&)`;
// confirm against the original file. The body looks through a single
// G_BITCAST to its source register, returning the input unchanged otherwise.
2823
2825  Register BitcastSrc;
2826  if (mi_match(Reg, MRI, m_GBitcast(m_Reg(BitcastSrc))))
2827    Reg = BitcastSrc;
2828  return Reg;
2829}
2830
// NOTE(review): the first line of this signature (doc line 2831) was lost in
// extraction; from the call sites this is
// isExtractHiElt(MachineRegisterInfo &MRI, Register In, Register &Out).
// Recognizes "extract the high 16-bit half" patterns: either
// trunc(lshr x, 16) or a <2 x s16> shuffle selecting element 1 first.
2832                                 Register &Out) {
2833  Register Trunc;
2834  if (!mi_match(In, MRI, m_GTrunc(m_Reg(Trunc))))
2835    return false;
2836
  // Form 1: trunc (lshr x, 16) — look through copies on the shift amount and
  // through a bitcast on the shifted source.
2837  Register LShlSrc;
2838  Register Cst;
2839  if (mi_match(Trunc, MRI, m_GLShr(m_Reg(LShlSrc), m_Reg(Cst)))) {
2840    Cst = stripCopy(Cst, MRI);
2841    if (mi_match(Cst, MRI, m_SpecificICst(16))) {
2842      Out = stripBitCast(LShlSrc, MRI);
2843      return true;
2844    }
2845  }
2846
  // Form 2: trunc (shuffle_vector <2 x s16> v) whose mask puts element 1 in
  // lane 0.
2847  MachineInstr *Shuffle = MRI.getVRegDef(Trunc);
2848  if (Shuffle->getOpcode() != AMDGPU::G_SHUFFLE_VECTOR)
2849    return false;
2850
2851  assert(MRI.getType(Shuffle->getOperand(0).getReg()) ==
2852         LLT::fixed_vector(2, 16));
2853
2854  ArrayRef<int> Mask = Shuffle->getOperand(3).getShuffleMask();
2855  assert(Mask.size() == 2);
2856
2857  if (Mask[0] == 1 && Mask[1] <= 1) {
2858    Out = Shuffle->getOperand(0).getReg();
2859    return true;
2860  }
2861
2862  return false;
2863}
2864
2865bool AMDGPUInstructionSelector::selectG_FPEXT(MachineInstr &I) const {
2866 if (!Subtarget->hasSALUFloatInsts())
2867 return false;
2868
2869 Register Dst = I.getOperand(0).getReg();
2870 const RegisterBank *DstRB = RBI.getRegBank(Dst, *MRI, TRI);
2871 if (DstRB->getID() != AMDGPU::SGPRRegBankID)
2872 return false;
2873
2874 Register Src = I.getOperand(1).getReg();
2875
2876 if (MRI->getType(Dst) == LLT::scalar(32) &&
2877 MRI->getType(Src) == LLT::scalar(16)) {
2878 if (isExtractHiElt(*MRI, Src, Src)) {
2879 MachineBasicBlock *BB = I.getParent();
2880 BuildMI(*BB, &I, I.getDebugLoc(), TII.get(AMDGPU::S_CVT_HI_F32_F16), Dst)
2881 .addUse(Src);
2882 I.eraseFromParent();
2883 return RBI.constrainGenericRegister(Dst, AMDGPU::SReg_32RegClass, *MRI);
2884 }
2885 }
2886
2887 return false;
2888}
2889
2890bool AMDGPUInstructionSelector::selectG_FNEG(MachineInstr &MI) const {
2891  // Only manually handle the f64 SGPR case.
2892  //
2893  // FIXME: This is a workaround for 2.5 different tablegen problems. Because
2894  // the bit ops theoretically have a second result due to the implicit def of
2895  // SCC, the GlobalISelEmitter is overly conservative and rejects it. Fixing
2896  // that is easy by disabling the check. The result works, but uses a
2897  // nonsensical sreg32orlds_and_sreg_1 regclass.
2898  //
2899  // The DAG emitter is more problematic, and incorrectly adds both S_XOR_B32 to
2900  // the variadic REG_SEQUENCE operands.
2901
2902  Register Dst = MI.getOperand(0).getReg();
2903  const RegisterBank *DstRB = RBI.getRegBank(Dst, *MRI, TRI);
2904  if (DstRB->getID() != AMDGPU::SGPRRegBankID ||
2905      MRI->getType(Dst) != LLT::scalar(64))
2906    return false;
2907
  // Fold fneg(fabs(x)) by looking through a G_FABS source; the combined
  // operation becomes an OR of the sign bit instead of an XOR.
2908  Register Src = MI.getOperand(1).getReg();
2909  MachineInstr *Fabs = getOpcodeDef(TargetOpcode::G_FABS, Src, *MRI);
2910  if (Fabs)
2911    Src = Fabs->getOperand(1).getReg();
2912
2913  if (!RBI.constrainGenericRegister(Src, AMDGPU::SReg_64RegClass, *MRI) ||
2914      !RBI.constrainGenericRegister(Dst, AMDGPU::SReg_64RegClass, *MRI))
2915    return false;
2916
2917  MachineBasicBlock *BB = MI.getParent();
2918  const DebugLoc &DL = MI.getDebugLoc();
2919  Register LoReg = MRI->createVirtualRegister(&AMDGPU::SReg_32RegClass);
2920  Register HiReg = MRI->createVirtualRegister(&AMDGPU::SReg_32RegClass);
2921  Register ConstReg = MRI->createVirtualRegister(&AMDGPU::SReg_32RegClass);
2922  Register OpReg = MRI->createVirtualRegister(&AMDGPU::SReg_32RegClass);
2923
  // Split the 64-bit value; only the high word carries the sign bit.
2924  BuildMI(*BB, &MI, DL, TII.get(AMDGPU::COPY), LoReg)
2925    .addReg(Src, {}, AMDGPU::sub0);
2926  BuildMI(*BB, &MI, DL, TII.get(AMDGPU::COPY), HiReg)
2927    .addReg(Src, {}, AMDGPU::sub1);
2928  BuildMI(*BB, &MI, DL, TII.get(AMDGPU::S_MOV_B32), ConstReg)
2929    .addImm(0x80000000);
2930
2931  // Set or toggle sign bit.
2932  unsigned Opc = Fabs ? AMDGPU::S_OR_B32 : AMDGPU::S_XOR_B32;
2933  BuildMI(*BB, &MI, DL, TII.get(Opc), OpReg)
2934    .addReg(HiReg)
2935    .addReg(ConstReg)
2936    .setOperandDead(3); // Dead scc
2937  BuildMI(*BB, &MI, DL, TII.get(AMDGPU::REG_SEQUENCE), Dst)
2938    .addReg(LoReg)
2939    .addImm(AMDGPU::sub0)
2940    .addReg(OpReg)
2941    .addImm(AMDGPU::sub1);
2942  MI.eraseFromParent();
2943  return true;
2944}
2945
2946// FIXME: This is a workaround for the same tablegen problems as G_FNEG
2947bool AMDGPUInstructionSelector::selectG_FABS(MachineInstr &MI) const {
2948 Register Dst = MI.getOperand(0).getReg();
2949 const RegisterBank *DstRB = RBI.getRegBank(Dst, *MRI, TRI);
2950 if (DstRB->getID() != AMDGPU::SGPRRegBankID ||
2951 MRI->getType(Dst) != LLT::scalar(64))
2952 return false;
2953
2954 Register Src = MI.getOperand(1).getReg();
2955 MachineBasicBlock *BB = MI.getParent();
2956 const DebugLoc &DL = MI.getDebugLoc();
2957 Register LoReg = MRI->createVirtualRegister(&AMDGPU::SReg_32RegClass);
2958 Register HiReg = MRI->createVirtualRegister(&AMDGPU::SReg_32RegClass);
2959 Register ConstReg = MRI->createVirtualRegister(&AMDGPU::SReg_32RegClass);
2960 Register OpReg = MRI->createVirtualRegister(&AMDGPU::SReg_32RegClass);
2961
2962 if (!RBI.constrainGenericRegister(Src, AMDGPU::SReg_64RegClass, *MRI) ||
2963 !RBI.constrainGenericRegister(Dst, AMDGPU::SReg_64RegClass, *MRI))
2964 return false;
2965
2966 BuildMI(*BB, &MI, DL, TII.get(AMDGPU::COPY), LoReg)
2967 .addReg(Src, {}, AMDGPU::sub0);
2968 BuildMI(*BB, &MI, DL, TII.get(AMDGPU::COPY), HiReg)
2969 .addReg(Src, {}, AMDGPU::sub1);
2970 BuildMI(*BB, &MI, DL, TII.get(AMDGPU::S_MOV_B32), ConstReg)
2971 .addImm(0x7fffffff);
2972
2973 // Clear sign bit.
2974 // TODO: Should this used S_BITSET0_*?
2975 BuildMI(*BB, &MI, DL, TII.get(AMDGPU::S_AND_B32), OpReg)
2976 .addReg(HiReg)
2977 .addReg(ConstReg)
2978 .setOperandDead(3); // Dead scc
2979 BuildMI(*BB, &MI, DL, TII.get(AMDGPU::REG_SEQUENCE), Dst)
2980 .addReg(LoReg)
2981 .addImm(AMDGPU::sub0)
2982 .addReg(OpReg)
2983 .addImm(AMDGPU::sub1);
2984
2985 MI.eraseFromParent();
2986 return true;
2987}
2988
2989static bool isConstant(const MachineInstr &MI) {
2990 return MI.getOpcode() == TargetOpcode::G_CONSTANT;
2991}
2992
2993void AMDGPUInstructionSelector::getAddrModeInfo(const MachineInstr &Load,
2994 const MachineRegisterInfo &MRI, SmallVectorImpl<GEPInfo> &AddrInfo) const {
2995
2996 unsigned OpNo = Load.getOpcode() == AMDGPU::G_PREFETCH ? 0 : 1;
2997 const MachineInstr *PtrMI =
2998 MRI.getUniqueVRegDef(Load.getOperand(OpNo).getReg());
2999
3000 assert(PtrMI);
3001
3002 if (PtrMI->getOpcode() != TargetOpcode::G_PTR_ADD)
3003 return;
3004
3005 GEPInfo GEPInfo;
3006
3007 for (unsigned i = 1; i != 3; ++i) {
3008 const MachineOperand &GEPOp = PtrMI->getOperand(i);
3009 const MachineInstr *OpDef = MRI.getUniqueVRegDef(GEPOp.getReg());
3010 assert(OpDef);
3011 if (i == 2 && isConstant(*OpDef)) {
3012 // TODO: Could handle constant base + variable offset, but a combine
3013 // probably should have commuted it.
3014 assert(GEPInfo.Imm == 0);
3015 GEPInfo.Imm = OpDef->getOperand(1).getCImm()->getSExtValue();
3016 continue;
3017 }
3018 const RegisterBank *OpBank = RBI.getRegBank(GEPOp.getReg(), MRI, TRI);
3019 if (OpBank->getID() == AMDGPU::SGPRRegBankID)
3020 GEPInfo.SgprParts.push_back(GEPOp.getReg());
3021 else
3022 GEPInfo.VgprParts.push_back(GEPOp.getReg());
3023 }
3024
3025 AddrInfo.push_back(GEPInfo);
3026 getAddrModeInfo(*PtrMI, MRI, AddrInfo);
3027}
3028
3029bool AMDGPUInstructionSelector::isSGPR(Register Reg) const {
3030 return RBI.getRegBank(Reg, *MRI, TRI)->getID() == AMDGPU::SGPRRegBankID;
3031}
3032
// Decide whether a memory access is uniform (wave-invariant), based on the
// IR value behind its single memory operand.
3033bool AMDGPUInstructionSelector::isInstrUniform(const MachineInstr &MI) const {
3034  if (!MI.hasOneMemOperand())
3035    return false;
3036
3037  const MachineMemOperand *MMO = *MI.memoperands_begin();
3038  const Value *Ptr = MMO->getValue();
3039
3040  // UndefValue means this is a load of a kernel input. These are uniform.
3041  // Sometimes LDS instructions have constant pointers.
3042  // If Ptr is null, then that means this mem operand contains a
3043  // PseudoSourceValue like GOT.
  // NOTE(review): the condition guarding this return (doc line 3044) was lost
  // in extraction — presumably a null-check on Ptr; confirm in the original.
3045    return true;
3046
  // NOTE(review): the condition guarding this return (doc line 3047) was lost
  // in extraction — presumably an isa<> test on Ptr; confirm in the original.
3048    return true;
3049
  // Prefetch uniformity is decided by its address operand's register bank.
3050  if (MI.getOpcode() == AMDGPU::G_PREFETCH)
3051    return RBI.getRegBank(MI.getOperand(0).getReg(), *MRI, TRI)->getID() ==
3052           AMDGPU::SGPRRegBankID;
3053
  // Otherwise fall back to the frontend-provided uniformity metadata.
3054  const Instruction *I = dyn_cast<Instruction>(Ptr);
3055  return I && I->getMetadata("amdgpu.uniform");
3056}
3057
3058bool AMDGPUInstructionSelector::hasVgprParts(ArrayRef<GEPInfo> AddrInfo) const {
3059 for (const GEPInfo &GEPInfo : AddrInfo) {
3060 if (!GEPInfo.VgprParts.empty())
3061 return true;
3062 }
3063 return false;
3064}
3065
// Initialize M0 to all-ones before a DS access on subtargets that require it.
3066void AMDGPUInstructionSelector::initM0(MachineInstr &I) const {
3067  const LLT PtrTy = MRI->getType(I.getOperand(1).getReg());
3068  unsigned AS = PtrTy.getAddressSpace();
  // NOTE(review): the first line of this condition (doc line 3069) was lost
  // in extraction — presumably a test of AS against the LDS/region address
  // spaces; confirm in the original file.
3070      STI.ldsRequiresM0Init()) {
3071    MachineBasicBlock *BB = I.getParent();
3072
3073    // If DS instructions require M0 initialization, insert it before selecting.
3074    BuildMI(*BB, &I, I.getDebugLoc(), TII.get(AMDGPU::S_MOV_B32), AMDGPU::M0)
3075      .addImm(-1);
3076  }
3077}
3078
3079bool AMDGPUInstructionSelector::selectG_LOAD_STORE_ATOMICRMW(
3080 MachineInstr &I) const {
3081 initM0(I);
3082 return selectImpl(I, *CoverageInfo);
3083}
3084
  // NOTE(review): the signature line (doc 3085, presumably
  // `static bool isVCmpResult(Register Reg, MachineRegisterInfo &MRI)`) and
  // the defining-instruction lookup (doc 3089, presumably deriving MI from
  // MRI.getVRegDef(Reg)) were lost in extraction; confirm in the original.
  // The body checks whether Reg is transitively produced by a compare,
  // looking through copies and bitwise combinations of compare results.
3086  if (Reg.isPhysical())
3087    return false;
3088
3090  const unsigned Opcode = MI.getOpcode();
3091
3092  if (Opcode == AMDGPU::COPY)
3093    return isVCmpResult(MI.getOperand(1).getReg(), MRI);
3094
  // Bitwise combinations of lane masks are still lane masks.
3095  if (Opcode == AMDGPU::G_AND || Opcode == AMDGPU::G_OR ||
3096      Opcode == AMDGPU::G_XOR)
3097    return isVCmpResult(MI.getOperand(1).getReg(), MRI) &&
3098           isVCmpResult(MI.getOperand(2).getReg(), MRI);
3099
  // llvm.amdgcn.class also produces a compare-style result.
3100  if (auto *GI = dyn_cast<GIntrinsic>(&MI))
3101    return GI->is(Intrinsic::amdgcn_class);
3102
3103  return Opcode == AMDGPU::G_ICMP || Opcode == AMDGPU::G_FCMP;
3104}
3105
// Select G_BRCOND into either a uniform (SCC) or divergent (VCC) conditional
// branch, depending on the register bank of the condition.
3106bool AMDGPUInstructionSelector::selectG_BRCOND(MachineInstr &I) const {
3107  MachineBasicBlock *BB = I.getParent();
3108  MachineOperand &CondOp = I.getOperand(0);
3109  Register CondReg = CondOp.getReg();
3110  const DebugLoc &DL = I.getDebugLoc();
3111
3112  unsigned BrOpcode;
3113  Register CondPhysReg;
3114  const TargetRegisterClass *ConstrainRC;
3115
3116  // In SelectionDAG, we inspect the IR block for uniformity metadata to decide
3117  // whether the branch is uniform when selecting the instruction. In
3118  // GlobalISel, we should push that decision into RegBankSelect. Assume for now
3119  // RegBankSelect knows what it's doing if the branch condition is scc, even
3120  // though it currently does not.
3121  if (!isVCC(CondReg, *MRI)) {
    // Uniform branch: condition goes through the scalar condition code.
3122    if (MRI->getType(CondReg) != LLT::scalar(32))
3123      return false;
3124
3125    CondPhysReg = AMDGPU::SCC;
3126    BrOpcode = AMDGPU::S_CBRANCH_SCC1;
3127    ConstrainRC = &AMDGPU::SReg_32RegClass;
3128  } else {
3129    // FIXME: Should scc->vcc copies and with exec?
3130
3131    // Unless the value of CondReg is a result of a V_CMP* instruction then we
3132    // need to insert an and with exec.
3133    if (!isVCmpResult(CondReg, *MRI)) {
3134      const bool Is64 = STI.isWave64();
3135      const unsigned Opcode = Is64 ? AMDGPU::S_AND_B64 : AMDGPU::S_AND_B32;
3136      const Register Exec = Is64 ? AMDGPU::EXEC : AMDGPU::EXEC_LO;
3137
3138      Register TmpReg = MRI->createVirtualRegister(TRI.getBoolRC());
3139      BuildMI(*BB, &I, DL, TII.get(Opcode), TmpReg)
3140          .addReg(CondReg)
3141          .addReg(Exec)
3142          .setOperandDead(3); // Dead scc
3143      CondReg = TmpReg;
3144    }
3145
3146    CondPhysReg = TRI.getVCC();
3147    BrOpcode = AMDGPU::S_CBRANCH_VCCNZ;
3148    ConstrainRC = TRI.getBoolRC();
3149  }
3150
3151  if (!MRI->getRegClassOrNull(CondReg))
3152    MRI->setRegClass(CondReg, ConstrainRC);
3153
  // Move the condition into the physical condition register, then branch.
3154  BuildMI(*BB, &I, DL, TII.get(AMDGPU::COPY), CondPhysReg)
3155    .addReg(CondReg);
3156  BuildMI(*BB, &I, DL, TII.get(BrOpcode))
3157    .addMBB(I.getOperand(1).getMBB());
3158
3159  I.eraseFromParent();
3160  return true;
3161}
3162
3163bool AMDGPUInstructionSelector::selectG_GLOBAL_VALUE(
3164 MachineInstr &I) const {
3165 Register DstReg = I.getOperand(0).getReg();
3166 const RegisterBank *DstRB = RBI.getRegBank(DstReg, *MRI, TRI);
3167 const bool IsVGPR = DstRB->getID() == AMDGPU::VGPRRegBankID;
3168 I.setDesc(TII.get(IsVGPR ? AMDGPU::V_MOV_B32_e32 : AMDGPU::S_MOV_B32));
3169 if (IsVGPR)
3170 I.addOperand(*MF, MachineOperand::CreateReg(AMDGPU::EXEC, false, true));
3171
3172 return RBI.constrainGenericRegister(
3173 DstReg, IsVGPR ? AMDGPU::VGPR_32RegClass : AMDGPU::SReg_32RegClass, *MRI);
3174}
3175
// Select G_PTRMASK (pointer & mask). Uses known-bits of the mask to avoid
// masking halves that are provably all-ones.
3176bool AMDGPUInstructionSelector::selectG_PTRMASK(MachineInstr &I) const {
3177  Register DstReg = I.getOperand(0).getReg();
3178  Register SrcReg = I.getOperand(1).getReg();
3179  Register MaskReg = I.getOperand(2).getReg();
3180  LLT Ty = MRI->getType(DstReg);
3181  LLT MaskTy = MRI->getType(MaskReg);
3182  MachineBasicBlock *BB = I.getParent();
3183  const DebugLoc &DL = I.getDebugLoc();
3184
3185  const RegisterBank *DstRB = RBI.getRegBank(DstReg, *MRI, TRI);
3186  const RegisterBank *SrcRB = RBI.getRegBank(SrcReg, *MRI, TRI);
3187  const RegisterBank *MaskRB = RBI.getRegBank(MaskReg, *MRI, TRI);
3188  const bool IsVGPR = DstRB->getID() == AMDGPU::VGPRRegBankID;
3189  if (DstRB != SrcRB) // Should only happen for hand written MIR.
3190    return false;
3191
3192  // Try to avoid emitting a bit operation when we only need to touch half of
3193  // the 64-bit pointer.
3194  APInt MaskOnes = VT->getKnownOnes(MaskReg).zext(64);
3195  const APInt MaskHi32 = APInt::getHighBitsSet(64, 32);
3196  const APInt MaskLo32 = APInt::getLowBitsSet(64, 32);
3197
3198  const bool CanCopyLow32 = (MaskOnes & MaskLo32) == MaskLo32;
3199  const bool CanCopyHi32 = (MaskOnes & MaskHi32) == MaskHi32;
3200
  // Scalar 64-bit masks that touch both halves can be done in one S_AND_B64.
3201  if (!IsVGPR && Ty.getSizeInBits() == 64 &&
3202      !CanCopyLow32 && !CanCopyHi32) {
3203    auto MIB = BuildMI(*BB, &I, DL, TII.get(AMDGPU::S_AND_B64), DstReg)
3204      .addReg(SrcReg)
3205      .addReg(MaskReg)
3206      .setOperandDead(3); // Dead scc
3207    I.eraseFromParent();
3208    constrainSelectedInstRegOperands(*MIB, TII, TRI, RBI);
3209    return true;
3210  }
3211
3212  unsigned NewOpc = IsVGPR ? AMDGPU::V_AND_B32_e64 : AMDGPU::S_AND_B32;
3213  const TargetRegisterClass &RegRC
3214    = IsVGPR ? AMDGPU::VGPR_32RegClass : AMDGPU::SReg_32RegClass;
3215
3216  const TargetRegisterClass *DstRC = TRI.getRegClassForTypeOnBank(Ty, *DstRB);
3217  const TargetRegisterClass *SrcRC = TRI.getRegClassForTypeOnBank(Ty, *SrcRB);
3218  const TargetRegisterClass *MaskRC =
3219      TRI.getRegClassForTypeOnBank(MaskTy, *MaskRB);
3220
3221  if (!RBI.constrainGenericRegister(DstReg, *DstRC, *MRI) ||
3222      !RBI.constrainGenericRegister(SrcReg, *SrcRC, *MRI) ||
3223      !RBI.constrainGenericRegister(MaskReg, *MaskRC, *MRI))
3224    return false;
3225
  // 32-bit pointers: a single 32-bit AND suffices.
3226  if (Ty.getSizeInBits() == 32) {
3227    assert(MaskTy.getSizeInBits() == 32 &&
3228           "ptrmask should have been narrowed during legalize");
3229
3230    auto NewOp = BuildMI(*BB, &I, DL, TII.get(NewOpc), DstReg)
3231      .addReg(SrcReg)
3232      .addReg(MaskReg);
3233
3234    if (!IsVGPR)
3235      NewOp.setOperandDead(3); // Dead scc
3236    I.eraseFromParent();
3237    return true;
3238  }
3239
  // 64-bit case handled per 32-bit half, skipping the AND for any half whose
  // mask is known all-ones.
3240  Register HiReg = MRI->createVirtualRegister(&RegRC);
3241  Register LoReg = MRI->createVirtualRegister(&RegRC);
3242
3243  // Extract the subregisters from the source pointer.
3244  BuildMI(*BB, &I, DL, TII.get(AMDGPU::COPY), LoReg)
3245    .addReg(SrcReg, {}, AMDGPU::sub0);
3246  BuildMI(*BB, &I, DL, TII.get(AMDGPU::COPY), HiReg)
3247    .addReg(SrcReg, {}, AMDGPU::sub1);
3248
3249  Register MaskedLo, MaskedHi;
3250
3251  if (CanCopyLow32) {
3252    // If all the bits in the low half are 1, we only need a copy for it.
3253    MaskedLo = LoReg;
3254  } else {
3255    // Extract the mask subregister and apply the and.
3256    Register MaskLo = MRI->createVirtualRegister(&RegRC);
3257    MaskedLo = MRI->createVirtualRegister(&RegRC);
3258
3259    BuildMI(*BB, &I, DL, TII.get(AMDGPU::COPY), MaskLo)
3260      .addReg(MaskReg, {}, AMDGPU::sub0);
3261    BuildMI(*BB, &I, DL, TII.get(NewOpc), MaskedLo)
3262      .addReg(LoReg)
3263      .addReg(MaskLo);
3264  }
3265
3266  if (CanCopyHi32) {
3267    // If all the bits in the high half are 1, we only need a copy for it.
3268    MaskedHi = HiReg;
3269  } else {
3270    Register MaskHi = MRI->createVirtualRegister(&RegRC);
3271    MaskedHi = MRI->createVirtualRegister(&RegRC);
3272
3273    BuildMI(*BB, &I, DL, TII.get(AMDGPU::COPY), MaskHi)
3274      .addReg(MaskReg, {}, AMDGPU::sub1);
3275    BuildMI(*BB, &I, DL, TII.get(NewOpc), MaskedHi)
3276      .addReg(HiReg)
3277      .addReg(MaskHi);
3278  }
3279
3280  BuildMI(*BB, &I, DL, TII.get(AMDGPU::REG_SEQUENCE), DstReg)
3281    .addReg(MaskedLo)
3282    .addImm(AMDGPU::sub0)
3283    .addReg(MaskedHi)
3284    .addImm(AMDGPU::sub1);
3285  I.eraseFromParent();
3286  return true;
3287}
3288
3289/// Return the register to use for the index value, and the subregister to use
3290/// for the indirectly accessed register.
3291static std::pair<Register, unsigned>
// NOTE(review): the line carrying the function name and leading parameters
// (doc line 3292, presumably
// `computeIndirectRegIndex(MachineRegisterInfo &MRI, const SIRegisterInfo &TRI,`)
// was lost in extraction; confirm in the original file.
3293                        const TargetRegisterClass *SuperRC, Register IdxReg,
3294                        unsigned EltSize, GISelValueTracking &ValueTracking) {
3295  Register IdxBaseReg;
3296  int Offset;
3297
  // Split the index into a base register plus a constant offset, when value
  // tracking can prove one.
3298  std::tie(IdxBaseReg, Offset) =
3299      AMDGPU::getBaseWithConstantOffset(MRI, IdxReg, &ValueTracking);
3300  if (IdxBaseReg == AMDGPU::NoRegister) {
3301    // This will happen if the index is a known constant. This should ordinarily
3302    // be legalized out, but handle it as a register just in case.
3303    assert(Offset == 0);
3304    IdxBaseReg = IdxReg;
3305  }
3306
3307  ArrayRef<int16_t> SubRegs = TRI.getRegSplitParts(SuperRC, EltSize);
3308
3309  // Skip out of bounds offsets, or else we would end up using an undefined
3310  // register.
3311  if (static_cast<unsigned>(Offset) >= SubRegs.size())
3312    return std::pair(IdxReg, SubRegs[0]);
3313  return std::pair(IdxBaseReg, SubRegs[Offset]);
3314}
3315
// Select a dynamically indexed vector extract, using M0-relative moves
// (S_MOVRELS / V_MOVRELS) or VGPR index mode depending on the subtarget and
// the vector's register bank.
3316bool AMDGPUInstructionSelector::selectG_EXTRACT_VECTOR_ELT(
3317    MachineInstr &MI) const {
3318  Register DstReg = MI.getOperand(0).getReg();
3319  Register SrcReg = MI.getOperand(1).getReg();
3320  Register IdxReg = MI.getOperand(2).getReg();
3321
3322  LLT DstTy = MRI->getType(DstReg);
3323  LLT SrcTy = MRI->getType(SrcReg);
3324
3325  const RegisterBank *DstRB = RBI.getRegBank(DstReg, *MRI, TRI);
3326  const RegisterBank *SrcRB = RBI.getRegBank(SrcReg, *MRI, TRI);
3327  const RegisterBank *IdxRB = RBI.getRegBank(IdxReg, *MRI, TRI);
3328
3329  // The index must be scalar. If it wasn't RegBankSelect should have moved this
3330  // into a waterfall loop.
3331  if (IdxRB->getID() != AMDGPU::SGPRRegBankID)
3332    return false;
3333
3334  const TargetRegisterClass *SrcRC =
3335      TRI.getRegClassForTypeOnBank(SrcTy, *SrcRB);
3336  const TargetRegisterClass *DstRC =
3337      TRI.getRegClassForTypeOnBank(DstTy, *DstRB);
3338  if (!SrcRC || !DstRC)
3339    return false;
3340  if (!RBI.constrainGenericRegister(SrcReg, *SrcRC, *MRI) ||
3341      !RBI.constrainGenericRegister(DstReg, *DstRC, *MRI) ||
3342      !RBI.constrainGenericRegister(IdxReg, AMDGPU::SReg_32RegClass, *MRI))
3343    return false;
3344
3345  MachineBasicBlock *BB = MI.getParent();
3346  const DebugLoc &DL = MI.getDebugLoc();
3347  const bool Is64 = DstTy.getSizeInBits() == 64;
3348
  // Fold a constant part of the index into the starting subregister.
3349  unsigned SubReg;
3350  std::tie(IdxReg, SubReg) = computeIndirectRegIndex(
3351      *MRI, TRI, SrcRC, IdxReg, DstTy.getSizeInBits() / 8, *VT);
3352
3353  if (SrcRB->getID() == AMDGPU::SGPRRegBankID) {
3354    if (DstTy.getSizeInBits() != 32 && !Is64)
3355      return false;
3356
    // Scalar path: M0 holds the index for S_MOVRELS.
3357    BuildMI(*BB, &MI, DL, TII.get(AMDGPU::COPY), AMDGPU::M0)
3358      .addReg(IdxReg);
3359
3360    unsigned Opc = Is64 ? AMDGPU::S_MOVRELS_B64 : AMDGPU::S_MOVRELS_B32;
3361    BuildMI(*BB, &MI, DL, TII.get(Opc), DstReg)
3362      .addReg(SrcReg, {}, SubReg)
3363      .addReg(SrcReg, RegState::Implicit)
3364    MI.eraseFromParent();
3365    return true;
3366  }
3367
3368  if (SrcRB->getID() != AMDGPU::VGPRRegBankID || DstTy.getSizeInBits() != 32)
3369    return false;
3370
3371  if (!STI.useVGPRIndexMode()) {
    // Vector path without index mode: V_MOVRELS reads M0.
3372    BuildMI(*BB, &MI, DL, TII.get(AMDGPU::COPY), AMDGPU::M0)
3373      .addReg(IdxReg);
3374    BuildMI(*BB, &MI, DL, TII.get(AMDGPU::V_MOVRELS_B32_e32), DstReg)
3375      .addReg(SrcReg, {}, SubReg)
3376      .addReg(SrcReg, RegState::Implicit);
3377    MI.eraseFromParent();
3378    return true;
3379  }
3380
  // Vector path with VGPR index mode: use the GPR_IDX pseudo.
3381  const MCInstrDesc &GPRIDXDesc =
3382      TII.getIndirectGPRIDXPseudo(TRI.getRegSizeInBits(*SrcRC), true);
3383  BuildMI(*BB, MI, DL, GPRIDXDesc, DstReg)
3384      .addReg(SrcReg)
3385      .addReg(IdxReg)
3386      .addImm(SubReg);
3387
3388  MI.eraseFromParent();
3389  return true;
3390}
3391
3392// TODO: Fold insert_vector_elt (extract_vector_elt) into movrelsd
// Select a dynamically indexed vector insert, using M0-relative writes or
// VGPR index mode depending on the subtarget and the vector's register bank.
3393bool AMDGPUInstructionSelector::selectG_INSERT_VECTOR_ELT(
3394    MachineInstr &MI) const {
3395  Register DstReg = MI.getOperand(0).getReg();
3396  Register VecReg = MI.getOperand(1).getReg();
3397  Register ValReg = MI.getOperand(2).getReg();
3398  Register IdxReg = MI.getOperand(3).getReg();
3399
3400  LLT VecTy = MRI->getType(DstReg);
3401  LLT ValTy = MRI->getType(ValReg);
3402  unsigned VecSize = VecTy.getSizeInBits();
3403  unsigned ValSize = ValTy.getSizeInBits();
3404
3405  const RegisterBank *VecRB = RBI.getRegBank(VecReg, *MRI, TRI);
3406  const RegisterBank *ValRB = RBI.getRegBank(ValReg, *MRI, TRI);
3407  const RegisterBank *IdxRB = RBI.getRegBank(IdxReg, *MRI, TRI);
3408
3409  assert(VecTy.getElementType() == ValTy);
3410
3411  // The index must be scalar. If it wasn't RegBankSelect should have moved this
3412  // into a waterfall loop.
3413  if (IdxRB->getID() != AMDGPU::SGPRRegBankID)
3414    return false;
3415
3416  const TargetRegisterClass *VecRC =
3417      TRI.getRegClassForTypeOnBank(VecTy, *VecRB);
3418  const TargetRegisterClass *ValRC =
3419      TRI.getRegClassForTypeOnBank(ValTy, *ValRB);
3420
3421  if (!RBI.constrainGenericRegister(VecReg, *VecRC, *MRI) ||
3422      !RBI.constrainGenericRegister(DstReg, *VecRC, *MRI) ||
3423      !RBI.constrainGenericRegister(ValReg, *ValRC, *MRI) ||
3424      !RBI.constrainGenericRegister(IdxReg, AMDGPU::SReg_32RegClass, *MRI))
3425    return false;
3426
3427  if (VecRB->getID() == AMDGPU::VGPRRegBankID && ValSize != 32)
3428    return false;
3429
  // Fold a constant part of the index into the starting subregister.
3430  unsigned SubReg;
3431  std::tie(IdxReg, SubReg) =
3432      computeIndirectRegIndex(*MRI, TRI, VecRC, IdxReg, ValSize / 8, *VT);
3433
3434  const bool IndexMode = VecRB->getID() == AMDGPU::VGPRRegBankID &&
3435                         STI.useVGPRIndexMode();
3436
3437  MachineBasicBlock *BB = MI.getParent();
3438  const DebugLoc &DL = MI.getDebugLoc();
3439
3440  if (!IndexMode) {
    // M0-relative write pseudo (scalar or vector variant).
3441    BuildMI(*BB, &MI, DL, TII.get(AMDGPU::COPY), AMDGPU::M0)
3442      .addReg(IdxReg);
3443
3444    const MCInstrDesc &RegWriteOp = TII.getIndirectRegWriteMovRelPseudo(
3445        VecSize, ValSize, VecRB->getID() == AMDGPU::SGPRRegBankID);
3446    BuildMI(*BB, MI, DL, RegWriteOp, DstReg)
3447        .addReg(VecReg)
3448        .addReg(ValReg)
3449        .addImm(SubReg);
3450    MI.eraseFromParent();
3451    return true;
3452  }
3453
  // VGPR index mode: use the GPR_IDX write pseudo.
3454  const MCInstrDesc &GPRIDXDesc =
3455      TII.getIndirectGPRIDXPseudo(TRI.getRegSizeInBits(*VecRC), false);
3456  BuildMI(*BB, MI, DL, GPRIDXDesc, DstReg)
3457      .addReg(VecReg)
3458      .addReg(ValReg)
3459      .addReg(IdxReg)
3460      .addImm(SubReg);
3461
3462  MI.eraseFromParent();
3463  return true;
3464}
3465
3466static bool isAsyncLDSDMA(Intrinsic::ID Intr) {
3467 switch (Intr) {
3468 case Intrinsic::amdgcn_raw_buffer_load_async_lds:
3469 case Intrinsic::amdgcn_raw_ptr_buffer_load_async_lds:
3470 case Intrinsic::amdgcn_struct_buffer_load_async_lds:
3471 case Intrinsic::amdgcn_struct_ptr_buffer_load_async_lds:
3472 case Intrinsic::amdgcn_load_async_to_lds:
3473 case Intrinsic::amdgcn_global_load_async_lds:
3474 return true;
3475 }
3476 return false;
3477}
3478
// Select the raw/struct buffer-load-to-LDS intrinsics into the matching
// BUFFER_LOAD_*_LDS_* instruction, choosing the addressing variant from the
// presence of vindex/voffset operands.
3479bool AMDGPUInstructionSelector::selectBufferLoadLds(MachineInstr &MI) const {
3480  if (!Subtarget->hasVMemToLDSLoad())
3481    return false;
3482  unsigned Opc;
3483  unsigned Size = MI.getOperand(3).getImm();
3484  Intrinsic::ID IntrinsicID = cast<GIntrinsic>(MI).getIntrinsicID();
3485
3486  // The struct intrinsic variants add one additional operand over raw.
3487  const bool HasVIndex = MI.getNumOperands() == 9;
3488  Register VIndex;
3489  int OpOffset = 0;
3490  if (HasVIndex) {
3491    VIndex = MI.getOperand(4).getReg();
3492    OpOffset = 1;
3493  }
3494
  // A known-zero voffset lets us drop the OFFEN/BOTHEN addressing variant.
  // NOTE(review): the initializer of MaybeVOffset (doc line 3497, presumably a
  // getIConstantVRegValWithLookThrough call) was lost in extraction.
3495  Register VOffset = MI.getOperand(4 + OpOffset).getReg();
3496  std::optional<ValueAndVReg> MaybeVOffset =
3498  const bool HasVOffset = !MaybeVOffset || MaybeVOffset->Value.getZExtValue();
3499
3500  switch (Size) {
3501  default:
3502    return false;
3503  case 1:
3504    Opc = HasVIndex ? HasVOffset ? AMDGPU::BUFFER_LOAD_UBYTE_LDS_BOTHEN
3505                                 : AMDGPU::BUFFER_LOAD_UBYTE_LDS_IDXEN
3506                    : HasVOffset ? AMDGPU::BUFFER_LOAD_UBYTE_LDS_OFFEN
3507                                 : AMDGPU::BUFFER_LOAD_UBYTE_LDS_OFFSET;
3508    break;
3509  case 2:
3510    Opc = HasVIndex ? HasVOffset ? AMDGPU::BUFFER_LOAD_USHORT_LDS_BOTHEN
3511                                 : AMDGPU::BUFFER_LOAD_USHORT_LDS_IDXEN
3512                    : HasVOffset ? AMDGPU::BUFFER_LOAD_USHORT_LDS_OFFEN
3513                                 : AMDGPU::BUFFER_LOAD_USHORT_LDS_OFFSET;
3514    break;
3515  case 4:
3516    Opc = HasVIndex ? HasVOffset ? AMDGPU::BUFFER_LOAD_DWORD_LDS_BOTHEN
3517                                 : AMDGPU::BUFFER_LOAD_DWORD_LDS_IDXEN
3518                    : HasVOffset ? AMDGPU::BUFFER_LOAD_DWORD_LDS_OFFEN
3519                                 : AMDGPU::BUFFER_LOAD_DWORD_LDS_OFFSET;
3520    break;
3521  case 12:
3522    if (!Subtarget->hasLDSLoadB96_B128())
3523      return false;
3524
3525    Opc = HasVIndex ? HasVOffset ? AMDGPU::BUFFER_LOAD_DWORDX3_LDS_BOTHEN
3526                                 : AMDGPU::BUFFER_LOAD_DWORDX3_LDS_IDXEN
3527                    : HasVOffset ? AMDGPU::BUFFER_LOAD_DWORDX3_LDS_OFFEN
3528                                 : AMDGPU::BUFFER_LOAD_DWORDX3_LDS_OFFSET;
3529    break;
3530  case 16:
3531    if (!Subtarget->hasLDSLoadB96_B128())
3532      return false;
3533
3534    Opc = HasVIndex ? HasVOffset ? AMDGPU::BUFFER_LOAD_DWORDX4_LDS_BOTHEN
3535                                 : AMDGPU::BUFFER_LOAD_DWORDX4_LDS_IDXEN
3536                    : HasVOffset ? AMDGPU::BUFFER_LOAD_DWORDX4_LDS_OFFEN
3537                                 : AMDGPU::BUFFER_LOAD_DWORDX4_LDS_OFFSET;
3538    break;
3539  }
3540
  // The LDS base address goes through M0.
3541  MachineBasicBlock *MBB = MI.getParent();
3542  const DebugLoc &DL = MI.getDebugLoc();
3543  BuildMI(*MBB, &MI, DL, TII.get(AMDGPU::COPY), AMDGPU::M0)
3544    .add(MI.getOperand(2));
3545
3546  auto MIB = BuildMI(*MBB, &MI, DL, TII.get(Opc));
3547
  // BOTHEN takes vindex and voffset packed into a 64-bit register pair.
3548  if (HasVIndex && HasVOffset) {
3549    Register IdxReg = MRI->createVirtualRegister(TRI.getVGPR64Class());
3550    BuildMI(*MBB, &*MIB, DL, TII.get(AMDGPU::REG_SEQUENCE), IdxReg)
3551      .addReg(VIndex)
3552      .addImm(AMDGPU::sub0)
3553      .addReg(VOffset)
3554      .addImm(AMDGPU::sub1);
3555
3556    MIB.addReg(IdxReg);
3557  } else if (HasVIndex) {
3558    MIB.addReg(VIndex);
3559  } else if (HasVOffset) {
3560    MIB.addReg(VOffset);
3561  }
3562
3563  MIB.add(MI.getOperand(1));            // rsrc
3564  MIB.add(MI.getOperand(5 + OpOffset)); // soffset
3565  MIB.add(MI.getOperand(6 + OpOffset)); // imm offset
3566  bool IsGFX12Plus = AMDGPU::isGFX12Plus(STI);
3567  unsigned Aux = MI.getOperand(7 + OpOffset).getImm();
3568  MIB.addImm(Aux & (IsGFX12Plus ? AMDGPU::CPol::ALL
3569                                : AMDGPU::CPol::ALL_pregfx12)); // cpol
3570  MIB.addImm(
3571      Aux & (IsGFX12Plus ? AMDGPU::CPol::SWZ : AMDGPU::CPol::SWZ_pregfx12)
3572          ? 1
3573          : 0); // swz
3574  MIB.addImm(isAsyncLDSDMA(IntrinsicID));
3575
3576  MachineMemOperand *LoadMMO = *MI.memoperands_begin();
3577  // Don't set the offset value here because the pointer points to the base of
3578  // the buffer.
3579  MachinePointerInfo LoadPtrI = LoadMMO->getPointerInfo();
3580
3581  MachinePointerInfo StorePtrI = LoadPtrI;
  // NOTE(review): doc lines 3583-3585 (the remaining PointerType::get
  // arguments and the StorePtrI setup) were lost in extraction.
3582  LoadPtrI.V = PoisonValue::get(PointerType::get(MF->getFunction().getContext(),
3586
  // NOTE(review): the flag mask on the right-hand side (doc line 3588) was
  // lost in extraction.
3587  auto F = LoadMMO->getFlags() &
3589  LoadMMO = MF->getMachineMemOperand(LoadPtrI, F | MachineMemOperand::MOLoad,
3590                                     Size, LoadMMO->getBaseAlign());
3591
  // The instruction both loads from the buffer and stores into LDS, so attach
  // both memory operands.
3592  MachineMemOperand *StoreMMO =
3593      MF->getMachineMemOperand(StorePtrI, F | MachineMemOperand::MOStore,
3594                               sizeof(int32_t), LoadMMO->getBaseAlign());
3595
3596  MIB.setMemRefs({LoadMMO, StoreMMO});
3597
3598  MI.eraseFromParent();
3599  constrainSelectedInstRegOperands(*MIB, TII, TRI, RBI);
3600  return true;
3601}
3602
3603/// Match a zero extend from a 32-bit value to 64-bits.
3604Register AMDGPUInstructionSelector::matchZeroExtendFromS32(Register Reg) const {
3605 Register ZExtSrc;
3606 if (mi_match(Reg, *MRI, m_GZExt(m_Reg(ZExtSrc))))
3607 return MRI->getType(ZExtSrc) == LLT::scalar(32) ? ZExtSrc : Register();
3608
3609 // Match legalized form %zext = G_MERGE_VALUES (s32 %x), (s32 0)
3610 const MachineInstr *Def = getDefIgnoringCopies(Reg, *MRI);
3611 if (Def->getOpcode() != AMDGPU::G_MERGE_VALUES)
3612 return Register();
3613
3614 assert(Def->getNumOperands() == 3 &&
3615 MRI->getType(Def->getOperand(0).getReg()) == LLT::scalar(64));
3616 if (mi_match(Def->getOperand(2).getReg(), *MRI, m_ZeroInt())) {
3617 return Def->getOperand(1).getReg();
3618 }
3619
3620 return Register();
3621}
3622
3623/// Match a sign extend from a 32-bit value to 64-bits.
3624Register AMDGPUInstructionSelector::matchSignExtendFromS32(Register Reg) const {
3625 Register SExtSrc;
3626 if (mi_match(Reg, *MRI, m_GSExt(m_Reg(SExtSrc))))
3627 return MRI->getType(SExtSrc) == LLT::scalar(32) ? SExtSrc : Register();
3628
3629 // Match legalized form %sext = G_MERGE_VALUES (s32 %x), G_ASHR((S32 %x, 31))
3630 const MachineInstr *Def = getDefIgnoringCopies(Reg, *MRI);
3631 if (Def->getOpcode() != AMDGPU::G_MERGE_VALUES)
3632 return Register();
3633
3634 assert(Def->getNumOperands() == 3 &&
3635 MRI->getType(Def->getOperand(0).getReg()) == LLT::scalar(64));
3636 if (mi_match(Def->getOperand(2).getReg(), *MRI,
3637 m_GAShr(m_SpecificReg(Def->getOperand(1).getReg()),
3638 m_SpecificICst(31))))
3639 return Def->getOperand(1).getReg();
3640
3641 if (VT->signBitIsZero(Reg))
3642 return matchZeroExtendFromS32(Reg);
3643
3644 return Register();
3645}
3646
3647/// Match a zero extend from a 32-bit value to 64-bits, or \p Reg itself if it
3648/// is 32-bit.
3650AMDGPUInstructionSelector::matchZeroExtendFromS32OrS32(Register Reg) const {
3651 return MRI->getType(Reg) == LLT::scalar(32) ? Reg
3652 : matchZeroExtendFromS32(Reg);
3653}
3654
3655/// Match a sign extend from a 32-bit value to 64-bits, or \p Reg itself if it
3656/// is 32-bit.
3658AMDGPUInstructionSelector::matchSignExtendFromS32OrS32(Register Reg) const {
3659 return MRI->getType(Reg) == LLT::scalar(32) ? Reg
3660 : matchSignExtendFromS32(Reg);
3661}
3662
3664AMDGPUInstructionSelector::matchExtendFromS32OrS32(Register Reg,
3665 bool IsSigned) const {
3666 if (IsSigned)
3667 return matchSignExtendFromS32OrS32(Reg);
3668
3669 return matchZeroExtendFromS32OrS32(Reg);
3670}
3671
3672Register AMDGPUInstructionSelector::matchAnyExtendFromS32(Register Reg) const {
3673 Register AnyExtSrc;
3674 if (mi_match(Reg, *MRI, m_GAnyExt(m_Reg(AnyExtSrc))))
3675 return MRI->getType(AnyExtSrc) == LLT::scalar(32) ? AnyExtSrc : Register();
3676
3677 // Match legalized form %zext = G_MERGE_VALUES (s32 %x), (s32 G_IMPLICIT_DEF)
3678 const MachineInstr *Def = getDefIgnoringCopies(Reg, *MRI);
3679 if (Def->getOpcode() != AMDGPU::G_MERGE_VALUES)
3680 return Register();
3681
3682 assert(Def->getNumOperands() == 3 &&
3683 MRI->getType(Def->getOperand(0).getReg()) == LLT::scalar(64));
3684
3685 if (mi_match(Def->getOperand(2).getReg(), *MRI, m_GImplicitDef()))
3686 return Def->getOperand(1).getReg();
3687
3688 return Register();
3689}
3690
3691bool AMDGPUInstructionSelector::selectGlobalLoadLds(MachineInstr &MI) const{
3692 if (!Subtarget->hasVMemToLDSLoad())
3693 return false;
3694
3695 unsigned Opc;
3696 unsigned Size = MI.getOperand(3).getImm();
3697 Intrinsic::ID IntrinsicID = cast<GIntrinsic>(MI).getIntrinsicID();
3698
3699 switch (Size) {
3700 default:
3701 return false;
3702 case 1:
3703 Opc = AMDGPU::GLOBAL_LOAD_LDS_UBYTE;
3704 break;
3705 case 2:
3706 Opc = AMDGPU::GLOBAL_LOAD_LDS_USHORT;
3707 break;
3708 case 4:
3709 Opc = AMDGPU::GLOBAL_LOAD_LDS_DWORD;
3710 break;
3711 case 12:
3712 if (!Subtarget->hasLDSLoadB96_B128())
3713 return false;
3714 Opc = AMDGPU::GLOBAL_LOAD_LDS_DWORDX3;
3715 break;
3716 case 16:
3717 if (!Subtarget->hasLDSLoadB96_B128())
3718 return false;
3719 Opc = AMDGPU::GLOBAL_LOAD_LDS_DWORDX4;
3720 break;
3721 }
3722
3723 MachineBasicBlock *MBB = MI.getParent();
3724 const DebugLoc &DL = MI.getDebugLoc();
3725 BuildMI(*MBB, &MI, DL, TII.get(AMDGPU::COPY), AMDGPU::M0)
3726 .add(MI.getOperand(2));
3727
3728 Register Addr = MI.getOperand(1).getReg();
3729 Register VOffset;
3730 // Try to split SAddr and VOffset. Global and LDS pointers share the same
3731 // immediate offset, so we cannot use a regular SelectGlobalSAddr().
3732 if (!isSGPR(Addr)) {
3733 auto AddrDef = getDefSrcRegIgnoringCopies(Addr, *MRI);
3734 if (isSGPR(AddrDef->Reg)) {
3735 Addr = AddrDef->Reg;
3736 } else if (AddrDef->MI->getOpcode() == AMDGPU::G_PTR_ADD) {
3737 Register SAddr =
3738 getSrcRegIgnoringCopies(AddrDef->MI->getOperand(1).getReg(), *MRI);
3739 if (isSGPR(SAddr)) {
3740 Register PtrBaseOffset = AddrDef->MI->getOperand(2).getReg();
3741 if (Register Off = matchZeroExtendFromS32(PtrBaseOffset)) {
3742 Addr = SAddr;
3743 VOffset = Off;
3744 }
3745 }
3746 }
3747 }
3748
3749 if (isSGPR(Addr)) {
3751 if (!VOffset) {
3752 VOffset = MRI->createVirtualRegister(&AMDGPU::VGPR_32RegClass);
3753 BuildMI(*MBB, &MI, DL, TII.get(AMDGPU::V_MOV_B32_e32), VOffset)
3754 .addImm(0);
3755 }
3756 }
3757
3758 auto MIB = BuildMI(*MBB, &MI, DL, TII.get(Opc))
3759 .addReg(Addr);
3760
3761 if (isSGPR(Addr))
3762 MIB.addReg(VOffset);
3763
3764 MIB.add(MI.getOperand(4)); // offset
3765
3766 unsigned Aux = MI.getOperand(5).getImm();
3767 MIB.addImm(Aux & ~AMDGPU::CPol::VIRTUAL_BITS); // cpol
3768 MIB.addImm(isAsyncLDSDMA(IntrinsicID));
3769
3770 MachineMemOperand *LoadMMO = *MI.memoperands_begin();
3771 MachinePointerInfo LoadPtrI = LoadMMO->getPointerInfo();
3772 LoadPtrI.Offset = MI.getOperand(4).getImm();
3773 MachinePointerInfo StorePtrI = LoadPtrI;
3774 LoadPtrI.V = PoisonValue::get(PointerType::get(MF->getFunction().getContext(),
3778 auto F = LoadMMO->getFlags() &
3780 LoadMMO = MF->getMachineMemOperand(LoadPtrI, F | MachineMemOperand::MOLoad,
3781 Size, LoadMMO->getBaseAlign());
3782 MachineMemOperand *StoreMMO =
3783 MF->getMachineMemOperand(StorePtrI, F | MachineMemOperand::MOStore,
3784 sizeof(int32_t), Align(4));
3785
3786 MIB.setMemRefs({LoadMMO, StoreMMO});
3787
3788 MI.eraseFromParent();
3789 constrainSelectedInstRegOperands(*MIB, TII, TRI, RBI);
3790 return true;
3791}
3792
3793bool AMDGPUInstructionSelector::selectTensorLoadStore(MachineInstr &MI,
3794 Intrinsic::ID IID) const {
3795 bool IsLoad = IID == Intrinsic::amdgcn_tensor_load_to_lds;
3796 unsigned Opc =
3797 IsLoad ? AMDGPU::TENSOR_LOAD_TO_LDS_d4 : AMDGPU::TENSOR_STORE_FROM_LDS_d4;
3798 int NumGroups = 4;
3799
3800 // A lamda function to check whether an operand is a vector of all 0s.
3801 const auto isAllZeros = [&](MachineOperand &Opnd) {
3802 const MachineInstr *DefMI = MRI->getVRegDef(Opnd.getReg());
3803 if (!DefMI)
3804 return false;
3805 return llvm::isBuildVectorAllZeros(*DefMI, *MRI, true);
3806 };
3807
3808 // Use _D2 version if both group 2 and 3 are zero-initialized.
3809 if (isAllZeros(MI.getOperand(3)) && isAllZeros(MI.getOperand(4))) {
3810 NumGroups = 2;
3811 Opc = IsLoad ? AMDGPU::TENSOR_LOAD_TO_LDS_d2
3812 : AMDGPU::TENSOR_STORE_FROM_LDS_d2;
3813 }
3814
3815 // TODO: Handle the fifth group: MI.getOpetand(5), which is silently ignored
3816 // for now because all existing targets only support up to 4 groups.
3817 MachineBasicBlock *MBB = MI.getParent();
3818 auto MIB = BuildMI(*MBB, &MI, MI.getDebugLoc(), TII.get(Opc))
3819 .add(MI.getOperand(1)) // D# group 0
3820 .add(MI.getOperand(2)); // D# group 1
3821
3822 if (NumGroups >= 4) { // Has at least 4 groups
3823 MIB.add(MI.getOperand(3)) // D# group 2
3824 .add(MI.getOperand(4)); // D# group 3
3825 }
3826
3827 MIB.addImm(0) // r128
3828 .add(MI.getOperand(6)); // cpol
3829
3830 MI.eraseFromParent();
3831 return true;
3832}
3833
3834bool AMDGPUInstructionSelector::selectBVHIntersectRayIntrinsic(
3835 MachineInstr &MI) const {
3836 unsigned OpcodeOpIdx =
3837 MI.getOpcode() == AMDGPU::G_AMDGPU_BVH_INTERSECT_RAY ? 1 : 3;
3838 MI.setDesc(TII.get(MI.getOperand(OpcodeOpIdx).getImm()));
3839 MI.removeOperand(OpcodeOpIdx);
3840 MI.addImplicitDefUseOperands(*MI.getMF());
3841 constrainSelectedInstRegOperands(MI, TII, TRI, RBI);
3842 return true;
3843}
3844
3845// FIXME: This should be removed and let the patterns select. We just need the
3846// AGPR/VGPR combination versions.
3847bool AMDGPUInstructionSelector::selectSMFMACIntrin(MachineInstr &MI) const {
3848 unsigned Opc;
3849 switch (cast<GIntrinsic>(MI).getIntrinsicID()) {
3850 case Intrinsic::amdgcn_smfmac_f32_16x16x32_f16:
3851 Opc = AMDGPU::V_SMFMAC_F32_16X16X32_F16_e64;
3852 break;
3853 case Intrinsic::amdgcn_smfmac_f32_32x32x16_f16:
3854 Opc = AMDGPU::V_SMFMAC_F32_32X32X16_F16_e64;
3855 break;
3856 case Intrinsic::amdgcn_smfmac_f32_16x16x32_bf16:
3857 Opc = AMDGPU::V_SMFMAC_F32_16X16X32_BF16_e64;
3858 break;
3859 case Intrinsic::amdgcn_smfmac_f32_32x32x16_bf16:
3860 Opc = AMDGPU::V_SMFMAC_F32_32X32X16_BF16_e64;
3861 break;
3862 case Intrinsic::amdgcn_smfmac_i32_16x16x64_i8:
3863 Opc = AMDGPU::V_SMFMAC_I32_16X16X64_I8_e64;
3864 break;
3865 case Intrinsic::amdgcn_smfmac_i32_32x32x32_i8:
3866 Opc = AMDGPU::V_SMFMAC_I32_32X32X32_I8_e64;
3867 break;
3868 case Intrinsic::amdgcn_smfmac_f32_16x16x64_bf8_bf8:
3869 Opc = AMDGPU::V_SMFMAC_F32_16X16X64_BF8_BF8_e64;
3870 break;
3871 case Intrinsic::amdgcn_smfmac_f32_16x16x64_bf8_fp8:
3872 Opc = AMDGPU::V_SMFMAC_F32_16X16X64_BF8_FP8_e64;
3873 break;
3874 case Intrinsic::amdgcn_smfmac_f32_16x16x64_fp8_bf8:
3875 Opc = AMDGPU::V_SMFMAC_F32_16X16X64_FP8_BF8_e64;
3876 break;
3877 case Intrinsic::amdgcn_smfmac_f32_16x16x64_fp8_fp8:
3878 Opc = AMDGPU::V_SMFMAC_F32_16X16X64_FP8_FP8_e64;
3879 break;
3880 case Intrinsic::amdgcn_smfmac_f32_32x32x32_bf8_bf8:
3881 Opc = AMDGPU::V_SMFMAC_F32_32X32X32_BF8_BF8_e64;
3882 break;
3883 case Intrinsic::amdgcn_smfmac_f32_32x32x32_bf8_fp8:
3884 Opc = AMDGPU::V_SMFMAC_F32_32X32X32_BF8_FP8_e64;
3885 break;
3886 case Intrinsic::amdgcn_smfmac_f32_32x32x32_fp8_bf8:
3887 Opc = AMDGPU::V_SMFMAC_F32_32X32X32_FP8_BF8_e64;
3888 break;
3889 case Intrinsic::amdgcn_smfmac_f32_32x32x32_fp8_fp8:
3890 Opc = AMDGPU::V_SMFMAC_F32_32X32X32_FP8_FP8_e64;
3891 break;
3892 case Intrinsic::amdgcn_smfmac_f32_16x16x64_f16:
3893 Opc = AMDGPU::V_SMFMAC_F32_16X16X64_F16_e64;
3894 break;
3895 case Intrinsic::amdgcn_smfmac_f32_32x32x32_f16:
3896 Opc = AMDGPU::V_SMFMAC_F32_32X32X32_F16_e64;
3897 break;
3898 case Intrinsic::amdgcn_smfmac_f32_16x16x64_bf16:
3899 Opc = AMDGPU::V_SMFMAC_F32_16X16X64_BF16_e64;
3900 break;
3901 case Intrinsic::amdgcn_smfmac_f32_32x32x32_bf16:
3902 Opc = AMDGPU::V_SMFMAC_F32_32X32X32_BF16_e64;
3903 break;
3904 case Intrinsic::amdgcn_smfmac_i32_16x16x128_i8:
3905 Opc = AMDGPU::V_SMFMAC_I32_16X16X128_I8_e64;
3906 break;
3907 case Intrinsic::amdgcn_smfmac_i32_32x32x64_i8:
3908 Opc = AMDGPU::V_SMFMAC_I32_32X32X64_I8_e64;
3909 break;
3910 case Intrinsic::amdgcn_smfmac_f32_16x16x128_bf8_bf8:
3911 Opc = AMDGPU::V_SMFMAC_F32_16X16X128_BF8_BF8_e64;
3912 break;
3913 case Intrinsic::amdgcn_smfmac_f32_16x16x128_bf8_fp8:
3914 Opc = AMDGPU::V_SMFMAC_F32_16X16X128_BF8_FP8_e64;
3915 break;
3916 case Intrinsic::amdgcn_smfmac_f32_16x16x128_fp8_bf8:
3917 Opc = AMDGPU::V_SMFMAC_F32_16X16X128_FP8_BF8_e64;
3918 break;
3919 case Intrinsic::amdgcn_smfmac_f32_16x16x128_fp8_fp8:
3920 Opc = AMDGPU::V_SMFMAC_F32_16X16X128_FP8_FP8_e64;
3921 break;
3922 case Intrinsic::amdgcn_smfmac_f32_32x32x64_bf8_bf8:
3923 Opc = AMDGPU::V_SMFMAC_F32_32X32X64_BF8_BF8_e64;
3924 break;
3925 case Intrinsic::amdgcn_smfmac_f32_32x32x64_bf8_fp8:
3926 Opc = AMDGPU::V_SMFMAC_F32_32X32X64_BF8_FP8_e64;
3927 break;
3928 case Intrinsic::amdgcn_smfmac_f32_32x32x64_fp8_bf8:
3929 Opc = AMDGPU::V_SMFMAC_F32_32X32X64_FP8_BF8_e64;
3930 break;
3931 case Intrinsic::amdgcn_smfmac_f32_32x32x64_fp8_fp8:
3932 Opc = AMDGPU::V_SMFMAC_F32_32X32X64_FP8_FP8_e64;
3933 break;
3934 default:
3935 llvm_unreachable("unhandled smfmac intrinsic");
3936 }
3937
3938 auto VDst_In = MI.getOperand(4);
3939
3940 MI.setDesc(TII.get(Opc));
3941 MI.removeOperand(4); // VDst_In
3942 MI.removeOperand(1); // Intrinsic ID
3943 MI.addOperand(VDst_In); // Readd VDst_In to the end
3944 MI.addImplicitDefUseOperands(*MI.getMF());
3945 const MCInstrDesc &MCID = MI.getDesc();
3946 if (MCID.getOperandConstraint(0, MCOI::EARLY_CLOBBER) != -1) {
3947 MI.getOperand(0).setIsEarlyClobber(true);
3948 }
3949 return true;
3950}
3951
3952bool AMDGPUInstructionSelector::selectPermlaneSwapIntrin(
3953 MachineInstr &MI, Intrinsic::ID IntrID) const {
3954 if (IntrID == Intrinsic::amdgcn_permlane16_swap &&
3955 !Subtarget->hasPermlane16Swap())
3956 return false;
3957 if (IntrID == Intrinsic::amdgcn_permlane32_swap &&
3958 !Subtarget->hasPermlane32Swap())
3959 return false;
3960
3961 unsigned Opcode = IntrID == Intrinsic::amdgcn_permlane16_swap
3962 ? AMDGPU::V_PERMLANE16_SWAP_B32_e64
3963 : AMDGPU::V_PERMLANE32_SWAP_B32_e64;
3964
3965 MI.removeOperand(2);
3966 MI.setDesc(TII.get(Opcode));
3967 MI.addOperand(*MF, MachineOperand::CreateReg(AMDGPU::EXEC, false, true));
3968
3969 MachineOperand &FI = MI.getOperand(4);
3971
3972 constrainSelectedInstRegOperands(MI, TII, TRI, RBI);
3973 return true;
3974}
3975
3976bool AMDGPUInstructionSelector::selectWaveAddress(MachineInstr &MI) const {
3977 Register DstReg = MI.getOperand(0).getReg();
3978 Register SrcReg = MI.getOperand(1).getReg();
3979 const RegisterBank *DstRB = RBI.getRegBank(DstReg, *MRI, TRI);
3980 const bool IsVALU = DstRB->getID() == AMDGPU::VGPRRegBankID;
3981 MachineBasicBlock *MBB = MI.getParent();
3982 const DebugLoc &DL = MI.getDebugLoc();
3983
3984 if (IsVALU) {
3985 BuildMI(*MBB, MI, DL, TII.get(AMDGPU::V_LSHRREV_B32_e64), DstReg)
3986 .addImm(Subtarget->getWavefrontSizeLog2())
3987 .addReg(SrcReg);
3988 } else {
3989 BuildMI(*MBB, MI, DL, TII.get(AMDGPU::S_LSHR_B32), DstReg)
3990 .addReg(SrcReg)
3991 .addImm(Subtarget->getWavefrontSizeLog2())
3992 .setOperandDead(3); // Dead scc
3993 }
3994
3995 const TargetRegisterClass &RC =
3996 IsVALU ? AMDGPU::VGPR_32RegClass : AMDGPU::SReg_32RegClass;
3997 if (!RBI.constrainGenericRegister(DstReg, RC, *MRI))
3998 return false;
3999
4000 MI.eraseFromParent();
4001 return true;
4002}
4003
4004bool AMDGPUInstructionSelector::selectWaveShuffleIntrin(
4005 MachineInstr &MI) const {
4006 assert(MI.getNumOperands() == 4);
4007 MachineBasicBlock *MBB = MI.getParent();
4008 const DebugLoc &DL = MI.getDebugLoc();
4009
4010 Register DstReg = MI.getOperand(0).getReg();
4011 Register ValReg = MI.getOperand(2).getReg();
4012 Register IdxReg = MI.getOperand(3).getReg();
4013
4014 const LLT DstTy = MRI->getType(DstReg);
4015 unsigned DstSize = DstTy.getSizeInBits();
4016 const RegisterBank *DstRB = RBI.getRegBank(DstReg, *MRI, TRI);
4017 const TargetRegisterClass *DstRC =
4018 TRI.getRegClassForSizeOnBank(DstSize, *DstRB);
4019
4020 if (DstTy != LLT::scalar(32))
4021 return false;
4022
4023 if (!Subtarget->supportsBPermute())
4024 return false;
4025
4026 // If we can bpermute across the whole wave, then just do that
4027 if (Subtarget->supportsWaveWideBPermute()) {
4028 Register ShiftIdxReg = MRI->createVirtualRegister(DstRC);
4029 BuildMI(*MBB, MI, DL, TII.get(AMDGPU::V_LSHLREV_B32_e64), ShiftIdxReg)
4030 .addImm(2)
4031 .addReg(IdxReg);
4032
4033 BuildMI(*MBB, MI, DL, TII.get(AMDGPU::DS_BPERMUTE_B32), DstReg)
4034 .addReg(ShiftIdxReg)
4035 .addReg(ValReg)
4036 .addImm(0);
4037 } else {
4038 // Otherwise, we need to make use of whole wave mode
4039 assert(Subtarget->isWave64());
4040
4041 // Set inactive lanes to poison
4042 Register UndefValReg =
4043 MRI->createVirtualRegister(TRI.getRegClass(AMDGPU::SReg_32RegClassID));
4044 BuildMI(*MBB, MI, DL, TII.get(AMDGPU::IMPLICIT_DEF), UndefValReg);
4045
4046 Register UndefExecReg = MRI->createVirtualRegister(
4047 TRI.getRegClass(AMDGPU::SReg_64_XEXECRegClassID));
4048 BuildMI(*MBB, MI, DL, TII.get(AMDGPU::IMPLICIT_DEF), UndefExecReg);
4049
4050 Register PoisonValReg = MRI->createVirtualRegister(DstRC);
4051 BuildMI(*MBB, MI, DL, TII.get(AMDGPU::V_SET_INACTIVE_B32), PoisonValReg)
4052 .addImm(0)
4053 .addReg(ValReg)
4054 .addImm(0)
4055 .addReg(UndefValReg)
4056 .addReg(UndefExecReg);
4057
4058 // ds_bpermute requires index to be multiplied by 4
4059 Register ShiftIdxReg = MRI->createVirtualRegister(DstRC);
4060 BuildMI(*MBB, MI, DL, TII.get(AMDGPU::V_LSHLREV_B32_e64), ShiftIdxReg)
4061 .addImm(2)
4062 .addReg(IdxReg);
4063
4064 Register PoisonIdxReg = MRI->createVirtualRegister(DstRC);
4065 BuildMI(*MBB, MI, DL, TII.get(AMDGPU::V_SET_INACTIVE_B32), PoisonIdxReg)
4066 .addImm(0)
4067 .addReg(ShiftIdxReg)
4068 .addImm(0)
4069 .addReg(UndefValReg)
4070 .addReg(UndefExecReg);
4071
4072 // Get permutation of each half, then we'll select which one to use
4073 Register SameSidePermReg = MRI->createVirtualRegister(DstRC);
4074 BuildMI(*MBB, MI, DL, TII.get(AMDGPU::DS_BPERMUTE_B32), SameSidePermReg)
4075 .addReg(PoisonIdxReg)
4076 .addReg(PoisonValReg)
4077 .addImm(0);
4078
4079 Register SwappedValReg = MRI->createVirtualRegister(DstRC);
4080 BuildMI(*MBB, MI, DL, TII.get(AMDGPU::V_PERMLANE64_B32), SwappedValReg)
4081 .addReg(PoisonValReg);
4082
4083 Register OppSidePermReg = MRI->createVirtualRegister(DstRC);
4084 BuildMI(*MBB, MI, DL, TII.get(AMDGPU::DS_BPERMUTE_B32), OppSidePermReg)
4085 .addReg(PoisonIdxReg)
4086 .addReg(SwappedValReg)
4087 .addImm(0);
4088
4089 Register WWMSwapPermReg = MRI->createVirtualRegister(DstRC);
4090 BuildMI(*MBB, MI, DL, TII.get(AMDGPU::STRICT_WWM), WWMSwapPermReg)
4091 .addReg(OppSidePermReg);
4092
4093 // Select which side to take the permute from
4094 // We can get away with only using mbcnt_lo here since we're only
4095 // trying to detect which side of 32 each lane is on, and mbcnt_lo
4096 // returns 32 for lanes 32-63.
4097 Register ThreadIDReg = MRI->createVirtualRegister(DstRC);
4098 BuildMI(*MBB, MI, DL, TII.get(AMDGPU::V_MBCNT_LO_U32_B32_e64), ThreadIDReg)
4099 .addImm(-1)
4100 .addImm(0);
4101
4102 Register XORReg = MRI->createVirtualRegister(DstRC);
4103 BuildMI(*MBB, MI, DL, TII.get(AMDGPU::V_XOR_B32_e64), XORReg)
4104 .addReg(ThreadIDReg)
4105 .addReg(PoisonIdxReg);
4106
4107 Register ANDReg = MRI->createVirtualRegister(DstRC);
4108 BuildMI(*MBB, MI, DL, TII.get(AMDGPU::V_AND_B32_e64), ANDReg)
4109 .addReg(XORReg)
4110 .addImm(32);
4111
4112 Register CompareReg = MRI->createVirtualRegister(
4113 TRI.getRegClass(AMDGPU::SReg_64_XEXECRegClassID));
4114 BuildMI(*MBB, MI, DL, TII.get(AMDGPU::V_CMP_EQ_U32_e64), CompareReg)
4115 .addReg(ANDReg)
4116 .addImm(0);
4117
4118 // Finally do the selection
4119 BuildMI(*MBB, MI, DL, TII.get(AMDGPU::V_CNDMASK_B32_e64), DstReg)
4120 .addImm(0)
4121 .addReg(WWMSwapPermReg)
4122 .addImm(0)
4123 .addReg(SameSidePermReg)
4124 .addReg(CompareReg);
4125 }
4126
4127 MI.eraseFromParent();
4128 return true;
4129}
4130
4131// Match BITOP3 operation and return a number of matched instructions plus
4132// truth table.
4133static std::pair<unsigned, uint8_t> BitOp3_Op(Register R,
4135 const MachineRegisterInfo &MRI) {
4136 unsigned NumOpcodes = 0;
4137 uint8_t LHSBits, RHSBits;
4138
4139 auto getOperandBits = [&Src, R, &MRI](Register Op, uint8_t &Bits) -> bool {
4140 // Define truth table given Src0, Src1, Src2 bits permutations:
4141 // 0 0 0
4142 // 0 0 1
4143 // 0 1 0
4144 // 0 1 1
4145 // 1 0 0
4146 // 1 0 1
4147 // 1 1 0
4148 // 1 1 1
4149 const uint8_t SrcBits[3] = { 0xf0, 0xcc, 0xaa };
4150
4151 if (mi_match(Op, MRI, m_AllOnesInt())) {
4152 Bits = 0xff;
4153 return true;
4154 }
4155 if (mi_match(Op, MRI, m_ZeroInt())) {
4156 Bits = 0;
4157 return true;
4158 }
4159
4160 for (unsigned I = 0; I < Src.size(); ++I) {
4161 // Try to find existing reused operand
4162 if (Src[I] == Op) {
4163 Bits = SrcBits[I];
4164 return true;
4165 }
4166 // Try to replace parent operator
4167 if (Src[I] == R) {
4168 Bits = SrcBits[I];
4169 Src[I] = Op;
4170 return true;
4171 }
4172 }
4173
4174 if (Src.size() == 3) {
4175 // No room left for operands. Try one last time, there can be a 'not' of
4176 // one of our source operands. In this case we can compute the bits
4177 // without growing Src vector.
4178 Register LHS;
4179 if (mi_match(Op, MRI, m_Not(m_Reg(LHS)))) {
4181 for (unsigned I = 0; I < Src.size(); ++I) {
4182 if (Src[I] == LHS) {
4183 Bits = ~SrcBits[I];
4184 return true;
4185 }
4186 }
4187 }
4188
4189 return false;
4190 }
4191
4192 Bits = SrcBits[Src.size()];
4193 Src.push_back(Op);
4194 return true;
4195 };
4196
4197 MachineInstr *MI = MRI.getVRegDef(R);
4198 switch (MI->getOpcode()) {
4199 case TargetOpcode::G_AND:
4200 case TargetOpcode::G_OR:
4201 case TargetOpcode::G_XOR: {
4202 Register LHS = getSrcRegIgnoringCopies(MI->getOperand(1).getReg(), MRI);
4203 Register RHS = getSrcRegIgnoringCopies(MI->getOperand(2).getReg(), MRI);
4204
4205 SmallVector<Register, 3> Backup(Src.begin(), Src.end());
4206 if (!getOperandBits(LHS, LHSBits) ||
4207 !getOperandBits(RHS, RHSBits)) {
4208 Src = std::move(Backup);
4209 return std::make_pair(0, 0);
4210 }
4211
4212 // Recursion is naturally limited by the size of the operand vector.
4213 auto Op = BitOp3_Op(LHS, Src, MRI);
4214 if (Op.first) {
4215 NumOpcodes += Op.first;
4216 LHSBits = Op.second;
4217 }
4218
4219 Op = BitOp3_Op(RHS, Src, MRI);
4220 if (Op.first) {
4221 NumOpcodes += Op.first;
4222 RHSBits = Op.second;
4223 }
4224 break;
4225 }
4226 default:
4227 return std::make_pair(0, 0);
4228 }
4229
4230 uint8_t TTbl;
4231 switch (MI->getOpcode()) {
4232 case TargetOpcode::G_AND:
4233 TTbl = LHSBits & RHSBits;
4234 break;
4235 case TargetOpcode::G_OR:
4236 TTbl = LHSBits | RHSBits;
4237 break;
4238 case TargetOpcode::G_XOR:
4239 TTbl = LHSBits ^ RHSBits;
4240 break;
4241 default:
4242 break;
4243 }
4244
4245 return std::make_pair(NumOpcodes + 1, TTbl);
4246}
4247
4248bool AMDGPUInstructionSelector::selectBITOP3(MachineInstr &MI) const {
4249 if (!Subtarget->hasBitOp3Insts())
4250 return false;
4251
4252 Register DstReg = MI.getOperand(0).getReg();
4253 const RegisterBank *DstRB = RBI.getRegBank(DstReg, *MRI, TRI);
4254 const bool IsVALU = DstRB->getID() == AMDGPU::VGPRRegBankID;
4255 if (!IsVALU)
4256 return false;
4257
4259 uint8_t TTbl;
4260 unsigned NumOpcodes;
4261
4262 std::tie(NumOpcodes, TTbl) = BitOp3_Op(DstReg, Src, *MRI);
4263
4264 // Src.empty() case can happen if all operands are all zero or all ones.
4265 // Normally it shall be optimized out before reaching this.
4266 if (NumOpcodes < 2 || Src.empty())
4267 return false;
4268
4269 const bool IsB32 = MRI->getType(DstReg) == LLT::scalar(32);
4270 if (NumOpcodes == 2 && IsB32) {
4271 // Avoid using BITOP3 for OR3, XOR3, AND_OR. This is not faster but makes
4272 // asm more readable. This cannot be modeled with AddedComplexity because
4273 // selector does not know how many operations did we match.
4274 if (mi_match(MI, *MRI, m_GXor(m_GXor(m_Reg(), m_Reg()), m_Reg())) ||
4275 mi_match(MI, *MRI, m_GOr(m_GOr(m_Reg(), m_Reg()), m_Reg())) ||
4276 mi_match(MI, *MRI, m_GOr(m_GAnd(m_Reg(), m_Reg()), m_Reg())))
4277 return false;
4278 } else if (NumOpcodes < 4) {
4279 // For a uniform case threshold should be higher to account for moves
4280 // between VGPRs and SGPRs. It needs one operand in a VGPR, rest two can be
4281 // in SGPRs and a readtfirstlane after.
4282 return false;
4283 }
4284
4285 unsigned Opc = IsB32 ? AMDGPU::V_BITOP3_B32_e64 : AMDGPU::V_BITOP3_B16_e64;
4286 if (!IsB32 && STI.hasTrue16BitInsts())
4287 Opc = STI.useRealTrue16Insts() ? AMDGPU::V_BITOP3_B16_gfx1250_t16_e64
4288 : AMDGPU::V_BITOP3_B16_gfx1250_fake16_e64;
4289 unsigned CBL = STI.getConstantBusLimit(Opc);
4290 MachineBasicBlock *MBB = MI.getParent();
4291 const DebugLoc &DL = MI.getDebugLoc();
4292
4293 for (unsigned I = 0; I < Src.size(); ++I) {
4294 const RegisterBank *RB = RBI.getRegBank(Src[I], *MRI, TRI);
4295 if (RB->getID() != AMDGPU::SGPRRegBankID)
4296 continue;
4297 if (CBL > 0) {
4298 --CBL;
4299 continue;
4300 }
4301 Register NewReg = MRI->createVirtualRegister(&AMDGPU::VGPR_32RegClass);
4302 BuildMI(*MBB, MI, DL, TII.get(AMDGPU::COPY), NewReg)
4303 .addReg(Src[I]);
4304 Src[I] = NewReg;
4305 }
4306
4307 // Last operand can be ignored, turning a ternary operation into a binary.
4308 // For example: (~a & b & c) | (~a & b & ~c) -> (~a & b). We can replace
4309 // 'c' with 'a' here without changing the answer. In some pathological
4310 // cases it should be possible to get an operation with a single operand
4311 // too if optimizer would not catch it.
4312 while (Src.size() < 3)
4313 Src.push_back(Src[0]);
4314
4315 auto MIB = BuildMI(*MBB, MI, DL, TII.get(Opc), DstReg);
4316 if (!IsB32)
4317 MIB.addImm(0); // src_mod0
4318 MIB.addReg(Src[0]);
4319 if (!IsB32)
4320 MIB.addImm(0); // src_mod1
4321 MIB.addReg(Src[1]);
4322 if (!IsB32)
4323 MIB.addImm(0); // src_mod2
4324 MIB.addReg(Src[2])
4325 .addImm(TTbl);
4326 if (!IsB32)
4327 MIB.addImm(0); // op_sel
4328
4329 constrainSelectedInstRegOperands(*MIB, TII, TRI, RBI);
4330 MI.eraseFromParent();
4331
4332 return true;
4333}
4334
4335bool AMDGPUInstructionSelector::selectStackRestore(MachineInstr &MI) const {
4336 Register SrcReg = MI.getOperand(0).getReg();
4337 if (!RBI.constrainGenericRegister(SrcReg, AMDGPU::SReg_32RegClass, *MRI))
4338 return false;
4339
4340 MachineInstr *DefMI = MRI->getVRegDef(SrcReg);
4341 Register SP =
4342 Subtarget->getTargetLowering()->getStackPointerRegisterToSaveRestore();
4343 Register WaveAddr = getWaveAddress(DefMI);
4344 MachineBasicBlock *MBB = MI.getParent();
4345 const DebugLoc &DL = MI.getDebugLoc();
4346
4347 if (!WaveAddr) {
4348 WaveAddr = MRI->createVirtualRegister(&AMDGPU::SReg_32RegClass);
4349 BuildMI(*MBB, MI, DL, TII.get(AMDGPU::S_LSHR_B32), WaveAddr)
4350 .addReg(SrcReg)
4351 .addImm(Subtarget->getWavefrontSizeLog2())
4352 .setOperandDead(3); // Dead scc
4353 }
4354
4355 BuildMI(*MBB, &MI, DL, TII.get(AMDGPU::COPY), SP)
4356 .addReg(WaveAddr);
4357
4358 MI.eraseFromParent();
4359 return true;
4360}
4361
4363
4364 if (!I.isPreISelOpcode()) {
4365 if (I.isCopy())
4366 return selectCOPY(I);
4367 return true;
4368 }
4369
4370 switch (I.getOpcode()) {
4371 case TargetOpcode::G_AND:
4372 case TargetOpcode::G_OR:
4373 case TargetOpcode::G_XOR:
4374 if (selectBITOP3(I))
4375 return true;
4376 if (selectImpl(I, *CoverageInfo))
4377 return true;
4378 return selectG_AND_OR_XOR(I);
4379 case TargetOpcode::G_ADD:
4380 case TargetOpcode::G_SUB:
4381 case TargetOpcode::G_PTR_ADD:
4382 if (selectImpl(I, *CoverageInfo))
4383 return true;
4384 return selectG_ADD_SUB(I);
4385 case TargetOpcode::G_UADDO:
4386 case TargetOpcode::G_USUBO:
4387 case TargetOpcode::G_UADDE:
4388 case TargetOpcode::G_USUBE:
4389 return selectG_UADDO_USUBO_UADDE_USUBE(I);
4390 case AMDGPU::G_AMDGPU_MAD_U64_U32:
4391 case AMDGPU::G_AMDGPU_MAD_I64_I32:
4392 return selectG_AMDGPU_MAD_64_32(I);
4393 case TargetOpcode::G_INTTOPTR:
4394 case TargetOpcode::G_BITCAST:
4395 case TargetOpcode::G_PTRTOINT:
4396 case TargetOpcode::G_FREEZE:
4397 return selectCOPY(I);
4398 case TargetOpcode::G_FNEG:
4399 if (selectImpl(I, *CoverageInfo))
4400 return true;
4401 return selectG_FNEG(I);
4402 case TargetOpcode::G_FABS:
4403 if (selectImpl(I, *CoverageInfo))
4404 return true;
4405 return selectG_FABS(I);
4406 case TargetOpcode::G_EXTRACT:
4407 return selectG_EXTRACT(I);
4408 case TargetOpcode::G_MERGE_VALUES:
4409 case TargetOpcode::G_CONCAT_VECTORS:
4410 return selectG_MERGE_VALUES(I);
4411 case TargetOpcode::G_UNMERGE_VALUES:
4412 return selectG_UNMERGE_VALUES(I);
4413 case TargetOpcode::G_BUILD_VECTOR:
4414 case TargetOpcode::G_BUILD_VECTOR_TRUNC:
4415 return selectG_BUILD_VECTOR(I);
4416 case TargetOpcode::G_IMPLICIT_DEF:
4417 return selectG_IMPLICIT_DEF(I);
4418 case TargetOpcode::G_INSERT:
4419 return selectG_INSERT(I);
4420 case TargetOpcode::G_INTRINSIC:
4421 case TargetOpcode::G_INTRINSIC_CONVERGENT:
4422 return selectG_INTRINSIC(I);
4423 case TargetOpcode::G_INTRINSIC_W_SIDE_EFFECTS:
4424 case TargetOpcode::G_INTRINSIC_CONVERGENT_W_SIDE_EFFECTS:
4425 return selectG_INTRINSIC_W_SIDE_EFFECTS(I);
4426 case TargetOpcode::G_ICMP:
4427 case TargetOpcode::G_FCMP:
4428 if (selectG_ICMP_or_FCMP(I))
4429 return true;
4430 return selectImpl(I, *CoverageInfo);
4431 case TargetOpcode::G_LOAD:
4432 case TargetOpcode::G_ZEXTLOAD:
4433 case TargetOpcode::G_SEXTLOAD:
4434 case TargetOpcode::G_STORE:
4435 case TargetOpcode::G_ATOMIC_CMPXCHG:
4436 case TargetOpcode::G_ATOMICRMW_XCHG:
4437 case TargetOpcode::G_ATOMICRMW_ADD:
4438 case TargetOpcode::G_ATOMICRMW_SUB:
4439 case TargetOpcode::G_ATOMICRMW_AND:
4440 case TargetOpcode::G_ATOMICRMW_OR:
4441 case TargetOpcode::G_ATOMICRMW_XOR:
4442 case TargetOpcode::G_ATOMICRMW_MIN:
4443 case TargetOpcode::G_ATOMICRMW_MAX:
4444 case TargetOpcode::G_ATOMICRMW_UMIN:
4445 case TargetOpcode::G_ATOMICRMW_UMAX:
4446 case TargetOpcode::G_ATOMICRMW_UINC_WRAP:
4447 case TargetOpcode::G_ATOMICRMW_UDEC_WRAP:
4448 case TargetOpcode::G_ATOMICRMW_USUB_COND:
4449 case TargetOpcode::G_ATOMICRMW_USUB_SAT:
4450 case TargetOpcode::G_ATOMICRMW_FADD:
4451 case TargetOpcode::G_ATOMICRMW_FMIN:
4452 case TargetOpcode::G_ATOMICRMW_FMAX:
4453 return selectG_LOAD_STORE_ATOMICRMW(I);
4454 case TargetOpcode::G_SELECT:
4455 return selectG_SELECT(I);
4456 case TargetOpcode::G_TRUNC:
4457 return selectG_TRUNC(I);
4458 case TargetOpcode::G_SEXT:
4459 case TargetOpcode::G_ZEXT:
4460 case TargetOpcode::G_ANYEXT:
4461 case TargetOpcode::G_SEXT_INREG:
4462 // This is a workaround. For extension from type i1, `selectImpl()` uses
4463 // patterns from TD file and generates an illegal VGPR to SGPR COPY as type
4464 // i1 can only be hold in a SGPR class.
4465 if (MRI->getType(I.getOperand(1).getReg()) != LLT::scalar(1) &&
4466 selectImpl(I, *CoverageInfo))
4467 return true;
4468 return selectG_SZA_EXT(I);
4469 case TargetOpcode::G_FPEXT:
4470 if (selectG_FPEXT(I))
4471 return true;
4472 return selectImpl(I, *CoverageInfo);
4473 case TargetOpcode::G_BRCOND:
4474 return selectG_BRCOND(I);
4475 case TargetOpcode::G_GLOBAL_VALUE:
4476 return selectG_GLOBAL_VALUE(I);
4477 case TargetOpcode::G_PTRMASK:
4478 return selectG_PTRMASK(I);
4479 case TargetOpcode::G_EXTRACT_VECTOR_ELT:
4480 return selectG_EXTRACT_VECTOR_ELT(I);
4481 case TargetOpcode::G_INSERT_VECTOR_ELT:
4482 return selectG_INSERT_VECTOR_ELT(I);
4483 case AMDGPU::G_AMDGPU_INTRIN_IMAGE_LOAD:
4484 case AMDGPU::G_AMDGPU_INTRIN_IMAGE_LOAD_D16:
4485 case AMDGPU::G_AMDGPU_INTRIN_IMAGE_LOAD_NORET:
4486 case AMDGPU::G_AMDGPU_INTRIN_IMAGE_STORE:
4487 case AMDGPU::G_AMDGPU_INTRIN_IMAGE_STORE_D16: {
4488 const AMDGPU::ImageDimIntrinsicInfo *Intr =
4490 assert(Intr && "not an image intrinsic with image pseudo");
4491 return selectImageIntrinsic(I, Intr);
4492 }
4493 case AMDGPU::G_AMDGPU_BVH_DUAL_INTERSECT_RAY:
4494 case AMDGPU::G_AMDGPU_BVH_INTERSECT_RAY:
4495 case AMDGPU::G_AMDGPU_BVH8_INTERSECT_RAY:
4496 return selectBVHIntersectRayIntrinsic(I);
4497 case AMDGPU::G_SBFX:
4498 case AMDGPU::G_UBFX:
4499 return selectG_SBFX_UBFX(I);
4500 case AMDGPU::G_SI_CALL:
4501 I.setDesc(TII.get(AMDGPU::SI_CALL));
4502 return true;
4503 case AMDGPU::G_AMDGPU_WAVE_ADDRESS:
4504 return selectWaveAddress(I);
4505 case AMDGPU::G_AMDGPU_WHOLE_WAVE_FUNC_RETURN: {
4506 I.setDesc(TII.get(AMDGPU::SI_WHOLE_WAVE_FUNC_RETURN));
4507 return true;
4508 }
4509 case AMDGPU::G_STACKRESTORE:
4510 return selectStackRestore(I);
4511 case AMDGPU::G_PHI:
4512 return selectPHI(I);
4513 case AMDGPU::G_AMDGPU_COPY_SCC_VCC:
4514 return selectCOPY_SCC_VCC(I);
4515 case AMDGPU::G_AMDGPU_COPY_VCC_SCC:
4516 return selectCOPY_VCC_SCC(I);
4517 case AMDGPU::G_AMDGPU_READANYLANE:
4518 return selectReadAnyLane(I);
4519 case TargetOpcode::G_CONSTANT:
4520 case TargetOpcode::G_FCONSTANT:
4521 default:
4522 return selectImpl(I, *CoverageInfo);
4523 }
4524 return false;
4525}
4526
4528AMDGPUInstructionSelector::selectVCSRC(MachineOperand &Root) const {
4529 return {{
4530 [=](MachineInstrBuilder &MIB) { MIB.add(Root); }
4531 }};
4532
4533}
4534
4535std::pair<Register, unsigned> AMDGPUInstructionSelector::selectVOP3ModsImpl(
4536 Register Src, bool IsCanonicalizing, bool AllowAbs, bool OpSel) const {
4537 unsigned Mods = 0;
4538 MachineInstr *MI = getDefIgnoringCopies(Src, *MRI);
4539
4540 if (MI->getOpcode() == AMDGPU::G_FNEG) {
4541 Src = MI->getOperand(1).getReg();
4542 Mods |= SISrcMods::NEG;
4543 MI = getDefIgnoringCopies(Src, *MRI);
4544 } else if (MI->getOpcode() == AMDGPU::G_FSUB && IsCanonicalizing) {
4545 // Fold fsub [+-]0 into fneg. This may not have folded depending on the
4546 // denormal mode, but we're implicitly canonicalizing in a source operand.
4547 const ConstantFP *LHS =
4548 getConstantFPVRegVal(MI->getOperand(1).getReg(), *MRI);
4549 if (LHS && LHS->isZero()) {
4550 Mods |= SISrcMods::NEG;
4551 Src = MI->getOperand(2).getReg();
4552 }
4553 }
4554
4555 if (AllowAbs && MI->getOpcode() == AMDGPU::G_FABS) {
4556 Src = MI->getOperand(1).getReg();
4557 Mods |= SISrcMods::ABS;
4558 }
4559
4560 if (OpSel)
4561 Mods |= SISrcMods::OP_SEL_0;
4562
4563 return std::pair(Src, Mods);
4564}
4565
4566std::pair<Register, unsigned>
4567AMDGPUInstructionSelector::selectVOP3PModsF32Impl(Register Src) const {
4568 unsigned Mods;
4569 std::tie(Src, Mods) = selectVOP3ModsImpl(Src);
4570 Mods |= SISrcMods::OP_SEL_1;
4571 return std::pair(Src, Mods);
4572}
4573
4574Register AMDGPUInstructionSelector::copyToVGPRIfSrcFolded(
4575 Register Src, unsigned Mods, MachineOperand Root, MachineInstr *InsertPt,
4576 bool ForceVGPR) const {
4577 if ((Mods != 0 || ForceVGPR) &&
4578 RBI.getRegBank(Src, *MRI, TRI)->getID() != AMDGPU::VGPRRegBankID) {
4579
4580 // If we looked through copies to find source modifiers on an SGPR operand,
4581 // we now have an SGPR register source. To avoid potentially violating the
4582 // constant bus restriction, we need to insert a copy to a VGPR.
4583 Register VGPRSrc = MRI->cloneVirtualRegister(Root.getReg());
4584 BuildMI(*InsertPt->getParent(), InsertPt, InsertPt->getDebugLoc(),
4585 TII.get(AMDGPU::COPY), VGPRSrc)
4586 .addReg(Src);
4587 Src = VGPRSrc;
4588 }
4589
4590 return Src;
4591}
4592
4593///
4594/// This will select either an SGPR or VGPR operand and will save us from
4595/// having to write an extra tablegen pattern.
AMDGPUInstructionSelector::selectVSRC0(MachineOperand &Root) const {
  // Render the root operand unchanged (accepts SGPR or VGPR; saves an extra
  // tablegen pattern).
  return {{
      [=](MachineInstrBuilder &MIB) { MIB.add(Root); }
  }};
}
4602
AMDGPUInstructionSelector::selectVOP3Mods0(MachineOperand &Root) const {
  // Match neg/abs source modifiers, then render src0 (copied to a VGPR if a
  // modifier was folded on a non-VGPR source), src0_mods, and zero clamp/omod.
  Register Src;
  unsigned Mods;
  std::tie(Src, Mods) = selectVOP3ModsImpl(Root.getReg());

  return {{
      [=](MachineInstrBuilder &MIB) {
        MIB.addReg(copyToVGPRIfSrcFolded(Src, Mods, Root, MIB));
      },
      [=](MachineInstrBuilder &MIB) { MIB.addImm(Mods); }, // src0_mods
      [=](MachineInstrBuilder &MIB) { MIB.addImm(0); },    // clamp
      [=](MachineInstrBuilder &MIB) { MIB.addImm(0); }     // omod
  }};
}
4618
AMDGPUInstructionSelector::selectVOP3BMods0(MachineOperand &Root) const {
  // Same as selectVOP3Mods0 but with abs folding disabled (AllowAbs=false).
  Register Src;
  unsigned Mods;
  std::tie(Src, Mods) = selectVOP3ModsImpl(Root.getReg(),
                                           /*IsCanonicalizing=*/true,
                                           /*AllowAbs=*/false);

  return {{
      [=](MachineInstrBuilder &MIB) {
        MIB.addReg(copyToVGPRIfSrcFolded(Src, Mods, Root, MIB));
      },
      [=](MachineInstrBuilder &MIB) { MIB.addImm(Mods); }, // src0_mods
      [=](MachineInstrBuilder &MIB) { MIB.addImm(0); },    // clamp
      [=](MachineInstrBuilder &MIB) { MIB.addImm(0); }     // omod
  }};
}
4636
AMDGPUInstructionSelector::selectVOP3OMods(MachineOperand &Root) const {
  // Render the source untouched plus explicit zero clamp and omod operands;
  // no source modifiers are matched here.
  return {{
      [=](MachineInstrBuilder &MIB) { MIB.add(Root); },
      [=](MachineInstrBuilder &MIB) { MIB.addImm(0); }, // clamp
      [=](MachineInstrBuilder &MIB) { MIB.addImm(0); }  // omod
  }};
}
4645
AMDGPUInstructionSelector::selectVOP3Mods(MachineOperand &Root) const {
  // Fold neg/abs into source modifiers and render src + src_mods.
  Register Src;
  unsigned Mods;
  std::tie(Src, Mods) = selectVOP3ModsImpl(Root.getReg());

  return {{
      [=](MachineInstrBuilder &MIB) {
        MIB.addReg(copyToVGPRIfSrcFolded(Src, Mods, Root, MIB));
      },
      [=](MachineInstrBuilder &MIB) { MIB.addImm(Mods); } // src_mods
  }};
}
4659
AMDGPUInstructionSelector::selectVOP3ModsNonCanonicalizing(
    MachineOperand &Root) const {
  // Like selectVOP3Mods, but does not treat `fsub [+-]0, x` as fneg
  // (IsCanonicalizing=false).
  Register Src;
  unsigned Mods;
  std::tie(Src, Mods) =
      selectVOP3ModsImpl(Root.getReg(), /*IsCanonicalizing=*/false);

  return {{
      [=](MachineInstrBuilder &MIB) {
        MIB.addReg(copyToVGPRIfSrcFolded(Src, Mods, Root, MIB));
      },
      [=](MachineInstrBuilder &MIB) { MIB.addImm(Mods); } // src_mods
  }};
}
4675
AMDGPUInstructionSelector::selectVOP3BMods(MachineOperand &Root) const {
  // Like selectVOP3Mods, but with abs folding disabled (AllowAbs=false).
  Register Src;
  unsigned Mods;
  std::tie(Src, Mods) =
      selectVOP3ModsImpl(Root.getReg(), /*IsCanonicalizing=*/true,
                         /*AllowAbs=*/false);

  return {{
      [=](MachineInstrBuilder &MIB) {
        MIB.addReg(copyToVGPRIfSrcFolded(Src, Mods, Root, MIB));
      },
      [=](MachineInstrBuilder &MIB) { MIB.addImm(Mods); } // src_mods
  }};
}
4691
AMDGPUInstructionSelector::selectVOP3NoMods(MachineOperand &Root) const {
  // Only match when no source modifier (fneg/fabs) feeds the operand.
  Register Reg = Root.getReg();
  const MachineInstr *Def = getDefIgnoringCopies(Reg, *MRI);
  if (Def->getOpcode() == AMDGPU::G_FNEG || Def->getOpcode() == AMDGPU::G_FABS)
    return {};
  return {{
      [=](MachineInstrBuilder &MIB) { MIB.addReg(Reg); },
  }};
}
4702
4703enum class SrcStatus {
4708 // This means current op = [op_upper, op_lower] and src = -op_lower.
4711 // This means current op = [op_upper, op_lower] and src = [op_upper,
4712 // -op_lower].
4720};
4721/// Test if the MI is truncating to half, such as `%reg0:n = G_TRUNC %reg1:2n`
4722static bool isTruncHalf(const MachineInstr *MI,
4723 const MachineRegisterInfo &MRI) {
4724 if (MI->getOpcode() != AMDGPU::G_TRUNC)
4725 return false;
4726
4727 unsigned DstSize = MRI.getType(MI->getOperand(0).getReg()).getSizeInBits();
4728 unsigned SrcSize = MRI.getType(MI->getOperand(1).getReg()).getSizeInBits();
4729 return DstSize * 2 == SrcSize;
4730}
4731
4732/// Test if the MI is logic shift right with half bits,
4733/// such as `%reg0:2n =G_LSHR %reg1:2n, CONST(n)`
4734static bool isLshrHalf(const MachineInstr *MI, const MachineRegisterInfo &MRI) {
4735 if (MI->getOpcode() != AMDGPU::G_LSHR)
4736 return false;
4737
4738 Register ShiftSrc;
4739 std::optional<ValueAndVReg> ShiftAmt;
4740 if (mi_match(MI->getOperand(0).getReg(), MRI,
4741 m_GLShr(m_Reg(ShiftSrc), m_GCst(ShiftAmt)))) {
4742 unsigned SrcSize = MRI.getType(MI->getOperand(1).getReg()).getSizeInBits();
4743 unsigned Shift = ShiftAmt->Value.getZExtValue();
4744 return Shift * 2 == SrcSize;
4745 }
4746 return false;
4747}
4748
4749/// Test if the MI is shift left with half bits,
4750/// such as `%reg0:2n =G_SHL %reg1:2n, CONST(n)`
4751static bool isShlHalf(const MachineInstr *MI, const MachineRegisterInfo &MRI) {
4752 if (MI->getOpcode() != AMDGPU::G_SHL)
4753 return false;
4754
4755 Register ShiftSrc;
4756 std::optional<ValueAndVReg> ShiftAmt;
4757 if (mi_match(MI->getOperand(0).getReg(), MRI,
4758 m_GShl(m_Reg(ShiftSrc), m_GCst(ShiftAmt)))) {
4759 unsigned SrcSize = MRI.getType(MI->getOperand(1).getReg()).getSizeInBits();
4760 unsigned Shift = ShiftAmt->Value.getZExtValue();
4761 return Shift * 2 == SrcSize;
4762 }
4763 return false;
4764}
4765
4766/// Test function, if the MI is `%reg0:n, %reg1:n = G_UNMERGE_VALUES %reg2:2n`
4767static bool isUnmergeHalf(const MachineInstr *MI,
4768 const MachineRegisterInfo &MRI) {
4769 if (MI->getOpcode() != AMDGPU::G_UNMERGE_VALUES)
4770 return false;
4771 return MI->getNumOperands() == 3 && MI->getOperand(0).isDef() &&
4772 MI->getOperand(1).isDef() && !MI->getOperand(2).isDef();
4773}
4774
4776
4778 const MachineRegisterInfo &MRI) {
4779 LLT OpTy = MRI.getType(Reg);
4780 if (OpTy.isScalar())
4781 return TypeClass::SCALAR;
4782 if (OpTy.isVector() && OpTy.getNumElements() == 2)
4785}
4786
4788 const MachineRegisterInfo &MRI) {
4789 TypeClass NegType = isVectorOfTwoOrScalar(Reg, MRI);
4790 if (NegType != TypeClass::VECTOR_OF_TWO && NegType != TypeClass::SCALAR)
4791 return SrcStatus::INVALID;
4792
4793 switch (S) {
4794 case SrcStatus::IS_SAME:
4795 if (NegType == TypeClass::VECTOR_OF_TWO) {
4796 // Vector of 2:
4797 // [SrcHi, SrcLo] = [CurrHi, CurrLo]
4798 // [CurrHi, CurrLo] = neg [OpHi, OpLo](2 x Type)
4799 // [CurrHi, CurrLo] = [-OpHi, -OpLo](2 x Type)
4800 // [SrcHi, SrcLo] = [-OpHi, -OpLo]
4802 }
4803 if (NegType == TypeClass::SCALAR) {
4804 // Scalar:
4805 // [SrcHi, SrcLo] = [CurrHi, CurrLo]
4806 // [CurrHi, CurrLo] = neg [OpHi, OpLo](Type)
4807 // [CurrHi, CurrLo] = [-OpHi, OpLo](Type)
4808 // [SrcHi, SrcLo] = [-OpHi, OpLo]
4809 return SrcStatus::IS_HI_NEG;
4810 }
4811 break;
4813 if (NegType == TypeClass::VECTOR_OF_TWO) {
4814 // Vector of 2:
4815 // [SrcHi, SrcLo] = [-CurrHi, CurrLo]
4816 // [CurrHi, CurrLo] = neg [OpHi, OpLo](2 x Type)
4817 // [CurrHi, CurrLo] = [-OpHi, -OpLo](2 x Type)
4818 // [SrcHi, SrcLo] = [-(-OpHi), -OpLo] = [OpHi, -OpLo]
4819 return SrcStatus::IS_LO_NEG;
4820 }
4821 if (NegType == TypeClass::SCALAR) {
4822 // Scalar:
4823 // [SrcHi, SrcLo] = [-CurrHi, CurrLo]
4824 // [CurrHi, CurrLo] = neg [OpHi, OpLo](Type)
4825 // [CurrHi, CurrLo] = [-OpHi, OpLo](Type)
4826 // [SrcHi, SrcLo] = [-(-OpHi), OpLo] = [OpHi, OpLo]
4827 return SrcStatus::IS_SAME;
4828 }
4829 break;
4831 if (NegType == TypeClass::VECTOR_OF_TWO) {
4832 // Vector of 2:
4833 // [SrcHi, SrcLo] = [CurrHi, -CurrLo]
4834 // [CurrHi, CurrLo] = fneg [OpHi, OpLo](2 x Type)
4835 // [CurrHi, CurrLo] = [-OpHi, -OpLo](2 x Type)
4836 // [SrcHi, SrcLo] = [-OpHi, -(-OpLo)] = [-OpHi, OpLo]
4837 return SrcStatus::IS_HI_NEG;
4838 }
4839 if (NegType == TypeClass::SCALAR) {
4840 // Scalar:
4841 // [SrcHi, SrcLo] = [CurrHi, -CurrLo]
4842 // [CurrHi, CurrLo] = fneg [OpHi, OpLo](Type)
4843 // [CurrHi, CurrLo] = [-OpHi, OpLo](Type)
4844 // [SrcHi, SrcLo] = [-OpHi, -OpLo]
4846 }
4847 break;
4849 if (NegType == TypeClass::VECTOR_OF_TWO) {
4850 // Vector of 2:
4851 // [SrcHi, SrcLo] = [-CurrHi, -CurrLo]
4852 // [CurrHi, CurrLo] = fneg [OpHi, OpLo](2 x Type)
4853 // [CurrHi, CurrLo] = [-OpHi, -OpLo](2 x Type)
4854 // [SrcHi, SrcLo] = [OpHi, OpLo]
4855 return SrcStatus::IS_SAME;
4856 }
4857 if (NegType == TypeClass::SCALAR) {
4858 // Scalar:
4859 // [SrcHi, SrcLo] = [-CurrHi, -CurrLo]
4860 // [CurrHi, CurrLo] = fneg [OpHi, OpLo](Type)
4861 // [CurrHi, CurrLo] = [-OpHi, OpLo](Type)
4862 // [SrcHi, SrcLo] = [OpHi, -OpLo]
4863 return SrcStatus::IS_LO_NEG;
4864 }
4865 break;
4867 // Vector of 2:
4868 // Src = CurrUpper
4869 // Curr = [CurrUpper, CurrLower]
4870 // [CurrUpper, CurrLower] = fneg [OpUpper, OpLower](2 x Type)
4871 // [CurrUpper, CurrLower] = [-OpUpper, -OpLower](2 x Type)
4872 // Src = -OpUpper
4873 //
4874 // Scalar:
4875 // Src = CurrUpper
4876 // Curr = [CurrUpper, CurrLower]
4877 // [CurrUpper, CurrLower] = fneg [OpUpper, OpLower](Type)
4878 // [CurrUpper, CurrLower] = [-OpUpper, OpLower](Type)
4879 // Src = -OpUpper
4882 if (NegType == TypeClass::VECTOR_OF_TWO) {
4883 // Vector of 2:
4884 // Src = CurrLower
4885 // Curr = [CurrUpper, CurrLower]
4886 // [CurrUpper, CurrLower] = fneg [OpUpper, OpLower](2 x Type)
4887 // [CurrUpper, CurrLower] = [-OpUpper, -OpLower](2 x Type)
4888 // Src = -OpLower
4890 }
4891 if (NegType == TypeClass::SCALAR) {
4892 // Scalar:
4893 // Src = CurrLower
4894 // Curr = [CurrUpper, CurrLower]
4895 // [CurrUpper, CurrLower] = fneg [OpUpper, OpLower](Type)
4896 // [CurrUpper, CurrLower] = [-OpUpper, OpLower](Type)
4897 // Src = OpLower
4899 }
4900 break;
4902 // Vector of 2:
4903 // Src = -CurrUpper
4904 // Curr = [CurrUpper, CurrLower]
4905 // [CurrUpper, CurrLower] = fneg [OpUpper, OpLower](2 x Type)
4906 // [CurrUpper, CurrLower] = [-OpUpper, -OpLower](2 x Type)
4907 // Src = -(-OpUpper) = OpUpper
4908 //
4909 // Scalar:
4910 // Src = -CurrUpper
4911 // Curr = [CurrUpper, CurrLower]
4912 // [CurrUpper, CurrLower] = fneg [OpUpper, OpLower](Type)
4913 // [CurrUpper, CurrLower] = [-OpUpper, OpLower](Type)
4914 // Src = -(-OpUpper) = OpUpper
4917 if (NegType == TypeClass::VECTOR_OF_TWO) {
4918 // Vector of 2:
4919 // Src = -CurrLower
4920 // Curr = [CurrUpper, CurrLower]
4921 // [CurrUpper, CurrLower] = fneg [OpUpper, OpLower](2 x Type)
4922 // [CurrUpper, CurrLower] = [-OpUpper, -OpLower](2 x Type)
4923 // Src = -(-OpLower) = OpLower
4925 }
4926 if (NegType == TypeClass::SCALAR) {
4927 // Scalar:
4928 // Src = -CurrLower
4929 // Curr = [CurrUpper, CurrLower]
4930 // [CurrUpper, CurrLower] = fneg [OpUpper, OpLower](Type)
4931 // [CurrUpper, CurrLower] = [-OpUpper, OpLower](Type)
4932 // Src = -OpLower
4934 }
4935 break;
4936 default:
4937 break;
4938 }
4939 llvm_unreachable("unexpected SrcStatus & NegType combination");
4940}
4941
4942static std::optional<std::pair<Register, SrcStatus>>
4943calcNextStatus(std::pair<Register, SrcStatus> Curr,
4944 const MachineRegisterInfo &MRI) {
4945 const MachineInstr *MI = MRI.getVRegDef(Curr.first);
4946
4947 unsigned Opc = MI->getOpcode();
4948
4949 // Handle general Opc cases.
4950 switch (Opc) {
4951 case AMDGPU::G_BITCAST:
4952 return std::optional<std::pair<Register, SrcStatus>>(
4953 {MI->getOperand(1).getReg(), Curr.second});
4954 case AMDGPU::COPY:
4955 if (MI->getOperand(1).getReg().isPhysical())
4956 return std::nullopt;
4957 return std::optional<std::pair<Register, SrcStatus>>(
4958 {MI->getOperand(1).getReg(), Curr.second});
4959 case AMDGPU::G_FNEG: {
4960 SrcStatus Stat = getNegStatus(Curr.first, Curr.second, MRI);
4961 if (Stat == SrcStatus::INVALID)
4962 return std::nullopt;
4963 return std::optional<std::pair<Register, SrcStatus>>(
4964 {MI->getOperand(1).getReg(), Stat});
4965 }
4966 default:
4967 break;
4968 }
4969
4970 // Calc next Stat from current Stat.
4971 switch (Curr.second) {
4972 case SrcStatus::IS_SAME:
4973 if (isTruncHalf(MI, MRI))
4974 return std::optional<std::pair<Register, SrcStatus>>(
4975 {MI->getOperand(1).getReg(), SrcStatus::IS_LOWER_HALF});
4976 else if (isUnmergeHalf(MI, MRI)) {
4977 if (Curr.first == MI->getOperand(0).getReg())
4978 return std::optional<std::pair<Register, SrcStatus>>(
4979 {MI->getOperand(2).getReg(), SrcStatus::IS_LOWER_HALF});
4980 return std::optional<std::pair<Register, SrcStatus>>(
4981 {MI->getOperand(2).getReg(), SrcStatus::IS_UPPER_HALF});
4982 }
4983 break;
4985 if (isTruncHalf(MI, MRI)) {
4986 // [SrcHi, SrcLo] = [-CurrHi, CurrLo]
4987 // [CurrHi, CurrLo] = trunc [OpUpper, OpLower] = OpLower
4988 // = [OpLowerHi, OpLowerLo]
4989 // Src = [SrcHi, SrcLo] = [-CurrHi, CurrLo]
4990 // = [-OpLowerHi, OpLowerLo]
4991 // = -OpLower
4992 return std::optional<std::pair<Register, SrcStatus>>(
4993 {MI->getOperand(1).getReg(), SrcStatus::IS_LOWER_HALF_NEG});
4994 }
4995 if (isUnmergeHalf(MI, MRI)) {
4996 if (Curr.first == MI->getOperand(0).getReg())
4997 return std::optional<std::pair<Register, SrcStatus>>(
4998 {MI->getOperand(2).getReg(), SrcStatus::IS_LOWER_HALF_NEG});
4999 return std::optional<std::pair<Register, SrcStatus>>(
5000 {MI->getOperand(2).getReg(), SrcStatus::IS_UPPER_HALF_NEG});
5001 }
5002 break;
5004 if (isShlHalf(MI, MRI))
5005 return std::optional<std::pair<Register, SrcStatus>>(
5006 {MI->getOperand(1).getReg(), SrcStatus::IS_LOWER_HALF});
5007 break;
5009 if (isLshrHalf(MI, MRI))
5010 return std::optional<std::pair<Register, SrcStatus>>(
5011 {MI->getOperand(1).getReg(), SrcStatus::IS_UPPER_HALF});
5012 break;
5014 if (isShlHalf(MI, MRI))
5015 return std::optional<std::pair<Register, SrcStatus>>(
5016 {MI->getOperand(1).getReg(), SrcStatus::IS_LOWER_HALF_NEG});
5017 break;
5019 if (isLshrHalf(MI, MRI))
5020 return std::optional<std::pair<Register, SrcStatus>>(
5021 {MI->getOperand(1).getReg(), SrcStatus::IS_UPPER_HALF_NEG});
5022 break;
5023 default:
5024 break;
5025 }
5026 return std::nullopt;
5027}
5028
5029/// This is used to control valid status that current MI supports. For example,
5030/// non floating point intrinsic such as @llvm.amdgcn.sdot2 does not support NEG
5031/// bit on VOP3P.
5032/// The class can be further extended to recognize support on SEL, NEG, ABS bit
5033/// for different MI on different arch
5035private:
5036 bool HasNeg = false;
5037 // Assume all complex pattern of VOP3P have opsel.
5038 bool HasOpsel = true;
5039
5040public:
5042 const MachineInstr *MI = MRI.getVRegDef(Reg);
5043 unsigned Opc = MI->getOpcode();
5044
5045 if (Opc == TargetOpcode::G_INTRINSIC) {
5046 Intrinsic::ID IntrinsicID = cast<GIntrinsic>(*MI).getIntrinsicID();
5047 // Only float point intrinsic has neg & neg_hi bits.
5048 if (IntrinsicID == Intrinsic::amdgcn_fdot2)
5049 HasNeg = true;
5051 // Keep same for generic op.
5052 HasNeg = true;
5053 }
5054 }
5055 bool checkOptions(SrcStatus Stat) const {
5056 if (!HasNeg &&
5057 (Stat >= SrcStatus::NEG_START && Stat <= SrcStatus::NEG_END)) {
5058 return false;
5059 }
5060 if (!HasOpsel &&
5061 (Stat >= SrcStatus::HALF_START && Stat <= SrcStatus::HALF_END)) {
5062 return false;
5063 }
5064 return true;
5065 }
5066};
5067
5070 int MaxDepth = 3) {
5071 int Depth = 0;
5072 auto Curr = calcNextStatus({Reg, SrcStatus::IS_SAME}, MRI);
5074
5075 while (Depth <= MaxDepth && Curr.has_value()) {
5076 Depth++;
5077 if (SO.checkOptions(Curr.value().second))
5078 Statlist.push_back(Curr.value());
5079 Curr = calcNextStatus(Curr.value(), MRI);
5080 }
5081
5082 return Statlist;
5083}
5084
5085static std::pair<Register, SrcStatus>
5087 int MaxDepth = 3) {
5088 int Depth = 0;
5089 std::pair<Register, SrcStatus> LastSameOrNeg = {Reg, SrcStatus::IS_SAME};
5090 auto Curr = calcNextStatus(LastSameOrNeg, MRI);
5091
5092 while (Depth <= MaxDepth && Curr.has_value()) {
5093 Depth++;
5094 SrcStatus Stat = Curr.value().second;
5095 if (SO.checkOptions(Stat)) {
5096 if (Stat == SrcStatus::IS_SAME || Stat == SrcStatus::IS_HI_NEG ||
5098 LastSameOrNeg = Curr.value();
5099 }
5100 Curr = calcNextStatus(Curr.value(), MRI);
5101 }
5102
5103 return LastSameOrNeg;
5104}
5105
5106static bool isSameBitWidth(Register Reg1, Register Reg2,
5107 const MachineRegisterInfo &MRI) {
5108 unsigned Width1 = MRI.getType(Reg1).getSizeInBits();
5109 unsigned Width2 = MRI.getType(Reg2).getSizeInBits();
5110 return Width1 == Width2;
5111}
5112
5113static unsigned updateMods(SrcStatus HiStat, SrcStatus LoStat, unsigned Mods) {
5114 // SrcStatus::IS_LOWER_HALF remain 0.
5115 if (HiStat == SrcStatus::IS_UPPER_HALF_NEG) {
5116 Mods ^= SISrcMods::NEG_HI;
5117 Mods |= SISrcMods::OP_SEL_1;
5118 } else if (HiStat == SrcStatus::IS_UPPER_HALF)
5119 Mods |= SISrcMods::OP_SEL_1;
5120 else if (HiStat == SrcStatus::IS_LOWER_HALF_NEG)
5121 Mods ^= SISrcMods::NEG_HI;
5122 else if (HiStat == SrcStatus::IS_HI_NEG)
5123 Mods ^= SISrcMods::NEG_HI;
5124
5125 if (LoStat == SrcStatus::IS_UPPER_HALF_NEG) {
5126 Mods ^= SISrcMods::NEG;
5127 Mods |= SISrcMods::OP_SEL_0;
5128 } else if (LoStat == SrcStatus::IS_UPPER_HALF)
5129 Mods |= SISrcMods::OP_SEL_0;
5130 else if (LoStat == SrcStatus::IS_LOWER_HALF_NEG)
5131 Mods |= SISrcMods::NEG;
5132 else if (LoStat == SrcStatus::IS_HI_NEG)
5133 Mods ^= SISrcMods::NEG;
5134
5135 return Mods;
5136}
5137
5138static bool isValidToPack(SrcStatus HiStat, SrcStatus LoStat, Register NewReg,
5139 Register RootReg, const SIInstrInfo &TII,
5140 const MachineRegisterInfo &MRI) {
5141 auto IsHalfState = [](SrcStatus S) {
5144 };
5145 return isSameBitWidth(NewReg, RootReg, MRI) && IsHalfState(LoStat) &&
5146 IsHalfState(HiStat);
5147}
5148
5149std::pair<Register, unsigned> AMDGPUInstructionSelector::selectVOP3PModsImpl(
5150 Register RootReg, const MachineRegisterInfo &MRI, bool IsDOT) const {
5151 unsigned Mods = 0;
5152 // No modification if Root type is not form of <2 x Type>.
5153 if (isVectorOfTwoOrScalar(RootReg, MRI) != TypeClass::VECTOR_OF_TWO) {
5154 Mods |= SISrcMods::OP_SEL_1;
5155 return {RootReg, Mods};
5156 }
5157
5158 SearchOptions SO(RootReg, MRI);
5159
5160 std::pair<Register, SrcStatus> Stat = getLastSameOrNeg(RootReg, MRI, SO);
5161
5162 if (Stat.second == SrcStatus::IS_BOTH_NEG)
5164 else if (Stat.second == SrcStatus::IS_HI_NEG)
5165 Mods ^= SISrcMods::NEG_HI;
5166 else if (Stat.second == SrcStatus::IS_LO_NEG)
5167 Mods ^= SISrcMods::NEG;
5168
5169 MachineInstr *MI = MRI.getVRegDef(Stat.first);
5170
5171 if (MI->getOpcode() != AMDGPU::G_BUILD_VECTOR || MI->getNumOperands() != 3 ||
5172 (IsDOT && Subtarget->hasDOTOpSelHazard())) {
5173 Mods |= SISrcMods::OP_SEL_1;
5174 return {Stat.first, Mods};
5175 }
5176
5178 getSrcStats(MI->getOperand(2).getReg(), MRI, SO);
5179
5180 if (StatlistHi.empty()) {
5181 Mods |= SISrcMods::OP_SEL_1;
5182 return {Stat.first, Mods};
5183 }
5184
5186 getSrcStats(MI->getOperand(1).getReg(), MRI, SO);
5187
5188 if (StatlistLo.empty()) {
5189 Mods |= SISrcMods::OP_SEL_1;
5190 return {Stat.first, Mods};
5191 }
5192
5193 for (int I = StatlistHi.size() - 1; I >= 0; I--) {
5194 for (int J = StatlistLo.size() - 1; J >= 0; J--) {
5195 if (StatlistHi[I].first == StatlistLo[J].first &&
5196 isValidToPack(StatlistHi[I].second, StatlistLo[J].second,
5197 StatlistHi[I].first, RootReg, TII, MRI))
5198 return {StatlistHi[I].first,
5199 updateMods(StatlistHi[I].second, StatlistLo[J].second, Mods)};
5200 }
5201 }
5202 // Packed instructions do not have abs modifiers.
5203 Mods |= SISrcMods::OP_SEL_1;
5204
5205 return {Stat.first, Mods};
5206}
5207
5208// Removed unused function `getAllKindImm` to eliminate dead code.
5209
5210static bool checkRB(Register Reg, unsigned int RBNo,
5211 const AMDGPURegisterBankInfo &RBI,
5212 const MachineRegisterInfo &MRI,
5213 const TargetRegisterInfo &TRI) {
5214 const RegisterBank *RB = RBI.getRegBank(Reg, MRI, TRI);
5215 return RB->getID() == RBNo;
5216}
5217
5218// This function is used to get the correct register bank for returned reg.
5219// Assume:
5220// 1. VOP3P is always legal for VGPR.
5221// 2. RootOp's regbank is legal.
5222// Thus
5223// 1. If RootOp is SGPR, then NewOp can be SGPR or VGPR.
5224// 2. If RootOp is VGPR, then NewOp must be VGPR.
5226 const AMDGPURegisterBankInfo &RBI,
5228 const TargetRegisterInfo &TRI,
5229 const SIInstrInfo &TII) {
5230 // RootOp can only be VGPR or SGPR (some hand written cases such as.
5231 // inst-select-ashr.v2s16.mir::ashr_v2s16_vs).
5232 if (checkRB(RootReg, AMDGPU::SGPRRegBankID, RBI, MRI, TRI) ||
5233 checkRB(NewReg, AMDGPU::VGPRRegBankID, RBI, MRI, TRI))
5234 return NewReg;
5235
5236 MachineInstr *MI = MRI.getVRegDef(RootReg);
5237 if (MI->getOpcode() == AMDGPU::COPY && NewReg == MI->getOperand(1).getReg()) {
5238 // RootOp is VGPR, NewOp is not VGPR, but RootOp = COPY NewOp.
5239 return RootReg;
5240 }
5241
5242 MachineBasicBlock *BB = MI->getParent();
5243 Register DstReg = MRI.cloneVirtualRegister(RootReg);
5244
5246 BuildMI(*BB, MI, MI->getDebugLoc(), TII.get(AMDGPU::COPY), DstReg)
5247 .addReg(NewReg);
5248
5249 // Only accept VGPR.
5250 return MIB->getOperand(0).getReg();
5251}
5252
AMDGPUInstructionSelector::selectVOP3PRetHelper(MachineOperand &Root,
                                                bool IsDOT) const {
  // Shared renderer for VOP3P sources: match packed modifiers, then make sure
  // the returned register lives in a register bank legal for the root operand.
  MachineRegisterInfo &MRI = Root.getParent()->getMF()->getRegInfo();
  Register Reg;
  unsigned Mods;
  std::tie(Reg, Mods) = selectVOP3PModsImpl(Root.getReg(), MRI, IsDOT);

  Reg = getLegalRegBank(Reg, Root.getReg(), RBI, MRI, TRI, TII);
  return {{
      [=](MachineInstrBuilder &MIB) { MIB.addReg(Reg); },
      [=](MachineInstrBuilder &MIB) { MIB.addImm(Mods); } // src_mods
  }};
}
5267
AMDGPUInstructionSelector::selectVOP3PMods(MachineOperand &Root) const {

  // Packed (VOP3P) source with modifier matching; non-DOT form.
  return selectVOP3PRetHelper(Root);
}
5273
AMDGPUInstructionSelector::selectVOP3PModsDOT(MachineOperand &Root) const {

  // DOT variant: IsDOT=true restricts folding (see selectVOP3PModsImpl).
  return selectVOP3PRetHelper(Root, true);
}
5279
AMDGPUInstructionSelector::selectVOP3PNoModsDOT(MachineOperand &Root) const {
  // Only match when nothing beyond the default op_sel_hi bit was folded;
  // otherwise fail so a modifier-aware pattern can be used instead.
  MachineRegisterInfo &MRI = Root.getParent()->getMF()->getRegInfo();
  Register Src;
  unsigned Mods;
  std::tie(Src, Mods) = selectVOP3PModsImpl(Root.getReg(), MRI, true /*IsDOT*/);
  if (Mods != SISrcMods::OP_SEL_1)
    return {};

  return {{[=](MachineInstrBuilder &MIB) { MIB.addReg(Src); }}};
}
5291
AMDGPUInstructionSelector::selectVOP3PModsF32(MachineOperand &Root) const {
  // 32-bit operand of a packed instruction: scalar modifier matching plus the
  // implied op_sel_hi (see selectVOP3PModsF32Impl).
  Register Src;
  unsigned Mods;
  std::tie(Src, Mods) = selectVOP3PModsF32Impl(Root.getReg());

  return {{
      [=](MachineInstrBuilder &MIB) { MIB.addReg(Src); },
      [=](MachineInstrBuilder &MIB) { MIB.addImm(Mods); } // src_mods
  }};
}
5303
AMDGPUInstructionSelector::selectVOP3PNoModsF32(MachineOperand &Root) const {
  // Only match when no modifier beyond the default op_sel_hi was folded.
  Register Src;
  unsigned Mods;
  std::tie(Src, Mods) = selectVOP3PModsF32Impl(Root.getReg());
  if (Mods != SISrcMods::OP_SEL_1)
    return {};

  return {{[=](MachineInstrBuilder &MIB) { MIB.addReg(Src); }}};
}
5314
AMDGPUInstructionSelector::selectWMMAOpSelVOP3PMods(
    MachineOperand &Root) const {
  // Root is an i1 immediate (0 or -1) selecting op_sel_0; op_sel_hi is always
  // set for packed sources.
  assert((Root.isImm() && (Root.getImm() == -1 || Root.getImm() == 0)) &&
         "expected i1 value");
  unsigned Mods = SISrcMods::OP_SEL_1;
  if (Root.getImm() != 0)
    Mods |= SISrcMods::OP_SEL_0;

  return {{
      [=](MachineInstrBuilder &MIB) { MIB.addImm(Mods); } // src_mods
  }};
}
5328
5330 MachineInstr *InsertPt,
5331 MachineRegisterInfo &MRI) {
5332 const TargetRegisterClass *DstRegClass;
5333 switch (Elts.size()) {
5334 case 8:
5335 DstRegClass = &AMDGPU::VReg_256RegClass;
5336 break;
5337 case 4:
5338 DstRegClass = &AMDGPU::VReg_128RegClass;
5339 break;
5340 case 2:
5341 DstRegClass = &AMDGPU::VReg_64RegClass;
5342 break;
5343 default:
5344 llvm_unreachable("unhandled Reg sequence size");
5345 }
5346
5347 MachineIRBuilder B(*InsertPt);
5348 auto MIB = B.buildInstr(AMDGPU::REG_SEQUENCE)
5349 .addDef(MRI.createVirtualRegister(DstRegClass));
5350 for (unsigned i = 0; i < Elts.size(); ++i) {
5351 MIB.addReg(Elts[i]);
5353 }
5354 return MIB->getOperand(0).getReg();
5355}
5356
5357static void selectWMMAModsNegAbs(unsigned ModOpcode, unsigned &Mods,
5359 MachineInstr *InsertPt,
5360 MachineRegisterInfo &MRI) {
5361 if (ModOpcode == TargetOpcode::G_FNEG) {
5362 Mods |= SISrcMods::NEG;
5363 // Check if all elements also have abs modifier
5364 SmallVector<Register, 8> NegAbsElts;
5365 for (auto El : Elts) {
5366 Register FabsSrc;
5367 if (!mi_match(El, MRI, m_GFabs(m_Reg(FabsSrc))))
5368 break;
5369 NegAbsElts.push_back(FabsSrc);
5370 }
5371 if (Elts.size() != NegAbsElts.size()) {
5372 // Neg
5373 Src = buildRegSequence(Elts, InsertPt, MRI);
5374 } else {
5375 // Neg and Abs
5376 Mods |= SISrcMods::NEG_HI;
5377 Src = buildRegSequence(NegAbsElts, InsertPt, MRI);
5378 }
5379 } else {
5380 assert(ModOpcode == TargetOpcode::G_FABS);
5381 // Abs
5382 Mods |= SISrcMods::NEG_HI;
5383 Src = buildRegSequence(Elts, InsertPt, MRI);
5384 }
5385}
5386
5388AMDGPUInstructionSelector::selectWMMAModsF32NegAbs(MachineOperand &Root) const {
5389 Register Src = Root.getReg();
5390 unsigned Mods = SISrcMods::OP_SEL_1;
5392
5393 if (GBuildVector *BV = dyn_cast<GBuildVector>(MRI->getVRegDef(Src))) {
5394 assert(BV->getNumSources() > 0);
5395 // Based on first element decide which mod we match, neg or abs
5396 MachineInstr *ElF32 = MRI->getVRegDef(BV->getSourceReg(0));
5397 unsigned ModOpcode = (ElF32->getOpcode() == AMDGPU::G_FNEG)
5398 ? AMDGPU::G_FNEG
5399 : AMDGPU::G_FABS;
5400 for (unsigned i = 0; i < BV->getNumSources(); ++i) {
5401 ElF32 = MRI->getVRegDef(BV->getSourceReg(i));
5402 if (ElF32->getOpcode() != ModOpcode)
5403 break;
5404 EltsF32.push_back(ElF32->getOperand(1).getReg());
5405 }
5406
5407 // All elements had ModOpcode modifier
5408 if (BV->getNumSources() == EltsF32.size()) {
5409 selectWMMAModsNegAbs(ModOpcode, Mods, EltsF32, Src, Root.getParent(),
5410 *MRI);
5411 }
5412 }
5413
5414 return {{[=](MachineInstrBuilder &MIB) { MIB.addReg(Src); },
5415 [=](MachineInstrBuilder &MIB) { MIB.addImm(Mods); }}};
5416}
5417
AMDGPUInstructionSelector::selectWMMAModsF16Neg(MachineOperand &Root) const {
  // If every v2f16 piece of the concat-vectors is an fneg, strip the fnegs,
  // rebuild the register sequence from the fneg sources, and set neg/neg_hi
  // on the operand instead.
  Register Src = Root.getReg();
  unsigned Mods = SISrcMods::OP_SEL_1;
  SmallVector<Register, 8> EltsV2F16;

  if (GConcatVectors *CV = dyn_cast<GConcatVectors>(MRI->getVRegDef(Src))) {
    for (unsigned i = 0; i < CV->getNumSources(); ++i) {
      Register FNegSrc;
      if (!mi_match(CV->getSourceReg(i), *MRI, m_GFNeg(m_Reg(FNegSrc))))
        break;
      EltsV2F16.push_back(FNegSrc);
    }

    // All elements had ModOpcode modifier
    if (CV->getNumSources() == EltsV2F16.size()) {
      Mods |= SISrcMods::NEG;
      Mods |= SISrcMods::NEG_HI;
      Src = buildRegSequence(EltsV2F16, Root.getParent(), *MRI);
    }
  }

  return {{[=](MachineInstrBuilder &MIB) { MIB.addReg(Src); },
           [=](MachineInstrBuilder &MIB) { MIB.addImm(Mods); }}};
}
5443
AMDGPUInstructionSelector::selectWMMAModsF16NegAbs(MachineOperand &Root) const {
  // Decide from the first concat element whether to match all-fneg or
  // all-fabs, then fold the uniform modifier into the operand's mod bits.
  Register Src = Root.getReg();
  unsigned Mods = SISrcMods::OP_SEL_1;
  SmallVector<Register, 8> EltsV2F16;

  if (GConcatVectors *CV = dyn_cast<GConcatVectors>(MRI->getVRegDef(Src))) {
    assert(CV->getNumSources() > 0);
    MachineInstr *ElV2F16 = MRI->getVRegDef(CV->getSourceReg(0));
    // Based on first element decide which mod we match, neg or abs
    unsigned ModOpcode = (ElV2F16->getOpcode() == AMDGPU::G_FNEG)
                             ? AMDGPU::G_FNEG
                             : AMDGPU::G_FABS;

    for (unsigned i = 0; i < CV->getNumSources(); ++i) {
      ElV2F16 = MRI->getVRegDef(CV->getSourceReg(i));
      if (ElV2F16->getOpcode() != ModOpcode)
        break;
      EltsV2F16.push_back(ElV2F16->getOperand(1).getReg());
    }

    // All elements had ModOpcode modifier
    if (CV->getNumSources() == EltsV2F16.size()) {
      // NOTE(review): 'B' appears unused in this view; selectWMMAModsNegAbs
      // builds its own insertion point. Candidate for removal — confirm.
      MachineIRBuilder B(*Root.getParent());
      selectWMMAModsNegAbs(ModOpcode, Mods, EltsV2F16, Src, Root.getParent(),
                           *MRI);
    }
  }

  return {{[=](MachineInstrBuilder &MIB) { MIB.addReg(Src); },
           [=](MachineInstrBuilder &MIB) { MIB.addImm(Mods); }}};
}
5476
AMDGPUInstructionSelector::selectWMMAVISrc(MachineOperand &Root) const {
  // Accept only inline-constant float-splat or integer-splat immediates.
  std::optional<FPValueAndVReg> FPValReg;
  if (mi_match(Root.getReg(), *MRI, m_GFCstOrSplat(FPValReg))) {
    if (TII.isInlineConstant(FPValReg->Value)) {
      return {{[=](MachineInstrBuilder &MIB) {
        MIB.addImm(FPValReg->Value.bitcastToAPInt().getSExtValue());
      }}};
    }
    // Non-inlineable splat floats should not fall-through for integer immediate
    // checks.
    return {};
  }

  APInt ICst;
  if (mi_match(Root.getReg(), *MRI, m_ICstOrSplat(ICst))) {
    if (TII.isInlineConstant(ICst)) {
      return {
          {[=](MachineInstrBuilder &MIB) { MIB.addImm(ICst.getSExtValue()); }}};
    }
  }

  return {};
}
5501
AMDGPUInstructionSelector::selectSWMMACIndex8(MachineOperand &Root) const {
  // Fold `src >> (8*k)` on a 32-bit source into index_key = k.
  Register Src =
      getDefIgnoringCopies(Root.getReg(), *MRI)->getOperand(0).getReg();
  unsigned Key = 0;

  Register ShiftSrc;
  std::optional<ValueAndVReg> ShiftAmt;
  if (mi_match(Src, *MRI, m_GLShr(m_Reg(ShiftSrc), m_GCst(ShiftAmt))) &&
      MRI->getType(ShiftSrc).getSizeInBits() == 32 &&
      ShiftAmt->Value.getZExtValue() % 8 == 0) {
    Key = ShiftAmt->Value.getZExtValue() / 8;
    Src = ShiftSrc;
  }

  return {{
      [=](MachineInstrBuilder &MIB) { MIB.addReg(Src); },
      [=](MachineInstrBuilder &MIB) { MIB.addImm(Key); } // index_key
  }};
}
5522
AMDGPUInstructionSelector::selectSWMMACIndex16(MachineOperand &Root) const {

  // Fold `src >> 16` on a 32-bit source into index_key = 1.
  Register Src =
      getDefIgnoringCopies(Root.getReg(), *MRI)->getOperand(0).getReg();
  unsigned Key = 0;

  Register ShiftSrc;
  std::optional<ValueAndVReg> ShiftAmt;
  if (mi_match(Src, *MRI, m_GLShr(m_Reg(ShiftSrc), m_GCst(ShiftAmt))) &&
      MRI->getType(ShiftSrc).getSizeInBits() == 32 &&
      ShiftAmt->Value.getZExtValue() == 16) {
    Src = ShiftSrc;
    Key = 1;
  }

  return {{
      [=](MachineInstrBuilder &MIB) { MIB.addReg(Src); },
      [=](MachineInstrBuilder &MIB) { MIB.addImm(Key); } // index_key
  }};
}
5544
AMDGPUInstructionSelector::selectSWMMACIndex32(MachineOperand &Root) const {
  // If the (zero/any-extended) 32-bit index is the high half of an unmerge,
  // use the unmerge source with index_key = 1.
  Register Src =
      getDefIgnoringCopies(Root.getReg(), *MRI)->getOperand(0).getReg();
  unsigned Key = 0;

  Register S32 = matchZeroExtendFromS32(Src);
  if (!S32)
    S32 = matchAnyExtendFromS32(Src);

  if (S32) {
    const MachineInstr *Def = getDefIgnoringCopies(S32, *MRI);
    if (Def->getOpcode() == TargetOpcode::G_UNMERGE_VALUES) {
      assert(Def->getNumOperands() == 3);
      Register DstReg1 = Def->getOperand(1).getReg();
      if (mi_match(S32, *MRI,
                   m_any_of(m_SpecificReg(DstReg1), m_Copy(m_Reg(DstReg1))))) {
        Src = Def->getOperand(2).getReg();
        Key = 1;
      }
    }
  }

  return {{
      [=](MachineInstrBuilder &MIB) { MIB.addReg(Src); },
      [=](MachineInstrBuilder &MIB) { MIB.addImm(Key); } // index_key
  }};
}
5573
// Select a VOP3 source with op_sel-style modifiers. Currently only the
// neg/abs modifiers computed by selectVOP3ModsImpl are emitted; op_sel
// itself is not yet handled (see FIXME below).
5575AMDGPUInstructionSelector::selectVOP3OpSelMods(MachineOperand &Root) const {
5576  Register Src;
5577  unsigned Mods;
5578  std::tie(Src, Mods) = selectVOP3ModsImpl(Root.getReg());
5579
5580  // FIXME: Handle op_sel
5581  return {{
5582      [=](MachineInstrBuilder &MIB) { MIB.addReg(Src); },
5583      [=](MachineInstrBuilder &MIB) { MIB.addImm(Mods); } // src_mods
5584  }};
5585}
5586
5587// FIXME-TRUE16 remove when fake16 is removed
// Select a VINTERP source operand: neg modifiers are allowed but abs is not
// (AllowAbs=false), op_sel selects the low half. The source is forced into a
// VGPR because VINTERP sources must be vector registers.
5589AMDGPUInstructionSelector::selectVINTERPMods(MachineOperand &Root) const {
5590  Register Src;
5591  unsigned Mods;
5592  std::tie(Src, Mods) = selectVOP3ModsImpl(Root.getReg(),
5593                                           /*IsCanonicalizing=*/true,
5594                                           /*AllowAbs=*/false,
5595                                           /*OpSel=*/false);
5596
5597  return {{
5598      [=](MachineInstrBuilder &MIB) {
5599        MIB.addReg(
5600            copyToVGPRIfSrcFolded(Src, Mods, Root, MIB, /* ForceVGPR */ true));
5601      },
5602      [=](MachineInstrBuilder &MIB) { MIB.addImm(Mods); }, // src0_mods
5603  }};
5604}
5605
// Same as selectVINTERPMods but with OpSel=true, i.e. the high 16-bit half
// of the source is selected.
5607AMDGPUInstructionSelector::selectVINTERPModsHi(MachineOperand &Root) const {
5608  Register Src;
5609  unsigned Mods;
5610  std::tie(Src, Mods) = selectVOP3ModsImpl(Root.getReg(),
5611                                           /*IsCanonicalizing=*/true,
5612                                           /*AllowAbs=*/false,
5613                                           /*OpSel=*/true);
5614
5615  return {{
5616      [=](MachineInstrBuilder &MIB) {
5617        MIB.addReg(
5618            copyToVGPRIfSrcFolded(Src, Mods, Root, MIB, /* ForceVGPR */ true));
5619      },
5620      [=](MachineInstrBuilder &MIB) { MIB.addImm(Mods); }, // src0_mods
5621  }};
5622}
5623
5624// Given \p Offset and load specified by the \p Root operand check if \p Offset
5625// is a multiple of the load byte size. If it is update \p Offset to a
5626// pre-scaled value and return true.
// \p IsSigned selects between signed and unsigned extension/multiply forms.
// Returns false when the subtarget lacks the scale-offset feature or the
// memory operand's size is unknown.
5627bool AMDGPUInstructionSelector::selectScaleOffset(MachineOperand &Root,
5629                                                  bool IsSigned) const {
5630  if (!Subtarget->hasScaleOffset())
5631    return false;
5632
5633  const MachineInstr &MI = *Root.getParent();
5634  MachineMemOperand *MMO = *MI.memoperands_begin();
5635
5636  if (!MMO->getSize().hasValue())
5637    return false;
5638
5639  uint64_t Size = MMO->getSize().getValue();
5640
// Strip a (sign/zero) extend from s32 and any copies before matching the
// multiply/shift patterns below.
5641  Register OffsetReg = matchExtendFromS32OrS32(Offset, IsSigned);
5642  if (!OffsetReg)
5643    OffsetReg = Offset;
5644
5645  if (auto Def = getDefSrcRegIgnoringCopies(OffsetReg, *MRI))
5646    OffsetReg = Def->Reg;
5647
5648  Register Op0;
5649  MachineInstr *Mul;
// Recognize offset = index * Size in several forms: shl by log2(Size),
// a 64-bit scalar multiply pseudo, or a MAD with a zero addend.
// NOTE(review): several matcher lines below appear truncated in this
// rendering (missing sub-patterns) — confirm against the upstream source.
5650  bool ScaleOffset =
5651      (isPowerOf2_64(Size) &&
5652       mi_match(OffsetReg, *MRI,
5653                m_GShl(m_Reg(Op0),
5656       mi_match(OffsetReg, *MRI,
5658                              m_Copy(m_SpecificICst(Size))))) ||
5659      mi_match(
5660          OffsetReg, *MRI,
5661          m_BinOp(IsSigned ? AMDGPU::S_MUL_I64_I32_PSEUDO : AMDGPU::S_MUL_U64,
5662                  m_Reg(Op0), m_SpecificICst(Size))) ||
5663      // Match G_AMDGPU_MAD_U64_U32 offset, c, 0
5664      (mi_match(OffsetReg, *MRI, m_MInstr(Mul)) &&
5665       (Mul->getOpcode() == (IsSigned ? AMDGPU::G_AMDGPU_MAD_I64_I32
5666                                      : AMDGPU::G_AMDGPU_MAD_U64_U32) ||
// An unsigned MAD is also usable for the signed case when the multiplied
// operand is provably non-negative.
5667        (IsSigned && Mul->getOpcode() == AMDGPU::G_AMDGPU_MAD_U64_U32 &&
5668         VT->signBitIsZero(Mul->getOperand(2).getReg()))) &&
5669       mi_match(Mul->getOperand(4).getReg(), *MRI, m_ZeroInt()) &&
5670       mi_match(Mul->getOperand(3).getReg(), *MRI,
5672                               m_Copy(m_SpecificICst(Size))))) &&
5673       mi_match(Mul->getOperand(2).getReg(), *MRI, m_Reg(Op0)));
5674
5675  if (ScaleOffset)
5676    Offset = Op0;
5677
5678  return ScaleOffset;
5679}
5680
// Shared worker for the SMRD addressing-mode selectors. Decomposes the root
// pointer into Base plus, depending on which out-params are non-null, an
// encoded immediate offset (*Offset), an SGPR offset (*SOffset), and whether
// the SGPR offset should be hardware-scaled (*ScaleOffset). Returns true on
// a successful match; out-params are only valid in that case.
5681bool AMDGPUInstructionSelector::selectSmrdOffset(MachineOperand &Root,
5682                                                 Register &Base,
5683                                                 Register *SOffset,
5684                                                 int64_t *Offset,
5685                                                 bool *ScaleOffset) const {
5686  MachineInstr *MI = Root.getParent();
5687  MachineBasicBlock *MBB = MI->getParent();
5688
5689  // FIXME: We should shrink the GEP if the offset is known to be <= 32-bits,
5690  // then we can select all ptr + 32-bit offsets.
5691  SmallVector<GEPInfo, 4> AddrInfo;
5692  getAddrModeInfo(*MI, *MRI, AddrInfo);
5693
5694  if (AddrInfo.empty())
5695    return false;
5696
5697  const GEPInfo &GEPI = AddrInfo[0];
5698  std::optional<int64_t> EncodedImm;
5699
5700  if (ScaleOffset)
5701    *ScaleOffset = false;
5702
// Combined SGPR + immediate form: needs a two-level GEP, i.e.
// (base + sgpr_offset) + imm, described by AddrInfo[0] and AddrInfo[1].
5703  if (SOffset && Offset) {
5704    EncodedImm = AMDGPU::getSMRDEncodedOffset(STI, GEPI.Imm, /*IsBuffer=*/false,
5705                                              /*HasSOffset=*/true);
5706    if (GEPI.SgprParts.size() == 1 && GEPI.Imm != 0 && EncodedImm &&
5707        AddrInfo.size() > 1) {
5708      const GEPInfo &GEPI2 = AddrInfo[1];
5709      if (GEPI2.SgprParts.size() == 2 && GEPI2.Imm == 0) {
5710        Register OffsetReg = GEPI2.SgprParts[1];
5711        if (ScaleOffset)
5712          *ScaleOffset =
5713              selectScaleOffset(Root, OffsetReg, false /* IsSigned */);
5714        OffsetReg = matchZeroExtendFromS32OrS32(OffsetReg);
5715        if (OffsetReg) {
5716          Base = GEPI2.SgprParts[0];
5717          *SOffset = OffsetReg;
5718          *Offset = *EncodedImm;
5719          if (*Offset >= 0 || !AMDGPU::hasSMRDSignedImmOffset(STI))
5720            return true;
5721
5722          // For unbuffered smem loads, it is illegal for the Immediate Offset
5723          // to be negative if the resulting (Offset + (M0 or SOffset or zero)
5724          // is negative. Handle the case where the Immediate Offset + SOffset
5725          // is negative.
5726          auto SKnown = VT->getKnownBits(*SOffset);
5727          if (*Offset + SKnown.getMinValue().getSExtValue() < 0)
5728            return false;
5729
5730          return true;
5731        }
5732      }
5733    }
5734    return false;
5735  }
5736
// Immediate-only form.
5737  EncodedImm = AMDGPU::getSMRDEncodedOffset(STI, GEPI.Imm, /*IsBuffer=*/false,
5738                                            /*HasSOffset=*/false);
5739  if (Offset && GEPI.SgprParts.size() == 1 && EncodedImm) {
5740    Base = GEPI.SgprParts[0];
5741    *Offset = *EncodedImm;
5742    return true;
5743  }
5744
5745  // SGPR offset is unsigned.
5746  if (SOffset && GEPI.SgprParts.size() == 1 && isUInt<32>(GEPI.Imm) &&
5747      GEPI.Imm != 0) {
5748    // If we make it this far we have a load with an 32-bit immediate offset.
5749    // It is OK to select this using a sgpr offset, because we have already
5750    // failed trying to select this load into one of the _IMM variants since
5751    // the _IMM Patterns are considered before the _SGPR patterns.
5752    Base = GEPI.SgprParts[0];
// Materialize the immediate into a fresh SGPR to use as the soffset.
5753    *SOffset = MRI->createVirtualRegister(&AMDGPU::SReg_32RegClass);
5754    BuildMI(*MBB, MI, MI->getDebugLoc(), TII.get(AMDGPU::S_MOV_B32), *SOffset)
5755        .addImm(GEPI.Imm);
5756    return true;
5757  }
5758
// Pure SGPR + SGPR form with no immediate.
5759  if (SOffset && GEPI.SgprParts.size() && GEPI.Imm == 0) {
5760    Register OffsetReg = GEPI.SgprParts[1];
5761    if (ScaleOffset)
5762      *ScaleOffset = selectScaleOffset(Root, OffsetReg, false /* IsSigned */);
5763    OffsetReg = matchZeroExtendFromS32OrS32(OffsetReg);
5764    if (OffsetReg) {
5765      Base = GEPI.SgprParts[0];
5766      *SOffset = OffsetReg;
5767      return true;
5768    }
5769  }
5770
5771  return false;
5772}
5773
// SMRD base + encoded immediate offset (the _IMM instruction variants).
5775AMDGPUInstructionSelector::selectSmrdImm(MachineOperand &Root) const {
5776  Register Base;
5777  int64_t Offset;
5778  if (!selectSmrdOffset(Root, Base, /* SOffset= */ nullptr, &Offset,
5779                        /* ScaleOffset */ nullptr))
5780    return std::nullopt;
5781
5782  return {{[=](MachineInstrBuilder &MIB) { MIB.addReg(Base); },
5783           [=](MachineInstrBuilder &MIB) { MIB.addImm(Offset); }}};
5784}
5785
// SMRD base + 32-bit literal offset (CI _IMM_ci variants). Unlike
// selectSmrdImm, the offset is encoded as a literal via
// getSMRDEncodedLiteralOffset32.
5787AMDGPUInstructionSelector::selectSmrdImm32(MachineOperand &Root) const {
5788  SmallVector<GEPInfo, 4> AddrInfo;
5789  getAddrModeInfo(*Root.getParent(), *MRI, AddrInfo);
5790
5791  if (AddrInfo.empty() || AddrInfo[0].SgprParts.size() != 1)
5792    return std::nullopt;
5793
5794  const GEPInfo &GEPInfo = AddrInfo[0];
5795  Register PtrReg = GEPInfo.SgprParts[0];
5796  std::optional<int64_t> EncodedImm =
5797      AMDGPU::getSMRDEncodedLiteralOffset32(STI, GEPInfo.Imm);
5798  if (!EncodedImm)
5799    return std::nullopt;
5800
5801  return {{
5802      [=](MachineInstrBuilder &MIB) { MIB.addReg(PtrReg); },
5803      [=](MachineInstrBuilder &MIB) { MIB.addImm(*EncodedImm); }
5804  }};
5805}
5806
// SMRD base + SGPR offset (the _SGPR variants). The cpol operand carries the
// SCAL bit when the SGPR offset is hardware-scaled by the access size.
5808AMDGPUInstructionSelector::selectSmrdSgpr(MachineOperand &Root) const {
5809  Register Base, SOffset;
5810  bool ScaleOffset;
5811  if (!selectSmrdOffset(Root, Base, &SOffset, /* Offset= */ nullptr,
5812                        &ScaleOffset))
5813    return std::nullopt;
5814
5815  unsigned CPol = ScaleOffset ? AMDGPU::CPol::SCAL : 0;
5816  return {{[=](MachineInstrBuilder &MIB) { MIB.addReg(Base); },
5817           [=](MachineInstrBuilder &MIB) { MIB.addReg(SOffset); },
5818           [=](MachineInstrBuilder &MIB) { MIB.addImm(CPol); }}};
5819}
5820
// SMRD base + SGPR offset + immediate offset (the _SGPR_IMM variants).
5822AMDGPUInstructionSelector::selectSmrdSgprImm(MachineOperand &Root) const {
5823  Register Base, SOffset;
5824  int64_t Offset;
5825  bool ScaleOffset;
5826  if (!selectSmrdOffset(Root, Base, &SOffset, &Offset, &ScaleOffset))
5827    return std::nullopt;
5828
5829  unsigned CPol = ScaleOffset ? AMDGPU::CPol::SCAL : 0;
5830  return {{[=](MachineInstrBuilder &MIB) { MIB.addReg(Base); },
5831           [=](MachineInstrBuilder &MIB) { MIB.addReg(SOffset); },
5832           [=](MachineInstrBuilder &MIB) { MIB.addImm(Offset); },
5833           [=](MachineInstrBuilder &MIB) { MIB.addImm(CPol); }}};
5834}
5835
// Shared worker for the FLAT/Global/Scratch offset selectors: split the root
// pointer into (base, imm_offset) when the constant offset can legally be
// folded into the instruction's offset field for the given \p FlatVariant;
// otherwise return the unsplit pointer with offset 0.
5836std::pair<Register, int>
5837AMDGPUInstructionSelector::selectFlatOffsetImpl(MachineOperand &Root,
5838                                                uint64_t FlatVariant) const {
5839  MachineInstr *MI = Root.getParent();
5840
5841  auto Default = std::pair(Root.getReg(), 0);
5842
5843  if (!STI.hasFlatInstOffsets())
5844    return Default;
5845
5846  Register PtrBase;
5847  int64_t ConstOffset;
5848  bool IsInBounds;
5849  std::tie(PtrBase, ConstOffset, IsInBounds) =
5850      getPtrBaseWithConstantOffset(Root.getReg(), *MRI);
5851
5852  // Adding the offset to the base address with an immediate in a FLAT
5853  // instruction must not change the memory aperture in which the address falls.
5854  // Therefore we can only fold offsets from inbounds GEPs into FLAT
5855  // instructions.
5856  if (ConstOffset == 0 ||
5857      (FlatVariant == SIInstrFlags::FlatScratch &&
5858       !isFlatScratchBaseLegal(Root.getReg())) ||
5859      (FlatVariant == SIInstrFlags::FLAT && !IsInBounds))
5860    return Default;
5861
// The legal offset range depends on both the address space of the access and
// the flat variant being selected.
5862  unsigned AddrSpace = (*MI->memoperands_begin())->getAddrSpace();
5863  if (!TII.isLegalFLATOffset(ConstOffset, AddrSpace, FlatVariant))
5864    return Default;
5865
5866  return std::pair(PtrBase, ConstOffset);
5867}
5868
// Complex pattern: (vaddr, offset) for plain FLAT accesses.
5870AMDGPUInstructionSelector::selectFlatOffset(MachineOperand &Root) const {
5871  auto PtrWithOffset = selectFlatOffsetImpl(Root, SIInstrFlags::FLAT);
5872
5873  return {{
5874      [=](MachineInstrBuilder &MIB) { MIB.addReg(PtrWithOffset.first); },
5875      [=](MachineInstrBuilder &MIB) { MIB.addImm(PtrWithOffset.second); },
5876  }};
5877}
5878
// Complex pattern: (vaddr, offset) for global (FlatGlobal) accesses.
5880AMDGPUInstructionSelector::selectGlobalOffset(MachineOperand &Root) const {
5881  auto PtrWithOffset = selectFlatOffsetImpl(Root, SIInstrFlags::FlatGlobal);
5882
5883  return {{
5884      [=](MachineInstrBuilder &MIB) { MIB.addReg(PtrWithOffset.first); },
5885      [=](MachineInstrBuilder &MIB) { MIB.addImm(PtrWithOffset.second); },
5886  }};
5887}
5888
// Complex pattern: (vaddr, offset) for scratch (FlatScratch) accesses.
5890AMDGPUInstructionSelector::selectScratchOffset(MachineOperand &Root) const {
5891  auto PtrWithOffset = selectFlatOffsetImpl(Root, SIInstrFlags::FlatScratch);
5892
5893  return {{
5894      [=](MachineInstrBuilder &MIB) { MIB.addReg(PtrWithOffset.first); },
5895      [=](MachineInstrBuilder &MIB) { MIB.addImm(PtrWithOffset.second); },
5896  }};
5897}
5898
5899// Match (64-bit SGPR base) + (zext vgpr offset) + sext(imm offset)
// Worker for the global SADDR complex patterns. Renders (saddr, voffset,
// [offset,] cpol); the immediate offset operand is emitted only when
// \p NeedIOffset is true. \p CPolBits is OR'ed into the cpol operand, along
// with SCAL when the voffset is hardware-scaled.
// NOTE(review): a few call lines below appear truncated in this rendering
// (missing FlatVariant arguments) — confirm against the upstream source.
5901AMDGPUInstructionSelector::selectGlobalSAddr(MachineOperand &Root,
5902                                             unsigned CPolBits,
5903                                             bool NeedIOffset) const {
5904  Register Addr = Root.getReg();
5905  Register PtrBase;
5906  int64_t ConstOffset;
5907  int64_t ImmOffset = 0;
5908
5909  // Match the immediate offset first, which canonically is moved as low as
5910  // possible.
5911  std::tie(PtrBase, ConstOffset, std::ignore) =
5912      getPtrBaseWithConstantOffset(Addr, *MRI);
5913
5914  if (ConstOffset != 0) {
5915    if (NeedIOffset &&
5916        TII.isLegalFLATOffset(ConstOffset, AMDGPUAS::GLOBAL_ADDRESS,
5918      Addr = PtrBase;
5919      ImmOffset = ConstOffset;
5920    } else {
5921      auto PtrBaseDef = getDefSrcRegIgnoringCopies(PtrBase, *MRI);
5922      if (isSGPR(PtrBaseDef->Reg)) {
5923        if (ConstOffset > 0) {
5924          // Offset is too large.
5925          //
5926          // saddr + large_offset -> saddr +
5927          //                         (voffset = large_offset & ~MaxOffset) +
5928          //                         (large_offset & MaxOffset);
5929          int64_t SplitImmOffset = 0, RemainderOffset = ConstOffset;
5930          if (NeedIOffset) {
5931            std::tie(SplitImmOffset, RemainderOffset) =
5932                TII.splitFlatOffset(ConstOffset, AMDGPUAS::GLOBAL_ADDRESS,
5934          }
5935
// The remainder must fit in the 32-bit voffset; signedness depends on
// whether the subtarget treats the global VGPR offset as signed.
5936          if (Subtarget->hasSignedGVSOffset() ? isInt<32>(RemainderOffset)
5937                                              : isUInt<32>(RemainderOffset)) {
5938            MachineInstr *MI = Root.getParent();
5939            MachineBasicBlock *MBB = MI->getParent();
5940            Register HighBits =
5941                MRI->createVirtualRegister(&AMDGPU::VGPR_32RegClass);
5942
5943            BuildMI(*MBB, MI, MI->getDebugLoc(), TII.get(AMDGPU::V_MOV_B32_e32),
5944                    HighBits)
5945                .addImm(RemainderOffset);
5946
5947            if (NeedIOffset)
5948              return {{
5949                  [=](MachineInstrBuilder &MIB) {
5950                    MIB.addReg(PtrBase);
5951                  }, // saddr
5952                  [=](MachineInstrBuilder &MIB) {
5953                    MIB.addReg(HighBits);
5954                  }, // voffset
5955                  [=](MachineInstrBuilder &MIB) { MIB.addImm(SplitImmOffset); },
5956                  [=](MachineInstrBuilder &MIB) { MIB.addImm(CPolBits); },
5957              }};
5958            return {{
5959                [=](MachineInstrBuilder &MIB) { MIB.addReg(PtrBase); }, // saddr
5960                [=](MachineInstrBuilder &MIB) {
5961                  MIB.addReg(HighBits);
5962                }, // voffset
5963                [=](MachineInstrBuilder &MIB) { MIB.addImm(CPolBits); },
5964            }};
5965          }
5966        }
5967
5968        // We are adding a 64 bit SGPR and a constant. If constant bus limit
5969        // is 1 we would need to perform 1 or 2 extra moves for each half of
5970        // the constant and it is better to do a scalar add and then issue a
5971        // single VALU instruction to materialize zero. Otherwise it is less
5972        // instructions to perform VALU adds with immediates or inline literals.
5973        unsigned NumLiterals =
5974            !TII.isInlineConstant(APInt(32, Lo_32(ConstOffset))) +
5975            !TII.isInlineConstant(APInt(32, Hi_32(ConstOffset)));
5976        if (STI.getConstantBusLimit(AMDGPU::V_ADD_U32_e64) > NumLiterals)
5977          return std::nullopt;
5978      }
5979    }
5980  }
5981
5982  // Match the variable offset.
5983  auto AddrDef = getDefSrcRegIgnoringCopies(Addr, *MRI);
5984  if (AddrDef->MI->getOpcode() == AMDGPU::G_PTR_ADD) {
5985    // Look through the SGPR->VGPR copy.
5986    Register SAddr =
5987        getSrcRegIgnoringCopies(AddrDef->MI->getOperand(1).getReg(), *MRI);
5988
5989    if (isSGPR(SAddr)) {
5990      Register PtrBaseOffset = AddrDef->MI->getOperand(2).getReg();
5991
5992      // It's possible voffset is an SGPR here, but the copy to VGPR will be
5993      // inserted later.
5994      bool ScaleOffset = selectScaleOffset(Root, PtrBaseOffset,
5995                                           Subtarget->hasSignedGVSOffset());
5996      if (Register VOffset = matchExtendFromS32OrS32(
5997              PtrBaseOffset, Subtarget->hasSignedGVSOffset())) {
5998        if (NeedIOffset)
5999          return {{[=](MachineInstrBuilder &MIB) { // saddr
6000                     MIB.addReg(SAddr);
6001                   },
6002                   [=](MachineInstrBuilder &MIB) { // voffset
6003                     MIB.addReg(VOffset);
6004                   },
6005                   [=](MachineInstrBuilder &MIB) { // offset
6006                     MIB.addImm(ImmOffset);
6007                   },
6008                   [=](MachineInstrBuilder &MIB) { // cpol
6009                     MIB.addImm(CPolBits |
6010                                (ScaleOffset ? AMDGPU::CPol::SCAL : 0));
6011                   }}};
6012        return {{[=](MachineInstrBuilder &MIB) { // saddr
6013                   MIB.addReg(SAddr);
6014                 },
6015                 [=](MachineInstrBuilder &MIB) { // voffset
6016                   MIB.addReg(VOffset);
6017                 },
6018                 [=](MachineInstrBuilder &MIB) { // cpol
6019                   MIB.addImm(CPolBits |
6020                              (ScaleOffset ? AMDGPU::CPol::SCAL : 0));
6021                 }}};
6022      }
6023    }
6024  }
6025
6026  // FIXME: We should probably have folded COPY (G_IMPLICIT_DEF) earlier, and
6027  // drop this.
6028  if (AddrDef->MI->getOpcode() == AMDGPU::G_IMPLICIT_DEF ||
6029      AddrDef->MI->getOpcode() == AMDGPU::G_CONSTANT || !isSGPR(AddrDef->Reg))
6030    return std::nullopt;
6031
6032  // It's cheaper to materialize a single 32-bit zero for vaddr than the two
6033  // moves required to copy a 64-bit SGPR to VGPR.
6034  MachineInstr *MI = Root.getParent();
6035  MachineBasicBlock *MBB = MI->getParent();
6036  Register VOffset = MRI->createVirtualRegister(&AMDGPU::VGPR_32RegClass);
6037
6038  BuildMI(*MBB, MI, MI->getDebugLoc(), TII.get(AMDGPU::V_MOV_B32_e32), VOffset)
6039      .addImm(0);
6040
6041  if (NeedIOffset)
6042    return {{
6043        [=](MachineInstrBuilder &MIB) { MIB.addReg(AddrDef->Reg); }, // saddr
6044        [=](MachineInstrBuilder &MIB) { MIB.addReg(VOffset); },      // voffset
6045        [=](MachineInstrBuilder &MIB) { MIB.addImm(ImmOffset); },    // offset
6046        [=](MachineInstrBuilder &MIB) { MIB.addImm(CPolBits); }      // cpol
6047    }};
6048  return {{
6049      [=](MachineInstrBuilder &MIB) { MIB.addReg(AddrDef->Reg); }, // saddr
6050      [=](MachineInstrBuilder &MIB) { MIB.addReg(VOffset); },      // voffset
6051      [=](MachineInstrBuilder &MIB) { MIB.addImm(CPolBits); }      // cpol
6052  }};
6053}
6054
// Default SADDR pattern: no extra cache-policy bits.
6056AMDGPUInstructionSelector::selectGlobalSAddr(MachineOperand &Root) const {
6057  return selectGlobalSAddr(Root, 0);
6058}
6059
// SADDR pattern that forwards the intrinsic's cache-policy immediate,
// stripping SCAL (which is derived from offset scaling, not user input).
6061AMDGPUInstructionSelector::selectGlobalSAddrCPol(MachineOperand &Root) const {
6062  const MachineInstr &I = *Root.getParent();
6063
6064  // We are assuming CPol is always the last operand of the intrinsic.
6065  auto PassedCPol =
6066      I.getOperand(I.getNumOperands() - 1).getImm() & ~AMDGPU::CPol::SCAL;
6067  return selectGlobalSAddr(Root, PassedCPol);
6068}
6069
// As selectGlobalSAddrCPol, but for intrinsics whose last operand is M0, so
// the cache-policy immediate is second from last.
6071AMDGPUInstructionSelector::selectGlobalSAddrCPolM0(MachineOperand &Root) const {
6072  const MachineInstr &I = *Root.getParent();
6073
6074  // We are assuming CPol is second from last operand of the intrinsic.
6075  auto PassedCPol =
6076      I.getOperand(I.getNumOperands() - 2).getImm() & ~AMDGPU::CPol::SCAL;
6077  return selectGlobalSAddr(Root, PassedCPol);
6078}
6079
// SADDR pattern with the GLC cache-policy bit forced on.
6081AMDGPUInstructionSelector::selectGlobalSAddrGLC(MachineOperand &Root) const {
6082  return selectGlobalSAddr(Root, AMDGPU::CPol::GLC);
6083}
6084
// SADDR pattern without an immediate-offset operand (NeedIOffset=false);
// forwards the intrinsic's cache-policy immediate minus SCAL.
6086AMDGPUInstructionSelector::selectGlobalSAddrNoIOffset(
6087    MachineOperand &Root) const {
6088  const MachineInstr &I = *Root.getParent();
6089
6090  // We are assuming CPol is always the last operand of the intrinsic.
6091  auto PassedCPol =
6092      I.getOperand(I.getNumOperands() - 1).getImm() & ~AMDGPU::CPol::SCAL;
6093  return selectGlobalSAddr(Root, PassedCPol, false);
6094}
6095
// As selectGlobalSAddrNoIOffset, but the cache-policy immediate is second
// from last (the last operand is M0).
6097AMDGPUInstructionSelector::selectGlobalSAddrNoIOffsetM0(
6098    MachineOperand &Root) const {
6099  const MachineInstr &I = *Root.getParent();
6100
6101  // We are assuming CPol is second from last operand of the intrinsic.
6102  auto PassedCPol =
6103      I.getOperand(I.getNumOperands() - 2).getImm() & ~AMDGPU::CPol::SCAL;
6104  return selectGlobalSAddr(Root, PassedCPol, false);
6105}
6106
// Scratch SADDR complex pattern: render (saddr, offset) where saddr is an
// SGPR or a frame index, folding a legal constant offset and a
// frame-index + SGPR add where possible.
// NOTE(review): the isLegalFLATOffset call below appears truncated in this
// rendering (missing the FlatVariant argument) — confirm upstream.
6108AMDGPUInstructionSelector::selectScratchSAddr(MachineOperand &Root) const {
6109  Register Addr = Root.getReg();
6110  Register PtrBase;
6111  int64_t ConstOffset;
6112  int64_t ImmOffset = 0;
6113
6114  // Match the immediate offset first, which canonically is moved as low as
6115  // possible.
6116  std::tie(PtrBase, ConstOffset, std::ignore) =
6117      getPtrBaseWithConstantOffset(Addr, *MRI);
6118
6119  if (ConstOffset != 0 && isFlatScratchBaseLegal(Addr) &&
6120      TII.isLegalFLATOffset(ConstOffset, AMDGPUAS::PRIVATE_ADDRESS,
6122    Addr = PtrBase;
6123    ImmOffset = ConstOffset;
6124  }
6125
6126  auto AddrDef = getDefSrcRegIgnoringCopies(Addr, *MRI);
6127  if (AddrDef->MI->getOpcode() == AMDGPU::G_FRAME_INDEX) {
6128    int FI = AddrDef->MI->getOperand(1).getIndex();
6129    return {{
6130        [=](MachineInstrBuilder &MIB) { MIB.addFrameIndex(FI); }, // saddr
6131        [=](MachineInstrBuilder &MIB) { MIB.addImm(ImmOffset); } // offset
6132    }};
6133  }
6134
6135  Register SAddr = AddrDef->Reg;
6136
// Fold (frame_index + sgpr) into a scalar add so the result can be used as
// the single saddr operand.
6137  if (AddrDef->MI->getOpcode() == AMDGPU::G_PTR_ADD) {
6138    Register LHS = AddrDef->MI->getOperand(1).getReg();
6139    Register RHS = AddrDef->MI->getOperand(2).getReg();
6140    auto LHSDef = getDefSrcRegIgnoringCopies(LHS, *MRI);
6141    auto RHSDef = getDefSrcRegIgnoringCopies(RHS, *MRI);
6142
6143    if (LHSDef->MI->getOpcode() == AMDGPU::G_FRAME_INDEX &&
6144        isSGPR(RHSDef->Reg)) {
6145      int FI = LHSDef->MI->getOperand(1).getIndex();
6146      MachineInstr &I = *Root.getParent();
6147      MachineBasicBlock *BB = I.getParent();
6148      const DebugLoc &DL = I.getDebugLoc();
6149      SAddr = MRI->createVirtualRegister(&AMDGPU::SReg_32RegClass);
6150
6151      BuildMI(*BB, &I, DL, TII.get(AMDGPU::S_ADD_I32), SAddr)
6152          .addFrameIndex(FI)
6153          .addReg(RHSDef->Reg)
6154          .setOperandDead(3); // Dead scc
6155    }
6156  }
6157
6158  if (!isSGPR(SAddr))
6159    return std::nullopt;
6160
6161  return {{
6162      [=](MachineInstrBuilder &MIB) { MIB.addReg(SAddr); }, // saddr
6163      [=](MachineInstrBuilder &MIB) { MIB.addImm(ImmOffset); } // offset
6164  }};
6165}
6166
6167// Check whether the flat scratch SVS swizzle bug affects this access.
// Returns true (i.e. the SVS form must be avoided) when a carry out of the
// low two bits of voffset + (soffset + inst_offset) cannot be ruled out via
// known-bits analysis.
6168bool AMDGPUInstructionSelector::checkFlatScratchSVSSwizzleBug(
6169    Register VAddr, Register SAddr, uint64_t ImmOffset) const {
6170  if (!Subtarget->hasFlatScratchSVSSwizzleBug())
6171    return false;
6172
6173  // The bug affects the swizzling of SVS accesses if there is any carry out
6174  // from the two low order bits (i.e. from bit 1 into bit 2) when adding
6175  // voffset to (soffset + inst_offset).
6176  auto VKnown = VT->getKnownBits(VAddr);
6177  auto SKnown = KnownBits::add(VT->getKnownBits(SAddr),
6178                               KnownBits::makeConstant(APInt(32, ImmOffset)));
6179  uint64_t VMax = VKnown.getMaxValue().getZExtValue();
6180  uint64_t SMax = SKnown.getMaxValue().getZExtValue();
// Conservative: use the maximum possible low-two-bit values of each addend.
6181  return (VMax & 3) + (SMax & 3) >= 4;
6182}
6183
// Scratch SVS complex pattern: render (vaddr, saddr, offset, cpol) for a
// VGPR + SGPR (or frame index) scratch address, folding a legal constant
// offset and setting SCAL in cpol when the VGPR offset is scaled.
// NOTE(review): the isLegalFLATOffset call and the CPol ternary below appear
// truncated in this rendering — confirm against the upstream source.
6185AMDGPUInstructionSelector::selectScratchSVAddr(MachineOperand &Root) const {
6186  Register Addr = Root.getReg();
6187  Register PtrBase;
6188  int64_t ConstOffset;
6189  int64_t ImmOffset = 0;
6190
6191  // Match the immediate offset first, which canonically is moved as low as
6192  // possible.
6193  std::tie(PtrBase, ConstOffset, std::ignore) =
6194      getPtrBaseWithConstantOffset(Addr, *MRI);
6195
6196  Register OrigAddr = Addr;
6197  if (ConstOffset != 0 &&
6198      TII.isLegalFLATOffset(ConstOffset, AMDGPUAS::PRIVATE_ADDRESS,
6200    Addr = PtrBase;
6201    ImmOffset = ConstOffset;
6202  }
6203
6204  auto AddrDef = getDefSrcRegIgnoringCopies(Addr, *MRI);
6205  if (AddrDef->MI->getOpcode() != AMDGPU::G_PTR_ADD)
6206    return std::nullopt;
6207
// RHS must live in a VGPR to serve as the vaddr operand.
6208  Register RHS = AddrDef->MI->getOperand(2).getReg();
6209  if (RBI.getRegBank(RHS, *MRI, TRI)->getID() != AMDGPU::VGPRRegBankID)
6210    return std::nullopt;
6211
6212  Register LHS = AddrDef->MI->getOperand(1).getReg();
6213  auto LHSDef = getDefSrcRegIgnoringCopies(LHS, *MRI);
6214
// Legality check differs depending on whether a constant offset was folded.
6215  if (OrigAddr != Addr) {
6216    if (!isFlatScratchBaseLegalSVImm(OrigAddr))
6217      return std::nullopt;
6218  } else {
6219    if (!isFlatScratchBaseLegalSV(OrigAddr))
6220      return std::nullopt;
6221  }
6222
6223  if (checkFlatScratchSVSSwizzleBug(RHS, LHS, ImmOffset))
6224    return std::nullopt;
6225
6226  unsigned CPol = selectScaleOffset(Root, RHS, true /* IsSigned */)
6228                      : 0;
6229
6230  if (LHSDef->MI->getOpcode() == AMDGPU::G_FRAME_INDEX) {
6231    int FI = LHSDef->MI->getOperand(1).getIndex();
6232    return {{
6233        [=](MachineInstrBuilder &MIB) { MIB.addReg(RHS); }, // vaddr
6234        [=](MachineInstrBuilder &MIB) { MIB.addFrameIndex(FI); }, // saddr
6235        [=](MachineInstrBuilder &MIB) { MIB.addImm(ImmOffset); }, // offset
6236        [=](MachineInstrBuilder &MIB) { MIB.addImm(CPol); } // cpol
6237    }};
6238  }
6239
6240  if (!isSGPR(LHS))
6241    if (auto Def = getDefSrcRegIgnoringCopies(LHS, *MRI))
6242      LHS = Def->Reg;
6243
6244  if (!isSGPR(LHS))
6245    return std::nullopt;
6246
6247  return {{
6248      [=](MachineInstrBuilder &MIB) { MIB.addReg(RHS); }, // vaddr
6249      [=](MachineInstrBuilder &MIB) { MIB.addReg(LHS); }, // saddr
6250      [=](MachineInstrBuilder &MIB) { MIB.addImm(ImmOffset); }, // offset
6251      [=](MachineInstrBuilder &MIB) { MIB.addImm(CPol); } // cpol
6252  }};
6253}
6254
// MUBUF scratch OFFEN complex pattern: render (rsrc, vaddr, soffset, offset)
// for scratch accesses addressed through a VGPR. Handles a pure constant
// address (split into high bits in a VGPR plus a legal immediate) and a
// pointer + constant-offset form, folding frame indexes into vaddr.
// NOTE(review): the condition guarding the constant-address path appears
// truncated in this rendering — confirm against the upstream source.
6256AMDGPUInstructionSelector::selectMUBUFScratchOffen(MachineOperand &Root) const {
6257  MachineInstr *MI = Root.getParent();
6258  MachineBasicBlock *MBB = MI->getParent();
6259  MachineFunction *MF = MBB->getParent();
6260  const SIMachineFunctionInfo *Info = MF->getInfo<SIMachineFunctionInfo>();
6261
6262  int64_t Offset = 0;
6263  if (mi_match(Root.getReg(), *MRI, m_ICst(Offset)) &&
6265    Register HighBits = MRI->createVirtualRegister(&AMDGPU::VGPR_32RegClass);
6266
6267    // TODO: Should this be inside the render function? The iterator seems to
6268    // move.
6269    const uint32_t MaxOffset = SIInstrInfo::getMaxMUBUFImmOffset(*Subtarget);
6270    BuildMI(*MBB, MI, MI->getDebugLoc(), TII.get(AMDGPU::V_MOV_B32_e32),
6271            HighBits)
6272        .addImm(Offset & ~MaxOffset);
6273
6274    return {{[=](MachineInstrBuilder &MIB) { // rsrc
6275               MIB.addReg(Info->getScratchRSrcReg());
6276             },
6277             [=](MachineInstrBuilder &MIB) { // vaddr
6278               MIB.addReg(HighBits);
6279             },
6280             [=](MachineInstrBuilder &MIB) { // soffset
6281               // Use constant zero for soffset and rely on eliminateFrameIndex
6282               // to choose the appropriate frame register if need be.
6283               MIB.addImm(0);
6284             },
6285             [=](MachineInstrBuilder &MIB) { // offset
6286               MIB.addImm(Offset & MaxOffset);
6287             }}};
6288  }
6289
6290  assert(Offset == 0 || Offset == -1);
6291
6292  // Try to fold a frame index directly into the MUBUF vaddr field, and any
6293  // offsets.
6294  std::optional<int> FI;
6295  Register VAddr = Root.getReg();
6296
6297  const MachineInstr *RootDef = MRI->getVRegDef(Root.getReg());
6298  Register PtrBase;
6299  int64_t ConstOffset;
6300  std::tie(PtrBase, ConstOffset, std::ignore) =
6301      getPtrBaseWithConstantOffset(VAddr, *MRI);
6302  if (ConstOffset != 0) {
// Range-checked private memory additionally requires the base to be provably
// non-negative for the fold to be safe.
6303    if (TII.isLegalMUBUFImmOffset(ConstOffset) &&
6304        (!STI.privateMemoryResourceIsRangeChecked() ||
6305         VT->signBitIsZero(PtrBase))) {
6306      const MachineInstr *PtrBaseDef = MRI->getVRegDef(PtrBase);
6307      if (PtrBaseDef->getOpcode() == AMDGPU::G_FRAME_INDEX)
6308        FI = PtrBaseDef->getOperand(1).getIndex();
6309      else
6310        VAddr = PtrBase;
6311      Offset = ConstOffset;
6312    }
6313  } else if (RootDef->getOpcode() == AMDGPU::G_FRAME_INDEX) {
6314    FI = RootDef->getOperand(1).getIndex();
6315  }
6316
6317  return {{[=](MachineInstrBuilder &MIB) { // rsrc
6318             MIB.addReg(Info->getScratchRSrcReg());
6319           },
6320           [=](MachineInstrBuilder &MIB) { // vaddr
6321             if (FI)
6322               MIB.addFrameIndex(*FI);
6323             else
6324               MIB.addReg(VAddr);
6325           },
6326           [=](MachineInstrBuilder &MIB) { // soffset
6327             // Use constant zero for soffset and rely on eliminateFrameIndex
6328             // to choose the appropriate frame register if need be.
6329             MIB.addImm(0);
6330           },
6331           [=](MachineInstrBuilder &MIB) { // offset
6332             MIB.addImm(Offset);
6333           }}};
6334}
6335
// Return true if \p Offset can be used as the 16-bit unsigned immediate of a
// DS instruction with base \p Base. On targets without a usable DS offset
// (SI), additionally require the base to be provably non-negative.
6336bool AMDGPUInstructionSelector::isDSOffsetLegal(Register Base,
6337                                                int64_t Offset) const {
6338  if (!isUInt<16>(Offset))
6339    return false;
6340
6341  if (STI.hasUsableDSOffset() || STI.unsafeDSOffsetFoldingEnabled())
6342    return true;
6343
6344  // On Southern Islands instruction with a negative base value and an offset
6345  // don't seem to work.
6346  return VT->signBitIsZero(Base);
6347}
6348
// Legality check for the two-offset DS instructions (e.g. ds_read2): each
// offset must be a multiple of the element \p Size and fit in 8 bits after
// dividing by it; SI additionally needs a non-negative base.
6349bool AMDGPUInstructionSelector::isDSOffset2Legal(Register Base, int64_t Offset0,
6350                                                 int64_t Offset1,
6351                                                 unsigned Size) const {
6352  if (Offset0 % Size != 0 || Offset1 % Size != 0)
6353    return false;
6354  if (!isUInt<8>(Offset0 / Size) || !isUInt<8>(Offset1 / Size))
6355    return false;
6356
6357  if (STI.hasUsableDSOffset() || STI.unsafeDSOffsetFoldingEnabled())
6358    return true;
6359
6360  // On Southern Islands instruction with a negative base value and an offset
6361  // don't seem to work.
6362  return VT->signBitIsZero(Base);
6363}
6364
6365// Return whether the operation has NoUnsignedWrap property.
// G_OR counts because disjoint-bit ORs are used as adds that cannot wrap.
// NOTE(review): the G_PTR_ADD arm appears truncated in this rendering
// (presumably checking the NoUWrap flag) — confirm against upstream.
6366static bool isNoUnsignedWrap(MachineInstr *Addr) {
6367  return Addr->getOpcode() == TargetOpcode::G_OR ||
6368         (Addr->getOpcode() == TargetOpcode::G_PTR_ADD &&
6370}
6371
6372// Check that the base address of flat scratch load/store in the form of `base +
6373// offset` is legal to be put in SGPR/VGPR (i.e. unsigned per hardware
6374// requirement). We always treat the first operand as the base address here.
// NOTE(review): the constant-lookup line below appears truncated in this
// rendering — confirm against the upstream source.
6375bool AMDGPUInstructionSelector::isFlatScratchBaseLegal(Register Addr) const {
6376  MachineInstr *AddrMI = getDefIgnoringCopies(Addr, *MRI);
6377
6378  if (isNoUnsignedWrap(AddrMI))
6379    return true;
6380
6381  // Starting with GFX12, VADDR and SADDR fields in VSCRATCH can use negative
6382  // values.
6383  if (STI.hasSignedScratchOffsets())
6384    return true;
6385
6386  Register LHS = AddrMI->getOperand(1).getReg();
6387  Register RHS = AddrMI->getOperand(2).getReg();
6388
6389  if (AddrMI->getOpcode() == TargetOpcode::G_PTR_ADD) {
6390    std::optional<ValueAndVReg> RhsValReg =
6392    // If the immediate offset is negative and within certain range, the base
6393    // address cannot also be negative. If the base is also negative, the sum
6394    // would be either negative or much larger than the valid range of scratch
6395    // memory a thread can access.
6396    if (RhsValReg && RhsValReg->Value.getSExtValue() < 0 &&
6397        RhsValReg->Value.getSExtValue() > -0x40000000)
6398      return true;
6399  }
6400
// Fall back to proving the base itself is non-negative.
6401  return VT->signBitIsZero(LHS);
6402}
6403
6404// Check address value in SGPR/VGPR are legal for flat scratch in the form
6405// of: SGPR + VGPR.
// Both halves must be provably non-negative unless wrapping is excluded or
// the target (GFX12+) supports signed scratch offsets.
6406bool AMDGPUInstructionSelector::isFlatScratchBaseLegalSV(Register Addr) const {
6407  MachineInstr *AddrMI = getDefIgnoringCopies(Addr, *MRI);
6408
6409  if (isNoUnsignedWrap(AddrMI))
6410    return true;
6411
6412  // Starting with GFX12, VADDR and SADDR fields in VSCRATCH can use negative
6413  // values.
6414  if (STI.hasSignedScratchOffsets())
6415    return true;
6416
6417  Register LHS = AddrMI->getOperand(1).getReg();
6418  Register RHS = AddrMI->getOperand(2).getReg();
6419  return VT->signBitIsZero(RHS) && VT->signBitIsZero(LHS);
6420}
6421
6422// Check address value in SGPR/VGPR are legal for flat scratch in the form
6423// of: SGPR + VGPR + Imm.
// NOTE(review): the two lookup lines below appear truncated in this
// rendering (def-source and constant lookups) — confirm against upstream.
6424bool AMDGPUInstructionSelector::isFlatScratchBaseLegalSVImm(
6425    Register Addr) const {
6426  // Starting with GFX12, VADDR and SADDR fields in VSCRATCH can use negative
6427  // values.
6428  if (STI.hasSignedScratchOffsets())
6429    return true;
6430
6431  MachineInstr *AddrMI = getDefIgnoringCopies(Addr, *MRI);
6432  Register Base = AddrMI->getOperand(1).getReg();
6433  std::optional<DefinitionAndSourceRegister> BaseDef =
6435  std::optional<ValueAndVReg> RHSOffset =
6437  assert(RHSOffset);
6438
6439  // If the immediate offset is negative and within certain range, the base
6440  // address cannot also be negative. If the base is also negative, the sum
6441  // would be either negative or much larger than the valid range of scratch
6442  // memory a thread can access.
6443  if (isNoUnsignedWrap(BaseDef->MI) &&
6444      (isNoUnsignedWrap(AddrMI) ||
6445       (RHSOffset->Value.getSExtValue() < 0 &&
6446        RHSOffset->Value.getSExtValue() > -0x40000000)))
6447    return true;
6448
6449  Register LHS = BaseDef->MI->getOperand(1).getReg();
6450  Register RHS = BaseDef->MI->getOperand(2).getReg();
6451  return VT->signBitIsZero(RHS) && VT->signBitIsZero(LHS);
6452}
6453
// Return true if the G_AND \p MI masking a shift amount is redundant, i.e.
// the mask already preserves at least \p ShAmtBits low bits (either the
// constant mask itself has enough trailing ones, or known-zero bits of the
// other operand make up the difference).
6454bool AMDGPUInstructionSelector::isUnneededShiftMask(const MachineInstr &MI,
6455                                                    unsigned ShAmtBits) const {
6456  assert(MI.getOpcode() == TargetOpcode::G_AND);
6457
6458  std::optional<APInt> RHS =
6459      getIConstantVRegVal(MI.getOperand(2).getReg(), *MRI);
6460  if (!RHS)
6461    return false;
6462
6463  if (RHS->countr_one() >= ShAmtBits)
6464    return true;
6465
6466  const APInt &LHSKnownZeros = VT->getKnownZeroes(MI.getOperand(1).getReg());
6467  return (LHSKnownZeros | *RHS).countr_one() >= ShAmtBits;
6468}
6469
// MUBUF scratch OFFSET (no vaddr) complex pattern: render (rsrc, soffset,
// offset). Handles a wave-address base (wave-level SGPR in soffset), a
// wave-address + constant form, and a plain constant offset.
// NOTE(review): the G_PTR_ADD matcher lines below appear truncated in this
// rendering — confirm against the upstream source.
6471AMDGPUInstructionSelector::selectMUBUFScratchOffset(
6472    MachineOperand &Root) const {
6473  Register Reg = Root.getReg();
6474  const SIMachineFunctionInfo *Info = MF->getInfo<SIMachineFunctionInfo>();
6475
6476  std::optional<DefinitionAndSourceRegister> Def =
6478  assert(Def && "this shouldn't be an optional result");
6479  Reg = Def->Reg;
6480
6481  if (Register WaveBase = getWaveAddress(Def->MI)) {
6482    return {{
6483        [=](MachineInstrBuilder &MIB) { // rsrc
6484          MIB.addReg(Info->getScratchRSrcReg());
6485        },
6486        [=](MachineInstrBuilder &MIB) { // soffset
6487          MIB.addReg(WaveBase);
6488        },
6489        [=](MachineInstrBuilder &MIB) { MIB.addImm(0); } // offset
6490    }};
6491  }
6492
6493  int64_t Offset = 0;
6494
6495  // FIXME: Copy check is a hack
6497  if (mi_match(Reg, *MRI,
6498               m_GPtrAdd(m_Reg(BasePtr),
6500    if (!TII.isLegalMUBUFImmOffset(Offset))
6501      return {};
6502    MachineInstr *BasePtrDef = getDefIgnoringCopies(BasePtr, *MRI);
6503    Register WaveBase = getWaveAddress(BasePtrDef);
6504    if (!WaveBase)
6505      return {};
6506
6507    return {{
6508        [=](MachineInstrBuilder &MIB) { // rsrc
6509          MIB.addReg(Info->getScratchRSrcReg());
6510        },
6511        [=](MachineInstrBuilder &MIB) { // soffset
6512          MIB.addReg(WaveBase);
6513        },
6514        [=](MachineInstrBuilder &MIB) { MIB.addImm(Offset); } // offset
6515    }};
6516  }
6517
// Plain constant address: must fit the MUBUF immediate offset field.
6518  if (!mi_match(Root.getReg(), *MRI, m_ICst(Offset)) ||
6519      !TII.isLegalMUBUFImmOffset(Offset))
6520    return {};
6521
6522  return {{
6523      [=](MachineInstrBuilder &MIB) { // rsrc
6524        MIB.addReg(Info->getScratchRSrcReg());
6525      },
6526      [=](MachineInstrBuilder &MIB) { // soffset
6527        MIB.addImm(0);
6528      },
6529      [=](MachineInstrBuilder &MIB) { MIB.addImm(Offset); } // offset
6530  }};
6531}
6532
6533std::pair<Register, unsigned>
6534AMDGPUInstructionSelector::selectDS1Addr1OffsetImpl(MachineOperand &Root) const {
6535 const MachineInstr *RootDef = MRI->getVRegDef(Root.getReg());
6536 int64_t ConstAddr = 0;
6537
6538 Register PtrBase;
6539 int64_t Offset;
6540 std::tie(PtrBase, Offset, std::ignore) =
6541 getPtrBaseWithConstantOffset(Root.getReg(), *MRI);
6542
6543 if (Offset) {
6544 if (isDSOffsetLegal(PtrBase, Offset)) {
6545 // (add n0, c0)
6546 return std::pair(PtrBase, Offset);
6547 }
6548 } else if (RootDef->getOpcode() == AMDGPU::G_SUB) {
6549 // TODO
6550
6551
6552 } else if (mi_match(Root.getReg(), *MRI, m_ICst(ConstAddr))) {
6553 // TODO
6554
6555 }
6556
6557 return std::pair(Root.getReg(), 0);
6558}
6559
6561AMDGPUInstructionSelector::selectDS1Addr1Offset(MachineOperand &Root) const {
6562 Register Reg;
6563 unsigned Offset;
6564 std::tie(Reg, Offset) = selectDS1Addr1OffsetImpl(Root);
6565 return {{
6566 [=](MachineInstrBuilder &MIB) { MIB.addReg(Reg); },
6567 [=](MachineInstrBuilder &MIB) { MIB.addImm(Offset); }
6568 }};
6569}
6570
6572AMDGPUInstructionSelector::selectDS64Bit4ByteAligned(MachineOperand &Root) const {
6573 return selectDSReadWrite2(Root, 4);
6574}
6575
6577AMDGPUInstructionSelector::selectDS128Bit8ByteAligned(MachineOperand &Root) const {
6578 return selectDSReadWrite2(Root, 8);
6579}
6580
6582AMDGPUInstructionSelector::selectDSReadWrite2(MachineOperand &Root,
6583 unsigned Size) const {
6584 Register Reg;
6585 unsigned Offset;
6586 std::tie(Reg, Offset) = selectDSReadWrite2Impl(Root, Size);
6587 return {{
6588 [=](MachineInstrBuilder &MIB) { MIB.addReg(Reg); },
6589 [=](MachineInstrBuilder &MIB) { MIB.addImm(Offset); },
6590 [=](MachineInstrBuilder &MIB) { MIB.addImm(Offset+1); }
6591 }};
6592}
6593
6594std::pair<Register, unsigned>
6595AMDGPUInstructionSelector::selectDSReadWrite2Impl(MachineOperand &Root,
6596 unsigned Size) const {
6597 const MachineInstr *RootDef = MRI->getVRegDef(Root.getReg());
6598 int64_t ConstAddr = 0;
6599
6600 Register PtrBase;
6601 int64_t Offset;
6602 std::tie(PtrBase, Offset, std::ignore) =
6603 getPtrBaseWithConstantOffset(Root.getReg(), *MRI);
6604
6605 if (Offset) {
6606 int64_t OffsetValue0 = Offset;
6607 int64_t OffsetValue1 = Offset + Size;
6608 if (isDSOffset2Legal(PtrBase, OffsetValue0, OffsetValue1, Size)) {
6609 // (add n0, c0)
6610 return std::pair(PtrBase, OffsetValue0 / Size);
6611 }
6612 } else if (RootDef->getOpcode() == AMDGPU::G_SUB) {
6613 // TODO
6614
6615 } else if (mi_match(Root.getReg(), *MRI, m_ICst(ConstAddr))) {
6616 // TODO
6617
6618 }
6619
6620 return std::pair(Root.getReg(), 0);
6621}
6622
6623/// If \p Root is a G_PTR_ADD with a G_CONSTANT on the right hand side, return
6624/// the base value with the constant offset, and if the offset computation is
6625/// known to be inbounds. There may be intervening copies between \p Root and
6626/// the identified constant. Returns \p Root, 0, false if this does not match
6627/// the pattern.
6628std::tuple<Register, int64_t, bool>
6629AMDGPUInstructionSelector::getPtrBaseWithConstantOffset(
6630 Register Root, const MachineRegisterInfo &MRI) const {
6631 MachineInstr *RootI = getDefIgnoringCopies(Root, MRI);
6632 if (RootI->getOpcode() != TargetOpcode::G_PTR_ADD)
6633 return {Root, 0, false};
6634
6635 MachineOperand &RHS = RootI->getOperand(2);
6636 std::optional<ValueAndVReg> MaybeOffset =
6638 if (!MaybeOffset)
6639 return {Root, 0, false};
6640 bool IsInBounds = RootI->getFlag(MachineInstr::MIFlag::InBounds);
6641 return {RootI->getOperand(1).getReg(), MaybeOffset->Value.getSExtValue(),
6642 IsInBounds};
6643}
6644
6646 MIB.addImm(0);
6647}
6648
6649/// Return a resource descriptor for use with an arbitrary 64-bit pointer. If \p
6650/// BasePtr is not valid, a null base pointer will be used.
6652 uint32_t FormatLo, uint32_t FormatHi,
6653 Register BasePtr) {
6654 Register RSrc2 = MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass);
6655 Register RSrc3 = MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass);
6656 Register RSrcHi = MRI.createVirtualRegister(&AMDGPU::SReg_64RegClass);
6657 Register RSrc = MRI.createVirtualRegister(&AMDGPU::SGPR_128RegClass);
6658
6659 B.buildInstr(AMDGPU::S_MOV_B32)
6660 .addDef(RSrc2)
6661 .addImm(FormatLo);
6662 B.buildInstr(AMDGPU::S_MOV_B32)
6663 .addDef(RSrc3)
6664 .addImm(FormatHi);
6665
6666 // Build the half of the subregister with the constants before building the
6667 // full 128-bit register. If we are building multiple resource descriptors,
6668 // this will allow CSEing of the 2-component register.
6669 B.buildInstr(AMDGPU::REG_SEQUENCE)
6670 .addDef(RSrcHi)
6671 .addReg(RSrc2)
6672 .addImm(AMDGPU::sub0)
6673 .addReg(RSrc3)
6674 .addImm(AMDGPU::sub1);
6675
6676 Register RSrcLo = BasePtr;
6677 if (!BasePtr) {
6678 RSrcLo = MRI.createVirtualRegister(&AMDGPU::SReg_64RegClass);
6679 B.buildInstr(AMDGPU::S_MOV_B64)
6680 .addDef(RSrcLo)
6681 .addImm(0);
6682 }
6683
6684 B.buildInstr(AMDGPU::REG_SEQUENCE)
6685 .addDef(RSrc)
6686 .addReg(RSrcLo)
6687 .addImm(AMDGPU::sub0_sub1)
6688 .addReg(RSrcHi)
6689 .addImm(AMDGPU::sub2_sub3);
6690
6691 return RSrc;
6692}
6693
6695 const SIInstrInfo &TII, Register BasePtr) {
6696 uint64_t DefaultFormat = TII.getDefaultRsrcDataFormat();
6697
6698 // FIXME: Why are half the "default" bits ignored based on the addressing
6699 // mode?
6700 return buildRSRC(B, MRI, 0, Hi_32(DefaultFormat), BasePtr);
6701}
6702
6704 const SIInstrInfo &TII, Register BasePtr) {
6705 uint64_t DefaultFormat = TII.getDefaultRsrcDataFormat();
6706
6707 // FIXME: Why are half the "default" bits ignored based on the addressing
6708 // mode?
6709 return buildRSRC(B, MRI, -1, Hi_32(DefaultFormat), BasePtr);
6710}
6711
6712AMDGPUInstructionSelector::MUBUFAddressData
6713AMDGPUInstructionSelector::parseMUBUFAddress(Register Src) const {
6714 MUBUFAddressData Data;
6715 Data.N0 = Src;
6716
6717 Register PtrBase;
6718 int64_t Offset;
6719
6720 std::tie(PtrBase, Offset, std::ignore) =
6721 getPtrBaseWithConstantOffset(Src, *MRI);
6722 if (isUInt<32>(Offset)) {
6723 Data.N0 = PtrBase;
6724 Data.Offset = Offset;
6725 }
6726
6727 if (MachineInstr *InputAdd
6728 = getOpcodeDef(TargetOpcode::G_PTR_ADD, Data.N0, *MRI)) {
6729 Data.N2 = InputAdd->getOperand(1).getReg();
6730 Data.N3 = InputAdd->getOperand(2).getReg();
6731
6732 // FIXME: Need to fix extra SGPR->VGPRcopies inserted
6733 // FIXME: Don't know this was defined by operand 0
6734 //
6735 // TODO: Remove this when we have copy folding optimizations after
6736 // RegBankSelect.
6737 Data.N2 = getDefIgnoringCopies(Data.N2, *MRI)->getOperand(0).getReg();
6738 Data.N3 = getDefIgnoringCopies(Data.N3, *MRI)->getOperand(0).getReg();
6739 }
6740
6741 return Data;
6742}
6743
6744/// Return if the addr64 mubuf mode should be used for the given address.
6745bool AMDGPUInstructionSelector::shouldUseAddr64(MUBUFAddressData Addr) const {
6746 // (ptr_add N2, N3) -> addr64, or
6747 // (ptr_add (ptr_add N2, N3), C1) -> addr64
6748 if (Addr.N2)
6749 return true;
6750
6751 const RegisterBank *N0Bank = RBI.getRegBank(Addr.N0, *MRI, TRI);
6752 return N0Bank->getID() == AMDGPU::VGPRRegBankID;
6753}
6754
6755/// Split an immediate offset \p ImmOffset depending on whether it fits in the
6756/// immediate field. Modifies \p ImmOffset and sets \p SOffset to the variable
6757/// component.
6758void AMDGPUInstructionSelector::splitIllegalMUBUFOffset(
6759 MachineIRBuilder &B, Register &SOffset, int64_t &ImmOffset) const {
6760 if (TII.isLegalMUBUFImmOffset(ImmOffset))
6761 return;
6762
6763 // Illegal offset, store it in soffset.
6764 SOffset = MRI->createVirtualRegister(&AMDGPU::SReg_32RegClass);
6765 B.buildInstr(AMDGPU::S_MOV_B32)
6766 .addDef(SOffset)
6767 .addImm(ImmOffset);
6768 ImmOffset = 0;
6769}
6770
6771bool AMDGPUInstructionSelector::selectMUBUFAddr64Impl(
6772 MachineOperand &Root, Register &VAddr, Register &RSrcReg,
6773 Register &SOffset, int64_t &Offset) const {
6774 // FIXME: Predicates should stop this from reaching here.
6775 // addr64 bit was removed for volcanic islands.
6776 if (!STI.hasAddr64() || STI.useFlatForGlobal())
6777 return false;
6778
6779 MUBUFAddressData AddrData = parseMUBUFAddress(Root.getReg());
6780 if (!shouldUseAddr64(AddrData))
6781 return false;
6782
6783 Register N0 = AddrData.N0;
6784 Register N2 = AddrData.N2;
6785 Register N3 = AddrData.N3;
6786 Offset = AddrData.Offset;
6787
6788 // Base pointer for the SRD.
6789 Register SRDPtr;
6790
6791 if (N2) {
6792 if (RBI.getRegBank(N2, *MRI, TRI)->getID() == AMDGPU::VGPRRegBankID) {
6793 assert(N3);
6794 if (RBI.getRegBank(N3, *MRI, TRI)->getID() == AMDGPU::VGPRRegBankID) {
6795 // Both N2 and N3 are divergent. Use N0 (the result of the add) as the
6796 // addr64, and construct the default resource from a 0 address.
6797 VAddr = N0;
6798 } else {
6799 SRDPtr = N3;
6800 VAddr = N2;
6801 }
6802 } else {
6803 // N2 is not divergent.
6804 SRDPtr = N2;
6805 VAddr = N3;
6806 }
6807 } else if (RBI.getRegBank(N0, *MRI, TRI)->getID() == AMDGPU::VGPRRegBankID) {
6808 // Use the default null pointer in the resource
6809 VAddr = N0;
6810 } else {
6811 // N0 -> offset, or
6812 // (N0 + C1) -> offset
6813 SRDPtr = N0;
6814 }
6815
6816 MachineIRBuilder B(*Root.getParent());
6817 RSrcReg = buildAddr64RSrc(B, *MRI, TII, SRDPtr);
6818 splitIllegalMUBUFOffset(B, SOffset, Offset);
6819 return true;
6820}
6821
6822bool AMDGPUInstructionSelector::selectMUBUFOffsetImpl(
6823 MachineOperand &Root, Register &RSrcReg, Register &SOffset,
6824 int64_t &Offset) const {
6825
6826 // FIXME: Pattern should not reach here.
6827 if (STI.useFlatForGlobal())
6828 return false;
6829
6830 MUBUFAddressData AddrData = parseMUBUFAddress(Root.getReg());
6831 if (shouldUseAddr64(AddrData))
6832 return false;
6833
6834 // N0 -> offset, or
6835 // (N0 + C1) -> offset
6836 Register SRDPtr = AddrData.N0;
6837 Offset = AddrData.Offset;
6838
6839 // TODO: Look through extensions for 32-bit soffset.
6840 MachineIRBuilder B(*Root.getParent());
6841
6842 RSrcReg = buildOffsetSrc(B, *MRI, TII, SRDPtr);
6843 splitIllegalMUBUFOffset(B, SOffset, Offset);
6844 return true;
6845}
6846
6848AMDGPUInstructionSelector::selectMUBUFAddr64(MachineOperand &Root) const {
6849 Register VAddr;
6850 Register RSrcReg;
6851 Register SOffset;
6852 int64_t Offset = 0;
6853
6854 if (!selectMUBUFAddr64Impl(Root, VAddr, RSrcReg, SOffset, Offset))
6855 return {};
6856
6857 // FIXME: Use defaulted operands for trailing 0s and remove from the complex
6858 // pattern.
6859 return {{
6860 [=](MachineInstrBuilder &MIB) { // rsrc
6861 MIB.addReg(RSrcReg);
6862 },
6863 [=](MachineInstrBuilder &MIB) { // vaddr
6864 MIB.addReg(VAddr);
6865 },
6866 [=](MachineInstrBuilder &MIB) { // soffset
6867 if (SOffset)
6868 MIB.addReg(SOffset);
6869 else if (STI.hasRestrictedSOffset())
6870 MIB.addReg(AMDGPU::SGPR_NULL);
6871 else
6872 MIB.addImm(0);
6873 },
6874 [=](MachineInstrBuilder &MIB) { // offset
6875 MIB.addImm(Offset);
6876 },
6877 addZeroImm, // cpol
6878 addZeroImm, // tfe
6879 addZeroImm // swz
6880 }};
6881}
6882
6884AMDGPUInstructionSelector::selectMUBUFOffset(MachineOperand &Root) const {
6885 Register RSrcReg;
6886 Register SOffset;
6887 int64_t Offset = 0;
6888
6889 if (!selectMUBUFOffsetImpl(Root, RSrcReg, SOffset, Offset))
6890 return {};
6891
6892 return {{
6893 [=](MachineInstrBuilder &MIB) { // rsrc
6894 MIB.addReg(RSrcReg);
6895 },
6896 [=](MachineInstrBuilder &MIB) { // soffset
6897 if (SOffset)
6898 MIB.addReg(SOffset);
6899 else if (STI.hasRestrictedSOffset())
6900 MIB.addReg(AMDGPU::SGPR_NULL);
6901 else
6902 MIB.addImm(0);
6903 },
6904 [=](MachineInstrBuilder &MIB) { MIB.addImm(Offset); }, // offset
6905 addZeroImm, // cpol
6906 addZeroImm, // tfe
6907 addZeroImm, // swz
6908 }};
6909}
6910
6912AMDGPUInstructionSelector::selectBUFSOffset(MachineOperand &Root) const {
6913
6914 Register SOffset = Root.getReg();
6915
6916 if (STI.hasRestrictedSOffset() && mi_match(SOffset, *MRI, m_ZeroInt()))
6917 SOffset = AMDGPU::SGPR_NULL;
6918
6919 return {{[=](MachineInstrBuilder &MIB) { MIB.addReg(SOffset); }}};
6920}
6921
6922/// Get an immediate that must be 32-bits, and treated as zero extended.
6923static std::optional<uint64_t>
6925 // getIConstantVRegVal sexts any values, so see if that matters.
6926 std::optional<int64_t> OffsetVal = getIConstantVRegSExtVal(Reg, MRI);
6927 if (!OffsetVal || !isInt<32>(*OffsetVal))
6928 return std::nullopt;
6929 return Lo_32(*OffsetVal);
6930}
6931
6933AMDGPUInstructionSelector::selectSMRDBufferImm(MachineOperand &Root) const {
6934 std::optional<uint64_t> OffsetVal =
6935 Root.isImm() ? Root.getImm() : getConstantZext32Val(Root.getReg(), *MRI);
6936 if (!OffsetVal)
6937 return {};
6938
6939 std::optional<int64_t> EncodedImm =
6940 AMDGPU::getSMRDEncodedOffset(STI, *OffsetVal, true);
6941 if (!EncodedImm)
6942 return {};
6943
6944 return {{ [=](MachineInstrBuilder &MIB) { MIB.addImm(*EncodedImm); } }};
6945}
6946
6948AMDGPUInstructionSelector::selectSMRDBufferImm32(MachineOperand &Root) const {
6949 assert(STI.getGeneration() == AMDGPUSubtarget::SEA_ISLANDS);
6950
6951 std::optional<uint64_t> OffsetVal = getConstantZext32Val(Root.getReg(), *MRI);
6952 if (!OffsetVal)
6953 return {};
6954
6955 std::optional<int64_t> EncodedImm =
6957 if (!EncodedImm)
6958 return {};
6959
6960 return {{ [=](MachineInstrBuilder &MIB) { MIB.addImm(*EncodedImm); } }};
6961}
6962
6964AMDGPUInstructionSelector::selectSMRDBufferSgprImm(MachineOperand &Root) const {
6965 // Match the (soffset + offset) pair as a 32-bit register base and
6966 // an immediate offset.
6967 Register SOffset;
6968 unsigned Offset;
6969 std::tie(SOffset, Offset) = AMDGPU::getBaseWithConstantOffset(
6970 *MRI, Root.getReg(), VT, /*CheckNUW*/ true);
6971 if (!SOffset)
6972 return std::nullopt;
6973
6974 std::optional<int64_t> EncodedOffset =
6975 AMDGPU::getSMRDEncodedOffset(STI, Offset, /* IsBuffer */ true);
6976 if (!EncodedOffset)
6977 return std::nullopt;
6978
6979 assert(MRI->getType(SOffset) == LLT::scalar(32));
6980 return {{[=](MachineInstrBuilder &MIB) { MIB.addReg(SOffset); },
6981 [=](MachineInstrBuilder &MIB) { MIB.addImm(*EncodedOffset); }}};
6982}
6983
6984std::pair<Register, unsigned>
6985AMDGPUInstructionSelector::selectVOP3PMadMixModsImpl(MachineOperand &Root,
6986 bool &Matched) const {
6987 Matched = false;
6988
6989 Register Src;
6990 unsigned Mods;
6991 std::tie(Src, Mods) = selectVOP3ModsImpl(Root.getReg());
6992
6993 if (mi_match(Src, *MRI, m_GFPExt(m_Reg(Src)))) {
6994 assert(MRI->getType(Src) == LLT::scalar(16));
6995
6996 // Only change Src if src modifier could be gained. In such cases new Src
6997 // could be sgpr but this does not violate constant bus restriction for
6998 // instruction that is being selected.
6999 Src = stripBitCast(Src, *MRI);
7000
7001 const auto CheckAbsNeg = [&]() {
7002 // Be careful about folding modifiers if we already have an abs. fneg is
7003 // applied last, so we don't want to apply an earlier fneg.
7004 if ((Mods & SISrcMods::ABS) == 0) {
7005 unsigned ModsTmp;
7006 std::tie(Src, ModsTmp) = selectVOP3ModsImpl(Src);
7007
7008 if ((ModsTmp & SISrcMods::NEG) != 0)
7009 Mods ^= SISrcMods::NEG;
7010
7011 if ((ModsTmp & SISrcMods::ABS) != 0)
7012 Mods |= SISrcMods::ABS;
7013 }
7014 };
7015
7016 CheckAbsNeg();
7017
7018 // op_sel/op_sel_hi decide the source type and source.
7019 // If the source's op_sel_hi is set, it indicates to do a conversion from
7020 // fp16. If the sources's op_sel is set, it picks the high half of the
7021 // source register.
7022
7023 Mods |= SISrcMods::OP_SEL_1;
7024
7025 if (isExtractHiElt(*MRI, Src, Src)) {
7026 Mods |= SISrcMods::OP_SEL_0;
7027 CheckAbsNeg();
7028 }
7029
7030 Matched = true;
7031 }
7032
7033 return {Src, Mods};
7034}
7035
7037AMDGPUInstructionSelector::selectVOP3PMadMixModsExt(
7038 MachineOperand &Root) const {
7039 Register Src;
7040 unsigned Mods;
7041 bool Matched;
7042 std::tie(Src, Mods) = selectVOP3PMadMixModsImpl(Root, Matched);
7043 if (!Matched)
7044 return {};
7045
7046 return {{
7047 [=](MachineInstrBuilder &MIB) { MIB.addReg(Src); },
7048 [=](MachineInstrBuilder &MIB) { MIB.addImm(Mods); } // src_mods
7049 }};
7050}
7051
7053AMDGPUInstructionSelector::selectVOP3PMadMixMods(MachineOperand &Root) const {
7054 Register Src;
7055 unsigned Mods;
7056 bool Matched;
7057 std::tie(Src, Mods) = selectVOP3PMadMixModsImpl(Root, Matched);
7058
7059 return {{
7060 [=](MachineInstrBuilder &MIB) { MIB.addReg(Src); },
7061 [=](MachineInstrBuilder &MIB) { MIB.addImm(Mods); } // src_mods
7062 }};
7063}
7064
7065bool AMDGPUInstructionSelector::selectSBarrierSignalIsfirst(
7066 MachineInstr &I, Intrinsic::ID IntrID) const {
7067 MachineBasicBlock *MBB = I.getParent();
7068 const DebugLoc &DL = I.getDebugLoc();
7069 Register CCReg = I.getOperand(0).getReg();
7070
7071 // Set SCC to true, in case the barrier instruction gets converted to a NOP.
7072 BuildMI(*MBB, &I, DL, TII.get(AMDGPU::S_CMP_EQ_U32)).addImm(0).addImm(0);
7073
7074 BuildMI(*MBB, &I, DL, TII.get(AMDGPU::S_BARRIER_SIGNAL_ISFIRST_IMM))
7075 .addImm(I.getOperand(2).getImm());
7076
7077 BuildMI(*MBB, &I, DL, TII.get(AMDGPU::COPY), CCReg).addReg(AMDGPU::SCC);
7078
7079 I.eraseFromParent();
7080 return RBI.constrainGenericRegister(CCReg, AMDGPU::SReg_32_XM0_XEXECRegClass,
7081 *MRI);
7082}
7083
7084bool AMDGPUInstructionSelector::selectSGetBarrierState(
7085 MachineInstr &I, Intrinsic::ID IntrID) const {
7086 MachineBasicBlock *MBB = I.getParent();
7087 const DebugLoc &DL = I.getDebugLoc();
7088 const MachineOperand &BarOp = I.getOperand(2);
7089 std::optional<int64_t> BarValImm =
7090 getIConstantVRegSExtVal(BarOp.getReg(), *MRI);
7091
7092 if (!BarValImm) {
7093 auto CopyMIB = BuildMI(*MBB, &I, DL, TII.get(AMDGPU::COPY), AMDGPU::M0)
7094 .addReg(BarOp.getReg());
7095 constrainSelectedInstRegOperands(*CopyMIB, TII, TRI, RBI);
7096 }
7097 MachineInstrBuilder MIB;
7098 unsigned Opc = BarValImm ? AMDGPU::S_GET_BARRIER_STATE_IMM
7099 : AMDGPU::S_GET_BARRIER_STATE_M0;
7100 MIB = BuildMI(*MBB, &I, DL, TII.get(Opc));
7101
7102 auto DstReg = I.getOperand(0).getReg();
7103 const TargetRegisterClass *DstRC =
7104 TRI.getConstrainedRegClassForOperand(I.getOperand(0), *MRI);
7105 if (!DstRC || !RBI.constrainGenericRegister(DstReg, *DstRC, *MRI))
7106 return false;
7107 MIB.addDef(DstReg);
7108 if (BarValImm) {
7109 MIB.addImm(*BarValImm);
7110 }
7111 I.eraseFromParent();
7112 return true;
7113}
7114
7115unsigned getNamedBarrierOp(bool HasInlineConst, Intrinsic::ID IntrID) {
7116 if (HasInlineConst) {
7117 switch (IntrID) {
7118 default:
7119 llvm_unreachable("not a named barrier op");
7120 case Intrinsic::amdgcn_s_barrier_join:
7121 return AMDGPU::S_BARRIER_JOIN_IMM;
7122 case Intrinsic::amdgcn_s_wakeup_barrier:
7123 return AMDGPU::S_WAKEUP_BARRIER_IMM;
7124 case Intrinsic::amdgcn_s_get_named_barrier_state:
7125 return AMDGPU::S_GET_BARRIER_STATE_IMM;
7126 };
7127 } else {
7128 switch (IntrID) {
7129 default:
7130 llvm_unreachable("not a named barrier op");
7131 case Intrinsic::amdgcn_s_barrier_join:
7132 return AMDGPU::S_BARRIER_JOIN_M0;
7133 case Intrinsic::amdgcn_s_wakeup_barrier:
7134 return AMDGPU::S_WAKEUP_BARRIER_M0;
7135 case Intrinsic::amdgcn_s_get_named_barrier_state:
7136 return AMDGPU::S_GET_BARRIER_STATE_M0;
7137 };
7138 }
7139}
7140
7141bool AMDGPUInstructionSelector::selectNamedBarrierInit(
7142 MachineInstr &I, Intrinsic::ID IntrID) const {
7143 MachineBasicBlock *MBB = I.getParent();
7144 const DebugLoc &DL = I.getDebugLoc();
7145 const MachineOperand &BarOp = I.getOperand(1);
7146 const MachineOperand &CntOp = I.getOperand(2);
7147
7148 // BarID = (BarOp >> 4) & 0x3F
7149 Register TmpReg0 = MRI->createVirtualRegister(&AMDGPU::SReg_32RegClass);
7150 BuildMI(*MBB, &I, DL, TII.get(AMDGPU::S_LSHR_B32), TmpReg0)
7151 .add(BarOp)
7152 .addImm(4u)
7153 .setOperandDead(3); // Dead scc
7154
7155 Register TmpReg1 = MRI->createVirtualRegister(&AMDGPU::SReg_32RegClass);
7156 BuildMI(*MBB, &I, DL, TII.get(AMDGPU::S_AND_B32), TmpReg1)
7157 .addReg(TmpReg0)
7158 .addImm(0x3F)
7159 .setOperandDead(3); // Dead scc
7160
7161 // MO = ((CntOp & 0x3F) << shAmt) | BarID
7162 Register TmpReg2 = MRI->createVirtualRegister(&AMDGPU::SReg_32RegClass);
7163 BuildMI(*MBB, &I, DL, TII.get(AMDGPU::S_AND_B32), TmpReg2)
7164 .add(CntOp)
7165 .addImm(0x3F)
7166 .setOperandDead(3); // Dead scc
7167
7168 Register TmpReg3 = MRI->createVirtualRegister(&AMDGPU::SReg_32RegClass);
7169 constexpr unsigned ShAmt = 16;
7170 BuildMI(*MBB, &I, DL, TII.get(AMDGPU::S_LSHL_B32), TmpReg3)
7171 .addReg(TmpReg2)
7172 .addImm(ShAmt)
7173 .setOperandDead(3); // Dead scc
7174
7175 Register TmpReg4 = MRI->createVirtualRegister(&AMDGPU::SReg_32RegClass);
7176 BuildMI(*MBB, &I, DL, TII.get(AMDGPU::S_OR_B32), TmpReg4)
7177 .addReg(TmpReg1)
7178 .addReg(TmpReg3)
7179 .setOperandDead(3); // Dead scc;
7180
7181 auto CopyMIB =
7182 BuildMI(*MBB, &I, DL, TII.get(AMDGPU::COPY), AMDGPU::M0).addReg(TmpReg4);
7183 constrainSelectedInstRegOperands(*CopyMIB, TII, TRI, RBI);
7184
7185 unsigned Opc = IntrID == Intrinsic::amdgcn_s_barrier_init
7186 ? AMDGPU::S_BARRIER_INIT_M0
7187 : AMDGPU::S_BARRIER_SIGNAL_M0;
7188 MachineInstrBuilder MIB;
7189 MIB = BuildMI(*MBB, &I, DL, TII.get(Opc));
7190
7191 I.eraseFromParent();
7192 return true;
7193}
7194
7195bool AMDGPUInstructionSelector::selectNamedBarrierInst(
7196 MachineInstr &I, Intrinsic::ID IntrID) const {
7197 MachineBasicBlock *MBB = I.getParent();
7198 const DebugLoc &DL = I.getDebugLoc();
7199 MachineOperand BarOp = IntrID == Intrinsic::amdgcn_s_get_named_barrier_state
7200 ? I.getOperand(2)
7201 : I.getOperand(1);
7202 std::optional<int64_t> BarValImm =
7203 getIConstantVRegSExtVal(BarOp.getReg(), *MRI);
7204
7205 if (!BarValImm) {
7206 // BarID = (BarOp >> 4) & 0x3F
7207 Register TmpReg0 = MRI->createVirtualRegister(&AMDGPU::SReg_32RegClass);
7208 BuildMI(*MBB, &I, DL, TII.get(AMDGPU::S_LSHR_B32), TmpReg0)
7209 .addReg(BarOp.getReg())
7210 .addImm(4u)
7211 .setOperandDead(3); // Dead scc;
7212
7213 Register TmpReg1 = MRI->createVirtualRegister(&AMDGPU::SReg_32RegClass);
7214 BuildMI(*MBB, &I, DL, TII.get(AMDGPU::S_AND_B32), TmpReg1)
7215 .addReg(TmpReg0)
7216 .addImm(0x3F)
7217 .setOperandDead(3); // Dead scc;
7218
7219 auto CopyMIB = BuildMI(*MBB, &I, DL, TII.get(AMDGPU::COPY), AMDGPU::M0)
7220 .addReg(TmpReg1);
7221 constrainSelectedInstRegOperands(*CopyMIB, TII, TRI, RBI);
7222 }
7223
7224 MachineInstrBuilder MIB;
7225 unsigned Opc = getNamedBarrierOp(BarValImm.has_value(), IntrID);
7226 MIB = BuildMI(*MBB, &I, DL, TII.get(Opc));
7227
7228 if (IntrID == Intrinsic::amdgcn_s_get_named_barrier_state) {
7229 auto DstReg = I.getOperand(0).getReg();
7230 const TargetRegisterClass *DstRC =
7231 TRI.getConstrainedRegClassForOperand(I.getOperand(0), *MRI);
7232 if (!DstRC || !RBI.constrainGenericRegister(DstReg, *DstRC, *MRI))
7233 return false;
7234 MIB.addDef(DstReg);
7235 }
7236
7237 if (BarValImm) {
7238 auto BarId = ((*BarValImm) >> 4) & 0x3F;
7239 MIB.addImm(BarId);
7240 }
7241
7242 I.eraseFromParent();
7243 return true;
7244}
7245
7246void AMDGPUInstructionSelector::renderTruncImm32(MachineInstrBuilder &MIB,
7247 const MachineInstr &MI,
7248 int OpIdx) const {
7249 assert(MI.getOpcode() == TargetOpcode::G_CONSTANT && OpIdx == -1 &&
7250 "Expected G_CONSTANT");
7251 MIB.addImm(MI.getOperand(1).getCImm()->getSExtValue());
7252}
7253
7254void AMDGPUInstructionSelector::renderNegateImm(MachineInstrBuilder &MIB,
7255 const MachineInstr &MI,
7256 int OpIdx) const {
7257 assert(MI.getOpcode() == TargetOpcode::G_CONSTANT && OpIdx == -1 &&
7258 "Expected G_CONSTANT");
7259 MIB.addImm(-MI.getOperand(1).getCImm()->getSExtValue());
7260}
7261
7262void AMDGPUInstructionSelector::renderBitcastFPImm(MachineInstrBuilder &MIB,
7263 const MachineInstr &MI,
7264 int OpIdx) const {
7265 const MachineOperand &Op = MI.getOperand(1);
7266 assert(MI.getOpcode() == TargetOpcode::G_FCONSTANT && OpIdx == -1);
7267 MIB.addImm(Op.getFPImm()->getValueAPF().bitcastToAPInt().getZExtValue());
7268}
7269
7270void AMDGPUInstructionSelector::renderCountTrailingOnesImm(
7271 MachineInstrBuilder &MIB, const MachineInstr &MI, int OpIdx) const {
7272 assert(MI.getOpcode() == TargetOpcode::G_CONSTANT && OpIdx == -1 &&
7273 "Expected G_CONSTANT");
7274 MIB.addImm(MI.getOperand(1).getCImm()->getValue().countTrailingOnes());
7275}
7276
7277/// This only really exists to satisfy DAG type checking machinery, so is a
7278/// no-op here.
7279void AMDGPUInstructionSelector::renderTruncTImm(MachineInstrBuilder &MIB,
7280 const MachineInstr &MI,
7281 int OpIdx) const {
7282 const MachineOperand &Op = MI.getOperand(OpIdx);
7283 int64_t Imm;
7284 if (Op.isReg() && mi_match(Op.getReg(), *MRI, m_ICst(Imm)))
7285 MIB.addImm(Imm);
7286 else
7287 MIB.addImm(Op.getImm());
7288}
7289
7290void AMDGPUInstructionSelector::renderZextBoolTImm(MachineInstrBuilder &MIB,
7291 const MachineInstr &MI,
7292 int OpIdx) const {
7293 MIB.addImm(MI.getOperand(OpIdx).getImm() != 0);
7294}
7295
7296void AMDGPUInstructionSelector::renderOpSelTImm(MachineInstrBuilder &MIB,
7297 const MachineInstr &MI,
7298 int OpIdx) const {
7299 assert(OpIdx >= 0 && "expected to match an immediate operand");
7300 MIB.addImm(MI.getOperand(OpIdx).getImm() ? (int64_t)SISrcMods::OP_SEL_0 : 0);
7301}
7302
7303void AMDGPUInstructionSelector::renderSrcAndDstSelToOpSelXForm_0_0(
7304 MachineInstrBuilder &MIB, const MachineInstr &MI, int OpIdx) const {
7305 assert(OpIdx >= 0 && "expected to match an immediate operand");
7306 MIB.addImm(
7307 (MI.getOperand(OpIdx).getImm() & 0x1) ? (int64_t)SISrcMods::OP_SEL_0 : 0);
7308}
7309
7310void AMDGPUInstructionSelector::renderSrcAndDstSelToOpSelXForm_0_1(
7311 MachineInstrBuilder &MIB, const MachineInstr &MI, int OpIdx) const {
7312 assert(OpIdx >= 0 && "expected to match an immediate operand");
7313 MIB.addImm((MI.getOperand(OpIdx).getImm() & 0x1)
7315 : (int64_t)SISrcMods::DST_OP_SEL);
7316}
7317
7318void AMDGPUInstructionSelector::renderSrcAndDstSelToOpSelXForm_1_0(
7319 MachineInstrBuilder &MIB, const MachineInstr &MI, int OpIdx) const {
7320 assert(OpIdx >= 0 && "expected to match an immediate operand");
7321 MIB.addImm(
7322 (MI.getOperand(OpIdx).getImm() & 0x2) ? (int64_t)SISrcMods::OP_SEL_0 : 0);
7323}
7324
7325void AMDGPUInstructionSelector::renderSrcAndDstSelToOpSelXForm_1_1(
7326 MachineInstrBuilder &MIB, const MachineInstr &MI, int OpIdx) const {
7327 assert(OpIdx >= 0 && "expected to match an immediate operand");
7328 MIB.addImm((MI.getOperand(OpIdx).getImm() & 0x2)
7329 ? (int64_t)(SISrcMods::OP_SEL_0)
7330 : 0);
7331}
7332
7333void AMDGPUInstructionSelector::renderDstSelToOpSelXForm(
7334 MachineInstrBuilder &MIB, const MachineInstr &MI, int OpIdx) const {
7335 assert(OpIdx >= 0 && "expected to match an immediate operand");
7336 MIB.addImm(MI.getOperand(OpIdx).getImm() ? (int64_t)(SISrcMods::DST_OP_SEL)
7337 : 0);
7338}
7339
7340void AMDGPUInstructionSelector::renderSrcSelToOpSelXForm(
7341 MachineInstrBuilder &MIB, const MachineInstr &MI, int OpIdx) const {
7342 assert(OpIdx >= 0 && "expected to match an immediate operand");
7343 MIB.addImm(MI.getOperand(OpIdx).getImm() ? (int64_t)(SISrcMods::OP_SEL_0)
7344 : 0);
7345}
7346
7347void AMDGPUInstructionSelector::renderSrcAndDstSelToOpSelXForm_2_0(
7348 MachineInstrBuilder &MIB, const MachineInstr &MI, int OpIdx) const {
7349 assert(OpIdx >= 0 && "expected to match an immediate operand");
7350 MIB.addImm(
7351 (MI.getOperand(OpIdx).getImm() & 0x1) ? (int64_t)SISrcMods::OP_SEL_0 : 0);
7352}
7353
7354void AMDGPUInstructionSelector::renderDstSelToOpSel3XFormXForm(
7355 MachineInstrBuilder &MIB, const MachineInstr &MI, int OpIdx) const {
7356 assert(OpIdx >= 0 && "expected to match an immediate operand");
7357 MIB.addImm((MI.getOperand(OpIdx).getImm() & 0x2)
7358 ? (int64_t)SISrcMods::DST_OP_SEL
7359 : 0);
7360}
7361
7362void AMDGPUInstructionSelector::renderExtractCPol(MachineInstrBuilder &MIB,
7363 const MachineInstr &MI,
7364 int OpIdx) const {
7365 assert(OpIdx >= 0 && "expected to match an immediate operand");
7366 MIB.addImm(MI.getOperand(OpIdx).getImm() &
7369}
7370
7371void AMDGPUInstructionSelector::renderExtractSWZ(MachineInstrBuilder &MIB,
7372 const MachineInstr &MI,
7373 int OpIdx) const {
7374 assert(OpIdx >= 0 && "expected to match an immediate operand");
7375 const bool Swizzle = MI.getOperand(OpIdx).getImm() &
7378 MIB.addImm(Swizzle);
7379}
7380
7381void AMDGPUInstructionSelector::renderExtractCpolSetGLC(
7382 MachineInstrBuilder &MIB, const MachineInstr &MI, int OpIdx) const {
7383 assert(OpIdx >= 0 && "expected to match an immediate operand");
7384 const uint32_t Cpol = MI.getOperand(OpIdx).getImm() &
7387 MIB.addImm(Cpol | AMDGPU::CPol::GLC);
7388}
7389
7390void AMDGPUInstructionSelector::renderFrameIndex(MachineInstrBuilder &MIB,
7391 const MachineInstr &MI,
7392 int OpIdx) const {
7393 MIB.addFrameIndex(MI.getOperand(1).getIndex());
7394}
7395
7396void AMDGPUInstructionSelector::renderFPPow2ToExponent(MachineInstrBuilder &MIB,
7397 const MachineInstr &MI,
7398 int OpIdx) const {
7399 const APFloat &APF = MI.getOperand(1).getFPImm()->getValueAPF();
7400 int ExpVal = APF.getExactLog2Abs();
7401 assert(ExpVal != INT_MIN);
7402 MIB.addImm(ExpVal);
7403}
7404
7405void AMDGPUInstructionSelector::renderRoundMode(MachineInstrBuilder &MIB,
7406 const MachineInstr &MI,
7407 int OpIdx) const {
7408 // "round.towardzero" -> TowardZero 0 -> FP_ROUND_ROUND_TO_ZERO 3
7409 // "round.tonearest" -> NearestTiesToEven 1 -> FP_ROUND_ROUND_TO_NEAREST 0
7410 // "round.upward" -> TowardPositive 2 -> FP_ROUND_ROUND_TO_INF 1
7411 // "round.downward -> TowardNegative 3 -> FP_ROUND_ROUND_TO_NEGINF 2
7412 MIB.addImm((MI.getOperand(OpIdx).getImm() + 3) % 4);
7413}
7414
7415void AMDGPUInstructionSelector::renderVOP3PModsNeg(MachineInstrBuilder &MIB,
7416 const MachineInstr &MI,
7417 int OpIdx) const {
7418 unsigned Mods = SISrcMods::OP_SEL_1;
7419 if (MI.getOperand(OpIdx).getImm())
7420 Mods ^= SISrcMods::NEG;
7421 MIB.addImm((int64_t)Mods);
7422}
7423
7424void AMDGPUInstructionSelector::renderVOP3PModsNegs(MachineInstrBuilder &MIB,
7425 const MachineInstr &MI,
7426 int OpIdx) const {
7427 unsigned Mods = SISrcMods::OP_SEL_1;
7428 if (MI.getOperand(OpIdx).getImm())
7430 MIB.addImm((int64_t)Mods);
7431}
7432
7433void AMDGPUInstructionSelector::renderVOP3PModsNegAbs(MachineInstrBuilder &MIB,
7434 const MachineInstr &MI,
7435 int OpIdx) const {
7436 unsigned Val = MI.getOperand(OpIdx).getImm();
7437 unsigned Mods = SISrcMods::OP_SEL_1; // default: none
7438 if (Val == 1) // neg
7439 Mods ^= SISrcMods::NEG;
7440 if (Val == 2) // abs
7441 Mods ^= SISrcMods::ABS;
7442 if (Val == 3) // neg and abs
7443 Mods ^= (SISrcMods::NEG | SISrcMods::ABS);
7444 MIB.addImm((int64_t)Mods);
7445}
7446
7447void AMDGPUInstructionSelector::renderPrefetchLoc(MachineInstrBuilder &MIB,
7448 const MachineInstr &MI,
7449 int OpIdx) const {
7450 uint32_t V = MI.getOperand(2).getImm();
7453 if (!Subtarget->hasSafeCUPrefetch())
7454 V = std::max(V, (uint32_t)AMDGPU::CPol::SCOPE_SE); // CU scope is unsafe
7455 MIB.addImm(V);
7456}
7457
7458/// Convert from 2-bit value to enum values used for op_sel* source modifiers.
7459void AMDGPUInstructionSelector::renderScaledMAIIntrinsicOperand(
7460 MachineInstrBuilder &MIB, const MachineInstr &MI, int OpIdx) const {
7461 unsigned Val = MI.getOperand(OpIdx).getImm();
7462 unsigned New = 0;
7463 if (Val & 0x1)
7465 if (Val & 0x2)
7467 MIB.addImm(New);
7468}
7469
7470bool AMDGPUInstructionSelector::isInlineImmediate(const APInt &Imm) const {
7471 return TII.isInlineConstant(Imm);
7472}
7473
7474bool AMDGPUInstructionSelector::isInlineImmediate(const APFloat &Imm) const {
7475 return TII.isInlineConstant(Imm);
7476}
MachineInstrBuilder MachineInstrBuilder & DefMI
static unsigned getIntrinsicID(const SDNode *N)
#define GET_GLOBALISEL_PREDICATES_INIT
#define GET_GLOBALISEL_TEMPORARIES_INIT
assert(UImm &&(UImm !=~static_cast< T >(0)) &&"Invalid immediate!")
Contains the definition of a TargetInstrInfo class that is common to all AMD GPUs.
static Register getLegalRegBank(Register NewReg, Register RootReg, const AMDGPURegisterBankInfo &RBI, MachineRegisterInfo &MRI, const TargetRegisterInfo &TRI, const SIInstrInfo &TII)
static bool isShlHalf(const MachineInstr *MI, const MachineRegisterInfo &MRI)
Test if the MI is shift left with half bits, such as reg0:2n =G_SHL reg1:2n, CONST(n)
static bool isNoUnsignedWrap(MachineInstr *Addr)
static Register buildOffsetSrc(MachineIRBuilder &B, MachineRegisterInfo &MRI, const SIInstrInfo &TII, Register BasePtr)
unsigned getNamedBarrierOp(bool HasInlineConst, Intrinsic::ID IntrID)
static bool checkRB(Register Reg, unsigned int RBNo, const AMDGPURegisterBankInfo &RBI, const MachineRegisterInfo &MRI, const TargetRegisterInfo &TRI)
static unsigned updateMods(SrcStatus HiStat, SrcStatus LoStat, unsigned Mods)
static bool isTruncHalf(const MachineInstr *MI, const MachineRegisterInfo &MRI)
Test if the MI is truncating to half, such as reg0:n = G_TRUNC reg1:2n
static Register getWaveAddress(const MachineInstr *Def)
static bool isExtractHiElt(MachineRegisterInfo &MRI, Register In, Register &Out)
static bool shouldUseAndMask(unsigned Size, unsigned &Mask)
static std::pair< unsigned, uint8_t > BitOp3_Op(Register R, SmallVectorImpl< Register > &Src, const MachineRegisterInfo &MRI)
static TypeClass isVectorOfTwoOrScalar(Register Reg, const MachineRegisterInfo &MRI)
static bool isLaneMaskFromSameBlock(Register Reg, MachineRegisterInfo &MRI, MachineBasicBlock *MBB)
static bool parseTexFail(uint64_t TexFailCtrl, bool &TFE, bool &LWE, bool &IsTexFail)
static void addZeroImm(MachineInstrBuilder &MIB)
static unsigned gwsIntrinToOpcode(unsigned IntrID)
static bool isConstant(const MachineInstr &MI)
static bool isSameBitWidth(Register Reg1, Register Reg2, const MachineRegisterInfo &MRI)
static Register buildRegSequence(SmallVectorImpl< Register > &Elts, MachineInstr *InsertPt, MachineRegisterInfo &MRI)
static Register buildRSRC(MachineIRBuilder &B, MachineRegisterInfo &MRI, uint32_t FormatLo, uint32_t FormatHi, Register BasePtr)
Return a resource descriptor for use with an arbitrary 64-bit pointer.
static bool isAsyncLDSDMA(Intrinsic::ID Intr)
static std::pair< Register, unsigned > computeIndirectRegIndex(MachineRegisterInfo &MRI, const SIRegisterInfo &TRI, const TargetRegisterClass *SuperRC, Register IdxReg, unsigned EltSize, GISelValueTracking &ValueTracking)
Return the register to use for the index value, and the subregister to use for the indirectly accesse...
static unsigned getLogicalBitOpcode(unsigned Opc, bool Is64)
static std::pair< Register, SrcStatus > getLastSameOrNeg(Register Reg, const MachineRegisterInfo &MRI, SearchOptions SO, int MaxDepth=3)
static Register stripCopy(Register Reg, MachineRegisterInfo &MRI)
static std::optional< std::pair< Register, SrcStatus > > calcNextStatus(std::pair< Register, SrcStatus > Curr, const MachineRegisterInfo &MRI)
static Register stripBitCast(Register Reg, MachineRegisterInfo &MRI)
static std::optional< uint64_t > getConstantZext32Val(Register Reg, const MachineRegisterInfo &MRI)
Get an immediate that must be 32-bits, and treated as zero extended.
static bool isValidToPack(SrcStatus HiStat, SrcStatus LoStat, Register NewReg, Register RootReg, const SIInstrInfo &TII, const MachineRegisterInfo &MRI)
static int getV_CMPOpcode(CmpInst::Predicate P, unsigned Size, const GCNSubtarget &ST)
static SmallVector< std::pair< Register, SrcStatus > > getSrcStats(Register Reg, const MachineRegisterInfo &MRI, SearchOptions SO, int MaxDepth=3)
static bool isUnmergeHalf(const MachineInstr *MI, const MachineRegisterInfo &MRI)
Test function, if the MI is reg0:n, reg1:n = G_UNMERGE_VALUES reg2:2n
static SrcStatus getNegStatus(Register Reg, SrcStatus S, const MachineRegisterInfo &MRI)
static bool isVCmpResult(Register Reg, MachineRegisterInfo &MRI)
static Register buildAddr64RSrc(MachineIRBuilder &B, MachineRegisterInfo &MRI, const SIInstrInfo &TII, Register BasePtr)
static bool isLshrHalf(const MachineInstr *MI, const MachineRegisterInfo &MRI)
Test if the MI is logic shift right with half bits, such as reg0:2n =G_LSHR reg1:2n,...
static void selectWMMAModsNegAbs(unsigned ModOpcode, unsigned &Mods, SmallVectorImpl< Register > &Elts, Register &Src, MachineInstr *InsertPt, MachineRegisterInfo &MRI)
This file declares the targeting of the InstructionSelector class for AMDGPU.
constexpr LLT S1
constexpr LLT S32
AMDGPU Register Bank Select
This file declares the targeting of the RegisterBankInfo class for AMDGPU.
The AMDGPU TargetMachine interface definition for hw codegen targets.
MachineBasicBlock & MBB
MachineBasicBlock MachineBasicBlock::iterator DebugLoc DL
static GCRegistry::Add< CoreCLRGC > E("coreclr", "CoreCLR-compatible GC")
static GCRegistry::Add< OcamlGC > B("ocaml", "ocaml 3.10-compatible GC")
static bool isAllZeros(StringRef Arr)
Return true if the array is empty or all zeros.
dxil translate DXIL Translate Metadata
Provides analysis for querying information about KnownBits during GISel passes.
#define DEBUG_TYPE
Declares convenience wrapper classes for interpreting MachineInstr instances as specific generic oper...
const HexagonInstrInfo * TII
IRTranslator LLVM IR MI
#define F(x, y, z)
Definition MD5.cpp:54
#define I(x, y, z)
Definition MD5.cpp:57
Contains matchers for matching SSA Machine Instructions.
Machine Check Debug Module
This file declares the MachineIRBuilder class.
Register Reg
Register const TargetRegisterInfo * TRI
Promote Memory to Register
Definition Mem2Reg.cpp:110
MachineInstr unsigned OpIdx
#define P(N)
static std::vector< std::pair< int, unsigned > > Swizzle(std::vector< std::pair< int, unsigned > > Src, R600InstrInfo::BankSwizzle Swz)
#define LLVM_DEBUG(...)
Definition Debug.h:114
Value * RHS
Value * LHS
This is used to control valid status that current MI supports.
bool checkOptions(SrcStatus Stat) const
SearchOptions(Register Reg, const MachineRegisterInfo &MRI)
AMDGPUInstructionSelector(const GCNSubtarget &STI, const AMDGPURegisterBankInfo &RBI, const AMDGPUTargetMachine &TM)
static const char * getName()
bool select(MachineInstr &I) override
Select the (possibly generic) instruction I to only use target-specific opcodes.
void setupMF(MachineFunction &MF, GISelValueTracking *VT, CodeGenCoverage *CoverageInfo, ProfileSummaryInfo *PSI, BlockFrequencyInfo *BFI) override
Setup per-MF executor state.
LLVM_READONLY int getExactLog2Abs() const
Definition APFloat.h:1564
Class for arbitrary precision integers.
Definition APInt.h:78
static APInt getLowBitsSet(unsigned numBits, unsigned loBitsSet)
Constructs an APInt value that has the bottom loBitsSet bits set.
Definition APInt.h:307
static APInt getHighBitsSet(unsigned numBits, unsigned hiBitsSet)
Constructs an APInt value that has the top hiBitsSet bits set.
Definition APInt.h:297
int64_t getSExtValue() const
Get sign extended value.
Definition APInt.h:1577
ArrayRef - Represent a constant reference to an array (0 or more elements consecutively in memory),...
Definition ArrayRef.h:40
size_t size() const
size - Get the array size.
Definition ArrayRef.h:142
BlockFrequencyInfo pass uses BlockFrequencyInfoImpl implementation to estimate IR basic block frequen...
Predicate
This enumeration lists the possible predicates for CmpInst subclasses.
Definition InstrTypes.h:676
@ FCMP_OEQ
0 0 0 1 True if ordered and equal
Definition InstrTypes.h:679
@ FCMP_TRUE
1 1 1 1 Always true (always folded)
Definition InstrTypes.h:693
@ ICMP_SLT
signed less than
Definition InstrTypes.h:705
@ ICMP_SLE
signed less or equal
Definition InstrTypes.h:706
@ FCMP_OLT
0 1 0 0 True if ordered and less than
Definition InstrTypes.h:682
@ FCMP_ULE
1 1 0 1 True if unordered, less than, or equal
Definition InstrTypes.h:691
@ FCMP_OGT
0 0 1 0 True if ordered and greater than
Definition InstrTypes.h:680
@ FCMP_OGE
0 0 1 1 True if ordered and greater than or equal
Definition InstrTypes.h:681
@ ICMP_UGE
unsigned greater or equal
Definition InstrTypes.h:700
@ ICMP_UGT
unsigned greater than
Definition InstrTypes.h:699
@ ICMP_SGT
signed greater than
Definition InstrTypes.h:703
@ FCMP_ULT
1 1 0 0 True if unordered or less than
Definition InstrTypes.h:690
@ FCMP_ONE
0 1 1 0 True if ordered and operands are unequal
Definition InstrTypes.h:684
@ FCMP_UEQ
1 0 0 1 True if unordered or equal
Definition InstrTypes.h:687
@ ICMP_ULT
unsigned less than
Definition InstrTypes.h:701
@ FCMP_UGT
1 0 1 0 True if unordered or greater than
Definition InstrTypes.h:688
@ FCMP_OLE
0 1 0 1 True if ordered and less than or equal
Definition InstrTypes.h:683
@ FCMP_ORD
0 1 1 1 True if ordered (no nans)
Definition InstrTypes.h:685
@ ICMP_NE
not equal
Definition InstrTypes.h:698
@ ICMP_SGE
signed greater or equal
Definition InstrTypes.h:704
@ FCMP_UNE
1 1 1 0 True if unordered or not equal
Definition InstrTypes.h:692
@ ICMP_ULE
unsigned less or equal
Definition InstrTypes.h:702
@ FCMP_UGE
1 0 1 1 True if unordered, greater than, or equal
Definition InstrTypes.h:689
@ FCMP_FALSE
0 0 0 0 Always false (always folded)
Definition InstrTypes.h:678
@ FCMP_UNO
1 0 0 0 True if unordered: isnan(X) | isnan(Y)
Definition InstrTypes.h:686
bool isFPPredicate() const
Definition InstrTypes.h:782
bool isIntPredicate() const
Definition InstrTypes.h:783
int64_t getSExtValue() const
Return the constant as a 64-bit integer value after it has been sign extended as appropriate for the ...
Definition Constants.h:174
uint64_t getZExtValue() const
Return the constant as a 64-bit unsigned integer value after it has been zero extended as appropriate...
Definition Constants.h:168
LLVM_ABI DILocation * get() const
Get the underlying DILocation.
Definition DebugLoc.cpp:48
LLVMContext & getContext() const
getContext - Return a reference to the LLVMContext associated with this function.
Definition Function.cpp:358
void checkSubtargetFeatures(const Function &F) const
Diagnose inconsistent subtarget features before attempting to codegen function F.
std::optional< SmallVector< std::function< void(MachineInstrBuilder &)>, 4 > > ComplexRendererFns
virtual void setupMF(MachineFunction &mf, GISelValueTracking *vt, CodeGenCoverage *covinfo=nullptr, ProfileSummaryInfo *psi=nullptr, BlockFrequencyInfo *bfi=nullptr)
Setup per-MF executor state.
constexpr bool isScalar() const
static constexpr LLT scalar(unsigned SizeInBits)
Get a low-level scalar or aggregate "bag of bits".
constexpr bool isValid() const
constexpr bool isVector() const
constexpr TypeSize getSizeInBits() const
Returns the total size of the type. Must only be called on sized types.
constexpr LLT getElementType() const
Returns the vector's element type. Only valid for vector types.
constexpr unsigned getAddressSpace() const
static constexpr LLT fixed_vector(unsigned NumElements, unsigned ScalarSizeInBits)
Get a low-level fixed-width vector of some number of elements and element width.
LLVM_ABI void diagnose(const DiagnosticInfo &DI)
Report a message to the currently installed diagnostic handler.
bool hasValue() const
TypeSize getValue() const
int getOperandConstraint(unsigned OpNum, MCOI::OperandConstraint Constraint) const
Returns the value of the specified operand constraint if it is present.
const MachineFunction * getParent() const
Return the MachineFunction containing this basic block.
void setReturnAddressIsTaken(bool s)
MachineRegisterInfo & getRegInfo()
getRegInfo - Return information about the registers currently in use.
Helper class to build MachineInstr.
const MachineInstrBuilder & setMemRefs(ArrayRef< MachineMemOperand * > MMOs) const
const MachineInstrBuilder & setOperandDead(unsigned OpIdx) const
const MachineInstrBuilder & addUse(Register RegNo, RegState Flags={}, unsigned SubReg=0) const
Add a virtual register use operand.
const MachineInstrBuilder & addReg(Register RegNo, RegState Flags={}, unsigned SubReg=0) const
Add a new virtual register operand.
const MachineInstrBuilder & addImm(int64_t Val) const
Add a new immediate operand.
const MachineInstrBuilder & add(const MachineOperand &MO) const
const MachineInstrBuilder & addFrameIndex(int Idx) const
const MachineInstrBuilder & addGlobalAddress(const GlobalValue *GV, int64_t Offset=0, unsigned TargetFlags=0) const
const MachineInstrBuilder & addMBB(MachineBasicBlock *MBB, unsigned TargetFlags=0) const
const MachineInstrBuilder & addDef(Register RegNo, RegState Flags={}, unsigned SubReg=0) const
Add a virtual register definition operand.
const MachineInstrBuilder & cloneMemRefs(const MachineInstr &OtherMI) const
Representation of each machine instruction.
unsigned getOpcode() const
Returns the opcode of this MachineInstr.
const MachineBasicBlock * getParent() const
bool getFlag(MIFlag Flag) const
Return whether an MI flag is set.
unsigned getNumOperands() const
Retuns the total number of operands.
LLVM_ABI void tieOperands(unsigned DefIdx, unsigned UseIdx)
Add a tie between the register operands at DefIdx and UseIdx.
LLVM_ABI const MachineFunction * getMF() const
Return the function that contains the basic block that this instruction belongs to.
const DebugLoc & getDebugLoc() const
Returns the debug location id of this MachineInstr.
const MachineOperand & getOperand(unsigned i) const
LocationSize getSize() const
Return the size in bytes of the memory reference.
unsigned getAddrSpace() const
@ MOLoad
The memory access reads data.
@ MOStore
The memory access writes data.
const MachinePointerInfo & getPointerInfo() const
Flags getFlags() const
Return the raw flags of the source value,.
const Value * getValue() const
Return the base address of the memory access.
Align getBaseAlign() const
Return the minimum known alignment in bytes of the base address, without the offset.
MachineOperand class - Representation of each machine instruction operand.
unsigned getSubReg() const
const ConstantInt * getCImm() const
void setImm(int64_t immVal)
int64_t getImm() const
bool isReg() const
isReg - Tests if this is a MO_Register operand.
ArrayRef< int > getShuffleMask() const
LLVM_ABI void setReg(Register Reg)
Change the register this operand corresponds to.
bool isImm() const
isImm - Tests if this is a MO_Immediate operand.
MachineInstr * getParent()
getParent - Return the instruction that this operand belongs to.
static MachineOperand CreateImm(int64_t Val)
bool isEarlyClobber() const
Register getReg() const
getReg - Returns the register number.
bool isInternalRead() const
static MachineOperand CreateReg(Register Reg, bool isDef, bool isImp=false, bool isKill=false, bool isDead=false, bool isUndef=false, bool isEarlyClobber=false, unsigned SubReg=0, bool isDebug=false, bool isInternalRead=false, bool isRenamable=false)
MachineRegisterInfo - Keep track of information for virtual and physical registers,...
LLVM_ABI MachineInstr * getVRegDef(Register Reg) const
getVRegDef - Return the machine instr that defines the specified virtual register or null if none is ...
LLVM_ABI Register createVirtualRegister(const TargetRegisterClass *RegClass, StringRef Name="")
createVirtualRegister - Create and return a new virtual register in the function with the specified r...
LLT getType(Register Reg) const
Get the low-level type of Reg or LLT{} if Reg is not a generic (target independent) virtual register.
const RegisterBank * getRegBankOrNull(Register Reg) const
Return the register bank of Reg, or null if Reg has not been assigned a register bank or has been ass...
LLVM_ABI Register cloneVirtualRegister(Register VReg, StringRef Name="")
Create and return a new virtual register in the function with the same attributes as the given regist...
LLVM_ABI MachineInstr * getUniqueVRegDef(Register Reg) const
getUniqueVRegDef - Return the unique machine instr that defines the specified virtual register or nul...
static LLVM_ABI PointerType * get(Type *ElementType, unsigned AddressSpace)
This constructs a pointer to an object of the specified type in a numbered address space.
static LLVM_ABI PoisonValue * get(Type *T)
Static factory methods - Return an 'poison' object of the specified type.
Analysis providing profile information.
const RegisterBank & getRegBank(unsigned ID)
Get the register bank identified by ID.
This class implements the register bank concept.
unsigned getID() const
Get the identifier of this register bank.
Wrapper class representing virtual and physical registers.
Definition Register.h:20
constexpr bool isPhysical() const
Return true if the specified register number is in the physical register namespace.
Definition Register.h:83
static unsigned getMaxMUBUFImmOffset(const GCNSubtarget &ST)
static unsigned getDSShaderTypeValue(const MachineFunction &MF)
static unsigned getSubRegFromChannel(unsigned Channel, unsigned NumRegs=1)
This class consists of common code factored out of the SmallVector class to reduce code duplication b...
void push_back(const T &Elt)
This is a 'vector' (really, a variable-sized array), optimized for the case when the array is small.
static bool isGenericOpcode(unsigned Opc)
unsigned getID() const
Return the register class ID number.
bool hasSubClassEq(const TargetRegisterClass *RC) const
Returns true if RC is a sub-class of or equal to this class.
bool hasSuperClassEq(const TargetRegisterClass *RC) const
Returns true if RC is a super-class of or equal to this class.
TargetRegisterInfo base class - We assume that the target defines a static array of TargetRegisterDes...
static LLVM_ABI IntegerType * getInt32Ty(LLVMContext &C)
Definition Type.cpp:313
#define llvm_unreachable(msg)
Marks that the current location is not supposed to be reachable.
@ CONSTANT_ADDRESS_32BIT
Address space for 32-bit constant memory.
@ REGION_ADDRESS
Address space for region memory. (GDS)
@ LOCAL_ADDRESS
Address space for local memory.
@ GLOBAL_ADDRESS
Address space for global memory (RAT0, VTX0).
@ PRIVATE_ADDRESS
Address space for private memory.
@ BUFFER_RESOURCE
Address space for 128-bit buffer resources.
constexpr char Align[]
Key for Kernel::Arg::Metadata::mAlign.
constexpr char SymbolName[]
Key for Kernel::Metadata::mSymbolName.
LLVM_READONLY const MIMGG16MappingInfo * getMIMGG16MappingInfo(unsigned G)
int getMIMGOpcode(unsigned BaseOpcode, unsigned MIMGEncoding, unsigned VDataDwords, unsigned VAddrDwords)
std::optional< int64_t > getSMRDEncodedLiteralOffset32(const MCSubtargetInfo &ST, int64_t ByteOffset)
bool isGFX12Plus(const MCSubtargetInfo &STI)
constexpr int64_t getNullPointerValue(unsigned AS)
Get the null pointer value for the given address space.
LLVM_READONLY bool hasNamedOperand(uint64_t Opcode, OpName NamedIdx)
bool isInlinableLiteral32(int32_t Literal, bool HasInv2Pi)
bool hasSMRDSignedImmOffset(const MCSubtargetInfo &ST)
LLVM_READONLY int32_t getGlobalSaddrOp(uint32_t Opcode)
bool isGFX11Plus(const MCSubtargetInfo &STI)
bool isGFX10Plus(const MCSubtargetInfo &STI)
std::optional< int64_t > getSMRDEncodedOffset(const MCSubtargetInfo &ST, int64_t ByteOffset, bool IsBuffer, bool HasSOffset)
unsigned getRegBitWidth(const TargetRegisterClass &RC)
Get the size in bits of a register from the register class RC.
LLVM_READONLY const MIMGDimInfo * getMIMGDimInfo(unsigned DimEnum)
LLVM_READONLY const MIMGBaseOpcodeInfo * getMIMGBaseOpcodeInfo(unsigned BaseOpcode)
Intrinsic::ID getIntrinsicID(const MachineInstr &I)
Return the intrinsic ID for opcodes with the G_AMDGPU_INTRIN_ prefix.
std::pair< Register, unsigned > getBaseWithConstantOffset(MachineRegisterInfo &MRI, Register Reg, GISelValueTracking *ValueTracking=nullptr, bool CheckNUW=false)
Returns base register and constant offset.
const ImageDimIntrinsicInfo * getImageDimIntrinsicInfo(unsigned Intr)
IndexMode
ARM Index Modes.
constexpr std::underlying_type_t< E > Mask()
Get a bitmask with 1s in all places up to the high-order bit of E's largest value.
LLVM_ABI Function * getOrInsertDeclaration(Module *M, ID id, ArrayRef< Type * > Tys={})
Look up the Function declaration of the intrinsic id in the Module M.
operand_type_match m_Reg()
SpecificConstantMatch m_SpecificICst(const APInt &RequestedValue)
Matches a constant equal to RequestedValue.
GCstAndRegMatch m_GCst(std::optional< ValueAndVReg > &ValReg)
UnaryOp_match< SrcTy, TargetOpcode::COPY > m_Copy(SrcTy &&Src)
UnaryOp_match< SrcTy, TargetOpcode::G_ZEXT > m_GZExt(const SrcTy &Src)
BinaryOp_match< LHS, RHS, TargetOpcode::G_XOR, true > m_GXor(const LHS &L, const RHS &R)
UnaryOp_match< SrcTy, TargetOpcode::G_SEXT > m_GSExt(const SrcTy &Src)
UnaryOp_match< SrcTy, TargetOpcode::G_FPEXT > m_GFPExt(const SrcTy &Src)
SpecificConstantMatch m_ZeroInt()
Convenience matchers for specific integer values.
ConstantMatch< APInt > m_ICst(APInt &Cst)
SpecificConstantMatch m_AllOnesInt()
BinaryOp_match< LHS, RHS, TargetOpcode::G_OR, true > m_GOr(const LHS &L, const RHS &R)
ICstOrSplatMatch< APInt > m_ICstOrSplat(APInt &Cst)
ImplicitDefMatch m_GImplicitDef()
BinaryOp_match< SrcTy, SpecificConstantMatch, TargetOpcode::G_XOR, true > m_Not(const SrcTy &&Src)
Matches a register not-ed by a G_XOR.
BinaryOp_match< LHS, RHS, TargetOpcode::G_ASHR, false > m_GAShr(const LHS &L, const RHS &R)
bool mi_match(Reg R, const MachineRegisterInfo &MRI, Pattern &&P)
BinaryOp_match< LHS, RHS, TargetOpcode::G_PTR_ADD, false > m_GPtrAdd(const LHS &L, const RHS &R)
SpecificRegisterMatch m_SpecificReg(Register RequestedReg)
Matches a register only if it is equal to RequestedReg.
BinaryOp_match< LHS, RHS, TargetOpcode::G_SHL, false > m_GShl(const LHS &L, const RHS &R)
Or< Preds... > m_any_of(Preds &&... preds)
BinaryOp_match< LHS, RHS, TargetOpcode::G_AND, true > m_GAnd(const LHS &L, const RHS &R)
UnaryOp_match< SrcTy, TargetOpcode::G_BITCAST > m_GBitcast(const SrcTy &Src)
bind_ty< MachineInstr * > m_MInstr(MachineInstr *&MI)
UnaryOp_match< SrcTy, TargetOpcode::G_FNEG > m_GFNeg(const SrcTy &Src)
GFCstOrSplatGFCstMatch m_GFCstOrSplat(std::optional< FPValueAndVReg > &FPValReg)
UnaryOp_match< SrcTy, TargetOpcode::G_FABS > m_GFabs(const SrcTy &Src)
BinaryOp_match< LHS, RHS, TargetOpcode::G_LSHR, false > m_GLShr(const LHS &L, const RHS &R)
UnaryOp_match< SrcTy, TargetOpcode::G_ANYEXT > m_GAnyExt(const SrcTy &Src)
OneUse_match< SubPat > m_OneUse(const SubPat &SP)
BinaryOp_match< LHS, RHS, TargetOpcode::G_MUL, true > m_GMul(const LHS &L, const RHS &R)
UnaryOp_match< SrcTy, TargetOpcode::G_TRUNC > m_GTrunc(const SrcTy &Src)
class_match< BinaryOperator > m_BinOp()
Match an arbitrary binary operation and ignore it.
NodeAddr< DefNode * > Def
Definition RDFGraph.h:384
friend class Instruction
Iterator for Instructions in a `BasicBlock.
Definition BasicBlock.h:73
This is an optimization pass for GlobalISel generic memory operations.
LLVM_ABI Register getFunctionLiveInPhysReg(MachineFunction &MF, const TargetInstrInfo &TII, MCRegister PhysReg, const TargetRegisterClass &RC, const DebugLoc &DL, LLT RegTy=LLT())
Return a virtual register corresponding to the incoming argument register PhysReg.
Definition Utils.cpp:916
@ Offset
Definition DWP.cpp:532
FunctionAddr VTableAddr Value
Definition InstrProf.h:137
LLVM_ABI bool isBuildVectorAllZeros(const MachineInstr &MI, const MachineRegisterInfo &MRI, bool AllowUndef=false)
Return true if the specified instruction is a G_BUILD_VECTOR or G_BUILD_VECTOR_TRUNC where all of the...
Definition Utils.cpp:1482
LLVM_ABI Register constrainOperandRegClass(const MachineFunction &MF, const TargetRegisterInfo &TRI, MachineRegisterInfo &MRI, const TargetInstrInfo &TII, const RegisterBankInfo &RBI, MachineInstr &InsertPt, const TargetRegisterClass &RegClass, MachineOperand &RegMO)
Constrain the Register operand OpIdx, so that it is now constrained to the TargetRegisterClass passed...
Definition Utils.cpp:56
LLVM_ABI MachineInstr * getOpcodeDef(unsigned Opcode, Register Reg, const MachineRegisterInfo &MRI)
See if Reg is defined by an single def instruction that is Opcode.
Definition Utils.cpp:652
PointerUnion< const TargetRegisterClass *, const RegisterBank * > RegClassOrRegBank
Convenient type to represent either a register class or a register bank.
LLVM_ABI const ConstantFP * getConstantFPVRegVal(Register VReg, const MachineRegisterInfo &MRI)
Definition Utils.cpp:460
MachineInstrBuilder BuildMI(MachineFunction &MF, const MIMetadata &MIMD, const MCInstrDesc &MCID)
Builder interface. Specify how to create the initial instruction itself.
LLVM_ABI std::optional< APInt > getIConstantVRegVal(Register VReg, const MachineRegisterInfo &MRI)
If VReg is defined by a G_CONSTANT, return the corresponding value.
Definition Utils.cpp:293
constexpr bool isInt(int64_t x)
Checks if an integer fits into the given bit width.
Definition MathExtras.h:165
@ Implicit
Not emitted register (e.g. carry, or temporary result).
@ Dead
Unused definition.
@ Kill
The last use of a register.
decltype(auto) dyn_cast(const From &Val)
dyn_cast<X> - Return the argument parameter cast to the specified type.
Definition Casting.h:643
LLVM_ABI void constrainSelectedInstRegOperands(MachineInstr &I, const TargetInstrInfo &TII, const TargetRegisterInfo &TRI, const RegisterBankInfo &RBI)
Mutate the newly-selected instruction I to constrain its (possibly generic) virtual register operands...
Definition Utils.cpp:155
constexpr bool isPowerOf2_64(uint64_t Value)
Return true if the argument is a power of two > 0 (64 bit edition.)
Definition MathExtras.h:284
LLVM_ABI MachineInstr * getDefIgnoringCopies(Register Reg, const MachineRegisterInfo &MRI)
Find the def instruction for Reg, folding away any trivial copies.
Definition Utils.cpp:493
constexpr int popcount(T Value) noexcept
Count the number of set bits in a value.
Definition bit.h:154
unsigned Log2_64(uint64_t Value)
Return the floor log base 2 of the specified value, -1 if the value is zero.
Definition MathExtras.h:337
LLVM_ABI std::optional< int64_t > getIConstantVRegSExtVal(Register VReg, const MachineRegisterInfo &MRI)
If VReg is defined by a G_CONSTANT fits in int64_t returns it.
Definition Utils.cpp:313
MachineInstr * getImm(const MachineOperand &MO, const MachineRegisterInfo *MRI)
constexpr uint32_t Hi_32(uint64_t Value)
Return the high 32 bits of a 64 bit value.
Definition MathExtras.h:150
LLVM_ABI raw_ostream & dbgs()
dbgs() - This returns a reference to a raw_ostream for debugging messages.
Definition Debug.cpp:207
LLVM_ABI std::optional< ValueAndVReg > getAnyConstantVRegValWithLookThrough(Register VReg, const MachineRegisterInfo &MRI, bool LookThroughInstrs=true, bool LookThroughAnyExt=false)
If VReg is defined by a statically evaluable chain of instructions rooted on a G_CONSTANT or G_FCONST...
Definition Utils.cpp:438
constexpr bool isUInt(uint64_t x)
Checks if an unsigned integer fits into the given bit width.
Definition MathExtras.h:189
class LLVM_GSL_OWNER SmallVector
Forward declaration of SmallVector so that calculateSmallVectorDefaultInlinedElements can reference s...
constexpr uint32_t Lo_32(uint64_t Value)
Return the low 32 bits of a 64 bit value.
Definition MathExtras.h:155
bool isa(const From &Val)
isa<X> - Return true if the parameter to the template is an instance of one of the template type argu...
Definition Casting.h:547
LLVM_ATTRIBUTE_VISIBILITY_DEFAULT AnalysisKey InnerAnalysisManagerProxy< AnalysisManagerT, IRUnitT, ExtraArgTs... >::Key
FunctionAddr VTableAddr uintptr_t uintptr_t Data
Definition InstrProf.h:221
@ Or
Bitwise or logical OR of integers.
@ Mul
Product of integers.
@ SMax
Signed integer max implemented in terms of select(cmp()).
@ And
Bitwise or logical AND of integers.
@ Sub
Subtraction of integers.
@ Add
Sum of integers.
DWARFExpression::Operation Op
decltype(auto) cast(const From &Val)
cast<X> - Return the argument parameter cast to the specified type.
Definition Casting.h:559
LLVM_ABI std::optional< ValueAndVReg > getIConstantVRegValWithLookThrough(Register VReg, const MachineRegisterInfo &MRI, bool LookThroughInstrs=true)
If VReg is defined by a statically evaluable chain of instructions rooted on a G_CONSTANT returns its...
Definition Utils.cpp:432
LLVM_ABI std::optional< DefinitionAndSourceRegister > getDefSrcRegIgnoringCopies(Register Reg, const MachineRegisterInfo &MRI)
Find the def instruction for Reg, and underlying value Register folding away any copies.
Definition Utils.cpp:468
LLVM_ABI Register getSrcRegIgnoringCopies(Register Reg, const MachineRegisterInfo &MRI)
Find the source register for Reg, folding away any trivial copies.
Definition Utils.cpp:500
constexpr T maskTrailingOnes(unsigned N)
Create a bitmask with the N right-most bits set to 1, and all other bits set to 0.
Definition MathExtras.h:77
@ Default
The result values are uniform if and only if all operands are uniform.
Definition Uniformity.h:20
constexpr RegState getUndefRegState(bool B)
static KnownBits makeConstant(const APInt &C)
Create known bits from a known constant.
Definition KnownBits.h:317
static KnownBits add(const KnownBits &LHS, const KnownBits &RHS, bool NSW=false, bool NUW=false, bool SelfAdd=false)
Compute knownbits resulting from addition of LHS and RHS.
Definition KnownBits.h:363
int64_t Offset
Offset - This is an offset from the base Value*.
PointerUnion< const Value *, const PseudoSourceValue * > V
This is the IR pointer value for the access, or it is null if unknown.