AMDGPUInstructionSelector.cpp
1//===- AMDGPUInstructionSelector.cpp ----------------------------*- C++ -*-==//
2//
3// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4// See https://llvm.org/LICENSE.txt for license information.
5// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6//
7//===----------------------------------------------------------------------===//
8/// \file
9/// This file implements the targeting of the InstructionSelector class for
10/// AMDGPU.
11/// \todo This should be generated by TableGen.
12//===----------------------------------------------------------------------===//
13
15#include "AMDGPU.h"
17#include "AMDGPUInstrInfo.h"
19#include "AMDGPUTargetMachine.h"
29#include "llvm/IR/IntrinsicsAMDGPU.h"
30#include <optional>
31
32#define DEBUG_TYPE "amdgpu-isel"
33
34using namespace llvm;
35using namespace MIPatternMatch;
36
37#define GET_GLOBALISEL_IMPL
38#define AMDGPUSubtarget GCNSubtarget
39#include "AMDGPUGenGlobalISel.inc"
40#undef GET_GLOBALISEL_IMPL
41#undef AMDGPUSubtarget
42
43AMDGPUInstructionSelector::AMDGPUInstructionSelector(
44    const GCNSubtarget &STI, const AMDGPURegisterBankInfo &RBI,
45    const AMDGPUTargetMachine &TM)
46    : TII(*STI.getInstrInfo()), TRI(*STI.getRegisterInfo()), RBI(RBI), TM(TM),
47      STI(STI),
48#define GET_GLOBALISEL_PREDICATES_INIT
49#include "AMDGPUGenGlobalISel.inc"
50#undef GET_GLOBALISEL_PREDICATES_INIT
51#define GET_GLOBALISEL_TEMPORARIES_INIT
52#include "AMDGPUGenGlobalISel.inc"
53#undef GET_GLOBALISEL_TEMPORARIES_INIT
54{
55}
56
57const char *AMDGPUInstructionSelector::getName() { return DEBUG_TYPE; }
58
69
70// Return the wave level SGPR base address if this is a wave address.
71static Register getWaveAddress(const MachineInstr *Def) {
72  return Def->getOpcode() == AMDGPU::G_AMDGPU_WAVE_ADDRESS
73 ? Def->getOperand(1).getReg()
74 : Register();
75}
76
77bool AMDGPUInstructionSelector::isVCC(Register Reg,
78 const MachineRegisterInfo &MRI) const {
79 // The verifier is oblivious to s1 being a valid value for wavesize registers.
80 if (Reg.isPhysical())
81 return false;
82
83 auto &RegClassOrBank = MRI.getRegClassOrRegBank(Reg);
84  const TargetRegisterClass *RC =
85      dyn_cast_if_present<const TargetRegisterClass *>(RegClassOrBank);
86  if (RC) {
87 const LLT Ty = MRI.getType(Reg);
88 if (!Ty.isValid() || Ty.getSizeInBits() != 1)
89 return false;
90 // G_TRUNC s1 result is never vcc.
91 return MRI.getVRegDef(Reg)->getOpcode() != AMDGPU::G_TRUNC &&
92 RC->hasSuperClassEq(TRI.getBoolRC());
93 }
94
95 const RegisterBank *RB = cast<const RegisterBank *>(RegClassOrBank);
96 return RB->getID() == AMDGPU::VCCRegBankID;
97}
98
99bool AMDGPUInstructionSelector::constrainCopyLikeIntrin(MachineInstr &MI,
100 unsigned NewOpc) const {
101 MI.setDesc(TII.get(NewOpc));
102 MI.removeOperand(1); // Remove intrinsic ID.
103 MI.addOperand(*MF, MachineOperand::CreateReg(AMDGPU::EXEC, false, true));
104
105 MachineOperand &Dst = MI.getOperand(0);
106 MachineOperand &Src = MI.getOperand(1);
107
108 // TODO: This should be legalized to s32 if needed
109 if (MRI->getType(Dst.getReg()) == LLT::scalar(1))
110 return false;
111
112 const TargetRegisterClass *DstRC
113 = TRI.getConstrainedRegClassForOperand(Dst, *MRI);
114 const TargetRegisterClass *SrcRC
115 = TRI.getConstrainedRegClassForOperand(Src, *MRI);
116 if (!DstRC || DstRC != SrcRC)
117 return false;
118
119 if (!RBI.constrainGenericRegister(Dst.getReg(), *DstRC, *MRI) ||
120 !RBI.constrainGenericRegister(Src.getReg(), *SrcRC, *MRI))
121 return false;
122 const MCInstrDesc &MCID = MI.getDesc();
123 if (MCID.getOperandConstraint(0, MCOI::EARLY_CLOBBER) != -1) {
124 MI.getOperand(0).setIsEarlyClobber(true);
125 }
126 return true;
127}
128
129bool AMDGPUInstructionSelector::selectCOPY(MachineInstr &I) const {
130 const DebugLoc &DL = I.getDebugLoc();
131 MachineBasicBlock *BB = I.getParent();
132 I.setDesc(TII.get(TargetOpcode::COPY));
133
134 const MachineOperand &Src = I.getOperand(1);
135 MachineOperand &Dst = I.getOperand(0);
136 Register DstReg = Dst.getReg();
137 Register SrcReg = Src.getReg();
138
139 if (isVCC(DstReg, *MRI)) {
140 if (SrcReg == AMDGPU::SCC) {
141 const TargetRegisterClass *RC
142 = TRI.getConstrainedRegClassForOperand(Dst, *MRI);
143 if (!RC)
144 return true;
145 return RBI.constrainGenericRegister(DstReg, *RC, *MRI);
146 }
147
148 if (!isVCC(SrcReg, *MRI)) {
149 // TODO: Should probably leave the copy and let copyPhysReg expand it.
150 if (!RBI.constrainGenericRegister(DstReg, *TRI.getBoolRC(), *MRI))
151 return false;
152
153 const TargetRegisterClass *SrcRC
154 = TRI.getConstrainedRegClassForOperand(Src, *MRI);
155
156 std::optional<ValueAndVReg> ConstVal =
157 getIConstantVRegValWithLookThrough(SrcReg, *MRI, true);
158 if (ConstVal) {
159 unsigned MovOpc =
160 STI.isWave64() ? AMDGPU::S_MOV_B64 : AMDGPU::S_MOV_B32;
161 BuildMI(*BB, &I, DL, TII.get(MovOpc), DstReg)
162 .addImm(ConstVal->Value.getBoolValue() ? -1 : 0);
163 } else {
164 Register MaskedReg = MRI->createVirtualRegister(SrcRC);
165
166 // We can't trust the high bits at this point, so clear them.
167
168 // TODO: Skip masking high bits if def is known boolean.
169
170 if (AMDGPU::getRegBitWidth(SrcRC->getID()) == 16) {
171 assert(Subtarget->useRealTrue16Insts());
172 const int64_t NoMods = 0;
173 BuildMI(*BB, &I, DL, TII.get(AMDGPU::V_AND_B16_t16_e64), MaskedReg)
174 .addImm(NoMods)
175 .addImm(1)
176 .addImm(NoMods)
177 .addReg(SrcReg)
178 .addImm(NoMods);
179 BuildMI(*BB, &I, DL, TII.get(AMDGPU::V_CMP_NE_U16_t16_e64), DstReg)
180 .addImm(NoMods)
181 .addImm(0)
182 .addImm(NoMods)
183 .addReg(MaskedReg)
184 .addImm(NoMods);
185 } else {
186 bool IsSGPR = TRI.isSGPRClass(SrcRC);
187 unsigned AndOpc = IsSGPR ? AMDGPU::S_AND_B32 : AMDGPU::V_AND_B32_e32;
188 auto And = BuildMI(*BB, &I, DL, TII.get(AndOpc), MaskedReg)
189 .addImm(1)
190 .addReg(SrcReg);
191 if (IsSGPR)
192 And.setOperandDead(3); // Dead scc
193
194 BuildMI(*BB, &I, DL, TII.get(AMDGPU::V_CMP_NE_U32_e64), DstReg)
195 .addImm(0)
196 .addReg(MaskedReg);
197 }
198 }
199
200 if (!MRI->getRegClassOrNull(SrcReg))
201 MRI->setRegClass(SrcReg, SrcRC);
202 I.eraseFromParent();
203 return true;
204 }
205
206 const TargetRegisterClass *RC =
207 TRI.getConstrainedRegClassForOperand(Dst, *MRI);
208 if (RC && !RBI.constrainGenericRegister(DstReg, *RC, *MRI))
209 return false;
210
211 return true;
212 }
213
214 for (const MachineOperand &MO : I.operands()) {
215 if (MO.getReg().isPhysical())
216 continue;
217
218 const TargetRegisterClass *RC =
219 TRI.getConstrainedRegClassForOperand(MO, *MRI);
220 if (!RC)
221 continue;
222 RBI.constrainGenericRegister(MO.getReg(), *RC, *MRI);
223 }
224 return true;
225}
226
227bool AMDGPUInstructionSelector::selectCOPY_SCC_VCC(MachineInstr &I) const {
228 const DebugLoc &DL = I.getDebugLoc();
229 MachineBasicBlock *BB = I.getParent();
230 Register VCCReg = I.getOperand(1).getReg();
231 MachineInstr *Cmp;
232
233 // Set SCC as a side effect with S_CMP or S_OR.
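  // Both forms set SCC = (vcc != 0): S_CMP_LG compares the mask against zero,
  // and S_OR of the mask with itself sets SCC from the unchanged result.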
234 if (STI.hasScalarCompareEq64()) {
235 unsigned CmpOpc =
236 STI.isWave64() ? AMDGPU::S_CMP_LG_U64 : AMDGPU::S_CMP_LG_U32;
237 Cmp = BuildMI(*BB, &I, DL, TII.get(CmpOpc)).addReg(VCCReg).addImm(0);
238 } else {
239 Register DeadDst = MRI->createVirtualRegister(&AMDGPU::SReg_64RegClass);
240 Cmp = BuildMI(*BB, &I, DL, TII.get(AMDGPU::S_OR_B64), DeadDst)
241 .addReg(VCCReg)
242 .addReg(VCCReg);
243 }
244
245 constrainSelectedInstRegOperands(*Cmp, TII, TRI, RBI);
246
247 Register DstReg = I.getOperand(0).getReg();
248 BuildMI(*BB, &I, DL, TII.get(AMDGPU::COPY), DstReg).addReg(AMDGPU::SCC);
249
250 I.eraseFromParent();
251 return RBI.constrainGenericRegister(DstReg, AMDGPU::SReg_32RegClass, *MRI);
252}
253
254bool AMDGPUInstructionSelector::selectCOPY_VCC_SCC(MachineInstr &I) const {
255 const DebugLoc &DL = I.getDebugLoc();
256 MachineBasicBlock *BB = I.getParent();
257
258 Register DstReg = I.getOperand(0).getReg();
259 Register SrcReg = I.getOperand(1).getReg();
260 std::optional<ValueAndVReg> Arg =
261 getIConstantVRegValWithLookThrough(I.getOperand(1).getReg(), *MRI);
262
263 if (Arg) {
264 const int64_t Value = Arg->Value.getZExtValue();
265 if (Value == 0) {
266 unsigned Opcode = STI.isWave64() ? AMDGPU::S_MOV_B64 : AMDGPU::S_MOV_B32;
267 BuildMI(*BB, &I, DL, TII.get(Opcode), DstReg).addImm(0);
268 } else {
269 assert(Value == 1);
270 BuildMI(*BB, &I, DL, TII.get(AMDGPU::COPY), DstReg).addReg(TRI.getExec());
271 }
272 I.eraseFromParent();
273 return RBI.constrainGenericRegister(DstReg, *TRI.getBoolRC(), *MRI);
274 }
275
276 // RegBankLegalize ensures that SrcReg is bool in reg (high bits are 0).
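  // With the bool in SCC, S_CSELECT then picks exec (all active lanes set) for
  // a true input and 0 otherwise, producing the wave-level lane mask.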
277 BuildMI(*BB, &I, DL, TII.get(AMDGPU::COPY), AMDGPU::SCC).addReg(SrcReg);
278
279 unsigned SelectOpcode =
280 STI.isWave64() ? AMDGPU::S_CSELECT_B64 : AMDGPU::S_CSELECT_B32;
281 MachineInstr *Select = BuildMI(*BB, &I, DL, TII.get(SelectOpcode), DstReg)
282 .addReg(TRI.getExec())
283 .addImm(0);
284
285 I.eraseFromParent();
287 return true;
288}
289
290bool AMDGPUInstructionSelector::selectReadAnyLane(MachineInstr &I) const {
291 Register DstReg = I.getOperand(0).getReg();
292 Register SrcReg = I.getOperand(1).getReg();
293
294 const DebugLoc &DL = I.getDebugLoc();
295 MachineBasicBlock *BB = I.getParent();
296
297 auto RFL = BuildMI(*BB, &I, DL, TII.get(AMDGPU::V_READFIRSTLANE_B32), DstReg)
298 .addReg(SrcReg);
299
300 I.eraseFromParent();
301 constrainSelectedInstRegOperands(*RFL, TII, TRI, RBI);
302 return true;
303}
304
305bool AMDGPUInstructionSelector::selectPHI(MachineInstr &I) const {
306 const Register DefReg = I.getOperand(0).getReg();
307 const LLT DefTy = MRI->getType(DefReg);
308
309 // S1 G_PHIs should not be selected in instruction-select, instead:
310 // - divergent S1 G_PHI should go through lane mask merging algorithm
311 // and be fully inst-selected in AMDGPUGlobalISelDivergenceLowering
312 // - uniform S1 G_PHI should be lowered into S32 G_PHI in AMDGPURegBankSelect
313 if (DefTy == LLT::scalar(1))
314 return false;
315
316 // TODO: Verify this doesn't have insane operands (i.e. VGPR to SGPR copy)
317
318 const RegClassOrRegBank &RegClassOrBank =
319 MRI->getRegClassOrRegBank(DefReg);
320
321  const TargetRegisterClass *DefRC =
322      dyn_cast_if_present<const TargetRegisterClass *>(RegClassOrBank);
323  if (!DefRC) {
324 if (!DefTy.isValid()) {
325 LLVM_DEBUG(dbgs() << "PHI operand has no type, not a gvreg?\n");
326 return false;
327 }
328
329 const RegisterBank &RB = *cast<const RegisterBank *>(RegClassOrBank);
330 DefRC = TRI.getRegClassForTypeOnBank(DefTy, RB);
331 if (!DefRC) {
332 LLVM_DEBUG(dbgs() << "PHI operand has unexpected size/bank\n");
333 return false;
334 }
335 }
336
337 // If inputs have register bank, assign corresponding reg class.
338 // Note: registers don't need to have the same reg bank.
339 for (unsigned i = 1; i != I.getNumOperands(); i += 2) {
340 const Register SrcReg = I.getOperand(i).getReg();
341
342 const RegisterBank *RB = MRI->getRegBankOrNull(SrcReg);
343 if (RB) {
344 const LLT SrcTy = MRI->getType(SrcReg);
345 const TargetRegisterClass *SrcRC =
346 TRI.getRegClassForTypeOnBank(SrcTy, *RB);
347 if (!RBI.constrainGenericRegister(SrcReg, *SrcRC, *MRI))
348 return false;
349 }
350 }
351
352 I.setDesc(TII.get(TargetOpcode::PHI));
353 return RBI.constrainGenericRegister(DefReg, *DefRC, *MRI);
354}
355
356MachineOperand
357AMDGPUInstructionSelector::getSubOperand64(MachineOperand &MO,
358 const TargetRegisterClass &SubRC,
359 unsigned SubIdx) const {
360
361 MachineInstr *MI = MO.getParent();
362 MachineBasicBlock *BB = MO.getParent()->getParent();
363 Register DstReg = MRI->createVirtualRegister(&SubRC);
364
365 if (MO.isReg()) {
366 unsigned ComposedSubIdx = TRI.composeSubRegIndices(MO.getSubReg(), SubIdx);
367 Register Reg = MO.getReg();
368 BuildMI(*BB, MI, MI->getDebugLoc(), TII.get(AMDGPU::COPY), DstReg)
369 .addReg(Reg, {}, ComposedSubIdx);
370
371 return MachineOperand::CreateReg(DstReg, MO.isDef(), MO.isImplicit(),
372 MO.isKill(), MO.isDead(), MO.isUndef(),
373 MO.isEarlyClobber(), 0, MO.isDebug(),
374 MO.isInternalRead());
375 }
376
377 assert(MO.isImm());
378
379 APInt Imm(64, MO.getImm());
380
381 switch (SubIdx) {
382 default:
383 llvm_unreachable("do not know to split immediate with this sub index.");
384 case AMDGPU::sub0:
385 return MachineOperand::CreateImm(Imm.getLoBits(32).getSExtValue());
386 case AMDGPU::sub1:
387 return MachineOperand::CreateImm(Imm.getHiBits(32).getSExtValue());
388 }
389}
390
391static unsigned getLogicalBitOpcode(unsigned Opc, bool Is64) {
392 switch (Opc) {
393 case AMDGPU::G_AND:
394 return Is64 ? AMDGPU::S_AND_B64 : AMDGPU::S_AND_B32;
395 case AMDGPU::G_OR:
396 return Is64 ? AMDGPU::S_OR_B64 : AMDGPU::S_OR_B32;
397 case AMDGPU::G_XOR:
398 return Is64 ? AMDGPU::S_XOR_B64 : AMDGPU::S_XOR_B32;
399 default:
400 llvm_unreachable("not a bit op");
401 }
402}
403
404bool AMDGPUInstructionSelector::selectG_AND_OR_XOR(MachineInstr &I) const {
405 Register DstReg = I.getOperand(0).getReg();
406 unsigned Size = RBI.getSizeInBits(DstReg, *MRI, TRI);
407
408 const RegisterBank *DstRB = RBI.getRegBank(DstReg, *MRI, TRI);
409 if (DstRB->getID() != AMDGPU::SGPRRegBankID &&
410 DstRB->getID() != AMDGPU::VCCRegBankID)
411 return false;
412
413 bool Is64 = Size > 32 || (DstRB->getID() == AMDGPU::VCCRegBankID &&
414 STI.isWave64());
415 I.setDesc(TII.get(getLogicalBitOpcode(I.getOpcode(), Is64)));
416
417 // Dead implicit-def of scc
418 I.addOperand(MachineOperand::CreateReg(AMDGPU::SCC, true, // isDef
419 true, // isImp
420 false, // isKill
421 true)); // isDead
422 constrainSelectedInstRegOperands(I, TII, TRI, RBI);
423 return true;
424}
425
426bool AMDGPUInstructionSelector::selectG_ADD_SUB(MachineInstr &I) const {
427 MachineBasicBlock *BB = I.getParent();
428 MachineFunction *MF = BB->getParent();
429 Register DstReg = I.getOperand(0).getReg();
430 const DebugLoc &DL = I.getDebugLoc();
431 LLT Ty = MRI->getType(DstReg);
432 if (Ty.isVector())
433 return false;
434
435 unsigned Size = Ty.getSizeInBits();
436 const RegisterBank *DstRB = RBI.getRegBank(DstReg, *MRI, TRI);
437 const bool IsSALU = DstRB->getID() == AMDGPU::SGPRRegBankID;
438 const bool Sub = I.getOpcode() == TargetOpcode::G_SUB;
439
440 if (Size == 32) {
441 if (IsSALU) {
442 const unsigned Opc = Sub ? AMDGPU::S_SUB_U32 : AMDGPU::S_ADD_U32;
443 MachineInstr *Add =
444 BuildMI(*BB, &I, DL, TII.get(Opc), DstReg)
445 .add(I.getOperand(1))
446 .add(I.getOperand(2))
447 .setOperandDead(3); // Dead scc
448 I.eraseFromParent();
449 constrainSelectedInstRegOperands(*Add, TII, TRI, RBI);
450 return true;
451 }
452
453 if (STI.hasAddNoCarryInsts()) {
454 const unsigned Opc = Sub ? AMDGPU::V_SUB_U32_e64 : AMDGPU::V_ADD_U32_e64;
455 I.setDesc(TII.get(Opc));
456 I.addOperand(*MF, MachineOperand::CreateImm(0));
457 I.addOperand(*MF, MachineOperand::CreateReg(AMDGPU::EXEC, false, true));
458 constrainSelectedInstRegOperands(I, TII, TRI, RBI);
459 return true;
460 }
461
462 const unsigned Opc = Sub ? AMDGPU::V_SUB_CO_U32_e64 : AMDGPU::V_ADD_CO_U32_e64;
463
464 Register UnusedCarry = MRI->createVirtualRegister(TRI.getWaveMaskRegClass());
465 MachineInstr *Add
466 = BuildMI(*BB, &I, DL, TII.get(Opc), DstReg)
467 .addDef(UnusedCarry, RegState::Dead)
468 .add(I.getOperand(1))
469 .add(I.getOperand(2))
470 .addImm(0);
471 I.eraseFromParent();
472 constrainSelectedInstRegOperands(*Add, TII, TRI, RBI);
473 return true;
474 }
475
476 assert(!Sub && "illegal sub should not reach here");
477
478 const TargetRegisterClass &RC
479 = IsSALU ? AMDGPU::SReg_64_XEXECRegClass : AMDGPU::VReg_64RegClass;
480 const TargetRegisterClass &HalfRC
481 = IsSALU ? AMDGPU::SReg_32RegClass : AMDGPU::VGPR_32RegClass;
482
483 MachineOperand Lo1(getSubOperand64(I.getOperand(1), HalfRC, AMDGPU::sub0));
484 MachineOperand Lo2(getSubOperand64(I.getOperand(2), HalfRC, AMDGPU::sub0));
485 MachineOperand Hi1(getSubOperand64(I.getOperand(1), HalfRC, AMDGPU::sub1));
486 MachineOperand Hi2(getSubOperand64(I.getOperand(2), HalfRC, AMDGPU::sub1));
487
488 Register DstLo = MRI->createVirtualRegister(&HalfRC);
489 Register DstHi = MRI->createVirtualRegister(&HalfRC);
490
491 if (IsSALU) {
492 BuildMI(*BB, &I, DL, TII.get(AMDGPU::S_ADD_U32), DstLo)
493 .add(Lo1)
494 .add(Lo2);
495 BuildMI(*BB, &I, DL, TII.get(AMDGPU::S_ADDC_U32), DstHi)
496 .add(Hi1)
497 .add(Hi2)
498 .setOperandDead(3); // Dead scc
499 } else {
500 const TargetRegisterClass *CarryRC = TRI.getWaveMaskRegClass();
501 Register CarryReg = MRI->createVirtualRegister(CarryRC);
502 BuildMI(*BB, &I, DL, TII.get(AMDGPU::V_ADD_CO_U32_e64), DstLo)
503 .addDef(CarryReg)
504 .add(Lo1)
505 .add(Lo2)
506 .addImm(0);
507 MachineInstr *Addc = BuildMI(*BB, &I, DL, TII.get(AMDGPU::V_ADDC_U32_e64), DstHi)
508 .addDef(MRI->createVirtualRegister(CarryRC), RegState::Dead)
509 .add(Hi1)
510 .add(Hi2)
511 .addReg(CarryReg, RegState::Kill)
512 .addImm(0);
513
514 constrainSelectedInstRegOperands(*Addc, TII, TRI, RBI);
515 }
516
517 BuildMI(*BB, &I, DL, TII.get(AMDGPU::REG_SEQUENCE), DstReg)
518 .addReg(DstLo)
519 .addImm(AMDGPU::sub0)
520 .addReg(DstHi)
521 .addImm(AMDGPU::sub1);
522
523
524 if (!RBI.constrainGenericRegister(DstReg, RC, *MRI))
525 return false;
526
527 I.eraseFromParent();
528 return true;
529}
530
531bool AMDGPUInstructionSelector::selectG_UADDO_USUBO_UADDE_USUBE(
532 MachineInstr &I) const {
533 MachineBasicBlock *BB = I.getParent();
534 MachineFunction *MF = BB->getParent();
535 const DebugLoc &DL = I.getDebugLoc();
536 Register Dst0Reg = I.getOperand(0).getReg();
537 Register Dst1Reg = I.getOperand(1).getReg();
538 const bool IsAdd = I.getOpcode() == AMDGPU::G_UADDO ||
539 I.getOpcode() == AMDGPU::G_UADDE;
540 const bool HasCarryIn = I.getOpcode() == AMDGPU::G_UADDE ||
541 I.getOpcode() == AMDGPU::G_USUBE;
542
543 if (isVCC(Dst1Reg, *MRI)) {
544 unsigned NoCarryOpc =
545 IsAdd ? AMDGPU::V_ADD_CO_U32_e64 : AMDGPU::V_SUB_CO_U32_e64;
546 unsigned CarryOpc = IsAdd ? AMDGPU::V_ADDC_U32_e64 : AMDGPU::V_SUBB_U32_e64;
547 I.setDesc(TII.get(HasCarryIn ? CarryOpc : NoCarryOpc));
548 I.addOperand(*MF, MachineOperand::CreateReg(AMDGPU::EXEC, false, true));
549 I.addOperand(*MF, MachineOperand::CreateImm(0));
550 constrainSelectedInstRegOperands(I, TII, TRI, RBI);
551 return true;
552 }
553
554 Register Src0Reg = I.getOperand(2).getReg();
555 Register Src1Reg = I.getOperand(3).getReg();
556
557 if (HasCarryIn) {
558 BuildMI(*BB, &I, DL, TII.get(AMDGPU::COPY), AMDGPU::SCC)
559 .addReg(I.getOperand(4).getReg());
560 }
561
562 unsigned NoCarryOpc = IsAdd ? AMDGPU::S_ADD_U32 : AMDGPU::S_SUB_U32;
563 unsigned CarryOpc = IsAdd ? AMDGPU::S_ADDC_U32 : AMDGPU::S_SUBB_U32;
564
565 auto CarryInst = BuildMI(*BB, &I, DL, TII.get(HasCarryIn ? CarryOpc : NoCarryOpc), Dst0Reg)
566 .add(I.getOperand(2))
567 .add(I.getOperand(3));
568
569 if (MRI->use_nodbg_empty(Dst1Reg)) {
570 CarryInst.setOperandDead(3); // Dead scc
571 } else {
572 BuildMI(*BB, &I, DL, TII.get(AMDGPU::COPY), Dst1Reg)
573 .addReg(AMDGPU::SCC);
574 if (!MRI->getRegClassOrNull(Dst1Reg))
575 MRI->setRegClass(Dst1Reg, &AMDGPU::SReg_32RegClass);
576 }
577
578 if (!RBI.constrainGenericRegister(Dst0Reg, AMDGPU::SReg_32RegClass, *MRI) ||
579 !RBI.constrainGenericRegister(Src0Reg, AMDGPU::SReg_32RegClass, *MRI) ||
580 !RBI.constrainGenericRegister(Src1Reg, AMDGPU::SReg_32RegClass, *MRI))
581 return false;
582
583 if (HasCarryIn &&
584 !RBI.constrainGenericRegister(I.getOperand(4).getReg(),
585 AMDGPU::SReg_32RegClass, *MRI))
586 return false;
587
588 I.eraseFromParent();
589 return true;
590}
591
592bool AMDGPUInstructionSelector::selectG_AMDGPU_MAD_64_32(
593 MachineInstr &I) const {
594 MachineBasicBlock *BB = I.getParent();
595 MachineFunction *MF = BB->getParent();
596 const bool IsUnsigned = I.getOpcode() == AMDGPU::G_AMDGPU_MAD_U64_U32;
597 bool UseNoCarry = Subtarget->hasMadNC64_32Insts() &&
598 MRI->use_nodbg_empty(I.getOperand(1).getReg());
599
600 unsigned Opc;
601 if (Subtarget->hasMADIntraFwdBug())
602 Opc = IsUnsigned ? AMDGPU::V_MAD_U64_U32_gfx11_e64
603 : AMDGPU::V_MAD_I64_I32_gfx11_e64;
604 else if (UseNoCarry)
605 Opc = IsUnsigned ? AMDGPU::V_MAD_NC_U64_U32_e64
606 : AMDGPU::V_MAD_NC_I64_I32_e64;
607 else
608 Opc = IsUnsigned ? AMDGPU::V_MAD_U64_U32_e64 : AMDGPU::V_MAD_I64_I32_e64;
609
610 if (UseNoCarry)
611 I.removeOperand(1);
612
613 I.setDesc(TII.get(Opc));
614 I.addOperand(*MF, MachineOperand::CreateImm(0));
615 I.addImplicitDefUseOperands(*MF);
616 I.getOperand(0).setIsEarlyClobber(true);
617 constrainSelectedInstRegOperands(I, TII, TRI, RBI);
618 return true;
619}
620
621// TODO: We should probably legalize these to only using 32-bit results.
622bool AMDGPUInstructionSelector::selectG_EXTRACT(MachineInstr &I) const {
623 MachineBasicBlock *BB = I.getParent();
624 Register DstReg = I.getOperand(0).getReg();
625 Register SrcReg = I.getOperand(1).getReg();
626 LLT DstTy = MRI->getType(DstReg);
627 LLT SrcTy = MRI->getType(SrcReg);
628 const unsigned SrcSize = SrcTy.getSizeInBits();
629 unsigned DstSize = DstTy.getSizeInBits();
630
631 // TODO: Should handle any multiple of 32 offset.
632 unsigned Offset = I.getOperand(2).getImm();
633 if (Offset % 32 != 0 || DstSize > 128)
634 return false;
635
636 // 16-bit operations really use 32-bit registers.
637 // FIXME: Probably should not allow 16-bit G_EXTRACT results.
638 if (DstSize == 16)
639 DstSize = 32;
640
641 const TargetRegisterClass *DstRC =
642 TRI.getConstrainedRegClassForOperand(I.getOperand(0), *MRI);
643 if (!DstRC || !RBI.constrainGenericRegister(DstReg, *DstRC, *MRI))
644 return false;
645
646 const RegisterBank *SrcBank = RBI.getRegBank(SrcReg, *MRI, TRI);
647 const TargetRegisterClass *SrcRC =
648 TRI.getRegClassForSizeOnBank(SrcSize, *SrcBank);
649 if (!SrcRC)
650 return false;
651 unsigned SubReg = SIRegisterInfo::getSubRegFromChannel(Offset / 32,
652 DstSize / 32);
653 SrcRC = TRI.getSubClassWithSubReg(SrcRC, SubReg);
654 if (!SrcRC)
655 return false;
656
657 SrcReg = constrainOperandRegClass(*MF, TRI, *MRI, TII, RBI, I,
658 *SrcRC, I.getOperand(1));
659 const DebugLoc &DL = I.getDebugLoc();
660 BuildMI(*BB, &I, DL, TII.get(TargetOpcode::COPY), DstReg)
661 .addReg(SrcReg, {}, SubReg);
662
663 I.eraseFromParent();
664 return true;
665}
666
667bool AMDGPUInstructionSelector::selectG_MERGE_VALUES(MachineInstr &MI) const {
668 MachineBasicBlock *BB = MI.getParent();
669 Register DstReg = MI.getOperand(0).getReg();
670 LLT DstTy = MRI->getType(DstReg);
671 LLT SrcTy = MRI->getType(MI.getOperand(1).getReg());
672
673 const unsigned SrcSize = SrcTy.getSizeInBits();
674 if (SrcSize < 32)
675 return selectImpl(MI, *CoverageInfo);
676
677 const DebugLoc &DL = MI.getDebugLoc();
678 const RegisterBank *DstBank = RBI.getRegBank(DstReg, *MRI, TRI);
679 const unsigned DstSize = DstTy.getSizeInBits();
680 const TargetRegisterClass *DstRC =
681 TRI.getRegClassForSizeOnBank(DstSize, *DstBank);
682 if (!DstRC)
683 return false;
684
685 ArrayRef<int16_t> SubRegs = TRI.getRegSplitParts(DstRC, SrcSize / 8);
686 MachineInstrBuilder MIB =
687 BuildMI(*BB, &MI, DL, TII.get(TargetOpcode::REG_SEQUENCE), DstReg);
688 for (int I = 0, E = MI.getNumOperands() - 1; I != E; ++I) {
689 MachineOperand &Src = MI.getOperand(I + 1);
690 MIB.addReg(Src.getReg(), getUndefRegState(Src.isUndef()));
691 MIB.addImm(SubRegs[I]);
692
693 const TargetRegisterClass *SrcRC
694 = TRI.getConstrainedRegClassForOperand(Src, *MRI);
695 if (SrcRC && !RBI.constrainGenericRegister(Src.getReg(), *SrcRC, *MRI))
696 return false;
697 }
698
699 if (!RBI.constrainGenericRegister(DstReg, *DstRC, *MRI))
700 return false;
701
702 MI.eraseFromParent();
703 return true;
704}
705
706bool AMDGPUInstructionSelector::selectG_UNMERGE_VALUES(MachineInstr &MI) const {
707 MachineBasicBlock *BB = MI.getParent();
708 const int NumDst = MI.getNumOperands() - 1;
709
710 MachineOperand &Src = MI.getOperand(NumDst);
711
712 Register SrcReg = Src.getReg();
713 Register DstReg0 = MI.getOperand(0).getReg();
714 LLT DstTy = MRI->getType(DstReg0);
715 LLT SrcTy = MRI->getType(SrcReg);
716
717 const unsigned DstSize = DstTy.getSizeInBits();
718 const unsigned SrcSize = SrcTy.getSizeInBits();
719 const DebugLoc &DL = MI.getDebugLoc();
720 const RegisterBank *SrcBank = RBI.getRegBank(SrcReg, *MRI, TRI);
721
722 const TargetRegisterClass *SrcRC =
723 TRI.getRegClassForSizeOnBank(SrcSize, *SrcBank);
724 if (!SrcRC || !RBI.constrainGenericRegister(SrcReg, *SrcRC, *MRI))
725 return false;
726
727 // Note we could have mixed SGPR and VGPR destination banks for an SGPR
728 // source, and this relies on the fact that the same subregister indices are
729 // used for both.
730 ArrayRef<int16_t> SubRegs = TRI.getRegSplitParts(SrcRC, DstSize / 8);
731 for (int I = 0, E = NumDst; I != E; ++I) {
732 MachineOperand &Dst = MI.getOperand(I);
733 // hi16:sreg_32 is not allowed so explicitly shift upper 16-bits.
734 if (SrcBank->getID() == AMDGPU::SGPRRegBankID &&
735 SubRegs[I] == AMDGPU::hi16) {
736 BuildMI(*BB, &MI, DL, TII.get(AMDGPU::S_LSHR_B32), Dst.getReg())
737 .addReg(SrcReg)
738 .addImm(16);
739 } else {
740 BuildMI(*BB, &MI, DL, TII.get(TargetOpcode::COPY), Dst.getReg())
741 .addReg(SrcReg, {}, SubRegs[I]);
742 }
743
744 // Make sure the subregister index is valid for the source register.
745 SrcRC = TRI.getSubClassWithSubReg(SrcRC, SubRegs[I]);
746 if (!SrcRC || !RBI.constrainGenericRegister(SrcReg, *SrcRC, *MRI))
747 return false;
748
749 const TargetRegisterClass *DstRC =
750 TRI.getConstrainedRegClassForOperand(Dst, *MRI);
751 if (DstRC && !RBI.constrainGenericRegister(Dst.getReg(), *DstRC, *MRI))
752 return false;
753 }
754
755 MI.eraseFromParent();
756 return true;
757}
758
759bool AMDGPUInstructionSelector::selectG_BUILD_VECTOR(MachineInstr &MI) const {
760 assert(MI.getOpcode() == AMDGPU::G_BUILD_VECTOR_TRUNC ||
761 MI.getOpcode() == AMDGPU::G_BUILD_VECTOR);
762
763 Register Src0 = MI.getOperand(1).getReg();
764 Register Src1 = MI.getOperand(2).getReg();
765 LLT SrcTy = MRI->getType(Src0);
766 const unsigned SrcSize = SrcTy.getSizeInBits();
767
768 // BUILD_VECTOR with >=32 bits source is handled by MERGE_VALUE.
769 if (MI.getOpcode() == AMDGPU::G_BUILD_VECTOR && SrcSize >= 32) {
770 return selectG_MERGE_VALUES(MI);
771 }
772
773 // Selection logic below is for V2S16 only.
774 // For G_BUILD_VECTOR_TRUNC, additionally check that the operands are s32.
775 Register Dst = MI.getOperand(0).getReg();
776 if (MRI->getType(Dst) != LLT::fixed_vector(2, 16) ||
777 (MI.getOpcode() == AMDGPU::G_BUILD_VECTOR_TRUNC &&
778 SrcTy != LLT::scalar(32)))
779 return selectImpl(MI, *CoverageInfo);
780
781 const RegisterBank *DstBank = RBI.getRegBank(Dst, *MRI, TRI);
782 if (DstBank->getID() == AMDGPU::AGPRRegBankID)
783 return false;
784
785 assert(DstBank->getID() == AMDGPU::SGPRRegBankID ||
786 DstBank->getID() == AMDGPU::VGPRRegBankID);
787 const bool IsVector = DstBank->getID() == AMDGPU::VGPRRegBankID;
788
789 const DebugLoc &DL = MI.getDebugLoc();
790 MachineBasicBlock *BB = MI.getParent();
791
792 // First, before trying TableGen patterns, check if both sources are
793 // constants. In those cases, we can trivially compute the final constant
794 // and emit a simple move.
795 auto ConstSrc1 = getAnyConstantVRegValWithLookThrough(Src1, *MRI, true, true);
796 if (ConstSrc1) {
797 auto ConstSrc0 =
798 getAnyConstantVRegValWithLookThrough(Src0, *MRI, true, true);
799 if (ConstSrc0) {
800 const int64_t K0 = ConstSrc0->Value.getSExtValue();
801 const int64_t K1 = ConstSrc1->Value.getSExtValue();
802 uint32_t Lo16 = static_cast<uint32_t>(K0) & 0xffff;
803 uint32_t Hi16 = static_cast<uint32_t>(K1) & 0xffff;
804 uint32_t Imm = Lo16 | (Hi16 << 16);
805
806 // VALU
807 if (IsVector) {
808 BuildMI(*BB, &MI, DL, TII.get(AMDGPU::V_MOV_B32_e32), Dst).addImm(Imm);
809 MI.eraseFromParent();
810 return RBI.constrainGenericRegister(Dst, AMDGPU::VGPR_32RegClass, *MRI);
811 }
812
813 // SALU
814 BuildMI(*BB, &MI, DL, TII.get(AMDGPU::S_MOV_B32), Dst).addImm(Imm);
815 MI.eraseFromParent();
816 return RBI.constrainGenericRegister(Dst, AMDGPU::SReg_32RegClass, *MRI);
817 }
818 }
819
820 // Now try TableGen patterns.
821 if (selectImpl(MI, *CoverageInfo))
822 return true;
823
824 // TODO: This should probably be a combine somewhere
825 // (build_vector $src0, undef) -> copy $src0
826 MachineInstr *Src1Def = getDefIgnoringCopies(Src1, *MRI);
827 if (Src1Def->getOpcode() == AMDGPU::G_IMPLICIT_DEF) {
828 MI.setDesc(TII.get(AMDGPU::COPY));
829 MI.removeOperand(2);
830 const auto &RC =
831 IsVector ? AMDGPU::VGPR_32RegClass : AMDGPU::SReg_32RegClass;
832 return RBI.constrainGenericRegister(Dst, RC, *MRI) &&
833 RBI.constrainGenericRegister(Src0, RC, *MRI);
834 }
835
836 // TODO: Can be improved?
837 if (IsVector) {
838 Register TmpReg = MRI->createVirtualRegister(&AMDGPU::VGPR_32RegClass);
839 auto MIB = BuildMI(*BB, MI, DL, TII.get(AMDGPU::V_AND_B32_e32), TmpReg)
840 .addImm(0xFFFF)
841 .addReg(Src0);
842 constrainSelectedInstRegOperands(*MIB, TII, TRI, RBI);
843
844 MIB = BuildMI(*BB, MI, DL, TII.get(AMDGPU::V_LSHL_OR_B32_e64), Dst)
845 .addReg(Src1)
846 .addImm(16)
847 .addReg(TmpReg);
848 constrainSelectedInstRegOperands(*MIB, TII, TRI, RBI);
849
850 MI.eraseFromParent();
851 return true;
852 }
853
854 Register ShiftSrc0;
855 Register ShiftSrc1;
856
857 // With multiple uses of the shift, this will duplicate the shift and
858 // increase register pressure.
859 //
860  // (build_vector (lshr_oneuse $src0, 16), (lshr_oneuse $src1, 16))
861 // => (S_PACK_HH_B32_B16 $src0, $src1)
862 // (build_vector (lshr_oneuse SReg_32:$src0, 16), $src1)
863 // => (S_PACK_HL_B32_B16 $src0, $src1)
864 // (build_vector $src0, (lshr_oneuse SReg_32:$src1, 16))
865 // => (S_PACK_LH_B32_B16 $src0, $src1)
866 // (build_vector $src0, $src1)
867 // => (S_PACK_LL_B32_B16 $src0, $src1)
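  // Each S_PACK variant reads the low (L) or high (H) 16 bits of its two
  // 32-bit sources, so matching the 16-bit shifts here folds them into the pack.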
868
869 bool Shift0 = mi_match(
870 Src0, *MRI, m_OneUse(m_GLShr(m_Reg(ShiftSrc0), m_SpecificICst(16))));
871
872 bool Shift1 = mi_match(
873 Src1, *MRI, m_OneUse(m_GLShr(m_Reg(ShiftSrc1), m_SpecificICst(16))));
874
875 unsigned Opc = AMDGPU::S_PACK_LL_B32_B16;
876 if (Shift0 && Shift1) {
877 Opc = AMDGPU::S_PACK_HH_B32_B16;
878 MI.getOperand(1).setReg(ShiftSrc0);
879 MI.getOperand(2).setReg(ShiftSrc1);
880 } else if (Shift1) {
881 Opc = AMDGPU::S_PACK_LH_B32_B16;
882 MI.getOperand(2).setReg(ShiftSrc1);
883 } else if (Shift0) {
884 auto ConstSrc1 =
885 getAnyConstantVRegValWithLookThrough(Src1, *MRI, true, true);
886 if (ConstSrc1 && ConstSrc1->Value == 0) {
887 // build_vector_trunc (lshr $src0, 16), 0 -> s_lshr_b32 $src0, 16
888 auto MIB = BuildMI(*BB, &MI, DL, TII.get(AMDGPU::S_LSHR_B32), Dst)
889 .addReg(ShiftSrc0)
890 .addImm(16)
891 .setOperandDead(3); // Dead scc
892
893 MI.eraseFromParent();
894 constrainSelectedInstRegOperands(*MIB, TII, TRI, RBI);
895 return true;
896 }
897 if (STI.hasSPackHL()) {
898 Opc = AMDGPU::S_PACK_HL_B32_B16;
899 MI.getOperand(1).setReg(ShiftSrc0);
900 }
901 }
902
903 MI.setDesc(TII.get(Opc));
904 constrainSelectedInstRegOperands(MI, TII, TRI, RBI);
905 return true;
906}
907
908bool AMDGPUInstructionSelector::selectG_IMPLICIT_DEF(MachineInstr &I) const {
909 const MachineOperand &MO = I.getOperand(0);
910
911 // FIXME: Interface for getConstrainedRegClassForOperand needs work. The
912 // regbank check here is to know why getConstrainedRegClassForOperand failed.
913 const TargetRegisterClass *RC = TRI.getConstrainedRegClassForOperand(MO, *MRI);
914 if ((!RC && !MRI->getRegBankOrNull(MO.getReg())) ||
915 (RC && RBI.constrainGenericRegister(MO.getReg(), *RC, *MRI))) {
916 I.setDesc(TII.get(TargetOpcode::IMPLICIT_DEF));
917 return true;
918 }
919
920 return false;
921}
922
923bool AMDGPUInstructionSelector::selectG_INSERT(MachineInstr &I) const {
924 MachineBasicBlock *BB = I.getParent();
925
926 Register DstReg = I.getOperand(0).getReg();
927 Register Src0Reg = I.getOperand(1).getReg();
928 Register Src1Reg = I.getOperand(2).getReg();
929 LLT Src1Ty = MRI->getType(Src1Reg);
930
931 unsigned DstSize = MRI->getType(DstReg).getSizeInBits();
932 unsigned InsSize = Src1Ty.getSizeInBits();
933
934 int64_t Offset = I.getOperand(3).getImm();
935
936 // FIXME: These cases should have been illegal and unnecessary to check here.
937 if (Offset % 32 != 0 || InsSize % 32 != 0)
938 return false;
939
940 // Currently not handled by getSubRegFromChannel.
941 if (InsSize > 128)
942 return false;
943
944 unsigned SubReg = TRI.getSubRegFromChannel(Offset / 32, InsSize / 32);
945 if (SubReg == AMDGPU::NoSubRegister)
946 return false;
947
948 const RegisterBank *DstBank = RBI.getRegBank(DstReg, *MRI, TRI);
949 const TargetRegisterClass *DstRC =
950 TRI.getRegClassForSizeOnBank(DstSize, *DstBank);
951 if (!DstRC)
952 return false;
953
954 const RegisterBank *Src0Bank = RBI.getRegBank(Src0Reg, *MRI, TRI);
955 const RegisterBank *Src1Bank = RBI.getRegBank(Src1Reg, *MRI, TRI);
956 const TargetRegisterClass *Src0RC =
957 TRI.getRegClassForSizeOnBank(DstSize, *Src0Bank);
958 const TargetRegisterClass *Src1RC =
959 TRI.getRegClassForSizeOnBank(InsSize, *Src1Bank);
960
961 // Deal with weird cases where the class only partially supports the subreg
962 // index.
963 Src0RC = TRI.getSubClassWithSubReg(Src0RC, SubReg);
964 if (!Src0RC || !Src1RC)
965 return false;
966
967 if (!RBI.constrainGenericRegister(DstReg, *DstRC, *MRI) ||
968 !RBI.constrainGenericRegister(Src0Reg, *Src0RC, *MRI) ||
969 !RBI.constrainGenericRegister(Src1Reg, *Src1RC, *MRI))
970 return false;
971
972 const DebugLoc &DL = I.getDebugLoc();
973 BuildMI(*BB, &I, DL, TII.get(TargetOpcode::INSERT_SUBREG), DstReg)
974 .addReg(Src0Reg)
975 .addReg(Src1Reg)
976 .addImm(SubReg);
977
978 I.eraseFromParent();
979 return true;
980}
981
982bool AMDGPUInstructionSelector::selectG_SBFX_UBFX(MachineInstr &MI) const {
983 Register DstReg = MI.getOperand(0).getReg();
984 Register SrcReg = MI.getOperand(1).getReg();
985 Register OffsetReg = MI.getOperand(2).getReg();
986 Register WidthReg = MI.getOperand(3).getReg();
987
988 assert(RBI.getRegBank(DstReg, *MRI, TRI)->getID() == AMDGPU::VGPRRegBankID &&
989 "scalar BFX instructions are expanded in regbankselect");
990 assert(MRI->getType(MI.getOperand(0).getReg()).getSizeInBits() == 32 &&
991 "64-bit vector BFX instructions are expanded in regbankselect");
992
993 const DebugLoc &DL = MI.getDebugLoc();
994 MachineBasicBlock *MBB = MI.getParent();
995
996 bool IsSigned = MI.getOpcode() == TargetOpcode::G_SBFX;
997 unsigned Opc = IsSigned ? AMDGPU::V_BFE_I32_e64 : AMDGPU::V_BFE_U32_e64;
998 auto MIB = BuildMI(*MBB, &MI, DL, TII.get(Opc), DstReg)
999 .addReg(SrcReg)
1000 .addReg(OffsetReg)
1001 .addReg(WidthReg);
1002 MI.eraseFromParent();
1003 constrainSelectedInstRegOperands(*MIB, TII, TRI, RBI);
1004 return true;
1005}
1006
1007bool AMDGPUInstructionSelector::selectInterpP1F16(MachineInstr &MI) const {
1008 if (STI.getLDSBankCount() != 16)
1009 return selectImpl(MI, *CoverageInfo);
1010
1011 Register Dst = MI.getOperand(0).getReg();
1012 Register Src0 = MI.getOperand(2).getReg();
1013 Register M0Val = MI.getOperand(6).getReg();
1014 if (!RBI.constrainGenericRegister(M0Val, AMDGPU::SReg_32RegClass, *MRI) ||
1015 !RBI.constrainGenericRegister(Dst, AMDGPU::VGPR_32RegClass, *MRI) ||
1016 !RBI.constrainGenericRegister(Src0, AMDGPU::VGPR_32RegClass, *MRI))
1017 return false;
1018
1019 // This requires 2 instructions. It is possible to write a pattern to support
1020 // this, but the generated isel emitter doesn't correctly deal with multiple
1021 // output instructions using the same physical register input. The copy to m0
1022 // is incorrectly placed before the second instruction.
1023 //
1024 // TODO: Match source modifiers.
1025
1026 Register InterpMov = MRI->createVirtualRegister(&AMDGPU::VGPR_32RegClass);
1027 const DebugLoc &DL = MI.getDebugLoc();
1028 MachineBasicBlock *MBB = MI.getParent();
1029
1030 BuildMI(*MBB, &MI, DL, TII.get(AMDGPU::COPY), AMDGPU::M0)
1031 .addReg(M0Val);
1032 BuildMI(*MBB, &MI, DL, TII.get(AMDGPU::V_INTERP_MOV_F32), InterpMov)
1033 .addImm(2)
1034 .addImm(MI.getOperand(4).getImm()) // $attr
1035 .addImm(MI.getOperand(3).getImm()); // $attrchan
1036
1037 BuildMI(*MBB, &MI, DL, TII.get(AMDGPU::V_INTERP_P1LV_F16), Dst)
1038 .addImm(0) // $src0_modifiers
1039 .addReg(Src0) // $src0
1040 .addImm(MI.getOperand(4).getImm()) // $attr
1041 .addImm(MI.getOperand(3).getImm()) // $attrchan
1042 .addImm(0) // $src2_modifiers
1043 .addReg(InterpMov) // $src2 - 2 f16 values selected by high
1044 .addImm(MI.getOperand(5).getImm()) // $high
1045 .addImm(0) // $clamp
1046 .addImm(0); // $omod
1047
1048 MI.eraseFromParent();
1049 return true;
1050}
1051
1052// Writelane is special in that it can use SGPR and M0 (which would normally
1053// count as using the constant bus twice - but in this case it is allowed since
1054// the lane selector doesn't count as a use of the constant bus). However, it is
1055// still required to abide by the 1 SGPR rule. Fix this up if we might have
1056// multiple SGPRs.
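// Below, an inline-immediate lane select or value avoids the extra copy;
// otherwise the lane select is routed through M0 so that only the value
// operand reads an SGPR.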
1057bool AMDGPUInstructionSelector::selectWritelane(MachineInstr &MI) const {
1058 // With a constant bus limit of at least 2, there's no issue.
1059 if (STI.getConstantBusLimit(AMDGPU::V_WRITELANE_B32) > 1)
1060 return selectImpl(MI, *CoverageInfo);
1061
1062 MachineBasicBlock *MBB = MI.getParent();
1063 const DebugLoc &DL = MI.getDebugLoc();
1064 Register VDst = MI.getOperand(0).getReg();
1065 Register Val = MI.getOperand(2).getReg();
1066 Register LaneSelect = MI.getOperand(3).getReg();
1067 Register VDstIn = MI.getOperand(4).getReg();
1068
1069 auto MIB = BuildMI(*MBB, &MI, DL, TII.get(AMDGPU::V_WRITELANE_B32), VDst);
1070
1071 std::optional<ValueAndVReg> ConstSelect =
1072 getIConstantVRegValWithLookThrough(LaneSelect, *MRI);
1073 if (ConstSelect) {
1074 // The selector has to be an inline immediate, so we can use whatever for
1075 // the other operands.
1076 MIB.addReg(Val);
1077 MIB.addImm(ConstSelect->Value.getSExtValue() &
1078 maskTrailingOnes<uint64_t>(STI.getWavefrontSizeLog2()));
1079 } else {
1080    std::optional<ValueAndVReg> ConstVal =
1081        getIConstantVRegValWithLookThrough(Val, *MRI);
1082
1083 // If the value written is an inline immediate, we can get away without a
1084 // copy to m0.
1085 if (ConstVal && AMDGPU::isInlinableLiteral32(ConstVal->Value.getSExtValue(),
1086 STI.hasInv2PiInlineImm())) {
1087 MIB.addImm(ConstVal->Value.getSExtValue());
1088 MIB.addReg(LaneSelect);
1089 } else {
1090 MIB.addReg(Val);
1091
1092 // If the lane selector was originally in a VGPR and copied with
1093 // readfirstlane, there's a hazard to read the same SGPR from the
1094 // VALU. Constrain to a different SGPR to help avoid needing a nop later.
1095 RBI.constrainGenericRegister(LaneSelect, AMDGPU::SReg_32_XM0RegClass, *MRI);
1096
1097 BuildMI(*MBB, *MIB, DL, TII.get(AMDGPU::COPY), AMDGPU::M0)
1098 .addReg(LaneSelect);
1099 MIB.addReg(AMDGPU::M0);
1100 }
1101 }
1102
1103 MIB.addReg(VDstIn);
1104
1105 MI.eraseFromParent();
1106 constrainSelectedInstRegOperands(*MIB, TII, TRI, RBI);
1107 return true;
1108}
1109
1110// We need to handle this here because tablegen doesn't support matching
1111// instructions with multiple outputs.
1112bool AMDGPUInstructionSelector::selectDivScale(MachineInstr &MI) const {
1113 Register Dst0 = MI.getOperand(0).getReg();
1114 Register Dst1 = MI.getOperand(1).getReg();
1115
1116 LLT Ty = MRI->getType(Dst0);
1117 unsigned Opc;
1118 if (Ty == LLT::scalar(32))
1119 Opc = AMDGPU::V_DIV_SCALE_F32_e64;
1120 else if (Ty == LLT::scalar(64))
1121 Opc = AMDGPU::V_DIV_SCALE_F64_e64;
1122 else
1123 return false;
1124
1125 // TODO: Match source modifiers.
1126
1127 const DebugLoc &DL = MI.getDebugLoc();
1128 MachineBasicBlock *MBB = MI.getParent();
1129
1130 Register Numer = MI.getOperand(3).getReg();
1131 Register Denom = MI.getOperand(4).getReg();
1132 unsigned ChooseDenom = MI.getOperand(5).getImm();
1133
1134 Register Src0 = ChooseDenom != 0 ? Numer : Denom;
1135
1136 auto MIB = BuildMI(*MBB, &MI, DL, TII.get(Opc), Dst0)
1137 .addDef(Dst1)
1138 .addImm(0) // $src0_modifiers
1139 .addUse(Src0) // $src0
1140 .addImm(0) // $src1_modifiers
1141 .addUse(Denom) // $src1
1142 .addImm(0) // $src2_modifiers
1143 .addUse(Numer) // $src2
1144 .addImm(0) // $clamp
1145 .addImm(0); // $omod
1146
1147 MI.eraseFromParent();
1148 constrainSelectedInstRegOperands(*MIB, TII, TRI, RBI);
1149 return true;
1150}
1151
1152bool AMDGPUInstructionSelector::selectG_INTRINSIC(MachineInstr &I) const {
1153 Intrinsic::ID IntrinsicID = cast<GIntrinsic>(I).getIntrinsicID();
1154 switch (IntrinsicID) {
1155 case Intrinsic::amdgcn_if_break: {
1156 MachineBasicBlock *BB = I.getParent();
1157
1158 // FIXME: Manually selecting to avoid dealing with the SReg_1 trick
1159 // SelectionDAG uses for wave32 vs wave64.
1160 BuildMI(*BB, &I, I.getDebugLoc(), TII.get(AMDGPU::SI_IF_BREAK))
1161 .add(I.getOperand(0))
1162 .add(I.getOperand(2))
1163 .add(I.getOperand(3));
1164
1165 Register DstReg = I.getOperand(0).getReg();
1166 Register Src0Reg = I.getOperand(2).getReg();
1167 Register Src1Reg = I.getOperand(3).getReg();
1168
1169 I.eraseFromParent();
1170
1171 for (Register Reg : { DstReg, Src0Reg, Src1Reg })
1172 MRI->setRegClass(Reg, TRI.getWaveMaskRegClass());
1173
1174 return true;
1175 }
1176 case Intrinsic::amdgcn_interp_p1_f16:
1177 return selectInterpP1F16(I);
1178 case Intrinsic::amdgcn_wqm:
1179 return constrainCopyLikeIntrin(I, AMDGPU::WQM);
1180 case Intrinsic::amdgcn_softwqm:
1181 return constrainCopyLikeIntrin(I, AMDGPU::SOFT_WQM);
1182 case Intrinsic::amdgcn_strict_wwm:
1183 case Intrinsic::amdgcn_wwm:
1184 return constrainCopyLikeIntrin(I, AMDGPU::STRICT_WWM);
1185 case Intrinsic::amdgcn_strict_wqm:
1186 return constrainCopyLikeIntrin(I, AMDGPU::STRICT_WQM);
1187 case Intrinsic::amdgcn_writelane:
1188 return selectWritelane(I);
1189 case Intrinsic::amdgcn_div_scale:
1190 return selectDivScale(I);
1191 case Intrinsic::amdgcn_icmp:
1192 case Intrinsic::amdgcn_fcmp:
1193 if (selectImpl(I, *CoverageInfo))
1194 return true;
1195 return selectIntrinsicCmp(I);
1196 case Intrinsic::amdgcn_ballot:
1197 return selectBallot(I);
1198 case Intrinsic::amdgcn_reloc_constant:
1199 return selectRelocConstant(I);
1200 case Intrinsic::amdgcn_groupstaticsize:
1201 return selectGroupStaticSize(I);
1202 case Intrinsic::returnaddress:
1203 return selectReturnAddress(I);
1204 case Intrinsic::amdgcn_smfmac_f32_16x16x32_f16:
1205 case Intrinsic::amdgcn_smfmac_f32_32x32x16_f16:
1206 case Intrinsic::amdgcn_smfmac_f32_16x16x32_bf16:
1207 case Intrinsic::amdgcn_smfmac_f32_32x32x16_bf16:
1208 case Intrinsic::amdgcn_smfmac_i32_16x16x64_i8:
1209 case Intrinsic::amdgcn_smfmac_i32_32x32x32_i8:
1210 case Intrinsic::amdgcn_smfmac_f32_16x16x64_bf8_bf8:
1211 case Intrinsic::amdgcn_smfmac_f32_16x16x64_bf8_fp8:
1212 case Intrinsic::amdgcn_smfmac_f32_16x16x64_fp8_bf8:
1213 case Intrinsic::amdgcn_smfmac_f32_16x16x64_fp8_fp8:
1214 case Intrinsic::amdgcn_smfmac_f32_32x32x32_bf8_bf8:
1215 case Intrinsic::amdgcn_smfmac_f32_32x32x32_bf8_fp8:
1216 case Intrinsic::amdgcn_smfmac_f32_32x32x32_fp8_bf8:
1217 case Intrinsic::amdgcn_smfmac_f32_32x32x32_fp8_fp8:
1218 case Intrinsic::amdgcn_smfmac_f32_16x16x64_f16:
1219 case Intrinsic::amdgcn_smfmac_f32_32x32x32_f16:
1220 case Intrinsic::amdgcn_smfmac_f32_16x16x64_bf16:
1221 case Intrinsic::amdgcn_smfmac_f32_32x32x32_bf16:
1222 case Intrinsic::amdgcn_smfmac_i32_16x16x128_i8:
1223 case Intrinsic::amdgcn_smfmac_i32_32x32x64_i8:
1224 case Intrinsic::amdgcn_smfmac_f32_16x16x128_bf8_bf8:
1225 case Intrinsic::amdgcn_smfmac_f32_16x16x128_bf8_fp8:
1226 case Intrinsic::amdgcn_smfmac_f32_16x16x128_fp8_bf8:
1227 case Intrinsic::amdgcn_smfmac_f32_16x16x128_fp8_fp8:
1228 case Intrinsic::amdgcn_smfmac_f32_32x32x64_bf8_bf8:
1229 case Intrinsic::amdgcn_smfmac_f32_32x32x64_bf8_fp8:
1230 case Intrinsic::amdgcn_smfmac_f32_32x32x64_fp8_bf8:
1231 case Intrinsic::amdgcn_smfmac_f32_32x32x64_fp8_fp8:
1232 return selectSMFMACIntrin(I);
1233 case Intrinsic::amdgcn_permlane16_swap:
1234 case Intrinsic::amdgcn_permlane32_swap:
1235 return selectPermlaneSwapIntrin(I, IntrinsicID);
1236 case Intrinsic::amdgcn_wave_shuffle:
1237 return selectWaveShuffleIntrin(I);
1238 default:
1239 return selectImpl(I, *CoverageInfo);
1240 }
1241}
1242
1243static int getV_CMPOpcode(CmpInst::Predicate P, unsigned Size,
1244                          const GCNSubtarget &ST) {
1245 if (Size != 16 && Size != 32 && Size != 64)
1246 return -1;
1247
1248 if (Size == 16 && !ST.has16BitInsts())
1249 return -1;
1250
1251 const auto Select = [&](unsigned S16Opc, unsigned TrueS16Opc,
1252 unsigned FakeS16Opc, unsigned S32Opc,
1253 unsigned S64Opc) {
1254 if (Size == 16)
1255 return ST.hasTrue16BitInsts()
1256 ? ST.useRealTrue16Insts() ? TrueS16Opc : FakeS16Opc
1257 : S16Opc;
1258 if (Size == 32)
1259 return S32Opc;
1260 return S64Opc;
1261 };
1262
1263 switch (P) {
1264 default:
1265 llvm_unreachable("Unknown condition code!");
1266 case CmpInst::ICMP_NE:
1267 return Select(AMDGPU::V_CMP_NE_U16_e64, AMDGPU::V_CMP_NE_U16_t16_e64,
1268 AMDGPU::V_CMP_NE_U16_fake16_e64, AMDGPU::V_CMP_NE_U32_e64,
1269 AMDGPU::V_CMP_NE_U64_e64);
1270 case CmpInst::ICMP_EQ:
1271 return Select(AMDGPU::V_CMP_EQ_U16_e64, AMDGPU::V_CMP_EQ_U16_t16_e64,
1272 AMDGPU::V_CMP_EQ_U16_fake16_e64, AMDGPU::V_CMP_EQ_U32_e64,
1273 AMDGPU::V_CMP_EQ_U64_e64);
1274 case CmpInst::ICMP_SGT:
1275 return Select(AMDGPU::V_CMP_GT_I16_e64, AMDGPU::V_CMP_GT_I16_t16_e64,
1276 AMDGPU::V_CMP_GT_I16_fake16_e64, AMDGPU::V_CMP_GT_I32_e64,
1277 AMDGPU::V_CMP_GT_I64_e64);
1278 case CmpInst::ICMP_SGE:
1279 return Select(AMDGPU::V_CMP_GE_I16_e64, AMDGPU::V_CMP_GE_I16_t16_e64,
1280 AMDGPU::V_CMP_GE_I16_fake16_e64, AMDGPU::V_CMP_GE_I32_e64,
1281 AMDGPU::V_CMP_GE_I64_e64);
1282 case CmpInst::ICMP_SLT:
1283 return Select(AMDGPU::V_CMP_LT_I16_e64, AMDGPU::V_CMP_LT_I16_t16_e64,
1284 AMDGPU::V_CMP_LT_I16_fake16_e64, AMDGPU::V_CMP_LT_I32_e64,
1285 AMDGPU::V_CMP_LT_I64_e64);
1286 case CmpInst::ICMP_SLE:
1287 return Select(AMDGPU::V_CMP_LE_I16_e64, AMDGPU::V_CMP_LE_I16_t16_e64,
1288 AMDGPU::V_CMP_LE_I16_fake16_e64, AMDGPU::V_CMP_LE_I32_e64,
1289 AMDGPU::V_CMP_LE_I64_e64);
1290 case CmpInst::ICMP_UGT:
1291 return Select(AMDGPU::V_CMP_GT_U16_e64, AMDGPU::V_CMP_GT_U16_t16_e64,
1292 AMDGPU::V_CMP_GT_U16_fake16_e64, AMDGPU::V_CMP_GT_U32_e64,
1293 AMDGPU::V_CMP_GT_U64_e64);
1294 case CmpInst::ICMP_UGE:
1295 return Select(AMDGPU::V_CMP_GE_U16_e64, AMDGPU::V_CMP_GE_U16_t16_e64,
1296 AMDGPU::V_CMP_GE_U16_fake16_e64, AMDGPU::V_CMP_GE_U32_e64,
1297 AMDGPU::V_CMP_GE_U64_e64);
1298 case CmpInst::ICMP_ULT:
1299 return Select(AMDGPU::V_CMP_LT_U16_e64, AMDGPU::V_CMP_LT_U16_t16_e64,
1300 AMDGPU::V_CMP_LT_U16_fake16_e64, AMDGPU::V_CMP_LT_U32_e64,
1301 AMDGPU::V_CMP_LT_U64_e64);
1302 case CmpInst::ICMP_ULE:
1303 return Select(AMDGPU::V_CMP_LE_U16_e64, AMDGPU::V_CMP_LE_U16_t16_e64,
1304 AMDGPU::V_CMP_LE_U16_fake16_e64, AMDGPU::V_CMP_LE_U32_e64,
1305 AMDGPU::V_CMP_LE_U64_e64);
1306
1307 case CmpInst::FCMP_OEQ:
1308 return Select(AMDGPU::V_CMP_EQ_F16_e64, AMDGPU::V_CMP_EQ_F16_t16_e64,
1309 AMDGPU::V_CMP_EQ_F16_fake16_e64, AMDGPU::V_CMP_EQ_F32_e64,
1310 AMDGPU::V_CMP_EQ_F64_e64);
1311 case CmpInst::FCMP_OGT:
1312 return Select(AMDGPU::V_CMP_GT_F16_e64, AMDGPU::V_CMP_GT_F16_t16_e64,
1313 AMDGPU::V_CMP_GT_F16_fake16_e64, AMDGPU::V_CMP_GT_F32_e64,
1314 AMDGPU::V_CMP_GT_F64_e64);
1315 case CmpInst::FCMP_OGE:
1316 return Select(AMDGPU::V_CMP_GE_F16_e64, AMDGPU::V_CMP_GE_F16_t16_e64,
1317 AMDGPU::V_CMP_GE_F16_fake16_e64, AMDGPU::V_CMP_GE_F32_e64,
1318 AMDGPU::V_CMP_GE_F64_e64);
1319 case CmpInst::FCMP_OLT:
1320 return Select(AMDGPU::V_CMP_LT_F16_e64, AMDGPU::V_CMP_LT_F16_t16_e64,
1321 AMDGPU::V_CMP_LT_F16_fake16_e64, AMDGPU::V_CMP_LT_F32_e64,
1322 AMDGPU::V_CMP_LT_F64_e64);
1323 case CmpInst::FCMP_OLE:
1324 return Select(AMDGPU::V_CMP_LE_F16_e64, AMDGPU::V_CMP_LE_F16_t16_e64,
1325 AMDGPU::V_CMP_LE_F16_fake16_e64, AMDGPU::V_CMP_LE_F32_e64,
1326 AMDGPU::V_CMP_LE_F64_e64);
1327 case CmpInst::FCMP_ONE:
1328 return Select(AMDGPU::V_CMP_NEQ_F16_e64, AMDGPU::V_CMP_NEQ_F16_t16_e64,
1329 AMDGPU::V_CMP_NEQ_F16_fake16_e64, AMDGPU::V_CMP_NEQ_F32_e64,
1330 AMDGPU::V_CMP_NEQ_F64_e64);
1331 case CmpInst::FCMP_ORD:
1332 return Select(AMDGPU::V_CMP_O_F16_e64, AMDGPU::V_CMP_O_F16_t16_e64,
1333 AMDGPU::V_CMP_O_F16_fake16_e64, AMDGPU::V_CMP_O_F32_e64,
1334 AMDGPU::V_CMP_O_F64_e64);
1335 case CmpInst::FCMP_UNO:
1336 return Select(AMDGPU::V_CMP_U_F16_e64, AMDGPU::V_CMP_U_F16_t16_e64,
1337 AMDGPU::V_CMP_U_F16_fake16_e64, AMDGPU::V_CMP_U_F32_e64,
1338 AMDGPU::V_CMP_U_F64_e64);
1339 case CmpInst::FCMP_UEQ:
1340 return Select(AMDGPU::V_CMP_NLG_F16_e64, AMDGPU::V_CMP_NLG_F16_t16_e64,
1341 AMDGPU::V_CMP_NLG_F16_fake16_e64, AMDGPU::V_CMP_NLG_F32_e64,
1342 AMDGPU::V_CMP_NLG_F64_e64);
1343 case CmpInst::FCMP_UGT:
1344 return Select(AMDGPU::V_CMP_NLE_F16_e64, AMDGPU::V_CMP_NLE_F16_t16_e64,
1345 AMDGPU::V_CMP_NLE_F16_fake16_e64, AMDGPU::V_CMP_NLE_F32_e64,
1346 AMDGPU::V_CMP_NLE_F64_e64);
1347 case CmpInst::FCMP_UGE:
1348 return Select(AMDGPU::V_CMP_NLT_F16_e64, AMDGPU::V_CMP_NLT_F16_t16_e64,
1349 AMDGPU::V_CMP_NLT_F16_fake16_e64, AMDGPU::V_CMP_NLT_F32_e64,
1350 AMDGPU::V_CMP_NLT_F64_e64);
1351 case CmpInst::FCMP_ULT:
1352 return Select(AMDGPU::V_CMP_NGE_F16_e64, AMDGPU::V_CMP_NGE_F16_t16_e64,
1353 AMDGPU::V_CMP_NGE_F16_fake16_e64, AMDGPU::V_CMP_NGE_F32_e64,
1354 AMDGPU::V_CMP_NGE_F64_e64);
1355 case CmpInst::FCMP_ULE:
1356 return Select(AMDGPU::V_CMP_NGT_F16_e64, AMDGPU::V_CMP_NGT_F16_t16_e64,
1357 AMDGPU::V_CMP_NGT_F16_fake16_e64, AMDGPU::V_CMP_NGT_F32_e64,
1358 AMDGPU::V_CMP_NGT_F64_e64);
1359 case CmpInst::FCMP_UNE:
1360 return Select(AMDGPU::V_CMP_NEQ_F16_e64, AMDGPU::V_CMP_NEQ_F16_t16_e64,
1361 AMDGPU::V_CMP_NEQ_F16_fake16_e64, AMDGPU::V_CMP_NEQ_F32_e64,
1362 AMDGPU::V_CMP_NEQ_F64_e64);
1363 case CmpInst::FCMP_TRUE:
1364 return Select(AMDGPU::V_CMP_TRU_F16_e64, AMDGPU::V_CMP_TRU_F16_t16_e64,
1365 AMDGPU::V_CMP_TRU_F16_fake16_e64, AMDGPU::V_CMP_TRU_F32_e64,
1366                  AMDGPU::V_CMP_TRU_F64_e64);
1367  case CmpInst::FCMP_FALSE:
1368    return Select(AMDGPU::V_CMP_F_F16_e64, AMDGPU::V_CMP_F_F16_t16_e64,
1369 AMDGPU::V_CMP_F_F16_fake16_e64, AMDGPU::V_CMP_F_F32_e64,
1370 AMDGPU::V_CMP_F_F64_e64);
1371 }
1372}
1373
1374int AMDGPUInstructionSelector::getS_CMPOpcode(CmpInst::Predicate P,
1375 unsigned Size) const {
1376 if (Size == 64) {
1377 if (!STI.hasScalarCompareEq64())
1378 return -1;
1379
1380 switch (P) {
1381 case CmpInst::ICMP_NE:
1382 return AMDGPU::S_CMP_LG_U64;
1383 case CmpInst::ICMP_EQ:
1384 return AMDGPU::S_CMP_EQ_U64;
1385 default:
1386 return -1;
1387 }
1388 }
1389
1390 if (Size == 32) {
1391 switch (P) {
1392 case CmpInst::ICMP_NE:
1393 return AMDGPU::S_CMP_LG_U32;
1394 case CmpInst::ICMP_EQ:
1395 return AMDGPU::S_CMP_EQ_U32;
1396 case CmpInst::ICMP_SGT:
1397 return AMDGPU::S_CMP_GT_I32;
1398 case CmpInst::ICMP_SGE:
1399 return AMDGPU::S_CMP_GE_I32;
1400 case CmpInst::ICMP_SLT:
1401 return AMDGPU::S_CMP_LT_I32;
1402 case CmpInst::ICMP_SLE:
1403 return AMDGPU::S_CMP_LE_I32;
1404 case CmpInst::ICMP_UGT:
1405 return AMDGPU::S_CMP_GT_U32;
1406 case CmpInst::ICMP_UGE:
1407 return AMDGPU::S_CMP_GE_U32;
1408 case CmpInst::ICMP_ULT:
1409 return AMDGPU::S_CMP_LT_U32;
1410 case CmpInst::ICMP_ULE:
1411 return AMDGPU::S_CMP_LE_U32;
1412 case CmpInst::FCMP_OEQ:
1413 return AMDGPU::S_CMP_EQ_F32;
1414 case CmpInst::FCMP_OGT:
1415 return AMDGPU::S_CMP_GT_F32;
1416 case CmpInst::FCMP_OGE:
1417 return AMDGPU::S_CMP_GE_F32;
1418 case CmpInst::FCMP_OLT:
1419 return AMDGPU::S_CMP_LT_F32;
1420 case CmpInst::FCMP_OLE:
1421 return AMDGPU::S_CMP_LE_F32;
1422 case CmpInst::FCMP_ONE:
1423 return AMDGPU::S_CMP_LG_F32;
1424 case CmpInst::FCMP_ORD:
1425 return AMDGPU::S_CMP_O_F32;
1426 case CmpInst::FCMP_UNO:
1427 return AMDGPU::S_CMP_U_F32;
1428 case CmpInst::FCMP_UEQ:
1429 return AMDGPU::S_CMP_NLG_F32;
1430 case CmpInst::FCMP_UGT:
1431 return AMDGPU::S_CMP_NLE_F32;
1432 case CmpInst::FCMP_UGE:
1433 return AMDGPU::S_CMP_NLT_F32;
1434 case CmpInst::FCMP_ULT:
1435 return AMDGPU::S_CMP_NGE_F32;
1436 case CmpInst::FCMP_ULE:
1437 return AMDGPU::S_CMP_NGT_F32;
1438 case CmpInst::FCMP_UNE:
1439 return AMDGPU::S_CMP_NEQ_F32;
1440 default:
1441 llvm_unreachable("Unknown condition code!");
1442 }
1443 }
1444
1445 if (Size == 16) {
1446 if (!STI.hasSALUFloatInsts())
1447 return -1;
1448
1449 switch (P) {
1450 case CmpInst::FCMP_OEQ:
1451 return AMDGPU::S_CMP_EQ_F16;
1452 case CmpInst::FCMP_OGT:
1453 return AMDGPU::S_CMP_GT_F16;
1454 case CmpInst::FCMP_OGE:
1455 return AMDGPU::S_CMP_GE_F16;
1456 case CmpInst::FCMP_OLT:
1457 return AMDGPU::S_CMP_LT_F16;
1458 case CmpInst::FCMP_OLE:
1459 return AMDGPU::S_CMP_LE_F16;
1460 case CmpInst::FCMP_ONE:
1461 return AMDGPU::S_CMP_LG_F16;
1462 case CmpInst::FCMP_ORD:
1463 return AMDGPU::S_CMP_O_F16;
1464 case CmpInst::FCMP_UNO:
1465 return AMDGPU::S_CMP_U_F16;
1466 case CmpInst::FCMP_UEQ:
1467 return AMDGPU::S_CMP_NLG_F16;
1468 case CmpInst::FCMP_UGT:
1469 return AMDGPU::S_CMP_NLE_F16;
1470 case CmpInst::FCMP_UGE:
1471 return AMDGPU::S_CMP_NLT_F16;
1472 case CmpInst::FCMP_ULT:
1473 return AMDGPU::S_CMP_NGE_F16;
1474 case CmpInst::FCMP_ULE:
1475 return AMDGPU::S_CMP_NGT_F16;
1476 case CmpInst::FCMP_UNE:
1477 return AMDGPU::S_CMP_NEQ_F16;
1478 default:
1479 llvm_unreachable("Unknown condition code!");
1480 }
1481 }
1482
1483 return -1;
1484}
1485
1486bool AMDGPUInstructionSelector::selectG_ICMP_or_FCMP(MachineInstr &I) const {
1487
1488 MachineBasicBlock *BB = I.getParent();
1489 const DebugLoc &DL = I.getDebugLoc();
1490
1491 Register SrcReg = I.getOperand(2).getReg();
1492 unsigned Size = RBI.getSizeInBits(SrcReg, *MRI, TRI);
1493
1494 auto Pred = (CmpInst::Predicate)I.getOperand(1).getPredicate();
1495
1496 Register CCReg = I.getOperand(0).getReg();
1497 if (!isVCC(CCReg, *MRI)) {
1498 int Opcode = getS_CMPOpcode(Pred, Size);
1499 if (Opcode == -1)
1500 return false;
1501 MachineInstr *ICmp = BuildMI(*BB, &I, DL, TII.get(Opcode))
1502 .add(I.getOperand(2))
1503 .add(I.getOperand(3));
1504 BuildMI(*BB, &I, DL, TII.get(AMDGPU::COPY), CCReg)
1505 .addReg(AMDGPU::SCC);
1506 constrainSelectedInstRegOperands(*ICmp, TII, TRI, RBI);
1507 bool Ret =
1508 RBI.constrainGenericRegister(CCReg, AMDGPU::SReg_32RegClass, *MRI);
1509 I.eraseFromParent();
1510 return Ret;
1511 }
1512
1513 if (I.getOpcode() == AMDGPU::G_FCMP)
1514 return false;
1515
1516 int Opcode = getV_CMPOpcode(Pred, Size, *Subtarget);
1517 if (Opcode == -1)
1518 return false;
1519
1520 MachineInstrBuilder ICmp;
1521 // t16 instructions
1522 if (AMDGPU::hasNamedOperand(Opcode, AMDGPU::OpName::src0_modifiers)) {
1523 ICmp = BuildMI(*BB, &I, DL, TII.get(Opcode), I.getOperand(0).getReg())
1524 .addImm(0)
1525 .add(I.getOperand(2))
1526 .addImm(0)
1527 .add(I.getOperand(3))
1528 .addImm(0); // op_sel
1529 } else {
1530 ICmp = BuildMI(*BB, &I, DL, TII.get(Opcode), I.getOperand(0).getReg())
1531 .add(I.getOperand(2))
1532 .add(I.getOperand(3));
1533 }
1534
1535 RBI.constrainGenericRegister(ICmp->getOperand(0).getReg(),
1536 *TRI.getBoolRC(), *MRI);
1537 constrainSelectedInstRegOperands(*ICmp, TII, TRI, RBI);
1538 I.eraseFromParent();
1539 return true;
1540}
1541
1542bool AMDGPUInstructionSelector::selectIntrinsicCmp(MachineInstr &I) const {
1543 Register Dst = I.getOperand(0).getReg();
1544 if (isVCC(Dst, *MRI))
1545 return false;
1546
1547 LLT DstTy = MRI->getType(Dst);
1548 if (DstTy.getSizeInBits() != STI.getWavefrontSize())
1549 return false;
1550
1551 MachineBasicBlock *BB = I.getParent();
1552 const DebugLoc &DL = I.getDebugLoc();
1553 Register SrcReg = I.getOperand(2).getReg();
1554 unsigned Size = RBI.getSizeInBits(SrcReg, *MRI, TRI);
1555
1556 // i1 inputs are not supported in GlobalISel.
1557 if (Size == 1)
1558 return false;
1559
1560 auto Pred = static_cast<CmpInst::Predicate>(I.getOperand(4).getImm());
1561 if (!CmpInst::isIntPredicate(Pred) && !CmpInst::isFPPredicate(Pred)) {
1562 BuildMI(*BB, &I, DL, TII.get(AMDGPU::IMPLICIT_DEF), Dst);
1563 I.eraseFromParent();
1564 return RBI.constrainGenericRegister(Dst, *TRI.getBoolRC(), *MRI);
1565 }
1566
1567 const int Opcode = getV_CMPOpcode(Pred, Size, *Subtarget);
1568 if (Opcode == -1)
1569 return false;
1570
1571 MachineInstrBuilder SelectedMI;
1572 MachineOperand &LHS = I.getOperand(2);
1573 MachineOperand &RHS = I.getOperand(3);
1574 auto [Src0, Src0Mods] = selectVOP3ModsImpl(LHS.getReg());
1575 auto [Src1, Src1Mods] = selectVOP3ModsImpl(RHS.getReg());
1576 Register Src0Reg =
1577 copyToVGPRIfSrcFolded(Src0, Src0Mods, LHS, &I, /*ForceVGPR*/ true);
1578 Register Src1Reg =
1579 copyToVGPRIfSrcFolded(Src1, Src1Mods, RHS, &I, /*ForceVGPR*/ true);
1580 SelectedMI = BuildMI(*BB, &I, DL, TII.get(Opcode), Dst);
1581 if (AMDGPU::hasNamedOperand(Opcode, AMDGPU::OpName::src0_modifiers))
1582 SelectedMI.addImm(Src0Mods);
1583 SelectedMI.addReg(Src0Reg);
1584 if (AMDGPU::hasNamedOperand(Opcode, AMDGPU::OpName::src1_modifiers))
1585 SelectedMI.addImm(Src1Mods);
1586 SelectedMI.addReg(Src1Reg);
1587 if (AMDGPU::hasNamedOperand(Opcode, AMDGPU::OpName::clamp))
1588 SelectedMI.addImm(0); // clamp
1589 if (AMDGPU::hasNamedOperand(Opcode, AMDGPU::OpName::op_sel))
1590 SelectedMI.addImm(0); // op_sel
1591
1592 RBI.constrainGenericRegister(Dst, *TRI.getBoolRC(), *MRI);
1593 constrainSelectedInstRegOperands(*SelectedMI, TII, TRI, RBI);
1594
1595 I.eraseFromParent();
1596 return true;
1597}
1598
1599// Ballot has to zero the bits of the input lane mask that are zero in the
1600// current exec; this is done as an AND with exec. For inputs produced by an
1601// instruction that already implicitly uses the same exec (for example a
1602// compare in the same block, or an SCC-to-VCC copy), a plain copy is enough.
1603static bool isLaneMaskFromSameBlock(Register Reg, MachineRegisterInfo &MRI,
1604                                    MachineBasicBlock *MBB) {
1605  MachineInstr *MI = MRI.getVRegDef(Reg);
1606 if (MI->getParent() != MBB)
1607 return false;
1608
1609 // Lane mask generated by SCC to VCC copy.
1610 if (MI->getOpcode() == AMDGPU::COPY) {
1611 auto DstRB = MRI.getRegBankOrNull(MI->getOperand(0).getReg());
1612 auto SrcRB = MRI.getRegBankOrNull(MI->getOperand(1).getReg());
1613 if (DstRB && SrcRB && DstRB->getID() == AMDGPU::VCCRegBankID &&
1614 SrcRB->getID() == AMDGPU::SGPRRegBankID)
1615 return true;
1616 }
1617
1618 // Lane mask generated using compare with same exec.
1619 if (isa<GAnyCmp>(MI))
1620 return true;
1621
1622 Register LHS, RHS;
1623 // Look through AND.
1624 if (mi_match(Reg, MRI, m_GAnd(m_Reg(LHS), m_Reg(RHS))))
1625 return isLaneMaskFromSameBlock(LHS, MRI, MBB) ||
1626           isLaneMaskFromSameBlock(RHS, MRI, MBB);
1627
1628 return false;
1629}
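// To make the rule above concrete, one illustration: a ballot whose source is
// a G_ICMP selected in the same basic block already has zeros in all inactive
// lanes, because the V_CMP executed under the same exec mask; selectBallot
// below can then emit a plain COPY rather than an S_AND with exec.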
1630
1631bool AMDGPUInstructionSelector::selectBallot(MachineInstr &I) const {
1632 MachineBasicBlock *BB = I.getParent();
1633 const DebugLoc &DL = I.getDebugLoc();
1634 Register DstReg = I.getOperand(0).getReg();
1635 Register SrcReg = I.getOperand(2).getReg();
1636 const unsigned BallotSize = MRI->getType(DstReg).getSizeInBits();
1637 const unsigned WaveSize = STI.getWavefrontSize();
1638
1639 // In the common case, the return type matches the wave size.
1640 // However we also support emitting i64 ballots in wave32 mode.
1641 if (BallotSize != WaveSize && (BallotSize != 64 || WaveSize != 32))
1642 return false;
1643
1644 std::optional<ValueAndVReg> Arg =
1645      getIConstantVRegValWithLookThrough(I.getOperand(2).getReg(), *MRI);
1646
1647 Register Dst = DstReg;
1648  // i64 ballot on wave32: compute the wave-size (i32) ballot into a new Dst first.
1649 if (BallotSize != WaveSize) {
1650 Dst = MRI->createVirtualRegister(TRI.getBoolRC());
1651 }
1652
1653 if (Arg) {
1654 const int64_t Value = Arg->Value.getZExtValue();
1655 if (Value == 0) {
1656 // Dst = S_MOV 0
1657 unsigned Opcode = WaveSize == 64 ? AMDGPU::S_MOV_B64 : AMDGPU::S_MOV_B32;
1658 BuildMI(*BB, &I, DL, TII.get(Opcode), Dst).addImm(0);
1659 } else {
1660 // Dst = COPY EXEC
1661 assert(Value == 1);
1662 BuildMI(*BB, &I, DL, TII.get(AMDGPU::COPY), Dst).addReg(TRI.getExec());
1663 }
1664 if (!RBI.constrainGenericRegister(Dst, *TRI.getBoolRC(), *MRI))
1665 return false;
1666 } else {
1667 if (isLaneMaskFromSameBlock(SrcReg, *MRI, BB)) {
1668 // Dst = COPY SrcReg
1669 BuildMI(*BB, &I, DL, TII.get(AMDGPU::COPY), Dst).addReg(SrcReg);
1670 if (!RBI.constrainGenericRegister(Dst, *TRI.getBoolRC(), *MRI))
1671 return false;
1672 } else {
1673 // Dst = S_AND SrcReg, EXEC
1674 unsigned AndOpc = WaveSize == 64 ? AMDGPU::S_AND_B64 : AMDGPU::S_AND_B32;
1675 auto And = BuildMI(*BB, &I, DL, TII.get(AndOpc), Dst)
1676 .addReg(SrcReg)
1677 .addReg(TRI.getExec())
1678 .setOperandDead(3); // Dead scc
1679 constrainSelectedInstRegOperands(*And, TII, TRI, RBI);
1680 }
1681 }
1682
1683 // i64 ballot on Wave32: zero-extend i32 ballot to i64.
1684 if (BallotSize != WaveSize) {
1685 Register HiReg = MRI->createVirtualRegister(&AMDGPU::SReg_32RegClass);
1686 BuildMI(*BB, &I, DL, TII.get(AMDGPU::S_MOV_B32), HiReg).addImm(0);
1687 BuildMI(*BB, &I, DL, TII.get(AMDGPU::REG_SEQUENCE), DstReg)
1688 .addReg(Dst)
1689 .addImm(AMDGPU::sub0)
1690 .addReg(HiReg)
1691 .addImm(AMDGPU::sub1);
1692 }
1693
1694 I.eraseFromParent();
1695 return true;
1696}
1697
1698bool AMDGPUInstructionSelector::selectRelocConstant(MachineInstr &I) const {
1699 Register DstReg = I.getOperand(0).getReg();
1700 const RegisterBank *DstBank = RBI.getRegBank(DstReg, *MRI, TRI);
1701 const TargetRegisterClass *DstRC = TRI.getRegClassForSizeOnBank(32, *DstBank);
1702 if (!DstRC || !RBI.constrainGenericRegister(DstReg, *DstRC, *MRI))
1703 return false;
1704
1705 const bool IsVALU = DstBank->getID() == AMDGPU::VGPRRegBankID;
1706
1707 Module *M = MF->getFunction().getParent();
1708 const MDNode *Metadata = I.getOperand(2).getMetadata();
1709 auto SymbolName = cast<MDString>(Metadata->getOperand(0))->getString();
1710 auto *RelocSymbol = cast<GlobalVariable>(
1711 M->getOrInsertGlobal(SymbolName, Type::getInt32Ty(M->getContext())));
1712
1713 MachineBasicBlock *BB = I.getParent();
1714 BuildMI(*BB, &I, I.getDebugLoc(),
1715 TII.get(IsVALU ? AMDGPU::V_MOV_B32_e32 : AMDGPU::S_MOV_B32), DstReg)
1716      .addGlobalAddress(RelocSymbol, 0, SIInstrInfo::MO_ABS32_LO);
1717
1718 I.eraseFromParent();
1719 return true;
1720}
1721
1722bool AMDGPUInstructionSelector::selectGroupStaticSize(MachineInstr &I) const {
1723 Triple::OSType OS = MF->getTarget().getTargetTriple().getOS();
1724
1725 Register DstReg = I.getOperand(0).getReg();
1726 const RegisterBank *DstRB = RBI.getRegBank(DstReg, *MRI, TRI);
1727 unsigned Mov = DstRB->getID() == AMDGPU::SGPRRegBankID ?
1728 AMDGPU::S_MOV_B32 : AMDGPU::V_MOV_B32_e32;
1729
1730 MachineBasicBlock *MBB = I.getParent();
1731 const DebugLoc &DL = I.getDebugLoc();
1732
1733 auto MIB = BuildMI(*MBB, &I, DL, TII.get(Mov), DstReg);
1734
1735 if (OS == Triple::AMDHSA || OS == Triple::AMDPAL) {
1736 const SIMachineFunctionInfo *MFI = MF->getInfo<SIMachineFunctionInfo>();
1737 MIB.addImm(MFI->getLDSSize());
1738 } else {
1739 Module *M = MF->getFunction().getParent();
1740 const GlobalValue *GV =
1741 Intrinsic::getOrInsertDeclaration(M, Intrinsic::amdgcn_groupstaticsize);
1742    MIB.addGlobalAddress(GV, 0, SIInstrInfo::MO_ABS32_LO);
1743  }
1744
1745 I.eraseFromParent();
1746 constrainSelectedInstRegOperands(*MIB, TII, TRI, RBI);
1747 return true;
1748}
1749
1750bool AMDGPUInstructionSelector::selectReturnAddress(MachineInstr &I) const {
1751 MachineBasicBlock *MBB = I.getParent();
1752 MachineFunction &MF = *MBB->getParent();
1753 const DebugLoc &DL = I.getDebugLoc();
1754
1755 MachineOperand &Dst = I.getOperand(0);
1756 Register DstReg = Dst.getReg();
1757 unsigned Depth = I.getOperand(2).getImm();
1758
1759 const TargetRegisterClass *RC
1760 = TRI.getConstrainedRegClassForOperand(Dst, *MRI);
1761 if (!RC->hasSubClassEq(&AMDGPU::SGPR_64RegClass) ||
1762 !RBI.constrainGenericRegister(DstReg, *RC, *MRI))
1763 return false;
1764
1765 // Check for kernel and shader functions
1766 if (Depth != 0 ||
1767 MF.getInfo<SIMachineFunctionInfo>()->isEntryFunction()) {
1768 BuildMI(*MBB, &I, DL, TII.get(AMDGPU::S_MOV_B64), DstReg)
1769 .addImm(0);
1770 I.eraseFromParent();
1771 return true;
1772 }
1773
1774 MachineFrameInfo &MFI = MF.getFrameInfo();
1775 // There is a call to @llvm.returnaddress in this function
1776 MFI.setReturnAddressIsTaken(true);
1777
1778 // Get the return address reg and mark it as an implicit live-in
1779 Register ReturnAddrReg = TRI.getReturnAddressReg(MF);
1780 Register LiveIn = getFunctionLiveInPhysReg(MF, TII, ReturnAddrReg,
1781 AMDGPU::SReg_64RegClass, DL);
1782 BuildMI(*MBB, &I, DL, TII.get(AMDGPU::COPY), DstReg)
1783 .addReg(LiveIn);
1784 I.eraseFromParent();
1785 return true;
1786}
1787
1788bool AMDGPUInstructionSelector::selectEndCfIntrinsic(MachineInstr &MI) const {
1789 // FIXME: Manually selecting to avoid dealing with the SReg_1 trick
1790 // SelectionDAG uses for wave32 vs wave64.
1791 MachineBasicBlock *BB = MI.getParent();
1792 BuildMI(*BB, &MI, MI.getDebugLoc(), TII.get(AMDGPU::SI_END_CF))
1793 .add(MI.getOperand(1));
1794
1795 Register Reg = MI.getOperand(1).getReg();
1796 MI.eraseFromParent();
1797
1798 if (!MRI->getRegClassOrNull(Reg))
1799 MRI->setRegClass(Reg, TRI.getWaveMaskRegClass());
1800 return true;
1801}
1802
1803bool AMDGPUInstructionSelector::selectDSOrderedIntrinsic(
1804 MachineInstr &MI, Intrinsic::ID IntrID) const {
1805 MachineBasicBlock *MBB = MI.getParent();
1806 MachineFunction *MF = MBB->getParent();
1807 const DebugLoc &DL = MI.getDebugLoc();
1808
1809 unsigned IndexOperand = MI.getOperand(7).getImm();
1810 bool WaveRelease = MI.getOperand(8).getImm() != 0;
1811 bool WaveDone = MI.getOperand(9).getImm() != 0;
1812
1813 if (WaveDone && !WaveRelease) {
1814 // TODO: Move this to IR verifier
1815 const Function &Fn = MF->getFunction();
1816 Fn.getContext().diagnose(DiagnosticInfoUnsupported(
1817 Fn, "ds_ordered_count: wave_done requires wave_release", DL));
1818 }
1819
1820 unsigned OrderedCountIndex = IndexOperand & 0x3f;
1821 IndexOperand &= ~0x3f;
1822 unsigned CountDw = 0;
1823
1824 if (STI.getGeneration() >= AMDGPUSubtarget::GFX10) {
1825 CountDw = (IndexOperand >> 24) & 0xf;
1826 IndexOperand &= ~(0xf << 24);
1827
1828 if (CountDw < 1 || CountDw > 4) {
1829 const Function &Fn = MF->getFunction();
1830 Fn.getContext().diagnose(DiagnosticInfoUnsupported(
1831 Fn, "ds_ordered_count: dword count must be between 1 and 4", DL));
1832 CountDw = 1;
1833 }
1834 }
1835
1836 if (IndexOperand) {
1837 const Function &Fn = MF->getFunction();
1838 Fn.getContext().diagnose(DiagnosticInfoUnsupported(
1839 Fn, "ds_ordered_count: bad index operand", DL));
1840 }
1841
1842 unsigned Instruction = IntrID == Intrinsic::amdgcn_ds_ordered_add ? 0 : 1;
1843 unsigned ShaderType = SIInstrInfo::getDSShaderTypeValue(*MF);
1844
1845 unsigned Offset0 = OrderedCountIndex << 2;
1846 unsigned Offset1 = WaveRelease | (WaveDone << 1) | (Instruction << 4);
1847
1848 if (STI.getGeneration() >= AMDGPUSubtarget::GFX10)
1849 Offset1 |= (CountDw - 1) << 6;
1850
1851 if (STI.getGeneration() < AMDGPUSubtarget::GFX11)
1852 Offset1 |= ShaderType << 2;
1853
1854 unsigned Offset = Offset0 | (Offset1 << 8);
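  // To make the packing above concrete, one worked example (illustrative
  // values only): amdgcn_ds_ordered_add (Instruction = 0) with
  // OrderedCountIndex = 1, WaveRelease = 1, WaveDone = 1 and CountDw = 2 on a
  // GFX11 target gives
  //   Offset0 = 1 << 2                                   = 0x4
  //   Offset1 = 1 | (1 << 1) | (0 << 4) | ((2 - 1) << 6) = 0x43
  //   Offset  = 0x4 | (0x43 << 8)                        = 0x4304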
1855
1856 Register M0Val = MI.getOperand(2).getReg();
1857 BuildMI(*MBB, &MI, DL, TII.get(AMDGPU::COPY), AMDGPU::M0)
1858 .addReg(M0Val);
1859
1860 Register DstReg = MI.getOperand(0).getReg();
1861 Register ValReg = MI.getOperand(3).getReg();
1862 MachineInstrBuilder DS =
1863 BuildMI(*MBB, &MI, DL, TII.get(AMDGPU::DS_ORDERED_COUNT), DstReg)
1864 .addReg(ValReg)
1865 .addImm(Offset)
1866 .cloneMemRefs(MI);
1867
1868 if (!RBI.constrainGenericRegister(M0Val, AMDGPU::SReg_32RegClass, *MRI))
1869 return false;
1870
1871 constrainSelectedInstRegOperands(*DS, TII, TRI, RBI);
1872 MI.eraseFromParent();
1873 return true;
1874}
1875
1876static unsigned gwsIntrinToOpcode(unsigned IntrID) {
1877 switch (IntrID) {
1878 case Intrinsic::amdgcn_ds_gws_init:
1879 return AMDGPU::DS_GWS_INIT;
1880 case Intrinsic::amdgcn_ds_gws_barrier:
1881 return AMDGPU::DS_GWS_BARRIER;
1882 case Intrinsic::amdgcn_ds_gws_sema_v:
1883 return AMDGPU::DS_GWS_SEMA_V;
1884 case Intrinsic::amdgcn_ds_gws_sema_br:
1885 return AMDGPU::DS_GWS_SEMA_BR;
1886 case Intrinsic::amdgcn_ds_gws_sema_p:
1887 return AMDGPU::DS_GWS_SEMA_P;
1888 case Intrinsic::amdgcn_ds_gws_sema_release_all:
1889 return AMDGPU::DS_GWS_SEMA_RELEASE_ALL;
1890 default:
1891 llvm_unreachable("not a gws intrinsic");
1892 }
1893}
1894
1895bool AMDGPUInstructionSelector::selectDSGWSIntrinsic(MachineInstr &MI,
1896 Intrinsic::ID IID) const {
1897 if (!STI.hasGWS() || (IID == Intrinsic::amdgcn_ds_gws_sema_release_all &&
1898 !STI.hasGWSSemaReleaseAll()))
1899 return false;
1900
1901 // intrinsic ID, vsrc, offset
1902 const bool HasVSrc = MI.getNumOperands() == 3;
1903 assert(HasVSrc || MI.getNumOperands() == 2);
1904
1905 Register BaseOffset = MI.getOperand(HasVSrc ? 2 : 1).getReg();
1906 const RegisterBank *OffsetRB = RBI.getRegBank(BaseOffset, *MRI, TRI);
1907 if (OffsetRB->getID() != AMDGPU::SGPRRegBankID)
1908 return false;
1909
1910 MachineInstr *OffsetDef = getDefIgnoringCopies(BaseOffset, *MRI);
1911 unsigned ImmOffset;
1912
1913 MachineBasicBlock *MBB = MI.getParent();
1914 const DebugLoc &DL = MI.getDebugLoc();
1915
1916 MachineInstr *Readfirstlane = nullptr;
1917
1918 // If we legalized the VGPR input, strip out the readfirstlane to analyze the
1919 // incoming offset, in case there's an add of a constant. We'll have to put it
1920 // back later.
1921 if (OffsetDef->getOpcode() == AMDGPU::V_READFIRSTLANE_B32) {
1922 Readfirstlane = OffsetDef;
1923 BaseOffset = OffsetDef->getOperand(1).getReg();
1924 OffsetDef = getDefIgnoringCopies(BaseOffset, *MRI);
1925 }
1926
1927 if (OffsetDef->getOpcode() == AMDGPU::G_CONSTANT) {
1928 // If we have a constant offset, try to use the 0 in m0 as the base.
1929 // TODO: Look into changing the default m0 initialization value. If the
1930 // default -1 only set the low 16-bits, we could leave it as-is and add 1 to
1931 // the immediate offset.
1932
1933 ImmOffset = OffsetDef->getOperand(1).getCImm()->getZExtValue();
1934 BuildMI(*MBB, &MI, DL, TII.get(AMDGPU::S_MOV_B32), AMDGPU::M0)
1935 .addImm(0);
1936 } else {
1937 std::tie(BaseOffset, ImmOffset) =
1938 AMDGPU::getBaseWithConstantOffset(*MRI, BaseOffset, VT);
1939
1940 if (Readfirstlane) {
1941 // We have the constant offset now, so put the readfirstlane back on the
1942 // variable component.
1943 if (!RBI.constrainGenericRegister(BaseOffset, AMDGPU::VGPR_32RegClass, *MRI))
1944 return false;
1945
1946 Readfirstlane->getOperand(1).setReg(BaseOffset);
1947 BaseOffset = Readfirstlane->getOperand(0).getReg();
1948 } else {
1949 if (!RBI.constrainGenericRegister(BaseOffset,
1950 AMDGPU::SReg_32RegClass, *MRI))
1951 return false;
1952 }
1953
1954 Register M0Base = MRI->createVirtualRegister(&AMDGPU::SReg_32RegClass);
1955 BuildMI(*MBB, &MI, DL, TII.get(AMDGPU::S_LSHL_B32), M0Base)
1956 .addReg(BaseOffset)
1957 .addImm(16)
1958 .setOperandDead(3); // Dead scc
1959
1960 BuildMI(*MBB, &MI, DL, TII.get(AMDGPU::COPY), AMDGPU::M0)
1961 .addReg(M0Base);
1962 }
1963
1964 // The resource id offset is computed as (<isa opaque base> + M0[21:16] +
1965 // offset field) % 64. Some versions of the programming guide omit the m0
1966 // part, or claim it's from offset 0.
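  // Illustration: if the incoming offset was (%x + 2), the split above leaves
  // BaseOffset = %x and ImmOffset = 2; the S_LSHL_B32 by 16 places %x into
  // M0[21:16] and the DS_GWS instruction's offset field carries the 2, so the
  // hardware sees (<isa opaque base> + %x + 2) % 64.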
1967
1968 unsigned Opc = gwsIntrinToOpcode(IID);
1969 const MCInstrDesc &InstrDesc = TII.get(Opc);
1970
1971 if (HasVSrc) {
1972 Register VSrc = MI.getOperand(1).getReg();
1973
1974 int Data0Idx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::data0);
1975 const TargetRegisterClass *DataRC = TII.getRegClass(InstrDesc, Data0Idx);
1976 const TargetRegisterClass *SubRC =
1977 TRI.getSubRegisterClass(DataRC, AMDGPU::sub0);
1978
1979 if (!SubRC) {
1980 // 32-bit normal case.
1981 if (!RBI.constrainGenericRegister(VSrc, *DataRC, *MRI))
1982 return false;
1983
1984 BuildMI(*MBB, &MI, DL, InstrDesc)
1985 .addReg(VSrc)
1986 .addImm(ImmOffset)
1987 .cloneMemRefs(MI);
1988 } else {
1989 // Requires even register alignment, so create 64-bit value and pad the
1990 // top half with undef.
1991 Register DataReg = MRI->createVirtualRegister(DataRC);
1992 if (!RBI.constrainGenericRegister(VSrc, *SubRC, *MRI))
1993 return false;
1994
1995 Register UndefReg = MRI->createVirtualRegister(SubRC);
1996 BuildMI(*MBB, &MI, DL, TII.get(AMDGPU::IMPLICIT_DEF), UndefReg);
1997 BuildMI(*MBB, &MI, DL, TII.get(AMDGPU::REG_SEQUENCE), DataReg)
1998 .addReg(VSrc)
1999 .addImm(AMDGPU::sub0)
2000 .addReg(UndefReg)
2001 .addImm(AMDGPU::sub1);
2002
2003 BuildMI(*MBB, &MI, DL, InstrDesc)
2004 .addReg(DataReg)
2005 .addImm(ImmOffset)
2006 .cloneMemRefs(MI);
2007 }
2008 } else {
2009 BuildMI(*MBB, &MI, DL, InstrDesc)
2010 .addImm(ImmOffset)
2011 .cloneMemRefs(MI);
2012 }
2013
2014 MI.eraseFromParent();
2015 return true;
2016}
2017
2018bool AMDGPUInstructionSelector::selectDSAppendConsume(MachineInstr &MI,
2019 bool IsAppend) const {
2020 Register PtrBase = MI.getOperand(2).getReg();
2021 LLT PtrTy = MRI->getType(PtrBase);
2022 bool IsGDS = PtrTy.getAddressSpace() == AMDGPUAS::REGION_ADDRESS;
2023
2024 unsigned Offset;
2025 std::tie(PtrBase, Offset) = selectDS1Addr1OffsetImpl(MI.getOperand(2));
2026
2027 // TODO: Should this try to look through readfirstlane like GWS?
2028 if (!isDSOffsetLegal(PtrBase, Offset)) {
2029 PtrBase = MI.getOperand(2).getReg();
2030 Offset = 0;
2031 }
2032
2033 MachineBasicBlock *MBB = MI.getParent();
2034 const DebugLoc &DL = MI.getDebugLoc();
2035 const unsigned Opc = IsAppend ? AMDGPU::DS_APPEND : AMDGPU::DS_CONSUME;
2036
2037 BuildMI(*MBB, &MI, DL, TII.get(AMDGPU::COPY), AMDGPU::M0)
2038 .addReg(PtrBase);
2039 if (!RBI.constrainGenericRegister(PtrBase, AMDGPU::SReg_32RegClass, *MRI))
2040 return false;
2041
2042 auto MIB = BuildMI(*MBB, &MI, DL, TII.get(Opc), MI.getOperand(0).getReg())
2043 .addImm(Offset)
2044 .addImm(IsGDS ? -1 : 0)
2045 .cloneMemRefs(MI);
2046 MI.eraseFromParent();
2047 constrainSelectedInstRegOperands(*MIB, TII, TRI, RBI);
2048 return true;
2049}
2050
2051bool AMDGPUInstructionSelector::selectInitWholeWave(MachineInstr &MI) const {
2052 MachineFunction *MF = MI.getMF();
2053 SIMachineFunctionInfo *MFInfo = MF->getInfo<SIMachineFunctionInfo>();
2054
2055 MFInfo->setInitWholeWave();
2056 return selectImpl(MI, *CoverageInfo);
2057}
2058
2059static bool parseTexFail(uint64_t TexFailCtrl, bool &TFE, bool &LWE,
2060 bool &IsTexFail) {
2061 if (TexFailCtrl)
2062 IsTexFail = true;
2063
2064 TFE = TexFailCtrl & 0x1;
2065 TexFailCtrl &= ~(uint64_t)0x1;
2066 LWE = TexFailCtrl & 0x2;
2067 TexFailCtrl &= ~(uint64_t)0x2;
2068
2069 return TexFailCtrl == 0;
2070}
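// For example, TexFailCtrl = 0x3 decodes to TFE = 1, LWE = 1 and IsTexFail =
// true with no residue, so parseTexFail returns true; any bit outside [1:0]
// (say TexFailCtrl = 0x4) leaves residue and makes it return false, which
// rejects the intrinsic below.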
2071
2072bool AMDGPUInstructionSelector::selectImageIntrinsic(
2073 MachineInstr &MI, const AMDGPU::ImageDimIntrinsicInfo *Intr) const {
2074 MachineBasicBlock *MBB = MI.getParent();
2075 const DebugLoc &DL = MI.getDebugLoc();
2076 unsigned IntrOpcode = Intr->BaseOpcode;
2077
2078 // For image atomic: use no-return opcode if result is unused.
2079 if (Intr->AtomicNoRetBaseOpcode != Intr->BaseOpcode) {
2080 Register ResultDef = MI.getOperand(0).getReg();
2081 if (MRI->use_nodbg_empty(ResultDef))
2082 IntrOpcode = Intr->AtomicNoRetBaseOpcode;
2083 }
2084
2085 const AMDGPU::MIMGBaseOpcodeInfo *BaseOpcode =
2087      AMDGPU::getMIMGBaseOpcodeInfo(IntrOpcode);
2088 const AMDGPU::MIMGDimInfo *DimInfo = AMDGPU::getMIMGDimInfo(Intr->Dim);
2089 const bool IsGFX10Plus = AMDGPU::isGFX10Plus(STI);
2090 const bool IsGFX11Plus = AMDGPU::isGFX11Plus(STI);
2091 const bool IsGFX12Plus = AMDGPU::isGFX12Plus(STI);
2092 const bool IsGFX13Plus = AMDGPU::isGFX13Plus(STI);
2093
2094 const unsigned ArgOffset = MI.getNumExplicitDefs() + 1;
2095
2096 Register VDataIn = AMDGPU::NoRegister;
2097 Register VDataOut = AMDGPU::NoRegister;
2098 LLT VDataTy;
2099 int NumVDataDwords = -1;
2100 bool IsD16 = MI.getOpcode() == AMDGPU::G_AMDGPU_INTRIN_IMAGE_LOAD_D16 ||
2101 MI.getOpcode() == AMDGPU::G_AMDGPU_INTRIN_IMAGE_STORE_D16;
2102
2103 bool Unorm;
2104 if (!BaseOpcode->Sampler)
2105 Unorm = true;
2106 else
2107 Unorm = MI.getOperand(ArgOffset + Intr->UnormIndex).getImm() != 0;
2108
2109 bool TFE;
2110 bool LWE;
2111 bool IsTexFail = false;
2112 if (!parseTexFail(MI.getOperand(ArgOffset + Intr->TexFailCtrlIndex).getImm(),
2113 TFE, LWE, IsTexFail))
2114 return false;
2115
2116 const int Flags = MI.getOperand(ArgOffset + Intr->NumArgs).getImm();
2117 const bool IsA16 = (Flags & 1) != 0;
2118 const bool IsG16 = (Flags & 2) != 0;
2119
2120 // A16 implies 16 bit gradients if subtarget doesn't support G16
2121 if (IsA16 && !STI.hasG16() && !IsG16)
2122 return false;
2123
2124 unsigned DMask = 0;
2125 unsigned DMaskLanes = 0;
2126
2127 if (BaseOpcode->Atomic) {
2128 if (!BaseOpcode->NoReturn)
2129 VDataOut = MI.getOperand(0).getReg();
2130 VDataIn = MI.getOperand(2).getReg();
2131 LLT Ty = MRI->getType(VDataIn);
2132
2133 // Be careful to allow atomic swap on 16-bit element vectors.
2134 const bool Is64Bit = BaseOpcode->AtomicX2 ?
2135 Ty.getSizeInBits() == 128 :
2136 Ty.getSizeInBits() == 64;
2137
2138 if (BaseOpcode->AtomicX2) {
2139 assert(MI.getOperand(3).getReg() == AMDGPU::NoRegister);
2140
2141 DMask = Is64Bit ? 0xf : 0x3;
2142 NumVDataDwords = Is64Bit ? 4 : 2;
2143 } else {
2144 DMask = Is64Bit ? 0x3 : 0x1;
2145 NumVDataDwords = Is64Bit ? 2 : 1;
2146 }
2147 } else {
2148 DMask = MI.getOperand(ArgOffset + Intr->DMaskIndex).getImm();
2149 DMaskLanes = BaseOpcode->Gather4 ? 4 : llvm::popcount(DMask);
2150
2151 if (BaseOpcode->Store) {
2152 VDataIn = MI.getOperand(1).getReg();
2153 VDataTy = MRI->getType(VDataIn);
2154 NumVDataDwords = (VDataTy.getSizeInBits() + 31) / 32;
2155 } else if (BaseOpcode->NoReturn) {
2156 NumVDataDwords = 0;
2157 } else {
2158 VDataOut = MI.getOperand(0).getReg();
2159 VDataTy = MRI->getType(VDataOut);
2160 NumVDataDwords = DMaskLanes;
2161
2162 if (IsD16 && !STI.hasUnpackedD16VMem())
2163 NumVDataDwords = (DMaskLanes + 1) / 2;
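      // e.g. a D16 load with DMask = 0b0111 has 3 enabled lanes; with packed
      // D16 VMEM two 16-bit lanes share a dword, so (3 + 1) / 2 = 2 result
      // dwords are needed.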
2164 }
2165 }
2166
2167 // Set G16 opcode
2168 if (Subtarget->hasG16() && IsG16) {
2169 const AMDGPU::MIMGG16MappingInfo *G16MappingInfo =
2170        AMDGPU::getMIMGG16MappingInfo(Intr->BaseOpcode);
2171    assert(G16MappingInfo);
2172 IntrOpcode = G16MappingInfo->G16; // set opcode to variant with _g16
2173 }
2174
2175 // TODO: Check this in verifier.
2176 assert((!IsTexFail || DMaskLanes >= 1) && "should have legalized this");
2177
2178 unsigned CPol = MI.getOperand(ArgOffset + Intr->CachePolicyIndex).getImm();
2179 // Keep GLC only when the atomic's result is actually used.
2180 if (BaseOpcode->Atomic && !BaseOpcode->NoReturn)
2181    CPol |= AMDGPU::CPol::GLC;
2182  if (CPol & ~((IsGFX12Plus ? AMDGPU::CPol::ALL : AMDGPU::CPol::ALL_pregfx12) |
2183               AMDGPU::CPol::VOLATILE))
2184    return false;
2185
2186 int NumVAddrRegs = 0;
2187 int NumVAddrDwords = 0;
2188 for (unsigned I = Intr->VAddrStart; I < Intr->VAddrEnd; I++) {
2189 // Skip the $noregs and 0s inserted during legalization.
2190 MachineOperand &AddrOp = MI.getOperand(ArgOffset + I);
2191 if (!AddrOp.isReg())
2192 continue; // XXX - Break?
2193
2194 Register Addr = AddrOp.getReg();
2195 if (!Addr)
2196 break;
2197
2198 ++NumVAddrRegs;
2199 NumVAddrDwords += (MRI->getType(Addr).getSizeInBits() + 31) / 32;
2200 }
2201
2202 // The legalizer preprocessed the intrinsic arguments. If we aren't using
2203 // NSA, these should have been packed into a single value in the first
2204 // address register
2205 const bool UseNSA =
2206 NumVAddrRegs != 1 &&
2207 (STI.hasPartialNSAEncoding() ? NumVAddrDwords >= NumVAddrRegs
2208 : NumVAddrDwords == NumVAddrRegs);
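  // Illustration: three address registers each holding a single dword give
  // NumVAddrDwords == NumVAddrRegs and select an NSA encoding; a single
  // register with all the dwords packed keeps NumVAddrRegs at 1 and falls
  // back to the non-NSA form.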
2209 if (UseNSA && !STI.hasFeature(AMDGPU::FeatureNSAEncoding)) {
2210 LLVM_DEBUG(dbgs() << "Trying to use NSA on non-NSA target\n");
2211 return false;
2212 }
2213
2214 if (IsTexFail)
2215 ++NumVDataDwords;
2216
2217 int Opcode = -1;
2218 if (IsGFX13Plus) {
2219 Opcode = AMDGPU::getMIMGOpcode(IntrOpcode, AMDGPU::MIMGEncGfx13,
2220 NumVDataDwords, NumVAddrDwords);
2221 } else if (IsGFX12Plus) {
2222 Opcode = AMDGPU::getMIMGOpcode(IntrOpcode, AMDGPU::MIMGEncGfx12,
2223 NumVDataDwords, NumVAddrDwords);
2224 } else if (IsGFX11Plus) {
2225 Opcode = AMDGPU::getMIMGOpcode(IntrOpcode,
2226 UseNSA ? AMDGPU::MIMGEncGfx11NSA
2227 : AMDGPU::MIMGEncGfx11Default,
2228 NumVDataDwords, NumVAddrDwords);
2229 } else if (IsGFX10Plus) {
2230 Opcode = AMDGPU::getMIMGOpcode(IntrOpcode,
2231 UseNSA ? AMDGPU::MIMGEncGfx10NSA
2232 : AMDGPU::MIMGEncGfx10Default,
2233 NumVDataDwords, NumVAddrDwords);
2234 } else {
2235 if (Subtarget->hasGFX90AInsts()) {
2236 Opcode = AMDGPU::getMIMGOpcode(IntrOpcode, AMDGPU::MIMGEncGfx90a,
2237 NumVDataDwords, NumVAddrDwords);
2238 if (Opcode == -1) {
2239 LLVM_DEBUG(
2240 dbgs()
2241 << "requested image instruction is not supported on this GPU\n");
2242 return false;
2243 }
2244 }
2245 if (Opcode == -1 &&
2246 STI.getGeneration() >= AMDGPUSubtarget::VOLCANIC_ISLANDS)
2247 Opcode = AMDGPU::getMIMGOpcode(IntrOpcode, AMDGPU::MIMGEncGfx8,
2248 NumVDataDwords, NumVAddrDwords);
2249 if (Opcode == -1)
2250 Opcode = AMDGPU::getMIMGOpcode(IntrOpcode, AMDGPU::MIMGEncGfx6,
2251 NumVDataDwords, NumVAddrDwords);
2252 }
2253 if (Opcode == -1)
2254 return false;
2255
2256 auto MIB = BuildMI(*MBB, &MI, DL, TII.get(Opcode))
2257 .cloneMemRefs(MI);
2258
2259 if (VDataOut) {
2260 if (BaseOpcode->AtomicX2) {
2261 const bool Is64 = MRI->getType(VDataOut).getSizeInBits() == 64;
2262
2263 Register TmpReg = MRI->createVirtualRegister(
2264 Is64 ? &AMDGPU::VReg_128RegClass : &AMDGPU::VReg_64RegClass);
2265 unsigned SubReg = Is64 ? AMDGPU::sub0_sub1 : AMDGPU::sub0;
2266
2267 MIB.addDef(TmpReg);
2268 if (!MRI->use_empty(VDataOut)) {
2269 BuildMI(*MBB, &MI, DL, TII.get(AMDGPU::COPY), VDataOut)
2270 .addReg(TmpReg, RegState::Kill, SubReg);
2271 }
2272
2273 } else {
2274 MIB.addDef(VDataOut); // vdata output
2275 }
2276 }
2277
2278 if (VDataIn)
2279 MIB.addReg(VDataIn); // vdata input
2280
2281 for (int I = 0; I != NumVAddrRegs; ++I) {
2282 MachineOperand &SrcOp = MI.getOperand(ArgOffset + Intr->VAddrStart + I);
2283 if (SrcOp.isReg()) {
2284 assert(SrcOp.getReg() != 0);
2285 MIB.addReg(SrcOp.getReg());
2286 }
2287 }
2288
2289 MIB.addReg(MI.getOperand(ArgOffset + Intr->RsrcIndex).getReg());
2290 if (BaseOpcode->Sampler)
2291 MIB.addReg(MI.getOperand(ArgOffset + Intr->SampIndex).getReg());
2292
2293 MIB.addImm(DMask); // dmask
2294
2295 if (IsGFX10Plus)
2296 MIB.addImm(DimInfo->Encoding);
2297 if (AMDGPU::hasNamedOperand(Opcode, AMDGPU::OpName::unorm))
2298 MIB.addImm(Unorm);
2299
2300 MIB.addImm(CPol);
2301 MIB.addImm(IsA16 && // a16 or r128
2302 STI.hasFeature(AMDGPU::FeatureR128A16) ? -1 : 0);
2303 if (IsGFX10Plus)
2304 MIB.addImm(IsA16 ? -1 : 0);
2305
2306 if (!Subtarget->hasGFX90AInsts()) {
2307 MIB.addImm(TFE); // tfe
2308 } else if (TFE) {
2309 LLVM_DEBUG(dbgs() << "TFE is not supported on this GPU\n");
2310 return false;
2311 }
2312
2313 if (AMDGPU::hasNamedOperand(Opcode, AMDGPU::OpName::lwe))
2314 MIB.addImm(LWE); // lwe
2315 if (!IsGFX10Plus)
2316 MIB.addImm(DimInfo->DA ? -1 : 0);
2317 if (BaseOpcode->HasD16)
2318 MIB.addImm(IsD16 ? -1 : 0);
2319
2320 MI.eraseFromParent();
2321 constrainSelectedInstRegOperands(*MIB, TII, TRI, RBI);
2322 TII.enforceOperandRCAlignment(*MIB, AMDGPU::OpName::vaddr);
2323 return true;
2324}
2325
2326// We need to handle this here because tablegen doesn't support matching
2327// instructions with multiple outputs.
2328bool AMDGPUInstructionSelector::selectDSBvhStackIntrinsic(
2329 MachineInstr &MI) const {
2330 Register Dst0 = MI.getOperand(0).getReg();
2331 Register Dst1 = MI.getOperand(1).getReg();
2332
2333 const DebugLoc &DL = MI.getDebugLoc();
2334 MachineBasicBlock *MBB = MI.getParent();
2335
2336 Register Addr = MI.getOperand(3).getReg();
2337 Register Data0 = MI.getOperand(4).getReg();
2338 Register Data1 = MI.getOperand(5).getReg();
2339 unsigned Offset = MI.getOperand(6).getImm();
2340
2341 unsigned Opc;
2342 switch (cast<GIntrinsic>(MI).getIntrinsicID()) {
2343 case Intrinsic::amdgcn_ds_bvh_stack_rtn:
2344 case Intrinsic::amdgcn_ds_bvh_stack_push4_pop1_rtn:
2345 Opc = AMDGPU::DS_BVH_STACK_RTN_B32;
2346 break;
2347 case Intrinsic::amdgcn_ds_bvh_stack_push8_pop1_rtn:
2348 Opc = AMDGPU::DS_BVH_STACK_PUSH8_POP1_RTN_B32;
2349 break;
2350 case Intrinsic::amdgcn_ds_bvh_stack_push8_pop2_rtn:
2351 Opc = AMDGPU::DS_BVH_STACK_PUSH8_POP2_RTN_B64;
2352 break;
2353 }
2354
2355 auto MIB = BuildMI(*MBB, &MI, DL, TII.get(Opc), Dst0)
2356 .addDef(Dst1)
2357 .addUse(Addr)
2358 .addUse(Data0)
2359 .addUse(Data1)
2360 .addImm(Offset)
2361 .cloneMemRefs(MI);
2362
2363 MI.eraseFromParent();
2364 constrainSelectedInstRegOperands(*MIB, TII, TRI, RBI);
2365 return true;
2366}
2367
2368bool AMDGPUInstructionSelector::selectG_INTRINSIC_W_SIDE_EFFECTS(
2369 MachineInstr &I) const {
2370 Intrinsic::ID IntrinsicID = cast<GIntrinsic>(I).getIntrinsicID();
2371 switch (IntrinsicID) {
2372 case Intrinsic::amdgcn_end_cf:
2373 return selectEndCfIntrinsic(I);
2374 case Intrinsic::amdgcn_ds_ordered_add:
2375 case Intrinsic::amdgcn_ds_ordered_swap:
2376 return selectDSOrderedIntrinsic(I, IntrinsicID);
2377 case Intrinsic::amdgcn_ds_gws_init:
2378 case Intrinsic::amdgcn_ds_gws_barrier:
2379 case Intrinsic::amdgcn_ds_gws_sema_v:
2380 case Intrinsic::amdgcn_ds_gws_sema_br:
2381 case Intrinsic::amdgcn_ds_gws_sema_p:
2382 case Intrinsic::amdgcn_ds_gws_sema_release_all:
2383 return selectDSGWSIntrinsic(I, IntrinsicID);
2384 case Intrinsic::amdgcn_ds_append:
2385 return selectDSAppendConsume(I, true);
2386 case Intrinsic::amdgcn_ds_consume:
2387 return selectDSAppendConsume(I, false);
2388 case Intrinsic::amdgcn_init_whole_wave:
2389 return selectInitWholeWave(I);
2390 case Intrinsic::amdgcn_raw_buffer_load_lds:
2391 case Intrinsic::amdgcn_raw_buffer_load_async_lds:
2392 case Intrinsic::amdgcn_raw_ptr_buffer_load_lds:
2393 case Intrinsic::amdgcn_raw_ptr_buffer_load_async_lds:
2394 case Intrinsic::amdgcn_struct_buffer_load_lds:
2395 case Intrinsic::amdgcn_struct_buffer_load_async_lds:
2396 case Intrinsic::amdgcn_struct_ptr_buffer_load_lds:
2397 case Intrinsic::amdgcn_struct_ptr_buffer_load_async_lds:
2398 return selectBufferLoadLds(I);
2399  // Until we can store both the address space of the global and the LDS
2400  // arguments by having two MachineMemOperands on an intrinsic, we just trust
2401  // that the argument is a global pointer (buffer pointers have been handled
2402  // by an LLVM IR-level lowering).
2403 case Intrinsic::amdgcn_load_to_lds:
2404 case Intrinsic::amdgcn_load_async_to_lds:
2405 case Intrinsic::amdgcn_global_load_lds:
2406 case Intrinsic::amdgcn_global_load_async_lds:
2407 return selectGlobalLoadLds(I);
2408 case Intrinsic::amdgcn_tensor_load_to_lds:
2409 case Intrinsic::amdgcn_tensor_store_from_lds:
2410 return selectTensorLoadStore(I, IntrinsicID);
2411 case Intrinsic::amdgcn_asyncmark:
2412 case Intrinsic::amdgcn_wait_asyncmark:
2413 if (!Subtarget->hasAsyncMark())
2414 return false;
2415 break;
2416 case Intrinsic::amdgcn_exp_compr:
2417 if (!STI.hasCompressedExport()) {
2418 Function &F = I.getMF()->getFunction();
2419 F.getContext().diagnose(
2420 DiagnosticInfoUnsupported(F, "intrinsic not supported on subtarget",
2421 I.getDebugLoc(), DS_Error));
2422 return false;
2423 }
2424 break;
2425 case Intrinsic::amdgcn_ds_bvh_stack_rtn:
2426 case Intrinsic::amdgcn_ds_bvh_stack_push4_pop1_rtn:
2427 case Intrinsic::amdgcn_ds_bvh_stack_push8_pop1_rtn:
2428 case Intrinsic::amdgcn_ds_bvh_stack_push8_pop2_rtn:
2429 return selectDSBvhStackIntrinsic(I);
2430 case Intrinsic::amdgcn_s_alloc_vgpr: {
2431    // S_ALLOC_VGPR doesn't have a destination register; it just implicitly sets
2432    // SCC. We then need to COPY SCC into the result vreg.
2433 MachineBasicBlock *MBB = I.getParent();
2434 const DebugLoc &DL = I.getDebugLoc();
2435
2436 Register ResReg = I.getOperand(0).getReg();
2437
2438 MachineInstr *AllocMI = BuildMI(*MBB, &I, DL, TII.get(AMDGPU::S_ALLOC_VGPR))
2439 .add(I.getOperand(2));
2440 (void)BuildMI(*MBB, &I, DL, TII.get(AMDGPU::COPY), ResReg)
2441 .addReg(AMDGPU::SCC);
2442 I.eraseFromParent();
2443 constrainSelectedInstRegOperands(*AllocMI, TII, TRI, RBI);
2444 return RBI.constrainGenericRegister(ResReg, AMDGPU::SReg_32RegClass, *MRI);
2445 }
2446 case Intrinsic::amdgcn_s_barrier_init:
2447 case Intrinsic::amdgcn_s_barrier_signal_var:
2448 return selectNamedBarrierInit(I, IntrinsicID);
2449 case Intrinsic::amdgcn_s_wakeup_barrier: {
2450 if (!STI.hasSWakeupBarrier()) {
2451 Function &F = I.getMF()->getFunction();
2452 F.getContext().diagnose(
2453 DiagnosticInfoUnsupported(F, "intrinsic not supported on subtarget",
2454 I.getDebugLoc(), DS_Error));
2455 return false;
2456 }
2457 return selectNamedBarrierInst(I, IntrinsicID);
2458 }
2459 case Intrinsic::amdgcn_s_barrier_join:
2460 case Intrinsic::amdgcn_s_get_named_barrier_state:
2461 return selectNamedBarrierInst(I, IntrinsicID);
2462 case Intrinsic::amdgcn_s_get_barrier_state:
2463 return selectSGetBarrierState(I, IntrinsicID);
2464 case Intrinsic::amdgcn_s_barrier_signal_isfirst:
2465 return selectSBarrierSignalIsfirst(I, IntrinsicID);
2466 }
2467 return selectImpl(I, *CoverageInfo);
2468}
2469
2470bool AMDGPUInstructionSelector::selectG_SELECT(MachineInstr &I) const {
2471 if (selectImpl(I, *CoverageInfo))
2472 return true;
2473
2474 MachineBasicBlock *BB = I.getParent();
2475 const DebugLoc &DL = I.getDebugLoc();
2476
2477 Register DstReg = I.getOperand(0).getReg();
2478 unsigned Size = RBI.getSizeInBits(DstReg, *MRI, TRI);
2479 assert(Size <= 32 || Size == 64);
2480 const MachineOperand &CCOp = I.getOperand(1);
2481 Register CCReg = CCOp.getReg();
2482 if (!isVCC(CCReg, *MRI)) {
2483 unsigned SelectOpcode = Size == 64 ? AMDGPU::S_CSELECT_B64 :
2484 AMDGPU::S_CSELECT_B32;
2485 MachineInstr *CopySCC = BuildMI(*BB, &I, DL, TII.get(AMDGPU::COPY), AMDGPU::SCC)
2486 .addReg(CCReg);
2487
2488    // The generic constrainSelectedInstRegOperands doesn't work for the scc
2489    // register bank, because it does not cover the register class we use to
2490    // represent it, so we need to manually set the register class here.
2491 if (!MRI->getRegClassOrNull(CCReg))
2492 MRI->setRegClass(CCReg, TRI.getConstrainedRegClassForOperand(CCOp, *MRI));
2493 MachineInstr *Select = BuildMI(*BB, &I, DL, TII.get(SelectOpcode), DstReg)
2494 .add(I.getOperand(2))
2495 .add(I.getOperand(3));
2496
2497    constrainSelectedInstRegOperands(*Select, TII, TRI, RBI);
2498    constrainSelectedInstRegOperands(*CopySCC, TII, TRI, RBI);
2499 I.eraseFromParent();
2500 return true;
2501 }
2502
2503 // Wide VGPR select should have been split in RegBankSelect.
2504 if (Size > 32)
2505 return false;
2506
2507 MachineInstr *Select =
2508 BuildMI(*BB, &I, DL, TII.get(AMDGPU::V_CNDMASK_B32_e64), DstReg)
2509 .addImm(0)
2510 .add(I.getOperand(3))
2511 .addImm(0)
2512 .add(I.getOperand(2))
2513 .add(I.getOperand(1));
2514
2515  constrainSelectedInstRegOperands(*Select, TII, TRI, RBI);
2516  I.eraseFromParent();
2517 return true;
2518}
2519
2520bool AMDGPUInstructionSelector::selectG_TRUNC(MachineInstr &I) const {
2521 Register DstReg = I.getOperand(0).getReg();
2522 Register SrcReg = I.getOperand(1).getReg();
2523 const LLT DstTy = MRI->getType(DstReg);
2524 const LLT SrcTy = MRI->getType(SrcReg);
2525 const LLT S1 = LLT::scalar(1);
2526
2527 const RegisterBank *SrcRB = RBI.getRegBank(SrcReg, *MRI, TRI);
2528 const RegisterBank *DstRB;
2529 if (DstTy == S1) {
2530 // This is a special case. We don't treat s1 for legalization artifacts as
2531 // vcc booleans.
2532 DstRB = SrcRB;
2533 } else {
2534 DstRB = RBI.getRegBank(DstReg, *MRI, TRI);
2535 if (SrcRB != DstRB)
2536 return false;
2537 }
2538
2539 const bool IsVALU = DstRB->getID() == AMDGPU::VGPRRegBankID;
2540
2541 unsigned DstSize = DstTy.getSizeInBits();
2542 unsigned SrcSize = SrcTy.getSizeInBits();
2543
2544 const TargetRegisterClass *SrcRC =
2545 TRI.getRegClassForSizeOnBank(SrcSize, *SrcRB);
2546 const TargetRegisterClass *DstRC =
2547 TRI.getRegClassForSizeOnBank(DstSize, *DstRB);
2548 if (!SrcRC || !DstRC)
2549 return false;
2550
2551 if (!RBI.constrainGenericRegister(SrcReg, *SrcRC, *MRI) ||
2552 !RBI.constrainGenericRegister(DstReg, *DstRC, *MRI)) {
2553 LLVM_DEBUG(dbgs() << "Failed to constrain G_TRUNC\n");
2554 return false;
2555 }
2556
2557 if (DstRC == &AMDGPU::VGPR_16RegClass && SrcSize == 32) {
2558 assert(STI.useRealTrue16Insts());
2559 const DebugLoc &DL = I.getDebugLoc();
2560 MachineBasicBlock *MBB = I.getParent();
2561 BuildMI(*MBB, I, DL, TII.get(AMDGPU::COPY), DstReg)
2562 .addReg(SrcReg, {}, AMDGPU::lo16);
2563 I.eraseFromParent();
2564 return true;
2565 }
2566
2567 if (DstTy == LLT::fixed_vector(2, 16) && SrcTy == LLT::fixed_vector(2, 32)) {
2568 MachineBasicBlock *MBB = I.getParent();
2569 const DebugLoc &DL = I.getDebugLoc();
2570
2571 Register LoReg = MRI->createVirtualRegister(DstRC);
2572 Register HiReg = MRI->createVirtualRegister(DstRC);
2573 BuildMI(*MBB, I, DL, TII.get(AMDGPU::COPY), LoReg)
2574 .addReg(SrcReg, {}, AMDGPU::sub0);
2575 BuildMI(*MBB, I, DL, TII.get(AMDGPU::COPY), HiReg)
2576 .addReg(SrcReg, {}, AMDGPU::sub1);
2577
2578 if (IsVALU && STI.hasSDWA()) {
2579 // Write the low 16-bits of the high element into the high 16-bits of the
2580 // low element.
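      // Illustration: with LoReg = 0x1111AAAA and HiReg = 0x2222BBBB, src0_sel
      // WORD_0 reads 0xBBBB, dst_sel WORD_1 writes it to Dst[31:16], and
      // UNUSED_PRESERVE keeps Dst[15:0] = 0xAAAA from the tied LoReg, giving
      // Dst = 0xBBBBAAAA.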
2581 MachineInstr *MovSDWA =
2582 BuildMI(*MBB, I, DL, TII.get(AMDGPU::V_MOV_B32_sdwa), DstReg)
2583 .addImm(0) // $src0_modifiers
2584 .addReg(HiReg) // $src0
2585 .addImm(0) // $clamp
2586 .addImm(AMDGPU::SDWA::WORD_1) // $dst_sel
2587 .addImm(AMDGPU::SDWA::UNUSED_PRESERVE) // $dst_unused
2588 .addImm(AMDGPU::SDWA::WORD_0) // $src0_sel
2589 .addReg(LoReg, RegState::Implicit);
2590 MovSDWA->tieOperands(0, MovSDWA->getNumOperands() - 1);
2591 } else {
2592 Register TmpReg0 = MRI->createVirtualRegister(DstRC);
2593 Register TmpReg1 = MRI->createVirtualRegister(DstRC);
2594 Register ImmReg = MRI->createVirtualRegister(DstRC);
2595 if (IsVALU) {
2596 BuildMI(*MBB, I, DL, TII.get(AMDGPU::V_LSHLREV_B32_e64), TmpReg0)
2597 .addImm(16)
2598 .addReg(HiReg);
2599 } else {
2600 BuildMI(*MBB, I, DL, TII.get(AMDGPU::S_LSHL_B32), TmpReg0)
2601 .addReg(HiReg)
2602 .addImm(16)
2603 .setOperandDead(3); // Dead scc
2604 }
2605
2606 unsigned MovOpc = IsVALU ? AMDGPU::V_MOV_B32_e32 : AMDGPU::S_MOV_B32;
2607 unsigned AndOpc = IsVALU ? AMDGPU::V_AND_B32_e64 : AMDGPU::S_AND_B32;
2608 unsigned OrOpc = IsVALU ? AMDGPU::V_OR_B32_e64 : AMDGPU::S_OR_B32;
2609
2610 BuildMI(*MBB, I, DL, TII.get(MovOpc), ImmReg)
2611 .addImm(0xffff);
2612 auto And = BuildMI(*MBB, I, DL, TII.get(AndOpc), TmpReg1)
2613 .addReg(LoReg)
2614 .addReg(ImmReg);
2615 auto Or = BuildMI(*MBB, I, DL, TII.get(OrOpc), DstReg)
2616 .addReg(TmpReg0)
2617 .addReg(TmpReg1);
2618
2619 if (!IsVALU) {
2620 And.setOperandDead(3); // Dead scc
2621 Or.setOperandDead(3); // Dead scc
2622 }
2623 }
2624
2625 I.eraseFromParent();
2626 return true;
2627 }
2628
2629 if (!DstTy.isScalar())
2630 return false;
2631
2632 if (SrcSize > 32) {
2633 unsigned SubRegIdx = DstSize < 32
2634 ? static_cast<unsigned>(AMDGPU::sub0)
2635 : TRI.getSubRegFromChannel(0, DstSize / 32);
2636 if (SubRegIdx == AMDGPU::NoSubRegister)
2637 return false;
2638
2639 // Deal with weird cases where the class only partially supports the subreg
2640 // index.
2641 const TargetRegisterClass *SrcWithSubRC
2642 = TRI.getSubClassWithSubReg(SrcRC, SubRegIdx);
2643 if (!SrcWithSubRC)
2644 return false;
2645
2646 if (SrcWithSubRC != SrcRC) {
2647 if (!RBI.constrainGenericRegister(SrcReg, *SrcWithSubRC, *MRI))
2648 return false;
2649 }
2650
2651 I.getOperand(1).setSubReg(SubRegIdx);
2652 }
2653
2654 I.setDesc(TII.get(TargetOpcode::COPY));
2655 return true;
2656}
2657
2658/// \returns true if a bitmask for \p Size bits will be an inline immediate.
2659static bool shouldUseAndMask(unsigned Size, unsigned &Mask) {
2660  Mask = maskTrailingOnes<unsigned>(Size);
2661  int SignedMask = static_cast<int>(Mask);
2662 return SignedMask >= -16 && SignedMask <= 64;
2663}
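// Worked example (assuming Mask is the Size-bit all-ones value, as the doc
// comment above implies): Size == 6 yields Mask == 0x3f, i.e. 63, which is
// within [-16, 64] and therefore an inline immediate; Size == 16 yields
// 0xffff, i.e. 65535, which is not, so the caller falls back to a BFE.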
2664
2665// Like RegisterBankInfo::getRegBank, but don't assume vcc for s1.
2666const RegisterBank *AMDGPUInstructionSelector::getArtifactRegBank(
2667 Register Reg, const MachineRegisterInfo &MRI,
2668 const TargetRegisterInfo &TRI) const {
2669 const RegClassOrRegBank &RegClassOrBank = MRI.getRegClassOrRegBank(Reg);
2670 if (auto *RB = dyn_cast<const RegisterBank *>(RegClassOrBank))
2671 return RB;
2672
2673 // Ignore the type, since we don't use vcc in artifacts.
2674 if (auto *RC = dyn_cast<const TargetRegisterClass *>(RegClassOrBank))
2675 return &RBI.getRegBankFromRegClass(*RC, LLT());
2676 return nullptr;
2677}
2678
2679bool AMDGPUInstructionSelector::selectG_SZA_EXT(MachineInstr &I) const {
2680 bool InReg = I.getOpcode() == AMDGPU::G_SEXT_INREG;
2681 bool Signed = I.getOpcode() == AMDGPU::G_SEXT || InReg;
2682 const DebugLoc &DL = I.getDebugLoc();
2683 MachineBasicBlock &MBB = *I.getParent();
2684 const Register DstReg = I.getOperand(0).getReg();
2685 const Register SrcReg = I.getOperand(1).getReg();
2686
2687 const LLT DstTy = MRI->getType(DstReg);
2688 const LLT SrcTy = MRI->getType(SrcReg);
2689 const unsigned SrcSize = I.getOpcode() == AMDGPU::G_SEXT_INREG ?
2690 I.getOperand(2).getImm() : SrcTy.getSizeInBits();
2691 const unsigned DstSize = DstTy.getSizeInBits();
2692 if (!DstTy.isScalar())
2693 return false;
2694
2695 // Artifact casts should never use vcc.
2696 const RegisterBank *SrcBank = getArtifactRegBank(SrcReg, *MRI, TRI);
2697
2698 // FIXME: This should probably be illegal and split earlier.
2699 if (I.getOpcode() == AMDGPU::G_ANYEXT) {
2700 if (DstSize <= 32)
2701 return selectCOPY(I);
2702
2703 const TargetRegisterClass *SrcRC =
2704 TRI.getRegClassForTypeOnBank(SrcTy, *SrcBank);
2705 const RegisterBank *DstBank = RBI.getRegBank(DstReg, *MRI, TRI);
2706 const TargetRegisterClass *DstRC =
2707 TRI.getRegClassForSizeOnBank(DstSize, *DstBank);
2708
2709 Register UndefReg = MRI->createVirtualRegister(SrcRC);
2710 BuildMI(MBB, I, DL, TII.get(AMDGPU::IMPLICIT_DEF), UndefReg);
2711 BuildMI(MBB, I, DL, TII.get(AMDGPU::REG_SEQUENCE), DstReg)
2712 .addReg(SrcReg)
2713 .addImm(AMDGPU::sub0)
2714 .addReg(UndefReg)
2715 .addImm(AMDGPU::sub1);
2716 I.eraseFromParent();
2717
2718 return RBI.constrainGenericRegister(DstReg, *DstRC, *MRI) &&
2719 RBI.constrainGenericRegister(SrcReg, *SrcRC, *MRI);
2720 }
2721
2722 if (SrcBank->getID() == AMDGPU::VGPRRegBankID && DstSize <= 32) {
2723 // 64-bit should have been split up in RegBankSelect
2724
2725 // Try to use an and with a mask if it will save code size.
2726 unsigned Mask;
2727 if (!Signed && shouldUseAndMask(SrcSize, Mask)) {
2728 MachineInstr *ExtI =
2729 BuildMI(MBB, I, DL, TII.get(AMDGPU::V_AND_B32_e32), DstReg)
2730 .addImm(Mask)
2731 .addReg(SrcReg);
2732 I.eraseFromParent();
2733 constrainSelectedInstRegOperands(*ExtI, TII, TRI, RBI);
2734 return true;
2735 }
2736
2737 const unsigned BFE = Signed ? AMDGPU::V_BFE_I32_e64 : AMDGPU::V_BFE_U32_e64;
2738 MachineInstr *ExtI =
2739 BuildMI(MBB, I, DL, TII.get(BFE), DstReg)
2740 .addReg(SrcReg)
2741 .addImm(0) // Offset
2742 .addImm(SrcSize); // Width
2743 I.eraseFromParent();
2744 constrainSelectedInstRegOperands(*ExtI, TII, TRI, RBI);
2745 return true;
2746 }
2747
2748 if (SrcBank->getID() == AMDGPU::SGPRRegBankID && DstSize <= 64) {
2749 const TargetRegisterClass &SrcRC = InReg && DstSize > 32 ?
2750 AMDGPU::SReg_64RegClass : AMDGPU::SReg_32RegClass;
2751 if (!RBI.constrainGenericRegister(SrcReg, SrcRC, *MRI))
2752 return false;
2753
2754 if (Signed && DstSize == 32 && (SrcSize == 8 || SrcSize == 16)) {
2755 const unsigned SextOpc = SrcSize == 8 ?
2756 AMDGPU::S_SEXT_I32_I8 : AMDGPU::S_SEXT_I32_I16;
2757 BuildMI(MBB, I, DL, TII.get(SextOpc), DstReg)
2758 .addReg(SrcReg);
2759 I.eraseFromParent();
2760 return RBI.constrainGenericRegister(DstReg, AMDGPU::SReg_32RegClass, *MRI);
2761 }
2762
2763 // Using a single 32-bit SALU to calculate the high half is smaller than
2764 // S_BFE with a literal constant operand.
2765 if (DstSize > 32 && SrcSize == 32) {
2766 Register HiReg = MRI->createVirtualRegister(&AMDGPU::SReg_32RegClass);
2767 unsigned SubReg = InReg ? AMDGPU::sub0 : AMDGPU::NoSubRegister;
2768 if (Signed) {
2769 BuildMI(MBB, I, DL, TII.get(AMDGPU::S_ASHR_I32), HiReg)
2770 .addReg(SrcReg, {}, SubReg)
2771 .addImm(31)
2772 .setOperandDead(3); // Dead scc
2773 } else {
2774 BuildMI(MBB, I, DL, TII.get(AMDGPU::S_MOV_B32), HiReg)
2775 .addImm(0);
2776 }
2777 BuildMI(MBB, I, DL, TII.get(AMDGPU::REG_SEQUENCE), DstReg)
2778 .addReg(SrcReg, {}, SubReg)
2779 .addImm(AMDGPU::sub0)
2780 .addReg(HiReg)
2781 .addImm(AMDGPU::sub1);
2782 I.eraseFromParent();
2783 return RBI.constrainGenericRegister(DstReg, AMDGPU::SReg_64RegClass,
2784 *MRI);
2785 }
2786
2787 const unsigned BFE64 = Signed ? AMDGPU::S_BFE_I64 : AMDGPU::S_BFE_U64;
2788 const unsigned BFE32 = Signed ? AMDGPU::S_BFE_I32 : AMDGPU::S_BFE_U32;
2789
2790    // Scalar BFE is encoded as S1[5:0] = offset, S1[22:16] = width.
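    // For example, a sign-extension from 16 bits uses an immediate of
    // 16 << 16 = 0x100000: offset 0 in bits [5:0] and width 16 in bits [22:16].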
2791 if (DstSize > 32 && (SrcSize <= 32 || InReg)) {
2792 // We need a 64-bit register source, but the high bits don't matter.
2793 Register ExtReg = MRI->createVirtualRegister(&AMDGPU::SReg_64RegClass);
2794 Register UndefReg = MRI->createVirtualRegister(&AMDGPU::SReg_32RegClass);
2795 unsigned SubReg = InReg ? AMDGPU::sub0 : AMDGPU::NoSubRegister;
2796
2797 BuildMI(MBB, I, DL, TII.get(AMDGPU::IMPLICIT_DEF), UndefReg);
2798 BuildMI(MBB, I, DL, TII.get(AMDGPU::REG_SEQUENCE), ExtReg)
2799 .addReg(SrcReg, {}, SubReg)
2800 .addImm(AMDGPU::sub0)
2801 .addReg(UndefReg)
2802 .addImm(AMDGPU::sub1);
2803
2804 BuildMI(MBB, I, DL, TII.get(BFE64), DstReg)
2805 .addReg(ExtReg)
2806 .addImm(SrcSize << 16);
2807
2808 I.eraseFromParent();
2809 return RBI.constrainGenericRegister(DstReg, AMDGPU::SReg_64RegClass, *MRI);
2810 }
2811
2812 unsigned Mask;
2813 if (!Signed && shouldUseAndMask(SrcSize, Mask)) {
2814 BuildMI(MBB, I, DL, TII.get(AMDGPU::S_AND_B32), DstReg)
2815 .addReg(SrcReg)
2816 .addImm(Mask)
2817 .setOperandDead(3); // Dead scc
2818 } else {
2819 BuildMI(MBB, I, DL, TII.get(BFE32), DstReg)
2820 .addReg(SrcReg)
2821 .addImm(SrcSize << 16);
2822 }
2823
2824 I.eraseFromParent();
2825 return RBI.constrainGenericRegister(DstReg, AMDGPU::SReg_32RegClass, *MRI);
2826 }
2827
2828 return false;
2829}
2830
2830
2831static Register stripCopy(Register Reg, MachineRegisterInfo &MRI) {
2832  return getDefSrcRegIgnoringCopies(Reg, MRI)->Reg;
2833}
2834
2835static Register stripBitCast(Register Reg, MachineRegisterInfo &MRI) {
2836 Register BitcastSrc;
2837 if (mi_match(Reg, MRI, m_GBitcast(m_Reg(BitcastSrc))))
2838 Reg = BitcastSrc;
2839 return Reg;
2840}
2841
2842static bool isExtractHiElt(MachineRegisterInfo &MRI, Register In,
2843                           Register &Out) {
2844  // When unmerging a register that is composed of 2 x 16-bit values, allow the
2845  // use of an extract-hi instruction for the upper 16 bits. We only need to
2846  // check the size of `In` as all defs are guaranteed to be the same type for
2847  // GUnmerge.
2848 if (auto *Unmerge = dyn_cast<GUnmerge>(MRI.getVRegDef(In))) {
2849 if (Unmerge->getNumDefs() == 2 && Unmerge->getOperand(1).getReg() == In &&
2850 MRI.getType(In).getSizeInBits() == 16) {
2851 Out = Unmerge->getSourceReg();
2852 return true;
2853 }
2854 }
2855
2856 Register Trunc;
2857 if (!mi_match(In, MRI, m_GTrunc(m_Reg(Trunc))))
2858 return false;
2859
2860 Register LShlSrc;
2861 Register Cst;
2862 if (mi_match(Trunc, MRI, m_GLShr(m_Reg(LShlSrc), m_Reg(Cst)))) {
2863 Cst = stripCopy(Cst, MRI);
2864 if (mi_match(Cst, MRI, m_SpecificICst(16))) {
2865 Out = stripBitCast(LShlSrc, MRI);
2866 return true;
2867 }
2868 }
2869
2870 MachineInstr *Shuffle = MRI.getVRegDef(Trunc);
2871 if (Shuffle->getOpcode() != AMDGPU::G_SHUFFLE_VECTOR)
2872 return false;
2873
2874 assert(MRI.getType(Shuffle->getOperand(0).getReg()) ==
2875 LLT::fixed_vector(2, 16));
2876
2877 ArrayRef<int> Mask = Shuffle->getOperand(3).getShuffleMask();
2878 assert(Mask.size() == 2);
2879
2880 if (Mask[0] == 1 && Mask[1] <= 1) {
2881 Out = Shuffle->getOperand(0).getReg();
2882 return true;
2883 }
2884
2885 return false;
2886}
2887
2888bool AMDGPUInstructionSelector::selectG_FPEXT(MachineInstr &I) const {
2889 if (!Subtarget->hasSALUFloatInsts())
2890 return false;
2891
2892 Register Dst = I.getOperand(0).getReg();
2893 const RegisterBank *DstRB = RBI.getRegBank(Dst, *MRI, TRI);
2894 if (DstRB->getID() != AMDGPU::SGPRRegBankID)
2895 return false;
2896
2897 Register Src = I.getOperand(1).getReg();
2898
2899 if (MRI->getType(Dst) == LLT::scalar(32) &&
2900 MRI->getType(Src) == LLT::scalar(16)) {
2901 if (isExtractHiElt(*MRI, Src, Src)) {
2902 MachineBasicBlock *BB = I.getParent();
2903 BuildMI(*BB, &I, I.getDebugLoc(), TII.get(AMDGPU::S_CVT_HI_F32_F16), Dst)
2904 .addUse(Src);
2905 I.eraseFromParent();
2906 return RBI.constrainGenericRegister(Dst, AMDGPU::SReg_32RegClass, *MRI);
2907 }
2908 }
2909
2910 return false;
2911}
2912
2913bool AMDGPUInstructionSelector::selectG_FNEG(MachineInstr &MI) const {
2914 // Only manually handle the f64 SGPR case.
2915 //
2916 // FIXME: This is a workaround for 2.5 different tablegen problems. Because
2917 // the bit ops theoretically have a second result due to the implicit def of
2918 // SCC, the GlobalISelEmitter is overly conservative and rejects it. Fixing
2919 // that is easy by disabling the check. The result works, but uses a
2920 // nonsensical sreg32orlds_and_sreg_1 regclass.
2921 //
2922  // The DAG emitter is more problematic: it incorrectly adds both results of
2923  // the S_XOR_B32 to the variadic REG_SEQUENCE operands.
2924
2925 Register Dst = MI.getOperand(0).getReg();
2926 const RegisterBank *DstRB = RBI.getRegBank(Dst, *MRI, TRI);
2927 if (DstRB->getID() != AMDGPU::SGPRRegBankID ||
2928 MRI->getType(Dst) != LLT::scalar(64))
2929 return false;
2930
2931 Register Src = MI.getOperand(1).getReg();
2932 MachineInstr *Fabs = getOpcodeDef(TargetOpcode::G_FABS, Src, *MRI);
2933 if (Fabs)
2934 Src = Fabs->getOperand(1).getReg();
2935
2936 if (!RBI.constrainGenericRegister(Src, AMDGPU::SReg_64RegClass, *MRI) ||
2937 !RBI.constrainGenericRegister(Dst, AMDGPU::SReg_64RegClass, *MRI))
2938 return false;
2939
2940 MachineBasicBlock *BB = MI.getParent();
2941 const DebugLoc &DL = MI.getDebugLoc();
2942 Register LoReg = MRI->createVirtualRegister(&AMDGPU::SReg_32RegClass);
2943 Register HiReg = MRI->createVirtualRegister(&AMDGPU::SReg_32RegClass);
2944 Register ConstReg = MRI->createVirtualRegister(&AMDGPU::SReg_32RegClass);
2945 Register OpReg = MRI->createVirtualRegister(&AMDGPU::SReg_32RegClass);
2946
2947 BuildMI(*BB, &MI, DL, TII.get(AMDGPU::COPY), LoReg)
2948 .addReg(Src, {}, AMDGPU::sub0);
2949 BuildMI(*BB, &MI, DL, TII.get(AMDGPU::COPY), HiReg)
2950 .addReg(Src, {}, AMDGPU::sub1);
2951 BuildMI(*BB, &MI, DL, TII.get(AMDGPU::S_MOV_B32), ConstReg)
2952 .addImm(0x80000000);
2953
2954 // Set or toggle sign bit.
2955 unsigned Opc = Fabs ? AMDGPU::S_OR_B32 : AMDGPU::S_XOR_B32;
2956 BuildMI(*BB, &MI, DL, TII.get(Opc), OpReg)
2957 .addReg(HiReg)
2958 .addReg(ConstReg)
2959 .setOperandDead(3); // Dead scc
2960 BuildMI(*BB, &MI, DL, TII.get(AMDGPU::REG_SEQUENCE), Dst)
2961 .addReg(LoReg)
2962 .addImm(AMDGPU::sub0)
2963 .addReg(OpReg)
2964 .addImm(AMDGPU::sub1);
2965 MI.eraseFromParent();
2966 return true;
2967}
2968
2969// FIXME: This is a workaround for the same tablegen problems as G_FNEG
2970bool AMDGPUInstructionSelector::selectG_FABS(MachineInstr &MI) const {
2971 Register Dst = MI.getOperand(0).getReg();
2972 const RegisterBank *DstRB = RBI.getRegBank(Dst, *MRI, TRI);
2973 if (DstRB->getID() != AMDGPU::SGPRRegBankID ||
2974 MRI->getType(Dst) != LLT::scalar(64))
2975 return false;
2976
2977 Register Src = MI.getOperand(1).getReg();
2978 MachineBasicBlock *BB = MI.getParent();
2979 const DebugLoc &DL = MI.getDebugLoc();
2980 Register LoReg = MRI->createVirtualRegister(&AMDGPU::SReg_32RegClass);
2981 Register HiReg = MRI->createVirtualRegister(&AMDGPU::SReg_32RegClass);
2982 Register ConstReg = MRI->createVirtualRegister(&AMDGPU::SReg_32RegClass);
2983 Register OpReg = MRI->createVirtualRegister(&AMDGPU::SReg_32RegClass);
2984
2985 if (!RBI.constrainGenericRegister(Src, AMDGPU::SReg_64RegClass, *MRI) ||
2986 !RBI.constrainGenericRegister(Dst, AMDGPU::SReg_64RegClass, *MRI))
2987 return false;
2988
2989 BuildMI(*BB, &MI, DL, TII.get(AMDGPU::COPY), LoReg)
2990 .addReg(Src, {}, AMDGPU::sub0);
2991 BuildMI(*BB, &MI, DL, TII.get(AMDGPU::COPY), HiReg)
2992 .addReg(Src, {}, AMDGPU::sub1);
2993 BuildMI(*BB, &MI, DL, TII.get(AMDGPU::S_MOV_B32), ConstReg)
2994 .addImm(0x7fffffff);
2995
2996 // Clear sign bit.
2997  // TODO: Should this use S_BITSET0_*?
2998 BuildMI(*BB, &MI, DL, TII.get(AMDGPU::S_AND_B32), OpReg)
2999 .addReg(HiReg)
3000 .addReg(ConstReg)
3001 .setOperandDead(3); // Dead scc
3002 BuildMI(*BB, &MI, DL, TII.get(AMDGPU::REG_SEQUENCE), Dst)
3003 .addReg(LoReg)
3004 .addImm(AMDGPU::sub0)
3005 .addReg(OpReg)
3006 .addImm(AMDGPU::sub1);
3007
3008 MI.eraseFromParent();
3009 return true;
3010}
3011
3012static bool isConstant(const MachineInstr &MI) {
3013 return MI.getOpcode() == TargetOpcode::G_CONSTANT;
3014}
3015
3016void AMDGPUInstructionSelector::getAddrModeInfo(const MachineInstr &Load,
3017 const MachineRegisterInfo &MRI, SmallVectorImpl<GEPInfo> &AddrInfo) const {
3018
3019 unsigned OpNo = Load.getOpcode() == AMDGPU::G_PREFETCH ? 0 : 1;
3020 const MachineInstr *PtrMI =
3021 MRI.getUniqueVRegDef(Load.getOperand(OpNo).getReg());
3022
3023 assert(PtrMI);
3024
3025 if (PtrMI->getOpcode() != TargetOpcode::G_PTR_ADD)
3026 return;
3027
3028 GEPInfo GEPInfo;
3029
3030 for (unsigned i = 1; i != 3; ++i) {
3031 const MachineOperand &GEPOp = PtrMI->getOperand(i);
3032 const MachineInstr *OpDef = MRI.getUniqueVRegDef(GEPOp.getReg());
3033 assert(OpDef);
3034 if (i == 2 && isConstant(*OpDef)) {
3035 // TODO: Could handle constant base + variable offset, but a combine
3036 // probably should have commuted it.
3037 assert(GEPInfo.Imm == 0);
3038 GEPInfo.Imm = OpDef->getOperand(1).getCImm()->getSExtValue();
3039 continue;
3040 }
3041 const RegisterBank *OpBank = RBI.getRegBank(GEPOp.getReg(), MRI, TRI);
3042 if (OpBank->getID() == AMDGPU::SGPRRegBankID)
3043 GEPInfo.SgprParts.push_back(GEPOp.getReg());
3044 else
3045 GEPInfo.VgprParts.push_back(GEPOp.getReg());
3046 }
3047
3048 AddrInfo.push_back(GEPInfo);
3049 getAddrModeInfo(*PtrMI, MRI, AddrInfo);
3050}
3051
3052bool AMDGPUInstructionSelector::isSGPR(Register Reg) const {
3053 return RBI.getRegBank(Reg, *MRI, TRI)->getID() == AMDGPU::SGPRRegBankID;
3054}
3055
3056bool AMDGPUInstructionSelector::isInstrUniform(const MachineInstr &MI) const {
3057 if (!MI.hasOneMemOperand())
3058 return false;
3059
3060 const MachineMemOperand *MMO = *MI.memoperands_begin();
3061 const Value *Ptr = MMO->getValue();
3062
3063 // UndefValue means this is a load of a kernel input. These are uniform.
3064 // Sometimes LDS instructions have constant pointers.
3065 // If Ptr is null, then that means this mem operand contains a
3066 // PseudoSourceValue like GOT.
3067  if (!Ptr || isa<UndefValue, Argument, Constant, GlobalValue>(Ptr))
3068    return true;
3069
3070  if (MMO->getAddrSpace() == AMDGPUAS::CONSTANT_ADDRESS_32BIT)
3071    return true;
3072
3073 if (MI.getOpcode() == AMDGPU::G_PREFETCH)
3074 return RBI.getRegBank(MI.getOperand(0).getReg(), *MRI, TRI)->getID() ==
3075 AMDGPU::SGPRRegBankID;
3076
3077 const Instruction *I = dyn_cast<Instruction>(Ptr);
3078 return I && I->getMetadata("amdgpu.uniform");
3079}
3080
3081bool AMDGPUInstructionSelector::hasVgprParts(ArrayRef<GEPInfo> AddrInfo) const {
3082 for (const GEPInfo &GEPInfo : AddrInfo) {
3083 if (!GEPInfo.VgprParts.empty())
3084 return true;
3085 }
3086 return false;
3087}
3088
3089void AMDGPUInstructionSelector::initM0(MachineInstr &I) const {
3090 const LLT PtrTy = MRI->getType(I.getOperand(1).getReg());
3091 unsigned AS = PtrTy.getAddressSpace();
3092  if ((AS == AMDGPUAS::LOCAL_ADDRESS || AS == AMDGPUAS::REGION_ADDRESS) &&
3093      STI.ldsRequiresM0Init()) {
3094 MachineBasicBlock *BB = I.getParent();
3095
3096 // If DS instructions require M0 initialization, insert it before selecting.
3097 BuildMI(*BB, &I, I.getDebugLoc(), TII.get(AMDGPU::S_MOV_B32), AMDGPU::M0)
3098 .addImm(-1);
3099 }
3100}
3101
3102bool AMDGPUInstructionSelector::selectG_LOAD_STORE_ATOMICRMW(
3103 MachineInstr &I) const {
3104 initM0(I);
3105 return selectImpl(I, *CoverageInfo);
3106}
3107
3108static bool isVCmpResult(Register Reg, MachineRegisterInfo &MRI) {
3109  if (Reg.isPhysical())
3110 return false;
3111
3112  MachineInstr &MI = *MRI.getVRegDef(Reg);
3113  const unsigned Opcode = MI.getOpcode();
3114
3115 if (Opcode == AMDGPU::COPY)
3116 return isVCmpResult(MI.getOperand(1).getReg(), MRI);
3117
3118 if (Opcode == AMDGPU::G_AND || Opcode == AMDGPU::G_OR ||
3119 Opcode == AMDGPU::G_XOR)
3120 return isVCmpResult(MI.getOperand(1).getReg(), MRI) &&
3121 isVCmpResult(MI.getOperand(2).getReg(), MRI);
3122
3123 if (auto *GI = dyn_cast<GIntrinsic>(&MI))
3124 return GI->is(Intrinsic::amdgcn_class);
3125
3126 return Opcode == AMDGPU::G_ICMP || Opcode == AMDGPU::G_FCMP;
3127}
3128
3129bool AMDGPUInstructionSelector::selectG_BRCOND(MachineInstr &I) const {
3130 MachineBasicBlock *BB = I.getParent();
3131 MachineOperand &CondOp = I.getOperand(0);
3132 Register CondReg = CondOp.getReg();
3133 const DebugLoc &DL = I.getDebugLoc();
3134
3135 unsigned BrOpcode;
3136 Register CondPhysReg;
3137 const TargetRegisterClass *ConstrainRC;
3138
3139 // In SelectionDAG, we inspect the IR block for uniformity metadata to decide
3140 // whether the branch is uniform when selecting the instruction. In
3141 // GlobalISel, we should push that decision into RegBankSelect. Assume for now
3142 // RegBankSelect knows what it's doing if the branch condition is scc, even
3143 // though it currently does not.
3144 if (!isVCC(CondReg, *MRI)) {
3145 if (MRI->getType(CondReg) != LLT::scalar(32))
3146 return false;
3147
3148 CondPhysReg = AMDGPU::SCC;
3149 BrOpcode = AMDGPU::S_CBRANCH_SCC1;
3150 ConstrainRC = &AMDGPU::SReg_32RegClass;
3151 } else {
3152 // FIXME: Should scc->vcc copies and with exec?
3153
3154    // Unless the value of CondReg is the result of a V_CMP* instruction, we
3155    // need to insert an AND with exec.
3156 if (!isVCmpResult(CondReg, *MRI)) {
3157 const bool Is64 = STI.isWave64();
3158 const unsigned Opcode = Is64 ? AMDGPU::S_AND_B64 : AMDGPU::S_AND_B32;
3159 const Register Exec = Is64 ? AMDGPU::EXEC : AMDGPU::EXEC_LO;
3160
3161 Register TmpReg = MRI->createVirtualRegister(TRI.getBoolRC());
3162 BuildMI(*BB, &I, DL, TII.get(Opcode), TmpReg)
3163 .addReg(CondReg)
3164 .addReg(Exec)
3165 .setOperandDead(3); // Dead scc
3166 CondReg = TmpReg;
3167 }
3168
3169 CondPhysReg = TRI.getVCC();
3170 BrOpcode = AMDGPU::S_CBRANCH_VCCNZ;
3171 ConstrainRC = TRI.getBoolRC();
3172 }
3173
3174 if (!MRI->getRegClassOrNull(CondReg))
3175 MRI->setRegClass(CondReg, ConstrainRC);
3176
3177 BuildMI(*BB, &I, DL, TII.get(AMDGPU::COPY), CondPhysReg)
3178 .addReg(CondReg);
3179 BuildMI(*BB, &I, DL, TII.get(BrOpcode))
3180 .addMBB(I.getOperand(1).getMBB());
3181
3182 I.eraseFromParent();
3183 return true;
3184}
3185
3186bool AMDGPUInstructionSelector::selectG_GLOBAL_VALUE(
3187 MachineInstr &I) const {
3188 Register DstReg = I.getOperand(0).getReg();
3189 const RegisterBank *DstRB = RBI.getRegBank(DstReg, *MRI, TRI);
3190 const bool IsVGPR = DstRB->getID() == AMDGPU::VGPRRegBankID;
3191 I.setDesc(TII.get(IsVGPR ? AMDGPU::V_MOV_B32_e32 : AMDGPU::S_MOV_B32));
3192 if (IsVGPR)
3193 I.addOperand(*MF, MachineOperand::CreateReg(AMDGPU::EXEC, false, true));
3194
3195 return RBI.constrainGenericRegister(
3196 DstReg, IsVGPR ? AMDGPU::VGPR_32RegClass : AMDGPU::SReg_32RegClass, *MRI);
3197}
3198
3199bool AMDGPUInstructionSelector::selectG_PTRMASK(MachineInstr &I) const {
3200 Register DstReg = I.getOperand(0).getReg();
3201 Register SrcReg = I.getOperand(1).getReg();
3202 Register MaskReg = I.getOperand(2).getReg();
3203 LLT Ty = MRI->getType(DstReg);
3204 LLT MaskTy = MRI->getType(MaskReg);
3205 MachineBasicBlock *BB = I.getParent();
3206 const DebugLoc &DL = I.getDebugLoc();
3207
3208 const RegisterBank *DstRB = RBI.getRegBank(DstReg, *MRI, TRI);
3209 const RegisterBank *SrcRB = RBI.getRegBank(SrcReg, *MRI, TRI);
3210 const RegisterBank *MaskRB = RBI.getRegBank(MaskReg, *MRI, TRI);
3211 const bool IsVGPR = DstRB->getID() == AMDGPU::VGPRRegBankID;
3212 if (DstRB != SrcRB) // Should only happen for hand written MIR.
3213 return false;
3214
3215 // Try to avoid emitting a bit operation when we only need to touch half of
3216 // the 64-bit pointer.
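 // For example, a 64-bit pointer masked with 0xFFFFFFFFFFFFF000 has all ones
 // in the high 32 bits, so CanCopyHi32 below is true and the high half is
 // simply copied; only the low half needs an AND.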
3217 APInt MaskOnes = VT->getKnownOnes(MaskReg).zext(64);
3218 const APInt MaskHi32 = APInt::getHighBitsSet(64, 32);
3219 const APInt MaskLo32 = APInt::getLowBitsSet(64, 32);
3220
3221 const bool CanCopyLow32 = (MaskOnes & MaskLo32) == MaskLo32;
3222 const bool CanCopyHi32 = (MaskOnes & MaskHi32) == MaskHi32;
3223
3224 if (!IsVGPR && Ty.getSizeInBits() == 64 &&
3225 !CanCopyLow32 && !CanCopyHi32) {
3226 auto MIB = BuildMI(*BB, &I, DL, TII.get(AMDGPU::S_AND_B64), DstReg)
3227 .addReg(SrcReg)
3228 .addReg(MaskReg)
3229 .setOperandDead(3); // Dead scc
3230 I.eraseFromParent();
3231 constrainSelectedInstRegOperands(*MIB, TII, TRI, RBI);
3232 return true;
3233 }
3234
3235 unsigned NewOpc = IsVGPR ? AMDGPU::V_AND_B32_e64 : AMDGPU::S_AND_B32;
3236 const TargetRegisterClass &RegRC
3237 = IsVGPR ? AMDGPU::VGPR_32RegClass : AMDGPU::SReg_32RegClass;
3238
3239 const TargetRegisterClass *DstRC = TRI.getRegClassForTypeOnBank(Ty, *DstRB);
3240 const TargetRegisterClass *SrcRC = TRI.getRegClassForTypeOnBank(Ty, *SrcRB);
3241 const TargetRegisterClass *MaskRC =
3242 TRI.getRegClassForTypeOnBank(MaskTy, *MaskRB);
3243
3244 if (!RBI.constrainGenericRegister(DstReg, *DstRC, *MRI) ||
3245 !RBI.constrainGenericRegister(SrcReg, *SrcRC, *MRI) ||
3246 !RBI.constrainGenericRegister(MaskReg, *MaskRC, *MRI))
3247 return false;
3248
3249 if (Ty.getSizeInBits() == 32) {
3250 assert(MaskTy.getSizeInBits() == 32 &&
3251 "ptrmask should have been narrowed during legalize");
3252
3253 auto NewOp = BuildMI(*BB, &I, DL, TII.get(NewOpc), DstReg)
3254 .addReg(SrcReg)
3255 .addReg(MaskReg);
3256
3257 if (!IsVGPR)
3258 NewOp.setOperandDead(3); // Dead scc
3259 I.eraseFromParent();
3260 return true;
3261 }
3262
3263 Register HiReg = MRI->createVirtualRegister(&RegRC);
3264 Register LoReg = MRI->createVirtualRegister(&RegRC);
3265
3266 // Extract the subregisters from the source pointer.
3267 BuildMI(*BB, &I, DL, TII.get(AMDGPU::COPY), LoReg)
3268 .addReg(SrcReg, {}, AMDGPU::sub0);
3269 BuildMI(*BB, &I, DL, TII.get(AMDGPU::COPY), HiReg)
3270 .addReg(SrcReg, {}, AMDGPU::sub1);
3271
3272 Register MaskedLo, MaskedHi;
3273
3274 if (CanCopyLow32) {
3275 // If all the bits in the low half are 1, we only need a copy for it.
3276 MaskedLo = LoReg;
3277 } else {
3278 // Extract the mask subregister and apply the and.
3279 Register MaskLo = MRI->createVirtualRegister(&RegRC);
3280 MaskedLo = MRI->createVirtualRegister(&RegRC);
3281
3282 BuildMI(*BB, &I, DL, TII.get(AMDGPU::COPY), MaskLo)
3283 .addReg(MaskReg, {}, AMDGPU::sub0);
3284 BuildMI(*BB, &I, DL, TII.get(NewOpc), MaskedLo)
3285 .addReg(LoReg)
3286 .addReg(MaskLo);
3287 }
3288
3289 if (CanCopyHi32) {
3290 // If all the bits in the high half are 1, we only need a copy for it.
3291 MaskedHi = HiReg;
3292 } else {
3293 Register MaskHi = MRI->createVirtualRegister(&RegRC);
3294 MaskedHi = MRI->createVirtualRegister(&RegRC);
3295
3296 BuildMI(*BB, &I, DL, TII.get(AMDGPU::COPY), MaskHi)
3297 .addReg(MaskReg, {}, AMDGPU::sub1);
3298 BuildMI(*BB, &I, DL, TII.get(NewOpc), MaskedHi)
3299 .addReg(HiReg)
3300 .addReg(MaskHi);
3301 }
3302
3303 BuildMI(*BB, &I, DL, TII.get(AMDGPU::REG_SEQUENCE), DstReg)
3304 .addReg(MaskedLo)
3305 .addImm(AMDGPU::sub0)
3306 .addReg(MaskedHi)
3307 .addImm(AMDGPU::sub1);
3308 I.eraseFromParent();
3309 return true;
3310}
3311
3312/// Return the register to use for the index value, and the subregister to use
3313/// for the indirectly accessed register.
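/// For example, indexing 32-bit elements (EltSize = 4) of a 128-bit register
/// with an index defined as (%base + 2) yields the pair (%base, sub2): the
/// constant part of the index is folded into the subregister and only the
/// variable part remains in the returned index register.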
3314 static std::pair<Register, unsigned>
3315 computeIndirectRegIndex(MachineRegisterInfo &MRI, const SIRegisterInfo &TRI,
3316 const TargetRegisterClass *SuperRC, Register IdxReg,
3317 unsigned EltSize, GISelValueTracking &ValueTracking) {
3318 Register IdxBaseReg;
3319 int Offset;
3320
3321 std::tie(IdxBaseReg, Offset) =
3322 AMDGPU::getBaseWithConstantOffset(MRI, IdxReg, &ValueTracking);
3323 if (IdxBaseReg == AMDGPU::NoRegister) {
3324 // This will happen if the index is a known constant. This should ordinarily
3325 // be legalized out, but handle it as a register just in case.
3326 assert(Offset == 0);
3327 IdxBaseReg = IdxReg;
3328 }
3329
3330 ArrayRef<int16_t> SubRegs = TRI.getRegSplitParts(SuperRC, EltSize);
3331
3332 // Skip out of bounds offsets, or else we would end up using an undefined
3333 // register.
3334 if (static_cast<unsigned>(Offset) >= SubRegs.size())
3335 return std::pair(IdxReg, SubRegs[0]);
3336 return std::pair(IdxBaseReg, SubRegs[Offset]);
3337}
3338
3339bool AMDGPUInstructionSelector::selectG_EXTRACT_VECTOR_ELT(
3340 MachineInstr &MI) const {
3341 Register DstReg = MI.getOperand(0).getReg();
3342 Register SrcReg = MI.getOperand(1).getReg();
3343 Register IdxReg = MI.getOperand(2).getReg();
3344
3345 LLT DstTy = MRI->getType(DstReg);
3346 LLT SrcTy = MRI->getType(SrcReg);
3347
3348 const RegisterBank *DstRB = RBI.getRegBank(DstReg, *MRI, TRI);
3349 const RegisterBank *SrcRB = RBI.getRegBank(SrcReg, *MRI, TRI);
3350 const RegisterBank *IdxRB = RBI.getRegBank(IdxReg, *MRI, TRI);
3351
3352 // The index must be scalar. If it wasn't, RegBankSelect should have moved this
3353 // into a waterfall loop.
3354 if (IdxRB->getID() != AMDGPU::SGPRRegBankID)
3355 return false;
3356
3357 const TargetRegisterClass *SrcRC =
3358 TRI.getRegClassForTypeOnBank(SrcTy, *SrcRB);
3359 const TargetRegisterClass *DstRC =
3360 TRI.getRegClassForTypeOnBank(DstTy, *DstRB);
3361 if (!SrcRC || !DstRC)
3362 return false;
3363 if (!RBI.constrainGenericRegister(SrcReg, *SrcRC, *MRI) ||
3364 !RBI.constrainGenericRegister(DstReg, *DstRC, *MRI) ||
3365 !RBI.constrainGenericRegister(IdxReg, AMDGPU::SReg_32RegClass, *MRI))
3366 return false;
3367
3368 MachineBasicBlock *BB = MI.getParent();
3369 const DebugLoc &DL = MI.getDebugLoc();
3370 const bool Is64 = DstTy.getSizeInBits() == 64;
3371
3372 unsigned SubReg;
3373 std::tie(IdxReg, SubReg) = computeIndirectRegIndex(
3374 *MRI, TRI, SrcRC, IdxReg, DstTy.getSizeInBits() / 8, *VT);
3375
3376 if (SrcRB->getID() == AMDGPU::SGPRRegBankID) {
3377 if (DstTy.getSizeInBits() != 32 && !Is64)
3378 return false;
3379
3380 BuildMI(*BB, &MI, DL, TII.get(AMDGPU::COPY), AMDGPU::M0)
3381 .addReg(IdxReg);
3382
3383 unsigned Opc = Is64 ? AMDGPU::S_MOVRELS_B64 : AMDGPU::S_MOVRELS_B32;
3384 BuildMI(*BB, &MI, DL, TII.get(Opc), DstReg)
3385 .addReg(SrcReg, {}, SubReg)
3386 .addReg(SrcReg, RegState::Implicit);
3387 MI.eraseFromParent();
3388 return true;
3389 }
3390
3391 if (SrcRB->getID() != AMDGPU::VGPRRegBankID || DstTy.getSizeInBits() != 32)
3392 return false;
3393
3394 if (!STI.useVGPRIndexMode()) {
3395 BuildMI(*BB, &MI, DL, TII.get(AMDGPU::COPY), AMDGPU::M0)
3396 .addReg(IdxReg);
3397 BuildMI(*BB, &MI, DL, TII.get(AMDGPU::V_MOVRELS_B32_e32), DstReg)
3398 .addReg(SrcReg, {}, SubReg)
3399 .addReg(SrcReg, RegState::Implicit);
3400 MI.eraseFromParent();
3401 return true;
3402 }
3403
3404 const MCInstrDesc &GPRIDXDesc =
3405 TII.getIndirectGPRIDXPseudo(TRI.getRegSizeInBits(*SrcRC), true);
3406 BuildMI(*BB, MI, DL, GPRIDXDesc, DstReg)
3407 .addReg(SrcReg)
3408 .addReg(IdxReg)
3409 .addImm(SubReg);
3410
3411 MI.eraseFromParent();
3412 return true;
3413}
3414
3415// TODO: Fold insert_vector_elt (extract_vector_elt) into movrelsd
3416bool AMDGPUInstructionSelector::selectG_INSERT_VECTOR_ELT(
3417 MachineInstr &MI) const {
3418 Register DstReg = MI.getOperand(0).getReg();
3419 Register VecReg = MI.getOperand(1).getReg();
3420 Register ValReg = MI.getOperand(2).getReg();
3421 Register IdxReg = MI.getOperand(3).getReg();
3422
3423 LLT VecTy = MRI->getType(DstReg);
3424 LLT ValTy = MRI->getType(ValReg);
3425 unsigned VecSize = VecTy.getSizeInBits();
3426 unsigned ValSize = ValTy.getSizeInBits();
3427
3428 const RegisterBank *VecRB = RBI.getRegBank(VecReg, *MRI, TRI);
3429 const RegisterBank *ValRB = RBI.getRegBank(ValReg, *MRI, TRI);
3430 const RegisterBank *IdxRB = RBI.getRegBank(IdxReg, *MRI, TRI);
3431
3432 assert(VecTy.getElementType() == ValTy);
3433
3434 // The index must be scalar. If it wasn't, RegBankSelect should have moved this
3435 // into a waterfall loop.
3436 if (IdxRB->getID() != AMDGPU::SGPRRegBankID)
3437 return false;
3438
3439 const TargetRegisterClass *VecRC =
3440 TRI.getRegClassForTypeOnBank(VecTy, *VecRB);
3441 const TargetRegisterClass *ValRC =
3442 TRI.getRegClassForTypeOnBank(ValTy, *ValRB);
3443
3444 if (!RBI.constrainGenericRegister(VecReg, *VecRC, *MRI) ||
3445 !RBI.constrainGenericRegister(DstReg, *VecRC, *MRI) ||
3446 !RBI.constrainGenericRegister(ValReg, *ValRC, *MRI) ||
3447 !RBI.constrainGenericRegister(IdxReg, AMDGPU::SReg_32RegClass, *MRI))
3448 return false;
3449
3450 if (VecRB->getID() == AMDGPU::VGPRRegBankID && ValSize != 32)
3451 return false;
3452
3453 unsigned SubReg;
3454 std::tie(IdxReg, SubReg) =
3455 computeIndirectRegIndex(*MRI, TRI, VecRC, IdxReg, ValSize / 8, *VT);
3456
3457 const bool IndexMode = VecRB->getID() == AMDGPU::VGPRRegBankID &&
3458 STI.useVGPRIndexMode();
3459
3460 MachineBasicBlock *BB = MI.getParent();
3461 const DebugLoc &DL = MI.getDebugLoc();
3462
3463 if (!IndexMode) {
3464 BuildMI(*BB, &MI, DL, TII.get(AMDGPU::COPY), AMDGPU::M0)
3465 .addReg(IdxReg);
3466
3467 const MCInstrDesc &RegWriteOp = TII.getIndirectRegWriteMovRelPseudo(
3468 VecSize, ValSize, VecRB->getID() == AMDGPU::SGPRRegBankID);
3469 BuildMI(*BB, MI, DL, RegWriteOp, DstReg)
3470 .addReg(VecReg)
3471 .addReg(ValReg)
3472 .addImm(SubReg);
3473 MI.eraseFromParent();
3474 return true;
3475 }
3476
3477 const MCInstrDesc &GPRIDXDesc =
3478 TII.getIndirectGPRIDXPseudo(TRI.getRegSizeInBits(*VecRC), false);
3479 BuildMI(*BB, MI, DL, GPRIDXDesc, DstReg)
3480 .addReg(VecReg)
3481 .addReg(ValReg)
3482 .addReg(IdxReg)
3483 .addImm(SubReg);
3484
3485 MI.eraseFromParent();
3486 return true;
3487}
3488
3489static bool isAsyncLDSDMA(Intrinsic::ID Intr) {
3490 switch (Intr) {
3491 case Intrinsic::amdgcn_raw_buffer_load_async_lds:
3492 case Intrinsic::amdgcn_raw_ptr_buffer_load_async_lds:
3493 case Intrinsic::amdgcn_struct_buffer_load_async_lds:
3494 case Intrinsic::amdgcn_struct_ptr_buffer_load_async_lds:
3495 case Intrinsic::amdgcn_load_async_to_lds:
3496 case Intrinsic::amdgcn_global_load_async_lds:
3497 return true;
3498 }
3499 return false;
3500}
3501
3502bool AMDGPUInstructionSelector::selectBufferLoadLds(MachineInstr &MI) const {
3503 if (!Subtarget->hasVMemToLDSLoad())
3504 return false;
3505 unsigned Opc;
3506 unsigned Size = MI.getOperand(3).getImm();
3507 Intrinsic::ID IntrinsicID = cast<GIntrinsic>(MI).getIntrinsicID();
3508
3509 // The struct intrinsic variants add one additional operand over raw.
3510 const bool HasVIndex = MI.getNumOperands() == 9;
3511 Register VIndex;
3512 int OpOffset = 0;
3513 if (HasVIndex) {
3514 VIndex = MI.getOperand(4).getReg();
3515 OpOffset = 1;
3516 }
3517
3518 Register VOffset = MI.getOperand(4 + OpOffset).getReg();
3519 std::optional<ValueAndVReg> MaybeVOffset =
3520 getIConstantVRegValWithLookThrough(VOffset, *MRI);
3521 const bool HasVOffset = !MaybeVOffset || MaybeVOffset->Value.getZExtValue();
3522
3523 switch (Size) {
3524 default:
3525 return false;
3526 case 1:
3527 Opc = HasVIndex ? HasVOffset ? AMDGPU::BUFFER_LOAD_UBYTE_LDS_BOTHEN
3528 : AMDGPU::BUFFER_LOAD_UBYTE_LDS_IDXEN
3529 : HasVOffset ? AMDGPU::BUFFER_LOAD_UBYTE_LDS_OFFEN
3530 : AMDGPU::BUFFER_LOAD_UBYTE_LDS_OFFSET;
3531 break;
3532 case 2:
3533 Opc = HasVIndex ? HasVOffset ? AMDGPU::BUFFER_LOAD_USHORT_LDS_BOTHEN
3534 : AMDGPU::BUFFER_LOAD_USHORT_LDS_IDXEN
3535 : HasVOffset ? AMDGPU::BUFFER_LOAD_USHORT_LDS_OFFEN
3536 : AMDGPU::BUFFER_LOAD_USHORT_LDS_OFFSET;
3537 break;
3538 case 4:
3539 Opc = HasVIndex ? HasVOffset ? AMDGPU::BUFFER_LOAD_DWORD_LDS_BOTHEN
3540 : AMDGPU::BUFFER_LOAD_DWORD_LDS_IDXEN
3541 : HasVOffset ? AMDGPU::BUFFER_LOAD_DWORD_LDS_OFFEN
3542 : AMDGPU::BUFFER_LOAD_DWORD_LDS_OFFSET;
3543 break;
3544 case 12:
3545 if (!Subtarget->hasLDSLoadB96_B128())
3546 return false;
3547
3548 Opc = HasVIndex ? HasVOffset ? AMDGPU::BUFFER_LOAD_DWORDX3_LDS_BOTHEN
3549 : AMDGPU::BUFFER_LOAD_DWORDX3_LDS_IDXEN
3550 : HasVOffset ? AMDGPU::BUFFER_LOAD_DWORDX3_LDS_OFFEN
3551 : AMDGPU::BUFFER_LOAD_DWORDX3_LDS_OFFSET;
3552 break;
3553 case 16:
3554 if (!Subtarget->hasLDSLoadB96_B128())
3555 return false;
3556
3557 Opc = HasVIndex ? HasVOffset ? AMDGPU::BUFFER_LOAD_DWORDX4_LDS_BOTHEN
3558 : AMDGPU::BUFFER_LOAD_DWORDX4_LDS_IDXEN
3559 : HasVOffset ? AMDGPU::BUFFER_LOAD_DWORDX4_LDS_OFFEN
3560 : AMDGPU::BUFFER_LOAD_DWORDX4_LDS_OFFSET;
3561 break;
3562 }
3563
3564 MachineBasicBlock *MBB = MI.getParent();
3565 const DebugLoc &DL = MI.getDebugLoc();
3566 BuildMI(*MBB, &MI, DL, TII.get(AMDGPU::COPY), AMDGPU::M0)
3567 .add(MI.getOperand(2));
3568
3569 auto MIB = BuildMI(*MBB, &MI, DL, TII.get(Opc));
3570
3571 if (HasVIndex && HasVOffset) {
3572 Register IdxReg = MRI->createVirtualRegister(TRI.getVGPR64Class());
3573 BuildMI(*MBB, &*MIB, DL, TII.get(AMDGPU::REG_SEQUENCE), IdxReg)
3574 .addReg(VIndex)
3575 .addImm(AMDGPU::sub0)
3576 .addReg(VOffset)
3577 .addImm(AMDGPU::sub1);
3578
3579 MIB.addReg(IdxReg);
3580 } else if (HasVIndex) {
3581 MIB.addReg(VIndex);
3582 } else if (HasVOffset) {
3583 MIB.addReg(VOffset);
3584 }
3585
3586 MIB.add(MI.getOperand(1)); // rsrc
3587 MIB.add(MI.getOperand(5 + OpOffset)); // soffset
3588 MIB.add(MI.getOperand(6 + OpOffset)); // imm offset
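 // The aux operand packs the cache-policy bits together with the swizzle bit;
 // split it into the separate cpol and swz immediates of the selected
 // instruction.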
3589 bool IsGFX12Plus = AMDGPU::isGFX12Plus(STI);
3590 unsigned Aux = MI.getOperand(7 + OpOffset).getImm();
3591 MIB.addImm(Aux & (IsGFX12Plus ? AMDGPU::CPol::ALL
3592 : AMDGPU::CPol::ALL_pregfx12)); // cpol
3593 MIB.addImm(
3594 Aux & (IsGFX12Plus ? AMDGPU::CPol::SWZ : AMDGPU::CPol::SWZ_pregfx12)
3595 ? 1
3596 : 0); // swz
3597 MIB.addImm(isAsyncLDSDMA(IntrinsicID));
3598
3599 MachineMemOperand *LoadMMO = *MI.memoperands_begin();
3600 // Don't set the offset value here because the pointer points to the base of
3601 // the buffer.
3602 MachinePointerInfo LoadPtrI = LoadMMO->getPointerInfo();
3603
3604 MachinePointerInfo StorePtrI = LoadPtrI;
3605 LoadPtrI.V = PoisonValue::get(PointerType::get(MF->getFunction().getContext(),
3606 AMDGPUAS::BUFFER_RESOURCE));
3607 StorePtrI.V = nullptr;
3608 StorePtrI.AddrSpace = AMDGPUAS::LOCAL_ADDRESS;
3609
3610 auto F = LoadMMO->getFlags() &
3611 ~(MachineMemOperand::MOStore | MachineMemOperand::MOLoad);
3612 LoadMMO = MF->getMachineMemOperand(LoadPtrI, F | MachineMemOperand::MOLoad,
3613 Size, LoadMMO->getBaseAlign());
3614
3615 MachineMemOperand *StoreMMO =
3616 MF->getMachineMemOperand(StorePtrI, F | MachineMemOperand::MOStore,
3617 sizeof(int32_t), LoadMMO->getBaseAlign());
3618
3619 MIB.setMemRefs({LoadMMO, StoreMMO});
3620
3621 MI.eraseFromParent();
3622 constrainSelectedInstRegOperands(*MIB, TII, TRI, RBI);
3623 return true;
3624}
3625
3626/// Match a zero extend from a 32-bit value to 64-bits.
3627Register AMDGPUInstructionSelector::matchZeroExtendFromS32(Register Reg) const {
3628 Register ZExtSrc;
3629 if (mi_match(Reg, *MRI, m_GZExt(m_Reg(ZExtSrc))))
3630 return MRI->getType(ZExtSrc) == LLT::scalar(32) ? ZExtSrc : Register();
3631
3632 // Match legalized form %zext = G_MERGE_VALUES (s32 %x), (s32 0)
3633 const MachineInstr *Def = getDefIgnoringCopies(Reg, *MRI);
3634 if (Def->getOpcode() != AMDGPU::G_MERGE_VALUES)
3635 return Register();
3636
3637 assert(Def->getNumOperands() == 3 &&
3638 MRI->getType(Def->getOperand(0).getReg()) == LLT::scalar(64));
3639 if (mi_match(Def->getOperand(2).getReg(), *MRI, m_ZeroInt())) {
3640 return Def->getOperand(1).getReg();
3641 }
3642
3643 return Register();
3644}
3645
3646/// Match a sign extend from a 32-bit value to 64-bits.
3647Register AMDGPUInstructionSelector::matchSignExtendFromS32(Register Reg) const {
3648 Register SExtSrc;
3649 if (mi_match(Reg, *MRI, m_GSExt(m_Reg(SExtSrc))))
3650 return MRI->getType(SExtSrc) == LLT::scalar(32) ? SExtSrc : Register();
3651
3652 // Match legalized form %sext = G_MERGE_VALUES (s32 %x), (G_ASHR %x, 31)
3653 const MachineInstr *Def = getDefIgnoringCopies(Reg, *MRI);
3654 if (Def->getOpcode() != AMDGPU::G_MERGE_VALUES)
3655 return Register();
3656
3657 assert(Def->getNumOperands() == 3 &&
3658 MRI->getType(Def->getOperand(0).getReg()) == LLT::scalar(64));
3659 if (mi_match(Def->getOperand(2).getReg(), *MRI,
3660 m_GAShr(m_SpecificReg(Def->getOperand(1).getReg()),
3661 m_SpecificICst(31))))
3662 return Def->getOperand(1).getReg();
3663
3664 if (VT->signBitIsZero(Reg))
3665 return matchZeroExtendFromS32(Reg);
3666
3667 return Register();
3668}
3669
3670/// Match a zero extend from a 32-bit value to 64-bits, or \p Reg itself if it
3671/// is 32-bit.
3672Register
3673AMDGPUInstructionSelector::matchZeroExtendFromS32OrS32(Register Reg) const {
3674 return MRI->getType(Reg) == LLT::scalar(32) ? Reg
3675 : matchZeroExtendFromS32(Reg);
3676}
3677
3678/// Match a sign extend from a 32-bit value to 64-bits, or \p Reg itself if it
3679/// is 32-bit.
3680Register
3681AMDGPUInstructionSelector::matchSignExtendFromS32OrS32(Register Reg) const {
3682 return MRI->getType(Reg) == LLT::scalar(32) ? Reg
3683 : matchSignExtendFromS32(Reg);
3684}
3685
3686Register
3687AMDGPUInstructionSelector::matchExtendFromS32OrS32(Register Reg,
3688 bool IsSigned) const {
3689 if (IsSigned)
3690 return matchSignExtendFromS32OrS32(Reg);
3691
3692 return matchZeroExtendFromS32OrS32(Reg);
3693}
3694
3695Register AMDGPUInstructionSelector::matchAnyExtendFromS32(Register Reg) const {
3696 Register AnyExtSrc;
3697 if (mi_match(Reg, *MRI, m_GAnyExt(m_Reg(AnyExtSrc))))
3698 return MRI->getType(AnyExtSrc) == LLT::scalar(32) ? AnyExtSrc : Register();
3699
3700 // Match legalized form %anyext = G_MERGE_VALUES (s32 %x), (s32 G_IMPLICIT_DEF)
3701 const MachineInstr *Def = getDefIgnoringCopies(Reg, *MRI);
3702 if (Def->getOpcode() != AMDGPU::G_MERGE_VALUES)
3703 return Register();
3704
3705 assert(Def->getNumOperands() == 3 &&
3706 MRI->getType(Def->getOperand(0).getReg()) == LLT::scalar(64));
3707
3708 if (mi_match(Def->getOperand(2).getReg(), *MRI, m_GImplicitDef()))
3709 return Def->getOperand(1).getReg();
3710
3711 return Register();
3712}
3713
3714bool AMDGPUInstructionSelector::selectGlobalLoadLds(MachineInstr &MI) const {
3715 if (!Subtarget->hasVMemToLDSLoad())
3716 return false;
3717
3718 unsigned Opc;
3719 unsigned Size = MI.getOperand(3).getImm();
3720 Intrinsic::ID IntrinsicID = cast<GIntrinsic>(MI).getIntrinsicID();
3721
3722 switch (Size) {
3723 default:
3724 return false;
3725 case 1:
3726 Opc = AMDGPU::GLOBAL_LOAD_LDS_UBYTE;
3727 break;
3728 case 2:
3729 Opc = AMDGPU::GLOBAL_LOAD_LDS_USHORT;
3730 break;
3731 case 4:
3732 Opc = AMDGPU::GLOBAL_LOAD_LDS_DWORD;
3733 break;
3734 case 12:
3735 if (!Subtarget->hasLDSLoadB96_B128())
3736 return false;
3737 Opc = AMDGPU::GLOBAL_LOAD_LDS_DWORDX3;
3738 break;
3739 case 16:
3740 if (!Subtarget->hasLDSLoadB96_B128())
3741 return false;
3742 Opc = AMDGPU::GLOBAL_LOAD_LDS_DWORDX4;
3743 break;
3744 }
3745
3746 MachineBasicBlock *MBB = MI.getParent();
3747 const DebugLoc &DL = MI.getDebugLoc();
3748 BuildMI(*MBB, &MI, DL, TII.get(AMDGPU::COPY), AMDGPU::M0)
3749 .add(MI.getOperand(2));
3750
3751 Register Addr = MI.getOperand(1).getReg();
3752 Register VOffset;
3753 // Try to split SAddr and VOffset. Global and LDS pointers share the same
3754 // immediate offset, so we cannot use a regular SelectGlobalSAddr().
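 // For example, an address formed as %addr = G_PTR_ADD %sgpr_base,
 // (G_ZEXT %vgpr_off) is split below so that Addr becomes the SGPR base and
 // VOffset the 32-bit VGPR component.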
3755 if (!isSGPR(Addr)) {
3756 auto AddrDef = getDefSrcRegIgnoringCopies(Addr, *MRI);
3757 if (isSGPR(AddrDef->Reg)) {
3758 Addr = AddrDef->Reg;
3759 } else if (AddrDef->MI->getOpcode() == AMDGPU::G_PTR_ADD) {
3760 Register SAddr =
3761 getSrcRegIgnoringCopies(AddrDef->MI->getOperand(1).getReg(), *MRI);
3762 if (isSGPR(SAddr)) {
3763 Register PtrBaseOffset = AddrDef->MI->getOperand(2).getReg();
3764 if (Register Off = matchZeroExtendFromS32(PtrBaseOffset)) {
3765 Addr = SAddr;
3766 VOffset = Off;
3767 }
3768 }
3769 }
3770 }
3771
3772 if (isSGPR(Addr)) {
3773 Opc = AMDGPU::getGlobalSaddrOp(Opc);
3774 if (!VOffset) {
3775 VOffset = MRI->createVirtualRegister(&AMDGPU::VGPR_32RegClass);
3776 BuildMI(*MBB, &MI, DL, TII.get(AMDGPU::V_MOV_B32_e32), VOffset)
3777 .addImm(0);
3778 }
3779 }
3780
3781 auto MIB = BuildMI(*MBB, &MI, DL, TII.get(Opc))
3782 .addReg(Addr);
3783
3784 if (isSGPR(Addr))
3785 MIB.addReg(VOffset);
3786
3787 MIB.add(MI.getOperand(4)); // offset
3788
3789 unsigned Aux = MI.getOperand(5).getImm();
3790 MIB.addImm(Aux & ~AMDGPU::CPol::VIRTUAL_BITS); // cpol
3791 MIB.addImm(isAsyncLDSDMA(IntrinsicID));
3792
3793 MachineMemOperand *LoadMMO = *MI.memoperands_begin();
3794 MachinePointerInfo LoadPtrI = LoadMMO->getPointerInfo();
3795 LoadPtrI.Offset = MI.getOperand(4).getImm();
3796 MachinePointerInfo StorePtrI = LoadPtrI;
3797 LoadPtrI.V = PoisonValue::get(PointerType::get(MF->getFunction().getContext(),
3798 AMDGPUAS::GLOBAL_ADDRESS));
3799 StorePtrI.V = nullptr;
3800 StorePtrI.AddrSpace = AMDGPUAS::LOCAL_ADDRESS;
3801 auto F = LoadMMO->getFlags() &
3802 ~(MachineMemOperand::MOStore | MachineMemOperand::MOLoad);
3803 LoadMMO = MF->getMachineMemOperand(LoadPtrI, F | MachineMemOperand::MOLoad,
3804 Size, LoadMMO->getBaseAlign());
3805 MachineMemOperand *StoreMMO =
3806 MF->getMachineMemOperand(StorePtrI, F | MachineMemOperand::MOStore,
3807 sizeof(int32_t), Align(4));
3808
3809 MIB.setMemRefs({LoadMMO, StoreMMO});
3810
3811 MI.eraseFromParent();
3812 constrainSelectedInstRegOperands(*MIB, TII, TRI, RBI);
3813 return true;
3814}
3815
3816bool AMDGPUInstructionSelector::selectTensorLoadStore(MachineInstr &MI,
3817 Intrinsic::ID IID) const {
3818 bool IsLoad = IID == Intrinsic::amdgcn_tensor_load_to_lds;
3819 unsigned Opc =
3820 IsLoad ? AMDGPU::TENSOR_LOAD_TO_LDS_d4 : AMDGPU::TENSOR_STORE_FROM_LDS_d4;
3821 int NumGroups = 4;
3822
3823 // A lambda function to check whether an operand is a vector of all 0s.
3824 const auto isAllZeros = [&](MachineOperand &Opnd) {
3825 const MachineInstr *DefMI = MRI->getVRegDef(Opnd.getReg());
3826 if (!DefMI)
3827 return false;
3828 return llvm::isBuildVectorAllZeros(*DefMI, *MRI, true);
3829 };
3830
3831 // Use _D2 version if both group 2 and 3 are zero-initialized.
3832 if (isAllZeros(MI.getOperand(3)) && isAllZeros(MI.getOperand(4))) {
3833 NumGroups = 2;
3834 Opc = IsLoad ? AMDGPU::TENSOR_LOAD_TO_LDS_d2
3835 : AMDGPU::TENSOR_STORE_FROM_LDS_d2;
3836 }
3837
3838 // TODO: Handle the fifth group: MI.getOperand(5), which is silently ignored
3839 // for now because all existing targets only support up to 4 groups.
3840 MachineBasicBlock *MBB = MI.getParent();
3841 auto MIB = BuildMI(*MBB, &MI, MI.getDebugLoc(), TII.get(Opc))
3842 .add(MI.getOperand(1)) // D# group 0
3843 .add(MI.getOperand(2)); // D# group 1
3844
3845 if (NumGroups >= 4) { // Has at least 4 groups
3846 MIB.add(MI.getOperand(3)) // D# group 2
3847 .add(MI.getOperand(4)); // D# group 3
3848 }
3849
3850 MIB.addImm(0) // r128
3851 .add(MI.getOperand(6)); // cpol
3852
3853 MI.eraseFromParent();
3854 return true;
3855}
3856
3857bool AMDGPUInstructionSelector::selectBVHIntersectRayIntrinsic(
3858 MachineInstr &MI) const {
3859 unsigned OpcodeOpIdx =
3860 MI.getOpcode() == AMDGPU::G_AMDGPU_BVH_INTERSECT_RAY ? 1 : 3;
3861 MI.setDesc(TII.get(MI.getOperand(OpcodeOpIdx).getImm()));
3862 MI.removeOperand(OpcodeOpIdx);
3863 MI.addImplicitDefUseOperands(*MI.getMF());
3864 constrainSelectedInstRegOperands(MI, TII, TRI, RBI);
3865 return true;
3866}
3867
3868// FIXME: This should be removed and the patterns allowed to select. We just
3869// need the AGPR/VGPR combination versions.
3870bool AMDGPUInstructionSelector::selectSMFMACIntrin(MachineInstr &MI) const {
3871 unsigned Opc;
3872 switch (cast<GIntrinsic>(MI).getIntrinsicID()) {
3873 case Intrinsic::amdgcn_smfmac_f32_16x16x32_f16:
3874 Opc = AMDGPU::V_SMFMAC_F32_16X16X32_F16_e64;
3875 break;
3876 case Intrinsic::amdgcn_smfmac_f32_32x32x16_f16:
3877 Opc = AMDGPU::V_SMFMAC_F32_32X32X16_F16_e64;
3878 break;
3879 case Intrinsic::amdgcn_smfmac_f32_16x16x32_bf16:
3880 Opc = AMDGPU::V_SMFMAC_F32_16X16X32_BF16_e64;
3881 break;
3882 case Intrinsic::amdgcn_smfmac_f32_32x32x16_bf16:
3883 Opc = AMDGPU::V_SMFMAC_F32_32X32X16_BF16_e64;
3884 break;
3885 case Intrinsic::amdgcn_smfmac_i32_16x16x64_i8:
3886 Opc = AMDGPU::V_SMFMAC_I32_16X16X64_I8_e64;
3887 break;
3888 case Intrinsic::amdgcn_smfmac_i32_32x32x32_i8:
3889 Opc = AMDGPU::V_SMFMAC_I32_32X32X32_I8_e64;
3890 break;
3891 case Intrinsic::amdgcn_smfmac_f32_16x16x64_bf8_bf8:
3892 Opc = AMDGPU::V_SMFMAC_F32_16X16X64_BF8_BF8_e64;
3893 break;
3894 case Intrinsic::amdgcn_smfmac_f32_16x16x64_bf8_fp8:
3895 Opc = AMDGPU::V_SMFMAC_F32_16X16X64_BF8_FP8_e64;
3896 break;
3897 case Intrinsic::amdgcn_smfmac_f32_16x16x64_fp8_bf8:
3898 Opc = AMDGPU::V_SMFMAC_F32_16X16X64_FP8_BF8_e64;
3899 break;
3900 case Intrinsic::amdgcn_smfmac_f32_16x16x64_fp8_fp8:
3901 Opc = AMDGPU::V_SMFMAC_F32_16X16X64_FP8_FP8_e64;
3902 break;
3903 case Intrinsic::amdgcn_smfmac_f32_32x32x32_bf8_bf8:
3904 Opc = AMDGPU::V_SMFMAC_F32_32X32X32_BF8_BF8_e64;
3905 break;
3906 case Intrinsic::amdgcn_smfmac_f32_32x32x32_bf8_fp8:
3907 Opc = AMDGPU::V_SMFMAC_F32_32X32X32_BF8_FP8_e64;
3908 break;
3909 case Intrinsic::amdgcn_smfmac_f32_32x32x32_fp8_bf8:
3910 Opc = AMDGPU::V_SMFMAC_F32_32X32X32_FP8_BF8_e64;
3911 break;
3912 case Intrinsic::amdgcn_smfmac_f32_32x32x32_fp8_fp8:
3913 Opc = AMDGPU::V_SMFMAC_F32_32X32X32_FP8_FP8_e64;
3914 break;
3915 case Intrinsic::amdgcn_smfmac_f32_16x16x64_f16:
3916 Opc = AMDGPU::V_SMFMAC_F32_16X16X64_F16_e64;
3917 break;
3918 case Intrinsic::amdgcn_smfmac_f32_32x32x32_f16:
3919 Opc = AMDGPU::V_SMFMAC_F32_32X32X32_F16_e64;
3920 break;
3921 case Intrinsic::amdgcn_smfmac_f32_16x16x64_bf16:
3922 Opc = AMDGPU::V_SMFMAC_F32_16X16X64_BF16_e64;
3923 break;
3924 case Intrinsic::amdgcn_smfmac_f32_32x32x32_bf16:
3925 Opc = AMDGPU::V_SMFMAC_F32_32X32X32_BF16_e64;
3926 break;
3927 case Intrinsic::amdgcn_smfmac_i32_16x16x128_i8:
3928 Opc = AMDGPU::V_SMFMAC_I32_16X16X128_I8_e64;
3929 break;
3930 case Intrinsic::amdgcn_smfmac_i32_32x32x64_i8:
3931 Opc = AMDGPU::V_SMFMAC_I32_32X32X64_I8_e64;
3932 break;
3933 case Intrinsic::amdgcn_smfmac_f32_16x16x128_bf8_bf8:
3934 Opc = AMDGPU::V_SMFMAC_F32_16X16X128_BF8_BF8_e64;
3935 break;
3936 case Intrinsic::amdgcn_smfmac_f32_16x16x128_bf8_fp8:
3937 Opc = AMDGPU::V_SMFMAC_F32_16X16X128_BF8_FP8_e64;
3938 break;
3939 case Intrinsic::amdgcn_smfmac_f32_16x16x128_fp8_bf8:
3940 Opc = AMDGPU::V_SMFMAC_F32_16X16X128_FP8_BF8_e64;
3941 break;
3942 case Intrinsic::amdgcn_smfmac_f32_16x16x128_fp8_fp8:
3943 Opc = AMDGPU::V_SMFMAC_F32_16X16X128_FP8_FP8_e64;
3944 break;
3945 case Intrinsic::amdgcn_smfmac_f32_32x32x64_bf8_bf8:
3946 Opc = AMDGPU::V_SMFMAC_F32_32X32X64_BF8_BF8_e64;
3947 break;
3948 case Intrinsic::amdgcn_smfmac_f32_32x32x64_bf8_fp8:
3949 Opc = AMDGPU::V_SMFMAC_F32_32X32X64_BF8_FP8_e64;
3950 break;
3951 case Intrinsic::amdgcn_smfmac_f32_32x32x64_fp8_bf8:
3952 Opc = AMDGPU::V_SMFMAC_F32_32X32X64_FP8_BF8_e64;
3953 break;
3954 case Intrinsic::amdgcn_smfmac_f32_32x32x64_fp8_fp8:
3955 Opc = AMDGPU::V_SMFMAC_F32_32X32X64_FP8_FP8_e64;
3956 break;
3957 default:
3958 llvm_unreachable("unhandled smfmac intrinsic");
3959 }
3960
3961 auto VDst_In = MI.getOperand(4);
3962
3963 MI.setDesc(TII.get(Opc));
3964 MI.removeOperand(4); // VDst_In
3965 MI.removeOperand(1); // Intrinsic ID
3966 MI.addOperand(VDst_In); // Readd VDst_In to the end
3967 MI.addImplicitDefUseOperands(*MI.getMF());
3968 const MCInstrDesc &MCID = MI.getDesc();
3969 if (MCID.getOperandConstraint(0, MCOI::EARLY_CLOBBER) != -1) {
3970 MI.getOperand(0).setIsEarlyClobber(true);
3971 }
3972 return true;
3973}
3974
3975bool AMDGPUInstructionSelector::selectPermlaneSwapIntrin(
3976 MachineInstr &MI, Intrinsic::ID IntrID) const {
3977 if (IntrID == Intrinsic::amdgcn_permlane16_swap &&
3978 !Subtarget->hasPermlane16Swap())
3979 return false;
3980 if (IntrID == Intrinsic::amdgcn_permlane32_swap &&
3981 !Subtarget->hasPermlane32Swap())
3982 return false;
3983
3984 unsigned Opcode = IntrID == Intrinsic::amdgcn_permlane16_swap
3985 ? AMDGPU::V_PERMLANE16_SWAP_B32_e64
3986 : AMDGPU::V_PERMLANE32_SWAP_B32_e64;
3987
3988 MI.removeOperand(2);
3989 MI.setDesc(TII.get(Opcode));
3990 MI.addOperand(*MF, MachineOperand::CreateReg(AMDGPU::EXEC, false, true));
3991
3992 MachineOperand &FI = MI.getOperand(4);
3993 FI.setImm(FI.getImm() ? AMDGPU::DPP::DPP_FI_1 : AMDGPU::DPP::DPP_FI_0);
3994
3995 constrainSelectedInstRegOperands(MI, TII, TRI, RBI);
3996 return true;
3997}
3998
3999bool AMDGPUInstructionSelector::selectWaveAddress(MachineInstr &MI) const {
4000 Register DstReg = MI.getOperand(0).getReg();
4001 Register SrcReg = MI.getOperand(1).getReg();
4002 const RegisterBank *DstRB = RBI.getRegBank(DstReg, *MRI, TRI);
4003 const bool IsVALU = DstRB->getID() == AMDGPU::VGPRRegBankID;
4004 MachineBasicBlock *MBB = MI.getParent();
4005 const DebugLoc &DL = MI.getDebugLoc();
4006
4007 if (IsVALU) {
4008 BuildMI(*MBB, MI, DL, TII.get(AMDGPU::V_LSHRREV_B32_e64), DstReg)
4009 .addImm(Subtarget->getWavefrontSizeLog2())
4010 .addReg(SrcReg);
4011 } else {
4012 BuildMI(*MBB, MI, DL, TII.get(AMDGPU::S_LSHR_B32), DstReg)
4013 .addReg(SrcReg)
4014 .addImm(Subtarget->getWavefrontSizeLog2())
4015 .setOperandDead(3); // Dead scc
4016 }
4017
4018 const TargetRegisterClass &RC =
4019 IsVALU ? AMDGPU::VGPR_32RegClass : AMDGPU::SReg_32RegClass;
4020 if (!RBI.constrainGenericRegister(DstReg, RC, *MRI))
4021 return false;
4022
4023 MI.eraseFromParent();
4024 return true;
4025}
4026
4027bool AMDGPUInstructionSelector::selectWaveShuffleIntrin(
4028 MachineInstr &MI) const {
4029 assert(MI.getNumOperands() == 4);
4030 MachineBasicBlock *MBB = MI.getParent();
4031 const DebugLoc &DL = MI.getDebugLoc();
4032
4033 Register DstReg = MI.getOperand(0).getReg();
4034 Register ValReg = MI.getOperand(2).getReg();
4035 Register IdxReg = MI.getOperand(3).getReg();
4036
4037 const LLT DstTy = MRI->getType(DstReg);
4038 unsigned DstSize = DstTy.getSizeInBits();
4039 const RegisterBank *DstRB = RBI.getRegBank(DstReg, *MRI, TRI);
4040 const TargetRegisterClass *DstRC =
4041 TRI.getRegClassForSizeOnBank(DstSize, *DstRB);
4042
4043 if (DstTy != LLT::scalar(32))
4044 return false;
4045
4046 if (!Subtarget->supportsBPermute())
4047 return false;
4048
4049 // If we can bpermute across the whole wave, then just do that
4050 if (Subtarget->supportsWaveWideBPermute()) {
4051 Register ShiftIdxReg = MRI->createVirtualRegister(DstRC);
4052 BuildMI(*MBB, MI, DL, TII.get(AMDGPU::V_LSHLREV_B32_e64), ShiftIdxReg)
4053 .addImm(2)
4054 .addReg(IdxReg);
4055
4056 BuildMI(*MBB, MI, DL, TII.get(AMDGPU::DS_BPERMUTE_B32), DstReg)
4057 .addReg(ShiftIdxReg)
4058 .addReg(ValReg)
4059 .addImm(0);
4060 } else {
4061 // Otherwise, we need to make use of whole wave mode
4062 assert(Subtarget->isWave64());
4063
4064 // Set inactive lanes to poison
4065 Register UndefValReg =
4066 MRI->createVirtualRegister(TRI.getRegClass(AMDGPU::SReg_32RegClassID));
4067 BuildMI(*MBB, MI, DL, TII.get(AMDGPU::IMPLICIT_DEF), UndefValReg);
4068
4069 Register UndefExecReg = MRI->createVirtualRegister(
4070 TRI.getRegClass(AMDGPU::SReg_64_XEXECRegClassID));
4071 BuildMI(*MBB, MI, DL, TII.get(AMDGPU::IMPLICIT_DEF), UndefExecReg);
4072
4073 Register PoisonValReg = MRI->createVirtualRegister(DstRC);
4074 BuildMI(*MBB, MI, DL, TII.get(AMDGPU::V_SET_INACTIVE_B32), PoisonValReg)
4075 .addImm(0)
4076 .addReg(ValReg)
4077 .addImm(0)
4078 .addReg(UndefValReg)
4079 .addReg(UndefExecReg);
4080
4081 // ds_bpermute requires index to be multiplied by 4
4082 Register ShiftIdxReg = MRI->createVirtualRegister(DstRC);
4083 BuildMI(*MBB, MI, DL, TII.get(AMDGPU::V_LSHLREV_B32_e64), ShiftIdxReg)
4084 .addImm(2)
4085 .addReg(IdxReg);
4086
4087 Register PoisonIdxReg = MRI->createVirtualRegister(DstRC);
4088 BuildMI(*MBB, MI, DL, TII.get(AMDGPU::V_SET_INACTIVE_B32), PoisonIdxReg)
4089 .addImm(0)
4090 .addReg(ShiftIdxReg)
4091 .addImm(0)
4092 .addReg(UndefValReg)
4093 .addReg(UndefExecReg);
4094
4095 // Get permutation of each half, then we'll select which one to use
4096 Register SameSidePermReg = MRI->createVirtualRegister(DstRC);
4097 BuildMI(*MBB, MI, DL, TII.get(AMDGPU::DS_BPERMUTE_B32), SameSidePermReg)
4098 .addReg(PoisonIdxReg)
4099 .addReg(PoisonValReg)
4100 .addImm(0);
4101
4102 Register SwappedValReg = MRI->createVirtualRegister(DstRC);
4103 BuildMI(*MBB, MI, DL, TII.get(AMDGPU::V_PERMLANE64_B32), SwappedValReg)
4104 .addReg(PoisonValReg);
4105
4106 Register OppSidePermReg = MRI->createVirtualRegister(DstRC);
4107 BuildMI(*MBB, MI, DL, TII.get(AMDGPU::DS_BPERMUTE_B32), OppSidePermReg)
4108 .addReg(PoisonIdxReg)
4109 .addReg(SwappedValReg)
4110 .addImm(0);
4111
4112 Register WWMSwapPermReg = MRI->createVirtualRegister(DstRC);
4113 BuildMI(*MBB, MI, DL, TII.get(AMDGPU::STRICT_WWM), WWMSwapPermReg)
4114 .addReg(OppSidePermReg);
4115
4116 // Select which side to take the permute from
4117 // We can get away with only using mbcnt_lo here since we're only
4118 // trying to detect which side of 32 each lane is on, and mbcnt_lo
4119 // returns 32 for lanes 32-63.
4120 Register ThreadIDReg = MRI->createVirtualRegister(DstRC);
4121 BuildMI(*MBB, MI, DL, TII.get(AMDGPU::V_MBCNT_LO_U32_B32_e64), ThreadIDReg)
4122 .addImm(-1)
4123 .addImm(0);
4124
4125 Register XORReg = MRI->createVirtualRegister(DstRC);
4126 BuildMI(*MBB, MI, DL, TII.get(AMDGPU::V_XOR_B32_e64), XORReg)
4127 .addReg(ThreadIDReg)
4128 .addReg(PoisonIdxReg);
4129
4130 Register ANDReg = MRI->createVirtualRegister(DstRC);
4131 BuildMI(*MBB, MI, DL, TII.get(AMDGPU::V_AND_B32_e64), ANDReg)
4132 .addReg(XORReg)
4133 .addImm(32);
4134
4135 Register CompareReg = MRI->createVirtualRegister(
4136 TRI.getRegClass(AMDGPU::SReg_64_XEXECRegClassID));
4137 BuildMI(*MBB, MI, DL, TII.get(AMDGPU::V_CMP_EQ_U32_e64), CompareReg)
4138 .addReg(ANDReg)
4139 .addImm(0);
4140
4141 // Finally do the selection
4142 BuildMI(*MBB, MI, DL, TII.get(AMDGPU::V_CNDMASK_B32_e64), DstReg)
4143 .addImm(0)
4144 .addReg(WWMSwapPermReg)
4145 .addImm(0)
4146 .addReg(SameSidePermReg)
4147 .addReg(CompareReg);
4148 }
4149
4150 MI.eraseFromParent();
4151 return true;
4152}
4153
4154// Match a BITOP3 operation and return the number of matched instructions plus
4155// the truth table.
4156 static std::pair<unsigned, uint8_t> BitOp3_Op(Register R,
4157 SmallVectorImpl<Register> &Src,
4158 const MachineRegisterInfo &MRI) {
4159 unsigned NumOpcodes = 0;
4160 uint8_t LHSBits, RHSBits;
4161
4162 auto getOperandBits = [&Src, R, &MRI](Register Op, uint8_t &Bits) -> bool {
4163 // Define truth table given Src0, Src1, Src2 bits permutations:
4164 // 0 0 0
4165 // 0 0 1
4166 // 0 1 0
4167 // 0 1 1
4168 // 1 0 0
4169 // 1 0 1
4170 // 1 1 0
4171 // 1 1 1
4172 const uint8_t SrcBits[3] = { 0xf0, 0xcc, 0xaa };
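 // Worked example: for (Src0 & Src1) | Src2 the combined table is
 // (0xf0 & 0xcc) | 0xaa = 0xea, which is the 8-bit truth table value the
 // selected V_BITOP3 instruction is given.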
4173
4174 if (mi_match(Op, MRI, m_AllOnesInt())) {
4175 Bits = 0xff;
4176 return true;
4177 }
4178 if (mi_match(Op, MRI, m_ZeroInt())) {
4179 Bits = 0;
4180 return true;
4181 }
4182
4183 for (unsigned I = 0; I < Src.size(); ++I) {
4184 // Try to find existing reused operand
4185 if (Src[I] == Op) {
4186 Bits = SrcBits[I];
4187 return true;
4188 }
4189 // Try to replace parent operator
4190 if (Src[I] == R) {
4191 Bits = SrcBits[I];
4192 Src[I] = Op;
4193 return true;
4194 }
4195 }
4196
4197 if (Src.size() == 3) {
4198 // No room left for operands. Try one last time, there can be a 'not' of
4199 // one of our source operands. In this case we can compute the bits
4200 // without growing the Src vector.
4201 Register LHS;
4202 if (mi_match(Op, MRI, m_Not(m_Reg(LHS)))) {
4203 LHS = getSrcRegIgnoringCopies(LHS, MRI);
4204 for (unsigned I = 0; I < Src.size(); ++I) {
4205 if (Src[I] == LHS) {
4206 Bits = ~SrcBits[I];
4207 return true;
4208 }
4209 }
4210 }
4211
4212 return false;
4213 }
4214
4215 Bits = SrcBits[Src.size()];
4216 Src.push_back(Op);
4217 return true;
4218 };
4219
4220 MachineInstr *MI = MRI.getVRegDef(R);
4221 switch (MI->getOpcode()) {
4222 case TargetOpcode::G_AND:
4223 case TargetOpcode::G_OR:
4224 case TargetOpcode::G_XOR: {
4225 Register LHS = getSrcRegIgnoringCopies(MI->getOperand(1).getReg(), MRI);
4226 Register RHS = getSrcRegIgnoringCopies(MI->getOperand(2).getReg(), MRI);
4227
4228 SmallVector<Register, 3> Backup(Src.begin(), Src.end());
4229 if (!getOperandBits(LHS, LHSBits) ||
4230 !getOperandBits(RHS, RHSBits)) {
4231 Src = std::move(Backup);
4232 return std::make_pair(0, 0);
4233 }
4234
4235 // Recursion is naturally limited by the size of the operand vector.
4236 auto Op = BitOp3_Op(LHS, Src, MRI);
4237 if (Op.first) {
4238 NumOpcodes += Op.first;
4239 LHSBits = Op.second;
4240 }
4241
4242 Op = BitOp3_Op(RHS, Src, MRI);
4243 if (Op.first) {
4244 NumOpcodes += Op.first;
4245 RHSBits = Op.second;
4246 }
4247 break;
4248 }
4249 default:
4250 return std::make_pair(0, 0);
4251 }
4252
4253 uint8_t TTbl;
4254 switch (MI->getOpcode()) {
4255 case TargetOpcode::G_AND:
4256 TTbl = LHSBits & RHSBits;
4257 break;
4258 case TargetOpcode::G_OR:
4259 TTbl = LHSBits | RHSBits;
4260 break;
4261 case TargetOpcode::G_XOR:
4262 TTbl = LHSBits ^ RHSBits;
4263 break;
4264 default:
4265 break;
4266 }
4267
4268 return std::make_pair(NumOpcodes + 1, TTbl);
4269}
4270
4271bool AMDGPUInstructionSelector::selectBITOP3(MachineInstr &MI) const {
4272 if (!Subtarget->hasBitOp3Insts())
4273 return false;
4274
4275 Register DstReg = MI.getOperand(0).getReg();
4276 const RegisterBank *DstRB = RBI.getRegBank(DstReg, *MRI, TRI);
4277 const bool IsVALU = DstRB->getID() == AMDGPU::VGPRRegBankID;
4278 if (!IsVALU)
4279 return false;
4280
4281 SmallVector<Register, 3> Src;
4282 uint8_t TTbl;
4283 unsigned NumOpcodes;
4284
4285 std::tie(NumOpcodes, TTbl) = BitOp3_Op(DstReg, Src, *MRI);
4286
4287 // The Src.empty() case can happen if all operands are all-zero or all-ones.
4288 // Normally it would have been optimized out before reaching this point.
4289 if (NumOpcodes < 2 || Src.empty())
4290 return false;
4291
4292 const bool IsB32 = MRI->getType(DstReg) == LLT::scalar(32);
4293 if (NumOpcodes == 2 && IsB32) {
4294 // Avoid using BITOP3 for OR3, XOR3, AND_OR. This is not faster, but it makes
4295 // the asm more readable. This cannot be modeled with AddedComplexity because
4296 // the selector does not know how many operations we matched.
4297 if (mi_match(MI, *MRI, m_GXor(m_GXor(m_Reg(), m_Reg()), m_Reg())) ||
4298 mi_match(MI, *MRI, m_GOr(m_GOr(m_Reg(), m_Reg()), m_Reg())) ||
4299 mi_match(MI, *MRI, m_GOr(m_GAnd(m_Reg(), m_Reg()), m_Reg())))
4300 return false;
4301 } else if (NumOpcodes < 4) {
4302 // For the uniform case the threshold should be higher to account for moves
4303 // between VGPRs and SGPRs: it needs one operand in a VGPR, the other two can
4304 // be in SGPRs, with a readfirstlane afterwards.
4305 return false;
4306 }
4307
4308 unsigned Opc = IsB32 ? AMDGPU::V_BITOP3_B32_e64 : AMDGPU::V_BITOP3_B16_e64;
4309 if (!IsB32 && STI.hasTrue16BitInsts())
4310 Opc = STI.useRealTrue16Insts() ? AMDGPU::V_BITOP3_B16_gfx1250_t16_e64
4311 : AMDGPU::V_BITOP3_B16_gfx1250_fake16_e64;
4312 unsigned CBL = STI.getConstantBusLimit(Opc);
4313 MachineBasicBlock *MBB = MI.getParent();
4314 const DebugLoc &DL = MI.getDebugLoc();
4315
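 // Copy SGPR sources that exceed the opcode's constant-bus limit into VGPRs so
 // the selected V_BITOP3 instruction stays encodable.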
4316 for (unsigned I = 0; I < Src.size(); ++I) {
4317 const RegisterBank *RB = RBI.getRegBank(Src[I], *MRI, TRI);
4318 if (RB->getID() != AMDGPU::SGPRRegBankID)
4319 continue;
4320 if (CBL > 0) {
4321 --CBL;
4322 continue;
4323 }
4324 Register NewReg = MRI->createVirtualRegister(&AMDGPU::VGPR_32RegClass);
4325 BuildMI(*MBB, MI, DL, TII.get(AMDGPU::COPY), NewReg)
4326 .addReg(Src[I]);
4327 Src[I] = NewReg;
4328 }
4329
4330 // Last operand can be ignored, turning a ternary operation into a binary.
4331 // For example: (~a & b & c) | (~a & b & ~c) -> (~a & b). We can replace
4332 // 'c' with 'a' here without changing the answer. In some pathological
4333 // cases it should be possible to get an operation with a single operand
4334 // too if the optimizer does not catch it.
4335 while (Src.size() < 3)
4336 Src.push_back(Src[0]);
4337
4338 auto MIB = BuildMI(*MBB, MI, DL, TII.get(Opc), DstReg);
4339 if (!IsB32)
4340 MIB.addImm(0); // src_mod0
4341 MIB.addReg(Src[0]);
4342 if (!IsB32)
4343 MIB.addImm(0); // src_mod1
4344 MIB.addReg(Src[1]);
4345 if (!IsB32)
4346 MIB.addImm(0); // src_mod2
4347 MIB.addReg(Src[2])
4348 .addImm(TTbl);
4349 if (!IsB32)
4350 MIB.addImm(0); // op_sel
4351
4352 constrainSelectedInstRegOperands(*MIB, TII, TRI, RBI);
4353 MI.eraseFromParent();
4354
4355 return true;
4356}
4357
4358bool AMDGPUInstructionSelector::selectStackRestore(MachineInstr &MI) const {
4359 Register SrcReg = MI.getOperand(0).getReg();
4360 if (!RBI.constrainGenericRegister(SrcReg, AMDGPU::SReg_32RegClass, *MRI))
4361 return false;
4362
4363 MachineInstr *DefMI = MRI->getVRegDef(SrcReg);
4364 Register SP =
4365 Subtarget->getTargetLowering()->getStackPointerRegisterToSaveRestore();
4366 Register WaveAddr = getWaveAddress(DefMI);
4367 MachineBasicBlock *MBB = MI.getParent();
4368 const DebugLoc &DL = MI.getDebugLoc();
4369
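 // If the incoming value was not produced by G_AMDGPU_WAVE_ADDRESS, derive the
 // wave-level address by shifting the value right by log2 of the wavefront
 // size.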
4370 if (!WaveAddr) {
4371 WaveAddr = MRI->createVirtualRegister(&AMDGPU::SReg_32RegClass);
4372 BuildMI(*MBB, MI, DL, TII.get(AMDGPU::S_LSHR_B32), WaveAddr)
4373 .addReg(SrcReg)
4374 .addImm(Subtarget->getWavefrontSizeLog2())
4375 .setOperandDead(3); // Dead scc
4376 }
4377
4378 BuildMI(*MBB, &MI, DL, TII.get(AMDGPU::COPY), SP)
4379 .addReg(WaveAddr);
4380
4381 MI.eraseFromParent();
4382 return true;
4383}
4384
4385bool AMDGPUInstructionSelector::select(MachineInstr &I) {
4386
4387 if (!I.isPreISelOpcode()) {
4388 if (I.isCopy())
4389 return selectCOPY(I);
4390 return true;
4391 }
4392
4393 switch (I.getOpcode()) {
4394 case TargetOpcode::G_AND:
4395 case TargetOpcode::G_OR:
4396 case TargetOpcode::G_XOR:
4397 if (selectBITOP3(I))
4398 return true;
4399 if (selectImpl(I, *CoverageInfo))
4400 return true;
4401 return selectG_AND_OR_XOR(I);
4402 case TargetOpcode::G_ADD:
4403 case TargetOpcode::G_SUB:
4404 case TargetOpcode::G_PTR_ADD:
4405 if (selectImpl(I, *CoverageInfo))
4406 return true;
4407 return selectG_ADD_SUB(I);
4408 case TargetOpcode::G_UADDO:
4409 case TargetOpcode::G_USUBO:
4410 case TargetOpcode::G_UADDE:
4411 case TargetOpcode::G_USUBE:
4412 return selectG_UADDO_USUBO_UADDE_USUBE(I);
4413 case AMDGPU::G_AMDGPU_MAD_U64_U32:
4414 case AMDGPU::G_AMDGPU_MAD_I64_I32:
4415 return selectG_AMDGPU_MAD_64_32(I);
4416 case TargetOpcode::G_INTTOPTR:
4417 case TargetOpcode::G_BITCAST:
4418 case TargetOpcode::G_PTRTOINT:
4419 case TargetOpcode::G_FREEZE:
4420 return selectCOPY(I);
4421 case TargetOpcode::G_FNEG:
4422 if (selectImpl(I, *CoverageInfo))
4423 return true;
4424 return selectG_FNEG(I);
4425 case TargetOpcode::G_FABS:
4426 if (selectImpl(I, *CoverageInfo))
4427 return true;
4428 return selectG_FABS(I);
4429 case TargetOpcode::G_EXTRACT:
4430 return selectG_EXTRACT(I);
4431 case TargetOpcode::G_MERGE_VALUES:
4432 case TargetOpcode::G_CONCAT_VECTORS:
4433 return selectG_MERGE_VALUES(I);
4434 case TargetOpcode::G_UNMERGE_VALUES:
4435 return selectG_UNMERGE_VALUES(I);
4436 case TargetOpcode::G_BUILD_VECTOR:
4437 case TargetOpcode::G_BUILD_VECTOR_TRUNC:
4438 return selectG_BUILD_VECTOR(I);
4439 case TargetOpcode::G_IMPLICIT_DEF:
4440 return selectG_IMPLICIT_DEF(I);
4441 case TargetOpcode::G_INSERT:
4442 return selectG_INSERT(I);
4443 case TargetOpcode::G_INTRINSIC:
4444 case TargetOpcode::G_INTRINSIC_CONVERGENT:
4445 return selectG_INTRINSIC(I);
4446 case TargetOpcode::G_INTRINSIC_W_SIDE_EFFECTS:
4447 case TargetOpcode::G_INTRINSIC_CONVERGENT_W_SIDE_EFFECTS:
4448 return selectG_INTRINSIC_W_SIDE_EFFECTS(I);
4449 case TargetOpcode::G_ICMP:
4450 case TargetOpcode::G_FCMP:
4451 if (selectG_ICMP_or_FCMP(I))
4452 return true;
4453 return selectImpl(I, *CoverageInfo);
4454 case TargetOpcode::G_LOAD:
4455 case TargetOpcode::G_ZEXTLOAD:
4456 case TargetOpcode::G_SEXTLOAD:
4457 case TargetOpcode::G_STORE:
4458 case TargetOpcode::G_ATOMIC_CMPXCHG:
4459 case TargetOpcode::G_ATOMICRMW_XCHG:
4460 case TargetOpcode::G_ATOMICRMW_ADD:
4461 case TargetOpcode::G_ATOMICRMW_SUB:
4462 case TargetOpcode::G_ATOMICRMW_AND:
4463 case TargetOpcode::G_ATOMICRMW_OR:
4464 case TargetOpcode::G_ATOMICRMW_XOR:
4465 case TargetOpcode::G_ATOMICRMW_MIN:
4466 case TargetOpcode::G_ATOMICRMW_MAX:
4467 case TargetOpcode::G_ATOMICRMW_UMIN:
4468 case TargetOpcode::G_ATOMICRMW_UMAX:
4469 case TargetOpcode::G_ATOMICRMW_UINC_WRAP:
4470 case TargetOpcode::G_ATOMICRMW_UDEC_WRAP:
4471 case TargetOpcode::G_ATOMICRMW_USUB_COND:
4472 case TargetOpcode::G_ATOMICRMW_USUB_SAT:
4473 case TargetOpcode::G_ATOMICRMW_FADD:
4474 case TargetOpcode::G_ATOMICRMW_FMIN:
4475 case TargetOpcode::G_ATOMICRMW_FMAX:
4476 return selectG_LOAD_STORE_ATOMICRMW(I);
4477 case TargetOpcode::G_SELECT:
4478 return selectG_SELECT(I);
4479 case TargetOpcode::G_TRUNC:
4480 return selectG_TRUNC(I);
4481 case TargetOpcode::G_SEXT:
4482 case TargetOpcode::G_ZEXT:
4483 case TargetOpcode::G_ANYEXT:
4484 case TargetOpcode::G_SEXT_INREG:
4485 // This is a workaround. For extension from type i1, `selectImpl()` uses
4486 // patterns from the TD file and generates an illegal VGPR-to-SGPR COPY, as
4487 // type i1 can only be held in an SGPR class.
4488 if (MRI->getType(I.getOperand(1).getReg()) != LLT::scalar(1) &&
4489 selectImpl(I, *CoverageInfo))
4490 return true;
4491 return selectG_SZA_EXT(I);
4492 case TargetOpcode::G_FPEXT:
4493 if (selectG_FPEXT(I))
4494 return true;
4495 return selectImpl(I, *CoverageInfo);
4496 case TargetOpcode::G_BRCOND:
4497 return selectG_BRCOND(I);
4498 case TargetOpcode::G_GLOBAL_VALUE:
4499 return selectG_GLOBAL_VALUE(I);
4500 case TargetOpcode::G_PTRMASK:
4501 return selectG_PTRMASK(I);
4502 case TargetOpcode::G_EXTRACT_VECTOR_ELT:
4503 return selectG_EXTRACT_VECTOR_ELT(I);
4504 case TargetOpcode::G_INSERT_VECTOR_ELT:
4505 return selectG_INSERT_VECTOR_ELT(I);
4506 case AMDGPU::G_AMDGPU_INTRIN_IMAGE_LOAD:
4507 case AMDGPU::G_AMDGPU_INTRIN_IMAGE_LOAD_D16:
4508 case AMDGPU::G_AMDGPU_INTRIN_IMAGE_LOAD_NORET:
4509 case AMDGPU::G_AMDGPU_INTRIN_IMAGE_STORE:
4510 case AMDGPU::G_AMDGPU_INTRIN_IMAGE_STORE_D16: {
4511 const AMDGPU::ImageDimIntrinsicInfo *Intr =
4512 AMDGPU::getImageDimIntrinsicInfo(AMDGPU::getIntrinsicID(I));
4513 assert(Intr && "not an image intrinsic with image pseudo");
4514 return selectImageIntrinsic(I, Intr);
4515 }
4516 case AMDGPU::G_AMDGPU_BVH_DUAL_INTERSECT_RAY:
4517 case AMDGPU::G_AMDGPU_BVH_INTERSECT_RAY:
4518 case AMDGPU::G_AMDGPU_BVH8_INTERSECT_RAY:
4519 return selectBVHIntersectRayIntrinsic(I);
4520 case AMDGPU::G_SBFX:
4521 case AMDGPU::G_UBFX:
4522 return selectG_SBFX_UBFX(I);
4523 case AMDGPU::G_SI_CALL:
4524 I.setDesc(TII.get(AMDGPU::SI_CALL));
4525 return true;
4526 case AMDGPU::G_AMDGPU_WAVE_ADDRESS:
4527 return selectWaveAddress(I);
4528 case AMDGPU::G_AMDGPU_WHOLE_WAVE_FUNC_RETURN: {
4529 I.setDesc(TII.get(AMDGPU::SI_WHOLE_WAVE_FUNC_RETURN));
4530 return true;
4531 }
4532 case AMDGPU::G_STACKRESTORE:
4533 return selectStackRestore(I);
4534 case AMDGPU::G_PHI:
4535 return selectPHI(I);
4536 case AMDGPU::G_AMDGPU_COPY_SCC_VCC:
4537 return selectCOPY_SCC_VCC(I);
4538 case AMDGPU::G_AMDGPU_COPY_VCC_SCC:
4539 return selectCOPY_VCC_SCC(I);
4540 case AMDGPU::G_AMDGPU_READANYLANE:
4541 return selectReadAnyLane(I);
4542 case TargetOpcode::G_CONSTANT:
4543 case TargetOpcode::G_FCONSTANT:
4544 default:
4545 return selectImpl(I, *CoverageInfo);
4546 }
4547 return false;
4548}
4549
4550InstructionSelector::ComplexRendererFns
4551 AMDGPUInstructionSelector::selectVCSRC(MachineOperand &Root) const {
4552 return {{
4553 [=](MachineInstrBuilder &MIB) { MIB.add(Root); }
4554 }};
4555
4556}
4557
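// For example, a source defined as %s = G_FNEG (G_FABS %x) is returned as the
// pair (%x, SISrcMods::NEG | SISrcMods::ABS) when AllowAbs is true, so the
// negate and absolute value are encoded as VOP3 source modifiers.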
4558std::pair<Register, unsigned> AMDGPUInstructionSelector::selectVOP3ModsImpl(
4559 Register Src, bool IsCanonicalizing, bool AllowAbs, bool OpSel) const {
4560 unsigned Mods = 0;
4561 MachineInstr *MI = getDefIgnoringCopies(Src, *MRI);
4562
4563 if (MI->getOpcode() == AMDGPU::G_FNEG) {
4564 Src = MI->getOperand(1).getReg();
4565 Mods |= SISrcMods::NEG;
4566 MI = getDefIgnoringCopies(Src, *MRI);
4567 } else if (MI->getOpcode() == AMDGPU::G_FSUB && IsCanonicalizing) {
4568 // Fold fsub [+-]0 into fneg. This may not have folded depending on the
4569 // denormal mode, but we're implicitly canonicalizing in a source operand.
4570 const ConstantFP *LHS =
4571 getConstantFPVRegVal(MI->getOperand(1).getReg(), *MRI);
4572 if (LHS && LHS->isZero()) {
4573 Mods |= SISrcMods::NEG;
4574 Src = MI->getOperand(2).getReg();
4575 }
4576 }
4577
4578 if (AllowAbs && MI->getOpcode() == AMDGPU::G_FABS) {
4579 Src = MI->getOperand(1).getReg();
4580 Mods |= SISrcMods::ABS;
4581 }
4582
4583 if (OpSel)
4584 Mods |= SISrcMods::OP_SEL_0;
4585
4586 return std::pair(Src, Mods);
4587}
4588
4589std::pair<Register, unsigned>
4590AMDGPUInstructionSelector::selectVOP3PModsF32Impl(Register Src) const {
4591 unsigned Mods;
4592 std::tie(Src, Mods) = selectVOP3ModsImpl(Src);
4593 Mods |= SISrcMods::OP_SEL_1;
4594 return std::pair(Src, Mods);
4595}
4596
4597Register AMDGPUInstructionSelector::copyToVGPRIfSrcFolded(
4598 Register Src, unsigned Mods, MachineOperand Root, MachineInstr *InsertPt,
4599 bool ForceVGPR) const {
4600 if ((Mods != 0 || ForceVGPR) &&
4601 RBI.getRegBank(Src, *MRI, TRI)->getID() != AMDGPU::VGPRRegBankID) {
4602
4603 // If we looked through copies to find source modifiers on an SGPR operand,
4604 // we now have an SGPR register source. To avoid potentially violating the
4605 // constant bus restriction, we need to insert a copy to a VGPR.
4606 Register VGPRSrc = MRI->cloneVirtualRegister(Root.getReg());
4607 BuildMI(*InsertPt->getParent(), InsertPt, InsertPt->getDebugLoc(),
4608 TII.get(AMDGPU::COPY), VGPRSrc)
4609 .addReg(Src);
4610 Src = VGPRSrc;
4611 }
4612
4613 return Src;
4614}
4615
4616///
4617/// This will select either an SGPR or VGPR operand and will save us from
4618/// having to write an extra tablegen pattern.
4619InstructionSelector::ComplexRendererFns
4620AMDGPUInstructionSelector::selectVSRC0(MachineOperand &Root) const {
4621 return {{
4622 [=](MachineInstrBuilder &MIB) { MIB.add(Root); }
4623 }};
4624}
4625
4626InstructionSelector::ComplexRendererFns
4627AMDGPUInstructionSelector::selectVOP3Mods0(MachineOperand &Root) const {
4628 Register Src;
4629 unsigned Mods;
4630 std::tie(Src, Mods) = selectVOP3ModsImpl(Root.getReg());
4631
4632 return {{
4633 [=](MachineInstrBuilder &MIB) {
4634 MIB.addReg(copyToVGPRIfSrcFolded(Src, Mods, Root, MIB));
4635 },
4636 [=](MachineInstrBuilder &MIB) { MIB.addImm(Mods); }, // src0_mods
4637 [=](MachineInstrBuilder &MIB) { MIB.addImm(0); }, // clamp
4638 [=](MachineInstrBuilder &MIB) { MIB.addImm(0); } // omod
4639 }};
4640}
4641
4642InstructionSelector::ComplexRendererFns
4643AMDGPUInstructionSelector::selectVOP3BMods0(MachineOperand &Root) const {
4644 Register Src;
4645 unsigned Mods;
4646 std::tie(Src, Mods) = selectVOP3ModsImpl(Root.getReg(),
4647 /*IsCanonicalizing=*/true,
4648 /*AllowAbs=*/false);
4649
4650 return {{
4651 [=](MachineInstrBuilder &MIB) {
4652 MIB.addReg(copyToVGPRIfSrcFolded(Src, Mods, Root, MIB));
4653 },
4654 [=](MachineInstrBuilder &MIB) { MIB.addImm(Mods); }, // src0_mods
4655 [=](MachineInstrBuilder &MIB) { MIB.addImm(0); }, // clamp
4656 [=](MachineInstrBuilder &MIB) { MIB.addImm(0); } // omod
4657 }};
4658}
4659
4660InstructionSelector::ComplexRendererFns
4661AMDGPUInstructionSelector::selectVOP3OMods(MachineOperand &Root) const {
4662 return {{
4663 [=](MachineInstrBuilder &MIB) { MIB.add(Root); },
4664 [=](MachineInstrBuilder &MIB) { MIB.addImm(0); }, // clamp
4665 [=](MachineInstrBuilder &MIB) { MIB.addImm(0); } // omod
4666 }};
4667}
4668
4669InstructionSelector::ComplexRendererFns
4670AMDGPUInstructionSelector::selectVOP3Mods(MachineOperand &Root) const {
4671 Register Src;
4672 unsigned Mods;
4673 std::tie(Src, Mods) = selectVOP3ModsImpl(Root.getReg());
4674
4675 return {{
4676 [=](MachineInstrBuilder &MIB) {
4677 MIB.addReg(copyToVGPRIfSrcFolded(Src, Mods, Root, MIB));
4678 },
4679 [=](MachineInstrBuilder &MIB) { MIB.addImm(Mods); } // src_mods
4680 }};
4681}
4682
4683InstructionSelector::ComplexRendererFns
4684AMDGPUInstructionSelector::selectVOP3ModsNonCanonicalizing(
4685 MachineOperand &Root) const {
4686 Register Src;
4687 unsigned Mods;
4688 std::tie(Src, Mods) =
4689 selectVOP3ModsImpl(Root.getReg(), /*IsCanonicalizing=*/false);
4690
4691 return {{
4692 [=](MachineInstrBuilder &MIB) {
4693 MIB.addReg(copyToVGPRIfSrcFolded(Src, Mods, Root, MIB));
4694 },
4695 [=](MachineInstrBuilder &MIB) { MIB.addImm(Mods); } // src_mods
4696 }};
4697}
4698
4699InstructionSelector::ComplexRendererFns
4700AMDGPUInstructionSelector::selectVOP3BMods(MachineOperand &Root) const {
4701 Register Src;
4702 unsigned Mods;
4703 std::tie(Src, Mods) =
4704 selectVOP3ModsImpl(Root.getReg(), /*IsCanonicalizing=*/true,
4705 /*AllowAbs=*/false);
4706
4707 return {{
4708 [=](MachineInstrBuilder &MIB) {
4709 MIB.addReg(copyToVGPRIfSrcFolded(Src, Mods, Root, MIB));
4710 },
4711 [=](MachineInstrBuilder &MIB) { MIB.addImm(Mods); } // src_mods
4712 }};
4713}
4714
4715InstructionSelector::ComplexRendererFns
4716AMDGPUInstructionSelector::selectVOP3NoMods(MachineOperand &Root) const {
4717 Register Reg = Root.getReg();
4718 const MachineInstr *Def = getDefIgnoringCopies(Reg, *MRI);
4719 if (Def->getOpcode() == AMDGPU::G_FNEG || Def->getOpcode() == AMDGPU::G_FABS)
4720 return {};
4721 return {{
4722 [=](MachineInstrBuilder &MIB) { MIB.addReg(Reg); },
4723 }};
4724}
4725
4726enum class SrcStatus {
4731 // This means current op = [op_upper, op_lower] and src = -op_lower.
4734 // This means current op = [op_upper, op_lower] and src = [op_upper,
4735 // -op_lower].
4743};
4744/// Test if the MI is truncating to half, such as `%reg0:n = G_TRUNC %reg1:2n`
4745static bool isTruncHalf(const MachineInstr *MI,
4746 const MachineRegisterInfo &MRI) {
4747 if (MI->getOpcode() != AMDGPU::G_TRUNC)
4748 return false;
4749
4750 unsigned DstSize = MRI.getType(MI->getOperand(0).getReg()).getSizeInBits();
4751 unsigned SrcSize = MRI.getType(MI->getOperand(1).getReg()).getSizeInBits();
4752 return DstSize * 2 == SrcSize;
4753}
4754
4755/// Test if the MI is a logical shift right by half the bit width,
4756/// such as `%reg0:2n = G_LSHR %reg1:2n, CONST(n)`
4757static bool isLshrHalf(const MachineInstr *MI, const MachineRegisterInfo &MRI) {
4758 if (MI->getOpcode() != AMDGPU::G_LSHR)
4759 return false;
4760
4761 Register ShiftSrc;
4762 std::optional<ValueAndVReg> ShiftAmt;
4763 if (mi_match(MI->getOperand(0).getReg(), MRI,
4764 m_GLShr(m_Reg(ShiftSrc), m_GCst(ShiftAmt)))) {
4765 unsigned SrcSize = MRI.getType(MI->getOperand(1).getReg()).getSizeInBits();
4766 unsigned Shift = ShiftAmt->Value.getZExtValue();
4767 return Shift * 2 == SrcSize;
4768 }
4769 return false;
4770}
4771
4772/// Test if the MI is a shift left by half the bit width,
4773/// such as `%reg0:2n = G_SHL %reg1:2n, CONST(n)`
4774static bool isShlHalf(const MachineInstr *MI, const MachineRegisterInfo &MRI) {
4775 if (MI->getOpcode() != AMDGPU::G_SHL)
4776 return false;
4777
4778 Register ShiftSrc;
4779 std::optional<ValueAndVReg> ShiftAmt;
4780 if (mi_match(MI->getOperand(0).getReg(), MRI,
4781 m_GShl(m_Reg(ShiftSrc), m_GCst(ShiftAmt)))) {
4782 unsigned SrcSize = MRI.getType(MI->getOperand(1).getReg()).getSizeInBits();
4783 unsigned Shift = ShiftAmt->Value.getZExtValue();
4784 return Shift * 2 == SrcSize;
4785 }
4786 return false;
4787}
4788
4789/// Test if the MI is `%reg0:n, %reg1:n = G_UNMERGE_VALUES %reg2:2n`
4790static bool isUnmergeHalf(const MachineInstr *MI,
4791 const MachineRegisterInfo &MRI) {
4792 if (MI->getOpcode() != AMDGPU::G_UNMERGE_VALUES)
4793 return false;
4794 return MI->getNumOperands() == 3 && MI->getOperand(0).isDef() &&
4795 MI->getOperand(1).isDef() && !MI->getOperand(2).isDef();
4796}
4797
4798enum class TypeClass { VECTOR_OF_TWO, SCALAR, NONE_OF_LISTED };
4799
4800static TypeClass isVectorOfTwoOrScalar(Register Reg,
4801 const MachineRegisterInfo &MRI) {
4802 LLT OpTy = MRI.getType(Reg);
4803 if (OpTy.isScalar())
4804 return TypeClass::SCALAR;
4805 if (OpTy.isVector() && OpTy.getNumElements() == 2)
4806 return TypeClass::VECTOR_OF_TWO;
4807 return TypeClass::NONE_OF_LISTED;
4808}
4809
4810static SrcStatus getNegStatus(Register Reg, SrcStatus S,
4811 const MachineRegisterInfo &MRI) {
4812 TypeClass NegType = isVectorOfTwoOrScalar(Reg, MRI);
4813 if (NegType != TypeClass::VECTOR_OF_TWO && NegType != TypeClass::SCALAR)
4814 return SrcStatus::INVALID;
4815
4816 switch (S) {
4817 case SrcStatus::IS_SAME:
4818 if (NegType == TypeClass::VECTOR_OF_TWO) {
4819 // Vector of 2:
4820 // [SrcHi, SrcLo] = [CurrHi, CurrLo]
4821 // [CurrHi, CurrLo] = neg [OpHi, OpLo](2 x Type)
4822 // [CurrHi, CurrLo] = [-OpHi, -OpLo](2 x Type)
4823 // [SrcHi, SrcLo] = [-OpHi, -OpLo]
4825 }
4826 if (NegType == TypeClass::SCALAR) {
4827 // Scalar:
4828 // [SrcHi, SrcLo] = [CurrHi, CurrLo]
4829 // [CurrHi, CurrLo] = neg [OpHi, OpLo](Type)
4830 // [CurrHi, CurrLo] = [-OpHi, OpLo](Type)
4831 // [SrcHi, SrcLo] = [-OpHi, OpLo]
4832 return SrcStatus::IS_HI_NEG;
4833 }
4834 break;
4836 if (NegType == TypeClass::VECTOR_OF_TWO) {
4837 // Vector of 2:
4838 // [SrcHi, SrcLo] = [-CurrHi, CurrLo]
4839 // [CurrHi, CurrLo] = neg [OpHi, OpLo](2 x Type)
4840 // [CurrHi, CurrLo] = [-OpHi, -OpLo](2 x Type)
4841 // [SrcHi, SrcLo] = [-(-OpHi), -OpLo] = [OpHi, -OpLo]
4842 return SrcStatus::IS_LO_NEG;
4843 }
4844 if (NegType == TypeClass::SCALAR) {
4845 // Scalar:
4846 // [SrcHi, SrcLo] = [-CurrHi, CurrLo]
4847 // [CurrHi, CurrLo] = neg [OpHi, OpLo](Type)
4848 // [CurrHi, CurrLo] = [-OpHi, OpLo](Type)
4849 // [SrcHi, SrcLo] = [-(-OpHi), OpLo] = [OpHi, OpLo]
4850 return SrcStatus::IS_SAME;
4851 }
4852 break;
4854 if (NegType == TypeClass::VECTOR_OF_TWO) {
4855 // Vector of 2:
4856 // [SrcHi, SrcLo] = [CurrHi, -CurrLo]
4857 // [CurrHi, CurrLo] = fneg [OpHi, OpLo](2 x Type)
4858 // [CurrHi, CurrLo] = [-OpHi, -OpLo](2 x Type)
4859 // [SrcHi, SrcLo] = [-OpHi, -(-OpLo)] = [-OpHi, OpLo]
4860 return SrcStatus::IS_HI_NEG;
4861 }
4862 if (NegType == TypeClass::SCALAR) {
4863 // Scalar:
4864 // [SrcHi, SrcLo] = [CurrHi, -CurrLo]
4865 // [CurrHi, CurrLo] = fneg [OpHi, OpLo](Type)
4866 // [CurrHi, CurrLo] = [-OpHi, OpLo](Type)
4867 // [SrcHi, SrcLo] = [-OpHi, -OpLo]
4869 }
4870 break;
4872 if (NegType == TypeClass::VECTOR_OF_TWO) {
4873 // Vector of 2:
4874 // [SrcHi, SrcLo] = [-CurrHi, -CurrLo]
4875 // [CurrHi, CurrLo] = fneg [OpHi, OpLo](2 x Type)
4876 // [CurrHi, CurrLo] = [-OpHi, -OpLo](2 x Type)
4877 // [SrcHi, SrcLo] = [OpHi, OpLo]
4878 return SrcStatus::IS_SAME;
4879 }
4880 if (NegType == TypeClass::SCALAR) {
4881 // Scalar:
4882 // [SrcHi, SrcLo] = [-CurrHi, -CurrLo]
4883 // [CurrHi, CurrLo] = fneg [OpHi, OpLo](Type)
4884 // [CurrHi, CurrLo] = [-OpHi, OpLo](Type)
4885 // [SrcHi, SrcLo] = [OpHi, -OpLo]
4886 return SrcStatus::IS_LO_NEG;
4887 }
4888 break;
4889 case SrcStatus::IS_UPPER_HALF:
4890 // Vector of 2:
4891 // Src = CurrUpper
4892 // Curr = [CurrUpper, CurrLower]
4893 // [CurrUpper, CurrLower] = fneg [OpUpper, OpLower](2 x Type)
4894 // [CurrUpper, CurrLower] = [-OpUpper, -OpLower](2 x Type)
4895 // Src = -OpUpper
4896 //
4897 // Scalar:
4898 // Src = CurrUpper
4899 // Curr = [CurrUpper, CurrLower]
4900 // [CurrUpper, CurrLower] = fneg [OpUpper, OpLower](Type)
4901 // [CurrUpper, CurrLower] = [-OpUpper, OpLower](Type)
4902 // Src = -OpUpper
4903 return SrcStatus::IS_UPPER_HALF_NEG;
4904 case SrcStatus::IS_LOWER_HALF:
4905 if (NegType == TypeClass::VECTOR_OF_TWO) {
4906 // Vector of 2:
4907 // Src = CurrLower
4908 // Curr = [CurrUpper, CurrLower]
4909 // [CurrUpper, CurrLower] = fneg [OpUpper, OpLower](2 x Type)
4910 // [CurrUpper, CurrLower] = [-OpUpper, -OpLower](2 x Type)
4911 // Src = -OpLower
4912 return SrcStatus::IS_LOWER_HALF_NEG;
4913 }
4914 if (NegType == TypeClass::SCALAR) {
4915 // Scalar:
4916 // Src = CurrLower
4917 // Curr = [CurrUpper, CurrLower]
4918 // [CurrUpper, CurrLower] = fneg [OpUpper, OpLower](Type)
4919 // [CurrUpper, CurrLower] = [-OpUpper, OpLower](Type)
4920 // Src = OpLower
4921 return SrcStatus::IS_LOWER_HALF;
4922 }
4923 break;
4924 case SrcStatus::IS_UPPER_HALF_NEG:
4925 // Vector of 2:
4926 // Src = -CurrUpper
4927 // Curr = [CurrUpper, CurrLower]
4928 // [CurrUpper, CurrLower] = fneg [OpUpper, OpLower](2 x Type)
4929 // [CurrUpper, CurrLower] = [-OpUpper, -OpLower](2 x Type)
4930 // Src = -(-OpUpper) = OpUpper
4931 //
4932 // Scalar:
4933 // Src = -CurrUpper
4934 // Curr = [CurrUpper, CurrLower]
4935 // [CurrUpper, CurrLower] = fneg [OpUpper, OpLower](Type)
4936 // [CurrUpper, CurrLower] = [-OpUpper, OpLower](Type)
4937 // Src = -(-OpUpper) = OpUpper
4938 return SrcStatus::IS_UPPER_HALF;
4939 case SrcStatus::IS_LOWER_HALF_NEG:
4940 if (NegType == TypeClass::VECTOR_OF_TWO) {
4941 // Vector of 2:
4942 // Src = -CurrLower
4943 // Curr = [CurrUpper, CurrLower]
4944 // [CurrUpper, CurrLower] = fneg [OpUpper, OpLower](2 x Type)
4945 // [CurrUpper, CurrLower] = [-OpUpper, -OpLower](2 x Type)
4946 // Src = -(-OpLower) = OpLower
4947 return SrcStatus::IS_LOWER_HALF;
4948 }
4949 if (NegType == TypeClass::SCALAR) {
4950 // Scalar:
4951 // Src = -CurrLower
4952 // Curr = [CurrUpper, CurrLower]
4953 // [CurrUpper, CurrLower] = fneg [OpUpper, OpLower](Type)
4954 // [CurrUpper, CurrLower] = [-OpUpper, OpLower](Type)
4955 // Src = -OpLower
4956 return SrcStatus::IS_LOWER_HALF_NEG;
4957 }
4958 break;
4959 default:
4960 break;
4961 }
4962 llvm_unreachable("unexpected SrcStatus & NegType combination");
4963}
4964
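// Take one step up the def chain of Curr.first: COPY and G_BITCAST keep the
// current status, G_FNEG updates it via getNegStatus, and otherwise the
// status is refined by looking through trunc-by-half, unmerge-by-half, and
// shift-by-half patterns. Returns std::nullopt when no further step is
// possible.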
4965static std::optional<std::pair<Register, SrcStatus>>
4966calcNextStatus(std::pair<Register, SrcStatus> Curr,
4967 const MachineRegisterInfo &MRI) {
4968 const MachineInstr *MI = MRI.getVRegDef(Curr.first);
4969
4970 unsigned Opc = MI->getOpcode();
4971
4972 // Handle general Opc cases.
4973 switch (Opc) {
4974 case AMDGPU::G_BITCAST:
4975 return std::optional<std::pair<Register, SrcStatus>>(
4976 {MI->getOperand(1).getReg(), Curr.second});
4977 case AMDGPU::COPY:
4978 if (MI->getOperand(1).getReg().isPhysical())
4979 return std::nullopt;
4980 return std::optional<std::pair<Register, SrcStatus>>(
4981 {MI->getOperand(1).getReg(), Curr.second});
4982 case AMDGPU::G_FNEG: {
4983 SrcStatus Stat = getNegStatus(Curr.first, Curr.second, MRI);
4984 if (Stat == SrcStatus::INVALID)
4985 return std::nullopt;
4986 return std::optional<std::pair<Register, SrcStatus>>(
4987 {MI->getOperand(1).getReg(), Stat});
4988 }
4989 default:
4990 break;
4991 }
4992
4993 // Calc next Stat from current Stat.
4994 switch (Curr.second) {
4995 case SrcStatus::IS_SAME:
4996 if (isTruncHalf(MI, MRI))
4997 return std::optional<std::pair<Register, SrcStatus>>(
4998 {MI->getOperand(1).getReg(), SrcStatus::IS_LOWER_HALF});
4999 else if (isUnmergeHalf(MI, MRI)) {
5000 if (Curr.first == MI->getOperand(0).getReg())
5001 return std::optional<std::pair<Register, SrcStatus>>(
5002 {MI->getOperand(2).getReg(), SrcStatus::IS_LOWER_HALF});
5003 return std::optional<std::pair<Register, SrcStatus>>(
5004 {MI->getOperand(2).getReg(), SrcStatus::IS_UPPER_HALF});
5005 }
5006 break;
5007 case SrcStatus::IS_HI_NEG:
5008 if (isTruncHalf(MI, MRI)) {
5009 // [SrcHi, SrcLo] = [-CurrHi, CurrLo]
5010 // [CurrHi, CurrLo] = trunc [OpUpper, OpLower] = OpLower
5011 // = [OpLowerHi, OpLowerLo]
5012 // Src = [SrcHi, SrcLo] = [-CurrHi, CurrLo]
5013 // = [-OpLowerHi, OpLowerLo]
5014 // = -OpLower
5015 return std::optional<std::pair<Register, SrcStatus>>(
5016 {MI->getOperand(1).getReg(), SrcStatus::IS_LOWER_HALF_NEG});
5017 }
5018 if (isUnmergeHalf(MI, MRI)) {
5019 if (Curr.first == MI->getOperand(0).getReg())
5020 return std::optional<std::pair<Register, SrcStatus>>(
5021 {MI->getOperand(2).getReg(), SrcStatus::IS_LOWER_HALF_NEG});
5022 return std::optional<std::pair<Register, SrcStatus>>(
5023 {MI->getOperand(2).getReg(), SrcStatus::IS_UPPER_HALF_NEG});
5024 }
5025 break;
5026 case SrcStatus::IS_UPPER_HALF:
5027 if (isShlHalf(MI, MRI))
5028 return std::optional<std::pair<Register, SrcStatus>>(
5029 {MI->getOperand(1).getReg(), SrcStatus::IS_LOWER_HALF});
5030 break;
5031 case SrcStatus::IS_LOWER_HALF:
5032 if (isLshrHalf(MI, MRI))
5033 return std::optional<std::pair<Register, SrcStatus>>(
5034 {MI->getOperand(1).getReg(), SrcStatus::IS_UPPER_HALF});
5035 break;
5036 case SrcStatus::IS_UPPER_HALF_NEG:
5037 if (isShlHalf(MI, MRI))
5038 return std::optional<std::pair<Register, SrcStatus>>(
5039 {MI->getOperand(1).getReg(), SrcStatus::IS_LOWER_HALF_NEG});
5040 break;
5041 case SrcStatus::IS_LOWER_HALF_NEG:
5042 if (isLshrHalf(MI, MRI))
5043 return std::optional<std::pair<Register, SrcStatus>>(
5044 {MI->getOperand(1).getReg(), SrcStatus::IS_UPPER_HALF_NEG});
5045 break;
5046 default:
5047 break;
5048 }
5049 return std::nullopt;
5050}
5051
5052/// This is used to control which source statuses the current MI supports. For
5053/// example, a non-floating-point intrinsic such as @llvm.amdgcn.sdot2 does not
5054/// support the NEG bits on VOP3P.
5055/// The class can be further extended to recognize SEL, NEG and ABS support
5056/// for different MIs on different architectures.
5057class SearchOptions {
5058private:
5059 bool HasNeg = false;
5060 // Assume all VOP3P complex patterns have op_sel.
5061 bool HasOpsel = true;
5062
5063public:
5064 SearchOptions(Register Reg, const MachineRegisterInfo &MRI) {
5065 const MachineInstr *MI = MRI.getVRegDef(Reg);
5066 unsigned Opc = MI->getOpcode();
5067
5068 if (Opc == TargetOpcode::G_INTRINSIC) {
5069 Intrinsic::ID IntrinsicID = cast<GIntrinsic>(*MI).getIntrinsicID();
5070 // Only floating-point intrinsics have neg & neg_hi bits.
5071 if (IntrinsicID == Intrinsic::amdgcn_fdot2)
5072 HasNeg = true;
5073 } else {
5074 // Keep same for generic op.
5075 HasNeg = true;
5076 }
5077 }
5078 bool checkOptions(SrcStatus Stat) const {
5079 if (!HasNeg &&
5080 (Stat >= SrcStatus::NEG_START && Stat <= SrcStatus::NEG_END)) {
5081 return false;
5082 }
5083 if (!HasOpsel &&
5084 (Stat >= SrcStatus::HALF_START && Stat <= SrcStatus::HALF_END)) {
5085 return false;
5086 }
5087 return true;
5088 }
5089};
5090
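// Collect the chain of (register, status) pairs reachable from Reg by
// repeatedly applying calcNextStatus, up to MaxDepth steps, keeping only the
// statuses permitted by the SearchOptions.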
5091static SmallVector<std::pair<Register, SrcStatus>, 4>
5092getSrcStats(Register Reg, const MachineRegisterInfo &MRI, SearchOptions SO,
5093 int MaxDepth = 3) {
5094 int Depth = 0;
5095 auto Curr = calcNextStatus({Reg, SrcStatus::IS_SAME}, MRI);
5096 SmallVector<std::pair<Register, SrcStatus>, 4> Statlist;
5097
5098 while (Depth <= MaxDepth && Curr.has_value()) {
5099 Depth++;
5100 if (SO.checkOptions(Curr.value().second))
5101 Statlist.push_back(Curr.value());
5102 Curr = calcNextStatus(Curr.value(), MRI);
5103 }
5104
5105 return Statlist;
5106}
5107
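// Walk the same chain as getSrcStats, but return the deepest (register,
// status) pair whose status is IS_SAME or one of the whole-operand negation
// statuses; defaults to {Reg, IS_SAME}.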
5108static std::pair<Register, SrcStatus>
5109getLastSameOrNeg(Register Reg, const MachineRegisterInfo &MRI, SearchOptions SO,
5110 int MaxDepth = 3) {
5111 int Depth = 0;
5112 std::pair<Register, SrcStatus> LastSameOrNeg = {Reg, SrcStatus::IS_SAME};
5113 auto Curr = calcNextStatus(LastSameOrNeg, MRI);
5114
5115 while (Depth <= MaxDepth && Curr.has_value()) {
5116 Depth++;
5117 SrcStatus Stat = Curr.value().second;
5118 if (SO.checkOptions(Stat)) {
5119 if (Stat == SrcStatus::IS_SAME || Stat == SrcStatus::IS_HI_NEG ||
5120 Stat == SrcStatus::IS_LO_NEG || Stat == SrcStatus::IS_BOTH_NEG)
5121 LastSameOrNeg = Curr.value();
5122 }
5123 Curr = calcNextStatus(Curr.value(), MRI);
5124 }
5125
5126 return LastSameOrNeg;
5127}
5128
5129static bool isSameBitWidth(Register Reg1, Register Reg2,
5130 const MachineRegisterInfo &MRI) {
5131 unsigned Width1 = MRI.getType(Reg1).getSizeInBits();
5132 unsigned Width2 = MRI.getType(Reg2).getSizeInBits();
5133 return Width1 == Width2;
5134}
5135
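// Translate the matched hi/lo source statuses into SISrcMods bits: upper-half
// selections set the op_sel bits and negated halves toggle NEG/NEG_HI.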
5136static unsigned updateMods(SrcStatus HiStat, SrcStatus LoStat, unsigned Mods) {
5137 // SrcStatus::IS_LOWER_HALF remain 0.
5138 if (HiStat == SrcStatus::IS_UPPER_HALF_NEG) {
5139 Mods ^= SISrcMods::NEG_HI;
5140 Mods |= SISrcMods::OP_SEL_1;
5141 } else if (HiStat == SrcStatus::IS_UPPER_HALF)
5142 Mods |= SISrcMods::OP_SEL_1;
5143 else if (HiStat == SrcStatus::IS_LOWER_HALF_NEG)
5144 Mods ^= SISrcMods::NEG_HI;
5145 else if (HiStat == SrcStatus::IS_HI_NEG)
5146 Mods ^= SISrcMods::NEG_HI;
5147
5148 if (LoStat == SrcStatus::IS_UPPER_HALF_NEG) {
5149 Mods ^= SISrcMods::NEG;
5150 Mods |= SISrcMods::OP_SEL_0;
5151 } else if (LoStat == SrcStatus::IS_UPPER_HALF)
5152 Mods |= SISrcMods::OP_SEL_0;
5153 else if (LoStat == SrcStatus::IS_LOWER_HALF_NEG)
5154 Mods |= SISrcMods::NEG;
5155 else if (LoStat == SrcStatus::IS_HI_NEG)
5156 Mods ^= SISrcMods::NEG;
5157
5158 return Mods;
5159}
5160
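// The two halves can be folded back into one packed source only if NewReg has
// the same bit width as RootReg and both statuses select a (possibly negated)
// half of it.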
5161static bool isValidToPack(SrcStatus HiStat, SrcStatus LoStat, Register NewReg,
5162 Register RootReg, const SIInstrInfo &TII,
5163 const MachineRegisterInfo &MRI) {
5164 auto IsHalfState = [](SrcStatus S) {
5165 return S == SrcStatus::IS_UPPER_HALF || S == SrcStatus::IS_UPPER_HALF_NEG ||
5166 S == SrcStatus::IS_LOWER_HALF || S == SrcStatus::IS_LOWER_HALF_NEG;
5167 };
5168 return isSameBitWidth(NewReg, RootReg, MRI) && IsHalfState(LoStat) &&
5169 IsHalfState(HiStat);
5170}
5171
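// Compute the source register and VOP3P modifier bits (neg, neg_hi, op_sel,
// op_sel_hi) for RootReg: whole-operand negations are folded first, and if
// the root is a two-element G_BUILD_VECTOR both halves are traced back to a
// single packed register where possible.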
5172std::pair<Register, unsigned> AMDGPUInstructionSelector::selectVOP3PModsImpl(
5173 Register RootReg, const MachineRegisterInfo &MRI, bool IsDOT) const {
5174 unsigned Mods = 0;
5175 // No modification if the Root type is not of the form <2 x Type>.
5176 if (isVectorOfTwoOrScalar(RootReg, MRI) != TypeClass::VECTOR_OF_TWO) {
5177 Mods |= SISrcMods::OP_SEL_1;
5178 return {RootReg, Mods};
5179 }
5180
5181 SearchOptions SO(RootReg, MRI);
5182
5183 std::pair<Register, SrcStatus> Stat = getLastSameOrNeg(RootReg, MRI, SO);
5184
5185 if (Stat.second == SrcStatus::IS_BOTH_NEG)
5186 Mods ^= (SISrcMods::NEG | SISrcMods::NEG_HI);
5187 else if (Stat.second == SrcStatus::IS_HI_NEG)
5188 Mods ^= SISrcMods::NEG_HI;
5189 else if (Stat.second == SrcStatus::IS_LO_NEG)
5190 Mods ^= SISrcMods::NEG;
5191
5192 MachineInstr *MI = MRI.getVRegDef(Stat.first);
5193
5194 if (MI->getOpcode() != AMDGPU::G_BUILD_VECTOR || MI->getNumOperands() != 3 ||
5195 (IsDOT && Subtarget->hasDOTOpSelHazard())) {
5196 Mods |= SISrcMods::OP_SEL_1;
5197 return {Stat.first, Mods};
5198 }
5199
5200 SmallVector<std::pair<Register, SrcStatus>, 4> StatlistHi =
5201 getSrcStats(MI->getOperand(2).getReg(), MRI, SO);
5202
5203 if (StatlistHi.empty()) {
5204 Mods |= SISrcMods::OP_SEL_1;
5205 return {Stat.first, Mods};
5206 }
5207
5208 SmallVector<std::pair<Register, SrcStatus>, 4> StatlistLo =
5209 getSrcStats(MI->getOperand(1).getReg(), MRI, SO);
5210
5211 if (StatlistLo.empty()) {
5212 Mods |= SISrcMods::OP_SEL_1;
5213 return {Stat.first, Mods};
5214 }
5215
5216 for (int I = StatlistHi.size() - 1; I >= 0; I--) {
5217 for (int J = StatlistLo.size() - 1; J >= 0; J--) {
5218 if (StatlistHi[I].first == StatlistLo[J].first &&
5219 isValidToPack(StatlistHi[I].second, StatlistLo[J].second,
5220 StatlistHi[I].first, RootReg, TII, MRI))
5221 return {StatlistHi[I].first,
5222 updateMods(StatlistHi[I].second, StatlistLo[J].second, Mods)};
5223 }
5224 }
5225 // Packed instructions do not have abs modifiers.
5226 Mods |= SISrcMods::OP_SEL_1;
5227
5228 return {Stat.first, Mods};
5229}
5230
5231
5232
5233static bool checkRB(Register Reg, unsigned int RBNo,
5234 const AMDGPURegisterBankInfo &RBI,
5235 const MachineRegisterInfo &MRI,
5236 const TargetRegisterInfo &TRI) {
5237 const RegisterBank *RB = RBI.getRegBank(Reg, MRI, TRI);
5238 return RB->getID() == RBNo;
5239}
5240
5241// This function is used to get the correct register bank for the returned register.
5242// Assume:
5243// 1. VOP3P is always legal for VGPR.
5244// 2. RootOp's regbank is legal.
5245// Thus
5246// 1. If RootOp is SGPR, then NewOp can be SGPR or VGPR.
5247// 2. If RootOp is VGPR, then NewOp must be VGPR.
5248static Register getLegalRegBank(Register NewReg, Register RootReg,
5249 const AMDGPURegisterBankInfo &RBI,
5250 MachineRegisterInfo &MRI,
5251 const TargetRegisterInfo &TRI,
5252 const SIInstrInfo &TII) {
5253 // RootOp can only be VGPR or SGPR (some hand-written cases such as
5254 // inst-select-ashr.v2s16.mir::ashr_v2s16_vs).
5255 if (checkRB(RootReg, AMDGPU::SGPRRegBankID, RBI, MRI, TRI) ||
5256 checkRB(NewReg, AMDGPU::VGPRRegBankID, RBI, MRI, TRI))
5257 return NewReg;
5258
5259 MachineInstr *MI = MRI.getVRegDef(RootReg);
5260 if (MI->getOpcode() == AMDGPU::COPY && NewReg == MI->getOperand(1).getReg()) {
5261 // RootOp is VGPR, NewOp is not VGPR, but RootOp = COPY NewOp.
5262 return RootReg;
5263 }
5264
5265 MachineBasicBlock *BB = MI->getParent();
5266 Register DstReg = MRI.cloneVirtualRegister(RootReg);
5267
5268 MachineInstrBuilder MIB =
5269 BuildMI(*BB, MI, MI->getDebugLoc(), TII.get(AMDGPU::COPY), DstReg)
5270 .addReg(NewReg);
5271
5272 // Only accept VGPR.
5273 return MIB->getOperand(0).getReg();
5274}
5275
5276InstructionSelector::ComplexRendererFns
5277AMDGPUInstructionSelector::selectVOP3PRetHelper(MachineOperand &Root,
5278 bool IsDOT) const {
5279 MachineRegisterInfo &MRI = Root.getParent()->getMF()->getRegInfo();
5280 Register Reg;
5281 unsigned Mods;
5282 std::tie(Reg, Mods) = selectVOP3PModsImpl(Root.getReg(), MRI, IsDOT);
5283
5284 Reg = getLegalRegBank(Reg, Root.getReg(), RBI, MRI, TRI, TII);
5285 return {{
5286 [=](MachineInstrBuilder &MIB) { MIB.addReg(Reg); },
5287 [=](MachineInstrBuilder &MIB) { MIB.addImm(Mods); } // src_mods
5288 }};
5289}
5290
5291InstructionSelector::ComplexRendererFns
5292AMDGPUInstructionSelector::selectVOP3PMods(MachineOperand &Root) const {
5293
5294 return selectVOP3PRetHelper(Root);
5295}
5296
5297InstructionSelector::ComplexRendererFns
5298AMDGPUInstructionSelector::selectVOP3PModsDOT(MachineOperand &Root) const {
5299
5300 return selectVOP3PRetHelper(Root, true);
5301}
5302
5303InstructionSelector::ComplexRendererFns
5304AMDGPUInstructionSelector::selectVOP3PNoModsDOT(MachineOperand &Root) const {
5305 MachineRegisterInfo &MRI = Root.getParent()->getMF()->getRegInfo();
5306 Register Src;
5307 unsigned Mods;
5308 std::tie(Src, Mods) = selectVOP3PModsImpl(Root.getReg(), MRI, true /*IsDOT*/);
5309 if (Mods != SISrcMods::OP_SEL_1)
5310 return {};
5311
5312 return {{[=](MachineInstrBuilder &MIB) { MIB.addReg(Src); }}};
5313}
5314
5315InstructionSelector::ComplexRendererFns
5316AMDGPUInstructionSelector::selectVOP3PModsF32(MachineOperand &Root) const {
5317 Register Src;
5318 unsigned Mods;
5319 std::tie(Src, Mods) = selectVOP3PModsF32Impl(Root.getReg());
5320
5321 return {{
5322 [=](MachineInstrBuilder &MIB) { MIB.addReg(Src); },
5323 [=](MachineInstrBuilder &MIB) { MIB.addImm(Mods); } // src_mods
5324 }};
5325}
5326
5327InstructionSelector::ComplexRendererFns
5328AMDGPUInstructionSelector::selectVOP3PNoModsF32(MachineOperand &Root) const {
5329 Register Src;
5330 unsigned Mods;
5331 std::tie(Src, Mods) = selectVOP3PModsF32Impl(Root.getReg());
5332 if (Mods != SISrcMods::OP_SEL_1)
5333 return {};
5334
5335 return {{[=](MachineInstrBuilder &MIB) { MIB.addReg(Src); }}};
5336}
5337
5338InstructionSelector::ComplexRendererFns
5339AMDGPUInstructionSelector::selectWMMAOpSelVOP3PMods(
5340 MachineOperand &Root) const {
5341 assert((Root.isImm() && (Root.getImm() == -1 || Root.getImm() == 0)) &&
5342 "expected i1 value");
5343 unsigned Mods = SISrcMods::OP_SEL_1;
5344 if (Root.getImm() != 0)
5345 Mods |= SISrcMods::OP_SEL_0;
5346
5347 return {{
5348 [=](MachineInstrBuilder &MIB) { MIB.addImm(Mods); } // src_mods
5349 }};
5350}
5351
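// Build a REG_SEQUENCE covering the 2, 4 or 8 registers in Elts at InsertPt
// and return the resulting wide register.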
5352static Register buildRegSequence(SmallVectorImpl<Register> &Elts,
5353 MachineInstr *InsertPt,
5354 MachineRegisterInfo &MRI) {
5355 const TargetRegisterClass *DstRegClass;
5356 switch (Elts.size()) {
5357 case 8:
5358 DstRegClass = &AMDGPU::VReg_256RegClass;
5359 break;
5360 case 4:
5361 DstRegClass = &AMDGPU::VReg_128RegClass;
5362 break;
5363 case 2:
5364 DstRegClass = &AMDGPU::VReg_64RegClass;
5365 break;
5366 default:
5367 llvm_unreachable("unhandled Reg sequence size");
5368 }
5369
5370 MachineIRBuilder B(*InsertPt);
5371 auto MIB = B.buildInstr(AMDGPU::REG_SEQUENCE)
5372 .addDef(MRI.createVirtualRegister(DstRegClass));
5373 for (unsigned i = 0; i < Elts.size(); ++i) {
5374 MIB.addReg(Elts[i]);
5375 MIB.addImm(SIRegisterInfo::getSubRegFromChannel(i));
5376 }
5377 return MIB->getOperand(0).getReg();
5378}
5379
5380static void selectWMMAModsNegAbs(unsigned ModOpcode, unsigned &Mods,
5381 SmallVectorImpl<Register> &Elts, Register &Src,
5382 MachineInstr *InsertPt,
5383 MachineRegisterInfo &MRI) {
5384 if (ModOpcode == TargetOpcode::G_FNEG) {
5385 Mods |= SISrcMods::NEG;
5386 // Check if all elements also have abs modifier
5387 SmallVector<Register, 8> NegAbsElts;
5388 for (auto El : Elts) {
5389 Register FabsSrc;
5390 if (!mi_match(El, MRI, m_GFabs(m_Reg(FabsSrc))))
5391 break;
5392 NegAbsElts.push_back(FabsSrc);
5393 }
5394 if (Elts.size() != NegAbsElts.size()) {
5395 // Neg
5396 Src = buildRegSequence(Elts, InsertPt, MRI);
5397 } else {
5398 // Neg and Abs
5399 Mods |= SISrcMods::NEG_HI;
5400 Src = buildRegSequence(NegAbsElts, InsertPt, MRI);
5401 }
5402 } else {
5403 assert(ModOpcode == TargetOpcode::G_FABS);
5404 // Abs
5405 Mods |= SISrcMods::NEG_HI;
5406 Src = buildRegSequence(Elts, InsertPt, MRI);
5407 }
5408}
5409
5410InstructionSelector::ComplexRendererFns
5411AMDGPUInstructionSelector::selectWMMAModsF32NegAbs(MachineOperand &Root) const {
5412 Register Src = Root.getReg();
5413 unsigned Mods = SISrcMods::OP_SEL_1;
5414 SmallVector<Register, 8> EltsF32;
5415
5416 if (GBuildVector *BV = dyn_cast<GBuildVector>(MRI->getVRegDef(Src))) {
5417 assert(BV->getNumSources() > 0);
5418 // Based on first element decide which mod we match, neg or abs
5419 MachineInstr *ElF32 = MRI->getVRegDef(BV->getSourceReg(0));
5420 unsigned ModOpcode = (ElF32->getOpcode() == AMDGPU::G_FNEG)
5421 ? AMDGPU::G_FNEG
5422 : AMDGPU::G_FABS;
5423 for (unsigned i = 0; i < BV->getNumSources(); ++i) {
5424 ElF32 = MRI->getVRegDef(BV->getSourceReg(i));
5425 if (ElF32->getOpcode() != ModOpcode)
5426 break;
5427 EltsF32.push_back(ElF32->getOperand(1).getReg());
5428 }
5429
5430 // All elements had ModOpcode modifier
5431 if (BV->getNumSources() == EltsF32.size()) {
5432 selectWMMAModsNegAbs(ModOpcode, Mods, EltsF32, Src, Root.getParent(),
5433 *MRI);
5434 }
5435 }
5436
5437 return {{[=](MachineInstrBuilder &MIB) { MIB.addReg(Src); },
5438 [=](MachineInstrBuilder &MIB) { MIB.addImm(Mods); }}};
5439}
5440
5441InstructionSelector::ComplexRendererFns
5442AMDGPUInstructionSelector::selectWMMAModsF16Neg(MachineOperand &Root) const {
5443 Register Src = Root.getReg();
5444 unsigned Mods = SISrcMods::OP_SEL_1;
5445 SmallVector<Register, 8> EltsV2F16;
5446
5447 if (GConcatVectors *CV = dyn_cast<GConcatVectors>(MRI->getVRegDef(Src))) {
5448 for (unsigned i = 0; i < CV->getNumSources(); ++i) {
5449 Register FNegSrc;
5450 if (!mi_match(CV->getSourceReg(i), *MRI, m_GFNeg(m_Reg(FNegSrc))))
5451 break;
5452 EltsV2F16.push_back(FNegSrc);
5453 }
5454
5455 // All elements had ModOpcode modifier
5456 if (CV->getNumSources() == EltsV2F16.size()) {
5457 Mods |= SISrcMods::NEG;
5458 Mods |= SISrcMods::NEG_HI;
5459 Src = buildRegSequence(EltsV2F16, Root.getParent(), *MRI);
5460 }
5461 }
5462
5463 return {{[=](MachineInstrBuilder &MIB) { MIB.addReg(Src); },
5464 [=](MachineInstrBuilder &MIB) { MIB.addImm(Mods); }}};
5465}
5466
5467InstructionSelector::ComplexRendererFns
5468AMDGPUInstructionSelector::selectWMMAModsF16NegAbs(MachineOperand &Root) const {
5469 Register Src = Root.getReg();
5470 unsigned Mods = SISrcMods::OP_SEL_1;
5471 SmallVector<Register, 8> EltsV2F16;
5472
5473 if (GConcatVectors *CV = dyn_cast<GConcatVectors>(MRI->getVRegDef(Src))) {
5474 assert(CV->getNumSources() > 0);
5475 MachineInstr *ElV2F16 = MRI->getVRegDef(CV->getSourceReg(0));
5476 // Based on first element decide which mod we match, neg or abs
5477 unsigned ModOpcode = (ElV2F16->getOpcode() == AMDGPU::G_FNEG)
5478 ? AMDGPU::G_FNEG
5479 : AMDGPU::G_FABS;
5480
5481 for (unsigned i = 0; i < CV->getNumSources(); ++i) {
5482 ElV2F16 = MRI->getVRegDef(CV->getSourceReg(i));
5483 if (ElV2F16->getOpcode() != ModOpcode)
5484 break;
5485 EltsV2F16.push_back(ElV2F16->getOperand(1).getReg());
5486 }
5487
5488 // All elements had ModOpcode modifier
5489 if (CV->getNumSources() == EltsV2F16.size()) {
5490 MachineIRBuilder B(*Root.getParent());
5491 selectWMMAModsNegAbs(ModOpcode, Mods, EltsV2F16, Src, Root.getParent(),
5492 *MRI);
5493 }
5494 }
5495
5496 return {{[=](MachineInstrBuilder &MIB) { MIB.addReg(Src); },
5497 [=](MachineInstrBuilder &MIB) { MIB.addImm(Mods); }}};
5498}
5499
5500InstructionSelector::ComplexRendererFns
5501AMDGPUInstructionSelector::selectWMMAVISrc(MachineOperand &Root) const {
5502 std::optional<FPValueAndVReg> FPValReg;
5503 if (mi_match(Root.getReg(), *MRI, m_GFCstOrSplat(FPValReg))) {
5504 if (TII.isInlineConstant(FPValReg->Value)) {
5505 return {{[=](MachineInstrBuilder &MIB) {
5506 MIB.addImm(FPValReg->Value.bitcastToAPInt().getSExtValue());
5507 }}};
5508 }
5509 // Non-inlineable splat floats should not fall through to the integer
5510 // immediate checks.
5511 return {};
5512 }
5513
5514 APInt ICst;
5515 if (mi_match(Root.getReg(), *MRI, m_ICstOrSplat(ICst))) {
5516 if (TII.isInlineConstant(ICst)) {
5517 return {
5518 {[=](MachineInstrBuilder &MIB) { MIB.addImm(ICst.getSExtValue()); }}};
5519 }
5520 }
5521
5522 return {};
5523}
5524
5525InstructionSelector::ComplexRendererFns
5526AMDGPUInstructionSelector::selectSWMMACIndex8(MachineOperand &Root) const {
5527 Register Src =
5528 getDefIgnoringCopies(Root.getReg(), *MRI)->getOperand(0).getReg();
5529 unsigned Key = 0;
5530
5531 Register ShiftSrc;
5532 std::optional<ValueAndVReg> ShiftAmt;
5533 if (mi_match(Src, *MRI, m_GLShr(m_Reg(ShiftSrc), m_GCst(ShiftAmt))) &&
5534 MRI->getType(ShiftSrc).getSizeInBits() == 32 &&
5535 ShiftAmt->Value.getZExtValue() % 8 == 0) {
5536 Key = ShiftAmt->Value.getZExtValue() / 8;
5537 Src = ShiftSrc;
5538 }
5539
5540 return {{
5541 [=](MachineInstrBuilder &MIB) { MIB.addReg(Src); },
5542 [=](MachineInstrBuilder &MIB) { MIB.addImm(Key); } // index_key
5543 }};
5544}
5545
5546InstructionSelector::ComplexRendererFns
5547AMDGPUInstructionSelector::selectSWMMACIndex16(MachineOperand &Root) const {
5548
5549 Register Src =
5550 getDefIgnoringCopies(Root.getReg(), *MRI)->getOperand(0).getReg();
5551 unsigned Key = 0;
5552
5553 Register ShiftSrc;
5554 std::optional<ValueAndVReg> ShiftAmt;
5555 if (mi_match(Src, *MRI, m_GLShr(m_Reg(ShiftSrc), m_GCst(ShiftAmt))) &&
5556 MRI->getType(ShiftSrc).getSizeInBits() == 32 &&
5557 ShiftAmt->Value.getZExtValue() == 16) {
5558 Src = ShiftSrc;
5559 Key = 1;
5560 }
5561
5562 return {{
5563 [=](MachineInstrBuilder &MIB) { MIB.addReg(Src); },
5564 [=](MachineInstrBuilder &MIB) { MIB.addImm(Key); } // index_key
5565 }};
5566}
5567
5568InstructionSelector::ComplexRendererFns
5569AMDGPUInstructionSelector::selectSWMMACIndex32(MachineOperand &Root) const {
5570 Register Src =
5571 getDefIgnoringCopies(Root.getReg(), *MRI)->getOperand(0).getReg();
5572 unsigned Key = 0;
5573
5574 Register S32 = matchZeroExtendFromS32(Src);
5575 if (!S32)
5576 S32 = matchAnyExtendFromS32(Src);
5577
5578 if (S32) {
5579 const MachineInstr *Def = getDefIgnoringCopies(S32, *MRI);
5580 if (Def->getOpcode() == TargetOpcode::G_UNMERGE_VALUES) {
5581 assert(Def->getNumOperands() == 3);
5582 Register DstReg1 = Def->getOperand(1).getReg();
5583 if (mi_match(S32, *MRI,
5584 m_any_of(m_SpecificReg(DstReg1), m_Copy(m_Reg(DstReg1))))) {
5585 Src = Def->getOperand(2).getReg();
5586 Key = 1;
5587 }
5588 }
5589 }
5590
5591 return {{
5592 [=](MachineInstrBuilder &MIB) { MIB.addReg(Src); },
5593 [=](MachineInstrBuilder &MIB) { MIB.addImm(Key); } // index_key
5594 }};
5595}
5596
5597InstructionSelector::ComplexRendererFns
5598AMDGPUInstructionSelector::selectVOP3OpSelMods(MachineOperand &Root) const {
5599 Register Src;
5600 unsigned Mods;
5601 std::tie(Src, Mods) = selectVOP3ModsImpl(Root.getReg());
5602
5603 // FIXME: Handle op_sel
5604 return {{
5605 [=](MachineInstrBuilder &MIB) { MIB.addReg(Src); },
5606 [=](MachineInstrBuilder &MIB) { MIB.addImm(Mods); } // src_mods
5607 }};
5608}
5609
5610// FIXME-TRUE16 remove when fake16 is removed
5611InstructionSelector::ComplexRendererFns
5612AMDGPUInstructionSelector::selectVINTERPMods(MachineOperand &Root) const {
5613 Register Src;
5614 unsigned Mods;
5615 std::tie(Src, Mods) = selectVOP3ModsImpl(Root.getReg(),
5616 /*IsCanonicalizing=*/true,
5617 /*AllowAbs=*/false,
5618 /*OpSel=*/false);
5619
5620 return {{
5621 [=](MachineInstrBuilder &MIB) {
5622 MIB.addReg(
5623 copyToVGPRIfSrcFolded(Src, Mods, Root, MIB, /* ForceVGPR */ true));
5624 },
5625 [=](MachineInstrBuilder &MIB) { MIB.addImm(Mods); }, // src0_mods
5626 }};
5627}
5628
5629InstructionSelector::ComplexRendererFns
5630AMDGPUInstructionSelector::selectVINTERPModsHi(MachineOperand &Root) const {
5631 Register Src;
5632 unsigned Mods;
5633 std::tie(Src, Mods) = selectVOP3ModsImpl(Root.getReg(),
5634 /*IsCanonicalizing=*/true,
5635 /*AllowAbs=*/false,
5636 /*OpSel=*/true);
5637
5638 return {{
5639 [=](MachineInstrBuilder &MIB) {
5640 MIB.addReg(
5641 copyToVGPRIfSrcFolded(Src, Mods, Root, MIB, /* ForceVGPR */ true));
5642 },
5643 [=](MachineInstrBuilder &MIB) { MIB.addImm(Mods); }, // src0_mods
5644 }};
5645}
5646
5647// Given \p Offset and the load specified by the \p Root operand, check if
5648// \p Offset is a multiple of the load byte size. If it is, update \p Offset
5649// to a pre-scaled value and return true.
5650bool AMDGPUInstructionSelector::selectScaleOffset(MachineOperand &Root,
5651 Register &Offset,
5652 bool IsSigned) const {
5653 if (!Subtarget->hasScaleOffset())
5654 return false;
5655
5656 const MachineInstr &MI = *Root.getParent();
5657 MachineMemOperand *MMO = *MI.memoperands_begin();
5658
5659 if (!MMO->getSize().hasValue())
5660 return false;
5661
5662 uint64_t Size = MMO->getSize().getValue();
5663
5664 Register OffsetReg = matchExtendFromS32OrS32(Offset, IsSigned);
5665 if (!OffsetReg)
5666 OffsetReg = Offset;
5667
5668 if (auto Def = getDefSrcRegIgnoringCopies(OffsetReg, *MRI))
5669 OffsetReg = Def->Reg;
5670
5671 Register Op0;
5672 MachineInstr *Mul;
5673 bool ScaleOffset =
5674 (isPowerOf2_64(Size) &&
5675 mi_match(OffsetReg, *MRI,
5676 m_GShl(m_Reg(Op0),
5677 m_any_of(m_SpecificICst(Log2_64(Size)),
5678 m_Copy(m_SpecificICst(Log2_64(Size))))))) ||
5679 mi_match(OffsetReg, *MRI,
5680 m_GMul(m_Reg(Op0), m_any_of(m_SpecificICst(Size),
5681 m_Copy(m_SpecificICst(Size))))) ||
5682 mi_match(
5683 OffsetReg, *MRI,
5684 m_BinOp(IsSigned ? AMDGPU::S_MUL_I64_I32_PSEUDO : AMDGPU::S_MUL_U64,
5685 m_Reg(Op0), m_SpecificICst(Size))) ||
5686 // Match G_AMDGPU_MAD_U64_U32 offset, c, 0
5687 (mi_match(OffsetReg, *MRI, m_MInstr(Mul)) &&
5688 (Mul->getOpcode() == (IsSigned ? AMDGPU::G_AMDGPU_MAD_I64_I32
5689 : AMDGPU::G_AMDGPU_MAD_U64_U32) ||
5690 (IsSigned && Mul->getOpcode() == AMDGPU::G_AMDGPU_MAD_U64_U32 &&
5691 VT->signBitIsZero(Mul->getOperand(2).getReg()))) &&
5692 mi_match(Mul->getOperand(4).getReg(), *MRI, m_ZeroInt()) &&
5693 mi_match(Mul->getOperand(3).getReg(), *MRI,
5695 m_Copy(m_SpecificICst(Size))))) &&
5696 mi_match(Mul->getOperand(2).getReg(), *MRI, m_Reg(Op0)));
5697
5698 if (ScaleOffset)
5699 Offset = Op0;
5700
5701 return ScaleOffset;
5702}
5703
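// Decompose the SMRD address rooted at Root into an SGPR base plus, depending
// on which out-parameters are provided, an encodable immediate offset and/or
// an SGPR offset register (optionally pre-scaled).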
5704bool AMDGPUInstructionSelector::selectSmrdOffset(MachineOperand &Root,
5705 Register &Base,
5706 Register *SOffset,
5707 int64_t *Offset,
5708 bool *ScaleOffset) const {
5709 MachineInstr *MI = Root.getParent();
5710 MachineBasicBlock *MBB = MI->getParent();
5711
5712 // FIXME: We should shrink the GEP if the offset is known to be <= 32-bits,
5713 // then we can select all ptr + 32-bit offsets.
5714 SmallVector<GEPInfo, 4> AddrInfo;
5715 getAddrModeInfo(*MI, *MRI, AddrInfo);
5716
5717 if (AddrInfo.empty())
5718 return false;
5719
5720 const GEPInfo &GEPI = AddrInfo[0];
5721 std::optional<int64_t> EncodedImm;
5722
5723 if (ScaleOffset)
5724 *ScaleOffset = false;
5725
5726 if (SOffset && Offset) {
5727 EncodedImm = AMDGPU::getSMRDEncodedOffset(STI, GEPI.Imm, /*IsBuffer=*/false,
5728 /*HasSOffset=*/true);
5729 if (GEPI.SgprParts.size() == 1 && GEPI.Imm != 0 && EncodedImm &&
5730 AddrInfo.size() > 1) {
5731 const GEPInfo &GEPI2 = AddrInfo[1];
5732 if (GEPI2.SgprParts.size() == 2 && GEPI2.Imm == 0) {
5733 Register OffsetReg = GEPI2.SgprParts[1];
5734 if (ScaleOffset)
5735 *ScaleOffset =
5736 selectScaleOffset(Root, OffsetReg, false /* IsSigned */);
5737 OffsetReg = matchZeroExtendFromS32OrS32(OffsetReg);
5738 if (OffsetReg) {
5739 Base = GEPI2.SgprParts[0];
5740 *SOffset = OffsetReg;
5741 *Offset = *EncodedImm;
5742 if (*Offset >= 0 || !AMDGPU::hasSMRDSignedImmOffset(STI))
5743 return true;
5744
5745 // For unbuffered smem loads, it is illegal for the Immediate Offset
5746 // to be negative if the resulting (Offset + (M0 or SOffset or zero)
5747 // is negative. Handle the case where the Immediate Offset + SOffset
5748 // is negative.
5749 auto SKnown = VT->getKnownBits(*SOffset);
5750 if (*Offset + SKnown.getMinValue().getSExtValue() < 0)
5751 return false;
5752
5753 return true;
5754 }
5755 }
5756 }
5757 return false;
5758 }
5759
5760 EncodedImm = AMDGPU::getSMRDEncodedOffset(STI, GEPI.Imm, /*IsBuffer=*/false,
5761 /*HasSOffset=*/false);
5762 if (Offset && GEPI.SgprParts.size() == 1 && EncodedImm) {
5763 Base = GEPI.SgprParts[0];
5764 *Offset = *EncodedImm;
5765 return true;
5766 }
5767
5768 // SGPR offset is unsigned.
5769 if (SOffset && GEPI.SgprParts.size() == 1 && isUInt<32>(GEPI.Imm) &&
5770 GEPI.Imm != 0) {
5771 // If we make it this far we have a load with a 32-bit immediate offset.
5772 // It is OK to select this using a sgpr offset, because we have already
5773 // failed trying to select this load into one of the _IMM variants since
5774 // the _IMM Patterns are considered before the _SGPR patterns.
5775 Base = GEPI.SgprParts[0];
5776 *SOffset = MRI->createVirtualRegister(&AMDGPU::SReg_32RegClass);
5777 BuildMI(*MBB, MI, MI->getDebugLoc(), TII.get(AMDGPU::S_MOV_B32), *SOffset)
5778 .addImm(GEPI.Imm);
5779 return true;
5780 }
5781
5782 if (SOffset && GEPI.SgprParts.size() && GEPI.Imm == 0) {
5783 Register OffsetReg = GEPI.SgprParts[1];
5784 if (ScaleOffset)
5785 *ScaleOffset = selectScaleOffset(Root, OffsetReg, false /* IsSigned */);
5786 OffsetReg = matchZeroExtendFromS32OrS32(OffsetReg);
5787 if (OffsetReg) {
5788 Base = GEPI.SgprParts[0];
5789 *SOffset = OffsetReg;
5790 return true;
5791 }
5792 }
5793
5794 return false;
5795}
5796
5797InstructionSelector::ComplexRendererFns
5798AMDGPUInstructionSelector::selectSmrdImm(MachineOperand &Root) const {
5799 Register Base;
5800 int64_t Offset;
5801 if (!selectSmrdOffset(Root, Base, /* SOffset= */ nullptr, &Offset,
5802 /* ScaleOffset */ nullptr))
5803 return std::nullopt;
5804
5805 return {{[=](MachineInstrBuilder &MIB) { MIB.addReg(Base); },
5806 [=](MachineInstrBuilder &MIB) { MIB.addImm(Offset); }}};
5807}
5808
5809InstructionSelector::ComplexRendererFns
5810AMDGPUInstructionSelector::selectSmrdImm32(MachineOperand &Root) const {
5811 SmallVector<GEPInfo, 4> AddrInfo;
5812 getAddrModeInfo(*Root.getParent(), *MRI, AddrInfo);
5813
5814 if (AddrInfo.empty() || AddrInfo[0].SgprParts.size() != 1)
5815 return std::nullopt;
5816
5817 const GEPInfo &GEPInfo = AddrInfo[0];
5818 Register PtrReg = GEPInfo.SgprParts[0];
5819 std::optional<int64_t> EncodedImm =
5820 AMDGPU::getSMRDEncodedLiteralOffset32(STI, GEPInfo.Imm);
5821 if (!EncodedImm)
5822 return std::nullopt;
5823
5824 return {{
5825 [=](MachineInstrBuilder &MIB) { MIB.addReg(PtrReg); },
5826 [=](MachineInstrBuilder &MIB) { MIB.addImm(*EncodedImm); }
5827 }};
5828}
5829
5830InstructionSelector::ComplexRendererFns
5831AMDGPUInstructionSelector::selectSmrdSgpr(MachineOperand &Root) const {
5832 Register Base, SOffset;
5833 bool ScaleOffset;
5834 if (!selectSmrdOffset(Root, Base, &SOffset, /* Offset= */ nullptr,
5835 &ScaleOffset))
5836 return std::nullopt;
5837
5838 unsigned CPol = ScaleOffset ? AMDGPU::CPol::SCAL : 0;
5839 return {{[=](MachineInstrBuilder &MIB) { MIB.addReg(Base); },
5840 [=](MachineInstrBuilder &MIB) { MIB.addReg(SOffset); },
5841 [=](MachineInstrBuilder &MIB) { MIB.addImm(CPol); }}};
5842}
5843
5844InstructionSelector::ComplexRendererFns
5845AMDGPUInstructionSelector::selectSmrdSgprImm(MachineOperand &Root) const {
5846 Register Base, SOffset;
5847 int64_t Offset;
5848 bool ScaleOffset;
5849 if (!selectSmrdOffset(Root, Base, &SOffset, &Offset, &ScaleOffset))
5850 return std::nullopt;
5851
5852 unsigned CPol = ScaleOffset ? AMDGPU::CPol::SCAL : 0;
5853 return {{[=](MachineInstrBuilder &MIB) { MIB.addReg(Base); },
5854 [=](MachineInstrBuilder &MIB) { MIB.addReg(SOffset); },
5855 [=](MachineInstrBuilder &MIB) { MIB.addImm(Offset); },
5856 [=](MachineInstrBuilder &MIB) { MIB.addImm(CPol); }}};
5857}
5858
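// Try to split the address in Root into a base pointer plus an immediate
// offset that is legal for the given FLAT instruction variant; fall back to
// the original register with a zero offset.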
5859std::pair<Register, int>
5860AMDGPUInstructionSelector::selectFlatOffsetImpl(MachineOperand &Root,
5861 uint64_t FlatVariant) const {
5862 MachineInstr *MI = Root.getParent();
5863
5864 auto Default = std::pair(Root.getReg(), 0);
5865
5866 if (!STI.hasFlatInstOffsets())
5867 return Default;
5868
5869 Register PtrBase;
5870 int64_t ConstOffset;
5871 bool IsInBounds;
5872 std::tie(PtrBase, ConstOffset, IsInBounds) =
5873 getPtrBaseWithConstantOffset(Root.getReg(), *MRI);
5874
5875 // Adding the offset to the base address with an immediate in a FLAT
5876 // instruction must not change the memory aperture in which the address falls.
5877 // Therefore we can only fold offsets from inbounds GEPs into FLAT
5878 // instructions.
5879 if (ConstOffset == 0 ||
5880 (FlatVariant == SIInstrFlags::FlatScratch &&
5881 !isFlatScratchBaseLegal(Root.getReg())) ||
5882 (FlatVariant == SIInstrFlags::FLAT && !IsInBounds))
5883 return Default;
5884
5885 unsigned AddrSpace = (*MI->memoperands_begin())->getAddrSpace();
5886 if (!TII.isLegalFLATOffset(ConstOffset, AddrSpace, FlatVariant))
5887 return Default;
5888
5889 return std::pair(PtrBase, ConstOffset);
5890}
5891
5892InstructionSelector::ComplexRendererFns
5893AMDGPUInstructionSelector::selectFlatOffset(MachineOperand &Root) const {
5894 auto PtrWithOffset = selectFlatOffsetImpl(Root, SIInstrFlags::FLAT);
5895
5896 return {{
5897 [=](MachineInstrBuilder &MIB) { MIB.addReg(PtrWithOffset.first); },
5898 [=](MachineInstrBuilder &MIB) { MIB.addImm(PtrWithOffset.second); },
5899 }};
5900}
5901
5902InstructionSelector::ComplexRendererFns
5903AMDGPUInstructionSelector::selectGlobalOffset(MachineOperand &Root) const {
5904 auto PtrWithOffset = selectFlatOffsetImpl(Root, SIInstrFlags::FlatGlobal);
5905
5906 return {{
5907 [=](MachineInstrBuilder &MIB) { MIB.addReg(PtrWithOffset.first); },
5908 [=](MachineInstrBuilder &MIB) { MIB.addImm(PtrWithOffset.second); },
5909 }};
5910}
5911
5912InstructionSelector::ComplexRendererFns
5913AMDGPUInstructionSelector::selectScratchOffset(MachineOperand &Root) const {
5914 auto PtrWithOffset = selectFlatOffsetImpl(Root, SIInstrFlags::FlatScratch);
5915
5916 return {{
5917 [=](MachineInstrBuilder &MIB) { MIB.addReg(PtrWithOffset.first); },
5918 [=](MachineInstrBuilder &MIB) { MIB.addImm(PtrWithOffset.second); },
5919 }};
5920}
5921
5922// Match (64-bit SGPR base) + (zext vgpr offset) + sext(imm offset)
5923InstructionSelector::ComplexRendererFns
5924AMDGPUInstructionSelector::selectGlobalSAddr(MachineOperand &Root,
5925 unsigned CPolBits,
5926 bool NeedIOffset) const {
5927 Register Addr = Root.getReg();
5928 Register PtrBase;
5929 int64_t ConstOffset;
5930 int64_t ImmOffset = 0;
5931
5932 // Match the immediate offset first, which canonically is moved as low as
5933 // possible.
5934 std::tie(PtrBase, ConstOffset, std::ignore) =
5935 getPtrBaseWithConstantOffset(Addr, *MRI);
5936
5937 if (ConstOffset != 0) {
5938 if (NeedIOffset &&
5939 TII.isLegalFLATOffset(ConstOffset, AMDGPUAS::GLOBAL_ADDRESS,
5940 SIInstrFlags::FlatGlobal)) {
5941 Addr = PtrBase;
5942 ImmOffset = ConstOffset;
5943 } else {
5944 auto PtrBaseDef = getDefSrcRegIgnoringCopies(PtrBase, *MRI);
5945 if (isSGPR(PtrBaseDef->Reg)) {
5946 if (ConstOffset > 0) {
5947 // Offset is too large.
5948 //
5949 // saddr + large_offset -> saddr +
5950 // (voffset = large_offset & ~MaxOffset) +
5951 // (large_offset & MaxOffset);
5952 int64_t SplitImmOffset = 0, RemainderOffset = ConstOffset;
5953 if (NeedIOffset) {
5954 std::tie(SplitImmOffset, RemainderOffset) =
5955 TII.splitFlatOffset(ConstOffset, AMDGPUAS::GLOBAL_ADDRESS,
5956 SIInstrFlags::FlatGlobal);
5957 }
5958
5959 if (Subtarget->hasSignedGVSOffset() ? isInt<32>(RemainderOffset)
5960 : isUInt<32>(RemainderOffset)) {
5961 MachineInstr *MI = Root.getParent();
5962 MachineBasicBlock *MBB = MI->getParent();
5963 Register HighBits =
5964 MRI->createVirtualRegister(&AMDGPU::VGPR_32RegClass);
5965
5966 BuildMI(*MBB, MI, MI->getDebugLoc(), TII.get(AMDGPU::V_MOV_B32_e32),
5967 HighBits)
5968 .addImm(RemainderOffset);
5969
5970 if (NeedIOffset)
5971 return {{
5972 [=](MachineInstrBuilder &MIB) {
5973 MIB.addReg(PtrBase);
5974 }, // saddr
5975 [=](MachineInstrBuilder &MIB) {
5976 MIB.addReg(HighBits);
5977 }, // voffset
5978 [=](MachineInstrBuilder &MIB) { MIB.addImm(SplitImmOffset); },
5979 [=](MachineInstrBuilder &MIB) { MIB.addImm(CPolBits); },
5980 }};
5981 return {{
5982 [=](MachineInstrBuilder &MIB) { MIB.addReg(PtrBase); }, // saddr
5983 [=](MachineInstrBuilder &MIB) {
5984 MIB.addReg(HighBits);
5985 }, // voffset
5986 [=](MachineInstrBuilder &MIB) { MIB.addImm(CPolBits); },
5987 }};
5988 }
5989 }
5990
5991 // We are adding a 64 bit SGPR and a constant. If constant bus limit
5992 // is 1 we would need to perform 1 or 2 extra moves for each half of
5993 // the constant and it is better to do a scalar add and then issue a
5994 // single VALU instruction to materialize zero. Otherwise it is less
5995 // instructions to perform VALU adds with immediates or inline literals.
5996 unsigned NumLiterals =
5997 !TII.isInlineConstant(APInt(32, Lo_32(ConstOffset))) +
5998 !TII.isInlineConstant(APInt(32, Hi_32(ConstOffset)));
5999 if (STI.getConstantBusLimit(AMDGPU::V_ADD_U32_e64) > NumLiterals)
6000 return std::nullopt;
6001 }
6002 }
6003 }
6004
6005 // Match the variable offset.
6006 auto AddrDef = getDefSrcRegIgnoringCopies(Addr, *MRI);
6007 if (AddrDef->MI->getOpcode() == AMDGPU::G_PTR_ADD) {
6008 // Look through the SGPR->VGPR copy.
6009 Register SAddr =
6010 getSrcRegIgnoringCopies(AddrDef->MI->getOperand(1).getReg(), *MRI);
6011
6012 if (isSGPR(SAddr)) {
6013 Register PtrBaseOffset = AddrDef->MI->getOperand(2).getReg();
6014
6015 // It's possible voffset is an SGPR here, but the copy to VGPR will be
6016 // inserted later.
6017 bool ScaleOffset = selectScaleOffset(Root, PtrBaseOffset,
6018 Subtarget->hasSignedGVSOffset());
6019 if (Register VOffset = matchExtendFromS32OrS32(
6020 PtrBaseOffset, Subtarget->hasSignedGVSOffset())) {
6021 if (NeedIOffset)
6022 return {{[=](MachineInstrBuilder &MIB) { // saddr
6023 MIB.addReg(SAddr);
6024 },
6025 [=](MachineInstrBuilder &MIB) { // voffset
6026 MIB.addReg(VOffset);
6027 },
6028 [=](MachineInstrBuilder &MIB) { // offset
6029 MIB.addImm(ImmOffset);
6030 },
6031 [=](MachineInstrBuilder &MIB) { // cpol
6032 MIB.addImm(CPolBits |
6033 (ScaleOffset ? AMDGPU::CPol::SCAL : 0));
6034 }}};
6035 return {{[=](MachineInstrBuilder &MIB) { // saddr
6036 MIB.addReg(SAddr);
6037 },
6038 [=](MachineInstrBuilder &MIB) { // voffset
6039 MIB.addReg(VOffset);
6040 },
6041 [=](MachineInstrBuilder &MIB) { // cpol
6042 MIB.addImm(CPolBits |
6043 (ScaleOffset ? AMDGPU::CPol::SCAL : 0));
6044 }}};
6045 }
6046 }
6047 }
6048
6049 // FIXME: We should probably have folded COPY (G_IMPLICIT_DEF) earlier, and
6050 // drop this.
6051 if (AddrDef->MI->getOpcode() == AMDGPU::G_IMPLICIT_DEF ||
6052 AddrDef->MI->getOpcode() == AMDGPU::G_CONSTANT || !isSGPR(AddrDef->Reg))
6053 return std::nullopt;
6054
6055 // It's cheaper to materialize a single 32-bit zero for vaddr than the two
6056 // moves required to copy a 64-bit SGPR to VGPR.
6057 MachineInstr *MI = Root.getParent();
6058 MachineBasicBlock *MBB = MI->getParent();
6059 Register VOffset = MRI->createVirtualRegister(&AMDGPU::VGPR_32RegClass);
6060
6061 BuildMI(*MBB, MI, MI->getDebugLoc(), TII.get(AMDGPU::V_MOV_B32_e32), VOffset)
6062 .addImm(0);
6063
6064 if (NeedIOffset)
6065 return {{
6066 [=](MachineInstrBuilder &MIB) { MIB.addReg(AddrDef->Reg); }, // saddr
6067 [=](MachineInstrBuilder &MIB) { MIB.addReg(VOffset); }, // voffset
6068 [=](MachineInstrBuilder &MIB) { MIB.addImm(ImmOffset); }, // offset
6069 [=](MachineInstrBuilder &MIB) { MIB.addImm(CPolBits); } // cpol
6070 }};
6071 return {{
6072 [=](MachineInstrBuilder &MIB) { MIB.addReg(AddrDef->Reg); }, // saddr
6073 [=](MachineInstrBuilder &MIB) { MIB.addReg(VOffset); }, // voffset
6074 [=](MachineInstrBuilder &MIB) { MIB.addImm(CPolBits); } // cpol
6075 }};
6076}
6077
6078InstructionSelector::ComplexRendererFns
6079AMDGPUInstructionSelector::selectGlobalSAddr(MachineOperand &Root) const {
6080 return selectGlobalSAddr(Root, 0);
6081}
6082
6083InstructionSelector::ComplexRendererFns
6084AMDGPUInstructionSelector::selectGlobalSAddrCPol(MachineOperand &Root) const {
6085 const MachineInstr &I = *Root.getParent();
6086
6087 // We are assuming CPol is always the last operand of the intrinsic.
6088 auto PassedCPol =
6089 I.getOperand(I.getNumOperands() - 1).getImm() & ~AMDGPU::CPol::SCAL;
6090 return selectGlobalSAddr(Root, PassedCPol);
6091}
6092
6093InstructionSelector::ComplexRendererFns
6094AMDGPUInstructionSelector::selectGlobalSAddrCPolM0(MachineOperand &Root) const {
6095 const MachineInstr &I = *Root.getParent();
6096
6097 // We are assuming CPol is second from last operand of the intrinsic.
6098 auto PassedCPol =
6099 I.getOperand(I.getNumOperands() - 2).getImm() & ~AMDGPU::CPol::SCAL;
6100 return selectGlobalSAddr(Root, PassedCPol);
6101}
6102
6103InstructionSelector::ComplexRendererFns
6104AMDGPUInstructionSelector::selectGlobalSAddrGLC(MachineOperand &Root) const {
6105 return selectGlobalSAddr(Root, AMDGPU::CPol::GLC);
6106}
6107
6108InstructionSelector::ComplexRendererFns
6109AMDGPUInstructionSelector::selectGlobalSAddrNoIOffset(
6110 MachineOperand &Root) const {
6111 const MachineInstr &I = *Root.getParent();
6112
6113 // We are assuming CPol is always the last operand of the intrinsic.
6114 auto PassedCPol =
6115 I.getOperand(I.getNumOperands() - 1).getImm() & ~AMDGPU::CPol::SCAL;
6116 return selectGlobalSAddr(Root, PassedCPol, false);
6117}
6118
6119InstructionSelector::ComplexRendererFns
6120AMDGPUInstructionSelector::selectGlobalSAddrNoIOffsetM0(
6121 MachineOperand &Root) const {
6122 const MachineInstr &I = *Root.getParent();
6123
6124 // We are assuming CPol is second from last operand of the intrinsic.
6125 auto PassedCPol =
6126 I.getOperand(I.getNumOperands() - 2).getImm() & ~AMDGPU::CPol::SCAL;
6127 return selectGlobalSAddr(Root, PassedCPol, false);
6128}
6129
6130InstructionSelector::ComplexRendererFns
6131AMDGPUInstructionSelector::selectScratchSAddr(MachineOperand &Root) const {
6132 Register Addr = Root.getReg();
6133 Register PtrBase;
6134 int64_t ConstOffset;
6135 int64_t ImmOffset = 0;
6136
6137 // Match the immediate offset first, which canonically is moved as low as
6138 // possible.
6139 std::tie(PtrBase, ConstOffset, std::ignore) =
6140 getPtrBaseWithConstantOffset(Addr, *MRI);
6141
6142 if (ConstOffset != 0 && isFlatScratchBaseLegal(Addr) &&
6143 TII.isLegalFLATOffset(ConstOffset, AMDGPUAS::PRIVATE_ADDRESS,
6144 SIInstrFlags::FlatScratch)) {
6145 Addr = PtrBase;
6146 ImmOffset = ConstOffset;
6147 }
6148
6149 auto AddrDef = getDefSrcRegIgnoringCopies(Addr, *MRI);
6150 if (AddrDef->MI->getOpcode() == AMDGPU::G_FRAME_INDEX) {
6151 int FI = AddrDef->MI->getOperand(1).getIndex();
6152 return {{
6153 [=](MachineInstrBuilder &MIB) { MIB.addFrameIndex(FI); }, // saddr
6154 [=](MachineInstrBuilder &MIB) { MIB.addImm(ImmOffset); } // offset
6155 }};
6156 }
6157
6158 Register SAddr = AddrDef->Reg;
6159
6160 if (AddrDef->MI->getOpcode() == AMDGPU::G_PTR_ADD) {
6161 Register LHS = AddrDef->MI->getOperand(1).getReg();
6162 Register RHS = AddrDef->MI->getOperand(2).getReg();
6163 auto LHSDef = getDefSrcRegIgnoringCopies(LHS, *MRI);
6164 auto RHSDef = getDefSrcRegIgnoringCopies(RHS, *MRI);
6165
6166 if (LHSDef->MI->getOpcode() == AMDGPU::G_FRAME_INDEX &&
6167 isSGPR(RHSDef->Reg)) {
6168 int FI = LHSDef->MI->getOperand(1).getIndex();
6169 MachineInstr &I = *Root.getParent();
6170 MachineBasicBlock *BB = I.getParent();
6171 const DebugLoc &DL = I.getDebugLoc();
6172 SAddr = MRI->createVirtualRegister(&AMDGPU::SReg_32RegClass);
6173
6174 BuildMI(*BB, &I, DL, TII.get(AMDGPU::S_ADD_I32), SAddr)
6175 .addFrameIndex(FI)
6176 .addReg(RHSDef->Reg)
6177 .setOperandDead(3); // Dead scc
6178 }
6179 }
6180
6181 if (!isSGPR(SAddr))
6182 return std::nullopt;
6183
6184 return {{
6185 [=](MachineInstrBuilder &MIB) { MIB.addReg(SAddr); }, // saddr
6186 [=](MachineInstrBuilder &MIB) { MIB.addImm(ImmOffset); } // offset
6187 }};
6188}
6189
6190// Check whether the flat scratch SVS swizzle bug affects this access.
6191bool AMDGPUInstructionSelector::checkFlatScratchSVSSwizzleBug(
6192 Register VAddr, Register SAddr, uint64_t ImmOffset) const {
6193 if (!Subtarget->hasFlatScratchSVSSwizzleBug())
6194 return false;
6195
6196 // The bug affects the swizzling of SVS accesses if there is any carry out
6197 // from the two low order bits (i.e. from bit 1 into bit 2) when adding
6198 // voffset to (soffset + inst_offset).
6199 auto VKnown = VT->getKnownBits(VAddr);
6200 auto SKnown = KnownBits::add(VT->getKnownBits(SAddr),
6201 KnownBits::makeConstant(APInt(32, ImmOffset)));
6202 uint64_t VMax = VKnown.getMaxValue().getZExtValue();
6203 uint64_t SMax = SKnown.getMaxValue().getZExtValue();
6204 return (VMax & 3) + (SMax & 3) >= 4;
6205}
6206
6207InstructionSelector::ComplexRendererFns
6208AMDGPUInstructionSelector::selectScratchSVAddr(MachineOperand &Root) const {
6209 Register Addr = Root.getReg();
6210 Register PtrBase;
6211 int64_t ConstOffset;
6212 int64_t ImmOffset = 0;
6213
6214 // Match the immediate offset first, which canonically is moved as low as
6215 // possible.
6216 std::tie(PtrBase, ConstOffset, std::ignore) =
6217 getPtrBaseWithConstantOffset(Addr, *MRI);
6218
6219 Register OrigAddr = Addr;
6220 if (ConstOffset != 0 &&
6221 TII.isLegalFLATOffset(ConstOffset, AMDGPUAS::PRIVATE_ADDRESS,
6222 SIInstrFlags::FlatScratch)) {
6223 Addr = PtrBase;
6224 ImmOffset = ConstOffset;
6225 }
6226
6227 auto AddrDef = getDefSrcRegIgnoringCopies(Addr, *MRI);
6228 if (AddrDef->MI->getOpcode() != AMDGPU::G_PTR_ADD)
6229 return std::nullopt;
6230
6231 Register RHS = AddrDef->MI->getOperand(2).getReg();
6232 if (RBI.getRegBank(RHS, *MRI, TRI)->getID() != AMDGPU::VGPRRegBankID)
6233 return std::nullopt;
6234
6235 Register LHS = AddrDef->MI->getOperand(1).getReg();
6236 auto LHSDef = getDefSrcRegIgnoringCopies(LHS, *MRI);
6237
6238 if (OrigAddr != Addr) {
6239 if (!isFlatScratchBaseLegalSVImm(OrigAddr))
6240 return std::nullopt;
6241 } else {
6242 if (!isFlatScratchBaseLegalSV(OrigAddr))
6243 return std::nullopt;
6244 }
6245
6246 if (checkFlatScratchSVSSwizzleBug(RHS, LHS, ImmOffset))
6247 return std::nullopt;
6248
6249 unsigned CPol = selectScaleOffset(Root, RHS, true /* IsSigned */)
6250 ? AMDGPU::CPol::SCAL
6251 : 0;
6252
6253 if (LHSDef->MI->getOpcode() == AMDGPU::G_FRAME_INDEX) {
6254 int FI = LHSDef->MI->getOperand(1).getIndex();
6255 return {{
6256 [=](MachineInstrBuilder &MIB) { MIB.addReg(RHS); }, // vaddr
6257 [=](MachineInstrBuilder &MIB) { MIB.addFrameIndex(FI); }, // saddr
6258 [=](MachineInstrBuilder &MIB) { MIB.addImm(ImmOffset); }, // offset
6259 [=](MachineInstrBuilder &MIB) { MIB.addImm(CPol); } // cpol
6260 }};
6261 }
6262
6263 if (!isSGPR(LHS))
6264 if (auto Def = getDefSrcRegIgnoringCopies(LHS, *MRI))
6265 LHS = Def->Reg;
6266
6267 if (!isSGPR(LHS))
6268 return std::nullopt;
6269
6270 return {{
6271 [=](MachineInstrBuilder &MIB) { MIB.addReg(RHS); }, // vaddr
6272 [=](MachineInstrBuilder &MIB) { MIB.addReg(LHS); }, // saddr
6273 [=](MachineInstrBuilder &MIB) { MIB.addImm(ImmOffset); }, // offset
6274 [=](MachineInstrBuilder &MIB) { MIB.addImm(CPol); } // cpol
6275 }};
6276}
6277
6278InstructionSelector::ComplexRendererFns
6279AMDGPUInstructionSelector::selectMUBUFScratchOffen(MachineOperand &Root) const {
6280 MachineInstr *MI = Root.getParent();
6281 MachineBasicBlock *MBB = MI->getParent();
6282 MachineFunction *MF = MBB->getParent();
6283 const SIMachineFunctionInfo *Info = MF->getInfo<SIMachineFunctionInfo>();
6284
6285 int64_t Offset = 0;
6286 if (mi_match(Root.getReg(), *MRI, m_ICst(Offset)) &&
6287 !TII.isLegalMUBUFImmOffset(Offset)) {
6288 Register HighBits = MRI->createVirtualRegister(&AMDGPU::VGPR_32RegClass);
6289
6290 // TODO: Should this be inside the render function? The iterator seems to
6291 // move.
6292 const uint32_t MaxOffset = SIInstrInfo::getMaxMUBUFImmOffset(*Subtarget);
6293 BuildMI(*MBB, MI, MI->getDebugLoc(), TII.get(AMDGPU::V_MOV_B32_e32),
6294 HighBits)
6295 .addImm(Offset & ~MaxOffset);
6296
6297 return {{[=](MachineInstrBuilder &MIB) { // rsrc
6298 MIB.addReg(Info->getScratchRSrcReg());
6299 },
6300 [=](MachineInstrBuilder &MIB) { // vaddr
6301 MIB.addReg(HighBits);
6302 },
6303 [=](MachineInstrBuilder &MIB) { // soffset
6304 // Use constant zero for soffset and rely on eliminateFrameIndex
6305 // to choose the appropriate frame register if need be.
6306 MIB.addImm(0);
6307 },
6308 [=](MachineInstrBuilder &MIB) { // offset
6309 MIB.addImm(Offset & MaxOffset);
6310 }}};
6311 }
6312
6313 assert(Offset == 0 || Offset == -1);
6314
6315 // Try to fold a frame index directly into the MUBUF vaddr field, and any
6316 // offsets.
6317 std::optional<int> FI;
6318 Register VAddr = Root.getReg();
6319
6320 const MachineInstr *RootDef = MRI->getVRegDef(Root.getReg());
6321 Register PtrBase;
6322 int64_t ConstOffset;
6323 std::tie(PtrBase, ConstOffset, std::ignore) =
6324 getPtrBaseWithConstantOffset(VAddr, *MRI);
6325 if (ConstOffset != 0) {
6326 if (TII.isLegalMUBUFImmOffset(ConstOffset) &&
6327 (!STI.privateMemoryResourceIsRangeChecked() ||
6328 VT->signBitIsZero(PtrBase))) {
6329 const MachineInstr *PtrBaseDef = MRI->getVRegDef(PtrBase);
6330 if (PtrBaseDef->getOpcode() == AMDGPU::G_FRAME_INDEX)
6331 FI = PtrBaseDef->getOperand(1).getIndex();
6332 else
6333 VAddr = PtrBase;
6334 Offset = ConstOffset;
6335 }
6336 } else if (RootDef->getOpcode() == AMDGPU::G_FRAME_INDEX) {
6337 FI = RootDef->getOperand(1).getIndex();
6338 }
6339
6340 return {{[=](MachineInstrBuilder &MIB) { // rsrc
6341 MIB.addReg(Info->getScratchRSrcReg());
6342 },
6343 [=](MachineInstrBuilder &MIB) { // vaddr
6344 if (FI)
6345 MIB.addFrameIndex(*FI);
6346 else
6347 MIB.addReg(VAddr);
6348 },
6349 [=](MachineInstrBuilder &MIB) { // soffset
6350 // Use constant zero for soffset and rely on eliminateFrameIndex
6351 // to choose the appropriate frame register if need be.
6352 MIB.addImm(0);
6353 },
6354 [=](MachineInstrBuilder &MIB) { // offset
6355 MIB.addImm(Offset);
6356 }}};
6357}
6358
6359bool AMDGPUInstructionSelector::isDSOffsetLegal(Register Base,
6360 int64_t Offset) const {
6361 if (!isUInt<16>(Offset))
6362 return false;
6363
6364 if (STI.hasUsableDSOffset() || STI.unsafeDSOffsetFoldingEnabled())
6365 return true;
6366
6367 // On Southern Islands, instructions with a negative base value and an
6368 // offset don't seem to work.
6369 return VT->signBitIsZero(Base);
6370}
6371
6372bool AMDGPUInstructionSelector::isDSOffset2Legal(Register Base, int64_t Offset0,
6373 int64_t Offset1,
6374 unsigned Size) const {
6375 if (Offset0 % Size != 0 || Offset1 % Size != 0)
6376 return false;
6377 if (!isUInt<8>(Offset0 / Size) || !isUInt<8>(Offset1 / Size))
6378 return false;
6379
6380 if (STI.hasUsableDSOffset() || STI.unsafeDSOffsetFoldingEnabled())
6381 return true;
6382
6383 // On Southern Islands, instructions with a negative base value and an
6384 // offset don't seem to work.
6385 return VT->signBitIsZero(Base);
6386}
6387
6388// Return whether the operation has NoUnsignedWrap property.
6389static bool isNoUnsignedWrap(MachineInstr *Addr) {
6390 return Addr->getOpcode() == TargetOpcode::G_OR ||
6391 (Addr->getOpcode() == TargetOpcode::G_PTR_ADD &&
6392 Addr->getFlag(MachineInstr::NoUWrap));
6393}
6394
6395// Check that the base address of flat scratch load/store in the form of `base +
6396// offset` is legal to be put in SGPR/VGPR (i.e. unsigned per hardware
6397// requirement). We always treat the first operand as the base address here.
6398bool AMDGPUInstructionSelector::isFlatScratchBaseLegal(Register Addr) const {
6399 MachineInstr *AddrMI = getDefIgnoringCopies(Addr, *MRI);
6400
6401 if (isNoUnsignedWrap(AddrMI))
6402 return true;
6403
6404 // Starting with GFX12, VADDR and SADDR fields in VSCRATCH can use negative
6405 // values.
6406 if (STI.hasSignedScratchOffsets())
6407 return true;
6408
6409 Register LHS = AddrMI->getOperand(1).getReg();
6410 Register RHS = AddrMI->getOperand(2).getReg();
6411
6412 if (AddrMI->getOpcode() == TargetOpcode::G_PTR_ADD) {
6413 std::optional<ValueAndVReg> RhsValReg =
6414 getIConstantVRegValWithLookThrough(RHS, *MRI);
6415 // If the immediate offset is negative and within certain range, the base
6416 // address cannot also be negative. If the base is also negative, the sum
6417 // would be either negative or much larger than the valid range of scratch
6418 // memory a thread can access.
6419 if (RhsValReg && RhsValReg->Value.getSExtValue() < 0 &&
6420 RhsValReg->Value.getSExtValue() > -0x40000000)
6421 return true;
6422 }
6423
6424 return VT->signBitIsZero(LHS);
6425}
6426
6427// Check that the address values in SGPR/VGPR are legal for flat scratch in
6428// the form of: SGPR + VGPR.
6429bool AMDGPUInstructionSelector::isFlatScratchBaseLegalSV(Register Addr) const {
6430 MachineInstr *AddrMI = getDefIgnoringCopies(Addr, *MRI);
6431
6432 if (isNoUnsignedWrap(AddrMI))
6433 return true;
6434
6435 // Starting with GFX12, VADDR and SADDR fields in VSCRATCH can use negative
6436 // values.
6437 if (STI.hasSignedScratchOffsets())
6438 return true;
6439
6440 Register LHS = AddrMI->getOperand(1).getReg();
6441 Register RHS = AddrMI->getOperand(2).getReg();
6442 return VT->signBitIsZero(RHS) && VT->signBitIsZero(LHS);
6443}
6444
6445// Check that the address values in SGPR/VGPR are legal for flat scratch in
6446// the form of: SGPR + VGPR + Imm.
6447bool AMDGPUInstructionSelector::isFlatScratchBaseLegalSVImm(
6448 Register Addr) const {
6449 // Starting with GFX12, VADDR and SADDR fields in VSCRATCH can use negative
6450 // values.
6451 if (STI.hasSignedScratchOffsets())
6452 return true;
6453
6454 MachineInstr *AddrMI = getDefIgnoringCopies(Addr, *MRI);
6455 Register Base = AddrMI->getOperand(1).getReg();
6456 std::optional<DefinitionAndSourceRegister> BaseDef =
6457 getDefSrcRegIgnoringCopies(Base, *MRI);
6458 std::optional<ValueAndVReg> RHSOffset =
6459 getIConstantVRegValWithLookThrough(AddrMI->getOperand(2).getReg(), *MRI);
6460 assert(RHSOffset);
6461
6462 // If the immediate offset is negative and within certain range, the base
6463 // address cannot also be negative. If the base is also negative, the sum
6464 // would be either negative or much larger than the valid range of scratch
6465 // memory a thread can access.
6466 if (isNoUnsignedWrap(BaseDef->MI) &&
6467 (isNoUnsignedWrap(AddrMI) ||
6468 (RHSOffset->Value.getSExtValue() < 0 &&
6469 RHSOffset->Value.getSExtValue() > -0x40000000)))
6470 return true;
6471
6472 Register LHS = BaseDef->MI->getOperand(1).getReg();
6473 Register RHS = BaseDef->MI->getOperand(2).getReg();
6474 return VT->signBitIsZero(RHS) && VT->signBitIsZero(LHS);
6475}
6476
6477bool AMDGPUInstructionSelector::isUnneededShiftMask(const MachineInstr &MI,
6478 unsigned ShAmtBits) const {
6479 assert(MI.getOpcode() == TargetOpcode::G_AND);
6480
6481 std::optional<APInt> RHS =
6482 getIConstantVRegVal(MI.getOperand(2).getReg(), *MRI);
6483 if (!RHS)
6484 return false;
6485
6486 if (RHS->countr_one() >= ShAmtBits)
6487 return true;
6488
6489 const APInt &LHSKnownZeros = VT->getKnownZeroes(MI.getOperand(1).getReg());
6490 return (LHSKnownZeros | *RHS).countr_one() >= ShAmtBits;
6491}
6492
6493InstructionSelector::ComplexRendererFns
6494AMDGPUInstructionSelector::selectMUBUFScratchOffset(
6495 MachineOperand &Root) const {
6496 Register Reg = Root.getReg();
6497 const SIMachineFunctionInfo *Info = MF->getInfo<SIMachineFunctionInfo>();
6498
6499 std::optional<DefinitionAndSourceRegister> Def =
6500 getDefSrcRegIgnoringCopies(Reg, *MRI);
6501 assert(Def && "this shouldn't be an optional result");
6502 Reg = Def->Reg;
6503
6504 if (Register WaveBase = getWaveAddress(Def->MI)) {
6505 return {{
6506 [=](MachineInstrBuilder &MIB) { // rsrc
6507 MIB.addReg(Info->getScratchRSrcReg());
6508 },
6509 [=](MachineInstrBuilder &MIB) { // soffset
6510 MIB.addReg(WaveBase);
6511 },
6512 [=](MachineInstrBuilder &MIB) { MIB.addImm(0); } // offset
6513 }};
6514 }
6515
6516 int64_t Offset = 0;
6517
6518 // FIXME: Copy check is a hack
6519 Register BasePtr;
6520 if (mi_match(Reg, *MRI,
6521 m_GPtrAdd(m_Reg(BasePtr),
6522 m_Copy(m_ICst(Offset))))) {
6523 if (!TII.isLegalMUBUFImmOffset(Offset))
6524 return {};
6525 MachineInstr *BasePtrDef = getDefIgnoringCopies(BasePtr, *MRI);
6526 Register WaveBase = getWaveAddress(BasePtrDef);
6527 if (!WaveBase)
6528 return {};
6529
6530 return {{
6531 [=](MachineInstrBuilder &MIB) { // rsrc
6532 MIB.addReg(Info->getScratchRSrcReg());
6533 },
6534 [=](MachineInstrBuilder &MIB) { // soffset
6535 MIB.addReg(WaveBase);
6536 },
6537 [=](MachineInstrBuilder &MIB) { MIB.addImm(Offset); } // offset
6538 }};
6539 }
6540
6541 if (!mi_match(Root.getReg(), *MRI, m_ICst(Offset)) ||
6542 !TII.isLegalMUBUFImmOffset(Offset))
6543 return {};
6544
6545 return {{
6546 [=](MachineInstrBuilder &MIB) { // rsrc
6547 MIB.addReg(Info->getScratchRSrcReg());
6548 },
6549 [=](MachineInstrBuilder &MIB) { // soffset
6550 MIB.addImm(0);
6551 },
6552 [=](MachineInstrBuilder &MIB) { MIB.addImm(Offset); } // offset
6553 }};
6554}
6555
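// Split the DS address in Root into a base register and a byte offset when
// the offset is legal for a single-address DS instruction; otherwise return
// the whole address with a zero offset.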
6556std::pair<Register, unsigned>
6557AMDGPUInstructionSelector::selectDS1Addr1OffsetImpl(MachineOperand &Root) const {
6558 const MachineInstr *RootDef = MRI->getVRegDef(Root.getReg());
6559 int64_t ConstAddr = 0;
6560
6561 Register PtrBase;
6562 int64_t Offset;
6563 std::tie(PtrBase, Offset, std::ignore) =
6564 getPtrBaseWithConstantOffset(Root.getReg(), *MRI);
6565
6566 if (Offset) {
6567 if (isDSOffsetLegal(PtrBase, Offset)) {
6568 // (add n0, c0)
6569 return std::pair(PtrBase, Offset);
6570 }
6571 } else if (RootDef->getOpcode() == AMDGPU::G_SUB) {
6572 // TODO
6573
6574
6575 } else if (mi_match(Root.getReg(), *MRI, m_ICst(ConstAddr))) {
6576 // TODO
6577
6578 }
6579
6580 return std::pair(Root.getReg(), 0);
6581}
6582
6583InstructionSelector::ComplexRendererFns
6584AMDGPUInstructionSelector::selectDS1Addr1Offset(MachineOperand &Root) const {
6585 Register Reg;
6586 unsigned Offset;
6587 std::tie(Reg, Offset) = selectDS1Addr1OffsetImpl(Root);
6588 return {{
6589 [=](MachineInstrBuilder &MIB) { MIB.addReg(Reg); },
6590 [=](MachineInstrBuilder &MIB) { MIB.addImm(Offset); }
6591 }};
6592}
6593
6594InstructionSelector::ComplexRendererFns
6595AMDGPUInstructionSelector::selectDS64Bit4ByteAligned(MachineOperand &Root) const {
6596 return selectDSReadWrite2(Root, 4);
6597}
6598
6599InstructionSelector::ComplexRendererFns
6600AMDGPUInstructionSelector::selectDS128Bit8ByteAligned(MachineOperand &Root) const {
6601 return selectDSReadWrite2(Root, 8);
6602}
6603
6604InstructionSelector::ComplexRendererFns
6605AMDGPUInstructionSelector::selectDSReadWrite2(MachineOperand &Root,
6606 unsigned Size) const {
6607 Register Reg;
6608 unsigned Offset;
6609 std::tie(Reg, Offset) = selectDSReadWrite2Impl(Root, Size);
6610 return {{
6611 [=](MachineInstrBuilder &MIB) { MIB.addReg(Reg); },
6612 [=](MachineInstrBuilder &MIB) { MIB.addImm(Offset); },
6613 [=](MachineInstrBuilder &MIB) { MIB.addImm(Offset+1); }
6614 }};
6615}
6616
6617std::pair<Register, unsigned>
6618AMDGPUInstructionSelector::selectDSReadWrite2Impl(MachineOperand &Root,
6619 unsigned Size) const {
6620 const MachineInstr *RootDef = MRI->getVRegDef(Root.getReg());
6621 int64_t ConstAddr = 0;
6622
6623 Register PtrBase;
6624 int64_t Offset;
6625 std::tie(PtrBase, Offset, std::ignore) =
6626 getPtrBaseWithConstantOffset(Root.getReg(), *MRI);
6627
6628 if (Offset) {
6629 int64_t OffsetValue0 = Offset;
6630 int64_t OffsetValue1 = Offset + Size;
6631 if (isDSOffset2Legal(PtrBase, OffsetValue0, OffsetValue1, Size)) {
6632 // (add n0, c0)
6633 return std::pair(PtrBase, OffsetValue0 / Size);
6634 }
6635 } else if (RootDef->getOpcode() == AMDGPU::G_SUB) {
6636 // TODO
6637
6638 } else if (mi_match(Root.getReg(), *MRI, m_ICst(ConstAddr))) {
6639 // TODO
6640
6641 }
6642
6643 return std::pair(Root.getReg(), 0);
6644}
6645
6646/// If \p Root is a G_PTR_ADD with a G_CONSTANT on the right hand side, return
6647/// the base value with the constant offset, and whether the offset computation is
6648/// known to be inbounds. There may be intervening copies between \p Root and
6649/// the identified constant. Returns \p Root, 0, false if this does not match
6650/// the pattern.
6651std::tuple<Register, int64_t, bool>
6652AMDGPUInstructionSelector::getPtrBaseWithConstantOffset(
6653 Register Root, const MachineRegisterInfo &MRI) const {
6654 MachineInstr *RootI = getDefIgnoringCopies(Root, MRI);
6655 if (RootI->getOpcode() != TargetOpcode::G_PTR_ADD)
6656 return {Root, 0, false};
6657
6658 MachineOperand &RHS = RootI->getOperand(2);
6659  std::optional<ValueAndVReg> MaybeOffset =
6660      getIConstantVRegValWithLookThrough(RHS.getReg(), MRI);
6661  if (!MaybeOffset)
6662 return {Root, 0, false};
6663 bool IsInBounds = RootI->getFlag(MachineInstr::MIFlag::InBounds);
6664 return {RootI->getOperand(1).getReg(), MaybeOffset->Value.getSExtValue(),
6665 IsInBounds};
6666}
6667
6668static void addZeroImm(MachineInstrBuilder &MIB) {
6669  MIB.addImm(0);
6670}
6671
6672/// Return a resource descriptor for use with an arbitrary 64-bit pointer. If \p
6673/// BasePtr is not valid, a null base pointer will be used.
6674static Register buildRSRC(MachineIRBuilder &B, MachineRegisterInfo &MRI,
6675                          uint32_t FormatLo, uint32_t FormatHi,
6676 Register BasePtr) {
6677 Register RSrc2 = MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass);
6678 Register RSrc3 = MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass);
6679 Register RSrcHi = MRI.createVirtualRegister(&AMDGPU::SReg_64RegClass);
6680 Register RSrc = MRI.createVirtualRegister(&AMDGPU::SGPR_128RegClass);
6681
6682 B.buildInstr(AMDGPU::S_MOV_B32)
6683 .addDef(RSrc2)
6684 .addImm(FormatLo);
6685 B.buildInstr(AMDGPU::S_MOV_B32)
6686 .addDef(RSrc3)
6687 .addImm(FormatHi);
6688
6689 // Build the half of the subregister with the constants before building the
6690 // full 128-bit register. If we are building multiple resource descriptors,
6691 // this will allow CSEing of the 2-component register.
6692 B.buildInstr(AMDGPU::REG_SEQUENCE)
6693 .addDef(RSrcHi)
6694 .addReg(RSrc2)
6695 .addImm(AMDGPU::sub0)
6696 .addReg(RSrc3)
6697 .addImm(AMDGPU::sub1);
6698
6699 Register RSrcLo = BasePtr;
6700 if (!BasePtr) {
6701 RSrcLo = MRI.createVirtualRegister(&AMDGPU::SReg_64RegClass);
6702 B.buildInstr(AMDGPU::S_MOV_B64)
6703 .addDef(RSrcLo)
6704 .addImm(0);
6705 }
6706
6707 B.buildInstr(AMDGPU::REG_SEQUENCE)
6708 .addDef(RSrc)
6709 .addReg(RSrcLo)
6710 .addImm(AMDGPU::sub0_sub1)
6711 .addReg(RSrcHi)
6712 .addImm(AMDGPU::sub2_sub3);
6713
6714 return RSrc;
6715}
6716
6717static Register buildAddr64RSrc(MachineIRBuilder &B, MachineRegisterInfo &MRI,
6718                                const SIInstrInfo &TII, Register BasePtr) {
6719 uint64_t DefaultFormat = TII.getDefaultRsrcDataFormat();
6720
6721 // FIXME: Why are half the "default" bits ignored based on the addressing
6722 // mode?
6723 return buildRSRC(B, MRI, 0, Hi_32(DefaultFormat), BasePtr);
6724}
6725
6726static Register buildOffsetSrc(MachineIRBuilder &B, MachineRegisterInfo &MRI,
6727                               const SIInstrInfo &TII, Register BasePtr) {
6728 uint64_t DefaultFormat = TII.getDefaultRsrcDataFormat();
6729
6730 // FIXME: Why are half the "default" bits ignored based on the addressing
6731 // mode?
6732 return buildRSRC(B, MRI, -1, Hi_32(DefaultFormat), BasePtr);
6733}
6734
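// Decompose a MUBUF address into its components: the base register N0, an
// optional constant offset, and, when N0 itself is a G_PTR_ADD, its two
// addends N2 and N3.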
6735AMDGPUInstructionSelector::MUBUFAddressData
6736AMDGPUInstructionSelector::parseMUBUFAddress(Register Src) const {
6737 MUBUFAddressData Data;
6738 Data.N0 = Src;
6739
6740 Register PtrBase;
6741 int64_t Offset;
6742
6743 std::tie(PtrBase, Offset, std::ignore) =
6744 getPtrBaseWithConstantOffset(Src, *MRI);
6745 if (isUInt<32>(Offset)) {
6746 Data.N0 = PtrBase;
6747 Data.Offset = Offset;
6748 }
6749
6750 if (MachineInstr *InputAdd
6751 = getOpcodeDef(TargetOpcode::G_PTR_ADD, Data.N0, *MRI)) {
6752 Data.N2 = InputAdd->getOperand(1).getReg();
6753 Data.N3 = InputAdd->getOperand(2).getReg();
6754
6755    // FIXME: Need to fix extra SGPR->VGPR copies inserted
6756    // FIXME: Don't know that this was defined by operand 0
6757 //
6758 // TODO: Remove this when we have copy folding optimizations after
6759 // RegBankSelect.
6760 Data.N2 = getDefIgnoringCopies(Data.N2, *MRI)->getOperand(0).getReg();
6761 Data.N3 = getDefIgnoringCopies(Data.N3, *MRI)->getOperand(0).getReg();
6762 }
6763
6764 return Data;
6765}
6766
6767/// Return whether the addr64 MUBUF addressing mode should be used for the given address.
6768bool AMDGPUInstructionSelector::shouldUseAddr64(MUBUFAddressData Addr) const {
6769 // (ptr_add N2, N3) -> addr64, or
6770 // (ptr_add (ptr_add N2, N3), C1) -> addr64
6771 if (Addr.N2)
6772 return true;
6773
6774 const RegisterBank *N0Bank = RBI.getRegBank(Addr.N0, *MRI, TRI);
6775 return N0Bank->getID() == AMDGPU::VGPRRegBankID;
6776}
6777
6778/// Split an immediate offset \p ImmOffset depending on whether it fits in the
6779/// immediate field. Modifies \p ImmOffset and sets \p SOffset to the variable
6780/// component.
6781void AMDGPUInstructionSelector::splitIllegalMUBUFOffset(
6782 MachineIRBuilder &B, Register &SOffset, int64_t &ImmOffset) const {
6783 if (TII.isLegalMUBUFImmOffset(ImmOffset))
6784 return;
6785
6786 // Illegal offset, store it in soffset.
6787 SOffset = MRI->createVirtualRegister(&AMDGPU::SReg_32RegClass);
6788 B.buildInstr(AMDGPU::S_MOV_B32)
6789 .addDef(SOffset)
6790 .addImm(ImmOffset);
6791 ImmOffset = 0;
6792}
6793
6794bool AMDGPUInstructionSelector::selectMUBUFAddr64Impl(
6795 MachineOperand &Root, Register &VAddr, Register &RSrcReg,
6796 Register &SOffset, int64_t &Offset) const {
6797 // FIXME: Predicates should stop this from reaching here.
6798 // addr64 bit was removed for volcanic islands.
6799 if (!STI.hasAddr64() || STI.useFlatForGlobal())
6800 return false;
6801
6802 MUBUFAddressData AddrData = parseMUBUFAddress(Root.getReg());
6803 if (!shouldUseAddr64(AddrData))
6804 return false;
6805
6806 Register N0 = AddrData.N0;
6807 Register N2 = AddrData.N2;
6808 Register N3 = AddrData.N3;
6809 Offset = AddrData.Offset;
6810
6811 // Base pointer for the SRD.
6812 Register SRDPtr;
6813
6814 if (N2) {
6815 if (RBI.getRegBank(N2, *MRI, TRI)->getID() == AMDGPU::VGPRRegBankID) {
6816 assert(N3);
6817 if (RBI.getRegBank(N3, *MRI, TRI)->getID() == AMDGPU::VGPRRegBankID) {
6818 // Both N2 and N3 are divergent. Use N0 (the result of the add) as the
6819 // addr64, and construct the default resource from a 0 address.
6820 VAddr = N0;
6821 } else {
6822 SRDPtr = N3;
6823 VAddr = N2;
6824 }
6825 } else {
6826 // N2 is not divergent.
6827 SRDPtr = N2;
6828 VAddr = N3;
6829 }
6830 } else if (RBI.getRegBank(N0, *MRI, TRI)->getID() == AMDGPU::VGPRRegBankID) {
6831 // Use the default null pointer in the resource
6832 VAddr = N0;
6833 } else {
6834 // N0 -> offset, or
6835 // (N0 + C1) -> offset
6836 SRDPtr = N0;
6837 }
6838
6839 MachineIRBuilder B(*Root.getParent());
6840 RSrcReg = buildAddr64RSrc(B, *MRI, TII, SRDPtr);
6841 splitIllegalMUBUFOffset(B, SOffset, Offset);
6842 return true;
6843}
6844
6845bool AMDGPUInstructionSelector::selectMUBUFOffsetImpl(
6846 MachineOperand &Root, Register &RSrcReg, Register &SOffset,
6847 int64_t &Offset) const {
6848
6849 // FIXME: Pattern should not reach here.
6850 if (STI.useFlatForGlobal())
6851 return false;
6852
6853 MUBUFAddressData AddrData = parseMUBUFAddress(Root.getReg());
6854 if (shouldUseAddr64(AddrData))
6855 return false;
6856
6857 // N0 -> offset, or
6858 // (N0 + C1) -> offset
6859 Register SRDPtr = AddrData.N0;
6860 Offset = AddrData.Offset;
6861
6862 // TODO: Look through extensions for 32-bit soffset.
6863 MachineIRBuilder B(*Root.getParent());
6864
6865 RSrcReg = buildOffsetSrc(B, *MRI, TII, SRDPtr);
6866 splitIllegalMUBUFOffset(B, SOffset, Offset);
6867 return true;
6868}
6869
6870InstructionSelector::ComplexRendererFns
6871AMDGPUInstructionSelector::selectMUBUFAddr64(MachineOperand &Root) const {
6872 Register VAddr;
6873 Register RSrcReg;
6874 Register SOffset;
6875 int64_t Offset = 0;
6876
6877 if (!selectMUBUFAddr64Impl(Root, VAddr, RSrcReg, SOffset, Offset))
6878 return {};
6879
6880 // FIXME: Use defaulted operands for trailing 0s and remove from the complex
6881 // pattern.
6882 return {{
6883 [=](MachineInstrBuilder &MIB) { // rsrc
6884 MIB.addReg(RSrcReg);
6885 },
6886 [=](MachineInstrBuilder &MIB) { // vaddr
6887 MIB.addReg(VAddr);
6888 },
6889 [=](MachineInstrBuilder &MIB) { // soffset
6890 if (SOffset)
6891 MIB.addReg(SOffset);
6892 else if (STI.hasRestrictedSOffset())
6893 MIB.addReg(AMDGPU::SGPR_NULL);
6894 else
6895 MIB.addImm(0);
6896 },
6897 [=](MachineInstrBuilder &MIB) { // offset
6898 MIB.addImm(Offset);
6899 },
6900 addZeroImm, // cpol
6901 addZeroImm, // tfe
6902 addZeroImm // swz
6903 }};
6904}
6905
6906InstructionSelector::ComplexRendererFns
6907AMDGPUInstructionSelector::selectMUBUFOffset(MachineOperand &Root) const {
6908 Register RSrcReg;
6909 Register SOffset;
6910 int64_t Offset = 0;
6911
6912 if (!selectMUBUFOffsetImpl(Root, RSrcReg, SOffset, Offset))
6913 return {};
6914
6915 return {{
6916 [=](MachineInstrBuilder &MIB) { // rsrc
6917 MIB.addReg(RSrcReg);
6918 },
6919 [=](MachineInstrBuilder &MIB) { // soffset
6920 if (SOffset)
6921 MIB.addReg(SOffset);
6922 else if (STI.hasRestrictedSOffset())
6923 MIB.addReg(AMDGPU::SGPR_NULL);
6924 else
6925 MIB.addImm(0);
6926 },
6927 [=](MachineInstrBuilder &MIB) { MIB.addImm(Offset); }, // offset
6928 addZeroImm, // cpol
6929 addZeroImm, // tfe
6930 addZeroImm, // swz
6931 }};
6932}
6933
6934InstructionSelector::ComplexRendererFns
6935AMDGPUInstructionSelector::selectBUFSOffset(MachineOperand &Root) const {
6936
6937 Register SOffset = Root.getReg();
6938
6939 if (STI.hasRestrictedSOffset() && mi_match(SOffset, *MRI, m_ZeroInt()))
6940 SOffset = AMDGPU::SGPR_NULL;
6941
6942 return {{[=](MachineInstrBuilder &MIB) { MIB.addReg(SOffset); }}};
6943}
6944
6945/// Get an immediate that must fit in 32 bits and is treated as zero-extended.
6946static std::optional<uint64_t>
6947getConstantZext32Val(Register Reg, const MachineRegisterInfo &MRI) {
6948  // getIConstantVRegVal sexts any values, so see if that matters.
6949 std::optional<int64_t> OffsetVal = getIConstantVRegSExtVal(Reg, MRI);
6950 if (!OffsetVal || !isInt<32>(*OffsetVal))
6951 return std::nullopt;
6952 return Lo_32(*OffsetVal);
6953}
6954
6955InstructionSelector::ComplexRendererFns
6956AMDGPUInstructionSelector::selectSMRDBufferImm(MachineOperand &Root) const {
6957 std::optional<uint64_t> OffsetVal =
6958 Root.isImm() ? Root.getImm() : getConstantZext32Val(Root.getReg(), *MRI);
6959 if (!OffsetVal)
6960 return {};
6961
6962 std::optional<int64_t> EncodedImm =
6963 AMDGPU::getSMRDEncodedOffset(STI, *OffsetVal, true);
6964 if (!EncodedImm)
6965 return {};
6966
6967 return {{ [=](MachineInstrBuilder &MIB) { MIB.addImm(*EncodedImm); } }};
6968}
6969
6970InstructionSelector::ComplexRendererFns
6971AMDGPUInstructionSelector::selectSMRDBufferImm32(MachineOperand &Root) const {
6972 assert(STI.getGeneration() == AMDGPUSubtarget::SEA_ISLANDS);
6973
6974 std::optional<uint64_t> OffsetVal = getConstantZext32Val(Root.getReg(), *MRI);
6975 if (!OffsetVal)
6976 return {};
6977
6978  std::optional<int64_t> EncodedImm =
6979      AMDGPU::getSMRDEncodedLiteralOffset32(STI, *OffsetVal);
6980  if (!EncodedImm)
6981 return {};
6982
6983 return {{ [=](MachineInstrBuilder &MIB) { MIB.addImm(*EncodedImm); } }};
6984}
6985
6986InstructionSelector::ComplexRendererFns
6987AMDGPUInstructionSelector::selectSMRDBufferSgprImm(MachineOperand &Root) const {
6988 // Match the (soffset + offset) pair as a 32-bit register base and
6989 // an immediate offset.
6990 Register SOffset;
6991 unsigned Offset;
6992 std::tie(SOffset, Offset) = AMDGPU::getBaseWithConstantOffset(
6993 *MRI, Root.getReg(), VT, /*CheckNUW*/ true);
6994 if (!SOffset)
6995 return std::nullopt;
6996
6997 std::optional<int64_t> EncodedOffset =
6998 AMDGPU::getSMRDEncodedOffset(STI, Offset, /* IsBuffer */ true);
6999 if (!EncodedOffset)
7000 return std::nullopt;
7001
7002 assert(MRI->getType(SOffset) == LLT::scalar(32));
7003 return {{[=](MachineInstrBuilder &MIB) { MIB.addReg(SOffset); },
7004 [=](MachineInstrBuilder &MIB) { MIB.addImm(*EncodedOffset); }}};
7005}
7006
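// Select a mad_mix/fma_mix-style source: if the value is a G_FPEXT from f16,
// fold the extension into the op_sel/op_sel_hi source modifiers and set
// Matched; otherwise return the plain VOP3 modifiers.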
7007std::pair<Register, unsigned>
7008AMDGPUInstructionSelector::selectVOP3PMadMixModsImpl(MachineOperand &Root,
7009 bool &Matched) const {
7010 Matched = false;
7011
7012 Register Src;
7013 unsigned Mods;
7014 std::tie(Src, Mods) = selectVOP3ModsImpl(Root.getReg());
7015
7016 if (mi_match(Src, *MRI, m_GFPExt(m_Reg(Src)))) {
7017 assert(MRI->getType(Src) == LLT::scalar(16));
7018
7019    // Only change Src if a source modifier could be gained. In such cases the
7020    // new Src could be an SGPR, but this does not violate the constant bus
7021    // restriction for the instruction that is being selected.
7022 Src = stripBitCast(Src, *MRI);
7023
7024 const auto CheckAbsNeg = [&]() {
7025 // Be careful about folding modifiers if we already have an abs. fneg is
7026 // applied last, so we don't want to apply an earlier fneg.
7027 if ((Mods & SISrcMods::ABS) == 0) {
7028 unsigned ModsTmp;
7029 std::tie(Src, ModsTmp) = selectVOP3ModsImpl(Src);
7030
7031 if ((ModsTmp & SISrcMods::NEG) != 0)
7032 Mods ^= SISrcMods::NEG;
7033
7034 if ((ModsTmp & SISrcMods::ABS) != 0)
7035 Mods |= SISrcMods::ABS;
7036 }
7037 };
7038
7039 CheckAbsNeg();
7040
7041 // op_sel/op_sel_hi decide the source type and source.
7042 // If the source's op_sel_hi is set, it indicates to do a conversion from
7043    // fp16. If the source's op_sel is set, it picks the high half of the
7044 // source register.
7045
7046 Mods |= SISrcMods::OP_SEL_1;
7047
7048 if (isExtractHiElt(*MRI, Src, Src)) {
7049 Mods |= SISrcMods::OP_SEL_0;
7050 CheckAbsNeg();
7051 }
7052
7053 Matched = true;
7054 }
7055
7056 return {Src, Mods};
7057}
7058
7059InstructionSelector::ComplexRendererFns
7060AMDGPUInstructionSelector::selectVOP3PMadMixModsExt(
7061 MachineOperand &Root) const {
7062 Register Src;
7063 unsigned Mods;
7064 bool Matched;
7065 std::tie(Src, Mods) = selectVOP3PMadMixModsImpl(Root, Matched);
7066 if (!Matched)
7067 return {};
7068
7069 return {{
7070 [=](MachineInstrBuilder &MIB) { MIB.addReg(Src); },
7071 [=](MachineInstrBuilder &MIB) { MIB.addImm(Mods); } // src_mods
7072 }};
7073}
7074
7075InstructionSelector::ComplexRendererFns
7076AMDGPUInstructionSelector::selectVOP3PMadMixMods(MachineOperand &Root) const {
7077 Register Src;
7078 unsigned Mods;
7079 bool Matched;
7080 std::tie(Src, Mods) = selectVOP3PMadMixModsImpl(Root, Matched);
7081
7082 return {{
7083 [=](MachineInstrBuilder &MIB) { MIB.addReg(Src); },
7084 [=](MachineInstrBuilder &MIB) { MIB.addImm(Mods); } // src_mods
7085 }};
7086}
7087
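// Select s_barrier_signal_isfirst: emit the signal instruction and copy the
// resulting SCC bit into the i1 result register.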
7088bool AMDGPUInstructionSelector::selectSBarrierSignalIsfirst(
7089 MachineInstr &I, Intrinsic::ID IntrID) const {
7090 MachineBasicBlock *MBB = I.getParent();
7091 const DebugLoc &DL = I.getDebugLoc();
7092 Register CCReg = I.getOperand(0).getReg();
7093
7094 // Set SCC to true, in case the barrier instruction gets converted to a NOP.
7095 BuildMI(*MBB, &I, DL, TII.get(AMDGPU::S_CMP_EQ_U32)).addImm(0).addImm(0);
7096
7097 BuildMI(*MBB, &I, DL, TII.get(AMDGPU::S_BARRIER_SIGNAL_ISFIRST_IMM))
7098 .addImm(I.getOperand(2).getImm());
7099
7100 BuildMI(*MBB, &I, DL, TII.get(AMDGPU::COPY), CCReg).addReg(AMDGPU::SCC);
7101
7102 I.eraseFromParent();
7103 return RBI.constrainGenericRegister(CCReg, AMDGPU::SReg_32_XM0_XEXECRegClass,
7104 *MRI);
7105}
7106
7107bool AMDGPUInstructionSelector::selectSGetBarrierState(
7108 MachineInstr &I, Intrinsic::ID IntrID) const {
7109 MachineBasicBlock *MBB = I.getParent();
7110 const DebugLoc &DL = I.getDebugLoc();
7111 const MachineOperand &BarOp = I.getOperand(2);
7112 std::optional<int64_t> BarValImm =
7113 getIConstantVRegSExtVal(BarOp.getReg(), *MRI);
7114
7115 if (!BarValImm) {
7116 auto CopyMIB = BuildMI(*MBB, &I, DL, TII.get(AMDGPU::COPY), AMDGPU::M0)
7117 .addReg(BarOp.getReg());
7118 constrainSelectedInstRegOperands(*CopyMIB, TII, TRI, RBI);
7119 }
7120 MachineInstrBuilder MIB;
7121 unsigned Opc = BarValImm ? AMDGPU::S_GET_BARRIER_STATE_IMM
7122 : AMDGPU::S_GET_BARRIER_STATE_M0;
7123 MIB = BuildMI(*MBB, &I, DL, TII.get(Opc));
7124
7125 auto DstReg = I.getOperand(0).getReg();
7126 const TargetRegisterClass *DstRC =
7127 TRI.getConstrainedRegClassForOperand(I.getOperand(0), *MRI);
7128 if (!DstRC || !RBI.constrainGenericRegister(DstReg, *DstRC, *MRI))
7129 return false;
7130 MIB.addDef(DstReg);
7131 if (BarValImm) {
7132 MIB.addImm(*BarValImm);
7133 }
7134 I.eraseFromParent();
7135 return true;
7136}
7137
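// Pick the immediate or M0 form of a named-barrier opcode, depending on
// whether the barrier ID is available as an inline constant.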
7138unsigned getNamedBarrierOp(bool HasInlineConst, Intrinsic::ID IntrID) {
7139 if (HasInlineConst) {
7140 switch (IntrID) {
7141 default:
7142 llvm_unreachable("not a named barrier op");
7143 case Intrinsic::amdgcn_s_barrier_join:
7144 return AMDGPU::S_BARRIER_JOIN_IMM;
7145 case Intrinsic::amdgcn_s_wakeup_barrier:
7146 return AMDGPU::S_WAKEUP_BARRIER_IMM;
7147 case Intrinsic::amdgcn_s_get_named_barrier_state:
7148 return AMDGPU::S_GET_BARRIER_STATE_IMM;
7149 };
7150 } else {
7151 switch (IntrID) {
7152 default:
7153 llvm_unreachable("not a named barrier op");
7154 case Intrinsic::amdgcn_s_barrier_join:
7155 return AMDGPU::S_BARRIER_JOIN_M0;
7156 case Intrinsic::amdgcn_s_wakeup_barrier:
7157 return AMDGPU::S_WAKEUP_BARRIER_M0;
7158 case Intrinsic::amdgcn_s_get_named_barrier_state:
7159 return AMDGPU::S_GET_BARRIER_STATE_M0;
7160 };
7161 }
7162}
7163
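// Lower s_barrier_init / s_barrier_signal_var: pack the 6-bit barrier ID and
// the member count into M0 and emit the M0 form of the instruction. For
// s_barrier_signal_var with a zero member count and a constant barrier ID, the
// immediate form is used instead.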
7164bool AMDGPUInstructionSelector::selectNamedBarrierInit(
7165 MachineInstr &I, Intrinsic::ID IntrID) const {
7166 MachineBasicBlock *MBB = I.getParent();
7167 const DebugLoc &DL = I.getDebugLoc();
7168 const MachineOperand &BarOp = I.getOperand(1);
7169 const MachineOperand &CntOp = I.getOperand(2);
7170
7171 // A member count of 0 means "keep existing member count". That plus a known
7172 // constant value for the barrier ID lets us use the immarg form.
7173 if (IntrID == Intrinsic::amdgcn_s_barrier_signal_var) {
7174 std::optional<int64_t> CntImm =
7175 getIConstantVRegSExtVal(CntOp.getReg(), *MRI);
7176 if (CntImm && *CntImm == 0) {
7177 std::optional<int64_t> BarValImm =
7178 getIConstantVRegSExtVal(BarOp.getReg(), *MRI);
7179 if (BarValImm) {
7180 auto BarID = ((*BarValImm) >> 4) & 0x3F;
7181 BuildMI(*MBB, &I, DL, TII.get(AMDGPU::S_BARRIER_SIGNAL_IMM))
7182 .addImm(BarID);
7183 I.eraseFromParent();
7184 return true;
7185 }
7186 }
7187 }
7188
7189 // BarID = (BarOp >> 4) & 0x3F
7190 Register TmpReg0 = MRI->createVirtualRegister(&AMDGPU::SReg_32RegClass);
7191 BuildMI(*MBB, &I, DL, TII.get(AMDGPU::S_LSHR_B32), TmpReg0)
7192 .add(BarOp)
7193 .addImm(4u)
7194 .setOperandDead(3); // Dead scc
7195
7196 Register TmpReg1 = MRI->createVirtualRegister(&AMDGPU::SReg_32RegClass);
7197 BuildMI(*MBB, &I, DL, TII.get(AMDGPU::S_AND_B32), TmpReg1)
7198 .addReg(TmpReg0)
7199 .addImm(0x3F)
7200 .setOperandDead(3); // Dead scc
7201
7202 // MO = ((CntOp & 0x3F) << shAmt) | BarID
7203 Register TmpReg2 = MRI->createVirtualRegister(&AMDGPU::SReg_32RegClass);
7204 BuildMI(*MBB, &I, DL, TII.get(AMDGPU::S_AND_B32), TmpReg2)
7205 .add(CntOp)
7206 .addImm(0x3F)
7207 .setOperandDead(3); // Dead scc
7208
7209 Register TmpReg3 = MRI->createVirtualRegister(&AMDGPU::SReg_32RegClass);
7210 constexpr unsigned ShAmt = 16;
7211 BuildMI(*MBB, &I, DL, TII.get(AMDGPU::S_LSHL_B32), TmpReg3)
7212 .addReg(TmpReg2)
7213 .addImm(ShAmt)
7214 .setOperandDead(3); // Dead scc
7215
7216 Register TmpReg4 = MRI->createVirtualRegister(&AMDGPU::SReg_32RegClass);
7217 BuildMI(*MBB, &I, DL, TII.get(AMDGPU::S_OR_B32), TmpReg4)
7218 .addReg(TmpReg1)
7219 .addReg(TmpReg3)
7220 .setOperandDead(3); // Dead scc;
7221
7222 auto CopyMIB =
7223 BuildMI(*MBB, &I, DL, TII.get(AMDGPU::COPY), AMDGPU::M0).addReg(TmpReg4);
7224 constrainSelectedInstRegOperands(*CopyMIB, TII, TRI, RBI);
7225
7226 unsigned Opc = IntrID == Intrinsic::amdgcn_s_barrier_init
7227 ? AMDGPU::S_BARRIER_INIT_M0
7228 : AMDGPU::S_BARRIER_SIGNAL_M0;
7229 MachineInstrBuilder MIB;
7230 MIB = BuildMI(*MBB, &I, DL, TII.get(Opc));
7231
7232 I.eraseFromParent();
7233 return true;
7234}
7235
7236bool AMDGPUInstructionSelector::selectNamedBarrierInst(
7237 MachineInstr &I, Intrinsic::ID IntrID) const {
7238 MachineBasicBlock *MBB = I.getParent();
7239 const DebugLoc &DL = I.getDebugLoc();
7240 MachineOperand BarOp = IntrID == Intrinsic::amdgcn_s_get_named_barrier_state
7241 ? I.getOperand(2)
7242 : I.getOperand(1);
7243 std::optional<int64_t> BarValImm =
7244 getIConstantVRegSExtVal(BarOp.getReg(), *MRI);
7245
7246 if (!BarValImm) {
7247 // BarID = (BarOp >> 4) & 0x3F
7248 Register TmpReg0 = MRI->createVirtualRegister(&AMDGPU::SReg_32RegClass);
7249 BuildMI(*MBB, &I, DL, TII.get(AMDGPU::S_LSHR_B32), TmpReg0)
7250 .addReg(BarOp.getReg())
7251 .addImm(4u)
7252 .setOperandDead(3); // Dead scc;
7253
7254 Register TmpReg1 = MRI->createVirtualRegister(&AMDGPU::SReg_32RegClass);
7255 BuildMI(*MBB, &I, DL, TII.get(AMDGPU::S_AND_B32), TmpReg1)
7256 .addReg(TmpReg0)
7257 .addImm(0x3F)
7258 .setOperandDead(3); // Dead scc;
7259
7260 auto CopyMIB = BuildMI(*MBB, &I, DL, TII.get(AMDGPU::COPY), AMDGPU::M0)
7261 .addReg(TmpReg1);
7262 constrainSelectedInstRegOperands(*CopyMIB, TII, TRI, RBI);
7263 }
7264
7265 MachineInstrBuilder MIB;
7266 unsigned Opc = getNamedBarrierOp(BarValImm.has_value(), IntrID);
7267 MIB = BuildMI(*MBB, &I, DL, TII.get(Opc));
7268
7269 if (IntrID == Intrinsic::amdgcn_s_get_named_barrier_state) {
7270 auto DstReg = I.getOperand(0).getReg();
7271 const TargetRegisterClass *DstRC =
7272 TRI.getConstrainedRegClassForOperand(I.getOperand(0), *MRI);
7273 if (!DstRC || !RBI.constrainGenericRegister(DstReg, *DstRC, *MRI))
7274 return false;
7275 MIB.addDef(DstReg);
7276 }
7277
7278 if (BarValImm) {
7279 auto BarId = ((*BarValImm) >> 4) & 0x3F;
7280 MIB.addImm(BarId);
7281 }
7282
7283 I.eraseFromParent();
7284 return true;
7285}
7286
7287void AMDGPUInstructionSelector::renderTruncImm32(MachineInstrBuilder &MIB,
7288 const MachineInstr &MI,
7289 int OpIdx) const {
7290 assert(MI.getOpcode() == TargetOpcode::G_CONSTANT && OpIdx == -1 &&
7291 "Expected G_CONSTANT");
7292 MIB.addImm(MI.getOperand(1).getCImm()->getSExtValue());
7293}
7294
7295void AMDGPUInstructionSelector::renderNegateImm(MachineInstrBuilder &MIB,
7296 const MachineInstr &MI,
7297 int OpIdx) const {
7298 assert(MI.getOpcode() == TargetOpcode::G_CONSTANT && OpIdx == -1 &&
7299 "Expected G_CONSTANT");
7300 MIB.addImm(-MI.getOperand(1).getCImm()->getSExtValue());
7301}
7302
7303void AMDGPUInstructionSelector::renderBitcastFPImm(MachineInstrBuilder &MIB,
7304 const MachineInstr &MI,
7305 int OpIdx) const {
7306 const MachineOperand &Op = MI.getOperand(1);
7307 assert(MI.getOpcode() == TargetOpcode::G_FCONSTANT && OpIdx == -1);
7308 MIB.addImm(Op.getFPImm()->getValueAPF().bitcastToAPInt().getZExtValue());
7309}
7310
7311void AMDGPUInstructionSelector::renderCountTrailingOnesImm(
7312 MachineInstrBuilder &MIB, const MachineInstr &MI, int OpIdx) const {
7313 assert(MI.getOpcode() == TargetOpcode::G_CONSTANT && OpIdx == -1 &&
7314 "Expected G_CONSTANT");
7315 MIB.addImm(MI.getOperand(1).getCImm()->getValue().countTrailingOnes());
7316}
7317
7318/// This only really exists to satisfy DAG type checking machinery, so is a
7319/// no-op here.
7320void AMDGPUInstructionSelector::renderTruncTImm(MachineInstrBuilder &MIB,
7321 const MachineInstr &MI,
7322 int OpIdx) const {
7323 const MachineOperand &Op = MI.getOperand(OpIdx);
7324 int64_t Imm;
7325 if (Op.isReg() && mi_match(Op.getReg(), *MRI, m_ICst(Imm)))
7326 MIB.addImm(Imm);
7327 else
7328 MIB.addImm(Op.getImm());
7329}
7330
7331void AMDGPUInstructionSelector::renderZextBoolTImm(MachineInstrBuilder &MIB,
7332 const MachineInstr &MI,
7333 int OpIdx) const {
7334 MIB.addImm(MI.getOperand(OpIdx).getImm() != 0);
7335}
7336
7337void AMDGPUInstructionSelector::renderOpSelTImm(MachineInstrBuilder &MIB,
7338 const MachineInstr &MI,
7339 int OpIdx) const {
7340 assert(OpIdx >= 0 && "expected to match an immediate operand");
7341 MIB.addImm(MI.getOperand(OpIdx).getImm() ? (int64_t)SISrcMods::OP_SEL_0 : 0);
7342}
7343
7344void AMDGPUInstructionSelector::renderSrcAndDstSelToOpSelXForm_0_0(
7345 MachineInstrBuilder &MIB, const MachineInstr &MI, int OpIdx) const {
7346 assert(OpIdx >= 0 && "expected to match an immediate operand");
7347 MIB.addImm(
7348 (MI.getOperand(OpIdx).getImm() & 0x1) ? (int64_t)SISrcMods::OP_SEL_0 : 0);
7349}
7350
7351void AMDGPUInstructionSelector::renderSrcAndDstSelToOpSelXForm_0_1(
7352 MachineInstrBuilder &MIB, const MachineInstr &MI, int OpIdx) const {
7353 assert(OpIdx >= 0 && "expected to match an immediate operand");
7354  MIB.addImm((MI.getOperand(OpIdx).getImm() & 0x1)
7355                 ? (int64_t)(SISrcMods::OP_SEL_0 | SISrcMods::DST_OP_SEL)
7356                 : (int64_t)SISrcMods::DST_OP_SEL);
7357}
7358
7359void AMDGPUInstructionSelector::renderSrcAndDstSelToOpSelXForm_1_0(
7360 MachineInstrBuilder &MIB, const MachineInstr &MI, int OpIdx) const {
7361 assert(OpIdx >= 0 && "expected to match an immediate operand");
7362 MIB.addImm(
7363 (MI.getOperand(OpIdx).getImm() & 0x2) ? (int64_t)SISrcMods::OP_SEL_0 : 0);
7364}
7365
7366void AMDGPUInstructionSelector::renderSrcAndDstSelToOpSelXForm_1_1(
7367 MachineInstrBuilder &MIB, const MachineInstr &MI, int OpIdx) const {
7368 assert(OpIdx >= 0 && "expected to match an immediate operand");
7369 MIB.addImm((MI.getOperand(OpIdx).getImm() & 0x2)
7370 ? (int64_t)(SISrcMods::OP_SEL_0)
7371 : 0);
7372}
7373
7374void AMDGPUInstructionSelector::renderDstSelToOpSelXForm(
7375 MachineInstrBuilder &MIB, const MachineInstr &MI, int OpIdx) const {
7376 assert(OpIdx >= 0 && "expected to match an immediate operand");
7377 MIB.addImm(MI.getOperand(OpIdx).getImm() ? (int64_t)(SISrcMods::DST_OP_SEL)
7378 : 0);
7379}
7380
7381void AMDGPUInstructionSelector::renderSrcSelToOpSelXForm(
7382 MachineInstrBuilder &MIB, const MachineInstr &MI, int OpIdx) const {
7383 assert(OpIdx >= 0 && "expected to match an immediate operand");
7384 MIB.addImm(MI.getOperand(OpIdx).getImm() ? (int64_t)(SISrcMods::OP_SEL_0)
7385 : 0);
7386}
7387
7388void AMDGPUInstructionSelector::renderSrcAndDstSelToOpSelXForm_2_0(
7389 MachineInstrBuilder &MIB, const MachineInstr &MI, int OpIdx) const {
7390 assert(OpIdx >= 0 && "expected to match an immediate operand");
7391 MIB.addImm(
7392 (MI.getOperand(OpIdx).getImm() & 0x1) ? (int64_t)SISrcMods::OP_SEL_0 : 0);
7393}
7394
7395void AMDGPUInstructionSelector::renderDstSelToOpSel3XFormXForm(
7396 MachineInstrBuilder &MIB, const MachineInstr &MI, int OpIdx) const {
7397 assert(OpIdx >= 0 && "expected to match an immediate operand");
7398 MIB.addImm((MI.getOperand(OpIdx).getImm() & 0x2)
7399 ? (int64_t)SISrcMods::DST_OP_SEL
7400 : 0);
7401}
7402
7403void AMDGPUInstructionSelector::renderExtractCPol(MachineInstrBuilder &MIB,
7404 const MachineInstr &MI,
7405 int OpIdx) const {
7406 assert(OpIdx >= 0 && "expected to match an immediate operand");
7407  MIB.addImm(MI.getOperand(OpIdx).getImm() &
7408             (AMDGPU::isGFX12Plus(STI) ? AMDGPU::CPol::ALL
7409                                       : AMDGPU::CPol::ALL_pregfx12));
7410}
7411
7412void AMDGPUInstructionSelector::renderExtractSWZ(MachineInstrBuilder &MIB,
7413 const MachineInstr &MI,
7414 int OpIdx) const {
7415 assert(OpIdx >= 0 && "expected to match an immediate operand");
7416  const bool Swizzle = MI.getOperand(OpIdx).getImm() &
7417                       (AMDGPU::isGFX12Plus(STI) ? AMDGPU::CPol::SWZ
7418                                                 : AMDGPU::CPol::SWZ_pregfx12);
7419  MIB.addImm(Swizzle);
7420}
7421
7422void AMDGPUInstructionSelector::renderExtractCpolSetGLC(
7423 MachineInstrBuilder &MIB, const MachineInstr &MI, int OpIdx) const {
7424 assert(OpIdx >= 0 && "expected to match an immediate operand");
7425  const uint32_t Cpol = MI.getOperand(OpIdx).getImm() &
7426                        (AMDGPU::isGFX12Plus(STI) ? AMDGPU::CPol::ALL
7427                                                  : AMDGPU::CPol::ALL_pregfx12);
7428  MIB.addImm(Cpol | AMDGPU::CPol::GLC);
7429}
7430
7431void AMDGPUInstructionSelector::renderFrameIndex(MachineInstrBuilder &MIB,
7432 const MachineInstr &MI,
7433 int OpIdx) const {
7434 MIB.addFrameIndex(MI.getOperand(1).getIndex());
7435}
7436
7437void AMDGPUInstructionSelector::renderFPPow2ToExponent(MachineInstrBuilder &MIB,
7438 const MachineInstr &MI,
7439 int OpIdx) const {
7440 const APFloat &APF = MI.getOperand(1).getFPImm()->getValueAPF();
7441 int ExpVal = APF.getExactLog2Abs();
7442 assert(ExpVal != INT_MIN);
7443 MIB.addImm(ExpVal);
7444}
7445
7446void AMDGPUInstructionSelector::renderRoundMode(MachineInstrBuilder &MIB,
7447 const MachineInstr &MI,
7448 int OpIdx) const {
7449 // "round.towardzero" -> TowardZero 0 -> FP_ROUND_ROUND_TO_ZERO 3
7450 // "round.tonearest" -> NearestTiesToEven 1 -> FP_ROUND_ROUND_TO_NEAREST 0
7451 // "round.upward" -> TowardPositive 2 -> FP_ROUND_ROUND_TO_INF 1
7452  // "round.downward" -> TowardNegative 3 -> FP_ROUND_ROUND_TO_NEGINF 2
7453 MIB.addImm((MI.getOperand(OpIdx).getImm() + 3) % 4);
7454}
7455
7456void AMDGPUInstructionSelector::renderVOP3PModsNeg(MachineInstrBuilder &MIB,
7457 const MachineInstr &MI,
7458 int OpIdx) const {
7459 unsigned Mods = SISrcMods::OP_SEL_1;
7460 if (MI.getOperand(OpIdx).getImm())
7461 Mods ^= SISrcMods::NEG;
7462 MIB.addImm((int64_t)Mods);
7463}
7464
7465void AMDGPUInstructionSelector::renderVOP3PModsNegs(MachineInstrBuilder &MIB,
7466 const MachineInstr &MI,
7467 int OpIdx) const {
7468 unsigned Mods = SISrcMods::OP_SEL_1;
7469  if (MI.getOperand(OpIdx).getImm())
7470    Mods ^= (SISrcMods::NEG | SISrcMods::NEG_HI);
7471  MIB.addImm((int64_t)Mods);
7472}
7473
7474void AMDGPUInstructionSelector::renderVOP3PModsNegAbs(MachineInstrBuilder &MIB,
7475 const MachineInstr &MI,
7476 int OpIdx) const {
7477 unsigned Val = MI.getOperand(OpIdx).getImm();
7478 unsigned Mods = SISrcMods::OP_SEL_1; // default: none
7479 if (Val == 1) // neg
7480 Mods ^= SISrcMods::NEG;
7481 if (Val == 2) // abs
7482 Mods ^= SISrcMods::ABS;
7483 if (Val == 3) // neg and abs
7484 Mods ^= (SISrcMods::NEG | SISrcMods::ABS);
7485 MIB.addImm((int64_t)Mods);
7486}
7487
7488void AMDGPUInstructionSelector::renderPrefetchLoc(MachineInstrBuilder &MIB,
7489 const MachineInstr &MI,
7490 int OpIdx) const {
7491 uint32_t V = MI.getOperand(2).getImm();
7494 if (!Subtarget->hasSafeCUPrefetch())
7495 V = std::max(V, (uint32_t)AMDGPU::CPol::SCOPE_SE); // CU scope is unsafe
7496 MIB.addImm(V);
7497}
7498
7499/// Convert from 2-bit value to enum values used for op_sel* source modifiers.
7500void AMDGPUInstructionSelector::renderScaledMAIIntrinsicOperand(
7501 MachineInstrBuilder &MIB, const MachineInstr &MI, int OpIdx) const {
7502 unsigned Val = MI.getOperand(OpIdx).getImm();
7503 unsigned New = 0;
7504  if (Val & 0x1)
7505    New |= SISrcMods::OP_SEL_0;
7506  if (Val & 0x2)
7507    New |= SISrcMods::OP_SEL_1;
7508  MIB.addImm(New);
7509}
7510
7511bool AMDGPUInstructionSelector::isInlineImmediate(const APInt &Imm) const {
7512 return TII.isInlineConstant(Imm);
7513}
7514
7515bool AMDGPUInstructionSelector::isInlineImmediate(const APFloat &Imm) const {
7516 return TII.isInlineConstant(Imm);
7517}