AMDGPUInstructionSelector.cpp
1//===- AMDGPUInstructionSelector.cpp ----------------------------*- C++ -*-==//
2//
3// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4// See https://llvm.org/LICENSE.txt for license information.
5// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6//
7//===----------------------------------------------------------------------===//
8/// \file
9/// This file implements the targeting of the InstructionSelector class for
10/// AMDGPU.
11/// \todo This should be generated by TableGen.
12//===----------------------------------------------------------------------===//
13
14#include "AMDGPUInstructionSelector.h"
15#include "AMDGPU.h"
17#include "AMDGPUInstrInfo.h"
19#include "AMDGPUTargetMachine.h"
29#include "llvm/IR/IntrinsicsAMDGPU.h"
30#include <optional>
31
32#define DEBUG_TYPE "amdgpu-isel"
33
34using namespace llvm;
35using namespace MIPatternMatch;
36
37#define GET_GLOBALISEL_IMPL
38#define AMDGPUSubtarget GCNSubtarget
39#include "AMDGPUGenGlobalISel.inc"
40#undef GET_GLOBALISEL_IMPL
41#undef AMDGPUSubtarget
42
43AMDGPUInstructionSelector::AMDGPUInstructionSelector(
44 const GCNSubtarget &STI, const AMDGPURegisterBankInfo &RBI,
45 const AMDGPUTargetMachine &TM)
46 : TII(*STI.getInstrInfo()), TRI(*STI.getRegisterInfo()), RBI(RBI), TM(TM),
47 STI(STI),
48#define GET_GLOBALISEL_PREDICATES_INIT
49#include "AMDGPUGenGlobalISel.inc"
50#undef GET_GLOBALISEL_PREDICATES_INIT
51#define GET_GLOBALISEL_TEMPORARIES_INIT
52#include "AMDGPUGenGlobalISel.inc"
53#undef GET_GLOBALISEL_TEMPORARIES_INIT
54{
55}
56
57const char *AMDGPUInstructionSelector::getName() { return DEBUG_TYPE; }
58
69
70// Return the wave level SGPR base address if this is a wave address.
71static Register getWaveAddress(const MachineInstr *Def) {
72 return Def->getOpcode() == AMDGPU::G_AMDGPU_WAVE_ADDRESS
73 ? Def->getOperand(1).getReg()
74 : Register();
75}
76
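// Return true if Reg is a wave-wide boolean (lane mask): either it has an s1
// type in a wave-mask register class (and is not a G_TRUNC result), or it is
// assigned to the VCC register bank.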
77bool AMDGPUInstructionSelector::isVCC(Register Reg,
78 const MachineRegisterInfo &MRI) const {
79 // The verifier is oblivious to s1 being a valid value for wavesize registers.
80 if (Reg.isPhysical())
81 return false;
82
83 auto &RegClassOrBank = MRI.getRegClassOrRegBank(Reg);
84 const TargetRegisterClass *RC =
85 dyn_cast<const TargetRegisterClass *>(RegClassOrBank);
86 if (RC) {
87 const LLT Ty = MRI.getType(Reg);
88 if (!Ty.isValid() || Ty.getSizeInBits() != 1)
89 return false;
90 // G_TRUNC s1 result is never vcc.
91 return MRI.getVRegDef(Reg)->getOpcode() != AMDGPU::G_TRUNC &&
92 RC->hasSuperClassEq(TRI.getBoolRC());
93 }
94
95 const RegisterBank *RB = cast<const RegisterBank *>(RegClassOrBank);
96 return RB->getID() == AMDGPU::VCCRegBankID;
97}
98
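// Rewrite a copy-like intrinsic (WQM, WWM, ...) to NewOpc, add the implicit
// EXEC use, and constrain source and destination to a common register class.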
99bool AMDGPUInstructionSelector::constrainCopyLikeIntrin(MachineInstr &MI,
100 unsigned NewOpc) const {
101 MI.setDesc(TII.get(NewOpc));
102 MI.removeOperand(1); // Remove intrinsic ID.
103 MI.addOperand(*MF, MachineOperand::CreateReg(AMDGPU::EXEC, false, true));
104
105 MachineOperand &Dst = MI.getOperand(0);
106 MachineOperand &Src = MI.getOperand(1);
107
108 // TODO: This should be legalized to s32 if needed
109 if (MRI->getType(Dst.getReg()) == LLT::scalar(1))
110 return false;
111
112 const TargetRegisterClass *DstRC
113 = TRI.getConstrainedRegClassForOperand(Dst, *MRI);
114 const TargetRegisterClass *SrcRC
115 = TRI.getConstrainedRegClassForOperand(Src, *MRI);
116 if (!DstRC || DstRC != SrcRC)
117 return false;
118
119 if (!RBI.constrainGenericRegister(Dst.getReg(), *DstRC, *MRI) ||
120 !RBI.constrainGenericRegister(Src.getReg(), *SrcRC, *MRI))
121 return false;
122 const MCInstrDesc &MCID = MI.getDesc();
123 if (MCID.getOperandConstraint(0, MCOI::EARLY_CLOBBER) != -1) {
124 MI.getOperand(0).setIsEarlyClobber(true);
125 }
126 return true;
127}
128
129bool AMDGPUInstructionSelector::selectCOPY(MachineInstr &I) const {
130 const DebugLoc &DL = I.getDebugLoc();
131 MachineBasicBlock *BB = I.getParent();
132 I.setDesc(TII.get(TargetOpcode::COPY));
133
134 const MachineOperand &Src = I.getOperand(1);
135 MachineOperand &Dst = I.getOperand(0);
136 Register DstReg = Dst.getReg();
137 Register SrcReg = Src.getReg();
138
139 if (isVCC(DstReg, *MRI)) {
140 if (SrcReg == AMDGPU::SCC) {
141 const TargetRegisterClass *RC
142 = TRI.getConstrainedRegClassForOperand(Dst, *MRI);
143 if (!RC)
144 return true;
145 return RBI.constrainGenericRegister(DstReg, *RC, *MRI);
146 }
147
148 if (!isVCC(SrcReg, *MRI)) {
149 // TODO: Should probably leave the copy and let copyPhysReg expand it.
150 if (!RBI.constrainGenericRegister(DstReg, *TRI.getBoolRC(), *MRI))
151 return false;
152
153 const TargetRegisterClass *SrcRC
154 = TRI.getConstrainedRegClassForOperand(Src, *MRI);
155
156 std::optional<ValueAndVReg> ConstVal =
157 getIConstantVRegValWithLookThrough(SrcReg, *MRI, true);
158 if (ConstVal) {
159 unsigned MovOpc =
160 STI.isWave64() ? AMDGPU::S_MOV_B64 : AMDGPU::S_MOV_B32;
161 BuildMI(*BB, &I, DL, TII.get(MovOpc), DstReg)
162 .addImm(ConstVal->Value.getBoolValue() ? -1 : 0);
163 } else {
164 Register MaskedReg = MRI->createVirtualRegister(SrcRC);
165
166 // We can't trust the high bits at this point, so clear them.
167
168 // TODO: Skip masking high bits if def is known boolean.
169
170 if (AMDGPU::getRegBitWidth(SrcRC->getID()) == 16) {
171 assert(Subtarget->useRealTrue16Insts());
172 const int64_t NoMods = 0;
173 BuildMI(*BB, &I, DL, TII.get(AMDGPU::V_AND_B16_t16_e64), MaskedReg)
174 .addImm(NoMods)
175 .addImm(1)
176 .addImm(NoMods)
177 .addReg(SrcReg)
178 .addImm(NoMods);
179 BuildMI(*BB, &I, DL, TII.get(AMDGPU::V_CMP_NE_U16_t16_e64), DstReg)
180 .addImm(NoMods)
181 .addImm(0)
182 .addImm(NoMods)
183 .addReg(MaskedReg)
184 .addImm(NoMods);
185 } else {
186 bool IsSGPR = TRI.isSGPRClass(SrcRC);
187 unsigned AndOpc = IsSGPR ? AMDGPU::S_AND_B32 : AMDGPU::V_AND_B32_e32;
188 auto And = BuildMI(*BB, &I, DL, TII.get(AndOpc), MaskedReg)
189 .addImm(1)
190 .addReg(SrcReg);
191 if (IsSGPR)
192 And.setOperandDead(3); // Dead scc
193
194 BuildMI(*BB, &I, DL, TII.get(AMDGPU::V_CMP_NE_U32_e64), DstReg)
195 .addImm(0)
196 .addReg(MaskedReg);
197 }
198 }
199
200 if (!MRI->getRegClassOrNull(SrcReg))
201 MRI->setRegClass(SrcReg, SrcRC);
202 I.eraseFromParent();
203 return true;
204 }
205
206 const TargetRegisterClass *RC =
207 TRI.getConstrainedRegClassForOperand(Dst, *MRI);
208 if (RC && !RBI.constrainGenericRegister(DstReg, *RC, *MRI))
209 return false;
210
211 return true;
212 }
213
214 for (const MachineOperand &MO : I.operands()) {
215 if (MO.getReg().isPhysical())
216 continue;
217
218 const TargetRegisterClass *RC =
219 TRI.getConstrainedRegClassForOperand(MO, *MRI);
220 if (!RC)
221 continue;
222 RBI.constrainGenericRegister(MO.getReg(), *RC, *MRI);
223 }
224 return true;
225}
226
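// Copy a VCC lane mask into an SGPR via SCC: compare the mask against zero
// (or OR it with itself) to set SCC, then copy SCC into the destination.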
227bool AMDGPUInstructionSelector::selectCOPY_SCC_VCC(MachineInstr &I) const {
228 const DebugLoc &DL = I.getDebugLoc();
229 MachineBasicBlock *BB = I.getParent();
230 Register VCCReg = I.getOperand(1).getReg();
231 MachineInstr *Cmp;
232
233 // Set SCC as a side effect with S_CMP or S_OR.
234 if (STI.hasScalarCompareEq64()) {
235 unsigned CmpOpc =
236 STI.isWave64() ? AMDGPU::S_CMP_LG_U64 : AMDGPU::S_CMP_LG_U32;
237 Cmp = BuildMI(*BB, &I, DL, TII.get(CmpOpc)).addReg(VCCReg).addImm(0);
238 } else {
239 Register DeadDst = MRI->createVirtualRegister(&AMDGPU::SReg_64RegClass);
240 Cmp = BuildMI(*BB, &I, DL, TII.get(AMDGPU::S_OR_B64), DeadDst)
241 .addReg(VCCReg)
242 .addReg(VCCReg);
243 }
244
245 if (!constrainSelectedInstRegOperands(*Cmp, TII, TRI, RBI))
246 return false;
247
248 Register DstReg = I.getOperand(0).getReg();
249 BuildMI(*BB, &I, DL, TII.get(AMDGPU::COPY), DstReg).addReg(AMDGPU::SCC);
250
251 I.eraseFromParent();
252 return RBI.constrainGenericRegister(DstReg, AMDGPU::SReg_32RegClass, *MRI);
253}
254
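// Copy an SCC-like boolean into a lane mask: known constants become 0 or a
// copy of EXEC, otherwise the bit is moved into SCC and S_CSELECT picks
// between EXEC and 0.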
255bool AMDGPUInstructionSelector::selectCOPY_VCC_SCC(MachineInstr &I) const {
256 const DebugLoc &DL = I.getDebugLoc();
257 MachineBasicBlock *BB = I.getParent();
258
259 Register DstReg = I.getOperand(0).getReg();
260 Register SrcReg = I.getOperand(1).getReg();
261 std::optional<ValueAndVReg> Arg =
262 getIConstantVRegValWithLookThrough(I.getOperand(1).getReg(), *MRI);
263
264 if (Arg) {
265 const int64_t Value = Arg->Value.getZExtValue();
266 if (Value == 0) {
267 unsigned Opcode = STI.isWave64() ? AMDGPU::S_MOV_B64 : AMDGPU::S_MOV_B32;
268 BuildMI(*BB, &I, DL, TII.get(Opcode), DstReg).addImm(0);
269 } else {
270 assert(Value == 1);
271 BuildMI(*BB, &I, DL, TII.get(AMDGPU::COPY), DstReg).addReg(TRI.getExec());
272 }
273 I.eraseFromParent();
274 return RBI.constrainGenericRegister(DstReg, *TRI.getBoolRC(), *MRI);
275 }
276
277 // RegBankLegalize ensures that SrcReg is bool in reg (high bits are 0).
278 BuildMI(*BB, &I, DL, TII.get(AMDGPU::COPY), AMDGPU::SCC).addReg(SrcReg);
279
280 unsigned SelectOpcode =
281 STI.isWave64() ? AMDGPU::S_CSELECT_B64 : AMDGPU::S_CSELECT_B32;
282 MachineInstr *Select = BuildMI(*BB, &I, DL, TII.get(SelectOpcode), DstReg)
283 .addReg(TRI.getExec())
284 .addImm(0);
285
286 I.eraseFromParent();
287 return constrainSelectedInstRegOperands(*Select, TII, TRI, RBI);
288}
289
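// Lower a read-any-lane by emitting a V_READFIRSTLANE_B32 of the source.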
290bool AMDGPUInstructionSelector::selectReadAnyLane(MachineInstr &I) const {
291 Register DstReg = I.getOperand(0).getReg();
292 Register SrcReg = I.getOperand(1).getReg();
293
294 const DebugLoc &DL = I.getDebugLoc();
295 MachineBasicBlock *BB = I.getParent();
296
297 auto RFL = BuildMI(*BB, &I, DL, TII.get(AMDGPU::V_READFIRSTLANE_B32), DstReg)
298 .addReg(SrcReg);
299
300 I.eraseFromParent();
301 return constrainSelectedInstRegOperands(*RFL, TII, TRI, RBI);
302}
303
304bool AMDGPUInstructionSelector::selectPHI(MachineInstr &I) const {
305 const Register DefReg = I.getOperand(0).getReg();
306 const LLT DefTy = MRI->getType(DefReg);
307
308 // S1 G_PHIs should not be selected in instruction-select, instead:
309 // - divergent S1 G_PHI should go through lane mask merging algorithm
310 // and be fully inst-selected in AMDGPUGlobalISelDivergenceLowering
311 // - uniform S1 G_PHI should be lowered into S32 G_PHI in AMDGPURegBankSelect
312 if (DefTy == LLT::scalar(1))
313 return false;
314
315 // TODO: Verify this doesn't have insane operands (i.e. VGPR to SGPR copy)
316
317 const RegClassOrRegBank &RegClassOrBank =
318 MRI->getRegClassOrRegBank(DefReg);
319
320 const TargetRegisterClass *DefRC =
321 dyn_cast<const TargetRegisterClass *>(RegClassOrBank);
322 if (!DefRC) {
323 if (!DefTy.isValid()) {
324 LLVM_DEBUG(dbgs() << "PHI operand has no type, not a gvreg?\n");
325 return false;
326 }
327
328 const RegisterBank &RB = *cast<const RegisterBank *>(RegClassOrBank);
329 DefRC = TRI.getRegClassForTypeOnBank(DefTy, RB);
330 if (!DefRC) {
331 LLVM_DEBUG(dbgs() << "PHI operand has unexpected size/bank\n");
332 return false;
333 }
334 }
335
336 // If inputs have register bank, assign corresponding reg class.
337 // Note: registers don't need to have the same reg bank.
338 for (unsigned i = 1; i != I.getNumOperands(); i += 2) {
339 const Register SrcReg = I.getOperand(i).getReg();
340
341 const RegisterBank *RB = MRI->getRegBankOrNull(SrcReg);
342 if (RB) {
343 const LLT SrcTy = MRI->getType(SrcReg);
344 const TargetRegisterClass *SrcRC =
345 TRI.getRegClassForTypeOnBank(SrcTy, *RB);
346 if (!RBI.constrainGenericRegister(SrcReg, *SrcRC, *MRI))
347 return false;
348 }
349 }
350
351 I.setDesc(TII.get(TargetOpcode::PHI));
352 return RBI.constrainGenericRegister(DefReg, *DefRC, *MRI);
353}
354
355MachineOperand
356AMDGPUInstructionSelector::getSubOperand64(MachineOperand &MO,
357 const TargetRegisterClass &SubRC,
358 unsigned SubIdx) const {
359
360 MachineInstr *MI = MO.getParent();
361 MachineBasicBlock *BB = MO.getParent()->getParent();
362 Register DstReg = MRI->createVirtualRegister(&SubRC);
363
364 if (MO.isReg()) {
365 unsigned ComposedSubIdx = TRI.composeSubRegIndices(MO.getSubReg(), SubIdx);
366 Register Reg = MO.getReg();
367 BuildMI(*BB, MI, MI->getDebugLoc(), TII.get(AMDGPU::COPY), DstReg)
368 .addReg(Reg, 0, ComposedSubIdx);
369
370 return MachineOperand::CreateReg(DstReg, MO.isDef(), MO.isImplicit(),
371 MO.isKill(), MO.isDead(), MO.isUndef(),
372 MO.isEarlyClobber(), 0, MO.isDebug(),
373 MO.isInternalRead());
374 }
375
376 assert(MO.isImm());
377
378 APInt Imm(64, MO.getImm());
379
380 switch (SubIdx) {
381 default:
382 llvm_unreachable("do not know to split immediate with this sub index.");
383 case AMDGPU::sub0:
384 return MachineOperand::CreateImm(Imm.getLoBits(32).getSExtValue());
385 case AMDGPU::sub1:
386 return MachineOperand::CreateImm(Imm.getHiBits(32).getSExtValue());
387 }
388}
389
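// Map a generic bitwise opcode (G_AND/G_OR/G_XOR) to the corresponding 32- or
// 64-bit scalar instruction.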
390static unsigned getLogicalBitOpcode(unsigned Opc, bool Is64) {
391 switch (Opc) {
392 case AMDGPU::G_AND:
393 return Is64 ? AMDGPU::S_AND_B64 : AMDGPU::S_AND_B32;
394 case AMDGPU::G_OR:
395 return Is64 ? AMDGPU::S_OR_B64 : AMDGPU::S_OR_B32;
396 case AMDGPU::G_XOR:
397 return Is64 ? AMDGPU::S_XOR_B64 : AMDGPU::S_XOR_B32;
398 default:
399 llvm_unreachable("not a bit op");
400 }
401}
402
403bool AMDGPUInstructionSelector::selectG_AND_OR_XOR(MachineInstr &I) const {
404 Register DstReg = I.getOperand(0).getReg();
405 unsigned Size = RBI.getSizeInBits(DstReg, *MRI, TRI);
406
407 const RegisterBank *DstRB = RBI.getRegBank(DstReg, *MRI, TRI);
408 if (DstRB->getID() != AMDGPU::SGPRRegBankID &&
409 DstRB->getID() != AMDGPU::VCCRegBankID)
410 return false;
411
412 bool Is64 = Size > 32 || (DstRB->getID() == AMDGPU::VCCRegBankID &&
413 STI.isWave64());
414 I.setDesc(TII.get(getLogicalBitOpcode(I.getOpcode(), Is64)));
415
416 // Dead implicit-def of scc
417 I.addOperand(MachineOperand::CreateReg(AMDGPU::SCC, true, // isDef
418 true, // isImp
419 false, // isKill
420 true)); // isDead
421 return constrainSelectedInstRegOperands(I, TII, TRI, RBI);
422}
423
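// Select scalar G_ADD/G_SUB: 32-bit values map directly onto a SALU or VALU
// add/sub; 64-bit adds are split into a low add plus a high add-with-carry and
// recombined with a REG_SEQUENCE.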
424bool AMDGPUInstructionSelector::selectG_ADD_SUB(MachineInstr &I) const {
425 MachineBasicBlock *BB = I.getParent();
426 MachineFunction *MF = BB->getParent();
427 Register DstReg = I.getOperand(0).getReg();
428 const DebugLoc &DL = I.getDebugLoc();
429 LLT Ty = MRI->getType(DstReg);
430 if (Ty.isVector())
431 return false;
432
433 unsigned Size = Ty.getSizeInBits();
434 const RegisterBank *DstRB = RBI.getRegBank(DstReg, *MRI, TRI);
435 const bool IsSALU = DstRB->getID() == AMDGPU::SGPRRegBankID;
436 const bool Sub = I.getOpcode() == TargetOpcode::G_SUB;
437
438 if (Size == 32) {
439 if (IsSALU) {
440 const unsigned Opc = Sub ? AMDGPU::S_SUB_U32 : AMDGPU::S_ADD_U32;
441 MachineInstr *Add =
442 BuildMI(*BB, &I, DL, TII.get(Opc), DstReg)
443 .add(I.getOperand(1))
444 .add(I.getOperand(2))
445 .setOperandDead(3); // Dead scc
446 I.eraseFromParent();
447 return constrainSelectedInstRegOperands(*Add, TII, TRI, RBI);
448 }
449
450 if (STI.hasAddNoCarry()) {
451 const unsigned Opc = Sub ? AMDGPU::V_SUB_U32_e64 : AMDGPU::V_ADD_U32_e64;
452 I.setDesc(TII.get(Opc));
453 I.addOperand(*MF, MachineOperand::CreateImm(0));
454 I.addOperand(*MF, MachineOperand::CreateReg(AMDGPU::EXEC, false, true));
455 return constrainSelectedInstRegOperands(I, TII, TRI, RBI);
456 }
457
458 const unsigned Opc = Sub ? AMDGPU::V_SUB_CO_U32_e64 : AMDGPU::V_ADD_CO_U32_e64;
459
460 Register UnusedCarry = MRI->createVirtualRegister(TRI.getWaveMaskRegClass());
461 MachineInstr *Add
462 = BuildMI(*BB, &I, DL, TII.get(Opc), DstReg)
463 .addDef(UnusedCarry, RegState::Dead)
464 .add(I.getOperand(1))
465 .add(I.getOperand(2))
466 .addImm(0);
467 I.eraseFromParent();
468 return constrainSelectedInstRegOperands(*Add, TII, TRI, RBI);
469 }
470
471 assert(!Sub && "illegal sub should not reach here");
472
473 const TargetRegisterClass &RC
474 = IsSALU ? AMDGPU::SReg_64_XEXECRegClass : AMDGPU::VReg_64RegClass;
475 const TargetRegisterClass &HalfRC
476 = IsSALU ? AMDGPU::SReg_32RegClass : AMDGPU::VGPR_32RegClass;
477
478 MachineOperand Lo1(getSubOperand64(I.getOperand(1), HalfRC, AMDGPU::sub0));
479 MachineOperand Lo2(getSubOperand64(I.getOperand(2), HalfRC, AMDGPU::sub0));
480 MachineOperand Hi1(getSubOperand64(I.getOperand(1), HalfRC, AMDGPU::sub1));
481 MachineOperand Hi2(getSubOperand64(I.getOperand(2), HalfRC, AMDGPU::sub1));
482
483 Register DstLo = MRI->createVirtualRegister(&HalfRC);
484 Register DstHi = MRI->createVirtualRegister(&HalfRC);
485
486 if (IsSALU) {
487 BuildMI(*BB, &I, DL, TII.get(AMDGPU::S_ADD_U32), DstLo)
488 .add(Lo1)
489 .add(Lo2);
490 BuildMI(*BB, &I, DL, TII.get(AMDGPU::S_ADDC_U32), DstHi)
491 .add(Hi1)
492 .add(Hi2)
493 .setOperandDead(3); // Dead scc
494 } else {
495 const TargetRegisterClass *CarryRC = TRI.getWaveMaskRegClass();
496 Register CarryReg = MRI->createVirtualRegister(CarryRC);
497 BuildMI(*BB, &I, DL, TII.get(AMDGPU::V_ADD_CO_U32_e64), DstLo)
498 .addDef(CarryReg)
499 .add(Lo1)
500 .add(Lo2)
501 .addImm(0);
502 MachineInstr *Addc = BuildMI(*BB, &I, DL, TII.get(AMDGPU::V_ADDC_U32_e64), DstHi)
503 .addDef(MRI->createVirtualRegister(CarryRC), RegState::Dead)
504 .add(Hi1)
505 .add(Hi2)
506 .addReg(CarryReg, RegState::Kill)
507 .addImm(0);
508
509 if (!constrainSelectedInstRegOperands(*Addc, TII, TRI, RBI))
510 return false;
511 }
512
513 BuildMI(*BB, &I, DL, TII.get(AMDGPU::REG_SEQUENCE), DstReg)
514 .addReg(DstLo)
515 .addImm(AMDGPU::sub0)
516 .addReg(DstHi)
517 .addImm(AMDGPU::sub1);
518
519
520 if (!RBI.constrainGenericRegister(DstReg, RC, *MRI))
521 return false;
522
523 I.eraseFromParent();
524 return true;
525}
526
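// Select carry/borrow-producing add and sub. A VCC carry-out uses the VALU
// carry forms; an SGPR carry-out is routed through SCC with
// S_ADD(C)_U32 / S_SUB(B)_U32.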
527bool AMDGPUInstructionSelector::selectG_UADDO_USUBO_UADDE_USUBE(
528 MachineInstr &I) const {
529 MachineBasicBlock *BB = I.getParent();
530 MachineFunction *MF = BB->getParent();
531 const DebugLoc &DL = I.getDebugLoc();
532 Register Dst0Reg = I.getOperand(0).getReg();
533 Register Dst1Reg = I.getOperand(1).getReg();
534 const bool IsAdd = I.getOpcode() == AMDGPU::G_UADDO ||
535 I.getOpcode() == AMDGPU::G_UADDE;
536 const bool HasCarryIn = I.getOpcode() == AMDGPU::G_UADDE ||
537 I.getOpcode() == AMDGPU::G_USUBE;
538
539 if (isVCC(Dst1Reg, *MRI)) {
540 unsigned NoCarryOpc =
541 IsAdd ? AMDGPU::V_ADD_CO_U32_e64 : AMDGPU::V_SUB_CO_U32_e64;
542 unsigned CarryOpc = IsAdd ? AMDGPU::V_ADDC_U32_e64 : AMDGPU::V_SUBB_U32_e64;
543 I.setDesc(TII.get(HasCarryIn ? CarryOpc : NoCarryOpc));
544 I.addOperand(*MF, MachineOperand::CreateReg(AMDGPU::EXEC, false, true));
545 I.addOperand(*MF, MachineOperand::CreateImm(0));
546 return constrainSelectedInstRegOperands(I, TII, TRI, RBI);
547 }
548
549 Register Src0Reg = I.getOperand(2).getReg();
550 Register Src1Reg = I.getOperand(3).getReg();
551
552 if (HasCarryIn) {
553 BuildMI(*BB, &I, DL, TII.get(AMDGPU::COPY), AMDGPU::SCC)
554 .addReg(I.getOperand(4).getReg());
555 }
556
557 unsigned NoCarryOpc = IsAdd ? AMDGPU::S_ADD_U32 : AMDGPU::S_SUB_U32;
558 unsigned CarryOpc = IsAdd ? AMDGPU::S_ADDC_U32 : AMDGPU::S_SUBB_U32;
559
560 auto CarryInst = BuildMI(*BB, &I, DL, TII.get(HasCarryIn ? CarryOpc : NoCarryOpc), Dst0Reg)
561 .add(I.getOperand(2))
562 .add(I.getOperand(3));
563
564 if (MRI->use_nodbg_empty(Dst1Reg)) {
565 CarryInst.setOperandDead(3); // Dead scc
566 } else {
567 BuildMI(*BB, &I, DL, TII.get(AMDGPU::COPY), Dst1Reg)
568 .addReg(AMDGPU::SCC);
569 if (!MRI->getRegClassOrNull(Dst1Reg))
570 MRI->setRegClass(Dst1Reg, &AMDGPU::SReg_32RegClass);
571 }
572
573 if (!RBI.constrainGenericRegister(Dst0Reg, AMDGPU::SReg_32RegClass, *MRI) ||
574 !RBI.constrainGenericRegister(Src0Reg, AMDGPU::SReg_32RegClass, *MRI) ||
575 !RBI.constrainGenericRegister(Src1Reg, AMDGPU::SReg_32RegClass, *MRI))
576 return false;
577
578 if (HasCarryIn &&
579 !RBI.constrainGenericRegister(I.getOperand(4).getReg(),
580 AMDGPU::SReg_32RegClass, *MRI))
581 return false;
582
583 I.eraseFromParent();
584 return true;
585}
586
587bool AMDGPUInstructionSelector::selectG_AMDGPU_MAD_64_32(
588 MachineInstr &I) const {
589 MachineBasicBlock *BB = I.getParent();
590 MachineFunction *MF = BB->getParent();
591 const bool IsUnsigned = I.getOpcode() == AMDGPU::G_AMDGPU_MAD_U64_U32;
592 bool UseNoCarry = Subtarget->hasMadU64U32NoCarry() &&
593 MRI->use_nodbg_empty(I.getOperand(1).getReg());
594
595 unsigned Opc;
596 if (Subtarget->hasMADIntraFwdBug())
597 Opc = IsUnsigned ? AMDGPU::V_MAD_U64_U32_gfx11_e64
598 : AMDGPU::V_MAD_I64_I32_gfx11_e64;
599 else if (UseNoCarry)
600 Opc = IsUnsigned ? AMDGPU::V_MAD_NC_U64_U32_e64
601 : AMDGPU::V_MAD_NC_I64_I32_e64;
602 else
603 Opc = IsUnsigned ? AMDGPU::V_MAD_U64_U32_e64 : AMDGPU::V_MAD_I64_I32_e64;
604
605 if (UseNoCarry)
606 I.removeOperand(1);
607
608 I.setDesc(TII.get(Opc));
609 I.addOperand(*MF, MachineOperand::CreateImm(0));
610 I.addImplicitDefUseOperands(*MF);
611 I.getOperand(0).setIsEarlyClobber(true);
612 return constrainSelectedInstRegOperands(I, TII, TRI, RBI);
613}
614
615// TODO: We should probably legalize these to only using 32-bit results.
616bool AMDGPUInstructionSelector::selectG_EXTRACT(MachineInstr &I) const {
617 MachineBasicBlock *BB = I.getParent();
618 Register DstReg = I.getOperand(0).getReg();
619 Register SrcReg = I.getOperand(1).getReg();
620 LLT DstTy = MRI->getType(DstReg);
621 LLT SrcTy = MRI->getType(SrcReg);
622 const unsigned SrcSize = SrcTy.getSizeInBits();
623 unsigned DstSize = DstTy.getSizeInBits();
624
625 // TODO: Should handle any multiple of 32 offset.
626 unsigned Offset = I.getOperand(2).getImm();
627 if (Offset % 32 != 0 || DstSize > 128)
628 return false;
629
630 // 16-bit operations really use 32-bit registers.
631 // FIXME: Probably should not allow 16-bit G_EXTRACT results.
632 if (DstSize == 16)
633 DstSize = 32;
634
635 const TargetRegisterClass *DstRC =
636 TRI.getConstrainedRegClassForOperand(I.getOperand(0), *MRI);
637 if (!DstRC || !RBI.constrainGenericRegister(DstReg, *DstRC, *MRI))
638 return false;
639
640 const RegisterBank *SrcBank = RBI.getRegBank(SrcReg, *MRI, TRI);
641 const TargetRegisterClass *SrcRC =
642 TRI.getRegClassForSizeOnBank(SrcSize, *SrcBank);
643 if (!SrcRC)
644 return false;
645 unsigned SubReg = SIRegisterInfo::getSubRegFromChannel(Offset / 32,
646 DstSize / 32);
647 SrcRC = TRI.getSubClassWithSubReg(SrcRC, SubReg);
648 if (!SrcRC)
649 return false;
650
651 SrcReg = constrainOperandRegClass(*MF, TRI, *MRI, TII, RBI, I,
652 *SrcRC, I.getOperand(1));
653 const DebugLoc &DL = I.getDebugLoc();
654 BuildMI(*BB, &I, DL, TII.get(TargetOpcode::COPY), DstReg)
655 .addReg(SrcReg, 0, SubReg);
656
657 I.eraseFromParent();
658 return true;
659}
660
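// Merge sources of 32 bits or more into a wide register via REG_SEQUENCE;
// narrower sources fall back to the imported TableGen patterns.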
661bool AMDGPUInstructionSelector::selectG_MERGE_VALUES(MachineInstr &MI) const {
662 MachineBasicBlock *BB = MI.getParent();
663 Register DstReg = MI.getOperand(0).getReg();
664 LLT DstTy = MRI->getType(DstReg);
665 LLT SrcTy = MRI->getType(MI.getOperand(1).getReg());
666
667 const unsigned SrcSize = SrcTy.getSizeInBits();
668 if (SrcSize < 32)
669 return selectImpl(MI, *CoverageInfo);
670
671 const DebugLoc &DL = MI.getDebugLoc();
672 const RegisterBank *DstBank = RBI.getRegBank(DstReg, *MRI, TRI);
673 const unsigned DstSize = DstTy.getSizeInBits();
674 const TargetRegisterClass *DstRC =
675 TRI.getRegClassForSizeOnBank(DstSize, *DstBank);
676 if (!DstRC)
677 return false;
678
679 ArrayRef<int16_t> SubRegs = TRI.getRegSplitParts(DstRC, SrcSize / 8);
680 MachineInstrBuilder MIB =
681 BuildMI(*BB, &MI, DL, TII.get(TargetOpcode::REG_SEQUENCE), DstReg);
682 for (int I = 0, E = MI.getNumOperands() - 1; I != E; ++I) {
683 MachineOperand &Src = MI.getOperand(I + 1);
684 MIB.addReg(Src.getReg(), getUndefRegState(Src.isUndef()));
685 MIB.addImm(SubRegs[I]);
686
687 const TargetRegisterClass *SrcRC
688 = TRI.getConstrainedRegClassForOperand(Src, *MRI);
689 if (SrcRC && !RBI.constrainGenericRegister(Src.getReg(), *SrcRC, *MRI))
690 return false;
691 }
692
693 if (!RBI.constrainGenericRegister(DstReg, *DstRC, *MRI))
694 return false;
695
696 MI.eraseFromParent();
697 return true;
698}
699
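// Split a wide source register into one subregister copy per destination.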
700bool AMDGPUInstructionSelector::selectG_UNMERGE_VALUES(MachineInstr &MI) const {
701 MachineBasicBlock *BB = MI.getParent();
702 const int NumDst = MI.getNumOperands() - 1;
703
704 MachineOperand &Src = MI.getOperand(NumDst);
705
706 Register SrcReg = Src.getReg();
707 Register DstReg0 = MI.getOperand(0).getReg();
708 LLT DstTy = MRI->getType(DstReg0);
709 LLT SrcTy = MRI->getType(SrcReg);
710
711 const unsigned DstSize = DstTy.getSizeInBits();
712 const unsigned SrcSize = SrcTy.getSizeInBits();
713 const DebugLoc &DL = MI.getDebugLoc();
714 const RegisterBank *SrcBank = RBI.getRegBank(SrcReg, *MRI, TRI);
715
716 const TargetRegisterClass *SrcRC =
717 TRI.getRegClassForSizeOnBank(SrcSize, *SrcBank);
718 if (!SrcRC || !RBI.constrainGenericRegister(SrcReg, *SrcRC, *MRI))
719 return false;
720
721 // Note we could have mixed SGPR and VGPR destination banks for an SGPR
722 // source, and this relies on the fact that the same subregister indices are
723 // used for both.
724 ArrayRef<int16_t> SubRegs = TRI.getRegSplitParts(SrcRC, DstSize / 8);
725 for (int I = 0, E = NumDst; I != E; ++I) {
726 MachineOperand &Dst = MI.getOperand(I);
727 BuildMI(*BB, &MI, DL, TII.get(TargetOpcode::COPY), Dst.getReg())
728 .addReg(SrcReg, 0, SubRegs[I]);
729
730 // Make sure the subregister index is valid for the source register.
731 SrcRC = TRI.getSubClassWithSubReg(SrcRC, SubRegs[I]);
732 if (!SrcRC || !RBI.constrainGenericRegister(SrcReg, *SrcRC, *MRI))
733 return false;
734
735 const TargetRegisterClass *DstRC =
736 TRI.getConstrainedRegClassForOperand(Dst, *MRI);
737 if (DstRC && !RBI.constrainGenericRegister(Dst.getReg(), *DstRC, *MRI))
738 return false;
739 }
740
741 MI.eraseFromParent();
742 return true;
743}
744
745bool AMDGPUInstructionSelector::selectG_BUILD_VECTOR(MachineInstr &MI) const {
746 assert(MI.getOpcode() == AMDGPU::G_BUILD_VECTOR_TRUNC ||
747 MI.getOpcode() == AMDGPU::G_BUILD_VECTOR);
748
749 Register Src0 = MI.getOperand(1).getReg();
750 Register Src1 = MI.getOperand(2).getReg();
751 LLT SrcTy = MRI->getType(Src0);
752 const unsigned SrcSize = SrcTy.getSizeInBits();
753
754 // BUILD_VECTOR with >=32 bits source is handled by MERGE_VALUE.
755 if (MI.getOpcode() == AMDGPU::G_BUILD_VECTOR && SrcSize >= 32) {
756 return selectG_MERGE_VALUES(MI);
757 }
758
759 // Selection logic below is for V2S16 only.
760 // For G_BUILD_VECTOR_TRUNC, additionally check that the operands are s32.
761 Register Dst = MI.getOperand(0).getReg();
762 if (MRI->getType(Dst) != LLT::fixed_vector(2, 16) ||
763 (MI.getOpcode() == AMDGPU::G_BUILD_VECTOR_TRUNC &&
764 SrcTy != LLT::scalar(32)))
765 return selectImpl(MI, *CoverageInfo);
766
767 const RegisterBank *DstBank = RBI.getRegBank(Dst, *MRI, TRI);
768 if (DstBank->getID() == AMDGPU::AGPRRegBankID)
769 return false;
770
771 assert(DstBank->getID() == AMDGPU::SGPRRegBankID ||
772 DstBank->getID() == AMDGPU::VGPRRegBankID);
773 const bool IsVector = DstBank->getID() == AMDGPU::VGPRRegBankID;
774
775 const DebugLoc &DL = MI.getDebugLoc();
776 MachineBasicBlock *BB = MI.getParent();
777
778 // First, before trying TableGen patterns, check if both sources are
779 // constants. In those cases, we can trivially compute the final constant
780 // and emit a simple move.
781 auto ConstSrc1 = getAnyConstantVRegValWithLookThrough(Src1, *MRI, true, true);
782 if (ConstSrc1) {
783 auto ConstSrc0 =
784 getAnyConstantVRegValWithLookThrough(Src0, *MRI, true, true);
785 if (ConstSrc0) {
786 const int64_t K0 = ConstSrc0->Value.getSExtValue();
787 const int64_t K1 = ConstSrc1->Value.getSExtValue();
788 uint32_t Lo16 = static_cast<uint32_t>(K0) & 0xffff;
789 uint32_t Hi16 = static_cast<uint32_t>(K1) & 0xffff;
790 uint32_t Imm = Lo16 | (Hi16 << 16);
791
792 // VALU
793 if (IsVector) {
794 BuildMI(*BB, &MI, DL, TII.get(AMDGPU::V_MOV_B32_e32), Dst).addImm(Imm);
795 MI.eraseFromParent();
796 return RBI.constrainGenericRegister(Dst, AMDGPU::VGPR_32RegClass, *MRI);
797 }
798
799 // SALU
800 BuildMI(*BB, &MI, DL, TII.get(AMDGPU::S_MOV_B32), Dst).addImm(Imm);
801 MI.eraseFromParent();
802 return RBI.constrainGenericRegister(Dst, AMDGPU::SReg_32RegClass, *MRI);
803 }
804 }
805
806 // Now try TableGen patterns.
807 if (selectImpl(MI, *CoverageInfo))
808 return true;
809
810 // TODO: This should probably be a combine somewhere
811 // (build_vector $src0, undef) -> copy $src0
812 MachineInstr *Src1Def = getDefIgnoringCopies(Src1, *MRI);
813 if (Src1Def->getOpcode() == AMDGPU::G_IMPLICIT_DEF) {
814 MI.setDesc(TII.get(AMDGPU::COPY));
815 MI.removeOperand(2);
816 const auto &RC =
817 IsVector ? AMDGPU::VGPR_32RegClass : AMDGPU::SReg_32RegClass;
818 return RBI.constrainGenericRegister(Dst, RC, *MRI) &&
819 RBI.constrainGenericRegister(Src0, RC, *MRI);
820 }
821
822 // TODO: Can be improved?
823 if (IsVector) {
824 Register TmpReg = MRI->createVirtualRegister(&AMDGPU::VGPR_32RegClass);
825 auto MIB = BuildMI(*BB, MI, DL, TII.get(AMDGPU::V_AND_B32_e32), TmpReg)
826 .addImm(0xFFFF)
827 .addReg(Src0);
828 if (!constrainSelectedInstRegOperands(*MIB, TII, TRI, RBI))
829 return false;
830
831 MIB = BuildMI(*BB, MI, DL, TII.get(AMDGPU::V_LSHL_OR_B32_e64), Dst)
832 .addReg(Src1)
833 .addImm(16)
834 .addReg(TmpReg);
835 if (!constrainSelectedInstRegOperands(*MIB, TII, TRI, RBI))
836 return false;
837
838 MI.eraseFromParent();
839 return true;
840 }
841
842 Register ShiftSrc0;
843 Register ShiftSrc1;
844
845 // With multiple uses of the shift, this will duplicate the shift and
846 // increase register pressure.
847 //
848 // (build_vector (lshr_oneuse $src0, 16), (lshr_oneuse $src1, 16)
849 // => (S_PACK_HH_B32_B16 $src0, $src1)
850 // (build_vector (lshr_oneuse SReg_32:$src0, 16), $src1)
851 // => (S_PACK_HL_B32_B16 $src0, $src1)
852 // (build_vector $src0, (lshr_oneuse SReg_32:$src1, 16))
853 // => (S_PACK_LH_B32_B16 $src0, $src1)
854 // (build_vector $src0, $src1)
855 // => (S_PACK_LL_B32_B16 $src0, $src1)
856
857 bool Shift0 = mi_match(
858 Src0, *MRI, m_OneUse(m_GLShr(m_Reg(ShiftSrc0), m_SpecificICst(16))));
859
860 bool Shift1 = mi_match(
861 Src1, *MRI, m_OneUse(m_GLShr(m_Reg(ShiftSrc1), m_SpecificICst(16))));
862
863 unsigned Opc = AMDGPU::S_PACK_LL_B32_B16;
864 if (Shift0 && Shift1) {
865 Opc = AMDGPU::S_PACK_HH_B32_B16;
866 MI.getOperand(1).setReg(ShiftSrc0);
867 MI.getOperand(2).setReg(ShiftSrc1);
868 } else if (Shift1) {
869 Opc = AMDGPU::S_PACK_LH_B32_B16;
870 MI.getOperand(2).setReg(ShiftSrc1);
871 } else if (Shift0) {
872 auto ConstSrc1 =
873 getAnyConstantVRegValWithLookThrough(Src1, *MRI, true, true);
874 if (ConstSrc1 && ConstSrc1->Value == 0) {
875 // build_vector_trunc (lshr $src0, 16), 0 -> s_lshr_b32 $src0, 16
876 auto MIB = BuildMI(*BB, &MI, DL, TII.get(AMDGPU::S_LSHR_B32), Dst)
877 .addReg(ShiftSrc0)
878 .addImm(16)
879 .setOperandDead(3); // Dead scc
880
881 MI.eraseFromParent();
882 return constrainSelectedInstRegOperands(*MIB, TII, TRI, RBI);
883 }
884 if (STI.hasSPackHL()) {
885 Opc = AMDGPU::S_PACK_HL_B32_B16;
886 MI.getOperand(1).setReg(ShiftSrc0);
887 }
888 }
889
890 MI.setDesc(TII.get(Opc));
891 return constrainSelectedInstRegOperands(MI, TII, TRI, RBI);
892}
893
894bool AMDGPUInstructionSelector::selectG_IMPLICIT_DEF(MachineInstr &I) const {
895 const MachineOperand &MO = I.getOperand(0);
896
897 // FIXME: Interface for getConstrainedRegClassForOperand needs work. The
898 // regbank check here is to know why getConstrainedRegClassForOperand failed.
899 const TargetRegisterClass *RC = TRI.getConstrainedRegClassForOperand(MO, *MRI);
900 if ((!RC && !MRI->getRegBankOrNull(MO.getReg())) ||
901 (RC && RBI.constrainGenericRegister(MO.getReg(), *RC, *MRI))) {
902 I.setDesc(TII.get(TargetOpcode::IMPLICIT_DEF));
903 return true;
904 }
905
906 return false;
907}
908
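// Select G_INSERT as INSERT_SUBREG; only 32-bit-aligned offsets and insert
// sizes of at most 128 bits are handled.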
909bool AMDGPUInstructionSelector::selectG_INSERT(MachineInstr &I) const {
910 MachineBasicBlock *BB = I.getParent();
911
912 Register DstReg = I.getOperand(0).getReg();
913 Register Src0Reg = I.getOperand(1).getReg();
914 Register Src1Reg = I.getOperand(2).getReg();
915 LLT Src1Ty = MRI->getType(Src1Reg);
916
917 unsigned DstSize = MRI->getType(DstReg).getSizeInBits();
918 unsigned InsSize = Src1Ty.getSizeInBits();
919
920 int64_t Offset = I.getOperand(3).getImm();
921
922 // FIXME: These cases should have been illegal and unnecessary to check here.
923 if (Offset % 32 != 0 || InsSize % 32 != 0)
924 return false;
925
926 // Currently not handled by getSubRegFromChannel.
927 if (InsSize > 128)
928 return false;
929
930 unsigned SubReg = TRI.getSubRegFromChannel(Offset / 32, InsSize / 32);
931 if (SubReg == AMDGPU::NoSubRegister)
932 return false;
933
934 const RegisterBank *DstBank = RBI.getRegBank(DstReg, *MRI, TRI);
935 const TargetRegisterClass *DstRC =
936 TRI.getRegClassForSizeOnBank(DstSize, *DstBank);
937 if (!DstRC)
938 return false;
939
940 const RegisterBank *Src0Bank = RBI.getRegBank(Src0Reg, *MRI, TRI);
941 const RegisterBank *Src1Bank = RBI.getRegBank(Src1Reg, *MRI, TRI);
942 const TargetRegisterClass *Src0RC =
943 TRI.getRegClassForSizeOnBank(DstSize, *Src0Bank);
944 const TargetRegisterClass *Src1RC =
945 TRI.getRegClassForSizeOnBank(InsSize, *Src1Bank);
946
947 // Deal with weird cases where the class only partially supports the subreg
948 // index.
949 Src0RC = TRI.getSubClassWithSubReg(Src0RC, SubReg);
950 if (!Src0RC || !Src1RC)
951 return false;
952
953 if (!RBI.constrainGenericRegister(DstReg, *DstRC, *MRI) ||
954 !RBI.constrainGenericRegister(Src0Reg, *Src0RC, *MRI) ||
955 !RBI.constrainGenericRegister(Src1Reg, *Src1RC, *MRI))
956 return false;
957
958 const DebugLoc &DL = I.getDebugLoc();
959 BuildMI(*BB, &I, DL, TII.get(TargetOpcode::INSERT_SUBREG), DstReg)
960 .addReg(Src0Reg)
961 .addReg(Src1Reg)
962 .addImm(SubReg);
963
964 I.eraseFromParent();
965 return true;
966}
967
968bool AMDGPUInstructionSelector::selectG_SBFX_UBFX(MachineInstr &MI) const {
969 Register DstReg = MI.getOperand(0).getReg();
970 Register SrcReg = MI.getOperand(1).getReg();
971 Register OffsetReg = MI.getOperand(2).getReg();
972 Register WidthReg = MI.getOperand(3).getReg();
973
974 assert(RBI.getRegBank(DstReg, *MRI, TRI)->getID() == AMDGPU::VGPRRegBankID &&
975 "scalar BFX instructions are expanded in regbankselect");
976 assert(MRI->getType(MI.getOperand(0).getReg()).getSizeInBits() == 32 &&
977 "64-bit vector BFX instructions are expanded in regbankselect");
978
979 const DebugLoc &DL = MI.getDebugLoc();
980 MachineBasicBlock *MBB = MI.getParent();
981
982 bool IsSigned = MI.getOpcode() == TargetOpcode::G_SBFX;
983 unsigned Opc = IsSigned ? AMDGPU::V_BFE_I32_e64 : AMDGPU::V_BFE_U32_e64;
984 auto MIB = BuildMI(*MBB, &MI, DL, TII.get(Opc), DstReg)
985 .addReg(SrcReg)
986 .addReg(OffsetReg)
987 .addReg(WidthReg);
988 MI.eraseFromParent();
989 return constrainSelectedInstRegOperands(*MIB, TII, TRI, RBI);
990}
991
992bool AMDGPUInstructionSelector::selectInterpP1F16(MachineInstr &MI) const {
993 if (STI.getLDSBankCount() != 16)
994 return selectImpl(MI, *CoverageInfo);
995
996 Register Dst = MI.getOperand(0).getReg();
997 Register Src0 = MI.getOperand(2).getReg();
998 Register M0Val = MI.getOperand(6).getReg();
999 if (!RBI.constrainGenericRegister(M0Val, AMDGPU::SReg_32RegClass, *MRI) ||
1000 !RBI.constrainGenericRegister(Dst, AMDGPU::VGPR_32RegClass, *MRI) ||
1001 !RBI.constrainGenericRegister(Src0, AMDGPU::VGPR_32RegClass, *MRI))
1002 return false;
1003
1004 // This requires 2 instructions. It is possible to write a pattern to support
1005 // this, but the generated isel emitter doesn't correctly deal with multiple
1006 // output instructions using the same physical register input. The copy to m0
1007 // is incorrectly placed before the second instruction.
1008 //
1009 // TODO: Match source modifiers.
1010
1011 Register InterpMov = MRI->createVirtualRegister(&AMDGPU::VGPR_32RegClass);
1012 const DebugLoc &DL = MI.getDebugLoc();
1013 MachineBasicBlock *MBB = MI.getParent();
1014
1015 BuildMI(*MBB, &MI, DL, TII.get(AMDGPU::COPY), AMDGPU::M0)
1016 .addReg(M0Val);
1017 BuildMI(*MBB, &MI, DL, TII.get(AMDGPU::V_INTERP_MOV_F32), InterpMov)
1018 .addImm(2)
1019 .addImm(MI.getOperand(4).getImm()) // $attr
1020 .addImm(MI.getOperand(3).getImm()); // $attrchan
1021
1022 BuildMI(*MBB, &MI, DL, TII.get(AMDGPU::V_INTERP_P1LV_F16), Dst)
1023 .addImm(0) // $src0_modifiers
1024 .addReg(Src0) // $src0
1025 .addImm(MI.getOperand(4).getImm()) // $attr
1026 .addImm(MI.getOperand(3).getImm()) // $attrchan
1027 .addImm(0) // $src2_modifiers
1028 .addReg(InterpMov) // $src2 - 2 f16 values selected by high
1029 .addImm(MI.getOperand(5).getImm()) // $high
1030 .addImm(0) // $clamp
1031 .addImm(0); // $omod
1032
1033 MI.eraseFromParent();
1034 return true;
1035}
1036
1037// Writelane is special in that it can use SGPR and M0 (which would normally
1038// count as using the constant bus twice - but in this case it is allowed since
1039// the lane selector doesn't count as a use of the constant bus). However, it is
1040// still required to abide by the 1 SGPR rule. Fix this up if we might have
1041// multiple SGPRs.
1042bool AMDGPUInstructionSelector::selectWritelane(MachineInstr &MI) const {
1043 // With a constant bus limit of at least 2, there's no issue.
1044 if (STI.getConstantBusLimit(AMDGPU::V_WRITELANE_B32) > 1)
1045 return selectImpl(MI, *CoverageInfo);
1046
1047 MachineBasicBlock *MBB = MI.getParent();
1048 const DebugLoc &DL = MI.getDebugLoc();
1049 Register VDst = MI.getOperand(0).getReg();
1050 Register Val = MI.getOperand(2).getReg();
1051 Register LaneSelect = MI.getOperand(3).getReg();
1052 Register VDstIn = MI.getOperand(4).getReg();
1053
1054 auto MIB = BuildMI(*MBB, &MI, DL, TII.get(AMDGPU::V_WRITELANE_B32), VDst);
1055
1056 std::optional<ValueAndVReg> ConstSelect =
1057 getIConstantVRegValWithLookThrough(LaneSelect, *MRI);
1058 if (ConstSelect) {
1059 // The selector has to be an inline immediate, so we can use whatever for
1060 // the other operands.
1061 MIB.addReg(Val);
1062 MIB.addImm(ConstSelect->Value.getSExtValue() &
1063 maskTrailingOnes<uint64_t>(STI.getWavefrontSizeLog2()));
1064 } else {
1065 std::optional<ValueAndVReg> ConstVal =
1066 getIConstantVRegValWithLookThrough(Val, *MRI);
1067
1068 // If the value written is an inline immediate, we can get away without a
1069 // copy to m0.
1070 if (ConstVal && AMDGPU::isInlinableLiteral32(ConstVal->Value.getSExtValue(),
1071 STI.hasInv2PiInlineImm())) {
1072 MIB.addImm(ConstVal->Value.getSExtValue());
1073 MIB.addReg(LaneSelect);
1074 } else {
1075 MIB.addReg(Val);
1076
1077 // If the lane selector was originally in a VGPR and copied with
1078 // readfirstlane, there's a hazard to read the same SGPR from the
1079 // VALU. Constrain to a different SGPR to help avoid needing a nop later.
1080 RBI.constrainGenericRegister(LaneSelect, AMDGPU::SReg_32_XM0RegClass, *MRI);
1081
1082 BuildMI(*MBB, *MIB, DL, TII.get(AMDGPU::COPY), AMDGPU::M0)
1083 .addReg(LaneSelect);
1084 MIB.addReg(AMDGPU::M0);
1085 }
1086 }
1087
1088 MIB.addReg(VDstIn);
1089
1090 MI.eraseFromParent();
1091 return constrainSelectedInstRegOperands(*MIB, TII, TRI, RBI);
1092}
1093
1094// We need to handle this here because tablegen doesn't support matching
1095// instructions with multiple outputs.
1096bool AMDGPUInstructionSelector::selectDivScale(MachineInstr &MI) const {
1097 Register Dst0 = MI.getOperand(0).getReg();
1098 Register Dst1 = MI.getOperand(1).getReg();
1099
1100 LLT Ty = MRI->getType(Dst0);
1101 unsigned Opc;
1102 if (Ty == LLT::scalar(32))
1103 Opc = AMDGPU::V_DIV_SCALE_F32_e64;
1104 else if (Ty == LLT::scalar(64))
1105 Opc = AMDGPU::V_DIV_SCALE_F64_e64;
1106 else
1107 return false;
1108
1109 // TODO: Match source modifiers.
1110
1111 const DebugLoc &DL = MI.getDebugLoc();
1112 MachineBasicBlock *MBB = MI.getParent();
1113
1114 Register Numer = MI.getOperand(3).getReg();
1115 Register Denom = MI.getOperand(4).getReg();
1116 unsigned ChooseDenom = MI.getOperand(5).getImm();
1117
1118 Register Src0 = ChooseDenom != 0 ? Numer : Denom;
1119
1120 auto MIB = BuildMI(*MBB, &MI, DL, TII.get(Opc), Dst0)
1121 .addDef(Dst1)
1122 .addImm(0) // $src0_modifiers
1123 .addUse(Src0) // $src0
1124 .addImm(0) // $src1_modifiers
1125 .addUse(Denom) // $src1
1126 .addImm(0) // $src2_modifiers
1127 .addUse(Numer) // $src2
1128 .addImm(0) // $clamp
1129 .addImm(0); // $omod
1130
1131 MI.eraseFromParent();
1132 return constrainSelectedInstRegOperands(*MIB, TII, TRI, RBI);
1133}
1134
1135bool AMDGPUInstructionSelector::selectG_INTRINSIC(MachineInstr &I) const {
1136 Intrinsic::ID IntrinsicID = cast<GIntrinsic>(I).getIntrinsicID();
1137 switch (IntrinsicID) {
1138 case Intrinsic::amdgcn_if_break: {
1139 MachineBasicBlock *BB = I.getParent();
1140
1141 // FIXME: Manually selecting to avoid dealing with the SReg_1 trick
1142 // SelectionDAG uses for wave32 vs wave64.
1143 BuildMI(*BB, &I, I.getDebugLoc(), TII.get(AMDGPU::SI_IF_BREAK))
1144 .add(I.getOperand(0))
1145 .add(I.getOperand(2))
1146 .add(I.getOperand(3));
1147
1148 Register DstReg = I.getOperand(0).getReg();
1149 Register Src0Reg = I.getOperand(2).getReg();
1150 Register Src1Reg = I.getOperand(3).getReg();
1151
1152 I.eraseFromParent();
1153
1154 for (Register Reg : { DstReg, Src0Reg, Src1Reg })
1155 MRI->setRegClass(Reg, TRI.getWaveMaskRegClass());
1156
1157 return true;
1158 }
1159 case Intrinsic::amdgcn_interp_p1_f16:
1160 return selectInterpP1F16(I);
1161 case Intrinsic::amdgcn_wqm:
1162 return constrainCopyLikeIntrin(I, AMDGPU::WQM);
1163 case Intrinsic::amdgcn_softwqm:
1164 return constrainCopyLikeIntrin(I, AMDGPU::SOFT_WQM);
1165 case Intrinsic::amdgcn_strict_wwm:
1166 case Intrinsic::amdgcn_wwm:
1167 return constrainCopyLikeIntrin(I, AMDGPU::STRICT_WWM);
1168 case Intrinsic::amdgcn_strict_wqm:
1169 return constrainCopyLikeIntrin(I, AMDGPU::STRICT_WQM);
1170 case Intrinsic::amdgcn_writelane:
1171 return selectWritelane(I);
1172 case Intrinsic::amdgcn_div_scale:
1173 return selectDivScale(I);
1174 case Intrinsic::amdgcn_icmp:
1175 case Intrinsic::amdgcn_fcmp:
1176 if (selectImpl(I, *CoverageInfo))
1177 return true;
1178 return selectIntrinsicCmp(I);
1179 case Intrinsic::amdgcn_ballot:
1180 return selectBallot(I);
1181 case Intrinsic::amdgcn_reloc_constant:
1182 return selectRelocConstant(I);
1183 case Intrinsic::amdgcn_groupstaticsize:
1184 return selectGroupStaticSize(I);
1185 case Intrinsic::returnaddress:
1186 return selectReturnAddress(I);
1187 case Intrinsic::amdgcn_smfmac_f32_16x16x32_f16:
1188 case Intrinsic::amdgcn_smfmac_f32_32x32x16_f16:
1189 case Intrinsic::amdgcn_smfmac_f32_16x16x32_bf16:
1190 case Intrinsic::amdgcn_smfmac_f32_32x32x16_bf16:
1191 case Intrinsic::amdgcn_smfmac_i32_16x16x64_i8:
1192 case Intrinsic::amdgcn_smfmac_i32_32x32x32_i8:
1193 case Intrinsic::amdgcn_smfmac_f32_16x16x64_bf8_bf8:
1194 case Intrinsic::amdgcn_smfmac_f32_16x16x64_bf8_fp8:
1195 case Intrinsic::amdgcn_smfmac_f32_16x16x64_fp8_bf8:
1196 case Intrinsic::amdgcn_smfmac_f32_16x16x64_fp8_fp8:
1197 case Intrinsic::amdgcn_smfmac_f32_32x32x32_bf8_bf8:
1198 case Intrinsic::amdgcn_smfmac_f32_32x32x32_bf8_fp8:
1199 case Intrinsic::amdgcn_smfmac_f32_32x32x32_fp8_bf8:
1200 case Intrinsic::amdgcn_smfmac_f32_32x32x32_fp8_fp8:
1201 case Intrinsic::amdgcn_smfmac_f32_16x16x64_f16:
1202 case Intrinsic::amdgcn_smfmac_f32_32x32x32_f16:
1203 case Intrinsic::amdgcn_smfmac_f32_16x16x64_bf16:
1204 case Intrinsic::amdgcn_smfmac_f32_32x32x32_bf16:
1205 case Intrinsic::amdgcn_smfmac_i32_16x16x128_i8:
1206 case Intrinsic::amdgcn_smfmac_i32_32x32x64_i8:
1207 case Intrinsic::amdgcn_smfmac_f32_16x16x128_bf8_bf8:
1208 case Intrinsic::amdgcn_smfmac_f32_16x16x128_bf8_fp8:
1209 case Intrinsic::amdgcn_smfmac_f32_16x16x128_fp8_bf8:
1210 case Intrinsic::amdgcn_smfmac_f32_16x16x128_fp8_fp8:
1211 case Intrinsic::amdgcn_smfmac_f32_32x32x64_bf8_bf8:
1212 case Intrinsic::amdgcn_smfmac_f32_32x32x64_bf8_fp8:
1213 case Intrinsic::amdgcn_smfmac_f32_32x32x64_fp8_bf8:
1214 case Intrinsic::amdgcn_smfmac_f32_32x32x64_fp8_fp8:
1215 return selectSMFMACIntrin(I);
1216 case Intrinsic::amdgcn_permlane16_swap:
1217 case Intrinsic::amdgcn_permlane32_swap:
1218 return selectPermlaneSwapIntrin(I, IntrinsicID);
1219 default:
1220 return selectImpl(I, *CoverageInfo);
1221 }
1222}
1223
1224static int getV_CMPOpcode(CmpInst::Predicate P, unsigned Size,
1225 const GCNSubtarget &ST) {
1226 if (Size != 16 && Size != 32 && Size != 64)
1227 return -1;
1228
1229 if (Size == 16 && !ST.has16BitInsts())
1230 return -1;
1231
1232 const auto Select = [&](unsigned S16Opc, unsigned TrueS16Opc,
1233 unsigned FakeS16Opc, unsigned S32Opc,
1234 unsigned S64Opc) {
1235 if (Size == 16)
1236 return ST.hasTrue16BitInsts()
1237 ? ST.useRealTrue16Insts() ? TrueS16Opc : FakeS16Opc
1238 : S16Opc;
1239 if (Size == 32)
1240 return S32Opc;
1241 return S64Opc;
1242 };
1243
1244 switch (P) {
1245 default:
1246 llvm_unreachable("Unknown condition code!");
1247 case CmpInst::ICMP_NE:
1248 return Select(AMDGPU::V_CMP_NE_U16_e64, AMDGPU::V_CMP_NE_U16_t16_e64,
1249 AMDGPU::V_CMP_NE_U16_fake16_e64, AMDGPU::V_CMP_NE_U32_e64,
1250 AMDGPU::V_CMP_NE_U64_e64);
1251 case CmpInst::ICMP_EQ:
1252 return Select(AMDGPU::V_CMP_EQ_U16_e64, AMDGPU::V_CMP_EQ_U16_t16_e64,
1253 AMDGPU::V_CMP_EQ_U16_fake16_e64, AMDGPU::V_CMP_EQ_U32_e64,
1254 AMDGPU::V_CMP_EQ_U64_e64);
1255 case CmpInst::ICMP_SGT:
1256 return Select(AMDGPU::V_CMP_GT_I16_e64, AMDGPU::V_CMP_GT_I16_t16_e64,
1257 AMDGPU::V_CMP_GT_I16_fake16_e64, AMDGPU::V_CMP_GT_I32_e64,
1258 AMDGPU::V_CMP_GT_I64_e64);
1259 case CmpInst::ICMP_SGE:
1260 return Select(AMDGPU::V_CMP_GE_I16_e64, AMDGPU::V_CMP_GE_I16_t16_e64,
1261 AMDGPU::V_CMP_GE_I16_fake16_e64, AMDGPU::V_CMP_GE_I32_e64,
1262 AMDGPU::V_CMP_GE_I64_e64);
1263 case CmpInst::ICMP_SLT:
1264 return Select(AMDGPU::V_CMP_LT_I16_e64, AMDGPU::V_CMP_LT_I16_t16_e64,
1265 AMDGPU::V_CMP_LT_I16_fake16_e64, AMDGPU::V_CMP_LT_I32_e64,
1266 AMDGPU::V_CMP_LT_I64_e64);
1267 case CmpInst::ICMP_SLE:
1268 return Select(AMDGPU::V_CMP_LE_I16_e64, AMDGPU::V_CMP_LE_I16_t16_e64,
1269 AMDGPU::V_CMP_LE_I16_fake16_e64, AMDGPU::V_CMP_LE_I32_e64,
1270 AMDGPU::V_CMP_LE_I64_e64);
1271 case CmpInst::ICMP_UGT:
1272 return Select(AMDGPU::V_CMP_GT_U16_e64, AMDGPU::V_CMP_GT_U16_t16_e64,
1273 AMDGPU::V_CMP_GT_U16_fake16_e64, AMDGPU::V_CMP_GT_U32_e64,
1274 AMDGPU::V_CMP_GT_U64_e64);
1275 case CmpInst::ICMP_UGE:
1276 return Select(AMDGPU::V_CMP_GE_U16_e64, AMDGPU::V_CMP_GE_U16_t16_e64,
1277 AMDGPU::V_CMP_GE_U16_fake16_e64, AMDGPU::V_CMP_GE_U32_e64,
1278 AMDGPU::V_CMP_GE_U64_e64);
1279 case CmpInst::ICMP_ULT:
1280 return Select(AMDGPU::V_CMP_LT_U16_e64, AMDGPU::V_CMP_LT_U16_t16_e64,
1281 AMDGPU::V_CMP_LT_U16_fake16_e64, AMDGPU::V_CMP_LT_U32_e64,
1282 AMDGPU::V_CMP_LT_U64_e64);
1283 case CmpInst::ICMP_ULE:
1284 return Select(AMDGPU::V_CMP_LE_U16_e64, AMDGPU::V_CMP_LE_U16_t16_e64,
1285 AMDGPU::V_CMP_LE_U16_fake16_e64, AMDGPU::V_CMP_LE_U32_e64,
1286 AMDGPU::V_CMP_LE_U64_e64);
1287
1288 case CmpInst::FCMP_OEQ:
1289 return Select(AMDGPU::V_CMP_EQ_F16_e64, AMDGPU::V_CMP_EQ_F16_t16_e64,
1290 AMDGPU::V_CMP_EQ_F16_fake16_e64, AMDGPU::V_CMP_EQ_F32_e64,
1291 AMDGPU::V_CMP_EQ_F64_e64);
1292 case CmpInst::FCMP_OGT:
1293 return Select(AMDGPU::V_CMP_GT_F16_e64, AMDGPU::V_CMP_GT_F16_t16_e64,
1294 AMDGPU::V_CMP_GT_F16_fake16_e64, AMDGPU::V_CMP_GT_F32_e64,
1295 AMDGPU::V_CMP_GT_F64_e64);
1296 case CmpInst::FCMP_OGE:
1297 return Select(AMDGPU::V_CMP_GE_F16_e64, AMDGPU::V_CMP_GE_F16_t16_e64,
1298 AMDGPU::V_CMP_GE_F16_fake16_e64, AMDGPU::V_CMP_GE_F32_e64,
1299 AMDGPU::V_CMP_GE_F64_e64);
1300 case CmpInst::FCMP_OLT:
1301 return Select(AMDGPU::V_CMP_LT_F16_e64, AMDGPU::V_CMP_LT_F16_t16_e64,
1302 AMDGPU::V_CMP_LT_F16_fake16_e64, AMDGPU::V_CMP_LT_F32_e64,
1303 AMDGPU::V_CMP_LT_F64_e64);
1304 case CmpInst::FCMP_OLE:
1305 return Select(AMDGPU::V_CMP_LE_F16_e64, AMDGPU::V_CMP_LE_F16_t16_e64,
1306 AMDGPU::V_CMP_LE_F16_fake16_e64, AMDGPU::V_CMP_LE_F32_e64,
1307 AMDGPU::V_CMP_LE_F64_e64);
1308 case CmpInst::FCMP_ONE:
1309 return Select(AMDGPU::V_CMP_NEQ_F16_e64, AMDGPU::V_CMP_NEQ_F16_t16_e64,
1310 AMDGPU::V_CMP_NEQ_F16_fake16_e64, AMDGPU::V_CMP_NEQ_F32_e64,
1311 AMDGPU::V_CMP_NEQ_F64_e64);
1312 case CmpInst::FCMP_ORD:
1313 return Select(AMDGPU::V_CMP_O_F16_e64, AMDGPU::V_CMP_O_F16_t16_e64,
1314 AMDGPU::V_CMP_O_F16_fake16_e64, AMDGPU::V_CMP_O_F32_e64,
1315 AMDGPU::V_CMP_O_F64_e64);
1316 case CmpInst::FCMP_UNO:
1317 return Select(AMDGPU::V_CMP_U_F16_e64, AMDGPU::V_CMP_U_F16_t16_e64,
1318 AMDGPU::V_CMP_U_F16_fake16_e64, AMDGPU::V_CMP_U_F32_e64,
1319 AMDGPU::V_CMP_U_F64_e64);
1320 case CmpInst::FCMP_UEQ:
1321 return Select(AMDGPU::V_CMP_NLG_F16_e64, AMDGPU::V_CMP_NLG_F16_t16_e64,
1322 AMDGPU::V_CMP_NLG_F16_fake16_e64, AMDGPU::V_CMP_NLG_F32_e64,
1323 AMDGPU::V_CMP_NLG_F64_e64);
1324 case CmpInst::FCMP_UGT:
1325 return Select(AMDGPU::V_CMP_NLE_F16_e64, AMDGPU::V_CMP_NLE_F16_t16_e64,
1326 AMDGPU::V_CMP_NLE_F16_fake16_e64, AMDGPU::V_CMP_NLE_F32_e64,
1327 AMDGPU::V_CMP_NLE_F64_e64);
1328 case CmpInst::FCMP_UGE:
1329 return Select(AMDGPU::V_CMP_NLT_F16_e64, AMDGPU::V_CMP_NLT_F16_t16_e64,
1330 AMDGPU::V_CMP_NLT_F16_fake16_e64, AMDGPU::V_CMP_NLT_F32_e64,
1331 AMDGPU::V_CMP_NLT_F64_e64);
1332 case CmpInst::FCMP_ULT:
1333 return Select(AMDGPU::V_CMP_NGE_F16_e64, AMDGPU::V_CMP_NGE_F16_t16_e64,
1334 AMDGPU::V_CMP_NGE_F16_fake16_e64, AMDGPU::V_CMP_NGE_F32_e64,
1335 AMDGPU::V_CMP_NGE_F64_e64);
1336 case CmpInst::FCMP_ULE:
1337 return Select(AMDGPU::V_CMP_NGT_F16_e64, AMDGPU::V_CMP_NGT_F16_t16_e64,
1338 AMDGPU::V_CMP_NGT_F16_fake16_e64, AMDGPU::V_CMP_NGT_F32_e64,
1339 AMDGPU::V_CMP_NGT_F64_e64);
1340 case CmpInst::FCMP_UNE:
1341 return Select(AMDGPU::V_CMP_NEQ_F16_e64, AMDGPU::V_CMP_NEQ_F16_t16_e64,
1342 AMDGPU::V_CMP_NEQ_F16_fake16_e64, AMDGPU::V_CMP_NEQ_F32_e64,
1343 AMDGPU::V_CMP_NEQ_F64_e64);
1344 case CmpInst::FCMP_TRUE:
1345 return Select(AMDGPU::V_CMP_TRU_F16_e64, AMDGPU::V_CMP_TRU_F16_t16_e64,
1346 AMDGPU::V_CMP_TRU_F16_fake16_e64, AMDGPU::V_CMP_TRU_F32_e64,
1347 AMDGPU::V_CMP_TRU_F64_e64);
1348 case CmpInst::FCMP_FALSE:
1349 return Select(AMDGPU::V_CMP_F_F16_e64, AMDGPU::V_CMP_F_F16_t16_e64,
1350 AMDGPU::V_CMP_F_F16_fake16_e64, AMDGPU::V_CMP_F_F32_e64,
1351 AMDGPU::V_CMP_F_F64_e64);
1352 }
1353}
1354
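// Return the scalar (SALU) compare opcode for the given predicate and operand
// size, or -1 if no scalar form is available.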
1355int AMDGPUInstructionSelector::getS_CMPOpcode(CmpInst::Predicate P,
1356 unsigned Size) const {
1357 if (Size == 64) {
1358 if (!STI.hasScalarCompareEq64())
1359 return -1;
1360
1361 switch (P) {
1362 case CmpInst::ICMP_NE:
1363 return AMDGPU::S_CMP_LG_U64;
1364 case CmpInst::ICMP_EQ:
1365 return AMDGPU::S_CMP_EQ_U64;
1366 default:
1367 return -1;
1368 }
1369 }
1370
1371 if (Size == 32) {
1372 switch (P) {
1373 case CmpInst::ICMP_NE:
1374 return AMDGPU::S_CMP_LG_U32;
1375 case CmpInst::ICMP_EQ:
1376 return AMDGPU::S_CMP_EQ_U32;
1377 case CmpInst::ICMP_SGT:
1378 return AMDGPU::S_CMP_GT_I32;
1379 case CmpInst::ICMP_SGE:
1380 return AMDGPU::S_CMP_GE_I32;
1381 case CmpInst::ICMP_SLT:
1382 return AMDGPU::S_CMP_LT_I32;
1383 case CmpInst::ICMP_SLE:
1384 return AMDGPU::S_CMP_LE_I32;
1385 case CmpInst::ICMP_UGT:
1386 return AMDGPU::S_CMP_GT_U32;
1387 case CmpInst::ICMP_UGE:
1388 return AMDGPU::S_CMP_GE_U32;
1389 case CmpInst::ICMP_ULT:
1390 return AMDGPU::S_CMP_LT_U32;
1391 case CmpInst::ICMP_ULE:
1392 return AMDGPU::S_CMP_LE_U32;
1393 case CmpInst::FCMP_OEQ:
1394 return AMDGPU::S_CMP_EQ_F32;
1395 case CmpInst::FCMP_OGT:
1396 return AMDGPU::S_CMP_GT_F32;
1397 case CmpInst::FCMP_OGE:
1398 return AMDGPU::S_CMP_GE_F32;
1399 case CmpInst::FCMP_OLT:
1400 return AMDGPU::S_CMP_LT_F32;
1401 case CmpInst::FCMP_OLE:
1402 return AMDGPU::S_CMP_LE_F32;
1403 case CmpInst::FCMP_ONE:
1404 return AMDGPU::S_CMP_LG_F32;
1405 case CmpInst::FCMP_ORD:
1406 return AMDGPU::S_CMP_O_F32;
1407 case CmpInst::FCMP_UNO:
1408 return AMDGPU::S_CMP_U_F32;
1409 case CmpInst::FCMP_UEQ:
1410 return AMDGPU::S_CMP_NLG_F32;
1411 case CmpInst::FCMP_UGT:
1412 return AMDGPU::S_CMP_NLE_F32;
1413 case CmpInst::FCMP_UGE:
1414 return AMDGPU::S_CMP_NLT_F32;
1415 case CmpInst::FCMP_ULT:
1416 return AMDGPU::S_CMP_NGE_F32;
1417 case CmpInst::FCMP_ULE:
1418 return AMDGPU::S_CMP_NGT_F32;
1419 case CmpInst::FCMP_UNE:
1420 return AMDGPU::S_CMP_NEQ_F32;
1421 default:
1422 llvm_unreachable("Unknown condition code!");
1423 }
1424 }
1425
1426 if (Size == 16) {
1427 if (!STI.hasSALUFloatInsts())
1428 return -1;
1429
1430 switch (P) {
1431 case CmpInst::FCMP_OEQ:
1432 return AMDGPU::S_CMP_EQ_F16;
1433 case CmpInst::FCMP_OGT:
1434 return AMDGPU::S_CMP_GT_F16;
1435 case CmpInst::FCMP_OGE:
1436 return AMDGPU::S_CMP_GE_F16;
1437 case CmpInst::FCMP_OLT:
1438 return AMDGPU::S_CMP_LT_F16;
1439 case CmpInst::FCMP_OLE:
1440 return AMDGPU::S_CMP_LE_F16;
1441 case CmpInst::FCMP_ONE:
1442 return AMDGPU::S_CMP_LG_F16;
1443 case CmpInst::FCMP_ORD:
1444 return AMDGPU::S_CMP_O_F16;
1445 case CmpInst::FCMP_UNO:
1446 return AMDGPU::S_CMP_U_F16;
1447 case CmpInst::FCMP_UEQ:
1448 return AMDGPU::S_CMP_NLG_F16;
1449 case CmpInst::FCMP_UGT:
1450 return AMDGPU::S_CMP_NLE_F16;
1451 case CmpInst::FCMP_UGE:
1452 return AMDGPU::S_CMP_NLT_F16;
1453 case CmpInst::FCMP_ULT:
1454 return AMDGPU::S_CMP_NGE_F16;
1455 case CmpInst::FCMP_ULE:
1456 return AMDGPU::S_CMP_NGT_F16;
1457 case CmpInst::FCMP_UNE:
1458 return AMDGPU::S_CMP_NEQ_F16;
1459 default:
1460 llvm_unreachable("Unknown condition code!");
1461 }
1462 }
1463
1464 return -1;
1465}
1466
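// Select G_ICMP/G_FCMP: an SGPR (SCC) result uses an S_CMP followed by a copy
// from SCC; a VCC result uses the corresponding V_CMP.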
1467bool AMDGPUInstructionSelector::selectG_ICMP_or_FCMP(MachineInstr &I) const {
1468
1469 MachineBasicBlock *BB = I.getParent();
1470 const DebugLoc &DL = I.getDebugLoc();
1471
1472 Register SrcReg = I.getOperand(2).getReg();
1473 unsigned Size = RBI.getSizeInBits(SrcReg, *MRI, TRI);
1474
1475 auto Pred = (CmpInst::Predicate)I.getOperand(1).getPredicate();
1476
1477 Register CCReg = I.getOperand(0).getReg();
1478 if (!isVCC(CCReg, *MRI)) {
1479 int Opcode = getS_CMPOpcode(Pred, Size);
1480 if (Opcode == -1)
1481 return false;
1482 MachineInstr *ICmp = BuildMI(*BB, &I, DL, TII.get(Opcode))
1483 .add(I.getOperand(2))
1484 .add(I.getOperand(3));
1485 BuildMI(*BB, &I, DL, TII.get(AMDGPU::COPY), CCReg)
1486 .addReg(AMDGPU::SCC);
1487 bool Ret =
1488 constrainSelectedInstRegOperands(*ICmp, TII, TRI, RBI) &&
1489 RBI.constrainGenericRegister(CCReg, AMDGPU::SReg_32RegClass, *MRI);
1490 I.eraseFromParent();
1491 return Ret;
1492 }
1493
1494 if (I.getOpcode() == AMDGPU::G_FCMP)
1495 return false;
1496
1497 int Opcode = getV_CMPOpcode(Pred, Size, *Subtarget);
1498 if (Opcode == -1)
1499 return false;
1500
1501 MachineInstrBuilder ICmp;
1502 // t16 instructions
1503 if (AMDGPU::hasNamedOperand(Opcode, AMDGPU::OpName::src0_modifiers)) {
1504 ICmp = BuildMI(*BB, &I, DL, TII.get(Opcode), I.getOperand(0).getReg())
1505 .addImm(0)
1506 .add(I.getOperand(2))
1507 .addImm(0)
1508 .add(I.getOperand(3))
1509 .addImm(0); // op_sel
1510 } else {
1511 ICmp = BuildMI(*BB, &I, DL, TII.get(Opcode), I.getOperand(0).getReg())
1512 .add(I.getOperand(2))
1513 .add(I.getOperand(3));
1514 }
1515
1516 RBI.constrainGenericRegister(ICmp->getOperand(0).getReg(),
1517 *TRI.getBoolRC(), *MRI);
1518 bool Ret = constrainSelectedInstRegOperands(*ICmp, TII, TRI, RBI);
1519 I.eraseFromParent();
1520 return Ret;
1521}
1522
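// Select amdgcn.icmp/amdgcn.fcmp, which produce a full wave-sized mask rather
// than a VCC-bank boolean; unsupported predicates become IMPLICIT_DEF.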
1523bool AMDGPUInstructionSelector::selectIntrinsicCmp(MachineInstr &I) const {
1524 Register Dst = I.getOperand(0).getReg();
1525 if (isVCC(Dst, *MRI))
1526 return false;
1527
1528 LLT DstTy = MRI->getType(Dst);
1529 if (DstTy.getSizeInBits() != STI.getWavefrontSize())
1530 return false;
1531
1532 MachineBasicBlock *BB = I.getParent();
1533 const DebugLoc &DL = I.getDebugLoc();
1534 Register SrcReg = I.getOperand(2).getReg();
1535 unsigned Size = RBI.getSizeInBits(SrcReg, *MRI, TRI);
1536
1537 // i1 inputs are not supported in GlobalISel.
1538 if (Size == 1)
1539 return false;
1540
1541 auto Pred = static_cast<CmpInst::Predicate>(I.getOperand(4).getImm());
1542 if (!CmpInst::isIntPredicate(Pred) && !CmpInst::isFPPredicate(Pred)) {
1543 BuildMI(*BB, &I, DL, TII.get(AMDGPU::IMPLICIT_DEF), Dst);
1544 I.eraseFromParent();
1545 return RBI.constrainGenericRegister(Dst, *TRI.getBoolRC(), *MRI);
1546 }
1547
1548 const int Opcode = getV_CMPOpcode(Pred, Size, *Subtarget);
1549 if (Opcode == -1)
1550 return false;
1551
1552 MachineInstrBuilder SelectedMI;
1553 MachineOperand &LHS = I.getOperand(2);
1554 MachineOperand &RHS = I.getOperand(3);
1555 auto [Src0, Src0Mods] = selectVOP3ModsImpl(LHS.getReg());
1556 auto [Src1, Src1Mods] = selectVOP3ModsImpl(RHS.getReg());
1557 Register Src0Reg =
1558 copyToVGPRIfSrcFolded(Src0, Src0Mods, LHS, &I, /*ForceVGPR*/ true);
1559 Register Src1Reg =
1560 copyToVGPRIfSrcFolded(Src1, Src1Mods, RHS, &I, /*ForceVGPR*/ true);
1561 SelectedMI = BuildMI(*BB, &I, DL, TII.get(Opcode), Dst);
1562 if (AMDGPU::hasNamedOperand(Opcode, AMDGPU::OpName::src0_modifiers))
1563 SelectedMI.addImm(Src0Mods);
1564 SelectedMI.addReg(Src0Reg);
1565 if (AMDGPU::hasNamedOperand(Opcode, AMDGPU::OpName::src1_modifiers))
1566 SelectedMI.addImm(Src1Mods);
1567 SelectedMI.addReg(Src1Reg);
1568 if (AMDGPU::hasNamedOperand(Opcode, AMDGPU::OpName::clamp))
1569 SelectedMI.addImm(0); // clamp
1570 if (AMDGPU::hasNamedOperand(Opcode, AMDGPU::OpName::op_sel))
1571 SelectedMI.addImm(0); // op_sel
1572
1573 RBI.constrainGenericRegister(Dst, *TRI.getBoolRC(), *MRI);
1574 if (!constrainSelectedInstRegOperands(*SelectedMI, TII, TRI, RBI))
1575 return false;
1576
1577 I.eraseFromParent();
1578 return true;
1579}
1580
1581// Ballot has to zero bits in input lane-mask that are zero in current exec,
1582// Done as AND with exec. For inputs that are results of instruction that
1583// implicitly use same exec, for example compares in same basic block or SCC to
1584// VCC copy, use copy.
1585static bool isLaneMaskFromSameBlock(Register Reg, MachineRegisterInfo &MRI,
1586 MachineBasicBlock *MBB) {
1587 MachineInstr *MI = MRI.getVRegDef(Reg);
1588 if (MI->getParent() != MBB)
1589 return false;
1590
1591 // Lane mask generated by SCC to VCC copy.
1592 if (MI->getOpcode() == AMDGPU::COPY) {
1593 auto DstRB = MRI.getRegBankOrNull(MI->getOperand(0).getReg());
1594 auto SrcRB = MRI.getRegBankOrNull(MI->getOperand(1).getReg());
1595 if (DstRB && SrcRB && DstRB->getID() == AMDGPU::VCCRegBankID &&
1596 SrcRB->getID() == AMDGPU::SGPRRegBankID)
1597 return true;
1598 }
1599
1600 // Lane mask generated using compare with same exec.
1601 if (isa<GAnyCmp>(MI))
1602 return true;
1603
1604 Register LHS, RHS;
1605 // Look through AND.
1606 if (mi_match(Reg, MRI, m_GAnd(m_Reg(LHS), m_Reg(RHS))))
1607 return isLaneMaskFromSameBlock(LHS, MRI, MBB) ||
1608 isLaneMaskFromSameBlock(RHS, MRI, MBB);
1609
1610 return false;
1611}
1612
1613bool AMDGPUInstructionSelector::selectBallot(MachineInstr &I) const {
1614 MachineBasicBlock *BB = I.getParent();
1615 const DebugLoc &DL = I.getDebugLoc();
1616 Register DstReg = I.getOperand(0).getReg();
1617 Register SrcReg = I.getOperand(2).getReg();
1618 const unsigned BallotSize = MRI->getType(DstReg).getSizeInBits();
1619 const unsigned WaveSize = STI.getWavefrontSize();
1620
1621 // In the common case, the return type matches the wave size.
1622 // However, we also support emitting i64 ballots in wave32 mode.
1623 if (BallotSize != WaveSize && (BallotSize != 64 || WaveSize != 32))
1624 return false;
1625
1626 std::optional<ValueAndVReg> Arg =
1627 getIConstantVRegValWithLookThrough(I.getOperand(2).getReg(), *MRI);
1628
1629 Register Dst = DstReg;
1630 // i64 ballot on Wave32: build the wave-sized (i32) ballot into a new Dst first.
1631 if (BallotSize != WaveSize) {
1632 Dst = MRI->createVirtualRegister(TRI.getBoolRC());
1633 }
1634
1635 if (Arg) {
1636 const int64_t Value = Arg->Value.getZExtValue();
1637 if (Value == 0) {
1638 // Dst = S_MOV 0
1639 unsigned Opcode = WaveSize == 64 ? AMDGPU::S_MOV_B64 : AMDGPU::S_MOV_B32;
1640 BuildMI(*BB, &I, DL, TII.get(Opcode), Dst).addImm(0);
1641 } else {
1642 // Dst = COPY EXEC
1643 assert(Value == 1);
1644 BuildMI(*BB, &I, DL, TII.get(AMDGPU::COPY), Dst).addReg(TRI.getExec());
1645 }
1646 if (!RBI.constrainGenericRegister(Dst, *TRI.getBoolRC(), *MRI))
1647 return false;
1648 } else {
1649 if (isLaneMaskFromSameBlock(SrcReg, *MRI, BB)) {
1650 // Dst = COPY SrcReg
1651 BuildMI(*BB, &I, DL, TII.get(AMDGPU::COPY), Dst).addReg(SrcReg);
1652 if (!RBI.constrainGenericRegister(Dst, *TRI.getBoolRC(), *MRI))
1653 return false;
1654 } else {
1655 // Dst = S_AND SrcReg, EXEC
1656 unsigned AndOpc = WaveSize == 64 ? AMDGPU::S_AND_B64 : AMDGPU::S_AND_B32;
1657 auto And = BuildMI(*BB, &I, DL, TII.get(AndOpc), Dst)
1658 .addReg(SrcReg)
1659 .addReg(TRI.getExec())
1660 .setOperandDead(3); // Dead scc
1661 if (!constrainSelectedInstRegOperands(*And, TII, TRI, RBI))
1662 return false;
1663 }
1664 }
1665
1666 // i64 ballot on Wave32: zero-extend i32 ballot to i64.
1667 if (BallotSize != WaveSize) {
1668 Register HiReg = MRI->createVirtualRegister(&AMDGPU::SReg_32RegClass);
1669 BuildMI(*BB, &I, DL, TII.get(AMDGPU::S_MOV_B32), HiReg).addImm(0);
1670 BuildMI(*BB, &I, DL, TII.get(AMDGPU::REG_SEQUENCE), DstReg)
1671 .addReg(Dst)
1672 .addImm(AMDGPU::sub0)
1673 .addReg(HiReg)
1674 .addImm(AMDGPU::sub1);
1675 }
1676
1677 I.eraseFromParent();
1678 return true;
1679}
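// A rough sketch of what the cases above emit (illustrative only; register
// classes and exec register depend on the wave size):
//   ballot(i1 true)  wave64: %dst:sreg_64 = COPY $exec
//   ballot(i1 false) wave64: %dst:sreg_64 = S_MOV_B64 0
//   ballot(%v)       wave32: %dst:sreg_32 = S_AND_B32 %v, $exec_lo, implicit-def dead $scc
//   An i64 ballot on wave32 then widens the i32 result with a REG_SEQUENCE
//   whose high half is an S_MOV_B32 0.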
1680
1681bool AMDGPUInstructionSelector::selectRelocConstant(MachineInstr &I) const {
1682 Register DstReg = I.getOperand(0).getReg();
1683 const RegisterBank *DstBank = RBI.getRegBank(DstReg, *MRI, TRI);
1684 const TargetRegisterClass *DstRC = TRI.getRegClassForSizeOnBank(32, *DstBank);
1685 if (!DstRC || !RBI.constrainGenericRegister(DstReg, *DstRC, *MRI))
1686 return false;
1687
1688 const bool IsVALU = DstBank->getID() == AMDGPU::VGPRRegBankID;
1689
1690 Module *M = MF->getFunction().getParent();
1691 const MDNode *Metadata = I.getOperand(2).getMetadata();
1692 auto SymbolName = cast<MDString>(Metadata->getOperand(0))->getString();
1693 auto *RelocSymbol = cast<GlobalVariable>(
1694 M->getOrInsertGlobal(SymbolName, Type::getInt32Ty(M->getContext())));
1695
1696 MachineBasicBlock *BB = I.getParent();
1697 BuildMI(*BB, &I, I.getDebugLoc(),
1698 TII.get(IsVALU ? AMDGPU::V_MOV_B32_e32 : AMDGPU::S_MOV_B32), DstReg)
1699 .addGlobalAddress(RelocSymbol, 0, SIInstrInfo::MO_ABS32_LO);
1700
1701 I.eraseFromParent();
1702 return true;
1703}
1704
1705bool AMDGPUInstructionSelector::selectGroupStaticSize(MachineInstr &I) const {
1706 Triple::OSType OS = MF->getTarget().getTargetTriple().getOS();
1707
1708 Register DstReg = I.getOperand(0).getReg();
1709 const RegisterBank *DstRB = RBI.getRegBank(DstReg, *MRI, TRI);
1710 unsigned Mov = DstRB->getID() == AMDGPU::SGPRRegBankID ?
1711 AMDGPU::S_MOV_B32 : AMDGPU::V_MOV_B32_e32;
1712
1713 MachineBasicBlock *MBB = I.getParent();
1714 const DebugLoc &DL = I.getDebugLoc();
1715
1716 auto MIB = BuildMI(*MBB, &I, DL, TII.get(Mov), DstReg);
1717
1718 if (OS == Triple::AMDHSA || OS == Triple::AMDPAL) {
1719 const SIMachineFunctionInfo *MFI = MF->getInfo<SIMachineFunctionInfo>();
1720 MIB.addImm(MFI->getLDSSize());
1721 } else {
1722 Module *M = MF->getFunction().getParent();
1723 const GlobalValue *GV =
1724 Intrinsic::getOrInsertDeclaration(M, Intrinsic::amdgcn_groupstaticsize);
1725 MIB.addGlobalAddress(GV, 0, SIInstrInfo::MO_ABS32_LO);
1726 }
1727
1728 I.eraseFromParent();
1729 return constrainSelectedInstRegOperands(*MIB, TII, TRI, RBI);
1730}
1731
1732bool AMDGPUInstructionSelector::selectReturnAddress(MachineInstr &I) const {
1733 MachineBasicBlock *MBB = I.getParent();
1734 MachineFunction &MF = *MBB->getParent();
1735 const DebugLoc &DL = I.getDebugLoc();
1736
1737 MachineOperand &Dst = I.getOperand(0);
1738 Register DstReg = Dst.getReg();
1739 unsigned Depth = I.getOperand(2).getImm();
1740
1741 const TargetRegisterClass *RC
1742 = TRI.getConstrainedRegClassForOperand(Dst, *MRI);
1743 if (!RC->hasSubClassEq(&AMDGPU::SGPR_64RegClass) ||
1744 !RBI.constrainGenericRegister(DstReg, *RC, *MRI))
1745 return false;
1746
1747 // Check for kernel and shader functions
1748 if (Depth != 0 ||
1749 MF.getInfo<SIMachineFunctionInfo>()->isEntryFunction()) {
1750 BuildMI(*MBB, &I, DL, TII.get(AMDGPU::S_MOV_B64), DstReg)
1751 .addImm(0);
1752 I.eraseFromParent();
1753 return true;
1754 }
1755
1756 MachineFrameInfo &MFI = MF.getFrameInfo();
1757 // There is a call to @llvm.returnaddress in this function
1758 MFI.setReturnAddressIsTaken(true);
1759
1760 // Get the return address reg and mark it as an implicit live-in
1761 Register ReturnAddrReg = TRI.getReturnAddressReg(MF);
1762 Register LiveIn = getFunctionLiveInPhysReg(MF, TII, ReturnAddrReg,
1763 AMDGPU::SReg_64RegClass, DL);
1764 BuildMI(*MBB, &I, DL, TII.get(AMDGPU::COPY), DstReg)
1765 .addReg(LiveIn);
1766 I.eraseFromParent();
1767 return true;
1768}
1769
1770bool AMDGPUInstructionSelector::selectEndCfIntrinsic(MachineInstr &MI) const {
1771 // FIXME: Manually selecting to avoid dealing with the SReg_1 trick
1772 // SelectionDAG uses for wave32 vs wave64.
1773 MachineBasicBlock *BB = MI.getParent();
1774 BuildMI(*BB, &MI, MI.getDebugLoc(), TII.get(AMDGPU::SI_END_CF))
1775 .add(MI.getOperand(1));
1776
1777 Register Reg = MI.getOperand(1).getReg();
1778 MI.eraseFromParent();
1779
1780 if (!MRI->getRegClassOrNull(Reg))
1781 MRI->setRegClass(Reg, TRI.getWaveMaskRegClass());
1782 return true;
1783}
1784
1785bool AMDGPUInstructionSelector::selectDSOrderedIntrinsic(
1786 MachineInstr &MI, Intrinsic::ID IntrID) const {
1787 MachineBasicBlock *MBB = MI.getParent();
1788 MachineFunction *MF = MBB->getParent();
1789 const DebugLoc &DL = MI.getDebugLoc();
1790
1791 unsigned IndexOperand = MI.getOperand(7).getImm();
1792 bool WaveRelease = MI.getOperand(8).getImm() != 0;
1793 bool WaveDone = MI.getOperand(9).getImm() != 0;
1794
1795 if (WaveDone && !WaveRelease) {
1796 // TODO: Move this to IR verifier
1797 const Function &Fn = MF->getFunction();
1798 Fn.getContext().diagnose(DiagnosticInfoUnsupported(
1799 Fn, "ds_ordered_count: wave_done requires wave_release", DL));
1800 }
1801
1802 unsigned OrderedCountIndex = IndexOperand & 0x3f;
1803 IndexOperand &= ~0x3f;
1804 unsigned CountDw = 0;
1805
1806 if (STI.getGeneration() >= AMDGPUSubtarget::GFX10) {
1807 CountDw = (IndexOperand >> 24) & 0xf;
1808 IndexOperand &= ~(0xf << 24);
1809
1810 if (CountDw < 1 || CountDw > 4) {
1811 const Function &Fn = MF->getFunction();
1812 Fn.getContext().diagnose(DiagnosticInfoUnsupported(
1813 Fn, "ds_ordered_count: dword count must be between 1 and 4", DL));
1814 CountDw = 1;
1815 }
1816 }
1817
1818 if (IndexOperand) {
1819 const Function &Fn = MF->getFunction();
1820 Fn.getContext().diagnose(DiagnosticInfoUnsupported(
1821 Fn, "ds_ordered_count: bad index operand", DL));
1822 }
1823
1824 unsigned Instruction = IntrID == Intrinsic::amdgcn_ds_ordered_add ? 0 : 1;
1825 unsigned ShaderType = SIInstrInfo::getDSShaderTypeValue(*MF);
1826
1827 unsigned Offset0 = OrderedCountIndex << 2;
1828 unsigned Offset1 = WaveRelease | (WaveDone << 1) | (Instruction << 4);
1829
1830 if (STI.getGeneration() >= AMDGPUSubtarget::GFX10)
1831 Offset1 |= (CountDw - 1) << 6;
1832
1833 if (STI.getGeneration() < AMDGPUSubtarget::GFX11)
1834 Offset1 |= ShaderType << 2;
1835
1836 unsigned Offset = Offset0 | (Offset1 << 8);
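// For reference, the packed offset assembled above (as derived from this code,
// not restated from the ISA manual) looks like:
//   bits [7:2]   ordered-count index
//   bit  8       wave_release
//   bit  9       wave_done
//   bits [11:10] shader type (pre-GFX11 only)
//   bit  12      0 = ordered_add, 1 = ordered_swap
//   bits [15:14] dword count - 1 (GFX10+ only)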
1837
1838 Register M0Val = MI.getOperand(2).getReg();
1839 BuildMI(*MBB, &MI, DL, TII.get(AMDGPU::COPY), AMDGPU::M0)
1840 .addReg(M0Val);
1841
1842 Register DstReg = MI.getOperand(0).getReg();
1843 Register ValReg = MI.getOperand(3).getReg();
1844 MachineInstrBuilder DS =
1845 BuildMI(*MBB, &MI, DL, TII.get(AMDGPU::DS_ORDERED_COUNT), DstReg)
1846 .addReg(ValReg)
1847 .addImm(Offset)
1848 .cloneMemRefs(MI);
1849
1850 if (!RBI.constrainGenericRegister(M0Val, AMDGPU::SReg_32RegClass, *MRI))
1851 return false;
1852
1853 bool Ret = constrainSelectedInstRegOperands(*DS, TII, TRI, RBI);
1854 MI.eraseFromParent();
1855 return Ret;
1856}
1857
1858static unsigned gwsIntrinToOpcode(unsigned IntrID) {
1859 switch (IntrID) {
1860 case Intrinsic::amdgcn_ds_gws_init:
1861 return AMDGPU::DS_GWS_INIT;
1862 case Intrinsic::amdgcn_ds_gws_barrier:
1863 return AMDGPU::DS_GWS_BARRIER;
1864 case Intrinsic::amdgcn_ds_gws_sema_v:
1865 return AMDGPU::DS_GWS_SEMA_V;
1866 case Intrinsic::amdgcn_ds_gws_sema_br:
1867 return AMDGPU::DS_GWS_SEMA_BR;
1868 case Intrinsic::amdgcn_ds_gws_sema_p:
1869 return AMDGPU::DS_GWS_SEMA_P;
1870 case Intrinsic::amdgcn_ds_gws_sema_release_all:
1871 return AMDGPU::DS_GWS_SEMA_RELEASE_ALL;
1872 default:
1873 llvm_unreachable("not a gws intrinsic");
1874 }
1875}
1876
1877bool AMDGPUInstructionSelector::selectDSGWSIntrinsic(MachineInstr &MI,
1878 Intrinsic::ID IID) const {
1879 if (!STI.hasGWS() || (IID == Intrinsic::amdgcn_ds_gws_sema_release_all &&
1880 !STI.hasGWSSemaReleaseAll()))
1881 return false;
1882
1883 // intrinsic ID, vsrc, offset
1884 const bool HasVSrc = MI.getNumOperands() == 3;
1885 assert(HasVSrc || MI.getNumOperands() == 2);
1886
1887 Register BaseOffset = MI.getOperand(HasVSrc ? 2 : 1).getReg();
1888 const RegisterBank *OffsetRB = RBI.getRegBank(BaseOffset, *MRI, TRI);
1889 if (OffsetRB->getID() != AMDGPU::SGPRRegBankID)
1890 return false;
1891
1892 MachineInstr *OffsetDef = getDefIgnoringCopies(BaseOffset, *MRI);
1893 unsigned ImmOffset;
1894
1895 MachineBasicBlock *MBB = MI.getParent();
1896 const DebugLoc &DL = MI.getDebugLoc();
1897
1898 MachineInstr *Readfirstlane = nullptr;
1899
1900 // If we legalized the VGPR input, strip out the readfirstlane to analyze the
1901 // incoming offset, in case there's an add of a constant. We'll have to put it
1902 // back later.
1903 if (OffsetDef->getOpcode() == AMDGPU::V_READFIRSTLANE_B32) {
1904 Readfirstlane = OffsetDef;
1905 BaseOffset = OffsetDef->getOperand(1).getReg();
1906 OffsetDef = getDefIgnoringCopies(BaseOffset, *MRI);
1907 }
1908
1909 if (OffsetDef->getOpcode() == AMDGPU::G_CONSTANT) {
1910 // If we have a constant offset, try to use the 0 in m0 as the base.
1911 // TODO: Look into changing the default m0 initialization value. If the
1912 // default -1 only set the low 16-bits, we could leave it as-is and add 1 to
1913 // the immediate offset.
1914
1915 ImmOffset = OffsetDef->getOperand(1).getCImm()->getZExtValue();
1916 BuildMI(*MBB, &MI, DL, TII.get(AMDGPU::S_MOV_B32), AMDGPU::M0)
1917 .addImm(0);
1918 } else {
1919 std::tie(BaseOffset, ImmOffset) =
1920 AMDGPU::getBaseWithConstantOffset(*MRI, BaseOffset, VT);
1921
1922 if (Readfirstlane) {
1923 // We have the constant offset now, so put the readfirstlane back on the
1924 // variable component.
1925 if (!RBI.constrainGenericRegister(BaseOffset, AMDGPU::VGPR_32RegClass, *MRI))
1926 return false;
1927
1928 Readfirstlane->getOperand(1).setReg(BaseOffset);
1929 BaseOffset = Readfirstlane->getOperand(0).getReg();
1930 } else {
1931 if (!RBI.constrainGenericRegister(BaseOffset,
1932 AMDGPU::SReg_32RegClass, *MRI))
1933 return false;
1934 }
1935
1936 Register M0Base = MRI->createVirtualRegister(&AMDGPU::SReg_32RegClass);
1937 BuildMI(*MBB, &MI, DL, TII.get(AMDGPU::S_LSHL_B32), M0Base)
1938 .addReg(BaseOffset)
1939 .addImm(16)
1940 .setOperandDead(3); // Dead scc
1941
1942 BuildMI(*MBB, &MI, DL, TII.get(AMDGPU::COPY), AMDGPU::M0)
1943 .addReg(M0Base);
1944 }
1945
1946 // The resource id offset is computed as (<isa opaque base> + M0[21:16] +
1947 // offset field) % 64. Some versions of the programming guide omit the m0
1948 // part, or claim it's from offset 0.
1949 auto MIB = BuildMI(*MBB, &MI, DL, TII.get(gwsIntrinToOpcode(IID)));
1950
1951 if (HasVSrc) {
1952 Register VSrc = MI.getOperand(1).getReg();
1953 MIB.addReg(VSrc);
1954
1955 if (!RBI.constrainGenericRegister(VSrc, AMDGPU::VGPR_32RegClass, *MRI))
1956 return false;
1957 }
1958
1959 MIB.addImm(ImmOffset)
1960 .cloneMemRefs(MI);
1961
1962 TII.enforceOperandRCAlignment(*MIB, AMDGPU::OpName::data0);
1963
1964 MI.eraseFromParent();
1965 return true;
1966}
1967
1968bool AMDGPUInstructionSelector::selectDSAppendConsume(MachineInstr &MI,
1969 bool IsAppend) const {
1970 Register PtrBase = MI.getOperand(2).getReg();
1971 LLT PtrTy = MRI->getType(PtrBase);
1972 bool IsGDS = PtrTy.getAddressSpace() == AMDGPUAS::REGION_ADDRESS;
1973
1974 unsigned Offset;
1975 std::tie(PtrBase, Offset) = selectDS1Addr1OffsetImpl(MI.getOperand(2));
1976
1977 // TODO: Should this try to look through readfirstlane like GWS?
1978 if (!isDSOffsetLegal(PtrBase, Offset)) {
1979 PtrBase = MI.getOperand(2).getReg();
1980 Offset = 0;
1981 }
1982
1983 MachineBasicBlock *MBB = MI.getParent();
1984 const DebugLoc &DL = MI.getDebugLoc();
1985 const unsigned Opc = IsAppend ? AMDGPU::DS_APPEND : AMDGPU::DS_CONSUME;
1986
1987 BuildMI(*MBB, &MI, DL, TII.get(AMDGPU::COPY), AMDGPU::M0)
1988 .addReg(PtrBase);
1989 if (!RBI.constrainGenericRegister(PtrBase, AMDGPU::SReg_32RegClass, *MRI))
1990 return false;
1991
1992 auto MIB = BuildMI(*MBB, &MI, DL, TII.get(Opc), MI.getOperand(0).getReg())
1993 .addImm(Offset)
1994 .addImm(IsGDS ? -1 : 0)
1995 .cloneMemRefs(MI);
1996 MI.eraseFromParent();
1997 return constrainSelectedInstRegOperands(*MIB, TII, TRI, RBI);
1998}
1999
2000bool AMDGPUInstructionSelector::selectInitWholeWave(MachineInstr &MI) const {
2001 MachineFunction *MF = MI.getParent()->getParent();
2002 SIMachineFunctionInfo *MFInfo = MF->getInfo<SIMachineFunctionInfo>();
2003
2004 MFInfo->setInitWholeWave();
2005 return selectImpl(MI, *CoverageInfo);
2006}
2007
2008static bool parseTexFail(uint64_t TexFailCtrl, bool &TFE, bool &LWE,
2009 bool &IsTexFail) {
2010 if (TexFailCtrl)
2011 IsTexFail = true;
2012
2013 TFE = TexFailCtrl & 0x1;
2014 TexFailCtrl &= ~(uint64_t)0x1;
2015 LWE = TexFailCtrl & 0x2;
2016 TexFailCtrl &= ~(uint64_t)0x2;
2017
2018 return TexFailCtrl == 0;
2019}
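// In other words, TexFailCtrl is treated here as a two-bit field: bit 0 is TFE
// and bit 1 is LWE. Any other set bit makes the helper return false, and
// IsTexFail is raised whenever the original value was nonzero.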
2020
2021bool AMDGPUInstructionSelector::selectImageIntrinsic(
2022 MachineInstr &MI, const AMDGPU::ImageDimIntrinsicInfo *Intr) const {
2023 MachineBasicBlock *MBB = MI.getParent();
2024 const DebugLoc &DL = MI.getDebugLoc();
2025 unsigned IntrOpcode = Intr->BaseOpcode;
2026
2027 // For image atomic: use no-return opcode if result is unused.
2028 if (Intr->AtomicNoRetBaseOpcode != Intr->BaseOpcode) {
2029 Register ResultDef = MI.getOperand(0).getReg();
2030 if (MRI->use_nodbg_empty(ResultDef))
2031 IntrOpcode = Intr->AtomicNoRetBaseOpcode;
2032 }
2033
2034 const AMDGPU::MIMGBaseOpcodeInfo *BaseOpcode =
2035 AMDGPU::getMIMGBaseOpcodeInfo(IntrOpcode);
2036
2037 const AMDGPU::MIMGDimInfo *DimInfo = AMDGPU::getMIMGDimInfo(Intr->Dim);
2038 const bool IsGFX10Plus = AMDGPU::isGFX10Plus(STI);
2039 const bool IsGFX11Plus = AMDGPU::isGFX11Plus(STI);
2040 const bool IsGFX12Plus = AMDGPU::isGFX12Plus(STI);
2041
2042 const unsigned ArgOffset = MI.getNumExplicitDefs() + 1;
2043
2044 Register VDataIn = AMDGPU::NoRegister;
2045 Register VDataOut = AMDGPU::NoRegister;
2046 LLT VDataTy;
2047 int NumVDataDwords = -1;
2048 bool IsD16 = MI.getOpcode() == AMDGPU::G_AMDGPU_INTRIN_IMAGE_LOAD_D16 ||
2049 MI.getOpcode() == AMDGPU::G_AMDGPU_INTRIN_IMAGE_STORE_D16;
2050
2051 bool Unorm;
2052 if (!BaseOpcode->Sampler)
2053 Unorm = true;
2054 else
2055 Unorm = MI.getOperand(ArgOffset + Intr->UnormIndex).getImm() != 0;
2056
2057 bool TFE;
2058 bool LWE;
2059 bool IsTexFail = false;
2060 if (!parseTexFail(MI.getOperand(ArgOffset + Intr->TexFailCtrlIndex).getImm(),
2061 TFE, LWE, IsTexFail))
2062 return false;
2063
2064 const int Flags = MI.getOperand(ArgOffset + Intr->NumArgs).getImm();
2065 const bool IsA16 = (Flags & 1) != 0;
2066 const bool IsG16 = (Flags & 2) != 0;
2067
2068 // A16 implies 16-bit gradients if the subtarget doesn't support G16.
2069 if (IsA16 && !STI.hasG16() && !IsG16)
2070 return false;
2071
2072 unsigned DMask = 0;
2073 unsigned DMaskLanes = 0;
2074
2075 if (BaseOpcode->Atomic) {
2076 if (!BaseOpcode->NoReturn)
2077 VDataOut = MI.getOperand(0).getReg();
2078 VDataIn = MI.getOperand(2).getReg();
2079 LLT Ty = MRI->getType(VDataIn);
2080
2081 // Be careful to allow atomic swap on 16-bit element vectors.
2082 const bool Is64Bit = BaseOpcode->AtomicX2 ?
2083 Ty.getSizeInBits() == 128 :
2084 Ty.getSizeInBits() == 64;
2085
2086 if (BaseOpcode->AtomicX2) {
2087 assert(MI.getOperand(3).getReg() == AMDGPU::NoRegister);
2088
2089 DMask = Is64Bit ? 0xf : 0x3;
2090 NumVDataDwords = Is64Bit ? 4 : 2;
2091 } else {
2092 DMask = Is64Bit ? 0x3 : 0x1;
2093 NumVDataDwords = Is64Bit ? 2 : 1;
2094 }
2095 } else {
2096 DMask = MI.getOperand(ArgOffset + Intr->DMaskIndex).getImm();
2097 DMaskLanes = BaseOpcode->Gather4 ? 4 : llvm::popcount(DMask);
2098
2099 if (BaseOpcode->Store) {
2100 VDataIn = MI.getOperand(1).getReg();
2101 VDataTy = MRI->getType(VDataIn);
2102 NumVDataDwords = (VDataTy.getSizeInBits() + 31) / 32;
2103 } else if (BaseOpcode->NoReturn) {
2104 NumVDataDwords = 0;
2105 } else {
2106 VDataOut = MI.getOperand(0).getReg();
2107 VDataTy = MRI->getType(VDataOut);
2108 NumVDataDwords = DMaskLanes;
2109
2110 if (IsD16 && !STI.hasUnpackedD16VMem())
2111 NumVDataDwords = (DMaskLanes + 1) / 2;
2112 }
2113 }
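// Worked example of the dword count above (illustrative): a normal sample with
// dmask = 0b0111 has DMaskLanes = 3 and needs 3 result dwords; the same dmask
// with D16 data on a packed-D16 subtarget needs only (3 + 1) / 2 = 2 dwords.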
2114
2115 // Set G16 opcode
2116 if (Subtarget->hasG16() && IsG16) {
2117 const AMDGPU::MIMGG16MappingInfo *G16MappingInfo =
2118 AMDGPU::getMIMGG16MappingInfo(Intr->BaseOpcode);
2119 assert(G16MappingInfo);
2120 IntrOpcode = G16MappingInfo->G16; // set opcode to variant with _g16
2121 }
2122
2123 // TODO: Check this in verifier.
2124 assert((!IsTexFail || DMaskLanes >= 1) && "should have legalized this");
2125
2126 unsigned CPol = MI.getOperand(ArgOffset + Intr->CachePolicyIndex).getImm();
2127 // Keep GLC only when the atomic's result is actually used.
2128 if (BaseOpcode->Atomic && !BaseOpcode->NoReturn)
2129 CPol |= AMDGPU::CPol::GLC;
2130 if (CPol & ~((IsGFX12Plus ? AMDGPU::CPol::ALL : AMDGPU::CPol::ALL_pregfx12) |
2131 AMDGPU::CPol::VOLATILE))
2132 return false;
2133
2134 int NumVAddrRegs = 0;
2135 int NumVAddrDwords = 0;
2136 for (unsigned I = Intr->VAddrStart; I < Intr->VAddrEnd; I++) {
2137 // Skip the $noregs and 0s inserted during legalization.
2138 MachineOperand &AddrOp = MI.getOperand(ArgOffset + I);
2139 if (!AddrOp.isReg())
2140 continue; // XXX - Break?
2141
2142 Register Addr = AddrOp.getReg();
2143 if (!Addr)
2144 break;
2145
2146 ++NumVAddrRegs;
2147 NumVAddrDwords += (MRI->getType(Addr).getSizeInBits() + 31) / 32;
2148 }
2149
2150 // The legalizer preprocessed the intrinsic arguments. If we aren't using
2151 // NSA, these should have been packed into a single value in the first
2152 // address register
2153 const bool UseNSA =
2154 NumVAddrRegs != 1 &&
2155 (STI.hasPartialNSAEncoding() ? NumVAddrDwords >= NumVAddrRegs
2156 : NumVAddrDwords == NumVAddrRegs);
2157 if (UseNSA && !STI.hasFeature(AMDGPU::FeatureNSAEncoding)) {
2158 LLVM_DEBUG(dbgs() << "Trying to use NSA on non-NSA target\n");
2159 return false;
2160 }
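// NSA (non-sequential address) encodings keep each address component in its
// own VGPR instead of requiring one packed register tuple. Roughly, the check
// above only enables it when the legalizer did not already pack the address:
// every address register is a single dword or, with partial NSA support, there
// are at least as many address dwords as address registers.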
2161
2162 if (IsTexFail)
2163 ++NumVDataDwords;
2164
2165 int Opcode = -1;
2166 if (IsGFX12Plus) {
2167 Opcode = AMDGPU::getMIMGOpcode(IntrOpcode, AMDGPU::MIMGEncGfx12,
2168 NumVDataDwords, NumVAddrDwords);
2169 } else if (IsGFX11Plus) {
2170 Opcode = AMDGPU::getMIMGOpcode(IntrOpcode,
2171 UseNSA ? AMDGPU::MIMGEncGfx11NSA
2172 : AMDGPU::MIMGEncGfx11Default,
2173 NumVDataDwords, NumVAddrDwords);
2174 } else if (IsGFX10Plus) {
2175 Opcode = AMDGPU::getMIMGOpcode(IntrOpcode,
2176 UseNSA ? AMDGPU::MIMGEncGfx10NSA
2177 : AMDGPU::MIMGEncGfx10Default,
2178 NumVDataDwords, NumVAddrDwords);
2179 } else {
2180 if (Subtarget->hasGFX90AInsts()) {
2181 Opcode = AMDGPU::getMIMGOpcode(IntrOpcode, AMDGPU::MIMGEncGfx90a,
2182 NumVDataDwords, NumVAddrDwords);
2183 if (Opcode == -1) {
2184 LLVM_DEBUG(
2185 dbgs()
2186 << "requested image instruction is not supported on this GPU\n");
2187 return false;
2188 }
2189 }
2190 if (Opcode == -1 &&
2191 STI.getGeneration() >= AMDGPUSubtarget::VOLCANIC_ISLANDS)
2192 Opcode = AMDGPU::getMIMGOpcode(IntrOpcode, AMDGPU::MIMGEncGfx8,
2193 NumVDataDwords, NumVAddrDwords);
2194 if (Opcode == -1)
2195 Opcode = AMDGPU::getMIMGOpcode(IntrOpcode, AMDGPU::MIMGEncGfx6,
2196 NumVDataDwords, NumVAddrDwords);
2197 }
2198 if (Opcode == -1)
2199 return false;
2200
2201 auto MIB = BuildMI(*MBB, &MI, DL, TII.get(Opcode))
2202 .cloneMemRefs(MI);
2203
2204 if (VDataOut) {
2205 if (BaseOpcode->AtomicX2) {
2206 const bool Is64 = MRI->getType(VDataOut).getSizeInBits() == 64;
2207
2208 Register TmpReg = MRI->createVirtualRegister(
2209 Is64 ? &AMDGPU::VReg_128RegClass : &AMDGPU::VReg_64RegClass);
2210 unsigned SubReg = Is64 ? AMDGPU::sub0_sub1 : AMDGPU::sub0;
2211
2212 MIB.addDef(TmpReg);
2213 if (!MRI->use_empty(VDataOut)) {
2214 BuildMI(*MBB, &MI, DL, TII.get(AMDGPU::COPY), VDataOut)
2215 .addReg(TmpReg, RegState::Kill, SubReg);
2216 }
2217
2218 } else {
2219 MIB.addDef(VDataOut); // vdata output
2220 }
2221 }
2222
2223 if (VDataIn)
2224 MIB.addReg(VDataIn); // vdata input
2225
2226 for (int I = 0; I != NumVAddrRegs; ++I) {
2227 MachineOperand &SrcOp = MI.getOperand(ArgOffset + Intr->VAddrStart + I);
2228 if (SrcOp.isReg()) {
2229 assert(SrcOp.getReg() != 0);
2230 MIB.addReg(SrcOp.getReg());
2231 }
2232 }
2233
2234 MIB.addReg(MI.getOperand(ArgOffset + Intr->RsrcIndex).getReg());
2235 if (BaseOpcode->Sampler)
2236 MIB.addReg(MI.getOperand(ArgOffset + Intr->SampIndex).getReg());
2237
2238 MIB.addImm(DMask); // dmask
2239
2240 if (IsGFX10Plus)
2241 MIB.addImm(DimInfo->Encoding);
2242 if (AMDGPU::hasNamedOperand(Opcode, AMDGPU::OpName::unorm))
2243 MIB.addImm(Unorm);
2244
2245 MIB.addImm(CPol);
2246 MIB.addImm(IsA16 && // a16 or r128
2247 STI.hasFeature(AMDGPU::FeatureR128A16) ? -1 : 0);
2248 if (IsGFX10Plus)
2249 MIB.addImm(IsA16 ? -1 : 0);
2250
2251 if (!Subtarget->hasGFX90AInsts()) {
2252 MIB.addImm(TFE); // tfe
2253 } else if (TFE) {
2254 LLVM_DEBUG(dbgs() << "TFE is not supported on this GPU\n");
2255 return false;
2256 }
2257
2258 if (AMDGPU::hasNamedOperand(Opcode, AMDGPU::OpName::lwe))
2259 MIB.addImm(LWE); // lwe
2260 if (!IsGFX10Plus)
2261 MIB.addImm(DimInfo->DA ? -1 : 0);
2262 if (BaseOpcode->HasD16)
2263 MIB.addImm(IsD16 ? -1 : 0);
2264
2265 MI.eraseFromParent();
2266 constrainSelectedInstRegOperands(*MIB, TII, TRI, RBI);
2267 TII.enforceOperandRCAlignment(*MIB, AMDGPU::OpName::vaddr);
2268 return true;
2269}
2270
2271// We need to handle this here because tablegen doesn't support matching
2272// instructions with multiple outputs.
2273bool AMDGPUInstructionSelector::selectDSBvhStackIntrinsic(
2274 MachineInstr &MI) const {
2275 Register Dst0 = MI.getOperand(0).getReg();
2276 Register Dst1 = MI.getOperand(1).getReg();
2277
2278 const DebugLoc &DL = MI.getDebugLoc();
2279 MachineBasicBlock *MBB = MI.getParent();
2280
2281 Register Addr = MI.getOperand(3).getReg();
2282 Register Data0 = MI.getOperand(4).getReg();
2283 Register Data1 = MI.getOperand(5).getReg();
2284 unsigned Offset = MI.getOperand(6).getImm();
2285
2286 unsigned Opc;
2287 switch (cast<GIntrinsic>(MI).getIntrinsicID()) {
2288 case Intrinsic::amdgcn_ds_bvh_stack_rtn:
2289 case Intrinsic::amdgcn_ds_bvh_stack_push4_pop1_rtn:
2290 Opc = AMDGPU::DS_BVH_STACK_RTN_B32;
2291 break;
2292 case Intrinsic::amdgcn_ds_bvh_stack_push8_pop1_rtn:
2293 Opc = AMDGPU::DS_BVH_STACK_PUSH8_POP1_RTN_B32;
2294 break;
2295 case Intrinsic::amdgcn_ds_bvh_stack_push8_pop2_rtn:
2296 Opc = AMDGPU::DS_BVH_STACK_PUSH8_POP2_RTN_B64;
2297 break;
2298 }
2299
2300 auto MIB = BuildMI(*MBB, &MI, DL, TII.get(Opc), Dst0)
2301 .addDef(Dst1)
2302 .addUse(Addr)
2303 .addUse(Data0)
2304 .addUse(Data1)
2305 .addImm(Offset)
2306 .cloneMemRefs(MI);
2307
2308 MI.eraseFromParent();
2309 return constrainSelectedInstRegOperands(*MIB, TII, TRI, RBI);
2310}
2311
2312bool AMDGPUInstructionSelector::selectG_INTRINSIC_W_SIDE_EFFECTS(
2313 MachineInstr &I) const {
2314 Intrinsic::ID IntrinsicID = cast<GIntrinsic>(I).getIntrinsicID();
2315 switch (IntrinsicID) {
2316 case Intrinsic::amdgcn_end_cf:
2317 return selectEndCfIntrinsic(I);
2318 case Intrinsic::amdgcn_ds_ordered_add:
2319 case Intrinsic::amdgcn_ds_ordered_swap:
2320 return selectDSOrderedIntrinsic(I, IntrinsicID);
2321 case Intrinsic::amdgcn_ds_gws_init:
2322 case Intrinsic::amdgcn_ds_gws_barrier:
2323 case Intrinsic::amdgcn_ds_gws_sema_v:
2324 case Intrinsic::amdgcn_ds_gws_sema_br:
2325 case Intrinsic::amdgcn_ds_gws_sema_p:
2326 case Intrinsic::amdgcn_ds_gws_sema_release_all:
2327 return selectDSGWSIntrinsic(I, IntrinsicID);
2328 case Intrinsic::amdgcn_ds_append:
2329 return selectDSAppendConsume(I, true);
2330 case Intrinsic::amdgcn_ds_consume:
2331 return selectDSAppendConsume(I, false);
2332 case Intrinsic::amdgcn_init_whole_wave:
2333 return selectInitWholeWave(I);
2334 case Intrinsic::amdgcn_raw_buffer_load_lds:
2335 case Intrinsic::amdgcn_raw_ptr_buffer_load_lds:
2336 case Intrinsic::amdgcn_struct_buffer_load_lds:
2337 case Intrinsic::amdgcn_struct_ptr_buffer_load_lds:
2338 return selectBufferLoadLds(I);
2339 // Until we can store both the address space of the global and the LDS
2340 // arguments by having two MachineMemOperands on an intrinsic, we just trust
2341 // that the argument is a global pointer (buffer pointers have been handled by
2342 // an LLVM IR-level lowering).
2343 case Intrinsic::amdgcn_load_to_lds:
2344 case Intrinsic::amdgcn_global_load_lds:
2345 return selectGlobalLoadLds(I);
2346 case Intrinsic::amdgcn_exp_compr:
2347 if (!STI.hasCompressedExport()) {
2348 Function &F = I.getMF()->getFunction();
2349 F.getContext().diagnose(
2350 DiagnosticInfoUnsupported(F, "intrinsic not supported on subtarget",
2351 I.getDebugLoc(), DS_Error));
2352 return false;
2353 }
2354 break;
2355 case Intrinsic::amdgcn_ds_bvh_stack_rtn:
2356 case Intrinsic::amdgcn_ds_bvh_stack_push4_pop1_rtn:
2357 case Intrinsic::amdgcn_ds_bvh_stack_push8_pop1_rtn:
2358 case Intrinsic::amdgcn_ds_bvh_stack_push8_pop2_rtn:
2359 return selectDSBvhStackIntrinsic(I);
2360 case Intrinsic::amdgcn_s_barrier_init:
2361 case Intrinsic::amdgcn_s_barrier_signal_var:
2362 return selectNamedBarrierInit(I, IntrinsicID);
2363 case Intrinsic::amdgcn_s_barrier_join:
2364 case Intrinsic::amdgcn_s_get_named_barrier_state:
2365 return selectNamedBarrierInst(I, IntrinsicID);
2366 case Intrinsic::amdgcn_s_get_barrier_state:
2367 return selectSGetBarrierState(I, IntrinsicID);
2368 case Intrinsic::amdgcn_s_barrier_signal_isfirst:
2369 return selectSBarrierSignalIsfirst(I, IntrinsicID);
2370 }
2371 return selectImpl(I, *CoverageInfo);
2372}
2373
2374bool AMDGPUInstructionSelector::selectG_SELECT(MachineInstr &I) const {
2375 if (selectImpl(I, *CoverageInfo))
2376 return true;
2377
2378 MachineBasicBlock *BB = I.getParent();
2379 const DebugLoc &DL = I.getDebugLoc();
2380
2381 Register DstReg = I.getOperand(0).getReg();
2382 unsigned Size = RBI.getSizeInBits(DstReg, *MRI, TRI);
2383 assert(Size <= 32 || Size == 64);
2384 const MachineOperand &CCOp = I.getOperand(1);
2385 Register CCReg = CCOp.getReg();
2386 if (!isVCC(CCReg, *MRI)) {
2387 unsigned SelectOpcode = Size == 64 ? AMDGPU::S_CSELECT_B64 :
2388 AMDGPU::S_CSELECT_B32;
2389 MachineInstr *CopySCC = BuildMI(*BB, &I, DL, TII.get(AMDGPU::COPY), AMDGPU::SCC)
2390 .addReg(CCReg);
2391
2392 // The generic constrainSelectedInstRegOperands doesn't work for the scc
2393 // register bank, because it does not cover the register class we use to
2394 // represent it. So we need to set the register class manually here.
2395 if (!MRI->getRegClassOrNull(CCReg))
2396 MRI->setRegClass(CCReg, TRI.getConstrainedRegClassForOperand(CCOp, *MRI));
2397 MachineInstr *Select = BuildMI(*BB, &I, DL, TII.get(SelectOpcode), DstReg)
2398 .add(I.getOperand(2))
2399 .add(I.getOperand(3));
2400
2401 bool Ret = false;
2402 Ret |= constrainSelectedInstRegOperands(*Select, TII, TRI, RBI);
2403 Ret |= constrainSelectedInstRegOperands(*CopySCC, TII, TRI, RBI);
2404 I.eraseFromParent();
2405 return Ret;
2406 }
2407
2408 // Wide VGPR select should have been split in RegBankSelect.
2409 if (Size > 32)
2410 return false;
2411
2412 MachineInstr *Select =
2413 BuildMI(*BB, &I, DL, TII.get(AMDGPU::V_CNDMASK_B32_e64), DstReg)
2414 .addImm(0)
2415 .add(I.getOperand(3))
2416 .addImm(0)
2417 .add(I.getOperand(2))
2418 .add(I.getOperand(1));
2419
2420 bool Ret = constrainSelectedInstRegOperands(*Select, TII, TRI, RBI);
2421 I.eraseFromParent();
2422 return Ret;
2423}
2424
2425bool AMDGPUInstructionSelector::selectG_TRUNC(MachineInstr &I) const {
2426 Register DstReg = I.getOperand(0).getReg();
2427 Register SrcReg = I.getOperand(1).getReg();
2428 const LLT DstTy = MRI->getType(DstReg);
2429 const LLT SrcTy = MRI->getType(SrcReg);
2430 const LLT S1 = LLT::scalar(1);
2431
2432 const RegisterBank *SrcRB = RBI.getRegBank(SrcReg, *MRI, TRI);
2433 const RegisterBank *DstRB;
2434 if (DstTy == S1) {
2435 // This is a special case. We don't treat s1 for legalization artifacts as
2436 // vcc booleans.
2437 DstRB = SrcRB;
2438 } else {
2439 DstRB = RBI.getRegBank(DstReg, *MRI, TRI);
2440 if (SrcRB != DstRB)
2441 return false;
2442 }
2443
2444 const bool IsVALU = DstRB->getID() == AMDGPU::VGPRRegBankID;
2445
2446 unsigned DstSize = DstTy.getSizeInBits();
2447 unsigned SrcSize = SrcTy.getSizeInBits();
2448
2449 const TargetRegisterClass *SrcRC =
2450 TRI.getRegClassForSizeOnBank(SrcSize, *SrcRB);
2451 const TargetRegisterClass *DstRC =
2452 TRI.getRegClassForSizeOnBank(DstSize, *DstRB);
2453 if (!SrcRC || !DstRC)
2454 return false;
2455
2456 if (!RBI.constrainGenericRegister(SrcReg, *SrcRC, *MRI) ||
2457 !RBI.constrainGenericRegister(DstReg, *DstRC, *MRI)) {
2458 LLVM_DEBUG(dbgs() << "Failed to constrain G_TRUNC\n");
2459 return false;
2460 }
2461
2462 if (DstRC == &AMDGPU::VGPR_16RegClass && SrcSize == 32) {
2463 assert(STI.useRealTrue16Insts());
2464 const DebugLoc &DL = I.getDebugLoc();
2465 MachineBasicBlock *MBB = I.getParent();
2466 BuildMI(*MBB, I, DL, TII.get(AMDGPU::COPY), DstReg)
2467 .addReg(SrcReg, 0, AMDGPU::lo16);
2468 I.eraseFromParent();
2469 return true;
2470 }
2471
2472 if (DstTy == LLT::fixed_vector(2, 16) && SrcTy == LLT::fixed_vector(2, 32)) {
2473 MachineBasicBlock *MBB = I.getParent();
2474 const DebugLoc &DL = I.getDebugLoc();
2475
2476 Register LoReg = MRI->createVirtualRegister(DstRC);
2477 Register HiReg = MRI->createVirtualRegister(DstRC);
2478 BuildMI(*MBB, I, DL, TII.get(AMDGPU::COPY), LoReg)
2479 .addReg(SrcReg, 0, AMDGPU::sub0);
2480 BuildMI(*MBB, I, DL, TII.get(AMDGPU::COPY), HiReg)
2481 .addReg(SrcReg, 0, AMDGPU::sub1);
2482
2483 if (IsVALU && STI.hasSDWA()) {
2484 // Write the low 16-bits of the high element into the high 16-bits of the
2485 // low element.
2486 MachineInstr *MovSDWA =
2487 BuildMI(*MBB, I, DL, TII.get(AMDGPU::V_MOV_B32_sdwa), DstReg)
2488 .addImm(0) // $src0_modifiers
2489 .addReg(HiReg) // $src0
2490 .addImm(0) // $clamp
2491 .addImm(AMDGPU::SDWA::WORD_1) // $dst_sel
2492 .addImm(AMDGPU::SDWA::UNUSED_PRESERVE) // $dst_unused
2493 .addImm(AMDGPU::SDWA::WORD_0) // $src0_sel
2494 .addReg(LoReg, RegState::Implicit);
2495 MovSDWA->tieOperands(0, MovSDWA->getNumOperands() - 1);
2496 } else {
2497 Register TmpReg0 = MRI->createVirtualRegister(DstRC);
2498 Register TmpReg1 = MRI->createVirtualRegister(DstRC);
2499 Register ImmReg = MRI->createVirtualRegister(DstRC);
2500 if (IsVALU) {
2501 BuildMI(*MBB, I, DL, TII.get(AMDGPU::V_LSHLREV_B32_e64), TmpReg0)
2502 .addImm(16)
2503 .addReg(HiReg);
2504 } else {
2505 BuildMI(*MBB, I, DL, TII.get(AMDGPU::S_LSHL_B32), TmpReg0)
2506 .addReg(HiReg)
2507 .addImm(16)
2508 .setOperandDead(3); // Dead scc
2509 }
2510
2511 unsigned MovOpc = IsVALU ? AMDGPU::V_MOV_B32_e32 : AMDGPU::S_MOV_B32;
2512 unsigned AndOpc = IsVALU ? AMDGPU::V_AND_B32_e64 : AMDGPU::S_AND_B32;
2513 unsigned OrOpc = IsVALU ? AMDGPU::V_OR_B32_e64 : AMDGPU::S_OR_B32;
2514
2515 BuildMI(*MBB, I, DL, TII.get(MovOpc), ImmReg)
2516 .addImm(0xffff);
2517 auto And = BuildMI(*MBB, I, DL, TII.get(AndOpc), TmpReg1)
2518 .addReg(LoReg)
2519 .addReg(ImmReg);
2520 auto Or = BuildMI(*MBB, I, DL, TII.get(OrOpc), DstReg)
2521 .addReg(TmpReg0)
2522 .addReg(TmpReg1);
2523
2524 if (!IsVALU) {
2525 And.setOperandDead(3); // Dead scc
2526 Or.setOperandDead(3); // Dead scc
2527 }
2528 }
2529
2530 I.eraseFromParent();
2531 return true;
2532 }
2533
2534 if (!DstTy.isScalar())
2535 return false;
2536
2537 if (SrcSize > 32) {
2538 unsigned SubRegIdx = DstSize < 32
2539 ? static_cast<unsigned>(AMDGPU::sub0)
2540 : TRI.getSubRegFromChannel(0, DstSize / 32);
2541 if (SubRegIdx == AMDGPU::NoSubRegister)
2542 return false;
2543
2544 // Deal with weird cases where the class only partially supports the subreg
2545 // index.
2546 const TargetRegisterClass *SrcWithSubRC
2547 = TRI.getSubClassWithSubReg(SrcRC, SubRegIdx);
2548 if (!SrcWithSubRC)
2549 return false;
2550
2551 if (SrcWithSubRC != SrcRC) {
2552 if (!RBI.constrainGenericRegister(SrcReg, *SrcWithSubRC, *MRI))
2553 return false;
2554 }
2555
2556 I.getOperand(1).setSubReg(SubRegIdx);
2557 }
2558
2559 I.setDesc(TII.get(TargetOpcode::COPY));
2560 return true;
2561}
2562
2563/// \returns true if a bitmask for \p Size bits will be an inline immediate.
2564static bool shouldUseAndMask(unsigned Size, unsigned &Mask) {
2565 Mask = maskTrailingOnes<unsigned>(Size);
2566 int SignedMask = static_cast<int>(Mask);
2567 return SignedMask >= -16 && SignedMask <= 64;
2568}
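// For example (following the mask computed above): Size == 4 gives Mask == 15
// and Size == 32 gives Mask == 0xffffffff (-1 as a signed value), both inline
// immediates, so a plain AND is used; Size == 8 gives Mask == 255, which is
// not an inline immediate, so the BFE form is preferred instead.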
2569
2570// Like RegisterBankInfo::getRegBank, but don't assume vcc for s1.
2571const RegisterBank *AMDGPUInstructionSelector::getArtifactRegBank(
2572 Register Reg, const MachineRegisterInfo &MRI,
2573 const TargetRegisterInfo &TRI) const {
2574 const RegClassOrRegBank &RegClassOrBank = MRI.getRegClassOrRegBank(Reg);
2575 if (auto *RB = dyn_cast<const RegisterBank *>(RegClassOrBank))
2576 return RB;
2577
2578 // Ignore the type, since we don't use vcc in artifacts.
2579 if (auto *RC = dyn_cast<const TargetRegisterClass *>(RegClassOrBank))
2580 return &RBI.getRegBankFromRegClass(*RC, LLT());
2581 return nullptr;
2582}
2583
2584bool AMDGPUInstructionSelector::selectG_SZA_EXT(MachineInstr &I) const {
2585 bool InReg = I.getOpcode() == AMDGPU::G_SEXT_INREG;
2586 bool Signed = I.getOpcode() == AMDGPU::G_SEXT || InReg;
2587 const DebugLoc &DL = I.getDebugLoc();
2588 MachineBasicBlock &MBB = *I.getParent();
2589 const Register DstReg = I.getOperand(0).getReg();
2590 const Register SrcReg = I.getOperand(1).getReg();
2591
2592 const LLT DstTy = MRI->getType(DstReg);
2593 const LLT SrcTy = MRI->getType(SrcReg);
2594 const unsigned SrcSize = I.getOpcode() == AMDGPU::G_SEXT_INREG ?
2595 I.getOperand(2).getImm() : SrcTy.getSizeInBits();
2596 const unsigned DstSize = DstTy.getSizeInBits();
2597 if (!DstTy.isScalar())
2598 return false;
2599
2600 // Artifact casts should never use vcc.
2601 const RegisterBank *SrcBank = getArtifactRegBank(SrcReg, *MRI, TRI);
2602
2603 // FIXME: This should probably be illegal and split earlier.
2604 if (I.getOpcode() == AMDGPU::G_ANYEXT) {
2605 if (DstSize <= 32)
2606 return selectCOPY(I);
2607
2608 const TargetRegisterClass *SrcRC =
2609 TRI.getRegClassForTypeOnBank(SrcTy, *SrcBank);
2610 const RegisterBank *DstBank = RBI.getRegBank(DstReg, *MRI, TRI);
2611 const TargetRegisterClass *DstRC =
2612 TRI.getRegClassForSizeOnBank(DstSize, *DstBank);
2613
2614 Register UndefReg = MRI->createVirtualRegister(SrcRC);
2615 BuildMI(MBB, I, DL, TII.get(AMDGPU::IMPLICIT_DEF), UndefReg);
2616 BuildMI(MBB, I, DL, TII.get(AMDGPU::REG_SEQUENCE), DstReg)
2617 .addReg(SrcReg)
2618 .addImm(AMDGPU::sub0)
2619 .addReg(UndefReg)
2620 .addImm(AMDGPU::sub1);
2621 I.eraseFromParent();
2622
2623 return RBI.constrainGenericRegister(DstReg, *DstRC, *MRI) &&
2624 RBI.constrainGenericRegister(SrcReg, *SrcRC, *MRI);
2625 }
2626
2627 if (SrcBank->getID() == AMDGPU::VGPRRegBankID && DstSize <= 32) {
2628 // 64-bit should have been split up in RegBankSelect
2629
2630 // Try to use an and with a mask if it will save code size.
2631 unsigned Mask;
2632 if (!Signed && shouldUseAndMask(SrcSize, Mask)) {
2633 MachineInstr *ExtI =
2634 BuildMI(MBB, I, DL, TII.get(AMDGPU::V_AND_B32_e32), DstReg)
2635 .addImm(Mask)
2636 .addReg(SrcReg);
2637 I.eraseFromParent();
2638 return constrainSelectedInstRegOperands(*ExtI, TII, TRI, RBI);
2639 }
2640
2641 const unsigned BFE = Signed ? AMDGPU::V_BFE_I32_e64 : AMDGPU::V_BFE_U32_e64;
2642 MachineInstr *ExtI =
2643 BuildMI(MBB, I, DL, TII.get(BFE), DstReg)
2644 .addReg(SrcReg)
2645 .addImm(0) // Offset
2646 .addImm(SrcSize); // Width
2647 I.eraseFromParent();
2648 return constrainSelectedInstRegOperands(*ExtI, TII, TRI, RBI);
2649 }
2650
2651 if (SrcBank->getID() == AMDGPU::SGPRRegBankID && DstSize <= 64) {
2652 const TargetRegisterClass &SrcRC = InReg && DstSize > 32 ?
2653 AMDGPU::SReg_64RegClass : AMDGPU::SReg_32RegClass;
2654 if (!RBI.constrainGenericRegister(SrcReg, SrcRC, *MRI))
2655 return false;
2656
2657 if (Signed && DstSize == 32 && (SrcSize == 8 || SrcSize == 16)) {
2658 const unsigned SextOpc = SrcSize == 8 ?
2659 AMDGPU::S_SEXT_I32_I8 : AMDGPU::S_SEXT_I32_I16;
2660 BuildMI(MBB, I, DL, TII.get(SextOpc), DstReg)
2661 .addReg(SrcReg);
2662 I.eraseFromParent();
2663 return RBI.constrainGenericRegister(DstReg, AMDGPU::SReg_32RegClass, *MRI);
2664 }
2665
2666 // Using a single 32-bit SALU to calculate the high half is smaller than
2667 // S_BFE with a literal constant operand.
2668 if (DstSize > 32 && SrcSize == 32) {
2669 Register HiReg = MRI->createVirtualRegister(&AMDGPU::SReg_32RegClass);
2670 unsigned SubReg = InReg ? AMDGPU::sub0 : AMDGPU::NoSubRegister;
2671 if (Signed) {
2672 BuildMI(MBB, I, DL, TII.get(AMDGPU::S_ASHR_I32), HiReg)
2673 .addReg(SrcReg, 0, SubReg)
2674 .addImm(31)
2675 .setOperandDead(3); // Dead scc
2676 } else {
2677 BuildMI(MBB, I, DL, TII.get(AMDGPU::S_MOV_B32), HiReg)
2678 .addImm(0);
2679 }
2680 BuildMI(MBB, I, DL, TII.get(AMDGPU::REG_SEQUENCE), DstReg)
2681 .addReg(SrcReg, 0, SubReg)
2682 .addImm(AMDGPU::sub0)
2683 .addReg(HiReg)
2684 .addImm(AMDGPU::sub1);
2685 I.eraseFromParent();
2686 return RBI.constrainGenericRegister(DstReg, AMDGPU::SReg_64RegClass,
2687 *MRI);
2688 }
2689
2690 const unsigned BFE64 = Signed ? AMDGPU::S_BFE_I64 : AMDGPU::S_BFE_U64;
2691 const unsigned BFE32 = Signed ? AMDGPU::S_BFE_I32 : AMDGPU::S_BFE_U32;
2692
2693 // Scalar BFE is encoded as S1[5:0] = offset, S1[22:16] = width.
2694 if (DstSize > 32 && (SrcSize <= 32 || InReg)) {
2695 // We need a 64-bit register source, but the high bits don't matter.
2696 Register ExtReg = MRI->createVirtualRegister(&AMDGPU::SReg_64RegClass);
2697 Register UndefReg = MRI->createVirtualRegister(&AMDGPU::SReg_32RegClass);
2698 unsigned SubReg = InReg ? AMDGPU::sub0 : AMDGPU::NoSubRegister;
2699
2700 BuildMI(MBB, I, DL, TII.get(AMDGPU::IMPLICIT_DEF), UndefReg);
2701 BuildMI(MBB, I, DL, TII.get(AMDGPU::REG_SEQUENCE), ExtReg)
2702 .addReg(SrcReg, 0, SubReg)
2703 .addImm(AMDGPU::sub0)
2704 .addReg(UndefReg)
2705 .addImm(AMDGPU::sub1);
2706
2707 BuildMI(MBB, I, DL, TII.get(BFE64), DstReg)
2708 .addReg(ExtReg)
2709 .addImm(SrcSize << 16);
2710
2711 I.eraseFromParent();
2712 return RBI.constrainGenericRegister(DstReg, AMDGPU::SReg_64RegClass, *MRI);
2713 }
2714
2715 unsigned Mask;
2716 if (!Signed && shouldUseAndMask(SrcSize, Mask)) {
2717 BuildMI(MBB, I, DL, TII.get(AMDGPU::S_AND_B32), DstReg)
2718 .addReg(SrcReg)
2719 .addImm(Mask)
2720 .setOperandDead(3); // Dead scc
2721 } else {
2722 BuildMI(MBB, I, DL, TII.get(BFE32), DstReg)
2723 .addReg(SrcReg)
2724 .addImm(SrcSize << 16);
2725 }
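// The immediates above follow the scalar BFE encoding mentioned earlier
// (offset in bits [5:0], width in bits [22:16]); e.g. a zero-extend from 8
// bits uses an immediate of 8 << 16 = 0x80000, i.e. offset 0 and width 8.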
2726
2727 I.eraseFromParent();
2728 return RBI.constrainGenericRegister(DstReg, AMDGPU::SReg_32RegClass, *MRI);
2729 }
2730
2731 return false;
2732}
2733
2737
2738static Register stripBitCast(Register Reg, MachineRegisterInfo &MRI) {
2739 Register BitcastSrc;
2740 if (mi_match(Reg, MRI, m_GBitcast(m_Reg(BitcastSrc))))
2741 Reg = BitcastSrc;
2742 return Reg;
2743}
2744
2745static bool isExtractHiElt(MachineRegisterInfo &MRI, Register In,
2746 Register &Out) {
2747 Register Trunc;
2748 if (!mi_match(In, MRI, m_GTrunc(m_Reg(Trunc))))
2749 return false;
2750
2751 Register LShlSrc;
2752 Register Cst;
2753 if (mi_match(Trunc, MRI, m_GLShr(m_Reg(LShlSrc), m_Reg(Cst)))) {
2754 Cst = stripCopy(Cst, MRI);
2755 if (mi_match(Cst, MRI, m_SpecificICst(16))) {
2756 Out = stripBitCast(LShlSrc, MRI);
2757 return true;
2758 }
2759 }
2760
2761 MachineInstr *Shuffle = MRI.getVRegDef(Trunc);
2762 if (Shuffle->getOpcode() != AMDGPU::G_SHUFFLE_VECTOR)
2763 return false;
2764
2765 assert(MRI.getType(Shuffle->getOperand(0).getReg()) ==
2766 LLT::fixed_vector(2, 16));
2767
2768 ArrayRef<int> Mask = Shuffle->getOperand(3).getShuffleMask();
2769 assert(Mask.size() == 2);
2770
2771 if (Mask[0] == 1 && Mask[1] <= 1) {
2772 Out = Shuffle->getOperand(0).getReg();
2773 return true;
2774 }
2775
2776 return false;
2777}
2778
2779bool AMDGPUInstructionSelector::selectG_FPEXT(MachineInstr &I) const {
2780 if (!Subtarget->hasSALUFloatInsts())
2781 return false;
2782
2783 Register Dst = I.getOperand(0).getReg();
2784 const RegisterBank *DstRB = RBI.getRegBank(Dst, *MRI, TRI);
2785 if (DstRB->getID() != AMDGPU::SGPRRegBankID)
2786 return false;
2787
2788 Register Src = I.getOperand(1).getReg();
2789
2790 if (MRI->getType(Dst) == LLT::scalar(32) &&
2791 MRI->getType(Src) == LLT::scalar(16)) {
2792 if (isExtractHiElt(*MRI, Src, Src)) {
2793 MachineBasicBlock *BB = I.getParent();
2794 BuildMI(*BB, &I, I.getDebugLoc(), TII.get(AMDGPU::S_CVT_HI_F32_F16), Dst)
2795 .addUse(Src);
2796 I.eraseFromParent();
2797 return RBI.constrainGenericRegister(Dst, AMDGPU::SReg_32RegClass, *MRI);
2798 }
2799 }
2800
2801 return false;
2802}
2803
2804bool AMDGPUInstructionSelector::selectG_FNEG(MachineInstr &MI) const {
2805 // Only manually handle the f64 SGPR case.
2806 //
2807 // FIXME: This is a workaround for 2.5 different tablegen problems. Because
2808 // the bit ops theoretically have a second result due to the implicit def of
2809 // SCC, the GlobalISelEmitter is overly conservative and rejects it. Fixing
2810 // that is easy by disabling the check. The result works, but uses a
2811 // nonsensical sreg32orlds_and_sreg_1 regclass.
2812 //
2813 // The DAG emitter is more problematic, and incorrectly adds both S_XOR_B32 to
2814 // the variadic REG_SEQUENCE operands.
2815
2816 Register Dst = MI.getOperand(0).getReg();
2817 const RegisterBank *DstRB = RBI.getRegBank(Dst, *MRI, TRI);
2818 if (DstRB->getID() != AMDGPU::SGPRRegBankID ||
2819 MRI->getType(Dst) != LLT::scalar(64))
2820 return false;
2821
2822 Register Src = MI.getOperand(1).getReg();
2823 MachineInstr *Fabs = getOpcodeDef(TargetOpcode::G_FABS, Src, *MRI);
2824 if (Fabs)
2825 Src = Fabs->getOperand(1).getReg();
2826
2827 if (!RBI.constrainGenericRegister(Src, AMDGPU::SReg_64RegClass, *MRI) ||
2828 !RBI.constrainGenericRegister(Dst, AMDGPU::SReg_64RegClass, *MRI))
2829 return false;
2830
2831 MachineBasicBlock *BB = MI.getParent();
2832 const DebugLoc &DL = MI.getDebugLoc();
2833 Register LoReg = MRI->createVirtualRegister(&AMDGPU::SReg_32RegClass);
2834 Register HiReg = MRI->createVirtualRegister(&AMDGPU::SReg_32RegClass);
2835 Register ConstReg = MRI->createVirtualRegister(&AMDGPU::SReg_32RegClass);
2836 Register OpReg = MRI->createVirtualRegister(&AMDGPU::SReg_32RegClass);
2837
2838 BuildMI(*BB, &MI, DL, TII.get(AMDGPU::COPY), LoReg)
2839 .addReg(Src, 0, AMDGPU::sub0);
2840 BuildMI(*BB, &MI, DL, TII.get(AMDGPU::COPY), HiReg)
2841 .addReg(Src, 0, AMDGPU::sub1);
2842 BuildMI(*BB, &MI, DL, TII.get(AMDGPU::S_MOV_B32), ConstReg)
2843 .addImm(0x80000000);
2844
2845 // Set or toggle sign bit.
2846 unsigned Opc = Fabs ? AMDGPU::S_OR_B32 : AMDGPU::S_XOR_B32;
2847 BuildMI(*BB, &MI, DL, TII.get(Opc), OpReg)
2848 .addReg(HiReg)
2849 .addReg(ConstReg)
2850 .setOperandDead(3); // Dead scc
2851 BuildMI(*BB, &MI, DL, TII.get(AMDGPU::REG_SEQUENCE), Dst)
2852 .addReg(LoReg)
2853 .addImm(AMDGPU::sub0)
2854 .addReg(OpReg)
2855 .addImm(AMDGPU::sub1);
2856 MI.eraseFromParent();
2857 return true;
2858}
2859
2860// FIXME: This is a workaround for the same tablegen problems as G_FNEG
2861bool AMDGPUInstructionSelector::selectG_FABS(MachineInstr &MI) const {
2862 Register Dst = MI.getOperand(0).getReg();
2863 const RegisterBank *DstRB = RBI.getRegBank(Dst, *MRI, TRI);
2864 if (DstRB->getID() != AMDGPU::SGPRRegBankID ||
2865 MRI->getType(Dst) != LLT::scalar(64))
2866 return false;
2867
2868 Register Src = MI.getOperand(1).getReg();
2869 MachineBasicBlock *BB = MI.getParent();
2870 const DebugLoc &DL = MI.getDebugLoc();
2871 Register LoReg = MRI->createVirtualRegister(&AMDGPU::SReg_32RegClass);
2872 Register HiReg = MRI->createVirtualRegister(&AMDGPU::SReg_32RegClass);
2873 Register ConstReg = MRI->createVirtualRegister(&AMDGPU::SReg_32RegClass);
2874 Register OpReg = MRI->createVirtualRegister(&AMDGPU::SReg_32RegClass);
2875
2876 if (!RBI.constrainGenericRegister(Src, AMDGPU::SReg_64RegClass, *MRI) ||
2877 !RBI.constrainGenericRegister(Dst, AMDGPU::SReg_64RegClass, *MRI))
2878 return false;
2879
2880 BuildMI(*BB, &MI, DL, TII.get(AMDGPU::COPY), LoReg)
2881 .addReg(Src, 0, AMDGPU::sub0);
2882 BuildMI(*BB, &MI, DL, TII.get(AMDGPU::COPY), HiReg)
2883 .addReg(Src, 0, AMDGPU::sub1);
2884 BuildMI(*BB, &MI, DL, TII.get(AMDGPU::S_MOV_B32), ConstReg)
2885 .addImm(0x7fffffff);
2886
2887 // Clear sign bit.
2888 // TODO: Should this use S_BITSET0_*?
2889 BuildMI(*BB, &MI, DL, TII.get(AMDGPU::S_AND_B32), OpReg)
2890 .addReg(HiReg)
2891 .addReg(ConstReg)
2892 .setOperandDead(3); // Dead scc
2893 BuildMI(*BB, &MI, DL, TII.get(AMDGPU::REG_SEQUENCE), Dst)
2894 .addReg(LoReg)
2895 .addImm(AMDGPU::sub0)
2896 .addReg(OpReg)
2897 .addImm(AMDGPU::sub1);
2898
2899 MI.eraseFromParent();
2900 return true;
2901}
2902
2903static bool isConstant(const MachineInstr &MI) {
2904 return MI.getOpcode() == TargetOpcode::G_CONSTANT;
2905}
2906
2907void AMDGPUInstructionSelector::getAddrModeInfo(const MachineInstr &Load,
2908 const MachineRegisterInfo &MRI, SmallVectorImpl<GEPInfo> &AddrInfo) const {
2909
2910 unsigned OpNo = Load.getOpcode() == AMDGPU::G_PREFETCH ? 0 : 1;
2911 const MachineInstr *PtrMI =
2912 MRI.getUniqueVRegDef(Load.getOperand(OpNo).getReg());
2913
2914 assert(PtrMI);
2915
2916 if (PtrMI->getOpcode() != TargetOpcode::G_PTR_ADD)
2917 return;
2918
2919 GEPInfo GEPInfo;
2920
2921 for (unsigned i = 1; i != 3; ++i) {
2922 const MachineOperand &GEPOp = PtrMI->getOperand(i);
2923 const MachineInstr *OpDef = MRI.getUniqueVRegDef(GEPOp.getReg());
2924 assert(OpDef);
2925 if (i == 2 && isConstant(*OpDef)) {
2926 // TODO: Could handle constant base + variable offset, but a combine
2927 // probably should have commuted it.
2928 assert(GEPInfo.Imm == 0);
2929 GEPInfo.Imm = OpDef->getOperand(1).getCImm()->getSExtValue();
2930 continue;
2931 }
2932 const RegisterBank *OpBank = RBI.getRegBank(GEPOp.getReg(), MRI, TRI);
2933 if (OpBank->getID() == AMDGPU::SGPRRegBankID)
2934 GEPInfo.SgprParts.push_back(GEPOp.getReg());
2935 else
2936 GEPInfo.VgprParts.push_back(GEPOp.getReg());
2937 }
2938
2939 AddrInfo.push_back(GEPInfo);
2940 getAddrModeInfo(*PtrMI, MRI, AddrInfo);
2941}
2942
2943bool AMDGPUInstructionSelector::isSGPR(Register Reg) const {
2944 return RBI.getRegBank(Reg, *MRI, TRI)->getID() == AMDGPU::SGPRRegBankID;
2945}
2946
2947bool AMDGPUInstructionSelector::isInstrUniform(const MachineInstr &MI) const {
2948 if (!MI.hasOneMemOperand())
2949 return false;
2950
2951 const MachineMemOperand *MMO = *MI.memoperands_begin();
2952 const Value *Ptr = MMO->getValue();
2953
2954 // UndefValue means this is a load of a kernel input. These are uniform.
2955 // Sometimes LDS instructions have constant pointers.
2956 // If Ptr is null, then that means this mem operand contains a
2957 // PseudoSourceValue like GOT.
2958 if (!Ptr || isa<UndefValue, Argument, Constant, GlobalValue>(Ptr))
2959 return true;
2960
2961 if (MMO->getAddrSpace() == AMDGPUAS::CONSTANT_ADDRESS_32BIT)
2962 return true;
2963
2964 if (MI.getOpcode() == AMDGPU::G_PREFETCH)
2965 return RBI.getRegBank(MI.getOperand(0).getReg(), *MRI, TRI)->getID() ==
2966 AMDGPU::SGPRRegBankID;
2967
2968 const Instruction *I = dyn_cast<Instruction>(Ptr);
2969 return I && I->getMetadata("amdgpu.uniform");
2970}
2971
2972bool AMDGPUInstructionSelector::hasVgprParts(ArrayRef<GEPInfo> AddrInfo) const {
2973 for (const GEPInfo &GEPInfo : AddrInfo) {
2974 if (!GEPInfo.VgprParts.empty())
2975 return true;
2976 }
2977 return false;
2978}
2979
2980void AMDGPUInstructionSelector::initM0(MachineInstr &I) const {
2981 const LLT PtrTy = MRI->getType(I.getOperand(1).getReg());
2982 unsigned AS = PtrTy.getAddressSpace();
2983 if ((AS == AMDGPUAS::LOCAL_ADDRESS || AS == AMDGPUAS::REGION_ADDRESS) &&
2984 STI.ldsRequiresM0Init()) {
2985 MachineBasicBlock *BB = I.getParent();
2986
2987 // If DS instructions require M0 initialization, insert it before selecting.
2988 BuildMI(*BB, &I, I.getDebugLoc(), TII.get(AMDGPU::S_MOV_B32), AMDGPU::M0)
2989 .addImm(-1);
2990 }
2991}
2992
2993bool AMDGPUInstructionSelector::selectG_LOAD_STORE_ATOMICRMW(
2994 MachineInstr &I) const {
2995 initM0(I);
2996 return selectImpl(I, *CoverageInfo);
2997}
2998
2999static bool isVCmpResult(Register Reg, MachineRegisterInfo &MRI) {
3000 if (Reg.isPhysical())
3001 return false;
3002
3003 MachineInstr &MI = *MRI.getUniqueVRegDef(Reg);
3004 const unsigned Opcode = MI.getOpcode();
3005
3006 if (Opcode == AMDGPU::COPY)
3007 return isVCmpResult(MI.getOperand(1).getReg(), MRI);
3008
3009 if (Opcode == AMDGPU::G_AND || Opcode == AMDGPU::G_OR ||
3010 Opcode == AMDGPU::G_XOR)
3011 return isVCmpResult(MI.getOperand(1).getReg(), MRI) &&
3012 isVCmpResult(MI.getOperand(2).getReg(), MRI);
3013
3014 if (auto *GI = dyn_cast<GIntrinsic>(&MI))
3015 return GI->is(Intrinsic::amdgcn_class);
3016
3017 return Opcode == AMDGPU::G_ICMP || Opcode == AMDGPU::G_FCMP;
3018}
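// isVCmpResult is used below when selecting G_BRCOND: compares,
// llvm.amdgcn.class, and bitwise combinations of them already produce a
// well-formed lane mask, so the extra AND with exec can be skipped.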
3019
3020bool AMDGPUInstructionSelector::selectG_BRCOND(MachineInstr &I) const {
3021 MachineBasicBlock *BB = I.getParent();
3022 MachineOperand &CondOp = I.getOperand(0);
3023 Register CondReg = CondOp.getReg();
3024 const DebugLoc &DL = I.getDebugLoc();
3025
3026 unsigned BrOpcode;
3027 Register CondPhysReg;
3028 const TargetRegisterClass *ConstrainRC;
3029
3030 // In SelectionDAG, we inspect the IR block for uniformity metadata to decide
3031 // whether the branch is uniform when selecting the instruction. In
3032 // GlobalISel, we should push that decision into RegBankSelect. Assume for
3033 // now that RegBankSelect knows what it's doing if the branch condition is
3034 // scc, even though it currently does not.
3035 if (!isVCC(CondReg, *MRI)) {
3036 if (MRI->getType(CondReg) != LLT::scalar(32))
3037 return false;
3038
3039 CondPhysReg = AMDGPU::SCC;
3040 BrOpcode = AMDGPU::S_CBRANCH_SCC1;
3041 ConstrainRC = &AMDGPU::SReg_32RegClass;
3042 } else {
3043 // FIXME: Should scc->vcc copies be ANDed with exec?
3044
3045 // Unless the value of CondReg is a result of a V_CMP* instruction then we
3046 // need to insert an and with exec.
3047 if (!isVCmpResult(CondReg, *MRI)) {
3048 const bool Is64 = STI.isWave64();
3049 const unsigned Opcode = Is64 ? AMDGPU::S_AND_B64 : AMDGPU::S_AND_B32;
3050 const Register Exec = Is64 ? AMDGPU::EXEC : AMDGPU::EXEC_LO;
3051
3052 Register TmpReg = MRI->createVirtualRegister(TRI.getBoolRC());
3053 BuildMI(*BB, &I, DL, TII.get(Opcode), TmpReg)
3054 .addReg(CondReg)
3055 .addReg(Exec)
3056 .setOperandDead(3); // Dead scc
3057 CondReg = TmpReg;
3058 }
3059
3060 CondPhysReg = TRI.getVCC();
3061 BrOpcode = AMDGPU::S_CBRANCH_VCCNZ;
3062 ConstrainRC = TRI.getBoolRC();
3063 }
3064
3065 if (!MRI->getRegClassOrNull(CondReg))
3066 MRI->setRegClass(CondReg, ConstrainRC);
3067
3068 BuildMI(*BB, &I, DL, TII.get(AMDGPU::COPY), CondPhysReg)
3069 .addReg(CondReg);
3070 BuildMI(*BB, &I, DL, TII.get(BrOpcode))
3071 .addMBB(I.getOperand(1).getMBB());
3072
3073 I.eraseFromParent();
3074 return true;
3075}
3076
3077bool AMDGPUInstructionSelector::selectG_GLOBAL_VALUE(
3078 MachineInstr &I) const {
3079 Register DstReg = I.getOperand(0).getReg();
3080 const RegisterBank *DstRB = RBI.getRegBank(DstReg, *MRI, TRI);
3081 const bool IsVGPR = DstRB->getID() == AMDGPU::VGPRRegBankID;
3082 I.setDesc(TII.get(IsVGPR ? AMDGPU::V_MOV_B32_e32 : AMDGPU::S_MOV_B32));
3083 if (IsVGPR)
3084 I.addOperand(*MF, MachineOperand::CreateReg(AMDGPU::EXEC, false, true));
3085
3086 return RBI.constrainGenericRegister(
3087 DstReg, IsVGPR ? AMDGPU::VGPR_32RegClass : AMDGPU::SReg_32RegClass, *MRI);
3088}
3089
3090bool AMDGPUInstructionSelector::selectG_PTRMASK(MachineInstr &I) const {
3091 Register DstReg = I.getOperand(0).getReg();
3092 Register SrcReg = I.getOperand(1).getReg();
3093 Register MaskReg = I.getOperand(2).getReg();
3094 LLT Ty = MRI->getType(DstReg);
3095 LLT MaskTy = MRI->getType(MaskReg);
3096 MachineBasicBlock *BB = I.getParent();
3097 const DebugLoc &DL = I.getDebugLoc();
3098
3099 const RegisterBank *DstRB = RBI.getRegBank(DstReg, *MRI, TRI);
3100 const RegisterBank *SrcRB = RBI.getRegBank(SrcReg, *MRI, TRI);
3101 const RegisterBank *MaskRB = RBI.getRegBank(MaskReg, *MRI, TRI);
3102 const bool IsVGPR = DstRB->getID() == AMDGPU::VGPRRegBankID;
3103 if (DstRB != SrcRB) // Should only happen for hand written MIR.
3104 return false;
3105
3106 // Try to avoid emitting a bit operation when we only need to touch half of
3107 // the 64-bit pointer.
3108 APInt MaskOnes = VT->getKnownOnes(MaskReg).zext(64);
3109 const APInt MaskHi32 = APInt::getHighBitsSet(64, 32);
3110 const APInt MaskLo32 = APInt::getLowBitsSet(64, 32);
3111
3112 const bool CanCopyLow32 = (MaskOnes & MaskLo32) == MaskLo32;
3113 const bool CanCopyHi32 = (MaskOnes & MaskHi32) == MaskHi32;
3114
3115 if (!IsVGPR && Ty.getSizeInBits() == 64 &&
3116 !CanCopyLow32 && !CanCopyHi32) {
3117 auto MIB = BuildMI(*BB, &I, DL, TII.get(AMDGPU::S_AND_B64), DstReg)
3118 .addReg(SrcReg)
3119 .addReg(MaskReg)
3120 .setOperandDead(3); // Dead scc
3121 I.eraseFromParent();
3122 return constrainSelectedInstRegOperands(*MIB, TII, TRI, RBI);
3123 }
3124
3125 unsigned NewOpc = IsVGPR ? AMDGPU::V_AND_B32_e64 : AMDGPU::S_AND_B32;
3126 const TargetRegisterClass &RegRC
3127 = IsVGPR ? AMDGPU::VGPR_32RegClass : AMDGPU::SReg_32RegClass;
3128
3129 const TargetRegisterClass *DstRC = TRI.getRegClassForTypeOnBank(Ty, *DstRB);
3130 const TargetRegisterClass *SrcRC = TRI.getRegClassForTypeOnBank(Ty, *SrcRB);
3131 const TargetRegisterClass *MaskRC =
3132 TRI.getRegClassForTypeOnBank(MaskTy, *MaskRB);
3133
3134 if (!RBI.constrainGenericRegister(DstReg, *DstRC, *MRI) ||
3135 !RBI.constrainGenericRegister(SrcReg, *SrcRC, *MRI) ||
3136 !RBI.constrainGenericRegister(MaskReg, *MaskRC, *MRI))
3137 return false;
3138
3139 if (Ty.getSizeInBits() == 32) {
3140 assert(MaskTy.getSizeInBits() == 32 &&
3141 "ptrmask should have been narrowed during legalize");
3142
3143 auto NewOp = BuildMI(*BB, &I, DL, TII.get(NewOpc), DstReg)
3144 .addReg(SrcReg)
3145 .addReg(MaskReg);
3146
3147 if (!IsVGPR)
3148 NewOp.setOperandDead(3); // Dead scc
3149 I.eraseFromParent();
3150 return true;
3151 }
3152
3153 Register HiReg = MRI->createVirtualRegister(&RegRC);
3154 Register LoReg = MRI->createVirtualRegister(&RegRC);
3155
3156 // Extract the subregisters from the source pointer.
3157 BuildMI(*BB, &I, DL, TII.get(AMDGPU::COPY), LoReg)
3158 .addReg(SrcReg, 0, AMDGPU::sub0);
3159 BuildMI(*BB, &I, DL, TII.get(AMDGPU::COPY), HiReg)
3160 .addReg(SrcReg, 0, AMDGPU::sub1);
3161
3162 Register MaskedLo, MaskedHi;
3163
3164 if (CanCopyLow32) {
3165 // If all the bits in the low half are 1, we only need a copy for it.
3166 MaskedLo = LoReg;
3167 } else {
3168 // Extract the mask subregister and apply the and.
3169 Register MaskLo = MRI->createVirtualRegister(&RegRC);
3170 MaskedLo = MRI->createVirtualRegister(&RegRC);
3171
3172 BuildMI(*BB, &I, DL, TII.get(AMDGPU::COPY), MaskLo)
3173 .addReg(MaskReg, 0, AMDGPU::sub0);
3174 BuildMI(*BB, &I, DL, TII.get(NewOpc), MaskedLo)
3175 .addReg(LoReg)
3176 .addReg(MaskLo);
3177 }
3178
3179 if (CanCopyHi32) {
3180 // If all the bits in the high half are 1, we only need a copy for it.
3181 MaskedHi = HiReg;
3182 } else {
3183 Register MaskHi = MRI->createVirtualRegister(&RegRC);
3184 MaskedHi = MRI->createVirtualRegister(&RegRC);
3185
3186 BuildMI(*BB, &I, DL, TII.get(AMDGPU::COPY), MaskHi)
3187 .addReg(MaskReg, 0, AMDGPU::sub1);
3188 BuildMI(*BB, &I, DL, TII.get(NewOpc), MaskedHi)
3189 .addReg(HiReg)
3190 .addReg(MaskHi);
3191 }
3192
3193 BuildMI(*BB, &I, DL, TII.get(AMDGPU::REG_SEQUENCE), DstReg)
3194 .addReg(MaskedLo)
3195 .addImm(AMDGPU::sub0)
3196 .addReg(MaskedHi)
3197 .addImm(AMDGPU::sub1);
3198 I.eraseFromParent();
3199 return true;
3200}
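// Illustrative example: if the mask's low 32 bits are known to be all ones,
// only the high half of a 64-bit SGPR pointer actually needs masking, so the
// code above emits a COPY for sub0, a single S_AND_B32 for sub1, and a
// REG_SEQUENCE to rebuild the 64-bit result.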
3201
3202/// Return the register to use for the index value, and the subregister to use
3203/// for the indirectly accessed register.
3204static std::pair<Register, unsigned>
3205computeIndirectRegIndex(MachineRegisterInfo &MRI, const SIRegisterInfo &TRI,
3206 const TargetRegisterClass *SuperRC, Register IdxReg,
3207 unsigned EltSize, GISelValueTracking &ValueTracking) {
3208 Register IdxBaseReg;
3209 int Offset;
3210
3211 std::tie(IdxBaseReg, Offset) =
3212 AMDGPU::getBaseWithConstantOffset(MRI, IdxReg, &ValueTracking);
3213 if (IdxBaseReg == AMDGPU::NoRegister) {
3214 // This will happen if the index is a known constant. This should ordinarily
3215 // be legalized out, but handle it as a register just in case.
3216 assert(Offset == 0);
3217 IdxBaseReg = IdxReg;
3218 }
3219
3220 ArrayRef<int16_t> SubRegs = TRI.getRegSplitParts(SuperRC, EltSize);
3221
3222 // Skip out of bounds offsets, or else we would end up using an undefined
3223 // register.
3224 if (static_cast<unsigned>(Offset) >= SubRegs.size())
3225 return std::pair(IdxReg, SubRegs[0]);
3226 return std::pair(IdxBaseReg, SubRegs[Offset]);
3227}
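// For example, indexing 32-bit elements (EltSize == 4) of a 128-bit super
// register with an index defined as (%base + 2) splits the class into
// {sub0..sub3} and returns {%base, sub2}; a constant offset that is out of
// bounds falls back to {IdxReg, sub0}.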
3228
3229bool AMDGPUInstructionSelector::selectG_EXTRACT_VECTOR_ELT(
3230 MachineInstr &MI) const {
3231 Register DstReg = MI.getOperand(0).getReg();
3232 Register SrcReg = MI.getOperand(1).getReg();
3233 Register IdxReg = MI.getOperand(2).getReg();
3234
3235 LLT DstTy = MRI->getType(DstReg);
3236 LLT SrcTy = MRI->getType(SrcReg);
3237
3238 const RegisterBank *DstRB = RBI.getRegBank(DstReg, *MRI, TRI);
3239 const RegisterBank *SrcRB = RBI.getRegBank(SrcReg, *MRI, TRI);
3240 const RegisterBank *IdxRB = RBI.getRegBank(IdxReg, *MRI, TRI);
3241
3242 // The index must be scalar. If it wasn't, RegBankSelect should have moved
3243 // this into a waterfall loop.
3244 if (IdxRB->getID() != AMDGPU::SGPRRegBankID)
3245 return false;
3246
3247 const TargetRegisterClass *SrcRC =
3248 TRI.getRegClassForTypeOnBank(SrcTy, *SrcRB);
3249 const TargetRegisterClass *DstRC =
3250 TRI.getRegClassForTypeOnBank(DstTy, *DstRB);
3251 if (!SrcRC || !DstRC)
3252 return false;
3253 if (!RBI.constrainGenericRegister(SrcReg, *SrcRC, *MRI) ||
3254 !RBI.constrainGenericRegister(DstReg, *DstRC, *MRI) ||
3255 !RBI.constrainGenericRegister(IdxReg, AMDGPU::SReg_32RegClass, *MRI))
3256 return false;
3257
3258 MachineBasicBlock *BB = MI.getParent();
3259 const DebugLoc &DL = MI.getDebugLoc();
3260 const bool Is64 = DstTy.getSizeInBits() == 64;
3261
3262 unsigned SubReg;
3263 std::tie(IdxReg, SubReg) = computeIndirectRegIndex(
3264 *MRI, TRI, SrcRC, IdxReg, DstTy.getSizeInBits() / 8, *VT);
3265
3266 if (SrcRB->getID() == AMDGPU::SGPRRegBankID) {
3267 if (DstTy.getSizeInBits() != 32 && !Is64)
3268 return false;
3269
3270 BuildMI(*BB, &MI, DL, TII.get(AMDGPU::COPY), AMDGPU::M0)
3271 .addReg(IdxReg);
3272
3273 unsigned Opc = Is64 ? AMDGPU::S_MOVRELS_B64 : AMDGPU::S_MOVRELS_B32;
3274 BuildMI(*BB, &MI, DL, TII.get(Opc), DstReg)
3275 .addReg(SrcReg, 0, SubReg)
3276 .addReg(SrcReg, RegState::Implicit);
3277 MI.eraseFromParent();
3278 return true;
3279 }
3280
3281 if (SrcRB->getID() != AMDGPU::VGPRRegBankID || DstTy.getSizeInBits() != 32)
3282 return false;
3283
3284 if (!STI.useVGPRIndexMode()) {
3285 BuildMI(*BB, &MI, DL, TII.get(AMDGPU::COPY), AMDGPU::M0)
3286 .addReg(IdxReg);
3287 BuildMI(*BB, &MI, DL, TII.get(AMDGPU::V_MOVRELS_B32_e32), DstReg)
3288 .addReg(SrcReg, 0, SubReg)
3289 .addReg(SrcReg, RegState::Implicit);
3290 MI.eraseFromParent();
3291 return true;
3292 }
3293
3294 const MCInstrDesc &GPRIDXDesc =
3295 TII.getIndirectGPRIDXPseudo(TRI.getRegSizeInBits(*SrcRC), true);
3296 BuildMI(*BB, MI, DL, GPRIDXDesc, DstReg)
3297 .addReg(SrcReg)
3298 .addReg(IdxReg)
3299 .addImm(SubReg);
3300
3301 MI.eraseFromParent();
3302 return true;
3303}
3304
3305// TODO: Fold insert_vector_elt (extract_vector_elt) into movrelsd
3306bool AMDGPUInstructionSelector::selectG_INSERT_VECTOR_ELT(
3307 MachineInstr &MI) const {
3308 Register DstReg = MI.getOperand(0).getReg();
3309 Register VecReg = MI.getOperand(1).getReg();
3310 Register ValReg = MI.getOperand(2).getReg();
3311 Register IdxReg = MI.getOperand(3).getReg();
3312
3313 LLT VecTy = MRI->getType(DstReg);
3314 LLT ValTy = MRI->getType(ValReg);
3315 unsigned VecSize = VecTy.getSizeInBits();
3316 unsigned ValSize = ValTy.getSizeInBits();
3317
3318 const RegisterBank *VecRB = RBI.getRegBank(VecReg, *MRI, TRI);
3319 const RegisterBank *ValRB = RBI.getRegBank(ValReg, *MRI, TRI);
3320 const RegisterBank *IdxRB = RBI.getRegBank(IdxReg, *MRI, TRI);
3321
3322 assert(VecTy.getElementType() == ValTy);
3323
3324 // The index must be scalar. If it wasn't, RegBankSelect should have moved
3325 // this into a waterfall loop.
3326 if (IdxRB->getID() != AMDGPU::SGPRRegBankID)
3327 return false;
3328
3329 const TargetRegisterClass *VecRC =
3330 TRI.getRegClassForTypeOnBank(VecTy, *VecRB);
3331 const TargetRegisterClass *ValRC =
3332 TRI.getRegClassForTypeOnBank(ValTy, *ValRB);
3333
3334 if (!RBI.constrainGenericRegister(VecReg, *VecRC, *MRI) ||
3335 !RBI.constrainGenericRegister(DstReg, *VecRC, *MRI) ||
3336 !RBI.constrainGenericRegister(ValReg, *ValRC, *MRI) ||
3337 !RBI.constrainGenericRegister(IdxReg, AMDGPU::SReg_32RegClass, *MRI))
3338 return false;
3339
3340 if (VecRB->getID() == AMDGPU::VGPRRegBankID && ValSize != 32)
3341 return false;
3342
3343 unsigned SubReg;
3344 std::tie(IdxReg, SubReg) =
3345 computeIndirectRegIndex(*MRI, TRI, VecRC, IdxReg, ValSize / 8, *VT);
3346
3347 const bool IndexMode = VecRB->getID() == AMDGPU::VGPRRegBankID &&
3348 STI.useVGPRIndexMode();
3349
3350 MachineBasicBlock *BB = MI.getParent();
3351 const DebugLoc &DL = MI.getDebugLoc();
3352
3353 if (!IndexMode) {
3354 BuildMI(*BB, &MI, DL, TII.get(AMDGPU::COPY), AMDGPU::M0)
3355 .addReg(IdxReg);
3356
3357 const MCInstrDesc &RegWriteOp = TII.getIndirectRegWriteMovRelPseudo(
3358 VecSize, ValSize, VecRB->getID() == AMDGPU::SGPRRegBankID);
3359 BuildMI(*BB, MI, DL, RegWriteOp, DstReg)
3360 .addReg(VecReg)
3361 .addReg(ValReg)
3362 .addImm(SubReg);
3363 MI.eraseFromParent();
3364 return true;
3365 }
3366
3367 const MCInstrDesc &GPRIDXDesc =
3368 TII.getIndirectGPRIDXPseudo(TRI.getRegSizeInBits(*VecRC), false);
3369 BuildMI(*BB, MI, DL, GPRIDXDesc, DstReg)
3370 .addReg(VecReg)
3371 .addReg(ValReg)
3372 .addReg(IdxReg)
3373 .addImm(SubReg);
3374
3375 MI.eraseFromParent();
3376 return true;
3377}
3378
3379bool AMDGPUInstructionSelector::selectBufferLoadLds(MachineInstr &MI) const {
3380 if (!Subtarget->hasVMemToLDSLoad())
3381 return false;
3382 unsigned Opc;
3383 unsigned Size = MI.getOperand(3).getImm();
3384
3385 // The struct intrinsic variants add one additional operand over raw.
3386 const bool HasVIndex = MI.getNumOperands() == 9;
3387 Register VIndex;
3388 int OpOffset = 0;
3389 if (HasVIndex) {
3390 VIndex = MI.getOperand(4).getReg();
3391 OpOffset = 1;
3392 }
3393
3394 Register VOffset = MI.getOperand(4 + OpOffset).getReg();
3395 std::optional<ValueAndVReg> MaybeVOffset =
3396 getIConstantVRegValWithLookThrough(VOffset, *MRI);
3397 const bool HasVOffset = !MaybeVOffset || MaybeVOffset->Value.getZExtValue();
3398
3399 switch (Size) {
3400 default:
3401 return false;
3402 case 1:
3403 Opc = HasVIndex ? HasVOffset ? AMDGPU::BUFFER_LOAD_UBYTE_LDS_BOTHEN
3404 : AMDGPU::BUFFER_LOAD_UBYTE_LDS_IDXEN
3405 : HasVOffset ? AMDGPU::BUFFER_LOAD_UBYTE_LDS_OFFEN
3406 : AMDGPU::BUFFER_LOAD_UBYTE_LDS_OFFSET;
3407 break;
3408 case 2:
3409 Opc = HasVIndex ? HasVOffset ? AMDGPU::BUFFER_LOAD_USHORT_LDS_BOTHEN
3410 : AMDGPU::BUFFER_LOAD_USHORT_LDS_IDXEN
3411 : HasVOffset ? AMDGPU::BUFFER_LOAD_USHORT_LDS_OFFEN
3412 : AMDGPU::BUFFER_LOAD_USHORT_LDS_OFFSET;
3413 break;
3414 case 4:
3415 Opc = HasVIndex ? HasVOffset ? AMDGPU::BUFFER_LOAD_DWORD_LDS_BOTHEN
3416 : AMDGPU::BUFFER_LOAD_DWORD_LDS_IDXEN
3417 : HasVOffset ? AMDGPU::BUFFER_LOAD_DWORD_LDS_OFFEN
3418 : AMDGPU::BUFFER_LOAD_DWORD_LDS_OFFSET;
3419 break;
3420 case 12:
3421 if (!Subtarget->hasLDSLoadB96_B128())
3422 return false;
3423
3424 Opc = HasVIndex ? HasVOffset ? AMDGPU::BUFFER_LOAD_DWORDX3_LDS_BOTHEN
3425 : AMDGPU::BUFFER_LOAD_DWORDX3_LDS_IDXEN
3426 : HasVOffset ? AMDGPU::BUFFER_LOAD_DWORDX3_LDS_OFFEN
3427 : AMDGPU::BUFFER_LOAD_DWORDX3_LDS_OFFSET;
3428 break;
3429 case 16:
3430 if (!Subtarget->hasLDSLoadB96_B128())
3431 return false;
3432
3433 Opc = HasVIndex ? HasVOffset ? AMDGPU::BUFFER_LOAD_DWORDX4_LDS_BOTHEN
3434 : AMDGPU::BUFFER_LOAD_DWORDX4_LDS_IDXEN
3435 : HasVOffset ? AMDGPU::BUFFER_LOAD_DWORDX4_LDS_OFFEN
3436 : AMDGPU::BUFFER_LOAD_DWORDX4_LDS_OFFSET;
3437 break;
3438 }
3439
3440 MachineBasicBlock *MBB = MI.getParent();
3441 const DebugLoc &DL = MI.getDebugLoc();
3442 BuildMI(*MBB, &MI, DL, TII.get(AMDGPU::COPY), AMDGPU::M0)
3443 .add(MI.getOperand(2));
3444
3445 auto MIB = BuildMI(*MBB, &MI, DL, TII.get(Opc));
3446
3447 if (HasVIndex && HasVOffset) {
3448 Register IdxReg = MRI->createVirtualRegister(TRI.getVGPR64Class());
3449 BuildMI(*MBB, &*MIB, DL, TII.get(AMDGPU::REG_SEQUENCE), IdxReg)
3450 .addReg(VIndex)
3451 .addImm(AMDGPU::sub0)
3452 .addReg(VOffset)
3453 .addImm(AMDGPU::sub1);
3454
3455 MIB.addReg(IdxReg);
3456 } else if (HasVIndex) {
3457 MIB.addReg(VIndex);
3458 } else if (HasVOffset) {
3459 MIB.addReg(VOffset);
3460 }
3461
3462 MIB.add(MI.getOperand(1)); // rsrc
3463 MIB.add(MI.getOperand(5 + OpOffset)); // soffset
3464 MIB.add(MI.getOperand(6 + OpOffset)); // imm offset
3465 bool IsGFX12Plus = AMDGPU::isGFX12Plus(STI);
3466 unsigned Aux = MI.getOperand(7 + OpOffset).getImm();
3467 MIB.addImm(Aux & (IsGFX12Plus ? AMDGPU::CPol::ALL
3468 : AMDGPU::CPol::ALL_pregfx12)); // cpol
3469 MIB.addImm(
3470 Aux & (IsGFX12Plus ? AMDGPU::CPol::SWZ : AMDGPU::CPol::SWZ_pregfx12)
3471 ? 1
3472 : 0); // swz
3473
3474 MachineMemOperand *LoadMMO = *MI.memoperands_begin();
3475 // Don't set the offset value here because the pointer points to the base of
3476 // the buffer.
3477 MachinePointerInfo LoadPtrI = LoadMMO->getPointerInfo();
3478
3479 MachinePointerInfo StorePtrI = LoadPtrI;
3480 LoadPtrI.V = PoisonValue::get(PointerType::get(MF->getFunction().getContext(),
3481 AMDGPUAS::BUFFER_RESOURCE));
3482 StorePtrI.V = nullptr;
3483 StorePtrI.AddrSpace = AMDGPUAS::LOCAL_ADDRESS;
3484
3485 auto F = LoadMMO->getFlags() &
3486 ~(MachineMemOperand::MOStore | MachineMemOperand::MOLoad);
3487 LoadMMO = MF->getMachineMemOperand(LoadPtrI, F | MachineMemOperand::MOLoad,
3488 Size, LoadMMO->getBaseAlign());
3489
3490 MachineMemOperand *StoreMMO =
3491 MF->getMachineMemOperand(StorePtrI, F | MachineMemOperand::MOStore,
3492 sizeof(int32_t), LoadMMO->getBaseAlign());
3493
3494 MIB.setMemRefs({LoadMMO, StoreMMO});
3495
3496 MI.eraseFromParent();
3497 return constrainSelectedInstRegOperands(*MIB, TII, TRI, RBI);
3498}
3499
3500/// Match a zero extend from a 32-bit value to 64-bits.
3501Register AMDGPUInstructionSelector::matchZeroExtendFromS32(Register Reg) const {
3502 Register ZExtSrc;
3503 if (mi_match(Reg, *MRI, m_GZExt(m_Reg(ZExtSrc))))
3504 return MRI->getType(ZExtSrc) == LLT::scalar(32) ? ZExtSrc : Register();
3505
3506 // Match legalized form %zext = G_MERGE_VALUES (s32 %x), (s32 0)
3507 const MachineInstr *Def = getDefIgnoringCopies(Reg, *MRI);
3508 if (Def->getOpcode() != AMDGPU::G_MERGE_VALUES)
3509 return Register();
3510
3511 assert(Def->getNumOperands() == 3 &&
3512 MRI->getType(Def->getOperand(0).getReg()) == LLT::scalar(64));
3513 if (mi_match(Def->getOperand(2).getReg(), *MRI, m_ZeroInt())) {
3514 return Def->getOperand(1).getReg();
3515 }
3516
3517 return Register();
3518}
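// For example, given
//   %zero:_(s32) = G_CONSTANT i32 0
//   %val:_(s64) = G_MERGE_VALUES %x:_(s32), %zero:_(s32)
// matchZeroExtendFromS32(%val) returns %x, while a merge whose high half is
// not a known zero yields an invalid Register().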
3519
3520/// Match a sign extend from a 32-bit value to 64-bits.
3521Register AMDGPUInstructionSelector::matchSignExtendFromS32(Register Reg) const {
3522 Register SExtSrc;
3523 if (mi_match(Reg, *MRI, m_GSExt(m_Reg(SExtSrc))))
3524 return MRI->getType(SExtSrc) == LLT::scalar(32) ? SExtSrc : Register();
3525
3526 // Match legalized form %sext = G_MERGE_VALUES (s32 %x), (s32 (G_ASHR %x, 31))
3527 const MachineInstr *Def = getDefIgnoringCopies(Reg, *MRI);
3528 if (Def->getOpcode() != AMDGPU::G_MERGE_VALUES)
3529 return Register();
3530
3531 assert(Def->getNumOperands() == 3 &&
3532 MRI->getType(Def->getOperand(0).getReg()) == LLT::scalar(64));
3533 if (mi_match(Def->getOperand(2).getReg(), *MRI,
3534 m_GAShr(m_SpecificReg(Def->getOperand(1).getReg()),
3535 m_SpecificICst(31))))
3536 return Def->getOperand(1).getReg();
3537
3538 if (VT->signBitIsZero(Reg))
3539 return matchZeroExtendFromS32(Reg);
3540
3541 return Register();
3542}
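// For example,
//   %hi:_(s32) = G_ASHR %x:_(s32), 31
//   %val:_(s64) = G_MERGE_VALUES %x:_(s32), %hi:_(s32)
// is recognized as a sign extension of %x; when the sign bit of %val is known
// to be zero, the zero-extension matcher above is reused instead.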
3543
3544/// Match a zero extend from a 32-bit value to 64-bits, or \p Reg itself if it
3545/// is 32-bit.
3546Register
3547 AMDGPUInstructionSelector::matchZeroExtendFromS32OrS32(Register Reg) const {
3548 return MRI->getType(Reg) == LLT::scalar(32) ? Reg
3549 : matchZeroExtendFromS32(Reg);
3550}
3551
3552/// Match a sign extend from a 32-bit value to 64-bits, or \p Reg itself if it
3553/// is 32-bit.
3554Register
3555 AMDGPUInstructionSelector::matchSignExtendFromS32OrS32(Register Reg) const {
3556 return MRI->getType(Reg) == LLT::scalar(32) ? Reg
3557 : matchSignExtendFromS32(Reg);
3558}
3559
3560Register
3561 AMDGPUInstructionSelector::matchExtendFromS32OrS32(Register Reg,
3562 bool IsSigned) const {
3563 if (IsSigned)
3564 return matchSignExtendFromS32OrS32(Reg);
3565
3566 return matchZeroExtendFromS32OrS32(Reg);
3567}
3568
3569Register AMDGPUInstructionSelector::matchAnyExtendFromS32(Register Reg) const {
3570 Register AnyExtSrc;
3571 if (mi_match(Reg, *MRI, m_GAnyExt(m_Reg(AnyExtSrc))))
3572 return MRI->getType(AnyExtSrc) == LLT::scalar(32) ? AnyExtSrc : Register();
3573
3574 // Match legalized form %zext = G_MERGE_VALUES (s32 %x), (s32 G_IMPLICIT_DEF)
3575 const MachineInstr *Def = getDefIgnoringCopies(Reg, *MRI);
3576 if (Def->getOpcode() != AMDGPU::G_MERGE_VALUES)
3577 return Register();
3578
3579 assert(Def->getNumOperands() == 3 &&
3580 MRI->getType(Def->getOperand(0).getReg()) == LLT::scalar(64));
3581
3582 if (mi_match(Def->getOperand(2).getReg(), *MRI, m_GImplicitDef()))
3583 return Def->getOperand(1).getReg();
3584
3585 return Register();
3586}
3587
3588bool AMDGPUInstructionSelector::selectGlobalLoadLds(MachineInstr &MI) const {
3589 if (!Subtarget->hasVMemToLDSLoad())
3590 return false;
3591
3592 unsigned Opc;
3593 unsigned Size = MI.getOperand(3).getImm();
3594
3595 switch (Size) {
3596 default:
3597 return false;
3598 case 1:
3599 Opc = AMDGPU::GLOBAL_LOAD_LDS_UBYTE;
3600 break;
3601 case 2:
3602 Opc = AMDGPU::GLOBAL_LOAD_LDS_USHORT;
3603 break;
3604 case 4:
3605 Opc = AMDGPU::GLOBAL_LOAD_LDS_DWORD;
3606 break;
3607 case 12:
3608 if (!Subtarget->hasLDSLoadB96_B128())
3609 return false;
3610 Opc = AMDGPU::GLOBAL_LOAD_LDS_DWORDX3;
3611 break;
3612 case 16:
3613 if (!Subtarget->hasLDSLoadB96_B128())
3614 return false;
3615 Opc = AMDGPU::GLOBAL_LOAD_LDS_DWORDX4;
3616 break;
3617 }
3618
3619 MachineBasicBlock *MBB = MI.getParent();
3620 const DebugLoc &DL = MI.getDebugLoc();
3621 BuildMI(*MBB, &MI, DL, TII.get(AMDGPU::COPY), AMDGPU::M0)
3622 .add(MI.getOperand(2));
3623
3624 Register Addr = MI.getOperand(1).getReg();
3625 Register VOffset;
3626 // Try to split SAddr and VOffset. Global and LDS pointers share the same
3627 // immediate offset, so we cannot use a regular SelectGlobalSAddr().
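// For example, for
//   %off64:_(s64) = G_ZEXT %voff:_(s32)
//   %addr:_(p1) = G_PTR_ADD %sbase, %off64
// with %sbase in SGPRs, the code below picks Addr = %sbase and
// VOffset = %voff so the SADDR form of the load can be used.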
3628 if (!isSGPR(Addr)) {
3629 auto AddrDef = getDefSrcRegIgnoringCopies(Addr, *MRI);
3630 if (isSGPR(AddrDef->Reg)) {
3631 Addr = AddrDef->Reg;
3632 } else if (AddrDef->MI->getOpcode() == AMDGPU::G_PTR_ADD) {
3633 Register SAddr =
3634 getSrcRegIgnoringCopies(AddrDef->MI->getOperand(1).getReg(), *MRI);
3635 if (isSGPR(SAddr)) {
3636 Register PtrBaseOffset = AddrDef->MI->getOperand(2).getReg();
3637 if (Register Off = matchZeroExtendFromS32(PtrBaseOffset)) {
3638 Addr = SAddr;
3639 VOffset = Off;
3640 }
3641 }
3642 }
3643 }
3644
3645 if (isSGPR(Addr)) {
3646 Opc = AMDGPU::getGlobalSaddrOp(Opc);
3647 if (!VOffset) {
3648 VOffset = MRI->createVirtualRegister(&AMDGPU::VGPR_32RegClass);
3649 BuildMI(*MBB, &MI, DL, TII.get(AMDGPU::V_MOV_B32_e32), VOffset)
3650 .addImm(0);
3651 }
3652 }
3653
3654 auto MIB = BuildMI(*MBB, &MI, DL, TII.get(Opc))
3655 .addReg(Addr);
3656
3657 if (isSGPR(Addr))
3658 MIB.addReg(VOffset);
3659
3660 MIB.add(MI.getOperand(4)); // offset
3661
3662 unsigned Aux = MI.getOperand(5).getImm();
3663 MIB.addImm(Aux & ~AMDGPU::CPol::VIRTUAL_BITS); // cpol
3664
3665 MachineMemOperand *LoadMMO = *MI.memoperands_begin();
3666 MachinePointerInfo LoadPtrI = LoadMMO->getPointerInfo();
3667 LoadPtrI.Offset = MI.getOperand(4).getImm();
3668 MachinePointerInfo StorePtrI = LoadPtrI;
3669 LoadPtrI.V = PoisonValue::get(PointerType::get(MF->getFunction().getContext(),
3670 AMDGPUAS::GLOBAL_ADDRESS));
3671 StorePtrI.V = nullptr;
3672 StorePtrI.AddrSpace = AMDGPUAS::LOCAL_ADDRESS;
3673 auto F = LoadMMO->getFlags() &
3674 ~(MachineMemOperand::MOStore | MachineMemOperand::MOLoad);
3675 LoadMMO = MF->getMachineMemOperand(LoadPtrI, F | MachineMemOperand::MOLoad,
3676 Size, LoadMMO->getBaseAlign());
3677 MachineMemOperand *StoreMMO =
3678 MF->getMachineMemOperand(StorePtrI, F | MachineMemOperand::MOStore,
3679 sizeof(int32_t), Align(4));
3680
3681 MIB.setMemRefs({LoadMMO, StoreMMO});
3682
3683 MI.eraseFromParent();
3684 return constrainSelectedInstRegOperands(*MIB, TII, TRI, RBI);
3685}
3686
3687bool AMDGPUInstructionSelector::selectBVHIntersectRayIntrinsic(
3688 MachineInstr &MI) const {
3689 unsigned OpcodeOpIdx =
3690 MI.getOpcode() == AMDGPU::G_AMDGPU_BVH_INTERSECT_RAY ? 1 : 3;
3691 MI.setDesc(TII.get(MI.getOperand(OpcodeOpIdx).getImm()));
3692 MI.removeOperand(OpcodeOpIdx);
3693 MI.addImplicitDefUseOperands(*MI.getParent()->getParent());
3694 return constrainSelectedInstRegOperands(MI, TII, TRI, RBI);
3695}
3696
3697// FIXME: This should be removed and selection left to the patterns. We just
3698// need the AGPR/VGPR combination versions.
3699bool AMDGPUInstructionSelector::selectSMFMACIntrin(MachineInstr &MI) const {
3700 unsigned Opc;
3701 switch (cast<GIntrinsic>(MI).getIntrinsicID()) {
3702 case Intrinsic::amdgcn_smfmac_f32_16x16x32_f16:
3703 Opc = AMDGPU::V_SMFMAC_F32_16X16X32_F16_e64;
3704 break;
3705 case Intrinsic::amdgcn_smfmac_f32_32x32x16_f16:
3706 Opc = AMDGPU::V_SMFMAC_F32_32X32X16_F16_e64;
3707 break;
3708 case Intrinsic::amdgcn_smfmac_f32_16x16x32_bf16:
3709 Opc = AMDGPU::V_SMFMAC_F32_16X16X32_BF16_e64;
3710 break;
3711 case Intrinsic::amdgcn_smfmac_f32_32x32x16_bf16:
3712 Opc = AMDGPU::V_SMFMAC_F32_32X32X16_BF16_e64;
3713 break;
3714 case Intrinsic::amdgcn_smfmac_i32_16x16x64_i8:
3715 Opc = AMDGPU::V_SMFMAC_I32_16X16X64_I8_e64;
3716 break;
3717 case Intrinsic::amdgcn_smfmac_i32_32x32x32_i8:
3718 Opc = AMDGPU::V_SMFMAC_I32_32X32X32_I8_e64;
3719 break;
3720 case Intrinsic::amdgcn_smfmac_f32_16x16x64_bf8_bf8:
3721 Opc = AMDGPU::V_SMFMAC_F32_16X16X64_BF8_BF8_e64;
3722 break;
3723 case Intrinsic::amdgcn_smfmac_f32_16x16x64_bf8_fp8:
3724 Opc = AMDGPU::V_SMFMAC_F32_16X16X64_BF8_FP8_e64;
3725 break;
3726 case Intrinsic::amdgcn_smfmac_f32_16x16x64_fp8_bf8:
3727 Opc = AMDGPU::V_SMFMAC_F32_16X16X64_FP8_BF8_e64;
3728 break;
3729 case Intrinsic::amdgcn_smfmac_f32_16x16x64_fp8_fp8:
3730 Opc = AMDGPU::V_SMFMAC_F32_16X16X64_FP8_FP8_e64;
3731 break;
3732 case Intrinsic::amdgcn_smfmac_f32_32x32x32_bf8_bf8:
3733 Opc = AMDGPU::V_SMFMAC_F32_32X32X32_BF8_BF8_e64;
3734 break;
3735 case Intrinsic::amdgcn_smfmac_f32_32x32x32_bf8_fp8:
3736 Opc = AMDGPU::V_SMFMAC_F32_32X32X32_BF8_FP8_e64;
3737 break;
3738 case Intrinsic::amdgcn_smfmac_f32_32x32x32_fp8_bf8:
3739 Opc = AMDGPU::V_SMFMAC_F32_32X32X32_FP8_BF8_e64;
3740 break;
3741 case Intrinsic::amdgcn_smfmac_f32_32x32x32_fp8_fp8:
3742 Opc = AMDGPU::V_SMFMAC_F32_32X32X32_FP8_FP8_e64;
3743 break;
3744 case Intrinsic::amdgcn_smfmac_f32_16x16x64_f16:
3745 Opc = AMDGPU::V_SMFMAC_F32_16X16X64_F16_e64;
3746 break;
3747 case Intrinsic::amdgcn_smfmac_f32_32x32x32_f16:
3748 Opc = AMDGPU::V_SMFMAC_F32_32X32X32_F16_e64;
3749 break;
3750 case Intrinsic::amdgcn_smfmac_f32_16x16x64_bf16:
3751 Opc = AMDGPU::V_SMFMAC_F32_16X16X64_BF16_e64;
3752 break;
3753 case Intrinsic::amdgcn_smfmac_f32_32x32x32_bf16:
3754 Opc = AMDGPU::V_SMFMAC_F32_32X32X32_BF16_e64;
3755 break;
3756 case Intrinsic::amdgcn_smfmac_i32_16x16x128_i8:
3757 Opc = AMDGPU::V_SMFMAC_I32_16X16X128_I8_e64;
3758 break;
3759 case Intrinsic::amdgcn_smfmac_i32_32x32x64_i8:
3760 Opc = AMDGPU::V_SMFMAC_I32_32X32X64_I8_e64;
3761 break;
3762 case Intrinsic::amdgcn_smfmac_f32_16x16x128_bf8_bf8:
3763 Opc = AMDGPU::V_SMFMAC_F32_16X16X128_BF8_BF8_e64;
3764 break;
3765 case Intrinsic::amdgcn_smfmac_f32_16x16x128_bf8_fp8:
3766 Opc = AMDGPU::V_SMFMAC_F32_16X16X128_BF8_FP8_e64;
3767 break;
3768 case Intrinsic::amdgcn_smfmac_f32_16x16x128_fp8_bf8:
3769 Opc = AMDGPU::V_SMFMAC_F32_16X16X128_FP8_BF8_e64;
3770 break;
3771 case Intrinsic::amdgcn_smfmac_f32_16x16x128_fp8_fp8:
3772 Opc = AMDGPU::V_SMFMAC_F32_16X16X128_FP8_FP8_e64;
3773 break;
3774 case Intrinsic::amdgcn_smfmac_f32_32x32x64_bf8_bf8:
3775 Opc = AMDGPU::V_SMFMAC_F32_32X32X64_BF8_BF8_e64;
3776 break;
3777 case Intrinsic::amdgcn_smfmac_f32_32x32x64_bf8_fp8:
3778 Opc = AMDGPU::V_SMFMAC_F32_32X32X64_BF8_FP8_e64;
3779 break;
3780 case Intrinsic::amdgcn_smfmac_f32_32x32x64_fp8_bf8:
3781 Opc = AMDGPU::V_SMFMAC_F32_32X32X64_FP8_BF8_e64;
3782 break;
3783 case Intrinsic::amdgcn_smfmac_f32_32x32x64_fp8_fp8:
3784 Opc = AMDGPU::V_SMFMAC_F32_32X32X64_FP8_FP8_e64;
3785 break;
3786 default:
3787 llvm_unreachable("unhandled smfmac intrinsic");
3788 }
3789
3790 auto VDst_In = MI.getOperand(4);
3791
3792 MI.setDesc(TII.get(Opc));
3793 MI.removeOperand(4); // VDst_In
3794 MI.removeOperand(1); // Intrinsic ID
3795 MI.addOperand(VDst_In); // Readd VDst_In to the end
3796 MI.addImplicitDefUseOperands(*MI.getParent()->getParent());
3797 const MCInstrDesc &MCID = MI.getDesc();
3798 if (MCID.getOperandConstraint(0, MCOI::EARLY_CLOBBER) != -1) {
3799 MI.getOperand(0).setIsEarlyClobber(true);
3800 }
3801 return true;
3802}
3803
3804bool AMDGPUInstructionSelector::selectPermlaneSwapIntrin(
3805 MachineInstr &MI, Intrinsic::ID IntrID) const {
3806 if (IntrID == Intrinsic::amdgcn_permlane16_swap &&
3807 !Subtarget->hasPermlane16Swap())
3808 return false;
3809 if (IntrID == Intrinsic::amdgcn_permlane32_swap &&
3810 !Subtarget->hasPermlane32Swap())
3811 return false;
3812
3813 unsigned Opcode = IntrID == Intrinsic::amdgcn_permlane16_swap
3814 ? AMDGPU::V_PERMLANE16_SWAP_B32_e64
3815 : AMDGPU::V_PERMLANE32_SWAP_B32_e64;
3816
3817 MI.removeOperand(2);
3818 MI.setDesc(TII.get(Opcode));
3819 MI.addOperand(*MF, MachineOperand::CreateReg(AMDGPU::EXEC, false, true));
3820
3821 MachineOperand &FI = MI.getOperand(4);
3822 FI.setImm(FI.getImm() ? AMDGPU::DPP::DPP_FI_1 : AMDGPU::DPP::DPP_FI_0);
3823
3824 return constrainSelectedInstRegOperands(MI, TII, TRI, RBI);
3825}
3826
3827bool AMDGPUInstructionSelector::selectWaveAddress(MachineInstr &MI) const {
3828 Register DstReg = MI.getOperand(0).getReg();
3829 Register SrcReg = MI.getOperand(1).getReg();
3830 const RegisterBank *DstRB = RBI.getRegBank(DstReg, *MRI, TRI);
3831 const bool IsVALU = DstRB->getID() == AMDGPU::VGPRRegBankID;
3832 MachineBasicBlock *MBB = MI.getParent();
3833 const DebugLoc &DL = MI.getDebugLoc();
3834
3835 if (IsVALU) {
3836 BuildMI(*MBB, MI, DL, TII.get(AMDGPU::V_LSHRREV_B32_e64), DstReg)
3837 .addImm(Subtarget->getWavefrontSizeLog2())
3838 .addReg(SrcReg);
3839 } else {
3840 BuildMI(*MBB, MI, DL, TII.get(AMDGPU::S_LSHR_B32), DstReg)
3841 .addReg(SrcReg)
3842 .addImm(Subtarget->getWavefrontSizeLog2())
3843 .setOperandDead(3); // Dead scc
3844 }
3845
3846 const TargetRegisterClass &RC =
3847 IsVALU ? AMDGPU::VGPR_32RegClass : AMDGPU::SReg_32RegClass;
3848 if (!RBI.constrainGenericRegister(DstReg, RC, *MRI))
3849 return false;
3850
3851 MI.eraseFromParent();
3852 return true;
3853}
3854
3855// Match a BITOP3 operation and return the number of matched instructions plus
3856// the truth table.
3857static std::pair<unsigned, uint8_t> BitOp3_Op(Register R,
3858 SmallVectorImpl<Register> &Src,
3859 const MachineRegisterInfo &MRI) {
3860 unsigned NumOpcodes = 0;
3861 uint8_t LHSBits, RHSBits;
3862
3863 auto getOperandBits = [&Src, R, &MRI](Register Op, uint8_t &Bits) -> bool {
3864 // Define truth table given Src0, Src1, Src2 bits permutations:
3865 // 0 0 0
3866 // 0 0 1
3867 // 0 1 0
3868 // 0 1 1
3869 // 1 0 0
3870 // 1 0 1
3871 // 1 1 0
3872 // 1 1 1
3873 const uint8_t SrcBits[3] = { 0xf0, 0xcc, 0xaa };
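// For example, for f(Src0, Src1, Src2) = (Src0 & Src1) | Src2 the truth
// table is computed bitwise from these constants:
//   (0xf0 & 0xcc) | 0xaa = 0xc0 | 0xaa = 0xea
// and 0xea is the immediate a matched BITOP3 would carry.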
3874
3875 if (mi_match(Op, MRI, m_AllOnesInt())) {
3876 Bits = 0xff;
3877 return true;
3878 }
3879 if (mi_match(Op, MRI, m_ZeroInt())) {
3880 Bits = 0;
3881 return true;
3882 }
3883
3884 for (unsigned I = 0; I < Src.size(); ++I) {
3885 // Try to find existing reused operand
3886 if (Src[I] == Op) {
3887 Bits = SrcBits[I];
3888 return true;
3889 }
3890 // Try to replace parent operator
3891 if (Src[I] == R) {
3892 Bits = SrcBits[I];
3893 Src[I] = Op;
3894 return true;
3895 }
3896 }
3897
3898 if (Src.size() == 3) {
3899 // No room left for operands. Try one last time; there can be a 'not' of
3900 // one of our source operands. In this case we can compute the bits
3901 // without growing the Src vector.
3902 Register LHS;
3903 if (mi_match(Op, MRI, m_Not(m_Reg(LHS)))) {
3905 for (unsigned I = 0; I < Src.size(); ++I) {
3906 if (Src[I] == LHS) {
3907 Bits = ~SrcBits[I];
3908 return true;
3909 }
3910 }
3911 }
3912
3913 return false;
3914 }
3915
3916 Bits = SrcBits[Src.size()];
3917 Src.push_back(Op);
3918 return true;
3919 };
3920
3921 MachineInstr *MI = MRI.getVRegDef(R);
3922 switch (MI->getOpcode()) {
3923 case TargetOpcode::G_AND:
3924 case TargetOpcode::G_OR:
3925 case TargetOpcode::G_XOR: {
3926 Register LHS = getSrcRegIgnoringCopies(MI->getOperand(1).getReg(), MRI);
3927 Register RHS = getSrcRegIgnoringCopies(MI->getOperand(2).getReg(), MRI);
3928
3929 SmallVector<Register, 3> Backup(Src.begin(), Src.end());
3930 if (!getOperandBits(LHS, LHSBits) ||
3931 !getOperandBits(RHS, RHSBits)) {
3932 Src = Backup;
3933 return std::make_pair(0, 0);
3934 }
3935
3936 // Recursion is naturally limited by the size of the operand vector.
3937 auto Op = BitOp3_Op(LHS, Src, MRI);
3938 if (Op.first) {
3939 NumOpcodes += Op.first;
3940 LHSBits = Op.second;
3941 }
3942
3943 Op = BitOp3_Op(RHS, Src, MRI);
3944 if (Op.first) {
3945 NumOpcodes += Op.first;
3946 RHSBits = Op.second;
3947 }
3948 break;
3949 }
3950 default:
3951 return std::make_pair(0, 0);
3952 }
3953
3954 uint8_t TTbl;
3955 switch (MI->getOpcode()) {
3956 case TargetOpcode::G_AND:
3957 TTbl = LHSBits & RHSBits;
3958 break;
3959 case TargetOpcode::G_OR:
3960 TTbl = LHSBits | RHSBits;
3961 break;
3962 case TargetOpcode::G_XOR:
3963 TTbl = LHSBits ^ RHSBits;
3964 break;
3965 default:
3966 break;
3967 }
3968
3969 return std::make_pair(NumOpcodes + 1, TTbl);
3970}
3971
3972bool AMDGPUInstructionSelector::selectBITOP3(MachineInstr &MI) const {
3973 if (!Subtarget->hasBitOp3Insts())
3974 return false;
3975
3976 Register DstReg = MI.getOperand(0).getReg();
3977 const RegisterBank *DstRB = RBI.getRegBank(DstReg, *MRI, TRI);
3978 const bool IsVALU = DstRB->getID() == AMDGPU::VGPRRegBankID;
3979 if (!IsVALU)
3980 return false;
3981
3983 uint8_t TTbl;
3984 unsigned NumOpcodes;
3985
3986 std::tie(NumOpcodes, TTbl) = BitOp3_Op(DstReg, Src, *MRI);
3987
3988 // The Src.empty() case can happen if all operands are all-zeros or all-ones.
3989 // Normally it should have been optimized out before reaching this point.
3990 if (NumOpcodes < 2 || Src.empty())
3991 return false;
3992
3993 const bool IsB32 = MRI->getType(DstReg) == LLT::scalar(32);
3994 if (NumOpcodes == 2 && IsB32) {
3995 // Avoid using BITOP3 for OR3, XOR3, AND_OR. This is not faster but makes
3996 // asm more readable. This cannot be modeled with AddedComplexity because the
3997 // selector does not know how many operations we matched.
3998 if (mi_match(MI, *MRI, m_GXor(m_GXor(m_Reg(), m_Reg()), m_Reg())) ||
3999 mi_match(MI, *MRI, m_GOr(m_GOr(m_Reg(), m_Reg()), m_Reg())) ||
4000 mi_match(MI, *MRI, m_GOr(m_GAnd(m_Reg(), m_Reg()), m_Reg())))
4001 return false;
4002 } else if (NumOpcodes < 4) {
4003 // For the uniform case the threshold should be higher to account for moves
4004 // between VGPRs and SGPRs. It needs one operand in a VGPR, the other two can
4005 // be in SGPRs, with a readfirstlane afterwards.
4006 return false;
4007 }
4008
4009 unsigned Opc = IsB32 ? AMDGPU::V_BITOP3_B32_e64 : AMDGPU::V_BITOP3_B16_e64;
4010 if (!IsB32 && STI.hasTrue16BitInsts())
4011 Opc = STI.useRealTrue16Insts() ? AMDGPU::V_BITOP3_B16_gfx1250_t16_e64
4012 : AMDGPU::V_BITOP3_B16_gfx1250_fake16_e64;
4013 unsigned CBL = STI.getConstantBusLimit(Opc);
4014 MachineBasicBlock *MBB = MI.getParent();
4015 const DebugLoc &DL = MI.getDebugLoc();
4016
4017 for (unsigned I = 0; I < Src.size(); ++I) {
4018 const RegisterBank *RB = RBI.getRegBank(Src[I], *MRI, TRI);
4019 if (RB->getID() != AMDGPU::SGPRRegBankID)
4020 continue;
4021 if (CBL > 0) {
4022 --CBL;
4023 continue;
4024 }
4025 Register NewReg = MRI->createVirtualRegister(&AMDGPU::VGPR_32RegClass);
4026 BuildMI(*MBB, MI, DL, TII.get(AMDGPU::COPY), NewReg)
4027 .addReg(Src[I]);
4028 Src[I] = NewReg;
4029 }
4030
4031 // The last operand can be ignored, turning a ternary operation into a binary
4032 // one. For example: (~a & b & c) | (~a & b & ~c) -> (~a & b). We can replace
4033 // 'c' with 'a' here without changing the answer. In some pathological cases
4034 // it should even be possible to get an operation with a single operand if the
4035 // optimizer does not catch it.
4036 while (Src.size() < 3)
4037 Src.push_back(Src[0]);
4038
4039 auto MIB = BuildMI(*MBB, MI, DL, TII.get(Opc), DstReg);
4040 if (!IsB32)
4041 MIB.addImm(0); // src_mod0
4042 MIB.addReg(Src[0]);
4043 if (!IsB32)
4044 MIB.addImm(0); // src_mod1
4045 MIB.addReg(Src[1]);
4046 if (!IsB32)
4047 MIB.addImm(0); // src_mod2
4048 MIB.addReg(Src[2])
4049 .addImm(TTbl);
4050 if (!IsB32)
4051 MIB.addImm(0); // op_sel
4052
4053 constrainSelectedInstRegOperands(*MIB, TII, TRI, RBI);
4054 MI.eraseFromParent();
4055
4056 return true;
4057}
4058
4059bool AMDGPUInstructionSelector::selectStackRestore(MachineInstr &MI) const {
4060 Register SrcReg = MI.getOperand(0).getReg();
4061 if (!RBI.constrainGenericRegister(SrcReg, AMDGPU::SReg_32RegClass, *MRI))
4062 return false;
4063
4064 MachineInstr *DefMI = MRI->getVRegDef(SrcReg);
4065 Register SP =
4066 Subtarget->getTargetLowering()->getStackPointerRegisterToSaveRestore();
4067 Register WaveAddr = getWaveAddress(DefMI);
4068 MachineBasicBlock *MBB = MI.getParent();
4069 const DebugLoc &DL = MI.getDebugLoc();
4070
4071 if (!WaveAddr) {
4072 WaveAddr = MRI->createVirtualRegister(&AMDGPU::SReg_32RegClass);
4073 BuildMI(*MBB, MI, DL, TII.get(AMDGPU::S_LSHR_B32), WaveAddr)
4074 .addReg(SrcReg)
4075 .addImm(Subtarget->getWavefrontSizeLog2())
4076 .setOperandDead(3); // Dead scc
4077 }
4078
4079 BuildMI(*MBB, &MI, DL, TII.get(AMDGPU::COPY), SP)
4080 .addReg(WaveAddr);
4081
4082 MI.eraseFromParent();
4083 return true;
4084}
4085
4086bool AMDGPUInstructionSelector::select(MachineInstr &I) {
4087
4088 if (!I.isPreISelOpcode()) {
4089 if (I.isCopy())
4090 return selectCOPY(I);
4091 return true;
4092 }
4093
4094 switch (I.getOpcode()) {
4095 case TargetOpcode::G_AND:
4096 case TargetOpcode::G_OR:
4097 case TargetOpcode::G_XOR:
4098 if (selectBITOP3(I))
4099 return true;
4100 if (selectImpl(I, *CoverageInfo))
4101 return true;
4102 return selectG_AND_OR_XOR(I);
4103 case TargetOpcode::G_ADD:
4104 case TargetOpcode::G_SUB:
4105 case TargetOpcode::G_PTR_ADD:
4106 if (selectImpl(I, *CoverageInfo))
4107 return true;
4108 return selectG_ADD_SUB(I);
4109 case TargetOpcode::G_UADDO:
4110 case TargetOpcode::G_USUBO:
4111 case TargetOpcode::G_UADDE:
4112 case TargetOpcode::G_USUBE:
4113 return selectG_UADDO_USUBO_UADDE_USUBE(I);
4114 case AMDGPU::G_AMDGPU_MAD_U64_U32:
4115 case AMDGPU::G_AMDGPU_MAD_I64_I32:
4116 return selectG_AMDGPU_MAD_64_32(I);
4117 case TargetOpcode::G_INTTOPTR:
4118 case TargetOpcode::G_BITCAST:
4119 case TargetOpcode::G_PTRTOINT:
4120 case TargetOpcode::G_FREEZE:
4121 return selectCOPY(I);
4122 case TargetOpcode::G_FNEG:
4123 if (selectImpl(I, *CoverageInfo))
4124 return true;
4125 return selectG_FNEG(I);
4126 case TargetOpcode::G_FABS:
4127 if (selectImpl(I, *CoverageInfo))
4128 return true;
4129 return selectG_FABS(I);
4130 case TargetOpcode::G_EXTRACT:
4131 return selectG_EXTRACT(I);
4132 case TargetOpcode::G_MERGE_VALUES:
4133 case TargetOpcode::G_CONCAT_VECTORS:
4134 return selectG_MERGE_VALUES(I);
4135 case TargetOpcode::G_UNMERGE_VALUES:
4136 return selectG_UNMERGE_VALUES(I);
4137 case TargetOpcode::G_BUILD_VECTOR:
4138 case TargetOpcode::G_BUILD_VECTOR_TRUNC:
4139 return selectG_BUILD_VECTOR(I);
4140 case TargetOpcode::G_IMPLICIT_DEF:
4141 return selectG_IMPLICIT_DEF(I);
4142 case TargetOpcode::G_INSERT:
4143 return selectG_INSERT(I);
4144 case TargetOpcode::G_INTRINSIC:
4145 case TargetOpcode::G_INTRINSIC_CONVERGENT:
4146 return selectG_INTRINSIC(I);
4147 case TargetOpcode::G_INTRINSIC_W_SIDE_EFFECTS:
4148 case TargetOpcode::G_INTRINSIC_CONVERGENT_W_SIDE_EFFECTS:
4149 return selectG_INTRINSIC_W_SIDE_EFFECTS(I);
4150 case TargetOpcode::G_ICMP:
4151 case TargetOpcode::G_FCMP:
4152 if (selectG_ICMP_or_FCMP(I))
4153 return true;
4154 return selectImpl(I, *CoverageInfo);
4155 case TargetOpcode::G_LOAD:
4156 case TargetOpcode::G_ZEXTLOAD:
4157 case TargetOpcode::G_SEXTLOAD:
4158 case TargetOpcode::G_STORE:
4159 case TargetOpcode::G_ATOMIC_CMPXCHG:
4160 case TargetOpcode::G_ATOMICRMW_XCHG:
4161 case TargetOpcode::G_ATOMICRMW_ADD:
4162 case TargetOpcode::G_ATOMICRMW_SUB:
4163 case TargetOpcode::G_ATOMICRMW_AND:
4164 case TargetOpcode::G_ATOMICRMW_OR:
4165 case TargetOpcode::G_ATOMICRMW_XOR:
4166 case TargetOpcode::G_ATOMICRMW_MIN:
4167 case TargetOpcode::G_ATOMICRMW_MAX:
4168 case TargetOpcode::G_ATOMICRMW_UMIN:
4169 case TargetOpcode::G_ATOMICRMW_UMAX:
4170 case TargetOpcode::G_ATOMICRMW_UINC_WRAP:
4171 case TargetOpcode::G_ATOMICRMW_UDEC_WRAP:
4172 case TargetOpcode::G_ATOMICRMW_FADD:
4173 case TargetOpcode::G_ATOMICRMW_FMIN:
4174 case TargetOpcode::G_ATOMICRMW_FMAX:
4175 return selectG_LOAD_STORE_ATOMICRMW(I);
4176 case TargetOpcode::G_SELECT:
4177 return selectG_SELECT(I);
4178 case TargetOpcode::G_TRUNC:
4179 return selectG_TRUNC(I);
4180 case TargetOpcode::G_SEXT:
4181 case TargetOpcode::G_ZEXT:
4182 case TargetOpcode::G_ANYEXT:
4183 case TargetOpcode::G_SEXT_INREG:
4184 // This is a workaround. For extension from type i1, `selectImpl()` uses
4185 // patterns from the TD file and generates an illegal VGPR-to-SGPR COPY, as
4186 // type i1 can only be held in an SGPR class.
4187 if (MRI->getType(I.getOperand(1).getReg()) != LLT::scalar(1) &&
4188 selectImpl(I, *CoverageInfo))
4189 return true;
4190 return selectG_SZA_EXT(I);
4191 case TargetOpcode::G_FPEXT:
4192 if (selectG_FPEXT(I))
4193 return true;
4194 return selectImpl(I, *CoverageInfo);
4195 case TargetOpcode::G_BRCOND:
4196 return selectG_BRCOND(I);
4197 case TargetOpcode::G_GLOBAL_VALUE:
4198 return selectG_GLOBAL_VALUE(I);
4199 case TargetOpcode::G_PTRMASK:
4200 return selectG_PTRMASK(I);
4201 case TargetOpcode::G_EXTRACT_VECTOR_ELT:
4202 return selectG_EXTRACT_VECTOR_ELT(I);
4203 case TargetOpcode::G_INSERT_VECTOR_ELT:
4204 return selectG_INSERT_VECTOR_ELT(I);
4205 case AMDGPU::G_AMDGPU_INTRIN_IMAGE_LOAD:
4206 case AMDGPU::G_AMDGPU_INTRIN_IMAGE_LOAD_D16:
4207 case AMDGPU::G_AMDGPU_INTRIN_IMAGE_LOAD_NORET:
4208 case AMDGPU::G_AMDGPU_INTRIN_IMAGE_STORE:
4209 case AMDGPU::G_AMDGPU_INTRIN_IMAGE_STORE_D16: {
4210 const AMDGPU::ImageDimIntrinsicInfo *Intr =
4211 AMDGPU::getImageDimIntrinsicInfo(AMDGPU::getIntrinsicID(I));
4212 assert(Intr && "not an image intrinsic with image pseudo");
4213 return selectImageIntrinsic(I, Intr);
4214 }
4215 case AMDGPU::G_AMDGPU_BVH_DUAL_INTERSECT_RAY:
4216 case AMDGPU::G_AMDGPU_BVH_INTERSECT_RAY:
4217 case AMDGPU::G_AMDGPU_BVH8_INTERSECT_RAY:
4218 return selectBVHIntersectRayIntrinsic(I);
4219 case AMDGPU::G_SBFX:
4220 case AMDGPU::G_UBFX:
4221 return selectG_SBFX_UBFX(I);
4222 case AMDGPU::G_SI_CALL:
4223 I.setDesc(TII.get(AMDGPU::SI_CALL));
4224 return true;
4225 case AMDGPU::G_AMDGPU_WAVE_ADDRESS:
4226 return selectWaveAddress(I);
4227 case AMDGPU::G_AMDGPU_WHOLE_WAVE_FUNC_RETURN: {
4228 I.setDesc(TII.get(AMDGPU::SI_WHOLE_WAVE_FUNC_RETURN));
4229 return true;
4230 }
4231 case AMDGPU::G_STACKRESTORE:
4232 return selectStackRestore(I);
4233 case AMDGPU::G_PHI:
4234 return selectPHI(I);
4235 case AMDGPU::G_AMDGPU_COPY_SCC_VCC:
4236 return selectCOPY_SCC_VCC(I);
4237 case AMDGPU::G_AMDGPU_COPY_VCC_SCC:
4238 return selectCOPY_VCC_SCC(I);
4239 case AMDGPU::G_AMDGPU_READANYLANE:
4240 return selectReadAnyLane(I);
4241 case TargetOpcode::G_CONSTANT:
4242 case TargetOpcode::G_FCONSTANT:
4243 default:
4244 return selectImpl(I, *CoverageInfo);
4245 }
4246 return false;
4247}
4248
4249InstructionSelector::ComplexRendererFns
4250AMDGPUInstructionSelector::selectVCSRC(MachineOperand &Root) const {
4251 return {{
4252 [=](MachineInstrBuilder &MIB) { MIB.add(Root); }
4253 }};
4254
4255}
4256
4257std::pair<Register, unsigned> AMDGPUInstructionSelector::selectVOP3ModsImpl(
4258 Register Src, bool IsCanonicalizing, bool AllowAbs, bool OpSel) const {
4259 unsigned Mods = 0;
4260 MachineInstr *MI = getDefIgnoringCopies(Src, *MRI);
4261
4262 if (MI->getOpcode() == AMDGPU::G_FNEG) {
4263 Src = MI->getOperand(1).getReg();
4264 Mods |= SISrcMods::NEG;
4265 MI = getDefIgnoringCopies(Src, *MRI);
4266 } else if (MI->getOpcode() == AMDGPU::G_FSUB && IsCanonicalizing) {
4267 // Fold fsub [+-]0 into fneg. This may not have been folded depending on the
4268 // denormal mode, but we're implicitly canonicalizing in a source operand.
4269 const ConstantFP *LHS =
4270 getConstantFPVRegVal(MI->getOperand(1).getReg(), *MRI);
4271 if (LHS && LHS->isZero()) {
4272 Mods |= SISrcMods::NEG;
4273 Src = MI->getOperand(2).getReg();
4274 }
4275 }
4276
4277 if (AllowAbs && MI->getOpcode() == AMDGPU::G_FABS) {
4278 Src = MI->getOperand(1).getReg();
4279 Mods |= SISrcMods::ABS;
4280 }
4281
4282 if (OpSel)
4283 Mods |= SISrcMods::OP_SEL_0;
4284
4285 return std::pair(Src, Mods);
4286}
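// For example, a source defined as %s = G_FNEG (G_FABS %x) folds to
// {%x, SISrcMods::NEG | SISrcMods::ABS} when AllowAbs is true, letting the
// selected VOP3 instruction encode both modifiers instead of separate
// fneg/fabs instructions.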
4287
4288Register AMDGPUInstructionSelector::copyToVGPRIfSrcFolded(
4289 Register Src, unsigned Mods, MachineOperand Root, MachineInstr *InsertPt,
4290 bool ForceVGPR) const {
4291 if ((Mods != 0 || ForceVGPR) &&
4292 RBI.getRegBank(Src, *MRI, TRI)->getID() != AMDGPU::VGPRRegBankID) {
4293
4294 // If we looked through copies to find source modifiers on an SGPR operand,
4295 // we now have an SGPR register source. To avoid potentially violating the
4296 // constant bus restriction, we need to insert a copy to a VGPR.
4297 Register VGPRSrc = MRI->cloneVirtualRegister(Root.getReg());
4298 BuildMI(*InsertPt->getParent(), InsertPt, InsertPt->getDebugLoc(),
4299 TII.get(AMDGPU::COPY), VGPRSrc)
4300 .addReg(Src);
4301 Src = VGPRSrc;
4302 }
4303
4304 return Src;
4305}
4306
4307///
4308/// This will select either an SGPR or VGPR operand and will save us from
4309/// having to write an extra tablegen pattern.
4310InstructionSelector::ComplexRendererFns
4311AMDGPUInstructionSelector::selectVSRC0(MachineOperand &Root) const {
4312 return {{
4313 [=](MachineInstrBuilder &MIB) { MIB.add(Root); }
4314 }};
4315}
4316
4317InstructionSelector::ComplexRendererFns
4318AMDGPUInstructionSelector::selectVOP3Mods0(MachineOperand &Root) const {
4319 Register Src;
4320 unsigned Mods;
4321 std::tie(Src, Mods) = selectVOP3ModsImpl(Root.getReg());
4322
4323 return {{
4324 [=](MachineInstrBuilder &MIB) {
4325 MIB.addReg(copyToVGPRIfSrcFolded(Src, Mods, Root, MIB));
4326 },
4327 [=](MachineInstrBuilder &MIB) { MIB.addImm(Mods); }, // src0_mods
4328 [=](MachineInstrBuilder &MIB) { MIB.addImm(0); }, // clamp
4329 [=](MachineInstrBuilder &MIB) { MIB.addImm(0); } // omod
4330 }};
4331}
4332
4333InstructionSelector::ComplexRendererFns
4334AMDGPUInstructionSelector::selectVOP3BMods0(MachineOperand &Root) const {
4335 Register Src;
4336 unsigned Mods;
4337 std::tie(Src, Mods) = selectVOP3ModsImpl(Root.getReg(),
4338 /*IsCanonicalizing=*/true,
4339 /*AllowAbs=*/false);
4340
4341 return {{
4342 [=](MachineInstrBuilder &MIB) {
4343 MIB.addReg(copyToVGPRIfSrcFolded(Src, Mods, Root, MIB));
4344 },
4345 [=](MachineInstrBuilder &MIB) { MIB.addImm(Mods); }, // src0_mods
4346 [=](MachineInstrBuilder &MIB) { MIB.addImm(0); }, // clamp
4347 [=](MachineInstrBuilder &MIB) { MIB.addImm(0); } // omod
4348 }};
4349}
4350
4351InstructionSelector::ComplexRendererFns
4352AMDGPUInstructionSelector::selectVOP3OMods(MachineOperand &Root) const {
4353 return {{
4354 [=](MachineInstrBuilder &MIB) { MIB.add(Root); },
4355 [=](MachineInstrBuilder &MIB) { MIB.addImm(0); }, // clamp
4356 [=](MachineInstrBuilder &MIB) { MIB.addImm(0); } // omod
4357 }};
4358}
4359
4360InstructionSelector::ComplexRendererFns
4361AMDGPUInstructionSelector::selectVOP3Mods(MachineOperand &Root) const {
4362 Register Src;
4363 unsigned Mods;
4364 std::tie(Src, Mods) = selectVOP3ModsImpl(Root.getReg());
4365
4366 return {{
4367 [=](MachineInstrBuilder &MIB) {
4368 MIB.addReg(copyToVGPRIfSrcFolded(Src, Mods, Root, MIB));
4369 },
4370 [=](MachineInstrBuilder &MIB) { MIB.addImm(Mods); } // src_mods
4371 }};
4372}
4373
4374InstructionSelector::ComplexRendererFns
4375AMDGPUInstructionSelector::selectVOP3ModsNonCanonicalizing(
4376 MachineOperand &Root) const {
4377 Register Src;
4378 unsigned Mods;
4379 std::tie(Src, Mods) =
4380 selectVOP3ModsImpl(Root.getReg(), /*IsCanonicalizing=*/false);
4381
4382 return {{
4383 [=](MachineInstrBuilder &MIB) {
4384 MIB.addReg(copyToVGPRIfSrcFolded(Src, Mods, Root, MIB));
4385 },
4386 [=](MachineInstrBuilder &MIB) { MIB.addImm(Mods); } // src_mods
4387 }};
4388}
4389
4390InstructionSelector::ComplexRendererFns
4391AMDGPUInstructionSelector::selectVOP3BMods(MachineOperand &Root) const {
4392 Register Src;
4393 unsigned Mods;
4394 std::tie(Src, Mods) =
4395 selectVOP3ModsImpl(Root.getReg(), /*IsCanonicalizing=*/true,
4396 /*AllowAbs=*/false);
4397
4398 return {{
4399 [=](MachineInstrBuilder &MIB) {
4400 MIB.addReg(copyToVGPRIfSrcFolded(Src, Mods, Root, MIB));
4401 },
4402 [=](MachineInstrBuilder &MIB) { MIB.addImm(Mods); } // src_mods
4403 }};
4404}
4405
4406InstructionSelector::ComplexRendererFns
4407AMDGPUInstructionSelector::selectVOP3NoMods(MachineOperand &Root) const {
4408 Register Reg = Root.getReg();
4409 const MachineInstr *Def = getDefIgnoringCopies(Reg, *MRI);
4410 if (Def->getOpcode() == AMDGPU::G_FNEG || Def->getOpcode() == AMDGPU::G_FABS)
4411 return {};
4412 return {{
4413 [=](MachineInstrBuilder &MIB) { MIB.addReg(Reg); },
4414 }};
4415}
4416
4417enum class SrcStatus {
4422 // This means current op = [op_upper, op_lower] and src = -op_lower.
4425 // This means current op = [op_upper, op_lower] and src = [op_upper,
4426 // -op_lower].
4434};
4435/// Test if the MI is truncating to half, such as `%reg0:n = G_TRUNC %reg1:2n`
4436static bool isTruncHalf(const MachineInstr *MI,
4437 const MachineRegisterInfo &MRI) {
4438 if (MI->getOpcode() != AMDGPU::G_TRUNC)
4439 return false;
4440
4441 unsigned DstSize = MRI.getType(MI->getOperand(0).getReg()).getSizeInBits();
4442 unsigned SrcSize = MRI.getType(MI->getOperand(1).getReg()).getSizeInBits();
4443 return DstSize * 2 == SrcSize;
4444}
4445
4446 /// Test if the MI is a logical shift right by half the bit width,
4447 /// such as `%reg0:2n = G_LSHR %reg1:2n, CONST(n)`
4448static bool isLshrHalf(const MachineInstr *MI, const MachineRegisterInfo &MRI) {
4449 if (MI->getOpcode() != AMDGPU::G_LSHR)
4450 return false;
4451
4452 Register ShiftSrc;
4453 std::optional<ValueAndVReg> ShiftAmt;
4454 if (mi_match(MI->getOperand(0).getReg(), MRI,
4455 m_GLShr(m_Reg(ShiftSrc), m_GCst(ShiftAmt)))) {
4456 unsigned SrcSize = MRI.getType(MI->getOperand(1).getReg()).getSizeInBits();
4457 unsigned Shift = ShiftAmt->Value.getZExtValue();
4458 return Shift * 2 == SrcSize;
4459 }
4460 return false;
4461}
4462
4463 /// Test if the MI is a shift left by half the bit width,
4464 /// such as `%reg0:2n = G_SHL %reg1:2n, CONST(n)`
4465static bool isShlHalf(const MachineInstr *MI, const MachineRegisterInfo &MRI) {
4466 if (MI->getOpcode() != AMDGPU::G_SHL)
4467 return false;
4468
4469 Register ShiftSrc;
4470 std::optional<ValueAndVReg> ShiftAmt;
4471 if (mi_match(MI->getOperand(0).getReg(), MRI,
4472 m_GShl(m_Reg(ShiftSrc), m_GCst(ShiftAmt)))) {
4473 unsigned SrcSize = MRI.getType(MI->getOperand(1).getReg()).getSizeInBits();
4474 unsigned Shift = ShiftAmt->Value.getZExtValue();
4475 return Shift * 2 == SrcSize;
4476 }
4477 return false;
4478}
4479
4480 /// Test if the MI is `%reg0:n, %reg1:n = G_UNMERGE_VALUES %reg2:2n`.
4481static bool isUnmergeHalf(const MachineInstr *MI,
4482 const MachineRegisterInfo &MRI) {
4483 if (MI->getOpcode() != AMDGPU::G_UNMERGE_VALUES)
4484 return false;
4485 return MI->getNumOperands() == 3 && MI->getOperand(0).isDef() &&
4486 MI->getOperand(1).isDef() && !MI->getOperand(2).isDef();
4487}
4488
4490
4491static TypeClass isVectorOfTwoOrScalar(Register Reg,
4492 const MachineRegisterInfo &MRI) {
4493 LLT OpTy = MRI.getType(Reg);
4494 if (OpTy.isScalar())
4495 return TypeClass::SCALAR;
4496 if (OpTy.isVector() && OpTy.getNumElements() == 2)
4499}
4500
4501static SrcStatus getNegStatus(Register Reg, SrcStatus S,
4502 const MachineRegisterInfo &MRI) {
4503 TypeClass NegType = isVectorOfTwoOrScalar(Reg, MRI);
4504 if (NegType != TypeClass::VECTOR_OF_TWO && NegType != TypeClass::SCALAR)
4505 return SrcStatus::INVALID;
4506
4507 switch (S) {
4508 case SrcStatus::IS_SAME:
4509 if (NegType == TypeClass::VECTOR_OF_TWO) {
4510 // Vector of 2:
4511 // [SrcHi, SrcLo] = [CurrHi, CurrLo]
4512 // [CurrHi, CurrLo] = neg [OpHi, OpLo](2 x Type)
4513 // [CurrHi, CurrLo] = [-OpHi, -OpLo](2 x Type)
4514 // [SrcHi, SrcLo] = [-OpHi, -OpLo]
4516 }
4517 if (NegType == TypeClass::SCALAR) {
4518 // Scalar:
4519 // [SrcHi, SrcLo] = [CurrHi, CurrLo]
4520 // [CurrHi, CurrLo] = neg [OpHi, OpLo](Type)
4521 // [CurrHi, CurrLo] = [-OpHi, OpLo](Type)
4522 // [SrcHi, SrcLo] = [-OpHi, OpLo]
4523 return SrcStatus::IS_HI_NEG;
4524 }
4525 break;
4527 if (NegType == TypeClass::VECTOR_OF_TWO) {
4528 // Vector of 2:
4529 // [SrcHi, SrcLo] = [-CurrHi, CurrLo]
4530 // [CurrHi, CurrLo] = neg [OpHi, OpLo](2 x Type)
4531 // [CurrHi, CurrLo] = [-OpHi, -OpLo](2 x Type)
4532 // [SrcHi, SrcLo] = [-(-OpHi), -OpLo] = [OpHi, -OpLo]
4533 return SrcStatus::IS_LO_NEG;
4534 }
4535 if (NegType == TypeClass::SCALAR) {
4536 // Scalar:
4537 // [SrcHi, SrcLo] = [-CurrHi, CurrLo]
4538 // [CurrHi, CurrLo] = neg [OpHi, OpLo](Type)
4539 // [CurrHi, CurrLo] = [-OpHi, OpLo](Type)
4540 // [SrcHi, SrcLo] = [-(-OpHi), OpLo] = [OpHi, OpLo]
4541 return SrcStatus::IS_SAME;
4542 }
4543 break;
4545 if (NegType == TypeClass::VECTOR_OF_TWO) {
4546 // Vector of 2:
4547 // [SrcHi, SrcLo] = [CurrHi, -CurrLo]
4548 // [CurrHi, CurrLo] = fneg [OpHi, OpLo](2 x Type)
4549 // [CurrHi, CurrLo] = [-OpHi, -OpLo](2 x Type)
4550 // [SrcHi, SrcLo] = [-OpHi, -(-OpLo)] = [-OpHi, OpLo]
4551 return SrcStatus::IS_HI_NEG;
4552 }
4553 if (NegType == TypeClass::SCALAR) {
4554 // Scalar:
4555 // [SrcHi, SrcLo] = [CurrHi, -CurrLo]
4556 // [CurrHi, CurrLo] = fneg [OpHi, OpLo](Type)
4557 // [CurrHi, CurrLo] = [-OpHi, OpLo](Type)
4558 // [SrcHi, SrcLo] = [-OpHi, -OpLo]
4560 }
4561 break;
4563 if (NegType == TypeClass::VECTOR_OF_TWO) {
4564 // Vector of 2:
4565 // [SrcHi, SrcLo] = [-CurrHi, -CurrLo]
4566 // [CurrHi, CurrLo] = fneg [OpHi, OpLo](2 x Type)
4567 // [CurrHi, CurrLo] = [-OpHi, -OpLo](2 x Type)
4568 // [SrcHi, SrcLo] = [OpHi, OpLo]
4569 return SrcStatus::IS_SAME;
4570 }
4571 if (NegType == TypeClass::SCALAR) {
4572 // Scalar:
4573 // [SrcHi, SrcLo] = [-CurrHi, -CurrLo]
4574 // [CurrHi, CurrLo] = fneg [OpHi, OpLo](Type)
4575 // [CurrHi, CurrLo] = [-OpHi, OpLo](Type)
4576 // [SrcHi, SrcLo] = [OpHi, -OpLo]
4577 return SrcStatus::IS_LO_NEG;
4578 }
4579 break;
4581 // Vector of 2:
4582 // Src = CurrUpper
4583 // Curr = [CurrUpper, CurrLower]
4584 // [CurrUpper, CurrLower] = fneg [OpUpper, OpLower](2 x Type)
4585 // [CurrUpper, CurrLower] = [-OpUpper, -OpLower](2 x Type)
4586 // Src = -OpUpper
4587 //
4588 // Scalar:
4589 // Src = CurrUpper
4590 // Curr = [CurrUpper, CurrLower]
4591 // [CurrUpper, CurrLower] = fneg [OpUpper, OpLower](Type)
4592 // [CurrUpper, CurrLower] = [-OpUpper, OpLower](Type)
4593 // Src = -OpUpper
4596 if (NegType == TypeClass::VECTOR_OF_TWO) {
4597 // Vector of 2:
4598 // Src = CurrLower
4599 // Curr = [CurrUpper, CurrLower]
4600 // [CurrUpper, CurrLower] = fneg [OpUpper, OpLower](2 x Type)
4601 // [CurrUpper, CurrLower] = [-OpUpper, -OpLower](2 x Type)
4602 // Src = -OpLower
4604 }
4605 if (NegType == TypeClass::SCALAR) {
4606 // Scalar:
4607 // Src = CurrLower
4608 // Curr = [CurrUpper, CurrLower]
4609 // [CurrUpper, CurrLower] = fneg [OpUpper, OpLower](Type)
4610 // [CurrUpper, CurrLower] = [-OpUpper, OpLower](Type)
4611 // Src = OpLower
4613 }
4614 break;
4616 // Vector of 2:
4617 // Src = -CurrUpper
4618 // Curr = [CurrUpper, CurrLower]
4619 // [CurrUpper, CurrLower] = fneg [OpUpper, OpLower](2 x Type)
4620 // [CurrUpper, CurrLower] = [-OpUpper, -OpLower](2 x Type)
4621 // Src = -(-OpUpper) = OpUpper
4622 //
4623 // Scalar:
4624 // Src = -CurrUpper
4625 // Curr = [CurrUpper, CurrLower]
4626 // [CurrUpper, CurrLower] = fneg [OpUpper, OpLower](Type)
4627 // [CurrUpper, CurrLower] = [-OpUpper, OpLower](Type)
4628 // Src = -(-OpUpper) = OpUpper
4631 if (NegType == TypeClass::VECTOR_OF_TWO) {
4632 // Vector of 2:
4633 // Src = -CurrLower
4634 // Curr = [CurrUpper, CurrLower]
4635 // [CurrUpper, CurrLower] = fneg [OpUpper, OpLower](2 x Type)
4636 // [CurrUpper, CurrLower] = [-OpUpper, -OpLower](2 x Type)
4637 // Src = -(-OpLower) = OpLower
4639 }
4640 if (NegType == TypeClass::SCALAR) {
4641 // Scalar:
4642 // Src = -CurrLower
4643 // Curr = [CurrUpper, CurrLower]
4644 // [CurrUpper, CurrLower] = fneg [OpUpper, OpLower](Type)
4645 // [CurrUpper, CurrLower] = [-OpUpper, OpLower](Type)
4646 // Src = -OpLower
4648 }
4649 break;
4650 default:
4651 break;
4652 }
4653 llvm_unreachable("unexpected SrcStatus & NegType combination");
4654}
4655
4656static std::optional<std::pair<Register, SrcStatus>>
4657calcNextStatus(std::pair<Register, SrcStatus> Curr,
4658 const MachineRegisterInfo &MRI) {
4659 const MachineInstr *MI = MRI.getVRegDef(Curr.first);
4660
4661 unsigned Opc = MI->getOpcode();
4662
4663 // Handle general Opc cases.
4664 switch (Opc) {
4665 case AMDGPU::G_BITCAST:
4666 return std::optional<std::pair<Register, SrcStatus>>(
4667 {MI->getOperand(1).getReg(), Curr.second});
4668 case AMDGPU::COPY:
4669 if (MI->getOperand(1).getReg().isPhysical())
4670 return std::nullopt;
4671 return std::optional<std::pair<Register, SrcStatus>>(
4672 {MI->getOperand(1).getReg(), Curr.second});
4673 case AMDGPU::G_FNEG: {
4674 SrcStatus Stat = getNegStatus(Curr.first, Curr.second, MRI);
4675 if (Stat == SrcStatus::INVALID)
4676 return std::nullopt;
4677 return std::optional<std::pair<Register, SrcStatus>>(
4678 {MI->getOperand(1).getReg(), Stat});
4679 }
4680 default:
4681 break;
4682 }
4683
4684 // Calc next Stat from current Stat.
4685 switch (Curr.second) {
4686 case SrcStatus::IS_SAME:
4687 if (isTruncHalf(MI, MRI))
4688 return std::optional<std::pair<Register, SrcStatus>>(
4689 {MI->getOperand(1).getReg(), SrcStatus::IS_LOWER_HALF});
4690 else if (isUnmergeHalf(MI, MRI)) {
4691 if (Curr.first == MI->getOperand(0).getReg())
4692 return std::optional<std::pair<Register, SrcStatus>>(
4693 {MI->getOperand(2).getReg(), SrcStatus::IS_LOWER_HALF});
4694 return std::optional<std::pair<Register, SrcStatus>>(
4695 {MI->getOperand(2).getReg(), SrcStatus::IS_UPPER_HALF});
4696 }
4697 break;
4699 if (isTruncHalf(MI, MRI)) {
4700 // [SrcHi, SrcLo] = [-CurrHi, CurrLo]
4701 // [CurrHi, CurrLo] = trunc [OpUpper, OpLower] = OpLower
4702 // = [OpLowerHi, OpLowerLo]
4703 // Src = [SrcHi, SrcLo] = [-CurrHi, CurrLo]
4704 // = [-OpLowerHi, OpLowerLo]
4705 // = -OpLower
4706 return std::optional<std::pair<Register, SrcStatus>>(
4707 {MI->getOperand(1).getReg(), SrcStatus::IS_LOWER_HALF_NEG});
4708 }
4709 if (isUnmergeHalf(MI, MRI)) {
4710 if (Curr.first == MI->getOperand(0).getReg())
4711 return std::optional<std::pair<Register, SrcStatus>>(
4712 {MI->getOperand(2).getReg(), SrcStatus::IS_LOWER_HALF_NEG});
4713 return std::optional<std::pair<Register, SrcStatus>>(
4714 {MI->getOperand(2).getReg(), SrcStatus::IS_UPPER_HALF_NEG});
4715 }
4716 break;
4718 if (isShlHalf(MI, MRI))
4719 return std::optional<std::pair<Register, SrcStatus>>(
4720 {MI->getOperand(1).getReg(), SrcStatus::IS_LOWER_HALF});
4721 break;
4723 if (isLshrHalf(MI, MRI))
4724 return std::optional<std::pair<Register, SrcStatus>>(
4725 {MI->getOperand(1).getReg(), SrcStatus::IS_UPPER_HALF});
4726 break;
4728 if (isShlHalf(MI, MRI))
4729 return std::optional<std::pair<Register, SrcStatus>>(
4730 {MI->getOperand(1).getReg(), SrcStatus::IS_LOWER_HALF_NEG});
4731 break;
4733 if (isLshrHalf(MI, MRI))
4734 return std::optional<std::pair<Register, SrcStatus>>(
4735 {MI->getOperand(1).getReg(), SrcStatus::IS_UPPER_HALF_NEG});
4736 break;
4737 default:
4738 break;
4739 }
4740 return std::nullopt;
4741}
4742
4743 /// This is used to control which source statuses the current MI supports. For
4744 /// example, a non-floating-point intrinsic such as @llvm.amdgcn.sdot2 does not
4745 /// support the NEG bit on VOP3P.
4746 /// The class can be further extended to recognize support for the SEL, NEG, and
4747 /// ABS bits for different MIs on different architectures.
4749private:
4750 bool HasNeg = false;
4751 // Assume all VOP3P complex patterns have opsel.
4752 bool HasOpsel = true;
4753
4754public:
4756 const MachineInstr *MI = MRI.getVRegDef(Reg);
4757 unsigned Opc = MI->getOpcode();
4758
4759 if (Opc < TargetOpcode::GENERIC_OP_END) {
4760 // Keep same for generic op.
4761 HasNeg = true;
4762 } else if (Opc == TargetOpcode::G_INTRINSIC) {
4763 Intrinsic::ID IntrinsicID = cast<GIntrinsic>(*MI).getIntrinsicID();
4764 // Only floating-point intrinsics have the neg & neg_hi bits.
4765 if (IntrinsicID == Intrinsic::amdgcn_fdot2)
4766 HasNeg = true;
4767 }
4768 }
4769 bool checkOptions(SrcStatus Stat) const {
4770 if (!HasNeg &&
4771 (Stat >= SrcStatus::NEG_START && Stat <= SrcStatus::NEG_END)) {
4772 return false;
4773 }
4774 if (!HasOpsel &&
4775 (Stat >= SrcStatus::HALF_START && Stat <= SrcStatus::HALF_END)) {
4776 return false;
4777 }
4778 return true;
4779 }
4780};
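// For example, for a use like @llvm.amdgcn.sdot2 (an integer dot product),
// HasNeg stays false, so checkOptions() rejects any status in the
// NEG_START..NEG_END range and only same/half-select sources are folded.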
4781
4784 int MaxDepth = 3) {
4785 int Depth = 0;
4786 auto Curr = calcNextStatus({Reg, SrcStatus::IS_SAME}, MRI);
4788
4789 while (Depth <= MaxDepth && Curr.has_value()) {
4790 Depth++;
4791 if (SO.checkOptions(Curr.value().second))
4792 Statlist.push_back(Curr.value());
4793 Curr = calcNextStatus(Curr.value(), MRI);
4794 }
4795
4796 return Statlist;
4797}
4798
4799static std::pair<Register, SrcStatus>
4801 int MaxDepth = 3) {
4802 int Depth = 0;
4803 std::pair<Register, SrcStatus> LastSameOrNeg = {Reg, SrcStatus::IS_SAME};
4804 auto Curr = calcNextStatus(LastSameOrNeg, MRI);
4805
4806 while (Depth <= MaxDepth && Curr.has_value()) {
4807 Depth++;
4808 SrcStatus Stat = Curr.value().second;
4809 if (SO.checkOptions(Stat)) {
4810 if (Stat == SrcStatus::IS_SAME || Stat == SrcStatus::IS_HI_NEG ||
4811 Stat == SrcStatus::IS_LO_NEG || Stat == SrcStatus::IS_BOTH_NEG)
4812 LastSameOrNeg = Curr.value();
4813 }
4814 Curr = calcNextStatus(Curr.value(), MRI);
4815 }
4816
4817 return LastSameOrNeg;
4818}
4819
4820static bool isSameBitWidth(Register Reg1, Register Reg2,
4821 const MachineRegisterInfo &MRI) {
4822 unsigned Width1 = MRI.getType(Reg1).getSizeInBits();
4823 unsigned Width2 = MRI.getType(Reg2).getSizeInBits();
4824 return Width1 == Width2;
4825}
4826
4827static unsigned updateMods(SrcStatus HiStat, SrcStatus LoStat, unsigned Mods) {
4828 // SrcStatus::IS_LOWER_HALF contributes no bits, so Mods remains 0 for it.
4829 if (HiStat == SrcStatus::IS_UPPER_HALF_NEG) {
4830 Mods ^= SISrcMods::NEG_HI;
4831 Mods |= SISrcMods::OP_SEL_1;
4832 } else if (HiStat == SrcStatus::IS_UPPER_HALF)
4833 Mods |= SISrcMods::OP_SEL_1;
4834 else if (HiStat == SrcStatus::IS_LOWER_HALF_NEG)
4835 Mods ^= SISrcMods::NEG_HI;
4836 else if (HiStat == SrcStatus::IS_HI_NEG)
4837 Mods ^= SISrcMods::NEG_HI;
4838
4839 if (LoStat == SrcStatus::IS_UPPER_HALF_NEG) {
4840 Mods ^= SISrcMods::NEG;
4841 Mods |= SISrcMods::OP_SEL_0;
4842 } else if (LoStat == SrcStatus::IS_UPPER_HALF)
4843 Mods |= SISrcMods::OP_SEL_0;
4844 else if (LoStat == SrcStatus::IS_LOWER_HALF_NEG)
4845 Mods |= SISrcMods::NEG;
4846 else if (LoStat == SrcStatus::IS_HI_NEG)
4847 Mods ^= SISrcMods::NEG;
4848
4849 return Mods;
4850}
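// For example, if the hi element is taken from the upper half of the source
// (HiStat == IS_UPPER_HALF) and the lo element from its lower half
// (LoStat == IS_LOWER_HALF), only SISrcMods::OP_SEL_1 is added, i.e. the
// default packed layout with no negation.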
4851
4852static bool isValidToPack(SrcStatus HiStat, SrcStatus LoStat, Register NewReg,
4853 Register RootReg, const SIInstrInfo &TII,
4854 const MachineRegisterInfo &MRI) {
4855 auto IsHalfState = [](SrcStatus S) {
4858 };
4859 return isSameBitWidth(NewReg, RootReg, MRI) && IsHalfState(LoStat) &&
4860 IsHalfState(HiStat);
4861}
4862
4863std::pair<Register, unsigned> AMDGPUInstructionSelector::selectVOP3PModsImpl(
4864 Register RootReg, const MachineRegisterInfo &MRI, bool IsDOT) const {
4865 unsigned Mods = 0;
4866 // No modification if the Root type is not of the form <2 x Type>.
4867 if (isVectorOfTwoOrScalar(RootReg, MRI) != TypeClass::VECTOR_OF_TWO) {
4868 Mods |= SISrcMods::OP_SEL_1;
4869 return {RootReg, Mods};
4870 }
4871
4872 SearchOptions SO(RootReg, MRI);
4873
4874 std::pair<Register, SrcStatus> Stat = getLastSameOrNeg(RootReg, MRI, SO);
4875
4876 if (Stat.second == SrcStatus::IS_BOTH_NEG)
4877 Mods ^= (SISrcMods::NEG | SISrcMods::NEG_HI);
4878 else if (Stat.second == SrcStatus::IS_HI_NEG)
4879 Mods ^= SISrcMods::NEG_HI;
4880 else if (Stat.second == SrcStatus::IS_LO_NEG)
4881 Mods ^= SISrcMods::NEG;
4882
4883 MachineInstr *MI = MRI.getVRegDef(Stat.first);
4884
4885 if (MI->getOpcode() != AMDGPU::G_BUILD_VECTOR || MI->getNumOperands() != 3 ||
4886 (IsDOT && Subtarget->hasDOTOpSelHazard())) {
4887 Mods |= SISrcMods::OP_SEL_1;
4888 return {Stat.first, Mods};
4889 }
4890
4892 getSrcStats(MI->getOperand(2).getReg(), MRI, SO);
4893
4894 if (StatlistHi.empty()) {
4895 Mods |= SISrcMods::OP_SEL_1;
4896 return {Stat.first, Mods};
4897 }
4898
4900 getSrcStats(MI->getOperand(1).getReg(), MRI, SO);
4901
4902 if (StatlistLo.empty()) {
4903 Mods |= SISrcMods::OP_SEL_1;
4904 return {Stat.first, Mods};
4905 }
4906
4907 for (int I = StatlistHi.size() - 1; I >= 0; I--) {
4908 for (int J = StatlistLo.size() - 1; J >= 0; J--) {
4909 if (StatlistHi[I].first == StatlistLo[J].first &&
4910 isValidToPack(StatlistHi[I].second, StatlistLo[J].second,
4911 StatlistHi[I].first, RootReg, TII, MRI))
4912 return {StatlistHi[I].first,
4913 updateMods(StatlistHi[I].second, StatlistLo[J].second, Mods)};
4914 }
4915 }
4916 // Packed instructions do not have abs modifiers.
4917 Mods |= SISrcMods::OP_SEL_1;
4918
4919 return {Stat.first, Mods};
4920}
4921
4923
4924static bool checkRB(Register Reg, unsigned int RBNo,
4925 const AMDGPURegisterBankInfo &RBI,
4926 const MachineRegisterInfo &MRI,
4927 const TargetRegisterInfo &TRI) {
4928 const RegisterBank *RB = RBI.getRegBank(Reg, MRI, TRI);
4929 return RB->getID() == RBNo;
4930}
4931
4932// This function is used to get the correct register bank for the returned reg.
4933// Assume:
4934// 1. VOP3P is always legal for VGPR.
4935// 2. RootOp's regbank is legal.
4936// Thus
4937// 1. If RootOp is SGPR, then NewOp can be SGPR or VGPR.
4938// 2. If RootOp is VGPR, then NewOp must be VGPR.
4939static Register getLegalRegBank(Register NewReg, Register RootReg,
4940 const AMDGPURegisterBankInfo &RBI,
4941 const MachineRegisterInfo &MRI,
4942 const TargetRegisterInfo &TRI,
4943 const SIInstrInfo &TII) {
4944 // RootOp can only be VGPR or SGPR (some hand-written cases such as
4945 // inst-select-ashr.v2s16.mir::ashr_v2s16_vs).
4946 if (checkRB(RootReg, AMDGPU::SGPRRegBankID, RBI, MRI, TRI) ||
4947 checkRB(NewReg, AMDGPU::VGPRRegBankID, RBI, MRI, TRI))
4948 return NewReg;
4949
4950 MachineInstr *MI = MRI.getVRegDef(RootReg);
4951 if (MI->getOpcode() == AMDGPU::COPY && NewReg == MI->getOperand(1).getReg()) {
4952 // RootOp is VGPR, NewOp is not VGPR, but RootOp = COPY NewOp.
4953 return RootReg;
4954 }
4955
4956 MachineBasicBlock *BB = MI->getParent();
4957 Register DstReg = MRI.cloneVirtualRegister(RootReg);
4958
4959 MachineInstrBuilder MIB =
4960 BuildMI(*BB, MI, MI->getDebugLoc(), TII.get(AMDGPU::COPY), DstReg)
4961 .addReg(NewReg);
4962
4963 // Only accept VGPR.
4964 return MIB->getOperand(0).getReg();
4965}
4966
4968AMDGPUInstructionSelector::selectVOP3PRetHelper(MachineOperand &Root,
4969 bool IsDOT) const {
4970 MachineRegisterInfo &MRI = Root.getParent()->getMF()->getRegInfo();
4971 Register Reg;
4972 unsigned Mods;
4973 std::tie(Reg, Mods) = selectVOP3PModsImpl(Root.getReg(), MRI, IsDOT);
4974
4975 Reg = getLegalRegBank(Reg, Root.getReg(), RBI, MRI, TRI, TII);
4976 return {{
4977 [=](MachineInstrBuilder &MIB) { MIB.addReg(Reg); },
4978 [=](MachineInstrBuilder &MIB) { MIB.addImm(Mods); } // src_mods
4979 }};
4980}
4981
4983AMDGPUInstructionSelector::selectVOP3PMods(MachineOperand &Root) const {
4984
4985 return selectVOP3PRetHelper(Root);
4986}
4987
4989AMDGPUInstructionSelector::selectVOP3PModsDOT(MachineOperand &Root) const {
4990
4991 return selectVOP3PRetHelper(Root, true);
4992}
4993
4995AMDGPUInstructionSelector::selectWMMAOpSelVOP3PMods(
4996 MachineOperand &Root) const {
4997 assert((Root.isImm() && (Root.getImm() == -1 || Root.getImm() == 0)) &&
4998 "expected i1 value");
4999 unsigned Mods = SISrcMods::OP_SEL_1;
5000 if (Root.getImm() != 0)
5001 Mods |= SISrcMods::OP_SEL_0;
5002
5003 return {{
5004 [=](MachineInstrBuilder &MIB) { MIB.addImm(Mods); } // src_mods
5005 }};
5006}
5007
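// Build a REG_SEQUENCE from the registers in \p Elts and return the result
// register. The destination register class is picked from the element count
// (2, 4 or 8 x 32 bits).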
5008static Register buildRegSequence(SmallVectorImpl<Register> &Elts,
5009 MachineInstr *InsertPt,
5010 MachineRegisterInfo &MRI) {
5011 const TargetRegisterClass *DstRegClass;
5012 switch (Elts.size()) {
5013 case 8:
5014 DstRegClass = &AMDGPU::VReg_256RegClass;
5015 break;
5016 case 4:
5017 DstRegClass = &AMDGPU::VReg_128RegClass;
5018 break;
5019 case 2:
5020 DstRegClass = &AMDGPU::VReg_64RegClass;
5021 break;
5022 default:
5023 llvm_unreachable("unhandled Reg sequence size");
5024 }
5025
5026 MachineIRBuilder B(*InsertPt);
5027 auto MIB = B.buildInstr(AMDGPU::REG_SEQUENCE)
5028 .addDef(MRI.createVirtualRegister(DstRegClass));
5029 for (unsigned i = 0; i < Elts.size(); ++i) {
5030 MIB.addReg(Elts[i]);
5031 MIB.addImm(SIRegisterInfo::getSubRegFromChannel(i));
5032 }
5033 return MIB->getOperand(0).getReg();
5034}
5035
5036static void selectWMMAModsNegAbs(unsigned ModOpcode, unsigned &Mods,
5037 SmallVectorImpl<Register> &Elts, Register &Src,
5038 MachineInstr *InsertPt,
5039 MachineRegisterInfo &MRI) {
5040 if (ModOpcode == TargetOpcode::G_FNEG) {
5041 Mods |= SISrcMods::NEG;
5042 // Check if all elements also have abs modifier
5043 SmallVector<Register, 8> NegAbsElts;
5044 for (auto El : Elts) {
5045 Register FabsSrc;
5046 if (!mi_match(El, MRI, m_GFabs(m_Reg(FabsSrc))))
5047 break;
5048 NegAbsElts.push_back(FabsSrc);
5049 }
5050 if (Elts.size() != NegAbsElts.size()) {
5051 // Neg
5052 Src = buildRegSequence(Elts, InsertPt, MRI);
5053 } else {
5054 // Neg and Abs
5055 Mods |= SISrcMods::NEG_HI;
5056 Src = buildRegSequence(NegAbsElts, InsertPt, MRI);
5057 }
5058 } else {
5059 assert(ModOpcode == TargetOpcode::G_FABS);
5060 // Abs
5061 Mods |= SISrcMods::NEG_HI;
5062 Src = buildRegSequence(Elts, InsertPt, MRI);
5063 }
5064}
5065
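// Match WMMA f32 sources built from a G_BUILD_VECTOR whose elements all carry
// the same fneg or fabs modifier, and fold that modifier into the NEG /
// NEG_HI source modifier bits.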
5067AMDGPUInstructionSelector::selectWMMAModsF32NegAbs(MachineOperand &Root) const {
5068 Register Src = Root.getReg();
5069 unsigned Mods = SISrcMods::OP_SEL_1;
5070 SmallVector<Register, 8> EltsF32;
5071
5072 if (GBuildVector *BV = dyn_cast<GBuildVector>(MRI->getVRegDef(Src))) {
5073 assert(BV->getNumSources() > 0);
5074 // Based on first element decide which mod we match, neg or abs
5075 MachineInstr *ElF32 = MRI->getVRegDef(BV->getSourceReg(0));
5076 unsigned ModOpcode = (ElF32->getOpcode() == AMDGPU::G_FNEG)
5077 ? AMDGPU::G_FNEG
5078 : AMDGPU::G_FABS;
5079 for (unsigned i = 0; i < BV->getNumSources(); ++i) {
5080 ElF32 = MRI->getVRegDef(BV->getSourceReg(i));
5081 if (ElF32->getOpcode() != ModOpcode)
5082 break;
5083 EltsF32.push_back(ElF32->getOperand(1).getReg());
5084 }
5085
5086 // All elements had ModOpcode modifier
5087 if (BV->getNumSources() == EltsF32.size()) {
5088 selectWMMAModsNegAbs(ModOpcode, Mods, EltsF32, Src, Root.getParent(),
5089 *MRI);
5090 }
5091 }
5092
5093 return {{[=](MachineInstrBuilder &MIB) { MIB.addReg(Src); },
5094 [=](MachineInstrBuilder &MIB) { MIB.addImm(Mods); }}};
5095}
5096
5098AMDGPUInstructionSelector::selectWMMAModsF16Neg(MachineOperand &Root) const {
5099 Register Src = Root.getReg();
5100 unsigned Mods = SISrcMods::OP_SEL_1;
5101 SmallVector<Register, 8> EltsV2F16;
5102
5103 if (GConcatVectors *CV = dyn_cast<GConcatVectors>(MRI->getVRegDef(Src))) {
5104 for (unsigned i = 0; i < CV->getNumSources(); ++i) {
5105 Register FNegSrc;
5106 if (!mi_match(CV->getSourceReg(i), *MRI, m_GFNeg(m_Reg(FNegSrc))))
5107 break;
5108 EltsV2F16.push_back(FNegSrc);
5109 }
5110
5111 // All elements had ModOpcode modifier
5112 if (CV->getNumSources() == EltsV2F16.size()) {
5113 Mods |= SISrcMods::NEG;
5114 Mods |= SISrcMods::NEG_HI;
5115 Src = buildRegSequence(EltsV2F16, Root.getParent(), *MRI);
5116 }
5117 }
5118
5119 return {{[=](MachineInstrBuilder &MIB) { MIB.addReg(Src); },
5120 [=](MachineInstrBuilder &MIB) { MIB.addImm(Mods); }}};
5121}
5122
5124AMDGPUInstructionSelector::selectWMMAModsF16NegAbs(MachineOperand &Root) const {
5125 Register Src = Root.getReg();
5126 unsigned Mods = SISrcMods::OP_SEL_1;
5127 SmallVector<Register, 8> EltsV2F16;
5128
5129 if (GConcatVectors *CV = dyn_cast<GConcatVectors>(MRI->getVRegDef(Src))) {
5130 assert(CV->getNumSources() > 0);
5131 MachineInstr *ElV2F16 = MRI->getVRegDef(CV->getSourceReg(0));
5132 // Based on first element decide which mod we match, neg or abs
5133 unsigned ModOpcode = (ElV2F16->getOpcode() == AMDGPU::G_FNEG)
5134 ? AMDGPU::G_FNEG
5135 : AMDGPU::G_FABS;
5136
5137 for (unsigned i = 0; i < CV->getNumSources(); ++i) {
5138 ElV2F16 = MRI->getVRegDef(CV->getSourceReg(i));
5139 if (ElV2F16->getOpcode() != ModOpcode)
5140 break;
5141 EltsV2F16.push_back(ElV2F16->getOperand(1).getReg());
5142 }
5143
5144 // All elements had ModOpcode modifier
5145 if (CV->getNumSources() == EltsV2F16.size()) {
5146 MachineIRBuilder B(*Root.getParent());
5147 selectWMMAModsNegAbs(ModOpcode, Mods, EltsV2F16, Src, Root.getParent(),
5148 *MRI);
5149 }
5150 }
5151
5152 return {{[=](MachineInstrBuilder &MIB) { MIB.addReg(Src); },
5153 [=](MachineInstrBuilder &MIB) { MIB.addImm(Mods); }}};
5154}
5155
5157AMDGPUInstructionSelector::selectWMMAVISrc(MachineOperand &Root) const {
5158 std::optional<FPValueAndVReg> FPValReg;
5159 if (mi_match(Root.getReg(), *MRI, m_GFCstOrSplat(FPValReg))) {
5160 if (TII.isInlineConstant(FPValReg->Value)) {
5161 return {{[=](MachineInstrBuilder &MIB) {
5162 MIB.addImm(FPValReg->Value.bitcastToAPInt().getSExtValue());
5163 }}};
5164 }
5165 // Non-inlineable splat floats should not fall through to the integer
5166 // immediate checks.
5167 return {};
5168 }
5169
5170 APInt ICst;
5171 if (mi_match(Root.getReg(), *MRI, m_ICstOrSplat(ICst))) {
5172 if (TII.isInlineConstant(ICst)) {
5173 return {
5174 {[=](MachineInstrBuilder &MIB) { MIB.addImm(ICst.getSExtValue()); }}};
5175 }
5176 }
5177
5178 return {};
5179}
5180
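// Select the SWMMAC index operand: if the 32-bit index source is a logical
// shift right by a multiple of 8, use the shift source directly and encode
// the byte position as index_key.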
5182AMDGPUInstructionSelector::selectSWMMACIndex8(MachineOperand &Root) const {
5183 Register Src =
5184 getDefIgnoringCopies(Root.getReg(), *MRI)->getOperand(0).getReg();
5185 unsigned Key = 0;
5186
5187 Register ShiftSrc;
5188 std::optional<ValueAndVReg> ShiftAmt;
5189 if (mi_match(Src, *MRI, m_GLShr(m_Reg(ShiftSrc), m_GCst(ShiftAmt))) &&
5190 MRI->getType(ShiftSrc).getSizeInBits() == 32 &&
5191 ShiftAmt->Value.getZExtValue() % 8 == 0) {
5192 Key = ShiftAmt->Value.getZExtValue() / 8;
5193 Src = ShiftSrc;
5194 }
5195
5196 return {{
5197 [=](MachineInstrBuilder &MIB) { MIB.addReg(Src); },
5198 [=](MachineInstrBuilder &MIB) { MIB.addImm(Key); } // index_key
5199 }};
5200}
5201
5203AMDGPUInstructionSelector::selectSWMMACIndex16(MachineOperand &Root) const {
5204
5205 Register Src =
5206 getDefIgnoringCopies(Root.getReg(), *MRI)->getOperand(0).getReg();
5207 unsigned Key = 0;
5208
5209 Register ShiftSrc;
5210 std::optional<ValueAndVReg> ShiftAmt;
5211 if (mi_match(Src, *MRI, m_GLShr(m_Reg(ShiftSrc), m_GCst(ShiftAmt))) &&
5212 MRI->getType(ShiftSrc).getSizeInBits() == 32 &&
5213 ShiftAmt->Value.getZExtValue() == 16) {
5214 Src = ShiftSrc;
5215 Key = 1;
5216 }
5217
5218 return {{
5219 [=](MachineInstrBuilder &MIB) { MIB.addReg(Src); },
5220 [=](MachineInstrBuilder &MIB) { MIB.addImm(Key); } // index_key
5221 }};
5222}
5223
5225AMDGPUInstructionSelector::selectSWMMACIndex32(MachineOperand &Root) const {
5226 Register Src =
5227 getDefIgnoringCopies(Root.getReg(), *MRI)->getOperand(0).getReg();
5228 unsigned Key = 0;
5229
5230 Register S32 = matchZeroExtendFromS32(Src);
5231 if (!S32)
5232 S32 = matchAnyExtendFromS32(Src);
5233
5234 if (S32) {
5235 const MachineInstr *Def = getDefIgnoringCopies(S32, *MRI);
5236 if (Def->getOpcode() == TargetOpcode::G_UNMERGE_VALUES) {
5237 assert(Def->getNumOperands() == 3);
5238 Register DstReg1 = Def->getOperand(1).getReg();
5239 if (mi_match(S32, *MRI,
5240 m_any_of(m_SpecificReg(DstReg1), m_Copy(m_Reg(DstReg1))))) {
5241 Src = Def->getOperand(2).getReg();
5242 Key = 1;
5243 }
5244 }
5245 }
5246
5247 return {{
5248 [=](MachineInstrBuilder &MIB) { MIB.addReg(Src); },
5249 [=](MachineInstrBuilder &MIB) { MIB.addImm(Key); } // index_key
5250 }};
5251}
5252
5254AMDGPUInstructionSelector::selectVOP3OpSelMods(MachineOperand &Root) const {
5255 Register Src;
5256 unsigned Mods;
5257 std::tie(Src, Mods) = selectVOP3ModsImpl(Root.getReg());
5258
5259 // FIXME: Handle op_sel
5260 return {{
5261 [=](MachineInstrBuilder &MIB) { MIB.addReg(Src); },
5262 [=](MachineInstrBuilder &MIB) { MIB.addImm(Mods); } // src_mods
5263 }};
5264}
5265
5266// FIXME-TRUE16 remove when fake16 is removed
5268AMDGPUInstructionSelector::selectVINTERPMods(MachineOperand &Root) const {
5269 Register Src;
5270 unsigned Mods;
5271 std::tie(Src, Mods) = selectVOP3ModsImpl(Root.getReg(),
5272 /*IsCanonicalizing=*/true,
5273 /*AllowAbs=*/false,
5274 /*OpSel=*/false);
5275
5276 return {{
5277 [=](MachineInstrBuilder &MIB) {
5278 MIB.addReg(
5279 copyToVGPRIfSrcFolded(Src, Mods, Root, MIB, /* ForceVGPR */ true));
5280 },
5281 [=](MachineInstrBuilder &MIB) { MIB.addImm(Mods); }, // src0_mods
5282 }};
5283}
5284
5286AMDGPUInstructionSelector::selectVINTERPModsHi(MachineOperand &Root) const {
5287 Register Src;
5288 unsigned Mods;
5289 std::tie(Src, Mods) = selectVOP3ModsImpl(Root.getReg(),
5290 /*IsCanonicalizing=*/true,
5291 /*AllowAbs=*/false,
5292 /*OpSel=*/true);
5293
5294 return {{
5295 [=](MachineInstrBuilder &MIB) {
5296 MIB.addReg(
5297 copyToVGPRIfSrcFolded(Src, Mods, Root, MIB, /* ForceVGPR */ true));
5298 },
5299 [=](MachineInstrBuilder &MIB) { MIB.addImm(Mods); }, // src0_mods
5300 }};
5301}
5302
5303// Given \p Offset and the load specified by the \p Root operand, check if
5304// \p Offset is a multiple of the load byte size. If it is, update \p Offset
5305// to a pre-scaled value and return true.
5306bool AMDGPUInstructionSelector::selectScaleOffset(MachineOperand &Root,
5307 Register &Offset,
5308 bool IsSigned) const {
5309 if (!Subtarget->hasScaleOffset())
5310 return false;
5311
5312 const MachineInstr &MI = *Root.getParent();
5313 MachineMemOperand *MMO = *MI.memoperands_begin();
5314
5315 if (!MMO->getSize().hasValue())
5316 return false;
5317
5318 uint64_t Size = MMO->getSize().getValue();
5319
5320 Register OffsetReg = matchExtendFromS32OrS32(Offset, IsSigned);
5321 if (!OffsetReg)
5322 OffsetReg = Offset;
5323
5324 if (auto Def = getDefSrcRegIgnoringCopies(OffsetReg, *MRI))
5325 OffsetReg = Def->Reg;
5326
5327 Register Op0;
5328 MachineInstr *Mul;
5329 bool ScaleOffset =
5330 (isPowerOf2_64(Size) &&
5331 mi_match(OffsetReg, *MRI,
5332 m_GShl(m_Reg(Op0),
5333 m_any_of(m_SpecificICst(Log2_64(Size)),
5334 m_Copy(m_SpecificICst(Log2_64(Size))))))) ||
5335 mi_match(OffsetReg, *MRI,
5336 m_GMul(m_Reg(Op0), m_any_of(m_SpecificICst(Size),
5337 m_Copy(m_SpecificICst(Size))))) ||
5338 mi_match(
5339 OffsetReg, *MRI,
5340 m_BinOp(IsSigned ? AMDGPU::S_MUL_I64_I32_PSEUDO : AMDGPU::S_MUL_U64,
5341 m_Reg(Op0), m_SpecificICst(Size))) ||
5342 // Match G_AMDGPU_MAD_U64_U32 offset, c, 0
5343 (mi_match(OffsetReg, *MRI, m_MInstr(Mul)) &&
5344 (Mul->getOpcode() == (IsSigned ? AMDGPU::G_AMDGPU_MAD_I64_I32
5345 : AMDGPU::G_AMDGPU_MAD_U64_U32) ||
5346 (IsSigned && Mul->getOpcode() == AMDGPU::G_AMDGPU_MAD_U64_U32 &&
5347 VT->signBitIsZero(Mul->getOperand(2).getReg()))) &&
5348 mi_match(Mul->getOperand(4).getReg(), *MRI, m_ZeroInt()) &&
5349 mi_match(Mul->getOperand(3).getReg(), *MRI,
5351 m_Copy(m_SpecificICst(Size))))) &&
5352 mi_match(Mul->getOperand(2).getReg(), *MRI, m_Reg(Op0)));
5353
5354 if (ScaleOffset)
5355 Offset = Op0;
5356
5357 return ScaleOffset;
5358}
5359
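// Match an SMRD address as an SGPR base plus an encoded immediate offset, a
// 32-bit SGPR offset, or both, depending on which outputs the caller
// requests. Optionally reports whether the SGPR offset should be scaled.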
5360bool AMDGPUInstructionSelector::selectSmrdOffset(MachineOperand &Root,
5361 Register &Base,
5362 Register *SOffset,
5363 int64_t *Offset,
5364 bool *ScaleOffset) const {
5365 MachineInstr *MI = Root.getParent();
5366 MachineBasicBlock *MBB = MI->getParent();
5367
5368 // FIXME: We should shrink the GEP if the offset is known to be <= 32-bits,
5369 // then we can select all ptr + 32-bit offsets.
5370 SmallVector<GEPInfo, 4> AddrInfo;
5371 getAddrModeInfo(*MI, *MRI, AddrInfo);
5372
5373 if (AddrInfo.empty())
5374 return false;
5375
5376 const GEPInfo &GEPI = AddrInfo[0];
5377 std::optional<int64_t> EncodedImm;
5378
5379 if (ScaleOffset)
5380 *ScaleOffset = false;
5381
5382 if (SOffset && Offset) {
5383 EncodedImm = AMDGPU::getSMRDEncodedOffset(STI, GEPI.Imm, /*IsBuffer=*/false,
5384 /*HasSOffset=*/true);
5385 if (GEPI.SgprParts.size() == 1 && GEPI.Imm != 0 && EncodedImm &&
5386 AddrInfo.size() > 1) {
5387 const GEPInfo &GEPI2 = AddrInfo[1];
5388 if (GEPI2.SgprParts.size() == 2 && GEPI2.Imm == 0) {
5389 Register OffsetReg = GEPI2.SgprParts[1];
5390 if (ScaleOffset)
5391 *ScaleOffset =
5392 selectScaleOffset(Root, OffsetReg, false /* IsSigned */);
5393 OffsetReg = matchZeroExtendFromS32OrS32(OffsetReg);
5394 if (OffsetReg) {
5395 Base = GEPI2.SgprParts[0];
5396 *SOffset = OffsetReg;
5397 *Offset = *EncodedImm;
5398 if (*Offset >= 0 || !AMDGPU::hasSMRDSignedImmOffset(STI))
5399 return true;
5400
5401 // For unbuffered smem loads, it is illegal for the Immediate Offset
5402 // to be negative if the resulting (Offset + (M0 or SOffset or zero)
5403 // is negative. Handle the case where the Immediate Offset + SOffset
5404 // is negative.
5405 auto SKnown = VT->getKnownBits(*SOffset);
5406 if (*Offset + SKnown.getMinValue().getSExtValue() < 0)
5407 return false;
5408
5409 return true;
5410 }
5411 }
5412 }
5413 return false;
5414 }
5415
5416 EncodedImm = AMDGPU::getSMRDEncodedOffset(STI, GEPI.Imm, /*IsBuffer=*/false,
5417 /*HasSOffset=*/false);
5418 if (Offset && GEPI.SgprParts.size() == 1 && EncodedImm) {
5419 Base = GEPI.SgprParts[0];
5420 *Offset = *EncodedImm;
5421 return true;
5422 }
5423
5424 // SGPR offset is unsigned.
5425 if (SOffset && GEPI.SgprParts.size() == 1 && isUInt<32>(GEPI.Imm) &&
5426 GEPI.Imm != 0) {
5427 // If we make it this far we have a load with a 32-bit immediate offset.
5428 // It is OK to select this using a sgpr offset, because we have already
5429 // failed trying to select this load into one of the _IMM variants since
5430 // the _IMM Patterns are considered before the _SGPR patterns.
5431 Base = GEPI.SgprParts[0];
5432 *SOffset = MRI->createVirtualRegister(&AMDGPU::SReg_32RegClass);
5433 BuildMI(*MBB, MI, MI->getDebugLoc(), TII.get(AMDGPU::S_MOV_B32), *SOffset)
5434 .addImm(GEPI.Imm);
5435 return true;
5436 }
5437
5438 if (SOffset && GEPI.SgprParts.size() == 2 && GEPI.Imm == 0) {
5439 Register OffsetReg = GEPI.SgprParts[1];
5440 if (ScaleOffset)
5441 *ScaleOffset = selectScaleOffset(Root, OffsetReg, false /* IsSigned */);
5442 OffsetReg = matchZeroExtendFromS32OrS32(OffsetReg);
5443 if (OffsetReg) {
5444 Base = GEPI.SgprParts[0];
5445 *SOffset = OffsetReg;
5446 return true;
5447 }
5448 }
5449
5450 return false;
5451}
5452
5454AMDGPUInstructionSelector::selectSmrdImm(MachineOperand &Root) const {
5455 Register Base;
5456 int64_t Offset;
5457 if (!selectSmrdOffset(Root, Base, /* SOffset= */ nullptr, &Offset,
5458 /* ScaleOffset */ nullptr))
5459 return std::nullopt;
5460
5461 return {{[=](MachineInstrBuilder &MIB) { MIB.addReg(Base); },
5462 [=](MachineInstrBuilder &MIB) { MIB.addImm(Offset); }}};
5463}
5464
5466AMDGPUInstructionSelector::selectSmrdImm32(MachineOperand &Root) const {
5467 SmallVector<GEPInfo, 4> AddrInfo;
5468 getAddrModeInfo(*Root.getParent(), *MRI, AddrInfo);
5469
5470 if (AddrInfo.empty() || AddrInfo[0].SgprParts.size() != 1)
5471 return std::nullopt;
5472
5473 const GEPInfo &GEPInfo = AddrInfo[0];
5474 Register PtrReg = GEPInfo.SgprParts[0];
5475 std::optional<int64_t> EncodedImm =
5476 AMDGPU::getSMRDEncodedLiteralOffset32(STI, GEPInfo.Imm);
5477 if (!EncodedImm)
5478 return std::nullopt;
5479
5480 return {{
5481 [=](MachineInstrBuilder &MIB) { MIB.addReg(PtrReg); },
5482 [=](MachineInstrBuilder &MIB) { MIB.addImm(*EncodedImm); }
5483 }};
5484}
5485
5487AMDGPUInstructionSelector::selectSmrdSgpr(MachineOperand &Root) const {
5488 Register Base, SOffset;
5489 bool ScaleOffset;
5490 if (!selectSmrdOffset(Root, Base, &SOffset, /* Offset= */ nullptr,
5491 &ScaleOffset))
5492 return std::nullopt;
5493
5494 unsigned CPol = ScaleOffset ? AMDGPU::CPol::SCAL : 0;
5495 return {{[=](MachineInstrBuilder &MIB) { MIB.addReg(Base); },
5496 [=](MachineInstrBuilder &MIB) { MIB.addReg(SOffset); },
5497 [=](MachineInstrBuilder &MIB) { MIB.addImm(CPol); }}};
5498}
5499
5501AMDGPUInstructionSelector::selectSmrdSgprImm(MachineOperand &Root) const {
5502 Register Base, SOffset;
5503 int64_t Offset;
5504 bool ScaleOffset;
5505 if (!selectSmrdOffset(Root, Base, &SOffset, &Offset, &ScaleOffset))
5506 return std::nullopt;
5507
5508 unsigned CPol = ScaleOffset ? AMDGPU::CPol::SCAL : 0;
5509 return {{[=](MachineInstrBuilder &MIB) { MIB.addReg(Base); },
5510 [=](MachineInstrBuilder &MIB) { MIB.addReg(SOffset); },
5511 [=](MachineInstrBuilder &MIB) { MIB.addImm(Offset); },
5512 [=](MachineInstrBuilder &MIB) { MIB.addImm(CPol); }}};
5513}
5514
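// Split a flat address into a base pointer plus an immediate offset that is
// legal for the given FLAT variant. Returns the original register with a zero
// offset if nothing can be folded.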
5515std::pair<Register, int>
5516AMDGPUInstructionSelector::selectFlatOffsetImpl(MachineOperand &Root,
5517 uint64_t FlatVariant) const {
5518 MachineInstr *MI = Root.getParent();
5519
5520 auto Default = std::pair(Root.getReg(), 0);
5521
5522 if (!STI.hasFlatInstOffsets())
5523 return Default;
5524
5525 Register PtrBase;
5526 int64_t ConstOffset;
5527 bool IsInBounds;
5528 std::tie(PtrBase, ConstOffset, IsInBounds) =
5529 getPtrBaseWithConstantOffset(Root.getReg(), *MRI);
5530
5531 // Adding an immediate offset to the base address in a FLAT instruction must
5532 // not change the memory aperture in which the address falls.
5533 // Therefore we can only fold offsets from inbounds GEPs into FLAT
5534 // instructions.
5535 if (ConstOffset == 0 ||
5536 (FlatVariant == SIInstrFlags::FlatScratch &&
5537 !isFlatScratchBaseLegal(Root.getReg())) ||
5538 (FlatVariant == SIInstrFlags::FLAT && !IsInBounds))
5539 return Default;
5540
5541 unsigned AddrSpace = (*MI->memoperands_begin())->getAddrSpace();
5542 if (!TII.isLegalFLATOffset(ConstOffset, AddrSpace, FlatVariant))
5543 return Default;
5544
5545 return std::pair(PtrBase, ConstOffset);
5546}
5547
5549AMDGPUInstructionSelector::selectFlatOffset(MachineOperand &Root) const {
5550 auto PtrWithOffset = selectFlatOffsetImpl(Root, SIInstrFlags::FLAT);
5551
5552 return {{
5553 [=](MachineInstrBuilder &MIB) { MIB.addReg(PtrWithOffset.first); },
5554 [=](MachineInstrBuilder &MIB) { MIB.addImm(PtrWithOffset.second); },
5555 }};
5556}
5557
5559AMDGPUInstructionSelector::selectGlobalOffset(MachineOperand &Root) const {
5560 auto PtrWithOffset = selectFlatOffsetImpl(Root, SIInstrFlags::FlatGlobal);
5561
5562 return {{
5563 [=](MachineInstrBuilder &MIB) { MIB.addReg(PtrWithOffset.first); },
5564 [=](MachineInstrBuilder &MIB) { MIB.addImm(PtrWithOffset.second); },
5565 }};
5566}
5567
5569AMDGPUInstructionSelector::selectScratchOffset(MachineOperand &Root) const {
5570 auto PtrWithOffset = selectFlatOffsetImpl(Root, SIInstrFlags::FlatScratch);
5571
5572 return {{
5573 [=](MachineInstrBuilder &MIB) { MIB.addReg(PtrWithOffset.first); },
5574 [=](MachineInstrBuilder &MIB) { MIB.addImm(PtrWithOffset.second); },
5575 }};
5576}
5577
5578// Match (64-bit SGPR base) + (zext vgpr offset) + sext(imm offset)
5580AMDGPUInstructionSelector::selectGlobalSAddr(MachineOperand &Root,
5581 unsigned CPolBits,
5582 bool NeedIOffset) const {
5583 Register Addr = Root.getReg();
5584 Register PtrBase;
5585 int64_t ConstOffset;
5586 int64_t ImmOffset = 0;
5587
5588 // Match the immediate offset first, which canonically is moved as low as
5589 // possible.
5590 std::tie(PtrBase, ConstOffset, std::ignore) =
5591 getPtrBaseWithConstantOffset(Addr, *MRI);
5592
5593 if (ConstOffset != 0) {
5594 if (NeedIOffset &&
5595 TII.isLegalFLATOffset(ConstOffset, AMDGPUAS::GLOBAL_ADDRESS,
5596 SIInstrFlags::FlatGlobal)) {
5597 Addr = PtrBase;
5598 ImmOffset = ConstOffset;
5599 } else {
5600 auto PtrBaseDef = getDefSrcRegIgnoringCopies(PtrBase, *MRI);
5601 if (isSGPR(PtrBaseDef->Reg)) {
5602 if (ConstOffset > 0) {
5603 // Offset is too large.
5604 //
5605 // saddr + large_offset -> saddr +
5606 // (voffset = large_offset & ~MaxOffset) +
5607 // (large_offset & MaxOffset);
5608 int64_t SplitImmOffset = 0, RemainderOffset = ConstOffset;
5609 if (NeedIOffset) {
5610 std::tie(SplitImmOffset, RemainderOffset) =
5611 TII.splitFlatOffset(ConstOffset, AMDGPUAS::GLOBAL_ADDRESS,
5612 SIInstrFlags::FlatGlobal);
5613 }
5614
5615 if (Subtarget->hasSignedGVSOffset() ? isInt<32>(RemainderOffset)
5616 : isUInt<32>(RemainderOffset)) {
5617 MachineInstr *MI = Root.getParent();
5618 MachineBasicBlock *MBB = MI->getParent();
5619 Register HighBits =
5620 MRI->createVirtualRegister(&AMDGPU::VGPR_32RegClass);
5621
5622 BuildMI(*MBB, MI, MI->getDebugLoc(), TII.get(AMDGPU::V_MOV_B32_e32),
5623 HighBits)
5624 .addImm(RemainderOffset);
5625
5626 if (NeedIOffset)
5627 return {{
5628 [=](MachineInstrBuilder &MIB) {
5629 MIB.addReg(PtrBase);
5630 }, // saddr
5631 [=](MachineInstrBuilder &MIB) {
5632 MIB.addReg(HighBits);
5633 }, // voffset
5634 [=](MachineInstrBuilder &MIB) { MIB.addImm(SplitImmOffset); },
5635 [=](MachineInstrBuilder &MIB) { MIB.addImm(CPolBits); },
5636 }};
5637 return {{
5638 [=](MachineInstrBuilder &MIB) { MIB.addReg(PtrBase); }, // saddr
5639 [=](MachineInstrBuilder &MIB) {
5640 MIB.addReg(HighBits);
5641 }, // voffset
5642 [=](MachineInstrBuilder &MIB) { MIB.addImm(CPolBits); },
5643 }};
5644 }
5645 }
5646
5647 // We are adding a 64-bit SGPR and a constant. If the constant bus limit
5648 // is 1 we would need to perform 1 or 2 extra moves for each half of
5649 // the constant, and it is better to do a scalar add and then issue a
5650 // single VALU instruction to materialize zero. Otherwise it takes fewer
5651 // instructions to perform VALU adds with immediates or inline literals.
5652 unsigned NumLiterals =
5653 !TII.isInlineConstant(APInt(32, Lo_32(ConstOffset))) +
5654 !TII.isInlineConstant(APInt(32, Hi_32(ConstOffset)));
5655 if (STI.getConstantBusLimit(AMDGPU::V_ADD_U32_e64) > NumLiterals)
5656 return std::nullopt;
5657 }
5658 }
5659 }
5660
5661 // Match the variable offset.
5662 auto AddrDef = getDefSrcRegIgnoringCopies(Addr, *MRI);
5663 if (AddrDef->MI->getOpcode() == AMDGPU::G_PTR_ADD) {
5664 // Look through the SGPR->VGPR copy.
5665 Register SAddr =
5666 getSrcRegIgnoringCopies(AddrDef->MI->getOperand(1).getReg(), *MRI);
5667
5668 if (isSGPR(SAddr)) {
5669 Register PtrBaseOffset = AddrDef->MI->getOperand(2).getReg();
5670
5671 // It's possible voffset is an SGPR here, but the copy to VGPR will be
5672 // inserted later.
5673 bool ScaleOffset = selectScaleOffset(Root, PtrBaseOffset,
5674 Subtarget->hasSignedGVSOffset());
5675 if (Register VOffset = matchExtendFromS32OrS32(
5676 PtrBaseOffset, Subtarget->hasSignedGVSOffset())) {
5677 if (NeedIOffset)
5678 return {{[=](MachineInstrBuilder &MIB) { // saddr
5679 MIB.addReg(SAddr);
5680 },
5681 [=](MachineInstrBuilder &MIB) { // voffset
5682 MIB.addReg(VOffset);
5683 },
5684 [=](MachineInstrBuilder &MIB) { // offset
5685 MIB.addImm(ImmOffset);
5686 },
5687 [=](MachineInstrBuilder &MIB) { // cpol
5688 MIB.addImm(CPolBits |
5689 (ScaleOffset ? AMDGPU::CPol::SCAL : 0));
5690 }}};
5691 return {{[=](MachineInstrBuilder &MIB) { // saddr
5692 MIB.addReg(SAddr);
5693 },
5694 [=](MachineInstrBuilder &MIB) { // voffset
5695 MIB.addReg(VOffset);
5696 },
5697 [=](MachineInstrBuilder &MIB) { // cpol
5698 MIB.addImm(CPolBits |
5699 (ScaleOffset ? AMDGPU::CPol::SCAL : 0));
5700 }}};
5701 }
5702 }
5703 }
5704
5705 // FIXME: We should probably have folded COPY (G_IMPLICIT_DEF) earlier, and
5706 // drop this.
5707 if (AddrDef->MI->getOpcode() == AMDGPU::G_IMPLICIT_DEF ||
5708 AddrDef->MI->getOpcode() == AMDGPU::G_CONSTANT || !isSGPR(AddrDef->Reg))
5709 return std::nullopt;
5710
5711 // It's cheaper to materialize a single 32-bit zero for vaddr than the two
5712 // moves required to copy a 64-bit SGPR to VGPR.
5713 MachineInstr *MI = Root.getParent();
5714 MachineBasicBlock *MBB = MI->getParent();
5715 Register VOffset = MRI->createVirtualRegister(&AMDGPU::VGPR_32RegClass);
5716
5717 BuildMI(*MBB, MI, MI->getDebugLoc(), TII.get(AMDGPU::V_MOV_B32_e32), VOffset)
5718 .addImm(0);
5719
5720 if (NeedIOffset)
5721 return {{
5722 [=](MachineInstrBuilder &MIB) { MIB.addReg(AddrDef->Reg); }, // saddr
5723 [=](MachineInstrBuilder &MIB) { MIB.addReg(VOffset); }, // voffset
5724 [=](MachineInstrBuilder &MIB) { MIB.addImm(ImmOffset); }, // offset
5725 [=](MachineInstrBuilder &MIB) { MIB.addImm(CPolBits); } // cpol
5726 }};
5727 return {{
5728 [=](MachineInstrBuilder &MIB) { MIB.addReg(AddrDef->Reg); }, // saddr
5729 [=](MachineInstrBuilder &MIB) { MIB.addReg(VOffset); }, // voffset
5730 [=](MachineInstrBuilder &MIB) { MIB.addImm(CPolBits); } // cpol
5731 }};
5732}
5733
5735AMDGPUInstructionSelector::selectGlobalSAddr(MachineOperand &Root) const {
5736 return selectGlobalSAddr(Root, 0);
5737}
5738
5740AMDGPUInstructionSelector::selectGlobalSAddrCPol(MachineOperand &Root) const {
5741 const MachineInstr &I = *Root.getParent();
5742
5743 // We are assuming CPol is always the last operand of the intrinsic.
5744 auto PassedCPol =
5745 I.getOperand(I.getNumOperands() - 1).getImm() & ~AMDGPU::CPol::SCAL;
5746 return selectGlobalSAddr(Root, PassedCPol);
5747}
5748
5750AMDGPUInstructionSelector::selectGlobalSAddrCPolM0(MachineOperand &Root) const {
5751 const MachineInstr &I = *Root.getParent();
5752
5753 // We are assuming CPol is the second-to-last operand of the intrinsic.
5754 auto PassedCPol =
5755 I.getOperand(I.getNumOperands() - 2).getImm() & ~AMDGPU::CPol::SCAL;
5756 return selectGlobalSAddr(Root, PassedCPol);
5757}
5758
5760AMDGPUInstructionSelector::selectGlobalSAddrGLC(MachineOperand &Root) const {
5761 return selectGlobalSAddr(Root, AMDGPU::CPol::GLC);
5762}
5763
5765AMDGPUInstructionSelector::selectGlobalSAddrNoIOffset(
5766 MachineOperand &Root) const {
5767 const MachineInstr &I = *Root.getParent();
5768
5769 // We are assuming CPol is always the last operand of the intrinsic.
5770 auto PassedCPol =
5771 I.getOperand(I.getNumOperands() - 1).getImm() & ~AMDGPU::CPol::SCAL;
5772 return selectGlobalSAddr(Root, PassedCPol, false);
5773}
5774
5776AMDGPUInstructionSelector::selectGlobalSAddrNoIOffsetM0(
5777 MachineOperand &Root) const {
5778 const MachineInstr &I = *Root.getParent();
5779
5780 // We are assuming CPol is the second-to-last operand of the intrinsic.
5781 auto PassedCPol =
5782 I.getOperand(I.getNumOperands() - 2).getImm() & ~AMDGPU::CPol::SCAL;
5783 return selectGlobalSAddr(Root, PassedCPol, false);
5784}
5785
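// Select the SADDR form of a scratch access: fold a legal immediate offset
// and, when the base is a frame index (optionally added to an SGPR), fold the
// frame index into the saddr operand.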
5787AMDGPUInstructionSelector::selectScratchSAddr(MachineOperand &Root) const {
5788 Register Addr = Root.getReg();
5789 Register PtrBase;
5790 int64_t ConstOffset;
5791 int64_t ImmOffset = 0;
5792
5793 // Match the immediate offset first, which canonically is moved as low as
5794 // possible.
5795 std::tie(PtrBase, ConstOffset, std::ignore) =
5796 getPtrBaseWithConstantOffset(Addr, *MRI);
5797
5798 if (ConstOffset != 0 && isFlatScratchBaseLegal(Addr) &&
5799 TII.isLegalFLATOffset(ConstOffset, AMDGPUAS::PRIVATE_ADDRESS,
5800 SIInstrFlags::FlatScratch)) {
5801 Addr = PtrBase;
5802 ImmOffset = ConstOffset;
5803 }
5804
5805 auto AddrDef = getDefSrcRegIgnoringCopies(Addr, *MRI);
5806 if (AddrDef->MI->getOpcode() == AMDGPU::G_FRAME_INDEX) {
5807 int FI = AddrDef->MI->getOperand(1).getIndex();
5808 return {{
5809 [=](MachineInstrBuilder &MIB) { MIB.addFrameIndex(FI); }, // saddr
5810 [=](MachineInstrBuilder &MIB) { MIB.addImm(ImmOffset); } // offset
5811 }};
5812 }
5813
5814 Register SAddr = AddrDef->Reg;
5815
5816 if (AddrDef->MI->getOpcode() == AMDGPU::G_PTR_ADD) {
5817 Register LHS = AddrDef->MI->getOperand(1).getReg();
5818 Register RHS = AddrDef->MI->getOperand(2).getReg();
5819 auto LHSDef = getDefSrcRegIgnoringCopies(LHS, *MRI);
5820 auto RHSDef = getDefSrcRegIgnoringCopies(RHS, *MRI);
5821
5822 if (LHSDef->MI->getOpcode() == AMDGPU::G_FRAME_INDEX &&
5823 isSGPR(RHSDef->Reg)) {
5824 int FI = LHSDef->MI->getOperand(1).getIndex();
5825 MachineInstr &I = *Root.getParent();
5826 MachineBasicBlock *BB = I.getParent();
5827 const DebugLoc &DL = I.getDebugLoc();
5828 SAddr = MRI->createVirtualRegister(&AMDGPU::SReg_32RegClass);
5829
5830 BuildMI(*BB, &I, DL, TII.get(AMDGPU::S_ADD_I32), SAddr)
5831 .addFrameIndex(FI)
5832 .addReg(RHSDef->Reg)
5833 .setOperandDead(3); // Dead scc
5834 }
5835 }
5836
5837 if (!isSGPR(SAddr))
5838 return std::nullopt;
5839
5840 return {{
5841 [=](MachineInstrBuilder &MIB) { MIB.addReg(SAddr); }, // saddr
5842 [=](MachineInstrBuilder &MIB) { MIB.addImm(ImmOffset); } // offset
5843 }};
5844}
5845
5846// Check whether the flat scratch SVS swizzle bug affects this access.
5847bool AMDGPUInstructionSelector::checkFlatScratchSVSSwizzleBug(
5848 Register VAddr, Register SAddr, uint64_t ImmOffset) const {
5849 if (!Subtarget->hasFlatScratchSVSSwizzleBug())
5850 return false;
5851
5852 // The bug affects the swizzling of SVS accesses if there is any carry out
5853 // from the two low order bits (i.e. from bit 1 into bit 2) when adding
5854 // voffset to (soffset + inst_offset).
5855 auto VKnown = VT->getKnownBits(VAddr);
5856 auto SKnown = KnownBits::add(VT->getKnownBits(SAddr),
5857 KnownBits::makeConstant(APInt(32, ImmOffset)));
5858 uint64_t VMax = VKnown.getMaxValue().getZExtValue();
5859 uint64_t SMax = SKnown.getMaxValue().getZExtValue();
5860 return (VMax & 3) + (SMax & 3) >= 4;
5861}
5862
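// Select the SVS (SGPR + VGPR + imm) form of a scratch access, checking the
// base-address legality rules and the SVS swizzle bug before folding the
// immediate offset.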
5864AMDGPUInstructionSelector::selectScratchSVAddr(MachineOperand &Root) const {
5865 Register Addr = Root.getReg();
5866 Register PtrBase;
5867 int64_t ConstOffset;
5868 int64_t ImmOffset = 0;
5869
5870 // Match the immediate offset first, which canonically is moved as low as
5871 // possible.
5872 std::tie(PtrBase, ConstOffset, std::ignore) =
5873 getPtrBaseWithConstantOffset(Addr, *MRI);
5874
5875 Register OrigAddr = Addr;
5876 if (ConstOffset != 0 &&
5877 TII.isLegalFLATOffset(ConstOffset, AMDGPUAS::PRIVATE_ADDRESS,
5878 SIInstrFlags::FlatScratch)) {
5879 Addr = PtrBase;
5880 ImmOffset = ConstOffset;
5881 }
5882
5883 auto AddrDef = getDefSrcRegIgnoringCopies(Addr, *MRI);
5884 if (AddrDef->MI->getOpcode() != AMDGPU::G_PTR_ADD)
5885 return std::nullopt;
5886
5887 Register RHS = AddrDef->MI->getOperand(2).getReg();
5888 if (RBI.getRegBank(RHS, *MRI, TRI)->getID() != AMDGPU::VGPRRegBankID)
5889 return std::nullopt;
5890
5891 Register LHS = AddrDef->MI->getOperand(1).getReg();
5892 auto LHSDef = getDefSrcRegIgnoringCopies(LHS, *MRI);
5893
5894 if (OrigAddr != Addr) {
5895 if (!isFlatScratchBaseLegalSVImm(OrigAddr))
5896 return std::nullopt;
5897 } else {
5898 if (!isFlatScratchBaseLegalSV(OrigAddr))
5899 return std::nullopt;
5900 }
5901
5902 if (checkFlatScratchSVSSwizzleBug(RHS, LHS, ImmOffset))
5903 return std::nullopt;
5904
5905 unsigned CPol = selectScaleOffset(Root, RHS, true /* IsSigned */)
5906 ? AMDGPU::CPol::SCAL
5907 : 0;
5908
5909 if (LHSDef->MI->getOpcode() == AMDGPU::G_FRAME_INDEX) {
5910 int FI = LHSDef->MI->getOperand(1).getIndex();
5911 return {{
5912 [=](MachineInstrBuilder &MIB) { MIB.addReg(RHS); }, // vaddr
5913 [=](MachineInstrBuilder &MIB) { MIB.addFrameIndex(FI); }, // saddr
5914 [=](MachineInstrBuilder &MIB) { MIB.addImm(ImmOffset); }, // offset
5915 [=](MachineInstrBuilder &MIB) { MIB.addImm(CPol); } // cpol
5916 }};
5917 }
5918
5919 if (!isSGPR(LHS))
5920 if (auto Def = getDefSrcRegIgnoringCopies(LHS, *MRI))
5921 LHS = Def->Reg;
5922
5923 if (!isSGPR(LHS))
5924 return std::nullopt;
5925
5926 return {{
5927 [=](MachineInstrBuilder &MIB) { MIB.addReg(RHS); }, // vaddr
5928 [=](MachineInstrBuilder &MIB) { MIB.addReg(LHS); }, // saddr
5929 [=](MachineInstrBuilder &MIB) { MIB.addImm(ImmOffset); }, // offset
5930 [=](MachineInstrBuilder &MIB) { MIB.addImm(CPol); } // cpol
5931 }};
5932}
5933
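// Select a MUBUF offen access to scratch: either materialize the high bits
// of a constant address into vaddr, or fold a frame index / pointer base and
// a legal immediate offset.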
5935AMDGPUInstructionSelector::selectMUBUFScratchOffen(MachineOperand &Root) const {
5936 MachineInstr *MI = Root.getParent();
5937 MachineBasicBlock *MBB = MI->getParent();
5938 MachineFunction *MF = MBB->getParent();
5939 const SIMachineFunctionInfo *Info = MF->getInfo<SIMachineFunctionInfo>();
5940
5941 int64_t Offset = 0;
5942 if (mi_match(Root.getReg(), *MRI, m_ICst(Offset)) &&
5943 Offset != TM.getNullPointerValue(AMDGPUAS::PRIVATE_ADDRESS)) {
5944 Register HighBits = MRI->createVirtualRegister(&AMDGPU::VGPR_32RegClass);
5945
5946 // TODO: Should this be inside the render function? The iterator seems to
5947 // move.
5948 const uint32_t MaxOffset = SIInstrInfo::getMaxMUBUFImmOffset(*Subtarget);
5949 BuildMI(*MBB, MI, MI->getDebugLoc(), TII.get(AMDGPU::V_MOV_B32_e32),
5950 HighBits)
5951 .addImm(Offset & ~MaxOffset);
5952
5953 return {{[=](MachineInstrBuilder &MIB) { // rsrc
5954 MIB.addReg(Info->getScratchRSrcReg());
5955 },
5956 [=](MachineInstrBuilder &MIB) { // vaddr
5957 MIB.addReg(HighBits);
5958 },
5959 [=](MachineInstrBuilder &MIB) { // soffset
5960 // Use constant zero for soffset and rely on eliminateFrameIndex
5961 // to choose the appropriate frame register if need be.
5962 MIB.addImm(0);
5963 },
5964 [=](MachineInstrBuilder &MIB) { // offset
5965 MIB.addImm(Offset & MaxOffset);
5966 }}};
5967 }
5968
5969 assert(Offset == 0 || Offset == -1);
5970
5971 // Try to fold a frame index directly into the MUBUF vaddr field, and any
5972 // offsets.
5973 std::optional<int> FI;
5974 Register VAddr = Root.getReg();
5975
5976 const MachineInstr *RootDef = MRI->getVRegDef(Root.getReg());
5977 Register PtrBase;
5978 int64_t ConstOffset;
5979 std::tie(PtrBase, ConstOffset, std::ignore) =
5980 getPtrBaseWithConstantOffset(VAddr, *MRI);
5981 if (ConstOffset != 0) {
5982 if (TII.isLegalMUBUFImmOffset(ConstOffset) &&
5983 (!STI.privateMemoryResourceIsRangeChecked() ||
5984 VT->signBitIsZero(PtrBase))) {
5985 const MachineInstr *PtrBaseDef = MRI->getVRegDef(PtrBase);
5986 if (PtrBaseDef->getOpcode() == AMDGPU::G_FRAME_INDEX)
5987 FI = PtrBaseDef->getOperand(1).getIndex();
5988 else
5989 VAddr = PtrBase;
5990 Offset = ConstOffset;
5991 }
5992 } else if (RootDef->getOpcode() == AMDGPU::G_FRAME_INDEX) {
5993 FI = RootDef->getOperand(1).getIndex();
5994 }
5995
5996 return {{[=](MachineInstrBuilder &MIB) { // rsrc
5997 MIB.addReg(Info->getScratchRSrcReg());
5998 },
5999 [=](MachineInstrBuilder &MIB) { // vaddr
6000 if (FI)
6001 MIB.addFrameIndex(*FI);
6002 else
6003 MIB.addReg(VAddr);
6004 },
6005 [=](MachineInstrBuilder &MIB) { // soffset
6006 // Use constant zero for soffset and rely on eliminateFrameIndex
6007 // to choose the appropriate frame register if need be.
6008 MIB.addImm(0);
6009 },
6010 [=](MachineInstrBuilder &MIB) { // offset
6011 MIB.addImm(Offset);
6012 }}};
6013}
6014
6015bool AMDGPUInstructionSelector::isDSOffsetLegal(Register Base,
6016 int64_t Offset) const {
6017 if (!isUInt<16>(Offset))
6018 return false;
6019
6020 if (STI.hasUsableDSOffset() || STI.unsafeDSOffsetFoldingEnabled())
6021 return true;
6022
6023 // On Southern Islands, instructions with a negative base value and an offset
6024 // don't seem to work.
6025 return VT->signBitIsZero(Base);
6026}
6027
6028bool AMDGPUInstructionSelector::isDSOffset2Legal(Register Base, int64_t Offset0,
6029 int64_t Offset1,
6030 unsigned Size) const {
6031 if (Offset0 % Size != 0 || Offset1 % Size != 0)
6032 return false;
6033 if (!isUInt<8>(Offset0 / Size) || !isUInt<8>(Offset1 / Size))
6034 return false;
6035
6036 if (STI.hasUsableDSOffset() || STI.unsafeDSOffsetFoldingEnabled())
6037 return true;
6038
6039 // On Southern Islands, instructions with a negative base value and an offset
6040 // don't seem to work.
6041 return VT->signBitIsZero(Base);
6042}
6043
6044// Return whether the operation has NoUnsignedWrap property.
6045static bool isNoUnsignedWrap(MachineInstr *Addr) {
6046 return Addr->getOpcode() == TargetOpcode::G_OR ||
6047 (Addr->getOpcode() == TargetOpcode::G_PTR_ADD &&
6048 Addr->getFlag(MachineInstr::NoUWrap));
6049}
6050
6051// Check that the base address of flat scratch load/store in the form of `base +
6052// offset` is legal to be put in SGPR/VGPR (i.e. unsigned per hardware
6053// requirement). We always treat the first operand as the base address here.
6054bool AMDGPUInstructionSelector::isFlatScratchBaseLegal(Register Addr) const {
6055 MachineInstr *AddrMI = getDefIgnoringCopies(Addr, *MRI);
6056
6057 if (isNoUnsignedWrap(AddrMI))
6058 return true;
6059
6060 // Starting with GFX12, VADDR and SADDR fields in VSCRATCH can use negative
6061 // values.
6062 if (STI.hasSignedScratchOffsets())
6063 return true;
6064
6065 Register LHS = AddrMI->getOperand(1).getReg();
6066 Register RHS = AddrMI->getOperand(2).getReg();
6067
6068 if (AddrMI->getOpcode() == TargetOpcode::G_PTR_ADD) {
6069 std::optional<ValueAndVReg> RhsValReg =
6070 getIConstantVRegValWithLookThrough(RHS, *MRI);
6071 // If the immediate offset is negative and within a certain range, the base
6072 // address cannot also be negative. If the base is also negative, the sum
6073 // would be either negative or much larger than the valid range of scratch
6074 // memory a thread can access.
6075 if (RhsValReg && RhsValReg->Value.getSExtValue() < 0 &&
6076 RhsValReg->Value.getSExtValue() > -0x40000000)
6077 return true;
6078 }
6079
6080 return VT->signBitIsZero(LHS);
6081}
6082
6083// Check that the address values in SGPR/VGPR are legal for flat scratch in
6084// the form of: SGPR + VGPR.
6085bool AMDGPUInstructionSelector::isFlatScratchBaseLegalSV(Register Addr) const {
6086 MachineInstr *AddrMI = getDefIgnoringCopies(Addr, *MRI);
6087
6088 if (isNoUnsignedWrap(AddrMI))
6089 return true;
6090
6091 // Starting with GFX12, VADDR and SADDR fields in VSCRATCH can use negative
6092 // values.
6093 if (STI.hasSignedScratchOffsets())
6094 return true;
6095
6096 Register LHS = AddrMI->getOperand(1).getReg();
6097 Register RHS = AddrMI->getOperand(2).getReg();
6098 return VT->signBitIsZero(RHS) && VT->signBitIsZero(LHS);
6099}
6100
6101// Check that the address values in SGPR/VGPR are legal for flat scratch in
6102// the form of: SGPR + VGPR + Imm.
6103bool AMDGPUInstructionSelector::isFlatScratchBaseLegalSVImm(
6104 Register Addr) const {
6105 // Starting with GFX12, VADDR and SADDR fields in VSCRATCH can use negative
6106 // values.
6107 if (STI.hasSignedScratchOffsets())
6108 return true;
6109
6110 MachineInstr *AddrMI = getDefIgnoringCopies(Addr, *MRI);
6111 Register Base = AddrMI->getOperand(1).getReg();
6112 std::optional<DefinitionAndSourceRegister> BaseDef =
6113 getDefSrcRegIgnoringCopies(Base, *MRI);
6114 std::optional<ValueAndVReg> RHSOffset =
6115 getIConstantVRegValWithLookThrough(AddrMI->getOperand(2).getReg(), *MRI);
6116 assert(RHSOffset);
6117
6118 // If the immediate offset is negative and within a certain range, the base
6119 // address cannot also be negative. If the base is also negative, the sum
6120 // would be either negative or much larger than the valid range of scratch
6121 // memory a thread can access.
6122 if (isNoUnsignedWrap(BaseDef->MI) &&
6123 (isNoUnsignedWrap(AddrMI) ||
6124 (RHSOffset->Value.getSExtValue() < 0 &&
6125 RHSOffset->Value.getSExtValue() > -0x40000000)))
6126 return true;
6127
6128 Register LHS = BaseDef->MI->getOperand(1).getReg();
6129 Register RHS = BaseDef->MI->getOperand(2).getReg();
6130 return VT->signBitIsZero(RHS) && VT->signBitIsZero(LHS);
6131}
6132
6133bool AMDGPUInstructionSelector::isUnneededShiftMask(const MachineInstr &MI,
6134 unsigned ShAmtBits) const {
6135 assert(MI.getOpcode() == TargetOpcode::G_AND);
6136
6137 std::optional<APInt> RHS =
6138 getIConstantVRegVal(MI.getOperand(2).getReg(), *MRI);
6139 if (!RHS)
6140 return false;
6141
6142 if (RHS->countr_one() >= ShAmtBits)
6143 return true;
6144
6145 const APInt &LHSKnownZeros = VT->getKnownZeroes(MI.getOperand(1).getReg());
6146 return (LHSKnownZeros | *RHS).countr_one() >= ShAmtBits;
6147}
6148
6150AMDGPUInstructionSelector::selectMUBUFScratchOffset(
6151 MachineOperand &Root) const {
6152 Register Reg = Root.getReg();
6153 const SIMachineFunctionInfo *Info = MF->getInfo<SIMachineFunctionInfo>();
6154
6155 std::optional<DefinitionAndSourceRegister> Def =
6156 getDefSrcRegIgnoringCopies(Reg, *MRI);
6157 assert(Def && "this shouldn't be an optional result");
6158 Reg = Def->Reg;
6159
6160 if (Register WaveBase = getWaveAddress(Def->MI)) {
6161 return {{
6162 [=](MachineInstrBuilder &MIB) { // rsrc
6163 MIB.addReg(Info->getScratchRSrcReg());
6164 },
6165 [=](MachineInstrBuilder &MIB) { // soffset
6166 MIB.addReg(WaveBase);
6167 },
6168 [=](MachineInstrBuilder &MIB) { MIB.addImm(0); } // offset
6169 }};
6170 }
6171
6172 int64_t Offset = 0;
6173
6174 // FIXME: Copy check is a hack
6175 Register BasePtr;
6176 if (mi_match(Reg, *MRI,
6177 m_GPtrAdd(m_Reg(BasePtr),
6178 m_any_of(m_ICst(Offset), m_Copy(m_ICst(Offset)))))) {
6179 if (!TII.isLegalMUBUFImmOffset(Offset))
6180 return {};
6181 MachineInstr *BasePtrDef = getDefIgnoringCopies(BasePtr, *MRI);
6182 Register WaveBase = getWaveAddress(BasePtrDef);
6183 if (!WaveBase)
6184 return {};
6185
6186 return {{
6187 [=](MachineInstrBuilder &MIB) { // rsrc
6188 MIB.addReg(Info->getScratchRSrcReg());
6189 },
6190 [=](MachineInstrBuilder &MIB) { // soffset
6191 MIB.addReg(WaveBase);
6192 },
6193 [=](MachineInstrBuilder &MIB) { MIB.addImm(Offset); } // offset
6194 }};
6195 }
6196
6197 if (!mi_match(Root.getReg(), *MRI, m_ICst(Offset)) ||
6198 !TII.isLegalMUBUFImmOffset(Offset))
6199 return {};
6200
6201 return {{
6202 [=](MachineInstrBuilder &MIB) { // rsrc
6203 MIB.addReg(Info->getScratchRSrcReg());
6204 },
6205 [=](MachineInstrBuilder &MIB) { // soffset
6206 MIB.addImm(0);
6207 },
6208 [=](MachineInstrBuilder &MIB) { MIB.addImm(Offset); } // offset
6209 }};
6210}
6211
6212std::pair<Register, unsigned>
6213AMDGPUInstructionSelector::selectDS1Addr1OffsetImpl(MachineOperand &Root) const {
6214 const MachineInstr *RootDef = MRI->getVRegDef(Root.getReg());
6215 int64_t ConstAddr = 0;
6216
6217 Register PtrBase;
6218 int64_t Offset;
6219 std::tie(PtrBase, Offset, std::ignore) =
6220 getPtrBaseWithConstantOffset(Root.getReg(), *MRI);
6221
6222 if (Offset) {
6223 if (isDSOffsetLegal(PtrBase, Offset)) {
6224 // (add n0, c0)
6225 return std::pair(PtrBase, Offset);
6226 }
6227 } else if (RootDef->getOpcode() == AMDGPU::G_SUB) {
6228 // TODO
6229
6230
6231 } else if (mi_match(Root.getReg(), *MRI, m_ICst(ConstAddr))) {
6232 // TODO
6233
6234 }
6235
6236 return std::pair(Root.getReg(), 0);
6237}
6238
6240AMDGPUInstructionSelector::selectDS1Addr1Offset(MachineOperand &Root) const {
6241 Register Reg;
6242 unsigned Offset;
6243 std::tie(Reg, Offset) = selectDS1Addr1OffsetImpl(Root);
6244 return {{
6245 [=](MachineInstrBuilder &MIB) { MIB.addReg(Reg); },
6246 [=](MachineInstrBuilder &MIB) { MIB.addImm(Offset); }
6247 }};
6248}
6249
6251AMDGPUInstructionSelector::selectDS64Bit4ByteAligned(MachineOperand &Root) const {
6252 return selectDSReadWrite2(Root, 4);
6253}
6254
6256AMDGPUInstructionSelector::selectDS128Bit8ByteAligned(MachineOperand &Root) const {
6257 return selectDSReadWrite2(Root, 8);
6258}
6259
6261AMDGPUInstructionSelector::selectDSReadWrite2(MachineOperand &Root,
6262 unsigned Size) const {
6263 Register Reg;
6264 unsigned Offset;
6265 std::tie(Reg, Offset) = selectDSReadWrite2Impl(Root, Size);
6266 return {{
6267 [=](MachineInstrBuilder &MIB) { MIB.addReg(Reg); },
6268 [=](MachineInstrBuilder &MIB) { MIB.addImm(Offset); },
6269 [=](MachineInstrBuilder &MIB) { MIB.addImm(Offset+1); }
6270 }};
6271}
6272
6273std::pair<Register, unsigned>
6274AMDGPUInstructionSelector::selectDSReadWrite2Impl(MachineOperand &Root,
6275 unsigned Size) const {
6276 const MachineInstr *RootDef = MRI->getVRegDef(Root.getReg());
6277 int64_t ConstAddr = 0;
6278
6279 Register PtrBase;
6280 int64_t Offset;
6281 std::tie(PtrBase, Offset, std::ignore) =
6282 getPtrBaseWithConstantOffset(Root.getReg(), *MRI);
6283
6284 if (Offset) {
6285 int64_t OffsetValue0 = Offset;
6286 int64_t OffsetValue1 = Offset + Size;
6287 if (isDSOffset2Legal(PtrBase, OffsetValue0, OffsetValue1, Size)) {
6288 // (add n0, c0)
6289 return std::pair(PtrBase, OffsetValue0 / Size);
6290 }
6291 } else if (RootDef->getOpcode() == AMDGPU::G_SUB) {
6292 // TODO
6293
6294 } else if (mi_match(Root.getReg(), *MRI, m_ICst(ConstAddr))) {
6295 // TODO
6296
6297 }
6298
6299 return std::pair(Root.getReg(), 0);
6300}
6301
6302/// If \p Root is a G_PTR_ADD with a G_CONSTANT on the right hand side, return
6303/// the base value with the constant offset, and if the offset computation is
6304/// known to be inbounds. There may be intervening copies between \p Root and
6305/// the identified constant. Returns \p Root, 0, false if this does not match
6306/// the pattern.
6307std::tuple<Register, int64_t, bool>
6308AMDGPUInstructionSelector::getPtrBaseWithConstantOffset(
6309 Register Root, const MachineRegisterInfo &MRI) const {
6310 MachineInstr *RootI = getDefIgnoringCopies(Root, MRI);
6311 if (RootI->getOpcode() != TargetOpcode::G_PTR_ADD)
6312 return {Root, 0, false};
6313
6314 MachineOperand &RHS = RootI->getOperand(2);
6315 std::optional<ValueAndVReg> MaybeOffset =
6316 getIConstantVRegValWithLookThrough(RHS.getReg(), MRI);
6317 if (!MaybeOffset)
6318 return {Root, 0, false};
6319 bool IsInBounds = RootI->getFlag(MachineInstr::MIFlag::InBounds);
6320 return {RootI->getOperand(1).getReg(), MaybeOffset->Value.getSExtValue(),
6321 IsInBounds};
6322}
6323
6324static void addZeroImm(MachineInstrBuilder &MIB) {
6325 MIB.addImm(0);
6326}
6327
6328/// Return a resource descriptor for use with an arbitrary 64-bit pointer. If \p
6329/// BasePtr is not valid, a null base pointer will be used.
6330static Register buildRSRC(MachineIRBuilder &B, MachineRegisterInfo &MRI,
6331 uint32_t FormatLo, uint32_t FormatHi,
6332 Register BasePtr) {
6333 Register RSrc2 = MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass);
6334 Register RSrc3 = MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass);
6335 Register RSrcHi = MRI.createVirtualRegister(&AMDGPU::SReg_64RegClass);
6336 Register RSrc = MRI.createVirtualRegister(&AMDGPU::SGPR_128RegClass);
6337
6338 B.buildInstr(AMDGPU::S_MOV_B32)
6339 .addDef(RSrc2)
6340 .addImm(FormatLo);
6341 B.buildInstr(AMDGPU::S_MOV_B32)
6342 .addDef(RSrc3)
6343 .addImm(FormatHi);
6344
6345 // Build the half of the subregister with the constants before building the
6346 // full 128-bit register. If we are building multiple resource descriptors,
6347 // this will allow CSEing of the 2-component register.
6348 B.buildInstr(AMDGPU::REG_SEQUENCE)
6349 .addDef(RSrcHi)
6350 .addReg(RSrc2)
6351 .addImm(AMDGPU::sub0)
6352 .addReg(RSrc3)
6353 .addImm(AMDGPU::sub1);
6354
6355 Register RSrcLo = BasePtr;
6356 if (!BasePtr) {
6357 RSrcLo = MRI.createVirtualRegister(&AMDGPU::SReg_64RegClass);
6358 B.buildInstr(AMDGPU::S_MOV_B64)
6359 .addDef(RSrcLo)
6360 .addImm(0);
6361 }
6362
6363 B.buildInstr(AMDGPU::REG_SEQUENCE)
6364 .addDef(RSrc)
6365 .addReg(RSrcLo)
6366 .addImm(AMDGPU::sub0_sub1)
6367 .addReg(RSrcHi)
6368 .addImm(AMDGPU::sub2_sub3);
6369
6370 return RSrc;
6371}
6372
6373static Register buildAddr64RSrc(MachineIRBuilder &B, MachineRegisterInfo &MRI,
6374 const SIInstrInfo &TII, Register BasePtr) {
6375 uint64_t DefaultFormat = TII.getDefaultRsrcDataFormat();
6376
6377 // FIXME: Why are half the "default" bits ignored based on the addressing
6378 // mode?
6379 return buildRSRC(B, MRI, 0, Hi_32(DefaultFormat), BasePtr);
6380}
6381
6382static Register buildOffsetSrc(MachineIRBuilder &B, MachineRegisterInfo &MRI,
6383 const SIInstrInfo &TII, Register BasePtr) {
6384 uint64_t DefaultFormat = TII.getDefaultRsrcDataFormat();
6385
6386 // FIXME: Why are half the "default" bits ignored based on the addressing
6387 // mode?
6388 return buildRSRC(B, MRI, -1, Hi_32(DefaultFormat), BasePtr);
6389}
6390
6391AMDGPUInstructionSelector::MUBUFAddressData
6392AMDGPUInstructionSelector::parseMUBUFAddress(Register Src) const {
6393 MUBUFAddressData Data;
6394 Data.N0 = Src;
6395
6396 Register PtrBase;
6397 int64_t Offset;
6398
6399 std::tie(PtrBase, Offset, std::ignore) =
6400 getPtrBaseWithConstantOffset(Src, *MRI);
6401 if (isUInt<32>(Offset)) {
6402 Data.N0 = PtrBase;
6403 Data.Offset = Offset;
6404 }
6405
6406 if (MachineInstr *InputAdd
6407 = getOpcodeDef(TargetOpcode::G_PTR_ADD, Data.N0, *MRI)) {
6408 Data.N2 = InputAdd->getOperand(1).getReg();
6409 Data.N3 = InputAdd->getOperand(2).getReg();
6410
6411 // FIXME: Need to fix extra SGPR->VGPR copies inserted
6412 // FIXME: Don't know that this was defined by operand 0
6413 //
6414 // TODO: Remove this when we have copy folding optimizations after
6415 // RegBankSelect.
6416 Data.N2 = getDefIgnoringCopies(Data.N2, *MRI)->getOperand(0).getReg();
6417 Data.N3 = getDefIgnoringCopies(Data.N3, *MRI)->getOperand(0).getReg();
6418 }
6419
6420 return Data;
6421}
6422
6423/// Return whether the addr64 mubuf mode should be used for the given address.
6424bool AMDGPUInstructionSelector::shouldUseAddr64(MUBUFAddressData Addr) const {
6425 // (ptr_add N2, N3) -> addr64, or
6426 // (ptr_add (ptr_add N2, N3), C1) -> addr64
6427 if (Addr.N2)
6428 return true;
6429
6430 const RegisterBank *N0Bank = RBI.getRegBank(Addr.N0, *MRI, TRI);
6431 return N0Bank->getID() == AMDGPU::VGPRRegBankID;
6432}
6433
6434/// Split an immediate offset \p ImmOffset depending on whether it fits in the
6435/// immediate field. Modifies \p ImmOffset and sets \p SOffset to the variable
6436/// component.
6437void AMDGPUInstructionSelector::splitIllegalMUBUFOffset(
6438 MachineIRBuilder &B, Register &SOffset, int64_t &ImmOffset) const {
6439 if (TII.isLegalMUBUFImmOffset(ImmOffset))
6440 return;
6441
6442 // Illegal offset, store it in soffset.
6443 SOffset = MRI->createVirtualRegister(&AMDGPU::SReg_32RegClass);
6444 B.buildInstr(AMDGPU::S_MOV_B32)
6445 .addDef(SOffset)
6446 .addImm(ImmOffset);
6447 ImmOffset = 0;
6448}
6449
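// Select the addr64 MUBUF form: decide which parts of the parsed address feed
// vaddr versus the resource descriptor base pointer, then move any illegal
// immediate offset into soffset.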
6450bool AMDGPUInstructionSelector::selectMUBUFAddr64Impl(
6451 MachineOperand &Root, Register &VAddr, Register &RSrcReg,
6452 Register &SOffset, int64_t &Offset) const {
6453 // FIXME: Predicates should stop this from reaching here.
6454 // addr64 bit was removed for volcanic islands.
6455 if (!STI.hasAddr64() || STI.useFlatForGlobal())
6456 return false;
6457
6458 MUBUFAddressData AddrData = parseMUBUFAddress(Root.getReg());
6459 if (!shouldUseAddr64(AddrData))
6460 return false;
6461
6462 Register N0 = AddrData.N0;
6463 Register N2 = AddrData.N2;
6464 Register N3 = AddrData.N3;
6465 Offset = AddrData.Offset;
6466
6467 // Base pointer for the SRD.
6468 Register SRDPtr;
6469
6470 if (N2) {
6471 if (RBI.getRegBank(N2, *MRI, TRI)->getID() == AMDGPU::VGPRRegBankID) {
6472 assert(N3);
6473 if (RBI.getRegBank(N3, *MRI, TRI)->getID() == AMDGPU::VGPRRegBankID) {
6474 // Both N2 and N3 are divergent. Use N0 (the result of the add) as the
6475 // addr64, and construct the default resource from a 0 address.
6476 VAddr = N0;
6477 } else {
6478 SRDPtr = N3;
6479 VAddr = N2;
6480 }
6481 } else {
6482 // N2 is not divergent.
6483 SRDPtr = N2;
6484 VAddr = N3;
6485 }
6486 } else if (RBI.getRegBank(N0, *MRI, TRI)->getID() == AMDGPU::VGPRRegBankID) {
6487 // Use the default null pointer in the resource
6488 VAddr = N0;
6489 } else {
6490 // N0 -> offset, or
6491 // (N0 + C1) -> offset
6492 SRDPtr = N0;
6493 }
6494
6495 MachineIRBuilder B(*Root.getParent());
6496 RSrcReg = buildAddr64RSrc(B, *MRI, TII, SRDPtr);
6497 splitIllegalMUBUFOffset(B, SOffset, Offset);
6498 return true;
6499}
6500
6501bool AMDGPUInstructionSelector::selectMUBUFOffsetImpl(
6502 MachineOperand &Root, Register &RSrcReg, Register &SOffset,
6503 int64_t &Offset) const {
6504
6505 // FIXME: Pattern should not reach here.
6506 if (STI.useFlatForGlobal())
6507 return false;
6508
6509 MUBUFAddressData AddrData = parseMUBUFAddress(Root.getReg());
6510 if (shouldUseAddr64(AddrData))
6511 return false;
6512
6513 // N0 -> offset, or
6514 // (N0 + C1) -> offset
6515 Register SRDPtr = AddrData.N0;
6516 Offset = AddrData.Offset;
6517
6518 // TODO: Look through extensions for 32-bit soffset.
6519 MachineIRBuilder B(*Root.getParent());
6520
6521 RSrcReg = buildOffsetSrc(B, *MRI, TII, SRDPtr);
6522 splitIllegalMUBUFOffset(B, SOffset, Offset);
6523 return true;
6524}
6525
6527AMDGPUInstructionSelector::selectMUBUFAddr64(MachineOperand &Root) const {
6528 Register VAddr;
6529 Register RSrcReg;
6530 Register SOffset;
6531 int64_t Offset = 0;
6532
6533 if (!selectMUBUFAddr64Impl(Root, VAddr, RSrcReg, SOffset, Offset))
6534 return {};
6535
6536 // FIXME: Use defaulted operands for trailing 0s and remove from the complex
6537 // pattern.
6538 return {{
6539 [=](MachineInstrBuilder &MIB) { // rsrc
6540 MIB.addReg(RSrcReg);
6541 },
6542 [=](MachineInstrBuilder &MIB) { // vaddr
6543 MIB.addReg(VAddr);
6544 },
6545 [=](MachineInstrBuilder &MIB) { // soffset
6546 if (SOffset)
6547 MIB.addReg(SOffset);
6548 else if (STI.hasRestrictedSOffset())
6549 MIB.addReg(AMDGPU::SGPR_NULL);
6550 else
6551 MIB.addImm(0);
6552 },
6553 [=](MachineInstrBuilder &MIB) { // offset
6554 MIB.addImm(Offset);
6555 },
6556 addZeroImm, // cpol
6557 addZeroImm, // tfe
6558 addZeroImm // swz
6559 }};
6560}
6561
6563AMDGPUInstructionSelector::selectMUBUFOffset(MachineOperand &Root) const {
6564 Register RSrcReg;
6565 Register SOffset;
6566 int64_t Offset = 0;
6567
6568 if (!selectMUBUFOffsetImpl(Root, RSrcReg, SOffset, Offset))
6569 return {};
6570
6571 return {{
6572 [=](MachineInstrBuilder &MIB) { // rsrc
6573 MIB.addReg(RSrcReg);
6574 },
6575 [=](MachineInstrBuilder &MIB) { // soffset
6576 if (SOffset)
6577 MIB.addReg(SOffset);
6578 else if (STI.hasRestrictedSOffset())
6579 MIB.addReg(AMDGPU::SGPR_NULL);
6580 else
6581 MIB.addImm(0);
6582 },
6583 [=](MachineInstrBuilder &MIB) { MIB.addImm(Offset); }, // offset
6584 addZeroImm, // cpol
6585 addZeroImm, // tfe
6586 addZeroImm, // swz
6587 }};
6588}
6589
6591AMDGPUInstructionSelector::selectBUFSOffset(MachineOperand &Root) const {
6592
6593 Register SOffset = Root.getReg();
6594
6595 if (STI.hasRestrictedSOffset() && mi_match(SOffset, *MRI, m_ZeroInt()))
6596 SOffset = AMDGPU::SGPR_NULL;
6597
6598 return {{[=](MachineInstrBuilder &MIB) { MIB.addReg(SOffset); }}};
6599}
6600
6601/// Get an immediate that must be 32-bits, and treated as zero extended.
6602static std::optional<uint64_t>
6603getConstantZext32Val(Register Reg, const MachineRegisterInfo &MRI) {
6604 // getIConstantVRegVal sexts any values, so see if that matters.
6605 std::optional<int64_t> OffsetVal = getIConstantVRegSExtVal(Reg, MRI);
6606 if (!OffsetVal || !isInt<32>(*OffsetVal))
6607 return std::nullopt;
6608 return Lo_32(*OffsetVal);
6609}
6610
6612AMDGPUInstructionSelector::selectSMRDBufferImm(MachineOperand &Root) const {
6613 std::optional<uint64_t> OffsetVal =
6614 Root.isImm() ? Root.getImm() : getConstantZext32Val(Root.getReg(), *MRI);
6615 if (!OffsetVal)
6616 return {};
6617
6618 std::optional<int64_t> EncodedImm =
6619 AMDGPU::getSMRDEncodedOffset(STI, *OffsetVal, true);
6620 if (!EncodedImm)
6621 return {};
6622
6623 return {{ [=](MachineInstrBuilder &MIB) { MIB.addImm(*EncodedImm); } }};
6624}
6625
6627AMDGPUInstructionSelector::selectSMRDBufferImm32(MachineOperand &Root) const {
6628 assert(STI.getGeneration() == AMDGPUSubtarget::SEA_ISLANDS);
6629
6630 std::optional<uint64_t> OffsetVal = getConstantZext32Val(Root.getReg(), *MRI);
6631 if (!OffsetVal)
6632 return {};
6633
6634 std::optional<int64_t> EncodedImm =
6635 AMDGPU::getSMRDEncodedLiteralOffset32(STI, *OffsetVal);
6636 if (!EncodedImm)
6637 return {};
6638
6639 return {{ [=](MachineInstrBuilder &MIB) { MIB.addImm(*EncodedImm); } }};
6640}
6641
6643AMDGPUInstructionSelector::selectSMRDBufferSgprImm(MachineOperand &Root) const {
6644 // Match the (soffset + offset) pair as a 32-bit register base and
6645 // an immediate offset.
6646 Register SOffset;
6647 unsigned Offset;
6648 std::tie(SOffset, Offset) = AMDGPU::getBaseWithConstantOffset(
6649 *MRI, Root.getReg(), VT, /*CheckNUW*/ true);
6650 if (!SOffset)
6651 return std::nullopt;
6652
6653 std::optional<int64_t> EncodedOffset =
6654 AMDGPU::getSMRDEncodedOffset(STI, Offset, /* IsBuffer */ true);
6655 if (!EncodedOffset)
6656 return std::nullopt;
6657
6658 assert(MRI->getType(SOffset) == LLT::scalar(32));
6659 return {{[=](MachineInstrBuilder &MIB) { MIB.addReg(SOffset); },
6660 [=](MachineInstrBuilder &MIB) { MIB.addImm(*EncodedOffset); }}};
6661}
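// Illustrative sketch with assumed MIR, not taken from a real test: given
//
//   %base:sgpr(s32) = ...
//   %off:sgpr(s32) = nuw G_ADD %base, %c   ; %c = G_CONSTANT i32 16
//
// getBaseWithConstantOffset(..., /*CheckNUW=*/true) splits %off into
// SOffset = %base and Offset = 16, and the offset is rendered in its
// SMRD-encoded form next to the SGPR base.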
6662
6663std::pair<Register, unsigned>
6664AMDGPUInstructionSelector::selectVOP3PMadMixModsImpl(MachineOperand &Root,
6665 bool &Matched) const {
6666 Matched = false;
6667
6668 Register Src;
6669 unsigned Mods;
6670 std::tie(Src, Mods) = selectVOP3ModsImpl(Root.getReg());
6671
6672 if (mi_match(Src, *MRI, m_GFPExt(m_Reg(Src)))) {
6673 assert(MRI->getType(Src) == LLT::scalar(16));
6674
6675 // Only change Src if src modifier could be gained. In such cases new Src
6676 // could be sgpr but this does not violate constant bus restriction for
6677 // instruction that is being selected.
6678 Src = stripBitCast(Src, *MRI);
6679
6680 const auto CheckAbsNeg = [&]() {
6681 // Be careful about folding modifiers if we already have an abs. fneg is
6682 // applied last, so we don't want to apply an earlier fneg.
6683 if ((Mods & SISrcMods::ABS) == 0) {
6684 unsigned ModsTmp;
6685 std::tie(Src, ModsTmp) = selectVOP3ModsImpl(Src);
6686
6687 if ((ModsTmp & SISrcMods::NEG) != 0)
6688 Mods ^= SISrcMods::NEG;
6689
6690 if ((ModsTmp & SISrcMods::ABS) != 0)
6691 Mods |= SISrcMods::ABS;
6692 }
6693 };
6694
6695 CheckAbsNeg();
6696
6697 // op_sel/op_sel_hi decide the source type and source.
6698 // If the source's op_sel_hi is set, it indicates to do a conversion from
6699 // fp16. If the source's op_sel is set, it picks the high half of the
6700 // source register.
6701
6702 Mods |= SISrcMods::OP_SEL_1;
6703
6704 if (isExtractHiElt(*MRI, Src, Src)) {
6705 Mods |= SISrcMods::OP_SEL_0;
6706 CheckAbsNeg();
6707 }
6708
6709 Matched = true;
6710 }
6711
6712 return {Src, Mods};
6713}
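// Illustrative example with assumed generic MIR (not from this file): for
//
//   %n:_(s16) = G_FNEG %x
//   %e:_(s32) = G_FPEXT %n
//
// and Root = %e, the G_FPEXT is peeled off, CheckAbsNeg folds the f16 fneg
// into Mods as SISrcMods::NEG, OP_SEL_1 marks the operand as an f16 source,
// and the function returns {%x, NEG | OP_SEL_1} with Matched = true.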
6714
6715InstructionSelector::ComplexRendererFns
6716AMDGPUInstructionSelector::selectVOP3PMadMixModsExt(
6717 MachineOperand &Root) const {
6718 Register Src;
6719 unsigned Mods;
6720 bool Matched;
6721 std::tie(Src, Mods) = selectVOP3PMadMixModsImpl(Root, Matched);
6722 if (!Matched)
6723 return {};
6724
6725 return {{
6726 [=](MachineInstrBuilder &MIB) { MIB.addReg(Src); },
6727 [=](MachineInstrBuilder &MIB) { MIB.addImm(Mods); } // src_mods
6728 }};
6729}
6730
6731InstructionSelector::ComplexRendererFns
6732AMDGPUInstructionSelector::selectVOP3PMadMixMods(MachineOperand &Root) const {
6733 Register Src;
6734 unsigned Mods;
6735 bool Matched;
6736 std::tie(Src, Mods) = selectVOP3PMadMixModsImpl(Root, Matched);
6737
6738 return {{
6739 [=](MachineInstrBuilder &MIB) { MIB.addReg(Src); },
6740 [=](MachineInstrBuilder &MIB) { MIB.addImm(Mods); } // src_mods
6741 }};
6742}
6743
6744bool AMDGPUInstructionSelector::selectSBarrierSignalIsfirst(
6745 MachineInstr &I, Intrinsic::ID IntrID) const {
6746 MachineBasicBlock *MBB = I.getParent();
6747 const DebugLoc &DL = I.getDebugLoc();
6748 Register CCReg = I.getOperand(0).getReg();
6749
6750 // Set SCC to true, in case the barrier instruction gets converted to a NOP.
6751 BuildMI(*MBB, &I, DL, TII.get(AMDGPU::S_CMP_EQ_U32)).addImm(0).addImm(0);
6752
6753 BuildMI(*MBB, &I, DL, TII.get(AMDGPU::S_BARRIER_SIGNAL_ISFIRST_IMM))
6754 .addImm(I.getOperand(2).getImm());
6755
6756 BuildMI(*MBB, &I, DL, TII.get(AMDGPU::COPY), CCReg).addReg(AMDGPU::SCC);
6757
6758 I.eraseFromParent();
6759 return RBI.constrainGenericRegister(CCReg, AMDGPU::SReg_32_XM0_XEXECRegClass,
6760 *MRI);
6761}
6762
6763bool AMDGPUInstructionSelector::selectSGetBarrierState(
6764 MachineInstr &I, Intrinsic::ID IntrID) const {
6765 MachineBasicBlock *MBB = I.getParent();
6766 const DebugLoc &DL = I.getDebugLoc();
6767 const MachineOperand &BarOp = I.getOperand(2);
6768 std::optional<int64_t> BarValImm =
6769 getIConstantVRegSExtVal(BarOp.getReg(), *MRI);
6770
6771 if (!BarValImm) {
6772 auto CopyMIB = BuildMI(*MBB, &I, DL, TII.get(AMDGPU::COPY), AMDGPU::M0)
6773 .addReg(BarOp.getReg());
6774 constrainSelectedInstRegOperands(*CopyMIB, TII, TRI, RBI);
6775 }
6776 MachineInstrBuilder MIB;
6777 unsigned Opc = BarValImm ? AMDGPU::S_GET_BARRIER_STATE_IMM
6778 : AMDGPU::S_GET_BARRIER_STATE_M0;
6779 MIB = BuildMI(*MBB, &I, DL, TII.get(Opc));
6780
6781 auto DstReg = I.getOperand(0).getReg();
6782 const TargetRegisterClass *DstRC =
6783 TRI.getConstrainedRegClassForOperand(I.getOperand(0), *MRI);
6784 if (!DstRC || !RBI.constrainGenericRegister(DstReg, *DstRC, *MRI))
6785 return false;
6786 MIB.addDef(DstReg);
6787 if (BarValImm) {
6788 MIB.addImm(*BarValImm);
6789 }
6790 I.eraseFromParent();
6791 return true;
6792}
6793
6794unsigned getNamedBarrierOp(bool HasInlineConst, Intrinsic::ID IntrID) {
6795 if (HasInlineConst) {
6796 switch (IntrID) {
6797 default:
6798 llvm_unreachable("not a named barrier op");
6799 case Intrinsic::amdgcn_s_barrier_join:
6800 return AMDGPU::S_BARRIER_JOIN_IMM;
6801 case Intrinsic::amdgcn_s_get_named_barrier_state:
6802 return AMDGPU::S_GET_BARRIER_STATE_IMM;
6803 };
6804 } else {
6805 switch (IntrID) {
6806 default:
6807 llvm_unreachable("not a named barrier op");
6808 case Intrinsic::amdgcn_s_barrier_join:
6809 return AMDGPU::S_BARRIER_JOIN_M0;
6810 case Intrinsic::amdgcn_s_get_named_barrier_state:
6811 return AMDGPU::S_GET_BARRIER_STATE_M0;
6812 };
6813 }
6814}
6815
6816bool AMDGPUInstructionSelector::selectNamedBarrierInit(
6817 MachineInstr &I, Intrinsic::ID IntrID) const {
6818 MachineBasicBlock *MBB = I.getParent();
6819 const DebugLoc &DL = I.getDebugLoc();
6820 const MachineOperand &BarOp = I.getOperand(1);
6821 const MachineOperand &CntOp = I.getOperand(2);
6822
6823 // BarID = (BarOp >> 4) & 0x3F
6824 Register TmpReg0 = MRI->createVirtualRegister(&AMDGPU::SReg_32RegClass);
6825 BuildMI(*MBB, &I, DL, TII.get(AMDGPU::S_LSHR_B32), TmpReg0)
6826 .add(BarOp)
6827 .addImm(4u)
6828 .setOperandDead(3); // Dead scc
6829
6830 Register TmpReg1 = MRI->createVirtualRegister(&AMDGPU::SReg_32RegClass);
6831 BuildMI(*MBB, &I, DL, TII.get(AMDGPU::S_AND_B32), TmpReg1)
6832 .addReg(TmpReg0)
6833 .addImm(0x3F)
6834 .setOperandDead(3); // Dead scc
6835
6836 // MO = ((CntOp & 0x3F) << shAmt) | BarID
6837 Register TmpReg2 = MRI->createVirtualRegister(&AMDGPU::SReg_32RegClass);
6838 BuildMI(*MBB, &I, DL, TII.get(AMDGPU::S_AND_B32), TmpReg2)
6839 .add(CntOp)
6840 .addImm(0x3F)
6841 .setOperandDead(3); // Dead scc
6842
6843 Register TmpReg3 = MRI->createVirtualRegister(&AMDGPU::SReg_32RegClass);
6844 constexpr unsigned ShAmt = 16;
6845 BuildMI(*MBB, &I, DL, TII.get(AMDGPU::S_LSHL_B32), TmpReg3)
6846 .addReg(TmpReg2)
6847 .addImm(ShAmt)
6848 .setOperandDead(3); // Dead scc
6849
6850 Register TmpReg4 = MRI->createVirtualRegister(&AMDGPU::SReg_32RegClass);
6851 BuildMI(*MBB, &I, DL, TII.get(AMDGPU::S_OR_B32), TmpReg4)
6852 .addReg(TmpReg1)
6853 .addReg(TmpReg3)
6854 .setOperandDead(3); // Dead scc;
6855
6856 auto CopyMIB =
6857 BuildMI(*MBB, &I, DL, TII.get(AMDGPU::COPY), AMDGPU::M0).addReg(TmpReg4);
6858 constrainSelectedInstRegOperands(*CopyMIB, TII, TRI, RBI);
6859
6860 unsigned Opc = IntrID == Intrinsic::amdgcn_s_barrier_init
6861 ? AMDGPU::S_BARRIER_INIT_M0
6862 : AMDGPU::S_BARRIER_SIGNAL_M0;
6863 MachineInstrBuilder MIB;
6864 MIB = BuildMI(*MBB, &I, DL, TII.get(Opc));
6865
6866 I.eraseFromParent();
6867 return true;
6868}
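// Worked example of the M0 packing above, with assumed operand values:
// for BarOp = 0x50 and CntOp = 8,
//   BarID = (0x50 >> 4) & 0x3F = 5
//   MO    = ((8 & 0x3F) << 16) | 5 = 0x00080005
// and that value is what gets copied into M0 before S_BARRIER_INIT_M0 /
// S_BARRIER_SIGNAL_M0.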
6869
6870bool AMDGPUInstructionSelector::selectNamedBarrierInst(
6871 MachineInstr &I, Intrinsic::ID IntrID) const {
6872 MachineBasicBlock *MBB = I.getParent();
6873 const DebugLoc &DL = I.getDebugLoc();
6874 MachineOperand BarOp = IntrID == Intrinsic::amdgcn_s_get_named_barrier_state
6875 ? I.getOperand(2)
6876 : I.getOperand(1);
6877 std::optional<int64_t> BarValImm =
6878 getIConstantVRegSExtVal(BarOp.getReg(), *MRI);
6879
6880 if (!BarValImm) {
6881 // BarID = (BarOp >> 4) & 0x3F
6882 Register TmpReg0 = MRI->createVirtualRegister(&AMDGPU::SReg_32RegClass);
6883 BuildMI(*MBB, &I, DL, TII.get(AMDGPU::S_LSHR_B32), TmpReg0)
6884 .addReg(BarOp.getReg())
6885 .addImm(4u)
6886 .setOperandDead(3); // Dead scc;
6887
6888 Register TmpReg1 = MRI->createVirtualRegister(&AMDGPU::SReg_32RegClass);
6889 BuildMI(*MBB, &I, DL, TII.get(AMDGPU::S_AND_B32), TmpReg1)
6890 .addReg(TmpReg0)
6891 .addImm(0x3F)
6892 .setOperandDead(3); // Dead scc;
6893
6894 auto CopyMIB = BuildMI(*MBB, &I, DL, TII.get(AMDGPU::COPY), AMDGPU::M0)
6895 .addReg(TmpReg1);
6896 constrainSelectedInstRegOperands(*CopyMIB, TII, TRI, RBI);
6897 }
6898
6899 MachineInstrBuilder MIB;
6900 unsigned Opc = getNamedBarrierOp(BarValImm.has_value(), IntrID);
6901 MIB = BuildMI(*MBB, &I, DL, TII.get(Opc));
6902
6903 if (IntrID == Intrinsic::amdgcn_s_get_named_barrier_state) {
6904 auto DstReg = I.getOperand(0).getReg();
6905 const TargetRegisterClass *DstRC =
6906 TRI.getConstrainedRegClassForOperand(I.getOperand(0), *MRI);
6907 if (!DstRC || !RBI.constrainGenericRegister(DstReg, *DstRC, *MRI))
6908 return false;
6909 MIB.addDef(DstReg);
6910 }
6911
6912 if (BarValImm) {
6913 auto BarId = ((*BarValImm) >> 4) & 0x3F;
6914 MIB.addImm(BarId);
6915 }
6916
6917 I.eraseFromParent();
6918 return true;
6919}
6920
6921void AMDGPUInstructionSelector::renderTruncImm32(MachineInstrBuilder &MIB,
6922 const MachineInstr &MI,
6923 int OpIdx) const {
6924 assert(MI.getOpcode() == TargetOpcode::G_CONSTANT && OpIdx == -1 &&
6925 "Expected G_CONSTANT");
6926 MIB.addImm(MI.getOperand(1).getCImm()->getSExtValue());
6927}
6928
6929void AMDGPUInstructionSelector::renderNegateImm(MachineInstrBuilder &MIB,
6930 const MachineInstr &MI,
6931 int OpIdx) const {
6932 assert(MI.getOpcode() == TargetOpcode::G_CONSTANT && OpIdx == -1 &&
6933 "Expected G_CONSTANT");
6934 MIB.addImm(-MI.getOperand(1).getCImm()->getSExtValue());
6935}
6936
6937void AMDGPUInstructionSelector::renderBitcastFPImm(MachineInstrBuilder &MIB,
6938 const MachineInstr &MI,
6939 int OpIdx) const {
6940 const MachineOperand &Op = MI.getOperand(1);
6941 assert(MI.getOpcode() == TargetOpcode::G_FCONSTANT && OpIdx == -1);
6942 MIB.addImm(Op.getFPImm()->getValueAPF().bitcastToAPInt().getZExtValue());
6943}
6944
6945void AMDGPUInstructionSelector::renderPopcntImm(MachineInstrBuilder &MIB,
6946 const MachineInstr &MI,
6947 int OpIdx) const {
6948 assert(MI.getOpcode() == TargetOpcode::G_CONSTANT && OpIdx == -1 &&
6949 "Expected G_CONSTANT");
6950 MIB.addImm(MI.getOperand(1).getCImm()->getValue().popcount());
6951}
6952
6953/// This only really exists to satisfy DAG type checking machinery, so is a
6954/// no-op here.
6955void AMDGPUInstructionSelector::renderTruncTImm(MachineInstrBuilder &MIB,
6956 const MachineInstr &MI,
6957 int OpIdx) const {
6958 const MachineOperand &Op = MI.getOperand(OpIdx);
6959 int64_t Imm;
6960 if (Op.isReg() && mi_match(Op.getReg(), *MRI, m_ICst(Imm)))
6961 MIB.addImm(Imm);
6962 else
6963 MIB.addImm(Op.getImm());
6964}
6965
6966void AMDGPUInstructionSelector::renderZextBoolTImm(MachineInstrBuilder &MIB,
6967 const MachineInstr &MI,
6968 int OpIdx) const {
6969 MIB.addImm(MI.getOperand(OpIdx).getImm() != 0);
6970}
6971
6972void AMDGPUInstructionSelector::renderOpSelTImm(MachineInstrBuilder &MIB,
6973 const MachineInstr &MI,
6974 int OpIdx) const {
6975 assert(OpIdx >= 0 && "expected to match an immediate operand");
6976 MIB.addImm(MI.getOperand(OpIdx).getImm() ? (int64_t)SISrcMods::OP_SEL_0 : 0);
6977}
6978
6979void AMDGPUInstructionSelector::renderSrcAndDstSelToOpSelXForm_0_0(
6980 MachineInstrBuilder &MIB, const MachineInstr &MI, int OpIdx) const {
6981 assert(OpIdx >= 0 && "expected to match an immediate operand");
6982 MIB.addImm(
6983 (MI.getOperand(OpIdx).getImm() & 0x1) ? (int64_t)SISrcMods::OP_SEL_0 : 0);
6984}
6985
6986void AMDGPUInstructionSelector::renderSrcAndDstSelToOpSelXForm_0_1(
6987 MachineInstrBuilder &MIB, const MachineInstr &MI, int OpIdx) const {
6988 assert(OpIdx >= 0 && "expected to match an immediate operand");
6989 MIB.addImm((MI.getOperand(OpIdx).getImm() & 0x1)
6990 ? (int64_t)(SISrcMods::OP_SEL_0 | SISrcMods::DST_OP_SEL)
6991 : (int64_t)SISrcMods::DST_OP_SEL);
6992}
6993
6994void AMDGPUInstructionSelector::renderSrcAndDstSelToOpSelXForm_1_0(
6995 MachineInstrBuilder &MIB, const MachineInstr &MI, int OpIdx) const {
6996 assert(OpIdx >= 0 && "expected to match an immediate operand");
6997 MIB.addImm(
6998 (MI.getOperand(OpIdx).getImm() & 0x2) ? (int64_t)SISrcMods::OP_SEL_0 : 0);
6999}
7000
7001void AMDGPUInstructionSelector::renderSrcAndDstSelToOpSelXForm_1_1(
7002 MachineInstrBuilder &MIB, const MachineInstr &MI, int OpIdx) const {
7003 assert(OpIdx >= 0 && "expected to match an immediate operand");
7004 MIB.addImm((MI.getOperand(OpIdx).getImm() & 0x2)
7005 ? (int64_t)(SISrcMods::OP_SEL_0)
7006 : 0);
7007}
7008
7009void AMDGPUInstructionSelector::renderDstSelToOpSelXForm(
7010 MachineInstrBuilder &MIB, const MachineInstr &MI, int OpIdx) const {
7011 assert(OpIdx >= 0 && "expected to match an immediate operand");
7012 MIB.addImm(MI.getOperand(OpIdx).getImm() ? (int64_t)(SISrcMods::DST_OP_SEL)
7013 : 0);
7014}
7015
7016void AMDGPUInstructionSelector::renderSrcSelToOpSelXForm(
7017 MachineInstrBuilder &MIB, const MachineInstr &MI, int OpIdx) const {
7018 assert(OpIdx >= 0 && "expected to match an immediate operand");
7019 MIB.addImm(MI.getOperand(OpIdx).getImm() ? (int64_t)(SISrcMods::OP_SEL_0)
7020 : 0);
7021}
7022
7023void AMDGPUInstructionSelector::renderSrcAndDstSelToOpSelXForm_2_0(
7024 MachineInstrBuilder &MIB, const MachineInstr &MI, int OpIdx) const {
7025 assert(OpIdx >= 0 && "expected to match an immediate operand");
7026 MIB.addImm(
7027 (MI.getOperand(OpIdx).getImm() & 0x1) ? (int64_t)SISrcMods::OP_SEL_0 : 0);
7028}
7029
7030void AMDGPUInstructionSelector::renderDstSelToOpSel3XFormXForm(
7031 MachineInstrBuilder &MIB, const MachineInstr &MI, int OpIdx) const {
7032 assert(OpIdx >= 0 && "expected to match an immediate operand");
7033 MIB.addImm((MI.getOperand(OpIdx).getImm() & 0x2)
7034 ? (int64_t)SISrcMods::DST_OP_SEL
7035 : 0);
7036}
7037
7038void AMDGPUInstructionSelector::renderExtractCPol(MachineInstrBuilder &MIB,
7039 const MachineInstr &MI,
7040 int OpIdx) const {
7041 assert(OpIdx >= 0 && "expected to match an immediate operand");
7042 MIB.addImm(MI.getOperand(OpIdx).getImm() &
7043 (AMDGPU::isGFX12Plus(STI) ? AMDGPU::CPol::ALL
7044 : AMDGPU::CPol::ALL_pregfx12));
7045}
7046
7047void AMDGPUInstructionSelector::renderExtractSWZ(MachineInstrBuilder &MIB,
7048 const MachineInstr &MI,
7049 int OpIdx) const {
7050 assert(OpIdx >= 0 && "expected to match an immediate operand");
7051 const bool Swizzle = MI.getOperand(OpIdx).getImm() &
7052 (AMDGPU::isGFX12Plus(STI) ? AMDGPU::CPol::SWZ
7053 : AMDGPU::CPol::SWZ_pregfx12);
7054 MIB.addImm(Swizzle);
7055}
7056
7057void AMDGPUInstructionSelector::renderExtractCpolSetGLC(
7058 MachineInstrBuilder &MIB, const MachineInstr &MI, int OpIdx) const {
7059 assert(OpIdx >= 0 && "expected to match an immediate operand");
7060 const uint32_t Cpol = MI.getOperand(OpIdx).getImm() &
7061 (AMDGPU::isGFX12Plus(STI) ? AMDGPU::CPol::ALL
7062 : AMDGPU::CPol::ALL_pregfx12);
7063 MIB.addImm(Cpol | AMDGPU::CPol::GLC);
7064}
7065
7066void AMDGPUInstructionSelector::renderFrameIndex(MachineInstrBuilder &MIB,
7067 const MachineInstr &MI,
7068 int OpIdx) const {
7069 MIB.addFrameIndex(MI.getOperand(1).getIndex());
7070}
7071
7072void AMDGPUInstructionSelector::renderFPPow2ToExponent(MachineInstrBuilder &MIB,
7073 const MachineInstr &MI,
7074 int OpIdx) const {
7075 const APFloat &APF = MI.getOperand(1).getFPImm()->getValueAPF();
7076 int ExpVal = APF.getExactLog2Abs();
7077 assert(ExpVal != INT_MIN);
7078 MIB.addImm(ExpVal);
7079}
7080
7081void AMDGPUInstructionSelector::renderRoundMode(MachineInstrBuilder &MIB,
7082 const MachineInstr &MI,
7083 int OpIdx) const {
7084 // "round.towardzero" -> TowardZero 0 -> FP_ROUND_ROUND_TO_ZERO 3
7085 // "round.tonearest" -> NearestTiesToEven 1 -> FP_ROUND_ROUND_TO_NEAREST 0
7086 // "round.upward" -> TowardPositive 2 -> FP_ROUND_ROUND_TO_INF 1
7087 // "round.downward -> TowardNegative 3 -> FP_ROUND_ROUND_TO_NEGINF 2
7088 MIB.addImm((MI.getOperand(OpIdx).getImm() + 3) % 4);
7089}
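// Worked mapping for the (Imm + 3) % 4 transform above:
//   TowardZero        (0) -> 3 (FP_ROUND_ROUND_TO_ZERO)
//   NearestTiesToEven (1) -> 0 (FP_ROUND_ROUND_TO_NEAREST)
//   TowardPositive    (2) -> 1 (FP_ROUND_ROUND_TO_INF)
//   TowardNegative    (3) -> 2 (FP_ROUND_ROUND_TO_NEGINF)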
7090
7091void AMDGPUInstructionSelector::renderVOP3PModsNeg(MachineInstrBuilder &MIB,
7092 const MachineInstr &MI,
7093 int OpIdx) const {
7094 unsigned Mods = SISrcMods::OP_SEL_1;
7095 if (MI.getOperand(OpIdx).getImm())
7096 Mods ^= SISrcMods::NEG;
7097 MIB.addImm((int64_t)Mods);
7098}
7099
7100void AMDGPUInstructionSelector::renderVOP3PModsNegs(MachineInstrBuilder &MIB,
7101 const MachineInstr &MI,
7102 int OpIdx) const {
7103 unsigned Mods = SISrcMods::OP_SEL_1;
7104 if (MI.getOperand(OpIdx).getImm())
7105 Mods ^= (SISrcMods::NEG | SISrcMods::NEG_HI);
7106 MIB.addImm((int64_t)Mods);
7107}
7108
7109void AMDGPUInstructionSelector::renderVOP3PModsNegAbs(MachineInstrBuilder &MIB,
7110 const MachineInstr &MI,
7111 int OpIdx) const {
7112 unsigned Val = MI.getOperand(OpIdx).getImm();
7113 unsigned Mods = SISrcMods::OP_SEL_1; // default: none
7114 if (Val == 1) // neg
7115 Mods ^= SISrcMods::NEG;
7116 if (Val == 2) // abs
7117 Mods ^= SISrcMods::ABS;
7118 if (Val == 3) // neg and abs
7119 Mods ^= (SISrcMods::NEG | SISrcMods::ABS);
7120 MIB.addImm((int64_t)Mods);
7121}
7122
7123void AMDGPUInstructionSelector::renderPrefetchLoc(MachineInstrBuilder &MIB,
7124 const MachineInstr &MI,
7125 int OpIdx) const {
7126 uint32_t V = MI.getOperand(2).getImm();
7129 if (!Subtarget->hasSafeCUPrefetch())
7130 V = std::max(V, (uint32_t)AMDGPU::CPol::SCOPE_SE); // CU scope is unsafe
7131 MIB.addImm(V);
7132}
7133
7134/// Convert from 2-bit value to enum values used for op_sel* source modifiers.
7135void AMDGPUInstructionSelector::renderScaledMAIIntrinsicOperand(
7136 MachineInstrBuilder &MIB, const MachineInstr &MI, int OpIdx) const {
7137 unsigned Val = MI.getOperand(OpIdx).getImm();
7138 unsigned New = 0;
7139 if (Val & 0x1)
7141 if (Val & 0x2)
7143 MIB.addImm(New);
7144}
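// Example of the 2-bit conversion above: Val = 3 sets both OP_SEL_0 and
// OP_SEL_1, Val = 1 sets only OP_SEL_0, and Val = 0 adds no op_sel source
// modifiers.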
7145
7146bool AMDGPUInstructionSelector::isInlineImmediate(const APInt &Imm) const {
7147 return TII.isInlineConstant(Imm);
7148}
7149
7150bool AMDGPUInstructionSelector::isInlineImmediate(const APFloat &Imm) const {
7151 return TII.isInlineConstant(Imm);
7152}