1//===- SIInstrInfo.cpp - SI Instruction Information ----------------------===//
2//
3// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4// See https://llvm.org/LICENSE.txt for license information.
5// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6//
7//===----------------------------------------------------------------------===//
8//
9/// \file
10/// SI Implementation of TargetInstrInfo.
11//
12//===----------------------------------------------------------------------===//
13
14#include "SIInstrInfo.h"
15#include "AMDGPU.h"
16#include "AMDGPUInstrInfo.h"
17#include "AMDGPULaneMaskUtils.h"
18#include "GCNHazardRecognizer.h"
19#include "GCNSubtarget.h"
22#include "llvm/ADT/STLExtras.h"
33#include "llvm/IR/IntrinsicsAMDGPU.h"
34#include "llvm/MC/MCContext.h"
37
38using namespace llvm;
39
40#define DEBUG_TYPE "si-instr-info"
41
42#define GET_INSTRINFO_CTOR_DTOR
43#include "AMDGPUGenInstrInfo.inc"
44
45namespace llvm::AMDGPU {
46#define GET_D16ImageDimIntrinsics_IMPL
47#define GET_ImageDimIntrinsicTable_IMPL
48#define GET_RsrcIntrinsics_IMPL
49#include "AMDGPUGenSearchableTables.inc"
50} // namespace llvm::AMDGPU
51
52// Must be at least 4 to be able to branch over the minimum unconditional
53// branch code. This is only for making it possible to write reasonably small
54// tests for long branches.
55static cl::opt<unsigned>
56 BranchOffsetBits("amdgpu-s-branch-bits", cl::ReallyHidden, cl::init(16),
57 cl::desc("Restrict range of branch instructions (DEBUG)"));
58
59static cl::opt<bool> Fix16BitCopies(
60 "amdgpu-fix-16-bit-physreg-copies",
61 cl::desc("Fix copies between 32 and 16 bit registers by extending to 32 bit"),
62 cl::init(true),
63 cl::ReallyHidden);
64
65SIInstrInfo::SIInstrInfo(const GCNSubtarget &ST)
66 : AMDGPUGenInstrInfo(ST, AMDGPU::ADJCALLSTACKUP, AMDGPU::ADJCALLSTACKDOWN),
67 RI(ST), ST(ST) {
68 SchedModel.init(&ST);
69}
70
71//===----------------------------------------------------------------------===//
72// TargetInstrInfo callbacks
73//===----------------------------------------------------------------------===//
74
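// Counts Node's operands while ignoring any trailing glue operands, which
// only model scheduling dependencies and are not real inputs.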
75static unsigned getNumOperandsNoGlue(SDNode *Node) {
76 unsigned N = Node->getNumOperands();
77 while (N && Node->getOperand(N - 1).getValueType() == MVT::Glue)
78 --N;
79 return N;
80}
81
82/// Returns true if both nodes have the same value for the given
83/// operand \p OpName, or if both nodes do not have this operand.
84static bool nodesHaveSameOperandValue(SDNode *N0, SDNode *N1,
85 AMDGPU::OpName OpName) {
86 unsigned Opc0 = N0->getMachineOpcode();
87 unsigned Opc1 = N1->getMachineOpcode();
88
89 int Op0Idx = AMDGPU::getNamedOperandIdx(Opc0, OpName);
90 int Op1Idx = AMDGPU::getNamedOperandIdx(Opc1, OpName);
91
92 if (Op0Idx == -1 && Op1Idx == -1)
93 return true;
94
95
96 if ((Op0Idx == -1 && Op1Idx != -1) ||
97 (Op1Idx == -1 && Op0Idx != -1))
98 return false;
99
100 // getNamedOperandIdx returns the index for the MachineInstr's operands,
101 // which includes the result as the first operand. We are indexing into the
102 // MachineSDNode's operands, so we need to skip the result operand to get
103 // the real index.
104 --Op0Idx;
105 --Op1Idx;
106
107 return N0->getOperand(Op0Idx) == N1->getOperand(Op1Idx);
108}
109
110static bool canRemat(const MachineInstr &MI) {
111
112 if (SIInstrInfo::isVOP1(MI) || SIInstrInfo::isVOP2(MI) ||
113 SIInstrInfo::isVOP3(MI) || SIInstrInfo::isSDWA(MI) ||
114 SIInstrInfo::isSALU(MI))
115 return true;
116
117 if (SIInstrInfo::isSMRD(MI)) {
118 return !MI.memoperands_empty() &&
119 llvm::all_of(MI.memoperands(), [](const MachineMemOperand *MMO) {
120 return MMO->isLoad() && MMO->isInvariant();
121 });
122 }
123
124 return false;
125}
126
127bool SIInstrInfo::isReallyTriviallyReMaterializable(
128 const MachineInstr &MI) const {
129
130 if (canRemat(MI)) {
131 // Normally a VALU use of exec would block rematerialization, but an
132 // implicit exec read is OK here since every VALU instruction has one.
133 // We want all of the generic logic for this except for that check.
134
135 // Another potential implicit use is the mode register. The core logic of
136 // the RA will not attempt rematerialization if mode is set anywhere
137 // in the function; otherwise it is safe since mode is not changed.
138
139 // This differs from the generic method, which does not allow
140 // rematerialization if there are any virtual register uses. We allow it,
141 // so this method covers SOP instructions as well.
142 if (!MI.hasImplicitDef() &&
143 MI.getNumImplicitOperands() == MI.getDesc().implicit_uses().size() &&
144 !MI.mayRaiseFPException())
145 return true;
146 }
147
148 return TargetInstrInfo::isReallyTriviallyReMaterializable(MI);
149}
150
151// Returns true if the scalar result of a VALU instruction depends on exec.
152bool SIInstrInfo::resultDependsOnExec(const MachineInstr &MI) const {
153 // Ignore comparisons which are only used masked with exec.
154 // This allows some hoisting/sinking of VALU comparisons.
155 if (MI.isCompare()) {
156 const MachineOperand *Dst = getNamedOperand(MI, AMDGPU::OpName::sdst);
157 if (!Dst)
158 return true;
159
160 Register DstReg = Dst->getReg();
161 if (!DstReg.isVirtual())
162 return true;
163
164 const MachineRegisterInfo &MRI = MI.getParent()->getParent()->getRegInfo();
165 for (MachineInstr &Use : MRI.use_nodbg_instructions(DstReg)) {
166 switch (Use.getOpcode()) {
167 case AMDGPU::S_AND_SAVEEXEC_B32:
168 case AMDGPU::S_AND_SAVEEXEC_B64:
169 break;
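      // A plain S_AND of the compare result only counts as an exec-masked use
      // when exec is one of its operands; otherwise the scalar result escapes
      // the mask and the comparison really does depend on exec.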
170 case AMDGPU::S_AND_B32:
171 case AMDGPU::S_AND_B64:
172 if (!Use.readsRegister(AMDGPU::EXEC, /*TRI=*/nullptr))
173 return true;
174 break;
175 default:
176 return true;
177 }
178 }
179 return false;
180 }
181
182 switch (MI.getOpcode()) {
183 default:
184 break;
185 case AMDGPU::V_READFIRSTLANE_B32:
186 return true;
187 }
188
189 return false;
190}
191
192bool SIInstrInfo::isIgnorableUse(const MachineOperand &MO) const {
193 // Any implicit use of exec by VALU is not a real register read.
194 return MO.getReg() == AMDGPU::EXEC && MO.isImplicit() &&
195 isVALU(*MO.getParent()) && !resultDependsOnExec(*MO.getParent());
196}
197
198bool SIInstrInfo::isSafeToSink(MachineInstr &MI,
199 MachineBasicBlock *SuccToSinkTo,
200 MachineCycleInfo *CI) const {
201 // Allow sinking if MI edits lane mask (divergent i1 in sgpr).
202 if (MI.getOpcode() == AMDGPU::SI_IF_BREAK)
203 return true;
204
205 MachineRegisterInfo &MRI = MI.getMF()->getRegInfo();
206 // Check if sinking of MI would create a temporal divergent use.
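  // A temporal divergent use reads an SGPR that is defined inside a cycle
  // with a divergent exit from outside that cycle: lanes may leave the cycle
  // in different iterations, so the value is no longer uniform at the use.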
207 for (auto Op : MI.uses()) {
208 if (Op.isReg() && Op.getReg().isVirtual() &&
209 RI.isSGPRClass(MRI.getRegClass(Op.getReg()))) {
210 MachineInstr *SgprDef = MRI.getVRegDef(Op.getReg());
211
212 // SgprDef defined inside cycle
213 MachineCycle *FromCycle = CI->getCycle(SgprDef->getParent());
214 if (FromCycle == nullptr)
215 continue;
216
217 MachineCycle *ToCycle = CI->getCycle(SuccToSinkTo);
218 // Check if there is a FromCycle that contains SgprDef's basic block but
219 // does not contain SuccToSinkTo and also has divergent exit condition.
220 while (FromCycle && !FromCycle->contains(ToCycle)) {
221 SmallVector<MachineBasicBlock *, 1> ExitingBlocks;
222 FromCycle->getExitingBlocks(ExitingBlocks);
223
224 // FromCycle has divergent exit condition.
225 for (MachineBasicBlock *ExitingBlock : ExitingBlocks) {
226 if (hasDivergentBranch(ExitingBlock))
227 return false;
228 }
229
230 FromCycle = FromCycle->getParentCycle();
231 }
232 }
233 }
234
235 return true;
236}
237
238bool SIInstrInfo::areLoadsFromSameBasePtr(SDNode *Load0, SDNode *Load1,
239 int64_t &Offset0,
240 int64_t &Offset1) const {
241 if (!Load0->isMachineOpcode() || !Load1->isMachineOpcode())
242 return false;
243
244 unsigned Opc0 = Load0->getMachineOpcode();
245 unsigned Opc1 = Load1->getMachineOpcode();
246
247 // Make sure both are actually loads.
248 if (!get(Opc0).mayLoad() || !get(Opc1).mayLoad())
249 return false;
250
251 // A mayLoad instruction without a def is not a load. Likely a prefetch.
252 if (!get(Opc0).getNumDefs() || !get(Opc1).getNumDefs())
253 return false;
254
255 if (isDS(Opc0) && isDS(Opc1)) {
256
257 // FIXME: Handle this case:
258 if (getNumOperandsNoGlue(Load0) != getNumOperandsNoGlue(Load1))
259 return false;
260
261 // Check base reg.
262 if (Load0->getOperand(0) != Load1->getOperand(0))
263 return false;
264
265 // Skip read2 / write2 variants for simplicity.
266 // TODO: We should report true if the used offsets are adjacent (excluding
267 // the st64 variants).
268 int Offset0Idx = AMDGPU::getNamedOperandIdx(Opc0, AMDGPU::OpName::offset);
269 int Offset1Idx = AMDGPU::getNamedOperandIdx(Opc1, AMDGPU::OpName::offset);
270 if (Offset0Idx == -1 || Offset1Idx == -1)
271 return false;
272
273 // XXX - be careful of dataless loads
274 // getNamedOperandIdx returns the index for MachineInstrs. Since they
275 // include the output in the operand list, but SDNodes don't, we need to
276 // subtract the index by one.
277 Offset0Idx -= get(Opc0).NumDefs;
278 Offset1Idx -= get(Opc1).NumDefs;
279 Offset0 = Load0->getConstantOperandVal(Offset0Idx);
280 Offset1 = Load1->getConstantOperandVal(Offset1Idx);
281 return true;
282 }
283
284 if (isSMRD(Opc0) && isSMRD(Opc1)) {
285 // Skip time and cache invalidation instructions.
286 if (!AMDGPU::hasNamedOperand(Opc0, AMDGPU::OpName::sbase) ||
287 !AMDGPU::hasNamedOperand(Opc1, AMDGPU::OpName::sbase))
288 return false;
289
290 unsigned NumOps = getNumOperandsNoGlue(Load0);
291 if (NumOps != getNumOperandsNoGlue(Load1))
292 return false;
293
294 // Check base reg.
295 if (Load0->getOperand(0) != Load1->getOperand(0))
296 return false;
297
298 // Match register offsets, if both register and immediate offsets present.
299 assert(NumOps == 4 || NumOps == 5);
300 if (NumOps == 5 && Load0->getOperand(1) != Load1->getOperand(1))
301 return false;
302
303 const ConstantSDNode *Load0Offset =
304 dyn_cast<ConstantSDNode>(Load0->getOperand(NumOps - 3));
305 const ConstantSDNode *Load1Offset =
306 dyn_cast<ConstantSDNode>(Load1->getOperand(NumOps - 3));
307
308 if (!Load0Offset || !Load1Offset)
309 return false;
310
311 Offset0 = Load0Offset->getZExtValue();
312 Offset1 = Load1Offset->getZExtValue();
313 return true;
314 }
315
316 // MUBUF and MTBUF can access the same addresses.
317 if ((isMUBUF(Opc0) || isMTBUF(Opc0)) && (isMUBUF(Opc1) || isMTBUF(Opc1))) {
318
319 // MUBUF and MTBUF have vaddr at different indices.
320 if (!nodesHaveSameOperandValue(Load0, Load1, AMDGPU::OpName::soffset) ||
321 !nodesHaveSameOperandValue(Load0, Load1, AMDGPU::OpName::vaddr) ||
322 !nodesHaveSameOperandValue(Load0, Load1, AMDGPU::OpName::srsrc))
323 return false;
324
325 int OffIdx0 = AMDGPU::getNamedOperandIdx(Opc0, AMDGPU::OpName::offset);
326 int OffIdx1 = AMDGPU::getNamedOperandIdx(Opc1, AMDGPU::OpName::offset);
327
328 if (OffIdx0 == -1 || OffIdx1 == -1)
329 return false;
330
331 // getNamedOperandIdx returns the index for MachineInstrs. Since they
332 // include the output in the operand list, but SDNodes don't, we need to
333 // subtract the index by one.
334 OffIdx0 -= get(Opc0).NumDefs;
335 OffIdx1 -= get(Opc1).NumDefs;
336
337 SDValue Off0 = Load0->getOperand(OffIdx0);
338 SDValue Off1 = Load1->getOperand(OffIdx1);
339
340 // The offset might be a FrameIndexSDNode.
341 if (!isa<ConstantSDNode>(Off0) || !isa<ConstantSDNode>(Off1))
342 return false;
343
344 Offset0 = Off0->getAsZExtVal();
345 Offset1 = Off1->getAsZExtVal();
346 return true;
347 }
348
349 return false;
350}
351
352static bool isStride64(unsigned Opc) {
353 switch (Opc) {
354 case AMDGPU::DS_READ2ST64_B32:
355 case AMDGPU::DS_READ2ST64_B64:
356 case AMDGPU::DS_WRITE2ST64_B32:
357 case AMDGPU::DS_WRITE2ST64_B64:
358 return true;
359 default:
360 return false;
361 }
362}
363
364bool SIInstrInfo::getMemOperandsWithOffsetWidth(
365 const MachineInstr &LdSt, SmallVectorImpl<const MachineOperand *> &BaseOps,
366 int64_t &Offset, bool &OffsetIsScalable, LocationSize &Width,
367 const TargetRegisterInfo *TRI) const {
368 if (!LdSt.mayLoadOrStore())
369 return false;
370
371 unsigned Opc = LdSt.getOpcode();
372 OffsetIsScalable = false;
373 const MachineOperand *BaseOp, *OffsetOp;
374 int DataOpIdx;
375
376 if (isDS(LdSt)) {
377 BaseOp = getNamedOperand(LdSt, AMDGPU::OpName::addr);
378 OffsetOp = getNamedOperand(LdSt, AMDGPU::OpName::offset);
379 if (OffsetOp) {
380 // Normal, single offset LDS instruction.
381 if (!BaseOp) {
382 // DS_CONSUME/DS_APPEND use M0 for the base address.
383 // TODO: find the implicit use operand for M0 and use that as BaseOp?
384 return false;
385 }
386 BaseOps.push_back(BaseOp);
387 Offset = OffsetOp->getImm();
388 // Get appropriate operand, and compute width accordingly.
389 DataOpIdx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::vdst);
390 if (DataOpIdx == -1)
391 DataOpIdx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::data0);
392 if (Opc == AMDGPU::DS_ATOMIC_ASYNC_BARRIER_ARRIVE_B64)
393 Width = LocationSize::precise(64);
394 else
395 Width = LocationSize::precise(getOpSize(LdSt, DataOpIdx));
396 } else {
397 // The 2 offset instructions use offset0 and offset1 instead. We can treat
398 // these as a load with a single offset if the 2 offsets are consecutive.
399 // We will use this for some partially aligned loads.
400 const MachineOperand *Offset0Op =
401 getNamedOperand(LdSt, AMDGPU::OpName::offset0);
402 const MachineOperand *Offset1Op =
403 getNamedOperand(LdSt, AMDGPU::OpName::offset1);
404
405 unsigned Offset0 = Offset0Op->getImm() & 0xff;
406 unsigned Offset1 = Offset1Op->getImm() & 0xff;
407 if (Offset0 + 1 != Offset1)
408 return false;
409
410 // Each of these offsets is in element sized units, so we need to convert
411 // to bytes of the individual reads.
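      // For example, a ds_read2_b32 with offset0=4 and offset1=5 loads a
      // 64-bit vdst, so EltSize is 4 and the pair is reported as a single
      // access at Offset = 16 bytes with an 8-byte width.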
412
413 unsigned EltSize;
414 if (LdSt.mayLoad())
415 EltSize = TRI->getRegSizeInBits(*getOpRegClass(LdSt, 0)) / 16;
416 else {
417 assert(LdSt.mayStore());
418 int Data0Idx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::data0);
419 EltSize = TRI->getRegSizeInBits(*getOpRegClass(LdSt, Data0Idx)) / 8;
420 }
421
422 if (isStride64(Opc))
423 EltSize *= 64;
424
425 BaseOps.push_back(BaseOp);
426 Offset = EltSize * Offset0;
427 // Get appropriate operand(s), and compute width accordingly.
428 DataOpIdx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::vdst);
429 if (DataOpIdx == -1) {
430 DataOpIdx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::data0);
431 Width = LocationSize::precise(getOpSize(LdSt, DataOpIdx));
432 DataOpIdx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::data1);
433 Width = LocationSize::precise(
434 Width.getValue() + TypeSize::getFixed(getOpSize(LdSt, DataOpIdx)));
435 } else {
436 Width = LocationSize::precise(getOpSize(LdSt, DataOpIdx));
437 }
438 }
439 return true;
440 }
441
442 if (isMUBUF(LdSt) || isMTBUF(LdSt)) {
443 const MachineOperand *RSrc = getNamedOperand(LdSt, AMDGPU::OpName::srsrc);
444 if (!RSrc) // e.g. BUFFER_WBINVL1_VOL
445 return false;
446 BaseOps.push_back(RSrc);
447 BaseOp = getNamedOperand(LdSt, AMDGPU::OpName::vaddr);
448 if (BaseOp && !BaseOp->isFI())
449 BaseOps.push_back(BaseOp);
450 const MachineOperand *OffsetImm =
451 getNamedOperand(LdSt, AMDGPU::OpName::offset);
452 Offset = OffsetImm->getImm();
453 const MachineOperand *SOffset =
454 getNamedOperand(LdSt, AMDGPU::OpName::soffset);
455 if (SOffset) {
456 if (SOffset->isReg())
457 BaseOps.push_back(SOffset);
458 else
459 Offset += SOffset->getImm();
460 }
461 // Get appropriate operand, and compute width accordingly.
462 DataOpIdx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::vdst);
463 if (DataOpIdx == -1)
464 DataOpIdx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::vdata);
465 if (DataOpIdx == -1) // LDS DMA
466 return false;
467 Width = LocationSize::precise(getOpSize(LdSt, DataOpIdx));
468 return true;
469 }
470
471 if (isImage(LdSt)) {
472 auto RsrcOpName =
473 isMIMG(LdSt) ? AMDGPU::OpName::srsrc : AMDGPU::OpName::rsrc;
474 int SRsrcIdx = AMDGPU::getNamedOperandIdx(Opc, RsrcOpName);
475 BaseOps.push_back(&LdSt.getOperand(SRsrcIdx));
476 int VAddr0Idx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::vaddr0);
477 if (VAddr0Idx >= 0) {
478 // GFX10 possible NSA encoding.
479 for (int I = VAddr0Idx; I < SRsrcIdx; ++I)
480 BaseOps.push_back(&LdSt.getOperand(I));
481 } else {
482 BaseOps.push_back(getNamedOperand(LdSt, AMDGPU::OpName::vaddr));
483 }
484 Offset = 0;
485 // Get appropriate operand, and compute width accordingly.
486 DataOpIdx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::vdata);
487 if (DataOpIdx == -1)
488 return false; // no return sampler
489 Width = LocationSize::precise(getOpSize(LdSt, DataOpIdx));
490 return true;
491 }
492
493 if (isSMRD(LdSt)) {
494 BaseOp = getNamedOperand(LdSt, AMDGPU::OpName::sbase);
495 if (!BaseOp) // e.g. S_MEMTIME
496 return false;
497 BaseOps.push_back(BaseOp);
498 OffsetOp = getNamedOperand(LdSt, AMDGPU::OpName::offset);
499 Offset = OffsetOp ? OffsetOp->getImm() : 0;
500 // Get appropriate operand, and compute width accordingly.
501 DataOpIdx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::sdst);
502 if (DataOpIdx == -1)
503 return false;
504 Width = LocationSize::precise(getOpSize(LdSt, DataOpIdx));
505 return true;
506 }
507
508 if (isFLAT(LdSt)) {
509 // Instructions have either vaddr or saddr or both or none.
510 BaseOp = getNamedOperand(LdSt, AMDGPU::OpName::vaddr);
511 if (BaseOp)
512 BaseOps.push_back(BaseOp);
513 BaseOp = getNamedOperand(LdSt, AMDGPU::OpName::saddr);
514 if (BaseOp)
515 BaseOps.push_back(BaseOp);
516 Offset = getNamedOperand(LdSt, AMDGPU::OpName::offset)->getImm();
517 // Get appropriate operand, and compute width accordingly.
518 DataOpIdx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::vdst);
519 if (DataOpIdx == -1)
520 DataOpIdx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::vdata);
521 if (DataOpIdx == -1) // LDS DMA
522 return false;
523 Width = LocationSize::precise(getOpSize(LdSt, DataOpIdx));
524 return true;
525 }
526
527 return false;
528}
529
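// Return true if the two memory operations are known to share a base address,
// either because their first base operands are identical or because their
// single memory operands resolve to the same underlying IR object.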
530static bool memOpsHaveSameBasePtr(const MachineInstr &MI1,
531 ArrayRef<const MachineOperand *> BaseOps1,
532 const MachineInstr &MI2,
533 ArrayRef<const MachineOperand *> BaseOps2) {
534 // Only examine the first "base" operand of each instruction, on the
535 // assumption that it represents the real base address of the memory access.
536 // Other operands are typically offsets or indices from this base address.
537 if (BaseOps1.front()->isIdenticalTo(*BaseOps2.front()))
538 return true;
539
540 if (!MI1.hasOneMemOperand() || !MI2.hasOneMemOperand())
541 return false;
542
543 auto *MO1 = *MI1.memoperands_begin();
544 auto *MO2 = *MI2.memoperands_begin();
545 if (MO1->getAddrSpace() != MO2->getAddrSpace())
546 return false;
547
548 const auto *Base1 = MO1->getValue();
549 const auto *Base2 = MO2->getValue();
550 if (!Base1 || !Base2)
551 return false;
552 Base1 = getUnderlyingObject(Base1);
553 Base2 = getUnderlyingObject(Base2);
554
555 if (isa<UndefValue>(Base1) || isa<UndefValue>(Base2))
556 return false;
557
558 return Base1 == Base2;
559}
560
561bool SIInstrInfo::shouldClusterMemOps(ArrayRef<const MachineOperand *> BaseOps1,
562 int64_t Offset1, bool OffsetIsScalable1,
563 ArrayRef<const MachineOperand *> BaseOps2,
564 int64_t Offset2, bool OffsetIsScalable2,
565 unsigned ClusterSize,
566 unsigned NumBytes) const {
567 // If the mem ops (to be clustered) do not have the same base ptr, then they
568 // should not be clustered
569 unsigned MaxMemoryClusterDWords = DefaultMemoryClusterDWordsLimit;
570 if (!BaseOps1.empty() && !BaseOps2.empty()) {
571 const MachineInstr &FirstLdSt = *BaseOps1.front()->getParent();
572 const MachineInstr &SecondLdSt = *BaseOps2.front()->getParent();
573 if (!memOpsHaveSameBasePtr(FirstLdSt, BaseOps1, SecondLdSt, BaseOps2))
574 return false;
575
576 const SIMachineFunctionInfo *MFI =
577 FirstLdSt.getMF()->getInfo<SIMachineFunctionInfo>();
578 MaxMemoryClusterDWords = MFI->getMaxMemoryClusterDWords();
579 } else if (!BaseOps1.empty() || !BaseOps2.empty()) {
580 // If only one base op is empty, they do not have the same base ptr
581 return false;
582 }
583
584 // To avoid register pressure, the number of DWORDs loaded together by all
585 // clustered mem ops should not, on average, exceed MaxMemoryClusterDWords.
586 // This is an empirical value based on certain observations and
587 // performance-related experiments.
588 // The benefit of this heuristic is that it avoids clustering too many
589 // sub-word loads while also avoiding clustering of wide loads. Below is a
590 // brief summary of how the heuristic behaves for various `LoadSize` values
591 // when MaxMemoryClusterDWords is 8.
592 //
593 // (1) 1 <= LoadSize <= 4: cluster at max 8 mem ops
594 // (2) 5 <= LoadSize <= 8: cluster at max 4 mem ops
595 // (3) 9 <= LoadSize <= 12: cluster at max 2 mem ops
596 // (4) 13 <= LoadSize <= 16: cluster at max 2 mem ops
597 // (5) LoadSize >= 17: do not cluster
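  // For example, ClusterSize = 4 and NumBytes = 48 give LoadSize = 12 and
  // NumDWords = 12, which exceeds a limit of 8, so such a cluster is rejected.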
598 const unsigned LoadSize = NumBytes / ClusterSize;
599 const unsigned NumDWords = ((LoadSize + 3) / 4) * ClusterSize;
600 return NumDWords <= MaxMemoryClusterDWords;
601}
602
603// FIXME: This behaves strangely. If, for example, you have 32 loads + stores,
604// the first 16 loads will be interleaved with the stores, and the next 16 will
605// be clustered as expected. It should really split into two batches of 16.
606//
607// Loads are clustered until this returns false, rather than trying to schedule
608// groups of stores. This also means we have to deal with saying different
609// address space loads should be clustered, and ones which might cause bank
610// conflicts.
611//
612// This might be deprecated so it might not be worth that much effort to fix.
613bool SIInstrInfo::shouldScheduleLoadsNear(SDNode *Load0, SDNode *Load1,
614 int64_t Offset0, int64_t Offset1,
615 unsigned NumLoads) const {
616 assert(Offset1 > Offset0 &&
617 "Second offset should be larger than first offset!");
618 // If we have fewer than 16 loads in a row, and the offsets are within 64
619 // bytes, then schedule together.
620
621 // A cacheline is 64 bytes (for global memory).
622 return (NumLoads <= 16 && (Offset1 - Offset0) < 64);
623}
624
625static void reportIllegalCopy(const SIInstrInfo *TII, MachineBasicBlock &MBB,
626 MachineBasicBlock::iterator MI,
627 const DebugLoc &DL, MCRegister DestReg,
628 MCRegister SrcReg, bool KillSrc,
629 const char *Msg = "illegal VGPR to SGPR copy") {
630 MachineFunction *MF = MBB.getParent();
631
632 LLVMContext &C = MF->getFunction().getContext();
633 C.diagnose(DiagnosticInfoUnsupported(MF->getFunction(), Msg, DL, DS_Error));
634
635 BuildMI(MBB, MI, DL, TII->get(AMDGPU::SI_ILLEGAL_COPY), DestReg)
636 .addReg(SrcReg, getKillRegState(KillSrc));
637}
638
639/// Handle copying from SGPR to AGPR, or from AGPR to AGPR on GFX908. It is not
640/// possible to have a direct copy in these cases on GFX908, so an intermediate
641/// VGPR copy is required.
642static void indirectCopyToAGPR(const SIInstrInfo &TII,
643 MachineBasicBlock &MBB,
644 MachineBasicBlock::iterator MI,
645 const DebugLoc &DL, MCRegister DestReg,
646 MCRegister SrcReg, bool KillSrc,
647 RegScavenger &RS, bool RegsOverlap,
648 Register ImpDefSuperReg = Register(),
649 Register ImpUseSuperReg = Register()) {
650 assert((TII.getSubtarget().hasMAIInsts() &&
651 !TII.getSubtarget().hasGFX90AInsts()) &&
652 "Expected GFX908 subtarget.");
653
654 assert((AMDGPU::SReg_32RegClass.contains(SrcReg) ||
655 AMDGPU::AGPR_32RegClass.contains(SrcReg)) &&
656 "Source register of the copy should be either an SGPR or an AGPR.");
657
658 assert(AMDGPU::AGPR_32RegClass.contains(DestReg) &&
659 "Destination register of the copy should be an AGPR.");
660
661 const SIRegisterInfo &RI = TII.getRegisterInfo();
662
663 // First try to find defining accvgpr_write to avoid temporary registers.
664 // In the case of copies of overlapping AGPRs, we conservatively do not
665 // reuse previous accvgpr_writes. Otherwise, we may incorrectly pick up
666 // an accvgpr_write used for this same copy due to implicit-defs
667 if (!RegsOverlap) {
668 for (auto Def = MI, E = MBB.begin(); Def != E; ) {
669 --Def;
670
671 if (!Def->modifiesRegister(SrcReg, &RI))
672 continue;
673
674 if (Def->getOpcode() != AMDGPU::V_ACCVGPR_WRITE_B32_e64 ||
675 Def->getOperand(0).getReg() != SrcReg)
676 break;
677
678 MachineOperand &DefOp = Def->getOperand(1);
679 assert(DefOp.isReg() || DefOp.isImm());
680
681 if (DefOp.isReg()) {
682 bool SafeToPropagate = true;
683 // Check that register source operand is not clobbered before MI.
684 // Immediate operands are always safe to propagate.
685 for (auto I = Def; I != MI && SafeToPropagate; ++I)
686 if (I->modifiesRegister(DefOp.getReg(), &RI))
687 SafeToPropagate = false;
688
689 if (!SafeToPropagate)
690 break;
691
692 for (auto I = Def; I != MI; ++I)
693 I->clearRegisterKills(DefOp.getReg(), &RI);
694 }
695
696 MachineInstrBuilder Builder =
697 BuildMI(MBB, MI, DL, TII.get(AMDGPU::V_ACCVGPR_WRITE_B32_e64), DestReg)
698 .add(DefOp);
699 if (ImpDefSuperReg)
700 Builder.addReg(ImpDefSuperReg, RegState::Define | RegState::Implicit);
701
702 if (ImpUseSuperReg) {
703 Builder.addReg(ImpUseSuperReg,
704 getKillRegState(KillSrc) | RegState::Implicit);
705 }
706
707 return;
708 }
709 }
710
711 RS.enterBasicBlockEnd(MBB);
712 RS.backward(std::next(MI));
713
714 // Ideally we want to have three registers for a long reg_sequence copy
715 // to hide 2 waitstates between v_mov_b32 and accvgpr_write.
716 unsigned MaxVGPRs = RI.getRegPressureLimit(&AMDGPU::VGPR_32RegClass,
717 *MBB.getParent());
718
719 // Registers in the sequence are allocated contiguously so we can just
720 // use register number to pick one of three round-robin temps.
721 unsigned RegNo = (DestReg - AMDGPU::AGPR0) % 3;
722 Register Tmp =
723 MBB.getParent()->getInfo<SIMachineFunctionInfo>()->getVGPRForAGPRCopy();
724 assert(MBB.getParent()->getRegInfo().isReserved(Tmp) &&
725 "VGPR used for an intermediate copy should have been reserved.");
726
727 // Only loop through if there are any free registers left. We don't want to
728 // spill.
729 while (RegNo--) {
730 Register Tmp2 = RS.scavengeRegisterBackwards(AMDGPU::VGPR_32RegClass, MI,
731 /* RestoreAfter */ false, 0,
732 /* AllowSpill */ false);
733 if (!Tmp2 || RI.getHWRegIndex(Tmp2) >= MaxVGPRs)
734 break;
735 Tmp = Tmp2;
736 RS.setRegUsed(Tmp);
737 }
738
739 // Insert copy to temporary VGPR.
740 unsigned TmpCopyOp = AMDGPU::V_MOV_B32_e32;
741 if (AMDGPU::AGPR_32RegClass.contains(SrcReg)) {
742 TmpCopyOp = AMDGPU::V_ACCVGPR_READ_B32_e64;
743 } else {
744 assert(AMDGPU::SReg_32RegClass.contains(SrcReg));
745 }
746
747 MachineInstrBuilder UseBuilder = BuildMI(MBB, MI, DL, TII.get(TmpCopyOp), Tmp)
748 .addReg(SrcReg, getKillRegState(KillSrc));
749 if (ImpUseSuperReg) {
750 UseBuilder.addReg(ImpUseSuperReg,
751 getKillRegState(KillSrc) | RegState::Implicit);
752 }
753
754 MachineInstrBuilder DefBuilder
755 = BuildMI(MBB, MI, DL, TII.get(AMDGPU::V_ACCVGPR_WRITE_B32_e64), DestReg)
756 .addReg(Tmp, RegState::Kill);
757
758 if (ImpDefSuperReg)
759 DefBuilder.addReg(ImpDefSuperReg, RegState::Define | RegState::Implicit);
760}
761
762static void expandSGPRCopy(const SIInstrInfo &TII, MachineBasicBlock &MBB,
763 MachineBasicBlock::iterator MI, const DebugLoc &DL,
764 MCRegister DestReg, MCRegister SrcReg, bool KillSrc,
765 const TargetRegisterClass *RC, bool Forward) {
766 const SIRegisterInfo &RI = TII.getRegisterInfo();
767 ArrayRef<int16_t> BaseIndices = RI.getRegSplitParts(RC, 4);
768 MachineBasicBlock::iterator I = MI;
769 MachineInstr *FirstMI = nullptr, *LastMI = nullptr;
770
771 for (unsigned Idx = 0; Idx < BaseIndices.size(); ++Idx) {
772 int16_t SubIdx = BaseIndices[Idx];
773 Register DestSubReg = RI.getSubReg(DestReg, SubIdx);
774 Register SrcSubReg = RI.getSubReg(SrcReg, SubIdx);
775 assert(DestSubReg && SrcSubReg && "Failed to find subregs!");
776 unsigned Opcode = AMDGPU::S_MOV_B32;
777
778 // Is SGPR aligned? If so try to combine with next.
779 bool AlignedDest = ((DestSubReg - AMDGPU::SGPR0) % 2) == 0;
780 bool AlignedSrc = ((SrcSubReg - AMDGPU::SGPR0) % 2) == 0;
781 if (AlignedDest && AlignedSrc && (Idx + 1 < BaseIndices.size())) {
782 // Can use SGPR64 copy
783 unsigned Channel = RI.getChannelFromSubReg(SubIdx);
784 SubIdx = RI.getSubRegFromChannel(Channel, 2);
785 DestSubReg = RI.getSubReg(DestReg, SubIdx);
786 SrcSubReg = RI.getSubReg(SrcReg, SubIdx);
787 assert(DestSubReg && SrcSubReg && "Failed to find subregs!");
788 Opcode = AMDGPU::S_MOV_B64;
789 Idx++;
790 }
791
792 LastMI = BuildMI(MBB, I, DL, TII.get(Opcode), DestSubReg)
793 .addReg(SrcSubReg)
794 .addReg(SrcReg, RegState::Implicit);
795
796 if (!FirstMI)
797 FirstMI = LastMI;
798
799 if (!Forward)
800 I--;
801 }
802
803 assert(FirstMI && LastMI);
804 if (!Forward)
805 std::swap(FirstMI, LastMI);
806
807 FirstMI->addOperand(
808 MachineOperand::CreateReg(DestReg, true /*IsDef*/, true /*IsImp*/));
809
810 if (KillSrc)
811 LastMI->addRegisterKilled(SrcReg, &RI);
812}
813
814void SIInstrInfo::copyPhysReg(MachineBasicBlock &MBB,
815 MachineBasicBlock::iterator MI,
816 const DebugLoc &DL, Register DestReg,
817 Register SrcReg, bool KillSrc, bool RenamableDest,
818 bool RenamableSrc) const {
819 const TargetRegisterClass *RC = RI.getPhysRegBaseClass(DestReg);
820 unsigned Size = RI.getRegSizeInBits(*RC);
821 const TargetRegisterClass *SrcRC = RI.getPhysRegBaseClass(SrcReg);
822 unsigned SrcSize = RI.getRegSizeInBits(*SrcRC);
823
824 // The rest of copyPhysReg assumes Src and Dst size are the same size.
825 // TODO-GFX11_16BIT If all true 16 bit instruction patterns are completed can
826 // we remove Fix16BitCopies and this code block?
827 if (Fix16BitCopies) {
828 if (((Size == 16) != (SrcSize == 16))) {
829 // Non-VGPR Src and Dst will later be expanded back to 32 bits.
830 assert(ST.useRealTrue16Insts());
831 Register &RegToFix = (Size == 32) ? DestReg : SrcReg;
832 MCRegister SubReg = RI.getSubReg(RegToFix, AMDGPU::lo16);
833 RegToFix = SubReg;
834
835 if (DestReg == SrcReg) {
836 // Identity copy. Insert empty bundle since ExpandPostRA expects an
837 // instruction here.
838 BuildMI(MBB, MI, DL, get(AMDGPU::BUNDLE));
839 return;
840 }
841 RC = RI.getPhysRegBaseClass(DestReg);
842 Size = RI.getRegSizeInBits(*RC);
843 SrcRC = RI.getPhysRegBaseClass(SrcReg);
844 SrcSize = RI.getRegSizeInBits(*SrcRC);
845 }
846 }
847
848 if (RC == &AMDGPU::VGPR_32RegClass) {
849 assert(AMDGPU::VGPR_32RegClass.contains(SrcReg) ||
850 AMDGPU::SReg_32RegClass.contains(SrcReg) ||
851 AMDGPU::AGPR_32RegClass.contains(SrcReg));
852 unsigned Opc = AMDGPU::AGPR_32RegClass.contains(SrcReg) ?
853 AMDGPU::V_ACCVGPR_READ_B32_e64 : AMDGPU::V_MOV_B32_e32;
854 BuildMI(MBB, MI, DL, get(Opc), DestReg)
855 .addReg(SrcReg, getKillRegState(KillSrc));
856 return;
857 }
858
859 if (RC == &AMDGPU::SReg_32_XM0RegClass ||
860 RC == &AMDGPU::SReg_32RegClass) {
861 if (SrcReg == AMDGPU::SCC) {
862 BuildMI(MBB, MI, DL, get(AMDGPU::S_CSELECT_B32), DestReg)
863 .addImm(1)
864 .addImm(0);
865 return;
866 }
867
868 if (!AMDGPU::SReg_32RegClass.contains(SrcReg)) {
869 if (DestReg == AMDGPU::VCC_LO) {
870 // FIXME: Hack until VReg_1 removed.
871 assert(AMDGPU::VGPR_32RegClass.contains(SrcReg));
872 BuildMI(MBB, MI, DL, get(AMDGPU::V_CMP_NE_U32_e32))
873 .addImm(0)
874 .addReg(SrcReg, getKillRegState(KillSrc));
875 return;
876 }
877
878 reportIllegalCopy(this, MBB, MI, DL, DestReg, SrcReg, KillSrc);
879 return;
880 }
881
882 BuildMI(MBB, MI, DL, get(AMDGPU::S_MOV_B32), DestReg)
883 .addReg(SrcReg, getKillRegState(KillSrc));
884 return;
885 }
886
887 if (RC == &AMDGPU::SReg_64RegClass) {
888 if (SrcReg == AMDGPU::SCC) {
889 BuildMI(MBB, MI, DL, get(AMDGPU::S_CSELECT_B64), DestReg)
890 .addImm(1)
891 .addImm(0);
892 return;
893 }
894
895 if (!AMDGPU::SReg_64_EncodableRegClass.contains(SrcReg)) {
896 if (DestReg == AMDGPU::VCC) {
897 // FIXME: Hack until VReg_1 removed.
898 assert(AMDGPU::VGPR_32RegClass.contains(SrcReg));
899 BuildMI(MBB, MI, DL, get(AMDGPU::V_CMP_NE_U32_e32))
900 .addImm(0)
901 .addReg(SrcReg, getKillRegState(KillSrc));
902 return;
903 }
904
905 reportIllegalCopy(this, MBB, MI, DL, DestReg, SrcReg, KillSrc);
906 return;
907 }
908
909 BuildMI(MBB, MI, DL, get(AMDGPU::S_MOV_B64), DestReg)
910 .addReg(SrcReg, getKillRegState(KillSrc));
911 return;
912 }
913
914 if (DestReg == AMDGPU::SCC) {
915 // Copying 64-bit or 32-bit sources to SCC barely makes sense,
916 // but SelectionDAG emits such copies for i1 sources.
917 if (AMDGPU::SReg_64RegClass.contains(SrcReg)) {
918 // This copy can only be produced by patterns
919 // with explicit SCC, which are known to be enabled
920 // only for subtargets with S_CMP_LG_U64 present.
921 assert(ST.hasScalarCompareEq64());
922 BuildMI(MBB, MI, DL, get(AMDGPU::S_CMP_LG_U64))
923 .addReg(SrcReg, getKillRegState(KillSrc))
924 .addImm(0);
925 } else {
926 assert(AMDGPU::SReg_32RegClass.contains(SrcReg));
927 BuildMI(MBB, MI, DL, get(AMDGPU::S_CMP_LG_U32))
928 .addReg(SrcReg, getKillRegState(KillSrc))
929 .addImm(0);
930 }
931
932 return;
933 }
934
935 if (RC == &AMDGPU::AGPR_32RegClass) {
936 if (AMDGPU::VGPR_32RegClass.contains(SrcReg) ||
937 (ST.hasGFX90AInsts() && AMDGPU::SReg_32RegClass.contains(SrcReg))) {
938 BuildMI(MBB, MI, DL, get(AMDGPU::V_ACCVGPR_WRITE_B32_e64), DestReg)
939 .addReg(SrcReg, getKillRegState(KillSrc));
940 return;
941 }
942
943 if (AMDGPU::AGPR_32RegClass.contains(SrcReg) && ST.hasGFX90AInsts()) {
944 BuildMI(MBB, MI, DL, get(AMDGPU::V_ACCVGPR_MOV_B32), DestReg)
945 .addReg(SrcReg, getKillRegState(KillSrc));
946 return;
947 }
948
949 // FIXME: Pass should maintain scavenger to avoid scan through the block on
950 // every AGPR spill.
951 RegScavenger RS;
952 const bool Overlap = RI.regsOverlap(SrcReg, DestReg);
953 indirectCopyToAGPR(*this, MBB, MI, DL, DestReg, SrcReg, KillSrc, RS, Overlap);
954 return;
955 }
956
957 if (Size == 16) {
958 assert(AMDGPU::VGPR_16RegClass.contains(SrcReg) ||
959 AMDGPU::SReg_LO16RegClass.contains(SrcReg) ||
960 AMDGPU::AGPR_LO16RegClass.contains(SrcReg));
961
962 bool IsSGPRDst = AMDGPU::SReg_LO16RegClass.contains(DestReg);
963 bool IsSGPRSrc = AMDGPU::SReg_LO16RegClass.contains(SrcReg);
964 bool IsAGPRDst = AMDGPU::AGPR_LO16RegClass.contains(DestReg);
965 bool IsAGPRSrc = AMDGPU::AGPR_LO16RegClass.contains(SrcReg);
966 bool DstLow = !AMDGPU::isHi16Reg(DestReg, RI);
967 bool SrcLow = !AMDGPU::isHi16Reg(SrcReg, RI);
968 MCRegister NewDestReg = RI.get32BitRegister(DestReg);
969 MCRegister NewSrcReg = RI.get32BitRegister(SrcReg);
970
971 if (IsSGPRDst) {
972 if (!IsSGPRSrc) {
973 reportIllegalCopy(this, MBB, MI, DL, DestReg, SrcReg, KillSrc);
974 return;
975 }
976
977 BuildMI(MBB, MI, DL, get(AMDGPU::S_MOV_B32), NewDestReg)
978 .addReg(NewSrcReg, getKillRegState(KillSrc));
979 return;
980 }
981
982 if (IsAGPRDst || IsAGPRSrc) {
983 if (!DstLow || !SrcLow) {
984 reportIllegalCopy(this, MBB, MI, DL, DestReg, SrcReg, KillSrc,
985 "Cannot use hi16 subreg with an AGPR!");
986 }
987
988 copyPhysReg(MBB, MI, DL, NewDestReg, NewSrcReg, KillSrc);
989 return;
990 }
991
992 if (ST.useRealTrue16Insts()) {
993 if (IsSGPRSrc) {
994 assert(SrcLow);
995 SrcReg = NewSrcReg;
996 }
997 // Use the smaller instruction encoding if possible.
998 if (AMDGPU::VGPR_16_Lo128RegClass.contains(DestReg) &&
999 (IsSGPRSrc || AMDGPU::VGPR_16_Lo128RegClass.contains(SrcReg))) {
1000 BuildMI(MBB, MI, DL, get(AMDGPU::V_MOV_B16_t16_e32), DestReg)
1001 .addReg(SrcReg);
1002 } else {
1003 BuildMI(MBB, MI, DL, get(AMDGPU::V_MOV_B16_t16_e64), DestReg)
1004 .addImm(0) // src0_modifiers
1005 .addReg(SrcReg)
1006 .addImm(0); // op_sel
1007 }
1008 return;
1009 }
1010
1011 if (IsSGPRSrc && !ST.hasSDWAScalar()) {
1012 if (!DstLow || !SrcLow) {
1013 reportIllegalCopy(this, MBB, MI, DL, DestReg, SrcReg, KillSrc,
1014 "Cannot use hi16 subreg on VI!");
1015 }
1016
1017 BuildMI(MBB, MI, DL, get(AMDGPU::V_MOV_B32_e32), NewDestReg)
1018 .addReg(NewSrcReg, getKillRegState(KillSrc));
1019 return;
1020 }
1021
1022 auto MIB = BuildMI(MBB, MI, DL, get(AMDGPU::V_MOV_B32_sdwa), NewDestReg)
1023 .addImm(0) // src0_modifiers
1024 .addReg(NewSrcReg)
1025 .addImm(0) // clamp
1026 .addImm(DstLow ? AMDGPU::SDWA::SdwaSel::WORD_0
1027 : AMDGPU::SDWA::SdwaSel::WORD_1)
1028 .addImm(AMDGPU::SDWA::DstUnused::UNUSED_PRESERVE)
1029 .addImm(SrcLow ? AMDGPU::SDWA::SdwaSel::WORD_0
1030 : AMDGPU::SDWA::SdwaSel::WORD_1)
1031 .addReg(NewDestReg, RegState::Implicit | RegState::Undef);
1032 // First implicit operand is $exec.
1033 MIB->tieOperands(0, MIB->getNumOperands() - 1);
1034 return;
1035 }
1036
1037 if (RC == RI.getVGPR64Class() && (SrcRC == RC || RI.isSGPRClass(SrcRC))) {
1038 if (ST.hasMovB64()) {
1039 BuildMI(MBB, MI, DL, get(AMDGPU::V_MOV_B64_e32), DestReg)
1040 .addReg(SrcReg, getKillRegState(KillSrc));
1041 return;
1042 }
1043 if (ST.hasPkMovB32()) {
1044 BuildMI(MBB, MI, DL, get(AMDGPU::V_PK_MOV_B32), DestReg)
1045 .addImm(SISrcMods::OP_SEL_1)
1046 .addReg(SrcReg)
1047 .addImm(SISrcMods::OP_SEL_0 | SISrcMods::OP_SEL_1)
1048 .addReg(SrcReg)
1049 .addImm(0) // op_sel_lo
1050 .addImm(0) // op_sel_hi
1051 .addImm(0) // neg_lo
1052 .addImm(0) // neg_hi
1053 .addImm(0) // clamp
1054 .addReg(SrcReg, getKillRegState(KillSrc) | RegState::Implicit);
1055 return;
1056 }
1057 }
1058
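  // Fall back to a sub-register by sub-register copy. When the source and
  // destination tuples overlap, the copy direction below is chosen so that no
  // source sub-register is clobbered before it has been copied.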
1059 const bool Forward = RI.getHWRegIndex(DestReg) <= RI.getHWRegIndex(SrcReg);
1060 if (RI.isSGPRClass(RC)) {
1061 if (!RI.isSGPRClass(SrcRC)) {
1062 reportIllegalCopy(this, MBB, MI, DL, DestReg, SrcReg, KillSrc);
1063 return;
1064 }
1065 const bool CanKillSuperReg = KillSrc && !RI.regsOverlap(SrcReg, DestReg);
1066 expandSGPRCopy(*this, MBB, MI, DL, DestReg, SrcReg, CanKillSuperReg, RC,
1067 Forward);
1068 return;
1069 }
1070
1071 unsigned EltSize = 4;
1072 unsigned Opcode = AMDGPU::V_MOV_B32_e32;
1073 if (RI.isAGPRClass(RC)) {
1074 if (ST.hasGFX90AInsts() && RI.isAGPRClass(SrcRC))
1075 Opcode = AMDGPU::V_ACCVGPR_MOV_B32;
1076 else if (RI.hasVGPRs(SrcRC) ||
1077 (ST.hasGFX90AInsts() && RI.isSGPRClass(SrcRC)))
1078 Opcode = AMDGPU::V_ACCVGPR_WRITE_B32_e64;
1079 else
1080 Opcode = AMDGPU::INSTRUCTION_LIST_END;
1081 } else if (RI.hasVGPRs(RC) && RI.isAGPRClass(SrcRC)) {
1082 Opcode = AMDGPU::V_ACCVGPR_READ_B32_e64;
1083 } else if ((Size % 64 == 0) && RI.hasVGPRs(RC) &&
1084 (RI.isProperlyAlignedRC(*RC) &&
1085 (SrcRC == RC || RI.isSGPRClass(SrcRC)))) {
1086 // TODO: In 96-bit case, could do a 64-bit mov and then a 32-bit mov.
1087 if (ST.hasMovB64()) {
1088 Opcode = AMDGPU::V_MOV_B64_e32;
1089 EltSize = 8;
1090 } else if (ST.hasPkMovB32()) {
1091 Opcode = AMDGPU::V_PK_MOV_B32;
1092 EltSize = 8;
1093 }
1094 }
1095
1096 // For the cases where we need an intermediate instruction/temporary register
1097 // (destination is an AGPR), we need a scavenger.
1098 //
1099 // FIXME: The pass should maintain this for us so we don't have to re-scan the
1100 // whole block for every handled copy.
1101 std::unique_ptr<RegScavenger> RS;
1102 if (Opcode == AMDGPU::INSTRUCTION_LIST_END)
1103 RS = std::make_unique<RegScavenger>();
1104
1105 ArrayRef<int16_t> SubIndices = RI.getRegSplitParts(RC, EltSize);
1106
1107 // If there is an overlap, we can't kill the super-register on the last
1108 // instruction, since it will also kill the components made live by this def.
1109 const bool Overlap = RI.regsOverlap(SrcReg, DestReg);
1110 const bool CanKillSuperReg = KillSrc && !Overlap;
1111
1112 for (unsigned Idx = 0; Idx < SubIndices.size(); ++Idx) {
1113 unsigned SubIdx;
1114 if (Forward)
1115 SubIdx = SubIndices[Idx];
1116 else
1117 SubIdx = SubIndices[SubIndices.size() - Idx - 1];
1118 Register DestSubReg = RI.getSubReg(DestReg, SubIdx);
1119 Register SrcSubReg = RI.getSubReg(SrcReg, SubIdx);
1120 assert(DestSubReg && SrcSubReg && "Failed to find subregs!");
1121
1122 bool IsFirstSubreg = Idx == 0;
1123 bool UseKill = CanKillSuperReg && Idx == SubIndices.size() - 1;
1124
1125 if (Opcode == AMDGPU::INSTRUCTION_LIST_END) {
1126 Register ImpDefSuper = IsFirstSubreg ? Register(DestReg) : Register();
1127 Register ImpUseSuper = SrcReg;
1128 indirectCopyToAGPR(*this, MBB, MI, DL, DestSubReg, SrcSubReg, UseKill,
1129 *RS, Overlap, ImpDefSuper, ImpUseSuper);
1130 } else if (Opcode == AMDGPU::V_PK_MOV_B32) {
1131 MachineInstrBuilder MIB =
1132 BuildMI(MBB, MI, DL, get(AMDGPU::V_PK_MOV_B32), DestSubReg)
1133 .addImm(SISrcMods::OP_SEL_1)
1134 .addReg(SrcSubReg)
1135 .addImm(SISrcMods::OP_SEL_0 | SISrcMods::OP_SEL_1)
1136 .addReg(SrcSubReg)
1137 .addImm(0) // op_sel_lo
1138 .addImm(0) // op_sel_hi
1139 .addImm(0) // neg_lo
1140 .addImm(0) // neg_hi
1141 .addImm(0) // clamp
1142 .addReg(SrcReg, getKillRegState(UseKill) | RegState::Implicit);
1143 if (IsFirstSubreg)
1144 MIB.addReg(DestReg, RegState::Define | RegState::Implicit);
1145 } else {
1146 MachineInstrBuilder Builder =
1147 BuildMI(MBB, MI, DL, get(Opcode), DestSubReg).addReg(SrcSubReg);
1148 if (IsFirstSubreg)
1149 Builder.addReg(DestReg, RegState::Define | RegState::Implicit);
1150
1151 Builder.addReg(SrcReg, getKillRegState(UseKill) | RegState::Implicit);
1152 }
1153 }
1154}
1155
1156int SIInstrInfo::commuteOpcode(unsigned Opcode) const {
1157 int NewOpc;
1158
1159 // Try to map original to commuted opcode
1160 NewOpc = AMDGPU::getCommuteRev(Opcode);
1161 if (NewOpc != -1)
1162 // Check if the commuted (REV) opcode exists on the target.
1163 return pseudoToMCOpcode(NewOpc) != -1 ? NewOpc : -1;
1164
1165 // Try to map commuted to original opcode
1166 NewOpc = AMDGPU::getCommuteOrig(Opcode);
1167 if (NewOpc != -1)
1168 // Check if the original (non-REV) opcode exists on the target.
1169 return pseudoToMCOpcode(NewOpc) != -1 ? NewOpc : -1;
1170
1171 return Opcode;
1172}
1173
1174const TargetRegisterClass *
1175SIInstrInfo::getPreferredSelectRegClass(unsigned Size) const {
1176 return &AMDGPU::VGPR_32RegClass;
1177}
1178
1179void SIInstrInfo::insertVectorSelect(MachineBasicBlock &MBB,
1180 MachineBasicBlock::iterator I,
1181 const DebugLoc &DL, Register DstReg,
1182 ArrayRef<MachineOperand> Cond,
1183 Register TrueReg,
1184 Register FalseReg) const {
1185 MachineRegisterInfo &MRI = MBB.getParent()->getRegInfo();
1186 const TargetRegisterClass *BoolXExecRC = RI.getWaveMaskRegClass();
1187 const AMDGPU::LaneMaskConstants &LMC = AMDGPU::LaneMaskConstants::get(ST);
1188 assert(MRI.getRegClass(DstReg) == &AMDGPU::VGPR_32RegClass &&
1189 "Not a VGPR32 reg");
1190
1191 if (Cond.size() == 1) {
1192 Register SReg = MRI.createVirtualRegister(BoolXExecRC);
1193 BuildMI(MBB, I, DL, get(AMDGPU::COPY), SReg)
1194 .add(Cond[0]);
1195 BuildMI(MBB, I, DL, get(AMDGPU::V_CNDMASK_B32_e64), DstReg)
1196 .addImm(0)
1197 .addReg(FalseReg)
1198 .addImm(0)
1199 .addReg(TrueReg)
1200 .addReg(SReg);
1201 } else if (Cond.size() == 2) {
1202 assert(Cond[0].isImm() && "Cond[0] is not an immediate");
1203 switch (Cond[0].getImm()) {
1204 case SIInstrInfo::SCC_TRUE: {
1205 Register SReg = MRI.createVirtualRegister(BoolXExecRC);
1206 BuildMI(MBB, I, DL, get(LMC.CSelectOpc), SReg).addImm(1).addImm(0);
1207 BuildMI(MBB, I, DL, get(AMDGPU::V_CNDMASK_B32_e64), DstReg)
1208 .addImm(0)
1209 .addReg(FalseReg)
1210 .addImm(0)
1211 .addReg(TrueReg)
1212 .addReg(SReg);
1213 break;
1214 }
1215 case SIInstrInfo::SCC_FALSE: {
1216 Register SReg = MRI.createVirtualRegister(BoolXExecRC);
1217 BuildMI(MBB, I, DL, get(LMC.CSelectOpc), SReg).addImm(0).addImm(1);
1218 BuildMI(MBB, I, DL, get(AMDGPU::V_CNDMASK_B32_e64), DstReg)
1219 .addImm(0)
1220 .addReg(FalseReg)
1221 .addImm(0)
1222 .addReg(TrueReg)
1223 .addReg(SReg);
1224 break;
1225 }
1226 case SIInstrInfo::VCCNZ: {
1227 MachineOperand RegOp = Cond[1];
1228 RegOp.setImplicit(false);
1229 Register SReg = MRI.createVirtualRegister(BoolXExecRC);
1230 BuildMI(MBB, I, DL, get(AMDGPU::COPY), SReg)
1231 .add(RegOp);
1232 BuildMI(MBB, I, DL, get(AMDGPU::V_CNDMASK_B32_e64), DstReg)
1233 .addImm(0)
1234 .addReg(FalseReg)
1235 .addImm(0)
1236 .addReg(TrueReg)
1237 .addReg(SReg);
1238 break;
1239 }
1240 case SIInstrInfo::VCCZ: {
1241 MachineOperand RegOp = Cond[1];
1242 RegOp.setImplicit(false);
1243 Register SReg = MRI.createVirtualRegister(BoolXExecRC);
1244 BuildMI(MBB, I, DL, get(AMDGPU::COPY), SReg)
1245 .add(RegOp);
1246 BuildMI(MBB, I, DL, get(AMDGPU::V_CNDMASK_B32_e64), DstReg)
1247 .addImm(0)
1248 .addReg(TrueReg)
1249 .addImm(0)
1250 .addReg(FalseReg)
1251 .addReg(SReg);
1252 break;
1253 }
1254 case SIInstrInfo::EXECNZ: {
1255 Register SReg = MRI.createVirtualRegister(BoolXExecRC);
1256 Register SReg2 = MRI.createVirtualRegister(RI.getBoolRC());
1257 BuildMI(MBB, I, DL, get(LMC.OrSaveExecOpc), SReg2).addImm(0);
1258 BuildMI(MBB, I, DL, get(LMC.CSelectOpc), SReg).addImm(1).addImm(0);
1259 BuildMI(MBB, I, DL, get(AMDGPU::V_CNDMASK_B32_e64), DstReg)
1260 .addImm(0)
1261 .addReg(FalseReg)
1262 .addImm(0)
1263 .addReg(TrueReg)
1264 .addReg(SReg);
1265 break;
1266 }
1267 case SIInstrInfo::EXECZ: {
1268 Register SReg = MRI.createVirtualRegister(BoolXExecRC);
1269 Register SReg2 = MRI.createVirtualRegister(RI.getBoolRC());
1270 BuildMI(MBB, I, DL, get(LMC.OrSaveExecOpc), SReg2).addImm(0);
1271 BuildMI(MBB, I, DL, get(LMC.CSelectOpc), SReg).addImm(0).addImm(1);
1272 BuildMI(MBB, I, DL, get(AMDGPU::V_CNDMASK_B32_e64), DstReg)
1273 .addImm(0)
1274 .addReg(FalseReg)
1275 .addImm(0)
1276 .addReg(TrueReg)
1277 .addReg(SReg);
1278 llvm_unreachable("Unhandled branch predicate EXECZ");
1279 break;
1280 }
1281 default:
1282 llvm_unreachable("invalid branch predicate");
1283 }
1284 } else {
1285 llvm_unreachable("Can only handle Cond size 1 or 2");
1286 }
1287}
1288
1289Register SIInstrInfo::insertEQ(MachineBasicBlock *MBB,
1290 MachineBasicBlock::iterator I,
1291 const DebugLoc &DL,
1292 Register SrcReg, int Value) const {
1293 MachineRegisterInfo &MRI = MBB->getParent()->getRegInfo();
1294 Register Reg = MRI.createVirtualRegister(RI.getBoolRC());
1295 BuildMI(*MBB, I, DL, get(AMDGPU::V_CMP_EQ_I32_e64), Reg)
1296 .addImm(Value)
1297 .addReg(SrcReg);
1298
1299 return Reg;
1300}
1301
1302Register SIInstrInfo::insertNE(MachineBasicBlock *MBB,
1303 MachineBasicBlock::iterator I,
1304 const DebugLoc &DL,
1305 Register SrcReg, int Value) const {
1306 MachineRegisterInfo &MRI = MBB->getParent()->getRegInfo();
1307 Register Reg = MRI.createVirtualRegister(RI.getBoolRC());
1308 BuildMI(*MBB, I, DL, get(AMDGPU::V_CMP_NE_I32_e64), Reg)
1309 .addImm(Value)
1310 .addReg(SrcReg);
1311
1312 return Reg;
1313}
1314
1315bool SIInstrInfo::getConstValDefinedInReg(const MachineInstr &MI,
1316 const Register Reg,
1317 int64_t &ImmVal) const {
1318 switch (MI.getOpcode()) {
1319 case AMDGPU::V_MOV_B32_e32:
1320 case AMDGPU::S_MOV_B32:
1321 case AMDGPU::S_MOVK_I32:
1322 case AMDGPU::S_MOV_B64:
1323 case AMDGPU::V_MOV_B64_e32:
1324 case AMDGPU::V_ACCVGPR_WRITE_B32_e64:
1325 case AMDGPU::AV_MOV_B32_IMM_PSEUDO:
1326 case AMDGPU::AV_MOV_B64_IMM_PSEUDO:
1327 case AMDGPU::S_MOV_B64_IMM_PSEUDO:
1328 case AMDGPU::V_MOV_B64_PSEUDO: {
1329 const MachineOperand &Src0 = MI.getOperand(1);
1330 if (Src0.isImm()) {
1331 ImmVal = Src0.getImm();
1332 return MI.getOperand(0).getReg() == Reg;
1333 }
1334
1335 return false;
1336 }
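  // For the bit-reverse and bit-not moves below, fold the producing operation
  // into the reported immediate so callers see the value that is actually
  // materialized in Reg.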
1337 case AMDGPU::S_BREV_B32:
1338 case AMDGPU::V_BFREV_B32_e32:
1339 case AMDGPU::V_BFREV_B32_e64: {
1340 const MachineOperand &Src0 = MI.getOperand(1);
1341 if (Src0.isImm()) {
1342 ImmVal = static_cast<int64_t>(reverseBits<int32_t>(Src0.getImm()));
1343 return MI.getOperand(0).getReg() == Reg;
1344 }
1345
1346 return false;
1347 }
1348 case AMDGPU::S_NOT_B32:
1349 case AMDGPU::V_NOT_B32_e32:
1350 case AMDGPU::V_NOT_B32_e64: {
1351 const MachineOperand &Src0 = MI.getOperand(1);
1352 if (Src0.isImm()) {
1353 ImmVal = static_cast<int64_t>(~static_cast<int32_t>(Src0.getImm()));
1354 return MI.getOperand(0).getReg() == Reg;
1355 }
1356
1357 return false;
1358 }
1359 default:
1360 return false;
1361 }
1362}
1363
1364unsigned SIInstrInfo::getMovOpcode(const TargetRegisterClass *DstRC) const {
1365
1366 if (RI.isAGPRClass(DstRC))
1367 return AMDGPU::COPY;
1368 if (RI.getRegSizeInBits(*DstRC) == 16) {
1369 // Assume hi bits are unneeded. Only _e64 true16 instructions are legal
1370 // before RA.
1371 return RI.isSGPRClass(DstRC) ? AMDGPU::COPY : AMDGPU::V_MOV_B16_t16_e64;
1372 }
1373 if (RI.getRegSizeInBits(*DstRC) == 32)
1374 return RI.isSGPRClass(DstRC) ? AMDGPU::S_MOV_B32 : AMDGPU::V_MOV_B32_e32;
1375 if (RI.getRegSizeInBits(*DstRC) == 64 && RI.isSGPRClass(DstRC))
1376 return AMDGPU::S_MOV_B64;
1377 if (RI.getRegSizeInBits(*DstRC) == 64 && !RI.isSGPRClass(DstRC))
1378 return AMDGPU::V_MOV_B64_PSEUDO;
1379 return AMDGPU::COPY;
1380}
1381
1382const MCInstrDesc &
1383SIInstrInfo::getIndirectGPRIDXPseudo(unsigned VecSize,
1384 bool IsIndirectSrc) const {
1385 if (IsIndirectSrc) {
1386 if (VecSize <= 32) // 4 bytes
1387 return get(AMDGPU::V_INDIRECT_REG_READ_GPR_IDX_B32_V1);
1388 if (VecSize <= 64) // 8 bytes
1389 return get(AMDGPU::V_INDIRECT_REG_READ_GPR_IDX_B32_V2);
1390 if (VecSize <= 96) // 12 bytes
1391 return get(AMDGPU::V_INDIRECT_REG_READ_GPR_IDX_B32_V3);
1392 if (VecSize <= 128) // 16 bytes
1393 return get(AMDGPU::V_INDIRECT_REG_READ_GPR_IDX_B32_V4);
1394 if (VecSize <= 160) // 20 bytes
1395 return get(AMDGPU::V_INDIRECT_REG_READ_GPR_IDX_B32_V5);
1396 if (VecSize <= 256) // 32 bytes
1397 return get(AMDGPU::V_INDIRECT_REG_READ_GPR_IDX_B32_V8);
1398 if (VecSize <= 288) // 36 bytes
1399 return get(AMDGPU::V_INDIRECT_REG_READ_GPR_IDX_B32_V9);
1400 if (VecSize <= 320) // 40 bytes
1401 return get(AMDGPU::V_INDIRECT_REG_READ_GPR_IDX_B32_V10);
1402 if (VecSize <= 352) // 44 bytes
1403 return get(AMDGPU::V_INDIRECT_REG_READ_GPR_IDX_B32_V11);
1404 if (VecSize <= 384) // 48 bytes
1405 return get(AMDGPU::V_INDIRECT_REG_READ_GPR_IDX_B32_V12);
1406 if (VecSize <= 512) // 64 bytes
1407 return get(AMDGPU::V_INDIRECT_REG_READ_GPR_IDX_B32_V16);
1408 if (VecSize <= 1024) // 128 bytes
1409 return get(AMDGPU::V_INDIRECT_REG_READ_GPR_IDX_B32_V32);
1410
1411 llvm_unreachable("unsupported size for IndirectRegReadGPRIDX pseudos");
1412 }
1413
1414 if (VecSize <= 32) // 4 bytes
1415 return get(AMDGPU::V_INDIRECT_REG_WRITE_GPR_IDX_B32_V1);
1416 if (VecSize <= 64) // 8 bytes
1417 return get(AMDGPU::V_INDIRECT_REG_WRITE_GPR_IDX_B32_V2);
1418 if (VecSize <= 96) // 12 bytes
1419 return get(AMDGPU::V_INDIRECT_REG_WRITE_GPR_IDX_B32_V3);
1420 if (VecSize <= 128) // 16 bytes
1421 return get(AMDGPU::V_INDIRECT_REG_WRITE_GPR_IDX_B32_V4);
1422 if (VecSize <= 160) // 20 bytes
1423 return get(AMDGPU::V_INDIRECT_REG_WRITE_GPR_IDX_B32_V5);
1424 if (VecSize <= 256) // 32 bytes
1425 return get(AMDGPU::V_INDIRECT_REG_WRITE_GPR_IDX_B32_V8);
1426 if (VecSize <= 288) // 36 bytes
1427 return get(AMDGPU::V_INDIRECT_REG_WRITE_GPR_IDX_B32_V9);
1428 if (VecSize <= 320) // 40 bytes
1429 return get(AMDGPU::V_INDIRECT_REG_WRITE_GPR_IDX_B32_V10);
1430 if (VecSize <= 352) // 44 bytes
1431 return get(AMDGPU::V_INDIRECT_REG_WRITE_GPR_IDX_B32_V11);
1432 if (VecSize <= 384) // 48 bytes
1433 return get(AMDGPU::V_INDIRECT_REG_WRITE_GPR_IDX_B32_V12);
1434 if (VecSize <= 512) // 64 bytes
1435 return get(AMDGPU::V_INDIRECT_REG_WRITE_GPR_IDX_B32_V16);
1436 if (VecSize <= 1024) // 128 bytes
1437 return get(AMDGPU::V_INDIRECT_REG_WRITE_GPR_IDX_B32_V32);
1438
1439 llvm_unreachable("unsupported size for IndirectRegWriteGPRIDX pseudos");
1440}
1441
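// Select the narrowest V_INDIRECT_REG_WRITE_MOVREL_B32 pseudo whose register
// tuple covers VecSize bits.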
1442static unsigned getIndirectVGPRWriteMovRelPseudoOpc(unsigned VecSize) {
1443 if (VecSize <= 32) // 4 bytes
1444 return AMDGPU::V_INDIRECT_REG_WRITE_MOVREL_B32_V1;
1445 if (VecSize <= 64) // 8 bytes
1446 return AMDGPU::V_INDIRECT_REG_WRITE_MOVREL_B32_V2;
1447 if (VecSize <= 96) // 12 bytes
1448 return AMDGPU::V_INDIRECT_REG_WRITE_MOVREL_B32_V3;
1449 if (VecSize <= 128) // 16 bytes
1450 return AMDGPU::V_INDIRECT_REG_WRITE_MOVREL_B32_V4;
1451 if (VecSize <= 160) // 20 bytes
1452 return AMDGPU::V_INDIRECT_REG_WRITE_MOVREL_B32_V5;
1453 if (VecSize <= 256) // 32 bytes
1454 return AMDGPU::V_INDIRECT_REG_WRITE_MOVREL_B32_V8;
1455 if (VecSize <= 288) // 36 bytes
1456 return AMDGPU::V_INDIRECT_REG_WRITE_MOVREL_B32_V9;
1457 if (VecSize <= 320) // 40 bytes
1458 return AMDGPU::V_INDIRECT_REG_WRITE_MOVREL_B32_V10;
1459 if (VecSize <= 352) // 44 bytes
1460 return AMDGPU::V_INDIRECT_REG_WRITE_MOVREL_B32_V11;
1461 if (VecSize <= 384) // 48 bytes
1462 return AMDGPU::V_INDIRECT_REG_WRITE_MOVREL_B32_V12;
1463 if (VecSize <= 512) // 64 bytes
1464 return AMDGPU::V_INDIRECT_REG_WRITE_MOVREL_B32_V16;
1465 if (VecSize <= 1024) // 128 bytes
1466 return AMDGPU::V_INDIRECT_REG_WRITE_MOVREL_B32_V32;
1467
1468 llvm_unreachable("unsupported size for IndirectRegWrite pseudos");
1469}
1470
1471static unsigned getIndirectSGPRWriteMovRelPseudo32(unsigned VecSize) {
1472 if (VecSize <= 32) // 4 bytes
1473 return AMDGPU::S_INDIRECT_REG_WRITE_MOVREL_B32_V1;
1474 if (VecSize <= 64) // 8 bytes
1475 return AMDGPU::S_INDIRECT_REG_WRITE_MOVREL_B32_V2;
1476 if (VecSize <= 96) // 12 bytes
1477 return AMDGPU::S_INDIRECT_REG_WRITE_MOVREL_B32_V3;
1478 if (VecSize <= 128) // 16 bytes
1479 return AMDGPU::S_INDIRECT_REG_WRITE_MOVREL_B32_V4;
1480 if (VecSize <= 160) // 20 bytes
1481 return AMDGPU::S_INDIRECT_REG_WRITE_MOVREL_B32_V5;
1482 if (VecSize <= 256) // 32 bytes
1483 return AMDGPU::S_INDIRECT_REG_WRITE_MOVREL_B32_V8;
1484 if (VecSize <= 288) // 36 bytes
1485 return AMDGPU::S_INDIRECT_REG_WRITE_MOVREL_B32_V9;
1486 if (VecSize <= 320) // 40 bytes
1487 return AMDGPU::S_INDIRECT_REG_WRITE_MOVREL_B32_V10;
1488 if (VecSize <= 352) // 44 bytes
1489 return AMDGPU::S_INDIRECT_REG_WRITE_MOVREL_B32_V11;
1490 if (VecSize <= 384) // 48 bytes
1491 return AMDGPU::S_INDIRECT_REG_WRITE_MOVREL_B32_V12;
1492 if (VecSize <= 512) // 64 bytes
1493 return AMDGPU::S_INDIRECT_REG_WRITE_MOVREL_B32_V16;
1494 if (VecSize <= 1024) // 128 bytes
1495 return AMDGPU::S_INDIRECT_REG_WRITE_MOVREL_B32_V32;
1496
1497 llvm_unreachable("unsupported size for IndirectRegWrite pseudos");
1498}
1499
1500static unsigned getIndirectSGPRWriteMovRelPseudo64(unsigned VecSize) {
1501 if (VecSize <= 64) // 8 bytes
1502 return AMDGPU::S_INDIRECT_REG_WRITE_MOVREL_B64_V1;
1503 if (VecSize <= 128) // 16 bytes
1504 return AMDGPU::S_INDIRECT_REG_WRITE_MOVREL_B64_V2;
1505 if (VecSize <= 256) // 32 bytes
1506 return AMDGPU::S_INDIRECT_REG_WRITE_MOVREL_B64_V4;
1507 if (VecSize <= 512) // 64 bytes
1508 return AMDGPU::S_INDIRECT_REG_WRITE_MOVREL_B64_V8;
1509 if (VecSize <= 1024) // 128 bytes
1510 return AMDGPU::S_INDIRECT_REG_WRITE_MOVREL_B64_V16;
1511
1512 llvm_unreachable("unsupported size for IndirectRegWrite pseudos");
1513}
1514
1515const MCInstrDesc &
1516SIInstrInfo::getIndirectRegWriteMovRelPseudo(unsigned VecSize, unsigned EltSize,
1517 bool IsSGPR) const {
1518 if (IsSGPR) {
1519 switch (EltSize) {
1520 case 32:
1521 return get(getIndirectSGPRWriteMovRelPseudo32(VecSize));
1522 case 64:
1523 return get(getIndirectSGPRWriteMovRelPseudo64(VecSize));
1524 default:
1525 llvm_unreachable("invalid reg indexing elt size");
1526 }
1527 }
1528
1529 assert(EltSize == 32 && "invalid reg indexing elt size");
1530 return get(getIndirectVGPRWriteMovRelPseudoOpc(VecSize));
1531}
1532
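// The helpers below map a spill size in bytes to the matching SI_SPILL_*
// save/restore pseudo for the SGPR, VGPR, and AV register banks.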
1533static unsigned getSGPRSpillSaveOpcode(unsigned Size) {
1534 switch (Size) {
1535 case 4:
1536 return AMDGPU::SI_SPILL_S32_SAVE;
1537 case 8:
1538 return AMDGPU::SI_SPILL_S64_SAVE;
1539 case 12:
1540 return AMDGPU::SI_SPILL_S96_SAVE;
1541 case 16:
1542 return AMDGPU::SI_SPILL_S128_SAVE;
1543 case 20:
1544 return AMDGPU::SI_SPILL_S160_SAVE;
1545 case 24:
1546 return AMDGPU::SI_SPILL_S192_SAVE;
1547 case 28:
1548 return AMDGPU::SI_SPILL_S224_SAVE;
1549 case 32:
1550 return AMDGPU::SI_SPILL_S256_SAVE;
1551 case 36:
1552 return AMDGPU::SI_SPILL_S288_SAVE;
1553 case 40:
1554 return AMDGPU::SI_SPILL_S320_SAVE;
1555 case 44:
1556 return AMDGPU::SI_SPILL_S352_SAVE;
1557 case 48:
1558 return AMDGPU::SI_SPILL_S384_SAVE;
1559 case 64:
1560 return AMDGPU::SI_SPILL_S512_SAVE;
1561 case 128:
1562 return AMDGPU::SI_SPILL_S1024_SAVE;
1563 default:
1564 llvm_unreachable("unknown register size");
1565 }
1566}
1567
1568static unsigned getVGPRSpillSaveOpcode(unsigned Size) {
1569 switch (Size) {
1570 case 2:
1571 return AMDGPU::SI_SPILL_V16_SAVE;
1572 case 4:
1573 return AMDGPU::SI_SPILL_V32_SAVE;
1574 case 8:
1575 return AMDGPU::SI_SPILL_V64_SAVE;
1576 case 12:
1577 return AMDGPU::SI_SPILL_V96_SAVE;
1578 case 16:
1579 return AMDGPU::SI_SPILL_V128_SAVE;
1580 case 20:
1581 return AMDGPU::SI_SPILL_V160_SAVE;
1582 case 24:
1583 return AMDGPU::SI_SPILL_V192_SAVE;
1584 case 28:
1585 return AMDGPU::SI_SPILL_V224_SAVE;
1586 case 32:
1587 return AMDGPU::SI_SPILL_V256_SAVE;
1588 case 36:
1589 return AMDGPU::SI_SPILL_V288_SAVE;
1590 case 40:
1591 return AMDGPU::SI_SPILL_V320_SAVE;
1592 case 44:
1593 return AMDGPU::SI_SPILL_V352_SAVE;
1594 case 48:
1595 return AMDGPU::SI_SPILL_V384_SAVE;
1596 case 64:
1597 return AMDGPU::SI_SPILL_V512_SAVE;
1598 case 128:
1599 return AMDGPU::SI_SPILL_V1024_SAVE;
1600 default:
1601 llvm_unreachable("unknown register size");
1602 }
1603}
1604
1605static unsigned getAVSpillSaveOpcode(unsigned Size) {
1606 switch (Size) {
1607 case 4:
1608 return AMDGPU::SI_SPILL_AV32_SAVE;
1609 case 8:
1610 return AMDGPU::SI_SPILL_AV64_SAVE;
1611 case 12:
1612 return AMDGPU::SI_SPILL_AV96_SAVE;
1613 case 16:
1614 return AMDGPU::SI_SPILL_AV128_SAVE;
1615 case 20:
1616 return AMDGPU::SI_SPILL_AV160_SAVE;
1617 case 24:
1618 return AMDGPU::SI_SPILL_AV192_SAVE;
1619 case 28:
1620 return AMDGPU::SI_SPILL_AV224_SAVE;
1621 case 32:
1622 return AMDGPU::SI_SPILL_AV256_SAVE;
1623 case 36:
1624 return AMDGPU::SI_SPILL_AV288_SAVE;
1625 case 40:
1626 return AMDGPU::SI_SPILL_AV320_SAVE;
1627 case 44:
1628 return AMDGPU::SI_SPILL_AV352_SAVE;
1629 case 48:
1630 return AMDGPU::SI_SPILL_AV384_SAVE;
1631 case 64:
1632 return AMDGPU::SI_SPILL_AV512_SAVE;
1633 case 128:
1634 return AMDGPU::SI_SPILL_AV1024_SAVE;
1635 default:
1636 llvm_unreachable("unknown register size");
1637 }
1638}
1639
1640static unsigned getWWMRegSpillSaveOpcode(unsigned Size,
1641 bool IsVectorSuperClass) {
1642 // Currently, only 32-bit WWM register spills are needed.
1643 if (Size != 4)
1644 llvm_unreachable("unknown wwm register spill size");
1645
1646 if (IsVectorSuperClass)
1647 return AMDGPU::SI_SPILL_WWM_AV32_SAVE;
1648
1649 return AMDGPU::SI_SPILL_WWM_V32_SAVE;
1650}
1651
1652unsigned SIInstrInfo::getVectorRegSpillSaveOpcode(
1653 Register Reg, const TargetRegisterClass *RC, unsigned Size,
1654 const SIMachineFunctionInfo &MFI) const {
1655 bool IsVectorSuperClass = RI.isVectorSuperClass(RC);
1656
1657 // Choose the right opcode if spilling a WWM register.
1658 if (MFI.checkFlag(Reg, AMDGPU::VirtRegFlag::WWM_REG))
1659 return getWWMRegSpillSaveOpcode(Size, IsVectorSuperClass);
1660
1661 // TODO: Check if AGPRs are available
1662 if (ST.hasMAIInsts())
1663 return getAVSpillSaveOpcode(Size);
1664
1665 return getVGPRSpillSaveOpcode(Size);
1666}
1667
1668void SIInstrInfo::storeRegToStackSlot(
1669 MachineBasicBlock &MBB, MachineBasicBlock::iterator MI, Register SrcReg,
1670 bool isKill, int FrameIndex, const TargetRegisterClass *RC,
1671 const TargetRegisterInfo *TRI, Register VReg,
1672 MachineInstr::MIFlag Flags) const {
1673 MachineFunction *MF = MBB.getParent();
1674 SIMachineFunctionInfo *MFI = MF->getInfo<SIMachineFunctionInfo>();
1675 MachineFrameInfo &FrameInfo = MF->getFrameInfo();
1676 const DebugLoc &DL = MBB.findDebugLoc(MI);
1677
1678 MachinePointerInfo PtrInfo
1679 = MachinePointerInfo::getFixedStack(*MF, FrameIndex);
1680 MachineMemOperand *MMO = MF->getMachineMemOperand(
1681 PtrInfo, MachineMemOperand::MOStore, FrameInfo.getObjectSize(FrameIndex),
1682 FrameInfo.getObjectAlign(FrameIndex));
1683 unsigned SpillSize = TRI->getSpillSize(*RC);
1684
1685 MachineRegisterInfo &MRI = MF->getRegInfo();
1686 if (RI.isSGPRClass(RC)) {
1687 MFI->setHasSpilledSGPRs();
1688 assert(SrcReg != AMDGPU::M0 && "m0 should not be spilled");
1689 assert(SrcReg != AMDGPU::EXEC_LO && SrcReg != AMDGPU::EXEC_HI &&
1690 SrcReg != AMDGPU::EXEC && "exec should not be spilled");
1691
1692 // We are only allowed to create one new instruction when spilling
1693 // registers, so we need to use pseudo instruction for spilling SGPRs.
1694 const MCInstrDesc &OpDesc = get(getSGPRSpillSaveOpcode(SpillSize));
1695
1696 // The SGPR spill/restore instructions only work on numbered SGPRs, so we
1697 // need to make sure we are using the correct register class.
1698 if (SrcReg.isVirtual() && SpillSize == 4) {
1699 MRI.constrainRegClass(SrcReg, &AMDGPU::SReg_32_XM0_XEXECRegClass);
1700 }
1701
1702 BuildMI(MBB, MI, DL, OpDesc)
1703 .addReg(SrcReg, getKillRegState(isKill)) // data
1704 .addFrameIndex(FrameIndex) // addr
1705 .addMemOperand(MMO)
1707
1708 if (RI.spillSGPRToVGPR())
1709 FrameInfo.setStackID(FrameIndex, TargetStackID::SGPRSpill);
1710 return;
1711 }
1712
1713 unsigned Opcode =
1714 getVectorRegSpillSaveOpcode(VReg ? VReg : SrcReg, RC, SpillSize, *MFI);
1715 MFI->setHasSpilledVGPRs();
1716
1717 BuildMI(MBB, MI, DL, get(Opcode))
1718 .addReg(SrcReg, getKillRegState(isKill)) // data
1719 .addFrameIndex(FrameIndex) // addr
1720 .addReg(MFI->getStackPtrOffsetReg()) // scratch_offset
1721 .addImm(0) // offset
1722 .addMemOperand(MMO);
1723}
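// For illustration only, the MIR emitted here for a 32-bit VGPR spill on a
// subtarget without MAI instructions looks roughly like (register names and
// frame index are invented for the example):
//   SI_SPILL_V32_SAVE killed $vgpr2, %stack.0, $sgpr32, 0,
//       implicit $exec :: (store (s32) into %stack.0, addrspace 5)
// i.e. data, frame index, scratch offset register, immediate offset, and the
// frame-index memory operand, matching the operand order built above.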
1724
1725static unsigned getSGPRSpillRestoreOpcode(unsigned Size) {
1726 switch (Size) {
1727 case 4:
1728 return AMDGPU::SI_SPILL_S32_RESTORE;
1729 case 8:
1730 return AMDGPU::SI_SPILL_S64_RESTORE;
1731 case 12:
1732 return AMDGPU::SI_SPILL_S96_RESTORE;
1733 case 16:
1734 return AMDGPU::SI_SPILL_S128_RESTORE;
1735 case 20:
1736 return AMDGPU::SI_SPILL_S160_RESTORE;
1737 case 24:
1738 return AMDGPU::SI_SPILL_S192_RESTORE;
1739 case 28:
1740 return AMDGPU::SI_SPILL_S224_RESTORE;
1741 case 32:
1742 return AMDGPU::SI_SPILL_S256_RESTORE;
1743 case 36:
1744 return AMDGPU::SI_SPILL_S288_RESTORE;
1745 case 40:
1746 return AMDGPU::SI_SPILL_S320_RESTORE;
1747 case 44:
1748 return AMDGPU::SI_SPILL_S352_RESTORE;
1749 case 48:
1750 return AMDGPU::SI_SPILL_S384_RESTORE;
1751 case 64:
1752 return AMDGPU::SI_SPILL_S512_RESTORE;
1753 case 128:
1754 return AMDGPU::SI_SPILL_S1024_RESTORE;
1755 default:
1756 llvm_unreachable("unknown register size");
1757 }
1758}
1759
1760static unsigned getVGPRSpillRestoreOpcode(unsigned Size) {
1761 switch (Size) {
1762 case 2:
1763 return AMDGPU::SI_SPILL_V16_RESTORE;
1764 case 4:
1765 return AMDGPU::SI_SPILL_V32_RESTORE;
1766 case 8:
1767 return AMDGPU::SI_SPILL_V64_RESTORE;
1768 case 12:
1769 return AMDGPU::SI_SPILL_V96_RESTORE;
1770 case 16:
1771 return AMDGPU::SI_SPILL_V128_RESTORE;
1772 case 20:
1773 return AMDGPU::SI_SPILL_V160_RESTORE;
1774 case 24:
1775 return AMDGPU::SI_SPILL_V192_RESTORE;
1776 case 28:
1777 return AMDGPU::SI_SPILL_V224_RESTORE;
1778 case 32:
1779 return AMDGPU::SI_SPILL_V256_RESTORE;
1780 case 36:
1781 return AMDGPU::SI_SPILL_V288_RESTORE;
1782 case 40:
1783 return AMDGPU::SI_SPILL_V320_RESTORE;
1784 case 44:
1785 return AMDGPU::SI_SPILL_V352_RESTORE;
1786 case 48:
1787 return AMDGPU::SI_SPILL_V384_RESTORE;
1788 case 64:
1789 return AMDGPU::SI_SPILL_V512_RESTORE;
1790 case 128:
1791 return AMDGPU::SI_SPILL_V1024_RESTORE;
1792 default:
1793 llvm_unreachable("unknown register size");
1794 }
1795}
1796
1797static unsigned getAVSpillRestoreOpcode(unsigned Size) {
1798 switch (Size) {
1799 case 4:
1800 return AMDGPU::SI_SPILL_AV32_RESTORE;
1801 case 8:
1802 return AMDGPU::SI_SPILL_AV64_RESTORE;
1803 case 12:
1804 return AMDGPU::SI_SPILL_AV96_RESTORE;
1805 case 16:
1806 return AMDGPU::SI_SPILL_AV128_RESTORE;
1807 case 20:
1808 return AMDGPU::SI_SPILL_AV160_RESTORE;
1809 case 24:
1810 return AMDGPU::SI_SPILL_AV192_RESTORE;
1811 case 28:
1812 return AMDGPU::SI_SPILL_AV224_RESTORE;
1813 case 32:
1814 return AMDGPU::SI_SPILL_AV256_RESTORE;
1815 case 36:
1816 return AMDGPU::SI_SPILL_AV288_RESTORE;
1817 case 40:
1818 return AMDGPU::SI_SPILL_AV320_RESTORE;
1819 case 44:
1820 return AMDGPU::SI_SPILL_AV352_RESTORE;
1821 case 48:
1822 return AMDGPU::SI_SPILL_AV384_RESTORE;
1823 case 64:
1824 return AMDGPU::SI_SPILL_AV512_RESTORE;
1825 case 128:
1826 return AMDGPU::SI_SPILL_AV1024_RESTORE;
1827 default:
1828 llvm_unreachable("unknown register size");
1829 }
1830}
1831
1832static unsigned getWWMRegSpillRestoreOpcode(unsigned Size,
1833 bool IsVectorSuperClass) {
1834 // Currently, only 32-bit WWM register spills are needed.
1835 if (Size != 4)
1836 llvm_unreachable("unknown wwm register spill size");
1837
1838 if (IsVectorSuperClass) // TODO: Always use this if there are AGPRs
1839 return AMDGPU::SI_SPILL_WWM_AV32_RESTORE;
1840
1841 return AMDGPU::SI_SPILL_WWM_V32_RESTORE;
1842}
1843
1844unsigned SIInstrInfo::getVectorRegSpillRestoreOpcode(
1845 Register Reg, const TargetRegisterClass *RC, unsigned Size,
1846 const SIMachineFunctionInfo &MFI) const {
1847 bool IsVectorSuperClass = RI.isVectorSuperClass(RC);
1848
1849 // Choose the right opcode if restoring a WWM register.
1850 if (MFI.checkFlag(Reg, AMDGPU::VirtRegFlag::WWM_REG))
1851 return getWWMRegSpillRestoreOpcode(Size, IsVectorSuperClass);
1852
1853 // TODO: Check if AGPRs are available
1854 if (ST.hasMAIInsts())
1855 return getAVSpillRestoreOpcode(Size);
1856
1857 assert(!RI.isAGPRClass(RC));
1858 return getVGPRSpillRestoreOpcode(Size);
1859}
1860
1861void SIInstrInfo::loadRegFromStackSlot(MachineBasicBlock &MBB,
1862 MachineBasicBlock::iterator MI,
1863 Register DestReg, int FrameIndex,
1864 const TargetRegisterClass *RC,
1865 const TargetRegisterInfo *TRI,
1866 Register VReg,
1867 MachineInstr::MIFlag Flags) const {
1868 MachineFunction *MF = MBB.getParent();
1869 SIMachineFunctionInfo *MFI = MF->getInfo<SIMachineFunctionInfo>();
1870 MachineFrameInfo &FrameInfo = MF->getFrameInfo();
1871 const DebugLoc &DL = MBB.findDebugLoc(MI);
1872 unsigned SpillSize = TRI->getSpillSize(*RC);
1873
1874 MachinePointerInfo PtrInfo
1875 = MachinePointerInfo::getFixedStack(*MF, FrameIndex);
1876
1877 MachineMemOperand *MMO = MF->getMachineMemOperand(
1878 PtrInfo, MachineMemOperand::MOLoad, FrameInfo.getObjectSize(FrameIndex),
1879 FrameInfo.getObjectAlign(FrameIndex));
1880
1881 if (RI.isSGPRClass(RC)) {
1882 MFI->setHasSpilledSGPRs();
1883 assert(DestReg != AMDGPU::M0 && "m0 should not be reloaded into");
1884 assert(DestReg != AMDGPU::EXEC_LO && DestReg != AMDGPU::EXEC_HI &&
1885 DestReg != AMDGPU::EXEC && "exec should not be spilled");
1886
1887 // FIXME: Maybe this should not include a memoperand because it will be
1888 // lowered to non-memory instructions.
1889 const MCInstrDesc &OpDesc = get(getSGPRSpillRestoreOpcode(SpillSize));
1890 if (DestReg.isVirtual() && SpillSize == 4) {
1891 MachineRegisterInfo &MRI = MF->getRegInfo();
1892 MRI.constrainRegClass(DestReg, &AMDGPU::SReg_32_XM0_XEXECRegClass);
1893 }
1894
1895 if (RI.spillSGPRToVGPR())
1896 FrameInfo.setStackID(FrameIndex, TargetStackID::SGPRSpill);
1897 BuildMI(MBB, MI, DL, OpDesc, DestReg)
1898 .addFrameIndex(FrameIndex) // addr
1899 .addMemOperand(MMO)
1900 .addReg(MFI->getStackPtrOffsetReg(), RegState::Implicit);
1901
1902 return;
1903 }
1904
1905 unsigned Opcode = getVectorRegSpillRestoreOpcode(VReg ? VReg : DestReg, RC,
1906 SpillSize, *MFI);
1907 BuildMI(MBB, MI, DL, get(Opcode), DestReg)
1908 .addFrameIndex(FrameIndex) // vaddr
1909 .addReg(MFI->getStackPtrOffsetReg()) // scratch_offset
1910 .addImm(0) // offset
1911 .addMemOperand(MMO);
1912}
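// For illustration only, the corresponding reload emitted here looks roughly
// like (names are invented for the example):
//   $vgpr2 = SI_SPILL_V32_RESTORE %stack.0, $sgpr32, 0,
//       implicit $exec :: (load (s32) from %stack.0, addrspace 5)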
1913
1914void SIInstrInfo::insertNoop(MachineBasicBlock &MBB,
1915 MachineBasicBlock::iterator MI) const {
1916 insertNoops(MBB, MI, 1);
1917}
1918
1919void SIInstrInfo::insertNoops(MachineBasicBlock &MBB,
1920 MachineBasicBlock::iterator MI,
1921 unsigned Quantity) const {
1922 DebugLoc DL = MBB.findDebugLoc(MI);
1923 unsigned MaxSNopCount = 1u << ST.getSNopBits();
1924 while (Quantity > 0) {
1925 unsigned Arg = std::min(Quantity, MaxSNopCount);
1926 Quantity -= Arg;
1927 BuildMI(MBB, MI, DL, get(AMDGPU::S_NOP)).addImm(Arg - 1);
1928 }
1929}
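// Worked example (illustrative): assuming getSNopBits() == 3, MaxSNopCount is
// 8, so insertNoops(MBB, MI, 10) emits "s_nop 7" (8 wait states) followed by
// "s_nop 1" (2 wait states); the S_NOP immediate encodes Arg - 1.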
1930
1931void SIInstrInfo::insertReturn(MachineBasicBlock &MBB) const {
1932 auto *MF = MBB.getParent();
1933 SIMachineFunctionInfo *Info = MF->getInfo<SIMachineFunctionInfo>();
1934
1935 assert(Info->isEntryFunction());
1936
1937 if (MBB.succ_empty()) {
1938 bool HasNoTerminator = MBB.getFirstTerminator() == MBB.end();
1939 if (HasNoTerminator) {
1940 if (Info->returnsVoid()) {
1941 BuildMI(MBB, MBB.end(), DebugLoc(), get(AMDGPU::S_ENDPGM)).addImm(0);
1942 } else {
1943 BuildMI(MBB, MBB.end(), DebugLoc(), get(AMDGPU::SI_RETURN_TO_EPILOG));
1944 }
1945 }
1946 }
1947}
1948
1949MachineBasicBlock *SIInstrInfo::insertSimulatedTrap(MachineRegisterInfo &MRI,
1950 MachineBasicBlock &MBB,
1951 MachineInstr &MI,
1952 const DebugLoc &DL) const {
1953 MachineFunction *MF = MBB.getParent();
1954 constexpr unsigned DoorbellIDMask = 0x3ff;
1955 constexpr unsigned ECQueueWaveAbort = 0x400;
1956
1957 MachineBasicBlock *TrapBB = &MBB;
1958 MachineBasicBlock *ContBB = &MBB;
1959 MachineBasicBlock *HaltLoopBB = MF->CreateMachineBasicBlock();
1960
1961 if (!MBB.succ_empty() || std::next(MI.getIterator()) != MBB.end()) {
1962 ContBB = MBB.splitAt(MI, /*UpdateLiveIns=*/false);
1963 TrapBB = MF->CreateMachineBasicBlock();
1964 BuildMI(MBB, MI, DL, get(AMDGPU::S_CBRANCH_EXECNZ)).addMBB(TrapBB);
1965 MF->push_back(TrapBB);
1966 MBB.addSuccessor(TrapBB);
1967 }
1968
1969 // Start with a `s_trap 2`; if we're in PRIV=1 and we need the workaround,
1970 // this will be a nop.
1971 BuildMI(*TrapBB, TrapBB->end(), DL, get(AMDGPU::S_TRAP))
1972 .addImm(static_cast<unsigned>(GCNSubtarget::TrapID::LLVMAMDHSATrap));
1973 Register DoorbellReg = MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass);
1974 BuildMI(*TrapBB, TrapBB->end(), DL, get(AMDGPU::S_SENDMSG_RTN_B32),
1975 DoorbellReg)
1976 .addImm(AMDGPU::SendMsg::ID_RTN_GET_DOORBELL);
1977 BuildMI(*TrapBB, TrapBB->end(), DL, get(AMDGPU::S_MOV_B32), AMDGPU::TTMP2)
1978 .addUse(AMDGPU::M0);
1979 Register DoorbellRegMasked =
1980 MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass);
1981 BuildMI(*TrapBB, TrapBB->end(), DL, get(AMDGPU::S_AND_B32), DoorbellRegMasked)
1982 .addUse(DoorbellReg)
1983 .addImm(DoorbellIDMask);
1984 Register SetWaveAbortBit =
1985 MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass);
1986 BuildMI(*TrapBB, TrapBB->end(), DL, get(AMDGPU::S_OR_B32), SetWaveAbortBit)
1987 .addUse(DoorbellRegMasked)
1988 .addImm(ECQueueWaveAbort);
1989 BuildMI(*TrapBB, TrapBB->end(), DL, get(AMDGPU::S_MOV_B32), AMDGPU::M0)
1990 .addUse(SetWaveAbortBit);
1991 BuildMI(*TrapBB, TrapBB->end(), DL, get(AMDGPU::S_SENDMSG))
1992 .addImm(AMDGPU::SendMsg::ID_INTERRUPT);
1993 BuildMI(*TrapBB, TrapBB->end(), DL, get(AMDGPU::S_MOV_B32), AMDGPU::M0)
1994 .addUse(AMDGPU::TTMP2);
1995 BuildMI(*TrapBB, TrapBB->end(), DL, get(AMDGPU::S_BRANCH)).addMBB(HaltLoopBB);
1996 TrapBB->addSuccessor(HaltLoopBB);
1997
1998 BuildMI(*HaltLoopBB, HaltLoopBB->end(), DL, get(AMDGPU::S_SETHALT)).addImm(5);
1999 BuildMI(*HaltLoopBB, HaltLoopBB->end(), DL, get(AMDGPU::S_BRANCH))
2000 .addMBB(HaltLoopBB);
2001 MF->push_back(HaltLoopBB);
2002 HaltLoopBB->addSuccessor(HaltLoopBB);
2003
2004 return ContBB;
2005}
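// The control flow produced above is roughly (illustrative sketch):
//   <split point>   s_cbranch_execnz trap_bb   ; only if MBB continues
//   trap_bb:        s_trap 2
//                   read doorbell id, save m0 to ttmp2,
//                   m0 = (doorbell & 0x3ff) | 0x400, s_sendmsg,
//                   restore m0, s_branch halt_loop
//   halt_loop:      s_sethalt 5
//                   s_branch halt_loop
// and the returned block is the continuation of the original MBB.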
2006
2007unsigned SIInstrInfo::getNumWaitStates(const MachineInstr &MI) {
2008 switch (MI.getOpcode()) {
2009 default:
2010 if (MI.isMetaInstruction())
2011 return 0;
2012 return 1; // FIXME: Do wait states equal cycles?
2013
2014 case AMDGPU::S_NOP:
2015 return MI.getOperand(0).getImm() + 1;
2016 // SI_RETURN_TO_EPILOG is a fallthrough to code outside of the function. The
2017 // hazard, even if one exists, won't really be visible. Should we handle it?
2018 }
2019}
2020
2021bool SIInstrInfo::expandPostRAPseudo(MachineInstr &MI) const {
2022 MachineBasicBlock &MBB = *MI.getParent();
2023 DebugLoc DL = MBB.findDebugLoc(MI);
2025 switch (MI.getOpcode()) {
2026 default: return TargetInstrInfo::expandPostRAPseudo(MI);
2027 case AMDGPU::S_MOV_B64_term:
2028 // This is only a terminator to get the correct spill code placement during
2029 // register allocation.
2030 MI.setDesc(get(AMDGPU::S_MOV_B64));
2031 break;
2032
2033 case AMDGPU::S_MOV_B32_term:
2034 // This is only a terminator to get the correct spill code placement during
2035 // register allocation.
2036 MI.setDesc(get(AMDGPU::S_MOV_B32));
2037 break;
2038
2039 case AMDGPU::S_XOR_B64_term:
2040 // This is only a terminator to get the correct spill code placement during
2041 // register allocation.
2042 MI.setDesc(get(AMDGPU::S_XOR_B64));
2043 break;
2044
2045 case AMDGPU::S_XOR_B32_term:
2046 // This is only a terminator to get the correct spill code placement during
2047 // register allocation.
2048 MI.setDesc(get(AMDGPU::S_XOR_B32));
2049 break;
2050 case AMDGPU::S_OR_B64_term:
2051 // This is only a terminator to get the correct spill code placement during
2052 // register allocation.
2053 MI.setDesc(get(AMDGPU::S_OR_B64));
2054 break;
2055 case AMDGPU::S_OR_B32_term:
2056 // This is only a terminator to get the correct spill code placement during
2057 // register allocation.
2058 MI.setDesc(get(AMDGPU::S_OR_B32));
2059 break;
2060
2061 case AMDGPU::S_ANDN2_B64_term:
2062 // This is only a terminator to get the correct spill code placement during
2063 // register allocation.
2064 MI.setDesc(get(AMDGPU::S_ANDN2_B64));
2065 break;
2066
2067 case AMDGPU::S_ANDN2_B32_term:
2068 // This is only a terminator to get the correct spill code placement during
2069 // register allocation.
2070 MI.setDesc(get(AMDGPU::S_ANDN2_B32));
2071 break;
2072
2073 case AMDGPU::S_AND_B64_term:
2074 // This is only a terminator to get the correct spill code placement during
2075 // register allocation.
2076 MI.setDesc(get(AMDGPU::S_AND_B64));
2077 break;
2078
2079 case AMDGPU::S_AND_B32_term:
2080 // This is only a terminator to get the correct spill code placement during
2081 // register allocation.
2082 MI.setDesc(get(AMDGPU::S_AND_B32));
2083 break;
2084
2085 case AMDGPU::S_AND_SAVEEXEC_B64_term:
2086 // This is only a terminator to get the correct spill code placement during
2087 // register allocation.
2088 MI.setDesc(get(AMDGPU::S_AND_SAVEEXEC_B64));
2089 break;
2090
2091 case AMDGPU::S_AND_SAVEEXEC_B32_term:
2092 // This is only a terminator to get the correct spill code placement during
2093 // register allocation.
2094 MI.setDesc(get(AMDGPU::S_AND_SAVEEXEC_B32));
2095 break;
2096
2097 case AMDGPU::SI_SPILL_S32_TO_VGPR:
2098 MI.setDesc(get(AMDGPU::V_WRITELANE_B32));
2099 break;
2100
2101 case AMDGPU::SI_RESTORE_S32_FROM_VGPR:
2102 MI.setDesc(get(AMDGPU::V_READLANE_B32));
2103 break;
2104 case AMDGPU::AV_MOV_B32_IMM_PSEUDO: {
2105 Register Dst = MI.getOperand(0).getReg();
2106 bool IsAGPR = SIRegisterInfo::isAGPRClass(RI.getPhysRegBaseClass(Dst));
2107 MI.setDesc(
2108 get(IsAGPR ? AMDGPU::V_ACCVGPR_WRITE_B32_e64 : AMDGPU::V_MOV_B32_e32));
2109 break;
2110 }
2111 case AMDGPU::AV_MOV_B64_IMM_PSEUDO: {
2112 Register Dst = MI.getOperand(0).getReg();
2113 if (SIRegisterInfo::isAGPRClass(RI.getPhysRegBaseClass(Dst))) {
2114 int64_t Imm = MI.getOperand(1).getImm();
2115
2116 Register DstLo = RI.getSubReg(Dst, AMDGPU::sub0);
2117 Register DstHi = RI.getSubReg(Dst, AMDGPU::sub1);
2118 BuildMI(MBB, MI, DL, get(AMDGPU::V_ACCVGPR_WRITE_B32_e64), DstLo)
2119 .addImm(SignExtend64<32>(Imm))
2120 .addReg(Dst, RegState::Implicit | RegState::Define);
2121 BuildMI(MBB, MI, DL, get(AMDGPU::V_ACCVGPR_WRITE_B32_e64), DstHi)
2122 .addImm(SignExtend64<32>(Imm >> 32))
2123 .addReg(Dst, RegState::Implicit | RegState::Define);
2124 MI.eraseFromParent();
2125 break;
2126 }
2127
2128 [[fallthrough]];
2129 }
2130 case AMDGPU::V_MOV_B64_PSEUDO: {
2131 Register Dst = MI.getOperand(0).getReg();
2132 Register DstLo = RI.getSubReg(Dst, AMDGPU::sub0);
2133 Register DstHi = RI.getSubReg(Dst, AMDGPU::sub1);
2134
2135 const MachineOperand &SrcOp = MI.getOperand(1);
2136 // FIXME: Will this work for 64-bit floating point immediates?
2137 assert(!SrcOp.isFPImm());
2138 if (ST.hasMovB64()) {
2139 MI.setDesc(get(AMDGPU::V_MOV_B64_e32));
2140 if (SrcOp.isReg() || isInlineConstant(MI, 1) ||
2141 isUInt<32>(SrcOp.getImm()) || ST.has64BitLiterals())
2142 break;
2143 }
2144 if (SrcOp.isImm()) {
2145 APInt Imm(64, SrcOp.getImm());
2146 APInt Lo(32, Imm.getLoBits(32).getZExtValue());
2147 APInt Hi(32, Imm.getHiBits(32).getZExtValue());
2148 if (ST.hasPkMovB32() && Lo == Hi && isInlineConstant(Lo)) {
2149 BuildMI(MBB, MI, DL, get(AMDGPU::V_PK_MOV_B32), Dst)
2150 .addImm(SISrcMods::OP_SEL_1)
2151 .addImm(Lo.getSExtValue())
2152 .addImm(SISrcMods::OP_SEL_1)
2153 .addImm(Lo.getSExtValue())
2154 .addImm(0) // op_sel_lo
2155 .addImm(0) // op_sel_hi
2156 .addImm(0) // neg_lo
2157 .addImm(0) // neg_hi
2158 .addImm(0); // clamp
2159 } else {
2160 BuildMI(MBB, MI, DL, get(AMDGPU::V_MOV_B32_e32), DstLo)
2161 .addImm(Lo.getSExtValue())
2162 .addReg(Dst, RegState::Implicit | RegState::Define);
2163 BuildMI(MBB, MI, DL, get(AMDGPU::V_MOV_B32_e32), DstHi)
2164 .addImm(Hi.getSExtValue())
2165 .addReg(Dst, RegState::Implicit | RegState::Define);
2166 }
2167 } else {
2168 assert(SrcOp.isReg());
2169 if (ST.hasPkMovB32() &&
2170 !RI.isAGPR(MBB.getParent()->getRegInfo(), SrcOp.getReg())) {
2171 BuildMI(MBB, MI, DL, get(AMDGPU::V_PK_MOV_B32), Dst)
2172 .addImm(SISrcMods::OP_SEL_1) // src0_mod
2173 .addReg(SrcOp.getReg())
2174 .addImm(SISrcMods::OP_SEL_1) // src1_mod
2175 .addReg(SrcOp.getReg())
2176 .addImm(0) // op_sel_lo
2177 .addImm(0) // op_sel_hi
2178 .addImm(0) // neg_lo
2179 .addImm(0) // neg_hi
2180 .addImm(0); // clamp
2181 } else {
2182 BuildMI(MBB, MI, DL, get(AMDGPU::V_MOV_B32_e32), DstLo)
2183 .addReg(RI.getSubReg(SrcOp.getReg(), AMDGPU::sub0))
2184 .addReg(Dst, RegState::Implicit | RegState::Define);
2185 BuildMI(MBB, MI, DL, get(AMDGPU::V_MOV_B32_e32), DstHi)
2186 .addReg(RI.getSubReg(SrcOp.getReg(), AMDGPU::sub1))
2187 .addReg(Dst, RegState::Implicit | RegState::Define);
2188 }
2189 }
2190 MI.eraseFromParent();
2191 break;
2192 }
2193 case AMDGPU::V_MOV_B64_DPP_PSEUDO: {
2194 expandMovDPP64(MI);
2195 break;
2196 }
2197 case AMDGPU::S_MOV_B64_IMM_PSEUDO: {
2198 const MachineOperand &SrcOp = MI.getOperand(1);
2199 assert(!SrcOp.isFPImm());
2200
2201 if (ST.has64BitLiterals()) {
2202 MI.setDesc(get(AMDGPU::S_MOV_B64));
2203 break;
2204 }
2205
2206 APInt Imm(64, SrcOp.getImm());
2207 if (Imm.isIntN(32) || isInlineConstant(Imm)) {
2208 MI.setDesc(get(AMDGPU::S_MOV_B64));
2209 break;
2210 }
2211
2212 Register Dst = MI.getOperand(0).getReg();
2213 Register DstLo = RI.getSubReg(Dst, AMDGPU::sub0);
2214 Register DstHi = RI.getSubReg(Dst, AMDGPU::sub1);
2215
2216 APInt Lo(32, Imm.getLoBits(32).getZExtValue());
2217 APInt Hi(32, Imm.getHiBits(32).getZExtValue());
2218 BuildMI(MBB, MI, DL, get(AMDGPU::S_MOV_B32), DstLo)
2219 .addImm(Lo.getSExtValue())
2220 .addReg(Dst, RegState::Implicit | RegState::Define);
2221 BuildMI(MBB, MI, DL, get(AMDGPU::S_MOV_B32), DstHi)
2222 .addImm(Hi.getSExtValue())
2223 .addReg(Dst, RegState::Implicit | RegState::Define);
2224 MI.eraseFromParent();
2225 break;
2226 }
2227 case AMDGPU::V_SET_INACTIVE_B32: {
2228 // Lower V_SET_INACTIVE_B32 to V_CNDMASK_B32.
2229 Register DstReg = MI.getOperand(0).getReg();
2230 BuildMI(MBB, MI, DL, get(AMDGPU::V_CNDMASK_B32_e64), DstReg)
2231 .add(MI.getOperand(3))
2232 .add(MI.getOperand(4))
2233 .add(MI.getOperand(1))
2234 .add(MI.getOperand(2))
2235 .add(MI.getOperand(5));
2236 MI.eraseFromParent();
2237 break;
2238 }
2239 case AMDGPU::V_INDIRECT_REG_WRITE_MOVREL_B32_V1:
2240 case AMDGPU::V_INDIRECT_REG_WRITE_MOVREL_B32_V2:
2241 case AMDGPU::V_INDIRECT_REG_WRITE_MOVREL_B32_V3:
2242 case AMDGPU::V_INDIRECT_REG_WRITE_MOVREL_B32_V4:
2243 case AMDGPU::V_INDIRECT_REG_WRITE_MOVREL_B32_V5:
2244 case AMDGPU::V_INDIRECT_REG_WRITE_MOVREL_B32_V8:
2245 case AMDGPU::V_INDIRECT_REG_WRITE_MOVREL_B32_V9:
2246 case AMDGPU::V_INDIRECT_REG_WRITE_MOVREL_B32_V10:
2247 case AMDGPU::V_INDIRECT_REG_WRITE_MOVREL_B32_V11:
2248 case AMDGPU::V_INDIRECT_REG_WRITE_MOVREL_B32_V12:
2249 case AMDGPU::V_INDIRECT_REG_WRITE_MOVREL_B32_V16:
2250 case AMDGPU::V_INDIRECT_REG_WRITE_MOVREL_B32_V32:
2251 case AMDGPU::S_INDIRECT_REG_WRITE_MOVREL_B32_V1:
2252 case AMDGPU::S_INDIRECT_REG_WRITE_MOVREL_B32_V2:
2253 case AMDGPU::S_INDIRECT_REG_WRITE_MOVREL_B32_V3:
2254 case AMDGPU::S_INDIRECT_REG_WRITE_MOVREL_B32_V4:
2255 case AMDGPU::S_INDIRECT_REG_WRITE_MOVREL_B32_V5:
2256 case AMDGPU::S_INDIRECT_REG_WRITE_MOVREL_B32_V8:
2257 case AMDGPU::S_INDIRECT_REG_WRITE_MOVREL_B32_V9:
2258 case AMDGPU::S_INDIRECT_REG_WRITE_MOVREL_B32_V10:
2259 case AMDGPU::S_INDIRECT_REG_WRITE_MOVREL_B32_V11:
2260 case AMDGPU::S_INDIRECT_REG_WRITE_MOVREL_B32_V12:
2261 case AMDGPU::S_INDIRECT_REG_WRITE_MOVREL_B32_V16:
2262 case AMDGPU::S_INDIRECT_REG_WRITE_MOVREL_B32_V32:
2263 case AMDGPU::S_INDIRECT_REG_WRITE_MOVREL_B64_V1:
2264 case AMDGPU::S_INDIRECT_REG_WRITE_MOVREL_B64_V2:
2265 case AMDGPU::S_INDIRECT_REG_WRITE_MOVREL_B64_V4:
2266 case AMDGPU::S_INDIRECT_REG_WRITE_MOVREL_B64_V8:
2267 case AMDGPU::S_INDIRECT_REG_WRITE_MOVREL_B64_V16: {
2268 const TargetRegisterClass *EltRC = getOpRegClass(MI, 2);
2269
2270 unsigned Opc;
2271 if (RI.hasVGPRs(EltRC)) {
2272 Opc = AMDGPU::V_MOVRELD_B32_e32;
2273 } else {
2274 Opc = RI.getRegSizeInBits(*EltRC) == 64 ? AMDGPU::S_MOVRELD_B64
2275 : AMDGPU::S_MOVRELD_B32;
2276 }
2277
2278 const MCInstrDesc &OpDesc = get(Opc);
2279 Register VecReg = MI.getOperand(0).getReg();
2280 bool IsUndef = MI.getOperand(1).isUndef();
2281 unsigned SubReg = MI.getOperand(3).getImm();
2282 assert(VecReg == MI.getOperand(1).getReg());
2283
2285 BuildMI(MBB, MI, DL, OpDesc)
2286 .addReg(RI.getSubReg(VecReg, SubReg), RegState::Undef)
2287 .add(MI.getOperand(2))
2289 .addReg(VecReg, RegState::Implicit | (IsUndef ? RegState::Undef : 0));
2290
2291 const int ImpDefIdx =
2292 OpDesc.getNumOperands() + OpDesc.implicit_uses().size();
2293 const int ImpUseIdx = ImpDefIdx + 1;
2294 MIB->tieOperands(ImpDefIdx, ImpUseIdx);
2295 MI.eraseFromParent();
2296 break;
2297 }
2298 case AMDGPU::V_INDIRECT_REG_WRITE_GPR_IDX_B32_V1:
2299 case AMDGPU::V_INDIRECT_REG_WRITE_GPR_IDX_B32_V2:
2300 case AMDGPU::V_INDIRECT_REG_WRITE_GPR_IDX_B32_V3:
2301 case AMDGPU::V_INDIRECT_REG_WRITE_GPR_IDX_B32_V4:
2302 case AMDGPU::V_INDIRECT_REG_WRITE_GPR_IDX_B32_V5:
2303 case AMDGPU::V_INDIRECT_REG_WRITE_GPR_IDX_B32_V8:
2304 case AMDGPU::V_INDIRECT_REG_WRITE_GPR_IDX_B32_V9:
2305 case AMDGPU::V_INDIRECT_REG_WRITE_GPR_IDX_B32_V10:
2306 case AMDGPU::V_INDIRECT_REG_WRITE_GPR_IDX_B32_V11:
2307 case AMDGPU::V_INDIRECT_REG_WRITE_GPR_IDX_B32_V12:
2308 case AMDGPU::V_INDIRECT_REG_WRITE_GPR_IDX_B32_V16:
2309 case AMDGPU::V_INDIRECT_REG_WRITE_GPR_IDX_B32_V32: {
2310 assert(ST.useVGPRIndexMode());
2311 Register VecReg = MI.getOperand(0).getReg();
2312 bool IsUndef = MI.getOperand(1).isUndef();
2313 MachineOperand &Idx = MI.getOperand(3);
2314 Register SubReg = MI.getOperand(4).getImm();
2315
2316 MachineInstr *SetOn = BuildMI(MBB, MI, DL, get(AMDGPU::S_SET_GPR_IDX_ON))
2317 .add(Idx)
2319 SetOn->getOperand(3).setIsUndef();
2320
2321 const MCInstrDesc &OpDesc = get(AMDGPU::V_MOV_B32_indirect_write);
2323 BuildMI(MBB, MI, DL, OpDesc)
2324 .addReg(RI.getSubReg(VecReg, SubReg), RegState::Undef)
2325 .add(MI.getOperand(2))
2327 .addReg(VecReg,
2328 RegState::Implicit | (IsUndef ? RegState::Undef : 0));
2329
2330 const int ImpDefIdx =
2331 OpDesc.getNumOperands() + OpDesc.implicit_uses().size();
2332 const int ImpUseIdx = ImpDefIdx + 1;
2333 MIB->tieOperands(ImpDefIdx, ImpUseIdx);
2334
2335 MachineInstr *SetOff = BuildMI(MBB, MI, DL, get(AMDGPU::S_SET_GPR_IDX_OFF));
2336
2337 finalizeBundle(MBB, SetOn->getIterator(), std::next(SetOff->getIterator()));
2338
2339 MI.eraseFromParent();
2340 break;
2341 }
2342 case AMDGPU::V_INDIRECT_REG_READ_GPR_IDX_B32_V1:
2343 case AMDGPU::V_INDIRECT_REG_READ_GPR_IDX_B32_V2:
2344 case AMDGPU::V_INDIRECT_REG_READ_GPR_IDX_B32_V3:
2345 case AMDGPU::V_INDIRECT_REG_READ_GPR_IDX_B32_V4:
2346 case AMDGPU::V_INDIRECT_REG_READ_GPR_IDX_B32_V5:
2347 case AMDGPU::V_INDIRECT_REG_READ_GPR_IDX_B32_V8:
2348 case AMDGPU::V_INDIRECT_REG_READ_GPR_IDX_B32_V9:
2349 case AMDGPU::V_INDIRECT_REG_READ_GPR_IDX_B32_V10:
2350 case AMDGPU::V_INDIRECT_REG_READ_GPR_IDX_B32_V11:
2351 case AMDGPU::V_INDIRECT_REG_READ_GPR_IDX_B32_V12:
2352 case AMDGPU::V_INDIRECT_REG_READ_GPR_IDX_B32_V16:
2353 case AMDGPU::V_INDIRECT_REG_READ_GPR_IDX_B32_V32: {
2354 assert(ST.useVGPRIndexMode());
2355 Register Dst = MI.getOperand(0).getReg();
2356 Register VecReg = MI.getOperand(1).getReg();
2357 bool IsUndef = MI.getOperand(1).isUndef();
2358 Register Idx = MI.getOperand(2).getReg();
2359 Register SubReg = MI.getOperand(3).getImm();
2360
2361 MachineInstr *SetOn = BuildMI(MBB, MI, DL, get(AMDGPU::S_SET_GPR_IDX_ON))
2362 .addReg(Idx)
2364 SetOn->getOperand(3).setIsUndef();
2365
2366 BuildMI(MBB, MI, DL, get(AMDGPU::V_MOV_B32_indirect_read))
2367 .addDef(Dst)
2368 .addReg(RI.getSubReg(VecReg, SubReg), RegState::Undef)
2369 .addReg(VecReg, RegState::Implicit | (IsUndef ? RegState::Undef : 0));
2370
2371 MachineInstr *SetOff = BuildMI(MBB, MI, DL, get(AMDGPU::S_SET_GPR_IDX_OFF));
2372
2373 finalizeBundle(MBB, SetOn->getIterator(), std::next(SetOff->getIterator()));
2374
2375 MI.eraseFromParent();
2376 break;
2377 }
2378 case AMDGPU::SI_PC_ADD_REL_OFFSET: {
2379 MachineFunction &MF = *MBB.getParent();
2380 Register Reg = MI.getOperand(0).getReg();
2381 Register RegLo = RI.getSubReg(Reg, AMDGPU::sub0);
2382 Register RegHi = RI.getSubReg(Reg, AMDGPU::sub1);
2383 MachineOperand OpLo = MI.getOperand(1);
2384 MachineOperand OpHi = MI.getOperand(2);
2385
2386 // Create a bundle so these instructions won't be re-ordered by the
2387 // post-RA scheduler.
2388 MIBundleBuilder Bundler(MBB, MI);
2389 Bundler.append(BuildMI(MF, DL, get(AMDGPU::S_GETPC_B64), Reg));
2390
2391 // What we want here is an offset from the value returned by s_getpc (which
2392 // is the address of the s_add_u32 instruction) to the global variable, but
2393 // since the encoding of $symbol starts 4 bytes after the start of the
2394 // s_add_u32 instruction, we end up with an offset that is 4 bytes too
2395 // small. This requires us to add 4 to the global variable offset in order
2396 // to compute the correct address. Similarly for the s_addc_u32 instruction,
2397 // the encoding of $symbol starts 12 bytes after the start of the s_add_u32
2398 // instruction.
2399
2400 int64_t Adjust = 0;
2401 if (ST.hasGetPCZeroExtension()) {
2402 // Fix up hardware that does not sign-extend the 48-bit PC value by
2403 // inserting: s_sext_i32_i16 reghi, reghi
2404 Bundler.append(
2405 BuildMI(MF, DL, get(AMDGPU::S_SEXT_I32_I16), RegHi).addReg(RegHi));
2406 Adjust += 4;
2407 }
2408
2409 if (OpLo.isGlobal())
2410 OpLo.setOffset(OpLo.getOffset() + Adjust + 4);
2411 Bundler.append(
2412 BuildMI(MF, DL, get(AMDGPU::S_ADD_U32), RegLo).addReg(RegLo).add(OpLo));
2413
2414 if (OpHi.isGlobal())
2415 OpHi.setOffset(OpHi.getOffset() + Adjust + 12);
2416 Bundler.append(BuildMI(MF, DL, get(AMDGPU::S_ADDC_U32), RegHi)
2417 .addReg(RegHi)
2418 .add(OpHi));
2419
2420 finalizeBundle(MBB, Bundler.begin());
2421
2422 MI.eraseFromParent();
2423 break;
2424 }
2425 case AMDGPU::SI_PC_ADD_REL_OFFSET64: {
2426 MachineFunction &MF = *MBB.getParent();
2427 Register Reg = MI.getOperand(0).getReg();
2428 MachineOperand Op = MI.getOperand(1);
2429
2430 // Create a bundle so these instructions won't be re-ordered by the
2431 // post-RA scheduler.
2432 MIBundleBuilder Bundler(MBB, MI);
2433 Bundler.append(BuildMI(MF, DL, get(AMDGPU::S_GETPC_B64), Reg));
2434 if (Op.isGlobal())
2435 Op.setOffset(Op.getOffset() + 4);
2436 Bundler.append(
2437 BuildMI(MF, DL, get(AMDGPU::S_ADD_U64), Reg).addReg(Reg).add(Op));
2438
2439 finalizeBundle(MBB, Bundler.begin());
2440
2441 MI.eraseFromParent();
2442 break;
2443 }
2444 case AMDGPU::ENTER_STRICT_WWM: {
2445 // This only gets its own opcode so that SIPreAllocateWWMRegs can tell when
2446 // Whole Wave Mode is entered.
2447 MI.setDesc(get(LMC.OrSaveExecOpc));
2448 break;
2449 }
2450 case AMDGPU::ENTER_STRICT_WQM: {
2451 // This only gets its own opcode so that SIPreAllocateWWMRegs can tell when
2452 // STRICT_WQM is entered.
2453 BuildMI(MBB, MI, DL, get(LMC.MovOpc), MI.getOperand(0).getReg())
2454 .addReg(LMC.ExecReg);
2455 BuildMI(MBB, MI, DL, get(LMC.WQMOpc), LMC.ExecReg).addReg(LMC.ExecReg);
2456
2457 MI.eraseFromParent();
2458 break;
2459 }
2460 case AMDGPU::EXIT_STRICT_WWM:
2461 case AMDGPU::EXIT_STRICT_WQM: {
2462 // This only gets its own opcode so that SIPreAllocateWWMRegs can tell when
2463 // WWM/STRICT_WQM is exited.
2464 MI.setDesc(get(LMC.MovOpc));
2465 break;
2466 }
2467 case AMDGPU::SI_RETURN: {
2468 const MachineFunction *MF = MBB.getParent();
2469 const GCNSubtarget &ST = MF->getSubtarget<GCNSubtarget>();
2470 const SIRegisterInfo *TRI = ST.getRegisterInfo();
2471 // Hiding the return address use with SI_RETURN may lead to extra kills in
2472 // the function and missing live-ins. We are fine in practice because callee
2473 // saved register handling ensures the register value is restored before
2474 // RET, but we need the undef flag here to appease the MachineVerifier
2475 // liveness checks.
2476 MachineInstrBuilder MIB =
2477 BuildMI(MBB, MI, DL, get(AMDGPU::S_SETPC_B64_return))
2478 .addReg(TRI->getReturnAddressReg(*MF), RegState::Undef);
2479
2480 MIB.copyImplicitOps(MI);
2481 MI.eraseFromParent();
2482 break;
2483 }
2484
2485 case AMDGPU::S_MUL_U64_U32_PSEUDO:
2486 case AMDGPU::S_MUL_I64_I32_PSEUDO:
2487 MI.setDesc(get(AMDGPU::S_MUL_U64));
2488 break;
2489
2490 case AMDGPU::S_GETPC_B64_pseudo:
2491 MI.setDesc(get(AMDGPU::S_GETPC_B64));
2492 if (ST.hasGetPCZeroExtension()) {
2493 Register Dst = MI.getOperand(0).getReg();
2494 Register DstHi = RI.getSubReg(Dst, AMDGPU::sub1);
2495 // Fix up hardware that does not sign-extend the 48-bit PC value by
2496 // inserting: s_sext_i32_i16 dsthi, dsthi
2497 BuildMI(MBB, std::next(MI.getIterator()), DL, get(AMDGPU::S_SEXT_I32_I16),
2498 DstHi)
2499 .addReg(DstHi);
2500 }
2501 break;
2502
2503 case AMDGPU::V_MAX_BF16_PSEUDO_e64:
2504 assert(ST.hasBF16PackedInsts());
2505 MI.setDesc(get(AMDGPU::V_PK_MAX_NUM_BF16));
2506 MI.addOperand(MachineOperand::CreateImm(0)); // op_sel
2507 MI.addOperand(MachineOperand::CreateImm(0)); // neg_lo
2508 MI.addOperand(MachineOperand::CreateImm(0)); // neg_hi
2509 auto Op0 = getNamedOperand(MI, AMDGPU::OpName::src0_modifiers);
2510 Op0->setImm(Op0->getImm() | SISrcMods::OP_SEL_1);
2511 auto Op1 = getNamedOperand(MI, AMDGPU::OpName::src1_modifiers);
2512 Op1->setImm(Op1->getImm() | SISrcMods::OP_SEL_1);
2513 break;
2514 }
2515
2516 return true;
2517}
2518
2521 unsigned SubIdx, const MachineInstr &Orig,
2522 const TargetRegisterInfo &RI) const {
2523
2524 // Try shrinking the instruction to remat only the part needed for the
2525 // current context.
2526 // TODO: Handle more cases.
2527 unsigned Opcode = Orig.getOpcode();
2528 switch (Opcode) {
2529 case AMDGPU::S_LOAD_DWORDX16_IMM:
2530 case AMDGPU::S_LOAD_DWORDX8_IMM: {
2531 if (SubIdx != 0)
2532 break;
2533
2534 if (I == MBB.end())
2535 break;
2536
2537 if (I->isBundled())
2538 break;
2539
2540 // Look for a single use of the register that is also a subreg.
2541 Register RegToFind = Orig.getOperand(0).getReg();
2542 MachineOperand *UseMO = nullptr;
2543 for (auto &CandMO : I->operands()) {
2544 if (!CandMO.isReg() || CandMO.getReg() != RegToFind || CandMO.isDef())
2545 continue;
2546 if (UseMO) {
2547 UseMO = nullptr;
2548 break;
2549 }
2550 UseMO = &CandMO;
2551 }
2552 if (!UseMO || UseMO->getSubReg() == AMDGPU::NoSubRegister)
2553 break;
2554
2555 unsigned Offset = RI.getSubRegIdxOffset(UseMO->getSubReg());
2556 unsigned SubregSize = RI.getSubRegIdxSize(UseMO->getSubReg());
2557
2558 MachineFunction *MF = MBB.getParent();
2559 MachineRegisterInfo &MRI = MF->getRegInfo();
2560 assert(MRI.use_nodbg_empty(DestReg) && "DestReg should have no users yet.");
2561
2562 unsigned NewOpcode = -1;
2563 if (SubregSize == 256)
2564 NewOpcode = AMDGPU::S_LOAD_DWORDX8_IMM;
2565 else if (SubregSize == 128)
2566 NewOpcode = AMDGPU::S_LOAD_DWORDX4_IMM;
2567 else
2568 break;
2569
2570 const MCInstrDesc &TID = get(NewOpcode);
2571 const TargetRegisterClass *NewRC =
2572 RI.getAllocatableClass(getRegClass(TID, 0, &RI));
2573 MRI.setRegClass(DestReg, NewRC);
2574
2575 UseMO->setReg(DestReg);
2576 UseMO->setSubReg(AMDGPU::NoSubRegister);
2577
2578 // Use a smaller load with the desired size, possibly with updated offset.
2579 MachineInstr *MI = MF->CloneMachineInstr(&Orig);
2580 MI->setDesc(TID);
2581 MI->getOperand(0).setReg(DestReg);
2582 MI->getOperand(0).setSubReg(AMDGPU::NoSubRegister);
2583 if (Offset) {
2584 MachineOperand *OffsetMO = getNamedOperand(*MI, AMDGPU::OpName::offset);
2585 int64_t FinalOffset = OffsetMO->getImm() + Offset / 8;
2586 OffsetMO->setImm(FinalOffset);
2587 }
2588 SmallVector<MachineMemOperand *> NewMMOs;
2589 for (const MachineMemOperand *MemOp : Orig.memoperands())
2590 NewMMOs.push_back(MF->getMachineMemOperand(MemOp, MemOp->getPointerInfo(),
2591 SubregSize / 8));
2592 MI->setMemRefs(*MF, NewMMOs);
2593
2594 MBB.insert(I, MI);
2595 return;
2596 }
2597
2598 default:
2599 break;
2600 }
2601
2602 TargetInstrInfo::reMaterialize(MBB, I, DestReg, SubIdx, Orig, RI);
2603}
2604
2605std::pair<MachineInstr*, MachineInstr*>
2607 assert (MI.getOpcode() == AMDGPU::V_MOV_B64_DPP_PSEUDO);
2608
2609 if (ST.hasMovB64() && ST.hasFeature(AMDGPU::FeatureDPALU_DPP) &&
2611 ST, getNamedOperand(MI, AMDGPU::OpName::dpp_ctrl)->getImm())) {
2612 MI.setDesc(get(AMDGPU::V_MOV_B64_dpp));
2613 return std::pair(&MI, nullptr);
2614 }
2615
2616 MachineBasicBlock &MBB = *MI.getParent();
2617 DebugLoc DL = MBB.findDebugLoc(MI);
2618 MachineFunction *MF = MBB.getParent();
2619 MachineRegisterInfo &MRI = MF->getRegInfo();
2620 Register Dst = MI.getOperand(0).getReg();
2621 unsigned Part = 0;
2622 MachineInstr *Split[2];
2623
2624 for (auto Sub : { AMDGPU::sub0, AMDGPU::sub1 }) {
2625 auto MovDPP = BuildMI(MBB, MI, DL, get(AMDGPU::V_MOV_B32_dpp));
2626 if (Dst.isPhysical()) {
2627 MovDPP.addDef(RI.getSubReg(Dst, Sub));
2628 } else {
2629 assert(MRI.isSSA());
2630 auto Tmp = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
2631 MovDPP.addDef(Tmp);
2632 }
2633
2634 for (unsigned I = 1; I <= 2; ++I) { // old and src operands.
2635 const MachineOperand &SrcOp = MI.getOperand(I);
2636 assert(!SrcOp.isFPImm());
2637 if (SrcOp.isImm()) {
2638 APInt Imm(64, SrcOp.getImm());
2639 Imm.ashrInPlace(Part * 32);
2640 MovDPP.addImm(Imm.getLoBits(32).getZExtValue());
2641 } else {
2642 assert(SrcOp.isReg());
2643 Register Src = SrcOp.getReg();
2644 if (Src.isPhysical())
2645 MovDPP.addReg(RI.getSubReg(Src, Sub));
2646 else
2647 MovDPP.addReg(Src, SrcOp.isUndef() ? RegState::Undef : 0, Sub);
2648 }
2649 }
2650
2651 for (const MachineOperand &MO : llvm::drop_begin(MI.explicit_operands(), 3))
2652 MovDPP.addImm(MO.getImm());
2653
2654 Split[Part] = MovDPP;
2655 ++Part;
2656 }
2657
2658 if (Dst.isVirtual())
2659 BuildMI(MBB, MI, DL, get(AMDGPU::REG_SEQUENCE), Dst)
2660 .addReg(Split[0]->getOperand(0).getReg())
2661 .addImm(AMDGPU::sub0)
2662 .addReg(Split[1]->getOperand(0).getReg())
2663 .addImm(AMDGPU::sub1);
2664
2665 MI.eraseFromParent();
2666 return std::pair(Split[0], Split[1]);
2667}
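// For illustration, on a subtarget without a usable 64-bit DPP mov the pseudo
//   %d:vreg_64 = V_MOV_B64_DPP_PSEUDO %old, %src, dpp_ctrl, ...
// is split into two V_MOV_B32_dpp instructions operating on sub0 and sub1
// (with a 64-bit immediate split into its low and high halves), followed by a
// REG_SEQUENCE that reassembles %d from the two 32-bit results.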
2668
2669std::optional<DestSourcePair>
2671 if (MI.getOpcode() == AMDGPU::WWM_COPY)
2672 return DestSourcePair{MI.getOperand(0), MI.getOperand(1)};
2673
2674 return std::nullopt;
2675}
2676
2677bool SIInstrInfo::swapSourceModifiers(MachineInstr &MI, MachineOperand &Src0,
2678 AMDGPU::OpName Src0OpName,
2679 MachineOperand &Src1,
2680 AMDGPU::OpName Src1OpName) const {
2681 MachineOperand *Src0Mods = getNamedOperand(MI, Src0OpName);
2682 if (!Src0Mods)
2683 return false;
2684
2685 MachineOperand *Src1Mods = getNamedOperand(MI, Src1OpName);
2686 assert(Src1Mods &&
2687 "All commutable instructions have both src0 and src1 modifiers");
2688
2689 int Src0ModsVal = Src0Mods->getImm();
2690 int Src1ModsVal = Src1Mods->getImm();
2691
2692 Src1Mods->setImm(Src0ModsVal);
2693 Src0Mods->setImm(Src1ModsVal);
2694 return true;
2695}
2696
2697static MachineInstr *swapRegAndNonRegOperand(MachineInstr &MI,
2698 MachineOperand &RegOp,
2699 MachineOperand &NonRegOp) {
2700 Register Reg = RegOp.getReg();
2701 unsigned SubReg = RegOp.getSubReg();
2702 bool IsKill = RegOp.isKill();
2703 bool IsDead = RegOp.isDead();
2704 bool IsUndef = RegOp.isUndef();
2705 bool IsDebug = RegOp.isDebug();
2706
2707 if (NonRegOp.isImm())
2708 RegOp.ChangeToImmediate(NonRegOp.getImm());
2709 else if (NonRegOp.isFI())
2710 RegOp.ChangeToFrameIndex(NonRegOp.getIndex());
2711 else if (NonRegOp.isGlobal()) {
2712 RegOp.ChangeToGA(NonRegOp.getGlobal(), NonRegOp.getOffset(),
2713 NonRegOp.getTargetFlags());
2714 } else
2715 return nullptr;
2716
2717 // Make sure we don't reinterpret a subreg index in the target flags.
2718 RegOp.setTargetFlags(NonRegOp.getTargetFlags());
2719
2720 NonRegOp.ChangeToRegister(Reg, false, false, IsKill, IsDead, IsUndef, IsDebug);
2721 NonRegOp.setSubReg(SubReg);
2722
2723 return &MI;
2724}
2725
2726static MachineInstr *swapImmOperands(MachineInstr &MI,
2727 MachineOperand &NonRegOp1,
2728 MachineOperand &NonRegOp2) {
2729 unsigned TargetFlags = NonRegOp1.getTargetFlags();
2730 int64_t NonRegVal = NonRegOp1.getImm();
2731
2732 NonRegOp1.setImm(NonRegOp2.getImm());
2733 NonRegOp2.setImm(NonRegVal);
2734 NonRegOp1.setTargetFlags(NonRegOp2.getTargetFlags());
2735 NonRegOp2.setTargetFlags(TargetFlags);
2736 return &MI;
2737}
2738
2739bool SIInstrInfo::isLegalToSwap(const MachineInstr &MI, unsigned OpIdx0,
2740 unsigned OpIdx1) const {
2741 const MCInstrDesc &InstDesc = MI.getDesc();
2742 const MCOperandInfo &OpInfo0 = InstDesc.operands()[OpIdx0];
2743 const MCOperandInfo &OpInfo1 = InstDesc.operands()[OpIdx1];
2744
2745 unsigned Opc = MI.getOpcode();
2746 int Src0Idx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::src0);
2747
2748 const MachineOperand &MO0 = MI.getOperand(OpIdx0);
2749 const MachineOperand &MO1 = MI.getOperand(OpIdx1);
2750
2751 // Swapping does not breach the constant bus or literal limits. It may,
2752 // however, move a literal to a position other than src0, which is not
2753 // allowed pre-gfx10. Most test cases still expect literals in src0 for VOP.
2754 // FIXME: After gfx9, a literal can be placed somewhere other than src0.
2755 if (isVALU(MI)) {
2756 if ((int)OpIdx0 == Src0Idx && !MO0.isReg() &&
2757 !isInlineConstant(MO0, OpInfo1))
2758 return false;
2759 if ((int)OpIdx1 == Src0Idx && !MO1.isReg() &&
2760 !isInlineConstant(MO1, OpInfo0))
2761 return false;
2762 }
2763
2764 if ((int)OpIdx1 != Src0Idx && MO0.isReg()) {
2765 if (OpInfo1.RegClass == -1)
2766 return OpInfo1.OperandType == MCOI::OPERAND_UNKNOWN;
2767 return isLegalRegOperand(MI, OpIdx1, MO0) &&
2768 (!MO1.isReg() || isLegalRegOperand(MI, OpIdx0, MO1));
2769 }
2770 if ((int)OpIdx0 != Src0Idx && MO1.isReg()) {
2771 if (OpInfo0.RegClass == -1)
2772 return OpInfo0.OperandType == MCOI::OPERAND_UNKNOWN;
2773 return (!MO0.isReg() || isLegalRegOperand(MI, OpIdx1, MO0)) &&
2774 isLegalRegOperand(MI, OpIdx0, MO1);
2775 }
2776
2777 // No need to check 64-bit literals, since swapping does not bring new
2778 // 64-bit literals into the current instruction to fold to 32-bit.
2779
2780 return isImmOperandLegal(MI, OpIdx1, MO0);
2781}
2782
2783MachineInstr *SIInstrInfo::commuteInstructionImpl(MachineInstr &MI, bool NewMI,
2784 unsigned Src0Idx,
2785 unsigned Src1Idx) const {
2786 assert(!NewMI && "this should never be used");
2787
2788 unsigned Opc = MI.getOpcode();
2789 int CommutedOpcode = commuteOpcode(Opc);
2790 if (CommutedOpcode == -1)
2791 return nullptr;
2792
2793 if (Src0Idx > Src1Idx)
2794 std::swap(Src0Idx, Src1Idx);
2795
2796 assert(AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::src0) ==
2797 static_cast<int>(Src0Idx) &&
2798 AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::src1) ==
2799 static_cast<int>(Src1Idx) &&
2800 "inconsistency with findCommutedOpIndices");
2801
2802 if (!isLegalToSwap(MI, Src0Idx, Src1Idx))
2803 return nullptr;
2804
2805 MachineInstr *CommutedMI = nullptr;
2806 MachineOperand &Src0 = MI.getOperand(Src0Idx);
2807 MachineOperand &Src1 = MI.getOperand(Src1Idx);
2808 if (Src0.isReg() && Src1.isReg()) {
2809 // Be sure to copy the source modifiers to the right place.
2810 CommutedMI =
2811 TargetInstrInfo::commuteInstructionImpl(MI, NewMI, Src0Idx, Src1Idx);
2812 } else if (Src0.isReg() && !Src1.isReg()) {
2813 CommutedMI = swapRegAndNonRegOperand(MI, Src0, Src1);
2814 } else if (!Src0.isReg() && Src1.isReg()) {
2815 CommutedMI = swapRegAndNonRegOperand(MI, Src1, Src0);
2816 } else if (Src0.isImm() && Src1.isImm()) {
2817 CommutedMI = swapImmOperands(MI, Src0, Src1);
2818 } else {
2819 // FIXME: Found two non registers to commute. This does happen.
2820 return nullptr;
2821 }
2822
2823 if (CommutedMI) {
2824 swapSourceModifiers(MI, Src0, AMDGPU::OpName::src0_modifiers,
2825 Src1, AMDGPU::OpName::src1_modifiers);
2826
2827 swapSourceModifiers(MI, Src0, AMDGPU::OpName::src0_sel, Src1,
2828 AMDGPU::OpName::src1_sel);
2829
2830 CommutedMI->setDesc(get(CommutedOpcode));
2831 }
2832
2833 return CommutedMI;
2834}
2835
2836// This needs to be implemented because the source modifiers may be inserted
2837// between the true commutable operands, and the base
2838// TargetInstrInfo::commuteInstruction uses it.
2839bool SIInstrInfo::findCommutedOpIndices(const MachineInstr &MI,
2840 unsigned &SrcOpIdx0,
2841 unsigned &SrcOpIdx1) const {
2842 return findCommutedOpIndices(MI.getDesc(), SrcOpIdx0, SrcOpIdx1);
2843}
2844
2845bool SIInstrInfo::findCommutedOpIndices(const MCInstrDesc &Desc,
2846 unsigned &SrcOpIdx0,
2847 unsigned &SrcOpIdx1) const {
2848 if (!Desc.isCommutable())
2849 return false;
2850
2851 unsigned Opc = Desc.getOpcode();
2852 int Src0Idx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::src0);
2853 if (Src0Idx == -1)
2854 return false;
2855
2856 int Src1Idx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::src1);
2857 if (Src1Idx == -1)
2858 return false;
2859
2860 return fixCommutedOpIndices(SrcOpIdx0, SrcOpIdx1, Src0Idx, Src1Idx);
2861}
2862
2863bool SIInstrInfo::isBranchOffsetInRange(unsigned BranchOp,
2864 int64_t BrOffset) const {
2865 // BranchRelaxation should never have to check s_setpc_b64 or s_add_pc_i64
2866 // because its dest block is unanalyzable.
2867 assert(isSOPP(BranchOp) || isSOPK(BranchOp));
2868
2869 // Convert to dwords.
2870 BrOffset /= 4;
2871
2872 // The branch instructions do PC += signext(SIMM16 * 4) + 4, so the offset is
2873 // from the next instruction.
2874 BrOffset -= 1;
2875
2876 return isIntN(BranchOffsetBits, BrOffset);
2877}
2878
2881 return MI.getOperand(0).getMBB();
2882}
2883
2885 for (const MachineInstr &MI : MBB->terminators()) {
2886 if (MI.getOpcode() == AMDGPU::SI_IF || MI.getOpcode() == AMDGPU::SI_ELSE ||
2887 MI.getOpcode() == AMDGPU::SI_LOOP)
2888 return true;
2889 }
2890 return false;
2891}
2892
2893void SIInstrInfo::insertIndirectBranch(MachineBasicBlock &MBB,
2894 MachineBasicBlock &DestBB,
2895 MachineBasicBlock &RestoreBB,
2896 const DebugLoc &DL, int64_t BrOffset,
2897 RegScavenger *RS) const {
2898 assert(MBB.empty() &&
2899 "new block should be inserted for expanding unconditional branch");
2900 assert(MBB.pred_size() == 1);
2901 assert(RestoreBB.empty() &&
2902 "restore block should be inserted for restoring clobbered registers");
2903
2904 MachineFunction *MF = MBB.getParent();
2905 MachineRegisterInfo &MRI = MF->getRegInfo();
2906 const SIMachineFunctionInfo *MFI = MF->getInfo<SIMachineFunctionInfo>();
2907 auto I = MBB.end();
2908 auto &MCCtx = MF->getContext();
2909
2910 if (ST.hasAddPC64Inst()) {
2911 MCSymbol *Offset =
2912 MCCtx.createTempSymbol("offset", /*AlwaysAddSuffix=*/true);
2913 auto AddPC = BuildMI(MBB, I, DL, get(AMDGPU::S_ADD_PC_I64))
2915 MCSymbol *PostAddPCLabel =
2916 MCCtx.createTempSymbol("post_addpc", /*AlwaysAddSuffix=*/true);
2917 AddPC->setPostInstrSymbol(*MF, PostAddPCLabel);
2918 auto *OffsetExpr = MCBinaryExpr::createSub(
2919 MCSymbolRefExpr::create(DestBB.getSymbol(), MCCtx),
2920 MCSymbolRefExpr::create(PostAddPCLabel, MCCtx), MCCtx);
2921 Offset->setVariableValue(OffsetExpr);
2922 return;
2923 }
2924
2925 assert(RS && "RegScavenger required for long branching");
2926
2927 // FIXME: Virtual register workaround for RegScavenger not working with empty
2928 // blocks.
2929 Register PCReg = MRI.createVirtualRegister(&AMDGPU::SReg_64RegClass);
2930
2931 // Note: as this is used after the hazard recognizer, we need to apply some
2932 // hazard workarounds directly.
2933 const bool FlushSGPRWrites = (ST.isWave64() && ST.hasVALUMaskWriteHazard()) ||
2934 ST.hasVALUReadSGPRHazard();
2935 auto ApplyHazardWorkarounds = [this, &MBB, &I, &DL, FlushSGPRWrites]() {
2936 if (FlushSGPRWrites)
2937 BuildMI(MBB, I, DL, get(AMDGPU::S_WAITCNT_DEPCTR))
2939 };
2940
2941 // We need to compute the offset relative to the instruction immediately after
2942 // s_getpc_b64. Insert pc arithmetic code before last terminator.
2943 MachineInstr *GetPC = BuildMI(MBB, I, DL, get(AMDGPU::S_GETPC_B64), PCReg);
2944 ApplyHazardWorkarounds();
2945
2946 MCSymbol *PostGetPCLabel =
2947 MCCtx.createTempSymbol("post_getpc", /*AlwaysAddSuffix=*/true);
2948 GetPC->setPostInstrSymbol(*MF, PostGetPCLabel);
2949
2950 MCSymbol *OffsetLo =
2951 MCCtx.createTempSymbol("offset_lo", /*AlwaysAddSuffix=*/true);
2952 MCSymbol *OffsetHi =
2953 MCCtx.createTempSymbol("offset_hi", /*AlwaysAddSuffix=*/true);
2954 BuildMI(MBB, I, DL, get(AMDGPU::S_ADD_U32))
2955 .addReg(PCReg, RegState::Define, AMDGPU::sub0)
2956 .addReg(PCReg, 0, AMDGPU::sub0)
2957 .addSym(OffsetLo, MO_FAR_BRANCH_OFFSET);
2958 BuildMI(MBB, I, DL, get(AMDGPU::S_ADDC_U32))
2959 .addReg(PCReg, RegState::Define, AMDGPU::sub1)
2960 .addReg(PCReg, 0, AMDGPU::sub1)
2961 .addSym(OffsetHi, MO_FAR_BRANCH_OFFSET);
2962 ApplyHazardWorkarounds();
2963
2964 // Insert the indirect branch after the other terminator.
2965 BuildMI(&MBB, DL, get(AMDGPU::S_SETPC_B64))
2966 .addReg(PCReg);
2967
2968 // If a spill is needed for the pc register pair, we need to insert a spill
2969 // restore block right before the destination block, and insert a short branch
2970 // into the old destination block's fallthrough predecessor.
2971 // e.g.:
2972 //
2973 // s_cbranch_scc0 skip_long_branch:
2974 //
2975 // long_branch_bb:
2976 // spill s[8:9]
2977 // s_getpc_b64 s[8:9]
2978 // s_add_u32 s8, s8, restore_bb
2979 // s_addc_u32 s9, s9, 0
2980 // s_setpc_b64 s[8:9]
2981 //
2982 // skip_long_branch:
2983 // foo;
2984 //
2985 // .....
2986 //
2987 // dest_bb_fallthrough_predecessor:
2988 // bar;
2989 // s_branch dest_bb
2990 //
2991 // restore_bb:
2992 // restore s[8:9]
2993 // fallthrough dest_bb
2994 //
2995 // dest_bb:
2996 // buzz;
2997
2998 Register LongBranchReservedReg = MFI->getLongBranchReservedReg();
2999 Register Scav;
3000
3001 // If we've previously reserved a register for long branches, avoid running
3002 // the scavenger and just use those registers.
3003 if (LongBranchReservedReg) {
3004 RS->enterBasicBlock(MBB);
3005 Scav = LongBranchReservedReg;
3006 } else {
3007 RS->enterBasicBlockEnd(MBB);
3008 Scav = RS->scavengeRegisterBackwards(
3009 AMDGPU::SReg_64RegClass, MachineBasicBlock::iterator(GetPC),
3010 /* RestoreAfter */ false, 0, /* AllowSpill */ false);
3011 }
3012 if (Scav) {
3013 RS->setRegUsed(Scav);
3014 MRI.replaceRegWith(PCReg, Scav);
3015 MRI.clearVirtRegs();
3016 } else {
3017 // As SGPR needs VGPR to be spilled, we reuse the slot of temporary VGPR for
3018 // SGPR spill.
3019 const GCNSubtarget &ST = MF->getSubtarget<GCNSubtarget>();
3020 const SIRegisterInfo *TRI = ST.getRegisterInfo();
3021 TRI->spillEmergencySGPR(GetPC, RestoreBB, AMDGPU::SGPR0_SGPR1, RS);
3022 MRI.replaceRegWith(PCReg, AMDGPU::SGPR0_SGPR1);
3023 MRI.clearVirtRegs();
3024 }
3025
3026 MCSymbol *DestLabel = Scav ? DestBB.getSymbol() : RestoreBB.getSymbol();
3027 // Now, the distance can be defined.
3028 auto *Offset = MCBinaryExpr::createSub(
3029 MCSymbolRefExpr::create(DestLabel, MCCtx),
3030 MCSymbolRefExpr::create(PostGetPCLabel, MCCtx), MCCtx);
3031 // Add offset assignments.
3032 auto *Mask = MCConstantExpr::create(0xFFFFFFFFULL, MCCtx);
3033 OffsetLo->setVariableValue(MCBinaryExpr::createAnd(Offset, Mask, MCCtx));
3034 auto *ShAmt = MCConstantExpr::create(32, MCCtx);
3035 OffsetHi->setVariableValue(MCBinaryExpr::createAShr(Offset, ShAmt, MCCtx));
3036}
3037
3038unsigned SIInstrInfo::getBranchOpcode(SIInstrInfo::BranchPredicate Cond) {
3039 switch (Cond) {
3040 case SIInstrInfo::SCC_TRUE:
3041 return AMDGPU::S_CBRANCH_SCC1;
3042 case SIInstrInfo::SCC_FALSE:
3043 return AMDGPU::S_CBRANCH_SCC0;
3044 case SIInstrInfo::VCCNZ:
3045 return AMDGPU::S_CBRANCH_VCCNZ;
3046 case SIInstrInfo::VCCZ:
3047 return AMDGPU::S_CBRANCH_VCCZ;
3048 case SIInstrInfo::EXECNZ:
3049 return AMDGPU::S_CBRANCH_EXECNZ;
3050 case SIInstrInfo::EXECZ:
3051 return AMDGPU::S_CBRANCH_EXECZ;
3052 default:
3053 llvm_unreachable("invalid branch predicate");
3054 }
3055}
3056
3057SIInstrInfo::BranchPredicate SIInstrInfo::getBranchPredicate(unsigned Opcode) {
3058 switch (Opcode) {
3059 case AMDGPU::S_CBRANCH_SCC0:
3060 return SCC_FALSE;
3061 case AMDGPU::S_CBRANCH_SCC1:
3062 return SCC_TRUE;
3063 case AMDGPU::S_CBRANCH_VCCNZ:
3064 return VCCNZ;
3065 case AMDGPU::S_CBRANCH_VCCZ:
3066 return VCCZ;
3067 case AMDGPU::S_CBRANCH_EXECNZ:
3068 return EXECNZ;
3069 case AMDGPU::S_CBRANCH_EXECZ:
3070 return EXECZ;
3071 default:
3072 return INVALID_BR;
3073 }
3074}
3075
3076bool SIInstrInfo::analyzeBranchImpl(MachineBasicBlock &MBB,
3077 MachineBasicBlock::iterator I,
3078 MachineBasicBlock *&TBB,
3079 MachineBasicBlock *&FBB,
3080 SmallVectorImpl<MachineOperand> &Cond,
3081 bool AllowModify) const {
3082 if (I->getOpcode() == AMDGPU::S_BRANCH) {
3083 // Unconditional Branch
3084 TBB = I->getOperand(0).getMBB();
3085 return false;
3086 }
3087
3088 BranchPredicate Pred = getBranchPredicate(I->getOpcode());
3089 if (Pred == INVALID_BR)
3090 return true;
3091
3092 MachineBasicBlock *CondBB = I->getOperand(0).getMBB();
3093 Cond.push_back(MachineOperand::CreateImm(Pred));
3094 Cond.push_back(I->getOperand(1)); // Save the branch register.
3095
3096 ++I;
3097
3098 if (I == MBB.end()) {
3099 // Conditional branch followed by fall-through.
3100 TBB = CondBB;
3101 return false;
3102 }
3103
3104 if (I->getOpcode() == AMDGPU::S_BRANCH) {
3105 TBB = CondBB;
3106 FBB = I->getOperand(0).getMBB();
3107 return false;
3108 }
3109
3110 return true;
3111}
3112
3113bool SIInstrInfo::analyzeBranch(MachineBasicBlock &MBB, MachineBasicBlock *&TBB,
3114 MachineBasicBlock *&FBB,
3115 SmallVectorImpl<MachineOperand> &Cond,
3116 bool AllowModify) const {
3117 MachineBasicBlock::iterator I = MBB.getFirstTerminator();
3118 auto E = MBB.end();
3119 if (I == E)
3120 return false;
3121
3122 // Skip over the instructions that are artificially terminators for special
3123 // exec management.
3124 while (I != E && !I->isBranch() && !I->isReturn()) {
3125 switch (I->getOpcode()) {
3126 case AMDGPU::S_MOV_B64_term:
3127 case AMDGPU::S_XOR_B64_term:
3128 case AMDGPU::S_OR_B64_term:
3129 case AMDGPU::S_ANDN2_B64_term:
3130 case AMDGPU::S_AND_B64_term:
3131 case AMDGPU::S_AND_SAVEEXEC_B64_term:
3132 case AMDGPU::S_MOV_B32_term:
3133 case AMDGPU::S_XOR_B32_term:
3134 case AMDGPU::S_OR_B32_term:
3135 case AMDGPU::S_ANDN2_B32_term:
3136 case AMDGPU::S_AND_B32_term:
3137 case AMDGPU::S_AND_SAVEEXEC_B32_term:
3138 break;
3139 case AMDGPU::SI_IF:
3140 case AMDGPU::SI_ELSE:
3141 case AMDGPU::SI_KILL_I1_TERMINATOR:
3142 case AMDGPU::SI_KILL_F32_COND_IMM_TERMINATOR:
3143 // FIXME: It's messy that these need to be considered here at all.
3144 return true;
3145 default:
3146 llvm_unreachable("unexpected non-branch terminator inst");
3147 }
3148
3149 ++I;
3150 }
3151
3152 if (I == E)
3153 return false;
3154
3155 return analyzeBranchImpl(MBB, I, TBB, FBB, Cond, AllowModify);
3156}
3157
3158unsigned SIInstrInfo::removeBranch(MachineBasicBlock &MBB,
3159 int *BytesRemoved) const {
3160 unsigned Count = 0;
3161 unsigned RemovedSize = 0;
3162 for (MachineInstr &MI : llvm::make_early_inc_range(MBB.terminators())) {
3163 // Skip over artificial terminators when removing instructions.
3164 if (MI.isBranch() || MI.isReturn()) {
3165 RemovedSize += getInstSizeInBytes(MI);
3166 MI.eraseFromParent();
3167 ++Count;
3168 }
3169 }
3170
3171 if (BytesRemoved)
3172 *BytesRemoved = RemovedSize;
3173
3174 return Count;
3175}
3176
3177// Copy the flags onto the implicit condition register operand.
3178static void preserveCondRegFlags(MachineOperand &CondReg,
3179 const MachineOperand &OrigCond) {
3180 CondReg.setIsUndef(OrigCond.isUndef());
3181 CondReg.setIsKill(OrigCond.isKill());
3182}
3183
3184unsigned SIInstrInfo::insertBranch(MachineBasicBlock &MBB,
3185 MachineBasicBlock *TBB,
3186 MachineBasicBlock *FBB,
3187 ArrayRef<MachineOperand> Cond,
3188 const DebugLoc &DL,
3189 int *BytesAdded) const {
3190 if (!FBB && Cond.empty()) {
3191 BuildMI(&MBB, DL, get(AMDGPU::S_BRANCH))
3192 .addMBB(TBB);
3193 if (BytesAdded)
3194 *BytesAdded = ST.hasOffset3fBug() ? 8 : 4;
3195 return 1;
3196 }
3197
3198 assert(TBB && Cond[0].isImm());
3199
3200 unsigned Opcode
3201 = getBranchOpcode(static_cast<BranchPredicate>(Cond[0].getImm()));
3202
3203 if (!FBB) {
3204 MachineInstr *CondBr =
3205 BuildMI(&MBB, DL, get(Opcode))
3206 .addMBB(TBB);
3207
3208 // Copy the flags onto the implicit condition register operand.
3209 preserveCondRegFlags(CondBr->getOperand(1), Cond[1]);
3210 fixImplicitOperands(*CondBr);
3211
3212 if (BytesAdded)
3213 *BytesAdded = ST.hasOffset3fBug() ? 8 : 4;
3214 return 1;
3215 }
3216
3217 assert(TBB && FBB);
3218
3219 MachineInstr *CondBr =
3220 BuildMI(&MBB, DL, get(Opcode))
3221 .addMBB(TBB);
3222 fixImplicitOperands(*CondBr);
3223 BuildMI(&MBB, DL, get(AMDGPU::S_BRANCH))
3224 .addMBB(FBB);
3225
3226 MachineOperand &CondReg = CondBr->getOperand(1);
3227 CondReg.setIsUndef(Cond[1].isUndef());
3228 CondReg.setIsKill(Cond[1].isKill());
3229
3230 if (BytesAdded)
3231 *BytesAdded = ST.hasOffset3fBug() ? 16 : 8;
3232
3233 return 2;
3234}
3235
3236bool SIInstrInfo::reverseBranchCondition(
3237 SmallVectorImpl<MachineOperand> &Cond) const {
3238 if (Cond.size() != 2) {
3239 return true;
3240 }
3241
3242 if (Cond[0].isImm()) {
3243 Cond[0].setImm(-Cond[0].getImm());
3244 return false;
3245 }
3246
3247 return true;
3248}
3249
3250bool SIInstrInfo::canInsertSelect(const MachineBasicBlock &MBB,
3251 ArrayRef<MachineOperand> Cond,
3252 Register DstReg, Register TrueReg,
3253 Register FalseReg, int &CondCycles,
3254 int &TrueCycles, int &FalseCycles) const {
3255 switch (Cond[0].getImm()) {
3256 case VCCNZ:
3257 case VCCZ: {
3258 const MachineRegisterInfo &MRI = MBB.getParent()->getRegInfo();
3259 const TargetRegisterClass *RC = MRI.getRegClass(TrueReg);
3260 if (MRI.getRegClass(FalseReg) != RC)
3261 return false;
3262
3263 int NumInsts = AMDGPU::getRegBitWidth(*RC) / 32;
3264 CondCycles = TrueCycles = FalseCycles = NumInsts; // ???
3265
3266 // Limit to equal cost for branch vs. N v_cndmask_b32s.
3267 return RI.hasVGPRs(RC) && NumInsts <= 6;
3268 }
3269 case SCC_TRUE:
3270 case SCC_FALSE: {
3271 // FIXME: We could insert for VGPRs if we could replace the original compare
3272 // with a vector one.
3273 const MachineRegisterInfo &MRI = MBB.getParent()->getRegInfo();
3274 const TargetRegisterClass *RC = MRI.getRegClass(TrueReg);
3275 if (MRI.getRegClass(FalseReg) != RC)
3276 return false;
3277
3278 int NumInsts = AMDGPU::getRegBitWidth(*RC) / 32;
3279
3280 // Multiples of 8 can do s_cselect_b64
3281 if (NumInsts % 2 == 0)
3282 NumInsts /= 2;
3283
3284 CondCycles = TrueCycles = FalseCycles = NumInsts; // ???
3285 return RI.isSGPRClass(RC);
3286 }
3287 default:
3288 return false;
3289 }
3290}
3291
3292void SIInstrInfo::insertSelect(MachineBasicBlock &MBB,
3293 MachineBasicBlock::iterator I, const DebugLoc &DL,
3294 Register DstReg, ArrayRef<MachineOperand> Cond,
3295 Register TrueReg, Register FalseReg) const {
3296 BranchPredicate Pred = static_cast<BranchPredicate>(Cond[0].getImm());
3297 if (Pred == VCCZ || Pred == SCC_FALSE) {
3298 Pred = static_cast<BranchPredicate>(-Pred);
3299 std::swap(TrueReg, FalseReg);
3300 }
3301
3302 MachineRegisterInfo &MRI = MBB.getParent()->getRegInfo();
3303 const TargetRegisterClass *DstRC = MRI.getRegClass(DstReg);
3304 unsigned DstSize = RI.getRegSizeInBits(*DstRC);
3305
3306 if (DstSize == 32) {
3307 MachineInstr *Select;
3308 if (Pred == SCC_TRUE) {
3309 Select = BuildMI(MBB, I, DL, get(AMDGPU::S_CSELECT_B32), DstReg)
3310 .addReg(TrueReg)
3311 .addReg(FalseReg);
3312 } else {
3313 // Instruction's operands are backwards from what is expected.
3314 Select = BuildMI(MBB, I, DL, get(AMDGPU::V_CNDMASK_B32_e32), DstReg)
3315 .addReg(FalseReg)
3316 .addReg(TrueReg);
3317 }
3318
3319 preserveCondRegFlags(Select->getOperand(3), Cond[1]);
3320 return;
3321 }
3322
3323 if (DstSize == 64 && Pred == SCC_TRUE) {
3324 MachineInstr *Select =
3325 BuildMI(MBB, I, DL, get(AMDGPU::S_CSELECT_B64), DstReg)
3326 .addReg(TrueReg)
3327 .addReg(FalseReg);
3328
3329 preserveCondRegFlags(Select->getOperand(3), Cond[1]);
3330 return;
3331 }
3332
3333 static const int16_t Sub0_15[] = {
3334 AMDGPU::sub0, AMDGPU::sub1, AMDGPU::sub2, AMDGPU::sub3,
3335 AMDGPU::sub4, AMDGPU::sub5, AMDGPU::sub6, AMDGPU::sub7,
3336 AMDGPU::sub8, AMDGPU::sub9, AMDGPU::sub10, AMDGPU::sub11,
3337 AMDGPU::sub12, AMDGPU::sub13, AMDGPU::sub14, AMDGPU::sub15,
3338 };
3339
3340 static const int16_t Sub0_15_64[] = {
3341 AMDGPU::sub0_sub1, AMDGPU::sub2_sub3,
3342 AMDGPU::sub4_sub5, AMDGPU::sub6_sub7,
3343 AMDGPU::sub8_sub9, AMDGPU::sub10_sub11,
3344 AMDGPU::sub12_sub13, AMDGPU::sub14_sub15,
3345 };
3346
3347 unsigned SelOp = AMDGPU::V_CNDMASK_B32_e32;
3348 const TargetRegisterClass *EltRC = &AMDGPU::VGPR_32RegClass;
3349 const int16_t *SubIndices = Sub0_15;
3350 int NElts = DstSize / 32;
3351
3352 // 64-bit select is only available for SALU.
3353 // TODO: Split 96-bit into 64-bit and 32-bit, not 3x 32-bit.
3354 if (Pred == SCC_TRUE) {
3355 if (NElts % 2) {
3356 SelOp = AMDGPU::S_CSELECT_B32;
3357 EltRC = &AMDGPU::SGPR_32RegClass;
3358 } else {
3359 SelOp = AMDGPU::S_CSELECT_B64;
3360 EltRC = &AMDGPU::SGPR_64RegClass;
3361 SubIndices = Sub0_15_64;
3362 NElts /= 2;
3363 }
3364 }
3365
3366 MachineInstrBuilder MIB = BuildMI(
3367 MBB, I, DL, get(AMDGPU::REG_SEQUENCE), DstReg);
3368
3369 I = MIB->getIterator();
3370
3371 SmallVector<Register, 8> Regs;
3372 for (int Idx = 0; Idx != NElts; ++Idx) {
3373 Register DstElt = MRI.createVirtualRegister(EltRC);
3374 Regs.push_back(DstElt);
3375
3376 unsigned SubIdx = SubIndices[Idx];
3377
3378 MachineInstr *Select;
3379 if (SelOp == AMDGPU::V_CNDMASK_B32_e32) {
3380 Select =
3381 BuildMI(MBB, I, DL, get(SelOp), DstElt)
3382 .addReg(FalseReg, 0, SubIdx)
3383 .addReg(TrueReg, 0, SubIdx);
3384 } else {
3385 Select =
3386 BuildMI(MBB, I, DL, get(SelOp), DstElt)
3387 .addReg(TrueReg, 0, SubIdx)
3388 .addReg(FalseReg, 0, SubIdx);
3389 }
3390
3391 preserveCondRegFlags(Select->getOperand(3), Cond[1]);
3393
3394 MIB.addReg(DstElt)
3395 .addImm(SubIdx);
3396 }
3397}
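// Example (illustrative): a 64-bit VGPR select under VCCNZ is lowered here to
// two 32-bit conditional moves plus a reassembling REG_SEQUENCE, roughly
//   %lo = V_CNDMASK_B32_e32 %false.sub0, %true.sub0, implicit $vcc
//   %hi = V_CNDMASK_B32_e32 %false.sub1, %true.sub1, implicit $vcc
//   %dst = REG_SEQUENCE %lo, %subreg.sub0, %hi, %subreg.sub1
// while an SCC_TRUE select of SGPRs uses S_CSELECT_B32/B64 per element pair.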
3398
3400 switch (MI.getOpcode()) {
3401 case AMDGPU::V_MOV_B16_t16_e32:
3402 case AMDGPU::V_MOV_B16_t16_e64:
3403 case AMDGPU::V_MOV_B32_e32:
3404 case AMDGPU::V_MOV_B32_e64:
3405 case AMDGPU::V_MOV_B64_PSEUDO:
3406 case AMDGPU::V_MOV_B64_e32:
3407 case AMDGPU::V_MOV_B64_e64:
3408 case AMDGPU::S_MOV_B32:
3409 case AMDGPU::S_MOV_B64:
3410 case AMDGPU::S_MOV_B64_IMM_PSEUDO:
3411 case AMDGPU::COPY:
3412 case AMDGPU::WWM_COPY:
3413 case AMDGPU::V_ACCVGPR_WRITE_B32_e64:
3414 case AMDGPU::V_ACCVGPR_READ_B32_e64:
3415 case AMDGPU::V_ACCVGPR_MOV_B32:
3416 case AMDGPU::AV_MOV_B32_IMM_PSEUDO:
3417 case AMDGPU::AV_MOV_B64_IMM_PSEUDO:
3418 return true;
3419 default:
3420 return false;
3421 }
3422}
3423
3425 switch (MI.getOpcode()) {
3426 case AMDGPU::V_MOV_B16_t16_e32:
3427 case AMDGPU::V_MOV_B16_t16_e64:
3428 return 2;
3429 case AMDGPU::V_MOV_B32_e32:
3430 case AMDGPU::V_MOV_B32_e64:
3431 case AMDGPU::V_MOV_B64_PSEUDO:
3432 case AMDGPU::V_MOV_B64_e32:
3433 case AMDGPU::V_MOV_B64_e64:
3434 case AMDGPU::S_MOV_B32:
3435 case AMDGPU::S_MOV_B64:
3436 case AMDGPU::S_MOV_B64_IMM_PSEUDO:
3437 case AMDGPU::COPY:
3438 case AMDGPU::WWM_COPY:
3439 case AMDGPU::V_ACCVGPR_WRITE_B32_e64:
3440 case AMDGPU::V_ACCVGPR_READ_B32_e64:
3441 case AMDGPU::V_ACCVGPR_MOV_B32:
3442 case AMDGPU::AV_MOV_B32_IMM_PSEUDO:
3443 case AMDGPU::AV_MOV_B64_IMM_PSEUDO:
3444 return 1;
3445 default:
3446 llvm_unreachable("MI is not a foldable copy");
3447 }
3448}
3449
3450static constexpr AMDGPU::OpName ModifierOpNames[] = {
3451 AMDGPU::OpName::src0_modifiers, AMDGPU::OpName::src1_modifiers,
3452 AMDGPU::OpName::src2_modifiers, AMDGPU::OpName::clamp,
3453 AMDGPU::OpName::omod, AMDGPU::OpName::op_sel};
3454
3456 unsigned Opc = MI.getOpcode();
3457 for (AMDGPU::OpName Name : reverse(ModifierOpNames)) {
3458 int Idx = AMDGPU::getNamedOperandIdx(Opc, Name);
3459 if (Idx >= 0)
3460 MI.removeOperand(Idx);
3461 }
3462}
3463
3464std::optional<int64_t> SIInstrInfo::extractSubregFromImm(int64_t Imm,
3465 unsigned SubRegIndex) {
3466 switch (SubRegIndex) {
3467 case AMDGPU::NoSubRegister:
3468 return Imm;
3469 case AMDGPU::sub0:
3470 return SignExtend64<32>(Imm);
3471 case AMDGPU::sub1:
3472 return SignExtend64<32>(Imm >> 32);
3473 case AMDGPU::lo16:
3474 return SignExtend64<16>(Imm);
3475 case AMDGPU::hi16:
3476 return SignExtend64<16>(Imm >> 16);
3477 case AMDGPU::sub1_lo16:
3478 return SignExtend64<16>(Imm >> 32);
3479 case AMDGPU::sub1_hi16:
3480 return SignExtend64<16>(Imm >> 48);
3481 default:
3482 return std::nullopt;
3483 }
3484
3485 llvm_unreachable("covered subregister switch");
3486}
3487
3488static unsigned getNewFMAAKInst(const GCNSubtarget &ST, unsigned Opc) {
3489 switch (Opc) {
3490 case AMDGPU::V_MAC_F16_e32:
3491 case AMDGPU::V_MAC_F16_e64:
3492 case AMDGPU::V_MAD_F16_e64:
3493 return AMDGPU::V_MADAK_F16;
3494 case AMDGPU::V_MAC_F32_e32:
3495 case AMDGPU::V_MAC_F32_e64:
3496 case AMDGPU::V_MAD_F32_e64:
3497 return AMDGPU::V_MADAK_F32;
3498 case AMDGPU::V_FMAC_F32_e32:
3499 case AMDGPU::V_FMAC_F32_e64:
3500 case AMDGPU::V_FMA_F32_e64:
3501 return AMDGPU::V_FMAAK_F32;
3502 case AMDGPU::V_FMAC_F16_e32:
3503 case AMDGPU::V_FMAC_F16_e64:
3504 case AMDGPU::V_FMAC_F16_t16_e64:
3505 case AMDGPU::V_FMAC_F16_fake16_e64:
3506 case AMDGPU::V_FMA_F16_e64:
3507 return ST.hasTrue16BitInsts() ? ST.useRealTrue16Insts()
3508 ? AMDGPU::V_FMAAK_F16_t16
3509 : AMDGPU::V_FMAAK_F16_fake16
3510 : AMDGPU::V_FMAAK_F16;
3511 case AMDGPU::V_FMAC_F64_e32:
3512 case AMDGPU::V_FMAC_F64_e64:
3513 case AMDGPU::V_FMA_F64_e64:
3514 return AMDGPU::V_FMAAK_F64;
3515 default:
3516 llvm_unreachable("invalid instruction");
3517 }
3518}
3519
3520static unsigned getNewFMAMKInst(const GCNSubtarget &ST, unsigned Opc) {
3521 switch (Opc) {
3522 case AMDGPU::V_MAC_F16_e32:
3523 case AMDGPU::V_MAC_F16_e64:
3524 case AMDGPU::V_MAD_F16_e64:
3525 return AMDGPU::V_MADMK_F16;
3526 case AMDGPU::V_MAC_F32_e32:
3527 case AMDGPU::V_MAC_F32_e64:
3528 case AMDGPU::V_MAD_F32_e64:
3529 return AMDGPU::V_MADMK_F32;
3530 case AMDGPU::V_FMAC_F32_e32:
3531 case AMDGPU::V_FMAC_F32_e64:
3532 case AMDGPU::V_FMA_F32_e64:
3533 return AMDGPU::V_FMAMK_F32;
3534 case AMDGPU::V_FMAC_F16_e32:
3535 case AMDGPU::V_FMAC_F16_e64:
3536 case AMDGPU::V_FMAC_F16_t16_e64:
3537 case AMDGPU::V_FMAC_F16_fake16_e64:
3538 case AMDGPU::V_FMA_F16_e64:
3539 return ST.hasTrue16BitInsts() ? ST.useRealTrue16Insts()
3540 ? AMDGPU::V_FMAMK_F16_t16
3541 : AMDGPU::V_FMAMK_F16_fake16
3542 : AMDGPU::V_FMAMK_F16;
3543 case AMDGPU::V_FMAC_F64_e32:
3544 case AMDGPU::V_FMAC_F64_e64:
3545 case AMDGPU::V_FMA_F64_e64:
3546 return AMDGPU::V_FMAMK_F64;
3547 default:
3548 llvm_unreachable("invalid instruction");
3549 }
3550}
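// Note: the two helpers above differ in where the literal lands. The *AK
// ("addend constant") forms compute d = src0 * src1 + K, while the *MK
// ("multiplier constant") forms compute d = src0 * K + src1.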
3551
3552bool SIInstrInfo::foldImmediate(MachineInstr &UseMI, MachineInstr &DefMI,
3553 Register Reg, MachineRegisterInfo *MRI) const {
3554 int64_t Imm;
3555 if (!getConstValDefinedInReg(DefMI, Reg, Imm))
3556 return false;
3557
3558 const bool HasMultipleUses = !MRI->hasOneNonDBGUse(Reg);
3559
3560 assert(!DefMI.getOperand(0).getSubReg() && "Expected SSA form");
3561
3562 unsigned Opc = UseMI.getOpcode();
3563 if (Opc == AMDGPU::COPY) {
3564 assert(!UseMI.getOperand(0).getSubReg() && "Expected SSA form");
3565
3566 Register DstReg = UseMI.getOperand(0).getReg();
3567 Register UseSubReg = UseMI.getOperand(1).getSubReg();
3568
3569 const TargetRegisterClass *DstRC = RI.getRegClassForReg(*MRI, DstReg);
3570
3571 if (HasMultipleUses) {
3572 // TODO: This should fold in more cases with multiple use, but we need to
3573 // more carefully consider what those uses are.
3574 unsigned ImmDefSize = RI.getRegSizeInBits(*MRI->getRegClass(Reg));
3575
3576 // Avoid breaking up a 64-bit inline immediate into a subregister extract.
3577 if (UseSubReg != AMDGPU::NoSubRegister && ImmDefSize == 64)
3578 return false;
3579
3580 // Most of the time folding a 32-bit inline constant is free (though this
3581 // might not be true if we can't later fold it into a real user).
3582 //
3583 // FIXME: This isInlineConstant check is imprecise if
3584 // getConstValDefinedInReg handled the tricky non-mov cases.
3585 if (ImmDefSize == 32 &&
3586 isInlineConstant(Imm, AMDGPU::OPERAND_REG_IMM_INT32))
3587 return false;
3588 }
3589
3590 bool Is16Bit = UseSubReg != AMDGPU::NoSubRegister &&
3591 RI.getSubRegIdxSize(UseSubReg) == 16;
3592
3593 if (Is16Bit) {
3594 if (RI.hasVGPRs(DstRC))
3595 return false; // Do not clobber vgpr_hi16
3596
3597 if (DstReg.isVirtual() && UseSubReg != AMDGPU::lo16)
3598 return false;
3599 }
3600
3601 MachineFunction *MF = UseMI.getMF();
3602
3603 unsigned NewOpc = AMDGPU::INSTRUCTION_LIST_END;
3604 MCRegister MovDstPhysReg =
3605 DstReg.isPhysical() ? DstReg.asMCReg() : MCRegister();
3606
3607 std::optional<int64_t> SubRegImm = extractSubregFromImm(Imm, UseSubReg);
3608
3609 // TODO: Try to fold with AMDGPU::V_MOV_B16_t16_e64
3610 for (unsigned MovOp :
3611 {AMDGPU::S_MOV_B32, AMDGPU::V_MOV_B32_e32, AMDGPU::S_MOV_B64,
3612 AMDGPU::V_MOV_B64_PSEUDO, AMDGPU::V_ACCVGPR_WRITE_B32_e64}) {
3613 const MCInstrDesc &MovDesc = get(MovOp);
3614
3615 const TargetRegisterClass *MovDstRC = getRegClass(MovDesc, 0, &RI);
3616 if (Is16Bit) {
3617 // We just need to find a correctly sized register class, so the
3618 // subregister index compatibility doesn't matter since we're statically
3619 // extracting the immediate value.
3620 MovDstRC = RI.getMatchingSuperRegClass(MovDstRC, DstRC, AMDGPU::lo16);
3621 if (!MovDstRC)
3622 continue;
3623
3624 if (MovDstPhysReg) {
3625 // FIXME: We probably should not do this. If there is a live value in
3626 // the high half of the register, it will be corrupted.
3627 MovDstPhysReg =
3628 RI.getMatchingSuperReg(MovDstPhysReg, AMDGPU::lo16, MovDstRC);
3629 if (!MovDstPhysReg)
3630 continue;
3631 }
3632 }
3633
3634 // Result class isn't the right size, try the next instruction.
3635 if (MovDstPhysReg) {
3636 if (!MovDstRC->contains(MovDstPhysReg))
3637 return false;
3638 } else if (!MRI->constrainRegClass(DstReg, MovDstRC)) {
3639 // TODO: This will be overly conservative in the case of 16-bit virtual
3640 // SGPRs. We could hack up the virtual register uses to use a compatible
3641 // 32-bit class.
3642 continue;
3643 }
3644
3645 const MCOperandInfo &OpInfo = MovDesc.operands()[1];
3646
3647 // Ensure the interpreted immediate value is a valid operand in the new
3648 // mov.
3649 //
3650 // FIXME: isImmOperandLegal should have form that doesn't require existing
3651 // MachineInstr or MachineOperand
3652 if (!RI.opCanUseLiteralConstant(OpInfo.OperandType) &&
3653 !isInlineConstant(*SubRegImm, OpInfo.OperandType))
3654 break;
3655
3656 NewOpc = MovOp;
3657 break;
3658 }
3659
3660 if (NewOpc == AMDGPU::INSTRUCTION_LIST_END)
3661 return false;
3662
3663 if (Is16Bit) {
3664 UseMI.getOperand(0).setSubReg(AMDGPU::NoSubRegister);
3665 if (MovDstPhysReg)
3666 UseMI.getOperand(0).setReg(MovDstPhysReg);
3667 assert(UseMI.getOperand(1).getReg().isVirtual());
3668 }
3669
3670 const MCInstrDesc &NewMCID = get(NewOpc);
3671 UseMI.setDesc(NewMCID);
3672 UseMI.getOperand(1).ChangeToImmediate(*SubRegImm);
3673 UseMI.addImplicitDefUseOperands(*MF);
3674 return true;
3675 }
3676
3677 if (HasMultipleUses)
3678 return false;
3679
3680 if (Opc == AMDGPU::V_MAD_F32_e64 || Opc == AMDGPU::V_MAC_F32_e64 ||
3681 Opc == AMDGPU::V_MAD_F16_e64 || Opc == AMDGPU::V_MAC_F16_e64 ||
3682 Opc == AMDGPU::V_FMA_F32_e64 || Opc == AMDGPU::V_FMAC_F32_e64 ||
3683 Opc == AMDGPU::V_FMA_F16_e64 || Opc == AMDGPU::V_FMAC_F16_e64 ||
3684 Opc == AMDGPU::V_FMAC_F16_t16_e64 ||
3685 Opc == AMDGPU::V_FMAC_F16_fake16_e64 || Opc == AMDGPU::V_FMA_F64_e64 ||
3686 Opc == AMDGPU::V_FMAC_F64_e64) {
3687 // Don't fold if we are using source or output modifiers. The new VOP2
3688 // instructions don't have them.
3689 if (hasAnyModifiersSet(UseMI))
3690 return false;
3691
3692 // If this is a free constant, there's no reason to do this.
3693 // TODO: We could fold this here instead of letting SIFoldOperands do it
3694 // later.
3695 int Src0Idx = getNamedOperandIdx(UseMI.getOpcode(), AMDGPU::OpName::src0);
3696
3697 // Any src operand can be used for the legality check.
3698 if (isInlineConstant(UseMI, Src0Idx, Imm))
3699 return false;
3700
3701 MachineOperand *Src0 = &UseMI.getOperand(Src0Idx);
3702
3703 MachineOperand *Src1 = getNamedOperand(UseMI, AMDGPU::OpName::src1);
3704 MachineOperand *Src2 = getNamedOperand(UseMI, AMDGPU::OpName::src2);
3705
3706 // Multiplied part is the constant: Use v_madmk_{f16, f32}.
3707 if ((Src0->isReg() && Src0->getReg() == Reg) ||
3708 (Src1->isReg() && Src1->getReg() == Reg)) {
3709 MachineOperand *RegSrc =
3710 Src1->isReg() && Src1->getReg() == Reg ? Src0 : Src1;
3711 if (!RegSrc->isReg())
3712 return false;
3713 if (RI.isSGPRClass(MRI->getRegClass(RegSrc->getReg())) &&
3714 ST.getConstantBusLimit(Opc) < 2)
3715 return false;
3716
3717 if (!Src2->isReg() || RI.isSGPRClass(MRI->getRegClass(Src2->getReg())))
3718 return false;
3719
3720 // If src2 is also a literal constant then we have to choose which one to
3721 // fold. In general it is better to choose madak so that the other literal
3722 // can be materialized in an sgpr instead of a vgpr:
3723 // s_mov_b32 s0, literal
3724 // v_madak_f32 v0, s0, v0, literal
3725 // Instead of:
3726 // v_mov_b32 v1, literal
3727 // v_madmk_f32 v0, v0, literal, v1
3728 MachineInstr *Def = MRI->getUniqueVRegDef(Src2->getReg());
3729 if (Def && Def->isMoveImmediate() &&
3730 !isInlineConstant(Def->getOperand(1)))
3731 return false;
3732
3733 unsigned NewOpc = getNewFMAMKInst(ST, Opc);
3734 if (pseudoToMCOpcode(NewOpc) == -1)
3735 return false;
3736
3737 // V_FMAMK_F16_t16 takes VGPR_16_Lo128 operands while V_FMAMK_F16_fake16
3738 // takes VGPR_32_Lo128 operands, so the rewrite would also require
3739 // restricting their register classes. For now just bail out.
3740 if (NewOpc == AMDGPU::V_FMAMK_F16_t16 ||
3741 NewOpc == AMDGPU::V_FMAMK_F16_fake16)
3742 return false;
3743
3744 const std::optional<int64_t> SubRegImm = extractSubregFromImm(
3745 Imm, RegSrc == Src1 ? Src0->getSubReg() : Src1->getSubReg());
3746
3747 // FIXME: This would be a lot easier if we could return a new instruction
3748 // instead of having to modify in place.
3749
3750 Register SrcReg = RegSrc->getReg();
3751 unsigned SrcSubReg = RegSrc->getSubReg();
3752 Src0->setReg(SrcReg);
3753 Src0->setSubReg(SrcSubReg);
3754 Src0->setIsKill(RegSrc->isKill());
3755
3756 if (Opc == AMDGPU::V_MAC_F32_e64 || Opc == AMDGPU::V_MAC_F16_e64 ||
3757 Opc == AMDGPU::V_FMAC_F32_e64 || Opc == AMDGPU::V_FMAC_F16_t16_e64 ||
3758 Opc == AMDGPU::V_FMAC_F16_fake16_e64 ||
3759 Opc == AMDGPU::V_FMAC_F16_e64 || Opc == AMDGPU::V_FMAC_F64_e64)
3760 UseMI.untieRegOperand(
3761 AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::src2));
3762
3763 Src1->ChangeToImmediate(*SubRegImm);
3764
3765 removeModOperands(UseMI);
3766 UseMI.setDesc(get(NewOpc));
3767
3768 bool DeleteDef = MRI->use_nodbg_empty(Reg);
3769 if (DeleteDef)
3770 DefMI.eraseFromParent();
3771
3772 return true;
3773 }
3774
3775 // Added part is the constant: Use v_madak_{f16, f32}.
3776 if (Src2->isReg() && Src2->getReg() == Reg) {
3777 if (ST.getConstantBusLimit(Opc) < 2) {
3778 // Not allowed to use constant bus for another operand.
3779 // We can however allow an inline immediate as src0.
3780 bool Src0Inlined = false;
3781 if (Src0->isReg()) {
3782 // Try to inline constant if possible.
3783 // If the Def moves immediate and the use is single
3784 // We are saving VGPR here.
3785 MachineInstr *Def = MRI->getUniqueVRegDef(Src0->getReg());
3786 if (Def && Def->isMoveImmediate() &&
3787 isInlineConstant(Def->getOperand(1)) &&
3788 MRI->hasOneNonDBGUse(Src0->getReg())) {
3789 Src0->ChangeToImmediate(Def->getOperand(1).getImm());
3790 Src0Inlined = true;
3791 } else if (ST.getConstantBusLimit(Opc) <= 1 &&
3792 RI.isSGPRReg(*MRI, Src0->getReg())) {
3793 return false;
3794 }
3795 // VGPR is okay as Src0 - fallthrough
3796 }
3797
3798 if (Src1->isReg() && !Src0Inlined) {
3799 // We have one slot for inlinable constant so far - try to fill it
3800 MachineInstr *Def = MRI->getUniqueVRegDef(Src1->getReg());
3801 if (Def && Def->isMoveImmediate() &&
3802 isInlineConstant(Def->getOperand(1)) &&
3803 MRI->hasOneNonDBGUse(Src1->getReg()) && commuteInstruction(UseMI))
3804 Src0->ChangeToImmediate(Def->getOperand(1).getImm());
3805 else if (RI.isSGPRReg(*MRI, Src1->getReg()))
3806 return false;
3807 // VGPR is okay as Src1 - fallthrough
3808 }
3809 }
3810
3811 unsigned NewOpc = getNewFMAAKInst(ST, Opc);
3812 if (pseudoToMCOpcode(NewOpc) == -1)
3813 return false;
3814
3815 // V_FMAAK_F16_t16 takes VGPR_16_Lo128 operands while V_FMAAK_F16_fake16
3816 // takes VGPR_32_Lo128 operands, so the rewrite would also require
3817 // restricting their register classes. For now just bail out.
3818 if (NewOpc == AMDGPU::V_FMAAK_F16_t16 ||
3819 NewOpc == AMDGPU::V_FMAAK_F16_fake16)
3820 return false;
3821
3822 // FIXME: This would be a lot easier if we could return a new instruction
3823 // instead of having to modify in place.
3824
3825 if (Opc == AMDGPU::V_MAC_F32_e64 || Opc == AMDGPU::V_MAC_F16_e64 ||
3826 Opc == AMDGPU::V_FMAC_F32_e64 || Opc == AMDGPU::V_FMAC_F16_t16_e64 ||
3827 Opc == AMDGPU::V_FMAC_F16_fake16_e64 ||
3828 Opc == AMDGPU::V_FMAC_F16_e64 || Opc == AMDGPU::V_FMAC_F64_e64)
3829 UseMI.untieRegOperand(
3830 AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::src2));
3831
3832 const std::optional<int64_t> SubRegImm =
3833 extractSubregFromImm(Imm, Src2->getSubReg());
3834
3835 // ChangingToImmediate adds Src2 back to the instruction.
3836 Src2->ChangeToImmediate(*SubRegImm);
3837
3838 // These come before src2.
3839 removeModOperands(UseMI);
3840 UseMI.setDesc(get(NewOpc));
3841 // It might happen that UseMI was commuted
3842 // and we now have SGPR as SRC1. If so 2 inlined
3843 // constant and SGPR are illegal.
3844 legalizeOperands(UseMI);
3845
3846 bool DeleteDef = MRI->use_nodbg_empty(Reg);
3847 if (DeleteDef)
3848 DefMI.eraseFromParent();
3849
3850 return true;
3851 }
3852 }
3853
3854 return false;
3855}
3856
3857static bool
3858memOpsHaveSameBaseOperands(ArrayRef<const MachineOperand *> BaseOps1,
3859 ArrayRef<const MachineOperand *> BaseOps2) {
3860 if (BaseOps1.size() != BaseOps2.size())
3861 return false;
3862 for (size_t I = 0, E = BaseOps1.size(); I < E; ++I) {
3863 if (!BaseOps1[I]->isIdenticalTo(*BaseOps2[I]))
3864 return false;
3865 }
3866 return true;
3867}
3868
3869static bool offsetsDoNotOverlap(LocationSize WidthA, int OffsetA,
3870 LocationSize WidthB, int OffsetB) {
3871 int LowOffset = OffsetA < OffsetB ? OffsetA : OffsetB;
3872 int HighOffset = OffsetA < OffsetB ? OffsetB : OffsetA;
3873 LocationSize LowWidth = (LowOffset == OffsetA) ? WidthA : WidthB;
3874 return LowWidth.hasValue() &&
3875 LowOffset + (int)LowWidth.getValue() <= HighOffset;
3876}
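// Illustrative: a 4-byte access at offset 0 and an 8-byte access at offset 4
// do not overlap (0 + 4 <= 4); if the second access started at offset 3 the
// two would overlap.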
3877
3878bool SIInstrInfo::checkInstOffsetsDoNotOverlap(const MachineInstr &MIa,
3879 const MachineInstr &MIb) const {
3880 SmallVector<const MachineOperand *, 4> BaseOps0, BaseOps1;
3881 int64_t Offset0, Offset1;
3882 LocationSize Dummy0 = LocationSize::precise(0);
3883 LocationSize Dummy1 = LocationSize::precise(0);
3884 bool Offset0IsScalable, Offset1IsScalable;
3885 if (!getMemOperandsWithOffsetWidth(MIa, BaseOps0, Offset0, Offset0IsScalable,
3886 Dummy0, &RI) ||
3887 !getMemOperandsWithOffsetWidth(MIb, BaseOps1, Offset1, Offset1IsScalable,
3888 Dummy1, &RI))
3889 return false;
3890
3891 if (!memOpsHaveSameBaseOperands(BaseOps0, BaseOps1))
3892 return false;
3893
3894 if (!MIa.hasOneMemOperand() || !MIb.hasOneMemOperand()) {
3895 // FIXME: Handle ds_read2 / ds_write2.
3896 return false;
3897 }
3898 LocationSize Width0 = MIa.memoperands().front()->getSize();
3899 LocationSize Width1 = MIb.memoperands().front()->getSize();
3900 return offsetsDoNotOverlap(Width0, Offset0, Width1, Offset1);
3901}
3902
3903bool SIInstrInfo::areMemAccessesTriviallyDisjoint(const MachineInstr &MIa,
3904 const MachineInstr &MIb) const {
3905 assert(MIa.mayLoadOrStore() &&
3906 "MIa must load from or modify a memory location");
3907 assert(MIb.mayLoadOrStore() &&
3908 "MIb must load from or modify a memory location");
3909
3910 if (MIa.hasUnmodeledSideEffects() || MIb.hasUnmodeledSideEffects())
3911 return false;
3912
3913 // XXX - Can we relax this between address spaces?
3914 if (MIa.hasOrderedMemoryRef() || MIb.hasOrderedMemoryRef())
3915 return false;
3916
3917 if (isLDSDMA(MIa) || isLDSDMA(MIb))
3918 return false;
3919
3920 if (MIa.isBundle() || MIb.isBundle())
3921 return false;
3922
3923 // TODO: Should we check the address space from the MachineMemOperand? That
3924 // would allow us to distinguish objects we know don't alias based on the
3925 // underlying address space, even if it was lowered to a different one,
3926 // e.g. private accesses lowered to use MUBUF instructions on a scratch
3927 // buffer.
3928 if (isDS(MIa)) {
3929 if (isDS(MIb))
3930 return checkInstOffsetsDoNotOverlap(MIa, MIb);
3931
3932 return !isFLAT(MIb) || isSegmentSpecificFLAT(MIb);
3933 }
3934
3935 if (isMUBUF(MIa) || isMTBUF(MIa)) {
3936 if (isMUBUF(MIb) || isMTBUF(MIb))
3937 return checkInstOffsetsDoNotOverlap(MIa, MIb);
3938
3939 if (isFLAT(MIb))
3940 return isFLATScratch(MIb);
3941
3942 return !isSMRD(MIb);
3943 }
3944
3945 if (isSMRD(MIa)) {
3946 if (isSMRD(MIb))
3947 return checkInstOffsetsDoNotOverlap(MIa, MIb);
3948
3949 if (isFLAT(MIb))
3950 return isFLATScratch(MIb);
3951
3952 return !isMUBUF(MIb) && !isMTBUF(MIb);
3953 }
3954
3955 if (isFLAT(MIa)) {
3956 if (isFLAT(MIb)) {
3957 if ((isFLATScratch(MIa) && isFLATGlobal(MIb)) ||
3958 (isFLATGlobal(MIa) && isFLATScratch(MIb)))
3959 return true;
3960
3961 return checkInstOffsetsDoNotOverlap(MIa, MIb);
3962 }
3963
3964 return false;
3965 }
3966
3967 return false;
3968}
3969
3970static bool getFoldableImm(Register Reg, const MachineRegisterInfo &MRI,
3971 int64_t &Imm, MachineInstr **DefMI = nullptr) {
3972 if (Reg.isPhysical())
3973 return false;
3974 auto *Def = MRI.getUniqueVRegDef(Reg);
3975 if (Def && SIInstrInfo::isFoldableCopy(*Def) && Def->getOperand(1).isImm()) {
3976 Imm = Def->getOperand(1).getImm();
3977 if (DefMI)
3978 *DefMI = Def;
3979 return true;
3980 }
3981 return false;
3982}
3983
3984static bool getFoldableImm(const MachineOperand *MO, int64_t &Imm,
3985 MachineInstr **DefMI = nullptr) {
3986 if (!MO->isReg())
3987 return false;
3988 const MachineFunction *MF = MO->getParent()->getParent()->getParent();
3989 const MachineRegisterInfo &MRI = MF->getRegInfo();
3990 return getFoldableImm(MO->getReg(), MRI, Imm, DefMI);
3991}
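// Both getFoldableImm overloads look through a foldable immediate move (see
// isFoldableCopy above) defining a virtual register and, on success, report
// the moved value and optionally the defining instruction.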
3992
3993static void updateLiveVariables(LiveVariables *LV, MachineInstr &MI,
3994 MachineInstr &NewMI) {
3995 if (LV) {
3996 unsigned NumOps = MI.getNumOperands();
3997 for (unsigned I = 1; I < NumOps; ++I) {
3998 MachineOperand &Op = MI.getOperand(I);
3999 if (Op.isReg() && Op.isKill())
4000 LV->replaceKillInstruction(Op.getReg(), MI, NewMI);
4001 }
4002 }
4003}
4004
4005static unsigned getNewFMAInst(const GCNSubtarget &ST, unsigned Opc) {
4006 switch (Opc) {
4007 case AMDGPU::V_MAC_F16_e32:
4008 case AMDGPU::V_MAC_F16_e64:
4009 return AMDGPU::V_MAD_F16_e64;
4010 case AMDGPU::V_MAC_F32_e32:
4011 case AMDGPU::V_MAC_F32_e64:
4012 return AMDGPU::V_MAD_F32_e64;
4013 case AMDGPU::V_MAC_LEGACY_F32_e32:
4014 case AMDGPU::V_MAC_LEGACY_F32_e64:
4015 return AMDGPU::V_MAD_LEGACY_F32_e64;
4016 case AMDGPU::V_FMAC_LEGACY_F32_e32:
4017 case AMDGPU::V_FMAC_LEGACY_F32_e64:
4018 return AMDGPU::V_FMA_LEGACY_F32_e64;
4019 case AMDGPU::V_FMAC_F16_e32:
4020 case AMDGPU::V_FMAC_F16_e64:
4021 case AMDGPU::V_FMAC_F16_t16_e64:
4022 case AMDGPU::V_FMAC_F16_fake16_e64:
4023 return ST.hasTrue16BitInsts() ? ST.useRealTrue16Insts()
4024 ? AMDGPU::V_FMA_F16_gfx9_t16_e64
4025 : AMDGPU::V_FMA_F16_gfx9_fake16_e64
4026 : AMDGPU::V_FMA_F16_gfx9_e64;
4027 case AMDGPU::V_FMAC_F32_e32:
4028 case AMDGPU::V_FMAC_F32_e64:
4029 return AMDGPU::V_FMA_F32_e64;
4030 case AMDGPU::V_FMAC_F64_e32:
4031 case AMDGPU::V_FMAC_F64_e64:
4032 return AMDGPU::V_FMA_F64_e64;
4033 default:
4034 llvm_unreachable("invalid instruction");
4035 }
4036}
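// Note: getNewFMAInst maps the two-address MAC/FMAC forms (where the
// destination also serves as the accumulator) to their three-address MAD/FMA
// equivalents, e.g. V_FMAC_F32_e32 -> V_FMA_F32_e64.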
4037
4038/// Helper struct for the implementation of 3-address conversion to communicate
4039/// updates made to instruction operands.
4040struct ThreeAddressUpdates {
4041 /// Other instruction whose def is no longer used by the converted
4042 /// instruction.
4043 MachineInstr *RemoveMIUse = nullptr;
4044};
4045
4046MachineInstr *SIInstrInfo::convertToThreeAddress(MachineInstr &MI,
4047 LiveVariables *LV,
4048 LiveIntervals *LIS) const {
4049 MachineBasicBlock &MBB = *MI.getParent();
4050 ThreeAddressUpdates U;
4051 MachineInstr *NewMI = convertToThreeAddressImpl(MI, U);
4052
4053 if (NewMI) {
4054 updateLiveVariables(LV, MI, *NewMI);
4055 if (LIS) {
4056 LIS->ReplaceMachineInstrInMaps(MI, *NewMI);
4057 // SlotIndex of defs needs to be updated when converting to early-clobber
4058 MachineOperand &Def = NewMI->getOperand(0);
4059 if (Def.isEarlyClobber() && Def.isReg() &&
4060 LIS->hasInterval(Def.getReg())) {
4061 SlotIndex OldIndex = LIS->getInstructionIndex(*NewMI).getRegSlot(false);
4062 SlotIndex NewIndex = LIS->getInstructionIndex(*NewMI).getRegSlot(true);
4063 auto &LI = LIS->getInterval(Def.getReg());
4064 auto UpdateDefIndex = [&](LiveRange &LR) {
4065 auto *S = LR.find(OldIndex);
4066 if (S != LR.end() && S->start == OldIndex) {
4067 assert(S->valno && S->valno->def == OldIndex);
4068 S->start = NewIndex;
4069 S->valno->def = NewIndex;
4070 }
4071 };
4072 UpdateDefIndex(LI);
4073 for (auto &SR : LI.subranges())
4074 UpdateDefIndex(SR);
4075 }
4076 }
4077 }
4078
4079 if (U.RemoveMIUse) {
4080 MachineRegisterInfo &MRI = MBB.getParent()->getRegInfo();
4081 // The only user is the instruction which will be killed.
4082 Register DefReg = U.RemoveMIUse->getOperand(0).getReg();
4083
4084 if (MRI.hasOneNonDBGUse(DefReg)) {
4085 // We cannot just remove the DefMI here, calling pass will crash.
4086 U.RemoveMIUse->setDesc(get(AMDGPU::IMPLICIT_DEF));
4087 U.RemoveMIUse->getOperand(0).setIsDead(true);
4088 for (unsigned I = U.RemoveMIUse->getNumOperands() - 1; I != 0; --I)
4089 U.RemoveMIUse->removeOperand(I);
4090 if (LV)
4091 LV->getVarInfo(DefReg).AliveBlocks.clear();
4092 }
4093
4094 if (LIS) {
4095 LiveInterval &DefLI = LIS->getInterval(DefReg);
4096
4097 // We cannot delete the original instruction here, so hack out the use
4098 // in the original instruction with a dummy register so we can use
4099 // shrinkToUses to deal with any multi-use edge cases. Other targets do
4100 // not have the complexity of deleting a use to consider here.
4101 Register DummyReg = MRI.cloneVirtualRegister(DefReg);
4102 for (MachineOperand &MIOp : MI.uses()) {
4103 if (MIOp.isReg() && MIOp.getReg() == DefReg) {
4104 MIOp.setIsUndef(true);
4105 MIOp.setReg(DummyReg);
4106 }
4107 }
4108
4109 LIS->shrinkToUses(&DefLI);
4110 }
4111 }
4112
4113 return NewMI;
4114}
4115
4116MachineInstr *
4117SIInstrInfo::convertToThreeAddressImpl(MachineInstr &MI,
4118 ThreeAddressUpdates &U) const {
4119 MachineBasicBlock &MBB = *MI.getParent();
4120 unsigned Opc = MI.getOpcode();
4121
4122 // Handle MFMA.
4123 int NewMFMAOpc = AMDGPU::getMFMAEarlyClobberOp(Opc);
4124 if (NewMFMAOpc != -1) {
4125 MachineInstrBuilder MIB =
4126 BuildMI(MBB, MI, MI.getDebugLoc(), get(NewMFMAOpc));
4127 for (unsigned I = 0, E = MI.getNumOperands(); I != E; ++I)
4128 MIB.add(MI.getOperand(I));
4129 return MIB;
4130 }
4131
4132 if (SIInstrInfo::isWMMA(MI)) {
4133 unsigned NewOpc = AMDGPU::mapWMMA2AddrTo3AddrOpcode(MI.getOpcode());
4134 MachineInstrBuilder MIB = BuildMI(MBB, MI, MI.getDebugLoc(), get(NewOpc))
4135 .setMIFlags(MI.getFlags());
4136 for (unsigned I = 0, E = MI.getNumOperands(); I != E; ++I)
4137 MIB->addOperand(MI.getOperand(I));
4138 return MIB;
4139 }
4140
4141 assert(Opc != AMDGPU::V_FMAC_F16_t16_e32 &&
4142 Opc != AMDGPU::V_FMAC_F16_fake16_e32 &&
4143 "V_FMAC_F16_t16/fake16_e32 is not supported and not expected to be "
4144 "present pre-RA");
4145
4146 // Handle MAC/FMAC.
4147 bool IsF64 = Opc == AMDGPU::V_FMAC_F64_e32 || Opc == AMDGPU::V_FMAC_F64_e64;
4148 bool IsLegacy = Opc == AMDGPU::V_MAC_LEGACY_F32_e32 ||
4149 Opc == AMDGPU::V_MAC_LEGACY_F32_e64 ||
4150 Opc == AMDGPU::V_FMAC_LEGACY_F32_e32 ||
4151 Opc == AMDGPU::V_FMAC_LEGACY_F32_e64;
4152 bool Src0Literal = false;
4153
4154 switch (Opc) {
4155 default:
4156 return nullptr;
4157 case AMDGPU::V_MAC_F16_e64:
4158 case AMDGPU::V_FMAC_F16_e64:
4159 case AMDGPU::V_FMAC_F16_t16_e64:
4160 case AMDGPU::V_FMAC_F16_fake16_e64:
4161 case AMDGPU::V_MAC_F32_e64:
4162 case AMDGPU::V_MAC_LEGACY_F32_e64:
4163 case AMDGPU::V_FMAC_F32_e64:
4164 case AMDGPU::V_FMAC_LEGACY_F32_e64:
4165 case AMDGPU::V_FMAC_F64_e64:
4166 break;
4167 case AMDGPU::V_MAC_F16_e32:
4168 case AMDGPU::V_FMAC_F16_e32:
4169 case AMDGPU::V_MAC_F32_e32:
4170 case AMDGPU::V_MAC_LEGACY_F32_e32:
4171 case AMDGPU::V_FMAC_F32_e32:
4172 case AMDGPU::V_FMAC_LEGACY_F32_e32:
4173 case AMDGPU::V_FMAC_F64_e32: {
4174 int Src0Idx = AMDGPU::getNamedOperandIdx(MI.getOpcode(),
4175 AMDGPU::OpName::src0);
4176 const MachineOperand *Src0 = &MI.getOperand(Src0Idx);
4177 if (!Src0->isReg() && !Src0->isImm())
4178 return nullptr;
4179
4180 if (Src0->isImm() && !isInlineConstant(MI, Src0Idx, *Src0))
4181 Src0Literal = true;
4182
4183 break;
4184 }
4185 }
4186
4187 MachineInstrBuilder MIB;
4188 const MachineOperand *Dst = getNamedOperand(MI, AMDGPU::OpName::vdst);
4189 const MachineOperand *Src0 = getNamedOperand(MI, AMDGPU::OpName::src0);
4190 const MachineOperand *Src0Mods =
4191 getNamedOperand(MI, AMDGPU::OpName::src0_modifiers);
4192 const MachineOperand *Src1 = getNamedOperand(MI, AMDGPU::OpName::src1);
4193 const MachineOperand *Src1Mods =
4194 getNamedOperand(MI, AMDGPU::OpName::src1_modifiers);
4195 const MachineOperand *Src2 = getNamedOperand(MI, AMDGPU::OpName::src2);
4196 const MachineOperand *Src2Mods =
4197 getNamedOperand(MI, AMDGPU::OpName::src2_modifiers);
4198 const MachineOperand *Clamp = getNamedOperand(MI, AMDGPU::OpName::clamp);
4199 const MachineOperand *Omod = getNamedOperand(MI, AMDGPU::OpName::omod);
4200 const MachineOperand *OpSel = getNamedOperand(MI, AMDGPU::OpName::op_sel);
4201
4202 if (!Src0Mods && !Src1Mods && !Src2Mods && !Clamp && !Omod && !IsLegacy &&
4203 (!IsF64 || ST.hasFmaakFmamkF64Insts()) &&
4204 // If we have an SGPR input, we will violate the constant bus restriction.
4205 (ST.getConstantBusLimit(Opc) > 1 || !Src0->isReg() ||
4206 !RI.isSGPRReg(MBB.getParent()->getRegInfo(), Src0->getReg()))) {
4207 MachineInstr *DefMI;
4208
4209 int64_t Imm;
4210 if (!Src0Literal && getFoldableImm(Src2, Imm, &DefMI)) {
4211 unsigned NewOpc = getNewFMAAKInst(ST, Opc);
4212 if (pseudoToMCOpcode(NewOpc) != -1) {
4213 MIB = BuildMI(MBB, MI, MI.getDebugLoc(), get(NewOpc))
4214 .add(*Dst)
4215 .add(*Src0)
4216 .add(*Src1)
4217 .addImm(Imm)
4218 .setMIFlags(MI.getFlags());
4219 U.RemoveMIUse = DefMI;
4220 return MIB;
4221 }
4222 }
4223 unsigned NewOpc = getNewFMAMKInst(ST, Opc);
4224 if (!Src0Literal && getFoldableImm(Src1, Imm, &DefMI)) {
4225 if (pseudoToMCOpcode(NewOpc) != -1) {
4226 MIB = BuildMI(MBB, MI, MI.getDebugLoc(), get(NewOpc))
4227 .add(*Dst)
4228 .add(*Src0)
4229 .addImm(Imm)
4230 .add(*Src2)
4231 .setMIFlags(MI.getFlags());
4232 U.RemoveMIUse = DefMI;
4233 return MIB;
4234 }
4235 }
4236 if (Src0Literal || getFoldableImm(Src0, Imm, &DefMI)) {
4237 if (Src0Literal) {
4238 Imm = Src0->getImm();
4239 DefMI = nullptr;
4240 }
4241 if (pseudoToMCOpcode(NewOpc) != -1 &&
4242 isOperandLegal(
4243 MI, AMDGPU::getNamedOperandIdx(NewOpc, AMDGPU::OpName::src0),
4244 Src1)) {
4245 MIB = BuildMI(MBB, MI, MI.getDebugLoc(), get(NewOpc))
4246 .add(*Dst)
4247 .add(*Src1)
4248 .addImm(Imm)
4249 .add(*Src2)
4250 .setMIFlags(MI.getFlags());
4251 U.RemoveMIUse = DefMI;
4252 return MIB;
4253 }
4254 }
4255 }
4256
4257 // VOP2 mac/fmac with a literal operand cannot be converted to VOP3 mad/fma
4258 // if VOP3 does not allow a literal operand.
4259 if (Src0Literal && !ST.hasVOP3Literal())
4260 return nullptr;
4261
4262 unsigned NewOpc = getNewFMAInst(ST, Opc);
4263
4264 if (pseudoToMCOpcode(NewOpc) == -1)
4265 return nullptr;
4266
4267 MIB = BuildMI(MBB, MI, MI.getDebugLoc(), get(NewOpc))
4268 .add(*Dst)
4269 .addImm(Src0Mods ? Src0Mods->getImm() : 0)
4270 .add(*Src0)
4271 .addImm(Src1Mods ? Src1Mods->getImm() : 0)
4272 .add(*Src1)
4273 .addImm(Src2Mods ? Src2Mods->getImm() : 0)
4274 .add(*Src2)
4275 .addImm(Clamp ? Clamp->getImm() : 0)
4276 .addImm(Omod ? Omod->getImm() : 0)
4277 .setMIFlags(MI.getFlags());
4278 if (AMDGPU::hasNamedOperand(NewOpc, AMDGPU::OpName::op_sel))
4279 MIB.addImm(OpSel ? OpSel->getImm() : 0);
4280 return MIB;
4281}
4282
4283// It's not generally safe to move VALU instructions across these since it will
4284// start using the register as a base index rather than directly.
4285// XXX - Why isn't hasSideEffects sufficient for these?
4286static bool changesVGPRIndexingMode(const MachineInstr &MI) {
4287 switch (MI.getOpcode()) {
4288 case AMDGPU::S_SET_GPR_IDX_ON:
4289 case AMDGPU::S_SET_GPR_IDX_MODE:
4290 case AMDGPU::S_SET_GPR_IDX_OFF:
4291 return true;
4292 default:
4293 return false;
4294 }
4295}
4296
4297bool SIInstrInfo::isSchedulingBoundary(const MachineInstr &MI,
4298 const MachineBasicBlock *MBB,
4299 const MachineFunction &MF) const {
4300 // Skipping the check for SP writes in the base implementation. The reason it
4301 // was added was apparently due to compile time concerns.
4302 //
4303 // TODO: Do we really want this barrier? It triggers unnecessary hazard nops
4304 // but is probably avoidable.
4305
4306 // Copied from base implementation.
4307 // Terminators and labels can't be scheduled around.
4308 if (MI.isTerminator() || MI.isPosition())
4309 return true;
4310
4311 // INLINEASM_BR can jump to another block
4312 if (MI.getOpcode() == TargetOpcode::INLINEASM_BR)
4313 return true;
4314
4315 if (MI.getOpcode() == AMDGPU::SCHED_BARRIER && MI.getOperand(0).getImm() == 0)
4316 return true;
4317
4318 // Target-independent instructions do not have an implicit-use of EXEC, even
4319 // when they operate on VGPRs. Treating EXEC modifications as scheduling
4320 // boundaries prevents incorrect movements of such instructions.
4321 return MI.modifiesRegister(AMDGPU::EXEC, &RI) ||
4322 MI.getOpcode() == AMDGPU::S_SETREG_IMM32_B32 ||
4323 MI.getOpcode() == AMDGPU::S_SETREG_B32 ||
4324 MI.getOpcode() == AMDGPU::S_SETPRIO ||
4325 MI.getOpcode() == AMDGPU::S_SETPRIO_INC_WG ||
4326 changesVGPRIndexingMode(MI);
4327}
4328
4329bool SIInstrInfo::isAlwaysGDS(uint16_t Opcode) const {
4330 return Opcode == AMDGPU::DS_ORDERED_COUNT ||
4331 Opcode == AMDGPU::DS_ADD_GS_REG_RTN ||
4332 Opcode == AMDGPU::DS_SUB_GS_REG_RTN || isGWS(Opcode);
4333}
4334
4335bool SIInstrInfo::mayAccessScratchThroughFlat(const MachineInstr &MI) const {
4336 if (!isFLAT(MI) || isFLATGlobal(MI))
4337 return false;
4338
4339 // If scratch is not initialized, we can never access it.
4340 if (MI.getMF()->getFunction().hasFnAttribute("amdgpu-no-flat-scratch-init"))
4341 return false;
4342
4343 // SCRATCH instructions always access scratch.
4344 if (isFLATScratch(MI))
4345 return true;
4346
4347 // If there are no memory operands then conservatively assume the flat
4348 // operation may access scratch.
4349 if (MI.memoperands_empty())
4350 return true;
4351
4352 // See if any memory operand specifies an address space that involves scratch.
4353 return any_of(MI.memoperands(), [](const MachineMemOperand *Memop) {
4354 unsigned AS = Memop->getAddrSpace();
4355 if (AS == AMDGPUAS::FLAT_ADDRESS) {
4356 const MDNode *MD = Memop->getAAInfo().NoAliasAddrSpace;
4357 return !MD || !AMDGPU::hasValueInRangeLikeMetadata(
4358 *MD, AMDGPUAS::PRIVATE_ADDRESS);
4359 }
4360 return AS == AMDGPUAS::PRIVATE_ADDRESS;
4361 });
4362}
4363
4364bool SIInstrInfo::mayAccessVMEMThroughFlat(const MachineInstr &MI) const {
4365 assert(isFLAT(MI));
4366
4367 // All flat instructions use the VMEM counter except prefetch.
4368 if (!usesVM_CNT(MI))
4369 return false;
4370
4371 // If there are no memory operands then conservatively assume the flat
4372 // operation may access VMEM.
4373 if (MI.memoperands_empty())
4374 return true;
4375
4376 // See if any memory operand specifies an address space that involves VMEM.
4377 // Flat operations only supported FLAT, LOCAL (LDS), or address spaces
4378 // involving VMEM such as GLOBAL, CONSTANT, PRIVATE (SCRATCH), etc. The REGION
4379 // (GDS) address space is not supported by flat operations. Therefore, simply
4380 // return true unless only the LDS address space is found.
4381 for (const MachineMemOperand *Memop : MI.memoperands()) {
4382 unsigned AS = Memop->getAddrSpace();
4384 if (AS != AMDGPUAS::LOCAL_ADDRESS)
4385 return true;
4386 }
4387
4388 return false;
4389}
4390
4391bool SIInstrInfo::mayAccessLDSThroughFlat(const MachineInstr &MI) const {
4392 assert(isFLAT(MI));
4393
4394 // Flat instructions such as SCRATCH and GLOBAL do not use the lgkm counter.
4395 if (!usesLGKM_CNT(MI))
4396 return false;
4397
4398 // If in tgsplit mode then there can be no use of LDS.
4399 if (ST.isTgSplitEnabled())
4400 return false;
4401
4402 // If there are no memory operands then conservatively assume the flat
4403 // operation may access LDS.
4404 if (MI.memoperands_empty())
4405 return true;
4406
4407 // See if any memory operand specifies an address space that involves LDS.
4408 for (const MachineMemOperand *Memop : MI.memoperands()) {
4409 unsigned AS = Memop->getAddrSpace();
4410 if (AS == AMDGPUAS::LOCAL_ADDRESS || AS == AMDGPUAS::FLAT_ADDRESS)
4411 return true;
4412 }
4413
4414 return false;
4415}
4416
4417static bool modifiesModeRegister(const MachineInstr &MI) {
4418 // Skip the full operand and register alias search modifiesRegister
4419 // does. There's only a handful of instructions that touch this, it's only an
4420 // implicit def, and doesn't alias any other registers.
4421 return is_contained(MI.getDesc().implicit_defs(), AMDGPU::MODE);
4422}
4423
4424bool SIInstrInfo::hasUnwantedEffectsWhenEXECEmpty(const MachineInstr &MI) const {
4425 unsigned Opcode = MI.getOpcode();
4426
4427 if (MI.mayStore() && isSMRD(MI))
4428 return true; // scalar store or atomic
4429
4430 // This will terminate the function when other lanes may need to continue.
4431 if (MI.isReturn())
4432 return true;
4433
4434 // These instructions cause shader I/O that may cause hardware lockups
4435 // when executed with an empty EXEC mask.
4436 //
4437 // Note: exp with VM = DONE = 0 is automatically skipped by hardware when
4438 // EXEC = 0, but checking for that case here seems not worth it
4439 // given the typical code patterns.
4440 if (Opcode == AMDGPU::S_SENDMSG || Opcode == AMDGPU::S_SENDMSGHALT ||
4441 isEXP(Opcode) || Opcode == AMDGPU::DS_ORDERED_COUNT ||
4442 Opcode == AMDGPU::S_TRAP || Opcode == AMDGPU::S_WAIT_EVENT)
4443 return true;
4444
4445 if (MI.isCall() || MI.isInlineAsm())
4446 return true; // conservative assumption
4447
4448 // Assume that barrier interactions are only intended with active lanes.
4449 if (isBarrier(Opcode))
4450 return true;
4451
4452 // A mode change is a scalar operation that influences vector instructions.
4453 if (modifiesModeRegister(MI))
4454 return true;
4455
4456 // These are like SALU instructions in terms of effects, so it's questionable
4457 // whether we should return true for those.
4458 //
4459 // However, executing them with EXEC = 0 causes them to operate on undefined
4460 // data, which we avoid by returning true here.
4461 if (Opcode == AMDGPU::V_READFIRSTLANE_B32 ||
4462 Opcode == AMDGPU::V_READLANE_B32 || Opcode == AMDGPU::V_WRITELANE_B32 ||
4463 Opcode == AMDGPU::SI_RESTORE_S32_FROM_VGPR ||
4464 Opcode == AMDGPU::SI_SPILL_S32_TO_VGPR)
4465 return true;
4466
4467 return false;
4468}
4469
4470bool SIInstrInfo::mayReadEXEC(const MachineRegisterInfo &MRI,
4471 const MachineInstr &MI) const {
4472 if (MI.isMetaInstruction())
4473 return false;
4474
4475 // This won't read exec if this is an SGPR->SGPR copy.
4476 if (MI.isCopyLike()) {
4477 if (!RI.isSGPRReg(MRI, MI.getOperand(0).getReg()))
4478 return true;
4479
4480 // Make sure this isn't copying exec as a normal operand
4481 return MI.readsRegister(AMDGPU::EXEC, &RI);
4482 }
4483
4484 // Make a conservative assumption about the callee.
4485 if (MI.isCall())
4486 return true;
4487
4488 // Be conservative with any unhandled generic opcodes.
4489 if (!isTargetSpecificOpcode(MI.getOpcode()))
4490 return true;
4491
4492 return !isSALU(MI) || MI.readsRegister(AMDGPU::EXEC, &RI);
4493}
4494
4495bool SIInstrInfo::isInlineConstant(const APInt &Imm) const {
4496 switch (Imm.getBitWidth()) {
4497 case 1: // This likely will be a condition code mask.
4498 return true;
4499
4500 case 32:
4501 return AMDGPU::isInlinableLiteral32(Imm.getSExtValue(),
4502 ST.hasInv2PiInlineImm());
4503 case 64:
4504 return AMDGPU::isInlinableLiteral64(Imm.getSExtValue(),
4505 ST.hasInv2PiInlineImm());
4506 case 16:
4507 return ST.has16BitInsts() &&
4508 AMDGPU::isInlinableLiteralI16(Imm.getSExtValue(),
4509 ST.hasInv2PiInlineImm());
4510 default:
4511 llvm_unreachable("invalid bitwidth");
4512 }
4513}
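// Illustrative: in the 32-bit case, integers in [-16, 64] and the bit
// patterns of +/-0.5, +/-1.0, +/-2.0, +/-4.0 (plus 1/(2*pi) when the
// subtarget has the inv2pi inline constant) are accepted as inline constants.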
4514
4515bool SIInstrInfo::isInlineConstant(const APFloat &Imm) const {
4516 APInt IntImm = Imm.bitcastToAPInt();
4517 int64_t IntImmVal = IntImm.getSExtValue();
4518 bool HasInv2Pi = ST.hasInv2PiInlineImm();
4519 switch (APFloat::SemanticsToEnum(Imm.getSemantics())) {
4520 default:
4521 llvm_unreachable("invalid fltSemantics");
4522 case APFloat::S_IEEEsingle:
4523 case APFloat::S_IEEEdouble:
4524 return isInlineConstant(IntImm);
4525 case APFloat::S_BFloat:
4526 return ST.has16BitInsts() &&
4527 AMDGPU::isInlinableLiteralBF16(IntImmVal, HasInv2Pi);
4528 case APFloat::S_IEEEhalf:
4529 return ST.has16BitInsts() &&
4530 AMDGPU::isInlinableLiteralFP16(IntImmVal, HasInv2Pi);
4531 }
4532}
4533
4534bool SIInstrInfo::isInlineConstant(int64_t Imm, uint8_t OperandType) const {
4535 // MachineOperand provides no way to tell the true operand size, since it only
4536 // records a 64-bit value. We need to know the size to determine if a 32-bit
4537 // floating point immediate bit pattern is legal for an integer immediate. It
4538 // would be for any 32-bit integer operand, but would not be for a 64-bit one.
4539 switch (OperandType) {
4549 int32_t Trunc = static_cast<int32_t>(Imm);
4550 return AMDGPU::isInlinableLiteral32(Trunc, ST.hasInv2PiInlineImm());
4551 }
4557 return AMDGPU::isInlinableLiteral64(Imm, ST.hasInv2PiInlineImm());
4560 // We would expect inline immediates to not be concerned with an integer/fp
4561 // distinction. However, in the case of 16-bit integer operations, the
4562 // "floating point" values appear to not work. It seems read the low 16-bits
4563 // of 32-bit immediates, which happens to always work for the integer
4564 // values.
4565 //
4566 // See llvm bugzilla 46302.
4567 //
4568 // TODO: Theoretically we could use op-sel to use the high bits of the
4569 // 32-bit FP values.
4581 return false;
4584 if (isInt<16>(Imm) || isUInt<16>(Imm)) {
4585 // A few special case instructions have 16-bit operands on subtargets
4586 // where 16-bit instructions are not legal.
4587 // TODO: Do the 32-bit immediates work? We shouldn't really need to handle
4588 // constants in these cases
4589 int16_t Trunc = static_cast<int16_t>(Imm);
4590 return ST.has16BitInsts() &&
4591 AMDGPU::isInlinableLiteralFP16(Trunc, ST.hasInv2PiInlineImm());
4592 }
4593
4594 return false;
4595 }
4598 if (isInt<16>(Imm) || isUInt<16>(Imm)) {
4599 int16_t Trunc = static_cast<int16_t>(Imm);
4600 return ST.has16BitInsts() &&
4601 AMDGPU::isInlinableLiteralBF16(Trunc, ST.hasInv2PiInlineImm());
4602 }
4603 return false;
4604 }
4608 return false;
4610 return isLegalAV64PseudoImm(Imm);
4613 // Always embedded in the instruction for free.
4614 return true;
4624 // Just ignore anything else.
4625 return true;
4626 default:
4627 llvm_unreachable("invalid operand type");
4628 }
4629}
4630
4631static bool compareMachineOp(const MachineOperand &Op0,
4632 const MachineOperand &Op1) {
4633 if (Op0.getType() != Op1.getType())
4634 return false;
4635
4636 switch (Op0.getType()) {
4637 case MachineOperand::MO_Register:
4638 return Op0.getReg() == Op1.getReg();
4639 case MachineOperand::MO_Immediate:
4640 return Op0.getImm() == Op1.getImm();
4641 default:
4642 llvm_unreachable("Didn't expect to be comparing these operand types");
4643 }
4644}
4645
4646bool SIInstrInfo::isLiteralOperandLegal(const MCInstrDesc &InstDesc,
4647 const MCOperandInfo &OpInfo) const {
4648 if (OpInfo.OperandType == MCOI::OPERAND_IMMEDIATE)
4649 return true;
4650
4651 if (!RI.opCanUseLiteralConstant(OpInfo.OperandType))
4652 return false;
4653
4654 if (!isVOP3(InstDesc) || !AMDGPU::isSISrcOperand(OpInfo))
4655 return true;
4656
4657 return ST.hasVOP3Literal();
4658}
4659
4660bool SIInstrInfo::isImmOperandLegal(const MCInstrDesc &InstDesc, unsigned OpNo,
4661 int64_t ImmVal) const {
4662 const MCOperandInfo &OpInfo = InstDesc.operands()[OpNo];
4663 if (isInlineConstant(ImmVal, OpInfo.OperandType)) {
4664 if (isMAI(InstDesc) && ST.hasMFMAInlineLiteralBug() &&
4665 OpNo == (unsigned)AMDGPU::getNamedOperandIdx(InstDesc.getOpcode(),
4666 AMDGPU::OpName::src2))
4667 return false;
4668 return RI.opCanUseInlineConstant(OpInfo.OperandType);
4669 }
4670
4671 return isLiteralOperandLegal(InstDesc, OpInfo);
4672}
4673
4674bool SIInstrInfo::isImmOperandLegal(const MCInstrDesc &InstDesc, unsigned OpNo,
4675 const MachineOperand &MO) const {
4676 if (MO.isImm())
4677 return isImmOperandLegal(InstDesc, OpNo, MO.getImm());
4678
4679 assert((MO.isTargetIndex() || MO.isFI() || MO.isGlobal()) &&
4680 "unexpected imm-like operand kind");
4681 const MCOperandInfo &OpInfo = InstDesc.operands()[OpNo];
4682 return isLiteralOperandLegal(InstDesc, OpInfo);
4683}
4684
4685bool SIInstrInfo::isLegalAV64PseudoImm(uint64_t Imm) const {
4686 // 2 32-bit inline constants packed into one.
4687 return AMDGPU::isInlinableLiteral32(Lo_32(Imm), ST.hasInv2PiInlineImm()) &&
4688 AMDGPU::isInlinableLiteral32(Hi_32(Imm), ST.hasInv2PiInlineImm());
4689}
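// Illustrative: 0x0000004000000001 is legal (both halves, 64 and 1, are
// inline constants), while 0x0000006500000000 is not (101 is not an inline
// constant).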
4690
4691bool SIInstrInfo::hasVALU32BitEncoding(unsigned Opcode) const {
4692 // GFX90A does not have V_MUL_LEGACY_F32_e32.
4693 if (Opcode == AMDGPU::V_MUL_LEGACY_F32_e64 && ST.hasGFX90AInsts())
4694 return false;
4695
4696 int Op32 = AMDGPU::getVOPe32(Opcode);
4697 if (Op32 == -1)
4698 return false;
4699
4700 return pseudoToMCOpcode(Op32) != -1;
4701}
4702
4703bool SIInstrInfo::hasModifiers(unsigned Opcode) const {
4704 // The src0_modifier operand is present on all instructions
4705 // that have modifiers.
4706
4707 return AMDGPU::hasNamedOperand(Opcode, AMDGPU::OpName::src0_modifiers);
4708}
4709
4710bool SIInstrInfo::hasModifiersSet(const MachineInstr &MI,
4711 AMDGPU::OpName OpName) const {
4712 const MachineOperand *Mods = getNamedOperand(MI, OpName);
4713 return Mods && Mods->getImm();
4714}
4715
4716bool SIInstrInfo::hasAnyModifiersSet(const MachineInstr &MI) const {
4717 return any_of(ModifierOpNames,
4718 [&](AMDGPU::OpName Name) { return hasModifiersSet(MI, Name); });
4719}
4720
4721bool SIInstrInfo::canShrink(const MachineInstr &MI,
4722 const MachineRegisterInfo &MRI) const {
4723 const MachineOperand *Src2 = getNamedOperand(MI, AMDGPU::OpName::src2);
4724 // Can't shrink instruction with three operands.
4725 if (Src2) {
4726 switch (MI.getOpcode()) {
4727 default: return false;
4728
4729 case AMDGPU::V_ADDC_U32_e64:
4730 case AMDGPU::V_SUBB_U32_e64:
4731 case AMDGPU::V_SUBBREV_U32_e64: {
4732 const MachineOperand *Src1
4733 = getNamedOperand(MI, AMDGPU::OpName::src1);
4734 if (!Src1->isReg() || !RI.isVGPR(MRI, Src1->getReg()))
4735 return false;
4736 // Additional verification is needed for sdst/src2.
4737 return true;
4738 }
4739 case AMDGPU::V_MAC_F16_e64:
4740 case AMDGPU::V_MAC_F32_e64:
4741 case AMDGPU::V_MAC_LEGACY_F32_e64:
4742 case AMDGPU::V_FMAC_F16_e64:
4743 case AMDGPU::V_FMAC_F16_t16_e64:
4744 case AMDGPU::V_FMAC_F16_fake16_e64:
4745 case AMDGPU::V_FMAC_F32_e64:
4746 case AMDGPU::V_FMAC_F64_e64:
4747 case AMDGPU::V_FMAC_LEGACY_F32_e64:
4748 if (!Src2->isReg() || !RI.isVGPR(MRI, Src2->getReg()) ||
4749 hasModifiersSet(MI, AMDGPU::OpName::src2_modifiers))
4750 return false;
4751 break;
4752
4753 case AMDGPU::V_CNDMASK_B32_e64:
4754 break;
4755 }
4756 }
4757
4758 const MachineOperand *Src1 = getNamedOperand(MI, AMDGPU::OpName::src1);
4759 if (Src1 && (!Src1->isReg() || !RI.isVGPR(MRI, Src1->getReg()) ||
4760 hasModifiersSet(MI, AMDGPU::OpName::src1_modifiers)))
4761 return false;
4762
4763 // We don't need to check src0, all input types are legal, so just make sure
4764 // src0 isn't using any modifiers.
4765 if (hasModifiersSet(MI, AMDGPU::OpName::src0_modifiers))
4766 return false;
4767
4768 // Can it be shrunk to a valid 32 bit opcode?
4769 if (!hasVALU32BitEncoding(MI.getOpcode()))
4770 return false;
4771
4772 // Check output modifiers
4773 return !hasModifiersSet(MI, AMDGPU::OpName::omod) &&
4774 !hasModifiersSet(MI, AMDGPU::OpName::clamp) &&
4775 !hasModifiersSet(MI, AMDGPU::OpName::byte_sel) &&
4776 // TODO: Can we avoid checking bound_ctrl/fi here?
4777 // They are only used by permlane*_swap special case.
4778 !hasModifiersSet(MI, AMDGPU::OpName::bound_ctrl) &&
4779 !hasModifiersSet(MI, AMDGPU::OpName::fi);
4780}
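// Illustrative: a V_ADD_F32_e64 with no source or output modifiers and a
// VGPR src1 can be shrunk to V_ADD_F32_e32; setting omod or clamp blocks the
// shrink.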
4781
4782// Set VCC operand with all flags from \p Orig, except for setting it as
4783// implicit.
4784static void copyFlagsToImplicitVCC(MachineInstr &MI,
4785 const MachineOperand &Orig) {
4786
4787 for (MachineOperand &Use : MI.implicit_operands()) {
4788 if (Use.isUse() &&
4789 (Use.getReg() == AMDGPU::VCC || Use.getReg() == AMDGPU::VCC_LO)) {
4790 Use.setIsUndef(Orig.isUndef());
4791 Use.setIsKill(Orig.isKill());
4792 return;
4793 }
4794 }
4795}
4796
4797MachineInstr *SIInstrInfo::buildShrunkInst(MachineInstr &MI,
4798 unsigned Op32) const {
4799 MachineBasicBlock *MBB = MI.getParent();
4800
4801 const MCInstrDesc &Op32Desc = get(Op32);
4802 MachineInstrBuilder Inst32 =
4803 BuildMI(*MBB, MI, MI.getDebugLoc(), Op32Desc)
4804 .setMIFlags(MI.getFlags());
4805
4806 // Add the dst operand if the 32-bit encoding also has an explicit $vdst.
4807 // For VOPC instructions, this is replaced by an implicit def of vcc.
4808
4809 // We assume the defs of the shrunk opcode are in the same order, and the
4810 // shrunk opcode loses the last def (SGPR def, in the VOP3->VOPC case).
4811 for (int I = 0, E = Op32Desc.getNumDefs(); I != E; ++I)
4812 Inst32.add(MI.getOperand(I));
4813
4814 const MachineOperand *Src2 = getNamedOperand(MI, AMDGPU::OpName::src2);
4815
4816 int Idx = MI.getNumExplicitDefs();
4817 for (const MachineOperand &Use : MI.explicit_uses()) {
4818 int OpTy = MI.getDesc().operands()[Idx++].OperandType;
4819 if (OpTy == AMDGPU::OPERAND_INPUT_MODS || OpTy == MCOI::OPERAND_IMMEDIATE)
4820 continue;
4821
4822 if (&Use == Src2) {
4823 if (AMDGPU::getNamedOperandIdx(Op32, AMDGPU::OpName::src2) == -1) {
4824 // In the case of V_CNDMASK_B32_e32, the explicit operand src2 is
4825 // replaced with an implicit read of vcc or vcc_lo. The implicit read
4826 // of vcc was already added during the initial BuildMI, but we
4827 // 1) may need to change vcc to vcc_lo to preserve the original register
4828 // 2) have to preserve the original flags.
4829 copyFlagsToImplicitVCC(*Inst32, *Src2);
4830 continue;
4831 }
4832 }
4833
4834 Inst32.add(Use);
4835 }
4836
4837 // FIXME: Losing implicit operands
4838 fixImplicitOperands(*Inst32);
4839 return Inst32;
4840}
4841
4842bool SIInstrInfo::physRegUsesConstantBus(const MachineOperand &RegOp) const {
4843 // Null is free
4844 Register Reg = RegOp.getReg();
4845 if (Reg == AMDGPU::SGPR_NULL || Reg == AMDGPU::SGPR_NULL64)
4846 return false;
4847
4848 // SGPRs use the constant bus
4849
4850 // FIXME: implicit registers that are not part of the MCInstrDesc's implicit
4851 // physical register operands should also count, except for exec.
4852 if (RegOp.isImplicit())
4853 return Reg == AMDGPU::VCC || Reg == AMDGPU::VCC_LO || Reg == AMDGPU::M0;
4854
4855 // SGPRs use the constant bus
4856 return AMDGPU::SReg_32RegClass.contains(Reg) ||
4857 AMDGPU::SReg_64RegClass.contains(Reg);
4858}
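// In short: the null register is free, implicit VCC/VCC_LO/M0 reads count,
// and any explicit 32- or 64-bit SGPR operand occupies the constant bus.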
4859
4860bool SIInstrInfo::regUsesConstantBus(const MachineOperand &RegOp,
4861 const MachineRegisterInfo &MRI) const {
4862 Register Reg = RegOp.getReg();
4863 return Reg.isVirtual() ? RI.isSGPRClass(MRI.getRegClass(Reg))
4864 : physRegUsesConstantBus(RegOp);
4865}
4866
4867bool SIInstrInfo::usesConstantBus(const MachineRegisterInfo &MRI,
4868 const MachineOperand &MO,
4869 const MCOperandInfo &OpInfo) const {
4870 // Literal constants use the constant bus.
4871 if (!MO.isReg())
4872 return !isInlineConstant(MO, OpInfo);
4873
4874 Register Reg = MO.getReg();
4875 return Reg.isVirtual() ? RI.isSGPRClass(MRI.getRegClass(Reg))
4876 : physRegUsesConstantBus(MO);
4877}
4878
4879static Register findImplicitSGPRRead(const MachineInstr &MI) {
4880 for (const MachineOperand &MO : MI.implicit_operands()) {
4881 // We only care about reads.
4882 if (MO.isDef())
4883 continue;
4884
4885 switch (MO.getReg()) {
4886 case AMDGPU::VCC:
4887 case AMDGPU::VCC_LO:
4888 case AMDGPU::VCC_HI:
4889 case AMDGPU::M0:
4890 case AMDGPU::FLAT_SCR:
4891 return MO.getReg();
4892
4893 default:
4894 break;
4895 }
4896 }
4897
4898 return Register();
4899}
4900
4901static bool shouldReadExec(const MachineInstr &MI) {
4902 if (SIInstrInfo::isVALU(MI)) {
4903 switch (MI.getOpcode()) {
4904 case AMDGPU::V_READLANE_B32:
4905 case AMDGPU::SI_RESTORE_S32_FROM_VGPR:
4906 case AMDGPU::V_WRITELANE_B32:
4907 case AMDGPU::SI_SPILL_S32_TO_VGPR:
4908 return false;
4909 }
4910
4911 return true;
4912 }
4913
4914 if (MI.isPreISelOpcode() ||
4915 SIInstrInfo::isGenericOpcode(MI.getOpcode()) ||
4916 SIInstrInfo::isSALU(MI) ||
4917 SIInstrInfo::isSMRD(MI))
4918 return false;
4919
4920 return true;
4921}
4922
4923static bool isRegOrFI(const MachineOperand &MO) {
4924 return MO.isReg() || MO.isFI();
4925}
4926
4927static bool isSubRegOf(const SIRegisterInfo &TRI,
4928 const MachineOperand &SuperVec,
4929 const MachineOperand &SubReg) {
4930 if (SubReg.getReg().isPhysical())
4931 return TRI.isSubRegister(SuperVec.getReg(), SubReg.getReg());
4932
4933 return SubReg.getSubReg() != AMDGPU::NoSubRegister &&
4934 SubReg.getReg() == SuperVec.getReg();
4935}
4936
4937// Verify the illegal copy from vector register to SGPR for generic opcode COPY
4938bool SIInstrInfo::verifyCopy(const MachineInstr &MI,
4939 const MachineRegisterInfo &MRI,
4940 StringRef &ErrInfo) const {
4941 Register DstReg = MI.getOperand(0).getReg();
4942 Register SrcReg = MI.getOperand(1).getReg();
4943 // This is a check for copy from vector register to SGPR
4944 if (RI.isVectorRegister(MRI, SrcReg) && RI.isSGPRReg(MRI, DstReg)) {
4945 ErrInfo = "illegal copy from vector register to SGPR";
4946 return false;
4947 }
4948 return true;
4949}
4950
4951bool SIInstrInfo::verifyInstruction(const MachineInstr &MI,
4952 StringRef &ErrInfo) const {
4953 uint16_t Opcode = MI.getOpcode();
4954 const MachineFunction *MF = MI.getParent()->getParent();
4955 const MachineRegisterInfo &MRI = MF->getRegInfo();
4956
4957 // FIXME: At this point the COPY verify is done only for non-ssa forms.
4958 // Find a better property to recognize the point where instruction selection
4959 // is just done.
4960 // We can only enforce this check after SIFixSGPRCopies pass so that the
4961 // illegal copies are legalized and thereafter we don't expect a pass
4962 // inserting similar copies.
4963 if (!MRI.isSSA() && MI.isCopy())
4964 return verifyCopy(MI, MRI, ErrInfo);
4965
4966 if (SIInstrInfo::isGenericOpcode(Opcode))
4967 return true;
4968
4969 int Src0Idx = AMDGPU::getNamedOperandIdx(Opcode, AMDGPU::OpName::src0);
4970 int Src1Idx = AMDGPU::getNamedOperandIdx(Opcode, AMDGPU::OpName::src1);
4971 int Src2Idx = AMDGPU::getNamedOperandIdx(Opcode, AMDGPU::OpName::src2);
4972 int Src3Idx = -1;
4973 if (Src0Idx == -1) {
4974 // VOPD V_DUAL_* instructions use different operand names.
4975 Src0Idx = AMDGPU::getNamedOperandIdx(Opcode, AMDGPU::OpName::src0X);
4976 Src1Idx = AMDGPU::getNamedOperandIdx(Opcode, AMDGPU::OpName::vsrc1X);
4977 Src2Idx = AMDGPU::getNamedOperandIdx(Opcode, AMDGPU::OpName::src0Y);
4978 Src3Idx = AMDGPU::getNamedOperandIdx(Opcode, AMDGPU::OpName::vsrc1Y);
4979 }
4980
4981 // Make sure the number of operands is correct.
4982 const MCInstrDesc &Desc = get(Opcode);
4983 if (!Desc.isVariadic() &&
4984 Desc.getNumOperands() != MI.getNumExplicitOperands()) {
4985 ErrInfo = "Instruction has wrong number of operands.";
4986 return false;
4987 }
4988
4989 if (MI.isInlineAsm()) {
4990 // Verify register classes for inlineasm constraints.
4991 for (unsigned I = InlineAsm::MIOp_FirstOperand, E = MI.getNumOperands();
4992 I != E; ++I) {
4993 const TargetRegisterClass *RC = MI.getRegClassConstraint(I, this, &RI);
4994 if (!RC)
4995 continue;
4996
4997 const MachineOperand &Op = MI.getOperand(I);
4998 if (!Op.isReg())
4999 continue;
5000
5001 Register Reg = Op.getReg();
5002 if (!Reg.isVirtual() && !RC->contains(Reg)) {
5003 ErrInfo = "inlineasm operand has incorrect register class.";
5004 return false;
5005 }
5006 }
5007
5008 return true;
5009 }
5010
5011 if (isImage(MI) && MI.memoperands_empty() && MI.mayLoadOrStore()) {
5012 ErrInfo = "missing memory operand from image instruction.";
5013 return false;
5014 }
5015
5016 // Make sure the register classes are correct.
5017 for (int i = 0, e = Desc.getNumOperands(); i != e; ++i) {
5018 const MachineOperand &MO = MI.getOperand(i);
5019 if (MO.isFPImm()) {
5020 ErrInfo = "FPImm Machine Operands are not supported. ISel should bitcast "
5021 "all fp values to integers.";
5022 return false;
5023 }
5024
5025 const MCOperandInfo &OpInfo = Desc.operands()[i];
5026 int16_t RegClass = getOpRegClassID(OpInfo);
5027
5028 switch (OpInfo.OperandType) {
5029 case MCOI::OPERAND_REGISTER:
5030 if (MI.getOperand(i).isImm() || MI.getOperand(i).isGlobal()) {
5031 ErrInfo = "Illegal immediate value for operand.";
5032 return false;
5033 }
5034 break;
5047 break;
5049 break;
5050 break;
5064 if (!MO.isReg() && (!MO.isImm() || !isInlineConstant(MI, i))) {
5065 ErrInfo = "Illegal immediate value for operand.";
5066 return false;
5067 }
5068 break;
5069 }
5071 if (!MI.getOperand(i).isImm() || !isInlineConstant(MI, i)) {
5072 ErrInfo = "Expected inline constant for operand.";
5073 return false;
5074 }
5075 break;
5079 break;
5084 // Check if this operand is an immediate.
5085 // FrameIndex operands will be replaced by immediates, so they are
5086 // allowed.
5087 if (!MI.getOperand(i).isImm() && !MI.getOperand(i).isFI()) {
5088 ErrInfo = "Expected immediate, but got non-immediate";
5089 return false;
5090 }
5091 break;
5095 break;
5096 default:
5097 if (OpInfo.isGenericType())
5098 continue;
5099 break;
5100 }
5101
5102 if (!MO.isReg())
5103 continue;
5104 Register Reg = MO.getReg();
5105 if (!Reg)
5106 continue;
5107
5108 // FIXME: Ideally we would have separate instruction definitions with the
5109 // aligned register constraint.
5110 // FIXME: We do not verify inline asm operands, but custom inline asm
5111 // verification is broken anyway
5112 if (ST.needsAlignedVGPRs() && Opcode != AMDGPU::AV_MOV_B64_IMM_PSEUDO) {
5113 const TargetRegisterClass *RC = RI.getRegClassForReg(MRI, Reg);
5114 if (RI.hasVectorRegisters(RC) && MO.getSubReg()) {
5115 if (const TargetRegisterClass *SubRC =
5116 RI.getSubRegisterClass(RC, MO.getSubReg())) {
5117 RC = RI.getCompatibleSubRegClass(RC, SubRC, MO.getSubReg());
5118 if (RC)
5119 RC = SubRC;
5120 }
5121 }
5122
5123 // Check that this is the aligned version of the class.
5124 if (!RC || !RI.isProperlyAlignedRC(*RC)) {
5125 ErrInfo = "Subtarget requires even aligned vector registers";
5126 return false;
5127 }
5128 }
5129
5130 if (RegClass != -1) {
5131 if (Reg.isVirtual())
5132 continue;
5133
5134 const TargetRegisterClass *RC = RI.getRegClass(RegClass);
5135 if (!RC->contains(Reg)) {
5136 ErrInfo = "Operand has incorrect register class.";
5137 return false;
5138 }
5139 }
5140 }
5141
5142 // Verify SDWA
5143 if (isSDWA(MI)) {
5144 if (!ST.hasSDWA()) {
5145 ErrInfo = "SDWA is not supported on this target";
5146 return false;
5147 }
5148
5149 for (auto Op : {AMDGPU::OpName::src0_sel, AMDGPU::OpName::src1_sel,
5150 AMDGPU::OpName::dst_sel}) {
5151 const MachineOperand *MO = getNamedOperand(MI, Op);
5152 if (!MO)
5153 continue;
5154 int64_t Imm = MO->getImm();
5155 if (Imm < 0 || Imm > AMDGPU::SDWA::SdwaSel::DWORD) {
5156 ErrInfo = "Invalid SDWA selection";
5157 return false;
5158 }
5159 }
5160
5161 int DstIdx = AMDGPU::getNamedOperandIdx(Opcode, AMDGPU::OpName::vdst);
5162
5163 for (int OpIdx : {DstIdx, Src0Idx, Src1Idx, Src2Idx}) {
5164 if (OpIdx == -1)
5165 continue;
5166 const MachineOperand &MO = MI.getOperand(OpIdx);
5167
5168 if (!ST.hasSDWAScalar()) {
5169 // Only VGPRS on VI
5170 if (!MO.isReg() || !RI.hasVGPRs(RI.getRegClassForReg(MRI, MO.getReg()))) {
5171 ErrInfo = "Only VGPRs allowed as operands in SDWA instructions on VI";
5172 return false;
5173 }
5174 } else {
5175 // No immediates on GFX9
5176 if (!MO.isReg()) {
5177 ErrInfo =
5178 "Only reg allowed as operands in SDWA instructions on GFX9+";
5179 return false;
5180 }
5181 }
5182 }
5183
5184 if (!ST.hasSDWAOmod()) {
5185 // No omod allowed on VI
5186 const MachineOperand *OMod = getNamedOperand(MI, AMDGPU::OpName::omod);
5187 if (OMod != nullptr &&
5188 (!OMod->isImm() || OMod->getImm() != 0)) {
5189 ErrInfo = "OMod not allowed in SDWA instructions on VI";
5190 return false;
5191 }
5192 }
5193
5194 if (Opcode == AMDGPU::V_CVT_F32_FP8_sdwa ||
5195 Opcode == AMDGPU::V_CVT_F32_BF8_sdwa ||
5196 Opcode == AMDGPU::V_CVT_PK_F32_FP8_sdwa ||
5197 Opcode == AMDGPU::V_CVT_PK_F32_BF8_sdwa) {
5198 const MachineOperand *Src0ModsMO =
5199 getNamedOperand(MI, AMDGPU::OpName::src0_modifiers);
5200 unsigned Mods = Src0ModsMO->getImm();
5201 if (Mods & SISrcMods::ABS || Mods & SISrcMods::NEG ||
5202 Mods & SISrcMods::SEXT) {
5203 ErrInfo = "sext, abs and neg are not allowed on this instruction";
5204 return false;
5205 }
5206 }
5207
5208 uint16_t BasicOpcode = AMDGPU::getBasicFromSDWAOp(Opcode);
5209 if (isVOPC(BasicOpcode)) {
5210 if (!ST.hasSDWASdst() && DstIdx != -1) {
5211 // Only vcc allowed as dst on VI for VOPC
5212 const MachineOperand &Dst = MI.getOperand(DstIdx);
5213 if (!Dst.isReg() || Dst.getReg() != AMDGPU::VCC) {
5214 ErrInfo = "Only VCC allowed as dst in SDWA instructions on VI";
5215 return false;
5216 }
5217 } else if (!ST.hasSDWAOutModsVOPC()) {
5218 // No clamp allowed on GFX9 for VOPC
5219 const MachineOperand *Clamp = getNamedOperand(MI, AMDGPU::OpName::clamp);
5220 if (Clamp && (!Clamp->isImm() || Clamp->getImm() != 0)) {
5221 ErrInfo = "Clamp not allowed in VOPC SDWA instructions on VI";
5222 return false;
5223 }
5224
5225 // No omod allowed on GFX9 for VOPC
5226 const MachineOperand *OMod = getNamedOperand(MI, AMDGPU::OpName::omod);
5227 if (OMod && (!OMod->isImm() || OMod->getImm() != 0)) {
5228 ErrInfo = "OMod not allowed in VOPC SDWA instructions on VI";
5229 return false;
5230 }
5231 }
5232 }
5233
5234 const MachineOperand *DstUnused = getNamedOperand(MI, AMDGPU::OpName::dst_unused);
5235 if (DstUnused && DstUnused->isImm() &&
5236 DstUnused->getImm() == AMDGPU::SDWA::UNUSED_PRESERVE) {
5237 const MachineOperand &Dst = MI.getOperand(DstIdx);
5238 if (!Dst.isReg() || !Dst.isTied()) {
5239 ErrInfo = "Dst register should have tied register";
5240 return false;
5241 }
5242
5243 const MachineOperand &TiedMO =
5244 MI.getOperand(MI.findTiedOperandIdx(DstIdx));
5245 if (!TiedMO.isReg() || !TiedMO.isImplicit() || !TiedMO.isUse()) {
5246 ErrInfo =
5247 "Dst register should be tied to implicit use of preserved register";
5248 return false;
5249 }
5250 if (TiedMO.getReg().isPhysical() && Dst.getReg() != TiedMO.getReg()) {
5251 ErrInfo = "Dst register should use same physical register as preserved";
5252 return false;
5253 }
5254 }
5255 }
5256
5257 // Verify MIMG / VIMAGE / VSAMPLE
5258 if (isImage(Opcode) && !MI.mayStore()) {
5259 // Ensure that the return type used is large enough for all the options
5260 // being used TFE/LWE require an extra result register.
5261 const MachineOperand *DMask = getNamedOperand(MI, AMDGPU::OpName::dmask);
5262 if (DMask) {
5263 uint64_t DMaskImm = DMask->getImm();
5264 uint32_t RegCount = isGather4(Opcode) ? 4 : llvm::popcount(DMaskImm);
5265 const MachineOperand *TFE = getNamedOperand(MI, AMDGPU::OpName::tfe);
5266 const MachineOperand *LWE = getNamedOperand(MI, AMDGPU::OpName::lwe);
5267 const MachineOperand *D16 = getNamedOperand(MI, AMDGPU::OpName::d16);
5268
5269 // Adjust for packed 16 bit values
5270 if (D16 && D16->getImm() && !ST.hasUnpackedD16VMem())
5271 RegCount = divideCeil(RegCount, 2);
5272
5273 // Adjust if using LWE or TFE
5274 if ((LWE && LWE->getImm()) || (TFE && TFE->getImm()))
5275 RegCount += 1;
5276
5277 const uint32_t DstIdx =
5278 AMDGPU::getNamedOperandIdx(Opcode, AMDGPU::OpName::vdata);
5279 const MachineOperand &Dst = MI.getOperand(DstIdx);
5280 if (Dst.isReg()) {
5281 const TargetRegisterClass *DstRC = getOpRegClass(MI, DstIdx);
5282 uint32_t DstSize = RI.getRegSizeInBits(*DstRC) / 32;
5283 if (RegCount > DstSize) {
5284 ErrInfo = "Image instruction returns too many registers for dst "
5285 "register class";
5286 return false;
5287 }
5288 }
5289 }
5290 }
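// Worked example of the sizing rule above (illustrative sketch): a load with
// dmask = 0b1011 enables three channels, so RegCount starts at 3; with packed
// D16 it becomes ceil(3/2) = 2, and an enabled TFE or LWE bit adds one more
// result register. Thus dmask = 0xf plus TFE needs five 32-bit registers, so a
// 128-bit destination class would be flagged here and a 160-bit class is
// required.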
5291
5292 // Verify VOP*. Ignore multiple sgpr operands on writelane.
5293 if (isVALU(MI) && Desc.getOpcode() != AMDGPU::V_WRITELANE_B32) {
5294 unsigned ConstantBusCount = 0;
5295 bool UsesLiteral = false;
5296 const MachineOperand *LiteralVal = nullptr;
5297
5298 int ImmIdx = AMDGPU::getNamedOperandIdx(Opcode, AMDGPU::OpName::imm);
5299 if (ImmIdx != -1) {
5300 ++ConstantBusCount;
5301 UsesLiteral = true;
5302 LiteralVal = &MI.getOperand(ImmIdx);
5303 }
5304
5305 SmallVector<Register, 2> SGPRsUsed;
5306 Register SGPRUsed;
5307
5308 // Only look at the true operands. Only a real operand can use the constant
5309 // bus, and we don't want to check pseudo-operands like the source modifier
5310 // flags.
5311 for (int OpIdx : {Src0Idx, Src1Idx, Src2Idx, Src3Idx}) {
5312 if (OpIdx == -1)
5313 continue;
5314 const MachineOperand &MO = MI.getOperand(OpIdx);
5315 if (usesConstantBus(MRI, MO, MI.getDesc().operands()[OpIdx])) {
5316 if (MO.isReg()) {
5317 SGPRUsed = MO.getReg();
5318 if (!llvm::is_contained(SGPRsUsed, SGPRUsed)) {
5319 ++ConstantBusCount;
5320 SGPRsUsed.push_back(SGPRUsed);
5321 }
5322 } else if (!MO.isFI()) { // Treat FI like a register.
5323 if (!UsesLiteral) {
5324 ++ConstantBusCount;
5325 UsesLiteral = true;
5326 LiteralVal = &MO;
5327 } else if (!MO.isIdenticalTo(*LiteralVal)) {
5328 assert(isVOP2(MI) || isVOP3(MI));
5329 ErrInfo = "VOP2/VOP3 instruction uses more than one literal";
5330 return false;
5331 }
5332 }
5333 }
5334 }
5335
5336 SGPRUsed = findImplicitSGPRRead(MI);
5337 if (SGPRUsed) {
5338 // Implicit uses may safely overlap true operands
5339 if (llvm::all_of(SGPRsUsed, [this, SGPRUsed](unsigned SGPR) {
5340 return !RI.regsOverlap(SGPRUsed, SGPR);
5341 })) {
5342 ++ConstantBusCount;
5343 SGPRsUsed.push_back(SGPRUsed);
5344 }
5345 }
5346
5347 // v_writelane_b32 is an exception from constant bus restriction:
5348 // vsrc0 can be sgpr, const or m0 and lane select sgpr, m0 or inline-const
5349 if (ConstantBusCount > ST.getConstantBusLimit(Opcode) &&
5350 Opcode != AMDGPU::V_WRITELANE_B32) {
5351 ErrInfo = "VOP* instruction violates constant bus restriction";
5352 return false;
5353 }
5354
5355 if (isVOP3(MI) && UsesLiteral && !ST.hasVOP3Literal()) {
5356 ErrInfo = "VOP3 instruction uses literal";
5357 return false;
5358 }
5359 }
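// Illustration of the constant bus accounting above (sketch): each unique
// SGPR, any literal, and an implicit SGPR read (e.g. VCC for V_ADDC_U32) take
// one constant bus slot. On subtargets where the limit is 1 (typically
// pre-GFX10):
//   V_ADD_F32_e32 v0, s1, v2   ; one SGPR          -> OK
//   V_ADD_F32_e64 v0, s1, s2   ; two unique SGPRs  -> flagged here
// With a limit of 2 (GFX10+) the second form verifies, but a VOP3 that also
// needs a literal is still rejected unless the subtarget has VOP3 literals.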
5360
5361 // Special case for writelane - this can break the multiple constant bus rule,
5362 // but still can't use more than one SGPR register
5363 if (Desc.getOpcode() == AMDGPU::V_WRITELANE_B32) {
5364 unsigned SGPRCount = 0;
5365 Register SGPRUsed;
5366
5367 for (int OpIdx : {Src0Idx, Src1Idx}) {
5368 if (OpIdx == -1)
5369 break;
5370
5371 const MachineOperand &MO = MI.getOperand(OpIdx);
5372
5373 if (usesConstantBus(MRI, MO, MI.getDesc().operands()[OpIdx])) {
5374 if (MO.isReg() && MO.getReg() != AMDGPU::M0) {
5375 if (MO.getReg() != SGPRUsed)
5376 ++SGPRCount;
5377 SGPRUsed = MO.getReg();
5378 }
5379 }
5380 if (SGPRCount > ST.getConstantBusLimit(Opcode)) {
5381 ErrInfo = "WRITELANE instruction violates constant bus restriction";
5382 return false;
5383 }
5384 }
5385 }
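// Example for the V_WRITELANE_B32 rule above (sketch): the value and the lane
// select may each be an SGPR, but on subtargets with a constant bus limit of 1
// they must not name two different SGPRs (M0 is not counted):
//   v_writelane_b32 v0, s3, s3   ; same SGPR twice     -> OK
//   v_writelane_b32 v0, s3, m0   ; M0 is exempt        -> OK
//   v_writelane_b32 v0, s3, s4   ; two distinct SGPRs  -> flagged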
5386
5387 // Verify misc. restrictions on specific instructions.
5388 if (Desc.getOpcode() == AMDGPU::V_DIV_SCALE_F32_e64 ||
5389 Desc.getOpcode() == AMDGPU::V_DIV_SCALE_F64_e64) {
5390 const MachineOperand &Src0 = MI.getOperand(Src0Idx);
5391 const MachineOperand &Src1 = MI.getOperand(Src1Idx);
5392 const MachineOperand &Src2 = MI.getOperand(Src2Idx);
5393 if (Src0.isReg() && Src1.isReg() && Src2.isReg()) {
5394 if (!compareMachineOp(Src0, Src1) &&
5395 !compareMachineOp(Src0, Src2)) {
5396 ErrInfo = "v_div_scale_{f32|f64} require src0 = src1 or src2";
5397 return false;
5398 }
5399 }
5400 if ((getNamedOperand(MI, AMDGPU::OpName::src0_modifiers)->getImm() &
5401 SISrcMods::ABS) ||
5402 (getNamedOperand(MI, AMDGPU::OpName::src1_modifiers)->getImm() &
5403 SISrcMods::ABS) ||
5404 (getNamedOperand(MI, AMDGPU::OpName::src2_modifiers)->getImm() &
5405 SISrcMods::ABS)) {
5406 ErrInfo = "ABS not allowed in VOP3B instructions";
5407 return false;
5408 }
5409 }
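// Example for the V_DIV_SCALE constraint above (sketch): when all three
// sources are registers, src0 must repeat either src1 or src2:
//   v_div_scale_f32 v0, vcc, v1, v1, v2   ; src0 == src1          -> OK
//   v_div_scale_f32 v0, vcc, v1, v2, v3   ; src0 matches neither  -> flagged
// ABS source modifiers are likewise rejected on these VOP3B forms.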
5410
5411 if (isSOP2(MI) || isSOPC(MI)) {
5412 const MachineOperand &Src0 = MI.getOperand(Src0Idx);
5413 const MachineOperand &Src1 = MI.getOperand(Src1Idx);
5414
5415 if (!isRegOrFI(Src0) && !isRegOrFI(Src1) &&
5416 !isInlineConstant(Src0, Desc.operands()[Src0Idx]) &&
5417 !isInlineConstant(Src1, Desc.operands()[Src1Idx]) &&
5418 !Src0.isIdenticalTo(Src1)) {
5419 ErrInfo = "SOP2/SOPC instruction requires too many immediate constants";
5420 return false;
5421 }
5422 }
5423
5424 if (isSOPK(MI)) {
5425 const auto *Op = getNamedOperand(MI, AMDGPU::OpName::simm16);
5426 if (Desc.isBranch()) {
5427 if (!Op->isMBB()) {
5428 ErrInfo = "invalid branch target for SOPK instruction";
5429 return false;
5430 }
5431 } else {
5432 uint64_t Imm = Op->getImm();
5433 if (sopkIsZext(Opcode)) {
5434 if (!isUInt<16>(Imm)) {
5435 ErrInfo = "invalid immediate for SOPK instruction";
5436 return false;
5437 }
5438 } else {
5439 if (!isInt<16>(Imm)) {
5440 ErrInfo = "invalid immediate for SOPK instruction";
5441 return false;
5442 }
5443 }
5444 }
5445 }
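// Example of the SOPK immediate ranges checked above (sketch): opcodes for
// which sopkIsZext() returns true (e.g. the unsigned s_cmpk compares) treat
// simm16 as unsigned, the rest as signed:
//   s_movk_i32    s0, -4        ; fits in int16_t    -> OK
//   s_cmpk_eq_u32 s0, 0xfffe    ; fits in uint16_t   -> OK
//   s_movk_i32    s0, 0x12345   ; needs > 16 bits    -> flagged
// Branching SOPK forms must instead carry a basic-block operand.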
5446
5447 if (Desc.getOpcode() == AMDGPU::V_MOVRELS_B32_e32 ||
5448 Desc.getOpcode() == AMDGPU::V_MOVRELS_B32_e64 ||
5449 Desc.getOpcode() == AMDGPU::V_MOVRELD_B32_e32 ||
5450 Desc.getOpcode() == AMDGPU::V_MOVRELD_B32_e64) {
5451 const bool IsDst = Desc.getOpcode() == AMDGPU::V_MOVRELD_B32_e32 ||
5452 Desc.getOpcode() == AMDGPU::V_MOVRELD_B32_e64;
5453
5454 const unsigned StaticNumOps =
5455 Desc.getNumOperands() + Desc.implicit_uses().size();
5456 const unsigned NumImplicitOps = IsDst ? 2 : 1;
5457
5458 // Allow additional implicit operands. This allows a fixup done by the post
5459 // RA scheduler where the main implicit operand is killed and implicit-defs
5460 // are added for sub-registers that remain live after this instruction.
5461 if (MI.getNumOperands() < StaticNumOps + NumImplicitOps) {
5462 ErrInfo = "missing implicit register operands";
5463 return false;
5464 }
5465
5466 const MachineOperand *Dst = getNamedOperand(MI, AMDGPU::OpName::vdst);
5467 if (IsDst) {
5468 if (!Dst->isUse()) {
5469 ErrInfo = "v_movreld_b32 vdst should be a use operand";
5470 return false;
5471 }
5472
5473 unsigned UseOpIdx;
5474 if (!MI.isRegTiedToUseOperand(StaticNumOps, &UseOpIdx) ||
5475 UseOpIdx != StaticNumOps + 1) {
5476 ErrInfo = "movrel implicit operands should be tied";
5477 return false;
5478 }
5479 }
5480
5481 const MachineOperand &Src0 = MI.getOperand(Src0Idx);
5482 const MachineOperand &ImpUse
5483 = MI.getOperand(StaticNumOps + NumImplicitOps - 1);
5484 if (!ImpUse.isReg() || !ImpUse.isUse() ||
5485 !isSubRegOf(RI, ImpUse, IsDst ? *Dst : Src0)) {
5486 ErrInfo = "src0 should be subreg of implicit vector use";
5487 return false;
5488 }
5489 }
5490
5491 // Make sure we aren't losing exec uses in the td files. This mostly requires
5492 // being careful when using let Uses to try to add other use registers.
5493 if (shouldReadExec(MI)) {
5494 if (!MI.hasRegisterImplicitUseOperand(AMDGPU::EXEC)) {
5495 ErrInfo = "VALU instruction does not implicitly read exec mask";
5496 return false;
5497 }
5498 }
5499
5500 if (isSMRD(MI)) {
5501 if (MI.mayStore() &&
5502 ST.getGeneration() == AMDGPUSubtarget::VOLCANIC_ISLANDS) {
5503 // The register offset form of scalar stores may only use m0 as the
5504 // soffset register.
5505 const MachineOperand *Soff = getNamedOperand(MI, AMDGPU::OpName::soffset);
5506 if (Soff && Soff->getReg() != AMDGPU::M0) {
5507 ErrInfo = "scalar stores must use m0 as offset register";
5508 return false;
5509 }
5510 }
5511 }
5512
5513 if (isFLAT(MI) && !ST.hasFlatInstOffsets()) {
5514 const MachineOperand *Offset = getNamedOperand(MI, AMDGPU::OpName::offset);
5515 if (Offset->getImm() != 0) {
5516 ErrInfo = "subtarget does not support offsets in flat instructions";
5517 return false;
5518 }
5519 }
5520
5521 if (isDS(MI) && !ST.hasGDS()) {
5522 const MachineOperand *GDSOp = getNamedOperand(MI, AMDGPU::OpName::gds);
5523 if (GDSOp && GDSOp->getImm() != 0) {
5524 ErrInfo = "GDS is not supported on this subtarget";
5525 return false;
5526 }
5527 }
5528
5529 if (isImage(MI)) {
5530 const MachineOperand *DimOp = getNamedOperand(MI, AMDGPU::OpName::dim);
5531 if (DimOp) {
5532 int VAddr0Idx = AMDGPU::getNamedOperandIdx(Opcode,
5533 AMDGPU::OpName::vaddr0);
5534 AMDGPU::OpName RSrcOpName =
5535 isMIMG(MI) ? AMDGPU::OpName::srsrc : AMDGPU::OpName::rsrc;
5536 int RsrcIdx = AMDGPU::getNamedOperandIdx(Opcode, RSrcOpName);
5537 const AMDGPU::MIMGInfo *Info = AMDGPU::getMIMGInfo(Opcode);
5538 const AMDGPU::MIMGBaseOpcodeInfo *BaseOpcode =
5539 AMDGPU::getMIMGBaseOpcodeInfo(Info->BaseOpcode);
5540 const AMDGPU::MIMGDimInfo *Dim =
 5542 AMDGPU::getMIMGDimInfoByEncoding(DimOp->getImm());
5543 if (!Dim) {
5544 ErrInfo = "dim is out of range";
5545 return false;
5546 }
5547
5548 bool IsA16 = false;
5549 if (ST.hasR128A16()) {
5550 const MachineOperand *R128A16 = getNamedOperand(MI, AMDGPU::OpName::r128);
5551 IsA16 = R128A16->getImm() != 0;
5552 } else if (ST.hasA16()) {
5553 const MachineOperand *A16 = getNamedOperand(MI, AMDGPU::OpName::a16);
5554 IsA16 = A16->getImm() != 0;
5555 }
5556
5557 bool IsNSA = RsrcIdx - VAddr0Idx > 1;
5558
5559 unsigned AddrWords =
5560 AMDGPU::getAddrSizeMIMGOp(BaseOpcode, Dim, IsA16, ST.hasG16());
5561
5562 unsigned VAddrWords;
5563 if (IsNSA) {
5564 VAddrWords = RsrcIdx - VAddr0Idx;
5565 if (ST.hasPartialNSAEncoding() &&
5566 AddrWords > ST.getNSAMaxSize(isVSAMPLE(MI))) {
5567 unsigned LastVAddrIdx = RsrcIdx - 1;
5568 VAddrWords += getOpSize(MI, LastVAddrIdx) / 4 - 1;
5569 }
5570 } else {
5571 VAddrWords = getOpSize(MI, VAddr0Idx) / 4;
5572 if (AddrWords > 12)
5573 AddrWords = 16;
5574 }
5575
5576 if (VAddrWords != AddrWords) {
5577 LLVM_DEBUG(dbgs() << "bad vaddr size, expected " << AddrWords
5578 << " but got " << VAddrWords << "\n");
5579 ErrInfo = "bad vaddr size";
5580 return false;
5581 }
5582 }
5583 }
5584
5585 const MachineOperand *DppCt = getNamedOperand(MI, AMDGPU::OpName::dpp_ctrl);
5586 if (DppCt) {
5587 using namespace AMDGPU::DPP;
5588
5589 unsigned DC = DppCt->getImm();
5590 if (DC == DppCtrl::DPP_UNUSED1 || DC == DppCtrl::DPP_UNUSED2 ||
5591 DC == DppCtrl::DPP_UNUSED3 || DC > DppCtrl::DPP_LAST ||
5592 (DC >= DppCtrl::DPP_UNUSED4_FIRST && DC <= DppCtrl::DPP_UNUSED4_LAST) ||
5593 (DC >= DppCtrl::DPP_UNUSED5_FIRST && DC <= DppCtrl::DPP_UNUSED5_LAST) ||
5594 (DC >= DppCtrl::DPP_UNUSED6_FIRST && DC <= DppCtrl::DPP_UNUSED6_LAST) ||
5595 (DC >= DppCtrl::DPP_UNUSED7_FIRST && DC <= DppCtrl::DPP_UNUSED7_LAST) ||
5596 (DC >= DppCtrl::DPP_UNUSED8_FIRST && DC <= DppCtrl::DPP_UNUSED8_LAST)) {
5597 ErrInfo = "Invalid dpp_ctrl value";
5598 return false;
5599 }
5600 if (DC >= DppCtrl::WAVE_SHL1 && DC <= DppCtrl::WAVE_ROR1 &&
5601 ST.getGeneration() >= AMDGPUSubtarget::GFX10) {
5602 ErrInfo = "Invalid dpp_ctrl value: "
5603 "wavefront shifts are not supported on GFX10+";
5604 return false;
5605 }
5606 if (DC >= DppCtrl::BCAST15 && DC <= DppCtrl::BCAST31 &&
5607 ST.getGeneration() >= AMDGPUSubtarget::GFX10) {
5608 ErrInfo = "Invalid dpp_ctrl value: "
5609 "broadcasts are not supported on GFX10+";
5610 return false;
5611 }
5612 if (DC >= DppCtrl::ROW_SHARE_FIRST && DC <= DppCtrl::ROW_XMASK_LAST &&
5613 ST.getGeneration() < AMDGPUSubtarget::GFX10) {
5614 if (DC >= DppCtrl::ROW_NEWBCAST_FIRST &&
5615 DC <= DppCtrl::ROW_NEWBCAST_LAST &&
5616 !ST.hasGFX90AInsts()) {
5617 ErrInfo = "Invalid dpp_ctrl value: "
5618 "row_newbroadcast/row_share is not supported before "
5619 "GFX90A/GFX10";
5620 return false;
5621 }
5622 if (DC > DppCtrl::ROW_NEWBCAST_LAST || !ST.hasGFX90AInsts()) {
5623 ErrInfo = "Invalid dpp_ctrl value: "
5624 "row_share and row_xmask are not supported before GFX10";
5625 return false;
5626 }
5627 }
5628
5629 if (Opcode != AMDGPU::V_MOV_B64_DPP_PSEUDO &&
5631 AMDGPU::isDPALU_DPP(Desc, *this, ST)) {
5632 ErrInfo = "Invalid dpp_ctrl value: "
5633 "DP ALU dpp only support row_newbcast";
5634 return false;
5635 }
5636 }
5637
5638 if ((MI.mayStore() || MI.mayLoad()) && !isVGPRSpill(MI)) {
5639 const MachineOperand *Dst = getNamedOperand(MI, AMDGPU::OpName::vdst);
5640 AMDGPU::OpName DataName =
5641 isDS(Opcode) ? AMDGPU::OpName::data0 : AMDGPU::OpName::vdata;
5642 const MachineOperand *Data = getNamedOperand(MI, DataName);
5643 const MachineOperand *Data2 = getNamedOperand(MI, AMDGPU::OpName::data1);
5644 if (Data && !Data->isReg())
5645 Data = nullptr;
5646
5647 if (ST.hasGFX90AInsts()) {
5648 if (Dst && Data && !Dst->isTied() && !Data->isTied() &&
5649 (RI.isAGPR(MRI, Dst->getReg()) != RI.isAGPR(MRI, Data->getReg()))) {
5650 ErrInfo = "Invalid register class: "
5651 "vdata and vdst should be both VGPR or AGPR";
5652 return false;
5653 }
5654 if (Data && Data2 &&
5655 (RI.isAGPR(MRI, Data->getReg()) != RI.isAGPR(MRI, Data2->getReg()))) {
5656 ErrInfo = "Invalid register class: "
5657 "both data operands should be VGPR or AGPR";
5658 return false;
5659 }
5660 } else {
5661 if ((Dst && RI.isAGPR(MRI, Dst->getReg())) ||
5662 (Data && RI.isAGPR(MRI, Data->getReg())) ||
5663 (Data2 && RI.isAGPR(MRI, Data2->getReg()))) {
5664 ErrInfo = "Invalid register class: "
5665 "agpr loads and stores not supported on this GPU";
5666 return false;
5667 }
5668 }
5669 }
5670
5671 if (ST.needsAlignedVGPRs()) {
5672 const auto isAlignedReg = [&MI, &MRI, this](AMDGPU::OpName OpName) -> bool {
 5673 const MachineOperand *Op = getNamedOperand(MI, OpName);
 5674 if (!Op)
5675 return true;
5676 Register Reg = Op->getReg();
5677 if (Reg.isPhysical())
5678 return !(RI.getHWRegIndex(Reg) & 1);
5679 const TargetRegisterClass &RC = *MRI.getRegClass(Reg);
5680 return RI.getRegSizeInBits(RC) > 32 && RI.isProperlyAlignedRC(RC) &&
5681 !(RI.getChannelFromSubReg(Op->getSubReg()) & 1);
5682 };
5683
5684 if (Opcode == AMDGPU::DS_GWS_INIT || Opcode == AMDGPU::DS_GWS_SEMA_BR ||
5685 Opcode == AMDGPU::DS_GWS_BARRIER) {
5686
5687 if (!isAlignedReg(AMDGPU::OpName::data0)) {
5688 ErrInfo = "Subtarget requires even aligned vector registers "
5689 "for DS_GWS instructions";
5690 return false;
5691 }
5692 }
5693
5694 if (isMIMG(MI)) {
5695 if (!isAlignedReg(AMDGPU::OpName::vaddr)) {
5696 ErrInfo = "Subtarget requires even aligned vector registers "
5697 "for vaddr operand of image instructions";
5698 return false;
5699 }
5700 }
5701 }
5702
5703 if (Opcode == AMDGPU::V_ACCVGPR_WRITE_B32_e64 && !ST.hasGFX90AInsts()) {
5704 const MachineOperand *Src = getNamedOperand(MI, AMDGPU::OpName::src0);
5705 if (Src->isReg() && RI.isSGPRReg(MRI, Src->getReg())) {
5706 ErrInfo = "Invalid register class: "
5707 "v_accvgpr_write with an SGPR is not supported on this GPU";
5708 return false;
5709 }
5710 }
5711
5712 if (Desc.getOpcode() == AMDGPU::G_AMDGPU_WAVE_ADDRESS) {
5713 const MachineOperand &SrcOp = MI.getOperand(1);
5714 if (!SrcOp.isReg() || SrcOp.getReg().isVirtual()) {
5715 ErrInfo = "pseudo expects only physical SGPRs";
5716 return false;
5717 }
5718 }
5719
5720 if (const MachineOperand *CPol = getNamedOperand(MI, AMDGPU::OpName::cpol)) {
5721 if (CPol->getImm() & AMDGPU::CPol::SCAL) {
5722 if (!ST.hasScaleOffset()) {
5723 ErrInfo = "Subtarget does not support offset scaling";
5724 return false;
5725 }
5726 if (!AMDGPU::supportsScaleOffset(*this, MI.getOpcode())) {
5727 ErrInfo = "Instruction does not support offset scaling";
5728 return false;
5729 }
5730 }
5731 }
5732
5733 // See SIInstrInfo::isLegalGFX12PlusPackedMathFP32Operand for more
5734 // information.
5735 if (AMDGPU::isPackedFP32Inst(Opcode) && AMDGPU::isGFX12Plus(ST)) {
5736 for (unsigned I = 0; I < 3; ++I) {
5738 return false;
5739 }
5740 }
5741
5742 return true;
5743}
5744
5745// It is more readable to list mapped opcodes on the same line.
5746// clang-format off
5747
5748unsigned SIInstrInfo::getVALUOp(const MachineInstr &MI) const {
 5749 switch (MI.getOpcode()) {
5750 default: return AMDGPU::INSTRUCTION_LIST_END;
5751 case AMDGPU::REG_SEQUENCE: return AMDGPU::REG_SEQUENCE;
5752 case AMDGPU::COPY: return AMDGPU::COPY;
5753 case AMDGPU::PHI: return AMDGPU::PHI;
5754 case AMDGPU::INSERT_SUBREG: return AMDGPU::INSERT_SUBREG;
5755 case AMDGPU::WQM: return AMDGPU::WQM;
5756 case AMDGPU::SOFT_WQM: return AMDGPU::SOFT_WQM;
5757 case AMDGPU::STRICT_WWM: return AMDGPU::STRICT_WWM;
5758 case AMDGPU::STRICT_WQM: return AMDGPU::STRICT_WQM;
5759 case AMDGPU::S_MOV_B32: {
5760 const MachineRegisterInfo &MRI = MI.getParent()->getParent()->getRegInfo();
5761 return MI.getOperand(1).isReg() ||
5762 RI.isAGPR(MRI, MI.getOperand(0).getReg()) ?
5763 AMDGPU::COPY : AMDGPU::V_MOV_B32_e32;
5764 }
5765 case AMDGPU::S_ADD_I32:
5766 return ST.hasAddNoCarry() ? AMDGPU::V_ADD_U32_e64 : AMDGPU::V_ADD_CO_U32_e32;
5767 case AMDGPU::S_ADDC_U32:
5768 return AMDGPU::V_ADDC_U32_e32;
5769 case AMDGPU::S_SUB_I32:
5770 return ST.hasAddNoCarry() ? AMDGPU::V_SUB_U32_e64 : AMDGPU::V_SUB_CO_U32_e32;
5771 // FIXME: These are not consistently handled, and selected when the carry is
5772 // used.
5773 case AMDGPU::S_ADD_U32:
5774 return AMDGPU::V_ADD_CO_U32_e32;
5775 case AMDGPU::S_SUB_U32:
5776 return AMDGPU::V_SUB_CO_U32_e32;
5777 case AMDGPU::S_ADD_U64_PSEUDO:
5778 return AMDGPU::V_ADD_U64_PSEUDO;
5779 case AMDGPU::S_SUB_U64_PSEUDO:
5780 return AMDGPU::V_SUB_U64_PSEUDO;
5781 case AMDGPU::S_SUBB_U32: return AMDGPU::V_SUBB_U32_e32;
5782 case AMDGPU::S_MUL_I32: return AMDGPU::V_MUL_LO_U32_e64;
5783 case AMDGPU::S_MUL_HI_U32: return AMDGPU::V_MUL_HI_U32_e64;
5784 case AMDGPU::S_MUL_HI_I32: return AMDGPU::V_MUL_HI_I32_e64;
5785 case AMDGPU::S_AND_B32: return AMDGPU::V_AND_B32_e64;
5786 case AMDGPU::S_OR_B32: return AMDGPU::V_OR_B32_e64;
5787 case AMDGPU::S_XOR_B32: return AMDGPU::V_XOR_B32_e64;
5788 case AMDGPU::S_XNOR_B32:
5789 return ST.hasDLInsts() ? AMDGPU::V_XNOR_B32_e64 : AMDGPU::INSTRUCTION_LIST_END;
5790 case AMDGPU::S_MIN_I32: return AMDGPU::V_MIN_I32_e64;
5791 case AMDGPU::S_MIN_U32: return AMDGPU::V_MIN_U32_e64;
5792 case AMDGPU::S_MAX_I32: return AMDGPU::V_MAX_I32_e64;
5793 case AMDGPU::S_MAX_U32: return AMDGPU::V_MAX_U32_e64;
5794 case AMDGPU::S_ASHR_I32: return AMDGPU::V_ASHR_I32_e32;
5795 case AMDGPU::S_ASHR_I64: return AMDGPU::V_ASHR_I64_e64;
5796 case AMDGPU::S_LSHL_B32: return AMDGPU::V_LSHL_B32_e32;
5797 case AMDGPU::S_LSHL_B64: return AMDGPU::V_LSHL_B64_e64;
5798 case AMDGPU::S_LSHR_B32: return AMDGPU::V_LSHR_B32_e32;
5799 case AMDGPU::S_LSHR_B64: return AMDGPU::V_LSHR_B64_e64;
5800 case AMDGPU::S_SEXT_I32_I8: return AMDGPU::V_BFE_I32_e64;
5801 case AMDGPU::S_SEXT_I32_I16: return AMDGPU::V_BFE_I32_e64;
5802 case AMDGPU::S_BFE_U32: return AMDGPU::V_BFE_U32_e64;
5803 case AMDGPU::S_BFE_I32: return AMDGPU::V_BFE_I32_e64;
5804 case AMDGPU::S_BFM_B32: return AMDGPU::V_BFM_B32_e64;
5805 case AMDGPU::S_BREV_B32: return AMDGPU::V_BFREV_B32_e32;
5806 case AMDGPU::S_NOT_B32: return AMDGPU::V_NOT_B32_e32;
5807 case AMDGPU::S_NOT_B64: return AMDGPU::V_NOT_B32_e32;
5808 case AMDGPU::S_CMP_EQ_I32: return AMDGPU::V_CMP_EQ_I32_e64;
5809 case AMDGPU::S_CMP_LG_I32: return AMDGPU::V_CMP_NE_I32_e64;
5810 case AMDGPU::S_CMP_GT_I32: return AMDGPU::V_CMP_GT_I32_e64;
5811 case AMDGPU::S_CMP_GE_I32: return AMDGPU::V_CMP_GE_I32_e64;
5812 case AMDGPU::S_CMP_LT_I32: return AMDGPU::V_CMP_LT_I32_e64;
5813 case AMDGPU::S_CMP_LE_I32: return AMDGPU::V_CMP_LE_I32_e64;
5814 case AMDGPU::S_CMP_EQ_U32: return AMDGPU::V_CMP_EQ_U32_e64;
5815 case AMDGPU::S_CMP_LG_U32: return AMDGPU::V_CMP_NE_U32_e64;
5816 case AMDGPU::S_CMP_GT_U32: return AMDGPU::V_CMP_GT_U32_e64;
5817 case AMDGPU::S_CMP_GE_U32: return AMDGPU::V_CMP_GE_U32_e64;
5818 case AMDGPU::S_CMP_LT_U32: return AMDGPU::V_CMP_LT_U32_e64;
5819 case AMDGPU::S_CMP_LE_U32: return AMDGPU::V_CMP_LE_U32_e64;
5820 case AMDGPU::S_CMP_EQ_U64: return AMDGPU::V_CMP_EQ_U64_e64;
5821 case AMDGPU::S_CMP_LG_U64: return AMDGPU::V_CMP_NE_U64_e64;
5822 case AMDGPU::S_BCNT1_I32_B32: return AMDGPU::V_BCNT_U32_B32_e64;
5823 case AMDGPU::S_FF1_I32_B32: return AMDGPU::V_FFBL_B32_e32;
5824 case AMDGPU::S_FLBIT_I32_B32: return AMDGPU::V_FFBH_U32_e32;
5825 case AMDGPU::S_FLBIT_I32: return AMDGPU::V_FFBH_I32_e64;
5826 case AMDGPU::S_CBRANCH_SCC0: return AMDGPU::S_CBRANCH_VCCZ;
5827 case AMDGPU::S_CBRANCH_SCC1: return AMDGPU::S_CBRANCH_VCCNZ;
5828 case AMDGPU::S_CVT_F32_I32: return AMDGPU::V_CVT_F32_I32_e64;
5829 case AMDGPU::S_CVT_F32_U32: return AMDGPU::V_CVT_F32_U32_e64;
5830 case AMDGPU::S_CVT_I32_F32: return AMDGPU::V_CVT_I32_F32_e64;
5831 case AMDGPU::S_CVT_U32_F32: return AMDGPU::V_CVT_U32_F32_e64;
5832 case AMDGPU::S_CVT_F32_F16:
5833 case AMDGPU::S_CVT_HI_F32_F16:
5834 return ST.useRealTrue16Insts() ? AMDGPU::V_CVT_F32_F16_t16_e64
5835 : AMDGPU::V_CVT_F32_F16_fake16_e64;
5836 case AMDGPU::S_CVT_F16_F32:
5837 return ST.useRealTrue16Insts() ? AMDGPU::V_CVT_F16_F32_t16_e64
5838 : AMDGPU::V_CVT_F16_F32_fake16_e64;
5839 case AMDGPU::S_CEIL_F32: return AMDGPU::V_CEIL_F32_e64;
5840 case AMDGPU::S_FLOOR_F32: return AMDGPU::V_FLOOR_F32_e64;
5841 case AMDGPU::S_TRUNC_F32: return AMDGPU::V_TRUNC_F32_e64;
5842 case AMDGPU::S_RNDNE_F32: return AMDGPU::V_RNDNE_F32_e64;
5843 case AMDGPU::S_CEIL_F16:
5844 return ST.useRealTrue16Insts() ? AMDGPU::V_CEIL_F16_t16_e64
5845 : AMDGPU::V_CEIL_F16_fake16_e64;
5846 case AMDGPU::S_FLOOR_F16:
5847 return ST.useRealTrue16Insts() ? AMDGPU::V_FLOOR_F16_t16_e64
5848 : AMDGPU::V_FLOOR_F16_fake16_e64;
5849 case AMDGPU::S_TRUNC_F16:
5850 return ST.useRealTrue16Insts() ? AMDGPU::V_TRUNC_F16_t16_e64
5851 : AMDGPU::V_TRUNC_F16_fake16_e64;
5852 case AMDGPU::S_RNDNE_F16:
5853 return ST.useRealTrue16Insts() ? AMDGPU::V_RNDNE_F16_t16_e64
5854 : AMDGPU::V_RNDNE_F16_fake16_e64;
5855 case AMDGPU::S_ADD_F32: return AMDGPU::V_ADD_F32_e64;
5856 case AMDGPU::S_SUB_F32: return AMDGPU::V_SUB_F32_e64;
5857 case AMDGPU::S_MIN_F32: return AMDGPU::V_MIN_F32_e64;
5858 case AMDGPU::S_MAX_F32: return AMDGPU::V_MAX_F32_e64;
5859 case AMDGPU::S_MINIMUM_F32: return AMDGPU::V_MINIMUM_F32_e64;
5860 case AMDGPU::S_MAXIMUM_F32: return AMDGPU::V_MAXIMUM_F32_e64;
5861 case AMDGPU::S_MUL_F32: return AMDGPU::V_MUL_F32_e64;
5862 case AMDGPU::S_ADD_F16:
5863 return ST.useRealTrue16Insts() ? AMDGPU::V_ADD_F16_t16_e64
5864 : AMDGPU::V_ADD_F16_fake16_e64;
5865 case AMDGPU::S_SUB_F16:
5866 return ST.useRealTrue16Insts() ? AMDGPU::V_SUB_F16_t16_e64
5867 : AMDGPU::V_SUB_F16_fake16_e64;
5868 case AMDGPU::S_MIN_F16:
5869 return ST.useRealTrue16Insts() ? AMDGPU::V_MIN_F16_t16_e64
5870 : AMDGPU::V_MIN_F16_fake16_e64;
5871 case AMDGPU::S_MAX_F16:
5872 return ST.useRealTrue16Insts() ? AMDGPU::V_MAX_F16_t16_e64
5873 : AMDGPU::V_MAX_F16_fake16_e64;
5874 case AMDGPU::S_MINIMUM_F16:
5875 return ST.useRealTrue16Insts() ? AMDGPU::V_MINIMUM_F16_t16_e64
5876 : AMDGPU::V_MINIMUM_F16_fake16_e64;
5877 case AMDGPU::S_MAXIMUM_F16:
5878 return ST.useRealTrue16Insts() ? AMDGPU::V_MAXIMUM_F16_t16_e64
5879 : AMDGPU::V_MAXIMUM_F16_fake16_e64;
5880 case AMDGPU::S_MUL_F16:
5881 return ST.useRealTrue16Insts() ? AMDGPU::V_MUL_F16_t16_e64
5882 : AMDGPU::V_MUL_F16_fake16_e64;
5883 case AMDGPU::S_CVT_PK_RTZ_F16_F32: return AMDGPU::V_CVT_PKRTZ_F16_F32_e64;
5884 case AMDGPU::S_FMAC_F32: return AMDGPU::V_FMAC_F32_e64;
5885 case AMDGPU::S_FMAC_F16:
5886 return ST.useRealTrue16Insts() ? AMDGPU::V_FMAC_F16_t16_e64
5887 : AMDGPU::V_FMAC_F16_fake16_e64;
5888 case AMDGPU::S_FMAMK_F32: return AMDGPU::V_FMAMK_F32;
5889 case AMDGPU::S_FMAAK_F32: return AMDGPU::V_FMAAK_F32;
5890 case AMDGPU::S_CMP_LT_F32: return AMDGPU::V_CMP_LT_F32_e64;
5891 case AMDGPU::S_CMP_EQ_F32: return AMDGPU::V_CMP_EQ_F32_e64;
5892 case AMDGPU::S_CMP_LE_F32: return AMDGPU::V_CMP_LE_F32_e64;
5893 case AMDGPU::S_CMP_GT_F32: return AMDGPU::V_CMP_GT_F32_e64;
5894 case AMDGPU::S_CMP_LG_F32: return AMDGPU::V_CMP_LG_F32_e64;
5895 case AMDGPU::S_CMP_GE_F32: return AMDGPU::V_CMP_GE_F32_e64;
5896 case AMDGPU::S_CMP_O_F32: return AMDGPU::V_CMP_O_F32_e64;
5897 case AMDGPU::S_CMP_U_F32: return AMDGPU::V_CMP_U_F32_e64;
5898 case AMDGPU::S_CMP_NGE_F32: return AMDGPU::V_CMP_NGE_F32_e64;
5899 case AMDGPU::S_CMP_NLG_F32: return AMDGPU::V_CMP_NLG_F32_e64;
5900 case AMDGPU::S_CMP_NGT_F32: return AMDGPU::V_CMP_NGT_F32_e64;
5901 case AMDGPU::S_CMP_NLE_F32: return AMDGPU::V_CMP_NLE_F32_e64;
5902 case AMDGPU::S_CMP_NEQ_F32: return AMDGPU::V_CMP_NEQ_F32_e64;
5903 case AMDGPU::S_CMP_NLT_F32: return AMDGPU::V_CMP_NLT_F32_e64;
5904 case AMDGPU::S_CMP_LT_F16:
5905 return ST.useRealTrue16Insts() ? AMDGPU::V_CMP_LT_F16_t16_e64
5906 : AMDGPU::V_CMP_LT_F16_fake16_e64;
5907 case AMDGPU::S_CMP_EQ_F16:
5908 return ST.useRealTrue16Insts() ? AMDGPU::V_CMP_EQ_F16_t16_e64
5909 : AMDGPU::V_CMP_EQ_F16_fake16_e64;
5910 case AMDGPU::S_CMP_LE_F16:
5911 return ST.useRealTrue16Insts() ? AMDGPU::V_CMP_LE_F16_t16_e64
5912 : AMDGPU::V_CMP_LE_F16_fake16_e64;
5913 case AMDGPU::S_CMP_GT_F16:
5914 return ST.useRealTrue16Insts() ? AMDGPU::V_CMP_GT_F16_t16_e64
5915 : AMDGPU::V_CMP_GT_F16_fake16_e64;
5916 case AMDGPU::S_CMP_LG_F16:
5917 return ST.useRealTrue16Insts() ? AMDGPU::V_CMP_LG_F16_t16_e64
5918 : AMDGPU::V_CMP_LG_F16_fake16_e64;
5919 case AMDGPU::S_CMP_GE_F16:
5920 return ST.useRealTrue16Insts() ? AMDGPU::V_CMP_GE_F16_t16_e64
5921 : AMDGPU::V_CMP_GE_F16_fake16_e64;
5922 case AMDGPU::S_CMP_O_F16:
5923 return ST.useRealTrue16Insts() ? AMDGPU::V_CMP_O_F16_t16_e64
5924 : AMDGPU::V_CMP_O_F16_fake16_e64;
5925 case AMDGPU::S_CMP_U_F16:
5926 return ST.useRealTrue16Insts() ? AMDGPU::V_CMP_U_F16_t16_e64
5927 : AMDGPU::V_CMP_U_F16_fake16_e64;
5928 case AMDGPU::S_CMP_NGE_F16:
5929 return ST.useRealTrue16Insts() ? AMDGPU::V_CMP_NGE_F16_t16_e64
5930 : AMDGPU::V_CMP_NGE_F16_fake16_e64;
5931 case AMDGPU::S_CMP_NLG_F16:
5932 return ST.useRealTrue16Insts() ? AMDGPU::V_CMP_NLG_F16_t16_e64
5933 : AMDGPU::V_CMP_NLG_F16_fake16_e64;
5934 case AMDGPU::S_CMP_NGT_F16:
5935 return ST.useRealTrue16Insts() ? AMDGPU::V_CMP_NGT_F16_t16_e64
5936 : AMDGPU::V_CMP_NGT_F16_fake16_e64;
5937 case AMDGPU::S_CMP_NLE_F16:
5938 return ST.useRealTrue16Insts() ? AMDGPU::V_CMP_NLE_F16_t16_e64
5939 : AMDGPU::V_CMP_NLE_F16_fake16_e64;
5940 case AMDGPU::S_CMP_NEQ_F16:
5941 return ST.useRealTrue16Insts() ? AMDGPU::V_CMP_NEQ_F16_t16_e64
5942 : AMDGPU::V_CMP_NEQ_F16_fake16_e64;
5943 case AMDGPU::S_CMP_NLT_F16:
5944 return ST.useRealTrue16Insts() ? AMDGPU::V_CMP_NLT_F16_t16_e64
5945 : AMDGPU::V_CMP_NLT_F16_fake16_e64;
5946 case AMDGPU::V_S_EXP_F32_e64: return AMDGPU::V_EXP_F32_e64;
5947 case AMDGPU::V_S_EXP_F16_e64:
5948 return ST.useRealTrue16Insts() ? AMDGPU::V_EXP_F16_t16_e64
5949 : AMDGPU::V_EXP_F16_fake16_e64;
5950 case AMDGPU::V_S_LOG_F32_e64: return AMDGPU::V_LOG_F32_e64;
5951 case AMDGPU::V_S_LOG_F16_e64:
5952 return ST.useRealTrue16Insts() ? AMDGPU::V_LOG_F16_t16_e64
5953 : AMDGPU::V_LOG_F16_fake16_e64;
5954 case AMDGPU::V_S_RCP_F32_e64: return AMDGPU::V_RCP_F32_e64;
5955 case AMDGPU::V_S_RCP_F16_e64:
5956 return ST.useRealTrue16Insts() ? AMDGPU::V_RCP_F16_t16_e64
5957 : AMDGPU::V_RCP_F16_fake16_e64;
5958 case AMDGPU::V_S_RSQ_F32_e64: return AMDGPU::V_RSQ_F32_e64;
5959 case AMDGPU::V_S_RSQ_F16_e64:
5960 return ST.useRealTrue16Insts() ? AMDGPU::V_RSQ_F16_t16_e64
5961 : AMDGPU::V_RSQ_F16_fake16_e64;
5962 case AMDGPU::V_S_SQRT_F32_e64: return AMDGPU::V_SQRT_F32_e64;
5963 case AMDGPU::V_S_SQRT_F16_e64:
5964 return ST.useRealTrue16Insts() ? AMDGPU::V_SQRT_F16_t16_e64
5965 : AMDGPU::V_SQRT_F16_fake16_e64;
5966 }
5968 "Unexpected scalar opcode without corresponding vector one!");
5969}
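// Usage sketch (hypothetical caller, not taken from this file): code that
// moves a scalar instruction onto the VALU can consult this table and bail
// out when no vector equivalent exists:
//   unsigned NewOpc = TII->getVALUOp(MI);
//   if (NewOpc == AMDGPU::INSTRUCTION_LIST_END)
//     return; // keep the SALU form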
5970
5971// clang-format on
5972
5973void SIInstrInfo::insertScratchExecCopy(MachineFunction &MF,
 5974 MachineBasicBlock &MBB,
 5975 MachineBasicBlock::iterator MBBI,
 5976 const DebugLoc &DL, Register Reg,
5977 bool IsSCCLive,
5978 SlotIndexes *Indexes) const {
5979 const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>();
5980 const SIInstrInfo *TII = ST.getInstrInfo();
5982 if (IsSCCLive) {
5983 // Insert two move instructions, one to save the original value of EXEC and
5984 // the other to turn on all bits in EXEC. This is required as we can't use
5985 // the single instruction S_OR_SAVEEXEC that clobbers SCC.
5986 auto StoreExecMI = BuildMI(MBB, MBBI, DL, TII->get(LMC.MovOpc), Reg)
 5987 .addReg(LMC.ExecReg);
 5988 auto FlipExecMI =
5989 BuildMI(MBB, MBBI, DL, TII->get(LMC.MovOpc), LMC.ExecReg).addImm(-1);
5990 if (Indexes) {
5991 Indexes->insertMachineInstrInMaps(*StoreExecMI);
5992 Indexes->insertMachineInstrInMaps(*FlipExecMI);
5993 }
5994 } else {
5995 auto SaveExec =
5996 BuildMI(MBB, MBBI, DL, TII->get(LMC.OrSaveExecOpc), Reg).addImm(-1);
5997 SaveExec->getOperand(3).setIsDead(); // Mark SCC as dead.
5998 if (Indexes)
5999 Indexes->insertMachineInstrInMaps(*SaveExec);
6000 }
6001}
6002
6003void SIInstrInfo::restoreExec(MachineFunction &MF, MachineBasicBlock &MBB,
 6004 MachineBasicBlock::iterator MBBI,
 6005 const DebugLoc &DL, Register Reg,
6006 SlotIndexes *Indexes) const {
6008 auto ExecRestoreMI = BuildMI(MBB, MBBI, DL, get(LMC.MovOpc), LMC.ExecReg)
6009 .addReg(Reg, RegState::Kill);
6010 if (Indexes)
6011 Indexes->insertMachineInstrInMaps(*ExecRestoreMI);
6012}
6013
6017 "Not a whole wave func");
6018 MachineBasicBlock &MBB = *MF.begin();
6019 for (MachineInstr &MI : MBB)
6020 if (MI.getOpcode() == AMDGPU::SI_WHOLE_WAVE_FUNC_SETUP ||
6021 MI.getOpcode() == AMDGPU::G_AMDGPU_WHOLE_WAVE_FUNC_SETUP)
6022 return &MI;
6023
6024 llvm_unreachable("Couldn't find SI_SETUP_WHOLE_WAVE_FUNC instruction");
6025}
6026
6027// FIXME: This should not be an overridable function. All subtarget dependent
6028// operand modifications should go through isLookupRegClassByHwMode in the
6029// generic handling.
6030const TargetRegisterClass *
6031SIInstrInfo::getRegClass(const MCInstrDesc &TID, unsigned OpNum,
6032 const TargetRegisterInfo *TRI) const {
6033 if (OpNum >= TID.getNumOperands())
6034 return nullptr;
6035 const MCOperandInfo &OpInfo = TID.operands()[OpNum];
6036 int16_t RegClass = getOpRegClassID(OpInfo);
6037 return RI.getRegClass(RegClass);
6038}
6039
6040const TargetRegisterClass *SIInstrInfo::getOpRegClass(const MachineInstr &MI,
 6041 unsigned OpNo) const {
6042 const MCInstrDesc &Desc = get(MI.getOpcode());
6043 if (MI.isVariadic() || OpNo >= Desc.getNumOperands() ||
6044 Desc.operands()[OpNo].RegClass == -1) {
6045 Register Reg = MI.getOperand(OpNo).getReg();
6046
6047 if (Reg.isVirtual()) {
6048 const MachineRegisterInfo &MRI =
6049 MI.getParent()->getParent()->getRegInfo();
6050 return MRI.getRegClass(Reg);
6051 }
6052 return RI.getPhysRegBaseClass(Reg);
6053 }
6054
6055 return RI.getRegClass(getOpRegClassID(Desc.operands()[OpNo]));
6056}
6057
6058void SIInstrInfo::legalizeOpWithMove(MachineInstr &MI, unsigned OpIdx) const {
 6059 MachineBasicBlock::iterator I = MI;
 6060 MachineBasicBlock *MBB = MI.getParent();
6061 MachineOperand &MO = MI.getOperand(OpIdx);
6062 MachineRegisterInfo &MRI = MBB->getParent()->getRegInfo();
6063 unsigned RCID = getOpRegClassID(get(MI.getOpcode()).operands()[OpIdx]);
6064 const TargetRegisterClass *RC = RI.getRegClass(RCID);
6065 unsigned Size = RI.getRegSizeInBits(*RC);
6066 unsigned Opcode = (Size == 64) ? AMDGPU::V_MOV_B64_PSEUDO
6067 : Size == 16 ? AMDGPU::V_MOV_B16_t16_e64
6068 : AMDGPU::V_MOV_B32_e32;
6069 if (MO.isReg())
6070 Opcode = AMDGPU::COPY;
6071 else if (RI.isSGPRClass(RC))
6072 Opcode = (Size == 64) ? AMDGPU::S_MOV_B64 : AMDGPU::S_MOV_B32;
6073
6074 const TargetRegisterClass *VRC = RI.getEquivalentVGPRClass(RC);
6075 Register Reg = MRI.createVirtualRegister(VRC);
6076 DebugLoc DL = MBB->findDebugLoc(I);
6077 BuildMI(*MI.getParent(), I, DL, get(Opcode), Reg).add(MO);
6078 MO.ChangeToRegister(Reg, false);
6079}
6080
6083 const MachineOperand &SuperReg, const TargetRegisterClass *SuperRC,
6084 unsigned SubIdx, const TargetRegisterClass *SubRC) const {
6085 if (!SuperReg.getReg().isVirtual())
6086 return RI.getSubReg(SuperReg.getReg(), SubIdx);
6087
6088 MachineBasicBlock *MBB = MI->getParent();
6089 const DebugLoc &DL = MI->getDebugLoc();
6090 Register SubReg = MRI.createVirtualRegister(SubRC);
6091
6092 unsigned NewSubIdx = RI.composeSubRegIndices(SuperReg.getSubReg(), SubIdx);
6093 BuildMI(*MBB, MI, DL, get(TargetOpcode::COPY), SubReg)
6094 .addReg(SuperReg.getReg(), 0, NewSubIdx);
6095 return SubReg;
6096}
6097
6100 const MachineOperand &Op, const TargetRegisterClass *SuperRC,
6101 unsigned SubIdx, const TargetRegisterClass *SubRC) const {
6102 if (Op.isImm()) {
6103 if (SubIdx == AMDGPU::sub0)
6104 return MachineOperand::CreateImm(static_cast<int32_t>(Op.getImm()));
6105 if (SubIdx == AMDGPU::sub1)
6106 return MachineOperand::CreateImm(static_cast<int32_t>(Op.getImm() >> 32));
6107
6108 llvm_unreachable("Unhandled register index for immediate");
6109 }
6110
6111 unsigned SubReg = buildExtractSubReg(MII, MRI, Op, SuperRC,
6112 SubIdx, SubRC);
6113 return MachineOperand::CreateReg(SubReg, false);
6114}
6115
6116// Change the order of operands from (0, 1, 2) to (0, 2, 1)
6117void SIInstrInfo::swapOperands(MachineInstr &Inst) const {
6118 assert(Inst.getNumExplicitOperands() == 3);
6119 MachineOperand Op1 = Inst.getOperand(1);
6120 Inst.removeOperand(1);
6121 Inst.addOperand(Op1);
6122}
6123
6124bool SIInstrInfo::isLegalRegOperand(const MachineRegisterInfo &MRI,
 6125 const MCOperandInfo &OpInfo,
6126 const MachineOperand &MO) const {
6127 if (!MO.isReg())
6128 return false;
6129
6130 Register Reg = MO.getReg();
6131
6132 const TargetRegisterClass *DRC = RI.getRegClass(getOpRegClassID(OpInfo));
6133 if (Reg.isPhysical())
6134 return DRC->contains(Reg);
6135
6136 const TargetRegisterClass *RC = MRI.getRegClass(Reg);
6137
6138 if (MO.getSubReg()) {
6139 const MachineFunction *MF = MO.getParent()->getParent()->getParent();
6140 const TargetRegisterClass *SuperRC = RI.getLargestLegalSuperClass(RC, *MF);
6141 if (!SuperRC)
6142 return false;
6143 return RI.getMatchingSuperRegClass(SuperRC, DRC, MO.getSubReg()) != nullptr;
6144 }
6145
6146 return RI.getCommonSubClass(DRC, RC) != nullptr;
6147}
6148
6149bool SIInstrInfo::isLegalRegOperand(const MachineInstr &MI, unsigned OpIdx,
 6150 const MachineOperand &MO) const {
6151 const MachineRegisterInfo &MRI = MI.getParent()->getParent()->getRegInfo();
6152 const MCOperandInfo OpInfo = MI.getDesc().operands()[OpIdx];
6153 unsigned Opc = MI.getOpcode();
6154
6155 // See SIInstrInfo::isLegalGFX12PlusPackedMathFP32Operand for more
6156 // information.
6157 if (AMDGPU::isPackedFP32Inst(MI.getOpcode()) && AMDGPU::isGFX12Plus(ST) &&
6158 MO.isReg() && RI.isSGPRReg(MRI, MO.getReg())) {
6159 constexpr AMDGPU::OpName OpNames[] = {
6160 AMDGPU::OpName::src0, AMDGPU::OpName::src1, AMDGPU::OpName::src2};
6161
6162 for (auto [I, OpName] : enumerate(OpNames)) {
6163 int SrcIdx = AMDGPU::getNamedOperandIdx(MI.getOpcode(), OpNames[I]);
6164 if (static_cast<unsigned>(SrcIdx) == OpIdx &&
6166 return false;
6167 }
6168 }
6169
6170 if (!isLegalRegOperand(MRI, OpInfo, MO))
6171 return false;
6172
6173 // check Accumulate GPR operand
6174 bool IsAGPR = RI.isAGPR(MRI, MO.getReg());
6175 if (IsAGPR && !ST.hasMAIInsts())
6176 return false;
6177 if (IsAGPR && (!ST.hasGFX90AInsts() || !MRI.reservedRegsFrozen()) &&
6178 (MI.mayLoad() || MI.mayStore() || isDS(Opc) || isMIMG(Opc)))
6179 return false;
6180 // Atomics should have both vdst and vdata either vgpr or agpr.
6181 const int VDstIdx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::vdst);
6182 const int DataIdx = AMDGPU::getNamedOperandIdx(
6183 Opc, isDS(Opc) ? AMDGPU::OpName::data0 : AMDGPU::OpName::vdata);
6184 if ((int)OpIdx == VDstIdx && DataIdx != -1 &&
6185 MI.getOperand(DataIdx).isReg() &&
6186 RI.isAGPR(MRI, MI.getOperand(DataIdx).getReg()) != IsAGPR)
6187 return false;
6188 if ((int)OpIdx == DataIdx) {
6189 if (VDstIdx != -1 &&
6190 RI.isAGPR(MRI, MI.getOperand(VDstIdx).getReg()) != IsAGPR)
6191 return false;
6192 // DS instructions with 2 src operands also must have tied RC.
6193 const int Data1Idx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::data1);
6194 if (Data1Idx != -1 && MI.getOperand(Data1Idx).isReg() &&
6195 RI.isAGPR(MRI, MI.getOperand(Data1Idx).getReg()) != IsAGPR)
6196 return false;
6197 }
6198
6199 // Check V_ACCVGPR_WRITE_B32_e64
6200 if (Opc == AMDGPU::V_ACCVGPR_WRITE_B32_e64 && !ST.hasGFX90AInsts() &&
6201 (int)OpIdx == AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::src0) &&
6202 RI.isSGPRReg(MRI, MO.getReg()))
6203 return false;
6204 return true;
6205}
6206
6207bool SIInstrInfo::isLegalVSrcOperand(const MachineRegisterInfo &MRI,
 6208 const MCOperandInfo &OpInfo,
6209 const MachineOperand &MO) const {
6210 if (MO.isReg())
6211 return isLegalRegOperand(MRI, OpInfo, MO);
6212
6213 // Handle non-register types that are treated like immediates.
6214 assert(MO.isImm() || MO.isTargetIndex() || MO.isFI() || MO.isGlobal());
6215 return true;
6216}
6217
6218bool SIInstrInfo::isLegalGFX12PlusPackedMathFP32Operand(
 6219 const MachineRegisterInfo &MRI, const MachineInstr &MI, unsigned SrcN,
6220 const MachineOperand *MO) const {
6221 constexpr unsigned NumOps = 3;
6222 constexpr AMDGPU::OpName OpNames[NumOps * 2] = {
6223 AMDGPU::OpName::src0, AMDGPU::OpName::src1,
6224 AMDGPU::OpName::src2, AMDGPU::OpName::src0_modifiers,
6225 AMDGPU::OpName::src1_modifiers, AMDGPU::OpName::src2_modifiers};
6226
6227 assert(SrcN < NumOps);
6228
6229 if (!MO) {
6230 int SrcIdx = AMDGPU::getNamedOperandIdx(MI.getOpcode(), OpNames[SrcN]);
6231 if (SrcIdx == -1)
6232 return true;
6233 MO = &MI.getOperand(SrcIdx);
6234 }
6235
6236 if (!MO->isReg() || !RI.isSGPRReg(MRI, MO->getReg()))
6237 return true;
6238
6239 int ModsIdx =
6240 AMDGPU::getNamedOperandIdx(MI.getOpcode(), OpNames[NumOps + SrcN]);
6241 if (ModsIdx == -1)
6242 return true;
6243
6244 unsigned Mods = MI.getOperand(ModsIdx).getImm();
6245 bool OpSel = Mods & SISrcMods::OP_SEL_0;
6246 bool OpSelHi = Mods & SISrcMods::OP_SEL_1;
6247
6248 return !OpSel && !OpSelHi;
6249}
6250
6251bool SIInstrInfo::isOperandLegal(const MachineInstr &MI, unsigned OpIdx,
 6252 const MachineOperand *MO) const {
6253 const MachineFunction &MF = *MI.getParent()->getParent();
6254 const MachineRegisterInfo &MRI = MF.getRegInfo();
6255 const MCInstrDesc &InstDesc = MI.getDesc();
6256 const MCOperandInfo &OpInfo = InstDesc.operands()[OpIdx];
6257 int64_t RegClass = getOpRegClassID(OpInfo);
6258 const TargetRegisterClass *DefinedRC =
6259 RegClass != -1 ? RI.getRegClass(RegClass) : nullptr;
6260 if (!MO)
6261 MO = &MI.getOperand(OpIdx);
6262
6263 const bool IsInlineConst = !MO->isReg() && isInlineConstant(*MO, OpInfo);
6264
6265 if (isVALU(MI) && !IsInlineConst && usesConstantBus(MRI, *MO, OpInfo)) {
6266 const MachineOperand *UsedLiteral = nullptr;
6267
6268 int ConstantBusLimit = ST.getConstantBusLimit(MI.getOpcode());
6269 int LiteralLimit = !isVOP3(MI) || ST.hasVOP3Literal() ? 1 : 0;
6270
6271 // TODO: Be more permissive with frame indexes.
6272 if (!MO->isReg() && !isInlineConstant(*MO, OpInfo)) {
6273 if (!LiteralLimit--)
6274 return false;
6275
6276 UsedLiteral = MO;
6277 }
6278
 6279 SmallDenseSet<RegSubRegPair> SGPRsUsed;
 6280 if (MO->isReg())
6281 SGPRsUsed.insert(RegSubRegPair(MO->getReg(), MO->getSubReg()));
6282
6283 for (unsigned i = 0, e = MI.getNumOperands(); i != e; ++i) {
6284 if (i == OpIdx)
6285 continue;
6286 const MachineOperand &Op = MI.getOperand(i);
6287 if (Op.isReg()) {
6288 if (Op.isUse()) {
6289 RegSubRegPair SGPR(Op.getReg(), Op.getSubReg());
6290 if (regUsesConstantBus(Op, MRI) && SGPRsUsed.insert(SGPR).second) {
6291 if (--ConstantBusLimit <= 0)
6292 return false;
6293 }
6294 }
6295 } else if (AMDGPU::isSISrcOperand(InstDesc.operands()[i]) &&
6296 !isInlineConstant(Op, InstDesc.operands()[i])) {
6297 // The same literal may be used multiple times.
6298 if (!UsedLiteral)
6299 UsedLiteral = &Op;
6300 else if (UsedLiteral->isIdenticalTo(Op))
6301 continue;
6302
6303 if (!LiteralLimit--)
6304 return false;
6305 if (--ConstantBusLimit <= 0)
6306 return false;
6307 }
6308 }
6309 } else if (!IsInlineConst && !MO->isReg() && isSALU(MI)) {
6310 // There can be at most one literal operand, but it can be repeated.
6311 for (unsigned i = 0, e = MI.getNumOperands(); i != e; ++i) {
6312 if (i == OpIdx)
6313 continue;
6314 const MachineOperand &Op = MI.getOperand(i);
6315 if (!Op.isReg() && !Op.isFI() && !Op.isRegMask() &&
6316 !isInlineConstant(Op, InstDesc.operands()[i]) &&
6317 !Op.isIdenticalTo(*MO))
6318 return false;
6319
6320 // Do not fold a non-inlineable and non-register operand into an
6321 // instruction that already has a frame index. The frame index handling
6322 // code could not handle well when a frame index co-exists with another
6323 // non-register operand, unless that operand is an inlineable immediate.
6324 if (Op.isFI())
6325 return false;
6326 }
6327 } else if (IsInlineConst && ST.hasNoF16PseudoScalarTransInlineConstants() &&
6328 isF16PseudoScalarTrans(MI.getOpcode())) {
6329 return false;
6330 }
6331
6332 if (MO->isReg()) {
6333 if (!DefinedRC)
6334 return OpInfo.OperandType == MCOI::OPERAND_UNKNOWN;
6335 return isLegalRegOperand(MI, OpIdx, *MO);
6336 }
6337
6338 if (MO->isImm()) {
6339 uint64_t Imm = MO->getImm();
6340 bool Is64BitFPOp = OpInfo.OperandType == AMDGPU::OPERAND_REG_IMM_FP64;
6341 bool Is64BitOp = Is64BitFPOp ||
6342 OpInfo.OperandType == AMDGPU::OPERAND_REG_IMM_INT64 ||
6343 OpInfo.OperandType == AMDGPU::OPERAND_REG_IMM_V2INT32 ||
6344 OpInfo.OperandType == AMDGPU::OPERAND_REG_IMM_V2FP32;
6345 if (Is64BitOp &&
6346 !AMDGPU::isInlinableLiteral64(Imm, ST.hasInv2PiInlineImm())) {
6347 if (!AMDGPU::isValid32BitLiteral(Imm, Is64BitFPOp) &&
6348 (!ST.has64BitLiterals() || InstDesc.getSize() != 4))
6349 return false;
6350
6351 // FIXME: We can use sign extended 64-bit literals, but only for signed
6352 // operands. At the moment we do not know if an operand is signed.
6353 // Such operand will be encoded as its low 32 bits and then either
6354 // correctly sign extended or incorrectly zero extended by HW.
6355 // If 64-bit literals are supported and the literal will be encoded
6356 // as full 64 bit we still can use it.
6357 if (!Is64BitFPOp && (int32_t)Imm < 0 &&
6358 (!ST.has64BitLiterals() || AMDGPU::isValid32BitLiteral(Imm, false)))
6359 return false;
6360 }
6361 }
6362
6363 // Handle non-register types that are treated like immediates.
6364 assert(MO->isImm() || MO->isTargetIndex() || MO->isFI() || MO->isGlobal());
6365
6366 if (!DefinedRC) {
6367 // This operand expects an immediate.
6368 return true;
6369 }
6370
6371 return isImmOperandLegal(MI, OpIdx, *MO);
6372}
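// Example of the literal/constant-bus interplay handled above (sketch):
// folding the literal 0x12345678 into a source of a VOP3 such as
// V_ADD_F32_e64 is only reported legal on subtargets with VOP3 literals
// (GFX10+); otherwise the query returns false and the caller is expected to
// materialize the value first, e.g. with a V_MOV_B32 into a VGPR.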
6373
6375 bool IsGFX950Only = ST.hasGFX950Insts();
6376 bool IsGFX940Only = ST.hasGFX940Insts();
6377
6378 if (!IsGFX950Only && !IsGFX940Only)
6379 return false;
6380
6381 if (!isVALU(MI))
6382 return false;
6383
6384 // V_COS, V_EXP, V_RCP, etc.
6385 if (isTRANS(MI))
6386 return true;
6387
6388 // DOT2, DOT2C, DOT4, etc.
6389 if (isDOT(MI))
6390 return true;
6391
6392 // MFMA, SMFMA
6393 if (isMFMA(MI))
6394 return true;
6395
6396 unsigned Opcode = MI.getOpcode();
6397 switch (Opcode) {
6398 case AMDGPU::V_CVT_PK_BF8_F32_e64:
6399 case AMDGPU::V_CVT_PK_FP8_F32_e64:
6400 case AMDGPU::V_MQSAD_PK_U16_U8_e64:
6401 case AMDGPU::V_MQSAD_U32_U8_e64:
6402 case AMDGPU::V_PK_ADD_F16:
6403 case AMDGPU::V_PK_ADD_F32:
6404 case AMDGPU::V_PK_ADD_I16:
6405 case AMDGPU::V_PK_ADD_U16:
6406 case AMDGPU::V_PK_ASHRREV_I16:
6407 case AMDGPU::V_PK_FMA_F16:
6408 case AMDGPU::V_PK_FMA_F32:
6409 case AMDGPU::V_PK_FMAC_F16_e32:
6410 case AMDGPU::V_PK_FMAC_F16_e64:
6411 case AMDGPU::V_PK_LSHLREV_B16:
6412 case AMDGPU::V_PK_LSHRREV_B16:
6413 case AMDGPU::V_PK_MAD_I16:
6414 case AMDGPU::V_PK_MAD_U16:
6415 case AMDGPU::V_PK_MAX_F16:
6416 case AMDGPU::V_PK_MAX_I16:
6417 case AMDGPU::V_PK_MAX_U16:
6418 case AMDGPU::V_PK_MIN_F16:
6419 case AMDGPU::V_PK_MIN_I16:
6420 case AMDGPU::V_PK_MIN_U16:
6421 case AMDGPU::V_PK_MOV_B32:
6422 case AMDGPU::V_PK_MUL_F16:
6423 case AMDGPU::V_PK_MUL_F32:
6424 case AMDGPU::V_PK_MUL_LO_U16:
6425 case AMDGPU::V_PK_SUB_I16:
6426 case AMDGPU::V_PK_SUB_U16:
6427 case AMDGPU::V_QSAD_PK_U16_U8_e64:
6428 return true;
6429 default:
6430 return false;
6431 }
6432}
6433
6434void SIInstrInfo::legalizeOperandsVOP2(MachineRegisterInfo &MRI,
 6435 MachineInstr &MI) const {
6436 unsigned Opc = MI.getOpcode();
6437 const MCInstrDesc &InstrDesc = get(Opc);
6438
6439 int Src0Idx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::src0);
6440 MachineOperand &Src0 = MI.getOperand(Src0Idx);
6441
6442 int Src1Idx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::src1);
6443 MachineOperand &Src1 = MI.getOperand(Src1Idx);
6444
6445 // If there is an implicit SGPR use such as VCC use for v_addc_u32/v_subb_u32
6446 // we need to only have one constant bus use before GFX10.
6447 bool HasImplicitSGPR = findImplicitSGPRRead(MI);
6448 if (HasImplicitSGPR && ST.getConstantBusLimit(Opc) <= 1 && Src0.isReg() &&
6449 RI.isSGPRReg(MRI, Src0.getReg()))
6450 legalizeOpWithMove(MI, Src0Idx);
6451
6452 // Special case: V_WRITELANE_B32 accepts only immediate or SGPR operands for
6453 // both the value to write (src0) and lane select (src1). Fix up non-SGPR
6454 // src0/src1 with V_READFIRSTLANE.
6455 if (Opc == AMDGPU::V_WRITELANE_B32) {
6456 const DebugLoc &DL = MI.getDebugLoc();
6457 if (Src0.isReg() && RI.isVGPR(MRI, Src0.getReg())) {
6458 Register Reg = MRI.createVirtualRegister(&AMDGPU::SReg_32_XM0RegClass);
6459 BuildMI(*MI.getParent(), MI, DL, get(AMDGPU::V_READFIRSTLANE_B32), Reg)
6460 .add(Src0);
6461 Src0.ChangeToRegister(Reg, false);
6462 }
6463 if (Src1.isReg() && RI.isVGPR(MRI, Src1.getReg())) {
6464 Register Reg = MRI.createVirtualRegister(&AMDGPU::SReg_32_XM0RegClass);
6465 const DebugLoc &DL = MI.getDebugLoc();
6466 BuildMI(*MI.getParent(), MI, DL, get(AMDGPU::V_READFIRSTLANE_B32), Reg)
6467 .add(Src1);
6468 Src1.ChangeToRegister(Reg, false);
6469 }
6470 return;
6471 }
6472
6473 // Special case: V_FMAC_F32 and V_FMAC_F16 have src2.
6474 if (Opc == AMDGPU::V_FMAC_F32_e32 || Opc == AMDGPU::V_FMAC_F16_e32) {
6475 int Src2Idx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::src2);
6476 if (!RI.isVGPR(MRI, MI.getOperand(Src2Idx).getReg()))
6477 legalizeOpWithMove(MI, Src2Idx);
6478 }
6479
6480 // VOP2 src0 instructions support all operand types, so we don't need to check
6481 // their legality. If src1 is already legal, we don't need to do anything.
6482 if (isLegalRegOperand(MRI, InstrDesc.operands()[Src1Idx], Src1))
6483 return;
6484
6485 // Special case: V_READLANE_B32 accepts only immediate or SGPR operands for
6486 // lane select. Fix up using V_READFIRSTLANE, since we assume that the lane
6487 // select is uniform.
6488 if (Opc == AMDGPU::V_READLANE_B32 && Src1.isReg() &&
6489 RI.isVGPR(MRI, Src1.getReg())) {
6490 Register Reg = MRI.createVirtualRegister(&AMDGPU::SReg_32_XM0RegClass);
6491 const DebugLoc &DL = MI.getDebugLoc();
6492 BuildMI(*MI.getParent(), MI, DL, get(AMDGPU::V_READFIRSTLANE_B32), Reg)
6493 .add(Src1);
6494 Src1.ChangeToRegister(Reg, false);
6495 return;
6496 }
6497
6498 // We do not use commuteInstruction here because it is too aggressive and will
6499 // commute if it is possible. We only want to commute here if it improves
6500 // legality. This can be called a fairly large number of times so don't waste
6501 // compile time pointlessly swapping and checking legality again.
6502 if (HasImplicitSGPR || !MI.isCommutable()) {
6503 legalizeOpWithMove(MI, Src1Idx);
6504 return;
6505 }
6506
6507 // If src0 can be used as src1, commuting will make the operands legal.
6508 // Otherwise we have to give up and insert a move.
6509 //
6510 // TODO: Other immediate-like operand kinds could be commuted if there was a
6511 // MachineOperand::ChangeTo* for them.
6512 if ((!Src1.isImm() && !Src1.isReg()) ||
6513 !isLegalRegOperand(MRI, InstrDesc.operands()[Src1Idx], Src0)) {
6514 legalizeOpWithMove(MI, Src1Idx);
6515 return;
6516 }
6517
6518 int CommutedOpc = commuteOpcode(MI);
6519 if (CommutedOpc == -1) {
6520 legalizeOpWithMove(MI, Src1Idx);
6521 return;
6522 }
6523
6524 MI.setDesc(get(CommutedOpc));
6525
6526 Register Src0Reg = Src0.getReg();
6527 unsigned Src0SubReg = Src0.getSubReg();
6528 bool Src0Kill = Src0.isKill();
6529
6530 if (Src1.isImm())
6531 Src0.ChangeToImmediate(Src1.getImm());
6532 else if (Src1.isReg()) {
6533 Src0.ChangeToRegister(Src1.getReg(), false, false, Src1.isKill());
6534 Src0.setSubReg(Src1.getSubReg());
6535 } else
6536 llvm_unreachable("Should only have register or immediate operands");
6537
6538 Src1.ChangeToRegister(Src0Reg, false, false, Src0Kill);
6539 Src1.setSubReg(Src0SubReg);
 6540 fixImplicitOperands(MI);
6541}
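// Illustration of the VOP2 legalization above (sketch): src1 of a VOP2 must be
// a VGPR, while src0 may read the constant bus, so a commutable opcode with an
// SGPR in src1 is fixed by swapping operands rather than inserting a copy:
//   V_ADD_F32_e32 v0, v1, s2   -->   V_ADD_F32_e32 v0, s2, v1
// Non-commutable cases fall back to legalizeOpWithMove, which copies the
// offending operand into a fresh VGPR.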
6542
6543// Legalize VOP3 operands. All operand types are supported for any operand
6544// but only one literal constant and only starting from GFX10.
6545void SIInstrInfo::legalizeOperandsVOP3(MachineRegisterInfo &MRI,
 6546 MachineInstr &MI) const {
6547 unsigned Opc = MI.getOpcode();
6548
6549 int VOP3Idx[3] = {
6550 AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::src0),
6551 AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::src1),
6552 AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::src2)
6553 };
6554
6555 if (Opc == AMDGPU::V_PERMLANE16_B32_e64 ||
6556 Opc == AMDGPU::V_PERMLANEX16_B32_e64 ||
6557 Opc == AMDGPU::V_PERMLANE_BCAST_B32_e64 ||
6558 Opc == AMDGPU::V_PERMLANE_UP_B32_e64 ||
6559 Opc == AMDGPU::V_PERMLANE_DOWN_B32_e64 ||
6560 Opc == AMDGPU::V_PERMLANE_XOR_B32_e64 ||
6561 Opc == AMDGPU::V_PERMLANE_IDX_GEN_B32_e64) {
6562 // src1 and src2 must be scalar
6563 MachineOperand &Src1 = MI.getOperand(VOP3Idx[1]);
6564 const DebugLoc &DL = MI.getDebugLoc();
6565 if (Src1.isReg() && !RI.isSGPRClass(MRI.getRegClass(Src1.getReg()))) {
6566 Register Reg = MRI.createVirtualRegister(&AMDGPU::SReg_32_XM0RegClass);
6567 BuildMI(*MI.getParent(), MI, DL, get(AMDGPU::V_READFIRSTLANE_B32), Reg)
6568 .add(Src1);
6569 Src1.ChangeToRegister(Reg, false);
6570 }
6571 if (VOP3Idx[2] != -1) {
6572 MachineOperand &Src2 = MI.getOperand(VOP3Idx[2]);
6573 if (Src2.isReg() && !RI.isSGPRClass(MRI.getRegClass(Src2.getReg()))) {
6574 Register Reg = MRI.createVirtualRegister(&AMDGPU::SReg_32_XM0RegClass);
6575 BuildMI(*MI.getParent(), MI, DL, get(AMDGPU::V_READFIRSTLANE_B32), Reg)
6576 .add(Src2);
6577 Src2.ChangeToRegister(Reg, false);
6578 }
6579 }
6580 }
6581
6582 // Find the one SGPR operand we are allowed to use.
6583 int ConstantBusLimit = ST.getConstantBusLimit(Opc);
6584 int LiteralLimit = ST.hasVOP3Literal() ? 1 : 0;
6585 SmallDenseSet<unsigned> SGPRsUsed;
6586 Register SGPRReg = findUsedSGPR(MI, VOP3Idx);
6587 if (SGPRReg) {
6588 SGPRsUsed.insert(SGPRReg);
6589 --ConstantBusLimit;
6590 }
6591
6592 for (int Idx : VOP3Idx) {
6593 if (Idx == -1)
6594 break;
6595 MachineOperand &MO = MI.getOperand(Idx);
6596
6597 if (!MO.isReg()) {
6598 if (isInlineConstant(MO, get(Opc).operands()[Idx]))
6599 continue;
6600
6601 if (LiteralLimit > 0 && ConstantBusLimit > 0) {
6602 --LiteralLimit;
6603 --ConstantBusLimit;
6604 continue;
6605 }
6606
6607 --LiteralLimit;
6608 --ConstantBusLimit;
6609 legalizeOpWithMove(MI, Idx);
6610 continue;
6611 }
6612
6613 if (!RI.isSGPRClass(RI.getRegClassForReg(MRI, MO.getReg())))
6614 continue; // VGPRs are legal
6615
6616 // We can use one SGPR in each VOP3 instruction prior to GFX10
6617 // and two starting from GFX10.
6618 if (SGPRsUsed.count(MO.getReg()))
6619 continue;
6620 if (ConstantBusLimit > 0) {
6621 SGPRsUsed.insert(MO.getReg());
6622 --ConstantBusLimit;
6623 continue;
6624 }
6625
6626 // If we make it this far, then the operand is not legal and we must
6627 // legalize it.
6628 legalizeOpWithMove(MI, Idx);
6629 }
6630
6631 // Special case: V_FMAC_F32 and V_FMAC_F16 have src2 tied to vdst.
6632 if ((Opc == AMDGPU::V_FMAC_F32_e64 || Opc == AMDGPU::V_FMAC_F16_e64) &&
6633 !RI.isVGPR(MRI, MI.getOperand(VOP3Idx[2]).getReg()))
6634 legalizeOpWithMove(MI, VOP3Idx[2]);
6635
6636 // Fix the register class of packed FP32 instructions on gfx12+. See
6637 // SIInstrInfo::isLegalGFX12PlusPackedMathFP32Operand for more information.
 6638 if (AMDGPU::isPackedFP32Inst(Opc) && AMDGPU::isGFX12Plus(ST)) {
 6639 for (unsigned I = 0; I < 3; ++I) {
 6640 if (!isLegalGFX12PlusPackedMathFP32Operand(MRI, MI, I))
 6641 legalizeOpWithMove(MI, VOP3Idx[I]);
6642 }
6643 }
6644}
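// Illustration of the VOP3 rules above (sketch): with a constant bus limit of
// 1 (pre-GFX10), only one unique SGPR source may remain, so for
//   V_FMA_F32 v0, s1, s2, v3
// s1 is kept and s2 is copied into a VGPR via legalizeOpWithMove; on GFX10+
// both SGPRs are legal as long as no third constant bus user appears.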
6645
6646Register SIInstrInfo::readlaneVGPRToSGPR(
 6647 Register SrcReg, MachineInstr &UseMI, MachineRegisterInfo &MRI,
 6648 const TargetRegisterClass *DstRC /*=nullptr*/) const {
6649 const TargetRegisterClass *VRC = MRI.getRegClass(SrcReg);
6650 const TargetRegisterClass *SRC = RI.getEquivalentSGPRClass(VRC);
6651 if (DstRC)
6652 SRC = RI.getCommonSubClass(SRC, DstRC);
6653
6654 Register DstReg = MRI.createVirtualRegister(SRC);
6655 unsigned SubRegs = RI.getRegSizeInBits(*VRC) / 32;
6656
6657 if (RI.hasAGPRs(VRC)) {
6658 VRC = RI.getEquivalentVGPRClass(VRC);
6659 Register NewSrcReg = MRI.createVirtualRegister(VRC);
6660 BuildMI(*UseMI.getParent(), UseMI, UseMI.getDebugLoc(),
6661 get(TargetOpcode::COPY), NewSrcReg)
6662 .addReg(SrcReg);
6663 SrcReg = NewSrcReg;
6664 }
6665
6666 if (SubRegs == 1) {
6667 BuildMI(*UseMI.getParent(), UseMI, UseMI.getDebugLoc(),
6668 get(AMDGPU::V_READFIRSTLANE_B32), DstReg)
6669 .addReg(SrcReg);
6670 return DstReg;
6671 }
6672
6674 for (unsigned i = 0; i < SubRegs; ++i) {
6675 Register SGPR = MRI.createVirtualRegister(&AMDGPU::SGPR_32RegClass);
6676 BuildMI(*UseMI.getParent(), UseMI, UseMI.getDebugLoc(),
6677 get(AMDGPU::V_READFIRSTLANE_B32), SGPR)
6678 .addReg(SrcReg, 0, RI.getSubRegFromChannel(i));
6679 SRegs.push_back(SGPR);
6680 }
6681
6683 BuildMI(*UseMI.getParent(), UseMI, UseMI.getDebugLoc(),
6684 get(AMDGPU::REG_SEQUENCE), DstReg);
6685 for (unsigned i = 0; i < SubRegs; ++i) {
6686 MIB.addReg(SRegs[i]);
6687 MIB.addImm(RI.getSubRegFromChannel(i));
6688 }
6689 return DstReg;
6690}
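// Sketch of the expansion produced above for a multi-word source: a 64-bit
// VGPR pair becomes one V_READFIRSTLANE_B32 per 32-bit channel plus a
// REG_SEQUENCE that reassembles the SGPR pair, roughly:
//   %lo:sgpr_32 = V_READFIRSTLANE_B32 %vsrc.sub0
//   %hi:sgpr_32 = V_READFIRSTLANE_B32 %vsrc.sub1
//   %ssrc:sgpr_64 = REG_SEQUENCE %lo, %subreg.sub0, %hi, %subreg.sub1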
6691
6692void SIInstrInfo::legalizeOperandsSMRD(MachineRegisterInfo &MRI,
 6693 MachineInstr &MI) const {
6694
 6695 // If the pointer is stored in VGPRs, then we need to move them to
6696 // SGPRs using v_readfirstlane. This is safe because we only select
6697 // loads with uniform pointers to SMRD instruction so we know the
6698 // pointer value is uniform.
6699 MachineOperand *SBase = getNamedOperand(MI, AMDGPU::OpName::sbase);
6700 if (SBase && !RI.isSGPRClass(MRI.getRegClass(SBase->getReg()))) {
6701 Register SGPR = readlaneVGPRToSGPR(SBase->getReg(), MI, MRI);
6702 SBase->setReg(SGPR);
6703 }
6704 MachineOperand *SOff = getNamedOperand(MI, AMDGPU::OpName::soffset);
6705 if (SOff && !RI.isSGPRReg(MRI, SOff->getReg())) {
6706 Register SGPR = readlaneVGPRToSGPR(SOff->getReg(), MI, MRI);
6707 SOff->setReg(SGPR);
6708 }
6709}
6710
6711bool SIInstrInfo::moveFlatAddrToVGPR(MachineInstr &Inst) const {
 6712 unsigned Opc = Inst.getOpcode();
6713 int OldSAddrIdx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::saddr);
6714 if (OldSAddrIdx < 0)
6715 return false;
6716
6717 assert(isSegmentSpecificFLAT(Inst) || (isFLAT(Inst) && ST.hasFlatGVSMode()));
6718
6719 int NewOpc = AMDGPU::getGlobalVaddrOp(Opc);
6720 if (NewOpc < 0)
 6721 NewOpc = AMDGPU::getFlatScratchInstSVfromSS(Opc);
 6722 if (NewOpc < 0)
6723 return false;
6724
 6725 MachineRegisterInfo &MRI = Inst.getMF()->getRegInfo();
 6726 MachineOperand &SAddr = Inst.getOperand(OldSAddrIdx);
6727 if (RI.isSGPRReg(MRI, SAddr.getReg()))
6728 return false;
6729
6730 int NewVAddrIdx = AMDGPU::getNamedOperandIdx(NewOpc, AMDGPU::OpName::vaddr);
6731 if (NewVAddrIdx < 0)
6732 return false;
6733
6734 int OldVAddrIdx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::vaddr);
6735
6736 // Check vaddr, it shall be zero or absent.
6737 MachineInstr *VAddrDef = nullptr;
6738 if (OldVAddrIdx >= 0) {
6739 MachineOperand &VAddr = Inst.getOperand(OldVAddrIdx);
6740 VAddrDef = MRI.getUniqueVRegDef(VAddr.getReg());
6741 if (!VAddrDef || !VAddrDef->isMoveImmediate() ||
6742 !VAddrDef->getOperand(1).isImm() ||
6743 VAddrDef->getOperand(1).getImm() != 0)
6744 return false;
6745 }
6746
6747 const MCInstrDesc &NewDesc = get(NewOpc);
6748 Inst.setDesc(NewDesc);
6749
6750 // Callers expect iterator to be valid after this call, so modify the
6751 // instruction in place.
6752 if (OldVAddrIdx == NewVAddrIdx) {
6753 MachineOperand &NewVAddr = Inst.getOperand(NewVAddrIdx);
6754 // Clear use list from the old vaddr holding a zero register.
6755 MRI.removeRegOperandFromUseList(&NewVAddr);
6756 MRI.moveOperands(&NewVAddr, &SAddr, 1);
6757 Inst.removeOperand(OldSAddrIdx);
6758 // Update the use list with the pointer we have just moved from vaddr to
6759 // saddr position. Otherwise new vaddr will be missing from the use list.
6760 MRI.removeRegOperandFromUseList(&NewVAddr);
6761 MRI.addRegOperandToUseList(&NewVAddr);
6762 } else {
6763 assert(OldSAddrIdx == NewVAddrIdx);
6764
6765 if (OldVAddrIdx >= 0) {
6766 int NewVDstIn = AMDGPU::getNamedOperandIdx(NewOpc,
6767 AMDGPU::OpName::vdst_in);
6768
 6769 // removeOperand doesn't try to fix up tied operand indexes as it goes, so
6770 // it asserts. Untie the operands for now and retie them afterwards.
6771 if (NewVDstIn != -1) {
6772 int OldVDstIn = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::vdst_in);
6773 Inst.untieRegOperand(OldVDstIn);
6774 }
6775
6776 Inst.removeOperand(OldVAddrIdx);
6777
6778 if (NewVDstIn != -1) {
6779 int NewVDst = AMDGPU::getNamedOperandIdx(NewOpc, AMDGPU::OpName::vdst);
6780 Inst.tieOperands(NewVDst, NewVDstIn);
6781 }
6782 }
6783 }
6784
6785 if (VAddrDef && MRI.use_nodbg_empty(VAddrDef->getOperand(0).getReg()))
6786 VAddrDef->eraseFromParent();
6787
6788 return true;
6789}
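// Example of the rewrite above (sketch): a saddr-form flat/global access whose
// scalar address operand was actually assigned a VGPR, and whose vaddr is a
// materialized zero, is converted to the plain vaddr form with the pointer
// moved into vaddr (e.g. GLOBAL_LOAD_DWORD_SADDR -> GLOBAL_LOAD_DWORD); the
// now-dead zero V_MOV is erased afterwards.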
6790
6791// FIXME: Remove this when SelectionDAG is obsoleted.
6792void SIInstrInfo::legalizeOperandsFLAT(MachineRegisterInfo &MRI,
 6793 MachineInstr &MI) const {
6794 if (!isSegmentSpecificFLAT(MI) && !ST.hasFlatGVSMode())
6795 return;
6796
6797 // Fixup SGPR operands in VGPRs. We only select these when the DAG divergence
6798 // thinks they are uniform, so a readfirstlane should be valid.
6799 MachineOperand *SAddr = getNamedOperand(MI, AMDGPU::OpName::saddr);
6800 if (!SAddr || RI.isSGPRClass(MRI.getRegClass(SAddr->getReg())))
6801 return;
6802
 6803 if (moveFlatAddrToVGPR(MI))
 6804 return;
6805
6806 const TargetRegisterClass *DeclaredRC =
6807 getRegClass(MI.getDesc(), SAddr->getOperandNo(), &RI);
6808
6809 Register ToSGPR = readlaneVGPRToSGPR(SAddr->getReg(), MI, MRI, DeclaredRC);
6810 SAddr->setReg(ToSGPR);
6811}
6812
6813void SIInstrInfo::legalizeGenericOperand(MachineBasicBlock &InsertMBB,
 6814 MachineBasicBlock::iterator I,
 6815 const TargetRegisterClass *DstRC,
 6816 MachineOperand &Op,
 6817 MachineRegisterInfo &MRI,
 6818 const DebugLoc &DL) const {
6819 Register OpReg = Op.getReg();
6820 unsigned OpSubReg = Op.getSubReg();
6821
6822 const TargetRegisterClass *OpRC = RI.getSubClassWithSubReg(
6823 RI.getRegClassForReg(MRI, OpReg), OpSubReg);
6824
6825 // Check if operand is already the correct register class.
6826 if (DstRC == OpRC)
6827 return;
6828
6829 Register DstReg = MRI.createVirtualRegister(DstRC);
6830 auto Copy =
6831 BuildMI(InsertMBB, I, DL, get(AMDGPU::COPY), DstReg).addReg(OpReg);
6832 Op.setReg(DstReg);
6833
6834 MachineInstr *Def = MRI.getVRegDef(OpReg);
6835 if (!Def)
6836 return;
6837
6838 // Try to eliminate the copy if it is copying an immediate value.
6839 if (Def->isMoveImmediate() && DstRC != &AMDGPU::VReg_1RegClass)
6840 foldImmediate(*Copy, *Def, OpReg, &MRI);
6841
6842 bool ImpDef = Def->isImplicitDef();
6843 while (!ImpDef && Def && Def->isCopy()) {
6844 if (Def->getOperand(1).getReg().isPhysical())
6845 break;
6846 Def = MRI.getUniqueVRegDef(Def->getOperand(1).getReg());
6847 ImpDef = Def && Def->isImplicitDef();
6848 }
6849 if (!RI.isSGPRClass(DstRC) && !Copy->readsRegister(AMDGPU::EXEC, &RI) &&
6850 !ImpDef)
6851 Copy.addReg(AMDGPU::EXEC, RegState::Implicit);
6852}
6853
6854// Emit the actual waterfall loop, executing the wrapped instruction for each
6855// unique value of \p ScalarOps across all lanes. In the best case we execute 1
6856// iteration, in the worst case we execute 64 (once per lane).
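// For a single 32-bit ScalarOp, the code emitted below is roughly (sketch;
// exact opcodes and register classes depend on the wave size):
//   %cur  = V_READFIRSTLANE_B32 %vscalar      ; pick one lane's value
//   %cond = V_CMP_EQ_U32_e64 %cur, %vscalar   ; lanes holding that value
//   %save = <and-saveexec> killed %cond       ; restrict EXEC to those lanes
//   <exec> = <xor-term> <exec>, %save         ; mark those lanes as done
//   SI_WATERFALL_LOOP %LoopBB                 ; repeat until EXEC is empty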
6857static void
6858 emitLoadScalarOpsFromVGPRLoop(const SIInstrInfo &TII,
6859 MachineRegisterInfo &MRI,
6860 MachineBasicBlock &LoopBB,
6861 MachineBasicBlock &BodyBB,
6862 const DebugLoc &DL,
6863 ArrayRef<MachineOperand *> ScalarOps) {
6864 MachineFunction &MF = *LoopBB.getParent();
6865 const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>();
6866 const SIRegisterInfo *TRI = ST.getRegisterInfo();
6868 const auto *BoolXExecRC = TRI->getWaveMaskRegClass();
6869
6870 MachineBasicBlock::iterator I = LoopBB.begin();
6871 Register CondReg;
6872
6873 for (MachineOperand *ScalarOp : ScalarOps) {
6874 unsigned RegSize = TRI->getRegSizeInBits(ScalarOp->getReg(), MRI);
6875 unsigned NumSubRegs = RegSize / 32;
6876 Register VScalarOp = ScalarOp->getReg();
6877
6878 if (NumSubRegs == 1) {
6879 Register CurReg = MRI.createVirtualRegister(&AMDGPU::SReg_32_XM0RegClass);
6880
6881 BuildMI(LoopBB, I, DL, TII.get(AMDGPU::V_READFIRSTLANE_B32), CurReg)
6882 .addReg(VScalarOp);
6883
6884 Register NewCondReg = MRI.createVirtualRegister(BoolXExecRC);
6885
6886 BuildMI(LoopBB, I, DL, TII.get(AMDGPU::V_CMP_EQ_U32_e64), NewCondReg)
6887 .addReg(CurReg)
6888 .addReg(VScalarOp);
6889
6890 // Combine the comparison results with AND.
6891 if (!CondReg) // First.
6892 CondReg = NewCondReg;
6893 else { // If not the first, we create an AND.
6894 Register AndReg = MRI.createVirtualRegister(BoolXExecRC);
6895 BuildMI(LoopBB, I, DL, TII.get(LMC.AndOpc), AndReg)
6896 .addReg(CondReg)
6897 .addReg(NewCondReg);
6898 CondReg = AndReg;
6899 }
6900
6901 // Update ScalarOp operand to use the SGPR ScalarOp.
6902 ScalarOp->setReg(CurReg);
6903 ScalarOp->setIsKill();
6904 } else {
6905 SmallVector<Register, 8> ReadlanePieces;
6906 unsigned VScalarOpUndef = getUndefRegState(ScalarOp->isUndef());
6907 assert(NumSubRegs % 2 == 0 && NumSubRegs <= 32 &&
6908 "Unhandled register size");
6909
6910 for (unsigned Idx = 0; Idx < NumSubRegs; Idx += 2) {
6911 Register CurRegLo =
6912 MRI.createVirtualRegister(&AMDGPU::SReg_32_XM0RegClass);
6913 Register CurRegHi =
6914 MRI.createVirtualRegister(&AMDGPU::SReg_32_XM0RegClass);
6915
6916 // Read the next variant <- also loop target.
6917 BuildMI(LoopBB, I, DL, TII.get(AMDGPU::V_READFIRSTLANE_B32), CurRegLo)
6918 .addReg(VScalarOp, VScalarOpUndef, TRI->getSubRegFromChannel(Idx));
6919
6920 // Read the next variant <- also loop target.
6921 BuildMI(LoopBB, I, DL, TII.get(AMDGPU::V_READFIRSTLANE_B32), CurRegHi)
6922 .addReg(VScalarOp, VScalarOpUndef,
6923 TRI->getSubRegFromChannel(Idx + 1));
6924
6925 ReadlanePieces.push_back(CurRegLo);
6926 ReadlanePieces.push_back(CurRegHi);
6927
6928 // Comparison is to be done as 64-bit.
6929 Register CurReg = MRI.createVirtualRegister(&AMDGPU::SGPR_64RegClass);
6930 BuildMI(LoopBB, I, DL, TII.get(AMDGPU::REG_SEQUENCE), CurReg)
6931 .addReg(CurRegLo)
6932 .addImm(AMDGPU::sub0)
6933 .addReg(CurRegHi)
6934 .addImm(AMDGPU::sub1);
6935
6936 Register NewCondReg = MRI.createVirtualRegister(BoolXExecRC);
6937 auto Cmp = BuildMI(LoopBB, I, DL, TII.get(AMDGPU::V_CMP_EQ_U64_e64),
6938 NewCondReg)
6939 .addReg(CurReg);
6940 if (NumSubRegs <= 2)
6941 Cmp.addReg(VScalarOp);
6942 else
6943 Cmp.addReg(VScalarOp, VScalarOpUndef,
6944 TRI->getSubRegFromChannel(Idx, 2));
6945
6946 // Combine the comparison results with AND.
6947 if (!CondReg) // First.
6948 CondReg = NewCondReg;
6949 else { // If not the first, we create an AND.
6950 Register AndReg = MRI.createVirtualRegister(BoolXExecRC);
6951 BuildMI(LoopBB, I, DL, TII.get(LMC.AndOpc), AndReg)
6952 .addReg(CondReg)
6953 .addReg(NewCondReg);
6954 CondReg = AndReg;
6955 }
6956 } // End for loop.
6957
6958 const auto *SScalarOpRC =
6959 TRI->getEquivalentSGPRClass(MRI.getRegClass(VScalarOp));
6960 Register SScalarOp = MRI.createVirtualRegister(SScalarOpRC);
6961
6962 // Build scalar ScalarOp.
6963 auto Merge =
6964 BuildMI(LoopBB, I, DL, TII.get(AMDGPU::REG_SEQUENCE), SScalarOp);
6965 unsigned Channel = 0;
6966 for (Register Piece : ReadlanePieces) {
6967 Merge.addReg(Piece).addImm(TRI->getSubRegFromChannel(Channel++));
6968 }
6969
6970 // Update ScalarOp operand to use the SGPR ScalarOp.
6971 ScalarOp->setReg(SScalarOp);
6972 ScalarOp->setIsKill();
6973 }
6974 }
6975
6976 Register SaveExec = MRI.createVirtualRegister(BoolXExecRC);
6977 MRI.setSimpleHint(SaveExec, CondReg);
6978
6979 // Update EXEC to matching lanes, saving original to SaveExec.
6980 BuildMI(LoopBB, I, DL, TII.get(LMC.AndSaveExecOpc), SaveExec)
6981 .addReg(CondReg, RegState::Kill);
6982
6983 // The original instruction is here; we insert the terminators after it.
6984 I = BodyBB.end();
6985
6986 // Update EXEC, switch all done bits to 0 and all todo bits to 1.
6987 BuildMI(BodyBB, I, DL, TII.get(LMC.XorTermOpc), LMC.ExecReg)
6988 .addReg(LMC.ExecReg)
6989 .addReg(SaveExec);
6990
6991 BuildMI(BodyBB, I, DL, TII.get(AMDGPU::SI_WATERFALL_LOOP)).addMBB(&LoopBB);
6992}
6993
6994// Build a waterfall loop around \p MI, replacing the VGPR \p ScalarOp register
6995// with SGPRs by iterating over all unique values across all lanes.
6996// Returns the loop basic block that now contains \p MI.
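// The control flow produced below is (sketch):
//
//   MBB -> LoopBB -> BodyBB -> RemainderBB
//            ^_________/
//
// LoopBB re-reads the scalar operands for the remaining lanes, BodyBB holds
// the original instruction(s), and RemainderBB receives everything that
// followed them in MBB.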
6997static MachineBasicBlock *
6998 loadMBUFScalarOperandsFromVGPR(const SIInstrInfo &TII, MachineInstr &MI,
6999 ArrayRef<MachineOperand *> ScalarOps,
7000 MachineDominatorTree *MDT,
7001 MachineBasicBlock::iterator Begin = nullptr,
7002 MachineBasicBlock::iterator End = nullptr) {
7003 MachineBasicBlock &MBB = *MI.getParent();
7004 MachineFunction &MF = *MBB.getParent();
7005 const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>();
7006 const SIRegisterInfo *TRI = ST.getRegisterInfo();
7007 MachineRegisterInfo &MRI = MF.getRegInfo();
7008 if (!Begin.isValid())
7009 Begin = &MI;
7010 if (!End.isValid()) {
7011 End = &MI;
7012 ++End;
7013 }
7014 const DebugLoc &DL = MI.getDebugLoc();
7016 const auto *BoolXExecRC = TRI->getWaveMaskRegClass();
7017
7018 // Save SCC. Waterfall Loop may overwrite SCC.
7019 Register SaveSCCReg;
7020
7021 // FIXME: We should maintain SCC liveness while doing the FixSGPRCopies walk
7022 // rather than unlimited scan everywhere
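// SCC is saved by materializing it into an SGPR as 1/0 with S_CSELECT_B32
// here and is re-created after the loop with S_CMP_LG_U32 against 0 (see the
// restore in RemainderBB below).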
7023 bool SCCNotDead =
7024 MBB.computeRegisterLiveness(TRI, AMDGPU::SCC, MI,
7025 std::numeric_limits<unsigned>::max()) !=
7026 MachineBasicBlock::LQR_Dead;
7027 if (SCCNotDead) {
7028 SaveSCCReg = MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass);
7029 BuildMI(MBB, Begin, DL, TII.get(AMDGPU::S_CSELECT_B32), SaveSCCReg)
7030 .addImm(1)
7031 .addImm(0);
7032 }
7033
7034 Register SaveExec = MRI.createVirtualRegister(BoolXExecRC);
7035
7036 // Save the EXEC mask
7037 BuildMI(MBB, Begin, DL, TII.get(LMC.MovOpc), SaveExec).addReg(LMC.ExecReg);
7038
7039 // Killed uses in the instruction we are waterfalling around will be
7040 // incorrect due to the added control-flow.
7041 MachineBasicBlock::iterator AfterMI = MI;
7042 ++AfterMI;
7043 for (auto I = Begin; I != AfterMI; I++) {
7044 for (auto &MO : I->all_uses())
7045 MRI.clearKillFlags(MO.getReg());
7046 }
7047
7048 // To insert the loop we need to split the block. Move everything after this
7049 // point to a new block, and insert a new empty block between the two.
7050 MachineBasicBlock *LoopBB = MF.CreateMachineBasicBlock();
7051 MachineBasicBlock *BodyBB = MF.CreateMachineBasicBlock();
7052 MachineBasicBlock *RemainderBB = MF.CreateMachineBasicBlock();
7053 MachineFunction::iterator MBBI(MBB);
7054 ++MBBI;
7055
7056 MF.insert(MBBI, LoopBB);
7057 MF.insert(MBBI, BodyBB);
7058 MF.insert(MBBI, RemainderBB);
7059
7060 LoopBB->addSuccessor(BodyBB);
7061 BodyBB->addSuccessor(LoopBB);
7062 BodyBB->addSuccessor(RemainderBB);
7063
7064 // Move the instructions from Begin to MI into BodyBB, and the remainder of
7065 // the block to RemainderBB.
7066 RemainderBB->transferSuccessorsAndUpdatePHIs(&MBB);
7067 RemainderBB->splice(RemainderBB->begin(), &MBB, End, MBB.end());
7068 BodyBB->splice(BodyBB->begin(), &MBB, Begin, MBB.end());
7069
7070 MBB.addSuccessor(LoopBB);
7071
7072 // Update dominators. We know that MBB immediately dominates LoopBB, that
7073 // LoopBB immediately dominates BodyBB, and BodyBB immediately dominates
7074 // RemainderBB. RemainderBB immediately dominates all of the successors
7075 // transferred to it from MBB that MBB used to properly dominate.
7076 if (MDT) {
7077 MDT->addNewBlock(LoopBB, &MBB);
7078 MDT->addNewBlock(BodyBB, LoopBB);
7079 MDT->addNewBlock(RemainderBB, BodyBB);
7080 for (auto &Succ : RemainderBB->successors()) {
7081 if (MDT->properlyDominates(&MBB, Succ)) {
7082 MDT->changeImmediateDominator(Succ, RemainderBB);
7083 }
7084 }
7085 }
7086
7087 emitLoadScalarOpsFromVGPRLoop(TII, MRI, *LoopBB, *BodyBB, DL, ScalarOps);
7088
7089 MachineBasicBlock::iterator First = RemainderBB->begin();
7090 // Restore SCC
7091 if (SCCNotDead) {
7092 BuildMI(*RemainderBB, First, DL, TII.get(AMDGPU::S_CMP_LG_U32))
7093 .addReg(SaveSCCReg, RegState::Kill)
7094 .addImm(0);
7095 }
7096
7097 // Restore the EXEC mask
7098 BuildMI(*RemainderBB, First, DL, TII.get(LMC.MovOpc), LMC.ExecReg)
7099 .addReg(SaveExec);
7100 return BodyBB;
7101}
7102
7103// Extract pointer from Rsrc and return a zero-value Rsrc replacement.
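// The returned pair is (RsrcPtr, NewSRsrc): RsrcPtr is the original 64-bit
// base pointer (sub0_sub1 of Rsrc), and NewSRsrc is a descriptor whose base
// is zero and whose upper two dwords hold the default RSRC_DATA_FORMAT.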
7104static std::tuple<unsigned, unsigned>
7105 extractRsrcPtr(const SIInstrInfo &TII, MachineInstr &MI, MachineOperand &Rsrc) {
7106 MachineBasicBlock &MBB = *MI.getParent();
7107 MachineFunction &MF = *MBB.getParent();
7108 MachineRegisterInfo &MRI = MF.getRegInfo();
7109
7110 // Extract the ptr from the resource descriptor.
7111 unsigned RsrcPtr =
7112 TII.buildExtractSubReg(MI, MRI, Rsrc, &AMDGPU::VReg_128RegClass,
7113 AMDGPU::sub0_sub1, &AMDGPU::VReg_64RegClass);
7114
7115 // Create an empty resource descriptor
7116 Register Zero64 = MRI.createVirtualRegister(&AMDGPU::SReg_64RegClass);
7117 Register SRsrcFormatLo = MRI.createVirtualRegister(&AMDGPU::SGPR_32RegClass);
7118 Register SRsrcFormatHi = MRI.createVirtualRegister(&AMDGPU::SGPR_32RegClass);
7119 Register NewSRsrc = MRI.createVirtualRegister(&AMDGPU::SGPR_128RegClass);
7120 uint64_t RsrcDataFormat = TII.getDefaultRsrcDataFormat();
7121
7122 // Zero64 = 0
7123 BuildMI(MBB, MI, MI.getDebugLoc(), TII.get(AMDGPU::S_MOV_B64), Zero64)
7124 .addImm(0);
7125
7126 // SRsrcFormatLo = RSRC_DATA_FORMAT{31-0}
7127 BuildMI(MBB, MI, MI.getDebugLoc(), TII.get(AMDGPU::S_MOV_B32), SRsrcFormatLo)
7128 .addImm(Lo_32(RsrcDataFormat));
7129
7130 // SRsrcFormatHi = RSRC_DATA_FORMAT{63-32}
7131 BuildMI(MBB, MI, MI.getDebugLoc(), TII.get(AMDGPU::S_MOV_B32), SRsrcFormatHi)
7132 .addImm(Hi_32(RsrcDataFormat));
7133
7134 // NewSRsrc = {Zero64, SRsrcFormat}
7135 BuildMI(MBB, MI, MI.getDebugLoc(), TII.get(AMDGPU::REG_SEQUENCE), NewSRsrc)
7136 .addReg(Zero64)
7137 .addImm(AMDGPU::sub0_sub1)
7138 .addReg(SRsrcFormatLo)
7139 .addImm(AMDGPU::sub2)
7140 .addReg(SRsrcFormatHi)
7141 .addImm(AMDGPU::sub3);
7142
7143 return std::tuple(RsrcPtr, NewSRsrc);
7144}
7145
7146 MachineBasicBlock *
7147 SIInstrInfo::legalizeOperands(MachineInstr &MI,
7148 MachineDominatorTree *MDT) const {
7149 MachineFunction &MF = *MI.getParent()->getParent();
7150 MachineRegisterInfo &MRI = MF.getRegInfo();
7151 MachineBasicBlock *CreatedBB = nullptr;
7152
7153 // Legalize VOP2
7154 if (isVOP2(MI) || isVOPC(MI)) {
7155 legalizeOperandsVOP2(MRI, MI);
7156 return CreatedBB;
7157 }
7158
7159 // Legalize VOP3
7160 if (isVOP3(MI)) {
7161 legalizeOperandsVOP3(MRI, MI);
7162 return CreatedBB;
7163 }
7164
7165 // Legalize SMRD
7166 if (isSMRD(MI)) {
7167 legalizeOperandsSMRD(MRI, MI);
7168 return CreatedBB;
7169 }
7170
7171 // Legalize FLAT
7172 if (isFLAT(MI)) {
7173 legalizeOperandsFLAT(MRI, MI);
7174 return CreatedBB;
7175 }
7176
7177 // Legalize REG_SEQUENCE and PHI
7178 // The register class of the operands must be the same type as the register
7179 // class of the output.
7180 if (MI.getOpcode() == AMDGPU::PHI) {
7181 const TargetRegisterClass *RC = nullptr, *SRC = nullptr, *VRC = nullptr;
7182 for (unsigned i = 1, e = MI.getNumOperands(); i != e; i += 2) {
7183 if (!MI.getOperand(i).isReg() || !MI.getOperand(i).getReg().isVirtual())
7184 continue;
7185 const TargetRegisterClass *OpRC =
7186 MRI.getRegClass(MI.getOperand(i).getReg());
7187 if (RI.hasVectorRegisters(OpRC)) {
7188 VRC = OpRC;
7189 } else {
7190 SRC = OpRC;
7191 }
7192 }
7193
7194 // If any of the operands are VGPR registers, then they all must be;
7195 // otherwise we will create illegal VGPR->SGPR copies when legalizing
7196 // them.
7197 if (VRC || !RI.isSGPRClass(getOpRegClass(MI, 0))) {
7198 if (!VRC) {
7199 assert(SRC);
7200 if (getOpRegClass(MI, 0) == &AMDGPU::VReg_1RegClass) {
7201 VRC = &AMDGPU::VReg_1RegClass;
7202 } else
7203 VRC = RI.isAGPRClass(getOpRegClass(MI, 0))
7204 ? RI.getEquivalentAGPRClass(SRC)
7205 : RI.getEquivalentVGPRClass(SRC);
7206 } else {
7207 VRC = RI.isAGPRClass(getOpRegClass(MI, 0))
7208 ? RI.getEquivalentAGPRClass(VRC)
7209 : RI.getEquivalentVGPRClass(VRC);
7210 }
7211 RC = VRC;
7212 } else {
7213 RC = SRC;
7214 }
7215
7216 // Update all the operands so they have the same type.
7217 for (unsigned I = 1, E = MI.getNumOperands(); I != E; I += 2) {
7218 MachineOperand &Op = MI.getOperand(I);
7219 if (!Op.isReg() || !Op.getReg().isVirtual())
7220 continue;
7221
7222 // MI is a PHI instruction.
7223 MachineBasicBlock *InsertBB = MI.getOperand(I + 1).getMBB();
7224 MachineBasicBlock::iterator Insert = InsertBB->getFirstTerminator();
7225
7226 // Avoid creating no-op copies with the same src and dst reg class. These
7227 // confuse some of the machine passes.
7228 legalizeGenericOperand(*InsertBB, Insert, RC, Op, MRI, MI.getDebugLoc());
7229 }
7230 }
7231
7232 // REG_SEQUENCE doesn't really require operand legalization, but if one has a
7233 // VGPR dest type and SGPR sources, insert copies so all operands are
7234 // VGPRs. This seems to help operand folding / the register coalescer.
7235 if (MI.getOpcode() == AMDGPU::REG_SEQUENCE) {
7236 MachineBasicBlock *MBB = MI.getParent();
7237 const TargetRegisterClass *DstRC = getOpRegClass(MI, 0);
7238 if (RI.hasVGPRs(DstRC)) {
7239 // Update all the operands so they are VGPR register classes. These may
7240 // not be the same register class because REG_SEQUENCE supports mixing
7241 // subregister index types e.g. sub0_sub1 + sub2 + sub3
7242 for (unsigned I = 1, E = MI.getNumOperands(); I != E; I += 2) {
7243 MachineOperand &Op = MI.getOperand(I);
7244 if (!Op.isReg() || !Op.getReg().isVirtual())
7245 continue;
7246
7247 const TargetRegisterClass *OpRC = MRI.getRegClass(Op.getReg());
7248 const TargetRegisterClass *VRC = RI.getEquivalentVGPRClass(OpRC);
7249 if (VRC == OpRC)
7250 continue;
7251
7252 legalizeGenericOperand(*MBB, MI, VRC, Op, MRI, MI.getDebugLoc());
7253 Op.setIsKill();
7254 }
7255 }
7256
7257 return CreatedBB;
7258 }
7259
7260 // Legalize INSERT_SUBREG
7261 // src0 must have the same register class as dst
7262 if (MI.getOpcode() == AMDGPU::INSERT_SUBREG) {
7263 Register Dst = MI.getOperand(0).getReg();
7264 Register Src0 = MI.getOperand(1).getReg();
7265 const TargetRegisterClass *DstRC = MRI.getRegClass(Dst);
7266 const TargetRegisterClass *Src0RC = MRI.getRegClass(Src0);
7267 if (DstRC != Src0RC) {
7268 MachineBasicBlock *MBB = MI.getParent();
7269 MachineOperand &Op = MI.getOperand(1);
7270 legalizeGenericOperand(*MBB, MI, DstRC, Op, MRI, MI.getDebugLoc());
7271 }
7272 return CreatedBB;
7273 }
7274
7275 // Legalize SI_INIT_M0
7276 if (MI.getOpcode() == AMDGPU::SI_INIT_M0) {
7277 MachineOperand &Src = MI.getOperand(0);
7278 if (Src.isReg() && RI.hasVectorRegisters(MRI.getRegClass(Src.getReg())))
7279 Src.setReg(readlaneVGPRToSGPR(Src.getReg(), MI, MRI));
7280 return CreatedBB;
7281 }
7282
7283 // Legalize S_BITREPLICATE, S_QUADMASK and S_WQM
7284 if (MI.getOpcode() == AMDGPU::S_BITREPLICATE_B64_B32 ||
7285 MI.getOpcode() == AMDGPU::S_QUADMASK_B32 ||
7286 MI.getOpcode() == AMDGPU::S_QUADMASK_B64 ||
7287 MI.getOpcode() == AMDGPU::S_WQM_B32 ||
7288 MI.getOpcode() == AMDGPU::S_WQM_B64 ||
7289 MI.getOpcode() == AMDGPU::S_INVERSE_BALLOT_U32 ||
7290 MI.getOpcode() == AMDGPU::S_INVERSE_BALLOT_U64) {
7291 MachineOperand &Src = MI.getOperand(1);
7292 if (Src.isReg() && RI.hasVectorRegisters(MRI.getRegClass(Src.getReg())))
7293 Src.setReg(readlaneVGPRToSGPR(Src.getReg(), MI, MRI));
7294 return CreatedBB;
7295 }
7296
7297 // Legalize MIMG/VIMAGE/VSAMPLE and MUBUF/MTBUF for shaders.
7298 //
7299 // Shaders only generate MUBUF/MTBUF instructions via intrinsics or via
7300 // scratch memory access. In both cases, the legalization never involves
7301 // conversion to the addr64 form.
7302 if (isImage(MI) || (AMDGPU::isGraphics(MF.getFunction().getCallingConv()) &&
7303 (isMUBUF(MI) || isMTBUF(MI)))) {
7304 AMDGPU::OpName RSrcOpName = (isVIMAGE(MI) || isVSAMPLE(MI))
7305 ? AMDGPU::OpName::rsrc
7306 : AMDGPU::OpName::srsrc;
7307 MachineOperand *SRsrc = getNamedOperand(MI, RSrcOpName);
7308 if (SRsrc && !RI.isSGPRClass(MRI.getRegClass(SRsrc->getReg())))
7309 CreatedBB = loadMBUFScalarOperandsFromVGPR(*this, MI, {SRsrc}, MDT);
7310
7311 AMDGPU::OpName SampOpName =
7312 isMIMG(MI) ? AMDGPU::OpName::ssamp : AMDGPU::OpName::samp;
7313 MachineOperand *SSamp = getNamedOperand(MI, SampOpName);
7314 if (SSamp && !RI.isSGPRClass(MRI.getRegClass(SSamp->getReg())))
7315 CreatedBB = loadMBUFScalarOperandsFromVGPR(*this, MI, {SSamp}, MDT);
7316
7317 return CreatedBB;
7318 }
7319
7320 // Legalize SI_CALL
7321 if (MI.getOpcode() == AMDGPU::SI_CALL_ISEL) {
7322 MachineOperand *Dest = &MI.getOperand(0);
7323 if (!RI.isSGPRClass(MRI.getRegClass(Dest->getReg()))) {
7324 // Move everything between ADJCALLSTACKUP and ADJCALLSTACKDOWN and
7325 // following copies, we also need to move copies from and to physical
7326 // registers into the loop block.
7327 unsigned FrameSetupOpcode = getCallFrameSetupOpcode();
7328 unsigned FrameDestroyOpcode = getCallFrameDestroyOpcode();
7329
7330 // Also move the copies to physical registers into the loop block
7331 MachineBasicBlock &MBB = *MI.getParent();
7332 MachineBasicBlock::iterator Start(&MI);
7333 while (Start->getOpcode() != FrameSetupOpcode)
7334 --Start;
7335 MachineBasicBlock::iterator End(&MI);
7336 while (End->getOpcode() != FrameDestroyOpcode)
7337 ++End;
7338 // Also include following copies of the return value
7339 ++End;
7340 while (End != MBB.end() && End->isCopy() && End->getOperand(1).isReg() &&
7341 MI.definesRegister(End->getOperand(1).getReg(), /*TRI=*/nullptr))
7342 ++End;
7343 CreatedBB =
7344 loadMBUFScalarOperandsFromVGPR(*this, MI, {Dest}, MDT, Start, End);
7345 }
7346 }
7347
7348 // Legalize s_sleep_var.
7349 if (MI.getOpcode() == AMDGPU::S_SLEEP_VAR) {
7350 const DebugLoc &DL = MI.getDebugLoc();
7351 Register Reg = MRI.createVirtualRegister(&AMDGPU::SReg_32_XM0RegClass);
7352 int Src0Idx =
7353 AMDGPU::getNamedOperandIdx(MI.getOpcode(), AMDGPU::OpName::src0);
7354 MachineOperand &Src0 = MI.getOperand(Src0Idx);
7355 BuildMI(*MI.getParent(), MI, DL, get(AMDGPU::V_READFIRSTLANE_B32), Reg)
7356 .add(Src0);
7357 Src0.ChangeToRegister(Reg, false);
7358 return nullptr;
7359 }
7360
7361 // Legalize TENSOR_LOAD_TO_LDS, TENSOR_LOAD_TO_LDS_D2, TENSOR_STORE_FROM_LDS,
7362 // TENSOR_STORE_FROM_LDS_D2. All their operands are scalar.
7363 if (MI.getOpcode() == AMDGPU::TENSOR_LOAD_TO_LDS ||
7364 MI.getOpcode() == AMDGPU::TENSOR_LOAD_TO_LDS_D2 ||
7365 MI.getOpcode() == AMDGPU::TENSOR_STORE_FROM_LDS ||
7366 MI.getOpcode() == AMDGPU::TENSOR_STORE_FROM_LDS_D2) {
7367 for (MachineOperand &Src : MI.explicit_operands()) {
7368 if (Src.isReg() && RI.hasVectorRegisters(MRI.getRegClass(Src.getReg())))
7369 Src.setReg(readlaneVGPRToSGPR(Src.getReg(), MI, MRI));
7370 }
7371 return CreatedBB;
7372 }
7373
7374 // Legalize MUBUF instructions.
7375 bool isSoffsetLegal = true;
7376 int SoffsetIdx =
7377 AMDGPU::getNamedOperandIdx(MI.getOpcode(), AMDGPU::OpName::soffset);
7378 if (SoffsetIdx != -1) {
7379 MachineOperand *Soffset = &MI.getOperand(SoffsetIdx);
7380 if (Soffset->isReg() && Soffset->getReg().isVirtual() &&
7381 !RI.isSGPRClass(MRI.getRegClass(Soffset->getReg()))) {
7382 isSoffsetLegal = false;
7383 }
7384 }
7385
7386 bool isRsrcLegal = true;
7387 int RsrcIdx =
7388 AMDGPU::getNamedOperandIdx(MI.getOpcode(), AMDGPU::OpName::srsrc);
7389 if (RsrcIdx != -1) {
7390 MachineOperand *Rsrc = &MI.getOperand(RsrcIdx);
7391 if (Rsrc->isReg() && !RI.isSGPRReg(MRI, Rsrc->getReg()))
7392 isRsrcLegal = false;
7393 }
7394
7395 // The operands are legal.
7396 if (isRsrcLegal && isSoffsetLegal)
7397 return CreatedBB;
7398
7399 if (!isRsrcLegal) {
7400 // Legalize a VGPR Rsrc
7401 //
7402 // If the instruction is _ADDR64, we can avoid a waterfall by extracting
7403 // the base pointer from the VGPR Rsrc, adding it to the VAddr, then using
7404 // a zero-value SRsrc.
7405 //
7406 // If the instruction is _OFFSET (both idxen and offen disabled), and we
7407 // support ADDR64 instructions, we can convert to ADDR64 and do the same as
7408 // above.
7409 //
7410 // Otherwise we are on non-ADDR64 hardware, and/or we have
7411 // idxen/offen/bothen and we fall back to a waterfall loop.
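// For example, an _OFFSET form load with a VGPR rsrc on ADDR64-capable
// hardware is rewritten (roughly) as:
//   vaddr  = rsrc[63:0]               ; base pointer from extractRsrcPtr()
//   srsrc  = zero-base descriptor     ; also from extractRsrcPtr()
//   opcode = the corresponding _ADDR64 variant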
7412
7413 MachineOperand *Rsrc = &MI.getOperand(RsrcIdx);
7414 MachineBasicBlock &MBB = *MI.getParent();
7415
7416 MachineOperand *VAddr = getNamedOperand(MI, AMDGPU::OpName::vaddr);
7417 if (VAddr && AMDGPU::getIfAddr64Inst(MI.getOpcode()) != -1) {
7418 // This is already an ADDR64 instruction so we need to add the pointer
7419 // extracted from the resource descriptor to the current value of VAddr.
7420 Register NewVAddrLo = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
7421 Register NewVAddrHi = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
7422 Register NewVAddr = MRI.createVirtualRegister(&AMDGPU::VReg_64RegClass);
7423
7424 const auto *BoolXExecRC = RI.getWaveMaskRegClass();
7425 Register CondReg0 = MRI.createVirtualRegister(BoolXExecRC);
7426 Register CondReg1 = MRI.createVirtualRegister(BoolXExecRC);
7427
7428 unsigned RsrcPtr, NewSRsrc;
7429 std::tie(RsrcPtr, NewSRsrc) = extractRsrcPtr(*this, MI, *Rsrc);
7430
7431 // NewVaddrLo = RsrcPtr:sub0 + VAddr:sub0
7432 const DebugLoc &DL = MI.getDebugLoc();
7433 BuildMI(MBB, MI, DL, get(AMDGPU::V_ADD_CO_U32_e64), NewVAddrLo)
7434 .addDef(CondReg0)
7435 .addReg(RsrcPtr, 0, AMDGPU::sub0)
7436 .addReg(VAddr->getReg(), 0, AMDGPU::sub0)
7437 .addImm(0);
7438
7439 // NewVaddrHi = RsrcPtr:sub1 + VAddr:sub1
7440 BuildMI(MBB, MI, DL, get(AMDGPU::V_ADDC_U32_e64), NewVAddrHi)
7441 .addDef(CondReg1, RegState::Dead)
7442 .addReg(RsrcPtr, 0, AMDGPU::sub1)
7443 .addReg(VAddr->getReg(), 0, AMDGPU::sub1)
7444 .addReg(CondReg0, RegState::Kill)
7445 .addImm(0);
7446
7447 // NewVaddr = {NewVaddrHi, NewVaddrLo}
7448 BuildMI(MBB, MI, MI.getDebugLoc(), get(AMDGPU::REG_SEQUENCE), NewVAddr)
7449 .addReg(NewVAddrLo)
7450 .addImm(AMDGPU::sub0)
7451 .addReg(NewVAddrHi)
7452 .addImm(AMDGPU::sub1);
7453
7454 VAddr->setReg(NewVAddr);
7455 Rsrc->setReg(NewSRsrc);
7456 } else if (!VAddr && ST.hasAddr64()) {
7457 // This instruction is the _OFFSET variant, so we need to convert it to
7458 // ADDR64.
7459 assert(ST.getGeneration() < AMDGPUSubtarget::VOLCANIC_ISLANDS &&
7460 "FIXME: Need to emit flat atomics here");
7461
7462 unsigned RsrcPtr, NewSRsrc;
7463 std::tie(RsrcPtr, NewSRsrc) = extractRsrcPtr(*this, MI, *Rsrc);
7464
7465 Register NewVAddr = MRI.createVirtualRegister(&AMDGPU::VReg_64RegClass);
7466 MachineOperand *VData = getNamedOperand(MI, AMDGPU::OpName::vdata);
7467 MachineOperand *Offset = getNamedOperand(MI, AMDGPU::OpName::offset);
7468 MachineOperand *SOffset = getNamedOperand(MI, AMDGPU::OpName::soffset);
7469 unsigned Addr64Opcode = AMDGPU::getAddr64Inst(MI.getOpcode());
7470
7471 // Atomics with return have an additional tied operand and are
7472 // missing some of the special bits.
7473 MachineOperand *VDataIn = getNamedOperand(MI, AMDGPU::OpName::vdata_in);
7474 MachineInstr *Addr64;
7475
7476 if (!VDataIn) {
7477 // Regular buffer load / store.
7478 MachineInstrBuilder MIB =
7479 BuildMI(MBB, MI, MI.getDebugLoc(), get(Addr64Opcode))
7480 .add(*VData)
7481 .addReg(NewVAddr)
7482 .addReg(NewSRsrc)
7483 .add(*SOffset)
7484 .add(*Offset);
7485
7486 if (const MachineOperand *CPol =
7487 getNamedOperand(MI, AMDGPU::OpName::cpol)) {
7488 MIB.addImm(CPol->getImm());
7489 }
7490
7491 if (const MachineOperand *TFE =
7492 getNamedOperand(MI, AMDGPU::OpName::tfe)) {
7493 MIB.addImm(TFE->getImm());
7494 }
7495
7496 MIB.addImm(getNamedImmOperand(MI, AMDGPU::OpName::swz));
7497
7498 MIB.cloneMemRefs(MI);
7499 Addr64 = MIB;
7500 } else {
7501 // Atomics with return.
7502 Addr64 = BuildMI(MBB, MI, MI.getDebugLoc(), get(Addr64Opcode))
7503 .add(*VData)
7504 .add(*VDataIn)
7505 .addReg(NewVAddr)
7506 .addReg(NewSRsrc)
7507 .add(*SOffset)
7508 .add(*Offset)
7509 .addImm(getNamedImmOperand(MI, AMDGPU::OpName::cpol))
7510 .cloneMemRefs(MI);
7511 }
7512
7513 MI.removeFromParent();
7514
7515 // NewVaddr = {NewVaddrHi, NewVaddrLo}
7516 BuildMI(MBB, Addr64, Addr64->getDebugLoc(), get(AMDGPU::REG_SEQUENCE),
7517 NewVAddr)
7518 .addReg(RsrcPtr, 0, AMDGPU::sub0)
7519 .addImm(AMDGPU::sub0)
7520 .addReg(RsrcPtr, 0, AMDGPU::sub1)
7521 .addImm(AMDGPU::sub1);
7522 } else {
7523 // Legalize a VGPR Rsrc and soffset together.
7524 if (!isSoffsetLegal) {
7525 MachineOperand *Soffset = getNamedOperand(MI, AMDGPU::OpName::soffset);
7526 CreatedBB =
7527 loadMBUFScalarOperandsFromVGPR(*this, MI, {Rsrc, Soffset}, MDT);
7528 return CreatedBB;
7529 }
7530 CreatedBB = loadMBUFScalarOperandsFromVGPR(*this, MI, {Rsrc}, MDT);
7531 return CreatedBB;
7532 }
7533 }
7534
7535 // Legalize a VGPR soffset.
7536 if (!isSoffsetLegal) {
7537 MachineOperand *Soffset = getNamedOperand(MI, AMDGPU::OpName::soffset);
7538 CreatedBB = loadMBUFScalarOperandsFromVGPR(*this, MI, {Soffset}, MDT);
7539 return CreatedBB;
7540 }
7541 return CreatedBB;
7542}
7543
7544 void SIInstrWorklist::insert(MachineInstr *MI) {
7545 InstrList.insert(MI);
7546 // Add MBUF instructions to the deferred list.
7547 int RsrcIdx =
7548 AMDGPU::getNamedOperandIdx(MI->getOpcode(), AMDGPU::OpName::srsrc);
7549 if (RsrcIdx != -1) {
7550 DeferredList.insert(MI);
7551 }
7552}
7553
7554 bool SIInstrWorklist::isDeferred(MachineInstr *MI) {
7555 return DeferredList.contains(MI);
7556}
7557
7558 // Legalize size mismatches between 16-bit and 32-bit registers in v2s copy
7559 // lowering (change sgpr to vgpr).
7560 // This is mainly caused by 16-bit SALU and 16-bit VALU using registers of
7561 // different sizes. We need to legalize the size of the operands during the
7562 // vgpr lowering chain. This can be removed once sgpr16 is in place.
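// For example, a 32-bit VGPR feeding a 16-bit use is narrowed by tagging the
// operand with the lo16 subregister index, while a 16-bit VGPR feeding a
// 32-bit use is widened with (sketch):
//   %wide:vgpr_32 = REG_SEQUENCE %val:vgpr_16, %subreg.lo16,
//                                undef %tmp:vgpr_16, %subreg.hi16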
7563 void SIInstrInfo::legalizeOperandsVALUt16(MachineInstr &MI, unsigned OpIdx,
7564 MachineRegisterInfo &MRI) const {
7565 if (!ST.useRealTrue16Insts())
7566 return;
7567
7568 unsigned Opcode = MI.getOpcode();
7569 MachineBasicBlock *MBB = MI.getParent();
7570 // Legalize operands and check for size mismatch
7571 if (!OpIdx || OpIdx >= MI.getNumExplicitOperands() ||
7572 OpIdx >= get(Opcode).getNumOperands() ||
7573 get(Opcode).operands()[OpIdx].RegClass == -1)
7574 return;
7575
7576 MachineOperand &Op = MI.getOperand(OpIdx);
7577 if (!Op.isReg() || !Op.getReg().isVirtual())
7578 return;
7579
7580 const TargetRegisterClass *CurrRC = MRI.getRegClass(Op.getReg());
7581 if (!RI.isVGPRClass(CurrRC))
7582 return;
7583
7584 int16_t RCID = getOpRegClassID(get(Opcode).operands()[OpIdx]);
7585 const TargetRegisterClass *ExpectedRC = RI.getRegClass(RCID);
7586 if (RI.getMatchingSuperRegClass(CurrRC, ExpectedRC, AMDGPU::lo16)) {
7587 Op.setSubReg(AMDGPU::lo16);
7588 } else if (RI.getMatchingSuperRegClass(ExpectedRC, CurrRC, AMDGPU::lo16)) {
7589 const DebugLoc &DL = MI.getDebugLoc();
7590 Register NewDstReg = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
7591 Register Undef = MRI.createVirtualRegister(&AMDGPU::VGPR_16RegClass);
7592 BuildMI(*MBB, MI, DL, get(AMDGPU::IMPLICIT_DEF), Undef);
7593 BuildMI(*MBB, MI, DL, get(AMDGPU::REG_SEQUENCE), NewDstReg)
7594 .addReg(Op.getReg())
7595 .addImm(AMDGPU::lo16)
7596 .addReg(Undef)
7597 .addImm(AMDGPU::hi16);
7598 Op.setReg(NewDstReg);
7599 }
7600}
7601 void SIInstrInfo::legalizeOperandsVALUt16(MachineInstr &MI,
7602 MachineRegisterInfo &MRI) const {
7603 for (unsigned OpIdx = 1; OpIdx < MI.getNumExplicitOperands(); OpIdx++)
7604 legalizeOperandsVALUt16(MI, OpIdx, MRI);
7605}
7606
7607 void SIInstrInfo::moveToVALU(SIInstrWorklist &Worklist,
7608 MachineDominatorTree *MDT) const {
7609
7610 while (!Worklist.empty()) {
7611 MachineInstr &Inst = *Worklist.top();
7612 Worklist.erase_top();
7613 // Skip MachineInstr in the deferred list.
7614 if (Worklist.isDeferred(&Inst))
7615 continue;
7616 moveToVALUImpl(Worklist, MDT, Inst);
7617 }
7618
7619 // The deferred list of instructions will be processed once
7620 // all the MachineInstrs in the worklist are done.
7621 for (MachineInstr *Inst : Worklist.getDeferredList()) {
7622 moveToVALUImpl(Worklist, MDT, *Inst);
7623 assert(Worklist.empty() &&
7624 "Deferred MachineInstr are not supposed to re-populate worklist");
7625 }
7626}
7627
7628 void SIInstrInfo::moveToVALUImpl(SIInstrWorklist &Worklist,
7629 MachineDominatorTree *MDT,
7630 MachineInstr &Inst) const {
7631
7632 MachineBasicBlock *MBB = Inst.getParent();
7633 if (!MBB)
7634 return;
7635 MachineRegisterInfo &MRI = MBB->getParent()->getRegInfo();
7636 unsigned Opcode = Inst.getOpcode();
7637 unsigned NewOpcode = getVALUOp(Inst);
7638 // Handle some special cases
7639 switch (Opcode) {
7640 default:
7641 break;
7642 case AMDGPU::S_ADD_I32:
7643 case AMDGPU::S_SUB_I32: {
7644 // FIXME: The u32 versions currently selected use the carry.
7645 bool Changed;
7646 MachineBasicBlock *CreatedBBTmp = nullptr;
7647 std::tie(Changed, CreatedBBTmp) = moveScalarAddSub(Worklist, Inst, MDT);
7648 if (Changed)
7649 return;
7650
7651 // Default handling
7652 break;
7653 }
7654
7655 case AMDGPU::S_MUL_U64:
7656 if (ST.hasVectorMulU64()) {
7657 NewOpcode = AMDGPU::V_MUL_U64_e64;
7658 break;
7659 }
7660 // Split s_mul_u64 in 32-bit vector multiplications.
7661 splitScalarSMulU64(Worklist, Inst, MDT);
7662 Inst.eraseFromParent();
7663 return;
7664
7665 case AMDGPU::S_MUL_U64_U32_PSEUDO:
7666 case AMDGPU::S_MUL_I64_I32_PSEUDO:
7667 // This is a special case of s_mul_u64 where all the operands are either
7668 // zero extended or sign extended.
7669 splitScalarSMulPseudo(Worklist, Inst, MDT);
7670 Inst.eraseFromParent();
7671 return;
7672
7673 case AMDGPU::S_AND_B64:
7674 splitScalar64BitBinaryOp(Worklist, Inst, AMDGPU::S_AND_B32, MDT);
7675 Inst.eraseFromParent();
7676 return;
7677
7678 case AMDGPU::S_OR_B64:
7679 splitScalar64BitBinaryOp(Worklist, Inst, AMDGPU::S_OR_B32, MDT);
7680 Inst.eraseFromParent();
7681 return;
7682
7683 case AMDGPU::S_XOR_B64:
7684 splitScalar64BitBinaryOp(Worklist, Inst, AMDGPU::S_XOR_B32, MDT);
7685 Inst.eraseFromParent();
7686 return;
7687
7688 case AMDGPU::S_NAND_B64:
7689 splitScalar64BitBinaryOp(Worklist, Inst, AMDGPU::S_NAND_B32, MDT);
7690 Inst.eraseFromParent();
7691 return;
7692
7693 case AMDGPU::S_NOR_B64:
7694 splitScalar64BitBinaryOp(Worklist, Inst, AMDGPU::S_NOR_B32, MDT);
7695 Inst.eraseFromParent();
7696 return;
7697
7698 case AMDGPU::S_XNOR_B64:
7699 if (ST.hasDLInsts())
7700 splitScalar64BitBinaryOp(Worklist, Inst, AMDGPU::S_XNOR_B32, MDT);
7701 else
7702 splitScalar64BitXnor(Worklist, Inst, MDT);
7703 Inst.eraseFromParent();
7704 return;
7705
7706 case AMDGPU::S_ANDN2_B64:
7707 splitScalar64BitBinaryOp(Worklist, Inst, AMDGPU::S_ANDN2_B32, MDT);
7708 Inst.eraseFromParent();
7709 return;
7710
7711 case AMDGPU::S_ORN2_B64:
7712 splitScalar64BitBinaryOp(Worklist, Inst, AMDGPU::S_ORN2_B32, MDT);
7713 Inst.eraseFromParent();
7714 return;
7715
7716 case AMDGPU::S_BREV_B64:
7717 splitScalar64BitUnaryOp(Worklist, Inst, AMDGPU::S_BREV_B32, true);
7718 Inst.eraseFromParent();
7719 return;
7720
7721 case AMDGPU::S_NOT_B64:
7722 splitScalar64BitUnaryOp(Worklist, Inst, AMDGPU::S_NOT_B32);
7723 Inst.eraseFromParent();
7724 return;
7725
7726 case AMDGPU::S_BCNT1_I32_B64:
7727 splitScalar64BitBCNT(Worklist, Inst);
7728 Inst.eraseFromParent();
7729 return;
7730
7731 case AMDGPU::S_BFE_I64:
7732 splitScalar64BitBFE(Worklist, Inst);
7733 Inst.eraseFromParent();
7734 return;
7735
7736 case AMDGPU::S_FLBIT_I32_B64:
7737 splitScalar64BitCountOp(Worklist, Inst, AMDGPU::V_FFBH_U32_e32);
7738 Inst.eraseFromParent();
7739 return;
7740 case AMDGPU::S_FF1_I32_B64:
7741 splitScalar64BitCountOp(Worklist, Inst, AMDGPU::V_FFBL_B32_e32);
7742 Inst.eraseFromParent();
7743 return;
7744
7745 case AMDGPU::S_LSHL_B32:
7746 if (ST.hasOnlyRevVALUShifts()) {
7747 NewOpcode = AMDGPU::V_LSHLREV_B32_e64;
7748 swapOperands(Inst);
7749 }
7750 break;
7751 case AMDGPU::S_ASHR_I32:
7752 if (ST.hasOnlyRevVALUShifts()) {
7753 NewOpcode = AMDGPU::V_ASHRREV_I32_e64;
7754 swapOperands(Inst);
7755 }
7756 break;
7757 case AMDGPU::S_LSHR_B32:
7758 if (ST.hasOnlyRevVALUShifts()) {
7759 NewOpcode = AMDGPU::V_LSHRREV_B32_e64;
7760 swapOperands(Inst);
7761 }
7762 break;
7763 case AMDGPU::S_LSHL_B64:
7764 if (ST.hasOnlyRevVALUShifts()) {
7765 NewOpcode = ST.getGeneration() >= AMDGPUSubtarget::GFX12
7766 ? AMDGPU::V_LSHLREV_B64_pseudo_e64
7767 : AMDGPU::V_LSHLREV_B64_e64;
7768 swapOperands(Inst);
7769 }
7770 break;
7771 case AMDGPU::S_ASHR_I64:
7772 if (ST.hasOnlyRevVALUShifts()) {
7773 NewOpcode = AMDGPU::V_ASHRREV_I64_e64;
7774 swapOperands(Inst);
7775 }
7776 break;
7777 case AMDGPU::S_LSHR_B64:
7778 if (ST.hasOnlyRevVALUShifts()) {
7779 NewOpcode = AMDGPU::V_LSHRREV_B64_e64;
7780 swapOperands(Inst);
7781 }
7782 break;
7783
7784 case AMDGPU::S_ABS_I32:
7785 lowerScalarAbs(Worklist, Inst);
7786 Inst.eraseFromParent();
7787 return;
7788
7789 case AMDGPU::S_CBRANCH_SCC0:
7790 case AMDGPU::S_CBRANCH_SCC1: {
7791 // Clear unused bits of vcc
7792 Register CondReg = Inst.getOperand(1).getReg();
7793 bool IsSCC = CondReg == AMDGPU::SCC;
7795 BuildMI(*MBB, Inst, Inst.getDebugLoc(), get(LMC.AndOpc), LMC.VccReg)
7796 .addReg(LMC.ExecReg)
7797 .addReg(IsSCC ? LMC.VccReg : CondReg);
7798 Inst.removeOperand(1);
7799 } break;
7800
7801 case AMDGPU::S_BFE_U64:
7802 case AMDGPU::S_BFM_B64:
7803 llvm_unreachable("Moving this op to VALU not implemented");
7804
7805 case AMDGPU::S_PACK_LL_B32_B16:
7806 case AMDGPU::S_PACK_LH_B32_B16:
7807 case AMDGPU::S_PACK_HL_B32_B16:
7808 case AMDGPU::S_PACK_HH_B32_B16:
7809 movePackToVALU(Worklist, MRI, Inst);
7810 Inst.eraseFromParent();
7811 return;
7812
7813 case AMDGPU::S_XNOR_B32:
7814 lowerScalarXnor(Worklist, Inst);
7815 Inst.eraseFromParent();
7816 return;
7817
7818 case AMDGPU::S_NAND_B32:
7819 splitScalarNotBinop(Worklist, Inst, AMDGPU::S_AND_B32);
7820 Inst.eraseFromParent();
7821 return;
7822
7823 case AMDGPU::S_NOR_B32:
7824 splitScalarNotBinop(Worklist, Inst, AMDGPU::S_OR_B32);
7825 Inst.eraseFromParent();
7826 return;
7827
7828 case AMDGPU::S_ANDN2_B32:
7829 splitScalarBinOpN2(Worklist, Inst, AMDGPU::S_AND_B32);
7830 Inst.eraseFromParent();
7831 return;
7832
7833 case AMDGPU::S_ORN2_B32:
7834 splitScalarBinOpN2(Worklist, Inst, AMDGPU::S_OR_B32);
7835 Inst.eraseFromParent();
7836 return;
7837
7838 // TODO: remove as soon as everything is ready
7839 // to replace VGPR to SGPR copy with V_READFIRSTLANEs.
7840 // S_ADD/SUB_CO_PSEUDO as well as S_UADDO/USUBO_PSEUDO
7841 // can only be selected from the uniform SDNode.
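// The lowering below turns S_ADD_CO_PSEUDO/S_SUB_CO_PSEUDO into (roughly):
//   %dst:vgpr_32, %carry_out = V_ADDC_U32_e64 %src0, %src1, %carry_in, 0
// with the carry registers constrained to the wave mask register class.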
7842 case AMDGPU::S_ADD_CO_PSEUDO:
7843 case AMDGPU::S_SUB_CO_PSEUDO: {
7844 unsigned Opc = (Inst.getOpcode() == AMDGPU::S_ADD_CO_PSEUDO)
7845 ? AMDGPU::V_ADDC_U32_e64
7846 : AMDGPU::V_SUBB_U32_e64;
7847 const auto *CarryRC = RI.getWaveMaskRegClass();
7848
7849 Register CarryInReg = Inst.getOperand(4).getReg();
7850 if (!MRI.constrainRegClass(CarryInReg, CarryRC)) {
7851 Register NewCarryReg = MRI.createVirtualRegister(CarryRC);
7852 BuildMI(*MBB, Inst, Inst.getDebugLoc(), get(AMDGPU::COPY), NewCarryReg)
7853 .addReg(CarryInReg);
7854 }
7855
7856 Register CarryOutReg = Inst.getOperand(1).getReg();
7857
7858 Register DestReg = MRI.createVirtualRegister(RI.getEquivalentVGPRClass(
7859 MRI.getRegClass(Inst.getOperand(0).getReg())));
7860 MachineInstr *CarryOp =
7861 BuildMI(*MBB, &Inst, Inst.getDebugLoc(), get(Opc), DestReg)
7862 .addReg(CarryOutReg, RegState::Define)
7863 .add(Inst.getOperand(2))
7864 .add(Inst.getOperand(3))
7865 .addReg(CarryInReg)
7866 .addImm(0);
7867 legalizeOperands(*CarryOp);
7868 MRI.replaceRegWith(Inst.getOperand(0).getReg(), DestReg);
7869 addUsersToMoveToVALUWorklist(DestReg, MRI, Worklist);
7870 Inst.eraseFromParent();
7871 }
7872 return;
7873 case AMDGPU::S_UADDO_PSEUDO:
7874 case AMDGPU::S_USUBO_PSEUDO: {
7875 const DebugLoc &DL = Inst.getDebugLoc();
7876 MachineOperand &Dest0 = Inst.getOperand(0);
7877 MachineOperand &Dest1 = Inst.getOperand(1);
7878 MachineOperand &Src0 = Inst.getOperand(2);
7879 MachineOperand &Src1 = Inst.getOperand(3);
7880
7881 unsigned Opc = (Inst.getOpcode() == AMDGPU::S_UADDO_PSEUDO)
7882 ? AMDGPU::V_ADD_CO_U32_e64
7883 : AMDGPU::V_SUB_CO_U32_e64;
7884 const TargetRegisterClass *NewRC =
7885 RI.getEquivalentVGPRClass(MRI.getRegClass(Dest0.getReg()));
7886 Register DestReg = MRI.createVirtualRegister(NewRC);
7887 MachineInstr *NewInstr = BuildMI(*MBB, &Inst, DL, get(Opc), DestReg)
7888 .addReg(Dest1.getReg(), RegState::Define)
7889 .add(Src0)
7890 .add(Src1)
7891 .addImm(0); // clamp bit
7892
7893 legalizeOperands(*NewInstr, MDT);
7894 MRI.replaceRegWith(Dest0.getReg(), DestReg);
7895 addUsersToMoveToVALUWorklist(NewInstr->getOperand(0).getReg(), MRI,
7896 Worklist);
7897 Inst.eraseFromParent();
7898 }
7899 return;
7900
7901 case AMDGPU::S_CSELECT_B32:
7902 case AMDGPU::S_CSELECT_B64:
7903 lowerSelect(Worklist, Inst, MDT);
7904 Inst.eraseFromParent();
7905 return;
7906 case AMDGPU::S_CMP_EQ_I32:
7907 case AMDGPU::S_CMP_LG_I32:
7908 case AMDGPU::S_CMP_GT_I32:
7909 case AMDGPU::S_CMP_GE_I32:
7910 case AMDGPU::S_CMP_LT_I32:
7911 case AMDGPU::S_CMP_LE_I32:
7912 case AMDGPU::S_CMP_EQ_U32:
7913 case AMDGPU::S_CMP_LG_U32:
7914 case AMDGPU::S_CMP_GT_U32:
7915 case AMDGPU::S_CMP_GE_U32:
7916 case AMDGPU::S_CMP_LT_U32:
7917 case AMDGPU::S_CMP_LE_U32:
7918 case AMDGPU::S_CMP_EQ_U64:
7919 case AMDGPU::S_CMP_LG_U64:
7920 case AMDGPU::S_CMP_LT_F32:
7921 case AMDGPU::S_CMP_EQ_F32:
7922 case AMDGPU::S_CMP_LE_F32:
7923 case AMDGPU::S_CMP_GT_F32:
7924 case AMDGPU::S_CMP_LG_F32:
7925 case AMDGPU::S_CMP_GE_F32:
7926 case AMDGPU::S_CMP_O_F32:
7927 case AMDGPU::S_CMP_U_F32:
7928 case AMDGPU::S_CMP_NGE_F32:
7929 case AMDGPU::S_CMP_NLG_F32:
7930 case AMDGPU::S_CMP_NGT_F32:
7931 case AMDGPU::S_CMP_NLE_F32:
7932 case AMDGPU::S_CMP_NEQ_F32:
7933 case AMDGPU::S_CMP_NLT_F32: {
7934 Register CondReg = MRI.createVirtualRegister(RI.getWaveMaskRegClass());
7935 auto NewInstr =
7936 BuildMI(*MBB, Inst, Inst.getDebugLoc(), get(NewOpcode), CondReg)
7937 .setMIFlags(Inst.getFlags());
7938 if (AMDGPU::getNamedOperandIdx(NewOpcode, AMDGPU::OpName::src0_modifiers) >=
7939 0) {
7940 NewInstr
7941 .addImm(0) // src0_modifiers
7942 .add(Inst.getOperand(0)) // src0
7943 .addImm(0) // src1_modifiers
7944 .add(Inst.getOperand(1)) // src1
7945 .addImm(0); // clamp
7946 } else {
7947 NewInstr.add(Inst.getOperand(0)).add(Inst.getOperand(1));
7948 }
7949 legalizeOperands(*NewInstr, MDT);
7950 int SCCIdx = Inst.findRegisterDefOperandIdx(AMDGPU::SCC, /*TRI=*/nullptr);
7951 const MachineOperand &SCCOp = Inst.getOperand(SCCIdx);
7952 addSCCDefUsersToVALUWorklist(SCCOp, Inst, Worklist, CondReg);
7953 Inst.eraseFromParent();
7954 return;
7955 }
7956 case AMDGPU::S_CMP_LT_F16:
7957 case AMDGPU::S_CMP_EQ_F16:
7958 case AMDGPU::S_CMP_LE_F16:
7959 case AMDGPU::S_CMP_GT_F16:
7960 case AMDGPU::S_CMP_LG_F16:
7961 case AMDGPU::S_CMP_GE_F16:
7962 case AMDGPU::S_CMP_O_F16:
7963 case AMDGPU::S_CMP_U_F16:
7964 case AMDGPU::S_CMP_NGE_F16:
7965 case AMDGPU::S_CMP_NLG_F16:
7966 case AMDGPU::S_CMP_NGT_F16:
7967 case AMDGPU::S_CMP_NLE_F16:
7968 case AMDGPU::S_CMP_NEQ_F16:
7969 case AMDGPU::S_CMP_NLT_F16: {
7970 Register CondReg = MRI.createVirtualRegister(RI.getWaveMaskRegClass());
7971 auto NewInstr =
7972 BuildMI(*MBB, Inst, Inst.getDebugLoc(), get(NewOpcode), CondReg)
7973 .setMIFlags(Inst.getFlags());
7974 if (AMDGPU::hasNamedOperand(NewOpcode, AMDGPU::OpName::src0_modifiers)) {
7975 NewInstr
7976 .addImm(0) // src0_modifiers
7977 .add(Inst.getOperand(0)) // src0
7978 .addImm(0) // src1_modifiers
7979 .add(Inst.getOperand(1)) // src1
7980 .addImm(0); // clamp
7981 if (AMDGPU::hasNamedOperand(NewOpcode, AMDGPU::OpName::op_sel))
7982 NewInstr.addImm(0); // op_sel0
7983 } else {
7984 NewInstr
7985 .add(Inst.getOperand(0))
7986 .add(Inst.getOperand(1));
7987 }
7988 legalizeOperandsVALUt16(*NewInstr, MRI);
7989 legalizeOperands(*NewInstr, MDT);
7990 int SCCIdx = Inst.findRegisterDefOperandIdx(AMDGPU::SCC, /*TRI=*/nullptr);
7991 const MachineOperand &SCCOp = Inst.getOperand(SCCIdx);
7992 addSCCDefUsersToVALUWorklist(SCCOp, Inst, Worklist, CondReg);
7993 Inst.eraseFromParent();
7994 return;
7995 }
7996 case AMDGPU::S_CVT_HI_F32_F16: {
7997 const DebugLoc &DL = Inst.getDebugLoc();
7998 Register TmpReg = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
7999 Register NewDst = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
8000 if (ST.useRealTrue16Insts()) {
8001 BuildMI(*MBB, Inst, DL, get(AMDGPU::COPY), TmpReg)
8002 .add(Inst.getOperand(1));
8003 BuildMI(*MBB, Inst, DL, get(NewOpcode), NewDst)
8004 .addImm(0) // src0_modifiers
8005 .addReg(TmpReg, 0, AMDGPU::hi16)
8006 .addImm(0) // clamp
8007 .addImm(0) // omod
8008 .addImm(0); // op_sel0
8009 } else {
8010 BuildMI(*MBB, Inst, DL, get(AMDGPU::V_LSHRREV_B32_e64), TmpReg)
8011 .addImm(16)
8012 .add(Inst.getOperand(1));
8013 BuildMI(*MBB, Inst, DL, get(NewOpcode), NewDst)
8014 .addImm(0) // src0_modifiers
8015 .addReg(TmpReg)
8016 .addImm(0) // clamp
8017 .addImm(0); // omod
8018 }
8019
8020 MRI.replaceRegWith(Inst.getOperand(0).getReg(), NewDst);
8021 addUsersToMoveToVALUWorklist(NewDst, MRI, Worklist);
8022 Inst.eraseFromParent();
8023 return;
8024 }
8025 case AMDGPU::S_MINIMUM_F32:
8026 case AMDGPU::S_MAXIMUM_F32: {
8027 const DebugLoc &DL = Inst.getDebugLoc();
8028 Register NewDst = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
8029 MachineInstr *NewInstr = BuildMI(*MBB, Inst, DL, get(NewOpcode), NewDst)
8030 .addImm(0) // src0_modifiers
8031 .add(Inst.getOperand(1))
8032 .addImm(0) // src1_modifiers
8033 .add(Inst.getOperand(2))
8034 .addImm(0) // clamp
8035 .addImm(0); // omod
8036 MRI.replaceRegWith(Inst.getOperand(0).getReg(), NewDst);
8037
8038 legalizeOperands(*NewInstr, MDT);
8039 addUsersToMoveToVALUWorklist(NewDst, MRI, Worklist);
8040 Inst.eraseFromParent();
8041 return;
8042 }
8043 case AMDGPU::S_MINIMUM_F16:
8044 case AMDGPU::S_MAXIMUM_F16: {
8045 const DebugLoc &DL = Inst.getDebugLoc();
8046 Register NewDst = MRI.createVirtualRegister(ST.useRealTrue16Insts()
8047 ? &AMDGPU::VGPR_16RegClass
8048 : &AMDGPU::VGPR_32RegClass);
8049 MachineInstr *NewInstr = BuildMI(*MBB, Inst, DL, get(NewOpcode), NewDst)
8050 .addImm(0) // src0_modifiers
8051 .add(Inst.getOperand(1))
8052 .addImm(0) // src1_modifiers
8053 .add(Inst.getOperand(2))
8054 .addImm(0) // clamp
8055 .addImm(0) // omod
8056 .addImm(0); // opsel0
8057 MRI.replaceRegWith(Inst.getOperand(0).getReg(), NewDst);
8058 legalizeOperandsVALUt16(*NewInstr, MRI);
8059 legalizeOperands(*NewInstr, MDT);
8060 addUsersToMoveToVALUWorklist(NewDst, MRI, Worklist);
8061 Inst.eraseFromParent();
8062 return;
8063 }
8064 case AMDGPU::V_S_EXP_F16_e64:
8065 case AMDGPU::V_S_LOG_F16_e64:
8066 case AMDGPU::V_S_RCP_F16_e64:
8067 case AMDGPU::V_S_RSQ_F16_e64:
8068 case AMDGPU::V_S_SQRT_F16_e64: {
8069 const DebugLoc &DL = Inst.getDebugLoc();
8070 Register NewDst = MRI.createVirtualRegister(ST.useRealTrue16Insts()
8071 ? &AMDGPU::VGPR_16RegClass
8072 : &AMDGPU::VGPR_32RegClass);
8073 auto NewInstr = BuildMI(*MBB, Inst, DL, get(NewOpcode), NewDst)
8074 .add(Inst.getOperand(1)) // src0_modifiers
8075 .add(Inst.getOperand(2))
8076 .add(Inst.getOperand(3)) // clamp
8077 .add(Inst.getOperand(4)) // omod
8078 .setMIFlags(Inst.getFlags());
8079 if (AMDGPU::hasNamedOperand(NewOpcode, AMDGPU::OpName::op_sel))
8080 NewInstr.addImm(0); // opsel0
8081 MRI.replaceRegWith(Inst.getOperand(0).getReg(), NewDst);
8082 legalizeOperandsVALUt16(*NewInstr, MRI);
8083 legalizeOperands(*NewInstr, MDT);
8084 addUsersToMoveToVALUWorklist(NewDst, MRI, Worklist);
8085 Inst.eraseFromParent();
8086 return;
8087 }
8088 }
8089
8090 if (NewOpcode == AMDGPU::INSTRUCTION_LIST_END) {
8091 // We cannot move this instruction to the VALU, so we should try to
8092 // legalize its operands instead.
8093 legalizeOperands(Inst, MDT);
8094 return;
8095 }
8096 // Handle converting generic instructions like COPY-to-SGPR into
8097 // COPY-to-VGPR.
8098 if (NewOpcode == Opcode) {
8099 Register DstReg = Inst.getOperand(0).getReg();
8100 const TargetRegisterClass *NewDstRC = getDestEquivalentVGPRClass(Inst);
8101
8102 // If it's a copy of a VGPR to a physical SGPR, insert a V_READFIRSTLANE and
8103 // hope for the best.
8104 if (Inst.isCopy() && DstReg.isPhysical() &&
8105 RI.isVGPR(MRI, Inst.getOperand(1).getReg())) {
8106 Register NewDst = MRI.createVirtualRegister(&AMDGPU::SReg_32_XM0RegClass);
8107 BuildMI(*Inst.getParent(), &Inst, Inst.getDebugLoc(),
8108 get(AMDGPU::V_READFIRSTLANE_B32), NewDst)
8109 .add(Inst.getOperand(1));
8110 BuildMI(*Inst.getParent(), &Inst, Inst.getDebugLoc(), get(AMDGPU::COPY),
8111 DstReg)
8112 .addReg(NewDst);
8113
8114 Inst.eraseFromParent();
8115 return;
8116 }
8117
8118 if (Inst.isCopy() && Inst.getOperand(1).getReg().isVirtual() &&
8119 NewDstRC == RI.getRegClassForReg(MRI, Inst.getOperand(1).getReg())) {
8120 // Instead of creating a copy where src and dst are the same register
8121 // class, we just replace all uses of dst with src. These kinds of
8122 // copies interfere with the heuristics MachineSink uses to decide
8123 // whether or not to split a critical edge, since the pass assumes
8124 // that copies will end up as machine instructions and not be
8125 // eliminated.
8126 addUsersToMoveToVALUWorklist(DstReg, MRI, Worklist);
8127 Register NewDstReg = Inst.getOperand(1).getReg();
8128 MRI.replaceRegWith(DstReg, NewDstReg);
8129 MRI.clearKillFlags(NewDstReg);
8130 Inst.getOperand(0).setReg(DstReg);
8131 Inst.eraseFromParent();
8132 // Legalize t16 operand since replaceReg is called after addUsersToVALU
8133 for (MachineOperand &MO :
8134 make_early_inc_range(MRI.use_operands(NewDstReg))) {
8135 legalizeOperandsVALUt16(*MO.getParent(), MRI);
8136 }
8137 return;
8138 }
8139
8140 // If this is a v2s copy between a 16-bit and a 32-bit reg,
8141 // replace the vgpr copy with a reg_sequence/extract_subreg.
8142 // This can be removed once we have sgpr16 in place.
8143 if (ST.useRealTrue16Insts() && Inst.isCopy() &&
8144 Inst.getOperand(1).getReg().isVirtual() &&
8145 RI.isVGPR(MRI, Inst.getOperand(1).getReg())) {
8146 const TargetRegisterClass *SrcRegRC = getOpRegClass(Inst, 1);
8147 if (RI.getMatchingSuperRegClass(NewDstRC, SrcRegRC, AMDGPU::lo16)) {
8148 Register NewDstReg = MRI.createVirtualRegister(NewDstRC);
8149 Register Undef = MRI.createVirtualRegister(&AMDGPU::VGPR_16RegClass);
8150 BuildMI(*Inst.getParent(), &Inst, Inst.getDebugLoc(),
8151 get(AMDGPU::IMPLICIT_DEF), Undef);
8152 BuildMI(*Inst.getParent(), &Inst, Inst.getDebugLoc(),
8153 get(AMDGPU::REG_SEQUENCE), NewDstReg)
8154 .addReg(Inst.getOperand(1).getReg())
8155 .addImm(AMDGPU::lo16)
8156 .addReg(Undef)
8157 .addImm(AMDGPU::hi16);
8158 Inst.eraseFromParent();
8159 MRI.replaceRegWith(DstReg, NewDstReg);
8160 addUsersToMoveToVALUWorklist(NewDstReg, MRI, Worklist);
8161 return;
8162 } else if (RI.getMatchingSuperRegClass(SrcRegRC, NewDstRC,
8163 AMDGPU::lo16)) {
8164 Inst.getOperand(1).setSubReg(AMDGPU::lo16);
8165 Register NewDstReg = MRI.createVirtualRegister(NewDstRC);
8166 MRI.replaceRegWith(DstReg, NewDstReg);
8167 addUsersToMoveToVALUWorklist(NewDstReg, MRI, Worklist);
8168 return;
8169 }
8170 }
8171
8172 Register NewDstReg = MRI.createVirtualRegister(NewDstRC);
8173 MRI.replaceRegWith(DstReg, NewDstReg);
8174 legalizeOperands(Inst, MDT);
8175 addUsersToMoveToVALUWorklist(NewDstReg, MRI, Worklist);
8176 return;
8177 }
8178
8179 // Use the new VALU Opcode.
8180 auto NewInstr = BuildMI(*MBB, Inst, Inst.getDebugLoc(), get(NewOpcode))
8181 .setMIFlags(Inst.getFlags());
8182 if (isVOP3(NewOpcode) && !isVOP3(Opcode)) {
8183 // Intersperse VOP3 modifiers among the SALU operands.
8184 NewInstr->addOperand(Inst.getOperand(0));
8185 if (AMDGPU::getNamedOperandIdx(NewOpcode,
8186 AMDGPU::OpName::src0_modifiers) >= 0)
8187 NewInstr.addImm(0);
8188 if (AMDGPU::hasNamedOperand(NewOpcode, AMDGPU::OpName::src0)) {
8189 const MachineOperand &Src = Inst.getOperand(1);
8190 NewInstr->addOperand(Src);
8191 }
8192
8193 if (Opcode == AMDGPU::S_SEXT_I32_I8 || Opcode == AMDGPU::S_SEXT_I32_I16) {
8194 // We are converting these to a BFE, so we need to add the missing
8195 // operands for the size and offset.
8196 unsigned Size = (Opcode == AMDGPU::S_SEXT_I32_I8) ? 8 : 16;
8197 NewInstr.addImm(0);
8198 NewInstr.addImm(Size);
8199 } else if (Opcode == AMDGPU::S_BCNT1_I32_B32) {
8200 // The VALU version adds the second operand to the result, so insert an
8201 // extra 0 operand.
8202 NewInstr.addImm(0);
8203 } else if (Opcode == AMDGPU::S_BFE_I32 || Opcode == AMDGPU::S_BFE_U32) {
8204 const MachineOperand &OffsetWidthOp = Inst.getOperand(2);
8205 // If we need to move this to VGPRs, we need to unpack the second
8206 // operand back into the 2 separate ones for bit offset and width.
8207 assert(OffsetWidthOp.isImm() &&
8208 "Scalar BFE is only implemented for constant width and offset");
8209 uint32_t Imm = OffsetWidthOp.getImm();
8210
8211 uint32_t Offset = Imm & 0x3f; // Extract bits [5:0].
8212 uint32_t BitWidth = (Imm & 0x7f0000) >> 16; // Extract bits [22:16].
8213 NewInstr.addImm(Offset);
8214 NewInstr.addImm(BitWidth);
8215 } else {
8216 if (AMDGPU::getNamedOperandIdx(NewOpcode,
8217 AMDGPU::OpName::src1_modifiers) >= 0)
8218 NewInstr.addImm(0);
8219 if (AMDGPU::getNamedOperandIdx(NewOpcode, AMDGPU::OpName::src1) >= 0)
8220 NewInstr->addOperand(Inst.getOperand(2));
8221 if (AMDGPU::getNamedOperandIdx(NewOpcode,
8222 AMDGPU::OpName::src2_modifiers) >= 0)
8223 NewInstr.addImm(0);
8224 if (AMDGPU::getNamedOperandIdx(NewOpcode, AMDGPU::OpName::src2) >= 0)
8225 NewInstr->addOperand(Inst.getOperand(3));
8226 if (AMDGPU::getNamedOperandIdx(NewOpcode, AMDGPU::OpName::clamp) >= 0)
8227 NewInstr.addImm(0);
8228 if (AMDGPU::getNamedOperandIdx(NewOpcode, AMDGPU::OpName::omod) >= 0)
8229 NewInstr.addImm(0);
8230 if (AMDGPU::getNamedOperandIdx(NewOpcode, AMDGPU::OpName::op_sel) >= 0)
8231 NewInstr.addImm(0);
8232 }
8233 } else {
8234 // Just copy the SALU operands.
8235 for (const MachineOperand &Op : Inst.explicit_operands())
8236 NewInstr->addOperand(Op);
8237 }
8238
8239 // Remove any references to SCC. Vector instructions can't read from it, and
8240 // we're just about to add the implicit use / defs of VCC, and we don't want
8241 // both.
8242 for (MachineOperand &Op : Inst.implicit_operands()) {
8243 if (Op.getReg() == AMDGPU::SCC) {
8244 // Only propagate through live-def of SCC.
8245 if (Op.isDef() && !Op.isDead())
8246 addSCCDefUsersToVALUWorklist(Op, Inst, Worklist);
8247 if (Op.isUse())
8248 addSCCDefsToVALUWorklist(NewInstr, Worklist);
8249 }
8250 }
8251 Inst.eraseFromParent();
8252 Register NewDstReg;
8253 if (NewInstr->getOperand(0).isReg() && NewInstr->getOperand(0).isDef()) {
8254 Register DstReg = NewInstr->getOperand(0).getReg();
8255 assert(DstReg.isVirtual());
8256 // Update the destination register class.
8257 const TargetRegisterClass *NewDstRC = getDestEquivalentVGPRClass(*NewInstr);
8258 assert(NewDstRC);
8259 NewDstReg = MRI.createVirtualRegister(NewDstRC);
8260 MRI.replaceRegWith(DstReg, NewDstReg);
8261 }
8262 fixImplicitOperands(*NewInstr);
8263
8264 legalizeOperandsVALUt16(*NewInstr, MRI);
8265
8266 // Legalize the operands
8267 legalizeOperands(*NewInstr, MDT);
8268 if (NewDstReg)
8269 addUsersToMoveToVALUWorklist(NewDstReg, MRI, Worklist);
8270}
8271
8272// Add/sub require special handling to deal with carry outs.
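// On subtargets with add-no-carry, an SCC-dead S_ADD_I32/S_SUB_I32 is simply
// rewritten in place, e.g. (sketch):
//   %d:sgpr_32 = S_ADD_I32 %a, %b, implicit-def dead $scc
// becomes
//   %d:vgpr_32 = V_ADD_U32_e64 %a, %b, 0   ; trailing 0 is the clamp bit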
8273std::pair<bool, MachineBasicBlock *>
8274SIInstrInfo::moveScalarAddSub(SIInstrWorklist &Worklist, MachineInstr &Inst,
8275 MachineDominatorTree *MDT) const {
8276 if (ST.hasAddNoCarry()) {
8277 // Assume there is no user of scc since we don't select this in that case.
8278 // Since scc isn't used, it doesn't really matter if the i32 or u32 variant
8279 // is used.
8280
8281 MachineBasicBlock &MBB = *Inst.getParent();
8282 MachineRegisterInfo &MRI = MBB.getParent()->getRegInfo();
8283
8284 Register OldDstReg = Inst.getOperand(0).getReg();
8285 Register ResultReg = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
8286
8287 unsigned Opc = Inst.getOpcode();
8288 assert(Opc == AMDGPU::S_ADD_I32 || Opc == AMDGPU::S_SUB_I32);
8289
8290 unsigned NewOpc = Opc == AMDGPU::S_ADD_I32 ?
8291 AMDGPU::V_ADD_U32_e64 : AMDGPU::V_SUB_U32_e64;
8292
8293 assert(Inst.getOperand(3).getReg() == AMDGPU::SCC);
8294 Inst.removeOperand(3);
8295
8296 Inst.setDesc(get(NewOpc));
8297 Inst.addOperand(MachineOperand::CreateImm(0)); // clamp bit
8298 Inst.addImplicitDefUseOperands(*MBB.getParent());
8299 MRI.replaceRegWith(OldDstReg, ResultReg);
8300 MachineBasicBlock *NewBB = legalizeOperands(Inst, MDT);
8301
8302 addUsersToMoveToVALUWorklist(ResultReg, MRI, Worklist);
8303 return std::pair(true, NewBB);
8304 }
8305
8306 return std::pair(false, nullptr);
8307}
8308
8309void SIInstrInfo::lowerSelect(SIInstrWorklist &Worklist, MachineInstr &Inst,
8310 MachineDominatorTree *MDT) const {
8311
8312 MachineBasicBlock &MBB = *Inst.getParent();
8313 MachineRegisterInfo &MRI = MBB.getParent()->getRegInfo();
8314 MachineBasicBlock::iterator MII = Inst;
8315 DebugLoc DL = Inst.getDebugLoc();
8316
8317 MachineOperand &Dest = Inst.getOperand(0);
8318 MachineOperand &Src0 = Inst.getOperand(1);
8319 MachineOperand &Src1 = Inst.getOperand(2);
8320 MachineOperand &Cond = Inst.getOperand(3);
8321
8322 Register CondReg = Cond.getReg();
8323 bool IsSCC = (CondReg == AMDGPU::SCC);
8324
8325 // If this is a trivial select where the condition is effectively not SCC
8326 // (CondReg is a source of copy to SCC), then the select is semantically
8327 // equivalent to copying CondReg. Hence, there is no need to create
8328 // V_CNDMASK, we can just use that and bail out.
8329 if (!IsSCC && Src0.isImm() && (Src0.getImm() == -1) && Src1.isImm() &&
8330 (Src1.getImm() == 0)) {
8331 MRI.replaceRegWith(Dest.getReg(), CondReg);
8332 return;
8333 }
8334
8335 Register NewCondReg = CondReg;
8336 if (IsSCC) {
8337 const TargetRegisterClass *TC = RI.getWaveMaskRegClass();
8338 NewCondReg = MRI.createVirtualRegister(TC);
8339
8340 // Now look for the closest SCC def if it is a copy
8341 // replacing the CondReg with the COPY source register
8342 bool CopyFound = false;
8343 for (MachineInstr &CandI :
8344 make_range(std::next(MachineBasicBlock::reverse_iterator(Inst)),
8345 Inst.getParent()->rend())) {
8346 if (CandI.findRegisterDefOperandIdx(AMDGPU::SCC, &RI, false, false) !=
8347 -1) {
8348 if (CandI.isCopy() && CandI.getOperand(0).getReg() == AMDGPU::SCC) {
8349 BuildMI(MBB, MII, DL, get(AMDGPU::COPY), NewCondReg)
8350 .addReg(CandI.getOperand(1).getReg());
8351 CopyFound = true;
8352 }
8353 break;
8354 }
8355 }
8356 if (!CopyFound) {
8357 // SCC def is not a copy
8358 // Insert a trivial select instead of creating a copy, because a copy from
8359 // SCC would semantically mean just copying a single bit, but we may need
8360 // the result to be a vector condition mask that needs preserving.
8361 unsigned Opcode =
8362 ST.isWave64() ? AMDGPU::S_CSELECT_B64 : AMDGPU::S_CSELECT_B32;
8363 auto NewSelect =
8364 BuildMI(MBB, MII, DL, get(Opcode), NewCondReg).addImm(-1).addImm(0);
8365 NewSelect->getOperand(3).setIsUndef(Cond.isUndef());
8366 }
8367 }
8368
8369 Register NewDestReg = MRI.createVirtualRegister(
8370 RI.getEquivalentVGPRClass(MRI.getRegClass(Dest.getReg())));
8371 MachineInstr *NewInst;
8372 if (Inst.getOpcode() == AMDGPU::S_CSELECT_B32) {
8373 NewInst = BuildMI(MBB, MII, DL, get(AMDGPU::V_CNDMASK_B32_e64), NewDestReg)
8374 .addImm(0)
8375 .add(Src1) // False
8376 .addImm(0)
8377 .add(Src0) // True
8378 .addReg(NewCondReg);
8379 } else {
8380 NewInst =
8381 BuildMI(MBB, MII, DL, get(AMDGPU::V_CNDMASK_B64_PSEUDO), NewDestReg)
8382 .add(Src1) // False
8383 .add(Src0) // True
8384 .addReg(NewCondReg);
8385 }
8386 MRI.replaceRegWith(Dest.getReg(), NewDestReg);
8387 legalizeOperands(*NewInst, MDT);
8388 addUsersToMoveToVALUWorklist(NewDestReg, MRI, Worklist);
8389}
8390
8391void SIInstrInfo::lowerScalarAbs(SIInstrWorklist &Worklist,
8392 MachineInstr &Inst) const {
8393 MachineBasicBlock &MBB = *Inst.getParent();
8394 MachineRegisterInfo &MRI = MBB.getParent()->getRegInfo();
8395 MachineBasicBlock::iterator MII = Inst;
8396 DebugLoc DL = Inst.getDebugLoc();
8397
8398 MachineOperand &Dest = Inst.getOperand(0);
8399 MachineOperand &Src = Inst.getOperand(1);
8400 Register TmpReg = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
8401 Register ResultReg = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
8402
8403 unsigned SubOp = ST.hasAddNoCarry() ?
8404 AMDGPU::V_SUB_U32_e32 : AMDGPU::V_SUB_CO_U32_e32;
8405
8406 BuildMI(MBB, MII, DL, get(SubOp), TmpReg)
8407 .addImm(0)
8408 .addReg(Src.getReg());
8409
8410 BuildMI(MBB, MII, DL, get(AMDGPU::V_MAX_I32_e64), ResultReg)
8411 .addReg(Src.getReg())
8412 .addReg(TmpReg);
8413
8414 MRI.replaceRegWith(Dest.getReg(), ResultReg);
8415 addUsersToMoveToVALUWorklist(ResultReg, MRI, Worklist);
8416}
8417
8418void SIInstrInfo::lowerScalarXnor(SIInstrWorklist &Worklist,
8419 MachineInstr &Inst) const {
8420 MachineBasicBlock &MBB = *Inst.getParent();
8421 MachineRegisterInfo &MRI = MBB.getParent()->getRegInfo();
8422 MachineBasicBlock::iterator MII = Inst;
8423 const DebugLoc &DL = Inst.getDebugLoc();
8424
8425 MachineOperand &Dest = Inst.getOperand(0);
8426 MachineOperand &Src0 = Inst.getOperand(1);
8427 MachineOperand &Src1 = Inst.getOperand(2);
8428
8429 if (ST.hasDLInsts()) {
8430 Register NewDest = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
8431 legalizeGenericOperand(MBB, MII, &AMDGPU::VGPR_32RegClass, Src0, MRI, DL);
8432 legalizeGenericOperand(MBB, MII, &AMDGPU::VGPR_32RegClass, Src1, MRI, DL);
8433
8434 BuildMI(MBB, MII, DL, get(AMDGPU::V_XNOR_B32_e64), NewDest)
8435 .add(Src0)
8436 .add(Src1);
8437
8438 MRI.replaceRegWith(Dest.getReg(), NewDest);
8439 addUsersToMoveToVALUWorklist(NewDest, MRI, Worklist);
8440 } else {
8441 // Using the identity !(x ^ y) == (!x ^ y) == (x ^ !y), we can
8442 // invert either source and then perform the XOR. If either source is a
8443 // scalar register, then we can leave the inversion on the scalar unit to
8444 // achieve a better distribution of scalar and vector instructions.
8445 bool Src0IsSGPR = Src0.isReg() &&
8446 RI.isSGPRClass(MRI.getRegClass(Src0.getReg()));
8447 bool Src1IsSGPR = Src1.isReg() &&
8448 RI.isSGPRClass(MRI.getRegClass(Src1.getReg()));
8449 MachineInstr *Xor;
8450 Register Temp = MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass);
8451 Register NewDest = MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass);
8452
8453 // Build a pair of scalar instructions and add them to the work list.
8454 // The next iteration over the work list will lower these to the vector
8455 // unit as necessary.
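// For example, when Src0 is the scalar operand the pair below is:
//   s_not_b32 Temp, Src0
//   s_xor_b32 NewDest, Temp, Src1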
8456 if (Src0IsSGPR) {
8457 BuildMI(MBB, MII, DL, get(AMDGPU::S_NOT_B32), Temp).add(Src0);
8458 Xor = BuildMI(MBB, MII, DL, get(AMDGPU::S_XOR_B32), NewDest)
8459 .addReg(Temp)
8460 .add(Src1);
8461 } else if (Src1IsSGPR) {
8462 BuildMI(MBB, MII, DL, get(AMDGPU::S_NOT_B32), Temp).add(Src1);
8463 Xor = BuildMI(MBB, MII, DL, get(AMDGPU::S_XOR_B32), NewDest)
8464 .add(Src0)
8465 .addReg(Temp);
8466 } else {
8467 Xor = BuildMI(MBB, MII, DL, get(AMDGPU::S_XOR_B32), Temp)
8468 .add(Src0)
8469 .add(Src1);
8470 MachineInstr *Not =
8471 BuildMI(MBB, MII, DL, get(AMDGPU::S_NOT_B32), NewDest).addReg(Temp);
8472 Worklist.insert(Not);
8473 }
8474
8475 MRI.replaceRegWith(Dest.getReg(), NewDest);
8476
8477 Worklist.insert(Xor);
8478
8479 addUsersToMoveToVALUWorklist(NewDest, MRI, Worklist);
8480 }
8481}
8482
8483void SIInstrInfo::splitScalarNotBinop(SIInstrWorklist &Worklist,
8484 MachineInstr &Inst,
8485 unsigned Opcode) const {
8486 MachineBasicBlock &MBB = *Inst.getParent();
8487 MachineRegisterInfo &MRI = MBB.getParent()->getRegInfo();
8488 MachineBasicBlock::iterator MII = Inst;
8489 const DebugLoc &DL = Inst.getDebugLoc();
8490
8491 MachineOperand &Dest = Inst.getOperand(0);
8492 MachineOperand &Src0 = Inst.getOperand(1);
8493 MachineOperand &Src1 = Inst.getOperand(2);
8494
8495 Register NewDest = MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass);
8496 Register Interm = MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass);
8497
8498 MachineInstr &Op = *BuildMI(MBB, MII, DL, get(Opcode), Interm)
8499 .add(Src0)
8500 .add(Src1);
8501
8502 MachineInstr &Not = *BuildMI(MBB, MII, DL, get(AMDGPU::S_NOT_B32), NewDest)
8503 .addReg(Interm);
8504
8505 Worklist.insert(&Op);
8506 Worklist.insert(&Not);
8507
8508 MRI.replaceRegWith(Dest.getReg(), NewDest);
8509 addUsersToMoveToVALUWorklist(NewDest, MRI, Worklist);
8510}
8511
8512void SIInstrInfo::splitScalarBinOpN2(SIInstrWorklist &Worklist,
8513 MachineInstr &Inst,
8514 unsigned Opcode) const {
8515 MachineBasicBlock &MBB = *Inst.getParent();
8516 MachineRegisterInfo &MRI = MBB.getParent()->getRegInfo();
8517 MachineBasicBlock::iterator MII = Inst;
8518 const DebugLoc &DL = Inst.getDebugLoc();
8519
8520 MachineOperand &Dest = Inst.getOperand(0);
8521 MachineOperand &Src0 = Inst.getOperand(1);
8522 MachineOperand &Src1 = Inst.getOperand(2);
8523
8524 Register NewDest = MRI.createVirtualRegister(&AMDGPU::SReg_32_XM0RegClass);
8525 Register Interm = MRI.createVirtualRegister(&AMDGPU::SReg_32_XM0RegClass);
8526
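// Expand an *N2 instruction (e.g. s_andn2/s_orn2) as Opcode(Src0, ~Src1):
// invert Src1 on the scalar unit first, then apply the base opcode.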
8527 MachineInstr &Not = *BuildMI(MBB, MII, DL, get(AMDGPU::S_NOT_B32), Interm)
8528 .add(Src1);
8529
8530 MachineInstr &Op = *BuildMI(MBB, MII, DL, get(Opcode), NewDest)
8531 .add(Src0)
8532 .addReg(Interm);
8533
8534 Worklist.insert(&Not);
8535 Worklist.insert(&Op);
8536
8537 MRI.replaceRegWith(Dest.getReg(), NewDest);
8538 addUsersToMoveToVALUWorklist(NewDest, MRI, Worklist);
8539}
8540
8541void SIInstrInfo::splitScalar64BitUnaryOp(SIInstrWorklist &Worklist,
8542 MachineInstr &Inst, unsigned Opcode,
8543 bool Swap) const {
8544 MachineBasicBlock &MBB = *Inst.getParent();
8545 MachineRegisterInfo &MRI = MBB.getParent()->getRegInfo();
8546
8547 MachineOperand &Dest = Inst.getOperand(0);
8548 MachineOperand &Src0 = Inst.getOperand(1);
8549 DebugLoc DL = Inst.getDebugLoc();
8550
8551 MachineBasicBlock::iterator MII = Inst;
8552
8553 const MCInstrDesc &InstDesc = get(Opcode);
8554 const TargetRegisterClass *Src0RC = Src0.isReg() ?
8555 MRI.getRegClass(Src0.getReg()) :
8556 &AMDGPU::SGPR_32RegClass;
8557
8558 const TargetRegisterClass *Src0SubRC =
8559 RI.getSubRegisterClass(Src0RC, AMDGPU::sub0);
8560
8561 MachineOperand SrcReg0Sub0 = buildExtractSubRegOrImm(MII, MRI, Src0, Src0RC,
8562 AMDGPU::sub0, Src0SubRC);
8563
8564 const TargetRegisterClass *DestRC = MRI.getRegClass(Dest.getReg());
8565 const TargetRegisterClass *NewDestRC = RI.getEquivalentVGPRClass(DestRC);
8566 const TargetRegisterClass *NewDestSubRC =
8567 RI.getSubRegisterClass(NewDestRC, AMDGPU::sub0);
8568
8569 Register DestSub0 = MRI.createVirtualRegister(NewDestSubRC);
8570 MachineInstr &LoHalf = *BuildMI(MBB, MII, DL, InstDesc, DestSub0).add(SrcReg0Sub0);
8571
8572 MachineOperand SrcReg0Sub1 = buildExtractSubRegOrImm(MII, MRI, Src0, Src0RC,
8573 AMDGPU::sub1, Src0SubRC);
8574
8575 Register DestSub1 = MRI.createVirtualRegister(NewDestSubRC);
8576 MachineInstr &HiHalf = *BuildMI(MBB, MII, DL, InstDesc, DestSub1).add(SrcReg0Sub1);
8577
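// Some opcodes (S_BREV_B64, for instance) also require the two halves to
// trade places after the per-half operation; Swap requests that.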
8578 if (Swap)
8579 std::swap(DestSub0, DestSub1);
8580
8581 Register FullDestReg = MRI.createVirtualRegister(NewDestRC);
8582 BuildMI(MBB, MII, DL, get(TargetOpcode::REG_SEQUENCE), FullDestReg)
8583 .addReg(DestSub0)
8584 .addImm(AMDGPU::sub0)
8585 .addReg(DestSub1)
8586 .addImm(AMDGPU::sub1);
8587
8588 MRI.replaceRegWith(Dest.getReg(), FullDestReg);
8589
8590 Worklist.insert(&LoHalf);
8591 Worklist.insert(&HiHalf);
8592
8593 // We don't need to legalizeOperands here because for a single operand, src0
8594 // will support any kind of input.
8595
8596 // Move all users of this moved value.
8597 addUsersToMoveToVALUWorklist(FullDestReg, MRI, Worklist);
8598}
8599
8600 // There is no vector equivalent of s_mul_u64. For this reason, we need to
8601 // split the s_mul_u64 into 32-bit vector multiplications.
8602void SIInstrInfo::splitScalarSMulU64(SIInstrWorklist &Worklist,
8603 MachineInstr &Inst,
8604 MachineDominatorTree *MDT) const {
8605 MachineBasicBlock &MBB = *Inst.getParent();
8606 MachineRegisterInfo &MRI = MBB.getParent()->getRegInfo();
8607
8608 Register FullDestReg = MRI.createVirtualRegister(&AMDGPU::VReg_64RegClass);
8609 Register DestSub0 = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
8610 Register DestSub1 = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
8611
8612 MachineOperand &Dest = Inst.getOperand(0);
8613 MachineOperand &Src0 = Inst.getOperand(1);
8614 MachineOperand &Src1 = Inst.getOperand(2);
8615 const DebugLoc &DL = Inst.getDebugLoc();
8616 MachineBasicBlock::iterator MII = Inst;
8617
8618 const TargetRegisterClass *Src0RC = MRI.getRegClass(Src0.getReg());
8619 const TargetRegisterClass *Src1RC = MRI.getRegClass(Src1.getReg());
8620 const TargetRegisterClass *Src0SubRC =
8621 RI.getSubRegisterClass(Src0RC, AMDGPU::sub0);
8622 if (RI.isSGPRClass(Src0SubRC))
8623 Src0SubRC = RI.getEquivalentVGPRClass(Src0SubRC);
8624 const TargetRegisterClass *Src1SubRC =
8625 RI.getSubRegisterClass(Src1RC, AMDGPU::sub0);
8626 if (RI.isSGPRClass(Src1SubRC))
8627 Src1SubRC = RI.getEquivalentVGPRClass(Src1SubRC);
8628
8629 // First, we extract the low 32-bit and high 32-bit values from each of the
8630 // operands.
8631 MachineOperand Op0L =
8632 buildExtractSubRegOrImm(MII, MRI, Src0, Src0RC, AMDGPU::sub0, Src0SubRC);
8633 MachineOperand Op1L =
8634 buildExtractSubRegOrImm(MII, MRI, Src1, Src1RC, AMDGPU::sub0, Src1SubRC);
8635 MachineOperand Op0H =
8636 buildExtractSubRegOrImm(MII, MRI, Src0, Src0RC, AMDGPU::sub1, Src0SubRC);
8637 MachineOperand Op1H =
8638 buildExtractSubRegOrImm(MII, MRI, Src1, Src1RC, AMDGPU::sub1, Src1SubRC);
8639
8640 // The multiplication is done as follows:
8641 //
8642 // Op1H Op1L
8643 // * Op0H Op0L
8644 // --------------------
8645 // Op1H*Op0L Op1L*Op0L
8646 // + Op1H*Op0H Op1L*Op0H
8647 // -----------------------------------------
8648 // (Op1H*Op0L + Op1L*Op0H + carry) Op1L*Op0L
8649 //
8650 // We drop Op1H*Op0H because the result of the multiplication is a 64-bit
8651 // value and that would overflow.
8652 // The low 32-bit value is Op1L*Op0L.
8653 // The high 32-bit value is Op1H*Op0L + Op1L*Op0H + carry (from Op1L*Op0L).
8654
8655 Register Op1L_Op0H_Reg = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
8656 MachineInstr *Op1L_Op0H =
8657 BuildMI(MBB, MII, DL, get(AMDGPU::V_MUL_LO_U32_e64), Op1L_Op0H_Reg)
8658 .add(Op1L)
8659 .add(Op0H);
8660
8661 Register Op1H_Op0L_Reg = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
8662 MachineInstr *Op1H_Op0L =
8663 BuildMI(MBB, MII, DL, get(AMDGPU::V_MUL_LO_U32_e64), Op1H_Op0L_Reg)
8664 .add(Op1H)
8665 .add(Op0L);
8666
8667 Register CarryReg = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
8668 MachineInstr *Carry =
8669 BuildMI(MBB, MII, DL, get(AMDGPU::V_MUL_HI_U32_e64), CarryReg)
8670 .add(Op1L)
8671 .add(Op0L);
8672
8673 MachineInstr *LoHalf =
8674 BuildMI(MBB, MII, DL, get(AMDGPU::V_MUL_LO_U32_e64), DestSub0)
8675 .add(Op1L)
8676 .add(Op0L);
8677
8678 Register AddReg = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
8679 MachineInstr *Add = BuildMI(MBB, MII, DL, get(AMDGPU::V_ADD_U32_e32), AddReg)
8680 .addReg(Op1L_Op0H_Reg)
8681 .addReg(Op1H_Op0L_Reg);
8682
8683 MachineInstr *HiHalf =
8684 BuildMI(MBB, MII, DL, get(AMDGPU::V_ADD_U32_e32), DestSub1)
8685 .addReg(AddReg)
8686 .addReg(CarryReg);
8687
8688 BuildMI(MBB, MII, DL, get(TargetOpcode::REG_SEQUENCE), FullDestReg)
8689 .addReg(DestSub0)
8690 .addImm(AMDGPU::sub0)
8691 .addReg(DestSub1)
8692 .addImm(AMDGPU::sub1);
8693
8694 MRI.replaceRegWith(Dest.getReg(), FullDestReg);
8695
8696 // Try to legalize the operands in case we need to swap the order to keep it
8697 // valid.
8698 legalizeOperands(*Op1L_Op0H, MDT);
8699 legalizeOperands(*Op1H_Op0L, MDT);
8700 legalizeOperands(*Carry, MDT);
8701 legalizeOperands(*LoHalf, MDT);
8702 legalizeOperands(*Add, MDT);
8703 legalizeOperands(*HiHalf, MDT);
8704
8705 // Move all users of this moved value.
8706 addUsersToMoveToVALUWorklist(FullDestReg, MRI, Worklist);
8707}
8708
8709 // Lower S_MUL_U64_U32_PSEUDO/S_MUL_I64_I32_PSEUDO into two 32-bit vector
8710 // multiplications.
8711void SIInstrInfo::splitScalarSMulPseudo(SIInstrWorklist &Worklist,
8712 MachineInstr &Inst,
8713 MachineDominatorTree *MDT) const {
8714 MachineBasicBlock &MBB = *Inst.getParent();
8715 MachineRegisterInfo &MRI = MBB.getParent()->getRegInfo();
8716
8717 Register FullDestReg = MRI.createVirtualRegister(&AMDGPU::VReg_64RegClass);
8718 Register DestSub0 = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
8719 Register DestSub1 = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
8720
8721 MachineOperand &Dest = Inst.getOperand(0);
8722 MachineOperand &Src0 = Inst.getOperand(1);
8723 MachineOperand &Src1 = Inst.getOperand(2);
8724 const DebugLoc &DL = Inst.getDebugLoc();
8725 MachineBasicBlock::iterator MII = Inst;
8726
8727 const TargetRegisterClass *Src0RC = MRI.getRegClass(Src0.getReg());
8728 const TargetRegisterClass *Src1RC = MRI.getRegClass(Src1.getReg());
8729 const TargetRegisterClass *Src0SubRC =
8730 RI.getSubRegisterClass(Src0RC, AMDGPU::sub0);
8731 if (RI.isSGPRClass(Src0SubRC))
8732 Src0SubRC = RI.getEquivalentVGPRClass(Src0SubRC);
8733 const TargetRegisterClass *Src1SubRC =
8734 RI.getSubRegisterClass(Src1RC, AMDGPU::sub0);
8735 if (RI.isSGPRClass(Src1SubRC))
8736 Src1SubRC = RI.getEquivalentVGPRClass(Src1SubRC);
8737
8738 // First, we extract the low 32-bit and high 32-bit values from each of the
8739 // operands.
8740 MachineOperand Op0L =
8741 buildExtractSubRegOrImm(MII, MRI, Src0, Src0RC, AMDGPU::sub0, Src0SubRC);
8742 MachineOperand Op1L =
8743 buildExtractSubRegOrImm(MII, MRI, Src1, Src1RC, AMDGPU::sub0, Src1SubRC);
8744
8745 unsigned Opc = Inst.getOpcode();
8746 unsigned NewOpc = Opc == AMDGPU::S_MUL_U64_U32_PSEUDO
8747 ? AMDGPU::V_MUL_HI_U32_e64
8748 : AMDGPU::V_MUL_HI_I32_e64;
8749 MachineInstr *HiHalf =
8750 BuildMI(MBB, MII, DL, get(NewOpc), DestSub1).add(Op1L).add(Op0L);
8751
8752 MachineInstr *LoHalf =
8753 BuildMI(MBB, MII, DL, get(AMDGPU::V_MUL_LO_U32_e64), DestSub0)
8754 .add(Op1L)
8755 .add(Op0L);
8756
8757 BuildMI(MBB, MII, DL, get(TargetOpcode::REG_SEQUENCE), FullDestReg)
8758 .addReg(DestSub0)
8759 .addImm(AMDGPU::sub0)
8760 .addReg(DestSub1)
8761 .addImm(AMDGPU::sub1);
8762
8763 MRI.replaceRegWith(Dest.getReg(), FullDestReg);
8764
8765 // Try to legalize the operands in case we need to swap the order to keep it
8766 // valid.
8767 legalizeOperands(*HiHalf, MDT);
8768 legalizeOperands(*LoHalf, MDT);
8769
8770 // Move all users of this moved value.
8771 addUsersToMoveToVALUWorklist(FullDestReg, MRI, Worklist);
8772}
8773
8774void SIInstrInfo::splitScalar64BitBinaryOp(SIInstrWorklist &Worklist,
8775 MachineInstr &Inst, unsigned Opcode,
8776 MachineDominatorTree *MDT) const {
8777 MachineBasicBlock &MBB = *Inst.getParent();
8778 MachineRegisterInfo &MRI = MBB.getParent()->getRegInfo();
8779
8780 MachineOperand &Dest = Inst.getOperand(0);
8781 MachineOperand &Src0 = Inst.getOperand(1);
8782 MachineOperand &Src1 = Inst.getOperand(2);
8783 DebugLoc DL = Inst.getDebugLoc();
8784
8785 MachineBasicBlock::iterator MII = Inst;
8786
8787 const MCInstrDesc &InstDesc = get(Opcode);
8788 const TargetRegisterClass *Src0RC = Src0.isReg() ?
8789 MRI.getRegClass(Src0.getReg()) :
8790 &AMDGPU::SGPR_32RegClass;
8791
8792 const TargetRegisterClass *Src0SubRC =
8793 RI.getSubRegisterClass(Src0RC, AMDGPU::sub0);
8794 const TargetRegisterClass *Src1RC = Src1.isReg() ?
8795 MRI.getRegClass(Src1.getReg()) :
8796 &AMDGPU::SGPR_32RegClass;
8797
8798 const TargetRegisterClass *Src1SubRC =
8799 RI.getSubRegisterClass(Src1RC, AMDGPU::sub0);
8800
8801 MachineOperand SrcReg0Sub0 = buildExtractSubRegOrImm(MII, MRI, Src0, Src0RC,
8802 AMDGPU::sub0, Src0SubRC);
8803 MachineOperand SrcReg1Sub0 = buildExtractSubRegOrImm(MII, MRI, Src1, Src1RC,
8804 AMDGPU::sub0, Src1SubRC);
8805 MachineOperand SrcReg0Sub1 = buildExtractSubRegOrImm(MII, MRI, Src0, Src0RC,
8806 AMDGPU::sub1, Src0SubRC);
8807 MachineOperand SrcReg1Sub1 = buildExtractSubRegOrImm(MII, MRI, Src1, Src1RC,
8808 AMDGPU::sub1, Src1SubRC);
8809
8810 const TargetRegisterClass *DestRC = MRI.getRegClass(Dest.getReg());
8811 const TargetRegisterClass *NewDestRC = RI.getEquivalentVGPRClass(DestRC);
8812 const TargetRegisterClass *NewDestSubRC =
8813 RI.getSubRegisterClass(NewDestRC, AMDGPU::sub0);
8814
8815 Register DestSub0 = MRI.createVirtualRegister(NewDestSubRC);
8816 MachineInstr &LoHalf = *BuildMI(MBB, MII, DL, InstDesc, DestSub0)
8817 .add(SrcReg0Sub0)
8818 .add(SrcReg1Sub0);
8819
8820 Register DestSub1 = MRI.createVirtualRegister(NewDestSubRC);
8821 MachineInstr &HiHalf = *BuildMI(MBB, MII, DL, InstDesc, DestSub1)
8822 .add(SrcReg0Sub1)
8823 .add(SrcReg1Sub1);
8824
8825 Register FullDestReg = MRI.createVirtualRegister(NewDestRC);
8826 BuildMI(MBB, MII, DL, get(TargetOpcode::REG_SEQUENCE), FullDestReg)
8827 .addReg(DestSub0)
8828 .addImm(AMDGPU::sub0)
8829 .addReg(DestSub1)
8830 .addImm(AMDGPU::sub1);
8831
8832 MRI.replaceRegWith(Dest.getReg(), FullDestReg);
8833
8834 Worklist.insert(&LoHalf);
8835 Worklist.insert(&HiHalf);
8836
8837 // Move all users of this moved value.
8838 addUsersToMoveToVALUWorklist(FullDestReg, MRI, Worklist);
8839}
8840
8841void SIInstrInfo::splitScalar64BitXnor(SIInstrWorklist &Worklist,
8842 MachineInstr &Inst,
8843 MachineDominatorTree *MDT) const {
8844 MachineBasicBlock &MBB = *Inst.getParent();
8845 MachineRegisterInfo &MRI = MBB.getParent()->getRegInfo();
8846
8847 MachineOperand &Dest = Inst.getOperand(0);
8848 MachineOperand &Src0 = Inst.getOperand(1);
8849 MachineOperand &Src1 = Inst.getOperand(2);
8850 const DebugLoc &DL = Inst.getDebugLoc();
8851
8852 MachineBasicBlock::iterator MII = Inst;
8853
8854 const TargetRegisterClass *DestRC = MRI.getRegClass(Dest.getReg());
8855
8856 Register Interm = MRI.createVirtualRegister(&AMDGPU::SReg_64RegClass);
8857
8858 MachineOperand* Op0;
8859 MachineOperand* Op1;
8860
8861 if (Src0.isReg() && RI.isSGPRReg(MRI, Src0.getReg())) {
8862 Op0 = &Src0;
8863 Op1 = &Src1;
8864 } else {
8865 Op0 = &Src1;
8866 Op1 = &Src0;
8867 }
8868
8869 BuildMI(MBB, MII, DL, get(AMDGPU::S_NOT_B64), Interm)
8870 .add(*Op0);
8871
8872 Register NewDest = MRI.createVirtualRegister(DestRC);
8873
8874 MachineInstr &Xor = *BuildMI(MBB, MII, DL, get(AMDGPU::S_XOR_B64), NewDest)
8875 .addReg(Interm)
8876 .add(*Op1);
8877
8878 MRI.replaceRegWith(Dest.getReg(), NewDest);
8879
8880 Worklist.insert(&Xor);
8881}
8882
8883void SIInstrInfo::splitScalar64BitBCNT(SIInstrWorklist &Worklist,
8884 MachineInstr &Inst) const {
8885 MachineBasicBlock &MBB = *Inst.getParent();
8886 MachineRegisterInfo &MRI = MBB.getParent()->getRegInfo();
8887
8888 MachineBasicBlock::iterator MII = Inst;
8889 const DebugLoc &DL = Inst.getDebugLoc();
8890
8891 MachineOperand &Dest = Inst.getOperand(0);
8892 MachineOperand &Src = Inst.getOperand(1);
8893
8894 const MCInstrDesc &InstDesc = get(AMDGPU::V_BCNT_U32_B32_e64);
8895 const TargetRegisterClass *SrcRC = Src.isReg() ?
8896 MRI.getRegClass(Src.getReg()) :
8897 &AMDGPU::SGPR_32RegClass;
8898
8899 Register MidReg = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
8900 Register ResultReg = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
8901
8902 const TargetRegisterClass *SrcSubRC =
8903 RI.getSubRegisterClass(SrcRC, AMDGPU::sub0);
8904
8905 MachineOperand SrcRegSub0 = buildExtractSubRegOrImm(MII, MRI, Src, SrcRC,
8906 AMDGPU::sub0, SrcSubRC);
8907 MachineOperand SrcRegSub1 = buildExtractSubRegOrImm(MII, MRI, Src, SrcRC,
8908 AMDGPU::sub1, SrcSubRC);
8909
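// V_BCNT_U32_B32 computes popcount(src0) + src1, so the first instruction
// counts the low half (accumulator 0) and the second adds the high half's
// count to it.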
8910 BuildMI(MBB, MII, DL, InstDesc, MidReg).add(SrcRegSub0).addImm(0);
8911
8912 BuildMI(MBB, MII, DL, InstDesc, ResultReg).add(SrcRegSub1).addReg(MidReg);
8913
8914 MRI.replaceRegWith(Dest.getReg(), ResultReg);
8915
8916 // We don't need to legalize operands here. src0 for either instruction can be
8917 // an SGPR, and the second input is unused or determined here.
8918 addUsersToMoveToVALUWorklist(ResultReg, MRI, Worklist);
8919}
8920
8921void SIInstrInfo::splitScalar64BitBFE(SIInstrWorklist &Worklist,
8922 MachineInstr &Inst) const {
8923 MachineBasicBlock &MBB = *Inst.getParent();
8924 MachineRegisterInfo &MRI = MBB.getParent()->getRegInfo();
8925 MachineBasicBlock::iterator MII = Inst;
8926 const DebugLoc &DL = Inst.getDebugLoc();
8927
8928 MachineOperand &Dest = Inst.getOperand(0);
8929 uint32_t Imm = Inst.getOperand(2).getImm();
8930 uint32_t Offset = Imm & 0x3f; // Extract bits [5:0].
8931 uint32_t BitWidth = (Imm & 0x7f0000) >> 16; // Extract bits [22:16].
8932
8933 (void) Offset;
8934
8935 // Only sext_inreg cases handled.
8936 assert(Inst.getOpcode() == AMDGPU::S_BFE_I64 && BitWidth <= 32 &&
8937 Offset == 0 && "Not implemented");
8938
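// For a field narrower than 32 bits (Offset is 0 here), the field lies
// entirely in the low half: V_BFE_I32 sign-extends it in place and an
// arithmetic shift right by 31 replicates the sign bit into the high half.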
8939 if (BitWidth < 32) {
8940 Register MidRegLo = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
8941 Register MidRegHi = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
8942 Register ResultReg = MRI.createVirtualRegister(&AMDGPU::VReg_64RegClass);
8943
8944 BuildMI(MBB, MII, DL, get(AMDGPU::V_BFE_I32_e64), MidRegLo)
8945 .addReg(Inst.getOperand(1).getReg(), 0, AMDGPU::sub0)
8946 .addImm(0)
8947 .addImm(BitWidth);
8948
8949 BuildMI(MBB, MII, DL, get(AMDGPU::V_ASHRREV_I32_e32), MidRegHi)
8950 .addImm(31)
8951 .addReg(MidRegLo);
8952
8953 BuildMI(MBB, MII, DL, get(TargetOpcode::REG_SEQUENCE), ResultReg)
8954 .addReg(MidRegLo)
8955 .addImm(AMDGPU::sub0)
8956 .addReg(MidRegHi)
8957 .addImm(AMDGPU::sub1);
8958
8959 MRI.replaceRegWith(Dest.getReg(), ResultReg);
8960 addUsersToMoveToVALUWorklist(ResultReg, MRI, Worklist);
8961 return;
8962 }
8963
8964 MachineOperand &Src = Inst.getOperand(1);
8965 Register TmpReg = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
8966 Register ResultReg = MRI.createVirtualRegister(&AMDGPU::VReg_64RegClass);
8967
8968 BuildMI(MBB, MII, DL, get(AMDGPU::V_ASHRREV_I32_e64), TmpReg)
8969 .addImm(31)
8970 .addReg(Src.getReg(), 0, AMDGPU::sub0);
8971
8972 BuildMI(MBB, MII, DL, get(TargetOpcode::REG_SEQUENCE), ResultReg)
8973 .addReg(Src.getReg(), 0, AMDGPU::sub0)
8974 .addImm(AMDGPU::sub0)
8975 .addReg(TmpReg)
8976 .addImm(AMDGPU::sub1);
8977
8978 MRI.replaceRegWith(Dest.getReg(), ResultReg);
8979 addUsersToMoveToVALUWorklist(ResultReg, MRI, Worklist);
8980}
8981
8982void SIInstrInfo::splitScalar64BitCountOp(SIInstrWorklist &Worklist,
8983 MachineInstr &Inst, unsigned Opcode,
8984 MachineDominatorTree *MDT) const {
8985 // (S_FLBIT_I32_B64 hi:lo) ->
8986 // -> (umin (V_FFBH_U32_e32 hi), (uaddsat (V_FFBH_U32_e32 lo), 32))
8987 // (S_FF1_I32_B64 hi:lo) ->
8988 // ->(umin (uaddsat (V_FFBL_B32_e32 hi), 32) (V_FFBL_B32_e32 lo))
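// The add is clamped (saturating) so a 'no bits found' result of 0xffffffff
// from one half stays at 0xffffffff instead of wrapping to 31; the final umin
// therefore still returns -1 when the whole 64-bit source is zero.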
8989
8990 MachineBasicBlock &MBB = *Inst.getParent();
8991 MachineRegisterInfo &MRI = MBB.getParent()->getRegInfo();
8992 MachineBasicBlock::iterator MII = Inst;
8993 const DebugLoc &DL = Inst.getDebugLoc();
8994
8995 MachineOperand &Dest = Inst.getOperand(0);
8996 MachineOperand &Src = Inst.getOperand(1);
8997
8998 const MCInstrDesc &InstDesc = get(Opcode);
8999
9000 bool IsCtlz = Opcode == AMDGPU::V_FFBH_U32_e32;
9001 unsigned OpcodeAdd =
9002 ST.hasAddNoCarry() ? AMDGPU::V_ADD_U32_e64 : AMDGPU::V_ADD_CO_U32_e32;
9003
9004 const TargetRegisterClass *SrcRC =
9005 Src.isReg() ? MRI.getRegClass(Src.getReg()) : &AMDGPU::SGPR_32RegClass;
9006 const TargetRegisterClass *SrcSubRC =
9007 RI.getSubRegisterClass(SrcRC, AMDGPU::sub0);
9008
9009 MachineOperand SrcRegSub0 =
9010 buildExtractSubRegOrImm(MII, MRI, Src, SrcRC, AMDGPU::sub0, SrcSubRC);
9011 MachineOperand SrcRegSub1 =
9012 buildExtractSubRegOrImm(MII, MRI, Src, SrcRC, AMDGPU::sub1, SrcSubRC);
9013
9014 Register MidReg1 = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
9015 Register MidReg2 = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
9016 Register MidReg3 = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
9017 Register MidReg4 = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
9018
9019 BuildMI(MBB, MII, DL, InstDesc, MidReg1).add(SrcRegSub0);
9020
9021 BuildMI(MBB, MII, DL, InstDesc, MidReg2).add(SrcRegSub1);
9022
9023 BuildMI(MBB, MII, DL, get(OpcodeAdd), MidReg3)
9024 .addReg(IsCtlz ? MidReg1 : MidReg2)
9025 .addImm(32)
9026 .addImm(1); // enable clamp
9027
9028 BuildMI(MBB, MII, DL, get(AMDGPU::V_MIN_U32_e64), MidReg4)
9029 .addReg(MidReg3)
9030 .addReg(IsCtlz ? MidReg2 : MidReg1);
9031
9032 MRI.replaceRegWith(Dest.getReg(), MidReg4);
9033
9034 addUsersToMoveToVALUWorklist(MidReg4, MRI, Worklist);
9035}
9036
9037void SIInstrInfo::addUsersToMoveToVALUWorklist(
9038 Register DstReg, MachineRegisterInfo &MRI,
9039 SIInstrWorklist &Worklist) const {
9040 for (MachineOperand &MO : make_early_inc_range(MRI.use_operands(DstReg))) {
9041 MachineInstr &UseMI = *MO.getParent();
9042
9043 unsigned OpNo = 0;
9044
9045 switch (UseMI.getOpcode()) {
9046 case AMDGPU::COPY:
9047 case AMDGPU::WQM:
9048 case AMDGPU::SOFT_WQM:
9049 case AMDGPU::STRICT_WWM:
9050 case AMDGPU::STRICT_WQM:
9051 case AMDGPU::REG_SEQUENCE:
9052 case AMDGPU::PHI:
9053 case AMDGPU::INSERT_SUBREG:
9054 break;
9055 default:
9056 OpNo = MO.getOperandNo();
9057 break;
9058 }
9059
9060 const TargetRegisterClass *OpRC = getOpRegClass(UseMI, OpNo);
9061 MRI.constrainRegClass(DstReg, OpRC);
9062
9063 if (!RI.hasVectorRegisters(OpRC))
9064 Worklist.insert(&UseMI);
9065 else
9066 // Legalization could change user list.
9067 legalizeOperandsVALUt16(UseMI, OpNo, MRI);
9068 }
9069}
9070
9071void SIInstrInfo::movePackToVALU(SIInstrWorklist &Worklist,
9072 MachineRegisterInfo &MRI,
9073 MachineInstr &Inst) const {
9074 Register ResultReg = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
9075 MachineBasicBlock *MBB = Inst.getParent();
9076 MachineOperand &Src0 = Inst.getOperand(1);
9077 MachineOperand &Src1 = Inst.getOperand(2);
9078 const DebugLoc &DL = Inst.getDebugLoc();
9079
9080 if (ST.useRealTrue16Insts()) {
9081 Register SrcReg0, SrcReg1;
9082 if (!Src0.isReg() || !RI.isVGPR(MRI, Src0.getReg())) {
9083 SrcReg0 = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
9084 BuildMI(*MBB, Inst, DL, get(AMDGPU::V_MOV_B32_e32), SrcReg0).add(Src0);
9085 } else {
9086 SrcReg0 = Src0.getReg();
9087 }
9088
9089 if (!Src1.isReg() || !RI.isVGPR(MRI, Src1.getReg())) {
9090 SrcReg1 = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
9091 BuildMI(*MBB, Inst, DL, get(AMDGPU::V_MOV_B32_e32), SrcReg1).add(Src1);
9092 } else {
9093 SrcReg1 = Src1.getReg();
9094 }
9095
9096 bool isSrc0Reg16 = MRI.constrainRegClass(SrcReg0, &AMDGPU::VGPR_16RegClass);
9097 bool isSrc1Reg16 = MRI.constrainRegClass(SrcReg1, &AMDGPU::VGPR_16RegClass);
9098
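// Assemble the packed 32-bit result as a REG_SEQUENCE of 16-bit pieces: the
// first register/subregister pair fills lo16 and the second fills hi16, as
// selected by the s_pack_* variant below.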
9099 auto NewMI = BuildMI(*MBB, Inst, DL, get(AMDGPU::REG_SEQUENCE), ResultReg);
9100 switch (Inst.getOpcode()) {
9101 case AMDGPU::S_PACK_LL_B32_B16:
9102 NewMI
9103 .addReg(SrcReg0, 0,
9104 isSrc0Reg16 ? AMDGPU::NoSubRegister : AMDGPU::lo16)
9105 .addImm(AMDGPU::lo16)
9106 .addReg(SrcReg1, 0,
9107 isSrc1Reg16 ? AMDGPU::NoSubRegister : AMDGPU::lo16)
9108 .addImm(AMDGPU::hi16);
9109 break;
9110 case AMDGPU::S_PACK_LH_B32_B16:
9111 NewMI
9112 .addReg(SrcReg0, 0,
9113 isSrc0Reg16 ? AMDGPU::NoSubRegister : AMDGPU::lo16)
9114 .addImm(AMDGPU::lo16)
9115 .addReg(SrcReg1, 0, AMDGPU::hi16)
9116 .addImm(AMDGPU::hi16);
9117 break;
9118 case AMDGPU::S_PACK_HL_B32_B16:
9119 NewMI.addReg(SrcReg0, 0, AMDGPU::hi16)
9120 .addImm(AMDGPU::lo16)
9121 .addReg(SrcReg1, 0,
9122 isSrc1Reg16 ? AMDGPU::NoSubRegister : AMDGPU::lo16)
9123 .addImm(AMDGPU::hi16);
9124 break;
9125 case AMDGPU::S_PACK_HH_B32_B16:
9126 NewMI.addReg(SrcReg0, 0, AMDGPU::hi16)
9127 .addImm(AMDGPU::lo16)
9128 .addReg(SrcReg1, 0, AMDGPU::hi16)
9129 .addImm(AMDGPU::hi16);
9130 break;
9131 default:
9132 llvm_unreachable("unhandled s_pack_* instruction");
9133 }
9134
9135 MachineOperand &Dest = Inst.getOperand(0);
9136 MRI.replaceRegWith(Dest.getReg(), ResultReg);
9137 addUsersToMoveToVALUWorklist(ResultReg, MRI, Worklist);
9138 return;
9139 }
9140
9141 switch (Inst.getOpcode()) {
9142 case AMDGPU::S_PACK_LL_B32_B16: {
9143 Register ImmReg = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
9144 Register TmpReg = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
9145
9146 // FIXME: Can do a lot better if we know the high bits of src0 or src1 are
9147 // 0.
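// The expansion below computes (Src0 & 0xffff) | (Src1 << 16) using
// V_AND_B32 and V_LSHL_OR_B32.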
9148 BuildMI(*MBB, Inst, DL, get(AMDGPU::V_MOV_B32_e32), ImmReg)
9149 .addImm(0xffff);
9150
9151 BuildMI(*MBB, Inst, DL, get(AMDGPU::V_AND_B32_e64), TmpReg)
9152 .addReg(ImmReg, RegState::Kill)
9153 .add(Src0);
9154
9155 BuildMI(*MBB, Inst, DL, get(AMDGPU::V_LSHL_OR_B32_e64), ResultReg)
9156 .add(Src1)
9157 .addImm(16)
9158 .addReg(TmpReg, RegState::Kill);
9159 break;
9160 }
9161 case AMDGPU::S_PACK_LH_B32_B16: {
9162 Register ImmReg = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
9163 BuildMI(*MBB, Inst, DL, get(AMDGPU::V_MOV_B32_e32), ImmReg)
9164 .addImm(0xffff);
9165 BuildMI(*MBB, Inst, DL, get(AMDGPU::V_BFI_B32_e64), ResultReg)
9166 .addReg(ImmReg, RegState::Kill)
9167 .add(Src0)
9168 .add(Src1);
9169 break;
9170 }
9171 case AMDGPU::S_PACK_HL_B32_B16: {
9172 Register TmpReg = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
9173 BuildMI(*MBB, Inst, DL, get(AMDGPU::V_LSHRREV_B32_e64), TmpReg)
9174 .addImm(16)
9175 .add(Src0);
9176 BuildMI(*MBB, Inst, DL, get(AMDGPU::V_LSHL_OR_B32_e64), ResultReg)
9177 .add(Src1)
9178 .addImm(16)
9179 .addReg(TmpReg, RegState::Kill);
9180 break;
9181 }
9182 case AMDGPU::S_PACK_HH_B32_B16: {
9183 Register ImmReg = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
9184 Register TmpReg = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
9185 BuildMI(*MBB, Inst, DL, get(AMDGPU::V_LSHRREV_B32_e64), TmpReg)
9186 .addImm(16)
9187 .add(Src0);
9188 BuildMI(*MBB, Inst, DL, get(AMDGPU::V_MOV_B32_e32), ImmReg)
9189 .addImm(0xffff0000);
9190 BuildMI(*MBB, Inst, DL, get(AMDGPU::V_AND_OR_B32_e64), ResultReg)
9191 .add(Src1)
9192 .addReg(ImmReg, RegState::Kill)
9193 .addReg(TmpReg, RegState::Kill);
9194 break;
9195 }
9196 default:
9197 llvm_unreachable("unhandled s_pack_* instruction");
9198 }
9199
9200 MachineOperand &Dest = Inst.getOperand(0);
9201 MRI.replaceRegWith(Dest.getReg(), ResultReg);
9202 addUsersToMoveToVALUWorklist(ResultReg, MRI, Worklist);
9203}
9204
9205void SIInstrInfo::addSCCDefUsersToVALUWorklist(const MachineOperand &Op,
9206 MachineInstr &SCCDefInst,
9207 SIInstrWorklist &Worklist,
9208 Register NewCond) const {
9209
9210 // Ensure that def inst defines SCC, which is still live.
9211 assert(Op.isReg() && Op.getReg() == AMDGPU::SCC && Op.isDef() &&
9212 !Op.isDead() && Op.getParent() == &SCCDefInst);
9213 SmallVector<MachineInstr *, 4> CopyToDelete;
9214 // This assumes that all the users of SCC are in the same block
9215 // as the SCC def.
9216 for (MachineInstr &MI : // Skip the def inst itself.
9217 make_range(std::next(MachineBasicBlock::iterator(SCCDefInst)),
9218 SCCDefInst.getParent()->end())) {
9219 // Check if SCC is used first.
9220 int SCCIdx = MI.findRegisterUseOperandIdx(AMDGPU::SCC, &RI, false);
9221 if (SCCIdx != -1) {
9222 if (MI.isCopy()) {
9223 MachineRegisterInfo &MRI = MI.getParent()->getParent()->getRegInfo();
9224 Register DestReg = MI.getOperand(0).getReg();
9225
9226 MRI.replaceRegWith(DestReg, NewCond);
9227 CopyToDelete.push_back(&MI);
9228 } else {
9229
9230 if (NewCond.isValid())
9231 MI.getOperand(SCCIdx).setReg(NewCond);
9232
9233 Worklist.insert(&MI);
9234 }
9235 }
9236 // Exit if we find another SCC def.
9237 if (MI.findRegisterDefOperandIdx(AMDGPU::SCC, &RI, false, false) != -1)
9238 break;
9239 }
9240 for (auto &Copy : CopyToDelete)
9241 Copy->eraseFromParent();
9242}
9243
9244// Instructions that use SCC may be converted to VALU instructions. When that
9245// happens, the SCC register is changed to VCC_LO. The instruction that defines
9246// SCC must be changed to an instruction that defines VCC. This function makes
9247// sure that the instruction that defines SCC is added to the moveToVALU
9248// worklist.
9249void SIInstrInfo::addSCCDefsToVALUWorklist(MachineInstr *SCCUseInst,
9250 SIInstrWorklist &Worklist) const {
9251 // Look for a preceding instruction that either defines VCC or SCC. If VCC
9252 // then there is nothing to do because the defining instruction has been
9253 // converted to a VALU already. If SCC then that instruction needs to be
9254 // converted to a VALU.
9255 for (MachineInstr &MI :
9256 make_range(std::next(MachineBasicBlock::reverse_iterator(SCCUseInst)),
9257 SCCUseInst->getParent()->rend())) {
9258 if (MI.modifiesRegister(AMDGPU::VCC, &RI))
9259 break;
9260 if (MI.definesRegister(AMDGPU::SCC, &RI)) {
9261 Worklist.insert(&MI);
9262 break;
9263 }
9264 }
9265}
9266
9267const TargetRegisterClass *SIInstrInfo::getDestEquivalentVGPRClass(
9268 const MachineInstr &Inst) const {
9269 const TargetRegisterClass *NewDstRC = getOpRegClass(Inst, 0);
9270
9271 switch (Inst.getOpcode()) {
9272 // For target instructions, getOpRegClass just returns the virtual register
9273 // class associated with the operand, so we need to find an equivalent VGPR
9274 // register class in order to move the instruction to the VALU.
9275 case AMDGPU::COPY:
9276 case AMDGPU::PHI:
9277 case AMDGPU::REG_SEQUENCE:
9278 case AMDGPU::INSERT_SUBREG:
9279 case AMDGPU::WQM:
9280 case AMDGPU::SOFT_WQM:
9281 case AMDGPU::STRICT_WWM:
9282 case AMDGPU::STRICT_WQM: {
9283 const TargetRegisterClass *SrcRC = getOpRegClass(Inst, 1);
9284 if (RI.isAGPRClass(SrcRC)) {
9285 if (RI.isAGPRClass(NewDstRC))
9286 return nullptr;
9287
9288 switch (Inst.getOpcode()) {
9289 case AMDGPU::PHI:
9290 case AMDGPU::REG_SEQUENCE:
9291 case AMDGPU::INSERT_SUBREG:
9292 NewDstRC = RI.getEquivalentAGPRClass(NewDstRC);
9293 break;
9294 default:
9295 NewDstRC = RI.getEquivalentVGPRClass(NewDstRC);
9296 }
9297
9298 if (!NewDstRC)
9299 return nullptr;
9300 } else {
9301 if (RI.isVGPRClass(NewDstRC) || NewDstRC == &AMDGPU::VReg_1RegClass)
9302 return nullptr;
9303
9304 NewDstRC = RI.getEquivalentVGPRClass(NewDstRC);
9305 if (!NewDstRC)
9306 return nullptr;
9307 }
9308
9309 return NewDstRC;
9310 }
9311 default:
9312 return NewDstRC;
9313 }
9314}
9315
9316// Find the one SGPR operand we are allowed to use.
9317Register SIInstrInfo::findUsedSGPR(const MachineInstr &MI,
9318 int OpIndices[3]) const {
9319 const MCInstrDesc &Desc = MI.getDesc();
9320
9321 // Find the one SGPR operand we are allowed to use.
9322 //
9323 // First we need to consider the instruction's operand requirements before
9324 // legalizing. Some operands are required to be SGPRs, such as implicit uses
9325 // of VCC, but we are still bound by the constant bus requirement to only use
9326 // one.
9327 //
9328 // If the operand's class is an SGPR, we can never move it.
9329
9330 Register SGPRReg = findImplicitSGPRRead(MI);
9331 if (SGPRReg)
9332 return SGPRReg;
9333
9334 Register UsedSGPRs[3] = {Register()};
9335 const MachineRegisterInfo &MRI = MI.getParent()->getParent()->getRegInfo();
9336
9337 for (unsigned i = 0; i < 3; ++i) {
9338 int Idx = OpIndices[i];
9339 if (Idx == -1)
9340 break;
9341
9342 const MachineOperand &MO = MI.getOperand(Idx);
9343 if (!MO.isReg())
9344 continue;
9345
9346 // Is this operand statically required to be an SGPR based on the operand
9347 // constraints?
9348 const TargetRegisterClass *OpRC =
9349 RI.getRegClass(getOpRegClassID(Desc.operands()[Idx]));
9350 bool IsRequiredSGPR = RI.isSGPRClass(OpRC);
9351 if (IsRequiredSGPR)
9352 return MO.getReg();
9353
9354 // If this could be a VGPR or an SGPR, check the dynamic register class.
9355 Register Reg = MO.getReg();
9356 const TargetRegisterClass *RegRC = MRI.getRegClass(Reg);
9357 if (RI.isSGPRClass(RegRC))
9358 UsedSGPRs[i] = Reg;
9359 }
9360
9361 // We don't have a required SGPR operand, so we have a bit more freedom in
9362 // selecting operands to move.
9363
9364 // Try to select the most used SGPR. If an SGPR is equal to one of the
9365 // others, we choose that.
9366 //
9367 // e.g.
9368 // V_FMA_F32 v0, s0, s0, s0 -> No moves
9369 // V_FMA_F32 v0, s0, s1, s0 -> Move s1
9370
9371 // TODO: If some of the operands are 64-bit SGPRs and some 32, we should
9372 // prefer those.
9373
9374 if (UsedSGPRs[0]) {
9375 if (UsedSGPRs[0] == UsedSGPRs[1] || UsedSGPRs[0] == UsedSGPRs[2])
9376 SGPRReg = UsedSGPRs[0];
9377 }
9378
9379 if (!SGPRReg && UsedSGPRs[1]) {
9380 if (UsedSGPRs[1] == UsedSGPRs[2])
9381 SGPRReg = UsedSGPRs[1];
9382 }
9383
9384 return SGPRReg;
9385}
9386
9387 MachineOperand *SIInstrInfo::getNamedOperand(MachineInstr &MI,
9388 AMDGPU::OpName OperandName) const {
9389 if (OperandName == AMDGPU::OpName::NUM_OPERAND_NAMES)
9390 return nullptr;
9391
9392 int Idx = AMDGPU::getNamedOperandIdx(MI.getOpcode(), OperandName);
9393 if (Idx == -1)
9394 return nullptr;
9395
9396 return &MI.getOperand(Idx);
9397}
9398
9399 uint64_t SIInstrInfo::getDefaultRsrcDataFormat() const {
9400 if (ST.getGeneration() >= AMDGPUSubtarget::GFX10) {
9401 int64_t Format = ST.getGeneration() >= AMDGPUSubtarget::GFX11
9402 ? (int64_t)AMDGPU::UfmtGFX11::UFMT_32_FLOAT
9403 : (int64_t)AMDGPU::UfmtGFX10::UFMT_32_FLOAT;
9404 return (Format << 44) |
9405 (1ULL << 56) | // RESOURCE_LEVEL = 1
9406 (3ULL << 60); // OOB_SELECT = 3
9407 }
9408
9409 uint64_t RsrcDataFormat = AMDGPU::RSRC_DATA_FORMAT;
9410 if (ST.isAmdHsaOS()) {
9411 // Set ATC = 1. GFX9 doesn't have this bit.
9412 if (ST.getGeneration() <= AMDGPUSubtarget::VOLCANIC_ISLANDS)
9413 RsrcDataFormat |= (1ULL << 56);
9414
9415 // Set MTYPE = 2 (MTYPE_UC = uncached). GFX9 doesn't have this.
9416 // BTW, it disables TC L2 and therefore decreases performance.
9417 if (ST.getGeneration() == AMDGPUSubtarget::VOLCANIC_ISLANDS)
9418 RsrcDataFormat |= (2ULL << 59);
9419 }
9420
9421 return RsrcDataFormat;
9422}
9423
9424 uint64_t SIInstrInfo::getScratchRsrcWords23() const {
9425 uint64_t Rsrc23 = AMDGPU::RSRC_DATA_FORMAT |
9426 AMDGPU::RSRC_TID_ENABLE |
9427 0xffffffff; // Size;
9428
9429 // GFX9 doesn't have ELEMENT_SIZE.
9430 if (ST.getGeneration() <= AMDGPUSubtarget::VOLCANIC_ISLANDS) {
9431 uint64_t EltSizeValue = Log2_32(ST.getMaxPrivateElementSize(true)) - 1;
9432 Rsrc23 |= EltSizeValue << AMDGPU::RSRC_ELEMENT_SIZE_SHIFT;
9433 }
9434
9435 // IndexStride = 64 for wave64, 32 for wave32 (encoded as 3 and 2).
9436 uint64_t IndexStride = ST.isWave64() ? 3 : 2;
9437 Rsrc23 |= IndexStride << AMDGPU::RSRC_INDEX_STRIDE_SHIFT;
9438
9439 // If TID_ENABLE is set, DATA_FORMAT specifies stride bits [14:17].
9440 // Clear them unless we want a huge stride.
9441 if (ST.getGeneration() >= AMDGPUSubtarget::VOLCANIC_ISLANDS &&
9442 ST.getGeneration() <= AMDGPUSubtarget::GFX9)
9443 Rsrc23 &= ~AMDGPU::RSRC_DATA_FORMAT;
9444
9445 return Rsrc23;
9446}
9447
9448 bool SIInstrInfo::isLowLatencyInstruction(const MachineInstr &MI) const {
9449 unsigned Opc = MI.getOpcode();
9450
9451 return isSMRD(Opc);
9452}
9453
9454 bool SIInstrInfo::isHighLatencyDef(int Opc) const {
9455 return get(Opc).mayLoad() &&
9456 (isMUBUF(Opc) || isMTBUF(Opc) || isMIMG(Opc) || isFLAT(Opc));
9457}
9458
9459 Register SIInstrInfo::isStackAccess(const MachineInstr &MI,
9460 int &FrameIndex) const {
9461 const MachineOperand *Addr = getNamedOperand(MI, AMDGPU::OpName::vaddr);
9462 if (!Addr || !Addr->isFI())
9463 return Register();
9464
9465 assert(!MI.memoperands_empty() &&
9466 (*MI.memoperands_begin())->getAddrSpace() == AMDGPUAS::PRIVATE_ADDRESS);
9467
9468 FrameIndex = Addr->getIndex();
9469 return getNamedOperand(MI, AMDGPU::OpName::vdata)->getReg();
9470}
9471
9472 Register SIInstrInfo::isSGPRStackAccess(const MachineInstr &MI,
9473 int &FrameIndex) const {
9474 const MachineOperand *Addr = getNamedOperand(MI, AMDGPU::OpName::addr);
9475 assert(Addr && Addr->isFI());
9476 FrameIndex = Addr->getIndex();
9477 return getNamedOperand(MI, AMDGPU::OpName::data)->getReg();
9478}
9479
9480 Register SIInstrInfo::isLoadFromStackSlot(const MachineInstr &MI,
9481 int &FrameIndex) const {
9482 if (!MI.mayLoad())
9483 return Register();
9484
9485 if (isMUBUF(MI) || isVGPRSpill(MI))
9486 return isStackAccess(MI, FrameIndex);
9487
9488 if (isSGPRSpill(MI))
9489 return isSGPRStackAccess(MI, FrameIndex);
9490
9491 return Register();
9492}
9493
9494 Register SIInstrInfo::isStoreToStackSlot(const MachineInstr &MI,
9495 int &FrameIndex) const {
9496 if (!MI.mayStore())
9497 return Register();
9498
9499 if (isMUBUF(MI) || isVGPRSpill(MI))
9500 return isStackAccess(MI, FrameIndex);
9501
9502 if (isSGPRSpill(MI))
9503 return isSGPRStackAccess(MI, FrameIndex);
9504
9505 return Register();
9506}
9507
9508 unsigned SIInstrInfo::getInstBundleSize(const MachineInstr &MI) const {
9509 unsigned Size = 0;
9510 MachineBasicBlock::const_instr_iterator I = MI.getIterator();
9511 MachineBasicBlock::const_instr_iterator E = MI.getParent()->instr_end();
9512 while (++I != E && I->isInsideBundle()) {
9513 assert(!I->isBundle() && "No nested bundle!");
9514 Size += getInstSizeInBytes(*I);
9515 }
9516
9517 return Size;
9518}
9519
9520 unsigned SIInstrInfo::getInstSizeInBytes(const MachineInstr &MI) const {
9521 unsigned Opc = MI.getOpcode();
9522 const MCInstrDesc &Desc = getMCOpcodeFromPseudo(Opc);
9523 unsigned DescSize = Desc.getSize();
9524
9525 // If we have a definitive size, we can use it. Otherwise we need to inspect
9526 // the operands to know the size.
9527 if (isFixedSize(MI)) {
9528 unsigned Size = DescSize;
9529
9530 // If we hit the buggy offset, an extra nop will be inserted in MC so
9531 // estimate the worst case.
9532 if (MI.isBranch() && ST.hasOffset3fBug())
9533 Size += 4;
9534
9535 return Size;
9536 }
9537
9538 // Instructions may have a 32-bit literal encoded after them. Check
9539 // operands that could ever be literals.
9540 if (isVALU(MI) || isSALU(MI)) {
9541 if (isDPP(MI))
9542 return DescSize;
9543 bool HasLiteral = false;
9544 unsigned LiteralSize = 4;
9545 for (int I = 0, E = MI.getNumExplicitOperands(); I != E; ++I) {
9546 const MachineOperand &Op = MI.getOperand(I);
9547 const MCOperandInfo &OpInfo = Desc.operands()[I];
9548 if (!Op.isReg() && !isInlineConstant(Op, OpInfo)) {
9549 HasLiteral = true;
9550 if (ST.has64BitLiterals()) {
9551 switch (OpInfo.OperandType) {
9552 default:
9553 break;
9554 case AMDGPU::OPERAND_REG_IMM_FP64:
9555 if (!AMDGPU::isValid32BitLiteral(Op.getImm(), true))
9556 LiteralSize = 8;
9557 break;
9558 case AMDGPU::OPERAND_REG_IMM_INT64:
9559 if (!Op.isImm() || !AMDGPU::isValid32BitLiteral(Op.getImm(), false))
9560 LiteralSize = 8;
9561 break;
9562 }
9563 }
9564 break;
9565 }
9566 }
9567 return HasLiteral ? DescSize + LiteralSize : DescSize;
9568 }
9569
9570 // Check whether we have extra NSA words.
9571 if (isMIMG(MI)) {
9572 int VAddr0Idx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::vaddr0);
9573 if (VAddr0Idx < 0)
9574 return 8;
9575
9576 int RSrcIdx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::srsrc);
9577 return 8 + 4 * ((RSrcIdx - VAddr0Idx + 2) / 4);
9578 }
9579
9580 switch (Opc) {
9581 case TargetOpcode::BUNDLE:
9582 return getInstBundleSize(MI);
9583 case TargetOpcode::INLINEASM:
9584 case TargetOpcode::INLINEASM_BR: {
9585 const MachineFunction *MF = MI.getParent()->getParent();
9586 const char *AsmStr = MI.getOperand(0).getSymbolName();
9587 return getInlineAsmLength(AsmStr, *MF->getTarget().getMCAsmInfo(), &ST);
9588 }
9589 default:
9590 if (MI.isMetaInstruction())
9591 return 0;
9592
9593 // If D16 Pseudo inst, get correct MC code size
9594 const auto *D16Info = AMDGPU::getT16D16Helper(Opc);
9595 if (D16Info) {
9596 // Assume the d16_lo/hi instructions are always the same size.
9597 unsigned LoInstOpcode = D16Info->LoOp;
9598 const MCInstrDesc &Desc = getMCOpcodeFromPseudo(LoInstOpcode);
9599 DescSize = Desc.getSize();
9600 }
9601
9602 // If FMA Pseudo inst, get correct MC code size
9603 if (Opc == AMDGPU::V_FMA_MIX_F16_t16 || Opc == AMDGPU::V_FMA_MIX_BF16_t16) {
9604 // All potential lowerings are the same size; arbitrarily pick one.
9605 const MCInstrDesc &Desc = getMCOpcodeFromPseudo(AMDGPU::V_FMA_MIXLO_F16);
9606 DescSize = Desc.getSize();
9607 }
9608
9609 return DescSize;
9610 }
9611}
9612
9614 if (!isFLAT(MI))
9615 return false;
9616
9617 if (MI.memoperands_empty())
9618 return true;
9619
9620 for (const MachineMemOperand *MMO : MI.memoperands()) {
9621 if (MMO->getAddrSpace() == AMDGPUAS::FLAT_ADDRESS)
9622 return true;
9623 }
9624 return false;
9625}
9626
9627 ArrayRef<std::pair<int, const char *>>
9628 SIInstrInfo::getSerializableTargetIndices() const {
9629 static const std::pair<int, const char *> TargetIndices[] = {
9630 {AMDGPU::TI_CONSTDATA_START, "amdgpu-constdata-start"},
9631 {AMDGPU::TI_SCRATCH_RSRC_DWORD0, "amdgpu-scratch-rsrc-dword0"},
9632 {AMDGPU::TI_SCRATCH_RSRC_DWORD1, "amdgpu-scratch-rsrc-dword1"},
9633 {AMDGPU::TI_SCRATCH_RSRC_DWORD2, "amdgpu-scratch-rsrc-dword2"},
9634 {AMDGPU::TI_SCRATCH_RSRC_DWORD3, "amdgpu-scratch-rsrc-dword3"}};
9635 return ArrayRef(TargetIndices);
9636}
9637
9638/// This is used by the post-RA scheduler (SchedulePostRAList.cpp). The
9639/// post-RA version of misched uses CreateTargetMIHazardRecognizer.
9640 ScheduleHazardRecognizer *
9641 SIInstrInfo::CreateTargetPostRAHazardRecognizer(const InstrItineraryData *II,
9642 const ScheduleDAG *DAG) const {
9643 return new GCNHazardRecognizer(DAG->MF);
9644 }
9645
9646/// This is the hazard recognizer used at -O0 by the PostRAHazardRecognizer
9647/// pass.
9648 ScheduleHazardRecognizer *
9649 SIInstrInfo::CreateTargetPostRAHazardRecognizer(const MachineFunction &MF) const {
9650 return new GCNHazardRecognizer(MF);
9651 }
9652
9653// Called during:
9654// - pre-RA scheduling and post-RA scheduling
9655 ScheduleHazardRecognizer *
9656 SIInstrInfo::CreateTargetMIHazardRecognizer(const InstrItineraryData *II,
9657 const ScheduleDAGMI *DAG) const {
9658 // Borrowed from Arm Target
9659 // We would like to restrict this hazard recognizer to only
9660 // post-RA scheduling; we can tell that we're post-RA because we don't
9661 // track VRegLiveness.
9662 if (!DAG->hasVRegLiveness())
9663 return new GCNHazardRecognizer(DAG->MF);
9665}
9666
9667std::pair<unsigned, unsigned>
9668 SIInstrInfo::decomposeMachineOperandsTargetFlags(unsigned TF) const {
9669 return std::pair(TF & MO_MASK, TF & ~MO_MASK);
9670}
9671
9672 ArrayRef<std::pair<unsigned, const char *>>
9673 SIInstrInfo::getSerializableDirectMachineOperandTargetFlags() const {
9674 static const std::pair<unsigned, const char *> TargetFlags[] = {
9675 {MO_GOTPCREL, "amdgpu-gotprel"},
9676 {MO_GOTPCREL32_LO, "amdgpu-gotprel32-lo"},
9677 {MO_GOTPCREL32_HI, "amdgpu-gotprel32-hi"},
9678 {MO_GOTPCREL64, "amdgpu-gotprel64"},
9679 {MO_REL32_LO, "amdgpu-rel32-lo"},
9680 {MO_REL32_HI, "amdgpu-rel32-hi"},
9681 {MO_REL64, "amdgpu-rel64"},
9682 {MO_ABS32_LO, "amdgpu-abs32-lo"},
9683 {MO_ABS32_HI, "amdgpu-abs32-hi"},
9684 {MO_ABS64, "amdgpu-abs64"},
9685 };
9686
9687 return ArrayRef(TargetFlags);
9688}
9689
9690 ArrayRef<std::pair<MachineMemOperand::Flags, const char *>>
9691 SIInstrInfo::getSerializableMachineMemOperandTargetFlags() const {
9692 static const std::pair<MachineMemOperand::Flags, const char *> TargetFlags[] =
9693 {
9694 {MONoClobber, "amdgpu-noclobber"},
9695 {MOLastUse, "amdgpu-last-use"},
9696 {MOCooperative, "amdgpu-cooperative"},
9697 };
9698
9699 return ArrayRef(TargetFlags);
9700}
9701
9702 unsigned SIInstrInfo::getLiveRangeSplitOpcode(Register SrcReg,
9703 const MachineFunction &MF) const {
9704 const SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();
9705 assert(SrcReg.isVirtual());
9706 if (MFI->checkFlag(SrcReg, AMDGPU::VirtRegFlag::WWM_REG))
9707 return AMDGPU::WWM_COPY;
9708
9709 return AMDGPU::COPY;
9710}
9711
9712 bool SIInstrInfo::isBasicBlockPrologue(const MachineInstr &MI,
9713 Register Reg) const {
9714 // We need to handle instructions which may be inserted during register
9715 // allocation to handle the prolog. The initial prolog instruction may have
9716 // been separated from the start of the block by spills and copies inserted
9717 // that the prolog needs. However, the insertions for scalar registers can
9718 // always be placed at the BB top as they are independent of the exec mask
9719 // value.
9720 const MachineFunction *MF = MI.getParent()->getParent();
9721 bool IsNullOrVectorRegister = true;
9722 if (Reg) {
9723 const MachineRegisterInfo &MRI = MF->getRegInfo();
9724 IsNullOrVectorRegister = !RI.isSGPRClass(RI.getRegClassForReg(MRI, Reg));
9725 }
9726
9727 uint16_t Opcode = MI.getOpcode();
9728 const SIMachineFunctionInfo *MFI = MF->getInfo<SIMachineFunctionInfo>();
9729 return IsNullOrVectorRegister &&
9730 (isSGPRSpill(Opcode) || isWWMRegSpillOpcode(Opcode) ||
9731 (Opcode == AMDGPU::IMPLICIT_DEF &&
9732 MFI->isWWMReg(MI.getOperand(0).getReg())) ||
9733 (!MI.isTerminator() && Opcode != AMDGPU::COPY &&
9734 MI.modifiesRegister(AMDGPU::EXEC, &RI)));
9735}
9736
9737 MachineInstrBuilder
9738 SIInstrInfo::getAddNoCarry(MachineBasicBlock &MBB,
9739 MachineBasicBlock::iterator I,
9740 const DebugLoc &DL,
9741 Register DestReg) const {
9742 if (ST.hasAddNoCarry())
9743 return BuildMI(MBB, I, DL, get(AMDGPU::V_ADD_U32_e64), DestReg);
9744
9745 MachineRegisterInfo &MRI = MBB.getParent()->getRegInfo();
9746 Register UnusedCarry = MRI.createVirtualRegister(RI.getBoolRC());
9747 MRI.setRegAllocationHint(UnusedCarry, 0, RI.getVCC());
9748
9749 return BuildMI(MBB, I, DL, get(AMDGPU::V_ADD_CO_U32_e64), DestReg)
9750 .addReg(UnusedCarry, RegState::Define | RegState::Dead);
9751}
9752
9753 MachineInstrBuilder SIInstrInfo::getAddNoCarry(MachineBasicBlock &MBB,
9754 MachineBasicBlock::iterator I,
9755 const DebugLoc &DL,
9756 Register DestReg,
9757 RegScavenger &RS) const {
9758 if (ST.hasAddNoCarry())
9759 return BuildMI(MBB, I, DL, get(AMDGPU::V_ADD_U32_e32), DestReg);
9760
9761 // If available, prefer to use vcc.
9762 Register UnusedCarry = !RS.isRegUsed(AMDGPU::VCC)
9763 ? Register(RI.getVCC())
9764 : RS.scavengeRegisterBackwards(
9765 *RI.getBoolRC(), I, /* RestoreAfter */ false,
9766 0, /* AllowSpill */ false);
9767
9768 // TODO: Users need to deal with this.
9769 if (!UnusedCarry.isValid())
9770 return MachineInstrBuilder();
9771
9772 return BuildMI(MBB, I, DL, get(AMDGPU::V_ADD_CO_U32_e64), DestReg)
9773 .addReg(UnusedCarry, RegState::Define | RegState::Dead);
9774}
9775
9776bool SIInstrInfo::isKillTerminator(unsigned Opcode) {
9777 switch (Opcode) {
9778 case AMDGPU::SI_KILL_F32_COND_IMM_TERMINATOR:
9779 case AMDGPU::SI_KILL_I1_TERMINATOR:
9780 return true;
9781 default:
9782 return false;
9783 }
9784}
9785
9786 const MCInstrDesc &SIInstrInfo::getKillTerminatorFromPseudo(unsigned Opcode) const {
9787 switch (Opcode) {
9788 case AMDGPU::SI_KILL_F32_COND_IMM_PSEUDO:
9789 return get(AMDGPU::SI_KILL_F32_COND_IMM_TERMINATOR);
9790 case AMDGPU::SI_KILL_I1_PSEUDO:
9791 return get(AMDGPU::SI_KILL_I1_TERMINATOR);
9792 default:
9793 llvm_unreachable("invalid opcode, expected SI_KILL_*_PSEUDO");
9794 }
9795}
9796
9797bool SIInstrInfo::isLegalMUBUFImmOffset(unsigned Imm) const {
9798 return Imm <= getMaxMUBUFImmOffset(ST);
9799}
9800
9801 unsigned SIInstrInfo::getMaxMUBUFImmOffset(const GCNSubtarget &ST) {
9802 // GFX12 field is non-negative 24-bit signed byte offset.
9803 const unsigned OffsetBits =
9804 ST.getGeneration() >= AMDGPUSubtarget::GFX12 ? 23 : 12;
9805 return (1 << OffsetBits) - 1;
9806}
9807
9808 void SIInstrInfo::fixImplicitOperands(MachineInstr &MI) const {
9809 if (!ST.isWave32())
9810 return;
9811
9812 if (MI.isInlineAsm())
9813 return;
9814
9815 for (auto &Op : MI.implicit_operands()) {
9816 if (Op.isReg() && Op.getReg() == AMDGPU::VCC)
9817 Op.setReg(AMDGPU::VCC_LO);
9818 }
9819}
9820
9821 bool SIInstrInfo::isBufferSMRD(const MachineInstr &MI) const {
9822 if (!isSMRD(MI))
9823 return false;
9824
9825 // Check that it is using a buffer resource.
9826 int Idx = AMDGPU::getNamedOperandIdx(MI.getOpcode(), AMDGPU::OpName::sbase);
9827 if (Idx == -1) // e.g. s_memtime
9828 return false;
9829
9830 const int16_t RCID = getOpRegClassID(MI.getDesc().operands()[Idx]);
9831 return RI.getRegClass(RCID)->hasSubClassEq(&AMDGPU::SGPR_128RegClass);
9832}
9833
9834// Given Imm, split it into the values to put into the SOffset and ImmOffset
9835// fields in an MUBUF instruction. Return false if it is not possible (due to a
9836// hardware bug needing a workaround).
9837//
9838// The required alignment ensures that individual address components remain
9839// aligned if they are aligned to begin with. It also ensures that additional
9840// offsets within the given alignment can be added to the resulting ImmOffset.
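// Illustrative example (assuming a 4095-byte max immediate offset and 4-byte
// alignment): Imm = 8192 exceeds MaxImm + 64, so the split yields ImmOffset = 4
// and SOffset = 8188, which still sum to the original 8192.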
9841 bool SIInstrInfo::splitMUBUFOffset(uint32_t Imm, uint32_t &SOffset,
9842 uint32_t &ImmOffset, Align Alignment) const {
9843 const uint32_t MaxOffset = SIInstrInfo::getMaxMUBUFImmOffset(ST);
9844 const uint32_t MaxImm = alignDown(MaxOffset, Alignment.value());
9845 uint32_t Overflow = 0;
9846
9847 if (Imm > MaxImm) {
9848 if (Imm <= MaxImm + 64) {
9849 // Use an SOffset inline constant for 4..64
9850 Overflow = Imm - MaxImm;
9851 Imm = MaxImm;
9852 } else {
9853 // Try to keep the same value in SOffset for adjacent loads, so that
9854 // the corresponding register contents can be re-used.
9855 //
9856 // Load values with all low-bits (except for alignment bits) set into
9857 // SOffset, so that a larger range of values can be covered using
9858 // s_movk_i32.
9859 //
9860 // Atomic operations fail to work correctly when individual address
9861 // components are unaligned, even if their sum is aligned.
9862 uint32_t High = (Imm + Alignment.value()) & ~MaxOffset;
9863 uint32_t Low = (Imm + Alignment.value()) & MaxOffset;
9864 Imm = Low;
9865 Overflow = High - Alignment.value();
9866 }
9867 }
9868
9869 if (Overflow > 0) {
9870 // There is a hardware bug in SI and CI which prevents address clamping in
9871 // MUBUF instructions from working correctly with SOffsets. The immediate
9872 // offset is unaffected.
9873 if (ST.getGeneration() <= AMDGPUSubtarget::SEA_ISLANDS)
9874 return false;
9875
9876 // It is not possible to set immediate in SOffset field on some targets.
9877 if (ST.hasRestrictedSOffset())
9878 return false;
9879 }
9880
9881 ImmOffset = Imm;
9882 SOffset = Overflow;
9883 return true;
9884}
9885
9886// Depending on the used address space and instructions, some immediate offsets
9887// are allowed and some are not.
9888// Pre-GFX12, flat instruction offsets can only be non-negative, global and
9889// scratch instruction offsets can also be negative. On GFX12, offsets can be
9890// negative for all variants.
9891//
9892// There are several bugs related to these offsets:
9893// On gfx10.1, flat instructions that go into the global address space cannot
9894// use an offset.
9895//
9896// For scratch instructions, the address can be either an SGPR or a VGPR.
9897// The following offsets can be used, depending on the architecture (x means
9898// cannot be used):
9899// +----------------------------+------+------+
9900// | Address-Mode | SGPR | VGPR |
9901// +----------------------------+------+------+
9902// | gfx9 | | |
9903// | negative, 4-aligned offset | x | ok |
9904// | negative, unaligned offset | x | ok |
9905// +----------------------------+------+------+
9906// | gfx10 | | |
9907// | negative, 4-aligned offset | ok | ok |
9908// | negative, unaligned offset | ok | x |
9909// +----------------------------+------+------+
9910// | gfx10.3 | | |
9911// | negative, 4-aligned offset | ok | ok |
9912// | negative, unaligned offset | ok | ok |
9913// +----------------------------+------+------+
9914//
9915// This function ignores the addressing mode, so if an offset cannot be used in
9916// one addressing mode, it is considered illegal.
9917bool SIInstrInfo::isLegalFLATOffset(int64_t Offset, unsigned AddrSpace,
9918 uint64_t FlatVariant) const {
9919 // TODO: Should 0 be special cased?
9920 if (!ST.hasFlatInstOffsets())
9921 return false;
9922
9923 if (ST.hasFlatSegmentOffsetBug() && FlatVariant == SIInstrFlags::FLAT &&
9924 (AddrSpace == AMDGPUAS::FLAT_ADDRESS ||
9925 AddrSpace == AMDGPUAS::GLOBAL_ADDRESS))
9926 return false;
9927
9928 if (ST.hasNegativeUnalignedScratchOffsetBug() &&
9929 FlatVariant == SIInstrFlags::FlatScratch && Offset < 0 &&
9930 (Offset % 4) != 0) {
9931 return false;
9932 }
9933
9934 bool AllowNegative = allowNegativeFlatOffset(FlatVariant);
9935 unsigned N = AMDGPU::getNumFlatOffsetBits(ST);
9936 return isIntN(N, Offset) && (AllowNegative || Offset >= 0);
9937}
9938
9939// See comment on SIInstrInfo::isLegalFLATOffset for what is legal and what not.
9940std::pair<int64_t, int64_t>
9941SIInstrInfo::splitFlatOffset(int64_t COffsetVal, unsigned AddrSpace,
9942 uint64_t FlatVariant) const {
9943 int64_t RemainderOffset = COffsetVal;
9944 int64_t ImmField = 0;
9945
9946 bool AllowNegative = allowNegativeFlatOffset(FlatVariant);
9947 const unsigned NumBits = AMDGPU::getNumFlatOffsetBits(ST) - 1;
9948
9949 if (AllowNegative) {
9950 // Use signed division by a power of two to truncate towards 0.
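// For example (illustrative, assuming a 13-bit signed immediate field, i.e.
// NumBits == 12): D = 4096, and COffsetVal = -5000 splits into
// RemainderOffset = -4096 and ImmField = -904.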
9951 int64_t D = 1LL << NumBits;
9952 RemainderOffset = (COffsetVal / D) * D;
9953 ImmField = COffsetVal - RemainderOffset;
9954
9955 if (ST.hasNegativeUnalignedScratchOffsetBug() &&
9956 FlatVariant == SIInstrFlags::FlatScratch && ImmField < 0 &&
9957 (ImmField % 4) != 0) {
9958 // Make ImmField a multiple of 4
9959 RemainderOffset += ImmField % 4;
9960 ImmField -= ImmField % 4;
9961 }
9962 } else if (COffsetVal >= 0) {
9963 ImmField = COffsetVal & maskTrailingOnes<uint64_t>(NumBits);
9964 RemainderOffset = COffsetVal - ImmField;
9965 }
9966
9967 assert(isLegalFLATOffset(ImmField, AddrSpace, FlatVariant));
9968 assert(RemainderOffset + ImmField == COffsetVal);
9969 return {ImmField, RemainderOffset};
9970}
9971
9972 bool SIInstrInfo::allowNegativeFlatOffset(uint64_t FlatVariant) const {
9973 if (ST.hasNegativeScratchOffsetBug() &&
9974 FlatVariant == SIInstrFlags::FlatScratch)
9975 return false;
9976
9977 return FlatVariant != SIInstrFlags::FLAT || AMDGPU::isGFX12Plus(ST);
9978}
9979
9980static unsigned subtargetEncodingFamily(const GCNSubtarget &ST) {
9981 switch (ST.getGeneration()) {
9982 default:
9983 break;
9984 case AMDGPUSubtarget::SOUTHERN_ISLANDS:
9985 case AMDGPUSubtarget::SEA_ISLANDS:
9986 return SIEncodingFamily::SI;
9987 case AMDGPUSubtarget::VOLCANIC_ISLANDS:
9988 case AMDGPUSubtarget::GFX9:
9989 return SIEncodingFamily::VI;
9990 case AMDGPUSubtarget::GFX10:
9991 return SIEncodingFamily::GFX10;
9992 case AMDGPUSubtarget::GFX11:
9993 return SIEncodingFamily::GFX11;
9994 case AMDGPUSubtarget::GFX12:
9995 return ST.hasGFX1250Insts() ? SIEncodingFamily::GFX1250
9996 : SIEncodingFamily::GFX12;
9997 }
9998 llvm_unreachable("Unknown subtarget generation!");
9999}
10000
10001bool SIInstrInfo::isAsmOnlyOpcode(int MCOp) const {
10002 switch(MCOp) {
10003 // These opcodes use indirect register addressing so
10004 // they need special handling by codegen (currently missing).
10005 // Therefore it is too risky to allow these opcodes
10006 // to be selected by dpp combiner or sdwa peepholer.
10007 case AMDGPU::V_MOVRELS_B32_dpp_gfx10:
10008 case AMDGPU::V_MOVRELS_B32_sdwa_gfx10:
10009 case AMDGPU::V_MOVRELD_B32_dpp_gfx10:
10010 case AMDGPU::V_MOVRELD_B32_sdwa_gfx10:
10011 case AMDGPU::V_MOVRELSD_B32_dpp_gfx10:
10012 case AMDGPU::V_MOVRELSD_B32_sdwa_gfx10:
10013 case AMDGPU::V_MOVRELSD_2_B32_dpp_gfx10:
10014 case AMDGPU::V_MOVRELSD_2_B32_sdwa_gfx10:
10015 return true;
10016 default:
10017 return false;
10018 }
10019}
10020
10021#define GENERATE_RENAMED_GFX9_CASES(OPCODE) \
10022 case OPCODE##_dpp: \
10023 case OPCODE##_e32: \
10024 case OPCODE##_e64: \
10025 case OPCODE##_e64_dpp: \
10026 case OPCODE##_sdwa:
10027
10028static bool isRenamedInGFX9(int Opcode) {
10029 switch (Opcode) {
10030 GENERATE_RENAMED_GFX9_CASES(AMDGPU::V_ADDC_U32)
10031 GENERATE_RENAMED_GFX9_CASES(AMDGPU::V_ADD_CO_U32)
10032 GENERATE_RENAMED_GFX9_CASES(AMDGPU::V_ADD_U32)
10033 GENERATE_RENAMED_GFX9_CASES(AMDGPU::V_SUBBREV_U32)
10034 GENERATE_RENAMED_GFX9_CASES(AMDGPU::V_SUBB_U32)
10035 GENERATE_RENAMED_GFX9_CASES(AMDGPU::V_SUBREV_CO_U32)
10036 GENERATE_RENAMED_GFX9_CASES(AMDGPU::V_SUBREV_U32)
10037 GENERATE_RENAMED_GFX9_CASES(AMDGPU::V_SUB_CO_U32)
10038 GENERATE_RENAMED_GFX9_CASES(AMDGPU::V_SUB_U32)
10039 //
10040 case AMDGPU::V_DIV_FIXUP_F16_gfx9_e64:
10041 case AMDGPU::V_DIV_FIXUP_F16_gfx9_fake16_e64:
10042 case AMDGPU::V_FMA_F16_gfx9_e64:
10043 case AMDGPU::V_FMA_F16_gfx9_fake16_e64:
10044 case AMDGPU::V_INTERP_P2_F16:
10045 case AMDGPU::V_MAD_F16_e64:
10046 case AMDGPU::V_MAD_U16_e64:
10047 case AMDGPU::V_MAD_I16_e64:
10048 return true;
10049 default:
10050 return false;
10051 }
10052}
10053
10054int SIInstrInfo::pseudoToMCOpcode(int Opcode) const {
10055 Opcode = SIInstrInfo::getNonSoftWaitcntOpcode(Opcode);
10056
10057 unsigned Gen = subtargetEncodingFamily(ST);
10058
10059 if (ST.getGeneration() == AMDGPUSubtarget::GFX9 && isRenamedInGFX9(Opcode))
10060 Gen = SIEncodingFamily::GFX9;
10061
10062 // Adjust the encoding family to GFX80 for D16 buffer instructions when the
10063 // subtarget has UnpackedD16VMem feature.
10064 // TODO: remove this when we discard GFX80 encoding.
10065 if (ST.hasUnpackedD16VMem() && (get(Opcode).TSFlags & SIInstrFlags::D16Buf))
10066 Gen = SIEncodingFamily::GFX80;
10067
10068 if (get(Opcode).TSFlags & SIInstrFlags::SDWA) {
10069 switch (ST.getGeneration()) {
10070 default:
10071 Gen = SIEncodingFamily::SDWA;
10072 break;
10073 case AMDGPUSubtarget::GFX9:
10074 Gen = SIEncodingFamily::SDWA9;
10075 break;
10076 case AMDGPUSubtarget::GFX10:
10077 Gen = SIEncodingFamily::SDWA10;
10078 break;
10079 }
10080 }
10081
10082 if (isMAI(Opcode)) {
10083 int MFMAOp = AMDGPU::getMFMAEarlyClobberOp(Opcode);
10084 if (MFMAOp != -1)
10085 Opcode = MFMAOp;
10086 }
10087
10088 int MCOp = AMDGPU::getMCOpcode(Opcode, Gen);
10089
10090 if (MCOp == (uint16_t)-1 && ST.hasGFX1250Insts())
10091 MCOp = AMDGPU::getMCOpcode(Opcode, SIEncodingFamily::GFX12);
10092
10093 // -1 means that Opcode is already a native instruction.
10094 if (MCOp == -1)
10095 return Opcode;
10096
10097 if (ST.hasGFX90AInsts()) {
10098 uint16_t NMCOp = (uint16_t)-1;
10099 if (ST.hasGFX940Insts())
10100 NMCOp = AMDGPU::getMCOpcode(Opcode, SIEncodingFamily::GFX940);
10101 if (NMCOp == (uint16_t)-1)
10102 NMCOp = AMDGPU::getMCOpcode(Opcode, SIEncodingFamily::GFX90A);
10103 if (NMCOp == (uint16_t)-1)
10104 NMCOp = AMDGPU::getMCOpcode(Opcode, SIEncodingFamily::GFX9);
10105 if (NMCOp != (uint16_t)-1)
10106 MCOp = NMCOp;
10107 }
10108
10109 // (uint16_t)-1 means that Opcode is a pseudo instruction that has
10110 // no encoding in the given subtarget generation.
10111 if (MCOp == (uint16_t)-1)
10112 return -1;
10113
10114 if (isAsmOnlyOpcode(MCOp))
10115 return -1;
10116
10117 return MCOp;
10118}
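// Illustrative use only (a sketch, not code from this file): MC lowering
// typically does
//   int MCOp = TII->pseudoToMCOpcode(MI.getOpcode());
//   if (MCOp == -1)
//     ... report that the pseudo has no encoding on this subtarget ...
//   else
//     OutMI.setOpcode(MCOp);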
10119
10120static
10121TargetInstrInfo::RegSubRegPair getRegOrUndef(const MachineOperand &RegOpnd) {
10122 assert(RegOpnd.isReg());
10123 return RegOpnd.isUndef() ? TargetInstrInfo::RegSubRegPair() :
10124 getRegSubRegPair(RegOpnd);
10125}
10126
10127TargetInstrInfo::RegSubRegPair
10128llvm::getRegSequenceSubReg(MachineInstr &MI, unsigned SubReg) {
10129 assert(MI.isRegSequence());
10130 for (unsigned I = 0, E = (MI.getNumOperands() - 1)/ 2; I < E; ++I)
10131 if (MI.getOperand(1 + 2 * I + 1).getImm() == SubReg) {
10132 auto &RegOp = MI.getOperand(1 + 2 * I);
10133 return getRegOrUndef(RegOp);
10134 }
10135 return TargetInstrInfo::RegSubRegPair();
10136}
10137
10138// Try to find the definition of reg:subreg in subreg-manipulation pseudos
10139// Following a subreg of reg:subreg isn't supported
10140static bool followSubRegDef(MachineInstr &MI,
10141 TargetInstrInfo::RegSubRegPair &RSR) {
10142 if (!RSR.SubReg)
10143 return false;
10144 switch (MI.getOpcode()) {
10145 default: break;
10146 case AMDGPU::REG_SEQUENCE:
10147 RSR = getRegSequenceSubReg(MI, RSR.SubReg);
10148 return true;
10149 // EXTRACT_SUBREG isn't supported as this would follow a subreg of a subreg
10150 case AMDGPU::INSERT_SUBREG:
10151 if (RSR.SubReg == (unsigned)MI.getOperand(3).getImm())
10152 // inserted the subreg we're looking for
10153 RSR = getRegOrUndef(MI.getOperand(2));
10154 else { // the subreg in the rest of the reg
10155 auto R1 = getRegOrUndef(MI.getOperand(1));
10156 if (R1.SubReg) // subreg of subreg isn't supported
10157 return false;
10158 RSR.Reg = R1.Reg;
10159 }
10160 return true;
10161 }
10162 return false;
10163}
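// For example, given the pseudo
//   %2:vreg_64 = REG_SEQUENCE %0, %subreg.sub0, %1, %subreg.sub1
// following %2.sub1 rewrites the pair to plain %1 (no subregister), while a
// request for a subreg of an already-subregister value is rejected.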
10164
10165MachineInstr *llvm::getVRegSubRegDef(const TargetInstrInfo::RegSubRegPair &P,
10166 const MachineRegisterInfo &MRI) {
10167 assert(MRI.isSSA());
10168 if (!P.Reg.isVirtual())
10169 return nullptr;
10170
10171 auto RSR = P;
10172 auto *DefInst = MRI.getVRegDef(RSR.Reg);
10173 while (auto *MI = DefInst) {
10174 DefInst = nullptr;
10175 switch (MI->getOpcode()) {
10176 case AMDGPU::COPY:
10177 case AMDGPU::V_MOV_B32_e32: {
10178 auto &Op1 = MI->getOperand(1);
10179 if (Op1.isReg() && Op1.getReg().isVirtual()) {
10180 if (Op1.isUndef())
10181 return nullptr;
10182 RSR = getRegSubRegPair(Op1);
10183 DefInst = MRI.getVRegDef(RSR.Reg);
10184 }
10185 break;
10186 }
10187 default:
10188 if (followSubRegDef(*MI, RSR)) {
10189 if (!RSR.Reg)
10190 return nullptr;
10191 DefInst = MRI.getVRegDef(RSR.Reg);
10192 }
10193 }
10194 if (!DefInst)
10195 return MI;
10196 }
10197 return nullptr;
10198}
10199
10200bool llvm::execMayBeModifiedBeforeUse(const MachineRegisterInfo &MRI,
10201 Register VReg,
10202 const MachineInstr &DefMI,
10203 const MachineInstr &UseMI) {
10204 assert(MRI.isSSA() && "Must be run on SSA");
10205
10206 auto *TRI = MRI.getTargetRegisterInfo();
10207 auto *DefBB = DefMI.getParent();
10208
10209 // Don't bother searching between blocks, although it is possible this block
10210 // doesn't modify exec.
10211 if (UseMI.getParent() != DefBB)
10212 return true;
10213
10214 const int MaxInstScan = 20;
10215 int NumInst = 0;
10216
10217 // Stop scan at the use.
10218 auto E = UseMI.getIterator();
10219 for (auto I = std::next(DefMI.getIterator()); I != E; ++I) {
10220 if (I->isDebugInstr())
10221 continue;
10222
10223 if (++NumInst > MaxInstScan)
10224 return true;
10225
10226 if (I->modifiesRegister(AMDGPU::EXEC, TRI))
10227 return true;
10228 }
10229
10230 return false;
10231}
10232
10233bool llvm::execMayBeModifiedBeforeAnyUse(const MachineRegisterInfo &MRI,
10234 Register VReg,
10235 const MachineInstr &DefMI) {
10236 assert(MRI.isSSA() && "Must be run on SSA");
10237
10238 auto *TRI = MRI.getTargetRegisterInfo();
10239 auto *DefBB = DefMI.getParent();
10240
10241 const int MaxUseScan = 10;
10242 int NumUse = 0;
10243
10244 for (auto &Use : MRI.use_nodbg_operands(VReg)) {
10245 auto &UseInst = *Use.getParent();
10246 // Don't bother searching between blocks, although it is possible this block
10247 // doesn't modify exec.
10248 if (UseInst.getParent() != DefBB || UseInst.isPHI())
10249 return true;
10250
10251 if (++NumUse > MaxUseScan)
10252 return true;
10253 }
10254
10255 if (NumUse == 0)
10256 return false;
10257
10258 const int MaxInstScan = 20;
10259 int NumInst = 0;
10260
10261 // Stop scan when we have seen all the uses.
10262 for (auto I = std::next(DefMI.getIterator()); ; ++I) {
10263 assert(I != DefBB->end());
10264
10265 if (I->isDebugInstr())
10266 continue;
10267
10268 if (++NumInst > MaxInstScan)
10269 return true;
10270
10271 for (const MachineOperand &Op : I->operands()) {
10272 // We don't check reg masks here as they're used only on calls:
10273 // 1. EXEC is only considered const within one BB
10274 // 2. Call should be a terminator instruction if present in a BB
10275
10276 if (!Op.isReg())
10277 continue;
10278
10279 Register Reg = Op.getReg();
10280 if (Op.isUse()) {
10281 if (Reg == VReg && --NumUse == 0)
10282 return false;
10283 } else if (TRI->regsOverlap(Reg, AMDGPU::EXEC))
10284 return true;
10285 }
10286 }
10287}
10288
10289MachineInstr *SIInstrInfo::createPHIDestinationCopy(
10290 MachineBasicBlock &MBB, MachineBasicBlock::iterator LastPHIIt,
10291 const DebugLoc &DL, Register Src, Register Dst) const {
10292 auto Cur = MBB.begin();
10293 if (Cur != MBB.end())
10294 do {
10295 if (!Cur->isPHI() && Cur->readsRegister(Dst, /*TRI=*/nullptr))
10296 return BuildMI(MBB, Cur, DL, get(TargetOpcode::COPY), Dst).addReg(Src);
10297 ++Cur;
10298 } while (Cur != MBB.end() && Cur != LastPHIIt);
10299
10300 return TargetInstrInfo::createPHIDestinationCopy(MBB, LastPHIIt, DL, Src,
10301 Dst);
10302}
10303
10304MachineInstr *SIInstrInfo::createPHISourceCopy(
10305 MachineBasicBlock &MBB, MachineBasicBlock::iterator InsPt,
10306 const DebugLoc &DL, Register Src, unsigned SrcSubReg, Register Dst) const {
10307 if (InsPt != MBB.end() &&
10308 (InsPt->getOpcode() == AMDGPU::SI_IF ||
10309 InsPt->getOpcode() == AMDGPU::SI_ELSE ||
10310 InsPt->getOpcode() == AMDGPU::SI_IF_BREAK) &&
10311 InsPt->definesRegister(Src, /*TRI=*/nullptr)) {
10312 InsPt++;
10313 return BuildMI(MBB, InsPt, DL,
10314 get(AMDGPU::LaneMaskConstants::get(ST).MovTermOpc), Dst)
10315 .addReg(Src, 0, SrcSubReg)
10316 .addReg(AMDGPU::EXEC, RegState::Implicit);
10317 }
10318 return TargetInstrInfo::createPHISourceCopy(MBB, InsPt, DL, Src, SrcSubReg,
10319 Dst);
10320}
10321
10322bool llvm::SIInstrInfo::isWave32() const { return ST.isWave32(); }
10323
10324MachineInstr *SIInstrInfo::foldMemoryOperandImpl(
10325 MachineFunction &MF, MachineInstr &MI, ArrayRef<unsigned> Ops,
10326 MachineBasicBlock::iterator InsertPt, int FrameIndex, LiveIntervals *LIS,
10327 VirtRegMap *VRM) const {
10328 // This is a bit of a hack (copied from AArch64). Consider this instruction:
10329 //
10330 // %0:sreg_32 = COPY $m0
10331 //
10332 // We explicitly chose SReg_32 for the virtual register so such a copy might
10333 // be eliminated by RegisterCoalescer. However, that may not be possible, and
10334 // %0 may even spill. We can't spill $m0 normally (it would require copying to
10335 // a numbered SGPR anyway), and since it is in the SReg_32 register class,
10336 // TargetInstrInfo::foldMemoryOperand() is going to try.
10337 // A similar issue also exists with spilling and reloading $exec registers.
10338 //
10339 // To prevent that, constrain the %0 register class here.
10340 if (isFullCopyInstr(MI)) {
10341 Register DstReg = MI.getOperand(0).getReg();
10342 Register SrcReg = MI.getOperand(1).getReg();
10343 if ((DstReg.isVirtual() || SrcReg.isVirtual()) &&
10344 (DstReg.isVirtual() != SrcReg.isVirtual())) {
10345 MachineRegisterInfo &MRI = MF.getRegInfo();
10346 Register VirtReg = DstReg.isVirtual() ? DstReg : SrcReg;
10347 const TargetRegisterClass *RC = MRI.getRegClass(VirtReg);
10348 if (RC->hasSuperClassEq(&AMDGPU::SReg_32RegClass)) {
10349 MRI.constrainRegClass(VirtReg, &AMDGPU::SReg_32_XM0_XEXECRegClass);
10350 return nullptr;
10351 }
10352 if (RC->hasSuperClassEq(&AMDGPU::SReg_64RegClass)) {
10353 MRI.constrainRegClass(VirtReg, &AMDGPU::SReg_64_XEXECRegClass);
10354 return nullptr;
10355 }
10356 }
10357 }
10358
10359 return nullptr;
10360}
10361
10362unsigned SIInstrInfo::getInstrLatency(const InstrItineraryData *ItinData,
10363 const MachineInstr &MI,
10364 unsigned *PredCost) const {
10365 if (MI.isBundle()) {
10366 MachineBasicBlock::const_instr_iterator I(MI.getIterator());
10367 MachineBasicBlock::const_instr_iterator E(MI.getParent()->instr_end());
10368 unsigned Lat = 0, Count = 0;
10369 for (++I; I != E && I->isBundledWithPred(); ++I) {
10370 ++Count;
10371 Lat = std::max(Lat, SchedModel.computeInstrLatency(&*I));
10372 }
10373 return Lat + Count - 1;
10374 }
10375
10376 return SchedModel.computeInstrLatency(&MI);
10377}
10378
10379InstructionUniformity
10380SIInstrInfo::getGenericInstructionUniformity(const MachineInstr &MI) const {
10381 const MachineRegisterInfo &MRI = MI.getMF()->getRegInfo();
10382 unsigned Opcode = MI.getOpcode();
10383
10384 auto HandleAddrSpaceCast = [this, &MRI](const MachineInstr &MI) {
10385 Register Dst = MI.getOperand(0).getReg();
10386 Register Src = isa<GIntrinsic>(MI) ? MI.getOperand(2).getReg()
10387 : MI.getOperand(1).getReg();
10388 LLT DstTy = MRI.getType(Dst);
10389 LLT SrcTy = MRI.getType(Src);
10390 unsigned DstAS = DstTy.getAddressSpace();
10391 unsigned SrcAS = SrcTy.getAddressSpace();
10392 return SrcAS == AMDGPUAS::PRIVATE_ADDRESS &&
10393 DstAS == AMDGPUAS::FLAT_ADDRESS &&
10394 ST.hasGloballyAddressableScratch()
10395 ? InstructionUniformity::NeverUniform
10396 : InstructionUniformity::Default;
10397 };
10398
10399 // If the target supports globally addressable scratch, the mapping from
10400 // scratch memory to the flat aperture changes, so an address space cast
10401 // is no longer uniform.
10402 if (Opcode == TargetOpcode::G_ADDRSPACE_CAST)
10403 return HandleAddrSpaceCast(MI);
10404
10405 if (auto *GI = dyn_cast<GIntrinsic>(&MI)) {
10406 auto IID = GI->getIntrinsicID();
10407 if (AMDGPU::isIntrinsicSourceOfDivergence(IID))
10408 return InstructionUniformity::NeverUniform;
10409 if (AMDGPU::isIntrinsicAlwaysUniform(IID))
10410 return InstructionUniformity::AlwaysUniform;
10411
10412 switch (IID) {
10413 case Intrinsic::amdgcn_addrspacecast_nonnull:
10414 return HandleAddrSpaceCast(MI);
10415 case Intrinsic::amdgcn_if:
10416 case Intrinsic::amdgcn_else:
10417 // FIXME: Uniform if second result
10418 break;
10419 }
10420
10421 return InstructionUniformity::NeverUniform;
10422 }
10423
10424 // Loads from the private and flat address spaces are divergent, because
10425 // threads can execute the load instruction with the same inputs and get
10426 // different results.
10427 //
10428 // All other loads are not divergent, because if threads issue loads with the
10429 // same arguments, they will always get the same result.
10430 if (Opcode == AMDGPU::G_LOAD || Opcode == AMDGPU::G_ZEXTLOAD ||
10431 Opcode == AMDGPU::G_SEXTLOAD) {
10432 if (MI.memoperands_empty())
10433 return InstructionUniformity::NeverUniform; // conservative assumption
10434
10435 if (llvm::any_of(MI.memoperands(), [](const MachineMemOperand *mmo) {
10436 return mmo->getAddrSpace() == AMDGPUAS::PRIVATE_ADDRESS ||
10437 mmo->getAddrSpace() == AMDGPUAS::FLAT_ADDRESS;
10438 })) {
10439 // At least one MMO in a non-global address space.
10440 return InstructionUniformity::NeverUniform;
10441 }
10442 return InstructionUniformity::Default;
10443 }
10444
10445 if (SIInstrInfo::isGenericAtomicRMWOpcode(Opcode) ||
10446 Opcode == AMDGPU::G_ATOMIC_CMPXCHG ||
10447 Opcode == AMDGPU::G_ATOMIC_CMPXCHG_WITH_SUCCESS ||
10448 AMDGPU::isGenericAtomic(Opcode)) {
10449 return InstructionUniformity::NeverUniform;
10450 }
10451 return InstructionUniformity::Default;
10452}
10453
10454InstructionUniformity
10455SIInstrInfo::getInstructionUniformity(const MachineInstr &MI) const {
10456
10457 if (isNeverUniform(MI))
10458 return InstructionUniformity::NeverUniform;
10459
10460 unsigned opcode = MI.getOpcode();
10461 if (opcode == AMDGPU::V_READLANE_B32 ||
10462 opcode == AMDGPU::V_READFIRSTLANE_B32 ||
10463 opcode == AMDGPU::SI_RESTORE_S32_FROM_VGPR)
10464 return InstructionUniformity::AlwaysUniform;
10465
10466 if (isCopyInstr(MI)) {
10467 const MachineOperand &srcOp = MI.getOperand(1);
10468 if (srcOp.isReg() && srcOp.getReg().isPhysical()) {
10469 const TargetRegisterClass *regClass =
10470 RI.getPhysRegBaseClass(srcOp.getReg());
10471 return RI.isSGPRClass(regClass) ? InstructionUniformity::AlwaysUniform
10472 : InstructionUniformity::NeverUniform;
10473 }
10474 return InstructionUniformity::Default;
10475 }
10476
10477 // GMIR handling
10478 if (MI.isPreISelOpcode())
10479 return getGenericInstructionUniformity(MI);
10480
10481 // Atomics are divergent because they are executed sequentially: when an
10482 // atomic operation refers to the same address in each thread, then each
10483 // thread after the first sees the value written by the previous thread as
10484 // the original value.
10485
10486 if (isAtomic(MI))
10487 return InstructionUniformity::NeverUniform;
10488
10489 // Loads from the private and flat address spaces are divergent, because
10490 // threads can execute the load instruction with the same inputs and get
10491 // different results.
10492 if (isFLAT(MI) && MI.mayLoad()) {
10493 if (MI.memoperands_empty())
10494 return InstructionUniformity::NeverUniform; // conservative assumption
10495
10496 if (llvm::any_of(MI.memoperands(), [](const MachineMemOperand *mmo) {
10497 return mmo->getAddrSpace() == AMDGPUAS::PRIVATE_ADDRESS ||
10498 mmo->getAddrSpace() == AMDGPUAS::FLAT_ADDRESS;
10499 })) {
10500 // At least one MMO in a non-global address space.
10501 return InstructionUniformity::NeverUniform;
10502 }
10503
10504 return InstructionUniformity::Default;
10505 }
10506
10507 const MachineRegisterInfo &MRI = MI.getParent()->getParent()->getRegInfo();
10508 const AMDGPURegisterBankInfo *RBI = ST.getRegBankInfo();
10509
10510 // FIXME: It's conceptually broken to report this for an instruction, and not
10511 // a specific def operand. For inline asm in particular, there could be mixed
10512 // uniform and divergent results.
10513 for (unsigned I = 0, E = MI.getNumOperands(); I != E; ++I) {
10514 const MachineOperand &SrcOp = MI.getOperand(I);
10515 if (!SrcOp.isReg())
10516 continue;
10517
10518 Register Reg = SrcOp.getReg();
10519 if (!Reg || !SrcOp.readsReg())
10520 continue;
10521
10522 // If RegBank is null, this is unassigned or an unallocatable special
10523 // register, which are all scalars.
10524 const RegisterBank *RegBank = RBI->getRegBank(Reg, MRI, RI);
10525 if (RegBank && RegBank->getID() != AMDGPU::SGPRRegBankID)
10526 return InstructionUniformity::NeverUniform;
10527 }
10528
10529 // TODO: Uniformity check conditions above can be rearranged for more
10530 // readability.
10531
10532 // TODO: amdgcn.{ballot, [if]cmp} should be AlwaysUniform, but they are
10533 // currently turned into no-op COPYs by SelectionDAG ISel and are
10534 // therefore no longer recognizable.
10535
10536 return InstructionUniformity::Default;
10537}
10538
10539unsigned SIInstrInfo::getDSShaderTypeValue(const MachineFunction &MF) {
10540 switch (MF.getFunction().getCallingConv()) {
10541 case CallingConv::AMDGPU_PS:
10542 return 1;
10543 case CallingConv::AMDGPU_VS:
10544 return 2;
10545 case CallingConv::AMDGPU_GS:
10546 return 3;
10547 case CallingConv::AMDGPU_HS:
10548 case CallingConv::AMDGPU_LS:
10549 case CallingConv::AMDGPU_ES: {
10550 const Function &F = MF.getFunction();
10551 F.getContext().diagnose(DiagnosticInfoUnsupported(
10552 F, "ds_ordered_count unsupported for this calling conv"));
10553 [[fallthrough]];
10554 }
10555 case CallingConv::AMDGPU_CS:
10556 case CallingConv::AMDGPU_KERNEL:
10557 case CallingConv::C:
10558 case CallingConv::Fast:
10559 default:
10560 // Assume other calling conventions are various compute callable functions
10561 return 0;
10562 }
10563}
10564
10565bool SIInstrInfo::analyzeCompare(const MachineInstr &MI, Register &SrcReg,
10566 Register &SrcReg2, int64_t &CmpMask,
10567 int64_t &CmpValue) const {
10568 if (!MI.getOperand(0).isReg() || MI.getOperand(0).getSubReg())
10569 return false;
10570
10571 switch (MI.getOpcode()) {
10572 default:
10573 break;
10574 case AMDGPU::S_CMP_EQ_U32:
10575 case AMDGPU::S_CMP_EQ_I32:
10576 case AMDGPU::S_CMP_LG_U32:
10577 case AMDGPU::S_CMP_LG_I32:
10578 case AMDGPU::S_CMP_LT_U32:
10579 case AMDGPU::S_CMP_LT_I32:
10580 case AMDGPU::S_CMP_GT_U32:
10581 case AMDGPU::S_CMP_GT_I32:
10582 case AMDGPU::S_CMP_LE_U32:
10583 case AMDGPU::S_CMP_LE_I32:
10584 case AMDGPU::S_CMP_GE_U32:
10585 case AMDGPU::S_CMP_GE_I32:
10586 case AMDGPU::S_CMP_EQ_U64:
10587 case AMDGPU::S_CMP_LG_U64:
10588 SrcReg = MI.getOperand(0).getReg();
10589 if (MI.getOperand(1).isReg()) {
10590 if (MI.getOperand(1).getSubReg())
10591 return false;
10592 SrcReg2 = MI.getOperand(1).getReg();
10593 CmpValue = 0;
10594 } else if (MI.getOperand(1).isImm()) {
10595 SrcReg2 = Register();
10596 CmpValue = MI.getOperand(1).getImm();
10597 } else {
10598 return false;
10599 }
10600 CmpMask = ~0;
10601 return true;
10602 case AMDGPU::S_CMPK_EQ_U32:
10603 case AMDGPU::S_CMPK_EQ_I32:
10604 case AMDGPU::S_CMPK_LG_U32:
10605 case AMDGPU::S_CMPK_LG_I32:
10606 case AMDGPU::S_CMPK_LT_U32:
10607 case AMDGPU::S_CMPK_LT_I32:
10608 case AMDGPU::S_CMPK_GT_U32:
10609 case AMDGPU::S_CMPK_GT_I32:
10610 case AMDGPU::S_CMPK_LE_U32:
10611 case AMDGPU::S_CMPK_LE_I32:
10612 case AMDGPU::S_CMPK_GE_U32:
10613 case AMDGPU::S_CMPK_GE_I32:
10614 SrcReg = MI.getOperand(0).getReg();
10615 SrcReg2 = Register();
10616 CmpValue = MI.getOperand(1).getImm();
10617 CmpMask = ~0;
10618 return true;
10619 }
10620
10621 return false;
10622}
10623
10624// SCC is already valid after SCCValid.
10625// SCCRedefine will redefine SCC to the same value already available after
10626 // SCCValid. If there are no intervening SCC conflicts, delete SCCRedefine and
10627// update kill/dead flags if necessary.
10628static bool optimizeSCC(MachineInstr *SCCValid, MachineInstr *SCCRedefine,
10629 const SIRegisterInfo &RI) {
10630 MachineInstr *KillsSCC = nullptr;
10631 if (SCCValid->getParent() != SCCRedefine->getParent())
10632 return false;
10633 for (MachineInstr &MI : make_range(std::next(SCCValid->getIterator()),
10634 SCCRedefine->getIterator())) {
10635 if (MI.modifiesRegister(AMDGPU::SCC, &RI))
10636 return false;
10637 if (MI.killsRegister(AMDGPU::SCC, &RI))
10638 KillsSCC = &MI;
10639 }
10640 if (MachineOperand *SccDef =
10641 SCCValid->findRegisterDefOperand(AMDGPU::SCC, /*TRI=*/nullptr))
10642 SccDef->setIsDead(false);
10643 if (KillsSCC)
10644 KillsSCC->clearRegisterKills(AMDGPU::SCC, /*TRI=*/nullptr);
10645 SCCRedefine->eraseFromParent();
10646 return true;
10647}
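// For instance, in
//   %0:sreg_32 = S_AND_B32 %1, %2, implicit-def $scc
//   S_CMP_LG_U32 %0, 0, implicit-def $scc
// the compare only recomputes the SCC value the S_AND_B32 already produced, so
// optimizeSCC can erase it when nothing in between clobbers SCC.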
10648
10649static bool foldableSelect(const MachineInstr &Def) {
10650 if (Def.getOpcode() != AMDGPU::S_CSELECT_B32 &&
10651 Def.getOpcode() != AMDGPU::S_CSELECT_B64)
10652 return false;
10653 bool Op1IsNonZeroImm =
10654 Def.getOperand(1).isImm() && Def.getOperand(1).getImm() != 0;
10655 bool Op2IsZeroImm =
10656 Def.getOperand(2).isImm() && Def.getOperand(2).getImm() == 0;
10657 if (!Op1IsNonZeroImm || !Op2IsZeroImm)
10658 return false;
10659 return true;
10660}
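// A foldable select is e.g. %0 = S_CSELECT_B32 1, 0, implicit $scc: the result
// is non-zero exactly when SCC was set, so a later s_cmp_lg_*(%0, 0) would only
// recompute the SCC value that already fed the select.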
10661
10662bool SIInstrInfo::optimizeCompareInstr(MachineInstr &CmpInstr, Register SrcReg,
10663 Register SrcReg2, int64_t CmpMask,
10664 int64_t CmpValue,
10665 const MachineRegisterInfo *MRI) const {
10666 if (!SrcReg || SrcReg.isPhysical())
10667 return false;
10668
10669 if (SrcReg2 && !getFoldableImm(SrcReg2, *MRI, CmpValue))
10670 return false;
10671
10672 const auto optimizeCmpSelect = [&CmpInstr, SrcReg, CmpValue, MRI,
10673 this]() -> bool {
10674 if (CmpValue != 0)
10675 return false;
10676
10677 MachineInstr *Def = MRI->getVRegDef(SrcReg);
10678 if (!Def)
10679 return false;
10680
10681 // For S_OP that set SCC = DST!=0, do the transformation
10682 //
10683 // s_cmp_lg_* (S_OP ...), 0 => (S_OP ...)
10684
10685 // If foldableSelect, s_cmp_lg_* is redundant because the SCC input value
10686 // for S_CSELECT* already has the same value that will be calculated by
10687 // s_cmp_lg_*
10688 //
10689 // s_cmp_lg_* (S_CSELECT* (non-zero imm), 0), 0 => (S_CSELECT* (non-zero
10690 // imm), 0)
10691 if (!setsSCCifResultIsNonZero(*Def) && !foldableSelect(*Def))
10692 return false;
10693
10694 if (!optimizeSCC(Def, &CmpInstr, RI))
10695 return false;
10696
10697 // If s_or_b32 result, sY, is unused (i.e. it is effectively a 64-bit
10698 // s_cmp_lg of a register pair) and the inputs are the hi and lo-halves of a
10699 // 64-bit foldableSelect then delete s_or_b32 in the sequence:
10700 // sX = s_cselect_b64 (non-zero imm), 0
10701 // sLo = copy sX.sub0
10702 // sHi = copy sX.sub1
10703 // sY = s_or_b32 sLo, sHi
10704 if (Def->getOpcode() == AMDGPU::S_OR_B32 &&
10705 MRI->use_nodbg_empty(Def->getOperand(0).getReg())) {
10706 const MachineOperand &OrOpnd1 = Def->getOperand(1);
10707 const MachineOperand &OrOpnd2 = Def->getOperand(2);
10708 if (OrOpnd1.isReg() && OrOpnd2.isReg()) {
10709 MachineInstr *Def1 = MRI->getVRegDef(OrOpnd1.getReg());
10710 MachineInstr *Def2 = MRI->getVRegDef(OrOpnd2.getReg());
10711 if (Def1 && Def1->getOpcode() == AMDGPU::COPY && Def2 &&
10712 Def2->getOpcode() == AMDGPU::COPY && Def1->getOperand(1).isReg() &&
10713 Def2->getOperand(1).isReg() &&
10714 Def1->getOperand(1).getSubReg() == AMDGPU::sub0 &&
10715 Def2->getOperand(1).getSubReg() == AMDGPU::sub1 &&
10716 Def1->getOperand(1).getReg() == Def2->getOperand(1).getReg()) {
10717 MachineInstr *Select = MRI->getVRegDef(Def1->getOperand(1).getReg());
10718 if (Select && foldableSelect(*Select))
10719 optimizeSCC(Select, Def, RI);
10720 }
10721 }
10722 }
10723 return true;
10724 };
10725
10726 const auto optimizeCmpAnd = [&CmpInstr, SrcReg, CmpValue, MRI,
10727 this](int64_t ExpectedValue, unsigned SrcSize,
10728 bool IsReversible, bool IsSigned) -> bool {
10729 // s_cmp_eq_u32 (s_and_b32 $src, 1 << n), 1 << n => s_and_b32 $src, 1 << n
10730 // s_cmp_eq_i32 (s_and_b32 $src, 1 << n), 1 << n => s_and_b32 $src, 1 << n
10731 // s_cmp_ge_u32 (s_and_b32 $src, 1 << n), 1 << n => s_and_b32 $src, 1 << n
10732 // s_cmp_ge_i32 (s_and_b32 $src, 1 << n), 1 << n => s_and_b32 $src, 1 << n
10733 // s_cmp_eq_u64 (s_and_b64 $src, 1 << n), 1 << n => s_and_b64 $src, 1 << n
10734 // s_cmp_lg_u32 (s_and_b32 $src, 1 << n), 0 => s_and_b32 $src, 1 << n
10735 // s_cmp_lg_i32 (s_and_b32 $src, 1 << n), 0 => s_and_b32 $src, 1 << n
10736 // s_cmp_gt_u32 (s_and_b32 $src, 1 << n), 0 => s_and_b32 $src, 1 << n
10737 // s_cmp_gt_i32 (s_and_b32 $src, 1 << n), 0 => s_and_b32 $src, 1 << n
10738 // s_cmp_lg_u64 (s_and_b64 $src, 1 << n), 0 => s_and_b64 $src, 1 << n
10739 //
10740 // Signed ge/gt are not used for the sign bit.
10741 //
10742 // If result of the AND is unused except in the compare:
10743 // s_and_b(32|64) $src, 1 << n => s_bitcmp1_b(32|64) $src, n
10744 //
10745 // s_cmp_eq_u32 (s_and_b32 $src, 1 << n), 0 => s_bitcmp0_b32 $src, n
10746 // s_cmp_eq_i32 (s_and_b32 $src, 1 << n), 0 => s_bitcmp0_b32 $src, n
10747 // s_cmp_eq_u64 (s_and_b64 $src, 1 << n), 0 => s_bitcmp0_b64 $src, n
10748 // s_cmp_lg_u32 (s_and_b32 $src, 1 << n), 1 << n => s_bitcmp0_b32 $src, n
10749 // s_cmp_lg_i32 (s_and_b32 $src, 1 << n), 1 << n => s_bitcmp0_b32 $src, n
10750 // s_cmp_lg_u64 (s_and_b64 $src, 1 << n), 1 << n => s_bitcmp0_b64 $src, n
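// Worked example (n = 2, AND result otherwise unused):
//   %0 = S_AND_B32 $src, 4, implicit-def $scc
//   S_CMP_EQ_U32 %0, 4
// collapses to the single instruction S_BITCMP1_B32 $src, 2.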
10751
10752 MachineInstr *Def = MRI->getVRegDef(SrcReg);
10753 if (!Def)
10754 return false;
10755
10756 if (Def->getOpcode() != AMDGPU::S_AND_B32 &&
10757 Def->getOpcode() != AMDGPU::S_AND_B64)
10758 return false;
10759
10760 int64_t Mask;
10761 const auto isMask = [&Mask, SrcSize](const MachineOperand *MO) -> bool {
10762 if (MO->isImm())
10763 Mask = MO->getImm();
10764 else if (!getFoldableImm(MO, Mask))
10765 return false;
10766 Mask &= maxUIntN(SrcSize);
10767 return isPowerOf2_64(Mask);
10768 };
10769
10770 MachineOperand *SrcOp = &Def->getOperand(1);
10771 if (isMask(SrcOp))
10772 SrcOp = &Def->getOperand(2);
10773 else if (isMask(&Def->getOperand(2)))
10774 SrcOp = &Def->getOperand(1);
10775 else
10776 return false;
10777
10778 // A valid Mask is required to have a single bit set, hence a non-zero and
10779 // power-of-two value. This verifies that we will not do a 64-bit shift below.
10780 assert(llvm::has_single_bit<uint64_t>(Mask) && "Invalid mask.");
10781 unsigned BitNo = llvm::countr_zero((uint64_t)Mask);
10782 if (IsSigned && BitNo == SrcSize - 1)
10783 return false;
10784
10785 ExpectedValue <<= BitNo;
10786
10787 bool IsReversedCC = false;
10788 if (CmpValue != ExpectedValue) {
10789 if (!IsReversible)
10790 return false;
10791 IsReversedCC = CmpValue == (ExpectedValue ^ Mask);
10792 if (!IsReversedCC)
10793 return false;
10794 }
10795
10796 Register DefReg = Def->getOperand(0).getReg();
10797 if (IsReversedCC && !MRI->hasOneNonDBGUse(DefReg))
10798 return false;
10799
10800 if (!optimizeSCC(Def, &CmpInstr, RI))
10801 return false;
10802
10803 if (!MRI->use_nodbg_empty(DefReg)) {
10804 assert(!IsReversedCC);
10805 return true;
10806 }
10807
10808 // Replace AND with unused result with a S_BITCMP.
10809 MachineBasicBlock *MBB = Def->getParent();
10810
10811 unsigned NewOpc = (SrcSize == 32) ? IsReversedCC ? AMDGPU::S_BITCMP0_B32
10812 : AMDGPU::S_BITCMP1_B32
10813 : IsReversedCC ? AMDGPU::S_BITCMP0_B64
10814 : AMDGPU::S_BITCMP1_B64;
10815
10816 BuildMI(*MBB, Def, Def->getDebugLoc(), get(NewOpc))
10817 .add(*SrcOp)
10818 .addImm(BitNo);
10819 Def->eraseFromParent();
10820
10821 return true;
10822 };
10823
10824 switch (CmpInstr.getOpcode()) {
10825 default:
10826 break;
10827 case AMDGPU::S_CMP_EQ_U32:
10828 case AMDGPU::S_CMP_EQ_I32:
10829 case AMDGPU::S_CMPK_EQ_U32:
10830 case AMDGPU::S_CMPK_EQ_I32:
10831 return optimizeCmpAnd(1, 32, true, false);
10832 case AMDGPU::S_CMP_GE_U32:
10833 case AMDGPU::S_CMPK_GE_U32:
10834 return optimizeCmpAnd(1, 32, false, false);
10835 case AMDGPU::S_CMP_GE_I32:
10836 case AMDGPU::S_CMPK_GE_I32:
10837 return optimizeCmpAnd(1, 32, false, true);
10838 case AMDGPU::S_CMP_EQ_U64:
10839 return optimizeCmpAnd(1, 64, true, false);
10840 case AMDGPU::S_CMP_LG_U32:
10841 case AMDGPU::S_CMP_LG_I32:
10842 case AMDGPU::S_CMPK_LG_U32:
10843 case AMDGPU::S_CMPK_LG_I32:
10844 return optimizeCmpAnd(0, 32, true, false) || optimizeCmpSelect();
10845 case AMDGPU::S_CMP_GT_U32:
10846 case AMDGPU::S_CMPK_GT_U32:
10847 return optimizeCmpAnd(0, 32, false, false);
10848 case AMDGPU::S_CMP_GT_I32:
10849 case AMDGPU::S_CMPK_GT_I32:
10850 return optimizeCmpAnd(0, 32, false, true);
10851 case AMDGPU::S_CMP_LG_U64:
10852 return optimizeCmpAnd(0, 64, true, false) || optimizeCmpSelect();
10853 }
10854
10855 return false;
10856}
10857
10858void SIInstrInfo::enforceOperandRCAlignment(MachineInstr &MI,
10859 AMDGPU::OpName OpName) const {
10860 if (!ST.needsAlignedVGPRs())
10861 return;
10862
10863 int OpNo = AMDGPU::getNamedOperandIdx(MI.getOpcode(), OpName);
10864 if (OpNo < 0)
10865 return;
10866 MachineOperand &Op = MI.getOperand(OpNo);
10867 if (getOpSize(MI, OpNo) > 4)
10868 return;
10869
10870 // Add implicit aligned super-reg to force alignment on the data operand.
10871 const DebugLoc &DL = MI.getDebugLoc();
10872 MachineBasicBlock *BB = MI.getParent();
10873 MachineRegisterInfo &MRI = BB->getParent()->getRegInfo();
10874 Register DataReg = Op.getReg();
10875 bool IsAGPR = RI.isAGPR(MRI, DataReg);
10876 Register Undef = MRI.createVirtualRegister(
10877 IsAGPR ? &AMDGPU::AGPR_32RegClass : &AMDGPU::VGPR_32RegClass);
10878 BuildMI(*BB, MI, DL, get(AMDGPU::IMPLICIT_DEF), Undef);
10879 Register NewVR =
10880 MRI.createVirtualRegister(IsAGPR ? &AMDGPU::AReg_64_Align2RegClass
10881 : &AMDGPU::VReg_64_Align2RegClass);
10882 BuildMI(*BB, MI, DL, get(AMDGPU::REG_SEQUENCE), NewVR)
10883 .addReg(DataReg, 0, Op.getSubReg())
10884 .addImm(AMDGPU::sub0)
10885 .addReg(Undef)
10886 .addImm(AMDGPU::sub1);
10887 Op.setReg(NewVR);
10888 Op.setSubReg(AMDGPU::sub0);
10889 MI.addOperand(MachineOperand::CreateReg(NewVR, false, true));
10890}
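// Illustrative effect (sketch): on a subtarget that requires aligned VGPRs, a
// 32-bit data operand %v is rewritten to read sub0 of a new even-aligned
// 64-bit tuple:
//   %undef:vgpr_32 = IMPLICIT_DEF
//   %pair:vreg_64_align2 = REG_SEQUENCE %v, %subreg.sub0, %undef, %subreg.sub1
// and the instruction keeps an extra implicit use of %pair to force alignment.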
10891
10892bool SIInstrInfo::isGlobalMemoryObject(const MachineInstr *MI) const {
10893 if (isIGLP(*MI))
10894 return false;
10895
10896 return TargetInstrInfo::isGlobalMemoryObject(MI);
10897}
10898
10899bool SIInstrInfo::isXDLWMMA(const MachineInstr &MI) const {
10900 if (!isWMMA(MI) && !isSWMMAC(MI))
10901 return false;
10902
10903 if (AMDGPU::isGFX1250(ST))
10904 return AMDGPU::getWMMAIsXDL(MI.getOpcode());
10905
10906 return true;
10907}
10908
10909bool SIInstrInfo::isXDL(const MachineInstr &MI) const {
10910 unsigned Opcode = MI.getOpcode();
10911
10912 if (AMDGPU::isGFX12Plus(ST))
10913 return isDOT(MI) || isXDLWMMA(MI);
10914
10915 if (!isMAI(MI) || isDGEMM(Opcode) ||
10916 Opcode == AMDGPU::V_ACCVGPR_WRITE_B32_e64 ||
10917 Opcode == AMDGPU::V_ACCVGPR_READ_B32_e64)
10918 return false;
10919
10920 if (!ST.hasGFX940Insts())
10921 return true;
10922
10923 return AMDGPU::getMAIIsGFX940XDL(Opcode);
10924 return AMDGPU::getMAIIsGFX940XDL(Opcode);
10925}
unsigned SubReg
unsigned const MachineRegisterInfo * MRI
MachineInstrBuilder & UseMI
MachineInstrBuilder MachineInstrBuilder & DefMI
unsigned RegSize
assert(UImm &&(UImm !=~static_cast< T >(0)) &&"Invalid immediate!")
Contains the definition of a TargetInstrInfo class that is common to all AMD GPUs.
AMDGPU Register Bank Select
MachineBasicBlock & MBB
MachineBasicBlock MachineBasicBlock::iterator DebugLoc DL
MachineBasicBlock MachineBasicBlock::iterator MBBI
static GCRegistry::Add< StatepointGC > D("statepoint-example", "an example strategy for statepoint")
static GCRegistry::Add< CoreCLRGC > E("coreclr", "CoreCLR-compatible GC")
AMD GCN specific subclass of TargetSubtarget.
Declares convenience wrapper classes for interpreting MachineInstr instances as specific generic oper...
const HexagonInstrInfo * TII
IRTranslator LLVM IR MI
std::pair< Instruction::BinaryOps, Value * > OffsetOp
Find all possible pairs (BinOp, RHS) that BinOp V, RHS can be simplified.
const size_t AbstractManglingParser< Derived, Alloc >::NumOps
const AbstractManglingParser< Derived, Alloc >::OperatorInfo AbstractManglingParser< Derived, Alloc >::Ops[]
#define F(x, y, z)
Definition MD5.cpp:55
#define I(x, y, z)
Definition MD5.cpp:58
static bool isUndef(const MachineInstr &MI)
TargetInstrInfo::RegSubRegPair RegSubRegPair
Register Reg
Register const TargetRegisterInfo * TRI
Promote Memory to Register
Definition Mem2Reg.cpp:110
static unsigned getReg(const MCDisassembler *D, unsigned RC, unsigned RegNo)
MachineInstr unsigned OpIdx
uint64_t High
uint64_t IntrinsicInst * II
#define P(N)
R600 Clause Merge
const SmallVectorImpl< MachineOperand > MachineBasicBlock * TBB
const SmallVectorImpl< MachineOperand > & Cond
This file declares the machine register scavenger class.
static cl::opt< bool > Fix16BitCopies("amdgpu-fix-16-bit-physreg-copies", cl::desc("Fix copies between 32 and 16 bit registers by extending to 32 bit"), cl::init(true), cl::ReallyHidden)
static void expandSGPRCopy(const SIInstrInfo &TII, MachineBasicBlock &MBB, MachineBasicBlock::iterator MI, const DebugLoc &DL, MCRegister DestReg, MCRegister SrcReg, bool KillSrc, const TargetRegisterClass *RC, bool Forward)
static unsigned getNewFMAInst(const GCNSubtarget &ST, unsigned Opc)
static void indirectCopyToAGPR(const SIInstrInfo &TII, MachineBasicBlock &MBB, MachineBasicBlock::iterator MI, const DebugLoc &DL, MCRegister DestReg, MCRegister SrcReg, bool KillSrc, RegScavenger &RS, bool RegsOverlap, Register ImpDefSuperReg=Register(), Register ImpUseSuperReg=Register())
Handle copying from SGPR to AGPR, or from AGPR to AGPR on GFX908.
static unsigned getIndirectSGPRWriteMovRelPseudo32(unsigned VecSize)
static bool compareMachineOp(const MachineOperand &Op0, const MachineOperand &Op1)
static bool isStride64(unsigned Opc)
#define GENERATE_RENAMED_GFX9_CASES(OPCODE)
static std::tuple< unsigned, unsigned > extractRsrcPtr(const SIInstrInfo &TII, MachineInstr &MI, MachineOperand &Rsrc)
static bool followSubRegDef(MachineInstr &MI, TargetInstrInfo::RegSubRegPair &RSR)
static unsigned getIndirectSGPRWriteMovRelPseudo64(unsigned VecSize)
static MachineInstr * swapImmOperands(MachineInstr &MI, MachineOperand &NonRegOp1, MachineOperand &NonRegOp2)
static void copyFlagsToImplicitVCC(MachineInstr &MI, const MachineOperand &Orig)
static void emitLoadScalarOpsFromVGPRLoop(const SIInstrInfo &TII, MachineRegisterInfo &MRI, MachineBasicBlock &LoopBB, MachineBasicBlock &BodyBB, const DebugLoc &DL, ArrayRef< MachineOperand * > ScalarOps)
static bool offsetsDoNotOverlap(LocationSize WidthA, int OffsetA, LocationSize WidthB, int OffsetB)
static unsigned getWWMRegSpillSaveOpcode(unsigned Size, bool IsVectorSuperClass)
static bool memOpsHaveSameBaseOperands(ArrayRef< const MachineOperand * > BaseOps1, ArrayRef< const MachineOperand * > BaseOps2)
static bool optimizeSCC(MachineInstr *SCCValid, MachineInstr *SCCRedefine, const SIRegisterInfo &RI)
static unsigned getWWMRegSpillRestoreOpcode(unsigned Size, bool IsVectorSuperClass)
static bool getFoldableImm(Register Reg, const MachineRegisterInfo &MRI, int64_t &Imm, MachineInstr **DefMI=nullptr)
static unsigned getIndirectVGPRWriteMovRelPseudoOpc(unsigned VecSize)
static unsigned subtargetEncodingFamily(const GCNSubtarget &ST)
static void preserveCondRegFlags(MachineOperand &CondReg, const MachineOperand &OrigCond)
static Register findImplicitSGPRRead(const MachineInstr &MI)
static unsigned getNewFMAAKInst(const GCNSubtarget &ST, unsigned Opc)
static cl::opt< unsigned > BranchOffsetBits("amdgpu-s-branch-bits", cl::ReallyHidden, cl::init(16), cl::desc("Restrict range of branch instructions (DEBUG)"))
static void updateLiveVariables(LiveVariables *LV, MachineInstr &MI, MachineInstr &NewMI)
static bool memOpsHaveSameBasePtr(const MachineInstr &MI1, ArrayRef< const MachineOperand * > BaseOps1, const MachineInstr &MI2, ArrayRef< const MachineOperand * > BaseOps2)
static unsigned getSGPRSpillRestoreOpcode(unsigned Size)
static bool isRegOrFI(const MachineOperand &MO)
static unsigned getSGPRSpillSaveOpcode(unsigned Size)
static constexpr AMDGPU::OpName ModifierOpNames[]
static unsigned getVGPRSpillSaveOpcode(unsigned Size)
static void reportIllegalCopy(const SIInstrInfo *TII, MachineBasicBlock &MBB, MachineBasicBlock::iterator MI, const DebugLoc &DL, MCRegister DestReg, MCRegister SrcReg, bool KillSrc, const char *Msg="illegal VGPR to SGPR copy")
static MachineInstr * swapRegAndNonRegOperand(MachineInstr &MI, MachineOperand &RegOp, MachineOperand &NonRegOp)
static bool shouldReadExec(const MachineInstr &MI)
static unsigned getNewFMAMKInst(const GCNSubtarget &ST, unsigned Opc)
static bool isRenamedInGFX9(int Opcode)
static TargetInstrInfo::RegSubRegPair getRegOrUndef(const MachineOperand &RegOpnd)
static bool changesVGPRIndexingMode(const MachineInstr &MI)
static bool isSubRegOf(const SIRegisterInfo &TRI, const MachineOperand &SuperVec, const MachineOperand &SubReg)
static bool foldableSelect(const MachineInstr &Def)
static bool nodesHaveSameOperandValue(SDNode *N0, SDNode *N1, AMDGPU::OpName OpName)
Returns true if both nodes have the same value for the given operand Op, or if both nodes do not have...
static unsigned getAVSpillSaveOpcode(unsigned Size)
static unsigned getNumOperandsNoGlue(SDNode *Node)
static bool canRemat(const MachineInstr &MI)
static MachineBasicBlock * loadMBUFScalarOperandsFromVGPR(const SIInstrInfo &TII, MachineInstr &MI, ArrayRef< MachineOperand * > ScalarOps, MachineDominatorTree *MDT, MachineBasicBlock::iterator Begin=nullptr, MachineBasicBlock::iterator End=nullptr)
static unsigned getAVSpillRestoreOpcode(unsigned Size)
static unsigned getVGPRSpillRestoreOpcode(unsigned Size)
Interface definition for SIInstrInfo.
bool IsDead
This file contains some templates that are useful if you are working with the STL at all.
static bool contains(SmallPtrSetImpl< ConstantExpr * > &Cache, ConstantExpr *Expr, Constant *C)
Definition Value.cpp:480
#define LLVM_DEBUG(...)
Definition Debug.h:114
static const LaneMaskConstants & get(const GCNSubtarget &ST)
static LLVM_ABI Semantics SemanticsToEnum(const llvm::fltSemantics &Sem)
Definition APFloat.cpp:221
Class for arbitrary precision integers.
Definition APInt.h:78
int64_t getSExtValue() const
Get sign extended value.
Definition APInt.h:1563
ArrayRef - Represent a constant reference to an array (0 or more elements consecutively in memory),...
Definition ArrayRef.h:41
const T & front() const
front - Get the first element.
Definition ArrayRef.h:146
size_t size() const
size - Get the array size.
Definition ArrayRef.h:143
bool empty() const
empty - Check if the array is empty.
Definition ArrayRef.h:138
uint64_t getZExtValue() const
A debug info location.
Definition DebugLoc.h:124
Diagnostic information for unsupported feature in backend.
void changeImmediateDominator(DomTreeNodeBase< NodeT > *N, DomTreeNodeBase< NodeT > *NewIDom)
changeImmediateDominator - This method is used to update the dominator tree information when a node's...
DomTreeNodeBase< NodeT > * addNewBlock(NodeT *BB, NodeT *DomBB)
Add a new node to the dominator tree information.
bool properlyDominates(const DomTreeNodeBase< NodeT > *A, const DomTreeNodeBase< NodeT > *B) const
properlyDominates - Returns true iff A dominates B and A != B.
CallingConv::ID getCallingConv() const
getCallingConv()/setCallingConv(CC) - These method get and set the calling convention of this functio...
Definition Function.h:270
LLVMContext & getContext() const
getContext - Return a reference to the LLVMContext associated with this function.
Definition Function.cpp:359
bool hasAddNoCarry() const
CycleT * getCycle(const BlockT *Block) const
Find the innermost cycle containing a given block.
void getExitingBlocks(SmallVectorImpl< BlockT * > &TmpStorage) const
Return all blocks of this cycle that have successor outside of this cycle.
bool contains(const BlockT *Block) const
Return whether Block is contained in the cycle.
const GenericCycle * getParentCycle() const
Itinerary data supplied by a subtarget to be used by a target.
constexpr unsigned getAddressSpace() const
This is an important class for using LLVM in a threaded context.
Definition LLVMContext.h:68
LiveInterval - This class represents the liveness of a register, or stack slot.
bool hasInterval(Register Reg) const
SlotIndex getInstructionIndex(const MachineInstr &Instr) const
Returns the base index of the given instruction.
LiveInterval & getInterval(Register Reg)
LLVM_ABI bool shrinkToUses(LiveInterval *li, SmallVectorImpl< MachineInstr * > *dead=nullptr)
After removing some uses of a register, shrink its live range to just the remaining uses.
SlotIndex ReplaceMachineInstrInMaps(MachineInstr &MI, MachineInstr &NewMI)
This class represents the liveness of a register, stack slot, etc.
LLVM_ABI void replaceKillInstruction(Register Reg, MachineInstr &OldMI, MachineInstr &NewMI)
replaceKillInstruction - Update register kill info by replacing a kill instruction with a new one.
LLVM_ABI VarInfo & getVarInfo(Register Reg)
getVarInfo - Return the VarInfo structure for the specified VIRTUAL register.
bool hasValue() const
static LocationSize precise(uint64_t Value)
TypeSize getValue() const
static const MCBinaryExpr * createAnd(const MCExpr *LHS, const MCExpr *RHS, MCContext &Ctx)
Definition MCExpr.h:348
static const MCBinaryExpr * createAShr(const MCExpr *LHS, const MCExpr *RHS, MCContext &Ctx)
Definition MCExpr.h:418
static const MCBinaryExpr * createSub(const MCExpr *LHS, const MCExpr *RHS, MCContext &Ctx)
Definition MCExpr.h:428
static LLVM_ABI const MCConstantExpr * create(int64_t Value, MCContext &Ctx, bool PrintInHex=false, unsigned SizeInBytes=0)
Definition MCExpr.cpp:212
Describe properties that are true of each instruction in the target description file.
unsigned getNumOperands() const
Return the number of declared MachineOperands for this MachineInstruction.
ArrayRef< MCOperandInfo > operands() const
unsigned getNumDefs() const
Return the number of MachineOperands that are register definitions.
unsigned getSize() const
Return the number of bytes in the encoding of this instruction, or zero if the encoding size cannot b...
ArrayRef< MCPhysReg > implicit_uses() const
Return a list of registers that are potentially read by any instance of this machine instruction.
unsigned getOpcode() const
Return the opcode number for this descriptor.
This holds information about one operand of a machine instruction, indicating the register class for ...
Definition MCInstrDesc.h:87
uint8_t OperandType
Information about the type of the operand.
int16_t RegClass
This specifies the register class enumeration of the operand if the operand is a register.
Definition MCInstrDesc.h:96
Wrapper class representing physical registers. Should be passed by value.
Definition MCRegister.h:33
static const MCSymbolRefExpr * create(const MCSymbol *Symbol, MCContext &Ctx, SMLoc Loc=SMLoc())
Definition MCExpr.h:214
MCSymbol - Instances of this class represent a symbol name in the MC file, and MCSymbols are created ...
Definition MCSymbol.h:42
LLVM_ABI void setVariableValue(const MCExpr *Value)
Definition MCSymbol.cpp:50
Helper class for constructing bundles of MachineInstrs.
MachineBasicBlock::instr_iterator begin() const
Return an iterator to the first bundled instruction.
MIBundleBuilder & append(MachineInstr *MI)
Insert MI into MBB by appending it to the instructions in the bundle.
LLVM_ABI void transferSuccessorsAndUpdatePHIs(MachineBasicBlock *FromMBB)
Transfers all the successors, as in transferSuccessors, and update PHI operands in the successor bloc...
LLVM_ABI MCSymbol * getSymbol() const
Return the MCSymbol for this basic block.
LLVM_ABI instr_iterator insert(instr_iterator I, MachineInstr *M)
Insert MI into the instruction list before I, possibly inside a bundle.
LLVM_ABI iterator getFirstTerminator()
Returns an iterator to the first terminator instruction of this basic block.
LLVM_ABI void addSuccessor(MachineBasicBlock *Succ, BranchProbability Prob=BranchProbability::getUnknown())
Add Succ as a successor of this MachineBasicBlock.
MachineInstrBundleIterator< MachineInstr, true > reverse_iterator
Instructions::const_iterator const_instr_iterator
const MachineFunction * getParent() const
Return the MachineFunction containing this basic block.
iterator_range< succ_iterator > successors()
void splice(iterator Where, MachineBasicBlock *Other, iterator From)
Take an instruction from MBB 'Other' at the position From, and insert it into this MBB right before '...
MachineInstrBundleIterator< MachineInstr > iterator
@ LQR_Dead
Register is known to be fully dead.
DominatorTree Class - Concrete subclass of DominatorTreeBase that is used to compute a normal dominat...
The MachineFrameInfo class represents an abstract stack frame until prolog/epilog code is inserted.
const TargetSubtargetInfo & getSubtarget() const
getSubtarget - Return the subtarget for which this machine code is being compiled.
MachineMemOperand * getMachineMemOperand(MachinePointerInfo PtrInfo, MachineMemOperand::Flags f, LLT MemTy, Align base_alignment, const AAMDNodes &AAInfo=AAMDNodes(), const MDNode *Ranges=nullptr, SyncScope::ID SSID=SyncScope::System, AtomicOrdering Ordering=AtomicOrdering::NotAtomic, AtomicOrdering FailureOrdering=AtomicOrdering::NotAtomic)
getMachineMemOperand - Allocate a new MachineMemOperand.
MachineFrameInfo & getFrameInfo()
getFrameInfo - Return the frame info object for the current function.
void push_back(MachineBasicBlock *MBB)
MCContext & getContext() const
MachineRegisterInfo & getRegInfo()
getRegInfo - Return information about the registers currently in use.
Function & getFunction()
Return the LLVM function that this machine code represents.
BasicBlockListType::iterator iterator
Ty * getInfo()
getInfo - Keep track of various per-function pieces of information for backends that would like to do...
MachineBasicBlock * CreateMachineBasicBlock(const BasicBlock *BB=nullptr, std::optional< UniqueBBID > BBID=std::nullopt)
CreateMachineInstr - Allocate a new MachineInstr.
void insert(iterator MBBI, MachineBasicBlock *MBB)
const TargetMachine & getTarget() const
getTarget - Return the target machine this machine code is compiled with
const MachineInstrBuilder & addImm(int64_t Val) const
Add a new immediate operand.
const MachineInstrBuilder & add(const MachineOperand &MO) const
const MachineInstrBuilder & addSym(MCSymbol *Sym, unsigned char TargetFlags=0) const
const MachineInstrBuilder & addFrameIndex(int Idx) const
const MachineInstrBuilder & addReg(Register RegNo, unsigned flags=0, unsigned SubReg=0) const
Add a new virtual register operand.
const MachineInstrBuilder & addMBB(MachineBasicBlock *MBB, unsigned TargetFlags=0) const
const MachineInstrBuilder & cloneMemRefs(const MachineInstr &OtherMI) const
const MachineInstrBuilder & addUse(Register RegNo, unsigned Flags=0, unsigned SubReg=0) const
Add a virtual register use operand.
const MachineInstrBuilder & setMIFlags(unsigned Flags) const
const MachineInstrBuilder & copyImplicitOps(const MachineInstr &OtherMI) const
Copy all the implicit operands from OtherMI onto this one.
const MachineInstrBuilder & addMemOperand(MachineMemOperand *MMO) const
const MachineInstrBuilder & addDef(Register RegNo, unsigned Flags=0, unsigned SubReg=0) const
Add a virtual register definition operand.
Representation of each machine instruction.
unsigned getOpcode() const
Returns the opcode of this MachineInstr.
bool mayLoadOrStore(QueryType Type=AnyInBundle) const
Return true if this instruction could possibly read or modify memory.
bool isCopy() const
const MachineBasicBlock * getParent() const
LLVM_ABI void addImplicitDefUseOperands(MachineFunction &MF)
Add all implicit def and use operands to this instruction.
bool isBundle() const
LLVM_ABI void addOperand(MachineFunction &MF, const MachineOperand &Op)
Add the specified operand to the instruction.
LLVM_ABI unsigned getNumExplicitOperands() const
Returns the number of non-implicit operands.
mop_range implicit_operands()
bool mayLoad(QueryType Type=AnyInBundle) const
Return true if this instruction could possibly read memory.
LLVM_ABI bool hasUnmodeledSideEffects() const
Return true if this instruction has side effects that are not modeled by mayLoad / mayStore,...
void untieRegOperand(unsigned OpIdx)
Break any tie involving OpIdx.
LLVM_ABI void setDesc(const MCInstrDesc &TID)
Replace the instruction descriptor (thus opcode) of the current instruction with a new one.
bool hasOneMemOperand() const
Return true if this instruction has exactly one MachineMemOperand.
mop_range explicit_operands()
LLVM_ABI void tieOperands(unsigned DefIdx, unsigned UseIdx)
Add a tie between the register operands at DefIdx and UseIdx.
mmo_iterator memoperands_begin() const
Access to memory operands of the instruction.
LLVM_ABI bool hasOrderedMemoryRef() const
Return true if this instruction may have an ordered or volatile memory reference, or if the informati...
LLVM_ABI const MachineFunction * getMF() const
Return the function that contains the basic block that this instruction belongs to.
ArrayRef< MachineMemOperand * > memoperands() const
Access to memory operands of the instruction.
bool mayStore(QueryType Type=AnyInBundle) const
Return true if this instruction could possibly modify memory.
const DebugLoc & getDebugLoc() const
Returns the debug location id of this MachineInstr.
bool isMoveImmediate(QueryType Type=IgnoreBundle) const
Return true if this instruction is a move immediate (including conditional moves) instruction.
LLVM_ABI void eraseFromParent()
Unlink 'this' from the containing basic block and delete it.
LLVM_ABI void removeOperand(unsigned OpNo)
Erase an operand from an instruction, leaving it with one fewer operand than it started with.
LLVM_ABI void setPostInstrSymbol(MachineFunction &MF, MCSymbol *Symbol)
Set a symbol that will be emitted just after the instruction itself.
LLVM_ABI void clearRegisterKills(Register Reg, const TargetRegisterInfo *RegInfo)
Clear all kill flags affecting Reg.
const MachineOperand & getOperand(unsigned i) const
uint32_t getFlags() const
Return the MI flags bitvector.
LLVM_ABI int findRegisterDefOperandIdx(Register Reg, const TargetRegisterInfo *TRI, bool isDead=false, bool Overlap=false) const
Returns the operand index that is a def of the specified register or -1 if it is not found.
MachineOperand * findRegisterDefOperand(Register Reg, const TargetRegisterInfo *TRI, bool isDead=false, bool Overlap=false)
Wrapper for findRegisterDefOperandIdx, it returns a pointer to the MachineOperand rather than an inde...
A description of a memory reference used in the backend.
@ MOLoad
The memory access reads data.
@ MOStore
The memory access writes data.
MachineOperand class - Representation of each machine instruction operand.
void setSubReg(unsigned subReg)
unsigned getSubReg() const
LLVM_ABI unsigned getOperandNo() const
Returns the index of this operand in the instruction that it belongs to.
const GlobalValue * getGlobal() const
void setImplicit(bool Val=true)
LLVM_ABI void ChangeToFrameIndex(int Idx, unsigned TargetFlags=0)
Replace this operand with a frame index.
void setImm(int64_t immVal)
int64_t getImm() const
bool isReg() const
isReg - Tests if this is a MO_Register operand.
void setIsDead(bool Val=true)
LLVM_ABI void setReg(Register Reg)
Change the register this operand corresponds to.
bool isImm() const
isImm - Tests if this is a MO_Immediate operand.
LLVM_ABI void ChangeToImmediate(int64_t ImmVal, unsigned TargetFlags=0)
ChangeToImmediate - Replace this operand with a new immediate operand of the specified value.
LLVM_ABI void ChangeToGA(const GlobalValue *GV, int64_t Offset, unsigned TargetFlags=0)
ChangeToGA - Replace this operand with a new global address operand.
void setIsKill(bool Val=true)
LLVM_ABI void ChangeToRegister(Register Reg, bool isDef, bool isImp=false, bool isKill=false, bool isDead=false, bool isUndef=false, bool isDebug=false)
ChangeToRegister - Replace this operand with a new register operand of the specified value.
MachineInstr * getParent()
getParent - Return the instruction that this operand belongs to.
void setOffset(int64_t Offset)
unsigned getTargetFlags() const
static MachineOperand CreateImm(int64_t Val)
bool isGlobal() const
isGlobal - Tests if this is a MO_GlobalAddress operand.
MachineOperandType getType() const
getType - Returns the MachineOperandType for this operand.
void setIsUndef(bool Val=true)
Register getReg() const
getReg - Returns the register number.
bool isTargetIndex() const
isTargetIndex - Tests if this is a MO_TargetIndex operand.
void setTargetFlags(unsigned F)
bool isFI() const
isFI - Tests if this is a MO_FrameIndex operand.
LLVM_ABI bool isIdenticalTo(const MachineOperand &Other) const
Returns true if this operand is identical to the specified operand except for liveness related flags ...
@ MO_Immediate
Immediate operand.
@ MO_Register
Register operand.
static MachineOperand CreateReg(Register Reg, bool isDef, bool isImp=false, bool isKill=false, bool isDead=false, bool isUndef=false, bool isEarlyClobber=false, unsigned SubReg=0, bool isDebug=false, bool isInternalRead=false, bool isRenamable=false)
int64_t getOffset() const
Return the offset from the symbol in this operand.
bool isFPImm() const
isFPImm - Tests if this is a MO_FPImmediate operand.
MachineRegisterInfo - Keep track of information for virtual and physical registers,...
const RegisterBank & getRegBank(unsigned ID)
Get the register bank identified by ID.
This class implements the register bank concept.
unsigned getID() const
Get the identifier of this register bank.
Wrapper class representing virtual and physical registers.
Definition Register.h:20
MCRegister asMCReg() const
Utility to check-convert this value to a MCRegister.
Definition Register.h:107
constexpr bool isValid() const
Definition Register.h:112
constexpr bool isVirtual() const
Return true if the specified register number is in the virtual register namespace.
Definition Register.h:79
constexpr bool isPhysical() const
Return true if the specified register number is in the physical register namespace.
Definition Register.h:83
Represents one node in the SelectionDAG.
bool isMachineOpcode() const
Test if this node has a post-isel opcode, directly corresponding to a MachineInstr opcode.
uint64_t getAsZExtVal() const
Helper method returns the zero-extended integer value of a ConstantSDNode.
unsigned getMachineOpcode() const
This may only be called if isMachineOpcode returns true.
const SDValue & getOperand(unsigned Num) const
uint64_t getConstantOperandVal(unsigned Num) const
Helper method returns the integer value of a ConstantSDNode operand.
Unlike LLVM values, Selection DAG nodes may return multiple values as the result of a computation.
bool isLegalMUBUFImmOffset(unsigned Imm) const
bool isInlineConstant(const APInt &Imm) const
void legalizeOperandsVOP3(MachineRegisterInfo &MRI, MachineInstr &MI) const
Fix operands in MI to satisfy constant bus requirements.
static bool isDS(const MachineInstr &MI)
MachineBasicBlock * legalizeOperands(MachineInstr &MI, MachineDominatorTree *MDT=nullptr) const
Legalize all operands in this instruction.
bool areLoadsFromSameBasePtr(SDNode *Load0, SDNode *Load1, int64_t &Offset0, int64_t &Offset1) const override
unsigned getLiveRangeSplitOpcode(Register Reg, const MachineFunction &MF) const override
bool getMemOperandsWithOffsetWidth(const MachineInstr &LdSt, SmallVectorImpl< const MachineOperand * > &BaseOps, int64_t &Offset, bool &OffsetIsScalable, LocationSize &Width, const TargetRegisterInfo *TRI) const final
Register isSGPRStackAccess(const MachineInstr &MI, int &FrameIndex) const
unsigned getInstSizeInBytes(const MachineInstr &MI) const override
static bool isNeverUniform(const MachineInstr &MI)
unsigned getOpSize(uint16_t Opcode, unsigned OpNo) const
Return the size in bytes of the operand OpNo on the given.
bool isXDLWMMA(const MachineInstr &MI) const
bool isBasicBlockPrologue(const MachineInstr &MI, Register Reg=Register()) const override
uint64_t getDefaultRsrcDataFormat() const
static bool isSOPP(const MachineInstr &MI)
InstructionUniformity getGenericInstructionUniformity(const MachineInstr &MI) const
bool isIGLP(unsigned Opcode) const
static bool isFLATScratch(const MachineInstr &MI)
const MCInstrDesc & getIndirectRegWriteMovRelPseudo(unsigned VecSize, unsigned EltSize, bool IsSGPR) const
MachineInstrBuilder getAddNoCarry(MachineBasicBlock &MBB, MachineBasicBlock::iterator I, const DebugLoc &DL, Register DestReg) const
Return a partially built integer add instruction without carry.
bool mayAccessFlatAddressSpace(const MachineInstr &MI) const
bool shouldScheduleLoadsNear(SDNode *Load0, SDNode *Load1, int64_t Offset0, int64_t Offset1, unsigned NumLoads) const override
bool splitMUBUFOffset(uint32_t Imm, uint32_t &SOffset, uint32_t &ImmOffset, Align Alignment=Align(4)) const
ArrayRef< std::pair< unsigned, const char * > > getSerializableDirectMachineOperandTargetFlags() const override
void moveToVALU(SIInstrWorklist &Worklist, MachineDominatorTree *MDT) const
Replace the instructions opcode with the equivalent VALU opcode.
static bool isSMRD(const MachineInstr &MI)
void restoreExec(MachineFunction &MF, MachineBasicBlock &MBB, MachineBasicBlock::iterator MBBI, const DebugLoc &DL, Register Reg, SlotIndexes *Indexes=nullptr) const
bool usesConstantBus(const MachineRegisterInfo &MRI, const MachineOperand &MO, const MCOperandInfo &OpInfo) const
Returns true if this operand uses the constant bus.
static unsigned getMaxMUBUFImmOffset(const GCNSubtarget &ST)
Register isStoreToStackSlot(const MachineInstr &MI, int &FrameIndex) const override
static unsigned getFoldableCopySrcIdx(const MachineInstr &MI)
bool mayAccessScratchThroughFlat(const MachineInstr &MI) const
void legalizeOperandsFLAT(MachineRegisterInfo &MRI, MachineInstr &MI) const
bool optimizeCompareInstr(MachineInstr &CmpInstr, Register SrcReg, Register SrcReg2, int64_t CmpMask, int64_t CmpValue, const MachineRegisterInfo *MRI) const override
static std::optional< int64_t > extractSubregFromImm(int64_t ImmVal, unsigned SubRegIndex)
Return the extracted immediate value in a subregister use from a constant materialized in a super reg...
Register isStackAccess(const MachineInstr &MI, int &FrameIndex) const
static bool isMTBUF(const MachineInstr &MI)
const MCInstrDesc & getIndirectGPRIDXPseudo(unsigned VecSize, bool IsIndirectSrc) const
void insertReturn(MachineBasicBlock &MBB) const
static bool isDGEMM(unsigned Opcode)
static bool isEXP(const MachineInstr &MI)
static bool isSALU(const MachineInstr &MI)
void legalizeGenericOperand(MachineBasicBlock &InsertMBB, MachineBasicBlock::iterator I, const TargetRegisterClass *DstRC, MachineOperand &Op, MachineRegisterInfo &MRI, const DebugLoc &DL) const
MachineInstr * buildShrunkInst(MachineInstr &MI, unsigned NewOpcode) const
unsigned getInstBundleSize(const MachineInstr &MI) const
static bool isVOP2(const MachineInstr &MI)
bool analyzeBranch(MachineBasicBlock &MBB, MachineBasicBlock *&TBB, MachineBasicBlock *&FBB, SmallVectorImpl< MachineOperand > &Cond, bool AllowModify=false) const override
static bool isSDWA(const MachineInstr &MI)
InstructionUniformity getInstructionUniformity(const MachineInstr &MI) const final
const MCInstrDesc & getKillTerminatorFromPseudo(unsigned Opcode) const
void insertNoops(MachineBasicBlock &MBB, MachineBasicBlock::iterator MI, unsigned Quantity) const override
static bool isGather4(const MachineInstr &MI)
MachineInstr * getWholeWaveFunctionSetup(MachineFunction &MF) const
bool isLegalVSrcOperand(const MachineRegisterInfo &MRI, const MCOperandInfo &OpInfo, const MachineOperand &MO) const
Check if MO would be a valid operand for the given operand definition OpInfo.
static bool isDOT(const MachineInstr &MI)
MachineInstr * createPHISourceCopy(MachineBasicBlock &MBB, MachineBasicBlock::iterator InsPt, const DebugLoc &DL, Register Src, unsigned SrcSubReg, Register Dst) const override
bool hasModifiers(unsigned Opcode) const
Return true if this instruction has any modifiers.
bool shouldClusterMemOps(ArrayRef< const MachineOperand * > BaseOps1, int64_t Offset1, bool OffsetIsScalable1, ArrayRef< const MachineOperand * > BaseOps2, int64_t Offset2, bool OffsetIsScalable2, unsigned ClusterSize, unsigned NumBytes) const override
static bool isSWMMAC(const MachineInstr &MI)
ScheduleHazardRecognizer * CreateTargetMIHazardRecognizer(const InstrItineraryData *II, const ScheduleDAGMI *DAG) const override
bool isWave32() const
bool isHighLatencyDef(int Opc) const override
void legalizeOpWithMove(MachineInstr &MI, unsigned OpIdx) const
Legalize the OpIndex operand of this instruction by inserting a MOV.
bool reverseBranchCondition(SmallVectorImpl< MachineOperand > &Cond) const override
static bool isVOPC(const MachineInstr &MI)
void removeModOperands(MachineInstr &MI) const
std::pair< int64_t, int64_t > splitFlatOffset(int64_t COffsetVal, unsigned AddrSpace, uint64_t FlatVariant) const
Split COffsetVal into {immediate offset field, remainder offset} values.
unsigned getVectorRegSpillRestoreOpcode(Register Reg, const TargetRegisterClass *RC, unsigned Size, const SIMachineFunctionInfo &MFI) const
bool isXDL(const MachineInstr &MI) const
static bool isVIMAGE(const MachineInstr &MI)
void enforceOperandRCAlignment(MachineInstr &MI, AMDGPU::OpName OpName) const
static bool isSOP2(const MachineInstr &MI)
static bool isGWS(const MachineInstr &MI)
bool isLegalAV64PseudoImm(uint64_t Imm) const
Check if this immediate value can be used for AV_MOV_B64_IMM_PSEUDO.
bool isNeverCoissue(MachineInstr &MI) const
bool hasModifiersSet(const MachineInstr &MI, AMDGPU::OpName OpName) const
const TargetRegisterClass * getPreferredSelectRegClass(unsigned Size) const
bool isLegalToSwap(const MachineInstr &MI, unsigned fromIdx, unsigned toIdx) const
static bool isFLATGlobal(const MachineInstr &MI)
bool isGlobalMemoryObject(const MachineInstr *MI) const override
static bool isVSAMPLE(const MachineInstr &MI)
bool isBufferSMRD(const MachineInstr &MI) const
static bool isKillTerminator(unsigned Opcode)
bool findCommutedOpIndices(const MachineInstr &MI, unsigned &SrcOpIdx0, unsigned &SrcOpIdx1) const override
void storeRegToStackSlot(MachineBasicBlock &MBB, MachineBasicBlock::iterator MI, Register SrcReg, bool isKill, int FrameIndex, const TargetRegisterClass *RC, const TargetRegisterInfo *TRI, Register VReg, MachineInstr::MIFlag Flags=MachineInstr::NoFlags) const override
void insertScratchExecCopy(MachineFunction &MF, MachineBasicBlock &MBB, MachineBasicBlock::iterator MBBI, const DebugLoc &DL, Register Reg, bool IsSCCLive, SlotIndexes *Indexes=nullptr) const
bool hasVALU32BitEncoding(unsigned Opcode) const
Return true if this 64-bit VALU instruction has a 32-bit encoding.
unsigned getMovOpcode(const TargetRegisterClass *DstRC) const
void reMaterialize(MachineBasicBlock &MBB, MachineBasicBlock::iterator MI, Register DestReg, unsigned SubIdx, const MachineInstr &Orig, const TargetRegisterInfo &TRI) const override
unsigned buildExtractSubReg(MachineBasicBlock::iterator MI, MachineRegisterInfo &MRI, const MachineOperand &SuperReg, const TargetRegisterClass *SuperRC, unsigned SubIdx, const TargetRegisterClass *SubRC) const
void legalizeOperandsVOP2(MachineRegisterInfo &MRI, MachineInstr &MI) const
Legalize operands in MI by either commuting it or inserting a copy of src1.
bool foldImmediate(MachineInstr &UseMI, MachineInstr &DefMI, Register Reg, MachineRegisterInfo *MRI) const final
static bool isTRANS(const MachineInstr &MI)
static bool isImage(const MachineInstr &MI)
static bool isSOPK(const MachineInstr &MI)
const TargetRegisterClass * getOpRegClass(const MachineInstr &MI, unsigned OpNo) const
Return the correct register class for OpNo.
MachineBasicBlock * insertSimulatedTrap(MachineRegisterInfo &MRI, MachineBasicBlock &MBB, MachineInstr &MI, const DebugLoc &DL) const
Build instructions that simulate the behavior of an s_trap 2 instruction for hardware (namely,...
static unsigned getNonSoftWaitcntOpcode(unsigned Opcode)
static unsigned getDSShaderTypeValue(const MachineFunction &MF)
static bool isFoldableCopy(const MachineInstr &MI)
bool mayAccessLDSThroughFlat(const MachineInstr &MI) const
bool isIgnorableUse(const MachineOperand &MO) const override
static bool isMUBUF(const MachineInstr &MI)
bool expandPostRAPseudo(MachineInstr &MI) const override
bool analyzeCompare(const MachineInstr &MI, Register &SrcReg, Register &SrcReg2, int64_t &CmpMask, int64_t &CmpValue) const override
const TargetRegisterClass * getRegClass(const MCInstrDesc &TID, unsigned OpNum, const TargetRegisterInfo *TRI) const override
static bool isSegmentSpecificFLAT(const MachineInstr &MI)
bool isReMaterializableImpl(const MachineInstr &MI) const override
static bool isVOP3(const MCInstrDesc &Desc)
bool physRegUsesConstantBus(const MachineOperand &Reg) const
static bool isF16PseudoScalarTrans(unsigned Opcode)
void insertSelect(MachineBasicBlock &MBB, MachineBasicBlock::iterator I, const DebugLoc &DL, Register DstReg, ArrayRef< MachineOperand > Cond, Register TrueReg, Register FalseReg) const override
bool mayAccessVMEMThroughFlat(const MachineInstr &MI) const
static bool isDPP(const MachineInstr &MI)
bool analyzeBranchImpl(MachineBasicBlock &MBB, MachineBasicBlock::iterator I, MachineBasicBlock *&TBB, MachineBasicBlock *&FBB, SmallVectorImpl< MachineOperand > &Cond, bool AllowModify) const
static bool isMFMA(const MachineInstr &MI)
bool isLowLatencyInstruction(const MachineInstr &MI) const
std::optional< DestSourcePair > isCopyInstrImpl(const MachineInstr &MI) const override
If the specific machine instruction is an instruction that moves/copies a value from one register to ano...
bool isAlwaysGDS(uint16_t Opcode) const
static bool isMAI(const MCInstrDesc &Desc)
static bool usesLGKM_CNT(const MachineInstr &MI)
Register isLoadFromStackSlot(const MachineInstr &MI, int &FrameIndex) const override
void legalizeOperandsVALUt16(MachineInstr &Inst, MachineRegisterInfo &MRI) const
Fix operands in Inst for 16-bit SALU to VALU lowering.
void moveToVALUImpl(SIInstrWorklist &Worklist, MachineDominatorTree *MDT, MachineInstr &Inst) const
bool isImmOperandLegal(const MCInstrDesc &InstDesc, unsigned OpNo, const MachineOperand &MO) const
bool canShrink(const MachineInstr &MI, const MachineRegisterInfo &MRI) const
bool isAsmOnlyOpcode(int MCOp) const
Check if this instruction should only be used by assembler.
static bool setsSCCifResultIsNonZero(const MachineInstr &MI)
static bool isVGPRSpill(const MachineInstr &MI)
ScheduleHazardRecognizer * CreateTargetPostRAHazardRecognizer(const InstrItineraryData *II, const ScheduleDAG *DAG) const override
This is used by the post-RA scheduler (SchedulePostRAList.cpp).
bool verifyInstruction(const MachineInstr &MI, StringRef &ErrInfo) const override
bool isLegalFLATOffset(int64_t Offset, unsigned AddrSpace, uint64_t FlatVariant) const
Returns whether Offset is legal for the subtarget as the offset to a FLAT encoded instruction with the giv...
static bool isWWMRegSpillOpcode(uint16_t Opcode)
unsigned getInstrLatency(const InstrItineraryData *ItinData, const MachineInstr &MI, unsigned *PredCost=nullptr) const override
MachineInstr * foldMemoryOperandImpl(MachineFunction &MF, MachineInstr &MI, ArrayRef< unsigned > Ops, MachineBasicBlock::iterator InsertPt, int FrameIndex, LiveIntervals *LIS=nullptr, VirtRegMap *VRM=nullptr) const override
int64_t getNamedImmOperand(const MachineInstr &MI, AMDGPU::OpName OperandName) const
Get required immediate operand.
ArrayRef< std::pair< int, const char * > > getSerializableTargetIndices() const override
bool regUsesConstantBus(const MachineOperand &Reg, const MachineRegisterInfo &MRI) const
static bool isMIMG(const MachineInstr &MI)
MachineOperand buildExtractSubRegOrImm(MachineBasicBlock::iterator MI, MachineRegisterInfo &MRI, const MachineOperand &SuperReg, const TargetRegisterClass *SuperRC, unsigned SubIdx, const TargetRegisterClass *SubRC) const
bool isSchedulingBoundary(const MachineInstr &MI, const MachineBasicBlock *MBB, const MachineFunction &MF) const override
void loadRegFromStackSlot(MachineBasicBlock &MBB, MachineBasicBlock::iterator MI, Register DestReg, int FrameIndex, const TargetRegisterClass *RC, const TargetRegisterInfo *TRI, Register VReg, MachineInstr::MIFlag Flags=MachineInstr::NoFlags) const override
bool isLegalRegOperand(const MachineRegisterInfo &MRI, const MCOperandInfo &OpInfo, const MachineOperand &MO) const
Check if MO (a register operand) is a legal register for the given operand description or operand ind...
bool allowNegativeFlatOffset(uint64_t FlatVariant) const
Returns true if negative offsets are allowed for the given FlatVariant.
static unsigned getNumWaitStates(const MachineInstr &MI)
Return the number of wait states that result from executing this instruction.
unsigned getVectorRegSpillSaveOpcode(Register Reg, const TargetRegisterClass *RC, unsigned Size, const SIMachineFunctionInfo &MFI) const
unsigned getVALUOp(const MachineInstr &MI) const
static bool modifiesModeRegister(const MachineInstr &MI)
Return true if the instruction modifies the mode register.
Register readlaneVGPRToSGPR(Register SrcReg, MachineInstr &UseMI, MachineRegisterInfo &MRI, const TargetRegisterClass *DstRC=nullptr) const
Copy a value from a VGPR (SrcReg) to an SGPR.
bool hasDivergentBranch(const MachineBasicBlock *MBB) const
Return whether the block terminates with a divergent branch.
unsigned removeBranch(MachineBasicBlock &MBB, int *BytesRemoved=nullptr) const override
void fixImplicitOperands(MachineInstr &MI) const
bool moveFlatAddrToVGPR(MachineInstr &Inst) const
Change the SADDR form of a FLAT Inst to its VADDR form if the saddr operand was moved to a VGPR.
void copyPhysReg(MachineBasicBlock &MBB, MachineBasicBlock::iterator MI, const DebugLoc &DL, Register DestReg, Register SrcReg, bool KillSrc, bool RenamableDest=false, bool RenamableSrc=false) const override
bool swapSourceModifiers(MachineInstr &MI, MachineOperand &Src0, AMDGPU::OpName Src0OpName, MachineOperand &Src1, AMDGPU::OpName Src1OpName) const
Register insertNE(MachineBasicBlock *MBB, MachineBasicBlock::iterator I, const DebugLoc &DL, Register SrcReg, int Value) const
MachineBasicBlock * getBranchDestBlock(const MachineInstr &MI) const override
bool hasUnwantedEffectsWhenEXECEmpty(const MachineInstr &MI) const
This function is used to determine if an instruction can be safely executed under EXEC = 0 without ha...
bool getConstValDefinedInReg(const MachineInstr &MI, const Register Reg, int64_t &ImmVal) const override
static bool isAtomic(const MachineInstr &MI)
bool canInsertSelect(const MachineBasicBlock &MBB, ArrayRef< MachineOperand > Cond, Register DstReg, Register TrueReg, Register FalseReg, int &CondCycles, int &TrueCycles, int &FalseCycles) const override
bool isLiteralOperandLegal(const MCInstrDesc &InstDesc, const MCOperandInfo &OpInfo) const
static bool sopkIsZext(unsigned Opcode)
static bool isSGPRSpill(const MachineInstr &MI)
static bool isWMMA(const MachineInstr &MI)
ArrayRef< std::pair< MachineMemOperand::Flags, const char * > > getSerializableMachineMemOperandTargetFlags() const override
MachineInstr * convertToThreeAddress(MachineInstr &MI, LiveVariables *LV, LiveIntervals *LIS) const override
bool mayReadEXEC(const MachineRegisterInfo &MRI, const MachineInstr &MI) const
Returns true if the instruction could potentially depend on the value of exec.
void legalizeOperandsSMRD(MachineRegisterInfo &MRI, MachineInstr &MI) const
bool isBranchOffsetInRange(unsigned BranchOpc, int64_t BrOffset) const override
unsigned insertBranch(MachineBasicBlock &MBB, MachineBasicBlock *TBB, MachineBasicBlock *FBB, ArrayRef< MachineOperand > Cond, const DebugLoc &DL, int *BytesAdded=nullptr) const override
void insertVectorSelect(MachineBasicBlock &MBB, MachineBasicBlock::iterator I, const DebugLoc &DL, Register DstReg, ArrayRef< MachineOperand > Cond, Register TrueReg, Register FalseReg) const
void insertNoop(MachineBasicBlock &MBB, MachineBasicBlock::iterator MI) const override
std::pair< MachineInstr *, MachineInstr * > expandMovDPP64(MachineInstr &MI) const
Register insertEQ(MachineBasicBlock *MBB, MachineBasicBlock::iterator I, const DebugLoc &DL, Register SrcReg, int Value) const
static bool isSOPC(const MachineInstr &MI)
static bool isFLAT(const MachineInstr &MI)
static bool isVALU(const MachineInstr &MI)
bool isBarrier(unsigned Opcode) const
MachineInstr * commuteInstructionImpl(MachineInstr &MI, bool NewMI, unsigned OpIdx0, unsigned OpIdx1) const override
int pseudoToMCOpcode(int Opcode) const
Return a target-specific opcode if Opcode is a pseudo instruction.
const MCInstrDesc & getMCOpcodeFromPseudo(unsigned Opcode) const
Return the descriptor of the target-specific machine instruction that corresponds to the specified ps...
bool isLegalGFX12PlusPackedMathFP32Operand(const MachineRegisterInfo &MRI, const MachineInstr &MI, unsigned SrcN, const MachineOperand *MO=nullptr) const
Check if MO would be a legal operand for gfx12+ packed math FP32 instructions.
static bool usesVM_CNT(const MachineInstr &MI)
MachineInstr * createPHIDestinationCopy(MachineBasicBlock &MBB, MachineBasicBlock::iterator InsPt, const DebugLoc &DL, Register Src, Register Dst) const override
static bool isFixedSize(const MachineInstr &MI)
bool isSafeToSink(MachineInstr &MI, MachineBasicBlock *SuccToSinkTo, MachineCycleInfo *CI) const override
LLVM_READONLY int commuteOpcode(unsigned Opc) const
uint64_t getScratchRsrcWords23() const
LLVM_READONLY MachineOperand * getNamedOperand(MachineInstr &MI, AMDGPU::OpName OperandName) const
Returns the operand named Op.
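A small sketch of looking an operand up by name instead of by positional index; the helper below and its focus on src0 are illustrative assumptions, not code from this file:
// Sketch: return MI's immediate src0, if it has one (assumes SIInstrInfo.h
// and <optional> are included).
static std::optional<int64_t> getSrc0Imm(const SIInstrInfo &TII,
                                         const MachineInstr &MI) {
  if (const MachineOperand *Src0 =
          TII.getNamedOperand(MI, AMDGPU::OpName::src0))
    if (Src0->isImm())
      return Src0->getImm();
  return std::nullopt;
}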
std::pair< unsigned, unsigned > decomposeMachineOperandsTargetFlags(unsigned TF) const override
bool areMemAccessesTriviallyDisjoint(const MachineInstr &MIa, const MachineInstr &MIb) const override
bool isOperandLegal(const MachineInstr &MI, unsigned OpIdx, const MachineOperand *MO=nullptr) const
Check if MO would be a legal operand if used as the OpIdx operand of MI.
static bool isLDSDMA(const MachineInstr &MI)
static bool isVOP1(const MachineInstr &MI)
SIInstrInfo(const GCNSubtarget &ST)
void insertIndirectBranch(MachineBasicBlock &MBB, MachineBasicBlock &NewDestBB, MachineBasicBlock &RestoreBB, const DebugLoc &DL, int64_t BrOffset, RegScavenger *RS) const override
bool hasAnyModifiersSet(const MachineInstr &MI) const
This class keeps track of the SPI_SP_INPUT_ADDR config register, which tells the hardware which inter...
void setHasSpilledVGPRs(bool Spill=true)
bool isWWMReg(Register Reg) const
bool checkFlag(Register Reg, uint8_t Flag) const
void setHasSpilledSGPRs(bool Spill=true)
static unsigned getSubRegFromChannel(unsigned Channel, unsigned NumRegs=1)
ArrayRef< int16_t > getRegSplitParts(const TargetRegisterClass *RC, unsigned EltSize) const
unsigned getHWRegIndex(MCRegister Reg) const
unsigned getRegPressureLimit(const TargetRegisterClass *RC, MachineFunction &MF) const override
unsigned getChannelFromSubReg(unsigned SubReg) const
static bool isAGPRClass(const TargetRegisterClass *RC)
ScheduleDAGMI is an implementation of ScheduleDAGInstrs that simply schedules machine instructions ac...
virtual bool hasVRegLiveness() const
Return true if this DAG supports VReg liveness and RegPressure.
MachineFunction & MF
Machine function.
HazardRecognizer - This determines whether or not an instruction can be issued this cycle,...
SlotIndex - An opaque wrapper around machine indexes.
Definition SlotIndexes.h:66
SlotIndex getRegSlot(bool EC=false) const
Returns the register use/def slot in the current instruction for a normal or early-clobber def.
SlotIndexes pass.
SlotIndex insertMachineInstrInMaps(MachineInstr &MI, bool Late=false)
Insert the given machine instruction into the mapping.
Implements a dense probed hash-table based set with some number of buckets stored inline.
Definition DenseSet.h:291
This class consists of common code factored out of the SmallVector class to reduce code duplication b...
void push_back(const T &Elt)
This is a 'vector' (really, a variable-sized array), optimized for the case when the array is small.
int64_t getImm() const
Register getReg() const
StringRef - Represent a constant reference to a string, i.e.
Definition StringRef.h:55
virtual ScheduleHazardRecognizer * CreateTargetMIHazardRecognizer(const InstrItineraryData *, const ScheduleDAGMI *DAG) const
Allocate and return a hazard recognizer to use for this target when scheduling the machine instructio...
virtual MachineInstr * createPHIDestinationCopy(MachineBasicBlock &MBB, MachineBasicBlock::iterator InsPt, const DebugLoc &DL, Register Src, Register Dst) const
During PHI elimination, lets the target make necessary checks and insert the copy to the PHI destinati...
virtual bool isReMaterializableImpl(const MachineInstr &MI) const
For instructions with opcodes for which the M_REMATERIALIZABLE flag is set, this hook lets the target...
virtual void reMaterialize(MachineBasicBlock &MBB, MachineBasicBlock::iterator MI, Register DestReg, unsigned SubIdx, const MachineInstr &Orig, const TargetRegisterInfo &TRI) const
Re-issue the specified 'original' instruction at the specific location targeting a new destination re...
virtual MachineInstr * createPHISourceCopy(MachineBasicBlock &MBB, MachineBasicBlock::iterator InsPt, const DebugLoc &DL, Register Src, unsigned SrcSubReg, Register Dst) const
During PHI elimination, lets the target make necessary checks and insert the copy to the PHI destinati...
virtual MachineInstr * commuteInstructionImpl(MachineInstr &MI, bool NewMI, unsigned OpIdx1, unsigned OpIdx2) const
This method commutes the operands of the given machine instruction MI.
virtual bool isGlobalMemoryObject(const MachineInstr *MI) const
Returns true if MI is an instruction we are unable to reason about (like a call or something with unm...
virtual bool expandPostRAPseudo(MachineInstr &MI) const
This function is called for all pseudo instructions that remain after register allocation.
const MCAsmInfo * getMCAsmInfo() const
Return target specific asm information.
bool contains(Register Reg) const
Return true if the specified register is included in this register class.
bool hasSuperClassEq(const TargetRegisterClass *RC) const
Returns true if RC is a super-class of or equal to this class.
TargetRegisterInfo base class - We assume that the target defines a static array of TargetRegisterDes...
static constexpr TypeSize getFixed(ScalarTy ExactSize)
Definition TypeSize.h:344
A Use represents the edge between a Value definition and its users.
Definition Use.h:35
LLVM Value Representation.
Definition Value.h:75
std::pair< iterator, bool > insert(const ValueT &V)
Definition DenseSet.h:202
size_type count(const_arg_type_t< ValueT > V) const
Return 1 if the specified key is in the set, 0 otherwise.
Definition DenseSet.h:180
self_iterator getIterator()
Definition ilist_node.h:123
Changed
#define llvm_unreachable(msg)
Marks that the current location is not supposed to be reachable.
@ REGION_ADDRESS
Address space for region memory. (GDS)
@ LOCAL_ADDRESS
Address space for local memory.
@ FLAT_ADDRESS
Address space for flat memory.
@ GLOBAL_ADDRESS
Address space for global memory (RAT0, VTX0).
@ PRIVATE_ADDRESS
Address space for private memory.
unsigned encodeFieldSaSdst(unsigned Encoded, unsigned SaSdst)
bool isPackedFP32Inst(unsigned Opc)
bool isInlinableLiteralBF16(int16_t Literal, bool HasInv2Pi)
const uint64_t RSRC_DATA_FORMAT
LLVM_READONLY int getBasicFromSDWAOp(uint16_t Opcode)
LLVM_READONLY const MIMGInfo * getMIMGInfo(unsigned Opc)
bool isInlinableLiteralFP16(int16_t Literal, bool HasInv2Pi)
LLVM_READONLY int getVOPe32(uint16_t Opcode)
bool getWMMAIsXDL(unsigned Opc)
unsigned mapWMMA2AddrTo3AddrOpcode(unsigned Opc)
bool isInlinableLiteralV2I16(uint32_t Literal)
bool isHi16Reg(MCRegister Reg, const MCRegisterInfo &MRI)
bool isInlinableLiteralV2BF16(uint32_t Literal)
LLVM_READONLY int getFlatScratchInstSVfromSS(uint16_t Opcode)
unsigned getNumFlatOffsetBits(const MCSubtargetInfo &ST)
For pre-GFX12 FLAT instructions the offset must be positive; MSB is ignored and forced to zero.
bool isGFX12Plus(const MCSubtargetInfo &STI)
bool isInlinableLiteralV2F16(uint32_t Literal)
LLVM_READONLY int getGlobalVaddrOp(uint16_t Opcode)
bool isValid32BitLiteral(uint64_t Val, bool IsFP64)
LLVM_READNONE bool isLegalDPALU_DPPControl(const MCSubtargetInfo &ST, unsigned DC)
bool getMAIIsGFX940XDL(unsigned Opc)
const uint64_t RSRC_ELEMENT_SIZE_SHIFT
LLVM_READONLY int getAddr64Inst(uint16_t Opcode)
bool isIntrinsicAlwaysUniform(unsigned IntrID)
LLVM_READONLY int getMFMAEarlyClobberOp(uint16_t Opcode)
LLVM_READONLY bool hasNamedOperand(uint64_t Opcode, OpName NamedIdx)
LLVM_READONLY const MIMGDimInfo * getMIMGDimInfoByEncoding(uint8_t DimEnc)
bool isInlinableLiteral32(int32_t Literal, bool HasInv2Pi)
const uint64_t RSRC_TID_ENABLE
bool isIntrinsicSourceOfDivergence(unsigned IntrID)
constexpr bool isSISrcOperand(const MCOperandInfo &OpInfo)
Is this an AMDGPU specific source operand?
bool isGenericAtomic(unsigned Opc)
LLVM_READNONE bool isInlinableIntLiteral(int64_t Literal)
Is this literal inlinable, and not one of the values intended for floating point operands.
LLVM_READONLY int getCommuteRev(uint16_t Opcode)
unsigned getAddrSizeMIMGOp(const MIMGBaseOpcodeInfo *BaseOpcode, const MIMGDimInfo *Dim, bool IsA16, bool IsG16Supported)
@ OPERAND_KIMM32
Operand with 32-bit immediate that uses the constant bus.
Definition SIDefines.h:231
@ OPERAND_REG_IMM_INT64
Definition SIDefines.h:202
@ OPERAND_REG_IMM_V2FP16
Definition SIDefines.h:209
@ OPERAND_REG_INLINE_C_FP64
Definition SIDefines.h:222
@ OPERAND_REG_INLINE_C_BF16
Definition SIDefines.h:219
@ OPERAND_REG_INLINE_C_V2BF16
Definition SIDefines.h:224
@ OPERAND_REG_IMM_V2INT16
Definition SIDefines.h:210
@ OPERAND_REG_IMM_BF16
Definition SIDefines.h:206
@ OPERAND_REG_IMM_INT32
Operands with register, 32-bit, or 64-bit immediate.
Definition SIDefines.h:201
@ OPERAND_REG_IMM_V2BF16
Definition SIDefines.h:208
@ OPERAND_REG_IMM_FP16
Definition SIDefines.h:207
@ OPERAND_REG_INLINE_C_INT64
Definition SIDefines.h:218
@ OPERAND_REG_INLINE_C_INT16
Operands with register or inline constant.
Definition SIDefines.h:216
@ OPERAND_REG_IMM_NOINLINE_V2FP16
Definition SIDefines.h:211
@ OPERAND_REG_IMM_FP64
Definition SIDefines.h:205
@ OPERAND_REG_INLINE_C_V2FP16
Definition SIDefines.h:225
@ OPERAND_REG_INLINE_AC_INT32
Operands with an AccVGPR register or inline constant.
Definition SIDefines.h:236
@ OPERAND_REG_INLINE_AC_FP32
Definition SIDefines.h:237
@ OPERAND_REG_IMM_V2INT32
Definition SIDefines.h:212
@ OPERAND_SDWA_VOPC_DST
Definition SIDefines.h:248
@ OPERAND_REG_IMM_FP32
Definition SIDefines.h:204
@ OPERAND_REG_INLINE_C_FP32
Definition SIDefines.h:221
@ OPERAND_REG_INLINE_C_INT32
Definition SIDefines.h:217
@ OPERAND_REG_INLINE_C_V2INT16
Definition SIDefines.h:223
@ OPERAND_INLINE_C_AV64_PSEUDO
Definition SIDefines.h:242
@ OPERAND_REG_IMM_V2FP32
Definition SIDefines.h:213
@ OPERAND_REG_INLINE_AC_FP64
Definition SIDefines.h:238
@ OPERAND_REG_INLINE_C_FP16
Definition SIDefines.h:220
@ OPERAND_REG_IMM_INT16
Definition SIDefines.h:203
@ OPERAND_INLINE_SPLIT_BARRIER_INT32
Definition SIDefines.h:228
bool isDPALU_DPP(const MCInstrDesc &OpDesc, const MCInstrInfo &MII, const MCSubtargetInfo &ST)
@ TI_SCRATCH_RSRC_DWORD1
Definition AMDGPU.h:578
@ TI_SCRATCH_RSRC_DWORD3
Definition AMDGPU.h:580
@ TI_SCRATCH_RSRC_DWORD0
Definition AMDGPU.h:577
@ TI_SCRATCH_RSRC_DWORD2
Definition AMDGPU.h:579
@ TI_CONSTDATA_START
Definition AMDGPU.h:576
LLVM_READONLY int getCommuteOrig(uint16_t Opcode)
unsigned getRegBitWidth(const TargetRegisterClass &RC)
Get the size in bits of a register from the register class RC.
bool isGFX1250(const MCSubtargetInfo &STI)
int getMCOpcode(uint16_t Opcode, unsigned Gen)
bool supportsScaleOffset(const MCInstrInfo &MII, unsigned Opcode)
const uint64_t RSRC_INDEX_STRIDE_SHIFT
LLVM_READONLY const MIMGBaseOpcodeInfo * getMIMGBaseOpcodeInfo(unsigned BaseOpcode)
bool isInlinableLiteralI16(int32_t Literal, bool HasInv2Pi)
LLVM_READNONE constexpr bool isGraphics(CallingConv::ID CC)
bool isInlinableLiteral64(int64_t Literal, bool HasInv2Pi)
Is this literal inlinable.
LLVM_READONLY int getIfAddr64Inst(uint16_t Opcode)
Check if Opcode is an Addr64 opcode.
@ AMDGPU_CS
Used for Mesa/AMDPAL compute shaders.
@ AMDGPU_VS
Used for Mesa vertex shaders, or AMDPAL last shader stage before rasterization (vertex shader if tess...
@ AMDGPU_KERNEL
Used for AMDGPU code object kernels.
@ AMDGPU_HS
Used for Mesa/AMDPAL hull shaders (= tessellation control shaders).
@ AMDGPU_GS
Used for Mesa/AMDPAL geometry shaders.
@ AMDGPU_PS
Used for Mesa/AMDPAL pixel shaders.
@ Fast
Attempts to make calls as fast as possible (e.g.
Definition CallingConv.h:41
@ AMDGPU_ES
Used for AMDPAL shader stage before geometry shader if geometry is in use.
@ AMDGPU_LS
Used for AMDPAL vertex shader if tessellation is in use.
@ C
The default llvm calling convention, compatible with C.
Definition CallingConv.h:34
@ OPERAND_GENERIC_4
Definition MCInstrDesc.h:72
@ OPERAND_GENERIC_2
Definition MCInstrDesc.h:70
@ OPERAND_GENERIC_1
Definition MCInstrDesc.h:69
@ OPERAND_GENERIC_3
Definition MCInstrDesc.h:71
@ OPERAND_IMMEDIATE
Definition MCInstrDesc.h:62
@ OPERAND_GENERIC_0
Definition MCInstrDesc.h:68
@ OPERAND_GENERIC_5
Definition MCInstrDesc.h:73
@ Implicit
Not emitted register (e.g. carry, or temporary result).
@ Dead
Unused definition.
@ Define
Register definition.
@ Kill
The last use of a register.
@ Undef
Value of the register doesn't matter.
Not(const Pred &P) -> Not< Pred >
initializer< Ty > init(const Ty &Val)
This is an optimization pass for GlobalISel generic memory operations.
auto drop_begin(T &&RangeOrContainer, size_t N=1)
Return a range covering RangeOrContainer with the first N elements excluded.
Definition STLExtras.h:316
@ Low
Lower the current thread's priority such that it does not affect foreground tasks significantly.
Definition Threading.h:280
@ Offset
Definition DWP.cpp:477
LLVM_ABI void finalizeBundle(MachineBasicBlock &MBB, MachineBasicBlock::instr_iterator FirstMI, MachineBasicBlock::instr_iterator LastMI)
finalizeBundle - Finalize a machine instruction bundle which includes a sequence of instructions star...
TargetInstrInfo::RegSubRegPair getRegSubRegPair(const MachineOperand &O)
Create RegSubRegPair from a register MachineOperand.
bool all_of(R &&range, UnaryPredicate P)
Provide wrappers to std::all_of which take ranges instead of having to pass begin/end explicitly.
Definition STLExtras.h:1725
constexpr uint64_t maxUIntN(uint64_t N)
Gets the maximum value for an N-bit unsigned integer.
Definition MathExtras.h:207
MachineInstrBuilder BuildMI(MachineFunction &MF, const MIMetadata &MIMD, const MCInstrDesc &MCID)
Builder interface. Specify how to create the initial instruction itself.
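A representative use, in the style this file follows throughout; MBB, I, DL, DestReg, and TII are assumed to be in scope, and the opcode is only an example:
// Materialize a 32-bit zero into DestReg before insertion point I.
BuildMI(MBB, I, DL, TII.get(AMDGPU::V_MOV_B32_e32), DestReg)
    .addImm(0);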
constexpr bool isInt(int64_t x)
Checks if an integer fits into the given bit width.
Definition MathExtras.h:165
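A generic illustration of the template form (the bit width and values are arbitrary; assumes llvm/Support/MathExtras.h is included):
static_assert(isInt<16>(32767) && !isInt<16>(32768),
              "16-bit signed range check");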
bool execMayBeModifiedBeforeUse(const MachineRegisterInfo &MRI, Register VReg, const MachineInstr &DefMI, const MachineInstr &UseMI)
Return false if EXEC is not changed between the def of VReg at DefMI and the use at UseMI.
auto enumerate(FirstRange &&First, RestRanges &&...Rest)
Given two or more input ranges, returns a new range whose values are tuples (A, B,...
Definition STLExtras.h:2472
decltype(auto) dyn_cast(const From &Val)
dyn_cast<X> - Return the argument parameter cast to the specified type.
Definition Casting.h:643
iterator_range< T > make_range(T x, T y)
Convenience function for iterating over sub-ranges.
iterator_range< early_inc_iterator_impl< detail::IterOfRange< RangeT > > > make_early_inc_range(RangeT &&Range)
Make a range that does early increment to allow mutation of the underlying range without disrupting i...
Definition STLExtras.h:632
constexpr T alignDown(U Value, V Align, W Skew=0)
Returns the largest unsigned integer less than or equal to Value that is Skew mod Align.
Definition MathExtras.h:546
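For example (illustrative values only):
static_assert(alignDown(23u, 8u) == 16u, "23 rounded down to a multiple of 8");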
constexpr bool isPowerOf2_64(uint64_t Value)
Return true if the argument is a power of two > 0 (64-bit edition).
Definition MathExtras.h:284
Op::Description Desc
constexpr int popcount(T Value) noexcept
Count the number of set bits in a value.
Definition bit.h:154
int countr_zero(T Val)
Count the number of 0s from the least significant bit to the most significant, stopping at the first 1.
Definition bit.h:202
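Two small illustrations of these bit utilities (values are arbitrary; assumes llvm/ADT/bit.h is included):
static_assert(llvm::popcount(0xF0u) == 4, "four bits set");
// llvm::countr_zero(0x10u) == 4: four zero bits below the lowest set bit.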
TargetInstrInfo::RegSubRegPair getRegSequenceSubReg(MachineInstr &MI, unsigned SubReg)
Return the SubReg component from REG_SEQUENCE.
static const MachineMemOperand::Flags MONoClobber
Mark the MMO of a uniform load if there are no potentially clobbering stores on any path from the sta...
Definition SIInstrInfo.h:44
constexpr bool has_single_bit(T Value) noexcept
Definition bit.h:147
bool any_of(R &&range, UnaryPredicate P)
Provide wrappers to std::any_of which take ranges instead of having to pass begin/end explicitly.
Definition STLExtras.h:1732
unsigned Log2_32(uint32_t Value)
Return the floor log base 2 of the specified value, -1 if the value is zero.
Definition MathExtras.h:331
auto reverse(ContainerTy &&C)
Definition STLExtras.h:406
MachineInstr * getImm(const MachineOperand &MO, const MachineRegisterInfo *MRI)
MachineInstr * getVRegSubRegDef(const TargetInstrInfo::RegSubRegPair &P, const MachineRegisterInfo &MRI)
Return the defining instruction for a given reg:subreg pair, skipping copy-like instructions and subre...
decltype(auto) get(const PointerIntPair< PointerTy, IntBits, IntType, PtrTraits, Info > &Pair)
constexpr uint32_t Hi_32(uint64_t Value)
Return the high 32 bits of a 64 bit value.
Definition MathExtras.h:150
LLVM_ABI raw_ostream & dbgs()
dbgs() - This returns a reference to a raw_ostream for debugging messages.
Definition Debug.cpp:207
FunctionAddr VTableAddr Count
Definition InstrProf.h:139
constexpr bool isUInt(uint64_t x)
Checks if an unsigned integer fits into the given bit width.
Definition MathExtras.h:189
class LLVM_GSL_OWNER SmallVector
Forward declaration of SmallVector so that calculateSmallVectorDefaultInlinedElements can reference s...
constexpr uint32_t Lo_32(uint64_t Value)
Return the low 32 bits of a 64 bit value.
Definition MathExtras.h:155
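Together with Hi_32 above, this is the usual way a 64-bit immediate is split into two 32-bit halves (the constant is illustrative):
static_assert(Hi_32(0x1234ABCD5678EF01ULL) == 0x1234ABCDu, "upper half");
static_assert(Lo_32(0x1234ABCD5678EF01ULL) == 0x5678EF01u, "lower half");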
bool isa(const From &Val)
isa<X> - Return true if the parameter to the template is an instance of one of the template type argu...
Definition Casting.h:547
static const MachineMemOperand::Flags MOCooperative
Mark the MMO of cooperative load/store atomics.
Definition SIInstrInfo.h:52
constexpr T divideCeil(U Numerator, V Denominator)
Returns the integer ceil(Numerator / Denominator).
Definition MathExtras.h:394
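For example (illustrative values only):
static_assert(divideCeil(70u, 32u) == 3u, "70 / 32 rounded up");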
@ First
Helpers to iterate all locations in the MemoryEffectsBase class.
Definition ModRef.h:71
FunctionAddr VTableAddr uintptr_t uintptr_t Data
Definition InstrProf.h:189
unsigned getUndefRegState(bool B)
@ Xor
Bitwise or logical XOR of integers.
@ Sub
Subtraction of integers.
@ Add
Sum of integers.
unsigned getKillRegState(bool B)
bool isTargetSpecificOpcode(unsigned Opcode)
Check whether the given Opcode is a target-specific opcode.
DWARFExpression::Operation Op
ArrayRef(const T &OneElt) -> ArrayRef< T >
constexpr unsigned DefaultMemoryClusterDWordsLimit
Definition SIInstrInfo.h:40
constexpr unsigned BitWidth
constexpr bool isIntN(unsigned N, int64_t x)
Checks if a signed integer fits into the given (dynamic) bit width.
Definition MathExtras.h:248
static const MachineMemOperand::Flags MOLastUse
Mark the MMO of a load as the last use.
Definition SIInstrInfo.h:48
constexpr T reverseBits(T Val)
Reverse the bits in Val.
Definition MathExtras.h:118
bool is_contained(R &&Range, const E &Element)
Returns true if Element is found in Range.
Definition STLExtras.h:1897
constexpr int64_t SignExtend64(uint64_t x)
Sign-extend the number in the bottom B bits of X to a 64-bit integer.
Definition MathExtras.h:572
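A quick illustration of the template form (the field width is arbitrary):
static_assert(SignExtend64<13>(0x1FFF) == -1, "all-ones 13-bit field is -1");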
constexpr T maskTrailingOnes(unsigned N)
Create a bitmask with the N right-most bits set to 1, and all other bits set to 0.
Definition MathExtras.h:77
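For example (illustrative width):
static_assert(maskTrailingOnes<uint32_t>(12) == 0xFFFu, "low 12 bits set");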
LLVM_ABI const Value * getUnderlyingObject(const Value *V, unsigned MaxLookup=MaxLookupSearchDepth)
This method strips off any GEP address adjustments, pointer casts or llvm.threadlocal....
InstructionUniformity
Enum describing how instructions behave with respect to uniformity and divergence,...
Definition Uniformity.h:18
@ AlwaysUniform
The result values are always uniform.
Definition Uniformity.h:23
@ NeverUniform
The result values can never be assumed to be uniform.
Definition Uniformity.h:26
@ Default
The result values are uniform if and only if all operands are uniform.
Definition Uniformity.h:20
GenericCycleInfo< MachineSSAContext > MachineCycleInfo
MachineCycleInfo::CycleT MachineCycle
bool execMayBeModifiedBeforeAnyUse(const MachineRegisterInfo &MRI, Register VReg, const MachineInstr &DefMI)
Return false if EXEC is not changed between the def of VReg at DefMI and all its uses.
void swap(llvm::BitVector &LHS, llvm::BitVector &RHS)
Implement std::swap in terms of BitVector swap.
Definition BitVector.h:869
#define N
Helper struct for the implementation of 3-address conversion to communicate updates made to instructi...
MachineInstr * RemoveMIUse
Other instruction whose def is no longer used by the converted instruction.
This struct is a compact representation of a valid (non-zero power of two) alignment.
Definition Alignment.h:39
constexpr uint64_t value() const
This is a hole in the type system and should not be abused.
Definition Alignment.h:77
SparseBitVector AliveBlocks
AliveBlocks - Set of blocks through which this value is completely alive.
This class contains a discriminated union of information about pointers in memory operands,...
static LLVM_ABI MachinePointerInfo getFixedStack(MachineFunction &MF, int FI, int64_t Offset=0)
Return a MachinePointerInfo record that refers to the specified FrameIndex.
Utility to store a worklist of machine instructions.
Definition SIInstrInfo.h:56
MachineInstr * top() const
Definition SIInstrInfo.h:61
bool isDeferred(MachineInstr *MI)
SetVector< MachineInstr * > & getDeferredList()
Definition SIInstrInfo.h:80
void insert(MachineInstr *MI)
A pair composed of a register and a sub-register index.