LLVM 23.0.0git
SIInstrInfo.cpp
Go to the documentation of this file.
1//===- SIInstrInfo.cpp - SI Instruction Information ----------------------===//
2//
3// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4// See https://llvm.org/LICENSE.txt for license information.
5// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6//
7//===----------------------------------------------------------------------===//
8//
9/// \file
10/// SI Implementation of TargetInstrInfo.
11//
12//===----------------------------------------------------------------------===//
13
14#include "SIInstrInfo.h"
15#include "AMDGPU.h"
16#include "AMDGPUInstrInfo.h"
17#include "AMDGPULaneMaskUtils.h"
18#include "GCNHazardRecognizer.h"
19#include "GCNSubtarget.h"
22#include "llvm/ADT/STLExtras.h"
34#include "llvm/IR/IntrinsicsAMDGPU.h"
35#include "llvm/MC/MCContext.h"
38
39using namespace llvm;
40
41#define DEBUG_TYPE "si-instr-info"
42
43#define GET_INSTRINFO_CTOR_DTOR
44#include "AMDGPUGenInstrInfo.inc"
45
46namespace llvm::AMDGPU {
47#define GET_D16ImageDimIntrinsics_IMPL
48#define GET_ImageDimIntrinsicTable_IMPL
49#define GET_RsrcIntrinsics_IMPL
50#include "AMDGPUGenSearchableTables.inc"
51} // namespace llvm::AMDGPU
52
53// Must be at least 4 to be able to branch over minimum unconditional branch
54// code. This is only for making it possible to write reasonably small tests for
55// long branches.
// NOTE(review): the declaration line that precedes this initializer (storage
// class and option type of BranchOffsetBits) is not present in this listing.
// The option is registered as "amdgpu-s-branch-bits", is hidden, and defaults
// to 16.
57BranchOffsetBits("amdgpu-s-branch-bits", cl::ReallyHidden, cl::init(16),
58 cl::desc("Restrict range of branch instructions (DEBUG)"));
59
// Initializer of the "amdgpu-fix-16-bit-physreg-copies" option (default: true).
// NOTE(review): presumably this is the Fix16BitCopies flag consulted in
// copyPhysReg; its declaration line and closing line are missing from this
// listing - confirm against the real file.
61 "amdgpu-fix-16-bit-physreg-copies",
62 cl::desc("Fix copies between 32 and 16 bit registers by extending to 32 bit"),
63 cl::init(true),
65
// Constructor body. NOTE(review): the signature line is missing from this
// listing; presumably this is SIInstrInfo's constructor taking the subtarget
// ST - confirm.
// Forwards the call-frame setup/destroy opcodes (ADJCALLSTACKUP /
// ADJCALLSTACKDOWN) to the generated base class, initializes the RI and ST
// members from ST, and then initializes the scheduling model from the
// subtarget.
67 : AMDGPUGenInstrInfo(ST, RI, AMDGPU::ADJCALLSTACKUP,
68 AMDGPU::ADJCALLSTACKDOWN),
69 RI(ST), ST(ST) {
70 SchedModel.init(&ST);
71}
72
73//===----------------------------------------------------------------------===//
74// TargetInstrInfo callbacks
75//===----------------------------------------------------------------------===//
76
77static unsigned getNumOperandsNoGlue(SDNode *Node) {
78 unsigned N = Node->getNumOperands();
79 while (N && Node->getOperand(N - 1).getValueType() == MVT::Glue)
80 --N;
81 return N;
82}
83
84/// Returns true if both nodes have the same value for the given
85/// operand \p OpName, or if both nodes do not have this operand.
/// NOTE(review): the first line of the signature is missing from this
/// listing; the body uses two machine nodes N0 and N1, and the helper is
/// called elsewhere in this file as nodesHaveSameOperandValue(Load0, Load1, ...).
87 AMDGPU::OpName OpName) {
88 unsigned Opc0 = N0->getMachineOpcode();
89 unsigned Opc1 = N1->getMachineOpcode();
90
91 int Op0Idx = AMDGPU::getNamedOperandIdx(Opc0, OpName);
92 int Op1Idx = AMDGPU::getNamedOperandIdx(Opc1, OpName);
93
// Neither opcode has the named operand: treated as "same".
94 if (Op0Idx == -1 && Op1Idx == -1)
95 return true;
96
97
// Exactly one of the two opcodes has the named operand: they differ.
98 if ((Op0Idx == -1 && Op1Idx != -1) ||
99 (Op1Idx == -1 && Op0Idx != -1))
100 return false;
101
102 // getNamedOperandIdx returns the index for the MachineInstr's operands,
103 // which includes the result as the first operand. We are indexing into the
104 // MachineSDNode's operands, so we need to skip the result operand to get
105 // the real index.
106 --Op0Idx;
107 --Op1Idx;
108
109 return N0->getOperand(Op0Idx) == N1->getOperand(Op1Idx);
110}
111
// Classifies \p MI as a rematerialization candidate. Used by the
// rematerialization hook below as its first filter.
112static bool canRemat(const MachineInstr &MI) {
113
// NOTE(review): the condition guarding this early return (original lines
// 114-116) is missing from this listing.
117 return true;
118
// SMRD instructions qualify only when they carry at least one memory operand
// and every memory operand is an invariant load.
119 if (SIInstrInfo::isSMRD(MI)) {
120 return !MI.memoperands_empty() &&
121 llvm::all_of(MI.memoperands(), [](const MachineMemOperand *MMO) {
122 return MMO->isLoad() && MMO->isInvariant();
123 });
124 }
125
126 return false;
127}
128
// Rematerialization query for \p MI.
// NOTE(review): the first signature line and the final fallback return
// (original lines 129 and 150) are missing from this listing; presumably the
// fallback defers to the generic TargetInstrInfo implementation - confirm.
130 const MachineInstr &MI) const {
131
132 if (canRemat(MI)) {
133 // Normally VALU use of exec would block the rematerialization, but that
134 // is OK in this case to have an implicit exec read as all VALU do.
135 // We really want all of the generic logic for this except for this.
136
137 // Another potential implicit use is mode register. The core logic of
138 // the RA will not attempt rematerialization if mode is set anywhere
139 // in the function, otherwise it is safe since mode is not changed.
140
141 // There is difference to generic method which does not allow
142 // rematerialization if there are virtual register uses. We allow this,
143 // therefore this method includes SOP instructions as well.
// Accept only when MI has no implicit def, has exactly as many implicit
// operands as its descriptor lists implicit uses, and cannot raise an FP
// exception.
144 if (!MI.hasImplicitDef() &&
145 MI.getNumImplicitOperands() == MI.getDesc().implicit_uses().size() &&
146 !MI.mayRaiseFPException())
147 return true;
148 }
149
151}
152
153// Returns true if the result of a VALU instruction depends on exec.
154bool SIInstrInfo::resultDependsOnExec(const MachineInstr &MI) const {
155 assert(isVALU(MI, /*AllowLDSDMA=*/true));
156
157 // If it is convergent it depends on EXEC.
158 if (MI.isConvergent())
159 return true;
160
161 // If it defines SGPR it depends on EXEC
162 const MachineRegisterInfo &MRI = MI.getMF()->getRegInfo();
163 for (const MachineOperand &Def : MI.defs()) {
164 if (!Def.isReg())
165 continue;
166
167 Register Reg = Def.getReg();
168 if (Reg && RI.isSGPRReg(MRI, Reg))
169 return true;
170 }
171
172 return false;
173}
174
// Reports whether the operand MO is an implicit EXEC use on a VALU
// instruction whose result does not depend on exec (see resultDependsOnExec).
// NOTE(review): the signature line is missing from this listing.
176 // Any implicit use of exec by VALU is not a real register read.
177 return MO.getReg() == AMDGPU::EXEC && MO.isImplicit() &&
178 isVALU(*MO.getParent(), /*AllowLDSDMA=*/true) &&
179 !resultDependsOnExec(*MO.getParent());
180}
181
// Decides whether MI may be sunk into SuccToSinkTo. Returns false when an
// SGPR-class virtual register used by MI is defined inside a cycle that does
// not contain SuccToSinkTo and that cycle (or an enclosing one that also
// excludes SuccToSinkTo) has an exiting block with a divergent branch.
// NOTE(review): the first signature line and the declaration of
// ExitingBlocks (original lines 182 and 205) are missing from this listing.
183 MachineBasicBlock *SuccToSinkTo,
184 MachineCycleInfo *CI) const {
185 // Allow sinking if MI edits lane mask (divergent i1 in sgpr).
186 if (MI.getOpcode() == AMDGPU::SI_IF_BREAK)
187 return true;
188
189 MachineRegisterInfo &MRI = MI.getMF()->getRegInfo();
190 // Check if sinking of MI would create temporal divergent use.
191 for (auto Op : MI.uses()) {
192 if (Op.isReg() && Op.getReg().isVirtual() &&
193 RI.isSGPRClass(MRI.getRegClass(Op.getReg()))) {
194 MachineInstr *SgprDef = MRI.getVRegDef(Op.getReg());
195
196 // SgprDef defined inside cycle
197 MachineCycle *FromCycle = CI->getCycle(SgprDef->getParent());
// A def that is not inside any cycle cannot cause the problem; skip it.
198 if (FromCycle == nullptr)
199 continue;
200
201 MachineCycle *ToCycle = CI->getCycle(SuccToSinkTo);
202 // Check if there is a FromCycle that contains SgprDef's basic block but
203 // does not contain SuccToSinkTo and also has divergent exit condition.
204 while (FromCycle && !FromCycle->contains(ToCycle)) {
206 FromCycle->getExitingBlocks(ExitingBlocks);
207
208 // FromCycle has divergent exit condition.
209 for (MachineBasicBlock *ExitingBlock : ExitingBlocks) {
210 if (hasDivergentBranch(ExitingBlock))
211 return false;
212 }
213
// Walk outwards to the enclosing cycle and repeat the check.
214 FromCycle = FromCycle->getParentCycle();
215 }
216 }
217 }
218
219 return true;
220}
221
// Checks whether the two nodes Load0 and Load1 are machine loads from the same
// base and, if so, writes their immediate offsets to Offset0 / Offset1 and
// returns true. Three families are handled: DS, SMRD, and MUBUF/MTBUF; any
// other combination returns false.
// NOTE(review): the first signature line (original line 222) and the two
// initializer expressions for Load0Offset / Load1Offset (original lines 288
// and 290) are missing from this listing.
223 int64_t &Offset0,
224 int64_t &Offset1) const {
225 if (!Load0->isMachineOpcode() || !Load1->isMachineOpcode())
226 return false;
227
228 unsigned Opc0 = Load0->getMachineOpcode();
229 unsigned Opc1 = Load1->getMachineOpcode();
230
231 // Make sure both are actually loads.
232 if (!get(Opc0).mayLoad() || !get(Opc1).mayLoad())
233 return false;
234
235 // A mayLoad instruction without a def is not a load. Likely a prefetch.
236 if (!get(Opc0).getNumDefs() || !get(Opc1).getNumDefs())
237 return false;
238
239 if (isDS(Opc0) && isDS(Opc1)) {
240
241 // FIXME: Handle this case:
242 if (getNumOperandsNoGlue(Load0) != getNumOperandsNoGlue(Load1))
243 return false;
244
245 // Check base reg.
246 if (Load0->getOperand(0) != Load1->getOperand(0))
247 return false;
248
249 // Skip read2 / write2 variants for simplicity.
250 // TODO: We should report true if the used offsets are adjacent (excluded
251 // st64 versions).
252 int Offset0Idx = AMDGPU::getNamedOperandIdx(Opc0, AMDGPU::OpName::offset);
253 int Offset1Idx = AMDGPU::getNamedOperandIdx(Opc1, AMDGPU::OpName::offset);
254 if (Offset0Idx == -1 || Offset1Idx == -1)
255 return false;
256
257 // XXX - be careful of dataless loads
258 // getNamedOperandIdx returns the index for MachineInstrs. Since they
259 // include the output in the operand list, but SDNodes don't, we need to
260 // subtract the index by one.
// Note: the code subtracts the opcode's NumDefs rather than a literal one.
261 Offset0Idx -= get(Opc0).NumDefs;
262 Offset1Idx -= get(Opc1).NumDefs;
263 Offset0 = Load0->getConstantOperandVal(Offset0Idx);
264 Offset1 = Load1->getConstantOperandVal(Offset1Idx);
265 return true;
266 }
267
268 if (isSMRD(Opc0) && isSMRD(Opc1)) {
269 // Skip time and cache invalidation instructions.
270 if (!AMDGPU::hasNamedOperand(Opc0, AMDGPU::OpName::sbase) ||
271 !AMDGPU::hasNamedOperand(Opc1, AMDGPU::OpName::sbase))
272 return false;
273
274 unsigned NumOps = getNumOperandsNoGlue(Load0);
275 if (NumOps != getNumOperandsNoGlue(Load1))
276 return false;
277
278 // Check base reg.
279 if (Load0->getOperand(0) != Load1->getOperand(0))
280 return false;
281
282 // Match register offsets, if both register and immediate offsets present.
283 assert(NumOps == 4 || NumOps == 5);
284 if (NumOps == 5 && Load0->getOperand(1) != Load1->getOperand(1))
285 return false;
286
287 const ConstantSDNode *Load0Offset =
289 const ConstantSDNode *Load1Offset =
291
// Both offsets must be constants for the comparison to be meaningful.
292 if (!Load0Offset || !Load1Offset)
293 return false;
294
295 Offset0 = Load0Offset->getZExtValue();
296 Offset1 = Load1Offset->getZExtValue();
297 return true;
298 }
299
300 // MUBUF and MTBUF can access the same addresses.
301 if ((isMUBUF(Opc0) || isMTBUF(Opc0)) && (isMUBUF(Opc1) || isMTBUF(Opc1))) {
302
303 // MUBUF and MTBUF have vaddr at different indices.
304 if (!nodesHaveSameOperandValue(Load0, Load1, AMDGPU::OpName::soffset) ||
305 !nodesHaveSameOperandValue(Load0, Load1, AMDGPU::OpName::vaddr) ||
306 !nodesHaveSameOperandValue(Load0, Load1, AMDGPU::OpName::srsrc))
307 return false;
308
309 int OffIdx0 = AMDGPU::getNamedOperandIdx(Opc0, AMDGPU::OpName::offset);
310 int OffIdx1 = AMDGPU::getNamedOperandIdx(Opc1, AMDGPU::OpName::offset);
311
312 if (OffIdx0 == -1 || OffIdx1 == -1)
313 return false;
314
315 // getNamedOperandIdx returns the index for MachineInstrs. Since they
316 // include the output in the operand list, but SDNodes don't, we need to
317 // subtract the index by one.
// Note: as in the DS case, NumDefs is what actually gets subtracted.
318 OffIdx0 -= get(Opc0).NumDefs;
319 OffIdx1 -= get(Opc1).NumDefs;
320
321 SDValue Off0 = Load0->getOperand(OffIdx0);
322 SDValue Off1 = Load1->getOperand(OffIdx1);
323
324 // The offset might be a FrameIndexSDNode.
325 if (!isa<ConstantSDNode>(Off0) || !isa<ConstantSDNode>(Off1))
326 return false;
327
328 Offset0 = Off0->getAsZExtVal();
329 Offset1 = Off1->getAsZExtVal();
330 return true;
331 }
332
333 return false;
334}
335
336static bool isStride64(unsigned Opc) {
337 switch (Opc) {
338 case AMDGPU::DS_READ2ST64_B32:
339 case AMDGPU::DS_READ2ST64_B64:
340 case AMDGPU::DS_WRITE2ST64_B32:
341 case AMDGPU::DS_WRITE2ST64_B64:
342 return true;
343 default:
344 return false;
345 }
346}
347
// Decomposes the memory instruction LdSt into base operands (appended to
// BaseOps), an immediate Offset, and the access Width. OffsetIsScalable is
// always set to false. Returns false when LdSt is not a load/store or when
// the instruction kind / operand layout is not one handled below (DS,
// MUBUF/MTBUF, image, SMRD, FLAT).
// NOTE(review): the first signature lines (original lines 348-349) are
// missing from this listing.
350 int64_t &Offset, bool &OffsetIsScalable, LocationSize &Width,
351 const TargetRegisterInfo *TRI) const {
352 if (!LdSt.mayLoadOrStore())
353 return false;
354
355 unsigned Opc = LdSt.getOpcode();
356 OffsetIsScalable = false;
357 const MachineOperand *BaseOp, *OffsetOp;
358 int DataOpIdx;
359
360 if (isDS(LdSt)) {
361 BaseOp = getNamedOperand(LdSt, AMDGPU::OpName::addr);
362 OffsetOp = getNamedOperand(LdSt, AMDGPU::OpName::offset);
363 if (OffsetOp) {
364 // Normal, single offset LDS instruction.
365 if (!BaseOp) {
366 // DS_CONSUME/DS_APPEND use M0 for the base address.
367 // TODO: find the implicit use operand for M0 and use that as BaseOp?
368 return false;
369 }
370 BaseOps.push_back(BaseOp);
371 Offset = OffsetOp->getImm();
372 // Get appropriate operand, and compute width accordingly.
373 DataOpIdx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::vdst);
374 if (DataOpIdx == -1)
375 DataOpIdx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::data0);
// This opcode gets a fixed width instead of one derived from a data operand.
376 if (Opc == AMDGPU::DS_ATOMIC_ASYNC_BARRIER_ARRIVE_B64)
377 Width = LocationSize::precise(64);
378 else
379 Width = LocationSize::precise(getOpSize(LdSt, DataOpIdx));
380 } else {
381 // The 2 offset instructions use offset0 and offset1 instead. We can treat
382 // these as a load with a single offset if the 2 offsets are consecutive.
383 // We will use this for some partially aligned loads.
384 const MachineOperand *Offset0Op =
385 getNamedOperand(LdSt, AMDGPU::OpName::offset0);
386 const MachineOperand *Offset1Op =
387 getNamedOperand(LdSt, AMDGPU::OpName::offset1);
388
// Only the low 8 bits of each offset immediate are considered.
389 unsigned Offset0 = Offset0Op->getImm() & 0xff;
390 unsigned Offset1 = Offset1Op->getImm() & 0xff;
391 if (Offset0 + 1 != Offset1)
392 return false;
393
394 // Each of these offsets is in element sized units, so we need to convert
395 // to bytes of the individual reads.
396
397 unsigned EltSize;
// NOTE(review): loads divide operand 0's register size in bits by 16 while
// stores divide data0's by 8 - presumably because the load destination holds
// both elements; confirm.
398 if (LdSt.mayLoad())
399 EltSize = TRI->getRegSizeInBits(*getOpRegClass(LdSt, 0)) / 16;
400 else {
401 assert(LdSt.mayStore());
402 int Data0Idx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::data0);
403 EltSize = TRI->getRegSizeInBits(*getOpRegClass(LdSt, Data0Idx)) / 8;
404 }
405
406 if (isStride64(Opc))
407 EltSize *= 64;
408
409 BaseOps.push_back(BaseOp);
410 Offset = EltSize * Offset0;
411 // Get appropriate operand(s), and compute width accordingly.
412 DataOpIdx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::vdst);
413 if (DataOpIdx == -1) {
// No vdst: the width is the sum of the data0 and data1 operand sizes.
414 DataOpIdx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::data0);
415 Width = LocationSize::precise(getOpSize(LdSt, DataOpIdx));
416 DataOpIdx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::data1);
417 Width = LocationSize::precise(
418 Width.getValue() + TypeSize::getFixed(getOpSize(LdSt, DataOpIdx)));
419 } else {
420 Width = LocationSize::precise(getOpSize(LdSt, DataOpIdx));
421 }
422 }
423 return true;
424 }
425
426 if (isMUBUF(LdSt) || isMTBUF(LdSt)) {
427 const MachineOperand *RSrc = getNamedOperand(LdSt, AMDGPU::OpName::srsrc);
428 if (!RSrc) // e.g. BUFFER_WBINVL1_VOL
429 return false;
430 BaseOps.push_back(RSrc);
431 BaseOp = getNamedOperand(LdSt, AMDGPU::OpName::vaddr);
// A frame-index vaddr is not recorded as a base operand.
432 if (BaseOp && !BaseOp->isFI())
433 BaseOps.push_back(BaseOp);
434 const MachineOperand *OffsetImm =
435 getNamedOperand(LdSt, AMDGPU::OpName::offset);
436 Offset = OffsetImm->getImm();
437 const MachineOperand *SOffset =
438 getNamedOperand(LdSt, AMDGPU::OpName::soffset);
// A register soffset becomes another base operand; an immediate soffset is
// folded into Offset.
439 if (SOffset) {
440 if (SOffset->isReg())
441 BaseOps.push_back(SOffset);
442 else
443 Offset += SOffset->getImm();
444 }
445 // Get appropriate operand, and compute width accordingly.
446 DataOpIdx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::vdst);
447 if (DataOpIdx == -1)
448 DataOpIdx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::vdata);
449 if (DataOpIdx == -1) // LDS DMA
450 return false;
451 Width = LocationSize::precise(getOpSize(LdSt, DataOpIdx));
452 return true;
453 }
454
455 if (isImage(LdSt)) {
456 auto RsrcOpName =
457 isMIMG(LdSt) ? AMDGPU::OpName::srsrc : AMDGPU::OpName::rsrc;
458 int SRsrcIdx = AMDGPU::getNamedOperandIdx(Opc, RsrcOpName);
459 BaseOps.push_back(&LdSt.getOperand(SRsrcIdx));
460 int VAddr0Idx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::vaddr0);
461 if (VAddr0Idx >= 0) {
462 // GFX10 possible NSA encoding.
// Every operand from vaddr0 up to (not including) the resource operand is
// recorded as a base operand.
463 for (int I = VAddr0Idx; I < SRsrcIdx; ++I)
464 BaseOps.push_back(&LdSt.getOperand(I));
465 } else {
466 BaseOps.push_back(getNamedOperand(LdSt, AMDGPU::OpName::vaddr));
467 }
468 Offset = 0;
469 // Get appropriate operand, and compute width accordingly.
470 DataOpIdx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::vdata);
471 if (DataOpIdx == -1)
472 return false; // no return sampler
473 Width = LocationSize::precise(getOpSize(LdSt, DataOpIdx));
474 return true;
475 }
476
477 if (isSMRD(LdSt)) {
478 BaseOp = getNamedOperand(LdSt, AMDGPU::OpName::sbase);
479 if (!BaseOp) // e.g. S_MEMTIME
480 return false;
481 BaseOps.push_back(BaseOp);
482 OffsetOp = getNamedOperand(LdSt, AMDGPU::OpName::offset);
// A missing offset operand is treated as offset zero.
483 Offset = OffsetOp ? OffsetOp->getImm() : 0;
484 // Get appropriate operand, and compute width accordingly.
485 DataOpIdx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::sdst);
486 if (DataOpIdx == -1)
487 return false;
488 Width = LocationSize::precise(getOpSize(LdSt, DataOpIdx));
489 return true;
490 }
491
492 if (isFLAT(LdSt)) {
493 // Instructions have either vaddr or saddr or both or none.
494 BaseOp = getNamedOperand(LdSt, AMDGPU::OpName::vaddr);
495 if (BaseOp)
496 BaseOps.push_back(BaseOp);
497 BaseOp = getNamedOperand(LdSt, AMDGPU::OpName::saddr);
498 if (BaseOp)
499 BaseOps.push_back(BaseOp);
500 Offset = getNamedOperand(LdSt, AMDGPU::OpName::offset)->getImm();
501 // Get appropriate operand, and compute width accordingly.
502 DataOpIdx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::vdst);
503 if (DataOpIdx == -1)
504 DataOpIdx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::vdata);
505 if (DataOpIdx == -1) // LDS DMA
506 return false;
507 Width = LocationSize::precise(getOpSize(LdSt, DataOpIdx));
508 return true;
509 }
510
511 return false;
512}
513
// Returns true if the two memory instructions appear to share a base pointer:
// either their first base operands are identical, or each has exactly one
// memory operand, both in the same address space, whose IR values resolve to
// the same non-undef underlying object.
// NOTE(review): the BaseOps1 / BaseOps2 parameter lines (original lines 515
// and 517) are missing from this listing.
514static bool memOpsHaveSameBasePtr(const MachineInstr &MI1,
516 const MachineInstr &MI2,
518 // Only examine the first "base" operand of each instruction, on the
519 // assumption that it represents the real base address of the memory access.
520 // Other operands are typically offsets or indices from this base address.
521 if (BaseOps1.front()->isIdenticalTo(*BaseOps2.front()))
522 return true;
523
// The fallback below needs exactly one memory operand on each instruction.
524 if (!MI1.hasOneMemOperand() || !MI2.hasOneMemOperand())
525 return false;
526
527 auto *MO1 = *MI1.memoperands_begin();
528 auto *MO2 = *MI2.memoperands_begin();
529 if (MO1->getAddrSpace() != MO2->getAddrSpace())
530 return false;
531
532 const auto *Base1 = MO1->getValue();
533 const auto *Base2 = MO2->getValue();
534 if (!Base1 || !Base2)
535 return false;
536 Base1 = getUnderlyingObject(Base1);
537 Base2 = getUnderlyingObject(Base2);
538
// Undef underlying objects are never considered the same base.
539 if (isa<UndefValue>(Base1) || isa<UndefValue>(Base2))
540 return false;
541
542 return Base1 == Base2;
543}
544
// Clustering heuristic for two memory operations described by their base
// operands. Returns false when the base pointers differ (or only one side has
// base operands); otherwise returns whether the estimated number of DWORDs
// for ClusterSize operations totalling NumBytes stays within the limit.
// NOTE(review): the signature lines carrying the method name and the
// BaseOps1 / BaseOps2 parameters (original lines 545 and 547) are missing
// from this listing.
546 int64_t Offset1, bool OffsetIsScalable1,
548 int64_t Offset2, bool OffsetIsScalable2,
549 unsigned ClusterSize,
550 unsigned NumBytes) const {
551 // If the mem ops (to be clustered) do not have the same base ptr, then they
552 // should not be clustered
553 unsigned MaxMemoryClusterDWords = DefaultMemoryClusterDWordsLimit;
554 if (!BaseOps1.empty() && !BaseOps2.empty()) {
555 const MachineInstr &FirstLdSt = *BaseOps1.front()->getParent();
556 const MachineInstr &SecondLdSt = *BaseOps2.front()->getParent();
557 if (!memOpsHaveSameBasePtr(FirstLdSt, BaseOps1, SecondLdSt, BaseOps2))
558 return false;
559
// With base operands available, the per-function limit replaces the default.
560 const SIMachineFunctionInfo *MFI =
561 FirstLdSt.getMF()->getInfo<SIMachineFunctionInfo>();
562 MaxMemoryClusterDWords = MFI->getMaxMemoryClusterDWords();
563 } else if (!BaseOps1.empty() || !BaseOps2.empty()) {
564 // If only one base op is empty, they do not have the same base ptr
565 return false;
566 }
567
568 // In order to avoid register pressure, on an average, the number of DWORDS
569 // loaded together by all clustered mem ops should not exceed
570 // MaxMemoryClusterDWords. This is an empirical value based on certain
571 // observations and performance related experiments.
572 // The good thing about this heuristic is - it avoids clustering of too many
573 // sub-word loads, and also avoids clustering of wide loads. Below is the
574 // brief summary of how the heuristic behaves for various `LoadSize` when
575 // MaxMemoryClusterDWords is 8.
576 //
577 // (1) 1 <= LoadSize <= 4: cluster at max 8 mem ops
578 // (2) 5 <= LoadSize <= 8: cluster at max 4 mem ops
579 // (3) 9 <= LoadSize <= 12: cluster at max 2 mem ops
580 // (4) 13 <= LoadSize <= 16: cluster at max 2 mem ops
581 // (5) LoadSize >= 17: do not cluster
// LoadSize is the average bytes per op (integer division); each op is then
// rounded up to whole DWORDs before multiplying by the cluster size.
582 const unsigned LoadSize = NumBytes / ClusterSize;
583 const unsigned NumDWords = ((LoadSize + 3) / 4) * ClusterSize;
584 return NumDWords <= MaxMemoryClusterDWords;
585}
586
587// FIXME: This behaves strangely. If, for example, you have 32 load + stores,
588// the first 16 loads will be interleaved with the stores, and the next 16 will
589// be clustered as expected. It should really split into 2 16 store batches.
590//
591// Loads are clustered until this returns false, rather than trying to schedule
592// groups of stores. This also means we have to deal with saying different
593// address space loads should be clustered, and ones which might cause bank
594// conflicts.
595//
596// This might be deprecated so it might not be worth that much effort to fix.
// NOTE(review): the first signature line (original line 597) is missing from
// this listing. The visible body requires Offset1 > Offset0 and returns true
// when NumLoads is at most 16 and the offsets are less than 64 apart.
598 int64_t Offset0, int64_t Offset1,
599 unsigned NumLoads) const {
600 assert(Offset1 > Offset0 &&
601 "Second offset should be larger than first offset!");
602 // If we have less than 16 loads in a row, and the offsets are within 64
603 // bytes, then schedule together.
604
605 // A cacheline is 64 bytes (for global memory).
606 return (NumLoads <= 16 && (Offset1 - Offset0) < 64);
607}
608
// Emits an error diagnostic (default text "illegal VGPR to SGPR copy") for an
// unsupported copy and inserts an SI_ILLEGAL_COPY pseudo from SrcReg to
// DestReg before MI so that a machine instruction still exists for the copy.
// NOTE(review): the leading signature lines and the line that declares C
// (original lines 609-610 and 616) are missing from this listing; callers
// invoke it as reportIllegalCopy(this, MBB, MI, DL, DestReg, SrcReg, KillSrc).
611 const DebugLoc &DL, MCRegister DestReg,
612 MCRegister SrcReg, bool KillSrc,
613 const char *Msg = "illegal VGPR to SGPR copy") {
614 MachineFunction *MF = MBB.getParent();
615
617 C.diagnose(DiagnosticInfoUnsupported(MF->getFunction(), Msg, DL, DS_Error));
618
619 BuildMI(MBB, MI, DL, TII->get(AMDGPU::SI_ILLEGAL_COPY), DestReg)
620 .addReg(SrcReg, getKillRegState(KillSrc));
621}
622
623/// Handle copying from SGPR to AGPR, or from AGPR to AGPR on GFX908. It is not
624/// possible to have a direct copy in these cases on GFX908, so an intermediate
625/// VGPR copy is required.
/// When the registers do not overlap, an earlier V_ACCVGPR_WRITE that defined
/// \p SrcReg is reused if its source is still intact; otherwise the value is
/// moved through a temporary VGPR. \p ImpDefSuperReg and \p ImpUseSuperReg,
/// when set, are attached as implicit operands to the emitted instructions.
/// NOTE(review): the leading signature lines (original lines 626-628) and the
/// continuation lines of the two addReg(ImpUseSuperReg, ...) calls (original
/// lines 688 and 735) are missing from this listing.
629 const DebugLoc &DL, MCRegister DestReg,
630 MCRegister SrcReg, bool KillSrc,
631 RegScavenger &RS, bool RegsOverlap,
632 Register ImpDefSuperReg = Register(),
633 Register ImpUseSuperReg = Register()) {
634 assert((TII.getSubtarget().hasMAIInsts() &&
635 !TII.getSubtarget().hasGFX90AInsts()) &&
636 "Expected GFX908 subtarget.");
637
638 assert((AMDGPU::SReg_32RegClass.contains(SrcReg) ||
639 AMDGPU::AGPR_32RegClass.contains(SrcReg)) &&
640 "Source register of the copy should be either an SGPR or an AGPR.");
641
642 assert(AMDGPU::AGPR_32RegClass.contains(DestReg) &&
643 "Destination register of the copy should be an AGPR.");
644
645 const SIRegisterInfo &RI = TII.getRegisterInfo();
646
647 // First try to find defining accvgpr_write to avoid temporary registers.
648 // In the case of copies of overlapping AGPRs, we conservatively do not
649 // reuse previous accvgpr_writes. Otherwise, we may incorrectly pick up
650 // an accvgpr_write used for this same copy due to implicit-defs
651 if (!RegsOverlap) {
// Scan backwards from MI to the start of the block for the instruction that
// last modified SrcReg.
652 for (auto Def = MI, E = MBB.begin(); Def != E; ) {
653 --Def;
654
655 if (!Def->modifiesRegister(SrcReg, &RI))
656 continue;
657
// The defining instruction must be an accvgpr_write whose destination is
// exactly SrcReg; anything else ends the search.
658 if (Def->getOpcode() != AMDGPU::V_ACCVGPR_WRITE_B32_e64 ||
659 Def->getOperand(0).getReg() != SrcReg)
660 break;
661
662 MachineOperand &DefOp = Def->getOperand(1);
663 assert(DefOp.isReg() || DefOp.isImm());
664
665 if (DefOp.isReg()) {
666 bool SafeToPropagate = true;
667 // Check that register source operand is not clobbered before MI.
668 // Immediate operands are always safe to propagate.
669 for (auto I = Def; I != MI && SafeToPropagate; ++I)
670 if (I->modifiesRegister(DefOp.getReg(), &RI))
671 SafeToPropagate = false;
672
673 if (!SafeToPropagate)
674 break;
675
// The source register gains a new use at MI, so kill flags on it between Def
// and MI are cleared.
676 for (auto I = Def; I != MI; ++I)
677 I->clearRegisterKills(DefOp.getReg(), &RI);
678 }
679
680 MachineInstrBuilder Builder =
681 BuildMI(MBB, MI, DL, TII.get(AMDGPU::V_ACCVGPR_WRITE_B32_e64), DestReg)
682 .add(DefOp);
683 if (ImpDefSuperReg)
684 Builder.addReg(ImpDefSuperReg, RegState::Define | RegState::Implicit);
685
686 if (ImpUseSuperReg) {
687 Builder.addReg(ImpUseSuperReg,
689 }
690
691 return;
692 }
693 }
694
// Slow path: set the scavenger up to look backwards from just after MI.
695 RS.enterBasicBlockEnd(MBB);
696 RS.backward(std::next(MI));
697
698 // Ideally we want to have three registers for a long reg_sequence copy
699 // to hide 2 waitstates between v_mov_b32 and accvgpr_write.
700 unsigned MaxVGPRs = RI.getRegPressureLimit(&AMDGPU::VGPR_32RegClass,
701 *MBB.getParent());
702
703 // Registers in the sequence are allocated contiguously so we can just
704 // use register number to pick one of three round-robin temps.
705 unsigned RegNo = (DestReg - AMDGPU::AGPR0) % 3;
// Default temporary: the VGPR the function info provides for AGPR copies.
706 Register Tmp =
707 MBB.getParent()->getInfo<SIMachineFunctionInfo>()->getVGPRForAGPRCopy();
708 assert(MBB.getParent()->getRegInfo().isReserved(Tmp) &&
709 "VGPR used for an intermediate copy should have been reserved.");
710
711 // Only loop through if there are any free registers left. We don't want to
712 // spill.
713 while (RegNo--) {
714 Register Tmp2 = RS.scavengeRegisterBackwards(AMDGPU::VGPR_32RegClass, MI,
715 /* RestoreAfter */ false, 0,
716 /* AllowSpill */ false);
717 if (!Tmp2 || RI.getHWRegIndex(Tmp2) >= MaxVGPRs)
718 break;
719 Tmp = Tmp2;
720 RS.setRegUsed(Tmp);
721 }
722
723 // Insert copy to temporary VGPR.
724 unsigned TmpCopyOp = AMDGPU::V_MOV_B32_e32;
725 if (AMDGPU::AGPR_32RegClass.contains(SrcReg)) {
726 TmpCopyOp = AMDGPU::V_ACCVGPR_READ_B32_e64;
727 } else {
728 assert(AMDGPU::SReg_32RegClass.contains(SrcReg));
729 }
730
731 MachineInstrBuilder UseBuilder = BuildMI(MBB, MI, DL, TII.get(TmpCopyOp), Tmp)
732 .addReg(SrcReg, getKillRegState(KillSrc));
733 if (ImpUseSuperReg) {
734 UseBuilder.addReg(ImpUseSuperReg,
736 }
737
// Second half of the indirect copy: temporary VGPR -> destination AGPR.
738 MachineInstrBuilder DefBuilder
739 = BuildMI(MBB, MI, DL, TII.get(AMDGPU::V_ACCVGPR_WRITE_B32_e64), DestReg)
740 .addReg(Tmp, RegState::Kill);
741
742 if (ImpDefSuperReg)
743 DefBuilder.addReg(ImpDefSuperReg, RegState::Define | RegState::Implicit);
744}
745
// Expands a copy between SGPR tuples of class RC into a sequence of S_MOV_B32
// instructions, using S_MOV_B64 for a pair when both sub-registers are
// even-aligned and another 32-bit part follows. The first emitted instruction
// gets an implicit def of DestReg; the last one gets the kill of SrcReg when
// KillSrc is set. With Forward == false the insertion point is stepped back
// after every emitted instruction.
// NOTE(review): the leading signature lines (original lines 746-747) and
// original line 752 are missing from this listing.
748 MCRegister DestReg, MCRegister SrcReg, bool KillSrc,
749 const TargetRegisterClass *RC, bool Forward) {
750 const SIRegisterInfo &RI = TII.getRegisterInfo();
751 ArrayRef<int16_t> BaseIndices = RI.getRegSplitParts(RC, 4);
753 MachineInstr *FirstMI = nullptr, *LastMI = nullptr;
754
755 for (unsigned Idx = 0; Idx < BaseIndices.size(); ++Idx) {
756 int16_t SubIdx = BaseIndices[Idx];
757 Register DestSubReg = RI.getSubReg(DestReg, SubIdx);
758 Register SrcSubReg = RI.getSubReg(SrcReg, SubIdx);
759 assert(DestSubReg && SrcSubReg && "Failed to find subregs!");
760 unsigned Opcode = AMDGPU::S_MOV_B32;
761
762 // Is SGPR aligned? If so try to combine with next.
763 bool AlignedDest = ((DestSubReg - AMDGPU::SGPR0) % 2) == 0;
764 bool AlignedSrc = ((SrcSubReg - AMDGPU::SGPR0) % 2) == 0;
765 if (AlignedDest && AlignedSrc && (Idx + 1 < BaseIndices.size())) {
766 // Can use SGPR64 copy
767 unsigned Channel = RI.getChannelFromSubReg(SubIdx);
768 SubIdx = RI.getSubRegFromChannel(Channel, 2);
769 DestSubReg = RI.getSubReg(DestReg, SubIdx);
770 SrcSubReg = RI.getSubReg(SrcReg, SubIdx);
771 assert(DestSubReg && SrcSubReg && "Failed to find subregs!");
772 Opcode = AMDGPU::S_MOV_B64;
// The 64-bit move consumes the next 32-bit part as well, so skip it.
773 Idx++;
774 }
775
776 LastMI = BuildMI(MBB, I, DL, TII.get(Opcode), DestSubReg)
777 .addReg(SrcSubReg)
778 .addReg(SrcReg, RegState::Implicit);
779
780 if (!FirstMI)
781 FirstMI = LastMI;
782
783 if (!Forward)
784 I--;
785 }
786
787 assert(FirstMI && LastMI);
// When not copying forward, the roles of first and last emitted instruction
// are exchanged before the implicit def and kill are attached.
788 if (!Forward)
789 std::swap(FirstMI, LastMI);
790
791 FirstMI->addOperand(
792 MachineOperand::CreateReg(DestReg, true /*IsDef*/, true /*IsImp*/));
793
794 if (KillSrc)
795 LastMI->addRegisterKilled(SrcReg, &RI);
796}
797
800 const DebugLoc &DL, Register DestReg,
801 Register SrcReg, bool KillSrc, bool RenamableDest,
802 bool RenamableSrc) const {
803 const TargetRegisterClass *RC = RI.getPhysRegBaseClass(DestReg);
804 unsigned Size = RI.getRegSizeInBits(*RC);
805 const TargetRegisterClass *SrcRC = RI.getPhysRegBaseClass(SrcReg);
806 unsigned SrcSize = RI.getRegSizeInBits(*SrcRC);
807
808 // The rest of copyPhysReg assumes Src and Dst size are the same size.
809 // TODO-GFX11_16BIT If all true 16 bit instruction patterns are completed can
810 // we remove Fix16BitCopies and this code block?
811 if (Fix16BitCopies) {
812 if (((Size == 16) != (SrcSize == 16))) {
813 // Non-VGPR Src and Dst will later be expanded back to 32 bits.
814 assert(ST.useRealTrue16Insts());
815 Register &RegToFix = (Size == 32) ? DestReg : SrcReg;
816 MCRegister SubReg = RI.getSubReg(RegToFix, AMDGPU::lo16);
817 RegToFix = SubReg;
818
819 if (DestReg == SrcReg) {
820 // Identity copy. Insert empty bundle since ExpandPostRA expects an
821 // instruction here.
822 BuildMI(MBB, MI, DL, get(AMDGPU::BUNDLE));
823 return;
824 }
825 RC = RI.getPhysRegBaseClass(DestReg);
826 Size = RI.getRegSizeInBits(*RC);
827 SrcRC = RI.getPhysRegBaseClass(SrcReg);
828 SrcSize = RI.getRegSizeInBits(*SrcRC);
829 }
830 }
831
832 if (RC == &AMDGPU::VGPR_32RegClass) {
833 assert(AMDGPU::VGPR_32RegClass.contains(SrcReg) ||
834 AMDGPU::SReg_32RegClass.contains(SrcReg) ||
835 AMDGPU::AGPR_32RegClass.contains(SrcReg));
836 unsigned Opc = AMDGPU::AGPR_32RegClass.contains(SrcReg) ?
837 AMDGPU::V_ACCVGPR_READ_B32_e64 : AMDGPU::V_MOV_B32_e32;
838 BuildMI(MBB, MI, DL, get(Opc), DestReg)
839 .addReg(SrcReg, getKillRegState(KillSrc));
840 return;
841 }
842
843 if (RC == &AMDGPU::SReg_32_XM0RegClass ||
844 RC == &AMDGPU::SReg_32RegClass) {
845 if (SrcReg == AMDGPU::SCC) {
846 BuildMI(MBB, MI, DL, get(AMDGPU::S_CSELECT_B32), DestReg)
847 .addImm(1)
848 .addImm(0);
849 return;
850 }
851
852 if (!AMDGPU::SReg_32RegClass.contains(SrcReg)) {
853 if (DestReg == AMDGPU::VCC_LO) {
854 // FIXME: Hack until VReg_1 removed.
855 assert(AMDGPU::VGPR_32RegClass.contains(SrcReg));
856 BuildMI(MBB, MI, DL, get(AMDGPU::V_CMP_NE_U32_e32))
857 .addImm(0)
858 .addReg(SrcReg, getKillRegState(KillSrc));
859 return;
860 }
861
862 reportIllegalCopy(this, MBB, MI, DL, DestReg, SrcReg, KillSrc);
863 return;
864 }
865
866 BuildMI(MBB, MI, DL, get(AMDGPU::S_MOV_B32), DestReg)
867 .addReg(SrcReg, getKillRegState(KillSrc));
868 return;
869 }
870
871 if (RC == &AMDGPU::SReg_64RegClass) {
872 if (SrcReg == AMDGPU::SCC) {
873 BuildMI(MBB, MI, DL, get(AMDGPU::S_CSELECT_B64), DestReg)
874 .addImm(1)
875 .addImm(0);
876 return;
877 }
878
879 if (!AMDGPU::SReg_64_EncodableRegClass.contains(SrcReg)) {
880 if (DestReg == AMDGPU::VCC) {
881 // FIXME: Hack until VReg_1 removed.
882 assert(AMDGPU::VGPR_32RegClass.contains(SrcReg));
883 BuildMI(MBB, MI, DL, get(AMDGPU::V_CMP_NE_U32_e32))
884 .addImm(0)
885 .addReg(SrcReg, getKillRegState(KillSrc));
886 return;
887 }
888
889 reportIllegalCopy(this, MBB, MI, DL, DestReg, SrcReg, KillSrc);
890 return;
891 }
892
893 BuildMI(MBB, MI, DL, get(AMDGPU::S_MOV_B64), DestReg)
894 .addReg(SrcReg, getKillRegState(KillSrc));
895 return;
896 }
897
898 if (DestReg == AMDGPU::SCC) {
899 // Copying 64-bit or 32-bit sources to SCC barely makes sense,
900 // but SelectionDAG emits such copies for i1 sources.
901 if (AMDGPU::SReg_64RegClass.contains(SrcReg)) {
902 // This copy can only be produced by patterns
903 // with explicit SCC, which are known to be enabled
904 // only for subtargets with S_CMP_LG_U64 present.
905 assert(ST.hasScalarCompareEq64());
906 BuildMI(MBB, MI, DL, get(AMDGPU::S_CMP_LG_U64))
907 .addReg(SrcReg, getKillRegState(KillSrc))
908 .addImm(0);
909 } else {
910 assert(AMDGPU::SReg_32RegClass.contains(SrcReg));
911 BuildMI(MBB, MI, DL, get(AMDGPU::S_CMP_LG_U32))
912 .addReg(SrcReg, getKillRegState(KillSrc))
913 .addImm(0);
914 }
915
916 return;
917 }
918
919 if (RC == &AMDGPU::AGPR_32RegClass) {
920 if (AMDGPU::VGPR_32RegClass.contains(SrcReg) ||
921 (ST.hasGFX90AInsts() && AMDGPU::SReg_32RegClass.contains(SrcReg))) {
922 BuildMI(MBB, MI, DL, get(AMDGPU::V_ACCVGPR_WRITE_B32_e64), DestReg)
923 .addReg(SrcReg, getKillRegState(KillSrc));
924 return;
925 }
926
927 if (AMDGPU::AGPR_32RegClass.contains(SrcReg) && ST.hasGFX90AInsts()) {
928 BuildMI(MBB, MI, DL, get(AMDGPU::V_ACCVGPR_MOV_B32), DestReg)
929 .addReg(SrcReg, getKillRegState(KillSrc));
930 return;
931 }
932
933 // FIXME: Pass should maintain scavenger to avoid scan through the block on
934 // every AGPR spill.
935 RegScavenger RS;
936 const bool Overlap = RI.regsOverlap(SrcReg, DestReg);
937 indirectCopyToAGPR(*this, MBB, MI, DL, DestReg, SrcReg, KillSrc, RS, Overlap);
938 return;
939 }
940
941 if (Size == 16) {
942 assert(AMDGPU::VGPR_16RegClass.contains(SrcReg) ||
943 AMDGPU::SReg_LO16RegClass.contains(SrcReg) ||
944 AMDGPU::AGPR_LO16RegClass.contains(SrcReg));
945
946 bool IsSGPRDst = AMDGPU::SReg_LO16RegClass.contains(DestReg);
947 bool IsSGPRSrc = AMDGPU::SReg_LO16RegClass.contains(SrcReg);
948 bool IsAGPRDst = AMDGPU::AGPR_LO16RegClass.contains(DestReg);
949 bool IsAGPRSrc = AMDGPU::AGPR_LO16RegClass.contains(SrcReg);
950 bool DstLow = !AMDGPU::isHi16Reg(DestReg, RI);
951 bool SrcLow = !AMDGPU::isHi16Reg(SrcReg, RI);
952 MCRegister NewDestReg = RI.get32BitRegister(DestReg);
953 MCRegister NewSrcReg = RI.get32BitRegister(SrcReg);
954
955 if (IsSGPRDst) {
956 if (!IsSGPRSrc) {
957 reportIllegalCopy(this, MBB, MI, DL, DestReg, SrcReg, KillSrc);
958 return;
959 }
960
961 BuildMI(MBB, MI, DL, get(AMDGPU::S_MOV_B32), NewDestReg)
962 .addReg(NewSrcReg, getKillRegState(KillSrc));
963 return;
964 }
965
966 if (IsAGPRDst || IsAGPRSrc) {
967 if (!DstLow || !SrcLow) {
968 reportIllegalCopy(this, MBB, MI, DL, DestReg, SrcReg, KillSrc,
969 "Cannot use hi16 subreg with an AGPR!");
970 }
971
972 copyPhysReg(MBB, MI, DL, NewDestReg, NewSrcReg, KillSrc);
973 return;
974 }
975
976 if (ST.useRealTrue16Insts()) {
977 if (IsSGPRSrc) {
978 assert(SrcLow);
979 SrcReg = NewSrcReg;
980 }
981 // Use the smaller instruction encoding if possible.
982 if (AMDGPU::VGPR_16_Lo128RegClass.contains(DestReg) &&
983 (IsSGPRSrc || AMDGPU::VGPR_16_Lo128RegClass.contains(SrcReg))) {
984 BuildMI(MBB, MI, DL, get(AMDGPU::V_MOV_B16_t16_e32), DestReg)
985 .addReg(SrcReg);
986 } else {
987 BuildMI(MBB, MI, DL, get(AMDGPU::V_MOV_B16_t16_e64), DestReg)
988 .addImm(0) // src0_modifiers
989 .addReg(SrcReg)
990 .addImm(0); // op_sel
991 }
992 return;
993 }
994
995 if (IsSGPRSrc && !ST.hasSDWAScalar()) {
996 if (!DstLow || !SrcLow) {
997 reportIllegalCopy(this, MBB, MI, DL, DestReg, SrcReg, KillSrc,
998 "Cannot use hi16 subreg on VI!");
999 }
1000
1001 BuildMI(MBB, MI, DL, get(AMDGPU::V_MOV_B32_e32), NewDestReg)
1002 .addReg(NewSrcReg, getKillRegState(KillSrc));
1003 return;
1004 }
1005
1006 auto MIB = BuildMI(MBB, MI, DL, get(AMDGPU::V_MOV_B32_sdwa), NewDestReg)
1007 .addImm(0) // src0_modifiers
1008 .addReg(NewSrcReg)
1009 .addImm(0) // clamp
1016 // First implicit operand is $exec.
1017 MIB->tieOperands(0, MIB->getNumOperands() - 1);
1018 return;
1019 }
1020
1021 if (RC == RI.getVGPR64Class() && (SrcRC == RC || RI.isSGPRClass(SrcRC))) {
1022 if (ST.hasVMovB64Inst()) {
1023 BuildMI(MBB, MI, DL, get(AMDGPU::V_MOV_B64_e32), DestReg)
1024 .addReg(SrcReg, getKillRegState(KillSrc));
1025 return;
1026 }
1027 if (ST.hasPkMovB32()) {
1028 BuildMI(MBB, MI, DL, get(AMDGPU::V_PK_MOV_B32), DestReg)
1030 .addReg(SrcReg)
1032 .addReg(SrcReg)
1033 .addImm(0) // op_sel_lo
1034 .addImm(0) // op_sel_hi
1035 .addImm(0) // neg_lo
1036 .addImm(0) // neg_hi
1037 .addImm(0) // clamp
1038 .addReg(SrcReg, getKillRegState(KillSrc) | RegState::Implicit);
1039 return;
1040 }
1041 }
1042
1043 const bool Forward = RI.getHWRegIndex(DestReg) <= RI.getHWRegIndex(SrcReg);
1044 if (RI.isSGPRClass(RC)) {
1045 if (!RI.isSGPRClass(SrcRC)) {
1046 reportIllegalCopy(this, MBB, MI, DL, DestReg, SrcReg, KillSrc);
1047 return;
1048 }
1049 const bool CanKillSuperReg = KillSrc && !RI.regsOverlap(SrcReg, DestReg);
1050 expandSGPRCopy(*this, MBB, MI, DL, DestReg, SrcReg, CanKillSuperReg, RC,
1051 Forward);
1052 return;
1053 }
1054
1055 unsigned EltSize = 4;
1056 unsigned Opcode = AMDGPU::V_MOV_B32_e32;
1057 if (RI.isAGPRClass(RC)) {
1058 if (ST.hasGFX90AInsts() && RI.isAGPRClass(SrcRC))
1059 Opcode = AMDGPU::V_ACCVGPR_MOV_B32;
1060 else if (RI.hasVGPRs(SrcRC) ||
1061 (ST.hasGFX90AInsts() && RI.isSGPRClass(SrcRC)))
1062 Opcode = AMDGPU::V_ACCVGPR_WRITE_B32_e64;
1063 else
1064 Opcode = AMDGPU::INSTRUCTION_LIST_END;
1065 } else if (RI.hasVGPRs(RC) && RI.isAGPRClass(SrcRC)) {
1066 Opcode = AMDGPU::V_ACCVGPR_READ_B32_e64;
1067 } else if ((Size % 64 == 0) && RI.hasVGPRs(RC) &&
1068 (RI.isProperlyAlignedRC(*RC) &&
1069 (SrcRC == RC || RI.isSGPRClass(SrcRC)))) {
1070 // TODO: In 96-bit case, could do a 64-bit mov and then a 32-bit mov.
1071 if (ST.hasVMovB64Inst()) {
1072 Opcode = AMDGPU::V_MOV_B64_e32;
1073 EltSize = 8;
1074 } else if (ST.hasPkMovB32()) {
1075 Opcode = AMDGPU::V_PK_MOV_B32;
1076 EltSize = 8;
1077 }
1078 }
1079
1080 // For the cases where we need an intermediate instruction/temporary register
1081 // (destination is an AGPR), we need a scavenger.
1082 //
1083 // FIXME: The pass should maintain this for us so we don't have to re-scan the
1084 // whole block for every handled copy.
1085 std::unique_ptr<RegScavenger> RS;
1086 if (Opcode == AMDGPU::INSTRUCTION_LIST_END)
1087 RS = std::make_unique<RegScavenger>();
1088
1089 ArrayRef<int16_t> SubIndices = RI.getRegSplitParts(RC, EltSize);
1090
1091 // If there is an overlap, we can't kill the super-register on the last
1092 // instruction, since it will also kill the components made live by this def.
1093 const bool Overlap = RI.regsOverlap(SrcReg, DestReg);
1094 const bool CanKillSuperReg = KillSrc && !Overlap;
1095
1096 for (unsigned Idx = 0; Idx < SubIndices.size(); ++Idx) {
1097 unsigned SubIdx;
1098 if (Forward)
1099 SubIdx = SubIndices[Idx];
1100 else
1101 SubIdx = SubIndices[SubIndices.size() - Idx - 1];
1102 Register DestSubReg = RI.getSubReg(DestReg, SubIdx);
1103 Register SrcSubReg = RI.getSubReg(SrcReg, SubIdx);
1104 assert(DestSubReg && SrcSubReg && "Failed to find subregs!");
1105
1106 bool IsFirstSubreg = Idx == 0;
1107 bool UseKill = CanKillSuperReg && Idx == SubIndices.size() - 1;
1108
1109 if (Opcode == AMDGPU::INSTRUCTION_LIST_END) {
1110 Register ImpDefSuper = IsFirstSubreg ? Register(DestReg) : Register();
1111 Register ImpUseSuper = SrcReg;
1112 indirectCopyToAGPR(*this, MBB, MI, DL, DestSubReg, SrcSubReg, UseKill,
1113 *RS, Overlap, ImpDefSuper, ImpUseSuper);
1114 } else if (Opcode == AMDGPU::V_PK_MOV_B32) {
1116 BuildMI(MBB, MI, DL, get(AMDGPU::V_PK_MOV_B32), DestSubReg)
1118 .addReg(SrcSubReg)
1120 .addReg(SrcSubReg)
1121 .addImm(0) // op_sel_lo
1122 .addImm(0) // op_sel_hi
1123 .addImm(0) // neg_lo
1124 .addImm(0) // neg_hi
1125 .addImm(0) // clamp
1126 .addReg(SrcReg, getKillRegState(UseKill) | RegState::Implicit);
1127 if (IsFirstSubreg)
1129 } else {
1130 MachineInstrBuilder Builder =
1131 BuildMI(MBB, MI, DL, get(Opcode), DestSubReg).addReg(SrcSubReg);
1132 if (IsFirstSubreg)
1133 Builder.addReg(DestReg, RegState::Define | RegState::Implicit);
1134
1135 Builder.addReg(SrcReg, getKillRegState(UseKill) | RegState::Implicit);
1136 }
1137 }
1138}
1139
1140int SIInstrInfo::commuteOpcode(unsigned Opcode) const {
1141 int32_t NewOpc;
1142
1143 // Try to map original to commuted opcode
1144 NewOpc = AMDGPU::getCommuteRev(Opcode);
1145 if (NewOpc != -1)
1146 // Check if the commuted (REV) opcode exists on the target.
1147 return pseudoToMCOpcode(NewOpc) != -1 ? NewOpc : -1;
1148
1149 // Try to map commuted to original opcode
1150 NewOpc = AMDGPU::getCommuteOrig(Opcode);
1151 if (NewOpc != -1)
1152 // Check if the original (non-REV) opcode exists on the target.
1153 return pseudoToMCOpcode(NewOpc) != -1 ? NewOpc : -1;
1154
1155 return Opcode;
1156}
1157
1159 const Register Reg,
1160 int64_t &ImmVal) const {
1161 switch (MI.getOpcode()) {
1162 case AMDGPU::V_MOV_B32_e32:
1163 case AMDGPU::S_MOV_B32:
1164 case AMDGPU::S_MOVK_I32:
1165 case AMDGPU::S_MOV_B64:
1166 case AMDGPU::V_MOV_B64_e32:
1167 case AMDGPU::V_ACCVGPR_WRITE_B32_e64:
1168 case AMDGPU::AV_MOV_B32_IMM_PSEUDO:
1169 case AMDGPU::AV_MOV_B64_IMM_PSEUDO:
1170 case AMDGPU::S_MOV_B64_IMM_PSEUDO:
1171 case AMDGPU::V_MOV_B64_PSEUDO:
1172 case AMDGPU::V_MOV_B16_t16_e32: {
1173 const MachineOperand &Src0 = MI.getOperand(1);
1174 if (Src0.isImm()) {
1175 ImmVal = Src0.getImm();
1176 return MI.getOperand(0).getReg() == Reg;
1177 }
1178
1179 return false;
1180 }
1181 case AMDGPU::V_MOV_B16_t16_e64: {
1182 const MachineOperand &Src0 = MI.getOperand(2);
1183 if (Src0.isImm() && !MI.getOperand(1).getImm()) {
1184 ImmVal = Src0.getImm();
1185 return MI.getOperand(0).getReg() == Reg;
1186 }
1187
1188 return false;
1189 }
1190 case AMDGPU::S_BREV_B32:
1191 case AMDGPU::V_BFREV_B32_e32:
1192 case AMDGPU::V_BFREV_B32_e64: {
1193 const MachineOperand &Src0 = MI.getOperand(1);
1194 if (Src0.isImm()) {
1195 ImmVal = static_cast<int64_t>(reverseBits<int32_t>(Src0.getImm()));
1196 return MI.getOperand(0).getReg() == Reg;
1197 }
1198
1199 return false;
1200 }
1201 case AMDGPU::S_NOT_B32:
1202 case AMDGPU::V_NOT_B32_e32:
1203 case AMDGPU::V_NOT_B32_e64: {
1204 const MachineOperand &Src0 = MI.getOperand(1);
1205 if (Src0.isImm()) {
1206 ImmVal = static_cast<int64_t>(~static_cast<int32_t>(Src0.getImm()));
1207 return MI.getOperand(0).getReg() == Reg;
1208 }
1209
1210 return false;
1211 }
1212 default:
1213 return false;
1214 }
1215}
1216
1217std::optional<int64_t>
1219 if (Op.isImm())
1220 return Op.getImm();
1221
1222 if (!Op.isReg() || !Op.getReg().isVirtual())
1223 return std::nullopt;
1224 MachineRegisterInfo &MRI = Op.getParent()->getMF()->getRegInfo();
1225 const MachineInstr *Def = MRI.getVRegDef(Op.getReg());
1226 if (Def && Def->isMoveImmediate()) {
1227 const MachineOperand &ImmSrc = Def->getOperand(1);
1228 if (ImmSrc.isImm())
1229 return extractSubregFromImm(ImmSrc.getImm(), Op.getSubReg());
1230 }
1231
1232 return std::nullopt;
1233}
1234
1236
1237 if (RI.isAGPRClass(DstRC))
1238 return AMDGPU::COPY;
1239 if (RI.getRegSizeInBits(*DstRC) == 16) {
1240 // Assume hi bits are unneeded. Only _e64 true16 instructions are legal
1241 // before RA.
1242 return RI.isSGPRClass(DstRC) ? AMDGPU::COPY : AMDGPU::V_MOV_B16_t16_e64;
1243 }
1244 if (RI.getRegSizeInBits(*DstRC) == 32)
1245 return RI.isSGPRClass(DstRC) ? AMDGPU::S_MOV_B32 : AMDGPU::V_MOV_B32_e32;
1246 if (RI.getRegSizeInBits(*DstRC) == 64 && RI.isSGPRClass(DstRC))
1247 return AMDGPU::S_MOV_B64;
1248 if (RI.getRegSizeInBits(*DstRC) == 64 && !RI.isSGPRClass(DstRC))
1249 return AMDGPU::V_MOV_B64_PSEUDO;
1250 return AMDGPU::COPY;
1251}
1252
1253const MCInstrDesc &
1255 bool IsIndirectSrc) const {
1256 if (IsIndirectSrc) {
1257 if (VecSize <= 32) // 4 bytes
1258 return get(AMDGPU::V_INDIRECT_REG_READ_GPR_IDX_B32_V1);
1259 if (VecSize <= 64) // 8 bytes
1260 return get(AMDGPU::V_INDIRECT_REG_READ_GPR_IDX_B32_V2);
1261 if (VecSize <= 96) // 12 bytes
1262 return get(AMDGPU::V_INDIRECT_REG_READ_GPR_IDX_B32_V3);
1263 if (VecSize <= 128) // 16 bytes
1264 return get(AMDGPU::V_INDIRECT_REG_READ_GPR_IDX_B32_V4);
1265 if (VecSize <= 160) // 20 bytes
1266 return get(AMDGPU::V_INDIRECT_REG_READ_GPR_IDX_B32_V5);
1267 if (VecSize <= 192) // 24 bytes
1268 return get(AMDGPU::V_INDIRECT_REG_READ_GPR_IDX_B32_V6);
1269 if (VecSize <= 224) // 28 bytes
1270 return get(AMDGPU::V_INDIRECT_REG_READ_GPR_IDX_B32_V7);
1271 if (VecSize <= 256) // 32 bytes
1272 return get(AMDGPU::V_INDIRECT_REG_READ_GPR_IDX_B32_V8);
1273 if (VecSize <= 288) // 36 bytes
1274 return get(AMDGPU::V_INDIRECT_REG_READ_GPR_IDX_B32_V9);
1275 if (VecSize <= 320) // 40 bytes
1276 return get(AMDGPU::V_INDIRECT_REG_READ_GPR_IDX_B32_V10);
1277 if (VecSize <= 352) // 44 bytes
1278 return get(AMDGPU::V_INDIRECT_REG_READ_GPR_IDX_B32_V11);
1279 if (VecSize <= 384) // 48 bytes
1280 return get(AMDGPU::V_INDIRECT_REG_READ_GPR_IDX_B32_V12);
1281 if (VecSize <= 512) // 64 bytes
1282 return get(AMDGPU::V_INDIRECT_REG_READ_GPR_IDX_B32_V16);
1283 if (VecSize <= 1024) // 128 bytes
1284 return get(AMDGPU::V_INDIRECT_REG_READ_GPR_IDX_B32_V32);
1285
1286 llvm_unreachable("unsupported size for IndirectRegReadGPRIDX pseudos");
1287 }
1288
1289 if (VecSize <= 32) // 4 bytes
1290 return get(AMDGPU::V_INDIRECT_REG_WRITE_GPR_IDX_B32_V1);
1291 if (VecSize <= 64) // 8 bytes
1292 return get(AMDGPU::V_INDIRECT_REG_WRITE_GPR_IDX_B32_V2);
1293 if (VecSize <= 96) // 12 bytes
1294 return get(AMDGPU::V_INDIRECT_REG_WRITE_GPR_IDX_B32_V3);
1295 if (VecSize <= 128) // 16 bytes
1296 return get(AMDGPU::V_INDIRECT_REG_WRITE_GPR_IDX_B32_V4);
1297 if (VecSize <= 160) // 20 bytes
1298 return get(AMDGPU::V_INDIRECT_REG_WRITE_GPR_IDX_B32_V5);
1299 if (VecSize <= 192) // 24 bytes
1300 return get(AMDGPU::V_INDIRECT_REG_WRITE_GPR_IDX_B32_V6);
1301 if (VecSize <= 224) // 28 bytes
1302 return get(AMDGPU::V_INDIRECT_REG_WRITE_GPR_IDX_B32_V7);
1303 if (VecSize <= 256) // 32 bytes
1304 return get(AMDGPU::V_INDIRECT_REG_WRITE_GPR_IDX_B32_V8);
1305 if (VecSize <= 288) // 36 bytes
1306 return get(AMDGPU::V_INDIRECT_REG_WRITE_GPR_IDX_B32_V9);
1307 if (VecSize <= 320) // 40 bytes
1308 return get(AMDGPU::V_INDIRECT_REG_WRITE_GPR_IDX_B32_V10);
1309 if (VecSize <= 352) // 44 bytes
1310 return get(AMDGPU::V_INDIRECT_REG_WRITE_GPR_IDX_B32_V11);
1311 if (VecSize <= 384) // 48 bytes
1312 return get(AMDGPU::V_INDIRECT_REG_WRITE_GPR_IDX_B32_V12);
1313 if (VecSize <= 512) // 64 bytes
1314 return get(AMDGPU::V_INDIRECT_REG_WRITE_GPR_IDX_B32_V16);
1315 if (VecSize <= 1024) // 128 bytes
1316 return get(AMDGPU::V_INDIRECT_REG_WRITE_GPR_IDX_B32_V32);
1317
1318 llvm_unreachable("unsupported size for IndirectRegWriteGPRIDX pseudos");
1319}
1320
1321static unsigned getIndirectVGPRWriteMovRelPseudoOpc(unsigned VecSize) {
1322 if (VecSize <= 32) // 4 bytes
1323 return AMDGPU::V_INDIRECT_REG_WRITE_MOVREL_B32_V1;
1324 if (VecSize <= 64) // 8 bytes
1325 return AMDGPU::V_INDIRECT_REG_WRITE_MOVREL_B32_V2;
1326 if (VecSize <= 96) // 12 bytes
1327 return AMDGPU::V_INDIRECT_REG_WRITE_MOVREL_B32_V3;
1328 if (VecSize <= 128) // 16 bytes
1329 return AMDGPU::V_INDIRECT_REG_WRITE_MOVREL_B32_V4;
1330 if (VecSize <= 160) // 20 bytes
1331 return AMDGPU::V_INDIRECT_REG_WRITE_MOVREL_B32_V5;
1332 if (VecSize <= 192) // 24 bytes
1333 return AMDGPU::V_INDIRECT_REG_WRITE_MOVREL_B32_V6;
1334 if (VecSize <= 224) // 28 bytes
1335 return AMDGPU::V_INDIRECT_REG_WRITE_MOVREL_B32_V7;
1336 if (VecSize <= 256) // 32 bytes
1337 return AMDGPU::V_INDIRECT_REG_WRITE_MOVREL_B32_V8;
1338 if (VecSize <= 288) // 36 bytes
1339 return AMDGPU::V_INDIRECT_REG_WRITE_MOVREL_B32_V9;
1340 if (VecSize <= 320) // 40 bytes
1341 return AMDGPU::V_INDIRECT_REG_WRITE_MOVREL_B32_V10;
1342 if (VecSize <= 352) // 44 bytes
1343 return AMDGPU::V_INDIRECT_REG_WRITE_MOVREL_B32_V11;
1344 if (VecSize <= 384) // 48 bytes
1345 return AMDGPU::V_INDIRECT_REG_WRITE_MOVREL_B32_V12;
1346 if (VecSize <= 512) // 64 bytes
1347 return AMDGPU::V_INDIRECT_REG_WRITE_MOVREL_B32_V16;
1348 if (VecSize <= 1024) // 128 bytes
1349 return AMDGPU::V_INDIRECT_REG_WRITE_MOVREL_B32_V32;
1350
1351 llvm_unreachable("unsupported size for IndirectRegWrite pseudos");
1352}
1353
1354static unsigned getIndirectSGPRWriteMovRelPseudo32(unsigned VecSize) {
1355 if (VecSize <= 32) // 4 bytes
1356 return AMDGPU::S_INDIRECT_REG_WRITE_MOVREL_B32_V1;
1357 if (VecSize <= 64) // 8 bytes
1358 return AMDGPU::S_INDIRECT_REG_WRITE_MOVREL_B32_V2;
1359 if (VecSize <= 96) // 12 bytes
1360 return AMDGPU::S_INDIRECT_REG_WRITE_MOVREL_B32_V3;
1361 if (VecSize <= 128) // 16 bytes
1362 return AMDGPU::S_INDIRECT_REG_WRITE_MOVREL_B32_V4;
1363 if (VecSize <= 160) // 20 bytes
1364 return AMDGPU::S_INDIRECT_REG_WRITE_MOVREL_B32_V5;
1365 if (VecSize <= 192) // 24 bytes
1366 return AMDGPU::S_INDIRECT_REG_WRITE_MOVREL_B32_V6;
1367 if (VecSize <= 224) // 28 bytes
1368 return AMDGPU::S_INDIRECT_REG_WRITE_MOVREL_B32_V7;
1369 if (VecSize <= 256) // 32 bytes
1370 return AMDGPU::S_INDIRECT_REG_WRITE_MOVREL_B32_V8;
1371 if (VecSize <= 288) // 36 bytes
1372 return AMDGPU::S_INDIRECT_REG_WRITE_MOVREL_B32_V9;
1373 if (VecSize <= 320) // 40 bytes
1374 return AMDGPU::S_INDIRECT_REG_WRITE_MOVREL_B32_V10;
1375 if (VecSize <= 352) // 44 bytes
1376 return AMDGPU::S_INDIRECT_REG_WRITE_MOVREL_B32_V11;
1377 if (VecSize <= 384) // 48 bytes
1378 return AMDGPU::S_INDIRECT_REG_WRITE_MOVREL_B32_V12;
1379 if (VecSize <= 512) // 64 bytes
1380 return AMDGPU::S_INDIRECT_REG_WRITE_MOVREL_B32_V16;
1381 if (VecSize <= 1024) // 128 bytes
1382 return AMDGPU::S_INDIRECT_REG_WRITE_MOVREL_B32_V32;
1383
1384 llvm_unreachable("unsupported size for IndirectRegWrite pseudos");
1385}
1386
1387static unsigned getIndirectSGPRWriteMovRelPseudo64(unsigned VecSize) {
1388 if (VecSize <= 64) // 8 bytes
1389 return AMDGPU::S_INDIRECT_REG_WRITE_MOVREL_B64_V1;
1390 if (VecSize <= 128) // 16 bytes
1391 return AMDGPU::S_INDIRECT_REG_WRITE_MOVREL_B64_V2;
1392 if (VecSize <= 256) // 32 bytes
1393 return AMDGPU::S_INDIRECT_REG_WRITE_MOVREL_B64_V4;
1394 if (VecSize <= 512) // 64 bytes
1395 return AMDGPU::S_INDIRECT_REG_WRITE_MOVREL_B64_V8;
1396 if (VecSize <= 1024) // 128 bytes
1397 return AMDGPU::S_INDIRECT_REG_WRITE_MOVREL_B64_V16;
1398
1399 llvm_unreachable("unsupported size for IndirectRegWrite pseudos");
1400}
1401
1402const MCInstrDesc &
1403SIInstrInfo::getIndirectRegWriteMovRelPseudo(unsigned VecSize, unsigned EltSize,
1404 bool IsSGPR) const {
1405 if (IsSGPR) {
1406 switch (EltSize) {
1407 case 32:
1408 return get(getIndirectSGPRWriteMovRelPseudo32(VecSize));
1409 case 64:
1410 return get(getIndirectSGPRWriteMovRelPseudo64(VecSize));
1411 default:
1412 llvm_unreachable("invalid reg indexing elt size");
1413 }
1414 }
1415
1416 assert(EltSize == 32 && "invalid reg indexing elt size");
1418}
1419
1420static unsigned getSGPRSpillSaveOpcode(unsigned Size, bool NeedsCFI) {
1421 switch (Size) {
1422 case 4:
1423 return NeedsCFI ? AMDGPU::SI_SPILL_S32_CFI_SAVE : AMDGPU::SI_SPILL_S32_SAVE;
1424 case 8:
1425 return NeedsCFI ? AMDGPU::SI_SPILL_S64_CFI_SAVE : AMDGPU::SI_SPILL_S64_SAVE;
1426 case 12:
1427 return NeedsCFI ? AMDGPU::SI_SPILL_S96_CFI_SAVE : AMDGPU::SI_SPILL_S96_SAVE;
1428 case 16:
1429 return NeedsCFI ? AMDGPU::SI_SPILL_S128_CFI_SAVE
1430 : AMDGPU::SI_SPILL_S128_SAVE;
1431 case 20:
1432 return NeedsCFI ? AMDGPU::SI_SPILL_S160_CFI_SAVE
1433 : AMDGPU::SI_SPILL_S160_SAVE;
1434 case 24:
1435 return NeedsCFI ? AMDGPU::SI_SPILL_S192_CFI_SAVE
1436 : AMDGPU::SI_SPILL_S192_SAVE;
1437 case 28:
1438 return NeedsCFI ? AMDGPU::SI_SPILL_S224_CFI_SAVE
1439 : AMDGPU::SI_SPILL_S224_SAVE;
1440 case 32:
1441 return AMDGPU::SI_SPILL_S256_SAVE;
1442 case 36:
1443 return AMDGPU::SI_SPILL_S288_SAVE;
1444 case 40:
1445 return AMDGPU::SI_SPILL_S320_SAVE;
1446 case 44:
1447 return AMDGPU::SI_SPILL_S352_SAVE;
1448 case 48:
1449 return AMDGPU::SI_SPILL_S384_SAVE;
1450 case 64:
1451 return NeedsCFI ? AMDGPU::SI_SPILL_S512_CFI_SAVE
1452 : AMDGPU::SI_SPILL_S512_SAVE;
1453 case 128:
1454 return NeedsCFI ? AMDGPU::SI_SPILL_S1024_CFI_SAVE
1455 : AMDGPU::SI_SPILL_S1024_SAVE;
1456 default:
1457 llvm_unreachable("unknown register size");
1458 }
1459}
1460
1461static unsigned getVGPRSpillSaveOpcode(unsigned Size, bool NeedsCFI) {
1462 switch (Size) {
1463 case 2:
1464 return AMDGPU::SI_SPILL_V16_SAVE;
1465 case 4:
1466 return NeedsCFI ? AMDGPU::SI_SPILL_V32_CFI_SAVE : AMDGPU::SI_SPILL_V32_SAVE;
1467 case 8:
1468 return NeedsCFI ? AMDGPU::SI_SPILL_V64_CFI_SAVE : AMDGPU::SI_SPILL_V64_SAVE;
1469 case 12:
1470 return NeedsCFI ? AMDGPU::SI_SPILL_V96_CFI_SAVE : AMDGPU::SI_SPILL_V96_SAVE;
1471 case 16:
1472 return NeedsCFI ? AMDGPU::SI_SPILL_V128_CFI_SAVE
1473 : AMDGPU::SI_SPILL_V128_SAVE;
1474 case 20:
1475 return NeedsCFI ? AMDGPU::SI_SPILL_V160_CFI_SAVE
1476 : AMDGPU::SI_SPILL_V160_SAVE;
1477 case 24:
1478 return NeedsCFI ? AMDGPU::SI_SPILL_V192_CFI_SAVE
1479 : AMDGPU::SI_SPILL_V192_SAVE;
1480 case 28:
1481 return NeedsCFI ? AMDGPU::SI_SPILL_V224_CFI_SAVE
1482 : AMDGPU::SI_SPILL_V224_SAVE;
1483 case 32:
1484 return NeedsCFI ? AMDGPU::SI_SPILL_V256_CFI_SAVE
1485 : AMDGPU::SI_SPILL_V256_SAVE;
1486 case 36:
1487 return NeedsCFI ? AMDGPU::SI_SPILL_V288_CFI_SAVE
1488 : AMDGPU::SI_SPILL_V288_SAVE;
1489 case 40:
1490 return NeedsCFI ? AMDGPU::SI_SPILL_V320_CFI_SAVE
1491 : AMDGPU::SI_SPILL_V320_SAVE;
1492 case 44:
1493 return NeedsCFI ? AMDGPU::SI_SPILL_V352_CFI_SAVE
1494 : AMDGPU::SI_SPILL_V352_SAVE;
1495 case 48:
1496 return NeedsCFI ? AMDGPU::SI_SPILL_V384_CFI_SAVE
1497 : AMDGPU::SI_SPILL_V384_SAVE;
1498 case 64:
1499 return NeedsCFI ? AMDGPU::SI_SPILL_V512_CFI_SAVE
1500 : AMDGPU::SI_SPILL_V512_SAVE;
1501 case 128:
1502 return NeedsCFI ? AMDGPU::SI_SPILL_V1024_CFI_SAVE
1503 : AMDGPU::SI_SPILL_V1024_SAVE;
1504 default:
1505 llvm_unreachable("unknown register size");
1506 }
1507}
1508
1509static unsigned getAVSpillSaveOpcode(unsigned Size, bool NeedsCFI) {
1510 switch (Size) {
1511 case 4:
1512 return NeedsCFI ? AMDGPU::SI_SPILL_AV32_CFI_SAVE
1513 : AMDGPU::SI_SPILL_AV32_SAVE;
1514 case 8:
1515 return NeedsCFI ? AMDGPU::SI_SPILL_AV64_CFI_SAVE
1516 : AMDGPU::SI_SPILL_AV64_SAVE;
1517 case 12:
1518 return NeedsCFI ? AMDGPU::SI_SPILL_AV96_CFI_SAVE
1519 : AMDGPU::SI_SPILL_AV96_SAVE;
1520 case 16:
1521 return NeedsCFI ? AMDGPU::SI_SPILL_AV128_CFI_SAVE
1522 : AMDGPU::SI_SPILL_AV128_SAVE;
1523 case 20:
1524 return NeedsCFI ? AMDGPU::SI_SPILL_AV160_CFI_SAVE
1525 : AMDGPU::SI_SPILL_AV160_SAVE;
1526 case 24:
1527 return NeedsCFI ? AMDGPU::SI_SPILL_AV192_CFI_SAVE
1528 : AMDGPU::SI_SPILL_AV192_SAVE;
1529 case 28:
1530 return NeedsCFI ? AMDGPU::SI_SPILL_AV224_CFI_SAVE
1531 : AMDGPU::SI_SPILL_AV224_SAVE;
1532 case 32:
1533 return NeedsCFI ? AMDGPU::SI_SPILL_AV256_CFI_SAVE
1534 : AMDGPU::SI_SPILL_AV256_SAVE;
1535 case 36:
1536 return AMDGPU::SI_SPILL_AV288_SAVE;
1537 case 40:
1538 return AMDGPU::SI_SPILL_AV320_SAVE;
1539 case 44:
1540 return AMDGPU::SI_SPILL_AV352_SAVE;
1541 case 48:
1542 return AMDGPU::SI_SPILL_AV384_SAVE;
1543 case 64:
1544 return NeedsCFI ? AMDGPU::SI_SPILL_AV512_CFI_SAVE
1545 : AMDGPU::SI_SPILL_AV512_SAVE;
1546 case 128:
1547 return NeedsCFI ? AMDGPU::SI_SPILL_AV1024_CFI_SAVE
1548 : AMDGPU::SI_SPILL_AV1024_SAVE;
1549 default:
1550 llvm_unreachable("unknown register size");
1551 }
1552}
1553
1554static unsigned getWWMRegSpillSaveOpcode(unsigned Size,
1555 bool IsVectorSuperClass) {
1556 // Currently, there is only 32-bit WWM register spills needed.
1557 if (Size != 4)
1558 llvm_unreachable("unknown wwm register spill size");
1559
1560 if (IsVectorSuperClass)
1561 return AMDGPU::SI_SPILL_WWM_AV32_SAVE;
1562
1563 return AMDGPU::SI_SPILL_WWM_V32_SAVE;
1564}
1565
1567 Register Reg, const TargetRegisterClass *RC, unsigned Size,
1568 const SIMachineFunctionInfo &MFI, bool NeedsCFI) const {
1569 bool IsVectorSuperClass = RI.isVectorSuperClass(RC);
1570
1571 // Choose the right opcode if spilling a WWM register.
1573 return getWWMRegSpillSaveOpcode(Size, IsVectorSuperClass);
1574
1575 // TODO: Check if AGPRs are available
1576 if (ST.hasMAIInsts())
1577 return getAVSpillSaveOpcode(Size, NeedsCFI);
1578
1579 return getVGPRSpillSaveOpcode(Size, NeedsCFI);
1580}
1581
1582void SIInstrInfo::storeRegToStackSlotImpl(
1584 bool isKill, int FrameIndex, const TargetRegisterClass *RC, Register VReg,
1585 MachineInstr::MIFlag Flags, bool NeedsCFI) const {
1586 MachineFunction *MF = MBB.getParent();
1588 MachineFrameInfo &FrameInfo = MF->getFrameInfo();
1589 const DebugLoc &DL = MBB.findDebugLoc(MI);
1590
1591 MachinePointerInfo PtrInfo
1592 = MachinePointerInfo::getFixedStack(*MF, FrameIndex);
1594 PtrInfo, MachineMemOperand::MOStore, FrameInfo.getObjectSize(FrameIndex),
1595 FrameInfo.getObjectAlign(FrameIndex));
1596 unsigned SpillSize = RI.getSpillSize(*RC);
1597
1598 MachineRegisterInfo &MRI = MF->getRegInfo();
1599 if (RI.isSGPRClass(RC)) {
1600 if (FrameInfo.getStackID(FrameIndex) == TargetStackID::SGPRSpill)
1601 MFI->setHasSpilledSGPRs();
1602 assert(SrcReg != AMDGPU::M0 && "m0 should not be spilled");
1603 assert(SrcReg != AMDGPU::EXEC_LO && SrcReg != AMDGPU::EXEC_HI &&
1604 SrcReg != AMDGPU::EXEC && "exec should not be spilled");
1605
1606 // We are only allowed to create one new instruction when spilling
1607 // registers, so we need to use pseudo instruction for spilling SGPRs.
1608 const MCInstrDesc &OpDesc =
1609 get(getSGPRSpillSaveOpcode(SpillSize, NeedsCFI));
1610
1611 // The SGPR spill/restore instructions only work on number sgprs, so we need
1612 // to make sure we are using the correct register class.
1613 if (SrcReg.isVirtual() && SpillSize == 4) {
1614 MRI.constrainRegClass(SrcReg, &AMDGPU::SReg_32_XM0_XEXECRegClass);
1615 }
1616
1617 BuildMI(MBB, MI, DL, OpDesc)
1618 .addReg(SrcReg, getKillRegState(isKill)) // data
1619 .addFrameIndex(FrameIndex) // addr
1620 .addMemOperand(MMO)
1622
1623 return;
1624 }
1625
1626 unsigned Opcode = getVectorRegSpillSaveOpcode(VReg ? VReg : SrcReg, RC,
1627 SpillSize, *MFI, NeedsCFI);
1628 MFI->setHasSpilledVGPRs();
1629
1630 BuildMI(MBB, MI, DL, get(Opcode))
1631 .addReg(SrcReg, getKillRegState(isKill)) // data
1632 .addFrameIndex(FrameIndex) // addr
1633 .addReg(MFI->getStackPtrOffsetReg()) // scratch_offset
1634 .addImm(0) // offset
1635 .addMemOperand(MMO);
1636}
1637
1640 bool isKill, int FrameIndex, const TargetRegisterClass *RC, Register VReg,
1641 MachineInstr::MIFlag Flags) const {
1642 storeRegToStackSlotImpl(MBB, MI, SrcReg, isKill, FrameIndex, RC, VReg, Flags,
1643 false);
1644}
1645
1648 Register SrcReg, bool isKill,
1649 int FrameIndex,
1650 const TargetRegisterClass *RC) const {
1651 storeRegToStackSlotImpl(MBB, MI, SrcReg, isKill, FrameIndex, RC, Register(),
1652 MachineInstr::NoFlags, true);
1653}
1654
1655static unsigned getSGPRSpillRestoreOpcode(unsigned Size) {
1656 switch (Size) {
1657 case 4:
1658 return AMDGPU::SI_SPILL_S32_RESTORE;
1659 case 8:
1660 return AMDGPU::SI_SPILL_S64_RESTORE;
1661 case 12:
1662 return AMDGPU::SI_SPILL_S96_RESTORE;
1663 case 16:
1664 return AMDGPU::SI_SPILL_S128_RESTORE;
1665 case 20:
1666 return AMDGPU::SI_SPILL_S160_RESTORE;
1667 case 24:
1668 return AMDGPU::SI_SPILL_S192_RESTORE;
1669 case 28:
1670 return AMDGPU::SI_SPILL_S224_RESTORE;
1671 case 32:
1672 return AMDGPU::SI_SPILL_S256_RESTORE;
1673 case 36:
1674 return AMDGPU::SI_SPILL_S288_RESTORE;
1675 case 40:
1676 return AMDGPU::SI_SPILL_S320_RESTORE;
1677 case 44:
1678 return AMDGPU::SI_SPILL_S352_RESTORE;
1679 case 48:
1680 return AMDGPU::SI_SPILL_S384_RESTORE;
1681 case 64:
1682 return AMDGPU::SI_SPILL_S512_RESTORE;
1683 case 128:
1684 return AMDGPU::SI_SPILL_S1024_RESTORE;
1685 default:
1686 llvm_unreachable("unknown register size");
1687 }
1688}
1689
1690static unsigned getVGPRSpillRestoreOpcode(unsigned Size) {
1691 switch (Size) {
1692 case 2:
1693 return AMDGPU::SI_SPILL_V16_RESTORE;
1694 case 4:
1695 return AMDGPU::SI_SPILL_V32_RESTORE;
1696 case 8:
1697 return AMDGPU::SI_SPILL_V64_RESTORE;
1698 case 12:
1699 return AMDGPU::SI_SPILL_V96_RESTORE;
1700 case 16:
1701 return AMDGPU::SI_SPILL_V128_RESTORE;
1702 case 20:
1703 return AMDGPU::SI_SPILL_V160_RESTORE;
1704 case 24:
1705 return AMDGPU::SI_SPILL_V192_RESTORE;
1706 case 28:
1707 return AMDGPU::SI_SPILL_V224_RESTORE;
1708 case 32:
1709 return AMDGPU::SI_SPILL_V256_RESTORE;
1710 case 36:
1711 return AMDGPU::SI_SPILL_V288_RESTORE;
1712 case 40:
1713 return AMDGPU::SI_SPILL_V320_RESTORE;
1714 case 44:
1715 return AMDGPU::SI_SPILL_V352_RESTORE;
1716 case 48:
1717 return AMDGPU::SI_SPILL_V384_RESTORE;
1718 case 64:
1719 return AMDGPU::SI_SPILL_V512_RESTORE;
1720 case 128:
1721 return AMDGPU::SI_SPILL_V1024_RESTORE;
1722 default:
1723 llvm_unreachable("unknown register size");
1724 }
1725}
1726
1727static unsigned getAVSpillRestoreOpcode(unsigned Size) {
1728 switch (Size) {
1729 case 4:
1730 return AMDGPU::SI_SPILL_AV32_RESTORE;
1731 case 8:
1732 return AMDGPU::SI_SPILL_AV64_RESTORE;
1733 case 12:
1734 return AMDGPU::SI_SPILL_AV96_RESTORE;
1735 case 16:
1736 return AMDGPU::SI_SPILL_AV128_RESTORE;
1737 case 20:
1738 return AMDGPU::SI_SPILL_AV160_RESTORE;
1739 case 24:
1740 return AMDGPU::SI_SPILL_AV192_RESTORE;
1741 case 28:
1742 return AMDGPU::SI_SPILL_AV224_RESTORE;
1743 case 32:
1744 return AMDGPU::SI_SPILL_AV256_RESTORE;
1745 case 36:
1746 return AMDGPU::SI_SPILL_AV288_RESTORE;
1747 case 40:
1748 return AMDGPU::SI_SPILL_AV320_RESTORE;
1749 case 44:
1750 return AMDGPU::SI_SPILL_AV352_RESTORE;
1751 case 48:
1752 return AMDGPU::SI_SPILL_AV384_RESTORE;
1753 case 64:
1754 return AMDGPU::SI_SPILL_AV512_RESTORE;
1755 case 128:
1756 return AMDGPU::SI_SPILL_AV1024_RESTORE;
1757 default:
1758 llvm_unreachable("unknown register size");
1759 }
1760}
1761
1762static unsigned getWWMRegSpillRestoreOpcode(unsigned Size,
1763 bool IsVectorSuperClass) {
1764 // Currently, there is only 32-bit WWM register spills needed.
1765 if (Size != 4)
1766 llvm_unreachable("unknown wwm register spill size");
1767
1768 if (IsVectorSuperClass) // TODO: Always use this if there are AGPRs
1769 return AMDGPU::SI_SPILL_WWM_AV32_RESTORE;
1770
1771 return AMDGPU::SI_SPILL_WWM_V32_RESTORE;
1772}
1773
1775 Register Reg, const TargetRegisterClass *RC, unsigned Size,
1776 const SIMachineFunctionInfo &MFI) const {
1777 bool IsVectorSuperClass = RI.isVectorSuperClass(RC);
1778
1779 // Choose the right opcode if restoring a WWM register.
1781 return getWWMRegSpillRestoreOpcode(Size, IsVectorSuperClass);
1782
1783 // TODO: Check if AGPRs are available
1784 if (ST.hasMAIInsts())
1786
1787 assert(!RI.isAGPRClass(RC));
1789}
1790
1793 Register DestReg, int FrameIndex,
1794 const TargetRegisterClass *RC,
1795 Register VReg, unsigned SubReg,
1796 MachineInstr::MIFlag Flags) const {
1797 MachineFunction *MF = MBB.getParent();
1799 MachineFrameInfo &FrameInfo = MF->getFrameInfo();
1800 const DebugLoc &DL = MBB.findDebugLoc(MI);
1801 unsigned SpillSize = RI.getSpillSize(*RC);
1802
1803 MachinePointerInfo PtrInfo
1804 = MachinePointerInfo::getFixedStack(*MF, FrameIndex);
1805
1807 PtrInfo, MachineMemOperand::MOLoad, FrameInfo.getObjectSize(FrameIndex),
1808 FrameInfo.getObjectAlign(FrameIndex));
1809
1810 if (RI.isSGPRClass(RC)) {
1811 if (FrameInfo.getStackID(FrameIndex) == TargetStackID::SGPRSpill)
1812 MFI->setHasSpilledSGPRs();
1813 assert(DestReg != AMDGPU::M0 && "m0 should not be reloaded into");
1814 assert(DestReg != AMDGPU::EXEC_LO && DestReg != AMDGPU::EXEC_HI &&
1815 DestReg != AMDGPU::EXEC && "exec should not be spilled");
1816
1817 // FIXME: Maybe this should not include a memoperand because it will be
1818 // lowered to non-memory instructions.
1819 const MCInstrDesc &OpDesc = get(getSGPRSpillRestoreOpcode(SpillSize));
1820 if (DestReg.isVirtual() && SpillSize == 4) {
1821 MachineRegisterInfo &MRI = MF->getRegInfo();
1822 MRI.constrainRegClass(DestReg, &AMDGPU::SReg_32_XM0_XEXECRegClass);
1823 }
1824
1825 BuildMI(MBB, MI, DL, OpDesc, DestReg)
1826 .addFrameIndex(FrameIndex) // addr
1827 .addMemOperand(MMO)
1829
1830 return;
1831 }
1832
1833 unsigned Opcode = getVectorRegSpillRestoreOpcode(VReg ? VReg : DestReg, RC,
1834 SpillSize, *MFI);
1835 BuildMI(MBB, MI, DL, get(Opcode), DestReg)
1836 .addFrameIndex(FrameIndex) // vaddr
1837 .addReg(MFI->getStackPtrOffsetReg()) // scratch_offset
1838 .addImm(0) // offset
1839 .addMemOperand(MMO);
1840}
1841
1846
1849 unsigned Quantity) const {
1850 DebugLoc DL = MBB.findDebugLoc(MI);
1851 unsigned MaxSNopCount = 1u << ST.getSNopBits();
1852 while (Quantity > 0) {
1853 unsigned Arg = std::min(Quantity, MaxSNopCount);
1854 Quantity -= Arg;
1855 BuildMI(MBB, MI, DL, get(AMDGPU::S_NOP)).addImm(Arg - 1);
1856 }
1857}
1858
1862 const DebugLoc &DL) const {
// Emits a software "simulated trap" sequence for MI and returns the block
// that follows MBB afterwards.
//
// Two blocks are involved:
//  * TrapBB     - executes the trap, raises the queue-wave-abort message and
//                 then jumps to HaltLoopBB. This is MBB itself when MI is the
//                 last instruction of a block without successors; otherwise
//                 MBB is split at MI and a fresh block is used.
//  * HaltLoopBB - halts the wave and branches to itself forever.
//
// NOTE(review): the head of the signature and two operand lines (flagged
// below) are not visible in this excerpt.
1863 MachineFunction *MF = MBB.getParent();
// Low 10 bits of the value returned by the message below are kept as the
// doorbell ID; bit 10 is OR-ed in as the "queue wave abort" flag.
1864 constexpr unsigned DoorbellIDMask = 0x3ff;
1865 constexpr unsigned ECQueueWaveAbort = 0x400;
1866
1867 MachineBasicBlock *TrapBB = &MBB;
1868 MachineBasicBlock *HaltLoopBB = MF->CreateMachineBasicBlock();
1869
// If anything can execute after MI (later instructions or successor blocks),
// move the trap sequence out of line and only branch to it when EXEC is
// non-zero.
1870 if (!MBB.succ_empty() || std::next(MI.getIterator()) != MBB.end()) {
1871 MBB.splitAt(MI, /*UpdateLiveIns=*/false);
1872 TrapBB = MF->CreateMachineBasicBlock();
1873 BuildMI(MBB, MI, DL, get(AMDGPU::S_CBRANCH_EXECNZ)).addMBB(TrapBB);
1874 MF->push_back(TrapBB);
1875 MBB.addSuccessor(TrapBB);
1876 }
1877 // Start with a `s_trap 2`, if we're in PRIV=1 and we need the workaround this
1878 // will be a nop.
1879 BuildMI(*TrapBB, TrapBB->end(), DL, get(AMDGPU::S_TRAP))
1880 .addImm(static_cast<unsigned>(GCNSubtarget::TrapID::LLVMAMDHSATrap));
1881 Register DoorbellReg = MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass);
// NOTE(review): the message-ID operand of this S_SENDMSG_RTN_B32 is on a line
// that is missing from this excerpt.
1882 BuildMI(*TrapBB, TrapBB->end(), DL, get(AMDGPU::S_SENDMSG_RTN_B32),
1883 DoorbellReg)
// Preserve M0 in TTMP2 while M0 is repurposed for the message payload.
1885 BuildMI(*TrapBB, TrapBB->end(), DL, get(AMDGPU::S_MOV_B32), AMDGPU::TTMP2)
1886 .addUse(AMDGPU::M0);
1887 Register DoorbellRegMasked =
1888 MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass);
1889 BuildMI(*TrapBB, TrapBB->end(), DL, get(AMDGPU::S_AND_B32), DoorbellRegMasked)
1890 .addUse(DoorbellReg)
1891 .addImm(DoorbellIDMask);
1892 Register SetWaveAbortBit =
1893 MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass);
1894 BuildMI(*TrapBB, TrapBB->end(), DL, get(AMDGPU::S_OR_B32), SetWaveAbortBit)
1895 .addUse(DoorbellRegMasked)
1896 .addImm(ECQueueWaveAbort);
1897 BuildMI(*TrapBB, TrapBB->end(), DL, get(AMDGPU::S_MOV_B32), AMDGPU::M0)
1898 .addUse(SetWaveAbortBit);
// NOTE(review): the operand of this S_SENDMSG is on a line that is missing
// from this excerpt.
1899 BuildMI(*TrapBB, TrapBB->end(), DL, get(AMDGPU::S_SENDMSG))
// Restore the original M0 value saved in TTMP2 above.
1901 BuildMI(*TrapBB, TrapBB->end(), DL, get(AMDGPU::S_MOV_B32), AMDGPU::M0)
1902 .addUse(AMDGPU::TTMP2);
1903 BuildMI(*TrapBB, TrapBB->end(), DL, get(AMDGPU::S_BRANCH)).addMBB(HaltLoopBB);
1904 TrapBB->addSuccessor(HaltLoopBB);
1905
// The halt loop: S_SETHALT followed by an unconditional branch back to the
// start of the same block.
1906 BuildMI(*HaltLoopBB, HaltLoopBB->end(), DL, get(AMDGPU::S_SETHALT)).addImm(5);
1907 BuildMI(*HaltLoopBB, HaltLoopBB->end(), DL, get(AMDGPU::S_BRANCH))
1908 .addMBB(HaltLoopBB);
1909 MF->push_back(HaltLoopBB);
1910 HaltLoopBB->addSuccessor(HaltLoopBB);
1911
1912 return MBB.getNextNode();
1913}
1914
// Returns the number of wait states attributed to MI:
//  * 0 for meta instructions,
//  * (immediate operand + 1) for S_NOP,
//  * 1 for every other opcode.
// NOTE(review): the signature line is not visible in this excerpt.
1916 switch (MI.getOpcode()) {
1917 default:
1918 if (MI.isMetaInstruction())
1919 return 0;
1920 return 1; // FIXME: Do wait states equal cycles?
1921
1922 case AMDGPU::S_NOP:
// The S_NOP immediate is biased by one, hence the +1.
1923 return MI.getOperand(0).getImm() + 1;
1924 // SI_RETURN_TO_EPILOG is a fallthrough to code outside of the function. The
1925 // hazard, even if one exists, won't really be visible. Should we handle it?
1926 }
1927}
1928
// Expands the pseudo-instruction MI into real machine instructions.
//
// Each handled opcode either
//  * is retargeted in place with MI.setDesc(...), or
//  * has its replacement built in front of MI, after which MI is erased.
// Opcodes without a case are forwarded to
// TargetInstrInfo::expandPostRAPseudo(MI); every handled case leaves the
// switch and the function returns true.
//
// NOTE(review): the signature line and a few declarations (e.g. of `LMC` and
// `MIB`, which are used below) are not visible in this excerpt.
1930 MachineBasicBlock &MBB = *MI.getParent();
1931 DebugLoc DL = MBB.findDebugLoc(MI);
1933 switch (MI.getOpcode()) {
1934 default: return TargetInstrInfo::expandPostRAPseudo(MI);
1935 case AMDGPU::S_MOV_B64_term:
1936 // This is only a terminator to get the correct spill code placement during
1937 // register allocation.
1938 MI.setDesc(get(AMDGPU::S_MOV_B64));
1939 break;
1940
1941 case AMDGPU::S_MOV_B32_term:
1942 // This is only a terminator to get the correct spill code placement during
1943 // register allocation.
1944 MI.setDesc(get(AMDGPU::S_MOV_B32));
1945 break;
1946
1947 case AMDGPU::S_XOR_B64_term:
1948 // This is only a terminator to get the correct spill code placement during
1949 // register allocation.
1950 MI.setDesc(get(AMDGPU::S_XOR_B64));
1951 break;
1952
1953 case AMDGPU::S_XOR_B32_term:
1954 // This is only a terminator to get the correct spill code placement during
1955 // register allocation.
1956 MI.setDesc(get(AMDGPU::S_XOR_B32));
1957 break;
1958 case AMDGPU::S_OR_B64_term:
1959 // This is only a terminator to get the correct spill code placement during
1960 // register allocation.
1961 MI.setDesc(get(AMDGPU::S_OR_B64));
1962 break;
1963 case AMDGPU::S_OR_B32_term:
1964 // This is only a terminator to get the correct spill code placement during
1965 // register allocation.
1966 MI.setDesc(get(AMDGPU::S_OR_B32));
1967 break;
1968
1969 case AMDGPU::S_ANDN2_B64_term:
1970 // This is only a terminator to get the correct spill code placement during
1971 // register allocation.
1972 MI.setDesc(get(AMDGPU::S_ANDN2_B64));
1973 break;
1974
1975 case AMDGPU::S_ANDN2_B32_term:
1976 // This is only a terminator to get the correct spill code placement during
1977 // register allocation.
1978 MI.setDesc(get(AMDGPU::S_ANDN2_B32));
1979 break;
1980
1981 case AMDGPU::S_AND_B64_term:
1982 // This is only a terminator to get the correct spill code placement during
1983 // register allocation.
1984 MI.setDesc(get(AMDGPU::S_AND_B64));
1985 break;
1986
1987 case AMDGPU::S_AND_B32_term:
1988 // This is only a terminator to get the correct spill code placement during
1989 // register allocation.
1990 MI.setDesc(get(AMDGPU::S_AND_B32));
1991 break;
1992
1993 case AMDGPU::S_AND_SAVEEXEC_B64_term:
1994 // This is only a terminator to get the correct spill code placement during
1995 // register allocation.
1996 MI.setDesc(get(AMDGPU::S_AND_SAVEEXEC_B64));
1997 break;
1998
1999 case AMDGPU::S_AND_SAVEEXEC_B32_term:
2000 // This is only a terminator to get the correct spill code placement during
2001 // register allocation.
2002 MI.setDesc(get(AMDGPU::S_AND_SAVEEXEC_B32));
2003 break;
2004
// The V_CMPX *_term variants are likewise swapped for their non-terminator
// opcodes.
2005 case AMDGPU::V_CMPX_EQ_U32_nosdst_e32_term:
2006 MI.setDesc(get(AMDGPU::V_CMPX_EQ_U32_nosdst_e32));
2007 break;
2008 case AMDGPU::V_CMPX_EQ_U64_nosdst_e32_term:
2009 MI.setDesc(get(AMDGPU::V_CMPX_EQ_U64_nosdst_e32));
2010 break;
2011
// SGPR<->VGPR-lane spill pseudos map one-to-one onto writelane/readlane.
2012 case AMDGPU::SI_SPILL_S32_TO_VGPR:
2013 MI.setDesc(get(AMDGPU::V_WRITELANE_B32));
2014 break;
2015
2016 case AMDGPU::SI_RESTORE_S32_FROM_VGPR:
2017 MI.setDesc(get(AMDGPU::V_READLANE_B32));
2018 break;
2019 case AMDGPU::AV_MOV_B32_IMM_PSEUDO: {
// Pick the move opcode from the base class of the (physical) destination:
// an AGPR destination needs V_ACCVGPR_WRITE, anything else uses V_MOV_B32.
2020 Register Dst = MI.getOperand(0).getReg();
2021 bool IsAGPR = SIRegisterInfo::isAGPRClass(RI.getPhysRegBaseClass(Dst));
2022 MI.setDesc(
2023 get(IsAGPR ? AMDGPU::V_ACCVGPR_WRITE_B32_e64 : AMDGPU::V_MOV_B32_e32));
2024 break;
2025 }
2026 case AMDGPU::AV_MOV_B64_IMM_PSEUDO: {
2027 Register Dst = MI.getOperand(0).getReg();
2028 if (SIRegisterInfo::isAGPRClass(RI.getPhysRegBaseClass(Dst))) {
// AGPR destination: write each 32-bit half separately, passing the
// sign-extended low and high halves of the 64-bit immediate.
2029 int64_t Imm = MI.getOperand(1).getImm();
2030
2031 Register DstLo = RI.getSubReg(Dst, AMDGPU::sub0);
2032 Register DstHi = RI.getSubReg(Dst, AMDGPU::sub1);
2033 BuildMI(MBB, MI, DL, get(AMDGPU::V_ACCVGPR_WRITE_B32_e64), DstLo)
2034 .addImm(SignExtend64<32>(Imm));
2035 BuildMI(MBB, MI, DL, get(AMDGPU::V_ACCVGPR_WRITE_B32_e64), DstHi)
2036 .addImm(SignExtend64<32>(Imm >> 32));
2037 MI.eraseFromParent();
2038 break;
2039 }
2040
// Non-AGPR destination: handled exactly like V_MOV_B64_PSEUDO below.
2041 [[fallthrough]];
2042 }
2043 case AMDGPU::V_MOV_B64_PSEUDO: {
2044 Register Dst = MI.getOperand(0).getReg();
2045 Register DstLo = RI.getSubReg(Dst, AMDGPU::sub0);
2046 Register DstHi = RI.getSubReg(Dst, AMDGPU::sub1);
2047
2048 const MCInstrDesc &Mov64Desc = get(AMDGPU::V_MOV_B64_e32);
2049 const TargetRegisterClass *Mov64RC = getRegClass(Mov64Desc, /*OpNum=*/0);
2050
2051 const MachineOperand &SrcOp = MI.getOperand(1);
2052 // FIXME: Will this work for 64-bit floating point immediates?
2053 assert(!SrcOp.isFPImm());
// Prefer a single V_MOV_B64 when the subtarget has it and Dst is in its
// destination class. It is kept only if the source is a register, an inline
// constant, fits in 32 unsigned bits, or 64-bit literals are supported;
// otherwise control continues to the split expansion below.
2054 if (ST.hasVMovB64Inst() && Mov64RC->contains(Dst)) {
2055 MI.setDesc(Mov64Desc);
2056 if (SrcOp.isReg() || isInlineConstant(MI, 1) ||
2057 isUInt<32>(SrcOp.getImm()) || ST.has64BitLiterals())
2058 break;
2059 }
2060 if (SrcOp.isImm()) {
2061 APInt Imm(64, SrcOp.getImm());
2062 APInt Lo(32, Imm.getLoBits(32).getZExtValue());
2063 APInt Hi(32, Imm.getHiBits(32).getZExtValue());
2064 const MCInstrDesc &PkMovDesc = get(AMDGPU::V_PK_MOV_B32);
2065 const TargetRegisterClass *PkMovRC = getRegClass(PkMovDesc, /*OpNum=*/0);
2066
// Both halves identical and inlinable: one V_PK_MOV_B32 writes the pair.
// Otherwise emit one V_MOV_B32 per half.
// NOTE(review): the source-modifier operand lines of this V_PK_MOV_B32 are
// not visible in this excerpt.
2067 if (ST.hasPkMovB32() && Lo == Hi && isInlineConstant(Lo) &&
2068 PkMovRC->contains(Dst)) {
2069 BuildMI(MBB, MI, DL, PkMovDesc, Dst)
2071 .addImm(Lo.getSExtValue())
2073 .addImm(Lo.getSExtValue())
2074 .addImm(0) // op_sel_lo
2075 .addImm(0) // op_sel_hi
2076 .addImm(0) // neg_lo
2077 .addImm(0) // neg_hi
2078 .addImm(0); // clamp
2079 } else {
2080 BuildMI(MBB, MI, DL, get(AMDGPU::V_MOV_B32_e32), DstLo)
2081 .addImm(Lo.getSExtValue());
2082 BuildMI(MBB, MI, DL, get(AMDGPU::V_MOV_B32_e32), DstHi)
2083 .addImm(Hi.getSExtValue());
2084 }
2085 } else {
2086 assert(SrcOp.isReg());
// Register source: use V_PK_MOV_B32 unless the source is an AGPR (or the
// subtarget lacks the instruction), in which case copy the two halves.
2087 if (ST.hasPkMovB32() &&
2088 !RI.isAGPR(MBB.getParent()->getRegInfo(), SrcOp.getReg())) {
2089 BuildMI(MBB, MI, DL, get(AMDGPU::V_PK_MOV_B32), Dst)
2090 .addImm(SISrcMods::OP_SEL_1) // src0_mod
2091 .addReg(SrcOp.getReg())
2093 .addReg(SrcOp.getReg())
2094 .addImm(0) // op_sel_lo
2095 .addImm(0) // op_sel_hi
2096 .addImm(0) // neg_lo
2097 .addImm(0) // neg_hi
2098 .addImm(0); // clamp
2099 } else {
2100 BuildMI(MBB, MI, DL, get(AMDGPU::V_MOV_B32_e32), DstLo)
2101 .addReg(RI.getSubReg(SrcOp.getReg(), AMDGPU::sub0));
2102 BuildMI(MBB, MI, DL, get(AMDGPU::V_MOV_B32_e32), DstHi)
2103 .addReg(RI.getSubReg(SrcOp.getReg(), AMDGPU::sub1));
2104 }
2105 }
2106 MI.eraseFromParent();
2107 break;
2108 }
// NOTE(review): the statement that performs the DPP expansion is on a line
// missing from this excerpt.
2109 case AMDGPU::V_MOV_B64_DPP_PSEUDO: {
2111 break;
2112 }
2113 case AMDGPU::S_MOV_B64_IMM_PSEUDO: {
2114 const MachineOperand &SrcOp = MI.getOperand(1);
2115 assert(!SrcOp.isFPImm());
2116
// A plain S_MOV_B64 suffices when 64-bit literals are supported, or when the
// immediate fits in 32 bits / is an inline constant. Otherwise the value is
// materialized as two S_MOV_B32 into the low and high sub-registers.
2117 if (ST.has64BitLiterals()) {
2118 MI.setDesc(get(AMDGPU::S_MOV_B64));
2119 break;
2120 }
2121
2122 APInt Imm(64, SrcOp.getImm());
2123 if (Imm.isIntN(32) || isInlineConstant(Imm)) {
2124 MI.setDesc(get(AMDGPU::S_MOV_B64));
2125 break;
2126 }
2127
2128 Register Dst = MI.getOperand(0).getReg();
2129 Register DstLo = RI.getSubReg(Dst, AMDGPU::sub0);
2130 Register DstHi = RI.getSubReg(Dst, AMDGPU::sub1);
2131
2132 APInt Lo(32, Imm.getLoBits(32).getZExtValue());
2133 APInt Hi(32, Imm.getHiBits(32).getZExtValue());
2134 BuildMI(MBB, MI, DL, get(AMDGPU::S_MOV_B32), DstLo)
2135 .addImm(Lo.getSExtValue());
2136 BuildMI(MBB, MI, DL, get(AMDGPU::S_MOV_B32), DstHi)
2137 .addImm(Hi.getSExtValue());
2138 MI.eraseFromParent();
2139 break;
2140 }
2141 case AMDGPU::V_SET_INACTIVE_B32: {
2142 // Lower V_SET_INACTIVE_B32 to V_CNDMASK_B32.
// The pseudo's operands are re-ordered for V_CNDMASK: operands 3 and 4 come
// first, then 1 and 2, and finally operand 5.
2143 Register DstReg = MI.getOperand(0).getReg();
2144 BuildMI(MBB, MI, DL, get(AMDGPU::V_CNDMASK_B32_e64), DstReg)
2145 .add(MI.getOperand(3))
2146 .add(MI.getOperand(4))
2147 .add(MI.getOperand(1))
2148 .add(MI.getOperand(2))
2149 .add(MI.getOperand(5));
2150 MI.eraseFromParent();
2151 break;
2152 }
2153 case AMDGPU::V_INDIRECT_REG_WRITE_MOVREL_B32_V1:
2154 case AMDGPU::V_INDIRECT_REG_WRITE_MOVREL_B32_V2:
2155 case AMDGPU::V_INDIRECT_REG_WRITE_MOVREL_B32_V3:
2156 case AMDGPU::V_INDIRECT_REG_WRITE_MOVREL_B32_V4:
2157 case AMDGPU::V_INDIRECT_REG_WRITE_MOVREL_B32_V5:
2158 case AMDGPU::V_INDIRECT_REG_WRITE_MOVREL_B32_V6:
2159 case AMDGPU::V_INDIRECT_REG_WRITE_MOVREL_B32_V7:
2160 case AMDGPU::V_INDIRECT_REG_WRITE_MOVREL_B32_V8:
2161 case AMDGPU::V_INDIRECT_REG_WRITE_MOVREL_B32_V9:
2162 case AMDGPU::V_INDIRECT_REG_WRITE_MOVREL_B32_V10:
2163 case AMDGPU::V_INDIRECT_REG_WRITE_MOVREL_B32_V11:
2164 case AMDGPU::V_INDIRECT_REG_WRITE_MOVREL_B32_V12:
2165 case AMDGPU::V_INDIRECT_REG_WRITE_MOVREL_B32_V16:
2166 case AMDGPU::V_INDIRECT_REG_WRITE_MOVREL_B32_V32:
2167 case AMDGPU::S_INDIRECT_REG_WRITE_MOVREL_B32_V1:
2168 case AMDGPU::S_INDIRECT_REG_WRITE_MOVREL_B32_V2:
2169 case AMDGPU::S_INDIRECT_REG_WRITE_MOVREL_B32_V3:
2170 case AMDGPU::S_INDIRECT_REG_WRITE_MOVREL_B32_V4:
2171 case AMDGPU::S_INDIRECT_REG_WRITE_MOVREL_B32_V5:
2172 case AMDGPU::S_INDIRECT_REG_WRITE_MOVREL_B32_V6:
2173 case AMDGPU::S_INDIRECT_REG_WRITE_MOVREL_B32_V7:
2174 case AMDGPU::S_INDIRECT_REG_WRITE_MOVREL_B32_V8:
2175 case AMDGPU::S_INDIRECT_REG_WRITE_MOVREL_B32_V9:
2176 case AMDGPU::S_INDIRECT_REG_WRITE_MOVREL_B32_V10:
2177 case AMDGPU::S_INDIRECT_REG_WRITE_MOVREL_B32_V11:
2178 case AMDGPU::S_INDIRECT_REG_WRITE_MOVREL_B32_V12:
2179 case AMDGPU::S_INDIRECT_REG_WRITE_MOVREL_B32_V16:
2180 case AMDGPU::S_INDIRECT_REG_WRITE_MOVREL_B32_V32:
2181 case AMDGPU::S_INDIRECT_REG_WRITE_MOVREL_B64_V1:
2182 case AMDGPU::S_INDIRECT_REG_WRITE_MOVREL_B64_V2:
2183 case AMDGPU::S_INDIRECT_REG_WRITE_MOVREL_B64_V4:
2184 case AMDGPU::S_INDIRECT_REG_WRITE_MOVREL_B64_V8:
2185 case AMDGPU::S_INDIRECT_REG_WRITE_MOVREL_B64_V16: {
// Choose the MOVRELD flavour from the register class of the element operand
// (operand 2): vector form for VGPR elements, otherwise the 32- or 64-bit
// scalar form depending on the element size.
2186 const TargetRegisterClass *EltRC = getOpRegClass(MI, 2);
2187
2188 unsigned Opc;
2189 if (RI.hasVGPRs(EltRC)) {
2190 Opc = AMDGPU::V_MOVRELD_B32_e32;
2191 } else {
2192 Opc = RI.getRegSizeInBits(*EltRC) == 64 ? AMDGPU::S_MOVRELD_B64
2193 : AMDGPU::S_MOVRELD_B32;
2194 }
2195
2196 const MCInstrDesc &OpDesc = get(Opc);
2197 Register VecReg = MI.getOperand(0).getReg();
2198 bool IsUndef = MI.getOperand(1).isUndef();
2199 unsigned SubReg = MI.getOperand(3).getImm();
2200 assert(VecReg == MI.getOperand(1).getReg());
2201
// NOTE(review): the declaration receiving this builder (`MIB`, used below)
// and one operand line of the chain are not visible in this excerpt.
2203 BuildMI(MBB, MI, DL, OpDesc)
2204 .addReg(RI.getSubReg(VecReg, SubReg), RegState::Undef)
2205 .add(MI.getOperand(2))
2207 .addReg(VecReg, RegState::Implicit | getUndefRegState(IsUndef));
2208
// Tie the implicit def and implicit use of the vector register that follow
// the opcode's own explicit and implicit operands.
2209 const int ImpDefIdx =
2210 OpDesc.getNumOperands() + OpDesc.implicit_uses().size();
2211 const int ImpUseIdx = ImpDefIdx + 1;
2212 MIB->tieOperands(ImpDefIdx, ImpUseIdx);
2213 MI.eraseFromParent();
2214 break;
2215 }
2216 case AMDGPU::V_INDIRECT_REG_WRITE_GPR_IDX_B32_V1:
2217 case AMDGPU::V_INDIRECT_REG_WRITE_GPR_IDX_B32_V2:
2218 case AMDGPU::V_INDIRECT_REG_WRITE_GPR_IDX_B32_V3:
2219 case AMDGPU::V_INDIRECT_REG_WRITE_GPR_IDX_B32_V4:
2220 case AMDGPU::V_INDIRECT_REG_WRITE_GPR_IDX_B32_V5:
2221 case AMDGPU::V_INDIRECT_REG_WRITE_GPR_IDX_B32_V6:
2222 case AMDGPU::V_INDIRECT_REG_WRITE_GPR_IDX_B32_V7:
2223 case AMDGPU::V_INDIRECT_REG_WRITE_GPR_IDX_B32_V8:
2224 case AMDGPU::V_INDIRECT_REG_WRITE_GPR_IDX_B32_V9:
2225 case AMDGPU::V_INDIRECT_REG_WRITE_GPR_IDX_B32_V10:
2226 case AMDGPU::V_INDIRECT_REG_WRITE_GPR_IDX_B32_V11:
2227 case AMDGPU::V_INDIRECT_REG_WRITE_GPR_IDX_B32_V12:
2228 case AMDGPU::V_INDIRECT_REG_WRITE_GPR_IDX_B32_V16:
2229 case AMDGPU::V_INDIRECT_REG_WRITE_GPR_IDX_B32_V32: {
// Indexed write using VGPR index mode: emit
//   S_SET_GPR_IDX_ON ; V_MOV_B32_indirect_write ; S_SET_GPR_IDX_OFF
// and bundle the three instructions together.
2230 assert(ST.useVGPRIndexMode());
2231 Register VecReg = MI.getOperand(0).getReg();
2232 bool IsUndef = MI.getOperand(1).isUndef();
2233 MachineOperand &Idx = MI.getOperand(3);
2234 Register SubReg = MI.getOperand(4).getImm();
2235
// NOTE(review): the second operand of S_SET_GPR_IDX_ON is on a line missing
// from this excerpt.
2236 MachineInstr *SetOn = BuildMI(MBB, MI, DL, get(AMDGPU::S_SET_GPR_IDX_ON))
2237 .add(Idx)
2239 SetOn->getOperand(3).setIsUndef();
2240
2241 const MCInstrDesc &OpDesc = get(AMDGPU::V_MOV_B32_indirect_write);
// NOTE(review): the declaration receiving this builder (`MIB`, used below)
// and one operand line of the chain are not visible in this excerpt.
2243 BuildMI(MBB, MI, DL, OpDesc)
2244 .addReg(RI.getSubReg(VecReg, SubReg), RegState::Undef)
2245 .add(MI.getOperand(2))
2247 .addReg(VecReg, RegState::Implicit | getUndefRegState(IsUndef));
2248
2249 const int ImpDefIdx =
2250 OpDesc.getNumOperands() + OpDesc.implicit_uses().size();
2251 const int ImpUseIdx = ImpDefIdx + 1;
2252 MIB->tieOperands(ImpDefIdx, ImpUseIdx);
2253
2254 MachineInstr *SetOff = BuildMI(MBB, MI, DL, get(AMDGPU::S_SET_GPR_IDX_OFF));
2255
2256 finalizeBundle(MBB, SetOn->getIterator(), std::next(SetOff->getIterator()));
2257
2258 MI.eraseFromParent();
2259 break;
2260 }
2261 case AMDGPU::V_INDIRECT_REG_READ_GPR_IDX_B32_V1:
2262 case AMDGPU::V_INDIRECT_REG_READ_GPR_IDX_B32_V2:
2263 case AMDGPU::V_INDIRECT_REG_READ_GPR_IDX_B32_V3:
2264 case AMDGPU::V_INDIRECT_REG_READ_GPR_IDX_B32_V4:
2265 case AMDGPU::V_INDIRECT_REG_READ_GPR_IDX_B32_V5:
2266 case AMDGPU::V_INDIRECT_REG_READ_GPR_IDX_B32_V6:
2267 case AMDGPU::V_INDIRECT_REG_READ_GPR_IDX_B32_V7:
2268 case AMDGPU::V_INDIRECT_REG_READ_GPR_IDX_B32_V8:
2269 case AMDGPU::V_INDIRECT_REG_READ_GPR_IDX_B32_V9:
2270 case AMDGPU::V_INDIRECT_REG_READ_GPR_IDX_B32_V10:
2271 case AMDGPU::V_INDIRECT_REG_READ_GPR_IDX_B32_V11:
2272 case AMDGPU::V_INDIRECT_REG_READ_GPR_IDX_B32_V12:
2273 case AMDGPU::V_INDIRECT_REG_READ_GPR_IDX_B32_V16:
2274 case AMDGPU::V_INDIRECT_REG_READ_GPR_IDX_B32_V32: {
// Indexed read using VGPR index mode; same ON / move / OFF bundle shape as
// the indexed write above, with V_MOV_B32_indirect_read as the move.
2275 assert(ST.useVGPRIndexMode());
2276 Register Dst = MI.getOperand(0).getReg();
2277 Register VecReg = MI.getOperand(1).getReg();
2278 bool IsUndef = MI.getOperand(1).isUndef();
2279 Register SubReg = MI.getOperand(3).getImm();
2280
// NOTE(review): the second operand of S_SET_GPR_IDX_ON is on a line missing
// from this excerpt.
2281 MachineInstr *SetOn = BuildMI(MBB, MI, DL, get(AMDGPU::S_SET_GPR_IDX_ON))
2282 .add(MI.getOperand(2))
2284 SetOn->getOperand(3).setIsUndef();
2285
2286 BuildMI(MBB, MI, DL, get(AMDGPU::V_MOV_B32_indirect_read))
2287 .addDef(Dst)
2288 .addReg(RI.getSubReg(VecReg, SubReg), RegState::Undef)
2289 .addReg(VecReg, RegState::Implicit | getUndefRegState(IsUndef));
2290
2291 MachineInstr *SetOff = BuildMI(MBB, MI, DL, get(AMDGPU::S_SET_GPR_IDX_OFF));
2292
2293 finalizeBundle(MBB, SetOn->getIterator(), std::next(SetOff->getIterator()));
2294
2295 MI.eraseFromParent();
2296 break;
2297 }
2298 case AMDGPU::SI_PC_ADD_REL_OFFSET: {
2299 MachineFunction &MF = *MBB.getParent();
2300 Register Reg = MI.getOperand(0).getReg();
2301 Register RegLo = RI.getSubReg(Reg, AMDGPU::sub0);
2302 Register RegHi = RI.getSubReg(Reg, AMDGPU::sub1);
// Copies of the two offset operands; their offsets are adjusted below before
// they are added to the new instructions.
2303 MachineOperand OpLo = MI.getOperand(1);
2304 MachineOperand OpHi = MI.getOperand(2);
2305
2306 // Create a bundle so these instructions won't be re-ordered by the
2307 // post-RA scheduler.
2308 MIBundleBuilder Bundler(MBB, MI);
2309 Bundler.append(BuildMI(MF, DL, get(AMDGPU::S_GETPC_B64), Reg));
2310
2311 // What we want here is an offset from the value returned by s_getpc (which
2312 // is the address of the s_add_u32 instruction) to the global variable, but
2313 // since the encoding of $symbol starts 4 bytes after the start of the
2314 // s_add_u32 instruction, we end up with an offset that is 4 bytes too
2315 // small. This requires us to add 4 to the global variable offset in order
2316 // to compute the correct address. Similarly for the s_addc_u32 instruction,
2317 // the encoding of $symbol starts 12 bytes after the start of the s_add_u32
2318 // instruction.
2319
// Extra displacement introduced by the optional sign-extension instruction
// that is inserted between s_getpc and s_add_u32.
2320 int64_t Adjust = 0;
2321 if (ST.hasGetPCZeroExtension()) {
2322 // Fix up hardware that does not sign-extend the 48-bit PC value by
2323 // inserting: s_sext_i32_i16 reghi, reghi
2324 Bundler.append(
2325 BuildMI(MF, DL, get(AMDGPU::S_SEXT_I32_I16), RegHi).addReg(RegHi));
2326 Adjust += 4;
2327 }
2328
2329 if (OpLo.isGlobal())
2330 OpLo.setOffset(OpLo.getOffset() + Adjust + 4);
2331 Bundler.append(
2332 BuildMI(MF, DL, get(AMDGPU::S_ADD_U32), RegLo).addReg(RegLo).add(OpLo));
2333
2334 if (OpHi.isGlobal())
2335 OpHi.setOffset(OpHi.getOffset() + Adjust + 12);
2336 Bundler.append(BuildMI(MF, DL, get(AMDGPU::S_ADDC_U32), RegHi)
2337 .addReg(RegHi)
2338 .add(OpHi));
2339
2340 finalizeBundle(MBB, Bundler.begin());
2341
2342 MI.eraseFromParent();
2343 break;
2344 }
2345 case AMDGPU::SI_PC_ADD_REL_OFFSET64: {
// 64-bit variant: s_getpc followed by a single S_ADD_U64, with the global's
// offset bumped by 4.
2346 MachineFunction &MF = *MBB.getParent();
2347 Register Reg = MI.getOperand(0).getReg();
2348 MachineOperand Op = MI.getOperand(1);
2349
2350 // Create a bundle so these instructions won't be re-ordered by the
2351 // post-RA scheduler.
2352 MIBundleBuilder Bundler(MBB, MI);
2353 Bundler.append(BuildMI(MF, DL, get(AMDGPU::S_GETPC_B64), Reg));
2354 if (Op.isGlobal())
2355 Op.setOffset(Op.getOffset() + 4);
2356 Bundler.append(
2357 BuildMI(MF, DL, get(AMDGPU::S_ADD_U64), Reg).addReg(Reg).add(Op));
2358
2359 finalizeBundle(MBB, Bundler.begin());
2360
2361 MI.eraseFromParent();
2362 break;
2363 }
2364 case AMDGPU::ENTER_STRICT_WWM: {
2365 // This only gets its own opcode so that SIPreAllocateWWMRegs can tell when
2366 // Whole Wave Mode is entered.
2367 MI.setDesc(get(LMC.OrSaveExecOpc));
2368 break;
2369 }
2370 case AMDGPU::ENTER_STRICT_WQM: {
2371 // This only gets its own opcode so that SIPreAllocateWWMRegs can tell when
2372 // STRICT_WQM is entered.
// Copy EXEC into the pseudo's destination, then apply the WQM opcode to EXEC.
2373 BuildMI(MBB, MI, DL, get(LMC.MovOpc), MI.getOperand(0).getReg())
2374 .addReg(LMC.ExecReg);
2375 BuildMI(MBB, MI, DL, get(LMC.WQMOpc), LMC.ExecReg).addReg(LMC.ExecReg);
2376
2377 MI.eraseFromParent();
2378 break;
2379 }
2380 case AMDGPU::EXIT_STRICT_WWM:
2381 case AMDGPU::EXIT_STRICT_WQM: {
2382 // This only gets its own opcode so that SIPreAllocateWWMRegs can tell when
2383 // WWM/STRICT_WQM is exited.
2384 MI.setDesc(get(LMC.MovOpc));
2385 break;
2386 }
2387 case AMDGPU::SI_RETURN: {
2388 const MachineFunction *MF = MBB.getParent();
2389 const GCNSubtarget &ST = MF->getSubtarget<GCNSubtarget>();
2390 const SIRegisterInfo *TRI = ST.getRegisterInfo();
2391 // Hiding the return address use with SI_RETURN may lead to extra kills in
2392 // the function and missing live-ins. We are fine in practice because callee
2393 // saved register handling ensures the register value is restored before
2394 // RET, but we need the undef flag here to appease the MachineVerifier
2395 // liveness checks.
// NOTE(review): the declaration receiving this builder (`MIB`, used below) is
// on a line missing from this excerpt.
2397 BuildMI(MBB, MI, DL, get(AMDGPU::S_SETPC_B64_return))
2398 .addReg(TRI->getReturnAddressReg(*MF), RegState::Undef);
2399
2400 MIB.copyImplicitOps(MI);
2401 MI.eraseFromParent();
2402 break;
2403 }
2404
// Both the unsigned and the signed 32x32->64 multiply pseudos are rewritten
// to S_MUL_U64.
2405 case AMDGPU::S_MUL_U64_U32_PSEUDO:
2406 case AMDGPU::S_MUL_I64_I32_PSEUDO:
2407 MI.setDesc(get(AMDGPU::S_MUL_U64));
2408 break;
2409
2410 case AMDGPU::S_GETPC_B64_pseudo:
2411 MI.setDesc(get(AMDGPU::S_GETPC_B64));
2412 if (ST.hasGetPCZeroExtension()) {
2413 Register Dst = MI.getOperand(0).getReg();
2414 Register DstHi = RI.getSubReg(Dst, AMDGPU::sub1);
2415 // Fix up hardware that does not sign-extend the 48-bit PC value by
2416 // inserting: s_sext_i32_i16 dsthi, dsthi
// The fix-up is inserted after MI (note std::next), not before it.
2417 BuildMI(MBB, std::next(MI.getIterator()), DL, get(AMDGPU::S_SEXT_I32_I16),
2418 DstHi)
2419 .addReg(DstHi);
2420 }
2421 break;
2422
2423 case AMDGPU::V_MAX_BF16_PSEUDO_e64: {
// Rewritten to the packed max instruction: append zero op_sel/neg_lo/neg_hi
// operands and set OP_SEL_1 in both source-modifier operands.
2424 assert(ST.hasBF16PackedInsts());
2425 MI.setDesc(get(AMDGPU::V_PK_MAX_NUM_BF16));
2426 MI.addOperand(MachineOperand::CreateImm(0)); // op_sel
2427 MI.addOperand(MachineOperand::CreateImm(0)); // neg_lo
2428 MI.addOperand(MachineOperand::CreateImm(0)); // neg_hi
2429 auto Op0 = getNamedOperand(MI, AMDGPU::OpName::src0_modifiers);
2430 Op0->setImm(Op0->getImm() | SISrcMods::OP_SEL_1);
2431 auto Op1 = getNamedOperand(MI, AMDGPU::OpName::src1_modifiers);
2432 Op1->setImm(Op1->getImm() | SISrcMods::OP_SEL_1);
2433 break;
2434 }
2435
2436 case AMDGPU::GET_STACK_BASE:
2437 // The stack starts at offset 0 unless we need to reserve some space at the
2438 // bottom.
2439 if (ST.getFrameLowering()->mayReserveScratchForCWSR(*MBB.getParent())) {
2440 // When CWSR is used in dynamic VGPR mode, the trap handler needs to save
2441 // some of the VGPRs. The size of the required scratch space has already
2442 // been computed by prolog epilog insertion.
2443 const SIMachineFunctionInfo *MFI =
2444 MBB.getParent()->getInfo<SIMachineFunctionInfo>();
2445 unsigned VGPRSize = MFI->getScratchReservedForDynamicVGPRs();
2446 Register DestReg = MI.getOperand(0).getReg();
// NOTE(review): the hardware-register operand of this S_GETREG_B32 is on
// lines missing from this excerpt.
2447 BuildMI(MBB, MI, DL, get(AMDGPU::S_GETREG_B32), DestReg)
2450 // The MicroEngine ID is 0 for the graphics queue, and 1 or 2 for compute
2451 // (3 is unused, so we ignore it). Unfortunately, S_GETREG doesn't set
2452 // SCC, so we need to check for 0 manually.
2453 BuildMI(MBB, MI, DL, get(AMDGPU::S_CMP_LG_U32)).addImm(0).addReg(DestReg);
2454 // Change the implicit-def of SCC to an explicit use (but first remove
2455 // the dead flag if present).
2456 MI.getOperand(MI.getNumExplicitOperands()).setIsDead(false);
2457 MI.getOperand(MI.getNumExplicitOperands()).setIsUse();
2458 MI.setDesc(get(AMDGPU::S_CMOVK_I32));
2459 MI.addOperand(MachineOperand::CreateImm(VGPRSize));
2460 } else {
2461 MI.setDesc(get(AMDGPU::S_MOV_B32));
2462 MI.addOperand(MachineOperand::CreateImm(0));
2463 MI.removeOperand(
2464 MI.getNumExplicitOperands()); // Drop implicit def of SCC.
2465 }
2466 break;
2467 }
2468
2469 return true;
2470}
2471
2474 unsigned SubIdx, const MachineInstr &Orig,
2475 LaneBitmask UsedLanes) const {
// Re-emits Orig at position I in MBB so that it defines DestReg, shrinking
// the instruction when only part of its result is needed:
//  * a 64-bit scalar move of an immediate becomes a 32-bit move of the one
//    half that is live, and
//  * an 8- or 16-dword scalar load becomes a 4- or 8-dword load when the
//    instruction at I uses exactly one 128/256-bit sub-register of it.
// Every other situation is delegated to TargetInstrInfo::reMaterialize.
// NOTE(review): the head of the signature is not visible in this excerpt.
2476
2477 // Try shrinking the instruction to remat only the part needed for current
2478 // context.
2479 // TODO: Handle more cases.
2480 unsigned Opcode = Orig.getOpcode();
2481 switch (Opcode) {
2482 case AMDGPU::S_MOV_B64:
2483 case AMDGPU::S_MOV_B64_IMM_PSEUDO: {
2484 if (SubIdx != 0)
2485 break;
2486
2487 if (!Orig.getOperand(1).isImm())
2488 break;
2489
2490 // Shrink S_MOV_B64 to S_MOV_B32 when UsedLanes indicates only a single
2491 // 32-bit lane of the 64-bit value is live at the rematerialization point.
2492 if (UsedLanes.all())
2493 break;
2494
2495 // Determine which half of the 64-bit immediate corresponds to the use.
2496 unsigned OrigSubReg = Orig.getOperand(0).getSubReg();
2497 unsigned LoSubReg = RI.composeSubRegIndices(OrigSubReg, AMDGPU::sub0);
2498 unsigned HiSubReg = RI.composeSubRegIndices(OrigSubReg, AMDGPU::sub1);
2499
2500 bool NeedLo = (UsedLanes & RI.getSubRegIndexLaneMask(LoSubReg)).any();
2501 bool NeedHi = (UsedLanes & RI.getSubRegIndexLaneMask(HiSubReg)).any();
2502
2503 if (NeedLo && NeedHi)
2504 break;
2505
// NOTE(review): if neither half is flagged as needed, the selections below
// pick the high half - confirm that this is intended.
2506 int64_t Imm64 = Orig.getOperand(1).getImm();
2507 int32_t Imm32 = NeedLo ? Lo_32(Imm64) : Hi_32(Imm64);
2508
2509 unsigned UseSubReg = NeedLo ? LoSubReg : HiSubReg;
2510
2511 // Emit S_MOV_B32 defining just the needed 32-bit subreg of DestReg.
2512 BuildMI(MBB, I, Orig.getDebugLoc(), get(AMDGPU::S_MOV_B32))
2513 .addReg(DestReg, RegState::Define | RegState::Undef, UseSubReg)
2514 .addImm(Imm32);
2515 return;
2516 }
2517
2518 case AMDGPU::S_LOAD_DWORDX16_IMM:
2519 case AMDGPU::S_LOAD_DWORDX8_IMM: {
2520 if (SubIdx != 0)
2521 break;
2522
2523 if (I == MBB.end())
2524 break;
2525
2526 if (I->isBundled())
2527 break;
2528
2529 // Look for a single use of the register that is also a subreg.
2530 Register RegToFind = Orig.getOperand(0).getReg();
2531 MachineOperand *UseMO = nullptr;
2532 for (auto &CandMO : I->operands()) {
2533 if (!CandMO.isReg() || CandMO.getReg() != RegToFind || CandMO.isDef())
2534 continue;
// A second matching use disqualifies the shrink: reset and stop searching.
2535 if (UseMO) {
2536 UseMO = nullptr;
2537 break;
2538 }
2539 UseMO = &CandMO;
2540 }
2541 if (!UseMO || UseMO->getSubReg() == AMDGPU::NoSubRegister)
2542 break;
2543
// Offset and SubregSize are divided by 8 below before being used as a load
// offset and a memory-operand size - presumably bits converted to bytes;
// confirm against getSubRegIdxOffset/getSubRegIdxSize.
2544 unsigned Offset = RI.getSubRegIdxOffset(UseMO->getSubReg());
2545 unsigned SubregSize = RI.getSubRegIdxSize(UseMO->getSubReg());
2546
2547 MachineFunction *MF = MBB.getParent();
2548 MachineRegisterInfo &MRI = MF->getRegInfo();
2549 assert(MRI.use_nodbg_empty(DestReg) && "DestReg should have no users yet.");
2550
// Only 256- and 128-bit sub-registers have a matching narrower load; any
// other size falls back to the generic path. The -1 initializer is never
// used as an opcode because of the `break` in the final else.
2551 unsigned NewOpcode = -1;
2552 if (SubregSize == 256)
2553 NewOpcode = AMDGPU::S_LOAD_DWORDX8_IMM;
2554 else if (SubregSize == 128)
2555 NewOpcode = AMDGPU::S_LOAD_DWORDX4_IMM;
2556 else
2557 break;
2558
2559 const MCInstrDesc &TID = get(NewOpcode);
2560 const TargetRegisterClass *NewRC =
2561 RI.getAllocatableClass(getRegClass(TID, 0));
2562 MRI.setRegClass(DestReg, NewRC);
2563
// Point the use at the whole (now narrower) DestReg.
2564 UseMO->setReg(DestReg);
2565 UseMO->setSubReg(AMDGPU::NoSubRegister);
2566
2567 // Use a smaller load with the desired size, possibly with updated offset.
2568 MachineInstr *MI = MF->CloneMachineInstr(&Orig);
2569 MI->setDesc(TID);
2570 MI->getOperand(0).setReg(DestReg);
2571 MI->getOperand(0).setSubReg(AMDGPU::NoSubRegister);
2572 if (Offset) {
2573 MachineOperand *OffsetMO = getNamedOperand(*MI, AMDGPU::OpName::offset);
2574 int64_t FinalOffset = OffsetMO->getImm() + Offset / 8;
2575 OffsetMO->setImm(FinalOffset);
2576 }
// Rebuild the memory operands with the reduced access size.
// NOTE(review): the declaration of `NewMMOs` is on a line missing from this
// excerpt.
2578 for (const MachineMemOperand *MemOp : Orig.memoperands())
2579 NewMMOs.push_back(MF->getMachineMemOperand(MemOp, MemOp->getPointerInfo(),
2580 SubregSize / 8));
2581 MI->setMemRefs(*MF, NewMMOs);
2582
2583 MBB.insert(I, MI);
2584 return;
2585 }
2586
2587 default:
2588 break;
2589 }
2590
// No shrinking applied: use the generic rematerialization.
2591 TargetInstrInfo::reMaterialize(MBB, I, DestReg, SubIdx, Orig, UsedLanes);
2592}
2593
// Expands a V_MOV_B64_DPP_PSEUDO.
//
// Returns the resulting instruction(s) as a pair:
//  * (&MI, nullptr) when MI could be turned into V_MOV_B64_dpp in place;
//  * otherwise the two V_MOV_B32_dpp instructions created for sub0 and sub1,
//    in which case MI has been erased.
// NOTE(review): the line carrying the function name/parameters and one line
// of the first condition are not visible in this excerpt.
2594std::pair<MachineInstr*, MachineInstr*>
2596 assert (MI.getOpcode() == AMDGPU::V_MOV_B64_DPP_PSEUDO);
2597
2598 if (ST.hasVMovB64Inst() && ST.hasFeature(AMDGPU::FeatureDPALU_DPP) &&
2600 ST, getNamedOperand(MI, AMDGPU::OpName::dpp_ctrl)->getImm())) {
2601 MI.setDesc(get(AMDGPU::V_MOV_B64_dpp));
2602 return std::pair(&MI, nullptr);
2603 }
2604
2605 MachineBasicBlock &MBB = *MI.getParent();
2606 DebugLoc DL = MBB.findDebugLoc(MI);
2607 MachineFunction *MF = MBB.getParent();
2608 MachineRegisterInfo &MRI = MF->getRegInfo();
2609 Register Dst = MI.getOperand(0).getReg();
// Part is the index (0 = low, 1 = high) of the half currently being built.
2610 unsigned Part = 0;
2611 MachineInstr *Split[2];
2612
2613 for (auto Sub : { AMDGPU::sub0, AMDGPU::sub1 }) {
2614 auto MovDPP = BuildMI(MBB, MI, DL, get(AMDGPU::V_MOV_B32_dpp));
// A physical destination is written through its sub-register directly; a
// virtual one gets a fresh 32-bit VGPR per half, joined by the REG_SEQUENCE
// emitted after the loop.
2615 if (Dst.isPhysical()) {
2616 MovDPP.addDef(RI.getSubReg(Dst, Sub));
2617 } else {
2618 assert(MRI.isSSA());
2619 auto Tmp = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
2620 MovDPP.addDef(Tmp);
2621 }
2622
2623 for (unsigned I = 1; I <= 2; ++I) { // old and src operands.
2624 const MachineOperand &SrcOp = MI.getOperand(I);
2625 assert(!SrcOp.isFPImm());
2626 if (SrcOp.isImm()) {
// Select the 32-bit slice of the immediate that belongs to this half.
2627 APInt Imm(64, SrcOp.getImm());
2628 Imm.ashrInPlace(Part * 32);
2629 MovDPP.addImm(Imm.getLoBits(32).getZExtValue());
2630 } else {
2631 assert(SrcOp.isReg());
2632 Register Src = SrcOp.getReg();
2633 if (Src.isPhysical())
2634 MovDPP.addReg(RI.getSubReg(Src, Sub));
2635 else
2636 MovDPP.addReg(Src, getUndefRegState(SrcOp.isUndef()), Sub);
2637 }
2638 }
2639
// The remaining explicit operands (index 3 onwards) are copied unchanged as
// immediates to both halves.
2640 for (const MachineOperand &MO : llvm::drop_begin(MI.explicit_operands(), 3))
2641 MovDPP.addImm(MO.getImm());
2642
2643 Split[Part] = MovDPP;
2644 ++Part;
2645 }
2646
2647 if (Dst.isVirtual())
2648 BuildMI(MBB, MI, DL, get(AMDGPU::REG_SEQUENCE), Dst)
2649 .addReg(Split[0]->getOperand(0).getReg())
2650 .addImm(AMDGPU::sub0)
2651 .addReg(Split[1]->getOperand(0).getReg())
2652 .addImm(AMDGPU::sub1);
2653
2654 MI.eraseFromParent();
2655 return std::pair(Split[0], Split[1]);
2656}
2657
// Reports MI as a copy only when it is a WWM_COPY: the result then pairs
// operand 0 (destination) with operand 1 (source). Any other opcode yields
// std::nullopt.
// NOTE(review): the line carrying the function name/parameters is not
// visible in this excerpt.
2658std::optional<DestSourcePair>
2660 if (MI.getOpcode() == AMDGPU::WWM_COPY)
2661 return DestSourcePair{MI.getOperand(0), MI.getOperand(1)};
2662
2663 return std::nullopt;
2664}
2665
2667 AMDGPU::OpName Src0OpName,
2668 MachineOperand &Src1,
2669 AMDGPU::OpName Src1OpName) const {
// Exchanges the immediate values of the two named operands of MI
// (Src0OpName and Src1OpName).
// Returns false, changing nothing, when MI has no operand named Src0OpName;
// returns true after a successful swap.
// NOTE(review): the head of the signature is not visible in this excerpt.
2670 MachineOperand *Src0Mods = getNamedOperand(MI, Src0OpName);
2671 if (!Src0Mods)
2672 return false;
2673
2674 MachineOperand *Src1Mods = getNamedOperand(MI, Src1OpName);
2675 assert(Src1Mods &&
2676 "All commutable instructions have both src0 and src1 modifiers");
2677
2678 int Src0ModsVal = Src0Mods->getImm();
2679 int Src1ModsVal = Src1Mods->getImm();
2680
2681 Src1Mods->setImm(Src0ModsVal);
2682 Src0Mods->setImm(Src1ModsVal);
2683 return true;
2684}
2685
2687 MachineOperand &RegOp,
2688 MachineOperand &NonRegOp) {
// Swaps a register operand (RegOp) with a non-register operand (NonRegOp) in
// place: RegOp takes over NonRegOp's immediate / frame index / global
// address, and NonRegOp becomes the register.
// Returns &MI on success, or nullptr (with nothing modified) when NonRegOp is
// of any other operand kind.
// NOTE(review): the head of the signature is not visible in this excerpt.
//
// Capture the register's state first; RegOp is overwritten below.
2689 Register Reg = RegOp.getReg();
2690 unsigned SubReg = RegOp.getSubReg();
2691 bool IsKill = RegOp.isKill();
2692 bool IsDead = RegOp.isDead();
2693 bool IsUndef = RegOp.isUndef();
2694 bool IsDebug = RegOp.isDebug();
2695
2696 if (NonRegOp.isImm())
2697 RegOp.ChangeToImmediate(NonRegOp.getImm());
2698 else if (NonRegOp.isFI())
2699 RegOp.ChangeToFrameIndex(NonRegOp.getIndex());
2700 else if (NonRegOp.isGlobal()) {
2701 RegOp.ChangeToGA(NonRegOp.getGlobal(), NonRegOp.getOffset(),
2702 NonRegOp.getTargetFlags());
2703 } else
2704 return nullptr;
2705
2706 // Make sure we don't reinterpret a subreg index in the target flags.
2707 RegOp.setTargetFlags(NonRegOp.getTargetFlags());
2708
// The two literal `false` arguments are presumably the isDef/isImp flags -
// confirm against MachineOperand::ChangeToRegister.
2709 NonRegOp.ChangeToRegister(Reg, false, false, IsKill, IsDead, IsUndef, IsDebug);
2710 NonRegOp.setSubReg(SubReg);
2711
2712 return &MI;
2713}
2714
2716 MachineOperand &NonRegOp1,
2717 MachineOperand &NonRegOp2) {
// Swaps two immediate operands in place: both the immediate values and the
// target flags are exchanged. Always returns &MI.
// NOTE(review): the head of the signature is not visible in this excerpt.
2718 unsigned TargetFlags = NonRegOp1.getTargetFlags();
2719 int64_t NonRegVal = NonRegOp1.getImm();
2720
2721 NonRegOp1.setImm(NonRegOp2.getImm());
2722 NonRegOp2.setImm(NonRegVal);
2723 NonRegOp1.setTargetFlags(NonRegOp2.getTargetFlags());
2724 NonRegOp2.setTargetFlags(TargetFlags);
2725 return &MI;
2726}
2727
// Returns true if the operands of MI at OpIdx0 and OpIdx1 may be exchanged
// without producing an illegal instruction.
//
// The checks run in this order:
//  1. For VALU instructions, a non-register operand sitting in src0 must be
//     an inline constant for the slot it would move to.
//  2. If a register operand would move into a slot other than src0, both
//     operands must be legal in their new slots (a slot without a register
//     class only accepts the move when its operand type is OPERAND_UNKNOWN).
//  3. Otherwise the operand at OpIdx0 must be a legal immediate for OpIdx1.
2728bool SIInstrInfo::isLegalToSwap(const MachineInstr &MI, unsigned OpIdx0,
2729 unsigned OpIdx1) const {
2730 const MCInstrDesc &InstDesc = MI.getDesc();
2731 const MCOperandInfo &OpInfo0 = InstDesc.operands()[OpIdx0];
2732 const MCOperandInfo &OpInfo1 = InstDesc.operands()[OpIdx1];
2733
2734 unsigned Opc = MI.getOpcode();
2735 int Src0Idx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::src0);
2736
2737 const MachineOperand &MO0 = MI.getOperand(OpIdx0);
2738 const MachineOperand &MO1 = MI.getOperand(OpIdx1);
2739
2740 // Swap doesn't breach constant bus or literal limits
2741 // It may move literal to position other than src0, this is not allowed
2742 // pre-gfx10 However, most test cases need literals in Src0 for VOP
2743 // FIXME: After gfx9, literal can be in place other than Src0
2744 if (isVALU(MI, /*AllowLDSDMA=*/true)) {
// Each operand is tested against the operand info of the slot it would
// occupy after the swap (MO0 vs OpInfo1, MO1 vs OpInfo0).
2745 if ((int)OpIdx0 == Src0Idx && !MO0.isReg() &&
2746 !isInlineConstant(MO0, OpInfo1))
2747 return false;
2748 if ((int)OpIdx1 == Src0Idx && !MO1.isReg() &&
2749 !isInlineConstant(MO1, OpInfo0))
2750 return false;
2751 }
2752
// MO0 is a register that would land in OpIdx1, which is not src0.
2753 if ((int)OpIdx1 != Src0Idx && MO0.isReg()) {
2754 if (OpInfo1.RegClass == -1)
2755 return OpInfo1.OperandType == MCOI::OPERAND_UNKNOWN;
2756 return isLegalRegOperand(MI, OpIdx1, MO0) &&
2757 (!MO1.isReg() || isLegalRegOperand(MI, OpIdx0, MO1));
2758 }
// Mirror case: MO1 is a register that would land in OpIdx0, which is not src0.
2759 if ((int)OpIdx0 != Src0Idx && MO1.isReg()) {
2760 if (OpInfo0.RegClass == -1)
2761 return OpInfo0.OperandType == MCOI::OPERAND_UNKNOWN;
2762 return (!MO0.isReg() || isLegalRegOperand(MI, OpIdx1, MO0)) &&
2763 isLegalRegOperand(MI, OpIdx0, MO1);
2764 }
2765
2766 // No need to check 64-bit literals since swapping does not bring new
2767 // 64-bit literals into current instruction to fold to 32-bit
2768
2769 return isImmOperandLegal(MI, OpIdx1, MO0);
2770}
2771
2773 unsigned Src0Idx,
2774 unsigned Src1Idx) const {
// Commutes the src0/src1 operands of MI in place.
//
// Returns the commuted instruction, or nullptr when the opcode has no
// commuted form, the swap is not legal, or the operand kinds are not
// supported. On success the src0/src1 modifier and sel operands are swapped
// as well and the opcode is replaced by the commuted opcode.
// NOTE(review): the head of the signature is not visible in this excerpt.
2775 assert(!NewMI && "this should never be used");
2776
2777 unsigned Opc = MI.getOpcode();
2778 int CommutedOpcode = commuteOpcode(Opc);
2779 if (CommutedOpcode == -1)
2780 return nullptr;
2781
// Normalize so that Src0Idx is the lower index.
2782 if (Src0Idx > Src1Idx)
2783 std::swap(Src0Idx, Src1Idx);
2784
2785 assert(AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::src0) ==
2786 static_cast<int>(Src0Idx) &&
2787 AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::src1) ==
2788 static_cast<int>(Src1Idx) &&
2789 "inconsistency with findCommutedOpIndices");
2790
2791 if (!isLegalToSwap(MI, Src0Idx, Src1Idx))
2792 return nullptr;
2793
// Dispatch on the operand kinds: reg/reg uses the generic implementation,
// reg/non-reg and imm/imm use the local swap helpers.
2794 MachineInstr *CommutedMI = nullptr;
2795 MachineOperand &Src0 = MI.getOperand(Src0Idx);
2796 MachineOperand &Src1 = MI.getOperand(Src1Idx);
2797 if (Src0.isReg() && Src1.isReg()) {
2798 // Be sure to copy the source modifiers to the right place.
2799 CommutedMI =
2800 TargetInstrInfo::commuteInstructionImpl(MI, NewMI, Src0Idx, Src1Idx);
2801 } else if (Src0.isReg() && !Src1.isReg()) {
2802 CommutedMI = swapRegAndNonRegOperand(MI, Src0, Src1);
2803 } else if (!Src0.isReg() && Src1.isReg()) {
2804 CommutedMI = swapRegAndNonRegOperand(MI, Src1, Src0);
2805 } else if (Src0.isImm() && Src1.isImm()) {
2806 CommutedMI = swapImmOperands(MI, Src0, Src1);
2807 } else {
2808 // FIXME: Found two non registers to commute. This does happen.
2809 return nullptr;
2810 }
2811
2812 if (CommutedMI) {
2813 swapSourceModifiers(MI, Src0, AMDGPU::OpName::src0_modifiers,
2814 Src1, AMDGPU::OpName::src1_modifiers);
2815
2816 swapSourceModifiers(MI, Src0, AMDGPU::OpName::src0_sel, Src1,
2817 AMDGPU::OpName::src1_sel);
2818
2819 CommutedMI->setDesc(get(CommutedOpcode));
2820 }
2821
2822 return CommutedMI;
2823}
2824
2825// This needs to be implemented because the source modifiers may be inserted
2826// between the true commutable operands, and the base
2827// TargetInstrInfo::commuteInstruction uses it.
2829 unsigned &SrcOpIdx0,
2830 unsigned &SrcOpIdx1) const {
2831 return findCommutedOpIndices(MI.getDesc(), SrcOpIdx0, SrcOpIdx1);
2832}
2833
2835 unsigned &SrcOpIdx0,
2836 unsigned &SrcOpIdx1) const {
2837 if (!Desc.isCommutable())
2838 return false;
2839
2840 unsigned Opc = Desc.getOpcode();
2841 int Src0Idx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::src0);
2842 if (Src0Idx == -1)
2843 return false;
2844
2845 int Src1Idx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::src1);
2846 if (Src1Idx == -1)
2847 return false;
2848
2849 return fixCommutedOpIndices(SrcOpIdx0, SrcOpIdx1, Src0Idx, Src1Idx);
2850}
2851
2853 int64_t BrOffset) const {
2854 // BranchRelaxation should never have to check s_setpc_b64 or s_add_pc_i64
2855 // because its dest block is unanalyzable.
2856 assert(isSOPP(BranchOp) || isSOPK(BranchOp));
2857
2858 // Convert to dwords.
2859 BrOffset /= 4;
2860
2861 // The branch instructions do PC += signext(SIMM16 * 4) + 4, so the offset is
2862 // from the next instruction.
2863 BrOffset -= 1;
2864
2865 return isIntN(BranchOffsetBits, BrOffset);
2866}
2867
2870 return MI.getOperand(0).getMBB();
2871}
2872
2874 for (const MachineInstr &MI : MBB->terminators()) {
2875 if (MI.getOpcode() == AMDGPU::SI_IF || MI.getOpcode() == AMDGPU::SI_ELSE ||
2876 MI.getOpcode() == AMDGPU::SI_LOOP)
2877 return true;
2878 }
2879 return false;
2880}
2881
2883 MachineBasicBlock &DestBB,
2884 MachineBasicBlock &RestoreBB,
2885 const DebugLoc &DL, int64_t BrOffset,
2886 RegScavenger *RS) const {
2887 assert(MBB.empty() &&
2888 "new block should be inserted for expanding unconditional branch");
2889 assert(MBB.pred_size() == 1);
2890 assert(RestoreBB.empty() &&
2891 "restore block should be inserted for restoring clobbered registers");
2892
2893 MachineFunction *MF = MBB.getParent();
2894 MachineRegisterInfo &MRI = MF->getRegInfo();
2896 auto I = MBB.end();
2897 auto &MCCtx = MF->getContext();
2898
2899 if (ST.useAddPC64Inst()) {
2900 MCSymbol *Offset =
2901 MCCtx.createTempSymbol("offset", /*AlwaysAddSuffix=*/true);
2902 auto AddPC = BuildMI(MBB, I, DL, get(AMDGPU::S_ADD_PC_I64))
2904 MCSymbol *PostAddPCLabel =
2905 MCCtx.createTempSymbol("post_addpc", /*AlwaysAddSuffix=*/true);
2906 AddPC->setPostInstrSymbol(*MF, PostAddPCLabel);
2907 auto *OffsetExpr = MCBinaryExpr::createSub(
2908 MCSymbolRefExpr::create(DestBB.getSymbol(), MCCtx),
2909 MCSymbolRefExpr::create(PostAddPCLabel, MCCtx), MCCtx);
2910 Offset->setVariableValue(OffsetExpr);
2911 return;
2912 }
2913
2914 assert(RS && "RegScavenger required for long branching");
2915
2916 // FIXME: Virtual register workaround for RegScavenger not working with empty
2917 // blocks.
2918 Register PCReg = MRI.createVirtualRegister(&AMDGPU::SReg_64RegClass);
2919
2920 // Note: as this is used after hazard recognizer we need to apply some hazard
2921 // workarounds directly.
2922 const bool FlushSGPRWrites = (ST.isWave64() && ST.hasVALUMaskWriteHazard()) ||
2923 ST.hasVALUReadSGPRHazard();
2924 auto ApplyHazardWorkarounds = [this, &MBB, &I, &DL, FlushSGPRWrites]() {
2925 if (FlushSGPRWrites)
2926 BuildMI(MBB, I, DL, get(AMDGPU::S_WAITCNT_DEPCTR))
2928 };
2929
2930 // We need to compute the offset relative to the instruction immediately after
2931 // s_getpc_b64. Insert pc arithmetic code before last terminator.
2932 MachineInstr *GetPC = BuildMI(MBB, I, DL, get(AMDGPU::S_GETPC_B64), PCReg);
2933 ApplyHazardWorkarounds();
2934
2935 MCSymbol *PostGetPCLabel =
2936 MCCtx.createTempSymbol("post_getpc", /*AlwaysAddSuffix=*/true);
2937 GetPC->setPostInstrSymbol(*MF, PostGetPCLabel);
2938
2939 MCSymbol *OffsetLo =
2940 MCCtx.createTempSymbol("offset_lo", /*AlwaysAddSuffix=*/true);
2941 MCSymbol *OffsetHi =
2942 MCCtx.createTempSymbol("offset_hi", /*AlwaysAddSuffix=*/true);
2943 BuildMI(MBB, I, DL, get(AMDGPU::S_ADD_U32))
2944 .addReg(PCReg, RegState::Define, AMDGPU::sub0)
2945 .addReg(PCReg, {}, AMDGPU::sub0)
2946 .addSym(OffsetLo, MO_FAR_BRANCH_OFFSET);
2947 BuildMI(MBB, I, DL, get(AMDGPU::S_ADDC_U32))
2948 .addReg(PCReg, RegState::Define, AMDGPU::sub1)
2949 .addReg(PCReg, {}, AMDGPU::sub1)
2950 .addSym(OffsetHi, MO_FAR_BRANCH_OFFSET);
2951 ApplyHazardWorkarounds();
2952
2953 // Insert the indirect branch after the other terminator.
2954 BuildMI(&MBB, DL, get(AMDGPU::S_SETPC_B64))
2955 .addReg(PCReg);
2956
2957 // If a spill is needed for the pc register pair, we need to insert a spill
2958 // restore block right before the destination block, and insert a short branch
2959 // into the old destination block's fallthrough predecessor.
2960 // e.g.:
2961 //
2962 // s_cbranch_scc0 skip_long_branch:
2963 //
2964 // long_branch_bb:
2965 // spill s[8:9]
2966 // s_getpc_b64 s[8:9]
2967 // s_add_u32 s8, s8, restore_bb
2968 // s_addc_u32 s9, s9, 0
2969 // s_setpc_b64 s[8:9]
2970 //
2971 // skip_long_branch:
2972 // foo;
2973 //
2974 // .....
2975 //
2976 // dest_bb_fallthrough_predecessor:
2977 // bar;
2978 // s_branch dest_bb
2979 //
2980 // restore_bb:
2981 // restore s[8:9]
2982 // fallthrough dest_bb
2983 ///
2984 // dest_bb:
2985 // buzz;
2986
2987 Register LongBranchReservedReg = MFI->getLongBranchReservedReg();
2988 Register Scav;
2989
2990 // If we've previously reserved a register for long branches
2991 // avoid running the scavenger and just use those registers
2992 if (LongBranchReservedReg) {
2993 RS->enterBasicBlock(MBB);
2994 Scav = LongBranchReservedReg;
2995 } else {
2996 RS->enterBasicBlockEnd(MBB);
2997 Scav = RS->scavengeRegisterBackwards(
2998 AMDGPU::SReg_64RegClass, MachineBasicBlock::iterator(GetPC),
2999 /* RestoreAfter */ false, 0, /* AllowSpill */ false);
3000 }
3001 if (Scav) {
3002 RS->setRegUsed(Scav);
3003 MRI.replaceRegWith(PCReg, Scav);
3004 MRI.clearVirtRegs();
3005 } else {
3006 // As SGPR needs VGPR to be spilled, we reuse the slot of temporary VGPR for
3007 // SGPR spill.
3008 const GCNSubtarget &ST = MF->getSubtarget<GCNSubtarget>();
3009 const SIRegisterInfo *TRI = ST.getRegisterInfo();
3010 TRI->spillEmergencySGPR(GetPC, RestoreBB, AMDGPU::SGPR0_SGPR1, RS);
3011 MRI.replaceRegWith(PCReg, AMDGPU::SGPR0_SGPR1);
3012 MRI.clearVirtRegs();
3013 }
3014
3015 MCSymbol *DestLabel = Scav ? DestBB.getSymbol() : RestoreBB.getSymbol();
3016 // Now, the distance could be defined.
3018 MCSymbolRefExpr::create(DestLabel, MCCtx),
3019 MCSymbolRefExpr::create(PostGetPCLabel, MCCtx), MCCtx);
3020 // Add offset assignments.
3021 auto *Mask = MCConstantExpr::create(0xFFFFFFFFULL, MCCtx);
3022 OffsetLo->setVariableValue(MCBinaryExpr::createAnd(Offset, Mask, MCCtx));
3023 auto *ShAmt = MCConstantExpr::create(32, MCCtx);
3024 OffsetHi->setVariableValue(MCBinaryExpr::createAShr(Offset, ShAmt, MCCtx));
3025}
3026
3027unsigned SIInstrInfo::getBranchOpcode(SIInstrInfo::BranchPredicate Cond) {
3028 switch (Cond) {
3029 case SIInstrInfo::SCC_TRUE:
3030 return AMDGPU::S_CBRANCH_SCC1;
3031 case SIInstrInfo::SCC_FALSE:
3032 return AMDGPU::S_CBRANCH_SCC0;
3033 case SIInstrInfo::VCCNZ:
3034 return AMDGPU::S_CBRANCH_VCCNZ;
3035 case SIInstrInfo::VCCZ:
3036 return AMDGPU::S_CBRANCH_VCCZ;
3037 case SIInstrInfo::EXECNZ:
3038 return AMDGPU::S_CBRANCH_EXECNZ;
3039 case SIInstrInfo::EXECZ:
3040 return AMDGPU::S_CBRANCH_EXECZ;
3041 default:
3042 llvm_unreachable("invalid branch predicate");
3043 }
3044}
3045
3046SIInstrInfo::BranchPredicate SIInstrInfo::getBranchPredicate(unsigned Opcode) {
3047 switch (Opcode) {
3048 case AMDGPU::S_CBRANCH_SCC0:
3049 return SCC_FALSE;
3050 case AMDGPU::S_CBRANCH_SCC1:
3051 return SCC_TRUE;
3052 case AMDGPU::S_CBRANCH_VCCNZ:
3053 return VCCNZ;
3054 case AMDGPU::S_CBRANCH_VCCZ:
3055 return VCCZ;
3056 case AMDGPU::S_CBRANCH_EXECNZ:
3057 return EXECNZ;
3058 case AMDGPU::S_CBRANCH_EXECZ:
3059 return EXECZ;
3060 default:
3061 return INVALID_BR;
3062 }
3063}
3064
3068 MachineBasicBlock *&FBB,
3070 bool AllowModify) const {
3071 if (I->getOpcode() == AMDGPU::S_BRANCH) {
3072 // Unconditional Branch
3073 TBB = I->getOperand(0).getMBB();
3074 return false;
3075 }
3076
3077 BranchPredicate Pred = getBranchPredicate(I->getOpcode());
3078 if (Pred == INVALID_BR)
3079 return true;
3080
3081 MachineBasicBlock *CondBB = I->getOperand(0).getMBB();
3082 Cond.push_back(MachineOperand::CreateImm(Pred));
3083 Cond.push_back(I->getOperand(1)); // Save the branch register.
3084
3085 ++I;
3086
3087 if (I == MBB.end()) {
3088 // Conditional branch followed by fall-through.
3089 TBB = CondBB;
3090 return false;
3091 }
3092
3093 if (I->getOpcode() == AMDGPU::S_BRANCH) {
3094 TBB = CondBB;
3095 FBB = I->getOperand(0).getMBB();
3096 return false;
3097 }
3098
3099 return true;
3100}
3101
3103 MachineBasicBlock *&FBB,
3105 bool AllowModify) const {
3106 MachineBasicBlock::iterator I = MBB.getFirstTerminator();
3107 auto E = MBB.end();
3108 if (I == E)
3109 return false;
3110
3111 // Skip over the instructions that are artificially terminators for special
3112 // exec management.
3113 while (I != E && !I->isBranch() && !I->isReturn()) {
3114 switch (I->getOpcode()) {
3115 case AMDGPU::S_MOV_B64_term:
3116 case AMDGPU::S_XOR_B64_term:
3117 case AMDGPU::S_OR_B64_term:
3118 case AMDGPU::S_ANDN2_B64_term:
3119 case AMDGPU::S_AND_B64_term:
3120 case AMDGPU::S_AND_SAVEEXEC_B64_term:
3121 case AMDGPU::S_MOV_B32_term:
3122 case AMDGPU::S_XOR_B32_term:
3123 case AMDGPU::S_OR_B32_term:
3124 case AMDGPU::S_ANDN2_B32_term:
3125 case AMDGPU::S_AND_B32_term:
3126 case AMDGPU::S_AND_SAVEEXEC_B32_term:
3127 case AMDGPU::V_CMPX_EQ_U32_nosdst_e32_term:
3128 case AMDGPU::V_CMPX_EQ_U64_nosdst_e32_term:
3129 break;
3130 case AMDGPU::SI_IF:
3131 case AMDGPU::SI_ELSE:
3132 case AMDGPU::SI_KILL_I1_TERMINATOR:
3133 case AMDGPU::SI_KILL_F32_COND_IMM_TERMINATOR:
3134 // FIXME: It's messy that these need to be considered here at all.
3135 return true;
3136 default:
3137 llvm_unreachable("unexpected non-branch terminator inst");
3138 }
3139
3140 ++I;
3141 }
3142
3143 if (I == E)
3144 return false;
3145
3146 return analyzeBranchImpl(MBB, I, TBB, FBB, Cond, AllowModify);
3147}
3148
3150 int *BytesRemoved) const {
3151 unsigned Count = 0;
3152 unsigned RemovedSize = 0;
3153 for (MachineInstr &MI : llvm::make_early_inc_range(MBB.terminators())) {
3154 // Skip over artificial terminators when removing instructions.
3155 if (MI.isBranch() || MI.isReturn()) {
3156 RemovedSize += getInstSizeInBytes(MI);
3157 MI.eraseFromParent();
3158 ++Count;
3159 }
3160 }
3161
3162 if (BytesRemoved)
3163 *BytesRemoved = RemovedSize;
3164
3165 return Count;
3166}
3167
3168// Copy the flags onto the implicit condition register operand.
3170 const MachineOperand &OrigCond) {
3171 CondReg.setIsUndef(OrigCond.isUndef());
3172 CondReg.setIsKill(OrigCond.isKill());
3173}
3174
3177 MachineBasicBlock *FBB,
3179 const DebugLoc &DL,
3180 int *BytesAdded) const {
3181 if (!FBB && Cond.empty()) {
3182 BuildMI(&MBB, DL, get(AMDGPU::S_BRANCH))
3183 .addMBB(TBB);
3184 if (BytesAdded)
3185 *BytesAdded = ST.hasOffset3fBug() ? 8 : 4;
3186 return 1;
3187 }
3188
3189 assert(TBB && Cond[0].isImm());
3190
3191 unsigned Opcode
3192 = getBranchOpcode(static_cast<BranchPredicate>(Cond[0].getImm()));
3193
3194 if (!FBB) {
3195 MachineInstr *CondBr =
3196 BuildMI(&MBB, DL, get(Opcode))
3197 .addMBB(TBB);
3198
3199 // Copy the flags onto the implicit condition register operand.
3200 preserveCondRegFlags(CondBr->getOperand(1), Cond[1]);
3201 fixImplicitOperands(*CondBr);
3202
3203 if (BytesAdded)
3204 *BytesAdded = ST.hasOffset3fBug() ? 8 : 4;
3205 return 1;
3206 }
3207
3208 assert(TBB && FBB);
3209
3210 MachineInstr *CondBr =
3211 BuildMI(&MBB, DL, get(Opcode))
3212 .addMBB(TBB);
3213 fixImplicitOperands(*CondBr);
3214 BuildMI(&MBB, DL, get(AMDGPU::S_BRANCH))
3215 .addMBB(FBB);
3216
3217 MachineOperand &CondReg = CondBr->getOperand(1);
3218 CondReg.setIsUndef(Cond[1].isUndef());
3219 CondReg.setIsKill(Cond[1].isKill());
3220
3221 if (BytesAdded)
3222 *BytesAdded = ST.hasOffset3fBug() ? 16 : 8;
3223
3224 return 2;
3225}
3226
3229 if (Cond.size() != 2) {
3230 return true;
3231 }
3232
3233 if (Cond[0].isImm()) {
3234 Cond[0].setImm(-Cond[0].getImm());
3235 return false;
3236 }
3237
3238 return true;
3239}
3240
3243 Register DstReg, Register TrueReg,
3244 Register FalseReg, int &CondCycles,
3245 int &TrueCycles, int &FalseCycles) const {
3246 switch (Cond[0].getImm()) {
3247 case VCCNZ:
3248 case VCCZ: {
3249 const MachineRegisterInfo &MRI = MBB.getParent()->getRegInfo();
3250 const TargetRegisterClass *RC = MRI.getRegClass(TrueReg);
3251 if (MRI.getRegClass(FalseReg) != RC)
3252 return false;
3253
3254 int NumInsts = AMDGPU::getRegBitWidth(*RC) / 32;
3255 CondCycles = TrueCycles = FalseCycles = NumInsts; // ???
3256
3257 // Limit to equal cost for branch vs. N v_cndmask_b32s.
3258 return RI.hasVGPRs(RC) && NumInsts <= 6;
3259 }
3260 case SCC_TRUE:
3261 case SCC_FALSE: {
3262 // FIXME: We could insert for VGPRs if we could replace the original compare
3263 // with a vector one.
3264 const MachineRegisterInfo &MRI = MBB.getParent()->getRegInfo();
3265 const TargetRegisterClass *RC = MRI.getRegClass(TrueReg);
3266 if (MRI.getRegClass(FalseReg) != RC)
3267 return false;
3268
3269 int NumInsts = AMDGPU::getRegBitWidth(*RC) / 32;
3270
3271 // Multiples of 8 can do s_cselect_b64
3272 if (NumInsts % 2 == 0)
3273 NumInsts /= 2;
3274
3275 CondCycles = TrueCycles = FalseCycles = NumInsts; // ???
3276 return RI.isSGPRClass(RC);
3277 }
3278 default:
3279 return false;
3280 }
3281}
3282
3286 Register TrueReg, Register FalseReg) const {
3287 BranchPredicate Pred = static_cast<BranchPredicate>(Cond[0].getImm());
3288 if (Pred == VCCZ || Pred == SCC_FALSE) {
3289 Pred = static_cast<BranchPredicate>(-Pred);
3290 std::swap(TrueReg, FalseReg);
3291 }
3292
3293 MachineRegisterInfo &MRI = MBB.getParent()->getRegInfo();
3294 const TargetRegisterClass *DstRC = MRI.getRegClass(DstReg);
3295 unsigned DstSize = RI.getRegSizeInBits(*DstRC);
3296
3297 if (DstSize == 32) {
3299 if (Pred == SCC_TRUE) {
3300 Select = BuildMI(MBB, I, DL, get(AMDGPU::S_CSELECT_B32), DstReg)
3301 .addReg(TrueReg)
3302 .addReg(FalseReg);
3303 } else {
3304 // Instruction's operands are backwards from what is expected.
3305 Select = BuildMI(MBB, I, DL, get(AMDGPU::V_CNDMASK_B32_e32), DstReg)
3306 .addReg(FalseReg)
3307 .addReg(TrueReg);
3308 }
3309
3310 preserveCondRegFlags(Select->getOperand(3), Cond[1]);
3311 return;
3312 }
3313
3314 if (DstSize == 64 && Pred == SCC_TRUE) {
3316 BuildMI(MBB, I, DL, get(AMDGPU::S_CSELECT_B64), DstReg)
3317 .addReg(TrueReg)
3318 .addReg(FalseReg);
3319
3320 preserveCondRegFlags(Select->getOperand(3), Cond[1]);
3321 return;
3322 }
3323
3324 static const int16_t Sub0_15[] = {
3325 AMDGPU::sub0, AMDGPU::sub1, AMDGPU::sub2, AMDGPU::sub3,
3326 AMDGPU::sub4, AMDGPU::sub5, AMDGPU::sub6, AMDGPU::sub7,
3327 AMDGPU::sub8, AMDGPU::sub9, AMDGPU::sub10, AMDGPU::sub11,
3328 AMDGPU::sub12, AMDGPU::sub13, AMDGPU::sub14, AMDGPU::sub15,
3329 };
3330
3331 static const int16_t Sub0_15_64[] = {
3332 AMDGPU::sub0_sub1, AMDGPU::sub2_sub3,
3333 AMDGPU::sub4_sub5, AMDGPU::sub6_sub7,
3334 AMDGPU::sub8_sub9, AMDGPU::sub10_sub11,
3335 AMDGPU::sub12_sub13, AMDGPU::sub14_sub15,
3336 };
3337
3338 unsigned SelOp = AMDGPU::V_CNDMASK_B32_e32;
3339 const TargetRegisterClass *EltRC = &AMDGPU::VGPR_32RegClass;
3340 const int16_t *SubIndices = Sub0_15;
3341 int NElts = DstSize / 32;
3342
3343 // 64-bit select is only available for SALU.
3344 // TODO: Split 96-bit into 64-bit and 32-bit, not 3x 32-bit.
3345 if (Pred == SCC_TRUE) {
3346 if (NElts % 2) {
3347 SelOp = AMDGPU::S_CSELECT_B32;
3348 EltRC = &AMDGPU::SGPR_32RegClass;
3349 } else {
3350 SelOp = AMDGPU::S_CSELECT_B64;
3351 EltRC = &AMDGPU::SGPR_64RegClass;
3352 SubIndices = Sub0_15_64;
3353 NElts /= 2;
3354 }
3355 }
3356
3358 MBB, I, DL, get(AMDGPU::REG_SEQUENCE), DstReg);
3359
3360 I = MIB->getIterator();
3361
3363 for (int Idx = 0; Idx != NElts; ++Idx) {
3364 Register DstElt = MRI.createVirtualRegister(EltRC);
3365 Regs.push_back(DstElt);
3366
3367 unsigned SubIdx = SubIndices[Idx];
3368
3370 if (SelOp == AMDGPU::V_CNDMASK_B32_e32) {
3371 Select = BuildMI(MBB, I, DL, get(SelOp), DstElt)
3372 .addReg(FalseReg, {}, SubIdx)
3373 .addReg(TrueReg, {}, SubIdx);
3374 } else {
3375 Select = BuildMI(MBB, I, DL, get(SelOp), DstElt)
3376 .addReg(TrueReg, {}, SubIdx)
3377 .addReg(FalseReg, {}, SubIdx);
3378 }
3379
3380 preserveCondRegFlags(Select->getOperand(3), Cond[1]);
3382
3383 MIB.addReg(DstElt)
3384 .addImm(SubIdx);
3385 }
3386}
3387
3389
3390 if (MI.isBranch() || MI.isCall() || MI.isReturn() || MI.isIndirectBranch())
3391 return true;
3392
3393 switch (MI.getOpcode()) {
3394 case AMDGPU::S_ENDPGM:
3395 case AMDGPU::S_ENDPGM_SAVED:
3396 case AMDGPU::S_TRAP:
3397 case AMDGPU::S_GETREG_B32:
3398 case AMDGPU::S_SETREG_B32:
3399 case AMDGPU::S_SETREG_B32_mode:
3400 case AMDGPU::S_SETREG_IMM32_B32:
3401 case AMDGPU::S_SETREG_IMM32_B32_mode:
3402 case AMDGPU::S_SENDMSG:
3403 case AMDGPU::S_SENDMSGHALT:
3404 case AMDGPU::S_SENDMSG_RTN_B32:
3405 case AMDGPU::S_SENDMSG_RTN_B64:
3406 case AMDGPU::S_BARRIER_WAIT:
3407 case AMDGPU::S_BARRIER_SIGNAL_M0:
3408 case AMDGPU::S_BARRIER_SIGNAL_IMM:
3409 case AMDGPU::S_BARRIER_SIGNAL_ISFIRST_M0:
3410 case AMDGPU::S_BARRIER_SIGNAL_ISFIRST_IMM:
3411 return true;
3412 default:
3413 return false;
3414 }
3415}
3416
3418 switch (MI.getOpcode()) {
3419 case AMDGPU::V_MOV_B16_t16_e32:
3420 case AMDGPU::V_MOV_B16_t16_e64:
3421 case AMDGPU::V_MOV_B32_e32:
3422 case AMDGPU::V_MOV_B32_e64:
3423 case AMDGPU::V_MOV_B64_PSEUDO:
3424 case AMDGPU::V_MOV_B64_e32:
3425 case AMDGPU::V_MOV_B64_e64:
3426 case AMDGPU::S_MOV_B32:
3427 case AMDGPU::S_MOV_B64:
3428 case AMDGPU::S_MOV_B64_IMM_PSEUDO:
3429 case AMDGPU::COPY:
3430 case AMDGPU::WWM_COPY:
3431 case AMDGPU::V_ACCVGPR_WRITE_B32_e64:
3432 case AMDGPU::V_ACCVGPR_READ_B32_e64:
3433 case AMDGPU::V_ACCVGPR_MOV_B32:
3434 case AMDGPU::AV_MOV_B32_IMM_PSEUDO:
3435 case AMDGPU::AV_MOV_B64_IMM_PSEUDO:
3436 return true;
3437 default:
3438 return false;
3439 }
3440}
3441
3443 switch (MI.getOpcode()) {
3444 case AMDGPU::V_MOV_B16_t16_e32:
3445 case AMDGPU::V_MOV_B16_t16_e64:
3446 return 2;
3447 case AMDGPU::V_MOV_B32_e32:
3448 case AMDGPU::V_MOV_B32_e64:
3449 case AMDGPU::V_MOV_B64_PSEUDO:
3450 case AMDGPU::V_MOV_B64_e32:
3451 case AMDGPU::V_MOV_B64_e64:
3452 case AMDGPU::S_MOV_B32:
3453 case AMDGPU::S_MOV_B64:
3454 case AMDGPU::S_MOV_B64_IMM_PSEUDO:
3455 case AMDGPU::COPY:
3456 case AMDGPU::WWM_COPY:
3457 case AMDGPU::V_ACCVGPR_WRITE_B32_e64:
3458 case AMDGPU::V_ACCVGPR_READ_B32_e64:
3459 case AMDGPU::V_ACCVGPR_MOV_B32:
3460 case AMDGPU::AV_MOV_B32_IMM_PSEUDO:
3461 case AMDGPU::AV_MOV_B64_IMM_PSEUDO:
3462 return 1;
3463 default:
3464 llvm_unreachable("MI is not a foldable copy");
3465 }
3466}
3467
3468static constexpr AMDGPU::OpName ModifierOpNames[] = {
3469 AMDGPU::OpName::src0_modifiers, AMDGPU::OpName::src1_modifiers,
3470 AMDGPU::OpName::src2_modifiers, AMDGPU::OpName::clamp,
3471 AMDGPU::OpName::omod, AMDGPU::OpName::op_sel};
3472
3474 unsigned Opc = MI.getOpcode();
3475 for (AMDGPU::OpName Name : reverse(ModifierOpNames)) {
3476 int Idx = AMDGPU::getNamedOperandIdx(Opc, Name);
3477 if (Idx >= 0)
3478 MI.removeOperand(Idx);
3479 }
3480}
3481
3483 const MCInstrDesc &NewDesc) const {
3484 MI.setDesc(NewDesc);
3485
3486 // Remove any leftover implicit operands from mutating the instruction. e.g.
3487 // if we replace an s_and_b32 with a copy, we don't need the implicit scc def
3488 // anymore.
3489 const MCInstrDesc &Desc = MI.getDesc();
3490 unsigned NumOps = Desc.getNumOperands() + Desc.implicit_uses().size() +
3491 Desc.implicit_defs().size();
3492
3493 for (unsigned I = MI.getNumOperands() - 1; I >= NumOps; --I)
3494 MI.removeOperand(I);
3495}
3496
3497std::optional<int64_t> SIInstrInfo::extractSubregFromImm(int64_t Imm,
3498 unsigned SubRegIndex) {
3499 switch (SubRegIndex) {
3500 case AMDGPU::NoSubRegister:
3501 return Imm;
3502 case AMDGPU::sub0:
3503 return SignExtend64<32>(Imm);
3504 case AMDGPU::sub1:
3505 return SignExtend64<32>(Imm >> 32);
3506 case AMDGPU::lo16:
3507 return SignExtend64<16>(Imm);
3508 case AMDGPU::hi16:
3509 return SignExtend64<16>(Imm >> 16);
3510 case AMDGPU::sub1_lo16:
3511 return SignExtend64<16>(Imm >> 32);
3512 case AMDGPU::sub1_hi16:
3513 return SignExtend64<16>(Imm >> 48);
3514 default:
3515 return std::nullopt;
3516 }
3517
3518 llvm_unreachable("covered subregister switch");
3519}
3520
3521static unsigned getNewFMAAKInst(const GCNSubtarget &ST, unsigned Opc) {
3522 switch (Opc) {
3523 case AMDGPU::V_MAC_F16_e32:
3524 case AMDGPU::V_MAC_F16_e64:
3525 case AMDGPU::V_MAD_F16_e64:
3526 return AMDGPU::V_MADAK_F16;
3527 case AMDGPU::V_MAC_F32_e32:
3528 case AMDGPU::V_MAC_F32_e64:
3529 case AMDGPU::V_MAD_F32_e64:
3530 return AMDGPU::V_MADAK_F32;
3531 case AMDGPU::V_FMAC_F32_e32:
3532 case AMDGPU::V_FMAC_F32_e64:
3533 case AMDGPU::V_FMA_F32_e64:
3534 return AMDGPU::V_FMAAK_F32;
3535 case AMDGPU::V_FMAC_F16_e32:
3536 case AMDGPU::V_FMAC_F16_e64:
3537 case AMDGPU::V_FMAC_F16_t16_e64:
3538 case AMDGPU::V_FMAC_F16_fake16_e64:
3539 case AMDGPU::V_FMAC_F16_t16_e32:
3540 case AMDGPU::V_FMAC_F16_fake16_e32:
3541 case AMDGPU::V_FMA_F16_e64:
3542 return ST.hasTrue16BitInsts() ? ST.useRealTrue16Insts()
3543 ? AMDGPU::V_FMAAK_F16_t16
3544 : AMDGPU::V_FMAAK_F16_fake16
3545 : AMDGPU::V_FMAAK_F16;
3546 case AMDGPU::V_FMAC_F64_e32:
3547 case AMDGPU::V_FMAC_F64_e64:
3548 case AMDGPU::V_FMA_F64_e64:
3549 return AMDGPU::V_FMAAK_F64;
3550 default:
3551 llvm_unreachable("invalid instruction");
3552 }
3553}
3554
3555static unsigned getNewFMAMKInst(const GCNSubtarget &ST, unsigned Opc) {
3556 switch (Opc) {
3557 case AMDGPU::V_MAC_F16_e32:
3558 case AMDGPU::V_MAC_F16_e64:
3559 case AMDGPU::V_MAD_F16_e64:
3560 return AMDGPU::V_MADMK_F16;
3561 case AMDGPU::V_MAC_F32_e32:
3562 case AMDGPU::V_MAC_F32_e64:
3563 case AMDGPU::V_MAD_F32_e64:
3564 return AMDGPU::V_MADMK_F32;
3565 case AMDGPU::V_FMAC_F32_e32:
3566 case AMDGPU::V_FMAC_F32_e64:
3567 case AMDGPU::V_FMA_F32_e64:
3568 return AMDGPU::V_FMAMK_F32;
3569 case AMDGPU::V_FMAC_F16_e32:
3570 case AMDGPU::V_FMAC_F16_e64:
3571 case AMDGPU::V_FMAC_F16_t16_e64:
3572 case AMDGPU::V_FMAC_F16_fake16_e64:
3573 case AMDGPU::V_FMAC_F16_t16_e32:
3574 case AMDGPU::V_FMAC_F16_fake16_e32:
3575 case AMDGPU::V_FMA_F16_e64:
3576 return ST.hasTrue16BitInsts() ? ST.useRealTrue16Insts()
3577 ? AMDGPU::V_FMAMK_F16_t16
3578 : AMDGPU::V_FMAMK_F16_fake16
3579 : AMDGPU::V_FMAMK_F16;
3580 case AMDGPU::V_FMAC_F64_e32:
3581 case AMDGPU::V_FMAC_F64_e64:
3582 case AMDGPU::V_FMA_F64_e64:
3583 return AMDGPU::V_FMAMK_F64;
3584 default:
3585 llvm_unreachable("invalid instruction");
3586 }
3587}
3588
3590 Register Reg, MachineRegisterInfo *MRI) const {
3591 int64_t Imm;
3592 if (!getConstValDefinedInReg(DefMI, Reg, Imm))
3593 return false;
3594
3595 const bool HasMultipleUses = !MRI->hasOneNonDBGUse(Reg);
3596
3597 assert(!DefMI.getOperand(0).getSubReg() && "Expected SSA form");
3598
3599 unsigned Opc = UseMI.getOpcode();
3600 if (Opc == AMDGPU::COPY) {
3601 assert(!UseMI.getOperand(0).getSubReg() && "Expected SSA form");
3602
3603 Register DstReg = UseMI.getOperand(0).getReg();
3604 Register UseSubReg = UseMI.getOperand(1).getSubReg();
3605
3606 const TargetRegisterClass *DstRC = RI.getRegClassForReg(*MRI, DstReg);
3607
3608 if (HasMultipleUses) {
3609 // TODO: This should fold in more cases with multiple use, but we need to
3610 // more carefully consider what those uses are.
3611 unsigned ImmDefSize = RI.getRegSizeInBits(*MRI->getRegClass(Reg));
3612
3613 // Avoid breaking up a 64-bit inline immediate into a subregister extract.
3614 if (UseSubReg != AMDGPU::NoSubRegister && ImmDefSize == 64)
3615 return false;
3616
3617 // Most of the time folding a 32-bit inline constant is free (though this
3618 // might not be true if we can't later fold it into a real user).
3619 //
3620 // FIXME: This isInlineConstant check is imprecise if
3621 // getConstValDefinedInReg handled the tricky non-mov cases.
3622 if (ImmDefSize == 32 &&
3624 return false;
3625 }
3626
3627 bool Is16Bit = UseSubReg != AMDGPU::NoSubRegister &&
3628 RI.getSubRegIdxSize(UseSubReg) == 16;
3629
3630 if (Is16Bit) {
3631 if (RI.hasVGPRs(DstRC))
3632 return false; // Do not clobber vgpr_hi16
3633
3634 if (DstReg.isVirtual() && UseSubReg != AMDGPU::lo16)
3635 return false;
3636 }
3637
3638 MachineFunction *MF = UseMI.getMF();
3639
3640 unsigned NewOpc = AMDGPU::INSTRUCTION_LIST_END;
3641 MCRegister MovDstPhysReg =
3642 DstReg.isPhysical() ? DstReg.asMCReg() : MCRegister();
3643
3644 std::optional<int64_t> SubRegImm = extractSubregFromImm(Imm, UseSubReg);
3645
3646 // TODO: Try to fold with AMDGPU::V_MOV_B16_t16_e64
3647 for (unsigned MovOp :
3648 {AMDGPU::S_MOV_B32, AMDGPU::V_MOV_B32_e32, AMDGPU::S_MOV_B64,
3649 AMDGPU::V_MOV_B64_PSEUDO, AMDGPU::V_ACCVGPR_WRITE_B32_e64}) {
3650 const MCInstrDesc &MovDesc = get(MovOp);
3651
3652 const TargetRegisterClass *MovDstRC = getRegClass(MovDesc, 0);
3653 if (Is16Bit) {
3654 // We just need to find a correctly sized register class, so the
3655 // subregister index compatibility doesn't matter since we're statically
3656 // extracting the immediate value.
3657 MovDstRC = RI.getMatchingSuperRegClass(MovDstRC, DstRC, AMDGPU::lo16);
3658 if (!MovDstRC)
3659 continue;
3660
3661 if (MovDstPhysReg) {
3662 // FIXME: We probably should not do this. If there is a live value in
3663 // the high half of the register, it will be corrupted.
3664 MovDstPhysReg =
3665 RI.getMatchingSuperReg(MovDstPhysReg, AMDGPU::lo16, MovDstRC);
3666 if (!MovDstPhysReg)
3667 continue;
3668 }
3669 }
3670
3671 // Result class isn't the right size, try the next instruction.
3672 if (MovDstPhysReg) {
3673 if (!MovDstRC->contains(MovDstPhysReg))
3674 return false;
3675 } else if (!MRI->constrainRegClass(DstReg, MovDstRC)) {
3676 // TODO: This will be overly conservative in the case of 16-bit virtual
3677 // SGPRs. We could hack up the virtual register uses to use a compatible
3678 // 32-bit class.
3679 continue;
3680 }
3681
3682 const MCOperandInfo &OpInfo = MovDesc.operands()[1];
3683
3684 // Ensure the interpreted immediate value is a valid operand in the new
3685 // mov.
3686 //
3687 // FIXME: isImmOperandLegal should have form that doesn't require existing
3688 // MachineInstr or MachineOperand
3689 if (!RI.opCanUseLiteralConstant(OpInfo.OperandType) &&
3690 !isInlineConstant(*SubRegImm, OpInfo.OperandType))
3691 break;
3692
3693 NewOpc = MovOp;
3694 break;
3695 }
3696
3697 if (NewOpc == AMDGPU::INSTRUCTION_LIST_END)
3698 return false;
3699
3700 if (Is16Bit) {
3701 UseMI.getOperand(0).setSubReg(AMDGPU::NoSubRegister);
3702 if (MovDstPhysReg)
3703 UseMI.getOperand(0).setReg(MovDstPhysReg);
3704 assert(UseMI.getOperand(1).getReg().isVirtual());
3705 }
3706
3707 const MCInstrDesc &NewMCID = get(NewOpc);
3708 UseMI.setDesc(NewMCID);
3709 UseMI.getOperand(1).ChangeToImmediate(*SubRegImm);
3710 UseMI.addImplicitDefUseOperands(*MF);
3711 return true;
3712 }
3713
3714 if (HasMultipleUses)
3715 return false;
3716
3717 if (Opc == AMDGPU::V_MAD_F32_e64 || Opc == AMDGPU::V_MAC_F32_e64 ||
3718 Opc == AMDGPU::V_MAD_F16_e64 || Opc == AMDGPU::V_MAC_F16_e64 ||
3719 Opc == AMDGPU::V_FMA_F32_e64 || Opc == AMDGPU::V_FMAC_F32_e64 ||
3720 Opc == AMDGPU::V_FMA_F16_e64 || Opc == AMDGPU::V_FMAC_F16_e64 ||
3721 Opc == AMDGPU::V_FMAC_F16_t16_e64 ||
3722 Opc == AMDGPU::V_FMAC_F16_fake16_e64 || Opc == AMDGPU::V_FMA_F64_e64 ||
3723 Opc == AMDGPU::V_FMAC_F64_e64) {
3724 // Don't fold if we are using source or output modifiers. The new VOP2
3725 // instructions don't have them.
3727 return false;
3728
3729 // If this is a free constant, there's no reason to do this.
3730 // TODO: We could fold this here instead of letting SIFoldOperands do it
3731 // later.
3732 int Src0Idx = getNamedOperandIdx(UseMI.getOpcode(), AMDGPU::OpName::src0);
3733
3734 // Any src operand can be used for the legality check.
3735 if (isInlineConstant(UseMI, Src0Idx, Imm))
3736 return false;
3737
3738 MachineOperand *Src0 = &UseMI.getOperand(Src0Idx);
3739
3740 MachineOperand *Src1 = getNamedOperand(UseMI, AMDGPU::OpName::src1);
3741 MachineOperand *Src2 = getNamedOperand(UseMI, AMDGPU::OpName::src2);
3742
3743 auto CopyRegOperandToNarrowerRC =
3744 [MRI, this](MachineInstr &MI, unsigned OpNo,
3745 const TargetRegisterClass *NewRC) -> void {
3746 if (!MI.getOperand(OpNo).isReg())
3747 return;
3748 Register Reg = MI.getOperand(OpNo).getReg();
3749 const TargetRegisterClass *RC = RI.getRegClassForReg(*MRI, Reg);
3750 if (RI.getCommonSubClass(RC, NewRC) != NewRC)
3751 return;
3752 Register Tmp = MRI->createVirtualRegister(NewRC);
3753 BuildMI(*MI.getParent(), MI.getIterator(), MI.getDebugLoc(),
3754 get(AMDGPU::COPY), Tmp)
3755 .addReg(Reg);
3756 MI.getOperand(OpNo).setReg(Tmp);
3757 MI.getOperand(OpNo).setIsKill();
3758 };
3759
3760 // Multiplied part is the constant: Use v_madmk_{f16, f32}.
3761 if ((Src0->isReg() && Src0->getReg() == Reg) ||
3762 (Src1->isReg() && Src1->getReg() == Reg)) {
3763 MachineOperand *RegSrc =
3764 Src1->isReg() && Src1->getReg() == Reg ? Src0 : Src1;
3765 if (!RegSrc->isReg())
3766 return false;
3767 if (RI.isSGPRClass(MRI->getRegClass(RegSrc->getReg())) &&
3768 ST.getConstantBusLimit(Opc) < 2)
3769 return false;
3770
3771 if (!Src2->isReg() || RI.isSGPRClass(MRI->getRegClass(Src2->getReg())))
3772 return false;
3773
3774 // If src2 is also a literal constant then we have to choose which one to
3775 // fold. In general it is better to choose madak so that the other literal
3776 // can be materialized in an sgpr instead of a vgpr:
3777 // s_mov_b32 s0, literal
3778 // v_madak_f32 v0, s0, v0, literal
3779 // Instead of:
3780 // v_mov_b32 v1, literal
3781 // v_madmk_f32 v0, v0, literal, v1
3782 MachineInstr *Def = MRI->getUniqueVRegDef(Src2->getReg());
3783 if (Def && Def->isMoveImmediate() &&
3784 !isInlineConstant(Def->getOperand(1)))
3785 return false;
3786
3787 unsigned NewOpc = getNewFMAMKInst(ST, Opc);
3788 if (pseudoToMCOpcode(NewOpc) == -1)
3789 return false;
3790
3791 const std::optional<int64_t> SubRegImm = extractSubregFromImm(
3792 Imm, RegSrc == Src1 ? Src0->getSubReg() : Src1->getSubReg());
3793
3794 // FIXME: This would be a lot easier if we could return a new instruction
3795 // instead of having to modify in place.
3796
3797 Register SrcReg = RegSrc->getReg();
3798 unsigned SrcSubReg = RegSrc->getSubReg();
3799 Src0->setReg(SrcReg);
3800 Src0->setSubReg(SrcSubReg);
3801 Src0->setIsKill(RegSrc->isKill());
3802
3803 if (Opc == AMDGPU::V_MAC_F32_e64 || Opc == AMDGPU::V_MAC_F16_e64 ||
3804 Opc == AMDGPU::V_FMAC_F32_e64 || Opc == AMDGPU::V_FMAC_F16_t16_e64 ||
3805 Opc == AMDGPU::V_FMAC_F16_fake16_e64 ||
3806 Opc == AMDGPU::V_FMAC_F16_e64 || Opc == AMDGPU::V_FMAC_F64_e64)
3807 UseMI.untieRegOperand(
3808 AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::src2));
3809
3810 Src1->ChangeToImmediate(*SubRegImm);
3811
3813 UseMI.setDesc(get(NewOpc));
3814
3815 if (NewOpc == AMDGPU::V_FMAMK_F16_t16 ||
3816 NewOpc == AMDGPU::V_FMAMK_F16_fake16) {
3817 const TargetRegisterClass *NewRC = getRegClass(get(NewOpc), 0);
3818 Register Tmp = MRI->createVirtualRegister(NewRC);
3819 BuildMI(*UseMI.getParent(), std::next(UseMI.getIterator()),
3820 UseMI.getDebugLoc(), get(AMDGPU::COPY),
3821 UseMI.getOperand(0).getReg())
3822 .addReg(Tmp, RegState::Kill);
3823 UseMI.getOperand(0).setReg(Tmp);
3824 CopyRegOperandToNarrowerRC(UseMI, 1, NewRC);
3825 CopyRegOperandToNarrowerRC(UseMI, 3, NewRC);
3826 }
3827
3828 bool DeleteDef = MRI->use_nodbg_empty(Reg);
3829 if (DeleteDef)
3830 DefMI.eraseFromParent();
3831
3832 return true;
3833 }
3834
3835 // Added part is the constant: Use v_madak_{f16, f32}.
3836 if (Src2->isReg() && Src2->getReg() == Reg) {
3837 if (ST.getConstantBusLimit(Opc) < 2) {
3838 // Not allowed to use constant bus for another operand.
3839 // We can however allow an inline immediate as src0.
3840 bool Src0Inlined = false;
3841 if (Src0->isReg()) {
3842 // Try to inline constant if possible.
3843 // If the Def moves immediate and the use is single
3844 // We are saving VGPR here.
3845 MachineInstr *Def = MRI->getUniqueVRegDef(Src0->getReg());
3846 if (Def && Def->isMoveImmediate() &&
3847 isInlineConstant(Def->getOperand(1)) &&
3848 MRI->hasOneNonDBGUse(Src0->getReg())) {
3849 Src0->ChangeToImmediate(Def->getOperand(1).getImm());
3850 Src0Inlined = true;
3851 } else if (ST.getConstantBusLimit(Opc) <= 1 &&
3852 RI.isSGPRReg(*MRI, Src0->getReg())) {
3853 return false;
3854 }
3855 // VGPR is okay as Src0 - fallthrough
3856 }
3857
3858 if (Src1->isReg() && !Src0Inlined) {
3859 // We have one slot for inlinable constant so far - try to fill it
3860 MachineInstr *Def = MRI->getUniqueVRegDef(Src1->getReg());
3861 if (Def && Def->isMoveImmediate() &&
3862 isInlineConstant(Def->getOperand(1)) &&
3863 MRI->hasOneNonDBGUse(Src1->getReg()) && commuteInstruction(UseMI))
3864 Src0->ChangeToImmediate(Def->getOperand(1).getImm());
3865 else if (RI.isSGPRReg(*MRI, Src1->getReg()))
3866 return false;
3867 // VGPR is okay as Src1 - fallthrough
3868 }
3869 }
3870
3871 unsigned NewOpc = getNewFMAAKInst(ST, Opc);
3872 if (pseudoToMCOpcode(NewOpc) == -1)
3873 return false;
3874
3875 // FIXME: This would be a lot easier if we could return a new instruction
3876 // instead of having to modify in place.
3877
3878 if (Opc == AMDGPU::V_MAC_F32_e64 || Opc == AMDGPU::V_MAC_F16_e64 ||
3879 Opc == AMDGPU::V_FMAC_F32_e64 || Opc == AMDGPU::V_FMAC_F16_t16_e64 ||
3880 Opc == AMDGPU::V_FMAC_F16_fake16_e64 ||
3881 Opc == AMDGPU::V_FMAC_F16_e64 || Opc == AMDGPU::V_FMAC_F64_e64)
3882 UseMI.untieRegOperand(
3883 AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::src2));
3884
3885 const std::optional<int64_t> SubRegImm =
3886 extractSubregFromImm(Imm, Src2->getSubReg());
3887
3888 // ChangingToImmediate adds Src2 back to the instruction.
3889 Src2->ChangeToImmediate(*SubRegImm);
3890
3891 // These come before src2.
3893 UseMI.setDesc(get(NewOpc));
3894
3895 if (NewOpc == AMDGPU::V_FMAAK_F16_t16 ||
3896 NewOpc == AMDGPU::V_FMAAK_F16_fake16) {
3897 const TargetRegisterClass *NewRC = getRegClass(get(NewOpc), 0);
3898 Register Tmp = MRI->createVirtualRegister(NewRC);
3899 BuildMI(*UseMI.getParent(), std::next(UseMI.getIterator()),
3900 UseMI.getDebugLoc(), get(AMDGPU::COPY),
3901 UseMI.getOperand(0).getReg())
3902 .addReg(Tmp, RegState::Kill);
3903 UseMI.getOperand(0).setReg(Tmp);
3904 CopyRegOperandToNarrowerRC(UseMI, 1, NewRC);
3905 CopyRegOperandToNarrowerRC(UseMI, 2, NewRC);
3906 }
3907
3908 // It might happen that UseMI was commuted
3909 // and we now have SGPR as SRC1. If so 2 inlined
3910 // constant and SGPR are illegal.
3912
3913 bool DeleteDef = MRI->use_nodbg_empty(Reg);
3914 if (DeleteDef)
3915 DefMI.eraseFromParent();
3916
3917 return true;
3918 }
3919 }
3920
3921 return false;
3922}
3923
3924static bool
3927 if (BaseOps1.size() != BaseOps2.size())
3928 return false;
3929 for (size_t I = 0, E = BaseOps1.size(); I < E; ++I) {
3930 if (!BaseOps1[I]->isIdenticalTo(*BaseOps2[I]))
3931 return false;
3932 }
3933 return true;
3934}
3935
3936static bool offsetsDoNotOverlap(LocationSize WidthA, int OffsetA,
3937 LocationSize WidthB, int OffsetB) {
3938 int LowOffset = OffsetA < OffsetB ? OffsetA : OffsetB;
3939 int HighOffset = OffsetA < OffsetB ? OffsetB : OffsetA;
3940 LocationSize LowWidth = (LowOffset == OffsetA) ? WidthA : WidthB;
3941 return LowWidth.hasValue() &&
3942 LowOffset + (int)LowWidth.getValue() <= HighOffset;
3943}
3944
3945bool SIInstrInfo::checkInstOffsetsDoNotOverlap(const MachineInstr &MIa,
3946 const MachineInstr &MIb) const {
3947 SmallVector<const MachineOperand *, 4> BaseOps0, BaseOps1;
3948 int64_t Offset0, Offset1;
3949 LocationSize Dummy0 = LocationSize::precise(0);
3950 LocationSize Dummy1 = LocationSize::precise(0);
3951 bool Offset0IsScalable, Offset1IsScalable;
3952 if (!getMemOperandsWithOffsetWidth(MIa, BaseOps0, Offset0, Offset0IsScalable,
3953 Dummy0, &RI) ||
3954 !getMemOperandsWithOffsetWidth(MIb, BaseOps1, Offset1, Offset1IsScalable,
3955 Dummy1, &RI))
3956 return false;
3957
3958 if (!memOpsHaveSameBaseOperands(BaseOps0, BaseOps1))
3959 return false;
3960
3961 if (!MIa.hasOneMemOperand() || !MIb.hasOneMemOperand()) {
3962 // FIXME: Handle ds_read2 / ds_write2.
3963 return false;
3964 }
3965 LocationSize Width0 = MIa.memoperands().front()->getSize();
3966 LocationSize Width1 = MIb.memoperands().front()->getSize();
3967 return offsetsDoNotOverlap(Width0, Offset0, Width1, Offset1);
3968}
3969
3971 const MachineInstr &MIb) const {
3972 assert(MIa.mayLoadOrStore() &&
3973 "MIa must load from or modify a memory location");
3974 assert(MIb.mayLoadOrStore() &&
3975 "MIb must load from or modify a memory location");
3976
3978 return false;
3979
3980 // XXX - Can we relax this between address spaces?
3981 if (MIa.hasOrderedMemoryRef() || MIb.hasOrderedMemoryRef())
3982 return false;
3983
3984 if (isLDSDMA(MIa) || isLDSDMA(MIb))
3985 return false;
3986
3987 if (MIa.isBundle() || MIb.isBundle())
3988 return false;
3989
3990 // TODO: Should we check the address space from the MachineMemOperand? That
3991 // would allow us to distinguish objects we know don't alias based on the
3992 // underlying address space, even if it was lowered to a different one,
3993 // e.g. private accesses lowered to use MUBUF instructions on a scratch
3994 // buffer.
3995 if (isDS(MIa)) {
3996 if (isDS(MIb))
3997 return checkInstOffsetsDoNotOverlap(MIa, MIb);
3998
3999 return !isFLAT(MIb) || isSegmentSpecificFLAT(MIb);
4000 }
4001
4002 if (isMUBUF(MIa) || isMTBUF(MIa)) {
4003 if (isMUBUF(MIb) || isMTBUF(MIb))
4004 return checkInstOffsetsDoNotOverlap(MIa, MIb);
4005
4006 if (isFLAT(MIb))
4007 return isFLATScratch(MIb);
4008
4009 return !isSMRD(MIb);
4010 }
4011
4012 if (isSMRD(MIa)) {
4013 if (isSMRD(MIb))
4014 return checkInstOffsetsDoNotOverlap(MIa, MIb);
4015
4016 if (isFLAT(MIb))
4017 return isFLATScratch(MIb);
4018
4019 return !isMUBUF(MIb) && !isMTBUF(MIb);
4020 }
4021
4022 if (isFLAT(MIa)) {
4023 if (isFLAT(MIb)) {
4024 if ((isFLATScratch(MIa) && isFLATGlobal(MIb)) ||
4025 (isFLATGlobal(MIa) && isFLATScratch(MIb)))
4026 return true;
4027
4028 return checkInstOffsetsDoNotOverlap(MIa, MIb);
4029 }
4030
4031 return false;
4032 }
4033
4034 return false;
4035}
4036
4038 int64_t &Imm, MachineInstr **DefMI = nullptr) {
4039 if (Reg.isPhysical())
4040 return false;
4041 auto *Def = MRI.getUniqueVRegDef(Reg);
4042 if (Def && SIInstrInfo::isFoldableCopy(*Def) && Def->getOperand(1).isImm()) {
4043 Imm = Def->getOperand(1).getImm();
4044 if (DefMI)
4045 *DefMI = Def;
4046 return true;
4047 }
4048 return false;
4049}
4050
4051static bool getFoldableImm(const MachineOperand *MO, int64_t &Imm,
4052 MachineInstr **DefMI = nullptr) {
4053 if (!MO->isReg())
4054 return false;
4055 const MachineFunction *MF = MO->getParent()->getMF();
4056 const MachineRegisterInfo &MRI = MF->getRegInfo();
4057 return getFoldableImm(MO->getReg(), MRI, Imm, DefMI);
4058}
4059
4061 MachineInstr &NewMI) {
4062 if (LV) {
4063 unsigned NumOps = MI.getNumOperands();
4064 for (unsigned I = 1; I < NumOps; ++I) {
4065 MachineOperand &Op = MI.getOperand(I);
4066 if (Op.isReg() && Op.isKill())
4067 LV->replaceKillInstruction(Op.getReg(), MI, NewMI);
4068 }
4069 }
4070}
4071
4072static unsigned getNewFMAInst(const GCNSubtarget &ST, unsigned Opc) {
4073 switch (Opc) {
4074 case AMDGPU::V_MAC_F16_e32:
4075 case AMDGPU::V_MAC_F16_e64:
4076 return AMDGPU::V_MAD_F16_e64;
4077 case AMDGPU::V_MAC_F32_e32:
4078 case AMDGPU::V_MAC_F32_e64:
4079 return AMDGPU::V_MAD_F32_e64;
4080 case AMDGPU::V_MAC_LEGACY_F32_e32:
4081 case AMDGPU::V_MAC_LEGACY_F32_e64:
4082 return AMDGPU::V_MAD_LEGACY_F32_e64;
4083 case AMDGPU::V_FMAC_LEGACY_F32_e32:
4084 case AMDGPU::V_FMAC_LEGACY_F32_e64:
4085 return AMDGPU::V_FMA_LEGACY_F32_e64;
4086 case AMDGPU::V_FMAC_F16_e32:
4087 case AMDGPU::V_FMAC_F16_e64:
4088 case AMDGPU::V_FMAC_F16_t16_e64:
4089 case AMDGPU::V_FMAC_F16_fake16_e64:
4090 return ST.hasTrue16BitInsts() ? ST.useRealTrue16Insts()
4091 ? AMDGPU::V_FMA_F16_gfx9_t16_e64
4092 : AMDGPU::V_FMA_F16_gfx9_fake16_e64
4093 : AMDGPU::V_FMA_F16_gfx9_e64;
4094 case AMDGPU::V_FMAC_F32_e32:
4095 case AMDGPU::V_FMAC_F32_e64:
4096 return AMDGPU::V_FMA_F32_e64;
4097 case AMDGPU::V_FMAC_F64_e32:
4098 case AMDGPU::V_FMAC_F64_e64:
4099 return AMDGPU::V_FMA_F64_e64;
4100 default:
4101 llvm_unreachable("invalid instruction");
4102 }
4103}
4104
4105/// Helper struct for the implementation of 3-address conversion to communicate
4106/// updates made to instruction operands.
4108 /// Other instruction whose def is no longer used by the converted
4109 /// instruction.
4111};
4112
4114 LiveVariables *LV,
4115 LiveIntervals *LIS) const {
4116 MachineBasicBlock &MBB = *MI.getParent();
4117 MachineInstr *CandidateMI = &MI;
4118
4119 if (MI.isBundle()) {
4120 // This is a temporary placeholder for bundle handling that enables us to
4121 // exercise the relevant code paths in the two-address instruction pass.
4122 if (MI.getBundleSize() != 1)
4123 return nullptr;
4124 CandidateMI = MI.getNextNode();
4125 }
4126
4128 MachineInstr *NewMI = convertToThreeAddressImpl(*CandidateMI, U);
4129 if (!NewMI)
4130 return nullptr;
4131
4132 if (MI.isBundle()) {
4133 CandidateMI->eraseFromBundle();
4134
4135 for (MachineOperand &MO : MI.all_defs()) {
4136 if (MO.isTied())
4137 MI.untieRegOperand(MO.getOperandNo());
4138 }
4139 } else {
4140 updateLiveVariables(LV, MI, *NewMI);
4141 if (LIS) {
4142 LIS->ReplaceMachineInstrInMaps(MI, *NewMI);
4143 // SlotIndex of defs needs to be updated when converting to early-clobber
4144 MachineOperand &Def = NewMI->getOperand(0);
4145 if (Def.isEarlyClobber() && Def.isReg() &&
4146 LIS->hasInterval(Def.getReg())) {
4147 SlotIndex OldIndex = LIS->getInstructionIndex(*NewMI).getRegSlot(false);
4148 SlotIndex NewIndex = LIS->getInstructionIndex(*NewMI).getRegSlot(true);
4149 auto &LI = LIS->getInterval(Def.getReg());
4150 auto UpdateDefIndex = [&](LiveRange &LR) {
4151 auto *S = LR.find(OldIndex);
4152 if (S != LR.end() && S->start == OldIndex) {
4153 assert(S->valno && S->valno->def == OldIndex);
4154 S->start = NewIndex;
4155 S->valno->def = NewIndex;
4156 }
4157 };
4158 UpdateDefIndex(LI);
4159 for (auto &SR : LI.subranges())
4160 UpdateDefIndex(SR);
4161 }
4162 }
4163 }
4164
4165 if (U.RemoveMIUse) {
4166 MachineRegisterInfo &MRI = MBB.getParent()->getRegInfo();
4167 // The only user is the instruction which will be killed.
4168 Register DefReg = U.RemoveMIUse->getOperand(0).getReg();
4169
4170 if (MRI.hasOneNonDBGUse(DefReg)) {
4171 // We cannot just remove the DefMI here, calling pass will crash.
4172 U.RemoveMIUse->setDesc(get(AMDGPU::IMPLICIT_DEF));
4173 U.RemoveMIUse->getOperand(0).setIsDead(true);
4174 for (unsigned I = U.RemoveMIUse->getNumOperands() - 1; I != 0; --I)
4175 U.RemoveMIUse->removeOperand(I);
4176 if (LV)
4177 LV->getVarInfo(DefReg).AliveBlocks.clear();
4178 }
4179
4180 if (MI.isBundle()) {
4181 VirtRegInfo VRI = AnalyzeVirtRegInBundle(MI, DefReg);
4182 if (!VRI.Reads && !VRI.Writes) {
4183 for (MachineOperand &MO : MI.all_uses()) {
4184 if (MO.isReg() && MO.getReg() == DefReg) {
4185 assert(MO.getSubReg() == 0 &&
4186 "tied sub-registers in bundles currently not supported");
4187 MI.removeOperand(MO.getOperandNo());
4188 break;
4189 }
4190 }
4191
4192 if (LIS)
4193 LIS->shrinkToUses(&LIS->getInterval(DefReg));
4194 }
4195 } else if (LIS) {
4196 LiveInterval &DefLI = LIS->getInterval(DefReg);
4197
4198 // We cannot delete the original instruction here, so hack out the use
4199 // in the original instruction with a dummy register so we can use
4200 // shrinkToUses to deal with any multi-use edge cases. Other targets do
4201 // not have the complexity of deleting a use to consider here.
4202 Register DummyReg = MRI.cloneVirtualRegister(DefReg);
4203 for (MachineOperand &MIOp : MI.uses()) {
4204 if (MIOp.isReg() && MIOp.getReg() == DefReg) {
4205 MIOp.setIsUndef(true);
4206 MIOp.setReg(DummyReg);
4207 }
4208 }
4209
4210 if (MI.isBundle()) {
4211 VirtRegInfo VRI = AnalyzeVirtRegInBundle(MI, DefReg);
4212 if (!VRI.Reads && !VRI.Writes) {
4213 for (MachineOperand &MIOp : MI.uses()) {
4214 if (MIOp.isReg() && MIOp.getReg() == DefReg) {
4215 MIOp.setIsUndef(true);
4216 MIOp.setReg(DummyReg);
4217 }
4218 }
4219 }
4220
4221 MI.addOperand(MachineOperand::CreateReg(DummyReg, false, false, false,
4222 false, /*isUndef=*/true));
4223 }
4224
4225 LIS->shrinkToUses(&DefLI);
4226 }
4227 }
4228
4229 return MI.isBundle() ? &MI : NewMI;
4230}
4231
4233SIInstrInfo::convertToThreeAddressImpl(MachineInstr &MI,
4234 ThreeAddressUpdates &U) const {
4235 MachineBasicBlock &MBB = *MI.getParent();
4236 unsigned Opc = MI.getOpcode();
4237
4238 // Handle MFMA.
4239 int NewMFMAOpc = AMDGPU::getMFMAEarlyClobberOp(Opc);
4240 if (NewMFMAOpc != -1) {
4242 BuildMI(MBB, MI, MI.getDebugLoc(), get(NewMFMAOpc));
4243 for (unsigned I = 0, E = MI.getNumExplicitOperands(); I != E; ++I)
4244 MIB.add(MI.getOperand(I));
4245 return MIB;
4246 }
4247
4248 if (SIInstrInfo::isWMMA(MI)) {
4249 unsigned NewOpc = AMDGPU::mapWMMA2AddrTo3AddrOpcode(MI.getOpcode());
4250 MachineInstrBuilder MIB = BuildMI(MBB, MI, MI.getDebugLoc(), get(NewOpc))
4251 .setMIFlags(MI.getFlags());
4252 for (unsigned I = 0, E = MI.getNumExplicitOperands(); I != E; ++I)
4253 MIB->addOperand(MI.getOperand(I));
4254 return MIB;
4255 }
4256
4257 assert(Opc != AMDGPU::V_FMAC_F16_t16_e32 &&
4258 Opc != AMDGPU::V_FMAC_F16_fake16_e32 &&
4259 "V_FMAC_F16_t16/fake16_e32 is not supported and not expected to be "
4260 "present pre-RA");
4261
4262 // Handle MAC/FMAC.
4263 bool IsF64 = Opc == AMDGPU::V_FMAC_F64_e32 || Opc == AMDGPU::V_FMAC_F64_e64;
4264 bool IsLegacy = Opc == AMDGPU::V_MAC_LEGACY_F32_e32 ||
4265 Opc == AMDGPU::V_MAC_LEGACY_F32_e64 ||
4266 Opc == AMDGPU::V_FMAC_LEGACY_F32_e32 ||
4267 Opc == AMDGPU::V_FMAC_LEGACY_F32_e64;
4268 bool Src0Literal = false;
4269
4270 switch (Opc) {
4271 default:
4272 return nullptr;
4273 case AMDGPU::V_MAC_F16_e64:
4274 case AMDGPU::V_FMAC_F16_e64:
4275 case AMDGPU::V_FMAC_F16_t16_e64:
4276 case AMDGPU::V_FMAC_F16_fake16_e64:
4277 case AMDGPU::V_MAC_F32_e64:
4278 case AMDGPU::V_MAC_LEGACY_F32_e64:
4279 case AMDGPU::V_FMAC_F32_e64:
4280 case AMDGPU::V_FMAC_LEGACY_F32_e64:
4281 case AMDGPU::V_FMAC_F64_e64:
4282 break;
4283 case AMDGPU::V_MAC_F16_e32:
4284 case AMDGPU::V_FMAC_F16_e32:
4285 case AMDGPU::V_MAC_F32_e32:
4286 case AMDGPU::V_MAC_LEGACY_F32_e32:
4287 case AMDGPU::V_FMAC_F32_e32:
4288 case AMDGPU::V_FMAC_LEGACY_F32_e32:
4289 case AMDGPU::V_FMAC_F64_e32: {
4290 int Src0Idx = AMDGPU::getNamedOperandIdx(MI.getOpcode(),
4291 AMDGPU::OpName::src0);
4292 const MachineOperand *Src0 = &MI.getOperand(Src0Idx);
4293 if (!Src0->isReg() && !Src0->isImm())
4294 return nullptr;
4295
4296 if (Src0->isImm() && !isInlineConstant(MI, Src0Idx, *Src0))
4297 Src0Literal = true;
4298
4299 break;
4300 }
4301 }
4302
4303 MachineInstrBuilder MIB;
4304 const MachineOperand *Dst = getNamedOperand(MI, AMDGPU::OpName::vdst);
4305 const MachineOperand *Src0 = getNamedOperand(MI, AMDGPU::OpName::src0);
4306 const MachineOperand *Src0Mods =
4307 getNamedOperand(MI, AMDGPU::OpName::src0_modifiers);
4308 const MachineOperand *Src1 = getNamedOperand(MI, AMDGPU::OpName::src1);
4309 const MachineOperand *Src1Mods =
4310 getNamedOperand(MI, AMDGPU::OpName::src1_modifiers);
4311 const MachineOperand *Src2 = getNamedOperand(MI, AMDGPU::OpName::src2);
4312 const MachineOperand *Src2Mods =
4313 getNamedOperand(MI, AMDGPU::OpName::src2_modifiers);
4314 const MachineOperand *Clamp = getNamedOperand(MI, AMDGPU::OpName::clamp);
4315 const MachineOperand *Omod = getNamedOperand(MI, AMDGPU::OpName::omod);
4316 const MachineOperand *OpSel = getNamedOperand(MI, AMDGPU::OpName::op_sel);
4317
4318 if (!Src0Mods && !Src1Mods && !Src2Mods && !Clamp && !Omod && !IsLegacy &&
4319 (!IsF64 || ST.hasFmaakFmamkF64Insts()) &&
4320 // If we have an SGPR input, we will violate the constant bus restriction.
4321 (ST.getConstantBusLimit(Opc) > 1 || !Src0->isReg() ||
4322 !RI.isSGPRReg(MBB.getParent()->getRegInfo(), Src0->getReg()))) {
4323 MachineInstr *DefMI;
4324
4325 int64_t Imm;
4326 if (!Src0Literal && getFoldableImm(Src2, Imm, &DefMI)) {
4327 unsigned NewOpc = getNewFMAAKInst(ST, Opc);
4328 if (pseudoToMCOpcode(NewOpc) != -1) {
4329 MIB = BuildMI(MBB, MI, MI.getDebugLoc(), get(NewOpc))
4330 .add(*Dst)
4331 .add(*Src0)
4332 .add(*Src1)
4333 .addImm(Imm)
4334 .setMIFlags(MI.getFlags());
4335 U.RemoveMIUse = DefMI;
4336 return MIB;
4337 }
4338 }
4339 unsigned NewOpc = getNewFMAMKInst(ST, Opc);
4340 if (!Src0Literal && getFoldableImm(Src1, Imm, &DefMI)) {
4341 if (pseudoToMCOpcode(NewOpc) != -1) {
4342 MIB = BuildMI(MBB, MI, MI.getDebugLoc(), get(NewOpc))
4343 .add(*Dst)
4344 .add(*Src0)
4345 .addImm(Imm)
4346 .add(*Src2)
4347 .setMIFlags(MI.getFlags());
4348 U.RemoveMIUse = DefMI;
4349 return MIB;
4350 }
4351 }
4352 if (Src0Literal || getFoldableImm(Src0, Imm, &DefMI)) {
4353 if (Src0Literal) {
4354 Imm = Src0->getImm();
4355 DefMI = nullptr;
4356 }
4357 if (pseudoToMCOpcode(NewOpc) != -1 &&
4359 MI, AMDGPU::getNamedOperandIdx(NewOpc, AMDGPU::OpName::src0),
4360 Src1)) {
4361 MIB = BuildMI(MBB, MI, MI.getDebugLoc(), get(NewOpc))
4362 .add(*Dst)
4363 .add(*Src1)
4364 .addImm(Imm)
4365 .add(*Src2)
4366 .setMIFlags(MI.getFlags());
4367 U.RemoveMIUse = DefMI;
4368 return MIB;
4369 }
4370 }
4371 }
4372
4373 // VOP2 mac/fmac with a literal operand cannot be converted to VOP3 mad/fma
4374 // if VOP3 does not allow a literal operand.
4375 if (Src0Literal && !ST.hasVOP3Literal())
4376 return nullptr;
4377
4378 unsigned NewOpc = getNewFMAInst(ST, Opc);
4379
4380 if (pseudoToMCOpcode(NewOpc) == -1)
4381 return nullptr;
4382
4383 MIB = BuildMI(MBB, MI, MI.getDebugLoc(), get(NewOpc))
4384 .add(*Dst)
4385 .addImm(Src0Mods ? Src0Mods->getImm() : 0)
4386 .add(*Src0)
4387 .addImm(Src1Mods ? Src1Mods->getImm() : 0)
4388 .add(*Src1)
4389 .addImm(Src2Mods ? Src2Mods->getImm() : 0)
4390 .add(*Src2)
4391 .addImm(Clamp ? Clamp->getImm() : 0)
4392 .addImm(Omod ? Omod->getImm() : 0)
4393 .setMIFlags(MI.getFlags());
4394 if (AMDGPU::hasNamedOperand(NewOpc, AMDGPU::OpName::op_sel))
4395 MIB.addImm(OpSel ? OpSel->getImm() : 0);
4396 return MIB;
4397}
4398
4399// It's not generally safe to move VALU instructions across these since it will
4400// start using the register as a base index rather than directly.
4401// XXX - Why isn't hasSideEffects sufficient for these?
4403 switch (MI.getOpcode()) {
4404 case AMDGPU::S_SET_GPR_IDX_ON:
4405 case AMDGPU::S_SET_GPR_IDX_MODE:
4406 case AMDGPU::S_SET_GPR_IDX_OFF:
4407 return true;
4408 default:
4409 return false;
4410 }
4411}
4412
4414 const MachineBasicBlock *MBB,
4415 const MachineFunction &MF) const {
4416 // Skipping the check for SP writes in the base implementation. The reason it
4417 // was added was apparently due to compile time concerns.
4418 //
4419 // TODO: Do we really want this barrier? It triggers unnecessary hazard nops
4420 // but is probably avoidable.
4421
4422 // Copied from base implementation.
4423 // Terminators and labels can't be scheduled around.
4424 if (MI.isTerminator() || MI.isPosition())
4425 return true;
4426
4427 // INLINEASM_BR can jump to another block
4428 if (MI.getOpcode() == TargetOpcode::INLINEASM_BR)
4429 return true;
4430
4431 if (MI.getOpcode() == AMDGPU::SCHED_BARRIER && MI.getOperand(0).getImm() == 0)
4432 return true;
4433
4434 // Target-independent instructions do not have an implicit-use of EXEC, even
4435 // when they operate on VGPRs. Treating EXEC modifications as scheduling
4436 // boundaries prevents incorrect movements of such instructions.
4437 return MI.modifiesRegister(AMDGPU::EXEC, &RI) ||
4438 MI.getOpcode() == AMDGPU::S_SETREG_IMM32_B32 ||
4439 MI.getOpcode() == AMDGPU::S_SETREG_B32 ||
4440 MI.getOpcode() == AMDGPU::S_SETPRIO ||
4441 MI.getOpcode() == AMDGPU::S_SETPRIO_INC_WG ||
4443}
4444
4446 return Opcode == AMDGPU::DS_ORDERED_COUNT ||
4447 Opcode == AMDGPU::DS_ADD_GS_REG_RTN ||
4448 Opcode == AMDGPU::DS_SUB_GS_REG_RTN || isGWS(Opcode);
4449}
4450
4452 // Instructions that access scratch use FLAT encoding or BUF encodings.
4453 if ((!isFLAT(MI) || isFLATGlobal(MI)) && !isBUF(MI))
4454 return false;
4455
4456 // SCRATCH instructions always access scratch.
4457 if (isFLATScratch(MI))
4458 return true;
4459
4460 // If FLAT_SCRATCH registers are not initialized, we can never access scratch
4461 // via the aperture.
4462 if (MI.getMF()->getFunction().hasFnAttribute("amdgpu-no-flat-scratch-init"))
4463 return false;
4464
4465 // If there are no memory operands then conservatively assume the flat
4466 // operation may access scratch.
4467 if (MI.memoperands_empty())
4468 return true;
4469
4470 // See if any memory operand specifies an address space that involves scratch.
4471 return any_of(MI.memoperands(), [](const MachineMemOperand *Memop) {
4472 unsigned AS = Memop->getAddrSpace();
4473 if (AS == AMDGPUAS::FLAT_ADDRESS) {
4474 const MDNode *MD = Memop->getAAInfo().NoAliasAddrSpace;
4475 return !MD || !AMDGPU::hasValueInRangeLikeMetadata(
4476 *MD, AMDGPUAS::PRIVATE_ADDRESS);
4477 }
4478 return AS == AMDGPUAS::PRIVATE_ADDRESS;
4479 });
4480}
4481
4483 assert(isFLAT(MI));
4484
4485 // All flat instructions use the VMEM counter except prefetch.
4486 if (!usesVM_CNT(MI))
4487 return false;
4488
4489 // If there are no memory operands then conservatively assume the flat
4490 // operation may access VMEM.
4491 if (MI.memoperands_empty())
4492 return true;
4493
4494 // See if any memory operand specifies an address space that involves VMEM.
4495 // Flat operations only supported FLAT, LOCAL (LDS), or address spaces
4496 // involving VMEM such as GLOBAL, CONSTANT, PRIVATE (SCRATCH), etc. The REGION
4497 // (GDS) address space is not supported by flat operations. Therefore, simply
4498 // return true unless only the LDS address space is found.
4499 for (const MachineMemOperand *Memop : MI.memoperands()) {
4500 unsigned AS = Memop->getAddrSpace();
4502 if (AS != AMDGPUAS::LOCAL_ADDRESS)
4503 return true;
4504 }
4505
4506 return false;
4507}
4508
4510 assert(isFLAT(MI));
4511
4512 // Flat instruction such as SCRATCH and GLOBAL do not use the lgkm counter.
4513 if (!usesLGKM_CNT(MI))
4514 return false;
4515
4516 // If in tgsplit mode then there can be no use of LDS.
4517 if (ST.isTgSplitEnabled())
4518 return false;
4519
4520 // If there are no memory operands then conservatively assume the flat
4521 // operation may access LDS.
4522 if (MI.memoperands_empty())
4523 return true;
4524
4525 // See if any memory operand specifies an address space that involves LDS.
4526 for (const MachineMemOperand *Memop : MI.memoperands()) {
4527 unsigned AS = Memop->getAddrSpace();
4529 return true;
4530 }
4531
4532 return false;
4533}
4534
4536 // Skip the full operand and register alias search modifiesRegister
4537 // does. There's only a handful of instructions that touch this, it's only an
4538 // implicit def, and doesn't alias any other registers.
4539 return is_contained(MI.getDesc().implicit_defs(), AMDGPU::MODE);
4540}
4541
4543 unsigned Opcode = MI.getOpcode();
4544
4545 if (MI.mayStore() && isSMRD(MI))
4546 return true; // scalar store or atomic
4547
4548 // This will terminate the function when other lanes may need to continue.
4549 if (MI.isReturn())
4550 return true;
4551
4552 // These instructions cause shader I/O that may cause hardware lockups
4553 // when executed with an empty EXEC mask.
4554 //
4555 // Note: exp with VM = DONE = 0 is automatically skipped by hardware when
4556 // EXEC = 0, but checking for that case here seems not worth it
4557 // given the typical code patterns.
4558 if (Opcode == AMDGPU::S_SENDMSG || Opcode == AMDGPU::S_SENDMSGHALT ||
4559 isEXP(Opcode) || Opcode == AMDGPU::DS_ORDERED_COUNT ||
4560 Opcode == AMDGPU::S_TRAP || Opcode == AMDGPU::S_WAIT_EVENT ||
4561 Opcode == AMDGPU::S_SETHALT)
4562 return true;
4563
4564 if (MI.isCall() || MI.isInlineAsm())
4565 return true; // conservative assumption
4566
4567 // Assume that barrier interactions are only intended with active lanes.
4568 if (isBarrier(Opcode))
4569 return true;
4570
4571 // A mode change is a scalar operation that influences vector instructions.
4573 return true;
4574
4575 // These are like SALU instructions in terms of effects, so it's questionable
4576 // whether we should return true for those.
4577 //
4578 // However, executing them with EXEC = 0 causes them to operate on undefined
4579 // data, which we avoid by returning true here.
4580 if (Opcode == AMDGPU::V_READFIRSTLANE_B32 ||
4581 Opcode == AMDGPU::V_READLANE_B32 || Opcode == AMDGPU::V_WRITELANE_B32 ||
4582 Opcode == AMDGPU::SI_RESTORE_S32_FROM_VGPR ||
4583 Opcode == AMDGPU::SI_SPILL_S32_TO_VGPR)
4584 return true;
4585
4586 return false;
4587}
4588
4590 const MachineInstr &MI) const {
4591 if (MI.isMetaInstruction())
4592 return false;
4593
4594 // This won't read exec if this is an SGPR->SGPR copy.
4595 if (MI.isCopyLike()) {
4596 if (!RI.isSGPRReg(MRI, MI.getOperand(0).getReg()))
4597 return true;
4598
4599 // Make sure this isn't copying exec as a normal operand
4600 return MI.readsRegister(AMDGPU::EXEC, &RI);
4601 }
4602
4603 // Make a conservative assumption about the callee.
4604 if (MI.isCall())
4605 return true;
4606
4607 // Be conservative with any unhandled generic opcodes.
4608 if (!isTargetSpecificOpcode(MI.getOpcode()))
4609 return true;
4610
4611 return !isSALU(MI) || MI.readsRegister(AMDGPU::EXEC, &RI);
4612}
4613
4614bool SIInstrInfo::isInlineConstant(const APInt &Imm) const {
4615 switch (Imm.getBitWidth()) {
4616 case 1: // This likely will be a condition code mask.
4617 return true;
4618
4619 case 32:
4620 return AMDGPU::isInlinableLiteral32(Imm.getSExtValue(),
4621 ST.hasInv2PiInlineImm());
4622 case 64:
4623 return AMDGPU::isInlinableLiteral64(Imm.getSExtValue(),
4624 ST.hasInv2PiInlineImm());
4625 case 16:
4626 return ST.has16BitInsts() &&
4627 AMDGPU::isInlinableLiteralI16(Imm.getSExtValue(),
4628 ST.hasInv2PiInlineImm());
4629 default:
4630 llvm_unreachable("invalid bitwidth");
4631 }
4632}
4633
4635 APInt IntImm = Imm.bitcastToAPInt();
4636 int64_t IntImmVal = IntImm.getSExtValue();
4637 bool HasInv2Pi = ST.hasInv2PiInlineImm();
4638 switch (APFloat::SemanticsToEnum(Imm.getSemantics())) {
4639 default:
4640 llvm_unreachable("invalid fltSemantics");
4643 return isInlineConstant(IntImm);
4645 return ST.has16BitInsts() &&
4646 AMDGPU::isInlinableLiteralBF16(IntImmVal, HasInv2Pi);
4648 return ST.has16BitInsts() &&
4649 AMDGPU::isInlinableLiteralFP16(IntImmVal, HasInv2Pi);
4650 }
4651}
4652
4653bool SIInstrInfo::isInlineConstant(int64_t Imm, uint8_t OperandType) const {
4654 // MachineOperand provides no way to tell the true operand size, since it only
4655 // records a 64-bit value. We need to know the size to determine if a 32-bit
4656 // floating point immediate bit pattern is legal for an integer immediate. It
4657 // would be for any 32-bit integer operand, but would not be for a 64-bit one.
4658 switch (OperandType) {
4668 int32_t Trunc = static_cast<int32_t>(Imm);
4669 return AMDGPU::isInlinableLiteral32(Trunc, ST.hasInv2PiInlineImm());
4670 }
4678 return AMDGPU::isInlinableLiteral64(Imm, ST.hasInv2PiInlineImm());
4681 // We would expect inline immediates to not be concerned with an integer/fp
4682 // distinction. However, in the case of 16-bit integer operations, the
4683 // "floating point" values appear to not work. It seems to read the low 16-bits
4684 // of 32-bit immediates, which happens to always work for the integer
4685 // values.
4686 //
4687 // See llvm bugzilla 46302.
4688 //
4689 // TODO: Theoretically we could use op-sel to use the high bits of the
4690 // 32-bit FP values.
4699 return AMDGPU::isPKFMACF16InlineConstant(Imm, ST.isGFX11Plus());
4704 return false;
4707 if (isInt<16>(Imm) || isUInt<16>(Imm)) {
4708 // A few special case instructions have 16-bit operands on subtargets
4709 // where 16-bit instructions are not legal.
4710 // TODO: Do the 32-bit immediates work? We shouldn't really need to handle
4711 // constants in these cases
4712 int16_t Trunc = static_cast<int16_t>(Imm);
4713 return ST.has16BitInsts() &&
4714 AMDGPU::isInlinableLiteralFP16(Trunc, ST.hasInv2PiInlineImm());
4715 }
4716
4717 return false;
4718 }
4721 if (isInt<16>(Imm) || isUInt<16>(Imm)) {
4722 int16_t Trunc = static_cast<int16_t>(Imm);
4723 return ST.has16BitInsts() &&
4724 AMDGPU::isInlinableLiteralBF16(Trunc, ST.hasInv2PiInlineImm());
4725 }
4726 return false;
4727 }
4731 return false;
4733 return isLegalAV64PseudoImm(Imm);
4736 // Always embedded in the instruction for free.
4737 return true;
4747 // Just ignore anything else.
4748 return true;
4749 default:
4750 llvm_unreachable("invalid operand type");
4751 }
4752}
4753
4754static bool compareMachineOp(const MachineOperand &Op0,
4755 const MachineOperand &Op1) {
4756 if (Op0.getType() != Op1.getType())
4757 return false;
4758
4759 switch (Op0.getType()) {
4761 return Op0.getReg() == Op1.getReg();
4763 return Op0.getImm() == Op1.getImm();
4764 default:
4765 llvm_unreachable("Didn't expect to be comparing these operand types");
4766 }
4767}
4768
4770 const MCOperandInfo &OpInfo) const {
4771 if (OpInfo.OperandType == MCOI::OPERAND_IMMEDIATE)
4772 return true;
4773
4774 if (!RI.opCanUseLiteralConstant(OpInfo.OperandType))
4775 return false;
4776
4777 if (!isVOP3(InstDesc) || !AMDGPU::isSISrcOperand(OpInfo))
4778 return true;
4779
4780 return ST.hasVOP3Literal();
4781}
4782
4783bool SIInstrInfo::isImmOperandLegal(const MCInstrDesc &InstDesc, unsigned OpNo,
4784 int64_t ImmVal) const {
4785 const unsigned Opc = InstDesc.getOpcode();
4786 int Src1Idx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::src1);
4787 if (Src1Idx != -1 && isDPP(Opc) && !ST.hasDPPSrc1SGPR() &&
4788 OpNo == static_cast<unsigned>(Src1Idx))
4789 return false;
4790
4791 const MCOperandInfo &OpInfo = InstDesc.operands()[OpNo];
4792 if (isInlineConstant(ImmVal, OpInfo.OperandType)) {
4793 if (isMAI(InstDesc) && ST.hasMFMAInlineLiteralBug() &&
4794 OpNo == (unsigned)AMDGPU::getNamedOperandIdx(InstDesc.getOpcode(),
4795 AMDGPU::OpName::src2))
4796 return false;
4797 return RI.opCanUseInlineConstant(OpInfo.OperandType);
4798 }
4799
4800 return isLiteralOperandLegal(InstDesc, OpInfo);
4801}
4802
4803bool SIInstrInfo::isImmOperandLegal(const MCInstrDesc &InstDesc, unsigned OpNo,
4804 const MachineOperand &MO) const {
4805 if (MO.isImm())
4806 return isImmOperandLegal(InstDesc, OpNo, MO.getImm());
4807
4808 assert((MO.isTargetIndex() || MO.isFI() || MO.isGlobal()) &&
4809 "unexpected imm-like operand kind");
4810 const MCOperandInfo &OpInfo = InstDesc.operands()[OpNo];
4811 return isLiteralOperandLegal(InstDesc, OpInfo);
4812}
4813
4815 // 2 32-bit inline constants packed into one.
4816 return AMDGPU::isInlinableLiteral32(Lo_32(Imm), ST.hasInv2PiInlineImm()) &&
4817 AMDGPU::isInlinableLiteral32(Hi_32(Imm), ST.hasInv2PiInlineImm());
4818}
4819
4820bool SIInstrInfo::hasVALU32BitEncoding(unsigned Opcode) const {
4821 // GFX90A does not have V_MUL_LEGACY_F32_e32.
4822 if (Opcode == AMDGPU::V_MUL_LEGACY_F32_e64 && ST.hasGFX90AInsts())
4823 return false;
4824
4825 int Op32 = AMDGPU::getVOPe32(Opcode);
4826 if (Op32 == -1)
4827 return false;
4828
4829 return pseudoToMCOpcode(Op32) != -1;
4830}
4831
4832bool SIInstrInfo::hasModifiers(unsigned Opcode) const {
4833 // The src0_modifier operand is present on all instructions
4834 // that have modifiers.
4835
4836 return AMDGPU::hasNamedOperand(Opcode, AMDGPU::OpName::src0_modifiers);
4837}
4838
4840 AMDGPU::OpName OpName) const {
4841 const MachineOperand *Mods = getNamedOperand(MI, OpName);
4842 return Mods && Mods->getImm();
4843}
4844
4846 return any_of(ModifierOpNames,
4847 [&](AMDGPU::OpName Name) { return hasModifiersSet(MI, Name); });
4848}
4849
4851 const MachineRegisterInfo &MRI) const {
4852 const MachineOperand *Src2 = getNamedOperand(MI, AMDGPU::OpName::src2);
4853 // Can't shrink instruction with three operands.
4854 if (Src2) {
4855 switch (MI.getOpcode()) {
4856 default: return false;
4857
4858 case AMDGPU::V_ADDC_U32_e64:
4859 case AMDGPU::V_SUBB_U32_e64:
4860 case AMDGPU::V_SUBBREV_U32_e64: {
4861 const MachineOperand *Src1
4862 = getNamedOperand(MI, AMDGPU::OpName::src1);
4863 if (!Src1->isReg() || !RI.isVGPR(MRI, Src1->getReg()))
4864 return false;
4865 // Additional verification is needed for sdst/src2.
4866 return true;
4867 }
4868 case AMDGPU::V_MAC_F16_e64:
4869 case AMDGPU::V_MAC_F32_e64:
4870 case AMDGPU::V_MAC_LEGACY_F32_e64:
4871 case AMDGPU::V_FMAC_F16_e64:
4872 case AMDGPU::V_FMAC_F16_t16_e64:
4873 case AMDGPU::V_FMAC_F16_fake16_e64:
4874 case AMDGPU::V_FMAC_F32_e64:
4875 case AMDGPU::V_FMAC_F64_e64:
4876 case AMDGPU::V_FMAC_LEGACY_F32_e64:
4877 if (!Src2->isReg() || !RI.isVGPR(MRI, Src2->getReg()) ||
4878 hasModifiersSet(MI, AMDGPU::OpName::src2_modifiers))
4879 return false;
4880 break;
4881
4882 case AMDGPU::V_CNDMASK_B32_e64:
4883 break;
4884 }
4885 }
4886
4887 const MachineOperand *Src1 = getNamedOperand(MI, AMDGPU::OpName::src1);
4888 if (Src1 && (!Src1->isReg() || !RI.isVGPR(MRI, Src1->getReg()) ||
4889 hasModifiersSet(MI, AMDGPU::OpName::src1_modifiers)))
4890 return false;
4891
4892 // We don't need to check src0, all input types are legal, so just make sure
4893 // src0 isn't using any modifiers.
4894 if (hasModifiersSet(MI, AMDGPU::OpName::src0_modifiers))
4895 return false;
4896
4897 // Can it be shrunk to a valid 32 bit opcode?
4898 if (!hasVALU32BitEncoding(MI.getOpcode()))
4899 return false;
4900
4901 // Check output modifiers
4902 return !hasModifiersSet(MI, AMDGPU::OpName::omod) &&
4903 !hasModifiersSet(MI, AMDGPU::OpName::clamp) &&
4904 !hasModifiersSet(MI, AMDGPU::OpName::byte_sel) &&
4905 // TODO: Can we avoid checking bound_ctrl/fi here?
4906 // They are only used by permlane*_swap special case.
4907 !hasModifiersSet(MI, AMDGPU::OpName::bound_ctrl) &&
4908 !hasModifiersSet(MI, AMDGPU::OpName::fi);
4909}
4910
4911// Set VCC operand with all flags from \p Orig, except for setting it as
4912// implicit.
4914 const MachineOperand &Orig) {
4915
4916 for (MachineOperand &Use : MI.implicit_operands()) {
4917 if (Use.isUse() &&
4918 (Use.getReg() == AMDGPU::VCC || Use.getReg() == AMDGPU::VCC_LO)) {
4919 Use.setIsUndef(Orig.isUndef());
4920 Use.setIsKill(Orig.isKill());
4921 return;
4922 }
4923 }
4924}
4925
4927 unsigned Op32) const {
4928 MachineBasicBlock *MBB = MI.getParent();
4929
4930 const MCInstrDesc &Op32Desc = get(Op32);
4931 MachineInstrBuilder Inst32 =
4932 BuildMI(*MBB, MI, MI.getDebugLoc(), Op32Desc)
4933 .setMIFlags(MI.getFlags());
4934
4935 // Add the dst operand if the 32-bit encoding also has an explicit $vdst.
4936 // For VOPC instructions, this is replaced by an implicit def of vcc.
4937
4938 // We assume the defs of the shrunk opcode are in the same order, and the
4939 // shrunk opcode loses the last def (SGPR def, in the VOP3->VOPC case).
4940 for (int I = 0, E = Op32Desc.getNumDefs(); I != E; ++I)
4941 Inst32.add(MI.getOperand(I));
4942
4943 const MachineOperand *Src2 = getNamedOperand(MI, AMDGPU::OpName::src2);
4944
4945 int Idx = MI.getNumExplicitDefs();
4946 for (const MachineOperand &Use : MI.explicit_uses()) {
4947 int OpTy = MI.getDesc().operands()[Idx++].OperandType;
4949 continue;
4950
4951 if (&Use == Src2) {
4952 if (AMDGPU::getNamedOperandIdx(Op32, AMDGPU::OpName::src2) == -1) {
4953 // In the case of V_CNDMASK_B32_e32, the explicit operand src2 is
4954 // replaced with an implicit read of vcc or vcc_lo. The implicit read
4955 // of vcc was already added during the initial BuildMI, but we
4956 // 1) may need to change vcc to vcc_lo to preserve the original register
4957 // 2) have to preserve the original flags.
4958 copyFlagsToImplicitVCC(*Inst32, *Src2);
4959 continue;
4960 }
4961 }
4962
4963 Inst32.add(Use);
4964 }
4965
4966 // FIXME: Losing implicit operands
4967 fixImplicitOperands(*Inst32);
4968 return Inst32;
4969}
4970
4972 // Null is free
4973 Register Reg = RegOp.getReg();
4974 if (Reg == AMDGPU::SGPR_NULL || Reg == AMDGPU::SGPR_NULL64)
4975 return false;
4976
4977 // SGPRs use the constant bus
4978
4979 // FIXME: implicit registers that are not part of the MCInstrDesc's implicit
4980 // physical register operands should also count, except for exec.
4981 if (RegOp.isImplicit())
4982 return Reg == AMDGPU::VCC || Reg == AMDGPU::VCC_LO || Reg == AMDGPU::M0;
4983
4984 // SGPRs use the constant bus
4985 return AMDGPU::SReg_32RegClass.contains(Reg) ||
4986 AMDGPU::SReg_64RegClass.contains(Reg);
4987}
4988
4990 const MachineRegisterInfo &MRI) const {
4991 Register Reg = RegOp.getReg();
4992 return Reg.isVirtual() ? RI.isSGPRClass(MRI.getRegClass(Reg))
4993 : physRegUsesConstantBus(RegOp);
4994}
4995
4997 const MachineOperand &MO,
4998 const MCOperandInfo &OpInfo) const {
4999 // Literal constants use the constant bus.
5000 if (!MO.isReg())
5001 return !isInlineConstant(MO, OpInfo);
5002
5003 Register Reg = MO.getReg();
5004 return Reg.isVirtual() ? RI.isSGPRClass(MRI.getRegClass(Reg))
5006}
5007
5009 for (const MachineOperand &MO : MI.implicit_operands()) {
5010 // We only care about reads.
5011 if (MO.isDef())
5012 continue;
5013
5014 switch (MO.getReg()) {
5015 case AMDGPU::VCC:
5016 case AMDGPU::VCC_LO:
5017 case AMDGPU::VCC_HI:
5018 case AMDGPU::M0:
5019 case AMDGPU::FLAT_SCR:
5020 return MO.getReg();
5021
5022 default:
5023 break;
5024 }
5025 }
5026
5027 return Register();
5028}
5029
5030static bool shouldReadExec(const MachineInstr &MI) {
5031 if (SIInstrInfo::isVALU(MI, /*AllowLDSDMA=*/true)) {
5032 switch (MI.getOpcode()) {
5033 case AMDGPU::V_READLANE_B32:
5034 case AMDGPU::SI_RESTORE_S32_FROM_VGPR:
5035 case AMDGPU::V_WRITELANE_B32:
5036 case AMDGPU::SI_SPILL_S32_TO_VGPR:
5037 return false;
5038 }
5039
5040 return true;
5041 }
5042
5043 if (MI.isPreISelOpcode() ||
5044 SIInstrInfo::isGenericOpcode(MI.getOpcode()) ||
5047 return false;
5048
5049 return true;
5050}
5051
5052static bool isRegOrFI(const MachineOperand &MO) {
5053 return MO.isReg() || MO.isFI();
5054}
5055
5056static bool isSubRegOf(const SIRegisterInfo &TRI,
5057 const MachineOperand &SuperVec,
5058 const MachineOperand &SubReg) {
5059 if (SubReg.getReg().isPhysical())
5060 return TRI.isSubRegister(SuperVec.getReg(), SubReg.getReg());
5061
5062 return SubReg.getSubReg() != AMDGPU::NoSubRegister &&
5063 SubReg.getReg() == SuperVec.getReg();
5064}
5065
5066// Verify the illegal copy from vector register to SGPR for generic opcode COPY
5067bool SIInstrInfo::verifyCopy(const MachineInstr &MI,
5068 const MachineRegisterInfo &MRI,
5069 StringRef &ErrInfo) const {
5070 Register DstReg = MI.getOperand(0).getReg();
5071 Register SrcReg = MI.getOperand(1).getReg();
5072 // This is a check for copy from vector register to SGPR
5073 if (RI.isVectorRegister(MRI, SrcReg) && RI.isSGPRReg(MRI, DstReg)) {
5074 ErrInfo = "illegal copy from vector register to SGPR";
5075 return false;
5076 }
5077 return true;
5078}
5079
5081 StringRef &ErrInfo) const {
5082 uint32_t Opcode = MI.getOpcode();
5083 const MachineFunction *MF = MI.getMF();
5084 const MachineRegisterInfo &MRI = MF->getRegInfo();
5085
5086 // FIXME: At this point the COPY verify is done only for non-ssa forms.
5087 // Find a better property to recognize the point where instruction selection
5088 // is just done.
5089 // We can only enforce this check after SIFixSGPRCopies pass so that the
5090 // illegal copies are legalized and thereafter we don't expect a pass
5091 // inserting similar copies.
5092 if (!MRI.isSSA() && MI.isCopy())
5093 return verifyCopy(MI, MRI, ErrInfo);
5094
5095 if (SIInstrInfo::isGenericOpcode(Opcode))
5096 return true;
5097
5098 int Src0Idx = AMDGPU::getNamedOperandIdx(Opcode, AMDGPU::OpName::src0);
5099 int Src1Idx = AMDGPU::getNamedOperandIdx(Opcode, AMDGPU::OpName::src1);
5100 int Src2Idx = AMDGPU::getNamedOperandIdx(Opcode, AMDGPU::OpName::src2);
5101 int Src3Idx = -1;
5102 if (Src0Idx == -1) {
5103 // VOPD V_DUAL_* instructions use different operand names.
5104 Src0Idx = AMDGPU::getNamedOperandIdx(Opcode, AMDGPU::OpName::src0X);
5105 Src1Idx = AMDGPU::getNamedOperandIdx(Opcode, AMDGPU::OpName::vsrc1X);
5106 Src2Idx = AMDGPU::getNamedOperandIdx(Opcode, AMDGPU::OpName::src0Y);
5107 Src3Idx = AMDGPU::getNamedOperandIdx(Opcode, AMDGPU::OpName::vsrc1Y);
5108 }
5109
5110 // Make sure the number of operands is correct.
5111 const MCInstrDesc &Desc = get(Opcode);
5112 if (!Desc.isVariadic() &&
5113 Desc.getNumOperands() != MI.getNumExplicitOperands()) {
5114 ErrInfo = "Instruction has wrong number of operands.";
5115 return false;
5116 }
5117
5118 if (MI.isInlineAsm()) {
5119 // Verify register classes for inlineasm constraints.
5120 for (unsigned I = InlineAsm::MIOp_FirstOperand, E = MI.getNumOperands();
5121 I != E; ++I) {
5122 const TargetRegisterClass *RC = MI.getRegClassConstraint(I, this, &RI);
5123 if (!RC)
5124 continue;
5125
5126 const MachineOperand &Op = MI.getOperand(I);
5127 if (!Op.isReg())
5128 continue;
5129
5130 Register Reg = Op.getReg();
5131 if (!Reg.isVirtual() && !RC->contains(Reg)) {
5132 ErrInfo = "inlineasm operand has incorrect register class.";
5133 return false;
5134 }
5135 }
5136
5137 return true;
5138 }
5139
5140 if (isImage(MI) && MI.memoperands_empty() && MI.mayLoadOrStore()) {
5141 ErrInfo = "missing memory operand from image instruction.";
5142 return false;
5143 }
5144
5145 // Make sure the register classes are correct.
5146 for (int i = 0, e = Desc.getNumOperands(); i != e; ++i) {
5147 const MachineOperand &MO = MI.getOperand(i);
5148 if (MO.isFPImm()) {
5149 ErrInfo = "FPImm Machine Operands are not supported. ISel should bitcast "
5150 "all fp values to integers.";
5151 return false;
5152 }
5153
5154 const MCOperandInfo &OpInfo = Desc.operands()[i];
5155 int16_t RegClass = getOpRegClassID(OpInfo);
5156
5157 switch (OpInfo.OperandType) {
5159 if (MI.getOperand(i).isImm() || MI.getOperand(i).isGlobal()) {
5160 ErrInfo = "Illegal immediate value for operand.";
5161 return false;
5162 }
5163 break;
5175 break;
5177 break;
5178 break;
5192 if (!MO.isReg() && (!MO.isImm() || !isInlineConstant(MI, i))) {
5193 ErrInfo = "Illegal immediate value for operand.";
5194 return false;
5195 }
5196 break;
5197 }
5202 if (ST.has64BitLiterals() && Desc.getSize() != 4 && MO.isImm() &&
5203 !isInlineConstant(MI, i) &&
5205 OpInfo.OperandType ==
5207 ErrInfo = "illegal 64-bit immediate value for operand.";
5208 return false;
5209 }
5210 break;
5213 if (!MI.getOperand(i).isImm() || !isInlineConstant(MI, i)) {
5214 ErrInfo = "Expected inline constant for operand.";
5215 return false;
5216 }
5217 break;
5220 break;
5225 // Check if this operand is an immediate.
5226 // FrameIndex operands will be replaced by immediates, so they are
5227 // allowed.
5228 if (!MI.getOperand(i).isImm() && !MI.getOperand(i).isFI()) {
5229 ErrInfo = "Expected immediate, but got non-immediate";
5230 return false;
5231 }
5232 break;
5236 break;
5237 default:
5238 if (OpInfo.isGenericType())
5239 continue;
5240 break;
5241 }
5242
5243 if (!MO.isReg())
5244 continue;
5245 Register Reg = MO.getReg();
5246 if (!Reg)
5247 continue;
5248
5249 // FIXME: Ideally we would have separate instruction definitions with the
5250 // aligned register constraint.
5251 // FIXME: We do not verify inline asm operands, but custom inline asm
5252 // verification is broken anyway
5253 if (ST.needsAlignedVGPRs() && Opcode != AMDGPU::AV_MOV_B64_IMM_PSEUDO &&
5254 Opcode != AMDGPU::V_MOV_B64_PSEUDO && !isSpill(MI)) {
5255 const TargetRegisterClass *RC = RI.getRegClassForReg(MRI, Reg);
5256 if (RI.hasVectorRegisters(RC) && MO.getSubReg()) {
5257 if (const TargetRegisterClass *SubRC =
5258 RI.getSubRegisterClass(RC, MO.getSubReg())) {
5259 RC = RI.getCompatibleSubRegClass(RC, SubRC, MO.getSubReg());
5260 if (RC)
5261 RC = SubRC;
5262 }
5263 }
5264
5265 // Check that this is the aligned version of the class.
5266 if (!RC || !RI.isProperlyAlignedRC(*RC)) {
5267 ErrInfo = "Subtarget requires even aligned vector registers";
5268 return false;
5269 }
5270 }
5271
5272 if (RegClass != -1) {
5273 if (Reg.isVirtual())
5274 continue;
5275
5276 const TargetRegisterClass *RC = RI.getRegClass(RegClass);
5277 if (!RC->contains(Reg)) {
5278 ErrInfo = "Operand has incorrect register class.";
5279 return false;
5280 }
5281 }
5282 }
5283
5284 // Verify SDWA
5285 if (isSDWA(MI)) {
5286 if (!ST.hasSDWA()) {
5287 ErrInfo = "SDWA is not supported on this target";
5288 return false;
5289 }
5290
5291 for (auto Op : {AMDGPU::OpName::src0_sel, AMDGPU::OpName::src1_sel,
5292 AMDGPU::OpName::dst_sel}) {
5293 const MachineOperand *MO = getNamedOperand(MI, Op);
5294 if (!MO)
5295 continue;
5296 int64_t Imm = MO->getImm();
5297 if (Imm < 0 || Imm > AMDGPU::SDWA::SdwaSel::DWORD) {
5298 ErrInfo = "Invalid SDWA selection";
5299 return false;
5300 }
5301 }
5302
5303 int DstIdx = AMDGPU::getNamedOperandIdx(Opcode, AMDGPU::OpName::vdst);
5304
5305 for (int OpIdx : {DstIdx, Src0Idx, Src1Idx, Src2Idx}) {
5306 if (OpIdx == -1)
5307 continue;
5308 const MachineOperand &MO = MI.getOperand(OpIdx);
5309
5310 if (!ST.hasSDWAScalar()) {
5311 // Only VGPRS on VI
5312 if (!MO.isReg() || !RI.hasVGPRs(RI.getRegClassForReg(MRI, MO.getReg()))) {
5313 ErrInfo = "Only VGPRs allowed as operands in SDWA instructions on VI";
5314 return false;
5315 }
5316 } else {
5317 // No immediates on GFX9
5318 if (!MO.isReg()) {
5319 ErrInfo =
5320 "Only reg allowed as operands in SDWA instructions on GFX9+";
5321 return false;
5322 }
5323 }
5324 }
5325
5326 if (!ST.hasSDWAOmod()) {
5327 // No omod allowed on VI
5328 const MachineOperand *OMod = getNamedOperand(MI, AMDGPU::OpName::omod);
5329 if (OMod != nullptr &&
5330 (!OMod->isImm() || OMod->getImm() != 0)) {
5331 ErrInfo = "OMod not allowed in SDWA instructions on VI";
5332 return false;
5333 }
5334 }
5335
5336 if (Opcode == AMDGPU::V_CVT_F32_FP8_sdwa ||
5337 Opcode == AMDGPU::V_CVT_F32_BF8_sdwa ||
5338 Opcode == AMDGPU::V_CVT_PK_F32_FP8_sdwa ||
5339 Opcode == AMDGPU::V_CVT_PK_F32_BF8_sdwa) {
5340 const MachineOperand *Src0ModsMO =
5341 getNamedOperand(MI, AMDGPU::OpName::src0_modifiers);
5342 unsigned Mods = Src0ModsMO->getImm();
5343 if (Mods & SISrcMods::ABS || Mods & SISrcMods::NEG ||
5344 Mods & SISrcMods::SEXT) {
5345 ErrInfo = "sext, abs and neg are not allowed on this instruction";
5346 return false;
5347 }
5348 }
5349
5350 uint32_t BasicOpcode = AMDGPU::getBasicFromSDWAOp(Opcode);
5351 if (isVOPC(BasicOpcode)) {
5352 if (!ST.hasSDWASdst() && DstIdx != -1) {
5353 // Only vcc allowed as dst on VI for VOPC
5354 const MachineOperand &Dst = MI.getOperand(DstIdx);
5355 if (!Dst.isReg() || Dst.getReg() != AMDGPU::VCC) {
5356 ErrInfo = "Only VCC allowed as dst in SDWA instructions on VI";
5357 return false;
5358 }
5359 } else if (!ST.hasSDWAOutModsVOPC()) {
5360 // No clamp allowed on GFX9 for VOPC
5361 const MachineOperand *Clamp = getNamedOperand(MI, AMDGPU::OpName::clamp);
5362 if (Clamp && (!Clamp->isImm() || Clamp->getImm() != 0)) {
5363 ErrInfo = "Clamp not allowed in VOPC SDWA instructions on VI";
5364 return false;
5365 }
5366
5367 // No omod allowed on GFX9 for VOPC
5368 const MachineOperand *OMod = getNamedOperand(MI, AMDGPU::OpName::omod);
5369 if (OMod && (!OMod->isImm() || OMod->getImm() != 0)) {
5370 ErrInfo = "OMod not allowed in VOPC SDWA instructions on VI";
5371 return false;
5372 }
5373 }
5374 }
5375
5376 const MachineOperand *DstUnused = getNamedOperand(MI, AMDGPU::OpName::dst_unused);
5377 if (DstUnused && DstUnused->isImm() &&
5378 DstUnused->getImm() == AMDGPU::SDWA::UNUSED_PRESERVE) {
5379 const MachineOperand &Dst = MI.getOperand(DstIdx);
5380 if (!Dst.isReg() || !Dst.isTied()) {
5381 ErrInfo = "Dst register should have tied register";
5382 return false;
5383 }
5384
5385 const MachineOperand &TiedMO =
5386 MI.getOperand(MI.findTiedOperandIdx(DstIdx));
5387 if (!TiedMO.isReg() || !TiedMO.isImplicit() || !TiedMO.isUse()) {
5388 ErrInfo =
5389 "Dst register should be tied to implicit use of preserved register";
5390 return false;
5391 }
5392 if (TiedMO.getReg().isPhysical() && Dst.getReg() != TiedMO.getReg()) {
5393 ErrInfo = "Dst register should use same physical register as preserved";
5394 return false;
5395 }
5396 }
5397 }
5398
5399 if (isDPP(MI) && !ST.hasDPPSrc1SGPR() && Src1Idx != -1) {
5400 const MachineOperand &Src1MO = MI.getOperand(Src1Idx);
5401 if (Src1MO.isReg() && RI.isSGPRReg(MRI, Src1MO.getReg())) {
5402 ErrInfo = "DPP src1 cannot be SGPR on this subtarget";
5403 return false;
5404 }
5405 if (Src1MO.isImm()) {
5406 ErrInfo = "DPP src1 cannot be an immediate on this subtarget";
5407 return false;
5408 }
5409 }
5410
5411 // Verify MIMG / VIMAGE / VSAMPLE
5412 if (isImage(Opcode) && !MI.mayStore()) {
5413 // Ensure that the return type used is large enough for all the options
5414 // being used TFE/LWE require an extra result register.
5415 const MachineOperand *DMask = getNamedOperand(MI, AMDGPU::OpName::dmask);
5416 if (DMask) {
5417 uint64_t DMaskImm = DMask->getImm();
5418 uint32_t RegCount = isGather4(Opcode) ? 4 : llvm::popcount(DMaskImm);
5419 const MachineOperand *TFE = getNamedOperand(MI, AMDGPU::OpName::tfe);
5420 const MachineOperand *LWE = getNamedOperand(MI, AMDGPU::OpName::lwe);
5421 const MachineOperand *D16 = getNamedOperand(MI, AMDGPU::OpName::d16);
5422
5423 // Adjust for packed 16 bit values
5424 if (D16 && D16->getImm() && !ST.hasUnpackedD16VMem())
5425 RegCount = divideCeil(RegCount, 2);
5426
5427 // Adjust if using LWE or TFE
5428 if ((LWE && LWE->getImm()) || (TFE && TFE->getImm()))
5429 RegCount += 1;
5430
5431 const uint32_t DstIdx =
5432 AMDGPU::getNamedOperandIdx(Opcode, AMDGPU::OpName::vdata);
5433 const MachineOperand &Dst = MI.getOperand(DstIdx);
5434 if (Dst.isReg()) {
5435 const TargetRegisterClass *DstRC = getOpRegClass(MI, DstIdx);
5436 uint32_t DstSize = RI.getRegSizeInBits(*DstRC) / 32;
5437 if (RegCount > DstSize) {
5438 ErrInfo = "Image instruction returns too many registers for dst "
5439 "register class";
5440 return false;
5441 }
5442 }
5443 }
5444 }
5445
5446 // Verify VOP*. Ignore multiple sgpr operands on writelane.
5447 if (isVALU(MI, /*AllowLDSDMA=*/true) &&
5448 Desc.getOpcode() != AMDGPU::V_WRITELANE_B32) {
5449 unsigned ConstantBusCount = 0;
5450 bool UsesLiteral = false;
5451 const MachineOperand *LiteralVal = nullptr;
5452
5453 int ImmIdx = AMDGPU::getNamedOperandIdx(Opcode, AMDGPU::OpName::imm);
5454 if (ImmIdx != -1) {
5455 ++ConstantBusCount;
5456 UsesLiteral = true;
5457 LiteralVal = &MI.getOperand(ImmIdx);
5458 }
5459
5460 SmallVector<Register, 2> SGPRsUsed;
5461 Register SGPRUsed;
5462
5463 // Only look at the true operands. Only a real operand can use the constant
5464 // bus, and we don't want to check pseudo-operands like the source modifier
5465 // flags.
5466 for (int OpIdx : {Src0Idx, Src1Idx, Src2Idx, Src3Idx}) {
5467 if (OpIdx == -1)
5468 continue;
5469 const MachineOperand &MO = MI.getOperand(OpIdx);
5470 if (usesConstantBus(MRI, MO, MI.getDesc().operands()[OpIdx])) {
5471 if (MO.isReg()) {
5472 SGPRUsed = MO.getReg();
5473 if (!llvm::is_contained(SGPRsUsed, SGPRUsed)) {
5474 ++ConstantBusCount;
5475 SGPRsUsed.push_back(SGPRUsed);
5476 }
5477 } else if (!MO.isFI()) { // Treat FI like a register.
5478 if (!UsesLiteral) {
5479 ++ConstantBusCount;
5480 UsesLiteral = true;
5481 LiteralVal = &MO;
5482 } else if (!MO.isIdenticalTo(*LiteralVal)) {
5483 assert(isVOP2(MI) || isVOP3(MI));
5484 ErrInfo = "VOP2/VOP3 instruction uses more than one literal";
5485 return false;
5486 }
5487 }
5488 }
5489 }
5490
5491 SGPRUsed = findImplicitSGPRRead(MI);
5492 if (SGPRUsed) {
5493 // Implicit uses may safely overlap true operands
5494 if (llvm::all_of(SGPRsUsed, [this, SGPRUsed](unsigned SGPR) {
5495 return !RI.regsOverlap(SGPRUsed, SGPR);
5496 })) {
5497 ++ConstantBusCount;
5498 SGPRsUsed.push_back(SGPRUsed);
5499 }
5500 }
5501
5502 // v_writelane_b32 is an exception from constant bus restriction:
5503 // vsrc0 can be sgpr, const or m0 and lane select sgpr, m0 or inline-const
5504 if (ConstantBusCount > ST.getConstantBusLimit(Opcode) &&
5505 Opcode != AMDGPU::V_WRITELANE_B32) {
5506 ErrInfo = "VOP* instruction violates constant bus restriction";
5507 return false;
5508 }
5509
5510 if (isVOP3(MI) && UsesLiteral && !ST.hasVOP3Literal()) {
5511 ErrInfo = "VOP3 instruction uses literal";
5512 return false;
5513 }
5514 }
5515
5516 // Special case for writelane - this can break the multiple constant bus rule,
5517 // but still can't use more than one SGPR register
5518 if (Desc.getOpcode() == AMDGPU::V_WRITELANE_B32) {
5519 unsigned SGPRCount = 0;
5520 Register SGPRUsed;
5521
5522 for (int OpIdx : {Src0Idx, Src1Idx}) {
5523 if (OpIdx == -1)
5524 break;
5525
5526 const MachineOperand &MO = MI.getOperand(OpIdx);
5527
5528 if (usesConstantBus(MRI, MO, MI.getDesc().operands()[OpIdx])) {
5529 if (MO.isReg() && MO.getReg() != AMDGPU::M0) {
5530 if (MO.getReg() != SGPRUsed)
5531 ++SGPRCount;
5532 SGPRUsed = MO.getReg();
5533 }
5534 }
5535 if (SGPRCount > ST.getConstantBusLimit(Opcode)) {
5536 ErrInfo = "WRITELANE instruction violates constant bus restriction";
5537 return false;
5538 }
5539 }
5540 }
5541
5542 // Verify misc. restrictions on specific instructions.
5543 if (Desc.getOpcode() == AMDGPU::V_DIV_SCALE_F32_e64 ||
5544 Desc.getOpcode() == AMDGPU::V_DIV_SCALE_F64_e64) {
5545 const MachineOperand &Src0 = MI.getOperand(Src0Idx);
5546 const MachineOperand &Src1 = MI.getOperand(Src1Idx);
5547 const MachineOperand &Src2 = MI.getOperand(Src2Idx);
5548 if (Src0.isReg() && Src1.isReg() && Src2.isReg()) {
5549 if (!compareMachineOp(Src0, Src1) &&
5550 !compareMachineOp(Src0, Src2)) {
5551 ErrInfo = "v_div_scale_{f32|f64} require src0 = src1 or src2";
5552 return false;
5553 }
5554 }
5555 if ((getNamedOperand(MI, AMDGPU::OpName::src0_modifiers)->getImm() &
5556 SISrcMods::ABS) ||
5557 (getNamedOperand(MI, AMDGPU::OpName::src1_modifiers)->getImm() &
5558 SISrcMods::ABS) ||
5559 (getNamedOperand(MI, AMDGPU::OpName::src2_modifiers)->getImm() &
5560 SISrcMods::ABS)) {
5561 ErrInfo = "ABS not allowed in VOP3B instructions";
5562 return false;
5563 }
5564 }
5565
5566 if (isSOP2(MI) || isSOPC(MI)) {
5567 const MachineOperand &Src0 = MI.getOperand(Src0Idx);
5568 const MachineOperand &Src1 = MI.getOperand(Src1Idx);
5569
5570 if (!isRegOrFI(Src0) && !isRegOrFI(Src1) &&
5571 !isInlineConstant(Src0, Desc.operands()[Src0Idx]) &&
5572 !isInlineConstant(Src1, Desc.operands()[Src1Idx]) &&
5573 !Src0.isIdenticalTo(Src1)) {
5574 ErrInfo = "SOP2/SOPC instruction requires too many immediate constants";
5575 return false;
5576 }
5577 }
5578
5579 if (isSOPK(MI)) {
5580 const auto *Op = getNamedOperand(MI, AMDGPU::OpName::simm16);
5581 if (Desc.isBranch()) {
5582 if (!Op->isMBB()) {
5583 ErrInfo = "invalid branch target for SOPK instruction";
5584 return false;
5585 }
5586 } else {
5587 uint64_t Imm = Op->getImm();
5588 if (sopkIsZext(Opcode)) {
5589 if (!isUInt<16>(Imm)) {
5590 ErrInfo = "invalid immediate for SOPK instruction";
5591 return false;
5592 }
5593 } else {
5594 if (!isInt<16>(Imm)) {
5595 ErrInfo = "invalid immediate for SOPK instruction";
5596 return false;
5597 }
5598 }
5599 }
5600 }
5601
5602 if (Desc.getOpcode() == AMDGPU::V_MOVRELS_B32_e32 ||
5603 Desc.getOpcode() == AMDGPU::V_MOVRELS_B32_e64 ||
5604 Desc.getOpcode() == AMDGPU::V_MOVRELD_B32_e32 ||
5605 Desc.getOpcode() == AMDGPU::V_MOVRELD_B32_e64) {
5606 const bool IsDst = Desc.getOpcode() == AMDGPU::V_MOVRELD_B32_e32 ||
5607 Desc.getOpcode() == AMDGPU::V_MOVRELD_B32_e64;
5608
5609 const unsigned StaticNumOps =
5610 Desc.getNumOperands() + Desc.implicit_uses().size();
5611 const unsigned NumImplicitOps = IsDst ? 2 : 1;
5612
5613 // Require additional implicit operands. This allows a fixup done by the
5614 // post RA scheduler where the main implicit operand is killed and
5615 // implicit-defs are added for sub-registers that remain live after this
5616 // instruction.
5617 if (MI.getNumOperands() < StaticNumOps + NumImplicitOps) {
5618 ErrInfo = "missing implicit register operands";
5619 return false;
5620 }
5621
5622 const MachineOperand *Dst = getNamedOperand(MI, AMDGPU::OpName::vdst);
5623 if (IsDst) {
5624 if (!Dst->isUse()) {
5625 ErrInfo = "v_movreld_b32 vdst should be a use operand";
5626 return false;
5627 }
5628
5629 unsigned UseOpIdx;
5630 if (!MI.isRegTiedToUseOperand(StaticNumOps, &UseOpIdx) ||
5631 UseOpIdx != StaticNumOps + 1) {
5632 ErrInfo = "movrel implicit operands should be tied";
5633 return false;
5634 }
5635 }
5636
5637 const MachineOperand &Src0 = MI.getOperand(Src0Idx);
5638 const MachineOperand &ImpUse
5639 = MI.getOperand(StaticNumOps + NumImplicitOps - 1);
5640 if (!ImpUse.isReg() || !ImpUse.isUse() ||
5641 !isSubRegOf(RI, ImpUse, IsDst ? *Dst : Src0)) {
5642 ErrInfo = "src0 should be subreg of implicit vector use";
5643 return false;
5644 }
5645 }
5646
5647 // Make sure we aren't losing exec uses in the td files. This mostly requires
5648 // being careful when using let Uses to try to add other use registers.
5649 if (shouldReadExec(MI)) {
5650 if (!MI.hasRegisterImplicitUseOperand(AMDGPU::EXEC)) {
5651 ErrInfo = "VALU instruction does not implicitly read exec mask";
5652 return false;
5653 }
5654 }
5655
5656 if (isSMRD(MI)) {
5657 if (MI.mayStore() &&
5658 ST.getGeneration() == AMDGPUSubtarget::VOLCANIC_ISLANDS) {
5659 // The register offset form of scalar stores may only use m0 as the
5660 // soffset register.
5661 const MachineOperand *Soff = getNamedOperand(MI, AMDGPU::OpName::soffset);
5662 if (Soff && Soff->getReg() != AMDGPU::M0) {
5663 ErrInfo = "scalar stores must use m0 as offset register";
5664 return false;
5665 }
5666 }
5667 }
5668
5669 if (isFLAT(MI) && !ST.hasFlatInstOffsets()) {
5670 const MachineOperand *Offset = getNamedOperand(MI, AMDGPU::OpName::offset);
5671 if (Offset->getImm() != 0) {
5672 ErrInfo = "subtarget does not support offsets in flat instructions";
5673 return false;
5674 }
5675 }
5676
5677 if (isDS(MI) && !ST.hasGDS()) {
5678 const MachineOperand *GDSOp = getNamedOperand(MI, AMDGPU::OpName::gds);
5679 if (GDSOp && GDSOp->getImm() != 0) {
5680 ErrInfo = "GDS is not supported on this subtarget";
5681 return false;
5682 }
5683 }
5684
5685 if (isImage(MI)) {
5686 const MachineOperand *DimOp = getNamedOperand(MI, AMDGPU::OpName::dim);
5687 if (DimOp) {
5688 int VAddr0Idx = AMDGPU::getNamedOperandIdx(Opcode,
5689 AMDGPU::OpName::vaddr0);
5690 AMDGPU::OpName RSrcOpName =
5691 isMIMG(MI) ? AMDGPU::OpName::srsrc : AMDGPU::OpName::rsrc;
5692 int RsrcIdx = AMDGPU::getNamedOperandIdx(Opcode, RSrcOpName);
5693 const AMDGPU::MIMGInfo *Info = AMDGPU::getMIMGInfo(Opcode);
5694 const AMDGPU::MIMGBaseOpcodeInfo *BaseOpcode =
5695 AMDGPU::getMIMGBaseOpcodeInfo(Info->BaseOpcode);
5696 const AMDGPU::MIMGDimInfo *Dim =
5698
5699 if (!Dim) {
5700 ErrInfo = "dim is out of range";
5701 return false;
5702 }
5703
5704 bool IsA16 = false;
5705 if (ST.hasR128A16()) {
5706 const MachineOperand *R128A16 = getNamedOperand(MI, AMDGPU::OpName::r128);
5707 IsA16 = R128A16->getImm() != 0;
5708 } else if (ST.hasA16()) {
5709 const MachineOperand *A16 = getNamedOperand(MI, AMDGPU::OpName::a16);
5710 IsA16 = A16->getImm() != 0;
5711 }
5712
5713 bool IsNSA = RsrcIdx - VAddr0Idx > 1;
5714
5715 unsigned AddrWords =
5716 AMDGPU::getAddrSizeMIMGOp(BaseOpcode, Dim, IsA16, ST.hasG16());
5717
5718 unsigned VAddrWords;
5719 if (IsNSA) {
5720 VAddrWords = RsrcIdx - VAddr0Idx;
5721 if (ST.hasPartialNSAEncoding() &&
5722 AddrWords > ST.getNSAMaxSize(isVSAMPLE(MI))) {
5723 unsigned LastVAddrIdx = RsrcIdx - 1;
5724 VAddrWords += getOpSize(MI, LastVAddrIdx) / 4 - 1;
5725 }
5726 } else {
5727 VAddrWords = getOpSize(MI, VAddr0Idx) / 4;
5728 if (AddrWords > 12)
5729 AddrWords = 16;
5730 }
5731
5732 if (VAddrWords != AddrWords) {
5733 LLVM_DEBUG(dbgs() << "bad vaddr size, expected " << AddrWords
5734 << " but got " << VAddrWords << "\n");
5735 ErrInfo = "bad vaddr size";
5736 return false;
5737 }
5738 }
5739 }
5740
5741 const MachineOperand *DppCt = getNamedOperand(MI, AMDGPU::OpName::dpp_ctrl);
5742 if (DppCt) {
5743 using namespace AMDGPU::DPP;
5744
5745 unsigned DC = DppCt->getImm();
5746 if (DC == DppCtrl::DPP_UNUSED1 || DC == DppCtrl::DPP_UNUSED2 ||
5747 DC == DppCtrl::DPP_UNUSED3 || DC > DppCtrl::DPP_LAST ||
5748 (DC >= DppCtrl::DPP_UNUSED4_FIRST && DC <= DppCtrl::DPP_UNUSED4_LAST) ||
5749 (DC >= DppCtrl::DPP_UNUSED5_FIRST && DC <= DppCtrl::DPP_UNUSED5_LAST) ||
5750 (DC >= DppCtrl::DPP_UNUSED6_FIRST && DC <= DppCtrl::DPP_UNUSED6_LAST) ||
5751 (DC >= DppCtrl::DPP_UNUSED7_FIRST && DC <= DppCtrl::DPP_UNUSED7_LAST) ||
5752 (DC >= DppCtrl::DPP_UNUSED8_FIRST && DC <= DppCtrl::DPP_UNUSED8_LAST)) {
5753 ErrInfo = "Invalid dpp_ctrl value";
5754 return false;
5755 }
5756 if (DC >= DppCtrl::WAVE_SHL1 && DC <= DppCtrl::WAVE_ROR1 &&
5757 !ST.hasDPPWavefrontShifts()) {
5758 ErrInfo = "Invalid dpp_ctrl value: "
5759 "wavefront shifts are not supported on GFX10+";
5760 return false;
5761 }
5762 if (DC >= DppCtrl::BCAST15 && DC <= DppCtrl::BCAST31 &&
5763 !ST.hasDPPBroadcasts()) {
5764 ErrInfo = "Invalid dpp_ctrl value: "
5765 "broadcasts are not supported on GFX10+";
5766 return false;
5767 }
5768 if (DC >= DppCtrl::ROW_SHARE_FIRST && DC <= DppCtrl::ROW_XMASK_LAST &&
5769 ST.getGeneration() < AMDGPUSubtarget::GFX10) {
5770 if (DC >= DppCtrl::ROW_NEWBCAST_FIRST &&
5771 DC <= DppCtrl::ROW_NEWBCAST_LAST &&
5772 !ST.hasGFX90AInsts()) {
5773 ErrInfo = "Invalid dpp_ctrl value: "
5774 "row_newbroadcast/row_share is not supported before "
5775 "GFX90A/GFX10";
5776 return false;
5777 }
5778 if (DC > DppCtrl::ROW_NEWBCAST_LAST || !ST.hasGFX90AInsts()) {
5779 ErrInfo = "Invalid dpp_ctrl value: "
5780 "row_share and row_xmask are not supported before GFX10";
5781 return false;
5782 }
5783 }
5784
5785 if (Opcode != AMDGPU::V_MOV_B64_DPP_PSEUDO &&
5787 AMDGPU::isDPALU_DPP(Desc, *this, ST)) {
5788 ErrInfo = "Invalid dpp_ctrl value: "
5789 "DP ALU dpp only support row_newbcast";
5790 return false;
5791 }
5792 }
5793
5794 if ((MI.mayStore() || MI.mayLoad()) && !isVGPRSpill(MI)) {
5795 const MachineOperand *Dst = getNamedOperand(MI, AMDGPU::OpName::vdst);
5796 AMDGPU::OpName DataName =
5797 isDS(Opcode) ? AMDGPU::OpName::data0 : AMDGPU::OpName::vdata;
5798 const MachineOperand *Data = getNamedOperand(MI, DataName);
5799 const MachineOperand *Data2 = getNamedOperand(MI, AMDGPU::OpName::data1);
5800 if (Data && !Data->isReg())
5801 Data = nullptr;
5802
5803 if (ST.hasGFX90AInsts()) {
5804 if (Dst && Data && !Dst->isTied() && !Data->isTied() &&
5805 (RI.isAGPR(MRI, Dst->getReg()) != RI.isAGPR(MRI, Data->getReg()))) {
5806 ErrInfo = "Invalid register class: "
5807 "vdata and vdst should be both VGPR or AGPR";
5808 return false;
5809 }
5810 if (Data && Data2 &&
5811 (RI.isAGPR(MRI, Data->getReg()) != RI.isAGPR(MRI, Data2->getReg()))) {
5812 ErrInfo = "Invalid register class: "
5813 "both data operands should be VGPR or AGPR";
5814 return false;
5815 }
5816 } else {
5817 if ((Dst && RI.isAGPR(MRI, Dst->getReg())) ||
5818 (Data && RI.isAGPR(MRI, Data->getReg())) ||
5819 (Data2 && RI.isAGPR(MRI, Data2->getReg()))) {
5820 ErrInfo = "Invalid register class: "
5821 "agpr loads and stores not supported on this GPU";
5822 return false;
5823 }
5824 }
5825 }
5826
5827 if (ST.needsAlignedVGPRs()) {
5828 const auto isAlignedReg = [&MI, &MRI, this](AMDGPU::OpName OpName) -> bool {
5830 if (!Op)
5831 return true;
5832 Register Reg = Op->getReg();
5833 if (Reg.isPhysical())
5834 return !(RI.getHWRegIndex(Reg) & 1);
5835 const TargetRegisterClass &RC = *MRI.getRegClass(Reg);
5836 return RI.getRegSizeInBits(RC) > 32 && RI.isProperlyAlignedRC(RC) &&
5837 !(RI.getChannelFromSubReg(Op->getSubReg()) & 1);
5838 };
5839
5840 if (Opcode == AMDGPU::DS_GWS_INIT || Opcode == AMDGPU::DS_GWS_SEMA_BR ||
5841 Opcode == AMDGPU::DS_GWS_BARRIER) {
5842
5843 if (!isAlignedReg(AMDGPU::OpName::data0)) {
5844 ErrInfo = "Subtarget requires even aligned vector registers "
5845 "for DS_GWS instructions";
5846 return false;
5847 }
5848 }
5849
5850 if (isMIMG(MI)) {
5851 if (!isAlignedReg(AMDGPU::OpName::vaddr)) {
5852 ErrInfo = "Subtarget requires even aligned vector registers "
5853 "for vaddr operand of image instructions";
5854 return false;
5855 }
5856 }
5857 }
5858
5859 if (Opcode == AMDGPU::V_ACCVGPR_WRITE_B32_e64 && !ST.hasGFX90AInsts()) {
5860 const MachineOperand *Src = getNamedOperand(MI, AMDGPU::OpName::src0);
5861 if (Src->isReg() && RI.isSGPRReg(MRI, Src->getReg())) {
5862 ErrInfo = "Invalid register class: "
5863 "v_accvgpr_write with an SGPR is not supported on this GPU";
5864 return false;
5865 }
5866 }
5867
5868 if (Desc.getOpcode() == AMDGPU::G_AMDGPU_WAVE_ADDRESS) {
5869 const MachineOperand &SrcOp = MI.getOperand(1);
5870 if (!SrcOp.isReg() || SrcOp.getReg().isVirtual()) {
5871 ErrInfo = "pseudo expects only physical SGPRs";
5872 return false;
5873 }
5874 }
5875
5876 if (const MachineOperand *CPol = getNamedOperand(MI, AMDGPU::OpName::cpol)) {
5877 if (CPol->getImm() & AMDGPU::CPol::SCAL) {
5878 if (!ST.hasScaleOffset()) {
5879 ErrInfo = "Subtarget does not support offset scaling";
5880 return false;
5881 }
5882 if (!AMDGPU::supportsScaleOffset(*this, MI.getOpcode())) {
5883 ErrInfo = "Instruction does not support offset scaling";
5884 return false;
5885 }
5886 }
5887 }
5888
5889 // See SIInstrInfo::isLegalGFX12PlusPackedMathFP32or64BitOperand for more
5890 // information.
5892 for (unsigned I = 0; I < 3; ++I) {
5894 return false;
5895 }
5896 }
5897
5898 if (ST.hasFlatScratchHiInB64InstHazard() && isSALU(MI) &&
5899 MI.readsRegister(AMDGPU::SRC_FLAT_SCRATCH_BASE_HI, nullptr)) {
5900 const MachineOperand *Dst = getNamedOperand(MI, AMDGPU::OpName::sdst);
5901 if ((Dst && RI.getRegClassForReg(MRI, Dst->getReg()) ==
5902 &AMDGPU::SReg_64RegClass) ||
5903 Opcode == AMDGPU::S_BITCMP0_B64 || Opcode == AMDGPU::S_BITCMP1_B64) {
5904 ErrInfo = "Instruction cannot read flat_scratch_base_hi";
5905 return false;
5906 }
5907 }
5908
5909 return true;
5910}
5911
  // NOTE(review): the line holding this function's signature is absent from
  // this listing; the body below works on an instruction object named MI.
  //
  // S_MOV_B32 is special-cased. The condition parses as (A || B) ? X : Y, so a
  // register source (operand 1) or an AGPR destination (operand 0) yields COPY,
  // and anything else yields V_MOV_B32_e32.
5913 if (MI.getOpcode() == AMDGPU::S_MOV_B32) {
5914 const MachineRegisterInfo &MRI = MI.getMF()->getRegInfo();
5915 return MI.getOperand(1).isReg() || RI.isAGPR(MRI, MI.getOperand(0).getReg())
5916 ? AMDGPU::COPY
5917 : AMDGPU::V_MOV_B32_e32;
5918 }
  // Every other opcode is resolved by the opcode-keyed getVALUOp overload.
5919 return getVALUOp(MI.getOpcode());
5920}
5921
5922// It is more readable to list mapped opcodes on the same line.
5923// clang-format off
5924
/// Translate a scalar opcode into the opcode this file uses for its vector
/// counterpart.
///
/// \param Opc opcode to translate.
/// \returns the mapped opcode, or AMDGPU::INSTRUCTION_LIST_END when no mapping
/// is listed (the switch default) or when the subtarget lacks the instruction
/// (S_XNOR_B32 without hasDLInsts()).
///
/// Generic opcodes (REG_SEQUENCE, COPY, PHI, INSERT_SUBREG, WQM, ...) map to
/// themselves. Several results depend on subtarget queries:
/// hasAddNoCarryInsts() picks between the no-carry and carry-out add/sub forms,
/// and useRealTrue16Insts() picks between the _t16 and _fake16 16-bit forms.
5925unsigned SIInstrInfo::getVALUOp(unsigned Opc) const {
5926 switch (Opc) {
5927 default: return AMDGPU::INSTRUCTION_LIST_END;
5928 case AMDGPU::REG_SEQUENCE: return AMDGPU::REG_SEQUENCE;
5929 case AMDGPU::COPY: return AMDGPU::COPY;
5930 case AMDGPU::PHI: return AMDGPU::PHI;
5931 case AMDGPU::INSERT_SUBREG: return AMDGPU::INSERT_SUBREG;
5932 case AMDGPU::WQM: return AMDGPU::WQM;
5933 case AMDGPU::SOFT_WQM: return AMDGPU::SOFT_WQM;
5934 case AMDGPU::STRICT_WWM: return AMDGPU::STRICT_WWM;
5935 case AMDGPU::STRICT_WQM: return AMDGPU::STRICT_WQM;
5936 case AMDGPU::S_ADD_I32:
5937 return ST.hasAddNoCarryInsts() ? AMDGPU::V_ADD_U32_e64 : AMDGPU::V_ADD_CO_U32_e32;
5938 case AMDGPU::S_ADDC_U32:
5939 return AMDGPU::V_ADDC_U32_e32;
5940 case AMDGPU::S_SUB_I32:
5941 return ST.hasAddNoCarryInsts() ? AMDGPU::V_SUB_U32_e64 : AMDGPU::V_SUB_CO_U32_e32;
5942 // FIXME: These are not consistently handled, and selected when the carry is
5943 // used.
5944 case AMDGPU::S_ADD_U32:
5945 return AMDGPU::V_ADD_CO_U32_e32;
5946 case AMDGPU::S_SUB_U32:
5947 return AMDGPU::V_SUB_CO_U32_e32;
5948 case AMDGPU::S_ADD_U64_PSEUDO:
5949 return AMDGPU::V_ADD_U64_PSEUDO;
5950 case AMDGPU::S_SUB_U64_PSEUDO:
5951 return AMDGPU::V_SUB_U64_PSEUDO;
5952 case AMDGPU::S_SUBB_U32: return AMDGPU::V_SUBB_U32_e32;
5953 case AMDGPU::S_MUL_I32: return AMDGPU::V_MUL_LO_U32_e64;
5954 case AMDGPU::S_MUL_HI_U32: return AMDGPU::V_MUL_HI_U32_e64;
5955 case AMDGPU::S_MUL_HI_I32: return AMDGPU::V_MUL_HI_I32_e64;
5956 case AMDGPU::S_AND_B32: return AMDGPU::V_AND_B32_e64;
5957 case AMDGPU::S_OR_B32: return AMDGPU::V_OR_B32_e64;
5958 case AMDGPU::S_XOR_B32: return AMDGPU::V_XOR_B32_e64;
  // XNOR has a vector form only when the subtarget reports hasDLInsts();
  // otherwise the "no mapping" sentinel is returned.
5959 case AMDGPU::S_XNOR_B32:
5960 return ST.hasDLInsts() ? AMDGPU::V_XNOR_B32_e64 : AMDGPU::INSTRUCTION_LIST_END;
5961 case AMDGPU::S_MIN_I32: return AMDGPU::V_MIN_I32_e64;
5962 case AMDGPU::S_MIN_U32: return AMDGPU::V_MIN_U32_e64;
5963 case AMDGPU::S_MAX_I32: return AMDGPU::V_MAX_I32_e64;
5964 case AMDGPU::S_MAX_U32: return AMDGPU::V_MAX_U32_e64;
5965 case AMDGPU::S_ASHR_I32: return AMDGPU::V_ASHR_I32_e32;
5966 case AMDGPU::S_ASHR_I64: return AMDGPU::V_ASHR_I64_e64;
5967 case AMDGPU::S_LSHL_B32: return AMDGPU::V_LSHL_B32_e32;
5968 case AMDGPU::S_LSHL_B64: return AMDGPU::V_LSHL_B64_e64;
5969 case AMDGPU::S_LSHR_B32: return AMDGPU::V_LSHR_B32_e32;
5970 case AMDGPU::S_LSHR_B64: return AMDGPU::V_LSHR_B64_e64;
  // Both sign-extension opcodes share V_BFE_I32_e64 with S_BFE_I32.
5971 case AMDGPU::S_SEXT_I32_I8: return AMDGPU::V_BFE_I32_e64;
5972 case AMDGPU::S_SEXT_I32_I16: return AMDGPU::V_BFE_I32_e64;
5973 case AMDGPU::S_BFE_U32: return AMDGPU::V_BFE_U32_e64;
5974 case AMDGPU::S_BFE_I32: return AMDGPU::V_BFE_I32_e64;
5975 case AMDGPU::S_BFM_B32: return AMDGPU::V_BFM_B32_e64;
5976 case AMDGPU::S_BREV_B32: return AMDGPU::V_BFREV_B32_e32;
5977 case AMDGPU::S_NOT_B32: return AMDGPU::V_NOT_B32_e32;
  // NOTE(review): the 64-bit S_NOT_B64 is mapped to the 32-bit V_NOT_B32_e32;
  // presumably the caller splits the operation into halves - confirm.
5978 case AMDGPU::S_NOT_B64: return AMDGPU::V_NOT_B32_e32;
5979 case AMDGPU::S_CMP_EQ_I32: return AMDGPU::V_CMP_EQ_I32_e64;
5980 case AMDGPU::S_CMP_LG_I32: return AMDGPU::V_CMP_NE_I32_e64;
5981 case AMDGPU::S_CMP_GT_I32: return AMDGPU::V_CMP_GT_I32_e64;
5982 case AMDGPU::S_CMP_GE_I32: return AMDGPU::V_CMP_GE_I32_e64;
5983 case AMDGPU::S_CMP_LT_I32: return AMDGPU::V_CMP_LT_I32_e64;
5984 case AMDGPU::S_CMP_LE_I32: return AMDGPU::V_CMP_LE_I32_e64;
5985 case AMDGPU::S_CMP_EQ_U32: return AMDGPU::V_CMP_EQ_U32_e64;
5986 case AMDGPU::S_CMP_LG_U32: return AMDGPU::V_CMP_NE_U32_e64;
5987 case AMDGPU::S_CMP_GT_U32: return AMDGPU::V_CMP_GT_U32_e64;
5988 case AMDGPU::S_CMP_GE_U32: return AMDGPU::V_CMP_GE_U32_e64;
5989 case AMDGPU::S_CMP_LT_U32: return AMDGPU::V_CMP_LT_U32_e64;
5990 case AMDGPU::S_CMP_LE_U32: return AMDGPU::V_CMP_LE_U32_e64;
5991 case AMDGPU::S_CMP_EQ_U64: return AMDGPU::V_CMP_EQ_U64_e64;
5992 case AMDGPU::S_CMP_LG_U64: return AMDGPU::V_CMP_NE_U64_e64;
5993 case AMDGPU::S_BCNT1_I32_B32: return AMDGPU::V_BCNT_U32_B32_e64;
5994 case AMDGPU::S_FF1_I32_B32: return AMDGPU::V_FFBL_B32_e32;
5995 case AMDGPU::S_FLBIT_I32_B32: return AMDGPU::V_FFBH_U32_e32;
5996 case AMDGPU::S_FLBIT_I32: return AMDGPU::V_FFBH_I32_e64;
  // The SCC-based conditional branches map to the VCC-based branch opcodes.
5997 case AMDGPU::S_CBRANCH_SCC0: return AMDGPU::S_CBRANCH_VCCZ;
5998 case AMDGPU::S_CBRANCH_SCC1: return AMDGPU::S_CBRANCH_VCCNZ;
5999 case AMDGPU::S_CVT_F32_I32: return AMDGPU::V_CVT_F32_I32_e64;
6000 case AMDGPU::S_CVT_F32_U32: return AMDGPU::V_CVT_F32_U32_e64;
6001 case AMDGPU::S_CVT_I32_F32: return AMDGPU::V_CVT_I32_F32_e64;
6002 case AMDGPU::S_CVT_U32_F32: return AMDGPU::V_CVT_U32_F32_e64;
  // S_CVT_F32_F16 falls through: it shares one mapping with S_CVT_HI_F32_F16.
6003 case AMDGPU::S_CVT_F32_F16:
6004 case AMDGPU::S_CVT_HI_F32_F16:
6005 return ST.useRealTrue16Insts() ? AMDGPU::V_CVT_F32_F16_t16_e64
6006 : AMDGPU::V_CVT_F32_F16_fake16_e64;
6007 case AMDGPU::S_CVT_F16_F32:
6008 return ST.useRealTrue16Insts() ? AMDGPU::V_CVT_F16_F32_t16_e64
6009 : AMDGPU::V_CVT_F16_F32_fake16_e64;
6010 case AMDGPU::S_CEIL_F32: return AMDGPU::V_CEIL_F32_e64;
6011 case AMDGPU::S_FLOOR_F32: return AMDGPU::V_FLOOR_F32_e64;
6012 case AMDGPU::S_TRUNC_F32: return AMDGPU::V_TRUNC_F32_e64;
6013 case AMDGPU::S_RNDNE_F32: return AMDGPU::V_RNDNE_F32_e64;
6014 case AMDGPU::S_CEIL_F16:
6015 return ST.useRealTrue16Insts() ? AMDGPU::V_CEIL_F16_t16_e64
6016 : AMDGPU::V_CEIL_F16_fake16_e64;
6017 case AMDGPU::S_FLOOR_F16:
6018 return ST.useRealTrue16Insts() ? AMDGPU::V_FLOOR_F16_t16_e64
6019 : AMDGPU::V_FLOOR_F16_fake16_e64;
6020 case AMDGPU::S_TRUNC_F16:
6021 return ST.useRealTrue16Insts() ? AMDGPU::V_TRUNC_F16_t16_e64
6022 : AMDGPU::V_TRUNC_F16_fake16_e64;
6023 case AMDGPU::S_RNDNE_F16:
6024 return ST.useRealTrue16Insts() ? AMDGPU::V_RNDNE_F16_t16_e64
6025 : AMDGPU::V_RNDNE_F16_fake16_e64;
6026 case AMDGPU::S_ADD_F32: return AMDGPU::V_ADD_F32_e64;
6027 case AMDGPU::S_SUB_F32: return AMDGPU::V_SUB_F32_e64;
6028 case AMDGPU::S_MIN_F32: return AMDGPU::V_MIN_F32_e64;
6029 case AMDGPU::S_MAX_F32: return AMDGPU::V_MAX_F32_e64;
6030 case AMDGPU::S_MINIMUM_F32: return AMDGPU::V_MINIMUM_F32_e64;
6031 case AMDGPU::S_MAXIMUM_F32: return AMDGPU::V_MAXIMUM_F32_e64;
6032 case AMDGPU::S_MUL_F32: return AMDGPU::V_MUL_F32_e64;
6033 case AMDGPU::S_ADD_F16:
6034 return ST.useRealTrue16Insts() ? AMDGPU::V_ADD_F16_t16_e64
6035 : AMDGPU::V_ADD_F16_fake16_e64;
6036 case AMDGPU::S_SUB_F16:
6037 return ST.useRealTrue16Insts() ? AMDGPU::V_SUB_F16_t16_e64
6038 : AMDGPU::V_SUB_F16_fake16_e64;
6039 case AMDGPU::S_MIN_F16:
6040 return ST.useRealTrue16Insts() ? AMDGPU::V_MIN_F16_t16_e64
6041 : AMDGPU::V_MIN_F16_fake16_e64;
6042 case AMDGPU::S_MAX_F16:
6043 return ST.useRealTrue16Insts() ? AMDGPU::V_MAX_F16_t16_e64
6044 : AMDGPU::V_MAX_F16_fake16_e64;
6045 case AMDGPU::S_MINIMUM_F16:
6046 return ST.useRealTrue16Insts() ? AMDGPU::V_MINIMUM_F16_t16_e64
6047 : AMDGPU::V_MINIMUM_F16_fake16_e64;
6048 case AMDGPU::S_MAXIMUM_F16:
6049 return ST.useRealTrue16Insts() ? AMDGPU::V_MAXIMUM_F16_t16_e64
6050 : AMDGPU::V_MAXIMUM_F16_fake16_e64;
6051 case AMDGPU::S_MUL_F16:
6052 return ST.useRealTrue16Insts() ? AMDGPU::V_MUL_F16_t16_e64
6053 : AMDGPU::V_MUL_F16_fake16_e64;
6054 case AMDGPU::S_CVT_PK_RTZ_F16_F32: return AMDGPU::V_CVT_PKRTZ_F16_F32_e64;
6055 case AMDGPU::S_FMAC_F32: return AMDGPU::V_FMAC_F32_e64;
6056 case AMDGPU::S_FMAC_F16:
6057 return ST.useRealTrue16Insts() ? AMDGPU::V_FMAC_F16_t16_e64
6058 : AMDGPU::V_FMAC_F16_fake16_e64;
6059 case AMDGPU::S_FMAMK_F32: return AMDGPU::V_FMAMK_F32;
6060 case AMDGPU::S_FMAAK_F32: return AMDGPU::V_FMAAK_F32;
6061 case AMDGPU::S_CMP_LT_F32: return AMDGPU::V_CMP_LT_F32_e64;
6062 case AMDGPU::S_CMP_EQ_F32: return AMDGPU::V_CMP_EQ_F32_e64;
6063 case AMDGPU::S_CMP_LE_F32: return AMDGPU::V_CMP_LE_F32_e64;
6064 case AMDGPU::S_CMP_GT_F32: return AMDGPU::V_CMP_GT_F32_e64;
6065 case AMDGPU::S_CMP_LG_F32: return AMDGPU::V_CMP_LG_F32_e64;
6066 case AMDGPU::S_CMP_GE_F32: return AMDGPU::V_CMP_GE_F32_e64;
6067 case AMDGPU::S_CMP_O_F32: return AMDGPU::V_CMP_O_F32_e64;
6068 case AMDGPU::S_CMP_U_F32: return AMDGPU::V_CMP_U_F32_e64;
6069 case AMDGPU::S_CMP_NGE_F32: return AMDGPU::V_CMP_NGE_F32_e64;
6070 case AMDGPU::S_CMP_NLG_F32: return AMDGPU::V_CMP_NLG_F32_e64;
6071 case AMDGPU::S_CMP_NGT_F32: return AMDGPU::V_CMP_NGT_F32_e64;
6072 case AMDGPU::S_CMP_NLE_F32: return AMDGPU::V_CMP_NLE_F32_e64;
6073 case AMDGPU::S_CMP_NEQ_F32: return AMDGPU::V_CMP_NEQ_F32_e64;
6074 case AMDGPU::S_CMP_NLT_F32: return AMDGPU::V_CMP_NLT_F32_e64;
6075 case AMDGPU::S_CMP_LT_F16:
6076 return ST.useRealTrue16Insts() ? AMDGPU::V_CMP_LT_F16_t16_e64
6077 : AMDGPU::V_CMP_LT_F16_fake16_e64;
6078 case AMDGPU::S_CMP_EQ_F16:
6079 return ST.useRealTrue16Insts() ? AMDGPU::V_CMP_EQ_F16_t16_e64
6080 : AMDGPU::V_CMP_EQ_F16_fake16_e64;
6081 case AMDGPU::S_CMP_LE_F16:
6082 return ST.useRealTrue16Insts() ? AMDGPU::V_CMP_LE_F16_t16_e64
6083 : AMDGPU::V_CMP_LE_F16_fake16_e64;
6084 case AMDGPU::S_CMP_GT_F16:
6085 return ST.useRealTrue16Insts() ? AMDGPU::V_CMP_GT_F16_t16_e64
6086 : AMDGPU::V_CMP_GT_F16_fake16_e64;
6087 case AMDGPU::S_CMP_LG_F16:
6088 return ST.useRealTrue16Insts() ? AMDGPU::V_CMP_LG_F16_t16_e64
6089 : AMDGPU::V_CMP_LG_F16_fake16_e64;
6090 case AMDGPU::S_CMP_GE_F16:
6091 return ST.useRealTrue16Insts() ? AMDGPU::V_CMP_GE_F16_t16_e64
6092 : AMDGPU::V_CMP_GE_F16_fake16_e64;
6093 case AMDGPU::S_CMP_O_F16:
6094 return ST.useRealTrue16Insts() ? AMDGPU::V_CMP_O_F16_t16_e64
6095 : AMDGPU::V_CMP_O_F16_fake16_e64;
6096 case AMDGPU::S_CMP_U_F16:
6097 return ST.useRealTrue16Insts() ? AMDGPU::V_CMP_U_F16_t16_e64
6098 : AMDGPU::V_CMP_U_F16_fake16_e64;
6099 case AMDGPU::S_CMP_NGE_F16:
6100 return ST.useRealTrue16Insts() ? AMDGPU::V_CMP_NGE_F16_t16_e64
6101 : AMDGPU::V_CMP_NGE_F16_fake16_e64;
6102 case AMDGPU::S_CMP_NLG_F16:
6103 return ST.useRealTrue16Insts() ? AMDGPU::V_CMP_NLG_F16_t16_e64
6104 : AMDGPU::V_CMP_NLG_F16_fake16_e64;
6105 case AMDGPU::S_CMP_NGT_F16:
6106 return ST.useRealTrue16Insts() ? AMDGPU::V_CMP_NGT_F16_t16_e64
6107 : AMDGPU::V_CMP_NGT_F16_fake16_e64;
6108 case AMDGPU::S_CMP_NLE_F16:
6109 return ST.useRealTrue16Insts() ? AMDGPU::V_CMP_NLE_F16_t16_e64
6110 : AMDGPU::V_CMP_NLE_F16_fake16_e64;
6111 case AMDGPU::S_CMP_NEQ_F16:
6112 return ST.useRealTrue16Insts() ? AMDGPU::V_CMP_NEQ_F16_t16_e64
6113 : AMDGPU::V_CMP_NEQ_F16_fake16_e64;
6114 case AMDGPU::S_CMP_NLT_F16:
6115 return ST.useRealTrue16Insts() ? AMDGPU::V_CMP_NLT_F16_t16_e64
6116 : AMDGPU::V_CMP_NLT_F16_fake16_e64;
  // The V_S_* opcodes map to the like-named V_* opcodes.
6117 case AMDGPU::V_S_EXP_F32_e64: return AMDGPU::V_EXP_F32_e64;
6118 case AMDGPU::V_S_EXP_F16_e64:
6119 return ST.useRealTrue16Insts() ? AMDGPU::V_EXP_F16_t16_e64
6120 : AMDGPU::V_EXP_F16_fake16_e64;
6121 case AMDGPU::V_S_LOG_F32_e64: return AMDGPU::V_LOG_F32_e64;
6122 case AMDGPU::V_S_LOG_F16_e64:
6123 return ST.useRealTrue16Insts() ? AMDGPU::V_LOG_F16_t16_e64
6124 : AMDGPU::V_LOG_F16_fake16_e64;
6125 case AMDGPU::V_S_RCP_F32_e64: return AMDGPU::V_RCP_F32_e64;
6126 case AMDGPU::V_S_RCP_F16_e64:
6127 return ST.useRealTrue16Insts() ? AMDGPU::V_RCP_F16_t16_e64
6128 : AMDGPU::V_RCP_F16_fake16_e64;
6129 case AMDGPU::V_S_RSQ_F32_e64: return AMDGPU::V_RSQ_F32_e64;
6130 case AMDGPU::V_S_RSQ_F16_e64:
6131 return ST.useRealTrue16Insts() ? AMDGPU::V_RSQ_F16_t16_e64
6132 : AMDGPU::V_RSQ_F16_fake16_e64;
6133 case AMDGPU::V_S_SQRT_F32_e64: return AMDGPU::V_SQRT_F32_e64;
6134 case AMDGPU::V_S_SQRT_F16_e64:
6135 return ST.useRealTrue16Insts() ? AMDGPU::V_SQRT_F16_t16_e64
6136 : AMDGPU::V_SQRT_F16_fake16_e64;
6137 }
  // NOTE(review): the next line is the tail of a call whose opening line is
  // not present in this listing. Every switch path above returns (the default
  // included), so control does not reach this point.
6139 "Unexpected scalar opcode without corresponding vector one!");
6140}
6141
6142// clang-format on
6143
6147 const DebugLoc &DL, Register Reg,
6148 bool IsSCCLive,
6149 SlotIndexes *Indexes) const {
  // NOTE(review): the opening of this signature is absent from this listing;
  // MF, MBB and MBBI used below are presumably its leading parameters.
  //
  // Saves the current EXEC value into Reg and then sets EXEC to all ones,
  // inserting the new instructions before MBBI. IsSCCLive selects a sequence
  // that leaves SCC untouched. When Indexes is non-null, every new instruction
  // is also registered in the slot-index maps.
6150 const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>();
6151 const SIInstrInfo *TII = ST.getInstrInfo();
  // NOTE(review): LMC is used below, but its declaration is not among the
  // visible lines.
6153 if (IsSCCLive) {
6154 // Insert two move instructions, one to save the original value of EXEC and
6155 // the other to turn on all bits in EXEC. This is required as we can't use
6156 // the single instruction S_OR_SAVEEXEC that clobbers SCC.
  // NOTE(review): the continuation of the StoreExecMI statement (its source
  // operand and terminating ';') is missing from this listing.
6157 auto StoreExecMI = BuildMI(MBB, MBBI, DL, TII->get(LMC.MovOpc), Reg)
6159 auto FlipExecMI =
6160 BuildMI(MBB, MBBI, DL, TII->get(LMC.MovOpc), LMC.ExecReg).addImm(-1);
6161 if (Indexes) {
6162 Indexes->insertMachineInstrInMaps(*StoreExecMI);
6163 Indexes->insertMachineInstrInMaps(*FlipExecMI);
6164 }
6165 } else {
  // SCC is not live, so a single LMC.OrSaveExecOpc with immediate -1 does the
  // save-and-enable in one step.
6166 auto SaveExec =
6167 BuildMI(MBB, MBBI, DL, TII->get(LMC.OrSaveExecOpc), Reg).addImm(-1);
6168 SaveExec->getOperand(3).setIsDead(); // Mark SCC as dead.
6169 if (Indexes)
6170 Indexes->insertMachineInstrInMaps(*SaveExec);
6171 }
6172}
6173
6176 const DebugLoc &DL, Register Reg,
6177 SlotIndexes *Indexes) const {
  // NOTE(review): the opening of this signature and the declaration of LMC are
  // absent from this listing; MBB and MBBI are presumably leading parameters.
  //
  // Emits one LMC.MovOpc before MBBI that writes LMC.ExecReg from Reg. Reg is
  // added with the Kill flag, so this move is marked as its last use.
6179 auto ExecRestoreMI = BuildMI(MBB, MBBI, DL, get(LMC.MovOpc), LMC.ExecReg)
6180 .addReg(Reg, RegState::Kill);
  // Keep the slot-index maps in sync when the caller supplies them.
6181 if (Indexes)
6182 Indexes->insertMachineInstrInMaps(*ExecRestoreMI);
6183}
6184
6188 "Not a whole wave func");
  // NOTE(review): the signature and the opening of the statement that ends on
  // the line above are absent from this listing (presumably an assert on MF).
  //
  // Only the first basic block of MF is scanned. The first instruction whose
  // opcode is SI_WHOLE_WAVE_FUNC_SETUP or G_AMDGPU_WHOLE_WAVE_FUNC_SETUP is
  // returned by address.
6189 MachineBasicBlock &MBB = *MF.begin();
6190 for (MachineInstr &MI : MBB)
6191 if (MI.getOpcode() == AMDGPU::SI_WHOLE_WAVE_FUNC_SETUP ||
6192 MI.getOpcode() == AMDGPU::G_AMDGPU_WHOLE_WAVE_FUNC_SETUP)
6193 return &MI;
6194
  // NOTE(review): the message below spells the opcode SI_SETUP_WHOLE_WAVE_FUNC,
  // while the opcode tested above is SI_WHOLE_WAVE_FUNC_SETUP.
6195 llvm_unreachable("Couldn't find SI_SETUP_WHOLE_WAVE_FUNC instruction");
6196}
6197
6199 unsigned OpNo) const {
  // NOTE(review): the first line of this signature is absent from this listing.
  //
  // Returns the register class for operand OpNo of MI. The instruction
  // descriptor is used when it constrains the operand; otherwise the class is
  // derived from the register currently held by the operand.
6200 const MCInstrDesc &Desc = get(MI.getOpcode());
  // Fallback path: the descriptor has nothing for this operand (variadic
  // instruction, OpNo past the described operands, or RegClass == -1).
  // NOTE(review): getReg() is called here without an isReg() check; assumes
  // callers only reach this path with register operands - confirm.
6201 if (MI.isVariadic() || OpNo >= Desc.getNumOperands() ||
6202 Desc.operands()[OpNo].RegClass == -1) {
6203 Register Reg = MI.getOperand(OpNo).getReg();
6204
6205 if (Reg.isVirtual()) {
6206 const MachineRegisterInfo &MRI = MI.getMF()->getRegInfo();
6207 return MRI.getRegClass(Reg);
6208 }
6209 return RI.getPhysRegBaseClass(Reg);
6210 }
6211
  // Descriptor path: a negative class ID yields nullptr.
6212 int16_t RegClass = getOpRegClassID(Desc.operands()[OpNo]);
6213 return RegClass < 0 ? nullptr : RI.getRegClass(RegClass);
6214}
6215
  // NOTE(review): the signature lines are absent from this listing. MI and
  // OpIdx are presumably parameters; `I` (the insertion point used below) is
  // not declared in the visible lines.
  //
  // Moves operand OpIdx of MI into a fresh virtual register using an
  // instruction inserted at I, then rewrites the operand to use that register.
6218 MachineBasicBlock *MBB = MI.getParent();
6219 MachineOperand &MO = MI.getOperand(OpIdx);
6220 MachineRegisterInfo &MRI = MBB->getParent()->getRegInfo();
6221 unsigned RCID = getOpRegClassID(get(MI.getOpcode()).operands()[OpIdx]);
6222 const TargetRegisterClass *RC = RI.getRegClass(RCID);
6223 unsigned Size = RI.getRegSizeInBits(*RC);
  // Default to a vector move sized after the operand's descriptor class:
  // 64 bits -> V_MOV_B64_PSEUDO, 16 bits -> V_MOV_B16_t16_e64, else 32-bit.
6224 unsigned Opcode = (Size == 64) ? AMDGPU::V_MOV_B64_PSEUDO
6225 : Size == 16 ? AMDGPU::V_MOV_B16_t16_e64
6226 : AMDGPU::V_MOV_B32_e32;
  // Register operands are copied with COPY; non-register operands whose
  // descriptor class is an SGPR class use a scalar move instead.
6227 if (MO.isReg())
6228 Opcode = AMDGPU::COPY;
6229 else if (RI.isSGPRClass(RC))
6230 Opcode = (Size == 64) ? AMDGPU::S_MOV_B64 : AMDGPU::S_MOV_B32;
6231
  // The destination register is always created in the VGPR class equivalent
  // to RC, whichever opcode was chosen above.
6232 const TargetRegisterClass *VRC = RI.getEquivalentVGPRClass(RC);
6233 Register Reg = MRI.createVirtualRegister(VRC);
6234 DebugLoc DL = MBB->findDebugLoc(I);
6235 BuildMI(*MI.getParent(), I, DL, get(Opcode), Reg).add(MO);
6236 MO.ChangeToRegister(Reg, false);
6237}
6238
6241 const MachineOperand &SuperReg, const TargetRegisterClass *SuperRC,
6242 unsigned SubIdx, const TargetRegisterClass *SubRC) const {
  // NOTE(review): the opening of this signature is absent from this listing;
  // MI (the insertion point) and MRI are presumably its leading parameters.
  //
  // Returns a register holding sub-register SubIdx of SuperReg. SuperRC is not
  // read in the visible body.

  // Non-virtual registers: ask the register info for the sub-register
  // directly; no instruction is emitted.
6243 if (!SuperReg.getReg().isVirtual())
6244 return RI.getSubReg(SuperReg.getReg(), SubIdx);
6245
6246 MachineBasicBlock *MBB = MI->getParent();
6247 const DebugLoc &DL = MI->getDebugLoc();
6248 Register SubReg = MRI.createVirtualRegister(SubRC);
6249
  // SuperReg may itself carry a sub-register index, so compose it with SubIdx
  // before emitting the COPY into the new SubRC register.
6250 unsigned NewSubIdx = RI.composeSubRegIndices(SuperReg.getSubReg(), SubIdx);
6251 BuildMI(*MBB, MI, DL, get(TargetOpcode::COPY), SubReg)
6252 .addReg(SuperReg.getReg(), {}, NewSubIdx);
6253 return SubReg;
6254}
6255
6258 const MachineOperand &Op, const TargetRegisterClass *SuperRC,
6259 unsigned SubIdx, const TargetRegisterClass *SubRC) const {
  // NOTE(review): the opening of this signature is absent from this listing;
  // MII and MRI are presumably its leading parameters.
  //
  // Immediate operands are split arithmetically: sub0 yields the low 32 bits
  // and sub1 the high 32 bits, each cast to int32_t. Any other sub-register
  // index on an immediate is treated as unreachable.
6260 if (Op.isImm()) {
6261 if (SubIdx == AMDGPU::sub0)
6262 return MachineOperand::CreateImm(static_cast<int32_t>(Op.getImm()));
6263 if (SubIdx == AMDGPU::sub1)
6264 return MachineOperand::CreateImm(static_cast<int32_t>(Op.getImm() >> 32));
6265
6266 llvm_unreachable("Unhandled register index for immediate");
6267 }
6268
  // Non-immediate operands are forwarded to buildExtractSubReg and the
  // resulting register is wrapped in a register operand.
6269 unsigned SubReg = buildExtractSubReg(MII, MRI, Op, SuperRC,
6270 SubIdx, SubRC);
6271 return MachineOperand::CreateReg(SubReg, false);
6272}
6273
6274// Change the order of operands from (0, 1, 2) to (0, 2, 1)
6275void SIInstrInfo::swapOperands(MachineInstr &Inst) const {
6276 assert(Inst.getNumExplicitOperands() == 3);
6277 MachineOperand Op1 = Inst.getOperand(1);
6278 Inst.removeOperand(1);
6279 Inst.addOperand(Op1);
6280}
6281
6283 const MCOperandInfo &OpInfo,
6284 const MachineOperand &MO) const {
  // NOTE(review): the first line of this signature is absent from this listing;
  // MRI is presumably its leading parameter.
  //
  // Returns whether register operand MO is compatible with the register class
  // that OpInfo requires. Non-register operands are never legal here.
6285 if (!MO.isReg())
6286 return false;
6287
6288 Register Reg = MO.getReg();
6289
  // DRC is the class required by the operand descriptor. A physical register
  // simply has to be a member of it.
6290 const TargetRegisterClass *DRC = RI.getRegClass(getOpRegClassID(OpInfo));
6291 if (Reg.isPhysical())
6292 return DRC->contains(Reg);
6293
6294 const TargetRegisterClass *RC = MRI.getRegClass(Reg);
6295
  // With a sub-register index, widen RC to its largest legal super class and
  // check that some class matches DRC through that sub-register index.
6296 if (MO.getSubReg()) {
6297 const MachineFunction *MF = MO.getParent()->getMF();
6298 const TargetRegisterClass *SuperRC = RI.getLargestLegalSuperClass(RC, *MF);
6299 if (!SuperRC)
6300 return false;
6301 return RI.getMatchingSuperRegClass(SuperRC, DRC, MO.getSubReg()) != nullptr;
6302 }
6303
  // Plain virtual register: legal when RC and DRC share a common subclass.
6304 return RI.getCommonSubClass(DRC, RC) != nullptr;
6305}
6306
6308 const MachineOperand &MO) const {
  // NOTE(review): the first line of this signature is absent from this listing;
  // MI and OpIdx are presumably its leading parameters.
  //
  // Instruction-aware check that register operand MO may be placed at operand
  // index OpIdx of MI. It applies the descriptor-based register-class check
  // plus the per-instruction restrictions below; any failed check returns
  // false.
6309 const MachineRegisterInfo &MRI = MI.getMF()->getRegInfo();
6310 const MCOperandInfo OpInfo = MI.getDesc().operands()[OpIdx];
6311 unsigned Opc = MI.getOpcode();
6312
6313 // See SIInstrInfo::isLegalGFX12PlusPackedMathFP32or64BitOperand for more
6314 // information.
  // NOTE(review): one line of the `if` condition inside the loop below is
  // absent from this listing. The loop also indexes OpNames[I] although the
  // structured binding already provides OpName.
6315 if (AMDGPU::isPackedFP32or64BitInst(MI.getOpcode()) &&
6316 AMDGPU::isGFX12Plus(ST) && MO.isReg() && RI.isSGPRReg(MRI, MO.getReg())) {
6317 constexpr AMDGPU::OpName OpNames[] = {
6318 AMDGPU::OpName::src0, AMDGPU::OpName::src1, AMDGPU::OpName::src2};
6319
6320 for (auto [I, OpName] : enumerate(OpNames)) {
6321 int SrcIdx = AMDGPU::getNamedOperandIdx(MI.getOpcode(), OpNames[I]);
6322 if (static_cast<unsigned>(SrcIdx) == OpIdx &&
6324 return false;
6325 }
6326 }
6327
  // Descriptor-level register class compatibility.
6328 if (!isLegalRegOperand(MRI, OpInfo, MO))
6329 return false;
6330
6331 // check Accumulate GPR operand
6332 bool IsAGPR = RI.isAGPR(MRI, MO.getReg());
6333 if (IsAGPR && !ST.hasMAIInsts())
6334 return false;
  // AGPRs are rejected on memory, DS and MIMG instructions unless the
  // subtarget has GFX90A instructions and reserved registers are frozen.
6335 if (IsAGPR && (!ST.hasGFX90AInsts() || !MRI.reservedRegsFrozen()) &&
6336 (MI.mayLoad() || MI.mayStore() || isDS(Opc) || isMIMG(Opc)))
6337 return false;
6338 // Atomics should have both vdst and vdata either vgpr or agpr.
  // The data operand is named data0 on DS instructions and vdata elsewhere.
6339 const int VDstIdx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::vdst);
6340 const int DataIdx = AMDGPU::getNamedOperandIdx(
6341 Opc, isDS(Opc) ? AMDGPU::OpName::data0 : AMDGPU::OpName::vdata);
6342 if ((int)OpIdx == VDstIdx && DataIdx != -1 &&
6343 MI.getOperand(DataIdx).isReg() &&
6344 RI.isAGPR(MRI, MI.getOperand(DataIdx).getReg()) != IsAGPR)
6345 return false;
6346 if ((int)OpIdx == DataIdx) {
6347 if (VDstIdx != -1 &&
6348 RI.isAGPR(MRI, MI.getOperand(VDstIdx).getReg()) != IsAGPR)
6349 return false;
6350 // DS instructions with 2 src operands also must have tied RC.
6351 const int Data1Idx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::data1);
6352 if (Data1Idx != -1 && MI.getOperand(Data1Idx).isReg() &&
6353 RI.isAGPR(MRI, MI.getOperand(Data1Idx).getReg()) != IsAGPR)
6354 return false;
6355 }
6356
6357 // Check V_ACCVGPR_WRITE_B32_e64
  // Without GFX90A instructions, src0 of this opcode may not be an SGPR.
6358 if (Opc == AMDGPU::V_ACCVGPR_WRITE_B32_e64 && !ST.hasGFX90AInsts() &&
6359 (int)OpIdx == AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::src0) &&
6360 RI.isSGPRReg(MRI, MO.getReg()))
6361 return false;
6362
  // On subtargets with the flat-scratch-hi hazard, a SALU instruction may not
  // take SRC_FLAT_SCRATCH_BASE_HI when its sdst is 64 bits wide or when it is
  // one of the 64-bit S_BITCMP opcodes.
6363 if (ST.hasFlatScratchHiInB64InstHazard() &&
6364 MO.getReg() == AMDGPU::SRC_FLAT_SCRATCH_BASE_HI && isSALU(MI)) {
6365 if (const MachineOperand *Dst = getNamedOperand(MI, AMDGPU::OpName::sdst)) {
6366 if (AMDGPU::getRegBitWidth(*RI.getRegClassForReg(MRI, Dst->getReg())) ==
6367 64)
6368 return false;
6369 }
6370 if (Opc == AMDGPU::S_BITCMP0_B64 || Opc == AMDGPU::S_BITCMP1_B64)
6371 return false;
6372 }
  // DPP instructions accept an SGPR in src1 only when the subtarget reports
  // hasDPPSrc1SGPR().
6373 if (!ST.hasDPPSrc1SGPR() && isDPP(MI) && RI.isSGPRReg(MRI, MO.getReg()) &&
6374 (int)OpIdx == AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::src1))
6375 return false;
6376
6377 return true;
6378}
6379
6381 const MCOperandInfo &OpInfo,
6382 const MachineOperand &MO) const {
  // NOTE(review): the first line of this signature is absent from this listing;
  // MRI is presumably its leading parameter.
  //
  // Register operands are validated against OpInfo's register class; every
  // other accepted operand kind is considered legal.
6383 if (MO.isReg())
6384 return isLegalRegOperand(MRI, OpInfo, MO);
6385
6386 // Handle non-register types that are treated like immediates.
6387 assert(MO.isImm() || MO.isTargetIndex() || MO.isFI() || MO.isGlobal());
6388 return true;
6389}
6390
6392 const MachineRegisterInfo &MRI, const MachineInstr &MI, unsigned SrcN,
6393 const MachineOperand *MO) const {
  // NOTE(review): the first line of this signature is absent from this listing.
  //
  // Checks source number SrcN (0..2) of MI, or the candidate operand MO when
  // it is non-null. A non-SGPR source is always accepted. An SGPR source is
  // accepted only when the matching srcN_modifiers operand exists and has both
  // OP_SEL_0 and OP_SEL_1 clear.
6394 constexpr unsigned NumOps = 3;
  // First half: source operand names. Second half: the matching modifier
  // operand names, addressed below as OpNames[NumOps + SrcN].
6395 constexpr AMDGPU::OpName OpNames[NumOps * 2] = {
6396 AMDGPU::OpName::src0, AMDGPU::OpName::src1,
6397 AMDGPU::OpName::src2, AMDGPU::OpName::src0_modifiers,
6398 AMDGPU::OpName::src1_modifiers, AMDGPU::OpName::src2_modifiers};
6399
6400 assert(SrcN < NumOps);
6401
  // No candidate supplied: look at the operand already on MI. An instruction
  // without that source has nothing to reject.
6402 if (!MO) {
6403 int SrcIdx = AMDGPU::getNamedOperandIdx(MI.getOpcode(), OpNames[SrcN]);
6404 if (SrcIdx == -1)
6405 return true;
6406 MO = &MI.getOperand(SrcIdx);
6407 }
6408
6409 if (!MO->isReg() || !RI.isSGPRReg(MRI, MO->getReg()))
6410 return true;
6411
  // An SGPR source on an instruction with no modifiers operand is rejected.
6412 int ModsIdx =
6413 AMDGPU::getNamedOperandIdx(MI.getOpcode(), OpNames[NumOps + SrcN]);
6414 if (ModsIdx == -1)
6415 return false;
6416
6417 unsigned Mods = MI.getOperand(ModsIdx).getImm();
6418 bool OpSel = Mods & SISrcMods::OP_SEL_0;
6419 bool OpSelHi = Mods & SISrcMods::OP_SEL_1;
6420
6421 return !OpSel && !OpSelHi;
6422}
6423
6425 const MachineOperand *MO) const {
  // NOTE(review): the first line of this signature is absent from this listing;
  // MI and OpIdx are presumably its leading parameters.
  //
  // Decides whether MO (or, when MO is null, the operand currently at OpIdx)
  // is acceptable at operand index OpIdx of MI. In order: constant-bus and
  // literal budgets for VALU instructions, the single-literal rule for SALU
  // instructions, an inline-constant restriction for F16 pseudo scalar trans
  // opcodes, then register-class or immediate legality.
6426 const MachineFunction &MF = *MI.getMF();
6427 const MachineRegisterInfo &MRI = MF.getRegInfo();
6428 const MCInstrDesc &InstDesc = MI.getDesc();
6429 const MCOperandInfo &OpInfo = InstDesc.operands()[OpIdx];
6430 int64_t RegClass = getOpRegClassID(OpInfo);
  // DefinedRC stays null when the descriptor gives no register class here.
6431 const TargetRegisterClass *DefinedRC =
6432 RegClass != -1 ? RI.getRegClass(RegClass) : nullptr;
6433 if (!MO)
6434 MO = &MI.getOperand(OpIdx);
6435
6436 const bool IsInlineConst = !MO->isReg() && isInlineConstant(*MO, OpInfo);
6437
  // VALU path: taken only when MO itself would occupy the constant bus.
6438 if (isVALU(MI, /*AllowLDSDMA=*/true) && !IsInlineConst &&
6439 usesConstantBus(MRI, *MO, OpInfo)) {
6440 const MachineOperand *UsedLiteral = nullptr;
6441
  // One literal is allowed unless MI is VOP3 and the subtarget lacks VOP3
  // literals, in which case none is.
6442 int ConstantBusLimit = ST.getConstantBusLimit(MI.getOpcode());
6443 int LiteralLimit = !isVOP3(MI) || ST.hasVOP3Literal() ? 1 : 0;
6444
6445 // TODO: Be more permissive with frame indexes.
6446 if (!MO->isReg() && !isInlineConstant(*MO, OpInfo)) {
6447 if (!LiteralLimit--)
6448 return false;
6449
6450 UsedLiteral = MO;
6451 }
6452
  // NOTE(review): SGPRsUsed is used below, but its declaration is not among
  // the visible lines.
6454 if (MO->isReg())
6455 SGPRsUsed.insert(RegSubRegPair(MO->getReg(), MO->getSubReg()));
6456
  // Walk the remaining operands. Each distinct constant-bus register use and
  // each distinct literal decrements ConstantBusLimit; reaching zero rejects
  // MO, since MO itself also needs the constant bus.
6457 for (unsigned i = 0, e = MI.getNumOperands(); i != e; ++i) {
6458 if (i == OpIdx)
6459 continue;
6460 const MachineOperand &Op = MI.getOperand(i);
6461 if (Op.isReg()) {
6462 if (Op.isUse()) {
6463 RegSubRegPair SGPR(Op.getReg(), Op.getSubReg());
6464 if (regUsesConstantBus(Op, MRI) && SGPRsUsed.insert(SGPR).second) {
6465 if (--ConstantBusLimit <= 0)
6466 return false;
6467 }
6468 }
6469 } else if (AMDGPU::isSISrcOperand(InstDesc.operands()[i]) &&
6470 !isInlineConstant(Op, InstDesc.operands()[i])) {
6471 // The same literal may be used multiple times.
6472 if (!UsedLiteral)
6473 UsedLiteral = &Op;
6474 else if (UsedLiteral->isIdenticalTo(Op))
6475 continue;
6476
6477 if (!LiteralLimit--)
6478 return false;
6479 if (--ConstantBusLimit <= 0)
6480 return false;
6481 }
6482 }
6483 } else if (!IsInlineConst && !MO->isReg() && isSALU(MI)) {
6484 // There can be at most one literal operand, but it can be repeated.
6485 for (unsigned i = 0, e = MI.getNumOperands(); i != e; ++i) {
6486 if (i == OpIdx)
6487 continue;
6488 const MachineOperand &Op = MI.getOperand(i);
6489 if (!Op.isReg() && !Op.isFI() && !Op.isRegMask() &&
6490 !isInlineConstant(Op, InstDesc.operands()[i]) &&
6491 !Op.isIdenticalTo(*MO))
6492 return false;
6493
6494 // Do not fold a non-inlineable and non-register operand into an
6495 // instruction that already has a frame index. The frame index handling
6496 // code could not handle well when a frame index co-exists with another
6497 // non-register operand, unless that operand is an inlineable immediate.
6498 if (Op.isFI())
6499 return false;
6500 }
6501 } else if (IsInlineConst && ST.hasNoF16PseudoScalarTransInlineConstants() &&
6502 isF16PseudoScalarTrans(MI.getOpcode())) {
6503 return false;
6504 }
6505
  // Register operands: without a descriptor class they are legal only for
  // OPERAND_UNKNOWN; otherwise defer to the instruction-aware register check.
6506 if (MO->isReg()) {
6507 if (!DefinedRC)
6508 return OpInfo.OperandType == MCOI::OPERAND_UNKNOWN;
6509 return isLegalRegOperand(MI, OpIdx, *MO);
6510 }
6511
  // 64-bit operand types taking an immediate that is not an inlinable 64-bit
  // literal need the extra encodability checks below.
6512 if (MO->isImm()) {
6513 uint64_t Imm = MO->getImm();
6514 bool Is64BitFPOp = OpInfo.OperandType == AMDGPU::OPERAND_REG_IMM_FP64 ||
6515 OpInfo.OperandType == AMDGPU::OPERAND_REG_IMM_V2FP64;
6516 bool Is64BitOp = Is64BitFPOp ||
6517 OpInfo.OperandType == AMDGPU::OPERAND_REG_IMM_INT64 ||
6518 OpInfo.OperandType == AMDGPU::OPERAND_REG_IMM_V2INT32 ||
6519 OpInfo.OperandType == AMDGPU::OPERAND_REG_IMM_V2FP32 ||
6520 OpInfo.OperandType == AMDGPU::OPERAND_REG_IMM_V2INT64;
6521 if (Is64BitOp &&
6522 !AMDGPU::isInlinableLiteral64(Imm, ST.hasInv2PiInlineImm())) {
  // Reject values that are not valid 32-bit literals unless the subtarget
  // has 64-bit literals and the descriptor size is 4.
6523 if (!AMDGPU::isValid32BitLiteral(Imm, Is64BitFPOp) &&
6524 (!ST.has64BitLiterals() || InstDesc.getSize() != 4))
6525 return false;
6526
6527 // FIXME: We can use sign extended 64-bit literals, but only for signed
6528 // operands. At the moment we do not know if an operand is signed.
6529 // Such operand will be encoded as its low 32 bits and then either
6530 // correctly sign extended or incorrectly zero extended by HW.
6531 // If 64-bit literals are supported and the literal will be encoded
6532 // as full 64 bit we still can use it.
6533 if (!Is64BitFPOp && (int32_t)Imm < 0 &&
6534 (!ST.has64BitLiterals() || AMDGPU::isValid32BitLiteral(Imm, false)))
6535 return false;
6536 }
6537 }
6538
6539 // Handle non-register types that are treated like immediates.
6540 assert(MO->isImm() || MO->isTargetIndex() || MO->isFI() || MO->isGlobal());
6541
6542 if (!DefinedRC) {
6543 // This operand expects an immediate.
6544 return true;
6545 }
6546
  // The descriptor expects a register class but MO is immediate-like; the
  // final decision is delegated to isImmOperandLegal.
6547 return isImmOperandLegal(MI, OpIdx, *MO);
6548}
6549
  // NOTE(review): the line holding this function's signature is absent from
  // this listing; the body below classifies an instruction named MI.
  //
  // Returns true for VALU instructions that are TRANS, DOT or MFMA, or whose
  // opcode is in the explicit list below. It returns false when the subtarget
  // has neither GFX940 nor GFX950 instructions, or when MI is not VALU.
  //
  // NOTE(review): despite the "Only" suffix, these flags are the plain
  // hasGFX950Insts()/hasGFX940Insts() results, and both are treated the same
  // way below.
6551 bool IsGFX950Only = ST.hasGFX950Insts();
6552 bool IsGFX940Only = ST.hasGFX940Insts();
6553
6554 if (!IsGFX950Only && !IsGFX940Only)
6555 return false;
6556
6557 if (!isVALU(MI, /*AllowLDSDMA=*/true))
6558 return false;
6559
6560 // V_COS, V_EXP, V_RCP, etc.
6561 if (isTRANS(MI))
6562 return true;
6563
6564 // DOT2, DOT2C, DOT4, etc.
6565 if (isDOT(MI))
6566 return true;
6567
6568 // MFMA, SMFMA
6569 if (isMFMA(MI))
6570 return true;
6571
  // Remaining opcodes are matched individually; anything not listed yields
  // false.
6572 unsigned Opcode = MI.getOpcode();
6573 switch (Opcode) {
6574 case AMDGPU::V_CVT_PK_BF8_F32_e64:
6575 case AMDGPU::V_CVT_PK_FP8_F32_e64:
6576 case AMDGPU::V_MQSAD_PK_U16_U8_e64:
6577 case AMDGPU::V_MQSAD_U32_U8_e64:
6578 case AMDGPU::V_PK_ADD_F16:
6579 case AMDGPU::V_PK_ADD_F32:
6580 case AMDGPU::V_PK_ADD_I16:
6581 case AMDGPU::V_PK_ADD_U16:
6582 case AMDGPU::V_PK_ASHRREV_I16:
6583 case AMDGPU::V_PK_FMA_F16:
6584 case AMDGPU::V_PK_FMA_F32:
6585 case AMDGPU::V_PK_FMAC_F16_e32:
6586 case AMDGPU::V_PK_FMAC_F16_e64:
6587 case AMDGPU::V_PK_LSHLREV_B16:
6588 case AMDGPU::V_PK_LSHRREV_B16:
6589 case AMDGPU::V_PK_MAD_I16:
6590 case AMDGPU::V_PK_MAD_U16:
6591 case AMDGPU::V_PK_MAX_F16:
6592 case AMDGPU::V_PK_MAX_I16:
6593 case AMDGPU::V_PK_MAX_U16:
6594 case AMDGPU::V_PK_MIN_F16:
6595 case AMDGPU::V_PK_MIN_I16:
6596 case AMDGPU::V_PK_MIN_U16:
6597 case AMDGPU::V_PK_MOV_B32:
6598 case AMDGPU::V_PK_MUL_F16:
6599 case AMDGPU::V_PK_MUL_F32:
6600 case AMDGPU::V_PK_MUL_LO_U16:
6601 case AMDGPU::V_PK_SUB_I16:
6602 case AMDGPU::V_PK_SUB_U16:
6603 case AMDGPU::V_QSAD_PK_U16_U8_e64:
6604 return true;
6605 default:
6606 return false;
6607 }
6608}
6609
6611 MachineInstr &MI) const {
// Legalize the src0/src1 operands of MI. Depending on the opcode and the
// operands, an illegal operand is fixed by inserting V_READFIRSTLANE_B32, by
// swapping src0 and src1 (switching to the commuted opcode), or by falling
// back to legalizeOpWithMove.
6612 unsigned Opc = MI.getOpcode();
6613 const MCInstrDesc &InstrDesc = get(Opc);
6614
6615 int Src0Idx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::src0);
6616 MachineOperand &Src0 = MI.getOperand(Src0Idx);
6617
6618 int Src1Idx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::src1);
6619 MachineOperand &Src1 = MI.getOperand(Src1Idx);
6620
6621 // If there is an implicit SGPR use such as VCC use for v_addc_u32/v_subb_u32
6622 // we need to only have one constant bus use before GFX10.
6623 bool HasImplicitSGPR = findImplicitSGPRRead(MI);
6624 if (HasImplicitSGPR && ST.getConstantBusLimit(Opc) <= 1 && Src0.isReg() &&
6625 RI.isSGPRReg(MRI, Src0.getReg()))
6626 legalizeOpWithMove(MI, Src0Idx);
6627
6628 // Special case: V_WRITELANE_B32 accepts only immediate or SGPR operands for
6629 // both the value to write (src0) and lane select (src1). Fix up non-SGPR
6630 // src0/src1 with V_READFIRSTLANE.
6631 if (Opc == AMDGPU::V_WRITELANE_B32) {
6632 const DebugLoc &DL = MI.getDebugLoc();
6633 if (Src0.isReg() && RI.isVGPR(MRI, Src0.getReg())) {
6634 Register Reg = MRI.createVirtualRegister(&AMDGPU::SReg_32_XM0RegClass);
6635 BuildMI(*MI.getParent(), MI, DL, get(AMDGPU::V_READFIRSTLANE_B32), Reg)
6636 .add(Src0);
6637 Src0.ChangeToRegister(Reg, false);
6638 }
6639 if (Src1.isReg() && RI.isVGPR(MRI, Src1.getReg())) {
6640 Register Reg = MRI.createVirtualRegister(&AMDGPU::SReg_32_XM0RegClass);
// NOTE(review): this DL shadows the identical DL declared at the top of the
// V_WRITELANE_B32 block; the redeclaration is redundant.
6641 const DebugLoc &DL = MI.getDebugLoc();
6642 BuildMI(*MI.getParent(), MI, DL, get(AMDGPU::V_READFIRSTLANE_B32), Reg)
6643 .add(Src1);
6644 Src1.ChangeToRegister(Reg, false);
6645 }
6646 return;
6647 }
6648
6649 // Special case: V_FMAC_F32 and V_FMAC_F16 have src2.
// A src2 that is not a VGPR is legalized with a move.
6650 if (Opc == AMDGPU::V_FMAC_F32_e32 || Opc == AMDGPU::V_FMAC_F16_e32) {
6651 int Src2Idx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::src2);
6652 if (!RI.isVGPR(MRI, MI.getOperand(Src2Idx).getReg()))
6653 legalizeOpWithMove(MI, Src2Idx);
6654 }
6655
6656 // VOP2 src0 instructions support all operand types, so we don't need to check
6657 // their legality. If src1 is already legal, we don't need to do anything.
6658 if (isLegalRegOperand(MRI, InstrDesc.operands()[Src1Idx], Src1))
6659 return;
6660
6661 // Special case: V_READLANE_B32 accepts only immediate or SGPR operands for
6662 // lane select. Fix up using V_READFIRSTLANE, since we assume that the lane
6663 // select is uniform.
6664 if (Opc == AMDGPU::V_READLANE_B32 && Src1.isReg() &&
6665 RI.isVGPR(MRI, Src1.getReg())) {
6666 Register Reg = MRI.createVirtualRegister(&AMDGPU::SReg_32_XM0RegClass);
6667 const DebugLoc &DL = MI.getDebugLoc();
6668 BuildMI(*MI.getParent(), MI, DL, get(AMDGPU::V_READFIRSTLANE_B32), Reg)
6669 .add(Src1);
6670 Src1.ChangeToRegister(Reg, false);
6671 return;
6672 }
6673
6674 // We do not use commuteInstruction here because it is too aggressive and will
6675 // commute if it is possible. We only want to commute here if it improves
6676 // legality. This can be called a fairly large number of times so don't waste
6677 // compile time pointlessly swapping and checking legality again.
6678 if (HasImplicitSGPR || !MI.isCommutable()) {
6679 legalizeOpWithMove(MI, Src1Idx);
6680 return;
6681 }
6682
6683 // If src0 can be used as src1, commuting will make the operands legal.
6684 // Otherwise we have to give up and insert a move.
6685 //
6686 // TODO: Other immediate-like operand kinds could be commuted if there was a
6687 // MachineOperand::ChangeTo* for them.
6688 if ((!Src1.isImm() && !Src1.isReg()) ||
6689 !isLegalRegOperand(MRI, InstrDesc.operands()[Src1Idx], Src0)) {
6690 legalizeOpWithMove(MI, Src1Idx);
6691 return;
6692 }
6693
// No commuted opcode is available: fall back to a move.
6694 int CommutedOpc = commuteOpcode(MI);
6695 if (CommutedOpc == -1) {
6696 legalizeOpWithMove(MI, Src1Idx);
6697 return;
6698 }
6699
6700 MI.setDesc(get(CommutedOpc));
6701
// Swap src0 and src1 by hand. Save src0's register, subregister and kill
// flag first, because Src0 is overwritten with src1's value below.
6702 Register Src0Reg = Src0.getReg();
6703 unsigned Src0SubReg = Src0.getSubReg();
6704 bool Src0Kill = Src0.isKill();
6705
6706 if (Src1.isImm())
6707 Src0.ChangeToImmediate(Src1.getImm());
6708 else if (Src1.isReg()) {
6709 Src0.ChangeToRegister(Src1.getReg(), false, false, Src1.isKill());
6710 Src0.setSubReg(Src1.getSubReg());
6711 } else
6712 llvm_unreachable("Should only have register or immediate operands");
6713
6714 Src1.ChangeToRegister(Src0Reg, false, false, Src0Kill);
6715 Src1.setSubReg(Src0SubReg);
6717}
6718
6719 // Legalize VOP3 operands. All operand types are supported for any operand
6720 // but only one literal constant and only starting from GFX10.
6722 MachineInstr &MI) const {
// Walks src0..src2 of MI, tracking how many constant-bus slots and literals
// are still available, and legalizes (via legalizeOpWithMove) every operand
// that would exceed those budgets.
6723 unsigned Opc = MI.getOpcode();
6724
// Operand indices of src0, src1 and src2; -1 for an operand the opcode does
// not have.
6725 int VOP3Idx[3] = {
6726 AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::src0),
6727 AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::src1),
6728 AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::src2)
6729 };
6730
6731 if (Opc == AMDGPU::V_PERMLANE16_B32_e64 ||
6732 Opc == AMDGPU::V_PERMLANEX16_B32_e64 ||
6733 Opc == AMDGPU::V_PERMLANE_BCAST_B32_e64 ||
6734 Opc == AMDGPU::V_PERMLANE_UP_B32_e64 ||
6735 Opc == AMDGPU::V_PERMLANE_DOWN_B32_e64 ||
6736 Opc == AMDGPU::V_PERMLANE_XOR_B32_e64 ||
6737 Opc == AMDGPU::V_PERMLANE_IDX_GEN_B32_e64) {
6738 // src1 and src2 must be scalar
// Non-SGPR register operands are replaced by a V_READFIRSTLANE_B32 result.
6739 MachineOperand &Src1 = MI.getOperand(VOP3Idx[1]);
6740 const DebugLoc &DL = MI.getDebugLoc();
6741 if (Src1.isReg() && !RI.isSGPRClass(MRI.getRegClass(Src1.getReg()))) {
6742 Register Reg = MRI.createVirtualRegister(&AMDGPU::SReg_32_XM0RegClass);
6743 BuildMI(*MI.getParent(), MI, DL, get(AMDGPU::V_READFIRSTLANE_B32), Reg)
6744 .add(Src1);
6745 Src1.ChangeToRegister(Reg, false);
6746 }
6747 if (VOP3Idx[2] != -1) {
6748 MachineOperand &Src2 = MI.getOperand(VOP3Idx[2]);
6749 if (Src2.isReg() && !RI.isSGPRClass(MRI.getRegClass(Src2.getReg()))) {
6750 Register Reg = MRI.createVirtualRegister(&AMDGPU::SReg_32_XM0RegClass);
6751 BuildMI(*MI.getParent(), MI, DL, get(AMDGPU::V_READFIRSTLANE_B32), Reg)
6752 .add(Src2);
6753 Src2.ChangeToRegister(Reg, false);
6754 }
6755 }
6756 }
6757
6758 // Find the one SGPR operand we are allowed to use.
6759 int ConstantBusLimit = ST.getConstantBusLimit(Opc);
6760 int LiteralLimit = ST.hasVOP3Literal() ? 1 : 0;
6761 SmallDenseSet<unsigned> SGPRsUsed;
6762 Register SGPRReg = findUsedSGPR(MI, VOP3Idx);
6763 if (SGPRReg) {
6764 SGPRsUsed.insert(SGPRReg);
6765 --ConstantBusLimit;
6766 }
6767
6768 for (int Idx : VOP3Idx) {
// Stop at the first source operand this opcode does not have.
6769 if (Idx == -1)
6770 break;
6771 MachineOperand &MO = MI.getOperand(Idx);
6772
6773 if (!MO.isReg()) {
// Inline constants are always accepted and consume no budget.
6774 if (isInlineConstant(MO, get(Opc).operands()[Idx]))
6775 continue;
6776
// A literal consumes both a literal slot and a constant-bus slot.
6777 if (LiteralLimit > 0 && ConstantBusLimit > 0) {
6778 --LiteralLimit;
6779 --ConstantBusLimit;
6780 continue;
6781 }
6782
// Out of budget: legalize the literal with a move. Both counters are still
// decremented, so they may go negative here.
6783 --LiteralLimit;
6784 --ConstantBusLimit;
6785 legalizeOpWithMove(MI, Idx);
6786 continue;
6787 }
6788
6789 if (!RI.isSGPRClass(RI.getRegClassForReg(MRI, MO.getReg())))
6790 continue; // VGPRs are legal
6791
6792 // We can use one SGPR in each VOP3 instruction prior to GFX10
6793 // and two starting from GFX10.
// An SGPR that was already counted does not take another constant-bus slot.
6794 if (SGPRsUsed.count(MO.getReg()))
6795 continue;
6796 if (ConstantBusLimit > 0) {
6797 SGPRsUsed.insert(MO.getReg());
6798 --ConstantBusLimit;
6799 continue;
6800 }
6801
6802 // If we make it this far, then the operand is not legal and we must
6803 // legalize it.
6804 legalizeOpWithMove(MI, Idx);
6805 }
6806
6807 // Special case: V_FMAC_F32 and V_FMAC_F16 have src2 tied to vdst.
6808 if ((Opc == AMDGPU::V_FMAC_F32_e64 || Opc == AMDGPU::V_FMAC_F16_e64) &&
6809 !RI.isVGPR(MRI, MI.getOperand(VOP3Idx[2]).getReg()))
6810 legalizeOpWithMove(MI, VOP3Idx[2]);
6811
6812 // Fix the register class of packed FP32 instructions on gfx12+. See
6813 // SIInstrInfo::isLegalGFX12PlusPackedMathFP32or64BitOperand for more
6814 // information.
6816 for (unsigned I = 0; I < 3; ++I) {
6818 legalizeOpWithMove(MI, VOP3Idx[I]);
6819 }
6820 }
6821}
6822
6825 const TargetRegisterClass *DstRC /*=nullptr*/) const {
// Builds, immediately before UseMI, an SGPR copy of SrcReg using
// V_READFIRSTLANE_B32 and returns the new register. The destination class is
// the SGPR equivalent of SrcReg's class, narrowed to the common subclass with
// DstRC when DstRC is given.
6826 const TargetRegisterClass *VRC = MRI.getRegClass(SrcReg);
6827 const TargetRegisterClass *SRC = RI.getEquivalentSGPRClass(VRC);
6828 if (DstRC)
6829 SRC = RI.getCommonSubClass(SRC, DstRC);
6830
6831 Register DstReg = MRI.createVirtualRegister(SRC);
// Number of 32-bit channels in the source register.
6832 unsigned SubRegs = RI.getRegSizeInBits(*VRC) / 32;
6833
// A source class containing AGPRs is first copied into the equivalent VGPR
// class; the readfirstlanes below then read from that copy.
6834 if (RI.hasAGPRs(VRC)) {
6835 VRC = RI.getEquivalentVGPRClass(VRC);
6836 Register NewSrcReg = MRI.createVirtualRegister(VRC);
6837 BuildMI(*UseMI.getParent(), UseMI, UseMI.getDebugLoc(),
6838 get(TargetOpcode::COPY), NewSrcReg)
6839 .addReg(SrcReg);
6840 SrcReg = NewSrcReg;
6841 }
6842
// 32-bit source: a single readfirstlane writes DstReg directly.
6843 if (SubRegs == 1) {
6844 BuildMI(*UseMI.getParent(), UseMI, UseMI.getDebugLoc(),
6845 get(AMDGPU::V_READFIRSTLANE_B32), DstReg)
6846 .addReg(SrcReg);
6847 return DstReg;
6848 }
6849
// Wider source: read each 32-bit channel into its own SGPR ...
6851 for (unsigned i = 0; i < SubRegs; ++i) {
6852 Register SGPR = MRI.createVirtualRegister(&AMDGPU::SGPR_32RegClass);
6853 BuildMI(*UseMI.getParent(), UseMI, UseMI.getDebugLoc(),
6854 get(AMDGPU::V_READFIRSTLANE_B32), SGPR)
6855 .addReg(SrcReg, {}, RI.getSubRegFromChannel(i));
6856 SRegs.push_back(SGPR);
6857 }
6858
// ... and recombine the pieces into DstReg with a REG_SEQUENCE.
6860 BuildMI(*UseMI.getParent(), UseMI, UseMI.getDebugLoc(),
6861 get(AMDGPU::REG_SEQUENCE), DstReg);
6862 for (unsigned i = 0; i < SubRegs; ++i) {
6863 MIB.addReg(SRegs[i]);
6864 MIB.addImm(RI.getSubRegFromChannel(i));
6865 }
6866 return DstReg;
6867}
6868
6870 MachineInstr &MI) const {
6871
6872 // If the pointer is stored in VGPRs, then we need to move it to
6873 // SGPRs using v_readfirstlane. This is safe because we only select
6874 // loads with uniform pointers to SMRD instructions so we know the
6875 // pointer value is uniform.
6876 MachineOperand *SBase = getNamedOperand(MI, AMDGPU::OpName::sbase);
6877 if (SBase && !RI.isSGPRClass(MRI.getRegClass(SBase->getReg()))) {
6878 Register SGPR = readlaneVGPRToSGPR(SBase->getReg(), MI, MRI);
6879 SBase->setReg(SGPR);
6880 }
// A non-SGPR soffset operand, when present, is handled the same way.
6881 MachineOperand *SOff = getNamedOperand(MI, AMDGPU::OpName::soffset);
6882 if (SOff && !RI.isSGPRReg(MRI, SOff->getReg())) {
6883 Register SGPR = readlaneVGPRToSGPR(SOff->getReg(), MI, MRI);
6884 SOff->setReg(SGPR);
6885 }
6886}
6887
6889 unsigned Opc = Inst.getOpcode();
// Rewrites Inst in place from its saddr form to the corresponding vaddr form
// when the saddr operand is not an SGPR. Returns true if the instruction was
// rewritten and false if it was left untouched.
6890 int OldSAddrIdx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::saddr);
6891 if (OldSAddrIdx < 0)
6892 return false;
6893
6894 assert(isSegmentSpecificFLAT(Inst) || (isFLAT(Inst) && ST.hasFlatGVSMode()));
6895
6896 int NewOpc = AMDGPU::getGlobalVaddrOp(Opc);
6897 if (NewOpc < 0)
6899 if (NewOpc < 0)
6900 return false;
6901
// Nothing to do when saddr already is an SGPR.
6902 MachineRegisterInfo &MRI = Inst.getMF()->getRegInfo();
6903 MachineOperand &SAddr = Inst.getOperand(OldSAddrIdx);
6904 if (RI.isSGPRReg(MRI, SAddr.getReg()))
6905 return false;
6906
6907 int NewVAddrIdx = AMDGPU::getNamedOperandIdx(NewOpc, AMDGPU::OpName::vaddr);
6908 if (NewVAddrIdx < 0)
6909 return false;
6910
6911 int OldVAddrIdx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::vaddr);
6912
6913 // Check vaddr, it shall be zero or absent.
// "Zero" means its unique def is a move-immediate of the constant 0.
6914 MachineInstr *VAddrDef = nullptr;
6915 if (OldVAddrIdx >= 0) {
6916 MachineOperand &VAddr = Inst.getOperand(OldVAddrIdx);
6917 VAddrDef = MRI.getUniqueVRegDef(VAddr.getReg());
6918 if (!VAddrDef || !VAddrDef->isMoveImmediate() ||
6919 !VAddrDef->getOperand(1).isImm() ||
6920 VAddrDef->getOperand(1).getImm() != 0)
6921 return false;
6922 }
6923
6924 const MCInstrDesc &NewDesc = get(NewOpc);
6925 Inst.setDesc(NewDesc);
6926
6927 // Callers expect iterator to be valid after this call, so modify the
6928 // instruction in place.
6929 if (OldVAddrIdx == NewVAddrIdx) {
6930 MachineOperand &NewVAddr = Inst.getOperand(NewVAddrIdx);
6931 // Clear use list from the old vaddr holding a zero register.
6932 MRI.removeRegOperandFromUseList(&NewVAddr);
6933 MRI.moveOperands(&NewVAddr, &SAddr, 1);
6934 Inst.removeOperand(OldSAddrIdx);
6935 // Update the use list with the pointer we have just moved from the saddr
6936 // to the vaddr position. Otherwise new vaddr will be missing from the use list.
6937 MRI.removeRegOperandFromUseList(&NewVAddr);
6938 MRI.addRegOperandToUseList(&NewVAddr);
6939 } else {
// The old saddr operand already sits at the new vaddr index, so only the old
// (zero) vaddr operand, if any, has to be removed.
6940 assert(OldSAddrIdx == NewVAddrIdx);
6941
6942 if (OldVAddrIdx >= 0) {
6943 int NewVDstIn = AMDGPU::getNamedOperandIdx(NewOpc,
6944 AMDGPU::OpName::vdst_in);
6945
6946 // removeOperand doesn't try to fixup tied operand indexes as it goes, so
6947 // it asserts. Untie the operands for now and retie them afterwards.
6948 if (NewVDstIn != -1) {
6949 int OldVDstIn = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::vdst_in);
6950 Inst.untieRegOperand(OldVDstIn);
6951 }
6952
6953 Inst.removeOperand(OldVAddrIdx);
6954
6955 if (NewVDstIn != -1) {
6956 int NewVDst = AMDGPU::getNamedOperandIdx(NewOpc, AMDGPU::OpName::vdst);
6957 Inst.tieOperands(NewVDst, NewVDstIn);
6958 }
6959 }
6960 }
6961
// Erase the zero-materializing def once it has no non-debug uses left.
6962 if (VAddrDef && MRI.use_nodbg_empty(VAddrDef->getOperand(0).getReg()))
6963 VAddrDef->eraseFromParent();
6964
6965 return true;
6966}
6967
6968 // FIXME: Remove this when SelectionDAG is obsoleted.
6970 MachineInstr &MI) const {
// Only segment-specific FLAT instructions, or any instruction when the
// subtarget reports hasFlatGVSMode(), are handled here.
6971 if (!isSegmentSpecificFLAT(MI) && !ST.hasFlatGVSMode())
6972 return;
6973
6974 // Fixup SGPR operands in VGPRs. We only select these when the DAG divergence
6975 // thinks they are uniform, so a readfirstlane should be valid.
6976 MachineOperand *SAddr = getNamedOperand(MI, AMDGPU::OpName::saddr);
6977 if (!SAddr || RI.isSGPRClass(MRI.getRegClass(SAddr->getReg())))
6978 return;
6979
6981 return;
6982
// Register class the instruction descriptor declares for the saddr operand;
// the readfirstlane result is constrained to it.
6983 const TargetRegisterClass *DeclaredRC =
6984 getRegClass(MI.getDesc(), SAddr->getOperandNo());
6985
6986 Register ToSGPR = readlaneVGPRToSGPR(SAddr->getReg(), MI, MRI, DeclaredRC);
6987 SAddr->setReg(ToSGPR);
6988}
6989
6992 const TargetRegisterClass *DstRC,
6995 const DebugLoc &DL) const {
// Makes Op use a register of class DstRC: unless Op already has exactly that
// class, a COPY into a fresh DstRC register is inserted at I in InsertMBB and
// Op is rewritten to use the copy.
6996 Register OpReg = Op.getReg();
6997 unsigned OpSubReg = Op.getSubReg();
6998
6999 const TargetRegisterClass *OpRC = RI.getSubClassWithSubReg(
7000 RI.getRegClassForReg(MRI, OpReg), OpSubReg);
7001
7002 // Check if operand is already the correct register class.
7003 if (DstRC == OpRC)
7004 return;
7005
7006 Register DstReg = MRI.createVirtualRegister(DstRC);
7007 auto Copy =
7008 BuildMI(InsertMBB, I, DL, get(AMDGPU::COPY), DstReg).addReg(OpReg);
7009 Op.setReg(DstReg);
7010
7011 MachineInstr *Def = MRI.getVRegDef(OpReg);
7012 if (!Def)
7013 return;
7014
7015 // Try to eliminate the copy if it is copying an immediate value.
7016 if (Def->isMoveImmediate() && DstRC != &AMDGPU::VReg_1RegClass)
7017 foldImmediate(*Copy, *Def, OpReg, &MRI);
7018
// Look through chains of virtual-register copies to find out whether the
// value originates from an IMPLICIT_DEF. The walk stops at a copy from a
// physical register or when no unique def exists.
7019 bool ImpDef = Def->isImplicitDef();
7020 while (!ImpDef && Def && Def->isCopy()) {
7021 if (Def->getOperand(1).getReg().isPhysical())
7022 break;
7023 Def = MRI.getUniqueVRegDef(Def->getOperand(1).getReg());
7024 ImpDef = Def && Def->isImplicitDef();
7025 }
// A copy into a non-SGPR class gets an implicit EXEC use, unless it already
// reads EXEC or its source is an implicit def.
7026 if (!RI.isSGPRClass(DstRC) && !Copy->readsRegister(AMDGPU::EXEC, &RI) &&
7027 !ImpDef)
7028 Copy.addReg(AMDGPU::EXEC, RegState::Implicit);
7029}
7030
7031 // Emit the actual waterfall loop, executing the wrapped instruction for each
7032 // unique value of \p ScalarOps across all lanes. In the best case we execute 1
7033 // iteration, in the worst case we execute 64 (once per lane).
7036 MachineBasicBlock &LoopBB, MachineBasicBlock &BodyBB, const DebugLoc &DL,
7037 ArrayRef<MachineOperand *> ScalarOps, ArrayRef<Register> PhySGPRs = {}) {
7038 MachineFunction &MF = *LoopBB.getParent();
7040 const SIRegisterInfo *TRI = ST.getRegisterInfo();
7042 const auto *BoolXExecRC = TRI->getWaveMaskRegClass();
7043
7044 // Emit v_cmpx_eq and s_andn2_wrexec when both instructions are
7045 // available. Otherwise, use the previous pattern of v_cmp_eq,
7046 // s_and_saveexec, and s_xor.
7047 bool UseNewExecInstructions =
7048 ST.hasNoSdstCMPX() && TII.pseudoToMCOpcode(LMC.AndN2WrExecOpc) != -1;
7049
7051 Register CondReg;
7052
7053 Register PhiExec;
7054 Register NewExec;
7055
// With the new pattern, the exec mask is carried around the loop in a PHI:
// its value is a copy of EXEC made at the end of PredBB on entry, and NewExec
// (defined at the end of BodyBB) on the back edge.
7056 if (UseNewExecInstructions) {
7057 PhiExec = MRI.createVirtualRegister(BoolXExecRC);
7058 NewExec = MRI.createVirtualRegister(BoolXExecRC);
7059 Register InitExec = MRI.createVirtualRegister(BoolXExecRC);
7060 BuildMI(PredBB, PredBB.end(), DL, TII.get(LMC.MovOpc), InitExec)
7061 .addReg(LMC.ExecReg);
7062
7063 BuildMI(LoopBB, I, DL, TII.get(TargetOpcode::PHI), PhiExec)
7064 .addReg(InitExec)
7065 .addMBB(&PredBB)
7066 .addReg(NewExec)
7067 .addMBB(&BodyBB);
7068 }
7069
7070 // Placement of v_cmpx instructions (when index is longer than 64 bits)
7071 // involves a trade-off between register pressure and latency:
7072 // (a) Deferring all v_cmpx after all v_readfirstlane may increase
7073 // register pressure because arguments and results of all
7074 // v_readfirstlane instructions must stay live until deferred v_cmpx use them.
7075 // (b) Interleaving v_cmpx with v_readfirstlanes may reduce live ranges and
7076 // increase latency by placing v_readfirstlane instructions
7077 // immediately before v_cmpx instruction that directly depend on it.
7078 //
7079 // Emitting interleaved v_cmpx and v_readfirstlane requires
7080 // block splitting because v_cmpx changes EXEC mask and therefore for safety
7081 // v_cmpx needs to be treated as terminator until after register allocation
7082 // (spill placement) and instruction reordering.
7083 //
7084 // Current implementation defers v_cmpx and leaves other instruction
7085 // scheduling decisions to later passes, where register pressure is known or
7086 // easier to approximate.
7087 // Non-terminators (V_READFIRSTLANE and REG_SEQUENCE) are inserted before I;
7088 // v_cmpx instructions are inserted at the end of LoopBB.
7089 // After the first v_cmpx is emitted, I is updated to point to it
7090 // so subsequent non-terminators are inserted before all v_cmpx instructions.
7091 for (auto [Idx, ScalarOp] : enumerate(ScalarOps)) {
7092 unsigned RegSize = TRI->getRegSizeInBits(ScalarOp->getReg(), MRI);
7093 unsigned NumSubRegs = RegSize / 32;
7094 Register VScalarOp = ScalarOp->getReg();
7095
// Register class that V_READFIRSTLANE_B32 declares for its source operand.
7096 const TargetRegisterClass *RFLSrcRC =
7097 TII.getRegClass(TII.get(AMDGPU::V_READFIRSTLANE_B32), 1);
7098
// 32-bit operand: one readfirstlane and one 32-bit compare.
7099 if (NumSubRegs == 1) {
// If the operand's class is not already contained in the readfirstlane
// source class, copy it into the common subclass first.
7100 const TargetRegisterClass *VScalarOpRC = MRI.getRegClass(VScalarOp);
7101 if (const TargetRegisterClass *Common =
7102 TRI->getCommonSubClass(VScalarOpRC, RFLSrcRC);
7103 Common != VScalarOpRC) {
7104 Register VRReg = MRI.createVirtualRegister(Common);
7105 BuildMI(LoopBB, I, DL, TII.get(AMDGPU::COPY), VRReg).addReg(VScalarOp);
7106 VScalarOp = VRReg;
7107 }
7108 Register CurReg = MRI.createVirtualRegister(&AMDGPU::SReg_32_XM0RegClass);
7109
7110 BuildMI(LoopBB, I, DL, TII.get(AMDGPU::V_READFIRSTLANE_B32), CurReg)
7111 .addReg(VScalarOp);
7112
7113 if (UseNewExecInstructions) {
7114 auto CmpxMI = BuildMI(LoopBB, LoopBB.end(), DL,
7115 TII.get(AMDGPU::V_CMPX_EQ_U32_nosdst_e32_term))
7116 .addReg(CurReg)
7117 .addReg(VScalarOp);
// First v_cmpx emitted: from now on insert non-terminators in front of it.
7118 if (I == LoopBB.end())
7119 I = CmpxMI.getInstr()->getIterator();
7120 } else {
7121 Register NewCondReg = MRI.createVirtualRegister(BoolXExecRC);
7122
7123 BuildMI(LoopBB, I, DL, TII.get(AMDGPU::V_CMP_EQ_U32_e64), NewCondReg)
7124 .addReg(CurReg)
7125 .addReg(VScalarOp);
7126
7127 // Combine the comparison results with AND.
7128 if (!CondReg) { // First.
7129 CondReg = NewCondReg;
7130 } else { // If not the first, we create an AND.
7131 Register AndReg = MRI.createVirtualRegister(BoolXExecRC);
7132 BuildMI(LoopBB, I, DL, TII.get(LMC.AndOpc), AndReg)
7133 .addReg(CondReg)
7134 .addReg(NewCondReg);
7135 CondReg = AndReg;
7136 }
7137 }
7138
7139 // Update ScalarOp operand to use the SGPR ScalarOp.
// When a valid physical SGPR was supplied for this operand, copy the value
// into it right before the using instruction and use that register instead.
7140 if (PhySGPRs.empty() || !PhySGPRs[Idx].isValid())
7141 ScalarOp->setReg(CurReg);
7142 else {
7143 // Insert into the same block of use
7144 BuildMI(*ScalarOp->getParent()->getParent(), ScalarOp->getParent(), DL,
7145 TII.get(AMDGPU::COPY), PhySGPRs[Idx])
7146 .addReg(CurReg);
7147 ScalarOp->setReg(PhySGPRs[Idx]);
7148 }
7149 ScalarOp->setIsKill();
7150 } else {
// Wider operand: read it 32 bits at a time, compare it 64 bits at a time.
7151 SmallVector<Register, 8> ReadlanePieces;
7152 RegState VScalarOpUndef = getUndefRegState(ScalarOp->isUndef());
7153 assert(NumSubRegs % 2 == 0 && NumSubRegs <= 32 &&
7154 "Unhandled register size");
7155
// NOTE: this Idx shadows the enumerate() index of the enclosing loop; the
// outer Idx is visible again once this loop ends.
7156 for (unsigned Idx = 0; Idx < NumSubRegs; Idx += 2) {
7157 Register CurRegLo =
7158 MRI.createVirtualRegister(&AMDGPU::SReg_32_XM0RegClass);
7159 Register CurRegHi =
7160 MRI.createVirtualRegister(&AMDGPU::SReg_32_XM0RegClass);
7161
7162 // Read the next variant <- also loop target.
7163 BuildMI(LoopBB, I, DL, TII.get(AMDGPU::V_READFIRSTLANE_B32), CurRegLo)
7164 .addReg(VScalarOp, VScalarOpUndef, TRI->getSubRegFromChannel(Idx));
7165
7166 // Read the next variant <- also loop target.
7167 BuildMI(LoopBB, I, DL, TII.get(AMDGPU::V_READFIRSTLANE_B32), CurRegHi)
7168 .addReg(VScalarOp, VScalarOpUndef,
7169 TRI->getSubRegFromChannel(Idx + 1));
7170
7171 ReadlanePieces.push_back(CurRegLo);
7172 ReadlanePieces.push_back(CurRegHi);
7173
7174 // Comparison is to be done as 64-bit.
7175 Register CurReg = MRI.createVirtualRegister(&AMDGPU::SGPR_64RegClass);
7176 BuildMI(LoopBB, I, DL, TII.get(AMDGPU::REG_SEQUENCE), CurReg)
7177 .addReg(CurRegLo)
7178 .addImm(AMDGPU::sub0)
7179 .addReg(CurRegHi)
7180 .addImm(AMDGPU::sub1);
7181
// A 64-bit operand is compared whole (no subregister index); wider operands
// are compared one 64-bit slice at a time.
7182 unsigned SubReg =
7183 NumSubRegs <= 2 ? 0 : TRI->getSubRegFromChannel(Idx, 2);
7184
7185 if (UseNewExecInstructions) {
7186 auto CmpxMI = BuildMI(LoopBB, LoopBB.end(), DL,
7187 TII.get(AMDGPU::V_CMPX_EQ_U64_nosdst_e32_term))
7188 .addReg(CurReg)
7189 .addReg(VScalarOp, VScalarOpUndef, SubReg);
7190 if (I == LoopBB.end())
7191 I = CmpxMI.getInstr()->getIterator();
7192 } else {
7193 Register NewCondReg = MRI.createVirtualRegister(BoolXExecRC);
7194 BuildMI(LoopBB, I, DL, TII.get(AMDGPU::V_CMP_EQ_U64_e64), NewCondReg)
7195 .addReg(CurReg)
7196 .addReg(VScalarOp, VScalarOpUndef, SubReg);
7197
7198 // Combine the comparison results with AND.
7199 if (!CondReg) { // First.
7200 CondReg = NewCondReg;
7201 } else { // If not the first, we create an AND.
7202 Register AndReg = MRI.createVirtualRegister(BoolXExecRC);
7203 BuildMI(LoopBB, I, DL, TII.get(LMC.AndOpc), AndReg)
7204 .addReg(CondReg)
7205 .addReg(NewCondReg);
7206 CondReg = AndReg;
7207 }
7208 }
7209 } // End for loop.
7210
7211 const auto *SScalarOpRC =
7212 TRI->getEquivalentSGPRClass(MRI.getRegClass(VScalarOp));
7213 Register SScalarOp = MRI.createVirtualRegister(SScalarOpRC);
7214
7215 // Build scalar ScalarOp.
7216 auto Merge =
7217 BuildMI(LoopBB, I, DL, TII.get(AMDGPU::REG_SEQUENCE), SScalarOp);
7218 unsigned Channel = 0;
7219 for (Register Piece : ReadlanePieces) {
7220 Merge.addReg(Piece).addImm(TRI->getSubRegFromChannel(Channel++));
7221 }
7222
7223 // Update ScalarOp operand to use the SGPR ScalarOp.
7224 if (PhySGPRs.empty() || !PhySGPRs[Idx].isValid())
7225 ScalarOp->setReg(SScalarOp);
7226 else {
7227 BuildMI(*ScalarOp->getParent()->getParent(), ScalarOp->getParent(), DL,
7228 TII.get(AMDGPU::COPY), PhySGPRs[Idx])
7229 .addReg(SScalarOp);
7230 ScalarOp->setReg(PhySGPRs[Idx]);
7231 }
7232 ScalarOp->setIsKill();
7233 }
7234 }
7235
7236 // Instructions AndSaveExecOpc and AndN2WrExecOpc that modify EXEC mask
7237 // should have isTerminator=1 but terminators that define
7238 // virtual registers are not supported.
7239 Register SaveExec;
7240 if (!UseNewExecInstructions) {
7241 SaveExec = MRI.createVirtualRegister(BoolXExecRC);
7242 MRI.setSimpleHint(SaveExec, CondReg);
7243
7244 // Update EXEC to matching lanes, saving original to SaveExec.
7245 BuildMI(LoopBB, I, DL, TII.get(LMC.AndSaveExecOpc), SaveExec)
7246 .addReg(CondReg, RegState::Kill);
7247 }
7248
7249 // The original instruction is here; we insert the terminators after it.
7250 I = BodyBB.end();
7251
7252 if (UseNewExecInstructions) {
7253 MRI.setSimpleHint(NewExec, PhiExec);
7254 BuildMI(BodyBB, I, DL, TII.get(LMC.AndN2WrExecOpc), NewExec)
7255 .addReg(PhiExec);
7256 } else {
7257 // Update EXEC, switch all done bits to 0 and all todo bits to 1.
7258 BuildMI(BodyBB, I, DL, TII.get(LMC.XorTermOpc), LMC.ExecReg)
7259 .addReg(LMC.ExecReg)
7260 .addReg(SaveExec);
7261 }
7262
// Loop terminator with LoopBB as its target operand.
7263 BuildMI(BodyBB, I, DL, TII.get(AMDGPU::SI_WATERFALL_LOOP)).addMBB(&LoopBB);
7264}
7265
7266 // Build a waterfall loop around \p MI, replacing the VGPR \p ScalarOp register
7267 // with SGPRs by iterating over all unique values across all lanes.
7268 // Returns the loop basic block that now contains \p MI.
7269static MachineBasicBlock *
7273 MachineBasicBlock::iterator Begin = nullptr,
7274 MachineBasicBlock::iterator End = nullptr,
7275 ArrayRef<Register> PhySGPRs = {}) {
7276 assert((PhySGPRs.empty() || PhySGPRs.size() == ScalarOps.size()) &&
7277 "Physical SGPRs must be empty or match the number of scalar operands");
7278 MachineBasicBlock &MBB = *MI.getParent();
7279 MachineFunction &MF = *MBB.getParent();
7281 const SIRegisterInfo *TRI = ST.getRegisterInfo();
7282 MachineRegisterInfo &MRI = MF.getRegInfo();
// When Begin/End are not supplied, the wrapped range is just MI itself:
// Begin points at MI and End at the instruction following it.
7283 if (!Begin.isValid())
7284 Begin = &MI;
7285 if (!End.isValid()) {
7286 End = &MI;
7287 ++End;
7288 }
7289 const DebugLoc &DL = MI.getDebugLoc();
7291 const auto *BoolXExecRC = TRI->getWaveMaskRegClass();
7292
7293 // Save SCC. Waterfall Loop may overwrite SCC.
7294 Register SaveSCCReg;
7295
7296 // FIXME: We should maintain SCC liveness while doing the FixSGPRCopies walk
7297 // rather than unlimited scan everywhere
7298 bool SCCNotDead =
7299 MBB.computeRegisterLiveness(TRI, AMDGPU::SCC, MI,
7300 std::numeric_limits<unsigned>::max()) !=
// Materialize SCC as 1 or 0 in an SGPR; it is restored in RemainderBB below.
7302 if (SCCNotDead) {
7303 SaveSCCReg = MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass);
7304 BuildMI(MBB, Begin, DL, TII.get(AMDGPU::S_CSELECT_B32), SaveSCCReg)
7305 .addImm(1)
7306 .addImm(0);
7307 }
7308
7309 Register SaveExec = MRI.createVirtualRegister(BoolXExecRC);
7310
7311 // Save the EXEC mask
7312 BuildMI(MBB, Begin, DL, TII.get(LMC.MovOpc), SaveExec).addReg(LMC.ExecReg);
7313
7314 // Killed uses in the instruction we are waterfalling around will be
7315 // incorrect due to the added control-flow.
7317 ++AfterMI;
7318 for (auto I = Begin; I != AfterMI; I++) {
7319 for (auto &MO : I->all_uses())
7320 MRI.clearKillFlags(MO.getReg());
7321 }
7322
7323 // To insert the loop we need to split the block. Move everything after this
7324 // point to a new block, and insert a new empty block between the two.
7327 MachineBasicBlock *RemainderBB = MF.CreateMachineBasicBlock();
7329 ++MBBI;
7330
7331 MF.insert(MBBI, LoopBB);
7332 MF.insert(MBBI, BodyBB);
7333 MF.insert(MBBI, RemainderBB);
7334
// Control flow: MBB -> LoopBB -> BodyBB -> {LoopBB, RemainderBB}.
7335 LoopBB->addSuccessor(BodyBB);
7336 BodyBB->addSuccessor(LoopBB);
7337 BodyBB->addSuccessor(RemainderBB);
7338
7339 // Move Begin to MI to the BodyBB, and the remainder of the block to
7340 // RemainderBB.
7341 RemainderBB->transferSuccessorsAndUpdatePHIs(&MBB);
7342 RemainderBB->splice(RemainderBB->begin(), &MBB, End, MBB.end());
7343 BodyBB->splice(BodyBB->begin(), &MBB, Begin, MBB.end());
7344
7345 MBB.addSuccessor(LoopBB);
7346
7347 // Update dominators. We know that MBB immediately dominates LoopBB, that
7348 // LoopBB immediately dominates BodyBB, and BodyBB immediately dominates
7349 // RemainderBB. RemainderBB immediately dominates all of the successors
7350 // transferred to it from MBB that MBB used to properly dominate.
7351 if (MDT) {
7352 MDT->addNewBlock(LoopBB, &MBB);
7353 MDT->addNewBlock(BodyBB, LoopBB);
7354 MDT->addNewBlock(RemainderBB, BodyBB);
7355 for (auto &Succ : RemainderBB->successors()) {
7356 if (MDT->properlyDominates(&MBB, Succ)) {
7357 MDT->changeImmediateDominator(Succ, RemainderBB);
7358 }
7359 }
7360 }
7361
7362 emitLoadScalarOpsFromVGPRLoop(TII, MRI, MBB, *LoopBB, *BodyBB, DL, ScalarOps,
7363 PhySGPRs);
7364
// Both restores are inserted before the first instruction of RemainderBB, so
// the SCC restore ends up ahead of the EXEC restore.
7365 MachineBasicBlock::iterator First = RemainderBB->begin();
7366 // Restore SCC
7367 if (SCCNotDead) {
7368 BuildMI(*RemainderBB, First, DL, TII.get(AMDGPU::S_CMP_LG_U32))
7369 .addReg(SaveSCCReg, RegState::Kill)
7370 .addImm(0);
7371 }
7372
7373 // Restore the EXEC mask
7374 BuildMI(*RemainderBB, First, DL, TII.get(LMC.MovOpc), LMC.ExecReg)
7375 .addReg(SaveExec);
7376 return BodyBB;
7377}
7378
7379 // Extract the pointer from Rsrc and return it together with a replacement
// Rsrc whose base pointer (sub0_sub1) is zero and whose upper two dwords hold
// the default resource data format.
7380static std::tuple<unsigned, unsigned>
7382 MachineBasicBlock &MBB = *MI.getParent();
7383 MachineFunction &MF = *MBB.getParent();
7384 MachineRegisterInfo &MRI = MF.getRegInfo();
7385
7386 // Extract the ptr from the resource descriptor.
7387 unsigned RsrcPtr =
7388 TII.buildExtractSubReg(MI, MRI, Rsrc, &AMDGPU::VReg_128RegClass,
7389 AMDGPU::sub0_sub1, &AMDGPU::VReg_64RegClass);
7390
7391 // Create an empty resource descriptor
7392 Register Zero64 = MRI.createVirtualRegister(&AMDGPU::SReg_64RegClass);
7393 Register SRsrcFormatLo = MRI.createVirtualRegister(&AMDGPU::SGPR_32RegClass);
7394 Register SRsrcFormatHi = MRI.createVirtualRegister(&AMDGPU::SGPR_32RegClass);
7395 Register NewSRsrc = MRI.createVirtualRegister(&AMDGPU::SGPR_128RegClass);
7396 uint64_t RsrcDataFormat = TII.getDefaultRsrcDataFormat();
7397
7398 // Zero64 = 0
7399 BuildMI(MBB, MI, MI.getDebugLoc(), TII.get(AMDGPU::S_MOV_B64), Zero64)
7400 .addImm(0);
7401
7402 // SRsrcFormatLo = RSRC_DATA_FORMAT{31-0}
7403 BuildMI(MBB, MI, MI.getDebugLoc(), TII.get(AMDGPU::S_MOV_B32), SRsrcFormatLo)
7404 .addImm(Lo_32(RsrcDataFormat));
7405
7406 // SRsrcFormatHi = RSRC_DATA_FORMAT{63-32}
7407 BuildMI(MBB, MI, MI.getDebugLoc(), TII.get(AMDGPU::S_MOV_B32), SRsrcFormatHi)
7408 .addImm(Hi_32(RsrcDataFormat));
7409
7410 // NewSRsrc = {Zero64, SRsrcFormat}
7411 BuildMI(MBB, MI, MI.getDebugLoc(), TII.get(AMDGPU::REG_SEQUENCE), NewSRsrc)
7412 .addReg(Zero64)
7413 .addImm(AMDGPU::sub0_sub1)
7414 .addReg(SRsrcFormatLo)
7415 .addImm(AMDGPU::sub2)
7416 .addReg(SRsrcFormatHi)
7417 .addImm(AMDGPU::sub3);
7418
// First element: the extracted 64-bit pointer; second: the new descriptor.
7419 return std::tuple(RsrcPtr, NewSRsrc);
7420}
7421
7424 MachineDominatorTree *MDT) const {
7425 MachineFunction &MF = *MI.getMF();
7426 MachineRegisterInfo &MRI = MF.getRegInfo();
7427 MachineBasicBlock *CreatedBB = nullptr;
7428
7429 // Legalize VOP2
7430 if (isVOP2(MI) || isVOPC(MI)) {
7432 return CreatedBB;
7433 }
7434
7435 // Legalize VOP3
7436 if (isVOP3(MI)) {
7438 return CreatedBB;
7439 }
7440
7441 // Legalize SMRD
7442 if (isSMRD(MI)) {
7444 return CreatedBB;
7445 }
7446
7447 // Legalize FLAT
7448 if (isFLAT(MI)) {
7450 return CreatedBB;
7451 }
7452
7453 // Legalize PHI
7454 // The register class of the operands must be the same type as the register
7455 // class of the output.
7456 if (MI.getOpcode() == AMDGPU::PHI) {
7457 const TargetRegisterClass *VRC = getOpRegClass(MI, 0);
7458 assert(!RI.isSGPRClass(VRC));
7459
7460 // Update all the operands so they have the same type.
7461 for (unsigned I = 1, E = MI.getNumOperands(); I != E; I += 2) {
7462 MachineOperand &Op = MI.getOperand(I);
7463 if (!Op.isReg() || !Op.getReg().isVirtual())
7464 continue;
7465
7466 // MI is a PHI instruction.
7467 MachineBasicBlock *InsertBB = MI.getOperand(I + 1).getMBB();
7469
7470 // Avoid creating no-op copies with the same src and dst reg class. These
7471 // confuse some of the machine passes.
7472 legalizeGenericOperand(*InsertBB, Insert, VRC, Op, MRI, MI.getDebugLoc());
7473 }
7474 }
7475
7476 // REG_SEQUENCE doesn't really require operand legalization, but if one has a
7477 // VGPR dest type and SGPR sources, insert copies so all operands are
7478 // VGPRs. This seems to help operand folding / the register coalescer.
7479 if (MI.getOpcode() == AMDGPU::REG_SEQUENCE) {
7480 MachineBasicBlock *MBB = MI.getParent();
7481 const TargetRegisterClass *DstRC = getOpRegClass(MI, 0);
7482 if (RI.hasVGPRs(DstRC)) {
7483 // Update all the operands so they are VGPR register classes. These may
7484 // not be the same register class because REG_SEQUENCE supports mixing
7485 // subregister index types e.g. sub0_sub1 + sub2 + sub3
7486 for (unsigned I = 1, E = MI.getNumOperands(); I != E; I += 2) {
7487 MachineOperand &Op = MI.getOperand(I);
7488 if (!Op.isReg() || !Op.getReg().isVirtual())
7489 continue;
7490
7491 const TargetRegisterClass *OpRC = MRI.getRegClass(Op.getReg());
7492 const TargetRegisterClass *VRC = RI.getEquivalentVGPRClass(OpRC);
7493 if (VRC == OpRC)
7494 continue;
7495
7496 legalizeGenericOperand(*MBB, MI, VRC, Op, MRI, MI.getDebugLoc());
7497 Op.setIsKill();
7498 }
7499 }
7500
7501 return CreatedBB;
7502 }
7503
7504 // Legalize INSERT_SUBREG
7505 // src0 must have the same register class as dst
7506 if (MI.getOpcode() == AMDGPU::INSERT_SUBREG) {
7507 Register Dst = MI.getOperand(0).getReg();
7508 Register Src0 = MI.getOperand(1).getReg();
7509 const TargetRegisterClass *DstRC = MRI.getRegClass(Dst);
7510 const TargetRegisterClass *Src0RC = MRI.getRegClass(Src0);
7511 if (DstRC != Src0RC) {
7512 MachineBasicBlock *MBB = MI.getParent();
7513 MachineOperand &Op = MI.getOperand(1);
7514 legalizeGenericOperand(*MBB, MI, DstRC, Op, MRI, MI.getDebugLoc());
7515 }
7516 return CreatedBB;
7517 }
7518
7519 // Legalize SI_INIT_M0
7520 if (MI.getOpcode() == AMDGPU::SI_INIT_M0) {
7521 MachineOperand &Src = MI.getOperand(0);
7522 if (Src.isReg() && RI.hasVectorRegisters(MRI.getRegClass(Src.getReg())))
7523 Src.setReg(readlaneVGPRToSGPR(Src.getReg(), MI, MRI));
7524 return CreatedBB;
7525 }
7526
7527 // Legalize S_BITREPLICATE, S_QUADMASK and S_WQM
7528 if (MI.getOpcode() == AMDGPU::S_BITREPLICATE_B64_B32 ||
7529 MI.getOpcode() == AMDGPU::S_QUADMASK_B32 ||
7530 MI.getOpcode() == AMDGPU::S_QUADMASK_B64 ||
7531 MI.getOpcode() == AMDGPU::S_WQM_B32 ||
7532 MI.getOpcode() == AMDGPU::S_WQM_B64 ||
7533 MI.getOpcode() == AMDGPU::S_INVERSE_BALLOT_U32 ||
7534 MI.getOpcode() == AMDGPU::S_INVERSE_BALLOT_U64) {
7535 MachineOperand &Src = MI.getOperand(1);
7536 if (Src.isReg() && RI.hasVectorRegisters(MRI.getRegClass(Src.getReg())))
7537 Src.setReg(readlaneVGPRToSGPR(Src.getReg(), MI, MRI));
7538 return CreatedBB;
7539 }
7540
7541 // Legalize MIMG/VIMAGE/VSAMPLE and MUBUF/MTBUF for shaders.
7542 //
7543 // Shaders only generate MUBUF/MTBUF instructions via intrinsics or via
7544 // scratch memory access. In both cases, the legalization never involves
7545 // conversion to the addr64 form.
7547 (isMUBUF(MI) || isMTBUF(MI)))) {
7548 AMDGPU::OpName RSrcOpName = (isVIMAGE(MI) || isVSAMPLE(MI))
7549 ? AMDGPU::OpName::rsrc
7550 : AMDGPU::OpName::srsrc;
7551 MachineOperand *SRsrc = getNamedOperand(MI, RSrcOpName);
7552 if (SRsrc && !RI.isSGPRClass(MRI.getRegClass(SRsrc->getReg())))
7553 CreatedBB = generateWaterFallLoop(*this, MI, {SRsrc}, MDT);
7554
7555 AMDGPU::OpName SampOpName =
7556 isMIMG(MI) ? AMDGPU::OpName::ssamp : AMDGPU::OpName::samp;
7557 MachineOperand *SSamp = getNamedOperand(MI, SampOpName);
7558 if (SSamp && !RI.isSGPRClass(MRI.getRegClass(SSamp->getReg())))
7559 CreatedBB = generateWaterFallLoop(*this, MI, {SSamp}, MDT);
7560
7561 return CreatedBB;
7562 }
7563
7564 // Legalize SI_CALL
7565 if (MI.getOpcode() == AMDGPU::SI_CALL_ISEL) {
7566 MachineOperand *Dest = &MI.getOperand(0);
7567 if (!RI.isSGPRClass(MRI.getRegClass(Dest->getReg()))) {
7568 createWaterFallForSiCall(&MI, MDT, {Dest});
7569 }
7570 }
7571
7572 // Legalize s_sleep_var.
7573 if (MI.getOpcode() == AMDGPU::S_SLEEP_VAR) {
7574 const DebugLoc &DL = MI.getDebugLoc();
7575 Register Reg = MRI.createVirtualRegister(&AMDGPU::SReg_32_XM0RegClass);
7576 int Src0Idx =
7577 AMDGPU::getNamedOperandIdx(MI.getOpcode(), AMDGPU::OpName::src0);
7578 MachineOperand &Src0 = MI.getOperand(Src0Idx);
7579 BuildMI(*MI.getParent(), MI, DL, get(AMDGPU::V_READFIRSTLANE_B32), Reg)
7580 .add(Src0);
7581 Src0.ChangeToRegister(Reg, false);
7582 return nullptr;
7583 }
7584
7585 // Legalize TENSOR_LOAD_TO_LDS_d2/_d4, TENSOR_STORE_FROM_LDS_d2/_d4. All their
7586 // operands are scalar.
7587 if (MI.getOpcode() == AMDGPU::TENSOR_LOAD_TO_LDS_d2 ||
7588 MI.getOpcode() == AMDGPU::TENSOR_LOAD_TO_LDS_d4 ||
7589 MI.getOpcode() == AMDGPU::TENSOR_STORE_FROM_LDS_d2 ||
7590 MI.getOpcode() == AMDGPU::TENSOR_STORE_FROM_LDS_d4) {
7591 for (MachineOperand &Src : MI.explicit_operands()) {
7592 if (Src.isReg() && RI.hasVectorRegisters(MRI.getRegClass(Src.getReg())))
7593 Src.setReg(readlaneVGPRToSGPR(Src.getReg(), MI, MRI));
7594 }
7595 return CreatedBB;
7596 }
7597
7598 // Legalize MUBUF instructions.
7599 bool isSoffsetLegal = true;
7600 int SoffsetIdx =
7601 AMDGPU::getNamedOperandIdx(MI.getOpcode(), AMDGPU::OpName::soffset);
7602 if (SoffsetIdx != -1) {
7603 MachineOperand *Soffset = &MI.getOperand(SoffsetIdx);
7604 if (Soffset->isReg() && Soffset->getReg().isVirtual() &&
7605 !RI.isSGPRClass(MRI.getRegClass(Soffset->getReg()))) {
7606 isSoffsetLegal = false;
7607 }
7608 }
7609
7610 bool isRsrcLegal = true;
7611 int RsrcIdx =
7612 AMDGPU::getNamedOperandIdx(MI.getOpcode(), AMDGPU::OpName::srsrc);
7613 if (RsrcIdx != -1) {
7614 MachineOperand *Rsrc = &MI.getOperand(RsrcIdx);
7615 if (Rsrc->isReg() && !RI.isSGPRReg(MRI, Rsrc->getReg()))
7616 isRsrcLegal = false;
7617 }
7618
7619 // The operands are legal.
7620 if (isRsrcLegal && isSoffsetLegal)
7621 return CreatedBB;
7622
7623 if (!isRsrcLegal) {
7624 // Legalize a VGPR Rsrc
7625 //
7626 // If the instruction is _ADDR64, we can avoid a waterfall by extracting
7627 // the base pointer from the VGPR Rsrc, adding it to the VAddr, then using
7628 // a zero-value SRsrc.
7629 //
7630 // If the instruction is _OFFSET (both idxen and offen disabled), and we
7631 // support ADDR64 instructions, we can convert to ADDR64 and do the same as
7632 // above.
7633 //
7634 // Otherwise we are on non-ADDR64 hardware, and/or we have
7635 // idxen/offen/bothen and we fall back to a waterfall loop.
7636
7637 MachineOperand *Rsrc = &MI.getOperand(RsrcIdx);
7638 MachineBasicBlock &MBB = *MI.getParent();
7639
7640 MachineOperand *VAddr = getNamedOperand(MI, AMDGPU::OpName::vaddr);
7641 if (VAddr && AMDGPU::getIfAddr64Inst(MI.getOpcode()) != -1) {
7642 // This is already an ADDR64 instruction so we need to add the pointer
7643 // extracted from the resource descriptor to the current value of VAddr.
7644 Register NewVAddrLo = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
7645 Register NewVAddrHi = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
7646 Register NewVAddr = MRI.createVirtualRegister(&AMDGPU::VReg_64RegClass);
7647
7648 const auto *BoolXExecRC = RI.getWaveMaskRegClass();
7649 Register CondReg0 = MRI.createVirtualRegister(BoolXExecRC);
7650 Register CondReg1 = MRI.createVirtualRegister(BoolXExecRC);
7651
7652 unsigned RsrcPtr, NewSRsrc;
7653 std::tie(RsrcPtr, NewSRsrc) = extractRsrcPtr(*this, MI, *Rsrc);
7654
7655 // NewVaddrLo = RsrcPtr:sub0 + VAddr:sub0
7656 const DebugLoc &DL = MI.getDebugLoc();
7657 BuildMI(MBB, MI, DL, get(AMDGPU::V_ADD_CO_U32_e64), NewVAddrLo)
7658 .addDef(CondReg0)
7659 .addReg(RsrcPtr, {}, AMDGPU::sub0)
7660 .addReg(VAddr->getReg(), {}, AMDGPU::sub0)
7661 .addImm(0);
7662
7663 // NewVaddrHi = RsrcPtr:sub1 + VAddr:sub1
7664 BuildMI(MBB, MI, DL, get(AMDGPU::V_ADDC_U32_e64), NewVAddrHi)
7665 .addDef(CondReg1, RegState::Dead)
7666 .addReg(RsrcPtr, {}, AMDGPU::sub1)
7667 .addReg(VAddr->getReg(), {}, AMDGPU::sub1)
7668 .addReg(CondReg0, RegState::Kill)
7669 .addImm(0);
7670
7671 // NewVaddr = {NewVaddrHi, NewVaddrLo}
7672 BuildMI(MBB, MI, MI.getDebugLoc(), get(AMDGPU::REG_SEQUENCE), NewVAddr)
7673 .addReg(NewVAddrLo)
7674 .addImm(AMDGPU::sub0)
7675 .addReg(NewVAddrHi)
7676 .addImm(AMDGPU::sub1);
7677
7678 VAddr->setReg(NewVAddr);
7679 Rsrc->setReg(NewSRsrc);
7680 } else if (!VAddr && ST.hasAddr64()) {
7681 // This instruction is the _OFFSET variant, so we need to convert it to
7682 // ADDR64.
7683 assert(ST.getGeneration() < AMDGPUSubtarget::VOLCANIC_ISLANDS &&
7684 "FIXME: Need to emit flat atomics here");
7685
7686 unsigned RsrcPtr, NewSRsrc;
7687 std::tie(RsrcPtr, NewSRsrc) = extractRsrcPtr(*this, MI, *Rsrc);
7688
7689 Register NewVAddr = MRI.createVirtualRegister(&AMDGPU::VReg_64RegClass);
7690 MachineOperand *VData = getNamedOperand(MI, AMDGPU::OpName::vdata);
7691 MachineOperand *Offset = getNamedOperand(MI, AMDGPU::OpName::offset);
7692 MachineOperand *SOffset = getNamedOperand(MI, AMDGPU::OpName::soffset);
7693 unsigned Addr64Opcode = AMDGPU::getAddr64Inst(MI.getOpcode());
7694
7695 // Atomics with return have an additional tied operand and are
7696 // missing some of the special bits.
7697 MachineOperand *VDataIn = getNamedOperand(MI, AMDGPU::OpName::vdata_in);
7698 MachineInstr *Addr64;
7699
7700 if (!VDataIn) {
7701 // Regular buffer load / store.
7703 BuildMI(MBB, MI, MI.getDebugLoc(), get(Addr64Opcode))
7704 .add(*VData)
7705 .addReg(NewVAddr)
7706 .addReg(NewSRsrc)
7707 .add(*SOffset)
7708 .add(*Offset);
7709
7710 if (const MachineOperand *CPol =
7711 getNamedOperand(MI, AMDGPU::OpName::cpol)) {
7712 MIB.addImm(CPol->getImm());
7713 }
7714
7715 if (const MachineOperand *TFE =
7716 getNamedOperand(MI, AMDGPU::OpName::tfe)) {
7717 MIB.addImm(TFE->getImm());
7718 }
7719
7720 MIB.addImm(getNamedImmOperand(MI, AMDGPU::OpName::swz));
7721
7722 MIB.cloneMemRefs(MI);
7723 Addr64 = MIB;
7724 } else {
7725 // Atomics with return.
7726 Addr64 = BuildMI(MBB, MI, MI.getDebugLoc(), get(Addr64Opcode))
7727 .add(*VData)
7728 .add(*VDataIn)
7729 .addReg(NewVAddr)
7730 .addReg(NewSRsrc)
7731 .add(*SOffset)
7732 .add(*Offset)
7733 .addImm(getNamedImmOperand(MI, AMDGPU::OpName::cpol))
7734 .cloneMemRefs(MI);
7735 }
7736
7737 MI.removeFromParent();
7738
7739 // NewVaddr = {NewVaddrHi, NewVaddrLo}
7740 BuildMI(MBB, Addr64, Addr64->getDebugLoc(), get(AMDGPU::REG_SEQUENCE),
7741 NewVAddr)
7742 .addReg(RsrcPtr, {}, AMDGPU::sub0)
7743 .addImm(AMDGPU::sub0)
7744 .addReg(RsrcPtr, {}, AMDGPU::sub1)
7745 .addImm(AMDGPU::sub1);
7746 } else {
7747 // Legalize a VGPR Rsrc and soffset together.
7748 if (!isSoffsetLegal) {
7749 MachineOperand *Soffset = getNamedOperand(MI, AMDGPU::OpName::soffset);
7750 CreatedBB = generateWaterFallLoop(*this, MI, {Rsrc, Soffset}, MDT);
7751 return CreatedBB;
7752 }
7753 CreatedBB = generateWaterFallLoop(*this, MI, {Rsrc}, MDT);
7754 return CreatedBB;
7755 }
7756 }
7757
7758 // Legalize a VGPR soffset.
7759 if (!isSoffsetLegal) {
7760 MachineOperand *Soffset = getNamedOperand(MI, AMDGPU::OpName::soffset);
7761 CreatedBB = generateWaterFallLoop(*this, MI, {Soffset}, MDT);
7762 return CreatedBB;
7763 }
7764 return CreatedBB;
7765}
7766
// Body of a worklist insertion routine. The signature line is missing from
// this listing; presumably this is SIInstrWorklist::insert(MachineInstr *MI)
// -- TODO confirm against the header.
//
// Every instruction is recorded in InstrList. Instructions whose opcode has a
// named srsrc operand are additionally recorded in DeferredList.
7768 InstrList.insert(MI);
7769 // Add MBUF instructions (those with an srsrc operand) to the deferred list.
7770 int RsrcIdx =
7771 AMDGPU::getNamedOperandIdx(MI->getOpcode(), AMDGPU::OpName::srsrc);
// An index of -1 is treated as "this opcode has no srsrc operand".
7772 if (RsrcIdx != -1) {
7773 DeferredList.insert(MI);
7774 }
7775}
7776
// Membership query on DeferredList: returns whether MI is contained in it.
// The signature line is missing from this listing; presumably this is the
// isDeferred() query that the worklist driver calls as
// Worklist.isDeferred(&Inst) -- TODO confirm against the header.
7778 return DeferredList.contains(MI);
7779}
7780
7781// Legalize size mismatches between 16-bit and 32-bit registers in v2s copy
7782// lowering (change sgpr to vgpr).
7783// This is mainly caused by 16-bit SALU and 16-bit VALU using registers of
7784// different sizes, so the operand sizes must be legalized along the vgpr
7785// lowering chain. This can be removed after we have sgpr16 in place.
//
// This routine handles the single operand at index OpIdx of MI. The first
// line of its signature is missing from this listing.
7787 MachineRegisterInfo &MRI) const {
// Nothing to do unless the subtarget uses real true16 instructions.
7788 if (!ST.useRealTrue16Insts())
7789 return;
7790
7791 unsigned Opcode = MI.getOpcode();
7792 MachineBasicBlock *MBB = MI.getParent();
7793 // Legalize operands and check for size mismatch.
// Bail out for operand 0, for indices outside MI's explicit operands or
// outside the opcode descriptor's operand list, and for operands whose
// descriptor entry has no register class (RegClass == -1).
7794 if (!OpIdx || OpIdx >= MI.getNumExplicitOperands() ||
7795 OpIdx >= get(Opcode).getNumOperands() ||
7796 get(Opcode).operands()[OpIdx].RegClass == -1)
7797 return;
7798
// Only virtual-register operands are rewritten.
7799 MachineOperand &Op = MI.getOperand(OpIdx);
7800 if (!Op.isReg() || !Op.getReg().isVirtual())
7801 return;
7802
// Only registers that currently live in a VGPR class are considered.
7803 const TargetRegisterClass *CurrRC = MRI.getRegClass(Op.getReg());
7804 if (!RI.isVGPRClass(CurrRC))
7805 return;
7806
// ExpectedRC is the register class the opcode descriptor asks for at OpIdx.
7807 int16_t RCID = getOpRegClassID(get(Opcode).operands()[OpIdx]);
7808 const TargetRegisterClass *ExpectedRC = RI.getRegClass(RCID);
// Case 1 (presumably: the current register is the wider one and its lo16
// sub-register fits the expected class): just select the lo16 sub-register.
7809 if (RI.getMatchingSuperRegClass(CurrRC, ExpectedRC, AMDGPU::lo16)) {
7810 Op.setSubReg(AMDGPU::lo16);
// Case 2 (presumably the reverse: the expected class is the wider one): build
// a new VGPR_32 via REG_SEQUENCE with the original register as lo16 and an
// IMPLICIT_DEF VGPR_16 as hi16, then point the operand at the new register.
7811 } else if (RI.getMatchingSuperRegClass(ExpectedRC, CurrRC, AMDGPU::lo16)) {
7812 const DebugLoc &DL = MI.getDebugLoc();
7813 Register NewDstReg = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
7814 Register Undef = MRI.createVirtualRegister(&AMDGPU::VGPR_16RegClass);
7815 BuildMI(*MBB, MI, DL, get(AMDGPU::IMPLICIT_DEF), Undef);
7816 BuildMI(*MBB, MI, DL, get(AMDGPU::REG_SEQUENCE), NewDstReg)
7817 .addReg(Op.getReg())
7818 .addImm(AMDGPU::lo16)
7819 .addReg(Undef)
7820 .addImm(AMDGPU::hi16);
7821 Op.setReg(NewDstReg);
7822 }
// If neither relation holds the operand is left unchanged.
7823}
// Visits every explicit operand index of MI starting at 1 (index 0 is not
// visited). The first signature line and the loop body are missing from this
// listing; presumably each index is forwarded to the per-operand true16
// legalization -- TODO confirm against upstream.
7825 MachineRegisterInfo &MRI) const {
7826 for (unsigned OpIdx = 1; OpIdx < MI.getNumExplicitOperands(); OpIdx++)
7828}
7829
// Wraps an SI_CALL_ISEL and its surrounding call sequence in a waterfall loop
// by calling generateWaterFallLoop with the range [Start, End) computed below.
// The leading signature lines and the declarations of Start and End are
// missing from this listing (presumably both are instruction iterators
// initialised at MI -- TODO confirm).
7833 ArrayRef<Register> PhySGPRs) const {
7834 assert(MI->getOpcode() == AMDGPU::SI_CALL_ISEL &&
7835 "This only handle waterfall for SI_CALL_ISEL");
7836 // Move everything between ADJCALLSTACKUP and ADJCALLSTACKDOWN, together
7837 // with the copies that follow, into the loop block. Copies from and to
7838 // physical registers that belong to the call sequence must be moved into
7839 // the loop block as well.
7840 MachineBasicBlock &MBB = *MI->getParent();
// Step Start back until it reaches the ADJCALLSTACKUP instruction.
// NOTE(review): there is no guard against running off the block here; the
// loop relies on the marker being present -- confirm this invariant.
7842 while (Start->getOpcode() != AMDGPU::ADJCALLSTACKUP)
7843 --Start;
// Step End forward until it reaches the ADJCALLSTACKDOWN instruction.
7845 while (End->getOpcode() != AMDGPU::ADJCALLSTACKDOWN)
7846 ++End;
7847
7848 // Also include following copies of the return value
// First step past ADJCALLSTACKDOWN itself, then past every COPY whose source
// register (operand 1) is defined by the call instruction MI.
7849 ++End;
7850 while (End != MBB.end() && End->isCopy() &&
7851 MI->definesRegister(End->getOperand(1).getReg(), &RI))
7852 ++End;
7853
7854 generateWaterFallLoop(*this, *MI, ScalarOps, MDT, Start, End, PhySGPRs);
7855}
7856
// Driver that runs moveToVALUImpl over the instructions held by Worklist and
// then performs the deferred clean-up steps collected along the way. The
// first signature line and the declaration of WaterFalls are missing from
// this listing.
7858 MachineDominatorTree *MDT) const {
// Maps a copy instruction to whether it may be erased at the end.
7860 DenseMap<MachineInstr *, bool> V2SPhyCopiesToErase;
// Phase 1: drain the worklist. moveToVALUImpl receives the worklist, so it
// may add further entries while this loop is running.
7861 while (!Worklist.empty()) {
7862 MachineInstr &Inst = *Worklist.top();
7863 Worklist.erase_top();
7864 // Skip MachineInstr in the deferred list.
7865 if (Worklist.isDeferred(&Inst))
7866 continue;
7867 moveToVALUImpl(Worklist, MDT, Inst, WaterFalls, V2SPhyCopiesToErase);
7868 }
7869
7870 // Deferred list of instructions will be processed once
7871 // all the MachineInstr in the worklist are done.
7872 for (MachineInstr *Inst : Worklist.getDeferredList()) {
7873 moveToVALUImpl(Worklist, MDT, *Inst, WaterFalls, V2SPhyCopiesToErase);
7874 assert(Worklist.empty() &&
7875 "Deferred MachineInstr are not supposed to re-populate worklist");
7876 }
7877
// Phase 3: build the waterfall loops recorded for SI_CALL_ISEL instructions,
// passing along the operands and physical SGPRs collected for each call.
7878 for (std::pair<MachineInstr *, V2PhysSCopyInfo> &Entry : WaterFalls) {
7879 if (Entry.first->getOpcode() == AMDGPU::SI_CALL_ISEL)
7880 createWaterFallForSiCall(Entry.first, MDT, Entry.second.MOs,
7881 Entry.second.SGPRs);
7882 }
7883
// Phase 4: erase only those copies whose flag is still true.
7884 for (std::pair<MachineInstr *, bool> Entry : V2SPhyCopiesToErase)
7885 if (Entry.second)
7886 Entry.first->eraseFromParent();
7887}
// Emits, immediately before Inst, V_READFIRSTLANE_B32 instruction(s) that read
// Inst's source (operand 1) and write the result into DstReg. Inst itself is
// not modified or erased here. The first signature line and the declarations
// of DstRegs and MIB are missing from this listing.
7889 MachineRegisterInfo &MRI, Register DstReg, MachineInstr &Inst) const {
7890 // If it's a copy of a VGPR to a physical SGPR, insert a V_READFIRSTLANE and
7891 // hope for the best.
7892 const TargetRegisterClass *DstRC = RI.getRegClassForReg(MRI, DstReg);
// Split the destination class into parts of size 4 (presumably bytes, i.e.
// 32-bit pieces -- TODO confirm the unit of getRegSplitParts).
7893 ArrayRef<int16_t> SubRegIndices = RI.getRegSplitParts(DstRC, 4);
// At most one part: a single readfirstlane into a fresh SReg_32_XM0 register,
// followed by a COPY of that register into DstReg.
7894 if (SubRegIndices.size() <= 1) {
7895 Register NewDst = MRI.createVirtualRegister(&AMDGPU::SReg_32_XM0RegClass);
7896 BuildMI(*Inst.getParent(), &Inst, Inst.getDebugLoc(),
7897 get(AMDGPU::V_READFIRSTLANE_B32), NewDst)
7898 .add(Inst.getOperand(1));
7899 BuildMI(*Inst.getParent(), &Inst, Inst.getDebugLoc(), get(AMDGPU::COPY),
7900 DstReg)
7901 .addReg(NewDst);
7902 } else {
// Several parts: one readfirstlane per sub-register index of the source,
// each into its own SReg_32_XM0 register, collected in DstRegs.
7904 for (int16_t Indice : SubRegIndices) {
7905 Register NewDst = MRI.createVirtualRegister(&AMDGPU::SReg_32_XM0RegClass);
7906 BuildMI(*Inst.getParent(), &Inst, Inst.getDebugLoc(),
7907 get(AMDGPU::V_READFIRSTLANE_B32), NewDst)
7908 .addReg(Inst.getOperand(1).getReg(), {}, Indice);
7909
7910 DstRegs.push_back(NewDst);
7911 }
// Reassemble the pieces into DstReg with a REG_SEQUENCE; piece i is attached
// at the sub-register index returned by getSubRegFromChannel(i).
7913 BuildMI(*Inst.getParent(), &Inst, Inst.getDebugLoc(),
7914 get(AMDGPU::REG_SEQUENCE), DstReg);
7915 for (unsigned i = 0; i < SubRegIndices.size(); ++i) {
7916 MIB.addReg(DstRegs[i]);
7917 MIB.addImm(RI.getSubRegFromChannel(i));
7918 }
7919 }
7920}
7921
// Handles a COPY (Inst) from a virtual register into the physical register
// DstReg. Depending on how DstReg is used afterwards, the copy is replaced by
// readfirstlane code, folded into a later SI_CALL_ISEL (recorded in
// WaterFalls), or kept. The erase decision is recorded in V2SPhyCopiesToErase.
// Parts of the signature and the declarations of the iterators I and E are
// missing from this listing (presumably I starts at Inst and E is the end of
// its block -- TODO confirm).
7923 SIInstrWorklist &Worklist, Register DstReg, MachineInstr &Inst,
7926 DenseMap<MachineInstr *, bool> &V2SPhyCopiesToErase) const {
// A copy into M0 is always replaced by readfirstlane code and marked for
// erasure; no scan of later users is performed.
7927 if (DstReg == AMDGPU::M0) {
7928 createReadFirstLaneFromCopyToPhysReg(MRI, DstReg, Inst);
7929 V2SPhyCopiesToErase.try_emplace(&Inst, true);
7930 return;
7931 }
7932 Register SrcReg = Inst.getOperand(1).getReg();
7935 // Only search current block since phyreg's def & use cannot cross
7936 // blocks when MF.NoPhi = false.
7937 while (++I != E) {
7938 // For SI_CALL_ISEL users, replace the phys SGPR with the VGPR source
7939 // and record the operand for later waterfall loop generation.
7940 if (I->getOpcode() == AMDGPU::SI_CALL_ISEL) {
7941 MachineInstr *UseMI = &*I;
7942 for (unsigned i = 0; i < UseMI->getNumOperands(); ++i) {
7943 if (UseMI->getOperand(i).isReg() &&
7944 UseMI->getOperand(i).getReg() == DstReg) {
7945 MachineOperand *MO = &UseMI->getOperand(i);
7946 MO->setReg(SrcReg);
7947 V2PhysSCopyInfo &V2SCopyInfo = WaterFalls[UseMI];
7948 V2SCopyInfo.MOs.push_back(MO);
7949 V2SCopyInfo.SGPRs.push_back(DstReg);
// try_emplace does not overwrite an existing entry, so a "false" stored
// earlier for this copy stays in place.
7950 V2SPhyCopiesToErase.try_emplace(&Inst, true);
7951 }
7952 }
// SI_RETURN_TO_EPILOG whose operand 0 is DstReg: emit readfirstlane code for
// the copy and mark the copy for erasure (again without overwriting).
7953 } else if (I->getOpcode() == AMDGPU::SI_RETURN_TO_EPILOG &&
7954 I->getOperand(0).isReg() &&
7955 I->getOperand(0).getReg() == DstReg) {
7956 createReadFirstLaneFromCopyToPhysReg(MRI, DstReg, Inst);
7957 V2SPhyCopiesToErase.try_emplace(&Inst, true);
7958 } else if (I->readsRegister(DstReg, &RI)) {
7959 // COPY cannot be erased if other type of inst uses it.
// operator[] assignment overrides any "true" recorded before.
7960 V2SPhyCopiesToErase[&Inst] = false;
7961 }
// Stop scanning once an instruction redefines DstReg; that instruction's own
// uses have already been handled by the branches above.
7962 if (I->findRegisterDefOperand(DstReg, &RI))
7963 break;
7964 }
7965}
7966
7968 SIInstrWorklist &Worklist, MachineDominatorTree *MDT, MachineInstr &Inst,
7970 DenseMap<MachineInstr *, bool> &V2SPhyCopiesToErase) const {
7971
7973 if (!MBB)
7974 return;
7975 MachineRegisterInfo &MRI = MBB->getParent()->getRegInfo();
7976 unsigned Opcode = Inst.getOpcode();
7977 unsigned NewOpcode = getVALUOp(Inst);
7978 const DebugLoc &DL = Inst.getDebugLoc();
7979
7980 // Handle some special cases
7981 switch (Opcode) {
7982 default:
7983 break;
7984 case AMDGPU::S_ADD_I32:
7985 case AMDGPU::S_SUB_I32: {
7986 // FIXME: The u32 versions currently selected use the carry.
7987 bool Changed;
7988 MachineBasicBlock *CreatedBBTmp = nullptr;
7989 std::tie(Changed, CreatedBBTmp) = moveScalarAddSub(Worklist, Inst, MDT);
7990 if (Changed)
7991 return;
7992
7993 // Default handling
7994 break;
7995 }
7996
7997 case AMDGPU::S_MUL_U64:
7998 if (ST.hasVMulU64Inst()) {
7999 NewOpcode = AMDGPU::V_MUL_U64_e64;
8000 break;
8001 }
8002 // Split s_mul_u64 in 32-bit vector multiplications.
8003 splitScalarSMulU64(Worklist, Inst, MDT);
8004 Inst.eraseFromParent();
8005 return;
8006
8007 case AMDGPU::S_MUL_U64_U32_PSEUDO:
8008 case AMDGPU::S_MUL_I64_I32_PSEUDO:
8009 // This is a special case of s_mul_u64 where all the operands are either
8010 // zero extended or sign extended.
8011 splitScalarSMulPseudo(Worklist, Inst, MDT);
8012 Inst.eraseFromParent();
8013 return;
8014
8015 case AMDGPU::S_AND_B64:
8016 splitScalar64BitBinaryOp(Worklist, Inst, AMDGPU::S_AND_B32, MDT);
8017 Inst.eraseFromParent();
8018 return;
8019
8020 case AMDGPU::S_OR_B64:
8021 splitScalar64BitBinaryOp(Worklist, Inst, AMDGPU::S_OR_B32, MDT);
8022 Inst.eraseFromParent();
8023 return;
8024
8025 case AMDGPU::S_XOR_B64:
8026 splitScalar64BitBinaryOp(Worklist, Inst, AMDGPU::S_XOR_B32, MDT);
8027 Inst.eraseFromParent();
8028 return;
8029
8030 case AMDGPU::S_NAND_B64:
8031 splitScalar64BitBinaryOp(Worklist, Inst, AMDGPU::S_NAND_B32, MDT);
8032 Inst.eraseFromParent();
8033 return;
8034
8035 case AMDGPU::S_NOR_B64:
8036 splitScalar64BitBinaryOp(Worklist, Inst, AMDGPU::S_NOR_B32, MDT);
8037 Inst.eraseFromParent();
8038 return;
8039
8040 case AMDGPU::S_XNOR_B64:
8041 if (ST.hasDLInsts())
8042 splitScalar64BitBinaryOp(Worklist, Inst, AMDGPU::S_XNOR_B32, MDT);
8043 else
8044 splitScalar64BitXnor(Worklist, Inst, MDT);
8045 Inst.eraseFromParent();
8046 return;
8047
8048 case AMDGPU::S_ANDN2_B64:
8049 splitScalar64BitBinaryOp(Worklist, Inst, AMDGPU::S_ANDN2_B32, MDT);
8050 Inst.eraseFromParent();
8051 return;
8052
8053 case AMDGPU::S_ORN2_B64:
8054 splitScalar64BitBinaryOp(Worklist, Inst, AMDGPU::S_ORN2_B32, MDT);
8055 Inst.eraseFromParent();
8056 return;
8057
8058 case AMDGPU::S_BREV_B64:
8059 splitScalar64BitUnaryOp(Worklist, Inst, AMDGPU::S_BREV_B32, true);
8060 Inst.eraseFromParent();
8061 return;
8062
8063 case AMDGPU::S_NOT_B64:
8064 splitScalar64BitUnaryOp(Worklist, Inst, AMDGPU::S_NOT_B32);
8065 Inst.eraseFromParent();
8066 return;
8067
8068 case AMDGPU::S_BCNT1_I32_B64:
8069 splitScalar64BitBCNT(Worklist, Inst);
8070 Inst.eraseFromParent();
8071 return;
8072
8073 case AMDGPU::S_BFE_I64:
8074 splitScalar64BitBFE(Worklist, Inst);
8075 Inst.eraseFromParent();
8076 return;
8077
8078 case AMDGPU::S_FLBIT_I32_B64:
8079 splitScalar64BitCountOp(Worklist, Inst, AMDGPU::V_FFBH_U32_e32);
8080 Inst.eraseFromParent();
8081 return;
8082 case AMDGPU::S_FF1_I32_B64:
8083 splitScalar64BitCountOp(Worklist, Inst, AMDGPU::V_FFBL_B32_e32);
8084 Inst.eraseFromParent();
8085 return;
8086
8087 case AMDGPU::S_LSHL_B32:
8088 if (ST.hasOnlyRevVALUShifts()) {
8089 NewOpcode = AMDGPU::V_LSHLREV_B32_e64;
8090 swapOperands(Inst);
8091 }
8092 break;
8093 case AMDGPU::S_ASHR_I32:
8094 if (ST.hasOnlyRevVALUShifts()) {
8095 NewOpcode = AMDGPU::V_ASHRREV_I32_e64;
8096 swapOperands(Inst);
8097 }
8098 break;
8099 case AMDGPU::S_LSHR_B32:
8100 if (ST.hasOnlyRevVALUShifts()) {
8101 NewOpcode = AMDGPU::V_LSHRREV_B32_e64;
8102 swapOperands(Inst);
8103 }
8104 break;
8105 case AMDGPU::S_LSHL_B64:
8106 if (ST.hasOnlyRevVALUShifts()) {
8107 NewOpcode = ST.getGeneration() >= AMDGPUSubtarget::GFX12
8108 ? AMDGPU::V_LSHLREV_B64_pseudo_e64
8109 : AMDGPU::V_LSHLREV_B64_e64;
8110 swapOperands(Inst);
8111 }
8112 break;
8113 case AMDGPU::S_ASHR_I64:
8114 if (ST.hasOnlyRevVALUShifts()) {
8115 NewOpcode = AMDGPU::V_ASHRREV_I64_e64;
8116 swapOperands(Inst);
8117 }
8118 break;
8119 case AMDGPU::S_LSHR_B64:
8120 if (ST.hasOnlyRevVALUShifts()) {
8121 NewOpcode = AMDGPU::V_LSHRREV_B64_e64;
8122 swapOperands(Inst);
8123 }
8124 break;
8125
8126 case AMDGPU::S_ABS_I32:
8127 lowerScalarAbs(Worklist, Inst);
8128 Inst.eraseFromParent();
8129 return;
8130
8131 case AMDGPU::S_ABSDIFF_I32:
8132 lowerScalarAbsDiff(Worklist, Inst);
8133 Inst.eraseFromParent();
8134 return;
8135
8136 case AMDGPU::S_CBRANCH_SCC0:
8137 case AMDGPU::S_CBRANCH_SCC1: {
8138 // Clear unused bits of vcc
8139 Register CondReg = Inst.getOperand(1).getReg();
8140 bool IsSCC = CondReg == AMDGPU::SCC;
8142 BuildMI(*MBB, Inst, Inst.getDebugLoc(), get(LMC.AndOpc), LMC.VccReg)
8143 .addReg(LMC.ExecReg)
8144 .addReg(IsSCC ? LMC.VccReg : CondReg);
8145 Inst.removeOperand(1);
8146 } break;
8147
8148 case AMDGPU::S_BFE_U64:
8149 case AMDGPU::S_BFM_B64:
8150 llvm_unreachable("Moving this op to VALU not implemented");
8151
8152 case AMDGPU::S_PACK_LL_B32_B16:
8153 case AMDGPU::S_PACK_LH_B32_B16:
8154 case AMDGPU::S_PACK_HL_B32_B16:
8155 case AMDGPU::S_PACK_HH_B32_B16:
8156 movePackToVALU(Worklist, MRI, Inst);
8157 Inst.eraseFromParent();
8158 return;
8159
8160 case AMDGPU::S_XNOR_B32:
8161 lowerScalarXnor(Worklist, Inst);
8162 Inst.eraseFromParent();
8163 return;
8164
8165 case AMDGPU::S_NAND_B32:
8166 splitScalarNotBinop(Worklist, Inst, AMDGPU::S_AND_B32);
8167 Inst.eraseFromParent();
8168 return;
8169
8170 case AMDGPU::S_NOR_B32:
8171 splitScalarNotBinop(Worklist, Inst, AMDGPU::S_OR_B32);
8172 Inst.eraseFromParent();
8173 return;
8174
8175 case AMDGPU::S_ANDN2_B32:
8176 splitScalarBinOpN2(Worklist, Inst, AMDGPU::S_AND_B32);
8177 Inst.eraseFromParent();
8178 return;
8179
8180 case AMDGPU::S_ORN2_B32:
8181 splitScalarBinOpN2(Worklist, Inst, AMDGPU::S_OR_B32);
8182 Inst.eraseFromParent();
8183 return;
8184
8185 // TODO: remove as soon as everything is ready
8186 // to replace VGPR to SGPR copy with V_READFIRSTLANEs.
8187 // S_ADD/SUB_CO_PSEUDO as well as S_UADDO/USUBO_PSEUDO
8188 // can only be selected from the uniform SDNode.
8189 case AMDGPU::S_ADD_CO_PSEUDO:
8190 case AMDGPU::S_SUB_CO_PSEUDO: {
8191 unsigned Opc = (Inst.getOpcode() == AMDGPU::S_ADD_CO_PSEUDO)
8192 ? AMDGPU::V_ADDC_U32_e64
8193 : AMDGPU::V_SUBB_U32_e64;
8194 const auto *CarryRC = RI.getWaveMaskRegClass();
8195
8196 Register CarryInReg = Inst.getOperand(4).getReg();
8197 if (!MRI.constrainRegClass(CarryInReg, CarryRC)) {
8198 Register NewCarryReg = MRI.createVirtualRegister(CarryRC);
8199 BuildMI(*MBB, Inst, Inst.getDebugLoc(), get(AMDGPU::COPY), NewCarryReg)
8200 .addReg(CarryInReg);
8201 }
8202
8203 Register CarryOutReg = Inst.getOperand(1).getReg();
8204
8205 Register DestReg = MRI.createVirtualRegister(RI.getEquivalentVGPRClass(
8206 MRI.getRegClass(Inst.getOperand(0).getReg())));
8207 MachineInstr *CarryOp =
8208 BuildMI(*MBB, &Inst, Inst.getDebugLoc(), get(Opc), DestReg)
8209 .addReg(CarryOutReg, RegState::Define)
8210 .add(Inst.getOperand(2))
8211 .add(Inst.getOperand(3))
8212 .addReg(CarryInReg)
8213 .addImm(0);
8214 legalizeOperands(*CarryOp);
8215 MRI.replaceRegWith(Inst.getOperand(0).getReg(), DestReg);
8216 addUsersToMoveToVALUWorklist(DestReg, MRI, Worklist);
8217 Inst.eraseFromParent();
8218 }
8219 return;
8220 case AMDGPU::S_UADDO_PSEUDO:
8221 case AMDGPU::S_USUBO_PSEUDO: {
8222 MachineOperand &Dest0 = Inst.getOperand(0);
8223 MachineOperand &Dest1 = Inst.getOperand(1);
8224 MachineOperand &Src0 = Inst.getOperand(2);
8225 MachineOperand &Src1 = Inst.getOperand(3);
8226
8227 unsigned Opc = (Inst.getOpcode() == AMDGPU::S_UADDO_PSEUDO)
8228 ? AMDGPU::V_ADD_CO_U32_e64
8229 : AMDGPU::V_SUB_CO_U32_e64;
8230 const TargetRegisterClass *NewRC =
8231 RI.getEquivalentVGPRClass(MRI.getRegClass(Dest0.getReg()));
8232 Register DestReg = MRI.createVirtualRegister(NewRC);
8233 MachineInstr *NewInstr = BuildMI(*MBB, &Inst, DL, get(Opc), DestReg)
8234 .addReg(Dest1.getReg(), RegState::Define)
8235 .add(Src0)
8236 .add(Src1)
8237 .addImm(0); // clamp bit
8238
8239 legalizeOperands(*NewInstr, MDT);
8240 MRI.replaceRegWith(Dest0.getReg(), DestReg);
8241 addUsersToMoveToVALUWorklist(DestReg, MRI, Worklist);
8242 Inst.eraseFromParent();
8243 }
8244 return;
8245 case AMDGPU::S_LSHL1_ADD_U32:
8246 case AMDGPU::S_LSHL2_ADD_U32:
8247 case AMDGPU::S_LSHL3_ADD_U32:
8248 case AMDGPU::S_LSHL4_ADD_U32: {
8249 MachineOperand &Dest = Inst.getOperand(0);
8250 MachineOperand &Src0 = Inst.getOperand(1);
8251 MachineOperand &Src1 = Inst.getOperand(2);
8252 unsigned ShiftAmt = (Opcode == AMDGPU::S_LSHL1_ADD_U32 ? 1
8253 : Opcode == AMDGPU::S_LSHL2_ADD_U32 ? 2
8254 : Opcode == AMDGPU::S_LSHL3_ADD_U32 ? 3
8255 : 4);
8256
8257 const TargetRegisterClass *NewRC =
8258 RI.getEquivalentVGPRClass(MRI.getRegClass(Dest.getReg()));
8259 Register DestReg = MRI.createVirtualRegister(NewRC);
8260 MachineInstr *NewInstr =
8261 BuildMI(*MBB, &Inst, DL, get(AMDGPU::V_LSHL_ADD_U32_e64), DestReg)
8262 .add(Src0)
8263 .addImm(ShiftAmt)
8264 .add(Src1);
8265
8266 legalizeOperands(*NewInstr, MDT);
8267 MRI.replaceRegWith(Dest.getReg(), DestReg);
8268 addUsersToMoveToVALUWorklist(DestReg, MRI, Worklist);
8269 Inst.eraseFromParent();
8270 }
8271 return;
8272 case AMDGPU::S_CSELECT_B32:
8273 case AMDGPU::S_CSELECT_B64:
8274 lowerSelect(Worklist, Inst, MDT);
8275 Inst.eraseFromParent();
8276 return;
8277 case AMDGPU::S_CMP_EQ_I32:
8278 case AMDGPU::S_CMP_LG_I32:
8279 case AMDGPU::S_CMP_GT_I32:
8280 case AMDGPU::S_CMP_GE_I32:
8281 case AMDGPU::S_CMP_LT_I32:
8282 case AMDGPU::S_CMP_LE_I32:
8283 case AMDGPU::S_CMP_EQ_U32:
8284 case AMDGPU::S_CMP_LG_U32:
8285 case AMDGPU::S_CMP_GT_U32:
8286 case AMDGPU::S_CMP_GE_U32:
8287 case AMDGPU::S_CMP_LT_U32:
8288 case AMDGPU::S_CMP_LE_U32:
8289 case AMDGPU::S_CMP_EQ_U64:
8290 case AMDGPU::S_CMP_LG_U64:
8291 case AMDGPU::S_CMP_LT_F32:
8292 case AMDGPU::S_CMP_EQ_F32:
8293 case AMDGPU::S_CMP_LE_F32:
8294 case AMDGPU::S_CMP_GT_F32:
8295 case AMDGPU::S_CMP_LG_F32:
8296 case AMDGPU::S_CMP_GE_F32:
8297 case AMDGPU::S_CMP_O_F32:
8298 case AMDGPU::S_CMP_U_F32:
8299 case AMDGPU::S_CMP_NGE_F32:
8300 case AMDGPU::S_CMP_NLG_F32:
8301 case AMDGPU::S_CMP_NGT_F32:
8302 case AMDGPU::S_CMP_NLE_F32:
8303 case AMDGPU::S_CMP_NEQ_F32:
8304 case AMDGPU::S_CMP_NLT_F32: {
8305 Register CondReg = MRI.createVirtualRegister(RI.getWaveMaskRegClass());
8306 auto NewInstr =
8307 BuildMI(*MBB, Inst, Inst.getDebugLoc(), get(NewOpcode), CondReg)
8308 .setMIFlags(Inst.getFlags());
8309 if (AMDGPU::getNamedOperandIdx(NewOpcode, AMDGPU::OpName::src0_modifiers) >=
8310 0) {
8311 NewInstr
8312 .addImm(0) // src0_modifiers
8313 .add(Inst.getOperand(0)) // src0
8314 .addImm(0) // src1_modifiers
8315 .add(Inst.getOperand(1)) // src1
8316 .addImm(0); // clamp
8317 } else {
8318 NewInstr.add(Inst.getOperand(0)).add(Inst.getOperand(1));
8319 }
8320 legalizeOperands(*NewInstr, MDT);
8321 int SCCIdx = Inst.findRegisterDefOperandIdx(AMDGPU::SCC, /*TRI=*/nullptr);
8322 const MachineOperand &SCCOp = Inst.getOperand(SCCIdx);
8323 addSCCDefUsersToVALUWorklist(SCCOp, Inst, Worklist, CondReg);
8324 Inst.eraseFromParent();
8325 return;
8326 }
8327 case AMDGPU::S_CMP_LT_F16:
8328 case AMDGPU::S_CMP_EQ_F16:
8329 case AMDGPU::S_CMP_LE_F16:
8330 case AMDGPU::S_CMP_GT_F16:
8331 case AMDGPU::S_CMP_LG_F16:
8332 case AMDGPU::S_CMP_GE_F16:
8333 case AMDGPU::S_CMP_O_F16:
8334 case AMDGPU::S_CMP_U_F16:
8335 case AMDGPU::S_CMP_NGE_F16:
8336 case AMDGPU::S_CMP_NLG_F16:
8337 case AMDGPU::S_CMP_NGT_F16:
8338 case AMDGPU::S_CMP_NLE_F16:
8339 case AMDGPU::S_CMP_NEQ_F16:
8340 case AMDGPU::S_CMP_NLT_F16: {
8341 Register CondReg = MRI.createVirtualRegister(RI.getWaveMaskRegClass());
8342 auto NewInstr =
8343 BuildMI(*MBB, Inst, Inst.getDebugLoc(), get(NewOpcode), CondReg)
8344 .setMIFlags(Inst.getFlags());
8345 if (AMDGPU::hasNamedOperand(NewOpcode, AMDGPU::OpName::src0_modifiers)) {
8346 NewInstr
8347 .addImm(0) // src0_modifiers
8348 .add(Inst.getOperand(0)) // src0
8349 .addImm(0) // src1_modifiers
8350 .add(Inst.getOperand(1)) // src1
8351 .addImm(0); // clamp
8352 if (AMDGPU::hasNamedOperand(NewOpcode, AMDGPU::OpName::op_sel))
8353 NewInstr.addImm(0); // op_sel0
8354 } else {
8355 NewInstr
8356 .add(Inst.getOperand(0))
8357 .add(Inst.getOperand(1));
8358 }
8359 legalizeOperandsVALUt16(*NewInstr, MRI);
8360 legalizeOperands(*NewInstr, MDT);
8361 int SCCIdx = Inst.findRegisterDefOperandIdx(AMDGPU::SCC, /*TRI=*/nullptr);
8362 const MachineOperand &SCCOp = Inst.getOperand(SCCIdx);
8363 addSCCDefUsersToVALUWorklist(SCCOp, Inst, Worklist, CondReg);
8364 Inst.eraseFromParent();
8365 return;
8366 }
8367 case AMDGPU::S_CVT_HI_F32_F16: {
8368 Register TmpReg = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
8369 Register NewDst = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
8370 if (ST.useRealTrue16Insts()) {
8371 BuildMI(*MBB, Inst, DL, get(AMDGPU::COPY), TmpReg)
8372 .add(Inst.getOperand(1));
8373 BuildMI(*MBB, Inst, DL, get(NewOpcode), NewDst)
8374 .addImm(0) // src0_modifiers
8375 .addReg(TmpReg, {}, AMDGPU::hi16)
8376 .addImm(0) // clamp
8377 .addImm(0) // omod
8378 .addImm(0); // op_sel0
8379 } else {
8380 BuildMI(*MBB, Inst, DL, get(AMDGPU::V_LSHRREV_B32_e64), TmpReg)
8381 .addImm(16)
8382 .add(Inst.getOperand(1));
8383 BuildMI(*MBB, Inst, DL, get(NewOpcode), NewDst)
8384 .addImm(0) // src0_modifiers
8385 .addReg(TmpReg)
8386 .addImm(0) // clamp
8387 .addImm(0); // omod
8388 }
8389
8390 MRI.replaceRegWith(Inst.getOperand(0).getReg(), NewDst);
8391 addUsersToMoveToVALUWorklist(NewDst, MRI, Worklist);
8392 Inst.eraseFromParent();
8393 return;
8394 }
8395 case AMDGPU::S_MINIMUM_F32:
8396 case AMDGPU::S_MAXIMUM_F32: {
8397 Register NewDst = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
8398 MachineInstr *NewInstr = BuildMI(*MBB, Inst, DL, get(NewOpcode), NewDst)
8399 .addImm(0) // src0_modifiers
8400 .add(Inst.getOperand(1))
8401 .addImm(0) // src1_modifiers
8402 .add(Inst.getOperand(2))
8403 .addImm(0) // clamp
8404 .addImm(0); // omod
8405 MRI.replaceRegWith(Inst.getOperand(0).getReg(), NewDst);
8406
8407 legalizeOperands(*NewInstr, MDT);
8408 addUsersToMoveToVALUWorklist(NewDst, MRI, Worklist);
8409 Inst.eraseFromParent();
8410 return;
8411 }
8412 case AMDGPU::S_MINIMUM_F16:
8413 case AMDGPU::S_MAXIMUM_F16: {
8414 Register NewDst = MRI.createVirtualRegister(ST.useRealTrue16Insts()
8415 ? &AMDGPU::VGPR_16RegClass
8416 : &AMDGPU::VGPR_32RegClass);
8417 MachineInstr *NewInstr = BuildMI(*MBB, Inst, DL, get(NewOpcode), NewDst)
8418 .addImm(0) // src0_modifiers
8419 .add(Inst.getOperand(1))
8420 .addImm(0) // src1_modifiers
8421 .add(Inst.getOperand(2))
8422 .addImm(0) // clamp
8423 .addImm(0) // omod
8424 .addImm(0); // opsel0
8425 MRI.replaceRegWith(Inst.getOperand(0).getReg(), NewDst);
8426 legalizeOperandsVALUt16(*NewInstr, MRI);
8427 legalizeOperands(*NewInstr, MDT);
8428 addUsersToMoveToVALUWorklist(NewDst, MRI, Worklist);
8429 Inst.eraseFromParent();
8430 return;
8431 }
8432 case AMDGPU::V_S_EXP_F16_e64:
8433 case AMDGPU::V_S_LOG_F16_e64:
8434 case AMDGPU::V_S_RCP_F16_e64:
8435 case AMDGPU::V_S_RSQ_F16_e64:
8436 case AMDGPU::V_S_SQRT_F16_e64: {
8437 Register NewDst = MRI.createVirtualRegister(ST.useRealTrue16Insts()
8438 ? &AMDGPU::VGPR_16RegClass
8439 : &AMDGPU::VGPR_32RegClass);
8440 auto NewInstr = BuildMI(*MBB, Inst, DL, get(NewOpcode), NewDst)
8441 .add(Inst.getOperand(1)) // src0_modifiers
8442 .add(Inst.getOperand(2))
8443 .add(Inst.getOperand(3)) // clamp
8444 .add(Inst.getOperand(4)) // omod
8445 .setMIFlags(Inst.getFlags());
8446 if (AMDGPU::hasNamedOperand(NewOpcode, AMDGPU::OpName::op_sel))
8447 NewInstr.addImm(0); // opsel0
8448 MRI.replaceRegWith(Inst.getOperand(0).getReg(), NewDst);
8449 legalizeOperandsVALUt16(*NewInstr, MRI);
8450 legalizeOperands(*NewInstr, MDT);
8451 addUsersToMoveToVALUWorklist(NewDst, MRI, Worklist);
8452 Inst.eraseFromParent();
8453 return;
8454 }
8455 }
8456
8457 if (NewOpcode == AMDGPU::INSTRUCTION_LIST_END) {
8458 // We cannot move this instruction to the VALU, so we should try to
8459 // legalize its operands instead.
8460 legalizeOperands(Inst, MDT);
8461 return;
8462 }
8463 // Handle converting generic instructions like COPY-to-SGPR into
8464 // COPY-to-VGPR.
8465 if (NewOpcode == Opcode) {
8466 Register DstReg = Inst.getOperand(0).getReg();
8467 const TargetRegisterClass *NewDstRC = getDestEquivalentVGPRClass(Inst);
8468
8469 if (Inst.isCopy() && DstReg.isPhysical() &&
8470 Inst.getOperand(1).getReg().isVirtual()) {
8471 handleCopyToPhysHelper(Worklist, DstReg, Inst, MRI, WaterFalls,
8472 V2SPhyCopiesToErase);
8473 return;
8474 }
8475
8476 if (Inst.isCopy() && Inst.getOperand(1).getReg().isVirtual()) {
8477 Register NewDstReg = Inst.getOperand(1).getReg();
8478 const TargetRegisterClass *SrcRC = RI.getRegClassForReg(MRI, NewDstReg);
8479 if (const TargetRegisterClass *CommonRC =
8480 RI.getCommonSubClass(NewDstRC, SrcRC)) {
8481 // Instead of creating a copy where src and dst are the same register
8482 // class, we just replace all uses of dst with src. These kinds of
8483 // copies interfere with the heuristics MachineSink uses to decide
8484 // whether or not to split a critical edge. Since the pass assumes
8485 // that copies will end up as machine instructions and not be
8486 // eliminated.
8487 addUsersToMoveToVALUWorklist(DstReg, MRI, Worklist);
8488 MRI.replaceRegWith(DstReg, NewDstReg);
8489 MRI.clearKillFlags(NewDstReg);
8490 Inst.getOperand(0).setReg(DstReg);
8491
8492 if (!MRI.constrainRegClass(NewDstReg, CommonRC))
8493 llvm_unreachable("failed to constrain register");
8494
8495 Inst.eraseFromParent();
8496
8497 for (MachineOperand &UseMO :
8498 make_early_inc_range(MRI.use_operands(NewDstReg))) {
8499 MachineInstr &UseMI = *UseMO.getParent();
8500
8501 // Legalize t16 operands since replaceReg is called after
8502 // addUsersToVALU.
8504
8505 unsigned OpIdx = UseMI.getOperandNo(&UseMO);
8506 if (const TargetRegisterClass *OpRC =
8507 getRegClass(UseMI.getDesc(), OpIdx))
8508 MRI.constrainRegClass(NewDstReg, OpRC);
8509 }
8510
8511 return;
8512 }
8513 }
8514
8515 // If this is a v2s copy between 16bit and 32bit reg,
8516 // replace vgpr copy to reg_sequence/extract_subreg
8517 // This can be remove after we have sgpr16 in place
8518 if (ST.useRealTrue16Insts() && Inst.isCopy() &&
8519 Inst.getOperand(1).getReg().isVirtual() &&
8520 RI.isVGPR(MRI, Inst.getOperand(1).getReg())) {
8521 const TargetRegisterClass *SrcRegRC = getOpRegClass(Inst, 1);
8522 if (RI.getMatchingSuperRegClass(NewDstRC, SrcRegRC, AMDGPU::lo16)) {
8523 Register NewDstReg = MRI.createVirtualRegister(NewDstRC);
8524 Register Undef = MRI.createVirtualRegister(&AMDGPU::VGPR_16RegClass);
8525 BuildMI(*Inst.getParent(), &Inst, Inst.getDebugLoc(),
8526 get(AMDGPU::IMPLICIT_DEF), Undef);
8527 BuildMI(*Inst.getParent(), &Inst, Inst.getDebugLoc(),
8528 get(AMDGPU::REG_SEQUENCE), NewDstReg)
8529 .addReg(Inst.getOperand(1).getReg())
8530 .addImm(AMDGPU::lo16)
8531 .addReg(Undef)
8532 .addImm(AMDGPU::hi16);
8533 Inst.eraseFromParent();
8534 MRI.replaceRegWith(DstReg, NewDstReg);
8535 addUsersToMoveToVALUWorklist(NewDstReg, MRI, Worklist);
8536 return;
8537 } else if (RI.getMatchingSuperRegClass(SrcRegRC, NewDstRC,
8538 AMDGPU::lo16)) {
8539 Inst.getOperand(1).setSubReg(AMDGPU::lo16);
8540 Register NewDstReg = MRI.createVirtualRegister(NewDstRC);
8541 MRI.replaceRegWith(DstReg, NewDstReg);
8542 addUsersToMoveToVALUWorklist(NewDstReg, MRI, Worklist);
8543 return;
8544 }
8545 }
8546
8547 Register NewDstReg = MRI.createVirtualRegister(NewDstRC);
8548 MRI.replaceRegWith(DstReg, NewDstReg);
8549 legalizeOperands(Inst, MDT);
8550 addUsersToMoveToVALUWorklist(NewDstReg, MRI, Worklist);
8551 return;
8552 }
8553
8554 // Use the new VALU Opcode.
8555 auto NewInstr = BuildMI(*MBB, Inst, Inst.getDebugLoc(), get(NewOpcode))
8556 .setMIFlags(Inst.getFlags());
8557 if (isVOP3(NewOpcode) && !isVOP3(Opcode)) {
8558 // Intersperse VOP3 modifiers among the SALU operands.
8559 NewInstr->addOperand(Inst.getOperand(0));
8560 if (AMDGPU::getNamedOperandIdx(NewOpcode,
8561 AMDGPU::OpName::src0_modifiers) >= 0)
8562 NewInstr.addImm(0);
8563 if (AMDGPU::hasNamedOperand(NewOpcode, AMDGPU::OpName::src0)) {
8564 const MachineOperand &Src = Inst.getOperand(1);
8565 NewInstr->addOperand(Src);
8566 }
8567
8568 if (Opcode == AMDGPU::S_SEXT_I32_I8 || Opcode == AMDGPU::S_SEXT_I32_I16) {
8569 // We are converting these to a BFE, so we need to add the missing
8570 // operands for the size and offset.
8571 unsigned Size = (Opcode == AMDGPU::S_SEXT_I32_I8) ? 8 : 16;
8572 NewInstr.addImm(0);
8573 NewInstr.addImm(Size);
8574 } else if (Opcode == AMDGPU::S_BCNT1_I32_B32) {
8575 // The VALU version adds the second operand to the result, so insert an
8576 // extra 0 operand.
8577 NewInstr.addImm(0);
8578 } else if (Opcode == AMDGPU::S_BFE_I32 || Opcode == AMDGPU::S_BFE_U32) {
8579 const MachineOperand &OffsetWidthOp = Inst.getOperand(2);
8580 // If we need to move this to VGPRs, we need to unpack the second
8581 // operand back into the 2 separate ones for bit offset and width.
8582 assert(OffsetWidthOp.isImm() &&
8583 "Scalar BFE is only implemented for constant width and offset");
8584 uint32_t Imm = OffsetWidthOp.getImm();
8585
8586 uint32_t Offset = Imm & 0x3f; // Extract bits [5:0].
8587 uint32_t BitWidth = (Imm & 0x7f0000) >> 16; // Extract bits [22:16].
8588 NewInstr.addImm(Offset);
8589 NewInstr.addImm(BitWidth);
8590 } else {
8591 if (AMDGPU::getNamedOperandIdx(NewOpcode,
8592 AMDGPU::OpName::src1_modifiers) >= 0)
8593 NewInstr.addImm(0);
8594 if (AMDGPU::getNamedOperandIdx(NewOpcode, AMDGPU::OpName::src1) >= 0)
8595 NewInstr->addOperand(Inst.getOperand(2));
8596 if (AMDGPU::getNamedOperandIdx(NewOpcode,
8597 AMDGPU::OpName::src2_modifiers) >= 0)
8598 NewInstr.addImm(0);
8599 if (AMDGPU::getNamedOperandIdx(NewOpcode, AMDGPU::OpName::src2) >= 0)
8600 NewInstr->addOperand(Inst.getOperand(3));
8601 if (AMDGPU::getNamedOperandIdx(NewOpcode, AMDGPU::OpName::clamp) >= 0)
8602 NewInstr.addImm(0);
8603 if (AMDGPU::getNamedOperandIdx(NewOpcode, AMDGPU::OpName::omod) >= 0)
8604 NewInstr.addImm(0);
8605 if (AMDGPU::getNamedOperandIdx(NewOpcode, AMDGPU::OpName::op_sel) >= 0)
8606 NewInstr.addImm(0);
8607 }
8608 } else {
8609 // Just copy the SALU operands.
8610 for (const MachineOperand &Op : Inst.explicit_operands())
8611 NewInstr->addOperand(Op);
8612 }
8613
8614 // Remove any references to SCC. Vector instructions can't read from it, and
8615 // We're just about to add the implicit use / defs of VCC, and we don't want
8616 // both.
8617 for (MachineOperand &Op : Inst.implicit_operands()) {
8618 if (Op.getReg() == AMDGPU::SCC) {
8619 // Only propagate through live-def of SCC.
8620 if (Op.isDef() && !Op.isDead())
8621 addSCCDefUsersToVALUWorklist(Op, Inst, Worklist);
8622 if (Op.isUse())
8623 addSCCDefsToVALUWorklist(NewInstr, Worklist);
8624 }
8625 }
8626 Inst.eraseFromParent();
8627 Register NewDstReg;
8628 if (NewInstr->getOperand(0).isReg() && NewInstr->getOperand(0).isDef()) {
8629 Register DstReg = NewInstr->getOperand(0).getReg();
8630 assert(DstReg.isVirtual());
8631 // Update the destination register class.
8632 const TargetRegisterClass *NewDstRC = getDestEquivalentVGPRClass(*NewInstr);
8633 assert(NewDstRC);
8634 NewDstReg = MRI.createVirtualRegister(NewDstRC);
8635 MRI.replaceRegWith(DstReg, NewDstReg);
8636 }
8637 fixImplicitOperands(*NewInstr);
8638
8639 legalizeOperandsVALUt16(*NewInstr, MRI);
8640
8641 // Legalize the operands
8642 legalizeOperands(*NewInstr, MDT);
8643 if (NewDstReg)
8644 addUsersToMoveToVALUWorklist(NewDstReg, MRI, Worklist);
8645}
8646
8647// Add/sub require special handling to deal with carry outs.
8648std::pair<bool, MachineBasicBlock *>
8649SIInstrInfo::moveScalarAddSub(SIInstrWorklist &Worklist, MachineInstr &Inst,
8650 MachineDominatorTree *MDT) const {
8651 if (ST.hasAddNoCarryInsts()) {
8652 // Assume there is no user of scc since we don't select this in that case.
8653 // Since scc isn't used, it doesn't really matter if the i32 or u32 variant
8654 // is used.
8655
8656 MachineBasicBlock &MBB = *Inst.getParent();
8657 MachineRegisterInfo &MRI = MBB.getParent()->getRegInfo();
8658
8659 Register OldDstReg = Inst.getOperand(0).getReg();
8660 Register ResultReg = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
8661
8662 unsigned Opc = Inst.getOpcode();
8663 assert(Opc == AMDGPU::S_ADD_I32 || Opc == AMDGPU::S_SUB_I32);
8664
8665 unsigned NewOpc = Opc == AMDGPU::S_ADD_I32 ?
8666 AMDGPU::V_ADD_U32_e64 : AMDGPU::V_SUB_U32_e64;
8667
8668 assert(Inst.getOperand(3).getReg() == AMDGPU::SCC);
8669 Inst.removeOperand(3);
8670
8671 Inst.setDesc(get(NewOpc));
8672 Inst.addOperand(MachineOperand::CreateImm(0)); // clamp bit
8673 Inst.addImplicitDefUseOperands(*MBB.getParent());
8674 MRI.replaceRegWith(OldDstReg, ResultReg);
8675 MachineBasicBlock *NewBB = legalizeOperands(Inst, MDT);
8676
8677 addUsersToMoveToVALUWorklist(ResultReg, MRI, Worklist);
8678 return std::pair(true, NewBB);
8679 }
8680
8681 return std::pair(false, nullptr);
8682}
8683
8684void SIInstrInfo::lowerSelect(SIInstrWorklist &Worklist, MachineInstr &Inst,
8685 MachineDominatorTree *MDT) const {
8686
8687 MachineBasicBlock &MBB = *Inst.getParent();
8688 MachineRegisterInfo &MRI = MBB.getParent()->getRegInfo();
8689 MachineBasicBlock::iterator MII = Inst;
8690 const DebugLoc &DL = Inst.getDebugLoc();
8691
8692 MachineOperand &Dest = Inst.getOperand(0);
8693 MachineOperand &Src0 = Inst.getOperand(1);
8694 MachineOperand &Src1 = Inst.getOperand(2);
8695 MachineOperand &Cond = Inst.getOperand(3);
8696
8697 Register CondReg = Cond.getReg();
8698 bool IsSCC = (CondReg == AMDGPU::SCC);
8699
8700 // Remove S_CSELECT instructions that we previously inserted to feed the SCC
8701 // condition output from S_CMP into the SGPR condition input of V_CNDMASK. If
8702 // the S_CMP has been promoted to V_CMP then we can feed its SGPR condition
8703 // output directly into the V_CNDMASK.
8704 if (!IsSCC && Src0.isImm() && (Src0.getImm() == -1) && Src1.isImm() &&
8705 (Src1.getImm() == 0)) {
8706 for (MachineOperand &UseMO :
8708 MachineInstr &UseMI = *UseMO.getParent();
8709 switch (UseMI.getOpcode()) {
8710 case AMDGPU::V_CNDMASK_B16_fake16_e32:
8711 case AMDGPU::V_CNDMASK_B16_fake16_e64:
8712 case AMDGPU::V_CNDMASK_B16_t16_e32:
8713 case AMDGPU::V_CNDMASK_B16_t16_e64:
8714 case AMDGPU::V_CNDMASK_B32_e32:
8715 case AMDGPU::V_CNDMASK_B32_e64:
8716 case AMDGPU::V_CNDMASK_B64_PSEUDO:
8717 if (UseMO.isImplicit() ||
8718 &UseMO == getNamedOperand(UseMI, AMDGPU::OpName::src2))
8719 UseMO.setReg(CondReg);
8720 }
8721 }
8722 if (MRI.use_nodbg_empty(Dest.getReg()))
8723 return;
8724 }
8725
8726 Register NewCondReg = CondReg;
8727 if (IsSCC) {
8728 const TargetRegisterClass *TC = RI.getWaveMaskRegClass();
8729 NewCondReg = MRI.createVirtualRegister(TC);
8730
8731 // Now look for the closest SCC def if it is a copy
8732 // replacing the CondReg with the COPY source register
8733 bool CopyFound = false;
8734 for (MachineInstr &CandI :
8736 Inst.getParent()->rend())) {
8737 if (CandI.findRegisterDefOperandIdx(AMDGPU::SCC, &RI, false, false) !=
8738 -1) {
8739 if (CandI.isCopy() && CandI.getOperand(0).getReg() == AMDGPU::SCC) {
8740 BuildMI(MBB, MII, DL, get(AMDGPU::COPY), NewCondReg)
8741 .addReg(CandI.getOperand(1).getReg());
8742 CopyFound = true;
8743 }
8744 break;
8745 }
8746 }
8747 if (!CopyFound) {
8748 // SCC def is not a copy
8749 // Insert a trivial select instead of creating a copy, because a copy from
8750 // SCC would semantically mean just copying a single bit, but we may need
8751 // the result to be a vector condition mask that needs preserving.
8752 unsigned Opcode =
8753 ST.isWave64() ? AMDGPU::S_CSELECT_B64 : AMDGPU::S_CSELECT_B32;
8754 auto NewSelect =
8755 BuildMI(MBB, MII, DL, get(Opcode), NewCondReg).addImm(-1).addImm(0);
8756 NewSelect->getOperand(3).setIsUndef(Cond.isUndef());
8757 }
8758 }
8759
8760 Register NewDestReg = MRI.createVirtualRegister(
8761 RI.getEquivalentVGPRClass(MRI.getRegClass(Dest.getReg())));
8762 MachineInstr *NewInst;
8763 if (Inst.getOpcode() == AMDGPU::S_CSELECT_B32) {
8764 NewInst = BuildMI(MBB, MII, DL, get(AMDGPU::V_CNDMASK_B32_e64), NewDestReg)
8765 .addImm(0)
8766 .add(Src1) // False
8767 .addImm(0)
8768 .add(Src0) // True
8769 .addReg(NewCondReg);
8770 } else {
8771 NewInst =
8772 BuildMI(MBB, MII, DL, get(AMDGPU::V_CNDMASK_B64_PSEUDO), NewDestReg)
8773 .add(Src1) // False
8774 .add(Src0) // True
8775 .addReg(NewCondReg);
8776 }
8777 MRI.replaceRegWith(Dest.getReg(), NewDestReg);
8778 legalizeOperands(*NewInst, MDT);
8779 addUsersToMoveToVALUWorklist(NewDestReg, MRI, Worklist);
8780}
8781
8782void SIInstrInfo::lowerScalarAbs(SIInstrWorklist &Worklist,
8783 MachineInstr &Inst) const {
8784 MachineBasicBlock &MBB = *Inst.getParent();
8785 MachineRegisterInfo &MRI = MBB.getParent()->getRegInfo();
8786 MachineBasicBlock::iterator MII = Inst;
8787 const DebugLoc &DL = Inst.getDebugLoc();
8788
8789 MachineOperand &Dest = Inst.getOperand(0);
8790 MachineOperand &Src = Inst.getOperand(1);
8791 Register TmpReg = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
8792 Register ResultReg = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
8793
8794 unsigned SubOp = ST.hasAddNoCarryInsts() ? AMDGPU::V_SUB_U32_e32
8795 : AMDGPU::V_SUB_CO_U32_e32;
8796
8797 BuildMI(MBB, MII, DL, get(SubOp), TmpReg)
8798 .addImm(0)
8799 .addReg(Src.getReg());
8800
8801 BuildMI(MBB, MII, DL, get(AMDGPU::V_MAX_I32_e64), ResultReg)
8802 .addReg(Src.getReg())
8803 .addReg(TmpReg);
8804
8805 MRI.replaceRegWith(Dest.getReg(), ResultReg);
8806 addUsersToMoveToVALUWorklist(ResultReg, MRI, Worklist);
8807}
8808
8809void SIInstrInfo::lowerScalarAbsDiff(SIInstrWorklist &Worklist,
8810 MachineInstr &Inst) const {
8811 MachineBasicBlock &MBB = *Inst.getParent();
8812 MachineRegisterInfo &MRI = MBB.getParent()->getRegInfo();
8813 MachineBasicBlock::iterator MII = Inst;
8814 const DebugLoc &DL = Inst.getDebugLoc();
8815
8816 MachineOperand &Dest = Inst.getOperand(0);
8817 MachineOperand &Src1 = Inst.getOperand(1);
8818 MachineOperand &Src2 = Inst.getOperand(2);
8819 Register SubResultReg = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
8820 Register TmpReg = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
8821 Register ResultReg = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
8822
8823 unsigned SubOp = ST.hasAddNoCarryInsts() ? AMDGPU::V_SUB_U32_e32
8824 : AMDGPU::V_SUB_CO_U32_e32;
8825
8826 BuildMI(MBB, MII, DL, get(SubOp), SubResultReg)
8827 .addReg(Src1.getReg())
8828 .addReg(Src2.getReg());
8829
8830 BuildMI(MBB, MII, DL, get(SubOp), TmpReg).addImm(0).addReg(SubResultReg);
8831
8832 BuildMI(MBB, MII, DL, get(AMDGPU::V_MAX_I32_e64), ResultReg)
8833 .addReg(SubResultReg)
8834 .addReg(TmpReg);
8835
8836 MRI.replaceRegWith(Dest.getReg(), ResultReg);
8837 addUsersToMoveToVALUWorklist(ResultReg, MRI, Worklist);
8838}
8839
8840void SIInstrInfo::lowerScalarXnor(SIInstrWorklist &Worklist,
8841 MachineInstr &Inst) const {
8842 MachineBasicBlock &MBB = *Inst.getParent();
8843 MachineRegisterInfo &MRI = MBB.getParent()->getRegInfo();
8844 MachineBasicBlock::iterator MII = Inst;
8845 const DebugLoc &DL = Inst.getDebugLoc();
8846
8847 MachineOperand &Dest = Inst.getOperand(0);
8848 MachineOperand &Src0 = Inst.getOperand(1);
8849 MachineOperand &Src1 = Inst.getOperand(2);
8850
8851 if (ST.hasDLInsts()) {
8852 Register NewDest = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
8853 legalizeGenericOperand(MBB, MII, &AMDGPU::VGPR_32RegClass, Src0, MRI, DL);
8854 legalizeGenericOperand(MBB, MII, &AMDGPU::VGPR_32RegClass, Src1, MRI, DL);
8855
8856 BuildMI(MBB, MII, DL, get(AMDGPU::V_XNOR_B32_e64), NewDest)
8857 .add(Src0)
8858 .add(Src1);
8859
8860 MRI.replaceRegWith(Dest.getReg(), NewDest);
8861 addUsersToMoveToVALUWorklist(NewDest, MRI, Worklist);
8862 } else {
8863 // Using the identity !(x ^ y) == (!x ^ y) == (x ^ !y), we can
8864 // invert either source and then perform the XOR. If either source is a
8865 // scalar register, then we can leave the inversion on the scalar unit to
8866 // achieve a better distribution of scalar and vector instructions.
8867 bool Src0IsSGPR = Src0.isReg() &&
8868 RI.isSGPRClass(MRI.getRegClass(Src0.getReg()));
8869 bool Src1IsSGPR = Src1.isReg() &&
8870 RI.isSGPRClass(MRI.getRegClass(Src1.getReg()));
8871 MachineInstr *Xor;
8872 Register Temp = MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass);
8873 Register NewDest = MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass);
8874
8875 // Build a pair of scalar instructions and add them to the work list.
8876 // The next iteration over the work list will lower these to the vector
8877 // unit as necessary.
8878 if (Src0IsSGPR) {
8879 BuildMI(MBB, MII, DL, get(AMDGPU::S_NOT_B32), Temp).add(Src0);
8880 Xor = BuildMI(MBB, MII, DL, get(AMDGPU::S_XOR_B32), NewDest)
8881 .addReg(Temp)
8882 .add(Src1);
8883 } else if (Src1IsSGPR) {
8884 BuildMI(MBB, MII, DL, get(AMDGPU::S_NOT_B32), Temp).add(Src1);
8885 Xor = BuildMI(MBB, MII, DL, get(AMDGPU::S_XOR_B32), NewDest)
8886 .add(Src0)
8887 .addReg(Temp);
8888 } else {
8889 Xor = BuildMI(MBB, MII, DL, get(AMDGPU::S_XOR_B32), Temp)
8890 .add(Src0)
8891 .add(Src1);
8892 MachineInstr *Not =
8893 BuildMI(MBB, MII, DL, get(AMDGPU::S_NOT_B32), NewDest).addReg(Temp);
8894 Worklist.insert(Not);
8895 }
8896
8897 MRI.replaceRegWith(Dest.getReg(), NewDest);
8898
8899 Worklist.insert(Xor);
8900
8901 addUsersToMoveToVALUWorklist(NewDest, MRI, Worklist);
8902 }
8903}
8904
8905void SIInstrInfo::splitScalarNotBinop(SIInstrWorklist &Worklist,
8906 MachineInstr &Inst,
8907 unsigned Opcode) const {
8908 MachineBasicBlock &MBB = *Inst.getParent();
8909 MachineRegisterInfo &MRI = MBB.getParent()->getRegInfo();
8910 MachineBasicBlock::iterator MII = Inst;
8911 const DebugLoc &DL = Inst.getDebugLoc();
8912
8913 MachineOperand &Dest = Inst.getOperand(0);
8914 MachineOperand &Src0 = Inst.getOperand(1);
8915 MachineOperand &Src1 = Inst.getOperand(2);
8916
8917 Register NewDest = MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass);
8918 Register Interm = MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass);
8919
8920 MachineInstr &Op = *BuildMI(MBB, MII, DL, get(Opcode), Interm)
8921 .add(Src0)
8922 .add(Src1);
8923
8924 MachineInstr &Not = *BuildMI(MBB, MII, DL, get(AMDGPU::S_NOT_B32), NewDest)
8925 .addReg(Interm);
8926
8927 Worklist.insert(&Op);
8928 Worklist.insert(&Not);
8929
8930 MRI.replaceRegWith(Dest.getReg(), NewDest);
8931 addUsersToMoveToVALUWorklist(NewDest, MRI, Worklist);
8932}
8933
8934void SIInstrInfo::splitScalarBinOpN2(SIInstrWorklist &Worklist,
8935 MachineInstr &Inst,
8936 unsigned Opcode) const {
8937 MachineBasicBlock &MBB = *Inst.getParent();
8938 MachineRegisterInfo &MRI = MBB.getParent()->getRegInfo();
8939 MachineBasicBlock::iterator MII = Inst;
8940 const DebugLoc &DL = Inst.getDebugLoc();
8941
8942 MachineOperand &Dest = Inst.getOperand(0);
8943 MachineOperand &Src0 = Inst.getOperand(1);
8944 MachineOperand &Src1 = Inst.getOperand(2);
8945
8946 Register NewDest = MRI.createVirtualRegister(&AMDGPU::SReg_32_XM0RegClass);
8947 Register Interm = MRI.createVirtualRegister(&AMDGPU::SReg_32_XM0RegClass);
8948
8949 MachineInstr &Not = *BuildMI(MBB, MII, DL, get(AMDGPU::S_NOT_B32), Interm)
8950 .add(Src1);
8951
8952 MachineInstr &Op = *BuildMI(MBB, MII, DL, get(Opcode), NewDest)
8953 .add(Src0)
8954 .addReg(Interm);
8955
8956 Worklist.insert(&Not);
8957 Worklist.insert(&Op);
8958
8959 MRI.replaceRegWith(Dest.getReg(), NewDest);
8960 addUsersToMoveToVALUWorklist(NewDest, MRI, Worklist);
8961}
8962
8963void SIInstrInfo::splitScalar64BitUnaryOp(SIInstrWorklist &Worklist,
8964 MachineInstr &Inst, unsigned Opcode,
8965 bool Swap) const {
8966 MachineBasicBlock &MBB = *Inst.getParent();
8967 MachineRegisterInfo &MRI = MBB.getParent()->getRegInfo();
8968
8969 MachineOperand &Dest = Inst.getOperand(0);
8970 MachineOperand &Src0 = Inst.getOperand(1);
8971 const DebugLoc &DL = Inst.getDebugLoc();
8972
8973 MachineBasicBlock::iterator MII = Inst;
8974
8975 const MCInstrDesc &InstDesc = get(Opcode);
8976 const TargetRegisterClass *Src0RC = Src0.isReg() ?
8977 MRI.getRegClass(Src0.getReg()) :
8978 &AMDGPU::SGPR_32RegClass;
8979
8980 const TargetRegisterClass *Src0SubRC =
8981 RI.getSubRegisterClass(Src0RC, AMDGPU::sub0);
8982
8983 MachineOperand SrcReg0Sub0 = buildExtractSubRegOrImm(MII, MRI, Src0, Src0RC,
8984 AMDGPU::sub0, Src0SubRC);
8985
8986 const TargetRegisterClass *DestRC = MRI.getRegClass(Dest.getReg());
8987 const TargetRegisterClass *NewDestRC = RI.getEquivalentVGPRClass(DestRC);
8988 const TargetRegisterClass *NewDestSubRC =
8989 RI.getSubRegisterClass(NewDestRC, AMDGPU::sub0);
8990
8991 Register DestSub0 = MRI.createVirtualRegister(NewDestSubRC);
8992 MachineInstr &LoHalf = *BuildMI(MBB, MII, DL, InstDesc, DestSub0).add(SrcReg0Sub0);
8993
8994 MachineOperand SrcReg0Sub1 = buildExtractSubRegOrImm(MII, MRI, Src0, Src0RC,
8995 AMDGPU::sub1, Src0SubRC);
8996
8997 Register DestSub1 = MRI.createVirtualRegister(NewDestSubRC);
8998 MachineInstr &HiHalf = *BuildMI(MBB, MII, DL, InstDesc, DestSub1).add(SrcReg0Sub1);
8999
9000 if (Swap)
9001 std::swap(DestSub0, DestSub1);
9002
9003 Register FullDestReg = MRI.createVirtualRegister(NewDestRC);
9004 BuildMI(MBB, MII, DL, get(TargetOpcode::REG_SEQUENCE), FullDestReg)
9005 .addReg(DestSub0)
9006 .addImm(AMDGPU::sub0)
9007 .addReg(DestSub1)
9008 .addImm(AMDGPU::sub1);
9009
9010 MRI.replaceRegWith(Dest.getReg(), FullDestReg);
9011
9012 Worklist.insert(&LoHalf);
9013 Worklist.insert(&HiHalf);
9014
9015 // We don't need to legalizeOperands here because for a single operand, src0
9016 // will support any kind of input.
9017
9018 // Move all users of this moved value.
9019 addUsersToMoveToVALUWorklist(FullDestReg, MRI, Worklist);
9020}
9021
9022// There is not a vector equivalent of s_mul_u64. For this reason, we need to
9023// split the s_mul_u64 in 32-bit vector multiplications.
9024void SIInstrInfo::splitScalarSMulU64(SIInstrWorklist &Worklist,
9025 MachineInstr &Inst,
9026 MachineDominatorTree *MDT) const {
9027 MachineBasicBlock &MBB = *Inst.getParent();
9028 MachineRegisterInfo &MRI = MBB.getParent()->getRegInfo();
9029
9030 Register FullDestReg = MRI.createVirtualRegister(&AMDGPU::VReg_64RegClass);
9031 Register DestSub0 = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
9032 Register DestSub1 = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
9033
9034 MachineOperand &Dest = Inst.getOperand(0);
9035 MachineOperand &Src0 = Inst.getOperand(1);
9036 MachineOperand &Src1 = Inst.getOperand(2);
9037 const DebugLoc &DL = Inst.getDebugLoc();
9038 MachineBasicBlock::iterator MII = Inst;
9039
9040 const TargetRegisterClass *Src0RC = MRI.getRegClass(Src0.getReg());
9041 const TargetRegisterClass *Src1RC = MRI.getRegClass(Src1.getReg());
9042 const TargetRegisterClass *Src0SubRC =
9043 RI.getSubRegisterClass(Src0RC, AMDGPU::sub0);
9044 if (RI.isSGPRClass(Src0SubRC))
9045 Src0SubRC = RI.getEquivalentVGPRClass(Src0SubRC);
9046 const TargetRegisterClass *Src1SubRC =
9047 RI.getSubRegisterClass(Src1RC, AMDGPU::sub0);
9048 if (RI.isSGPRClass(Src1SubRC))
9049 Src1SubRC = RI.getEquivalentVGPRClass(Src1SubRC);
9050
9051 // First, we extract the low 32-bit and high 32-bit values from each of the
9052 // operands.
9053 MachineOperand Op0L =
9054 buildExtractSubRegOrImm(MII, MRI, Src0, Src0RC, AMDGPU::sub0, Src0SubRC);
9055 MachineOperand Op1L =
9056 buildExtractSubRegOrImm(MII, MRI, Src1, Src1RC, AMDGPU::sub0, Src1SubRC);
9057 MachineOperand Op0H =
9058 buildExtractSubRegOrImm(MII, MRI, Src0, Src0RC, AMDGPU::sub1, Src0SubRC);
9059 MachineOperand Op1H =
9060 buildExtractSubRegOrImm(MII, MRI, Src1, Src1RC, AMDGPU::sub1, Src1SubRC);
9061
9062 // The multilication is done as follows:
9063 //
9064 // Op1H Op1L
9065 // * Op0H Op0L
9066 // --------------------
9067 // Op1H*Op0L Op1L*Op0L
9068 // + Op1H*Op0H Op1L*Op0H
9069 // -----------------------------------------
9070 // (Op1H*Op0L + Op1L*Op0H + carry) Op1L*Op0L
9071 //
9072 // We drop Op1H*Op0H because the result of the multiplication is a 64-bit
9073 // value and that would overflow.
9074 // The low 32-bit value is Op1L*Op0L.
9075 // The high 32-bit value is Op1H*Op0L + Op1L*Op0H + carry (from Op1L*Op0L).
9076
9077 Register Op1L_Op0H_Reg = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
9078 MachineInstr *Op1L_Op0H =
9079 BuildMI(MBB, MII, DL, get(AMDGPU::V_MUL_LO_U32_e64), Op1L_Op0H_Reg)
9080 .add(Op1L)
9081 .add(Op0H);
9082
9083 Register Op1H_Op0L_Reg = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
9084 MachineInstr *Op1H_Op0L =
9085 BuildMI(MBB, MII, DL, get(AMDGPU::V_MUL_LO_U32_e64), Op1H_Op0L_Reg)
9086 .add(Op1H)
9087 .add(Op0L);
9088
9089 Register CarryReg = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
9090 MachineInstr *Carry =
9091 BuildMI(MBB, MII, DL, get(AMDGPU::V_MUL_HI_U32_e64), CarryReg)
9092 .add(Op1L)
9093 .add(Op0L);
9094
9095 MachineInstr *LoHalf =
9096 BuildMI(MBB, MII, DL, get(AMDGPU::V_MUL_LO_U32_e64), DestSub0)
9097 .add(Op1L)
9098 .add(Op0L);
9099
9100 Register AddReg = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
9101 MachineInstr *Add = BuildMI(MBB, MII, DL, get(AMDGPU::V_ADD_U32_e32), AddReg)
9102 .addReg(Op1L_Op0H_Reg)
9103 .addReg(Op1H_Op0L_Reg);
9104
9105 MachineInstr *HiHalf =
9106 BuildMI(MBB, MII, DL, get(AMDGPU::V_ADD_U32_e32), DestSub1)
9107 .addReg(AddReg)
9108 .addReg(CarryReg);
9109
9110 BuildMI(MBB, MII, DL, get(TargetOpcode::REG_SEQUENCE), FullDestReg)
9111 .addReg(DestSub0)
9112 .addImm(AMDGPU::sub0)
9113 .addReg(DestSub1)
9114 .addImm(AMDGPU::sub1);
9115
9116 MRI.replaceRegWith(Dest.getReg(), FullDestReg);
9117
9118 // Try to legalize the operands in case we need to swap the order to keep it
9119 // valid.
9120 legalizeOperands(*Op1L_Op0H, MDT);
9121 legalizeOperands(*Op1H_Op0L, MDT);
9122 legalizeOperands(*Carry, MDT);
9123 legalizeOperands(*LoHalf, MDT);
9124 legalizeOperands(*Add, MDT);
9125 legalizeOperands(*HiHalf, MDT);
9126
9127 // Move all users of this moved value.
9128 addUsersToMoveToVALUWorklist(FullDestReg, MRI, Worklist);
9129}
9130
9131// Lower S_MUL_U64_U32_PSEUDO/S_MUL_I64_I32_PSEUDO in two 32-bit vector
9132// multiplications.
9133void SIInstrInfo::splitScalarSMulPseudo(SIInstrWorklist &Worklist,
9134 MachineInstr &Inst,
9135 MachineDominatorTree *MDT) const {
9136 MachineBasicBlock &MBB = *Inst.getParent();
9137 MachineRegisterInfo &MRI = MBB.getParent()->getRegInfo();
9138
9139 Register FullDestReg = MRI.createVirtualRegister(&AMDGPU::VReg_64RegClass);
9140 Register DestSub0 = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
9141 Register DestSub1 = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
9142
9143 MachineOperand &Dest = Inst.getOperand(0);
9144 MachineOperand &Src0 = Inst.getOperand(1);
9145 MachineOperand &Src1 = Inst.getOperand(2);
9146 const DebugLoc &DL = Inst.getDebugLoc();
9147 MachineBasicBlock::iterator MII = Inst;
9148
9149 const TargetRegisterClass *Src0RC = MRI.getRegClass(Src0.getReg());
9150 const TargetRegisterClass *Src1RC = MRI.getRegClass(Src1.getReg());
9151 const TargetRegisterClass *Src0SubRC =
9152 RI.getSubRegisterClass(Src0RC, AMDGPU::sub0);
9153 if (RI.isSGPRClass(Src0SubRC))
9154 Src0SubRC = RI.getEquivalentVGPRClass(Src0SubRC);
9155 const TargetRegisterClass *Src1SubRC =
9156 RI.getSubRegisterClass(Src1RC, AMDGPU::sub0);
9157 if (RI.isSGPRClass(Src1SubRC))
9158 Src1SubRC = RI.getEquivalentVGPRClass(Src1SubRC);
9159
9160 // First, we extract the low 32-bit and high 32-bit values from each of the
9161 // operands.
9162 MachineOperand Op0L =
9163 buildExtractSubRegOrImm(MII, MRI, Src0, Src0RC, AMDGPU::sub0, Src0SubRC);
9164 MachineOperand Op1L =
9165 buildExtractSubRegOrImm(MII, MRI, Src1, Src1RC, AMDGPU::sub0, Src1SubRC);
9166
9167 unsigned Opc = Inst.getOpcode();
9168 unsigned NewOpc = Opc == AMDGPU::S_MUL_U64_U32_PSEUDO
9169 ? AMDGPU::V_MUL_HI_U32_e64
9170 : AMDGPU::V_MUL_HI_I32_e64;
9171 MachineInstr *HiHalf =
9172 BuildMI(MBB, MII, DL, get(NewOpc), DestSub1).add(Op1L).add(Op0L);
9173
9174 MachineInstr *LoHalf =
9175 BuildMI(MBB, MII, DL, get(AMDGPU::V_MUL_LO_U32_e64), DestSub0)
9176 .add(Op1L)
9177 .add(Op0L);
9178
9179 BuildMI(MBB, MII, DL, get(TargetOpcode::REG_SEQUENCE), FullDestReg)
9180 .addReg(DestSub0)
9181 .addImm(AMDGPU::sub0)
9182 .addReg(DestSub1)
9183 .addImm(AMDGPU::sub1);
9184
9185 MRI.replaceRegWith(Dest.getReg(), FullDestReg);
9186
9187 // Try to legalize the operands in case we need to swap the order to keep it
9188 // valid.
9189 legalizeOperands(*HiHalf, MDT);
9190 legalizeOperands(*LoHalf, MDT);
9191
9192 // Move all users of this moved value.
9193 addUsersToMoveToVALUWorklist(FullDestReg, MRI, Worklist);
9194}
9195
9196void SIInstrInfo::splitScalar64BitBinaryOp(SIInstrWorklist &Worklist,
9197 MachineInstr &Inst, unsigned Opcode,
9198 MachineDominatorTree *MDT) const {
9199 MachineBasicBlock &MBB = *Inst.getParent();
9200 MachineRegisterInfo &MRI = MBB.getParent()->getRegInfo();
9201
9202 MachineOperand &Dest = Inst.getOperand(0);
9203 MachineOperand &Src0 = Inst.getOperand(1);
9204 MachineOperand &Src1 = Inst.getOperand(2);
9205 const DebugLoc &DL = Inst.getDebugLoc();
9206
9207 MachineBasicBlock::iterator MII = Inst;
9208
9209 const MCInstrDesc &InstDesc = get(Opcode);
9210 const TargetRegisterClass *Src0RC = Src0.isReg() ?
9211 MRI.getRegClass(Src0.getReg()) :
9212 &AMDGPU::SGPR_32RegClass;
9213
9214 const TargetRegisterClass *Src0SubRC =
9215 RI.getSubRegisterClass(Src0RC, AMDGPU::sub0);
9216 const TargetRegisterClass *Src1RC = Src1.isReg() ?
9217 MRI.getRegClass(Src1.getReg()) :
9218 &AMDGPU::SGPR_32RegClass;
9219
9220 const TargetRegisterClass *Src1SubRC =
9221 RI.getSubRegisterClass(Src1RC, AMDGPU::sub0);
9222
9223 MachineOperand SrcReg0Sub0 = buildExtractSubRegOrImm(MII, MRI, Src0, Src0RC,
9224 AMDGPU::sub0, Src0SubRC);
9225 MachineOperand SrcReg1Sub0 = buildExtractSubRegOrImm(MII, MRI, Src1, Src1RC,
9226 AMDGPU::sub0, Src1SubRC);
9227 MachineOperand SrcReg0Sub1 = buildExtractSubRegOrImm(MII, MRI, Src0, Src0RC,
9228 AMDGPU::sub1, Src0SubRC);
9229 MachineOperand SrcReg1Sub1 = buildExtractSubRegOrImm(MII, MRI, Src1, Src1RC,
9230 AMDGPU::sub1, Src1SubRC);
9231
9232 const TargetRegisterClass *DestRC = MRI.getRegClass(Dest.getReg());
9233 const TargetRegisterClass *NewDestRC = RI.getEquivalentVGPRClass(DestRC);
9234 const TargetRegisterClass *NewDestSubRC =
9235 RI.getSubRegisterClass(NewDestRC, AMDGPU::sub0);
9236
9237 Register DestSub0 = MRI.createVirtualRegister(NewDestSubRC);
9238 MachineInstr &LoHalf = *BuildMI(MBB, MII, DL, InstDesc, DestSub0)
9239 .add(SrcReg0Sub0)
9240 .add(SrcReg1Sub0);
9241
9242 Register DestSub1 = MRI.createVirtualRegister(NewDestSubRC);
9243 MachineInstr &HiHalf = *BuildMI(MBB, MII, DL, InstDesc, DestSub1)
9244 .add(SrcReg0Sub1)
9245 .add(SrcReg1Sub1);
9246
9247 Register FullDestReg = MRI.createVirtualRegister(NewDestRC);
9248 BuildMI(MBB, MII, DL, get(TargetOpcode::REG_SEQUENCE), FullDestReg)
9249 .addReg(DestSub0)
9250 .addImm(AMDGPU::sub0)
9251 .addReg(DestSub1)
9252 .addImm(AMDGPU::sub1);
9253
9254 MRI.replaceRegWith(Dest.getReg(), FullDestReg);
9255
9256 Worklist.insert(&LoHalf);
9257 Worklist.insert(&HiHalf);
9258
9259 // Move all users of this moved value.
9260 addUsersToMoveToVALUWorklist(FullDestReg, MRI, Worklist);
9261}
9262
9263void SIInstrInfo::splitScalar64BitXnor(SIInstrWorklist &Worklist,
9264 MachineInstr &Inst,
9265 MachineDominatorTree *MDT) const {
9266 MachineBasicBlock &MBB = *Inst.getParent();
9267 MachineRegisterInfo &MRI = MBB.getParent()->getRegInfo();
9268
9269 MachineOperand &Dest = Inst.getOperand(0);
9270 MachineOperand &Src0 = Inst.getOperand(1);
9271 MachineOperand &Src1 = Inst.getOperand(2);
9272 const DebugLoc &DL = Inst.getDebugLoc();
9273
9274 MachineBasicBlock::iterator MII = Inst;
9275
9276 const TargetRegisterClass *DestRC = MRI.getRegClass(Dest.getReg());
9277
9278 Register Interm = MRI.createVirtualRegister(&AMDGPU::SReg_64RegClass);
9279
9280 MachineOperand* Op0;
9281 MachineOperand* Op1;
9282
9283 if (Src0.isReg() && RI.isSGPRReg(MRI, Src0.getReg())) {
9284 Op0 = &Src0;
9285 Op1 = &Src1;
9286 } else {
9287 Op0 = &Src1;
9288 Op1 = &Src0;
9289 }
9290
9291 BuildMI(MBB, MII, DL, get(AMDGPU::S_NOT_B64), Interm)
9292 .add(*Op0);
9293
9294 Register NewDest = MRI.createVirtualRegister(DestRC);
9295
9296 MachineInstr &Xor = *BuildMI(MBB, MII, DL, get(AMDGPU::S_XOR_B64), NewDest)
9297 .addReg(Interm)
9298 .add(*Op1);
9299
9300 MRI.replaceRegWith(Dest.getReg(), NewDest);
9301
9302 Worklist.insert(&Xor);
9303}
9304
9305void SIInstrInfo::splitScalar64BitBCNT(SIInstrWorklist &Worklist,
9306 MachineInstr &Inst) const {
9307 MachineBasicBlock &MBB = *Inst.getParent();
9308 MachineRegisterInfo &MRI = MBB.getParent()->getRegInfo();
9309
9310 MachineBasicBlock::iterator MII = Inst;
9311 const DebugLoc &DL = Inst.getDebugLoc();
9312
9313 MachineOperand &Dest = Inst.getOperand(0);
9314 MachineOperand &Src = Inst.getOperand(1);
9315
9316 const MCInstrDesc &InstDesc = get(AMDGPU::V_BCNT_U32_B32_e64);
9317 const TargetRegisterClass *SrcRC = Src.isReg() ?
9318 MRI.getRegClass(Src.getReg()) :
9319 &AMDGPU::SGPR_32RegClass;
9320
9321 Register MidReg = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
9322 Register ResultReg = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
9323
9324 const TargetRegisterClass *SrcSubRC =
9325 RI.getSubRegisterClass(SrcRC, AMDGPU::sub0);
9326
9327 MachineOperand SrcRegSub0 = buildExtractSubRegOrImm(MII, MRI, Src, SrcRC,
9328 AMDGPU::sub0, SrcSubRC);
9329 MachineOperand SrcRegSub1 = buildExtractSubRegOrImm(MII, MRI, Src, SrcRC,
9330 AMDGPU::sub1, SrcSubRC);
9331
9332 BuildMI(MBB, MII, DL, InstDesc, MidReg).add(SrcRegSub0).addImm(0);
9333
9334 BuildMI(MBB, MII, DL, InstDesc, ResultReg).add(SrcRegSub1).addReg(MidReg);
9335
9336 MRI.replaceRegWith(Dest.getReg(), ResultReg);
9337
9338 // We don't need to legalize operands here. src0 for either instruction can be
9339 // an SGPR, and the second input is unused or determined here.
9340 addUsersToMoveToVALUWorklist(ResultReg, MRI, Worklist);
9341}
9342
9343void SIInstrInfo::splitScalar64BitBFE(SIInstrWorklist &Worklist,
9344 MachineInstr &Inst) const {
9345 MachineBasicBlock &MBB = *Inst.getParent();
9346 MachineRegisterInfo &MRI = MBB.getParent()->getRegInfo();
9347 MachineBasicBlock::iterator MII = Inst;
9348 const DebugLoc &DL = Inst.getDebugLoc();
9349
9350 MachineOperand &Dest = Inst.getOperand(0);
9351 uint32_t Imm = Inst.getOperand(2).getImm();
9352 uint32_t Offset = Imm & 0x3f; // Extract bits [5:0].
9353 uint32_t BitWidth = (Imm & 0x7f0000) >> 16; // Extract bits [22:16].
9354
9355 (void) Offset;
9356
9357 // Only sext_inreg cases handled.
9358 assert(Inst.getOpcode() == AMDGPU::S_BFE_I64 && BitWidth <= 32 &&
9359 Offset == 0 && "Not implemented");
9360
9361 if (BitWidth < 32) {
9362 Register MidRegLo = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
9363 Register MidRegHi = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
9364 Register ResultReg = MRI.createVirtualRegister(&AMDGPU::VReg_64RegClass);
9365
9366 BuildMI(MBB, MII, DL, get(AMDGPU::V_BFE_I32_e64), MidRegLo)
9367 .addReg(Inst.getOperand(1).getReg(), {}, AMDGPU::sub0)
9368 .addImm(0)
9369 .addImm(BitWidth);
9370
9371 BuildMI(MBB, MII, DL, get(AMDGPU::V_ASHRREV_I32_e32), MidRegHi)
9372 .addImm(31)
9373 .addReg(MidRegLo);
9374
9375 BuildMI(MBB, MII, DL, get(TargetOpcode::REG_SEQUENCE), ResultReg)
9376 .addReg(MidRegLo)
9377 .addImm(AMDGPU::sub0)
9378 .addReg(MidRegHi)
9379 .addImm(AMDGPU::sub1);
9380
9381 MRI.replaceRegWith(Dest.getReg(), ResultReg);
9382 addUsersToMoveToVALUWorklist(ResultReg, MRI, Worklist);
9383 return;
9384 }
9385
9386 MachineOperand &Src = Inst.getOperand(1);
9387 Register TmpReg = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
9388 Register ResultReg = MRI.createVirtualRegister(&AMDGPU::VReg_64RegClass);
9389
9390 BuildMI(MBB, MII, DL, get(AMDGPU::V_ASHRREV_I32_e64), TmpReg)
9391 .addImm(31)
9392 .addReg(Src.getReg(), {}, AMDGPU::sub0);
9393
9394 BuildMI(MBB, MII, DL, get(TargetOpcode::REG_SEQUENCE), ResultReg)
9395 .addReg(Src.getReg(), {}, AMDGPU::sub0)
9396 .addImm(AMDGPU::sub0)
9397 .addReg(TmpReg)
9398 .addImm(AMDGPU::sub1);
9399
9400 MRI.replaceRegWith(Dest.getReg(), ResultReg);
9401 addUsersToMoveToVALUWorklist(ResultReg, MRI, Worklist);
9402}
9403
9404void SIInstrInfo::splitScalar64BitCountOp(SIInstrWorklist &Worklist,
9405 MachineInstr &Inst, unsigned Opcode,
9406 MachineDominatorTree *MDT) const {
9407 // (S_FLBIT_I32_B64 hi:lo) ->
9408 // -> (umin (V_FFBH_U32_e32 hi), (uaddsat (V_FFBH_U32_e32 lo), 32))
9409 // (S_FF1_I32_B64 hi:lo) ->
9410 // ->(umin (uaddsat (V_FFBL_B32_e32 hi), 32) (V_FFBL_B32_e32 lo))
9411
9412 MachineBasicBlock &MBB = *Inst.getParent();
9413 MachineRegisterInfo &MRI = MBB.getParent()->getRegInfo();
9414 MachineBasicBlock::iterator MII = Inst;
9415 const DebugLoc &DL = Inst.getDebugLoc();
9416
9417 MachineOperand &Dest = Inst.getOperand(0);
9418 MachineOperand &Src = Inst.getOperand(1);
9419
9420 const MCInstrDesc &InstDesc = get(Opcode);
9421
9422 bool IsCtlz = Opcode == AMDGPU::V_FFBH_U32_e32;
9423 unsigned OpcodeAdd = ST.hasAddNoCarryInsts() ? AMDGPU::V_ADD_U32_e64
9424 : AMDGPU::V_ADD_CO_U32_e32;
9425
9426 const TargetRegisterClass *SrcRC =
9427 Src.isReg() ? MRI.getRegClass(Src.getReg()) : &AMDGPU::SGPR_32RegClass;
9428 const TargetRegisterClass *SrcSubRC =
9429 RI.getSubRegisterClass(SrcRC, AMDGPU::sub0);
9430
9431 MachineOperand SrcRegSub0 =
9432 buildExtractSubRegOrImm(MII, MRI, Src, SrcRC, AMDGPU::sub0, SrcSubRC);
9433 MachineOperand SrcRegSub1 =
9434 buildExtractSubRegOrImm(MII, MRI, Src, SrcRC, AMDGPU::sub1, SrcSubRC);
9435
9436 Register MidReg1 = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
9437 Register MidReg2 = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
9438 Register MidReg3 = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
9439 Register MidReg4 = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
9440
9441 BuildMI(MBB, MII, DL, InstDesc, MidReg1).add(SrcRegSub0);
9442
9443 BuildMI(MBB, MII, DL, InstDesc, MidReg2).add(SrcRegSub1);
9444
9445 BuildMI(MBB, MII, DL, get(OpcodeAdd), MidReg3)
9446 .addReg(IsCtlz ? MidReg1 : MidReg2)
9447 .addImm(32)
9448 .addImm(1); // enable clamp
9449
9450 BuildMI(MBB, MII, DL, get(AMDGPU::V_MIN_U32_e64), MidReg4)
9451 .addReg(MidReg3)
9452 .addReg(IsCtlz ? MidReg2 : MidReg1);
9453
9454 MRI.replaceRegWith(Dest.getReg(), MidReg4);
9455
9456 addUsersToMoveToVALUWorklist(MidReg4, MRI, Worklist);
9457}
9458
9459void SIInstrInfo::addUsersToMoveToVALUWorklist(
9460 Register DstReg, MachineRegisterInfo &MRI,
9461 SIInstrWorklist &Worklist) const {
9462 for (MachineOperand &MO : make_early_inc_range(MRI.use_operands(DstReg))) {
9463 MachineInstr &UseMI = *MO.getParent();
9464
9465 unsigned OpNo = 0;
9466
9467 switch (UseMI.getOpcode()) {
9468 case AMDGPU::COPY:
9469 case AMDGPU::WQM:
9470 case AMDGPU::SOFT_WQM:
9471 case AMDGPU::STRICT_WWM:
9472 case AMDGPU::STRICT_WQM:
9473 case AMDGPU::REG_SEQUENCE:
9474 case AMDGPU::PHI:
9475 case AMDGPU::INSERT_SUBREG:
9476 break;
9477 default:
9478 OpNo = MO.getOperandNo();
9479 break;
9480 }
9481
9482 const TargetRegisterClass *OpRC = getOpRegClass(UseMI, OpNo);
9483 MRI.constrainRegClass(DstReg, OpRC);
9484
9485 if (!RI.hasVectorRegisters(OpRC))
9486 Worklist.insert(&UseMI);
9487 else
9488 // Legalization could change user list.
9489 legalizeOperandsVALUt16(UseMI, OpNo, MRI);
9490 }
9491}
9492
9493void SIInstrInfo::movePackToVALU(SIInstrWorklist &Worklist,
9495 MachineInstr &Inst) const {
9496 Register ResultReg = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
9497 MachineBasicBlock *MBB = Inst.getParent();
9498 MachineOperand &Src0 = Inst.getOperand(1);
9499 MachineOperand &Src1 = Inst.getOperand(2);
9500 const DebugLoc &DL = Inst.getDebugLoc();
9501
9502 if (ST.useRealTrue16Insts()) {
9503 Register SrcReg0, SrcReg1;
9504 if (!Src0.isReg() || !RI.isVGPR(MRI, Src0.getReg())) {
9505 SrcReg0 = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
9506 BuildMI(*MBB, Inst, DL,
9507 get(Src0.isImm() ? AMDGPU::V_MOV_B32_e32 : AMDGPU::COPY), SrcReg0)
9508 .add(Src0);
9509 } else {
9510 SrcReg0 = Src0.getReg();
9511 }
9512
9513 if (!Src1.isReg() || !RI.isVGPR(MRI, Src1.getReg())) {
9514 SrcReg1 = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
9515 BuildMI(*MBB, Inst, DL,
9516 get(Src1.isImm() ? AMDGPU::V_MOV_B32_e32 : AMDGPU::COPY), SrcReg1)
9517 .add(Src1);
9518 } else {
9519 SrcReg1 = Src1.getReg();
9520 }
9521
9522 bool isSrc0Reg16 = MRI.constrainRegClass(SrcReg0, &AMDGPU::VGPR_16RegClass);
9523 bool isSrc1Reg16 = MRI.constrainRegClass(SrcReg1, &AMDGPU::VGPR_16RegClass);
9524
9525 auto NewMI = BuildMI(*MBB, Inst, DL, get(AMDGPU::REG_SEQUENCE), ResultReg);
9526 switch (Inst.getOpcode()) {
9527 case AMDGPU::S_PACK_LL_B32_B16:
9528 NewMI
9529 .addReg(SrcReg0, {},
9530 isSrc0Reg16 ? AMDGPU::NoSubRegister : AMDGPU::lo16)
9531 .addImm(AMDGPU::lo16)
9532 .addReg(SrcReg1, {},
9533 isSrc1Reg16 ? AMDGPU::NoSubRegister : AMDGPU::lo16)
9534 .addImm(AMDGPU::hi16);
9535 break;
9536 case AMDGPU::S_PACK_LH_B32_B16:
9537 NewMI
9538 .addReg(SrcReg0, {},
9539 isSrc0Reg16 ? AMDGPU::NoSubRegister : AMDGPU::lo16)
9540 .addImm(AMDGPU::lo16)
9541 .addReg(SrcReg1, {}, AMDGPU::hi16)
9542 .addImm(AMDGPU::hi16);
9543 break;
9544 case AMDGPU::S_PACK_HL_B32_B16:
9545 NewMI.addReg(SrcReg0, {}, AMDGPU::hi16)
9546 .addImm(AMDGPU::lo16)
9547 .addReg(SrcReg1, {},
9548 isSrc1Reg16 ? AMDGPU::NoSubRegister : AMDGPU::lo16)
9549 .addImm(AMDGPU::hi16);
9550 break;
9551 case AMDGPU::S_PACK_HH_B32_B16:
9552 NewMI.addReg(SrcReg0, {}, AMDGPU::hi16)
9553 .addImm(AMDGPU::lo16)
9554 .addReg(SrcReg1, {}, AMDGPU::hi16)
9555 .addImm(AMDGPU::hi16);
9556 break;
9557 default:
9558 llvm_unreachable("unhandled s_pack_* instruction");
9559 }
9560
9561 MachineOperand &Dest = Inst.getOperand(0);
9562 MRI.replaceRegWith(Dest.getReg(), ResultReg);
9563 addUsersToMoveToVALUWorklist(ResultReg, MRI, Worklist);
9564 return;
9565 }
9566
9567 switch (Inst.getOpcode()) {
9568 case AMDGPU::S_PACK_LL_B32_B16: {
9569 Register ImmReg = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
9570 Register TmpReg = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
9571
9572 // FIXME: Can do a lot better if we know the high bits of src0 or src1 are
9573 // 0.
9574 BuildMI(*MBB, Inst, DL, get(AMDGPU::V_MOV_B32_e32), ImmReg)
9575 .addImm(0xffff);
9576
9577 BuildMI(*MBB, Inst, DL, get(AMDGPU::V_AND_B32_e64), TmpReg)
9578 .addReg(ImmReg, RegState::Kill)
9579 .add(Src0);
9580
9581 BuildMI(*MBB, Inst, DL, get(AMDGPU::V_LSHL_OR_B32_e64), ResultReg)
9582 .add(Src1)
9583 .addImm(16)
9584 .addReg(TmpReg, RegState::Kill);
9585 break;
9586 }
9587 case AMDGPU::S_PACK_LH_B32_B16: {
9588 Register ImmReg = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
9589 BuildMI(*MBB, Inst, DL, get(AMDGPU::V_MOV_B32_e32), ImmReg)
9590 .addImm(0xffff);
9591 BuildMI(*MBB, Inst, DL, get(AMDGPU::V_BFI_B32_e64), ResultReg)
9592 .addReg(ImmReg, RegState::Kill)
9593 .add(Src0)
9594 .add(Src1);
9595 break;
9596 }
9597 case AMDGPU::S_PACK_HL_B32_B16: {
9598 Register TmpReg = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
9599 BuildMI(*MBB, Inst, DL, get(AMDGPU::V_LSHRREV_B32_e64), TmpReg)
9600 .addImm(16)
9601 .add(Src0);
9602 BuildMI(*MBB, Inst, DL, get(AMDGPU::V_LSHL_OR_B32_e64), ResultReg)
9603 .add(Src1)
9604 .addImm(16)
9605 .addReg(TmpReg, RegState::Kill);
9606 break;
9607 }
9608 case AMDGPU::S_PACK_HH_B32_B16: {
9609 Register ImmReg = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
9610 Register TmpReg = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
9611 BuildMI(*MBB, Inst, DL, get(AMDGPU::V_LSHRREV_B32_e64), TmpReg)
9612 .addImm(16)
9613 .add(Src0);
9614 BuildMI(*MBB, Inst, DL, get(AMDGPU::V_MOV_B32_e32), ImmReg)
9615 .addImm(0xffff0000);
9616 BuildMI(*MBB, Inst, DL, get(AMDGPU::V_AND_OR_B32_e64), ResultReg)
9617 .add(Src1)
9618 .addReg(ImmReg, RegState::Kill)
9619 .addReg(TmpReg, RegState::Kill);
9620 break;
9621 }
9622 default:
9623 llvm_unreachable("unhandled s_pack_* instruction");
9624 }
9625
9626 MachineOperand &Dest = Inst.getOperand(0);
9627 MRI.replaceRegWith(Dest.getReg(), ResultReg);
9628 addUsersToMoveToVALUWorklist(ResultReg, MRI, Worklist);
9629}
9630
9631void SIInstrInfo::addSCCDefUsersToVALUWorklist(const MachineOperand &Op,
9632 MachineInstr &SCCDefInst,
9633 SIInstrWorklist &Worklist,
9634 Register NewCond) const {
9635
9636 // Ensure that def inst defines SCC, which is still live.
9637 assert(Op.isReg() && Op.getReg() == AMDGPU::SCC && Op.isDef() &&
9638 !Op.isDead() && Op.getParent() == &SCCDefInst);
9639 SmallVector<MachineInstr *, 4> CopyToDelete;
9640 // This assumes that all the users of SCC are in the same block
9641 // as the SCC def.
9642 for (MachineInstr &MI : // Skip the def inst itself.
9643 make_range(std::next(MachineBasicBlock::iterator(SCCDefInst)),
9644 SCCDefInst.getParent()->end())) {
9645 // Check if SCC is used first.
9646 int SCCIdx = MI.findRegisterUseOperandIdx(AMDGPU::SCC, &RI, false);
9647 if (SCCIdx != -1) {
9648 if (MI.isCopy()) {
9649 MachineRegisterInfo &MRI = MI.getMF()->getRegInfo();
9650 Register DestReg = MI.getOperand(0).getReg();
9651
9652 MRI.replaceRegWith(DestReg, NewCond);
9653 CopyToDelete.push_back(&MI);
9654 } else {
9655
9656 if (NewCond.isValid())
9657 MI.getOperand(SCCIdx).setReg(NewCond);
9658
9659 Worklist.insert(&MI);
9660 }
9661 }
9662 // Exit if we find another SCC def.
9663 if (MI.findRegisterDefOperandIdx(AMDGPU::SCC, &RI, false, false) != -1)
9664 break;
9665 }
9666 for (auto &Copy : CopyToDelete)
9667 Copy->eraseFromParent();
9668}
9669
9670// Instructions that use SCC may be converted to VALU instructions. When that
9671// happens, the SCC register is changed to VCC_LO. The instruction that defines
9672// SCC must be changed to an instruction that defines VCC. This function makes
9673// sure that the instruction that defines SCC is added to the moveToVALU
9674// worklist.
9675void SIInstrInfo::addSCCDefsToVALUWorklist(MachineInstr *SCCUseInst,
9676 SIInstrWorklist &Worklist) const {
9677 // Look for a preceding instruction that either defines VCC or SCC. If VCC
9678 // then there is nothing to do because the defining instruction has been
9679 // converted to a VALU already. If SCC then that instruction needs to be
9680 // converted to a VALU.
9681 for (MachineInstr &MI :
9682 make_range(std::next(MachineBasicBlock::reverse_iterator(SCCUseInst)),
9683 SCCUseInst->getParent()->rend())) {
9684 if (MI.modifiesRegister(AMDGPU::VCC, &RI))
9685 break;
9686 if (MI.definesRegister(AMDGPU::SCC, &RI)) {
9687 Worklist.insert(&MI);
9688 break;
9689 }
9690 }
9691}
9692
9693const TargetRegisterClass *SIInstrInfo::getDestEquivalentVGPRClass(
9694 const MachineInstr &Inst) const {
9695 const TargetRegisterClass *NewDstRC = getOpRegClass(Inst, 0);
9696
9697 switch (Inst.getOpcode()) {
9698 // For target instructions, getOpRegClass just returns the virtual register
9699 // class associated with the operand, so we need to find an equivalent VGPR
9700 // register class in order to move the instruction to the VALU.
9701 case AMDGPU::COPY:
9702 case AMDGPU::PHI:
9703 case AMDGPU::REG_SEQUENCE:
9704 case AMDGPU::INSERT_SUBREG:
9705 case AMDGPU::WQM:
9706 case AMDGPU::SOFT_WQM:
9707 case AMDGPU::STRICT_WWM:
9708 case AMDGPU::STRICT_WQM: {
9709 const TargetRegisterClass *SrcRC = getOpRegClass(Inst, 1);
9710 if (RI.isAGPRClass(SrcRC)) {
9711 if (RI.isAGPRClass(NewDstRC))
9712 return nullptr;
9713
9714 switch (Inst.getOpcode()) {
9715 case AMDGPU::PHI:
9716 case AMDGPU::REG_SEQUENCE:
9717 case AMDGPU::INSERT_SUBREG:
9718 NewDstRC = RI.getEquivalentAGPRClass(NewDstRC);
9719 break;
9720 default:
9721 NewDstRC = RI.getEquivalentVGPRClass(NewDstRC);
9722 }
9723
9724 if (!NewDstRC)
9725 return nullptr;
9726 } else {
9727 if (RI.isVGPRClass(NewDstRC) || NewDstRC == &AMDGPU::VReg_1RegClass)
9728 return nullptr;
9729
9730 NewDstRC = RI.getEquivalentVGPRClass(NewDstRC);
9731 if (!NewDstRC)
9732 return nullptr;
9733 }
9734
9735 return NewDstRC;
9736 }
9737 default:
9738 return NewDstRC;
9739 }
9740}
9741
9742// Find the one SGPR operand we are allowed to use.
9743Register SIInstrInfo::findUsedSGPR(const MachineInstr &MI,
9744 int OpIndices[3]) const {
9745 const MCInstrDesc &Desc = MI.getDesc();
9746
9747 // Find the one SGPR operand we are allowed to use.
9748 //
9749 // First we need to consider the instruction's operand requirements before
9750 // legalizing. Some operands are required to be SGPRs, such as implicit uses
9751 // of VCC, but we are still bound by the constant bus requirement to only use
9752 // one.
9753 //
9754 // If the operand's class is an SGPR, we can never move it.
9755
9756 Register SGPRReg = findImplicitSGPRRead(MI);
9757 if (SGPRReg)
9758 return SGPRReg;
9759
9760 Register UsedSGPRs[3] = {Register()};
9761 const MachineRegisterInfo &MRI = MI.getMF()->getRegInfo();
9762
9763 for (unsigned i = 0; i < 3; ++i) {
9764 int Idx = OpIndices[i];
9765 if (Idx == -1)
9766 break;
9767
9768 const MachineOperand &MO = MI.getOperand(Idx);
9769 if (!MO.isReg())
9770 continue;
9771
9772 // Is this operand statically required to be an SGPR based on the operand
9773 // constraints?
9774 const TargetRegisterClass *OpRC =
9775 RI.getRegClass(getOpRegClassID(Desc.operands()[Idx]));
9776 bool IsRequiredSGPR = RI.isSGPRClass(OpRC);
9777 if (IsRequiredSGPR)
9778 return MO.getReg();
9779
9780 // If this could be a VGPR or an SGPR, Check the dynamic register class.
9781 Register Reg = MO.getReg();
9782 const TargetRegisterClass *RegRC = MRI.getRegClass(Reg);
9783 if (RI.isSGPRClass(RegRC))
9784 UsedSGPRs[i] = Reg;
9785 }
9786
9787 // We don't have a required SGPR operand, so we have a bit more freedom in
9788 // selecting operands to move.
9789
9790 // Try to select the most used SGPR. If an SGPR is equal to one of the
9791 // others, we choose that.
9792 //
9793 // e.g.
9794 // V_FMA_F32 v0, s0, s0, s0 -> No moves
9795 // V_FMA_F32 v0, s0, s1, s0 -> Move s1
9796
9797 // TODO: If some of the operands are 64-bit SGPRs and some 32, we should
9798 // prefer those.
9799
9800 if (UsedSGPRs[0]) {
9801 if (UsedSGPRs[0] == UsedSGPRs[1] || UsedSGPRs[0] == UsedSGPRs[2])
9802 SGPRReg = UsedSGPRs[0];
9803 }
9804
9805 if (!SGPRReg && UsedSGPRs[1]) {
9806 if (UsedSGPRs[1] == UsedSGPRs[2])
9807 SGPRReg = UsedSGPRs[1];
9808 }
9809
9810 return SGPRReg;
9811}
9812
9814 AMDGPU::OpName OperandName) const {
9815 if (OperandName == AMDGPU::OpName::NUM_OPERAND_NAMES)
9816 return nullptr;
9817
9818 int Idx = AMDGPU::getNamedOperandIdx(MI.getOpcode(), OperandName);
9819 if (Idx == -1)
9820 return nullptr;
9821
9822 return &MI.getOperand(Idx);
9823}
9824
9826 if (ST.getGeneration() >= AMDGPUSubtarget::GFX10) {
9827 int64_t Format = ST.getGeneration() >= AMDGPUSubtarget::GFX11
9830 return (Format << 44) |
9831 (1ULL << 56) | // RESOURCE_LEVEL = 1
9832 (3ULL << 60); // OOB_SELECT = 3
9833 }
9834
9835 uint64_t RsrcDataFormat = AMDGPU::RSRC_DATA_FORMAT;
9836 if (ST.isAmdHsaOS()) {
9837 // Set ATC = 1. GFX9 doesn't have this bit.
9838 if (ST.getGeneration() <= AMDGPUSubtarget::VOLCANIC_ISLANDS)
9839 RsrcDataFormat |= (1ULL << 56);
9840
9841 // Set MTYPE = 2 (MTYPE_UC = uncached). GFX9 doesn't have this.
9842 // BTW, it disables TC L2 and therefore decreases performance.
9843 if (ST.getGeneration() == AMDGPUSubtarget::VOLCANIC_ISLANDS)
9844 RsrcDataFormat |= (2ULL << 59);
9845 }
9846
9847 return RsrcDataFormat;
9848}
9849
9853 0xffffffff; // Size;
9854
9855 // GFX9 doesn't have ELEMENT_SIZE.
9856 if (ST.getGeneration() <= AMDGPUSubtarget::VOLCANIC_ISLANDS) {
9857 uint64_t EltSizeValue = Log2_32(ST.getMaxPrivateElementSize(true)) - 1;
9858 Rsrc23 |= EltSizeValue << AMDGPU::RSRC_ELEMENT_SIZE_SHIFT;
9859 }
9860
9861 // IndexStride = 64 / 32.
9862 uint64_t IndexStride = ST.isWave64() ? 3 : 2;
9863 Rsrc23 |= IndexStride << AMDGPU::RSRC_INDEX_STRIDE_SHIFT;
9864
9865 // If TID_ENABLE is set, DATA_FORMAT specifies stride bits [14:17].
9866 // Clear them unless we want a huge stride.
9867 if (ST.getGeneration() >= AMDGPUSubtarget::VOLCANIC_ISLANDS &&
9868 ST.getGeneration() <= AMDGPUSubtarget::GFX9)
9869 Rsrc23 &= ~AMDGPU::RSRC_DATA_FORMAT;
9870
9871 return Rsrc23;
9872}
9873
9875 unsigned Opc = MI.getOpcode();
9876
9877 return isSMRD(Opc);
9878}
9879
9881 return get(Opc).mayLoad() &&
9882 (isMUBUF(Opc) || isMTBUF(Opc) || isMIMG(Opc) || isFLAT(Opc));
9883}
9884
9886 TypeSize &MemBytes) const {
9887 const MachineOperand *Addr = getNamedOperand(MI, AMDGPU::OpName::vaddr);
9888 if (!Addr || !Addr->isFI())
9889 return Register();
9890
9891 assert(!MI.memoperands_empty() &&
9892 (*MI.memoperands_begin())->getAddrSpace() == AMDGPUAS::PRIVATE_ADDRESS);
9893
9894 FrameIndex = Addr->getIndex();
9895
9896 int VDataIdx =
9897 AMDGPU::getNamedOperandIdx(MI.getOpcode(), AMDGPU::OpName::vdata);
9898 MemBytes = TypeSize::getFixed(getOpSize(MI.getOpcode(), VDataIdx));
9899 return MI.getOperand(VDataIdx).getReg();
9900}
9901
9903 TypeSize &MemBytes) const {
9904 const MachineOperand *Addr = getNamedOperand(MI, AMDGPU::OpName::addr);
9905 assert(Addr && Addr->isFI());
9906 FrameIndex = Addr->getIndex();
9907
9908 int DataIdx =
9909 AMDGPU::getNamedOperandIdx(MI.getOpcode(), AMDGPU::OpName::data);
9910 MemBytes = TypeSize::getFixed(getOpSize(MI.getOpcode(), DataIdx));
9911 return MI.getOperand(DataIdx).getReg();
9912}
9913
9915 int &FrameIndex,
9916 TypeSize &MemBytes) const {
9917 if (!MI.mayLoad())
9918 return Register();
9919
9920 if (isMUBUF(MI) || isVGPRSpill(MI))
9921 return isStackAccess(MI, FrameIndex, MemBytes);
9922
9923 if (isSGPRSpill(MI))
9924 return isSGPRStackAccess(MI, FrameIndex, MemBytes);
9925
9926 return Register();
9927}
9928
9930 int &FrameIndex,
9931 TypeSize &MemBytes) const {
9932 if (!MI.mayStore())
9933 return Register();
9934
9935 if (isMUBUF(MI) || isVGPRSpill(MI))
9936 return isStackAccess(MI, FrameIndex, MemBytes);
9937
9938 if (isSGPRSpill(MI))
9939 return isSGPRStackAccess(MI, FrameIndex, MemBytes);
9940
9941 return Register();
9942}
9943
9945 unsigned Opc = MI.getOpcode();
9947 unsigned DescSize = Desc.getSize();
9948
9949 // If we have a definitive size, we can use it. Otherwise we need to inspect
9950 // the operands to know the size.
9951 if (isFixedSize(MI)) {
9952 unsigned Size = DescSize;
9953
9954 // If we hit the buggy offset, an extra nop will be inserted in MC so
9955 // estimate the worst case.
9956 if (MI.isBranch() && ST.hasOffset3fBug())
9957 Size += 4;
9958
9959 return Size;
9960 }
9961
9962 // Instructions may have a 32-bit literal encoded after them. Check
9963 // operands that could ever be literals.
9964 if (isVALU(MI, /*AllowLDSDMA=*/true) || isSALU(MI)) {
9965 if (isDPP(MI))
9966 return DescSize;
9967 bool HasLiteral = false;
9968 unsigned LiteralSize = 4;
9969 for (int I = 0, E = MI.getNumExplicitOperands(); I != E; ++I) {
9970 const MachineOperand &Op = MI.getOperand(I);
9971 const MCOperandInfo &OpInfo = Desc.operands()[I];
9972 if (!Op.isReg() && !isInlineConstant(Op, OpInfo)) {
9973 HasLiteral = true;
9974 if (ST.has64BitLiterals()) {
9975 switch (OpInfo.OperandType) {
9976 default:
9977 break;
9980 if (!AMDGPU::isValid32BitLiteral(Op.getImm(), true))
9981 LiteralSize = 8;
9982 break;
9985 // A 32-bit literal is only valid when the value fits in BOTH signed
9986 // and unsigned 32-bit ranges [0, 2^31-1], matching the MC code
9987 // emitter's getLit64Encoding logic. This is because of the lack of
9988 // ability to tell signedness of the literal, therefore we need to
9989 // be conservative and assume values outside this range require a
9990 // 64-bit literal encoding (8 bytes).
9991 if (!Op.isImm() || !isInt<32>(Op.getImm()) ||
9992 !isUInt<32>(Op.getImm()))
9993 LiteralSize = 8;
9994 break;
9995 }
9996 }
9997 break;
9998 }
9999 }
10000 return HasLiteral ? DescSize + LiteralSize : DescSize;
10001 }
10002
10003 // Check whether we have extra NSA words.
10004 if (isMIMG(MI)) {
10005 int VAddr0Idx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::vaddr0);
10006 if (VAddr0Idx < 0)
10007 return 8;
10008
10009 int RSrcIdx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::srsrc);
10010 return 8 + 4 * ((RSrcIdx - VAddr0Idx + 2) / 4);
10011 }
10012
10013 switch (Opc) {
10014 case TargetOpcode::BUNDLE:
10015 return getInstBundleSize(MI);
10016 case TargetOpcode::INLINEASM:
10017 case TargetOpcode::INLINEASM_BR: {
10018 const MachineFunction *MF = MI.getMF();
10019 const char *AsmStr = MI.getOperand(0).getSymbolName();
10020 return getInlineAsmLength(AsmStr, MF->getTarget().getMCAsmInfo(), &ST);
10021 }
10022 default:
10023 if (MI.isMetaInstruction())
10024 return 0;
10025
10026 // If D16 Pseudo inst, get correct MC code size
10027 const auto *D16Info = AMDGPU::getT16D16Helper(Opc);
10028 if (D16Info) {
10029 // Assume d16_lo/hi inst are always in same size
10030 unsigned LoInstOpcode = D16Info->LoOp;
10031 const MCInstrDesc &Desc = getMCOpcodeFromPseudo(LoInstOpcode);
10032 DescSize = Desc.getSize();
10033 }
10034
10035 // If FMA Pseudo inst, get correct MC code size
10036 if (Opc == AMDGPU::V_FMA_MIX_F16_t16 || Opc == AMDGPU::V_FMA_MIX_BF16_t16) {
10037 // All potential lowerings are the same size; arbitrarily pick one.
10038 const MCInstrDesc &Desc = getMCOpcodeFromPseudo(AMDGPU::V_FMA_MIXLO_F16);
10039 DescSize = Desc.getSize();
10040 }
10041
10042 return DescSize;
10043 }
10044}
10045
10048 if (MI.isBranch() && ST.hasOffset3fBug())
10049 return InstSizeVerifyMode::NoVerify;
10050 return InstSizeVerifyMode::ExactSize;
10051}
10052
10054 if (!isFLAT(MI))
10055 return false;
10056
10057 if (MI.memoperands_empty())
10058 return true;
10059
10060 for (const MachineMemOperand *MMO : MI.memoperands()) {
10062 return true;
10063 }
10064 return false;
10065}
10066
10069 static const std::pair<int, const char *> TargetIndices[] = {
10070 {AMDGPU::TI_CONSTDATA_START, "amdgpu-constdata-start"},
10071 {AMDGPU::TI_SCRATCH_RSRC_DWORD0, "amdgpu-scratch-rsrc-dword0"},
10072 {AMDGPU::TI_SCRATCH_RSRC_DWORD1, "amdgpu-scratch-rsrc-dword1"},
10073 {AMDGPU::TI_SCRATCH_RSRC_DWORD2, "amdgpu-scratch-rsrc-dword2"},
10074 {AMDGPU::TI_SCRATCH_RSRC_DWORD3, "amdgpu-scratch-rsrc-dword3"}};
10075 return ArrayRef(TargetIndices);
10076}
10077
10078/// This is used by the post-RA scheduler (SchedulePostRAList.cpp). The
10079/// post-RA version of misched uses CreateTargetMIHazardRecognizer.
10082 const ScheduleDAG *DAG) const {
10083 return new GCNHazardRecognizer(DAG->MF);
10084}
10085
10086/// This is the hazard recognizer used at -O0 by the PostRAHazardRecognizer
10087/// pass.
10090 MachineLoopInfo *MLI) const {
10091 return new GCNHazardRecognizer(MF, MLI);
10092}
10093
10094// Called during:
10095// - pre-RA scheduling and post-RA scheduling
10098 const ScheduleDAGMI *DAG) const {
10099 // Borrowed from Arm Target
10100 // We would like to restrict this hazard recognizer to only
10101 // post-RA scheduling; we can tell that we're post-RA because we don't
10102 // track VRegLiveness.
10103 if (!DAG->hasVRegLiveness())
10104 return new GCNHazardRecognizer(DAG->MF);
10106}
10107
10108std::pair<unsigned, unsigned>
10110 return std::pair(TF & MO_MASK, TF & ~MO_MASK);
10111}
10112
10115 static const std::pair<unsigned, const char *> TargetFlags[] = {
10116 {MO_GOTPCREL, "amdgpu-gotprel"},
10117 {MO_GOTPCREL32_LO, "amdgpu-gotprel32-lo"},
10118 {MO_GOTPCREL32_HI, "amdgpu-gotprel32-hi"},
10119 {MO_GOTPCREL64, "amdgpu-gotprel64"},
10120 {MO_REL32_LO, "amdgpu-rel32-lo"},
10121 {MO_REL32_HI, "amdgpu-rel32-hi"},
10122 {MO_REL64, "amdgpu-rel64"},
10123 {MO_ABS32_LO, "amdgpu-abs32-lo"},
10124 {MO_ABS32_HI, "amdgpu-abs32-hi"},
10125 {MO_ABS64, "amdgpu-abs64"},
10126 };
10127
10128 return ArrayRef(TargetFlags);
10129}
10130
10133 static const std::pair<MachineMemOperand::Flags, const char *> TargetFlags[] =
10134 {
10135 {MONoClobber, "amdgpu-noclobber"},
10136 {MOLastUse, "amdgpu-last-use"},
10137 {MOCooperative, "amdgpu-cooperative"},
10138 {MOThreadPrivate, "amdgpu-thread-private"},
10139 };
10140
10141 return ArrayRef(TargetFlags);
10142}
10143
10145 const MachineFunction &MF) const {
10147 assert(SrcReg.isVirtual());
10148 if (MFI->checkFlag(SrcReg, AMDGPU::VirtRegFlag::WWM_REG))
10149 return AMDGPU::WWM_COPY;
10150
10151 return AMDGPU::COPY;
10152}
10153
10155 uint32_t Opcode = MI.getOpcode();
10156 // Check if it is SGPR spill or wwm-register spill Opcode.
10157 if (isSGPRSpill(Opcode) || isWWMRegSpillOpcode(Opcode))
10158 return true;
10159
10160 const MachineFunction *MF = MI.getMF();
10161 const MachineRegisterInfo &MRI = MF->getRegInfo();
10163
10164 // See if this is Liverange split instruction inserted for SGPR or
10165 // wwm-register. The implicit def inserted for wwm-registers should also be
10166 // included as they can appear at the bb begin.
10167 bool IsLRSplitInst = MI.getFlag(MachineInstr::LRSplit);
10168 if (!IsLRSplitInst && Opcode != AMDGPU::IMPLICIT_DEF)
10169 return false;
10170
10171 Register Reg = MI.getOperand(0).getReg();
10172 if (RI.isSGPRClass(RI.getRegClassForReg(MRI, Reg)))
10173 return IsLRSplitInst;
10174
10175 return MFI->isWWMReg(Reg);
10176}
10177
10179 Register Reg) const {
10180 // We need to handle instructions which may be inserted during register
10181 // allocation to handle the prolog. The initial prolog instruction may have
10182 // been separated from the start of the block by spills and copies inserted
10183 // needed by the prolog. However, the insertions for scalar registers can
10184 // always be placed at the BB top as they are independent of the exec mask
10185 // value.
10186 bool IsNullOrVectorRegister = true;
10187 if (Reg) {
10188 const MachineFunction *MF = MI.getMF();
10189 const MachineRegisterInfo &MRI = MF->getRegInfo();
10190 IsNullOrVectorRegister = !RI.isSGPRClass(RI.getRegClassForReg(MRI, Reg));
10191 }
10192
10193 return IsNullOrVectorRegister &&
10194 (canAddToBBProlog(MI) ||
10195 (!MI.isTerminator() && MI.getOpcode() != AMDGPU::COPY &&
10196 MI.modifiesRegister(AMDGPU::EXEC, &RI)));
10197}
10198
10202 const DebugLoc &DL,
10203 Register DestReg) const {
10204 if (ST.hasAddNoCarryInsts())
10205 return BuildMI(MBB, I, DL, get(AMDGPU::V_ADD_U32_e64), DestReg);
10206
10207 MachineRegisterInfo &MRI = MBB.getParent()->getRegInfo();
10208 Register UnusedCarry = MRI.createVirtualRegister(RI.getBoolRC());
10209 MRI.setRegAllocationHint(UnusedCarry, 0, RI.getVCC());
10210
10211 return BuildMI(MBB, I, DL, get(AMDGPU::V_ADD_CO_U32_e64), DestReg)
10212 .addReg(UnusedCarry, RegState::Define | RegState::Dead);
10213}
10214
10217 const DebugLoc &DL,
10218 Register DestReg,
10219 RegScavenger &RS) const {
10220 if (ST.hasAddNoCarryInsts())
10221 return BuildMI(MBB, I, DL, get(AMDGPU::V_ADD_U32_e32), DestReg);
10222
10223 // If available, prefer to use vcc.
10224 Register UnusedCarry = !RS.isRegUsed(AMDGPU::VCC)
10225 ? Register(RI.getVCC())
10226 : RS.scavengeRegisterBackwards(
10227 *RI.getBoolRC(), I, /* RestoreAfter */ false,
10228 0, /* AllowSpill */ false);
10229
10230 // TODO: Users need to deal with this.
10231 if (!UnusedCarry.isValid())
10232 return MachineInstrBuilder();
10233
10234 return BuildMI(MBB, I, DL, get(AMDGPU::V_ADD_CO_U32_e64), DestReg)
10235 .addReg(UnusedCarry, RegState::Define | RegState::Dead);
10236}
10237
10238bool SIInstrInfo::isKillTerminator(unsigned Opcode) {
10239 switch (Opcode) {
10240 case AMDGPU::SI_KILL_F32_COND_IMM_TERMINATOR:
10241 case AMDGPU::SI_KILL_I1_TERMINATOR:
10242 return true;
10243 default:
10244 return false;
10245 }
10246}
10247
10249 switch (Opcode) {
10250 case AMDGPU::SI_KILL_F32_COND_IMM_PSEUDO:
10251 return get(AMDGPU::SI_KILL_F32_COND_IMM_TERMINATOR);
10252 case AMDGPU::SI_KILL_I1_PSEUDO:
10253 return get(AMDGPU::SI_KILL_I1_TERMINATOR);
10254 default:
10255 llvm_unreachable("invalid opcode, expected SI_KILL_*_PSEUDO");
10256 }
10257}
10258
10259bool SIInstrInfo::isLegalMUBUFImmOffset(unsigned Imm) const {
10260 return Imm <= getMaxMUBUFImmOffset(ST);
10261}
10262
10264 // GFX12 field is non-negative 24-bit signed byte offset.
10265 const unsigned OffsetBits =
10266 ST.getGeneration() >= AMDGPUSubtarget::GFX12 ? 23 : 12;
10267 return (1 << OffsetBits) - 1;
10268}
10269
10271 if (!ST.isWave32())
10272 return;
10273
10274 if (MI.isInlineAsm())
10275 return;
10276
10277 if (MI.getNumOperands() < MI.getNumExplicitOperands())
10278 return;
10279
10280 for (auto &Op : MI.implicit_operands()) {
10281 if (Op.isReg() && Op.getReg() == AMDGPU::VCC)
10282 Op.setReg(AMDGPU::VCC_LO);
10283 }
10284}
10285
10287 if (!isSMRD(MI))
10288 return false;
10289
10290 // Check that it is using a buffer resource.
10291 int Idx = AMDGPU::getNamedOperandIdx(MI.getOpcode(), AMDGPU::OpName::sbase);
10292 if (Idx == -1) // e.g. s_memtime
10293 return false;
10294
10295 const int16_t RCID = getOpRegClassID(MI.getDesc().operands()[Idx]);
10296 return RI.getRegClass(RCID)->hasSubClassEq(&AMDGPU::SGPR_128RegClass);
10297}
10298
10299// Given Imm, split it into the values to put into the SOffset and ImmOffset
10300// fields in an MUBUF instruction. Return false if it is not possible (due to a
10301// hardware bug needing a workaround).
10302//
10303// The required alignment ensures that individual address components remain
10304// aligned if they are aligned to begin with. It also ensures that additional
10305// offsets within the given alignment can be added to the resulting ImmOffset.
10307 uint32_t &ImmOffset, Align Alignment) const {
10308 const uint32_t MaxOffset = SIInstrInfo::getMaxMUBUFImmOffset(ST);
10309 const uint32_t MaxImm = alignDown(MaxOffset, Alignment.value());
10310 uint32_t Overflow = 0;
10311
10312 if (Imm > MaxImm) {
10313 if (Imm <= MaxImm + 64) {
10314 // Use an SOffset inline constant for 4..64
10315 Overflow = Imm - MaxImm;
10316 Imm = MaxImm;
10317 } else {
10318 // Try to keep the same value in SOffset for adjacent loads, so that
10319 // the corresponding register contents can be re-used.
10320 //
10321 // Load values with all low-bits (except for alignment bits) set into
10322 // SOffset, so that a larger range of values can be covered using
10323 // s_movk_i32.
10324 //
10325 // Atomic operations fail to work correctly when individual address
10326 // components are unaligned, even if their sum is aligned.
10327 uint32_t High = (Imm + Alignment.value()) & ~MaxOffset;
10328 uint32_t Low = (Imm + Alignment.value()) & MaxOffset;
10329 Imm = Low;
10330 Overflow = High - Alignment.value();
10331 }
10332 }
10333
10334 if (Overflow > 0) {
10335 // There is a hardware bug in SI and CI which prevents address clamping in
10336 // MUBUF instructions from working correctly with SOffsets. The immediate
10337 // offset is unaffected.
10338 if (ST.getGeneration() <= AMDGPUSubtarget::SEA_ISLANDS)
10339 return false;
10340
10341 // It is not possible to set immediate in SOffset field on some targets.
10342 if (ST.hasRestrictedSOffset())
10343 return false;
10344 }
10345
10346 ImmOffset = Imm;
10347 SOffset = Overflow;
10348 return true;
10349}
10350
10351// Depending on the used address space and instructions, some immediate offsets
10352// are allowed and some are not.
10353// Pre-GFX12, flat instruction offsets can only be non-negative, global and
10354// scratch instruction offsets can also be negative. On GFX12, offsets can be
10355// negative for all variants.
10356//
10357// There are several bugs related to these offsets:
10358// On gfx10.1, flat instructions that go into the global address space cannot
10359// use an offset.
10360//
10361// For scratch instructions, the address can be either an SGPR or a VGPR.
10362// The following offsets can be used, depending on the architecture (x means
10363// cannot be used):
10364// +----------------------------+------+------+
10365// | Address-Mode | SGPR | VGPR |
10366// +----------------------------+------+------+
10367// | gfx9 | | |
10368// | negative, 4-aligned offset | x | ok |
10369// | negative, unaligned offset | x | ok |
10370// +----------------------------+------+------+
10371// | gfx10 | | |
10372// | negative, 4-aligned offset | ok | ok |
10373// | negative, unaligned offset | ok | x |
10374// +----------------------------+------+------+
10375// | gfx10.3 | | |
10376// | negative, 4-aligned offset | ok | ok |
10377// | negative, unaligned offset | ok | ok |
10378// +----------------------------+------+------+
10379//
10380// This function ignores the addressing mode, so if an offset cannot be used in
10381// one addressing mode, it is considered illegal.
10382bool SIInstrInfo::isLegalFLATOffset(int64_t Offset, unsigned AddrSpace,
10383 AMDGPU::FlatAddrSpace FlatVariant) const {
10384 // TODO: Should 0 be special cased?
10385 if (!ST.hasFlatInstOffsets())
10386 return false;
10387
10389 if (ST.hasFlatSegmentOffsetBug() && FlatVariant == FlatAddrSpace::FLAT &&
10390 (AddrSpace == AMDGPUAS::FLAT_ADDRESS ||
10391 AddrSpace == AMDGPUAS::GLOBAL_ADDRESS))
10392 return false;
10393
10394 if (ST.hasNegativeUnalignedScratchOffsetBug() &&
10395 FlatVariant == FlatAddrSpace::FlatScratch && Offset < 0 &&
10396 (Offset % 4) != 0) {
10397 return false;
10398 }
10399
10400 bool AllowNegative = allowNegativeFlatOffset(FlatVariant);
10401 unsigned N = AMDGPU::getNumFlatOffsetBits(ST);
10402 return isIntN(N, Offset) && (AllowNegative || Offset >= 0);
10403}
10404
10405// See comment on SIInstrInfo::isLegalFLATOffset for what is legal and what not.
10406std::pair<int64_t, int64_t>
10407SIInstrInfo::splitFlatOffset(int64_t COffsetVal, unsigned AddrSpace,
10408 AMDGPU::FlatAddrSpace FlatVariant) const {
10409 int64_t RemainderOffset = COffsetVal;
10410 int64_t ImmField = 0;
10411
10412 bool AllowNegative = allowNegativeFlatOffset(FlatVariant);
10413 const unsigned NumBits = AMDGPU::getNumFlatOffsetBits(ST) - 1;
10414
10415 if (AllowNegative) {
10416 // Use signed division by a power of two to truncate towards 0.
10417 int64_t D = 1LL << NumBits;
10418 RemainderOffset = (COffsetVal / D) * D;
10419 ImmField = COffsetVal - RemainderOffset;
10420
10421 if (ST.hasNegativeUnalignedScratchOffsetBug() &&
10422 FlatVariant == AMDGPU::FlatAddrSpace::FlatScratch && ImmField < 0 &&
10423 (ImmField % 4) != 0) {
10424 // Make ImmField a multiple of 4
10425 RemainderOffset += ImmField % 4;
10426 ImmField -= ImmField % 4;
10427 }
10428 } else if (COffsetVal >= 0) {
10429 ImmField = COffsetVal & maskTrailingOnes<uint64_t>(NumBits);
10430 RemainderOffset = COffsetVal - ImmField;
10431 }
10432
10433 assert(isLegalFLATOffset(ImmField, AddrSpace, FlatVariant));
10434 assert(RemainderOffset + ImmField == COffsetVal);
10435 return {ImmField, RemainderOffset};
10436}
10437
10439 AMDGPU::FlatAddrSpace FlatVariant) const {
10440 if (ST.hasNegativeScratchOffsetBug() &&
10442 return false;
10443
10444 return FlatVariant != AMDGPU::FlatAddrSpace::FLAT || AMDGPU::isGFX12Plus(ST);
10445}
10446
10447static unsigned subtargetEncodingFamily(const GCNSubtarget &ST) {
10448 switch (ST.getGeneration()) {
10449 default:
10450 break;
10453 return SIEncodingFamily::SI;
10456 return SIEncodingFamily::VI;
10460 return ST.hasGFX11_7Insts() ? SIEncodingFamily::GFX1170
10463 return ST.hasGFX1250Insts() ? SIEncodingFamily::GFX1250
10467 }
10468 llvm_unreachable("Unknown subtarget generation!");
10469}
10470
10471bool SIInstrInfo::isAsmOnlyOpcode(int MCOp) const {
10472 switch(MCOp) {
10473 // These opcodes use indirect register addressing so
10474 // they need special handling by codegen (currently missing).
10475 // Therefore it is too risky to allow these opcodes
10476 // to be selected by dpp combiner or sdwa peepholer.
10477 case AMDGPU::V_MOVRELS_B32_dpp_gfx10:
10478 case AMDGPU::V_MOVRELS_B32_sdwa_gfx10:
10479 case AMDGPU::V_MOVRELD_B32_dpp_gfx10:
10480 case AMDGPU::V_MOVRELD_B32_sdwa_gfx10:
10481 case AMDGPU::V_MOVRELSD_B32_dpp_gfx10:
10482 case AMDGPU::V_MOVRELSD_B32_sdwa_gfx10:
10483 case AMDGPU::V_MOVRELSD_2_B32_dpp_gfx10:
10484 case AMDGPU::V_MOVRELSD_2_B32_sdwa_gfx10:
10485 return true;
10486 default:
10487 return false;
10488 }
10489}
10490
10491#define GENERATE_RENAMED_GFX9_CASES(OPCODE) \
10492 case OPCODE##_dpp: \
10493 case OPCODE##_e32: \
10494 case OPCODE##_e64: \
10495 case OPCODE##_e64_dpp: \
10496 case OPCODE##_sdwa:
10497
10498static bool isRenamedInGFX9(int Opcode) {
10499 switch (Opcode) {
10500 GENERATE_RENAMED_GFX9_CASES(AMDGPU::V_ADDC_U32)
10501 GENERATE_RENAMED_GFX9_CASES(AMDGPU::V_ADD_CO_U32)
10502 GENERATE_RENAMED_GFX9_CASES(AMDGPU::V_ADD_U32)
10503 GENERATE_RENAMED_GFX9_CASES(AMDGPU::V_SUBBREV_U32)
10504 GENERATE_RENAMED_GFX9_CASES(AMDGPU::V_SUBB_U32)
10505 GENERATE_RENAMED_GFX9_CASES(AMDGPU::V_SUBREV_CO_U32)
10506 GENERATE_RENAMED_GFX9_CASES(AMDGPU::V_SUBREV_U32)
10507 GENERATE_RENAMED_GFX9_CASES(AMDGPU::V_SUB_CO_U32)
10508 GENERATE_RENAMED_GFX9_CASES(AMDGPU::V_SUB_U32)
10509 //
10510 case AMDGPU::V_DIV_FIXUP_F16_gfx9_e64:
10511 case AMDGPU::V_DIV_FIXUP_F16_gfx9_fake16_e64:
10512 case AMDGPU::V_FMA_F16_gfx9_e64:
10513 case AMDGPU::V_FMA_F16_gfx9_fake16_e64:
10514 case AMDGPU::V_INTERP_P2_F16:
10515 case AMDGPU::V_MAD_F16_e64:
10516 case AMDGPU::V_MAD_U16_e64:
10517 case AMDGPU::V_MAD_I16_e64:
10518 return true;
10519 default:
10520 return false;
10521 }
10522}
10523
10524int SIInstrInfo::pseudoToMCOpcode(int Opcode) const {
10525 assert(Opcode == (int)SIInstrInfo::getNonSoftWaitcntOpcode(Opcode) &&
10526 "SIInsertWaitcnts should have promoted soft waitcnt instructions!");
10527
10528 unsigned Gen = subtargetEncodingFamily(ST);
10529
10530 if (ST.getGeneration() == AMDGPUSubtarget::GFX9 && isRenamedInGFX9(Opcode))
10532
10533 // Adjust the encoding family to GFX80 for D16 buffer instructions when the
10534 // subtarget has UnpackedD16VMem feature.
10535 // TODO: remove this when we discard GFX80 encoding.
10536 if (ST.hasUnpackedD16VMem() && SIInstrFlags::isD16Buf(get(Opcode)))
10538
10539 if (SIInstrFlags::isSDWA(get(Opcode))) {
10540 switch (ST.getGeneration()) {
10541 default:
10543 break;
10546 break;
10549 break;
10550 }
10551 }
10552
10553 if (isMAI(Opcode)) {
10554 int MFMAOp = AMDGPU::getMFMAEarlyClobberOp(Opcode);
10555 if (MFMAOp != -1)
10556 Opcode = MFMAOp;
10557 }
10558
10559 int32_t MCOp = AMDGPU::getMCOpcode(Opcode, Gen);
10560
10561 if (MCOp == AMDGPU::INSTRUCTION_LIST_END && ST.hasGFX11_7Insts())
10563
10564 if (MCOp == AMDGPU::INSTRUCTION_LIST_END && ST.hasGFX1250Insts())
10566
10567 // -1 means that Opcode is already a native instruction.
10568 if (MCOp == -1)
10569 return Opcode;
10570
10571 if (ST.hasGFX90AInsts()) {
10572 uint32_t NMCOp = AMDGPU::INSTRUCTION_LIST_END;
10573 if (ST.hasGFX940Insts())
10575 if (NMCOp == AMDGPU::INSTRUCTION_LIST_END)
10577 if (NMCOp == AMDGPU::INSTRUCTION_LIST_END)
10579 if (NMCOp != AMDGPU::INSTRUCTION_LIST_END)
10580 MCOp = NMCOp;
10581 }
10582
10583 // INSTRUCTION_LIST_END means that Opcode is a pseudo instruction that has no
10584 // encoding in the given subtarget generation.
10585 if (MCOp == AMDGPU::INSTRUCTION_LIST_END)
10586 return -1;
10587
10588 if (isAsmOnlyOpcode(MCOp))
10589 return -1;
10590
10591 return MCOp;
10592}
10593
10594static
10596 assert(RegOpnd.isReg());
10597 return RegOpnd.isUndef() ? TargetInstrInfo::RegSubRegPair() :
10598 getRegSubRegPair(RegOpnd);
10599}
10600
10603 assert(MI.isRegSequence());
10604 for (unsigned I = 0, E = (MI.getNumOperands() - 1)/ 2; I < E; ++I)
10605 if (MI.getOperand(1 + 2 * I + 1).getImm() == SubReg) {
10606 auto &RegOp = MI.getOperand(1 + 2 * I);
10607 return getRegOrUndef(RegOp);
10608 }
10610}
10611
10612// Try to find the definition of reg:subreg in subreg-manipulation pseudos
10613// Following a subreg of reg:subreg isn't supported
10616 if (!RSR.SubReg)
10617 return false;
10618 switch (MI.getOpcode()) {
10619 default: break;
10620 case AMDGPU::REG_SEQUENCE:
10621 RSR = getRegSequenceSubReg(MI, RSR.SubReg);
10622 return true;
10623 // EXTRACT_SUBREG isn't supported as this would follow a subreg of subreg
10624 case AMDGPU::INSERT_SUBREG:
10625 if (RSR.SubReg == (unsigned)MI.getOperand(3).getImm())
10626 // inserted the subreg we're looking for
10627 RSR = getRegOrUndef(MI.getOperand(2));
10628 else { // the subreg in the rest of the reg
10629 auto R1 = getRegOrUndef(MI.getOperand(1));
10630 if (R1.SubReg) // subreg of subreg isn't supported
10631 return false;
10632 RSR.Reg = R1.Reg;
10633 }
10634 return true;
10635 }
10636 return false;
10637}
10638
10640 const MachineRegisterInfo &MRI) {
10641 assert(MRI.isSSA());
10642 if (!P.Reg.isVirtual())
10643 return nullptr;
10644
10645 auto RSR = P;
10646 auto *DefInst = MRI.getVRegDef(RSR.Reg);
10647 while (auto *MI = DefInst) {
10648 DefInst = nullptr;
10649 switch (MI->getOpcode()) {
10650 case AMDGPU::COPY:
10651 case AMDGPU::V_MOV_B32_e32: {
10652 auto &Op1 = MI->getOperand(1);
10653 if (Op1.isReg() && Op1.getReg().isVirtual()) {
10654 if (Op1.isUndef())
10655 return nullptr;
10656 RSR = getRegSubRegPair(Op1);
10657 DefInst = MRI.getVRegDef(RSR.Reg);
10658 }
10659 break;
10660 }
10661 default:
10662 if (followSubRegDef(*MI, RSR)) {
10663 if (!RSR.Reg)
10664 return nullptr;
10665 DefInst = MRI.getVRegDef(RSR.Reg);
10666 }
10667 }
10668 if (!DefInst)
10669 return MI;
10670 }
10671 return nullptr;
10672}
10673
10675 Register VReg,
10676 const MachineInstr &DefMI,
10677 const MachineInstr &UseMI) {
10678 assert(MRI.isSSA() && "Must be run on SSA");
10679
10680 auto *TRI = MRI.getTargetRegisterInfo();
10681 auto *DefBB = DefMI.getParent();
10682
10683 // Don't bother searching between blocks, although it is possible this block
10684 // doesn't modify exec.
10685 if (UseMI.getParent() != DefBB)
10686 return true;
10687
10688 const int MaxInstScan = 20;
10689 int NumInst = 0;
10690
10691 // Stop scan at the use.
10692 auto E = UseMI.getIterator();
10693 for (auto I = std::next(DefMI.getIterator()); I != E; ++I) {
10694 if (I->isDebugInstr())
10695 continue;
10696
10697 if (++NumInst > MaxInstScan)
10698 return true;
10699
10700 if (I->modifiesRegister(AMDGPU::EXEC, TRI))
10701 return true;
10702 }
10703
10704 return false;
10705}
10706
10708 Register VReg,
10709 const MachineInstr &DefMI) {
10710 assert(MRI.isSSA() && "Must be run on SSA");
10711
10712 auto *TRI = MRI.getTargetRegisterInfo();
10713 auto *DefBB = DefMI.getParent();
10714
10715 const int MaxUseScan = 10;
10716 int NumUse = 0;
10717
10718 for (auto &Use : MRI.use_nodbg_operands(VReg)) {
10719 auto &UseInst = *Use.getParent();
10720 // Don't bother searching between blocks, although it is possible this block
10721 // doesn't modify exec.
10722 if (UseInst.getParent() != DefBB || UseInst.isPHI())
10723 return true;
10724
10725 if (++NumUse > MaxUseScan)
10726 return true;
10727 }
10728
10729 if (NumUse == 0)
10730 return false;
10731
10732 const int MaxInstScan = 20;
10733 int NumInst = 0;
10734
10735 // Stop scan when we have seen all the uses.
10736 for (auto I = std::next(DefMI.getIterator()); ; ++I) {
10737 assert(I != DefBB->end());
10738
10739 if (I->isDebugInstr())
10740 continue;
10741
10742 if (++NumInst > MaxInstScan)
10743 return true;
10744
10745 for (const MachineOperand &Op : I->operands()) {
10746 // We don't check reg masks here as they're used only on calls:
10747 // 1. EXEC is only considered const within one BB
10748 // 2. Call should be a terminator instruction if present in a BB
10749
10750 if (!Op.isReg())
10751 continue;
10752
10753 Register Reg = Op.getReg();
10754 if (Op.isUse()) {
10755 if (Reg == VReg && --NumUse == 0)
10756 return false;
10757 } else if (TRI->regsOverlap(Reg, AMDGPU::EXEC))
10758 return true;
10759 }
10760 }
10761}
10762
10765 const DebugLoc &DL, Register Src, Register Dst) const {
10766 auto Cur = MBB.begin();
10767 if (Cur != MBB.end())
10768 do {
10769 if (!Cur->isPHI() && Cur->readsRegister(Dst, /*TRI=*/nullptr))
10770 return BuildMI(MBB, Cur, DL, get(TargetOpcode::COPY), Dst).addReg(Src);
10771 ++Cur;
10772 } while (Cur != MBB.end() && Cur != LastPHIIt);
10773
10774 return TargetInstrInfo::createPHIDestinationCopy(MBB, LastPHIIt, DL, Src,
10775 Dst);
10776}
10777
10780 const DebugLoc &DL, Register Src, unsigned SrcSubReg, Register Dst) const {
10781 if (InsPt != MBB.end() &&
10782 (InsPt->getOpcode() == AMDGPU::SI_IF ||
10783 InsPt->getOpcode() == AMDGPU::SI_ELSE ||
10784 InsPt->getOpcode() == AMDGPU::SI_IF_BREAK) &&
10785 InsPt->definesRegister(Src, /*TRI=*/nullptr)) {
10786 InsPt++;
10787 return BuildMI(MBB, InsPt, DL,
10788 get(AMDGPU::LaneMaskConstants::get(ST).MovTermOpc), Dst)
10789 .addReg(Src, {}, SrcSubReg)
10790 .addReg(AMDGPU::EXEC, RegState::Implicit);
10791 }
10792 return TargetInstrInfo::createPHISourceCopy(MBB, InsPt, DL, Src, SrcSubReg,
10793 Dst);
10794}
10795
10796bool llvm::SIInstrInfo::isWave32() const { return ST.isWave32(); }
10797
10799 const MachineInstr &SecondMI) const {
10800 for (const auto &Use : SecondMI.all_uses()) {
10801 if (Use.isReg() && FirstMI.modifiesRegister(Use.getReg(), &RI))
10802 return true;
10803 }
10804 return false;
10805}
10806
10807/// If OpX is multicycle, anti-dependencies are not allowed.
10808/// isDPMACCInstruction was not designed for VOPD, but it is fit for the
10809/// purpose.
10811 const MachineInstr &OpX) const {
10813}
10814
10817 ArrayRef<unsigned> Ops, int FrameIndex,
10818 MachineInstr *&CopyMI, LiveIntervals *LIS,
10819 VirtRegMap *VRM) const {
10820 // This is a bit of a hack (copied from AArch64). Consider this instruction:
10821 //
10822 // %0:sreg_32 = COPY $m0
10823 //
10824 // We explicitly chose SReg_32 for the virtual register so such a copy might
10825 // be eliminated by RegisterCoalescer. However, that may not be possible, and
10826 // %0 may even spill. We can't spill $m0 normally (it would require copying to
10827 // a numbered SGPR anyway), and since it is in the SReg_32 register class,
10828 // TargetInstrInfo::foldMemoryOperand() is going to try.
10829 // A similar issue also exists with spilling and reloading $exec registers.
10830 //
10831 // To prevent that, constrain the %0 register class here.
10832 if (isFullCopyInstr(MI)) {
10833 Register DstReg = MI.getOperand(0).getReg();
10834 Register SrcReg = MI.getOperand(1).getReg();
10835 if ((DstReg.isVirtual() || SrcReg.isVirtual()) &&
10836 (DstReg.isVirtual() != SrcReg.isVirtual())) {
10837 MachineRegisterInfo &MRI = MF.getRegInfo();
10838 Register VirtReg = DstReg.isVirtual() ? DstReg : SrcReg;
10839 const TargetRegisterClass *RC = MRI.getRegClass(VirtReg);
10840 if (RC->hasSuperClassEq(&AMDGPU::SReg_32RegClass)) {
10841 MRI.constrainRegClass(VirtReg, &AMDGPU::SReg_32_XM0_XEXECRegClass);
10842 return nullptr;
10843 }
10844 if (RC->hasSuperClassEq(&AMDGPU::SReg_64RegClass)) {
10845 MRI.constrainRegClass(VirtReg, &AMDGPU::SReg_64_XEXECRegClass);
10846 return nullptr;
10847 }
10848 }
10849 }
10850
10851 return nullptr;
10852}
10853
10855 const MachineInstr &MI,
10856 unsigned *PredCost) const {
10857 if (MI.isBundle()) {
10859 MachineBasicBlock::const_instr_iterator E(MI.getParent()->instr_end());
10860 unsigned Lat = 0, Count = 0;
10861 for (++I; I != E && I->isBundledWithPred(); ++I) {
10862 ++Count;
10863 Lat = std::max(Lat, SchedModel.computeInstrLatency(&*I));
10864 }
10865 return Lat + Count - 1;
10866 }
10867
10868 return SchedModel.computeInstrLatency(&MI);
10869}
10870
10871const MachineOperand &
10873 if (const MachineOperand *CallAddrOp =
10874 getNamedOperand(MI, AMDGPU::OpName::src0))
10875 return *CallAddrOp;
10877}
10878
10881 const MachineRegisterInfo &MRI = MI.getMF()->getRegInfo();
10882 unsigned Opcode = MI.getOpcode();
10883
10884 auto HandleAddrSpaceCast = [this, &MRI](const MachineInstr &MI) {
10885 Register Dst = MI.getOperand(0).getReg();
10886 Register Src = isa<GIntrinsic>(MI) ? MI.getOperand(2).getReg()
10887 : MI.getOperand(1).getReg();
10888 LLT DstTy = MRI.getType(Dst);
10889 LLT SrcTy = MRI.getType(Src);
10890 unsigned DstAS = DstTy.getAddressSpace();
10891 unsigned SrcAS = SrcTy.getAddressSpace();
10892 return SrcAS == AMDGPUAS::PRIVATE_ADDRESS &&
10893 DstAS == AMDGPUAS::FLAT_ADDRESS &&
10894 ST.hasGloballyAddressableScratch()
10897 };
10898
10899 // If the target supports globally addressable scratch, the mapping from
10900 // scratch memory to the flat aperture changes therefore an address space cast
10901 // is no longer uniform.
10902 if (Opcode == TargetOpcode::G_ADDRSPACE_CAST)
10903 return HandleAddrSpaceCast(MI);
10904
10905 if (auto *GI = dyn_cast<GIntrinsic>(&MI)) {
10906 auto IID = GI->getIntrinsicID();
10911
10912 switch (IID) {
10913 case Intrinsic::amdgcn_addrspacecast_nonnull:
10914 return HandleAddrSpaceCast(MI);
10915 case Intrinsic::amdgcn_if:
10916 case Intrinsic::amdgcn_else:
10917 // FIXME: Uniform if second result
10918 break;
10919 }
10920
10922 }
10923
10924 // Loads from the private and flat address spaces are divergent, because
10925 // threads can execute the load instruction with the same inputs and get
10926 // different results.
10927 //
10928 // All other loads are not divergent, because if threads issue loads with the
10929 // same arguments, they will always get the same result.
10930 if (Opcode == AMDGPU::G_LOAD || Opcode == AMDGPU::G_ZEXTLOAD ||
10931 Opcode == AMDGPU::G_SEXTLOAD) {
10932 if (MI.memoperands_empty())
10933 return ValueUniformity::NeverUniform; // conservative assumption
10934
10935 if (llvm::any_of(MI.memoperands(), [](const MachineMemOperand *mmo) {
10936 return mmo->getAddrSpace() == AMDGPUAS::PRIVATE_ADDRESS ||
10937 mmo->getAddrSpace() == AMDGPUAS::FLAT_ADDRESS;
10938 })) {
10939 // At least one MMO in a non-global address space.
10941 }
10943 }
10944
10945 if (SIInstrInfo::isGenericAtomicRMWOpcode(Opcode) ||
10946 Opcode == AMDGPU::G_ATOMIC_CMPXCHG ||
10947 Opcode == AMDGPU::G_ATOMIC_CMPXCHG_WITH_SUCCESS ||
10948 AMDGPU::isGenericAtomic(Opcode)) {
10950 }
10951
10952 // Result is computed from uniform SP and uniform wave-wide max size.
10953 if (Opcode == TargetOpcode::G_DYN_STACKALLOC)
10955
10956 if (Opcode == AMDGPU::G_AMDGPU_WHOLE_WAVE_FUNC_SETUP)
10958
10960}
10961
10963 if (!Formatter)
10964 Formatter = std::make_unique<AMDGPUMIRFormatter>(ST);
10965 return Formatter.get();
10966}
10967
10969
10970 if (isNeverUniform(MI))
10972
10973 unsigned opcode = MI.getOpcode();
10974 if (opcode == AMDGPU::V_READLANE_B32 ||
10975 opcode == AMDGPU::V_READFIRSTLANE_B32 ||
10976 opcode == AMDGPU::SI_RESTORE_S32_FROM_VGPR)
10978
10979 // If any of defs is divergent, report as NeverUniform. isUniformReg will
10980 // calculate in more detail for each def from its reg class, if available.
10981 if (MI.isInlineAsm()) {
10982 for (const MachineOperand &MO : MI.operands()) {
10983 if (!MO.isReg() || !MO.isDef())
10984 continue;
10985 const TargetRegisterClass *RC =
10986 MI.getRegClassConstraint(MO.getOperandNo(), this, &RI);
10987 if (!RC || !RI.isSGPRClass(RC))
10989 }
10990 }
10991
10992 if (isCopyInstr(MI)) {
10993 const MachineOperand &srcOp = MI.getOperand(1);
10994 if (srcOp.isReg() && srcOp.getReg().isPhysical()) {
10995 const TargetRegisterClass *regClass =
10996 RI.getPhysRegBaseClass(srcOp.getReg());
10997 return RI.isSGPRClass(regClass) ? ValueUniformity::AlwaysUniform
10999 }
11001 }
11002
11003 // GMIR handling
11004 if (MI.isPreISelOpcode())
11006
11007 // Atomics are divergent because they are executed sequentially: when an
11008 // atomic operation refers to the same address in each thread, then each
11009 // thread after the first sees the value written by the previous thread as
11010 // original value.
11011
11012 if (isAtomic(MI))
11014
11015 // Loads from the private and flat address spaces are divergent, because
11016 // threads can execute the load instruction with the same inputs and get
11017 // different results.
11018 if (isFLAT(MI) && MI.mayLoad()) {
11019 if (MI.memoperands_empty())
11020 return ValueUniformity::NeverUniform; // conservative assumption
11021
11022 if (llvm::any_of(MI.memoperands(), [](const MachineMemOperand *mmo) {
11023 return mmo->getAddrSpace() == AMDGPUAS::PRIVATE_ADDRESS ||
11024 mmo->getAddrSpace() == AMDGPUAS::FLAT_ADDRESS;
11025 })) {
11026 // At least one MMO in a non-global address space.
11028 }
11029
11031 }
11032
11033 const MachineRegisterInfo &MRI = MI.getMF()->getRegInfo();
11034 const AMDGPURegisterBankInfo *RBI = ST.getRegBankInfo();
11035
11036 // FIXME: It's conceptually broken to report this for an instruction, and not
11037 // a specific def operand. For inline asm in particular, there could be mixed
11038 // uniform and divergent results.
11039 for (unsigned I = 0, E = MI.getNumOperands(); I != E; ++I) {
11040 const MachineOperand &SrcOp = MI.getOperand(I);
11041 if (!SrcOp.isReg())
11042 continue;
11043
11044 Register Reg = SrcOp.getReg();
11045 if (!Reg || !SrcOp.readsReg())
11046 continue;
11047
11048 // If RegBank is null, this is unassigned or an unallocatable special
11049 // register, which are all scalars.
11050 const RegisterBank *RegBank = RBI->getRegBank(Reg, MRI, RI);
11051 if (RegBank && RegBank->getID() != AMDGPU::SGPRRegBankID)
11053 }
11054
11055 // TODO: Uniformity check conditions above can be rearranged for more
11056 // readability
11057
11058 // TODO: amdgcn.{ballot, [if]cmp} should be AlwaysUniform, but they are
11059 // currently turned into no-op COPYs by SelectionDAG ISel and are
11060 // therefore no longer recognizable.
11061
11063}
11064
11066 switch (MF.getFunction().getCallingConv()) {
11068 return 1;
11070 return 2;
11072 return 3;
11076 const Function &F = MF.getFunction();
11077 F.getContext().diagnose(DiagnosticInfoUnsupported(
11078 F, "ds_ordered_count unsupported for this calling conv"));
11079 [[fallthrough]];
11080 }
11083 case CallingConv::C:
11084 case CallingConv::Fast:
11085 default:
11086 // Assume other calling conventions are various compute callable functions
11087 return 0;
11088 }
11089}
11090
11092 Register &SrcReg2, int64_t &CmpMask,
11093 int64_t &CmpValue) const {
11094 if (!MI.getOperand(0).isReg() || MI.getOperand(0).getSubReg())
11095 return false;
11096
11097 switch (MI.getOpcode()) {
11098 default:
11099 break;
11100 case AMDGPU::S_CMP_EQ_U32:
11101 case AMDGPU::S_CMP_EQ_I32:
11102 case AMDGPU::S_CMP_LG_U32:
11103 case AMDGPU::S_CMP_LG_I32:
11104 case AMDGPU::S_CMP_LT_U32:
11105 case AMDGPU::S_CMP_LT_I32:
11106 case AMDGPU::S_CMP_GT_U32:
11107 case AMDGPU::S_CMP_GT_I32:
11108 case AMDGPU::S_CMP_LE_U32:
11109 case AMDGPU::S_CMP_LE_I32:
11110 case AMDGPU::S_CMP_GE_U32:
11111 case AMDGPU::S_CMP_GE_I32:
11112 case AMDGPU::S_CMP_EQ_U64:
11113 case AMDGPU::S_CMP_LG_U64:
11114 SrcReg = MI.getOperand(0).getReg();
11115 if (MI.getOperand(1).isReg()) {
11116 if (MI.getOperand(1).getSubReg())
11117 return false;
11118 SrcReg2 = MI.getOperand(1).getReg();
11119 CmpValue = 0;
11120 } else if (MI.getOperand(1).isImm()) {
11121 SrcReg2 = Register();
11122 CmpValue = MI.getOperand(1).getImm();
11123 } else {
11124 return false;
11125 }
11126 CmpMask = ~0;
11127 return true;
11128 case AMDGPU::S_CMPK_EQ_U32:
11129 case AMDGPU::S_CMPK_EQ_I32:
11130 case AMDGPU::S_CMPK_LG_U32:
11131 case AMDGPU::S_CMPK_LG_I32:
11132 case AMDGPU::S_CMPK_LT_U32:
11133 case AMDGPU::S_CMPK_LT_I32:
11134 case AMDGPU::S_CMPK_GT_U32:
11135 case AMDGPU::S_CMPK_GT_I32:
11136 case AMDGPU::S_CMPK_LE_U32:
11137 case AMDGPU::S_CMPK_LE_I32:
11138 case AMDGPU::S_CMPK_GE_U32:
11139 case AMDGPU::S_CMPK_GE_I32:
11140 SrcReg = MI.getOperand(0).getReg();
11141 SrcReg2 = Register();
11142 CmpValue = MI.getOperand(1).getImm();
11143 CmpMask = ~0;
11144 return true;
11145 }
11146
11147 return false;
11148}
11149
11151 for (MachineBasicBlock *S : MBB->successors()) {
11152 if (S->isLiveIn(AMDGPU::SCC))
11153 return false;
11154 }
11155 return true;
11156}
11157
11158// Invert all uses of SCC following SCCDef because SCCDef may be deleted and
11159// (incoming SCC) = !(SCC defined by SCCDef).
11160// Return true if all uses can be re-written, false otherwise.
11161bool SIInstrInfo::invertSCCUse(MachineInstr *SCCDef) const {
11162 MachineBasicBlock *MBB = SCCDef->getParent();
11163 SmallVector<MachineInstr *> InvertInstr;
11164 bool SCCIsDead = false;
11165
11166 // Scan instructions for SCC uses that need to be inverted until SCC is dead.
11167 constexpr unsigned ScanLimit = 12;
11168 unsigned Count = 0;
11169 for (MachineInstr &MI :
11170 make_range(std::next(MachineBasicBlock::iterator(SCCDef)), MBB->end())) {
11171 if (++Count > ScanLimit)
11172 return false;
11173 if (MI.readsRegister(AMDGPU::SCC, &RI)) {
11174 if (MI.getOpcode() == AMDGPU::S_CSELECT_B32 ||
11175 MI.getOpcode() == AMDGPU::S_CSELECT_B64 ||
11176 MI.getOpcode() == AMDGPU::S_CBRANCH_SCC0 ||
11177 MI.getOpcode() == AMDGPU::S_CBRANCH_SCC1)
11178 InvertInstr.push_back(&MI);
11179 else
11180 return false;
11181 }
11182 if (MI.definesRegister(AMDGPU::SCC, &RI)) {
11183 SCCIsDead = true;
11184 break;
11185 }
11186 }
11187 if (!SCCIsDead && isSCCDeadOnExit(MBB))
11188 SCCIsDead = true;
11189
11190 // SCC may have more uses. Can't invert all of them.
11191 if (!SCCIsDead)
11192 return false;
11193
11194 // Invert uses
11195 for (MachineInstr *MI : InvertInstr) {
11196 if (MI->getOpcode() == AMDGPU::S_CSELECT_B32 ||
11197 MI->getOpcode() == AMDGPU::S_CSELECT_B64) {
11198 swapOperands(*MI);
11199 } else if (MI->getOpcode() == AMDGPU::S_CBRANCH_SCC0 ||
11200 MI->getOpcode() == AMDGPU::S_CBRANCH_SCC1) {
11201 MI->setDesc(get(MI->getOpcode() == AMDGPU::S_CBRANCH_SCC0
11202 ? AMDGPU::S_CBRANCH_SCC1
11203 : AMDGPU::S_CBRANCH_SCC0));
11204 } else {
11205 llvm_unreachable("SCC used but no inversion handling");
11206 }
11207 }
11208 return true;
11209}
11210
11211// SCC is already valid after SCCValid.
11212// SCCRedefine will redefine SCC to the same value already available after
11213// SCCValid. If there are no intervening SCC conflicts delete SCCRedefine and
11214// update kill/dead flags if necessary.
11215bool SIInstrInfo::optimizeSCC(MachineInstr *SCCValid, MachineInstr *SCCRedefine,
11216 bool NeedInversion) const {
11217 MachineInstr *KillsSCC = nullptr;
11218 if (SCCValid->getParent() != SCCRedefine->getParent())
11219 return false;
11220 for (MachineInstr &MI : make_range(std::next(SCCValid->getIterator()),
11221 SCCRedefine->getIterator())) {
11222 if (MI.modifiesRegister(AMDGPU::SCC, &RI))
11223 return false;
11224 if (MI.killsRegister(AMDGPU::SCC, &RI))
11225 KillsSCC = &MI;
11226 }
11227 if (NeedInversion && !invertSCCUse(SCCRedefine))
11228 return false;
11229 if (MachineOperand *SccDef =
11230 SCCValid->findRegisterDefOperand(AMDGPU::SCC, /*TRI=*/nullptr))
11231 SccDef->setIsDead(false);
11232 if (KillsSCC)
11233 KillsSCC->clearRegisterKills(AMDGPU::SCC, /*TRI=*/nullptr);
11234 SCCRedefine->eraseFromParent();
11235 return true;
11236}
11237
11238static bool foldableSelect(const MachineInstr &Def) {
11239 if (Def.getOpcode() != AMDGPU::S_CSELECT_B32 &&
11240 Def.getOpcode() != AMDGPU::S_CSELECT_B64)
11241 return false;
11242 bool Op1IsNonZeroImm =
11243 Def.getOperand(1).isImm() && Def.getOperand(1).getImm() != 0;
11244 bool Op2IsZeroImm =
11245 Def.getOperand(2).isImm() && Def.getOperand(2).getImm() == 0;
11246 if (!Op1IsNonZeroImm || !Op2IsZeroImm)
11247 return false;
11248 return true;
11249}
11250
11251static bool setsSCCIfResultIsZero(const MachineInstr &Def, bool &NeedInversion,
11252 unsigned &NewDefOpc) {
11253 // S_ADD_U32 X, 1 sets SCC on carryout which can only happen if result==0.
11254 // S_ADD_I32 X, 1 can be converted to S_ADD_U32 X, 1 if SCC is dead.
11255 if (Def.getOpcode() != AMDGPU::S_ADD_I32 &&
11256 Def.getOpcode() != AMDGPU::S_ADD_U32)
11257 return false;
11258 const MachineOperand &AddSrc1 = Def.getOperand(1);
11259 const MachineOperand &AddSrc2 = Def.getOperand(2);
11260 int64_t addend;
11261
11262 if ((!AddSrc1.isImm() || AddSrc1.getImm() != 1) &&
11263 (!AddSrc2.isImm() || AddSrc2.getImm() != 1) &&
11264 (!getFoldableImm(&AddSrc1, addend) || addend != 1) &&
11265 (!getFoldableImm(&AddSrc2, addend) || addend != 1))
11266 return false;
11267
11268 if (Def.getOpcode() == AMDGPU::S_ADD_I32) {
11269 const MachineOperand *SccDef =
11270 Def.findRegisterDefOperand(AMDGPU::SCC, /*TRI=*/nullptr);
11271 if (!SccDef->isDead())
11272 return false;
11273 NewDefOpc = AMDGPU::S_ADD_U32;
11274 }
11275 NeedInversion = !NeedInversion;
11276 return true;
11277}
11278
11280 Register SrcReg2, int64_t CmpMask,
11281 int64_t CmpValue,
11282 const MachineRegisterInfo *MRI) const {
11283 if (!SrcReg || SrcReg.isPhysical())
11284 return false;
11285
11286 if (SrcReg2 && !getFoldableImm(SrcReg2, *MRI, CmpValue))
11287 return false;
11288
// Tries to fold a compare of SrcReg against zero into the instruction that
// defines SrcReg. NeedInversion is taken by value and forwarded to
// optimizeSCC; setsSCCIfResultIsZero may flip this local copy first.
// Returns false if CmpValue is non-zero, SrcReg has no defining instruction,
// the def matches none of the three accepted shapes, or optimizeSCC fails.
11289 const auto optimizeCmpSelect = [&CmpInstr, SrcReg, CmpValue, MRI,
11290 this](bool NeedInversion) -> bool {
// Only compares against zero are handled by this lambda.
11291 if (CmpValue != 0)
11292 return false;
11293
11294 MachineInstr *Def = MRI->getVRegDef(SrcReg);
11295 if (!Def)
11296 return false;
11297
11298 // For S_OP that set SCC = DST!=0, do the transformation
11299 //
11300 // s_cmp_[lg|eq]_* (S_OP ...), 0 => (S_OP ...)
11301 //
11302 // For (S_OP ...) that set SCC = DST==0, invert NeedInversion and
11303 // do the transformation:
11304 //
11305 // s_cmp_[lg|eq]_* (S_OP ...), 0 => (S_OP ...)
11306 //
11307 // If foldableSelect, s_cmp_lg_* is redundant because the SCC input value
11308 // for S_CSELECT* already has the same value that will be calculated by
11309 // s_cmp_lg_*
11310 //
11311 // s_cmp_[lg|eq]_* (S_CSELECT* (non-zero imm), 0), 0 => (S_CSELECT*
11312 // (non-zero imm), 0)
11313
// NewDefOpc starts as the current opcode; setsSCCIfResultIsZero overwrites it
// (by reference) when the def has to be rewritten to a different opcode.
11314 unsigned NewDefOpc = Def->getOpcode();
11315 if (!setsSCCIfResultIsNonZero(*Def) &&
11316 !setsSCCIfResultIsZero(*Def, NeedInversion, NewDefOpc) &&
11317 !foldableSelect(*Def))
11318 return false;
11319
11320 if (!optimizeSCC(Def, &CmpInstr, NeedInversion))
11321 return false;
11322
// The opcode change is committed only after optimizeSCC has succeeded, so a
// failed attempt leaves Def untouched by this lambda.
11323 if (NewDefOpc != Def->getOpcode())
11324 Def->setDesc(get(NewDefOpc));
11325
11326 // If s_or_b32 result, sY, is unused (i.e. it is effectively a 64-bit
11327 // s_cmp_lg of a register pair) and the inputs are the hi and lo-halves of a
11328 // 64-bit foldableSelect then delete s_or_b32 in the sequence:
11329 // sX = s_cselect_b64 (non-zero imm), 0
11330 // sLo = copy sX.sub0
11331 // sHi = copy sX.sub1
11332 // sY = s_or_b32 sLo, sHi
11333 if (Def->getOpcode() == AMDGPU::S_OR_B32 &&
11334 MRI->use_nodbg_empty(Def->getOperand(0).getReg())) {
11335 const MachineOperand &OrOpnd1 = Def->getOperand(1);
11336 const MachineOperand &OrOpnd2 = Def->getOperand(2);
11337 if (OrOpnd1.isReg() && OrOpnd2.isReg()) {
11338 MachineInstr *Def1 = MRI->getVRegDef(OrOpnd1.getReg());
11339 MachineInstr *Def2 = MRI->getVRegDef(OrOpnd2.getReg());
// Operand 1 must be a COPY of sub0 and operand 2 a COPY of sub1 of the same
// source register; the swapped operand order is not matched here.
11340 if (Def1 && Def1->getOpcode() == AMDGPU::COPY && Def2 &&
11341 Def2->getOpcode() == AMDGPU::COPY && Def1->getOperand(1).isReg() &&
11342 Def2->getOperand(1).isReg() &&
11343 Def1->getOperand(1).getSubReg() == AMDGPU::sub0 &&
11344 Def2->getOperand(1).getSubReg() == AMDGPU::sub1 &&
11345 Def1->getOperand(1).getReg() == Def2->getOperand(1).getReg()) {
11346 MachineInstr *Select = MRI->getVRegDef(Def1->getOperand(1).getReg());
// The result of this second optimizeSCC call is not checked: the lambda
// returns true below whether or not the s_or_b32 could be folded as well.
11347 if (Select && foldableSelect(*Select))
11348 optimizeSCC(Select, Def, /*NeedInversion=*/false);
11349 }
11350 }
11351 }
11352 return true;
11353 };
11354
// Tries to fold a compare of SrcReg into the S_AND_B32/S_AND_B64 that defines
// it, when one AND operand is a single-bit mask (1 << n).
//   ExpectedValue - shifted left by n below and then matched against CmpValue.
//   SrcSize       - bit width; truncates the mask and picks the 32- or 64-bit
//                   S_BITCMP opcode.
//   IsReversible  - if set, CmpValue may also equal (ExpectedValue << n) ^ Mask,
//                   which selects the S_BITCMP0 form.
//   IsSigned      - if set, a mask on the top bit (n == SrcSize - 1) is
//                   rejected.
// Returns true only if optimizeSCC accepted the AND/compare pair.
11355 const auto optimizeCmpAnd = [&CmpInstr, SrcReg, CmpValue, MRI,
11356 this](int64_t ExpectedValue, unsigned SrcSize,
11357 bool IsReversible, bool IsSigned) -> bool {
11358 // s_cmp_eq_u32 (s_and_b32 $src, 1 << n), 1 << n => s_and_b32 $src, 1 << n
11359 // s_cmp_eq_i32 (s_and_b32 $src, 1 << n), 1 << n => s_and_b32 $src, 1 << n
11360 // s_cmp_ge_u32 (s_and_b32 $src, 1 << n), 1 << n => s_and_b32 $src, 1 << n
11361 // s_cmp_ge_i32 (s_and_b32 $src, 1 << n), 1 << n => s_and_b32 $src, 1 << n
11362 // s_cmp_eq_u64 (s_and_b64 $src, 1 << n), 1 << n => s_and_b64 $src, 1 << n
11363 // s_cmp_lg_u32 (s_and_b32 $src, 1 << n), 0 => s_and_b32 $src, 1 << n
11364 // s_cmp_lg_i32 (s_and_b32 $src, 1 << n), 0 => s_and_b32 $src, 1 << n
11365 // s_cmp_gt_u32 (s_and_b32 $src, 1 << n), 0 => s_and_b32 $src, 1 << n
11366 // s_cmp_gt_i32 (s_and_b32 $src, 1 << n), 0 => s_and_b32 $src, 1 << n
11367 // s_cmp_lg_u64 (s_and_b64 $src, 1 << n), 0 => s_and_b64 $src, 1 << n
11368 //
11369 // Signed ge/gt are not used for the sign bit.
11370 //
11371 // If result of the AND is unused except in the compare:
11372 // s_and_b(32|64) $src, 1 << n => s_bitcmp1_b(32|64) $src, n
11373 //
11374 // s_cmp_eq_u32 (s_and_b32 $src, 1 << n), 0 => s_bitcmp0_b32 $src, n
11375 // s_cmp_eq_i32 (s_and_b32 $src, 1 << n), 0 => s_bitcmp0_b32 $src, n
11376 // s_cmp_eq_u64 (s_and_b64 $src, 1 << n), 0 => s_bitcmp0_b64 $src, n
11377 // s_cmp_lg_u32 (s_and_b32 $src, 1 << n), 1 << n => s_bitcmp0_b32 $src, n
11378 // s_cmp_lg_i32 (s_and_b32 $src, 1 << n), 1 << n => s_bitcmp0_b32 $src, n
11379 // s_cmp_lg_u64 (s_and_b64 $src, 1 << n), 1 << n => s_bitcmp0_b64 $src, n
11380
11381 MachineInstr *Def = MRI->getVRegDef(SrcReg);
11382 if (!Def)
11383 return false;
11384
11385 if (Def->getOpcode() != AMDGPU::S_AND_B32 &&
11386 Def->getOpcode() != AMDGPU::S_AND_B64)
11387 return false;
11388
// isMask stores the operand's constant (literal, or found via getFoldableImm)
// into Mask, truncates it to SrcSize bits, and reports whether exactly one
// bit remains set.
11389 int64_t Mask;
11390 const auto isMask = [&Mask, SrcSize](const MachineOperand *MO) -> bool {
11391 if (MO->isImm())
11392 Mask = MO->getImm();
11393 else if (!getFoldableImm(MO, Mask))
11394 return false;
11395 Mask &= maxUIntN(SrcSize);
11396 return isPowerOf2_64(Mask);
11397 };
11398
// After this chain SrcOp refers to the AND operand that is *not* the mask.
11399 MachineOperand *SrcOp = &Def->getOperand(1);
11400 if (isMask(SrcOp))
11401 SrcOp = &Def->getOperand(2);
11402 else if (isMask(&Def->getOperand(2)))
11403 SrcOp = &Def->getOperand(1);
11404 else
11405 return false;
11406
11407 // A valid Mask is required to have a single bit set, hence a non-zero and
11408 // power-of-two value. This verifies that we will not do 64-bit shift below.
11409 assert(llvm::has_single_bit<uint64_t>(Mask) && "Invalid mask.");
11410 unsigned BitNo = llvm::countr_zero((uint64_t)Mask);
11411 if (IsSigned && BitNo == SrcSize - 1)
11412 return false;
11413
11414 ExpectedValue <<= BitNo;
11415
// A mismatch with ExpectedValue is tolerated only for reversible compares
// whose constant equals ExpectedValue with the mask bit toggled.
11416 bool IsReversedCC = false;
11417 if (CmpValue != ExpectedValue) {
11418 if (!IsReversible)
11419 return false;
11420 IsReversedCC = CmpValue == (ExpectedValue ^ Mask);
11421 if (!IsReversedCC)
11422 return false;
11423 }
11424
// The reversed form is only emitted as S_BITCMP0 (which replaces the AND), so
// bail out unless the AND result has exactly one non-debug use.
11425 Register DefReg = Def->getOperand(0).getReg();
11426 if (IsReversedCC && !MRI->hasOneNonDBGUse(DefReg))
11427 return false;
11428
11429 if (!optimizeSCC(Def, &CmpInstr, /*NeedInversion=*/false))
11430 return false;
11431
// If the AND result still has non-debug users, the AND is kept as it is.
11432 if (!MRI->use_nodbg_empty(DefReg)) {
11433 assert(!IsReversedCC);
11434 return true;
11435 }
11436
11437 // Replace AND with unused result with a S_BITCMP.
11438 MachineBasicBlock *MBB = Def->getParent();
11439
11440 unsigned NewOpc = (SrcSize == 32) ? IsReversedCC ? AMDGPU::S_BITCMP0_B32
11441 : AMDGPU::S_BITCMP1_B32
11442 : IsReversedCC ? AMDGPU::S_BITCMP0_B64
11443 : AMDGPU::S_BITCMP1_B64;
11444
// The new instruction is inserted before Def, reusing Def's debug location
// and its non-mask operand; Def is erased afterwards.
11445 BuildMI(*MBB, Def, Def->getDebugLoc(), get(NewOpc))
11446 .add(*SrcOp)
11447 .addImm(BitNo);
11448 Def->eraseFromParent();
11449
11450 return true;
11451 };
11452
// Dispatch on the compare opcode. The positional arguments of optimizeCmpAnd
// are (ExpectedValue, SrcSize, IsReversible, IsSigned):
//   - EQ/GE pass ExpectedValue = 1, LG/GT pass ExpectedValue = 0;
//   - only EQ/LG are marked reversible;
//   - only the signed GE/GT forms set IsSigned.
// optimizeCmpSelect is tried as a fallback for the 32-bit EQ/LG forms and for
// S_CMP_LG_U64, but not for S_CMP_EQ_U64. Opcodes not listed fall through to
// the code after the switch.
11453 switch (CmpInstr.getOpcode()) {
11454 default:
11455 break;
11456 case AMDGPU::S_CMP_EQ_U32:
11457 case AMDGPU::S_CMP_EQ_I32:
11458 case AMDGPU::S_CMPK_EQ_U32:
11459 case AMDGPU::S_CMPK_EQ_I32:
11460 return optimizeCmpAnd(1, 32, true, false) ||
11461 optimizeCmpSelect(/*NeedInversion=*/true);
11462 case AMDGPU::S_CMP_GE_U32:
11463 case AMDGPU::S_CMPK_GE_U32:
11464 return optimizeCmpAnd(1, 32, false, false);
11465 case AMDGPU::S_CMP_GE_I32:
11466 case AMDGPU::S_CMPK_GE_I32:
11467 return optimizeCmpAnd(1, 32, false, true);
11468 case AMDGPU::S_CMP_EQ_U64:
11469 return optimizeCmpAnd(1, 64, true, false);
11470 case AMDGPU::S_CMP_LG_U32:
11471 case AMDGPU::S_CMP_LG_I32:
11472 case AMDGPU::S_CMPK_LG_U32:
11473 case AMDGPU::S_CMPK_LG_I32:
11474 return optimizeCmpAnd(0, 32, true, false) ||
11475 optimizeCmpSelect(/*NeedInversion=*/false);
11476 case AMDGPU::S_CMP_GT_U32:
11477 case AMDGPU::S_CMPK_GT_U32:
11478 return optimizeCmpAnd(0, 32, false, false);
11479 case AMDGPU::S_CMP_GT_I32:
11480 case AMDGPU::S_CMPK_GT_I32:
11481 return optimizeCmpAnd(0, 32, false, true);
11482 case AMDGPU::S_CMP_LG_U64:
11483 return optimizeCmpAnd(0, 64, true, false) ||
11484 optimizeCmpSelect(/*NeedInversion=*/false);
11485 }
11486
11487 return false;
11488}
11489
11491 AMDGPU::OpName OpName) const {
11492 if (!ST.needsAlignedVGPRs())
11493 return;
11494
11495 int OpNo = AMDGPU::getNamedOperandIdx(MI.getOpcode(), OpName);
11496 if (OpNo < 0)
11497 return;
11498 MachineOperand &Op = MI.getOperand(OpNo);
11499 if (getOpSize(MI, OpNo) > 4)
11500 return;
11501
11502 // Add implicit aligned super-reg to force alignment on the data operand.
11503 const DebugLoc &DL = MI.getDebugLoc();
11504 MachineBasicBlock *BB = MI.getParent();
11506 Register DataReg = Op.getReg();
11507 bool IsAGPR = RI.isAGPR(MRI, DataReg);
11509 IsAGPR ? &AMDGPU::AGPR_32RegClass : &AMDGPU::VGPR_32RegClass);
11510 BuildMI(*BB, MI, DL, get(AMDGPU::IMPLICIT_DEF), Undef);
11511 Register NewVR =
11512 MRI.createVirtualRegister(IsAGPR ? &AMDGPU::AReg_64_Align2RegClass
11513 : &AMDGPU::VReg_64_Align2RegClass);
11514 BuildMI(*BB, MI, DL, get(AMDGPU::REG_SEQUENCE), NewVR)
11515 .addReg(DataReg, {}, Op.getSubReg())
11516 .addImm(AMDGPU::sub0)
11517 .addReg(Undef)
11518 .addImm(AMDGPU::sub1);
11519 Op.setReg(NewVR);
11520 Op.setSubReg(AMDGPU::sub0);
11521 MI.addOperand(MachineOperand::CreateReg(NewVR, false, true));
11522}
11523
11525 if (isIGLP(*MI))
11526 return false;
11527
11529}
11530
11532 if (!isWMMA(MI) && !isSWMMAC(MI))
11533 return false;
11534
11535 if (ST.hasGFX1250Insts())
11536 return AMDGPU::getWMMAIsXDL(MI.getOpcode());
11537
11538 return true;
11539}
11540
11542 unsigned Opcode = MI.getOpcode();
11543
11544 if (AMDGPU::isGFX12Plus(ST))
11545 return isDOT(MI) || isXDLWMMA(MI);
11546
11547 if (!isMAI(MI) || isDGEMM(Opcode) ||
11548 Opcode == AMDGPU::V_ACCVGPR_WRITE_B32_e64 ||
11549 Opcode == AMDGPU::V_ACCVGPR_READ_B32_e64)
11550 return false;
11551
11552 if (!ST.hasGFX940Insts())
11553 return true;
11554
11555 return AMDGPU::getMAIIsGFX940XDL(Opcode);
11556}
MachineInstrBuilder & UseMI
MachineInstrBuilder MachineInstrBuilder & DefMI
static const TargetRegisterClass * getRegClass(const MachineInstr &MI, Register Reg)
unsigned RegSize
assert(UImm &&(UImm !=~static_cast< T >(0)) &&"Invalid immediate!")
Contains the definition of a TargetInstrInfo class that is common to all AMD GPUs.
AMDGPU Register Bank Select
MachineBasicBlock & MBB
MachineBasicBlock MachineBasicBlock::iterator DebugLoc DL
MachineBasicBlock MachineBasicBlock::iterator MBBI
static GCRegistry::Add< StatepointGC > D("statepoint-example", "an example strategy for statepoint")
static GCRegistry::Add< CoreCLRGC > E("coreclr", "CoreCLR-compatible GC")
AMD GCN specific subclass of TargetSubtarget.
Declares convenience wrapper classes for interpreting MachineInstr instances as specific generic oper...
const HexagonInstrInfo * TII
IRTranslator LLVM IR MI
std::pair< Instruction::BinaryOps, Value * > OffsetOp
Find all possible pairs (BinOp, RHS) that BinOp V, RHS can be simplified.
const size_t AbstractManglingParser< Derived, Alloc >::NumOps
const AbstractManglingParser< Derived, Alloc >::OperatorInfo AbstractManglingParser< Derived, Alloc >::Ops[]
#define F(x, y, z)
Definition MD5.cpp:54
#define I(x, y, z)
Definition MD5.cpp:57
static bool isUndef(const MachineInstr &MI)
TargetInstrInfo::RegSubRegPair RegSubRegPair
Register Reg
Register const TargetRegisterInfo * TRI
Promote Memory to Register
Definition Mem2Reg.cpp:110
static MCRegister getReg(const MCDisassembler *D, unsigned RC, unsigned RegNo)
MachineInstr unsigned OpIdx
uint64_t High
uint64_t IntrinsicInst * II
#define P(N)
R600 Clause Merge
const SmallVectorImpl< MachineOperand > MachineBasicBlock * TBB
const SmallVectorImpl< MachineOperand > & Cond
This file declares the machine register scavenger class.
static cl::opt< bool > Fix16BitCopies("amdgpu-fix-16-bit-physreg-copies", cl::desc("Fix copies between 32 and 16 bit registers by extending to 32 bit"), cl::init(true), cl::ReallyHidden)
static void expandSGPRCopy(const SIInstrInfo &TII, MachineBasicBlock &MBB, MachineBasicBlock::iterator MI, const DebugLoc &DL, MCRegister DestReg, MCRegister SrcReg, bool KillSrc, const TargetRegisterClass *RC, bool Forward)
static unsigned getNewFMAInst(const GCNSubtarget &ST, unsigned Opc)
static void indirectCopyToAGPR(const SIInstrInfo &TII, MachineBasicBlock &MBB, MachineBasicBlock::iterator MI, const DebugLoc &DL, MCRegister DestReg, MCRegister SrcReg, bool KillSrc, RegScavenger &RS, bool RegsOverlap, Register ImpDefSuperReg=Register(), Register ImpUseSuperReg=Register())
Handle copying from SGPR to AGPR, or from AGPR to AGPR on GFX908.
static unsigned getIndirectSGPRWriteMovRelPseudo32(unsigned VecSize)
static bool compareMachineOp(const MachineOperand &Op0, const MachineOperand &Op1)
static bool isStride64(unsigned Opc)
static MachineBasicBlock * generateWaterFallLoop(const SIInstrInfo &TII, MachineInstr &MI, ArrayRef< MachineOperand * > ScalarOps, MachineDominatorTree *MDT, MachineBasicBlock::iterator Begin=nullptr, MachineBasicBlock::iterator End=nullptr, ArrayRef< Register > PhySGPRs={})
#define GENERATE_RENAMED_GFX9_CASES(OPCODE)
static std::tuple< unsigned, unsigned > extractRsrcPtr(const SIInstrInfo &TII, MachineInstr &MI, MachineOperand &Rsrc)
static bool followSubRegDef(MachineInstr &MI, TargetInstrInfo::RegSubRegPair &RSR)
static unsigned getIndirectSGPRWriteMovRelPseudo64(unsigned VecSize)
static MachineInstr * swapImmOperands(MachineInstr &MI, MachineOperand &NonRegOp1, MachineOperand &NonRegOp2)
static void copyFlagsToImplicitVCC(MachineInstr &MI, const MachineOperand &Orig)
static bool offsetsDoNotOverlap(LocationSize WidthA, int OffsetA, LocationSize WidthB, int OffsetB)
static unsigned getWWMRegSpillSaveOpcode(unsigned Size, bool IsVectorSuperClass)
static bool memOpsHaveSameBaseOperands(ArrayRef< const MachineOperand * > BaseOps1, ArrayRef< const MachineOperand * > BaseOps2)
static unsigned getWWMRegSpillRestoreOpcode(unsigned Size, bool IsVectorSuperClass)
static unsigned getSGPRSpillSaveOpcode(unsigned Size, bool NeedsCFI)
static bool setsSCCIfResultIsZero(const MachineInstr &Def, bool &NeedInversion, unsigned &NewDefOpc)
static bool isSCCDeadOnExit(MachineBasicBlock *MBB)
static bool getFoldableImm(Register Reg, const MachineRegisterInfo &MRI, int64_t &Imm, MachineInstr **DefMI=nullptr)
static unsigned getIndirectVGPRWriteMovRelPseudoOpc(unsigned VecSize)
static unsigned subtargetEncodingFamily(const GCNSubtarget &ST)
static void preserveCondRegFlags(MachineOperand &CondReg, const MachineOperand &OrigCond)
static Register findImplicitSGPRRead(const MachineInstr &MI)
static unsigned getNewFMAAKInst(const GCNSubtarget &ST, unsigned Opc)
static cl::opt< unsigned > BranchOffsetBits("amdgpu-s-branch-bits", cl::ReallyHidden, cl::init(16), cl::desc("Restrict range of branch instructions (DEBUG)"))
static void updateLiveVariables(LiveVariables *LV, MachineInstr &MI, MachineInstr &NewMI)
static unsigned getAVSpillSaveOpcode(unsigned Size, bool NeedsCFI)
static bool memOpsHaveSameBasePtr(const MachineInstr &MI1, ArrayRef< const MachineOperand * > BaseOps1, const MachineInstr &MI2, ArrayRef< const MachineOperand * > BaseOps2)
static unsigned getSGPRSpillRestoreOpcode(unsigned Size)
static bool isRegOrFI(const MachineOperand &MO)
static unsigned getVGPRSpillSaveOpcode(unsigned Size, bool NeedsCFI)
static constexpr AMDGPU::OpName ModifierOpNames[]
static void reportIllegalCopy(const SIInstrInfo *TII, MachineBasicBlock &MBB, MachineBasicBlock::iterator MI, const DebugLoc &DL, MCRegister DestReg, MCRegister SrcReg, bool KillSrc, const char *Msg="illegal VGPR to SGPR copy")
static MachineInstr * swapRegAndNonRegOperand(MachineInstr &MI, MachineOperand &RegOp, MachineOperand &NonRegOp)
static bool shouldReadExec(const MachineInstr &MI)
static unsigned getNewFMAMKInst(const GCNSubtarget &ST, unsigned Opc)
static bool isRenamedInGFX9(int Opcode)
static TargetInstrInfo::RegSubRegPair getRegOrUndef(const MachineOperand &RegOpnd)
static bool changesVGPRIndexingMode(const MachineInstr &MI)
static bool isSubRegOf(const SIRegisterInfo &TRI, const MachineOperand &SuperVec, const MachineOperand &SubReg)
static bool foldableSelect(const MachineInstr &Def)
static bool nodesHaveSameOperandValue(SDNode *N0, SDNode *N1, AMDGPU::OpName OpName)
Returns true if both nodes have the same value for the given operand Op, or if both nodes do not have...
static unsigned getNumOperandsNoGlue(SDNode *Node)
static bool canRemat(const MachineInstr &MI)
static unsigned getAVSpillRestoreOpcode(unsigned Size)
static void emitLoadScalarOpsFromVGPRLoop(const SIInstrInfo &TII, MachineRegisterInfo &MRI, MachineBasicBlock &PredBB, MachineBasicBlock &LoopBB, MachineBasicBlock &BodyBB, const DebugLoc &DL, ArrayRef< MachineOperand * > ScalarOps, ArrayRef< Register > PhySGPRs={})
static unsigned getVGPRSpillRestoreOpcode(unsigned Size)
Interface definition for SIInstrInfo.
bool IsDead
This file contains some templates that are useful if you are working with the STL at all.
static bool contains(SmallPtrSetImpl< ConstantExpr * > &Cache, ConstantExpr *Expr, Constant *C)
Definition Value.cpp:484
#define LLVM_DEBUG(...)
Definition Debug.h:119
static const LaneMaskConstants & get(const GCNSubtarget &ST)
static LLVM_ABI Semantics SemanticsToEnum(const llvm::fltSemantics &Sem)
Definition APFloat.cpp:160
Class for arbitrary precision integers.
Definition APInt.h:78
int64_t getSExtValue() const
Get sign extended value.
Definition APInt.h:1585
Represent a constant reference to an array (0 or more elements consecutively in memory),...
Definition ArrayRef.h:40
const T & front() const
Get the first element.
Definition ArrayRef.h:144
size_t size() const
Get the array size.
Definition ArrayRef.h:141
bool empty() const
Check if the array is empty.
Definition ArrayRef.h:136
uint64_t getZExtValue() const
A debug info location.
Definition DebugLoc.h:126
std::pair< iterator, bool > try_emplace(KeyT &&Key, Ts &&...Args)
Definition DenseMap.h:301
Diagnostic information for unsupported feature in backend.
void changeImmediateDominator(DomTreeNodeBase< NodeT > *N, DomTreeNodeBase< NodeT > *NewIDom)
changeImmediateDominator - This method is used to update the dominator tree information when a node's...
DomTreeNodeBase< NodeT > * addNewBlock(NodeT *BB, NodeT *DomBB)
Add a new node to the dominator tree information.
bool properlyDominates(const DomTreeNodeBase< NodeT > *A, const DomTreeNodeBase< NodeT > *B) const
properlyDominates - Returns true iff A dominates B and A != B.
CallingConv::ID getCallingConv() const
getCallingConv()/setCallingConv(CC) - These method get and set the calling convention of this functio...
Definition Function.h:272
LLVMContext & getContext() const
getContext - Return a reference to the LLVMContext associated with this function.
Definition Function.cpp:353
CycleT * getCycle(const BlockT *Block) const
Find the innermost cycle containing a given block.
void getExitingBlocks(SmallVectorImpl< BlockT * > &TmpStorage) const
Return all blocks of this cycle that have successor outside of this cycle.
bool contains(const BlockT *Block) const
Return whether Block is contained in the cycle.
const GenericCycle * getParentCycle() const
Itinerary data supplied by a subtarget to be used by a target.
constexpr unsigned getAddressSpace() const
This is an important class for using LLVM in a threaded context.
Definition LLVMContext.h:68
LiveInterval - This class represents the liveness of a register, or stack slot.
bool hasInterval(Register Reg) const
SlotIndex getInstructionIndex(const MachineInstr &Instr) const
Returns the base index of the given instruction.
LiveInterval & getInterval(Register Reg)
LLVM_ABI bool shrinkToUses(LiveInterval *li, SmallVectorImpl< MachineInstr * > *dead=nullptr)
After removing some uses of a register, shrink its live range to just the remaining uses.
SlotIndex ReplaceMachineInstrInMaps(MachineInstr &MI, MachineInstr &NewMI)
This class represents the liveness of a register, stack slot, etc.
LLVM_ABI void replaceKillInstruction(Register Reg, MachineInstr &OldMI, MachineInstr &NewMI)
replaceKillInstruction - Update register kill info by replacing a kill instruction with a new one.
LLVM_ABI VarInfo & getVarInfo(Register Reg)
getVarInfo - Return the VarInfo structure for the specified VIRTUAL register.
bool hasValue() const
static LocationSize precise(uint64_t Value)
TypeSize getValue() const
static const MCBinaryExpr * createAnd(const MCExpr *LHS, const MCExpr *RHS, MCContext &Ctx)
Definition MCExpr.h:347
static const MCBinaryExpr * createAShr(const MCExpr *LHS, const MCExpr *RHS, MCContext &Ctx)
Definition MCExpr.h:417
static const MCBinaryExpr * createSub(const MCExpr *LHS, const MCExpr *RHS, MCContext &Ctx)
Definition MCExpr.h:427
static LLVM_ABI const MCConstantExpr * create(int64_t Value, MCContext &Ctx, bool PrintInHex=false, unsigned SizeInBytes=0)
Definition MCExpr.cpp:212
Describe properties that are true of each instruction in the target description file.
unsigned getNumOperands() const
Return the number of declared MachineOperands for this MachineInstruction.
ArrayRef< MCOperandInfo > operands() const
unsigned getNumDefs() const
Return the number of MachineOperands that are register definitions.
unsigned getSize() const
Return the number of bytes in the encoding of this instruction, or zero if the encoding size cannot b...
ArrayRef< MCPhysReg > implicit_uses() const
Return a list of registers that are potentially read by any instance of this machine instruction.
unsigned getOpcode() const
Return the opcode number for this descriptor.
This holds information about one operand of a machine instruction, indicating the register class for ...
Definition MCInstrDesc.h:86
uint8_t OperandType
Information about the type of the operand.
Definition MCInstrDesc.h:98
int16_t RegClass
This specifies the register class enumeration of the operand if the operand is a register.
Definition MCInstrDesc.h:92
Wrapper class representing physical registers. Should be passed by value.
Definition MCRegister.h:41
static const MCSymbolRefExpr * create(const MCSymbol *Symbol, MCContext &Ctx, SMLoc Loc=SMLoc())
Definition MCExpr.h:213
MCSymbol - Instances of this class represent a symbol name in the MC file, and MCSymbols are created ...
Definition MCSymbol.h:42
LLVM_ABI void setVariableValue(const MCExpr *Value)
Definition MCSymbol.cpp:50
Helper class for constructing bundles of MachineInstrs.
MachineBasicBlock::instr_iterator begin() const
Return an iterator to the first bundled instruction.
MIBundleBuilder & append(MachineInstr *MI)
Insert MI into MBB by appending it to the instructions in the bundle.
MIRFormater - Interface to format MIR operand based on target.
LLVM_ABI void transferSuccessorsAndUpdatePHIs(MachineBasicBlock *FromMBB)
Transfers all the successors, as in transferSuccessors, and update PHI operands in the successor bloc...
LLVM_ABI MCSymbol * getSymbol() const
Return the MCSymbol for this basic block.
LLVM_ABI instr_iterator insert(instr_iterator I, MachineInstr *M)
Insert MI into the instruction list before I, possibly inside a bundle.
LLVM_ABI LivenessQueryResult computeRegisterLiveness(const TargetRegisterInfo *TRI, MCRegister Reg, const_iterator Before, unsigned Neighborhood=10) const
Return whether (physical) register Reg has been defined and not killed as of just before Before.
LLVM_ABI iterator getFirstTerminator()
Returns an iterator to the first terminator instruction of this basic block.
LLVM_ABI void addSuccessor(MachineBasicBlock *Succ, BranchProbability Prob=BranchProbability::getUnknown())
Add Succ as a successor of this MachineBasicBlock.
MachineInstrBundleIterator< MachineInstr, true > reverse_iterator
Instructions::const_iterator const_instr_iterator
const MachineFunction * getParent() const
Return the MachineFunction containing this basic block.
iterator_range< succ_iterator > successors()
void splice(iterator Where, MachineBasicBlock *Other, iterator From)
Take an instruction from MBB 'Other' at the position From, and insert it into this MBB right before '...
MachineInstrBundleIterator< MachineInstr > iterator
@ LQR_Dead
Register is known to be fully dead.
DominatorTree Class - Concrete subclass of DominatorTreeBase that is used to compute a normal dominat...
The MachineFrameInfo class represents an abstract stack frame until prolog/epilog code is inserted.
const TargetSubtargetInfo & getSubtarget() const
getSubtarget - Return the subtarget for which this machine code is being compiled.
MachineMemOperand * getMachineMemOperand(MachinePointerInfo PtrInfo, MachineMemOperand::Flags f, LLT MemTy, Align base_alignment, const AAMDNodes &AAInfo=AAMDNodes(), const MDNode *Ranges=nullptr, SyncScope::ID SSID=SyncScope::System, AtomicOrdering Ordering=AtomicOrdering::NotAtomic, AtomicOrdering FailureOrdering=AtomicOrdering::NotAtomic)
getMachineMemOperand - Allocate a new MachineMemOperand.
MachineFrameInfo & getFrameInfo()
getFrameInfo - Return the frame info object for the current function.
void push_back(MachineBasicBlock *MBB)
MCContext & getContext() const
MachineRegisterInfo & getRegInfo()
getRegInfo - Return information about the registers currently in use.
Function & getFunction()
Return the LLVM function that this machine code represents.
BasicBlockListType::iterator iterator
Ty * getInfo()
getInfo - Keep track of various per-function pieces of information for backends that would like to do...
MachineBasicBlock * CreateMachineBasicBlock(const BasicBlock *BB=nullptr, std::optional< UniqueBBID > BBID=std::nullopt)
CreateMachineInstr - Allocate a new MachineInstr.
void insert(iterator MBBI, MachineBasicBlock *MBB)
const TargetMachine & getTarget() const
getTarget - Return the target machine this machine code is compiled with
const MachineInstrBuilder & addUse(Register RegNo, RegState Flags={}, unsigned SubReg=0) const
Add a virtual register use operand.
const MachineInstrBuilder & addReg(Register RegNo, RegState Flags={}, unsigned SubReg=0) const
Add a new virtual register operand.
const MachineInstrBuilder & addImm(int64_t Val) const
Add a new immediate operand.
const MachineInstrBuilder & add(const MachineOperand &MO) const
const MachineInstrBuilder & addSym(MCSymbol *Sym, unsigned char TargetFlags=0) const
const MachineInstrBuilder & addFrameIndex(int Idx) const
const MachineInstrBuilder & addMBB(MachineBasicBlock *MBB, unsigned TargetFlags=0) const
const MachineInstrBuilder & addDef(Register RegNo, RegState Flags={}, unsigned SubReg=0) const
Add a virtual register definition operand.
const MachineInstrBuilder & cloneMemRefs(const MachineInstr &OtherMI) const
const MachineInstrBuilder & setMIFlags(unsigned Flags) const
const MachineInstrBuilder & copyImplicitOps(const MachineInstr &OtherMI) const
Copy all the implicit operands from OtherMI onto this one.
const MachineInstrBuilder & addMemOperand(MachineMemOperand *MMO) const
MachineInstr * getInstr() const
If conversion operators fail, use this method to get the MachineInstr explicitly.
Representation of each machine instruction.
unsigned getOpcode() const
Returns the opcode of this MachineInstr.
bool mayLoadOrStore(QueryType Type=AnyInBundle) const
Return true if this instruction could possibly read or modify memory.
bool isCopy() const
const MachineBasicBlock * getParent() const
LLVM_ABI void addImplicitDefUseOperands(MachineFunction &MF)
Add all implicit def and use operands to this instruction.
bool isBundle() const
LLVM_ABI void addOperand(MachineFunction &MF, const MachineOperand &Op)
Add the specified operand to the instruction.
LLVM_ABI unsigned getNumExplicitOperands() const
Returns the number of non-implicit operands.
mop_range implicit_operands()
bool modifiesRegister(Register Reg, const TargetRegisterInfo *TRI) const
Return true if the MachineInstr modifies (fully define or partially define) the specified register.
bool mayLoad(QueryType Type=AnyInBundle) const
Return true if this instruction could possibly read memory.
LLVM_ABI bool hasUnmodeledSideEffects() const
Return true if this instruction has side effects that are not modeled by mayLoad / mayStore,...
void untieRegOperand(unsigned OpIdx)
Break any tie involving OpIdx.
LLVM_ABI void setDesc(const MCInstrDesc &TID)
Replace the instruction descriptor (thus opcode) of the current instruction with a new one.
LLVM_ABI void eraseFromBundle()
Unlink 'this' from its basic block and delete it.
bool hasOneMemOperand() const
Return true if this instruction has exactly one MachineMemOperand.
mop_range explicit_operands()
LLVM_ABI void tieOperands(unsigned DefIdx, unsigned UseIdx)
Add a tie between the register operands at DefIdx and UseIdx.
mmo_iterator memoperands_begin() const
Access to memory operands of the instruction.
LLVM_ABI bool hasOrderedMemoryRef() const
Return true if this instruction may have an ordered or volatile memory reference, or if the informati...
LLVM_ABI const MachineFunction * getMF() const
Return the function that contains the basic block that this instruction belongs to.
ArrayRef< MachineMemOperand * > memoperands() const
Access to memory operands of the instruction.
bool mayStore(QueryType Type=AnyInBundle) const
Return true if this instruction could possibly modify memory.
const DebugLoc & getDebugLoc() const
Returns the debug location id of this MachineInstr.
bool isMoveImmediate(QueryType Type=IgnoreBundle) const
Return true if this instruction is a move immediate (including conditional moves) instruction.
LLVM_ABI void removeOperand(unsigned OpNo)
Erase an operand from an instruction, leaving it with one fewer operand than it started with.
filtered_mop_range all_uses()
Returns an iterator range over all operands that are (explicit or implicit) register uses.
LLVM_ABI void setPostInstrSymbol(MachineFunction &MF, MCSymbol *Symbol)
Set a symbol that will be emitted just after the instruction itself.
LLVM_ABI void clearRegisterKills(Register Reg, const TargetRegisterInfo *RegInfo)
Clear all kill flags affecting Reg.
const MachineOperand & getOperand(unsigned i) const
uint32_t getFlags() const
Return the MI flags bitvector.
LLVM_ABI int findRegisterDefOperandIdx(Register Reg, const TargetRegisterInfo *TRI, bool isDead=false, bool Overlap=false) const
Returns the operand index that is a def of the specified register or -1 if it is not found.
LLVM_ABI MachineInstrBundleIterator< MachineInstr > eraseFromParent()
Unlink 'this' from the containing basic block and delete it.
MachineOperand * findRegisterDefOperand(Register Reg, const TargetRegisterInfo *TRI, bool isDead=false, bool Overlap=false)
Wrapper for findRegisterDefOperandIdx, it returns a pointer to the MachineOperand rather than an inde...
A description of a memory reference used in the backend.
unsigned getAddrSpace() const
@ MOLoad
The memory access reads data.
@ MOStore
The memory access writes data.
MachineOperand class - Representation of each machine instruction operand.
void setSubReg(unsigned subReg)
unsigned getSubReg() const
LLVM_ABI unsigned getOperandNo() const
Returns the index of this operand in the instruction that it belongs to.
const GlobalValue * getGlobal() const
LLVM_ABI void ChangeToFrameIndex(int Idx, unsigned TargetFlags=0)
Replace this operand with a frame index.
void setImm(int64_t immVal)
int64_t getImm() const
bool isReg() const
isReg - Tests if this is a MO_Register operand.
void setIsDead(bool Val=true)
LLVM_ABI void setReg(Register Reg)
Change the register this operand corresponds to.
bool isImm() const
isImm - Tests if this is a MO_Immediate operand.
LLVM_ABI void ChangeToImmediate(int64_t ImmVal, unsigned TargetFlags=0)
ChangeToImmediate - Replace this operand with a new immediate operand of the specified value.
LLVM_ABI void ChangeToGA(const GlobalValue *GV, int64_t Offset, unsigned TargetFlags=0)
ChangeToGA - Replace this operand with a new global address operand.
void setIsKill(bool Val=true)
LLVM_ABI void ChangeToRegister(Register Reg, bool isDef, bool isImp=false, bool isKill=false, bool isDead=false, bool isUndef=false, bool isDebug=false)
ChangeToRegister - Replace this operand with a new register operand of the specified value.
MachineInstr * getParent()
getParent - Return the instruction that this operand belongs to.
void setOffset(int64_t Offset)
unsigned getTargetFlags() const
static MachineOperand CreateImm(int64_t Val)
bool isGlobal() const
isGlobal - Tests if this is a MO_GlobalAddress operand.
MachineOperandType getType() const
getType - Returns the MachineOperandType for this operand.
void setIsUndef(bool Val=true)
Register getReg() const
getReg - Returns the register number.
bool isTargetIndex() const
isTargetIndex - Tests if this is a MO_TargetIndex operand.
void setTargetFlags(unsigned F)
bool isFI() const
isFI - Tests if this is a MO_FrameIndex operand.
LLVM_ABI bool isIdenticalTo(const MachineOperand &Other) const
Returns true if this operand is identical to the specified operand except for liveness related flags ...
@ MO_Immediate
Immediate operand.
@ MO_Register
Register operand.
static MachineOperand CreateReg(Register Reg, bool isDef, bool isImp=false, bool isKill=false, bool isDead=false, bool isUndef=false, bool isEarlyClobber=false, unsigned SubReg=0, bool isDebug=false, bool isInternalRead=false, bool isRenamable=false)
int64_t getOffset() const
Return the offset from the symbol in this operand.
bool isFPImm() const
isFPImm - Tests if this is a MO_FPImmediate operand.
MachineRegisterInfo - Keep track of information for virtual and physical registers,...
LLVM_ABI bool hasOneNonDBGUse(Register RegNo) const
hasOneNonDBGUse - Return true if there is exactly one non-Debug use of the specified register.
const TargetRegisterClass * getRegClass(Register Reg) const
Return the register class of the specified virtual register.
LLVM_ABI void clearKillFlags(Register Reg) const
clearKillFlags - Iterate over all the uses of the given register and clear the kill flag from the Mac...
LLVM_ABI MachineInstr * getVRegDef(Register Reg) const
getVRegDef - Return the machine instr that defines the specified virtual register or null if none is ...
iterator_range< use_nodbg_iterator > use_nodbg_operands(Register Reg) const
bool use_nodbg_empty(Register RegNo) const
use_nodbg_empty - Return true if there are no non-Debug instructions using the specified register.
LLVM_ABI void moveOperands(MachineOperand *Dst, MachineOperand *Src, unsigned NumOps)
Move NumOps operands from Src to Dst, updating use-def lists as needed.
LLVM_ABI Register createVirtualRegister(const TargetRegisterClass *RegClass, StringRef Name="")
createVirtualRegister - Create and return a new virtual register in the function with the specified r...
LLT getType(Register Reg) const
Get the low-level type of Reg or LLT{} if Reg is not a generic (target independent) virtual register.
bool reservedRegsFrozen() const
reservedRegsFrozen - Returns true after freezeReservedRegs() was called to ensure the set of reserved...
LLVM_ABI void clearVirtRegs()
clearVirtRegs - Remove all virtual registers (after physreg assignment).
void setRegAllocationHint(Register VReg, unsigned Type, Register PrefReg)
setRegAllocationHint - Specify a register allocation hint for the specified virtual register.
LLVM_ABI void setRegClass(Register Reg, const TargetRegisterClass *RC)
setRegClass - Set the register class of the specified virtual register.
void setSimpleHint(Register VReg, Register PrefReg)
Specify the preferred (target independent) register allocation hint for the specified virtual registe...
const TargetRegisterInfo * getTargetRegisterInfo() const
LLVM_ABI Register cloneVirtualRegister(Register VReg, StringRef Name="")
Create and return a new virtual register in the function with the same attributes as the given regist...
LLVM_ABI const TargetRegisterClass * constrainRegClass(Register Reg, const TargetRegisterClass *RC, unsigned MinNumRegs=0)
constrainRegClass - Constrain the register class of the specified virtual register to be a common sub...
iterator_range< use_iterator > use_operands(Register Reg) const
LLVM_ABI void removeRegOperandFromUseList(MachineOperand *MO)
Remove MO from its use-def list.
LLVM_ABI void replaceRegWith(Register FromReg, Register ToReg)
replaceRegWith - Replace all instances of FromReg with ToReg in the machine function.
LLVM_ABI void addRegOperandToUseList(MachineOperand *MO)
Add MO to the linked list of operands for its register.
LLVM_ABI MachineInstr * getUniqueVRegDef(Register Reg) const
getUniqueVRegDef - Return the unique machine instr that defines the specified virtual register or nul...
const RegisterBank & getRegBank(unsigned ID)
Get the register bank identified by ID.
This class implements the register bank concept.
unsigned getID() const
Get the identifier of this register bank.
Wrapper class representing virtual and physical registers.
Definition Register.h:20
MCRegister asMCReg() const
Utility to check-convert this value to a MCRegister.
Definition Register.h:107
constexpr bool isValid() const
Definition Register.h:112
constexpr bool isVirtual() const
Return true if the specified register number is in the virtual register namespace.
Definition Register.h:79
constexpr bool isPhysical() const
Return true if the specified register number is in the physical register namespace.
Definition Register.h:83
Represents one node in the SelectionDAG.
bool isMachineOpcode() const
Test if this node has a post-isel opcode, directly corresponding to a MachineInstr opcode.
uint64_t getAsZExtVal() const
Helper method returns the zero-extended integer value of a ConstantSDNode.
unsigned getMachineOpcode() const
This may only be called if isMachineOpcode returns true.
const SDValue & getOperand(unsigned Num) const
uint64_t getConstantOperandVal(unsigned Num) const
Helper method returns the integer value of a ConstantSDNode operand.
Unlike LLVM values, Selection DAG nodes may return multiple values as the result of a computation.
bool isLegalMUBUFImmOffset(unsigned Imm) const
bool isInlineConstant(const APInt &Imm) const
void legalizeOperandsVOP3(MachineRegisterInfo &MRI, MachineInstr &MI) const
Fix operands in MI to satisfy constant bus requirements.
bool canAddToBBProlog(const MachineInstr &MI) const
static bool isDS(const MachineInstr &MI)
MachineBasicBlock * legalizeOperands(MachineInstr &MI, MachineDominatorTree *MDT=nullptr) const
Legalize all operands in this instruction.
bool areLoadsFromSameBasePtr(SDNode *Load0, SDNode *Load1, int64_t &Offset0, int64_t &Offset1) const override
unsigned getLiveRangeSplitOpcode(Register Reg, const MachineFunction &MF) const override
bool getMemOperandsWithOffsetWidth(const MachineInstr &LdSt, SmallVectorImpl< const MachineOperand * > &BaseOps, int64_t &Offset, bool &OffsetIsScalable, LocationSize &Width, const TargetRegisterInfo *TRI) const final
unsigned getInstSizeInBytes(const MachineInstr &MI) const override
static bool isNeverUniform(const MachineInstr &MI)
bool isXDLWMMA(const MachineInstr &MI) const
bool isBasicBlockPrologue(const MachineInstr &MI, Register Reg=Register()) const override
bool isSpill(uint32_t Opcode) const
uint64_t getDefaultRsrcDataFormat() const
static bool isSOPP(const MachineInstr &MI)
bool mayAccessScratch(const MachineInstr &MI) const
bool isIGLP(unsigned Opcode) const
static bool isFLATScratch(const MachineInstr &MI)
bool isLegalFLATOffset(int64_t Offset, unsigned AddrSpace, AMDGPU::FlatAddrSpace FlatVariant) const
Returns true if Offset is legal for the subtarget as the offset to a FLAT encoded instruction with the giv...
const MCInstrDesc & getIndirectRegWriteMovRelPseudo(unsigned VecSize, unsigned EltSize, bool IsSGPR) const
MachineInstrBuilder getAddNoCarry(MachineBasicBlock &MBB, MachineBasicBlock::iterator I, const DebugLoc &DL, Register DestReg) const
Return a partially built integer add instruction without carry.
bool mayAccessFlatAddressSpace(const MachineInstr &MI) const
bool shouldScheduleLoadsNear(SDNode *Load0, SDNode *Load1, int64_t Offset0, int64_t Offset1, unsigned NumLoads) const override
bool splitMUBUFOffset(uint32_t Imm, uint32_t &SOffset, uint32_t &ImmOffset, Align Alignment=Align(4)) const
ArrayRef< std::pair< unsigned, const char * > > getSerializableDirectMachineOperandTargetFlags() const override
void moveToVALU(SIInstrWorklist &Worklist, MachineDominatorTree *MDT) const
Replace the instruction's opcode with the equivalent VALU opcode.
static bool isSMRD(const MachineInstr &MI)
void restoreExec(MachineFunction &MF, MachineBasicBlock &MBB, MachineBasicBlock::iterator MBBI, const DebugLoc &DL, Register Reg, SlotIndexes *Indexes=nullptr) const
void storeRegToStackSlotCFI(MachineBasicBlock &MBB, MachineBasicBlock::iterator MI, Register SrcReg, bool isKill, int FrameIndex, const TargetRegisterClass *RC) const
bool usesConstantBus(const MachineRegisterInfo &MRI, const MachineOperand &MO, const MCOperandInfo &OpInfo) const
Returns true if this operand uses the constant bus.
static unsigned getMaxMUBUFImmOffset(const GCNSubtarget &ST)
static unsigned getFoldableCopySrcIdx(const MachineInstr &MI)
unsigned getOpSize(uint32_t Opcode, unsigned OpNo) const
Return the size in bytes of the operand OpNo on the given instruction opcode.
void legalizeOperandsFLAT(MachineRegisterInfo &MRI, MachineInstr &MI) const
bool optimizeCompareInstr(MachineInstr &CmpInstr, Register SrcReg, Register SrcReg2, int64_t CmpMask, int64_t CmpValue, const MachineRegisterInfo *MRI) const override
static std::optional< int64_t > extractSubregFromImm(int64_t ImmVal, unsigned SubRegIndex)
Return the extracted immediate value in a subregister use from a constant materialized in a super reg...
Register isStoreToStackSlot(const MachineInstr &MI, int &FrameIndex) const override
static bool isMTBUF(const MachineInstr &MI)
const MCInstrDesc & getIndirectGPRIDXPseudo(unsigned VecSize, bool IsIndirectSrc) const
static bool isDGEMM(unsigned Opcode)
static bool isEXP(const MachineInstr &MI)
static bool isSALU(const MachineInstr &MI)
static bool setsSCCIfResultIsNonZero(const MachineInstr &MI)
const MIRFormatter * getMIRFormatter() const override
static bool isXcntDrain(const MachineInstr &MI)
True if MI implicitly drains XCNT.
void legalizeGenericOperand(MachineBasicBlock &InsertMBB, MachineBasicBlock::iterator I, const TargetRegisterClass *DstRC, MachineOperand &Op, MachineRegisterInfo &MRI, const DebugLoc &DL) const
MachineInstr * buildShrunkInst(MachineInstr &MI, unsigned NewOpcode) const
static bool isVOP2(const MachineInstr &MI)
bool analyzeBranch(MachineBasicBlock &MBB, MachineBasicBlock *&TBB, MachineBasicBlock *&FBB, SmallVectorImpl< MachineOperand > &Cond, bool AllowModify=false) const override
static bool isSDWA(const MachineInstr &MI)
const MCInstrDesc & getKillTerminatorFromPseudo(unsigned Opcode) const
void insertNoops(MachineBasicBlock &MBB, MachineBasicBlock::iterator MI, unsigned Quantity) const override
static bool isGather4(const MachineInstr &MI)
MachineInstr * getWholeWaveFunctionSetup(MachineFunction &MF) const
bool isLegalVSrcOperand(const MachineRegisterInfo &MRI, const MCOperandInfo &OpInfo, const MachineOperand &MO) const
Check if MO would be a valid operand for the given operand definition OpInfo.
static bool isDOT(const MachineInstr &MI)
InstSizeVerifyMode getInstSizeVerifyMode(const MachineInstr &MI) const override
MachineInstr * createPHISourceCopy(MachineBasicBlock &MBB, MachineBasicBlock::iterator InsPt, const DebugLoc &DL, Register Src, unsigned SrcSubReg, Register Dst) const override
bool hasModifiers(unsigned Opcode) const
Return true if this instruction has any modifiers.
bool shouldClusterMemOps(ArrayRef< const MachineOperand * > BaseOps1, int64_t Offset1, bool OffsetIsScalable1, ArrayRef< const MachineOperand * > BaseOps2, int64_t Offset2, bool OffsetIsScalable2, unsigned ClusterSize, unsigned NumBytes) const override
static bool isSWMMAC(const MachineInstr &MI)
ScheduleHazardRecognizer * CreateTargetMIHazardRecognizer(const InstrItineraryData *II, const ScheduleDAGMI *DAG) const override
bool isWave32() const
bool isHighLatencyDef(int Opc) const override
void legalizeOpWithMove(MachineInstr &MI, unsigned OpIdx) const
Legalize the OpIdx operand of this instruction by inserting a MOV.
bool reverseBranchCondition(SmallVectorImpl< MachineOperand > &Cond) const override
static bool isVOPC(const MachineInstr &MI)
void removeModOperands(MachineInstr &MI) const
unsigned getVectorRegSpillRestoreOpcode(Register Reg, const TargetRegisterClass *RC, unsigned Size, const SIMachineFunctionInfo &MFI) const
bool isXDL(const MachineInstr &MI) const
Register isStackAccess(const MachineInstr &MI, int &FrameIndex, TypeSize &MemBytes) const
static bool isVIMAGE(const MachineInstr &MI)
void enforceOperandRCAlignment(MachineInstr &MI, AMDGPU::OpName OpName) const
static bool isSOP2(const MachineInstr &MI)
static bool isGWS(const MachineInstr &MI)
bool hasRAWDependency(const MachineInstr &FirstMI, const MachineInstr &SecondMI) const
bool isLegalAV64PseudoImm(uint64_t Imm) const
Check if this immediate value can be used for AV_MOV_B64_IMM_PSEUDO.
bool isNeverCoissue(MachineInstr &MI) const
static bool isBUF(const MachineInstr &MI)
void handleCopyToPhysHelper(SIInstrWorklist &Worklist, Register DstReg, MachineInstr &Inst, MachineRegisterInfo &MRI, DenseMap< MachineInstr *, V2PhysSCopyInfo > &WaterFalls, DenseMap< MachineInstr *, bool > &V2SPhyCopiesToErase) const
bool hasModifiersSet(const MachineInstr &MI, AMDGPU::OpName OpName) const
bool isLegalToSwap(const MachineInstr &MI, unsigned fromIdx, unsigned toIdx) const
static bool isFLATGlobal(const MachineInstr &MI)
MachineInstr * foldMemoryOperandImpl(MachineFunction &MF, MachineInstr &MI, ArrayRef< unsigned > Ops, int FrameIndex, MachineInstr *&CopyMI, LiveIntervals *LIS=nullptr, VirtRegMap *VRM=nullptr) const override
bool isGlobalMemoryObject(const MachineInstr *MI) const override
static bool isVSAMPLE(const MachineInstr &MI)
bool isBufferSMRD(const MachineInstr &MI) const
static bool isKillTerminator(unsigned Opcode)
bool isVOPDAntidependencyAllowed(const MachineInstr &MI) const
If OpX is multicycle, anti-dependencies are not allowed.
bool findCommutedOpIndices(const MachineInstr &MI, unsigned &SrcOpIdx0, unsigned &SrcOpIdx1) const override
void insertScratchExecCopy(MachineFunction &MF, MachineBasicBlock &MBB, MachineBasicBlock::iterator MBBI, const DebugLoc &DL, Register Reg, bool IsSCCLive, SlotIndexes *Indexes=nullptr) const
bool hasVALU32BitEncoding(unsigned Opcode) const
Return true if this 64-bit VALU instruction has a 32-bit encoding.
unsigned getMovOpcode(const TargetRegisterClass *DstRC) const
Register isSGPRStackAccess(const MachineInstr &MI, int &FrameIndex, TypeSize &MemBytes) const
unsigned buildExtractSubReg(MachineBasicBlock::iterator MI, MachineRegisterInfo &MRI, const MachineOperand &SuperReg, const TargetRegisterClass *SuperRC, unsigned SubIdx, const TargetRegisterClass *SubRC) const
void legalizeOperandsVOP2(MachineRegisterInfo &MRI, MachineInstr &MI) const
Legalize operands in MI by either commuting it or inserting a copy of src1.
static bool isVALU(const MachineInstr &MI, bool AllowLDSDMA)
bool foldImmediate(MachineInstr &UseMI, MachineInstr &DefMI, Register Reg, MachineRegisterInfo *MRI) const final
static bool isTRANS(const MachineInstr &MI)
static bool isImage(const MachineInstr &MI)
static bool isSOPK(const MachineInstr &MI)
const TargetRegisterClass * getOpRegClass(const MachineInstr &MI, unsigned OpNo) const
Return the correct register class for OpNo.
MachineBasicBlock * insertSimulatedTrap(MachineRegisterInfo &MRI, MachineBasicBlock &MBB, MachineInstr &MI, const DebugLoc &DL) const
Build instructions that simulate the behavior of an s_trap 2 instruction for hardware (namely,...
static unsigned getNonSoftWaitcntOpcode(unsigned Opcode)
static unsigned getDSShaderTypeValue(const MachineFunction &MF)
static bool isFoldableCopy(const MachineInstr &MI)
bool mayAccessLDSThroughFlat(const MachineInstr &MI) const
bool isIgnorableUse(const MachineOperand &MO) const override
static bool isMUBUF(const MachineInstr &MI)
bool expandPostRAPseudo(MachineInstr &MI) const override
bool analyzeCompare(const MachineInstr &MI, Register &SrcReg, Register &SrcReg2, int64_t &CmpMask, int64_t &CmpValue) const override
void createWaterFallForSiCall(MachineInstr *MI, MachineDominatorTree *MDT, ArrayRef< MachineOperand * > ScalarOps, ArrayRef< Register > PhySGPRs={}) const
Wrapper function for generating waterfall for instruction MI. This function takes into consideration of...
void loadRegFromStackSlot(MachineBasicBlock &MBB, MachineBasicBlock::iterator MI, Register DestReg, int FrameIndex, const TargetRegisterClass *RC, Register VReg, unsigned SubReg=0, MachineInstr::MIFlag Flags=MachineInstr::NoFlags) const override
static bool isSegmentSpecificFLAT(const MachineInstr &MI)
bool isReMaterializableImpl(const MachineInstr &MI) const override
static bool isVOP3(const MCInstrDesc &Desc)
Register isLoadFromStackSlot(const MachineInstr &MI, int &FrameIndex) const override
bool physRegUsesConstantBus(const MachineOperand &Reg) const
static bool isF16PseudoScalarTrans(unsigned Opcode)
void insertSelect(MachineBasicBlock &MBB, MachineBasicBlock::iterator I, const DebugLoc &DL, Register DstReg, ArrayRef< MachineOperand > Cond, Register TrueReg, Register FalseReg) const override
bool mayAccessVMEMThroughFlat(const MachineInstr &MI) const
static bool isDPP(const MachineInstr &MI)
bool analyzeBranchImpl(MachineBasicBlock &MBB, MachineBasicBlock::iterator I, MachineBasicBlock *&TBB, MachineBasicBlock *&FBB, SmallVectorImpl< MachineOperand > &Cond, bool AllowModify) const
static bool isMFMA(const MachineInstr &MI)
bool isLowLatencyInstruction(const MachineInstr &MI) const
std::optional< DestSourcePair > isCopyInstrImpl(const MachineInstr &MI) const override
If the specific machine instruction is an instruction that moves/copies a value from one register to ano...
void mutateAndCleanupImplicit(MachineInstr &MI, const MCInstrDesc &NewDesc) const
ValueUniformity getGenericValueUniformity(const MachineInstr &MI) const
static bool isMAI(const MCInstrDesc &Desc)
void reMaterialize(MachineBasicBlock &MBB, MachineBasicBlock::iterator MI, Register DestReg, unsigned SubIdx, const MachineInstr &Orig, LaneBitmask UsedLanes=LaneBitmask::getAll()) const override
static bool usesLGKM_CNT(const MachineInstr &MI)
void legalizeOperandsVALUt16(MachineInstr &Inst, MachineRegisterInfo &MRI) const
Fix operands in Inst to fix 16bit SALU to VALU lowering.
bool isImmOperandLegal(const MCInstrDesc &InstDesc, unsigned OpNo, const MachineOperand &MO) const
bool canShrink(const MachineInstr &MI, const MachineRegisterInfo &MRI) const
const MachineOperand & getCalleeOperand(const MachineInstr &MI) const override
bool isAsmOnlyOpcode(int MCOp) const
Check if this instruction should only be used by assembler.
bool isAlwaysGDS(uint32_t Opcode) const
static bool isVGPRSpill(const MachineInstr &MI)
ScheduleHazardRecognizer * CreateTargetPostRAHazardRecognizer(const InstrItineraryData *II, const ScheduleDAG *DAG) const override
This is used by the post-RA scheduler (SchedulePostRAList.cpp).
bool verifyInstruction(const MachineInstr &MI, StringRef &ErrInfo) const override
unsigned getInstrLatency(const InstrItineraryData *ItinData, const MachineInstr &MI, unsigned *PredCost=nullptr) const override
bool isLegalGFX12PlusPackedMathFP32or64BitOperand(const MachineRegisterInfo &MRI, const MachineInstr &MI, unsigned SrcN, const MachineOperand *MO=nullptr) const
Check if MO would be a legal operand for gfx12+ packed math FP32 or 64 instructions.
unsigned getVectorRegSpillSaveOpcode(Register Reg, const TargetRegisterClass *RC, unsigned Size, const SIMachineFunctionInfo &MFI, bool NeedsCFI) const
int64_t getNamedImmOperand(const MachineInstr &MI, AMDGPU::OpName OperandName) const
Get required immediate operand.
ArrayRef< std::pair< int, const char * > > getSerializableTargetIndices() const override
bool regUsesConstantBus(const MachineOperand &Reg, const MachineRegisterInfo &MRI) const
static bool isMIMG(const MachineInstr &MI)
MachineOperand buildExtractSubRegOrImm(MachineBasicBlock::iterator MI, MachineRegisterInfo &MRI, const MachineOperand &SuperReg, const TargetRegisterClass *SuperRC, unsigned SubIdx, const TargetRegisterClass *SubRC) const
bool isSchedulingBoundary(const MachineInstr &MI, const MachineBasicBlock *MBB, const MachineFunction &MF) const override
bool isLegalRegOperand(const MachineRegisterInfo &MRI, const MCOperandInfo &OpInfo, const MachineOperand &MO) const
Check if MO (a register operand) is a legal register for the given operand description or operand ind...
static unsigned getNumWaitStates(const MachineInstr &MI)
Return the number of wait states that result from executing this instruction.
unsigned getVALUOp(const MachineInstr &MI) const
static bool modifiesModeRegister(const MachineInstr &MI)
Return true if the instruction modifies the mode register.
Register readlaneVGPRToSGPR(Register SrcReg, MachineInstr &UseMI, MachineRegisterInfo &MRI, const TargetRegisterClass *DstRC=nullptr) const
Copy a value from a VGPR (SrcReg) to SGPR.
bool hasDivergentBranch(const MachineBasicBlock *MBB) const
Return whether the block terminates with a divergent branch.
std::pair< int64_t, int64_t > splitFlatOffset(int64_t COffsetVal, unsigned AddrSpace, AMDGPU::FlatAddrSpace FlatVariant) const
Split COffsetVal into {immediate offset field, remainder offset} values.
unsigned removeBranch(MachineBasicBlock &MBB, int *BytesRemoved=nullptr) const override
void fixImplicitOperands(MachineInstr &MI) const
bool moveFlatAddrToVGPR(MachineInstr &Inst) const
Change SADDR form of a FLAT Inst to its VADDR form if saddr operand was moved to VGPR.
void copyPhysReg(MachineBasicBlock &MBB, MachineBasicBlock::iterator MI, const DebugLoc &DL, Register DestReg, Register SrcReg, bool KillSrc, bool RenamableDest=false, bool RenamableSrc=false) const override
void createReadFirstLaneFromCopyToPhysReg(MachineRegisterInfo &MRI, Register DstReg, MachineInstr &Inst) const
bool swapSourceModifiers(MachineInstr &MI, MachineOperand &Src0, AMDGPU::OpName Src0OpName, MachineOperand &Src1, AMDGPU::OpName Src1OpName) const
MachineBasicBlock * getBranchDestBlock(const MachineInstr &MI) const override
bool hasUnwantedEffectsWhenEXECEmpty(const MachineInstr &MI) const
This function is used to determine if an instruction can be safely executed under EXEC = 0 without ha...
bool getConstValDefinedInReg(const MachineInstr &MI, const Register Reg, int64_t &ImmVal) const override
static bool isAtomic(const MachineInstr &MI)
bool canInsertSelect(const MachineBasicBlock &MBB, ArrayRef< MachineOperand > Cond, Register DstReg, Register TrueReg, Register FalseReg, int &CondCycles, int &TrueCycles, int &FalseCycles) const override
bool isLiteralOperandLegal(const MCInstrDesc &InstDesc, const MCOperandInfo &OpInfo) const
static bool isWWMRegSpillOpcode(uint32_t Opcode)
static bool sopkIsZext(unsigned Opcode)
static bool isSGPRSpill(const MachineInstr &MI)
static bool isWMMA(const MachineInstr &MI)
ArrayRef< std::pair< MachineMemOperand::Flags, const char * > > getSerializableMachineMemOperandTargetFlags() const override
MachineInstr * convertToThreeAddress(MachineInstr &MI, LiveVariables *LV, LiveIntervals *LIS) const override
bool mayReadEXEC(const MachineRegisterInfo &MRI, const MachineInstr &MI) const
Returns true if the instruction could potentially depend on the value of exec.
void legalizeOperandsSMRD(MachineRegisterInfo &MRI, MachineInstr &MI) const
bool isBranchOffsetInRange(unsigned BranchOpc, int64_t BrOffset) const override
unsigned insertBranch(MachineBasicBlock &MBB, MachineBasicBlock *TBB, MachineBasicBlock *FBB, ArrayRef< MachineOperand > Cond, const DebugLoc &DL, int *BytesAdded=nullptr) const override
void insertNoop(MachineBasicBlock &MBB, MachineBasicBlock::iterator MI) const override
std::pair< MachineInstr *, MachineInstr * > expandMovDPP64(MachineInstr &MI) const
static bool isSOPC(const MachineInstr &MI)
static bool isFLAT(const MachineInstr &MI)
bool isBarrier(unsigned Opcode) const
MachineInstr * commuteInstructionImpl(MachineInstr &MI, bool NewMI, unsigned OpIdx0, unsigned OpIdx1) const override
int pseudoToMCOpcode(int Opcode) const
Return a target-specific opcode if Opcode is a pseudo instruction.
const MCInstrDesc & getMCOpcodeFromPseudo(unsigned Opcode) const
Return the descriptor of the target-specific machine instruction that corresponds to the specified ps...
static bool usesVM_CNT(const MachineInstr &MI)
MachineInstr * createPHIDestinationCopy(MachineBasicBlock &MBB, MachineBasicBlock::iterator InsPt, const DebugLoc &DL, Register Src, Register Dst) const override
static bool isFixedSize(const MachineInstr &MI)
bool isSafeToSink(MachineInstr &MI, MachineBasicBlock *SuccToSinkTo, MachineCycleInfo *CI) const override
LLVM_READONLY int commuteOpcode(unsigned Opc) const
ValueUniformity getValueUniformity(const MachineInstr &MI) const final
uint64_t getScratchRsrcWords23() const
LLVM_READONLY MachineOperand * getNamedOperand(MachineInstr &MI, AMDGPU::OpName OperandName) const
Returns the operand named Op.
std::pair< unsigned, unsigned > decomposeMachineOperandsTargetFlags(unsigned TF) const override
bool areMemAccessesTriviallyDisjoint(const MachineInstr &MIa, const MachineInstr &MIb) const override
bool isOperandLegal(const MachineInstr &MI, unsigned OpIdx, const MachineOperand *MO=nullptr) const
Check if MO is a legal operand if it was the OpIdx Operand for MI.
void storeRegToStackSlot(MachineBasicBlock &MBB, MachineBasicBlock::iterator MI, Register SrcReg, bool isKill, int FrameIndex, const TargetRegisterClass *RC, Register VReg, MachineInstr::MIFlag Flags=MachineInstr::NoFlags) const override
bool allowNegativeFlatOffset(AMDGPU::FlatAddrSpace FlatVariant) const
Returns true if negative offsets are allowed for the given FlatVariant.
std::optional< int64_t > getImmOrMaterializedImm(MachineOperand &Op) const
void moveToVALUImpl(SIInstrWorklist &Worklist, MachineDominatorTree *MDT, MachineInstr &Inst, DenseMap< MachineInstr *, V2PhysSCopyInfo > &WaterFalls, DenseMap< MachineInstr *, bool > &V2SPhyCopiesToErase) const
static bool isLDSDMA(const MachineInstr &MI)
static bool isVOP1(const MachineInstr &MI)
SIInstrInfo(const GCNSubtarget &ST)
void insertIndirectBranch(MachineBasicBlock &MBB, MachineBasicBlock &NewDestBB, MachineBasicBlock &RestoreBB, const DebugLoc &DL, int64_t BrOffset, RegScavenger *RS) const override
bool hasAnyModifiersSet(const MachineInstr &MI) const
This class keeps track of the SPI_SP_INPUT_ADDR config register, which tells the hardware which inter...
void setHasSpilledVGPRs(bool Spill=true)
bool isWWMReg(Register Reg) const
bool checkFlag(Register Reg, uint8_t Flag) const
void setHasSpilledSGPRs(bool Spill=true)
unsigned getScratchReservedForDynamicVGPRs() const
static unsigned getSubRegFromChannel(unsigned Channel, unsigned NumRegs=1)
ArrayRef< int16_t > getRegSplitParts(const TargetRegisterClass *RC, unsigned EltSize) const
unsigned getHWRegIndex(MCRegister Reg) const
bool isSGPRReg(const MachineRegisterInfo &MRI, Register Reg) const
unsigned getRegPressureLimit(const TargetRegisterClass *RC, MachineFunction &MF) const override
unsigned getChannelFromSubReg(unsigned SubReg) const
static bool isSGPRClass(const TargetRegisterClass *RC)
static bool isAGPRClass(const TargetRegisterClass *RC)
ScheduleDAGMI is an implementation of ScheduleDAGInstrs that simply schedules machine instructions ac...
virtual bool hasVRegLiveness() const
Return true if this DAG supports VReg liveness and RegPressure.
MachineFunction & MF
Machine function.
HazardRecognizer - This determines whether or not an instruction can be issued this cycle,...
SlotIndex - An opaque wrapper around machine indexes.
Definition SlotIndexes.h:66
SlotIndex getRegSlot(bool EC=false) const
Returns the register use/def slot in the current instruction for a normal or early-clobber def.
SlotIndexes pass.
SlotIndex insertMachineInstrInMaps(MachineInstr &MI, bool Late=false)
Insert the given machine instruction into the mapping.
Implements a dense probed hash-table based set with some number of buckets stored inline.
Definition DenseSet.h:301
This class consists of common code factored out of the SmallVector class to reduce code duplication b...
void push_back(const T &Elt)
This is a 'vector' (really, a variable-sized array), optimized for the case when the array is small.
int64_t getImm() const
Register getReg() const
Represent a constant reference to a string, i.e.
Definition StringRef.h:56
virtual ScheduleHazardRecognizer * CreateTargetMIHazardRecognizer(const InstrItineraryData *, const ScheduleDAGMI *DAG) const
Allocate and return a hazard recognizer to use for this target when scheduling the machine instructio...
virtual MachineInstr * createPHIDestinationCopy(MachineBasicBlock &MBB, MachineBasicBlock::iterator InsPt, const DebugLoc &DL, Register Src, Register Dst) const
During PHI elimination, lets the target make necessary checks and insert the copy to the PHI destinati...
virtual bool isReMaterializableImpl(const MachineInstr &MI) const
For instructions with opcodes for which the M_REMATERIALIZABLE flag is set, this hook lets the target...
virtual const MachineOperand & getCalleeOperand(const MachineInstr &MI) const
Returns the callee operand from the given MI.
virtual void reMaterialize(MachineBasicBlock &MBB, MachineBasicBlock::iterator MI, Register DestReg, unsigned SubIdx, const MachineInstr &Orig, LaneBitmask UsedLanes=LaneBitmask::getAll()) const
Re-issue the specified 'original' instruction at the specific location targeting a new destination re...
virtual MachineInstr * createPHISourceCopy(MachineBasicBlock &MBB, MachineBasicBlock::iterator InsPt, const DebugLoc &DL, Register Src, unsigned SrcSubReg, Register Dst) const
During PHI elimination, lets the target make necessary checks and insert the copy to the PHI destinati...
virtual MachineInstr * commuteInstructionImpl(MachineInstr &MI, bool NewMI, unsigned OpIdx1, unsigned OpIdx2) const
This method commutes the operands of the given machine instruction MI.
virtual bool isGlobalMemoryObject(const MachineInstr *MI) const
Returns true if MI is an instruction we are unable to reason about (like a call or something with unm...
virtual bool expandPostRAPseudo(MachineInstr &MI) const
This function is called for all pseudo instructions that remain after register allocation.
const MCAsmInfo & getMCAsmInfo() const
Return target specific asm information.
bool contains(Register Reg) const
Return true if the specified register is included in this register class.
bool hasSuperClassEq(const TargetRegisterClass *RC) const
Returns true if RC is a super-class of or equal to this class.
TargetRegisterInfo base class - We assume that the target defines a static array of TargetRegisterDes...
static constexpr TypeSize getFixed(ScalarTy ExactSize)
Definition TypeSize.h:343
A Use represents the edge between a Value definition and its users.
Definition Use.h:35
std::pair< iterator, bool > insert(const ValueT &V)
Definition DenseSet.h:212
size_type count(const_arg_type_t< ValueT > V) const
Return 1 if the specified key is in the set, 0 otherwise.
Definition DenseSet.h:190
self_iterator getIterator()
Definition ilist_node.h:123
Changed
#define llvm_unreachable(msg)
Marks that the current location is not supposed to be reachable.
@ REGION_ADDRESS
Address space for region memory. (GDS)
@ LOCAL_ADDRESS
Address space for local memory.
@ FLAT_ADDRESS
Address space for flat memory.
@ GLOBAL_ADDRESS
Address space for global memory (RAT0, VTX0).
@ PRIVATE_ADDRESS
Address space for private memory.
unsigned encodeFieldSaSdst(unsigned Encoded, unsigned SaSdst)
bool isInlinableLiteralBF16(int16_t Literal, bool HasInv2Pi)
const uint64_t RSRC_DATA_FORMAT
bool isPKFMACF16InlineConstant(uint32_t Literal, bool IsGFX11Plus)
LLVM_READONLY const MIMGInfo * getMIMGInfo(unsigned Opc)
bool isInlinableLiteralFP16(int16_t Literal, bool HasInv2Pi)
bool getWMMAIsXDL(unsigned Opc)
unsigned mapWMMA2AddrTo3AddrOpcode(unsigned Opc)
bool isInlinableLiteralV2I16(uint32_t Literal)
bool isDPMACCInstruction(unsigned Opc)
bool isHi16Reg(MCRegister Reg, const MCRegisterInfo &MRI)
bool isInlinableLiteralV2BF16(uint32_t Literal)
LLVM_READONLY int32_t getCommuteRev(uint32_t Opcode)
LLVM_READONLY int32_t getCommuteOrig(uint32_t Opcode)
unsigned getNumFlatOffsetBits(const MCSubtargetInfo &ST)
For pre-GFX12 FLAT instructions the offset must be positive; MSB is ignored and forced to zero.
bool isGFX12Plus(const MCSubtargetInfo &STI)
bool isInlinableLiteralV2F16(uint32_t Literal)
bool isValid32BitLiteral(uint64_t Val, bool IsFP64)
LLVM_READONLY int32_t getGlobalVaddrOp(uint32_t Opcode)
LLVM_READNONE bool isLegalDPALU_DPPControl(const MCSubtargetInfo &ST, unsigned DC)
LLVM_READONLY int32_t getMFMAEarlyClobberOp(uint32_t Opcode)
bool getMAIIsGFX940XDL(unsigned Opc)
const uint64_t RSRC_ELEMENT_SIZE_SHIFT
bool isIntrinsicAlwaysUniform(unsigned IntrID)
LLVM_READONLY bool hasNamedOperand(uint64_t Opcode, OpName NamedIdx)
LLVM_READONLY int32_t getIfAddr64Inst(uint32_t Opcode)
Check if Opcode is an Addr64 opcode.
LLVM_READONLY const MIMGDimInfo * getMIMGDimInfoByEncoding(uint8_t DimEnc)
bool isInlinableLiteral32(int32_t Literal, bool HasInv2Pi)
const uint64_t RSRC_TID_ENABLE
LLVM_READONLY int32_t getVOPe32(uint32_t Opcode)
bool isIntrinsicSourceOfDivergence(unsigned IntrID)
constexpr bool isSISrcOperand(const MCOperandInfo &OpInfo)
Is this an AMDGPU specific source operand?
bool isGenericAtomic(unsigned Opc)
LLVM_READNONE bool isInlinableIntLiteral(int64_t Literal)
Is this literal inlinable, and not one of the values intended for floating point values.
unsigned getAddrSizeMIMGOp(const MIMGBaseOpcodeInfo *BaseOpcode, const MIMGDimInfo *Dim, bool IsA16, bool IsG16Supported)
LLVM_READONLY int32_t getAddr64Inst(uint32_t Opcode)
int32_t getMCOpcode(uint32_t Opcode, unsigned Gen)
bool isPackedFP32or64BitInst(unsigned Opc)
@ OPERAND_REG_IMM_V2FP64
Definition SIDefines.h:433
@ OPERAND_KIMM32
Operand with 32-bit immediate that uses the constant bus.
Definition SIDefines.h:451
@ OPERAND_REG_IMM_INT64
Definition SIDefines.h:419
@ OPERAND_REG_IMM_V2FP16
Definition SIDefines.h:426
@ OPERAND_REG_INLINE_C_FP64
Definition SIDefines.h:442
@ OPERAND_REG_INLINE_C_BF16
Definition SIDefines.h:439
@ OPERAND_REG_INLINE_C_V2BF16
Definition SIDefines.h:444
@ OPERAND_REG_IMM_V2INT64
Definition SIDefines.h:429
@ OPERAND_REG_IMM_V2INT16
Definition SIDefines.h:428
@ OPERAND_REG_IMM_BF16
Definition SIDefines.h:423
@ OPERAND_REG_IMM_INT32
Operands with register, 32-bit, or 64-bit immediate.
Definition SIDefines.h:418
@ OPERAND_REG_IMM_V2BF16
Definition SIDefines.h:425
@ OPERAND_REG_IMM_FP16
Definition SIDefines.h:424
@ OPERAND_REG_IMM_V2FP16_SPLAT
Definition SIDefines.h:427
@ OPERAND_REG_INLINE_C_INT64
Definition SIDefines.h:438
@ OPERAND_REG_INLINE_C_INT16
Operands with register or inline constant.
Definition SIDefines.h:436
@ OPERAND_REG_IMM_NOINLINE_V2FP16
Definition SIDefines.h:430
@ OPERAND_REG_IMM_FP64
Definition SIDefines.h:422
@ OPERAND_REG_INLINE_C_V2FP16
Definition SIDefines.h:445
@ OPERAND_REG_INLINE_AC_INT32
Operands with an AccVGPR register or inline constant.
Definition SIDefines.h:456
@ OPERAND_REG_INLINE_AC_FP32
Definition SIDefines.h:457
@ OPERAND_REG_IMM_V2INT32
Definition SIDefines.h:431
@ OPERAND_SDWA_VOPC_DST
Definition SIDefines.h:468
@ OPERAND_REG_IMM_FP32
Definition SIDefines.h:421
@ OPERAND_REG_INLINE_C_FP32
Definition SIDefines.h:441
@ OPERAND_REG_INLINE_C_INT32
Definition SIDefines.h:437
@ OPERAND_REG_INLINE_C_V2INT16
Definition SIDefines.h:443
@ OPERAND_INLINE_C_AV64_PSEUDO
Definition SIDefines.h:462
@ OPERAND_REG_IMM_V2FP32
Definition SIDefines.h:432
@ OPERAND_REG_INLINE_AC_FP64
Definition SIDefines.h:458
@ OPERAND_REG_INLINE_C_FP16
Definition SIDefines.h:440
@ OPERAND_REG_IMM_INT16
Definition SIDefines.h:420
@ OPERAND_INLINE_SPLIT_BARRIER_INT32
Definition SIDefines.h:448
LLVM_READONLY int32_t getBasicFromSDWAOp(uint32_t Opcode)
bool isDPALU_DPP(const MCInstrDesc &OpDesc, const MCInstrInfo &MII, const MCSubtargetInfo &ST)
@ TI_SCRATCH_RSRC_DWORD1
Definition AMDGPU.h:612
@ TI_SCRATCH_RSRC_DWORD3
Definition AMDGPU.h:614
@ TI_SCRATCH_RSRC_DWORD0
Definition AMDGPU.h:611
@ TI_SCRATCH_RSRC_DWORD2
Definition AMDGPU.h:613
@ TI_CONSTDATA_START
Definition AMDGPU.h:610
unsigned getRegBitWidth(const TargetRegisterClass &RC)
Get the size in bits of a register from the register class RC.
bool supportsScaleOffset(const MCInstrInfo &MII, unsigned Opcode)
const uint64_t RSRC_INDEX_STRIDE_SHIFT
LLVM_READONLY const MIMGBaseOpcodeInfo * getMIMGBaseOpcodeInfo(unsigned BaseOpcode)
LLVM_READONLY int32_t getFlatScratchInstSVfromSS(uint32_t Opcode)
bool isInlinableLiteralI16(int32_t Literal, bool HasInv2Pi)
LLVM_READNONE constexpr bool isGraphics(CallingConv::ID CC)
bool isInlinableLiteral64(int64_t Literal, bool HasInv2Pi)
Is this literal inlinable.
@ AMDGPU_CS
Used for Mesa/AMDPAL compute shaders.
@ AMDGPU_VS
Used for Mesa vertex shaders, or AMDPAL last shader stage before rasterization (vertex shader if tess...
@ AMDGPU_KERNEL
Used for AMDGPU code object kernels.
@ AMDGPU_HS
Used for Mesa/AMDPAL hull shaders (= tessellation control shaders).
@ AMDGPU_GS
Used for Mesa/AMDPAL geometry shaders.
@ AMDGPU_PS
Used for Mesa/AMDPAL pixel shaders.
@ Fast
Attempts to make calls as fast as possible (e.g. by passing things in registers).
Definition CallingConv.h:41
@ AMDGPU_ES
Used for AMDPAL shader stage before geometry shader if geometry is in use.
@ AMDGPU_LS
Used for AMDPAL vertex shader if tessellation is in use.
@ C
The default llvm calling convention, compatible with C.
Definition CallingConv.h:34
@ OPERAND_GENERIC_4
Definition MCInstrDesc.h:71
@ OPERAND_GENERIC_2
Definition MCInstrDesc.h:69
@ OPERAND_GENERIC_1
Definition MCInstrDesc.h:68
@ OPERAND_GENERIC_3
Definition MCInstrDesc.h:70
@ OPERAND_IMMEDIATE
Definition MCInstrDesc.h:61
@ OPERAND_GENERIC_0
Definition MCInstrDesc.h:67
@ OPERAND_GENERIC_5
Definition MCInstrDesc.h:72
Not(const Pred &P) -> Not< Pred >
constexpr bool isD16Buf(const T &...O)
Definition SIDefines.h:333
constexpr bool isSDWA(const T &...O)
Definition SIDefines.h:243
initializer< Ty > init(const Ty &Val)
This is an optimization pass for GlobalISel generic memory operations.
auto drop_begin(T &&RangeOrContainer, size_t N=1)
Return a range covering RangeOrContainer with the first N elements excluded.
Definition STLExtras.h:315
@ Low
Lower the current thread's priority such that it does not affect foreground tasks significantly.
Definition Threading.h:280
@ Offset
Definition DWP.cpp:573
LLVM_ABI void finalizeBundle(MachineBasicBlock &MBB, MachineBasicBlock::instr_iterator FirstMI, MachineBasicBlock::instr_iterator LastMI)
finalizeBundle - Finalize a machine instruction bundle which includes a sequence of instructions star...
TargetInstrInfo::RegSubRegPair getRegSubRegPair(const MachineOperand &O)
Create RegSubRegPair from a register MachineOperand.
bool all_of(R &&range, UnaryPredicate P)
Provide wrappers to std::all_of which take ranges instead of having to pass begin/end explicitly.
Definition STLExtras.h:1739
constexpr uint64_t maxUIntN(uint64_t N)
Gets the maximum value for a N-bit unsigned integer.
Definition MathExtras.h:207
MachineInstrBuilder BuildMI(MachineFunction &MF, const MIMetadata &MIMD, const MCInstrDesc &MCID)
Builder interface. Specify how to create the initial instruction itself.
constexpr bool isInt(int64_t x)
Checks if an integer fits into the given bit width.
Definition MathExtras.h:165
bool execMayBeModifiedBeforeUse(const MachineRegisterInfo &MRI, Register VReg, const MachineInstr &DefMI, const MachineInstr &UseMI)
Return false if EXEC is not changed between the def of VReg at DefMI and the use at UseMI.
RegState
Flags to represent properties of register accesses.
@ Implicit
Not emitted register (e.g. carry, or temporary result).
@ Dead
Unused definition.
@ Kill
The last use of a register.
@ Undef
Value of the register doesn't matter.
@ Define
Register definition.
auto enumerate(FirstRange &&First, RestRanges &&...Rest)
Given two or more input ranges, returns a new range whose values are tuples (A, B,...
Definition STLExtras.h:2554
constexpr RegState getKillRegState(bool B)
decltype(auto) dyn_cast(const From &Val)
dyn_cast<X> - Return the argument parameter cast to the specified type.
Definition Casting.h:643
iterator_range< T > make_range(T x, T y)
Convenience function for iterating over sub-ranges.
iterator_range< early_inc_iterator_impl< detail::IterOfRange< RangeT > > > make_early_inc_range(RangeT &&Range)
Make a range that does early increment to allow mutation of the underlying range without disrupting i...
Definition STLExtras.h:633
constexpr T alignDown(U Value, V Align, W Skew=0)
Returns the largest unsigned integer less than or equal to Value that is congruent to Skew modulo Align.
Definition MathExtras.h:546
constexpr bool isPowerOf2_64(uint64_t Value)
Return true if the argument is a power of two > 0 (64 bit edition.)
Definition MathExtras.h:284
Op::Description Desc
constexpr int popcount(T Value) noexcept
Count the number of set bits in a value.
Definition bit.h:156
int countr_zero(T Val)
Count the number of 0's from the least significant bit to the most significant, stopping at the first 1.
Definition bit.h:204
TargetInstrInfo::RegSubRegPair getRegSequenceSubReg(MachineInstr &MI, unsigned SubReg)
Return the SubReg component from REG_SEQUENCE.
static const MachineMemOperand::Flags MONoClobber
Mark the MMO of a uniform load if there are no potentially clobbering stores on any path from the sta...
Definition SIInstrInfo.h:44
constexpr bool has_single_bit(T Value) noexcept
Definition bit.h:149
bool any_of(R &&range, UnaryPredicate P)
Provide wrappers to std::any_of which take ranges instead of having to pass begin/end explicitly.
Definition STLExtras.h:1746
unsigned Log2_32(uint32_t Value)
Return the floor log base 2 of the specified value, -1 if the value is zero.
Definition MathExtras.h:331
auto reverse(ContainerTy &&C)
Definition STLExtras.h:407
MachineInstr * getImm(const MachineOperand &MO, const MachineRegisterInfo *MRI)
MachineInstr * getVRegSubRegDef(const TargetInstrInfo::RegSubRegPair &P, const MachineRegisterInfo &MRI)
Return the defining instruction for a given reg:subreg pair skipping copy like instructions and subre...
decltype(auto) get(const PointerIntPair< PointerTy, IntBits, IntType, PtrTraits, Info > &Pair)
constexpr uint32_t Hi_32(uint64_t Value)
Return the high 32 bits of a 64 bit value.
Definition MathExtras.h:150
LLVM_ABI raw_ostream & dbgs()
dbgs() - This returns a reference to a raw_ostream for debugging messages.
Definition Debug.cpp:209
constexpr bool isUInt(uint64_t x)
Checks if an unsigned integer fits into the given bit width.
Definition MathExtras.h:189
class LLVM_GSL_OWNER SmallVector
Forward declaration of SmallVector so that calculateSmallVectorDefaultInlinedElements can reference s...
constexpr uint32_t Lo_32(uint64_t Value)
Return the low 32 bits of a 64 bit value.
Definition MathExtras.h:155
bool isa(const From &Val)
isa<X> - Return true if the parameter to the template is an instance of one of the template type argu...
Definition Casting.h:547
LLVM_ABI VirtRegInfo AnalyzeVirtRegInBundle(MachineInstr &MI, Register Reg, SmallVectorImpl< std::pair< MachineInstr *, unsigned > > *Ops=nullptr)
AnalyzeVirtRegInBundle - Analyze how the current instruction or bundle uses a virtual register.
static const MachineMemOperand::Flags MOCooperative
Mark the MMO of cooperative load/store atomics.
Definition SIInstrInfo.h:52
constexpr T divideCeil(U Numerator, V Denominator)
Returns the integer ceil(Numerator / Denominator).
Definition MathExtras.h:394
@ First
Helpers to iterate all locations in the MemoryEffectsBase class.
Definition ModRef.h:74
@ Xor
Bitwise or logical XOR of integers.
@ Sub
Subtraction of integers.
@ Add
Sum of integers.
RelativeUniformCounterPtr ValuesPtrExpr VTableAddr Count
Definition InstrProf.h:145
bool isTargetSpecificOpcode(unsigned Opcode)
Check whether the given Opcode is a target-specific opcode.
DWARFExpression::Operation Op
ArrayRef(const T &OneElt) -> ArrayRef< T >
constexpr unsigned DefaultMemoryClusterDWordsLimit
Definition SIInstrInfo.h:40
constexpr unsigned BitWidth
constexpr bool isIntN(unsigned N, int64_t x)
Checks if a signed integer fits into the given (dynamic) bit width.
Definition MathExtras.h:248
static const MachineMemOperand::Flags MOLastUse
Mark the MMO of a load as the last use.
Definition SIInstrInfo.h:48
constexpr T reverseBits(T Val)
Reverse the bits in Val.
Definition MathExtras.h:118
bool is_contained(R &&Range, const E &Element)
Returns true if Element is found in Range.
Definition STLExtras.h:1947
constexpr int64_t SignExtend64(uint64_t x)
Sign-extend the number in the bottom B bits of X to a 64-bit integer.
Definition MathExtras.h:572
constexpr T maskTrailingOnes(unsigned N)
Create a bitmask with the N right-most bits set to 1, and all other bits set to 0.
Definition MathExtras.h:77
LLVM_ABI const Value * getUnderlyingObject(const Value *V, unsigned MaxLookup=MaxLookupSearchDepth)
This method strips off any GEP address adjustments, pointer casts or llvm.threadlocal....
constexpr RegState getUndefRegState(bool B)
ValueUniformity
Enum describing how values behave with respect to uniformity and divergence, to answer the question: ...
Definition Uniformity.h:18
@ AlwaysUniform
The result value is always uniform.
Definition Uniformity.h:23
@ NeverUniform
The result value can never be assumed to be uniform.
Definition Uniformity.h:26
@ Default
The result value is uniform if and only if all operands are uniform.
Definition Uniformity.h:20
MachineCycleInfo::CycleT MachineCycle
static const MachineMemOperand::Flags MOThreadPrivate
Mark the MMO of accesses to memory locations that are never written to by other threads.
Definition SIInstrInfo.h:63
bool execMayBeModifiedBeforeAnyUse(const MachineRegisterInfo &MRI, Register VReg, const MachineInstr &DefMI)
Return false if EXEC is not changed between the def of VReg at DefMI and all its uses.
void swap(llvm::BitVector &LHS, llvm::BitVector &RHS)
Implement std::swap in terms of BitVector swap.
Definition BitVector.h:862
#define N
Helper struct for the implementation of 3-address conversion to communicate updates made to instructi...
MachineInstr * RemoveMIUse
Other instruction whose def is no longer used by the converted instruction.
static constexpr uint64_t encode(Fields... Values)
This struct is a compact representation of a valid (non-zero power of two) alignment.
Definition Alignment.h:39
constexpr uint64_t value() const
This is a hole in the type system and should not be abused.
Definition Alignment.h:77
constexpr bool all() const
Definition LaneBitmask.h:54
SparseBitVector AliveBlocks
AliveBlocks - Set of blocks in which this value is alive completely through.
This class contains a discriminated union of information about pointers in memory operands,...
static LLVM_ABI MachinePointerInfo getFixedStack(MachineFunction &MF, int FI, int64_t Offset=0)
Return a MachinePointerInfo record that refers to the specified FrameIndex.
Utility to store machine instructions worklist.
Definition SIInstrInfo.h:67
MachineInstr * top() const
Definition SIInstrInfo.h:72
bool isDeferred(MachineInstr *MI)
SetVector< MachineInstr * > & getDeferredList()
Definition SIInstrInfo.h:91
void insert(MachineInstr *MI)
A pair composed of a register and a sub-register index.
VirtRegInfo - Information about a virtual register used by a set of operands.
bool Reads
Reads - One of the operands read the virtual register.
bool Writes
Writes - One of the operands writes the virtual register.