1//===- SIInstrInfo.cpp - SI Instruction Information ----------------------===//
2//
3// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4// See https://llvm.org/LICENSE.txt for license information.
5// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6//
7//===----------------------------------------------------------------------===//
8//
9/// \file
10/// SI Implementation of TargetInstrInfo.
11//
12//===----------------------------------------------------------------------===//
13
14#include "SIInstrInfo.h"
15#include "AMDGPU.h"
16#include "AMDGPUInstrInfo.h"
17#include "AMDGPULaneMaskUtils.h"
18#include "GCNHazardRecognizer.h"
19#include "GCNSubtarget.h"
22#include "llvm/ADT/STLExtras.h"
34#include "llvm/IR/IntrinsicsAMDGPU.h"
35#include "llvm/MC/MCContext.h"
38
39using namespace llvm;
40
41#define DEBUG_TYPE "si-instr-info"
42
43#define GET_INSTRINFO_CTOR_DTOR
44#include "AMDGPUGenInstrInfo.inc"
45
46namespace llvm::AMDGPU {
47#define GET_D16ImageDimIntrinsics_IMPL
48#define GET_ImageDimIntrinsicTable_IMPL
49#define GET_RsrcIntrinsics_IMPL
50#include "AMDGPUGenSearchableTables.inc"
51} // namespace llvm::AMDGPU
52
53// Must be at least 4 to be able to branch over minimum unconditional branch
54// code. This is only for making it possible to write reasonably small tests for
55// long branches.
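// (For example, a lit test can pass -amdgpu-s-branch-bits=4 so that even a
// short run of instructions exceeds the signed branch range and exercises the
// long-branch expansion; the value 4 here is purely illustrative.)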
56static cl::opt<unsigned>
57 BranchOffsetBits("amdgpu-s-branch-bits", cl::ReallyHidden, cl::init(16),
58 cl::desc("Restrict range of branch instructions (DEBUG)"));
59
61 "amdgpu-fix-16-bit-physreg-copies",
62 cl::desc("Fix copies between 32 and 16 bit registers by extending to 32 bit"),
63 cl::init(true),
65
66SIInstrInfo::SIInstrInfo(const GCNSubtarget &ST)
67 : AMDGPUGenInstrInfo(ST, RI, AMDGPU::ADJCALLSTACKUP,
68 AMDGPU::ADJCALLSTACKDOWN),
69 RI(ST), ST(ST) {
70 SchedModel.init(&ST);
71}
72
73//===----------------------------------------------------------------------===//
74// TargetInstrInfo callbacks
75//===----------------------------------------------------------------------===//
76
77static unsigned getNumOperandsNoGlue(SDNode *Node) {
78 unsigned N = Node->getNumOperands();
79 while (N && Node->getOperand(N - 1).getValueType() == MVT::Glue)
80 --N;
81 return N;
82}
83
84/// Returns true if both nodes have the same value for the given
85/// operand \p Op, or if both nodes do not have this operand.
86static bool nodesHaveSameOperandValue(SDNode *N0, SDNode *N1,
87 AMDGPU::OpName OpName) {
88 unsigned Opc0 = N0->getMachineOpcode();
89 unsigned Opc1 = N1->getMachineOpcode();
90
91 int Op0Idx = AMDGPU::getNamedOperandIdx(Opc0, OpName);
92 int Op1Idx = AMDGPU::getNamedOperandIdx(Opc1, OpName);
93
94 if (Op0Idx == -1 && Op1Idx == -1)
95 return true;
96
97
98 if ((Op0Idx == -1 && Op1Idx != -1) ||
99 (Op1Idx == -1 && Op0Idx != -1))
100 return false;
101
102 // getNamedOperandIdx returns the index for the MachineInstr's operands,
103 // which includes the result as the first operand. We are indexing into the
104 // MachineSDNode's operands, so we need to skip the result operand to get
105 // the real index.
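// (Illustrative example: for a node with a single result, MachineInstr
// operand index 3 corresponds to MachineSDNode operand index 2.)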
106 --Op0Idx;
107 --Op1Idx;
108
109 return N0->getOperand(Op0Idx) == N1->getOperand(Op1Idx);
110}
111
112static bool canRemat(const MachineInstr &MI) {
113
114 if (SIInstrInfo::isVOP1(MI) || SIInstrInfo::isVOP2(MI) ||
115 SIInstrInfo::isVOP3(MI) || SIInstrInfo::isSDWA(MI) ||
116 SIInstrInfo::isSALU(MI))
117 return true;
118
119 if (SIInstrInfo::isSMRD(MI)) {
120 return !MI.memoperands_empty() &&
121 llvm::all_of(MI.memoperands(), [](const MachineMemOperand *MMO) {
122 return MMO->isLoad() && MMO->isInvariant();
123 });
124 }
125
126 return false;
127}
128
129bool SIInstrInfo::isReallyTriviallyReMaterializable(
130 const MachineInstr &MI) const {
131
132 if (canRemat(MI)) {
133 // Normally a VALU use of exec would block rematerialization, but an
134 // implicit exec read, which all VALU instructions have, is OK in this case.
135 // We really want all of the generic logic here except for this one check.
136
137 // Another potential implicit use is the mode register. The core RA logic
138 // will not attempt rematerialization if the mode is set anywhere in the
139 // function; otherwise it is safe, since the mode is not changed.
140
141 // Unlike the generic method, which does not allow rematerialization when
142 // there are virtual register uses, we allow such uses; therefore this
143 // method also handles SOP instructions.
144 if (!MI.hasImplicitDef() &&
145 MI.getNumImplicitOperands() == MI.getDesc().implicit_uses().size() &&
146 !MI.mayRaiseFPException())
147 return true;
148 }
149
150 return TargetInstrInfo::isReallyTriviallyReMaterializable(MI);
151}
152
153// Returns true if the result of a VALU instruction depends on exec.
154bool SIInstrInfo::resultDependsOnExec(const MachineInstr &MI) const {
155 assert(isVALU(MI));
156
157 // If it is convergent it depends on EXEC.
158 if (MI.isConvergent())
159 return true;
160
161 // If it defines SGPR it depends on EXEC
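// (e.g. a V_CMP writing a lane mask into VCC or an SGPR pair only produces
// bits for the lanes that are currently active in EXEC).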
162 const MachineRegisterInfo &MRI = MI.getMF()->getRegInfo();
163 for (const MachineOperand &Def : MI.defs()) {
164 if (!Def.isReg())
165 continue;
166
167 Register Reg = Def.getReg();
168 if (Reg && RI.isSGPRReg(MRI, Reg))
169 return true;
170 }
171
172 return false;
173}
174
175bool SIInstrInfo::isIgnorableUse(const MachineOperand &MO) const {
176 // Any implicit use of exec by VALU is not a real register read.
177 return MO.getReg() == AMDGPU::EXEC && MO.isImplicit() &&
178 isVALU(*MO.getParent()) && !resultDependsOnExec(*MO.getParent());
179}
180
181bool SIInstrInfo::isSafeToSink(MachineInstr &MI,
182 MachineBasicBlock *SuccToSinkTo,
183 MachineCycleInfo *CI) const {
184 // Allow sinking if MI edits lane mask (divergent i1 in sgpr).
185 if (MI.getOpcode() == AMDGPU::SI_IF_BREAK)
186 return true;
187
188 MachineRegisterInfo &MRI = MI.getMF()->getRegInfo();
189 // Check if sinking of MI would create a temporally divergent use.
190 for (auto Op : MI.uses()) {
191 if (Op.isReg() && Op.getReg().isVirtual() &&
192 RI.isSGPRClass(MRI.getRegClass(Op.getReg()))) {
193 MachineInstr *SgprDef = MRI.getVRegDef(Op.getReg());
194
195 // SgprDef defined inside cycle
196 MachineCycle *FromCycle = CI->getCycle(SgprDef->getParent());
197 if (FromCycle == nullptr)
198 continue;
199
200 MachineCycle *ToCycle = CI->getCycle(SuccToSinkTo);
201 // Check if there is a FromCycle that contains SgprDef's basic block but
202 // does not contain SuccToSinkTo and also has a divergent exit condition.
203 while (FromCycle && !FromCycle->contains(ToCycle)) {
204 SmallVector<MachineBasicBlock *, 1> ExitingBlocks;
205 FromCycle->getExitingBlocks(ExitingBlocks);
206
207 // FromCycle has divergent exit condition.
208 for (MachineBasicBlock *ExitingBlock : ExitingBlocks) {
209 if (hasDivergentBranch(ExitingBlock))
210 return false;
211 }
212
213 FromCycle = FromCycle->getParentCycle();
214 }
215 }
216 }
217
218 return true;
219}
220
221bool SIInstrInfo::areLoadsFromSameBasePtr(SDNode *Load0, SDNode *Load1,
222 int64_t &Offset0,
223 int64_t &Offset1) const {
224 if (!Load0->isMachineOpcode() || !Load1->isMachineOpcode())
225 return false;
226
227 unsigned Opc0 = Load0->getMachineOpcode();
228 unsigned Opc1 = Load1->getMachineOpcode();
229
230 // Make sure both are actually loads.
231 if (!get(Opc0).mayLoad() || !get(Opc1).mayLoad())
232 return false;
233
234 // A mayLoad instruction without a def is not a load. Likely a prefetch.
235 if (!get(Opc0).getNumDefs() || !get(Opc1).getNumDefs())
236 return false;
237
238 if (isDS(Opc0) && isDS(Opc1)) {
239
240 // FIXME: Handle this case:
241 if (getNumOperandsNoGlue(Load0) != getNumOperandsNoGlue(Load1))
242 return false;
243
244 // Check base reg.
245 if (Load0->getOperand(0) != Load1->getOperand(0))
246 return false;
247
248 // Skip read2 / write2 variants for simplicity.
249 // TODO: We should report true if the used offsets are adjacent (excluding
250 // st64 versions).
251 int Offset0Idx = AMDGPU::getNamedOperandIdx(Opc0, AMDGPU::OpName::offset);
252 int Offset1Idx = AMDGPU::getNamedOperandIdx(Opc1, AMDGPU::OpName::offset);
253 if (Offset0Idx == -1 || Offset1Idx == -1)
254 return false;
255
256 // XXX - be careful of dataless loads
257 // getNamedOperandIdx returns the index for MachineInstrs. Since they
258 // include the output in the operand list, but SDNodes don't, we need to
259 // subtract the index by one.
260 Offset0Idx -= get(Opc0).NumDefs;
261 Offset1Idx -= get(Opc1).NumDefs;
262 Offset0 = Load0->getConstantOperandVal(Offset0Idx);
263 Offset1 = Load1->getConstantOperandVal(Offset1Idx);
264 return true;
265 }
266
267 if (isSMRD(Opc0) && isSMRD(Opc1)) {
268 // Skip time and cache invalidation instructions.
269 if (!AMDGPU::hasNamedOperand(Opc0, AMDGPU::OpName::sbase) ||
270 !AMDGPU::hasNamedOperand(Opc1, AMDGPU::OpName::sbase))
271 return false;
272
273 unsigned NumOps = getNumOperandsNoGlue(Load0);
274 if (NumOps != getNumOperandsNoGlue(Load1))
275 return false;
276
277 // Check base reg.
278 if (Load0->getOperand(0) != Load1->getOperand(0))
279 return false;
280
281 // Match register offsets, if both register and immediate offsets present.
282 assert(NumOps == 4 || NumOps == 5);
283 if (NumOps == 5 && Load0->getOperand(1) != Load1->getOperand(1))
284 return false;
285
286 const ConstantSDNode *Load0Offset =
287 dyn_cast<ConstantSDNode>(Load0->getOperand(NumOps - 3));
288 const ConstantSDNode *Load1Offset =
289 dyn_cast<ConstantSDNode>(Load1->getOperand(NumOps - 3));
290
291 if (!Load0Offset || !Load1Offset)
292 return false;
293
294 Offset0 = Load0Offset->getZExtValue();
295 Offset1 = Load1Offset->getZExtValue();
296 return true;
297 }
298
299 // MUBUF and MTBUF can access the same addresses.
300 if ((isMUBUF(Opc0) || isMTBUF(Opc0)) && (isMUBUF(Opc1) || isMTBUF(Opc1))) {
301
302 // MUBUF and MTBUF have vaddr at different indices.
303 if (!nodesHaveSameOperandValue(Load0, Load1, AMDGPU::OpName::soffset) ||
304 !nodesHaveSameOperandValue(Load0, Load1, AMDGPU::OpName::vaddr) ||
305 !nodesHaveSameOperandValue(Load0, Load1, AMDGPU::OpName::srsrc))
306 return false;
307
308 int OffIdx0 = AMDGPU::getNamedOperandIdx(Opc0, AMDGPU::OpName::offset);
309 int OffIdx1 = AMDGPU::getNamedOperandIdx(Opc1, AMDGPU::OpName::offset);
310
311 if (OffIdx0 == -1 || OffIdx1 == -1)
312 return false;
313
314 // getNamedOperandIdx returns the index for MachineInstrs. Since they
315 // include the output in the operand list, but SDNodes don't, we need to
316 // subtract the index by one.
317 OffIdx0 -= get(Opc0).NumDefs;
318 OffIdx1 -= get(Opc1).NumDefs;
319
320 SDValue Off0 = Load0->getOperand(OffIdx0);
321 SDValue Off1 = Load1->getOperand(OffIdx1);
322
323 // The offset might be a FrameIndexSDNode.
324 if (!isa<ConstantSDNode>(Off0) || !isa<ConstantSDNode>(Off1))
325 return false;
326
327 Offset0 = Off0->getAsZExtVal();
328 Offset1 = Off1->getAsZExtVal();
329 return true;
330 }
331
332 return false;
333}
334
335static bool isStride64(unsigned Opc) {
336 switch (Opc) {
337 case AMDGPU::DS_READ2ST64_B32:
338 case AMDGPU::DS_READ2ST64_B64:
339 case AMDGPU::DS_WRITE2ST64_B32:
340 case AMDGPU::DS_WRITE2ST64_B64:
341 return true;
342 default:
343 return false;
344 }
345}
346
347bool SIInstrInfo::getMemOperandsWithOffsetWidth(
348 const MachineInstr &LdSt, SmallVectorImpl<const MachineOperand *> &BaseOps,
349 int64_t &Offset, bool &OffsetIsScalable, LocationSize &Width,
350 const TargetRegisterInfo *TRI) const {
351 if (!LdSt.mayLoadOrStore())
352 return false;
353
354 unsigned Opc = LdSt.getOpcode();
355 OffsetIsScalable = false;
356 const MachineOperand *BaseOp, *OffsetOp;
357 int DataOpIdx;
358
359 if (isDS(LdSt)) {
360 BaseOp = getNamedOperand(LdSt, AMDGPU::OpName::addr);
361 OffsetOp = getNamedOperand(LdSt, AMDGPU::OpName::offset);
362 if (OffsetOp) {
363 // Normal, single offset LDS instruction.
364 if (!BaseOp) {
365 // DS_CONSUME/DS_APPEND use M0 for the base address.
366 // TODO: find the implicit use operand for M0 and use that as BaseOp?
367 return false;
368 }
369 BaseOps.push_back(BaseOp);
370 Offset = OffsetOp->getImm();
371 // Get appropriate operand, and compute width accordingly.
372 DataOpIdx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::vdst);
373 if (DataOpIdx == -1)
374 DataOpIdx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::data0);
375 if (Opc == AMDGPU::DS_ATOMIC_ASYNC_BARRIER_ARRIVE_B64)
376 Width = LocationSize::precise(64);
377 else
378 Width = LocationSize::precise(getOpSize(LdSt, DataOpIdx));
379 } else {
380 // The 2 offset instructions use offset0 and offset1 instead. We can treat
381 // these as a load with a single offset if the 2 offsets are consecutive.
382 // We will use this for some partially aligned loads.
383 const MachineOperand *Offset0Op =
384 getNamedOperand(LdSt, AMDGPU::OpName::offset0);
385 const MachineOperand *Offset1Op =
386 getNamedOperand(LdSt, AMDGPU::OpName::offset1);
387
388 unsigned Offset0 = Offset0Op->getImm() & 0xff;
389 unsigned Offset1 = Offset1Op->getImm() & 0xff;
390 if (Offset0 + 1 != Offset1)
391 return false;
392
393 // Each of these offsets is in element sized units, so we need to convert
394 // to bytes of the individual reads.
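// (Illustrative example: a ds_read2_b32 with offset0 = 2 and offset1 = 3 and
// a 4-byte element size is reported as a single 8-byte access at byte
// offset 8.)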
395
396 unsigned EltSize;
397 if (LdSt.mayLoad())
398 EltSize = TRI->getRegSizeInBits(*getOpRegClass(LdSt, 0)) / 16;
399 else {
400 assert(LdSt.mayStore());
401 int Data0Idx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::data0);
402 EltSize = TRI->getRegSizeInBits(*getOpRegClass(LdSt, Data0Idx)) / 8;
403 }
404
405 if (isStride64(Opc))
406 EltSize *= 64;
407
408 BaseOps.push_back(BaseOp);
409 Offset = EltSize * Offset0;
410 // Get appropriate operand(s), and compute width accordingly.
411 DataOpIdx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::vdst);
412 if (DataOpIdx == -1) {
413 DataOpIdx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::data0);
414 Width = LocationSize::precise(getOpSize(LdSt, DataOpIdx));
415 DataOpIdx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::data1);
416 Width = LocationSize::precise(
417 Width.getValue() + TypeSize::getFixed(getOpSize(LdSt, DataOpIdx)));
418 } else {
419 Width = LocationSize::precise(getOpSize(LdSt, DataOpIdx));
420 }
421 }
422 return true;
423 }
424
425 if (isMUBUF(LdSt) || isMTBUF(LdSt)) {
426 const MachineOperand *RSrc = getNamedOperand(LdSt, AMDGPU::OpName::srsrc);
427 if (!RSrc) // e.g. BUFFER_WBINVL1_VOL
428 return false;
429 BaseOps.push_back(RSrc);
430 BaseOp = getNamedOperand(LdSt, AMDGPU::OpName::vaddr);
431 if (BaseOp && !BaseOp->isFI())
432 BaseOps.push_back(BaseOp);
433 const MachineOperand *OffsetImm =
434 getNamedOperand(LdSt, AMDGPU::OpName::offset);
435 Offset = OffsetImm->getImm();
436 const MachineOperand *SOffset =
437 getNamedOperand(LdSt, AMDGPU::OpName::soffset);
438 if (SOffset) {
439 if (SOffset->isReg())
440 BaseOps.push_back(SOffset);
441 else
442 Offset += SOffset->getImm();
443 }
444 // Get appropriate operand, and compute width accordingly.
445 DataOpIdx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::vdst);
446 if (DataOpIdx == -1)
447 DataOpIdx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::vdata);
448 if (DataOpIdx == -1) // LDS DMA
449 return false;
450 Width = LocationSize::precise(getOpSize(LdSt, DataOpIdx));
451 return true;
452 }
453
454 if (isImage(LdSt)) {
455 auto RsrcOpName =
456 isMIMG(LdSt) ? AMDGPU::OpName::srsrc : AMDGPU::OpName::rsrc;
457 int SRsrcIdx = AMDGPU::getNamedOperandIdx(Opc, RsrcOpName);
458 BaseOps.push_back(&LdSt.getOperand(SRsrcIdx));
459 int VAddr0Idx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::vaddr0);
460 if (VAddr0Idx >= 0) {
461 // GFX10 possible NSA encoding.
462 for (int I = VAddr0Idx; I < SRsrcIdx; ++I)
463 BaseOps.push_back(&LdSt.getOperand(I));
464 } else {
465 BaseOps.push_back(getNamedOperand(LdSt, AMDGPU::OpName::vaddr));
466 }
467 Offset = 0;
468 // Get appropriate operand, and compute width accordingly.
469 DataOpIdx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::vdata);
470 if (DataOpIdx == -1)
471 return false; // no return sampler
472 Width = LocationSize::precise(getOpSize(LdSt, DataOpIdx));
473 return true;
474 }
475
476 if (isSMRD(LdSt)) {
477 BaseOp = getNamedOperand(LdSt, AMDGPU::OpName::sbase);
478 if (!BaseOp) // e.g. S_MEMTIME
479 return false;
480 BaseOps.push_back(BaseOp);
481 OffsetOp = getNamedOperand(LdSt, AMDGPU::OpName::offset);
482 Offset = OffsetOp ? OffsetOp->getImm() : 0;
483 // Get appropriate operand, and compute width accordingly.
484 DataOpIdx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::sdst);
485 if (DataOpIdx == -1)
486 return false;
487 Width = LocationSize::precise(getOpSize(LdSt, DataOpIdx));
488 return true;
489 }
490
491 if (isFLAT(LdSt)) {
492 // Instructions have either vaddr or saddr or both or none.
493 BaseOp = getNamedOperand(LdSt, AMDGPU::OpName::vaddr);
494 if (BaseOp)
495 BaseOps.push_back(BaseOp);
496 BaseOp = getNamedOperand(LdSt, AMDGPU::OpName::saddr);
497 if (BaseOp)
498 BaseOps.push_back(BaseOp);
499 Offset = getNamedOperand(LdSt, AMDGPU::OpName::offset)->getImm();
500 // Get appropriate operand, and compute width accordingly.
501 DataOpIdx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::vdst);
502 if (DataOpIdx == -1)
503 DataOpIdx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::vdata);
504 if (DataOpIdx == -1) // LDS DMA
505 return false;
506 Width = LocationSize::precise(getOpSize(LdSt, DataOpIdx));
507 return true;
508 }
509
510 return false;
511}
512
513static bool memOpsHaveSameBasePtr(const MachineInstr &MI1,
514 ArrayRef<const MachineOperand *> BaseOps1,
515 const MachineInstr &MI2,
516 ArrayRef<const MachineOperand *> BaseOps2) {
517 // Only examine the first "base" operand of each instruction, on the
518 // assumption that it represents the real base address of the memory access.
519 // Other operands are typically offsets or indices from this base address.
520 if (BaseOps1.front()->isIdenticalTo(*BaseOps2.front()))
521 return true;
522
523 if (!MI1.hasOneMemOperand() || !MI2.hasOneMemOperand())
524 return false;
525
526 auto *MO1 = *MI1.memoperands_begin();
527 auto *MO2 = *MI2.memoperands_begin();
528 if (MO1->getAddrSpace() != MO2->getAddrSpace())
529 return false;
530
531 const auto *Base1 = MO1->getValue();
532 const auto *Base2 = MO2->getValue();
533 if (!Base1 || !Base2)
534 return false;
535 Base1 = getUnderlyingObject(Base1);
536 Base2 = getUnderlyingObject(Base2);
537
538 if (isa<UndefValue>(Base1) || isa<UndefValue>(Base2))
539 return false;
540
541 return Base1 == Base2;
542}
543
544bool SIInstrInfo::shouldClusterMemOps(ArrayRef<const MachineOperand *> BaseOps1,
545 int64_t Offset1, bool OffsetIsScalable1,
546 ArrayRef<const MachineOperand *> BaseOps2,
547 int64_t Offset2, bool OffsetIsScalable2,
548 unsigned ClusterSize,
549 unsigned NumBytes) const {
550 // If the mem ops (to be clustered) do not have the same base ptr, then they
551 // should not be clustered
552 unsigned MaxMemoryClusterDWords = DefaultMemoryClusterDWordsLimit;
553 if (!BaseOps1.empty() && !BaseOps2.empty()) {
554 const MachineInstr &FirstLdSt = *BaseOps1.front()->getParent();
555 const MachineInstr &SecondLdSt = *BaseOps2.front()->getParent();
556 if (!memOpsHaveSameBasePtr(FirstLdSt, BaseOps1, SecondLdSt, BaseOps2))
557 return false;
558
559 const SIMachineFunctionInfo *MFI =
560 FirstLdSt.getMF()->getInfo<SIMachineFunctionInfo>();
561 MaxMemoryClusterDWords = MFI->getMaxMemoryClusterDWords();
562 } else if (!BaseOps1.empty() || !BaseOps2.empty()) {
563 // If only one base op is empty, they do not have the same base ptr
564 return false;
565 }
566
567 // In order to avoid register pressure, on average, the number of DWORDs
568 // loaded together by all clustered mem ops should not exceed
569 // MaxMemoryClusterDWords. This is an empirical value based on certain
570 // observations and performance-related experiments.
571 // The good thing about this heuristic is - it avoids clustering of too many
572 // sub-word loads, and also avoids clustering of wide loads. Below is the
573 // brief summary of how the heuristic behaves for various `LoadSize` when
574 // MaxMemoryClusterDWords is 8.
575 //
576 // (1) 1 <= LoadSize <= 4: cluster at max 8 mem ops
577 // (2) 5 <= LoadSize <= 8: cluster at max 4 mem ops
578 // (3) 9 <= LoadSize <= 12: cluster at max 2 mem ops
579 // (4) 13 <= LoadSize <= 16: cluster at max 2 mem ops
580 // (5) LoadSize >= 17: do not cluster
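// For example (hypothetical numbers): with ClusterSize = 4 and NumBytes = 48,
// LoadSize is 12 and NumDWords is 3 * 4 = 12, which exceeds the default limit
// of 8, so such ops would not be clustered.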
581 const unsigned LoadSize = NumBytes / ClusterSize;
582 const unsigned NumDWords = ((LoadSize + 3) / 4) * ClusterSize;
583 return NumDWords <= MaxMemoryClusterDWords;
584}
585
586// FIXME: This behaves strangely. If, for example, you have 32 load + stores,
587// the first 16 loads will be interleaved with the stores, and the next 16 will
588 // be clustered as expected. It should really split into two batches of 16 stores.
589//
590// Loads are clustered until this returns false, rather than trying to schedule
591 // groups of stores. This also means we have to decide whether loads from
592 // different address spaces should be clustered, and whether to cluster ones
593 // which might cause bank conflicts.
594//
595// This might be deprecated so it might not be worth that much effort to fix.
596bool SIInstrInfo::shouldScheduleLoadsNear(SDNode *Load0, SDNode *Load1,
597 int64_t Offset0, int64_t Offset1,
598 unsigned NumLoads) const {
599 assert(Offset1 > Offset0 &&
600 "Second offset should be larger than first offset!");
601 // If we have less than 16 loads in a row, and the offsets are within 64
602 // bytes, then schedule together.
603
604 // A cacheline is 64 bytes (for global memory).
605 return (NumLoads <= 16 && (Offset1 - Offset0) < 64);
606}
607
608static void reportIllegalCopy(const SIInstrInfo *TII, MachineBasicBlock &MBB,
609 MachineBasicBlock::iterator MI,
610 const DebugLoc &DL, MCRegister DestReg,
611 MCRegister SrcReg, bool KillSrc,
612 const char *Msg = "illegal VGPR to SGPR copy") {
613 MachineFunction *MF = MBB.getParent();
614
615 LLVMContext &C = MF->getFunction().getContext();
616 C.diagnose(DiagnosticInfoUnsupported(MF->getFunction(), Msg, DL, DS_Error));
617
618 BuildMI(MBB, MI, DL, TII->get(AMDGPU::SI_ILLEGAL_COPY), DestReg)
619 .addReg(SrcReg, getKillRegState(KillSrc));
620}
621
622/// Handle copying from SGPR to AGPR, or from AGPR to AGPR on GFX908. It is not
623/// possible to have a direct copy in these cases on GFX908, so an intermediate
624/// VGPR copy is required.
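/// An "$agpr0 = COPY $sgpr0" on GFX908 is therefore expanded roughly as
///   $vgprN = V_MOV_B32_e32 $sgpr0
///   $agpr0 = V_ACCVGPR_WRITE_B32_e64 $vgprN
/// where $vgprN is the reserved (or a scavenged) intermediate VGPR; the
/// registers shown are illustrative only.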
625static void indirectCopyToAGPR(const SIInstrInfo &TII,
626 MachineBasicBlock &MBB,
627 MachineBasicBlock::iterator MI,
628 const DebugLoc &DL, MCRegister DestReg,
629 MCRegister SrcReg, bool KillSrc,
630 RegScavenger &RS, bool RegsOverlap,
631 Register ImpDefSuperReg = Register(),
632 Register ImpUseSuperReg = Register()) {
633 assert((TII.getSubtarget().hasMAIInsts() &&
634 !TII.getSubtarget().hasGFX90AInsts()) &&
635 "Expected GFX908 subtarget.");
636
637 assert((AMDGPU::SReg_32RegClass.contains(SrcReg) ||
638 AMDGPU::AGPR_32RegClass.contains(SrcReg)) &&
639 "Source register of the copy should be either an SGPR or an AGPR.");
640
641 assert(AMDGPU::AGPR_32RegClass.contains(DestReg) &&
642 "Destination register of the copy should be an AGPR.");
643
644 const SIRegisterInfo &RI = TII.getRegisterInfo();
645
646 // First try to find defining accvgpr_write to avoid temporary registers.
647 // In the case of copies of overlapping AGPRs, we conservatively do not
648 // reuse previous accvgpr_writes. Otherwise, we may incorrectly pick up
649 // an accvgpr_write used for this same copy due to implicit-defs
650 if (!RegsOverlap) {
651 for (auto Def = MI, E = MBB.begin(); Def != E; ) {
652 --Def;
653
654 if (!Def->modifiesRegister(SrcReg, &RI))
655 continue;
656
657 if (Def->getOpcode() != AMDGPU::V_ACCVGPR_WRITE_B32_e64 ||
658 Def->getOperand(0).getReg() != SrcReg)
659 break;
660
661 MachineOperand &DefOp = Def->getOperand(1);
662 assert(DefOp.isReg() || DefOp.isImm());
663
664 if (DefOp.isReg()) {
665 bool SafeToPropagate = true;
666 // Check that register source operand is not clobbered before MI.
667 // Immediate operands are always safe to propagate.
668 for (auto I = Def; I != MI && SafeToPropagate; ++I)
669 if (I->modifiesRegister(DefOp.getReg(), &RI))
670 SafeToPropagate = false;
671
672 if (!SafeToPropagate)
673 break;
674
675 for (auto I = Def; I != MI; ++I)
676 I->clearRegisterKills(DefOp.getReg(), &RI);
677 }
678
679 MachineInstrBuilder Builder =
680 BuildMI(MBB, MI, DL, TII.get(AMDGPU::V_ACCVGPR_WRITE_B32_e64), DestReg)
681 .add(DefOp);
682 if (ImpDefSuperReg)
683 Builder.addReg(ImpDefSuperReg, RegState::Define | RegState::Implicit);
684
685 if (ImpUseSuperReg) {
686 Builder.addReg(ImpUseSuperReg,
687 getKillRegState(KillSrc) | RegState::Implicit);
688 }
689
690 return;
691 }
692 }
693
694 RS.enterBasicBlockEnd(MBB);
695 RS.backward(std::next(MI));
696
697 // Ideally we want to have three registers for a long reg_sequence copy
698 // to hide 2 waitstates between v_mov_b32 and accvgpr_write.
699 unsigned MaxVGPRs = RI.getRegPressureLimit(&AMDGPU::VGPR_32RegClass,
700 *MBB.getParent());
701
702 // Registers in the sequence are allocated contiguously so we can just
703 // use register number to pick one of three round-robin temps.
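// (e.g. copies into AGPR0, AGPR1, AGPR2, AGPR3, ... cycle through temp slots
// 0, 1, 2, 0, ... so back-to-back copies do not all reuse the same
// intermediate VGPR.)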
704 unsigned RegNo = (DestReg - AMDGPU::AGPR0) % 3;
705 Register Tmp =
706 MBB.getParent()->getInfo<SIMachineFunctionInfo>()->getVGPRForAGPRCopy();
707 assert(MBB.getParent()->getRegInfo().isReserved(Tmp) &&
708 "VGPR used for an intermediate copy should have been reserved.");
709
710 // Only loop through if there are any free registers left. We don't want to
711 // spill.
712 while (RegNo--) {
713 Register Tmp2 = RS.scavengeRegisterBackwards(AMDGPU::VGPR_32RegClass, MI,
714 /* RestoreAfter */ false, 0,
715 /* AllowSpill */ false);
716 if (!Tmp2 || RI.getHWRegIndex(Tmp2) >= MaxVGPRs)
717 break;
718 Tmp = Tmp2;
719 RS.setRegUsed(Tmp);
720 }
721
722 // Insert copy to temporary VGPR.
723 unsigned TmpCopyOp = AMDGPU::V_MOV_B32_e32;
724 if (AMDGPU::AGPR_32RegClass.contains(SrcReg)) {
725 TmpCopyOp = AMDGPU::V_ACCVGPR_READ_B32_e64;
726 } else {
727 assert(AMDGPU::SReg_32RegClass.contains(SrcReg));
728 }
729
730 MachineInstrBuilder UseBuilder = BuildMI(MBB, MI, DL, TII.get(TmpCopyOp), Tmp)
731 .addReg(SrcReg, getKillRegState(KillSrc));
732 if (ImpUseSuperReg) {
733 UseBuilder.addReg(ImpUseSuperReg,
734 getKillRegState(KillSrc) | RegState::Implicit);
735 }
736
737 MachineInstrBuilder DefBuilder
738 = BuildMI(MBB, MI, DL, TII.get(AMDGPU::V_ACCVGPR_WRITE_B32_e64), DestReg)
739 .addReg(Tmp, RegState::Kill);
740
741 if (ImpDefSuperReg)
742 DefBuilder.addReg(ImpDefSuperReg, RegState::Define | RegState::Implicit);
743}
744
745static void expandSGPRCopy(const SIInstrInfo &TII, MachineBasicBlock &MBB,
746 MachineBasicBlock::iterator MI, const DebugLoc &DL,
747 MCRegister DestReg, MCRegister SrcReg, bool KillSrc,
748 const TargetRegisterClass *RC, bool Forward) {
749 const SIRegisterInfo &RI = TII.getRegisterInfo();
750 ArrayRef<int16_t> BaseIndices = RI.getRegSplitParts(RC, 4);
751 MachineBasicBlock::iterator I = MI;
752 MachineInstr *FirstMI = nullptr, *LastMI = nullptr;
753
754 for (unsigned Idx = 0; Idx < BaseIndices.size(); ++Idx) {
755 int16_t SubIdx = BaseIndices[Idx];
756 Register DestSubReg = RI.getSubReg(DestReg, SubIdx);
757 Register SrcSubReg = RI.getSubReg(SrcReg, SubIdx);
758 assert(DestSubReg && SrcSubReg && "Failed to find subregs!");
759 unsigned Opcode = AMDGPU::S_MOV_B32;
760
761 // Is SGPR aligned? If so try to combine with next.
762 bool AlignedDest = ((DestSubReg - AMDGPU::SGPR0) % 2) == 0;
763 bool AlignedSrc = ((SrcSubReg - AMDGPU::SGPR0) % 2) == 0;
764 if (AlignedDest && AlignedSrc && (Idx + 1 < BaseIndices.size())) {
765 // Can use SGPR64 copy
766 unsigned Channel = RI.getChannelFromSubReg(SubIdx);
767 SubIdx = RI.getSubRegFromChannel(Channel, 2);
768 DestSubReg = RI.getSubReg(DestReg, SubIdx);
769 SrcSubReg = RI.getSubReg(SrcReg, SubIdx);
770 assert(DestSubReg && SrcSubReg && "Failed to find subregs!");
771 Opcode = AMDGPU::S_MOV_B64;
772 Idx++;
773 }
774
775 LastMI = BuildMI(MBB, I, DL, TII.get(Opcode), DestSubReg)
776 .addReg(SrcSubReg)
777 .addReg(SrcReg, RegState::Implicit);
778
779 if (!FirstMI)
780 FirstMI = LastMI;
781
782 if (!Forward)
783 I--;
784 }
785
786 assert(FirstMI && LastMI);
787 if (!Forward)
788 std::swap(FirstMI, LastMI);
789
790 FirstMI->addOperand(
791 MachineOperand::CreateReg(DestReg, true /*IsDef*/, true /*IsImp*/));
792
793 if (KillSrc)
794 LastMI->addRegisterKilled(SrcReg, &RI);
795}
796
797void SIInstrInfo::copyPhysReg(MachineBasicBlock &MBB,
798 MachineBasicBlock::iterator MI,
799 const DebugLoc &DL, Register DestReg,
800 Register SrcReg, bool KillSrc, bool RenamableDest,
801 bool RenamableSrc) const {
802 const TargetRegisterClass *RC = RI.getPhysRegBaseClass(DestReg);
803 unsigned Size = RI.getRegSizeInBits(*RC);
804 const TargetRegisterClass *SrcRC = RI.getPhysRegBaseClass(SrcReg);
805 unsigned SrcSize = RI.getRegSizeInBits(*SrcRC);
806
807 // The rest of copyPhysReg assumes Src and Dst size are the same size.
808 // TODO-GFX11_16BIT If all true 16 bit instruction patterns are completed can
809 // we remove Fix16BitCopies and this code block?
810 if (Fix16BitCopies) {
811 if (((Size == 16) != (SrcSize == 16))) {
812 // Non-VGPR Src and Dst will later be expanded back to 32 bits.
813 assert(ST.useRealTrue16Insts());
814 Register &RegToFix = (Size == 32) ? DestReg : SrcReg;
815 MCRegister SubReg = RI.getSubReg(RegToFix, AMDGPU::lo16);
816 RegToFix = SubReg;
817
818 if (DestReg == SrcReg) {
819 // Identity copy. Insert empty bundle since ExpandPostRA expects an
820 // instruction here.
821 BuildMI(MBB, MI, DL, get(AMDGPU::BUNDLE));
822 return;
823 }
824 RC = RI.getPhysRegBaseClass(DestReg);
825 Size = RI.getRegSizeInBits(*RC);
826 SrcRC = RI.getPhysRegBaseClass(SrcReg);
827 SrcSize = RI.getRegSizeInBits(*SrcRC);
828 }
829 }
830
831 if (RC == &AMDGPU::VGPR_32RegClass) {
832 assert(AMDGPU::VGPR_32RegClass.contains(SrcReg) ||
833 AMDGPU::SReg_32RegClass.contains(SrcReg) ||
834 AMDGPU::AGPR_32RegClass.contains(SrcReg));
835 unsigned Opc = AMDGPU::AGPR_32RegClass.contains(SrcReg) ?
836 AMDGPU::V_ACCVGPR_READ_B32_e64 : AMDGPU::V_MOV_B32_e32;
837 BuildMI(MBB, MI, DL, get(Opc), DestReg)
838 .addReg(SrcReg, getKillRegState(KillSrc));
839 return;
840 }
841
842 if (RC == &AMDGPU::SReg_32_XM0RegClass ||
843 RC == &AMDGPU::SReg_32RegClass) {
844 if (SrcReg == AMDGPU::SCC) {
845 BuildMI(MBB, MI, DL, get(AMDGPU::S_CSELECT_B32), DestReg)
846 .addImm(1)
847 .addImm(0);
848 return;
849 }
850
851 if (!AMDGPU::SReg_32RegClass.contains(SrcReg)) {
852 if (DestReg == AMDGPU::VCC_LO) {
853 // FIXME: Hack until VReg_1 removed.
854 assert(AMDGPU::VGPR_32RegClass.contains(SrcReg));
855 BuildMI(MBB, MI, DL, get(AMDGPU::V_CMP_NE_U32_e32))
856 .addImm(0)
857 .addReg(SrcReg, getKillRegState(KillSrc));
858 return;
859 }
860
861 reportIllegalCopy(this, MBB, MI, DL, DestReg, SrcReg, KillSrc);
862 return;
863 }
864
865 BuildMI(MBB, MI, DL, get(AMDGPU::S_MOV_B32), DestReg)
866 .addReg(SrcReg, getKillRegState(KillSrc));
867 return;
868 }
869
870 if (RC == &AMDGPU::SReg_64RegClass) {
871 if (SrcReg == AMDGPU::SCC) {
872 BuildMI(MBB, MI, DL, get(AMDGPU::S_CSELECT_B64), DestReg)
873 .addImm(1)
874 .addImm(0);
875 return;
876 }
877
878 if (!AMDGPU::SReg_64_EncodableRegClass.contains(SrcReg)) {
879 if (DestReg == AMDGPU::VCC) {
880 // FIXME: Hack until VReg_1 removed.
881 assert(AMDGPU::VGPR_32RegClass.contains(SrcReg));
882 BuildMI(MBB, MI, DL, get(AMDGPU::V_CMP_NE_U32_e32))
883 .addImm(0)
884 .addReg(SrcReg, getKillRegState(KillSrc));
885 return;
886 }
887
888 reportIllegalCopy(this, MBB, MI, DL, DestReg, SrcReg, KillSrc);
889 return;
890 }
891
892 BuildMI(MBB, MI, DL, get(AMDGPU::S_MOV_B64), DestReg)
893 .addReg(SrcReg, getKillRegState(KillSrc));
894 return;
895 }
896
897 if (DestReg == AMDGPU::SCC) {
898 // Copying 64-bit or 32-bit sources to SCC barely makes sense,
899 // but SelectionDAG emits such copies for i1 sources.
900 if (AMDGPU::SReg_64RegClass.contains(SrcReg)) {
901 // This copy can only be produced by patterns
902 // with explicit SCC, which are known to be enabled
903 // only for subtargets with S_CMP_LG_U64 present.
904 assert(ST.hasScalarCompareEq64());
905 BuildMI(MBB, MI, DL, get(AMDGPU::S_CMP_LG_U64))
906 .addReg(SrcReg, getKillRegState(KillSrc))
907 .addImm(0);
908 } else {
909 assert(AMDGPU::SReg_32RegClass.contains(SrcReg));
910 BuildMI(MBB, MI, DL, get(AMDGPU::S_CMP_LG_U32))
911 .addReg(SrcReg, getKillRegState(KillSrc))
912 .addImm(0);
913 }
914
915 return;
916 }
917
918 if (RC == &AMDGPU::AGPR_32RegClass) {
919 if (AMDGPU::VGPR_32RegClass.contains(SrcReg) ||
920 (ST.hasGFX90AInsts() && AMDGPU::SReg_32RegClass.contains(SrcReg))) {
921 BuildMI(MBB, MI, DL, get(AMDGPU::V_ACCVGPR_WRITE_B32_e64), DestReg)
922 .addReg(SrcReg, getKillRegState(KillSrc));
923 return;
924 }
925
926 if (AMDGPU::AGPR_32RegClass.contains(SrcReg) && ST.hasGFX90AInsts()) {
927 BuildMI(MBB, MI, DL, get(AMDGPU::V_ACCVGPR_MOV_B32), DestReg)
928 .addReg(SrcReg, getKillRegState(KillSrc));
929 return;
930 }
931
932 // FIXME: Pass should maintain scavenger to avoid scan through the block on
933 // every AGPR spill.
934 RegScavenger RS;
935 const bool Overlap = RI.regsOverlap(SrcReg, DestReg);
936 indirectCopyToAGPR(*this, MBB, MI, DL, DestReg, SrcReg, KillSrc, RS, Overlap);
937 return;
938 }
939
940 if (Size == 16) {
941 assert(AMDGPU::VGPR_16RegClass.contains(SrcReg) ||
942 AMDGPU::SReg_LO16RegClass.contains(SrcReg) ||
943 AMDGPU::AGPR_LO16RegClass.contains(SrcReg));
944
945 bool IsSGPRDst = AMDGPU::SReg_LO16RegClass.contains(DestReg);
946 bool IsSGPRSrc = AMDGPU::SReg_LO16RegClass.contains(SrcReg);
947 bool IsAGPRDst = AMDGPU::AGPR_LO16RegClass.contains(DestReg);
948 bool IsAGPRSrc = AMDGPU::AGPR_LO16RegClass.contains(SrcReg);
949 bool DstLow = !AMDGPU::isHi16Reg(DestReg, RI);
950 bool SrcLow = !AMDGPU::isHi16Reg(SrcReg, RI);
951 MCRegister NewDestReg = RI.get32BitRegister(DestReg);
952 MCRegister NewSrcReg = RI.get32BitRegister(SrcReg);
953
954 if (IsSGPRDst) {
955 if (!IsSGPRSrc) {
956 reportIllegalCopy(this, MBB, MI, DL, DestReg, SrcReg, KillSrc);
957 return;
958 }
959
960 BuildMI(MBB, MI, DL, get(AMDGPU::S_MOV_B32), NewDestReg)
961 .addReg(NewSrcReg, getKillRegState(KillSrc));
962 return;
963 }
964
965 if (IsAGPRDst || IsAGPRSrc) {
966 if (!DstLow || !SrcLow) {
967 reportIllegalCopy(this, MBB, MI, DL, DestReg, SrcReg, KillSrc,
968 "Cannot use hi16 subreg with an AGPR!");
969 }
970
971 copyPhysReg(MBB, MI, DL, NewDestReg, NewSrcReg, KillSrc);
972 return;
973 }
974
975 if (ST.useRealTrue16Insts()) {
976 if (IsSGPRSrc) {
977 assert(SrcLow);
978 SrcReg = NewSrcReg;
979 }
980 // Use the smaller instruction encoding if possible.
981 if (AMDGPU::VGPR_16_Lo128RegClass.contains(DestReg) &&
982 (IsSGPRSrc || AMDGPU::VGPR_16_Lo128RegClass.contains(SrcReg))) {
983 BuildMI(MBB, MI, DL, get(AMDGPU::V_MOV_B16_t16_e32), DestReg)
984 .addReg(SrcReg);
985 } else {
986 BuildMI(MBB, MI, DL, get(AMDGPU::V_MOV_B16_t16_e64), DestReg)
987 .addImm(0) // src0_modifiers
988 .addReg(SrcReg)
989 .addImm(0); // op_sel
990 }
991 return;
992 }
993
994 if (IsSGPRSrc && !ST.hasSDWAScalar()) {
995 if (!DstLow || !SrcLow) {
996 reportIllegalCopy(this, MBB, MI, DL, DestReg, SrcReg, KillSrc,
997 "Cannot use hi16 subreg on VI!");
998 }
999
1000 BuildMI(MBB, MI, DL, get(AMDGPU::V_MOV_B32_e32), NewDestReg)
1001 .addReg(NewSrcReg, getKillRegState(KillSrc));
1002 return;
1003 }
1004
1005 auto MIB = BuildMI(MBB, MI, DL, get(AMDGPU::V_MOV_B32_sdwa), NewDestReg)
1006 .addImm(0) // src0_modifiers
1007 .addReg(NewSrcReg)
1008 .addImm(0) // clamp
1009 .addImm(DstLow ? AMDGPU::SDWA::SdwaSel::WORD_0
1010 : AMDGPU::SDWA::SdwaSel::WORD_1)
1011 .addImm(AMDGPU::SDWA::DstUnused::UNUSED_PRESERVE)
1012 .addImm(AMDGPU::SDWA::SdwaSel::WORD_0)
1013 .addImm(SrcLow ? AMDGPU::SDWA::SdwaSel::WORD_0 : AMDGPU::SDWA::SdwaSel::WORD_1)
1014 .addReg(NewDestReg, RegState::Implicit | RegState::Undef);
1015 // First implicit operand is $exec.
1016 MIB->tieOperands(0, MIB->getNumOperands() - 1);
1017 return;
1018 }
1019
1020 if (RC == RI.getVGPR64Class() && (SrcRC == RC || RI.isSGPRClass(SrcRC))) {
1021 if (ST.hasVMovB64Inst()) {
1022 BuildMI(MBB, MI, DL, get(AMDGPU::V_MOV_B64_e32), DestReg)
1023 .addReg(SrcReg, getKillRegState(KillSrc));
1024 return;
1025 }
1026 if (ST.hasPkMovB32()) {
1027 BuildMI(MBB, MI, DL, get(AMDGPU::V_PK_MOV_B32), DestReg)
1028 .addImm(SISrcMods::OP_SEL_1)
1029 .addReg(SrcReg)
1030 .addImm(SISrcMods::OP_SEL_0 | SISrcMods::OP_SEL_1)
1031 .addReg(SrcReg)
1032 .addImm(0) // op_sel_lo
1033 .addImm(0) // op_sel_hi
1034 .addImm(0) // neg_lo
1035 .addImm(0) // neg_hi
1036 .addImm(0) // clamp
1037 .addReg(SrcReg, getKillRegState(KillSrc) | RegState::Implicit);
1038 return;
1039 }
1040 }
1041
1042 const bool Forward = RI.getHWRegIndex(DestReg) <= RI.getHWRegIndex(SrcReg);
1043 if (RI.isSGPRClass(RC)) {
1044 if (!RI.isSGPRClass(SrcRC)) {
1045 reportIllegalCopy(this, MBB, MI, DL, DestReg, SrcReg, KillSrc);
1046 return;
1047 }
1048 const bool CanKillSuperReg = KillSrc && !RI.regsOverlap(SrcReg, DestReg);
1049 expandSGPRCopy(*this, MBB, MI, DL, DestReg, SrcReg, CanKillSuperReg, RC,
1050 Forward);
1051 return;
1052 }
1053
1054 unsigned EltSize = 4;
1055 unsigned Opcode = AMDGPU::V_MOV_B32_e32;
1056 if (RI.isAGPRClass(RC)) {
1057 if (ST.hasGFX90AInsts() && RI.isAGPRClass(SrcRC))
1058 Opcode = AMDGPU::V_ACCVGPR_MOV_B32;
1059 else if (RI.hasVGPRs(SrcRC) ||
1060 (ST.hasGFX90AInsts() && RI.isSGPRClass(SrcRC)))
1061 Opcode = AMDGPU::V_ACCVGPR_WRITE_B32_e64;
1062 else
1063 Opcode = AMDGPU::INSTRUCTION_LIST_END;
1064 } else if (RI.hasVGPRs(RC) && RI.isAGPRClass(SrcRC)) {
1065 Opcode = AMDGPU::V_ACCVGPR_READ_B32_e64;
1066 } else if ((Size % 64 == 0) && RI.hasVGPRs(RC) &&
1067 (RI.isProperlyAlignedRC(*RC) &&
1068 (SrcRC == RC || RI.isSGPRClass(SrcRC)))) {
1069 // TODO: In 96-bit case, could do a 64-bit mov and then a 32-bit mov.
1070 if (ST.hasVMovB64Inst()) {
1071 Opcode = AMDGPU::V_MOV_B64_e32;
1072 EltSize = 8;
1073 } else if (ST.hasPkMovB32()) {
1074 Opcode = AMDGPU::V_PK_MOV_B32;
1075 EltSize = 8;
1076 }
1077 }
1078
1079 // For the cases where we need an intermediate instruction/temporary register
1080 // (destination is an AGPR), we need a scavenger.
1081 //
1082 // FIXME: The pass should maintain this for us so we don't have to re-scan the
1083 // whole block for every handled copy.
1084 std::unique_ptr<RegScavenger> RS;
1085 if (Opcode == AMDGPU::INSTRUCTION_LIST_END)
1086 RS = std::make_unique<RegScavenger>();
1087
1088 ArrayRef<int16_t> SubIndices = RI.getRegSplitParts(RC, EltSize);
1089
1090 // If there is an overlap, we can't kill the super-register on the last
1091 // instruction, since it will also kill the components made live by this def.
1092 const bool Overlap = RI.regsOverlap(SrcReg, DestReg);
1093 const bool CanKillSuperReg = KillSrc && !Overlap;
1094
1095 for (unsigned Idx = 0; Idx < SubIndices.size(); ++Idx) {
1096 unsigned SubIdx;
1097 if (Forward)
1098 SubIdx = SubIndices[Idx];
1099 else
1100 SubIdx = SubIndices[SubIndices.size() - Idx - 1];
1101 Register DestSubReg = RI.getSubReg(DestReg, SubIdx);
1102 Register SrcSubReg = RI.getSubReg(SrcReg, SubIdx);
1103 assert(DestSubReg && SrcSubReg && "Failed to find subregs!");
1104
1105 bool IsFirstSubreg = Idx == 0;
1106 bool UseKill = CanKillSuperReg && Idx == SubIndices.size() - 1;
1107
1108 if (Opcode == AMDGPU::INSTRUCTION_LIST_END) {
1109 Register ImpDefSuper = IsFirstSubreg ? Register(DestReg) : Register();
1110 Register ImpUseSuper = SrcReg;
1111 indirectCopyToAGPR(*this, MBB, MI, DL, DestSubReg, SrcSubReg, UseKill,
1112 *RS, Overlap, ImpDefSuper, ImpUseSuper);
1113 } else if (Opcode == AMDGPU::V_PK_MOV_B32) {
1114 MachineInstrBuilder MIB =
1115 BuildMI(MBB, MI, DL, get(AMDGPU::V_PK_MOV_B32), DestSubReg)
1116 .addImm(SISrcMods::OP_SEL_1)
1117 .addReg(SrcSubReg)
1118 .addImm(SISrcMods::OP_SEL_0 | SISrcMods::OP_SEL_1)
1119 .addReg(SrcSubReg)
1120 .addImm(0) // op_sel_lo
1121 .addImm(0) // op_sel_hi
1122 .addImm(0) // neg_lo
1123 .addImm(0) // neg_hi
1124 .addImm(0) // clamp
1125 .addReg(SrcReg, getKillRegState(UseKill) | RegState::Implicit);
1126 if (IsFirstSubreg)
1127 MIB.addReg(DestReg, RegState::Define | RegState::Implicit);
1128 } else {
1129 MachineInstrBuilder Builder =
1130 BuildMI(MBB, MI, DL, get(Opcode), DestSubReg).addReg(SrcSubReg);
1131 if (IsFirstSubreg)
1132 Builder.addReg(DestReg, RegState::Define | RegState::Implicit);
1133
1134 Builder.addReg(SrcReg, getKillRegState(UseKill) | RegState::Implicit);
1135 }
1136 }
1137}
1138
1139int SIInstrInfo::commuteOpcode(unsigned Opcode) const {
1140 int32_t NewOpc;
1141
1142 // Try to map original to commuted opcode
1143 NewOpc = AMDGPU::getCommuteRev(Opcode);
1144 if (NewOpc != -1)
1145 // Check if the commuted (REV) opcode exists on the target.
1146 return pseudoToMCOpcode(NewOpc) != -1 ? NewOpc : -1;
1147
1148 // Try to map commuted to original opcode
1149 NewOpc = AMDGPU::getCommuteOrig(Opcode);
1150 if (NewOpc != -1)
1151 // Check if the original (non-REV) opcode exists on the target.
1152 return pseudoToMCOpcode(NewOpc) != -1 ? NewOpc : -1;
1153
1154 return Opcode;
1155}
1156
1157const TargetRegisterClass *
1159 return &AMDGPU::VGPR_32RegClass;
1160}
1161
1162void SIInstrInfo::insertVectorSelect(MachineBasicBlock &MBB,
1163 MachineBasicBlock::iterator I,
1164 const DebugLoc &DL, Register DstReg,
1165 ArrayRef<MachineOperand> Cond,
1166 Register TrueReg,
1167 Register FalseReg) const {
1168 MachineRegisterInfo &MRI = MBB.getParent()->getRegInfo();
1169 const TargetRegisterClass *BoolXExecRC = RI.getWaveMaskRegClass();
1171 assert(MRI.getRegClass(DstReg) == &AMDGPU::VGPR_32RegClass &&
1172 "Not a VGPR32 reg");
1173
1174 if (Cond.size() == 1) {
1175 Register SReg = MRI.createVirtualRegister(BoolXExecRC);
1176 BuildMI(MBB, I, DL, get(AMDGPU::COPY), SReg)
1177 .add(Cond[0]);
1178 BuildMI(MBB, I, DL, get(AMDGPU::V_CNDMASK_B32_e64), DstReg)
1179 .addImm(0)
1180 .addReg(FalseReg)
1181 .addImm(0)
1182 .addReg(TrueReg)
1183 .addReg(SReg);
1184 } else if (Cond.size() == 2) {
1185 assert(Cond[0].isImm() && "Cond[0] is not an immediate");
1186 switch (Cond[0].getImm()) {
1187 case SIInstrInfo::SCC_TRUE: {
1188 Register SReg = MRI.createVirtualRegister(BoolXExecRC);
1189 BuildMI(MBB, I, DL, get(LMC.CSelectOpc), SReg).addImm(1).addImm(0);
1190 BuildMI(MBB, I, DL, get(AMDGPU::V_CNDMASK_B32_e64), DstReg)
1191 .addImm(0)
1192 .addReg(FalseReg)
1193 .addImm(0)
1194 .addReg(TrueReg)
1195 .addReg(SReg);
1196 break;
1197 }
1198 case SIInstrInfo::SCC_FALSE: {
1199 Register SReg = MRI.createVirtualRegister(BoolXExecRC);
1200 BuildMI(MBB, I, DL, get(LMC.CSelectOpc), SReg).addImm(0).addImm(1);
1201 BuildMI(MBB, I, DL, get(AMDGPU::V_CNDMASK_B32_e64), DstReg)
1202 .addImm(0)
1203 .addReg(FalseReg)
1204 .addImm(0)
1205 .addReg(TrueReg)
1206 .addReg(SReg);
1207 break;
1208 }
1209 case SIInstrInfo::VCCNZ: {
1210 MachineOperand RegOp = Cond[1];
1211 RegOp.setImplicit(false);
1212 Register SReg = MRI.createVirtualRegister(BoolXExecRC);
1213 BuildMI(MBB, I, DL, get(AMDGPU::COPY), SReg)
1214 .add(RegOp);
1215 BuildMI(MBB, I, DL, get(AMDGPU::V_CNDMASK_B32_e64), DstReg)
1216 .addImm(0)
1217 .addReg(FalseReg)
1218 .addImm(0)
1219 .addReg(TrueReg)
1220 .addReg(SReg);
1221 break;
1222 }
1223 case SIInstrInfo::VCCZ: {
1224 MachineOperand RegOp = Cond[1];
1225 RegOp.setImplicit(false);
1226 Register SReg = MRI.createVirtualRegister(BoolXExecRC);
1227 BuildMI(MBB, I, DL, get(AMDGPU::COPY), SReg)
1228 .add(RegOp);
1229 BuildMI(MBB, I, DL, get(AMDGPU::V_CNDMASK_B32_e64), DstReg)
1230 .addImm(0)
1231 .addReg(TrueReg)
1232 .addImm(0)
1233 .addReg(FalseReg)
1234 .addReg(SReg);
1235 break;
1236 }
1237 case SIInstrInfo::EXECNZ: {
1238 Register SReg = MRI.createVirtualRegister(BoolXExecRC);
1239 Register SReg2 = MRI.createVirtualRegister(RI.getBoolRC());
1240 BuildMI(MBB, I, DL, get(LMC.OrSaveExecOpc), SReg2).addImm(0);
1241 BuildMI(MBB, I, DL, get(LMC.CSelectOpc), SReg).addImm(1).addImm(0);
1242 BuildMI(MBB, I, DL, get(AMDGPU::V_CNDMASK_B32_e64), DstReg)
1243 .addImm(0)
1244 .addReg(FalseReg)
1245 .addImm(0)
1246 .addReg(TrueReg)
1247 .addReg(SReg);
1248 break;
1249 }
1250 case SIInstrInfo::EXECZ: {
1251 Register SReg = MRI.createVirtualRegister(BoolXExecRC);
1252 Register SReg2 = MRI.createVirtualRegister(RI.getBoolRC());
1253 BuildMI(MBB, I, DL, get(LMC.OrSaveExecOpc), SReg2).addImm(0);
1254 BuildMI(MBB, I, DL, get(LMC.CSelectOpc), SReg).addImm(0).addImm(1);
1255 BuildMI(MBB, I, DL, get(AMDGPU::V_CNDMASK_B32_e64), DstReg)
1256 .addImm(0)
1257 .addReg(FalseReg)
1258 .addImm(0)
1259 .addReg(TrueReg)
1260 .addReg(SReg);
1261 llvm_unreachable("Unhandled branch predicate EXECZ");
1262 break;
1263 }
1264 default:
1265 llvm_unreachable("invalid branch predicate");
1266 }
1267 } else {
1268 llvm_unreachable("Can only handle Cond size 1 or 2");
1269 }
1270}
1271
1274 const DebugLoc &DL,
1275 Register SrcReg, int Value) const {
1276 MachineRegisterInfo &MRI = MBB->getParent()->getRegInfo();
1277 Register Reg = MRI.createVirtualRegister(RI.getBoolRC());
1278 BuildMI(*MBB, I, DL, get(AMDGPU::V_CMP_EQ_I32_e64), Reg)
1279 .addImm(Value)
1280 .addReg(SrcReg);
1281
1282 return Reg;
1283}
1284
1287 const DebugLoc &DL,
1288 Register SrcReg, int Value) const {
1289 MachineRegisterInfo &MRI = MBB->getParent()->getRegInfo();
1290 Register Reg = MRI.createVirtualRegister(RI.getBoolRC());
1291 BuildMI(*MBB, I, DL, get(AMDGPU::V_CMP_NE_I32_e64), Reg)
1292 .addImm(Value)
1293 .addReg(SrcReg);
1294
1295 return Reg;
1296}
1297
1298bool SIInstrInfo::getConstValDefinedInReg(const MachineInstr &MI,
1299 const Register Reg,
1300 int64_t &ImmVal) const {
1301 switch (MI.getOpcode()) {
1302 case AMDGPU::V_MOV_B32_e32:
1303 case AMDGPU::S_MOV_B32:
1304 case AMDGPU::S_MOVK_I32:
1305 case AMDGPU::S_MOV_B64:
1306 case AMDGPU::V_MOV_B64_e32:
1307 case AMDGPU::V_ACCVGPR_WRITE_B32_e64:
1308 case AMDGPU::AV_MOV_B32_IMM_PSEUDO:
1309 case AMDGPU::AV_MOV_B64_IMM_PSEUDO:
1310 case AMDGPU::S_MOV_B64_IMM_PSEUDO:
1311 case AMDGPU::V_MOV_B64_PSEUDO:
1312 case AMDGPU::V_MOV_B16_t16_e32: {
1313 const MachineOperand &Src0 = MI.getOperand(1);
1314 if (Src0.isImm()) {
1315 ImmVal = Src0.getImm();
1316 return MI.getOperand(0).getReg() == Reg;
1317 }
1318
1319 return false;
1320 }
1321 case AMDGPU::V_MOV_B16_t16_e64: {
1322 const MachineOperand &Src0 = MI.getOperand(2);
1323 if (Src0.isImm() && !MI.getOperand(1).getImm()) {
1324 ImmVal = Src0.getImm();
1325 return MI.getOperand(0).getReg() == Reg;
1326 }
1327
1328 return false;
1329 }
1330 case AMDGPU::S_BREV_B32:
1331 case AMDGPU::V_BFREV_B32_e32:
1332 case AMDGPU::V_BFREV_B32_e64: {
1333 const MachineOperand &Src0 = MI.getOperand(1);
1334 if (Src0.isImm()) {
1335 ImmVal = static_cast<int64_t>(reverseBits<int32_t>(Src0.getImm()));
1336 return MI.getOperand(0).getReg() == Reg;
1337 }
1338
1339 return false;
1340 }
1341 case AMDGPU::S_NOT_B32:
1342 case AMDGPU::V_NOT_B32_e32:
1343 case AMDGPU::V_NOT_B32_e64: {
1344 const MachineOperand &Src0 = MI.getOperand(1);
1345 if (Src0.isImm()) {
1346 ImmVal = static_cast<int64_t>(~static_cast<int32_t>(Src0.getImm()));
1347 return MI.getOperand(0).getReg() == Reg;
1348 }
1349
1350 return false;
1351 }
1352 default:
1353 return false;
1354 }
1355}
1356
1357std::optional<int64_t>
1359 if (Op.isImm())
1360 return Op.getImm();
1361
1362 if (!Op.isReg() || !Op.getReg().isVirtual())
1363 return std::nullopt;
1364 MachineRegisterInfo &MRI = Op.getParent()->getMF()->getRegInfo();
1365 const MachineInstr *Def = MRI.getVRegDef(Op.getReg());
1366 if (Def && Def->isMoveImmediate()) {
1367 const MachineOperand &ImmSrc = Def->getOperand(1);
1368 if (ImmSrc.isImm())
1369 return extractSubregFromImm(ImmSrc.getImm(), Op.getSubReg());
1370 }
1371
1372 return std::nullopt;
1373}
1374
1375unsigned SIInstrInfo::getMovOpcode(const TargetRegisterClass *DstRC) const {
1376
1377 if (RI.isAGPRClass(DstRC))
1378 return AMDGPU::COPY;
1379 if (RI.getRegSizeInBits(*DstRC) == 16) {
1380 // Assume hi bits are unneeded. Only _e64 true16 instructions are legal
1381 // before RA.
1382 return RI.isSGPRClass(DstRC) ? AMDGPU::COPY : AMDGPU::V_MOV_B16_t16_e64;
1383 }
1384 if (RI.getRegSizeInBits(*DstRC) == 32)
1385 return RI.isSGPRClass(DstRC) ? AMDGPU::S_MOV_B32 : AMDGPU::V_MOV_B32_e32;
1386 if (RI.getRegSizeInBits(*DstRC) == 64 && RI.isSGPRClass(DstRC))
1387 return AMDGPU::S_MOV_B64;
1388 if (RI.getRegSizeInBits(*DstRC) == 64 && !RI.isSGPRClass(DstRC))
1389 return AMDGPU::V_MOV_B64_PSEUDO;
1390 return AMDGPU::COPY;
1391}
1392
1393const MCInstrDesc &
1394SIInstrInfo::getIndirectGPRIDXPseudo(unsigned VecSize,
1395 bool IsIndirectSrc) const {
1396 if (IsIndirectSrc) {
1397 if (VecSize <= 32) // 4 bytes
1398 return get(AMDGPU::V_INDIRECT_REG_READ_GPR_IDX_B32_V1);
1399 if (VecSize <= 64) // 8 bytes
1400 return get(AMDGPU::V_INDIRECT_REG_READ_GPR_IDX_B32_V2);
1401 if (VecSize <= 96) // 12 bytes
1402 return get(AMDGPU::V_INDIRECT_REG_READ_GPR_IDX_B32_V3);
1403 if (VecSize <= 128) // 16 bytes
1404 return get(AMDGPU::V_INDIRECT_REG_READ_GPR_IDX_B32_V4);
1405 if (VecSize <= 160) // 20 bytes
1406 return get(AMDGPU::V_INDIRECT_REG_READ_GPR_IDX_B32_V5);
1407 if (VecSize <= 192) // 24 bytes
1408 return get(AMDGPU::V_INDIRECT_REG_READ_GPR_IDX_B32_V6);
1409 if (VecSize <= 224) // 28 bytes
1410 return get(AMDGPU::V_INDIRECT_REG_READ_GPR_IDX_B32_V7);
1411 if (VecSize <= 256) // 32 bytes
1412 return get(AMDGPU::V_INDIRECT_REG_READ_GPR_IDX_B32_V8);
1413 if (VecSize <= 288) // 36 bytes
1414 return get(AMDGPU::V_INDIRECT_REG_READ_GPR_IDX_B32_V9);
1415 if (VecSize <= 320) // 40 bytes
1416 return get(AMDGPU::V_INDIRECT_REG_READ_GPR_IDX_B32_V10);
1417 if (VecSize <= 352) // 44 bytes
1418 return get(AMDGPU::V_INDIRECT_REG_READ_GPR_IDX_B32_V11);
1419 if (VecSize <= 384) // 48 bytes
1420 return get(AMDGPU::V_INDIRECT_REG_READ_GPR_IDX_B32_V12);
1421 if (VecSize <= 512) // 64 bytes
1422 return get(AMDGPU::V_INDIRECT_REG_READ_GPR_IDX_B32_V16);
1423 if (VecSize <= 1024) // 128 bytes
1424 return get(AMDGPU::V_INDIRECT_REG_READ_GPR_IDX_B32_V32);
1425
1426 llvm_unreachable("unsupported size for IndirectRegReadGPRIDX pseudos");
1427 }
1428
1429 if (VecSize <= 32) // 4 bytes
1430 return get(AMDGPU::V_INDIRECT_REG_WRITE_GPR_IDX_B32_V1);
1431 if (VecSize <= 64) // 8 bytes
1432 return get(AMDGPU::V_INDIRECT_REG_WRITE_GPR_IDX_B32_V2);
1433 if (VecSize <= 96) // 12 bytes
1434 return get(AMDGPU::V_INDIRECT_REG_WRITE_GPR_IDX_B32_V3);
1435 if (VecSize <= 128) // 16 bytes
1436 return get(AMDGPU::V_INDIRECT_REG_WRITE_GPR_IDX_B32_V4);
1437 if (VecSize <= 160) // 20 bytes
1438 return get(AMDGPU::V_INDIRECT_REG_WRITE_GPR_IDX_B32_V5);
1439 if (VecSize <= 192) // 24 bytes
1440 return get(AMDGPU::V_INDIRECT_REG_WRITE_GPR_IDX_B32_V6);
1441 if (VecSize <= 224) // 28 bytes
1442 return get(AMDGPU::V_INDIRECT_REG_WRITE_GPR_IDX_B32_V7);
1443 if (VecSize <= 256) // 32 bytes
1444 return get(AMDGPU::V_INDIRECT_REG_WRITE_GPR_IDX_B32_V8);
1445 if (VecSize <= 288) // 36 bytes
1446 return get(AMDGPU::V_INDIRECT_REG_WRITE_GPR_IDX_B32_V9);
1447 if (VecSize <= 320) // 40 bytes
1448 return get(AMDGPU::V_INDIRECT_REG_WRITE_GPR_IDX_B32_V10);
1449 if (VecSize <= 352) // 44 bytes
1450 return get(AMDGPU::V_INDIRECT_REG_WRITE_GPR_IDX_B32_V11);
1451 if (VecSize <= 384) // 48 bytes
1452 return get(AMDGPU::V_INDIRECT_REG_WRITE_GPR_IDX_B32_V12);
1453 if (VecSize <= 512) // 64 bytes
1454 return get(AMDGPU::V_INDIRECT_REG_WRITE_GPR_IDX_B32_V16);
1455 if (VecSize <= 1024) // 128 bytes
1456 return get(AMDGPU::V_INDIRECT_REG_WRITE_GPR_IDX_B32_V32);
1457
1458 llvm_unreachable("unsupported size for IndirectRegWriteGPRIDX pseudos");
1459}
1460
1461static unsigned getIndirectVGPRWriteMovRelPseudoOpc(unsigned VecSize) {
1462 if (VecSize <= 32) // 4 bytes
1463 return AMDGPU::V_INDIRECT_REG_WRITE_MOVREL_B32_V1;
1464 if (VecSize <= 64) // 8 bytes
1465 return AMDGPU::V_INDIRECT_REG_WRITE_MOVREL_B32_V2;
1466 if (VecSize <= 96) // 12 bytes
1467 return AMDGPU::V_INDIRECT_REG_WRITE_MOVREL_B32_V3;
1468 if (VecSize <= 128) // 16 bytes
1469 return AMDGPU::V_INDIRECT_REG_WRITE_MOVREL_B32_V4;
1470 if (VecSize <= 160) // 20 bytes
1471 return AMDGPU::V_INDIRECT_REG_WRITE_MOVREL_B32_V5;
1472 if (VecSize <= 192) // 24 bytes
1473 return AMDGPU::V_INDIRECT_REG_WRITE_MOVREL_B32_V6;
1474 if (VecSize <= 224) // 28 bytes
1475 return AMDGPU::V_INDIRECT_REG_WRITE_MOVREL_B32_V7;
1476 if (VecSize <= 256) // 32 bytes
1477 return AMDGPU::V_INDIRECT_REG_WRITE_MOVREL_B32_V8;
1478 if (VecSize <= 288) // 36 bytes
1479 return AMDGPU::V_INDIRECT_REG_WRITE_MOVREL_B32_V9;
1480 if (VecSize <= 320) // 40 bytes
1481 return AMDGPU::V_INDIRECT_REG_WRITE_MOVREL_B32_V10;
1482 if (VecSize <= 352) // 44 bytes
1483 return AMDGPU::V_INDIRECT_REG_WRITE_MOVREL_B32_V11;
1484 if (VecSize <= 384) // 48 bytes
1485 return AMDGPU::V_INDIRECT_REG_WRITE_MOVREL_B32_V12;
1486 if (VecSize <= 512) // 64 bytes
1487 return AMDGPU::V_INDIRECT_REG_WRITE_MOVREL_B32_V16;
1488 if (VecSize <= 1024) // 128 bytes
1489 return AMDGPU::V_INDIRECT_REG_WRITE_MOVREL_B32_V32;
1490
1491 llvm_unreachable("unsupported size for IndirectRegWrite pseudos");
1492}
1493
1494static unsigned getIndirectSGPRWriteMovRelPseudo32(unsigned VecSize) {
1495 if (VecSize <= 32) // 4 bytes
1496 return AMDGPU::S_INDIRECT_REG_WRITE_MOVREL_B32_V1;
1497 if (VecSize <= 64) // 8 bytes
1498 return AMDGPU::S_INDIRECT_REG_WRITE_MOVREL_B32_V2;
1499 if (VecSize <= 96) // 12 bytes
1500 return AMDGPU::S_INDIRECT_REG_WRITE_MOVREL_B32_V3;
1501 if (VecSize <= 128) // 16 bytes
1502 return AMDGPU::S_INDIRECT_REG_WRITE_MOVREL_B32_V4;
1503 if (VecSize <= 160) // 20 bytes
1504 return AMDGPU::S_INDIRECT_REG_WRITE_MOVREL_B32_V5;
1505 if (VecSize <= 192) // 24 bytes
1506 return AMDGPU::S_INDIRECT_REG_WRITE_MOVREL_B32_V6;
1507 if (VecSize <= 224) // 28 bytes
1508 return AMDGPU::S_INDIRECT_REG_WRITE_MOVREL_B32_V7;
1509 if (VecSize <= 256) // 32 bytes
1510 return AMDGPU::S_INDIRECT_REG_WRITE_MOVREL_B32_V8;
1511 if (VecSize <= 288) // 36 bytes
1512 return AMDGPU::S_INDIRECT_REG_WRITE_MOVREL_B32_V9;
1513 if (VecSize <= 320) // 40 bytes
1514 return AMDGPU::S_INDIRECT_REG_WRITE_MOVREL_B32_V10;
1515 if (VecSize <= 352) // 44 bytes
1516 return AMDGPU::S_INDIRECT_REG_WRITE_MOVREL_B32_V11;
1517 if (VecSize <= 384) // 48 bytes
1518 return AMDGPU::S_INDIRECT_REG_WRITE_MOVREL_B32_V12;
1519 if (VecSize <= 512) // 64 bytes
1520 return AMDGPU::S_INDIRECT_REG_WRITE_MOVREL_B32_V16;
1521 if (VecSize <= 1024) // 128 bytes
1522 return AMDGPU::S_INDIRECT_REG_WRITE_MOVREL_B32_V32;
1523
1524 llvm_unreachable("unsupported size for IndirectRegWrite pseudos");
1525}
1526
1527static unsigned getIndirectSGPRWriteMovRelPseudo64(unsigned VecSize) {
1528 if (VecSize <= 64) // 8 bytes
1529 return AMDGPU::S_INDIRECT_REG_WRITE_MOVREL_B64_V1;
1530 if (VecSize <= 128) // 16 bytes
1531 return AMDGPU::S_INDIRECT_REG_WRITE_MOVREL_B64_V2;
1532 if (VecSize <= 256) // 32 bytes
1533 return AMDGPU::S_INDIRECT_REG_WRITE_MOVREL_B64_V4;
1534 if (VecSize <= 512) // 64 bytes
1535 return AMDGPU::S_INDIRECT_REG_WRITE_MOVREL_B64_V8;
1536 if (VecSize <= 1024) // 128 bytes
1537 return AMDGPU::S_INDIRECT_REG_WRITE_MOVREL_B64_V16;
1538
1539 llvm_unreachable("unsupported size for IndirectRegWrite pseudos");
1540}
1541
1542const MCInstrDesc &
1543SIInstrInfo::getIndirectRegWriteMovRelPseudo(unsigned VecSize, unsigned EltSize,
1544 bool IsSGPR) const {
1545 if (IsSGPR) {
1546 switch (EltSize) {
1547 case 32:
1548 return get(getIndirectSGPRWriteMovRelPseudo32(VecSize));
1549 case 64:
1550 return get(getIndirectSGPRWriteMovRelPseudo64(VecSize));
1551 default:
1552 llvm_unreachable("invalid reg indexing elt size");
1553 }
1554 }
1555
1556 assert(EltSize == 32 && "invalid reg indexing elt size");
1557 return get(getIndirectVGPRWriteMovRelPseudoOpc(VecSize));
1558}
1559
1560static unsigned getSGPRSpillSaveOpcode(unsigned Size) {
1561 switch (Size) {
1562 case 4:
1563 return AMDGPU::SI_SPILL_S32_SAVE;
1564 case 8:
1565 return AMDGPU::SI_SPILL_S64_SAVE;
1566 case 12:
1567 return AMDGPU::SI_SPILL_S96_SAVE;
1568 case 16:
1569 return AMDGPU::SI_SPILL_S128_SAVE;
1570 case 20:
1571 return AMDGPU::SI_SPILL_S160_SAVE;
1572 case 24:
1573 return AMDGPU::SI_SPILL_S192_SAVE;
1574 case 28:
1575 return AMDGPU::SI_SPILL_S224_SAVE;
1576 case 32:
1577 return AMDGPU::SI_SPILL_S256_SAVE;
1578 case 36:
1579 return AMDGPU::SI_SPILL_S288_SAVE;
1580 case 40:
1581 return AMDGPU::SI_SPILL_S320_SAVE;
1582 case 44:
1583 return AMDGPU::SI_SPILL_S352_SAVE;
1584 case 48:
1585 return AMDGPU::SI_SPILL_S384_SAVE;
1586 case 64:
1587 return AMDGPU::SI_SPILL_S512_SAVE;
1588 case 128:
1589 return AMDGPU::SI_SPILL_S1024_SAVE;
1590 default:
1591 llvm_unreachable("unknown register size");
1592 }
1593}
1594
1595static unsigned getVGPRSpillSaveOpcode(unsigned Size) {
1596 switch (Size) {
1597 case 2:
1598 return AMDGPU::SI_SPILL_V16_SAVE;
1599 case 4:
1600 return AMDGPU::SI_SPILL_V32_SAVE;
1601 case 8:
1602 return AMDGPU::SI_SPILL_V64_SAVE;
1603 case 12:
1604 return AMDGPU::SI_SPILL_V96_SAVE;
1605 case 16:
1606 return AMDGPU::SI_SPILL_V128_SAVE;
1607 case 20:
1608 return AMDGPU::SI_SPILL_V160_SAVE;
1609 case 24:
1610 return AMDGPU::SI_SPILL_V192_SAVE;
1611 case 28:
1612 return AMDGPU::SI_SPILL_V224_SAVE;
1613 case 32:
1614 return AMDGPU::SI_SPILL_V256_SAVE;
1615 case 36:
1616 return AMDGPU::SI_SPILL_V288_SAVE;
1617 case 40:
1618 return AMDGPU::SI_SPILL_V320_SAVE;
1619 case 44:
1620 return AMDGPU::SI_SPILL_V352_SAVE;
1621 case 48:
1622 return AMDGPU::SI_SPILL_V384_SAVE;
1623 case 64:
1624 return AMDGPU::SI_SPILL_V512_SAVE;
1625 case 128:
1626 return AMDGPU::SI_SPILL_V1024_SAVE;
1627 default:
1628 llvm_unreachable("unknown register size");
1629 }
1630}
1631
1632static unsigned getAVSpillSaveOpcode(unsigned Size) {
1633 switch (Size) {
1634 case 4:
1635 return AMDGPU::SI_SPILL_AV32_SAVE;
1636 case 8:
1637 return AMDGPU::SI_SPILL_AV64_SAVE;
1638 case 12:
1639 return AMDGPU::SI_SPILL_AV96_SAVE;
1640 case 16:
1641 return AMDGPU::SI_SPILL_AV128_SAVE;
1642 case 20:
1643 return AMDGPU::SI_SPILL_AV160_SAVE;
1644 case 24:
1645 return AMDGPU::SI_SPILL_AV192_SAVE;
1646 case 28:
1647 return AMDGPU::SI_SPILL_AV224_SAVE;
1648 case 32:
1649 return AMDGPU::SI_SPILL_AV256_SAVE;
1650 case 36:
1651 return AMDGPU::SI_SPILL_AV288_SAVE;
1652 case 40:
1653 return AMDGPU::SI_SPILL_AV320_SAVE;
1654 case 44:
1655 return AMDGPU::SI_SPILL_AV352_SAVE;
1656 case 48:
1657 return AMDGPU::SI_SPILL_AV384_SAVE;
1658 case 64:
1659 return AMDGPU::SI_SPILL_AV512_SAVE;
1660 case 128:
1661 return AMDGPU::SI_SPILL_AV1024_SAVE;
1662 default:
1663 llvm_unreachable("unknown register size");
1664 }
1665}
1666
1667static unsigned getWWMRegSpillSaveOpcode(unsigned Size,
1668 bool IsVectorSuperClass) {
1669 // Currently, only 32-bit WWM register spills are needed.
1670 if (Size != 4)
1671 llvm_unreachable("unknown wwm register spill size");
1672
1673 if (IsVectorSuperClass)
1674 return AMDGPU::SI_SPILL_WWM_AV32_SAVE;
1675
1676 return AMDGPU::SI_SPILL_WWM_V32_SAVE;
1677}
1678
1679unsigned SIInstrInfo::getVectorRegSpillSaveOpcode(
1680 Register Reg, const TargetRegisterClass *RC, unsigned Size,
1681 const SIMachineFunctionInfo &MFI) const {
1682 bool IsVectorSuperClass = RI.isVectorSuperClass(RC);
1683
1684 // Choose the right opcode if spilling a WWM register.
1685 if (MFI.checkFlag(Reg, AMDGPU::VirtRegFlag::WWM_REG))
1686 return getWWMRegSpillSaveOpcode(Size, IsVectorSuperClass);
1687
1688 // TODO: Check if AGPRs are available
1689 if (ST.hasMAIInsts())
1690 return getAVSpillSaveOpcode(Size);
1691
1692 return getVGPRSpillSaveOpcode(Size);
1693}
1694
1695void SIInstrInfo::storeRegToStackSlot(
1696 MachineBasicBlock &MBB, MachineBasicBlock::iterator MI, Register SrcReg,
1697 bool isKill, int FrameIndex, const TargetRegisterClass *RC, Register VReg,
1698 MachineInstr::MIFlag Flags) const {
1699 MachineFunction *MF = MBB.getParent();
1700 SIMachineFunctionInfo *MFI = MF->getInfo<SIMachineFunctionInfo>();
1701 MachineFrameInfo &FrameInfo = MF->getFrameInfo();
1702 const DebugLoc &DL = MBB.findDebugLoc(MI);
1703
1704 MachinePointerInfo PtrInfo
1705 = MachinePointerInfo::getFixedStack(*MF, FrameIndex);
1706 MachineMemOperand *MMO = MF->getMachineMemOperand(
1707 PtrInfo, MachineMemOperand::MOStore, FrameInfo.getObjectSize(FrameIndex),
1708 FrameInfo.getObjectAlign(FrameIndex));
1709 unsigned SpillSize = RI.getSpillSize(*RC);
1710
1711 MachineRegisterInfo &MRI = MF->getRegInfo();
1712 if (RI.isSGPRClass(RC)) {
1713 MFI->setHasSpilledSGPRs();
1714 assert(SrcReg != AMDGPU::M0 && "m0 should not be spilled");
1715 assert(SrcReg != AMDGPU::EXEC_LO && SrcReg != AMDGPU::EXEC_HI &&
1716 SrcReg != AMDGPU::EXEC && "exec should not be spilled");
1717
1718 // We are only allowed to create one new instruction when spilling
1719 // registers, so we need to use a pseudo instruction for spilling SGPRs.
1720 const MCInstrDesc &OpDesc = get(getSGPRSpillSaveOpcode(SpillSize));
1721
1722 // The SGPR spill/restore instructions only work on numbered SGPRs, so we need
1723 // to make sure we are using the correct register class.
1724 if (SrcReg.isVirtual() && SpillSize == 4) {
1725 MRI.constrainRegClass(SrcReg, &AMDGPU::SReg_32_XM0_XEXECRegClass);
1726 }
1727
1728 BuildMI(MBB, MI, DL, OpDesc)
1729 .addReg(SrcReg, getKillRegState(isKill)) // data
1730 .addFrameIndex(FrameIndex) // addr
1731 .addMemOperand(MMO)
1732 .addReg(MFI->getStackPtrOffsetReg(), RegState::Implicit);
1733
1734 if (RI.spillSGPRToVGPR())
1735 FrameInfo.setStackID(FrameIndex, TargetStackID::SGPRSpill);
1736 return;
1737 }
1738
1739 unsigned Opcode =
1740 getVectorRegSpillSaveOpcode(VReg ? VReg : SrcReg, RC, SpillSize, *MFI);
1741 MFI->setHasSpilledVGPRs();
1742
1743 BuildMI(MBB, MI, DL, get(Opcode))
1744 .addReg(SrcReg, getKillRegState(isKill)) // data
1745 .addFrameIndex(FrameIndex) // addr
1746 .addReg(MFI->getStackPtrOffsetReg()) // scratch_offset
1747 .addImm(0) // offset
1748 .addMemOperand(MMO);
1749}
1750
1751static unsigned getSGPRSpillRestoreOpcode(unsigned Size) {
1752 switch (Size) {
1753 case 4:
1754 return AMDGPU::SI_SPILL_S32_RESTORE;
1755 case 8:
1756 return AMDGPU::SI_SPILL_S64_RESTORE;
1757 case 12:
1758 return AMDGPU::SI_SPILL_S96_RESTORE;
1759 case 16:
1760 return AMDGPU::SI_SPILL_S128_RESTORE;
1761 case 20:
1762 return AMDGPU::SI_SPILL_S160_RESTORE;
1763 case 24:
1764 return AMDGPU::SI_SPILL_S192_RESTORE;
1765 case 28:
1766 return AMDGPU::SI_SPILL_S224_RESTORE;
1767 case 32:
1768 return AMDGPU::SI_SPILL_S256_RESTORE;
1769 case 36:
1770 return AMDGPU::SI_SPILL_S288_RESTORE;
1771 case 40:
1772 return AMDGPU::SI_SPILL_S320_RESTORE;
1773 case 44:
1774 return AMDGPU::SI_SPILL_S352_RESTORE;
1775 case 48:
1776 return AMDGPU::SI_SPILL_S384_RESTORE;
1777 case 64:
1778 return AMDGPU::SI_SPILL_S512_RESTORE;
1779 case 128:
1780 return AMDGPU::SI_SPILL_S1024_RESTORE;
1781 default:
1782 llvm_unreachable("unknown register size");
1783 }
1784}
1785
1786static unsigned getVGPRSpillRestoreOpcode(unsigned Size) {
1787 switch (Size) {
1788 case 2:
1789 return AMDGPU::SI_SPILL_V16_RESTORE;
1790 case 4:
1791 return AMDGPU::SI_SPILL_V32_RESTORE;
1792 case 8:
1793 return AMDGPU::SI_SPILL_V64_RESTORE;
1794 case 12:
1795 return AMDGPU::SI_SPILL_V96_RESTORE;
1796 case 16:
1797 return AMDGPU::SI_SPILL_V128_RESTORE;
1798 case 20:
1799 return AMDGPU::SI_SPILL_V160_RESTORE;
1800 case 24:
1801 return AMDGPU::SI_SPILL_V192_RESTORE;
1802 case 28:
1803 return AMDGPU::SI_SPILL_V224_RESTORE;
1804 case 32:
1805 return AMDGPU::SI_SPILL_V256_RESTORE;
1806 case 36:
1807 return AMDGPU::SI_SPILL_V288_RESTORE;
1808 case 40:
1809 return AMDGPU::SI_SPILL_V320_RESTORE;
1810 case 44:
1811 return AMDGPU::SI_SPILL_V352_RESTORE;
1812 case 48:
1813 return AMDGPU::SI_SPILL_V384_RESTORE;
1814 case 64:
1815 return AMDGPU::SI_SPILL_V512_RESTORE;
1816 case 128:
1817 return AMDGPU::SI_SPILL_V1024_RESTORE;
1818 default:
1819 llvm_unreachable("unknown register size");
1820 }
1821}
1822
1823static unsigned getAVSpillRestoreOpcode(unsigned Size) {
1824 switch (Size) {
1825 case 4:
1826 return AMDGPU::SI_SPILL_AV32_RESTORE;
1827 case 8:
1828 return AMDGPU::SI_SPILL_AV64_RESTORE;
1829 case 12:
1830 return AMDGPU::SI_SPILL_AV96_RESTORE;
1831 case 16:
1832 return AMDGPU::SI_SPILL_AV128_RESTORE;
1833 case 20:
1834 return AMDGPU::SI_SPILL_AV160_RESTORE;
1835 case 24:
1836 return AMDGPU::SI_SPILL_AV192_RESTORE;
1837 case 28:
1838 return AMDGPU::SI_SPILL_AV224_RESTORE;
1839 case 32:
1840 return AMDGPU::SI_SPILL_AV256_RESTORE;
1841 case 36:
1842 return AMDGPU::SI_SPILL_AV288_RESTORE;
1843 case 40:
1844 return AMDGPU::SI_SPILL_AV320_RESTORE;
1845 case 44:
1846 return AMDGPU::SI_SPILL_AV352_RESTORE;
1847 case 48:
1848 return AMDGPU::SI_SPILL_AV384_RESTORE;
1849 case 64:
1850 return AMDGPU::SI_SPILL_AV512_RESTORE;
1851 case 128:
1852 return AMDGPU::SI_SPILL_AV1024_RESTORE;
1853 default:
1854 llvm_unreachable("unknown register size");
1855 }
1856}
1857
1858static unsigned getWWMRegSpillRestoreOpcode(unsigned Size,
1859 bool IsVectorSuperClass) {
1860 // Currently, only 32-bit WWM register spills are needed.
1861 if (Size != 4)
1862 llvm_unreachable("unknown wwm register spill size");
1863
1864 if (IsVectorSuperClass) // TODO: Always use this if there are AGPRs
1865 return AMDGPU::SI_SPILL_WWM_AV32_RESTORE;
1866
1867 return AMDGPU::SI_SPILL_WWM_V32_RESTORE;
1868}
1869
1870unsigned SIInstrInfo::getVectorRegSpillRestoreOpcode(
1871 Register Reg, const TargetRegisterClass *RC, unsigned Size,
1872 const SIMachineFunctionInfo &MFI) const {
1873 bool IsVectorSuperClass = RI.isVectorSuperClass(RC);
1874
1875 // Choose the right opcode if restoring a WWM register.
1876 if (MFI.checkFlag(Reg, AMDGPU::VirtRegFlag::WWM_REG))
1877 return getWWMRegSpillRestoreOpcode(Size, IsVectorSuperClass);
1878
1879 // TODO: Check if AGPRs are available
1880 if (ST.hasMAIInsts())
1881 return getAVSpillRestoreOpcode(Size);
1882
1883 assert(!RI.isAGPRClass(RC));
1884 return getVGPRSpillRestoreOpcode(Size);
1885}
1886
1887void SIInstrInfo::loadRegFromStackSlot(MachineBasicBlock &MBB,
1888 MachineBasicBlock::iterator MI,
1889 Register DestReg, int FrameIndex,
1890 const TargetRegisterClass *RC,
1891 Register VReg, unsigned SubReg,
1892 MachineInstr::MIFlag Flags) const {
1893 MachineFunction *MF = MBB.getParent();
1894 SIMachineFunctionInfo *MFI = MF->getInfo<SIMachineFunctionInfo>();
1895 MachineFrameInfo &FrameInfo = MF->getFrameInfo();
1896 const DebugLoc &DL = MBB.findDebugLoc(MI);
1897 unsigned SpillSize = RI.getSpillSize(*RC);
1898
1899 MachinePointerInfo PtrInfo
1900 = MachinePointerInfo::getFixedStack(*MF, FrameIndex);
1901
1902 MachineMemOperand *MMO = MF->getMachineMemOperand(
1903 PtrInfo, MachineMemOperand::MOLoad, FrameInfo.getObjectSize(FrameIndex),
1904 FrameInfo.getObjectAlign(FrameIndex));
1905
1906 if (RI.isSGPRClass(RC)) {
1907 MFI->setHasSpilledSGPRs();
1908 assert(DestReg != AMDGPU::M0 && "m0 should not be reloaded into");
1909 assert(DestReg != AMDGPU::EXEC_LO && DestReg != AMDGPU::EXEC_HI &&
1910 DestReg != AMDGPU::EXEC && "exec should not be spilled");
1911
1912 // FIXME: Maybe this should not include a memoperand because it will be
1913 // lowered to non-memory instructions.
1914 const MCInstrDesc &OpDesc = get(getSGPRSpillRestoreOpcode(SpillSize));
1915 if (DestReg.isVirtual() && SpillSize == 4) {
1916 MachineRegisterInfo &MRI = MF->getRegInfo();
1917 MRI.constrainRegClass(DestReg, &AMDGPU::SReg_32_XM0_XEXECRegClass);
1918 }
1919
1920 if (RI.spillSGPRToVGPR())
1921 FrameInfo.setStackID(FrameIndex, TargetStackID::SGPRSpill);
1922 BuildMI(MBB, MI, DL, OpDesc, DestReg)
1923 .addFrameIndex(FrameIndex) // addr
1924 .addMemOperand(MMO)
1925 .addReg(MFI->getStackPtrOffsetReg(), RegState::Implicit);
1926
1927 return;
1928 }
1929
1930 unsigned Opcode = getVectorRegSpillRestoreOpcode(VReg ? VReg : DestReg, RC,
1931 SpillSize, *MFI);
1932 BuildMI(MBB, MI, DL, get(Opcode), DestReg)
1933 .addFrameIndex(FrameIndex) // vaddr
1934 .addReg(MFI->getStackPtrOffsetReg()) // scratch_offset
1935 .addImm(0) // offset
1936 .addMemOperand(MMO);
1937}
1938
1939void SIInstrInfo::insertNoop(MachineBasicBlock &MBB,
1940 MachineBasicBlock::iterator MI) const {
1941 insertNoops(MBB, MI, 1);
1942}
1943
1944void SIInstrInfo::insertNoops(MachineBasicBlock &MBB,
1945 MachineBasicBlock::iterator MI,
1946 unsigned Quantity) const {
1947 DebugLoc DL = MBB.findDebugLoc(MI);
1948 unsigned MaxSNopCount = 1u << ST.getSNopBits();
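 // S_NOP's immediate encodes (count - 1) no-ops, so one S_NOP below can cover
 // up to MaxSNopCount no-ops; hence the Arg - 1 when building the instruction.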
1949 while (Quantity > 0) {
1950 unsigned Arg = std::min(Quantity, MaxSNopCount);
1951 Quantity -= Arg;
1952 BuildMI(MBB, MI, DL, get(AMDGPU::S_NOP)).addImm(Arg - 1);
1953 }
1954}
1955
1956void SIInstrInfo::insertReturn(MachineBasicBlock &MBB) const {
1957 auto *MF = MBB.getParent();
1958 SIMachineFunctionInfo *Info = MF->getInfo<SIMachineFunctionInfo>();
1959
1960 assert(Info->isEntryFunction());
1961
1962 if (MBB.succ_empty()) {
1963 bool HasNoTerminator = MBB.getFirstTerminator() == MBB.end();
1964 if (HasNoTerminator) {
1965 if (Info->returnsVoid()) {
1966 BuildMI(MBB, MBB.end(), DebugLoc(), get(AMDGPU::S_ENDPGM)).addImm(0);
1967 } else {
1968 BuildMI(MBB, MBB.end(), DebugLoc(), get(AMDGPU::SI_RETURN_TO_EPILOG));
1969 }
1970 }
1971 }
1972}
1973
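 // Lower a simulated trap: raise s_trap, tag the queue's doorbell with the
 // wave-abort bit via s_sendmsg, then park the wave in a self-branching halt
 // loop.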
1974MachineBasicBlock *SIInstrInfo::insertSimulatedTrap(MachineRegisterInfo &MRI,
1975 MachineBasicBlock &MBB,
1976 MachineInstr &MI,
1977 const DebugLoc &DL) const {
1978 MachineFunction *MF = MBB.getParent();
1979 constexpr unsigned DoorbellIDMask = 0x3ff;
1980 constexpr unsigned ECQueueWaveAbort = 0x400;
1981
1982 MachineBasicBlock *TrapBB = &MBB;
1983 MachineBasicBlock *HaltLoopBB = MF->CreateMachineBasicBlock();
1984
1985 if (!MBB.succ_empty() || std::next(MI.getIterator()) != MBB.end()) {
1986 MBB.splitAt(MI, /*UpdateLiveIns=*/false);
1987 TrapBB = MF->CreateMachineBasicBlock();
1988 BuildMI(MBB, MI, DL, get(AMDGPU::S_CBRANCH_EXECNZ)).addMBB(TrapBB);
1989 MF->push_back(TrapBB);
1990 MBB.addSuccessor(TrapBB);
1991 }
1992 // Start with a `s_trap 2`; if we're in PRIV=1 and we need the workaround, this
1993 // will be a nop.
1994 BuildMI(*TrapBB, TrapBB->end(), DL, get(AMDGPU::S_TRAP))
1995 .addImm(static_cast<unsigned>(GCNSubtarget::TrapID::LLVMAMDHSATrap));
1996 Register DoorbellReg = MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass);
1997 BuildMI(*TrapBB, TrapBB->end(), DL, get(AMDGPU::S_SENDMSG_RTN_B32),
1998 DoorbellReg)
1999 .addImm(AMDGPU::SendMsg::ID_RTN_GET_DOORBELL);
2000 BuildMI(*TrapBB, TrapBB->end(), DL, get(AMDGPU::S_MOV_B32), AMDGPU::TTMP2)
2001 .addUse(AMDGPU::M0);
2002 Register DoorbellRegMasked =
2003 MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass);
2004 BuildMI(*TrapBB, TrapBB->end(), DL, get(AMDGPU::S_AND_B32), DoorbellRegMasked)
2005 .addUse(DoorbellReg)
2006 .addImm(DoorbellIDMask);
2007 Register SetWaveAbortBit =
2008 MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass);
2009 BuildMI(*TrapBB, TrapBB->end(), DL, get(AMDGPU::S_OR_B32), SetWaveAbortBit)
2010 .addUse(DoorbellRegMasked)
2011 .addImm(ECQueueWaveAbort);
2012 BuildMI(*TrapBB, TrapBB->end(), DL, get(AMDGPU::S_MOV_B32), AMDGPU::M0)
2013 .addUse(SetWaveAbortBit);
2014 BuildMI(*TrapBB, TrapBB->end(), DL, get(AMDGPU::S_SENDMSG))
2015 .addImm(AMDGPU::SendMsg::ID_INTERRUPT);
2016 BuildMI(*TrapBB, TrapBB->end(), DL, get(AMDGPU::S_MOV_B32), AMDGPU::M0)
2017 .addUse(AMDGPU::TTMP2);
2018 BuildMI(*TrapBB, TrapBB->end(), DL, get(AMDGPU::S_BRANCH)).addMBB(HaltLoopBB);
2019 TrapBB->addSuccessor(HaltLoopBB);
2020
2021 BuildMI(*HaltLoopBB, HaltLoopBB->end(), DL, get(AMDGPU::S_SETHALT)).addImm(5);
2022 BuildMI(*HaltLoopBB, HaltLoopBB->end(), DL, get(AMDGPU::S_BRANCH))
2023 .addMBB(HaltLoopBB);
2024 MF->push_back(HaltLoopBB);
2025 HaltLoopBB->addSuccessor(HaltLoopBB);
2026
2027 return MBB.getNextNode();
2028}
2029
2030unsigned SIInstrInfo::getNumWaitStates(const MachineInstr &MI) {
2031 switch (MI.getOpcode()) {
2032 default:
2033 if (MI.isMetaInstruction())
2034 return 0;
2035 return 1; // FIXME: Do wait states equal cycles?
2036
2037 case AMDGPU::S_NOP:
2038 return MI.getOperand(0).getImm() + 1;
2039 // SI_RETURN_TO_EPILOG is a fallthrough to code outside of the function. The
2040 // hazard, even if one exists, won't really be visible. Should we handle it?
2041 }
2042}
2043
2044bool SIInstrInfo::expandPostRAPseudo(MachineInstr &MI) const {
2045 MachineBasicBlock &MBB = *MI.getParent();
2046 DebugLoc DL = MBB.findDebugLoc(MI);
2048 switch (MI.getOpcode()) {
2049 default: return TargetInstrInfo::expandPostRAPseudo(MI);
2050 case AMDGPU::S_MOV_B64_term:
2051 // This is only a terminator to get the correct spill code placement during
2052 // register allocation.
2053 MI.setDesc(get(AMDGPU::S_MOV_B64));
2054 break;
2055
2056 case AMDGPU::S_MOV_B32_term:
2057 // This is only a terminator to get the correct spill code placement during
2058 // register allocation.
2059 MI.setDesc(get(AMDGPU::S_MOV_B32));
2060 break;
2061
2062 case AMDGPU::S_XOR_B64_term:
2063 // This is only a terminator to get the correct spill code placement during
2064 // register allocation.
2065 MI.setDesc(get(AMDGPU::S_XOR_B64));
2066 break;
2067
2068 case AMDGPU::S_XOR_B32_term:
2069 // This is only a terminator to get the correct spill code placement during
2070 // register allocation.
2071 MI.setDesc(get(AMDGPU::S_XOR_B32));
2072 break;
2073 case AMDGPU::S_OR_B64_term:
2074 // This is only a terminator to get the correct spill code placement during
2075 // register allocation.
2076 MI.setDesc(get(AMDGPU::S_OR_B64));
2077 break;
2078 case AMDGPU::S_OR_B32_term:
2079 // This is only a terminator to get the correct spill code placement during
2080 // register allocation.
2081 MI.setDesc(get(AMDGPU::S_OR_B32));
2082 break;
2083
2084 case AMDGPU::S_ANDN2_B64_term:
2085 // This is only a terminator to get the correct spill code placement during
2086 // register allocation.
2087 MI.setDesc(get(AMDGPU::S_ANDN2_B64));
2088 break;
2089
2090 case AMDGPU::S_ANDN2_B32_term:
2091 // This is only a terminator to get the correct spill code placement during
2092 // register allocation.
2093 MI.setDesc(get(AMDGPU::S_ANDN2_B32));
2094 break;
2095
2096 case AMDGPU::S_AND_B64_term:
2097 // This is only a terminator to get the correct spill code placement during
2098 // register allocation.
2099 MI.setDesc(get(AMDGPU::S_AND_B64));
2100 break;
2101
2102 case AMDGPU::S_AND_B32_term:
2103 // This is only a terminator to get the correct spill code placement during
2104 // register allocation.
2105 MI.setDesc(get(AMDGPU::S_AND_B32));
2106 break;
2107
2108 case AMDGPU::S_AND_SAVEEXEC_B64_term:
2109 // This is only a terminator to get the correct spill code placement during
2110 // register allocation.
2111 MI.setDesc(get(AMDGPU::S_AND_SAVEEXEC_B64));
2112 break;
2113
2114 case AMDGPU::S_AND_SAVEEXEC_B32_term:
2115 // This is only a terminator to get the correct spill code placement during
2116 // register allocation.
2117 MI.setDesc(get(AMDGPU::S_AND_SAVEEXEC_B32));
2118 break;
2119
2120 case AMDGPU::SI_SPILL_S32_TO_VGPR:
2121 MI.setDesc(get(AMDGPU::V_WRITELANE_B32));
2122 break;
2123
2124 case AMDGPU::SI_RESTORE_S32_FROM_VGPR:
2125 MI.setDesc(get(AMDGPU::V_READLANE_B32));
2126 break;
2127 case AMDGPU::AV_MOV_B32_IMM_PSEUDO: {
2128 Register Dst = MI.getOperand(0).getReg();
2129 bool IsAGPR = SIRegisterInfo::isAGPRClass(RI.getPhysRegBaseClass(Dst));
2130 MI.setDesc(
2131 get(IsAGPR ? AMDGPU::V_ACCVGPR_WRITE_B32_e64 : AMDGPU::V_MOV_B32_e32));
2132 break;
2133 }
2134 case AMDGPU::AV_MOV_B64_IMM_PSEUDO: {
2135 Register Dst = MI.getOperand(0).getReg();
2136 if (SIRegisterInfo::isAGPRClass(RI.getPhysRegBaseClass(Dst))) {
2137 int64_t Imm = MI.getOperand(1).getImm();
2138
2139 Register DstLo = RI.getSubReg(Dst, AMDGPU::sub0);
2140 Register DstHi = RI.getSubReg(Dst, AMDGPU::sub1);
2141 BuildMI(MBB, MI, DL, get(AMDGPU::V_ACCVGPR_WRITE_B32_e64), DstLo)
2142 .addImm(SignExtend64<32>(Imm));
2143 BuildMI(MBB, MI, DL, get(AMDGPU::V_ACCVGPR_WRITE_B32_e64), DstHi)
2144 .addImm(SignExtend64<32>(Imm >> 32));
2145 MI.eraseFromParent();
2146 break;
2147 }
2148
2149 [[fallthrough]];
2150 }
2151 case AMDGPU::V_MOV_B64_PSEUDO: {
2152 Register Dst = MI.getOperand(0).getReg();
2153 Register DstLo = RI.getSubReg(Dst, AMDGPU::sub0);
2154 Register DstHi = RI.getSubReg(Dst, AMDGPU::sub1);
2155
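 // Prefer a single v_mov_b64 when the target has it; otherwise fall back to
 // v_pk_mov_b32 (when both 32-bit halves allow it) or two v_mov_b32 of the
 // sub0/sub1 halves.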
2156 const MCInstrDesc &Mov64Desc = get(AMDGPU::V_MOV_B64_e32);
2157 const TargetRegisterClass *Mov64RC = getRegClass(Mov64Desc, /*OpNum=*/0);
2158
2159 const MachineOperand &SrcOp = MI.getOperand(1);
2160 // FIXME: Will this work for 64-bit floating point immediates?
2161 assert(!SrcOp.isFPImm());
2162 if (ST.hasVMovB64Inst() && Mov64RC->contains(Dst)) {
2163 MI.setDesc(Mov64Desc);
2164 if (SrcOp.isReg() || isInlineConstant(MI, 1) ||
2165 isUInt<32>(SrcOp.getImm()) || ST.has64BitLiterals())
2166 break;
2167 }
2168 if (SrcOp.isImm()) {
2169 APInt Imm(64, SrcOp.getImm());
2170 APInt Lo(32, Imm.getLoBits(32).getZExtValue());
2171 APInt Hi(32, Imm.getHiBits(32).getZExtValue());
2172 const MCInstrDesc &PkMovDesc = get(AMDGPU::V_PK_MOV_B32);
2173 const TargetRegisterClass *PkMovRC = getRegClass(PkMovDesc, /*OpNum=*/0);
2174
2175 if (ST.hasPkMovB32() && Lo == Hi && isInlineConstant(Lo) &&
2176 PkMovRC->contains(Dst)) {
2177 BuildMI(MBB, MI, DL, PkMovDesc, Dst)
2178 .addImm(SISrcMods::OP_SEL_1) // src0_mod
2179 .addImm(Lo.getSExtValue())
2180 .addImm(SISrcMods::OP_SEL_1) // src1_mod
2181 .addImm(Lo.getSExtValue())
2182 .addImm(0) // op_sel_lo
2183 .addImm(0) // op_sel_hi
2184 .addImm(0) // neg_lo
2185 .addImm(0) // neg_hi
2186 .addImm(0); // clamp
2187 } else {
2188 BuildMI(MBB, MI, DL, get(AMDGPU::V_MOV_B32_e32), DstLo)
2189 .addImm(Lo.getSExtValue());
2190 BuildMI(MBB, MI, DL, get(AMDGPU::V_MOV_B32_e32), DstHi)
2191 .addImm(Hi.getSExtValue());
2192 }
2193 } else {
2194 assert(SrcOp.isReg());
2195 if (ST.hasPkMovB32() &&
2196 !RI.isAGPR(MBB.getParent()->getRegInfo(), SrcOp.getReg())) {
2197 BuildMI(MBB, MI, DL, get(AMDGPU::V_PK_MOV_B32), Dst)
2198 .addImm(SISrcMods::OP_SEL_1) // src0_mod
2199 .addReg(SrcOp.getReg())
2200 .addImm(SISrcMods::OP_SEL_1) // src1_mod
2201 .addReg(SrcOp.getReg())
2202 .addImm(0) // op_sel_lo
2203 .addImm(0) // op_sel_hi
2204 .addImm(0) // neg_lo
2205 .addImm(0) // neg_hi
2206 .addImm(0); // clamp
2207 } else {
2208 BuildMI(MBB, MI, DL, get(AMDGPU::V_MOV_B32_e32), DstLo)
2209 .addReg(RI.getSubReg(SrcOp.getReg(), AMDGPU::sub0));
2210 BuildMI(MBB, MI, DL, get(AMDGPU::V_MOV_B32_e32), DstHi)
2211 .addReg(RI.getSubReg(SrcOp.getReg(), AMDGPU::sub1));
2212 }
2213 }
2214 MI.eraseFromParent();
2215 break;
2216 }
2217 case AMDGPU::V_MOV_B64_DPP_PSEUDO: {
2218 expandMovDPP64(MI);
2219 break;
2220 }
2221 case AMDGPU::S_MOV_B64_IMM_PSEUDO: {
2222 const MachineOperand &SrcOp = MI.getOperand(1);
2223 assert(!SrcOp.isFPImm());
2224
2225 if (ST.has64BitLiterals()) {
2226 MI.setDesc(get(AMDGPU::S_MOV_B64));
2227 break;
2228 }
2229
2230 APInt Imm(64, SrcOp.getImm());
2231 if (Imm.isIntN(32) || isInlineConstant(Imm)) {
2232 MI.setDesc(get(AMDGPU::S_MOV_B64));
2233 break;
2234 }
2235
2236 Register Dst = MI.getOperand(0).getReg();
2237 Register DstLo = RI.getSubReg(Dst, AMDGPU::sub0);
2238 Register DstHi = RI.getSubReg(Dst, AMDGPU::sub1);
2239
2240 APInt Lo(32, Imm.getLoBits(32).getZExtValue());
2241 APInt Hi(32, Imm.getHiBits(32).getZExtValue());
2242 BuildMI(MBB, MI, DL, get(AMDGPU::S_MOV_B32), DstLo)
2243 .addImm(Lo.getSExtValue());
2244 BuildMI(MBB, MI, DL, get(AMDGPU::S_MOV_B32), DstHi)
2245 .addImm(Hi.getSExtValue());
2246 MI.eraseFromParent();
2247 break;
2248 }
2249 case AMDGPU::V_SET_INACTIVE_B32: {
2250 // Lower V_SET_INACTIVE_B32 to V_CNDMASK_B32.
2251 Register DstReg = MI.getOperand(0).getReg();
2252 BuildMI(MBB, MI, DL, get(AMDGPU::V_CNDMASK_B32_e64), DstReg)
2253 .add(MI.getOperand(3))
2254 .add(MI.getOperand(4))
2255 .add(MI.getOperand(1))
2256 .add(MI.getOperand(2))
2257 .add(MI.getOperand(5));
2258 MI.eraseFromParent();
2259 break;
2260 }
2261 case AMDGPU::V_INDIRECT_REG_WRITE_MOVREL_B32_V1:
2262 case AMDGPU::V_INDIRECT_REG_WRITE_MOVREL_B32_V2:
2263 case AMDGPU::V_INDIRECT_REG_WRITE_MOVREL_B32_V3:
2264 case AMDGPU::V_INDIRECT_REG_WRITE_MOVREL_B32_V4:
2265 case AMDGPU::V_INDIRECT_REG_WRITE_MOVREL_B32_V5:
2266 case AMDGPU::V_INDIRECT_REG_WRITE_MOVREL_B32_V6:
2267 case AMDGPU::V_INDIRECT_REG_WRITE_MOVREL_B32_V7:
2268 case AMDGPU::V_INDIRECT_REG_WRITE_MOVREL_B32_V8:
2269 case AMDGPU::V_INDIRECT_REG_WRITE_MOVREL_B32_V9:
2270 case AMDGPU::V_INDIRECT_REG_WRITE_MOVREL_B32_V10:
2271 case AMDGPU::V_INDIRECT_REG_WRITE_MOVREL_B32_V11:
2272 case AMDGPU::V_INDIRECT_REG_WRITE_MOVREL_B32_V12:
2273 case AMDGPU::V_INDIRECT_REG_WRITE_MOVREL_B32_V16:
2274 case AMDGPU::V_INDIRECT_REG_WRITE_MOVREL_B32_V32:
2275 case AMDGPU::S_INDIRECT_REG_WRITE_MOVREL_B32_V1:
2276 case AMDGPU::S_INDIRECT_REG_WRITE_MOVREL_B32_V2:
2277 case AMDGPU::S_INDIRECT_REG_WRITE_MOVREL_B32_V3:
2278 case AMDGPU::S_INDIRECT_REG_WRITE_MOVREL_B32_V4:
2279 case AMDGPU::S_INDIRECT_REG_WRITE_MOVREL_B32_V5:
2280 case AMDGPU::S_INDIRECT_REG_WRITE_MOVREL_B32_V6:
2281 case AMDGPU::S_INDIRECT_REG_WRITE_MOVREL_B32_V7:
2282 case AMDGPU::S_INDIRECT_REG_WRITE_MOVREL_B32_V8:
2283 case AMDGPU::S_INDIRECT_REG_WRITE_MOVREL_B32_V9:
2284 case AMDGPU::S_INDIRECT_REG_WRITE_MOVREL_B32_V10:
2285 case AMDGPU::S_INDIRECT_REG_WRITE_MOVREL_B32_V11:
2286 case AMDGPU::S_INDIRECT_REG_WRITE_MOVREL_B32_V12:
2287 case AMDGPU::S_INDIRECT_REG_WRITE_MOVREL_B32_V16:
2288 case AMDGPU::S_INDIRECT_REG_WRITE_MOVREL_B32_V32:
2289 case AMDGPU::S_INDIRECT_REG_WRITE_MOVREL_B64_V1:
2290 case AMDGPU::S_INDIRECT_REG_WRITE_MOVREL_B64_V2:
2291 case AMDGPU::S_INDIRECT_REG_WRITE_MOVREL_B64_V4:
2292 case AMDGPU::S_INDIRECT_REG_WRITE_MOVREL_B64_V8:
2293 case AMDGPU::S_INDIRECT_REG_WRITE_MOVREL_B64_V16: {
2294 const TargetRegisterClass *EltRC = getOpRegClass(MI, 2);
2295
2296 unsigned Opc;
2297 if (RI.hasVGPRs(EltRC)) {
2298 Opc = AMDGPU::V_MOVRELD_B32_e32;
2299 } else {
2300 Opc = RI.getRegSizeInBits(*EltRC) == 64 ? AMDGPU::S_MOVRELD_B64
2301 : AMDGPU::S_MOVRELD_B32;
2302 }
2303
2304 const MCInstrDesc &OpDesc = get(Opc);
2305 Register VecReg = MI.getOperand(0).getReg();
2306 bool IsUndef = MI.getOperand(1).isUndef();
2307 unsigned SubReg = MI.getOperand(3).getImm();
2308 assert(VecReg == MI.getOperand(1).getReg());
2309
2311 BuildMI(MBB, MI, DL, OpDesc)
2312 .addReg(RI.getSubReg(VecReg, SubReg), RegState::Undef)
2313 .add(MI.getOperand(2))
2314 .addReg(VecReg, RegState::ImplicitDefine)
2315 .addReg(VecReg, RegState::Implicit | getUndefRegState(IsUndef));
2316
2317 const int ImpDefIdx =
2318 OpDesc.getNumOperands() + OpDesc.implicit_uses().size();
2319 const int ImpUseIdx = ImpDefIdx + 1;
2320 MIB->tieOperands(ImpDefIdx, ImpUseIdx);
2321 MI.eraseFromParent();
2322 break;
2323 }
2324 case AMDGPU::V_INDIRECT_REG_WRITE_GPR_IDX_B32_V1:
2325 case AMDGPU::V_INDIRECT_REG_WRITE_GPR_IDX_B32_V2:
2326 case AMDGPU::V_INDIRECT_REG_WRITE_GPR_IDX_B32_V3:
2327 case AMDGPU::V_INDIRECT_REG_WRITE_GPR_IDX_B32_V4:
2328 case AMDGPU::V_INDIRECT_REG_WRITE_GPR_IDX_B32_V5:
2329 case AMDGPU::V_INDIRECT_REG_WRITE_GPR_IDX_B32_V6:
2330 case AMDGPU::V_INDIRECT_REG_WRITE_GPR_IDX_B32_V7:
2331 case AMDGPU::V_INDIRECT_REG_WRITE_GPR_IDX_B32_V8:
2332 case AMDGPU::V_INDIRECT_REG_WRITE_GPR_IDX_B32_V9:
2333 case AMDGPU::V_INDIRECT_REG_WRITE_GPR_IDX_B32_V10:
2334 case AMDGPU::V_INDIRECT_REG_WRITE_GPR_IDX_B32_V11:
2335 case AMDGPU::V_INDIRECT_REG_WRITE_GPR_IDX_B32_V12:
2336 case AMDGPU::V_INDIRECT_REG_WRITE_GPR_IDX_B32_V16:
2337 case AMDGPU::V_INDIRECT_REG_WRITE_GPR_IDX_B32_V32: {
2338 assert(ST.useVGPRIndexMode());
2339 Register VecReg = MI.getOperand(0).getReg();
2340 bool IsUndef = MI.getOperand(1).isUndef();
2341 MachineOperand &Idx = MI.getOperand(3);
2342 Register SubReg = MI.getOperand(4).getImm();
2343
2344 MachineInstr *SetOn = BuildMI(MBB, MI, DL, get(AMDGPU::S_SET_GPR_IDX_ON))
2345 .add(Idx)
2346 .addImm(AMDGPU::VGPRIndexMode::DST_ENABLE);
2347 SetOn->getOperand(3).setIsUndef();
2348
2349 const MCInstrDesc &OpDesc = get(AMDGPU::V_MOV_B32_indirect_write);
2350 MachineInstrBuilder MIB =
2351 BuildMI(MBB, MI, DL, OpDesc)
2352 .addReg(RI.getSubReg(VecReg, SubReg), RegState::Undef)
2353 .add(MI.getOperand(2))
2354 .addReg(VecReg, RegState::ImplicitDefine)
2355 .addReg(VecReg, RegState::Implicit | getUndefRegState(IsUndef));
2356
2357 const int ImpDefIdx =
2358 OpDesc.getNumOperands() + OpDesc.implicit_uses().size();
2359 const int ImpUseIdx = ImpDefIdx + 1;
2360 MIB->tieOperands(ImpDefIdx, ImpUseIdx);
2361
2362 MachineInstr *SetOff = BuildMI(MBB, MI, DL, get(AMDGPU::S_SET_GPR_IDX_OFF));
2363
2364 finalizeBundle(MBB, SetOn->getIterator(), std::next(SetOff->getIterator()));
2365
2366 MI.eraseFromParent();
2367 break;
2368 }
2369 case AMDGPU::V_INDIRECT_REG_READ_GPR_IDX_B32_V1:
2370 case AMDGPU::V_INDIRECT_REG_READ_GPR_IDX_B32_V2:
2371 case AMDGPU::V_INDIRECT_REG_READ_GPR_IDX_B32_V3:
2372 case AMDGPU::V_INDIRECT_REG_READ_GPR_IDX_B32_V4:
2373 case AMDGPU::V_INDIRECT_REG_READ_GPR_IDX_B32_V5:
2374 case AMDGPU::V_INDIRECT_REG_READ_GPR_IDX_B32_V6:
2375 case AMDGPU::V_INDIRECT_REG_READ_GPR_IDX_B32_V7:
2376 case AMDGPU::V_INDIRECT_REG_READ_GPR_IDX_B32_V8:
2377 case AMDGPU::V_INDIRECT_REG_READ_GPR_IDX_B32_V9:
2378 case AMDGPU::V_INDIRECT_REG_READ_GPR_IDX_B32_V10:
2379 case AMDGPU::V_INDIRECT_REG_READ_GPR_IDX_B32_V11:
2380 case AMDGPU::V_INDIRECT_REG_READ_GPR_IDX_B32_V12:
2381 case AMDGPU::V_INDIRECT_REG_READ_GPR_IDX_B32_V16:
2382 case AMDGPU::V_INDIRECT_REG_READ_GPR_IDX_B32_V32: {
2383 assert(ST.useVGPRIndexMode());
2384 Register Dst = MI.getOperand(0).getReg();
2385 Register VecReg = MI.getOperand(1).getReg();
2386 bool IsUndef = MI.getOperand(1).isUndef();
2387 Register SubReg = MI.getOperand(3).getImm();
2388
2389 MachineInstr *SetOn = BuildMI(MBB, MI, DL, get(AMDGPU::S_SET_GPR_IDX_ON))
2390 .add(MI.getOperand(2))
2391 .addImm(AMDGPU::VGPRIndexMode::SRC0_ENABLE);
2392 SetOn->getOperand(3).setIsUndef();
2393
2394 BuildMI(MBB, MI, DL, get(AMDGPU::V_MOV_B32_indirect_read))
2395 .addDef(Dst)
2396 .addReg(RI.getSubReg(VecReg, SubReg), RegState::Undef)
2397 .addReg(VecReg, RegState::Implicit | getUndefRegState(IsUndef));
2398
2399 MachineInstr *SetOff = BuildMI(MBB, MI, DL, get(AMDGPU::S_SET_GPR_IDX_OFF));
2400
2401 finalizeBundle(MBB, SetOn->getIterator(), std::next(SetOff->getIterator()));
2402
2403 MI.eraseFromParent();
2404 break;
2405 }
2406 case AMDGPU::SI_PC_ADD_REL_OFFSET: {
2407 MachineFunction &MF = *MBB.getParent();
2408 Register Reg = MI.getOperand(0).getReg();
2409 Register RegLo = RI.getSubReg(Reg, AMDGPU::sub0);
2410 Register RegHi = RI.getSubReg(Reg, AMDGPU::sub1);
2411 MachineOperand OpLo = MI.getOperand(1);
2412 MachineOperand OpHi = MI.getOperand(2);
2413
2414 // Create a bundle so these instructions won't be re-ordered by the
2415 // post-RA scheduler.
2416 MIBundleBuilder Bundler(MBB, MI);
2417 Bundler.append(BuildMI(MF, DL, get(AMDGPU::S_GETPC_B64), Reg));
2418
2419 // What we want here is an offset from the value returned by s_getpc (which
2420 // is the address of the s_add_u32 instruction) to the global variable, but
2421 // since the encoding of $symbol starts 4 bytes after the start of the
2422 // s_add_u32 instruction, we end up with an offset that is 4 bytes too
2423 // small. This requires us to add 4 to the global variable offset in order
2424 // to compute the correct address. Similarly for the s_addc_u32 instruction,
2425 // the encoding of $symbol starts 12 bytes after the start of the s_add_u32
2426 // instruction.
2427
2428 int64_t Adjust = 0;
2429 if (ST.hasGetPCZeroExtension()) {
2430 // Fix up hardware that does not sign-extend the 48-bit PC value by
2431 // inserting: s_sext_i32_i16 reghi, reghi
2432 Bundler.append(
2433 BuildMI(MF, DL, get(AMDGPU::S_SEXT_I32_I16), RegHi).addReg(RegHi));
2434 Adjust += 4;
2435 }
2436
2437 if (OpLo.isGlobal())
2438 OpLo.setOffset(OpLo.getOffset() + Adjust + 4);
2439 Bundler.append(
2440 BuildMI(MF, DL, get(AMDGPU::S_ADD_U32), RegLo).addReg(RegLo).add(OpLo));
2441
2442 if (OpHi.isGlobal())
2443 OpHi.setOffset(OpHi.getOffset() + Adjust + 12);
2444 Bundler.append(BuildMI(MF, DL, get(AMDGPU::S_ADDC_U32), RegHi)
2445 .addReg(RegHi)
2446 .add(OpHi));
2447
2448 finalizeBundle(MBB, Bundler.begin());
2449
2450 MI.eraseFromParent();
2451 break;
2452 }
2453 case AMDGPU::SI_PC_ADD_REL_OFFSET64: {
2454 MachineFunction &MF = *MBB.getParent();
2455 Register Reg = MI.getOperand(0).getReg();
2456 MachineOperand Op = MI.getOperand(1);
2457
2458 // Create a bundle so these instructions won't be re-ordered by the
2459 // post-RA scheduler.
2460 MIBundleBuilder Bundler(MBB, MI);
2461 Bundler.append(BuildMI(MF, DL, get(AMDGPU::S_GETPC_B64), Reg));
2462 if (Op.isGlobal())
2463 Op.setOffset(Op.getOffset() + 4);
2464 Bundler.append(
2465 BuildMI(MF, DL, get(AMDGPU::S_ADD_U64), Reg).addReg(Reg).add(Op));
2466
2467 finalizeBundle(MBB, Bundler.begin());
2468
2469 MI.eraseFromParent();
2470 break;
2471 }
2472 case AMDGPU::ENTER_STRICT_WWM: {
2473 // This only gets its own opcode so that SIPreAllocateWWMRegs can tell when
2474 // Whole Wave Mode is entered.
2475 MI.setDesc(get(LMC.OrSaveExecOpc));
2476 break;
2477 }
2478 case AMDGPU::ENTER_STRICT_WQM: {
2479 // This only gets its own opcode so that SIPreAllocateWWMRegs can tell when
2480 // STRICT_WQM is entered.
2481 BuildMI(MBB, MI, DL, get(LMC.MovOpc), MI.getOperand(0).getReg())
2482 .addReg(LMC.ExecReg);
2483 BuildMI(MBB, MI, DL, get(LMC.WQMOpc), LMC.ExecReg).addReg(LMC.ExecReg);
2484
2485 MI.eraseFromParent();
2486 break;
2487 }
2488 case AMDGPU::EXIT_STRICT_WWM:
2489 case AMDGPU::EXIT_STRICT_WQM: {
2490 // This only gets its own opcode so that SIPreAllocateWWMRegs can tell when
2491 // WWM/STRICT_WQM is exited.
2492 MI.setDesc(get(LMC.MovOpc));
2493 break;
2494 }
2495 case AMDGPU::SI_RETURN: {
2496 const MachineFunction *MF = MBB.getParent();
2497 const GCNSubtarget &ST = MF->getSubtarget<GCNSubtarget>();
2498 const SIRegisterInfo *TRI = ST.getRegisterInfo();
2499 // Hiding the return address use with SI_RETURN may lead to extra kills in
2500 // the function and missing live-ins. We are fine in practice because callee
2501 // saved register handling ensures the register value is restored before
2502 // RET, but we need the undef flag here to appease the MachineVerifier
2503 // liveness checks.
2504 MachineInstrBuilder MIB =
2505 BuildMI(MBB, MI, DL, get(AMDGPU::S_SETPC_B64_return))
2506 .addReg(TRI->getReturnAddressReg(*MF), RegState::Undef);
2507
2508 MIB.copyImplicitOps(MI);
2509 MI.eraseFromParent();
2510 break;
2511 }
2512
2513 case AMDGPU::S_MUL_U64_U32_PSEUDO:
2514 case AMDGPU::S_MUL_I64_I32_PSEUDO:
2515 MI.setDesc(get(AMDGPU::S_MUL_U64));
2516 break;
2517
2518 case AMDGPU::S_GETPC_B64_pseudo:
2519 MI.setDesc(get(AMDGPU::S_GETPC_B64));
2520 if (ST.hasGetPCZeroExtension()) {
2521 Register Dst = MI.getOperand(0).getReg();
2522 Register DstHi = RI.getSubReg(Dst, AMDGPU::sub1);
2523 // Fix up hardware that does not sign-extend the 48-bit PC value by
2524 // inserting: s_sext_i32_i16 dsthi, dsthi
2525 BuildMI(MBB, std::next(MI.getIterator()), DL, get(AMDGPU::S_SEXT_I32_I16),
2526 DstHi)
2527 .addReg(DstHi);
2528 }
2529 break;
2530
2531 case AMDGPU::V_MAX_BF16_PSEUDO_e64: {
2532 assert(ST.hasBF16PackedInsts());
2533 MI.setDesc(get(AMDGPU::V_PK_MAX_NUM_BF16));
2534 MI.addOperand(MachineOperand::CreateImm(0)); // op_sel
2535 MI.addOperand(MachineOperand::CreateImm(0)); // neg_lo
2536 MI.addOperand(MachineOperand::CreateImm(0)); // neg_hi
2537 auto Op0 = getNamedOperand(MI, AMDGPU::OpName::src0_modifiers);
2538 Op0->setImm(Op0->getImm() | SISrcMods::OP_SEL_1);
2539 auto Op1 = getNamedOperand(MI, AMDGPU::OpName::src1_modifiers);
2540 Op1->setImm(Op1->getImm() | SISrcMods::OP_SEL_1);
2541 break;
2542 }
2543
2544 case AMDGPU::GET_STACK_BASE:
2545 // The stack starts at offset 0 unless we need to reserve some space at the
2546 // bottom.
2547 if (ST.getFrameLowering()->mayReserveScratchForCWSR(*MBB.getParent())) {
2548 // When CWSR is used in dynamic VGPR mode, the trap handler needs to save
2549 // some of the VGPRs. The size of the required scratch space has already
2550 // been computed by prolog epilog insertion.
2551 const SIMachineFunctionInfo *MFI =
2552 MBB.getParent()->getInfo<SIMachineFunctionInfo>();
2553 unsigned VGPRSize = MFI->getScratchReservedForDynamicVGPRs();
2554 Register DestReg = MI.getOperand(0).getReg();
2555 BuildMI(MBB, MI, DL, get(AMDGPU::S_GETREG_B32), DestReg)
2558 // The MicroEngine ID is 0 for the graphics queue, and 1 or 2 for compute
2559 // (3 is unused, so we ignore it). Unfortunately, S_GETREG doesn't set
2560 // SCC, so we need to check for 0 manually.
2561 BuildMI(MBB, MI, DL, get(AMDGPU::S_CMP_LG_U32)).addImm(0).addReg(DestReg);
2562 // Change the implicit-def of SCC to an explicit use (but first remove
2563 // the dead flag if present).
2564 MI.getOperand(MI.getNumExplicitOperands()).setIsDead(false);
2565 MI.getOperand(MI.getNumExplicitOperands()).setIsUse();
2566 MI.setDesc(get(AMDGPU::S_CMOVK_I32));
2567 MI.addOperand(MachineOperand::CreateImm(VGPRSize));
2568 } else {
2569 MI.setDesc(get(AMDGPU::S_MOV_B32));
2570 MI.addOperand(MachineOperand::CreateImm(0));
2571 MI.removeOperand(
2572 MI.getNumExplicitOperands()); // Drop implicit def of SCC.
2573 }
2574 break;
2575 }
2576
2577 return true;
2578}
2579
2580void SIInstrInfo::reMaterialize(MachineBasicBlock &MBB,
2581 MachineBasicBlock::iterator I, Register DestReg,
2582 unsigned SubIdx, const MachineInstr &Orig,
2583 LaneBitmask UsedLanes) const {
2584
2585 // Try shrinking the instruction to remat only the part needed for current
2586 // context.
2587 // TODO: Handle more cases.
2588 unsigned Opcode = Orig.getOpcode();
2589 switch (Opcode) {
2590 case AMDGPU::S_MOV_B64:
2591 case AMDGPU::S_MOV_B64_IMM_PSEUDO: {
2592 if (SubIdx != 0)
2593 break;
2594
2595 if (!Orig.getOperand(1).isImm())
2596 break;
2597
2598 // Shrink S_MOV_B64 to S_MOV_B32 when UsedLanes indicates only a single
2599 // 32-bit lane of the 64-bit value is live at the rematerialization point.
2600 if (UsedLanes.all())
2601 break;
2602
2603 // Determine which half of the 64-bit immediate corresponds to the use.
2604 unsigned OrigSubReg = Orig.getOperand(0).getSubReg();
2605 unsigned LoSubReg = RI.composeSubRegIndices(OrigSubReg, AMDGPU::sub0);
2606 unsigned HiSubReg = RI.composeSubRegIndices(OrigSubReg, AMDGPU::sub1);
2607
2608 bool NeedLo = (UsedLanes & RI.getSubRegIndexLaneMask(LoSubReg)).any();
2609 bool NeedHi = (UsedLanes & RI.getSubRegIndexLaneMask(HiSubReg)).any();
2610
2611 if (NeedLo && NeedHi)
2612 break;
2613
2614 int64_t Imm64 = Orig.getOperand(1).getImm();
2615 int32_t Imm32 = NeedLo ? Lo_32(Imm64) : Hi_32(Imm64);
2616
2617 unsigned UseSubReg = NeedLo ? LoSubReg : HiSubReg;
2618
2619 // Emit S_MOV_B32 defining just the needed 32-bit subreg of DestReg.
2620 BuildMI(MBB, I, Orig.getDebugLoc(), get(AMDGPU::S_MOV_B32))
2621 .addReg(DestReg, RegState::Define | RegState::Undef, UseSubReg)
2622 .addImm(Imm32);
2623 return;
2624 }
2625
2626 case AMDGPU::S_LOAD_DWORDX16_IMM:
2627 case AMDGPU::S_LOAD_DWORDX8_IMM: {
2628 if (SubIdx != 0)
2629 break;
2630
2631 if (I == MBB.end())
2632 break;
2633
2634 if (I->isBundled())
2635 break;
2636
2637 // Look for a single use of the register that is also a subreg.
2638 Register RegToFind = Orig.getOperand(0).getReg();
2639 MachineOperand *UseMO = nullptr;
2640 for (auto &CandMO : I->operands()) {
2641 if (!CandMO.isReg() || CandMO.getReg() != RegToFind || CandMO.isDef())
2642 continue;
2643 if (UseMO) {
2644 UseMO = nullptr;
2645 break;
2646 }
2647 UseMO = &CandMO;
2648 }
2649 if (!UseMO || UseMO->getSubReg() == AMDGPU::NoSubRegister)
2650 break;
2651
2652 unsigned Offset = RI.getSubRegIdxOffset(UseMO->getSubReg());
2653 unsigned SubregSize = RI.getSubRegIdxSize(UseMO->getSubReg());
2654
2655 MachineFunction *MF = MBB.getParent();
2656 MachineRegisterInfo &MRI = MF->getRegInfo();
2657 assert(MRI.use_nodbg_empty(DestReg) && "DestReg should have no users yet.");
2658
2659 unsigned NewOpcode = -1;
2660 if (SubregSize == 256)
2661 NewOpcode = AMDGPU::S_LOAD_DWORDX8_IMM;
2662 else if (SubregSize == 128)
2663 NewOpcode = AMDGPU::S_LOAD_DWORDX4_IMM;
2664 else
2665 break;
2666
2667 const MCInstrDesc &TID = get(NewOpcode);
2668 const TargetRegisterClass *NewRC =
2669 RI.getAllocatableClass(getRegClass(TID, 0));
2670 MRI.setRegClass(DestReg, NewRC);
2671
2672 UseMO->setReg(DestReg);
2673 UseMO->setSubReg(AMDGPU::NoSubRegister);
2674
2675 // Use a smaller load with the desired size, possibly with updated offset.
2676 MachineInstr *MI = MF->CloneMachineInstr(&Orig);
2677 MI->setDesc(TID);
2678 MI->getOperand(0).setReg(DestReg);
2679 MI->getOperand(0).setSubReg(AMDGPU::NoSubRegister);
2680 if (Offset) {
2681 MachineOperand *OffsetMO = getNamedOperand(*MI, AMDGPU::OpName::offset);
2682 int64_t FinalOffset = OffsetMO->getImm() + Offset / 8;
2683 OffsetMO->setImm(FinalOffset);
2684 }
2685 SmallVector<MachineMemOperand *> NewMMOs;
2686 for (const MachineMemOperand *MemOp : Orig.memoperands())
2687 NewMMOs.push_back(MF->getMachineMemOperand(MemOp, MemOp->getPointerInfo(),
2688 SubregSize / 8));
2689 MI->setMemRefs(*MF, NewMMOs);
2690
2691 MBB.insert(I, MI);
2692 return;
2693 }
2694
2695 default:
2696 break;
2697 }
2698
2699 TargetInstrInfo::reMaterialize(MBB, I, DestReg, SubIdx, Orig, UsedLanes);
2700}
2701
2702std::pair<MachineInstr*, MachineInstr*>
2703SIInstrInfo::expandMovDPP64(MachineInstr &MI) const {
2704 assert (MI.getOpcode() == AMDGPU::V_MOV_B64_DPP_PSEUDO);
2705
2706 if (ST.hasVMovB64Inst() && ST.hasFeature(AMDGPU::FeatureDPALU_DPP) &&
2707 AMDGPU::isLegalDPALU_DPPControl(
2708 ST, getNamedOperand(MI, AMDGPU::OpName::dpp_ctrl)->getImm())) {
2709 MI.setDesc(get(AMDGPU::V_MOV_B64_dpp));
2710 return std::pair(&MI, nullptr);
2711 }
2712
2713 MachineBasicBlock &MBB = *MI.getParent();
2714 DebugLoc DL = MBB.findDebugLoc(MI);
2715 MachineFunction *MF = MBB.getParent();
2716 MachineRegisterInfo &MRI = MF->getRegInfo();
2717 Register Dst = MI.getOperand(0).getReg();
2718 unsigned Part = 0;
2719 MachineInstr *Split[2];
2720
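 // Split the 64-bit DPP move into two 32-bit DPP moves, one per 32-bit half,
 // and recombine the results with a REG_SEQUENCE for virtual destinations.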
2721 for (auto Sub : { AMDGPU::sub0, AMDGPU::sub1 }) {
2722 auto MovDPP = BuildMI(MBB, MI, DL, get(AMDGPU::V_MOV_B32_dpp));
2723 if (Dst.isPhysical()) {
2724 MovDPP.addDef(RI.getSubReg(Dst, Sub));
2725 } else {
2726 assert(MRI.isSSA());
2727 auto Tmp = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
2728 MovDPP.addDef(Tmp);
2729 }
2730
2731 for (unsigned I = 1; I <= 2; ++I) { // old and src operands.
2732 const MachineOperand &SrcOp = MI.getOperand(I);
2733 assert(!SrcOp.isFPImm());
2734 if (SrcOp.isImm()) {
2735 APInt Imm(64, SrcOp.getImm());
2736 Imm.ashrInPlace(Part * 32);
2737 MovDPP.addImm(Imm.getLoBits(32).getZExtValue());
2738 } else {
2739 assert(SrcOp.isReg());
2740 Register Src = SrcOp.getReg();
2741 if (Src.isPhysical())
2742 MovDPP.addReg(RI.getSubReg(Src, Sub));
2743 else
2744 MovDPP.addReg(Src, getUndefRegState(SrcOp.isUndef()), Sub);
2745 }
2746 }
2747
2748 for (const MachineOperand &MO : llvm::drop_begin(MI.explicit_operands(), 3))
2749 MovDPP.addImm(MO.getImm());
2750
2751 Split[Part] = MovDPP;
2752 ++Part;
2753 }
2754
2755 if (Dst.isVirtual())
2756 BuildMI(MBB, MI, DL, get(AMDGPU::REG_SEQUENCE), Dst)
2757 .addReg(Split[0]->getOperand(0).getReg())
2758 .addImm(AMDGPU::sub0)
2759 .addReg(Split[1]->getOperand(0).getReg())
2760 .addImm(AMDGPU::sub1);
2761
2762 MI.eraseFromParent();
2763 return std::pair(Split[0], Split[1]);
2764}
2765
2766std::optional<DestSourcePair>
2767SIInstrInfo::isCopyInstrImpl(const MachineInstr &MI) const {
2768 if (MI.getOpcode() == AMDGPU::WWM_COPY)
2769 return DestSourcePair{MI.getOperand(0), MI.getOperand(1)};
2770
2771 return std::nullopt;
2772}
2773
2774bool SIInstrInfo::swapSourceModifiers(MachineInstr &MI, MachineOperand &Src0,
2775 AMDGPU::OpName Src0OpName,
2776 MachineOperand &Src1,
2777 AMDGPU::OpName Src1OpName) const {
2778 MachineOperand *Src0Mods = getNamedOperand(MI, Src0OpName);
2779 if (!Src0Mods)
2780 return false;
2781
2782 MachineOperand *Src1Mods = getNamedOperand(MI, Src1OpName);
2783 assert(Src1Mods &&
2784 "All commutable instructions have both src0 and src1 modifiers");
2785
2786 int Src0ModsVal = Src0Mods->getImm();
2787 int Src1ModsVal = Src1Mods->getImm();
2788
2789 Src1Mods->setImm(Src0ModsVal);
2790 Src0Mods->setImm(Src1ModsVal);
2791 return true;
2792}
2793
2794static MachineInstr *swapRegAndNonRegOperand(MachineInstr &MI,
2795 MachineOperand &RegOp,
2796 MachineOperand &NonRegOp) {
2797 Register Reg = RegOp.getReg();
2798 unsigned SubReg = RegOp.getSubReg();
2799 bool IsKill = RegOp.isKill();
2800 bool IsDead = RegOp.isDead();
2801 bool IsUndef = RegOp.isUndef();
2802 bool IsDebug = RegOp.isDebug();
2803
2804 if (NonRegOp.isImm())
2805 RegOp.ChangeToImmediate(NonRegOp.getImm());
2806 else if (NonRegOp.isFI())
2807 RegOp.ChangeToFrameIndex(NonRegOp.getIndex());
2808 else if (NonRegOp.isGlobal()) {
2809 RegOp.ChangeToGA(NonRegOp.getGlobal(), NonRegOp.getOffset(),
2810 NonRegOp.getTargetFlags());
2811 } else
2812 return nullptr;
2813
2814 // Make sure we don't reinterpret a subreg index in the target flags.
2815 RegOp.setTargetFlags(NonRegOp.getTargetFlags());
2816
2817 NonRegOp.ChangeToRegister(Reg, false, false, IsKill, IsDead, IsUndef, IsDebug);
2818 NonRegOp.setSubReg(SubReg);
2819
2820 return &MI;
2821}
2822
2823static MachineInstr *swapImmOperands(MachineInstr &MI,
2824 MachineOperand &NonRegOp1,
2825 MachineOperand &NonRegOp2) {
2826 unsigned TargetFlags = NonRegOp1.getTargetFlags();
2827 int64_t NonRegVal = NonRegOp1.getImm();
2828
2829 NonRegOp1.setImm(NonRegOp2.getImm());
2830 NonRegOp2.setImm(NonRegVal);
2831 NonRegOp1.setTargetFlags(NonRegOp2.getTargetFlags());
2832 NonRegOp2.setTargetFlags(TargetFlags);
2833 return &MI;
2834}
2835
2836bool SIInstrInfo::isLegalToSwap(const MachineInstr &MI, unsigned OpIdx0,
2837 unsigned OpIdx1) const {
2838 const MCInstrDesc &InstDesc = MI.getDesc();
2839 const MCOperandInfo &OpInfo0 = InstDesc.operands()[OpIdx0];
2840 const MCOperandInfo &OpInfo1 = InstDesc.operands()[OpIdx1];
2841
2842 unsigned Opc = MI.getOpcode();
2843 int Src0Idx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::src0);
2844
2845 const MachineOperand &MO0 = MI.getOperand(OpIdx0);
2846 const MachineOperand &MO1 = MI.getOperand(OpIdx1);
2847
2848 // Swapping must not breach the constant bus or literal limits.
2849 // It may move a literal to a position other than src0, which is not allowed
2850 // pre-gfx10. However, most test cases need literals in src0 for VOP.
2851 // FIXME: After gfx9, a literal can be placed somewhere other than src0.
2852 if (isVALU(MI)) {
2853 if ((int)OpIdx0 == Src0Idx && !MO0.isReg() &&
2854 !isInlineConstant(MO0, OpInfo1))
2855 return false;
2856 if ((int)OpIdx1 == Src0Idx && !MO1.isReg() &&
2857 !isInlineConstant(MO1, OpInfo0))
2858 return false;
2859 }
2860
2861 if ((int)OpIdx1 != Src0Idx && MO0.isReg()) {
2862 if (OpInfo1.RegClass == -1)
2863 return OpInfo1.OperandType == MCOI::OPERAND_UNKNOWN;
2864 return isLegalRegOperand(MI, OpIdx1, MO0) &&
2865 (!MO1.isReg() || isLegalRegOperand(MI, OpIdx0, MO1));
2866 }
2867 if ((int)OpIdx0 != Src0Idx && MO1.isReg()) {
2868 if (OpInfo0.RegClass == -1)
2869 return OpInfo0.OperandType == MCOI::OPERAND_UNKNOWN;
2870 return (!MO0.isReg() || isLegalRegOperand(MI, OpIdx1, MO0)) &&
2871 isLegalRegOperand(MI, OpIdx0, MO1);
2872 }
2873
2874 // No need to check 64-bit literals, since swapping does not bring new
2875 // 64-bit literals into the current instruction to fold to 32-bit.
2876
2877 return isImmOperandLegal(MI, OpIdx1, MO0);
2878}
2879
2880MachineInstr *SIInstrInfo::commuteInstructionImpl(MachineInstr &MI, bool NewMI,
2881 unsigned Src0Idx,
2882 unsigned Src1Idx) const {
2883 assert(!NewMI && "this should never be used");
2884
2885 unsigned Opc = MI.getOpcode();
2886 int CommutedOpcode = commuteOpcode(Opc);
2887 if (CommutedOpcode == -1)
2888 return nullptr;
2889
2890 if (Src0Idx > Src1Idx)
2891 std::swap(Src0Idx, Src1Idx);
2892
2893 assert(AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::src0) ==
2894 static_cast<int>(Src0Idx) &&
2895 AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::src1) ==
2896 static_cast<int>(Src1Idx) &&
2897 "inconsistency with findCommutedOpIndices");
2898
2899 if (!isLegalToSwap(MI, Src0Idx, Src1Idx))
2900 return nullptr;
2901
2902 MachineInstr *CommutedMI = nullptr;
2903 MachineOperand &Src0 = MI.getOperand(Src0Idx);
2904 MachineOperand &Src1 = MI.getOperand(Src1Idx);
2905 if (Src0.isReg() && Src1.isReg()) {
2906 // Be sure to copy the source modifiers to the right place.
2907 CommutedMI =
2908 TargetInstrInfo::commuteInstructionImpl(MI, NewMI, Src0Idx, Src1Idx);
2909 } else if (Src0.isReg() && !Src1.isReg()) {
2910 CommutedMI = swapRegAndNonRegOperand(MI, Src0, Src1);
2911 } else if (!Src0.isReg() && Src1.isReg()) {
2912 CommutedMI = swapRegAndNonRegOperand(MI, Src1, Src0);
2913 } else if (Src0.isImm() && Src1.isImm()) {
2914 CommutedMI = swapImmOperands(MI, Src0, Src1);
2915 } else {
2916 // FIXME: Found two non registers to commute. This does happen.
2917 return nullptr;
2918 }
2919
2920 if (CommutedMI) {
2921 swapSourceModifiers(MI, Src0, AMDGPU::OpName::src0_modifiers,
2922 Src1, AMDGPU::OpName::src1_modifiers);
2923
2924 swapSourceModifiers(MI, Src0, AMDGPU::OpName::src0_sel, Src1,
2925 AMDGPU::OpName::src1_sel);
2926
2927 CommutedMI->setDesc(get(CommutedOpcode));
2928 }
2929
2930 return CommutedMI;
2931}
2932
2933// This needs to be implemented because the source modifiers may be inserted
2934// between the true commutable operands, and the base
2935// TargetInstrInfo::commuteInstruction uses it.
2936bool SIInstrInfo::findCommutedOpIndices(const MachineInstr &MI,
2937 unsigned &SrcOpIdx0,
2938 unsigned &SrcOpIdx1) const {
2939 return findCommutedOpIndices(MI.getDesc(), SrcOpIdx0, SrcOpIdx1);
2940}
2941
2942bool SIInstrInfo::findCommutedOpIndices(const MCInstrDesc &Desc,
2943 unsigned &SrcOpIdx0,
2944 unsigned &SrcOpIdx1) const {
2945 if (!Desc.isCommutable())
2946 return false;
2947
2948 unsigned Opc = Desc.getOpcode();
2949 int Src0Idx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::src0);
2950 if (Src0Idx == -1)
2951 return false;
2952
2953 int Src1Idx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::src1);
2954 if (Src1Idx == -1)
2955 return false;
2956
2957 return fixCommutedOpIndices(SrcOpIdx0, SrcOpIdx1, Src0Idx, Src1Idx);
2958}
2959
2960bool SIInstrInfo::isBranchOffsetInRange(unsigned BranchOp,
2961 int64_t BrOffset) const {
2962 // BranchRelaxation should never have to check s_setpc_b64 or s_add_pc_i64
2963 // because its dest block is unanalyzable.
2964 assert(isSOPP(BranchOp) || isSOPK(BranchOp));
2965
2966 // Convert to dwords.
2967 BrOffset /= 4;
2968
2969 // The branch instructions do PC += signext(SIMM16 * 4) + 4, so the offset is
2970 // from the next instruction.
2971 BrOffset -= 1;
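 // For example, with 16 offset bits this accepts roughly +/-32K dwords,
 // i.e. about +/-128 KiB of code, as a reachable branch distance.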
2972
2973 return isIntN(BranchOffsetBits, BrOffset);
2974}
2975
2976MachineBasicBlock *
2977SIInstrInfo::getBranchDestBlock(const MachineInstr &MI) const {
2978 return MI.getOperand(0).getMBB();
2979}
2980
2982 for (const MachineInstr &MI : MBB->terminators()) {
2983 if (MI.getOpcode() == AMDGPU::SI_IF || MI.getOpcode() == AMDGPU::SI_ELSE ||
2984 MI.getOpcode() == AMDGPU::SI_LOOP)
2985 return true;
2986 }
2987 return false;
2988}
2989
2990void SIInstrInfo::insertIndirectBranch(MachineBasicBlock &MBB,
2991 MachineBasicBlock &DestBB,
2992 MachineBasicBlock &RestoreBB,
2993 const DebugLoc &DL, int64_t BrOffset,
2994 RegScavenger *RS) const {
2995 assert(MBB.empty() &&
2996 "new block should be inserted for expanding unconditional branch");
2997 assert(MBB.pred_size() == 1);
2998 assert(RestoreBB.empty() &&
2999 "restore block should be inserted for restoring clobbered registers");
3000
3001 MachineFunction *MF = MBB.getParent();
3002 MachineRegisterInfo &MRI = MF->getRegInfo();
3003 SIMachineFunctionInfo *MFI = MF->getInfo<SIMachineFunctionInfo>();
3004 auto I = MBB.end();
3005 auto &MCCtx = MF->getContext();
3006
3007 if (ST.useAddPC64Inst()) {
3008 MCSymbol *Offset =
3009 MCCtx.createTempSymbol("offset", /*AlwaysAddSuffix=*/true);
3010 auto AddPC = BuildMI(MBB, I, DL, get(AMDGPU::S_ADD_PC_I64))
3011 .addSym(Offset, MO_FAR_BRANCH_OFFSET);
3012 MCSymbol *PostAddPCLabel =
3013 MCCtx.createTempSymbol("post_addpc", /*AlwaysAddSuffix=*/true);
3014 AddPC->setPostInstrSymbol(*MF, PostAddPCLabel);
3015 auto *OffsetExpr = MCBinaryExpr::createSub(
3016 MCSymbolRefExpr::create(DestBB.getSymbol(), MCCtx),
3017 MCSymbolRefExpr::create(PostAddPCLabel, MCCtx), MCCtx);
3018 Offset->setVariableValue(OffsetExpr);
3019 return;
3020 }
3021
3022 assert(RS && "RegScavenger required for long branching");
3023
3024 // FIXME: Virtual register workaround for RegScavenger not working with empty
3025 // blocks.
3026 Register PCReg = MRI.createVirtualRegister(&AMDGPU::SReg_64RegClass);
3027
3028 // Note: as this is used after the hazard recognizer, we need to apply some
3029 // hazard workarounds directly.
3030 const bool FlushSGPRWrites = (ST.isWave64() && ST.hasVALUMaskWriteHazard()) ||
3031 ST.hasVALUReadSGPRHazard();
3032 auto ApplyHazardWorkarounds = [this, &MBB, &I, &DL, FlushSGPRWrites]() {
3033 if (FlushSGPRWrites)
3034 BuildMI(MBB, I, DL, get(AMDGPU::S_WAITCNT_DEPCTR))
3036 };
3037
3038 // We need to compute the offset relative to the instruction immediately after
3039 // s_getpc_b64. Insert the pc arithmetic code before the last terminator.
3040 MachineInstr *GetPC = BuildMI(MBB, I, DL, get(AMDGPU::S_GETPC_B64), PCReg);
3041 ApplyHazardWorkarounds();
3042
3043 MCSymbol *PostGetPCLabel =
3044 MCCtx.createTempSymbol("post_getpc", /*AlwaysAddSuffix=*/true);
3045 GetPC->setPostInstrSymbol(*MF, PostGetPCLabel);
3046
3047 MCSymbol *OffsetLo =
3048 MCCtx.createTempSymbol("offset_lo", /*AlwaysAddSuffix=*/true);
3049 MCSymbol *OffsetHi =
3050 MCCtx.createTempSymbol("offset_hi", /*AlwaysAddSuffix=*/true);
3051 BuildMI(MBB, I, DL, get(AMDGPU::S_ADD_U32))
3052 .addReg(PCReg, RegState::Define, AMDGPU::sub0)
3053 .addReg(PCReg, {}, AMDGPU::sub0)
3054 .addSym(OffsetLo, MO_FAR_BRANCH_OFFSET);
3055 BuildMI(MBB, I, DL, get(AMDGPU::S_ADDC_U32))
3056 .addReg(PCReg, RegState::Define, AMDGPU::sub1)
3057 .addReg(PCReg, {}, AMDGPU::sub1)
3058 .addSym(OffsetHi, MO_FAR_BRANCH_OFFSET);
3059 ApplyHazardWorkarounds();
3060
3061 // Insert the indirect branch after the other terminator.
3062 BuildMI(&MBB, DL, get(AMDGPU::S_SETPC_B64))
3063 .addReg(PCReg);
3064
3065 // If a spill is needed for the pc register pair, we need to insert a spill
3066 // restore block right before the destination block, and insert a short branch
3067 // into the old destination block's fallthrough predecessor.
3068 // e.g.:
3069 //
3070 // s_cbranch_scc0 skip_long_branch:
3071 //
3072 // long_branch_bb:
3073 // spill s[8:9]
3074 // s_getpc_b64 s[8:9]
3075 // s_add_u32 s8, s8, restore_bb
3076 // s_addc_u32 s9, s9, 0
3077 // s_setpc_b64 s[8:9]
3078 //
3079 // skip_long_branch:
3080 // foo;
3081 //
3082 // .....
3083 //
3084 // dest_bb_fallthrough_predecessor:
3085 // bar;
3086 // s_branch dest_bb
3087 //
3088 // restore_bb:
3089 // restore s[8:9]
3090 // fallthrough dest_bb
3091 //
3092 // dest_bb:
3093 // buzz;
3094
3095 Register LongBranchReservedReg = MFI->getLongBranchReservedReg();
3096 Register Scav;
3097
3098 // If we've previously reserved a register for long branches,
3099 // avoid running the scavenger and just use that register.
3100 if (LongBranchReservedReg) {
3101 RS->enterBasicBlock(MBB);
3102 Scav = LongBranchReservedReg;
3103 } else {
3104 RS->enterBasicBlockEnd(MBB);
3105 Scav = RS->scavengeRegisterBackwards(
3106 AMDGPU::SReg_64RegClass, MachineBasicBlock::iterator(GetPC),
3107 /* RestoreAfter */ false, 0, /* AllowSpill */ false);
3108 }
3109 if (Scav) {
3110 RS->setRegUsed(Scav);
3111 MRI.replaceRegWith(PCReg, Scav);
3112 MRI.clearVirtRegs();
3113 } else {
3114 // As SGPR needs VGPR to be spilled, we reuse the slot of temporary VGPR for
3115 // SGPR spill.
3116 const GCNSubtarget &ST = MF->getSubtarget<GCNSubtarget>();
3117 const SIRegisterInfo *TRI = ST.getRegisterInfo();
3118 TRI->spillEmergencySGPR(GetPC, RestoreBB, AMDGPU::SGPR0_SGPR1, RS);
3119 MRI.replaceRegWith(PCReg, AMDGPU::SGPR0_SGPR1);
3120 MRI.clearVirtRegs();
3121 }
3122
3123 MCSymbol *DestLabel = Scav ? DestBB.getSymbol() : RestoreBB.getSymbol();
3124 // Now the distance can be defined.
3125 auto *Offset = MCBinaryExpr::createSub(
3126 MCSymbolRefExpr::create(DestLabel, MCCtx),
3127 MCSymbolRefExpr::create(PostGetPCLabel, MCCtx), MCCtx);
3128 // Add offset assignments.
3129 auto *Mask = MCConstantExpr::create(0xFFFFFFFFULL, MCCtx);
3130 OffsetLo->setVariableValue(MCBinaryExpr::createAnd(Offset, Mask, MCCtx));
3131 auto *ShAmt = MCConstantExpr::create(32, MCCtx);
3132 OffsetHi->setVariableValue(MCBinaryExpr::createAShr(Offset, ShAmt, MCCtx));
3133}
3134
3135unsigned SIInstrInfo::getBranchOpcode(SIInstrInfo::BranchPredicate Cond) {
3136 switch (Cond) {
3137 case SIInstrInfo::SCC_TRUE:
3138 return AMDGPU::S_CBRANCH_SCC1;
3139 case SIInstrInfo::SCC_FALSE:
3140 return AMDGPU::S_CBRANCH_SCC0;
3141 case SIInstrInfo::VCCNZ:
3142 return AMDGPU::S_CBRANCH_VCCNZ;
3143 case SIInstrInfo::VCCZ:
3144 return AMDGPU::S_CBRANCH_VCCZ;
3145 case SIInstrInfo::EXECNZ:
3146 return AMDGPU::S_CBRANCH_EXECNZ;
3147 case SIInstrInfo::EXECZ:
3148 return AMDGPU::S_CBRANCH_EXECZ;
3149 default:
3150 llvm_unreachable("invalid branch predicate");
3151 }
3152}
3153
3154SIInstrInfo::BranchPredicate SIInstrInfo::getBranchPredicate(unsigned Opcode) {
3155 switch (Opcode) {
3156 case AMDGPU::S_CBRANCH_SCC0:
3157 return SCC_FALSE;
3158 case AMDGPU::S_CBRANCH_SCC1:
3159 return SCC_TRUE;
3160 case AMDGPU::S_CBRANCH_VCCNZ:
3161 return VCCNZ;
3162 case AMDGPU::S_CBRANCH_VCCZ:
3163 return VCCZ;
3164 case AMDGPU::S_CBRANCH_EXECNZ:
3165 return EXECNZ;
3166 case AMDGPU::S_CBRANCH_EXECZ:
3167 return EXECZ;
3168 default:
3169 return INVALID_BR;
3170 }
3171}
3172
3173bool SIInstrInfo::analyzeBranchImpl(MachineBasicBlock &MBB,
3174 MachineBasicBlock::iterator I,
3175 MachineBasicBlock *&TBB,
3176 MachineBasicBlock *&FBB,
3177 SmallVectorImpl<MachineOperand> &Cond,
3178 bool AllowModify) const {
3179 if (I->getOpcode() == AMDGPU::S_BRANCH) {
3180 // Unconditional Branch
3181 TBB = I->getOperand(0).getMBB();
3182 return false;
3183 }
3184
3185 BranchPredicate Pred = getBranchPredicate(I->getOpcode());
3186 if (Pred == INVALID_BR)
3187 return true;
3188
3189 MachineBasicBlock *CondBB = I->getOperand(0).getMBB();
3190 Cond.push_back(MachineOperand::CreateImm(Pred));
3191 Cond.push_back(I->getOperand(1)); // Save the branch register.
3192
3193 ++I;
3194
3195 if (I == MBB.end()) {
3196 // Conditional branch followed by fall-through.
3197 TBB = CondBB;
3198 return false;
3199 }
3200
3201 if (I->getOpcode() == AMDGPU::S_BRANCH) {
3202 TBB = CondBB;
3203 FBB = I->getOperand(0).getMBB();
3204 return false;
3205 }
3206
3207 return true;
3208}
3209
3210bool SIInstrInfo::analyzeBranch(MachineBasicBlock &MBB, MachineBasicBlock *&TBB,
3211 MachineBasicBlock *&FBB,
3212 SmallVectorImpl<MachineOperand> &Cond,
3213 bool AllowModify) const {
3214 MachineBasicBlock::iterator I = MBB.getFirstTerminator();
3215 auto E = MBB.end();
3216 if (I == E)
3217 return false;
3218
3219 // Skip over the instructions that are artificial terminators for special
3220 // exec management.
3221 while (I != E && !I->isBranch() && !I->isReturn()) {
3222 switch (I->getOpcode()) {
3223 case AMDGPU::S_MOV_B64_term:
3224 case AMDGPU::S_XOR_B64_term:
3225 case AMDGPU::S_OR_B64_term:
3226 case AMDGPU::S_ANDN2_B64_term:
3227 case AMDGPU::S_AND_B64_term:
3228 case AMDGPU::S_AND_SAVEEXEC_B64_term:
3229 case AMDGPU::S_MOV_B32_term:
3230 case AMDGPU::S_XOR_B32_term:
3231 case AMDGPU::S_OR_B32_term:
3232 case AMDGPU::S_ANDN2_B32_term:
3233 case AMDGPU::S_AND_B32_term:
3234 case AMDGPU::S_AND_SAVEEXEC_B32_term:
3235 break;
3236 case AMDGPU::SI_IF:
3237 case AMDGPU::SI_ELSE:
3238 case AMDGPU::SI_KILL_I1_TERMINATOR:
3239 case AMDGPU::SI_KILL_F32_COND_IMM_TERMINATOR:
3240 // FIXME: It's messy that these need to be considered here at all.
3241 return true;
3242 default:
3243 llvm_unreachable("unexpected non-branch terminator inst");
3244 }
3245
3246 ++I;
3247 }
3248
3249 if (I == E)
3250 return false;
3251
3252 return analyzeBranchImpl(MBB, I, TBB, FBB, Cond, AllowModify);
3253}
3254
3255unsigned SIInstrInfo::removeBranch(MachineBasicBlock &MBB,
3256 int *BytesRemoved) const {
3257 unsigned Count = 0;
3258 unsigned RemovedSize = 0;
3259 for (MachineInstr &MI : llvm::make_early_inc_range(MBB.terminators())) {
3260 // Skip over artificial terminators when removing instructions.
3261 if (MI.isBranch() || MI.isReturn()) {
3262 RemovedSize += getInstSizeInBytes(MI);
3263 MI.eraseFromParent();
3264 ++Count;
3265 }
3266 }
3267
3268 if (BytesRemoved)
3269 *BytesRemoved = RemovedSize;
3270
3271 return Count;
3272}
3273
3274// Copy the flags onto the implicit condition register operand.
3275static void preserveCondRegFlags(MachineOperand &CondReg,
3276 const MachineOperand &OrigCond) {
3277 CondReg.setIsUndef(OrigCond.isUndef());
3278 CondReg.setIsKill(OrigCond.isKill());
3279}
3280
3281unsigned SIInstrInfo::insertBranch(MachineBasicBlock &MBB,
3282 MachineBasicBlock *TBB,
3283 MachineBasicBlock *FBB,
3284 ArrayRef<MachineOperand> Cond,
3285 const DebugLoc &DL,
3286 int *BytesAdded) const {
3287 if (!FBB && Cond.empty()) {
3288 BuildMI(&MBB, DL, get(AMDGPU::S_BRANCH))
3289 .addMBB(TBB);
3290 if (BytesAdded)
3291 *BytesAdded = ST.hasOffset3fBug() ? 8 : 4;
3292 return 1;
3293 }
3294
3295 assert(TBB && Cond[0].isImm());
3296
3297 unsigned Opcode
3298 = getBranchOpcode(static_cast<BranchPredicate>(Cond[0].getImm()));
3299
3300 if (!FBB) {
3301 MachineInstr *CondBr =
3302 BuildMI(&MBB, DL, get(Opcode))
3303 .addMBB(TBB);
3304
3305 // Copy the flags onto the implicit condition register operand.
3306 preserveCondRegFlags(CondBr->getOperand(1), Cond[1]);
3307 fixImplicitOperands(*CondBr);
3308
3309 if (BytesAdded)
3310 *BytesAdded = ST.hasOffset3fBug() ? 8 : 4;
3311 return 1;
3312 }
3313
3314 assert(TBB && FBB);
3315
3316 MachineInstr *CondBr =
3317 BuildMI(&MBB, DL, get(Opcode))
3318 .addMBB(TBB);
3319 fixImplicitOperands(*CondBr);
3320 BuildMI(&MBB, DL, get(AMDGPU::S_BRANCH))
3321 .addMBB(FBB);
3322
3323 MachineOperand &CondReg = CondBr->getOperand(1);
3324 CondReg.setIsUndef(Cond[1].isUndef());
3325 CondReg.setIsKill(Cond[1].isKill());
3326
3327 if (BytesAdded)
3328 *BytesAdded = ST.hasOffset3fBug() ? 16 : 8;
3329
3330 return 2;
3331}
3332
3333bool SIInstrInfo::reverseBranchCondition(
3334 SmallVectorImpl<MachineOperand> &Cond) const {
3335 if (Cond.size() != 2) {
3336 return true;
3337 }
3338
3339 if (Cond[0].isImm()) {
3340 Cond[0].setImm(-Cond[0].getImm());
3341 return false;
3342 }
3343
3344 return true;
3345}
3346
3347bool SIInstrInfo::canInsertSelect(const MachineBasicBlock &MBB,
3348 ArrayRef<MachineOperand> Cond,
3349 Register DstReg, Register TrueReg,
3350 Register FalseReg, int &CondCycles,
3351 int &TrueCycles, int &FalseCycles) const {
3352 switch (Cond[0].getImm()) {
3353 case VCCNZ:
3354 case VCCZ: {
3355 const MachineRegisterInfo &MRI = MBB.getParent()->getRegInfo();
3356 const TargetRegisterClass *RC = MRI.getRegClass(TrueReg);
3357 if (MRI.getRegClass(FalseReg) != RC)
3358 return false;
3359
3360 int NumInsts = AMDGPU::getRegBitWidth(*RC) / 32;
3361 CondCycles = TrueCycles = FalseCycles = NumInsts; // ???
3362
3363 // Limit to equal cost for branch vs. N v_cndmask_b32s.
3364 return RI.hasVGPRs(RC) && NumInsts <= 6;
3365 }
3366 case SCC_TRUE:
3367 case SCC_FALSE: {
3368 // FIXME: We could insert for VGPRs if we could replace the original compare
3369 // with a vector one.
3370 const MachineRegisterInfo &MRI = MBB.getParent()->getRegInfo();
3371 const TargetRegisterClass *RC = MRI.getRegClass(TrueReg);
3372 if (MRI.getRegClass(FalseReg) != RC)
3373 return false;
3374
3375 int NumInsts = AMDGPU::getRegBitWidth(*RC) / 32;
3376
3377 // Sizes that are a multiple of 64 bits can select 64 bits at a time with s_cselect_b64
3378 if (NumInsts % 2 == 0)
3379 NumInsts /= 2;
3380
3381 CondCycles = TrueCycles = FalseCycles = NumInsts; // ???
3382 return RI.isSGPRClass(RC);
3383 }
3384 default:
3385 return false;
3386 }
3387}
3388
3389void SIInstrInfo::insertSelect(MachineBasicBlock &MBB,
3390 MachineBasicBlock::iterator I, const DebugLoc &DL,
3391 Register DstReg, ArrayRef<MachineOperand> Cond,
3392 Register TrueReg, Register FalseReg) const {
3393 BranchPredicate Pred = static_cast<BranchPredicate>(Cond[0].getImm());
3394 if (Pred == VCCZ || Pred == SCC_FALSE) {
3395 Pred = static_cast<BranchPredicate>(-Pred);
3396 std::swap(TrueReg, FalseReg);
3397 }
3398
3399 MachineRegisterInfo &MRI = MBB.getParent()->getRegInfo();
3400 const TargetRegisterClass *DstRC = MRI.getRegClass(DstReg);
3401 unsigned DstSize = RI.getRegSizeInBits(*DstRC);
3402
3403 if (DstSize == 32) {
3404 MachineInstr *Select;
3405 if (Pred == SCC_TRUE) {
3406 Select = BuildMI(MBB, I, DL, get(AMDGPU::S_CSELECT_B32), DstReg)
3407 .addReg(TrueReg)
3408 .addReg(FalseReg);
3409 } else {
3410 // Instruction's operands are backwards from what is expected.
3411 Select = BuildMI(MBB, I, DL, get(AMDGPU::V_CNDMASK_B32_e32), DstReg)
3412 .addReg(FalseReg)
3413 .addReg(TrueReg);
3414 }
3415
3416 preserveCondRegFlags(Select->getOperand(3), Cond[1]);
3417 return;
3418 }
3419
3420 if (DstSize == 64 && Pred == SCC_TRUE) {
3421 MachineInstr *Select =
3422 BuildMI(MBB, I, DL, get(AMDGPU::S_CSELECT_B64), DstReg)
3423 .addReg(TrueReg)
3424 .addReg(FalseReg);
3425
3426 preserveCondRegFlags(Select->getOperand(3), Cond[1]);
3427 return;
3428 }
3429
3430 static const int16_t Sub0_15[] = {
3431 AMDGPU::sub0, AMDGPU::sub1, AMDGPU::sub2, AMDGPU::sub3,
3432 AMDGPU::sub4, AMDGPU::sub5, AMDGPU::sub6, AMDGPU::sub7,
3433 AMDGPU::sub8, AMDGPU::sub9, AMDGPU::sub10, AMDGPU::sub11,
3434 AMDGPU::sub12, AMDGPU::sub13, AMDGPU::sub14, AMDGPU::sub15,
3435 };
3436
3437 static const int16_t Sub0_15_64[] = {
3438 AMDGPU::sub0_sub1, AMDGPU::sub2_sub3,
3439 AMDGPU::sub4_sub5, AMDGPU::sub6_sub7,
3440 AMDGPU::sub8_sub9, AMDGPU::sub10_sub11,
3441 AMDGPU::sub12_sub13, AMDGPU::sub14_sub15,
3442 };
3443
3444 unsigned SelOp = AMDGPU::V_CNDMASK_B32_e32;
3445 const TargetRegisterClass *EltRC = &AMDGPU::VGPR_32RegClass;
3446 const int16_t *SubIndices = Sub0_15;
3447 int NElts = DstSize / 32;
3448
3449 // 64-bit select is only available for SALU.
3450 // TODO: Split 96-bit into 64-bit and 32-bit, not 3x 32-bit.
3451 if (Pred == SCC_TRUE) {
3452 if (NElts % 2) {
3453 SelOp = AMDGPU::S_CSELECT_B32;
3454 EltRC = &AMDGPU::SGPR_32RegClass;
3455 } else {
3456 SelOp = AMDGPU::S_CSELECT_B64;
3457 EltRC = &AMDGPU::SGPR_64RegClass;
3458 SubIndices = Sub0_15_64;
3459 NElts /= 2;
3460 }
3461 }
3462
3463 MachineInstrBuilder MIB = BuildMI(
3464 MBB, I, DL, get(AMDGPU::REG_SEQUENCE), DstReg);
3465
3466 I = MIB->getIterator();
3467
3468 SmallVector<Register, 8> Regs;
3469 for (int Idx = 0; Idx != NElts; ++Idx) {
3470 Register DstElt = MRI.createVirtualRegister(EltRC);
3471 Regs.push_back(DstElt);
3472
3473 unsigned SubIdx = SubIndices[Idx];
3474
3475 MachineInstr *Select;
3476 if (SelOp == AMDGPU::V_CNDMASK_B32_e32) {
3477 Select = BuildMI(MBB, I, DL, get(SelOp), DstElt)
3478 .addReg(FalseReg, {}, SubIdx)
3479 .addReg(TrueReg, {}, SubIdx);
3480 } else {
3481 Select = BuildMI(MBB, I, DL, get(SelOp), DstElt)
3482 .addReg(TrueReg, {}, SubIdx)
3483 .addReg(FalseReg, {}, SubIdx);
3484 }
3485
3486 preserveCondRegFlags(Select->getOperand(3), Cond[1]);
3488
3489 MIB.addReg(DstElt)
3490 .addImm(SubIdx);
3491 }
3492}
3493
3494bool SIInstrInfo::isFoldableCopy(const MachineInstr &MI) {
3495 switch (MI.getOpcode()) {
3496 case AMDGPU::V_MOV_B16_t16_e32:
3497 case AMDGPU::V_MOV_B16_t16_e64:
3498 case AMDGPU::V_MOV_B32_e32:
3499 case AMDGPU::V_MOV_B32_e64:
3500 case AMDGPU::V_MOV_B64_PSEUDO:
3501 case AMDGPU::V_MOV_B64_e32:
3502 case AMDGPU::V_MOV_B64_e64:
3503 case AMDGPU::S_MOV_B32:
3504 case AMDGPU::S_MOV_B64:
3505 case AMDGPU::S_MOV_B64_IMM_PSEUDO:
3506 case AMDGPU::COPY:
3507 case AMDGPU::WWM_COPY:
3508 case AMDGPU::V_ACCVGPR_WRITE_B32_e64:
3509 case AMDGPU::V_ACCVGPR_READ_B32_e64:
3510 case AMDGPU::V_ACCVGPR_MOV_B32:
3511 case AMDGPU::AV_MOV_B32_IMM_PSEUDO:
3512 case AMDGPU::AV_MOV_B64_IMM_PSEUDO:
3513 return true;
3514 default:
3515 return false;
3516 }
3517}
3518
3519unsigned SIInstrInfo::getFoldableCopySrcIdx(const MachineInstr &MI) {
3520 switch (MI.getOpcode()) {
3521 case AMDGPU::V_MOV_B16_t16_e32:
3522 case AMDGPU::V_MOV_B16_t16_e64:
3523 return 2;
3524 case AMDGPU::V_MOV_B32_e32:
3525 case AMDGPU::V_MOV_B32_e64:
3526 case AMDGPU::V_MOV_B64_PSEUDO:
3527 case AMDGPU::V_MOV_B64_e32:
3528 case AMDGPU::V_MOV_B64_e64:
3529 case AMDGPU::S_MOV_B32:
3530 case AMDGPU::S_MOV_B64:
3531 case AMDGPU::S_MOV_B64_IMM_PSEUDO:
3532 case AMDGPU::COPY:
3533 case AMDGPU::WWM_COPY:
3534 case AMDGPU::V_ACCVGPR_WRITE_B32_e64:
3535 case AMDGPU::V_ACCVGPR_READ_B32_e64:
3536 case AMDGPU::V_ACCVGPR_MOV_B32:
3537 case AMDGPU::AV_MOV_B32_IMM_PSEUDO:
3538 case AMDGPU::AV_MOV_B64_IMM_PSEUDO:
3539 return 1;
3540 default:
3541 llvm_unreachable("MI is not a foldable copy");
3542 }
3543}
3544
3545static constexpr AMDGPU::OpName ModifierOpNames[] = {
3546 AMDGPU::OpName::src0_modifiers, AMDGPU::OpName::src1_modifiers,
3547 AMDGPU::OpName::src2_modifiers, AMDGPU::OpName::clamp,
3548 AMDGPU::OpName::omod, AMDGPU::OpName::op_sel};
3549
3550void SIInstrInfo::removeModOperands(MachineInstr &MI) const {
3551 unsigned Opc = MI.getOpcode();
3552 for (AMDGPU::OpName Name : reverse(ModifierOpNames)) {
3553 int Idx = AMDGPU::getNamedOperandIdx(Opc, Name);
3554 if (Idx >= 0)
3555 MI.removeOperand(Idx);
3556 }
3557}
3558
3560 const MCInstrDesc &NewDesc) const {
3561 MI.setDesc(NewDesc);
3562
3563 // Remove any leftover implicit operands from mutating the instruction. e.g.
3564 // if we replace an s_and_b32 with a copy, we don't need the implicit scc def
3565 // anymore.
3566 const MCInstrDesc &Desc = MI.getDesc();
3567 unsigned NumOps = Desc.getNumOperands() + Desc.implicit_uses().size() +
3568 Desc.implicit_defs().size();
3569
3570 for (unsigned I = MI.getNumOperands() - 1; I >= NumOps; --I)
3571 MI.removeOperand(I);
3572}
3573
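// Return the portion of \p Imm selected by subregister index \p SubRegIndex,
// sign-extended to 64 bits, or std::nullopt for unhandled indexes.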
3574std::optional<int64_t> SIInstrInfo::extractSubregFromImm(int64_t Imm,
3575 unsigned SubRegIndex) {
3576 switch (SubRegIndex) {
3577 case AMDGPU::NoSubRegister:
3578 return Imm;
3579 case AMDGPU::sub0:
3580 return SignExtend64<32>(Imm);
3581 case AMDGPU::sub1:
3582 return SignExtend64<32>(Imm >> 32);
3583 case AMDGPU::lo16:
3584 return SignExtend64<16>(Imm);
3585 case AMDGPU::hi16:
3586 return SignExtend64<16>(Imm >> 16);
3587 case AMDGPU::sub1_lo16:
3588 return SignExtend64<16>(Imm >> 32);
3589 case AMDGPU::sub1_hi16:
3590 return SignExtend64<16>(Imm >> 48);
3591 default:
3592 return std::nullopt;
3593 }
3594
3595 llvm_unreachable("covered subregister switch");
3596}
3597
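// Map a MAC/MAD/FMA/FMAC opcode to the corresponding madak/fmaak form, which
// encodes the addend as a literal constant (K) operand.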
3598static unsigned getNewFMAAKInst(const GCNSubtarget &ST, unsigned Opc) {
3599 switch (Opc) {
3600 case AMDGPU::V_MAC_F16_e32:
3601 case AMDGPU::V_MAC_F16_e64:
3602 case AMDGPU::V_MAD_F16_e64:
3603 return AMDGPU::V_MADAK_F16;
3604 case AMDGPU::V_MAC_F32_e32:
3605 case AMDGPU::V_MAC_F32_e64:
3606 case AMDGPU::V_MAD_F32_e64:
3607 return AMDGPU::V_MADAK_F32;
3608 case AMDGPU::V_FMAC_F32_e32:
3609 case AMDGPU::V_FMAC_F32_e64:
3610 case AMDGPU::V_FMA_F32_e64:
3611 return AMDGPU::V_FMAAK_F32;
3612 case AMDGPU::V_FMAC_F16_e32:
3613 case AMDGPU::V_FMAC_F16_e64:
3614 case AMDGPU::V_FMAC_F16_t16_e64:
3615 case AMDGPU::V_FMAC_F16_fake16_e64:
3616 case AMDGPU::V_FMAC_F16_t16_e32:
3617 case AMDGPU::V_FMAC_F16_fake16_e32:
3618 case AMDGPU::V_FMA_F16_e64:
3619 return ST.hasTrue16BitInsts() ? ST.useRealTrue16Insts()
3620 ? AMDGPU::V_FMAAK_F16_t16
3621 : AMDGPU::V_FMAAK_F16_fake16
3622 : AMDGPU::V_FMAAK_F16;
3623 case AMDGPU::V_FMAC_F64_e32:
3624 case AMDGPU::V_FMAC_F64_e64:
3625 case AMDGPU::V_FMA_F64_e64:
3626 return AMDGPU::V_FMAAK_F64;
3627 default:
3628 llvm_unreachable("invalid instruction");
3629 }
3630}
3631
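// Map a MAC/MAD/FMA/FMAC opcode to the corresponding madmk/fmamk form, which
// encodes one multiplicand as a literal constant (K) operand.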
3632static unsigned getNewFMAMKInst(const GCNSubtarget &ST, unsigned Opc) {
3633 switch (Opc) {
3634 case AMDGPU::V_MAC_F16_e32:
3635 case AMDGPU::V_MAC_F16_e64:
3636 case AMDGPU::V_MAD_F16_e64:
3637 return AMDGPU::V_MADMK_F16;
3638 case AMDGPU::V_MAC_F32_e32:
3639 case AMDGPU::V_MAC_F32_e64:
3640 case AMDGPU::V_MAD_F32_e64:
3641 return AMDGPU::V_MADMK_F32;
3642 case AMDGPU::V_FMAC_F32_e32:
3643 case AMDGPU::V_FMAC_F32_e64:
3644 case AMDGPU::V_FMA_F32_e64:
3645 return AMDGPU::V_FMAMK_F32;
3646 case AMDGPU::V_FMAC_F16_e32:
3647 case AMDGPU::V_FMAC_F16_e64:
3648 case AMDGPU::V_FMAC_F16_t16_e64:
3649 case AMDGPU::V_FMAC_F16_fake16_e64:
3650 case AMDGPU::V_FMAC_F16_t16_e32:
3651 case AMDGPU::V_FMAC_F16_fake16_e32:
3652 case AMDGPU::V_FMA_F16_e64:
3653 return ST.hasTrue16BitInsts() ? ST.useRealTrue16Insts()
3654 ? AMDGPU::V_FMAMK_F16_t16
3655 : AMDGPU::V_FMAMK_F16_fake16
3656 : AMDGPU::V_FMAMK_F16;
3657 case AMDGPU::V_FMAC_F64_e32:
3658 case AMDGPU::V_FMAC_F64_e64:
3659 case AMDGPU::V_FMA_F64_e64:
3660 return AMDGPU::V_FMAMK_F64;
3661 default:
3662 llvm_unreachable("invalid instruction");
3663 }
3664}
3665
3666bool SIInstrInfo::foldImmediate(MachineInstr &UseMI, MachineInstr &DefMI,
3667 Register Reg, MachineRegisterInfo *MRI) const {
3668 int64_t Imm;
3669 if (!getConstValDefinedInReg(DefMI, Reg, Imm))
3670 return false;
3671
3672 const bool HasMultipleUses = !MRI->hasOneNonDBGUse(Reg);
3673
3674 assert(!DefMI.getOperand(0).getSubReg() && "Expected SSA form");
3675
3676 unsigned Opc = UseMI.getOpcode();
3677 if (Opc == AMDGPU::COPY) {
3678 assert(!UseMI.getOperand(0).getSubReg() && "Expected SSA form");
3679
3680 Register DstReg = UseMI.getOperand(0).getReg();
3681 Register UseSubReg = UseMI.getOperand(1).getSubReg();
3682
3683 const TargetRegisterClass *DstRC = RI.getRegClassForReg(*MRI, DstReg);
3684
3685 if (HasMultipleUses) {
3686 // TODO: This should fold in more cases with multiple use, but we need to
3687 // more carefully consider what those uses are.
3688 unsigned ImmDefSize = RI.getRegSizeInBits(*MRI->getRegClass(Reg));
3689
3690 // Avoid breaking up a 64-bit inline immediate into a subregister extract.
3691 if (UseSubReg != AMDGPU::NoSubRegister && ImmDefSize == 64)
3692 return false;
3693
3694 // Most of the time folding a 32-bit inline constant is free (though this
3695 // might not be true if we can't later fold it into a real user).
3696 //
3697 // FIXME: This isInlineConstant check is imprecise if
3698 // getConstValDefinedInReg handled the tricky non-mov cases.
3699 if (ImmDefSize == 32 &&
3701 return false;
3702 }
3703
3704 bool Is16Bit = UseSubReg != AMDGPU::NoSubRegister &&
3705 RI.getSubRegIdxSize(UseSubReg) == 16;
3706
3707 if (Is16Bit) {
3708 if (RI.hasVGPRs(DstRC))
3709 return false; // Do not clobber vgpr_hi16
3710
3711 if (DstReg.isVirtual() && UseSubReg != AMDGPU::lo16)
3712 return false;
3713 }
3714
3715 MachineFunction *MF = UseMI.getMF();
3716
3717 unsigned NewOpc = AMDGPU::INSTRUCTION_LIST_END;
3718 MCRegister MovDstPhysReg =
3719 DstReg.isPhysical() ? DstReg.asMCReg() : MCRegister();
3720
3721 std::optional<int64_t> SubRegImm = extractSubregFromImm(Imm, UseSubReg);
3722
3723 // TODO: Try to fold with AMDGPU::V_MOV_B16_t16_e64
3724 for (unsigned MovOp :
3725 {AMDGPU::S_MOV_B32, AMDGPU::V_MOV_B32_e32, AMDGPU::S_MOV_B64,
3726 AMDGPU::V_MOV_B64_PSEUDO, AMDGPU::V_ACCVGPR_WRITE_B32_e64}) {
3727 const MCInstrDesc &MovDesc = get(MovOp);
3728
3729 const TargetRegisterClass *MovDstRC = getRegClass(MovDesc, 0);
3730 if (Is16Bit) {
3731 // We just need to find a correctly sized register class, so the
3732 // subregister index compatibility doesn't matter since we're statically
3733 // extracting the immediate value.
3734 MovDstRC = RI.getMatchingSuperRegClass(MovDstRC, DstRC, AMDGPU::lo16);
3735 if (!MovDstRC)
3736 continue;
3737
3738 if (MovDstPhysReg) {
3739 // FIXME: We probably should not do this. If there is a live value in
3740 // the high half of the register, it will be corrupted.
3741 MovDstPhysReg =
3742 RI.getMatchingSuperReg(MovDstPhysReg, AMDGPU::lo16, MovDstRC);
3743 if (!MovDstPhysReg)
3744 continue;
3745 }
3746 }
3747
3748 // Result class isn't the right size, try the next instruction.
3749 if (MovDstPhysReg) {
3750 if (!MovDstRC->contains(MovDstPhysReg))
3751 return false;
3752 } else if (!MRI->constrainRegClass(DstReg, MovDstRC)) {
3753 // TODO: This will be overly conservative in the case of 16-bit virtual
3754 // SGPRs. We could hack up the virtual register uses to use a compatible
3755 // 32-bit class.
3756 continue;
3757 }
3758
3759 const MCOperandInfo &OpInfo = MovDesc.operands()[1];
3760
3761 // Ensure the interpreted immediate value is a valid operand in the new
3762 // mov.
3763 //
3764 // FIXME: isImmOperandLegal should have form that doesn't require existing
3765 // MachineInstr or MachineOperand
3766 if (!RI.opCanUseLiteralConstant(OpInfo.OperandType) &&
3767 !isInlineConstant(*SubRegImm, OpInfo.OperandType))
3768 break;
3769
3770 NewOpc = MovOp;
3771 break;
3772 }
3773
3774 if (NewOpc == AMDGPU::INSTRUCTION_LIST_END)
3775 return false;
3776
3777 if (Is16Bit) {
3778 UseMI.getOperand(0).setSubReg(AMDGPU::NoSubRegister);
3779 if (MovDstPhysReg)
3780 UseMI.getOperand(0).setReg(MovDstPhysReg);
3781 assert(UseMI.getOperand(1).getReg().isVirtual());
3782 }
3783
3784 const MCInstrDesc &NewMCID = get(NewOpc);
3785 UseMI.setDesc(NewMCID);
3786 UseMI.getOperand(1).ChangeToImmediate(*SubRegImm);
3787 UseMI.addImplicitDefUseOperands(*MF);
3788 return true;
3789 }
3790
3791 if (HasMultipleUses)
3792 return false;
3793
3794 if (Opc == AMDGPU::V_MAD_F32_e64 || Opc == AMDGPU::V_MAC_F32_e64 ||
3795 Opc == AMDGPU::V_MAD_F16_e64 || Opc == AMDGPU::V_MAC_F16_e64 ||
3796 Opc == AMDGPU::V_FMA_F32_e64 || Opc == AMDGPU::V_FMAC_F32_e64 ||
3797 Opc == AMDGPU::V_FMA_F16_e64 || Opc == AMDGPU::V_FMAC_F16_e64 ||
3798 Opc == AMDGPU::V_FMAC_F16_t16_e64 ||
3799 Opc == AMDGPU::V_FMAC_F16_fake16_e64 || Opc == AMDGPU::V_FMA_F64_e64 ||
3800 Opc == AMDGPU::V_FMAC_F64_e64) {
3801 // Don't fold if we are using source or output modifiers. The new VOP2
3802 // instructions don't have them.
3803 if (hasAnyModifiersSet(UseMI))
3804 return false;
3805
3806 // If this is a free constant, there's no reason to do this.
3807 // TODO: We could fold this here instead of letting SIFoldOperands do it
3808 // later.
3809 int Src0Idx = getNamedOperandIdx(UseMI.getOpcode(), AMDGPU::OpName::src0);
3810
3811 // Any src operand can be used for the legality check.
3812 if (isInlineConstant(UseMI, Src0Idx, Imm))
3813 return false;
3814
3815 MachineOperand *Src0 = &UseMI.getOperand(Src0Idx);
3816
3817 MachineOperand *Src1 = getNamedOperand(UseMI, AMDGPU::OpName::src1);
3818 MachineOperand *Src2 = getNamedOperand(UseMI, AMDGPU::OpName::src2);
3819
3820 auto CopyRegOperandToNarrowerRC =
3821 [MRI, this](MachineInstr &MI, unsigned OpNo,
3822 const TargetRegisterClass *NewRC) -> void {
3823 if (!MI.getOperand(OpNo).isReg())
3824 return;
3825 Register Reg = MI.getOperand(OpNo).getReg();
3826 const TargetRegisterClass *RC = RI.getRegClassForReg(*MRI, Reg);
3827 if (RI.getCommonSubClass(RC, NewRC) != NewRC)
3828 return;
3829 Register Tmp = MRI->createVirtualRegister(NewRC);
3830 BuildMI(*MI.getParent(), MI.getIterator(), MI.getDebugLoc(),
3831 get(AMDGPU::COPY), Tmp)
3832 .addReg(Reg);
3833 MI.getOperand(OpNo).setReg(Tmp);
3834 MI.getOperand(OpNo).setIsKill();
3835 };
3836
3837 // Multiplied part is the constant: Use v_madmk_{f16, f32}.
3838 if ((Src0->isReg() && Src0->getReg() == Reg) ||
3839 (Src1->isReg() && Src1->getReg() == Reg)) {
3840 MachineOperand *RegSrc =
3841 Src1->isReg() && Src1->getReg() == Reg ? Src0 : Src1;
3842 if (!RegSrc->isReg())
3843 return false;
3844 if (RI.isSGPRClass(MRI->getRegClass(RegSrc->getReg())) &&
3845 ST.getConstantBusLimit(Opc) < 2)
3846 return false;
3847
3848 if (!Src2->isReg() || RI.isSGPRClass(MRI->getRegClass(Src2->getReg())))
3849 return false;
3850
3851 // If src2 is also a literal constant then we have to choose which one to
3852 // fold. In general it is better to choose madak so that the other literal
3853 // can be materialized in an sgpr instead of a vgpr:
3854 // s_mov_b32 s0, literal
3855 // v_madak_f32 v0, s0, v0, literal
3856 // Instead of:
3857 // v_mov_b32 v1, literal
3858 // v_madmk_f32 v0, v0, literal, v1
3859 MachineInstr *Def = MRI->getUniqueVRegDef(Src2->getReg());
3860 if (Def && Def->isMoveImmediate() &&
3861 !isInlineConstant(Def->getOperand(1)))
3862 return false;
3863
3864 unsigned NewOpc = getNewFMAMKInst(ST, Opc);
3865 if (pseudoToMCOpcode(NewOpc) == -1)
3866 return false;
3867
3868 const std::optional<int64_t> SubRegImm = extractSubregFromImm(
3869 Imm, RegSrc == Src1 ? Src0->getSubReg() : Src1->getSubReg());
3870
3871 // FIXME: This would be a lot easier if we could return a new instruction
3872 // instead of having to modify in place.
3873
3874 Register SrcReg = RegSrc->getReg();
3875 unsigned SrcSubReg = RegSrc->getSubReg();
3876 Src0->setReg(SrcReg);
3877 Src0->setSubReg(SrcSubReg);
3878 Src0->setIsKill(RegSrc->isKill());
3879
3880 if (Opc == AMDGPU::V_MAC_F32_e64 || Opc == AMDGPU::V_MAC_F16_e64 ||
3881 Opc == AMDGPU::V_FMAC_F32_e64 || Opc == AMDGPU::V_FMAC_F16_t16_e64 ||
3882 Opc == AMDGPU::V_FMAC_F16_fake16_e64 ||
3883 Opc == AMDGPU::V_FMAC_F16_e64 || Opc == AMDGPU::V_FMAC_F64_e64)
3884 UseMI.untieRegOperand(
3885 AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::src2));
3886
3887 Src1->ChangeToImmediate(*SubRegImm);
3888
3889 removeModOperands(UseMI);
3890 UseMI.setDesc(get(NewOpc));
3891
3892 if (NewOpc == AMDGPU::V_FMAMK_F16_t16 ||
3893 NewOpc == AMDGPU::V_FMAMK_F16_fake16) {
3894 const TargetRegisterClass *NewRC = getRegClass(get(NewOpc), 0);
3895 Register Tmp = MRI->createVirtualRegister(NewRC);
3896 BuildMI(*UseMI.getParent(), std::next(UseMI.getIterator()),
3897 UseMI.getDebugLoc(), get(AMDGPU::COPY),
3898 UseMI.getOperand(0).getReg())
3899 .addReg(Tmp, RegState::Kill);
3900 UseMI.getOperand(0).setReg(Tmp);
3901 CopyRegOperandToNarrowerRC(UseMI, 1, NewRC);
3902 CopyRegOperandToNarrowerRC(UseMI, 3, NewRC);
3903 }
3904
3905 bool DeleteDef = MRI->use_nodbg_empty(Reg);
3906 if (DeleteDef)
3907 DefMI.eraseFromParent();
3908
3909 return true;
3910 }
3911
3912 // Added part is the constant: Use v_madak_{f16, f32}.
3913 if (Src2->isReg() && Src2->getReg() == Reg) {
3914 if (ST.getConstantBusLimit(Opc) < 2) {
3915 // Not allowed to use constant bus for another operand.
3916 // We can however allow an inline immediate as src0.
3917 bool Src0Inlined = false;
3918 if (Src0->isReg()) {
3919 // Try to inline constant if possible.
3920 // If the def is a move of an immediate and this is its only use,
3921 // we save a VGPR here.
3922 MachineInstr *Def = MRI->getUniqueVRegDef(Src0->getReg());
3923 if (Def && Def->isMoveImmediate() &&
3924 isInlineConstant(Def->getOperand(1)) &&
3925 MRI->hasOneNonDBGUse(Src0->getReg())) {
3926 Src0->ChangeToImmediate(Def->getOperand(1).getImm());
3927 Src0Inlined = true;
3928 } else if (ST.getConstantBusLimit(Opc) <= 1 &&
3929 RI.isSGPRReg(*MRI, Src0->getReg())) {
3930 return false;
3931 }
3932 // VGPR is okay as Src0 - fallthrough
3933 }
3934
3935 if (Src1->isReg() && !Src0Inlined) {
3936 // We have one slot for an inlinable constant so far - try to fill it
3937 MachineInstr *Def = MRI->getUniqueVRegDef(Src1->getReg());
3938 if (Def && Def->isMoveImmediate() &&
3939 isInlineConstant(Def->getOperand(1)) &&
3940 MRI->hasOneNonDBGUse(Src1->getReg()) && commuteInstruction(UseMI))
3941 Src0->ChangeToImmediate(Def->getOperand(1).getImm());
3942 else if (RI.isSGPRReg(*MRI, Src1->getReg()))
3943 return false;
3944 // VGPR is okay as Src1 - fallthrough
3945 }
3946 }
3947
3948 unsigned NewOpc = getNewFMAAKInst(ST, Opc);
3949 if (pseudoToMCOpcode(NewOpc) == -1)
3950 return false;
3951
3952 // FIXME: This would be a lot easier if we could return a new instruction
3953 // instead of having to modify in place.
3954
3955 if (Opc == AMDGPU::V_MAC_F32_e64 || Opc == AMDGPU::V_MAC_F16_e64 ||
3956 Opc == AMDGPU::V_FMAC_F32_e64 || Opc == AMDGPU::V_FMAC_F16_t16_e64 ||
3957 Opc == AMDGPU::V_FMAC_F16_fake16_e64 ||
3958 Opc == AMDGPU::V_FMAC_F16_e64 || Opc == AMDGPU::V_FMAC_F64_e64)
3959 UseMI.untieRegOperand(
3960 AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::src2));
3961
3962 const std::optional<int64_t> SubRegImm =
3963 extractSubregFromImm(Imm, Src2->getSubReg());
3964
3965 // ChangingToImmediate adds Src2 back to the instruction.
3966 Src2->ChangeToImmediate(*SubRegImm);
3967
3968 // These come before src2.
3969 removeModOperands(UseMI);
3970 UseMI.setDesc(get(NewOpc));
3971
3972 if (NewOpc == AMDGPU::V_FMAAK_F16_t16 ||
3973 NewOpc == AMDGPU::V_FMAAK_F16_fake16) {
3974 const TargetRegisterClass *NewRC = getRegClass(get(NewOpc), 0);
3975 Register Tmp = MRI->createVirtualRegister(NewRC);
3976 BuildMI(*UseMI.getParent(), std::next(UseMI.getIterator()),
3977 UseMI.getDebugLoc(), get(AMDGPU::COPY),
3978 UseMI.getOperand(0).getReg())
3979 .addReg(Tmp, RegState::Kill);
3980 UseMI.getOperand(0).setReg(Tmp);
3981 CopyRegOperandToNarrowerRC(UseMI, 1, NewRC);
3982 CopyRegOperandToNarrowerRC(UseMI, 2, NewRC);
3983 }
3984
3985 // It might happen that UseMI was commuted and we now have an SGPR as
3986 // src1. If so, the literal constant and the SGPR together violate the
3987 // constant bus restriction, so the operands must be re-legalized.
3988 legalizeOperands(UseMI);
3989
3990 bool DeleteDef = MRI->use_nodbg_empty(Reg);
3991 if (DeleteDef)
3992 DefMI.eraseFromParent();
3993
3994 return true;
3995 }
3996 }
3997
3998 return false;
3999}
4000
4001static bool
4002memOpsHaveSameBaseOperands(ArrayRef<const MachineOperand *> BaseOps1,
4003 ArrayRef<const MachineOperand *> BaseOps2) {
4004 if (BaseOps1.size() != BaseOps2.size())
4005 return false;
4006 for (size_t I = 0, E = BaseOps1.size(); I < E; ++I) {
4007 if (!BaseOps1[I]->isIdenticalTo(*BaseOps2[I]))
4008 return false;
4009 }
4010 return true;
4011}
4012
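// Two accesses with the same base do not overlap if the lower one ends at or
// before the offset where the higher one begins and its size is known.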
4013static bool offsetsDoNotOverlap(LocationSize WidthA, int OffsetA,
4014 LocationSize WidthB, int OffsetB) {
4015 int LowOffset = OffsetA < OffsetB ? OffsetA : OffsetB;
4016 int HighOffset = OffsetA < OffsetB ? OffsetB : OffsetA;
4017 LocationSize LowWidth = (LowOffset == OffsetA) ? WidthA : WidthB;
4018 return LowWidth.hasValue() &&
4019 LowOffset + (int)LowWidth.getValue() <= HighOffset;
4020}
4021
4022bool SIInstrInfo::checkInstOffsetsDoNotOverlap(const MachineInstr &MIa,
4023 const MachineInstr &MIb) const {
4024 SmallVector<const MachineOperand *, 4> BaseOps0, BaseOps1;
4025 int64_t Offset0, Offset1;
4026 LocationSize Dummy0 = LocationSize::precise(0);
4027 LocationSize Dummy1 = LocationSize::precise(0);
4028 bool Offset0IsScalable, Offset1IsScalable;
4029 if (!getMemOperandsWithOffsetWidth(MIa, BaseOps0, Offset0, Offset0IsScalable,
4030 Dummy0, &RI) ||
4031 !getMemOperandsWithOffsetWidth(MIb, BaseOps1, Offset1, Offset1IsScalable,
4032 Dummy1, &RI))
4033 return false;
4034
4035 if (!memOpsHaveSameBaseOperands(BaseOps0, BaseOps1))
4036 return false;
4037
4038 if (!MIa.hasOneMemOperand() || !MIb.hasOneMemOperand()) {
4039 // FIXME: Handle ds_read2 / ds_write2.
4040 return false;
4041 }
4042 LocationSize Width0 = MIa.memoperands().front()->getSize();
4043 LocationSize Width1 = MIb.memoperands().front()->getSize();
4044 return offsetsDoNotOverlap(Width0, Offset0, Width1, Offset1);
4045}
4046
4047bool SIInstrInfo::areMemAccessesTriviallyDisjoint(const MachineInstr &MIa,
4048 const MachineInstr &MIb) const {
4049 assert(MIa.mayLoadOrStore() &&
4050 "MIa must load from or modify a memory location");
4051 assert(MIb.mayLoadOrStore() &&
4052 "MIb must load from or modify a memory location");
4053
4054 if (MIa.hasUnmodeledSideEffects() || MIb.hasUnmodeledSideEffects())
4055 return false;
4056
4057 // XXX - Can we relax this between address spaces?
4058 if (MIa.hasOrderedMemoryRef() || MIb.hasOrderedMemoryRef())
4059 return false;
4060
4061 if (isLDSDMA(MIa) || isLDSDMA(MIb))
4062 return false;
4063
4064 if (MIa.isBundle() || MIb.isBundle())
4065 return false;
4066
4067 // TODO: Should we check the address space from the MachineMemOperand? That
4068 // would allow us to distinguish objects we know don't alias based on the
4069 // underlying address space, even if it was lowered to a different one,
4070 // e.g. private accesses lowered to use MUBUF instructions on a scratch
4071 // buffer.
4072 if (isDS(MIa)) {
4073 if (isDS(MIb))
4074 return checkInstOffsetsDoNotOverlap(MIa, MIb);
4075
4076 return !isFLAT(MIb) || isSegmentSpecificFLAT(MIb);
4077 }
4078
4079 if (isMUBUF(MIa) || isMTBUF(MIa)) {
4080 if (isMUBUF(MIb) || isMTBUF(MIb))
4081 return checkInstOffsetsDoNotOverlap(MIa, MIb);
4082
4083 if (isFLAT(MIb))
4084 return isFLATScratch(MIb);
4085
4086 return !isSMRD(MIb);
4087 }
4088
4089 if (isSMRD(MIa)) {
4090 if (isSMRD(MIb))
4091 return checkInstOffsetsDoNotOverlap(MIa, MIb);
4092
4093 if (isFLAT(MIb))
4094 return isFLATScratch(MIb);
4095
4096 return !isMUBUF(MIb) && !isMTBUF(MIb);
4097 }
4098
4099 if (isFLAT(MIa)) {
4100 if (isFLAT(MIb)) {
4101 if ((isFLATScratch(MIa) && isFLATGlobal(MIb)) ||
4102 (isFLATGlobal(MIa) && isFLATScratch(MIb)))
4103 return true;
4104
4105 return checkInstOffsetsDoNotOverlap(MIa, MIb);
4106 }
4107
4108 return false;
4109 }
4110
4111 return false;
4112}
4113
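// Return true if \p Reg is a virtual register defined by a foldable
// immediate-materializing copy; report the immediate and, optionally, the
// defining instruction.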
4114static bool getFoldableImm(Register Reg, const MachineRegisterInfo &MRI,
4115 int64_t &Imm, MachineInstr **DefMI = nullptr) {
4116 if (Reg.isPhysical())
4117 return false;
4118 auto *Def = MRI.getUniqueVRegDef(Reg);
4119 if (Def && SIInstrInfo::isFoldableCopy(*Def) && Def->getOperand(1).isImm()) {
4120 Imm = Def->getOperand(1).getImm();
4121 if (DefMI)
4122 *DefMI = Def;
4123 return true;
4124 }
4125 return false;
4126}
4127
4128static bool getFoldableImm(const MachineOperand *MO, int64_t &Imm,
4129 MachineInstr **DefMI = nullptr) {
4130 if (!MO->isReg())
4131 return false;
4132 const MachineFunction *MF = MO->getParent()->getMF();
4133 const MachineRegisterInfo &MRI = MF->getRegInfo();
4134 return getFoldableImm(MO->getReg(), MRI, Imm, DefMI);
4135}
4136
4137static void updateLiveVariables(LiveVariables *LV, MachineInstr &MI,
4138 MachineInstr &NewMI) {
4139 if (LV) {
4140 unsigned NumOps = MI.getNumOperands();
4141 for (unsigned I = 1; I < NumOps; ++I) {
4142 MachineOperand &Op = MI.getOperand(I);
4143 if (Op.isReg() && Op.isKill())
4144 LV->replaceKillInstruction(Op.getReg(), MI, NewMI);
4145 }
4146 }
4147}
4148
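// Map a two-address MAC/FMAC opcode to the equivalent three-address MAD/FMA
// opcode used by the three-address conversion below.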
4149static unsigned getNewFMAInst(const GCNSubtarget &ST, unsigned Opc) {
4150 switch (Opc) {
4151 case AMDGPU::V_MAC_F16_e32:
4152 case AMDGPU::V_MAC_F16_e64:
4153 return AMDGPU::V_MAD_F16_e64;
4154 case AMDGPU::V_MAC_F32_e32:
4155 case AMDGPU::V_MAC_F32_e64:
4156 return AMDGPU::V_MAD_F32_e64;
4157 case AMDGPU::V_MAC_LEGACY_F32_e32:
4158 case AMDGPU::V_MAC_LEGACY_F32_e64:
4159 return AMDGPU::V_MAD_LEGACY_F32_e64;
4160 case AMDGPU::V_FMAC_LEGACY_F32_e32:
4161 case AMDGPU::V_FMAC_LEGACY_F32_e64:
4162 return AMDGPU::V_FMA_LEGACY_F32_e64;
4163 case AMDGPU::V_FMAC_F16_e32:
4164 case AMDGPU::V_FMAC_F16_e64:
4165 case AMDGPU::V_FMAC_F16_t16_e64:
4166 case AMDGPU::V_FMAC_F16_fake16_e64:
4167 return ST.hasTrue16BitInsts() ? ST.useRealTrue16Insts()
4168 ? AMDGPU::V_FMA_F16_gfx9_t16_e64
4169 : AMDGPU::V_FMA_F16_gfx9_fake16_e64
4170 : AMDGPU::V_FMA_F16_gfx9_e64;
4171 case AMDGPU::V_FMAC_F32_e32:
4172 case AMDGPU::V_FMAC_F32_e64:
4173 return AMDGPU::V_FMA_F32_e64;
4174 case AMDGPU::V_FMAC_F64_e32:
4175 case AMDGPU::V_FMAC_F64_e64:
4176 return AMDGPU::V_FMA_F64_e64;
4177 default:
4178 llvm_unreachable("invalid instruction");
4179 }
4180}
4181
4182/// Helper struct for the implementation of 3-address conversion to communicate
4183/// updates made to instruction operands.
4184struct ThreeAddressUpdates {
4185 /// Other instruction whose def is no longer used by the converted
4186 /// instruction.
4187 MachineInstr *RemoveMIUse = nullptr;
4188};
4189
4190MachineInstr *SIInstrInfo::convertToThreeAddress(MachineInstr &MI,
4191 LiveVariables *LV,
4192 LiveIntervals *LIS) const {
4193 MachineBasicBlock &MBB = *MI.getParent();
4194 MachineInstr *CandidateMI = &MI;
4195
4196 if (MI.isBundle()) {
4197 // This is a temporary placeholder for bundle handling that enables us to
4198 // exercise the relevant code paths in the two-address instruction pass.
4199 if (MI.getBundleSize() != 1)
4200 return nullptr;
4201 CandidateMI = MI.getNextNode();
4202 }
4203
4204 ThreeAddressUpdates U;
4205 MachineInstr *NewMI = convertToThreeAddressImpl(*CandidateMI, U);
4206 if (!NewMI)
4207 return nullptr;
4208
4209 if (MI.isBundle()) {
4210 CandidateMI->eraseFromBundle();
4211
4212 for (MachineOperand &MO : MI.all_defs()) {
4213 if (MO.isTied())
4214 MI.untieRegOperand(MO.getOperandNo());
4215 }
4216 } else {
4217 updateLiveVariables(LV, MI, *NewMI);
4218 if (LIS) {
4219 LIS->ReplaceMachineInstrInMaps(MI, *NewMI);
4220 // SlotIndex of defs needs to be updated when converting to early-clobber
4221 MachineOperand &Def = NewMI->getOperand(0);
4222 if (Def.isEarlyClobber() && Def.isReg() &&
4223 LIS->hasInterval(Def.getReg())) {
4224 SlotIndex OldIndex = LIS->getInstructionIndex(*NewMI).getRegSlot(false);
4225 SlotIndex NewIndex = LIS->getInstructionIndex(*NewMI).getRegSlot(true);
4226 auto &LI = LIS->getInterval(Def.getReg());
4227 auto UpdateDefIndex = [&](LiveRange &LR) {
4228 auto *S = LR.find(OldIndex);
4229 if (S != LR.end() && S->start == OldIndex) {
4230 assert(S->valno && S->valno->def == OldIndex);
4231 S->start = NewIndex;
4232 S->valno->def = NewIndex;
4233 }
4234 };
4235 UpdateDefIndex(LI);
4236 for (auto &SR : LI.subranges())
4237 UpdateDefIndex(SR);
4238 }
4239 }
4240 }
4241
4242 if (U.RemoveMIUse) {
4243 MachineRegisterInfo &MRI = MBB.getParent()->getRegInfo();
4244 // The only user is the instruction which will be killed.
4245 Register DefReg = U.RemoveMIUse->getOperand(0).getReg();
4246
4247 if (MRI.hasOneNonDBGUse(DefReg)) {
4248 // We cannot just remove the DefMI here; the calling pass would crash.
4249 U.RemoveMIUse->setDesc(get(AMDGPU::IMPLICIT_DEF));
4250 U.RemoveMIUse->getOperand(0).setIsDead(true);
4251 for (unsigned I = U.RemoveMIUse->getNumOperands() - 1; I != 0; --I)
4252 U.RemoveMIUse->removeOperand(I);
4253 if (LV)
4254 LV->getVarInfo(DefReg).AliveBlocks.clear();
4255 }
4256
4257 if (MI.isBundle()) {
4258 VirtRegInfo VRI = AnalyzeVirtRegInBundle(MI, DefReg);
4259 if (!VRI.Reads && !VRI.Writes) {
4260 for (MachineOperand &MO : MI.all_uses()) {
4261 if (MO.isReg() && MO.getReg() == DefReg) {
4262 assert(MO.getSubReg() == 0 &&
4263 "tied sub-registers in bundles currently not supported");
4264 MI.removeOperand(MO.getOperandNo());
4265 break;
4266 }
4267 }
4268
4269 if (LIS)
4270 LIS->shrinkToUses(&LIS->getInterval(DefReg));
4271 }
4272 } else if (LIS) {
4273 LiveInterval &DefLI = LIS->getInterval(DefReg);
4274
4275 // We cannot delete the original instruction here, so hack out the use
4276 // in the original instruction with a dummy register so we can use
4277 // shrinkToUses to deal with any multi-use edge cases. Other targets do
4278 // not have the complexity of deleting a use to consider here.
4279 Register DummyReg = MRI.cloneVirtualRegister(DefReg);
4280 for (MachineOperand &MIOp : MI.uses()) {
4281 if (MIOp.isReg() && MIOp.getReg() == DefReg) {
4282 MIOp.setIsUndef(true);
4283 MIOp.setReg(DummyReg);
4284 }
4285 }
4286
4287 if (MI.isBundle()) {
4288 VirtRegInfo VRI = AnalyzeVirtRegInBundle(MI, DefReg);
4289 if (!VRI.Reads && !VRI.Writes) {
4290 for (MachineOperand &MIOp : MI.uses()) {
4291 if (MIOp.isReg() && MIOp.getReg() == DefReg) {
4292 MIOp.setIsUndef(true);
4293 MIOp.setReg(DummyReg);
4294 }
4295 }
4296 }
4297
4298 MI.addOperand(MachineOperand::CreateReg(DummyReg, false, false, false,
4299 false, /*isUndef=*/true));
4300 }
4301
4302 LIS->shrinkToUses(&DefLI);
4303 }
4304 }
4305
4306 return MI.isBundle() ? &MI : NewMI;
4307}
4308
4309MachineInstr *
4310SIInstrInfo::convertToThreeAddressImpl(MachineInstr &MI,
4311 ThreeAddressUpdates &U) const {
4312 MachineBasicBlock &MBB = *MI.getParent();
4313 unsigned Opc = MI.getOpcode();
4314
4315 // Handle MFMA.
4316 int NewMFMAOpc = AMDGPU::getMFMAEarlyClobberOp(Opc);
4317 if (NewMFMAOpc != -1) {
4318 MachineInstrBuilder MIB =
4319 BuildMI(MBB, MI, MI.getDebugLoc(), get(NewMFMAOpc));
4320 for (unsigned I = 0, E = MI.getNumExplicitOperands(); I != E; ++I)
4321 MIB.add(MI.getOperand(I));
4322 return MIB;
4323 }
4324
4325 if (SIInstrInfo::isWMMA(MI)) {
4326 unsigned NewOpc = AMDGPU::mapWMMA2AddrTo3AddrOpcode(MI.getOpcode());
4327 MachineInstrBuilder MIB = BuildMI(MBB, MI, MI.getDebugLoc(), get(NewOpc))
4328 .setMIFlags(MI.getFlags());
4329 for (unsigned I = 0, E = MI.getNumExplicitOperands(); I != E; ++I)
4330 MIB->addOperand(MI.getOperand(I));
4331 return MIB;
4332 }
4333
4334 assert(Opc != AMDGPU::V_FMAC_F16_t16_e32 &&
4335 Opc != AMDGPU::V_FMAC_F16_fake16_e32 &&
4336 "V_FMAC_F16_t16/fake16_e32 is not supported and not expected to be "
4337 "present pre-RA");
4338
4339 // Handle MAC/FMAC.
4340 bool IsF64 = Opc == AMDGPU::V_FMAC_F64_e32 || Opc == AMDGPU::V_FMAC_F64_e64;
4341 bool IsLegacy = Opc == AMDGPU::V_MAC_LEGACY_F32_e32 ||
4342 Opc == AMDGPU::V_MAC_LEGACY_F32_e64 ||
4343 Opc == AMDGPU::V_FMAC_LEGACY_F32_e32 ||
4344 Opc == AMDGPU::V_FMAC_LEGACY_F32_e64;
4345 bool Src0Literal = false;
4346
4347 switch (Opc) {
4348 default:
4349 return nullptr;
4350 case AMDGPU::V_MAC_F16_e64:
4351 case AMDGPU::V_FMAC_F16_e64:
4352 case AMDGPU::V_FMAC_F16_t16_e64:
4353 case AMDGPU::V_FMAC_F16_fake16_e64:
4354 case AMDGPU::V_MAC_F32_e64:
4355 case AMDGPU::V_MAC_LEGACY_F32_e64:
4356 case AMDGPU::V_FMAC_F32_e64:
4357 case AMDGPU::V_FMAC_LEGACY_F32_e64:
4358 case AMDGPU::V_FMAC_F64_e64:
4359 break;
4360 case AMDGPU::V_MAC_F16_e32:
4361 case AMDGPU::V_FMAC_F16_e32:
4362 case AMDGPU::V_MAC_F32_e32:
4363 case AMDGPU::V_MAC_LEGACY_F32_e32:
4364 case AMDGPU::V_FMAC_F32_e32:
4365 case AMDGPU::V_FMAC_LEGACY_F32_e32:
4366 case AMDGPU::V_FMAC_F64_e32: {
4367 int Src0Idx = AMDGPU::getNamedOperandIdx(MI.getOpcode(),
4368 AMDGPU::OpName::src0);
4369 const MachineOperand *Src0 = &MI.getOperand(Src0Idx);
4370 if (!Src0->isReg() && !Src0->isImm())
4371 return nullptr;
4372
4373 if (Src0->isImm() && !isInlineConstant(MI, Src0Idx, *Src0))
4374 Src0Literal = true;
4375
4376 break;
4377 }
4378 }
4379
4380 MachineInstrBuilder MIB;
4381 const MachineOperand *Dst = getNamedOperand(MI, AMDGPU::OpName::vdst);
4382 const MachineOperand *Src0 = getNamedOperand(MI, AMDGPU::OpName::src0);
4383 const MachineOperand *Src0Mods =
4384 getNamedOperand(MI, AMDGPU::OpName::src0_modifiers);
4385 const MachineOperand *Src1 = getNamedOperand(MI, AMDGPU::OpName::src1);
4386 const MachineOperand *Src1Mods =
4387 getNamedOperand(MI, AMDGPU::OpName::src1_modifiers);
4388 const MachineOperand *Src2 = getNamedOperand(MI, AMDGPU::OpName::src2);
4389 const MachineOperand *Src2Mods =
4390 getNamedOperand(MI, AMDGPU::OpName::src2_modifiers);
4391 const MachineOperand *Clamp = getNamedOperand(MI, AMDGPU::OpName::clamp);
4392 const MachineOperand *Omod = getNamedOperand(MI, AMDGPU::OpName::omod);
4393 const MachineOperand *OpSel = getNamedOperand(MI, AMDGPU::OpName::op_sel);
4394
4395 if (!Src0Mods && !Src1Mods && !Src2Mods && !Clamp && !Omod && !IsLegacy &&
4396 (!IsF64 || ST.hasFmaakFmamkF64Insts()) &&
4397 // If we have an SGPR input, we will violate the constant bus restriction.
4398 (ST.getConstantBusLimit(Opc) > 1 || !Src0->isReg() ||
4399 !RI.isSGPRReg(MBB.getParent()->getRegInfo(), Src0->getReg()))) {
4400 MachineInstr *DefMI;
4401
4402 int64_t Imm;
4403 if (!Src0Literal && getFoldableImm(Src2, Imm, &DefMI)) {
4404 unsigned NewOpc = getNewFMAAKInst(ST, Opc);
4405 if (pseudoToMCOpcode(NewOpc) != -1) {
4406 MIB = BuildMI(MBB, MI, MI.getDebugLoc(), get(NewOpc))
4407 .add(*Dst)
4408 .add(*Src0)
4409 .add(*Src1)
4410 .addImm(Imm)
4411 .setMIFlags(MI.getFlags());
4412 U.RemoveMIUse = DefMI;
4413 return MIB;
4414 }
4415 }
4416 unsigned NewOpc = getNewFMAMKInst(ST, Opc);
4417 if (!Src0Literal && getFoldableImm(Src1, Imm, &DefMI)) {
4418 if (pseudoToMCOpcode(NewOpc) != -1) {
4419 MIB = BuildMI(MBB, MI, MI.getDebugLoc(), get(NewOpc))
4420 .add(*Dst)
4421 .add(*Src0)
4422 .addImm(Imm)
4423 .add(*Src2)
4424 .setMIFlags(MI.getFlags());
4425 U.RemoveMIUse = DefMI;
4426 return MIB;
4427 }
4428 }
4429 if (Src0Literal || getFoldableImm(Src0, Imm, &DefMI)) {
4430 if (Src0Literal) {
4431 Imm = Src0->getImm();
4432 DefMI = nullptr;
4433 }
4434 if (pseudoToMCOpcode(NewOpc) != -1 &&
4435 isOperandLegal(
4436 MI, AMDGPU::getNamedOperandIdx(NewOpc, AMDGPU::OpName::src0),
4437 Src1)) {
4438 MIB = BuildMI(MBB, MI, MI.getDebugLoc(), get(NewOpc))
4439 .add(*Dst)
4440 .add(*Src1)
4441 .addImm(Imm)
4442 .add(*Src2)
4443 .setMIFlags(MI.getFlags());
4444 U.RemoveMIUse = DefMI;
4445 return MIB;
4446 }
4447 }
4448 }
4449
4450 // VOP2 mac/fmac with a literal operand cannot be converted to VOP3 mad/fma
4451 // if VOP3 does not allow a literal operand.
4452 if (Src0Literal && !ST.hasVOP3Literal())
4453 return nullptr;
4454
4455 unsigned NewOpc = getNewFMAInst(ST, Opc);
4456
4457 if (pseudoToMCOpcode(NewOpc) == -1)
4458 return nullptr;
4459
4460 MIB = BuildMI(MBB, MI, MI.getDebugLoc(), get(NewOpc))
4461 .add(*Dst)
4462 .addImm(Src0Mods ? Src0Mods->getImm() : 0)
4463 .add(*Src0)
4464 .addImm(Src1Mods ? Src1Mods->getImm() : 0)
4465 .add(*Src1)
4466 .addImm(Src2Mods ? Src2Mods->getImm() : 0)
4467 .add(*Src2)
4468 .addImm(Clamp ? Clamp->getImm() : 0)
4469 .addImm(Omod ? Omod->getImm() : 0)
4470 .setMIFlags(MI.getFlags());
4471 if (AMDGPU::hasNamedOperand(NewOpc, AMDGPU::OpName::op_sel))
4472 MIB.addImm(OpSel ? OpSel->getImm() : 0);
4473 return MIB;
4474}
4475
4476// It's not generally safe to move VALU instructions across these since it will
4477// start using the register as a base index rather than directly.
4478// XXX - Why isn't hasSideEffects sufficient for these?
4479static bool changesVGPRIndexingMode(const MachineInstr &MI) {
4480 switch (MI.getOpcode()) {
4481 case AMDGPU::S_SET_GPR_IDX_ON:
4482 case AMDGPU::S_SET_GPR_IDX_MODE:
4483 case AMDGPU::S_SET_GPR_IDX_OFF:
4484 return true;
4485 default:
4486 return false;
4487 }
4488}
4489
4490bool SIInstrInfo::isSchedulingBoundary(const MachineInstr &MI,
4491 const MachineBasicBlock *MBB,
4492 const MachineFunction &MF) const {
4493 // Skipping the check for SP writes in the base implementation. The reason
4494 // it was added was apparently compile-time concerns.
4495 //
4496 // TODO: Do we really want this barrier? It triggers unnecessary hazard nops
4497 // but is probably avoidable.
4498
4499 // Copied from base implementation.
4500 // Terminators and labels can't be scheduled around.
4501 if (MI.isTerminator() || MI.isPosition())
4502 return true;
4503
4504 // INLINEASM_BR can jump to another block
4505 if (MI.getOpcode() == TargetOpcode::INLINEASM_BR)
4506 return true;
4507
4508 if (MI.getOpcode() == AMDGPU::SCHED_BARRIER && MI.getOperand(0).getImm() == 0)
4509 return true;
4510
4511 // Target-independent instructions do not have an implicit-use of EXEC, even
4512 // when they operate on VGPRs. Treating EXEC modifications as scheduling
4513 // boundaries prevents incorrect movements of such instructions.
4514 return MI.modifiesRegister(AMDGPU::EXEC, &RI) ||
4515 MI.getOpcode() == AMDGPU::S_SETREG_IMM32_B32 ||
4516 MI.getOpcode() == AMDGPU::S_SETREG_B32 ||
4517 MI.getOpcode() == AMDGPU::S_SETPRIO ||
4518 MI.getOpcode() == AMDGPU::S_SETPRIO_INC_WG ||
4519 changesVGPRIndexingMode(MI);
4520}
4521
4522bool SIInstrInfo::isAlwaysGDS(uint16_t Opcode) const {
4523 return Opcode == AMDGPU::DS_ORDERED_COUNT ||
4524 Opcode == AMDGPU::DS_ADD_GS_REG_RTN ||
4525 Opcode == AMDGPU::DS_SUB_GS_REG_RTN || isGWS(Opcode);
4526}
4527
4528bool SIInstrInfo::mayAccessScratchThroughFlat(const MachineInstr &MI) const {
4529 // Instructions that access scratch use the FLAT or BUF encodings.
4530 if ((!isFLAT(MI) || isFLATGlobal(MI)) && !isBUF(MI))
4531 return false;
4532
4533 // SCRATCH instructions always access scratch.
4534 if (isFLATScratch(MI))
4535 return true;
4536
4537 // If FLAT_SCRATCH registers are not initialized, we can never access scratch
4538 // via the aperture.
4539 if (MI.getMF()->getFunction().hasFnAttribute("amdgpu-no-flat-scratch-init"))
4540 return false;
4541
4542 // If there are no memory operands then conservatively assume the flat
4543 // operation may access scratch.
4544 if (MI.memoperands_empty())
4545 return true;
4546
4547 // See if any memory operand specifies an address space that involves scratch.
4548 return any_of(MI.memoperands(), [](const MachineMemOperand *Memop) {
4549 unsigned AS = Memop->getAddrSpace();
4550 if (AS == AMDGPUAS::FLAT_ADDRESS) {
4551 const MDNode *MD = Memop->getAAInfo().NoAliasAddrSpace;
4552 return !MD || !AMDGPU::hasValueInRangeLikeMetadata(
4553 *MD, AMDGPUAS::PRIVATE_ADDRESS);
4554 }
4555 return AS == AMDGPUAS::PRIVATE_ADDRESS;
4556 });
4557}
4558
4559bool SIInstrInfo::mayAccessVMEMThroughFlat(const MachineInstr &MI) const {
4560 assert(isFLAT(MI));
4561
4562 // All flat instructions use the VMEM counter except prefetch.
4563 if (!usesVM_CNT(MI))
4564 return false;
4565
4566 // If there are no memory operands then conservatively assume the flat
4567 // operation may access VMEM.
4568 if (MI.memoperands_empty())
4569 return true;
4570
4571 // See if any memory operand specifies an address space that involves VMEM.
4572 // Flat operations only support FLAT, LOCAL (LDS), or address spaces
4573 // involving VMEM such as GLOBAL, CONSTANT, PRIVATE (SCRATCH), etc. The REGION
4574 // (GDS) address space is not supported by flat operations. Therefore, simply
4575 // return true unless only the LDS address space is found.
4576 for (const MachineMemOperand *Memop : MI.memoperands()) {
4577 unsigned AS = Memop->getAddrSpace();
4579 if (AS != AMDGPUAS::LOCAL_ADDRESS)
4580 return true;
4581 }
4582
4583 return false;
4584}
4585
4586bool SIInstrInfo::mayAccessLDSThroughFlat(const MachineInstr &MI) const {
4587 assert(isFLAT(MI));
4588
4589 // Flat instructions such as SCRATCH and GLOBAL do not use the lgkm counter.
4590 if (!usesLGKM_CNT(MI))
4591 return false;
4592
4593 // If in tgsplit mode then there can be no use of LDS.
4594 if (ST.isTgSplitEnabled())
4595 return false;
4596
4597 // If there are no memory operands then conservatively assume the flat
4598 // operation may access LDS.
4599 if (MI.memoperands_empty())
4600 return true;
4601
4602 // See if any memory operand specifies an address space that involves LDS.
4603 for (const MachineMemOperand *Memop : MI.memoperands()) {
4604 unsigned AS = Memop->getAddrSpace();
4605 if (AS == AMDGPUAS::LOCAL_ADDRESS || AS == AMDGPUAS::FLAT_ADDRESS)
4606 return true;
4607 }
4608
4609 return false;
4610}
4611
4612bool SIInstrInfo::modifiesModeRegister(const MachineInstr &MI) {
4613 // Skip the full operand and register alias search modifiesRegister
4614 // does. There's only a handful of instructions that touch this, it's only an
4615 // implicit def, and doesn't alias any other registers.
4616 return is_contained(MI.getDesc().implicit_defs(), AMDGPU::MODE);
4617}
4618
4619bool SIInstrInfo::hasUnwantedEffectsWhenEXECEmpty(const MachineInstr &MI) const {
4620 unsigned Opcode = MI.getOpcode();
4621
4622 if (MI.mayStore() && isSMRD(MI))
4623 return true; // scalar store or atomic
4624
4625 // This will terminate the function when other lanes may need to continue.
4626 if (MI.isReturn())
4627 return true;
4628
4629 // These instructions cause shader I/O that may cause hardware lockups
4630 // when executed with an empty EXEC mask.
4631 //
4632 // Note: exp with VM = DONE = 0 is automatically skipped by hardware when
4633 // EXEC = 0, but checking for that case here seems not worth it
4634 // given the typical code patterns.
4635 if (Opcode == AMDGPU::S_SENDMSG || Opcode == AMDGPU::S_SENDMSGHALT ||
4636 isEXP(Opcode) || Opcode == AMDGPU::DS_ORDERED_COUNT ||
4637 Opcode == AMDGPU::S_TRAP || Opcode == AMDGPU::S_WAIT_EVENT ||
4638 Opcode == AMDGPU::S_SETHALT)
4639 return true;
4640
4641 if (MI.isCall() || MI.isInlineAsm())
4642 return true; // conservative assumption
4643
4644 // Assume that barrier interactions are only intended with active lanes.
4645 if (isBarrier(Opcode))
4646 return true;
4647
4648 // A mode change is a scalar operation that influences vector instructions.
4649 if (modifiesModeRegister(MI))
4650 return true;
4651
4652 // These are like SALU instructions in terms of effects, so it's questionable
4653 // whether we should return true for those.
4654 //
4655 // However, executing them with EXEC = 0 causes them to operate on undefined
4656 // data, which we avoid by returning true here.
4657 if (Opcode == AMDGPU::V_READFIRSTLANE_B32 ||
4658 Opcode == AMDGPU::V_READLANE_B32 || Opcode == AMDGPU::V_WRITELANE_B32 ||
4659 Opcode == AMDGPU::SI_RESTORE_S32_FROM_VGPR ||
4660 Opcode == AMDGPU::SI_SPILL_S32_TO_VGPR)
4661 return true;
4662
4663 return false;
4664}
4665
4666bool SIInstrInfo::mayReadEXEC(const MachineRegisterInfo &MRI,
4667 const MachineInstr &MI) const {
4668 if (MI.isMetaInstruction())
4669 return false;
4670
4671 // This won't read exec if this is an SGPR->SGPR copy.
4672 if (MI.isCopyLike()) {
4673 if (!RI.isSGPRReg(MRI, MI.getOperand(0).getReg()))
4674 return true;
4675
4676 // Make sure this isn't copying exec as a normal operand
4677 return MI.readsRegister(AMDGPU::EXEC, &RI);
4678 }
4679
4680 // Make a conservative assumption about the callee.
4681 if (MI.isCall())
4682 return true;
4683
4684 // Be conservative with any unhandled generic opcodes.
4685 if (!isTargetSpecificOpcode(MI.getOpcode()))
4686 return true;
4687
4688 return !isSALU(MI) || MI.readsRegister(AMDGPU::EXEC, &RI);
4689}
4690
4691bool SIInstrInfo::isInlineConstant(const APInt &Imm) const {
4692 switch (Imm.getBitWidth()) {
4693 case 1: // This likely will be a condition code mask.
4694 return true;
4695
4696 case 32:
4697 return AMDGPU::isInlinableLiteral32(Imm.getSExtValue(),
4698 ST.hasInv2PiInlineImm());
4699 case 64:
4700 return AMDGPU::isInlinableLiteral64(Imm.getSExtValue(),
4701 ST.hasInv2PiInlineImm());
4702 case 16:
4703 return ST.has16BitInsts() &&
4704 AMDGPU::isInlinableLiteralI16(Imm.getSExtValue(),
4705 ST.hasInv2PiInlineImm());
4706 default:
4707 llvm_unreachable("invalid bitwidth");
4708 }
4709}
4710
4711bool SIInstrInfo::isInlineConstant(const APFloat &Imm) const {
4712 APInt IntImm = Imm.bitcastToAPInt();
4713 int64_t IntImmVal = IntImm.getSExtValue();
4714 bool HasInv2Pi = ST.hasInv2PiInlineImm();
4715 switch (APFloat::SemanticsToEnum(Imm.getSemantics())) {
4716 default:
4717 llvm_unreachable("invalid fltSemantics");
4718 case APFloat::S_IEEEsingle:
4719 case APFloat::S_IEEEdouble:
4720 return isInlineConstant(IntImm);
4721 case APFloat::S_BFloat:
4722 return ST.has16BitInsts() &&
4723 AMDGPU::isInlinableLiteralBF16(IntImmVal, HasInv2Pi);
4724 case APFloat::S_IEEEhalf:
4725 return ST.has16BitInsts() &&
4726 AMDGPU::isInlinableLiteralFP16(IntImmVal, HasInv2Pi);
4727 }
4728}
4729
4730bool SIInstrInfo::isInlineConstant(int64_t Imm, uint8_t OperandType) const {
4731 // MachineOperand provides no way to tell the true operand size, since it only
4732 // records a 64-bit value. We need to know the size to determine if a 32-bit
4733 // floating point immediate bit pattern is legal for an integer immediate. It
4734 // would be for any 32-bit integer operand, but would not be for a 64-bit one.
4735 switch (OperandType) {
4745 int32_t Trunc = static_cast<int32_t>(Imm);
4746 return AMDGPU::isInlinableLiteral32(Trunc, ST.hasInv2PiInlineImm());
4747 }
4753 return AMDGPU::isInlinableLiteral64(Imm, ST.hasInv2PiInlineImm());
4756 // We would expect inline immediates to not be concerned with an integer/fp
4757 // distinction. However, in the case of 16-bit integer operations, the
4758 // "floating point" values appear to not work. It seems read the low 16-bits
4759 // of 32-bit immediates, which happens to always work for the integer
4760 // values.
4761 //
4762 // See llvm bugzilla 46302.
4763 //
4764 // TODO: Theoretically we could use op-sel to use the high bits of the
4765 // 32-bit FP values.
4774 return AMDGPU::isPKFMACF16InlineConstant(Imm, ST.isGFX11Plus());
4779 return false;
4782 if (isInt<16>(Imm) || isUInt<16>(Imm)) {
4783 // A few special case instructions have 16-bit operands on subtargets
4784 // where 16-bit instructions are not legal.
4785 // TODO: Do the 32-bit immediates work? We shouldn't really need to handle
4786 // constants in these cases
4787 int16_t Trunc = static_cast<int16_t>(Imm);
4788 return ST.has16BitInsts() &&
4789 AMDGPU::isInlinableLiteralFP16(Trunc, ST.hasInv2PiInlineImm());
4790 }
4791
4792 return false;
4793 }
4796 if (isInt<16>(Imm) || isUInt<16>(Imm)) {
4797 int16_t Trunc = static_cast<int16_t>(Imm);
4798 return ST.has16BitInsts() &&
4799 AMDGPU::isInlinableLiteralBF16(Trunc, ST.hasInv2PiInlineImm());
4800 }
4801 return false;
4802 }
4806 return false;
4808 return isLegalAV64PseudoImm(Imm);
4811 // Always embedded in the instruction for free.
4812 return true;
4822 // Just ignore anything else.
4823 return true;
4824 default:
4825 llvm_unreachable("invalid operand type");
4826 }
4827}
4828
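// Return true if two machine operands are of the same kind (register or
// immediate) and have equal values.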
4829static bool compareMachineOp(const MachineOperand &Op0,
4830 const MachineOperand &Op1) {
4831 if (Op0.getType() != Op1.getType())
4832 return false;
4833
4834 switch (Op0.getType()) {
4836 return Op0.getReg() == Op1.getReg();
4838 return Op0.getImm() == Op1.getImm();
4839 default:
4840 llvm_unreachable("Didn't expect to be comparing these operand types");
4841 }
4842}
4843
4844bool SIInstrInfo::isLiteralOperandLegal(const MCInstrDesc &InstDesc,
4845 const MCOperandInfo &OpInfo) const {
4846 if (OpInfo.OperandType == MCOI::OPERAND_IMMEDIATE)
4847 return true;
4848
4849 if (!RI.opCanUseLiteralConstant(OpInfo.OperandType))
4850 return false;
4851
4852 if (!isVOP3(InstDesc) || !AMDGPU::isSISrcOperand(OpInfo))
4853 return true;
4854
4855 return ST.hasVOP3Literal();
4856}
4857
4858bool SIInstrInfo::isImmOperandLegal(const MCInstrDesc &InstDesc, unsigned OpNo,
4859 int64_t ImmVal) const {
4860 const MCOperandInfo &OpInfo = InstDesc.operands()[OpNo];
4861 if (isInlineConstant(ImmVal, OpInfo.OperandType)) {
4862 if (isMAI(InstDesc) && ST.hasMFMAInlineLiteralBug() &&
4863 OpNo == (unsigned)AMDGPU::getNamedOperandIdx(InstDesc.getOpcode(),
4864 AMDGPU::OpName::src2))
4865 return false;
4866 return RI.opCanUseInlineConstant(OpInfo.OperandType);
4867 }
4868
4869 return isLiteralOperandLegal(InstDesc, OpInfo);
4870}
4871
4872bool SIInstrInfo::isImmOperandLegal(const MCInstrDesc &InstDesc, unsigned OpNo,
4873 const MachineOperand &MO) const {
4874 if (MO.isImm())
4875 return isImmOperandLegal(InstDesc, OpNo, MO.getImm());
4876
4877 assert((MO.isTargetIndex() || MO.isFI() || MO.isGlobal()) &&
4878 "unexpected imm-like operand kind");
4879 const MCOperandInfo &OpInfo = InstDesc.operands()[OpNo];
4880 return isLiteralOperandLegal(InstDesc, OpInfo);
4881}
4882
4883bool SIInstrInfo::isLegalAV64PseudoImm(uint64_t Imm) const {
4884 // 2 32-bit inline constants packed into one.
4885 return AMDGPU::isInlinableLiteral32(Lo_32(Imm), ST.hasInv2PiInlineImm()) &&
4886 AMDGPU::isInlinableLiteral32(Hi_32(Imm), ST.hasInv2PiInlineImm());
4887}
4888
4889bool SIInstrInfo::hasVALU32BitEncoding(unsigned Opcode) const {
4890 // GFX90A does not have V_MUL_LEGACY_F32_e32.
4891 if (Opcode == AMDGPU::V_MUL_LEGACY_F32_e64 && ST.hasGFX90AInsts())
4892 return false;
4893
4894 int Op32 = AMDGPU::getVOPe32(Opcode);
4895 if (Op32 == -1)
4896 return false;
4897
4898 return pseudoToMCOpcode(Op32) != -1;
4899}
4900
4901bool SIInstrInfo::hasModifiers(unsigned Opcode) const {
4902 // The src0_modifier operand is present on all instructions
4903 // that have modifiers.
4904
4905 return AMDGPU::hasNamedOperand(Opcode, AMDGPU::OpName::src0_modifiers);
4906}
4907
4908bool SIInstrInfo::hasModifiersSet(const MachineInstr &MI,
4909 AMDGPU::OpName OpName) const {
4910 const MachineOperand *Mods = getNamedOperand(MI, OpName);
4911 return Mods && Mods->getImm();
4912}
4913
4914bool SIInstrInfo::hasAnyModifiersSet(const MachineInstr &MI) const {
4915 return any_of(ModifierOpNames,
4916 [&](AMDGPU::OpName Name) { return hasModifiersSet(MI, Name); });
4917}
4918
4919bool SIInstrInfo::canShrink(const MachineInstr &MI,
4920 const MachineRegisterInfo &MRI) const {
4921 const MachineOperand *Src2 = getNamedOperand(MI, AMDGPU::OpName::src2);
4922 // Can't shrink instruction with three operands.
4923 if (Src2) {
4924 switch (MI.getOpcode()) {
4925 default: return false;
4926
4927 case AMDGPU::V_ADDC_U32_e64:
4928 case AMDGPU::V_SUBB_U32_e64:
4929 case AMDGPU::V_SUBBREV_U32_e64: {
4930 const MachineOperand *Src1
4931 = getNamedOperand(MI, AMDGPU::OpName::src1);
4932 if (!Src1->isReg() || !RI.isVGPR(MRI, Src1->getReg()))
4933 return false;
4934 // Additional verification is needed for sdst/src2.
4935 return true;
4936 }
4937 case AMDGPU::V_MAC_F16_e64:
4938 case AMDGPU::V_MAC_F32_e64:
4939 case AMDGPU::V_MAC_LEGACY_F32_e64:
4940 case AMDGPU::V_FMAC_F16_e64:
4941 case AMDGPU::V_FMAC_F16_t16_e64:
4942 case AMDGPU::V_FMAC_F16_fake16_e64:
4943 case AMDGPU::V_FMAC_F32_e64:
4944 case AMDGPU::V_FMAC_F64_e64:
4945 case AMDGPU::V_FMAC_LEGACY_F32_e64:
4946 if (!Src2->isReg() || !RI.isVGPR(MRI, Src2->getReg()) ||
4947 hasModifiersSet(MI, AMDGPU::OpName::src2_modifiers))
4948 return false;
4949 break;
4950
4951 case AMDGPU::V_CNDMASK_B32_e64:
4952 break;
4953 }
4954 }
4955
4956 const MachineOperand *Src1 = getNamedOperand(MI, AMDGPU::OpName::src1);
4957 if (Src1 && (!Src1->isReg() || !RI.isVGPR(MRI, Src1->getReg()) ||
4958 hasModifiersSet(MI, AMDGPU::OpName::src1_modifiers)))
4959 return false;
4960
4961 // We don't need to check src0, all input types are legal, so just make sure
4962 // src0 isn't using any modifiers.
4963 if (hasModifiersSet(MI, AMDGPU::OpName::src0_modifiers))
4964 return false;
4965
4966 // Can it be shrunk to a valid 32 bit opcode?
4967 if (!hasVALU32BitEncoding(MI.getOpcode()))
4968 return false;
4969
4970 // Check output modifiers
4971 return !hasModifiersSet(MI, AMDGPU::OpName::omod) &&
4972 !hasModifiersSet(MI, AMDGPU::OpName::clamp) &&
4973 !hasModifiersSet(MI, AMDGPU::OpName::byte_sel) &&
4974 // TODO: Can we avoid checking bound_ctrl/fi here?
4975 // They are only used by permlane*_swap special case.
4976 !hasModifiersSet(MI, AMDGPU::OpName::bound_ctrl) &&
4977 !hasModifiersSet(MI, AMDGPU::OpName::fi);
4978}
4979
4980// Set VCC operand with all flags from \p Orig, except for setting it as
4981// implicit.
4982static void copyFlagsToImplicitVCC(MachineInstr &MI,
4983 const MachineOperand &Orig) {
4984
4985 for (MachineOperand &Use : MI.implicit_operands()) {
4986 if (Use.isUse() &&
4987 (Use.getReg() == AMDGPU::VCC || Use.getReg() == AMDGPU::VCC_LO)) {
4988 Use.setIsUndef(Orig.isUndef());
4989 Use.setIsKill(Orig.isKill());
4990 return;
4991 }
4992 }
4993}
4994
4995MachineInstr *SIInstrInfo::buildShrunkInst(MachineInstr &MI,
4996 unsigned Op32) const {
4997 MachineBasicBlock *MBB = MI.getParent();
4998
4999 const MCInstrDesc &Op32Desc = get(Op32);
5000 MachineInstrBuilder Inst32 =
5001 BuildMI(*MBB, MI, MI.getDebugLoc(), Op32Desc)
5002 .setMIFlags(MI.getFlags());
5003
5004 // Add the dst operand if the 32-bit encoding also has an explicit $vdst.
5005 // For VOPC instructions, this is replaced by an implicit def of vcc.
5006
5007 // We assume the defs of the shrunk opcode are in the same order, and the
5008 // shrunk opcode loses the last def (SGPR def, in the VOP3->VOPC case).
5009 for (int I = 0, E = Op32Desc.getNumDefs(); I != E; ++I)
5010 Inst32.add(MI.getOperand(I));
5011
5012 const MachineOperand *Src2 = getNamedOperand(MI, AMDGPU::OpName::src2);
5013
5014 int Idx = MI.getNumExplicitDefs();
5015 for (const MachineOperand &Use : MI.explicit_uses()) {
5016 int OpTy = MI.getDesc().operands()[Idx++].OperandType;
5018 continue;
5019
5020 if (&Use == Src2) {
5021 if (AMDGPU::getNamedOperandIdx(Op32, AMDGPU::OpName::src2) == -1) {
5022 // In the case of V_CNDMASK_B32_e32, the explicit operand src2 is
5023 // replaced with an implicit read of vcc or vcc_lo. The implicit read
5024 // of vcc was already added during the initial BuildMI, but we
5025 // 1) may need to change vcc to vcc_lo to preserve the original register
5026 // 2) have to preserve the original flags.
5027 copyFlagsToImplicitVCC(*Inst32, *Src2);
5028 continue;
5029 }
5030 }
5031
5032 Inst32.add(Use);
5033 }
5034
5035 // FIXME: Losing implicit operands
5036 fixImplicitOperands(*Inst32);
5037 return Inst32;
5038}
5039
5040bool SIInstrInfo::physRegUsesConstantBus(const MachineOperand &RegOp) const {
5041 // Null is free
5042 Register Reg = RegOp.getReg();
5043 if (Reg == AMDGPU::SGPR_NULL || Reg == AMDGPU::SGPR_NULL64)
5044 return false;
5045
5046 // SGPRs use the constant bus
5047
5048 // FIXME: implicit registers that are not part of the MCInstrDesc's implicit
5049 // physical register operands should also count, except for exec.
5050 if (RegOp.isImplicit())
5051 return Reg == AMDGPU::VCC || Reg == AMDGPU::VCC_LO || Reg == AMDGPU::M0;
5052
5053 // SGPRs use the constant bus
5054 return AMDGPU::SReg_32RegClass.contains(Reg) ||
5055 AMDGPU::SReg_64RegClass.contains(Reg);
5056}
5057
5059 const MachineRegisterInfo &MRI) const {
5060 Register Reg = RegOp.getReg();
5061 return Reg.isVirtual() ? RI.isSGPRClass(MRI.getRegClass(Reg))
5062 : physRegUsesConstantBus(RegOp);
5063}
5064
5065bool SIInstrInfo::usesConstantBus(const MachineRegisterInfo &MRI,
5066 const MachineOperand &MO,
5067 const MCOperandInfo &OpInfo) const {
5068 // Literal constants use the constant bus.
5069 if (!MO.isReg())
5070 return !isInlineConstant(MO, OpInfo);
5071
5072 Register Reg = MO.getReg();
5073 return Reg.isVirtual() ? RI.isSGPRClass(MRI.getRegClass(Reg))
5074 : physRegUsesConstantBus(MO);
5075}
5076
5077static Register findImplicitSGPRRead(const MachineInstr &MI) {
5078 for (const MachineOperand &MO : MI.implicit_operands()) {
5079 // We only care about reads.
5080 if (MO.isDef())
5081 continue;
5082
5083 switch (MO.getReg()) {
5084 case AMDGPU::VCC:
5085 case AMDGPU::VCC_LO:
5086 case AMDGPU::VCC_HI:
5087 case AMDGPU::M0:
5088 case AMDGPU::FLAT_SCR:
5089 return MO.getReg();
5090
5091 default:
5092 break;
5093 }
5094 }
5095
5096 return Register();
5097}
5098
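// Determine whether \p MI is expected to carry an implicit read of EXEC.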
5099static bool shouldReadExec(const MachineInstr &MI) {
5100 if (SIInstrInfo::isVALU(MI)) {
5101 switch (MI.getOpcode()) {
5102 case AMDGPU::V_READLANE_B32:
5103 case AMDGPU::SI_RESTORE_S32_FROM_VGPR:
5104 case AMDGPU::V_WRITELANE_B32:
5105 case AMDGPU::SI_SPILL_S32_TO_VGPR:
5106 return false;
5107 }
5108
5109 return true;
5110 }
5111
5112 if (MI.isPreISelOpcode() ||
5113 SIInstrInfo::isGenericOpcode(MI.getOpcode()) ||
5114 SIInstrInfo::isSALU(MI) ||
5115 SIInstrInfo::isSMRD(MI))
5116 return false;
5117
5118 return true;
5119}
5120
5121static bool isRegOrFI(const MachineOperand &MO) {
5122 return MO.isReg() || MO.isFI();
5123}
5124
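// Return true if the \p SubReg operand reads a sub-register of the register
// in \p SuperVec, handling both physical and virtual registers.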
5125static bool isSubRegOf(const SIRegisterInfo &TRI,
5126 const MachineOperand &SuperVec,
5127 const MachineOperand &SubReg) {
5128 if (SubReg.getReg().isPhysical())
5129 return TRI.isSubRegister(SuperVec.getReg(), SubReg.getReg());
5130
5131 return SubReg.getSubReg() != AMDGPU::NoSubRegister &&
5132 SubReg.getReg() == SuperVec.getReg();
5133}
5134
5135// Verify the illegal copy from vector register to SGPR for generic opcode COPY
5136bool SIInstrInfo::verifyCopy(const MachineInstr &MI,
5137 const MachineRegisterInfo &MRI,
5138 StringRef &ErrInfo) const {
5139 Register DstReg = MI.getOperand(0).getReg();
5140 Register SrcReg = MI.getOperand(1).getReg();
5141 // This is a check for copy from vector register to SGPR
5142 if (RI.isVectorRegister(MRI, SrcReg) && RI.isSGPRReg(MRI, DstReg)) {
5143 ErrInfo = "illegal copy from vector register to SGPR";
5144 return false;
5145 }
5146 return true;
5147}
5148
5149bool SIInstrInfo::verifyInstruction(const MachineInstr &MI,
5150 StringRef &ErrInfo) const {
5151 uint32_t Opcode = MI.getOpcode();
5152 const MachineFunction *MF = MI.getMF();
5153 const MachineRegisterInfo &MRI = MF->getRegInfo();
5154
5155 // FIXME: At this point the COPY verify is done only for non-ssa forms.
5156 // Find a better property to recognize the point where instruction selection
5157 // is just done.
5158 // We can only enforce this check after SIFixSGPRCopies pass so that the
5159 // illegal copies are legalized and thereafter we don't expect a pass
5160 // inserting similar copies.
5161 if (!MRI.isSSA() && MI.isCopy())
5162 return verifyCopy(MI, MRI, ErrInfo);
5163
5164 if (SIInstrInfo::isGenericOpcode(Opcode))
5165 return true;
5166
5167 int Src0Idx = AMDGPU::getNamedOperandIdx(Opcode, AMDGPU::OpName::src0);
5168 int Src1Idx = AMDGPU::getNamedOperandIdx(Opcode, AMDGPU::OpName::src1);
5169 int Src2Idx = AMDGPU::getNamedOperandIdx(Opcode, AMDGPU::OpName::src2);
5170 int Src3Idx = -1;
5171 if (Src0Idx == -1) {
5172 // VOPD V_DUAL_* instructions use different operand names.
5173 Src0Idx = AMDGPU::getNamedOperandIdx(Opcode, AMDGPU::OpName::src0X);
5174 Src1Idx = AMDGPU::getNamedOperandIdx(Opcode, AMDGPU::OpName::vsrc1X);
5175 Src2Idx = AMDGPU::getNamedOperandIdx(Opcode, AMDGPU::OpName::src0Y);
5176 Src3Idx = AMDGPU::getNamedOperandIdx(Opcode, AMDGPU::OpName::vsrc1Y);
5177 }
5178
5179 // Make sure the number of operands is correct.
5180 const MCInstrDesc &Desc = get(Opcode);
5181 if (!Desc.isVariadic() &&
5182 Desc.getNumOperands() != MI.getNumExplicitOperands()) {
5183 ErrInfo = "Instruction has wrong number of operands.";
5184 return false;
5185 }
5186
5187 if (MI.isInlineAsm()) {
5188 // Verify register classes for inlineasm constraints.
5189 for (unsigned I = InlineAsm::MIOp_FirstOperand, E = MI.getNumOperands();
5190 I != E; ++I) {
5191 const TargetRegisterClass *RC = MI.getRegClassConstraint(I, this, &RI);
5192 if (!RC)
5193 continue;
5194
5195 const MachineOperand &Op = MI.getOperand(I);
5196 if (!Op.isReg())
5197 continue;
5198
5199 Register Reg = Op.getReg();
5200 if (!Reg.isVirtual() && !RC->contains(Reg)) {
5201 ErrInfo = "inlineasm operand has incorrect register class.";
5202 return false;
5203 }
5204 }
5205
5206 return true;
5207 }
5208
5209 if (isImage(MI) && MI.memoperands_empty() && MI.mayLoadOrStore()) {
5210 ErrInfo = "missing memory operand from image instruction.";
5211 return false;
5212 }
5213
5214 // Make sure the register classes are correct.
5215 for (int i = 0, e = Desc.getNumOperands(); i != e; ++i) {
5216 const MachineOperand &MO = MI.getOperand(i);
5217 if (MO.isFPImm()) {
5218 ErrInfo = "FPImm Machine Operands are not supported. ISel should bitcast "
5219 "all fp values to integers.";
5220 return false;
5221 }
5222
5223 const MCOperandInfo &OpInfo = Desc.operands()[i];
5224 int16_t RegClass = getOpRegClassID(OpInfo);
5225
5226 switch (OpInfo.OperandType) {
5228 if (MI.getOperand(i).isImm() || MI.getOperand(i).isGlobal()) {
5229 ErrInfo = "Illegal immediate value for operand.";
5230 return false;
5231 }
5232 break;
5242 break;
5244 break;
5245 break;
5259 if (!MO.isReg() && (!MO.isImm() || !isInlineConstant(MI, i))) {
5260 ErrInfo = "Illegal immediate value for operand.";
5261 return false;
5262 }
5263 break;
5264 }
5269 if (ST.has64BitLiterals() && Desc.getSize() != 4 && MO.isImm() &&
5270 !isInlineConstant(MI, i) &&
5272 OpInfo.OperandType ==
5274 ErrInfo = "illegal 64-bit immediate value for operand.";
5275 return false;
5276 }
5277 break;
5280 if (!MI.getOperand(i).isImm() || !isInlineConstant(MI, i)) {
5281 ErrInfo = "Expected inline constant for operand.";
5282 return false;
5283 }
5284 break;
5287 break;
5292 // Check if this operand is an immediate.
5293 // FrameIndex operands will be replaced by immediates, so they are
5294 // allowed.
5295 if (!MI.getOperand(i).isImm() && !MI.getOperand(i).isFI()) {
5296 ErrInfo = "Expected immediate, but got non-immediate";
5297 return false;
5298 }
5299 break;
5303 break;
5304 default:
5305 if (OpInfo.isGenericType())
5306 continue;
5307 break;
5308 }
5309
5310 if (!MO.isReg())
5311 continue;
5312 Register Reg = MO.getReg();
5313 if (!Reg)
5314 continue;
5315
5316 // FIXME: Ideally we would have separate instruction definitions with the
5317 // aligned register constraint.
5318 // FIXME: We do not verify inline asm operands, but custom inline asm
5319 // verification is broken anyway
5320 if (ST.needsAlignedVGPRs() && Opcode != AMDGPU::AV_MOV_B64_IMM_PSEUDO &&
5321 Opcode != AMDGPU::V_MOV_B64_PSEUDO && !isSpill(MI)) {
5322 const TargetRegisterClass *RC = RI.getRegClassForReg(MRI, Reg);
5323 if (RI.hasVectorRegisters(RC) && MO.getSubReg()) {
5324 if (const TargetRegisterClass *SubRC =
5325 RI.getSubRegisterClass(RC, MO.getSubReg())) {
5326 RC = RI.getCompatibleSubRegClass(RC, SubRC, MO.getSubReg());
5327 if (RC)
5328 RC = SubRC;
5329 }
5330 }
5331
5332 // Check that this is the aligned version of the class.
5333 if (!RC || !RI.isProperlyAlignedRC(*RC)) {
5334 ErrInfo = "Subtarget requires even aligned vector registers";
5335 return false;
5336 }
5337 }
5338
5339 if (RegClass != -1) {
5340 if (Reg.isVirtual())
5341 continue;
5342
5343 const TargetRegisterClass *RC = RI.getRegClass(RegClass);
5344 if (!RC->contains(Reg)) {
5345 ErrInfo = "Operand has incorrect register class.";
5346 return false;
5347 }
5348 }
5349 }
5350
5351 // Verify SDWA
5352 if (isSDWA(MI)) {
5353 if (!ST.hasSDWA()) {
5354 ErrInfo = "SDWA is not supported on this target";
5355 return false;
5356 }
5357
5358 for (auto Op : {AMDGPU::OpName::src0_sel, AMDGPU::OpName::src1_sel,
5359 AMDGPU::OpName::dst_sel}) {
5360 const MachineOperand *MO = getNamedOperand(MI, Op);
5361 if (!MO)
5362 continue;
5363 int64_t Imm = MO->getImm();
5364 if (Imm < 0 || Imm > AMDGPU::SDWA::SdwaSel::DWORD) {
5365 ErrInfo = "Invalid SDWA selection";
5366 return false;
5367 }
5368 }
5369
5370 int DstIdx = AMDGPU::getNamedOperandIdx(Opcode, AMDGPU::OpName::vdst);
5371
5372 for (int OpIdx : {DstIdx, Src0Idx, Src1Idx, Src2Idx}) {
5373 if (OpIdx == -1)
5374 continue;
5375 const MachineOperand &MO = MI.getOperand(OpIdx);
5376
5377 if (!ST.hasSDWAScalar()) {
5378 // Only VGPRs on VI
5379 if (!MO.isReg() || !RI.hasVGPRs(RI.getRegClassForReg(MRI, MO.getReg()))) {
5380 ErrInfo = "Only VGPRs allowed as operands in SDWA instructions on VI";
5381 return false;
5382 }
5383 } else {
5384 // No immediates on GFX9
5385 if (!MO.isReg()) {
5386 ErrInfo =
5387 "Only reg allowed as operands in SDWA instructions on GFX9+";
5388 return false;
5389 }
5390 }
5391 }
5392
5393 if (!ST.hasSDWAOmod()) {
5394 // No omod allowed on VI
5395 const MachineOperand *OMod = getNamedOperand(MI, AMDGPU::OpName::omod);
5396 if (OMod != nullptr &&
5397 (!OMod->isImm() || OMod->getImm() != 0)) {
5398 ErrInfo = "OMod not allowed in SDWA instructions on VI";
5399 return false;
5400 }
5401 }
5402
5403 if (Opcode == AMDGPU::V_CVT_F32_FP8_sdwa ||
5404 Opcode == AMDGPU::V_CVT_F32_BF8_sdwa ||
5405 Opcode == AMDGPU::V_CVT_PK_F32_FP8_sdwa ||
5406 Opcode == AMDGPU::V_CVT_PK_F32_BF8_sdwa) {
5407 const MachineOperand *Src0ModsMO =
5408 getNamedOperand(MI, AMDGPU::OpName::src0_modifiers);
5409 unsigned Mods = Src0ModsMO->getImm();
5410 if (Mods & SISrcMods::ABS || Mods & SISrcMods::NEG ||
5411 Mods & SISrcMods::SEXT) {
5412 ErrInfo = "sext, abs and neg are not allowed on this instruction";
5413 return false;
5414 }
5415 }
5416
5417 uint32_t BasicOpcode = AMDGPU::getBasicFromSDWAOp(Opcode);
5418 if (isVOPC(BasicOpcode)) {
5419 if (!ST.hasSDWASdst() && DstIdx != -1) {
5420 // Only vcc allowed as dst on VI for VOPC
5421 const MachineOperand &Dst = MI.getOperand(DstIdx);
5422 if (!Dst.isReg() || Dst.getReg() != AMDGPU::VCC) {
5423 ErrInfo = "Only VCC allowed as dst in SDWA instructions on VI";
5424 return false;
5425 }
5426 } else if (!ST.hasSDWAOutModsVOPC()) {
5427 // No clamp allowed on GFX9 for VOPC
5428 const MachineOperand *Clamp = getNamedOperand(MI, AMDGPU::OpName::clamp);
5429 if (Clamp && (!Clamp->isImm() || Clamp->getImm() != 0)) {
5430 ErrInfo = "Clamp not allowed in VOPC SDWA instructions on VI";
5431 return false;
5432 }
5433
5434 // No omod allowed on GFX9 for VOPC
5435 const MachineOperand *OMod = getNamedOperand(MI, AMDGPU::OpName::omod);
5436 if (OMod && (!OMod->isImm() || OMod->getImm() != 0)) {
5437 ErrInfo = "OMod not allowed in VOPC SDWA instructions on VI";
5438 return false;
5439 }
5440 }
5441 }
5442
5443 const MachineOperand *DstUnused = getNamedOperand(MI, AMDGPU::OpName::dst_unused);
5444 if (DstUnused && DstUnused->isImm() &&
5445 DstUnused->getImm() == AMDGPU::SDWA::UNUSED_PRESERVE) {
5446 const MachineOperand &Dst = MI.getOperand(DstIdx);
5447 if (!Dst.isReg() || !Dst.isTied()) {
5448 ErrInfo = "Dst register should have tied register";
5449 return false;
5450 }
5451
5452 const MachineOperand &TiedMO =
5453 MI.getOperand(MI.findTiedOperandIdx(DstIdx));
5454 if (!TiedMO.isReg() || !TiedMO.isImplicit() || !TiedMO.isUse()) {
5455 ErrInfo =
5456 "Dst register should be tied to implicit use of preserved register";
5457 return false;
5458 }
5459 if (TiedMO.getReg().isPhysical() && Dst.getReg() != TiedMO.getReg()) {
5460 ErrInfo = "Dst register should use same physical register as preserved";
5461 return false;
5462 }
5463 }
5464 }
5465
5466 // Verify MIMG / VIMAGE / VSAMPLE
5467 if (isImage(Opcode) && !MI.mayStore()) {
5468 // Ensure that the return type used is large enough for all the options
5469 // being used. TFE/LWE require an extra result register.
5470 const MachineOperand *DMask = getNamedOperand(MI, AMDGPU::OpName::dmask);
5471 if (DMask) {
5472 uint64_t DMaskImm = DMask->getImm();
5473 uint32_t RegCount = isGather4(Opcode) ? 4 : llvm::popcount(DMaskImm);
5474 const MachineOperand *TFE = getNamedOperand(MI, AMDGPU::OpName::tfe);
5475 const MachineOperand *LWE = getNamedOperand(MI, AMDGPU::OpName::lwe);
5476 const MachineOperand *D16 = getNamedOperand(MI, AMDGPU::OpName::d16);
5477
5478 // Adjust for packed 16 bit values
5479 if (D16 && D16->getImm() && !ST.hasUnpackedD16VMem())
5480 RegCount = divideCeil(RegCount, 2);
5481
5482 // Adjust if using LWE or TFE
5483 if ((LWE && LWE->getImm()) || (TFE && TFE->getImm()))
5484 RegCount += 1;
5485
5486 const uint32_t DstIdx =
5487 AMDGPU::getNamedOperandIdx(Opcode, AMDGPU::OpName::vdata);
5488 const MachineOperand &Dst = MI.getOperand(DstIdx);
5489 if (Dst.isReg()) {
5490 const TargetRegisterClass *DstRC = getOpRegClass(MI, DstIdx);
5491 uint32_t DstSize = RI.getRegSizeInBits(*DstRC) / 32;
5492 if (RegCount > DstSize) {
5493 ErrInfo = "Image instruction returns too many registers for dst "
5494 "register class";
5495 return false;
5496 }
5497 }
5498 }
5499 }
5500
5501 // Verify VOP*. Ignore multiple sgpr operands on writelane.
5502 if (isVALU(MI) && Desc.getOpcode() != AMDGPU::V_WRITELANE_B32) {
5503 unsigned ConstantBusCount = 0;
5504 bool UsesLiteral = false;
5505 const MachineOperand *LiteralVal = nullptr;
5506
5507 int ImmIdx = AMDGPU::getNamedOperandIdx(Opcode, AMDGPU::OpName::imm);
5508 if (ImmIdx != -1) {
5509 ++ConstantBusCount;
5510 UsesLiteral = true;
5511 LiteralVal = &MI.getOperand(ImmIdx);
5512 }
5513
5514 SmallVector<Register, 2> SGPRsUsed;
5515 Register SGPRUsed;
5516
5517 // Only look at the true operands. Only a real operand can use the constant
5518 // bus, and we don't want to check pseudo-operands like the source modifier
5519 // flags.
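// Each distinct SGPR read and at most one literal count towards the constant
// bus; the per-opcode limit is queried from GCNSubtarget::getConstantBusLimit
// below.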
5520 for (int OpIdx : {Src0Idx, Src1Idx, Src2Idx, Src3Idx}) {
5521 if (OpIdx == -1)
5522 continue;
5523 const MachineOperand &MO = MI.getOperand(OpIdx);
5524 if (usesConstantBus(MRI, MO, MI.getDesc().operands()[OpIdx])) {
5525 if (MO.isReg()) {
5526 SGPRUsed = MO.getReg();
5527 if (!llvm::is_contained(SGPRsUsed, SGPRUsed)) {
5528 ++ConstantBusCount;
5529 SGPRsUsed.push_back(SGPRUsed);
5530 }
5531 } else if (!MO.isFI()) { // Treat FI like a register.
5532 if (!UsesLiteral) {
5533 ++ConstantBusCount;
5534 UsesLiteral = true;
5535 LiteralVal = &MO;
5536 } else if (!MO.isIdenticalTo(*LiteralVal)) {
5537 assert(isVOP2(MI) || isVOP3(MI));
5538 ErrInfo = "VOP2/VOP3 instruction uses more than one literal";
5539 return false;
5540 }
5541 }
5542 }
5543 }
5544
5545 SGPRUsed = findImplicitSGPRRead(MI);
5546 if (SGPRUsed) {
5547 // Implicit uses may safely overlap true operands
5548 if (llvm::all_of(SGPRsUsed, [this, SGPRUsed](unsigned SGPR) {
5549 return !RI.regsOverlap(SGPRUsed, SGPR);
5550 })) {
5551 ++ConstantBusCount;
5552 SGPRsUsed.push_back(SGPRUsed);
5553 }
5554 }
5555
5556 // v_writelane_b32 is an exception from constant bus restriction:
5557 // vsrc0 can be sgpr, const or m0 and lane select sgpr, m0 or inline-const
5558 if (ConstantBusCount > ST.getConstantBusLimit(Opcode) &&
5559 Opcode != AMDGPU::V_WRITELANE_B32) {
5560 ErrInfo = "VOP* instruction violates constant bus restriction";
5561 return false;
5562 }
5563
5564 if (isVOP3(MI) && UsesLiteral && !ST.hasVOP3Literal()) {
5565 ErrInfo = "VOP3 instruction uses literal";
5566 return false;
5567 }
5568 }
5569
5570 // Special case for writelane - this can break the multiple constant bus rule,
5571 // but still can't use more than one SGPR register
5572 if (Desc.getOpcode() == AMDGPU::V_WRITELANE_B32) {
5573 unsigned SGPRCount = 0;
5574 Register SGPRUsed;
5575
5576 for (int OpIdx : {Src0Idx, Src1Idx}) {
5577 if (OpIdx == -1)
5578 break;
5579
5580 const MachineOperand &MO = MI.getOperand(OpIdx);
5581
5582 if (usesConstantBus(MRI, MO, MI.getDesc().operands()[OpIdx])) {
5583 if (MO.isReg() && MO.getReg() != AMDGPU::M0) {
5584 if (MO.getReg() != SGPRUsed)
5585 ++SGPRCount;
5586 SGPRUsed = MO.getReg();
5587 }
5588 }
5589 if (SGPRCount > ST.getConstantBusLimit(Opcode)) {
5590 ErrInfo = "WRITELANE instruction violates constant bus restriction";
5591 return false;
5592 }
5593 }
5594 }
5595
5596 // Verify misc. restrictions on specific instructions.
5597 if (Desc.getOpcode() == AMDGPU::V_DIV_SCALE_F32_e64 ||
5598 Desc.getOpcode() == AMDGPU::V_DIV_SCALE_F64_e64) {
5599 const MachineOperand &Src0 = MI.getOperand(Src0Idx);
5600 const MachineOperand &Src1 = MI.getOperand(Src1Idx);
5601 const MachineOperand &Src2 = MI.getOperand(Src2Idx);
5602 if (Src0.isReg() && Src1.isReg() && Src2.isReg()) {
5603 if (!compareMachineOp(Src0, Src1) &&
5604 !compareMachineOp(Src0, Src2)) {
5605 ErrInfo = "v_div_scale_{f32|f64} require src0 = src1 or src2";
5606 return false;
5607 }
5608 }
5609 if ((getNamedOperand(MI, AMDGPU::OpName::src0_modifiers)->getImm() &
5610 SISrcMods::ABS) ||
5611 (getNamedOperand(MI, AMDGPU::OpName::src1_modifiers)->getImm() &
5612 SISrcMods::ABS) ||
5613 (getNamedOperand(MI, AMDGPU::OpName::src2_modifiers)->getImm() &
5614 SISrcMods::ABS)) {
5615 ErrInfo = "ABS not allowed in VOP3B instructions";
5616 return false;
5617 }
5618 }
5619
5620 if (isSOP2(MI) || isSOPC(MI)) {
5621 const MachineOperand &Src0 = MI.getOperand(Src0Idx);
5622 const MachineOperand &Src1 = MI.getOperand(Src1Idx);
5623
5624 if (!isRegOrFI(Src0) && !isRegOrFI(Src1) &&
5625 !isInlineConstant(Src0, Desc.operands()[Src0Idx]) &&
5626 !isInlineConstant(Src1, Desc.operands()[Src1Idx]) &&
5627 !Src0.isIdenticalTo(Src1)) {
5628 ErrInfo = "SOP2/SOPC instruction requires too many immediate constants";
5629 return false;
5630 }
5631 }
5632
5633 if (isSOPK(MI)) {
5634 const auto *Op = getNamedOperand(MI, AMDGPU::OpName::simm16);
5635 if (Desc.isBranch()) {
5636 if (!Op->isMBB()) {
5637 ErrInfo = "invalid branch target for SOPK instruction";
5638 return false;
5639 }
5640 } else {
5641 uint64_t Imm = Op->getImm();
5642 if (sopkIsZext(Opcode)) {
5643 if (!isUInt<16>(Imm)) {
5644 ErrInfo = "invalid immediate for SOPK instruction";
5645 return false;
5646 }
5647 } else {
5648 if (!isInt<16>(Imm)) {
5649 ErrInfo = "invalid immediate for SOPK instruction";
5650 return false;
5651 }
5652 }
5653 }
5654 }
5655
5656 if (Desc.getOpcode() == AMDGPU::V_MOVRELS_B32_e32 ||
5657 Desc.getOpcode() == AMDGPU::V_MOVRELS_B32_e64 ||
5658 Desc.getOpcode() == AMDGPU::V_MOVRELD_B32_e32 ||
5659 Desc.getOpcode() == AMDGPU::V_MOVRELD_B32_e64) {
5660 const bool IsDst = Desc.getOpcode() == AMDGPU::V_MOVRELD_B32_e32 ||
5661 Desc.getOpcode() == AMDGPU::V_MOVRELD_B32_e64;
5662
5663 const unsigned StaticNumOps =
5664 Desc.getNumOperands() + Desc.implicit_uses().size();
5665 const unsigned NumImplicitOps = IsDst ? 2 : 1;
5666
5667 // Require additional implicit operands. This allows a fixup done by the
5668 // post RA scheduler where the main implicit operand is killed and
5669 // implicit-defs are added for sub-registers that remain live after this
5670 // instruction.
5671 if (MI.getNumOperands() < StaticNumOps + NumImplicitOps) {
5672 ErrInfo = "missing implicit register operands";
5673 return false;
5674 }
5675
5676 const MachineOperand *Dst = getNamedOperand(MI, AMDGPU::OpName::vdst);
5677 if (IsDst) {
5678 if (!Dst->isUse()) {
5679 ErrInfo = "v_movreld_b32 vdst should be a use operand";
5680 return false;
5681 }
5682
5683 unsigned UseOpIdx;
5684 if (!MI.isRegTiedToUseOperand(StaticNumOps, &UseOpIdx) ||
5685 UseOpIdx != StaticNumOps + 1) {
5686 ErrInfo = "movrel implicit operands should be tied";
5687 return false;
5688 }
5689 }
5690
5691 const MachineOperand &Src0 = MI.getOperand(Src0Idx);
5692 const MachineOperand &ImpUse
5693 = MI.getOperand(StaticNumOps + NumImplicitOps - 1);
5694 if (!ImpUse.isReg() || !ImpUse.isUse() ||
5695 !isSubRegOf(RI, ImpUse, IsDst ? *Dst : Src0)) {
5696 ErrInfo = "src0 should be subreg of implicit vector use";
5697 return false;
5698 }
5699 }
5700
5701 // Make sure we aren't losing exec uses in the td files. This mostly requires
5702 // being careful when using let Uses to try to add other use registers.
5703 if (shouldReadExec(MI)) {
5704 if (!MI.hasRegisterImplicitUseOperand(AMDGPU::EXEC)) {
5705 ErrInfo = "VALU instruction does not implicitly read exec mask";
5706 return false;
5707 }
5708 }
5709
5710 if (isSMRD(MI)) {
5711 if (MI.mayStore() &&
5712 ST.getGeneration() == AMDGPUSubtarget::VOLCANIC_ISLANDS) {
5713 // The register offset form of scalar stores may only use m0 as the
5714 // soffset register.
5715 const MachineOperand *Soff = getNamedOperand(MI, AMDGPU::OpName::soffset);
5716 if (Soff && Soff->getReg() != AMDGPU::M0) {
5717 ErrInfo = "scalar stores must use m0 as offset register";
5718 return false;
5719 }
5720 }
5721 }
5722
5723 if (isFLAT(MI) && !ST.hasFlatInstOffsets()) {
5724 const MachineOperand *Offset = getNamedOperand(MI, AMDGPU::OpName::offset);
5725 if (Offset->getImm() != 0) {
5726 ErrInfo = "subtarget does not support offsets in flat instructions";
5727 return false;
5728 }
5729 }
5730
5731 if (isDS(MI) && !ST.hasGDS()) {
5732 const MachineOperand *GDSOp = getNamedOperand(MI, AMDGPU::OpName::gds);
5733 if (GDSOp && GDSOp->getImm() != 0) {
5734 ErrInfo = "GDS is not supported on this subtarget";
5735 return false;
5736 }
5737 }
5738
5739 if (isImage(MI)) {
5740 const MachineOperand *DimOp = getNamedOperand(MI, AMDGPU::OpName::dim);
5741 if (DimOp) {
5742 int VAddr0Idx = AMDGPU::getNamedOperandIdx(Opcode,
5743 AMDGPU::OpName::vaddr0);
5744 AMDGPU::OpName RSrcOpName =
5745 isMIMG(MI) ? AMDGPU::OpName::srsrc : AMDGPU::OpName::rsrc;
5746 int RsrcIdx = AMDGPU::getNamedOperandIdx(Opcode, RSrcOpName);
5747 const AMDGPU::MIMGInfo *Info = AMDGPU::getMIMGInfo(Opcode);
5748 const AMDGPU::MIMGBaseOpcodeInfo *BaseOpcode =
5749 AMDGPU::getMIMGBaseOpcodeInfo(Info->BaseOpcode);
5750 const AMDGPU::MIMGDimInfo *Dim =
5751 AMDGPU::getMIMGDimInfoByEncoding(DimOp->getImm());
5752
5753 if (!Dim) {
5754 ErrInfo = "dim is out of range";
5755 return false;
5756 }
5757
5758 bool IsA16 = false;
5759 if (ST.hasR128A16()) {
5760 const MachineOperand *R128A16 = getNamedOperand(MI, AMDGPU::OpName::r128);
5761 IsA16 = R128A16->getImm() != 0;
5762 } else if (ST.hasA16()) {
5763 const MachineOperand *A16 = getNamedOperand(MI, AMDGPU::OpName::a16);
5764 IsA16 = A16->getImm() != 0;
5765 }
5766
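// With non-sequential address (NSA) encodings each address component occupies
// its own vaddr operand, so the operand distance from vaddr0 to the resource
// descriptor gives the number of address registers in use.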
5767 bool IsNSA = RsrcIdx - VAddr0Idx > 1;
5768
5769 unsigned AddrWords =
5770 AMDGPU::getAddrSizeMIMGOp(BaseOpcode, Dim, IsA16, ST.hasG16());
5771
5772 unsigned VAddrWords;
5773 if (IsNSA) {
5774 VAddrWords = RsrcIdx - VAddr0Idx;
5775 if (ST.hasPartialNSAEncoding() &&
5776 AddrWords > ST.getNSAMaxSize(isVSAMPLE(MI))) {
5777 unsigned LastVAddrIdx = RsrcIdx - 1;
5778 VAddrWords += getOpSize(MI, LastVAddrIdx) / 4 - 1;
5779 }
5780 } else {
5781 VAddrWords = getOpSize(MI, VAddr0Idx) / 4;
5782 if (AddrWords > 12)
5783 AddrWords = 16;
5784 }
5785
5786 if (VAddrWords != AddrWords) {
5787 LLVM_DEBUG(dbgs() << "bad vaddr size, expected " << AddrWords
5788 << " but got " << VAddrWords << "\n");
5789 ErrInfo = "bad vaddr size";
5790 return false;
5791 }
5792 }
5793 }
5794
5795 const MachineOperand *DppCt = getNamedOperand(MI, AMDGPU::OpName::dpp_ctrl);
5796 if (DppCt) {
5797 using namespace AMDGPU::DPP;
5798
5799 unsigned DC = DppCt->getImm();
5800 if (DC == DppCtrl::DPP_UNUSED1 || DC == DppCtrl::DPP_UNUSED2 ||
5801 DC == DppCtrl::DPP_UNUSED3 || DC > DppCtrl::DPP_LAST ||
5802 (DC >= DppCtrl::DPP_UNUSED4_FIRST && DC <= DppCtrl::DPP_UNUSED4_LAST) ||
5803 (DC >= DppCtrl::DPP_UNUSED5_FIRST && DC <= DppCtrl::DPP_UNUSED5_LAST) ||
5804 (DC >= DppCtrl::DPP_UNUSED6_FIRST && DC <= DppCtrl::DPP_UNUSED6_LAST) ||
5805 (DC >= DppCtrl::DPP_UNUSED7_FIRST && DC <= DppCtrl::DPP_UNUSED7_LAST) ||
5806 (DC >= DppCtrl::DPP_UNUSED8_FIRST && DC <= DppCtrl::DPP_UNUSED8_LAST)) {
5807 ErrInfo = "Invalid dpp_ctrl value";
5808 return false;
5809 }
5810 if (DC >= DppCtrl::WAVE_SHL1 && DC <= DppCtrl::WAVE_ROR1 &&
5811 !ST.hasDPPWavefrontShifts()) {
5812 ErrInfo = "Invalid dpp_ctrl value: "
5813 "wavefront shifts are not supported on GFX10+";
5814 return false;
5815 }
5816 if (DC >= DppCtrl::BCAST15 && DC <= DppCtrl::BCAST31 &&
5817 !ST.hasDPPBroadcasts()) {
5818 ErrInfo = "Invalid dpp_ctrl value: "
5819 "broadcasts are not supported on GFX10+";
5820 return false;
5821 }
5822 if (DC >= DppCtrl::ROW_SHARE_FIRST && DC <= DppCtrl::ROW_XMASK_LAST &&
5823 ST.getGeneration() < AMDGPUSubtarget::GFX10) {
5824 if (DC >= DppCtrl::ROW_NEWBCAST_FIRST &&
5825 DC <= DppCtrl::ROW_NEWBCAST_LAST &&
5826 !ST.hasGFX90AInsts()) {
5827 ErrInfo = "Invalid dpp_ctrl value: "
5828 "row_newbroadcast/row_share is not supported before "
5829 "GFX90A/GFX10";
5830 return false;
5831 }
5832 if (DC > DppCtrl::ROW_NEWBCAST_LAST || !ST.hasGFX90AInsts()) {
5833 ErrInfo = "Invalid dpp_ctrl value: "
5834 "row_share and row_xmask are not supported before GFX10";
5835 return false;
5836 }
5837 }
5838
5839 if (Opcode != AMDGPU::V_MOV_B64_DPP_PSEUDO &&
5840 !AMDGPU::isLegalDPALU_DPPControl(ST, DC) &&
5841 AMDGPU::isDPALU_DPP(Desc, *this, ST)) {
5842 ErrInfo = "Invalid dpp_ctrl value: "
5843 "DP ALU dpp only support row_newbcast";
5844 return false;
5845 }
5846 }
5847
5848 if ((MI.mayStore() || MI.mayLoad()) && !isVGPRSpill(MI)) {
5849 const MachineOperand *Dst = getNamedOperand(MI, AMDGPU::OpName::vdst);
5850 AMDGPU::OpName DataName =
5851 isDS(Opcode) ? AMDGPU::OpName::data0 : AMDGPU::OpName::vdata;
5852 const MachineOperand *Data = getNamedOperand(MI, DataName);
5853 const MachineOperand *Data2 = getNamedOperand(MI, AMDGPU::OpName::data1);
5854 if (Data && !Data->isReg())
5855 Data = nullptr;
5856
5857 if (ST.hasGFX90AInsts()) {
5858 if (Dst && Data && !Dst->isTied() && !Data->isTied() &&
5859 (RI.isAGPR(MRI, Dst->getReg()) != RI.isAGPR(MRI, Data->getReg()))) {
5860 ErrInfo = "Invalid register class: "
5861 "vdata and vdst should be both VGPR or AGPR";
5862 return false;
5863 }
5864 if (Data && Data2 &&
5865 (RI.isAGPR(MRI, Data->getReg()) != RI.isAGPR(MRI, Data2->getReg()))) {
5866 ErrInfo = "Invalid register class: "
5867 "both data operands should be VGPR or AGPR";
5868 return false;
5869 }
5870 } else {
5871 if ((Dst && RI.isAGPR(MRI, Dst->getReg())) ||
5872 (Data && RI.isAGPR(MRI, Data->getReg())) ||
5873 (Data2 && RI.isAGPR(MRI, Data2->getReg()))) {
5874 ErrInfo = "Invalid register class: "
5875 "agpr loads and stores not supported on this GPU";
5876 return false;
5877 }
5878 }
5879 }
5880
5881 if (ST.needsAlignedVGPRs()) {
5882 const auto isAlignedReg = [&MI, &MRI, this](AMDGPU::OpName OpName) -> bool {
5883 const MachineOperand *Op = getNamedOperand(MI, OpName);
5884 if (!Op)
5885 return true;
5886 Register Reg = Op->getReg();
5887 if (Reg.isPhysical())
5888 return !(RI.getHWRegIndex(Reg) & 1);
5889 const TargetRegisterClass &RC = *MRI.getRegClass(Reg);
5890 return RI.getRegSizeInBits(RC) > 32 && RI.isProperlyAlignedRC(RC) &&
5891 !(RI.getChannelFromSubReg(Op->getSubReg()) & 1);
5892 };
5893
5894 if (Opcode == AMDGPU::DS_GWS_INIT || Opcode == AMDGPU::DS_GWS_SEMA_BR ||
5895 Opcode == AMDGPU::DS_GWS_BARRIER) {
5896
5897 if (!isAlignedReg(AMDGPU::OpName::data0)) {
5898 ErrInfo = "Subtarget requires even aligned vector registers "
5899 "for DS_GWS instructions";
5900 return false;
5901 }
5902 }
5903
5904 if (isMIMG(MI)) {
5905 if (!isAlignedReg(AMDGPU::OpName::vaddr)) {
5906 ErrInfo = "Subtarget requires even aligned vector registers "
5907 "for vaddr operand of image instructions";
5908 return false;
5909 }
5910 }
5911 }
5912
5913 if (Opcode == AMDGPU::V_ACCVGPR_WRITE_B32_e64 && !ST.hasGFX90AInsts()) {
5914 const MachineOperand *Src = getNamedOperand(MI, AMDGPU::OpName::src0);
5915 if (Src->isReg() && RI.isSGPRReg(MRI, Src->getReg())) {
5916 ErrInfo = "Invalid register class: "
5917 "v_accvgpr_write with an SGPR is not supported on this GPU";
5918 return false;
5919 }
5920 }
5921
5922 if (Desc.getOpcode() == AMDGPU::G_AMDGPU_WAVE_ADDRESS) {
5923 const MachineOperand &SrcOp = MI.getOperand(1);
5924 if (!SrcOp.isReg() || SrcOp.getReg().isVirtual()) {
5925 ErrInfo = "pseudo expects only physical SGPRs";
5926 return false;
5927 }
5928 }
5929
5930 if (const MachineOperand *CPol = getNamedOperand(MI, AMDGPU::OpName::cpol)) {
5931 if (CPol->getImm() & AMDGPU::CPol::SCAL) {
5932 if (!ST.hasScaleOffset()) {
5933 ErrInfo = "Subtarget does not support offset scaling";
5934 return false;
5935 }
5936 if (!AMDGPU::supportsScaleOffset(*this, MI.getOpcode())) {
5937 ErrInfo = "Instruction does not support offset scaling";
5938 return false;
5939 }
5940 }
5941 }
5942
5943 // See SIInstrInfo::isLegalGFX12PlusPackedMathFP32Operand for more
5944 // information.
5945 if (AMDGPU::isPackedFP32Inst(Opcode) && AMDGPU::isGFX12Plus(ST)) {
5946 for (unsigned I = 0; I < 3; ++I) {
5947 if (!isLegalGFX12PlusPackedMathFP32Operand(MRI, MI, /*SrcN=*/I))
5948 return false;
5949 }
5950 }
5951
5952 if (ST.hasFlatScratchHiInB64InstHazard() && isSALU(MI) &&
5953 MI.readsRegister(AMDGPU::SRC_FLAT_SCRATCH_BASE_HI, nullptr)) {
5954 const MachineOperand *Dst = getNamedOperand(MI, AMDGPU::OpName::sdst);
5955 if ((Dst && RI.getRegClassForReg(MRI, Dst->getReg()) ==
5956 &AMDGPU::SReg_64RegClass) ||
5957 Opcode == AMDGPU::S_BITCMP0_B64 || Opcode == AMDGPU::S_BITCMP1_B64) {
5958 ErrInfo = "Instruction cannot read flat_scratch_base_hi";
5959 return false;
5960 }
5961 }
5962
5963 return true;
5964}
5965
5966 unsigned SIInstrInfo::getVALUOp(const MachineInstr &MI) const {
5967 if (MI.getOpcode() == AMDGPU::S_MOV_B32) {
5968 const MachineRegisterInfo &MRI = MI.getMF()->getRegInfo();
5969 return MI.getOperand(1).isReg() || RI.isAGPR(MRI, MI.getOperand(0).getReg())
5970 ? AMDGPU::COPY
5971 : AMDGPU::V_MOV_B32_e32;
5972 }
5973 return getVALUOp(MI.getOpcode());
5974}
5975
5976// It is more readable to list mapped opcodes on the same line.
5977// clang-format off
5978
5979unsigned SIInstrInfo::getVALUOp(unsigned Opc) const {
5980 switch (Opc) {
5981 default: return AMDGPU::INSTRUCTION_LIST_END;
5982 case AMDGPU::REG_SEQUENCE: return AMDGPU::REG_SEQUENCE;
5983 case AMDGPU::COPY: return AMDGPU::COPY;
5984 case AMDGPU::PHI: return AMDGPU::PHI;
5985 case AMDGPU::INSERT_SUBREG: return AMDGPU::INSERT_SUBREG;
5986 case AMDGPU::WQM: return AMDGPU::WQM;
5987 case AMDGPU::SOFT_WQM: return AMDGPU::SOFT_WQM;
5988 case AMDGPU::STRICT_WWM: return AMDGPU::STRICT_WWM;
5989 case AMDGPU::STRICT_WQM: return AMDGPU::STRICT_WQM;
5990 case AMDGPU::S_ADD_I32:
5991 return ST.hasAddNoCarryInsts() ? AMDGPU::V_ADD_U32_e64 : AMDGPU::V_ADD_CO_U32_e32;
5992 case AMDGPU::S_ADDC_U32:
5993 return AMDGPU::V_ADDC_U32_e32;
5994 case AMDGPU::S_SUB_I32:
5995 return ST.hasAddNoCarryInsts() ? AMDGPU::V_SUB_U32_e64 : AMDGPU::V_SUB_CO_U32_e32;
5996 // FIXME: These are not consistently handled, and selected when the carry is
5997 // used.
5998 case AMDGPU::S_ADD_U32:
5999 return AMDGPU::V_ADD_CO_U32_e32;
6000 case AMDGPU::S_SUB_U32:
6001 return AMDGPU::V_SUB_CO_U32_e32;
6002 case AMDGPU::S_ADD_U64_PSEUDO:
6003 return AMDGPU::V_ADD_U64_PSEUDO;
6004 case AMDGPU::S_SUB_U64_PSEUDO:
6005 return AMDGPU::V_SUB_U64_PSEUDO;
6006 case AMDGPU::S_SUBB_U32: return AMDGPU::V_SUBB_U32_e32;
6007 case AMDGPU::S_MUL_I32: return AMDGPU::V_MUL_LO_U32_e64;
6008 case AMDGPU::S_MUL_HI_U32: return AMDGPU::V_MUL_HI_U32_e64;
6009 case AMDGPU::S_MUL_HI_I32: return AMDGPU::V_MUL_HI_I32_e64;
6010 case AMDGPU::S_AND_B32: return AMDGPU::V_AND_B32_e64;
6011 case AMDGPU::S_OR_B32: return AMDGPU::V_OR_B32_e64;
6012 case AMDGPU::S_XOR_B32: return AMDGPU::V_XOR_B32_e64;
6013 case AMDGPU::S_XNOR_B32:
6014 return ST.hasDLInsts() ? AMDGPU::V_XNOR_B32_e64 : AMDGPU::INSTRUCTION_LIST_END;
6015 case AMDGPU::S_MIN_I32: return AMDGPU::V_MIN_I32_e64;
6016 case AMDGPU::S_MIN_U32: return AMDGPU::V_MIN_U32_e64;
6017 case AMDGPU::S_MAX_I32: return AMDGPU::V_MAX_I32_e64;
6018 case AMDGPU::S_MAX_U32: return AMDGPU::V_MAX_U32_e64;
6019 case AMDGPU::S_ASHR_I32: return AMDGPU::V_ASHR_I32_e32;
6020 case AMDGPU::S_ASHR_I64: return AMDGPU::V_ASHR_I64_e64;
6021 case AMDGPU::S_LSHL_B32: return AMDGPU::V_LSHL_B32_e32;
6022 case AMDGPU::S_LSHL_B64: return AMDGPU::V_LSHL_B64_e64;
6023 case AMDGPU::S_LSHR_B32: return AMDGPU::V_LSHR_B32_e32;
6024 case AMDGPU::S_LSHR_B64: return AMDGPU::V_LSHR_B64_e64;
6025 case AMDGPU::S_SEXT_I32_I8: return AMDGPU::V_BFE_I32_e64;
6026 case AMDGPU::S_SEXT_I32_I16: return AMDGPU::V_BFE_I32_e64;
6027 case AMDGPU::S_BFE_U32: return AMDGPU::V_BFE_U32_e64;
6028 case AMDGPU::S_BFE_I32: return AMDGPU::V_BFE_I32_e64;
6029 case AMDGPU::S_BFM_B32: return AMDGPU::V_BFM_B32_e64;
6030 case AMDGPU::S_BREV_B32: return AMDGPU::V_BFREV_B32_e32;
6031 case AMDGPU::S_NOT_B32: return AMDGPU::V_NOT_B32_e32;
6032 case AMDGPU::S_NOT_B64: return AMDGPU::V_NOT_B32_e32;
6033 case AMDGPU::S_CMP_EQ_I32: return AMDGPU::V_CMP_EQ_I32_e64;
6034 case AMDGPU::S_CMP_LG_I32: return AMDGPU::V_CMP_NE_I32_e64;
6035 case AMDGPU::S_CMP_GT_I32: return AMDGPU::V_CMP_GT_I32_e64;
6036 case AMDGPU::S_CMP_GE_I32: return AMDGPU::V_CMP_GE_I32_e64;
6037 case AMDGPU::S_CMP_LT_I32: return AMDGPU::V_CMP_LT_I32_e64;
6038 case AMDGPU::S_CMP_LE_I32: return AMDGPU::V_CMP_LE_I32_e64;
6039 case AMDGPU::S_CMP_EQ_U32: return AMDGPU::V_CMP_EQ_U32_e64;
6040 case AMDGPU::S_CMP_LG_U32: return AMDGPU::V_CMP_NE_U32_e64;
6041 case AMDGPU::S_CMP_GT_U32: return AMDGPU::V_CMP_GT_U32_e64;
6042 case AMDGPU::S_CMP_GE_U32: return AMDGPU::V_CMP_GE_U32_e64;
6043 case AMDGPU::S_CMP_LT_U32: return AMDGPU::V_CMP_LT_U32_e64;
6044 case AMDGPU::S_CMP_LE_U32: return AMDGPU::V_CMP_LE_U32_e64;
6045 case AMDGPU::S_CMP_EQ_U64: return AMDGPU::V_CMP_EQ_U64_e64;
6046 case AMDGPU::S_CMP_LG_U64: return AMDGPU::V_CMP_NE_U64_e64;
6047 case AMDGPU::S_BCNT1_I32_B32: return AMDGPU::V_BCNT_U32_B32_e64;
6048 case AMDGPU::S_FF1_I32_B32: return AMDGPU::V_FFBL_B32_e32;
6049 case AMDGPU::S_FLBIT_I32_B32: return AMDGPU::V_FFBH_U32_e32;
6050 case AMDGPU::S_FLBIT_I32: return AMDGPU::V_FFBH_I32_e64;
6051 case AMDGPU::S_CBRANCH_SCC0: return AMDGPU::S_CBRANCH_VCCZ;
6052 case AMDGPU::S_CBRANCH_SCC1: return AMDGPU::S_CBRANCH_VCCNZ;
6053 case AMDGPU::S_CVT_F32_I32: return AMDGPU::V_CVT_F32_I32_e64;
6054 case AMDGPU::S_CVT_F32_U32: return AMDGPU::V_CVT_F32_U32_e64;
6055 case AMDGPU::S_CVT_I32_F32: return AMDGPU::V_CVT_I32_F32_e64;
6056 case AMDGPU::S_CVT_U32_F32: return AMDGPU::V_CVT_U32_F32_e64;
6057 case AMDGPU::S_CVT_F32_F16:
6058 case AMDGPU::S_CVT_HI_F32_F16:
6059 return ST.useRealTrue16Insts() ? AMDGPU::V_CVT_F32_F16_t16_e64
6060 : AMDGPU::V_CVT_F32_F16_fake16_e64;
6061 case AMDGPU::S_CVT_F16_F32:
6062 return ST.useRealTrue16Insts() ? AMDGPU::V_CVT_F16_F32_t16_e64
6063 : AMDGPU::V_CVT_F16_F32_fake16_e64;
6064 case AMDGPU::S_CEIL_F32: return AMDGPU::V_CEIL_F32_e64;
6065 case AMDGPU::S_FLOOR_F32: return AMDGPU::V_FLOOR_F32_e64;
6066 case AMDGPU::S_TRUNC_F32: return AMDGPU::V_TRUNC_F32_e64;
6067 case AMDGPU::S_RNDNE_F32: return AMDGPU::V_RNDNE_F32_e64;
6068 case AMDGPU::S_CEIL_F16:
6069 return ST.useRealTrue16Insts() ? AMDGPU::V_CEIL_F16_t16_e64
6070 : AMDGPU::V_CEIL_F16_fake16_e64;
6071 case AMDGPU::S_FLOOR_F16:
6072 return ST.useRealTrue16Insts() ? AMDGPU::V_FLOOR_F16_t16_e64
6073 : AMDGPU::V_FLOOR_F16_fake16_e64;
6074 case AMDGPU::S_TRUNC_F16:
6075 return ST.useRealTrue16Insts() ? AMDGPU::V_TRUNC_F16_t16_e64
6076 : AMDGPU::V_TRUNC_F16_fake16_e64;
6077 case AMDGPU::S_RNDNE_F16:
6078 return ST.useRealTrue16Insts() ? AMDGPU::V_RNDNE_F16_t16_e64
6079 : AMDGPU::V_RNDNE_F16_fake16_e64;
6080 case AMDGPU::S_ADD_F32: return AMDGPU::V_ADD_F32_e64;
6081 case AMDGPU::S_SUB_F32: return AMDGPU::V_SUB_F32_e64;
6082 case AMDGPU::S_MIN_F32: return AMDGPU::V_MIN_F32_e64;
6083 case AMDGPU::S_MAX_F32: return AMDGPU::V_MAX_F32_e64;
6084 case AMDGPU::S_MINIMUM_F32: return AMDGPU::V_MINIMUM_F32_e64;
6085 case AMDGPU::S_MAXIMUM_F32: return AMDGPU::V_MAXIMUM_F32_e64;
6086 case AMDGPU::S_MUL_F32: return AMDGPU::V_MUL_F32_e64;
6087 case AMDGPU::S_ADD_F16:
6088 return ST.useRealTrue16Insts() ? AMDGPU::V_ADD_F16_t16_e64
6089 : AMDGPU::V_ADD_F16_fake16_e64;
6090 case AMDGPU::S_SUB_F16:
6091 return ST.useRealTrue16Insts() ? AMDGPU::V_SUB_F16_t16_e64
6092 : AMDGPU::V_SUB_F16_fake16_e64;
6093 case AMDGPU::S_MIN_F16:
6094 return ST.useRealTrue16Insts() ? AMDGPU::V_MIN_F16_t16_e64
6095 : AMDGPU::V_MIN_F16_fake16_e64;
6096 case AMDGPU::S_MAX_F16:
6097 return ST.useRealTrue16Insts() ? AMDGPU::V_MAX_F16_t16_e64
6098 : AMDGPU::V_MAX_F16_fake16_e64;
6099 case AMDGPU::S_MINIMUM_F16:
6100 return ST.useRealTrue16Insts() ? AMDGPU::V_MINIMUM_F16_t16_e64
6101 : AMDGPU::V_MINIMUM_F16_fake16_e64;
6102 case AMDGPU::S_MAXIMUM_F16:
6103 return ST.useRealTrue16Insts() ? AMDGPU::V_MAXIMUM_F16_t16_e64
6104 : AMDGPU::V_MAXIMUM_F16_fake16_e64;
6105 case AMDGPU::S_MUL_F16:
6106 return ST.useRealTrue16Insts() ? AMDGPU::V_MUL_F16_t16_e64
6107 : AMDGPU::V_MUL_F16_fake16_e64;
6108 case AMDGPU::S_CVT_PK_RTZ_F16_F32: return AMDGPU::V_CVT_PKRTZ_F16_F32_e64;
6109 case AMDGPU::S_FMAC_F32: return AMDGPU::V_FMAC_F32_e64;
6110 case AMDGPU::S_FMAC_F16:
6111 return ST.useRealTrue16Insts() ? AMDGPU::V_FMAC_F16_t16_e64
6112 : AMDGPU::V_FMAC_F16_fake16_e64;
6113 case AMDGPU::S_FMAMK_F32: return AMDGPU::V_FMAMK_F32;
6114 case AMDGPU::S_FMAAK_F32: return AMDGPU::V_FMAAK_F32;
6115 case AMDGPU::S_CMP_LT_F32: return AMDGPU::V_CMP_LT_F32_e64;
6116 case AMDGPU::S_CMP_EQ_F32: return AMDGPU::V_CMP_EQ_F32_e64;
6117 case AMDGPU::S_CMP_LE_F32: return AMDGPU::V_CMP_LE_F32_e64;
6118 case AMDGPU::S_CMP_GT_F32: return AMDGPU::V_CMP_GT_F32_e64;
6119 case AMDGPU::S_CMP_LG_F32: return AMDGPU::V_CMP_LG_F32_e64;
6120 case AMDGPU::S_CMP_GE_F32: return AMDGPU::V_CMP_GE_F32_e64;
6121 case AMDGPU::S_CMP_O_F32: return AMDGPU::V_CMP_O_F32_e64;
6122 case AMDGPU::S_CMP_U_F32: return AMDGPU::V_CMP_U_F32_e64;
6123 case AMDGPU::S_CMP_NGE_F32: return AMDGPU::V_CMP_NGE_F32_e64;
6124 case AMDGPU::S_CMP_NLG_F32: return AMDGPU::V_CMP_NLG_F32_e64;
6125 case AMDGPU::S_CMP_NGT_F32: return AMDGPU::V_CMP_NGT_F32_e64;
6126 case AMDGPU::S_CMP_NLE_F32: return AMDGPU::V_CMP_NLE_F32_e64;
6127 case AMDGPU::S_CMP_NEQ_F32: return AMDGPU::V_CMP_NEQ_F32_e64;
6128 case AMDGPU::S_CMP_NLT_F32: return AMDGPU::V_CMP_NLT_F32_e64;
6129 case AMDGPU::S_CMP_LT_F16:
6130 return ST.useRealTrue16Insts() ? AMDGPU::V_CMP_LT_F16_t16_e64
6131 : AMDGPU::V_CMP_LT_F16_fake16_e64;
6132 case AMDGPU::S_CMP_EQ_F16:
6133 return ST.useRealTrue16Insts() ? AMDGPU::V_CMP_EQ_F16_t16_e64
6134 : AMDGPU::V_CMP_EQ_F16_fake16_e64;
6135 case AMDGPU::S_CMP_LE_F16:
6136 return ST.useRealTrue16Insts() ? AMDGPU::V_CMP_LE_F16_t16_e64
6137 : AMDGPU::V_CMP_LE_F16_fake16_e64;
6138 case AMDGPU::S_CMP_GT_F16:
6139 return ST.useRealTrue16Insts() ? AMDGPU::V_CMP_GT_F16_t16_e64
6140 : AMDGPU::V_CMP_GT_F16_fake16_e64;
6141 case AMDGPU::S_CMP_LG_F16:
6142 return ST.useRealTrue16Insts() ? AMDGPU::V_CMP_LG_F16_t16_e64
6143 : AMDGPU::V_CMP_LG_F16_fake16_e64;
6144 case AMDGPU::S_CMP_GE_F16:
6145 return ST.useRealTrue16Insts() ? AMDGPU::V_CMP_GE_F16_t16_e64
6146 : AMDGPU::V_CMP_GE_F16_fake16_e64;
6147 case AMDGPU::S_CMP_O_F16:
6148 return ST.useRealTrue16Insts() ? AMDGPU::V_CMP_O_F16_t16_e64
6149 : AMDGPU::V_CMP_O_F16_fake16_e64;
6150 case AMDGPU::S_CMP_U_F16:
6151 return ST.useRealTrue16Insts() ? AMDGPU::V_CMP_U_F16_t16_e64
6152 : AMDGPU::V_CMP_U_F16_fake16_e64;
6153 case AMDGPU::S_CMP_NGE_F16:
6154 return ST.useRealTrue16Insts() ? AMDGPU::V_CMP_NGE_F16_t16_e64
6155 : AMDGPU::V_CMP_NGE_F16_fake16_e64;
6156 case AMDGPU::S_CMP_NLG_F16:
6157 return ST.useRealTrue16Insts() ? AMDGPU::V_CMP_NLG_F16_t16_e64
6158 : AMDGPU::V_CMP_NLG_F16_fake16_e64;
6159 case AMDGPU::S_CMP_NGT_F16:
6160 return ST.useRealTrue16Insts() ? AMDGPU::V_CMP_NGT_F16_t16_e64
6161 : AMDGPU::V_CMP_NGT_F16_fake16_e64;
6162 case AMDGPU::S_CMP_NLE_F16:
6163 return ST.useRealTrue16Insts() ? AMDGPU::V_CMP_NLE_F16_t16_e64
6164 : AMDGPU::V_CMP_NLE_F16_fake16_e64;
6165 case AMDGPU::S_CMP_NEQ_F16:
6166 return ST.useRealTrue16Insts() ? AMDGPU::V_CMP_NEQ_F16_t16_e64
6167 : AMDGPU::V_CMP_NEQ_F16_fake16_e64;
6168 case AMDGPU::S_CMP_NLT_F16:
6169 return ST.useRealTrue16Insts() ? AMDGPU::V_CMP_NLT_F16_t16_e64
6170 : AMDGPU::V_CMP_NLT_F16_fake16_e64;
6171 case AMDGPU::V_S_EXP_F32_e64: return AMDGPU::V_EXP_F32_e64;
6172 case AMDGPU::V_S_EXP_F16_e64:
6173 return ST.useRealTrue16Insts() ? AMDGPU::V_EXP_F16_t16_e64
6174 : AMDGPU::V_EXP_F16_fake16_e64;
6175 case AMDGPU::V_S_LOG_F32_e64: return AMDGPU::V_LOG_F32_e64;
6176 case AMDGPU::V_S_LOG_F16_e64:
6177 return ST.useRealTrue16Insts() ? AMDGPU::V_LOG_F16_t16_e64
6178 : AMDGPU::V_LOG_F16_fake16_e64;
6179 case AMDGPU::V_S_RCP_F32_e64: return AMDGPU::V_RCP_F32_e64;
6180 case AMDGPU::V_S_RCP_F16_e64:
6181 return ST.useRealTrue16Insts() ? AMDGPU::V_RCP_F16_t16_e64
6182 : AMDGPU::V_RCP_F16_fake16_e64;
6183 case AMDGPU::V_S_RSQ_F32_e64: return AMDGPU::V_RSQ_F32_e64;
6184 case AMDGPU::V_S_RSQ_F16_e64:
6185 return ST.useRealTrue16Insts() ? AMDGPU::V_RSQ_F16_t16_e64
6186 : AMDGPU::V_RSQ_F16_fake16_e64;
6187 case AMDGPU::V_S_SQRT_F32_e64: return AMDGPU::V_SQRT_F32_e64;
6188 case AMDGPU::V_S_SQRT_F16_e64:
6189 return ST.useRealTrue16Insts() ? AMDGPU::V_SQRT_F16_t16_e64
6190 : AMDGPU::V_SQRT_F16_fake16_e64;
6191 }
6193 "Unexpected scalar opcode without corresponding vector one!");
6194}
6195
6196// clang-format on
6197
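// A minimal usage sketch for the two exec save/restore helpers below
// (assuming the reconstructed signatures; callers typically bracket code that
// must run with all lanes enabled):
//
//   Register Saved = MRI.createVirtualRegister(RI.getBoolRC());
//   TII->insertScratchExecCopy(MF, MBB, MBBI, DL, Saved,
//                              /*IsSCCLive=*/false, Indexes);
//   // ... code that requires a full exec mask ...
//   TII->restoreExec(MF, MBB, MBBI, DL, Saved, Indexes);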
6198 void SIInstrInfo::insertScratchExecCopy(MachineFunction &MF,
6199 MachineBasicBlock &MBB,
6200 MachineBasicBlock::iterator MBBI,
6201 const DebugLoc &DL, Register Reg,
6202 bool IsSCCLive,
6203 SlotIndexes *Indexes) const {
6204 const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>();
6205 const SIInstrInfo *TII = ST.getInstrInfo();
6206 const AMDGPU::LaneMaskConstants &LMC = AMDGPU::LaneMaskConstants::get(ST);
6207 if (IsSCCLive) {
6208 // Insert two move instructions, one to save the original value of EXEC and
6209 // the other to turn on all bits in EXEC. This is required as we can't use
6210 // the single instruction S_OR_SAVEEXEC that clobbers SCC.
6211 auto StoreExecMI = BuildMI(MBB, MBBI, DL, TII->get(LMC.MovOpc), Reg)
6212 .addReg(LMC.ExecReg);
6213 auto FlipExecMI =
6214 BuildMI(MBB, MBBI, DL, TII->get(LMC.MovOpc), LMC.ExecReg).addImm(-1);
6215 if (Indexes) {
6216 Indexes->insertMachineInstrInMaps(*StoreExecMI);
6217 Indexes->insertMachineInstrInMaps(*FlipExecMI);
6218 }
6219 } else {
6220 auto SaveExec =
6221 BuildMI(MBB, MBBI, DL, TII->get(LMC.OrSaveExecOpc), Reg).addImm(-1);
6222 SaveExec->getOperand(3).setIsDead(); // Mark SCC as dead.
6223 if (Indexes)
6224 Indexes->insertMachineInstrInMaps(*SaveExec);
6225 }
6226}
6227
6228 void SIInstrInfo::restoreExec(MachineFunction &MF, MachineBasicBlock &MBB,
6229 MachineBasicBlock::iterator MBBI,
6230 const DebugLoc &DL, Register Reg,
6231 SlotIndexes *Indexes) const {
6232 const AMDGPU::LaneMaskConstants &LMC = AMDGPU::LaneMaskConstants::get(ST);
6233 auto ExecRestoreMI = BuildMI(MBB, MBBI, DL, get(LMC.MovOpc), LMC.ExecReg)
6234 .addReg(Reg, RegState::Kill);
6235 if (Indexes)
6236 Indexes->insertMachineInstrInMaps(*ExecRestoreMI);
6237}
6238
6242 "Not a whole wave func");
6243 MachineBasicBlock &MBB = *MF.begin();
6244 for (MachineInstr &MI : MBB)
6245 if (MI.getOpcode() == AMDGPU::SI_WHOLE_WAVE_FUNC_SETUP ||
6246 MI.getOpcode() == AMDGPU::G_AMDGPU_WHOLE_WAVE_FUNC_SETUP)
6247 return &MI;
6248
6249 llvm_unreachable("Couldn't find SI_SETUP_WHOLE_WAVE_FUNC instruction");
6250}
6251
6252 const TargetRegisterClass *SIInstrInfo::getOpRegClass(const MachineInstr &MI,
6253 unsigned OpNo) const {
6254 const MCInstrDesc &Desc = get(MI.getOpcode());
6255 if (MI.isVariadic() || OpNo >= Desc.getNumOperands() ||
6256 Desc.operands()[OpNo].RegClass == -1) {
6257 Register Reg = MI.getOperand(OpNo).getReg();
6258
6259 if (Reg.isVirtual()) {
6260 const MachineRegisterInfo &MRI = MI.getMF()->getRegInfo();
6261 return MRI.getRegClass(Reg);
6262 }
6263 return RI.getPhysRegBaseClass(Reg);
6264 }
6265
6266 int16_t RegClass = getOpRegClassID(Desc.operands()[OpNo]);
6267 return RegClass < 0 ? nullptr : RI.getRegClass(RegClass);
6268}
6269
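// Legalize the operand at index \p OpIdx by materializing it into a fresh
// virtual register of the equivalent vector class, initialized with a COPY or
// an appropriately sized move, and rewriting the operand to use that register.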
6270 void SIInstrInfo::legalizeOpWithMove(MachineInstr &MI, unsigned OpIdx) const {
6271 MachineBasicBlock::iterator I = MI;
6272 MachineBasicBlock *MBB = MI.getParent();
6273 MachineOperand &MO = MI.getOperand(OpIdx);
6274 MachineRegisterInfo &MRI = MBB->getParent()->getRegInfo();
6275 unsigned RCID = getOpRegClassID(get(MI.getOpcode()).operands()[OpIdx]);
6276 const TargetRegisterClass *RC = RI.getRegClass(RCID);
6277 unsigned Size = RI.getRegSizeInBits(*RC);
6278 unsigned Opcode = (Size == 64) ? AMDGPU::V_MOV_B64_PSEUDO
6279 : Size == 16 ? AMDGPU::V_MOV_B16_t16_e64
6280 : AMDGPU::V_MOV_B32_e32;
6281 if (MO.isReg())
6282 Opcode = AMDGPU::COPY;
6283 else if (RI.isSGPRClass(RC))
6284 Opcode = (Size == 64) ? AMDGPU::S_MOV_B64 : AMDGPU::S_MOV_B32;
6285
6286 const TargetRegisterClass *VRC = RI.getEquivalentVGPRClass(RC);
6287 Register Reg = MRI.createVirtualRegister(VRC);
6288 DebugLoc DL = MBB->findDebugLoc(I);
6289 BuildMI(*MI.getParent(), I, DL, get(Opcode), Reg).add(MO);
6290 MO.ChangeToRegister(Reg, false);
6291}
6292
6293 Register SIInstrInfo::buildExtractSubReg(
6294 MachineBasicBlock::iterator MI, MachineRegisterInfo &MRI,
6295 const MachineOperand &SuperReg, const TargetRegisterClass *SuperRC,
6296 unsigned SubIdx, const TargetRegisterClass *SubRC) const {
6297 if (!SuperReg.getReg().isVirtual())
6298 return RI.getSubReg(SuperReg.getReg(), SubIdx);
6299
6300 MachineBasicBlock *MBB = MI->getParent();
6301 const DebugLoc &DL = MI->getDebugLoc();
6302 Register SubReg = MRI.createVirtualRegister(SubRC);
6303
6304 unsigned NewSubIdx = RI.composeSubRegIndices(SuperReg.getSubReg(), SubIdx);
6305 BuildMI(*MBB, MI, DL, get(TargetOpcode::COPY), SubReg)
6306 .addReg(SuperReg.getReg(), {}, NewSubIdx);
6307 return SubReg;
6308}
6309
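// Like buildExtractSubReg, but an immediate source is split directly:
// sub0 yields its low 32 bits and sub1 its high 32 bits.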
6310 MachineOperand SIInstrInfo::buildExtractSubRegOrImm(
6311 MachineBasicBlock::iterator MII, MachineRegisterInfo &MRI,
6312 const MachineOperand &Op, const TargetRegisterClass *SuperRC,
6313 unsigned SubIdx, const TargetRegisterClass *SubRC) const {
6314 if (Op.isImm()) {
6315 if (SubIdx == AMDGPU::sub0)
6316 return MachineOperand::CreateImm(static_cast<int32_t>(Op.getImm()));
6317 if (SubIdx == AMDGPU::sub1)
6318 return MachineOperand::CreateImm(static_cast<int32_t>(Op.getImm() >> 32));
6319
6320 llvm_unreachable("Unhandled register index for immediate");
6321 }
6322
6323 unsigned SubReg = buildExtractSubReg(MII, MRI, Op, SuperRC,
6324 SubIdx, SubRC);
6325 return MachineOperand::CreateReg(SubReg, false);
6326}
6327
6328// Change the order of operands from (0, 1, 2) to (0, 2, 1)
6329void SIInstrInfo::swapOperands(MachineInstr &Inst) const {
6330 assert(Inst.getNumExplicitOperands() == 3);
6331 MachineOperand Op1 = Inst.getOperand(1);
6332 Inst.removeOperand(1);
6333 Inst.addOperand(Op1);
6334}
6335
6336 bool SIInstrInfo::isLegalRegOperand(const MachineRegisterInfo &MRI,
6337 const MCOperandInfo &OpInfo,
6338 const MachineOperand &MO) const {
6339 if (!MO.isReg())
6340 return false;
6341
6342 Register Reg = MO.getReg();
6343
6344 const TargetRegisterClass *DRC = RI.getRegClass(getOpRegClassID(OpInfo));
6345 if (Reg.isPhysical())
6346 return DRC->contains(Reg);
6347
6348 const TargetRegisterClass *RC = MRI.getRegClass(Reg);
6349
6350 if (MO.getSubReg()) {
6351 const MachineFunction *MF = MO.getParent()->getMF();
6352 const TargetRegisterClass *SuperRC = RI.getLargestLegalSuperClass(RC, *MF);
6353 if (!SuperRC)
6354 return false;
6355 return RI.getMatchingSuperRegClass(SuperRC, DRC, MO.getSubReg()) != nullptr;
6356 }
6357
6358 return RI.getCommonSubClass(DRC, RC) != nullptr;
6359}
6360
6361 bool SIInstrInfo::isLegalRegOperand(const MachineInstr &MI, unsigned OpIdx,
6362 const MachineOperand &MO) const {
6363 const MachineRegisterInfo &MRI = MI.getMF()->getRegInfo();
6364 const MCOperandInfo OpInfo = MI.getDesc().operands()[OpIdx];
6365 unsigned Opc = MI.getOpcode();
6366
6367 // See SIInstrInfo::isLegalGFX12PlusPackedMathFP32Operand for more
6368 // information.
6369 if (AMDGPU::isPackedFP32Inst(MI.getOpcode()) && AMDGPU::isGFX12Plus(ST) &&
6370 MO.isReg() && RI.isSGPRReg(MRI, MO.getReg())) {
6371 constexpr AMDGPU::OpName OpNames[] = {
6372 AMDGPU::OpName::src0, AMDGPU::OpName::src1, AMDGPU::OpName::src2};
6373
6374 for (auto [I, OpName] : enumerate(OpNames)) {
6375 int SrcIdx = AMDGPU::getNamedOperandIdx(MI.getOpcode(), OpNames[I]);
6376 if (static_cast<unsigned>(SrcIdx) == OpIdx &&
6377 !isLegalGFX12PlusPackedMathFP32Operand(MRI, MI, I, &MO))
6378 return false;
6379 }
6380 }
6381
6382 if (!isLegalRegOperand(MRI, OpInfo, MO))
6383 return false;
6384
6385 // check Accumulate GPR operand
6386 bool IsAGPR = RI.isAGPR(MRI, MO.getReg());
6387 if (IsAGPR && !ST.hasMAIInsts())
6388 return false;
6389 if (IsAGPR && (!ST.hasGFX90AInsts() || !MRI.reservedRegsFrozen()) &&
6390 (MI.mayLoad() || MI.mayStore() || isDS(Opc) || isMIMG(Opc)))
6391 return false;
6392 // Atomics should have both vdst and vdata either vgpr or agpr.
6393 const int VDstIdx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::vdst);
6394 const int DataIdx = AMDGPU::getNamedOperandIdx(
6395 Opc, isDS(Opc) ? AMDGPU::OpName::data0 : AMDGPU::OpName::vdata);
6396 if ((int)OpIdx == VDstIdx && DataIdx != -1 &&
6397 MI.getOperand(DataIdx).isReg() &&
6398 RI.isAGPR(MRI, MI.getOperand(DataIdx).getReg()) != IsAGPR)
6399 return false;
6400 if ((int)OpIdx == DataIdx) {
6401 if (VDstIdx != -1 &&
6402 RI.isAGPR(MRI, MI.getOperand(VDstIdx).getReg()) != IsAGPR)
6403 return false;
6404 // DS instructions with 2 src operands also must have tied RC.
6405 const int Data1Idx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::data1);
6406 if (Data1Idx != -1 && MI.getOperand(Data1Idx).isReg() &&
6407 RI.isAGPR(MRI, MI.getOperand(Data1Idx).getReg()) != IsAGPR)
6408 return false;
6409 }
6410
6411 // Check V_ACCVGPR_WRITE_B32_e64
6412 if (Opc == AMDGPU::V_ACCVGPR_WRITE_B32_e64 && !ST.hasGFX90AInsts() &&
6413 (int)OpIdx == AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::src0) &&
6414 RI.isSGPRReg(MRI, MO.getReg()))
6415 return false;
6416
6417 if (ST.hasFlatScratchHiInB64InstHazard() &&
6418 MO.getReg() == AMDGPU::SRC_FLAT_SCRATCH_BASE_HI && isSALU(MI)) {
6419 if (const MachineOperand *Dst = getNamedOperand(MI, AMDGPU::OpName::sdst)) {
6420 if (AMDGPU::getRegBitWidth(*RI.getRegClassForReg(MRI, Dst->getReg())) ==
6421 64)
6422 return false;
6423 }
6424 if (Opc == AMDGPU::S_BITCMP0_B64 || Opc == AMDGPU::S_BITCMP1_B64)
6425 return false;
6426 }
6427
6428 return true;
6429}
6430
6431 bool SIInstrInfo::isLegalVSrcOperand(const MachineRegisterInfo &MRI,
6432 const MCOperandInfo &OpInfo,
6433 const MachineOperand &MO) const {
6434 if (MO.isReg())
6435 return isLegalRegOperand(MRI, OpInfo, MO);
6436
6437 // Handle non-register types that are treated like immediates.
6438 assert(MO.isImm() || MO.isTargetIndex() || MO.isFI() || MO.isGlobal());
6439 return true;
6440}
6441
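// On GFX12+ packed FP32 operations, the check below only accepts an SGPR
// source when neither op_sel nor op_sel_hi is set for it; any other source
// kind is accepted unconditionally.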
6442 bool SIInstrInfo::isLegalGFX12PlusPackedMathFP32Operand(
6443 const MachineRegisterInfo &MRI, const MachineInstr &MI, unsigned SrcN,
6444 const MachineOperand *MO) const {
6445 constexpr unsigned NumOps = 3;
6446 constexpr AMDGPU::OpName OpNames[NumOps * 2] = {
6447 AMDGPU::OpName::src0, AMDGPU::OpName::src1,
6448 AMDGPU::OpName::src2, AMDGPU::OpName::src0_modifiers,
6449 AMDGPU::OpName::src1_modifiers, AMDGPU::OpName::src2_modifiers};
6450
6451 assert(SrcN < NumOps);
6452
6453 if (!MO) {
6454 int SrcIdx = AMDGPU::getNamedOperandIdx(MI.getOpcode(), OpNames[SrcN]);
6455 if (SrcIdx == -1)
6456 return true;
6457 MO = &MI.getOperand(SrcIdx);
6458 }
6459
6460 if (!MO->isReg() || !RI.isSGPRReg(MRI, MO->getReg()))
6461 return true;
6462
6463 int ModsIdx =
6464 AMDGPU::getNamedOperandIdx(MI.getOpcode(), OpNames[NumOps + SrcN]);
6465 if (ModsIdx == -1)
6466 return true;
6467
6468 unsigned Mods = MI.getOperand(ModsIdx).getImm();
6469 bool OpSel = Mods & SISrcMods::OP_SEL_0;
6470 bool OpSelHi = Mods & SISrcMods::OP_SEL_1;
6471
6472 return !OpSel && !OpSelHi;
6473}
6474
6475 bool SIInstrInfo::isOperandLegal(const MachineInstr &MI, unsigned OpIdx,
6476 const MachineOperand *MO) const {
6477 const MachineFunction &MF = *MI.getMF();
6478 const MachineRegisterInfo &MRI = MF.getRegInfo();
6479 const MCInstrDesc &InstDesc = MI.getDesc();
6480 const MCOperandInfo &OpInfo = InstDesc.operands()[OpIdx];
6481 int64_t RegClass = getOpRegClassID(OpInfo);
6482 const TargetRegisterClass *DefinedRC =
6483 RegClass != -1 ? RI.getRegClass(RegClass) : nullptr;
6484 if (!MO)
6485 MO = &MI.getOperand(OpIdx);
6486
6487 const bool IsInlineConst = !MO->isReg() && isInlineConstant(*MO, OpInfo);
6488
6489 if (isVALU(MI) && !IsInlineConst && usesConstantBus(MRI, *MO, OpInfo)) {
6490 const MachineOperand *UsedLiteral = nullptr;
6491
6492 int ConstantBusLimit = ST.getConstantBusLimit(MI.getOpcode());
6493 int LiteralLimit = !isVOP3(MI) || ST.hasVOP3Literal() ? 1 : 0;
6494
6495 // TODO: Be more permissive with frame indexes.
6496 if (!MO->isReg() && !isInlineConstant(*MO, OpInfo)) {
6497 if (!LiteralLimit--)
6498 return false;
6499
6500 UsedLiteral = MO;
6501 }
6502
6503 SmallDenseSet<RegSubRegPair> SGPRsUsed;
6504 if (MO->isReg())
6505 SGPRsUsed.insert(RegSubRegPair(MO->getReg(), MO->getSubReg()));
6506
6507 for (unsigned i = 0, e = MI.getNumOperands(); i != e; ++i) {
6508 if (i == OpIdx)
6509 continue;
6510 const MachineOperand &Op = MI.getOperand(i);
6511 if (Op.isReg()) {
6512 if (Op.isUse()) {
6513 RegSubRegPair SGPR(Op.getReg(), Op.getSubReg());
6514 if (regUsesConstantBus(Op, MRI) && SGPRsUsed.insert(SGPR).second) {
6515 if (--ConstantBusLimit <= 0)
6516 return false;
6517 }
6518 }
6519 } else if (AMDGPU::isSISrcOperand(InstDesc.operands()[i]) &&
6520 !isInlineConstant(Op, InstDesc.operands()[i])) {
6521 // The same literal may be used multiple times.
6522 if (!UsedLiteral)
6523 UsedLiteral = &Op;
6524 else if (UsedLiteral->isIdenticalTo(Op))
6525 continue;
6526
6527 if (!LiteralLimit--)
6528 return false;
6529 if (--ConstantBusLimit <= 0)
6530 return false;
6531 }
6532 }
6533 } else if (!IsInlineConst && !MO->isReg() && isSALU(MI)) {
6534 // There can be at most one literal operand, but it can be repeated.
6535 for (unsigned i = 0, e = MI.getNumOperands(); i != e; ++i) {
6536 if (i == OpIdx)
6537 continue;
6538 const MachineOperand &Op = MI.getOperand(i);
6539 if (!Op.isReg() && !Op.isFI() && !Op.isRegMask() &&
6540 !isInlineConstant(Op, InstDesc.operands()[i]) &&
6541 !Op.isIdenticalTo(*MO))
6542 return false;
6543
6544 // Do not fold a non-inlineable and non-register operand into an
6545 // instruction that already has a frame index. The frame index handling
6546 // code does not handle a frame index that co-exists with another
6547 // non-register operand, unless that operand is an inlineable immediate.
6548 if (Op.isFI())
6549 return false;
6550 }
6551 } else if (IsInlineConst && ST.hasNoF16PseudoScalarTransInlineConstants() &&
6552 isF16PseudoScalarTrans(MI.getOpcode())) {
6553 return false;
6554 }
6555
6556 if (MO->isReg()) {
6557 if (!DefinedRC)
6558 return OpInfo.OperandType == MCOI::OPERAND_UNKNOWN;
6559 return isLegalRegOperand(MI, OpIdx, *MO);
6560 }
6561
6562 if (MO->isImm()) {
6563 uint64_t Imm = MO->getImm();
6564 bool Is64BitFPOp = OpInfo.OperandType == AMDGPU::OPERAND_REG_IMM_FP64;
6565 bool Is64BitOp = Is64BitFPOp ||
6566 OpInfo.OperandType == AMDGPU::OPERAND_REG_IMM_INT64 ||
6567 OpInfo.OperandType == AMDGPU::OPERAND_REG_IMM_V2INT32 ||
6568 OpInfo.OperandType == AMDGPU::OPERAND_REG_IMM_V2FP32;
6569 if (Is64BitOp &&
6570 !AMDGPU::isInlinableLiteral64(Imm, ST.hasInv2PiInlineImm())) {
6571 if (!AMDGPU::isValid32BitLiteral(Imm, Is64BitFPOp) &&
6572 (!ST.has64BitLiterals() || InstDesc.getSize() != 4))
6573 return false;
6574
6575 // FIXME: We can use sign extended 64-bit literals, but only for signed
6576 // operands. At the moment we do not know if an operand is signed.
6577 // Such operand will be encoded as its low 32 bits and then either
6578 // correctly sign extended or incorrectly zero extended by HW.
6579 // If 64-bit literals are supported and the literal will be encoded
6580 // as full 64 bit we still can use it.
6581 if (!Is64BitFPOp && (int32_t)Imm < 0 &&
6582 (!ST.has64BitLiterals() || AMDGPU::isValid32BitLiteral(Imm, false)))
6583 return false;
6584 }
6585 }
6586
6587 // Handle non-register types that are treated like immediates.
6588 assert(MO->isImm() || MO->isTargetIndex() || MO->isFI() || MO->isGlobal());
6589
6590 if (!DefinedRC) {
6591 // This operand expects an immediate.
6592 return true;
6593 }
6594
6595 return isImmOperandLegal(MI, OpIdx, *MO);
6596}
6597
6599 bool IsGFX950Only = ST.hasGFX950Insts();
6600 bool IsGFX940Only = ST.hasGFX940Insts();
6601
6602 if (!IsGFX950Only && !IsGFX940Only)
6603 return false;
6604
6605 if (!isVALU(MI))
6606 return false;
6607
6608 // V_COS, V_EXP, V_RCP, etc.
6609 if (isTRANS(MI))
6610 return true;
6611
6612 // DOT2, DOT2C, DOT4, etc.
6613 if (isDOT(MI))
6614 return true;
6615
6616 // MFMA, SMFMA
6617 if (isMFMA(MI))
6618 return true;
6619
6620 unsigned Opcode = MI.getOpcode();
6621 switch (Opcode) {
6622 case AMDGPU::V_CVT_PK_BF8_F32_e64:
6623 case AMDGPU::V_CVT_PK_FP8_F32_e64:
6624 case AMDGPU::V_MQSAD_PK_U16_U8_e64:
6625 case AMDGPU::V_MQSAD_U32_U8_e64:
6626 case AMDGPU::V_PK_ADD_F16:
6627 case AMDGPU::V_PK_ADD_F32:
6628 case AMDGPU::V_PK_ADD_I16:
6629 case AMDGPU::V_PK_ADD_U16:
6630 case AMDGPU::V_PK_ASHRREV_I16:
6631 case AMDGPU::V_PK_FMA_F16:
6632 case AMDGPU::V_PK_FMA_F32:
6633 case AMDGPU::V_PK_FMAC_F16_e32:
6634 case AMDGPU::V_PK_FMAC_F16_e64:
6635 case AMDGPU::V_PK_LSHLREV_B16:
6636 case AMDGPU::V_PK_LSHRREV_B16:
6637 case AMDGPU::V_PK_MAD_I16:
6638 case AMDGPU::V_PK_MAD_U16:
6639 case AMDGPU::V_PK_MAX_F16:
6640 case AMDGPU::V_PK_MAX_I16:
6641 case AMDGPU::V_PK_MAX_U16:
6642 case AMDGPU::V_PK_MIN_F16:
6643 case AMDGPU::V_PK_MIN_I16:
6644 case AMDGPU::V_PK_MIN_U16:
6645 case AMDGPU::V_PK_MOV_B32:
6646 case AMDGPU::V_PK_MUL_F16:
6647 case AMDGPU::V_PK_MUL_F32:
6648 case AMDGPU::V_PK_MUL_LO_U16:
6649 case AMDGPU::V_PK_SUB_I16:
6650 case AMDGPU::V_PK_SUB_U16:
6651 case AMDGPU::V_QSAD_PK_U16_U8_e64:
6652 return true;
6653 default:
6654 return false;
6655 }
6656}
6657
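// Legalize the operands of a VOP2 instruction: enforce the constant-bus
// limit, fix up non-SGPR inputs of v_writelane/v_readlane with
// v_readfirstlane, and otherwise commute or move the offending operand into
// a VGPR.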
6658 void SIInstrInfo::legalizeOperandsVOP2(MachineRegisterInfo &MRI,
6659 MachineInstr &MI) const {
6660 unsigned Opc = MI.getOpcode();
6661 const MCInstrDesc &InstrDesc = get(Opc);
6662
6663 int Src0Idx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::src0);
6664 MachineOperand &Src0 = MI.getOperand(Src0Idx);
6665
6666 int Src1Idx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::src1);
6667 MachineOperand &Src1 = MI.getOperand(Src1Idx);
6668
6669 // If there is an implicit SGPR use such as VCC use for v_addc_u32/v_subb_u32
6670 // we need to only have one constant bus use before GFX10.
6671 bool HasImplicitSGPR = findImplicitSGPRRead(MI);
6672 if (HasImplicitSGPR && ST.getConstantBusLimit(Opc) <= 1 && Src0.isReg() &&
6673 RI.isSGPRReg(MRI, Src0.getReg()))
6674 legalizeOpWithMove(MI, Src0Idx);
6675
6676 // Special case: V_WRITELANE_B32 accepts only immediate or SGPR operands for
6677 // both the value to write (src0) and lane select (src1). Fix up non-SGPR
6678 // src0/src1 with V_READFIRSTLANE.
6679 if (Opc == AMDGPU::V_WRITELANE_B32) {
6680 const DebugLoc &DL = MI.getDebugLoc();
6681 if (Src0.isReg() && RI.isVGPR(MRI, Src0.getReg())) {
6682 Register Reg = MRI.createVirtualRegister(&AMDGPU::SReg_32_XM0RegClass);
6683 BuildMI(*MI.getParent(), MI, DL, get(AMDGPU::V_READFIRSTLANE_B32), Reg)
6684 .add(Src0);
6685 Src0.ChangeToRegister(Reg, false);
6686 }
6687 if (Src1.isReg() && RI.isVGPR(MRI, Src1.getReg())) {
6688 Register Reg = MRI.createVirtualRegister(&AMDGPU::SReg_32_XM0RegClass);
6689 const DebugLoc &DL = MI.getDebugLoc();
6690 BuildMI(*MI.getParent(), MI, DL, get(AMDGPU::V_READFIRSTLANE_B32), Reg)
6691 .add(Src1);
6692 Src1.ChangeToRegister(Reg, false);
6693 }
6694 return;
6695 }
6696
6697 // Special case: V_FMAC_F32 and V_FMAC_F16 have src2.
6698 if (Opc == AMDGPU::V_FMAC_F32_e32 || Opc == AMDGPU::V_FMAC_F16_e32) {
6699 int Src2Idx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::src2);
6700 if (!RI.isVGPR(MRI, MI.getOperand(Src2Idx).getReg()))
6701 legalizeOpWithMove(MI, Src2Idx);
6702 }
6703
6704 // VOP2 src0 instructions support all operand types, so we don't need to check
6705 // their legality. If src1 is already legal, we don't need to do anything.
6706 if (isLegalRegOperand(MRI, InstrDesc.operands()[Src1Idx], Src1))
6707 return;
6708
6709 // Special case: V_READLANE_B32 accepts only immediate or SGPR operands for
6710 // lane select. Fix up using V_READFIRSTLANE, since we assume that the lane
6711 // select is uniform.
6712 if (Opc == AMDGPU::V_READLANE_B32 && Src1.isReg() &&
6713 RI.isVGPR(MRI, Src1.getReg())) {
6714 Register Reg = MRI.createVirtualRegister(&AMDGPU::SReg_32_XM0RegClass);
6715 const DebugLoc &DL = MI.getDebugLoc();
6716 BuildMI(*MI.getParent(), MI, DL, get(AMDGPU::V_READFIRSTLANE_B32), Reg)
6717 .add(Src1);
6718 Src1.ChangeToRegister(Reg, false);
6719 return;
6720 }
6721
6722 // We do not use commuteInstruction here because it is too aggressive and will
6723 // commute if it is possible. We only want to commute here if it improves
6724 // legality. This can be called a fairly large number of times so don't waste
6725 // compile time pointlessly swapping and checking legality again.
6726 if (HasImplicitSGPR || !MI.isCommutable()) {
6727 legalizeOpWithMove(MI, Src1Idx);
6728 return;
6729 }
6730
6731 // If src0 can be used as src1, commuting will make the operands legal.
6732 // Otherwise we have to give up and insert a move.
6733 //
6734 // TODO: Other immediate-like operand kinds could be commuted if there was a
6735 // MachineOperand::ChangeTo* for them.
6736 if ((!Src1.isImm() && !Src1.isReg()) ||
6737 !isLegalRegOperand(MRI, InstrDesc.operands()[Src1Idx], Src0)) {
6738 legalizeOpWithMove(MI, Src1Idx);
6739 return;
6740 }
6741
6742 int CommutedOpc = commuteOpcode(MI);
6743 if (CommutedOpc == -1) {
6744 legalizeOpWithMove(MI, Src1Idx);
6745 return;
6746 }
6747
6748 MI.setDesc(get(CommutedOpc));
6749
6750 Register Src0Reg = Src0.getReg();
6751 unsigned Src0SubReg = Src0.getSubReg();
6752 bool Src0Kill = Src0.isKill();
6753
6754 if (Src1.isImm())
6755 Src0.ChangeToImmediate(Src1.getImm());
6756 else if (Src1.isReg()) {
6757 Src0.ChangeToRegister(Src1.getReg(), false, false, Src1.isKill());
6758 Src0.setSubReg(Src1.getSubReg());
6759 } else
6760 llvm_unreachable("Should only have register or immediate operands");
6761
6762 Src1.ChangeToRegister(Src0Reg, false, false, Src0Kill);
6763 Src1.setSubReg(Src0SubReg);
6764 fixImplicitOperands(MI);
6765}
6766
6767// Legalize VOP3 operands. All operand types are supported for any operand
6768// but only one literal constant and only starting from GFX10.
6769 void SIInstrInfo::legalizeOperandsVOP3(MachineRegisterInfo &MRI,
6770 MachineInstr &MI) const {
6771 unsigned Opc = MI.getOpcode();
6772
6773 int VOP3Idx[3] = {
6774 AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::src0),
6775 AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::src1),
6776 AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::src2)
6777 };
6778
6779 if (Opc == AMDGPU::V_PERMLANE16_B32_e64 ||
6780 Opc == AMDGPU::V_PERMLANEX16_B32_e64 ||
6781 Opc == AMDGPU::V_PERMLANE_BCAST_B32_e64 ||
6782 Opc == AMDGPU::V_PERMLANE_UP_B32_e64 ||
6783 Opc == AMDGPU::V_PERMLANE_DOWN_B32_e64 ||
6784 Opc == AMDGPU::V_PERMLANE_XOR_B32_e64 ||
6785 Opc == AMDGPU::V_PERMLANE_IDX_GEN_B32_e64) {
6786 // src1 and src2 must be scalar
6787 MachineOperand &Src1 = MI.getOperand(VOP3Idx[1]);
6788 const DebugLoc &DL = MI.getDebugLoc();
6789 if (Src1.isReg() && !RI.isSGPRClass(MRI.getRegClass(Src1.getReg()))) {
6790 Register Reg = MRI.createVirtualRegister(&AMDGPU::SReg_32_XM0RegClass);
6791 BuildMI(*MI.getParent(), MI, DL, get(AMDGPU::V_READFIRSTLANE_B32), Reg)
6792 .add(Src1);
6793 Src1.ChangeToRegister(Reg, false);
6794 }
6795 if (VOP3Idx[2] != -1) {
6796 MachineOperand &Src2 = MI.getOperand(VOP3Idx[2]);
6797 if (Src2.isReg() && !RI.isSGPRClass(MRI.getRegClass(Src2.getReg()))) {
6798 Register Reg = MRI.createVirtualRegister(&AMDGPU::SReg_32_XM0RegClass);
6799 BuildMI(*MI.getParent(), MI, DL, get(AMDGPU::V_READFIRSTLANE_B32), Reg)
6800 .add(Src2);
6801 Src2.ChangeToRegister(Reg, false);
6802 }
6803 }
6804 }
6805
6806 // Find the one SGPR operand we are allowed to use.
6807 int ConstantBusLimit = ST.getConstantBusLimit(Opc);
6808 int LiteralLimit = ST.hasVOP3Literal() ? 1 : 0;
6809 SmallDenseSet<unsigned> SGPRsUsed;
6810 Register SGPRReg = findUsedSGPR(MI, VOP3Idx);
6811 if (SGPRReg) {
6812 SGPRsUsed.insert(SGPRReg);
6813 --ConstantBusLimit;
6814 }
6815
6816 for (int Idx : VOP3Idx) {
6817 if (Idx == -1)
6818 break;
6819 MachineOperand &MO = MI.getOperand(Idx);
6820
6821 if (!MO.isReg()) {
6822 if (isInlineConstant(MO, get(Opc).operands()[Idx]))
6823 continue;
6824
6825 if (LiteralLimit > 0 && ConstantBusLimit > 0) {
6826 --LiteralLimit;
6827 --ConstantBusLimit;
6828 continue;
6829 }
6830
6831 --LiteralLimit;
6832 --ConstantBusLimit;
6833 legalizeOpWithMove(MI, Idx);
6834 continue;
6835 }
6836
6837 if (!RI.isSGPRClass(RI.getRegClassForReg(MRI, MO.getReg())))
6838 continue; // VGPRs are legal
6839
6840 // We can use one SGPR in each VOP3 instruction prior to GFX10
6841 // and two starting from GFX10.
6842 if (SGPRsUsed.count(MO.getReg()))
6843 continue;
6844 if (ConstantBusLimit > 0) {
6845 SGPRsUsed.insert(MO.getReg());
6846 --ConstantBusLimit;
6847 continue;
6848 }
6849
6850 // If we make it this far, then the operand is not legal and we must
6851 // legalize it.
6852 legalizeOpWithMove(MI, Idx);
6853 }
6854
6855 // Special case: V_FMAC_F32 and V_FMAC_F16 have src2 tied to vdst.
6856 if ((Opc == AMDGPU::V_FMAC_F32_e64 || Opc == AMDGPU::V_FMAC_F16_e64) &&
6857 !RI.isVGPR(MRI, MI.getOperand(VOP3Idx[2]).getReg()))
6858 legalizeOpWithMove(MI, VOP3Idx[2]);
6859
6860 // Fix the register class of packed FP32 instructions on gfx12+. See
6861 // SIInstrInfo::isLegalGFX12PlusPackedMathFP32Operand for more information.
6863 for (unsigned I = 0; I < 3; ++I) {
6864 if (!isLegalGFX12PlusPackedMathFP32Operand(MRI, MI, /*SrcN=*/I))
6865 legalizeOpWithMove(MI, VOP3Idx[I]);
6866 }
6867 }
6868}
6869
6872 const TargetRegisterClass *DstRC /*=nullptr*/) const {
6873 const TargetRegisterClass *VRC = MRI.getRegClass(SrcReg);
6874 const TargetRegisterClass *SRC = RI.getEquivalentSGPRClass(VRC);
6875 if (DstRC)
6876 SRC = RI.getCommonSubClass(SRC, DstRC);
6877
6878 Register DstReg = MRI.createVirtualRegister(SRC);
6879 unsigned SubRegs = RI.getRegSizeInBits(*VRC) / 32;
6880
6881 if (RI.hasAGPRs(VRC)) {
6882 VRC = RI.getEquivalentVGPRClass(VRC);
6883 Register NewSrcReg = MRI.createVirtualRegister(VRC);
6884 BuildMI(*UseMI.getParent(), UseMI, UseMI.getDebugLoc(),
6885 get(TargetOpcode::COPY), NewSrcReg)
6886 .addReg(SrcReg);
6887 SrcReg = NewSrcReg;
6888 }
6889
6890 if (SubRegs == 1) {
6891 BuildMI(*UseMI.getParent(), UseMI, UseMI.getDebugLoc(),
6892 get(AMDGPU::V_READFIRSTLANE_B32), DstReg)
6893 .addReg(SrcReg);
6894 return DstReg;
6895 }
6896
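  // Sources wider than 32 bits are read back one 32-bit channel at a time with
  // V_READFIRSTLANE_B32 and then reassembled into an SGPR tuple with a
  // REG_SEQUENCE.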
6897  SmallVector<Register, 8> SRegs;
6898  for (unsigned i = 0; i < SubRegs; ++i) {
6899 Register SGPR = MRI.createVirtualRegister(&AMDGPU::SGPR_32RegClass);
6900 BuildMI(*UseMI.getParent(), UseMI, UseMI.getDebugLoc(),
6901 get(AMDGPU::V_READFIRSTLANE_B32), SGPR)
6902 .addReg(SrcReg, {}, RI.getSubRegFromChannel(i));
6903 SRegs.push_back(SGPR);
6904 }
6905
6906  MachineInstrBuilder MIB =
6907      BuildMI(*UseMI.getParent(), UseMI, UseMI.getDebugLoc(),
6908 get(AMDGPU::REG_SEQUENCE), DstReg);
6909 for (unsigned i = 0; i < SubRegs; ++i) {
6910 MIB.addReg(SRegs[i]);
6911 MIB.addImm(RI.getSubRegFromChannel(i));
6912 }
6913 return DstReg;
6914}
6915
6916void SIInstrInfo::legalizeOperandsSMRD(MachineRegisterInfo &MRI,
6917                                       MachineInstr &MI) const {
6918
6919  // If the pointer is stored in VGPRs, then we need to move it to
6920  // SGPRs using v_readfirstlane. This is safe because we only select
6921  // loads with uniform pointers to SMRD instructions, so we know the
6922  // pointer value is uniform.
6923 MachineOperand *SBase = getNamedOperand(MI, AMDGPU::OpName::sbase);
6924 if (SBase && !RI.isSGPRClass(MRI.getRegClass(SBase->getReg()))) {
6925 Register SGPR = readlaneVGPRToSGPR(SBase->getReg(), MI, MRI);
6926 SBase->setReg(SGPR);
6927 }
6928 MachineOperand *SOff = getNamedOperand(MI, AMDGPU::OpName::soffset);
6929 if (SOff && !RI.isSGPRReg(MRI, SOff->getReg())) {
6930 Register SGPR = readlaneVGPRToSGPR(SOff->getReg(), MI, MRI);
6931 SOff->setReg(SGPR);
6932 }
6933}
6934
6935bool SIInstrInfo::moveFlatAddrToVGPR(MachineInstr &Inst) const {
6936  unsigned Opc = Inst.getOpcode();
6937 int OldSAddrIdx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::saddr);
6938 if (OldSAddrIdx < 0)
6939 return false;
6940
6941 assert(isSegmentSpecificFLAT(Inst) || (isFLAT(Inst) && ST.hasFlatGVSMode()));
6942
6943 int NewOpc = AMDGPU::getGlobalVaddrOp(Opc);
6944 if (NewOpc < 0)
6946 if (NewOpc < 0)
6947 return false;
6948
6949 MachineRegisterInfo &MRI = Inst.getMF()->getRegInfo();
6950 MachineOperand &SAddr = Inst.getOperand(OldSAddrIdx);
6951 if (RI.isSGPRReg(MRI, SAddr.getReg()))
6952 return false;
6953
6954 int NewVAddrIdx = AMDGPU::getNamedOperandIdx(NewOpc, AMDGPU::OpName::vaddr);
6955 if (NewVAddrIdx < 0)
6956 return false;
6957
6958 int OldVAddrIdx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::vaddr);
6959
6960  // Check vaddr; it must be zero or absent.
6961 MachineInstr *VAddrDef = nullptr;
6962 if (OldVAddrIdx >= 0) {
6963 MachineOperand &VAddr = Inst.getOperand(OldVAddrIdx);
6964 VAddrDef = MRI.getUniqueVRegDef(VAddr.getReg());
6965 if (!VAddrDef || !VAddrDef->isMoveImmediate() ||
6966 !VAddrDef->getOperand(1).isImm() ||
6967 VAddrDef->getOperand(1).getImm() != 0)
6968 return false;
6969 }
6970
6971 const MCInstrDesc &NewDesc = get(NewOpc);
6972 Inst.setDesc(NewDesc);
6973
6974 // Callers expect iterator to be valid after this call, so modify the
6975 // instruction in place.
6976 if (OldVAddrIdx == NewVAddrIdx) {
6977 MachineOperand &NewVAddr = Inst.getOperand(NewVAddrIdx);
6978 // Clear use list from the old vaddr holding a zero register.
6979 MRI.removeRegOperandFromUseList(&NewVAddr);
6980 MRI.moveOperands(&NewVAddr, &SAddr, 1);
6981 Inst.removeOperand(OldSAddrIdx);
6982 // Update the use list with the pointer we have just moved from vaddr to
6983 // saddr position. Otherwise new vaddr will be missing from the use list.
6984 MRI.removeRegOperandFromUseList(&NewVAddr);
6985 MRI.addRegOperandToUseList(&NewVAddr);
6986 } else {
6987 assert(OldSAddrIdx == NewVAddrIdx);
6988
6989 if (OldVAddrIdx >= 0) {
6990 int NewVDstIn = AMDGPU::getNamedOperandIdx(NewOpc,
6991 AMDGPU::OpName::vdst_in);
6992
6993      // removeOperand doesn't try to fix up tied operand indexes as it goes, so
6994      // it asserts. Untie the operands for now and retie them afterwards.
6995 if (NewVDstIn != -1) {
6996 int OldVDstIn = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::vdst_in);
6997 Inst.untieRegOperand(OldVDstIn);
6998 }
6999
7000 Inst.removeOperand(OldVAddrIdx);
7001
7002 if (NewVDstIn != -1) {
7003 int NewVDst = AMDGPU::getNamedOperandIdx(NewOpc, AMDGPU::OpName::vdst);
7004 Inst.tieOperands(NewVDst, NewVDstIn);
7005 }
7006 }
7007 }
7008
7009 if (VAddrDef && MRI.use_nodbg_empty(VAddrDef->getOperand(0).getReg()))
7010 VAddrDef->eraseFromParent();
7011
7012 return true;
7013}
7014
7015// FIXME: Remove this when SelectionDAG is obsoleted.
7016void SIInstrInfo::legalizeOperandsFLAT(MachineRegisterInfo &MRI,
7017                                       MachineInstr &MI) const {
7018 if (!isSegmentSpecificFLAT(MI) && !ST.hasFlatGVSMode())
7019 return;
7020
7021 // Fixup SGPR operands in VGPRs. We only select these when the DAG divergence
7022 // thinks they are uniform, so a readfirstlane should be valid.
7023 MachineOperand *SAddr = getNamedOperand(MI, AMDGPU::OpName::saddr);
7024 if (!SAddr || RI.isSGPRClass(MRI.getRegClass(SAddr->getReg())))
7025 return;
7026
7027  if (moveFlatAddrToVGPR(MI))
7028    return;
7029
7030 const TargetRegisterClass *DeclaredRC =
7031 getRegClass(MI.getDesc(), SAddr->getOperandNo());
7032
7033 Register ToSGPR = readlaneVGPRToSGPR(SAddr->getReg(), MI, MRI, DeclaredRC);
7034 SAddr->setReg(ToSGPR);
7035}
7036
7039 const TargetRegisterClass *DstRC,
7042 const DebugLoc &DL) const {
7043 Register OpReg = Op.getReg();
7044 unsigned OpSubReg = Op.getSubReg();
7045
7046 const TargetRegisterClass *OpRC = RI.getSubClassWithSubReg(
7047 RI.getRegClassForReg(MRI, OpReg), OpSubReg);
7048
7049 // Check if operand is already the correct register class.
7050 if (DstRC == OpRC)
7051 return;
7052
7053 Register DstReg = MRI.createVirtualRegister(DstRC);
7054 auto Copy =
7055 BuildMI(InsertMBB, I, DL, get(AMDGPU::COPY), DstReg).addReg(OpReg);
7056 Op.setReg(DstReg);
7057
7058 MachineInstr *Def = MRI.getVRegDef(OpReg);
7059 if (!Def)
7060 return;
7061
7062 // Try to eliminate the copy if it is copying an immediate value.
7063 if (Def->isMoveImmediate() && DstRC != &AMDGPU::VReg_1RegClass)
7064 foldImmediate(*Copy, *Def, OpReg, &MRI);
7065
7066 bool ImpDef = Def->isImplicitDef();
7067 while (!ImpDef && Def && Def->isCopy()) {
7068 if (Def->getOperand(1).getReg().isPhysical())
7069 break;
7070 Def = MRI.getUniqueVRegDef(Def->getOperand(1).getReg());
7071 ImpDef = Def && Def->isImplicitDef();
7072 }
7073 if (!RI.isSGPRClass(DstRC) && !Copy->readsRegister(AMDGPU::EXEC, &RI) &&
7074 !ImpDef)
7075 Copy.addReg(AMDGPU::EXEC, RegState::Implicit);
7076}
7077
7078// Emit the actual waterfall loop, executing the wrapped instruction for each
7079// unique value of \p ScalarOps across all lanes. In the best case we execute 1
7080// iteration, in the worst case we execute 64 (once per lane).
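// Roughly, each iteration of the emitted loop does:
//   cur      = v_readfirstlane_b32(scalar_op)   ; value from one active lane
//   cond     = v_cmp_eq(cur, scalar_op)         ; lanes holding the same value
//   saveexec = exec; exec &= cond               ; s_and_saveexec
//   ... the wrapped instruction runs with the now-uniform operand ...
//   exec     = exec ^ saveexec                  ; leave only unhandled lanes
//   SI_WATERFALL_LOOP back to the loop head while any lane remains
// Operands wider than 32 bits are read back in 32-bit pieces and compared as
// 64-bit values; multiple scalar operands are combined with s_and.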
7083 MachineBasicBlock &BodyBB, const DebugLoc &DL,
7084 ArrayRef<MachineOperand *> ScalarOps, ArrayRef<Register> PhySGPRs = {}) {
7085 MachineFunction &MF = *LoopBB.getParent();
7087 const SIRegisterInfo *TRI = ST.getRegisterInfo();
7089 const auto *BoolXExecRC = TRI->getWaveMaskRegClass();
7090
7092 Register CondReg;
7093 for (auto [Idx, ScalarOp] : enumerate(ScalarOps)) {
7094 unsigned RegSize = TRI->getRegSizeInBits(ScalarOp->getReg(), MRI);
7095 unsigned NumSubRegs = RegSize / 32;
7096 Register VScalarOp = ScalarOp->getReg();
7097
7098 const TargetRegisterClass *RFLSrcRC =
7099 TII.getRegClass(TII.get(AMDGPU::V_READFIRSTLANE_B32), 1);
7100
7101 if (NumSubRegs == 1) {
7102 const TargetRegisterClass *VScalarOpRC = MRI.getRegClass(VScalarOp);
7103 if (const TargetRegisterClass *Common =
7104 TRI->getCommonSubClass(VScalarOpRC, RFLSrcRC);
7105 Common != VScalarOpRC) {
7106 Register VRReg = MRI.createVirtualRegister(Common);
7107 BuildMI(LoopBB, I, DL, TII.get(AMDGPU::COPY), VRReg).addReg(VScalarOp);
7108 VScalarOp = VRReg;
7109 }
7110 Register CurReg = MRI.createVirtualRegister(&AMDGPU::SReg_32_XM0RegClass);
7111
7112 BuildMI(LoopBB, I, DL, TII.get(AMDGPU::V_READFIRSTLANE_B32), CurReg)
7113 .addReg(VScalarOp);
7114
7115 Register NewCondReg = MRI.createVirtualRegister(BoolXExecRC);
7116
7117 BuildMI(LoopBB, I, DL, TII.get(AMDGPU::V_CMP_EQ_U32_e64), NewCondReg)
7118 .addReg(CurReg)
7119 .addReg(VScalarOp);
7120
7121 // Combine the comparison results with AND.
7122 if (!CondReg) // First.
7123 CondReg = NewCondReg;
7124 else { // If not the first, we create an AND.
7125 Register AndReg = MRI.createVirtualRegister(BoolXExecRC);
7126 BuildMI(LoopBB, I, DL, TII.get(LMC.AndOpc), AndReg)
7127 .addReg(CondReg)
7128 .addReg(NewCondReg);
7129 CondReg = AndReg;
7130 }
7131
7132 // Update ScalarOp operand to use the SGPR ScalarOp.
7133 if (PhySGPRs.empty() || !PhySGPRs[Idx].isValid())
7134 ScalarOp->setReg(CurReg);
7135 else {
7136 // Insert into the same block of use
7137 BuildMI(*ScalarOp->getParent()->getParent(), ScalarOp->getParent(), DL,
7138 TII.get(AMDGPU::COPY), PhySGPRs[Idx])
7139 .addReg(CurReg);
7140 ScalarOp->setReg(PhySGPRs[Idx]);
7141 }
7142 ScalarOp->setIsKill();
7143 } else {
7144 SmallVector<Register, 8> ReadlanePieces;
7145 RegState VScalarOpUndef = getUndefRegState(ScalarOp->isUndef());
7146 assert(NumSubRegs % 2 == 0 && NumSubRegs <= 32 &&
7147 "Unhandled register size");
7148
7149 for (unsigned Idx = 0; Idx < NumSubRegs; Idx += 2) {
7150 Register CurRegLo =
7151 MRI.createVirtualRegister(&AMDGPU::SReg_32_XM0RegClass);
7152 Register CurRegHi =
7153 MRI.createVirtualRegister(&AMDGPU::SReg_32_XM0RegClass);
7154
7155 // Read the next variant <- also loop target.
7156 BuildMI(LoopBB, I, DL, TII.get(AMDGPU::V_READFIRSTLANE_B32), CurRegLo)
7157 .addReg(VScalarOp, VScalarOpUndef, TRI->getSubRegFromChannel(Idx));
7158
7159 // Read the next variant <- also loop target.
7160 BuildMI(LoopBB, I, DL, TII.get(AMDGPU::V_READFIRSTLANE_B32), CurRegHi)
7161 .addReg(VScalarOp, VScalarOpUndef,
7162 TRI->getSubRegFromChannel(Idx + 1));
7163
7164 ReadlanePieces.push_back(CurRegLo);
7165 ReadlanePieces.push_back(CurRegHi);
7166
7167 // Comparison is to be done as 64-bit.
7168 Register CurReg = MRI.createVirtualRegister(&AMDGPU::SGPR_64RegClass);
7169 BuildMI(LoopBB, I, DL, TII.get(AMDGPU::REG_SEQUENCE), CurReg)
7170 .addReg(CurRegLo)
7171 .addImm(AMDGPU::sub0)
7172 .addReg(CurRegHi)
7173 .addImm(AMDGPU::sub1);
7174
7175 Register NewCondReg = MRI.createVirtualRegister(BoolXExecRC);
7176 auto Cmp = BuildMI(LoopBB, I, DL, TII.get(AMDGPU::V_CMP_EQ_U64_e64),
7177 NewCondReg)
7178 .addReg(CurReg);
7179 if (NumSubRegs <= 2)
7180 Cmp.addReg(VScalarOp);
7181 else
7182 Cmp.addReg(VScalarOp, VScalarOpUndef,
7183 TRI->getSubRegFromChannel(Idx, 2));
7184
7185 // Combine the comparison results with AND.
7186 if (!CondReg) // First.
7187 CondReg = NewCondReg;
7188 else { // If not the first, we create an AND.
7189 Register AndReg = MRI.createVirtualRegister(BoolXExecRC);
7190 BuildMI(LoopBB, I, DL, TII.get(LMC.AndOpc), AndReg)
7191 .addReg(CondReg)
7192 .addReg(NewCondReg);
7193 CondReg = AndReg;
7194 }
7195 } // End for loop.
7196
7197 const auto *SScalarOpRC =
7198 TRI->getEquivalentSGPRClass(MRI.getRegClass(VScalarOp));
7199 Register SScalarOp = MRI.createVirtualRegister(SScalarOpRC);
7200
7201 // Build scalar ScalarOp.
7202 auto Merge =
7203 BuildMI(LoopBB, I, DL, TII.get(AMDGPU::REG_SEQUENCE), SScalarOp);
7204 unsigned Channel = 0;
7205 for (Register Piece : ReadlanePieces) {
7206 Merge.addReg(Piece).addImm(TRI->getSubRegFromChannel(Channel++));
7207 }
7208
7209 // Update ScalarOp operand to use the SGPR ScalarOp.
7210 if (PhySGPRs.empty() || !PhySGPRs[Idx].isValid())
7211 ScalarOp->setReg(SScalarOp);
7212 else {
7213 BuildMI(*ScalarOp->getParent()->getParent(), ScalarOp->getParent(), DL,
7214 TII.get(AMDGPU::COPY), PhySGPRs[Idx])
7215 .addReg(SScalarOp);
7216 ScalarOp->setReg(PhySGPRs[Idx]);
7217 }
7218 ScalarOp->setIsKill();
7219 }
7220 }
7221
7222 Register SaveExec = MRI.createVirtualRegister(BoolXExecRC);
7223 MRI.setSimpleHint(SaveExec, CondReg);
7224
7225 // Update EXEC to matching lanes, saving original to SaveExec.
7226 BuildMI(LoopBB, I, DL, TII.get(LMC.AndSaveExecOpc), SaveExec)
7227 .addReg(CondReg, RegState::Kill);
7228
7229 // The original instruction is here; we insert the terminators after it.
7230 I = BodyBB.end();
7231
7232 // Update EXEC, switch all done bits to 0 and all todo bits to 1.
7233 BuildMI(BodyBB, I, DL, TII.get(LMC.XorTermOpc), LMC.ExecReg)
7234 .addReg(LMC.ExecReg)
7235 .addReg(SaveExec);
7236
7237 BuildMI(BodyBB, I, DL, TII.get(AMDGPU::SI_WATERFALL_LOOP)).addMBB(&LoopBB);
7238}
7239
7240// Build a waterfall loop around \p MI, replacing the VGPR \p ScalarOps
7241// registers with SGPRs by iterating over all unique values across all lanes.
7242// Returns the basic block that now contains \p MI.
7243static MachineBasicBlock *
7247 MachineBasicBlock::iterator Begin = nullptr,
7248 MachineBasicBlock::iterator End = nullptr,
7249 ArrayRef<Register> PhySGPRs = {}) {
7250 assert((PhySGPRs.empty() || PhySGPRs.size() == ScalarOps.size()) &&
7251 "Physical SGPRs must be empty or match the number of scalar operands");
7252 MachineBasicBlock &MBB = *MI.getParent();
7253 MachineFunction &MF = *MBB.getParent();
7255 const SIRegisterInfo *TRI = ST.getRegisterInfo();
7256 MachineRegisterInfo &MRI = MF.getRegInfo();
7257 if (!Begin.isValid())
7258 Begin = &MI;
7259 if (!End.isValid()) {
7260 End = &MI;
7261 ++End;
7262 }
7263 const DebugLoc &DL = MI.getDebugLoc();
7265 const auto *BoolXExecRC = TRI->getWaveMaskRegClass();
7266
7267 // Save SCC. Waterfall Loop may overwrite SCC.
7268 Register SaveSCCReg;
7269
7270  // FIXME: We should maintain SCC liveness while doing the FixSGPRCopies walk
7271  // rather than doing an unlimited scan everywhere.
7272 bool SCCNotDead =
7273 MBB.computeRegisterLiveness(TRI, AMDGPU::SCC, MI,
7274 std::numeric_limits<unsigned>::max()) !=
7275      MachineBasicBlock::LQR_Dead;
7276  if (SCCNotDead) {
7277 SaveSCCReg = MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass);
7278 BuildMI(MBB, Begin, DL, TII.get(AMDGPU::S_CSELECT_B32), SaveSCCReg)
7279 .addImm(1)
7280 .addImm(0);
7281 }
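  // S_CSELECT_B32 1, 0 materializes SCC into SaveSCCReg (1 if SCC was set,
  // 0 otherwise); it is restored after the loop with S_CMP_LG_U32 SaveSCCReg, 0.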
7282
7283 Register SaveExec = MRI.createVirtualRegister(BoolXExecRC);
7284
7285 // Save the EXEC mask
7286 BuildMI(MBB, Begin, DL, TII.get(LMC.MovOpc), SaveExec).addReg(LMC.ExecReg);
7287
7288 // Killed uses in the instruction we are waterfalling around will be
7289 // incorrect due to the added control-flow.
7290  MachineBasicBlock::iterator AfterMI = MI;
7291  ++AfterMI;
7292 for (auto I = Begin; I != AfterMI; I++) {
7293 for (auto &MO : I->all_uses())
7294 MRI.clearKillFlags(MO.getReg());
7295 }
7296
7297 // To insert the loop we need to split the block. Move everything after this
7298 // point to a new block, and insert a new empty block between the two.
7299  MachineBasicBlock *LoopBB = MF.CreateMachineBasicBlock();
7300  MachineBasicBlock *BodyBB = MF.CreateMachineBasicBlock();
7301  MachineBasicBlock *RemainderBB = MF.CreateMachineBasicBlock();
7302  MachineFunction::iterator MBBI(MBB);
7303  ++MBBI;
7304
7305 MF.insert(MBBI, LoopBB);
7306 MF.insert(MBBI, BodyBB);
7307 MF.insert(MBBI, RemainderBB);
7308
7309 LoopBB->addSuccessor(BodyBB);
7310 BodyBB->addSuccessor(LoopBB);
7311 BodyBB->addSuccessor(RemainderBB);
7312
7313  // Move the instructions from Begin up to MI into BodyBB, and the remainder
7314  // of the block into RemainderBB.
7315 RemainderBB->transferSuccessorsAndUpdatePHIs(&MBB);
7316 RemainderBB->splice(RemainderBB->begin(), &MBB, End, MBB.end());
7317 BodyBB->splice(BodyBB->begin(), &MBB, Begin, MBB.end());
7318
7319 MBB.addSuccessor(LoopBB);
7320
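  // The resulting control flow is:
  //
  //   MBB -> LoopBB -> BodyBB -> RemainderBB
  //            ^          |
  //            +----------+
  //
  // where BodyBB holds the waterfalled instruction and branches back to LoopBB
  // until every unique scalar value has been processed.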
7321 // Update dominators. We know that MBB immediately dominates LoopBB, that
7322 // LoopBB immediately dominates BodyBB, and BodyBB immediately dominates
7323 // RemainderBB. RemainderBB immediately dominates all of the successors
7324 // transferred to it from MBB that MBB used to properly dominate.
7325 if (MDT) {
7326 MDT->addNewBlock(LoopBB, &MBB);
7327 MDT->addNewBlock(BodyBB, LoopBB);
7328 MDT->addNewBlock(RemainderBB, BodyBB);
7329 for (auto &Succ : RemainderBB->successors()) {
7330 if (MDT->properlyDominates(&MBB, Succ)) {
7331 MDT->changeImmediateDominator(Succ, RemainderBB);
7332 }
7333 }
7334 }
7335
7336 emitLoadScalarOpsFromVGPRLoop(TII, MRI, *LoopBB, *BodyBB, DL, ScalarOps,
7337 PhySGPRs);
7338
7339 MachineBasicBlock::iterator First = RemainderBB->begin();
7340 // Restore SCC
7341 if (SCCNotDead) {
7342 BuildMI(*RemainderBB, First, DL, TII.get(AMDGPU::S_CMP_LG_U32))
7343 .addReg(SaveSCCReg, RegState::Kill)
7344 .addImm(0);
7345 }
7346
7347 // Restore the EXEC mask
7348 BuildMI(*RemainderBB, First, DL, TII.get(LMC.MovOpc), LMC.ExecReg)
7349 .addReg(SaveExec);
7350 return BodyBB;
7351}
7352
7353// Extract pointer from Rsrc and return a zero-value Rsrc replacement.
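// A 128-bit buffer resource descriptor keeps the base pointer in its low two
// dwords (sub0_sub1). The replacement built below zeroes that pointer and
// fills sub2/sub3 with the default RSRC_DATA_FORMAT, so only the format bits
// remain in the new descriptor.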
7354static std::tuple<unsigned, unsigned>
7356 MachineBasicBlock &MBB = *MI.getParent();
7357 MachineFunction &MF = *MBB.getParent();
7358 MachineRegisterInfo &MRI = MF.getRegInfo();
7359
7360 // Extract the ptr from the resource descriptor.
7361 unsigned RsrcPtr =
7362 TII.buildExtractSubReg(MI, MRI, Rsrc, &AMDGPU::VReg_128RegClass,
7363 AMDGPU::sub0_sub1, &AMDGPU::VReg_64RegClass);
7364
7365 // Create an empty resource descriptor
7366 Register Zero64 = MRI.createVirtualRegister(&AMDGPU::SReg_64RegClass);
7367 Register SRsrcFormatLo = MRI.createVirtualRegister(&AMDGPU::SGPR_32RegClass);
7368 Register SRsrcFormatHi = MRI.createVirtualRegister(&AMDGPU::SGPR_32RegClass);
7369 Register NewSRsrc = MRI.createVirtualRegister(&AMDGPU::SGPR_128RegClass);
7370 uint64_t RsrcDataFormat = TII.getDefaultRsrcDataFormat();
7371
7372 // Zero64 = 0
7373 BuildMI(MBB, MI, MI.getDebugLoc(), TII.get(AMDGPU::S_MOV_B64), Zero64)
7374 .addImm(0);
7375
7376 // SRsrcFormatLo = RSRC_DATA_FORMAT{31-0}
7377 BuildMI(MBB, MI, MI.getDebugLoc(), TII.get(AMDGPU::S_MOV_B32), SRsrcFormatLo)
7378 .addImm(Lo_32(RsrcDataFormat));
7379
7380 // SRsrcFormatHi = RSRC_DATA_FORMAT{63-32}
7381 BuildMI(MBB, MI, MI.getDebugLoc(), TII.get(AMDGPU::S_MOV_B32), SRsrcFormatHi)
7382 .addImm(Hi_32(RsrcDataFormat));
7383
7384 // NewSRsrc = {Zero64, SRsrcFormat}
7385 BuildMI(MBB, MI, MI.getDebugLoc(), TII.get(AMDGPU::REG_SEQUENCE), NewSRsrc)
7386 .addReg(Zero64)
7387 .addImm(AMDGPU::sub0_sub1)
7388 .addReg(SRsrcFormatLo)
7389 .addImm(AMDGPU::sub2)
7390 .addReg(SRsrcFormatHi)
7391 .addImm(AMDGPU::sub3);
7392
7393 return std::tuple(RsrcPtr, NewSRsrc);
7394}
7395
7398 MachineDominatorTree *MDT) const {
7399 MachineFunction &MF = *MI.getMF();
7400 MachineRegisterInfo &MRI = MF.getRegInfo();
7401 MachineBasicBlock *CreatedBB = nullptr;
7402
7403 // Legalize VOP2
7404 if (isVOP2(MI) || isVOPC(MI)) {
7405    legalizeOperandsVOP2(MRI, MI);
7406    return CreatedBB;
7407 }
7408
7409 // Legalize VOP3
7410 if (isVOP3(MI)) {
7411    legalizeOperandsVOP3(MRI, MI);
7412    return CreatedBB;
7413 }
7414
7415 // Legalize SMRD
7416 if (isSMRD(MI)) {
7417    legalizeOperandsSMRD(MRI, MI);
7418    return CreatedBB;
7419 }
7420
7421 // Legalize FLAT
7422 if (isFLAT(MI)) {
7423    legalizeOperandsFLAT(MRI, MI);
7424    return CreatedBB;
7425 }
7426
7427 // Legalize PHI
7428 // The register class of the operands must be the same type as the register
7429 // class of the output.
7430 if (MI.getOpcode() == AMDGPU::PHI) {
7431 const TargetRegisterClass *VRC = getOpRegClass(MI, 0);
7432 assert(!RI.isSGPRClass(VRC));
7433
7434 // Update all the operands so they have the same type.
7435 for (unsigned I = 1, E = MI.getNumOperands(); I != E; I += 2) {
7436 MachineOperand &Op = MI.getOperand(I);
7437 if (!Op.isReg() || !Op.getReg().isVirtual())
7438 continue;
7439
7440 // MI is a PHI instruction.
7441 MachineBasicBlock *InsertBB = MI.getOperand(I + 1).getMBB();
7442      MachineBasicBlock::iterator Insert = InsertBB->getFirstTerminator();
7443
7444 // Avoid creating no-op copies with the same src and dst reg class. These
7445 // confuse some of the machine passes.
7446 legalizeGenericOperand(*InsertBB, Insert, VRC, Op, MRI, MI.getDebugLoc());
7447 }
7448 }
7449
7450 // REG_SEQUENCE doesn't really require operand legalization, but if one has a
7451 // VGPR dest type and SGPR sources, insert copies so all operands are
7452 // VGPRs. This seems to help operand folding / the register coalescer.
7453 if (MI.getOpcode() == AMDGPU::REG_SEQUENCE) {
7454 MachineBasicBlock *MBB = MI.getParent();
7455 const TargetRegisterClass *DstRC = getOpRegClass(MI, 0);
7456 if (RI.hasVGPRs(DstRC)) {
7457 // Update all the operands so they are VGPR register classes. These may
7458 // not be the same register class because REG_SEQUENCE supports mixing
7459 // subregister index types e.g. sub0_sub1 + sub2 + sub3
7460 for (unsigned I = 1, E = MI.getNumOperands(); I != E; I += 2) {
7461 MachineOperand &Op = MI.getOperand(I);
7462 if (!Op.isReg() || !Op.getReg().isVirtual())
7463 continue;
7464
7465 const TargetRegisterClass *OpRC = MRI.getRegClass(Op.getReg());
7466 const TargetRegisterClass *VRC = RI.getEquivalentVGPRClass(OpRC);
7467 if (VRC == OpRC)
7468 continue;
7469
7470 legalizeGenericOperand(*MBB, MI, VRC, Op, MRI, MI.getDebugLoc());
7471 Op.setIsKill();
7472 }
7473 }
7474
7475 return CreatedBB;
7476 }
7477
7478 // Legalize INSERT_SUBREG
7479 // src0 must have the same register class as dst
7480 if (MI.getOpcode() == AMDGPU::INSERT_SUBREG) {
7481 Register Dst = MI.getOperand(0).getReg();
7482 Register Src0 = MI.getOperand(1).getReg();
7483 const TargetRegisterClass *DstRC = MRI.getRegClass(Dst);
7484 const TargetRegisterClass *Src0RC = MRI.getRegClass(Src0);
7485 if (DstRC != Src0RC) {
7486 MachineBasicBlock *MBB = MI.getParent();
7487 MachineOperand &Op = MI.getOperand(1);
7488 legalizeGenericOperand(*MBB, MI, DstRC, Op, MRI, MI.getDebugLoc());
7489 }
7490 return CreatedBB;
7491 }
7492
7493 // Legalize SI_INIT_M0
7494 if (MI.getOpcode() == AMDGPU::SI_INIT_M0) {
7495 MachineOperand &Src = MI.getOperand(0);
7496 if (Src.isReg() && RI.hasVectorRegisters(MRI.getRegClass(Src.getReg())))
7497 Src.setReg(readlaneVGPRToSGPR(Src.getReg(), MI, MRI));
7498 return CreatedBB;
7499 }
7500
7501 // Legalize S_BITREPLICATE, S_QUADMASK and S_WQM
7502 if (MI.getOpcode() == AMDGPU::S_BITREPLICATE_B64_B32 ||
7503 MI.getOpcode() == AMDGPU::S_QUADMASK_B32 ||
7504 MI.getOpcode() == AMDGPU::S_QUADMASK_B64 ||
7505 MI.getOpcode() == AMDGPU::S_WQM_B32 ||
7506 MI.getOpcode() == AMDGPU::S_WQM_B64 ||
7507 MI.getOpcode() == AMDGPU::S_INVERSE_BALLOT_U32 ||
7508 MI.getOpcode() == AMDGPU::S_INVERSE_BALLOT_U64) {
7509 MachineOperand &Src = MI.getOperand(1);
7510 if (Src.isReg() && RI.hasVectorRegisters(MRI.getRegClass(Src.getReg())))
7511 Src.setReg(readlaneVGPRToSGPR(Src.getReg(), MI, MRI));
7512 return CreatedBB;
7513 }
7514
7515 // Legalize MIMG/VIMAGE/VSAMPLE and MUBUF/MTBUF for shaders.
7516 //
7517 // Shaders only generate MUBUF/MTBUF instructions via intrinsics or via
7518 // scratch memory access. In both cases, the legalization never involves
7519 // conversion to the addr64 form.
7521 (isMUBUF(MI) || isMTBUF(MI)))) {
7522 AMDGPU::OpName RSrcOpName = (isVIMAGE(MI) || isVSAMPLE(MI))
7523 ? AMDGPU::OpName::rsrc
7524 : AMDGPU::OpName::srsrc;
7525 MachineOperand *SRsrc = getNamedOperand(MI, RSrcOpName);
7526 if (SRsrc && !RI.isSGPRClass(MRI.getRegClass(SRsrc->getReg())))
7527 CreatedBB = generateWaterFallLoop(*this, MI, {SRsrc}, MDT);
7528
7529 AMDGPU::OpName SampOpName =
7530 isMIMG(MI) ? AMDGPU::OpName::ssamp : AMDGPU::OpName::samp;
7531 MachineOperand *SSamp = getNamedOperand(MI, SampOpName);
7532 if (SSamp && !RI.isSGPRClass(MRI.getRegClass(SSamp->getReg())))
7533 CreatedBB = generateWaterFallLoop(*this, MI, {SSamp}, MDT);
7534
7535 return CreatedBB;
7536 }
7537
7538 // Legalize SI_CALL
7539 if (MI.getOpcode() == AMDGPU::SI_CALL_ISEL) {
7540 MachineOperand *Dest = &MI.getOperand(0);
7541 if (!RI.isSGPRClass(MRI.getRegClass(Dest->getReg()))) {
7542 createWaterFallForSiCall(&MI, MDT, {Dest});
7543 }
7544 }
7545
7546 // Legalize s_sleep_var.
7547 if (MI.getOpcode() == AMDGPU::S_SLEEP_VAR) {
7548 const DebugLoc &DL = MI.getDebugLoc();
7549 Register Reg = MRI.createVirtualRegister(&AMDGPU::SReg_32_XM0RegClass);
7550 int Src0Idx =
7551 AMDGPU::getNamedOperandIdx(MI.getOpcode(), AMDGPU::OpName::src0);
7552 MachineOperand &Src0 = MI.getOperand(Src0Idx);
7553 BuildMI(*MI.getParent(), MI, DL, get(AMDGPU::V_READFIRSTLANE_B32), Reg)
7554 .add(Src0);
7555 Src0.ChangeToRegister(Reg, false);
7556 return nullptr;
7557 }
7558
7559 // Legalize TENSOR_LOAD_TO_LDS_d2/_d4, TENSOR_STORE_FROM_LDS_d2/_d4. All their
7560 // operands are scalar.
7561 if (MI.getOpcode() == AMDGPU::TENSOR_LOAD_TO_LDS_d2 ||
7562 MI.getOpcode() == AMDGPU::TENSOR_LOAD_TO_LDS_d4 ||
7563 MI.getOpcode() == AMDGPU::TENSOR_STORE_FROM_LDS_d2 ||
7564 MI.getOpcode() == AMDGPU::TENSOR_STORE_FROM_LDS_d4) {
7565 for (MachineOperand &Src : MI.explicit_operands()) {
7566 if (Src.isReg() && RI.hasVectorRegisters(MRI.getRegClass(Src.getReg())))
7567 Src.setReg(readlaneVGPRToSGPR(Src.getReg(), MI, MRI));
7568 }
7569 return CreatedBB;
7570 }
7571
7572 // Legalize MUBUF instructions.
7573 bool isSoffsetLegal = true;
7574 int SoffsetIdx =
7575 AMDGPU::getNamedOperandIdx(MI.getOpcode(), AMDGPU::OpName::soffset);
7576 if (SoffsetIdx != -1) {
7577 MachineOperand *Soffset = &MI.getOperand(SoffsetIdx);
7578 if (Soffset->isReg() && Soffset->getReg().isVirtual() &&
7579 !RI.isSGPRClass(MRI.getRegClass(Soffset->getReg()))) {
7580 isSoffsetLegal = false;
7581 }
7582 }
7583
7584 bool isRsrcLegal = true;
7585 int RsrcIdx =
7586 AMDGPU::getNamedOperandIdx(MI.getOpcode(), AMDGPU::OpName::srsrc);
7587 if (RsrcIdx != -1) {
7588 MachineOperand *Rsrc = &MI.getOperand(RsrcIdx);
7589 if (Rsrc->isReg() && !RI.isSGPRReg(MRI, Rsrc->getReg()))
7590 isRsrcLegal = false;
7591 }
7592
7593 // The operands are legal.
7594 if (isRsrcLegal && isSoffsetLegal)
7595 return CreatedBB;
7596
7597 if (!isRsrcLegal) {
7598 // Legalize a VGPR Rsrc
7599 //
7600 // If the instruction is _ADDR64, we can avoid a waterfall by extracting
7601 // the base pointer from the VGPR Rsrc, adding it to the VAddr, then using
7602 // a zero-value SRsrc.
7603 //
7604 // If the instruction is _OFFSET (both idxen and offen disabled), and we
7605 // support ADDR64 instructions, we can convert to ADDR64 and do the same as
7606 // above.
7607 //
7608 // Otherwise we are on non-ADDR64 hardware, and/or we have
7609 // idxen/offen/bothen and we fall back to a waterfall loop.
7610
7611 MachineOperand *Rsrc = &MI.getOperand(RsrcIdx);
7612 MachineBasicBlock &MBB = *MI.getParent();
7613
7614 MachineOperand *VAddr = getNamedOperand(MI, AMDGPU::OpName::vaddr);
7615 if (VAddr && AMDGPU::getIfAddr64Inst(MI.getOpcode()) != -1) {
7616 // This is already an ADDR64 instruction so we need to add the pointer
7617 // extracted from the resource descriptor to the current value of VAddr.
7618 Register NewVAddrLo = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
7619 Register NewVAddrHi = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
7620 Register NewVAddr = MRI.createVirtualRegister(&AMDGPU::VReg_64RegClass);
7621
7622 const auto *BoolXExecRC = RI.getWaveMaskRegClass();
7623 Register CondReg0 = MRI.createVirtualRegister(BoolXExecRC);
7624 Register CondReg1 = MRI.createVirtualRegister(BoolXExecRC);
7625
7626 unsigned RsrcPtr, NewSRsrc;
7627 std::tie(RsrcPtr, NewSRsrc) = extractRsrcPtr(*this, MI, *Rsrc);
7628
7629 // NewVaddrLo = RsrcPtr:sub0 + VAddr:sub0
7630 const DebugLoc &DL = MI.getDebugLoc();
7631 BuildMI(MBB, MI, DL, get(AMDGPU::V_ADD_CO_U32_e64), NewVAddrLo)
7632 .addDef(CondReg0)
7633 .addReg(RsrcPtr, {}, AMDGPU::sub0)
7634 .addReg(VAddr->getReg(), {}, AMDGPU::sub0)
7635 .addImm(0);
7636
7637 // NewVaddrHi = RsrcPtr:sub1 + VAddr:sub1
7638 BuildMI(MBB, MI, DL, get(AMDGPU::V_ADDC_U32_e64), NewVAddrHi)
7639 .addDef(CondReg1, RegState::Dead)
7640 .addReg(RsrcPtr, {}, AMDGPU::sub1)
7641 .addReg(VAddr->getReg(), {}, AMDGPU::sub1)
7642 .addReg(CondReg0, RegState::Kill)
7643 .addImm(0);
7644
7645 // NewVaddr = {NewVaddrHi, NewVaddrLo}
7646 BuildMI(MBB, MI, MI.getDebugLoc(), get(AMDGPU::REG_SEQUENCE), NewVAddr)
7647 .addReg(NewVAddrLo)
7648 .addImm(AMDGPU::sub0)
7649 .addReg(NewVAddrHi)
7650 .addImm(AMDGPU::sub1);
7651
7652 VAddr->setReg(NewVAddr);
7653 Rsrc->setReg(NewSRsrc);
7654 } else if (!VAddr && ST.hasAddr64()) {
7655      // This instruction is the _OFFSET variant, so we need to convert it to
7656      // ADDR64.
7657 assert(ST.getGeneration() < AMDGPUSubtarget::VOLCANIC_ISLANDS &&
7658 "FIXME: Need to emit flat atomics here");
7659
7660 unsigned RsrcPtr, NewSRsrc;
7661 std::tie(RsrcPtr, NewSRsrc) = extractRsrcPtr(*this, MI, *Rsrc);
7662
7663 Register NewVAddr = MRI.createVirtualRegister(&AMDGPU::VReg_64RegClass);
7664 MachineOperand *VData = getNamedOperand(MI, AMDGPU::OpName::vdata);
7665 MachineOperand *Offset = getNamedOperand(MI, AMDGPU::OpName::offset);
7666 MachineOperand *SOffset = getNamedOperand(MI, AMDGPU::OpName::soffset);
7667 unsigned Addr64Opcode = AMDGPU::getAddr64Inst(MI.getOpcode());
7668
7669 // Atomics with return have an additional tied operand and are
7670 // missing some of the special bits.
7671 MachineOperand *VDataIn = getNamedOperand(MI, AMDGPU::OpName::vdata_in);
7672 MachineInstr *Addr64;
7673
7674 if (!VDataIn) {
7675 // Regular buffer load / store.
7676        MachineInstrBuilder MIB =
7677            BuildMI(MBB, MI, MI.getDebugLoc(), get(Addr64Opcode))
7678 .add(*VData)
7679 .addReg(NewVAddr)
7680 .addReg(NewSRsrc)
7681 .add(*SOffset)
7682 .add(*Offset);
7683
7684 if (const MachineOperand *CPol =
7685 getNamedOperand(MI, AMDGPU::OpName::cpol)) {
7686 MIB.addImm(CPol->getImm());
7687 }
7688
7689 if (const MachineOperand *TFE =
7690 getNamedOperand(MI, AMDGPU::OpName::tfe)) {
7691 MIB.addImm(TFE->getImm());
7692 }
7693
7694 MIB.addImm(getNamedImmOperand(MI, AMDGPU::OpName::swz));
7695
7696 MIB.cloneMemRefs(MI);
7697 Addr64 = MIB;
7698 } else {
7699 // Atomics with return.
7700 Addr64 = BuildMI(MBB, MI, MI.getDebugLoc(), get(Addr64Opcode))
7701 .add(*VData)
7702 .add(*VDataIn)
7703 .addReg(NewVAddr)
7704 .addReg(NewSRsrc)
7705 .add(*SOffset)
7706 .add(*Offset)
7707 .addImm(getNamedImmOperand(MI, AMDGPU::OpName::cpol))
7708 .cloneMemRefs(MI);
7709 }
7710
7711 MI.removeFromParent();
7712
7713 // NewVaddr = {NewVaddrHi, NewVaddrLo}
7714 BuildMI(MBB, Addr64, Addr64->getDebugLoc(), get(AMDGPU::REG_SEQUENCE),
7715 NewVAddr)
7716 .addReg(RsrcPtr, {}, AMDGPU::sub0)
7717 .addImm(AMDGPU::sub0)
7718 .addReg(RsrcPtr, {}, AMDGPU::sub1)
7719 .addImm(AMDGPU::sub1);
7720 } else {
7721 // Legalize a VGPR Rsrc and soffset together.
7722 if (!isSoffsetLegal) {
7723 MachineOperand *Soffset = getNamedOperand(MI, AMDGPU::OpName::soffset);
7724 CreatedBB = generateWaterFallLoop(*this, MI, {Rsrc, Soffset}, MDT);
7725 return CreatedBB;
7726 }
7727 CreatedBB = generateWaterFallLoop(*this, MI, {Rsrc}, MDT);
7728 return CreatedBB;
7729 }
7730 }
7731
7732 // Legalize a VGPR soffset.
7733 if (!isSoffsetLegal) {
7734 MachineOperand *Soffset = getNamedOperand(MI, AMDGPU::OpName::soffset);
7735 CreatedBB = generateWaterFallLoop(*this, MI, {Soffset}, MDT);
7736 return CreatedBB;
7737 }
7738 return CreatedBB;
7739}
7740
7741void SIInstrWorklist::insert(MachineInstr *MI) {
7742  InstrList.insert(MI);
7743  // Add MBUF instructions to the deferred list.
7744 int RsrcIdx =
7745 AMDGPU::getNamedOperandIdx(MI->getOpcode(), AMDGPU::OpName::srsrc);
7746 if (RsrcIdx != -1) {
7747 DeferredList.insert(MI);
7748 }
7749}
7750
7751bool SIInstrWorklist::isDeferred(MachineInstr *MI) {
7752  return DeferredList.contains(MI);
7753}
7754
7755// Legalize size mismatches between 16-bit and 32-bit registers in v2s copy
7756// lowering (changing sgpr to vgpr).
7757// This is mainly caused by 16-bit SALU and 16-bit VALU instructions using
7758// registers of different sizes. The operand sizes need to be legalized during
7759// the vgpr lowering chain. This can be removed once sgpr16 is in place.
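// Two cases are handled below: if the operand's 32-bit VGPR is wider than the
// 16-bit class the instruction expects, the lo16 subregister is used; if a
// 16-bit VGPR feeds a 32-bit operand, it is widened with a REG_SEQUENCE whose
// hi16 half is an IMPLICIT_DEF.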
7760void SIInstrInfo::legalizeOperandsVALUt16(MachineInstr &MI, unsigned OpIdx,
7761                                          MachineRegisterInfo &MRI) const {
7762 if (!ST.useRealTrue16Insts())
7763 return;
7764
7765 unsigned Opcode = MI.getOpcode();
7766 MachineBasicBlock *MBB = MI.getParent();
7767 // Legalize operands and check for size mismatch
7768 if (!OpIdx || OpIdx >= MI.getNumExplicitOperands() ||
7769 OpIdx >= get(Opcode).getNumOperands() ||
7770 get(Opcode).operands()[OpIdx].RegClass == -1)
7771 return;
7772
7773 MachineOperand &Op = MI.getOperand(OpIdx);
7774 if (!Op.isReg() || !Op.getReg().isVirtual())
7775 return;
7776
7777 const TargetRegisterClass *CurrRC = MRI.getRegClass(Op.getReg());
7778 if (!RI.isVGPRClass(CurrRC))
7779 return;
7780
7781 int16_t RCID = getOpRegClassID(get(Opcode).operands()[OpIdx]);
7782 const TargetRegisterClass *ExpectedRC = RI.getRegClass(RCID);
7783 if (RI.getMatchingSuperRegClass(CurrRC, ExpectedRC, AMDGPU::lo16)) {
7784 Op.setSubReg(AMDGPU::lo16);
7785 } else if (RI.getMatchingSuperRegClass(ExpectedRC, CurrRC, AMDGPU::lo16)) {
7786 const DebugLoc &DL = MI.getDebugLoc();
7787 Register NewDstReg = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
7788 Register Undef = MRI.createVirtualRegister(&AMDGPU::VGPR_16RegClass);
7789 BuildMI(*MBB, MI, DL, get(AMDGPU::IMPLICIT_DEF), Undef);
7790 BuildMI(*MBB, MI, DL, get(AMDGPU::REG_SEQUENCE), NewDstReg)
7791 .addReg(Op.getReg())
7792 .addImm(AMDGPU::lo16)
7793 .addReg(Undef)
7794 .addImm(AMDGPU::hi16);
7795 Op.setReg(NewDstReg);
7796 }
7797}
7798void SIInstrInfo::legalizeOperandsVALUt16(MachineInstr &MI,
7799                                          MachineRegisterInfo &MRI) const {
7800 for (unsigned OpIdx = 1; OpIdx < MI.getNumExplicitOperands(); OpIdx++)
7801    legalizeOperandsVALUt16(MI, OpIdx, MRI);
7802}
7803
7807 ArrayRef<Register> PhySGPRs) const {
7808 assert(MI->getOpcode() == AMDGPU::SI_CALL_ISEL &&
7809 "This only handle waterfall for SI_CALL_ISEL");
7810 // Move everything between ADJCALLSTACKUP and ADJCALLSTACKDOWN and
7811 // following copies, we also need to move copies from and to physical
7812 // registers into the loop block.
7813 // Also move the copies to physical registers into the loop block
7814 MachineBasicBlock &MBB = *MI->getParent();
7816 while (Start->getOpcode() != AMDGPU::ADJCALLSTACKUP)
7817 --Start;
7819 while (End->getOpcode() != AMDGPU::ADJCALLSTACKDOWN)
7820 ++End;
7821
7822 // Also include following copies of the return value
7823 ++End;
7824 while (End != MBB.end() && End->isCopy() &&
7825 MI->definesRegister(End->getOperand(1).getReg(), &RI))
7826 ++End;
7827
7828 generateWaterFallLoop(*this, *MI, ScalarOps, MDT, Start, End, PhySGPRs);
7829}
7830
7831void SIInstrInfo::moveToVALU(SIInstrWorklist &Worklist,
7832                             MachineDominatorTree *MDT) const {
7833  DenseMap<MachineInstr *, V2PhysSCopyInfo> WaterFalls;
7834  DenseMap<MachineInstr *, bool> V2SPhyCopiesToErase;
7835 while (!Worklist.empty()) {
7836 MachineInstr &Inst = *Worklist.top();
7837 Worklist.erase_top();
7838 // Skip MachineInstr in the deferred list.
7839 if (Worklist.isDeferred(&Inst))
7840 continue;
7841 moveToVALUImpl(Worklist, MDT, Inst, WaterFalls, V2SPhyCopiesToErase);
7842 }
7843
7844 // Deferred list of instructions will be processed once
7845 // all the MachineInstr in the worklist are done.
7846 for (MachineInstr *Inst : Worklist.getDeferredList()) {
7847 moveToVALUImpl(Worklist, MDT, *Inst, WaterFalls, V2SPhyCopiesToErase);
7848 assert(Worklist.empty() &&
7849 "Deferred MachineInstr are not supposed to re-populate worklist");
7850 }
7851
7852 for (std::pair<MachineInstr *, V2PhysSCopyInfo> &Entry : WaterFalls) {
7853 if (Entry.first->getOpcode() == AMDGPU::SI_CALL_ISEL)
7854 createWaterFallForSiCall(Entry.first, MDT, Entry.second.MOs,
7855 Entry.second.SGPRs);
7856 }
7857
7858 for (std::pair<MachineInstr *, bool> Entry : V2SPhyCopiesToErase)
7859 if (Entry.second)
7860 Entry.first->eraseFromParent();
7861}
7862void SIInstrInfo::createReadFirstLaneFromCopyToPhysReg(
7863    MachineRegisterInfo &MRI, Register DstReg, MachineInstr &Inst) const {
7864 // If it's a copy of a VGPR to a physical SGPR, insert a V_READFIRSTLANE and
7865 // hope for the best.
7866 const TargetRegisterClass *DstRC = RI.getRegClassForReg(MRI, DstReg);
7867 ArrayRef<int16_t> SubRegIndices = RI.getRegSplitParts(DstRC, 4);
7868 if (SubRegIndices.size() <= 1) {
7869 Register NewDst = MRI.createVirtualRegister(&AMDGPU::SReg_32_XM0RegClass);
7870 BuildMI(*Inst.getParent(), &Inst, Inst.getDebugLoc(),
7871 get(AMDGPU::V_READFIRSTLANE_B32), NewDst)
7872 .add(Inst.getOperand(1));
7873 BuildMI(*Inst.getParent(), &Inst, Inst.getDebugLoc(), get(AMDGPU::COPY),
7874 DstReg)
7875 .addReg(NewDst);
7876 } else {
7877    SmallVector<Register, 8> DstRegs;
7878    for (int16_t Indice : SubRegIndices) {
7879 Register NewDst = MRI.createVirtualRegister(&AMDGPU::SReg_32_XM0RegClass);
7880 BuildMI(*Inst.getParent(), &Inst, Inst.getDebugLoc(),
7881 get(AMDGPU::V_READFIRSTLANE_B32), NewDst)
7882 .addReg(Inst.getOperand(1).getReg(), {}, Indice);
7883
7884 DstRegs.push_back(NewDst);
7885 }
7886    MachineInstrBuilder MIB =
7887        BuildMI(*Inst.getParent(), &Inst, Inst.getDebugLoc(),
7888 get(AMDGPU::REG_SEQUENCE), DstReg);
7889 for (unsigned i = 0; i < SubRegIndices.size(); ++i) {
7890 MIB.addReg(DstRegs[i]);
7891 MIB.addImm(RI.getSubRegFromChannel(i));
7892 }
7893 }
7894}
7895
7897 SIInstrWorklist &Worklist, Register DstReg, MachineInstr &Inst,
7900 DenseMap<MachineInstr *, bool> &V2SPhyCopiesToErase) const {
7901 if (DstReg == AMDGPU::M0) {
7902 createReadFirstLaneFromCopyToPhysReg(MRI, DstReg, Inst);
7903 V2SPhyCopiesToErase.try_emplace(&Inst, true);
7904 return;
7905 }
7906 Register SrcReg = Inst.getOperand(1).getReg();
7909  // Only search the current block, since a physreg's def & use cannot cross
7910  // blocks when MF.NoPhi = false.
7911 while (++I != E) {
7912 // For SI_CALL_ISEL users, replace the phys SGPR with the VGPR source
7913 // and record the operand for later waterfall loop generation.
7914 if (I->getOpcode() == AMDGPU::SI_CALL_ISEL) {
7915 MachineInstr *UseMI = &*I;
7916 for (unsigned i = 0; i < UseMI->getNumOperands(); ++i) {
7917 if (UseMI->getOperand(i).isReg() &&
7918 UseMI->getOperand(i).getReg() == DstReg) {
7919 MachineOperand *MO = &UseMI->getOperand(i);
7920 MO->setReg(SrcReg);
7921 V2PhysSCopyInfo &V2SCopyInfo = WaterFalls[UseMI];
7922 V2SCopyInfo.MOs.push_back(MO);
7923 V2SCopyInfo.SGPRs.push_back(DstReg);
7924 V2SPhyCopiesToErase.try_emplace(&Inst, true);
7925 }
7926 }
7927 } else if (I->getOpcode() == AMDGPU::SI_RETURN_TO_EPILOG &&
7928 I->getOperand(0).isReg() &&
7929 I->getOperand(0).getReg() == DstReg) {
7930 createReadFirstLaneFromCopyToPhysReg(MRI, DstReg, Inst);
7931 V2SPhyCopiesToErase.try_emplace(&Inst, true);
7932 } else if (I->readsRegister(DstReg, &RI)) {
7933      // The COPY cannot be erased if any other kind of instruction uses it.
7934 V2SPhyCopiesToErase[&Inst] = false;
7935 }
7936 if (I->findRegisterDefOperand(DstReg, &RI))
7937 break;
7938 }
7939}
7940
7942 SIInstrWorklist &Worklist, MachineDominatorTree *MDT, MachineInstr &Inst,
7944 DenseMap<MachineInstr *, bool> &V2SPhyCopiesToErase) const {
7945
7946  MachineBasicBlock *MBB = Inst.getParent();
7947  if (!MBB)
7948 return;
7949 MachineRegisterInfo &MRI = MBB->getParent()->getRegInfo();
7950 unsigned Opcode = Inst.getOpcode();
7951 unsigned NewOpcode = getVALUOp(Inst);
7952 const DebugLoc &DL = Inst.getDebugLoc();
7953
7954 // Handle some special cases
7955 switch (Opcode) {
7956 default:
7957 break;
7958 case AMDGPU::S_ADD_I32:
7959 case AMDGPU::S_SUB_I32: {
7960 // FIXME: The u32 versions currently selected use the carry.
7961 bool Changed;
7962 MachineBasicBlock *CreatedBBTmp = nullptr;
7963 std::tie(Changed, CreatedBBTmp) = moveScalarAddSub(Worklist, Inst, MDT);
7964 if (Changed)
7965 return;
7966
7967 // Default handling
7968 break;
7969 }
7970
7971 case AMDGPU::S_MUL_U64:
7972 if (ST.hasVMulU64Inst()) {
7973 NewOpcode = AMDGPU::V_MUL_U64_e64;
7974 break;
7975 }
7976    // Split s_mul_u64 into 32-bit vector multiplications.
7977 splitScalarSMulU64(Worklist, Inst, MDT);
7978 Inst.eraseFromParent();
7979 return;
7980
7981 case AMDGPU::S_MUL_U64_U32_PSEUDO:
7982 case AMDGPU::S_MUL_I64_I32_PSEUDO:
7983 // This is a special case of s_mul_u64 where all the operands are either
7984 // zero extended or sign extended.
7985 splitScalarSMulPseudo(Worklist, Inst, MDT);
7986 Inst.eraseFromParent();
7987 return;
7988
7989 case AMDGPU::S_AND_B64:
7990 splitScalar64BitBinaryOp(Worklist, Inst, AMDGPU::S_AND_B32, MDT);
7991 Inst.eraseFromParent();
7992 return;
7993
7994 case AMDGPU::S_OR_B64:
7995 splitScalar64BitBinaryOp(Worklist, Inst, AMDGPU::S_OR_B32, MDT);
7996 Inst.eraseFromParent();
7997 return;
7998
7999 case AMDGPU::S_XOR_B64:
8000 splitScalar64BitBinaryOp(Worklist, Inst, AMDGPU::S_XOR_B32, MDT);
8001 Inst.eraseFromParent();
8002 return;
8003
8004 case AMDGPU::S_NAND_B64:
8005 splitScalar64BitBinaryOp(Worklist, Inst, AMDGPU::S_NAND_B32, MDT);
8006 Inst.eraseFromParent();
8007 return;
8008
8009 case AMDGPU::S_NOR_B64:
8010 splitScalar64BitBinaryOp(Worklist, Inst, AMDGPU::S_NOR_B32, MDT);
8011 Inst.eraseFromParent();
8012 return;
8013
8014 case AMDGPU::S_XNOR_B64:
8015 if (ST.hasDLInsts())
8016 splitScalar64BitBinaryOp(Worklist, Inst, AMDGPU::S_XNOR_B32, MDT);
8017 else
8018 splitScalar64BitXnor(Worklist, Inst, MDT);
8019 Inst.eraseFromParent();
8020 return;
8021
8022 case AMDGPU::S_ANDN2_B64:
8023 splitScalar64BitBinaryOp(Worklist, Inst, AMDGPU::S_ANDN2_B32, MDT);
8024 Inst.eraseFromParent();
8025 return;
8026
8027 case AMDGPU::S_ORN2_B64:
8028 splitScalar64BitBinaryOp(Worklist, Inst, AMDGPU::S_ORN2_B32, MDT);
8029 Inst.eraseFromParent();
8030 return;
8031
8032 case AMDGPU::S_BREV_B64:
8033 splitScalar64BitUnaryOp(Worklist, Inst, AMDGPU::S_BREV_B32, true);
8034 Inst.eraseFromParent();
8035 return;
8036
8037 case AMDGPU::S_NOT_B64:
8038 splitScalar64BitUnaryOp(Worklist, Inst, AMDGPU::S_NOT_B32);
8039 Inst.eraseFromParent();
8040 return;
8041
8042 case AMDGPU::S_BCNT1_I32_B64:
8043 splitScalar64BitBCNT(Worklist, Inst);
8044 Inst.eraseFromParent();
8045 return;
8046
8047 case AMDGPU::S_BFE_I64:
8048 splitScalar64BitBFE(Worklist, Inst);
8049 Inst.eraseFromParent();
8050 return;
8051
8052 case AMDGPU::S_FLBIT_I32_B64:
8053 splitScalar64BitCountOp(Worklist, Inst, AMDGPU::V_FFBH_U32_e32);
8054 Inst.eraseFromParent();
8055 return;
8056 case AMDGPU::S_FF1_I32_B64:
8057 splitScalar64BitCountOp(Worklist, Inst, AMDGPU::V_FFBL_B32_e32);
8058 Inst.eraseFromParent();
8059 return;
8060
8061 case AMDGPU::S_LSHL_B32:
8062 if (ST.hasOnlyRevVALUShifts()) {
8063 NewOpcode = AMDGPU::V_LSHLREV_B32_e64;
8064 swapOperands(Inst);
8065 }
8066 break;
8067 case AMDGPU::S_ASHR_I32:
8068 if (ST.hasOnlyRevVALUShifts()) {
8069 NewOpcode = AMDGPU::V_ASHRREV_I32_e64;
8070 swapOperands(Inst);
8071 }
8072 break;
8073 case AMDGPU::S_LSHR_B32:
8074 if (ST.hasOnlyRevVALUShifts()) {
8075 NewOpcode = AMDGPU::V_LSHRREV_B32_e64;
8076 swapOperands(Inst);
8077 }
8078 break;
8079 case AMDGPU::S_LSHL_B64:
8080 if (ST.hasOnlyRevVALUShifts()) {
8081 NewOpcode = ST.getGeneration() >= AMDGPUSubtarget::GFX12
8082 ? AMDGPU::V_LSHLREV_B64_pseudo_e64
8083 : AMDGPU::V_LSHLREV_B64_e64;
8084 swapOperands(Inst);
8085 }
8086 break;
8087 case AMDGPU::S_ASHR_I64:
8088 if (ST.hasOnlyRevVALUShifts()) {
8089 NewOpcode = AMDGPU::V_ASHRREV_I64_e64;
8090 swapOperands(Inst);
8091 }
8092 break;
8093 case AMDGPU::S_LSHR_B64:
8094 if (ST.hasOnlyRevVALUShifts()) {
8095 NewOpcode = AMDGPU::V_LSHRREV_B64_e64;
8096 swapOperands(Inst);
8097 }
8098 break;
8099
8100 case AMDGPU::S_ABS_I32:
8101 lowerScalarAbs(Worklist, Inst);
8102 Inst.eraseFromParent();
8103 return;
8104
8105 case AMDGPU::S_ABSDIFF_I32:
8106 lowerScalarAbsDiff(Worklist, Inst);
8107 Inst.eraseFromParent();
8108 return;
8109
8110 case AMDGPU::S_CBRANCH_SCC0:
8111 case AMDGPU::S_CBRANCH_SCC1: {
8112 // Clear unused bits of vcc
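    // The condition is ANDed with EXEC into VCC, so lanes that are disabled
    // in EXEC cannot contribute to the vector-condition branch.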
8113 Register CondReg = Inst.getOperand(1).getReg();
8114 bool IsSCC = CondReg == AMDGPU::SCC;
8116 BuildMI(*MBB, Inst, Inst.getDebugLoc(), get(LMC.AndOpc), LMC.VccReg)
8117 .addReg(LMC.ExecReg)
8118 .addReg(IsSCC ? LMC.VccReg : CondReg);
8119 Inst.removeOperand(1);
8120 } break;
8121
8122 case AMDGPU::S_BFE_U64:
8123 case AMDGPU::S_BFM_B64:
8124 llvm_unreachable("Moving this op to VALU not implemented");
8125
8126 case AMDGPU::S_PACK_LL_B32_B16:
8127 case AMDGPU::S_PACK_LH_B32_B16:
8128 case AMDGPU::S_PACK_HL_B32_B16:
8129 case AMDGPU::S_PACK_HH_B32_B16:
8130 movePackToVALU(Worklist, MRI, Inst);
8131 Inst.eraseFromParent();
8132 return;
8133
8134 case AMDGPU::S_XNOR_B32:
8135 lowerScalarXnor(Worklist, Inst);
8136 Inst.eraseFromParent();
8137 return;
8138
8139 case AMDGPU::S_NAND_B32:
8140 splitScalarNotBinop(Worklist, Inst, AMDGPU::S_AND_B32);
8141 Inst.eraseFromParent();
8142 return;
8143
8144 case AMDGPU::S_NOR_B32:
8145 splitScalarNotBinop(Worklist, Inst, AMDGPU::S_OR_B32);
8146 Inst.eraseFromParent();
8147 return;
8148
8149 case AMDGPU::S_ANDN2_B32:
8150 splitScalarBinOpN2(Worklist, Inst, AMDGPU::S_AND_B32);
8151 Inst.eraseFromParent();
8152 return;
8153
8154 case AMDGPU::S_ORN2_B32:
8155 splitScalarBinOpN2(Worklist, Inst, AMDGPU::S_OR_B32);
8156 Inst.eraseFromParent();
8157 return;
8158
8159 // TODO: remove as soon as everything is ready
8160 // to replace VGPR to SGPR copy with V_READFIRSTLANEs.
8161 // S_ADD/SUB_CO_PSEUDO as well as S_UADDO/USUBO_PSEUDO
8162 // can only be selected from the uniform SDNode.
8163 case AMDGPU::S_ADD_CO_PSEUDO:
8164 case AMDGPU::S_SUB_CO_PSEUDO: {
8165 unsigned Opc = (Inst.getOpcode() == AMDGPU::S_ADD_CO_PSEUDO)
8166 ? AMDGPU::V_ADDC_U32_e64
8167 : AMDGPU::V_SUBB_U32_e64;
8168 const auto *CarryRC = RI.getWaveMaskRegClass();
8169
8170 Register CarryInReg = Inst.getOperand(4).getReg();
8171 if (!MRI.constrainRegClass(CarryInReg, CarryRC)) {
8172 Register NewCarryReg = MRI.createVirtualRegister(CarryRC);
8173 BuildMI(*MBB, Inst, Inst.getDebugLoc(), get(AMDGPU::COPY), NewCarryReg)
8174 .addReg(CarryInReg);
8175 }
8176
8177 Register CarryOutReg = Inst.getOperand(1).getReg();
8178
8179 Register DestReg = MRI.createVirtualRegister(RI.getEquivalentVGPRClass(
8180 MRI.getRegClass(Inst.getOperand(0).getReg())));
8181 MachineInstr *CarryOp =
8182 BuildMI(*MBB, &Inst, Inst.getDebugLoc(), get(Opc), DestReg)
8183 .addReg(CarryOutReg, RegState::Define)
8184 .add(Inst.getOperand(2))
8185 .add(Inst.getOperand(3))
8186 .addReg(CarryInReg)
8187 .addImm(0);
8188 legalizeOperands(*CarryOp);
8189 MRI.replaceRegWith(Inst.getOperand(0).getReg(), DestReg);
8190 addUsersToMoveToVALUWorklist(DestReg, MRI, Worklist);
8191 Inst.eraseFromParent();
8192 }
8193 return;
8194 case AMDGPU::S_UADDO_PSEUDO:
8195 case AMDGPU::S_USUBO_PSEUDO: {
8196 MachineOperand &Dest0 = Inst.getOperand(0);
8197 MachineOperand &Dest1 = Inst.getOperand(1);
8198 MachineOperand &Src0 = Inst.getOperand(2);
8199 MachineOperand &Src1 = Inst.getOperand(3);
8200
8201 unsigned Opc = (Inst.getOpcode() == AMDGPU::S_UADDO_PSEUDO)
8202 ? AMDGPU::V_ADD_CO_U32_e64
8203 : AMDGPU::V_SUB_CO_U32_e64;
8204 const TargetRegisterClass *NewRC =
8205 RI.getEquivalentVGPRClass(MRI.getRegClass(Dest0.getReg()));
8206 Register DestReg = MRI.createVirtualRegister(NewRC);
8207 MachineInstr *NewInstr = BuildMI(*MBB, &Inst, DL, get(Opc), DestReg)
8208 .addReg(Dest1.getReg(), RegState::Define)
8209 .add(Src0)
8210 .add(Src1)
8211 .addImm(0); // clamp bit
8212
8213 legalizeOperands(*NewInstr, MDT);
8214 MRI.replaceRegWith(Dest0.getReg(), DestReg);
8215 addUsersToMoveToVALUWorklist(DestReg, MRI, Worklist);
8216 Inst.eraseFromParent();
8217 }
8218 return;
8219 case AMDGPU::S_LSHL1_ADD_U32:
8220 case AMDGPU::S_LSHL2_ADD_U32:
8221 case AMDGPU::S_LSHL3_ADD_U32:
8222 case AMDGPU::S_LSHL4_ADD_U32: {
8223 MachineOperand &Dest = Inst.getOperand(0);
8224 MachineOperand &Src0 = Inst.getOperand(1);
8225 MachineOperand &Src1 = Inst.getOperand(2);
8226 unsigned ShiftAmt = (Opcode == AMDGPU::S_LSHL1_ADD_U32 ? 1
8227 : Opcode == AMDGPU::S_LSHL2_ADD_U32 ? 2
8228 : Opcode == AMDGPU::S_LSHL3_ADD_U32 ? 3
8229 : 4);
8230
8231 const TargetRegisterClass *NewRC =
8232 RI.getEquivalentVGPRClass(MRI.getRegClass(Dest.getReg()));
8233 Register DestReg = MRI.createVirtualRegister(NewRC);
8234 MachineInstr *NewInstr =
8235 BuildMI(*MBB, &Inst, DL, get(AMDGPU::V_LSHL_ADD_U32_e64), DestReg)
8236 .add(Src0)
8237 .addImm(ShiftAmt)
8238 .add(Src1);
8239
8240 legalizeOperands(*NewInstr, MDT);
8241 MRI.replaceRegWith(Dest.getReg(), DestReg);
8242 addUsersToMoveToVALUWorklist(DestReg, MRI, Worklist);
8243 Inst.eraseFromParent();
8244 }
8245 return;
8246 case AMDGPU::S_CSELECT_B32:
8247 case AMDGPU::S_CSELECT_B64:
8248 lowerSelect(Worklist, Inst, MDT);
8249 Inst.eraseFromParent();
8250 return;
8251 case AMDGPU::S_CMP_EQ_I32:
8252 case AMDGPU::S_CMP_LG_I32:
8253 case AMDGPU::S_CMP_GT_I32:
8254 case AMDGPU::S_CMP_GE_I32:
8255 case AMDGPU::S_CMP_LT_I32:
8256 case AMDGPU::S_CMP_LE_I32:
8257 case AMDGPU::S_CMP_EQ_U32:
8258 case AMDGPU::S_CMP_LG_U32:
8259 case AMDGPU::S_CMP_GT_U32:
8260 case AMDGPU::S_CMP_GE_U32:
8261 case AMDGPU::S_CMP_LT_U32:
8262 case AMDGPU::S_CMP_LE_U32:
8263 case AMDGPU::S_CMP_EQ_U64:
8264 case AMDGPU::S_CMP_LG_U64:
8265 case AMDGPU::S_CMP_LT_F32:
8266 case AMDGPU::S_CMP_EQ_F32:
8267 case AMDGPU::S_CMP_LE_F32:
8268 case AMDGPU::S_CMP_GT_F32:
8269 case AMDGPU::S_CMP_LG_F32:
8270 case AMDGPU::S_CMP_GE_F32:
8271 case AMDGPU::S_CMP_O_F32:
8272 case AMDGPU::S_CMP_U_F32:
8273 case AMDGPU::S_CMP_NGE_F32:
8274 case AMDGPU::S_CMP_NLG_F32:
8275 case AMDGPU::S_CMP_NGT_F32:
8276 case AMDGPU::S_CMP_NLE_F32:
8277 case AMDGPU::S_CMP_NEQ_F32:
8278 case AMDGPU::S_CMP_NLT_F32: {
8279 Register CondReg = MRI.createVirtualRegister(RI.getWaveMaskRegClass());
8280 auto NewInstr =
8281 BuildMI(*MBB, Inst, Inst.getDebugLoc(), get(NewOpcode), CondReg)
8282 .setMIFlags(Inst.getFlags());
8283 if (AMDGPU::getNamedOperandIdx(NewOpcode, AMDGPU::OpName::src0_modifiers) >=
8284 0) {
8285 NewInstr
8286 .addImm(0) // src0_modifiers
8287 .add(Inst.getOperand(0)) // src0
8288 .addImm(0) // src1_modifiers
8289 .add(Inst.getOperand(1)) // src1
8290 .addImm(0); // clamp
8291 } else {
8292 NewInstr.add(Inst.getOperand(0)).add(Inst.getOperand(1));
8293 }
8294 legalizeOperands(*NewInstr, MDT);
8295 int SCCIdx = Inst.findRegisterDefOperandIdx(AMDGPU::SCC, /*TRI=*/nullptr);
8296 const MachineOperand &SCCOp = Inst.getOperand(SCCIdx);
8297 addSCCDefUsersToVALUWorklist(SCCOp, Inst, Worklist, CondReg);
8298 Inst.eraseFromParent();
8299 return;
8300 }
8301 case AMDGPU::S_CMP_LT_F16:
8302 case AMDGPU::S_CMP_EQ_F16:
8303 case AMDGPU::S_CMP_LE_F16:
8304 case AMDGPU::S_CMP_GT_F16:
8305 case AMDGPU::S_CMP_LG_F16:
8306 case AMDGPU::S_CMP_GE_F16:
8307 case AMDGPU::S_CMP_O_F16:
8308 case AMDGPU::S_CMP_U_F16:
8309 case AMDGPU::S_CMP_NGE_F16:
8310 case AMDGPU::S_CMP_NLG_F16:
8311 case AMDGPU::S_CMP_NGT_F16:
8312 case AMDGPU::S_CMP_NLE_F16:
8313 case AMDGPU::S_CMP_NEQ_F16:
8314 case AMDGPU::S_CMP_NLT_F16: {
8315 Register CondReg = MRI.createVirtualRegister(RI.getWaveMaskRegClass());
8316 auto NewInstr =
8317 BuildMI(*MBB, Inst, Inst.getDebugLoc(), get(NewOpcode), CondReg)
8318 .setMIFlags(Inst.getFlags());
8319 if (AMDGPU::hasNamedOperand(NewOpcode, AMDGPU::OpName::src0_modifiers)) {
8320 NewInstr
8321 .addImm(0) // src0_modifiers
8322 .add(Inst.getOperand(0)) // src0
8323 .addImm(0) // src1_modifiers
8324 .add(Inst.getOperand(1)) // src1
8325 .addImm(0); // clamp
8326 if (AMDGPU::hasNamedOperand(NewOpcode, AMDGPU::OpName::op_sel))
8327 NewInstr.addImm(0); // op_sel0
8328 } else {
8329 NewInstr
8330 .add(Inst.getOperand(0))
8331 .add(Inst.getOperand(1));
8332 }
8333 legalizeOperandsVALUt16(*NewInstr, MRI);
8334 legalizeOperands(*NewInstr, MDT);
8335 int SCCIdx = Inst.findRegisterDefOperandIdx(AMDGPU::SCC, /*TRI=*/nullptr);
8336 const MachineOperand &SCCOp = Inst.getOperand(SCCIdx);
8337 addSCCDefUsersToVALUWorklist(SCCOp, Inst, Worklist, CondReg);
8338 Inst.eraseFromParent();
8339 return;
8340 }
8341 case AMDGPU::S_CVT_HI_F32_F16: {
8342 Register TmpReg = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
8343 Register NewDst = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
8344 if (ST.useRealTrue16Insts()) {
8345 BuildMI(*MBB, Inst, DL, get(AMDGPU::COPY), TmpReg)
8346 .add(Inst.getOperand(1));
8347 BuildMI(*MBB, Inst, DL, get(NewOpcode), NewDst)
8348 .addImm(0) // src0_modifiers
8349 .addReg(TmpReg, {}, AMDGPU::hi16)
8350 .addImm(0) // clamp
8351 .addImm(0) // omod
8352 .addImm(0); // op_sel0
8353 } else {
8354 BuildMI(*MBB, Inst, DL, get(AMDGPU::V_LSHRREV_B32_e64), TmpReg)
8355 .addImm(16)
8356 .add(Inst.getOperand(1));
8357 BuildMI(*MBB, Inst, DL, get(NewOpcode), NewDst)
8358 .addImm(0) // src0_modifiers
8359 .addReg(TmpReg)
8360 .addImm(0) // clamp
8361 .addImm(0); // omod
8362 }
8363
8364 MRI.replaceRegWith(Inst.getOperand(0).getReg(), NewDst);
8365 addUsersToMoveToVALUWorklist(NewDst, MRI, Worklist);
8366 Inst.eraseFromParent();
8367 return;
8368 }
8369 case AMDGPU::S_MINIMUM_F32:
8370 case AMDGPU::S_MAXIMUM_F32: {
8371 Register NewDst = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
8372 MachineInstr *NewInstr = BuildMI(*MBB, Inst, DL, get(NewOpcode), NewDst)
8373 .addImm(0) // src0_modifiers
8374 .add(Inst.getOperand(1))
8375 .addImm(0) // src1_modifiers
8376 .add(Inst.getOperand(2))
8377 .addImm(0) // clamp
8378 .addImm(0); // omod
8379 MRI.replaceRegWith(Inst.getOperand(0).getReg(), NewDst);
8380
8381 legalizeOperands(*NewInstr, MDT);
8382 addUsersToMoveToVALUWorklist(NewDst, MRI, Worklist);
8383 Inst.eraseFromParent();
8384 return;
8385 }
8386 case AMDGPU::S_MINIMUM_F16:
8387 case AMDGPU::S_MAXIMUM_F16: {
8388 Register NewDst = MRI.createVirtualRegister(ST.useRealTrue16Insts()
8389 ? &AMDGPU::VGPR_16RegClass
8390 : &AMDGPU::VGPR_32RegClass);
8391 MachineInstr *NewInstr = BuildMI(*MBB, Inst, DL, get(NewOpcode), NewDst)
8392 .addImm(0) // src0_modifiers
8393 .add(Inst.getOperand(1))
8394 .addImm(0) // src1_modifiers
8395 .add(Inst.getOperand(2))
8396 .addImm(0) // clamp
8397 .addImm(0) // omod
8398 .addImm(0); // opsel0
8399 MRI.replaceRegWith(Inst.getOperand(0).getReg(), NewDst);
8400 legalizeOperandsVALUt16(*NewInstr, MRI);
8401 legalizeOperands(*NewInstr, MDT);
8402 addUsersToMoveToVALUWorklist(NewDst, MRI, Worklist);
8403 Inst.eraseFromParent();
8404 return;
8405 }
8406 case AMDGPU::V_S_EXP_F16_e64:
8407 case AMDGPU::V_S_LOG_F16_e64:
8408 case AMDGPU::V_S_RCP_F16_e64:
8409 case AMDGPU::V_S_RSQ_F16_e64:
8410 case AMDGPU::V_S_SQRT_F16_e64: {
8411 Register NewDst = MRI.createVirtualRegister(ST.useRealTrue16Insts()
8412 ? &AMDGPU::VGPR_16RegClass
8413 : &AMDGPU::VGPR_32RegClass);
8414 auto NewInstr = BuildMI(*MBB, Inst, DL, get(NewOpcode), NewDst)
8415 .add(Inst.getOperand(1)) // src0_modifiers
8416 .add(Inst.getOperand(2))
8417 .add(Inst.getOperand(3)) // clamp
8418 .add(Inst.getOperand(4)) // omod
8419 .setMIFlags(Inst.getFlags());
8420 if (AMDGPU::hasNamedOperand(NewOpcode, AMDGPU::OpName::op_sel))
8421 NewInstr.addImm(0); // opsel0
8422 MRI.replaceRegWith(Inst.getOperand(0).getReg(), NewDst);
8423 legalizeOperandsVALUt16(*NewInstr, MRI);
8424 legalizeOperands(*NewInstr, MDT);
8425 addUsersToMoveToVALUWorklist(NewDst, MRI, Worklist);
8426 Inst.eraseFromParent();
8427 return;
8428 }
8429 }
8430
8431 if (NewOpcode == AMDGPU::INSTRUCTION_LIST_END) {
8432 // We cannot move this instruction to the VALU, so we should try to
8433 // legalize its operands instead.
8434 legalizeOperands(Inst, MDT);
8435 return;
8436 }
8437 // Handle converting generic instructions like COPY-to-SGPR into
8438 // COPY-to-VGPR.
8439 if (NewOpcode == Opcode) {
8440 Register DstReg = Inst.getOperand(0).getReg();
8441 const TargetRegisterClass *NewDstRC = getDestEquivalentVGPRClass(Inst);
8442
8443 if (Inst.isCopy() && DstReg.isPhysical() &&
8444 Inst.getOperand(1).getReg().isVirtual()) {
8445 handleCopyToPhysHelper(Worklist, DstReg, Inst, MRI, WaterFalls,
8446 V2SPhyCopiesToErase);
8447 return;
8448 }
8449
8450 if (Inst.isCopy() && Inst.getOperand(1).getReg().isVirtual()) {
8451 Register NewDstReg = Inst.getOperand(1).getReg();
8452 const TargetRegisterClass *SrcRC = RI.getRegClassForReg(MRI, NewDstReg);
8453 if (const TargetRegisterClass *CommonRC =
8454 RI.getCommonSubClass(NewDstRC, SrcRC)) {
8455 // Instead of creating a copy where src and dst are the same register
8456 // class, we just replace all uses of dst with src. These kinds of
8457 // copies interfere with the heuristics MachineSink uses to decide
8458 // whether or not to split a critical edge, since the pass assumes
8459 // that copies will end up as machine instructions and not be
8460 // eliminated.
8461 addUsersToMoveToVALUWorklist(DstReg, MRI, Worklist);
8462 MRI.replaceRegWith(DstReg, NewDstReg);
8463 MRI.clearKillFlags(NewDstReg);
8464 Inst.getOperand(0).setReg(DstReg);
8465
8466 if (!MRI.constrainRegClass(NewDstReg, CommonRC))
8467 llvm_unreachable("failed to constrain register");
8468
8469 Inst.eraseFromParent();
8470
8471 for (MachineOperand &UseMO :
8472 make_early_inc_range(MRI.use_operands(NewDstReg))) {
8473 MachineInstr &UseMI = *UseMO.getParent();
8474
8475 // Legalize t16 operands since replaceReg is called after
8476 // addUsersToVALU.
8477 legalizeOperandsVALUt16(UseMI, MRI);
8478
8479 unsigned OpIdx = UseMI.getOperandNo(&UseMO);
8480 if (const TargetRegisterClass *OpRC =
8481 getRegClass(UseMI.getDesc(), OpIdx))
8482 MRI.constrainRegClass(NewDstReg, OpRC);
8483 }
8484
8485 return;
8486 }
8487 }
8488
8489 // If this is a v2s copy between a 16-bit and a 32-bit register,
8490 // replace the vgpr copy with a reg_sequence/extract_subreg.
8491 // This can be removed once sgpr16 is in place.
8492 if (ST.useRealTrue16Insts() && Inst.isCopy() &&
8493 Inst.getOperand(1).getReg().isVirtual() &&
8494 RI.isVGPR(MRI, Inst.getOperand(1).getReg())) {
8495 const TargetRegisterClass *SrcRegRC = getOpRegClass(Inst, 1);
8496 if (RI.getMatchingSuperRegClass(NewDstRC, SrcRegRC, AMDGPU::lo16)) {
8497 Register NewDstReg = MRI.createVirtualRegister(NewDstRC);
8498 Register Undef = MRI.createVirtualRegister(&AMDGPU::VGPR_16RegClass);
8499 BuildMI(*Inst.getParent(), &Inst, Inst.getDebugLoc(),
8500 get(AMDGPU::IMPLICIT_DEF), Undef);
8501 BuildMI(*Inst.getParent(), &Inst, Inst.getDebugLoc(),
8502 get(AMDGPU::REG_SEQUENCE), NewDstReg)
8503 .addReg(Inst.getOperand(1).getReg())
8504 .addImm(AMDGPU::lo16)
8505 .addReg(Undef)
8506 .addImm(AMDGPU::hi16);
8507 Inst.eraseFromParent();
8508 MRI.replaceRegWith(DstReg, NewDstReg);
8509 addUsersToMoveToVALUWorklist(NewDstReg, MRI, Worklist);
8510 return;
8511 } else if (RI.getMatchingSuperRegClass(SrcRegRC, NewDstRC,
8512 AMDGPU::lo16)) {
8513 Inst.getOperand(1).setSubReg(AMDGPU::lo16);
8514 Register NewDstReg = MRI.createVirtualRegister(NewDstRC);
8515 MRI.replaceRegWith(DstReg, NewDstReg);
8516 addUsersToMoveToVALUWorklist(NewDstReg, MRI, Worklist);
8517 return;
8518 }
8519 }
8520
8521 Register NewDstReg = MRI.createVirtualRegister(NewDstRC);
8522 MRI.replaceRegWith(DstReg, NewDstReg);
8523 legalizeOperands(Inst, MDT);
8524 addUsersToMoveToVALUWorklist(NewDstReg, MRI, Worklist);
8525 return;
8526 }
8527
8528 // Use the new VALU Opcode.
8529 auto NewInstr = BuildMI(*MBB, Inst, Inst.getDebugLoc(), get(NewOpcode))
8530 .setMIFlags(Inst.getFlags());
8531 if (isVOP3(NewOpcode) && !isVOP3(Opcode)) {
8532 // Intersperse VOP3 modifiers among the SALU operands.
8533 NewInstr->addOperand(Inst.getOperand(0));
8534 if (AMDGPU::getNamedOperandIdx(NewOpcode,
8535 AMDGPU::OpName::src0_modifiers) >= 0)
8536 NewInstr.addImm(0);
8537 if (AMDGPU::hasNamedOperand(NewOpcode, AMDGPU::OpName::src0)) {
8538 const MachineOperand &Src = Inst.getOperand(1);
8539 NewInstr->addOperand(Src);
8540 }
8541
8542 if (Opcode == AMDGPU::S_SEXT_I32_I8 || Opcode == AMDGPU::S_SEXT_I32_I16) {
8543 // We are converting these to a BFE, so we need to add the missing
8544 // operands for the size and offset.
8545 unsigned Size = (Opcode == AMDGPU::S_SEXT_I32_I8) ? 8 : 16;
8546 NewInstr.addImm(0);
8547 NewInstr.addImm(Size);
8548 } else if (Opcode == AMDGPU::S_BCNT1_I32_B32) {
8549 // The VALU version adds the second operand to the result, so insert an
8550 // extra 0 operand.
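// For illustration: V_BCNT_U32_B32 computes popcount(src0) + src1, so a zero
// src1 reproduces the plain S_BCNT1_I32_B32 result.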
8551 NewInstr.addImm(0);
8552 } else if (Opcode == AMDGPU::S_BFE_I32 || Opcode == AMDGPU::S_BFE_U32) {
8553 const MachineOperand &OffsetWidthOp = Inst.getOperand(2);
8554 // If we need to move this to VGPRs, we need to unpack the second
8555 // operand back into the 2 separate ones for bit offset and width.
8556 assert(OffsetWidthOp.isImm() &&
8557 "Scalar BFE is only implemented for constant width and offset");
8558 uint32_t Imm = OffsetWidthOp.getImm();
8559
8560 uint32_t Offset = Imm & 0x3f; // Extract bits [5:0].
8561 uint32_t BitWidth = (Imm & 0x7f0000) >> 16; // Extract bits [22:16].
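// For example, Imm = 0x00100008 encodes Offset = 8 and BitWidth = 16,
// i.e. the BFE extracts bits [23:8] of the source.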
8562 NewInstr.addImm(Offset);
8563 NewInstr.addImm(BitWidth);
8564 } else {
8565 if (AMDGPU::getNamedOperandIdx(NewOpcode,
8566 AMDGPU::OpName::src1_modifiers) >= 0)
8567 NewInstr.addImm(0);
8568 if (AMDGPU::getNamedOperandIdx(NewOpcode, AMDGPU::OpName::src1) >= 0)
8569 NewInstr->addOperand(Inst.getOperand(2));
8570 if (AMDGPU::getNamedOperandIdx(NewOpcode,
8571 AMDGPU::OpName::src2_modifiers) >= 0)
8572 NewInstr.addImm(0);
8573 if (AMDGPU::getNamedOperandIdx(NewOpcode, AMDGPU::OpName::src2) >= 0)
8574 NewInstr->addOperand(Inst.getOperand(3));
8575 if (AMDGPU::getNamedOperandIdx(NewOpcode, AMDGPU::OpName::clamp) >= 0)
8576 NewInstr.addImm(0);
8577 if (AMDGPU::getNamedOperandIdx(NewOpcode, AMDGPU::OpName::omod) >= 0)
8578 NewInstr.addImm(0);
8579 if (AMDGPU::getNamedOperandIdx(NewOpcode, AMDGPU::OpName::op_sel) >= 0)
8580 NewInstr.addImm(0);
8581 }
8582 } else {
8583 // Just copy the SALU operands.
8584 for (const MachineOperand &Op : Inst.explicit_operands())
8585 NewInstr->addOperand(Op);
8586 }
8587
8588 // Remove any references to SCC. Vector instructions can't read from it, and
8589 // we're just about to add the implicit use / defs of VCC, and we don't want
8590 // both.
8591 for (MachineOperand &Op : Inst.implicit_operands()) {
8592 if (Op.getReg() == AMDGPU::SCC) {
8593 // Only propagate through live-def of SCC.
8594 if (Op.isDef() && !Op.isDead())
8595 addSCCDefUsersToVALUWorklist(Op, Inst, Worklist);
8596 if (Op.isUse())
8597 addSCCDefsToVALUWorklist(NewInstr, Worklist);
8598 }
8599 }
8600 Inst.eraseFromParent();
8601 Register NewDstReg;
8602 if (NewInstr->getOperand(0).isReg() && NewInstr->getOperand(0).isDef()) {
8603 Register DstReg = NewInstr->getOperand(0).getReg();
8604 assert(DstReg.isVirtual());
8605 // Update the destination register class.
8606 const TargetRegisterClass *NewDstRC = getDestEquivalentVGPRClass(*NewInstr);
8607 assert(NewDstRC);
8608 NewDstReg = MRI.createVirtualRegister(NewDstRC);
8609 MRI.replaceRegWith(DstReg, NewDstReg);
8610 }
8611 fixImplicitOperands(*NewInstr);
8612
8613 legalizeOperandsVALUt16(*NewInstr, MRI);
8614
8615 // Legalize the operands
8616 legalizeOperands(*NewInstr, MDT);
8617 if (NewDstReg)
8618 addUsersToMoveToVALUWorklist(NewDstReg, MRI, Worklist);
8619}
8620
8621// Add/sub require special handling to deal with carry outs.
8622std::pair<bool, MachineBasicBlock *>
8623SIInstrInfo::moveScalarAddSub(SIInstrWorklist &Worklist, MachineInstr &Inst,
8624 MachineDominatorTree *MDT) const {
8625 if (ST.hasAddNoCarryInsts()) {
8626 // Assume there is no user of scc since we don't select this in that case.
8627 // Since scc isn't used, it doesn't really matter if the i32 or u32 variant
8628 // is used.
8629
8630 MachineBasicBlock &MBB = *Inst.getParent();
8631 MachineRegisterInfo &MRI = MBB.getParent()->getRegInfo();
8632
8633 Register OldDstReg = Inst.getOperand(0).getReg();
8634 Register ResultReg = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
8635
8636 unsigned Opc = Inst.getOpcode();
8637 assert(Opc == AMDGPU::S_ADD_I32 || Opc == AMDGPU::S_SUB_I32);
8638
8639 unsigned NewOpc = Opc == AMDGPU::S_ADD_I32 ?
8640 AMDGPU::V_ADD_U32_e64 : AMDGPU::V_SUB_U32_e64;
8641
8642 assert(Inst.getOperand(3).getReg() == AMDGPU::SCC);
8643 Inst.removeOperand(3);
8644
8645 Inst.setDesc(get(NewOpc));
8646 Inst.addOperand(MachineOperand::CreateImm(0)); // clamp bit
8647 Inst.addImplicitDefUseOperands(*MBB.getParent());
8648 MRI.replaceRegWith(OldDstReg, ResultReg);
8649 MachineBasicBlock *NewBB = legalizeOperands(Inst, MDT);
8650
8651 addUsersToMoveToVALUWorklist(ResultReg, MRI, Worklist);
8652 return std::pair(true, NewBB);
8653 }
8654
8655 return std::pair(false, nullptr);
8656}
8657
8658void SIInstrInfo::lowerSelect(SIInstrWorklist &Worklist, MachineInstr &Inst,
8659 MachineDominatorTree *MDT) const {
8660
8661 MachineBasicBlock &MBB = *Inst.getParent();
8662 MachineRegisterInfo &MRI = MBB.getParent()->getRegInfo();
8663 MachineBasicBlock::iterator MII = Inst;
8664 const DebugLoc &DL = Inst.getDebugLoc();
8665
8666 MachineOperand &Dest = Inst.getOperand(0);
8667 MachineOperand &Src0 = Inst.getOperand(1);
8668 MachineOperand &Src1 = Inst.getOperand(2);
8669 MachineOperand &Cond = Inst.getOperand(3);
8670
8671 Register CondReg = Cond.getReg();
8672 bool IsSCC = (CondReg == AMDGPU::SCC);
8673
8674 // If this is a trivial select where the condition is effectively not SCC
8675 // (CondReg is a source of copy to SCC), then the select is semantically
8676 // equivalent to copying CondReg. Hence, there is no need to create
8677 // V_CNDMASK, we can just use that and bail out.
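// For example, selecting between all-ones (-1) and zero under a lane-mask
// condition simply reproduces the mask, so the whole select folds to a copy
// of CondReg.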
8678 if (!IsSCC && Src0.isImm() && (Src0.getImm() == -1) && Src1.isImm() &&
8679 (Src1.getImm() == 0)) {
8680 MRI.replaceRegWith(Dest.getReg(), CondReg);
8681 return;
8682 }
8683
8684 Register NewCondReg = CondReg;
8685 if (IsSCC) {
8686 const TargetRegisterClass *TC = RI.getWaveMaskRegClass();
8687 NewCondReg = MRI.createVirtualRegister(TC);
8688
8689 // Now look for the closest SCC def; if it is a copy, replace CondReg
8690 // with the COPY's source register.
8691 bool CopyFound = false;
8692 for (MachineInstr &CandI :
8693 make_range(std::next(MachineBasicBlock::reverse_iterator(Inst)),
8694 Inst.getParent()->rend())) {
8695 if (CandI.findRegisterDefOperandIdx(AMDGPU::SCC, &RI, false, false) !=
8696 -1) {
8697 if (CandI.isCopy() && CandI.getOperand(0).getReg() == AMDGPU::SCC) {
8698 BuildMI(MBB, MII, DL, get(AMDGPU::COPY), NewCondReg)
8699 .addReg(CandI.getOperand(1).getReg());
8700 CopyFound = true;
8701 }
8702 break;
8703 }
8704 }
8705 if (!CopyFound) {
8706 // SCC def is not a copy
8707 // Insert a trivial select instead of creating a copy, because a copy from
8708 // SCC would semantically mean just copying a single bit, but we may need
8709 // the result to be a vector condition mask that needs preserving.
8710 unsigned Opcode =
8711 ST.isWave64() ? AMDGPU::S_CSELECT_B64 : AMDGPU::S_CSELECT_B32;
8712 auto NewSelect =
8713 BuildMI(MBB, MII, DL, get(Opcode), NewCondReg).addImm(-1).addImm(0);
8714 NewSelect->getOperand(3).setIsUndef(Cond.isUndef());
8715 }
8716 }
8717
8718 Register NewDestReg = MRI.createVirtualRegister(
8719 RI.getEquivalentVGPRClass(MRI.getRegClass(Dest.getReg())));
8720 MachineInstr *NewInst;
8721 if (Inst.getOpcode() == AMDGPU::S_CSELECT_B32) {
8722 NewInst = BuildMI(MBB, MII, DL, get(AMDGPU::V_CNDMASK_B32_e64), NewDestReg)
8723 .addImm(0)
8724 .add(Src1) // False
8725 .addImm(0)
8726 .add(Src0) // True
8727 .addReg(NewCondReg);
8728 } else {
8729 NewInst =
8730 BuildMI(MBB, MII, DL, get(AMDGPU::V_CNDMASK_B64_PSEUDO), NewDestReg)
8731 .add(Src1) // False
8732 .add(Src0) // True
8733 .addReg(NewCondReg);
8734 }
8735 MRI.replaceRegWith(Dest.getReg(), NewDestReg);
8736 legalizeOperands(*NewInst, MDT);
8737 addUsersToMoveToVALUWorklist(NewDestReg, MRI, Worklist);
8738}
8739
8740void SIInstrInfo::lowerScalarAbs(SIInstrWorklist &Worklist,
8741 MachineInstr &Inst) const {
8742 MachineBasicBlock &MBB = *Inst.getParent();
8743 MachineRegisterInfo &MRI = MBB.getParent()->getRegInfo();
8744 MachineBasicBlock::iterator MII = Inst;
8745 const DebugLoc &DL = Inst.getDebugLoc();
8746
8747 MachineOperand &Dest = Inst.getOperand(0);
8748 MachineOperand &Src = Inst.getOperand(1);
8749 Register TmpReg = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
8750 Register ResultReg = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
8751
8752 unsigned SubOp = ST.hasAddNoCarryInsts() ? AMDGPU::V_SUB_U32_e32
8753 : AMDGPU::V_SUB_CO_U32_e32;
8754
8755 BuildMI(MBB, MII, DL, get(SubOp), TmpReg)
8756 .addImm(0)
8757 .addReg(Src.getReg());
8758
8759 BuildMI(MBB, MII, DL, get(AMDGPU::V_MAX_I32_e64), ResultReg)
8760 .addReg(Src.getReg())
8761 .addReg(TmpReg);
8762
8763 MRI.replaceRegWith(Dest.getReg(), ResultReg);
8764 addUsersToMoveToVALUWorklist(ResultReg, MRI, Worklist);
8765}
8766
8767void SIInstrInfo::lowerScalarAbsDiff(SIInstrWorklist &Worklist,
8768 MachineInstr &Inst) const {
8769 MachineBasicBlock &MBB = *Inst.getParent();
8770 MachineRegisterInfo &MRI = MBB.getParent()->getRegInfo();
8771 MachineBasicBlock::iterator MII = Inst;
8772 const DebugLoc &DL = Inst.getDebugLoc();
8773
8774 MachineOperand &Dest = Inst.getOperand(0);
8775 MachineOperand &Src1 = Inst.getOperand(1);
8776 MachineOperand &Src2 = Inst.getOperand(2);
8777 Register SubResultReg = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
8778 Register TmpReg = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
8779 Register ResultReg = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
8780
8781 unsigned SubOp = ST.hasAddNoCarryInsts() ? AMDGPU::V_SUB_U32_e32
8782 : AMDGPU::V_SUB_CO_U32_e32;
8783
8784 BuildMI(MBB, MII, DL, get(SubOp), SubResultReg)
8785 .addReg(Src1.getReg())
8786 .addReg(Src2.getReg());
8787
8788 BuildMI(MBB, MII, DL, get(SubOp), TmpReg).addImm(0).addReg(SubResultReg);
8789
8790 BuildMI(MBB, MII, DL, get(AMDGPU::V_MAX_I32_e64), ResultReg)
8791 .addReg(SubResultReg)
8792 .addReg(TmpReg);
8793
8794 MRI.replaceRegWith(Dest.getReg(), ResultReg);
8795 addUsersToMoveToVALUWorklist(ResultReg, MRI, Worklist);
8796}
8797
8798void SIInstrInfo::lowerScalarXnor(SIInstrWorklist &Worklist,
8799 MachineInstr &Inst) const {
8800 MachineBasicBlock &MBB = *Inst.getParent();
8801 MachineRegisterInfo &MRI = MBB.getParent()->getRegInfo();
8802 MachineBasicBlock::iterator MII = Inst;
8803 const DebugLoc &DL = Inst.getDebugLoc();
8804
8805 MachineOperand &Dest = Inst.getOperand(0);
8806 MachineOperand &Src0 = Inst.getOperand(1);
8807 MachineOperand &Src1 = Inst.getOperand(2);
8808
8809 if (ST.hasDLInsts()) {
8810 Register NewDest = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
8811 legalizeGenericOperand(MBB, MII, &AMDGPU::VGPR_32RegClass, Src0, MRI, DL);
8812 legalizeGenericOperand(MBB, MII, &AMDGPU::VGPR_32RegClass, Src1, MRI, DL);
8813
8814 BuildMI(MBB, MII, DL, get(AMDGPU::V_XNOR_B32_e64), NewDest)
8815 .add(Src0)
8816 .add(Src1);
8817
8818 MRI.replaceRegWith(Dest.getReg(), NewDest);
8819 addUsersToMoveToVALUWorklist(NewDest, MRI, Worklist);
8820 } else {
8821 // Using the identity !(x ^ y) == (!x ^ y) == (x ^ !y), we can
8822 // invert either source and then perform the XOR. If either source is a
8823 // scalar register, then we can leave the inversion on the scalar unit to
8824 // achieve a better distribution of scalar and vector instructions.
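// Quick single-bit check of the identity: !(1 ^ 0) == 0 == (!1) ^ 0 and
// !(1 ^ 1) == 1 == (!1) ^ 1, so inverting one operand before the XOR gives
// the same bitwise result.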
8825 bool Src0IsSGPR = Src0.isReg() &&
8826 RI.isSGPRClass(MRI.getRegClass(Src0.getReg()));
8827 bool Src1IsSGPR = Src1.isReg() &&
8828 RI.isSGPRClass(MRI.getRegClass(Src1.getReg()));
8829 MachineInstr *Xor;
8830 Register Temp = MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass);
8831 Register NewDest = MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass);
8832
8833 // Build a pair of scalar instructions and add them to the work list.
8834 // The next iteration over the work list will lower these to the vector
8835 // unit as necessary.
8836 if (Src0IsSGPR) {
8837 BuildMI(MBB, MII, DL, get(AMDGPU::S_NOT_B32), Temp).add(Src0);
8838 Xor = BuildMI(MBB, MII, DL, get(AMDGPU::S_XOR_B32), NewDest)
8839 .addReg(Temp)
8840 .add(Src1);
8841 } else if (Src1IsSGPR) {
8842 BuildMI(MBB, MII, DL, get(AMDGPU::S_NOT_B32), Temp).add(Src1);
8843 Xor = BuildMI(MBB, MII, DL, get(AMDGPU::S_XOR_B32), NewDest)
8844 .add(Src0)
8845 .addReg(Temp);
8846 } else {
8847 Xor = BuildMI(MBB, MII, DL, get(AMDGPU::S_XOR_B32), Temp)
8848 .add(Src0)
8849 .add(Src1);
8850 MachineInstr *Not =
8851 BuildMI(MBB, MII, DL, get(AMDGPU::S_NOT_B32), NewDest).addReg(Temp);
8852 Worklist.insert(Not);
8853 }
8854
8855 MRI.replaceRegWith(Dest.getReg(), NewDest);
8856
8857 Worklist.insert(Xor);
8858
8859 addUsersToMoveToVALUWorklist(NewDest, MRI, Worklist);
8860 }
8861}
8862
8863void SIInstrInfo::splitScalarNotBinop(SIInstrWorklist &Worklist,
8864 MachineInstr &Inst,
8865 unsigned Opcode) const {
8866 MachineBasicBlock &MBB = *Inst.getParent();
8867 MachineRegisterInfo &MRI = MBB.getParent()->getRegInfo();
8868 MachineBasicBlock::iterator MII = Inst;
8869 const DebugLoc &DL = Inst.getDebugLoc();
8870
8871 MachineOperand &Dest = Inst.getOperand(0);
8872 MachineOperand &Src0 = Inst.getOperand(1);
8873 MachineOperand &Src1 = Inst.getOperand(2);
8874
8875 Register NewDest = MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass);
8876 Register Interm = MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass);
8877
8878 MachineInstr &Op = *BuildMI(MBB, MII, DL, get(Opcode), Interm)
8879 .add(Src0)
8880 .add(Src1);
8881
8882 MachineInstr &Not = *BuildMI(MBB, MII, DL, get(AMDGPU::S_NOT_B32), NewDest)
8883 .addReg(Interm);
8884
8885 Worklist.insert(&Op);
8886 Worklist.insert(&Not);
8887
8888 MRI.replaceRegWith(Dest.getReg(), NewDest);
8889 addUsersToMoveToVALUWorklist(NewDest, MRI, Worklist);
8890}
8891
8892void SIInstrInfo::splitScalarBinOpN2(SIInstrWorklist &Worklist,
8893 MachineInstr &Inst,
8894 unsigned Opcode) const {
8895 MachineBasicBlock &MBB = *Inst.getParent();
8896 MachineRegisterInfo &MRI = MBB.getParent()->getRegInfo();
8897 MachineBasicBlock::iterator MII = Inst;
8898 const DebugLoc &DL = Inst.getDebugLoc();
8899
8900 MachineOperand &Dest = Inst.getOperand(0);
8901 MachineOperand &Src0 = Inst.getOperand(1);
8902 MachineOperand &Src1 = Inst.getOperand(2);
8903
8904 Register NewDest = MRI.createVirtualRegister(&AMDGPU::SReg_32_XM0RegClass);
8905 Register Interm = MRI.createVirtualRegister(&AMDGPU::SReg_32_XM0RegClass);
8906
8907 MachineInstr &Not = *BuildMI(MBB, MII, DL, get(AMDGPU::S_NOT_B32), Interm)
8908 .add(Src1);
8909
8910 MachineInstr &Op = *BuildMI(MBB, MII, DL, get(Opcode), NewDest)
8911 .add(Src0)
8912 .addReg(Interm);
8913
8914 Worklist.insert(&Not);
8915 Worklist.insert(&Op);
8916
8917 MRI.replaceRegWith(Dest.getReg(), NewDest);
8918 addUsersToMoveToVALUWorklist(NewDest, MRI, Worklist);
8919}
8920
8921void SIInstrInfo::splitScalar64BitUnaryOp(SIInstrWorklist &Worklist,
8922 MachineInstr &Inst, unsigned Opcode,
8923 bool Swap) const {
8924 MachineBasicBlock &MBB = *Inst.getParent();
8925 MachineRegisterInfo &MRI = MBB.getParent()->getRegInfo();
8926
8927 MachineOperand &Dest = Inst.getOperand(0);
8928 MachineOperand &Src0 = Inst.getOperand(1);
8929 const DebugLoc &DL = Inst.getDebugLoc();
8930
8931 MachineBasicBlock::iterator MII = Inst;
8932
8933 const MCInstrDesc &InstDesc = get(Opcode);
8934 const TargetRegisterClass *Src0RC = Src0.isReg() ?
8935 MRI.getRegClass(Src0.getReg()) :
8936 &AMDGPU::SGPR_32RegClass;
8937
8938 const TargetRegisterClass *Src0SubRC =
8939 RI.getSubRegisterClass(Src0RC, AMDGPU::sub0);
8940
8941 MachineOperand SrcReg0Sub0 = buildExtractSubRegOrImm(MII, MRI, Src0, Src0RC,
8942 AMDGPU::sub0, Src0SubRC);
8943
8944 const TargetRegisterClass *DestRC = MRI.getRegClass(Dest.getReg());
8945 const TargetRegisterClass *NewDestRC = RI.getEquivalentVGPRClass(DestRC);
8946 const TargetRegisterClass *NewDestSubRC =
8947 RI.getSubRegisterClass(NewDestRC, AMDGPU::sub0);
8948
8949 Register DestSub0 = MRI.createVirtualRegister(NewDestSubRC);
8950 MachineInstr &LoHalf = *BuildMI(MBB, MII, DL, InstDesc, DestSub0).add(SrcReg0Sub0);
8951
8952 MachineOperand SrcReg0Sub1 = buildExtractSubRegOrImm(MII, MRI, Src0, Src0RC,
8953 AMDGPU::sub1, Src0SubRC);
8954
8955 Register DestSub1 = MRI.createVirtualRegister(NewDestSubRC);
8956 MachineInstr &HiHalf = *BuildMI(MBB, MII, DL, InstDesc, DestSub1).add(SrcReg0Sub1);
8957
8958 if (Swap)
8959 std::swap(DestSub0, DestSub1);
8960
8961 Register FullDestReg = MRI.createVirtualRegister(NewDestRC);
8962 BuildMI(MBB, MII, DL, get(TargetOpcode::REG_SEQUENCE), FullDestReg)
8963 .addReg(DestSub0)
8964 .addImm(AMDGPU::sub0)
8965 .addReg(DestSub1)
8966 .addImm(AMDGPU::sub1);
8967
8968 MRI.replaceRegWith(Dest.getReg(), FullDestReg);
8969
8970 Worklist.insert(&LoHalf);
8971 Worklist.insert(&HiHalf);
8972
8973 // We don't need to legalizeOperands here because for a single operand, src0
8974 // will support any kind of input.
8975
8976 // Move all users of this moved value.
8977 addUsersToMoveToVALUWorklist(FullDestReg, MRI, Worklist);
8978}
8979
8980// There is not a vector equivalent of s_mul_u64. For this reason, we need to
8981// split the s_mul_u64 into 32-bit vector multiplications.
8982void SIInstrInfo::splitScalarSMulU64(SIInstrWorklist &Worklist,
8983 MachineInstr &Inst,
8984 MachineDominatorTree *MDT) const {
8985 MachineBasicBlock &MBB = *Inst.getParent();
8986 MachineRegisterInfo &MRI = MBB.getParent()->getRegInfo();
8987
8988 Register FullDestReg = MRI.createVirtualRegister(&AMDGPU::VReg_64RegClass);
8989 Register DestSub0 = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
8990 Register DestSub1 = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
8991
8992 MachineOperand &Dest = Inst.getOperand(0);
8993 MachineOperand &Src0 = Inst.getOperand(1);
8994 MachineOperand &Src1 = Inst.getOperand(2);
8995 const DebugLoc &DL = Inst.getDebugLoc();
8996 MachineBasicBlock::iterator MII = Inst;
8997
8998 const TargetRegisterClass *Src0RC = MRI.getRegClass(Src0.getReg());
8999 const TargetRegisterClass *Src1RC = MRI.getRegClass(Src1.getReg());
9000 const TargetRegisterClass *Src0SubRC =
9001 RI.getSubRegisterClass(Src0RC, AMDGPU::sub0);
9002 if (RI.isSGPRClass(Src0SubRC))
9003 Src0SubRC = RI.getEquivalentVGPRClass(Src0SubRC);
9004 const TargetRegisterClass *Src1SubRC =
9005 RI.getSubRegisterClass(Src1RC, AMDGPU::sub0);
9006 if (RI.isSGPRClass(Src1SubRC))
9007 Src1SubRC = RI.getEquivalentVGPRClass(Src1SubRC);
9008
9009 // First, we extract the low 32-bit and high 32-bit values from each of the
9010 // operands.
9011 MachineOperand Op0L =
9012 buildExtractSubRegOrImm(MII, MRI, Src0, Src0RC, AMDGPU::sub0, Src0SubRC);
9013 MachineOperand Op1L =
9014 buildExtractSubRegOrImm(MII, MRI, Src1, Src1RC, AMDGPU::sub0, Src1SubRC);
9015 MachineOperand Op0H =
9016 buildExtractSubRegOrImm(MII, MRI, Src0, Src0RC, AMDGPU::sub1, Src0SubRC);
9017 MachineOperand Op1H =
9018 buildExtractSubRegOrImm(MII, MRI, Src1, Src1RC, AMDGPU::sub1, Src1SubRC);
9019
9020 // The multiplication is done as follows:
9021 //
9022 // Op1H Op1L
9023 // * Op0H Op0L
9024 // --------------------
9025 // Op1H*Op0L Op1L*Op0L
9026 // + Op1H*Op0H Op1L*Op0H
9027 // -----------------------------------------
9028 // (Op1H*Op0L + Op1L*Op0H + carry) Op1L*Op0L
9029 //
9030 // We drop Op1H*Op0H because the result of the multiplication is a 64-bit
9031 // value and that would overflow.
9032 // The low 32-bit value is Op1L*Op0L.
9033 // The high 32-bit value is Op1H*Op0L + Op1L*Op0H + carry (from Op1L*Op0L).
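// Worked example: Op0 = 0x0000000100000002, Op1 = 0x0000000300000004.
//   Low 32 bits:  Op1L*Op0L = 4*2 = 8.
//   High 32 bits: Op1H*Op0L + Op1L*Op0H + carry = 3*2 + 4*1 + 0 = 10 (0xA).
// Result = 0x0000000A00000008, the low 64 bits of the full 128-bit product.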
9034
9035 Register Op1L_Op0H_Reg = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
9036 MachineInstr *Op1L_Op0H =
9037 BuildMI(MBB, MII, DL, get(AMDGPU::V_MUL_LO_U32_e64), Op1L_Op0H_Reg)
9038 .add(Op1L)
9039 .add(Op0H);
9040
9041 Register Op1H_Op0L_Reg = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
9042 MachineInstr *Op1H_Op0L =
9043 BuildMI(MBB, MII, DL, get(AMDGPU::V_MUL_LO_U32_e64), Op1H_Op0L_Reg)
9044 .add(Op1H)
9045 .add(Op0L);
9046
9047 Register CarryReg = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
9048 MachineInstr *Carry =
9049 BuildMI(MBB, MII, DL, get(AMDGPU::V_MUL_HI_U32_e64), CarryReg)
9050 .add(Op1L)
9051 .add(Op0L);
9052
9053 MachineInstr *LoHalf =
9054 BuildMI(MBB, MII, DL, get(AMDGPU::V_MUL_LO_U32_e64), DestSub0)
9055 .add(Op1L)
9056 .add(Op0L);
9057
9058 Register AddReg = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
9059 MachineInstr *Add = BuildMI(MBB, MII, DL, get(AMDGPU::V_ADD_U32_e32), AddReg)
9060 .addReg(Op1L_Op0H_Reg)
9061 .addReg(Op1H_Op0L_Reg);
9062
9063 MachineInstr *HiHalf =
9064 BuildMI(MBB, MII, DL, get(AMDGPU::V_ADD_U32_e32), DestSub1)
9065 .addReg(AddReg)
9066 .addReg(CarryReg);
9067
9068 BuildMI(MBB, MII, DL, get(TargetOpcode::REG_SEQUENCE), FullDestReg)
9069 .addReg(DestSub0)
9070 .addImm(AMDGPU::sub0)
9071 .addReg(DestSub1)
9072 .addImm(AMDGPU::sub1);
9073
9074 MRI.replaceRegWith(Dest.getReg(), FullDestReg);
9075
9076 // Try to legalize the operands in case we need to swap the order to keep it
9077 // valid.
9078 legalizeOperands(*Op1L_Op0H, MDT);
9079 legalizeOperands(*Op1H_Op0L, MDT);
9080 legalizeOperands(*Carry, MDT);
9081 legalizeOperands(*LoHalf, MDT);
9082 legalizeOperands(*Add, MDT);
9083 legalizeOperands(*HiHalf, MDT);
9084
9085 // Move all users of this moved value.
9086 addUsersToMoveToVALUWorklist(FullDestReg, MRI, Worklist);
9087}
9088
9089// Lower S_MUL_U64_U32_PSEUDO/S_MUL_I64_I32_PSEUDO into two 32-bit vector
9090// multiplications.
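// For example, for the unsigned pseudo, 0xFFFFFFFF * 0xFFFFFFFF =
// 0xFFFFFFFE00000001, so V_MUL_HI_U32 produces 0xFFFFFFFE and V_MUL_LO_U32
// produces 0x00000001.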
9091void SIInstrInfo::splitScalarSMulPseudo(SIInstrWorklist &Worklist,
9092 MachineInstr &Inst,
9093 MachineDominatorTree *MDT) const {
9094 MachineBasicBlock &MBB = *Inst.getParent();
9095 MachineRegisterInfo &MRI = MBB.getParent()->getRegInfo();
9096
9097 Register FullDestReg = MRI.createVirtualRegister(&AMDGPU::VReg_64RegClass);
9098 Register DestSub0 = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
9099 Register DestSub1 = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
9100
9101 MachineOperand &Dest = Inst.getOperand(0);
9102 MachineOperand &Src0 = Inst.getOperand(1);
9103 MachineOperand &Src1 = Inst.getOperand(2);
9104 const DebugLoc &DL = Inst.getDebugLoc();
9105 MachineBasicBlock::iterator MII = Inst;
9106
9107 const TargetRegisterClass *Src0RC = MRI.getRegClass(Src0.getReg());
9108 const TargetRegisterClass *Src1RC = MRI.getRegClass(Src1.getReg());
9109 const TargetRegisterClass *Src0SubRC =
9110 RI.getSubRegisterClass(Src0RC, AMDGPU::sub0);
9111 if (RI.isSGPRClass(Src0SubRC))
9112 Src0SubRC = RI.getEquivalentVGPRClass(Src0SubRC);
9113 const TargetRegisterClass *Src1SubRC =
9114 RI.getSubRegisterClass(Src1RC, AMDGPU::sub0);
9115 if (RI.isSGPRClass(Src1SubRC))
9116 Src1SubRC = RI.getEquivalentVGPRClass(Src1SubRC);
9117
9118 // First, we extract the low 32-bit and high 32-bit values from each of the
9119 // operands.
9120 MachineOperand Op0L =
9121 buildExtractSubRegOrImm(MII, MRI, Src0, Src0RC, AMDGPU::sub0, Src0SubRC);
9122 MachineOperand Op1L =
9123 buildExtractSubRegOrImm(MII, MRI, Src1, Src1RC, AMDGPU::sub0, Src1SubRC);
9124
9125 unsigned Opc = Inst.getOpcode();
9126 unsigned NewOpc = Opc == AMDGPU::S_MUL_U64_U32_PSEUDO
9127 ? AMDGPU::V_MUL_HI_U32_e64
9128 : AMDGPU::V_MUL_HI_I32_e64;
9129 MachineInstr *HiHalf =
9130 BuildMI(MBB, MII, DL, get(NewOpc), DestSub1).add(Op1L).add(Op0L);
9131
9132 MachineInstr *LoHalf =
9133 BuildMI(MBB, MII, DL, get(AMDGPU::V_MUL_LO_U32_e64), DestSub0)
9134 .add(Op1L)
9135 .add(Op0L);
9136
9137 BuildMI(MBB, MII, DL, get(TargetOpcode::REG_SEQUENCE), FullDestReg)
9138 .addReg(DestSub0)
9139 .addImm(AMDGPU::sub0)
9140 .addReg(DestSub1)
9141 .addImm(AMDGPU::sub1);
9142
9143 MRI.replaceRegWith(Dest.getReg(), FullDestReg);
9144
9145 // Try to legalize the operands in case we need to swap the order to keep it
9146 // valid.
9147 legalizeOperands(*HiHalf, MDT);
9148 legalizeOperands(*LoHalf, MDT);
9149
9150 // Move all users of this moved value.
9151 addUsersToMoveToVALUWorklist(FullDestReg, MRI, Worklist);
9152}
9153
9154void SIInstrInfo::splitScalar64BitBinaryOp(SIInstrWorklist &Worklist,
9155 MachineInstr &Inst, unsigned Opcode,
9156 MachineDominatorTree *MDT) const {
9157 MachineBasicBlock &MBB = *Inst.getParent();
9158 MachineRegisterInfo &MRI = MBB.getParent()->getRegInfo();
9159
9160 MachineOperand &Dest = Inst.getOperand(0);
9161 MachineOperand &Src0 = Inst.getOperand(1);
9162 MachineOperand &Src1 = Inst.getOperand(2);
9163 const DebugLoc &DL = Inst.getDebugLoc();
9164
9165 MachineBasicBlock::iterator MII = Inst;
9166
9167 const MCInstrDesc &InstDesc = get(Opcode);
9168 const TargetRegisterClass *Src0RC = Src0.isReg() ?
9169 MRI.getRegClass(Src0.getReg()) :
9170 &AMDGPU::SGPR_32RegClass;
9171
9172 const TargetRegisterClass *Src0SubRC =
9173 RI.getSubRegisterClass(Src0RC, AMDGPU::sub0);
9174 const TargetRegisterClass *Src1RC = Src1.isReg() ?
9175 MRI.getRegClass(Src1.getReg()) :
9176 &AMDGPU::SGPR_32RegClass;
9177
9178 const TargetRegisterClass *Src1SubRC =
9179 RI.getSubRegisterClass(Src1RC, AMDGPU::sub0);
9180
9181 MachineOperand SrcReg0Sub0 = buildExtractSubRegOrImm(MII, MRI, Src0, Src0RC,
9182 AMDGPU::sub0, Src0SubRC);
9183 MachineOperand SrcReg1Sub0 = buildExtractSubRegOrImm(MII, MRI, Src1, Src1RC,
9184 AMDGPU::sub0, Src1SubRC);
9185 MachineOperand SrcReg0Sub1 = buildExtractSubRegOrImm(MII, MRI, Src0, Src0RC,
9186 AMDGPU::sub1, Src0SubRC);
9187 MachineOperand SrcReg1Sub1 = buildExtractSubRegOrImm(MII, MRI, Src1, Src1RC,
9188 AMDGPU::sub1, Src1SubRC);
9189
9190 const TargetRegisterClass *DestRC = MRI.getRegClass(Dest.getReg());
9191 const TargetRegisterClass *NewDestRC = RI.getEquivalentVGPRClass(DestRC);
9192 const TargetRegisterClass *NewDestSubRC =
9193 RI.getSubRegisterClass(NewDestRC, AMDGPU::sub0);
9194
9195 Register DestSub0 = MRI.createVirtualRegister(NewDestSubRC);
9196 MachineInstr &LoHalf = *BuildMI(MBB, MII, DL, InstDesc, DestSub0)
9197 .add(SrcReg0Sub0)
9198 .add(SrcReg1Sub0);
9199
9200 Register DestSub1 = MRI.createVirtualRegister(NewDestSubRC);
9201 MachineInstr &HiHalf = *BuildMI(MBB, MII, DL, InstDesc, DestSub1)
9202 .add(SrcReg0Sub1)
9203 .add(SrcReg1Sub1);
9204
9205 Register FullDestReg = MRI.createVirtualRegister(NewDestRC);
9206 BuildMI(MBB, MII, DL, get(TargetOpcode::REG_SEQUENCE), FullDestReg)
9207 .addReg(DestSub0)
9208 .addImm(AMDGPU::sub0)
9209 .addReg(DestSub1)
9210 .addImm(AMDGPU::sub1);
9211
9212 MRI.replaceRegWith(Dest.getReg(), FullDestReg);
9213
9214 Worklist.insert(&LoHalf);
9215 Worklist.insert(&HiHalf);
9216
9217 // Move all users of this moved value.
9218 addUsersToMoveToVALUWorklist(FullDestReg, MRI, Worklist);
9219}
9220
9221void SIInstrInfo::splitScalar64BitXnor(SIInstrWorklist &Worklist,
9222 MachineInstr &Inst,
9223 MachineDominatorTree *MDT) const {
9224 MachineBasicBlock &MBB = *Inst.getParent();
9225 MachineRegisterInfo &MRI = MBB.getParent()->getRegInfo();
9226
9227 MachineOperand &Dest = Inst.getOperand(0);
9228 MachineOperand &Src0 = Inst.getOperand(1);
9229 MachineOperand &Src1 = Inst.getOperand(2);
9230 const DebugLoc &DL = Inst.getDebugLoc();
9231
9232 MachineBasicBlock::iterator MII = Inst;
9233
9234 const TargetRegisterClass *DestRC = MRI.getRegClass(Dest.getReg());
9235
9236 Register Interm = MRI.createVirtualRegister(&AMDGPU::SReg_64RegClass);
9237
9238 MachineOperand* Op0;
9239 MachineOperand* Op1;
9240
9241 if (Src0.isReg() && RI.isSGPRReg(MRI, Src0.getReg())) {
9242 Op0 = &Src0;
9243 Op1 = &Src1;
9244 } else {
9245 Op0 = &Src1;
9246 Op1 = &Src0;
9247 }
9248
9249 BuildMI(MBB, MII, DL, get(AMDGPU::S_NOT_B64), Interm)
9250 .add(*Op0);
9251
9252 Register NewDest = MRI.createVirtualRegister(DestRC);
9253
9254 MachineInstr &Xor = *BuildMI(MBB, MII, DL, get(AMDGPU::S_XOR_B64), NewDest)
9255 .addReg(Interm)
9256 .add(*Op1);
9257
9258 MRI.replaceRegWith(Dest.getReg(), NewDest);
9259
9260 Worklist.insert(&Xor);
9261}
9262
9263void SIInstrInfo::splitScalar64BitBCNT(SIInstrWorklist &Worklist,
9264 MachineInstr &Inst) const {
9265 MachineBasicBlock &MBB = *Inst.getParent();
9266 MachineRegisterInfo &MRI = MBB.getParent()->getRegInfo();
9267
9268 MachineBasicBlock::iterator MII = Inst;
9269 const DebugLoc &DL = Inst.getDebugLoc();
9270
9271 MachineOperand &Dest = Inst.getOperand(0);
9272 MachineOperand &Src = Inst.getOperand(1);
9273
9274 const MCInstrDesc &InstDesc = get(AMDGPU::V_BCNT_U32_B32_e64);
9275 const TargetRegisterClass *SrcRC = Src.isReg() ?
9276 MRI.getRegClass(Src.getReg()) :
9277 &AMDGPU::SGPR_32RegClass;
9278
9279 Register MidReg = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
9280 Register ResultReg = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
9281
9282 const TargetRegisterClass *SrcSubRC =
9283 RI.getSubRegisterClass(SrcRC, AMDGPU::sub0);
9284
9285 MachineOperand SrcRegSub0 = buildExtractSubRegOrImm(MII, MRI, Src, SrcRC,
9286 AMDGPU::sub0, SrcSubRC);
9287 MachineOperand SrcRegSub1 = buildExtractSubRegOrImm(MII, MRI, Src, SrcRC,
9288 AMDGPU::sub1, SrcSubRC);
9289
9290 BuildMI(MBB, MII, DL, InstDesc, MidReg).add(SrcRegSub0).addImm(0);
9291
9292 BuildMI(MBB, MII, DL, InstDesc, ResultReg).add(SrcRegSub1).addReg(MidReg);
9293
9294 MRI.replaceRegWith(Dest.getReg(), ResultReg);
9295
9296 // We don't need to legalize operands here. src0 for either instruction can be
9297 // an SGPR, and the second input is unused or determined here.
9298 addUsersToMoveToVALUWorklist(ResultReg, MRI, Worklist);
9299}
9300
9301void SIInstrInfo::splitScalar64BitBFE(SIInstrWorklist &Worklist,
9302 MachineInstr &Inst) const {
9303 MachineBasicBlock &MBB = *Inst.getParent();
9304 MachineRegisterInfo &MRI = MBB.getParent()->getRegInfo();
9305 MachineBasicBlock::iterator MII = Inst;
9306 const DebugLoc &DL = Inst.getDebugLoc();
9307
9308 MachineOperand &Dest = Inst.getOperand(0);
9309 uint32_t Imm = Inst.getOperand(2).getImm();
9310 uint32_t Offset = Imm & 0x3f; // Extract bits [5:0].
9311 uint32_t BitWidth = (Imm & 0x7f0000) >> 16; // Extract bits [22:16].
9312
9313 (void) Offset;
9314
9315 // Only sext_inreg cases handled.
9316 assert(Inst.getOpcode() == AMDGPU::S_BFE_I64 && BitWidth <= 32 &&
9317 Offset == 0 && "Not implemented");
9318
9319 if (BitWidth < 32) {
9320 Register MidRegLo = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
9321 Register MidRegHi = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
9322 Register ResultReg = MRI.createVirtualRegister(&AMDGPU::VReg_64RegClass);
9323
9324 BuildMI(MBB, MII, DL, get(AMDGPU::V_BFE_I32_e64), MidRegLo)
9325 .addReg(Inst.getOperand(1).getReg(), {}, AMDGPU::sub0)
9326 .addImm(0)
9327 .addImm(BitWidth);
9328
9329 BuildMI(MBB, MII, DL, get(AMDGPU::V_ASHRREV_I32_e32), MidRegHi)
9330 .addImm(31)
9331 .addReg(MidRegLo);
9332
9333 BuildMI(MBB, MII, DL, get(TargetOpcode::REG_SEQUENCE), ResultReg)
9334 .addReg(MidRegLo)
9335 .addImm(AMDGPU::sub0)
9336 .addReg(MidRegHi)
9337 .addImm(AMDGPU::sub1);
9338
9339 MRI.replaceRegWith(Dest.getReg(), ResultReg);
9340 addUsersToMoveToVALUWorklist(ResultReg, MRI, Worklist);
9341 return;
9342 }
9343
9344 MachineOperand &Src = Inst.getOperand(1);
9345 Register TmpReg = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
9346 Register ResultReg = MRI.createVirtualRegister(&AMDGPU::VReg_64RegClass);
9347
9348 BuildMI(MBB, MII, DL, get(AMDGPU::V_ASHRREV_I32_e64), TmpReg)
9349 .addImm(31)
9350 .addReg(Src.getReg(), {}, AMDGPU::sub0);
9351
9352 BuildMI(MBB, MII, DL, get(TargetOpcode::REG_SEQUENCE), ResultReg)
9353 .addReg(Src.getReg(), {}, AMDGPU::sub0)
9354 .addImm(AMDGPU::sub0)
9355 .addReg(TmpReg)
9356 .addImm(AMDGPU::sub1);
9357
9358 MRI.replaceRegWith(Dest.getReg(), ResultReg);
9359 addUsersToMoveToVALUWorklist(ResultReg, MRI, Worklist);
9360}
9361
9362void SIInstrInfo::splitScalar64BitCountOp(SIInstrWorklist &Worklist,
9363 MachineInstr &Inst, unsigned Opcode,
9364 MachineDominatorTree *MDT) const {
9365 // (S_FLBIT_I32_B64 hi:lo) ->
9366 //   (umin (V_FFBH_U32_e32 hi), (uaddsat (V_FFBH_U32_e32 lo), 32))
9367 // (S_FF1_I32_B64 hi:lo) ->
9368 //   (umin (uaddsat (V_FFBL_B32_e32 hi), 32), (V_FFBL_B32_e32 lo))
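// Sanity check of the ctlz form: for the 64-bit value 0x0000000000008000,
// V_FFBH_U32(hi = 0) = 0xffffffff and uaddsat(V_FFBH_U32(lo) = 16, 32) = 48,
// so the umin yields 48, the number of leading zeros of the full value.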
9369
9370 MachineBasicBlock &MBB = *Inst.getParent();
9371 MachineRegisterInfo &MRI = MBB.getParent()->getRegInfo();
9372 MachineBasicBlock::iterator MII = Inst;
9373 const DebugLoc &DL = Inst.getDebugLoc();
9374
9375 MachineOperand &Dest = Inst.getOperand(0);
9376 MachineOperand &Src = Inst.getOperand(1);
9377
9378 const MCInstrDesc &InstDesc = get(Opcode);
9379
9380 bool IsCtlz = Opcode == AMDGPU::V_FFBH_U32_e32;
9381 unsigned OpcodeAdd = ST.hasAddNoCarryInsts() ? AMDGPU::V_ADD_U32_e64
9382 : AMDGPU::V_ADD_CO_U32_e32;
9383
9384 const TargetRegisterClass *SrcRC =
9385 Src.isReg() ? MRI.getRegClass(Src.getReg()) : &AMDGPU::SGPR_32RegClass;
9386 const TargetRegisterClass *SrcSubRC =
9387 RI.getSubRegisterClass(SrcRC, AMDGPU::sub0);
9388
9389 MachineOperand SrcRegSub0 =
9390 buildExtractSubRegOrImm(MII, MRI, Src, SrcRC, AMDGPU::sub0, SrcSubRC);
9391 MachineOperand SrcRegSub1 =
9392 buildExtractSubRegOrImm(MII, MRI, Src, SrcRC, AMDGPU::sub1, SrcSubRC);
9393
9394 Register MidReg1 = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
9395 Register MidReg2 = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
9396 Register MidReg3 = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
9397 Register MidReg4 = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
9398
9399 BuildMI(MBB, MII, DL, InstDesc, MidReg1).add(SrcRegSub0);
9400
9401 BuildMI(MBB, MII, DL, InstDesc, MidReg2).add(SrcRegSub1);
9402
9403 BuildMI(MBB, MII, DL, get(OpcodeAdd), MidReg3)
9404 .addReg(IsCtlz ? MidReg1 : MidReg2)
9405 .addImm(32)
9406 .addImm(1); // enable clamp
9407
9408 BuildMI(MBB, MII, DL, get(AMDGPU::V_MIN_U32_e64), MidReg4)
9409 .addReg(MidReg3)
9410 .addReg(IsCtlz ? MidReg2 : MidReg1);
9411
9412 MRI.replaceRegWith(Dest.getReg(), MidReg4);
9413
9414 addUsersToMoveToVALUWorklist(MidReg4, MRI, Worklist);
9415}
9416
9417void SIInstrInfo::addUsersToMoveToVALUWorklist(
9418 Register DstReg, MachineRegisterInfo &MRI,
9419 SIInstrWorklist &Worklist) const {
9420 for (MachineOperand &MO : make_early_inc_range(MRI.use_operands(DstReg))) {
9421 MachineInstr &UseMI = *MO.getParent();
9422
9423 unsigned OpNo = 0;
9424
9425 switch (UseMI.getOpcode()) {
9426 case AMDGPU::COPY:
9427 case AMDGPU::WQM:
9428 case AMDGPU::SOFT_WQM:
9429 case AMDGPU::STRICT_WWM:
9430 case AMDGPU::STRICT_WQM:
9431 case AMDGPU::REG_SEQUENCE:
9432 case AMDGPU::PHI:
9433 case AMDGPU::INSERT_SUBREG:
9434 break;
9435 default:
9436 OpNo = MO.getOperandNo();
9437 break;
9438 }
9439
9440 const TargetRegisterClass *OpRC = getOpRegClass(UseMI, OpNo);
9441 MRI.constrainRegClass(DstReg, OpRC);
9442
9443 if (!RI.hasVectorRegisters(OpRC))
9444 Worklist.insert(&UseMI);
9445 else
9446 // Legalization could change user list.
9447 legalizeOperandsVALUt16(UseMI, OpNo, MRI);
9448 }
9449}
9450
9451void SIInstrInfo::movePackToVALU(SIInstrWorklist &Worklist,
9452 MachineRegisterInfo &MRI,
9453 MachineInstr &Inst) const {
9454 Register ResultReg = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
9455 MachineBasicBlock *MBB = Inst.getParent();
9456 MachineOperand &Src0 = Inst.getOperand(1);
9457 MachineOperand &Src1 = Inst.getOperand(2);
9458 const DebugLoc &DL = Inst.getDebugLoc();
9459
9460 if (ST.useRealTrue16Insts()) {
9461 Register SrcReg0, SrcReg1;
9462 if (!Src0.isReg() || !RI.isVGPR(MRI, Src0.getReg())) {
9463 SrcReg0 = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
9464 BuildMI(*MBB, Inst, DL,
9465 get(Src0.isImm() ? AMDGPU::V_MOV_B32_e32 : AMDGPU::COPY), SrcReg0)
9466 .add(Src0);
9467 } else {
9468 SrcReg0 = Src0.getReg();
9469 }
9470
9471 if (!Src1.isReg() || !RI.isVGPR(MRI, Src1.getReg())) {
9472 SrcReg1 = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
9473 BuildMI(*MBB, Inst, DL,
9474 get(Src1.isImm() ? AMDGPU::V_MOV_B32_e32 : AMDGPU::COPY), SrcReg1)
9475 .add(Src1);
9476 } else {
9477 SrcReg1 = Src1.getReg();
9478 }
9479
9480 bool isSrc0Reg16 = MRI.constrainRegClass(SrcReg0, &AMDGPU::VGPR_16RegClass);
9481 bool isSrc1Reg16 = MRI.constrainRegClass(SrcReg1, &AMDGPU::VGPR_16RegClass);
9482
9483 auto NewMI = BuildMI(*MBB, Inst, DL, get(AMDGPU::REG_SEQUENCE), ResultReg);
9484 switch (Inst.getOpcode()) {
9485 case AMDGPU::S_PACK_LL_B32_B16:
9486 NewMI
9487 .addReg(SrcReg0, {},
9488 isSrc0Reg16 ? AMDGPU::NoSubRegister : AMDGPU::lo16)
9489 .addImm(AMDGPU::lo16)
9490 .addReg(SrcReg1, {},
9491 isSrc1Reg16 ? AMDGPU::NoSubRegister : AMDGPU::lo16)
9492 .addImm(AMDGPU::hi16);
9493 break;
9494 case AMDGPU::S_PACK_LH_B32_B16:
9495 NewMI
9496 .addReg(SrcReg0, {},
9497 isSrc0Reg16 ? AMDGPU::NoSubRegister : AMDGPU::lo16)
9498 .addImm(AMDGPU::lo16)
9499 .addReg(SrcReg1, {}, AMDGPU::hi16)
9500 .addImm(AMDGPU::hi16);
9501 break;
9502 case AMDGPU::S_PACK_HL_B32_B16:
9503 NewMI.addReg(SrcReg0, {}, AMDGPU::hi16)
9504 .addImm(AMDGPU::lo16)
9505 .addReg(SrcReg1, {},
9506 isSrc1Reg16 ? AMDGPU::NoSubRegister : AMDGPU::lo16)
9507 .addImm(AMDGPU::hi16);
9508 break;
9509 case AMDGPU::S_PACK_HH_B32_B16:
9510 NewMI.addReg(SrcReg0, {}, AMDGPU::hi16)
9511 .addImm(AMDGPU::lo16)
9512 .addReg(SrcReg1, {}, AMDGPU::hi16)
9513 .addImm(AMDGPU::hi16);
9514 break;
9515 default:
9516 llvm_unreachable("unhandled s_pack_* instruction");
9517 }
9518
9519 MachineOperand &Dest = Inst.getOperand(0);
9520 MRI.replaceRegWith(Dest.getReg(), ResultReg);
9521 addUsersToMoveToVALUWorklist(ResultReg, MRI, Worklist);
9522 return;
9523 }
9524
9525 switch (Inst.getOpcode()) {
9526 case AMDGPU::S_PACK_LL_B32_B16: {
9527 Register ImmReg = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
9528 Register TmpReg = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
9529
9530 // FIXME: Can do a lot better if we know the high bits of src0 or src1 are
9531 // 0.
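// For example, Src0 = 0x1111ABCD and Src1 = 0x22221234 give
// TmpReg = 0x0000ABCD and ResultReg = (Src1 << 16) | TmpReg = 0x1234ABCD,
// i.e. the low halves of the two sources packed together.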
9532 BuildMI(*MBB, Inst, DL, get(AMDGPU::V_MOV_B32_e32), ImmReg)
9533 .addImm(0xffff);
9534
9535 BuildMI(*MBB, Inst, DL, get(AMDGPU::V_AND_B32_e64), TmpReg)
9536 .addReg(ImmReg, RegState::Kill)
9537 .add(Src0);
9538
9539 BuildMI(*MBB, Inst, DL, get(AMDGPU::V_LSHL_OR_B32_e64), ResultReg)
9540 .add(Src1)
9541 .addImm(16)
9542 .addReg(TmpReg, RegState::Kill);
9543 break;
9544 }
9545 case AMDGPU::S_PACK_LH_B32_B16: {
9546 Register ImmReg = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
9547 BuildMI(*MBB, Inst, DL, get(AMDGPU::V_MOV_B32_e32), ImmReg)
9548 .addImm(0xffff);
9549 BuildMI(*MBB, Inst, DL, get(AMDGPU::V_BFI_B32_e64), ResultReg)
9550 .addReg(ImmReg, RegState::Kill)
9551 .add(Src0)
9552 .add(Src1);
9553 break;
9554 }
9555 case AMDGPU::S_PACK_HL_B32_B16: {
9556 Register TmpReg = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
9557 BuildMI(*MBB, Inst, DL, get(AMDGPU::V_LSHRREV_B32_e64), TmpReg)
9558 .addImm(16)
9559 .add(Src0);
9560 BuildMI(*MBB, Inst, DL, get(AMDGPU::V_LSHL_OR_B32_e64), ResultReg)
9561 .add(Src1)
9562 .addImm(16)
9563 .addReg(TmpReg, RegState::Kill);
9564 break;
9565 }
9566 case AMDGPU::S_PACK_HH_B32_B16: {
9567 Register ImmReg = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
9568 Register TmpReg = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
9569 BuildMI(*MBB, Inst, DL, get(AMDGPU::V_LSHRREV_B32_e64), TmpReg)
9570 .addImm(16)
9571 .add(Src0);
9572 BuildMI(*MBB, Inst, DL, get(AMDGPU::V_MOV_B32_e32), ImmReg)
9573 .addImm(0xffff0000);
9574 BuildMI(*MBB, Inst, DL, get(AMDGPU::V_AND_OR_B32_e64), ResultReg)
9575 .add(Src1)
9576 .addReg(ImmReg, RegState::Kill)
9577 .addReg(TmpReg, RegState::Kill);
9578 break;
9579 }
9580 default:
9581 llvm_unreachable("unhandled s_pack_* instruction");
9582 }
9583
9584 MachineOperand &Dest = Inst.getOperand(0);
9585 MRI.replaceRegWith(Dest.getReg(), ResultReg);
9586 addUsersToMoveToVALUWorklist(ResultReg, MRI, Worklist);
9587}
9588
9589void SIInstrInfo::addSCCDefUsersToVALUWorklist(const MachineOperand &Op,
9590 MachineInstr &SCCDefInst,
9591 SIInstrWorklist &Worklist,
9592 Register NewCond) const {
9593
9594 // Ensure that def inst defines SCC, which is still live.
9595 assert(Op.isReg() && Op.getReg() == AMDGPU::SCC && Op.isDef() &&
9596 !Op.isDead() && Op.getParent() == &SCCDefInst);
9597 SmallVector<MachineInstr *, 4> CopyToDelete;
9598 // This assumes that all the users of SCC are in the same block
9599 // as the SCC def.
9600 for (MachineInstr &MI : // Skip the def inst itself.
9601 make_range(std::next(MachineBasicBlock::iterator(SCCDefInst)),
9602 SCCDefInst.getParent()->end())) {
9603 // Check if SCC is used first.
9604 int SCCIdx = MI.findRegisterUseOperandIdx(AMDGPU::SCC, &RI, false);
9605 if (SCCIdx != -1) {
9606 if (MI.isCopy()) {
9607 MachineRegisterInfo &MRI = MI.getMF()->getRegInfo();
9608 Register DestReg = MI.getOperand(0).getReg();
9609
9610 MRI.replaceRegWith(DestReg, NewCond);
9611 CopyToDelete.push_back(&MI);
9612 } else {
9613
9614 if (NewCond.isValid())
9615 MI.getOperand(SCCIdx).setReg(NewCond);
9616
9617 Worklist.insert(&MI);
9618 }
9619 }
9620 // Exit if we find another SCC def.
9621 if (MI.findRegisterDefOperandIdx(AMDGPU::SCC, &RI, false, false) != -1)
9622 break;
9623 }
9624 for (auto &Copy : CopyToDelete)
9625 Copy->eraseFromParent();
9626}
9627
9628// Instructions that use SCC may be converted to VALU instructions. When that
9629// happens, the SCC register is changed to VCC_LO. The instruction that defines
9630// SCC must be changed to an instruction that defines VCC. This function makes
9631// sure that the instruction that defines SCC is added to the moveToVALU
9632// worklist.
9633void SIInstrInfo::addSCCDefsToVALUWorklist(MachineInstr *SCCUseInst,
9634 SIInstrWorklist &Worklist) const {
9635 // Look for a preceding instruction that either defines VCC or SCC. If VCC
9636 // then there is nothing to do because the defining instruction has been
9637 // converted to a VALU already. If SCC then that instruction needs to be
9638 // converted to a VALU.
9639 for (MachineInstr &MI :
9640 make_range(std::next(MachineBasicBlock::reverse_iterator(SCCUseInst)),
9641 SCCUseInst->getParent()->rend())) {
9642 if (MI.modifiesRegister(AMDGPU::VCC, &RI))
9643 break;
9644 if (MI.definesRegister(AMDGPU::SCC, &RI)) {
9645 Worklist.insert(&MI);
9646 break;
9647 }
9648 }
9649}
9650
9651const TargetRegisterClass *SIInstrInfo::getDestEquivalentVGPRClass(
9652 const MachineInstr &Inst) const {
9653 const TargetRegisterClass *NewDstRC = getOpRegClass(Inst, 0);
9654
9655 switch (Inst.getOpcode()) {
9656 // For target instructions, getOpRegClass just returns the virtual register
9657 // class associated with the operand, so we need to find an equivalent VGPR
9658 // register class in order to move the instruction to the VALU.
9659 case AMDGPU::COPY:
9660 case AMDGPU::PHI:
9661 case AMDGPU::REG_SEQUENCE:
9662 case AMDGPU::INSERT_SUBREG:
9663 case AMDGPU::WQM:
9664 case AMDGPU::SOFT_WQM:
9665 case AMDGPU::STRICT_WWM:
9666 case AMDGPU::STRICT_WQM: {
9667 const TargetRegisterClass *SrcRC = getOpRegClass(Inst, 1);
9668 if (RI.isAGPRClass(SrcRC)) {
9669 if (RI.isAGPRClass(NewDstRC))
9670 return nullptr;
9671
9672 switch (Inst.getOpcode()) {
9673 case AMDGPU::PHI:
9674 case AMDGPU::REG_SEQUENCE:
9675 case AMDGPU::INSERT_SUBREG:
9676 NewDstRC = RI.getEquivalentAGPRClass(NewDstRC);
9677 break;
9678 default:
9679 NewDstRC = RI.getEquivalentVGPRClass(NewDstRC);
9680 }
9681
9682 if (!NewDstRC)
9683 return nullptr;
9684 } else {
9685 if (RI.isVGPRClass(NewDstRC) || NewDstRC == &AMDGPU::VReg_1RegClass)
9686 return nullptr;
9687
9688 NewDstRC = RI.getEquivalentVGPRClass(NewDstRC);
9689 if (!NewDstRC)
9690 return nullptr;
9691 }
9692
9693 return NewDstRC;
9694 }
9695 default:
9696 return NewDstRC;
9697 }
9698}
9699
9700// Find the one SGPR operand we are allowed to use.
9701Register SIInstrInfo::findUsedSGPR(const MachineInstr &MI,
9702 int OpIndices[3]) const {
9703 const MCInstrDesc &Desc = MI.getDesc();
9704
9705 // Find the one SGPR operand we are allowed to use.
9706 //
9707 // First we need to consider the instruction's operand requirements before
9708 // legalizing. Some operands are required to be SGPRs, such as implicit uses
9709 // of VCC, but we are still bound by the constant bus requirement to only use
9710 // one.
9711 //
9712 // If the operand's class is an SGPR, we can never move it.
9713
9714 Register SGPRReg = findImplicitSGPRRead(MI);
9715 if (SGPRReg)
9716 return SGPRReg;
9717
9718 Register UsedSGPRs[3] = {Register()};
9719 const MachineRegisterInfo &MRI = MI.getMF()->getRegInfo();
9720
9721 for (unsigned i = 0; i < 3; ++i) {
9722 int Idx = OpIndices[i];
9723 if (Idx == -1)
9724 break;
9725
9726 const MachineOperand &MO = MI.getOperand(Idx);
9727 if (!MO.isReg())
9728 continue;
9729
9730 // Is this operand statically required to be an SGPR based on the operand
9731 // constraints?
9732 const TargetRegisterClass *OpRC =
9733 RI.getRegClass(getOpRegClassID(Desc.operands()[Idx]));
9734 bool IsRequiredSGPR = RI.isSGPRClass(OpRC);
9735 if (IsRequiredSGPR)
9736 return MO.getReg();
9737
9738 // If this could be a VGPR or an SGPR, check the dynamic register class.
9739 Register Reg = MO.getReg();
9740 const TargetRegisterClass *RegRC = MRI.getRegClass(Reg);
9741 if (RI.isSGPRClass(RegRC))
9742 UsedSGPRs[i] = Reg;
9743 }
9744
9745 // We don't have a required SGPR operand, so we have a bit more freedom in
9746 // selecting operands to move.
9747
9748 // Try to select the most used SGPR. If an SGPR is equal to one of the
9749 // others, we choose that.
9750 //
9751 // e.g.
9752 // V_FMA_F32 v0, s0, s0, s0 -> No moves
9753 // V_FMA_F32 v0, s0, s1, s0 -> Move s1
9754
9755 // TODO: If some of the operands are 64-bit SGPRs and some 32, we should
9756 // prefer those.
9757
9758 if (UsedSGPRs[0]) {
9759 if (UsedSGPRs[0] == UsedSGPRs[1] || UsedSGPRs[0] == UsedSGPRs[2])
9760 SGPRReg = UsedSGPRs[0];
9761 }
9762
9763 if (!SGPRReg && UsedSGPRs[1]) {
9764 if (UsedSGPRs[1] == UsedSGPRs[2])
9765 SGPRReg = UsedSGPRs[1];
9766 }
9767
9768 return SGPRReg;
9769}
9770
9771MachineOperand *SIInstrInfo::getNamedOperand(MachineInstr &MI,
9772 AMDGPU::OpName OperandName) const {
9773 if (OperandName == AMDGPU::OpName::NUM_OPERAND_NAMES)
9774 return nullptr;
9775
9776 int Idx = AMDGPU::getNamedOperandIdx(MI.getOpcode(), OperandName);
9777 if (Idx == -1)
9778 return nullptr;
9779
9780 return &MI.getOperand(Idx);
9781}
9782
9783uint64_t SIInstrInfo::getDefaultRsrcDataFormat() const {
9784 if (ST.getGeneration() >= AMDGPUSubtarget::GFX10) {
9785 int64_t Format = ST.getGeneration() >= AMDGPUSubtarget::GFX11
9786 ? (int64_t)AMDGPU::UfmtGFX11::UFMT_32_FLOAT
9787 : (int64_t)AMDGPU::UfmtGFX10::UFMT_32_FLOAT;
9788 return (Format << 44) |
9789 (1ULL << 56) | // RESOURCE_LEVEL = 1
9790 (3ULL << 60); // OOB_SELECT = 3
9791 }
9792
9793 uint64_t RsrcDataFormat = AMDGPU::RSRC_DATA_FORMAT;
9794 if (ST.isAmdHsaOS()) {
9795 // Set ATC = 1. GFX9 doesn't have this bit.
9796 if (ST.getGeneration() <= AMDGPUSubtarget::VOLCANIC_ISLANDS)
9797 RsrcDataFormat |= (1ULL << 56);
9798
9799 // Set MTYPE = 2 (MTYPE_UC = uncached). GFX9 doesn't have this.
9800 // BTW, it disables TC L2 and therefore decreases performance.
9801 if (ST.getGeneration() == AMDGPUSubtarget::VOLCANIC_ISLANDS)
9802 RsrcDataFormat |= (2ULL << 59);
9803 }
9804
9805 return RsrcDataFormat;
9806}
9807
9808uint64_t SIInstrInfo::getScratchRsrcWords23() const {
9809 uint64_t Rsrc23 = getDefaultRsrcDataFormat() |
9810 AMDGPU::RSRC_TID_ENABLE |
9811 0xffffffff; // Size;
9812
9813 // GFX9 doesn't have ELEMENT_SIZE.
9814 if (ST.getGeneration() <= AMDGPUSubtarget::VOLCANIC_ISLANDS) {
9815 uint64_t EltSizeValue = Log2_32(ST.getMaxPrivateElementSize(true)) - 1;
9816 Rsrc23 |= EltSizeValue << AMDGPU::RSRC_ELEMENT_SIZE_SHIFT;
9817 }
9818
9819 // IndexStride = 64 / 32.
9820 uint64_t IndexStride = ST.isWave64() ? 3 : 2;
9821 Rsrc23 |= IndexStride << AMDGPU::RSRC_INDEX_STRIDE_SHIFT;
9822
9823 // If TID_ENABLE is set, DATA_FORMAT specifies stride bits [14:17].
9824 // Clear them unless we want a huge stride.
9825 if (ST.getGeneration() >= AMDGPUSubtarget::VOLCANIC_ISLANDS &&
9826 ST.getGeneration() <= AMDGPUSubtarget::GFX9)
9827 Rsrc23 &= ~AMDGPU::RSRC_DATA_FORMAT;
9828
9829 return Rsrc23;
9830}
9831
9832bool SIInstrInfo::isLowLatencyInstruction(const MachineInstr &MI) const {
9833 unsigned Opc = MI.getOpcode();
9834
9835 return isSMRD(Opc);
9836}
9837
9838bool SIInstrInfo::isHighLatencyDef(int Opc) const {
9839 return get(Opc).mayLoad() &&
9840 (isMUBUF(Opc) || isMTBUF(Opc) || isMIMG(Opc) || isFLAT(Opc));
9841}
9842
9843Register SIInstrInfo::isStackAccess(const MachineInstr &MI, int &FrameIndex,
9844 TypeSize &MemBytes) const {
9845 const MachineOperand *Addr = getNamedOperand(MI, AMDGPU::OpName::vaddr);
9846 if (!Addr || !Addr->isFI())
9847 return Register();
9848
9849 assert(!MI.memoperands_empty() &&
9850 (*MI.memoperands_begin())->getAddrSpace() == AMDGPUAS::PRIVATE_ADDRESS);
9851
9852 FrameIndex = Addr->getIndex();
9853
9854 int VDataIdx =
9855 AMDGPU::getNamedOperandIdx(MI.getOpcode(), AMDGPU::OpName::vdata);
9856 MemBytes = TypeSize::getFixed(getOpSize(MI.getOpcode(), VDataIdx));
9857 return MI.getOperand(VDataIdx).getReg();
9858}
9859
9860Register SIInstrInfo::isSGPRStackAccess(const MachineInstr &MI, int &FrameIndex,
9861 TypeSize &MemBytes) const {
9862 const MachineOperand *Addr = getNamedOperand(MI, AMDGPU::OpName::addr);
9863 assert(Addr && Addr->isFI());
9864 FrameIndex = Addr->getIndex();
9865
9866 int DataIdx =
9867 AMDGPU::getNamedOperandIdx(MI.getOpcode(), AMDGPU::OpName::data);
9868 MemBytes = TypeSize::getFixed(getOpSize(MI.getOpcode(), DataIdx));
9869 return MI.getOperand(DataIdx).getReg();
9870}
9871
9872Register SIInstrInfo::isLoadFromStackSlot(const MachineInstr &MI,
9873 int &FrameIndex,
9874 TypeSize &MemBytes) const {
9875 if (!MI.mayLoad())
9876 return Register();
9877
9878 if (isMUBUF(MI) || isVGPRSpill(MI))
9879 return isStackAccess(MI, FrameIndex, MemBytes);
9880
9881 if (isSGPRSpill(MI))
9882 return isSGPRStackAccess(MI, FrameIndex, MemBytes);
9883
9884 return Register();
9885}
9886
9887Register SIInstrInfo::isStoreToStackSlot(const MachineInstr &MI,
9888 int &FrameIndex,
9889 TypeSize &MemBytes) const {
9890 if (!MI.mayStore())
9891 return Register();
9892
9893 if (isMUBUF(MI) || isVGPRSpill(MI))
9894 return isStackAccess(MI, FrameIndex, MemBytes);
9895
9896 if (isSGPRSpill(MI))
9897 return isSGPRStackAccess(MI, FrameIndex, MemBytes);
9898
9899 return Register();
9900}
9901
9902unsigned SIInstrInfo::getInstSizeInBytes(const MachineInstr &MI) const {
9903 unsigned Opc = MI.getOpcode();
9904 const MCInstrDesc &Desc = getMCOpcodeFromPseudo(Opc);
9905 unsigned DescSize = Desc.getSize();
9906
9907 // If we have a definitive size, we can use it. Otherwise we need to inspect
9908 // the operands to know the size.
9909 if (isFixedSize(MI)) {
9910 unsigned Size = DescSize;
9911
9912 // If we hit the buggy offset, an extra nop will be inserted in MC so
9913 // estimate the worst case.
9914 if (MI.isBranch() && ST.hasOffset3fBug())
9915 Size += 4;
9916
9917 return Size;
9918 }
9919
9920 // Instructions may have a 32-bit literal encoded after them. Check
9921 // operands that could ever be literals.
9922 if (isVALU(MI) || isSALU(MI)) {
9923 if (isDPP(MI))
9924 return DescSize;
9925 bool HasLiteral = false;
9926 unsigned LiteralSize = 4;
9927 for (int I = 0, E = MI.getNumExplicitOperands(); I != E; ++I) {
9928 const MachineOperand &Op = MI.getOperand(I);
9929 const MCOperandInfo &OpInfo = Desc.operands()[I];
9930 if (!Op.isReg() && !isInlineConstant(Op, OpInfo)) {
9931 HasLiteral = true;
9932 if (ST.has64BitLiterals()) {
9933 switch (OpInfo.OperandType) {
9934 default:
9935 break;
9936 case AMDGPU::OPERAND_REG_IMM_FP64:
9937 if (!AMDGPU::isValid32BitLiteral(Op.getImm(), true))
9938 LiteralSize = 8;
9939 break;
9940 case AMDGPU::OPERAND_REG_IMM_INT64:
9941 // A 32-bit literal is only valid when the value fits in BOTH the signed
9942 // and unsigned 32-bit ranges, i.e. [0, 2^31-1], matching the MC code
9943 // emitter's getLit64Encoding logic. Because we cannot tell the
9944 // signedness of the literal, we must be conservative and assume that
9945 // values outside this range require a 64-bit literal encoding
9946 // (8 bytes).
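// For example, an operand value of 0x80000000 passes isUInt<32> but fails
// isInt<32>, so it is conservatively given an 8-byte literal encoding.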
9947 if (!Op.isImm() || !isInt<32>(Op.getImm()) ||
9948 !isUInt<32>(Op.getImm()))
9949 LiteralSize = 8;
9950 break;
9951 }
9952 }
9953 break;
9954 }
9955 }
9956 return HasLiteral ? DescSize + LiteralSize : DescSize;
9957 }
9958
9959 // Check whether we have extra NSA words.
9960 if (isMIMG(MI)) {
9961 int VAddr0Idx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::vaddr0);
9962 if (VAddr0Idx < 0)
9963 return 8;
9964
9965 int RSrcIdx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::srsrc);
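// For example, with five vaddr operands (RSrcIdx - VAddr0Idx == 5) this
// evaluates to 8 + 4 * ((5 + 2) / 4) = 12 bytes, i.e. one extra NSA dword
// beyond the 8-byte base encoding.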
9966 return 8 + 4 * ((RSrcIdx - VAddr0Idx + 2) / 4);
9967 }
9968
9969 switch (Opc) {
9970 case TargetOpcode::BUNDLE:
9971 return getInstBundleSize(MI);
9972 case TargetOpcode::INLINEASM:
9973 case TargetOpcode::INLINEASM_BR: {
9974 const MachineFunction *MF = MI.getMF();
9975 const char *AsmStr = MI.getOperand(0).getSymbolName();
9976 return getInlineAsmLength(AsmStr, MF->getTarget().getMCAsmInfo(), &ST);
9977 }
9978 default:
9979 if (MI.isMetaInstruction())
9980 return 0;
9981
9982 // If D16 Pseudo inst, get correct MC code size
9983 const auto *D16Info = AMDGPU::getT16D16Helper(Opc);
9984 if (D16Info) {
9985 // Assume d16_lo/hi insts are always the same size
9986 unsigned LoInstOpcode = D16Info->LoOp;
9987 const MCInstrDesc &Desc = getMCOpcodeFromPseudo(LoInstOpcode);
9988 DescSize = Desc.getSize();
9989 }
9990
9991 // If FMA Pseudo inst, get correct MC code size
9992 if (Opc == AMDGPU::V_FMA_MIX_F16_t16 || Opc == AMDGPU::V_FMA_MIX_BF16_t16) {
9993 // All potential lowerings are the same size; arbitrarily pick one.
9994 const MCInstrDesc &Desc = getMCOpcodeFromPseudo(AMDGPU::V_FMA_MIXLO_F16);
9995 DescSize = Desc.getSize();
9996 }
9997
9998 return DescSize;
9999 }
10000}
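// Worked example (illustrative) of the literal handling above: an SALU
// instruction such as s_mov_b32 s0, 0x12345678 carries a non-inline
// immediate, so its size is DescSize (4) plus a 4-byte literal, i.e. 8 bytes.
// On subtargets with 64-bit literals, an operand type that accepts them and
// a value outside the [0, 2^31-1] range common to the signed and unsigned
// 32-bit ranges would add 8 bytes instead, for a total of 12.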
10001
10004 if (MI.isBranch() && ST.hasOffset3fBug())
10005 return InstSizeVerifyMode::NoVerify;
10006 return InstSizeVerifyMode::ExactSize;
10007}
10008
10010 if (!isFLAT(MI))
10011 return false;
10012
10013 if (MI.memoperands_empty())
10014 return true;
10015
10016 for (const MachineMemOperand *MMO : MI.memoperands()) {
10017 if (MMO->getAddrSpace() == AMDGPUAS::FLAT_ADDRESS)
10018 return true;
10019 }
10020 return false;
10021}
10022
10025 static const std::pair<int, const char *> TargetIndices[] = {
10026 {AMDGPU::TI_CONSTDATA_START, "amdgpu-constdata-start"},
10027 {AMDGPU::TI_SCRATCH_RSRC_DWORD0, "amdgpu-scratch-rsrc-dword0"},
10028 {AMDGPU::TI_SCRATCH_RSRC_DWORD1, "amdgpu-scratch-rsrc-dword1"},
10029 {AMDGPU::TI_SCRATCH_RSRC_DWORD2, "amdgpu-scratch-rsrc-dword2"},
10030 {AMDGPU::TI_SCRATCH_RSRC_DWORD3, "amdgpu-scratch-rsrc-dword3"}};
10031 return ArrayRef(TargetIndices);
10032}
10033
10034/// This is used by the post-RA scheduler (SchedulePostRAList.cpp). The
10035/// post-RA version of misched uses CreateTargetMIHazardRecognizer.
10038 const ScheduleDAG *DAG) const {
10039 return new GCNHazardRecognizer(DAG->MF);
10040}
10041
10042/// This is the hazard recognizer used at -O0 by the PostRAHazardRecognizer
10043/// pass.
10046 MachineLoopInfo *MLI) const {
10047 return new GCNHazardRecognizer(MF, MLI);
10048}
10049
10050// Called during:
10051// - pre-RA scheduling and post-RA scheduling
10054 const ScheduleDAGMI *DAG) const {
10055 // Borrowed from Arm Target
10056 // We would like to restrict this hazard recognizer to only
10057 // post-RA scheduling; we can tell that we're post-RA because we don't
10058 // track VRegLiveness.
10059 if (!DAG->hasVRegLiveness())
10060 return new GCNHazardRecognizer(DAG->MF);
10062}
10063
10064std::pair<unsigned, unsigned>
10065SIInstrInfo::decomposeMachineOperandsTargetFlags(unsigned TF) const {
10066 return std::pair(TF & MO_MASK, TF & ~MO_MASK);
10067}
10068
10071 static const std::pair<unsigned, const char *> TargetFlags[] = {
10072 {MO_GOTPCREL, "amdgpu-gotprel"},
10073 {MO_GOTPCREL32_LO, "amdgpu-gotprel32-lo"},
10074 {MO_GOTPCREL32_HI, "amdgpu-gotprel32-hi"},
10075 {MO_GOTPCREL64, "amdgpu-gotprel64"},
10076 {MO_REL32_LO, "amdgpu-rel32-lo"},
10077 {MO_REL32_HI, "amdgpu-rel32-hi"},
10078 {MO_REL64, "amdgpu-rel64"},
10079 {MO_ABS32_LO, "amdgpu-abs32-lo"},
10080 {MO_ABS32_HI, "amdgpu-abs32-hi"},
10081 {MO_ABS64, "amdgpu-abs64"},
10082 };
10083
10084 return ArrayRef(TargetFlags);
10085}
10086
10089 static const std::pair<MachineMemOperand::Flags, const char *> TargetFlags[] =
10090 {
10091 {MONoClobber, "amdgpu-noclobber"},
10092 {MOLastUse, "amdgpu-last-use"},
10093 {MOCooperative, "amdgpu-cooperative"},
10094 {MOThreadPrivate, "amdgpu-thread-private"},
10095 };
10096
10097 return ArrayRef(TargetFlags);
10098}
10099
10101 const MachineFunction &MF) const {
10103 assert(SrcReg.isVirtual());
10104 if (MFI->checkFlag(SrcReg, AMDGPU::VirtRegFlag::WWM_REG))
10105 return AMDGPU::WWM_COPY;
10106
10107 return AMDGPU::COPY;
10108}
10109
10111 uint32_t Opcode = MI.getOpcode();
10112 // Check if it is SGPR spill or wwm-register spill Opcode.
10113 if (isSGPRSpill(Opcode) || isWWMRegSpillOpcode(Opcode))
10114 return true;
10115
10116 const MachineFunction *MF = MI.getMF();
10117 const MachineRegisterInfo &MRI = MF->getRegInfo();
10119
10120 // See if this is a live-range split instruction inserted for an SGPR or
10121 // wwm-register. The implicit defs inserted for wwm-registers should also be
10122 // included as they can appear at the beginning of the BB.
10123 bool IsLRSplitInst = MI.getFlag(MachineInstr::LRSplit);
10124 if (!IsLRSplitInst && Opcode != AMDGPU::IMPLICIT_DEF)
10125 return false;
10126
10127 Register Reg = MI.getOperand(0).getReg();
10128 if (RI.isSGPRClass(RI.getRegClassForReg(MRI, Reg)))
10129 return IsLRSplitInst;
10130
10131 return MFI->isWWMReg(Reg);
10132}
10133
10135 Register Reg) const {
10136 // We need to handle instructions which may be inserted during register
10137 // allocation to handle the prolog. The initial prolog instruction may have
10138 // been separated from the start of the block by spills and copies inserted
10139 // for the prolog itself. However, the insertions for scalar registers can
10140 // always be placed at the top of the BB as they are independent of the exec
10141 // mask value.
10142 bool IsNullOrVectorRegister = true;
10143 if (Reg) {
10144 const MachineFunction *MF = MI.getMF();
10145 const MachineRegisterInfo &MRI = MF->getRegInfo();
10146 IsNullOrVectorRegister = !RI.isSGPRClass(RI.getRegClassForReg(MRI, Reg));
10147 }
10148
10149 return IsNullOrVectorRegister &&
10150 (canAddToBBProlog(MI) ||
10151 (!MI.isTerminator() && MI.getOpcode() != AMDGPU::COPY &&
10152 MI.modifiesRegister(AMDGPU::EXEC, &RI)));
10153}
10154
10158 const DebugLoc &DL,
10159 Register DestReg) const {
10160 if (ST.hasAddNoCarryInsts())
10161 return BuildMI(MBB, I, DL, get(AMDGPU::V_ADD_U32_e64), DestReg);
10162
10163 MachineRegisterInfo &MRI = MBB.getParent()->getRegInfo();
10164 Register UnusedCarry = MRI.createVirtualRegister(RI.getBoolRC());
10165 MRI.setRegAllocationHint(UnusedCarry, 0, RI.getVCC());
10166
10167 return BuildMI(MBB, I, DL, get(AMDGPU::V_ADD_CO_U32_e64), DestReg)
10168 .addReg(UnusedCarry, RegState::Define | RegState::Dead);
10169}
10170
10173 const DebugLoc &DL,
10174 Register DestReg,
10175 RegScavenger &RS) const {
10176 if (ST.hasAddNoCarryInsts())
10177 return BuildMI(MBB, I, DL, get(AMDGPU::V_ADD_U32_e32), DestReg);
10178
10179 // If available, prefer to use vcc.
10180 Register UnusedCarry = !RS.isRegUsed(AMDGPU::VCC)
10181 ? Register(RI.getVCC())
10182 : RS.scavengeRegisterBackwards(
10183 *RI.getBoolRC(), I, /* RestoreAfter */ false,
10184 0, /* AllowSpill */ false);
10185
10186 // TODO: Users need to deal with this.
10187 if (!UnusedCarry.isValid())
10188 return MachineInstrBuilder();
10189
10190 return BuildMI(MBB, I, DL, get(AMDGPU::V_ADD_CO_U32_e64), DestReg)
10191 .addReg(UnusedCarry, RegState::Define | RegState::Dead);
10192}
10193
10194bool SIInstrInfo::isKillTerminator(unsigned Opcode) {
10195 switch (Opcode) {
10196 case AMDGPU::SI_KILL_F32_COND_IMM_TERMINATOR:
10197 case AMDGPU::SI_KILL_I1_TERMINATOR:
10198 return true;
10199 default:
10200 return false;
10201 }
10202}
10203
10205 switch (Opcode) {
10206 case AMDGPU::SI_KILL_F32_COND_IMM_PSEUDO:
10207 return get(AMDGPU::SI_KILL_F32_COND_IMM_TERMINATOR);
10208 case AMDGPU::SI_KILL_I1_PSEUDO:
10209 return get(AMDGPU::SI_KILL_I1_TERMINATOR);
10210 default:
10211 llvm_unreachable("invalid opcode, expected SI_KILL_*_PSEUDO");
10212 }
10213}
10214
10215bool SIInstrInfo::isLegalMUBUFImmOffset(unsigned Imm) const {
10216 return Imm <= getMaxMUBUFImmOffset(ST);
10217}
10218
10220 // The GFX12 field is a 24-bit signed byte offset restricted to non-negative values.
10221 const unsigned OffsetBits =
10222 ST.getGeneration() >= AMDGPUSubtarget::GFX12 ? 23 : 12;
10223 return (1 << OffsetBits) - 1;
10224}
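// For reference (derived from the computation above): the maximum MUBUF
// immediate offset is (1 << 12) - 1 = 4095 before GFX12 and
// (1 << 23) - 1 = 8388607 on GFX12 and later.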
10225
10227 if (!ST.isWave32())
10228 return;
10229
10230 if (MI.isInlineAsm())
10231 return;
10232
10233 if (MI.getNumOperands() < MI.getNumExplicitOperands())
10234 return;
10235
10236 for (auto &Op : MI.implicit_operands()) {
10237 if (Op.isReg() && Op.getReg() == AMDGPU::VCC)
10238 Op.setReg(AMDGPU::VCC_LO);
10239 }
10240}
10241
10243 if (!isSMRD(MI))
10244 return false;
10245
10246 // Check that it is using a buffer resource.
10247 int Idx = AMDGPU::getNamedOperandIdx(MI.getOpcode(), AMDGPU::OpName::sbase);
10248 if (Idx == -1) // e.g. s_memtime
10249 return false;
10250
10251 const int16_t RCID = getOpRegClassID(MI.getDesc().operands()[Idx]);
10252 return RI.getRegClass(RCID)->hasSubClassEq(&AMDGPU::SGPR_128RegClass);
10253}
10254
10255// Given Imm, split it into the values to put into the SOffset and ImmOffset
10256// fields in an MUBUF instruction. Return false if it is not possible (due to a
10257// hardware bug needing a workaround).
10258//
10259// The required alignment ensures that individual address components remain
10260// aligned if they are aligned to begin with. It also ensures that additional
10261// offsets within the given alignment can be added to the resulting ImmOffset.
10263 uint32_t &ImmOffset, Align Alignment) const {
10264 const uint32_t MaxOffset = SIInstrInfo::getMaxMUBUFImmOffset(ST);
10265 const uint32_t MaxImm = alignDown(MaxOffset, Alignment.value());
10266 uint32_t Overflow = 0;
10267
10268 if (Imm > MaxImm) {
10269 if (Imm <= MaxImm + 64) {
10270 // Use an SOffset inline constant for 4..64
10271 Overflow = Imm - MaxImm;
10272 Imm = MaxImm;
10273 } else {
10274 // Try to keep the same value in SOffset for adjacent loads, so that
10275 // the corresponding register contents can be re-used.
10276 //
10277 // Load values with all low-bits (except for alignment bits) set into
10278 // SOffset, so that a larger range of values can be covered using
10279 // s_movk_i32.
10280 //
10281 // Atomic operations fail to work correctly when individual address
10282 // components are unaligned, even if their sum is aligned.
10283 uint32_t High = (Imm + Alignment.value()) & ~MaxOffset;
10284 uint32_t Low = (Imm + Alignment.value()) & MaxOffset;
10285 Imm = Low;
10286 Overflow = High - Alignment.value();
10287 }
10288 }
10289
10290 if (Overflow > 0) {
10291 // There is a hardware bug in SI and CI which prevents address clamping in
10292 // MUBUF instructions from working correctly with SOffsets. The immediate
10293 // offset is unaffected.
10294 if (ST.getGeneration() <= AMDGPUSubtarget::SEA_ISLANDS)
10295 return false;
10296
10297 // It is not possible to set immediate in SOffset field on some targets.
10298 if (ST.hasRestrictedSOffset())
10299 return false;
10300 }
10301
10302 ImmOffset = Imm;
10303 SOffset = Overflow;
10304 return true;
10305}
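// Worked example (illustrative), assuming a pre-GFX12 target where
// getMaxMUBUFImmOffset returns 4095 and Alignment is 4, so
// MaxImm = alignDown(4095, 4) = 4092:
//   Imm = 4100: 4100 <= 4092 + 64, so SOffset = 8 (an inline constant) and
//               ImmOffset = 4092.
//   Imm = 5000: High = (5000 + 4) & ~4095 = 4096 and
//               Low = (5000 + 4) & 4095 = 908, so ImmOffset = 908 and
//               SOffset = 4096 - 4 = 4092 (4092 + 908 == 5000).
// On SI/CI, or on targets with a restricted SOffset, both cases return false
// because Overflow is non-zero.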
10306
10307// Depending on the used address space and instructions, some immediate offsets
10308// are allowed and some are not.
10309 // Pre-GFX12, flat instruction offsets can only be non-negative; global and
10310// scratch instruction offsets can also be negative. On GFX12, offsets can be
10311// negative for all variants.
10312//
10313// There are several bugs related to these offsets:
10314// On gfx10.1, flat instructions that go into the global address space cannot
10315// use an offset.
10316//
10317// For scratch instructions, the address can be either an SGPR or a VGPR.
10318// The following offsets can be used, depending on the architecture (x means
10319// cannot be used):
10320// +----------------------------+------+------+
10321// | Address-Mode | SGPR | VGPR |
10322// +----------------------------+------+------+
10323// | gfx9 | | |
10324// | negative, 4-aligned offset | x | ok |
10325// | negative, unaligned offset | x | ok |
10326// +----------------------------+------+------+
10327// | gfx10 | | |
10328// | negative, 4-aligned offset | ok | ok |
10329// | negative, unaligned offset | ok | x |
10330// +----------------------------+------+------+
10331// | gfx10.3 | | |
10332// | negative, 4-aligned offset | ok | ok |
10333// | negative, unaligned offset | ok | ok |
10334// +----------------------------+------+------+
10335//
10336// This function ignores the addressing mode, so if an offset cannot be used in
10337// one addressing mode, it is considered illegal.
10338bool SIInstrInfo::isLegalFLATOffset(int64_t Offset, unsigned AddrSpace,
10339 uint64_t FlatVariant) const {
10340 // TODO: Should 0 be special cased?
10341 if (!ST.hasFlatInstOffsets())
10342 return false;
10343
10344 if (ST.hasFlatSegmentOffsetBug() && FlatVariant == SIInstrFlags::FLAT &&
10345 (AddrSpace == AMDGPUAS::FLAT_ADDRESS ||
10346 AddrSpace == AMDGPUAS::GLOBAL_ADDRESS))
10347 return false;
10348
10349 if (ST.hasNegativeUnalignedScratchOffsetBug() &&
10350 FlatVariant == SIInstrFlags::FlatScratch && Offset < 0 &&
10351 (Offset % 4) != 0) {
10352 return false;
10353 }
10354
10355 bool AllowNegative = allowNegativeFlatOffset(FlatVariant);
10356 unsigned N = AMDGPU::getNumFlatOffsetBits(ST);
10357 return isIntN(N, Offset) && (AllowNegative || Offset >= 0);
10358}
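// Illustrative example, assuming a 13-bit signed offset field (the actual
// width comes from AMDGPU::getNumFlatOffsetBits and varies by subtarget):
// Offset = -4096 is representable and legal for variants that allow negative
// offsets (global/scratch pre-GFX12, every variant on GFX12+, subject to the
// scratch checks in allowNegativeFlatOffset), but it is rejected for plain
// FLAT before GFX12 because AllowNegative is false there.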
10359
10360// See comment on SIInstrInfo::isLegalFLATOffset for what is legal and what not.
10361std::pair<int64_t, int64_t>
10362SIInstrInfo::splitFlatOffset(int64_t COffsetVal, unsigned AddrSpace,
10363 uint64_t FlatVariant) const {
10364 int64_t RemainderOffset = COffsetVal;
10365 int64_t ImmField = 0;
10366
10367 bool AllowNegative = allowNegativeFlatOffset(FlatVariant);
10368 const unsigned NumBits = AMDGPU::getNumFlatOffsetBits(ST) - 1;
10369
10370 if (AllowNegative) {
10371 // Use signed division by a power of two to truncate towards 0.
10372 int64_t D = 1LL << NumBits;
10373 RemainderOffset = (COffsetVal / D) * D;
10374 ImmField = COffsetVal - RemainderOffset;
10375
10376 if (ST.hasNegativeUnalignedScratchOffsetBug() &&
10377 FlatVariant == SIInstrFlags::FlatScratch && ImmField < 0 &&
10378 (ImmField % 4) != 0) {
10379 // Make ImmField a multiple of 4
10380 RemainderOffset += ImmField % 4;
10381 ImmField -= ImmField % 4;
10382 }
10383 } else if (COffsetVal >= 0) {
10384 ImmField = COffsetVal & maskTrailingOnes<uint64_t>(NumBits);
10385 RemainderOffset = COffsetVal - ImmField;
10386 }
10387
10388 assert(isLegalFLATOffset(ImmField, AddrSpace, FlatVariant));
10389 assert(RemainderOffset + ImmField == COffsetVal);
10390 return {ImmField, RemainderOffset};
10391}
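// Worked example (illustrative), assuming AMDGPU::getNumFlatOffsetBits(ST)
// returns 13 so NumBits == 12, with negative offsets allowed:
//   COffsetVal = 10000: D = 4096, RemainderOffset = (10000 / 4096) * 4096 =
//   8192, ImmField = 1808; the parts sum to 10000 and 1808 fits the field.
//   COffsetVal = -5000: RemainderOffset = -4096, ImmField = -904, which also
//   fits and sums back to -5000.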
10392
10394 if (ST.hasNegativeScratchOffsetBug() &&
10395 FlatVariant == SIInstrFlags::FlatScratch)
10396 return false;
10397
10398 return FlatVariant != SIInstrFlags::FLAT || AMDGPU::isGFX12Plus(ST);
10399}
10400
10401static unsigned subtargetEncodingFamily(const GCNSubtarget &ST) {
10402 switch (ST.getGeneration()) {
10403 default:
10404 break;
10407 return SIEncodingFamily::SI;
10410 return SIEncodingFamily::VI;
10414 return ST.hasGFX11_7Insts() ? SIEncodingFamily::GFX1170
10417 return ST.hasGFX1250Insts() ? SIEncodingFamily::GFX1250
10421 }
10422 llvm_unreachable("Unknown subtarget generation!");
10423}
10424
10425bool SIInstrInfo::isAsmOnlyOpcode(int MCOp) const {
10426 switch(MCOp) {
10427 // These opcodes use indirect register addressing, so
10428 // they need special handling by codegen (currently missing).
10429 // Therefore it is too risky to allow these opcodes
10430 // to be selected by the DPP combiner or the SDWA peephole pass.
10431 case AMDGPU::V_MOVRELS_B32_dpp_gfx10:
10432 case AMDGPU::V_MOVRELS_B32_sdwa_gfx10:
10433 case AMDGPU::V_MOVRELD_B32_dpp_gfx10:
10434 case AMDGPU::V_MOVRELD_B32_sdwa_gfx10:
10435 case AMDGPU::V_MOVRELSD_B32_dpp_gfx10:
10436 case AMDGPU::V_MOVRELSD_B32_sdwa_gfx10:
10437 case AMDGPU::V_MOVRELSD_2_B32_dpp_gfx10:
10438 case AMDGPU::V_MOVRELSD_2_B32_sdwa_gfx10:
10439 return true;
10440 default:
10441 return false;
10442 }
10443}
10444
10445#define GENERATE_RENAMED_GFX9_CASES(OPCODE) \
10446 case OPCODE##_dpp: \
10447 case OPCODE##_e32: \
10448 case OPCODE##_e64: \
10449 case OPCODE##_e64_dpp: \
10450 case OPCODE##_sdwa:
10451
10452static bool isRenamedInGFX9(int Opcode) {
10453 switch (Opcode) {
10454 GENERATE_RENAMED_GFX9_CASES(AMDGPU::V_ADDC_U32)
10455 GENERATE_RENAMED_GFX9_CASES(AMDGPU::V_ADD_CO_U32)
10456 GENERATE_RENAMED_GFX9_CASES(AMDGPU::V_ADD_U32)
10457 GENERATE_RENAMED_GFX9_CASES(AMDGPU::V_SUBBREV_U32)
10458 GENERATE_RENAMED_GFX9_CASES(AMDGPU::V_SUBB_U32)
10459 GENERATE_RENAMED_GFX9_CASES(AMDGPU::V_SUBREV_CO_U32)
10460 GENERATE_RENAMED_GFX9_CASES(AMDGPU::V_SUBREV_U32)
10461 GENERATE_RENAMED_GFX9_CASES(AMDGPU::V_SUB_CO_U32)
10462 GENERATE_RENAMED_GFX9_CASES(AMDGPU::V_SUB_U32)
10463 //
10464 case AMDGPU::V_DIV_FIXUP_F16_gfx9_e64:
10465 case AMDGPU::V_DIV_FIXUP_F16_gfx9_fake16_e64:
10466 case AMDGPU::V_FMA_F16_gfx9_e64:
10467 case AMDGPU::V_FMA_F16_gfx9_fake16_e64:
10468 case AMDGPU::V_INTERP_P2_F16:
10469 case AMDGPU::V_MAD_F16_e64:
10470 case AMDGPU::V_MAD_U16_e64:
10471 case AMDGPU::V_MAD_I16_e64:
10472 return true;
10473 default:
10474 return false;
10475 }
10476}
10477
10478int SIInstrInfo::pseudoToMCOpcode(int Opcode) const {
10479 assert(Opcode == (int)SIInstrInfo::getNonSoftWaitcntOpcode(Opcode) &&
10480 "SIInsertWaitcnts should have promoted soft waitcnt instructions!");
10481
10482 unsigned Gen = subtargetEncodingFamily(ST);
10483
10484 if (ST.getGeneration() == AMDGPUSubtarget::GFX9 && isRenamedInGFX9(Opcode))
10486
10487 // Adjust the encoding family to GFX80 for D16 buffer instructions when the
10488 // subtarget has UnpackedD16VMem feature.
10489 // TODO: remove this when we discard GFX80 encoding.
10490 if (ST.hasUnpackedD16VMem() && (get(Opcode).TSFlags & SIInstrFlags::D16Buf))
10492
10493 if (get(Opcode).TSFlags & SIInstrFlags::SDWA) {
10494 switch (ST.getGeneration()) {
10495 default:
10497 break;
10500 break;
10503 break;
10504 }
10505 }
10506
10507 if (isMAI(Opcode)) {
10508 int MFMAOp = AMDGPU::getMFMAEarlyClobberOp(Opcode);
10509 if (MFMAOp != -1)
10510 Opcode = MFMAOp;
10511 }
10512
10513 int32_t MCOp = AMDGPU::getMCOpcode(Opcode, Gen);
10514
10515 if (MCOp == AMDGPU::INSTRUCTION_LIST_END && ST.hasGFX11_7Insts())
10517
10518 if (MCOp == AMDGPU::INSTRUCTION_LIST_END && ST.hasGFX1250Insts())
10520
10521 // -1 means that Opcode is already a native instruction.
10522 if (MCOp == -1)
10523 return Opcode;
10524
10525 if (ST.hasGFX90AInsts()) {
10526 uint32_t NMCOp = AMDGPU::INSTRUCTION_LIST_END;
10527 if (ST.hasGFX940Insts())
10529 if (NMCOp == AMDGPU::INSTRUCTION_LIST_END)
10531 if (NMCOp == AMDGPU::INSTRUCTION_LIST_END)
10533 if (NMCOp != AMDGPU::INSTRUCTION_LIST_END)
10534 MCOp = NMCOp;
10535 }
10536
10537 // INSTRUCTION_LIST_END means that Opcode is a pseudo instruction that has no
10538 // encoding in the given subtarget generation.
10539 if (MCOp == AMDGPU::INSTRUCTION_LIST_END)
10540 return -1;
10541
10542 if (isAsmOnlyOpcode(MCOp))
10543 return -1;
10544
10545 return MCOp;
10546}
10547
10548static
10550 assert(RegOpnd.isReg());
10551 return RegOpnd.isUndef() ? TargetInstrInfo::RegSubRegPair() :
10552 getRegSubRegPair(RegOpnd);
10553}
10554
10557 assert(MI.isRegSequence());
10558 for (unsigned I = 0, E = (MI.getNumOperands() - 1)/ 2; I < E; ++I)
10559 if (MI.getOperand(1 + 2 * I + 1).getImm() == SubReg) {
10560 auto &RegOp = MI.getOperand(1 + 2 * I);
10561 return getRegOrUndef(RegOp);
10562 }
10564}
10565
10566// Try to find the definition of reg:subreg in subreg-manipulation pseudos
10567// Following a subreg of reg:subreg isn't supported
10570 if (!RSR.SubReg)
10571 return false;
10572 switch (MI.getOpcode()) {
10573 default: break;
10574 case AMDGPU::REG_SEQUENCE:
10575 RSR = getRegSequenceSubReg(MI, RSR.SubReg);
10576 return true;
10577 // EXTRACT_SUBREG isn't supported as this would follow a subreg of a subreg
10578 case AMDGPU::INSERT_SUBREG:
10579 if (RSR.SubReg == (unsigned)MI.getOperand(3).getImm())
10580 // inserted the subreg we're looking for
10581 RSR = getRegOrUndef(MI.getOperand(2));
10582 else { // the subreg in the rest of the reg
10583 auto R1 = getRegOrUndef(MI.getOperand(1));
10584 if (R1.SubReg) // subreg of subreg isn't supported
10585 return false;
10586 RSR.Reg = R1.Reg;
10587 }
10588 return true;
10589 }
10590 return false;
10591}
10592
10594 const MachineRegisterInfo &MRI) {
10595 assert(MRI.isSSA());
10596 if (!P.Reg.isVirtual())
10597 return nullptr;
10598
10599 auto RSR = P;
10600 auto *DefInst = MRI.getVRegDef(RSR.Reg);
10601 while (auto *MI = DefInst) {
10602 DefInst = nullptr;
10603 switch (MI->getOpcode()) {
10604 case AMDGPU::COPY:
10605 case AMDGPU::V_MOV_B32_e32: {
10606 auto &Op1 = MI->getOperand(1);
10607 if (Op1.isReg() && Op1.getReg().isVirtual()) {
10608 if (Op1.isUndef())
10609 return nullptr;
10610 RSR = getRegSubRegPair(Op1);
10611 DefInst = MRI.getVRegDef(RSR.Reg);
10612 }
10613 break;
10614 }
10615 default:
10616 if (followSubRegDef(*MI, RSR)) {
10617 if (!RSR.Reg)
10618 return nullptr;
10619 DefInst = MRI.getVRegDef(RSR.Reg);
10620 }
10621 }
10622 if (!DefInst)
10623 return MI;
10624 }
10625 return nullptr;
10626}
10627
10629 Register VReg,
10630 const MachineInstr &DefMI,
10631 const MachineInstr &UseMI) {
10632 assert(MRI.isSSA() && "Must be run on SSA");
10633
10634 auto *TRI = MRI.getTargetRegisterInfo();
10635 auto *DefBB = DefMI.getParent();
10636
10637 // Don't bother searching between blocks, although it is possible this block
10638 // doesn't modify exec.
10639 if (UseMI.getParent() != DefBB)
10640 return true;
10641
10642 const int MaxInstScan = 20;
10643 int NumInst = 0;
10644
10645 // Stop scan at the use.
10646 auto E = UseMI.getIterator();
10647 for (auto I = std::next(DefMI.getIterator()); I != E; ++I) {
10648 if (I->isDebugInstr())
10649 continue;
10650
10651 if (++NumInst > MaxInstScan)
10652 return true;
10653
10654 if (I->modifiesRegister(AMDGPU::EXEC, TRI))
10655 return true;
10656 }
10657
10658 return false;
10659}
10660
10662 Register VReg,
10663 const MachineInstr &DefMI) {
10664 assert(MRI.isSSA() && "Must be run on SSA");
10665
10666 auto *TRI = MRI.getTargetRegisterInfo();
10667 auto *DefBB = DefMI.getParent();
10668
10669 const int MaxUseScan = 10;
10670 int NumUse = 0;
10671
10672 for (auto &Use : MRI.use_nodbg_operands(VReg)) {
10673 auto &UseInst = *Use.getParent();
10674 // Don't bother searching between blocks, although it is possible this block
10675 // doesn't modify exec.
10676 if (UseInst.getParent() != DefBB || UseInst.isPHI())
10677 return true;
10678
10679 if (++NumUse > MaxUseScan)
10680 return true;
10681 }
10682
10683 if (NumUse == 0)
10684 return false;
10685
10686 const int MaxInstScan = 20;
10687 int NumInst = 0;
10688
10689 // Stop scan when we have seen all the uses.
10690 for (auto I = std::next(DefMI.getIterator()); ; ++I) {
10691 assert(I != DefBB->end());
10692
10693 if (I->isDebugInstr())
10694 continue;
10695
10696 if (++NumInst > MaxInstScan)
10697 return true;
10698
10699 for (const MachineOperand &Op : I->operands()) {
10700 // We don't check reg masks here as they're used only on calls:
10701 // 1. EXEC is only considered const within one BB
10702 // 2. Call should be a terminator instruction if present in a BB
10703
10704 if (!Op.isReg())
10705 continue;
10706
10707 Register Reg = Op.getReg();
10708 if (Op.isUse()) {
10709 if (Reg == VReg && --NumUse == 0)
10710 return false;
10711 } else if (TRI->regsOverlap(Reg, AMDGPU::EXEC))
10712 return true;
10713 }
10714 }
10715}
10716
10719 const DebugLoc &DL, Register Src, Register Dst) const {
10720 auto Cur = MBB.begin();
10721 if (Cur != MBB.end())
10722 do {
10723 if (!Cur->isPHI() && Cur->readsRegister(Dst, /*TRI=*/nullptr))
10724 return BuildMI(MBB, Cur, DL, get(TargetOpcode::COPY), Dst).addReg(Src);
10725 ++Cur;
10726 } while (Cur != MBB.end() && Cur != LastPHIIt);
10727
10728 return TargetInstrInfo::createPHIDestinationCopy(MBB, LastPHIIt, DL, Src,
10729 Dst);
10730}
10731
10734 const DebugLoc &DL, Register Src, unsigned SrcSubReg, Register Dst) const {
10735 if (InsPt != MBB.end() &&
10736 (InsPt->getOpcode() == AMDGPU::SI_IF ||
10737 InsPt->getOpcode() == AMDGPU::SI_ELSE ||
10738 InsPt->getOpcode() == AMDGPU::SI_IF_BREAK) &&
10739 InsPt->definesRegister(Src, /*TRI=*/nullptr)) {
10740 InsPt++;
10741 return BuildMI(MBB, InsPt, DL,
10742 get(AMDGPU::LaneMaskConstants::get(ST).MovTermOpc), Dst)
10743 .addReg(Src, {}, SrcSubReg)
10744 .addReg(AMDGPU::EXEC, RegState::Implicit);
10745 }
10746 return TargetInstrInfo::createPHISourceCopy(MBB, InsPt, DL, Src, SrcSubReg,
10747 Dst);
10748}
10749
10750bool llvm::SIInstrInfo::isWave32() const { return ST.isWave32(); }
10751
10753 const MachineInstr &SecondMI) const {
10754 for (const auto &Use : SecondMI.all_uses()) {
10755 if (Use.isReg() && FirstMI.modifiesRegister(Use.getReg(), &RI))
10756 return true;
10757 }
10758 return false;
10759}
10760
10761/// If OpX is multicycle, anti-dependencies are not allowed.
10762/// isDPMACCInstruction was not designed for VOPD, but it is fit for the
10763/// purpose.
10765 const MachineInstr &OpX) const {
10767}
10768
10771 ArrayRef<unsigned> Ops, int FrameIndex,
10772 MachineInstr *&CopyMI, LiveIntervals *LIS,
10773 VirtRegMap *VRM) const {
10774 // This is a bit of a hack (copied from AArch64). Consider this instruction:
10775 //
10776 // %0:sreg_32 = COPY $m0
10777 //
10778 // We explicitly chose SReg_32 for the virtual register so such a copy might
10779 // be eliminated by RegisterCoalescer. However, that may not be possible, and
10780 // %0 may even spill. We can't spill $m0 normally (it would require copying to
10781 // a numbered SGPR anyway), and since it is in the SReg_32 register class,
10782 // TargetInstrInfo::foldMemoryOperand() is going to try.
10783 // A similar issue also exists with spilling and reloading $exec registers.
10784 //
10785 // To prevent that, constrain the %0 register class here.
10786 if (isFullCopyInstr(MI)) {
10787 Register DstReg = MI.getOperand(0).getReg();
10788 Register SrcReg = MI.getOperand(1).getReg();
10789 if ((DstReg.isVirtual() || SrcReg.isVirtual()) &&
10790 (DstReg.isVirtual() != SrcReg.isVirtual())) {
10791 MachineRegisterInfo &MRI = MF.getRegInfo();
10792 Register VirtReg = DstReg.isVirtual() ? DstReg : SrcReg;
10793 const TargetRegisterClass *RC = MRI.getRegClass(VirtReg);
10794 if (RC->hasSuperClassEq(&AMDGPU::SReg_32RegClass)) {
10795 MRI.constrainRegClass(VirtReg, &AMDGPU::SReg_32_XM0_XEXECRegClass);
10796 return nullptr;
10797 }
10798 if (RC->hasSuperClassEq(&AMDGPU::SReg_64RegClass)) {
10799 MRI.constrainRegClass(VirtReg, &AMDGPU::SReg_64_XEXECRegClass);
10800 return nullptr;
10801 }
10802 }
10803 }
10804
10805 return nullptr;
10806}
10807
10809 const MachineInstr &MI,
10810 unsigned *PredCost) const {
10811 if (MI.isBundle()) {
10813 MachineBasicBlock::const_instr_iterator E(MI.getParent()->instr_end());
10814 unsigned Lat = 0, Count = 0;
10815 for (++I; I != E && I->isBundledWithPred(); ++I) {
10816 ++Count;
10817 Lat = std::max(Lat, SchedModel.computeInstrLatency(&*I));
10818 }
10819 return Lat + Count - 1;
10820 }
10821
10822 return SchedModel.computeInstrLatency(&MI);
10823}
10824
10825const MachineOperand &
10827 if (const MachineOperand *CallAddrOp =
10828 getNamedOperand(MI, AMDGPU::OpName::src0))
10829 return *CallAddrOp;
10831}
10832
10835 const MachineRegisterInfo &MRI = MI.getMF()->getRegInfo();
10836 unsigned Opcode = MI.getOpcode();
10837
10838 auto HandleAddrSpaceCast = [this, &MRI](const MachineInstr &MI) {
10839 Register Dst = MI.getOperand(0).getReg();
10840 Register Src = isa<GIntrinsic>(MI) ? MI.getOperand(2).getReg()
10841 : MI.getOperand(1).getReg();
10842 LLT DstTy = MRI.getType(Dst);
10843 LLT SrcTy = MRI.getType(Src);
10844 unsigned DstAS = DstTy.getAddressSpace();
10845 unsigned SrcAS = SrcTy.getAddressSpace();
10846 return SrcAS == AMDGPUAS::PRIVATE_ADDRESS &&
10847 DstAS == AMDGPUAS::FLAT_ADDRESS &&
10848 ST.hasGloballyAddressableScratch()
10851 };
10852
10853 // If the target supports globally addressable scratch, the mapping from
10854 // scratch memory to the flat aperture changes, and therefore an address space
10855 // cast is no longer uniform.
10856 if (Opcode == TargetOpcode::G_ADDRSPACE_CAST)
10857 return HandleAddrSpaceCast(MI);
10858
10859 if (auto *GI = dyn_cast<GIntrinsic>(&MI)) {
10860 auto IID = GI->getIntrinsicID();
10865
10866 switch (IID) {
10867 case Intrinsic::amdgcn_addrspacecast_nonnull:
10868 return HandleAddrSpaceCast(MI);
10869 case Intrinsic::amdgcn_if:
10870 case Intrinsic::amdgcn_else:
10871 // FIXME: Uniform if second result
10872 break;
10873 }
10874
10876 }
10877
10878 // Loads from the private and flat address spaces are divergent, because
10879 // threads can execute the load instruction with the same inputs and get
10880 // different results.
10881 //
10882 // All other loads are not divergent, because if threads issue loads with the
10883 // same arguments, they will always get the same result.
10884 if (Opcode == AMDGPU::G_LOAD || Opcode == AMDGPU::G_ZEXTLOAD ||
10885 Opcode == AMDGPU::G_SEXTLOAD) {
10886 if (MI.memoperands_empty())
10887 return ValueUniformity::NeverUniform; // conservative assumption
10888
10889 if (llvm::any_of(MI.memoperands(), [](const MachineMemOperand *mmo) {
10890 return mmo->getAddrSpace() == AMDGPUAS::PRIVATE_ADDRESS ||
10891 mmo->getAddrSpace() == AMDGPUAS::FLAT_ADDRESS;
10892 })) {
10893 // At least one MMO in a non-global address space.
10895 }
10897 }
10898
10899 if (SIInstrInfo::isGenericAtomicRMWOpcode(Opcode) ||
10900 Opcode == AMDGPU::G_ATOMIC_CMPXCHG ||
10901 Opcode == AMDGPU::G_ATOMIC_CMPXCHG_WITH_SUCCESS ||
10902 AMDGPU::isGenericAtomic(Opcode)) {
10904 }
10906}
10907
10909 if (!Formatter)
10910 Formatter = std::make_unique<AMDGPUMIRFormatter>(ST);
10911 return Formatter.get();
10912}
10913
10915
10916 if (isNeverUniform(MI))
10918
10919 unsigned opcode = MI.getOpcode();
10920 if (opcode == AMDGPU::V_READLANE_B32 ||
10921 opcode == AMDGPU::V_READFIRSTLANE_B32 ||
10922 opcode == AMDGPU::SI_RESTORE_S32_FROM_VGPR)
10924
10925 if (isCopyInstr(MI)) {
10926 const MachineOperand &srcOp = MI.getOperand(1);
10927 if (srcOp.isReg() && srcOp.getReg().isPhysical()) {
10928 const TargetRegisterClass *regClass =
10929 RI.getPhysRegBaseClass(srcOp.getReg());
10930 return RI.isSGPRClass(regClass) ? ValueUniformity::AlwaysUniform
10932 }
10934 }
10935
10936 // GMIR handling
10937 if (MI.isPreISelOpcode())
10939
10940 // Atomics are divergent because they are executed sequentially: when an
10941 // atomic operation refers to the same address in each thread, then each
10942 // thread after the first sees the value written by the previous thread as
10943 // its original value.
10944
10945 if (isAtomic(MI))
10947
10948 // Loads from the private and flat address spaces are divergent, because
10949 // threads can execute the load instruction with the same inputs and get
10950 // different results.
10951 if (isFLAT(MI) && MI.mayLoad()) {
10952 if (MI.memoperands_empty())
10953 return ValueUniformity::NeverUniform; // conservative assumption
10954
10955 if (llvm::any_of(MI.memoperands(), [](const MachineMemOperand *mmo) {
10956 return mmo->getAddrSpace() == AMDGPUAS::PRIVATE_ADDRESS ||
10957 mmo->getAddrSpace() == AMDGPUAS::FLAT_ADDRESS;
10958 })) {
10959 // At least one MMO in a non-global address space.
10961 }
10962
10964 }
10965
10966 const MachineRegisterInfo &MRI = MI.getMF()->getRegInfo();
10967 const AMDGPURegisterBankInfo *RBI = ST.getRegBankInfo();
10968
10969 // FIXME: It's conceptually broken to report this for an instruction, and not
10970 // a specific def operand. For inline asm in particular, there could be mixed
10971 // uniform and divergent results.
10972 for (unsigned I = 0, E = MI.getNumOperands(); I != E; ++I) {
10973 const MachineOperand &SrcOp = MI.getOperand(I);
10974 if (!SrcOp.isReg())
10975 continue;
10976
10977 Register Reg = SrcOp.getReg();
10978 if (!Reg || !SrcOp.readsReg())
10979 continue;
10980
10981 // If RegBank is null, this is unassigned or an unallocatable special
10982 // register, which are all scalars.
10983 const RegisterBank *RegBank = RBI->getRegBank(Reg, MRI, RI);
10984 if (RegBank && RegBank->getID() != AMDGPU::SGPRRegBankID)
10986 }
10987
10988 // TODO: The uniformity check conditions above can be rearranged for more
10989 // readability.
10990
10991 // TODO: amdgcn.{ballot, [if]cmp} should be AlwaysUniform, but they are
10992 // currently turned into no-op COPYs by SelectionDAG ISel and are
10993 // therefore no longer recognizable.
10994
10996}
10997
10999 switch (MF.getFunction().getCallingConv()) {
11001 return 1;
11003 return 2;
11005 return 3;
11009 const Function &F = MF.getFunction();
11010 F.getContext().diagnose(DiagnosticInfoUnsupported(
11011 F, "ds_ordered_count unsupported for this calling conv"));
11012 [[fallthrough]];
11013 }
11016 case CallingConv::C:
11017 case CallingConv::Fast:
11018 default:
11019 // Assume other calling conventions are various compute callable functions
11020 return 0;
11021 }
11022}
11023
11025 Register &SrcReg2, int64_t &CmpMask,
11026 int64_t &CmpValue) const {
11027 if (!MI.getOperand(0).isReg() || MI.getOperand(0).getSubReg())
11028 return false;
11029
11030 switch (MI.getOpcode()) {
11031 default:
11032 break;
11033 case AMDGPU::S_CMP_EQ_U32:
11034 case AMDGPU::S_CMP_EQ_I32:
11035 case AMDGPU::S_CMP_LG_U32:
11036 case AMDGPU::S_CMP_LG_I32:
11037 case AMDGPU::S_CMP_LT_U32:
11038 case AMDGPU::S_CMP_LT_I32:
11039 case AMDGPU::S_CMP_GT_U32:
11040 case AMDGPU::S_CMP_GT_I32:
11041 case AMDGPU::S_CMP_LE_U32:
11042 case AMDGPU::S_CMP_LE_I32:
11043 case AMDGPU::S_CMP_GE_U32:
11044 case AMDGPU::S_CMP_GE_I32:
11045 case AMDGPU::S_CMP_EQ_U64:
11046 case AMDGPU::S_CMP_LG_U64:
11047 SrcReg = MI.getOperand(0).getReg();
11048 if (MI.getOperand(1).isReg()) {
11049 if (MI.getOperand(1).getSubReg())
11050 return false;
11051 SrcReg2 = MI.getOperand(1).getReg();
11052 CmpValue = 0;
11053 } else if (MI.getOperand(1).isImm()) {
11054 SrcReg2 = Register();
11055 CmpValue = MI.getOperand(1).getImm();
11056 } else {
11057 return false;
11058 }
11059 CmpMask = ~0;
11060 return true;
11061 case AMDGPU::S_CMPK_EQ_U32:
11062 case AMDGPU::S_CMPK_EQ_I32:
11063 case AMDGPU::S_CMPK_LG_U32:
11064 case AMDGPU::S_CMPK_LG_I32:
11065 case AMDGPU::S_CMPK_LT_U32:
11066 case AMDGPU::S_CMPK_LT_I32:
11067 case AMDGPU::S_CMPK_GT_U32:
11068 case AMDGPU::S_CMPK_GT_I32:
11069 case AMDGPU::S_CMPK_LE_U32:
11070 case AMDGPU::S_CMPK_LE_I32:
11071 case AMDGPU::S_CMPK_GE_U32:
11072 case AMDGPU::S_CMPK_GE_I32:
11073 SrcReg = MI.getOperand(0).getReg();
11074 SrcReg2 = Register();
11075 CmpValue = MI.getOperand(1).getImm();
11076 CmpMask = ~0;
11077 return true;
11078 }
11079
11080 return false;
11081}
11082
11084 for (MachineBasicBlock *S : MBB->successors()) {
11085 if (S->isLiveIn(AMDGPU::SCC))
11086 return false;
11087 }
11088 return true;
11089}
11090
11091// Invert all uses of SCC following SCCDef because SCCDef may be deleted and
11092// (incoming SCC) = !(SCC defined by SCCDef).
11093// Return true if all uses can be re-written, false otherwise.
11094bool SIInstrInfo::invertSCCUse(MachineInstr *SCCDef) const {
11095 MachineBasicBlock *MBB = SCCDef->getParent();
11096 SmallVector<MachineInstr *> InvertInstr;
11097 bool SCCIsDead = false;
11098
11099 // Scan instructions for SCC uses that need to be inverted until SCC is dead.
11100 constexpr unsigned ScanLimit = 12;
11101 unsigned Count = 0;
11102 for (MachineInstr &MI :
11103 make_range(std::next(MachineBasicBlock::iterator(SCCDef)), MBB->end())) {
11104 if (++Count > ScanLimit)
11105 return false;
11106 if (MI.readsRegister(AMDGPU::SCC, &RI)) {
11107 if (MI.getOpcode() == AMDGPU::S_CSELECT_B32 ||
11108 MI.getOpcode() == AMDGPU::S_CSELECT_B64 ||
11109 MI.getOpcode() == AMDGPU::S_CBRANCH_SCC0 ||
11110 MI.getOpcode() == AMDGPU::S_CBRANCH_SCC1)
11111 InvertInstr.push_back(&MI);
11112 else
11113 return false;
11114 }
11115 if (MI.definesRegister(AMDGPU::SCC, &RI)) {
11116 SCCIsDead = true;
11117 break;
11118 }
11119 }
11120 if (!SCCIsDead && isSCCDeadOnExit(MBB))
11121 SCCIsDead = true;
11122
11123 // SCC may have more uses. Can't invert all of them.
11124 if (!SCCIsDead)
11125 return false;
11126
11127 // Invert uses
11128 for (MachineInstr *MI : InvertInstr) {
11129 if (MI->getOpcode() == AMDGPU::S_CSELECT_B32 ||
11130 MI->getOpcode() == AMDGPU::S_CSELECT_B64) {
11131 swapOperands(*MI);
11132 } else if (MI->getOpcode() == AMDGPU::S_CBRANCH_SCC0 ||
11133 MI->getOpcode() == AMDGPU::S_CBRANCH_SCC1) {
11134 MI->setDesc(get(MI->getOpcode() == AMDGPU::S_CBRANCH_SCC0
11135 ? AMDGPU::S_CBRANCH_SCC1
11136 : AMDGPU::S_CBRANCH_SCC0));
11137 } else {
11138 llvm_unreachable("SCC used but no inversion handling");
11139 }
11140 }
11141 return true;
11142}
11143
11144// SCC is already valid after SCCValid.
11145// SCCRedefine will redefine SCC to the same value already available after
11146// SCCValid. If there are no intervening SCC conflicts delete SCCRedefine and
11147// update kill/dead flags if necessary.
11148bool SIInstrInfo::optimizeSCC(MachineInstr *SCCValid, MachineInstr *SCCRedefine,
11149 bool NeedInversion) const {
11150 MachineInstr *KillsSCC = nullptr;
11151 if (SCCValid->getParent() != SCCRedefine->getParent())
11152 return false;
11153 for (MachineInstr &MI : make_range(std::next(SCCValid->getIterator()),
11154 SCCRedefine->getIterator())) {
11155 if (MI.modifiesRegister(AMDGPU::SCC, &RI))
11156 return false;
11157 if (MI.killsRegister(AMDGPU::SCC, &RI))
11158 KillsSCC = &MI;
11159 }
11160 if (NeedInversion && !invertSCCUse(SCCRedefine))
11161 return false;
11162 if (MachineOperand *SccDef =
11163 SCCValid->findRegisterDefOperand(AMDGPU::SCC, /*TRI=*/nullptr))
11164 SccDef->setIsDead(false);
11165 if (KillsSCC)
11166 KillsSCC->clearRegisterKills(AMDGPU::SCC, /*TRI=*/nullptr);
11167 SCCRedefine->eraseFromParent();
11168 return true;
11169}
11170
11171static bool foldableSelect(const MachineInstr &Def) {
11172 if (Def.getOpcode() != AMDGPU::S_CSELECT_B32 &&
11173 Def.getOpcode() != AMDGPU::S_CSELECT_B64)
11174 return false;
11175 bool Op1IsNonZeroImm =
11176 Def.getOperand(1).isImm() && Def.getOperand(1).getImm() != 0;
11177 bool Op2IsZeroImm =
11178 Def.getOperand(2).isImm() && Def.getOperand(2).getImm() == 0;
11179 if (!Op1IsNonZeroImm || !Op2IsZeroImm)
11180 return false;
11181 return true;
11182}
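// Illustrative MIR (hypothetical virtual registers): the select
//   %1:sreg_32 = S_CSELECT_B32 1, 0, implicit $scc
// is foldable: the SCC value that fed the select is exactly what a following
// s_cmp_lg_u32 %1, 0 would recompute, so that compare is redundant.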
11183
11184static bool setsSCCIfResultIsZero(const MachineInstr &Def, bool &NeedInversion,
11185 unsigned &NewDefOpc) {
11186 // S_ADD_U32 X, 1 sets SCC on carryout which can only happen if result==0.
11187 // S_ADD_I32 X, 1 can be converted to S_ADD_U32 X, 1 if SCC is dead.
11188 if (Def.getOpcode() != AMDGPU::S_ADD_I32 &&
11189 Def.getOpcode() != AMDGPU::S_ADD_U32)
11190 return false;
11191 const MachineOperand &AddSrc1 = Def.getOperand(1);
11192 const MachineOperand &AddSrc2 = Def.getOperand(2);
11193 int64_t addend;
11194
11195 if ((!AddSrc1.isImm() || AddSrc1.getImm() != 1) &&
11196 (!AddSrc2.isImm() || AddSrc2.getImm() != 1) &&
11197 (!getFoldableImm(&AddSrc1, addend) || addend != 1) &&
11198 (!getFoldableImm(&AddSrc2, addend) || addend != 1))
11199 return false;
11200
11201 if (Def.getOpcode() == AMDGPU::S_ADD_I32) {
11202 const MachineOperand *SccDef =
11203 Def.findRegisterDefOperand(AMDGPU::SCC, /*TRI=*/nullptr);
11204 if (!SccDef->isDead())
11205 return false;
11206 NewDefOpc = AMDGPU::S_ADD_U32;
11207 }
11208 NeedInversion = !NeedInversion;
11209 return true;
11210}
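// Illustrative: s_add_u32 s0, s1, 1 produces a carry-out (SCC = 1) exactly
// when the 32-bit result wraps to 0, i.e. when s1 == 0xffffffff. The add
// therefore sets SCC when its result is zero, which is why NeedInversion is
// flipped for the users of the compare being removed.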
11211
11213 Register SrcReg2, int64_t CmpMask,
11214 int64_t CmpValue,
11215 const MachineRegisterInfo *MRI) const {
11216 if (!SrcReg || SrcReg.isPhysical())
11217 return false;
11218
11219 if (SrcReg2 && !getFoldableImm(SrcReg2, *MRI, CmpValue))
11220 return false;
11221
11222 const auto optimizeCmpSelect = [&CmpInstr, SrcReg, CmpValue, MRI,
11223 this](bool NeedInversion) -> bool {
11224 if (CmpValue != 0)
11225 return false;
11226
11227 MachineInstr *Def = MRI->getVRegDef(SrcReg);
11228 if (!Def)
11229 return false;
11230
11231 // For S_OP that set SCC = DST!=0, do the transformation
11232 //
11233 // s_cmp_[lg|eq]_* (S_OP ...), 0 => (S_OP ...)
11234 //
11235 // For (S_OP ...) that set SCC = DST==0, invert NeedInversion and
11236 // do the transformation:
11237 //
11238 // s_cmp_[lg|eq]_* (S_OP ...), 0 => (S_OP ...)
11239 //
11240 // If foldableSelect, s_cmp_lg_* is redundant because the SCC input value
11241 // for S_CSELECT* already has the same value that will be calculated by
11242 // s_cmp_lg_*
11243 //
11244 // s_cmp_[lg|eq]_* (S_CSELECT* (non-zero imm), 0), 0 => (S_CSELECT*
11245 // (non-zero imm), 0)
11246
11247 unsigned NewDefOpc = Def->getOpcode();
11248 if (!setsSCCIfResultIsNonZero(*Def) &&
11249 !setsSCCIfResultIsZero(*Def, NeedInversion, NewDefOpc) &&
11250 !foldableSelect(*Def))
11251 return false;
11252
11253 if (!optimizeSCC(Def, &CmpInstr, NeedInversion))
11254 return false;
11255
11256 if (NewDefOpc != Def->getOpcode())
11257 Def->setDesc(get(NewDefOpc));
11258
11259 // If the s_or_b32 result, sY, is unused (i.e. it is effectively a 64-bit
11260 // s_cmp_lg of a register pair) and the inputs are the hi and lo halves of a
11261 // 64-bit foldableSelect, then delete the s_or_b32 in the sequence:
11262 // sX = s_cselect_b64 (non-zero imm), 0
11263 // sLo = copy sX.sub0
11264 // sHi = copy sX.sub1
11265 // sY = s_or_b32 sLo, sHi
11266 if (Def->getOpcode() == AMDGPU::S_OR_B32 &&
11267 MRI->use_nodbg_empty(Def->getOperand(0).getReg())) {
11268 const MachineOperand &OrOpnd1 = Def->getOperand(1);
11269 const MachineOperand &OrOpnd2 = Def->getOperand(2);
11270 if (OrOpnd1.isReg() && OrOpnd2.isReg()) {
11271 MachineInstr *Def1 = MRI->getVRegDef(OrOpnd1.getReg());
11272 MachineInstr *Def2 = MRI->getVRegDef(OrOpnd2.getReg());
11273 if (Def1 && Def1->getOpcode() == AMDGPU::COPY && Def2 &&
11274 Def2->getOpcode() == AMDGPU::COPY && Def1->getOperand(1).isReg() &&
11275 Def2->getOperand(1).isReg() &&
11276 Def1->getOperand(1).getSubReg() == AMDGPU::sub0 &&
11277 Def2->getOperand(1).getSubReg() == AMDGPU::sub1 &&
11278 Def1->getOperand(1).getReg() == Def2->getOperand(1).getReg()) {
11279 MachineInstr *Select = MRI->getVRegDef(Def1->getOperand(1).getReg());
11280 if (Select && foldableSelect(*Select))
11281 optimizeSCC(Select, Def, /*NeedInversion=*/false);
11282 }
11283 }
11284 }
11285 return true;
11286 };
11287
11288 const auto optimizeCmpAnd = [&CmpInstr, SrcReg, CmpValue, MRI,
11289 this](int64_t ExpectedValue, unsigned SrcSize,
11290 bool IsReversible, bool IsSigned) -> bool {
11291 // s_cmp_eq_u32 (s_and_b32 $src, 1 << n), 1 << n => s_and_b32 $src, 1 << n
11292 // s_cmp_eq_i32 (s_and_b32 $src, 1 << n), 1 << n => s_and_b32 $src, 1 << n
11293 // s_cmp_ge_u32 (s_and_b32 $src, 1 << n), 1 << n => s_and_b32 $src, 1 << n
11294 // s_cmp_ge_i32 (s_and_b32 $src, 1 << n), 1 << n => s_and_b32 $src, 1 << n
11295 // s_cmp_eq_u64 (s_and_b64 $src, 1 << n), 1 << n => s_and_b64 $src, 1 << n
11296 // s_cmp_lg_u32 (s_and_b32 $src, 1 << n), 0 => s_and_b32 $src, 1 << n
11297 // s_cmp_lg_i32 (s_and_b32 $src, 1 << n), 0 => s_and_b32 $src, 1 << n
11298 // s_cmp_gt_u32 (s_and_b32 $src, 1 << n), 0 => s_and_b32 $src, 1 << n
11299 // s_cmp_gt_i32 (s_and_b32 $src, 1 << n), 0 => s_and_b32 $src, 1 << n
11300 // s_cmp_lg_u64 (s_and_b64 $src, 1 << n), 0 => s_and_b64 $src, 1 << n
11301 //
11302 // Signed ge/gt are not used for the sign bit.
11303 //
11304 // If result of the AND is unused except in the compare:
11305 // s_and_b(32|64) $src, 1 << n => s_bitcmp1_b(32|64) $src, n
11306 //
11307 // s_cmp_eq_u32 (s_and_b32 $src, 1 << n), 0 => s_bitcmp0_b32 $src, n
11308 // s_cmp_eq_i32 (s_and_b32 $src, 1 << n), 0 => s_bitcmp0_b32 $src, n
11309 // s_cmp_eq_u64 (s_and_b64 $src, 1 << n), 0 => s_bitcmp0_b64 $src, n
11310 // s_cmp_lg_u32 (s_and_b32 $src, 1 << n), 1 << n => s_bitcmp0_b32 $src, n
11311 // s_cmp_lg_i32 (s_and_b32 $src, 1 << n), 1 << n => s_bitcmp0_b32 $src, n
11312 // s_cmp_lg_u64 (s_and_b64 $src, 1 << n), 1 << n => s_bitcmp0_b64 $src, n
11313
11314 MachineInstr *Def = MRI->getVRegDef(SrcReg);
11315 if (!Def)
11316 return false;
11317
11318 if (Def->getOpcode() != AMDGPU::S_AND_B32 &&
11319 Def->getOpcode() != AMDGPU::S_AND_B64)
11320 return false;
11321
11322 int64_t Mask;
11323 const auto isMask = [&Mask, SrcSize](const MachineOperand *MO) -> bool {
11324 if (MO->isImm())
11325 Mask = MO->getImm();
11326 else if (!getFoldableImm(MO, Mask))
11327 return false;
11328 Mask &= maxUIntN(SrcSize);
11329 return isPowerOf2_64(Mask);
11330 };
11331
11332 MachineOperand *SrcOp = &Def->getOperand(1);
11333 if (isMask(SrcOp))
11334 SrcOp = &Def->getOperand(2);
11335 else if (isMask(&Def->getOperand(2)))
11336 SrcOp = &Def->getOperand(1);
11337 else
11338 return false;
11339
11340 // A valid Mask is required to have a single bit set, hence a non-zero and
11341 // power-of-two value. This verifies that we will not do 64-bit shift below.
11342 assert(llvm::has_single_bit<uint64_t>(Mask) && "Invalid mask.");
11343 unsigned BitNo = llvm::countr_zero((uint64_t)Mask);
11344 if (IsSigned && BitNo == SrcSize - 1)
11345 return false;
11346
11347 ExpectedValue <<= BitNo;
11348
11349 bool IsReversedCC = false;
11350 if (CmpValue != ExpectedValue) {
11351 if (!IsReversible)
11352 return false;
11353 IsReversedCC = CmpValue == (ExpectedValue ^ Mask);
11354 if (!IsReversedCC)
11355 return false;
11356 }
11357
11358 Register DefReg = Def->getOperand(0).getReg();
11359 if (IsReversedCC && !MRI->hasOneNonDBGUse(DefReg))
11360 return false;
11361
11362 if (!optimizeSCC(Def, &CmpInstr, /*NeedInversion=*/false))
11363 return false;
11364
11365 if (!MRI->use_nodbg_empty(DefReg)) {
11366 assert(!IsReversedCC);
11367 return true;
11368 }
11369
11370 // Replace AND with unused result with a S_BITCMP.
11371 MachineBasicBlock *MBB = Def->getParent();
11372
11373 unsigned NewOpc = (SrcSize == 32) ? IsReversedCC ? AMDGPU::S_BITCMP0_B32
11374 : AMDGPU::S_BITCMP1_B32
11375 : IsReversedCC ? AMDGPU::S_BITCMP0_B64
11376 : AMDGPU::S_BITCMP1_B64;
11377
11378 BuildMI(*MBB, Def, Def->getDebugLoc(), get(NewOpc))
11379 .add(*SrcOp)
11380 .addImm(BitNo);
11381 Def->eraseFromParent();
11382
11383 return true;
11384 };
11385
11386 switch (CmpInstr.getOpcode()) {
11387 default:
11388 break;
11389 case AMDGPU::S_CMP_EQ_U32:
11390 case AMDGPU::S_CMP_EQ_I32:
11391 case AMDGPU::S_CMPK_EQ_U32:
11392 case AMDGPU::S_CMPK_EQ_I32:
11393 return optimizeCmpAnd(1, 32, true, false) ||
11394 optimizeCmpSelect(/*NeedInversion=*/true);
11395 case AMDGPU::S_CMP_GE_U32:
11396 case AMDGPU::S_CMPK_GE_U32:
11397 return optimizeCmpAnd(1, 32, false, false);
11398 case AMDGPU::S_CMP_GE_I32:
11399 case AMDGPU::S_CMPK_GE_I32:
11400 return optimizeCmpAnd(1, 32, false, true);
11401 case AMDGPU::S_CMP_EQ_U64:
11402 return optimizeCmpAnd(1, 64, true, false);
11403 case AMDGPU::S_CMP_LG_U32:
11404 case AMDGPU::S_CMP_LG_I32:
11405 case AMDGPU::S_CMPK_LG_U32:
11406 case AMDGPU::S_CMPK_LG_I32:
11407 return optimizeCmpAnd(0, 32, true, false) ||
11408 optimizeCmpSelect(/*NeedInversion=*/false);
11409 case AMDGPU::S_CMP_GT_U32:
11410 case AMDGPU::S_CMPK_GT_U32:
11411 return optimizeCmpAnd(0, 32, false, false);
11412 case AMDGPU::S_CMP_GT_I32:
11413 case AMDGPU::S_CMPK_GT_I32:
11414 return optimizeCmpAnd(0, 32, false, true);
11415 case AMDGPU::S_CMP_LG_U64:
11416 return optimizeCmpAnd(0, 64, true, false) ||
11417 optimizeCmpSelect(/*NeedInversion=*/false);
11418 }
11419
11420 return false;
11421}
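// Illustrative before/after (hypothetical registers), derived from the
// optimizeCmpAnd patterns documented above:
//   %0:sreg_32 = S_AND_B32 %src, 4, implicit-def dead $scc
//   S_CMP_EQ_U32 %0, 0, implicit-def $scc
// becomes, when %0 has no use other than the compare,
//   S_BITCMP0_B32 %src, 2, implicit-def $scc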
11422
11424 AMDGPU::OpName OpName) const {
11425 if (!ST.needsAlignedVGPRs())
11426 return;
11427
11428 int OpNo = AMDGPU::getNamedOperandIdx(MI.getOpcode(), OpName);
11429 if (OpNo < 0)
11430 return;
11431 MachineOperand &Op = MI.getOperand(OpNo);
11432 if (getOpSize(MI, OpNo) > 4)
11433 return;
11434
11435 // Add implicit aligned super-reg to force alignment on the data operand.
11436 const DebugLoc &DL = MI.getDebugLoc();
11437 MachineBasicBlock *BB = MI.getParent();
11439 Register DataReg = Op.getReg();
11440 bool IsAGPR = RI.isAGPR(MRI, DataReg);
11442 IsAGPR ? &AMDGPU::AGPR_32RegClass : &AMDGPU::VGPR_32RegClass);
11443 BuildMI(*BB, MI, DL, get(AMDGPU::IMPLICIT_DEF), Undef);
11444 Register NewVR =
11445 MRI.createVirtualRegister(IsAGPR ? &AMDGPU::AReg_64_Align2RegClass
11446 : &AMDGPU::VReg_64_Align2RegClass);
11447 BuildMI(*BB, MI, DL, get(AMDGPU::REG_SEQUENCE), NewVR)
11448 .addReg(DataReg, {}, Op.getSubReg())
11449 .addImm(AMDGPU::sub0)
11450 .addReg(Undef)
11451 .addImm(AMDGPU::sub1);
11452 Op.setReg(NewVR);
11453 Op.setSubReg(AMDGPU::sub0);
11454 MI.addOperand(MachineOperand::CreateReg(NewVR, false, true));
11455}
11456
11458 if (isIGLP(*MI))
11459 return false;
11460
11462}
11463
11465 if (!isWMMA(MI) && !isSWMMAC(MI))
11466 return false;
11467
11468 if (ST.hasGFX1250Insts())
11469 return AMDGPU::getWMMAIsXDL(MI.getOpcode());
11470
11471 return true;
11472}
11473
11475 unsigned Opcode = MI.getOpcode();
11476
11477 if (AMDGPU::isGFX12Plus(ST))
11478 return isDOT(MI) || isXDLWMMA(MI);
11479
11480 if (!isMAI(MI) || isDGEMM(Opcode) ||
11481 Opcode == AMDGPU::V_ACCVGPR_WRITE_B32_e64 ||
11482 Opcode == AMDGPU::V_ACCVGPR_READ_B32_e64)
11483 return false;
11484
11485 if (!ST.hasGFX940Insts())
11486 return true;
11487
11488 return AMDGPU::getMAIIsGFX940XDL(Opcode);
11489}
MachineInstrBuilder & UseMI
MachineInstrBuilder MachineInstrBuilder & DefMI
static const TargetRegisterClass * getRegClass(const MachineInstr &MI, Register Reg)
unsigned RegSize
assert(UImm &&(UImm !=~static_cast< T >(0)) &&"Invalid immediate!")
Contains the definition of a TargetInstrInfo class that is common to all AMD GPUs.
AMDGPU Register Bank Select
MachineBasicBlock & MBB
MachineBasicBlock MachineBasicBlock::iterator DebugLoc DL
MachineBasicBlock MachineBasicBlock::iterator MBBI
static GCRegistry::Add< StatepointGC > D("statepoint-example", "an example strategy for statepoint")
static GCRegistry::Add< CoreCLRGC > E("coreclr", "CoreCLR-compatible GC")
AMD GCN specific subclass of TargetSubtarget.
Declares convenience wrapper classes for interpreting MachineInstr instances as specific generic oper...
const HexagonInstrInfo * TII
IRTranslator LLVM IR MI
std::pair< Instruction::BinaryOps, Value * > OffsetOp
Find all possible pairs (BinOp, RHS) that BinOp V, RHS can be simplified.
const size_t AbstractManglingParser< Derived, Alloc >::NumOps
const AbstractManglingParser< Derived, Alloc >::OperatorInfo AbstractManglingParser< Derived, Alloc >::Ops[]
#define F(x, y, z)
Definition MD5.cpp:54
#define I(x, y, z)
Definition MD5.cpp:57
static bool isUndef(const MachineInstr &MI)
TargetInstrInfo::RegSubRegPair RegSubRegPair
Register Reg
Register const TargetRegisterInfo * TRI
Promote Memory to Register
Definition Mem2Reg.cpp:110
static MCRegister getReg(const MCDisassembler *D, unsigned RC, unsigned RegNo)
MachineInstr unsigned OpIdx
uint64_t High
uint64_t IntrinsicInst * II
#define P(N)
R600 Clause Merge
const SmallVectorImpl< MachineOperand > MachineBasicBlock * TBB
const SmallVectorImpl< MachineOperand > & Cond
This file declares the machine register scavenger class.
static cl::opt< bool > Fix16BitCopies("amdgpu-fix-16-bit-physreg-copies", cl::desc("Fix copies between 32 and 16 bit registers by extending to 32 bit"), cl::init(true), cl::ReallyHidden)
static void emitLoadScalarOpsFromVGPRLoop(const SIInstrInfo &TII, MachineRegisterInfo &MRI, MachineBasicBlock &LoopBB, MachineBasicBlock &BodyBB, const DebugLoc &DL, ArrayRef< MachineOperand * > ScalarOps, ArrayRef< Register > PhySGPRs={})
static void expandSGPRCopy(const SIInstrInfo &TII, MachineBasicBlock &MBB, MachineBasicBlock::iterator MI, const DebugLoc &DL, MCRegister DestReg, MCRegister SrcReg, bool KillSrc, const TargetRegisterClass *RC, bool Forward)
static unsigned getNewFMAInst(const GCNSubtarget &ST, unsigned Opc)
static void indirectCopyToAGPR(const SIInstrInfo &TII, MachineBasicBlock &MBB, MachineBasicBlock::iterator MI, const DebugLoc &DL, MCRegister DestReg, MCRegister SrcReg, bool KillSrc, RegScavenger &RS, bool RegsOverlap, Register ImpDefSuperReg=Register(), Register ImpUseSuperReg=Register())
Handle copying from SGPR to AGPR, or from AGPR to AGPR on GFX908.
static unsigned getIndirectSGPRWriteMovRelPseudo32(unsigned VecSize)
static bool compareMachineOp(const MachineOperand &Op0, const MachineOperand &Op1)
static bool isStride64(unsigned Opc)
static MachineBasicBlock * generateWaterFallLoop(const SIInstrInfo &TII, MachineInstr &MI, ArrayRef< MachineOperand * > ScalarOps, MachineDominatorTree *MDT, MachineBasicBlock::iterator Begin=nullptr, MachineBasicBlock::iterator End=nullptr, ArrayRef< Register > PhySGPRs={})
#define GENERATE_RENAMED_GFX9_CASES(OPCODE)
static std::tuple< unsigned, unsigned > extractRsrcPtr(const SIInstrInfo &TII, MachineInstr &MI, MachineOperand &Rsrc)
static bool followSubRegDef(MachineInstr &MI, TargetInstrInfo::RegSubRegPair &RSR)
static unsigned getIndirectSGPRWriteMovRelPseudo64(unsigned VecSize)
static MachineInstr * swapImmOperands(MachineInstr &MI, MachineOperand &NonRegOp1, MachineOperand &NonRegOp2)
static void copyFlagsToImplicitVCC(MachineInstr &MI, const MachineOperand &Orig)
static bool offsetsDoNotOverlap(LocationSize WidthA, int OffsetA, LocationSize WidthB, int OffsetB)
static unsigned getWWMRegSpillSaveOpcode(unsigned Size, bool IsVectorSuperClass)
static bool memOpsHaveSameBaseOperands(ArrayRef< const MachineOperand * > BaseOps1, ArrayRef< const MachineOperand * > BaseOps2)
static unsigned getWWMRegSpillRestoreOpcode(unsigned Size, bool IsVectorSuperClass)
static bool setsSCCIfResultIsZero(const MachineInstr &Def, bool &NeedInversion, unsigned &NewDefOpc)
static bool isSCCDeadOnExit(MachineBasicBlock *MBB)
static bool getFoldableImm(Register Reg, const MachineRegisterInfo &MRI, int64_t &Imm, MachineInstr **DefMI=nullptr)
static unsigned getIndirectVGPRWriteMovRelPseudoOpc(unsigned VecSize)
static unsigned subtargetEncodingFamily(const GCNSubtarget &ST)
static void preserveCondRegFlags(MachineOperand &CondReg, const MachineOperand &OrigCond)
static Register findImplicitSGPRRead(const MachineInstr &MI)
static unsigned getNewFMAAKInst(const GCNSubtarget &ST, unsigned Opc)
static cl::opt< unsigned > BranchOffsetBits("amdgpu-s-branch-bits", cl::ReallyHidden, cl::init(16), cl::desc("Restrict range of branch instructions (DEBUG)"))
static void updateLiveVariables(LiveVariables *LV, MachineInstr &MI, MachineInstr &NewMI)
static bool memOpsHaveSameBasePtr(const MachineInstr &MI1, ArrayRef< const MachineOperand * > BaseOps1, const MachineInstr &MI2, ArrayRef< const MachineOperand * > BaseOps2)
static unsigned getSGPRSpillRestoreOpcode(unsigned Size)
static bool isRegOrFI(const MachineOperand &MO)
static unsigned getSGPRSpillSaveOpcode(unsigned Size)
static constexpr AMDGPU::OpName ModifierOpNames[]
static unsigned getVGPRSpillSaveOpcode(unsigned Size)
static void reportIllegalCopy(const SIInstrInfo *TII, MachineBasicBlock &MBB, MachineBasicBlock::iterator MI, const DebugLoc &DL, MCRegister DestReg, MCRegister SrcReg, bool KillSrc, const char *Msg="illegal VGPR to SGPR copy")
static MachineInstr * swapRegAndNonRegOperand(MachineInstr &MI, MachineOperand &RegOp, MachineOperand &NonRegOp)
static bool shouldReadExec(const MachineInstr &MI)
static unsigned getNewFMAMKInst(const GCNSubtarget &ST, unsigned Opc)
static bool isRenamedInGFX9(int Opcode)
static TargetInstrInfo::RegSubRegPair getRegOrUndef(const MachineOperand &RegOpnd)
static bool changesVGPRIndexingMode(const MachineInstr &MI)
static bool isSubRegOf(const SIRegisterInfo &TRI, const MachineOperand &SuperVec, const MachineOperand &SubReg)
static bool foldableSelect(const MachineInstr &Def)
static bool nodesHaveSameOperandValue(SDNode *N0, SDNode *N1, AMDGPU::OpName OpName)
Returns true if both nodes have the same value for the given operand Op, or if both nodes do not have...
static unsigned getAVSpillSaveOpcode(unsigned Size)
static unsigned getNumOperandsNoGlue(SDNode *Node)
static bool canRemat(const MachineInstr &MI)
static unsigned getAVSpillRestoreOpcode(unsigned Size)
static unsigned getVGPRSpillRestoreOpcode(unsigned Size)
Interface definition for SIInstrInfo.
bool IsDead
This file contains some templates that are useful if you are working with the STL at all.
static bool contains(SmallPtrSetImpl< ConstantExpr * > &Cache, ConstantExpr *Expr, Constant *C)
Definition Value.cpp:483
#define LLVM_DEBUG(...)
Definition Debug.h:119
static const LaneMaskConstants & get(const GCNSubtarget &ST)
static LLVM_ABI Semantics SemanticsToEnum(const llvm::fltSemantics &Sem)
Definition APFloat.cpp:145
Class for arbitrary precision integers.
Definition APInt.h:78
int64_t getSExtValue() const
Get sign extended value.
Definition APInt.h:1585
Represent a constant reference to an array (0 or more elements consecutively in memory),...
Definition ArrayRef.h:40
const T & front() const
Get the first element.
Definition ArrayRef.h:144
size_t size() const
Get the array size.
Definition ArrayRef.h:141
bool empty() const
Check if the array is empty.
Definition ArrayRef.h:136
uint64_t getZExtValue() const
A debug info location.
Definition DebugLoc.h:123
std::pair< iterator, bool > try_emplace(KeyT &&Key, Ts &&...Args)
Definition DenseMap.h:254
Diagnostic information for unsupported feature in backend.
void changeImmediateDominator(DomTreeNodeBase< NodeT > *N, DomTreeNodeBase< NodeT > *NewIDom)
changeImmediateDominator - This method is used to update the dominator tree information when a node's...
DomTreeNodeBase< NodeT > * addNewBlock(NodeT *BB, NodeT *DomBB)
Add a new node to the dominator tree information.
bool properlyDominates(const DomTreeNodeBase< NodeT > *A, const DomTreeNodeBase< NodeT > *B) const
properlyDominates - Returns true iff A dominates B and A != B.
CallingConv::ID getCallingConv() const
getCallingConv()/setCallingConv(CC) - These method get and set the calling convention of this functio...
Definition Function.h:272
LLVMContext & getContext() const
getContext - Return a reference to the LLVMContext associated with this function.
Definition Function.cpp:358
CycleT * getCycle(const BlockT *Block) const
Find the innermost cycle containing a given block.
void getExitingBlocks(SmallVectorImpl< BlockT * > &TmpStorage) const
Return all blocks of this cycle that have successor outside of this cycle.
bool contains(const BlockT *Block) const
Return whether Block is contained in the cycle.
const GenericCycle * getParentCycle() const
Itinerary data supplied by a subtarget to be used by a target.
constexpr unsigned getAddressSpace() const
This is an important class for using LLVM in a threaded context.
Definition LLVMContext.h:68
LiveInterval - This class represents the liveness of a register, or stack slot.
bool hasInterval(Register Reg) const
SlotIndex getInstructionIndex(const MachineInstr &Instr) const
Returns the base index of the given instruction.
LiveInterval & getInterval(Register Reg)
LLVM_ABI bool shrinkToUses(LiveInterval *li, SmallVectorImpl< MachineInstr * > *dead=nullptr)
After removing some uses of a register, shrink its live range to just the remaining uses.
SlotIndex ReplaceMachineInstrInMaps(MachineInstr &MI, MachineInstr &NewMI)
This class represents the liveness of a register, stack slot, etc.
LLVM_ABI void replaceKillInstruction(Register Reg, MachineInstr &OldMI, MachineInstr &NewMI)
replaceKillInstruction - Update register kill info by replacing a kill instruction with a new one.
LLVM_ABI VarInfo & getVarInfo(Register Reg)
getVarInfo - Return the VarInfo structure for the specified VIRTUAL register.
bool hasValue() const
static LocationSize precise(uint64_t Value)
TypeSize getValue() const
static const MCBinaryExpr * createAnd(const MCExpr *LHS, const MCExpr *RHS, MCContext &Ctx)
Definition MCExpr.h:348
static const MCBinaryExpr * createAShr(const MCExpr *LHS, const MCExpr *RHS, MCContext &Ctx)
Definition MCExpr.h:418
static const MCBinaryExpr * createSub(const MCExpr *LHS, const MCExpr *RHS, MCContext &Ctx)
Definition MCExpr.h:428
static LLVM_ABI const MCConstantExpr * create(int64_t Value, MCContext &Ctx, bool PrintInHex=false, unsigned SizeInBytes=0)
Definition MCExpr.cpp:212
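A sketch of combining these MCExpr builders to express a symbol distance in instruction words, roughly (DestSym - SrcSym - 4) >> 2; the symbols, the bias, and the shift amount are illustrative assumptions, not values taken from this file:

  const MCExpr *Dest = MCSymbolRefExpr::create(DestSym, Ctx);
  const MCExpr *Src = MCSymbolRefExpr::create(SrcSym, Ctx);
  const MCExpr *Diff = MCBinaryExpr::createSub(Dest, Src, Ctx);
  const MCExpr *Adj =
      MCBinaryExpr::createSub(Diff, MCConstantExpr::create(4, Ctx), Ctx);
  const MCExpr *Words =
      MCBinaryExpr::createAShr(Adj, MCConstantExpr::create(2, Ctx), Ctx);
  OffsetSym->setVariableValue(Words); // OffsetSym: an MCSymbol assumed in scope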
Describe properties that are true of each instruction in the target description file.
unsigned getNumOperands() const
Return the number of declared MachineOperands for this machine instruction.
ArrayRef< MCOperandInfo > operands() const
unsigned getNumDefs() const
Return the number of MachineOperands that are register definitions.
unsigned getSize() const
Return the number of bytes in the encoding of this instruction, or zero if the encoding size cannot b...
ArrayRef< MCPhysReg > implicit_uses() const
Return a list of registers that are potentially read by any instance of this machine instruction.
unsigned getOpcode() const
Return the opcode number for this descriptor.
This holds information about one operand of a machine instruction, indicating the register class for ...
Definition MCInstrDesc.h:86
uint8_t OperandType
Information about the type of the operand.
Definition MCInstrDesc.h:98
int16_t RegClass
This specifies the register class enumeration of the operand if the operand is a register.
Definition MCInstrDesc.h:92
Wrapper class representing physical registers. Should be passed by value.
Definition MCRegister.h:41
static const MCSymbolRefExpr * create(const MCSymbol *Symbol, MCContext &Ctx, SMLoc Loc=SMLoc())
Definition MCExpr.h:214
MCSymbol - Instances of this class represent a symbol name in the MC file, and MCSymbols are created ...
Definition MCSymbol.h:42
LLVM_ABI void setVariableValue(const MCExpr *Value)
Definition MCSymbol.cpp:50
Helper class for constructing bundles of MachineInstrs.
MachineBasicBlock::instr_iterator begin() const
Return an iterator to the first bundled instruction.
MIBundleBuilder & append(MachineInstr *MI)
Insert MI into MBB by appending it to the instructions in the bundle.
MIRFormatter - Interface to format MIR operands based on the target.
LLVM_ABI void transferSuccessorsAndUpdatePHIs(MachineBasicBlock *FromMBB)
Transfers all the successors, as in transferSuccessors, and update PHI operands in the successor bloc...
LLVM_ABI MCSymbol * getSymbol() const
Return the MCSymbol for this basic block.
LLVM_ABI instr_iterator insert(instr_iterator I, MachineInstr *M)
Insert MI into the instruction list before I, possibly inside a bundle.
LLVM_ABI LivenessQueryResult computeRegisterLiveness(const TargetRegisterInfo *TRI, MCRegister Reg, const_iterator Before, unsigned Neighborhood=10) const
Return whether (physical) register Reg has been defined and not killed as of just before Before.
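A hedged sketch of querying SCC liveness before emitting an instruction that clobbers it; TRI and the insertion iterator MBBI are assumed to be in scope:

  auto Liveness =
      MBB.computeRegisterLiveness(TRI, AMDGPU::SCC, MBBI, /*Neighborhood=*/16);
  if (Liveness == MachineBasicBlock::LQR_Dead) {
    // SCC is known dead here, so an SCC-defining instruction can be inserted
    // without saving and restoring SCC around it.
  }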
LLVM_ABI iterator getFirstTerminator()
Returns an iterator to the first terminator instruction of this basic block.
LLVM_ABI void addSuccessor(MachineBasicBlock *Succ, BranchProbability Prob=BranchProbability::getUnknown())
Add Succ as a successor of this MachineBasicBlock.
MachineInstrBundleIterator< MachineInstr, true > reverse_iterator
Instructions::const_iterator const_instr_iterator
const MachineFunction * getParent() const
Return the MachineFunction containing this basic block.
iterator_range< succ_iterator > successors()
void splice(iterator Where, MachineBasicBlock *Other, iterator From)
Take an instruction from MBB 'Other' at the position From, and insert it into this MBB right before '...
MachineInstrBundleIterator< MachineInstr > iterator
@ LQR_Dead
Register is known to be fully dead.
DominatorTree Class - Concrete subclass of DominatorTreeBase that is used to compute a normal dominat...
The MachineFrameInfo class represents an abstract stack frame until prolog/epilog code is inserted.
const TargetSubtargetInfo & getSubtarget() const
getSubtarget - Return the subtarget for which this machine code is being compiled.
MachineMemOperand * getMachineMemOperand(MachinePointerInfo PtrInfo, MachineMemOperand::Flags f, LLT MemTy, Align base_alignment, const AAMDNodes &AAInfo=AAMDNodes(), const MDNode *Ranges=nullptr, SyncScope::ID SSID=SyncScope::System, AtomicOrdering Ordering=AtomicOrdering::NotAtomic, AtomicOrdering FailureOrdering=AtomicOrdering::NotAtomic)
getMachineMemOperand - Allocate a new MachineMemOperand.
MachineFrameInfo & getFrameInfo()
getFrameInfo - Return the frame info object for the current function.
void push_back(MachineBasicBlock *MBB)
MCContext & getContext() const
MachineRegisterInfo & getRegInfo()
getRegInfo - Return information about the registers currently in use.
Function & getFunction()
Return the LLVM function that this machine code represents.
BasicBlockListType::iterator iterator
Ty * getInfo()
getInfo - Keep track of various per-function pieces of information for backends that would like to do...
MachineBasicBlock * CreateMachineBasicBlock(const BasicBlock *BB=nullptr, std::optional< UniqueBBID > BBID=std::nullopt)
CreateMachineInstr - Allocate a new MachineInstr.
void insert(iterator MBBI, MachineBasicBlock *MBB)
const TargetMachine & getTarget() const
getTarget - Return the target machine this machine code is compiled with
const MachineInstrBuilder & addUse(Register RegNo, RegState Flags={}, unsigned SubReg=0) const
Add a virtual register use operand.
const MachineInstrBuilder & addReg(Register RegNo, RegState Flags={}, unsigned SubReg=0) const
Add a new virtual register operand.
const MachineInstrBuilder & addImm(int64_t Val) const
Add a new immediate operand.
const MachineInstrBuilder & add(const MachineOperand &MO) const
const MachineInstrBuilder & addSym(MCSymbol *Sym, unsigned char TargetFlags=0) const
const MachineInstrBuilder & addFrameIndex(int Idx) const
const MachineInstrBuilder & addMBB(MachineBasicBlock *MBB, unsigned TargetFlags=0) const
const MachineInstrBuilder & addDef(Register RegNo, RegState Flags={}, unsigned SubReg=0) const
Add a virtual register definition operand.
const MachineInstrBuilder & cloneMemRefs(const MachineInstr &OtherMI) const
const MachineInstrBuilder & setMIFlags(unsigned Flags) const
const MachineInstrBuilder & copyImplicitOps(const MachineInstr &OtherMI) const
Copy all the implicit operands from OtherMI onto this one.
const MachineInstrBuilder & addMemOperand(MachineMemOperand *MMO) const
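A hedged sketch of chaining these builder calls when rewriting a memory instruction; NewOpcode, DstReg, BaseReg, ImmOffset, and OldMI (the instruction being rewritten) are placeholders, and cloneMemRefs carries the original memory operands over:

  MachineInstrBuilder MIB =
      BuildMI(MBB, InsertPt, DL, TII->get(NewOpcode), DstReg)
          .addReg(BaseReg)               // base address (placeholder)
          .addImm(ImmOffset)             // immediate offset (placeholder)
          .cloneMemRefs(OldMI)           // keep the original MMOs
          .setMIFlags(OldMI.getFlags()); // preserve MI flags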
Representation of each machine instruction.
unsigned getOpcode() const
Returns the opcode of this MachineInstr.
bool mayLoadOrStore(QueryType Type=AnyInBundle) const
Return true if this instruction could possibly read or modify memory.
bool isCopy() const
const MachineBasicBlock * getParent() const
LLVM_ABI void addImplicitDefUseOperands(MachineFunction &MF)
Add all implicit def and use operands to this instruction.
bool isBundle() const
LLVM_ABI void addOperand(MachineFunction &MF, const MachineOperand &Op)
Add the specified operand to the instruction.
LLVM_ABI unsigned getNumExplicitOperands() const
Returns the number of non-implicit operands.
mop_range implicit_operands()
bool modifiesRegister(Register Reg, const TargetRegisterInfo *TRI) const
Return true if the MachineInstr modifies (fully define or partially define) the specified register.
bool mayLoad(QueryType Type=AnyInBundle) const
Return true if this instruction could possibly read memory.
LLVM_ABI bool hasUnmodeledSideEffects() const
Return true if this instruction has side effects that are not modeled by mayLoad / mayStore,...
void untieRegOperand(unsigned OpIdx)
Break any tie involving OpIdx.
LLVM_ABI void setDesc(const MCInstrDesc &TID)
Replace the instruction descriptor (thus opcode) of the current instruction with a new one.
LLVM_ABI void eraseFromBundle()
Unlink 'this' from its basic block and delete it.
bool hasOneMemOperand() const
Return true if this instruction has exactly one MachineMemOperand.
mop_range explicit_operands()
LLVM_ABI void tieOperands(unsigned DefIdx, unsigned UseIdx)
Add a tie between the register operands at DefIdx and UseIdx.
mmo_iterator memoperands_begin() const
Access to memory operands of the instruction.
LLVM_ABI bool hasOrderedMemoryRef() const
Return true if this instruction may have an ordered or volatile memory reference, or if the informati...
LLVM_ABI const MachineFunction * getMF() const
Return the function that contains the basic block that this instruction belongs to.
ArrayRef< MachineMemOperand * > memoperands() const
Access to memory operands of the instruction.
bool mayStore(QueryType Type=AnyInBundle) const
Return true if this instruction could possibly modify memory.
const DebugLoc & getDebugLoc() const
Returns the debug location id of this MachineInstr.
bool isMoveImmediate(QueryType Type=IgnoreBundle) const
Return true if this instruction is a move immediate (including conditional moves) instruction.
LLVM_ABI void removeOperand(unsigned OpNo)
Erase an operand from an instruction, leaving it with one fewer operand than it started with.
filtered_mop_range all_uses()
Returns an iterator range over all operands that are (explicit or implicit) register uses.
LLVM_ABI void setPostInstrSymbol(MachineFunction &MF, MCSymbol *Symbol)
Set a symbol that will be emitted just after the instruction itself.
LLVM_ABI void clearRegisterKills(Register Reg, const TargetRegisterInfo *RegInfo)
Clear all kill flags affecting Reg.
const MachineOperand & getOperand(unsigned i) const
uint32_t getFlags() const
Return the MI flags bitvector.
LLVM_ABI int findRegisterDefOperandIdx(Register Reg, const TargetRegisterInfo *TRI, bool isDead=false, bool Overlap=false) const
Returns the operand index that is a def of the specified register or -1 if it is not found.
LLVM_ABI MachineInstrBundleIterator< MachineInstr > eraseFromParent()
Unlink 'this' from the containing basic block and delete it.
MachineOperand * findRegisterDefOperand(Register Reg, const TargetRegisterInfo *TRI, bool isDead=false, bool Overlap=false)
Wrapper for findRegisterDefOperandIdx, it returns a pointer to the MachineOperand rather than an inde...
A description of a memory reference used in the backend.
@ MOLoad
The memory access reads data.
@ MOStore
The memory access writes data.
MachineOperand class - Representation of each machine instruction operand.
void setSubReg(unsigned subReg)
unsigned getSubReg() const
LLVM_ABI unsigned getOperandNo() const
Returns the index of this operand in the instruction that it belongs to.
const GlobalValue * getGlobal() const
void setImplicit(bool Val=true)
LLVM_ABI void ChangeToFrameIndex(int Idx, unsigned TargetFlags=0)
Replace this operand with a frame index.
void setImm(int64_t immVal)
int64_t getImm() const
bool isReg() const
isReg - Tests if this is a MO_Register operand.
void setIsDead(bool Val=true)
LLVM_ABI void setReg(Register Reg)
Change the register this operand corresponds to.
bool isImm() const
isImm - Tests if this is a MO_Immediate operand.
LLVM_ABI void ChangeToImmediate(int64_t ImmVal, unsigned TargetFlags=0)
ChangeToImmediate - Replace this operand with a new immediate operand of the specified value.
LLVM_ABI void ChangeToGA(const GlobalValue *GV, int64_t Offset, unsigned TargetFlags=0)
ChangeToGA - Replace this operand with a new global address operand.
void setIsKill(bool Val=true)
LLVM_ABI void ChangeToRegister(Register Reg, bool isDef, bool isImp=false, bool isKill=false, bool isDead=false, bool isUndef=false, bool isDebug=false)
ChangeToRegister - Replace this operand with a new register operand of the specified value.
MachineInstr * getParent()
getParent - Return the instruction that this operand belongs to.
void setOffset(int64_t Offset)
unsigned getTargetFlags() const
static MachineOperand CreateImm(int64_t Val)
bool isGlobal() const
isGlobal - Tests if this is a MO_GlobalAddress operand.
MachineOperandType getType() const
getType - Returns the MachineOperandType for this operand.
void setIsUndef(bool Val=true)
Register getReg() const
getReg - Returns the register number.
bool isTargetIndex() const
isTargetIndex - Tests if this is a MO_TargetIndex operand.
void setTargetFlags(unsigned F)
bool isFI() const
isFI - Tests if this is a MO_FrameIndex operand.
LLVM_ABI bool isIdenticalTo(const MachineOperand &Other) const
Returns true if this operand is identical to the specified operand except for liveness related flags ...
@ MO_Immediate
Immediate operand.
@ MO_Register
Register operand.
static MachineOperand CreateReg(Register Reg, bool isDef, bool isImp=false, bool isKill=false, bool isDead=false, bool isUndef=false, bool isEarlyClobber=false, unsigned SubReg=0, bool isDebug=false, bool isInternalRead=false, bool isRenamable=false)
int64_t getOffset() const
Return the offset from the symbol in this operand.
bool isFPImm() const
isFPImm - Tests if this is a MO_FPImmediate operand.
MachineRegisterInfo - Keep track of information for virtual and physical registers,...
LLVM_ABI bool hasOneNonDBGUse(Register RegNo) const
hasOneNonDBGUse - Return true if there is exactly one non-Debug use of the specified register.
const TargetRegisterClass * getRegClass(Register Reg) const
Return the register class of the specified virtual register.
LLVM_ABI void clearKillFlags(Register Reg) const
clearKillFlags - Iterate over all the uses of the given register and clear the kill flag from the Mac...
LLVM_ABI MachineInstr * getVRegDef(Register Reg) const
getVRegDef - Return the machine instr that defines the specified virtual register or null if none is ...
iterator_range< use_nodbg_iterator > use_nodbg_operands(Register Reg) const
bool use_nodbg_empty(Register RegNo) const
use_nodbg_empty - Return true if there are no non-Debug instructions using the specified register.
LLVM_ABI void moveOperands(MachineOperand *Dst, MachineOperand *Src, unsigned NumOps)
Move NumOps operands from Src to Dst, updating use-def lists as needed.
LLVM_ABI Register createVirtualRegister(const TargetRegisterClass *RegClass, StringRef Name="")
createVirtualRegister - Create and return a new virtual register in the function with the specified r...
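A small sketch combining these MachineRegisterInfo queries: create a temporary VGPR and, when a value has a single non-debug use, walk back to its defining instruction. The register class and variable names are illustrative:

  MachineRegisterInfo &MRI = MF.getRegInfo();
  Register Tmp = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
  (void)Tmp; // scratch VGPR for a later rewrite
  if (MRI.hasOneNonDBGUse(SrcReg)) {
    if (MachineInstr *Def = MRI.getVRegDef(SrcReg)) {
      // Safe to consider folding into Def: SrcReg has exactly one real use.
    }
  }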
LLT getType(Register Reg) const
Get the low-level type of Reg or LLT{} if Reg is not a generic (target independent) virtual register.
bool reservedRegsFrozen() const
reservedRegsFrozen - Returns true after freezeReservedRegs() was called to ensure the set of reserved...
LLVM_ABI void clearVirtRegs()
clearVirtRegs - Remove all virtual registers (after physreg assignment).
void setRegAllocationHint(Register VReg, unsigned Type, Register PrefReg)
setRegAllocationHint - Specify a register allocation hint for the specified virtual register.
LLVM_ABI void setRegClass(Register Reg, const TargetRegisterClass *RC)
setRegClass - Set the register class of the specified virtual register.
void setSimpleHint(Register VReg, Register PrefReg)
Specify the preferred (target independent) register allocation hint for the specified virtual registe...
const TargetRegisterInfo * getTargetRegisterInfo() const
LLVM_ABI Register cloneVirtualRegister(Register VReg, StringRef Name="")
Create and return a new virtual register in the function with the same attributes as the given regist...
LLVM_ABI const TargetRegisterClass * constrainRegClass(Register Reg, const TargetRegisterClass *RC, unsigned MinNumRegs=0)
constrainRegClass - Constrain the register class of the specified virtual register to be a common sub...
iterator_range< use_iterator > use_operands(Register Reg) const
LLVM_ABI void removeRegOperandFromUseList(MachineOperand *MO)
Remove MO from its use-def list.
LLVM_ABI void replaceRegWith(Register FromReg, Register ToReg)
replaceRegWith - Replace all instances of FromReg with ToReg in the machine function.
LLVM_ABI void addRegOperandToUseList(MachineOperand *MO)
Add MO to the linked list of operands for its register.
LLVM_ABI MachineInstr * getUniqueVRegDef(Register Reg) const
getUniqueVRegDef - Return the unique machine instr that defines the specified virtual register or nul...
const RegisterBank & getRegBank(unsigned ID)
Get the register bank identified by ID.
This class implements the register bank concept.
unsigned getID() const
Get the identifier of this register bank.
Wrapper class representing virtual and physical registers.
Definition Register.h:20
MCRegister asMCReg() const
Utility to check-convert this value to a MCRegister.
Definition Register.h:107
constexpr bool isValid() const
Definition Register.h:112
constexpr bool isVirtual() const
Return true if the specified register number is in the virtual register namespace.
Definition Register.h:79
constexpr bool isPhysical() const
Return true if the specified register number is in the physical register namespace.
Definition Register.h:83
Represents one node in the SelectionDAG.
bool isMachineOpcode() const
Test if this node has a post-isel opcode, directly corresponding to a MachineInstr opcode.
uint64_t getAsZExtVal() const
Helper method returns the zero-extended integer value of a ConstantSDNode.
unsigned getMachineOpcode() const
This may only be called if isMachineOpcode returns true.
const SDValue & getOperand(unsigned Num) const
uint64_t getConstantOperandVal(unsigned Num) const
Helper method returns the integer value of a ConstantSDNode operand.
Unlike LLVM values, Selection DAG nodes may return multiple values as the result of a computation.
bool isLegalMUBUFImmOffset(unsigned Imm) const
bool isInlineConstant(const APInt &Imm) const
void legalizeOperandsVOP3(MachineRegisterInfo &MRI, MachineInstr &MI) const
Fix operands in MI to satisfy constant bus requirements.
bool canAddToBBProlog(const MachineInstr &MI) const
static bool isDS(const MachineInstr &MI)
MachineBasicBlock * legalizeOperands(MachineInstr &MI, MachineDominatorTree *MDT=nullptr) const
Legalize all operands in this instruction.
bool areLoadsFromSameBasePtr(SDNode *Load0, SDNode *Load1, int64_t &Offset0, int64_t &Offset1) const override
unsigned getLiveRangeSplitOpcode(Register Reg, const MachineFunction &MF) const override
bool getMemOperandsWithOffsetWidth(const MachineInstr &LdSt, SmallVectorImpl< const MachineOperand * > &BaseOps, int64_t &Offset, bool &OffsetIsScalable, LocationSize &Width, const TargetRegisterInfo *TRI) const final
unsigned getInstSizeInBytes(const MachineInstr &MI) const override
static bool isNeverUniform(const MachineInstr &MI)
bool isXDLWMMA(const MachineInstr &MI) const
bool isBasicBlockPrologue(const MachineInstr &MI, Register Reg=Register()) const override
bool isSpill(uint32_t Opcode) const
uint64_t getDefaultRsrcDataFormat() const
static bool isSOPP(const MachineInstr &MI)
bool mayAccessScratch(const MachineInstr &MI) const
bool isIGLP(unsigned Opcode) const
static bool isFLATScratch(const MachineInstr &MI)
const MCInstrDesc & getIndirectRegWriteMovRelPseudo(unsigned VecSize, unsigned EltSize, bool IsSGPR) const
MachineInstrBuilder getAddNoCarry(MachineBasicBlock &MBB, MachineBasicBlock::iterator I, const DebugLoc &DL, Register DestReg) const
Return a partially built integer add instruction without carry.
bool mayAccessFlatAddressSpace(const MachineInstr &MI) const
bool shouldScheduleLoadsNear(SDNode *Load0, SDNode *Load1, int64_t Offset0, int64_t Offset1, unsigned NumLoads) const override
bool splitMUBUFOffset(uint32_t Imm, uint32_t &SOffset, uint32_t &ImmOffset, Align Alignment=Align(4)) const
ArrayRef< std::pair< unsigned, const char * > > getSerializableDirectMachineOperandTargetFlags() const override
void moveToVALU(SIInstrWorklist &Worklist, MachineDominatorTree *MDT) const
Replace the instruction's opcode with the equivalent VALU opcode.
static bool isSMRD(const MachineInstr &MI)
void restoreExec(MachineFunction &MF, MachineBasicBlock &MBB, MachineBasicBlock::iterator MBBI, const DebugLoc &DL, Register Reg, SlotIndexes *Indexes=nullptr) const
bool usesConstantBus(const MachineRegisterInfo &MRI, const MachineOperand &MO, const MCOperandInfo &OpInfo) const
Returns true if this operand uses the constant bus.
static unsigned getMaxMUBUFImmOffset(const GCNSubtarget &ST)
static unsigned getFoldableCopySrcIdx(const MachineInstr &MI)
unsigned getOpSize(uint32_t Opcode, unsigned OpNo) const
Return the size in bytes of the operand OpNo for the given opcode.
void legalizeOperandsFLAT(MachineRegisterInfo &MRI, MachineInstr &MI) const
bool optimizeCompareInstr(MachineInstr &CmpInstr, Register SrcReg, Register SrcReg2, int64_t CmpMask, int64_t CmpValue, const MachineRegisterInfo *MRI) const override
static std::optional< int64_t > extractSubregFromImm(int64_t ImmVal, unsigned SubRegIndex)
Return the extracted immediate value in a subregister use from a constant materialized in a super reg...
Register isStoreToStackSlot(const MachineInstr &MI, int &FrameIndex) const override
static bool isMTBUF(const MachineInstr &MI)
const MCInstrDesc & getIndirectGPRIDXPseudo(unsigned VecSize, bool IsIndirectSrc) const
void insertReturn(MachineBasicBlock &MBB) const
static bool isDGEMM(unsigned Opcode)
static bool isEXP(const MachineInstr &MI)
static bool isSALU(const MachineInstr &MI)
static bool setsSCCIfResultIsNonZero(const MachineInstr &MI)
const MIRFormatter * getMIRFormatter() const override
void legalizeGenericOperand(MachineBasicBlock &InsertMBB, MachineBasicBlock::iterator I, const TargetRegisterClass *DstRC, MachineOperand &Op, MachineRegisterInfo &MRI, const DebugLoc &DL) const
MachineInstr * buildShrunkInst(MachineInstr &MI, unsigned NewOpcode) const
static bool isVOP2(const MachineInstr &MI)
bool analyzeBranch(MachineBasicBlock &MBB, MachineBasicBlock *&TBB, MachineBasicBlock *&FBB, SmallVectorImpl< MachineOperand > &Cond, bool AllowModify=false) const override
static bool isSDWA(const MachineInstr &MI)
const MCInstrDesc & getKillTerminatorFromPseudo(unsigned Opcode) const
void insertNoops(MachineBasicBlock &MBB, MachineBasicBlock::iterator MI, unsigned Quantity) const override
static bool isGather4(const MachineInstr &MI)
MachineInstr * getWholeWaveFunctionSetup(MachineFunction &MF) const
bool isLegalVSrcOperand(const MachineRegisterInfo &MRI, const MCOperandInfo &OpInfo, const MachineOperand &MO) const
Check if MO would be a valid operand for the given operand definition OpInfo.
static bool isDOT(const MachineInstr &MI)
InstSizeVerifyMode getInstSizeVerifyMode(const MachineInstr &MI) const override
MachineInstr * createPHISourceCopy(MachineBasicBlock &MBB, MachineBasicBlock::iterator InsPt, const DebugLoc &DL, Register Src, unsigned SrcSubReg, Register Dst) const override
bool hasModifiers(unsigned Opcode) const
Return true if this instruction has any modifiers.
bool shouldClusterMemOps(ArrayRef< const MachineOperand * > BaseOps1, int64_t Offset1, bool OffsetIsScalable1, ArrayRef< const MachineOperand * > BaseOps2, int64_t Offset2, bool OffsetIsScalable2, unsigned ClusterSize, unsigned NumBytes) const override
static bool isSWMMAC(const MachineInstr &MI)
ScheduleHazardRecognizer * CreateTargetMIHazardRecognizer(const InstrItineraryData *II, const ScheduleDAGMI *DAG) const override
bool isWave32() const
bool isHighLatencyDef(int Opc) const override
void legalizeOpWithMove(MachineInstr &MI, unsigned OpIdx) const
Legalize the OpIndex operand of this instruction by inserting a MOV.
bool reverseBranchCondition(SmallVectorImpl< MachineOperand > &Cond) const override
static bool isVOPC(const MachineInstr &MI)
void removeModOperands(MachineInstr &MI) const
std::pair< int64_t, int64_t > splitFlatOffset(int64_t COffsetVal, unsigned AddrSpace, uint64_t FlatVariant) const
Split COffsetVal into {immediate offset field, remainder offset} values.
unsigned getVectorRegSpillRestoreOpcode(Register Reg, const TargetRegisterClass *RC, unsigned Size, const SIMachineFunctionInfo &MFI) const
bool isXDL(const MachineInstr &MI) const
Register isStackAccess(const MachineInstr &MI, int &FrameIndex, TypeSize &MemBytes) const
static bool isVIMAGE(const MachineInstr &MI)
void enforceOperandRCAlignment(MachineInstr &MI, AMDGPU::OpName OpName) const
static bool isSOP2(const MachineInstr &MI)
static bool isGWS(const MachineInstr &MI)
bool hasRAWDependency(const MachineInstr &FirstMI, const MachineInstr &SecondMI) const
bool isLegalAV64PseudoImm(uint64_t Imm) const
Check if this immediate value can be used for AV_MOV_B64_IMM_PSEUDO.
bool isNeverCoissue(MachineInstr &MI) const
static bool isBUF(const MachineInstr &MI)
void handleCopyToPhysHelper(SIInstrWorklist &Worklist, Register DstReg, MachineInstr &Inst, MachineRegisterInfo &MRI, DenseMap< MachineInstr *, V2PhysSCopyInfo > &WaterFalls, DenseMap< MachineInstr *, bool > &V2SPhyCopiesToErase) const
bool hasModifiersSet(const MachineInstr &MI, AMDGPU::OpName OpName) const
const TargetRegisterClass * getPreferredSelectRegClass(unsigned Size) const
bool isLegalToSwap(const MachineInstr &MI, unsigned fromIdx, unsigned toIdx) const
static bool isFLATGlobal(const MachineInstr &MI)
MachineInstr * foldMemoryOperandImpl(MachineFunction &MF, MachineInstr &MI, ArrayRef< unsigned > Ops, int FrameIndex, MachineInstr *&CopyMI, LiveIntervals *LIS=nullptr, VirtRegMap *VRM=nullptr) const override
bool isGlobalMemoryObject(const MachineInstr *MI) const override
static bool isVSAMPLE(const MachineInstr &MI)
bool isBufferSMRD(const MachineInstr &MI) const
static bool isKillTerminator(unsigned Opcode)
bool isVOPDAntidependencyAllowed(const MachineInstr &MI) const
If OpX is multicycle, anti-dependencies are not allowed.
bool findCommutedOpIndices(const MachineInstr &MI, unsigned &SrcOpIdx0, unsigned &SrcOpIdx1) const override
void insertScratchExecCopy(MachineFunction &MF, MachineBasicBlock &MBB, MachineBasicBlock::iterator MBBI, const DebugLoc &DL, Register Reg, bool IsSCCLive, SlotIndexes *Indexes=nullptr) const
bool hasVALU32BitEncoding(unsigned Opcode) const
Return true if this 64-bit VALU instruction has a 32-bit encoding.
unsigned getMovOpcode(const TargetRegisterClass *DstRC) const
Register isSGPRStackAccess(const MachineInstr &MI, int &FrameIndex, TypeSize &MemBytes) const
unsigned buildExtractSubReg(MachineBasicBlock::iterator MI, MachineRegisterInfo &MRI, const MachineOperand &SuperReg, const TargetRegisterClass *SuperRC, unsigned SubIdx, const TargetRegisterClass *SubRC) const
void legalizeOperandsVOP2(MachineRegisterInfo &MRI, MachineInstr &MI) const
Legalize operands in MI by either commuting it or inserting a copy of src1.
bool foldImmediate(MachineInstr &UseMI, MachineInstr &DefMI, Register Reg, MachineRegisterInfo *MRI) const final
static bool isTRANS(const MachineInstr &MI)
static bool isImage(const MachineInstr &MI)
static bool isSOPK(const MachineInstr &MI)
const TargetRegisterClass * getOpRegClass(const MachineInstr &MI, unsigned OpNo) const
Return the correct register class for OpNo.
MachineBasicBlock * insertSimulatedTrap(MachineRegisterInfo &MRI, MachineBasicBlock &MBB, MachineInstr &MI, const DebugLoc &DL) const
Build instructions that simulate the behavior of an s_trap 2 instruction for hardware (namely,...
static unsigned getNonSoftWaitcntOpcode(unsigned Opcode)
static unsigned getDSShaderTypeValue(const MachineFunction &MF)
static bool isFoldableCopy(const MachineInstr &MI)
bool mayAccessLDSThroughFlat(const MachineInstr &MI) const
bool isIgnorableUse(const MachineOperand &MO) const override
static bool isMUBUF(const MachineInstr &MI)
bool expandPostRAPseudo(MachineInstr &MI) const override
bool analyzeCompare(const MachineInstr &MI, Register &SrcReg, Register &SrcReg2, int64_t &CmpMask, int64_t &CmpValue) const override
void createWaterFallForSiCall(MachineInstr *MI, MachineDominatorTree *MDT, ArrayRef< MachineOperand * > ScalarOps, ArrayRef< Register > PhySGPRs={}) const
Wrapper function for generating a waterfall loop for instruction MI. This function takes into consideration...
void loadRegFromStackSlot(MachineBasicBlock &MBB, MachineBasicBlock::iterator MI, Register DestReg, int FrameIndex, const TargetRegisterClass *RC, Register VReg, unsigned SubReg=0, MachineInstr::MIFlag Flags=MachineInstr::NoFlags) const override
static bool isSegmentSpecificFLAT(const MachineInstr &MI)
bool isReMaterializableImpl(const MachineInstr &MI) const override
static bool isVOP3(const MCInstrDesc &Desc)
Register isLoadFromStackSlot(const MachineInstr &MI, int &FrameIndex) const override
bool physRegUsesConstantBus(const MachineOperand &Reg) const
static bool isF16PseudoScalarTrans(unsigned Opcode)
void insertSelect(MachineBasicBlock &MBB, MachineBasicBlock::iterator I, const DebugLoc &DL, Register DstReg, ArrayRef< MachineOperand > Cond, Register TrueReg, Register FalseReg) const override
bool mayAccessVMEMThroughFlat(const MachineInstr &MI) const
static bool isDPP(const MachineInstr &MI)
bool analyzeBranchImpl(MachineBasicBlock &MBB, MachineBasicBlock::iterator I, MachineBasicBlock *&TBB, MachineBasicBlock *&FBB, SmallVectorImpl< MachineOperand > &Cond, bool AllowModify) const
static bool isMFMA(const MachineInstr &MI)
bool isLowLatencyInstruction(const MachineInstr &MI) const
std::optional< DestSourcePair > isCopyInstrImpl(const MachineInstr &MI) const override
If the specific machine instruction is an instruction that moves/copies a value from one register to another.
void mutateAndCleanupImplicit(MachineInstr &MI, const MCInstrDesc &NewDesc) const
ValueUniformity getGenericValueUniformity(const MachineInstr &MI) const
static bool isMAI(const MCInstrDesc &Desc)
void reMaterialize(MachineBasicBlock &MBB, MachineBasicBlock::iterator MI, Register DestReg, unsigned SubIdx, const MachineInstr &Orig, LaneBitmask UsedLanes=LaneBitmask::getAll()) const override
static bool usesLGKM_CNT(const MachineInstr &MI)
void legalizeOperandsVALUt16(MachineInstr &Inst, MachineRegisterInfo &MRI) const
Fix operands in Inst when lowering 16-bit SALU instructions to VALU.
bool isImmOperandLegal(const MCInstrDesc &InstDesc, unsigned OpNo, const MachineOperand &MO) const
bool canShrink(const MachineInstr &MI, const MachineRegisterInfo &MRI) const
const MachineOperand & getCalleeOperand(const MachineInstr &MI) const override
bool isAsmOnlyOpcode(int MCOp) const
Check if this instruction should only be used by assembler.
bool isAlwaysGDS(uint32_t Opcode) const
static bool isVGPRSpill(const MachineInstr &MI)
ScheduleHazardRecognizer * CreateTargetPostRAHazardRecognizer(const InstrItineraryData *II, const ScheduleDAG *DAG) const override
This is used by the post-RA scheduler (PostRASchedulerList.cpp).
bool verifyInstruction(const MachineInstr &MI, StringRef &ErrInfo) const override
bool isLegalFLATOffset(int64_t Offset, unsigned AddrSpace, uint64_t FlatVariant) const
Returns if Offset is legal for the subtarget as the offset to a FLAT encoded instruction with the giv...
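A hedged sketch of validating a FLAT offset and splitting it when it does not fit the immediate field; it assumes SIInstrFlags::FlatGlobal names the global FLAT variant and that COffset is an int64_t already in scope:

  if (!TII->isLegalFLATOffset(COffset, AMDGPUAS::GLOBAL_ADDRESS,
                              SIInstrFlags::FlatGlobal)) {
    // Keep only the part that fits the immediate field; the remainder must
    // be folded into the address computation.
    auto [ImmField, Remainder] = TII->splitFlatOffset(
        COffset, AMDGPUAS::GLOBAL_ADDRESS, SIInstrFlags::FlatGlobal);
  }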
unsigned getInstrLatency(const InstrItineraryData *ItinData, const MachineInstr &MI, unsigned *PredCost=nullptr) const override
int64_t getNamedImmOperand(const MachineInstr &MI, AMDGPU::OpName OperandName) const
Get required immediate operand.
ArrayRef< std::pair< int, const char * > > getSerializableTargetIndices() const override
bool regUsesConstantBus(const MachineOperand &Reg, const MachineRegisterInfo &MRI) const
static bool isMIMG(const MachineInstr &MI)
MachineOperand buildExtractSubRegOrImm(MachineBasicBlock::iterator MI, MachineRegisterInfo &MRI, const MachineOperand &SuperReg, const TargetRegisterClass *SuperRC, unsigned SubIdx, const TargetRegisterClass *SubRC) const
bool isSchedulingBoundary(const MachineInstr &MI, const MachineBasicBlock *MBB, const MachineFunction &MF) const override
bool isLegalRegOperand(const MachineRegisterInfo &MRI, const MCOperandInfo &OpInfo, const MachineOperand &MO) const
Check if MO (a register operand) is a legal register for the given operand description or operand ind...
bool allowNegativeFlatOffset(uint64_t FlatVariant) const
Returns true if negative offsets are allowed for the given FlatVariant.
static unsigned getNumWaitStates(const MachineInstr &MI)
Return the number of wait states that result from executing this instruction.
unsigned getVectorRegSpillSaveOpcode(Register Reg, const TargetRegisterClass *RC, unsigned Size, const SIMachineFunctionInfo &MFI) const
unsigned getVALUOp(const MachineInstr &MI) const
static bool modifiesModeRegister(const MachineInstr &MI)
Return true if the instruction modifies the mode register.
Register readlaneVGPRToSGPR(Register SrcReg, MachineInstr &UseMI, MachineRegisterInfo &MRI, const TargetRegisterClass *DstRC=nullptr) const
Copy a value from a VGPR (SrcReg) to an SGPR.
bool hasDivergentBranch(const MachineBasicBlock *MBB) const
Return whether the block terminates with a divergent branch.
unsigned removeBranch(MachineBasicBlock &MBB, int *BytesRemoved=nullptr) const override
void fixImplicitOperands(MachineInstr &MI) const
bool moveFlatAddrToVGPR(MachineInstr &Inst) const
Change the SADDR form of a FLAT Inst to its VADDR form if the saddr operand was moved to a VGPR.
void copyPhysReg(MachineBasicBlock &MBB, MachineBasicBlock::iterator MI, const DebugLoc &DL, Register DestReg, Register SrcReg, bool KillSrc, bool RenamableDest=false, bool RenamableSrc=false) const override
void createReadFirstLaneFromCopyToPhysReg(MachineRegisterInfo &MRI, Register DstReg, MachineInstr &Inst) const
bool swapSourceModifiers(MachineInstr &MI, MachineOperand &Src0, AMDGPU::OpName Src0OpName, MachineOperand &Src1, AMDGPU::OpName Src1OpName) const
Register insertNE(MachineBasicBlock *MBB, MachineBasicBlock::iterator I, const DebugLoc &DL, Register SrcReg, int Value) const
MachineBasicBlock * getBranchDestBlock(const MachineInstr &MI) const override
bool hasUnwantedEffectsWhenEXECEmpty(const MachineInstr &MI) const
This function is used to determine if an instruction can be safely executed under EXEC = 0 without ha...
bool getConstValDefinedInReg(const MachineInstr &MI, const Register Reg, int64_t &ImmVal) const override
static bool isAtomic(const MachineInstr &MI)
bool canInsertSelect(const MachineBasicBlock &MBB, ArrayRef< MachineOperand > Cond, Register DstReg, Register TrueReg, Register FalseReg, int &CondCycles, int &TrueCycles, int &FalseCycles) const override
bool isLiteralOperandLegal(const MCInstrDesc &InstDesc, const MCOperandInfo &OpInfo) const
static bool isWWMRegSpillOpcode(uint32_t Opcode)
static bool sopkIsZext(unsigned Opcode)
static bool isSGPRSpill(const MachineInstr &MI)
static bool isWMMA(const MachineInstr &MI)
ArrayRef< std::pair< MachineMemOperand::Flags, const char * > > getSerializableMachineMemOperandTargetFlags() const override
MachineInstr * convertToThreeAddress(MachineInstr &MI, LiveVariables *LV, LiveIntervals *LIS) const override
bool mayReadEXEC(const MachineRegisterInfo &MRI, const MachineInstr &MI) const
Returns true if the instruction could potentially depend on the value of exec.
void legalizeOperandsSMRD(MachineRegisterInfo &MRI, MachineInstr &MI) const
bool isBranchOffsetInRange(unsigned BranchOpc, int64_t BrOffset) const override
unsigned insertBranch(MachineBasicBlock &MBB, MachineBasicBlock *TBB, MachineBasicBlock *FBB, ArrayRef< MachineOperand > Cond, const DebugLoc &DL, int *BytesAdded=nullptr) const override
void insertVectorSelect(MachineBasicBlock &MBB, MachineBasicBlock::iterator I, const DebugLoc &DL, Register DstReg, ArrayRef< MachineOperand > Cond, Register TrueReg, Register FalseReg) const
void insertNoop(MachineBasicBlock &MBB, MachineBasicBlock::iterator MI) const override
std::pair< MachineInstr *, MachineInstr * > expandMovDPP64(MachineInstr &MI) const
Register insertEQ(MachineBasicBlock *MBB, MachineBasicBlock::iterator I, const DebugLoc &DL, Register SrcReg, int Value) const
static bool isSOPC(const MachineInstr &MI)
static bool isFLAT(const MachineInstr &MI)
static bool isVALU(const MachineInstr &MI)
bool isBarrier(unsigned Opcode) const
MachineInstr * commuteInstructionImpl(MachineInstr &MI, bool NewMI, unsigned OpIdx0, unsigned OpIdx1) const override
int pseudoToMCOpcode(int Opcode) const
Return a target-specific opcode if Opcode is a pseudo instruction.
const MCInstrDesc & getMCOpcodeFromPseudo(unsigned Opcode) const
Return the descriptor of the target-specific machine instruction that corresponds to the specified ps...
bool isLegalGFX12PlusPackedMathFP32Operand(const MachineRegisterInfo &MRI, const MachineInstr &MI, unsigned SrcN, const MachineOperand *MO=nullptr) const
Check if MO would be a legal operand for gfx12+ packed math FP32 instructions.
static bool usesVM_CNT(const MachineInstr &MI)
MachineInstr * createPHIDestinationCopy(MachineBasicBlock &MBB, MachineBasicBlock::iterator InsPt, const DebugLoc &DL, Register Src, Register Dst) const override
static bool isFixedSize(const MachineInstr &MI)
bool isSafeToSink(MachineInstr &MI, MachineBasicBlock *SuccToSinkTo, MachineCycleInfo *CI) const override
LLVM_READONLY int commuteOpcode(unsigned Opc) const
ValueUniformity getValueUniformity(const MachineInstr &MI) const final
uint64_t getScratchRsrcWords23() const
LLVM_READONLY MachineOperand * getNamedOperand(MachineInstr &MI, AMDGPU::OpName OperandName) const
Returns the operand named Op.
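A hedged sketch of looking operands up by name rather than by index, the prevailing pattern in this file; the operand name is only an example:

  if (const MachineOperand *Src0 =
          TII->getNamedOperand(MI, AMDGPU::OpName::src0)) {
    if (Src0->isImm()) {
      int64_t Imm = Src0->getImm();
      (void)Imm; // e.g. test inline-constant legality before folding
    }
  }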
std::pair< unsigned, unsigned > decomposeMachineOperandsTargetFlags(unsigned TF) const override
bool areMemAccessesTriviallyDisjoint(const MachineInstr &MIa, const MachineInstr &MIb) const override
bool isOperandLegal(const MachineInstr &MI, unsigned OpIdx, const MachineOperand *MO=nullptr) const
Check if MO is a legal operand if it was the OpIdx Operand for MI.
void storeRegToStackSlot(MachineBasicBlock &MBB, MachineBasicBlock::iterator MI, Register SrcReg, bool isKill, int FrameIndex, const TargetRegisterClass *RC, Register VReg, MachineInstr::MIFlag Flags=MachineInstr::NoFlags) const override
std::optional< int64_t > getImmOrMaterializedImm(MachineOperand &Op) const
void moveToVALUImpl(SIInstrWorklist &Worklist, MachineDominatorTree *MDT, MachineInstr &Inst, DenseMap< MachineInstr *, V2PhysSCopyInfo > &WaterFalls, DenseMap< MachineInstr *, bool > &V2SPhyCopiesToErase) const
static bool isLDSDMA(const MachineInstr &MI)
static bool isVOP1(const MachineInstr &MI)
SIInstrInfo(const GCNSubtarget &ST)
void insertIndirectBranch(MachineBasicBlock &MBB, MachineBasicBlock &NewDestBB, MachineBasicBlock &RestoreBB, const DebugLoc &DL, int64_t BrOffset, RegScavenger *RS) const override
bool hasAnyModifiersSet(const MachineInstr &MI) const
This class keeps track of the SPI_SP_INPUT_ADDR config register, which tells the hardware which inter...
void setHasSpilledVGPRs(bool Spill=true)
bool isWWMReg(Register Reg) const
bool checkFlag(Register Reg, uint8_t Flag) const
void setHasSpilledSGPRs(bool Spill=true)
unsigned getScratchReservedForDynamicVGPRs() const
static unsigned getSubRegFromChannel(unsigned Channel, unsigned NumRegs=1)
ArrayRef< int16_t > getRegSplitParts(const TargetRegisterClass *RC, unsigned EltSize) const
unsigned getHWRegIndex(MCRegister Reg) const
bool isSGPRReg(const MachineRegisterInfo &MRI, Register Reg) const
unsigned getRegPressureLimit(const TargetRegisterClass *RC, MachineFunction &MF) const override
unsigned getChannelFromSubReg(unsigned SubReg) const
static bool isAGPRClass(const TargetRegisterClass *RC)
ScheduleDAGMI is an implementation of ScheduleDAGInstrs that simply schedules machine instructions ac...
virtual bool hasVRegLiveness() const
Return true if this DAG supports VReg liveness and RegPressure.
MachineFunction & MF
Machine function.
HazardRecognizer - This determines whether or not an instruction can be issued this cycle,...
SlotIndex - An opaque wrapper around machine indexes.
Definition SlotIndexes.h:66
SlotIndex getRegSlot(bool EC=false) const
Returns the register use/def slot in the current instruction for a normal or early-clobber def.
SlotIndexes pass.
SlotIndex insertMachineInstrInMaps(MachineInstr &MI, bool Late=false)
Insert the given machine instruction into the mapping.
Implements a dense probed hash-table based set with some number of buckets stored inline.
Definition DenseSet.h:291
This class consists of common code factored out of the SmallVector class to reduce code duplication b...
void push_back(const T &Elt)
This is a 'vector' (really, a variable-sized array), optimized for the case when the array is small.
int64_t getImm() const
Register getReg() const
Represent a constant reference to a string, i.e.
Definition StringRef.h:56
virtual ScheduleHazardRecognizer * CreateTargetMIHazardRecognizer(const InstrItineraryData *, const ScheduleDAGMI *DAG) const
Allocate and return a hazard recognizer to use for this target when scheduling the machine instructio...
virtual MachineInstr * createPHIDestinationCopy(MachineBasicBlock &MBB, MachineBasicBlock::iterator InsPt, const DebugLoc &DL, Register Src, Register Dst) const
During PHI elimination, lets the target make necessary checks and insert the copy to the PHI destinati...
virtual bool isReMaterializableImpl(const MachineInstr &MI) const
For instructions with opcodes for which the M_REMATERIALIZABLE flag is set, this hook lets the target...
virtual const MachineOperand & getCalleeOperand(const MachineInstr &MI) const
Returns the callee operand from the given MI.
virtual void reMaterialize(MachineBasicBlock &MBB, MachineBasicBlock::iterator MI, Register DestReg, unsigned SubIdx, const MachineInstr &Orig, LaneBitmask UsedLanes=LaneBitmask::getAll()) const
Re-issue the specified 'original' instruction at the specific location targeting a new destination re...
virtual MachineInstr * createPHISourceCopy(MachineBasicBlock &MBB, MachineBasicBlock::iterator InsPt, const DebugLoc &DL, Register Src, unsigned SrcSubReg, Register Dst) const
During PHI elimination, lets the target make necessary checks and insert the copy to the PHI destinati...
virtual MachineInstr * commuteInstructionImpl(MachineInstr &MI, bool NewMI, unsigned OpIdx1, unsigned OpIdx2) const
This method commutes the operands of the given machine instruction MI.
virtual bool isGlobalMemoryObject(const MachineInstr *MI) const
Returns true if MI is an instruction we are unable to reason about (like a call or something with unm...
virtual bool expandPostRAPseudo(MachineInstr &MI) const
This function is called for all pseudo instructions that remain after register allocation.
const MCAsmInfo & getMCAsmInfo() const
Return target specific asm information.
bool contains(Register Reg) const
Return true if the specified register is included in this register class.
bool hasSuperClassEq(const TargetRegisterClass *RC) const
Returns true if RC is a super-class of or equal to this class.
TargetRegisterInfo base class - We assume that the target defines a static array of TargetRegisterDes...
static constexpr TypeSize getFixed(ScalarTy ExactSize)
Definition TypeSize.h:343
A Use represents the edge between a Value definition and its users.
Definition Use.h:35
LLVM Value Representation.
Definition Value.h:75
std::pair< iterator, bool > insert(const ValueT &V)
Definition DenseSet.h:202
size_type count(const_arg_type_t< ValueT > V) const
Return 1 if the specified key is in the set, 0 otherwise.
Definition DenseSet.h:180
self_iterator getIterator()
Definition ilist_node.h:123
Changed
#define llvm_unreachable(msg)
Marks that the current location is not supposed to be reachable.
@ REGION_ADDRESS
Address space for region memory. (GDS)
@ LOCAL_ADDRESS
Address space for local memory.
@ FLAT_ADDRESS
Address space for flat memory.
@ GLOBAL_ADDRESS
Address space for global memory (RAT0, VTX0).
@ PRIVATE_ADDRESS
Address space for private memory.
unsigned encodeFieldSaSdst(unsigned Encoded, unsigned SaSdst)
bool isPackedFP32Inst(unsigned Opc)
bool isInlinableLiteralBF16(int16_t Literal, bool HasInv2Pi)
const uint64_t RSRC_DATA_FORMAT
bool isPKFMACF16InlineConstant(uint32_t Literal, bool IsGFX11Plus)
LLVM_READONLY const MIMGInfo * getMIMGInfo(unsigned Opc)
bool isInlinableLiteralFP16(int16_t Literal, bool HasInv2Pi)
bool getWMMAIsXDL(unsigned Opc)
unsigned mapWMMA2AddrTo3AddrOpcode(unsigned Opc)
bool isInlinableLiteralV2I16(uint32_t Literal)
bool isDPMACCInstruction(unsigned Opc)
bool isHi16Reg(MCRegister Reg, const MCRegisterInfo &MRI)
bool isInlinableLiteralV2BF16(uint32_t Literal)
LLVM_READONLY int32_t getCommuteRev(uint32_t Opcode)
LLVM_READONLY int32_t getCommuteOrig(uint32_t Opcode)
unsigned getNumFlatOffsetBits(const MCSubtargetInfo &ST)
For pre-GFX12 FLAT instructions the offset must be positive; MSB is ignored and forced to zero.
bool isGFX12Plus(const MCSubtargetInfo &STI)
bool isInlinableLiteralV2F16(uint32_t Literal)
bool isValid32BitLiteral(uint64_t Val, bool IsFP64)
LLVM_READONLY int32_t getGlobalVaddrOp(uint32_t Opcode)
LLVM_READNONE bool isLegalDPALU_DPPControl(const MCSubtargetInfo &ST, unsigned DC)
LLVM_READONLY int32_t getMFMAEarlyClobberOp(uint32_t Opcode)
bool getMAIIsGFX940XDL(unsigned Opc)
const uint64_t RSRC_ELEMENT_SIZE_SHIFT
bool isIntrinsicAlwaysUniform(unsigned IntrID)
LLVM_READONLY bool hasNamedOperand(uint64_t Opcode, OpName NamedIdx)
LLVM_READONLY int32_t getIfAddr64Inst(uint32_t Opcode)
Check if Opcode is an Addr64 opcode.
LLVM_READONLY const MIMGDimInfo * getMIMGDimInfoByEncoding(uint8_t DimEnc)
bool isInlinableLiteral32(int32_t Literal, bool HasInv2Pi)
const uint64_t RSRC_TID_ENABLE
LLVM_READONLY int32_t getVOPe32(uint32_t Opcode)
bool isIntrinsicSourceOfDivergence(unsigned IntrID)
constexpr bool isSISrcOperand(const MCOperandInfo &OpInfo)
Is this an AMDGPU specific source operand?
bool isGenericAtomic(unsigned Opc)
LLVM_READNONE bool isInlinableIntLiteral(int64_t Literal)
Is this literal inlinable, and not one of the inline values intended for floating-point use.
unsigned getAddrSizeMIMGOp(const MIMGBaseOpcodeInfo *BaseOpcode, const MIMGDimInfo *Dim, bool IsA16, bool IsG16Supported)
LLVM_READONLY int32_t getAddr64Inst(uint32_t Opcode)
int32_t getMCOpcode(uint32_t Opcode, unsigned Gen)
@ OPERAND_KIMM32
Operand with 32-bit immediate that uses the constant bus.
Definition SIDefines.h:234
@ OPERAND_REG_IMM_INT64
Definition SIDefines.h:204
@ OPERAND_REG_IMM_V2FP16
Definition SIDefines.h:211
@ OPERAND_REG_INLINE_C_FP64
Definition SIDefines.h:225
@ OPERAND_REG_INLINE_C_BF16
Definition SIDefines.h:222
@ OPERAND_REG_INLINE_C_V2BF16
Definition SIDefines.h:227
@ OPERAND_REG_IMM_V2INT16
Definition SIDefines.h:213
@ OPERAND_REG_IMM_BF16
Definition SIDefines.h:208
@ OPERAND_REG_IMM_INT32
Operands with register, 32-bit, or 64-bit immediate.
Definition SIDefines.h:203
@ OPERAND_REG_IMM_V2BF16
Definition SIDefines.h:210
@ OPERAND_REG_IMM_FP16
Definition SIDefines.h:209
@ OPERAND_REG_IMM_V2FP16_SPLAT
Definition SIDefines.h:212
@ OPERAND_REG_INLINE_C_INT64
Definition SIDefines.h:221
@ OPERAND_REG_INLINE_C_INT16
Operands with register or inline constant.
Definition SIDefines.h:219
@ OPERAND_REG_IMM_NOINLINE_V2FP16
Definition SIDefines.h:214
@ OPERAND_REG_IMM_FP64
Definition SIDefines.h:207
@ OPERAND_REG_INLINE_C_V2FP16
Definition SIDefines.h:228
@ OPERAND_REG_INLINE_AC_INT32
Operands with an AccVGPR register or inline constant.
Definition SIDefines.h:239
@ OPERAND_REG_INLINE_AC_FP32
Definition SIDefines.h:240
@ OPERAND_REG_IMM_V2INT32
Definition SIDefines.h:215
@ OPERAND_SDWA_VOPC_DST
Definition SIDefines.h:251
@ OPERAND_REG_IMM_FP32
Definition SIDefines.h:206
@ OPERAND_REG_INLINE_C_FP32
Definition SIDefines.h:224
@ OPERAND_REG_INLINE_C_INT32
Definition SIDefines.h:220
@ OPERAND_REG_INLINE_C_V2INT16
Definition SIDefines.h:226
@ OPERAND_INLINE_C_AV64_PSEUDO
Definition SIDefines.h:245
@ OPERAND_REG_IMM_V2FP32
Definition SIDefines.h:216
@ OPERAND_REG_INLINE_AC_FP64
Definition SIDefines.h:241
@ OPERAND_REG_INLINE_C_FP16
Definition SIDefines.h:223
@ OPERAND_REG_IMM_INT16
Definition SIDefines.h:205
@ OPERAND_INLINE_SPLIT_BARRIER_INT32
Definition SIDefines.h:231
LLVM_READONLY int32_t getBasicFromSDWAOp(uint32_t Opcode)
bool isDPALU_DPP(const MCInstrDesc &OpDesc, const MCInstrInfo &MII, const MCSubtargetInfo &ST)
@ TI_SCRATCH_RSRC_DWORD1
Definition AMDGPU.h:614
@ TI_SCRATCH_RSRC_DWORD3
Definition AMDGPU.h:616
@ TI_SCRATCH_RSRC_DWORD0
Definition AMDGPU.h:613
@ TI_SCRATCH_RSRC_DWORD2
Definition AMDGPU.h:615
@ TI_CONSTDATA_START
Definition AMDGPU.h:612
unsigned getRegBitWidth(const TargetRegisterClass &RC)
Get the size in bits of a register from the register class RC.
bool supportsScaleOffset(const MCInstrInfo &MII, unsigned Opcode)
const uint64_t RSRC_INDEX_STRIDE_SHIFT
LLVM_READONLY const MIMGBaseOpcodeInfo * getMIMGBaseOpcodeInfo(unsigned BaseOpcode)
LLVM_READONLY int32_t getFlatScratchInstSVfromSS(uint32_t Opcode)
bool isInlinableLiteralI16(int32_t Literal, bool HasInv2Pi)
LLVM_READNONE constexpr bool isGraphics(CallingConv::ID CC)
bool isInlinableLiteral64(int64_t Literal, bool HasInv2Pi)
Is this literal inlinable.
@ AMDGPU_CS
Used for Mesa/AMDPAL compute shaders.
@ AMDGPU_VS
Used for Mesa vertex shaders, or AMDPAL last shader stage before rasterization (vertex shader if tess...
@ AMDGPU_KERNEL
Used for AMDGPU code object kernels.
@ AMDGPU_HS
Used for Mesa/AMDPAL hull shaders (= tessellation control shaders).
@ AMDGPU_GS
Used for Mesa/AMDPAL geometry shaders.
@ AMDGPU_PS
Used for Mesa/AMDPAL pixel shaders.
@ Fast
Attempts to make calls as fast as possible (e.g.
Definition CallingConv.h:41
@ AMDGPU_ES
Used for AMDPAL shader stage before geometry shader if geometry is in use.
@ AMDGPU_LS
Used for AMDPAL vertex shader if tessellation is in use.
@ C
The default llvm calling convention, compatible with C.
Definition CallingConv.h:34
@ OPERAND_GENERIC_4
Definition MCInstrDesc.h:71
@ OPERAND_GENERIC_2
Definition MCInstrDesc.h:69
@ OPERAND_GENERIC_1
Definition MCInstrDesc.h:68
@ OPERAND_GENERIC_3
Definition MCInstrDesc.h:70
@ OPERAND_IMMEDIATE
Definition MCInstrDesc.h:61
@ OPERAND_GENERIC_0
Definition MCInstrDesc.h:67
@ OPERAND_GENERIC_5
Definition MCInstrDesc.h:72
Not(const Pred &P) -> Not< Pred >
initializer< Ty > init(const Ty &Val)
This is an optimization pass for GlobalISel generic memory operations.
auto drop_begin(T &&RangeOrContainer, size_t N=1)
Return a range covering RangeOrContainer with the first N elements excluded.
Definition STLExtras.h:315
@ Low
Lower the current thread's priority such that it does not affect foreground tasks significantly.
Definition Threading.h:280
@ Offset
Definition DWP.cpp:557
LLVM_ABI void finalizeBundle(MachineBasicBlock &MBB, MachineBasicBlock::instr_iterator FirstMI, MachineBasicBlock::instr_iterator LastMI)
finalizeBundle - Finalize a machine instruction bundle which includes a sequence of instructions star...
TargetInstrInfo::RegSubRegPair getRegSubRegPair(const MachineOperand &O)
Create RegSubRegPair from a register MachineOperand.
bool all_of(R &&range, UnaryPredicate P)
Provide wrappers to std::all_of which take ranges instead of having to pass begin/end explicitly.
Definition STLExtras.h:1738
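A small sketch of the range-based all_of wrapper over machine operands; the predicate is illustrative:

  bool NoPhysRegOperands =
      llvm::all_of(MI.operands(), [](const MachineOperand &MO) {
        return !MO.isReg() || !MO.getReg().isPhysical();
      });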
constexpr uint64_t maxUIntN(uint64_t N)
Gets the maximum value for a N-bit unsigned integer.
Definition MathExtras.h:207
MachineInstrBuilder BuildMI(MachineFunction &MF, const MIMetadata &MIMD, const MCInstrDesc &MCID)
Builder interface. Specify how to create the initial instruction itself.
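A minimal BuildMI sketch in the shape of a register copy, pairing the opcode descriptor with a kill flag on the source; the opcode, registers, and KillSrc flag are placeholders:

  BuildMI(MBB, InsertPt, DL, TII->get(AMDGPU::V_MOV_B32_e32), DstReg)
      .addReg(SrcReg, getKillRegState(KillSrc));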
constexpr bool isInt(int64_t x)
Checks if an integer fits into the given bit width.
Definition MathExtras.h:165
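A hedged sketch of the compile-time-width isInt check as it is typically used to validate a branch displacement; the 16-bit width is illustrative:

  // BrOffset: signed byte displacement computed elsewhere (illustrative).
  if (!isInt<16>(BrOffset)) {
    // Does not fit a signed 16-bit branch immediate; a long-branch
    // (indirect) sequence is needed instead.
  }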
bool execMayBeModifiedBeforeUse(const MachineRegisterInfo &MRI, Register VReg, const MachineInstr &DefMI, const MachineInstr &UseMI)
Return false if EXEC is not changed between the def of VReg at DefMI and the use at UseMI.
RegState
Flags to represent properties of register accesses.
@ Implicit
Not emitted register (e.g. carry, or temporary result).
@ Dead
Unused definition.
@ Kill
The last use of a register.
@ Undef
Value of the register doesn't matter.
@ Define
Register definition.
auto enumerate(FirstRange &&First, RestRanges &&...Rest)
Given two or more input ranges, returns a new range whose values are tuples (A, B,...
Definition STLExtras.h:2553
constexpr RegState getKillRegState(bool B)
decltype(auto) dyn_cast(const From &Val)
dyn_cast<X> - Return the argument parameter cast to the specified type.
Definition Casting.h:643
iterator_range< T > make_range(T x, T y)
Convenience function for iterating over sub-ranges.
iterator_range< early_inc_iterator_impl< detail::IterOfRange< RangeT > > > make_early_inc_range(RangeT &&Range)
Make a range that does early increment to allow mutation of the underlying range without disrupting i...
Definition STLExtras.h:633
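A hedged sketch of make_early_inc_range used to erase instructions while iterating a block; the iterator is advanced before the body runs, so erasing is safe:

  for (MachineInstr &I : make_early_inc_range(MBB)) {
    if (I.isCopy() && I.getOperand(0).getReg() == I.getOperand(1).getReg())
      I.eraseFromParent(); // identity copy; removal does not break the loop
  }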
constexpr T alignDown(U Value, V Align, W Skew=0)
Returns the largest unsigned integer less than or equal to Value that is congruent to Skew modulo Align.
Definition MathExtras.h:546
constexpr bool isPowerOf2_64(uint64_t Value)
Return true if the argument is a power of two > 0 (64 bit edition.)
Definition MathExtras.h:284
Op::Description Desc
constexpr int popcount(T Value) noexcept
Count the number of set bits in a value.
Definition bit.h:156
int countr_zero(T Val)
Count number of 0's from the least significant bit to the most stopping at the first 1.
Definition bit.h:204
TargetInstrInfo::RegSubRegPair getRegSequenceSubReg(MachineInstr &MI, unsigned SubReg)
Return the SubReg component from REG_SEQUENCE.
static const MachineMemOperand::Flags MONoClobber
Mark the MMO of a uniform load if there are no potentially clobbering stores on any path from the sta...
Definition SIInstrInfo.h:44
constexpr bool has_single_bit(T Value) noexcept
Definition bit.h:149
bool any_of(R &&range, UnaryPredicate P)
Provide wrappers to std::any_of which take ranges instead of having to pass begin/end explicitly.
Definition STLExtras.h:1745
unsigned Log2_32(uint32_t Value)
Return the floor log base 2 of the specified value, -1 if the value is zero.
Definition MathExtras.h:331
auto reverse(ContainerTy &&C)
Definition STLExtras.h:407
MachineInstr * getImm(const MachineOperand &MO, const MachineRegisterInfo *MRI)
MachineInstr * getVRegSubRegDef(const TargetInstrInfo::RegSubRegPair &P, const MachineRegisterInfo &MRI)
Return the defining instruction for a given reg:subreg pair skipping copy like instructions and subre...
decltype(auto) get(const PointerIntPair< PointerTy, IntBits, IntType, PtrTraits, Info > &Pair)
constexpr uint32_t Hi_32(uint64_t Value)
Return the high 32 bits of a 64 bit value.
Definition MathExtras.h:150
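A hedged sketch of Hi_32/Lo_32 splitting a 64-bit immediate into two halves, the usual prelude to emitting two 32-bit moves into sub0/sub1 of a 64-bit register; the value is illustrative:

  uint64_t Imm = 0x1234567890ABCDEFULL;
  uint32_t LoHalf = Lo_32(Imm); // low 32 bits
  uint32_t HiHalf = Hi_32(Imm); // high 32 bits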
LLVM_ABI raw_ostream & dbgs()
dbgs() - This returns a reference to a raw_ostream for debugging messages.
Definition Debug.cpp:209
FunctionAddr VTableAddr Count
Definition InstrProf.h:139
constexpr bool isUInt(uint64_t x)
Checks if an unsigned integer fits into the given bit width.
Definition MathExtras.h:189
class LLVM_GSL_OWNER SmallVector
Forward declaration of SmallVector so that calculateSmallVectorDefaultInlinedElements can reference s...
constexpr uint32_t Lo_32(uint64_t Value)
Return the low 32 bits of a 64 bit value.
Definition MathExtras.h:155
bool isa(const From &Val)
isa<X> - Return true if the parameter to the template is an instance of one of the template type argu...
Definition Casting.h:547
LLVM_ABI VirtRegInfo AnalyzeVirtRegInBundle(MachineInstr &MI, Register Reg, SmallVectorImpl< std::pair< MachineInstr *, unsigned > > *Ops=nullptr)
AnalyzeVirtRegInBundle - Analyze how the current instruction or bundle uses a virtual register.
static const MachineMemOperand::Flags MOCooperative
Mark the MMO of cooperative load/store atomics.
Definition SIInstrInfo.h:52
constexpr T divideCeil(U Numerator, V Denominator)
Returns the integer ceil(Numerator / Denominator).
Definition MathExtras.h:394
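A small sketch of divideCeil rounding a byte size up to whole 32-bit dwords, the granularity most spill and copy loops work at; the size is illustrative:

  unsigned SizeInBytes = 24;
  unsigned NumDWords = divideCeil(SizeInBytes, 4u); // 6 dwords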
@ First
Helpers to iterate all locations in the MemoryEffectsBase class.
Definition ModRef.h:74
FunctionAddr VTableAddr uintptr_t uintptr_t Data
Definition InstrProf.h:221
@ Xor
Bitwise or logical XOR of integers.
@ Sub
Subtraction of integers.
@ Add
Sum of integers.
bool isTargetSpecificOpcode(unsigned Opcode)
Check whether the given Opcode is a target-specific opcode.
DWARFExpression::Operation Op
ArrayRef(const T &OneElt) -> ArrayRef< T >
constexpr unsigned DefaultMemoryClusterDWordsLimit
Definition SIInstrInfo.h:40
constexpr unsigned BitWidth
constexpr bool isIntN(unsigned N, int64_t x)
Checks if a signed integer fits into the given (dynamic) bit width.
Definition MathExtras.h:248
static const MachineMemOperand::Flags MOLastUse
Mark the MMO of a load as the last use.
Definition SIInstrInfo.h:48
constexpr T reverseBits(T Val)
Reverse the bits in Val.
Definition MathExtras.h:118
bool is_contained(R &&Range, const E &Element)
Returns true if Element is found in Range.
Definition STLExtras.h:1946
constexpr int64_t SignExtend64(uint64_t x)
Sign-extend the number in the bottom B bits of X to a 64-bit integer.
Definition MathExtras.h:572
constexpr T maskTrailingOnes(unsigned N)
Create a bitmask with the N right-most bits set to 1, and all other bits set to 0.
Definition MathExtras.h:77
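Decoding a signed bitfield typically chains maskTrailingOnes, SignExtend64, and isIntN; the 21-bit field width below is an arbitrary choice for illustration.
// Illustrative sketch; the 21-bit width is arbitrary.
#include "llvm/Support/MathExtras.h"
#include <cassert>
#include <cstdint>
int64_t decodeSImm21(uint64_t Encoded) {
  // Keep only the low 21 bits of the encoding.
  uint64_t Field = Encoded & llvm::maskTrailingOnes<uint64_t>(21);
  // Sign-extend those 21 bits to a full 64-bit integer.
  int64_t Value = llvm::SignExtend64<21>(Field);
  assert(llvm::isIntN(21, Value) && "result must fit in a signed 21-bit field");
  return Value;
}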
LLVM_ABI const Value * getUnderlyingObject(const Value *V, unsigned MaxLookup=MaxLookupSearchDepth)
This method strips off any GEP address adjustments, pointer casts or llvm.threadlocal....
constexpr RegState getUndefRegState(bool B)
ValueUniformity
Enum describing how values behave with respect to uniformity and divergence, to answer the question: ...
Definition Uniformity.h:18
@ AlwaysUniform
The result value is always uniform.
Definition Uniformity.h:23
@ NeverUniform
The result value can never be assumed to be uniform.
Definition Uniformity.h:26
@ Default
The result value is uniform if and only if all operands are uniform.
Definition Uniformity.h:20
MachineCycleInfo::CycleT MachineCycle
static const MachineMemOperand::Flags MOThreadPrivate
Mark the MMO of accesses to memory locations that are never written to by other threads.
Definition SIInstrInfo.h:63
bool execMayBeModifiedBeforeAnyUse(const MachineRegisterInfo &MRI, Register VReg, const MachineInstr &DefMI)
Return false if EXEC is not changed between the def of VReg at DefMI and all its uses.
void swap(llvm::BitVector &LHS, llvm::BitVector &RHS)
Implement std::swap in terms of BitVector swap.
Definition BitVector.h:876
Helper struct for the implementation of 3-address conversion to communicate updates made to instructi...
MachineInstr * RemoveMIUse
Other instruction whose def is no longer used by the converted instruction.
static constexpr uint64_t encode(Fields... Values)
This struct is a compact representation of a valid (non-zero power of two) alignment.
Definition Alignment.h:39
constexpr uint64_t value() const
This is a hole in the type system and should not be abused.
Definition Alignment.h:77
constexpr bool all() const
Definition LaneBitmask.h:54
SparseBitVector AliveBlocks
AliveBlocks - Set of blocks through which this value is completely alive.
This class contains a discriminated union of information about pointers in memory operands,...
static LLVM_ABI MachinePointerInfo getFixedStack(MachineFunction &MF, int FI, int64_t Offset=0)
Return a MachinePointerInfo record that refers to the specified FrameIndex.
Utility to store a worklist of machine instructions.
Definition SIInstrInfo.h:67
MachineInstr * top() const
Definition SIInstrInfo.h:72
bool isDeferred(MachineInstr *MI)
SetVector< MachineInstr * > & getDeferredList()
Definition SIInstrInfo.h:91
void insert(MachineInstr *MI)
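A hedged sketch of how the worklist members listed here fit together, assuming the utility defined at SIInstrInfo.h:67 is the SIInstrWorklist class and that the caller has already ensured the worklist is non-empty; SIInstrInfo's own use of it may differ.
// Hedged sketch; not how SIInstrInfo.cpp itself drives the worklist.
#include "SIInstrInfo.h"
void seedWorklist(llvm::SIInstrWorklist &Worklist, llvm::MachineInstr &MI) {
  // Queue an instruction for later processing.
  Worklist.insert(&MI);
  // Peek at the next instruction without removing it.
  llvm::MachineInstr *Next = Worklist.top();
  // Instructions that cannot be handled yet are tracked on the deferred list.
  if (Worklist.isDeferred(Next))
    (void)Worklist.getDeferredList();
}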
A pair composed of a register and a sub-register index.
VirtRegInfo - Information about a virtual register used by a set of operands.
bool Reads
Reads - One of the operands reads the virtual register.
bool Writes
Writes - One of the operands writes the virtual register.
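Finally, a small hedged example of how AnalyzeVirtRegInBundle and the VirtRegInfo flags above are typically consumed; the instruction and register are supplied by the caller.
// Illustrative sketch only.
#include "llvm/CodeGen/MachineInstr.h"
#include "llvm/CodeGen/MachineInstrBundle.h"
#include "llvm/CodeGen/Register.h"
bool bundleOnlyReadsReg(llvm::MachineInstr &MI, llvm::Register Reg) {
  // Summarize how the bundle containing MI uses the virtual register Reg.
  llvm::VirtRegInfo VRI = llvm::AnalyzeVirtRegInBundle(MI, Reg);
  // Reads/Writes record whether any operand in the bundle reads or writes Reg.
  return VRI.Reads && !VRI.Writes;
}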