SIFoldOperands.cpp
1//===-- SIFoldOperands.cpp - Fold operands --------------------------------===//
2//
3// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4// See https://llvm.org/LICENSE.txt for license information.
5// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6//
7/// \file
8//===----------------------------------------------------------------------===//
9//
10
11#include "SIFoldOperands.h"
12#include "AMDGPU.h"
13#include "GCNSubtarget.h"
15#include "SIInstrInfo.h"
17#include "SIRegisterInfo.h"
22
23#define DEBUG_TYPE "si-fold-operands"
24using namespace llvm;
25
26namespace {
27
28/// Track a value we may want to fold into downstream users, applying
29/// subregister extracts along the way.
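/// For example (illustrative): when a 64-bit immediate def is used through a
/// sub0 extract, DefSubReg records sub0 so that getEffectiveImmVal() returns
/// only the low 32 bits of ImmToFold at the use point.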
30struct FoldableDef {
31 union {
32 MachineOperand *OpToFold = nullptr;
33 uint64_t ImmToFold;
34 int FrameIndexToFold;
35 };
36
37 /// Register class of the originally defined value.
38 const TargetRegisterClass *DefRC = nullptr;
39
40 /// Track the original defining instruction for the value.
41 const MachineInstr *DefMI = nullptr;
42
43 /// Subregister to apply to the value at the use point.
44 unsigned DefSubReg = AMDGPU::NoSubRegister;
45
46 /// Kind of value stored in the union.
47 MachineOperand::MachineOperandType Kind;
48
49 FoldableDef() = delete;
50 FoldableDef(MachineOperand &FoldOp, const TargetRegisterClass *DefRC,
51 unsigned DefSubReg = AMDGPU::NoSubRegister)
52 : DefRC(DefRC), DefSubReg(DefSubReg), Kind(FoldOp.getType()) {
53
54 if (FoldOp.isImm()) {
55 ImmToFold = FoldOp.getImm();
56 } else if (FoldOp.isFI()) {
57 FrameIndexToFold = FoldOp.getIndex();
58 } else {
59 assert(FoldOp.isReg() || FoldOp.isGlobal());
60 OpToFold = &FoldOp;
61 }
62
63 DefMI = FoldOp.getParent();
64 }
65
66 FoldableDef(int64_t FoldImm, const TargetRegisterClass *DefRC,
67 unsigned DefSubReg = AMDGPU::NoSubRegister)
68 : ImmToFold(FoldImm), DefRC(DefRC), DefSubReg(DefSubReg),
69 Kind(MachineOperand::MO_Immediate) {}
70
71 /// Copy the current def and apply \p SubReg to the value.
72 FoldableDef getWithSubReg(const SIRegisterInfo &TRI, unsigned SubReg) const {
73 FoldableDef Copy(*this);
74 Copy.DefSubReg = TRI.composeSubRegIndices(DefSubReg, SubReg);
75 return Copy;
76 }
77
78 bool isReg() const { return Kind == MachineOperand::MO_Register; }
79
80 Register getReg() const {
81 assert(isReg());
82 return OpToFold->getReg();
83 }
84
85 unsigned getSubReg() const {
86 assert(isReg());
87 return OpToFold->getSubReg();
88 }
89
90 bool isImm() const { return Kind == MachineOperand::MO_Immediate; }
91
92 bool isFI() const {
93 return Kind == MachineOperand::MO_FrameIndex;
94 }
95
96 int getFI() const {
97 assert(isFI());
98 return FrameIndexToFold;
99 }
100
101 bool isGlobal() const { return Kind == MachineOperand::MO_GlobalAddress; }
102
103 /// Return the effective immediate value defined by this instruction, after
104 /// application of any subregister extracts which may exist between the use
105 /// and def instruction.
106 std::optional<int64_t> getEffectiveImmVal() const {
107 assert(isImm());
108 return SIInstrInfo::extractSubregFromImm(ImmToFold, DefSubReg);
109 }
110
111 /// Check if it is legal to fold this effective value into \p MI's \p OpNo
112 /// operand.
113 bool isOperandLegal(const SIInstrInfo &TII, const MachineInstr &MI,
114 unsigned OpIdx) const {
115 switch (Kind) {
116 case MachineOperand::MO_Immediate: {
117 std::optional<int64_t> ImmToFold = getEffectiveImmVal();
118 if (!ImmToFold)
119 return false;
120
121 // TODO: Should verify the subregister index is supported by the class
122 // TODO: Avoid the temporary MachineOperand
123 MachineOperand TmpOp = MachineOperand::CreateImm(*ImmToFold);
124 return TII.isOperandLegal(MI, OpIdx, &TmpOp);
125 }
126 case MachineOperand::MO_FrameIndex: {
127 if (DefSubReg != AMDGPU::NoSubRegister)
128 return false;
129 MachineOperand TmpOp = MachineOperand::CreateFI(FrameIndexToFold);
130 return TII.isOperandLegal(MI, OpIdx, &TmpOp);
131 }
132 default:
133 // TODO: Try to apply DefSubReg, for global address we can extract
134 // low/high.
135 if (DefSubReg != AMDGPU::NoSubRegister)
136 return false;
137 return TII.isOperandLegal(MI, OpIdx, OpToFold);
138 }
139
140 llvm_unreachable("covered MachineOperand kind switch");
141 }
142};
143
144struct FoldCandidate {
145 MachineInstr *UseMI;
146 FoldableDef Def;
147 int ShrinkOpcode;
148 unsigned UseOpNo;
149 bool Commuted;
150
151 FoldCandidate(MachineInstr *MI, unsigned OpNo, FoldableDef Def,
152 bool Commuted = false, int ShrinkOp = -1)
153 : UseMI(MI), Def(Def), ShrinkOpcode(ShrinkOp), UseOpNo(OpNo),
154 Commuted(Commuted) {}
155
156 bool isFI() const { return Def.isFI(); }
157
158 int getFI() const {
159 assert(isFI());
160 return Def.FrameIndexToFold;
161 }
162
163 bool isImm() const { return Def.isImm(); }
164
165 bool isReg() const { return Def.isReg(); }
166
167 Register getReg() const { return Def.getReg(); }
168
169 bool isGlobal() const { return Def.isGlobal(); }
170
171 bool needsShrink() const { return ShrinkOpcode != -1; }
172};
173
174class SIFoldOperandsImpl {
175public:
176 MachineFunction *MF;
177 MachineRegisterInfo *MRI;
178 const SIInstrInfo *TII;
179 const SIRegisterInfo *TRI;
180 const GCNSubtarget *ST;
181 const SIMachineFunctionInfo *MFI;
182
183 bool frameIndexMayFold(const MachineInstr &UseMI, int OpNo,
184 const FoldableDef &OpToFold) const;
185
186 // TODO: Just use TII::getVALUOp
187 unsigned convertToVALUOp(unsigned Opc, bool UseVOP3 = false) const {
188 switch (Opc) {
189 case AMDGPU::S_ADD_I32: {
190 if (ST->hasAddNoCarry())
191 return UseVOP3 ? AMDGPU::V_ADD_U32_e64 : AMDGPU::V_ADD_U32_e32;
192 return UseVOP3 ? AMDGPU::V_ADD_CO_U32_e64 : AMDGPU::V_ADD_CO_U32_e32;
193 }
194 case AMDGPU::S_OR_B32:
195 return UseVOP3 ? AMDGPU::V_OR_B32_e64 : AMDGPU::V_OR_B32_e32;
196 case AMDGPU::S_AND_B32:
197 return UseVOP3 ? AMDGPU::V_AND_B32_e64 : AMDGPU::V_AND_B32_e32;
198 case AMDGPU::S_MUL_I32:
199 return AMDGPU::V_MUL_LO_U32_e64;
200 default:
201 return AMDGPU::INSTRUCTION_LIST_END;
202 }
203 }
204
205 bool foldCopyToVGPROfScalarAddOfFrameIndex(Register DstReg, Register SrcReg,
206 MachineInstr &MI) const;
207
208 bool updateOperand(FoldCandidate &Fold) const;
209
210 bool canUseImmWithOpSel(const MachineInstr *MI, unsigned UseOpNo,
211 int64_t ImmVal) const;
212
213 /// Try to fold immediate \p ImmVal into \p MI's operand at index \p UseOpNo.
214 bool tryFoldImmWithOpSel(MachineInstr *MI, unsigned UseOpNo,
215 int64_t ImmVal) const;
216
217 bool tryAddToFoldList(SmallVectorImpl<FoldCandidate> &FoldList,
218 MachineInstr *MI, unsigned OpNo,
219 const FoldableDef &OpToFold) const;
220 bool isUseSafeToFold(const MachineInstr &MI,
221 const MachineOperand &UseMO) const;
222
223 const TargetRegisterClass *getRegSeqInit(
224 MachineInstr &RegSeq,
225 SmallVectorImpl<std::pair<MachineOperand *, unsigned>> &Defs) const;
226
227 const TargetRegisterClass *
228 getRegSeqInit(SmallVectorImpl<std::pair<MachineOperand *, unsigned>> &Defs,
229 Register UseReg) const;
230
231 std::pair<int64_t, const TargetRegisterClass *>
232 isRegSeqSplat(MachineInstr &RegSeg) const;
233
234 bool tryFoldRegSeqSplat(MachineInstr *UseMI, unsigned UseOpIdx,
235 int64_t SplatVal,
236 const TargetRegisterClass *SplatRC) const;
237
238 bool tryToFoldACImm(const FoldableDef &OpToFold, MachineInstr *UseMI,
239 unsigned UseOpIdx,
240 SmallVectorImpl<FoldCandidate> &FoldList) const;
241 void foldOperand(FoldableDef OpToFold, MachineInstr *UseMI, int UseOpIdx,
242 SmallVectorImpl<FoldCandidate> &FoldList,
243 SmallVectorImpl<MachineInstr *> &CopiesToReplace) const;
244
245 std::optional<int64_t> getImmOrMaterializedImm(MachineOperand &Op) const;
246 bool tryConstantFoldOp(MachineInstr *MI) const;
247 bool tryFoldCndMask(MachineInstr &MI) const;
248 bool tryFoldZeroHighBits(MachineInstr &MI) const;
249 bool foldInstOperand(MachineInstr &MI, const FoldableDef &OpToFold) const;
250
251 bool foldCopyToAGPRRegSequence(MachineInstr *CopyMI) const;
252 bool tryFoldFoldableCopy(MachineInstr &MI,
253 MachineOperand *&CurrentKnownM0Val) const;
254
255 const MachineOperand *isClamp(const MachineInstr &MI) const;
256 bool tryFoldClamp(MachineInstr &MI);
257
258 std::pair<const MachineOperand *, int> isOMod(const MachineInstr &MI) const;
259 bool tryFoldOMod(MachineInstr &MI);
260 bool tryFoldRegSequence(MachineInstr &MI);
261 bool tryFoldPhiAGPR(MachineInstr &MI);
262 bool tryFoldLoad(MachineInstr &MI);
263
264 bool tryOptimizeAGPRPhis(MachineBasicBlock &MBB);
265
266public:
267 SIFoldOperandsImpl() = default;
268
269 bool run(MachineFunction &MF);
270};
271
272class SIFoldOperandsLegacy : public MachineFunctionPass {
273public:
274 static char ID;
275
276 SIFoldOperandsLegacy() : MachineFunctionPass(ID) {}
277
278 bool runOnMachineFunction(MachineFunction &MF) override {
279 if (skipFunction(MF.getFunction()))
280 return false;
281 return SIFoldOperandsImpl().run(MF);
282 }
283
284 StringRef getPassName() const override { return "SI Fold Operands"; }
285
286 void getAnalysisUsage(AnalysisUsage &AU) const override {
287 AU.setPreservesCFG();
288 MachineFunctionPass::getAnalysisUsage(AU);
289 }
290
291 MachineFunctionProperties getRequiredProperties() const override {
292 return MachineFunctionProperties().setIsSSA();
293 }
294};
295
296} // End anonymous namespace.
297
298INITIALIZE_PASS(SIFoldOperandsLegacy, DEBUG_TYPE, "SI Fold Operands", false,
299 false)
300
301char SIFoldOperandsLegacy::ID = 0;
302
303char &llvm::SIFoldOperandsLegacyID = SIFoldOperandsLegacy::ID;
304
305static const TargetRegisterClass *getRegOpRC(const MachineRegisterInfo &MRI,
306 const TargetRegisterInfo &TRI,
307 const MachineOperand &MO) {
308 const TargetRegisterClass *RC = MRI.getRegClass(MO.getReg());
309 if (const TargetRegisterClass *SubRC =
310 TRI.getSubRegisterClass(RC, MO.getSubReg()))
311 RC = SubRC;
312 return RC;
313}
314
315// Map multiply-accumulate opcode to corresponding multiply-add opcode if any.
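// Illustrative motivation: the MAC/FMAC forms read their accumulator through a
// src2 that is tied to the destination, so an immediate cannot be folded into
// it directly. Rewriting to the untied MAD/FMA form (see tryAddToFoldList)
// makes the fold legal, roughly:
//   %d = V_MAC_F32_e64 %a, %b, %d(tied)   ; src2 cannot take a literal
//   %d = V_MAD_F32_e64 %a, %b, <imm>      ; after macToMad + untieRegOperand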
316static unsigned macToMad(unsigned Opc) {
317 switch (Opc) {
318 case AMDGPU::V_MAC_F32_e64:
319 return AMDGPU::V_MAD_F32_e64;
320 case AMDGPU::V_MAC_F16_e64:
321 return AMDGPU::V_MAD_F16_e64;
322 case AMDGPU::V_FMAC_F32_e64:
323 return AMDGPU::V_FMA_F32_e64;
324 case AMDGPU::V_FMAC_F16_e64:
325 return AMDGPU::V_FMA_F16_gfx9_e64;
326 case AMDGPU::V_FMAC_F16_t16_e64:
327 return AMDGPU::V_FMA_F16_gfx9_t16_e64;
328 case AMDGPU::V_FMAC_F16_fake16_e64:
329 return AMDGPU::V_FMA_F16_gfx9_fake16_e64;
330 case AMDGPU::V_FMAC_LEGACY_F32_e64:
331 return AMDGPU::V_FMA_LEGACY_F32_e64;
332 case AMDGPU::V_FMAC_F64_e64:
333 return AMDGPU::V_FMA_F64_e64;
334 }
335 return AMDGPU::INSTRUCTION_LIST_END;
336}
337
338// TODO: Add a heuristic for when the frame index might not fit in the addressing
339// mode's immediate offset, to avoid materializing it in loops.
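// Illustrative: given a single non-debug use and an immediate other operand,
//   %r = S_ADD_I32 <reg holding %stack.N>, 16
// the frame index may be folded directly into the add; for MUBUF and scratch
// FLAT users the fold targets the vaddr/saddr operand instead.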
340bool SIFoldOperandsImpl::frameIndexMayFold(const MachineInstr &UseMI, int OpNo,
341 const FoldableDef &OpToFold) const {
342 if (!OpToFold.isFI())
343 return false;
344
345 const unsigned Opc = UseMI.getOpcode();
346 switch (Opc) {
347 case AMDGPU::S_ADD_I32:
348 case AMDGPU::S_ADD_U32:
349 case AMDGPU::V_ADD_U32_e32:
350 case AMDGPU::V_ADD_CO_U32_e32:
351 // TODO: Possibly relax hasOneUse. It matters more for mubuf, since we have
352 // to insert the wave size shift at every point we use the index.
353 // TODO: Fix depending on visit order to fold immediates into the operand
354 return UseMI.getOperand(OpNo == 1 ? 2 : 1).isImm() &&
355 MRI->hasOneNonDBGUse(UseMI.getOperand(OpNo).getReg());
356 case AMDGPU::V_ADD_U32_e64:
357 case AMDGPU::V_ADD_CO_U32_e64:
358 return UseMI.getOperand(OpNo == 2 ? 3 : 2).isImm() &&
359 MRI->hasOneNonDBGUse(UseMI.getOperand(OpNo).getReg());
360 default:
361 break;
362 }
363
364 if (TII->isMUBUF(UseMI))
365 return OpNo == AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::vaddr);
366 if (!TII->isFLATScratch(UseMI))
367 return false;
368
369 int SIdx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::saddr);
370 if (OpNo == SIdx)
371 return true;
372
373 int VIdx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::vaddr);
374 return OpNo == VIdx && SIdx == -1;
375}
376
377/// Fold %vgpr = COPY (S_ADD_I32 x, frameindex)
378///
379/// => %vgpr = V_ADD_U32 x, frameindex
380bool SIFoldOperandsImpl::foldCopyToVGPROfScalarAddOfFrameIndex(
381 Register DstReg, Register SrcReg, MachineInstr &MI) const {
382 if (TRI->isVGPR(*MRI, DstReg) && TRI->isSGPRReg(*MRI, SrcReg) &&
383 MRI->hasOneNonDBGUse(SrcReg)) {
384 MachineInstr *Def = MRI->getVRegDef(SrcReg);
385 if (!Def || Def->getNumOperands() != 4)
386 return false;
387
388 MachineOperand *Src0 = &Def->getOperand(1);
389 MachineOperand *Src1 = &Def->getOperand(2);
390
391 // TODO: This is profitable with more operand types, and for more
392 // opcodes. But ultimately this is working around poor / nonexistent
393 // regbankselect.
394 if (!Src0->isFI() && !Src1->isFI())
395 return false;
396
397 if (Src0->isFI())
398 std::swap(Src0, Src1);
399
400 const bool UseVOP3 = !Src0->isImm() || TII->isInlineConstant(*Src0);
401 unsigned NewOp = convertToVALUOp(Def->getOpcode(), UseVOP3);
402 if (NewOp == AMDGPU::INSTRUCTION_LIST_END ||
403 !Def->getOperand(3).isDead()) // Check if scc is dead
404 return false;
405
406 MachineBasicBlock *MBB = Def->getParent();
407 const DebugLoc &DL = Def->getDebugLoc();
408 if (NewOp != AMDGPU::V_ADD_CO_U32_e32) {
409 MachineInstrBuilder Add =
410 BuildMI(*MBB, *Def, DL, TII->get(NewOp), DstReg);
411
412 if (Add->getDesc().getNumDefs() == 2) {
413 Register CarryOutReg = MRI->createVirtualRegister(TRI->getBoolRC());
414 Add.addDef(CarryOutReg, RegState::Dead);
415 MRI->setRegAllocationHint(CarryOutReg, 0, TRI->getVCC());
416 }
417
418 Add.add(*Src0).add(*Src1).setMIFlags(Def->getFlags());
419 if (AMDGPU::hasNamedOperand(NewOp, AMDGPU::OpName::clamp))
420 Add.addImm(0);
421
422 Def->eraseFromParent();
423 MI.eraseFromParent();
424 return true;
425 }
426
427 assert(NewOp == AMDGPU::V_ADD_CO_U32_e32);
428
429 MachineBasicBlock::LivenessQueryResult Liveness =
430 MBB->computeRegisterLiveness(TRI, AMDGPU::VCC, *Def, 16);
431 if (Liveness == MachineBasicBlock::LQR_Dead) {
432 // TODO: If src1 satisfies operand constraints, use vop3 version.
433 BuildMI(*MBB, *Def, DL, TII->get(NewOp), DstReg)
434 .add(*Src0)
435 .add(*Src1)
436 .setOperandDead(3) // implicit-def $vcc
437 .setMIFlags(Def->getFlags());
438 Def->eraseFromParent();
439 MI.eraseFromParent();
440 return true;
441 }
442 }
443
444 return false;
445}
446
447FunctionPass *llvm::createSIFoldOperandsLegacyPass() {
448 return new SIFoldOperandsLegacy();
449}
450
451bool SIFoldOperandsImpl::canUseImmWithOpSel(const MachineInstr *MI,
452 unsigned UseOpNo,
453 int64_t ImmVal) const {
454 const uint64_t TSFlags = MI->getDesc().TSFlags;
455
456 if (!(TSFlags & SIInstrFlags::IsPacked) || (TSFlags & SIInstrFlags::IsMAI) ||
457 (TSFlags & SIInstrFlags::IsWMMA) || (TSFlags & SIInstrFlags::IsSWMMAC) ||
458 (ST->hasDOTOpSelHazard() && (TSFlags & SIInstrFlags::IsDOT)))
459 return false;
460
461 const MachineOperand &Old = MI->getOperand(UseOpNo);
462 int OpNo = MI->getOperandNo(&Old);
463
464 unsigned Opcode = MI->getOpcode();
465 uint8_t OpType = TII->get(Opcode).operands()[OpNo].OperandType;
466 switch (OpType) {
467 default:
468 return false;
476 // VOP3 packed instructions ignore op_sel source modifiers, so we cannot encode
477 // two different constants.
478 if ((TSFlags & SIInstrFlags::VOP3) && !(TSFlags & SIInstrFlags::VOP3P) &&
479 static_cast<uint16_t>(ImmVal) != static_cast<uint16_t>(ImmVal >> 16))
480 return false;
481 break;
482 }
483
484 return true;
485}
486
487bool SIFoldOperandsImpl::tryFoldImmWithOpSel(MachineInstr *MI, unsigned UseOpNo,
488 int64_t ImmVal) const {
489 MachineOperand &Old = MI->getOperand(UseOpNo);
490 unsigned Opcode = MI->getOpcode();
491 int OpNo = MI->getOperandNo(&Old);
492 uint8_t OpType = TII->get(Opcode).operands()[OpNo].OperandType;
493
494 // If the literal can be inlined as-is, apply it and short-circuit the
495 // tests below. The main motivation for this is to avoid unintuitive
496 // uses of opsel.
497 if (AMDGPU::isInlinableLiteralV216(ImmVal, OpType)) {
498 Old.ChangeToImmediate(ImmVal);
499 return true;
500 }
501
502 // Refer to op_sel/op_sel_hi and check if we can change the immediate and
503 // op_sel in a way that allows an inline constant.
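// For example (illustrative): if the operand's *_modifiers already have
// OP_SEL_0 set, the low half of the operation reads bits [31:16] of the
// literal, so ImmLo below is taken from the high 16 bits before the halves
// are repacked with op_sel cleared.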
504 AMDGPU::OpName ModName = AMDGPU::OpName::NUM_OPERAND_NAMES;
505 unsigned SrcIdx = ~0;
506 if (OpNo == AMDGPU::getNamedOperandIdx(Opcode, AMDGPU::OpName::src0)) {
507 ModName = AMDGPU::OpName::src0_modifiers;
508 SrcIdx = 0;
509 } else if (OpNo == AMDGPU::getNamedOperandIdx(Opcode, AMDGPU::OpName::src1)) {
510 ModName = AMDGPU::OpName::src1_modifiers;
511 SrcIdx = 1;
512 } else if (OpNo == AMDGPU::getNamedOperandIdx(Opcode, AMDGPU::OpName::src2)) {
513 ModName = AMDGPU::OpName::src2_modifiers;
514 SrcIdx = 2;
515 }
516 assert(ModName != AMDGPU::OpName::NUM_OPERAND_NAMES);
517 int ModIdx = AMDGPU::getNamedOperandIdx(Opcode, ModName);
518 MachineOperand &Mod = MI->getOperand(ModIdx);
519 unsigned ModVal = Mod.getImm();
520
521 uint16_t ImmLo =
522 static_cast<uint16_t>(ImmVal >> (ModVal & SISrcMods::OP_SEL_0 ? 16 : 0));
523 uint16_t ImmHi =
524 static_cast<uint16_t>(ImmVal >> (ModVal & SISrcMods::OP_SEL_1 ? 16 : 0));
525 uint32_t Imm = (static_cast<uint32_t>(ImmHi) << 16) | ImmLo;
526 unsigned NewModVal = ModVal & ~(SISrcMods::OP_SEL_0 | SISrcMods::OP_SEL_1);
527
528 // Helper function that attempts to inline the given value with a newly
529 // chosen opsel pattern.
530 auto tryFoldToInline = [&](uint32_t Imm) -> bool {
531 if (AMDGPU::isInlinableLiteralV216(Imm, OpType)) {
532 Mod.setImm(NewModVal | SISrcMods::OP_SEL_1);
533 Old.ChangeToImmediate(Imm);
534 return true;
535 }
536
537 // Try to shuffle the halves around and leverage opsel to get an inline
538 // constant.
539 uint16_t Lo = static_cast<uint16_t>(Imm);
540 uint16_t Hi = static_cast<uint16_t>(Imm >> 16);
541 if (Lo == Hi) {
542 if (AMDGPU::isInlinableLiteralV216(Lo, OpType)) {
543 Mod.setImm(NewModVal);
544 Old.ChangeToImmediate(Lo);
545 return true;
546 }
547
548 if (static_cast<int16_t>(Lo) < 0) {
549 int32_t SExt = static_cast<int16_t>(Lo);
550 if (AMDGPU::isInlinableLiteralV216(SExt, OpType)) {
551 Mod.setImm(NewModVal);
552 Old.ChangeToImmediate(SExt);
553 return true;
554 }
555 }
556
557 // This check is only useful for integer instructions
558 if (OpType == AMDGPU::OPERAND_REG_IMM_V2INT16) {
559 if (AMDGPU::isInlinableLiteralV216(Lo << 16, OpType)) {
560 Mod.setImm(NewModVal | SISrcMods::OP_SEL_0 | SISrcMods::OP_SEL_1);
561 Old.ChangeToImmediate(static_cast<uint32_t>(Lo) << 16);
562 return true;
563 }
564 }
565 } else {
566 uint32_t Swapped = (static_cast<uint32_t>(Lo) << 16) | Hi;
567 if (AMDGPU::isInlinableLiteralV216(Swapped, OpType)) {
568 Mod.setImm(NewModVal | SISrcMods::OP_SEL_0);
569 Old.ChangeToImmediate(Swapped);
570 return true;
571 }
572 }
573
574 return false;
575 };
576
577 if (tryFoldToInline(Imm))
578 return true;
579
580 // Replace integer addition by subtraction and vice versa if it allows
581 // folding the immediate to an inline constant.
582 //
583 // We should only ever get here for SrcIdx == 1 due to canonicalization
584 // earlier in the pipeline, but we double-check here to be safe / fully
585 // general.
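// Worked example (assuming the usual -16..64 integer inline-constant range):
// the src1 literal 0xFFC0FFC0 is a packed (-64, -64), which is not inlinable,
// but its negation 0x00400040 is a splat of 64, which is. So
//   V_PK_ADD_U16 %x, 0xFFC0FFC0  ->  V_PK_SUB_U16 %x, 64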
586 bool IsUAdd = Opcode == AMDGPU::V_PK_ADD_U16;
587 bool IsUSub = Opcode == AMDGPU::V_PK_SUB_U16;
588 if (SrcIdx == 1 && (IsUAdd || IsUSub)) {
589 unsigned ClampIdx =
590 AMDGPU::getNamedOperandIdx(Opcode, AMDGPU::OpName::clamp);
591 bool Clamp = MI->getOperand(ClampIdx).getImm() != 0;
592
593 if (!Clamp) {
594 uint16_t NegLo = -static_cast<uint16_t>(Imm);
595 uint16_t NegHi = -static_cast<uint16_t>(Imm >> 16);
596 uint32_t NegImm = (static_cast<uint32_t>(NegHi) << 16) | NegLo;
597
598 if (tryFoldToInline(NegImm)) {
599 unsigned NegOpcode =
600 IsUAdd ? AMDGPU::V_PK_SUB_U16 : AMDGPU::V_PK_ADD_U16;
601 MI->setDesc(TII->get(NegOpcode));
602 return true;
603 }
604 }
605 }
606
607 return false;
608}
609
610bool SIFoldOperandsImpl::updateOperand(FoldCandidate &Fold) const {
611 MachineInstr *MI = Fold.UseMI;
612 MachineOperand &Old = MI->getOperand(Fold.UseOpNo);
613 assert(Old.isReg());
614
615 std::optional<int64_t> ImmVal;
616 if (Fold.isImm())
617 ImmVal = Fold.Def.getEffectiveImmVal();
618
619 if (ImmVal && canUseImmWithOpSel(Fold.UseMI, Fold.UseOpNo, *ImmVal)) {
620 if (tryFoldImmWithOpSel(Fold.UseMI, Fold.UseOpNo, *ImmVal))
621 return true;
622
623 // We can't represent the candidate as an inline constant. Try as a literal
624 // with the original opsel, checking constant bus limitations.
625 MachineOperand New = MachineOperand::CreateImm(*ImmVal);
626 int OpNo = MI->getOperandNo(&Old);
627 if (!TII->isOperandLegal(*MI, OpNo, &New))
628 return false;
629 Old.ChangeToImmediate(*ImmVal);
630 return true;
631 }
632
633 if ((Fold.isImm() || Fold.isFI() || Fold.isGlobal()) && Fold.needsShrink()) {
634 MachineBasicBlock *MBB = MI->getParent();
635 auto Liveness = MBB->computeRegisterLiveness(TRI, AMDGPU::VCC, MI, 16);
636 if (Liveness != MachineBasicBlock::LQR_Dead) {
637 LLVM_DEBUG(dbgs() << "Not shrinking " << MI << " due to vcc liveness\n");
638 return false;
639 }
640
641 int Op32 = Fold.ShrinkOpcode;
642 MachineOperand &Dst0 = MI->getOperand(0);
643 MachineOperand &Dst1 = MI->getOperand(1);
644 assert(Dst0.isDef() && Dst1.isDef());
645
646 bool HaveNonDbgCarryUse = !MRI->use_nodbg_empty(Dst1.getReg());
647
648 const TargetRegisterClass *Dst0RC = MRI->getRegClass(Dst0.getReg());
649 Register NewReg0 = MRI->createVirtualRegister(Dst0RC);
650
651 MachineInstr *Inst32 = TII->buildShrunkInst(*MI, Op32);
652
653 if (HaveNonDbgCarryUse) {
654 BuildMI(*MBB, MI, MI->getDebugLoc(), TII->get(AMDGPU::COPY),
655 Dst1.getReg())
656 .addReg(AMDGPU::VCC, RegState::Kill);
657 }
658
659 // Keep the old instruction around to avoid breaking iterators, but
660 // replace it with a dummy instruction to remove uses.
661 //
662 // FIXME: We should not invert how this pass looks at operands to avoid
663 // this. Should track set of foldable movs instead of looking for uses
664 // when looking at a use.
665 Dst0.setReg(NewReg0);
666 for (unsigned I = MI->getNumOperands() - 1; I > 0; --I)
667 MI->removeOperand(I);
668 MI->setDesc(TII->get(AMDGPU::IMPLICIT_DEF));
669
670 if (Fold.Commuted)
671 TII->commuteInstruction(*Inst32, false);
672 return true;
673 }
674
675 assert(!Fold.needsShrink() && "not handled");
676
677 if (ImmVal) {
678 if (Old.isTied()) {
679 int NewMFMAOpc = AMDGPU::getMFMAEarlyClobberOp(MI->getOpcode());
680 if (NewMFMAOpc == -1)
681 return false;
682 MI->setDesc(TII->get(NewMFMAOpc));
683 MI->untieRegOperand(0);
684 const MCInstrDesc &MCID = MI->getDesc();
685 for (unsigned I = 0; I < MI->getNumDefs(); ++I)
687 MI->getOperand(I).setIsEarlyClobber(true);
688 }
689
690 // TODO: Should we try to avoid adding this to the candidate list?
691 MachineOperand New = MachineOperand::CreateImm(*ImmVal);
692 int OpNo = MI->getOperandNo(&Old);
693 if (!TII->isOperandLegal(*MI, OpNo, &New))
694 return false;
695
696 Old.ChangeToImmediate(*ImmVal);
697 return true;
698 }
699
700 if (Fold.isGlobal()) {
701 Old.ChangeToGA(Fold.Def.OpToFold->getGlobal(),
702 Fold.Def.OpToFold->getOffset(),
703 Fold.Def.OpToFold->getTargetFlags());
704 return true;
705 }
706
707 if (Fold.isFI()) {
708 Old.ChangeToFrameIndex(Fold.getFI());
709 return true;
710 }
711
712 MachineOperand *New = Fold.Def.OpToFold;
713
714 // Verify the register is compatible with the operand.
715 if (const TargetRegisterClass *OpRC =
716 TII->getRegClass(MI->getDesc(), Fold.UseOpNo, TRI)) {
717 const TargetRegisterClass *NewRC =
718 TRI->getRegClassForReg(*MRI, New->getReg());
719
720 const TargetRegisterClass *ConstrainRC = OpRC;
721 if (New->getSubReg()) {
722 ConstrainRC =
723 TRI->getMatchingSuperRegClass(NewRC, OpRC, New->getSubReg());
724
725 if (!ConstrainRC)
726 return false;
727 }
728
729 if (New->getReg().isVirtual() &&
730 !MRI->constrainRegClass(New->getReg(), ConstrainRC)) {
731 LLVM_DEBUG(dbgs() << "Cannot constrain " << printReg(New->getReg(), TRI)
732 << TRI->getRegClassName(ConstrainRC) << '\n');
733 return false;
734 }
735 }
736
737 // Rework once the VS_16 register class is updated to include proper
738 // 16-bit SGPRs instead of 32-bit ones.
739 if (Old.getSubReg() == AMDGPU::lo16 && TRI->isSGPRReg(*MRI, New->getReg()))
740 Old.setSubReg(AMDGPU::NoSubRegister);
741 if (New->getReg().isPhysical()) {
742 Old.substPhysReg(New->getReg(), *TRI);
743 } else {
744 Old.substVirtReg(New->getReg(), New->getSubReg(), *TRI);
745 Old.setIsUndef(New->isUndef());
746 }
747 return true;
748}
749
750static void appendFoldCandidate(SmallVectorImpl<FoldCandidate> &FoldList,
751 FoldCandidate &&Entry) {
752 // Skip additional folding on the same operand.
753 for (FoldCandidate &Fold : FoldList)
754 if (Fold.UseMI == Entry.UseMI && Fold.UseOpNo == Entry.UseOpNo)
755 return;
756 LLVM_DEBUG(dbgs() << "Append " << (Entry.Commuted ? "commuted" : "normal")
757 << " operand " << Entry.UseOpNo << "\n " << *Entry.UseMI);
758 FoldList.push_back(Entry);
759}
760
761static void appendFoldCandidate(SmallVectorImpl<FoldCandidate> &FoldList,
762 MachineInstr *MI, unsigned OpNo,
763 const FoldableDef &FoldOp,
764 bool Commuted = false, int ShrinkOp = -1) {
765 appendFoldCandidate(FoldList,
766 FoldCandidate(MI, OpNo, FoldOp, Commuted, ShrinkOp));
767}
768
769bool SIFoldOperandsImpl::tryAddToFoldList(
770 SmallVectorImpl<FoldCandidate> &FoldList, MachineInstr *MI, unsigned OpNo,
771 const FoldableDef &OpToFold) const {
772 const unsigned Opc = MI->getOpcode();
773
774 auto tryToFoldAsFMAAKorMK = [&]() {
775 if (!OpToFold.isImm())
776 return false;
777
778 const bool TryAK = OpNo == 3;
779 const unsigned NewOpc = TryAK ? AMDGPU::S_FMAAK_F32 : AMDGPU::S_FMAMK_F32;
780 MI->setDesc(TII->get(NewOpc));
781
782 // We have to fold into the operand that will hold the immediate, not into OpNo.
783 bool FoldAsFMAAKorMK =
784 tryAddToFoldList(FoldList, MI, TryAK ? 3 : 2, OpToFold);
785 if (FoldAsFMAAKorMK) {
786 // Untie Src2 of fmac.
787 MI->untieRegOperand(3);
788 // For fmamk swap operands 1 and 2 if OpToFold was meant for operand 1.
789 if (OpNo == 1) {
790 MachineOperand &Op1 = MI->getOperand(1);
791 MachineOperand &Op2 = MI->getOperand(2);
792 Register OldReg = Op1.getReg();
793 // Operand 2 might be an inlinable constant
794 if (Op2.isImm()) {
795 Op1.ChangeToImmediate(Op2.getImm());
796 Op2.ChangeToRegister(OldReg, false);
797 } else {
798 Op1.setReg(Op2.getReg());
799 Op2.setReg(OldReg);
800 }
801 }
802 return true;
803 }
804 MI->setDesc(TII->get(Opc));
805 return false;
806 };
807
808 bool IsLegal = OpToFold.isOperandLegal(*TII, *MI, OpNo);
809 if (!IsLegal && OpToFold.isImm()) {
810 if (std::optional<int64_t> ImmVal = OpToFold.getEffectiveImmVal())
811 IsLegal = canUseImmWithOpSel(MI, OpNo, *ImmVal);
812 }
813
814 if (!IsLegal) {
815 // Special case for v_mac_{f16, f32}_e64 if we are trying to fold into src2
816 unsigned NewOpc = macToMad(Opc);
817 if (NewOpc != AMDGPU::INSTRUCTION_LIST_END) {
818 // Check if changing this to a v_mad_{f16, f32} instruction will allow us
819 // to fold the operand.
820 MI->setDesc(TII->get(NewOpc));
821 bool AddOpSel = !AMDGPU::hasNamedOperand(Opc, AMDGPU::OpName::op_sel) &&
822 AMDGPU::hasNamedOperand(NewOpc, AMDGPU::OpName::op_sel);
823 if (AddOpSel)
824 MI->addOperand(MachineOperand::CreateImm(0));
825 bool FoldAsMAD = tryAddToFoldList(FoldList, MI, OpNo, OpToFold);
826 if (FoldAsMAD) {
827 MI->untieRegOperand(OpNo);
828 return true;
829 }
830 if (AddOpSel)
831 MI->removeOperand(MI->getNumExplicitOperands() - 1);
832 MI->setDesc(TII->get(Opc));
833 }
834
835 // Special case for s_fmac_f32 if we are trying to fold into Src2.
836 // By transforming into fmaak we can untie Src2 and make folding legal.
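// (S_FMAAK_F32 takes its addend as a trailing literal K instead of through the
// tied destination, so e.g.
//   %d = S_FMAC_F32 %a, %b, %d(tied)  becomes  %d = S_FMAAK_F32 %a, %b, <K>.)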
837 if (Opc == AMDGPU::S_FMAC_F32 && OpNo == 3) {
838 if (tryToFoldAsFMAAKorMK())
839 return true;
840 }
841
842 // Special case for s_setreg_b32
843 if (OpToFold.isImm()) {
844 unsigned ImmOpc = 0;
845 if (Opc == AMDGPU::S_SETREG_B32)
846 ImmOpc = AMDGPU::S_SETREG_IMM32_B32;
847 else if (Opc == AMDGPU::S_SETREG_B32_mode)
848 ImmOpc = AMDGPU::S_SETREG_IMM32_B32_mode;
849 if (ImmOpc) {
850 MI->setDesc(TII->get(ImmOpc));
851 appendFoldCandidate(FoldList, MI, OpNo, OpToFold);
852 return true;
853 }
854 }
855
856 // Operand is not legal, so try to commute the instruction to
857 // see if this makes it possible to fold.
858 unsigned CommuteOpNo = TargetInstrInfo::CommuteAnyOperandIndex;
859 bool CanCommute = TII->findCommutedOpIndices(*MI, OpNo, CommuteOpNo);
860 if (!CanCommute)
861 return false;
862
863 MachineOperand &Op = MI->getOperand(OpNo);
864 MachineOperand &CommutedOp = MI->getOperand(CommuteOpNo);
865
866 // One of operands might be an Imm operand, and OpNo may refer to it after
867 // the call of commuteInstruction() below. Such situations are avoided
868 // here explicitly as OpNo must be a register operand to be a candidate
869 // for memory folding.
870 if (!Op.isReg() || !CommutedOp.isReg())
871 return false;
872
873 // The same situation with an immediate could reproduce if both inputs are
874 // the same register.
875 if (Op.isReg() && CommutedOp.isReg() &&
876 (Op.getReg() == CommutedOp.getReg() &&
877 Op.getSubReg() == CommutedOp.getSubReg()))
878 return false;
879
880 if (!TII->commuteInstruction(*MI, false, OpNo, CommuteOpNo))
881 return false;
882
883 int Op32 = -1;
884 if (!OpToFold.isOperandLegal(*TII, *MI, CommuteOpNo)) {
885 if ((Opc != AMDGPU::V_ADD_CO_U32_e64 && Opc != AMDGPU::V_SUB_CO_U32_e64 &&
886 Opc != AMDGPU::V_SUBREV_CO_U32_e64) || // FIXME
887 (!OpToFold.isImm() && !OpToFold.isFI() && !OpToFold.isGlobal())) {
888 TII->commuteInstruction(*MI, false, OpNo, CommuteOpNo);
889 return false;
890 }
891
892 // Verify the other operand is a VGPR, otherwise we would violate the
893 // constant bus restriction.
894 MachineOperand &OtherOp = MI->getOperand(OpNo);
895 if (!OtherOp.isReg() ||
896 !TII->getRegisterInfo().isVGPR(*MRI, OtherOp.getReg()))
897 return false;
898
899 assert(MI->getOperand(1).isDef());
900
901 // Make sure to get the 32-bit version of the commuted opcode.
902 unsigned MaybeCommutedOpc = MI->getOpcode();
903 Op32 = AMDGPU::getVOPe32(MaybeCommutedOpc);
904 }
905
906 appendFoldCandidate(FoldList, MI, CommuteOpNo, OpToFold, /*Commuted=*/true,
907 Op32);
908 return true;
909 }
910
911 // Special case for s_fmac_f32 if we are trying to fold into Src0 or Src1.
912 // By changing into fmamk we can untie Src2.
913 // If folding for Src0 happens first and it is an operand identical to Src1,
914 // we should avoid transforming into fmamk, which requires commuting, as that
915 // would cause the later fold into Src1 to fail due to the wrong OpNo being used.
916 if (Opc == AMDGPU::S_FMAC_F32 &&
917 (OpNo != 1 || !MI->getOperand(1).isIdenticalTo(MI->getOperand(2)))) {
918 if (tryToFoldAsFMAAKorMK())
919 return true;
920 }
921
922 appendFoldCandidate(FoldList, MI, OpNo, OpToFold);
923 return true;
924}
925
926bool SIFoldOperandsImpl::isUseSafeToFold(const MachineInstr &MI,
927 const MachineOperand &UseMO) const {
928 // Operands of SDWA instructions must be registers.
929 return !TII->isSDWA(MI);
930}
931
932static MachineOperand *lookUpCopyChain(const SIInstrInfo &TII,
933 const MachineRegisterInfo &MRI,
934 Register SrcReg) {
935 MachineOperand *Sub = nullptr;
936 for (MachineInstr *SubDef = MRI.getVRegDef(SrcReg);
937 SubDef && TII.isFoldableCopy(*SubDef);
938 SubDef = MRI.getVRegDef(Sub->getReg())) {
939 unsigned SrcIdx = TII.getFoldableCopySrcIdx(*SubDef);
940 MachineOperand &SrcOp = SubDef->getOperand(SrcIdx);
941
942 if (SrcOp.isImm())
943 return &SrcOp;
944 if (!SrcOp.isReg() || SrcOp.getReg().isPhysical())
945 break;
946 Sub = &SrcOp;
947 // TODO: Support compose
948 if (SrcOp.getSubReg())
949 break;
950 }
951
952 return Sub;
953}
954
955const TargetRegisterClass *SIFoldOperandsImpl::getRegSeqInit(
956 MachineInstr &RegSeq,
957 SmallVectorImpl<std::pair<MachineOperand *, unsigned>> &Defs) const {
958
959 assert(RegSeq.isRegSequence());
960
961 const TargetRegisterClass *RC = nullptr;
962
963 for (unsigned I = 1, E = RegSeq.getNumExplicitOperands(); I != E; I += 2) {
964 MachineOperand &SrcOp = RegSeq.getOperand(I);
965 unsigned SubRegIdx = RegSeq.getOperand(I + 1).getImm();
966
967 // Only accept reg_sequence with uniform reg class inputs for simplicity.
968 const TargetRegisterClass *OpRC = getRegOpRC(*MRI, *TRI, SrcOp);
969 if (!RC)
970 RC = OpRC;
971 else if (!TRI->getCommonSubClass(RC, OpRC))
972 return nullptr;
973
974 if (SrcOp.getSubReg()) {
975 // TODO: Handle subregister compose
976 Defs.emplace_back(&SrcOp, SubRegIdx);
977 continue;
978 }
979
980 MachineOperand *DefSrc = lookUpCopyChain(*TII, *MRI, SrcOp.getReg());
981 if (DefSrc && (DefSrc->isReg() || DefSrc->isImm())) {
982 Defs.emplace_back(DefSrc, SubRegIdx);
983 continue;
984 }
985
986 Defs.emplace_back(&SrcOp, SubRegIdx);
987 }
988
989 return RC;
990}
991
992// Find a def of the UseReg, check if it is a reg_sequence and find initializers
993// for each subreg, tracking it to an immediate if possible. Returns the
994// register class of the inputs on success.
995const TargetRegisterClass *SIFoldOperandsImpl::getRegSeqInit(
996 SmallVectorImpl<std::pair<MachineOperand *, unsigned>> &Defs,
997 Register UseReg) const {
998 MachineInstr *Def = MRI->getVRegDef(UseReg);
999 if (!Def || !Def->isRegSequence())
1000 return nullptr;
1001
1002 return getRegSeqInit(*Def, Defs);
1003}
1004
1005std::pair<int64_t, const TargetRegisterClass *>
1006SIFoldOperandsImpl::isRegSeqSplat(MachineInstr &RegSeq) const {
1008 const TargetRegisterClass *SrcRC = getRegSeqInit(RegSeq, Defs);
1009 if (!SrcRC)
1010 return {};
1011
1012 bool TryToMatchSplat64 = false;
1013
1014 int64_t Imm;
1015 for (unsigned I = 0, E = Defs.size(); I != E; ++I) {
1016 const MachineOperand *Op = Defs[I].first;
1017 if (!Op->isImm())
1018 return {};
1019
1020 int64_t SubImm = Op->getImm();
1021 if (!I) {
1022 Imm = SubImm;
1023 continue;
1024 }
1025
1026 if (Imm != SubImm) {
1027 if (I == 1 && (E & 1) == 0) {
1028 // If we have an even number of inputs, there's a chance this is a
1029 // 64-bit element splat broken into 32-bit pieces.
1030 TryToMatchSplat64 = true;
1031 break;
1032 }
1033
1034 return {}; // Can only fold splat constants
1035 }
1036 }
1037
1038 if (!TryToMatchSplat64)
1039 return {Defs[0].first->getImm(), SrcRC};
1040
1041 // Fallback to recognizing 64-bit splats broken into 32-bit pieces
1042 // (i.e. recognize that every other element is 0 for 64-bit immediates)
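// Illustrative: sub0=0x00000041, sub1=0x00000000, sub2=0x00000041,
// sub3=0x00000000 merges pairwise into 0x0000000000000041 twice, i.e. a
// 64-bit splat of 0x41.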
1043 int64_t SplatVal64;
1044 for (unsigned I = 0, E = Defs.size(); I != E; I += 2) {
1045 const MachineOperand *Op0 = Defs[I].first;
1046 const MachineOperand *Op1 = Defs[I + 1].first;
1047
1048 if (!Op0->isImm() || !Op1->isImm())
1049 return {};
1050
1051 unsigned SubReg0 = Defs[I].second;
1052 unsigned SubReg1 = Defs[I + 1].second;
1053
1054 // Assume we're going to generally encounter reg_sequences with sorted
1055 // subreg indexes, so reject any that aren't consecutive.
1056 if (TRI->getChannelFromSubReg(SubReg0) + 1 !=
1057 TRI->getChannelFromSubReg(SubReg1))
1058 return {};
1059
1060 int64_t MergedVal = Make_64(Op1->getImm(), Op0->getImm());
1061 if (I == 0)
1062 SplatVal64 = MergedVal;
1063 else if (SplatVal64 != MergedVal)
1064 return {};
1065 }
1066
1067 const TargetRegisterClass *RC64 = TRI->getSubRegisterClass(
1068 MRI->getRegClass(RegSeq.getOperand(0).getReg()), AMDGPU::sub0_sub1);
1069
1070 return {SplatVal64, RC64};
1071}
1072
1073bool SIFoldOperandsImpl::tryFoldRegSeqSplat(
1074 MachineInstr *UseMI, unsigned UseOpIdx, int64_t SplatVal,
1075 const TargetRegisterClass *SplatRC) const {
1076 const MCInstrDesc &Desc = UseMI->getDesc();
1077 if (UseOpIdx >= Desc.getNumOperands())
1078 return false;
1079
1080 // Filter out unhandled pseudos.
1081 if (!AMDGPU::isSISrcOperand(Desc, UseOpIdx))
1082 return false;
1083
1084 int16_t RCID = TII->getOpRegClassID(Desc.operands()[UseOpIdx]);
1085 if (RCID == -1)
1086 return false;
1087
1088 const TargetRegisterClass *OpRC = TRI->getRegClass(RCID);
1089
1090 // Special case 0/-1, since when interpreted as a 64-bit element both halves
1091 // have the same bits. These are the only cases where a splat has the same
1092 // interpretation for 32-bit and 64-bit splats.
1093 if (SplatVal != 0 && SplatVal != -1) {
1094 // We need to figure out the scalar type read by the operand. e.g. the MFMA
1095 // operand will be AReg_128, and we want to check if it's compatible with an
1096 // AReg_32 constant.
1097 uint8_t OpTy = Desc.operands()[UseOpIdx].OperandType;
1098 switch (OpTy) {
1103 OpRC = TRI->getSubRegisterClass(OpRC, AMDGPU::sub0);
1104 break;
1108 OpRC = TRI->getSubRegisterClass(OpRC, AMDGPU::sub0_sub1);
1109 break;
1110 default:
1111 return false;
1112 }
1113
1114 if (!TRI->getCommonSubClass(OpRC, SplatRC))
1115 return false;
1116 }
1117
1118 MachineOperand TmpOp = MachineOperand::CreateImm(SplatVal);
1119 if (!TII->isOperandLegal(*UseMI, UseOpIdx, &TmpOp))
1120 return false;
1121
1122 return true;
1123}
1124
1125bool SIFoldOperandsImpl::tryToFoldACImm(
1126 const FoldableDef &OpToFold, MachineInstr *UseMI, unsigned UseOpIdx,
1127 SmallVectorImpl<FoldCandidate> &FoldList) const {
1128 const MCInstrDesc &Desc = UseMI->getDesc();
1129 if (UseOpIdx >= Desc.getNumOperands())
1130 return false;
1131
1132 // Filter out unhandled pseudos.
1133 if (!AMDGPU::isSISrcOperand(Desc, UseOpIdx))
1134 return false;
1135
1136 if (OpToFold.isImm() && OpToFold.isOperandLegal(*TII, *UseMI, UseOpIdx)) {
1137 appendFoldCandidate(FoldList, UseMI, UseOpIdx, OpToFold);
1138 return true;
1139 }
1140
1141 return false;
1142}
1143
1144void SIFoldOperandsImpl::foldOperand(
1145 FoldableDef OpToFold, MachineInstr *UseMI, int UseOpIdx,
1146 SmallVectorImpl<FoldCandidate> &FoldList,
1147 SmallVectorImpl<MachineInstr *> &CopiesToReplace) const {
1148 const MachineOperand *UseOp = &UseMI->getOperand(UseOpIdx);
1149
1150 if (!isUseSafeToFold(*UseMI, *UseOp))
1151 return;
1152
1153 // FIXME: Fold operands with subregs.
1154 if (UseOp->isReg() && OpToFold.isReg()) {
1155 if (UseOp->isImplicit())
1156 return;
1157 // Allow folding from SGPRs to 16-bit VGPRs.
1158 if (UseOp->getSubReg() != AMDGPU::NoSubRegister &&
1159 (UseOp->getSubReg() != AMDGPU::lo16 ||
1160 !TRI->isSGPRReg(*MRI, OpToFold.getReg())))
1161 return;
1162 }
1163
1164 // Special case for REG_SEQUENCE: We can't fold literals into
1165 // REG_SEQUENCE instructions, so we have to fold them into the
1166 // uses of REG_SEQUENCE.
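// Roughly:
//   %vec:vreg_64 = REG_SEQUENCE %imm_def, %subreg.sub0, %imm_def, %subreg.sub1
//   %use = FOO %vec
// offers the literal to FOO's operand (with the matching subregister) instead
// of to the REG_SEQUENCE itself.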
1167 if (UseMI->isRegSequence()) {
1168 Register RegSeqDstReg = UseMI->getOperand(0).getReg();
1169 unsigned RegSeqDstSubReg = UseMI->getOperand(UseOpIdx + 1).getImm();
1170
1171 int64_t SplatVal;
1172 const TargetRegisterClass *SplatRC;
1173 std::tie(SplatVal, SplatRC) = isRegSeqSplat(*UseMI);
1174
1175 // Grab the use operands first
1177 llvm::make_pointer_range(MRI->use_nodbg_operands(RegSeqDstReg)));
1178 for (unsigned I = 0; I != UsesToProcess.size(); ++I) {
1179 MachineOperand *RSUse = UsesToProcess[I];
1180 MachineInstr *RSUseMI = RSUse->getParent();
1181 unsigned OpNo = RSUseMI->getOperandNo(RSUse);
1182
1183 if (SplatRC) {
1184 if (RSUseMI->isCopy()) {
1185 Register DstReg = RSUseMI->getOperand(0).getReg();
1186 append_range(UsesToProcess,
1187 make_pointer_range(MRI->use_nodbg_operands(DstReg)));
1188 continue;
1189 }
1190 if (tryFoldRegSeqSplat(RSUseMI, OpNo, SplatVal, SplatRC)) {
1191 FoldableDef SplatDef(SplatVal, SplatRC);
1192 appendFoldCandidate(FoldList, RSUseMI, OpNo, SplatDef);
1193 continue;
1194 }
1195 }
1196
1197 // TODO: Handle general compose
1198 if (RSUse->getSubReg() != RegSeqDstSubReg)
1199 continue;
1200
1201 // FIXME: We should avoid recursing here. There should be a cleaner split
1202 // between the in-place mutations and adding to the fold list.
1203 foldOperand(OpToFold, RSUseMI, RSUseMI->getOperandNo(RSUse), FoldList,
1204 CopiesToReplace);
1205 }
1206
1207 return;
1208 }
1209
1210 if (tryToFoldACImm(OpToFold, UseMI, UseOpIdx, FoldList))
1211 return;
1212
1213 if (frameIndexMayFold(*UseMI, UseOpIdx, OpToFold)) {
1214 // Verify that this is a stack access.
1215 // FIXME: Should probably use stack pseudos before frame lowering.
1216
1217 if (TII->isMUBUF(*UseMI)) {
1218 if (TII->getNamedOperand(*UseMI, AMDGPU::OpName::srsrc)->getReg() !=
1219 MFI->getScratchRSrcReg())
1220 return;
1221
1222 // Ensure this is either relative to the current frame or the current
1223 // wave.
1224 MachineOperand &SOff =
1225 *TII->getNamedOperand(*UseMI, AMDGPU::OpName::soffset);
1226 if (!SOff.isImm() || SOff.getImm() != 0)
1227 return;
1228 }
1229
1230 const unsigned Opc = UseMI->getOpcode();
1231 if (TII->isFLATScratch(*UseMI) &&
1232 AMDGPU::hasNamedOperand(Opc, AMDGPU::OpName::vaddr) &&
1233 !AMDGPU::hasNamedOperand(Opc, AMDGPU::OpName::saddr)) {
1234 unsigned NewOpc = AMDGPU::getFlatScratchInstSSfromSV(Opc);
1235 unsigned CPol =
1236 TII->getNamedOperand(*UseMI, AMDGPU::OpName::cpol)->getImm();
1237 if ((CPol & AMDGPU::CPol::SCAL) &&
1239 return;
1240
1241 UseMI->setDesc(TII->get(NewOpc));
1242 }
1243
1244 // A frame index will resolve to a positive constant, so it should always be
1245 // safe to fold the addressing mode, even pre-GFX9.
1246 UseMI->getOperand(UseOpIdx).ChangeToFrameIndex(OpToFold.getFI());
1247
1248 return;
1249 }
1250
1251 bool FoldingImmLike =
1252 OpToFold.isImm() || OpToFold.isFI() || OpToFold.isGlobal();
1253
1254 if (FoldingImmLike && UseMI->isCopy()) {
1255 Register DestReg = UseMI->getOperand(0).getReg();
1256 Register SrcReg = UseMI->getOperand(1).getReg();
1257 unsigned UseSubReg = UseMI->getOperand(1).getSubReg();
1258 assert(SrcReg.isVirtual());
1259
1260 const TargetRegisterClass *SrcRC = MRI->getRegClass(SrcReg);
1261
1262 // Don't fold into a copy to a physical register with the same class. Doing
1263 // so would interfere with the register coalescer's logic which would avoid
1264 // redundant initializations.
1265 if (DestReg.isPhysical() && SrcRC->contains(DestReg))
1266 return;
1267
1268 const TargetRegisterClass *DestRC = TRI->getRegClassForReg(*MRI, DestReg);
1269 // In order to fold immediates into copies, we need to change the copy to a
1270 // MOV. Find a compatible mov instruction with the value.
1271 for (unsigned MovOp :
1272 {AMDGPU::S_MOV_B32, AMDGPU::V_MOV_B32_e32, AMDGPU::S_MOV_B64,
1273 AMDGPU::V_MOV_B64_PSEUDO, AMDGPU::V_MOV_B16_t16_e64,
1274 AMDGPU::V_ACCVGPR_WRITE_B32_e64, AMDGPU::AV_MOV_B32_IMM_PSEUDO,
1275 AMDGPU::AV_MOV_B64_IMM_PSEUDO}) {
1276 const MCInstrDesc &MovDesc = TII->get(MovOp);
1277 const TargetRegisterClass *MovDstRC =
1278 TRI->getRegClass(TII->getOpRegClassID(MovDesc.operands()[0]));
1279
1280 // Fold if the destination register class of the MOV instruction (ResRC)
1281 // is a superclass of (or equal to) the destination register class of the
1282 // COPY (DestRC). If this condition fails, folding would be illegal.
1283 if (!DestRC->hasSuperClassEq(MovDstRC))
1284 continue;
1285
1286 const int SrcIdx = MovOp == AMDGPU::V_MOV_B16_t16_e64 ? 2 : 1;
1287 const TargetRegisterClass *MovSrcRC =
1288 TRI->getRegClass(TII->getOpRegClassID(MovDesc.operands()[SrcIdx]));
1289
1290 if (MovSrcRC) {
1291 if (UseSubReg)
1292 MovSrcRC = TRI->getMatchingSuperRegClass(SrcRC, MovSrcRC, UseSubReg);
1293
1294 // FIXME: We should be able to directly check immediate operand legality
1295 // for all cases, but gfx908 hacks break.
1296 if (MovOp == AMDGPU::AV_MOV_B32_IMM_PSEUDO &&
1297 (!OpToFold.isImm() ||
1298 !TII->isImmOperandLegal(MovDesc, SrcIdx,
1299 *OpToFold.getEffectiveImmVal())))
1300 break;
1301
1302 if (!MRI->constrainRegClass(SrcReg, MovSrcRC))
1303 break;
1304
1305 // FIXME: This is mutating the instruction only and deferring the actual
1306 // fold of the immediate
1307 } else {
1308 // For the _IMM_PSEUDO cases, there can be value restrictions on the
1309 // immediate to verify. Technically we should always verify this, but it
1310 // only matters for these concrete cases.
1311 // TODO: Handle non-imm case if it's useful.
1312 if (!OpToFold.isImm() ||
1313 !TII->isImmOperandLegal(MovDesc, 1, *OpToFold.getEffectiveImmVal()))
1314 break;
1315 }
1316
1317 MachineInstr::mop_iterator ImpOpI = UseMI->implicit_operands().begin();
1318 MachineInstr::mop_iterator ImpOpE = UseMI->implicit_operands().end();
1319 while (ImpOpI != ImpOpE) {
1320 MachineInstr::mop_iterator Tmp = ImpOpI;
1321 ImpOpI++;
1322 UseMI->removeOperand(UseMI->getOperandNo(Tmp));
1323 }
1324 UseMI->setDesc(MovDesc);
1325
1326 if (MovOp == AMDGPU::V_MOV_B16_t16_e64) {
1327 const auto &SrcOp = UseMI->getOperand(UseOpIdx);
1328 MachineOperand NewSrcOp(SrcOp);
1329 MachineFunction *MF = UseMI->getParent()->getParent();
1330 UseMI->removeOperand(1);
1331 UseMI->addOperand(*MF, MachineOperand::CreateImm(0)); // src0_modifiers
1332 UseMI->addOperand(NewSrcOp); // src0
1333 UseMI->addOperand(*MF, MachineOperand::CreateImm(0)); // op_sel
1334 UseOpIdx = SrcIdx;
1335 UseOp = &UseMI->getOperand(UseOpIdx);
1336 }
1337 CopiesToReplace.push_back(UseMI);
1338 break;
1339 }
1340
1341 // We failed to replace the copy, so give up.
1342 if (UseMI->getOpcode() == AMDGPU::COPY)
1343 return;
1344
1345 } else {
1346 if (UseMI->isCopy() && OpToFold.isReg() &&
1347 UseMI->getOperand(0).getReg().isVirtual() &&
1348 !UseMI->getOperand(1).getSubReg() &&
1349 OpToFold.DefMI->implicit_operands().empty()) {
1350 LLVM_DEBUG(dbgs() << "Folding " << OpToFold.OpToFold << "\n into "
1351 << *UseMI);
1352 unsigned Size = TII->getOpSize(*UseMI, 1);
1353 Register UseReg = OpToFold.getReg();
1355 unsigned SubRegIdx = OpToFold.getSubReg();
1356 // Hack to allow 32-bit SGPRs to be folded into True16 instructions
1357 // Remove this if 16-bit SGPRs (i.e. SGPR_LO16) are added to the
1358 // VS_16RegClass
1359 //
1360 // Excerpt from AMDGPUGenRegisterInfo.inc
1361 // NoSubRegister, //0
1362 // hi16, // 1
1363 // lo16, // 2
1364 // sub0, // 3
1365 // ...
1366 // sub1, // 11
1367 // sub1_hi16, // 12
1368 // sub1_lo16, // 13
1369 static_assert(AMDGPU::sub1_hi16 == 12, "Subregister layout has changed");
1370 if (Size == 2 && TRI->isVGPR(*MRI, UseMI->getOperand(0).getReg()) &&
1371 TRI->isSGPRReg(*MRI, UseReg)) {
1372 // Produce the 32 bit subregister index to which the 16-bit subregister
1373 // is aligned.
1374 if (SubRegIdx > AMDGPU::sub1) {
1375 LaneBitmask M = TRI->getSubRegIndexLaneMask(SubRegIdx);
1376 M |= M.getLane(M.getHighestLane() - 1);
1377 SmallVector<unsigned, 4> Indexes;
1378 TRI->getCoveringSubRegIndexes(TRI->getRegClassForReg(*MRI, UseReg), M,
1379 Indexes);
1380 assert(Indexes.size() == 1 && "Expected one 32-bit subreg to cover");
1381 SubRegIdx = Indexes[0];
1382 // 32-bit registers do not have a sub0 index
1383 } else if (TII->getOpSize(*UseMI, 1) == 4)
1384 SubRegIdx = 0;
1385 else
1386 SubRegIdx = AMDGPU::sub0;
1387 }
1388 UseMI->getOperand(1).setSubReg(SubRegIdx);
1389 UseMI->getOperand(1).setIsKill(false);
1390 CopiesToReplace.push_back(UseMI);
1391 OpToFold.OpToFold->setIsKill(false);
1392
1393 // Remove kill flags as kills may now be out of order with uses.
1394 MRI->clearKillFlags(UseReg);
1395 if (foldCopyToAGPRRegSequence(UseMI))
1396 return;
1397 }
1398
1399 unsigned UseOpc = UseMI->getOpcode();
1400 if (UseOpc == AMDGPU::V_READFIRSTLANE_B32 ||
1401 (UseOpc == AMDGPU::V_READLANE_B32 &&
1402 (int)UseOpIdx ==
1403 AMDGPU::getNamedOperandIdx(UseOpc, AMDGPU::OpName::src0))) {
1404 // %vgpr = V_MOV_B32 imm
1405 // %sgpr = V_READFIRSTLANE_B32 %vgpr
1406 // =>
1407 // %sgpr = S_MOV_B32 imm
1408 if (FoldingImmLike) {
1409 if (execMayBeModifiedBeforeUse(*MRI,
1410 UseMI->getOperand(UseOpIdx).getReg(),
1411 *OpToFold.DefMI, *UseMI))
1412 return;
1413
1414 UseMI->setDesc(TII->get(AMDGPU::S_MOV_B32));
1415
1416 if (OpToFold.isImm()) {
1417 UseMI->getOperand(1).ChangeToImmediate(
1418 *OpToFold.getEffectiveImmVal());
1419 } else if (OpToFold.isFI())
1420 UseMI->getOperand(1).ChangeToFrameIndex(OpToFold.getFI());
1421 else {
1422 assert(OpToFold.isGlobal());
1423 UseMI->getOperand(1).ChangeToGA(OpToFold.OpToFold->getGlobal(),
1424 OpToFold.OpToFold->getOffset(),
1425 OpToFold.OpToFold->getTargetFlags());
1426 }
1427 UseMI->removeOperand(2); // Remove exec read (or src1 for readlane)
1428 return;
1429 }
1430
1431 if (OpToFold.isReg() && TRI->isSGPRReg(*MRI, OpToFold.getReg())) {
1432 if (execMayBeModifiedBeforeUse(*MRI,
1433 UseMI->getOperand(UseOpIdx).getReg(),
1434 *OpToFold.DefMI, *UseMI))
1435 return;
1436
1437 // %vgpr = COPY %sgpr0
1438 // %sgpr1 = V_READFIRSTLANE_B32 %vgpr
1439 // =>
1440 // %sgpr1 = COPY %sgpr0
1441 UseMI->setDesc(TII->get(AMDGPU::COPY));
1442 UseMI->getOperand(1).setReg(OpToFold.getReg());
1443 UseMI->getOperand(1).setSubReg(OpToFold.getSubReg());
1444 UseMI->getOperand(1).setIsKill(false);
1445 UseMI->removeOperand(2); // Remove exec read (or src1 for readlane)
1446 return;
1447 }
1448 }
1449
1450 const MCInstrDesc &UseDesc = UseMI->getDesc();
1451
1452 // Don't fold into target independent nodes. Target independent opcodes
1453 // don't have defined register classes.
1454 if (UseDesc.isVariadic() || UseOp->isImplicit() ||
1455 UseDesc.operands()[UseOpIdx].RegClass == -1)
1456 return;
1457 }
1458
1459 // FIXME: We could try to change the instruction from 64-bit to 32-bit
1460 // to enable more folding opportunities. The shrink operands pass
1461 // already does this.
1462
1463 tryAddToFoldList(FoldList, UseMI, UseOpIdx, OpToFold);
1464}
1465
1466static bool evalBinaryInstruction(unsigned Opcode, int32_t &Result,
1467 uint32_t LHS, uint32_t RHS) {
1468 switch (Opcode) {
1469 case AMDGPU::V_AND_B32_e64:
1470 case AMDGPU::V_AND_B32_e32:
1471 case AMDGPU::S_AND_B32:
1472 Result = LHS & RHS;
1473 return true;
1474 case AMDGPU::V_OR_B32_e64:
1475 case AMDGPU::V_OR_B32_e32:
1476 case AMDGPU::S_OR_B32:
1477 Result = LHS | RHS;
1478 return true;
1479 case AMDGPU::V_XOR_B32_e64:
1480 case AMDGPU::V_XOR_B32_e32:
1481 case AMDGPU::S_XOR_B32:
1482 Result = LHS ^ RHS;
1483 return true;
1484 case AMDGPU::S_XNOR_B32:
1485 Result = ~(LHS ^ RHS);
1486 return true;
1487 case AMDGPU::S_NAND_B32:
1488 Result = ~(LHS & RHS);
1489 return true;
1490 case AMDGPU::S_NOR_B32:
1491 Result = ~(LHS | RHS);
1492 return true;
1493 case AMDGPU::S_ANDN2_B32:
1494 Result = LHS & ~RHS;
1495 return true;
1496 case AMDGPU::S_ORN2_B32:
1497 Result = LHS | ~RHS;
1498 return true;
1499 case AMDGPU::V_LSHL_B32_e64:
1500 case AMDGPU::V_LSHL_B32_e32:
1501 case AMDGPU::S_LSHL_B32:
1502 // The instruction ignores the high bits for out of bounds shifts.
1503 Result = LHS << (RHS & 31);
1504 return true;
1505 case AMDGPU::V_LSHLREV_B32_e64:
1506 case AMDGPU::V_LSHLREV_B32_e32:
1507 Result = RHS << (LHS & 31);
1508 return true;
1509 case AMDGPU::V_LSHR_B32_e64:
1510 case AMDGPU::V_LSHR_B32_e32:
1511 case AMDGPU::S_LSHR_B32:
1512 Result = LHS >> (RHS & 31);
1513 return true;
1514 case AMDGPU::V_LSHRREV_B32_e64:
1515 case AMDGPU::V_LSHRREV_B32_e32:
1516 Result = RHS >> (LHS & 31);
1517 return true;
1518 case AMDGPU::V_ASHR_I32_e64:
1519 case AMDGPU::V_ASHR_I32_e32:
1520 case AMDGPU::S_ASHR_I32:
1521 Result = static_cast<int32_t>(LHS) >> (RHS & 31);
1522 return true;
1523 case AMDGPU::V_ASHRREV_I32_e64:
1524 case AMDGPU::V_ASHRREV_I32_e32:
1525 Result = static_cast<int32_t>(RHS) >> (LHS & 31);
1526 return true;
1527 default:
1528 return false;
1529 }
1530}
1531
1532static unsigned getMovOpc(bool IsScalar) {
1533 return IsScalar ? AMDGPU::S_MOV_B32 : AMDGPU::V_MOV_B32_e32;
1534}
1535
1536static void mutateCopyOp(MachineInstr &MI, const MCInstrDesc &NewDesc) {
1537 MI.setDesc(NewDesc);
1538
1539 // Remove any leftover implicit operands from mutating the instruction. e.g.
1540 // if we replace an s_and_b32 with a copy, we don't need the implicit scc def
1541 // anymore.
1542 const MCInstrDesc &Desc = MI.getDesc();
1543 unsigned NumOps = Desc.getNumOperands() + Desc.implicit_uses().size() +
1544 Desc.implicit_defs().size();
1545
1546 for (unsigned I = MI.getNumOperands() - 1; I >= NumOps; --I)
1547 MI.removeOperand(I);
1548}
1549
1550std::optional<int64_t>
1551SIFoldOperandsImpl::getImmOrMaterializedImm(MachineOperand &Op) const {
1552 if (Op.isImm())
1553 return Op.getImm();
1554
1555 if (!Op.isReg() || !Op.getReg().isVirtual())
1556 return std::nullopt;
1557
1558 const MachineInstr *Def = MRI->getVRegDef(Op.getReg());
1559 if (Def && Def->isMoveImmediate()) {
1560 const MachineOperand &ImmSrc = Def->getOperand(1);
1561 if (ImmSrc.isImm())
1562 return TII->extractSubregFromImm(ImmSrc.getImm(), Op.getSubReg());
1563 }
1564
1565 return std::nullopt;
1566}
1567
1568// Try to simplify operations with a constant that may appear after instruction
1569// selection.
1570// TODO: See if a frame index with a fixed offset can fold.
1571bool SIFoldOperandsImpl::tryConstantFoldOp(MachineInstr *MI) const {
1572 if (!MI->allImplicitDefsAreDead())
1573 return false;
1574
1575 unsigned Opc = MI->getOpcode();
1576
1577 int Src0Idx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::src0);
1578 if (Src0Idx == -1)
1579 return false;
1580
1581 MachineOperand *Src0 = &MI->getOperand(Src0Idx);
1582 std::optional<int64_t> Src0Imm = getImmOrMaterializedImm(*Src0);
1583
1584 if ((Opc == AMDGPU::V_NOT_B32_e64 || Opc == AMDGPU::V_NOT_B32_e32 ||
1585 Opc == AMDGPU::S_NOT_B32) &&
1586 Src0Imm) {
1587 MI->getOperand(1).ChangeToImmediate(~*Src0Imm);
1588 mutateCopyOp(*MI, TII->get(getMovOpc(Opc == AMDGPU::S_NOT_B32)));
1589 return true;
1590 }
1591
1592 int Src1Idx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::src1);
1593 if (Src1Idx == -1)
1594 return false;
1595
1596 MachineOperand *Src1 = &MI->getOperand(Src1Idx);
1597 std::optional<int64_t> Src1Imm = getImmOrMaterializedImm(*Src1);
1598
1599 if (!Src0Imm && !Src1Imm)
1600 return false;
1601
1602 // and k0, k1 -> v_mov_b32 (k0 & k1)
1603 // or k0, k1 -> v_mov_b32 (k0 | k1)
1604 // xor k0, k1 -> v_mov_b32 (k0 ^ k1)
1605 if (Src0Imm && Src1Imm) {
1606 int32_t NewImm;
1607 if (!evalBinaryInstruction(Opc, NewImm, *Src0Imm, *Src1Imm))
1608 return false;
1609
1610 bool IsSGPR = TRI->isSGPRReg(*MRI, MI->getOperand(0).getReg());
1611
1612 // Be careful to change the right operand, src0 may belong to a different
1613 // instruction.
1614 MI->getOperand(Src0Idx).ChangeToImmediate(NewImm);
1615 MI->removeOperand(Src1Idx);
1616 mutateCopyOp(*MI, TII->get(getMovOpc(IsSGPR)));
1617 return true;
1618 }
1619
1620 if (!MI->isCommutable())
1621 return false;
1622
1623 if (Src0Imm && !Src1Imm) {
1624 std::swap(Src0, Src1);
1625 std::swap(Src0Idx, Src1Idx);
1626 std::swap(Src0Imm, Src1Imm);
1627 }
1628
1629 int32_t Src1Val = static_cast<int32_t>(*Src1Imm);
1630 if (Opc == AMDGPU::V_OR_B32_e64 ||
1631 Opc == AMDGPU::V_OR_B32_e32 ||
1632 Opc == AMDGPU::S_OR_B32) {
1633 if (Src1Val == 0) {
1634 // y = or x, 0 => y = copy x
1635 MI->removeOperand(Src1Idx);
1636 mutateCopyOp(*MI, TII->get(AMDGPU::COPY));
1637 } else if (Src1Val == -1) {
1638 // y = or x, -1 => y = v_mov_b32 -1
1639 MI->removeOperand(Src1Idx);
1640 mutateCopyOp(*MI, TII->get(getMovOpc(Opc == AMDGPU::S_OR_B32)));
1641 } else
1642 return false;
1643
1644 return true;
1645 }
1646
1647 if (Opc == AMDGPU::V_AND_B32_e64 || Opc == AMDGPU::V_AND_B32_e32 ||
1648 Opc == AMDGPU::S_AND_B32) {
1649 if (Src1Val == 0) {
1650 // y = and x, 0 => y = v_mov_b32 0
1651 MI->removeOperand(Src0Idx);
1652 mutateCopyOp(*MI, TII->get(getMovOpc(Opc == AMDGPU::S_AND_B32)));
1653 } else if (Src1Val == -1) {
1654 // y = and x, -1 => y = copy x
1655 MI->removeOperand(Src1Idx);
1656 mutateCopyOp(*MI, TII->get(AMDGPU::COPY));
1657 } else
1658 return false;
1659
1660 return true;
1661 }
1662
1663 if (Opc == AMDGPU::V_XOR_B32_e64 || Opc == AMDGPU::V_XOR_B32_e32 ||
1664 Opc == AMDGPU::S_XOR_B32) {
1665 if (Src1Val == 0) {
1666 // y = xor x, 0 => y = copy x
1667 MI->removeOperand(Src1Idx);
1668 mutateCopyOp(*MI, TII->get(AMDGPU::COPY));
1669 return true;
1670 }
1671 }
1672
1673 return false;
1674}
1675
1676// Try to fold an instruction into a simpler one
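// e.g. a V_CNDMASK_B32 whose two value sources are identical (or materialize
// the same immediate) selects the same value either way, so it degenerates
// into a COPY or V_MOV_B32 of that source.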
1677bool SIFoldOperandsImpl::tryFoldCndMask(MachineInstr &MI) const {
1678 unsigned Opc = MI.getOpcode();
1679 if (Opc != AMDGPU::V_CNDMASK_B32_e32 && Opc != AMDGPU::V_CNDMASK_B32_e64 &&
1680 Opc != AMDGPU::V_CNDMASK_B64_PSEUDO)
1681 return false;
1682
1683 MachineOperand *Src0 = TII->getNamedOperand(MI, AMDGPU::OpName::src0);
1684 MachineOperand *Src1 = TII->getNamedOperand(MI, AMDGPU::OpName::src1);
1685 if (!Src1->isIdenticalTo(*Src0)) {
1686 std::optional<int64_t> Src1Imm = getImmOrMaterializedImm(*Src1);
1687 if (!Src1Imm)
1688 return false;
1689
1690 std::optional<int64_t> Src0Imm = getImmOrMaterializedImm(*Src0);
1691 if (!Src0Imm || *Src0Imm != *Src1Imm)
1692 return false;
1693 }
1694
1695 int Src1ModIdx =
1696 AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::src1_modifiers);
1697 int Src0ModIdx =
1698 AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::src0_modifiers);
1699 if ((Src1ModIdx != -1 && MI.getOperand(Src1ModIdx).getImm() != 0) ||
1700 (Src0ModIdx != -1 && MI.getOperand(Src0ModIdx).getImm() != 0))
1701 return false;
1702
1703 LLVM_DEBUG(dbgs() << "Folded " << MI << " into ");
1704 auto &NewDesc =
1705 TII->get(Src0->isReg() ? (unsigned)AMDGPU::COPY : getMovOpc(false));
1706 int Src2Idx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::src2);
1707 if (Src2Idx != -1)
1708 MI.removeOperand(Src2Idx);
1709 MI.removeOperand(AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::src1));
1710 if (Src1ModIdx != -1)
1711 MI.removeOperand(Src1ModIdx);
1712 if (Src0ModIdx != -1)
1713 MI.removeOperand(Src0ModIdx);
1714 mutateCopyOp(MI, NewDesc);
1715 LLVM_DEBUG(dbgs() << MI);
1716 return true;
1717}
1718
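// Fold away an AND with 0xffff when the producer of the other operand is
// already known to zero the high 16 bits of its result (sketch; the exact
// opcodes accepted depend on zeroesHigh16BitsOfDest):
//   %1 = <op that clears bits 31:16>
//   %2 = V_AND_B32 0xffff, %1    ; redundant, so uses of %2 become %1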
1719bool SIFoldOperandsImpl::tryFoldZeroHighBits(MachineInstr &MI) const {
1720 if (MI.getOpcode() != AMDGPU::V_AND_B32_e64 &&
1721 MI.getOpcode() != AMDGPU::V_AND_B32_e32)
1722 return false;
1723
1724 std::optional<int64_t> Src0Imm = getImmOrMaterializedImm(MI.getOperand(1));
1725 if (!Src0Imm || *Src0Imm != 0xffff || !MI.getOperand(2).isReg())
1726 return false;
1727
1728 Register Src1 = MI.getOperand(2).getReg();
1729 MachineInstr *SrcDef = MRI->getVRegDef(Src1);
1730 if (!ST->zeroesHigh16BitsOfDest(SrcDef->getOpcode()))
1731 return false;
1732
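  // NOTE (editorial, not from the upstream file): the defining instruction of
  // Src1 is known to write zeros into bits 16-31 of its result, so the AND
  // with 0xffff is a no-op; all uses of Dst can be redirected to Src1 and the
  // AND erased, which is what the code below does.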
1733 Register Dst = MI.getOperand(0).getReg();
1734 MRI->replaceRegWith(Dst, Src1);
1735 if (!MI.getOperand(2).isKill())
1736 MRI->clearKillFlags(Src1);
1737 MI.eraseFromParent();
1738 return true;
1739}
1740
1741bool SIFoldOperandsImpl::foldInstOperand(MachineInstr &MI,
1742 const FoldableDef &OpToFold) const {
1743 // We need to mutate the operands of new mov instructions to add implicit
1744 // uses of EXEC, but adding them invalidates the use_iterator, so defer
1745 // this.
1746 SmallVector<MachineInstr *, 4> CopiesToReplace;
1747  SmallVector<FoldCandidate, 4> FoldList;
1748  MachineOperand &Dst = MI.getOperand(0);
1749 bool Changed = false;
1750
1751 if (OpToFold.isImm()) {
1752 for (auto &UseMI :
1753 make_early_inc_range(MRI->use_nodbg_instructions(Dst.getReg()))) {
1754 // Folding the immediate may reveal operations that can be constant
1755 // folded or replaced with a copy. This can happen for example after
1756 // frame indices are lowered to constants or from splitting 64-bit
1757 // constants.
1758 //
1759 // We may also encounter cases where one or both operands are
1760 // immediates materialized into a register, which would ordinarily not
1761 // be folded due to multiple uses or operand constraints.
1762 if (tryConstantFoldOp(&UseMI)) {
1763 LLVM_DEBUG(dbgs() << "Constant folded " << UseMI);
1764 Changed = true;
1765 }
1766 }
1767 }
1768
1769  SmallVector<MachineOperand *, 4> UsesToProcess(
1770      llvm::make_pointer_range(MRI->use_nodbg_operands(Dst.getReg())));
1771 for (auto *U : UsesToProcess) {
1772 MachineInstr *UseMI = U->getParent();
1773
1774 FoldableDef SubOpToFold = OpToFold.getWithSubReg(*TRI, U->getSubReg());
1775 foldOperand(SubOpToFold, UseMI, UseMI->getOperandNo(U), FoldList,
1776 CopiesToReplace);
1777 }
1778
1779 if (CopiesToReplace.empty() && FoldList.empty())
1780 return Changed;
1781
1782 MachineFunction *MF = MI.getParent()->getParent();
1783 // Make sure we add EXEC uses to any new v_mov instructions created.
1784 for (MachineInstr *Copy : CopiesToReplace)
1785 Copy->addImplicitDefUseOperands(*MF);
1786
1787 SetVector<MachineInstr *> ConstantFoldCandidates;
1788 for (FoldCandidate &Fold : FoldList) {
1789 assert(!Fold.isReg() || Fold.Def.OpToFold);
1790 if (Fold.isReg() && Fold.getReg().isVirtual()) {
1791 Register Reg = Fold.getReg();
1792 const MachineInstr *DefMI = Fold.Def.DefMI;
1793 if (DefMI->readsRegister(AMDGPU::EXEC, TRI) &&
1794 execMayBeModifiedBeforeUse(*MRI, Reg, *DefMI, *Fold.UseMI))
1795 continue;
1796 }
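    // NOTE (editorial, not from the upstream file): the guard above skips a
    // register fold whose defining instruction reads EXEC when EXEC may be
    // modified between the def and the use, since the folded value could then
    // observe a different exec mask at the use point.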
1797 if (updateOperand(Fold)) {
1798 // Clear kill flags.
1799 if (Fold.isReg()) {
1800 assert(Fold.Def.OpToFold && Fold.isReg());
1801 // FIXME: Probably shouldn't bother trying to fold if not an
1802 // SGPR. PeepholeOptimizer can eliminate redundant VGPR->VGPR
1803 // copies.
1804 MRI->clearKillFlags(Fold.getReg());
1805 }
1806 LLVM_DEBUG(dbgs() << "Folded source from " << MI << " into OpNo "
1807 << static_cast<int>(Fold.UseOpNo) << " of "
1808 << *Fold.UseMI);
1809
1810 if (Fold.isImm())
1811 ConstantFoldCandidates.insert(Fold.UseMI);
1812
1813 } else if (Fold.Commuted) {
1814 // Restoring instruction's original operand order if fold has failed.
1815 TII->commuteInstruction(*Fold.UseMI, false);
1816 }
1817 }
1818
1819 for (MachineInstr *MI : ConstantFoldCandidates) {
1820 if (tryConstantFoldOp(MI)) {
1821 LLVM_DEBUG(dbgs() << "Constant folded " << *MI);
1822 Changed = true;
1823 }
1824 }
1825 return true;
1826}
1827
1828/// Fold %agpr = COPY (REG_SEQUENCE x_MOV_B32, ...) into REG_SEQUENCE
1829/// (V_ACCVGPR_WRITE_B32_e64) ... depending on the reg_sequence input values.
1830bool SIFoldOperandsImpl::foldCopyToAGPRRegSequence(MachineInstr *CopyMI) const {
1831 // It is very tricky to store a value into an AGPR. v_accvgpr_write_b32 can
1832 // only accept VGPR or inline immediate. Recreate a reg_sequence with its
1833 // initializers right here, so we will rematerialize immediates and avoid
1834 // copies via different reg classes.
1835 const TargetRegisterClass *DefRC =
1836 MRI->getRegClass(CopyMI->getOperand(0).getReg());
1837 if (!TRI->isAGPRClass(DefRC))
1838 return false;
1839
1840 Register UseReg = CopyMI->getOperand(1).getReg();
1841 MachineInstr *RegSeq = MRI->getVRegDef(UseReg);
1842 if (!RegSeq || !RegSeq->isRegSequence())
1843 return false;
1844
1845 const DebugLoc &DL = CopyMI->getDebugLoc();
1846 MachineBasicBlock &MBB = *CopyMI->getParent();
1847
1848 MachineInstrBuilder B(*MBB.getParent(), CopyMI);
1849 DenseMap<TargetInstrInfo::RegSubRegPair, Register> VGPRCopies;
1850
1851 const TargetRegisterClass *UseRC =
1852 MRI->getRegClass(CopyMI->getOperand(1).getReg());
1853
1854  // Value, subreg index for the new REG_SEQUENCE
1855  SmallVector<std::pair<MachineOperand *, unsigned>, 32> NewDefs;
1856
1857 unsigned NumRegSeqOperands = RegSeq->getNumOperands();
1858 unsigned NumFoldable = 0;
1859
1860 for (unsigned I = 1; I != NumRegSeqOperands; I += 2) {
1861 MachineOperand &RegOp = RegSeq->getOperand(I);
1862 unsigned SubRegIdx = RegSeq->getOperand(I + 1).getImm();
1863
1864 if (RegOp.getSubReg()) {
1865 // TODO: Handle subregister compose
1866 NewDefs.emplace_back(&RegOp, SubRegIdx);
1867 continue;
1868 }
1869
1870 MachineOperand *Lookup = lookUpCopyChain(*TII, *MRI, RegOp.getReg());
1871 if (!Lookup)
1872 Lookup = &RegOp;
1873
1874 if (Lookup->isImm()) {
1875 // Check if this is an agpr_32 subregister.
1876 const TargetRegisterClass *DestSuperRC = TRI->getMatchingSuperRegClass(
1877 DefRC, &AMDGPU::AGPR_32RegClass, SubRegIdx);
1878 if (DestSuperRC &&
1879 TII->isInlineConstant(*Lookup, AMDGPU::OPERAND_REG_INLINE_C_INT32)) {
1880 ++NumFoldable;
1881 NewDefs.emplace_back(Lookup, SubRegIdx);
1882 continue;
1883 }
1884 }
1885
1886 const TargetRegisterClass *InputRC =
1887 Lookup->isReg() ? MRI->getRegClass(Lookup->getReg())
1888 : MRI->getRegClass(RegOp.getReg());
1889
1890 // TODO: Account for Lookup->getSubReg()
1891
1892 // If we can't find a matching super class, this is an SGPR->AGPR or
1893 // VGPR->AGPR subreg copy (or something constant-like we have to materialize
1894 // in the AGPR). We can't directly copy from SGPR to AGPR on gfx908, so we
1895 // want to rewrite to copy to an intermediate VGPR class.
1896 const TargetRegisterClass *MatchRC =
1897 TRI->getMatchingSuperRegClass(DefRC, InputRC, SubRegIdx);
1898 if (!MatchRC) {
1899 ++NumFoldable;
1900 NewDefs.emplace_back(&RegOp, SubRegIdx);
1901 continue;
1902 }
1903
1904 NewDefs.emplace_back(&RegOp, SubRegIdx);
1905 }
1906
1907 // Do not clone a reg_sequence and merely change the result register class.
1908 if (NumFoldable == 0)
1909 return false;
1910
1911 CopyMI->setDesc(TII->get(AMDGPU::REG_SEQUENCE));
1912 for (unsigned I = CopyMI->getNumOperands() - 1; I > 0; --I)
1913 CopyMI->removeOperand(I);
1914
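  // NOTE (editorial, not from the upstream file): CopyMI has now been
  // re-purposed as an operand-less REG_SEQUENCE; the loop below rebuilds its
  // operands from NewDefs, writing inline immediates through
  // V_ACCVGPR_WRITE_B32_e64 and routing other inputs through cached
  // intermediate VGPR copies where needed.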
1915 for (auto [Def, DestSubIdx] : NewDefs) {
1916 if (!Def->isReg()) {
1917 // TODO: Should we use single write for each repeated value like in
1918 // register case?
1919 Register Tmp = MRI->createVirtualRegister(&AMDGPU::AGPR_32RegClass);
1920 BuildMI(MBB, CopyMI, DL, TII->get(AMDGPU::V_ACCVGPR_WRITE_B32_e64), Tmp)
1921 .add(*Def);
1922 B.addReg(Tmp);
1923 } else {
1924 TargetInstrInfo::RegSubRegPair Src = getRegSubRegPair(*Def);
1925 Def->setIsKill(false);
1926
1927 Register &VGPRCopy = VGPRCopies[Src];
1928 if (!VGPRCopy) {
1929 const TargetRegisterClass *VGPRUseSubRC =
1930 TRI->getSubRegisterClass(UseRC, DestSubIdx);
1931
1932        // We cannot build a reg_sequence out of the same registers; they
1933        // must be copied. Better to do it here before copyPhysReg() creates
1934        // several reads to do the AGPR->VGPR->AGPR copy.
1935
1936 // Direct copy from SGPR to AGPR is not possible on gfx908. To avoid
1937 // creation of exploded copies SGPR->VGPR->AGPR in the copyPhysReg()
1938 // later, create a copy here and track if we already have such a copy.
1939 const TargetRegisterClass *SubRC =
1940 TRI->getSubRegisterClass(MRI->getRegClass(Src.Reg), Src.SubReg);
1941 if (!VGPRUseSubRC->hasSubClassEq(SubRC)) {
1942 // TODO: Try to reconstrain class
1943 VGPRCopy = MRI->createVirtualRegister(VGPRUseSubRC);
1944 BuildMI(MBB, CopyMI, DL, TII->get(AMDGPU::COPY), VGPRCopy).add(*Def);
1945 B.addReg(VGPRCopy);
1946 } else {
1947 // If it is already a VGPR, do not copy the register.
1948 B.add(*Def);
1949 }
1950 } else {
1951 B.addReg(VGPRCopy);
1952 }
1953 }
1954
1955 B.addImm(DestSubIdx);
1956 }
1957
1958 LLVM_DEBUG(dbgs() << "Folded " << *CopyMI);
1959 return true;
1960}
1961
1962bool SIFoldOperandsImpl::tryFoldFoldableCopy(
1963 MachineInstr &MI, MachineOperand *&CurrentKnownM0Val) const {
1964 Register DstReg = MI.getOperand(0).getReg();
1965 // Specially track simple redefs of m0 to the same value in a block, so we
1966 // can erase the later ones.
1967 if (DstReg == AMDGPU::M0) {
1968 MachineOperand &NewM0Val = MI.getOperand(1);
1969 if (CurrentKnownM0Val && CurrentKnownM0Val->isIdenticalTo(NewM0Val)) {
1970 MI.eraseFromParent();
1971 return true;
1972 }
1973
1974 // We aren't tracking other physical registers
1975 CurrentKnownM0Val = (NewM0Val.isReg() && NewM0Val.getReg().isPhysical())
1976 ? nullptr
1977 : &NewM0Val;
1978 return false;
1979 }
1980
1981 MachineOperand *OpToFoldPtr;
1982 if (MI.getOpcode() == AMDGPU::V_MOV_B16_t16_e64) {
1983 // Folding when any src_modifiers are non-zero is unsupported
1984 if (TII->hasAnyModifiersSet(MI))
1985 return false;
1986 OpToFoldPtr = &MI.getOperand(2);
1987 } else
1988 OpToFoldPtr = &MI.getOperand(1);
1989 MachineOperand &OpToFold = *OpToFoldPtr;
1990 bool FoldingImm = OpToFold.isImm() || OpToFold.isFI() || OpToFold.isGlobal();
1991
1992 // FIXME: We could also be folding things like TargetIndexes.
1993 if (!FoldingImm && !OpToFold.isReg())
1994 return false;
1995
1996 // Fold virtual registers and constant physical registers.
1997 if (OpToFold.isReg() && OpToFold.getReg().isPhysical() &&
1998 !TRI->isConstantPhysReg(OpToFold.getReg()))
1999 return false;
2000
2001 // Prevent folding operands backwards in the function. For example,
2002 // the COPY opcode must not be replaced by 1 in this example:
2003 //
2004 // %3 = COPY %vgpr0; VGPR_32:%3
2005 // ...
2006 // %vgpr0 = V_MOV_B32_e32 1, implicit %exec
2007 if (!DstReg.isVirtual())
2008 return false;
2009
2010 const TargetRegisterClass *DstRC =
2011 MRI->getRegClass(MI.getOperand(0).getReg());
2012
2013 // True16: Fix malformed 16-bit sgpr COPY produced by peephole-opt
2014 // Can remove this code if proper 16-bit SGPRs are implemented
2015 // Example: Pre-peephole-opt
2016 // %29:sgpr_lo16 = COPY %16.lo16:sreg_32
2017 // %32:sreg_32 = COPY %29:sgpr_lo16
2018 // %30:sreg_32 = S_PACK_LL_B32_B16 killed %31:sreg_32, killed %32:sreg_32
2019 // Post-peephole-opt and DCE
2020 // %32:sreg_32 = COPY %16.lo16:sreg_32
2021 // %30:sreg_32 = S_PACK_LL_B32_B16 killed %31:sreg_32, killed %32:sreg_32
2022 // After this transform
2023 // %32:sreg_32 = COPY %16:sreg_32
2024 // %30:sreg_32 = S_PACK_LL_B32_B16 killed %31:sreg_32, killed %32:sreg_32
2025 // After the fold operands pass
2026 // %30:sreg_32 = S_PACK_LL_B32_B16 killed %31:sreg_32, killed %16:sreg_32
2027 if (MI.getOpcode() == AMDGPU::COPY && OpToFold.isReg() &&
2028 OpToFold.getSubReg()) {
2029 if (DstRC == &AMDGPU::SReg_32RegClass &&
2030 DstRC == MRI->getRegClass(OpToFold.getReg())) {
2031 assert(OpToFold.getSubReg() == AMDGPU::lo16);
2032 OpToFold.setSubReg(0);
2033 }
2034 }
2035
2036 // Fold copy to AGPR through reg_sequence
2037 // TODO: Handle with subregister extract
2038 if (OpToFold.isReg() && MI.isCopy() && !MI.getOperand(1).getSubReg()) {
2039 if (foldCopyToAGPRRegSequence(&MI))
2040 return true;
2041 }
2042
2043 FoldableDef Def(OpToFold, DstRC);
2044 bool Changed = foldInstOperand(MI, Def);
2045
2046 // If we managed to fold all uses of this copy then we might as well
2047 // delete it now.
2048 // The only reason we need to follow chains of copies here is that
2049 // tryFoldRegSequence looks forward through copies before folding a
2050 // REG_SEQUENCE into its eventual users.
2051 auto *InstToErase = &MI;
2052 while (MRI->use_nodbg_empty(InstToErase->getOperand(0).getReg())) {
2053 auto &SrcOp = InstToErase->getOperand(1);
2054 auto SrcReg = SrcOp.isReg() ? SrcOp.getReg() : Register();
2055 InstToErase->eraseFromParent();
2056 Changed = true;
2057 InstToErase = nullptr;
2058 if (!SrcReg || SrcReg.isPhysical())
2059 break;
2060 InstToErase = MRI->getVRegDef(SrcReg);
2061 if (!InstToErase || !TII->isFoldableCopy(*InstToErase))
2062 break;
2063 }
2064
2065 if (InstToErase && InstToErase->isRegSequence() &&
2066 MRI->use_nodbg_empty(InstToErase->getOperand(0).getReg())) {
2067 InstToErase->eraseFromParent();
2068 Changed = true;
2069 }
2070
2071 if (Changed)
2072 return true;
2073
2074 // Run this after foldInstOperand to avoid turning scalar additions into
2075  // vector additions when the scalar result could just be folded into
2076 // the user(s).
2077 return OpToFold.isReg() &&
2078 foldCopyToVGPROfScalarAddOfFrameIndex(DstReg, OpToFold.getReg(), MI);
2079}
2080
2081// Clamp patterns are canonically selected to v_max_* instructions, so only
2082// handle them.
2083const MachineOperand *
2084SIFoldOperandsImpl::isClamp(const MachineInstr &MI) const {
2085 unsigned Op = MI.getOpcode();
2086 switch (Op) {
2087 case AMDGPU::V_MAX_F32_e64:
2088 case AMDGPU::V_MAX_F16_e64:
2089 case AMDGPU::V_MAX_F16_t16_e64:
2090 case AMDGPU::V_MAX_F16_fake16_e64:
2091 case AMDGPU::V_MAX_F64_e64:
2092 case AMDGPU::V_MAX_NUM_F64_e64:
2093 case AMDGPU::V_PK_MAX_F16:
2094 case AMDGPU::V_MAX_BF16_PSEUDO_e64:
2095 case AMDGPU::V_PK_MAX_NUM_BF16: {
2096 if (MI.mayRaiseFPException())
2097 return nullptr;
2098
2099 if (!TII->getNamedOperand(MI, AMDGPU::OpName::clamp)->getImm())
2100 return nullptr;
2101
2102 // Make sure sources are identical.
2103 const MachineOperand *Src0 = TII->getNamedOperand(MI, AMDGPU::OpName::src0);
2104 const MachineOperand *Src1 = TII->getNamedOperand(MI, AMDGPU::OpName::src1);
2105 if (!Src0->isReg() || !Src1->isReg() ||
2106 Src0->getReg() != Src1->getReg() ||
2107 Src0->getSubReg() != Src1->getSubReg() ||
2108 Src0->getSubReg() != AMDGPU::NoSubRegister)
2109 return nullptr;
2110
2111 // Can't fold up if we have modifiers.
2112 if (TII->hasModifiersSet(MI, AMDGPU::OpName::omod))
2113 return nullptr;
2114
2115 unsigned Src0Mods
2116 = TII->getNamedOperand(MI, AMDGPU::OpName::src0_modifiers)->getImm();
2117 unsigned Src1Mods
2118 = TII->getNamedOperand(MI, AMDGPU::OpName::src1_modifiers)->getImm();
2119
2120 // Having a 0 op_sel_hi would require swizzling the output in the source
2121 // instruction, which we can't do.
2122 unsigned UnsetMods =
2123 (Op == AMDGPU::V_PK_MAX_F16 || Op == AMDGPU::V_PK_MAX_NUM_BF16)
2124            ? SISrcMods::OP_SEL_1
2125            : 0u;
2126 if (Src0Mods != UnsetMods && Src1Mods != UnsetMods)
2127 return nullptr;
2128 return Src0;
2129 }
2130 default:
2131 return nullptr;
2132 }
2133}
2134
2135// FIXME: Clamp for v_mad_mixhi_f16 handled during isel.
2136bool SIFoldOperandsImpl::tryFoldClamp(MachineInstr &MI) {
2137 const MachineOperand *ClampSrc = isClamp(MI);
2138 if (!ClampSrc || !MRI->hasOneNonDBGUser(ClampSrc->getReg()))
2139 return false;
2140
2141 if (!ClampSrc->getReg().isVirtual())
2142 return false;
2143
2144 // Look through COPY. COPY only observed with True16.
2145 Register DefSrcReg = TRI->lookThruCopyLike(ClampSrc->getReg(), MRI);
2146 MachineInstr *Def =
2147 MRI->getVRegDef(DefSrcReg.isVirtual() ? DefSrcReg : ClampSrc->getReg());
2148
2149 // The type of clamp must be compatible.
2150 if (TII->getClampMask(*Def) != TII->getClampMask(MI))
2151 return false;
2152
2153 if (Def->mayRaiseFPException())
2154 return false;
2155
2156 MachineOperand *DefClamp = TII->getNamedOperand(*Def, AMDGPU::OpName::clamp);
2157 if (!DefClamp)
2158 return false;
2159
2160 LLVM_DEBUG(dbgs() << "Folding clamp " << *DefClamp << " into " << *Def);
2161
2162 // Clamp is applied after omod, so it is OK if omod is set.
2163 DefClamp->setImm(1);
2164
2165 Register DefReg = Def->getOperand(0).getReg();
2166 Register MIDstReg = MI.getOperand(0).getReg();
2167 if (TRI->isSGPRReg(*MRI, DefReg)) {
2168 // Pseudo scalar instructions have a SGPR for dst and clamp is a v_max*
2169 // instruction with a VGPR dst.
2170 BuildMI(*MI.getParent(), MI, MI.getDebugLoc(), TII->get(AMDGPU::COPY),
2171 MIDstReg)
2172 .addReg(DefReg);
2173 } else {
2174 MRI->replaceRegWith(MIDstReg, DefReg);
2175 }
2176 MI.eraseFromParent();
2177
2178 // Use of output modifiers forces VOP3 encoding for a VOP2 mac/fmac
2179 // instruction, so we might as well convert it to the more flexible VOP3-only
2180 // mad/fma form.
2181 if (TII->convertToThreeAddress(*Def, nullptr, nullptr))
2182 Def->eraseFromParent();
2183
2184 return true;
2185}
2186
2187static int getOModValue(unsigned Opc, int64_t Val) {
2188 switch (Opc) {
2189 case AMDGPU::V_MUL_F64_e64:
2190 case AMDGPU::V_MUL_F64_pseudo_e64: {
2191 switch (Val) {
2192 case 0x3fe0000000000000: // 0.5
2193 return SIOutMods::DIV2;
2194 case 0x4000000000000000: // 2.0
2195 return SIOutMods::MUL2;
2196 case 0x4010000000000000: // 4.0
2197 return SIOutMods::MUL4;
2198 default:
2199 return SIOutMods::NONE;
2200 }
2201 }
2202 case AMDGPU::V_MUL_F32_e64: {
2203 switch (static_cast<uint32_t>(Val)) {
2204 case 0x3f000000: // 0.5
2205 return SIOutMods::DIV2;
2206 case 0x40000000: // 2.0
2207 return SIOutMods::MUL2;
2208 case 0x40800000: // 4.0
2209 return SIOutMods::MUL4;
2210 default:
2211 return SIOutMods::NONE;
2212 }
2213 }
2214 case AMDGPU::V_MUL_F16_e64:
2215 case AMDGPU::V_MUL_F16_t16_e64:
2216 case AMDGPU::V_MUL_F16_fake16_e64: {
2217 switch (static_cast<uint16_t>(Val)) {
2218 case 0x3800: // 0.5
2219 return SIOutMods::DIV2;
2220 case 0x4000: // 2.0
2221 return SIOutMods::MUL2;
2222 case 0x4400: // 4.0
2223 return SIOutMods::MUL4;
2224 default:
2225 return SIOutMods::NONE;
2226 }
2227 }
2228 default:
2229 llvm_unreachable("invalid mul opcode");
2230 }
2231}
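// NOTE (editorial, not from the upstream file): getOModValue maps a multiply
// by 0.5, 2.0 or 4.0 (matched against the bit pattern for the multiply's
// result type) onto the hardware output-modifier encoding; any other constant
// yields SIOutMods::NONE, which blocks the omod fold in isOMod() below.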
2232
2233// FIXME: Does this really not support denormals with f16?
2234// FIXME: Does this need to check IEEE mode bit? SNaNs are generally not
2235// handled, so will anything other than that break?
2236std::pair<const MachineOperand *, int>
2237SIFoldOperandsImpl::isOMod(const MachineInstr &MI) const {
2238 unsigned Op = MI.getOpcode();
2239 switch (Op) {
2240 case AMDGPU::V_MUL_F64_e64:
2241 case AMDGPU::V_MUL_F64_pseudo_e64:
2242 case AMDGPU::V_MUL_F32_e64:
2243 case AMDGPU::V_MUL_F16_t16_e64:
2244 case AMDGPU::V_MUL_F16_fake16_e64:
2245 case AMDGPU::V_MUL_F16_e64: {
2246 // If output denormals are enabled, omod is ignored.
2247    if ((Op == AMDGPU::V_MUL_F32_e64 &&
2248         MFI->getMode().FP32Denormals.Output != DenormalMode::PreserveSign) ||
2249        ((Op == AMDGPU::V_MUL_F64_e64 || Op == AMDGPU::V_MUL_F64_pseudo_e64 ||
2250          Op == AMDGPU::V_MUL_F16_e64 || Op == AMDGPU::V_MUL_F16_t16_e64 ||
2251          Op == AMDGPU::V_MUL_F16_fake16_e64) &&
2252         MFI->getMode().FP64FP16Denormals.Output !=
2253             DenormalMode::PreserveSign) ||
2254        MI.mayRaiseFPException())
2255 return std::pair(nullptr, SIOutMods::NONE);
2256
2257 const MachineOperand *RegOp = nullptr;
2258 const MachineOperand *ImmOp = nullptr;
2259 const MachineOperand *Src0 = TII->getNamedOperand(MI, AMDGPU::OpName::src0);
2260 const MachineOperand *Src1 = TII->getNamedOperand(MI, AMDGPU::OpName::src1);
2261 if (Src0->isImm()) {
2262 ImmOp = Src0;
2263 RegOp = Src1;
2264 } else if (Src1->isImm()) {
2265 ImmOp = Src1;
2266 RegOp = Src0;
2267 } else
2268 return std::pair(nullptr, SIOutMods::NONE);
2269
2270 int OMod = getOModValue(Op, ImmOp->getImm());
2271 if (OMod == SIOutMods::NONE ||
2272 TII->hasModifiersSet(MI, AMDGPU::OpName::src0_modifiers) ||
2273 TII->hasModifiersSet(MI, AMDGPU::OpName::src1_modifiers) ||
2274 TII->hasModifiersSet(MI, AMDGPU::OpName::omod) ||
2275 TII->hasModifiersSet(MI, AMDGPU::OpName::clamp))
2276 return std::pair(nullptr, SIOutMods::NONE);
2277
2278 return std::pair(RegOp, OMod);
2279 }
2280 case AMDGPU::V_ADD_F64_e64:
2281 case AMDGPU::V_ADD_F64_pseudo_e64:
2282 case AMDGPU::V_ADD_F32_e64:
2283 case AMDGPU::V_ADD_F16_e64:
2284 case AMDGPU::V_ADD_F16_t16_e64:
2285 case AMDGPU::V_ADD_F16_fake16_e64: {
2286 // If output denormals are enabled, omod is ignored.
2287    if ((Op == AMDGPU::V_ADD_F32_e64 &&
2288         MFI->getMode().FP32Denormals.Output != DenormalMode::PreserveSign) ||
2289        ((Op == AMDGPU::V_ADD_F64_e64 || Op == AMDGPU::V_ADD_F64_pseudo_e64 ||
2290          Op == AMDGPU::V_ADD_F16_e64 || Op == AMDGPU::V_ADD_F16_t16_e64 ||
2291          Op == AMDGPU::V_ADD_F16_fake16_e64) &&
2292         MFI->getMode().FP64FP16Denormals.Output != DenormalMode::PreserveSign))
2293 return std::pair(nullptr, SIOutMods::NONE);
2294
2295 // Look through the DAGCombiner canonicalization fmul x, 2 -> fadd x, x
2296 const MachineOperand *Src0 = TII->getNamedOperand(MI, AMDGPU::OpName::src0);
2297 const MachineOperand *Src1 = TII->getNamedOperand(MI, AMDGPU::OpName::src1);
2298
2299 if (Src0->isReg() && Src1->isReg() && Src0->getReg() == Src1->getReg() &&
2300 Src0->getSubReg() == Src1->getSubReg() &&
2301 !TII->hasModifiersSet(MI, AMDGPU::OpName::src0_modifiers) &&
2302 !TII->hasModifiersSet(MI, AMDGPU::OpName::src1_modifiers) &&
2303 !TII->hasModifiersSet(MI, AMDGPU::OpName::clamp) &&
2304 !TII->hasModifiersSet(MI, AMDGPU::OpName::omod))
2305 return std::pair(Src0, SIOutMods::MUL2);
2306
2307 return std::pair(nullptr, SIOutMods::NONE);
2308 }
2309 default:
2310 return std::pair(nullptr, SIOutMods::NONE);
2311 }
2312}
2313
2314// FIXME: Does this need to check IEEE bit on function?
2315bool SIFoldOperandsImpl::tryFoldOMod(MachineInstr &MI) {
2316 const MachineOperand *RegOp;
2317 int OMod;
2318 std::tie(RegOp, OMod) = isOMod(MI);
2319 if (OMod == SIOutMods::NONE || !RegOp->isReg() ||
2320 RegOp->getSubReg() != AMDGPU::NoSubRegister ||
2321 !MRI->hasOneNonDBGUser(RegOp->getReg()))
2322 return false;
2323
2324 MachineInstr *Def = MRI->getVRegDef(RegOp->getReg());
2325 MachineOperand *DefOMod = TII->getNamedOperand(*Def, AMDGPU::OpName::omod);
2326 if (!DefOMod || DefOMod->getImm() != SIOutMods::NONE)
2327 return false;
2328
2329 if (Def->mayRaiseFPException())
2330 return false;
2331
2332 // Clamp is applied after omod. If the source already has clamp set, don't
2333 // fold it.
2334 if (TII->hasModifiersSet(*Def, AMDGPU::OpName::clamp))
2335 return false;
2336
2337 LLVM_DEBUG(dbgs() << "Folding omod " << MI << " into " << *Def);
2338
2339 DefOMod->setImm(OMod);
2340 MRI->replaceRegWith(MI.getOperand(0).getReg(), Def->getOperand(0).getReg());
2341 // Kill flags can be wrong if we replaced a def inside a loop with a def
2342 // outside the loop.
2343 MRI->clearKillFlags(Def->getOperand(0).getReg());
2344 MI.eraseFromParent();
2345
2346 // Use of output modifiers forces VOP3 encoding for a VOP2 mac/fmac
2347 // instruction, so we might as well convert it to the more flexible VOP3-only
2348 // mad/fma form.
2349 if (TII->convertToThreeAddress(*Def, nullptr, nullptr))
2350 Def->eraseFromParent();
2351
2352 return true;
2353}
2354
2355// Try to fold a reg_sequence with vgpr output and agpr inputs into an
2356// instruction which can take an agpr. So far that means a store.
2357bool SIFoldOperandsImpl::tryFoldRegSequence(MachineInstr &MI) {
2358 assert(MI.isRegSequence());
2359 auto Reg = MI.getOperand(0).getReg();
2360
2361 if (!ST->hasGFX90AInsts() || !TRI->isVGPR(*MRI, Reg) ||
2362 !MRI->hasOneNonDBGUse(Reg))
2363 return false;
2364
2365  SmallVector<std::pair<MachineOperand *, unsigned>, 32> Defs;
2366 if (!getRegSeqInit(Defs, Reg))
2367 return false;
2368
2369 for (auto &[Op, SubIdx] : Defs) {
2370 if (!Op->isReg())
2371 return false;
2372 if (TRI->isAGPR(*MRI, Op->getReg()))
2373 continue;
2374 // Maybe this is a COPY from AREG
2375 const MachineInstr *SubDef = MRI->getVRegDef(Op->getReg());
2376 if (!SubDef || !SubDef->isCopy() || SubDef->getOperand(1).getSubReg())
2377 return false;
2378 if (!TRI->isAGPR(*MRI, SubDef->getOperand(1).getReg()))
2379 return false;
2380 }
2381
2382 MachineOperand *Op = &*MRI->use_nodbg_begin(Reg);
2383 MachineInstr *UseMI = Op->getParent();
2384 while (UseMI->isCopy() && !Op->getSubReg()) {
2385 Reg = UseMI->getOperand(0).getReg();
2386 if (!TRI->isVGPR(*MRI, Reg) || !MRI->hasOneNonDBGUse(Reg))
2387 return false;
2388 Op = &*MRI->use_nodbg_begin(Reg);
2389 UseMI = Op->getParent();
2390 }
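  // NOTE (editorial, not from the upstream file): Op/UseMI now refer to the
  // single non-COPY user reached by looking through full-register copies; the
  // checks below verify that this operand accepts a vector super class (AV)
  // register before the AGPR reg_sequence is built.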
2391
2392 if (Op->getSubReg())
2393 return false;
2394
2395 unsigned OpIdx = Op - &UseMI->getOperand(0);
2396 const MCInstrDesc &InstDesc = UseMI->getDesc();
2397 const TargetRegisterClass *OpRC = TII->getRegClass(InstDesc, OpIdx, TRI);
2398 if (!OpRC || !TRI->isVectorSuperClass(OpRC))
2399 return false;
2400
2401 const auto *NewDstRC = TRI->getEquivalentAGPRClass(MRI->getRegClass(Reg));
2402 auto Dst = MRI->createVirtualRegister(NewDstRC);
2403 auto RS = BuildMI(*MI.getParent(), MI, MI.getDebugLoc(),
2404 TII->get(AMDGPU::REG_SEQUENCE), Dst);
2405
2406 for (auto &[Def, SubIdx] : Defs) {
2407 Def->setIsKill(false);
2408 if (TRI->isAGPR(*MRI, Def->getReg())) {
2409 RS.add(*Def);
2410 } else { // This is a copy
2411 MachineInstr *SubDef = MRI->getVRegDef(Def->getReg());
2412 SubDef->getOperand(1).setIsKill(false);
2413 RS.addReg(SubDef->getOperand(1).getReg(), 0, Def->getSubReg());
2414 }
2415 RS.addImm(SubIdx);
2416 }
2417
2418 Op->setReg(Dst);
2419 if (!TII->isOperandLegal(*UseMI, OpIdx, Op)) {
2420 Op->setReg(Reg);
2421 RS->eraseFromParent();
2422 return false;
2423 }
2424
2425 LLVM_DEBUG(dbgs() << "Folded " << *RS << " into " << *UseMI);
2426
2427 // Erase the REG_SEQUENCE eagerly, unless we followed a chain of COPY users,
2428 // in which case we can erase them all later in runOnMachineFunction.
2429 if (MRI->use_nodbg_empty(MI.getOperand(0).getReg()))
2430 MI.eraseFromParent();
2431 return true;
2432}
2433
2434/// Checks whether \p Copy is an AGPR -> VGPR copy. Returns `true` on success and
2435/// stores the AGPR register in \p OutReg and the subreg in \p OutSubReg
2436static bool isAGPRCopy(const SIRegisterInfo &TRI,
2437 const MachineRegisterInfo &MRI, const MachineInstr &Copy,
2438 Register &OutReg, unsigned &OutSubReg) {
2439 assert(Copy.isCopy());
2440
2441 const MachineOperand &CopySrc = Copy.getOperand(1);
2442 Register CopySrcReg = CopySrc.getReg();
2443 if (!CopySrcReg.isVirtual())
2444 return false;
2445
2446 // Common case: copy from AGPR directly, e.g.
2447 // %1:vgpr_32 = COPY %0:agpr_32
2448 if (TRI.isAGPR(MRI, CopySrcReg)) {
2449 OutReg = CopySrcReg;
2450 OutSubReg = CopySrc.getSubReg();
2451 return true;
2452 }
2453
2454 // Sometimes it can also involve two copies, e.g.
2455 // %1:vgpr_256 = COPY %0:agpr_256
2456 // %2:vgpr_32 = COPY %1:vgpr_256.sub0
2457 const MachineInstr *CopySrcDef = MRI.getVRegDef(CopySrcReg);
2458 if (!CopySrcDef || !CopySrcDef->isCopy())
2459 return false;
2460
2461 const MachineOperand &OtherCopySrc = CopySrcDef->getOperand(1);
2462 Register OtherCopySrcReg = OtherCopySrc.getReg();
2463 if (!OtherCopySrcReg.isVirtual() ||
2464 CopySrcDef->getOperand(0).getSubReg() != AMDGPU::NoSubRegister ||
2465 OtherCopySrc.getSubReg() != AMDGPU::NoSubRegister ||
2466 !TRI.isAGPR(MRI, OtherCopySrcReg))
2467 return false;
2468
2469 OutReg = OtherCopySrcReg;
2470 OutSubReg = CopySrc.getSubReg();
2471 return true;
2472}
2473
2474// Try to hoist an AGPR to VGPR copy across a PHI.
2475// This should allow folding of an AGPR into a consumer which may support it.
2476//
2477// Example 1: LCSSA PHI
2478// loop:
2479// %1:vreg = COPY %0:areg
2480// exit:
2481// %2:vreg = PHI %1:vreg, %loop
2482// =>
2483// loop:
2484// exit:
2485// %1:areg = PHI %0:areg, %loop
2486// %2:vreg = COPY %1:areg
2487//
2488// Example 2: PHI with multiple incoming values:
2489// entry:
2490// %1:vreg = GLOBAL_LOAD(..)
2491// loop:
2492// %2:vreg = PHI %1:vreg, %entry, %5:vreg, %loop
2493// %3:areg = COPY %2:vreg
2494// %4:areg = (instr using %3:areg)
2495// %5:vreg = COPY %4:areg
2496// =>
2497// entry:
2498// %1:vreg = GLOBAL_LOAD(..)
2499// %2:areg = COPY %1:vreg
2500// loop:
2501// %3:areg = PHI %2:areg, %entry, %X:areg,
2502// %4:areg = (instr using %3:areg)
2503bool SIFoldOperandsImpl::tryFoldPhiAGPR(MachineInstr &PHI) {
2504 assert(PHI.isPHI());
2505
2506 Register PhiOut = PHI.getOperand(0).getReg();
2507 if (!TRI->isVGPR(*MRI, PhiOut))
2508 return false;
2509
2510 // Iterate once over all incoming values of the PHI to check if this PHI is
2511 // eligible, and determine the exact AGPR RC we'll target.
2512 const TargetRegisterClass *ARC = nullptr;
2513 for (unsigned K = 1; K < PHI.getNumExplicitOperands(); K += 2) {
2514 MachineOperand &MO = PHI.getOperand(K);
2515 MachineInstr *Copy = MRI->getVRegDef(MO.getReg());
2516 if (!Copy || !Copy->isCopy())
2517 continue;
2518
2519 Register AGPRSrc;
2520 unsigned AGPRRegMask = AMDGPU::NoSubRegister;
2521 if (!isAGPRCopy(*TRI, *MRI, *Copy, AGPRSrc, AGPRRegMask))
2522 continue;
2523
2524 const TargetRegisterClass *CopyInRC = MRI->getRegClass(AGPRSrc);
2525 if (const auto *SubRC = TRI->getSubRegisterClass(CopyInRC, AGPRRegMask))
2526 CopyInRC = SubRC;
2527
2528 if (ARC && !ARC->hasSubClassEq(CopyInRC))
2529 return false;
2530 ARC = CopyInRC;
2531 }
2532
2533 if (!ARC)
2534 return false;
2535
2536 bool IsAGPR32 = (ARC == &AMDGPU::AGPR_32RegClass);
2537
2538 // Rewrite the PHI's incoming values to ARC.
2539 LLVM_DEBUG(dbgs() << "Folding AGPR copies into: " << PHI);
2540 for (unsigned K = 1; K < PHI.getNumExplicitOperands(); K += 2) {
2541 MachineOperand &MO = PHI.getOperand(K);
2542 Register Reg = MO.getReg();
2543
2544    MachineBasicBlock::iterator InsertPt;
2545    MachineBasicBlock *InsertMBB = nullptr;
2546
2547 // Look at the def of Reg, ignoring all copies.
2548 unsigned CopyOpc = AMDGPU::COPY;
2549 if (MachineInstr *Def = MRI->getVRegDef(Reg)) {
2550
2551 // Look at pre-existing COPY instructions from ARC: Steal the operand. If
2552 // the copy was single-use, it will be removed by DCE later.
2553 if (Def->isCopy()) {
2554 Register AGPRSrc;
2555 unsigned AGPRSubReg = AMDGPU::NoSubRegister;
2556 if (isAGPRCopy(*TRI, *MRI, *Def, AGPRSrc, AGPRSubReg)) {
2557 MO.setReg(AGPRSrc);
2558 MO.setSubReg(AGPRSubReg);
2559 continue;
2560 }
2561
2562 // If this is a multi-use SGPR -> VGPR copy, use V_ACCVGPR_WRITE on
2563 // GFX908 directly instead of a COPY. Otherwise, SIFoldOperand may try
2564 // to fold the sgpr -> vgpr -> agpr copy into a sgpr -> agpr copy which
2565 // is unlikely to be profitable.
2566 //
2567 // Note that V_ACCVGPR_WRITE is only used for AGPR_32.
2568 MachineOperand &CopyIn = Def->getOperand(1);
2569 if (IsAGPR32 && !ST->hasGFX90AInsts() && !MRI->hasOneNonDBGUse(Reg) &&
2570 TRI->isSGPRReg(*MRI, CopyIn.getReg()))
2571 CopyOpc = AMDGPU::V_ACCVGPR_WRITE_B32_e64;
2572 }
2573
2574 InsertMBB = Def->getParent();
2575 InsertPt = InsertMBB->SkipPHIsLabelsAndDebug(++Def->getIterator());
2576 } else {
2577 InsertMBB = PHI.getOperand(MO.getOperandNo() + 1).getMBB();
2578 InsertPt = InsertMBB->getFirstTerminator();
2579 }
2580
2581 Register NewReg = MRI->createVirtualRegister(ARC);
2582 MachineInstr *MI = BuildMI(*InsertMBB, InsertPt, PHI.getDebugLoc(),
2583 TII->get(CopyOpc), NewReg)
2584 .addReg(Reg);
2585 MO.setReg(NewReg);
2586
2587 (void)MI;
2588 LLVM_DEBUG(dbgs() << " Created COPY: " << *MI);
2589 }
2590
2591 // Replace the PHI's result with a new register.
2592 Register NewReg = MRI->createVirtualRegister(ARC);
2593 PHI.getOperand(0).setReg(NewReg);
2594
2595 // COPY that new register back to the original PhiOut register. This COPY will
2596 // usually be folded out later.
2597 MachineBasicBlock *MBB = PHI.getParent();
2598 BuildMI(*MBB, MBB->getFirstNonPHI(), PHI.getDebugLoc(),
2599 TII->get(AMDGPU::COPY), PhiOut)
2600 .addReg(NewReg);
2601
2602 LLVM_DEBUG(dbgs() << " Done: Folded " << PHI);
2603 return true;
2604}
2605
2606// Attempt to convert VGPR load to an AGPR load.
2607bool SIFoldOperandsImpl::tryFoldLoad(MachineInstr &MI) {
2608 assert(MI.mayLoad());
2609 if (!ST->hasGFX90AInsts() || MI.getNumExplicitDefs() != 1)
2610 return false;
2611
2612 MachineOperand &Def = MI.getOperand(0);
2613 if (!Def.isDef())
2614 return false;
2615
2616 Register DefReg = Def.getReg();
2617
2618 if (DefReg.isPhysical() || !TRI->isVGPR(*MRI, DefReg))
2619 return false;
2620
2621  SmallVector<const MachineInstr *, 8> Users(
2622      llvm::make_pointer_range(MRI->use_nodbg_instructions(DefReg)));
2623 SmallVector<Register, 8> MoveRegs;
2624
2625 if (Users.empty())
2626 return false;
2627
2628  // Check that every use is a copy to an agpr or a reg_sequence producing an agpr.
2629 while (!Users.empty()) {
2630 const MachineInstr *I = Users.pop_back_val();
2631 if (!I->isCopy() && !I->isRegSequence())
2632 return false;
2633 Register DstReg = I->getOperand(0).getReg();
2634    // Physical registers may have more than one defining instruction.
2635 if (DstReg.isPhysical())
2636 return false;
2637 if (TRI->isAGPR(*MRI, DstReg))
2638 continue;
2639 MoveRegs.push_back(DstReg);
2640 for (const MachineInstr &U : MRI->use_nodbg_instructions(DstReg))
2641 Users.push_back(&U);
2642 }
2643
2644 const TargetRegisterClass *RC = MRI->getRegClass(DefReg);
2645 MRI->setRegClass(DefReg, TRI->getEquivalentAGPRClass(RC));
2646 if (!TII->isOperandLegal(MI, 0, &Def)) {
2647 MRI->setRegClass(DefReg, RC);
2648 return false;
2649 }
2650
2651 while (!MoveRegs.empty()) {
2652 Register Reg = MoveRegs.pop_back_val();
2653 MRI->setRegClass(Reg, TRI->getEquivalentAGPRClass(MRI->getRegClass(Reg)));
2654 }
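  // NOTE (editorial, not from the upstream file): the load's result register
  // and every intermediate copy/reg_sequence result have been re-classed to
  // their equivalent AGPR classes, so the load now feeds AGPRs directly.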
2655
2656 LLVM_DEBUG(dbgs() << "Folded " << MI);
2657
2658 return true;
2659}
2660
2661// tryFoldPhiAGPR will aggressively try to create AGPR PHIs.
2662// For GFX90A and later, this is pretty much always a good thing, but for GFX908
2663// there are cases where it can create a lot more AGPR-AGPR copies, which are
2664// expensive on this architecture due to the lack of V_ACCVGPR_MOV.
2665//
2666// This function looks at all AGPR PHIs in a basic block and collects their
2667// operands. Then, it checks for registers that are used more than once across
2668// all PHIs and caches them in a VGPR. This prevents ExpandPostRAPseudo from
2669// having to create one VGPR temporary per use, which can get very messy if
2670// these PHIs come from a broken-up large PHI (e.g. 32 AGPR phis, one per vector
2671// element).
2672//
2673// Example
2674// a:
2675// %in:agpr_256 = COPY %foo:vgpr_256
2676// c:
2677// %x:agpr_32 = ..
2678// b:
2679// %0:areg = PHI %in.sub0:agpr_32, %a, %x, %c
2680// %1:areg = PHI %in.sub0:agpr_32, %a, %y, %c
2681// %2:areg = PHI %in.sub0:agpr_32, %a, %z, %c
2682// =>
2683// a:
2684// %in:agpr_256 = COPY %foo:vgpr_256
2685// %tmp:vgpr_32 = V_ACCVGPR_READ_B32_e64 %in.sub0:agpr_32
2686// %tmp_agpr:agpr_32 = COPY %tmp
2687// c:
2688// %x:agpr_32 = ..
2689// b:
2690// %0:areg = PHI %tmp_agpr, %a, %x, %c
2691// %1:areg = PHI %tmp_agpr, %a, %y, %c
2692// %2:areg = PHI %tmp_agpr, %a, %z, %c
2693bool SIFoldOperandsImpl::tryOptimizeAGPRPhis(MachineBasicBlock &MBB) {
2694 // This is only really needed on GFX908 where AGPR-AGPR copies are
2695 // unreasonably difficult.
2696 if (ST->hasGFX90AInsts())
2697 return false;
2698
2699 // Look at all AGPR Phis and collect the register + subregister used.
2700 DenseMap<std::pair<Register, unsigned>, std::vector<MachineOperand *>>
2701 RegToMO;
2702
2703 for (auto &MI : MBB) {
2704 if (!MI.isPHI())
2705 break;
2706
2707 if (!TRI->isAGPR(*MRI, MI.getOperand(0).getReg()))
2708 continue;
2709
2710 for (unsigned K = 1; K < MI.getNumOperands(); K += 2) {
2711 MachineOperand &PhiMO = MI.getOperand(K);
2712 if (!PhiMO.getSubReg())
2713 continue;
2714 RegToMO[{PhiMO.getReg(), PhiMO.getSubReg()}].push_back(&PhiMO);
2715 }
2716 }
2717
2718  // For all (Reg, SubReg) pairs that are used more than once, cache the value in
2719 // a VGPR.
2720 bool Changed = false;
2721 for (const auto &[Entry, MOs] : RegToMO) {
2722 if (MOs.size() == 1)
2723 continue;
2724
2725 const auto [Reg, SubReg] = Entry;
2726 MachineInstr *Def = MRI->getVRegDef(Reg);
2727 MachineBasicBlock *DefMBB = Def->getParent();
2728
2729 // Create a copy in a VGPR using V_ACCVGPR_READ_B32_e64 so it's not folded
2730 // out.
2731 const TargetRegisterClass *ARC = getRegOpRC(*MRI, *TRI, *MOs.front());
2732 Register TempVGPR =
2733 MRI->createVirtualRegister(TRI->getEquivalentVGPRClass(ARC));
2734 MachineInstr *VGPRCopy =
2735 BuildMI(*DefMBB, ++Def->getIterator(), Def->getDebugLoc(),
2736 TII->get(AMDGPU::V_ACCVGPR_READ_B32_e64), TempVGPR)
2737 .addReg(Reg, /* flags */ 0, SubReg);
2738
2739 // Copy back to an AGPR and use that instead of the AGPR subreg in all MOs.
2740 Register TempAGPR = MRI->createVirtualRegister(ARC);
2741 BuildMI(*DefMBB, ++VGPRCopy->getIterator(), Def->getDebugLoc(),
2742 TII->get(AMDGPU::COPY), TempAGPR)
2743 .addReg(TempVGPR);
2744
2745 LLVM_DEBUG(dbgs() << "Caching AGPR into VGPR: " << *VGPRCopy);
2746 for (MachineOperand *MO : MOs) {
2747 MO->setReg(TempAGPR);
2748 MO->setSubReg(AMDGPU::NoSubRegister);
2749 LLVM_DEBUG(dbgs() << " Changed PHI Operand: " << *MO << "\n");
2750 }
2751
2752 Changed = true;
2753 }
2754
2755 return Changed;
2756}
2757
2758bool SIFoldOperandsImpl::run(MachineFunction &MF) {
2759 this->MF = &MF;
2760 MRI = &MF.getRegInfo();
2761 ST = &MF.getSubtarget<GCNSubtarget>();
2762 TII = ST->getInstrInfo();
2763 TRI = &TII->getRegisterInfo();
2764 MFI = MF.getInfo<SIMachineFunctionInfo>();
2765
2766 // omod is ignored by hardware if IEEE bit is enabled. omod also does not
2767 // correctly handle signed zeros.
2768 //
2769 // FIXME: Also need to check strictfp
2770 bool IsIEEEMode = MFI->getMode().IEEE;
2771 bool HasNSZ = MFI->hasNoSignedZerosFPMath();
2772
2773 bool Changed = false;
2774 for (MachineBasicBlock *MBB : depth_first(&MF)) {
2775 MachineOperand *CurrentKnownM0Val = nullptr;
2776 for (auto &MI : make_early_inc_range(*MBB)) {
2777 Changed |= tryFoldCndMask(MI);
2778
2779 if (tryFoldZeroHighBits(MI)) {
2780 Changed = true;
2781 continue;
2782 }
2783
2784 if (MI.isRegSequence() && tryFoldRegSequence(MI)) {
2785 Changed = true;
2786 continue;
2787 }
2788
2789 if (MI.isPHI() && tryFoldPhiAGPR(MI)) {
2790 Changed = true;
2791 continue;
2792 }
2793
2794 if (MI.mayLoad() && tryFoldLoad(MI)) {
2795 Changed = true;
2796 continue;
2797 }
2798
2799 if (TII->isFoldableCopy(MI)) {
2800 Changed |= tryFoldFoldableCopy(MI, CurrentKnownM0Val);
2801 continue;
2802 }
2803
2804 // Saw an unknown clobber of m0, so we no longer know what it is.
2805 if (CurrentKnownM0Val && MI.modifiesRegister(AMDGPU::M0, TRI))
2806 CurrentKnownM0Val = nullptr;
2807
2808 // TODO: Omod might be OK if there is NSZ only on the source
2809 // instruction, and not the omod multiply.
2810 if (IsIEEEMode || (!HasNSZ && !MI.getFlag(MachineInstr::FmNsz)) ||
2811 !tryFoldOMod(MI))
2812 Changed |= tryFoldClamp(MI);
2813 }
2814
2815 Changed |= tryOptimizeAGPRPhis(*MBB);
2816 }
2817
2818 return Changed;
2819}
2820
2821PreservedAnalyses SIFoldOperandsPass::run(MachineFunction &MF,
2822                                          MachineFunctionAnalysisManager &MFAM) {
2823  MFPropsModifier _(*this, MF);
2824
2825 bool Changed = SIFoldOperandsImpl().run(MF);
2826 if (!Changed) {
2827 return PreservedAnalyses::all();
2828 }
2829  PreservedAnalyses PA = getMachineFunctionPassPreservedAnalyses();
2830  PA.preserveSet<CFGAnalyses>();
2831 return PA;
2832}