LLVM 23.0.0git
SIFoldOperands.cpp
Go to the documentation of this file.
1//===-- SIFoldOperands.cpp - Fold operands --- ----------------------------===//
2//
3// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4// See https://llvm.org/LICENSE.txt for license information.
5// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6//
7/// \file
8//===----------------------------------------------------------------------===//
9//
10
11#include "SIFoldOperands.h"
12#include "AMDGPU.h"
13#include "GCNSubtarget.h"
15#include "SIInstrInfo.h"
17#include "SIRegisterInfo.h"
22
23#define DEBUG_TYPE "si-fold-operands"
24using namespace llvm;
25
26namespace {
27
28/// Track a value we may want to fold into downstream users, applying
29/// subregister extracts along the way.
30struct FoldableDef {
31 union {
32 MachineOperand *OpToFold = nullptr;
33 uint64_t ImmToFold;
34 int FrameIndexToFold;
35 };
36
37 /// Register class of the originally defined value.
38 const TargetRegisterClass *DefRC = nullptr;
39
40 /// Track the original defining instruction for the value.
41 const MachineInstr *DefMI = nullptr;
42
43 /// Subregister to apply to the value at the use point.
44 unsigned DefSubReg = AMDGPU::NoSubRegister;
45
46 /// Kind of value stored in the union.
48
49 FoldableDef() = delete;
50 FoldableDef(MachineOperand &FoldOp, const TargetRegisterClass *DefRC,
51 unsigned DefSubReg = AMDGPU::NoSubRegister)
52 : DefRC(DefRC), DefSubReg(DefSubReg), Kind(FoldOp.getType()) {
53
54 if (FoldOp.isImm()) {
55 ImmToFold = FoldOp.getImm();
56 } else if (FoldOp.isFI()) {
57 FrameIndexToFold = FoldOp.getIndex();
58 } else {
59 assert(FoldOp.isReg() || FoldOp.isGlobal());
60 OpToFold = &FoldOp;
61 }
62
63 DefMI = FoldOp.getParent();
64 }
65
66 FoldableDef(int64_t FoldImm, const TargetRegisterClass *DefRC,
67 unsigned DefSubReg = AMDGPU::NoSubRegister)
68 : ImmToFold(FoldImm), DefRC(DefRC), DefSubReg(DefSubReg),
70
71 /// Copy the current def and apply \p SubReg to the value.
72 FoldableDef getWithSubReg(const SIRegisterInfo &TRI, unsigned SubReg) const {
73 FoldableDef Copy(*this);
74 Copy.DefSubReg = TRI.composeSubRegIndices(DefSubReg, SubReg);
75 return Copy;
76 }
77
78 bool isReg() const { return Kind == MachineOperand::MO_Register; }
79
80 Register getReg() const {
81 assert(isReg());
82 return OpToFold->getReg();
83 }
84
85 unsigned getSubReg() const {
86 assert(isReg());
87 return OpToFold->getSubReg();
88 }
89
90 bool isImm() const { return Kind == MachineOperand::MO_Immediate; }
91
92 bool isFI() const {
93 return Kind == MachineOperand::MO_FrameIndex;
94 }
95
96 int getFI() const {
97 assert(isFI());
98 return FrameIndexToFold;
99 }
100
101 bool isGlobal() const { return Kind == MachineOperand::MO_GlobalAddress; }
102
103 /// Return the effective immediate value defined by this instruction, after
104 /// application of any subregister extracts which may exist between the use
105 /// and def instruction.
106 std::optional<int64_t> getEffectiveImmVal() const {
107 assert(isImm());
108 return SIInstrInfo::extractSubregFromImm(ImmToFold, DefSubReg);
109 }
110
111 /// Check if it is legal to fold this effective value into \p MI's \p OpNo
112 /// operand.
113 bool isOperandLegal(const SIInstrInfo &TII, const MachineInstr &MI,
114 unsigned OpIdx) const {
115 switch (Kind) {
117 std::optional<int64_t> ImmToFold = getEffectiveImmVal();
118 if (!ImmToFold)
119 return false;
120
121 // TODO: Should verify the subregister index is supported by the class
122 // TODO: Avoid the temporary MachineOperand
123 MachineOperand TmpOp = MachineOperand::CreateImm(*ImmToFold);
124 return TII.isOperandLegal(MI, OpIdx, &TmpOp);
125 }
127 if (DefSubReg != AMDGPU::NoSubRegister)
128 return false;
129 MachineOperand TmpOp = MachineOperand::CreateFI(FrameIndexToFold);
130 return TII.isOperandLegal(MI, OpIdx, &TmpOp);
131 }
132 default:
133 // TODO: Try to apply DefSubReg, for global address we can extract
134 // low/high.
135 if (DefSubReg != AMDGPU::NoSubRegister)
136 return false;
137 return TII.isOperandLegal(MI, OpIdx, OpToFold);
138 }
139
140 llvm_unreachable("covered MachineOperand kind switch");
141 }
142};
143
144struct FoldCandidate {
146 FoldableDef Def;
147 int ShrinkOpcode;
148 unsigned UseOpNo;
149 bool Commuted;
150
151 FoldCandidate(MachineInstr *MI, unsigned OpNo, FoldableDef Def,
152 bool Commuted = false, int ShrinkOp = -1)
153 : UseMI(MI), Def(Def), ShrinkOpcode(ShrinkOp), UseOpNo(OpNo),
154 Commuted(Commuted) {}
155
156 bool isFI() const { return Def.isFI(); }
157
158 int getFI() const {
159 assert(isFI());
160 return Def.FrameIndexToFold;
161 }
162
163 bool isImm() const { return Def.isImm(); }
164
165 bool isReg() const { return Def.isReg(); }
166
167 Register getReg() const { return Def.getReg(); }
168
169 bool isGlobal() const { return Def.isGlobal(); }
170
171 bool needsShrink() const { return ShrinkOpcode != -1; }
172};
173
174class SIFoldOperandsImpl {
175public:
176 MachineFunction *MF;
178 const SIInstrInfo *TII;
179 const SIRegisterInfo *TRI;
180 const GCNSubtarget *ST;
181 const SIMachineFunctionInfo *MFI;
182
183 bool frameIndexMayFold(const MachineInstr &UseMI, int OpNo,
184 const FoldableDef &OpToFold) const;
185
186 // TODO: Just use TII::getVALUOp
187 unsigned convertToVALUOp(unsigned Opc, bool UseVOP3 = false) const {
188 switch (Opc) {
189 case AMDGPU::S_ADD_I32: {
190 if (ST->hasAddNoCarryInsts())
191 return UseVOP3 ? AMDGPU::V_ADD_U32_e64 : AMDGPU::V_ADD_U32_e32;
192 return UseVOP3 ? AMDGPU::V_ADD_CO_U32_e64 : AMDGPU::V_ADD_CO_U32_e32;
193 }
194 case AMDGPU::S_OR_B32:
195 return UseVOP3 ? AMDGPU::V_OR_B32_e64 : AMDGPU::V_OR_B32_e32;
196 case AMDGPU::S_AND_B32:
197 return UseVOP3 ? AMDGPU::V_AND_B32_e64 : AMDGPU::V_AND_B32_e32;
198 case AMDGPU::S_MUL_I32:
199 return AMDGPU::V_MUL_LO_U32_e64;
200 default:
201 return AMDGPU::INSTRUCTION_LIST_END;
202 }
203 }
204
205 bool foldCopyToVGPROfScalarAddOfFrameIndex(Register DstReg, Register SrcReg,
206 MachineInstr &MI) const;
207
208 bool updateOperand(FoldCandidate &Fold) const;
209
210 bool canUseImmWithOpSel(const MachineInstr *MI, unsigned UseOpNo,
211 int64_t ImmVal) const;
212
213 /// Try to fold immediate \p ImmVal into \p MI's operand at index \p UseOpNo.
214 bool tryFoldImmWithOpSel(MachineInstr *MI, unsigned UseOpNo,
215 int64_t ImmVal) const;
216
217 bool tryAddToFoldList(SmallVectorImpl<FoldCandidate> &FoldList,
218 MachineInstr *MI, unsigned OpNo,
219 const FoldableDef &OpToFold) const;
220 bool isUseSafeToFold(const MachineInstr &MI,
221 const MachineOperand &UseMO) const;
222
223 const TargetRegisterClass *getRegSeqInit(
224 MachineInstr &RegSeq,
225 SmallVectorImpl<std::pair<MachineOperand *, unsigned>> &Defs) const;
226
227 const TargetRegisterClass *
228 getRegSeqInit(SmallVectorImpl<std::pair<MachineOperand *, unsigned>> &Defs,
229 Register UseReg) const;
230
231 std::pair<int64_t, const TargetRegisterClass *>
232 isRegSeqSplat(MachineInstr &RegSeg) const;
233
234 bool tryFoldRegSeqSplat(MachineInstr *UseMI, unsigned UseOpIdx,
235 int64_t SplatVal,
236 const TargetRegisterClass *SplatRC) const;
237
238 bool tryToFoldACImm(const FoldableDef &OpToFold, MachineInstr *UseMI,
239 unsigned UseOpIdx,
240 SmallVectorImpl<FoldCandidate> &FoldList) const;
241 void foldOperand(FoldableDef OpToFold, MachineInstr *UseMI, int UseOpIdx,
243 SmallVectorImpl<MachineInstr *> &CopiesToReplace) const;
244
245 bool tryConstantFoldOp(MachineInstr *MI) const;
246 bool tryFoldCndMask(MachineInstr &MI) const;
247 bool tryFoldZeroHighBits(MachineInstr &MI) const;
248 bool foldInstOperand(MachineInstr &MI, const FoldableDef &OpToFold) const;
249
250 bool foldCopyToAGPRRegSequence(MachineInstr *CopyMI) const;
251 bool tryFoldFoldableCopy(MachineInstr &MI,
252 MachineOperand *&CurrentKnownM0Val) const;
253
254 const MachineOperand *isClamp(const MachineInstr &MI) const;
255 bool tryFoldClamp(MachineInstr &MI);
256
257 std::pair<const MachineOperand *, int> isOMod(const MachineInstr &MI) const;
258 bool tryFoldOMod(MachineInstr &MI);
259 bool tryFoldRegSequence(MachineInstr &MI);
260 bool tryFoldPhiAGPR(MachineInstr &MI);
261 bool tryFoldLoad(MachineInstr &MI);
262
263 bool tryOptimizeAGPRPhis(MachineBasicBlock &MBB);
264
265public:
266 SIFoldOperandsImpl() = default;
267
268 bool run(MachineFunction &MF);
269};
270
271class SIFoldOperandsLegacy : public MachineFunctionPass {
272public:
273 static char ID;
274
275 SIFoldOperandsLegacy() : MachineFunctionPass(ID) {}
276
277 bool runOnMachineFunction(MachineFunction &MF) override {
278 if (skipFunction(MF.getFunction()))
279 return false;
280 return SIFoldOperandsImpl().run(MF);
281 }
282
283 StringRef getPassName() const override { return "SI Fold Operands"; }
284
285 void getAnalysisUsage(AnalysisUsage &AU) const override {
286 AU.setPreservesCFG();
288 }
289
290 MachineFunctionProperties getRequiredProperties() const override {
291 return MachineFunctionProperties().setIsSSA();
292 }
293};
294
295} // End anonymous namespace.
296
// Register the legacy pass under DEBUG_TYPE with the display name
// "SI Fold Operands".
297INITIALIZE_PASS(SIFoldOperandsLegacy, DEBUG_TYPE, "SI Fold Operands", false,
298 false)
299
// Storage for the pass ID handed to the MachineFunctionPass constructor.
300char SIFoldOperandsLegacy::ID = 0;
301
// Reference to the pass ID exposed in the llvm namespace.
302char &llvm::SIFoldOperandsLegacyID = SIFoldOperandsLegacy::ID;
303
306 const MachineOperand &MO) {
307 const TargetRegisterClass *RC = MRI.getRegClass(MO.getReg());
308 if (const TargetRegisterClass *SubRC =
309 TRI.getSubRegisterClass(RC, MO.getSubReg()))
310 RC = SubRC;
311 return RC;
312}
313
314// Map multiply-accumulate opcode to corresponding multiply-add opcode if any.
315static unsigned macToMad(unsigned Opc) {
316 switch (Opc) {
317 case AMDGPU::V_MAC_F32_e64:
318 return AMDGPU::V_MAD_F32_e64;
319 case AMDGPU::V_MAC_F16_e64:
320 return AMDGPU::V_MAD_F16_e64;
321 case AMDGPU::V_FMAC_F32_e64:
322 return AMDGPU::V_FMA_F32_e64;
323 case AMDGPU::V_FMAC_F16_e64:
324 return AMDGPU::V_FMA_F16_gfx9_e64;
325 case AMDGPU::V_FMAC_F16_t16_e64:
326 return AMDGPU::V_FMA_F16_gfx9_t16_e64;
327 case AMDGPU::V_FMAC_F16_fake16_e64:
328 return AMDGPU::V_FMA_F16_gfx9_fake16_e64;
329 case AMDGPU::V_FMAC_LEGACY_F32_e64:
330 return AMDGPU::V_FMA_LEGACY_F32_e64;
331 case AMDGPU::V_FMAC_F64_e64:
332 return AMDGPU::V_FMA_F64_e64;
333 }
334 return AMDGPU::INSTRUCTION_LIST_END;
335}
336
337// TODO: Add heuristic that the frame index might not fit in the addressing mode
338// immediate offset to avoid materializing in loops.
339bool SIFoldOperandsImpl::frameIndexMayFold(const MachineInstr &UseMI, int OpNo,
340 const FoldableDef &OpToFold) const {
341 if (!OpToFold.isFI())
342 return false;
343
344 const unsigned Opc = UseMI.getOpcode();
345 switch (Opc) {
346 case AMDGPU::S_ADD_I32:
347 case AMDGPU::S_ADD_U32:
348 case AMDGPU::V_ADD_U32_e32:
349 case AMDGPU::V_ADD_CO_U32_e32:
350 // TODO: Possibly relax hasOneUse. It matters more for mubuf, since we have
351 // to insert the wave size shift at every point we use the index.
352 // TODO: Fix depending on visit order to fold immediates into the operand
353 return UseMI.getOperand(OpNo == 1 ? 2 : 1).isImm() &&
354 MRI->hasOneNonDBGUse(UseMI.getOperand(OpNo).getReg());
355 case AMDGPU::V_ADD_U32_e64:
356 case AMDGPU::V_ADD_CO_U32_e64:
357 return UseMI.getOperand(OpNo == 2 ? 3 : 2).isImm() &&
358 MRI->hasOneNonDBGUse(UseMI.getOperand(OpNo).getReg());
359 default:
360 break;
361 }
362
363 if (TII->isMUBUF(UseMI))
364 return OpNo == AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::vaddr);
365 if (!TII->isFLATScratch(UseMI))
366 return false;
367
368 int SIdx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::saddr);
369 if (OpNo == SIdx)
370 return true;
371
372 int VIdx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::vaddr);
373 return OpNo == VIdx && SIdx == -1;
374}
375
376/// Fold %vgpr = COPY (S_ADD_I32 x, frameindex)
377///
378/// => %vgpr = V_ADD_U32 x, frameindex
379bool SIFoldOperandsImpl::foldCopyToVGPROfScalarAddOfFrameIndex(
380 Register DstReg, Register SrcReg, MachineInstr &MI) const {
381 if (TRI->isVGPR(*MRI, DstReg) && TRI->isSGPRReg(*MRI, SrcReg) &&
382 MRI->hasOneNonDBGUse(SrcReg)) {
383 MachineInstr *Def = MRI->getVRegDef(SrcReg);
384 if (!Def || Def->getNumOperands() != 4)
385 return false;
386
387 MachineOperand *Src0 = &Def->getOperand(1);
388 MachineOperand *Src1 = &Def->getOperand(2);
389
390 // TODO: This is profitable with more operand types, and for more
391 // opcodes. But ultimately this is working around poor / nonexistent
392 // regbankselect.
393 if (!Src0->isFI() && !Src1->isFI())
394 return false;
395
396 if (Src0->isFI())
397 std::swap(Src0, Src1);
398
399 const bool UseVOP3 = !Src0->isImm() || TII->isInlineConstant(*Src0);
400 unsigned NewOp = convertToVALUOp(Def->getOpcode(), UseVOP3);
401 if (NewOp == AMDGPU::INSTRUCTION_LIST_END ||
402 !Def->getOperand(3).isDead()) // Check if scc is dead
403 return false;
404
405 MachineBasicBlock *MBB = Def->getParent();
406 const DebugLoc &DL = Def->getDebugLoc();
407 if (NewOp != AMDGPU::V_ADD_CO_U32_e32) {
408 MachineInstrBuilder Add =
409 BuildMI(*MBB, *Def, DL, TII->get(NewOp), DstReg);
410
411 if (Add->getDesc().getNumDefs() == 2) {
412 Register CarryOutReg = MRI->createVirtualRegister(TRI->getBoolRC());
413 Add.addDef(CarryOutReg, RegState::Dead);
414 MRI->setRegAllocationHint(CarryOutReg, 0, TRI->getVCC());
415 }
416
417 Add.add(*Src0).add(*Src1).setMIFlags(Def->getFlags());
418 if (AMDGPU::hasNamedOperand(NewOp, AMDGPU::OpName::clamp))
419 Add.addImm(0);
420
421 Def->eraseFromParent();
422 MI.eraseFromParent();
423 return true;
424 }
425
426 assert(NewOp == AMDGPU::V_ADD_CO_U32_e32);
427
429 MBB->computeRegisterLiveness(TRI, AMDGPU::VCC, *Def, 16);
430 if (Liveness == MachineBasicBlock::LQR_Dead) {
431 // TODO: If src1 satisfies operand constraints, use vop3 version.
432 BuildMI(*MBB, *Def, DL, TII->get(NewOp), DstReg)
433 .add(*Src0)
434 .add(*Src1)
435 .setOperandDead(3) // implicit-def $vcc
436 .setMIFlags(Def->getFlags());
437 Def->eraseFromParent();
438 MI.eraseFromParent();
439 return true;
440 }
441 }
442
443 return false;
444}
445
447 return new SIFoldOperandsLegacy();
448}
449
// Check whether immediate ImmVal may be considered for folding into operand
// UseOpNo of MI by way of op_sel. From the visible code: operand types not
// handled by the switch are rejected, and for handled types the immediate is
// rejected when its low and high 16-bit halves differ (under a further
// condition that is not visible here).
//
// NOTE(review): this listing is incomplete. The embedded source numbering
// skips 453-455, 466-472 and 475, so the condition guarding the first
// `return false`, the `case` labels of the switch, and the opening of the
// `if` inside the switch are missing. Restore them from the upstream file
// before relying on this block.
450bool SIFoldOperandsImpl::canUseImmWithOpSel(const MachineInstr *MI,
451 unsigned UseOpNo,
452 int64_t ImmVal) const {
// NOTE(review): the condition for this early exit is missing (453-455).
456 return false;
457
458 const MachineOperand &Old = MI->getOperand(UseOpNo);
459 int OpNo = MI->getOperandNo(&Old);
460
461 unsigned Opcode = MI->getOpcode();
462 uint8_t OpType = TII->get(Opcode).operands()[OpNo].OperandType;
463 switch (OpType) {
464 default:
465 return false;
// NOTE(review): the accepted operand-type `case` labels are missing (466-472).
473 // VOP3 packed instructions ignore op_sel source modifiers, we cannot encode
474 // two different constants.
// NOTE(review): the first line of this `if` condition is missing (475).
476 static_cast<uint16_t>(ImmVal) != static_cast<uint16_t>(ImmVal >> 16))
477 return false;
478 break;
479 }
480
481 return true;
482}
483
484bool SIFoldOperandsImpl::tryFoldImmWithOpSel(MachineInstr *MI, unsigned UseOpNo,
485 int64_t ImmVal) const {
486 MachineOperand &Old = MI->getOperand(UseOpNo);
487 unsigned Opcode = MI->getOpcode();
488 int OpNo = MI->getOperandNo(&Old);
489 uint8_t OpType = TII->get(Opcode).operands()[OpNo].OperandType;
490
491 // If the literal can be inlined as-is, apply it and short-circuit the
492 // tests below. The main motivation for this is to avoid unintuitive
493 // uses of opsel.
494 if (AMDGPU::isInlinableLiteralV216(ImmVal, OpType)) {
495 Old.ChangeToImmediate(ImmVal);
496 return true;
497 }
498
499 // Refer to op_sel/op_sel_hi and check if we can change the immediate and
500 // op_sel in a way that allows an inline constant.
501 AMDGPU::OpName ModName = AMDGPU::OpName::NUM_OPERAND_NAMES;
502 unsigned SrcIdx = ~0;
503 if (OpNo == AMDGPU::getNamedOperandIdx(Opcode, AMDGPU::OpName::src0)) {
504 ModName = AMDGPU::OpName::src0_modifiers;
505 SrcIdx = 0;
506 } else if (OpNo == AMDGPU::getNamedOperandIdx(Opcode, AMDGPU::OpName::src1)) {
507 ModName = AMDGPU::OpName::src1_modifiers;
508 SrcIdx = 1;
509 } else if (OpNo == AMDGPU::getNamedOperandIdx(Opcode, AMDGPU::OpName::src2)) {
510 ModName = AMDGPU::OpName::src2_modifiers;
511 SrcIdx = 2;
512 }
513 assert(ModName != AMDGPU::OpName::NUM_OPERAND_NAMES);
514 int ModIdx = AMDGPU::getNamedOperandIdx(Opcode, ModName);
515 MachineOperand &Mod = MI->getOperand(ModIdx);
516 unsigned ModVal = Mod.getImm();
517
518 uint16_t ImmLo =
519 static_cast<uint16_t>(ImmVal >> (ModVal & SISrcMods::OP_SEL_0 ? 16 : 0));
520 uint16_t ImmHi =
521 static_cast<uint16_t>(ImmVal >> (ModVal & SISrcMods::OP_SEL_1 ? 16 : 0));
522 uint32_t Imm = (static_cast<uint32_t>(ImmHi) << 16) | ImmLo;
523 unsigned NewModVal = ModVal & ~(SISrcMods::OP_SEL_0 | SISrcMods::OP_SEL_1);
524
525 // Helper function that attempts to inline the given value with a newly
526 // chosen opsel pattern.
527 auto tryFoldToInline = [&](uint32_t Imm) -> bool {
528 if (AMDGPU::isInlinableLiteralV216(Imm, OpType)) {
529 Mod.setImm(NewModVal | SISrcMods::OP_SEL_1);
530 Old.ChangeToImmediate(Imm);
531 return true;
532 }
533
534 // Try to shuffle the halves around and leverage opsel to get an inline
535 // constant.
536 uint16_t Lo = static_cast<uint16_t>(Imm);
537 uint16_t Hi = static_cast<uint16_t>(Imm >> 16);
538 if (Lo == Hi) {
539 if (AMDGPU::isInlinableLiteralV216(Lo, OpType)) {
540 Mod.setImm(NewModVal);
542 return true;
543 }
544
545 if (static_cast<int16_t>(Lo) < 0) {
546 int32_t SExt = static_cast<int16_t>(Lo);
547 if (AMDGPU::isInlinableLiteralV216(SExt, OpType)) {
548 Mod.setImm(NewModVal);
549 Old.ChangeToImmediate(SExt);
550 return true;
551 }
552 }
553
554 // This check is only useful for integer instructions
555 if (OpType == AMDGPU::OPERAND_REG_IMM_V2INT16) {
556 if (AMDGPU::isInlinableLiteralV216(Lo << 16, OpType)) {
557 Mod.setImm(NewModVal | SISrcMods::OP_SEL_0 | SISrcMods::OP_SEL_1);
558 Old.ChangeToImmediate(static_cast<uint32_t>(Lo) << 16);
559 return true;
560 }
561 }
562 } else {
563 uint32_t Swapped = (static_cast<uint32_t>(Lo) << 16) | Hi;
564 if (AMDGPU::isInlinableLiteralV216(Swapped, OpType)) {
565 Mod.setImm(NewModVal | SISrcMods::OP_SEL_0);
566 Old.ChangeToImmediate(Swapped);
567 return true;
568 }
569 }
570
571 return false;
572 };
573
574 if (tryFoldToInline(Imm))
575 return true;
576
577 // Replace integer addition by subtraction and vice versa if it allows
578 // folding the immediate to an inline constant.
579 //
580 // We should only ever get here for SrcIdx == 1 due to canonicalization
581 // earlier in the pipeline, but we double-check here to be safe / fully
582 // general.
583 bool IsUAdd = Opcode == AMDGPU::V_PK_ADD_U16;
584 bool IsUSub = Opcode == AMDGPU::V_PK_SUB_U16;
585 if (SrcIdx == 1 && (IsUAdd || IsUSub)) {
586 unsigned ClampIdx =
587 AMDGPU::getNamedOperandIdx(Opcode, AMDGPU::OpName::clamp);
588 bool Clamp = MI->getOperand(ClampIdx).getImm() != 0;
589
590 if (!Clamp) {
591 uint16_t NegLo = -static_cast<uint16_t>(Imm);
592 uint16_t NegHi = -static_cast<uint16_t>(Imm >> 16);
593 uint32_t NegImm = (static_cast<uint32_t>(NegHi) << 16) | NegLo;
594
595 if (tryFoldToInline(NegImm)) {
596 unsigned NegOpcode =
597 IsUAdd ? AMDGPU::V_PK_SUB_U16 : AMDGPU::V_PK_ADD_U16;
598 MI->setDesc(TII->get(NegOpcode));
599 return true;
600 }
601 }
602 }
603
604 return false;
605}
606
607bool SIFoldOperandsImpl::updateOperand(FoldCandidate &Fold) const {
608 MachineInstr *MI = Fold.UseMI;
609 MachineOperand &Old = MI->getOperand(Fold.UseOpNo);
610 assert(Old.isReg());
611
612 std::optional<int64_t> ImmVal;
613 if (Fold.isImm())
614 ImmVal = Fold.Def.getEffectiveImmVal();
615
616 if (ImmVal && canUseImmWithOpSel(Fold.UseMI, Fold.UseOpNo, *ImmVal)) {
617 if (tryFoldImmWithOpSel(Fold.UseMI, Fold.UseOpNo, *ImmVal))
618 return true;
619
620 // We can't represent the candidate as an inline constant. Try as a literal
621 // with the original opsel, checking constant bus limitations.
622 MachineOperand New = MachineOperand::CreateImm(*ImmVal);
623 int OpNo = MI->getOperandNo(&Old);
624 if (!TII->isOperandLegal(*MI, OpNo, &New))
625 return false;
626 Old.ChangeToImmediate(*ImmVal);
627 return true;
628 }
629
630 if ((Fold.isImm() || Fold.isFI() || Fold.isGlobal()) && Fold.needsShrink()) {
631 MachineBasicBlock *MBB = MI->getParent();
632 auto Liveness = MBB->computeRegisterLiveness(TRI, AMDGPU::VCC, MI, 16);
633 if (Liveness != MachineBasicBlock::LQR_Dead) {
634 LLVM_DEBUG(dbgs() << "Not shrinking " << MI << " due to vcc liveness\n");
635 return false;
636 }
637
638 int Op32 = Fold.ShrinkOpcode;
639 MachineOperand &Dst0 = MI->getOperand(0);
640 MachineOperand &Dst1 = MI->getOperand(1);
641 assert(Dst0.isDef() && Dst1.isDef());
642
643 bool HaveNonDbgCarryUse = !MRI->use_nodbg_empty(Dst1.getReg());
644
645 const TargetRegisterClass *Dst0RC = MRI->getRegClass(Dst0.getReg());
646 Register NewReg0 = MRI->createVirtualRegister(Dst0RC);
647
648 MachineInstr *Inst32 = TII->buildShrunkInst(*MI, Op32);
649
650 if (HaveNonDbgCarryUse) {
651 BuildMI(*MBB, MI, MI->getDebugLoc(), TII->get(AMDGPU::COPY),
652 Dst1.getReg())
653 .addReg(AMDGPU::VCC, RegState::Kill);
654 }
655
656 // Keep the old instruction around to avoid breaking iterators, but
657 // replace it with a dummy instruction to remove uses.
658 //
659 // FIXME: We should not invert how this pass looks at operands to avoid
660 // this. Should track set of foldable movs instead of looking for uses
661 // when looking at a use.
662 Dst0.setReg(NewReg0);
663 for (unsigned I = MI->getNumOperands() - 1; I > 0; --I)
664 MI->removeOperand(I);
665 MI->setDesc(TII->get(AMDGPU::IMPLICIT_DEF));
666
667 if (Fold.Commuted)
668 TII->commuteInstruction(*Inst32, false);
669 return true;
670 }
671
672 assert(!Fold.needsShrink() && "not handled");
673
674 if (ImmVal) {
675 if (Old.isTied()) {
676 int NewMFMAOpc = AMDGPU::getMFMAEarlyClobberOp(MI->getOpcode());
677 if (NewMFMAOpc == -1)
678 return false;
679 MI->setDesc(TII->get(NewMFMAOpc));
680 MI->untieRegOperand(0);
681 const MCInstrDesc &MCID = MI->getDesc();
682 for (unsigned I = 0; I < MI->getNumDefs(); ++I)
684 MI->getOperand(I).setIsEarlyClobber(true);
685 }
686
687 // TODO: Should we try to avoid adding this to the candidate list?
688 MachineOperand New = MachineOperand::CreateImm(*ImmVal);
689 int OpNo = MI->getOperandNo(&Old);
690 if (!TII->isOperandLegal(*MI, OpNo, &New))
691 return false;
692
693 Old.ChangeToImmediate(*ImmVal);
694 return true;
695 }
696
697 if (Fold.isGlobal()) {
698 Old.ChangeToGA(Fold.Def.OpToFold->getGlobal(),
699 Fold.Def.OpToFold->getOffset(),
700 Fold.Def.OpToFold->getTargetFlags());
701 return true;
702 }
703
704 if (Fold.isFI()) {
705 Old.ChangeToFrameIndex(Fold.getFI());
706 return true;
707 }
708
709 MachineOperand *New = Fold.Def.OpToFold;
710
711 // Verify the register is compatible with the operand.
712 if (const TargetRegisterClass *OpRC =
713 TII->getRegClass(MI->getDesc(), Fold.UseOpNo)) {
714 const TargetRegisterClass *NewRC =
715 TRI->getRegClassForReg(*MRI, New->getReg());
716
717 const TargetRegisterClass *ConstrainRC = OpRC;
718 if (New->getSubReg()) {
719 ConstrainRC =
720 TRI->getMatchingSuperRegClass(NewRC, OpRC, New->getSubReg());
721
722 if (!ConstrainRC)
723 return false;
724 }
725
726 if (New->getReg().isVirtual() &&
727 !MRI->constrainRegClass(New->getReg(), ConstrainRC)) {
728 LLVM_DEBUG(dbgs() << "Cannot constrain " << printReg(New->getReg(), TRI)
729 << TRI->getRegClassName(ConstrainRC) << '\n');
730 return false;
731 }
732 }
733
734 // Rework once the VS_16 register class is updated to include proper
735 // 16-bit SGPRs instead of 32-bit ones.
736 if (Old.getSubReg() == AMDGPU::lo16 && TRI->isSGPRReg(*MRI, New->getReg()))
737 Old.setSubReg(AMDGPU::NoSubRegister);
738 if (New->getReg().isPhysical()) {
739 Old.substPhysReg(New->getReg(), *TRI);
740 } else {
741 Register OldReg = Old.getReg();
742 Old.substVirtReg(New->getReg(), New->getSubReg(), *TRI);
743 Old.setIsUndef(New->isUndef());
744
745 // If MI is in a BUNDLE, also update header's matching implicit use.
746 if (MI->isBundledWithPred()) {
747 MachineInstr &Header = *getBundleStart(MI->getIterator());
748 for (MachineOperand &MO : Header.operands()) {
749 if (MO.getReg() == OldReg) {
750 MO.setReg(New->getReg());
751 MO.setSubReg(New->getSubReg());
752 }
753 }
754 }
755 }
756 return true;
757}
758
760 FoldCandidate &&Entry) {
761 // Skip additional folding on the same operand.
762 for (FoldCandidate &Fold : FoldList)
763 if (Fold.UseMI == Entry.UseMI && Fold.UseOpNo == Entry.UseOpNo)
764 return;
765 LLVM_DEBUG(dbgs() << "Append " << (Entry.Commuted ? "commuted" : "normal")
766 << " operand " << Entry.UseOpNo << "\n " << *Entry.UseMI);
767 FoldList.push_back(Entry);
768}
769
771 MachineInstr *MI, unsigned OpNo,
772 const FoldableDef &FoldOp,
773 bool Commuted = false, int ShrinkOp = -1) {
774 appendFoldCandidate(FoldList,
775 FoldCandidate(MI, OpNo, FoldOp, Commuted, ShrinkOp));
776}
777
778// Returns true if the instruction is a packed F32 instruction and the
779// corresponding scalar operand reads 32 bits and replicates the bits to both
780// channels.
// NOTE(review): this listing is incomplete. The embedded source numbering
// skips 781 (the start of the signature, including the function name and
// return type) and 786 (the statement after OpDesc is fetched, presumably
// the final return). Restore both from the upstream file.
782 const GCNSubtarget *ST, MachineInstr *MI, unsigned OpNo) {
// Subtargets without this behavior never match.
783 if (!ST->hasPKF32InstsReplicatingLower32BitsOfScalarInput())
784 return false;
785 const MCOperandInfo &OpDesc = MI->getDesc().operands()[OpNo];
787}
788
789// Packed FP32 instructions only read 32 bits from a scalar operand (SGPR or
790// literal) and replicates the bits to both channels. Therefore, if the hi and
791// lo are not same, we can't fold it.
793 const FoldableDef &OpToFold) {
794 assert(OpToFold.isImm() && "Expected immediate operand");
795 uint64_t ImmVal = OpToFold.getEffectiveImmVal().value();
796 uint32_t Lo = Lo_32(ImmVal);
797 uint32_t Hi = Hi_32(ImmVal);
798 return Lo == Hi;
799}
800
801bool SIFoldOperandsImpl::tryAddToFoldList(
802 SmallVectorImpl<FoldCandidate> &FoldList, MachineInstr *MI, unsigned OpNo,
803 const FoldableDef &OpToFold) const {
804 const unsigned Opc = MI->getOpcode();
805
806 auto tryToFoldAsFMAAKorMK = [&]() {
807 if (!OpToFold.isImm())
808 return false;
809
810 const bool TryAK = OpNo == 3;
811 const unsigned NewOpc = TryAK ? AMDGPU::S_FMAAK_F32 : AMDGPU::S_FMAMK_F32;
812 MI->setDesc(TII->get(NewOpc));
813
814 // We have to fold into operand which would be Imm not into OpNo.
815 bool FoldAsFMAAKorMK =
816 tryAddToFoldList(FoldList, MI, TryAK ? 3 : 2, OpToFold);
817 if (FoldAsFMAAKorMK) {
818 // Untie Src2 of fmac.
819 MI->untieRegOperand(3);
820 // For fmamk swap operands 1 and 2 if OpToFold was meant for operand 1.
821 if (OpNo == 1) {
822 MachineOperand &Op1 = MI->getOperand(1);
823 MachineOperand &Op2 = MI->getOperand(2);
824 Register OldReg = Op1.getReg();
825 // Operand 2 might be an inlinable constant
826 if (Op2.isImm()) {
827 Op1.ChangeToImmediate(Op2.getImm());
828 Op2.ChangeToRegister(OldReg, false);
829 } else {
830 Op1.setReg(Op2.getReg());
831 Op2.setReg(OldReg);
832 }
833 }
834 return true;
835 }
836 MI->setDesc(TII->get(Opc));
837 return false;
838 };
839
840 bool IsLegal = OpToFold.isOperandLegal(*TII, *MI, OpNo);
841 if (!IsLegal && OpToFold.isImm()) {
842 if (std::optional<int64_t> ImmVal = OpToFold.getEffectiveImmVal())
843 IsLegal = canUseImmWithOpSel(MI, OpNo, *ImmVal);
844 }
845
846 if (!IsLegal) {
847 // Special case for v_mac_{f16, f32}_e64 if we are trying to fold into src2
848 unsigned NewOpc = macToMad(Opc);
849 if (NewOpc != AMDGPU::INSTRUCTION_LIST_END) {
850 // Check if changing this to a v_mad_{f16, f32} instruction will allow us
851 // to fold the operand.
852 MI->setDesc(TII->get(NewOpc));
853 bool AddOpSel = !AMDGPU::hasNamedOperand(Opc, AMDGPU::OpName::op_sel) &&
854 AMDGPU::hasNamedOperand(NewOpc, AMDGPU::OpName::op_sel);
855 if (AddOpSel)
856 MI->addOperand(MachineOperand::CreateImm(0));
857 bool FoldAsMAD = tryAddToFoldList(FoldList, MI, OpNo, OpToFold);
858 if (FoldAsMAD) {
859 MI->untieRegOperand(OpNo);
860 return true;
861 }
862 if (AddOpSel)
863 MI->removeOperand(MI->getNumExplicitOperands() - 1);
864 MI->setDesc(TII->get(Opc));
865 }
866
867 // Special case for s_fmac_f32 if we are trying to fold into Src2.
868 // By transforming into fmaak we can untie Src2 and make folding legal.
869 if (Opc == AMDGPU::S_FMAC_F32 && OpNo == 3) {
870 if (tryToFoldAsFMAAKorMK())
871 return true;
872 }
873
874 // Special case for s_setreg_b32
875 if (OpToFold.isImm()) {
876 unsigned ImmOpc = 0;
877 if (Opc == AMDGPU::S_SETREG_B32)
878 ImmOpc = AMDGPU::S_SETREG_IMM32_B32;
879 else if (Opc == AMDGPU::S_SETREG_B32_mode)
880 ImmOpc = AMDGPU::S_SETREG_IMM32_B32_mode;
881 if (ImmOpc) {
882 MI->setDesc(TII->get(ImmOpc));
883 appendFoldCandidate(FoldList, MI, OpNo, OpToFold);
884 return true;
885 }
886 }
887
888 // Operand is not legal, so try to commute the instruction to
889 // see if this makes it possible to fold.
890 unsigned CommuteOpNo = TargetInstrInfo::CommuteAnyOperandIndex;
891 bool CanCommute = TII->findCommutedOpIndices(*MI, OpNo, CommuteOpNo);
892 if (!CanCommute)
893 return false;
894
895 MachineOperand &Op = MI->getOperand(OpNo);
896 MachineOperand &CommutedOp = MI->getOperand(CommuteOpNo);
897
898 // One of operands might be an Imm operand, and OpNo may refer to it after
899 // the call of commuteInstruction() below. Such situations are avoided
900 // here explicitly as OpNo must be a register operand to be a candidate
901 // for memory folding.
902 if (!Op.isReg() || !CommutedOp.isReg())
903 return false;
904
905 // The same situation with an immediate could reproduce if both inputs are
906 // the same register.
907 if (Op.isReg() && CommutedOp.isReg() &&
908 (Op.getReg() == CommutedOp.getReg() &&
909 Op.getSubReg() == CommutedOp.getSubReg()))
910 return false;
911
912 if (!TII->commuteInstruction(*MI, false, OpNo, CommuteOpNo))
913 return false;
914
915 int Op32 = -1;
916 if (!OpToFold.isOperandLegal(*TII, *MI, CommuteOpNo)) {
917 if ((Opc != AMDGPU::V_ADD_CO_U32_e64 && Opc != AMDGPU::V_SUB_CO_U32_e64 &&
918 Opc != AMDGPU::V_SUBREV_CO_U32_e64) || // FIXME
919 (!OpToFold.isImm() && !OpToFold.isFI() && !OpToFold.isGlobal())) {
920 TII->commuteInstruction(*MI, false, OpNo, CommuteOpNo);
921 return false;
922 }
923
924 // Verify the other operand is a VGPR, otherwise we would violate the
925 // constant bus restriction.
926 MachineOperand &OtherOp = MI->getOperand(OpNo);
927 if (!OtherOp.isReg() ||
928 !TII->getRegisterInfo().isVGPR(*MRI, OtherOp.getReg()))
929 return false;
930
931 assert(MI->getOperand(1).isDef());
932
933 // Make sure to get the 32-bit version of the commuted opcode.
934 unsigned MaybeCommutedOpc = MI->getOpcode();
935 Op32 = AMDGPU::getVOPe32(MaybeCommutedOpc);
936 }
937
938 appendFoldCandidate(FoldList, MI, CommuteOpNo, OpToFold, /*Commuted=*/true,
939 Op32);
940 return true;
941 }
942
943 // Special case for s_fmac_f32 if we are trying to fold into Src0 or Src1.
944 // By changing into fmamk we can untie Src2.
945 // If folding for Src0 happens first and it is identical operand to Src1 we
946 // should avoid transforming into fmamk which requires commuting as it would
947 // cause folding into Src1 to fail later on due to wrong OpNo used.
948 if (Opc == AMDGPU::S_FMAC_F32 &&
949 (OpNo != 1 || !MI->getOperand(1).isIdenticalTo(MI->getOperand(2)))) {
950 if (tryToFoldAsFMAAKorMK())
951 return true;
952 }
953
954 // Special case for PK_F32 instructions if we are trying to fold an imm to
955 // src0 or src1.
956 if (OpToFold.isImm() &&
959 return false;
960
961 appendFoldCandidate(FoldList, MI, OpNo, OpToFold);
962 return true;
963}
964
965bool SIFoldOperandsImpl::isUseSafeToFold(const MachineInstr &MI,
966 const MachineOperand &UseMO) const {
967 // Operands of SDWA instructions must be registers.
968 return !TII->isSDWA(MI);
969}
970
972 const MachineRegisterInfo &MRI,
973 Register SrcReg) {
    // Follow the chain of foldable copies that defines SrcReg. The result is
    // the immediate source operand of a copy if one is reached, otherwise the
    // last virtual-register source operand seen, or nullptr when SrcReg is not
    // defined by a foldable copy with such a source.
974 MachineOperand *Sub = nullptr;
975 for (MachineInstr *SubDef = MRI.getVRegDef(SrcReg);
976 SubDef && TII.isFoldableCopy(*SubDef);
977 SubDef = MRI.getVRegDef(Sub->getReg())) {
978 unsigned SrcIdx = TII.getFoldableCopySrcIdx(*SubDef);
979 MachineOperand &SrcOp = SubDef->getOperand(SrcIdx);
980
    // An immediate source ends the search successfully.
981 if (SrcOp.isImm())
982 return &SrcOp;
    // Anything that is not a virtual register cannot be followed any further;
    // keep whatever was recorded on a previous iteration.
983 if (!SrcOp.isReg() || SrcOp.getReg().isPhysical())
984 break;
985 Sub = &SrcOp;
986 // TODO: Support compose
    // A source with a subregister index is still recorded, but the walk stops
    // there.
987 if (SrcOp.getSubReg())
988 break;
989 }
990
991 return Sub;
992}
993
994const TargetRegisterClass *SIFoldOperandsImpl::getRegSeqInit(
995 MachineInstr &RegSeq,
996 SmallVectorImpl<std::pair<MachineOperand *, unsigned>> &Defs) const {
997
998 assert(RegSeq.isRegSequence());
999
1000 const TargetRegisterClass *RC = nullptr;
1001
1002 for (unsigned I = 1, E = RegSeq.getNumExplicitOperands(); I != E; I += 2) {
1003 MachineOperand &SrcOp = RegSeq.getOperand(I);
1004 if (SrcOp.getReg().isPhysical())
1005 return nullptr;
1006 unsigned SubRegIdx = RegSeq.getOperand(I + 1).getImm();
1007
1008 // Only accept reg_sequence with uniform reg class inputs for simplicity.
1009 const TargetRegisterClass *OpRC = getRegOpRC(*MRI, *TRI, SrcOp);
1010 if (!RC)
1011 RC = OpRC;
1012 else if (!TRI->getCommonSubClass(RC, OpRC))
1013 return nullptr;
1014
1015 if (SrcOp.getSubReg()) {
1016 // TODO: Handle subregister compose
1017 Defs.emplace_back(&SrcOp, SubRegIdx);
1018 continue;
1019 }
1020
1021 MachineOperand *DefSrc = lookUpCopyChain(*TII, *MRI, SrcOp.getReg());
1022 if (DefSrc && (DefSrc->isReg() || DefSrc->isImm())) {
1023 Defs.emplace_back(DefSrc, SubRegIdx);
1024 continue;
1025 }
1026
1027 Defs.emplace_back(&SrcOp, SubRegIdx);
1028 }
1029
1030 return RC;
1031}
1032
1033// Find a def of the UseReg, check if it is a reg_sequence and find initializers
1034// for each subreg, tracking it to an immediate if possible. Returns the
1035// register class of the inputs on success.
1036const TargetRegisterClass *SIFoldOperandsImpl::getRegSeqInit(
1037 SmallVectorImpl<std::pair<MachineOperand *, unsigned>> &Defs,
1038 Register UseReg) const {
1039 MachineInstr *Def = MRI->getVRegDef(UseReg);
1040 if (!Def || !Def->isRegSequence())
1041 return nullptr;
1042
1043 return getRegSeqInit(*Def, Defs);
1044}
1045
/// Check whether all inputs of the REG_SEQUENCE \p RegSeq are initialized with
/// the same immediate.
///
/// \returns the splat value paired with the register class of the inputs. If
/// the second input differs from the first and the number of inputs is even,
/// the inputs are instead matched as 64-bit values split into consecutive
/// 32-bit pieces, and the merged 64-bit value is returned with the sub0_sub1
/// subregister class of the result register. A default-constructed pair (null
/// register class) is returned when no splat is recognized.
1046std::pair<int64_t, const TargetRegisterClass *>
1047SIFoldOperandsImpl::isRegSeqSplat(MachineInstr &RegSeq) const {
1049 const TargetRegisterClass *SrcRC = getRegSeqInit(RegSeq, Defs);
1050 if (!SrcRC)
1051 return {};
1052
1053 bool TryToMatchSplat64 = false;
1054
     // Imm is assigned on the first iteration (I == 0) before it is compared.
1055 int64_t Imm;
1056 for (unsigned I = 0, E = Defs.size(); I != E; ++I) {
1057 const MachineOperand *Op = Defs[I].first;
1058 if (!Op->isImm())
1059 return {};
1060
1061 int64_t SubImm = Op->getImm();
1062 if (!I) {
1063 Imm = SubImm;
1064 continue;
1065 }
1066
1067 if (Imm != SubImm) {
1068 if (I == 1 && (E & 1) == 0) {
1069 // If we have an even number of inputs, there's a chance this is a
1070 // 64-bit element splat broken into 32-bit pieces.
1071 TryToMatchSplat64 = true;
1072 break;
1073 }
1074
1075 return {}; // Can only fold splat constants
1076 }
1077 }
1078
1079 if (!TryToMatchSplat64)
1080 return {Defs[0].first->getImm(), SrcRC};
1081
1082 // Fallback to recognizing 64-bit splats broken into 32-bit pieces
1083 // (i.e. recognize every other element is 0 for 64-bit immediates)
     // SplatVal64 is assigned on the first pair (I == 0) before it is compared.
1084 int64_t SplatVal64;
1085 for (unsigned I = 0, E = Defs.size(); I != E; I += 2) {
1086 const MachineOperand *Op0 = Defs[I].first;
1087 const MachineOperand *Op1 = Defs[I + 1].first;
1088
1089 if (!Op0->isImm() || !Op1->isImm())
1090 return {};
1091
1092 unsigned SubReg0 = Defs[I].second;
1093 unsigned SubReg1 = Defs[I + 1].second;
1094
1095 // Assume we're going to generally encounter reg_sequences with sorted
1096 // subreg indexes, so reject any that aren't consecutive.
1097 if (TRI->getChannelFromSubReg(SubReg0) + 1 !=
1098 TRI->getChannelFromSubReg(SubReg1))
1099 return {};
1100
     // Only pieces written through a 32-bit subregister index are merged.
1101 if (TRI->getSubRegIdxSize(SubReg0) != 32)
1102 return {};
1103
     // Op1 supplies the first argument of Make_64 and Op0 the second.
1104 int64_t MergedVal = Make_64(Op1->getImm(), Op0->getImm());
1105 if (I == 0)
1106 SplatVal64 = MergedVal;
1107 else if (SplatVal64 != MergedVal)
1108 return {};
1109 }
1110
1111 const TargetRegisterClass *RC64 = TRI->getSubRegisterClass(
1112 MRI->getRegClass(RegSeq.getOperand(0).getReg()), AMDGPU::sub0_sub1);
1113
1114 return {SplatVal64, RC64};
1115}
1116
/// Check whether the splat constant \p SplatVal, whose elements have register
/// class \p SplatRC, can legally replace operand \p UseOpIdx of \p UseMI.
/// This only answers the legality question; \p UseMI is not modified.
///
/// \returns true if the operand may be rewritten to the immediate.
1117bool SIFoldOperandsImpl::tryFoldRegSeqSplat(
1118 MachineInstr *UseMI, unsigned UseOpIdx, int64_t SplatVal,
1119 const TargetRegisterClass *SplatRC) const {
1120 const MCInstrDesc &Desc = UseMI->getDesc();
     // Operands beyond the static description cannot be checked.
1121 if (UseOpIdx >= Desc.getNumOperands())
1122 return false;
1123
1124 // Filter out unhandled pseudos.
1125 if (!AMDGPU::isSISrcOperand(Desc, UseOpIdx))
1126 return false;
1127
1128 int16_t RCID = TII->getOpRegClassID(Desc.operands()[UseOpIdx]);
1129 if (RCID == -1)
1130 return false;
1131
1132 const TargetRegisterClass *OpRC = TRI->getRegClass(RCID);
1133
1134 // Special case 0/-1, since when interpreted as a 64-bit element both halves
1135 // have the same bits. These are the only cases where a splat has the same
1136 // interpretation for 32-bit and 64-bit splats.
1137 if (SplatVal != 0 && SplatVal != -1) {
1138 // We need to figure out the scalar type read by the operand. e.g. the MFMA
1139 // operand will be AReg_128, and we want to check if it's compatible with an
1140 // AReg_32 constant.
1141 uint8_t OpTy = Desc.operands()[UseOpIdx].OperandType;
1142 switch (OpTy) {
1147 OpRC = TRI->getSubRegisterClass(OpRC, AMDGPU::sub0);
1148 break;
1154 OpRC = TRI->getSubRegisterClass(OpRC, AMDGPU::sub0_sub1);
1155 break;
1156 default:
1157 return false;
1158 }
1159
     // The element class read by the operand must be compatible with the class
     // of the splat elements.
1160 if (!TRI->getCommonSubClass(OpRC, SplatRC))
1161 return false;
1162 }
1163
     // Final check: ask the target whether an immediate operand with this value
     // is legal at this position.
1164 MachineOperand TmpOp = MachineOperand::CreateImm(SplatVal);
1165 if (!TII->isOperandLegal(*UseMI, UseOpIdx, &TmpOp))
1166 return false;
1167
1168 return true;
1169}
1170
/// Try to queue a fold of the immediate \p OpToFold into operand \p UseOpIdx
/// of \p UseMI.
///
/// \returns true if a candidate was appended to \p FoldList, false if the
/// operand is not a handled source operand or the fold was rejected.
1171bool SIFoldOperandsImpl::tryToFoldACImm(
1172 const FoldableDef &OpToFold, MachineInstr *UseMI, unsigned UseOpIdx,
1173 SmallVectorImpl<FoldCandidate> &FoldList) const {
1174 const MCInstrDesc &Desc = UseMI->getDesc();
     // Operands beyond the static description cannot be checked.
1175 if (UseOpIdx >= Desc.getNumOperands())
1176 return false;
1177
1178 // Filter out unhandled pseudos.
1179 if (!AMDGPU::isSISrcOperand(Desc, UseOpIdx))
1180 return false;
1181
1182 if (OpToFold.isImm() && OpToFold.isOperandLegal(*TII, *UseMI, UseOpIdx)) {
1185 return false;
1186 appendFoldCandidate(FoldList, UseMI, UseOpIdx, OpToFold);
1187 return true;
1188 }
1189
1190 return false;
1191}
1192
/// Attempt to fold \p OpToFold into operand \p UseOpIdx of \p UseMI.
///
/// Folds that are applied later are appended to \p FoldList. Some cases mutate
/// \p UseMI directly instead (frame index uses, copies turned into moves, and
/// V_READFIRSTLANE_B32 / V_READLANE_B32 uses). Copies that were rewritten in
/// place are recorded in \p CopiesToReplace. For a REG_SEQUENCE use, the users
/// of the REG_SEQUENCE result are visited instead, recursing into this
/// function for each matching use.
1193void SIFoldOperandsImpl::foldOperand(
1194 FoldableDef OpToFold, MachineInstr *UseMI, int UseOpIdx,
1195 SmallVectorImpl<FoldCandidate> &FoldList,
1196 SmallVectorImpl<MachineInstr *> &CopiesToReplace) const {
1197 const MachineOperand *UseOp = &UseMI->getOperand(UseOpIdx);
1198
1199 if (!isUseSafeToFold(*UseMI, *UseOp))
1200 return;
1201
1202 // FIXME: Fold operands with subregs.
1203 if (UseOp->isReg() && OpToFold.isReg()) {
1204 if (UseOp->isImplicit())
1205 return;
1206 // Allow folding from SGPRs to 16-bit VGPRs.
1207 if (UseOp->getSubReg() != AMDGPU::NoSubRegister &&
1208 (UseOp->getSubReg() != AMDGPU::lo16 ||
1209 !TRI->isSGPRReg(*MRI, OpToFold.getReg())))
1210 return;
1211 }
1212
1213 // Special case for REG_SEQUENCE: We can't fold literals into
1214 // REG_SEQUENCE instructions, so we have to fold them into the
1215 // uses of REG_SEQUENCE.
1216 if (UseMI->isRegSequence()) {
1217 Register RegSeqDstReg = UseMI->getOperand(0).getReg();
     // The subregister index written by this input is the operand that follows
     // it in the REG_SEQUENCE.
1218 unsigned RegSeqDstSubReg = UseMI->getOperand(UseOpIdx + 1).getImm();
1219
     // SplatRC is null when the REG_SEQUENCE is not a recognized splat.
1220 int64_t SplatVal;
1221 const TargetRegisterClass *SplatRC;
1222 std::tie(SplatVal, SplatRC) = isRegSeqSplat(*UseMI);
1223
1224 // Grab the use operands first
1226 llvm::make_pointer_range(MRI->use_nodbg_operands(RegSeqDstReg)));
     // Index-based loop: UsesToProcess may grow while it is being walked.
1227 for (unsigned I = 0; I != UsesToProcess.size(); ++I) {
1228 MachineOperand *RSUse = UsesToProcess[I];
1229 MachineInstr *RSUseMI = RSUse->getParent();
1230 unsigned OpNo = RSUseMI->getOperandNo(RSUse);
1231
1232 if (SplatRC) {
1233 if (RSUseMI->isCopy()) {
1234 Register DstReg = RSUseMI->getOperand(0).getReg();
1235 append_range(UsesToProcess,
1237 continue;
1238 }
1239 if (tryFoldRegSeqSplat(RSUseMI, OpNo, SplatVal, SplatRC)) {
1240 FoldableDef SplatDef(SplatVal, SplatRC);
1241 appendFoldCandidate(FoldList, RSUseMI, OpNo, SplatDef);
1242 continue;
1243 }
1244 }
1245
1246 // TODO: Handle general compose
1247 if (RSUse->getSubReg() != RegSeqDstSubReg)
1248 continue;
1249
1250 // FIXME: We should avoid recursing here. There should be a cleaner split
1251 // between the in-place mutations and adding to the fold list.
1252 foldOperand(OpToFold, RSUseMI, RSUseMI->getOperandNo(RSUse), FoldList,
1253 CopiesToReplace);
1254 }
1255
1256 return;
1257 }
1258
1259 if (tryToFoldACImm(OpToFold, UseMI, UseOpIdx, FoldList))
1260 return;
1261
1262 if (frameIndexMayFold(*UseMI, UseOpIdx, OpToFold)) {
1263 // Verify that this is a stack access.
1264 // FIXME: Should probably use stack pseudos before frame lowering.
1265
1266 if (TII->isMUBUF(*UseMI)) {
1267 if (TII->getNamedOperand(*UseMI, AMDGPU::OpName::srsrc)->getReg() !=
1268 MFI->getScratchRSrcReg())
1269 return;
1270
1271 // Ensure this is either relative to the current frame or the current
1272 // wave.
1273 MachineOperand &SOff =
1274 *TII->getNamedOperand(*UseMI, AMDGPU::OpName::soffset);
1275 if (!SOff.isImm() || SOff.getImm() != 0)
1276 return;
1277 }
1278
1279 const unsigned Opc = UseMI->getOpcode();
1280 if (TII->isFLATScratch(*UseMI) &&
1281 AMDGPU::hasNamedOperand(Opc, AMDGPU::OpName::vaddr) &&
1282 !AMDGPU::hasNamedOperand(Opc, AMDGPU::OpName::saddr)) {
1283 unsigned NewOpc = AMDGPU::getFlatScratchInstSSfromSV(Opc);
1284 unsigned CPol =
1285 TII->getNamedOperand(*UseMI, AMDGPU::OpName::cpol)->getImm();
1286 if ((CPol & AMDGPU::CPol::SCAL) &&
1288 return;
1289
1290 UseMI->setDesc(TII->get(NewOpc));
1291 }
1292
1293 // A frame index will resolve to a positive constant, so it should always be
1294 // safe to fold the addressing mode, even pre-GFX9.
1295 UseMI->getOperand(UseOpIdx).ChangeToFrameIndex(OpToFold.getFI());
1296
1297 return;
1298 }
1299
1300 bool FoldingImmLike =
1301 OpToFold.isImm() || OpToFold.isFI() || OpToFold.isGlobal();
1302
1303 if (FoldingImmLike && UseMI->isCopy()) {
1304 Register DestReg = UseMI->getOperand(0).getReg();
1305 Register SrcReg = UseMI->getOperand(1).getReg();
1306 unsigned UseSubReg = UseMI->getOperand(1).getSubReg();
1307 assert(SrcReg.isVirtual());
1308
1309 const TargetRegisterClass *SrcRC = MRI->getRegClass(SrcReg);
1310
1311 // Don't fold into a copy to a physical register with the same class. Doing
1312 // so would interfere with the register coalescer's logic which would avoid
1313 // redundant initializations.
1314 if (DestReg.isPhysical() && SrcRC->contains(DestReg))
1315 return;
1316
1317 const TargetRegisterClass *DestRC = TRI->getRegClassForReg(*MRI, DestReg);
1318 // In order to fold immediates into copies, we need to change the copy to a
1319 // MOV. Find a compatible mov instruction with the value.
     // The candidates are tried in the order listed; the loop stops at the
     // first one whose destination class is compatible, whether or not the
     // rewrite then succeeds.
1320 for (unsigned MovOp :
1321 {AMDGPU::S_MOV_B32, AMDGPU::V_MOV_B32_e32, AMDGPU::S_MOV_B64,
1322 AMDGPU::V_MOV_B64_PSEUDO, AMDGPU::V_MOV_B16_t16_e64,
1323 AMDGPU::V_ACCVGPR_WRITE_B32_e64, AMDGPU::AV_MOV_B32_IMM_PSEUDO,
1324 AMDGPU::AV_MOV_B64_IMM_PSEUDO}) {
1325 const MCInstrDesc &MovDesc = TII->get(MovOp);
1326 const TargetRegisterClass *MovDstRC =
1327 TRI->getRegClass(TII->getOpRegClassID(MovDesc.operands()[0]));
1328
1329 // Fold if the destination register class of the MOV instruction (ResRC)
1330 // is a superclass of (or equal to) the destination register class of the
1331 // COPY (DestRC). If this condition fails, folding would be illegal.
1332 if (!DestRC->hasSuperClassEq(MovDstRC))
1333 continue;
1334
     // V_MOV_B16_t16_e64 has a src0_modifiers operand ahead of src0, so its
     // source sits at index 2 instead of 1.
1335 const int SrcIdx = MovOp == AMDGPU::V_MOV_B16_t16_e64 ? 2 : 1;
1336
1337 int16_t RegClassID = TII->getOpRegClassID(MovDesc.operands()[SrcIdx]);
1338 if (RegClassID != -1) {
1339 const TargetRegisterClass *MovSrcRC = TRI->getRegClass(RegClassID);
1340
1341 if (UseSubReg)
1342 MovSrcRC = TRI->getMatchingSuperRegClass(SrcRC, MovSrcRC, UseSubReg);
1343
1344 // FIXME: We should be able to directly check immediate operand legality
1345 // for all cases, but gfx908 hacks break.
1346 if (MovOp == AMDGPU::AV_MOV_B32_IMM_PSEUDO &&
1347 (!OpToFold.isImm() ||
1348 !TII->isImmOperandLegal(MovDesc, SrcIdx,
1349 *OpToFold.getEffectiveImmVal())))
1350 break;
1351
1352 if (!MRI->constrainRegClass(SrcReg, MovSrcRC))
1353 break;
1354
1355 // FIXME: This is mutating the instruction only and deferring the actual
1356 // fold of the immediate
1357 } else {
1358 // For the _IMM_PSEUDO cases, there can be value restrictions on the
1359 // immediate to verify. Technically we should always verify this, but it
1360 // only matters for these concrete cases.
1361 // TODO: Handle non-imm case if it's useful.
1362 if (!OpToFold.isImm() ||
1363 !TII->isImmOperandLegal(MovDesc, 1, *OpToFold.getEffectiveImmVal()))
1364 break;
1365 }
1366
1369 while (ImpOpI != ImpOpE) {
1370 MachineInstr::mop_iterator Tmp = ImpOpI;
1371 ImpOpI++;
1373 }
1374 UseMI->setDesc(MovDesc);
1375
1376 if (MovOp == AMDGPU::V_MOV_B16_t16_e64) {
     // Rebuild the operand list in the order the comments below name:
     // src0_modifiers, src0, op_sel.
1377 const auto &SrcOp = UseMI->getOperand(UseOpIdx);
1378 MachineOperand NewSrcOp(SrcOp);
1379 MachineFunction *MF = UseMI->getMF();
1380 UseMI->removeOperand(1);
1381 UseMI->addOperand(*MF, MachineOperand::CreateImm(0)); // src0_modifiers
1382 UseMI->addOperand(NewSrcOp); // src0
1383 UseMI->addOperand(*MF, MachineOperand::CreateImm(0)); // op_sel
1384 UseOpIdx = SrcIdx;
1385 UseOp = &UseMI->getOperand(UseOpIdx);
1386 }
1387 CopiesToReplace.push_back(UseMI);
1388 break;
1389 }
1390
1391 // We failed to replace the copy, so give up.
1392 if (UseMI->getOpcode() == AMDGPU::COPY)
1393 return;
1394
1395 } else {
1396 if (UseMI->isCopy() && OpToFold.isReg() &&
1397 UseMI->getOperand(0).getReg().isVirtual() &&
1398 !UseMI->getOperand(1).getSubReg() &&
1399 OpToFold.DefMI->implicit_operands().empty()) {
1400 LLVM_DEBUG(dbgs() << "Folding " << OpToFold.OpToFold << "\n into "
1401 << *UseMI);
1402 unsigned Size = TII->getOpSize(*UseMI, 1);
1403 Register UseReg = OpToFold.getReg();
1405 unsigned SubRegIdx = OpToFold.getSubReg();
1406 // Hack to allow 32-bit SGPRs to be folded into True16 instructions
1407 // Remove this if 16-bit SGPRs (i.e. SGPR_LO16) are added to the
1408 // VS_16RegClass
1409 //
1410 // Excerpt from AMDGPUGenRegisterInfoEnums.inc
1411 // NoSubRegister, //0
1412 // hi16, // 1
1413 // lo16, // 2
1414 // sub0, // 3
1415 // ...
1416 // sub1, // 11
1417 // sub1_hi16, // 12
1418 // sub1_lo16, // 13
1419 static_assert(AMDGPU::sub1_hi16 == 12, "Subregister layout has changed");
1420 if (Size == 2 && TRI->isVGPR(*MRI, UseMI->getOperand(0).getReg()) &&
1421 TRI->isSGPRReg(*MRI, UseReg)) {
1422 // Produce the 32 bit subregister index to which the 16-bit subregister
1423 // is aligned.
1424 if (SubRegIdx > AMDGPU::sub1) {
1425 LaneBitmask M = TRI->getSubRegIndexLaneMask(SubRegIdx);
1426 M |= M.getLane(M.getHighestLane() - 1);
1427 SmallVector<unsigned, 4> Indexes;
1428 TRI->getCoveringSubRegIndexes(TRI->getRegClassForReg(*MRI, UseReg), M,
1429 Indexes);
1430 assert(Indexes.size() == 1 && "Expected one 32-bit subreg to cover");
1431 SubRegIdx = Indexes[0];
1432 // 32-bit registers do not have a sub0 index
1433 } else if (TII->getOpSize(*UseMI, 1) == 4)
1434 SubRegIdx = 0;
1435 else
1436 SubRegIdx = AMDGPU::sub0;
1437 }
1438 UseMI->getOperand(1).setSubReg(SubRegIdx);
1439 UseMI->getOperand(1).setIsKill(false);
1440 CopiesToReplace.push_back(UseMI);
1441 OpToFold.OpToFold->setIsKill(false);
1442
1443 // Remove kill flags as kills may now be out of order with uses.
1444 MRI->clearKillFlags(UseReg);
1445 if (foldCopyToAGPRRegSequence(UseMI))
1446 return;
1447 }
1448
1449 unsigned UseOpc = UseMI->getOpcode();
1450 if (UseOpc == AMDGPU::V_READFIRSTLANE_B32 ||
1451 (UseOpc == AMDGPU::V_READLANE_B32 &&
1452 (int)UseOpIdx ==
1453 AMDGPU::getNamedOperandIdx(UseOpc, AMDGPU::OpName::src0))) {
1454 // %vgpr = V_MOV_B32 imm
1455 // %sgpr = V_READFIRSTLANE_B32 %vgpr
1456 // =>
1457 // %sgpr = S_MOV_B32 imm
1458 if (FoldingImmLike) {
1460 UseMI->getOperand(UseOpIdx).getReg(),
1461 *OpToFold.DefMI, *UseMI))
1462 return;
1463
1464 UseMI->setDesc(TII->get(AMDGPU::S_MOV_B32));
1466
1467 if (OpToFold.isImm()) {
1469 *OpToFold.getEffectiveImmVal());
1470 } else if (OpToFold.isFI())
1471 UseMI->getOperand(1).ChangeToFrameIndex(OpToFold.getFI());
1472 else {
1473 assert(OpToFold.isGlobal());
1474 UseMI->getOperand(1).ChangeToGA(OpToFold.OpToFold->getGlobal(),
1475 OpToFold.OpToFold->getOffset(),
1476 OpToFold.OpToFold->getTargetFlags());
1477 }
1478 UseMI->removeOperand(2); // Remove exec read (or src1 for readlane)
1479 return;
1480 }
1481
1482 if (OpToFold.isReg() && TRI->isSGPRReg(*MRI, OpToFold.getReg())) {
1484 UseMI->getOperand(UseOpIdx).getReg(),
1485 *OpToFold.DefMI, *UseMI))
1486 return;
1487
1488 // %vgpr = COPY %sgpr0
1489 // %sgpr1 = V_READFIRSTLANE_B32 %vgpr
1490 // =>
1491 // %sgpr1 = COPY %sgpr0
1492 UseMI->setDesc(TII->get(AMDGPU::COPY));
1493 UseMI->getOperand(1).setReg(OpToFold.getReg());
1494 UseMI->getOperand(1).setSubReg(OpToFold.getSubReg());
1495 UseMI->getOperand(1).setIsKill(false);
1496 UseMI->removeOperand(2); // Remove exec read (or src1 for readlane)
1498 return;
1499 }
1500 }
1501
1502 const MCInstrDesc &UseDesc = UseMI->getDesc();
1503
1504 // Don't fold into target independent nodes. Target independent opcodes
1505 // don't have defined register classes.
1506 if (UseDesc.isVariadic() || UseOp->isImplicit() ||
1507 UseDesc.operands()[UseOpIdx].RegClass == -1)
1508 return;
1509 }
1510
1511 // FIXME: We could try to change the instruction from 64-bit to 32-bit
1512 // to enable more folding opportunities. The shrink operands pass
1513 // already does this.
1514
     // Everything else goes through the generic candidate path.
1515 tryAddToFoldList(FoldList, UseMI, UseOpIdx, OpToFold);
1516}
1517
/// Evaluate the binary operation \p Opcode on LHS and RHS and store the value
/// in \p Result.
///
/// Shift amounts are masked to their low 5 bits. For the *REV opcodes the
/// roles are swapped: RHS is the value shifted and LHS supplies the amount.
///
/// \returns true if \p Opcode is handled and \p Result was written, false
/// otherwise (\p Result is left untouched).
1518static bool evalBinaryInstruction(unsigned Opcode, int32_t &Result,
1520 switch (Opcode) {
1521 case AMDGPU::S_ADD_I32:
1522 case AMDGPU::S_ADD_U32:
1523 Result = LHS + RHS;
1524 return true;
1525 case AMDGPU::S_SUB_I32:
1526 case AMDGPU::S_SUB_U32:
1527 Result = LHS - RHS;
1528 return true;
1529 case AMDGPU::V_AND_B32_e64:
1530 case AMDGPU::V_AND_B32_e32:
1531 case AMDGPU::S_AND_B32:
1532 Result = LHS & RHS;
1533 return true;
1534 case AMDGPU::V_OR_B32_e64:
1535 case AMDGPU::V_OR_B32_e32:
1536 case AMDGPU::S_OR_B32:
1537 Result = LHS | RHS;
1538 return true;
1539 case AMDGPU::V_XOR_B32_e64:
1540 case AMDGPU::V_XOR_B32_e32:
1541 case AMDGPU::S_XOR_B32:
1542 Result = LHS ^ RHS;
1543 return true;
1544 case AMDGPU::S_XNOR_B32:
1545 Result = ~(LHS ^ RHS);
1546 return true;
1547 case AMDGPU::S_NAND_B32:
1548 Result = ~(LHS & RHS);
1549 return true;
1550 case AMDGPU::S_NOR_B32:
1551 Result = ~(LHS | RHS);
1552 return true;
1553 case AMDGPU::S_ANDN2_B32:
1554 Result = LHS & ~RHS;
1555 return true;
1556 case AMDGPU::S_ORN2_B32:
1557 Result = LHS | ~RHS;
1558 return true;
1559 case AMDGPU::V_LSHL_B32_e64:
1560 case AMDGPU::V_LSHL_B32_e32:
1561 case AMDGPU::S_LSHL_B32:
1562 // The instruction ignores the high bits for out of bounds shifts.
1563 Result = LHS << (RHS & 31);
1564 return true;
1565 case AMDGPU::V_LSHLREV_B32_e64:
1566 case AMDGPU::V_LSHLREV_B32_e32:
1567 Result = RHS << (LHS & 31);
1568 return true;
1569 case AMDGPU::V_LSHR_B32_e64:
1570 case AMDGPU::V_LSHR_B32_e32:
1571 case AMDGPU::S_LSHR_B32:
1572 Result = LHS >> (RHS & 31);
1573 return true;
1574 case AMDGPU::V_LSHRREV_B32_e64:
1575 case AMDGPU::V_LSHRREV_B32_e32:
1576 Result = RHS >> (LHS & 31);
1577 return true;
1578 case AMDGPU::V_ASHR_I32_e64:
1579 case AMDGPU::V_ASHR_I32_e32:
1580 case AMDGPU::S_ASHR_I32:
     // The shifted value is cast to int32_t before shifting for the arithmetic
     // shift opcodes.
1581 Result = static_cast<int32_t>(LHS) >> (RHS & 31);
1582 return true;
1583 case AMDGPU::V_ASHRREV_I32_e64:
1584 case AMDGPU::V_ASHRREV_I32_e32:
1585 Result = static_cast<int32_t>(RHS) >> (LHS & 31);
1586 return true;
1587 default:
1588 return false;
1589 }
1590}
1591
1592static unsigned getMovOpc(bool IsScalar) {
1593 return IsScalar ? AMDGPU::S_MOV_B32 : AMDGPU::V_MOV_B32_e32;
1594}
1595
1596// Try to simplify operations with a constant that may appear after instruction
1597// selection.
1598// TODO: See if a frame index with a fixed offset can fold.
1599bool SIFoldOperandsImpl::tryConstantFoldOp(MachineInstr *MI) const {
1600 if (!MI->allImplicitDefsAreDead())
1601 return false;
1602
1603 unsigned Opc = MI->getOpcode();
1604
1605 int Src0Idx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::src0);
1606 if (Src0Idx == -1)
1607 return false;
1608
1609 MachineOperand *Src0 = &MI->getOperand(Src0Idx);
1610 std::optional<int64_t> Src0Imm = TII->getImmOrMaterializedImm(*Src0);
1611
1612 if ((Opc == AMDGPU::V_NOT_B32_e64 || Opc == AMDGPU::V_NOT_B32_e32 ||
1613 Opc == AMDGPU::S_NOT_B32) &&
1614 Src0Imm) {
1615 MI->getOperand(1).ChangeToImmediate(~*Src0Imm);
1616 TII->mutateAndCleanupImplicit(
1617 *MI, TII->get(getMovOpc(Opc == AMDGPU::S_NOT_B32)));
1618 return true;
1619 }
1620
1621 int Src1Idx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::src1);
1622 if (Src1Idx == -1)
1623 return false;
1624
1625 MachineOperand *Src1 = &MI->getOperand(Src1Idx);
1626 std::optional<int64_t> Src1Imm = TII->getImmOrMaterializedImm(*Src1);
1627
1628 if (!Src0Imm && !Src1Imm)
1629 return false;
1630
1631 // and k0, k1 -> v_mov_b32 (k0 & k1)
1632 // or k0, k1 -> v_mov_b32 (k0 | k1)
1633 // xor k0, k1 -> v_mov_b32 (k0 ^ k1)
1634 if (Src0Imm && Src1Imm) {
1635 int32_t NewImm;
1636 if (!evalBinaryInstruction(Opc, NewImm, *Src0Imm, *Src1Imm))
1637 return false;
1638
1639 bool IsSGPR = TRI->isSGPRReg(*MRI, MI->getOperand(0).getReg());
1640
1641 // Be careful to change the right operand, src0 may belong to a different
1642 // instruction.
1643 MI->getOperand(Src0Idx).ChangeToImmediate(NewImm);
1644 MI->removeOperand(Src1Idx);
1645 TII->mutateAndCleanupImplicit(*MI, TII->get(getMovOpc(IsSGPR)));
1646 return true;
1647 }
1648
1649 // S_SUB_* is not commutable, so handle it before the commutability gate.
1650 // Only `x - 0 -> copy x` is valid; `0 - x` is a negation, not a copy.
1651 if (Opc == AMDGPU::S_SUB_I32 || Opc == AMDGPU::S_SUB_U32) {
1652 if (Src1Imm && static_cast<int32_t>(*Src1Imm) == 0) {
1653 // y = sub x, 0 => y = copy x
1654 MI->removeOperand(Src1Idx);
1655 TII->mutateAndCleanupImplicit(*MI, TII->get(AMDGPU::COPY));
1656 return true;
1657 }
1658 return false;
1659 }
1660
1661 if (!MI->isCommutable())
1662 return false;
1663
1664 if (Src0Imm && !Src1Imm) {
1665 std::swap(Src0, Src1);
1666 std::swap(Src0Idx, Src1Idx);
1667 std::swap(Src0Imm, Src1Imm);
1668 }
1669
1670 int32_t Src1Val = static_cast<int32_t>(*Src1Imm);
1671 if (Opc == AMDGPU::S_ADD_I32 || Opc == AMDGPU::S_ADD_U32) {
1672 if (Src1Val == 0) {
1673 // y = add x, 0 => y = copy x
1674 MI->removeOperand(Src1Idx);
1675 TII->mutateAndCleanupImplicit(*MI, TII->get(AMDGPU::COPY));
1676 return true;
1677 }
1678 return false;
1679 }
1680
1681 if (Opc == AMDGPU::V_OR_B32_e64 ||
1682 Opc == AMDGPU::V_OR_B32_e32 ||
1683 Opc == AMDGPU::S_OR_B32) {
1684 if (Src1Val == 0) {
1685 // y = or x, 0 => y = copy x
1686 MI->removeOperand(Src1Idx);
1687 TII->mutateAndCleanupImplicit(*MI, TII->get(AMDGPU::COPY));
1688 } else if (Src1Val == -1) {
1689 // y = or x, -1 => y = v_mov_b32 -1
1690 MI->removeOperand(Src0Idx);
1691 TII->mutateAndCleanupImplicit(
1692 *MI, TII->get(getMovOpc(Opc == AMDGPU::S_OR_B32)));
1693 } else
1694 return false;
1695
1696 return true;
1697 }
1698
1699 if (Opc == AMDGPU::V_AND_B32_e64 || Opc == AMDGPU::V_AND_B32_e32 ||
1700 Opc == AMDGPU::S_AND_B32) {
1701 if (Src1Val == 0) {
1702 // y = and x, 0 => y = v_mov_b32 0
1703 MI->removeOperand(Src0Idx);
1704 TII->mutateAndCleanupImplicit(
1705 *MI, TII->get(getMovOpc(Opc == AMDGPU::S_AND_B32)));
1706 } else if (Src1Val == -1) {
1707 // y = and x, -1 => y = copy x
1708 MI->removeOperand(Src1Idx);
1709 TII->mutateAndCleanupImplicit(*MI, TII->get(AMDGPU::COPY));
1710 } else
1711 return false;
1712
1713 return true;
1714 }
1715
1716 if (Opc == AMDGPU::V_XOR_B32_e64 || Opc == AMDGPU::V_XOR_B32_e32 ||
1717 Opc == AMDGPU::S_XOR_B32) {
1718 if (Src1Val == 0) {
1719 // y = xor x, 0 => y = copy x
1720 MI->removeOperand(Src1Idx);
1721 TII->mutateAndCleanupImplicit(*MI, TII->get(AMDGPU::COPY));
1722 return true;
1723 }
1724 }
1725
1726 return false;
1727}
1728
1729// Try to fold an instruction into a simpler one
1730bool SIFoldOperandsImpl::tryFoldCndMask(MachineInstr &MI) const {
1731 unsigned Opc = MI.getOpcode();
1732 if (Opc != AMDGPU::V_CNDMASK_B32_e32 && Opc != AMDGPU::V_CNDMASK_B32_e64 &&
1733 Opc != AMDGPU::V_CNDMASK_B64_PSEUDO)
1734 return false;
1735
1736 MachineOperand *Src0 = TII->getNamedOperand(MI, AMDGPU::OpName::src0);
1737 MachineOperand *Src1 = TII->getNamedOperand(MI, AMDGPU::OpName::src1);
1738 if (!Src1->isIdenticalTo(*Src0)) {
1739 std::optional<int64_t> Src1Imm = TII->getImmOrMaterializedImm(*Src1);
1740 if (!Src1Imm)
1741 return false;
1742
1743 std::optional<int64_t> Src0Imm = TII->getImmOrMaterializedImm(*Src0);
1744 if (!Src0Imm || *Src0Imm != *Src1Imm)
1745 return false;
1746 }
1747
1748 int Src1ModIdx =
1749 AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::src1_modifiers);
1750 int Src0ModIdx =
1751 AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::src0_modifiers);
1752 if ((Src1ModIdx != -1 && MI.getOperand(Src1ModIdx).getImm() != 0) ||
1753 (Src0ModIdx != -1 && MI.getOperand(Src0ModIdx).getImm() != 0))
1754 return false;
1755
1756 LLVM_DEBUG(dbgs() << "Folded " << MI << " into ");
1757 auto &NewDesc =
1758 TII->get(Src0->isReg() ? (unsigned)AMDGPU::COPY : getMovOpc(false));
1759 int Src2Idx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::src2);
1760 if (Src2Idx != -1)
1761 MI.removeOperand(Src2Idx);
1762 MI.removeOperand(AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::src1));
1763 if (Src1ModIdx != -1)
1764 MI.removeOperand(Src1ModIdx);
1765 if (Src0ModIdx != -1)
1766 MI.removeOperand(Src0ModIdx);
1767 TII->mutateAndCleanupImplicit(MI, NewDesc);
1768 LLVM_DEBUG(dbgs() << MI);
1769 return true;
1770}
1771
1772bool SIFoldOperandsImpl::tryFoldZeroHighBits(MachineInstr &MI) const {
1773 if (MI.getOpcode() != AMDGPU::V_AND_B32_e64 &&
1774 MI.getOpcode() != AMDGPU::V_AND_B32_e32)
1775 return false;
1776
1777 std::optional<int64_t> Src0Imm =
1778 TII->getImmOrMaterializedImm(MI.getOperand(1));
1779 if (!Src0Imm || *Src0Imm != 0xffff || !MI.getOperand(2).isReg())
1780 return false;
1781
1782 Register Src1 = MI.getOperand(2).getReg();
1783 MachineInstr *SrcDef = MRI->getVRegDef(Src1);
1784 if (!ST->zeroesHigh16BitsOfDest(SrcDef->getOpcode()))
1785 return false;
1786
1787 Register Dst = MI.getOperand(0).getReg();
1788 MRI->replaceRegWith(Dst, Src1);
1789 if (!MI.getOperand(2).isKill())
1790 MRI->clearKillFlags(Src1);
1791 MI.eraseFromParent();
1792 return true;
1793}
1794
/// Fold \p OpToFold, the value defined by \p MI, into the users of MI's
/// result register (operand 0).
///
/// Users are first given a chance to be constant folded when the value is an
/// immediate. Each use is then passed to foldOperand(), the collected fold
/// candidates are applied with updateOperand(), and instructions that received
/// an immediate are retried for constant folding.
///
/// \returns true once any copy was rewritten or any fold candidate was
/// collected; otherwise whether the initial constant folding changed anything.
1795bool SIFoldOperandsImpl::foldInstOperand(MachineInstr &MI,
1796 const FoldableDef &OpToFold) const {
1797 // We need to mutate the operands of new mov instructions to add implicit
1798 // uses of EXEC, but adding them invalidates the use_iterator, so defer
1799 // this.
1800 SmallVector<MachineInstr *, 4> CopiesToReplace;
1802 MachineOperand &Dst = MI.getOperand(0);
1803 bool Changed = false;
1804
1805 if (OpToFold.isImm()) {
1806 for (auto &UseMI :
1807 make_early_inc_range(MRI->use_nodbg_instructions(Dst.getReg()))) {
1808 // Folding the immediate may reveal operations that can be constant
1809 // folded or replaced with a copy. This can happen for example after
1810 // frame indices are lowered to constants or from splitting 64-bit
1811 // constants.
1812 //
1813 // We may also encounter cases where one or both operands are
1814 // immediates materialized into a register, which would ordinarily not
1815 // be folded due to multiple uses or operand constraints.
1816 if (tryConstantFoldOp(&UseMI)) {
1817 LLVM_DEBUG(dbgs() << "Constant folded " << UseMI);
1818 Changed = true;
1819 }
1820 }
1821 }
1822
1824 llvm::make_pointer_range(MRI->use_nodbg_operands(Dst.getReg())));
1825 for (auto *U : UsesToProcess) {
1826 MachineInstr *UseMI = U->getParent();
1827
     // Apply the subregister index of the use to the value being folded.
1828 FoldableDef SubOpToFold = OpToFold.getWithSubReg(*TRI, U->getSubReg());
1829 foldOperand(SubOpToFold, UseMI, UseMI->getOperandNo(U), FoldList,
1830 CopiesToReplace);
1831 }
1832
1833 if (CopiesToReplace.empty() && FoldList.empty())
1834 return Changed;
1835
1836 MachineFunction *MF = MI.getMF();
1837 // Make sure we add EXEC uses to any new v_mov instructions created.
1838 for (MachineInstr *Copy : CopiesToReplace)
1839 Copy->addImplicitDefUseOperands(*MF);
1840
1841 SetVector<MachineInstr *> ConstantFoldCandidates;
1842 for (FoldCandidate &Fold : FoldList) {
1843 assert(!Fold.isReg() || Fold.Def.OpToFold);
1844 if (Fold.isReg() && Fold.getReg().isVirtual()) {
1845 Register Reg = Fold.getReg();
1846 const MachineInstr *DefMI = Fold.Def.DefMI;
     // Skip the fold if the def reads EXEC and EXEC may have been modified
     // before the use.
1847 if (DefMI->readsRegister(AMDGPU::EXEC, TRI) &&
1848 execMayBeModifiedBeforeUse(*MRI, Reg, *DefMI, *Fold.UseMI))
1849 continue;
1850 }
1851 if (updateOperand(Fold)) {
1852 // Clear kill flags.
1853 if (Fold.isReg()) {
1854 assert(Fold.Def.OpToFold && Fold.isReg());
1855 // FIXME: Probably shouldn't bother trying to fold if not an
1856 // SGPR. PeepholeOptimizer can eliminate redundant VGPR->VGPR
1857 // copies.
1858 MRI->clearKillFlags(Fold.getReg());
1859 }
1860 LLVM_DEBUG(dbgs() << "Folded source from " << MI << " into OpNo "
1861 << static_cast<int>(Fold.UseOpNo) << " of "
1862 << *Fold.UseMI);
1863
     // An instruction that just received an immediate may now be constant
     // foldable; retry it once all folds have been applied.
1864 if (Fold.isImm())
1865 ConstantFoldCandidates.insert(Fold.UseMI);
1866
1867 } else if (Fold.Commuted) {
1868 // Restoring instruction's original operand order if fold has failed.
1869 TII->commuteInstruction(*Fold.UseMI, false);
1870 }
1871 }
1872
1873 for (MachineInstr *MI : ConstantFoldCandidates) {
1874 if (tryConstantFoldOp(MI)) {
1875 LLVM_DEBUG(dbgs() << "Constant folded " << *MI);
1876 Changed = true;
1877 }
1878 }
1879 return true;
1880}
1881
1882/// Fold %agpr = COPY (REG_SEQUENCE x_MOV_B32, ...) into REG_SEQUENCE
1883/// (V_ACCVGPR_WRITE_B32_e64) ... depending on the reg_sequence input values.
1884bool SIFoldOperandsImpl::foldCopyToAGPRRegSequence(MachineInstr *CopyMI) const {
1885 // It is very tricky to store a value into an AGPR. v_accvgpr_write_b32 can
1886 // only accept VGPR or inline immediate. Recreate a reg_sequence with its
1887 // initializers right here, so we will rematerialize immediates and avoid
1888 // copies via different reg classes.
1889 const TargetRegisterClass *DefRC =
1890 MRI->getRegClass(CopyMI->getOperand(0).getReg());
1891 if (!TRI->isAGPRClass(DefRC))
1892 return false;
1893
1894 Register UseReg = CopyMI->getOperand(1).getReg();
1895 MachineInstr *RegSeq = MRI->getVRegDef(UseReg);
1896 if (!RegSeq || !RegSeq->isRegSequence())
1897 return false;
1898
1899 const DebugLoc &DL = CopyMI->getDebugLoc();
1900 MachineBasicBlock &MBB = *CopyMI->getParent();
1901
1902 MachineInstrBuilder B(*MBB.getParent(), CopyMI);
1903 DenseMap<TargetInstrInfo::RegSubRegPair, Register> VGPRCopies;
1904
1905 const TargetRegisterClass *UseRC =
1906 MRI->getRegClass(CopyMI->getOperand(1).getReg());
1907
1908 // Value, subregindex for new REG_SEQUENCE
1910
1911 unsigned NumRegSeqOperands = RegSeq->getNumOperands();
1912 unsigned NumFoldable = 0;
1913
1914 for (unsigned I = 1; I != NumRegSeqOperands; I += 2) {
1915 MachineOperand &RegOp = RegSeq->getOperand(I);
1916 unsigned SubRegIdx = RegSeq->getOperand(I + 1).getImm();
1917
1918 if (RegOp.getSubReg()) {
1919 // TODO: Handle subregister compose
1920 NewDefs.emplace_back(&RegOp, SubRegIdx);
1921 continue;
1922 }
1923
1924 MachineOperand *Lookup = lookUpCopyChain(*TII, *MRI, RegOp.getReg());
1925 if (!Lookup)
1926 Lookup = &RegOp;
1927
1928 if (Lookup->isImm()) {
1929 // Check if this is an agpr_32 subregister.
1930 const TargetRegisterClass *DestSuperRC = TRI->getMatchingSuperRegClass(
1931 DefRC, &AMDGPU::AGPR_32RegClass, SubRegIdx);
1932 if (DestSuperRC &&
1933 TII->isInlineConstant(*Lookup, AMDGPU::OPERAND_REG_INLINE_C_INT32)) {
1934 ++NumFoldable;
1935 NewDefs.emplace_back(Lookup, SubRegIdx);
1936 continue;
1937 }
1938 }
1939
1940 const TargetRegisterClass *InputRC =
1941 Lookup->isReg() ? MRI->getRegClass(Lookup->getReg())
1942 : MRI->getRegClass(RegOp.getReg());
1943
1944 // TODO: Account for Lookup->getSubReg()
1945
1946 // If we can't find a matching super class, this is an SGPR->AGPR or
1947 // VGPR->AGPR subreg copy (or something constant-like we have to materialize
1948 // in the AGPR). We can't directly copy from SGPR to AGPR on gfx908, so we
1949 // want to rewrite to copy to an intermediate VGPR class.
1950 const TargetRegisterClass *MatchRC =
1951 TRI->getMatchingSuperRegClass(DefRC, InputRC, SubRegIdx);
1952 if (!MatchRC) {
1953 ++NumFoldable;
1954 NewDefs.emplace_back(&RegOp, SubRegIdx);
1955 continue;
1956 }
1957
1958 NewDefs.emplace_back(&RegOp, SubRegIdx);
1959 }
1960
1961 // Do not clone a reg_sequence and merely change the result register class.
1962 if (NumFoldable == 0)
1963 return false;
1964
1965 CopyMI->setDesc(TII->get(AMDGPU::REG_SEQUENCE));
1966 for (unsigned I = CopyMI->getNumOperands() - 1; I > 0; --I)
1967 CopyMI->removeOperand(I);
1968
1969 for (auto [Def, DestSubIdx] : NewDefs) {
1970 if (!Def->isReg()) {
1971 // TODO: Should we use single write for each repeated value like in
1972 // register case?
1973 Register Tmp = MRI->createVirtualRegister(&AMDGPU::AGPR_32RegClass);
1974 BuildMI(MBB, CopyMI, DL, TII->get(AMDGPU::V_ACCVGPR_WRITE_B32_e64), Tmp)
1975 .add(*Def);
1976 B.addReg(Tmp);
1977 } else {
1978 TargetInstrInfo::RegSubRegPair Src = getRegSubRegPair(*Def);
1979 Def->setIsKill(false);
1980
1981 Register &VGPRCopy = VGPRCopies[Src];
1982 if (!VGPRCopy) {
1983 const TargetRegisterClass *VGPRUseSubRC =
1984 TRI->getSubRegisterClass(UseRC, DestSubIdx);
1985
1986 // We cannot build a reg_sequence out of the same registers, they
1987 // must be copied. Better do it here before copyPhysReg() created
1988 // several reads to do the AGPR->VGPR->AGPR copy.
1989
1990 // Direct copy from SGPR to AGPR is not possible on gfx908. To avoid
1991 // creation of exploded copies SGPR->VGPR->AGPR in the copyPhysReg()
1992 // later, create a copy here and track if we already have such a copy.
1993 const TargetRegisterClass *SubRC =
1994 TRI->getSubRegisterClass(MRI->getRegClass(Src.Reg), Src.SubReg);
1995 if (!VGPRUseSubRC->hasSubClassEq(SubRC)) {
1996 // TODO: Try to reconstrain class
1997 VGPRCopy = MRI->createVirtualRegister(VGPRUseSubRC);
1998 BuildMI(MBB, CopyMI, DL, TII->get(AMDGPU::COPY), VGPRCopy).add(*Def);
1999 B.addReg(VGPRCopy);
2000 } else {
2001 // If it is already a VGPR, do not copy the register.
2002 B.add(*Def);
2003 }
2004 } else {
2005 B.addReg(VGPRCopy);
2006 }
2007 }
2008
2009 B.addImm(DestSubIdx);
2010 }
2011
2012 LLVM_DEBUG(dbgs() << "Folded " << *CopyMI);
2013 return true;
2014}
2015
2016bool SIFoldOperandsImpl::tryFoldFoldableCopy(
2017 MachineInstr &MI, MachineOperand *&CurrentKnownM0Val) const {
2018 Register DstReg = MI.getOperand(0).getReg();
2019 // Specially track simple redefs of m0 to the same value in a block, so we
2020 // can erase the later ones.
2021 if (DstReg == AMDGPU::M0) {
2022 MachineOperand &NewM0Val = MI.getOperand(1);
2023 if (CurrentKnownM0Val && CurrentKnownM0Val->isIdenticalTo(NewM0Val)) {
2024 MI.eraseFromParent();
2025 return true;
2026 }
2027
2028 // We aren't tracking other physical registers
2029 CurrentKnownM0Val = (NewM0Val.isReg() && NewM0Val.getReg().isPhysical())
2030 ? nullptr
2031 : &NewM0Val;
2032 return false;
2033 }
2034
2035 MachineOperand *OpToFoldPtr;
2036 if (MI.getOpcode() == AMDGPU::V_MOV_B16_t16_e64) {
2037 // Folding when any src_modifiers are non-zero is unsupported
2038 if (TII->hasAnyModifiersSet(MI))
2039 return false;
2040 OpToFoldPtr = &MI.getOperand(2);
2041 } else
2042 OpToFoldPtr = &MI.getOperand(1);
2043 MachineOperand &OpToFold = *OpToFoldPtr;
2044 bool FoldingImm = OpToFold.isImm() || OpToFold.isFI() || OpToFold.isGlobal();
2045
2046 // FIXME: We could also be folding things like TargetIndexes.
2047 if (!FoldingImm && !OpToFold.isReg())
2048 return false;
2049
2050 // Fold virtual registers and constant physical registers.
2051 if (OpToFold.isReg() && OpToFold.getReg().isPhysical() &&
2052 !TRI->isConstantPhysReg(OpToFold.getReg()))
2053 return false;
2054
2055 // Prevent folding operands backwards in the function. For example,
2056 // the COPY opcode must not be replaced by 1 in this example:
2057 //
2058 // %3 = COPY %vgpr0; VGPR_32:%3
2059 // ...
2060 // %vgpr0 = V_MOV_B32_e32 1, implicit %exec
2061 if (!DstReg.isVirtual())
2062 return false;
2063
2064 const TargetRegisterClass *DstRC =
2065 MRI->getRegClass(MI.getOperand(0).getReg());
2066
2067 // True16: Fix malformed 16-bit sgpr COPY produced by peephole-opt
2068 // Can remove this code if proper 16-bit SGPRs are implemented
2069 // Example: Pre-peephole-opt
2070 // %29:sgpr_lo16 = COPY %16.lo16:sreg_32
2071 // %32:sreg_32 = COPY %29:sgpr_lo16
2072 // %30:sreg_32 = S_PACK_LL_B32_B16 killed %31:sreg_32, killed %32:sreg_32
2073 // Post-peephole-opt and DCE
2074 // %32:sreg_32 = COPY %16.lo16:sreg_32
2075 // %30:sreg_32 = S_PACK_LL_B32_B16 killed %31:sreg_32, killed %32:sreg_32
2076 // After this transform
2077 // %32:sreg_32 = COPY %16:sreg_32
2078 // %30:sreg_32 = S_PACK_LL_B32_B16 killed %31:sreg_32, killed %32:sreg_32
2079 // After the fold operands pass
2080 // %30:sreg_32 = S_PACK_LL_B32_B16 killed %31:sreg_32, killed %16:sreg_32
2081 if (MI.getOpcode() == AMDGPU::COPY && OpToFold.isReg() &&
2082 OpToFold.getSubReg()) {
2083 if (DstRC == &AMDGPU::SReg_32RegClass &&
2084 DstRC == MRI->getRegClass(OpToFold.getReg())) {
2085 assert(OpToFold.getSubReg() == AMDGPU::lo16);
2086 OpToFold.setSubReg(0);
2087 }
2088 }
2089
2090 // Fold copy to AGPR through reg_sequence
2091 // TODO: Handle with subregister extract
2092 if (OpToFold.isReg() && MI.isCopy() && !MI.getOperand(1).getSubReg()) {
2093 if (foldCopyToAGPRRegSequence(&MI))
2094 return true;
2095 }
2096
2097 FoldableDef Def(OpToFold, DstRC);
2098 bool Changed = foldInstOperand(MI, Def);
2099
2100 // If we managed to fold all uses of this copy then we might as well
2101 // delete it now.
2102 // The only reason we need to follow chains of copies here is that
2103 // tryFoldRegSequence looks forward through copies before folding a
2104 // REG_SEQUENCE into its eventual users.
2105 auto *InstToErase = &MI;
2106 while (MRI->use_nodbg_empty(InstToErase->getOperand(0).getReg())) {
2107 auto &SrcOp = InstToErase->getOperand(1);
2108 auto SrcReg = SrcOp.isReg() ? SrcOp.getReg() : Register();
2109 InstToErase->eraseFromParent();
2110 Changed = true;
2111 InstToErase = nullptr;
2112 if (!SrcReg || SrcReg.isPhysical())
2113 break;
2114 InstToErase = MRI->getVRegDef(SrcReg);
2115 if (!InstToErase || !TII->isFoldableCopy(*InstToErase))
2116 break;
2117 }
2118
2119 if (InstToErase && InstToErase->isRegSequence() &&
2120 MRI->use_nodbg_empty(InstToErase->getOperand(0).getReg())) {
2121 InstToErase->eraseFromParent();
2122 Changed = true;
2123 }
2124
2125 if (Changed)
2126 return true;
2127
2128 // Run this after foldInstOperand to avoid turning scalar additions into
2129 // vector additions when the result scalar result could just be folded into
2130 // the user(s).
2131 return OpToFold.isReg() &&
2132 foldCopyToVGPROfScalarAddOfFrameIndex(DstReg, OpToFold.getReg(), MI);
2133}
2134
2135// Clamp patterns are canonically selected to v_max_* instructions, so only
2136// handle them.
2137const MachineOperand *
2138SIFoldOperandsImpl::isClamp(const MachineInstr &MI) const {
2139 unsigned Op = MI.getOpcode();
2140 switch (Op) {
2141 case AMDGPU::V_MAX_F32_e64:
2142 case AMDGPU::V_MAX_F16_e64:
2143 case AMDGPU::V_MAX_F16_t16_e64:
2144 case AMDGPU::V_MAX_F16_fake16_e64:
2145 case AMDGPU::V_MAX_F64_e64:
2146 case AMDGPU::V_MAX_NUM_F64_e64:
2147 case AMDGPU::V_PK_MAX_F16:
2148 case AMDGPU::V_MAX_BF16_PSEUDO_e64:
2149 case AMDGPU::V_PK_MAX_NUM_BF16: {
2150 if (MI.mayRaiseFPException())
2151 return nullptr;
2152
2153 if (!TII->getNamedOperand(MI, AMDGPU::OpName::clamp)->getImm())
2154 return nullptr;
2155
2156 // Make sure sources are identical.
2157 const MachineOperand *Src0 = TII->getNamedOperand(MI, AMDGPU::OpName::src0);
2158 const MachineOperand *Src1 = TII->getNamedOperand(MI, AMDGPU::OpName::src1);
2159 if (!Src0->isReg() || !Src1->isReg() ||
2160 Src0->getReg() != Src1->getReg() ||
2161 Src0->getSubReg() != Src1->getSubReg() ||
2162 Src0->getSubReg() != AMDGPU::NoSubRegister)
2163 return nullptr;
2164
2165 // Can't fold up if we have modifiers.
2166 if (TII->hasModifiersSet(MI, AMDGPU::OpName::omod))
2167 return nullptr;
2168
2169 unsigned Src0Mods
2170 = TII->getNamedOperand(MI, AMDGPU::OpName::src0_modifiers)->getImm();
2171 unsigned Src1Mods
2172 = TII->getNamedOperand(MI, AMDGPU::OpName::src1_modifiers)->getImm();
2173
2174 // Having a 0 op_sel_hi would require swizzling the output in the source
2175 // instruction, which we can't do.
2176 unsigned UnsetMods =
2177 (Op == AMDGPU::V_PK_MAX_F16 || Op == AMDGPU::V_PK_MAX_NUM_BF16)
2179 : 0u;
2180 if (Src0Mods != UnsetMods && Src1Mods != UnsetMods)
2181 return nullptr;
2182 return Src0;
2183 }
2184 default:
2185 return nullptr;
2186 }
2187}
2188
2189// FIXME: Clamp for v_mad_mixhi_f16 handled during isel.
2190bool SIFoldOperandsImpl::tryFoldClamp(MachineInstr &MI) {
2191 const MachineOperand *ClampSrc = isClamp(MI);
2192 if (!ClampSrc || !MRI->hasOneNonDBGUser(ClampSrc->getReg()))
2193 return false;
2194
2195 if (!ClampSrc->getReg().isVirtual())
2196 return false;
2197
2198 // Look through COPY. COPY only observed with True16.
2199 Register DefSrcReg = TRI->lookThruCopyLike(ClampSrc->getReg(), MRI);
2200 MachineInstr *Def =
2201 MRI->getVRegDef(DefSrcReg.isVirtual() ? DefSrcReg : ClampSrc->getReg());
2202
2203 // The type of clamp must be compatible.
2204 if (!SIInstrInfo::hasSameClamp(*Def, MI))
2205 return false;
2206
2207 if (Def->mayRaiseFPException())
2208 return false;
2209
2210 MachineOperand *DefClamp = TII->getNamedOperand(*Def, AMDGPU::OpName::clamp);
2211 if (!DefClamp)
2212 return false;
2213
2214 LLVM_DEBUG(dbgs() << "Folding clamp " << *DefClamp << " into " << *Def);
2215
2216 // Clamp is applied after omod, so it is OK if omod is set.
2217 DefClamp->setImm(1);
2218
2219 Register DefReg = Def->getOperand(0).getReg();
2220 Register MIDstReg = MI.getOperand(0).getReg();
2221 if (TRI->isSGPRReg(*MRI, DefReg)) {
2222 // Pseudo scalar instructions have a SGPR for dst and clamp is a v_max*
2223 // instruction with a VGPR dst.
2224 BuildMI(*MI.getParent(), MI, MI.getDebugLoc(), TII->get(AMDGPU::COPY),
2225 MIDstReg)
2226 .addReg(DefReg);
2227 } else {
2228 MRI->replaceRegWith(MIDstReg, DefReg);
2229 }
2230 MI.eraseFromParent();
2231
2232 // Use of output modifiers forces VOP3 encoding for a VOP2 mac/fmac
2233 // instruction, so we might as well convert it to the more flexible VOP3-only
2234 // mad/fma form.
2235 if (TII->convertToThreeAddress(*Def, nullptr, nullptr))
2236 Def->eraseFromParent();
2237
2238 return true;
2239}
2240
2241static int getOModValue(unsigned Opc, int64_t Val) {
2242 switch (Opc) {
2243 case AMDGPU::V_MUL_F64_e64:
2244 case AMDGPU::V_MUL_F64_pseudo_e64: {
2245 switch (Val) {
2246 case 0x3fe0000000000000: // 0.5
2247 return SIOutMods::DIV2;
2248 case 0x4000000000000000: // 2.0
2249 return SIOutMods::MUL2;
2250 case 0x4010000000000000: // 4.0
2251 return SIOutMods::MUL4;
2252 default:
2253 return SIOutMods::NONE;
2254 }
2255 }
2256 case AMDGPU::V_MUL_F32_e64: {
2257 switch (static_cast<uint32_t>(Val)) {
2258 case 0x3f000000: // 0.5
2259 return SIOutMods::DIV2;
2260 case 0x40000000: // 2.0
2261 return SIOutMods::MUL2;
2262 case 0x40800000: // 4.0
2263 return SIOutMods::MUL4;
2264 default:
2265 return SIOutMods::NONE;
2266 }
2267 }
2268 case AMDGPU::V_MUL_F16_e64:
2269 case AMDGPU::V_MUL_F16_t16_e64:
2270 case AMDGPU::V_MUL_F16_fake16_e64: {
2271 switch (static_cast<uint16_t>(Val)) {
2272 case 0x3800: // 0.5
2273 return SIOutMods::DIV2;
2274 case 0x4000: // 2.0
2275 return SIOutMods::MUL2;
2276 case 0x4400: // 4.0
2277 return SIOutMods::MUL4;
2278 default:
2279 return SIOutMods::NONE;
2280 }
2281 }
2282 default:
2283 llvm_unreachable("invalid mul opcode");
2284 }
2285}
2286
2287// FIXME: Does this really not support denormals with f16?
2288// FIXME: Does this need to check IEEE mode bit? SNaNs are generally not
2289// handled, so will anything other than that break?
2290std::pair<const MachineOperand *, int>
2291SIFoldOperandsImpl::isOMod(const MachineInstr &MI) const {
2292 unsigned Op = MI.getOpcode();
2293 switch (Op) {
2294 case AMDGPU::V_MUL_F64_e64:
2295 case AMDGPU::V_MUL_F64_pseudo_e64:
2296 case AMDGPU::V_MUL_F32_e64:
2297 case AMDGPU::V_MUL_F16_t16_e64:
2298 case AMDGPU::V_MUL_F16_fake16_e64:
2299 case AMDGPU::V_MUL_F16_e64: {
2300 // If output denormals are enabled, omod is ignored.
2301 if ((Op == AMDGPU::V_MUL_F32_e64 &&
2303 ((Op == AMDGPU::V_MUL_F64_e64 || Op == AMDGPU::V_MUL_F64_pseudo_e64 ||
2304 Op == AMDGPU::V_MUL_F16_e64 || Op == AMDGPU::V_MUL_F16_t16_e64 ||
2305 Op == AMDGPU::V_MUL_F16_fake16_e64) &&
2308 MI.mayRaiseFPException())
2309 return std::pair(nullptr, SIOutMods::NONE);
2310
2311 const MachineOperand *RegOp = nullptr;
2312 const MachineOperand *ImmOp = nullptr;
2313 const MachineOperand *Src0 = TII->getNamedOperand(MI, AMDGPU::OpName::src0);
2314 const MachineOperand *Src1 = TII->getNamedOperand(MI, AMDGPU::OpName::src1);
2315 if (Src0->isImm()) {
2316 ImmOp = Src0;
2317 RegOp = Src1;
2318 } else if (Src1->isImm()) {
2319 ImmOp = Src1;
2320 RegOp = Src0;
2321 } else
2322 return std::pair(nullptr, SIOutMods::NONE);
2323
2324 int OMod = getOModValue(Op, ImmOp->getImm());
2325 if (OMod == SIOutMods::NONE ||
2326 TII->hasModifiersSet(MI, AMDGPU::OpName::src0_modifiers) ||
2327 TII->hasModifiersSet(MI, AMDGPU::OpName::src1_modifiers) ||
2328 TII->hasModifiersSet(MI, AMDGPU::OpName::omod) ||
2329 TII->hasModifiersSet(MI, AMDGPU::OpName::clamp))
2330 return std::pair(nullptr, SIOutMods::NONE);
2331
2332 return std::pair(RegOp, OMod);
2333 }
2334 case AMDGPU::V_ADD_F64_e64:
2335 case AMDGPU::V_ADD_F64_pseudo_e64:
2336 case AMDGPU::V_ADD_F32_e64:
2337 case AMDGPU::V_ADD_F16_e64:
2338 case AMDGPU::V_ADD_F16_t16_e64:
2339 case AMDGPU::V_ADD_F16_fake16_e64: {
2340 // If output denormals are enabled, omod is ignored.
2341 if ((Op == AMDGPU::V_ADD_F32_e64 &&
2343 ((Op == AMDGPU::V_ADD_F64_e64 || Op == AMDGPU::V_ADD_F64_pseudo_e64 ||
2344 Op == AMDGPU::V_ADD_F16_e64 || Op == AMDGPU::V_ADD_F16_t16_e64 ||
2345 Op == AMDGPU::V_ADD_F16_fake16_e64) &&
2347 return std::pair(nullptr, SIOutMods::NONE);
2348
2349 // Look through the DAGCombiner canonicalization fmul x, 2 -> fadd x, x
2350 const MachineOperand *Src0 = TII->getNamedOperand(MI, AMDGPU::OpName::src0);
2351 const MachineOperand *Src1 = TII->getNamedOperand(MI, AMDGPU::OpName::src1);
2352
2353 if (Src0->isReg() && Src1->isReg() && Src0->getReg() == Src1->getReg() &&
2354 Src0->getSubReg() == Src1->getSubReg() &&
2355 !TII->hasModifiersSet(MI, AMDGPU::OpName::src0_modifiers) &&
2356 !TII->hasModifiersSet(MI, AMDGPU::OpName::src1_modifiers) &&
2357 !TII->hasModifiersSet(MI, AMDGPU::OpName::clamp) &&
2358 !TII->hasModifiersSet(MI, AMDGPU::OpName::omod))
2359 return std::pair(Src0, SIOutMods::MUL2);
2360
2361 return std::pair(nullptr, SIOutMods::NONE);
2362 }
2363 default:
2364 return std::pair(nullptr, SIOutMods::NONE);
2365 }
2366}
2367
2368// FIXME: Does this need to check IEEE bit on function?
2369bool SIFoldOperandsImpl::tryFoldOMod(MachineInstr &MI) {
2370 const MachineOperand *RegOp;
2371 int OMod;
2372 std::tie(RegOp, OMod) = isOMod(MI);
2373 if (OMod == SIOutMods::NONE || !RegOp->isReg() ||
2374 RegOp->getSubReg() != AMDGPU::NoSubRegister ||
2375 !MRI->hasOneNonDBGUser(RegOp->getReg()))
2376 return false;
2377
2378 MachineInstr *Def = MRI->getVRegDef(RegOp->getReg());
2379 MachineOperand *DefOMod = TII->getNamedOperand(*Def, AMDGPU::OpName::omod);
2380 if (!DefOMod || DefOMod->getImm() != SIOutMods::NONE)
2381 return false;
2382
2383 if (Def->mayRaiseFPException())
2384 return false;
2385
2386 // Clamp is applied after omod. If the source already has clamp set, don't
2387 // fold it.
2388 if (TII->hasModifiersSet(*Def, AMDGPU::OpName::clamp))
2389 return false;
2390
2391 LLVM_DEBUG(dbgs() << "Folding omod " << MI << " into " << *Def);
2392
2393 DefOMod->setImm(OMod);
2394 MRI->replaceRegWith(MI.getOperand(0).getReg(), Def->getOperand(0).getReg());
2395 // Kill flags can be wrong if we replaced a def inside a loop with a def
2396 // outside the loop.
2397 MRI->clearKillFlags(Def->getOperand(0).getReg());
2398 MI.eraseFromParent();
2399
2400 // Use of output modifiers forces VOP3 encoding for a VOP2 mac/fmac
2401 // instruction, so we might as well convert it to the more flexible VOP3-only
2402 // mad/fma form.
2403 if (TII->convertToThreeAddress(*Def, nullptr, nullptr))
2404 Def->eraseFromParent();
2405
2406 return true;
2407}
2408
2409// Try to fold a reg_sequence with vgpr output and agpr inputs into an
2410// instruction which can take an agpr. So far that means a store.
2411bool SIFoldOperandsImpl::tryFoldRegSequence(MachineInstr &MI) {
2412 assert(MI.isRegSequence());
2413 auto Reg = MI.getOperand(0).getReg();
2414
2415 if (!ST->hasGFX90AInsts() || !TRI->isVGPR(*MRI, Reg) ||
2416 !MRI->hasOneNonDBGUse(Reg))
2417 return false;
2418
2420 if (!getRegSeqInit(Defs, Reg))
2421 return false;
2422
2423 for (auto &[Op, SubIdx] : Defs) {
2424 if (!Op->isReg())
2425 return false;
2426 if (TRI->isAGPR(*MRI, Op->getReg()))
2427 continue;
2428 // Maybe this is a COPY from AREG
2429 const MachineInstr *SubDef = MRI->getVRegDef(Op->getReg());
2430 if (!SubDef || !SubDef->isCopy() || SubDef->getOperand(1).getSubReg())
2431 return false;
2432 if (!TRI->isAGPR(*MRI, SubDef->getOperand(1).getReg()))
2433 return false;
2434 }
2435
2436 MachineOperand *Op = &*MRI->use_nodbg_begin(Reg);
2437 MachineInstr *UseMI = Op->getParent();
2438 while (UseMI->isCopy() && !Op->getSubReg()) {
2439 Reg = UseMI->getOperand(0).getReg();
2440 if (!TRI->isVGPR(*MRI, Reg) || !MRI->hasOneNonDBGUse(Reg))
2441 return false;
2442 Op = &*MRI->use_nodbg_begin(Reg);
2443 UseMI = Op->getParent();
2444 }
2445
2446 if (Op->getSubReg())
2447 return false;
2448
2449 unsigned OpIdx = Op - &UseMI->getOperand(0);
2450 const MCInstrDesc &InstDesc = UseMI->getDesc();
2451 const TargetRegisterClass *OpRC = TII->getRegClass(InstDesc, OpIdx);
2452 if (!OpRC || !TRI->isVectorSuperClass(OpRC))
2453 return false;
2454
2455 const auto *NewDstRC = TRI->getEquivalentAGPRClass(MRI->getRegClass(Reg));
2456 auto Dst = MRI->createVirtualRegister(NewDstRC);
2457 auto RS = BuildMI(*MI.getParent(), MI, MI.getDebugLoc(),
2458 TII->get(AMDGPU::REG_SEQUENCE), Dst);
2459
2460 for (auto &[Def, SubIdx] : Defs) {
2461 Def->setIsKill(false);
2462 if (TRI->isAGPR(*MRI, Def->getReg())) {
2463 RS.add(*Def);
2464 } else { // This is a copy
2465 MachineInstr *SubDef = MRI->getVRegDef(Def->getReg());
2466 SubDef->getOperand(1).setIsKill(false);
2467 RS.addReg(SubDef->getOperand(1).getReg(), {}, Def->getSubReg());
2468 }
2469 RS.addImm(SubIdx);
2470 }
2471
2472 Op->setReg(Dst);
2473 if (!TII->isOperandLegal(*UseMI, OpIdx, Op)) {
2474 Op->setReg(Reg);
2475 RS->eraseFromParent();
2476 return false;
2477 }
2478
2479 LLVM_DEBUG(dbgs() << "Folded " << *RS << " into " << *UseMI);
2480
2481 // Erase the REG_SEQUENCE eagerly, unless we followed a chain of COPY users,
2482 // in which case we can erase them all later in runOnMachineFunction.
2483 if (MRI->use_nodbg_empty(MI.getOperand(0).getReg()))
2484 MI.eraseFromParent();
2485 return true;
2486}
2487
2488/// Checks whether \p Copy is a AGPR -> VGPR copy. Returns `true` on success and
2489/// stores the AGPR register in \p OutReg and the subreg in \p OutSubReg
2490static bool isAGPRCopy(const SIRegisterInfo &TRI,
2491 const MachineRegisterInfo &MRI, const MachineInstr &Copy,
2492 Register &OutReg, unsigned &OutSubReg) {
2493 assert(Copy.isCopy());
2494
2495 const MachineOperand &CopySrc = Copy.getOperand(1);
2496 Register CopySrcReg = CopySrc.getReg();
2497 if (!CopySrcReg.isVirtual())
2498 return false;
2499
2500 // Common case: copy from AGPR directly, e.g.
2501 // %1:vgpr_32 = COPY %0:agpr_32
2502 if (TRI.isAGPR(MRI, CopySrcReg)) {
2503 OutReg = CopySrcReg;
2504 OutSubReg = CopySrc.getSubReg();
2505 return true;
2506 }
2507
2508 // Sometimes it can also involve two copies, e.g.
2509 // %1:vgpr_256 = COPY %0:agpr_256
2510 // %2:vgpr_32 = COPY %1:vgpr_256.sub0
2511 const MachineInstr *CopySrcDef = MRI.getVRegDef(CopySrcReg);
2512 if (!CopySrcDef || !CopySrcDef->isCopy())
2513 return false;
2514
2515 const MachineOperand &OtherCopySrc = CopySrcDef->getOperand(1);
2516 Register OtherCopySrcReg = OtherCopySrc.getReg();
2517 if (!OtherCopySrcReg.isVirtual() ||
2518 CopySrcDef->getOperand(0).getSubReg() != AMDGPU::NoSubRegister ||
2519 OtherCopySrc.getSubReg() != AMDGPU::NoSubRegister ||
2520 !TRI.isAGPR(MRI, OtherCopySrcReg))
2521 return false;
2522
2523 OutReg = OtherCopySrcReg;
2524 OutSubReg = CopySrc.getSubReg();
2525 return true;
2526}
2527
2528// Try to hoist an AGPR to VGPR copy across a PHI.
2529// This should allow folding of an AGPR into a consumer which may support it.
2530//
2531// Example 1: LCSSA PHI
2532// loop:
2533// %1:vreg = COPY %0:areg
2534// exit:
2535// %2:vreg = PHI %1:vreg, %loop
2536// =>
2537// loop:
2538// exit:
2539// %1:areg = PHI %0:areg, %loop
2540// %2:vreg = COPY %1:areg
2541//
2542// Example 2: PHI with multiple incoming values:
2543// entry:
2544// %1:vreg = GLOBAL_LOAD(..)
2545// loop:
2546// %2:vreg = PHI %1:vreg, %entry, %5:vreg, %loop
2547// %3:areg = COPY %2:vreg
2548// %4:areg = (instr using %3:areg)
2549// %5:vreg = COPY %4:areg
2550// =>
2551// entry:
2552// %1:vreg = GLOBAL_LOAD(..)
2553// %2:areg = COPY %1:vreg
2554// loop:
2555// %3:areg = PHI %2:areg, %entry, %X:areg,
2556// %4:areg = (instr using %3:areg)
2557bool SIFoldOperandsImpl::tryFoldPhiAGPR(MachineInstr &PHI) {
2558 assert(PHI.isPHI());
2559
2560 Register PhiOut = PHI.getOperand(0).getReg();
2561 if (!TRI->isVGPR(*MRI, PhiOut))
2562 return false;
2563
2564 // Iterate once over all incoming values of the PHI to check if this PHI is
2565 // eligible, and determine the exact AGPR RC we'll target.
2566 const TargetRegisterClass *ARC = nullptr;
2567 for (unsigned K = 1; K < PHI.getNumExplicitOperands(); K += 2) {
2568 MachineOperand &MO = PHI.getOperand(K);
2569 MachineInstr *Copy = MRI->getVRegDef(MO.getReg());
2570 if (!Copy || !Copy->isCopy())
2571 continue;
2572
2573 Register AGPRSrc;
2574 unsigned AGPRRegMask = AMDGPU::NoSubRegister;
2575 if (!isAGPRCopy(*TRI, *MRI, *Copy, AGPRSrc, AGPRRegMask))
2576 continue;
2577
2578 const TargetRegisterClass *CopyInRC = MRI->getRegClass(AGPRSrc);
2579 if (const auto *SubRC = TRI->getSubRegisterClass(CopyInRC, AGPRRegMask))
2580 CopyInRC = SubRC;
2581
2582 if (ARC && !ARC->hasSubClassEq(CopyInRC))
2583 return false;
2584 ARC = CopyInRC;
2585 }
2586
2587 if (!ARC)
2588 return false;
2589
2590 bool IsAGPR32 = (ARC == &AMDGPU::AGPR_32RegClass);
2591
2592 // Rewrite the PHI's incoming values to ARC.
2593 LLVM_DEBUG(dbgs() << "Folding AGPR copies into: " << PHI);
2594 for (unsigned K = 1; K < PHI.getNumExplicitOperands(); K += 2) {
2595 MachineOperand &MO = PHI.getOperand(K);
2596 Register Reg = MO.getReg();
2597
2599 MachineBasicBlock *InsertMBB = nullptr;
2600
2601 // Look at the def of Reg, ignoring all copies.
2602 unsigned CopyOpc = AMDGPU::COPY;
2603 if (MachineInstr *Def = MRI->getVRegDef(Reg)) {
2604
2605 // Look at pre-existing COPY instructions from ARC: Steal the operand. If
2606 // the copy was single-use, it will be removed by DCE later.
2607 if (Def->isCopy()) {
2608 Register AGPRSrc;
2609 unsigned AGPRSubReg = AMDGPU::NoSubRegister;
2610 if (isAGPRCopy(*TRI, *MRI, *Def, AGPRSrc, AGPRSubReg)) {
2611 MO.setReg(AGPRSrc);
2612 MO.setSubReg(AGPRSubReg);
2613 continue;
2614 }
2615
2616 // If this is a multi-use SGPR -> VGPR copy, use V_ACCVGPR_WRITE on
2617 // GFX908 directly instead of a COPY. Otherwise, SIFoldOperand may try
2618 // to fold the sgpr -> vgpr -> agpr copy into a sgpr -> agpr copy which
2619 // is unlikely to be profitable.
2620 //
2621 // Note that V_ACCVGPR_WRITE is only used for AGPR_32.
2622 MachineOperand &CopyIn = Def->getOperand(1);
2623 if (IsAGPR32 && !ST->hasGFX90AInsts() && !MRI->hasOneNonDBGUse(Reg) &&
2624 TRI->isSGPRReg(*MRI, CopyIn.getReg()))
2625 CopyOpc = AMDGPU::V_ACCVGPR_WRITE_B32_e64;
2626 }
2627
2628 InsertMBB = Def->getParent();
2629 InsertPt = InsertMBB->SkipPHIsLabelsAndDebug(++Def->getIterator());
2630 } else {
2631 InsertMBB = PHI.getOperand(MO.getOperandNo() + 1).getMBB();
2632 InsertPt = InsertMBB->getFirstTerminator();
2633 }
2634
2635 Register NewReg = MRI->createVirtualRegister(ARC);
2636 MachineInstr *MI = BuildMI(*InsertMBB, InsertPt, PHI.getDebugLoc(),
2637 TII->get(CopyOpc), NewReg)
2638 .addReg(Reg);
2639 MO.setReg(NewReg);
2640
2641 (void)MI;
2642 LLVM_DEBUG(dbgs() << " Created COPY: " << *MI);
2643 }
2644
2645 // Replace the PHI's result with a new register.
2646 Register NewReg = MRI->createVirtualRegister(ARC);
2647 PHI.getOperand(0).setReg(NewReg);
2648
2649 // COPY that new register back to the original PhiOut register. This COPY will
2650 // usually be folded out later.
2651 MachineBasicBlock *MBB = PHI.getParent();
2652 BuildMI(*MBB, MBB->getFirstNonPHI(), PHI.getDebugLoc(),
2653 TII->get(AMDGPU::COPY), PhiOut)
2654 .addReg(NewReg);
2655
2656 LLVM_DEBUG(dbgs() << " Done: Folded " << PHI);
2657 return true;
2658}
2659
2660// Attempt to convert VGPR load to an AGPR load.
2661bool SIFoldOperandsImpl::tryFoldLoad(MachineInstr &MI) {
2662 assert(MI.mayLoad());
2663 if (!ST->hasGFX90AInsts() || MI.getNumExplicitDefs() != 1)
2664 return false;
2665
2666 MachineOperand &Def = MI.getOperand(0);
2667 if (!Def.isDef())
2668 return false;
2669
2670 Register DefReg = Def.getReg();
2671
2672 if (DefReg.isPhysical() || !TRI->isVGPR(*MRI, DefReg))
2673 return false;
2674
2677 SmallVector<Register, 8> MoveRegs;
2678
2679 if (Users.empty())
2680 return false;
2681
2682 // Check that all uses a copy to an agpr or a reg_sequence producing an agpr.
2683 while (!Users.empty()) {
2684 const MachineInstr *I = Users.pop_back_val();
2685 if (!I->isCopy() && !I->isRegSequence())
2686 return false;
2687 Register DstReg = I->getOperand(0).getReg();
2688 // Physical registers may have more than one instruction definitions
2689 if (DstReg.isPhysical())
2690 return false;
2691 if (TRI->isAGPR(*MRI, DstReg))
2692 continue;
2693 MoveRegs.push_back(DstReg);
2694 for (const MachineInstr &U : MRI->use_nodbg_instructions(DstReg))
2695 Users.push_back(&U);
2696 }
2697
2698 const TargetRegisterClass *RC = MRI->getRegClass(DefReg);
2699 MRI->setRegClass(DefReg, TRI->getEquivalentAGPRClass(RC));
2700 if (!TII->isOperandLegal(MI, 0, &Def)) {
2701 MRI->setRegClass(DefReg, RC);
2702 return false;
2703 }
2704
2705 while (!MoveRegs.empty()) {
2706 Register Reg = MoveRegs.pop_back_val();
2707 MRI->setRegClass(Reg, TRI->getEquivalentAGPRClass(MRI->getRegClass(Reg)));
2708 }
2709
2710 LLVM_DEBUG(dbgs() << "Folded " << MI);
2711
2712 return true;
2713}
2714
2715// tryFoldPhiAGPR will aggressively try to create AGPR PHIs.
2716// For GFX90A and later, this is pretty much always a good thing, but for GFX908
2717// there's cases where it can create a lot more AGPR-AGPR copies, which are
2718// expensive on this architecture due to the lack of V_ACCVGPR_MOV.
2719//
2720// This function looks at all AGPR PHIs in a basic block and collects their
2721// operands. Then, it checks for register that are used more than once across
2722// all PHIs and caches them in a VGPR. This prevents ExpandPostRAPseudo from
2723// having to create one VGPR temporary per use, which can get very messy if
2724// these PHIs come from a broken-up large PHI (e.g. 32 AGPR phis, one per vector
2725// element).
2726//
2727// Example
2728// a:
2729// %in:agpr_256 = COPY %foo:vgpr_256
2730// c:
2731// %x:agpr_32 = ..
2732// b:
2733// %0:areg = PHI %in.sub0:agpr_32, %a, %x, %c
2734// %1:areg = PHI %in.sub0:agpr_32, %a, %y, %c
2735// %2:areg = PHI %in.sub0:agpr_32, %a, %z, %c
2736// =>
2737// a:
2738// %in:agpr_256 = COPY %foo:vgpr_256
2739// %tmp:vgpr_32 = V_ACCVGPR_READ_B32_e64 %in.sub0:agpr_32
2740// %tmp_agpr:agpr_32 = COPY %tmp
2741// c:
2742// %x:agpr_32 = ..
2743// b:
2744// %0:areg = PHI %tmp_agpr, %a, %x, %c
2745// %1:areg = PHI %tmp_agpr, %a, %y, %c
2746// %2:areg = PHI %tmp_agpr, %a, %z, %c
2747bool SIFoldOperandsImpl::tryOptimizeAGPRPhis(MachineBasicBlock &MBB) {
2748 // This is only really needed on GFX908 where AGPR-AGPR copies are
2749 // unreasonably difficult.
2750 if (ST->hasGFX90AInsts())
2751 return false;
2752
2753 // Look at all AGPR Phis and collect the register + subregister used.
2754 DenseMap<std::pair<Register, unsigned>, std::vector<MachineOperand *>>
2755 RegToMO;
2756
2757 for (auto &MI : MBB) {
2758 if (!MI.isPHI())
2759 break;
2760
2761 if (!TRI->isAGPR(*MRI, MI.getOperand(0).getReg()))
2762 continue;
2763
2764 for (unsigned K = 1; K < MI.getNumOperands(); K += 2) {
2765 MachineOperand &PhiMO = MI.getOperand(K);
2766 if (!PhiMO.getSubReg())
2767 continue;
2768 RegToMO[{PhiMO.getReg(), PhiMO.getSubReg()}].push_back(&PhiMO);
2769 }
2770 }
2771
2772 // For all (Reg, SubReg) pair that are used more than once, cache the value in
2773 // a VGPR.
2774 bool Changed = false;
2775 for (const auto &[Entry, MOs] : RegToMO) {
2776 if (MOs.size() == 1)
2777 continue;
2778
2779 const auto [Reg, SubReg] = Entry;
2780 MachineInstr *Def = MRI->getVRegDef(Reg);
2781 MachineBasicBlock *DefMBB = Def->getParent();
2782
2783 // Create a copy in a VGPR using V_ACCVGPR_READ_B32_e64 so it's not folded
2784 // out.
2785 const TargetRegisterClass *ARC = getRegOpRC(*MRI, *TRI, *MOs.front());
2786 Register TempVGPR =
2787 MRI->createVirtualRegister(TRI->getEquivalentVGPRClass(ARC));
2788 MachineInstr *VGPRCopy =
2789 BuildMI(*DefMBB, ++Def->getIterator(), Def->getDebugLoc(),
2790 TII->get(AMDGPU::V_ACCVGPR_READ_B32_e64), TempVGPR)
2791 .addReg(Reg, /* flags */ {}, SubReg);
2792
2793 // Copy back to an AGPR and use that instead of the AGPR subreg in all MOs.
2794 Register TempAGPR = MRI->createVirtualRegister(ARC);
2795 BuildMI(*DefMBB, ++VGPRCopy->getIterator(), Def->getDebugLoc(),
2796 TII->get(AMDGPU::COPY), TempAGPR)
2797 .addReg(TempVGPR);
2798
2799 LLVM_DEBUG(dbgs() << "Caching AGPR into VGPR: " << *VGPRCopy);
2800 for (MachineOperand *MO : MOs) {
2801 MO->setReg(TempAGPR);
2802 MO->setSubReg(AMDGPU::NoSubRegister);
2803 LLVM_DEBUG(dbgs() << " Changed PHI Operand: " << *MO << "\n");
2804 }
2805
2806 Changed = true;
2807 }
2808
2809 return Changed;
2810}
2811
2812bool SIFoldOperandsImpl::run(MachineFunction &MF) {
2813 this->MF = &MF;
2814 MRI = &MF.getRegInfo();
2815 ST = &MF.getSubtarget<GCNSubtarget>();
2816 TII = ST->getInstrInfo();
2817 TRI = &TII->getRegisterInfo();
2818 MFI = MF.getInfo<SIMachineFunctionInfo>();
2819
2820 // omod is ignored by hardware if IEEE bit is enabled. omod also does not
2821 // correctly handle signed zeros.
2822 //
2823 // FIXME: Also need to check strictfp
2824 bool IsIEEEMode = MFI->getMode().IEEE;
2825
2826 bool Changed = false;
2827 for (MachineBasicBlock *MBB : depth_first(&MF)) {
2828 MachineOperand *CurrentKnownM0Val = nullptr;
2829 for (auto &MI : make_early_inc_range(*MBB)) {
2830 Changed |= tryFoldCndMask(MI);
2831
2832 if (tryFoldZeroHighBits(MI)) {
2833 Changed = true;
2834 continue;
2835 }
2836
2837 if (MI.isRegSequence() && tryFoldRegSequence(MI)) {
2838 Changed = true;
2839 continue;
2840 }
2841
2842 if (MI.isPHI() && tryFoldPhiAGPR(MI)) {
2843 Changed = true;
2844 continue;
2845 }
2846
2847 if (MI.mayLoad() && tryFoldLoad(MI)) {
2848 Changed = true;
2849 continue;
2850 }
2851
2852 if (TII->isFoldableCopy(MI)) {
2853 Changed |= tryFoldFoldableCopy(MI, CurrentKnownM0Val);
2854 continue;
2855 }
2856
2857 // Saw an unknown clobber of m0, so we no longer know what it is.
2858 if (CurrentKnownM0Val && MI.modifiesRegister(AMDGPU::M0, TRI))
2859 CurrentKnownM0Val = nullptr;
2860
2861 // TODO: Omod might be OK if there is NSZ only on the source
2862 // instruction, and not the omod multiply.
2863 if (IsIEEEMode || !MI.getFlag(MachineInstr::FmNsz) || !tryFoldOMod(MI))
2864 Changed |= tryFoldClamp(MI);
2865 }
2866
2867 Changed |= tryOptimizeAGPRPhis(*MBB);
2868 }
2869
2870 return Changed;
2871}
2872
2875 MFPropsModifier _(*this, MF);
2876
2877 bool Changed = SIFoldOperandsImpl().run(MF);
2878 if (!Changed) {
2879 return PreservedAnalyses::all();
2880 }
2882 PA.preserveSet<CFGAnalyses>();
2883 return PA;
2884}
MachineInstrBuilder & UseMI
MachineInstrBuilder MachineInstrBuilder & DefMI
assert(UImm &&(UImm !=~static_cast< T >(0)) &&"Invalid immediate!")
aarch64 promote const
Provides AMDGPU specific target descriptions.
Rewrite undef for PHI
MachineBasicBlock & MBB
MachineBasicBlock MachineBasicBlock::iterator DebugLoc DL
static GCRegistry::Add< CoreCLRGC > E("coreclr", "CoreCLR-compatible GC")
static GCRegistry::Add< OcamlGC > B("ocaml", "ocaml 3.10-compatible GC")
static bool updateOperand(Instruction *Inst, unsigned Idx, Instruction *Mat)
Updates the operand at Idx in instruction Inst with the result of instruction Mat.
This file builds on the ADT/GraphTraits.h file to build generic depth first graph iterator.
AMD GCN specific subclass of TargetSubtarget.
#define DEBUG_TYPE
static Register UseReg(const MachineOperand &MO)
const HexagonInstrInfo * TII
#define _
IRTranslator LLVM IR MI
iv Induction Variable Users
Definition IVUsers.cpp:48
#define I(x, y, z)
Definition MD5.cpp:57
Register Reg
Register const TargetRegisterInfo * TRI
Promote Memory to Register
Definition Mem2Reg.cpp:110
static MCRegister getReg(const MCDisassembler *D, unsigned RC, unsigned RegNo)
static bool isReg(const MCInst &MI, unsigned OpNo)
MachineInstr unsigned OpIdx
if(auto Err=PB.parsePassPipeline(MPM, Passes)) return wrap(std MPM run * Mod
if(PassOpts->AAPipeline)
#define INITIALIZE_PASS(passName, arg, name, cfg, analysis)
Definition PassSupport.h:56
static unsigned macToMad(unsigned Opc)
static bool isAGPRCopy(const SIRegisterInfo &TRI, const MachineRegisterInfo &MRI, const MachineInstr &Copy, Register &OutReg, unsigned &OutSubReg)
Checks whether Copy is an AGPR -> VGPR copy.
static void appendFoldCandidate(SmallVectorImpl< FoldCandidate > &FoldList, FoldCandidate &&Entry)
static const TargetRegisterClass * getRegOpRC(const MachineRegisterInfo &MRI, const TargetRegisterInfo &TRI, const MachineOperand &MO)
static bool evalBinaryInstruction(unsigned Opcode, int32_t &Result, uint32_t LHS, uint32_t RHS)
static int getOModValue(unsigned Opc, int64_t Val)
static unsigned getMovOpc(bool IsScalar)
static MachineOperand * lookUpCopyChain(const SIInstrInfo &TII, const MachineRegisterInfo &MRI, Register SrcReg)
static bool checkImmOpForPKF32InstrReplicatesLower32BitsOfScalarOperand(const FoldableDef &OpToFold)
static bool isPKF32InstrReplicatesLower32BitsOfScalarOperand(const GCNSubtarget *ST, MachineInstr *MI, unsigned OpNo)
Interface definition for SIInstrInfo.
Interface definition for SIRegisterInfo.
#define LLVM_DEBUG(...)
Definition Debug.h:119
static int Lookup(ArrayRef< TableEntry > Table, unsigned Opcode)
Value * RHS
Value * LHS
Represent the analysis usage information of a pass.
LLVM_ABI void setPreservesCFG()
This function should be called by the pass, iff they do not:
Definition Pass.cpp:275
Represents analyses that only rely on functions' control flow.
Definition Analysis.h:73
FunctionPass class - This class is used to implement most global optimizations.
Definition Pass.h:314
const SIInstrInfo * getInstrInfo() const override
bool hasDOTOpSelHazard() const
bool zeroesHigh16BitsOfDest(unsigned Opcode) const
Returns if the result of this instruction with a 16-bit result returned in a 32-bit register implicit...
const HexagonRegisterInfo & getRegisterInfo() const
ArrayRef< MCOperandInfo > operands() const
int getOperandConstraint(unsigned OpNum, MCOI::OperandConstraint Constraint) const
Returns the value of the specified operand constraint if it is present.
bool isVariadic() const
Return true if this instruction can have a variable number of operands.
This holds information about one operand of a machine instruction, indicating the register class for ...
Definition MCInstrDesc.h:86
uint8_t OperandType
Information about the type of the operand.
Definition MCInstrDesc.h:98
An RAII based helper class to modify MachineFunctionProperties when running pass.
LLVM_ABI iterator SkipPHIsLabelsAndDebug(iterator I, Register Reg=Register(), bool SkipPseudoOp=true)
Return the first instruction in MBB after I that is not a PHI, label or debug.
LLVM_ABI LivenessQueryResult computeRegisterLiveness(const TargetRegisterInfo *TRI, MCRegister Reg, const_iterator Before, unsigned Neighborhood=10) const
Return whether (physical) register Reg has been defined and not killed as of just before Before.
LLVM_ABI iterator getFirstTerminator()
Returns an iterator to the first terminator instruction of this basic block.
LLVM_ABI iterator getFirstNonPHI()
Returns a pointer to the first instruction in this block that is not a PHINode instruction.
const MachineFunction * getParent() const
Return the MachineFunction containing this basic block.
MachineInstrBundleIterator< MachineInstr > iterator
LivenessQueryResult
Possible outcome of a register liveness query to computeRegisterLiveness()
@ LQR_Dead
Register is known to be fully dead.
MachineFunctionPass - This class adapts the FunctionPass interface to allow convenient creation of pa...
void getAnalysisUsage(AnalysisUsage &AU) const override
getAnalysisUsage - Subclasses that override getAnalysisUsage must call this.
Properties which a MachineFunction may have at a given point in time.
const TargetSubtargetInfo & getSubtarget() const
getSubtarget - Return the subtarget for which this machine code is being compiled.
MachineRegisterInfo & getRegInfo()
getRegInfo - Return information about the registers currently in use.
Function & getFunction()
Return the LLVM function that this machine code represents.
Ty * getInfo()
getInfo - Keep track of various per-function pieces of information for backends that would like to do...
Register getReg(unsigned Idx) const
Get the register for the operand index.
const MachineInstrBuilder & setOperandDead(unsigned OpIdx) const
const MachineInstrBuilder & addReg(Register RegNo, RegState Flags={}, unsigned SubReg=0) const
Add a new virtual register operand.
const MachineInstrBuilder & add(const MachineOperand &MO) const
const MachineInstrBuilder & setMIFlags(unsigned Flags) const
Representation of each machine instruction.
unsigned getOpcode() const
Returns the opcode of this MachineInstr.
bool isCopy() const
const MachineBasicBlock * getParent() const
bool readsRegister(Register Reg, const TargetRegisterInfo *TRI) const
Return true if the MachineInstr reads the specified register.
unsigned getNumOperands() const
Returns the total number of operands.
LLVM_ABI void addOperand(MachineFunction &MF, const MachineOperand &Op)
Add the specified operand to the instruction.
unsigned getOperandNo(const_mop_iterator I) const
Returns the number of the operand iterator I points to.
LLVM_ABI unsigned getNumExplicitOperands() const
Returns the number of non-implicit operands.
mop_range implicit_operands()
const MCInstrDesc & getDesc() const
Returns the target instruction descriptor of this MachineInstr.
void clearFlag(MIFlag Flag)
clearFlag - Clear a MI flag.
bool isRegSequence() const
LLVM_ABI void setDesc(const MCInstrDesc &TID)
Replace the instruction descriptor (thus opcode) of the current instruction with a new one.
LLVM_ABI const MachineFunction * getMF() const
Return the function that contains the basic block that this instruction belongs to.
MachineOperand * mop_iterator
iterator/begin/end - Iterate over all operands of a machine instruction.
const DebugLoc & getDebugLoc() const
Returns the debug location id of this MachineInstr.
LLVM_ABI void removeOperand(unsigned OpNo)
Erase an operand from an instruction, leaving it with one fewer operand than it started with.
const MachineOperand & getOperand(unsigned i) const
MachineOperand class - Representation of each machine instruction operand.
void setSubReg(unsigned subReg)
unsigned getSubReg() const
LLVM_ABI unsigned getOperandNo() const
Returns the index of this operand in the instruction that it belongs to.
LLVM_ABI void substVirtReg(Register Reg, unsigned SubIdx, const TargetRegisterInfo &)
substVirtReg - Substitute the current register with the virtual subregister Reg:SubReg.
LLVM_ABI void ChangeToFrameIndex(int Idx, unsigned TargetFlags=0)
Replace this operand with a frame index.
void setImm(int64_t immVal)
int64_t getImm() const
bool isReg() const
isReg - Tests if this is a MO_Register operand.
LLVM_ABI void setReg(Register Reg)
Change the register this operand corresponds to.
bool isImm() const
isImm - Tests if this is a MO_Immediate operand.
LLVM_ABI void ChangeToImmediate(int64_t ImmVal, unsigned TargetFlags=0)
ChangeToImmediate - Replace this operand with a new immediate operand of the specified value.
LLVM_ABI void ChangeToGA(const GlobalValue *GV, int64_t Offset, unsigned TargetFlags=0)
ChangeToGA - Replace this operand with a new global address operand.
void setIsKill(bool Val=true)
LLVM_ABI void ChangeToRegister(Register Reg, bool isDef, bool isImp=false, bool isKill=false, bool isDead=false, bool isUndef=false, bool isDebug=false)
ChangeToRegister - Replace this operand with a new register operand of the specified value.
MachineInstr * getParent()
getParent - Return the instruction that this operand belongs to.
LLVM_ABI void substPhysReg(MCRegister Reg, const TargetRegisterInfo &)
substPhysReg - Substitute the current register with the physical register Reg, taking any existing Su...
static MachineOperand CreateImm(int64_t Val)
bool isGlobal() const
isGlobal - Tests if this is a MO_GlobalAddress operand.
MachineOperandType getType() const
getType - Returns the MachineOperandType for this operand.
void setIsUndef(bool Val=true)
Register getReg() const
getReg - Returns the register number.
bool isFI() const
isFI - Tests if this is a MO_FrameIndex operand.
LLVM_ABI bool isIdenticalTo(const MachineOperand &Other) const
Returns true if this operand is identical to the specified operand except for liveness related flags ...
@ MO_Immediate
Immediate operand.
@ MO_GlobalAddress
Address of a global value.
@ MO_FrameIndex
Abstract Stack Frame Index.
@ MO_Register
Register operand.
static MachineOperand CreateFI(int Idx)
MachineRegisterInfo - Keep track of information for virtual and physical registers,...
LLVM_ABI bool hasOneNonDBGUse(Register RegNo) const
hasOneNonDBGUse - Return true if there is exactly one non-Debug use of the specified register.
use_nodbg_iterator use_nodbg_begin(Register RegNo) const
const TargetRegisterClass * getRegClass(Register Reg) const
Return the register class of the specified virtual register.
LLVM_ABI void clearKillFlags(Register Reg) const
clearKillFlags - Iterate over all the uses of the given register and clear the kill flag from the Mac...
LLVM_ABI MachineInstr * getVRegDef(Register Reg) const
getVRegDef - Return the machine instr that defines the specified virtual register or null if none is ...
iterator_range< use_nodbg_iterator > use_nodbg_operands(Register Reg) const
bool use_nodbg_empty(Register RegNo) const
use_nodbg_empty - Return true if there are no non-Debug instructions using the specified register.
LLVM_ABI Register createVirtualRegister(const TargetRegisterClass *RegClass, StringRef Name="")
createVirtualRegister - Create and return a new virtual register in the function with the specified r...
LLVM_ABI bool hasOneNonDBGUser(Register RegNo) const
hasOneNonDBGUser - Return true if there is exactly one non-Debug instruction using the specified regis...
iterator_range< use_instr_nodbg_iterator > use_nodbg_instructions(Register Reg) const
void setRegAllocationHint(Register VReg, unsigned Type, Register PrefReg)
setRegAllocationHint - Specify a register allocation hint for the specified virtual register.
LLVM_ABI void setRegClass(Register Reg, const TargetRegisterClass *RC)
setRegClass - Set the register class of the specified virtual register.
LLVM_ABI const TargetRegisterClass * constrainRegClass(Register Reg, const TargetRegisterClass *RC, unsigned MinNumRegs=0)
constrainRegClass - Constrain the register class of the specified virtual register to be a common sub...
LLVM_ABI void replaceRegWith(Register FromReg, Register ToReg)
replaceRegWith - Replace all instances of FromReg with ToReg in the machine function.
A set of analyses that are preserved following a run of a transformation pass.
Definition Analysis.h:112
static PreservedAnalyses all()
Construct a special preserved set that preserves all passes.
Definition Analysis.h:118
Wrapper class representing virtual and physical registers.
Definition Register.h:20
constexpr bool isVirtual() const
Return true if the specified register number is in the virtual register namespace.
Definition Register.h:79
constexpr bool isPhysical() const
Return true if the specified register number is in the physical register namespace.
Definition Register.h:83
PreservedAnalyses run(MachineFunction &MF, MachineFunctionAnalysisManager &MFAM)
static bool hasSameClamp(const MachineInstr &A, const MachineInstr &B)
static std::optional< int64_t > extractSubregFromImm(int64_t ImmVal, unsigned SubRegIndex)
Return the extracted immediate value in a subregister use from a constant materialized in a super reg...
This class keeps track of the SPI_SP_INPUT_ADDR config register, which tells the hardware which inter...
Register getScratchRSrcReg() const
Returns the physical register reserved for use as the resource descriptor for scratch accesses.
SIModeRegisterDefaults getMode() const
bool insert(const value_type &X)
Insert a new element into the SetVector.
Definition SetVector.h:151
This class consists of common code factored out of the SmallVector class to reduce code duplication b...
reference emplace_back(ArgTypes &&... Args)
void push_back(const T &Elt)
Register getReg() const
Represent a constant reference to a string, i.e.
Definition StringRef.h:56
static const unsigned CommuteAnyOperandIndex
bool contains(Register Reg) const
Return true if the specified register is included in this register class.
bool hasSubClassEq(const TargetRegisterClass *RC) const
Returns true if RC is a sub-class of or equal to this class.
bool hasSuperClassEq(const TargetRegisterClass *RC) const
Returns true if RC is a super-class of or equal to this class.
TargetRegisterInfo base class - We assume that the target defines a static array of TargetRegisterDes...
self_iterator getIterator()
Definition ilist_node.h:123
IteratorT end() const
IteratorT begin() const
Changed
#define llvm_unreachable(msg)
Marks that the current location is not supposed to be reachable.
bool isInlinableLiteralV216(uint32_t Literal, uint8_t OpType)
LLVM_READONLY int32_t getMFMAEarlyClobberOp(uint32_t Opcode)
LLVM_READONLY bool hasNamedOperand(uint64_t Opcode, OpName NamedIdx)
LLVM_READONLY int32_t getVOPe32(uint32_t Opcode)
constexpr bool isSISrcOperand(const MCOperandInfo &OpInfo)
Is this an AMDGPU specific source operand?
@ OPERAND_REG_IMM_V2FP64
Definition SIDefines.h:433
@ OPERAND_REG_IMM_V2FP16
Definition SIDefines.h:426
@ OPERAND_REG_INLINE_C_FP64
Definition SIDefines.h:442
@ OPERAND_REG_INLINE_C_V2BF16
Definition SIDefines.h:444
@ OPERAND_REG_IMM_V2INT64
Definition SIDefines.h:429
@ OPERAND_REG_IMM_V2INT16
Definition SIDefines.h:428
@ OPERAND_REG_IMM_V2BF16
Definition SIDefines.h:425
@ OPERAND_REG_INLINE_C_INT64
Definition SIDefines.h:438
@ OPERAND_REG_IMM_NOINLINE_V2FP16
Definition SIDefines.h:430
@ OPERAND_REG_INLINE_C_V2FP16
Definition SIDefines.h:445
@ OPERAND_REG_INLINE_AC_INT32
Operands with an AccVGPR register or inline constant.
Definition SIDefines.h:456
@ OPERAND_REG_INLINE_AC_FP32
Definition SIDefines.h:457
@ OPERAND_REG_INLINE_C_FP32
Definition SIDefines.h:441
@ OPERAND_REG_INLINE_C_INT32
Definition SIDefines.h:437
@ OPERAND_REG_INLINE_C_V2INT16
Definition SIDefines.h:443
@ OPERAND_REG_IMM_V2FP32
Definition SIDefines.h:432
@ OPERAND_REG_INLINE_AC_FP64
Definition SIDefines.h:458
LLVM_READONLY int32_t getFlatScratchInstSSfromSV(uint32_t Opcode)
bool supportsScaleOffset(const MCInstrInfo &MII, unsigned Opcode)
@ Entry
Definition COFF.h:862
unsigned ID
LLVM IR allows arbitrary numbers to be used as calling convention identifiers.
Definition CallingConv.h:24
constexpr bool isVOP3(const T &...O)
Definition SIDefines.h:234
constexpr bool isMAI(const T &...O)
Definition SIDefines.h:345
constexpr bool isSWMMAC(const T &...O)
Definition SIDefines.h:372
constexpr bool isVOP3P(const T &...O)
Definition SIDefines.h:237
constexpr bool isWMMA(const T &...O)
Definition SIDefines.h:360
constexpr bool isDOT(const T &...O)
Definition SIDefines.h:348
constexpr bool isPacked(const T &...O)
Definition SIDefines.h:330
NodeAddr< DefNode * > Def
Definition RDFGraph.h:384
This is an optimization pass for GlobalISel generic memory operations.
TargetInstrInfo::RegSubRegPair getRegSubRegPair(const MachineOperand &O)
Create RegSubRegPair from a register MachineOperand.
MachineBasicBlock::instr_iterator getBundleStart(MachineBasicBlock::instr_iterator I)
Returns an iterator to the first instruction in the bundle containing I.
MachineInstrBuilder BuildMI(MachineFunction &MF, const MIMetadata &MIMD, const MCInstrDesc &MCID)
Builder interface. Specify how to create the initial instruction itself.
bool execMayBeModifiedBeforeUse(const MachineRegisterInfo &MRI, Register VReg, const MachineInstr &DefMI, const MachineInstr &UseMI)
Return false if EXEC is not changed between the def of VReg at DefMI and the use at UseMI.
void append_range(Container &C, Range &&R)
Wrapper function to append range R to container C.
Definition STLExtras.h:2208
iterator_range< early_inc_iterator_impl< detail::IterOfRange< RangeT > > > make_early_inc_range(RangeT &&Range)
Make a range that does early increment to allow mutation of the underlying range without disrupting i...
Definition STLExtras.h:633
AnalysisManager< MachineFunction > MachineFunctionAnalysisManager
Op::Description Desc
LLVM_ABI PreservedAnalyses getMachineFunctionPassPreservedAnalyses()
Returns the minimum set of Analyses that all machine function passes must preserve.
FunctionPass * createSIFoldOperandsLegacyPass()
constexpr uint32_t Hi_32(uint64_t Value)
Return the high 32 bits of a 64 bit value.
Definition MathExtras.h:150
LLVM_ABI raw_ostream & dbgs()
dbgs() - This returns a reference to a raw_ostream for debugging messages.
Definition Debug.cpp:209
class LLVM_GSL_OWNER SmallVector
Forward declaration of SmallVector so that calculateSmallVectorDefaultInlinedElements can reference s...
constexpr uint32_t Lo_32(uint64_t Value)
Return the low 32 bits of a 64 bit value.
Definition MathExtras.h:155
@ Sub
Subtraction of integers.
@ Add
Sum of integers.
DWARFExpression::Operation Op
char & SIFoldOperandsLegacyID
iterator_range< pointer_iterator< WrappedIteratorT > > make_pointer_range(RangeT &&Range)
Definition iterator.h:368
iterator_range< df_iterator< T > > depth_first(const T &G)
LLVM_ABI Printable printReg(Register Reg, const TargetRegisterInfo *TRI=nullptr, unsigned SubIdx=0, const MachineRegisterInfo *MRI=nullptr)
Prints virtual and physical registers with or without a TRI instance.
constexpr uint64_t Make_64(uint32_t High, uint32_t Low)
Make a 64-bit integer from a high / low pair of 32-bit integers.
Definition MathExtras.h:160
void swap(llvm::BitVector &LHS, llvm::BitVector &RHS)
Implement std::swap in terms of BitVector swap.
Definition BitVector.h:862
@ PreserveSign
The sign of a flushed-to-zero number is preserved in the sign of 0.
DenormalModeKind Output
Denormal flushing mode for floating point instruction results in the default floating point environme...
DenormalMode FP64FP16Denormals
If this is set, neither input nor output denormals are flushed for both f64 and f16/v2f16 instructions...
bool IEEE
Floating point opcodes that support exception flag gathering quiet and propagate signaling NaN inputs...
DenormalMode FP32Denormals
If this is set, neither input nor output denormals are flushed for most f32 instructions.