LLVM 23.0.0git
AArch64InstrInfo.cpp
Go to the documentation of this file.
1//===- AArch64InstrInfo.cpp - AArch64 Instruction Information -------------===//
2//
3// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4// See https://llvm.org/LICENSE.txt for license information.
5// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6//
7//===----------------------------------------------------------------------===//
8//
9// This file contains the AArch64 implementation of the TargetInstrInfo class.
10//
11//===----------------------------------------------------------------------===//
12
13#include "AArch64InstrInfo.h"
14#include "AArch64ExpandImm.h"
16#include "AArch64PointerAuth.h"
17#include "AArch64Subtarget.h"
21#include "llvm/ADT/ArrayRef.h"
22#include "llvm/ADT/STLExtras.h"
23#include "llvm/ADT/SmallSet.h"
25#include "llvm/ADT/Statistic.h"
44#include "llvm/IR/DebugLoc.h"
45#include "llvm/IR/GlobalValue.h"
46#include "llvm/IR/Module.h"
47#include "llvm/MC/MCAsmInfo.h"
48#include "llvm/MC/MCInst.h"
50#include "llvm/MC/MCInstrDesc.h"
55#include "llvm/Support/LEB128.h"
59#include <cassert>
60#include <cstdint>
61#include <iterator>
62#include <utility>
63
64using namespace llvm;
65
66#define GET_INSTRINFO_CTOR_DTOR
67#include "AArch64GenInstrInfo.inc"
68
69#define DEBUG_TYPE "AArch64InstrInfo"
70
71STATISTIC(NumCopyInstrs, "Number of COPY instructions expanded");
72STATISTIC(NumZCRegMoveInstrsGPR, "Number of zero-cycle GPR register move "
73 "instructions expanded from canonical COPY");
74STATISTIC(NumZCRegMoveInstrsFPR, "Number of zero-cycle FPR register move "
75 "instructions expanded from canonical COPY");
76STATISTIC(NumZCZeroingInstrsGPR, "Number of zero-cycle GPR zeroing "
77 "instructions expanded from canonical COPY");
78// NumZCZeroingInstrsFPR is counted at AArch64AsmPrinter
79
81 CBDisplacementBits("aarch64-cb-offset-bits", cl::Hidden, cl::init(9),
82 cl::desc("Restrict range of CB instructions (DEBUG)"));
83
85 "aarch64-tbz-offset-bits", cl::Hidden, cl::init(14),
86 cl::desc("Restrict range of TB[N]Z instructions (DEBUG)"));
87
89 "aarch64-cbz-offset-bits", cl::Hidden, cl::init(19),
90 cl::desc("Restrict range of CB[N]Z instructions (DEBUG)"));
91
93 BCCDisplacementBits("aarch64-bcc-offset-bits", cl::Hidden, cl::init(19),
94 cl::desc("Restrict range of Bcc instructions (DEBUG)"));
95
97 BDisplacementBits("aarch64-b-offset-bits", cl::Hidden, cl::init(26),
98 cl::desc("Restrict range of B instructions (DEBUG)"));
99
101 "aarch64-search-limit", cl::Hidden, cl::init(2048),
102 cl::desc("Restrict range of instructions to search for the "
103 "machine-combiner gather pattern optimization"));
104
106 : AArch64GenInstrInfo(STI, RI, AArch64::ADJCALLSTACKDOWN,
107 AArch64::ADJCALLSTACKUP, AArch64::CATCHRET),
108 RI(STI.getTargetTriple(), STI.getHwMode()), Subtarget(STI) {}
109
110/// GetInstSize - Return the number of bytes of code the specified
111/// instruction may be. This returns the maximum number of bytes.
113 const MachineBasicBlock &MBB = *MI.getParent();
114 const MachineFunction *MF = MBB.getParent();
115 const Function &F = MF->getFunction();
116 const MCAsmInfo *MAI = MF->getTarget().getMCAsmInfo();
117
118 {
119 auto Op = MI.getOpcode();
120 if (Op == AArch64::INLINEASM || Op == AArch64::INLINEASM_BR)
121 return getInlineAsmLength(MI.getOperand(0).getSymbolName(), *MAI);
122 }
123
124 // Meta-instructions emit no code.
125 if (MI.isMetaInstruction())
126 return 0;
127
128 // FIXME: We currently only handle pseudoinstructions that don't get expanded
129 // before the assembly printer.
130 unsigned NumBytes = 0;
131 const MCInstrDesc &Desc = MI.getDesc();
132
133 if (!MI.isBundle() && isTailCallReturnInst(MI)) {
134 NumBytes = Desc.getSize() ? Desc.getSize() : 4;
135
136 const auto *MFI = MF->getInfo<AArch64FunctionInfo>();
137 if (!MFI->shouldSignReturnAddress(*MF))
138 return NumBytes;
139
140 const auto &STI = MF->getSubtarget<AArch64Subtarget>();
141 auto Method = STI.getAuthenticatedLRCheckMethod(*MF);
142 NumBytes += AArch64PAuth::getCheckerSizeInBytes(Method);
143 return NumBytes;
144 }
145
146 // Size should be preferably set in
147 // llvm/lib/Target/AArch64/AArch64InstrInfo.td (default case).
148 // Specific cases handle instructions of variable sizes
149 switch (Desc.getOpcode()) {
150 default:
151 if (Desc.getSize())
152 return Desc.getSize();
153
154 // Anything not explicitly designated otherwise (i.e. pseudo-instructions
155 // with fixed constant size but not specified in .td file) is a normal
156 // 4-byte insn.
157 NumBytes = 4;
158 break;
159 case TargetOpcode::STACKMAP:
160 // The upper bound for a stackmap intrinsic is the full length of its shadow
161 NumBytes = StackMapOpers(&MI).getNumPatchBytes();
162 assert(NumBytes % 4 == 0 && "Invalid number of NOP bytes requested!");
163 break;
164 case TargetOpcode::PATCHPOINT:
165 // The size of the patchpoint intrinsic is the number of bytes requested
166 NumBytes = PatchPointOpers(&MI).getNumPatchBytes();
167 assert(NumBytes % 4 == 0 && "Invalid number of NOP bytes requested!");
168 break;
169 case TargetOpcode::STATEPOINT:
170 NumBytes = StatepointOpers(&MI).getNumPatchBytes();
171 assert(NumBytes % 4 == 0 && "Invalid number of NOP bytes requested!");
172 // No patch bytes means a normal call inst is emitted
173 if (NumBytes == 0)
174 NumBytes = 4;
175 break;
176 case TargetOpcode::PATCHABLE_FUNCTION_ENTER:
177 // If `patchable-function-entry` is set, PATCHABLE_FUNCTION_ENTER
178 // instructions are expanded to the specified number of NOPs. Otherwise,
179 // they are expanded to 36-byte XRay sleds.
180 NumBytes =
181 F.getFnAttributeAsParsedInteger("patchable-function-entry", 9) * 4;
182 break;
183 case TargetOpcode::PATCHABLE_FUNCTION_EXIT:
184 case TargetOpcode::PATCHABLE_TAIL_CALL:
185 case TargetOpcode::PATCHABLE_TYPED_EVENT_CALL:
186 // An XRay sled can be 4 bytes of alignment plus a 32-byte block.
187 NumBytes = 36;
188 break;
189 case TargetOpcode::PATCHABLE_EVENT_CALL:
190 // EVENT_CALL XRay sleds are exactly 6 instructions long (no alignment).
191 NumBytes = 24;
192 break;
193
194 case AArch64::SPACE:
195 NumBytes = MI.getOperand(1).getImm();
196 break;
197 case TargetOpcode::BUNDLE:
198 NumBytes = getInstBundleLength(MI);
199 break;
200 }
201
202 return NumBytes;
203}
204
205unsigned AArch64InstrInfo::getInstBundleLength(const MachineInstr &MI) const {
206 unsigned Size = 0;
208 MachineBasicBlock::const_instr_iterator E = MI.getParent()->instr_end();
209 while (++I != E && I->isInsideBundle()) {
210 assert(!I->isBundle() && "No nested bundle!");
212 }
213 return Size;
214}
215
218 // Block ends with fall-through condbranch.
219 switch (LastInst->getOpcode()) {
220 default:
221 llvm_unreachable("Unknown branch instruction?");
222 case AArch64::Bcc:
223 Target = LastInst->getOperand(1).getMBB();
224 Cond.push_back(LastInst->getOperand(0));
225 break;
226 case AArch64::CBZW:
227 case AArch64::CBZX:
228 case AArch64::CBNZW:
229 case AArch64::CBNZX:
230 Target = LastInst->getOperand(1).getMBB();
231 Cond.push_back(MachineOperand::CreateImm(-1));
232 Cond.push_back(MachineOperand::CreateImm(LastInst->getOpcode()));
233 Cond.push_back(LastInst->getOperand(0));
234 break;
235 case AArch64::TBZW:
236 case AArch64::TBZX:
237 case AArch64::TBNZW:
238 case AArch64::TBNZX:
239 Target = LastInst->getOperand(2).getMBB();
240 Cond.push_back(MachineOperand::CreateImm(-1));
241 Cond.push_back(MachineOperand::CreateImm(LastInst->getOpcode()));
242 Cond.push_back(LastInst->getOperand(0));
243 Cond.push_back(LastInst->getOperand(1));
244 break;
245 case AArch64::CBWPri:
246 case AArch64::CBXPri:
247 case AArch64::CBWPrr:
248 case AArch64::CBXPrr:
249 Target = LastInst->getOperand(3).getMBB();
250 Cond.push_back(MachineOperand::CreateImm(-1));
251 Cond.push_back(MachineOperand::CreateImm(LastInst->getOpcode()));
252 Cond.push_back(LastInst->getOperand(0));
253 Cond.push_back(LastInst->getOperand(1));
254 Cond.push_back(LastInst->getOperand(2));
255 break;
256 case AArch64::CBBAssertExt:
257 case AArch64::CBHAssertExt:
258 Target = LastInst->getOperand(3).getMBB();
259 Cond.push_back(MachineOperand::CreateImm(-1)); // -1
260 Cond.push_back(MachineOperand::CreateImm(LastInst->getOpcode())); // Opc
261 Cond.push_back(LastInst->getOperand(0)); // Cond
262 Cond.push_back(LastInst->getOperand(1)); // Op0
263 Cond.push_back(LastInst->getOperand(2)); // Op1
264 Cond.push_back(LastInst->getOperand(4)); // Ext0
265 Cond.push_back(LastInst->getOperand(5)); // Ext1
266 break;
267 }
268}
269
270static unsigned getBranchDisplacementBits(unsigned Opc) {
271 switch (Opc) {
272 default:
273 llvm_unreachable("unexpected opcode!");
274 case AArch64::B:
275 return BDisplacementBits;
276 case AArch64::TBNZW:
277 case AArch64::TBZW:
278 case AArch64::TBNZX:
279 case AArch64::TBZX:
280 return TBZDisplacementBits;
281 case AArch64::CBNZW:
282 case AArch64::CBZW:
283 case AArch64::CBNZX:
284 case AArch64::CBZX:
285 return CBZDisplacementBits;
286 case AArch64::Bcc:
287 return BCCDisplacementBits;
288 case AArch64::CBWPri:
289 case AArch64::CBXPri:
290 case AArch64::CBBAssertExt:
291 case AArch64::CBHAssertExt:
292 case AArch64::CBWPrr:
293 case AArch64::CBXPrr:
294 return CBDisplacementBits;
295 }
296}
297
299 int64_t BrOffset) const {
300 unsigned Bits = getBranchDisplacementBits(BranchOp);
301 assert(Bits >= 3 && "max branch displacement must be enough to jump"
302 "over conditional branch expansion");
303 return isIntN(Bits, BrOffset / 4);
304}
305
308 switch (MI.getOpcode()) {
309 default:
310 llvm_unreachable("unexpected opcode!");
311 case AArch64::B:
312 return MI.getOperand(0).getMBB();
313 case AArch64::TBZW:
314 case AArch64::TBNZW:
315 case AArch64::TBZX:
316 case AArch64::TBNZX:
317 return MI.getOperand(2).getMBB();
318 case AArch64::CBZW:
319 case AArch64::CBNZW:
320 case AArch64::CBZX:
321 case AArch64::CBNZX:
322 case AArch64::Bcc:
323 return MI.getOperand(1).getMBB();
324 case AArch64::CBWPri:
325 case AArch64::CBXPri:
326 case AArch64::CBBAssertExt:
327 case AArch64::CBHAssertExt:
328 case AArch64::CBWPrr:
329 case AArch64::CBXPrr:
330 return MI.getOperand(3).getMBB();
331 }
332}
333
335 MachineBasicBlock &NewDestBB,
336 MachineBasicBlock &RestoreBB,
337 const DebugLoc &DL,
338 int64_t BrOffset,
339 RegScavenger *RS) const {
340 assert(RS && "RegScavenger required for long branching");
341 assert(MBB.empty() &&
342 "new block should be inserted for expanding unconditional branch");
343 assert(MBB.pred_size() == 1);
344 assert(RestoreBB.empty() &&
345 "restore block should be inserted for restoring clobbered registers");
346
347 auto buildIndirectBranch = [&](Register Reg, MachineBasicBlock &DestBB) {
348 // Offsets outside of the signed 33-bit range are not supported for ADRP +
349 // ADD.
350 if (!isInt<33>(BrOffset))
352 "Branch offsets outside of the signed 33-bit range not supported");
353
354 BuildMI(MBB, MBB.end(), DL, get(AArch64::ADRP), Reg)
355 .addSym(DestBB.getSymbol(), AArch64II::MO_PAGE);
356 BuildMI(MBB, MBB.end(), DL, get(AArch64::ADDXri), Reg)
357 .addReg(Reg)
358 .addSym(DestBB.getSymbol(), AArch64II::MO_PAGEOFF | AArch64II::MO_NC)
359 .addImm(0);
360 BuildMI(MBB, MBB.end(), DL, get(AArch64::BR)).addReg(Reg);
361 };
362
363 RS->enterBasicBlockEnd(MBB);
364 // If X16 is unused, we can rely on the linker to insert a range extension
365 // thunk if NewDestBB is out of range of a single B instruction.
366 constexpr Register Reg = AArch64::X16;
367 if (!RS->isRegUsed(Reg)) {
368 insertUnconditionalBranch(MBB, &NewDestBB, DL);
369 RS->setRegUsed(Reg);
370 return;
371 }
372
373 // If there's a free register and it's worth inflating the code size,
374 // manually insert the indirect branch.
375 Register Scavenged = RS->FindUnusedReg(&AArch64::GPR64RegClass);
376 if (Scavenged != AArch64::NoRegister &&
377 MBB.getSectionID() == MBBSectionID::ColdSectionID) {
378 buildIndirectBranch(Scavenged, NewDestBB);
379 RS->setRegUsed(Scavenged);
380 return;
381 }
382
383 // Note: Spilling X16 briefly moves the stack pointer, making it incompatible
384 // with red zones.
385 AArch64FunctionInfo *AFI = MBB.getParent()->getInfo<AArch64FunctionInfo>();
386 if (!AFI || AFI->hasRedZone().value_or(true))
388 "Unable to insert indirect branch inside function that has red zone");
389
390 // Otherwise, spill X16 and defer range extension to the linker.
391 BuildMI(MBB, MBB.end(), DL, get(AArch64::STRXpre))
392 .addReg(AArch64::SP, RegState::Define)
393 .addReg(Reg)
394 .addReg(AArch64::SP)
395 .addImm(-16);
396
397 BuildMI(MBB, MBB.end(), DL, get(AArch64::B)).addMBB(&RestoreBB);
398
399 BuildMI(RestoreBB, RestoreBB.end(), DL, get(AArch64::LDRXpost))
400 .addReg(AArch64::SP, RegState::Define)
402 .addReg(AArch64::SP)
403 .addImm(16);
404}
405
406// Branch analysis.
409 MachineBasicBlock *&FBB,
411 bool AllowModify) const {
412 // If the block has no terminators, it just falls into the block after it.
413 MachineBasicBlock::iterator I = MBB.getLastNonDebugInstr();
414 if (I == MBB.end())
415 return false;
416
417 // Skip over SpeculationBarrierEndBB terminators
418 if (I->getOpcode() == AArch64::SpeculationBarrierISBDSBEndBB ||
419 I->getOpcode() == AArch64::SpeculationBarrierSBEndBB) {
420 --I;
421 }
422
423 if (!isUnpredicatedTerminator(*I))
424 return false;
425
426 // Get the last instruction in the block.
427 MachineInstr *LastInst = &*I;
428
429 // If there is only one terminator instruction, process it.
430 unsigned LastOpc = LastInst->getOpcode();
431 if (I == MBB.begin() || !isUnpredicatedTerminator(*--I)) {
432 if (isUncondBranchOpcode(LastOpc)) {
433 TBB = LastInst->getOperand(0).getMBB();
434 return false;
435 }
436 if (isCondBranchOpcode(LastOpc)) {
437 // Block ends with fall-through condbranch.
438 parseCondBranch(LastInst, TBB, Cond);
439 return false;
440 }
441 return true; // Can't handle indirect branch.
442 }
443
444 // Get the instruction before it if it is a terminator.
445 MachineInstr *SecondLastInst = &*I;
446 unsigned SecondLastOpc = SecondLastInst->getOpcode();
447
448 // If AllowModify is true and the block ends with two or more unconditional
449 // branches, delete all but the first unconditional branch.
450 if (AllowModify && isUncondBranchOpcode(LastOpc)) {
451 while (isUncondBranchOpcode(SecondLastOpc)) {
452 LastInst->eraseFromParent();
453 LastInst = SecondLastInst;
454 LastOpc = LastInst->getOpcode();
455 if (I == MBB.begin() || !isUnpredicatedTerminator(*--I)) {
456 // Return now the only terminator is an unconditional branch.
457 TBB = LastInst->getOperand(0).getMBB();
458 return false;
459 }
460 SecondLastInst = &*I;
461 SecondLastOpc = SecondLastInst->getOpcode();
462 }
463 }
464
465 // If we're allowed to modify and the block ends in a unconditional branch
466 // which could simply fallthrough, remove the branch. (Note: This case only
467 // matters when we can't understand the whole sequence, otherwise it's also
468 // handled by BranchFolding.cpp.)
469 if (AllowModify && isUncondBranchOpcode(LastOpc) &&
470 MBB.isLayoutSuccessor(getBranchDestBlock(*LastInst))) {
471 LastInst->eraseFromParent();
472 LastInst = SecondLastInst;
473 LastOpc = LastInst->getOpcode();
474 if (I == MBB.begin() || !isUnpredicatedTerminator(*--I)) {
475 assert(!isUncondBranchOpcode(LastOpc) &&
476 "unreachable unconditional branches removed above");
477
478 if (isCondBranchOpcode(LastOpc)) {
479 // Block ends with fall-through condbranch.
480 parseCondBranch(LastInst, TBB, Cond);
481 return false;
482 }
483 return true; // Can't handle indirect branch.
484 }
485 SecondLastInst = &*I;
486 SecondLastOpc = SecondLastInst->getOpcode();
487 }
488
489 // If there are three terminators, we don't know what sort of block this is.
490 if (SecondLastInst && I != MBB.begin() && isUnpredicatedTerminator(*--I))
491 return true;
492
493 // If the block ends with a B and a Bcc, handle it.
494 if (isCondBranchOpcode(SecondLastOpc) && isUncondBranchOpcode(LastOpc)) {
495 parseCondBranch(SecondLastInst, TBB, Cond);
496 FBB = LastInst->getOperand(0).getMBB();
497 return false;
498 }
499
500 // If the block ends with two unconditional branches, handle it. The second
501 // one is not executed, so remove it.
502 if (isUncondBranchOpcode(SecondLastOpc) && isUncondBranchOpcode(LastOpc)) {
503 TBB = SecondLastInst->getOperand(0).getMBB();
504 I = LastInst;
505 if (AllowModify)
506 I->eraseFromParent();
507 return false;
508 }
509
510 // ...likewise if it ends with an indirect branch followed by an unconditional
511 // branch.
512 if (isIndirectBranchOpcode(SecondLastOpc) && isUncondBranchOpcode(LastOpc)) {
513 I = LastInst;
514 if (AllowModify)
515 I->eraseFromParent();
516 return true;
517 }
518
519 // Otherwise, can't handle this.
520 return true;
521}
522
524 MachineBranchPredicate &MBP,
525 bool AllowModify) const {
526 // Use analyzeBranch to validate the branch pattern.
527 MachineBasicBlock *TBB = nullptr, *FBB = nullptr;
529 if (analyzeBranch(MBB, TBB, FBB, Cond, AllowModify))
530 return true;
531
532 // analyzeBranch returns success with empty Cond for unconditional branches.
533 if (Cond.empty())
534 return true;
535
536 MBP.TrueDest = TBB;
537 assert(MBP.TrueDest && "expected!");
538 MBP.FalseDest = FBB ? FBB : MBB.getNextNode();
539
540 MBP.ConditionDef = nullptr;
541 MBP.SingleUseCondition = false;
542
543 // Find the conditional branch. After analyzeBranch succeeds with non-empty
544 // Cond, there's exactly one conditional branch - either last (fallthrough)
545 // or second-to-last (followed by unconditional B).
546 MachineBasicBlock::iterator I = MBB.getLastNonDebugInstr();
547 if (I == MBB.end())
548 return true;
549
550 if (isUncondBranchOpcode(I->getOpcode())) {
551 if (I == MBB.begin())
552 return true;
553 --I;
554 }
555
556 MachineInstr *CondBranch = &*I;
557 MachineRegisterInfo &MRI = MBB.getParent()->getRegInfo();
558
559 switch (CondBranch->getOpcode()) {
560 default:
561 return true;
562
563 case AArch64::Bcc:
564 // Bcc takes the NZCV flag as the operand to branch on, walk up the
565 // instruction stream to find the last instruction to define NZCV.
567 if (MI.modifiesRegister(AArch64::NZCV, /*TRI=*/nullptr)) {
568 MBP.ConditionDef = &MI;
569 break;
570 }
571 }
572 return false;
573
574 case AArch64::CBZW:
575 case AArch64::CBZX:
576 case AArch64::CBNZW:
577 case AArch64::CBNZX: {
578 MBP.LHS = CondBranch->getOperand(0);
579 MBP.RHS = MachineOperand::CreateImm(0);
580 unsigned Opc = CondBranch->getOpcode();
581 MBP.Predicate = (Opc == AArch64::CBNZX || Opc == AArch64::CBNZW)
582 ? MachineBranchPredicate::PRED_NE
583 : MachineBranchPredicate::PRED_EQ;
584 Register CondReg = MBP.LHS.getReg();
585 if (CondReg.isVirtual())
586 MBP.ConditionDef = MRI.getVRegDef(CondReg);
587 return false;
588 }
589
590 case AArch64::TBZW:
591 case AArch64::TBZX:
592 case AArch64::TBNZW:
593 case AArch64::TBNZX: {
594 Register CondReg = CondBranch->getOperand(0).getReg();
595 if (CondReg.isVirtual())
596 MBP.ConditionDef = MRI.getVRegDef(CondReg);
597 return false;
598 }
599 }
600}
601
604 if (Cond[0].getImm() != -1) {
605 // Regular Bcc
606 AArch64CC::CondCode CC = (AArch64CC::CondCode)(int)Cond[0].getImm();
608 } else {
609 // Folded compare-and-branch
610 switch (Cond[1].getImm()) {
611 default:
612 llvm_unreachable("Unknown conditional branch!");
613 case AArch64::CBZW:
614 Cond[1].setImm(AArch64::CBNZW);
615 break;
616 case AArch64::CBNZW:
617 Cond[1].setImm(AArch64::CBZW);
618 break;
619 case AArch64::CBZX:
620 Cond[1].setImm(AArch64::CBNZX);
621 break;
622 case AArch64::CBNZX:
623 Cond[1].setImm(AArch64::CBZX);
624 break;
625 case AArch64::TBZW:
626 Cond[1].setImm(AArch64::TBNZW);
627 break;
628 case AArch64::TBNZW:
629 Cond[1].setImm(AArch64::TBZW);
630 break;
631 case AArch64::TBZX:
632 Cond[1].setImm(AArch64::TBNZX);
633 break;
634 case AArch64::TBNZX:
635 Cond[1].setImm(AArch64::TBZX);
636 break;
637
638 // Cond is { -1, Opcode, CC, Op0, Op1, ... }
639 case AArch64::CBWPri:
640 case AArch64::CBXPri:
641 case AArch64::CBBAssertExt:
642 case AArch64::CBHAssertExt:
643 case AArch64::CBWPrr:
644 case AArch64::CBXPrr: {
645 // Pseudos using standard 4bit Arm condition codes
647 static_cast<AArch64CC::CondCode>(Cond[2].getImm());
649 }
650 }
651 }
652
653 return false;
654}
655
657 int *BytesRemoved) const {
658 MachineBasicBlock::iterator I = MBB.getLastNonDebugInstr();
659 if (I == MBB.end())
660 return 0;
661
662 if (!isUncondBranchOpcode(I->getOpcode()) &&
663 !isCondBranchOpcode(I->getOpcode()))
664 return 0;
665
666 // Remove the branch.
667 I->eraseFromParent();
668
669 I = MBB.end();
670
671 if (I == MBB.begin()) {
672 if (BytesRemoved)
673 *BytesRemoved = 4;
674 return 1;
675 }
676 --I;
677 if (!isCondBranchOpcode(I->getOpcode())) {
678 if (BytesRemoved)
679 *BytesRemoved = 4;
680 return 1;
681 }
682
683 // Remove the branch.
684 I->eraseFromParent();
685 if (BytesRemoved)
686 *BytesRemoved = 8;
687
688 return 2;
689}
690
691void AArch64InstrInfo::instantiateCondBranch(
694 if (Cond[0].getImm() != -1) {
695 // Regular Bcc
696 BuildMI(&MBB, DL, get(AArch64::Bcc)).addImm(Cond[0].getImm()).addMBB(TBB);
697 } else {
698 // Folded compare-and-branch
699 // Note that we use addOperand instead of addReg to keep the flags.
700
701 // cbz, cbnz
702 const MachineInstrBuilder MIB =
703 BuildMI(&MBB, DL, get(Cond[1].getImm())).add(Cond[2]);
704
705 // tbz/tbnz
706 if (Cond.size() > 3)
707 MIB.add(Cond[3]);
708
709 // cb
710 if (Cond.size() > 4)
711 MIB.add(Cond[4]);
712
713 MIB.addMBB(TBB);
714
715 // cb[b,h]
716 if (Cond.size() > 5) {
717 MIB.addImm(Cond[5].getImm());
718 MIB.addImm(Cond[6].getImm());
719 }
720 }
721}
722
725 ArrayRef<MachineOperand> Cond, const DebugLoc &DL, int *BytesAdded) const {
726 // Shouldn't be a fall through.
727 assert(TBB && "insertBranch must not be told to insert a fallthrough");
728
729 if (!FBB) {
730 if (Cond.empty()) // Unconditional branch?
731 BuildMI(&MBB, DL, get(AArch64::B)).addMBB(TBB);
732 else
733 instantiateCondBranch(MBB, DL, TBB, Cond);
734
735 if (BytesAdded)
736 *BytesAdded = 4;
737
738 return 1;
739 }
740
741 // Two-way conditional branch.
742 instantiateCondBranch(MBB, DL, TBB, Cond);
743 BuildMI(&MBB, DL, get(AArch64::B)).addMBB(FBB);
744
745 if (BytesAdded)
746 *BytesAdded = 8;
747
748 return 2;
749}
750
752 const TargetInstrInfo &TII) {
753 for (MachineInstr &MI : MBB->terminators()) {
754 unsigned Opc = MI.getOpcode();
755 switch (Opc) {
756 case AArch64::CBZW:
757 case AArch64::CBZX:
758 case AArch64::TBZW:
759 case AArch64::TBZX:
760 // CBZ/TBZ with WZR/XZR -> unconditional B
761 if (MI.getOperand(0).getReg() == AArch64::WZR ||
762 MI.getOperand(0).getReg() == AArch64::XZR) {
763 DEBUG_WITH_TYPE("optimizeTerminators",
764 dbgs() << "Removing always taken branch: " << MI);
765 MachineBasicBlock *Target = TII.getBranchDestBlock(MI);
766 SmallVector<MachineBasicBlock *> Succs(MBB->successors());
767 for (auto *S : Succs)
768 if (S != Target)
769 MBB->removeSuccessor(S);
770 DebugLoc DL = MI.getDebugLoc();
771 while (MBB->rbegin() != &MI)
772 MBB->rbegin()->eraseFromParent();
773 MI.eraseFromParent();
774 BuildMI(MBB, DL, TII.get(AArch64::B)).addMBB(Target);
775 return true;
776 }
777 break;
778 case AArch64::CBNZW:
779 case AArch64::CBNZX:
780 case AArch64::TBNZW:
781 case AArch64::TBNZX:
782 // CBNZ/TBNZ with WZR/XZR -> never taken, remove branch and successor
783 if (MI.getOperand(0).getReg() == AArch64::WZR ||
784 MI.getOperand(0).getReg() == AArch64::XZR) {
785 DEBUG_WITH_TYPE("optimizeTerminators",
786 dbgs() << "Removing never taken branch: " << MI);
787 MachineBasicBlock *Target = TII.getBranchDestBlock(MI);
788 MI.getParent()->removeSuccessor(Target);
789 MI.eraseFromParent();
790 return true;
791 }
792 break;
793 }
794 }
795 return false;
796}
797
798// Find the original register that VReg is copied from.
799static unsigned removeCopies(const MachineRegisterInfo &MRI, unsigned VReg) {
800 while (Register::isVirtualRegister(VReg)) {
801 const MachineInstr *DefMI = MRI.getVRegDef(VReg);
802 if (!DefMI->isFullCopy())
803 return VReg;
804 VReg = DefMI->getOperand(1).getReg();
805 }
806 return VReg;
807}
808
809// Determine if VReg is defined by an instruction that can be folded into a
810// csel instruction. If so, return the folded opcode, and the replacement
811// register.
812static unsigned canFoldIntoCSel(const MachineRegisterInfo &MRI, unsigned VReg,
813 unsigned *NewReg = nullptr) {
814 VReg = removeCopies(MRI, VReg);
816 return 0;
817
818 bool Is64Bit = AArch64::GPR64allRegClass.hasSubClassEq(MRI.getRegClass(VReg));
819 const MachineInstr *DefMI = MRI.getVRegDef(VReg);
820 unsigned Opc = 0;
821 unsigned SrcReg = 0;
822 switch (DefMI->getOpcode()) {
823 case AArch64::SUBREG_TO_REG:
824 // Check for the following way to define an 64-bit immediate:
825 // %0:gpr32 = MOVi32imm 1
826 // %1:gpr64 = SUBREG_TO_REG %0:gpr32, %subreg.sub_32
827 if (!DefMI->getOperand(1).isReg())
828 return 0;
829 if (!DefMI->getOperand(2).isImm() ||
830 DefMI->getOperand(2).getImm() != AArch64::sub_32)
831 return 0;
832 DefMI = MRI.getVRegDef(DefMI->getOperand(1).getReg());
833 if (DefMI->getOpcode() != AArch64::MOVi32imm)
834 return 0;
835 if (!DefMI->getOperand(1).isImm() || DefMI->getOperand(1).getImm() != 1)
836 return 0;
837 assert(Is64Bit);
838 SrcReg = AArch64::XZR;
839 Opc = AArch64::CSINCXr;
840 break;
841
842 case AArch64::MOVi32imm:
843 case AArch64::MOVi64imm:
844 if (!DefMI->getOperand(1).isImm() || DefMI->getOperand(1).getImm() != 1)
845 return 0;
846 SrcReg = Is64Bit ? AArch64::XZR : AArch64::WZR;
847 Opc = Is64Bit ? AArch64::CSINCXr : AArch64::CSINCWr;
848 break;
849
850 case AArch64::ADDSXri:
851 case AArch64::ADDSWri:
852 // if NZCV is used, do not fold.
853 if (DefMI->findRegisterDefOperandIdx(AArch64::NZCV, /*TRI=*/nullptr,
854 true) == -1)
855 return 0;
856 // fall-through to ADDXri and ADDWri.
857 [[fallthrough]];
858 case AArch64::ADDXri:
859 case AArch64::ADDWri:
860 // add x, 1 -> csinc.
861 if (!DefMI->getOperand(2).isImm() || DefMI->getOperand(2).getImm() != 1 ||
862 DefMI->getOperand(3).getImm() != 0)
863 return 0;
864 SrcReg = DefMI->getOperand(1).getReg();
865 Opc = Is64Bit ? AArch64::CSINCXr : AArch64::CSINCWr;
866 break;
867
868 case AArch64::ORNXrr:
869 case AArch64::ORNWrr: {
870 // not x -> csinv, represented as orn dst, xzr, src.
871 unsigned ZReg = removeCopies(MRI, DefMI->getOperand(1).getReg());
872 if (ZReg != AArch64::XZR && ZReg != AArch64::WZR)
873 return 0;
874 SrcReg = DefMI->getOperand(2).getReg();
875 Opc = Is64Bit ? AArch64::CSINVXr : AArch64::CSINVWr;
876 break;
877 }
878
879 case AArch64::SUBSXrr:
880 case AArch64::SUBSWrr:
881 // if NZCV is used, do not fold.
882 if (DefMI->findRegisterDefOperandIdx(AArch64::NZCV, /*TRI=*/nullptr,
883 true) == -1)
884 return 0;
885 // fall-through to SUBXrr and SUBWrr.
886 [[fallthrough]];
887 case AArch64::SUBXrr:
888 case AArch64::SUBWrr: {
889 // neg x -> csneg, represented as sub dst, xzr, src.
890 unsigned ZReg = removeCopies(MRI, DefMI->getOperand(1).getReg());
891 if (ZReg != AArch64::XZR && ZReg != AArch64::WZR)
892 return 0;
893 SrcReg = DefMI->getOperand(2).getReg();
894 Opc = Is64Bit ? AArch64::CSNEGXr : AArch64::CSNEGWr;
895 break;
896 }
897 default:
898 return 0;
899 }
900 assert(Opc && SrcReg && "Missing parameters");
901
902 if (NewReg)
903 *NewReg = SrcReg;
904 return Opc;
905}
906
909 Register DstReg, Register TrueReg,
910 Register FalseReg, int &CondCycles,
911 int &TrueCycles,
912 int &FalseCycles) const {
913 // Check register classes.
914 const MachineRegisterInfo &MRI = MBB.getParent()->getRegInfo();
915 const TargetRegisterClass *RC =
916 RI.getCommonSubClass(MRI.getRegClass(TrueReg), MRI.getRegClass(FalseReg));
917 if (!RC)
918 return false;
919
920 // Also need to check the dest regclass, in case we're trying to optimize
921 // something like:
922 // %1(gpr) = PHI %2(fpr), bb1, %(fpr), bb2
923 if (!RI.getCommonSubClass(RC, MRI.getRegClass(DstReg)))
924 return false;
925
926 // Expanding cbz/tbz requires an extra cycle of latency on the condition.
927 unsigned ExtraCondLat = Cond.size() != 1;
928
929 // GPRs are handled by csel.
930 // FIXME: Fold in x+1, -x, and ~x when applicable.
931 if (AArch64::GPR64allRegClass.hasSubClassEq(RC) ||
932 AArch64::GPR32allRegClass.hasSubClassEq(RC)) {
933 // Single-cycle csel, csinc, csinv, and csneg.
934 CondCycles = 1 + ExtraCondLat;
935 TrueCycles = FalseCycles = 1;
936 if (canFoldIntoCSel(MRI, TrueReg))
937 TrueCycles = 0;
938 else if (canFoldIntoCSel(MRI, FalseReg))
939 FalseCycles = 0;
940 return true;
941 }
942
943 // Scalar floating point is handled by fcsel.
944 // FIXME: Form fabs, fmin, and fmax when applicable.
945 if (AArch64::FPR64RegClass.hasSubClassEq(RC) ||
946 AArch64::FPR32RegClass.hasSubClassEq(RC)) {
947 CondCycles = 5 + ExtraCondLat;
948 TrueCycles = FalseCycles = 2;
949 return true;
950 }
951
952 // Can't do vectors.
953 return false;
954}
955
958 const DebugLoc &DL, Register DstReg,
960 Register TrueReg, Register FalseReg) const {
961 MachineRegisterInfo &MRI = MBB.getParent()->getRegInfo();
962
963 // Parse the condition code, see parseCondBranch() above.
965 switch (Cond.size()) {
966 default:
967 llvm_unreachable("Unknown condition opcode in Cond");
968 case 1: // b.cc
970 break;
971 case 3: { // cbz/cbnz
972 // We must insert a compare against 0.
973 bool Is64Bit;
974 switch (Cond[1].getImm()) {
975 default:
976 llvm_unreachable("Unknown branch opcode in Cond");
977 case AArch64::CBZW:
978 Is64Bit = false;
979 CC = AArch64CC::EQ;
980 break;
981 case AArch64::CBZX:
982 Is64Bit = true;
983 CC = AArch64CC::EQ;
984 break;
985 case AArch64::CBNZW:
986 Is64Bit = false;
987 CC = AArch64CC::NE;
988 break;
989 case AArch64::CBNZX:
990 Is64Bit = true;
991 CC = AArch64CC::NE;
992 break;
993 }
994 Register SrcReg = Cond[2].getReg();
995 if (Is64Bit) {
996 // cmp reg, #0 is actually subs xzr, reg, #0.
997 MRI.constrainRegClass(SrcReg, &AArch64::GPR64spRegClass);
998 BuildMI(MBB, I, DL, get(AArch64::SUBSXri), AArch64::XZR)
999 .addReg(SrcReg)
1000 .addImm(0)
1001 .addImm(0);
1002 } else {
1003 MRI.constrainRegClass(SrcReg, &AArch64::GPR32spRegClass);
1004 BuildMI(MBB, I, DL, get(AArch64::SUBSWri), AArch64::WZR)
1005 .addReg(SrcReg)
1006 .addImm(0)
1007 .addImm(0);
1008 }
1009 break;
1010 }
1011 case 4: { // tbz/tbnz
1012 // We must insert a tst instruction.
1013 switch (Cond[1].getImm()) {
1014 default:
1015 llvm_unreachable("Unknown branch opcode in Cond");
1016 case AArch64::TBZW:
1017 case AArch64::TBZX:
1018 CC = AArch64CC::EQ;
1019 break;
1020 case AArch64::TBNZW:
1021 case AArch64::TBNZX:
1022 CC = AArch64CC::NE;
1023 break;
1024 }
1025 // cmp reg, #foo is actually ands xzr, reg, #1<<foo.
1026 if (Cond[1].getImm() == AArch64::TBZW || Cond[1].getImm() == AArch64::TBNZW)
1027 BuildMI(MBB, I, DL, get(AArch64::ANDSWri), AArch64::WZR)
1028 .addReg(Cond[2].getReg())
1029 .addImm(
1031 else
1032 BuildMI(MBB, I, DL, get(AArch64::ANDSXri), AArch64::XZR)
1033 .addReg(Cond[2].getReg())
1034 .addImm(
1036 break;
1037 }
1038 case 5: { // cb
1039 // We must insert a cmp, that is a subs
1040 // 0 1 2 3 4
1041 // Cond is { -1, Opcode, CC, Op0, Op1 }
1042
1043 unsigned SubsOpc, SubsDestReg;
1044 bool IsImm = false;
1045 CC = static_cast<AArch64CC::CondCode>(Cond[2].getImm());
1046 switch (Cond[1].getImm()) {
1047 default:
1048 llvm_unreachable("Unknown branch opcode in Cond");
1049 case AArch64::CBWPri:
1050 SubsOpc = AArch64::SUBSWri;
1051 SubsDestReg = AArch64::WZR;
1052 IsImm = true;
1053 break;
1054 case AArch64::CBXPri:
1055 SubsOpc = AArch64::SUBSXri;
1056 SubsDestReg = AArch64::XZR;
1057 IsImm = true;
1058 break;
1059 case AArch64::CBWPrr:
1060 SubsOpc = AArch64::SUBSWrr;
1061 SubsDestReg = AArch64::WZR;
1062 IsImm = false;
1063 break;
1064 case AArch64::CBXPrr:
1065 SubsOpc = AArch64::SUBSXrr;
1066 SubsDestReg = AArch64::XZR;
1067 IsImm = false;
1068 break;
1069 }
1070
1071 if (IsImm)
1072 BuildMI(MBB, I, DL, get(SubsOpc), SubsDestReg)
1073 .addReg(Cond[3].getReg())
1074 .addImm(Cond[4].getImm())
1075 .addImm(0);
1076 else
1077 BuildMI(MBB, I, DL, get(SubsOpc), SubsDestReg)
1078 .addReg(Cond[3].getReg())
1079 .addReg(Cond[4].getReg());
1080 } break;
1081 case 7: { // cb[b,h]
1082 // We must insert a cmp, that is a subs, but also zero- or sign-extensions
1083 // that have been folded. For the first operand we codegen an explicit
1084 // extension, for the second operand we fold the extension into cmp.
1085 // 0 1 2 3 4 5 6
1086 // Cond is { -1, Opcode, CC, Op0, Op1, Ext0, Ext1 }
1087
1088 // We need a new register for the now explicitly extended register
1089 Register Reg = Cond[4].getReg();
1091 unsigned ExtOpc;
1092 unsigned ExtBits;
1093 AArch64_AM::ShiftExtendType ExtendType =
1095 switch (ExtendType) {
1096 default:
1097 llvm_unreachable("Unknown shift-extend for CB instruction");
1098 case AArch64_AM::SXTB:
1099 assert(
1100 Cond[1].getImm() == AArch64::CBBAssertExt &&
1101 "Unexpected compare-and-branch instruction for SXTB shift-extend");
1102 ExtOpc = AArch64::SBFMWri;
1103 ExtBits = AArch64_AM::encodeLogicalImmediate(0xff, 32);
1104 break;
1105 case AArch64_AM::SXTH:
1106 assert(
1107 Cond[1].getImm() == AArch64::CBHAssertExt &&
1108 "Unexpected compare-and-branch instruction for SXTH shift-extend");
1109 ExtOpc = AArch64::SBFMWri;
1110 ExtBits = AArch64_AM::encodeLogicalImmediate(0xffff, 32);
1111 break;
1112 case AArch64_AM::UXTB:
1113 assert(
1114 Cond[1].getImm() == AArch64::CBBAssertExt &&
1115 "Unexpected compare-and-branch instruction for UXTB shift-extend");
1116 ExtOpc = AArch64::ANDWri;
1117 ExtBits = AArch64_AM::encodeLogicalImmediate(0xff, 32);
1118 break;
1119 case AArch64_AM::UXTH:
1120 assert(
1121 Cond[1].getImm() == AArch64::CBHAssertExt &&
1122 "Unexpected compare-and-branch instruction for UXTH shift-extend");
1123 ExtOpc = AArch64::ANDWri;
1124 ExtBits = AArch64_AM::encodeLogicalImmediate(0xffff, 32);
1125 break;
1126 }
1127
1128 // Build the explicit extension of the first operand
1129 Reg = MRI.createVirtualRegister(&AArch64::GPR32spRegClass);
1131 BuildMI(MBB, I, DL, get(ExtOpc), Reg).addReg(Cond[4].getReg());
1132 if (ExtOpc != AArch64::ANDWri)
1133 MBBI.addImm(0);
1134 MBBI.addImm(ExtBits);
1135 }
1136
1137 // Now, subs with an extended second operand
1139 AArch64_AM::ShiftExtendType ExtendType =
1141 MRI.constrainRegClass(Reg, MRI.getRegClass(Cond[3].getReg()));
1142 MRI.constrainRegClass(Cond[3].getReg(), &AArch64::GPR32spRegClass);
1143 BuildMI(MBB, I, DL, get(AArch64::SUBSWrx), AArch64::WZR)
1144 .addReg(Cond[3].getReg())
1145 .addReg(Reg)
1146 .addImm(AArch64_AM::getArithExtendImm(ExtendType, 0));
1147 } // If no extension is needed, just a regular subs
1148 else {
1149 MRI.constrainRegClass(Reg, MRI.getRegClass(Cond[3].getReg()));
1150 MRI.constrainRegClass(Cond[3].getReg(), &AArch64::GPR32spRegClass);
1151 BuildMI(MBB, I, DL, get(AArch64::SUBSWrr), AArch64::WZR)
1152 .addReg(Cond[3].getReg())
1153 .addReg(Reg);
1154 }
1155
1156 CC = static_cast<AArch64CC::CondCode>(Cond[2].getImm());
1157 } break;
1158 }
1159
1160 unsigned Opc = 0;
1161 const TargetRegisterClass *RC = nullptr;
1162 bool TryFold = false;
1163 if (MRI.constrainRegClass(DstReg, &AArch64::GPR64RegClass)) {
1164 RC = &AArch64::GPR64RegClass;
1165 Opc = AArch64::CSELXr;
1166 TryFold = true;
1167 } else if (MRI.constrainRegClass(DstReg, &AArch64::GPR32RegClass)) {
1168 RC = &AArch64::GPR32RegClass;
1169 Opc = AArch64::CSELWr;
1170 TryFold = true;
1171 } else if (MRI.constrainRegClass(DstReg, &AArch64::FPR64RegClass)) {
1172 RC = &AArch64::FPR64RegClass;
1173 Opc = AArch64::FCSELDrrr;
1174 } else if (MRI.constrainRegClass(DstReg, &AArch64::FPR32RegClass)) {
1175 RC = &AArch64::FPR32RegClass;
1176 Opc = AArch64::FCSELSrrr;
1177 }
1178 assert(RC && "Unsupported regclass");
1179
1180 // Try folding simple instructions into the csel.
1181 if (TryFold) {
1182 unsigned NewReg = 0;
1183 unsigned FoldedOpc = canFoldIntoCSel(MRI, TrueReg, &NewReg);
1184 if (FoldedOpc) {
1185 // The folded opcodes csinc, csinc and csneg apply the operation to
1186 // FalseReg, so we need to invert the condition.
1188 TrueReg = FalseReg;
1189 } else
1190 FoldedOpc = canFoldIntoCSel(MRI, FalseReg, &NewReg);
1191
1192 // Fold the operation. Leave any dead instructions for DCE to clean up.
1193 if (FoldedOpc) {
1194 FalseReg = NewReg;
1195 Opc = FoldedOpc;
1196 // Extend the live range of NewReg.
1197 MRI.clearKillFlags(NewReg);
1198 }
1199 }
1200
1201 // Pull all virtual register into the appropriate class.
1202 MRI.constrainRegClass(TrueReg, RC);
1203 // FalseReg might be WZR or XZR if the folded operand is a literal 1.
1204 assert(
1205 (FalseReg.isVirtual() || FalseReg == AArch64::WZR ||
1206 FalseReg == AArch64::XZR) &&
1207 "FalseReg was folded into a non-virtual register other than WZR or XZR");
1208 if (FalseReg.isVirtual())
1209 MRI.constrainRegClass(FalseReg, RC);
1210
1211 // Insert the csel.
1212 BuildMI(MBB, I, DL, get(Opc), DstReg)
1213 .addReg(TrueReg)
1214 .addReg(FalseReg)
1215 .addImm(CC);
1216}
1217
1218// Return true if Imm can be loaded into a register by a "cheap" sequence of
1219// instructions. For now, "cheap" means at most two instructions.
1220static bool isCheapImmediate(const MachineInstr &MI, unsigned BitSize) {
1221 if (BitSize == 32)
1222 return true;
1223
1224 assert(BitSize == 64 && "Only bit sizes of 32 or 64 allowed");
1225 uint64_t Imm = static_cast<uint64_t>(MI.getOperand(1).getImm());
1227 AArch64_IMM::expandMOVImm(Imm, BitSize, Is);
1228
1229 return Is.size() <= 2;
1230}
1231
1232// Check if a COPY instruction is cheap.
1233static bool isCheapCopy(const MachineInstr &MI, const AArch64RegisterInfo &RI) {
1234 assert(MI.isCopy() && "Expected COPY instruction");
1235 const MachineRegisterInfo &MRI = MI.getMF()->getRegInfo();
1236
1237 // Cross-bank copies (e.g., between GPR and FPR) are expensive on AArch64,
1238 // typically requiring an FMOV instruction with a 2-6 cycle latency.
1239 auto GetRegClass = [&](Register Reg) -> const TargetRegisterClass * {
1240 if (Reg.isVirtual())
1241 return MRI.getRegClass(Reg);
1242 if (Reg.isPhysical())
1243 return RI.getMinimalPhysRegClass(Reg);
1244 return nullptr;
1245 };
1246 const TargetRegisterClass *DstRC = GetRegClass(MI.getOperand(0).getReg());
1247 const TargetRegisterClass *SrcRC = GetRegClass(MI.getOperand(1).getReg());
1248 if (DstRC && SrcRC && !RI.getCommonSubClass(DstRC, SrcRC))
1249 return false;
1250
1251 return MI.isAsCheapAsAMove();
1252}
1253
1254// FIXME: this implementation should be micro-architecture dependent, so a
1255// micro-architecture target hook should be introduced here in future.
1257 if (Subtarget.hasExynosCheapAsMoveHandling()) {
1258 if (isExynosCheapAsMove(MI))
1259 return true;
1260 return MI.isAsCheapAsAMove();
1261 }
1262
1263 switch (MI.getOpcode()) {
1264 default:
1265 return MI.isAsCheapAsAMove();
1266
1267 case TargetOpcode::COPY:
1268 return isCheapCopy(MI, RI);
1269
1270 case AArch64::ADDWrs:
1271 case AArch64::ADDXrs:
1272 case AArch64::SUBWrs:
1273 case AArch64::SUBXrs:
1274 return Subtarget.hasALULSLFast() && MI.getOperand(3).getImm() <= 4;
1275
1276 // If MOVi32imm or MOVi64imm can be expanded into ORRWri or
1277 // ORRXri, it is as cheap as MOV.
1278 // Likewise if it can be expanded to MOVZ/MOVN/MOVK.
1279 case AArch64::MOVi32imm:
1280 return isCheapImmediate(MI, 32);
1281 case AArch64::MOVi64imm:
1282 return isCheapImmediate(MI, 64);
1283 }
1284}
1285
/// Return true when \p MI's shifted/extended-operand (or register-offset
/// addressing) form is cheap on Falkor: only certain shift types/amounts,
/// unsigned extends with small shifts, and unscaled register-offset memory
/// accesses qualify.
bool AArch64InstrInfo::isFalkorShiftExtFast(const MachineInstr &MI) {
  switch (MI.getOpcode()) {
  default:
    return false;

  case AArch64::ADDWrs:
  case AArch64::ADDXrs:
  case AArch64::ADDSWrs:
  case AArch64::ADDSXrs: {
    // Shifted add: fast when unshifted or LSL by at most 5.
    unsigned Imm = MI.getOperand(3).getImm();
    unsigned ShiftVal = AArch64_AM::getShiftValue(Imm);
    if (ShiftVal == 0)
      return true;
    return AArch64_AM::getShiftType(Imm) == AArch64_AM::LSL && ShiftVal <= 5;
  }

  case AArch64::ADDWrx:
  case AArch64::ADDXrx:
  case AArch64::ADDXrx64:
  case AArch64::ADDSWrx:
  case AArch64::ADDSXrx:
  case AArch64::ADDSXrx64: {
    // Extended add: only unsigned extends with a shift of at most 4 are fast.
    unsigned Imm = MI.getOperand(3).getImm();
    switch (AArch64_AM::getArithExtendType(Imm)) {
    default:
      return false;
    case AArch64_AM::UXTB:
    case AArch64_AM::UXTH:
    case AArch64_AM::UXTW:
    case AArch64_AM::UXTX:
      return AArch64_AM::getArithShiftValue(Imm) <= 4;
    }
  }

  case AArch64::SUBWrs:
  case AArch64::SUBSWrs: {
    // Shifted 32-bit sub: fast when unshifted or ASR by 31 (sign-bit splat).
    unsigned Imm = MI.getOperand(3).getImm();
    unsigned ShiftVal = AArch64_AM::getShiftValue(Imm);
    return ShiftVal == 0 ||
           (AArch64_AM::getShiftType(Imm) == AArch64_AM::ASR && ShiftVal == 31);
  }

  case AArch64::SUBXrs:
  case AArch64::SUBSXrs: {
    // Shifted 64-bit sub: fast when unshifted or ASR by 63 (sign-bit splat).
    unsigned Imm = MI.getOperand(3).getImm();
    unsigned ShiftVal = AArch64_AM::getShiftValue(Imm);
    return ShiftVal == 0 ||
           (AArch64_AM::getShiftType(Imm) == AArch64_AM::ASR && ShiftVal == 63);
  }

  case AArch64::SUBWrx:
  case AArch64::SUBXrx:
  case AArch64::SUBXrx64:
  case AArch64::SUBSWrx:
  case AArch64::SUBSXrx:
  case AArch64::SUBSXrx64: {
    // Extended sub: only unshifted unsigned extends are fast.
    unsigned Imm = MI.getOperand(3).getImm();
    switch (AArch64_AM::getArithExtendType(Imm)) {
    default:
      return false;
    case AArch64_AM::UXTB:
    case AArch64_AM::UXTH:
    case AArch64_AM::UXTW:
    case AArch64_AM::UXTX:
      return AArch64_AM::getArithShiftValue(Imm) == 0;
    }
  }

  case AArch64::LDRBBroW:
  case AArch64::LDRBBroX:
  case AArch64::LDRBroW:
  case AArch64::LDRBroX:
  case AArch64::LDRDroW:
  case AArch64::LDRDroX:
  case AArch64::LDRHHroW:
  case AArch64::LDRHHroX:
  case AArch64::LDRHroW:
  case AArch64::LDRHroX:
  case AArch64::LDRQroW:
  case AArch64::LDRQroX:
  case AArch64::LDRSBWroW:
  case AArch64::LDRSBWroX:
  case AArch64::LDRSBXroW:
  case AArch64::LDRSBXroX:
  case AArch64::LDRSHWroW:
  case AArch64::LDRSHWroX:
  case AArch64::LDRSHXroW:
  case AArch64::LDRSHXroX:
  case AArch64::LDRSWroW:
  case AArch64::LDRSWroX:
  case AArch64::LDRSroW:
  case AArch64::LDRSroX:
  case AArch64::LDRWroW:
  case AArch64::LDRWroX:
  case AArch64::LDRXroW:
  case AArch64::LDRXroX:
  case AArch64::PRFMroW:
  case AArch64::PRFMroX:
  case AArch64::STRBBroW:
  case AArch64::STRBBroX:
  case AArch64::STRBroW:
  case AArch64::STRBroX:
  case AArch64::STRDroW:
  case AArch64::STRDroX:
  case AArch64::STRHHroW:
  case AArch64::STRHHroX:
  case AArch64::STRHroW:
  case AArch64::STRHroX:
  case AArch64::STRQroW:
  case AArch64::STRQroX:
  case AArch64::STRSroW:
  case AArch64::STRSroX:
  case AArch64::STRWroW:
  case AArch64::STRWroX:
  case AArch64::STRXroW:
  case AArch64::STRXroX: {
    // Register-offset load/store/prefetch: operand 3 holds the
    // extend/sign flag; only the unsigned (zero-extend) form is fast.
    unsigned IsSigned = MI.getOperand(3).getImm();
    return !IsSigned;
  }
  }
}
1407
/// Return true if \p MI is one of the SEH_* pseudo instructions (they describe
/// unwind information and must be kept attached to the code they annotate).
bool AArch64InstrInfo::isSEHInstruction(const MachineInstr &MI) {
  unsigned Opc = MI.getOpcode();
  switch (Opc) {
  default:
    return false;
  case AArch64::SEH_StackAlloc:
  case AArch64::SEH_SaveFPLR:
  case AArch64::SEH_SaveFPLR_X:
  case AArch64::SEH_SaveReg:
  case AArch64::SEH_SaveReg_X:
  case AArch64::SEH_SaveRegP:
  case AArch64::SEH_SaveRegP_X:
  case AArch64::SEH_SaveFReg:
  case AArch64::SEH_SaveFReg_X:
  case AArch64::SEH_SaveFRegP:
  case AArch64::SEH_SaveFRegP_X:
  case AArch64::SEH_SetFP:
  case AArch64::SEH_AddFP:
  case AArch64::SEH_Nop:
  case AArch64::SEH_PrologEnd:
  case AArch64::SEH_EpilogStart:
  case AArch64::SEH_EpilogEnd:
  case AArch64::SEH_PACSignLR:
  case AArch64::SEH_SaveAnyRegI:
  case AArch64::SEH_SaveAnyRegIP:
  case AArch64::SEH_SaveAnyRegQP:
  case AArch64::SEH_SaveAnyRegQPX:
  case AArch64::SEH_AllocZ:
  case AArch64::SEH_SaveZReg:
  case AArch64::SEH_SavePReg:
    return true;
  }
}
1441
1443 Register &SrcReg, Register &DstReg,
1444 unsigned &SubIdx) const {
1445 switch (MI.getOpcode()) {
1446 default:
1447 return false;
1448 case AArch64::SBFMXri: // aka sxtw
1449 case AArch64::UBFMXri: // aka uxtw
1450 // Check for the 32 -> 64 bit extension case, these instructions can do
1451 // much more.
1452 if (MI.getOperand(2).getImm() != 0 || MI.getOperand(3).getImm() != 31)
1453 return false;
1454 // This is a signed or unsigned 32 -> 64 bit extension.
1455 SrcReg = MI.getOperand(1).getReg();
1456 DstReg = MI.getOperand(0).getReg();
1457 SubIdx = AArch64::sub_32;
1458 return true;
1459 }
1460}
1461
1463 const MachineInstr &MIa, const MachineInstr &MIb) const {
1465 const MachineOperand *BaseOpA = nullptr, *BaseOpB = nullptr;
1466 int64_t OffsetA = 0, OffsetB = 0;
1467 TypeSize WidthA(0, false), WidthB(0, false);
1468 bool OffsetAIsScalable = false, OffsetBIsScalable = false;
1469
1470 assert(MIa.mayLoadOrStore() && "MIa must be a load or store.");
1471 assert(MIb.mayLoadOrStore() && "MIb must be a load or store.");
1472
1475 return false;
1476
1477 // Retrieve the base, offset from the base and width. Width
1478 // is the size of memory that is being loaded/stored (e.g. 1, 2, 4, 8). If
1479 // base are identical, and the offset of a lower memory access +
1480 // the width doesn't overlap the offset of a higher memory access,
1481 // then the memory accesses are different.
1482 // If OffsetAIsScalable and OffsetBIsScalable are both true, they
1483 // are assumed to have the same scale (vscale).
1484 if (getMemOperandWithOffsetWidth(MIa, BaseOpA, OffsetA, OffsetAIsScalable,
1485 WidthA, TRI) &&
1486 getMemOperandWithOffsetWidth(MIb, BaseOpB, OffsetB, OffsetBIsScalable,
1487 WidthB, TRI)) {
1488 if (BaseOpA->isIdenticalTo(*BaseOpB) &&
1489 OffsetAIsScalable == OffsetBIsScalable) {
1490 int LowOffset = OffsetA < OffsetB ? OffsetA : OffsetB;
1491 int HighOffset = OffsetA < OffsetB ? OffsetB : OffsetA;
1492 TypeSize LowWidth = (LowOffset == OffsetA) ? WidthA : WidthB;
1493 if (LowWidth.isScalable() == OffsetAIsScalable &&
1494 LowOffset + (int)LowWidth.getKnownMinValue() <= HighOffset)
1495 return true;
1496 }
1497 }
1498 return false;
1499}
1500
1502 const MachineBasicBlock *MBB,
1503 const MachineFunction &MF) const {
1505 return true;
1506
1507 // Do not move an instruction that can be recognized as a branch target.
1508 if (hasBTISemantics(MI))
1509 return true;
1510
1511 switch (MI.getOpcode()) {
1512 case AArch64::HINT:
1513 // CSDB hints are scheduling barriers.
1514 if (MI.getOperand(0).getImm() == 0x14)
1515 return true;
1516 break;
1517 case AArch64::DSB:
1518 case AArch64::ISB:
1519 // DSB and ISB also are scheduling barriers.
1520 return true;
1521 case AArch64::MSRpstatesvcrImm1:
1522 // SMSTART and SMSTOP are also scheduling barriers.
1523 return true;
1524 default:;
1525 }
1526 if (isSEHInstruction(MI))
1527 return true;
1528 auto Next = std::next(MI.getIterator());
1529 return Next != MBB->end() && Next->isCFIInstruction();
1530}
1531
1532/// analyzeCompare - For a comparison instruction, return the source registers
1533/// in SrcReg and SrcReg2, and the value it compares against in CmpValue.
1534/// Return true if the comparison instruction can be analyzed.
1536 Register &SrcReg2, int64_t &CmpMask,
1537 int64_t &CmpValue) const {
1538 // The first operand can be a frame index where we'd normally expect a
1539 // register.
1540 // FIXME: Pass subregisters out of analyzeCompare
1541 assert(MI.getNumOperands() >= 2 && "All AArch64 cmps should have 2 operands");
1542 if (!MI.getOperand(1).isReg() || MI.getOperand(1).getSubReg())
1543 return false;
1544
1545 switch (MI.getOpcode()) {
1546 default:
1547 break;
1548 case AArch64::PTEST_PP:
1549 case AArch64::PTEST_PP_ANY:
1550 case AArch64::PTEST_PP_FIRST:
1551 SrcReg = MI.getOperand(0).getReg();
1552 SrcReg2 = MI.getOperand(1).getReg();
1553 if (MI.getOperand(2).getSubReg())
1554 return false;
1555
1556 // Not sure about the mask and value for now...
1557 CmpMask = ~0;
1558 CmpValue = 0;
1559 return true;
1560 case AArch64::SUBSWrr:
1561 case AArch64::SUBSWrs:
1562 case AArch64::SUBSWrx:
1563 case AArch64::SUBSXrr:
1564 case AArch64::SUBSXrs:
1565 case AArch64::SUBSXrx:
1566 case AArch64::ADDSWrr:
1567 case AArch64::ADDSWrs:
1568 case AArch64::ADDSWrx:
1569 case AArch64::ADDSXrr:
1570 case AArch64::ADDSXrs:
1571 case AArch64::ADDSXrx:
1572 // Replace SUBSWrr with SUBWrr if NZCV is not used.
1573 SrcReg = MI.getOperand(1).getReg();
1574 SrcReg2 = MI.getOperand(2).getReg();
1575
1576 // FIXME: Pass subregisters out of analyzeCompare
1577 if (MI.getOperand(2).getSubReg())
1578 return false;
1579
1580 CmpMask = ~0;
1581 CmpValue = 0;
1582 return true;
1583 case AArch64::SUBSWri:
1584 case AArch64::ADDSWri:
1585 case AArch64::SUBSXri:
1586 case AArch64::ADDSXri:
1587 SrcReg = MI.getOperand(1).getReg();
1588 SrcReg2 = 0;
1589 CmpMask = ~0;
1590 CmpValue = MI.getOperand(2).getImm();
1591 return true;
1592 case AArch64::ANDSWri:
1593 case AArch64::ANDSXri:
1594 // ANDS does not use the same encoding scheme as the others xxxS
1595 // instructions.
1596 SrcReg = MI.getOperand(1).getReg();
1597 SrcReg2 = 0;
1598 CmpMask = ~0;
1600 MI.getOperand(2).getImm(),
1601 MI.getOpcode() == AArch64::ANDSWri ? 32 : 64);
1602 return true;
1603 }
1604
1605 return false;
1606}
1607
1609 MachineBasicBlock *MBB = Instr.getParent();
1610 assert(MBB && "Can't get MachineBasicBlock here");
1611 MachineFunction *MF = MBB->getParent();
1612 assert(MF && "Can't get MachineFunction here");
1615 MachineRegisterInfo *MRI = &MF->getRegInfo();
1616
1617 for (unsigned OpIdx = 0, EndIdx = Instr.getNumOperands(); OpIdx < EndIdx;
1618 ++OpIdx) {
1619 MachineOperand &MO = Instr.getOperand(OpIdx);
1620 const TargetRegisterClass *OpRegCstraints =
1621 Instr.getRegClassConstraint(OpIdx, TII, TRI);
1622
1623 // If there's no constraint, there's nothing to do.
1624 if (!OpRegCstraints)
1625 continue;
1626 // If the operand is a frame index, there's nothing to do here.
1627 // A frame index operand will resolve correctly during PEI.
1628 if (MO.isFI())
1629 continue;
1630
1631 assert(MO.isReg() &&
1632 "Operand has register constraints without being a register!");
1633
1634 Register Reg = MO.getReg();
1635 if (Reg.isPhysical()) {
1636 if (!OpRegCstraints->contains(Reg))
1637 return false;
1638 } else if (!OpRegCstraints->hasSubClassEq(MRI->getRegClass(Reg)) &&
1639 !MRI->constrainRegClass(Reg, OpRegCstraints))
1640 return false;
1641 }
1642
1643 return true;
1644}
1645
1646/// Return the opcode that does not set flags when possible - otherwise
1647/// return the original opcode. The caller is responsible to do the actual
1648/// substitution and legality checking.
1650 // Don't convert all compare instructions, because for some the zero register
1651 // encoding becomes the sp register.
1652 bool MIDefinesZeroReg = false;
1653 if (MI.definesRegister(AArch64::WZR, /*TRI=*/nullptr) ||
1654 MI.definesRegister(AArch64::XZR, /*TRI=*/nullptr))
1655 MIDefinesZeroReg = true;
1656
1657 switch (MI.getOpcode()) {
1658 default:
1659 return MI.getOpcode();
1660 case AArch64::ADDSWrr:
1661 return AArch64::ADDWrr;
1662 case AArch64::ADDSWri:
1663 return MIDefinesZeroReg ? AArch64::ADDSWri : AArch64::ADDWri;
1664 case AArch64::ADDSWrs:
1665 return MIDefinesZeroReg ? AArch64::ADDSWrs : AArch64::ADDWrs;
1666 case AArch64::ADDSWrx:
1667 return AArch64::ADDWrx;
1668 case AArch64::ADDSXrr:
1669 return AArch64::ADDXrr;
1670 case AArch64::ADDSXri:
1671 return MIDefinesZeroReg ? AArch64::ADDSXri : AArch64::ADDXri;
1672 case AArch64::ADDSXrs:
1673 return MIDefinesZeroReg ? AArch64::ADDSXrs : AArch64::ADDXrs;
1674 case AArch64::ADDSXrx:
1675 return AArch64::ADDXrx;
1676 case AArch64::SUBSWrr:
1677 return AArch64::SUBWrr;
1678 case AArch64::SUBSWri:
1679 return MIDefinesZeroReg ? AArch64::SUBSWri : AArch64::SUBWri;
1680 case AArch64::SUBSWrs:
1681 return MIDefinesZeroReg ? AArch64::SUBSWrs : AArch64::SUBWrs;
1682 case AArch64::SUBSWrx:
1683 return AArch64::SUBWrx;
1684 case AArch64::SUBSXrr:
1685 return AArch64::SUBXrr;
1686 case AArch64::SUBSXri:
1687 return MIDefinesZeroReg ? AArch64::SUBSXri : AArch64::SUBXri;
1688 case AArch64::SUBSXrs:
1689 return MIDefinesZeroReg ? AArch64::SUBSXrs : AArch64::SUBXrs;
1690 case AArch64::SUBSXrx:
1691 return AArch64::SUBXrx;
1692 }
1693}
1694
/// Kind of NZCV flag access to look for when scanning an instruction range
/// (bit flags; AK_All matches both reads and writes).
enum AccessKind { AK_Write = 0x01, AK_Read = 0x10, AK_All = 0x11 };
1696
1697/// True when condition flags are accessed (either by writing or reading)
1698/// on the instruction trace starting at From and ending at To.
1699///
1700/// Note: If From and To are from different blocks it's assumed CC are accessed
1701/// on the path.
1704 const TargetRegisterInfo *TRI, const AccessKind AccessToCheck = AK_All) {
1705 // Early exit if To is at the beginning of the BB.
1706 if (To == To->getParent()->begin())
1707 return true;
1708
1709 // Check whether the instructions are in the same basic block
1710 // If not, assume the condition flags might get modified somewhere.
1711 if (To->getParent() != From->getParent())
1712 return true;
1713
1714 // From must be above To.
1715 assert(std::any_of(
1716 ++To.getReverse(), To->getParent()->rend(),
1717 [From](MachineInstr &MI) { return MI.getIterator() == From; }));
1718
1719 // We iterate backward starting at \p To until we hit \p From.
1720 for (const MachineInstr &Instr :
1722 if (((AccessToCheck & AK_Write) &&
1723 Instr.modifiesRegister(AArch64::NZCV, TRI)) ||
1724 ((AccessToCheck & AK_Read) && Instr.readsRegister(AArch64::NZCV, TRI)))
1725 return true;
1726 }
1727 return false;
1728}
1729
/// Decide whether \p PTest (a PTEST of predicate \p Pred under governing
/// predicate \p Mask) is redundant because \p Pred already sets NZCV the same
/// way, or has a flag-setting variant that would.
/// \returns the opcode \p Pred should use when the PTEST can be removed
/// (possibly its current opcode), or std::nullopt when it cannot.
std::optional<unsigned>
AArch64InstrInfo::canRemovePTestInstr(MachineInstr *PTest, MachineInstr *Mask,
                                      MachineInstr *Pred,
                                      const MachineRegisterInfo *MRI) const {
  unsigned MaskOpcode = Mask->getOpcode();
  unsigned PredOpcode = Pred->getOpcode();
  bool PredIsPTestLike = isPTestLikeOpcode(PredOpcode);
  bool PredIsWhileLike = isWhileOpcode(PredOpcode);

  if (PredIsWhileLike) {
    // For PTEST(PG, PG), PTEST is redundant when PG is the result of a WHILEcc
    // instruction and the condition is "any" since WHILcc does an implicit
    // PTEST(ALL, PG) check and PG is always a subset of ALL.
    if ((Mask == Pred) && PTest->getOpcode() == AArch64::PTEST_PP_ANY)
      return PredOpcode;

    // For PTEST(PTRUE_ALL, WHILE), if the element size matches, the PTEST is
    // redundant since WHILE performs an implicit PTEST with an all active
    // mask.
    if (isPTrueOpcode(MaskOpcode) && Mask->getOperand(1).getImm() == 31 &&
        getElementSizeForOpcode(MaskOpcode) ==
            getElementSizeForOpcode(PredOpcode))
      return PredOpcode;

    // For PTEST_FIRST(PTRUE_ALL, WHILE), the PTEST_FIRST is redundant since
    // WHILEcc performs an implicit PTEST with an all active mask, setting
    // the N flag as the PTEST_FIRST would.
    if (PTest->getOpcode() == AArch64::PTEST_PP_FIRST &&
        isPTrueOpcode(MaskOpcode) && Mask->getOperand(1).getImm() == 31)
      return PredOpcode;

    return {};
  }

  if (PredIsPTestLike) {
    // For PTEST(PG, PG), PTEST is redundant when PG is the result of an
    // instruction that sets the flags as PTEST would and the condition is
    // "any" since PG is always a subset of the governing predicate of the
    // ptest-like instruction.
    if ((Mask == Pred) && PTest->getOpcode() == AArch64::PTEST_PP_ANY)
      return PredOpcode;

    auto PTestLikeMask = MRI->getUniqueVRegDef(Pred->getOperand(1).getReg());

    // If the PTEST like instruction's general predicate is not `Mask`, attempt
    // to look through a copy and try again. This is because some instructions
    // take a predicate whose register class is a subset of its result class.
    if (Mask != PTestLikeMask && PTestLikeMask->isFullCopy() &&
        PTestLikeMask->getOperand(1).getReg().isVirtual())
      PTestLikeMask =
          MRI->getUniqueVRegDef(PTestLikeMask->getOperand(1).getReg());

    // For PTEST(PTRUE_ALL, PTEST_LIKE), the PTEST is redundant if the
    // the element size matches and either the PTEST_LIKE instruction uses
    // the same all active mask or the condition is "any".
    if (isPTrueOpcode(MaskOpcode) && Mask->getOperand(1).getImm() == 31 &&
        getElementSizeForOpcode(MaskOpcode) ==
            getElementSizeForOpcode(PredOpcode)) {
      if (Mask == PTestLikeMask || PTest->getOpcode() == AArch64::PTEST_PP_ANY)
        return PredOpcode;
    }

    // For PTEST(PG, PTEST_LIKE(PG, ...)), the PTEST is redundant since the
    // flags are set based on the same mask 'PG', but PTEST_LIKE must operate
    // on 8-bit predicates like the PTEST. Otherwise, for instructions like
    // compare that also support 16/32/64-bit predicates, the implicit PTEST
    // performed by the compare could consider fewer lanes for these element
    // sizes.
    //
    // For example, consider
    //
    //   ptrue p0.b                    ; P0=1111-1111-1111-1111
    //   index z0.s, #0, #1            ; Z0=<0,1,2,3>
    //   index z1.s, #1, #1            ; Z1=<1,2,3,4>
    //   cmphi p1.s, p0/z, z1.s, z0.s  ; P1=0001-0001-0001-0001
    //                                 ;       ^ last active
    //   ptest p0, p1.b                ; P1=0001-0001-0001-0001
    //                                 ;     ^ last active
    //
    // where the compare generates a canonical all active 32-bit predicate
    // (equivalent to 'ptrue p1.s, all'). The implicit PTEST sets the last
    // active flag, whereas the PTEST instruction with the same mask doesn't.
    // For PTEST_ANY this doesn't apply as the flags in this case would be
    // identical regardless of element size.
    uint64_t PredElementSize = getElementSizeForOpcode(PredOpcode);
    if (Mask == PTestLikeMask && (PredElementSize == AArch64::ElementSizeB ||
                                  PTest->getOpcode() == AArch64::PTEST_PP_ANY))
      return PredOpcode;

    return {};
  }

  // If OP in PTEST(PG, OP(PG, ...)) has a flag-setting variant change the
  // opcode so the PTEST becomes redundant.
  switch (PredOpcode) {
  case AArch64::AND_PPzPP:
  case AArch64::BIC_PPzPP:
  case AArch64::EOR_PPzPP:
  case AArch64::NAND_PPzPP:
  case AArch64::NOR_PPzPP:
  case AArch64::ORN_PPzPP:
  case AArch64::ORR_PPzPP:
  case AArch64::BRKA_PPzP:
  case AArch64::BRKPA_PPzPP:
  case AArch64::BRKB_PPzP:
  case AArch64::BRKPB_PPzPP:
  case AArch64::RDFFR_PPz: {
    // Check to see if our mask is the same. If not the resulting flag bits
    // may be different and we can't remove the ptest.
    auto *PredMask = MRI->getUniqueVRegDef(Pred->getOperand(1).getReg());
    if (Mask != PredMask)
      return {};
    break;
  }
  case AArch64::BRKN_PPzP: {
    // BRKN uses an all active implicit mask to set flags unlike the other
    // flag-setting instructions.
    // PTEST(PTRUE_B(31), BRKN(PG, A, B)) -> BRKNS(PG, A, B).
    if ((MaskOpcode != AArch64::PTRUE_B) ||
        (Mask->getOperand(1).getImm() != 31))
      return {};
    break;
  }
  case AArch64::PTRUE_B:
    // PTEST(OP=PTRUE_B(A), OP) -> PTRUES_B(A)
    break;
  default:
    // Bail out if we don't recognize the input
    return {};
  }

  return convertToFlagSettingOpc(PredOpcode);
}
1863
/// optimizePTestInstr - Attempt to remove a ptest of a predicate-generating
/// operation which could set the flags in an identical manner
/// \param PTest the PTEST instruction under consideration.
/// \param MaskReg virtual register holding the governing predicate.
/// \param PredReg virtual register holding the tested predicate.
/// \returns true if the PTEST was erased (and the producer possibly switched
/// to a flag-setting opcode).
bool AArch64InstrInfo::optimizePTestInstr(
    MachineInstr *PTest, unsigned MaskReg, unsigned PredReg,
    const MachineRegisterInfo *MRI) const {
  auto *Mask = MRI->getUniqueVRegDef(MaskReg);
  auto *Pred = MRI->getUniqueVRegDef(PredReg);

  if (Pred->isCopy() && PTest->getOpcode() == AArch64::PTEST_PP_FIRST) {
    // Instructions which return a multi-vector (e.g. WHILECC_x2) require copies
    // before the branch to extract each subregister.
    // Look through a psub0 COPY to reach the real producer.
    auto Op = Pred->getOperand(1);
    if (Op.isReg() && Op.getReg().isVirtual() &&
        Op.getSubReg() == AArch64::psub0)
      Pred = MRI->getUniqueVRegDef(Op.getReg());
  }

  unsigned PredOpcode = Pred->getOpcode();
  auto NewOp = canRemovePTestInstr(PTest, Mask, Pred, MRI);
  if (!NewOp)
    return false;

  const TargetRegisterInfo *TRI = &getRegisterInfo();

  // If another instruction between Pred and PTest accesses flags, don't remove
  // the ptest or update the earlier instruction to modify them.
  if (areCFlagsAccessedBetweenInstrs(Pred, PTest, TRI))
    return false;

  // If we pass all the checks, it's safe to remove the PTEST and use the flags
  // as they are prior to PTEST. Sometimes this requires the tested PTEST
  // operand to be replaced with an equivalent instruction that also sets the
  // flags.
  PTest->eraseFromParent();
  if (*NewOp != PredOpcode) {
    Pred->setDesc(get(*NewOp));
    bool succeeded = UpdateOperandRegClass(*Pred);
    (void)succeeded;
    assert(succeeded && "Operands have incompatible register classes!");
    Pred->addRegisterDefined(AArch64::NZCV, TRI);
  }

  // Ensure that the flags def is live.
  // The producer's NZCV def may have been marked dead; clear the flag since
  // the former PTEST consumers now read it.
  if (Pred->registerDefIsDead(AArch64::NZCV, TRI)) {
    unsigned i = 0, e = Pred->getNumOperands();
    for (; i != e; ++i) {
      MachineOperand &MO = Pred->getOperand(i);
      if (MO.isReg() && MO.isDef() && MO.getReg() == AArch64::NZCV) {
        MO.setIsDead(false);
        break;
      }
    }
  }
  return true;
}
1919
1920/// Try to optimize a compare instruction. A compare instruction is an
1921/// instruction which produces AArch64::NZCV. It can be truly compare
1922/// instruction
1923/// when there are no uses of its destination register.
1924///
1925/// The following steps are tried in order:
1926/// 1. Convert CmpInstr into an unconditional version.
1927/// 2. Remove CmpInstr if above there is an instruction producing a needed
1928/// condition code or an instruction which can be converted into such an
1929/// instruction.
1930/// Only comparison with zero is supported.
1932 MachineInstr &CmpInstr, Register SrcReg, Register SrcReg2, int64_t CmpMask,
1933 int64_t CmpValue, const MachineRegisterInfo *MRI) const {
1934 assert(CmpInstr.getParent());
1935 assert(MRI);
1936
1937 // Replace SUBSWrr with SUBWrr if NZCV is not used.
1938 int DeadNZCVIdx =
1939 CmpInstr.findRegisterDefOperandIdx(AArch64::NZCV, /*TRI=*/nullptr, true);
1940 if (DeadNZCVIdx != -1) {
1941 if (CmpInstr.definesRegister(AArch64::WZR, /*TRI=*/nullptr) ||
1942 CmpInstr.definesRegister(AArch64::XZR, /*TRI=*/nullptr)) {
1943 CmpInstr.eraseFromParent();
1944 return true;
1945 }
1946 unsigned Opc = CmpInstr.getOpcode();
1947 unsigned NewOpc = convertToNonFlagSettingOpc(CmpInstr);
1948 if (NewOpc == Opc)
1949 return false;
1950 const MCInstrDesc &MCID = get(NewOpc);
1951 CmpInstr.setDesc(MCID);
1952 CmpInstr.removeOperand(DeadNZCVIdx);
1953 bool succeeded = UpdateOperandRegClass(CmpInstr);
1954 (void)succeeded;
1955 assert(succeeded && "Some operands reg class are incompatible!");
1956 return true;
1957 }
1958
1959 if (CmpInstr.getOpcode() == AArch64::PTEST_PP ||
1960 CmpInstr.getOpcode() == AArch64::PTEST_PP_ANY ||
1961 CmpInstr.getOpcode() == AArch64::PTEST_PP_FIRST)
1962 return optimizePTestInstr(&CmpInstr, SrcReg, SrcReg2, MRI);
1963
1964 if (SrcReg2 != 0)
1965 return false;
1966
1967 // CmpInstr is a Compare instruction if destination register is not used.
1968 if (!MRI->use_nodbg_empty(CmpInstr.getOperand(0).getReg()))
1969 return false;
1970
1971 if (CmpValue == 0 && substituteCmpToZero(CmpInstr, SrcReg, *MRI))
1972 return true;
1973 return (CmpValue == 0 || CmpValue == 1) &&
1974 removeCmpToZeroOrOne(CmpInstr, SrcReg, CmpValue, *MRI);
1975}
1976
1977/// Get opcode of S version of Instr.
1978/// If Instr is S version its opcode is returned.
1979/// AArch64::INSTRUCTION_LIST_END is returned if Instr does not have S version
1980/// or we are not interested in it.
1981static unsigned sForm(MachineInstr &Instr) {
1982 switch (Instr.getOpcode()) {
1983 default:
1984 return AArch64::INSTRUCTION_LIST_END;
1985
1986 case AArch64::ADDSWrr:
1987 case AArch64::ADDSWri:
1988 case AArch64::ADDSXrr:
1989 case AArch64::ADDSXri:
1990 case AArch64::ADDSWrx:
1991 case AArch64::ADDSXrx:
1992 case AArch64::SUBSWrr:
1993 case AArch64::SUBSWri:
1994 case AArch64::SUBSWrx:
1995 case AArch64::SUBSXrr:
1996 case AArch64::SUBSXri:
1997 case AArch64::SUBSXrx:
1998 case AArch64::ANDSWri:
1999 case AArch64::ANDSWrr:
2000 case AArch64::ANDSWrs:
2001 case AArch64::ANDSXri:
2002 case AArch64::ANDSXrr:
2003 case AArch64::ANDSXrs:
2004 case AArch64::BICSWrr:
2005 case AArch64::BICSXrr:
2006 case AArch64::BICSWrs:
2007 case AArch64::BICSXrs:
2008 return Instr.getOpcode();
2009
2010 case AArch64::ADDWrr:
2011 return AArch64::ADDSWrr;
2012 case AArch64::ADDWri:
2013 return AArch64::ADDSWri;
2014 case AArch64::ADDXrr:
2015 return AArch64::ADDSXrr;
2016 case AArch64::ADDXri:
2017 return AArch64::ADDSXri;
2018 case AArch64::ADDWrx:
2019 return AArch64::ADDSWrx;
2020 case AArch64::ADDXrx:
2021 return AArch64::ADDSXrx;
2022 case AArch64::ADCWr:
2023 return AArch64::ADCSWr;
2024 case AArch64::ADCXr:
2025 return AArch64::ADCSXr;
2026 case AArch64::SUBWrr:
2027 return AArch64::SUBSWrr;
2028 case AArch64::SUBWri:
2029 return AArch64::SUBSWri;
2030 case AArch64::SUBXrr:
2031 return AArch64::SUBSXrr;
2032 case AArch64::SUBXri:
2033 return AArch64::SUBSXri;
2034 case AArch64::SUBWrx:
2035 return AArch64::SUBSWrx;
2036 case AArch64::SUBXrx:
2037 return AArch64::SUBSXrx;
2038 case AArch64::SBCWr:
2039 return AArch64::SBCSWr;
2040 case AArch64::SBCXr:
2041 return AArch64::SBCSXr;
2042 case AArch64::ANDWri:
2043 return AArch64::ANDSWri;
2044 case AArch64::ANDXri:
2045 return AArch64::ANDSXri;
2046 case AArch64::ANDWrr:
2047 return AArch64::ANDSWrr;
2048 case AArch64::ANDWrs:
2049 return AArch64::ANDSWrs;
2050 case AArch64::ANDXrr:
2051 return AArch64::ANDSXrr;
2052 case AArch64::ANDXrs:
2053 return AArch64::ANDSXrs;
2054 case AArch64::BICWrr:
2055 return AArch64::BICSWrr;
2056 case AArch64::BICXrr:
2057 return AArch64::BICSXrr;
2058 case AArch64::BICWrs:
2059 return AArch64::BICSWrs;
2060 case AArch64::BICXrs:
2061 return AArch64::BICSXrs;
2062 }
2063}
2064
2065/// Check if AArch64::NZCV should be alive in successors of MBB.
2067 for (auto *BB : MBB->successors())
2068 if (BB->isLiveIn(AArch64::NZCV))
2069 return true;
2070 return false;
2071}
2072
2073/// \returns The condition code operand index for \p Instr if it is a branch
2074/// or select and -1 otherwise.
2076 const MachineInstr &Instr) {
2077 switch (Instr.getOpcode()) {
2078 default:
2079 return -1;
2080
2081 case AArch64::Bcc: {
2082 int Idx = Instr.findRegisterUseOperandIdx(AArch64::NZCV, /*TRI=*/nullptr);
2083 assert(Idx >= 2);
2084 return Idx - 2;
2085 }
2086
2087 case AArch64::CSINVWr:
2088 case AArch64::CSINVXr:
2089 case AArch64::CSINCWr:
2090 case AArch64::CSINCXr:
2091 case AArch64::CSELWr:
2092 case AArch64::CSELXr:
2093 case AArch64::CSNEGWr:
2094 case AArch64::CSNEGXr:
2095 case AArch64::FCSELSrrr:
2096 case AArch64::FCSELDrrr: {
2097 int Idx = Instr.findRegisterUseOperandIdx(AArch64::NZCV, /*TRI=*/nullptr);
2098 assert(Idx >= 1);
2099 return Idx - 1;
2100 }
2101 }
2102}
2103
2104/// Find a condition code used by the instruction.
2105/// Returns AArch64CC::Invalid if either the instruction does not use condition
2106/// codes or we don't optimize CmpInstr in the presence of such instructions.
2108 int CCIdx =
2110 return CCIdx >= 0 ? static_cast<AArch64CC::CondCode>(
2111 Instr.getOperand(CCIdx).getImm())
2113}
2114
2117 UsedNZCV UsedFlags;
2118 switch (CC) {
2119 default:
2120 break;
2121
2122 case AArch64CC::EQ: // Z set
2123 case AArch64CC::NE: // Z clear
2124 UsedFlags.Z = true;
2125 break;
2126
2127 case AArch64CC::HI: // Z clear and C set
2128 case AArch64CC::LS: // Z set or C clear
2129 UsedFlags.Z = true;
2130 [[fallthrough]];
2131 case AArch64CC::HS: // C set
2132 case AArch64CC::LO: // C clear
2133 UsedFlags.C = true;
2134 break;
2135
2136 case AArch64CC::MI: // N set
2137 case AArch64CC::PL: // N clear
2138 UsedFlags.N = true;
2139 break;
2140
2141 case AArch64CC::VS: // V set
2142 case AArch64CC::VC: // V clear
2143 UsedFlags.V = true;
2144 break;
2145
2146 case AArch64CC::GT: // Z clear, N and V the same
2147 case AArch64CC::LE: // Z set, N and V differ
2148 UsedFlags.Z = true;
2149 [[fallthrough]];
2150 case AArch64CC::GE: // N and V the same
2151 case AArch64CC::LT: // N and V differ
2152 UsedFlags.N = true;
2153 UsedFlags.V = true;
2154 break;
2155 }
2156 return UsedFlags;
2157}
2158
2159/// \returns Conditions flags used after \p CmpInstr in its MachineBB if NZCV
2160/// flags are not alive in successors of the same \p CmpInstr and \p MI parent.
2161/// \returns std::nullopt otherwise.
2162///
2163/// Collect instructions using that flags in \p CCUseInstrs if provided.
2164std::optional<UsedNZCV>
2166 const TargetRegisterInfo &TRI,
2167 SmallVectorImpl<MachineInstr *> *CCUseInstrs) {
2168 MachineBasicBlock *CmpParent = CmpInstr.getParent();
2169 if (MI.getParent() != CmpParent)
2170 return std::nullopt;
2171
2172 if (areCFlagsAliveInSuccessors(CmpParent))
2173 return std::nullopt;
2174
2175 UsedNZCV NZCVUsedAfterCmp;
2177 std::next(CmpInstr.getIterator()), CmpParent->instr_end())) {
2178 if (Instr.readsRegister(AArch64::NZCV, &TRI)) {
2180 if (CC == AArch64CC::Invalid) // Unsupported conditional instruction
2181 return std::nullopt;
2182 NZCVUsedAfterCmp |= getUsedNZCV(CC);
2183 if (CCUseInstrs)
2184 CCUseInstrs->push_back(&Instr);
2185 }
2186 if (Instr.modifiesRegister(AArch64::NZCV, &TRI))
2187 break;
2188 }
2189 return NZCVUsedAfterCmp;
2190}
2191
2192static bool isADDSRegImm(unsigned Opcode) {
2193 return Opcode == AArch64::ADDSWri || Opcode == AArch64::ADDSXri;
2194}
2195
2196static bool isSUBSRegImm(unsigned Opcode) {
2197 return Opcode == AArch64::SUBSWri || Opcode == AArch64::SUBSXri;
2198}
2199
2201 unsigned Opc = sForm(MI);
2202 switch (Opc) {
2203 case AArch64::ANDSWri:
2204 case AArch64::ANDSWrr:
2205 case AArch64::ANDSWrs:
2206 case AArch64::ANDSXri:
2207 case AArch64::ANDSXrr:
2208 case AArch64::ANDSXrs:
2209 case AArch64::BICSWrr:
2210 case AArch64::BICSXrr:
2211 case AArch64::BICSWrs:
2212 case AArch64::BICSXrs:
2213 return true;
2214 default:
2215 return false;
2216 }
2217}
2218
2219/// Check if CmpInstr can be substituted by MI.
2220///
2221/// CmpInstr can be substituted:
2222/// - CmpInstr is either 'ADDS %vreg, 0' or 'SUBS %vreg, 0'
2223/// - and, MI and CmpInstr are from the same MachineBB
2224/// - and, condition flags are not alive in successors of the CmpInstr parent
2225/// - and, if MI opcode is the S form there must be no defs of flags between
2226/// MI and CmpInstr
2227/// or if MI opcode is not the S form there must be neither defs of flags
2228/// nor uses of flags between MI and CmpInstr.
2229/// - and, if C/V flags are not used after CmpInstr
2230/// or if N flag is used but MI produces poison value if signed overflow
2231/// occurs.
2233 const TargetRegisterInfo &TRI) {
2234 // NOTE this assertion guarantees that MI.getOpcode() is add or subtraction
2235 // that may or may not set flags.
2236 assert(sForm(MI) != AArch64::INSTRUCTION_LIST_END);
2237
2238 const unsigned CmpOpcode = CmpInstr.getOpcode();
2239 if (!isADDSRegImm(CmpOpcode) && !isSUBSRegImm(CmpOpcode))
2240 return false;
2241
2242 assert((CmpInstr.getOperand(2).isImm() &&
2243 CmpInstr.getOperand(2).getImm() == 0) &&
2244 "Caller guarantees that CmpInstr compares with constant 0");
2245
2246 std::optional<UsedNZCV> NZVCUsed = examineCFlagsUse(MI, CmpInstr, TRI);
2247 if (!NZVCUsed || NZVCUsed->C)
2248 return false;
2249
2250 // CmpInstr is either 'ADDS %vreg, 0' or 'SUBS %vreg, 0', and MI is either
2251 // '%vreg = add ...' or '%vreg = sub ...'.
2252 // Condition flag V is used to indicate signed overflow.
2253 // 1) MI and CmpInstr set N and V to the same value.
2254 // 2) If MI is add/sub with no-signed-wrap, it produces a poison value when
2255 // signed overflow occurs, so CmpInstr could still be simplified away.
2256 // Note that Ands and Bics instructions always clear the V flag.
2257 if (NZVCUsed->V && !MI.getFlag(MachineInstr::NoSWrap) && !isANDOpcode(MI))
2258 return false;
2259
2260 AccessKind AccessToCheck = AK_Write;
2261 if (sForm(MI) != MI.getOpcode())
2262 AccessToCheck = AK_All;
2263 return !areCFlagsAccessedBetweenInstrs(&MI, &CmpInstr, &TRI, AccessToCheck);
2264}
2265
2266/// Substitute an instruction comparing to zero with another instruction
2267/// which produces needed condition flags.
2268///
2269/// Return true on success.
2270bool AArch64InstrInfo::substituteCmpToZero(
2271 MachineInstr &CmpInstr, unsigned SrcReg,
2272 const MachineRegisterInfo &MRI) const {
2273 // Get the unique definition of SrcReg.
2274 MachineInstr *MI = MRI.getUniqueVRegDef(SrcReg);
2275 if (!MI)
2276 return false;
2277
2278 const TargetRegisterInfo &TRI = getRegisterInfo();
2279
2280 unsigned NewOpc = sForm(*MI);
2281 if (NewOpc == AArch64::INSTRUCTION_LIST_END)
2282 return false;
2283
2284 if (!canInstrSubstituteCmpInstr(*MI, CmpInstr, TRI))
2285 return false;
2286
2287 // Update the instruction to set NZCV.
2288 MI->setDesc(get(NewOpc));
2289 CmpInstr.eraseFromParent();
2291 (void)succeeded;
2292 assert(succeeded && "Some operands reg class are incompatible!");
2293 MI->addRegisterDefined(AArch64::NZCV, &TRI);
2294 return true;
2295}
2296
2297/// \returns True if \p CmpInstr can be removed.
2298///
2299/// \p IsInvertCC is true if, after removing \p CmpInstr, condition
2300/// codes used in \p CCUseInstrs must be inverted.
2302 int CmpValue, const TargetRegisterInfo &TRI,
2304 bool &IsInvertCC) {
2305 assert((CmpValue == 0 || CmpValue == 1) &&
2306 "Only comparisons to 0 or 1 considered for removal!");
2307
2308 // MI is 'CSINCWr %vreg, wzr, wzr, <cc>' or 'CSINCXr %vreg, xzr, xzr, <cc>'
2309 unsigned MIOpc = MI.getOpcode();
2310 if (MIOpc == AArch64::CSINCWr) {
2311 if (MI.getOperand(1).getReg() != AArch64::WZR ||
2312 MI.getOperand(2).getReg() != AArch64::WZR)
2313 return false;
2314 } else if (MIOpc == AArch64::CSINCXr) {
2315 if (MI.getOperand(1).getReg() != AArch64::XZR ||
2316 MI.getOperand(2).getReg() != AArch64::XZR)
2317 return false;
2318 } else {
2319 return false;
2320 }
2322 if (MICC == AArch64CC::Invalid)
2323 return false;
2324
2325 // NZCV needs to be defined
2326 if (MI.findRegisterDefOperandIdx(AArch64::NZCV, /*TRI=*/nullptr, true) != -1)
2327 return false;
2328
2329 // CmpInstr is 'ADDS %vreg, 0' or 'SUBS %vreg, 0' or 'SUBS %vreg, 1'
2330 const unsigned CmpOpcode = CmpInstr.getOpcode();
2331 bool IsSubsRegImm = isSUBSRegImm(CmpOpcode);
2332 if (CmpValue && !IsSubsRegImm)
2333 return false;
2334 if (!CmpValue && !IsSubsRegImm && !isADDSRegImm(CmpOpcode))
2335 return false;
2336
2337 // MI conditions allowed: eq, ne, mi, pl
2338 UsedNZCV MIUsedNZCV = getUsedNZCV(MICC);
2339 if (MIUsedNZCV.C || MIUsedNZCV.V)
2340 return false;
2341
2342 std::optional<UsedNZCV> NZCVUsedAfterCmp =
2343 examineCFlagsUse(MI, CmpInstr, TRI, &CCUseInstrs);
2344 // Condition flags are not used in CmpInstr basic block successors and only
2345 // Z or N flags allowed to be used after CmpInstr within its basic block
2346 if (!NZCVUsedAfterCmp || NZCVUsedAfterCmp->C || NZCVUsedAfterCmp->V)
2347 return false;
2348 // Z or N flag used after CmpInstr must correspond to the flag used in MI
2349 if ((MIUsedNZCV.Z && NZCVUsedAfterCmp->N) ||
2350 (MIUsedNZCV.N && NZCVUsedAfterCmp->Z))
2351 return false;
2352 // If CmpInstr is comparison to zero MI conditions are limited to eq, ne
2353 if (MIUsedNZCV.N && !CmpValue)
2354 return false;
2355
2356 // There must be no defs of flags between MI and CmpInstr
2357 if (areCFlagsAccessedBetweenInstrs(&MI, &CmpInstr, &TRI, AK_Write))
2358 return false;
2359
2360 // Condition code is inverted in the following cases:
2361 // 1. MI condition is ne; CmpInstr is 'ADDS %vreg, 0' or 'SUBS %vreg, 0'
2362 // 2. MI condition is eq, pl; CmpInstr is 'SUBS %vreg, 1'
2363 IsInvertCC = (CmpValue && (MICC == AArch64CC::EQ || MICC == AArch64CC::PL)) ||
2364 (!CmpValue && MICC == AArch64CC::NE);
2365 return true;
2366}
2367
2368/// Remove comparison in csinc-cmp sequence
2369///
2370/// Examples:
2371/// 1. \code
2372/// csinc w9, wzr, wzr, ne
2373/// cmp w9, #0
2374/// b.eq
2375/// \endcode
2376/// to
2377/// \code
2378/// csinc w9, wzr, wzr, ne
2379/// b.ne
2380/// \endcode
2381///
2382/// 2. \code
2383/// csinc x2, xzr, xzr, mi
2384/// cmp x2, #1
2385/// b.pl
2386/// \endcode
2387/// to
2388/// \code
2389/// csinc x2, xzr, xzr, mi
2390/// b.pl
2391/// \endcode
2392///
2393/// \param CmpInstr comparison instruction
2394/// \return True when comparison removed
2395bool AArch64InstrInfo::removeCmpToZeroOrOne(
2396 MachineInstr &CmpInstr, unsigned SrcReg, int CmpValue,
2397 const MachineRegisterInfo &MRI) const {
2398 MachineInstr *MI = MRI.getUniqueVRegDef(SrcReg);
2399 if (!MI)
2400 return false;
2401 const TargetRegisterInfo &TRI = getRegisterInfo();
2402 SmallVector<MachineInstr *, 4> CCUseInstrs;
2403 bool IsInvertCC = false;
2404 if (!canCmpInstrBeRemoved(*MI, CmpInstr, CmpValue, TRI, CCUseInstrs,
2405 IsInvertCC))
2406 return false;
2407 // Make transformation
2408 CmpInstr.eraseFromParent();
2409 if (IsInvertCC) {
2410 // Invert condition codes in CmpInstr CC users
2411 for (MachineInstr *CCUseInstr : CCUseInstrs) {
2412 int Idx = findCondCodeUseOperandIdxForBranchOrSelect(*CCUseInstr);
2413 assert(Idx >= 0 && "Unexpected instruction using CC.");
2414 MachineOperand &CCOperand = CCUseInstr->getOperand(Idx);
2416 static_cast<AArch64CC::CondCode>(CCOperand.getImm()));
2417 CCOperand.setImm(CCUse);
2418 }
2419 }
2420 return true;
2421}
2422
2423bool AArch64InstrInfo::expandPostRAPseudo(MachineInstr &MI) const {
2424 if (MI.getOpcode() != TargetOpcode::LOAD_STACK_GUARD &&
2425 MI.getOpcode() != AArch64::CATCHRET)
2426 return false;
2427
2428 MachineBasicBlock &MBB = *MI.getParent();
2429 auto &Subtarget = MBB.getParent()->getSubtarget<AArch64Subtarget>();
2430 auto TRI = Subtarget.getRegisterInfo();
2431 DebugLoc DL = MI.getDebugLoc();
2432
2433 if (MI.getOpcode() == AArch64::CATCHRET) {
2434 // Skip to the first instruction before the epilog.
2435 const TargetInstrInfo *TII =
2437 MachineBasicBlock *TargetMBB = MI.getOperand(0).getMBB();
2439 MachineBasicBlock::iterator FirstEpilogSEH = std::prev(MBBI);
2440 while (FirstEpilogSEH->getFlag(MachineInstr::FrameDestroy) &&
2441 FirstEpilogSEH != MBB.begin())
2442 FirstEpilogSEH = std::prev(FirstEpilogSEH);
2443 if (FirstEpilogSEH != MBB.begin())
2444 FirstEpilogSEH = std::next(FirstEpilogSEH);
2445 BuildMI(MBB, FirstEpilogSEH, DL, TII->get(AArch64::ADRP))
2446 .addReg(AArch64::X0, RegState::Define)
2447 .addMBB(TargetMBB);
2448 BuildMI(MBB, FirstEpilogSEH, DL, TII->get(AArch64::ADDXri))
2449 .addReg(AArch64::X0, RegState::Define)
2450 .addReg(AArch64::X0)
2451 .addMBB(TargetMBB)
2452 .addImm(0);
2453 TargetMBB->setMachineBlockAddressTaken();
2454 return true;
2455 }
2456
2457 Register Reg = MI.getOperand(0).getReg();
2459 if (M.getStackProtectorGuard() == "sysreg") {
2460 const AArch64SysReg::SysReg *SrcReg =
2461 AArch64SysReg::lookupSysRegByName(M.getStackProtectorGuardReg());
2462 if (!SrcReg)
2463 report_fatal_error("Unknown SysReg for Stack Protector Guard Register");
2464
2465 // mrs xN, sysreg
2466 BuildMI(MBB, MI, DL, get(AArch64::MRS))
2468 .addImm(SrcReg->Encoding);
2469 int Offset = M.getStackProtectorGuardOffset();
2470 if (Offset >= 0 && Offset <= 32760 && Offset % 8 == 0) {
2471 // ldr xN, [xN, #offset]
2472 BuildMI(MBB, MI, DL, get(AArch64::LDRXui))
2473 .addDef(Reg)
2475 .addImm(Offset / 8);
2476 } else if (Offset >= -256 && Offset <= 255) {
2477 // ldur xN, [xN, #offset]
2478 BuildMI(MBB, MI, DL, get(AArch64::LDURXi))
2479 .addDef(Reg)
2481 .addImm(Offset);
2482 } else if (Offset >= -4095 && Offset <= 4095) {
2483 if (Offset > 0) {
2484 // add xN, xN, #offset
2485 BuildMI(MBB, MI, DL, get(AArch64::ADDXri))
2486 .addDef(Reg)
2488 .addImm(Offset)
2489 .addImm(0);
2490 } else {
2491 // sub xN, xN, #offset
2492 BuildMI(MBB, MI, DL, get(AArch64::SUBXri))
2493 .addDef(Reg)
2495 .addImm(-Offset)
2496 .addImm(0);
2497 }
2498 // ldr xN, [xN]
2499 BuildMI(MBB, MI, DL, get(AArch64::LDRXui))
2500 .addDef(Reg)
2502 .addImm(0);
2503 } else {
2504 // Cases that are larger than +/- 4095 and not a multiple of 8, or larger
2505 // than 23760.
2506 // It might be nice to use AArch64::MOVi32imm here, which would get
2507 // expanded in PreSched2 after PostRA, but our lone scratch Reg already
2508 // contains the MRS result. findScratchNonCalleeSaveRegister() in
2509 // AArch64FrameLowering might help us find such a scratch register
2510 // though. If we failed to find a scratch register, we could emit a
2511 // stream of add instructions to build up the immediate. Or, we could try
2512 // to insert a AArch64::MOVi32imm before register allocation so that we
2513 // didn't need to scavenge for a scratch register.
2514 report_fatal_error("Unable to encode Stack Protector Guard Offset");
2515 }
2516 MBB.erase(MI);
2517 return true;
2518 }
2519
2520 const GlobalValue *GV =
2521 cast<GlobalValue>((*MI.memoperands_begin())->getValue());
2522 const TargetMachine &TM = MBB.getParent()->getTarget();
2523 unsigned OpFlags = Subtarget.ClassifyGlobalReference(GV, TM);
2524 const unsigned char MO_NC = AArch64II::MO_NC;
2525
2526 if ((OpFlags & AArch64II::MO_GOT) != 0) {
2527 BuildMI(MBB, MI, DL, get(AArch64::LOADgot), Reg)
2528 .addGlobalAddress(GV, 0, OpFlags);
2529 if (Subtarget.isTargetILP32()) {
2530 unsigned Reg32 = TRI->getSubReg(Reg, AArch64::sub_32);
2531 BuildMI(MBB, MI, DL, get(AArch64::LDRWui))
2532 .addDef(Reg32, RegState::Dead)
2534 .addImm(0)
2535 .addMemOperand(*MI.memoperands_begin())
2537 } else {
2538 BuildMI(MBB, MI, DL, get(AArch64::LDRXui), Reg)
2540 .addImm(0)
2541 .addMemOperand(*MI.memoperands_begin());
2542 }
2543 } else if (TM.getCodeModel() == CodeModel::Large) {
2544 assert(!Subtarget.isTargetILP32() && "how can large exist in ILP32?");
2545 BuildMI(MBB, MI, DL, get(AArch64::MOVZXi), Reg)
2546 .addGlobalAddress(GV, 0, AArch64II::MO_G0 | MO_NC)
2547 .addImm(0);
2548 BuildMI(MBB, MI, DL, get(AArch64::MOVKXi), Reg)
2550 .addGlobalAddress(GV, 0, AArch64II::MO_G1 | MO_NC)
2551 .addImm(16);
2552 BuildMI(MBB, MI, DL, get(AArch64::MOVKXi), Reg)
2554 .addGlobalAddress(GV, 0, AArch64II::MO_G2 | MO_NC)
2555 .addImm(32);
2556 BuildMI(MBB, MI, DL, get(AArch64::MOVKXi), Reg)
2559 .addImm(48);
2560 BuildMI(MBB, MI, DL, get(AArch64::LDRXui), Reg)
2562 .addImm(0)
2563 .addMemOperand(*MI.memoperands_begin());
2564 } else if (TM.getCodeModel() == CodeModel::Tiny) {
2565 BuildMI(MBB, MI, DL, get(AArch64::ADR), Reg)
2566 .addGlobalAddress(GV, 0, OpFlags);
2567 } else {
2568 BuildMI(MBB, MI, DL, get(AArch64::ADRP), Reg)
2569 .addGlobalAddress(GV, 0, OpFlags | AArch64II::MO_PAGE);
2570 unsigned char LoFlags = OpFlags | AArch64II::MO_PAGEOFF | MO_NC;
2571 if (Subtarget.isTargetILP32()) {
2572 unsigned Reg32 = TRI->getSubReg(Reg, AArch64::sub_32);
2573 BuildMI(MBB, MI, DL, get(AArch64::LDRWui))
2574 .addDef(Reg32, RegState::Dead)
2576 .addGlobalAddress(GV, 0, LoFlags)
2577 .addMemOperand(*MI.memoperands_begin())
2579 } else {
2580 BuildMI(MBB, MI, DL, get(AArch64::LDRXui), Reg)
2582 .addGlobalAddress(GV, 0, LoFlags)
2583 .addMemOperand(*MI.memoperands_begin());
2584 }
2585 }
2586
2587 MBB.erase(MI);
2588
2589 return true;
2590}
2591
2592// Return true if this instruction simply sets its single destination register
2593// to zero. This is equivalent to a register rename of the zero-register.
2595 switch (MI.getOpcode()) {
2596 default:
2597 break;
2598 case AArch64::MOVZWi:
2599 case AArch64::MOVZXi: // movz Rd, #0 (LSL #0)
2600 if (MI.getOperand(1).isImm() && MI.getOperand(1).getImm() == 0) {
2601 assert(MI.getDesc().getNumOperands() == 3 &&
2602 MI.getOperand(2).getImm() == 0 && "invalid MOVZi operands");
2603 return true;
2604 }
2605 break;
2606 case AArch64::ANDWri: // and Rd, Rzr, #imm
2607 return MI.getOperand(1).getReg() == AArch64::WZR;
2608 case AArch64::ANDXri:
2609 return MI.getOperand(1).getReg() == AArch64::XZR;
2610 case TargetOpcode::COPY:
2611 return MI.getOperand(1).getReg() == AArch64::WZR;
2612 }
2613 return false;
2614}
2615
2616// Return true if this instruction simply renames a general register without
2617// modifying bits.
2619 switch (MI.getOpcode()) {
2620 default:
2621 break;
2622 case TargetOpcode::COPY: {
2623 // GPR32 copies will by lowered to ORRXrs
2624 Register DstReg = MI.getOperand(0).getReg();
2625 return (AArch64::GPR32RegClass.contains(DstReg) ||
2626 AArch64::GPR64RegClass.contains(DstReg));
2627 }
2628 case AArch64::ORRXrs: // orr Xd, Xzr, Xm (LSL #0)
2629 if (MI.getOperand(1).getReg() == AArch64::XZR) {
2630 assert(MI.getDesc().getNumOperands() == 4 &&
2631 MI.getOperand(3).getImm() == 0 && "invalid ORRrs operands");
2632 return true;
2633 }
2634 break;
2635 case AArch64::ADDXri: // add Xd, Xn, #0 (LSL #0)
2636 if (MI.getOperand(2).getImm() == 0) {
2637 assert(MI.getDesc().getNumOperands() == 4 &&
2638 MI.getOperand(3).getImm() == 0 && "invalid ADDXri operands");
2639 return true;
2640 }
2641 break;
2642 }
2643 return false;
2644}
2645
2646// Return true if this instruction simply renames a general register without
2647// modifying bits.
2649 switch (MI.getOpcode()) {
2650 default:
2651 break;
2652 case TargetOpcode::COPY: {
2653 Register DstReg = MI.getOperand(0).getReg();
2654 return AArch64::FPR128RegClass.contains(DstReg);
2655 }
2656 case AArch64::ORRv16i8:
2657 if (MI.getOperand(1).getReg() == MI.getOperand(2).getReg()) {
2658 assert(MI.getDesc().getNumOperands() == 3 && MI.getOperand(0).isReg() &&
2659 "invalid ORRv16i8 operands");
2660 return true;
2661 }
2662 break;
2663 }
2664 return false;
2665}
2666
2667static bool isFrameLoadOpcode(int Opcode) {
2668 switch (Opcode) {
2669 default:
2670 return false;
2671 case AArch64::LDRWui:
2672 case AArch64::LDRXui:
2673 case AArch64::LDRBui:
2674 case AArch64::LDRHui:
2675 case AArch64::LDRSui:
2676 case AArch64::LDRDui:
2677 case AArch64::LDRQui:
2678 case AArch64::LDR_PXI:
2679 return true;
2680 }
2681}
2682
2684 int &FrameIndex) const {
2685 if (!isFrameLoadOpcode(MI.getOpcode()))
2686 return Register();
2687
2688 if (MI.getOperand(0).getSubReg() == 0 && MI.getOperand(1).isFI() &&
2689 MI.getOperand(2).isImm() && MI.getOperand(2).getImm() == 0) {
2690 FrameIndex = MI.getOperand(1).getIndex();
2691 return MI.getOperand(0).getReg();
2692 }
2693 return Register();
2694}
2695
2696static bool isFrameStoreOpcode(int Opcode) {
2697 switch (Opcode) {
2698 default:
2699 return false;
2700 case AArch64::STRWui:
2701 case AArch64::STRXui:
2702 case AArch64::STRBui:
2703 case AArch64::STRHui:
2704 case AArch64::STRSui:
2705 case AArch64::STRDui:
2706 case AArch64::STRQui:
2707 case AArch64::STR_PXI:
2708 return true;
2709 }
2710}
2711
2713 int &FrameIndex) const {
2714 if (!isFrameStoreOpcode(MI.getOpcode()))
2715 return Register();
2716
2717 if (MI.getOperand(0).getSubReg() == 0 && MI.getOperand(1).isFI() &&
2718 MI.getOperand(2).isImm() && MI.getOperand(2).getImm() == 0) {
2719 FrameIndex = MI.getOperand(1).getIndex();
2720 return MI.getOperand(0).getReg();
2721 }
2722 return Register();
2723}
2724
2726 int &FrameIndex) const {
2727 if (!isFrameStoreOpcode(MI.getOpcode()))
2728 return Register();
2729
2730 if (Register Reg = isStoreToStackSlot(MI, FrameIndex))
2731 return Reg;
2732
2734 if (hasStoreToStackSlot(MI, Accesses)) {
2735 if (Accesses.size() > 1)
2736 return Register();
2737
2738 FrameIndex =
2739 cast<FixedStackPseudoSourceValue>(Accesses.front()->getPseudoValue())
2740 ->getFrameIndex();
2741 return MI.getOperand(0).getReg();
2742 }
2743 return Register();
2744}
2745
2747 int &FrameIndex) const {
2748 if (!isFrameLoadOpcode(MI.getOpcode()))
2749 return Register();
2750
2751 if (Register Reg = isLoadFromStackSlot(MI, FrameIndex))
2752 return Reg;
2753
2755 if (hasLoadFromStackSlot(MI, Accesses)) {
2756 if (Accesses.size() > 1)
2757 return Register();
2758
2759 FrameIndex =
2760 cast<FixedStackPseudoSourceValue>(Accesses.front()->getPseudoValue())
2761 ->getFrameIndex();
2762 return MI.getOperand(0).getReg();
2763 }
2764 return Register();
2765}
2766
2767/// Check all MachineMemOperands for a hint to suppress pairing.
2769 return llvm::any_of(MI.memoperands(), [](MachineMemOperand *MMO) {
2770 return MMO->getFlags() & MOSuppressPair;
2771 });
2772}
2773
2774/// Set a flag on the first MachineMemOperand to suppress pairing.
2776 if (MI.memoperands_empty())
2777 return;
2778 (*MI.memoperands_begin())->setFlags(MOSuppressPair);
2779}
2780
2781/// Check all MachineMemOperands for a hint that the load/store is strided.
2783 return llvm::any_of(MI.memoperands(), [](MachineMemOperand *MMO) {
2784 return MMO->getFlags() & MOStridedAccess;
2785 });
2786}
2787
2789 switch (Opc) {
2790 default:
2791 return false;
2792 case AArch64::STURSi:
2793 case AArch64::STRSpre:
2794 case AArch64::STURDi:
2795 case AArch64::STRDpre:
2796 case AArch64::STURQi:
2797 case AArch64::STRQpre:
2798 case AArch64::STURBBi:
2799 case AArch64::STURHHi:
2800 case AArch64::STURWi:
2801 case AArch64::STRWpre:
2802 case AArch64::STURXi:
2803 case AArch64::STRXpre:
2804 case AArch64::LDURSi:
2805 case AArch64::LDRSpre:
2806 case AArch64::LDURDi:
2807 case AArch64::LDRDpre:
2808 case AArch64::LDURQi:
2809 case AArch64::LDRQpre:
2810 case AArch64::LDURWi:
2811 case AArch64::LDRWpre:
2812 case AArch64::LDURXi:
2813 case AArch64::LDRXpre:
2814 case AArch64::LDRSWpre:
2815 case AArch64::LDURSWi:
2816 case AArch64::LDURHHi:
2817 case AArch64::LDURBBi:
2818 case AArch64::LDURSBWi:
2819 case AArch64::LDURSHWi:
2820 return true;
2821 }
2822}
2823
2824std::optional<unsigned> AArch64InstrInfo::getUnscaledLdSt(unsigned Opc) {
2825 switch (Opc) {
2826 default: return {};
2827 case AArch64::PRFMui: return AArch64::PRFUMi;
2828 case AArch64::LDRXui: return AArch64::LDURXi;
2829 case AArch64::LDRWui: return AArch64::LDURWi;
2830 case AArch64::LDRBui: return AArch64::LDURBi;
2831 case AArch64::LDRHui: return AArch64::LDURHi;
2832 case AArch64::LDRSui: return AArch64::LDURSi;
2833 case AArch64::LDRDui: return AArch64::LDURDi;
2834 case AArch64::LDRQui: return AArch64::LDURQi;
2835 case AArch64::LDRBBui: return AArch64::LDURBBi;
2836 case AArch64::LDRHHui: return AArch64::LDURHHi;
2837 case AArch64::LDRSBXui: return AArch64::LDURSBXi;
2838 case AArch64::LDRSBWui: return AArch64::LDURSBWi;
2839 case AArch64::LDRSHXui: return AArch64::LDURSHXi;
2840 case AArch64::LDRSHWui: return AArch64::LDURSHWi;
2841 case AArch64::LDRSWui: return AArch64::LDURSWi;
2842 case AArch64::STRXui: return AArch64::STURXi;
2843 case AArch64::STRWui: return AArch64::STURWi;
2844 case AArch64::STRBui: return AArch64::STURBi;
2845 case AArch64::STRHui: return AArch64::STURHi;
2846 case AArch64::STRSui: return AArch64::STURSi;
2847 case AArch64::STRDui: return AArch64::STURDi;
2848 case AArch64::STRQui: return AArch64::STURQi;
2849 case AArch64::STRBBui: return AArch64::STURBBi;
2850 case AArch64::STRHHui: return AArch64::STURHHi;
2851 }
2852}
2853
2855 switch (Opc) {
2856 default:
2857 llvm_unreachable("Unhandled Opcode in getLoadStoreImmIdx");
2858 case AArch64::ADDG:
2859 case AArch64::LDAPURBi:
2860 case AArch64::LDAPURHi:
2861 case AArch64::LDAPURi:
2862 case AArch64::LDAPURSBWi:
2863 case AArch64::LDAPURSBXi:
2864 case AArch64::LDAPURSHWi:
2865 case AArch64::LDAPURSHXi:
2866 case AArch64::LDAPURSWi:
2867 case AArch64::LDAPURXi:
2868 case AArch64::LDR_PPXI:
2869 case AArch64::LDR_PXI:
2870 case AArch64::LDR_ZXI:
2871 case AArch64::LDR_ZZXI:
2872 case AArch64::LDR_ZZXI_STRIDED_CONTIGUOUS:
2873 case AArch64::LDR_ZZZXI:
2874 case AArch64::LDR_ZZZZXI:
2875 case AArch64::LDR_ZZZZXI_STRIDED_CONTIGUOUS:
2876 case AArch64::LDRBBui:
2877 case AArch64::LDRBui:
2878 case AArch64::LDRDui:
2879 case AArch64::LDRHHui:
2880 case AArch64::LDRHui:
2881 case AArch64::LDRQui:
2882 case AArch64::LDRSBWui:
2883 case AArch64::LDRSBXui:
2884 case AArch64::LDRSHWui:
2885 case AArch64::LDRSHXui:
2886 case AArch64::LDRSui:
2887 case AArch64::LDRSWui:
2888 case AArch64::LDRWui:
2889 case AArch64::LDRXui:
2890 case AArch64::LDURBBi:
2891 case AArch64::LDURBi:
2892 case AArch64::LDURDi:
2893 case AArch64::LDURHHi:
2894 case AArch64::LDURHi:
2895 case AArch64::LDURQi:
2896 case AArch64::LDURSBWi:
2897 case AArch64::LDURSBXi:
2898 case AArch64::LDURSHWi:
2899 case AArch64::LDURSHXi:
2900 case AArch64::LDURSi:
2901 case AArch64::LDURSWi:
2902 case AArch64::LDURWi:
2903 case AArch64::LDURXi:
2904 case AArch64::PRFMui:
2905 case AArch64::PRFUMi:
2906 case AArch64::ST2Gi:
2907 case AArch64::STGi:
2908 case AArch64::STLURBi:
2909 case AArch64::STLURHi:
2910 case AArch64::STLURWi:
2911 case AArch64::STLURXi:
2912 case AArch64::StoreSwiftAsyncContext:
2913 case AArch64::STR_PPXI:
2914 case AArch64::STR_PXI:
2915 case AArch64::STR_ZXI:
2916 case AArch64::STR_ZZXI:
2917 case AArch64::STR_ZZXI_STRIDED_CONTIGUOUS:
2918 case AArch64::STR_ZZZXI:
2919 case AArch64::STR_ZZZZXI:
2920 case AArch64::STR_ZZZZXI_STRIDED_CONTIGUOUS:
2921 case AArch64::STRBBui:
2922 case AArch64::STRBui:
2923 case AArch64::STRDui:
2924 case AArch64::STRHHui:
2925 case AArch64::STRHui:
2926 case AArch64::STRQui:
2927 case AArch64::STRSui:
2928 case AArch64::STRWui:
2929 case AArch64::STRXui:
2930 case AArch64::STURBBi:
2931 case AArch64::STURBi:
2932 case AArch64::STURDi:
2933 case AArch64::STURHHi:
2934 case AArch64::STURHi:
2935 case AArch64::STURQi:
2936 case AArch64::STURSi:
2937 case AArch64::STURWi:
2938 case AArch64::STURXi:
2939 case AArch64::STZ2Gi:
2940 case AArch64::STZGi:
2941 case AArch64::TAGPstack:
2942 return 2;
2943 case AArch64::LD1B_D_IMM:
2944 case AArch64::LD1B_H_IMM:
2945 case AArch64::LD1B_IMM:
2946 case AArch64::LD1B_S_IMM:
2947 case AArch64::LD1D_IMM:
2948 case AArch64::LD1H_D_IMM:
2949 case AArch64::LD1H_IMM:
2950 case AArch64::LD1H_S_IMM:
2951 case AArch64::LD1RB_D_IMM:
2952 case AArch64::LD1RB_H_IMM:
2953 case AArch64::LD1RB_IMM:
2954 case AArch64::LD1RB_S_IMM:
2955 case AArch64::LD1RD_IMM:
2956 case AArch64::LD1RH_D_IMM:
2957 case AArch64::LD1RH_IMM:
2958 case AArch64::LD1RH_S_IMM:
2959 case AArch64::LD1RSB_D_IMM:
2960 case AArch64::LD1RSB_H_IMM:
2961 case AArch64::LD1RSB_S_IMM:
2962 case AArch64::LD1RSH_D_IMM:
2963 case AArch64::LD1RSH_S_IMM:
2964 case AArch64::LD1RSW_IMM:
2965 case AArch64::LD1RW_D_IMM:
2966 case AArch64::LD1RW_IMM:
2967 case AArch64::LD1SB_D_IMM:
2968 case AArch64::LD1SB_H_IMM:
2969 case AArch64::LD1SB_S_IMM:
2970 case AArch64::LD1SH_D_IMM:
2971 case AArch64::LD1SH_S_IMM:
2972 case AArch64::LD1SW_D_IMM:
2973 case AArch64::LD1W_D_IMM:
2974 case AArch64::LD1W_IMM:
2975 case AArch64::LD2B_IMM:
2976 case AArch64::LD2D_IMM:
2977 case AArch64::LD2H_IMM:
2978 case AArch64::LD2W_IMM:
2979 case AArch64::LD3B_IMM:
2980 case AArch64::LD3D_IMM:
2981 case AArch64::LD3H_IMM:
2982 case AArch64::LD3W_IMM:
2983 case AArch64::LD4B_IMM:
2984 case AArch64::LD4D_IMM:
2985 case AArch64::LD4H_IMM:
2986 case AArch64::LD4W_IMM:
2987 case AArch64::LDG:
2988 case AArch64::LDNF1B_D_IMM:
2989 case AArch64::LDNF1B_H_IMM:
2990 case AArch64::LDNF1B_IMM:
2991 case AArch64::LDNF1B_S_IMM:
2992 case AArch64::LDNF1D_IMM:
2993 case AArch64::LDNF1H_D_IMM:
2994 case AArch64::LDNF1H_IMM:
2995 case AArch64::LDNF1H_S_IMM:
2996 case AArch64::LDNF1SB_D_IMM:
2997 case AArch64::LDNF1SB_H_IMM:
2998 case AArch64::LDNF1SB_S_IMM:
2999 case AArch64::LDNF1SH_D_IMM:
3000 case AArch64::LDNF1SH_S_IMM:
3001 case AArch64::LDNF1SW_D_IMM:
3002 case AArch64::LDNF1W_D_IMM:
3003 case AArch64::LDNF1W_IMM:
3004 case AArch64::LDNPDi:
3005 case AArch64::LDNPQi:
3006 case AArch64::LDNPSi:
3007 case AArch64::LDNPWi:
3008 case AArch64::LDNPXi:
3009 case AArch64::LDNT1B_ZRI:
3010 case AArch64::LDNT1D_ZRI:
3011 case AArch64::LDNT1H_ZRI:
3012 case AArch64::LDNT1W_ZRI:
3013 case AArch64::LDPDi:
3014 case AArch64::LDPQi:
3015 case AArch64::LDPSi:
3016 case AArch64::LDPWi:
3017 case AArch64::LDPXi:
3018 case AArch64::LDRBBpost:
3019 case AArch64::LDRBBpre:
3020 case AArch64::LDRBpost:
3021 case AArch64::LDRBpre:
3022 case AArch64::LDRDpost:
3023 case AArch64::LDRDpre:
3024 case AArch64::LDRHHpost:
3025 case AArch64::LDRHHpre:
3026 case AArch64::LDRHpost:
3027 case AArch64::LDRHpre:
3028 case AArch64::LDRQpost:
3029 case AArch64::LDRQpre:
3030 case AArch64::LDRSpost:
3031 case AArch64::LDRSpre:
3032 case AArch64::LDRWpost:
3033 case AArch64::LDRWpre:
3034 case AArch64::LDRXpost:
3035 case AArch64::LDRXpre:
3036 case AArch64::ST1B_D_IMM:
3037 case AArch64::ST1B_H_IMM:
3038 case AArch64::ST1B_IMM:
3039 case AArch64::ST1B_S_IMM:
3040 case AArch64::ST1D_IMM:
3041 case AArch64::ST1H_D_IMM:
3042 case AArch64::ST1H_IMM:
3043 case AArch64::ST1H_S_IMM:
3044 case AArch64::ST1W_D_IMM:
3045 case AArch64::ST1W_IMM:
3046 case AArch64::ST2B_IMM:
3047 case AArch64::ST2D_IMM:
3048 case AArch64::ST2H_IMM:
3049 case AArch64::ST2W_IMM:
3050 case AArch64::ST3B_IMM:
3051 case AArch64::ST3D_IMM:
3052 case AArch64::ST3H_IMM:
3053 case AArch64::ST3W_IMM:
3054 case AArch64::ST4B_IMM:
3055 case AArch64::ST4D_IMM:
3056 case AArch64::ST4H_IMM:
3057 case AArch64::ST4W_IMM:
3058 case AArch64::STGPi:
3059 case AArch64::STGPreIndex:
3060 case AArch64::STZGPreIndex:
3061 case AArch64::ST2GPreIndex:
3062 case AArch64::STZ2GPreIndex:
3063 case AArch64::STGPostIndex:
3064 case AArch64::STZGPostIndex:
3065 case AArch64::ST2GPostIndex:
3066 case AArch64::STZ2GPostIndex:
3067 case AArch64::STNPDi:
3068 case AArch64::STNPQi:
3069 case AArch64::STNPSi:
3070 case AArch64::STNPWi:
3071 case AArch64::STNPXi:
3072 case AArch64::STNT1B_ZRI:
3073 case AArch64::STNT1D_ZRI:
3074 case AArch64::STNT1H_ZRI:
3075 case AArch64::STNT1W_ZRI:
3076 case AArch64::STPDi:
3077 case AArch64::STPQi:
3078 case AArch64::STPSi:
3079 case AArch64::STPWi:
3080 case AArch64::STPXi:
3081 case AArch64::STRBBpost:
3082 case AArch64::STRBBpre:
3083 case AArch64::STRBpost:
3084 case AArch64::STRBpre:
3085 case AArch64::STRDpost:
3086 case AArch64::STRDpre:
3087 case AArch64::STRHHpost:
3088 case AArch64::STRHHpre:
3089 case AArch64::STRHpost:
3090 case AArch64::STRHpre:
3091 case AArch64::STRQpost:
3092 case AArch64::STRQpre:
3093 case AArch64::STRSpost:
3094 case AArch64::STRSpre:
3095 case AArch64::STRWpost:
3096 case AArch64::STRWpre:
3097 case AArch64::STRXpost:
3098 case AArch64::STRXpre:
3099 return 3;
3100 case AArch64::LDPDpost:
3101 case AArch64::LDPDpre:
3102 case AArch64::LDPQpost:
3103 case AArch64::LDPQpre:
3104 case AArch64::LDPSpost:
3105 case AArch64::LDPSpre:
3106 case AArch64::LDPWpost:
3107 case AArch64::LDPWpre:
3108 case AArch64::LDPXpost:
3109 case AArch64::LDPXpre:
3110 case AArch64::STGPpre:
3111 case AArch64::STGPpost:
3112 case AArch64::STPDpost:
3113 case AArch64::STPDpre:
3114 case AArch64::STPQpost:
3115 case AArch64::STPQpre:
3116 case AArch64::STPSpost:
3117 case AArch64::STPSpre:
3118 case AArch64::STPWpost:
3119 case AArch64::STPWpre:
3120 case AArch64::STPXpost:
3121 case AArch64::STPXpre:
3122 return 4;
3123 }
3124}
3125
3127 switch (MI.getOpcode()) {
3128 default:
3129 return false;
3130 // Scaled instructions.
3131 case AArch64::STRSui:
3132 case AArch64::STRDui:
3133 case AArch64::STRQui:
3134 case AArch64::STRXui:
3135 case AArch64::STRWui:
3136 case AArch64::LDRSui:
3137 case AArch64::LDRDui:
3138 case AArch64::LDRQui:
3139 case AArch64::LDRXui:
3140 case AArch64::LDRWui:
3141 case AArch64::LDRSWui:
3142 // Unscaled instructions.
3143 case AArch64::STURSi:
3144 case AArch64::STRSpre:
3145 case AArch64::STURDi:
3146 case AArch64::STRDpre:
3147 case AArch64::STURQi:
3148 case AArch64::STRQpre:
3149 case AArch64::STURWi:
3150 case AArch64::STRWpre:
3151 case AArch64::STURXi:
3152 case AArch64::STRXpre:
3153 case AArch64::LDURSi:
3154 case AArch64::LDRSpre:
3155 case AArch64::LDURDi:
3156 case AArch64::LDRDpre:
3157 case AArch64::LDURQi:
3158 case AArch64::LDRQpre:
3159 case AArch64::LDURWi:
3160 case AArch64::LDRWpre:
3161 case AArch64::LDURXi:
3162 case AArch64::LDRXpre:
3163 case AArch64::LDURSWi:
3164 case AArch64::LDRSWpre:
3165 // SVE instructions.
3166 case AArch64::LDR_ZXI:
3167 case AArch64::STR_ZXI:
3168 return true;
3169 }
3170}
3171
3173 switch (MI.getOpcode()) {
3174 default:
3175 assert((!MI.isCall() || !MI.isReturn()) &&
3176 "Unexpected instruction - was a new tail call opcode introduced?");
3177 return false;
3178 case AArch64::TCRETURNdi:
3179 case AArch64::TCRETURNri:
3180 case AArch64::TCRETURNrix16x17:
3181 case AArch64::TCRETURNrix17:
3182 case AArch64::TCRETURNrinotx16:
3183 case AArch64::TCRETURNriALL:
3184 case AArch64::AUTH_TCRETURN:
3185 case AArch64::AUTH_TCRETURN_BTI:
3186 return true;
3187 }
3188}
3189
3191 switch (Opc) {
3192 default:
3193 llvm_unreachable("Opcode has no flag setting equivalent!");
3194 // 32-bit cases:
3195 case AArch64::ADDWri:
3196 return AArch64::ADDSWri;
3197 case AArch64::ADDWrr:
3198 return AArch64::ADDSWrr;
3199 case AArch64::ADDWrs:
3200 return AArch64::ADDSWrs;
3201 case AArch64::ADDWrx:
3202 return AArch64::ADDSWrx;
3203 case AArch64::ANDWri:
3204 return AArch64::ANDSWri;
3205 case AArch64::ANDWrr:
3206 return AArch64::ANDSWrr;
3207 case AArch64::ANDWrs:
3208 return AArch64::ANDSWrs;
3209 case AArch64::BICWrr:
3210 return AArch64::BICSWrr;
3211 case AArch64::BICWrs:
3212 return AArch64::BICSWrs;
3213 case AArch64::SUBWri:
3214 return AArch64::SUBSWri;
3215 case AArch64::SUBWrr:
3216 return AArch64::SUBSWrr;
3217 case AArch64::SUBWrs:
3218 return AArch64::SUBSWrs;
3219 case AArch64::SUBWrx:
3220 return AArch64::SUBSWrx;
3221 // 64-bit cases:
3222 case AArch64::ADDXri:
3223 return AArch64::ADDSXri;
3224 case AArch64::ADDXrr:
3225 return AArch64::ADDSXrr;
3226 case AArch64::ADDXrs:
3227 return AArch64::ADDSXrs;
3228 case AArch64::ADDXrx:
3229 return AArch64::ADDSXrx;
3230 case AArch64::ANDXri:
3231 return AArch64::ANDSXri;
3232 case AArch64::ANDXrr:
3233 return AArch64::ANDSXrr;
3234 case AArch64::ANDXrs:
3235 return AArch64::ANDSXrs;
3236 case AArch64::BICXrr:
3237 return AArch64::BICSXrr;
3238 case AArch64::BICXrs:
3239 return AArch64::BICSXrs;
3240 case AArch64::SUBXri:
3241 return AArch64::SUBSXri;
3242 case AArch64::SUBXrr:
3243 return AArch64::SUBSXrr;
3244 case AArch64::SUBXrs:
3245 return AArch64::SUBSXrs;
3246 case AArch64::SUBXrx:
3247 return AArch64::SUBSXrx;
3248 // SVE instructions:
3249 case AArch64::AND_PPzPP:
3250 return AArch64::ANDS_PPzPP;
3251 case AArch64::BIC_PPzPP:
3252 return AArch64::BICS_PPzPP;
3253 case AArch64::EOR_PPzPP:
3254 return AArch64::EORS_PPzPP;
3255 case AArch64::NAND_PPzPP:
3256 return AArch64::NANDS_PPzPP;
3257 case AArch64::NOR_PPzPP:
3258 return AArch64::NORS_PPzPP;
3259 case AArch64::ORN_PPzPP:
3260 return AArch64::ORNS_PPzPP;
3261 case AArch64::ORR_PPzPP:
3262 return AArch64::ORRS_PPzPP;
3263 case AArch64::BRKA_PPzP:
3264 return AArch64::BRKAS_PPzP;
3265 case AArch64::BRKPA_PPzPP:
3266 return AArch64::BRKPAS_PPzPP;
3267 case AArch64::BRKB_PPzP:
3268 return AArch64::BRKBS_PPzP;
3269 case AArch64::BRKPB_PPzPP:
3270 return AArch64::BRKPBS_PPzPP;
3271 case AArch64::BRKN_PPzP:
3272 return AArch64::BRKNS_PPzP;
3273 case AArch64::RDFFR_PPz:
3274 return AArch64::RDFFRS_PPz;
3275 case AArch64::PTRUE_B:
3276 return AArch64::PTRUES_B;
3277 }
3278}
3279
3280// Is this a candidate for ld/st merging or pairing? For example, we don't
3281// touch volatiles or load/stores that have a hint to avoid pair formation.
3283
3284 bool IsPreLdSt = isPreLdSt(MI);
3285
3286 // If this is a volatile load/store, don't mess with it.
3287 if (MI.hasOrderedMemoryRef())
3288 return false;
3289
3290 // Make sure this is a reg/fi+imm (as opposed to an address reloc).
3291 // For Pre-inc LD/ST, the operand is shifted by one.
3292 assert((MI.getOperand(IsPreLdSt ? 2 : 1).isReg() ||
3293 MI.getOperand(IsPreLdSt ? 2 : 1).isFI()) &&
3294 "Expected a reg or frame index operand.");
3295
3296 // For Pre-indexed addressing quadword instructions, the third operand is the
3297 // immediate value.
3298 bool IsImmPreLdSt = IsPreLdSt && MI.getOperand(3).isImm();
3299
3300 if (!MI.getOperand(2).isImm() && !IsImmPreLdSt)
3301 return false;
3302
3303 // Can't merge/pair if the instruction modifies the base register.
3304 // e.g., ldr x0, [x0]
3305 // This case will never occur with an FI base.
3306 // However, if the instruction is an LDR<S,D,Q,W,X,SW>pre or
3307 // STR<S,D,Q,W,X>pre, it can be merged.
3308 // For example:
3309 // ldr q0, [x11, #32]!
3310 // ldr q1, [x11, #16]
3311 // to
3312 // ldp q0, q1, [x11, #32]!
3313 if (MI.getOperand(1).isReg() && !IsPreLdSt) {
3314 Register BaseReg = MI.getOperand(1).getReg();
3316 if (MI.modifiesRegister(BaseReg, TRI))
3317 return false;
3318 }
3319
3320 // Pairing SVE fills/spills is only valid for little-endian targets that
3321 // implement VLS 128.
3322 switch (MI.getOpcode()) {
3323 default:
3324 break;
3325 case AArch64::LDR_ZXI:
3326 case AArch64::STR_ZXI:
3327 if (!Subtarget.isLittleEndian() ||
3328 Subtarget.getSVEVectorSizeInBits() != 128)
3329 return false;
3330 }
3331
3332 // Check if this load/store has a hint to avoid pair formation.
3333 // MachineMemOperands hints are set by the AArch64StorePairSuppress pass.
3335 return false;
3336
3337 // Do not pair any callee-save store/reload instructions in the
3338 // prologue/epilogue if the CFI information encoded the operations as separate
3339 // instructions, as that will cause the size of the actual prologue to mismatch
3340 // with the prologue size recorded in the Windows CFI.
3341 const MCAsmInfo *MAI = MI.getMF()->getTarget().getMCAsmInfo();
3342 bool NeedsWinCFI = MAI->usesWindowsCFI() &&
3343 MI.getMF()->getFunction().needsUnwindTableEntry();
3344 if (NeedsWinCFI && (MI.getFlag(MachineInstr::FrameSetup) ||
3346 return false;
3347
3348 // On some CPUs quad load/store pairs are slower than two single load/stores.
3349 if (Subtarget.isPaired128Slow()) {
3350 switch (MI.getOpcode()) {
3351 default:
3352 break;
3353 case AArch64::LDURQi:
3354 case AArch64::STURQi:
3355 case AArch64::LDRQui:
3356 case AArch64::STRQui:
3357 return false;
3358 }
3359 }
3360
3361 return true;
3362}
3363
3366 int64_t &Offset, bool &OffsetIsScalable, LocationSize &Width,
3367 const TargetRegisterInfo *TRI) const {
3368 if (!LdSt.mayLoadOrStore())
3369 return false;
3370
3371 const MachineOperand *BaseOp;
3372 TypeSize WidthN(0, false);
3373 if (!getMemOperandWithOffsetWidth(LdSt, BaseOp, Offset, OffsetIsScalable,
3374 WidthN, TRI))
3375 return false;
3376 // The maximum vscale is 16 under AArch64, return the maximal extent for the
3377 // vector.
3378 Width = LocationSize::precise(WidthN);
3379 BaseOps.push_back(BaseOp);
3380 return true;
3381}
3382
3383std::optional<ExtAddrMode>
3385 const TargetRegisterInfo *TRI) const {
3386 const MachineOperand *Base; // Filled with the base operand of MI.
3387 int64_t Offset; // Filled with the offset of MI.
3388 bool OffsetIsScalable;
3389 if (!getMemOperandWithOffset(MemI, Base, Offset, OffsetIsScalable, TRI))
3390 return std::nullopt;
3391
3392 if (!Base->isReg())
3393 return std::nullopt;
3394 ExtAddrMode AM;
3395 AM.BaseReg = Base->getReg();
3396 AM.Displacement = Offset;
3397 AM.ScaledReg = 0;
3398 AM.Scale = 0;
3399 return AM;
3400}
3401
3403 Register Reg,
3404 const MachineInstr &AddrI,
3405 ExtAddrMode &AM) const {
3406 // Filter out instructions into which we cannot fold.
3407 unsigned NumBytes;
3408 int64_t OffsetScale = 1;
3409 switch (MemI.getOpcode()) {
3410 default:
3411 return false;
3412
3413 case AArch64::LDURQi:
3414 case AArch64::STURQi:
3415 NumBytes = 16;
3416 break;
3417
3418 case AArch64::LDURDi:
3419 case AArch64::STURDi:
3420 case AArch64::LDURXi:
3421 case AArch64::STURXi:
3422 NumBytes = 8;
3423 break;
3424
3425 case AArch64::LDURWi:
3426 case AArch64::LDURSWi:
3427 case AArch64::STURWi:
3428 NumBytes = 4;
3429 break;
3430
3431 case AArch64::LDURHi:
3432 case AArch64::STURHi:
3433 case AArch64::LDURHHi:
3434 case AArch64::STURHHi:
3435 case AArch64::LDURSHXi:
3436 case AArch64::LDURSHWi:
3437 NumBytes = 2;
3438 break;
3439
3440 case AArch64::LDRBroX:
3441 case AArch64::LDRBBroX:
3442 case AArch64::LDRSBXroX:
3443 case AArch64::LDRSBWroX:
3444 case AArch64::STRBroX:
3445 case AArch64::STRBBroX:
3446 case AArch64::LDURBi:
3447 case AArch64::LDURBBi:
3448 case AArch64::LDURSBXi:
3449 case AArch64::LDURSBWi:
3450 case AArch64::STURBi:
3451 case AArch64::STURBBi:
3452 case AArch64::LDRBui:
3453 case AArch64::LDRBBui:
3454 case AArch64::LDRSBXui:
3455 case AArch64::LDRSBWui:
3456 case AArch64::STRBui:
3457 case AArch64::STRBBui:
3458 NumBytes = 1;
3459 break;
3460
3461 case AArch64::LDRQroX:
3462 case AArch64::STRQroX:
3463 case AArch64::LDRQui:
3464 case AArch64::STRQui:
3465 NumBytes = 16;
3466 OffsetScale = 16;
3467 break;
3468
3469 case AArch64::LDRDroX:
3470 case AArch64::STRDroX:
3471 case AArch64::LDRXroX:
3472 case AArch64::STRXroX:
3473 case AArch64::LDRDui:
3474 case AArch64::STRDui:
3475 case AArch64::LDRXui:
3476 case AArch64::STRXui:
3477 NumBytes = 8;
3478 OffsetScale = 8;
3479 break;
3480
3481 case AArch64::LDRWroX:
3482 case AArch64::LDRSWroX:
3483 case AArch64::STRWroX:
3484 case AArch64::LDRWui:
3485 case AArch64::LDRSWui:
3486 case AArch64::STRWui:
3487 NumBytes = 4;
3488 OffsetScale = 4;
3489 break;
3490
3491 case AArch64::LDRHroX:
3492 case AArch64::STRHroX:
3493 case AArch64::LDRHHroX:
3494 case AArch64::STRHHroX:
3495 case AArch64::LDRSHXroX:
3496 case AArch64::LDRSHWroX:
3497 case AArch64::LDRHui:
3498 case AArch64::STRHui:
3499 case AArch64::LDRHHui:
3500 case AArch64::STRHHui:
3501 case AArch64::LDRSHXui:
3502 case AArch64::LDRSHWui:
3503 NumBytes = 2;
3504 OffsetScale = 2;
3505 break;
3506 }
3507
3508 // Check the fold operand is not the loaded/stored value.
3509 const MachineOperand &BaseRegOp = MemI.getOperand(0);
3510 if (BaseRegOp.isReg() && BaseRegOp.getReg() == Reg)
3511 return false;
3512
3513 // Handle memory instructions with a [Reg, Reg] addressing mode.
3514 if (MemI.getOperand(2).isReg()) {
3515 // Bail if the addressing mode already includes extension of the offset
3516 // register.
3517 if (MemI.getOperand(3).getImm())
3518 return false;
3519
3520 // Check if we actually have a scaled offset.
3521 if (MemI.getOperand(4).getImm() == 0)
3522 OffsetScale = 1;
3523
3524 // If the address instructions is folded into the base register, then the
3525 // addressing mode must not have a scale. Then we can swap the base and the
3526 // scaled registers.
3527 if (MemI.getOperand(1).getReg() == Reg && OffsetScale != 1)
3528 return false;
3529
3530 switch (AddrI.getOpcode()) {
3531 default:
3532 return false;
3533
3534 case AArch64::SBFMXri:
3535 // sxtw Xa, Wm
3536 // ldr Xd, [Xn, Xa, lsl #N]
3537 // ->
3538 // ldr Xd, [Xn, Wm, sxtw #N]
3539 if (AddrI.getOperand(2).getImm() != 0 ||
3540 AddrI.getOperand(3).getImm() != 31)
3541 return false;
3542
3543 AM.BaseReg = MemI.getOperand(1).getReg();
3544 if (AM.BaseReg == Reg)
3545 AM.BaseReg = MemI.getOperand(2).getReg();
3546 AM.ScaledReg = AddrI.getOperand(1).getReg();
3547 AM.Scale = OffsetScale;
3548 AM.Displacement = 0;
3550 return true;
3551
3552 case TargetOpcode::SUBREG_TO_REG: {
3553 // mov Wa, Wm
3554 // ldr Xd, [Xn, Xa, lsl #N]
3555 // ->
3556 // ldr Xd, [Xn, Wm, uxtw #N]
3557
3558 // Zero-extension looks like an ORRWrs followed by a SUBREG_TO_REG.
3559 if (AddrI.getOperand(2).getImm() != AArch64::sub_32)
3560 return false;
3561
3562 const MachineRegisterInfo &MRI = AddrI.getMF()->getRegInfo();
3563 Register OffsetReg = AddrI.getOperand(1).getReg();
3564 if (!OffsetReg.isVirtual() || !MRI.hasOneNonDBGUse(OffsetReg))
3565 return false;
3566
3567 const MachineInstr &DefMI = *MRI.getVRegDef(OffsetReg);
3568 if (DefMI.getOpcode() != AArch64::ORRWrs ||
3569 DefMI.getOperand(1).getReg() != AArch64::WZR ||
3570 DefMI.getOperand(3).getImm() != 0)
3571 return false;
3572
3573 AM.BaseReg = MemI.getOperand(1).getReg();
3574 if (AM.BaseReg == Reg)
3575 AM.BaseReg = MemI.getOperand(2).getReg();
3576 AM.ScaledReg = DefMI.getOperand(2).getReg();
3577 AM.Scale = OffsetScale;
3578 AM.Displacement = 0;
3580 return true;
3581 }
3582 }
3583 }
3584
3585 // Handle memory instructions with a [Reg, #Imm] addressing mode.
3586
3587 // Check we are not breaking a potential conversion to an LDP.
3588 auto validateOffsetForLDP = [](unsigned NumBytes, int64_t OldOffset,
3589 int64_t NewOffset) -> bool {
3590 int64_t MinOffset, MaxOffset;
3591 switch (NumBytes) {
3592 default:
3593 return true;
3594 case 4:
3595 MinOffset = -256;
3596 MaxOffset = 252;
3597 break;
3598 case 8:
3599 MinOffset = -512;
3600 MaxOffset = 504;
3601 break;
3602 case 16:
3603 MinOffset = -1024;
3604 MaxOffset = 1008;
3605 break;
3606 }
3607 return OldOffset < MinOffset || OldOffset > MaxOffset ||
3608 (NewOffset >= MinOffset && NewOffset <= MaxOffset);
3609 };
3610 auto canFoldAddSubImmIntoAddrMode = [&](int64_t Disp) -> bool {
3611 int64_t OldOffset = MemI.getOperand(2).getImm() * OffsetScale;
3612 int64_t NewOffset = OldOffset + Disp;
3613 if (!isLegalAddressingMode(NumBytes, NewOffset, /* Scale */ 0))
3614 return false;
3615 // If the old offset would fit into an LDP, but the new offset wouldn't,
3616 // bail out.
3617 if (!validateOffsetForLDP(NumBytes, OldOffset, NewOffset))
3618 return false;
3619 AM.BaseReg = AddrI.getOperand(1).getReg();
3620 AM.ScaledReg = 0;
3621 AM.Scale = 0;
3622 AM.Displacement = NewOffset;
3624 return true;
3625 };
3626
3627 auto canFoldAddRegIntoAddrMode =
3628 [&](int64_t Scale,
3630 if (MemI.getOperand(2).getImm() != 0)
3631 return false;
3632 if ((unsigned)Scale != Scale)
3633 return false;
3634 if (!isLegalAddressingMode(NumBytes, /* Offset */ 0, Scale))
3635 return false;
3636 AM.BaseReg = AddrI.getOperand(1).getReg();
3637 AM.ScaledReg = AddrI.getOperand(2).getReg();
3638 AM.Scale = Scale;
3639 AM.Displacement = 0;
3640 AM.Form = Form;
3641 return true;
3642 };
3643
3644 auto avoidSlowSTRQ = [&](const MachineInstr &MemI) {
3645 unsigned Opcode = MemI.getOpcode();
3646 return (Opcode == AArch64::STURQi || Opcode == AArch64::STRQui) &&
3647 Subtarget.isSTRQroSlow();
3648 };
3649
3650 int64_t Disp = 0;
3651 const bool OptSize = MemI.getMF()->getFunction().hasOptSize();
3652 switch (AddrI.getOpcode()) {
3653 default:
3654 return false;
3655
3656 case AArch64::ADDXri:
3657 // add Xa, Xn, #N
3658 // ldr Xd, [Xa, #M]
3659 // ->
3660 // ldr Xd, [Xn, #N'+M]
3661 Disp = AddrI.getOperand(2).getImm() << AddrI.getOperand(3).getImm();
3662 return canFoldAddSubImmIntoAddrMode(Disp);
3663
3664 case AArch64::SUBXri:
3665 // sub Xa, Xn, #N
3666 // ldr Xd, [Xa, #M]
3667 // ->
3668 // ldr Xd, [Xn, #N'+M]
3669 Disp = AddrI.getOperand(2).getImm() << AddrI.getOperand(3).getImm();
3670 return canFoldAddSubImmIntoAddrMode(-Disp);
3671
3672 case AArch64::ADDXrs: {
3673 // add Xa, Xn, Xm, lsl #N
3674 // ldr Xd, [Xa]
3675 // ->
3676 // ldr Xd, [Xn, Xm, lsl #N]
3677
3678 // Don't fold the add if the result would be slower, unless optimising for
3679 // size.
3680 unsigned Shift = static_cast<unsigned>(AddrI.getOperand(3).getImm());
3682 return false;
3683 Shift = AArch64_AM::getShiftValue(Shift);
3684 if (!OptSize) {
3685 if (Shift != 2 && Shift != 3 && Subtarget.hasAddrLSLSlow14())
3686 return false;
3687 if (avoidSlowSTRQ(MemI))
3688 return false;
3689 }
3690 return canFoldAddRegIntoAddrMode(1ULL << Shift);
3691 }
3692
3693 case AArch64::ADDXrr:
3694 // add Xa, Xn, Xm
3695 // ldr Xd, [Xa]
3696 // ->
3697 // ldr Xd, [Xn, Xm, lsl #0]
3698
3699 // Don't fold the add if the result would be slower, unless optimising for
3700 // size.
3701 if (!OptSize && avoidSlowSTRQ(MemI))
3702 return false;
3703 return canFoldAddRegIntoAddrMode(1);
3704
3705 case AArch64::ADDXrx:
3706 // add Xa, Xn, Wm, {s,u}xtw #N
3707 // ldr Xd, [Xa]
3708 // ->
3709 // ldr Xd, [Xn, Wm, {s,u}xtw #N]
3710
3711 // Don't fold the add if the result would be slower, unless optimising for
3712 // size.
3713 if (!OptSize && avoidSlowSTRQ(MemI))
3714 return false;
3715
3716 // Can fold only sign-/zero-extend of a word.
3717 unsigned Imm = static_cast<unsigned>(AddrI.getOperand(3).getImm());
3719 if (Extend != AArch64_AM::UXTW && Extend != AArch64_AM::SXTW)
3720 return false;
3721
3722 return canFoldAddRegIntoAddrMode(
3723 1ULL << AArch64_AM::getArithShiftValue(Imm),
3726 }
3727}
3728
3729// Given an opcode for an instruction with a [Reg, #Imm] addressing mode,
3730// return the opcode of an instruction performing the same operation, but using
3731// the [Reg, Reg] addressing mode.
3732static unsigned regOffsetOpcode(unsigned Opcode) {
3733 switch (Opcode) {
3734 default:
3735 llvm_unreachable("Address folding not implemented for instruction");
3736
3737 case AArch64::LDURQi:
3738 case AArch64::LDRQui:
3739 return AArch64::LDRQroX;
3740 case AArch64::STURQi:
3741 case AArch64::STRQui:
3742 return AArch64::STRQroX;
3743 case AArch64::LDURDi:
3744 case AArch64::LDRDui:
3745 return AArch64::LDRDroX;
3746 case AArch64::STURDi:
3747 case AArch64::STRDui:
3748 return AArch64::STRDroX;
3749 case AArch64::LDURXi:
3750 case AArch64::LDRXui:
3751 return AArch64::LDRXroX;
3752 case AArch64::STURXi:
3753 case AArch64::STRXui:
3754 return AArch64::STRXroX;
3755 case AArch64::LDURWi:
3756 case AArch64::LDRWui:
3757 return AArch64::LDRWroX;
3758 case AArch64::LDURSWi:
3759 case AArch64::LDRSWui:
3760 return AArch64::LDRSWroX;
3761 case AArch64::STURWi:
3762 case AArch64::STRWui:
3763 return AArch64::STRWroX;
3764 case AArch64::LDURHi:
3765 case AArch64::LDRHui:
3766 return AArch64::LDRHroX;
3767 case AArch64::STURHi:
3768 case AArch64::STRHui:
3769 return AArch64::STRHroX;
3770 case AArch64::LDURHHi:
3771 case AArch64::LDRHHui:
3772 return AArch64::LDRHHroX;
3773 case AArch64::STURHHi:
3774 case AArch64::STRHHui:
3775 return AArch64::STRHHroX;
3776 case AArch64::LDURSHXi:
3777 case AArch64::LDRSHXui:
3778 return AArch64::LDRSHXroX;
3779 case AArch64::LDURSHWi:
3780 case AArch64::LDRSHWui:
3781 return AArch64::LDRSHWroX;
3782 case AArch64::LDURBi:
3783 case AArch64::LDRBui:
3784 return AArch64::LDRBroX;
3785 case AArch64::LDURBBi:
3786 case AArch64::LDRBBui:
3787 return AArch64::LDRBBroX;
3788 case AArch64::LDURSBXi:
3789 case AArch64::LDRSBXui:
3790 return AArch64::LDRSBXroX;
3791 case AArch64::LDURSBWi:
3792 case AArch64::LDRSBWui:
3793 return AArch64::LDRSBWroX;
3794 case AArch64::STURBi:
3795 case AArch64::STRBui:
3796 return AArch64::STRBroX;
3797 case AArch64::STURBBi:
3798 case AArch64::STRBBui:
3799 return AArch64::STRBBroX;
3800 }
3801}
3802
3803// Given an opcode for an instruction with a [Reg, #Imm] addressing mode, return
3804// the opcode of an instruction performing the same operation, but using the
3805// [Reg, #Imm] addressing mode with scaled offset.
3806unsigned scaledOffsetOpcode(unsigned Opcode, unsigned &Scale) {
3807 switch (Opcode) {
3808 default:
3809 llvm_unreachable("Address folding not implemented for instruction");
3810
3811 case AArch64::LDURQi:
3812 Scale = 16;
3813 return AArch64::LDRQui;
3814 case AArch64::STURQi:
3815 Scale = 16;
3816 return AArch64::STRQui;
3817 case AArch64::LDURDi:
3818 Scale = 8;
3819 return AArch64::LDRDui;
3820 case AArch64::STURDi:
3821 Scale = 8;
3822 return AArch64::STRDui;
3823 case AArch64::LDURXi:
3824 Scale = 8;
3825 return AArch64::LDRXui;
3826 case AArch64::STURXi:
3827 Scale = 8;
3828 return AArch64::STRXui;
3829 case AArch64::LDURWi:
3830 Scale = 4;
3831 return AArch64::LDRWui;
3832 case AArch64::LDURSWi:
3833 Scale = 4;
3834 return AArch64::LDRSWui;
3835 case AArch64::STURWi:
3836 Scale = 4;
3837 return AArch64::STRWui;
3838 case AArch64::LDURHi:
3839 Scale = 2;
3840 return AArch64::LDRHui;
3841 case AArch64::STURHi:
3842 Scale = 2;
3843 return AArch64::STRHui;
3844 case AArch64::LDURHHi:
3845 Scale = 2;
3846 return AArch64::LDRHHui;
3847 case AArch64::STURHHi:
3848 Scale = 2;
3849 return AArch64::STRHHui;
3850 case AArch64::LDURSHXi:
3851 Scale = 2;
3852 return AArch64::LDRSHXui;
3853 case AArch64::LDURSHWi:
3854 Scale = 2;
3855 return AArch64::LDRSHWui;
3856 case AArch64::LDURBi:
3857 Scale = 1;
3858 return AArch64::LDRBui;
3859 case AArch64::LDURBBi:
3860 Scale = 1;
3861 return AArch64::LDRBBui;
3862 case AArch64::LDURSBXi:
3863 Scale = 1;
3864 return AArch64::LDRSBXui;
3865 case AArch64::LDURSBWi:
3866 Scale = 1;
3867 return AArch64::LDRSBWui;
3868 case AArch64::STURBi:
3869 Scale = 1;
3870 return AArch64::STRBui;
3871 case AArch64::STURBBi:
3872 Scale = 1;
3873 return AArch64::STRBBui;
3874 case AArch64::LDRQui:
3875 case AArch64::STRQui:
3876 Scale = 16;
3877 return Opcode;
3878 case AArch64::LDRDui:
3879 case AArch64::STRDui:
3880 case AArch64::LDRXui:
3881 case AArch64::STRXui:
3882 Scale = 8;
3883 return Opcode;
3884 case AArch64::LDRWui:
3885 case AArch64::LDRSWui:
3886 case AArch64::STRWui:
3887 Scale = 4;
3888 return Opcode;
3889 case AArch64::LDRHui:
3890 case AArch64::STRHui:
3891 case AArch64::LDRHHui:
3892 case AArch64::STRHHui:
3893 case AArch64::LDRSHXui:
3894 case AArch64::LDRSHWui:
3895 Scale = 2;
3896 return Opcode;
3897 case AArch64::LDRBui:
3898 case AArch64::LDRBBui:
3899 case AArch64::LDRSBXui:
3900 case AArch64::LDRSBWui:
3901 case AArch64::STRBui:
3902 case AArch64::STRBBui:
3903 Scale = 1;
3904 return Opcode;
3905 }
3906}
3907
3908// Given an opcode for an instruction with a [Reg, #Imm] addressing mode, return
3909// the opcode of an instruction performing the same operation, but using the
3910// [Reg, #Imm] addressing mode with unscaled offset.
3911unsigned unscaledOffsetOpcode(unsigned Opcode) {
3912 switch (Opcode) {
3913 default:
3914 llvm_unreachable("Address folding not implemented for instruction");
3915
3916 case AArch64::LDURQi:
3917 case AArch64::STURQi:
3918 case AArch64::LDURDi:
3919 case AArch64::STURDi:
3920 case AArch64::LDURXi:
3921 case AArch64::STURXi:
3922 case AArch64::LDURWi:
3923 case AArch64::LDURSWi:
3924 case AArch64::STURWi:
3925 case AArch64::LDURHi:
3926 case AArch64::STURHi:
3927 case AArch64::LDURHHi:
3928 case AArch64::STURHHi:
3929 case AArch64::LDURSHXi:
3930 case AArch64::LDURSHWi:
3931 case AArch64::LDURBi:
3932 case AArch64::STURBi:
3933 case AArch64::LDURBBi:
3934 case AArch64::STURBBi:
3935 case AArch64::LDURSBWi:
3936 case AArch64::LDURSBXi:
3937 return Opcode;
3938 case AArch64::LDRQui:
3939 return AArch64::LDURQi;
3940 case AArch64::STRQui:
3941 return AArch64::STURQi;
3942 case AArch64::LDRDui:
3943 return AArch64::LDURDi;
3944 case AArch64::STRDui:
3945 return AArch64::STURDi;
3946 case AArch64::LDRXui:
3947 return AArch64::LDURXi;
3948 case AArch64::STRXui:
3949 return AArch64::STURXi;
3950 case AArch64::LDRWui:
3951 return AArch64::LDURWi;
3952 case AArch64::LDRSWui:
3953 return AArch64::LDURSWi;
3954 case AArch64::STRWui:
3955 return AArch64::STURWi;
3956 case AArch64::LDRHui:
3957 return AArch64::LDURHi;
3958 case AArch64::STRHui:
3959 return AArch64::STURHi;
3960 case AArch64::LDRHHui:
3961 return AArch64::LDURHHi;
3962 case AArch64::STRHHui:
3963 return AArch64::STURHHi;
3964 case AArch64::LDRSHXui:
3965 return AArch64::LDURSHXi;
3966 case AArch64::LDRSHWui:
3967 return AArch64::LDURSHWi;
3968 case AArch64::LDRBBui:
3969 return AArch64::LDURBBi;
3970 case AArch64::LDRBui:
3971 return AArch64::LDURBi;
3972 case AArch64::STRBBui:
3973 return AArch64::STURBBi;
3974 case AArch64::STRBui:
3975 return AArch64::STURBi;
3976 case AArch64::LDRSBWui:
3977 return AArch64::LDURSBWi;
3978 case AArch64::LDRSBXui:
3979 return AArch64::LDURSBXi;
3980 }
3981}
3982
3983// Given the opcode of a memory load/store instruction, return the opcode of an
3984// instruction performing the same operation, but using
3985// the [Reg, Reg, {s,u}xtw #N] addressing mode with sign-/zero-extend of the
3986// offset register.
3987static unsigned offsetExtendOpcode(unsigned Opcode) {
3988 switch (Opcode) {
3989 default:
3990 llvm_unreachable("Address folding not implemented for instruction");
3991
3992 case AArch64::LDRQroX:
3993 case AArch64::LDURQi:
3994 case AArch64::LDRQui:
3995 return AArch64::LDRQroW;
3996 case AArch64::STRQroX:
3997 case AArch64::STURQi:
3998 case AArch64::STRQui:
3999 return AArch64::STRQroW;
4000 case AArch64::LDRDroX:
4001 case AArch64::LDURDi:
4002 case AArch64::LDRDui:
4003 return AArch64::LDRDroW;
4004 case AArch64::STRDroX:
4005 case AArch64::STURDi:
4006 case AArch64::STRDui:
4007 return AArch64::STRDroW;
4008 case AArch64::LDRXroX:
4009 case AArch64::LDURXi:
4010 case AArch64::LDRXui:
4011 return AArch64::LDRXroW;
4012 case AArch64::STRXroX:
4013 case AArch64::STURXi:
4014 case AArch64::STRXui:
4015 return AArch64::STRXroW;
4016 case AArch64::LDRWroX:
4017 case AArch64::LDURWi:
4018 case AArch64::LDRWui:
4019 return AArch64::LDRWroW;
4020 case AArch64::LDRSWroX:
4021 case AArch64::LDURSWi:
4022 case AArch64::LDRSWui:
4023 return AArch64::LDRSWroW;
4024 case AArch64::STRWroX:
4025 case AArch64::STURWi:
4026 case AArch64::STRWui:
4027 return AArch64::STRWroW;
4028 case AArch64::LDRHroX:
4029 case AArch64::LDURHi:
4030 case AArch64::LDRHui:
4031 return AArch64::LDRHroW;
4032 case AArch64::STRHroX:
4033 case AArch64::STURHi:
4034 case AArch64::STRHui:
4035 return AArch64::STRHroW;
4036 case AArch64::LDRHHroX:
4037 case AArch64::LDURHHi:
4038 case AArch64::LDRHHui:
4039 return AArch64::LDRHHroW;
4040 case AArch64::STRHHroX:
4041 case AArch64::STURHHi:
4042 case AArch64::STRHHui:
4043 return AArch64::STRHHroW;
4044 case AArch64::LDRSHXroX:
4045 case AArch64::LDURSHXi:
4046 case AArch64::LDRSHXui:
4047 return AArch64::LDRSHXroW;
4048 case AArch64::LDRSHWroX:
4049 case AArch64::LDURSHWi:
4050 case AArch64::LDRSHWui:
4051 return AArch64::LDRSHWroW;
4052 case AArch64::LDRBroX:
4053 case AArch64::LDURBi:
4054 case AArch64::LDRBui:
4055 return AArch64::LDRBroW;
4056 case AArch64::LDRBBroX:
4057 case AArch64::LDURBBi:
4058 case AArch64::LDRBBui:
4059 return AArch64::LDRBBroW;
4060 case AArch64::LDRSBXroX:
4061 case AArch64::LDURSBXi:
4062 case AArch64::LDRSBXui:
4063 return AArch64::LDRSBXroW;
4064 case AArch64::LDRSBWroX:
4065 case AArch64::LDURSBWi:
4066 case AArch64::LDRSBWui:
4067 return AArch64::LDRSBWroW;
4068 case AArch64::STRBroX:
4069 case AArch64::STURBi:
4070 case AArch64::STRBui:
4071 return AArch64::STRBroW;
4072 case AArch64::STRBBroX:
4073 case AArch64::STURBBi:
4074 case AArch64::STRBBui:
4075 return AArch64::STRBBroW;
4076 }
4077}
4078
4080 const ExtAddrMode &AM) const {
4081
4082 const DebugLoc &DL = MemI.getDebugLoc();
4083 MachineBasicBlock &MBB = *MemI.getParent();
4084 MachineRegisterInfo &MRI = MemI.getMF()->getRegInfo();
4085
4087 if (AM.ScaledReg) {
4088 // The new instruction will be in the form `ldr Rt, [Xn, Xm, lsl #imm]`.
4089 unsigned Opcode = regOffsetOpcode(MemI.getOpcode());
4090 MRI.constrainRegClass(AM.BaseReg, &AArch64::GPR64spRegClass);
4091 auto B = BuildMI(MBB, MemI, DL, get(Opcode))
4092 .addReg(MemI.getOperand(0).getReg(),
4093 getDefRegState(MemI.mayLoad()))
4094 .addReg(AM.BaseReg)
4095 .addReg(AM.ScaledReg)
4096 .addImm(0)
4097 .addImm(AM.Scale > 1)
4098 .setMemRefs(MemI.memoperands())
4099 .setMIFlags(MemI.getFlags());
4100 return B.getInstr();
4101 }
4102
4103 assert(AM.ScaledReg == 0 && AM.Scale == 0 &&
4104 "Addressing mode not supported for folding");
4105
4106 // The new instruction will be in the form `ld[u]r Rt, [Xn, #imm]`.
4107 unsigned Scale = 1;
4108 unsigned Opcode = MemI.getOpcode();
4109 if (isInt<9>(AM.Displacement))
4110 Opcode = unscaledOffsetOpcode(Opcode);
4111 else
4112 Opcode = scaledOffsetOpcode(Opcode, Scale);
4113
4114 auto B =
4115 BuildMI(MBB, MemI, DL, get(Opcode))
4116 .addReg(MemI.getOperand(0).getReg(), getDefRegState(MemI.mayLoad()))
4117 .addReg(AM.BaseReg)
4118 .addImm(AM.Displacement / Scale)
4119 .setMemRefs(MemI.memoperands())
4120 .setMIFlags(MemI.getFlags());
4121 return B.getInstr();
4122 }
4123
4126 // The new instruction will be in the form `ldr Rt, [Xn, Wm, {s,u}xtw #N]`.
4127 assert(AM.ScaledReg && !AM.Displacement &&
4128 "Address offset can be a register or an immediate, but not both");
4129 unsigned Opcode = offsetExtendOpcode(MemI.getOpcode());
4130 MRI.constrainRegClass(AM.BaseReg, &AArch64::GPR64spRegClass);
4131 // Make sure the offset register is in the correct register class.
4132 Register OffsetReg = AM.ScaledReg;
4133 const TargetRegisterClass *RC = MRI.getRegClass(OffsetReg);
4134 if (RC->hasSuperClassEq(&AArch64::GPR64RegClass)) {
4135 OffsetReg = MRI.createVirtualRegister(&AArch64::GPR32RegClass);
4136 BuildMI(MBB, MemI, DL, get(TargetOpcode::COPY), OffsetReg)
4137 .addReg(AM.ScaledReg, {}, AArch64::sub_32);
4138 }
4139 auto B =
4140 BuildMI(MBB, MemI, DL, get(Opcode))
4141 .addReg(MemI.getOperand(0).getReg(), getDefRegState(MemI.mayLoad()))
4142 .addReg(AM.BaseReg)
4143 .addReg(OffsetReg)
4145 .addImm(AM.Scale != 1)
4146 .setMemRefs(MemI.memoperands())
4147 .setMIFlags(MemI.getFlags());
4148
4149 return B.getInstr();
4150 }
4151
4153 "Function must not be called with an addressing mode it can't handle");
4154}
4155
/// Return true if the opcode is a post-index ld/st instruction, which really
/// loads from base+0: the base register is only updated *after* the access,
/// so the memory operand itself carries no offset (callers report Offset = 0).
static bool isPostIndexLdStOpcode(unsigned Opcode) {
  switch (Opcode) {
  default:
    return false;
  // All post-indexed loads/stores, in alphabetical order: NEON structure
  // loads/stores (LD1-LD4 / ST1-ST4, including replicating 'R' and per-lane
  // 'i' forms), scalar GPR/FPR loads/stores, load/store pairs, and MTE tag
  // stores (STG/STZG/ST2G/STZ2G/STGP).
  case AArch64::LD1Fourv16b_POST:
  case AArch64::LD1Fourv1d_POST:
  case AArch64::LD1Fourv2d_POST:
  case AArch64::LD1Fourv2s_POST:
  case AArch64::LD1Fourv4h_POST:
  case AArch64::LD1Fourv4s_POST:
  case AArch64::LD1Fourv8b_POST:
  case AArch64::LD1Fourv8h_POST:
  case AArch64::LD1Onev16b_POST:
  case AArch64::LD1Onev1d_POST:
  case AArch64::LD1Onev2d_POST:
  case AArch64::LD1Onev2s_POST:
  case AArch64::LD1Onev4h_POST:
  case AArch64::LD1Onev4s_POST:
  case AArch64::LD1Onev8b_POST:
  case AArch64::LD1Onev8h_POST:
  case AArch64::LD1Rv16b_POST:
  case AArch64::LD1Rv1d_POST:
  case AArch64::LD1Rv2d_POST:
  case AArch64::LD1Rv2s_POST:
  case AArch64::LD1Rv4h_POST:
  case AArch64::LD1Rv4s_POST:
  case AArch64::LD1Rv8b_POST:
  case AArch64::LD1Rv8h_POST:
  case AArch64::LD1Threev16b_POST:
  case AArch64::LD1Threev1d_POST:
  case AArch64::LD1Threev2d_POST:
  case AArch64::LD1Threev2s_POST:
  case AArch64::LD1Threev4h_POST:
  case AArch64::LD1Threev4s_POST:
  case AArch64::LD1Threev8b_POST:
  case AArch64::LD1Threev8h_POST:
  case AArch64::LD1Twov16b_POST:
  case AArch64::LD1Twov1d_POST:
  case AArch64::LD1Twov2d_POST:
  case AArch64::LD1Twov2s_POST:
  case AArch64::LD1Twov4h_POST:
  case AArch64::LD1Twov4s_POST:
  case AArch64::LD1Twov8b_POST:
  case AArch64::LD1Twov8h_POST:
  case AArch64::LD1i16_POST:
  case AArch64::LD1i32_POST:
  case AArch64::LD1i64_POST:
  case AArch64::LD1i8_POST:
  case AArch64::LD2Rv16b_POST:
  case AArch64::LD2Rv1d_POST:
  case AArch64::LD2Rv2d_POST:
  case AArch64::LD2Rv2s_POST:
  case AArch64::LD2Rv4h_POST:
  case AArch64::LD2Rv4s_POST:
  case AArch64::LD2Rv8b_POST:
  case AArch64::LD2Rv8h_POST:
  case AArch64::LD2Twov16b_POST:
  case AArch64::LD2Twov2d_POST:
  case AArch64::LD2Twov2s_POST:
  case AArch64::LD2Twov4h_POST:
  case AArch64::LD2Twov4s_POST:
  case AArch64::LD2Twov8b_POST:
  case AArch64::LD2Twov8h_POST:
  case AArch64::LD2i16_POST:
  case AArch64::LD2i32_POST:
  case AArch64::LD2i64_POST:
  case AArch64::LD2i8_POST:
  case AArch64::LD3Rv16b_POST:
  case AArch64::LD3Rv1d_POST:
  case AArch64::LD3Rv2d_POST:
  case AArch64::LD3Rv2s_POST:
  case AArch64::LD3Rv4h_POST:
  case AArch64::LD3Rv4s_POST:
  case AArch64::LD3Rv8b_POST:
  case AArch64::LD3Rv8h_POST:
  case AArch64::LD3Threev16b_POST:
  case AArch64::LD3Threev2d_POST:
  case AArch64::LD3Threev2s_POST:
  case AArch64::LD3Threev4h_POST:
  case AArch64::LD3Threev4s_POST:
  case AArch64::LD3Threev8b_POST:
  case AArch64::LD3Threev8h_POST:
  case AArch64::LD3i16_POST:
  case AArch64::LD3i32_POST:
  case AArch64::LD3i64_POST:
  case AArch64::LD3i8_POST:
  case AArch64::LD4Fourv16b_POST:
  case AArch64::LD4Fourv2d_POST:
  case AArch64::LD4Fourv2s_POST:
  case AArch64::LD4Fourv4h_POST:
  case AArch64::LD4Fourv4s_POST:
  case AArch64::LD4Fourv8b_POST:
  case AArch64::LD4Fourv8h_POST:
  case AArch64::LD4Rv16b_POST:
  case AArch64::LD4Rv1d_POST:
  case AArch64::LD4Rv2d_POST:
  case AArch64::LD4Rv2s_POST:
  case AArch64::LD4Rv4h_POST:
  case AArch64::LD4Rv4s_POST:
  case AArch64::LD4Rv8b_POST:
  case AArch64::LD4Rv8h_POST:
  case AArch64::LD4i16_POST:
  case AArch64::LD4i32_POST:
  case AArch64::LD4i64_POST:
  case AArch64::LD4i8_POST:
  case AArch64::LDAPRWpost:
  case AArch64::LDAPRXpost:
  case AArch64::LDIAPPWpost:
  case AArch64::LDIAPPXpost:
  case AArch64::LDPDpost:
  case AArch64::LDPQpost:
  case AArch64::LDPSWpost:
  case AArch64::LDPSpost:
  case AArch64::LDPWpost:
  case AArch64::LDPXpost:
  case AArch64::LDRBBpost:
  case AArch64::LDRBpost:
  case AArch64::LDRDpost:
  case AArch64::LDRHHpost:
  case AArch64::LDRHpost:
  case AArch64::LDRQpost:
  case AArch64::LDRSBWpost:
  case AArch64::LDRSBXpost:
  case AArch64::LDRSHWpost:
  case AArch64::LDRSHXpost:
  case AArch64::LDRSWpost:
  case AArch64::LDRSpost:
  case AArch64::LDRWpost:
  case AArch64::LDRXpost:
  case AArch64::ST1Fourv16b_POST:
  case AArch64::ST1Fourv1d_POST:
  case AArch64::ST1Fourv2d_POST:
  case AArch64::ST1Fourv2s_POST:
  case AArch64::ST1Fourv4h_POST:
  case AArch64::ST1Fourv4s_POST:
  case AArch64::ST1Fourv8b_POST:
  case AArch64::ST1Fourv8h_POST:
  case AArch64::ST1Onev16b_POST:
  case AArch64::ST1Onev1d_POST:
  case AArch64::ST1Onev2d_POST:
  case AArch64::ST1Onev2s_POST:
  case AArch64::ST1Onev4h_POST:
  case AArch64::ST1Onev4s_POST:
  case AArch64::ST1Onev8b_POST:
  case AArch64::ST1Onev8h_POST:
  case AArch64::ST1Threev16b_POST:
  case AArch64::ST1Threev1d_POST:
  case AArch64::ST1Threev2d_POST:
  case AArch64::ST1Threev2s_POST:
  case AArch64::ST1Threev4h_POST:
  case AArch64::ST1Threev4s_POST:
  case AArch64::ST1Threev8b_POST:
  case AArch64::ST1Threev8h_POST:
  case AArch64::ST1Twov16b_POST:
  case AArch64::ST1Twov1d_POST:
  case AArch64::ST1Twov2d_POST:
  case AArch64::ST1Twov2s_POST:
  case AArch64::ST1Twov4h_POST:
  case AArch64::ST1Twov4s_POST:
  case AArch64::ST1Twov8b_POST:
  case AArch64::ST1Twov8h_POST:
  case AArch64::ST1i16_POST:
  case AArch64::ST1i32_POST:
  case AArch64::ST1i64_POST:
  case AArch64::ST1i8_POST:
  case AArch64::ST2GPostIndex:
  case AArch64::ST2Twov16b_POST:
  case AArch64::ST2Twov2d_POST:
  case AArch64::ST2Twov2s_POST:
  case AArch64::ST2Twov4h_POST:
  case AArch64::ST2Twov4s_POST:
  case AArch64::ST2Twov8b_POST:
  case AArch64::ST2Twov8h_POST:
  case AArch64::ST2i16_POST:
  case AArch64::ST2i32_POST:
  case AArch64::ST2i64_POST:
  case AArch64::ST2i8_POST:
  case AArch64::ST3Threev16b_POST:
  case AArch64::ST3Threev2d_POST:
  case AArch64::ST3Threev2s_POST:
  case AArch64::ST3Threev4h_POST:
  case AArch64::ST3Threev4s_POST:
  case AArch64::ST3Threev8b_POST:
  case AArch64::ST3Threev8h_POST:
  case AArch64::ST3i16_POST:
  case AArch64::ST3i32_POST:
  case AArch64::ST3i64_POST:
  case AArch64::ST3i8_POST:
  case AArch64::ST4Fourv16b_POST:
  case AArch64::ST4Fourv2d_POST:
  case AArch64::ST4Fourv2s_POST:
  case AArch64::ST4Fourv4h_POST:
  case AArch64::ST4Fourv4s_POST:
  case AArch64::ST4Fourv8b_POST:
  case AArch64::ST4Fourv8h_POST:
  case AArch64::ST4i16_POST:
  case AArch64::ST4i32_POST:
  case AArch64::ST4i64_POST:
  case AArch64::ST4i8_POST:
  case AArch64::STGPostIndex:
  case AArch64::STGPpost:
  case AArch64::STPDpost:
  case AArch64::STPQpost:
  case AArch64::STPSpost:
  case AArch64::STPWpost:
  case AArch64::STPXpost:
  case AArch64::STRBBpost:
  case AArch64::STRBpost:
  case AArch64::STRDpost:
  case AArch64::STRHHpost:
  case AArch64::STRHpost:
  case AArch64::STRQpost:
  case AArch64::STRSpost:
  case AArch64::STRWpost:
  case AArch64::STRXpost:
  case AArch64::STZ2GPostIndex:
  case AArch64::STZGPostIndex:
    return true;
  }
}
4378
    const MachineInstr &LdSt, const MachineOperand *&BaseOp, int64_t &Offset,
    bool &OffsetIsScalable, TypeSize &Width,
    const TargetRegisterInfo *TRI) const {
  assert(LdSt.mayLoadOrStore() && "Expected a memory operation.");
  // Handle only loads/stores with base register followed by immediate offset.
  if (LdSt.getNumExplicitOperands() == 3) {
    // Non-paired instruction (e.g., ldr x1, [x0, #8]).
    // Operand 1 must be the base (register or frame index), operand 2 the
    // immediate offset.
    if ((!LdSt.getOperand(1).isReg() && !LdSt.getOperand(1).isFI()) ||
        !LdSt.getOperand(2).isImm())
      return false;
  } else if (LdSt.getNumExplicitOperands() == 4) {
    // Paired instruction (e.g., ldp x1, x2, [x0, #8]).
    // Operands 0/1 are the two data registers, operand 2 the base, operand 3
    // the immediate offset.
    if (!LdSt.getOperand(1).isReg() ||
        (!LdSt.getOperand(2).isReg() && !LdSt.getOperand(2).isFI()) ||
        !LdSt.getOperand(3).isImm())
      return false;
  } else
    return false;

  // Get the scaling factor for the instruction and set the width for the
  // instruction.
  TypeSize Scale(0U, false);
  int64_t Dummy1, Dummy2;

  // If this returns false, then it's an instruction we don't want to handle.
  if (!getMemOpInfo(LdSt.getOpcode(), Scale, Width, Dummy1, Dummy2))
    return false;

  // Compute the offset. Offset is calculated as the immediate operand
  // multiplied by the scaling factor. Unscaled instructions have scaling factor
  // set to 1. Postindex are a special case which have an offset of 0.
  if (isPostIndexLdStOpcode(LdSt.getOpcode())) {
    BaseOp = &LdSt.getOperand(2);
    Offset = 0;
  } else if (LdSt.getNumExplicitOperands() == 3) {
    BaseOp = &LdSt.getOperand(1);
    Offset = LdSt.getOperand(2).getImm() * Scale.getKnownMinValue();
  } else {
    assert(LdSt.getNumExplicitOperands() == 4 && "invalid number of operands");
    BaseOp = &LdSt.getOperand(2);
    Offset = LdSt.getOperand(3).getImm() * Scale.getKnownMinValue();
  }
  OffsetIsScalable = Scale.isScalable();

  // Only register or frame-index bases are useful to callers.
  return BaseOp->isReg() || BaseOp->isFI();
}
4426
  assert(LdSt.mayLoadOrStore() && "Expected a memory operation.");
  // The immediate offset is always the last explicit operand of the ld/st
  // forms handled here.
  MachineOperand &OfsOp = LdSt.getOperand(LdSt.getNumExplicitOperands() - 1);
  assert(OfsOp.isImm() && "Offset operand wasn't immediate.");
  return OfsOp;
}
4434
/// Fill in \p Scale (units of the immediate operand), \p Width (bytes
/// accessed; scalable for SVE forms) and the legal immediate range
/// [\p MinOffset, \p MaxOffset] (in Scale units) for \p Opcode.
/// Returns false — with all outputs zeroed — for opcodes not modelled here.
bool AArch64InstrInfo::getMemOpInfo(unsigned Opcode, TypeSize &Scale,
                                    TypeSize &Width, int64_t &MinOffset,
                                    int64_t &MaxOffset) {
  switch (Opcode) {
  // Not a memory operation or something we want to handle.
  default:
    Scale = TypeSize::getFixed(0);
    Width = TypeSize::getFixed(0);
    MinOffset = MaxOffset = 0;
    return false;
  // LDR / STR (scaled unsigned 12-bit immediate)
  case AArch64::LDRQui:
  case AArch64::STRQui:
    Scale = TypeSize::getFixed(16);
    Width = TypeSize::getFixed(16);
    MinOffset = 0;
    MaxOffset = 4095;
    break;
  case AArch64::LDRXui:
  case AArch64::LDRDui:
  case AArch64::STRXui:
  case AArch64::STRDui:
  case AArch64::PRFMui:
    Scale = TypeSize::getFixed(8);
    Width = TypeSize::getFixed(8);
    MinOffset = 0;
    MaxOffset = 4095;
    break;
  case AArch64::LDRWui:
  case AArch64::LDRSui:
  case AArch64::LDRSWui:
  case AArch64::STRWui:
  case AArch64::STRSui:
    Scale = TypeSize::getFixed(4);
    Width = TypeSize::getFixed(4);
    MinOffset = 0;
    MaxOffset = 4095;
    break;
  case AArch64::LDRHui:
  case AArch64::LDRHHui:
  case AArch64::LDRSHWui:
  case AArch64::LDRSHXui:
  case AArch64::STRHui:
  case AArch64::STRHHui:
    Scale = TypeSize::getFixed(2);
    Width = TypeSize::getFixed(2);
    MinOffset = 0;
    MaxOffset = 4095;
    break;
  case AArch64::LDRBui:
  case AArch64::LDRBBui:
  case AArch64::LDRSBWui:
  case AArch64::LDRSBXui:
  case AArch64::STRBui:
  case AArch64::STRBBui:
    Scale = TypeSize::getFixed(1);
    Width = TypeSize::getFixed(1);
    MinOffset = 0;
    MaxOffset = 4095;
    break;
  // post/pre inc (unscaled signed 9-bit immediate)
  case AArch64::STRQpre:
  case AArch64::LDRQpost:
    Scale = TypeSize::getFixed(1);
    Width = TypeSize::getFixed(16);
    MinOffset = -256;
    MaxOffset = 255;
    break;
  case AArch64::LDRDpost:
  case AArch64::LDRDpre:
  case AArch64::LDRXpost:
  case AArch64::LDRXpre:
  case AArch64::STRDpost:
  case AArch64::STRDpre:
  case AArch64::STRXpost:
  case AArch64::STRXpre:
    Scale = TypeSize::getFixed(1);
    Width = TypeSize::getFixed(8);
    MinOffset = -256;
    MaxOffset = 255;
    break;
  case AArch64::STRWpost:
  case AArch64::STRWpre:
  case AArch64::LDRWpost:
  case AArch64::LDRWpre:
  case AArch64::STRSpost:
  case AArch64::STRSpre:
  case AArch64::LDRSpost:
  case AArch64::LDRSpre:
    Scale = TypeSize::getFixed(1);
    Width = TypeSize::getFixed(4);
    MinOffset = -256;
    MaxOffset = 255;
    break;
  case AArch64::LDRHpost:
  case AArch64::LDRHpre:
  case AArch64::STRHpost:
  case AArch64::STRHpre:
  case AArch64::LDRHHpost:
  case AArch64::LDRHHpre:
  case AArch64::STRHHpost:
  case AArch64::STRHHpre:
    Scale = TypeSize::getFixed(1);
    Width = TypeSize::getFixed(2);
    MinOffset = -256;
    MaxOffset = 255;
    break;
  case AArch64::LDRBpost:
  case AArch64::LDRBpre:
  case AArch64::STRBpost:
  case AArch64::STRBpre:
  case AArch64::LDRBBpost:
  case AArch64::LDRBBpre:
  case AArch64::STRBBpost:
  case AArch64::STRBBpre:
    Scale = TypeSize::getFixed(1);
    Width = TypeSize::getFixed(1);
    MinOffset = -256;
    MaxOffset = 255;
    break;
  // Unscaled (signed 9-bit immediate)
  case AArch64::LDURQi:
  case AArch64::STURQi:
    Scale = TypeSize::getFixed(1);
    Width = TypeSize::getFixed(16);
    MinOffset = -256;
    MaxOffset = 255;
    break;
  case AArch64::LDURXi:
  case AArch64::LDURDi:
  case AArch64::LDAPURXi:
  case AArch64::STURXi:
  case AArch64::STURDi:
  case AArch64::STLURXi:
  case AArch64::PRFUMi:
    Scale = TypeSize::getFixed(1);
    Width = TypeSize::getFixed(8);
    MinOffset = -256;
    MaxOffset = 255;
    break;
  case AArch64::LDURWi:
  case AArch64::LDURSi:
  case AArch64::LDURSWi:
  case AArch64::LDAPURi:
  case AArch64::LDAPURSWi:
  case AArch64::STURWi:
  case AArch64::STURSi:
  case AArch64::STLURWi:
    Scale = TypeSize::getFixed(1);
    Width = TypeSize::getFixed(4);
    MinOffset = -256;
    MaxOffset = 255;
    break;
  case AArch64::LDURHi:
  case AArch64::LDURHHi:
  case AArch64::LDURSHXi:
  case AArch64::LDURSHWi:
  case AArch64::LDAPURHi:
  case AArch64::LDAPURSHWi:
  case AArch64::LDAPURSHXi:
  case AArch64::STURHi:
  case AArch64::STURHHi:
  case AArch64::STLURHi:
    Scale = TypeSize::getFixed(1);
    Width = TypeSize::getFixed(2);
    MinOffset = -256;
    MaxOffset = 255;
    break;
  case AArch64::LDURBi:
  case AArch64::LDURBBi:
  case AArch64::LDURSBXi:
  case AArch64::LDURSBWi:
  case AArch64::LDAPURBi:
  case AArch64::LDAPURSBWi:
  case AArch64::LDAPURSBXi:
  case AArch64::STURBi:
  case AArch64::STURBBi:
  case AArch64::STLURBi:
    Scale = TypeSize::getFixed(1);
    Width = TypeSize::getFixed(1);
    MinOffset = -256;
    MaxOffset = 255;
    break;
  // LDP / STP (including pre/post inc): signed 7-bit scaled immediate.
  case AArch64::LDPQi:
  case AArch64::LDNPQi:
  case AArch64::STPQi:
  case AArch64::STNPQi:
  case AArch64::LDPQpost:
  case AArch64::LDPQpre:
  case AArch64::STPQpost:
  case AArch64::STPQpre:
    Scale = TypeSize::getFixed(16);
    Width = TypeSize::getFixed(16 * 2);
    MinOffset = -64;
    MaxOffset = 63;
    break;
  case AArch64::LDPXi:
  case AArch64::LDPDi:
  case AArch64::LDNPXi:
  case AArch64::LDNPDi:
  case AArch64::STPXi:
  case AArch64::STPDi:
  case AArch64::STNPXi:
  case AArch64::STNPDi:
  case AArch64::LDPDpost:
  case AArch64::LDPDpre:
  case AArch64::LDPXpost:
  case AArch64::LDPXpre:
  case AArch64::STPDpost:
  case AArch64::STPDpre:
  case AArch64::STPXpost:
  case AArch64::STPXpre:
    Scale = TypeSize::getFixed(8);
    Width = TypeSize::getFixed(8 * 2);
    MinOffset = -64;
    MaxOffset = 63;
    break;
  case AArch64::LDPWi:
  case AArch64::LDPSi:
  case AArch64::LDNPWi:
  case AArch64::LDNPSi:
  case AArch64::STPWi:
  case AArch64::STPSi:
  case AArch64::STNPWi:
  case AArch64::STNPSi:
  case AArch64::LDPSpost:
  case AArch64::LDPSpre:
  case AArch64::LDPWpost:
  case AArch64::LDPWpre:
  case AArch64::STPSpost:
  case AArch64::STPSpre:
  case AArch64::STPWpost:
  case AArch64::STPWpre:
    Scale = TypeSize::getFixed(4);
    Width = TypeSize::getFixed(4 * 2);
    MinOffset = -64;
    MaxOffset = 63;
    break;
  case AArch64::StoreSwiftAsyncContext:
    // Store is an STRXui, but there might be an ADDXri in the expansion too.
    Scale = TypeSize::getFixed(1);
    Width = TypeSize::getFixed(8);
    MinOffset = 0;
    MaxOffset = 4095;
    break;
  // MTE tag-address arithmetic: no memory access, hence Width = 0.
  case AArch64::ADDG:
    Scale = TypeSize::getFixed(16);
    Width = TypeSize::getFixed(0);
    MinOffset = 0;
    MaxOffset = 63;
    break;
  case AArch64::TAGPstack:
    Scale = TypeSize::getFixed(16);
    Width = TypeSize::getFixed(0);
    // TAGP with a negative offset turns into SUBP, which has a maximum offset
    // of 63 (not 64!).
    MinOffset = -63;
    MaxOffset = 63;
    break;
  // MTE tag granule loads/stores.
  case AArch64::LDG:
  case AArch64::STGi:
  case AArch64::STGPreIndex:
  case AArch64::STGPostIndex:
  case AArch64::STZGi:
  case AArch64::STZGPreIndex:
  case AArch64::STZGPostIndex:
    Scale = TypeSize::getFixed(16);
    Width = TypeSize::getFixed(16);
    MinOffset = -256;
    MaxOffset = 255;
    break;
  // SVE
  case AArch64::STR_ZZZZXI:
  case AArch64::STR_ZZZZXI_STRIDED_CONTIGUOUS:
  case AArch64::LDR_ZZZZXI:
  case AArch64::LDR_ZZZZXI_STRIDED_CONTIGUOUS:
    Scale = TypeSize::getScalable(16);
    Width = TypeSize::getScalable(16 * 4);
    MinOffset = -256;
    MaxOffset = 252;
    break;
  case AArch64::STR_ZZZXI:
  case AArch64::LDR_ZZZXI:
    Scale = TypeSize::getScalable(16);
    Width = TypeSize::getScalable(16 * 3);
    MinOffset = -256;
    MaxOffset = 253;
    break;
  case AArch64::STR_ZZXI:
  case AArch64::STR_ZZXI_STRIDED_CONTIGUOUS:
  case AArch64::LDR_ZZXI:
  case AArch64::LDR_ZZXI_STRIDED_CONTIGUOUS:
    Scale = TypeSize::getScalable(16);
    Width = TypeSize::getScalable(16 * 2);
    MinOffset = -256;
    MaxOffset = 254;
    break;
  case AArch64::LDR_PXI:
  case AArch64::STR_PXI:
    Scale = TypeSize::getScalable(2);
    Width = TypeSize::getScalable(2);
    MinOffset = -256;
    MaxOffset = 255;
    break;
  case AArch64::LDR_PPXI:
  case AArch64::STR_PPXI:
    Scale = TypeSize::getScalable(2);
    Width = TypeSize::getScalable(2 * 2);
    MinOffset = -256;
    MaxOffset = 254;
    break;
  case AArch64::LDR_ZXI:
  case AArch64::STR_ZXI:
    Scale = TypeSize::getScalable(16);
    Width = TypeSize::getScalable(16);
    MinOffset = -256;
    MaxOffset = 255;
    break;
  case AArch64::LD1B_IMM:
  case AArch64::LD1H_IMM:
  case AArch64::LD1W_IMM:
  case AArch64::LD1D_IMM:
  case AArch64::LDNT1B_ZRI:
  case AArch64::LDNT1H_ZRI:
  case AArch64::LDNT1W_ZRI:
  case AArch64::LDNT1D_ZRI:
  case AArch64::ST1B_IMM:
  case AArch64::ST1H_IMM:
  case AArch64::ST1W_IMM:
  case AArch64::ST1D_IMM:
  case AArch64::STNT1B_ZRI:
  case AArch64::STNT1H_ZRI:
  case AArch64::STNT1W_ZRI:
  case AArch64::STNT1D_ZRI:
  case AArch64::LDNF1B_IMM:
  case AArch64::LDNF1H_IMM:
  case AArch64::LDNF1W_IMM:
  case AArch64::LDNF1D_IMM:
    // A full vectors worth of data
    // Width = mbytes * elements
    Scale = TypeSize::getScalable(16);
    Width = TypeSize::getScalable(16);
    MinOffset = -8;
    MaxOffset = 7;
    break;
  case AArch64::LD2B_IMM:
  case AArch64::LD2H_IMM:
  case AArch64::LD2W_IMM:
  case AArch64::LD2D_IMM:
  case AArch64::ST2B_IMM:
  case AArch64::ST2H_IMM:
  case AArch64::ST2W_IMM:
  case AArch64::ST2D_IMM:
    Scale = TypeSize::getScalable(32);
    Width = TypeSize::getScalable(16 * 2);
    MinOffset = -8;
    MaxOffset = 7;
    break;
  case AArch64::LD3B_IMM:
  case AArch64::LD3H_IMM:
  case AArch64::LD3W_IMM:
  case AArch64::LD3D_IMM:
  case AArch64::ST3B_IMM:
  case AArch64::ST3H_IMM:
  case AArch64::ST3W_IMM:
  case AArch64::ST3D_IMM:
    Scale = TypeSize::getScalable(48);
    Width = TypeSize::getScalable(16 * 3);
    MinOffset = -8;
    MaxOffset = 7;
    break;
  case AArch64::LD4B_IMM:
  case AArch64::LD4H_IMM:
  case AArch64::LD4W_IMM:
  case AArch64::LD4D_IMM:
  case AArch64::ST4B_IMM:
  case AArch64::ST4H_IMM:
  case AArch64::ST4W_IMM:
  case AArch64::ST4D_IMM:
    Scale = TypeSize::getScalable(64);
    Width = TypeSize::getScalable(16 * 4);
    MinOffset = -8;
    MaxOffset = 7;
    break;
  case AArch64::LD1B_H_IMM:
  case AArch64::LD1SB_H_IMM:
  case AArch64::LD1H_S_IMM:
  case AArch64::LD1SH_S_IMM:
  case AArch64::LD1W_D_IMM:
  case AArch64::LD1SW_D_IMM:
  case AArch64::ST1B_H_IMM:
  case AArch64::ST1H_S_IMM:
  case AArch64::ST1W_D_IMM:
  case AArch64::LDNF1B_H_IMM:
  case AArch64::LDNF1SB_H_IMM:
  case AArch64::LDNF1H_S_IMM:
  case AArch64::LDNF1SH_S_IMM:
  case AArch64::LDNF1W_D_IMM:
  case AArch64::LDNF1SW_D_IMM:
    // A half vector worth of data
    // Width = mbytes * elements
    Scale = TypeSize::getScalable(8);
    Width = TypeSize::getScalable(8);
    MinOffset = -8;
    MaxOffset = 7;
    break;
  case AArch64::LD1B_S_IMM:
  case AArch64::LD1SB_S_IMM:
  case AArch64::LD1H_D_IMM:
  case AArch64::LD1SH_D_IMM:
  case AArch64::ST1B_S_IMM:
  case AArch64::ST1H_D_IMM:
  case AArch64::LDNF1B_S_IMM:
  case AArch64::LDNF1SB_S_IMM:
  case AArch64::LDNF1H_D_IMM:
  case AArch64::LDNF1SH_D_IMM:
    // A quarter vector worth of data
    // Width = mbytes * elements
    Scale = TypeSize::getScalable(4);
    Width = TypeSize::getScalable(4);
    MinOffset = -8;
    MaxOffset = 7;
    break;
  case AArch64::LD1B_D_IMM:
  case AArch64::LD1SB_D_IMM:
  case AArch64::ST1B_D_IMM:
  case AArch64::LDNF1B_D_IMM:
  case AArch64::LDNF1SB_D_IMM:
    // A eighth vector worth of data
    // Width = mbytes * elements
    Scale = TypeSize::getScalable(2);
    Width = TypeSize::getScalable(2);
    MinOffset = -8;
    MaxOffset = 7;
    break;
  case AArch64::ST2Gi:
  case AArch64::ST2GPreIndex:
  case AArch64::ST2GPostIndex:
  case AArch64::STZ2Gi:
  case AArch64::STZ2GPreIndex:
  case AArch64::STZ2GPostIndex:
    Scale = TypeSize::getFixed(16);
    Width = TypeSize::getFixed(32);
    MinOffset = -256;
    MaxOffset = 255;
    break;
  case AArch64::STGPi:
  case AArch64::STGPpost:
  case AArch64::STGPpre:
    Scale = TypeSize::getFixed(16);
    Width = TypeSize::getFixed(16);
    MinOffset = -64;
    MaxOffset = 63;
    break;
  // SVE load-and-replicate (scaled unsigned 6-bit immediate).
  case AArch64::LD1RB_IMM:
  case AArch64::LD1RB_H_IMM:
  case AArch64::LD1RB_S_IMM:
  case AArch64::LD1RB_D_IMM:
  case AArch64::LD1RSB_H_IMM:
  case AArch64::LD1RSB_S_IMM:
  case AArch64::LD1RSB_D_IMM:
    Scale = TypeSize::getFixed(1);
    Width = TypeSize::getFixed(1);
    MinOffset = 0;
    MaxOffset = 63;
    break;
  case AArch64::LD1RH_IMM:
  case AArch64::LD1RH_S_IMM:
  case AArch64::LD1RH_D_IMM:
  case AArch64::LD1RSH_S_IMM:
  case AArch64::LD1RSH_D_IMM:
    Scale = TypeSize::getFixed(2);
    Width = TypeSize::getFixed(2);
    MinOffset = 0;
    MaxOffset = 63;
    break;
  case AArch64::LD1RW_IMM:
  case AArch64::LD1RW_D_IMM:
  case AArch64::LD1RSW_IMM:
    Scale = TypeSize::getFixed(4);
    Width = TypeSize::getFixed(4);
    MinOffset = 0;
    MaxOffset = 63;
    break;
  case AArch64::LD1RD_IMM:
    Scale = TypeSize::getFixed(8);
    Width = TypeSize::getFixed(8);
    MinOffset = 0;
    MaxOffset = 63;
    break;
  }

  return true;
}
4930
4931// Scaling factor for unscaled load or store.
4933 switch (Opc) {
4934 default:
4935 llvm_unreachable("Opcode has unknown scale!");
4936 case AArch64::LDRBui:
4937 case AArch64::LDRBBui:
4938 case AArch64::LDURBBi:
4939 case AArch64::LDRSBWui:
4940 case AArch64::LDURSBWi:
4941 case AArch64::STRBui:
4942 case AArch64::STRBBui:
4943 case AArch64::STURBBi:
4944 return 1;
4945 case AArch64::LDRHui:
4946 case AArch64::LDRHHui:
4947 case AArch64::LDURHHi:
4948 case AArch64::LDRSHWui:
4949 case AArch64::LDURSHWi:
4950 case AArch64::STRHui:
4951 case AArch64::STRHHui:
4952 case AArch64::STURHHi:
4953 return 2;
4954 case AArch64::LDRSui:
4955 case AArch64::LDURSi:
4956 case AArch64::LDRSpre:
4957 case AArch64::LDRSWui:
4958 case AArch64::LDURSWi:
4959 case AArch64::LDRSWpre:
4960 case AArch64::LDRWpre:
4961 case AArch64::LDRWui:
4962 case AArch64::LDURWi:
4963 case AArch64::STRSui:
4964 case AArch64::STURSi:
4965 case AArch64::STRSpre:
4966 case AArch64::STRWui:
4967 case AArch64::STURWi:
4968 case AArch64::STRWpre:
4969 case AArch64::LDPSi:
4970 case AArch64::LDPSWi:
4971 case AArch64::LDPWi:
4972 case AArch64::STPSi:
4973 case AArch64::STPWi:
4974 return 4;
4975 case AArch64::LDRDui:
4976 case AArch64::LDURDi:
4977 case AArch64::LDRDpre:
4978 case AArch64::LDRXui:
4979 case AArch64::LDURXi:
4980 case AArch64::LDRXpre:
4981 case AArch64::STRDui:
4982 case AArch64::STURDi:
4983 case AArch64::STRDpre:
4984 case AArch64::STRXui:
4985 case AArch64::STURXi:
4986 case AArch64::STRXpre:
4987 case AArch64::LDPDi:
4988 case AArch64::LDPXi:
4989 case AArch64::STPDi:
4990 case AArch64::STPXi:
4991 return 8;
4992 case AArch64::LDRQui:
4993 case AArch64::LDURQi:
4994 case AArch64::STRQui:
4995 case AArch64::STURQi:
4996 case AArch64::STRQpre:
4997 case AArch64::LDPQi:
4998 case AArch64::LDRQpre:
4999 case AArch64::STPQi:
5000 case AArch64::STGi:
5001 case AArch64::STZGi:
5002 case AArch64::ST2Gi:
5003 case AArch64::STZ2Gi:
5004 case AArch64::STGPi:
5005 return 16;
5006 }
5007}
5008
5010 switch (MI.getOpcode()) {
5011 default:
5012 return false;
5013 case AArch64::LDRWpre:
5014 case AArch64::LDRXpre:
5015 case AArch64::LDRSWpre:
5016 case AArch64::LDRSpre:
5017 case AArch64::LDRDpre:
5018 case AArch64::LDRQpre:
5019 return true;
5020 }
5021}
5022
5024 switch (MI.getOpcode()) {
5025 default:
5026 return false;
5027 case AArch64::STRWpre:
5028 case AArch64::STRXpre:
5029 case AArch64::STRSpre:
5030 case AArch64::STRDpre:
5031 case AArch64::STRQpre:
5032 return true;
5033 }
5034}
5035
  // True for any pre-indexed (writeback-before-access) load or store.
  return isPreLd(MI) || isPreSt(MI);
}
5039
5041 switch (MI.getOpcode()) {
5042 default:
5043 return false;
5044 case AArch64::LDURBBi:
5045 case AArch64::LDURHHi:
5046 case AArch64::LDURWi:
5047 case AArch64::LDRBBui:
5048 case AArch64::LDRHHui:
5049 case AArch64::LDRWui:
5050 case AArch64::LDRBBroX:
5051 case AArch64::LDRHHroX:
5052 case AArch64::LDRWroX:
5053 case AArch64::LDRBBroW:
5054 case AArch64::LDRHHroW:
5055 case AArch64::LDRWroW:
5056 return true;
5057 }
5058}
5059
5061 switch (MI.getOpcode()) {
5062 default:
5063 return false;
5064 case AArch64::LDURSBWi:
5065 case AArch64::LDURSHWi:
5066 case AArch64::LDURSBXi:
5067 case AArch64::LDURSHXi:
5068 case AArch64::LDURSWi:
5069 case AArch64::LDRSBWui:
5070 case AArch64::LDRSHWui:
5071 case AArch64::LDRSBXui:
5072 case AArch64::LDRSHXui:
5073 case AArch64::LDRSWui:
5074 case AArch64::LDRSBWroX:
5075 case AArch64::LDRSHWroX:
5076 case AArch64::LDRSBXroX:
5077 case AArch64::LDRSHXroX:
5078 case AArch64::LDRSWroX:
5079 case AArch64::LDRSBWroW:
5080 case AArch64::LDRSHWroW:
5081 case AArch64::LDRSBXroW:
5082 case AArch64::LDRSHXroW:
5083 case AArch64::LDRSWroW:
5084 return true;
5085 }
5086}
5087
5089 switch (MI.getOpcode()) {
5090 default:
5091 return false;
5092 case AArch64::LDPSi:
5093 case AArch64::LDPSWi:
5094 case AArch64::LDPDi:
5095 case AArch64::LDPQi:
5096 case AArch64::LDPWi:
5097 case AArch64::LDPXi:
5098 case AArch64::STPSi:
5099 case AArch64::STPDi:
5100 case AArch64::STPQi:
5101 case AArch64::STPWi:
5102 case AArch64::STPXi:
5103 case AArch64::STGPi:
5104 return true;
5105 }
5106}
5107
5109 assert(MI.mayLoadOrStore() && "Load or store instruction expected");
5110 unsigned Idx =
5112 : 1;
5113 return MI.getOperand(Idx);
5114}
5115
5116const MachineOperand &
5118 assert(MI.mayLoadOrStore() && "Load or store instruction expected");
5119 unsigned Idx =
5121 : 2;
5122 return MI.getOperand(Idx);
5123}
5124
5125const MachineOperand &
5127 switch (MI.getOpcode()) {
5128 default:
5129 llvm_unreachable("Unexpected opcode");
5130 case AArch64::LDRBroX:
5131 case AArch64::LDRBBroX:
5132 case AArch64::LDRSBXroX:
5133 case AArch64::LDRSBWroX:
5134 case AArch64::LDRHroX:
5135 case AArch64::LDRHHroX:
5136 case AArch64::LDRSHXroX:
5137 case AArch64::LDRSHWroX:
5138 case AArch64::LDRWroX:
5139 case AArch64::LDRSroX:
5140 case AArch64::LDRSWroX:
5141 case AArch64::LDRDroX:
5142 case AArch64::LDRXroX:
5143 case AArch64::LDRQroX:
5144 return MI.getOperand(4);
5145 }
5146}
5147
                                                  Register Reg) {
  // Look up Reg's register class via the enclosing function's MachineRegisterInfo.
  // Returns nullptr when the instruction is not yet inserted into a basic
  // block/function, or when no class has been assigned to the register.
  if (MI.getParent() == nullptr)
    return nullptr;
  const MachineFunction *MF = MI.getParent()->getParent();
  return MF ? MF->getRegInfo().getRegClassOrNull(Reg) : nullptr;
}
5155
5157 auto IsHFPR = [&](const MachineOperand &Op) {
5158 if (!Op.isReg())
5159 return false;
5160 auto Reg = Op.getReg();
5161 if (Reg.isPhysical())
5162 return AArch64::FPR16RegClass.contains(Reg);
5163 const TargetRegisterClass *TRC = ::getRegClass(MI, Reg);
5164 return TRC == &AArch64::FPR16RegClass ||
5165 TRC == &AArch64::FPR16_loRegClass;
5166 };
5167 return llvm::any_of(MI.operands(), IsHFPR);
5168}
5169
5171 auto IsQFPR = [&](const MachineOperand &Op) {
5172 if (!Op.isReg())
5173 return false;
5174 auto Reg = Op.getReg();
5175 if (Reg.isPhysical())
5176 return AArch64::FPR128RegClass.contains(Reg);
5177 const TargetRegisterClass *TRC = ::getRegClass(MI, Reg);
5178 return TRC == &AArch64::FPR128RegClass ||
5179 TRC == &AArch64::FPR128_loRegClass;
5180 };
5181 return llvm::any_of(MI.operands(), IsQFPR);
5182}
5183
5185 switch (MI.getOpcode()) {
5186 case AArch64::BRK:
5187 case AArch64::HLT:
5188 case AArch64::PACIASP:
5189 case AArch64::PACIBSP:
5190 // Implicit BTI behavior.
5191 return true;
5192 case AArch64::PAUTH_PROLOGUE:
5193 // PAUTH_PROLOGUE expands to PACI(A|B)SP.
5194 return true;
5195 case AArch64::HINT: {
5196 unsigned Imm = MI.getOperand(0).getImm();
5197 // Explicit BTI instruction.
5198 if (Imm == 32 || Imm == 34 || Imm == 36 || Imm == 38)
5199 return true;
5200 // PACI(A|B)SP instructions.
5201 if (Imm == 25 || Imm == 27)
5202 return true;
5203 return false;
5204 }
5205 default:
5206 return false;
5207 }
5208}
5209
5211 if (Reg == 0)
5212 return false;
5213 assert(Reg.isPhysical() && "Expected physical register in isFpOrNEON");
5214 return AArch64::FPR128RegClass.contains(Reg) ||
5215 AArch64::FPR64RegClass.contains(Reg) ||
5216 AArch64::FPR32RegClass.contains(Reg) ||
5217 AArch64::FPR16RegClass.contains(Reg) ||
5218 AArch64::FPR8RegClass.contains(Reg);
5219}
5220
5222 auto IsFPR = [&](const MachineOperand &Op) {
5223 if (!Op.isReg())
5224 return false;
5225 auto Reg = Op.getReg();
5226 if (Reg.isPhysical())
5227 return isFpOrNEON(Reg);
5228
5229 const TargetRegisterClass *TRC = ::getRegClass(MI, Reg);
5230 return TRC == &AArch64::FPR128RegClass ||
5231 TRC == &AArch64::FPR128_loRegClass ||
5232 TRC == &AArch64::FPR64RegClass ||
5233 TRC == &AArch64::FPR64_loRegClass ||
5234 TRC == &AArch64::FPR32RegClass || TRC == &AArch64::FPR16RegClass ||
5235 TRC == &AArch64::FPR8RegClass;
5236 };
5237 return llvm::any_of(MI.operands(), IsFPR);
5238}
5239
5240// Scale the unscaled offsets. Returns false if the unscaled offset can't be
5241// scaled.
5242static bool scaleOffset(unsigned Opc, int64_t &Offset) {
5244
5245 // If the byte-offset isn't a multiple of the stride, we can't scale this
5246 // offset.
5247 if (Offset % Scale != 0)
5248 return false;
5249
5250 // Convert the byte-offset used by unscaled into an "element" offset used
5251 // by the scaled pair load/store instructions.
5252 Offset /= Scale;
5253 return true;
5254}
5255
5256static bool canPairLdStOpc(unsigned FirstOpc, unsigned SecondOpc) {
5257 if (FirstOpc == SecondOpc)
5258 return true;
5259 // We can also pair sign-ext and zero-ext instructions.
5260 switch (FirstOpc) {
5261 default:
5262 return false;
5263 case AArch64::STRSui:
5264 case AArch64::STURSi:
5265 return SecondOpc == AArch64::STRSui || SecondOpc == AArch64::STURSi;
5266 case AArch64::STRDui:
5267 case AArch64::STURDi:
5268 return SecondOpc == AArch64::STRDui || SecondOpc == AArch64::STURDi;
5269 case AArch64::STRQui:
5270 case AArch64::STURQi:
5271 return SecondOpc == AArch64::STRQui || SecondOpc == AArch64::STURQi;
5272 case AArch64::STRWui:
5273 case AArch64::STURWi:
5274 return SecondOpc == AArch64::STRWui || SecondOpc == AArch64::STURWi;
5275 case AArch64::STRXui:
5276 case AArch64::STURXi:
5277 return SecondOpc == AArch64::STRXui || SecondOpc == AArch64::STURXi;
5278 case AArch64::LDRSui:
5279 case AArch64::LDURSi:
5280 return SecondOpc == AArch64::LDRSui || SecondOpc == AArch64::LDURSi;
5281 case AArch64::LDRDui:
5282 case AArch64::LDURDi:
5283 return SecondOpc == AArch64::LDRDui || SecondOpc == AArch64::LDURDi;
5284 case AArch64::LDRQui:
5285 case AArch64::LDURQi:
5286 return SecondOpc == AArch64::LDRQui || SecondOpc == AArch64::LDURQi;
5287 case AArch64::LDRWui:
5288 case AArch64::LDURWi:
5289 return SecondOpc == AArch64::LDRSWui || SecondOpc == AArch64::LDURSWi;
5290 case AArch64::LDRSWui:
5291 case AArch64::LDURSWi:
5292 return SecondOpc == AArch64::LDRWui || SecondOpc == AArch64::LDURWi;
5293 case AArch64::LDRXui:
5294 case AArch64::LDURXi:
5295 return SecondOpc == AArch64::LDRXui || SecondOpc == AArch64::LDURXi;
5296 }
5297 // These instructions can't be paired based on their opcodes.
5298 return false;
5299}
5300
5301static bool shouldClusterFI(const MachineFrameInfo &MFI, int FI1,
5302 int64_t Offset1, unsigned Opcode1, int FI2,
5303 int64_t Offset2, unsigned Opcode2) {
5304 // Accesses through fixed stack object frame indices may access a different
5305 // fixed stack slot. Check that the object offsets + offsets match.
5306 if (MFI.isFixedObjectIndex(FI1) && MFI.isFixedObjectIndex(FI2)) {
5307 int64_t ObjectOffset1 = MFI.getObjectOffset(FI1);
5308 int64_t ObjectOffset2 = MFI.getObjectOffset(FI2);
5309 assert(ObjectOffset1 <= ObjectOffset2 && "Object offsets are not ordered.");
5310 // Convert to scaled object offsets.
5311 int Scale1 = AArch64InstrInfo::getMemScale(Opcode1);
5312 if (ObjectOffset1 % Scale1 != 0)
5313 return false;
5314 ObjectOffset1 /= Scale1;
5315 int Scale2 = AArch64InstrInfo::getMemScale(Opcode2);
5316 if (ObjectOffset2 % Scale2 != 0)
5317 return false;
5318 ObjectOffset2 /= Scale2;
5319 ObjectOffset1 += Offset1;
5320 ObjectOffset2 += Offset2;
5321 return ObjectOffset1 + 1 == ObjectOffset2;
5322 }
5323
5324 return FI1 == FI2;
5325}
5326
/// Detect opportunities for ldp/stp formation.
///
/// Only called for LdSt for which getMemOperandWithOffset returns true.
/// Returns true when both accesses use the same base (register or frame
/// index), their opcodes are pairable, and the scaled offsets are adjacent
/// and within the 7-bit signed immediate range of a paired instruction.
    ArrayRef<const MachineOperand *> BaseOps1, int64_t OpOffset1,
    bool OffsetIsScalable1, ArrayRef<const MachineOperand *> BaseOps2,
    int64_t OpOffset2, bool OffsetIsScalable2, unsigned ClusterSize,
    unsigned NumBytes) const {
  assert(BaseOps1.size() == 1 && BaseOps2.size() == 1);
  const MachineOperand &BaseOp1 = *BaseOps1.front();
  const MachineOperand &BaseOp2 = *BaseOps2.front();
  const MachineInstr &FirstLdSt = *BaseOp1.getParent();
  const MachineInstr &SecondLdSt = *BaseOp2.getParent();
  // Mixed base kinds (register vs. frame index) never cluster.
  if (BaseOp1.getType() != BaseOp2.getType())
    return false;

  assert((BaseOp1.isReg() || BaseOp1.isFI()) &&
         "Only base registers and frame indices are supported.");

  // Check for both base regs and base FI.
  if (BaseOp1.isReg() && BaseOp1.getReg() != BaseOp2.getReg())
    return false;

  // Only cluster up to a single pair.
  if (ClusterSize > 2)
    return false;

  if (!isPairableLdStInst(FirstLdSt) || !isPairableLdStInst(SecondLdSt))
    return false;

  // Can we pair these instructions based on their opcodes?
  unsigned FirstOpc = FirstLdSt.getOpcode();
  unsigned SecondOpc = SecondLdSt.getOpcode();
  if (!canPairLdStOpc(FirstOpc, SecondOpc))
    return false;

  // Can't merge volatiles or load/stores that have a hint to avoid pair
  // formation, for example.
  if (!isCandidateToMergeOrPair(FirstLdSt) ||
      !isCandidateToMergeOrPair(SecondLdSt))
    return false;

  // isCandidateToMergeOrPair guarantees that operand 2 is an immediate.
  // Unscaled opcodes carry byte offsets; convert them to element offsets.
  int64_t Offset1 = FirstLdSt.getOperand(2).getImm();
  if (hasUnscaledLdStOffset(FirstOpc) && !scaleOffset(FirstOpc, Offset1))
    return false;

  int64_t Offset2 = SecondLdSt.getOperand(2).getImm();
  if (hasUnscaledLdStOffset(SecondOpc) && !scaleOffset(SecondOpc, Offset2))
    return false;

  // Pairwise instructions have a 7-bit signed offset field.
  if (Offset1 > 63 || Offset1 < -64)
    return false;

  // The caller should already have ordered First/SecondLdSt by offset.
  // Note: except for non-equal frame index bases
  if (BaseOp1.isFI()) {
    assert((!BaseOp1.isIdenticalTo(BaseOp2) || Offset1 <= Offset2) &&
           "Caller should have ordered offsets.");

    const MachineFrameInfo &MFI =
        FirstLdSt.getParent()->getParent()->getFrameInfo();
    return shouldClusterFI(MFI, BaseOp1.getIndex(), Offset1, FirstOpc,
                           BaseOp2.getIndex(), Offset2, SecondOpc);
  }

  assert(Offset1 <= Offset2 && "Caller should have ordered offsets.");

  // The two accesses must be exactly adjacent elements to form ldp/stp.
  return Offset1 + 1 == Offset2;
}
5398
                                            MCRegister Reg, unsigned SubIdx,
                                            RegState State,
                                            const TargetRegisterInfo *TRI) {
  // No sub-register index: add the register operand unchanged.
  if (!SubIdx)
    return MIB.addReg(Reg, State);

  // Physical registers must have the sub-register resolved now; virtual
  // registers instead carry the sub-register index on the operand.
  if (Reg.isPhysical())
    return MIB.addReg(TRI->getSubReg(Reg, SubIdx), State);
  return MIB.addReg(Reg, State, SubIdx);
}
5410
// Decide whether copying a register tuple forwards (low part first) would
// overwrite a source sub-register before it has been read. That happens
// exactly when the destination's first register lands inside the source
// tuple, computed modulo the 32-register bank.
static bool forwardCopyWillClobberTuple(unsigned DestReg, unsigned SrcReg,
                                        unsigned NumRegs) {
  // Masking with 0x1f yields the positive remainder of the difference mod 32.
  unsigned Distance = (DestReg - SrcReg) & 0x1f;
  return Distance < NumRegs;
}
5417
                                        const DebugLoc &DL, MCRegister DestReg,
                                        MCRegister SrcReg, bool KillSrc,
                                        unsigned Opcode,
                                        ArrayRef<unsigned> Indices) const {
  assert(Subtarget.hasNEON() && "Unexpected register copy without NEON");
  uint16_t DestEncoding = TRI->getEncodingValue(DestReg);
  uint16_t SrcEncoding = TRI->getEncodingValue(SrcReg);
  unsigned NumRegs = Indices.size();

  // Copy forwards by default; if the destination overlaps the not-yet-read
  // tail of the source tuple, copy backwards so no source sub-register is
  // clobbered before it has been copied.
  int SubReg = 0, End = NumRegs, Incr = 1;
  if (forwardCopyWillClobberTuple(DestEncoding, SrcEncoding, NumRegs)) {
    SubReg = NumRegs - 1;
    End = -1;
    Incr = -1;
  }

  // Emit one <Opcode> per sub-register; the opcode takes two source operands
  // (a self-ORR-style move: dst = src, src).
  for (; SubReg != End; SubReg += Incr) {
    const MachineInstrBuilder MIB = BuildMI(MBB, I, DL, get(Opcode));
    AddSubReg(MIB, DestReg, Indices[SubReg], RegState::Define, TRI);
    AddSubReg(MIB, SrcReg, Indices[SubReg], {}, TRI);
    AddSubReg(MIB, SrcReg, Indices[SubReg], getKillRegState(KillSrc), TRI);
  }
}
5444
                                       const DebugLoc &DL, MCRegister DestReg,
                                       MCRegister SrcReg, bool KillSrc,
                                       unsigned Opcode, unsigned ZeroReg,
                                       llvm::ArrayRef<unsigned> Indices) const {
  unsigned NumRegs = Indices.size();

#ifndef NDEBUG
  // GPR sequence tuples are aligned to their size, so a forward copy can
  // never clobber a not-yet-read source sub-register.
  uint16_t DestEncoding = TRI->getEncodingValue(DestReg);
  uint16_t SrcEncoding = TRI->getEncodingValue(SrcReg);
  assert(DestEncoding % NumRegs == 0 && SrcEncoding % NumRegs == 0 &&
         "GPR reg sequences should not be able to overlap");
#endif

  // Emit one "<Opcode> dst, ZeroReg, src, #0" per sub-register (callers pass
  // a shifted-ORR opcode plus the matching zero register).
  for (unsigned SubReg = 0; SubReg != NumRegs; ++SubReg) {
    const MachineInstrBuilder MIB = BuildMI(MBB, I, DL, get(Opcode));
    AddSubReg(MIB, DestReg, Indices[SubReg], RegState::Define, TRI);
    MIB.addReg(ZeroReg);
    AddSubReg(MIB, SrcReg, Indices[SubReg], getKillRegState(KillSrc), TRI);
    MIB.addImm(0);
  }
}
5469
/// Returns true if the instruction at I is in a streaming call site region,
/// within a single basic block.
/// A "call site streaming region" starts after smstart and ends at smstop
/// around a call to a streaming function. This walks backward from I.
  MachineFunction &MF = *MBB.getParent();
  // Functions that never change streaming mode cannot contain such regions.
  if (!AFI->hasStreamingModeChanges())
    return false;
  // Walk backwards to find smstart/smstop
  for (MachineInstr &MI : reverse(make_range(MBB.begin(), I))) {
    unsigned Opc = MI.getOpcode();
    if (Opc == AArch64::MSRpstatesvcrImm1 || Opc == AArch64::MSRpstatePseudo) {
      // Check if this is SM change (not ZA)
      int64_t PState = MI.getOperand(0).getImm();
      if (PState == AArch64SVCR::SVCRSM || PState == AArch64SVCR::SVCRSMZA) {
        // Operand 1 is 1 for start, 0 for stop. Seeing a "start" first while
        // walking backwards means I lies inside the streaming region.
        return MI.getOperand(1).getImm() == 1;
      }
    }
  }
  // No mode change found before I in this block.
  return false;
}
5494
/// Returns true if in a streaming call site region without SME-FA64.
/// In streaming regions NEON is illegal unless the SME-FA64 extension is
/// present, so callers use this to choose a non-NEON expansion at I.
static bool mustAvoidNeonAtMBBI(const AArch64Subtarget &Subtarget,
  return !Subtarget.hasSMEFA64() && isInStreamingCallSiteRegion(MBB, I);
}
5501
                                   const DebugLoc &DL, Register DestReg,
                                   Register SrcReg, bool KillSrc,
                                   bool RenamableDest,
                                   bool RenamableSrc) const {
  ++NumCopyInstrs;
  // GPR32 <-> GPR32 copies (including WSP).
  if (AArch64::GPR32spRegClass.contains(DestReg) &&
      AArch64::GPR32spRegClass.contains(SrcReg)) {
    if (DestReg == AArch64::WSP || SrcReg == AArch64::WSP) {
      // If either operand is WSP, expand to ADD #0.
      if (Subtarget.hasZeroCycleRegMoveGPR64() &&
          !Subtarget.hasZeroCycleRegMoveGPR32()) {
        // Cyclone recognizes "ADD Xd, Xn, #0" as a zero-cycle register move.
        MCRegister DestRegX = RI.getMatchingSuperReg(DestReg, AArch64::sub_32,
                                                     &AArch64::GPR64spRegClass);
        MCRegister SrcRegX = RI.getMatchingSuperReg(SrcReg, AArch64::sub_32,
                                                    &AArch64::GPR64spRegClass);
        // This instruction is reading and writing X registers. This may upset
        // the register scavenger and machine verifier, so we need to indicate
        // that we are reading an undefined value from SrcRegX, but a proper
        // value from SrcReg.
        BuildMI(MBB, I, DL, get(AArch64::ADDXri), DestRegX)
            .addReg(SrcRegX, RegState::Undef)
            .addImm(0)
            .addReg(SrcReg, RegState::Implicit | getKillRegState(KillSrc));
        ++NumZCRegMoveInstrsGPR;
      } else {
        BuildMI(MBB, I, DL, get(AArch64::ADDWri), DestReg)
            .addReg(SrcReg, getKillRegState(KillSrc))
            .addImm(0)
        // NOTE(review): the trailing shifter-immediate operand line appears
        // to be missing here (and at similar ADD/MOVZ expansions below) in
        // this excerpt — confirm against upstream.
        if (Subtarget.hasZeroCycleRegMoveGPR32())
          ++NumZCRegMoveInstrsGPR;
      }
    } else if (Subtarget.hasZeroCycleRegMoveGPR64() &&
               !Subtarget.hasZeroCycleRegMoveGPR32()) {
      // Cyclone recognizes "ORR Xd, XZR, Xm" as a zero-cycle register move.
      MCRegister DestRegX = RI.getMatchingSuperReg(DestReg, AArch64::sub_32,
                                                   &AArch64::GPR64spRegClass);
      assert(DestRegX.isValid() && "Destination super-reg not valid");
      MCRegister SrcRegX = RI.getMatchingSuperReg(SrcReg, AArch64::sub_32,
                                                  &AArch64::GPR64spRegClass);
      assert(SrcRegX.isValid() && "Source super-reg not valid");
      // This instruction is reading and writing X registers. This may upset
      // the register scavenger and machine verifier, so we need to indicate
      // that we are reading an undefined value from SrcRegX, but a proper
      // value from SrcReg.
      BuildMI(MBB, I, DL, get(AArch64::ORRXrr), DestRegX)
          .addReg(AArch64::XZR)
          .addReg(SrcRegX, RegState::Undef)
          .addReg(SrcReg, RegState::Implicit | getKillRegState(KillSrc));
      ++NumZCRegMoveInstrsGPR;
    } else {
      // Otherwise, expand to ORR WZR.
      BuildMI(MBB, I, DL, get(AArch64::ORRWrr), DestReg)
          .addReg(AArch64::WZR)
          .addReg(SrcReg, getKillRegState(KillSrc));
      if (Subtarget.hasZeroCycleRegMoveGPR32())
        ++NumZCRegMoveInstrsGPR;
    }
    return;
  }

  // GPR32 zeroing
  if (AArch64::GPR32spRegClass.contains(DestReg) && SrcReg == AArch64::WZR) {
    if (Subtarget.hasZeroCycleZeroingGPR64() &&
        !Subtarget.hasZeroCycleZeroingGPR32()) {
      // Zero the 64-bit super-register when only 64-bit zeroing is free.
      MCRegister DestRegX = RI.getMatchingSuperReg(DestReg, AArch64::sub_32,
                                                   &AArch64::GPR64spRegClass);
      assert(DestRegX.isValid() && "Destination super-reg not valid");
      BuildMI(MBB, I, DL, get(AArch64::MOVZXi), DestRegX)
          .addImm(0)
      ++NumZCZeroingInstrsGPR;
    } else if (Subtarget.hasZeroCycleZeroingGPR32()) {
      BuildMI(MBB, I, DL, get(AArch64::MOVZWi), DestReg)
          .addImm(0)
      ++NumZCZeroingInstrsGPR;
    } else {
      BuildMI(MBB, I, DL, get(AArch64::ORRWrr), DestReg)
          .addReg(AArch64::WZR)
          .addReg(AArch64::WZR);
    }
    return;
  }

  // GPR64 <-> GPR64 copies (including SP).
  if (AArch64::GPR64spRegClass.contains(DestReg) &&
      AArch64::GPR64spRegClass.contains(SrcReg)) {
    if (DestReg == AArch64::SP || SrcReg == AArch64::SP) {
      // If either operand is SP, expand to ADD #0.
      BuildMI(MBB, I, DL, get(AArch64::ADDXri), DestReg)
          .addReg(SrcReg, getKillRegState(KillSrc))
          .addImm(0)
      if (Subtarget.hasZeroCycleRegMoveGPR64())
        ++NumZCRegMoveInstrsGPR;
    } else {
      // Otherwise, expand to ORR XZR.
      BuildMI(MBB, I, DL, get(AArch64::ORRXrr), DestReg)
          .addReg(AArch64::XZR)
          .addReg(SrcReg, getKillRegState(KillSrc));
      if (Subtarget.hasZeroCycleRegMoveGPR64())
        ++NumZCRegMoveInstrsGPR;
    }
    return;
  }

  // GPR64 zeroing
  if (AArch64::GPR64spRegClass.contains(DestReg) && SrcReg == AArch64::XZR) {
    if (Subtarget.hasZeroCycleZeroingGPR64()) {
      BuildMI(MBB, I, DL, get(AArch64::MOVZXi), DestReg)
          .addImm(0)
      ++NumZCZeroingInstrsGPR;
    } else {
      BuildMI(MBB, I, DL, get(AArch64::ORRXrr), DestReg)
          .addReg(AArch64::XZR)
          .addReg(AArch64::XZR);
    }
    return;
  }

  // Copy a Predicate register by ORRing with itself.
  if (AArch64::PPRRegClass.contains(DestReg) &&
      AArch64::PPRRegClass.contains(SrcReg)) {
    assert(Subtarget.isSVEorStreamingSVEAvailable() &&
           "Unexpected SVE register.");
    BuildMI(MBB, I, DL, get(AArch64::ORR_PPzPP), DestReg)
        .addReg(SrcReg) // Pg
        .addReg(SrcReg)
        .addReg(SrcReg, getKillRegState(KillSrc));
    return;
  }

  // Copy a predicate-as-counter register by ORRing with itself as if it
  // were a regular predicate (mask) register.
  bool DestIsPNR = AArch64::PNRRegClass.contains(DestReg);
  bool SrcIsPNR = AArch64::PNRRegClass.contains(SrcReg);
  if (DestIsPNR || SrcIsPNR) {
    // Map PNn onto the equally-numbered Pn register.
    auto ToPPR = [](MCRegister R) -> MCRegister {
      return (R - AArch64::PN0) + AArch64::P0;
    };
    MCRegister PPRSrcReg = SrcIsPNR ? ToPPR(SrcReg) : SrcReg.asMCReg();
    MCRegister PPRDestReg = DestIsPNR ? ToPPR(DestReg) : DestReg.asMCReg();

    if (PPRSrcReg != PPRDestReg) {
      auto NewMI = BuildMI(MBB, I, DL, get(AArch64::ORR_PPzPP), PPRDestReg)
                       .addReg(PPRSrcReg) // Pg
                       .addReg(PPRSrcReg)
                       .addReg(PPRSrcReg, getKillRegState(KillSrc));
      // Keep the PNR destination visible to the verifier/liveness.
      if (DestIsPNR)
        NewMI.addDef(DestReg, RegState::Implicit);
    }
    return;
  }

  // Copy a Z register by ORRing with itself.
  if (AArch64::ZPRRegClass.contains(DestReg) &&
      AArch64::ZPRRegClass.contains(SrcReg)) {
    assert(Subtarget.isSVEorStreamingSVEAvailable() &&
           "Unexpected SVE register.");
    BuildMI(MBB, I, DL, get(AArch64::ORR_ZZZ), DestReg)
        .addReg(SrcReg)
        .addReg(SrcReg, getKillRegState(KillSrc));
    return;
  }

  // Copy a Z register pair by copying the individual sub-registers.
  if ((AArch64::ZPR2RegClass.contains(DestReg) ||
       AArch64::ZPR2StridedOrContiguousRegClass.contains(DestReg)) &&
      (AArch64::ZPR2RegClass.contains(SrcReg) ||
       AArch64::ZPR2StridedOrContiguousRegClass.contains(SrcReg))) {
    assert(Subtarget.isSVEorStreamingSVEAvailable() &&
           "Unexpected SVE register.");
    static const unsigned Indices[] = {AArch64::zsub0, AArch64::zsub1};
    copyPhysRegTuple(MBB, I, DL, DestReg, SrcReg, KillSrc, AArch64::ORR_ZZZ,
                     Indices);
    return;
  }

  // Copy a Z register triple by copying the individual sub-registers.
  if (AArch64::ZPR3RegClass.contains(DestReg) &&
      AArch64::ZPR3RegClass.contains(SrcReg)) {
    assert(Subtarget.isSVEorStreamingSVEAvailable() &&
           "Unexpected SVE register.");
    static const unsigned Indices[] = {AArch64::zsub0, AArch64::zsub1,
                                       AArch64::zsub2};
    copyPhysRegTuple(MBB, I, DL, DestReg, SrcReg, KillSrc, AArch64::ORR_ZZZ,
                     Indices);
    return;
  }

  // Copy a Z register quad by copying the individual sub-registers.
  if ((AArch64::ZPR4RegClass.contains(DestReg) ||
       AArch64::ZPR4StridedOrContiguousRegClass.contains(DestReg)) &&
      (AArch64::ZPR4RegClass.contains(SrcReg) ||
       AArch64::ZPR4StridedOrContiguousRegClass.contains(SrcReg))) {
    assert(Subtarget.isSVEorStreamingSVEAvailable() &&
           "Unexpected SVE register.");
    static const unsigned Indices[] = {AArch64::zsub0, AArch64::zsub1,
                                       AArch64::zsub2, AArch64::zsub3};
    copyPhysRegTuple(MBB, I, DL, DestReg, SrcReg, KillSrc, AArch64::ORR_ZZZ,
                     Indices);
    return;
  }

  // Copy a DDDD register quad by copying the individual sub-registers.
  if (AArch64::DDDDRegClass.contains(DestReg) &&
      AArch64::DDDDRegClass.contains(SrcReg)) {
    static const unsigned Indices[] = {AArch64::dsub0, AArch64::dsub1,
                                       AArch64::dsub2, AArch64::dsub3};
    copyPhysRegTuple(MBB, I, DL, DestReg, SrcReg, KillSrc, AArch64::ORRv8i8,
                     Indices);
    return;
  }

  // Copy a DDD register triple by copying the individual sub-registers.
  if (AArch64::DDDRegClass.contains(DestReg) &&
      AArch64::DDDRegClass.contains(SrcReg)) {
    static const unsigned Indices[] = {AArch64::dsub0, AArch64::dsub1,
                                       AArch64::dsub2};
    copyPhysRegTuple(MBB, I, DL, DestReg, SrcReg, KillSrc, AArch64::ORRv8i8,
                     Indices);
    return;
  }

  // Copy a DD register pair by copying the individual sub-registers.
  if (AArch64::DDRegClass.contains(DestReg) &&
      AArch64::DDRegClass.contains(SrcReg)) {
    static const unsigned Indices[] = {AArch64::dsub0, AArch64::dsub1};
    copyPhysRegTuple(MBB, I, DL, DestReg, SrcReg, KillSrc, AArch64::ORRv8i8,
                     Indices);
    return;
  }

  // Copy a QQQQ register quad by copying the individual sub-registers.
  if (AArch64::QQQQRegClass.contains(DestReg) &&
      AArch64::QQQQRegClass.contains(SrcReg)) {
    static const unsigned Indices[] = {AArch64::qsub0, AArch64::qsub1,
                                       AArch64::qsub2, AArch64::qsub3};
    copyPhysRegTuple(MBB, I, DL, DestReg, SrcReg, KillSrc, AArch64::ORRv16i8,
                     Indices);
    return;
  }

  // Copy a QQQ register triple by copying the individual sub-registers.
  if (AArch64::QQQRegClass.contains(DestReg) &&
      AArch64::QQQRegClass.contains(SrcReg)) {
    static const unsigned Indices[] = {AArch64::qsub0, AArch64::qsub1,
                                       AArch64::qsub2};
    copyPhysRegTuple(MBB, I, DL, DestReg, SrcReg, KillSrc, AArch64::ORRv16i8,
                     Indices);
    return;
  }

  // Copy a QQ register pair by copying the individual sub-registers.
  if (AArch64::QQRegClass.contains(DestReg) &&
      AArch64::QQRegClass.contains(SrcReg)) {
    static const unsigned Indices[] = {AArch64::qsub0, AArch64::qsub1};
    copyPhysRegTuple(MBB, I, DL, DestReg, SrcReg, KillSrc, AArch64::ORRv16i8,
                     Indices);
    return;
  }

  // 64-bit GPR sequence pairs: copy each half via ORR with the zero register.
  if (AArch64::XSeqPairsClassRegClass.contains(DestReg) &&
      AArch64::XSeqPairsClassRegClass.contains(SrcReg)) {
    static const unsigned Indices[] = {AArch64::sube64, AArch64::subo64};
    copyGPRRegTuple(MBB, I, DL, DestReg, SrcReg, KillSrc, AArch64::ORRXrs,
                    AArch64::XZR, Indices);
    return;
  }

  // 32-bit GPR sequence pairs: same, with the 32-bit ORR and WZR.
  if (AArch64::WSeqPairsClassRegClass.contains(DestReg) &&
      AArch64::WSeqPairsClassRegClass.contains(SrcReg)) {
    static const unsigned Indices[] = {AArch64::sube32, AArch64::subo32};
    copyGPRRegTuple(MBB, I, DL, DestReg, SrcReg, KillSrc, AArch64::ORRWrs,
                    AArch64::WZR, Indices);
    return;
  }

  if (AArch64::FPR128RegClass.contains(DestReg) &&
      AArch64::FPR128RegClass.contains(SrcReg)) {
    // In streaming regions, NEON is illegal but streaming-SVE is available.
    // Use SVE for copies if we're in a streaming region and SME is available.
    // With +sme-fa64, NEON is legal in streaming mode so we can use it.
    if ((Subtarget.isSVEorStreamingSVEAvailable() &&
         !Subtarget.isNeonAvailable()) ||
        mustAvoidNeonAtMBBI(Subtarget, MBB, I)) {
      // Copy Qn via the identically-numbered Zn register.
      BuildMI(MBB, I, DL, get(AArch64::ORR_ZZZ))
          .addReg(AArch64::Z0 + (DestReg - AArch64::Q0), RegState::Define)
          .addReg(AArch64::Z0 + (SrcReg - AArch64::Q0))
          .addReg(AArch64::Z0 + (SrcReg - AArch64::Q0));
    } else if (Subtarget.isNeonAvailable()) {
      BuildMI(MBB, I, DL, get(AArch64::ORRv16i8), DestReg)
          .addReg(SrcReg)
          .addReg(SrcReg, getKillRegState(KillSrc));
      if (Subtarget.hasZeroCycleRegMoveFPR128())
        ++NumZCRegMoveInstrsFPR;
    } else {
      // Neither NEON nor SVE: bounce the value through the stack with a
      // pre-decrement store and post-increment reload.
      BuildMI(MBB, I, DL, get(AArch64::STRQpre))
          .addReg(AArch64::SP, RegState::Define)
          .addReg(SrcReg, getKillRegState(KillSrc))
          .addReg(AArch64::SP)
          .addImm(-16);
      BuildMI(MBB, I, DL, get(AArch64::LDRQpost))
          .addReg(AArch64::SP, RegState::Define)
          .addReg(DestReg, RegState::Define)
          .addReg(AArch64::SP)
          .addImm(16);
    }
    return;
  }

  if (AArch64::FPR64RegClass.contains(DestReg) &&
      AArch64::FPR64RegClass.contains(SrcReg)) {
    if (Subtarget.hasZeroCycleRegMoveFPR128() &&
        !Subtarget.hasZeroCycleRegMoveFPR64() &&
        !Subtarget.hasZeroCycleRegMoveFPR32() && Subtarget.isNeonAvailable() &&
        !mustAvoidNeonAtMBBI(Subtarget, MBB, I)) {
      MCRegister DestRegQ = RI.getMatchingSuperReg(DestReg, AArch64::dsub,
                                                   &AArch64::FPR128RegClass);
      MCRegister SrcRegQ = RI.getMatchingSuperReg(SrcReg, AArch64::dsub,
                                                  &AArch64::FPR128RegClass);
      // This instruction is reading and writing Q registers. This may upset
      // the register scavenger and machine verifier, so we need to indicate
      // that we are reading an undefined value from SrcRegQ, but a proper
      // value from SrcReg.
      BuildMI(MBB, I, DL, get(AArch64::ORRv16i8), DestRegQ)
          .addReg(SrcRegQ, RegState::Undef)
          .addReg(SrcRegQ, RegState::Undef)
          .addReg(SrcReg, RegState::Implicit | getKillRegState(KillSrc));
      ++NumZCRegMoveInstrsFPR;
    } else {
      BuildMI(MBB, I, DL, get(AArch64::FMOVDr), DestReg)
          .addReg(SrcReg, getKillRegState(KillSrc));
      if (Subtarget.hasZeroCycleRegMoveFPR64())
        ++NumZCRegMoveInstrsFPR;
    }
    return;
  }

  if (AArch64::FPR32RegClass.contains(DestReg) &&
      AArch64::FPR32RegClass.contains(SrcReg)) {
    if (Subtarget.hasZeroCycleRegMoveFPR128() &&
        !Subtarget.hasZeroCycleRegMoveFPR64() &&
        !Subtarget.hasZeroCycleRegMoveFPR32() && Subtarget.isNeonAvailable() &&
        !mustAvoidNeonAtMBBI(Subtarget, MBB, I)) {
      MCRegister DestRegQ = RI.getMatchingSuperReg(DestReg, AArch64::ssub,
                                                   &AArch64::FPR128RegClass);
      MCRegister SrcRegQ = RI.getMatchingSuperReg(SrcReg, AArch64::ssub,
                                                  &AArch64::FPR128RegClass);
      // This instruction is reading and writing Q registers. This may upset
      // the register scavenger and machine verifier, so we need to indicate
      // that we are reading an undefined value from SrcRegQ, but a proper
      // value from SrcReg.
      BuildMI(MBB, I, DL, get(AArch64::ORRv16i8), DestRegQ)
          .addReg(SrcRegQ, RegState::Undef)
          .addReg(SrcRegQ, RegState::Undef)
          .addReg(SrcReg, RegState::Implicit | getKillRegState(KillSrc));
      ++NumZCRegMoveInstrsFPR;
    } else if (Subtarget.hasZeroCycleRegMoveFPR64() &&
               !Subtarget.hasZeroCycleRegMoveFPR32()) {
      MCRegister DestRegD = RI.getMatchingSuperReg(DestReg, AArch64::ssub,
                                                   &AArch64::FPR64RegClass);
      MCRegister SrcRegD = RI.getMatchingSuperReg(SrcReg, AArch64::ssub,
                                                  &AArch64::FPR64RegClass);
      // This instruction is reading and writing D registers. This may upset
      // the register scavenger and machine verifier, so we need to indicate
      // that we are reading an undefined value from SrcRegD, but a proper
      // value from SrcReg.
      BuildMI(MBB, I, DL, get(AArch64::FMOVDr), DestRegD)
          .addReg(SrcRegD, RegState::Undef)
          .addReg(SrcReg, RegState::Implicit | getKillRegState(KillSrc));
      ++NumZCRegMoveInstrsFPR;
    } else {
      BuildMI(MBB, I, DL, get(AArch64::FMOVSr), DestReg)
          .addReg(SrcReg, getKillRegState(KillSrc));
      if (Subtarget.hasZeroCycleRegMoveFPR32())
        ++NumZCRegMoveInstrsFPR;
    }
    return;
  }

  if (AArch64::FPR16RegClass.contains(DestReg) &&
      AArch64::FPR16RegClass.contains(SrcReg)) {
    if (Subtarget.hasZeroCycleRegMoveFPR128() &&
        !Subtarget.hasZeroCycleRegMoveFPR64() &&
        !Subtarget.hasZeroCycleRegMoveFPR32() && Subtarget.isNeonAvailable() &&
        !mustAvoidNeonAtMBBI(Subtarget, MBB, I)) {
      MCRegister DestRegQ = RI.getMatchingSuperReg(DestReg, AArch64::hsub,
                                                   &AArch64::FPR128RegClass);
      MCRegister SrcRegQ = RI.getMatchingSuperReg(SrcReg, AArch64::hsub,
                                                  &AArch64::FPR128RegClass);
      // This instruction is reading and writing Q registers. This may upset
      // the register scavenger and machine verifier, so we need to indicate
      // that we are reading an undefined value from SrcRegQ, but a proper
      // value from SrcReg.
      BuildMI(MBB, I, DL, get(AArch64::ORRv16i8), DestRegQ)
          .addReg(SrcRegQ, RegState::Undef)
          .addReg(SrcRegQ, RegState::Undef)
          .addReg(SrcReg, RegState::Implicit | getKillRegState(KillSrc));
    } else if (Subtarget.hasZeroCycleRegMoveFPR64() &&
               !Subtarget.hasZeroCycleRegMoveFPR32()) {
      MCRegister DestRegD = RI.getMatchingSuperReg(DestReg, AArch64::hsub,
                                                   &AArch64::FPR64RegClass);
      MCRegister SrcRegD = RI.getMatchingSuperReg(SrcReg, AArch64::hsub,
                                                  &AArch64::FPR64RegClass);
      // This instruction is reading and writing D registers. This may upset
      // the register scavenger and machine verifier, so we need to indicate
      // that we are reading an undefined value from SrcRegD, but a proper
      // value from SrcReg.
      BuildMI(MBB, I, DL, get(AArch64::FMOVDr), DestRegD)
          .addReg(SrcRegD, RegState::Undef)
          .addReg(SrcReg, RegState::Implicit | getKillRegState(KillSrc));
    } else {
      // No zero-cycle path: widen H registers to S and use FMOVSr.
      DestReg = RI.getMatchingSuperReg(DestReg, AArch64::hsub,
                                       &AArch64::FPR32RegClass);
      SrcReg = RI.getMatchingSuperReg(SrcReg, AArch64::hsub,
                                      &AArch64::FPR32RegClass);
      BuildMI(MBB, I, DL, get(AArch64::FMOVSr), DestReg)
          .addReg(SrcReg, getKillRegState(KillSrc));
    }
    return;
  }

  if (AArch64::FPR8RegClass.contains(DestReg) &&
      AArch64::FPR8RegClass.contains(SrcReg)) {
    if (Subtarget.hasZeroCycleRegMoveFPR128() &&
        !Subtarget.hasZeroCycleRegMoveFPR64() &&
        !Subtarget.hasZeroCycleRegMoveFPR32() && Subtarget.isNeonAvailable() &&
        !mustAvoidNeonAtMBBI(Subtarget, MBB, I)) {
      MCRegister DestRegQ = RI.getMatchingSuperReg(DestReg, AArch64::bsub,
                                                   &AArch64::FPR128RegClass);
      MCRegister SrcRegQ = RI.getMatchingSuperReg(SrcReg, AArch64::bsub,
                                                  &AArch64::FPR128RegClass);
      // This instruction is reading and writing Q registers. This may upset
      // the register scavenger and machine verifier, so we need to indicate
      // that we are reading an undefined value from SrcRegQ, but a proper
      // value from SrcReg.
      BuildMI(MBB, I, DL, get(AArch64::ORRv16i8), DestRegQ)
          .addReg(SrcRegQ, RegState::Undef)
          .addReg(SrcRegQ, RegState::Undef)
          .addReg(SrcReg, RegState::Implicit | getKillRegState(KillSrc));
    } else if (Subtarget.hasZeroCycleRegMoveFPR64() &&
               !Subtarget.hasZeroCycleRegMoveFPR32()) {
      MCRegister DestRegD = RI.getMatchingSuperReg(DestReg, AArch64::bsub,
                                                   &AArch64::FPR64RegClass);
      MCRegister SrcRegD = RI.getMatchingSuperReg(SrcReg, AArch64::bsub,
                                                  &AArch64::FPR64RegClass);
      // This instruction is reading and writing D registers. This may upset
      // the register scavenger and machine verifier, so we need to indicate
      // that we are reading an undefined value from SrcRegD, but a proper
      // value from SrcReg.
      BuildMI(MBB, I, DL, get(AArch64::FMOVDr), DestRegD)
          .addReg(SrcRegD, RegState::Undef)
          .addReg(SrcReg, RegState::Implicit | getKillRegState(KillSrc));
    } else {
      // No zero-cycle path: widen B registers to S and use FMOVSr.
      DestReg = RI.getMatchingSuperReg(DestReg, AArch64::bsub,
                                       &AArch64::FPR32RegClass);
      SrcReg = RI.getMatchingSuperReg(SrcReg, AArch64::bsub,
                                      &AArch64::FPR32RegClass);
      BuildMI(MBB, I, DL, get(AArch64::FMOVSr), DestReg)
          .addReg(SrcReg, getKillRegState(KillSrc));
    }
    return;
  }

  // Copies between GPR64 and FPR64.
  if (AArch64::FPR64RegClass.contains(DestReg) &&
      AArch64::GPR64RegClass.contains(SrcReg)) {
    if (AArch64::XZR == SrcReg) {
      // Copying from XZR is just zeroing the FP register.
      BuildMI(MBB, I, DL, get(AArch64::FMOVD0), DestReg);
    } else {
      BuildMI(MBB, I, DL, get(AArch64::FMOVXDr), DestReg)
          .addReg(SrcReg, getKillRegState(KillSrc));
    }
    return;
  }
  if (AArch64::GPR64RegClass.contains(DestReg) &&
      AArch64::FPR64RegClass.contains(SrcReg)) {
    BuildMI(MBB, I, DL, get(AArch64::FMOVDXr), DestReg)
        .addReg(SrcReg, getKillRegState(KillSrc));
    return;
  }
  // Copies between GPR32 and FPR32.
  if (AArch64::FPR32RegClass.contains(DestReg) &&
      AArch64::GPR32RegClass.contains(SrcReg)) {
    if (AArch64::WZR == SrcReg) {
      // Copying from WZR is just zeroing the FP register.
      BuildMI(MBB, I, DL, get(AArch64::FMOVS0), DestReg);
    } else {
      BuildMI(MBB, I, DL, get(AArch64::FMOVWSr), DestReg)
          .addReg(SrcReg, getKillRegState(KillSrc));
    }
    return;
  }
  if (AArch64::GPR32RegClass.contains(DestReg) &&
      AArch64::FPR32RegClass.contains(SrcReg)) {
    BuildMI(MBB, I, DL, get(AArch64::FMOVSWr), DestReg)
        .addReg(SrcReg, getKillRegState(KillSrc));
    return;
  }

  // Copies to/from the NZCV flags go through the system register interface.
  if (DestReg == AArch64::NZCV) {
    assert(AArch64::GPR64RegClass.contains(SrcReg) && "Invalid NZCV copy");
    BuildMI(MBB, I, DL, get(AArch64::MSR))
        .addImm(AArch64SysReg::NZCV)
        .addReg(SrcReg, getKillRegState(KillSrc))
        .addReg(AArch64::NZCV, RegState::Implicit | RegState::Define);
    return;
  }

  if (SrcReg == AArch64::NZCV) {
    assert(AArch64::GPR64RegClass.contains(DestReg) && "Invalid NZCV copy");
    BuildMI(MBB, I, DL, get(AArch64::MRS), DestReg)
        .addImm(AArch64SysReg::NZCV)
        .addReg(AArch64::NZCV, RegState::Implicit | getKillRegState(KillSrc));
    return;
  }

#ifndef NDEBUG
  errs() << RI.getRegAsmName(DestReg) << " = COPY " << RI.getRegAsmName(SrcReg)
         << "\n";
#endif
  llvm_unreachable("unimplemented reg-to-reg copy");
}
6030
                                    MachineBasicBlock::iterator InsertBefore,
                                    const MCInstrDesc &MCID,
                                    Register SrcReg, bool IsKill,
                                    unsigned SubIdx0, unsigned SubIdx1, int FI,
                                    MachineMemOperand *MMO) {
  // Store a register pair to frame index FI with a single store-pair
  // instruction (MCID), addressing SP-relative slot FI at offset 0.
  Register SrcReg0 = SrcReg;
  Register SrcReg1 = SrcReg;
  if (SrcReg.isPhysical()) {
    // Physical registers: resolve both halves now and clear the sub-register
    // indices; virtual registers keep the indices on the operands instead.
    SrcReg0 = TRI.getSubReg(SrcReg, SubIdx0);
    SubIdx0 = 0;
    SrcReg1 = TRI.getSubReg(SrcReg, SubIdx1);
    SubIdx1 = 0;
  }
  BuildMI(MBB, InsertBefore, DebugLoc(), MCID)
      .addReg(SrcReg0, getKillRegState(IsKill), SubIdx0)
      .addReg(SrcReg1, getKillRegState(IsKill), SubIdx1)
      .addFrameIndex(FI)
      .addImm(0)
      .addMemOperand(MMO);
}
6053
                                           Register SrcReg, bool isKill, int FI,
                                           const TargetRegisterClass *RC,
                                           Register VReg,
                                           MachineInstr::MIFlag Flags) const {
  MachineFunction &MF = *MBB.getParent();
  MachineFrameInfo &MFI = MF.getFrameInfo();

  MachineMemOperand *MMO =
                              MFI.getObjectSize(FI), MFI.getObjectAlign(FI));
  // Select the store opcode (and, for SVE classes, the stack ID) from the
  // spill size of RC. `Offset` is cleared for the ST1 forms, which take no
  // immediate offset operand.
  unsigned Opc = 0;
  bool Offset = true;
  unsigned StackID = TargetStackID::Default;
  switch (RI.getSpillSize(*RC)) {
  case 1:
    if (AArch64::FPR8RegClass.hasSubClassEq(RC))
      Opc = AArch64::STRBui;
    break;
  case 2: {
    if (AArch64::FPR16RegClass.hasSubClassEq(RC))
      Opc = AArch64::STRHui;
    else if (AArch64::PNRRegClass.hasSubClassEq(RC) ||
             AArch64::PPRRegClass.hasSubClassEq(RC)) {
      assert(Subtarget.isSVEorStreamingSVEAvailable() &&
             "Unexpected register store without SVE store instructions");
      Opc = AArch64::STR_PXI;
    }
    break;
  }
  case 4:
    if (AArch64::GPR32allRegClass.hasSubClassEq(RC)) {
      Opc = AArch64::STRWui;
      // STRWui can't encode WSP as its source; narrow virtual registers to
      // GPR32 so the allocator never picks it.
      if (SrcReg.isVirtual())
        MF.getRegInfo().constrainRegClass(SrcReg, &AArch64::GPR32RegClass);
      else
        assert(SrcReg != AArch64::WSP);
    } else if (AArch64::FPR32RegClass.hasSubClassEq(RC))
      Opc = AArch64::STRSui;
    else if (AArch64::PPR2RegClass.hasSubClassEq(RC)) {
      Opc = AArch64::STR_PPXI;
    }
    break;
  case 8:
    if (AArch64::GPR64allRegClass.hasSubClassEq(RC)) {
      Opc = AArch64::STRXui;
      if (SrcReg.isVirtual())
        MF.getRegInfo().constrainRegClass(SrcReg, &AArch64::GPR64RegClass);
      else
        assert(SrcReg != AArch64::SP);
    } else if (AArch64::FPR64RegClass.hasSubClassEq(RC)) {
      Opc = AArch64::STRDui;
    } else if (AArch64::WSeqPairsClassRegClass.hasSubClassEq(RC)) {
      // W-register sequential pairs are spilled as a single STP.
               get(AArch64::STPWi), SrcReg, isKill,
               AArch64::sube32, AArch64::subo32, FI, MMO);
      return;
    }
    break;
  case 16:
    if (AArch64::FPR128RegClass.hasSubClassEq(RC))
      Opc = AArch64::STRQui;
    else if (AArch64::DDRegClass.hasSubClassEq(RC)) {
      assert(Subtarget.hasNEON() && "Unexpected register store without NEON");
      Opc = AArch64::ST1Twov1d;
      Offset = false;
    } else if (AArch64::XSeqPairsClassRegClass.hasSubClassEq(RC)) {
      // X-register sequential pairs are spilled as a single STP.
               get(AArch64::STPXi), SrcReg, isKill,
               AArch64::sube64, AArch64::subo64, FI, MMO);
      return;
    } else if (AArch64::ZPRRegClass.hasSubClassEq(RC)) {
      assert(Subtarget.isSVEorStreamingSVEAvailable() &&
             "Unexpected register store without SVE store instructions");
      Opc = AArch64::STR_ZXI;
    }
    break;
  case 24:
    if (AArch64::DDDRegClass.hasSubClassEq(RC)) {
      assert(Subtarget.hasNEON() && "Unexpected register store without NEON");
      Opc = AArch64::ST1Threev1d;
      Offset = false;
    }
    break;
  case 32:
    if (AArch64::DDDDRegClass.hasSubClassEq(RC)) {
      assert(Subtarget.hasNEON() && "Unexpected register store without NEON");
      Opc = AArch64::ST1Fourv1d;
      Offset = false;
    } else if (AArch64::QQRegClass.hasSubClassEq(RC)) {
      assert(Subtarget.hasNEON() && "Unexpected register store without NEON");
      Opc = AArch64::ST1Twov2d;
      Offset = false;
    } else if (AArch64::ZPR2StridedOrContiguousRegClass.hasSubClassEq(RC)) {
      assert(Subtarget.isSVEorStreamingSVEAvailable() &&
             "Unexpected register store without SVE store instructions");
      Opc = AArch64::STR_ZZXI_STRIDED_CONTIGUOUS;
    } else if (AArch64::ZPR2RegClass.hasSubClassEq(RC)) {
      assert(Subtarget.isSVEorStreamingSVEAvailable() &&
             "Unexpected register store without SVE store instructions");
      Opc = AArch64::STR_ZZXI;
    }
    break;
  case 48:
    if (AArch64::QQQRegClass.hasSubClassEq(RC)) {
      assert(Subtarget.hasNEON() && "Unexpected register store without NEON");
      Opc = AArch64::ST1Threev2d;
      Offset = false;
    } else if (AArch64::ZPR3RegClass.hasSubClassEq(RC)) {
      assert(Subtarget.isSVEorStreamingSVEAvailable() &&
             "Unexpected register store without SVE store instructions");
      Opc = AArch64::STR_ZZZXI;
    }
    break;
  case 64:
    if (AArch64::QQQQRegClass.hasSubClassEq(RC)) {
      assert(Subtarget.hasNEON() && "Unexpected register store without NEON");
      Opc = AArch64::ST1Fourv2d;
      Offset = false;
    } else if (AArch64::ZPR4StridedOrContiguousRegClass.hasSubClassEq(RC)) {
      assert(Subtarget.isSVEorStreamingSVEAvailable() &&
             "Unexpected register store without SVE store instructions");
      Opc = AArch64::STR_ZZZZXI_STRIDED_CONTIGUOUS;
    } else if (AArch64::ZPR4RegClass.hasSubClassEq(RC)) {
      assert(Subtarget.isSVEorStreamingSVEAvailable() &&
             "Unexpected register store without SVE store instructions");
      Opc = AArch64::STR_ZZZZXI;
    }
    break;
  }
  assert(Opc && "Unknown register class");
  MFI.setStackID(FI, StackID);

      .addReg(SrcReg, getKillRegState(isKill))
      .addFrameIndex(FI);

  // ST1 variants take no immediate offset operand.
  if (Offset)
    MI.addImm(0);
  // NOTE(review): PNRReg appears to be set for predicate-as-counter spills
  // (declaration not visible in this view) — confirm against the full source.
  if (PNRReg.isValid())
    MI.addDef(PNRReg, RegState::Implicit);
  MI.addMemOperand(MMO);
}
6208
                                     MachineBasicBlock::iterator InsertBefore,
                                     const MCInstrDesc &MCID,
                                     Register DestReg, unsigned SubIdx0,
                                     unsigned SubIdx1, int FI,
                                     MachineMemOperand *MMO) {
  // Reload a register pair from frame index FI using the load-pair opcode in
  // MCID, defining the two halves via SubIdx0/SubIdx1.
  Register DestReg0 = DestReg;
  Register DestReg1 = DestReg;
  bool IsUndef = true;
  if (DestReg.isPhysical()) {
    // Resolve the sub-registers up front and clear the indices so addReg()
    // doesn't apply them again; physical defs are not marked undef.
    DestReg0 = TRI.getSubReg(DestReg, SubIdx0);
    SubIdx0 = 0;
    DestReg1 = TRI.getSubReg(DestReg, SubIdx1);
    SubIdx1 = 0;
    IsUndef = false;
  }
  // Load-pair of both halves from FI at offset 0.
  BuildMI(MBB, InsertBefore, DebugLoc(), MCID)
      .addReg(DestReg0, RegState::Define | getUndefRegState(IsUndef), SubIdx0)
      .addReg(DestReg1, RegState::Define | getUndefRegState(IsUndef), SubIdx1)
      .addFrameIndex(FI)
      .addImm(0)
      .addMemOperand(MMO);
}
6233
                                            Register DestReg, int FI,
                                            const TargetRegisterClass *RC,
                                            Register VReg, unsigned SubReg,
                                            MachineInstr::MIFlag Flags) const {
  MachineFunction &MF = *MBB.getParent();
  MachineFrameInfo &MFI = MF.getFrameInfo();
  MachineMemOperand *MMO =
                              MFI.getObjectSize(FI), MFI.getObjectAlign(FI));

  // Select the load opcode (and, for SVE classes, the stack ID) from the
  // spill size of RC. `Offset` is cleared for the LD1 forms, which take no
  // immediate offset operand.
  unsigned Opc = 0;
  bool Offset = true;
  unsigned StackID = TargetStackID::Default;
  switch (TRI.getSpillSize(*RC)) {
  case 1:
    if (AArch64::FPR8RegClass.hasSubClassEq(RC))
      Opc = AArch64::LDRBui;
    break;
  case 2: {
    bool IsPNR = AArch64::PNRRegClass.hasSubClassEq(RC);
    if (AArch64::FPR16RegClass.hasSubClassEq(RC))
      Opc = AArch64::LDRHui;
    else if (IsPNR || AArch64::PPRRegClass.hasSubClassEq(RC)) {
      assert(Subtarget.isSVEorStreamingSVEAvailable() &&
             "Unexpected register load without SVE load instructions");
      // Predicate-as-counter registers are reloaded via LDR_PXI; remember the
      // PNR so it can be added as an implicit def below.
      if (IsPNR)
        PNRReg = DestReg;
      Opc = AArch64::LDR_PXI;
    }
    break;
  }
  case 4:
    if (AArch64::GPR32allRegClass.hasSubClassEq(RC)) {
      Opc = AArch64::LDRWui;
      // LDRWui can't encode WSP as its destination; narrow virtual registers
      // to GPR32 so the allocator never picks it.
      if (DestReg.isVirtual())
        MF.getRegInfo().constrainRegClass(DestReg, &AArch64::GPR32RegClass);
      else
        assert(DestReg != AArch64::WSP);
    } else if (AArch64::FPR32RegClass.hasSubClassEq(RC))
      Opc = AArch64::LDRSui;
    else if (AArch64::PPR2RegClass.hasSubClassEq(RC)) {
      Opc = AArch64::LDR_PPXI;
    }
    break;
  case 8:
    if (AArch64::GPR64allRegClass.hasSubClassEq(RC)) {
      Opc = AArch64::LDRXui;
      if (DestReg.isVirtual())
        MF.getRegInfo().constrainRegClass(DestReg, &AArch64::GPR64RegClass);
      else
        assert(DestReg != AArch64::SP);
    } else if (AArch64::FPR64RegClass.hasSubClassEq(RC)) {
      Opc = AArch64::LDRDui;
    } else if (AArch64::WSeqPairsClassRegClass.hasSubClassEq(RC)) {
      // W-register sequential pairs are reloaded as a single LDP.
               get(AArch64::LDPWi), DestReg, AArch64::sube32,
               AArch64::subo32, FI, MMO);
      return;
    }
    break;
  case 16:
    if (AArch64::FPR128RegClass.hasSubClassEq(RC))
      Opc = AArch64::LDRQui;
    else if (AArch64::DDRegClass.hasSubClassEq(RC)) {
      assert(Subtarget.hasNEON() && "Unexpected register load without NEON");
      Opc = AArch64::LD1Twov1d;
      Offset = false;
    } else if (AArch64::XSeqPairsClassRegClass.hasSubClassEq(RC)) {
      // X-register sequential pairs are reloaded as a single LDP.
               get(AArch64::LDPXi), DestReg, AArch64::sube64,
               AArch64::subo64, FI, MMO);
      return;
    } else if (AArch64::ZPRRegClass.hasSubClassEq(RC)) {
      assert(Subtarget.isSVEorStreamingSVEAvailable() &&
             "Unexpected register load without SVE load instructions");
      Opc = AArch64::LDR_ZXI;
    }
    break;
  case 24:
    if (AArch64::DDDRegClass.hasSubClassEq(RC)) {
      assert(Subtarget.hasNEON() && "Unexpected register load without NEON");
      Opc = AArch64::LD1Threev1d;
      Offset = false;
    }
    break;
  case 32:
    if (AArch64::DDDDRegClass.hasSubClassEq(RC)) {
      assert(Subtarget.hasNEON() && "Unexpected register load without NEON");
      Opc = AArch64::LD1Fourv1d;
      Offset = false;
    } else if (AArch64::QQRegClass.hasSubClassEq(RC)) {
      assert(Subtarget.hasNEON() && "Unexpected register load without NEON");
      Opc = AArch64::LD1Twov2d;
      Offset = false;
    } else if (AArch64::ZPR2StridedOrContiguousRegClass.hasSubClassEq(RC)) {
      assert(Subtarget.isSVEorStreamingSVEAvailable() &&
             "Unexpected register load without SVE load instructions");
      Opc = AArch64::LDR_ZZXI_STRIDED_CONTIGUOUS;
    } else if (AArch64::ZPR2RegClass.hasSubClassEq(RC)) {
      assert(Subtarget.isSVEorStreamingSVEAvailable() &&
             "Unexpected register load without SVE load instructions");
      Opc = AArch64::LDR_ZZXI;
    }
    break;
  case 48:
    if (AArch64::QQQRegClass.hasSubClassEq(RC)) {
      assert(Subtarget.hasNEON() && "Unexpected register load without NEON");
      Opc = AArch64::LD1Threev2d;
      Offset = false;
    } else if (AArch64::ZPR3RegClass.hasSubClassEq(RC)) {
      assert(Subtarget.isSVEorStreamingSVEAvailable() &&
             "Unexpected register load without SVE load instructions");
      Opc = AArch64::LDR_ZZZXI;
    }
    break;
  case 64:
    if (AArch64::QQQQRegClass.hasSubClassEq(RC)) {
      assert(Subtarget.hasNEON() && "Unexpected register load without NEON");
      Opc = AArch64::LD1Fourv2d;
      Offset = false;
    } else if (AArch64::ZPR4StridedOrContiguousRegClass.hasSubClassEq(RC)) {
      assert(Subtarget.isSVEorStreamingSVEAvailable() &&
             "Unexpected register load without SVE load instructions");
      Opc = AArch64::LDR_ZZZZXI_STRIDED_CONTIGUOUS;
    } else if (AArch64::ZPR4RegClass.hasSubClassEq(RC)) {
      assert(Subtarget.isSVEorStreamingSVEAvailable() &&
             "Unexpected register load without SVE load instructions");
      Opc = AArch64::LDR_ZZZZXI;
    }
    break;
  }

  assert(Opc && "Unknown register class");
  MFI.setStackID(FI, StackID);

      .addReg(DestReg, getDefRegState(true))
      .addFrameIndex(FI);
  // LD1 variants take no immediate offset operand.
  if (Offset)
    MI.addImm(0);
  // Only physical PNR registers get the extra implicit def here.
  if (PNRReg.isValid() && !PNRReg.isVirtual())
    MI.addDef(PNRReg, RegState::Implicit);
  MI.addMemOperand(MMO);
}
6390
                                           const MachineInstr &UseMI,
                                           const TargetRegisterInfo *TRI) {
  // Scan the instructions strictly between DefMI and UseMI (skipping debug
  // instructions) and report whether any of them reads or clobbers NZCV.
  return any_of(instructionsWithoutDebug(std::next(DefMI.getIterator()),
                                         UseMI.getIterator()),
                [TRI](const MachineInstr &I) {
                  return I.modifiesRegister(AArch64::NZCV, TRI) ||
                         I.readsRegister(AArch64::NZCV, TRI);
                });
}
6401
6402void AArch64InstrInfo::decomposeStackOffsetForDwarfOffsets(
6403 const StackOffset &Offset, int64_t &ByteSized, int64_t &VGSized) {
6404 // The smallest scalable element supported by scaled SVE addressing
6405 // modes are predicates, which are 2 scalable bytes in size. So the scalable
6406 // byte offset must always be a multiple of 2.
6407 assert(Offset.getScalable() % 2 == 0 && "Invalid frame offset");
6408
6409 // VGSized offsets are divided by '2', because the VG register is the
6410 // the number of 64bit granules as opposed to 128bit vector chunks,
6411 // which is how the 'n' in e.g. MVT::nxv1i8 is modelled.
6412 // So, for a stack offset of 16 MVT::nxv1i8's, the size is n x 16 bytes.
6413 // VG = n * 2 and the dwarf offset must be VG * 8 bytes.
6414 ByteSized = Offset.getFixed();
6415 VGSized = Offset.getScalable() / 2;
6416}
6417
6418/// Returns the offset in parts to which this frame offset can be
6419/// decomposed for the purpose of describing a frame offset.
6420/// For non-scalable offsets this is simply its byte size.
6421void AArch64InstrInfo::decomposeStackOffsetForFrameOffsets(
6422 const StackOffset &Offset, int64_t &NumBytes, int64_t &NumPredicateVectors,
6423 int64_t &NumDataVectors) {
6424 // The smallest scalable element supported by scaled SVE addressing
6425 // modes are predicates, which are 2 scalable bytes in size. So the scalable
6426 // byte offset must always be a multiple of 2.
6427 assert(Offset.getScalable() % 2 == 0 && "Invalid frame offset");
6428
6429 NumBytes = Offset.getFixed();
6430 NumDataVectors = 0;
6431 NumPredicateVectors = Offset.getScalable() / 2;
6432 // This method is used to get the offsets to adjust the frame offset.
6433 // If the function requires ADDPL to be used and needs more than two ADDPL
6434 // instructions, part of the offset is folded into NumDataVectors so that it
6435 // uses ADDVL for part of it, reducing the number of ADDPL instructions.
6436 if (NumPredicateVectors % 8 == 0 || NumPredicateVectors < -64 ||
6437 NumPredicateVectors > 62) {
6438 NumDataVectors = NumPredicateVectors / 8;
6439 NumPredicateVectors -= NumDataVectors * 8;
6440 }
6441}
6442
// Convenience function to create a DWARF expression for: Constant `Operation`.
// This helper emits compact sequences for common cases. For example, for
// `-15 DW_OP_plus`, this helper would create DW_OP_lit15 DW_OP_minus.
  // Negative addend within literal range: "lit(-Constant) minus" is shorter
  // than a multi-byte signed constant followed by plus.
  if (Operation == dwarf::DW_OP_plus && Constant < 0 && -Constant <= 31) {
    // -Constant (1 to 31)
    Expr.push_back(dwarf::DW_OP_lit0 - Constant);
    Operation = dwarf::DW_OP_minus;
  } else if (Constant >= 0 && Constant <= 31) {
    // Literal value 0 to 31
    Expr.push_back(dwarf::DW_OP_lit0 + Constant);
  } else {
    // Signed constant
    Expr.push_back(dwarf::DW_OP_consts);
  }
  // Finally append the operation itself.
  return Expr.push_back(Operation);
}
6462
// Convenience function to create a DWARF expression for a register.
static void appendReadRegExpr(SmallVectorImpl<char> &Expr, unsigned RegNum) {
  // DW_OP_bregx pushes reg + offset; an offset of 0 yields the register value.
  Expr.push_back((char)dwarf::DW_OP_bregx);
  Expr.push_back(0);
}
6469
// Convenience function to create a DWARF expression for loading a register from
// a CFA offset.
                              int64_t OffsetFromDefCFA) {
  // This assumes the top of the DWARF stack contains the CFA.
  Expr.push_back(dwarf::DW_OP_dup);
  // Add the offset to the register.
  appendConstantExpr(Expr, OffsetFromDefCFA, dwarf::DW_OP_plus);
  // Dereference the address (loads a 64-bit value).
  Expr.push_back(dwarf::DW_OP_deref);
}
6481
6482// Convenience function to create a comment for
6483// (+/-) NumBytes (* RegScale)?
6484static void appendOffsetComment(int NumBytes, llvm::raw_string_ostream &Comment,
6485 StringRef RegScale = {}) {
6486 if (NumBytes) {
6487 Comment << (NumBytes < 0 ? " - " : " + ") << std::abs(NumBytes);
6488 if (!RegScale.empty())
6489 Comment << ' ' << RegScale;
6490 }
6491}
6492
// Creates an MCCFIInstruction:
//   { DW_CFA_def_cfa_expression, ULEB128 (sizeof expr), expr }
                                               unsigned Reg,
                                               const StackOffset &Offset) {
  int64_t NumBytes, NumVGScaledBytes;
  AArch64InstrInfo::decomposeStackOffsetForDwarfOffsets(Offset, NumBytes,
                                                        NumVGScaledBytes);
  // Human-readable comment attached to the CFI escape for asm output.
  std::string CommentBuffer;
  llvm::raw_string_ostream Comment(CommentBuffer);

  if (Reg == AArch64::SP)
    Comment << "sp";
  else if (Reg == AArch64::FP)
    Comment << "fp";
  else
    Comment << printReg(Reg, &TRI);

  // Build up the expression (Reg + NumBytes + VG * NumVGScaledBytes)
  SmallString<64> Expr;
  unsigned DwarfReg = TRI.getDwarfRegNum(Reg, true);
  // DW_OP_breg0..breg31 only cover register numbers 0..31.
  assert(DwarfReg <= 31 && "DwarfReg out of bounds (0..31)");
  // Reg + NumBytes
  Expr.push_back(dwarf::DW_OP_breg0 + DwarfReg);
  appendLEB128<LEB128Sign::Signed>(Expr, NumBytes);
  appendOffsetComment(NumBytes, Comment);
  if (NumVGScaledBytes) {
    // + VG * NumVGScaledBytes
    appendOffsetComment(NumVGScaledBytes, Comment, "* VG");
    appendReadRegExpr(Expr, TRI.getDwarfRegNum(AArch64::VG, true));
    appendConstantExpr(Expr, NumVGScaledBytes, dwarf::DW_OP_mul);
    Expr.push_back(dwarf::DW_OP_plus);
  }

  // Wrap this into DW_CFA_def_cfa.
  SmallString<64> DefCfaExpr;
  DefCfaExpr.push_back(dwarf::DW_CFA_def_cfa_expression);
  appendLEB128<LEB128Sign::Unsigned>(DefCfaExpr, Expr.size());
  DefCfaExpr.append(Expr.str());
  return MCCFIInstruction::createEscape(nullptr, DefCfaExpr.str(), SMLoc(),
                                        Comment.str());
}
6535
                                    unsigned FrameReg, unsigned Reg,
                                    const StackOffset &Offset,
                                    bool LastAdjustmentWasScalable) {
  // A scalable component can't be expressed as a plain register+offset CFA
  // rule; emit a full DWARF expression instead.
  if (Offset.getScalable())
    return createDefCFAExpression(TRI, Reg, Offset);

  // Same CFA register as before: only the offset needs updating.
  if (FrameReg == Reg && !LastAdjustmentWasScalable)
    return MCCFIInstruction::cfiDefCfaOffset(nullptr, int(Offset.getFixed()));

  unsigned DwarfReg = TRI.getDwarfRegNum(Reg, true);
  return MCCFIInstruction::cfiDefCfa(nullptr, DwarfReg, (int)Offset.getFixed());
}
6549
                const StackOffset &OffsetFromDefCFA,
                std::optional<int64_t> IncomingVGOffsetFromDefCFA) {
  int64_t NumBytes, NumVGScaledBytes;
  AArch64InstrInfo::decomposeStackOffsetForDwarfOffsets(
      OffsetFromDefCFA, NumBytes, NumVGScaledBytes);

  unsigned DwarfReg = TRI.getDwarfRegNum(Reg, true);

  // Non-scalable offsets can use DW_CFA_offset directly.
  if (!NumVGScaledBytes)
    return MCCFIInstruction::createOffset(nullptr, DwarfReg, NumBytes);

  // Human-readable comment attached to the CFI escape for asm output.
  std::string CommentBuffer;
  llvm::raw_string_ostream Comment(CommentBuffer);
  Comment << printReg(Reg, &TRI) << " @ cfa";

  // Build up expression (CFA + VG * NumVGScaledBytes + NumBytes)
  assert(NumVGScaledBytes && "Expected scalable offset");
  SmallString<64> OffsetExpr;
  // + VG * NumVGScaledBytes
  StringRef VGRegScale;
  // Either reload the incoming VG from its spill slot relative to the CFA, or
  // read the live VG register directly.
  if (IncomingVGOffsetFromDefCFA) {
    appendLoadRegExpr(OffsetExpr, *IncomingVGOffsetFromDefCFA);
    VGRegScale = "* IncomingVG";
  } else {
    appendReadRegExpr(OffsetExpr, TRI.getDwarfRegNum(AArch64::VG, true));
    VGRegScale = "* VG";
  }
  appendConstantExpr(OffsetExpr, NumVGScaledBytes, dwarf::DW_OP_mul);
  appendOffsetComment(NumVGScaledBytes, Comment, VGRegScale);
  OffsetExpr.push_back(dwarf::DW_OP_plus);
  if (NumBytes) {
    // + NumBytes
    appendOffsetComment(NumBytes, Comment);
    appendConstantExpr(OffsetExpr, NumBytes, dwarf::DW_OP_plus);
  }

  // Wrap this into DW_CFA_expression
  SmallString<64> CfaExpr;
  CfaExpr.push_back(dwarf::DW_CFA_expression);
  appendLEB128<LEB128Sign::Unsigned>(CfaExpr, DwarfReg);
  appendLEB128<LEB128Sign::Unsigned>(CfaExpr, OffsetExpr.size());
  CfaExpr.append(OffsetExpr.str());

  return MCCFIInstruction::createEscape(nullptr, CfaExpr.str(), SMLoc(),
                                        Comment.str());
}
6599
// Helper function to emit a frame offset adjustment from a given
// pointer (SrcReg), stored into DestReg. This function is explicit
// in that it requires the opcode.
                               const DebugLoc &DL, unsigned DestReg,
                               unsigned SrcReg, int64_t Offset, unsigned Opc,
                               const TargetInstrInfo *TII,
                               MachineInstr::MIFlag Flag, bool NeedsWinCFI,
                               bool *HasWinCFI, bool EmitCFAOffset,
                               StackOffset CFAOffset, unsigned FrameReg) {
  int Sign = 1;
  unsigned MaxEncoding, ShiftSize;
  switch (Opc) {
  case AArch64::ADDXri:
  case AArch64::ADDSXri:
  case AArch64::SUBXri:
  case AArch64::SUBSXri:
    // 12-bit unsigned immediate, optionally shifted left by 12.
    MaxEncoding = 0xfff;
    ShiftSize = 12;
    break;
  case AArch64::ADDVL_XXI:
  case AArch64::ADDPL_XXI:
  case AArch64::ADDSVL_XXI:
  case AArch64::ADDSPL_XXI:
    // Scalable adjustments: fold a negative offset into Sign so the chunking
    // loop below only deals with positive magnitudes.
    MaxEncoding = 31;
    ShiftSize = 0;
    if (Offset < 0) {
      MaxEncoding = 32;
      Sign = -1;
      Offset = -Offset;
    }
    break;
  default:
    llvm_unreachable("Unsupported opcode");
  }

  // `Offset` can be in bytes or in "scalable bytes".
  int VScale = 1;
  if (Opc == AArch64::ADDVL_XXI || Opc == AArch64::ADDSVL_XXI)
    VScale = 16;
  else if (Opc == AArch64::ADDPL_XXI || Opc == AArch64::ADDSPL_XXI)
    VScale = 2;

  // FIXME: If the offset won't fit in 24-bits, compute the offset into a
  // scratch register. If DestReg is a virtual register, use it as the
  // scratch register; otherwise, create a new virtual register (to be
  // replaced by the scavenger at the end of PEI). That case can be optimized
  // slightly if DestReg is SP which is always 16-byte aligned, so the scratch
  // register can be loaded with offset%8 and the add/sub can use an extending
  // instruction with LSL#3.
  // Currently the function handles any offsets but generates a poor sequence
  // of code.
  // assert(Offset < (1 << 24) && "unimplemented reg plus immediate");

  const unsigned MaxEncodableValue = MaxEncoding << ShiftSize;
  Register TmpReg = DestReg;
  // XZR can't hold an intermediate result; use a fresh virtual register
  // instead (resolved after PEI).
  if (TmpReg == AArch64::XZR)
    TmpReg = MBB.getParent()->getRegInfo().createVirtualRegister(
        &AArch64::GPR64RegClass);
  do {
    // Emit the largest encodable chunk of the remaining offset.
    uint64_t ThisVal = std::min<uint64_t>(Offset, MaxEncodableValue);
    unsigned LocalShiftSize = 0;
    if (ThisVal > MaxEncoding) {
      ThisVal = ThisVal >> ShiftSize;
      LocalShiftSize = ShiftSize;
    }
    assert((ThisVal >> ShiftSize) <= MaxEncoding &&
           "Encoding cannot handle value that big");

    Offset -= ThisVal << LocalShiftSize;
    // The final instruction in the chain writes the real destination.
    if (Offset == 0)
      TmpReg = DestReg;
    auto MBI = BuildMI(MBB, MBBI, DL, TII->get(Opc), TmpReg)
                   .addReg(SrcReg)
                   .addImm(Sign * (int)ThisVal);
    if (ShiftSize)
      MBI = MBI.addImm(
    MBI = MBI.setMIFlag(Flag);

    // Track how this chunk moves the CFA, in fixed or scalable units.
    auto Change =
        VScale == 1
            ? StackOffset::getFixed(ThisVal << LocalShiftSize)
            : StackOffset::getScalable(VScale * (ThisVal << LocalShiftSize));
    if (Sign == -1 || Opc == AArch64::SUBXri || Opc == AArch64::SUBSXri)
      CFAOffset += Change;
    else
      CFAOffset -= Change;
    if (EmitCFAOffset && DestReg == TmpReg) {
      MachineFunction &MF = *MBB.getParent();
      const TargetSubtargetInfo &STI = MF.getSubtarget();
      const TargetRegisterInfo &TRI = *STI.getRegisterInfo();

      unsigned CFIIndex = MF.addFrameInst(
          createDefCFA(TRI, FrameReg, DestReg, CFAOffset, VScale != 1));
      BuildMI(MBB, MBBI, DL, TII->get(TargetOpcode::CFI_INSTRUCTION))
          .addCFIIndex(CFIIndex)
          .setMIFlags(Flag);
    }

    // Emit matching Windows unwind (SEH) directives where required.
    if (NeedsWinCFI) {
      int Imm = (int)(ThisVal << LocalShiftSize);
      if (VScale != 1 && DestReg == AArch64::SP) {
        if (HasWinCFI)
          *HasWinCFI = true;
        BuildMI(MBB, MBBI, DL, TII->get(AArch64::SEH_AllocZ))
            .addImm(ThisVal)
            .setMIFlag(Flag);
      } else if ((DestReg == AArch64::FP && SrcReg == AArch64::SP) ||
                 (SrcReg == AArch64::FP && DestReg == AArch64::SP)) {
        assert(VScale == 1 && "Expected non-scalable operation");
        if (HasWinCFI)
          *HasWinCFI = true;
        if (Imm == 0)
          BuildMI(MBB, MBBI, DL, TII->get(AArch64::SEH_SetFP)).setMIFlag(Flag);
        else
          BuildMI(MBB, MBBI, DL, TII->get(AArch64::SEH_AddFP))
              .addImm(Imm)
              .setMIFlag(Flag);
        assert(Offset == 0 && "Expected remaining offset to be zero to "
                              "emit a single SEH directive");
      } else if (DestReg == AArch64::SP) {
        assert(VScale == 1 && "Expected non-scalable operation");
        if (HasWinCFI)
          *HasWinCFI = true;
        assert(SrcReg == AArch64::SP && "Unexpected SrcReg for SEH_StackAlloc");
        BuildMI(MBB, MBBI, DL, TII->get(AArch64::SEH_StackAlloc))
            .addImm(Imm)
            .setMIFlag(Flag);
      }
    }

    // Chain any subsequent chunk off the value just computed.
    SrcReg = TmpReg;
  } while (Offset);
}
6736
                           unsigned DestReg, unsigned SrcReg,
                           MachineInstr::MIFlag Flag, bool SetNZCV,
                           bool NeedsWinCFI, bool *HasWinCFI,
                           bool EmitCFAOffset, StackOffset CFAOffset,
                           unsigned FrameReg) {
  // If a function is marked as arm_locally_streaming, then the runtime value
  // of vscale in the prologue/epilogue is different from the runtime value of
  // vscale in the function's body. To avoid having to consider multiple
  // vscales, we can use `addsvl` to allocate any scalable stack-slots, which
  // under most circumstances will be only locals, not callee-save slots.
  const Function &F = MBB.getParent()->getFunction();
  bool UseSVL = F.hasFnAttribute("aarch64_pstate_sm_body");

  int64_t Bytes, NumPredicateVectors, NumDataVectors;
  AArch64InstrInfo::decomposeStackOffsetForFrameOffsets(
      Offset, Bytes, NumPredicateVectors, NumDataVectors);

  // Insert ADDSXri for scalable offset at the end.
  bool NeedsFinalDefNZCV = SetNZCV && (NumPredicateVectors || NumDataVectors);
  if (NeedsFinalDefNZCV)
    SetNZCV = false;

  // First emit non-scalable frame offsets, or a simple 'mov'.
  if (Bytes || (!Offset && SrcReg != DestReg)) {
    assert((DestReg != AArch64::SP || Bytes % 8 == 0) &&
           "SP increment/decrement not 8-byte aligned");
    unsigned Opc = SetNZCV ? AArch64::ADDSXri : AArch64::ADDXri;
    if (Bytes < 0) {
      Bytes = -Bytes;
      Opc = SetNZCV ? AArch64::SUBSXri : AArch64::SUBXri;
    }
    emitFrameOffsetAdj(MBB, MBBI, DL, DestReg, SrcReg, Bytes, Opc, TII, Flag,
                       NeedsWinCFI, HasWinCFI, EmitCFAOffset, CFAOffset,
                       FrameReg);
    // Track the running CFA offset for the scalable adjustments below.
    CFAOffset += (Opc == AArch64::ADDXri || Opc == AArch64::ADDSXri)
                     ? StackOffset::getFixed(-Bytes)
                     : StackOffset::getFixed(Bytes);
    SrcReg = DestReg;
    FrameReg = DestReg;
  }

  assert(!(NeedsWinCFI && NumPredicateVectors) &&
         "WinCFI can't allocate fractions of an SVE data vector");

  // Whole SVE data vectors: ADDVL, or ADDSVL in locally-streaming bodies.
  if (NumDataVectors) {
    emitFrameOffsetAdj(MBB, MBBI, DL, DestReg, SrcReg, NumDataVectors,
                       UseSVL ? AArch64::ADDSVL_XXI : AArch64::ADDVL_XXI, TII,
                       Flag, NeedsWinCFI, HasWinCFI, EmitCFAOffset, CFAOffset,
                       FrameReg);
    CFAOffset += StackOffset::getScalable(-NumDataVectors * 16);
    SrcReg = DestReg;
  }

  // Remaining predicate-sized part: ADDPL, or ADDSPL when locally streaming.
  if (NumPredicateVectors) {
    assert(DestReg != AArch64::SP && "Unaligned access to SP");
    emitFrameOffsetAdj(MBB, MBBI, DL, DestReg, SrcReg, NumPredicateVectors,
                       UseSVL ? AArch64::ADDSPL_XXI : AArch64::ADDPL_XXI, TII,
                       Flag, NeedsWinCFI, HasWinCFI, EmitCFAOffset, CFAOffset,
                       FrameReg);
  }

  // If NZCV was requested and a scalable adjustment was emitted, materialize
  // the flags with a final "adds DestReg, DestReg, #0".
  if (NeedsFinalDefNZCV)
    BuildMI(MBB, MBBI, DL, TII->get(AArch64::ADDSXri), DestReg)
        .addReg(DestReg)
        .addImm(0)
        .addImm(0);
}
6807
    MachineBasicBlock::iterator InsertPt, int FrameIndex,
    LiveIntervals *LIS, VirtRegMap *VRM) const {
  // This is a bit of a hack. Consider this instruction:
  //
  //   %0 = COPY %sp; GPR64all:%0
  //
  // We explicitly chose GPR64all for the virtual register so such a copy might
  // be eliminated by RegisterCoalescer. However, that may not be possible, and
  // %0 may even spill. We can't spill %sp, and since it is in the GPR64all
  // register class, TargetInstrInfo::foldMemoryOperand() is going to try.
  //
  // To prevent that, we are going to constrain the %0 register class here.
  if (MI.isFullCopy()) {
    Register DstReg = MI.getOperand(0).getReg();
    Register SrcReg = MI.getOperand(1).getReg();
    if (SrcReg == AArch64::SP && DstReg.isVirtual()) {
      MF.getRegInfo().constrainRegClass(DstReg, &AArch64::GPR64RegClass);
      return nullptr;
    }
    if (DstReg == AArch64::SP && SrcReg.isVirtual()) {
      MF.getRegInfo().constrainRegClass(SrcReg, &AArch64::GPR64RegClass);
      return nullptr;
    }
    // Nothing can be folded with a copy from/to NZCV.
    if (SrcReg == AArch64::NZCV || DstReg == AArch64::NZCV)
      return nullptr;
  }

  // Handle the case where a copy is being spilled or filled but the source
  // and destination register class don't match. For example:
  //
  //   %0 = COPY %xzr; GPR64common:%0
  //
  // In this case we can still safely fold away the COPY and generate the
  // following spill code:
  //
  //   STRXui %xzr, %stack.0
  //
  // This also eliminates spilled cross register class COPYs (e.g. between x and
  // d regs) of the same size. For example:
  //
  //   %0 = COPY %1; GPR64:%0, FPR64:%1
  //
  // will be filled as
  //
  //   LDRDui %0, fi<#0>
  //
  // instead of
  //
  //   LDRXui %Temp, fi<#0>
  //   %0 = FMOV %Temp
  //
  if (MI.isCopy() && Ops.size() == 1 &&
      // Make sure we're only folding the explicit COPY defs/uses.
      (Ops[0] == 0 || Ops[0] == 1)) {
    bool IsSpill = Ops[0] == 0;
    bool IsFill = !IsSpill;
    const MachineRegisterInfo &MRI = MF.getRegInfo();
    MachineBasicBlock &MBB = *MI.getParent();
    const MachineOperand &DstMO = MI.getOperand(0);
    const MachineOperand &SrcMO = MI.getOperand(1);
    Register DstReg = DstMO.getReg();
    Register SrcReg = SrcMO.getReg();
    // This is slightly expensive to compute for physical regs since
    // getMinimalPhysRegClass is slow.
    auto getRegClass = [&](unsigned Reg) {
      return Register::isVirtualRegister(Reg) ? MRI.getRegClass(Reg)
                                              : TRI.getMinimalPhysRegClass(Reg);
    };

    // Full-width copy: replace the COPY with a direct store of the source (on
    // spill) or load into the destination (on fill).
    if (DstMO.getSubReg() == 0 && SrcMO.getSubReg() == 0) {
      assert(TRI.getRegSizeInBits(*getRegClass(DstReg)) ==
                 TRI.getRegSizeInBits(*getRegClass(SrcReg)) &&
             "Mismatched register size in non subreg COPY");
      if (IsSpill)
        storeRegToStackSlot(MBB, InsertPt, SrcReg, SrcMO.isKill(), FrameIndex,
                            getRegClass(SrcReg), Register());
      else
        loadRegFromStackSlot(MBB, InsertPt, DstReg, FrameIndex,
                             getRegClass(DstReg), Register());
      return &*--InsertPt;
    }

    // Handle cases like spilling def of:
    //
    //   %0:sub_32<def,read-undef> = COPY %wzr; GPR64common:%0
    //
    // where the physical register source can be widened and stored to the full
    // virtual reg destination stack slot, in this case producing:
    //
    //   STRXui %xzr, %stack.0
    //
    if (IsSpill && DstMO.isUndef() && SrcReg == AArch64::WZR &&
        TRI.getRegSizeInBits(*getRegClass(DstReg)) == 64) {
      assert(SrcMO.getSubReg() == 0 &&
             "Unexpected subreg on physical register");
      storeRegToStackSlot(MBB, InsertPt, AArch64::XZR, SrcMO.isKill(),
                          FrameIndex, &AArch64::GPR64RegClass, Register());
      return &*--InsertPt;
    }

    // Handle cases like filling use of:
    //
    //   %0:sub_32<def,read-undef> = COPY %1; GPR64:%0, GPR32:%1
    //
    // where we can load the full virtual reg source stack slot, into the subreg
    // destination, in this case producing:
    //
    //   LDRWui %0:sub_32<def,read-undef>, %stack.0
    //
    if (IsFill && SrcMO.getSubReg() == 0 && DstMO.isUndef()) {
      const TargetRegisterClass *FillRC = nullptr;
      switch (DstMO.getSubReg()) {
      default:
        break;
      case AArch64::sub_32:
        if (AArch64::GPR64RegClass.hasSubClassEq(getRegClass(DstReg)))
          FillRC = &AArch64::GPR32RegClass;
        break;
      case AArch64::ssub:
        FillRC = &AArch64::FPR32RegClass;
        break;
      case AArch64::dsub:
        FillRC = &AArch64::FPR64RegClass;
        break;
      }

      if (FillRC) {
        assert(TRI.getRegSizeInBits(*getRegClass(SrcReg)) ==
                   TRI.getRegSizeInBits(*FillRC) &&
               "Mismatched regclass size on folded subreg COPY");
        loadRegFromStackSlot(MBB, InsertPt, DstReg, FrameIndex, FillRC,
                             Register());
        // Retarget the load at the subregister the COPY was defining.
        MachineInstr &LoadMI = *--InsertPt;
        MachineOperand &LoadDst = LoadMI.getOperand(0);
        assert(LoadDst.getSubReg() == 0 && "unexpected subreg on fill load");
        LoadDst.setSubReg(DstMO.getSubReg());
        LoadDst.setIsUndef();
        return &LoadMI;
      }
    }
  }

  // Cannot fold.
  return nullptr;
}
6957
6959 StackOffset &SOffset,
6960 bool *OutUseUnscaledOp,
6961 unsigned *OutUnscaledOp,
6962 int64_t *EmittableOffset) {
6963 // Set output values in case of early exit.
6964 if (EmittableOffset)
6965 *EmittableOffset = 0;
6966 if (OutUseUnscaledOp)
6967 *OutUseUnscaledOp = false;
6968 if (OutUnscaledOp)
6969 *OutUnscaledOp = 0;
6970
6971 // Exit early for structured vector spills/fills as they can't take an
6972 // immediate offset.
6973 switch (MI.getOpcode()) {
6974 default:
6975 break;
6976 case AArch64::LD1Rv1d:
6977 case AArch64::LD1Rv2s:
6978 case AArch64::LD1Rv2d:
6979 case AArch64::LD1Rv4h:
6980 case AArch64::LD1Rv4s:
6981 case AArch64::LD1Rv8b:
6982 case AArch64::LD1Rv8h:
6983 case AArch64::LD1Rv16b:
6984 case AArch64::LD1Twov2d:
6985 case AArch64::LD1Threev2d:
6986 case AArch64::LD1Fourv2d:
6987 case AArch64::LD1Twov1d:
6988 case AArch64::LD1Threev1d:
6989 case AArch64::LD1Fourv1d:
6990 case AArch64::ST1Twov2d:
6991 case AArch64::ST1Threev2d:
6992 case AArch64::ST1Fourv2d:
6993 case AArch64::ST1Twov1d:
6994 case AArch64::ST1Threev1d:
6995 case AArch64::ST1Fourv1d:
6996 case AArch64::ST1i8:
6997 case AArch64::ST1i16:
6998 case AArch64::ST1i32:
6999 case AArch64::ST1i64:
7000 case AArch64::IRG:
7001 case AArch64::IRGstack:
7002 case AArch64::STGloop:
7003 case AArch64::STZGloop:
7005 }
7006
7007 // Get the min/max offset and the scale.
7008 TypeSize ScaleValue(0U, false), Width(0U, false);
7009 int64_t MinOff, MaxOff;
7010 if (!AArch64InstrInfo::getMemOpInfo(MI.getOpcode(), ScaleValue, Width, MinOff,
7011 MaxOff))
7012 llvm_unreachable("unhandled opcode in isAArch64FrameOffsetLegal");
7013
7014 // Construct the complete offset.
7015 bool IsMulVL = ScaleValue.isScalable();
7016 unsigned Scale = ScaleValue.getKnownMinValue();
7017 int64_t Offset = IsMulVL ? SOffset.getScalable() : SOffset.getFixed();
7018
7019 const MachineOperand &ImmOpnd =
7020 MI.getOperand(AArch64InstrInfo::getLoadStoreImmIdx(MI.getOpcode()));
7021 Offset += ImmOpnd.getImm() * Scale;
7022
7023 // If the offset doesn't match the scale, we rewrite the instruction to
7024 // use the unscaled instruction instead. Likewise, if we have a negative
7025 // offset and there is an unscaled op to use.
7026 std::optional<unsigned> UnscaledOp =
7028 bool useUnscaledOp = UnscaledOp && (Offset % Scale || Offset < 0);
7029 if (useUnscaledOp &&
7030 !AArch64InstrInfo::getMemOpInfo(*UnscaledOp, ScaleValue, Width, MinOff,
7031 MaxOff))
7032 llvm_unreachable("unhandled opcode in isAArch64FrameOffsetLegal");
7033
7034 Scale = ScaleValue.getKnownMinValue();
7035 assert(IsMulVL == ScaleValue.isScalable() &&
7036 "Unscaled opcode has different value for scalable");
7037
7038 int64_t Remainder = Offset % Scale;
7039 assert(!(Remainder && useUnscaledOp) &&
7040 "Cannot have remainder when using unscaled op");
7041
7042 assert(MinOff < MaxOff && "Unexpected Min/Max offsets");
7043 int64_t NewOffset = Offset / Scale;
7044 if (MinOff <= NewOffset && NewOffset <= MaxOff)
7045 Offset = Remainder;
7046 else {
7047 NewOffset = NewOffset < 0 ? MinOff : MaxOff;
7048 Offset = Offset - (NewOffset * Scale);
7049 }
7050
7051 if (EmittableOffset)
7052 *EmittableOffset = NewOffset;
7053 if (OutUseUnscaledOp)
7054 *OutUseUnscaledOp = useUnscaledOp;
7055 if (OutUnscaledOp && UnscaledOp)
7056 *OutUnscaledOp = *UnscaledOp;
7057
7058 if (IsMulVL)
7059 SOffset = StackOffset::get(SOffset.getFixed(), Offset);
7060 else
7061 SOffset = StackOffset::get(Offset, SOffset.getScalable());
7063 (SOffset ? 0 : AArch64FrameOffsetIsLegal);
7064}
7065
7067 unsigned FrameReg, StackOffset &Offset,
7068 const AArch64InstrInfo *TII) {
7069 unsigned Opcode = MI.getOpcode();
7070 unsigned ImmIdx = FrameRegIdx + 1;
7071
7072 if (Opcode == AArch64::ADDSXri || Opcode == AArch64::ADDXri) {
7073 Offset += StackOffset::getFixed(MI.getOperand(ImmIdx).getImm());
7074 emitFrameOffset(*MI.getParent(), MI, MI.getDebugLoc(),
7075 MI.getOperand(0).getReg(), FrameReg, Offset, TII,
7076 MachineInstr::NoFlags, (Opcode == AArch64::ADDSXri));
7077 MI.eraseFromParent();
7078 Offset = StackOffset();
7079 return true;
7080 }
7081
7082 int64_t NewOffset;
7083 unsigned UnscaledOp;
7084 bool UseUnscaledOp;
7085 int Status = isAArch64FrameOffsetLegal(MI, Offset, &UseUnscaledOp,
7086 &UnscaledOp, &NewOffset);
7089 // Replace the FrameIndex with FrameReg.
7090 MI.getOperand(FrameRegIdx).ChangeToRegister(FrameReg, false);
7091 if (UseUnscaledOp)
7092 MI.setDesc(TII->get(UnscaledOp));
7093
7094 MI.getOperand(ImmIdx).ChangeToImmediate(NewOffset);
7095 return !Offset;
7096 }
7097
7098 return false;
7099}
7100
7106
7107MCInst AArch64InstrInfo::getNop() const { return MCInstBuilder(AArch64::NOP); }
7108
7109// AArch64 supports MachineCombiner.
7110bool AArch64InstrInfo::useMachineCombiner() const { return true; }
7111
7112// True when Opc sets flag
7113static bool isCombineInstrSettingFlag(unsigned Opc) {
7114 switch (Opc) {
7115 case AArch64::ADDSWrr:
7116 case AArch64::ADDSWri:
7117 case AArch64::ADDSXrr:
7118 case AArch64::ADDSXri:
7119 case AArch64::SUBSWrr:
7120 case AArch64::SUBSXrr:
7121 // Note: MSUB Wd,Wn,Wm,Wi -> Wd = Wi - WnxWm, not Wd=WnxWm - Wi.
7122 case AArch64::SUBSWri:
7123 case AArch64::SUBSXri:
7124 return true;
7125 default:
7126 break;
7127 }
7128 return false;
7129}
7130
7131// 32b Opcodes that can be combined with a MUL
7132static bool isCombineInstrCandidate32(unsigned Opc) {
7133 switch (Opc) {
7134 case AArch64::ADDWrr:
7135 case AArch64::ADDWri:
7136 case AArch64::SUBWrr:
7137 case AArch64::ADDSWrr:
7138 case AArch64::ADDSWri:
7139 case AArch64::SUBSWrr:
7140 // Note: MSUB Wd,Wn,Wm,Wi -> Wd = Wi - WnxWm, not Wd=WnxWm - Wi.
7141 case AArch64::SUBWri:
7142 case AArch64::SUBSWri:
7143 return true;
7144 default:
7145 break;
7146 }
7147 return false;
7148}
7149
7150// 64b Opcodes that can be combined with a MUL
7151static bool isCombineInstrCandidate64(unsigned Opc) {
7152 switch (Opc) {
7153 case AArch64::ADDXrr:
7154 case AArch64::ADDXri:
7155 case AArch64::SUBXrr:
7156 case AArch64::ADDSXrr:
7157 case AArch64::ADDSXri:
7158 case AArch64::SUBSXrr:
7159 // Note: MSUB Wd,Wn,Wm,Wi -> Wd = Wi - WnxWm, not Wd=WnxWm - Wi.
7160 case AArch64::SUBXri:
7161 case AArch64::SUBSXri:
7162 case AArch64::ADDv8i8:
7163 case AArch64::ADDv16i8:
7164 case AArch64::ADDv4i16:
7165 case AArch64::ADDv8i16:
7166 case AArch64::ADDv2i32:
7167 case AArch64::ADDv4i32:
7168 case AArch64::SUBv8i8:
7169 case AArch64::SUBv16i8:
7170 case AArch64::SUBv4i16:
7171 case AArch64::SUBv8i16:
7172 case AArch64::SUBv2i32:
7173 case AArch64::SUBv4i32:
7174 return true;
7175 default:
7176 break;
7177 }
7178 return false;
7179}
7180
7181// FP Opcodes that can be combined with a FMUL.
7182static bool isCombineInstrCandidateFP(const MachineInstr &Inst) {
7183 switch (Inst.getOpcode()) {
7184 default:
7185 break;
7186 case AArch64::FADDHrr:
7187 case AArch64::FADDSrr:
7188 case AArch64::FADDDrr:
7189 case AArch64::FADDv4f16:
7190 case AArch64::FADDv8f16:
7191 case AArch64::FADDv2f32:
7192 case AArch64::FADDv2f64:
7193 case AArch64::FADDv4f32:
7194 case AArch64::FSUBHrr:
7195 case AArch64::FSUBSrr:
7196 case AArch64::FSUBDrr:
7197 case AArch64::FSUBv4f16:
7198 case AArch64::FSUBv8f16:
7199 case AArch64::FSUBv2f32:
7200 case AArch64::FSUBv2f64:
7201 case AArch64::FSUBv4f32:
7203 // We can fuse FADD/FSUB with FMUL, if fusion is either allowed globally by
7204 // the target options or if FADD/FSUB has the contract fast-math flag.
7205 return Options.AllowFPOpFusion == FPOpFusion::Fast ||
7207 }
7208 return false;
7209}
7210
7211// Opcodes that can be combined with a MUL
7215
7216//
7217// Utility routine that checks if \param MO is defined by an
7218// \param CombineOpc instruction in the basic block \param MBB
7220 unsigned CombineOpc, unsigned ZeroReg = 0,
7221 bool CheckZeroReg = false) {
7222 MachineRegisterInfo &MRI = MBB.getParent()->getRegInfo();
7223 MachineInstr *MI = nullptr;
7224
7225 if (MO.isReg() && MO.getReg().isVirtual())
7226 MI = MRI.getUniqueVRegDef(MO.getReg());
7227 // And it needs to be in the trace (otherwise, it won't have a depth).
7228 if (!MI || MI->getParent() != &MBB || MI->getOpcode() != CombineOpc)
7229 return false;
7230 // Must only used by the user we combine with.
7231 if (!MRI.hasOneNonDBGUse(MI->getOperand(0).getReg()))
7232 return false;
7233
7234 if (CheckZeroReg) {
7235 assert(MI->getNumOperands() >= 4 && MI->getOperand(0).isReg() &&
7236 MI->getOperand(1).isReg() && MI->getOperand(2).isReg() &&
7237 MI->getOperand(3).isReg() && "MAdd/MSub must have a least 4 regs");
7238 // The third input reg must be zero.
7239 if (MI->getOperand(3).getReg() != ZeroReg)
7240 return false;
7241 }
7242
7243 if (isCombineInstrSettingFlag(CombineOpc) &&
7244 MI->findRegisterDefOperandIdx(AArch64::NZCV, /*TRI=*/nullptr, true) == -1)
7245 return false;
7246
7247 return true;
7248}
7249
7250//
7251// Is \param MO defined by an integer multiply and can be combined?
7253 unsigned MulOpc, unsigned ZeroReg) {
7254 return canCombine(MBB, MO, MulOpc, ZeroReg, true);
7255}
7256
7257//
7258// Is \param MO defined by a floating-point multiply and can be combined?
7260 unsigned MulOpc) {
7261 return canCombine(MBB, MO, MulOpc);
7262}
7263
7264// TODO: There are many more machine instruction opcodes to match:
7265// 1. Other data types (integer, vectors)
7266// 2. Other math / logic operations (xor, or)
7267// 3. Other forms of the same operation (intrinsics and other variants)
7268bool AArch64InstrInfo::isAssociativeAndCommutative(const MachineInstr &Inst,
7269 bool Invert) const {
7270 if (Invert)
7271 return false;
7272 switch (Inst.getOpcode()) {
7273 // == Floating-point types ==
7274 // -- Floating-point instructions --
7275 case AArch64::FADDHrr:
7276 case AArch64::FADDSrr:
7277 case AArch64::FADDDrr:
7278 case AArch64::FMULHrr:
7279 case AArch64::FMULSrr:
7280 case AArch64::FMULDrr:
7281 case AArch64::FMULX16:
7282 case AArch64::FMULX32:
7283 case AArch64::FMULX64:
7284 // -- Advanced SIMD instructions --
7285 case AArch64::FADDv4f16:
7286 case AArch64::FADDv8f16:
7287 case AArch64::FADDv2f32:
7288 case AArch64::FADDv4f32:
7289 case AArch64::FADDv2f64:
7290 case AArch64::FMULv4f16:
7291 case AArch64::FMULv8f16:
7292 case AArch64::FMULv2f32:
7293 case AArch64::FMULv4f32:
7294 case AArch64::FMULv2f64:
7295 case AArch64::FMULXv4f16:
7296 case AArch64::FMULXv8f16:
7297 case AArch64::FMULXv2f32:
7298 case AArch64::FMULXv4f32:
7299 case AArch64::FMULXv2f64:
7300 // -- SVE instructions --
7301 // Opcodes FMULX_ZZZ_? don't exist because there is no unpredicated FMULX
7302 // in the SVE instruction set (though there are predicated ones).
7303 case AArch64::FADD_ZZZ_H:
7304 case AArch64::FADD_ZZZ_S:
7305 case AArch64::FADD_ZZZ_D:
7306 case AArch64::FMUL_ZZZ_H:
7307 case AArch64::FMUL_ZZZ_S:
7308 case AArch64::FMUL_ZZZ_D:
7311
7312 // == Integer types ==
7313 // -- Base instructions --
7314 // Opcodes MULWrr and MULXrr don't exist because
7315 // `MUL <Wd>, <Wn>, <Wm>` and `MUL <Xd>, <Xn>, <Xm>` are aliases of
7316 // `MADD <Wd>, <Wn>, <Wm>, WZR` and `MADD <Xd>, <Xn>, <Xm>, XZR` respectively.
7317 // The machine-combiner does not support three-source-operands machine
7318 // instruction. So we cannot reassociate MULs.
7319 case AArch64::ADDWrr:
7320 case AArch64::ADDXrr:
7321 case AArch64::ANDWrr:
7322 case AArch64::ANDXrr:
7323 case AArch64::ORRWrr:
7324 case AArch64::ORRXrr:
7325 case AArch64::EORWrr:
7326 case AArch64::EORXrr:
7327 case AArch64::EONWrr:
7328 case AArch64::EONXrr:
7329 // -- Advanced SIMD instructions --
7330 // Opcodes MULv1i64 and MULv2i64 don't exist because there is no 64-bit MUL
7331 // in the Advanced SIMD instruction set.
7332 case AArch64::ADDv8i8:
7333 case AArch64::ADDv16i8:
7334 case AArch64::ADDv4i16:
7335 case AArch64::ADDv8i16:
7336 case AArch64::ADDv2i32:
7337 case AArch64::ADDv4i32:
7338 case AArch64::ADDv1i64:
7339 case AArch64::ADDv2i64:
7340 case AArch64::MULv8i8:
7341 case AArch64::MULv16i8:
7342 case AArch64::MULv4i16:
7343 case AArch64::MULv8i16:
7344 case AArch64::MULv2i32:
7345 case AArch64::MULv4i32:
7346 case AArch64::ANDv8i8:
7347 case AArch64::ANDv16i8:
7348 case AArch64::ORRv8i8:
7349 case AArch64::ORRv16i8:
7350 case AArch64::EORv8i8:
7351 case AArch64::EORv16i8:
7352 // -- SVE instructions --
7353 case AArch64::ADD_ZZZ_B:
7354 case AArch64::ADD_ZZZ_H:
7355 case AArch64::ADD_ZZZ_S:
7356 case AArch64::ADD_ZZZ_D:
7357 case AArch64::MUL_ZZZ_B:
7358 case AArch64::MUL_ZZZ_H:
7359 case AArch64::MUL_ZZZ_S:
7360 case AArch64::MUL_ZZZ_D:
7361 case AArch64::AND_ZZZ:
7362 case AArch64::ORR_ZZZ:
7363 case AArch64::EOR_ZZZ:
7364 return true;
7365
7366 default:
7367 return false;
7368 }
7369}
7370
7371/// Find instructions that can be turned into madd.
7373 SmallVectorImpl<unsigned> &Patterns) {
7374 unsigned Opc = Root.getOpcode();
7375 MachineBasicBlock &MBB = *Root.getParent();
7376 bool Found = false;
7377
7379 return false;
7381 int Cmp_NZCV =
7382 Root.findRegisterDefOperandIdx(AArch64::NZCV, /*TRI=*/nullptr, true);
7383 // When NZCV is live bail out.
7384 if (Cmp_NZCV == -1)
7385 return false;
7386 unsigned NewOpc = convertToNonFlagSettingOpc(Root);
7387 // When opcode can't change bail out.
7388 // CHECKME: do we miss any cases for opcode conversion?
7389 if (NewOpc == Opc)
7390 return false;
7391 Opc = NewOpc;
7392 }
7393
7394 auto setFound = [&](int Opcode, int Operand, unsigned ZeroReg,
7395 unsigned Pattern) {
7396 if (canCombineWithMUL(MBB, Root.getOperand(Operand), Opcode, ZeroReg)) {
7397 Patterns.push_back(Pattern);
7398 Found = true;
7399 }
7400 };
7401
7402 auto setVFound = [&](int Opcode, int Operand, unsigned Pattern) {
7403 if (canCombine(MBB, Root.getOperand(Operand), Opcode)) {
7404 Patterns.push_back(Pattern);
7405 Found = true;
7406 }
7407 };
7408
7410
7411 switch (Opc) {
7412 default:
7413 break;
7414 case AArch64::ADDWrr:
7415 assert(Root.getOperand(1).isReg() && Root.getOperand(2).isReg() &&
7416 "ADDWrr does not have register operands");
7417 setFound(AArch64::MADDWrrr, 1, AArch64::WZR, MCP::MULADDW_OP1);
7418 setFound(AArch64::MADDWrrr, 2, AArch64::WZR, MCP::MULADDW_OP2);
7419 break;
7420 case AArch64::ADDXrr:
7421 setFound(AArch64::MADDXrrr, 1, AArch64::XZR, MCP::MULADDX_OP1);
7422 setFound(AArch64::MADDXrrr, 2, AArch64::XZR, MCP::MULADDX_OP2);
7423 break;
7424 case AArch64::SUBWrr:
7425 setFound(AArch64::MADDWrrr, 2, AArch64::WZR, MCP::MULSUBW_OP2);
7426 setFound(AArch64::MADDWrrr, 1, AArch64::WZR, MCP::MULSUBW_OP1);
7427 break;
7428 case AArch64::SUBXrr:
7429 setFound(AArch64::MADDXrrr, 2, AArch64::XZR, MCP::MULSUBX_OP2);
7430 setFound(AArch64::MADDXrrr, 1, AArch64::XZR, MCP::MULSUBX_OP1);
7431 break;
7432 case AArch64::ADDWri:
7433 setFound(AArch64::MADDWrrr, 1, AArch64::WZR, MCP::MULADDWI_OP1);
7434 break;
7435 case AArch64::ADDXri:
7436 setFound(AArch64::MADDXrrr, 1, AArch64::XZR, MCP::MULADDXI_OP1);
7437 break;
7438 case AArch64::SUBWri:
7439 setFound(AArch64::MADDWrrr, 1, AArch64::WZR, MCP::MULSUBWI_OP1);
7440 break;
7441 case AArch64::SUBXri:
7442 setFound(AArch64::MADDXrrr, 1, AArch64::XZR, MCP::MULSUBXI_OP1);
7443 break;
7444 case AArch64::ADDv8i8:
7445 setVFound(AArch64::MULv8i8, 1, MCP::MULADDv8i8_OP1);
7446 setVFound(AArch64::MULv8i8, 2, MCP::MULADDv8i8_OP2);
7447 break;
7448 case AArch64::ADDv16i8:
7449 setVFound(AArch64::MULv16i8, 1, MCP::MULADDv16i8_OP1);
7450 setVFound(AArch64::MULv16i8, 2, MCP::MULADDv16i8_OP2);
7451 break;
7452 case AArch64::ADDv4i16:
7453 setVFound(AArch64::MULv4i16, 1, MCP::MULADDv4i16_OP1);
7454 setVFound(AArch64::MULv4i16, 2, MCP::MULADDv4i16_OP2);
7455 setVFound(AArch64::MULv4i16_indexed, 1, MCP::MULADDv4i16_indexed_OP1);
7456 setVFound(AArch64::MULv4i16_indexed, 2, MCP::MULADDv4i16_indexed_OP2);
7457 break;
7458 case AArch64::ADDv8i16:
7459 setVFound(AArch64::MULv8i16, 1, MCP::MULADDv8i16_OP1);
7460 setVFound(AArch64::MULv8i16, 2, MCP::MULADDv8i16_OP2);
7461 setVFound(AArch64::MULv8i16_indexed, 1, MCP::MULADDv8i16_indexed_OP1);
7462 setVFound(AArch64::MULv8i16_indexed, 2, MCP::MULADDv8i16_indexed_OP2);
7463 break;
7464 case AArch64::ADDv2i32:
7465 setVFound(AArch64::MULv2i32, 1, MCP::MULADDv2i32_OP1);
7466 setVFound(AArch64::MULv2i32, 2, MCP::MULADDv2i32_OP2);
7467 setVFound(AArch64::MULv2i32_indexed, 1, MCP::MULADDv2i32_indexed_OP1);
7468 setVFound(AArch64::MULv2i32_indexed, 2, MCP::MULADDv2i32_indexed_OP2);
7469 break;
7470 case AArch64::ADDv4i32:
7471 setVFound(AArch64::MULv4i32, 1, MCP::MULADDv4i32_OP1);
7472 setVFound(AArch64::MULv4i32, 2, MCP::MULADDv4i32_OP2);
7473 setVFound(AArch64::MULv4i32_indexed, 1, MCP::MULADDv4i32_indexed_OP1);
7474 setVFound(AArch64::MULv4i32_indexed, 2, MCP::MULADDv4i32_indexed_OP2);
7475 break;
7476 case AArch64::SUBv8i8:
7477 setVFound(AArch64::MULv8i8, 1, MCP::MULSUBv8i8_OP1);
7478 setVFound(AArch64::MULv8i8, 2, MCP::MULSUBv8i8_OP2);
7479 break;
7480 case AArch64::SUBv16i8:
7481 setVFound(AArch64::MULv16i8, 1, MCP::MULSUBv16i8_OP1);
7482 setVFound(AArch64::MULv16i8, 2, MCP::MULSUBv16i8_OP2);
7483 break;
7484 case AArch64::SUBv4i16:
7485 setVFound(AArch64::MULv4i16, 1, MCP::MULSUBv4i16_OP1);
7486 setVFound(AArch64::MULv4i16, 2, MCP::MULSUBv4i16_OP2);
7487 setVFound(AArch64::MULv4i16_indexed, 1, MCP::MULSUBv4i16_indexed_OP1);
7488 setVFound(AArch64::MULv4i16_indexed, 2, MCP::MULSUBv4i16_indexed_OP2);
7489 break;
7490 case AArch64::SUBv8i16:
7491 setVFound(AArch64::MULv8i16, 1, MCP::MULSUBv8i16_OP1);
7492 setVFound(AArch64::MULv8i16, 2, MCP::MULSUBv8i16_OP2);
7493 setVFound(AArch64::MULv8i16_indexed, 1, MCP::MULSUBv8i16_indexed_OP1);
7494 setVFound(AArch64::MULv8i16_indexed, 2, MCP::MULSUBv8i16_indexed_OP2);
7495 break;
7496 case AArch64::SUBv2i32:
7497 setVFound(AArch64::MULv2i32, 1, MCP::MULSUBv2i32_OP1);
7498 setVFound(AArch64::MULv2i32, 2, MCP::MULSUBv2i32_OP2);
7499 setVFound(AArch64::MULv2i32_indexed, 1, MCP::MULSUBv2i32_indexed_OP1);
7500 setVFound(AArch64::MULv2i32_indexed, 2, MCP::MULSUBv2i32_indexed_OP2);
7501 break;
7502 case AArch64::SUBv4i32:
7503 setVFound(AArch64::MULv4i32, 1, MCP::MULSUBv4i32_OP1);
7504 setVFound(AArch64::MULv4i32, 2, MCP::MULSUBv4i32_OP2);
7505 setVFound(AArch64::MULv4i32_indexed, 1, MCP::MULSUBv4i32_indexed_OP1);
7506 setVFound(AArch64::MULv4i32_indexed, 2, MCP::MULSUBv4i32_indexed_OP2);
7507 break;
7508 }
7509 return Found;
7510}
7511
7512bool AArch64InstrInfo::isAccumulationOpcode(unsigned Opcode) const {
7513 switch (Opcode) {
7514 default:
7515 break;
7516 case AArch64::UABALB_ZZZ_D:
7517 case AArch64::UABALB_ZZZ_H:
7518 case AArch64::UABALB_ZZZ_S:
7519 case AArch64::UABALT_ZZZ_D:
7520 case AArch64::UABALT_ZZZ_H:
7521 case AArch64::UABALT_ZZZ_S:
7522 case AArch64::SABALB_ZZZ_D:
7523 case AArch64::SABALB_ZZZ_S:
7524 case AArch64::SABALB_ZZZ_H:
7525 case AArch64::SABALT_ZZZ_D:
7526 case AArch64::SABALT_ZZZ_S:
7527 case AArch64::SABALT_ZZZ_H:
7528 case AArch64::UABALv16i8_v8i16:
7529 case AArch64::UABALv2i32_v2i64:
7530 case AArch64::UABALv4i16_v4i32:
7531 case AArch64::UABALv4i32_v2i64:
7532 case AArch64::UABALv8i16_v4i32:
7533 case AArch64::UABALv8i8_v8i16:
7534 case AArch64::UABAv16i8:
7535 case AArch64::UABAv2i32:
7536 case AArch64::UABAv4i16:
7537 case AArch64::UABAv4i32:
7538 case AArch64::UABAv8i16:
7539 case AArch64::UABAv8i8:
7540 case AArch64::SABALv16i8_v8i16:
7541 case AArch64::SABALv2i32_v2i64:
7542 case AArch64::SABALv4i16_v4i32:
7543 case AArch64::SABALv4i32_v2i64:
7544 case AArch64::SABALv8i16_v4i32:
7545 case AArch64::SABALv8i8_v8i16:
7546 case AArch64::SABAv16i8:
7547 case AArch64::SABAv2i32:
7548 case AArch64::SABAv4i16:
7549 case AArch64::SABAv4i32:
7550 case AArch64::SABAv8i16:
7551 case AArch64::SABAv8i8:
7552 return true;
7553 }
7554
7555 return false;
7556}
7557
7558unsigned AArch64InstrInfo::getAccumulationStartOpcode(
7559 unsigned AccumulationOpcode) const {
7560 switch (AccumulationOpcode) {
7561 default:
7562 llvm_unreachable("Unsupported accumulation Opcode!");
7563 case AArch64::UABALB_ZZZ_D:
7564 return AArch64::UABDLB_ZZZ_D;
7565 case AArch64::UABALB_ZZZ_H:
7566 return AArch64::UABDLB_ZZZ_H;
7567 case AArch64::UABALB_ZZZ_S:
7568 return AArch64::UABDLB_ZZZ_S;
7569 case AArch64::UABALT_ZZZ_D:
7570 return AArch64::UABDLT_ZZZ_D;
7571 case AArch64::UABALT_ZZZ_H:
7572 return AArch64::UABDLT_ZZZ_H;
7573 case AArch64::UABALT_ZZZ_S:
7574 return AArch64::UABDLT_ZZZ_S;
7575 case AArch64::UABALv16i8_v8i16:
7576 return AArch64::UABDLv16i8_v8i16;
7577 case AArch64::UABALv2i32_v2i64:
7578 return AArch64::UABDLv2i32_v2i64;
7579 case AArch64::UABALv4i16_v4i32:
7580 return AArch64::UABDLv4i16_v4i32;
7581 case AArch64::UABALv4i32_v2i64:
7582 return AArch64::UABDLv4i32_v2i64;
7583 case AArch64::UABALv8i16_v4i32:
7584 return AArch64::UABDLv8i16_v4i32;
7585 case AArch64::UABALv8i8_v8i16:
7586 return AArch64::UABDLv8i8_v8i16;
7587 case AArch64::UABAv16i8:
7588 return AArch64::UABDv16i8;
7589 case AArch64::UABAv2i32:
7590 return AArch64::UABDv2i32;
7591 case AArch64::UABAv4i16:
7592 return AArch64::UABDv4i16;
7593 case AArch64::UABAv4i32:
7594 return AArch64::UABDv4i32;
7595 case AArch64::UABAv8i16:
7596 return AArch64::UABDv8i16;
7597 case AArch64::UABAv8i8:
7598 return AArch64::UABDv8i8;
7599 case AArch64::SABALB_ZZZ_D:
7600 return AArch64::SABDLB_ZZZ_D;
7601 case AArch64::SABALB_ZZZ_S:
7602 return AArch64::SABDLB_ZZZ_S;
7603 case AArch64::SABALB_ZZZ_H:
7604 return AArch64::SABDLB_ZZZ_H;
7605 case AArch64::SABALT_ZZZ_D:
7606 return AArch64::SABDLT_ZZZ_D;
7607 case AArch64::SABALT_ZZZ_S:
7608 return AArch64::SABDLT_ZZZ_S;
7609 case AArch64::SABALT_ZZZ_H:
7610 return AArch64::SABDLT_ZZZ_H;
7611 case AArch64::SABALv16i8_v8i16:
7612 return AArch64::SABDLv16i8_v8i16;
7613 case AArch64::SABALv2i32_v2i64:
7614 return AArch64::SABDLv2i32_v2i64;
7615 case AArch64::SABALv4i16_v4i32:
7616 return AArch64::SABDLv4i16_v4i32;
7617 case AArch64::SABALv4i32_v2i64:
7618 return AArch64::SABDLv4i32_v2i64;
7619 case AArch64::SABALv8i16_v4i32:
7620 return AArch64::SABDLv8i16_v4i32;
7621 case AArch64::SABALv8i8_v8i16:
7622 return AArch64::SABDLv8i8_v8i16;
7623 case AArch64::SABAv16i8:
7624 return AArch64::SABDv16i8;
7625 case AArch64::SABAv2i32:
7626 return AArch64::SABAv2i32;
7627 case AArch64::SABAv4i16:
7628 return AArch64::SABDv4i16;
7629 case AArch64::SABAv4i32:
7630 return AArch64::SABDv4i32;
7631 case AArch64::SABAv8i16:
7632 return AArch64::SABDv8i16;
7633 case AArch64::SABAv8i8:
7634 return AArch64::SABDv8i8;
7635 }
7636}
7637
7638/// Floating-Point Support
7639
7640/// Find instructions that can be turned into madd.
7642 SmallVectorImpl<unsigned> &Patterns) {
7643
7644 if (!isCombineInstrCandidateFP(Root))
7645 return false;
7646
7647 MachineBasicBlock &MBB = *Root.getParent();
7648 bool Found = false;
7649
7650 auto Match = [&](int Opcode, int Operand, unsigned Pattern) -> bool {
7651 if (canCombineWithFMUL(MBB, Root.getOperand(Operand), Opcode)) {
7652 Patterns.push_back(Pattern);
7653 return true;
7654 }
7655 return false;
7656 };
7657
7659
7660 switch (Root.getOpcode()) {
7661 default:
7662 assert(false && "Unsupported FP instruction in combiner\n");
7663 break;
7664 case AArch64::FADDHrr:
7665 assert(Root.getOperand(1).isReg() && Root.getOperand(2).isReg() &&
7666 "FADDHrr does not have register operands");
7667
7668 Found = Match(AArch64::FMULHrr, 1, MCP::FMULADDH_OP1);
7669 Found |= Match(AArch64::FMULHrr, 2, MCP::FMULADDH_OP2);
7670 break;
7671 case AArch64::FADDSrr:
7672 assert(Root.getOperand(1).isReg() && Root.getOperand(2).isReg() &&
7673 "FADDSrr does not have register operands");
7674
7675 Found |= Match(AArch64::FMULSrr, 1, MCP::FMULADDS_OP1) ||
7676 Match(AArch64::FMULv1i32_indexed, 1, MCP::FMLAv1i32_indexed_OP1);
7677
7678 Found |= Match(AArch64::FMULSrr, 2, MCP::FMULADDS_OP2) ||
7679 Match(AArch64::FMULv1i32_indexed, 2, MCP::FMLAv1i32_indexed_OP2);
7680 break;
7681 case AArch64::FADDDrr:
7682 Found |= Match(AArch64::FMULDrr, 1, MCP::FMULADDD_OP1) ||
7683 Match(AArch64::FMULv1i64_indexed, 1, MCP::FMLAv1i64_indexed_OP1);
7684
7685 Found |= Match(AArch64::FMULDrr, 2, MCP::FMULADDD_OP2) ||
7686 Match(AArch64::FMULv1i64_indexed, 2, MCP::FMLAv1i64_indexed_OP2);
7687 break;
7688 case AArch64::FADDv4f16:
7689 Found |= Match(AArch64::FMULv4i16_indexed, 1, MCP::FMLAv4i16_indexed_OP1) ||
7690 Match(AArch64::FMULv4f16, 1, MCP::FMLAv4f16_OP1);
7691
7692 Found |= Match(AArch64::FMULv4i16_indexed, 2, MCP::FMLAv4i16_indexed_OP2) ||
7693 Match(AArch64::FMULv4f16, 2, MCP::FMLAv4f16_OP2);
7694 break;
7695 case AArch64::FADDv8f16:
7696 Found |= Match(AArch64::FMULv8i16_indexed, 1, MCP::FMLAv8i16_indexed_OP1) ||
7697 Match(AArch64::FMULv8f16, 1, MCP::FMLAv8f16_OP1);
7698
7699 Found |= Match(AArch64::FMULv8i16_indexed, 2, MCP::FMLAv8i16_indexed_OP2) ||
7700 Match(AArch64::FMULv8f16, 2, MCP::FMLAv8f16_OP2);
7701 break;
7702 case AArch64::FADDv2f32:
7703 Found |= Match(AArch64::FMULv2i32_indexed, 1, MCP::FMLAv2i32_indexed_OP1) ||
7704 Match(AArch64::FMULv2f32, 1, MCP::FMLAv2f32_OP1);
7705
7706 Found |= Match(AArch64::FMULv2i32_indexed, 2, MCP::FMLAv2i32_indexed_OP2) ||
7707 Match(AArch64::FMULv2f32, 2, MCP::FMLAv2f32_OP2);
7708 break;
7709 case AArch64::FADDv2f64:
7710 Found |= Match(AArch64::FMULv2i64_indexed, 1, MCP::FMLAv2i64_indexed_OP1) ||
7711 Match(AArch64::FMULv2f64, 1, MCP::FMLAv2f64_OP1);
7712
7713 Found |= Match(AArch64::FMULv2i64_indexed, 2, MCP::FMLAv2i64_indexed_OP2) ||
7714 Match(AArch64::FMULv2f64, 2, MCP::FMLAv2f64_OP2);
7715 break;
7716 case AArch64::FADDv4f32:
7717 Found |= Match(AArch64::FMULv4i32_indexed, 1, MCP::FMLAv4i32_indexed_OP1) ||
7718 Match(AArch64::FMULv4f32, 1, MCP::FMLAv4f32_OP1);
7719
7720 Found |= Match(AArch64::FMULv4i32_indexed, 2, MCP::FMLAv4i32_indexed_OP2) ||
7721 Match(AArch64::FMULv4f32, 2, MCP::FMLAv4f32_OP2);
7722 break;
7723 case AArch64::FSUBHrr:
7724 Found = Match(AArch64::FMULHrr, 1, MCP::FMULSUBH_OP1);
7725 Found |= Match(AArch64::FMULHrr, 2, MCP::FMULSUBH_OP2);
7726 Found |= Match(AArch64::FNMULHrr, 1, MCP::FNMULSUBH_OP1);
7727 break;
7728 case AArch64::FSUBSrr:
7729 Found = Match(AArch64::FMULSrr, 1, MCP::FMULSUBS_OP1);
7730
7731 Found |= Match(AArch64::FMULSrr, 2, MCP::FMULSUBS_OP2) ||
7732 Match(AArch64::FMULv1i32_indexed, 2, MCP::FMLSv1i32_indexed_OP2);
7733
7734 Found |= Match(AArch64::FNMULSrr, 1, MCP::FNMULSUBS_OP1);
7735 break;
7736 case AArch64::FSUBDrr:
7737 Found = Match(AArch64::FMULDrr, 1, MCP::FMULSUBD_OP1);
7738
7739 Found |= Match(AArch64::FMULDrr, 2, MCP::FMULSUBD_OP2) ||
7740 Match(AArch64::FMULv1i64_indexed, 2, MCP::FMLSv1i64_indexed_OP2);
7741
7742 Found |= Match(AArch64::FNMULDrr, 1, MCP::FNMULSUBD_OP1);
7743 break;
7744 case AArch64::FSUBv4f16:
7745 Found |= Match(AArch64::FMULv4i16_indexed, 2, MCP::FMLSv4i16_indexed_OP2) ||
7746 Match(AArch64::FMULv4f16, 2, MCP::FMLSv4f16_OP2);
7747
7748 Found |= Match(AArch64::FMULv4i16_indexed, 1, MCP::FMLSv4i16_indexed_OP1) ||
7749 Match(AArch64::FMULv4f16, 1, MCP::FMLSv4f16_OP1);
7750 break;
7751 case AArch64::FSUBv8f16:
7752 Found |= Match(AArch64::FMULv8i16_indexed, 2, MCP::FMLSv8i16_indexed_OP2) ||
7753 Match(AArch64::FMULv8f16, 2, MCP::FMLSv8f16_OP2);
7754
7755 Found |= Match(AArch64::FMULv8i16_indexed, 1, MCP::FMLSv8i16_indexed_OP1) ||
7756 Match(AArch64::FMULv8f16, 1, MCP::FMLSv8f16_OP1);
7757 break;
7758 case AArch64::FSUBv2f32:
7759 Found |= Match(AArch64::FMULv2i32_indexed, 2, MCP::FMLSv2i32_indexed_OP2) ||
7760 Match(AArch64::FMULv2f32, 2, MCP::FMLSv2f32_OP2);
7761
7762 Found |= Match(AArch64::FMULv2i32_indexed, 1, MCP::FMLSv2i32_indexed_OP1) ||
7763 Match(AArch64::FMULv2f32, 1, MCP::FMLSv2f32_OP1);
7764 break;
7765 case AArch64::FSUBv2f64:
7766 Found |= Match(AArch64::FMULv2i64_indexed, 2, MCP::FMLSv2i64_indexed_OP2) ||
7767 Match(AArch64::FMULv2f64, 2, MCP::FMLSv2f64_OP2);
7768
7769 Found |= Match(AArch64::FMULv2i64_indexed, 1, MCP::FMLSv2i64_indexed_OP1) ||
7770 Match(AArch64::FMULv2f64, 1, MCP::FMLSv2f64_OP1);
7771 break;
7772 case AArch64::FSUBv4f32:
7773 Found |= Match(AArch64::FMULv4i32_indexed, 2, MCP::FMLSv4i32_indexed_OP2) ||
7774 Match(AArch64::FMULv4f32, 2, MCP::FMLSv4f32_OP2);
7775
7776 Found |= Match(AArch64::FMULv4i32_indexed, 1, MCP::FMLSv4i32_indexed_OP1) ||
7777 Match(AArch64::FMULv4f32, 1, MCP::FMLSv4f32_OP1);
7778 break;
7779 }
7780 return Found;
7781}
7782
7784 SmallVectorImpl<unsigned> &Patterns) {
7785 MachineBasicBlock &MBB = *Root.getParent();
7786 bool Found = false;
7787
7788 auto Match = [&](unsigned Opcode, int Operand, unsigned Pattern) -> bool {
7789 MachineRegisterInfo &MRI = MBB.getParent()->getRegInfo();
7790 MachineOperand &MO = Root.getOperand(Operand);
7791 MachineInstr *MI = nullptr;
7792 if (MO.isReg() && MO.getReg().isVirtual())
7793 MI = MRI.getUniqueVRegDef(MO.getReg());
7794 // Ignore No-op COPYs in FMUL(COPY(DUP(..)))
7795 if (MI && MI->getOpcode() == TargetOpcode::COPY &&
7796 MI->getOperand(1).getReg().isVirtual())
7797 MI = MRI.getUniqueVRegDef(MI->getOperand(1).getReg());
7798 if (MI && MI->getOpcode() == Opcode) {
7799 Patterns.push_back(Pattern);
7800 return true;
7801 }
7802 return false;
7803 };
7804
7806
7807 switch (Root.getOpcode()) {
7808 default:
7809 return false;
7810 case AArch64::FMULv2f32:
7811 Found = Match(AArch64::DUPv2i32lane, 1, MCP::FMULv2i32_indexed_OP1);
7812 Found |= Match(AArch64::DUPv2i32lane, 2, MCP::FMULv2i32_indexed_OP2);
7813 break;
7814 case AArch64::FMULv2f64:
7815 Found = Match(AArch64::DUPv2i64lane, 1, MCP::FMULv2i64_indexed_OP1);
7816 Found |= Match(AArch64::DUPv2i64lane, 2, MCP::FMULv2i64_indexed_OP2);
7817 break;
7818 case AArch64::FMULv4f16:
7819 Found = Match(AArch64::DUPv4i16lane, 1, MCP::FMULv4i16_indexed_OP1);
7820 Found |= Match(AArch64::DUPv4i16lane, 2, MCP::FMULv4i16_indexed_OP2);
7821 break;
7822 case AArch64::FMULv4f32:
7823 Found = Match(AArch64::DUPv4i32lane, 1, MCP::FMULv4i32_indexed_OP1);
7824 Found |= Match(AArch64::DUPv4i32lane, 2, MCP::FMULv4i32_indexed_OP2);
7825 break;
7826 case AArch64::FMULv8f16:
7827 Found = Match(AArch64::DUPv8i16lane, 1, MCP::FMULv8i16_indexed_OP1);
7828 Found |= Match(AArch64::DUPv8i16lane, 2, MCP::FMULv8i16_indexed_OP2);
7829 break;
7830 }
7831
7832 return Found;
7833}
7834
7836 SmallVectorImpl<unsigned> &Patterns) {
7837 unsigned Opc = Root.getOpcode();
7838 MachineBasicBlock &MBB = *Root.getParent();
7839 MachineRegisterInfo &MRI = MBB.getParent()->getRegInfo();
7840
7841 auto Match = [&](unsigned Opcode, unsigned Pattern) -> bool {
7842 MachineOperand &MO = Root.getOperand(1);
7844 if (MI != nullptr && (MI->getOpcode() == Opcode) &&
7845 MRI.hasOneNonDBGUse(MI->getOperand(0).getReg()) &&
7849 MI->getFlag(MachineInstr::MIFlag::FmNsz)) {
7850 Patterns.push_back(Pattern);
7851 return true;
7852 }
7853 return false;
7854 };
7855
7856 switch (Opc) {
7857 default:
7858 break;
7859 case AArch64::FNEGDr:
7860 return Match(AArch64::FMADDDrrr, AArch64MachineCombinerPattern::FNMADD);
7861 case AArch64::FNEGSr:
7862 return Match(AArch64::FMADDSrrr, AArch64MachineCombinerPattern::FNMADD);
7863 }
7864
7865 return false;
7866}
7867
7868/// Return true when a code sequence can improve throughput. It
7869/// should be called only for instructions in loops.
7870/// \param Pattern - combiner pattern
  // NOTE(review): the source listing appears to have elided the long list of
  // case labels (the machine-combiner FMLA/FMLS/MUL-add patterns) between
  // `break;` and `return true;` below — confirm against the full file before
  // relying on this rendering.
  switch (Pattern) {
  default:
    break;
    // (elided case labels in the full source return true here)
    return true;
  } // end switch (Pattern)
  return false;
}
7984
7985/// Find other MI combine patterns.
7987 SmallVectorImpl<unsigned> &Patterns) {
7988 // A - (B + C) ==> (A - B) - C or (A - C) - B
7989 unsigned Opc = Root.getOpcode();
7990 MachineBasicBlock &MBB = *Root.getParent();
7991
7992 switch (Opc) {
7993 case AArch64::SUBWrr:
7994 case AArch64::SUBSWrr:
7995 case AArch64::SUBXrr:
7996 case AArch64::SUBSXrr:
7997 // Found candidate root.
7998 break;
7999 default:
8000 return false;
8001 }
8002
8004 Root.findRegisterDefOperandIdx(AArch64::NZCV, /*TRI=*/nullptr, true) ==
8005 -1)
8006 return false;
8007
8008 if (canCombine(MBB, Root.getOperand(2), AArch64::ADDWrr) ||
8009 canCombine(MBB, Root.getOperand(2), AArch64::ADDSWrr) ||
8010 canCombine(MBB, Root.getOperand(2), AArch64::ADDXrr) ||
8011 canCombine(MBB, Root.getOperand(2), AArch64::ADDSXrr)) {
8014 return true;
8015 }
8016
8017 return false;
8018}
8019
/// Check if the given instruction forms a gather load pattern that can be
/// optimized for better Memory-Level Parallelism (MLP). This function
/// identifies chains of NEON lane load instructions that load data from
/// different memory addresses into individual lanes of a 128-bit vector
/// register, then attempts to split the pattern into parallel loads to break
/// the serial dependency between instructions.
///
/// Pattern Matched:
/// Initial scalar load -> SUBREG_TO_REG (lane 0) -> LD1i* (lane 1) ->
/// LD1i* (lane 2) -> ... -> LD1i* (lane N-1, Root)
///
/// Transformed Into:
/// Two parallel vector loads using fewer lanes each, followed by ZIP1v2i64
/// to combine the results, enabling better memory-level parallelism.
///
/// Supported Element Types:
/// - 32-bit elements (LD1i32, 4 lanes total)
/// - 16-bit elements (LD1i16, 8 lanes total)
/// - 8-bit elements (LD1i8, 16 lanes total)
///
/// \param Root the LD1i* instruction that loads the final (highest) lane
/// \param [out] Patterns receives the matched gather pattern, if any
/// \param LoadLaneOpCode opcode every lane load in the chain must have
/// \param NumLanes total number of lanes in the 128-bit vector
    SmallVectorImpl<unsigned> &Patterns,
    unsigned LoadLaneOpCode, unsigned NumLanes) {
  const MachineFunction *MF = Root.getMF();

  // Early exit if optimizing for size.
  if (MF->getFunction().hasMinSize())
    return false;

  const MachineRegisterInfo &MRI = MF->getRegInfo();

  // The root of the pattern must load into the last lane of the vector.
  if (Root.getOperand(2).getImm() != NumLanes - 1)
    return false;

  // Check that we have load into all lanes except lane 0.
  // For each load we also want to check that:
  // 1. It has a single non-debug use (since we will be replacing the virtual
  // register)
  // 2. That the addressing mode only uses a single pointer operand
  //
  // Walk the use-def chain from the root (lane N-1) back toward lane 1,
  // ticking lanes off RemainingLanes as they are found. Operand layout of
  // each LD1i*: (0) dst vector, (1) src vector, (2) lane imm, (3) pointer.
  auto *CurrInstr = MRI.getUniqueVRegDef(Root.getOperand(1).getReg());
  auto Range = llvm::seq<unsigned>(1, NumLanes - 1);
  SmallSet<unsigned, 16> RemainingLanes(Range.begin(), Range.end());
  while (!RemainingLanes.empty() && CurrInstr &&
         CurrInstr->getOpcode() == LoadLaneOpCode &&
         MRI.hasOneNonDBGUse(CurrInstr->getOperand(0).getReg()) &&
         CurrInstr->getNumOperands() == 4) {
    RemainingLanes.erase(CurrInstr->getOperand(2).getImm());
    LoadInstrs.push_back(CurrInstr);
    CurrInstr = MRI.getUniqueVRegDef(CurrInstr->getOperand(1).getReg());
  }

  // Check that we have found a match for lanes N-1.. 1.
  if (!RemainingLanes.empty())
    return false;

  // Match the SUBREG_TO_REG sequence.
  if (CurrInstr->getOpcode() != TargetOpcode::SUBREG_TO_REG)
    return false;

  // Verify that the subreg to reg loads an integer into the first lane.
  auto Lane0LoadReg = CurrInstr->getOperand(1).getReg();
  unsigned SingleLaneSizeInBits = 128 / NumLanes;
  if (TRI->getRegSizeInBits(Lane0LoadReg, MRI) != SingleLaneSizeInBits)
    return false;

  // Verify that it also has a single non debug use.
  if (!MRI.hasOneNonDBGUse(Lane0LoadReg))
    return false;

  LoadInstrs.push_back(MRI.getUniqueVRegDef(Lane0LoadReg));

  // If there is any chance of aliasing, do not apply the pattern.
  // Walk backward through the MBB starting from Root.
  // Exit early if we've encountered all load instructions or hit the search
  // limit.
  auto MBBItr = Root.getIterator();
  unsigned RemainingSteps = GatherOptSearchLimit;
  SmallPtrSet<const MachineInstr *, 16> RemainingLoadInstrs;
  RemainingLoadInstrs.insert(LoadInstrs.begin(), LoadInstrs.end());
  const MachineBasicBlock *MBB = Root.getParent();

  for (; MBBItr != MBB->begin() && RemainingSteps > 0 &&
         !RemainingLoadInstrs.empty();
       --MBBItr, --RemainingSteps) {
    const MachineInstr &CurrInstr = *MBBItr;

    // Remove this instruction from remaining loads if it's one we're tracking.
    RemainingLoadInstrs.erase(&CurrInstr);

    // Check for potential aliasing with any of the load instructions to
    // optimize.
    if (CurrInstr.isLoadFoldBarrier())
      return false;
  }

  // If we hit the search limit without finding all load instructions,
  // don't match the pattern.
  if (RemainingSteps == 0 && !RemainingLoadInstrs.empty())
    return false;

  // Record the gather pattern corresponding to this lane count.
  switch (NumLanes) {
  case 4:
    break;
  case 8:
    break;
  case 16:
    break;
  default:
    llvm_unreachable("Got bad number of lanes for gather pattern.");
  }

  return true;
}
8138
/// Search for patterns of LD instructions we can optimize.
/// Dispatches on the LD1 lane-load opcode of \p Root; the lane count passed
/// down is 128 bits divided by the element size of each variant.
    SmallVectorImpl<unsigned> &Patterns) {

  // The pattern searches for loads into single lanes.
  switch (Root.getOpcode()) {
  case AArch64::LD1i32:
    // 32-bit elements: 4 lanes per 128-bit vector.
    return getGatherLanePattern(Root, Patterns, Root.getOpcode(), 4);
  case AArch64::LD1i16:
    // 16-bit elements: 8 lanes.
    return getGatherLanePattern(Root, Patterns, Root.getOpcode(), 8);
  case AArch64::LD1i8:
    // 8-bit elements: 16 lanes.
    return getGatherLanePattern(Root, Patterns, Root.getOpcode(), 16);
  default:
    return false;
  }
}
8155
/// Generate optimized instruction sequence for gather load patterns to improve
/// Memory-Level Parallelism (MLP). This function transforms a chain of
/// sequential NEON lane loads into parallel vector loads that can execute
/// concurrently.
///
/// \param Root the final LD1i* lane load of the matched chain
/// \param [out] InsInstrs receives the new (replacement) instructions
/// \param [out] DelInstrs receives the instructions to be removed
/// \param [out] InstrIdxForVirtReg maps each new virtual register to the
/// index of the instruction in \p InsInstrs that defines it
/// \param Pattern the matched GATHER_LANE_* pattern
/// \param NumLanes total number of lanes in the 128-bit vector
static void
    DenseMap<Register, unsigned> &InstrIdxForVirtReg,
    unsigned Pattern, unsigned NumLanes) {
  MachineFunction &MF = *Root.getParent()->getParent();
  MachineRegisterInfo &MRI = MF.getRegInfo();

  // Gather the initial load instructions to build the pattern.
  // Walk the use-def chain from Root back through the NumLanes-1 lane loads.
  SmallVector<MachineInstr *, 16> LoadToLaneInstrs;
  MachineInstr *CurrInstr = &Root;
  for (unsigned i = 0; i < NumLanes - 1; ++i) {
    LoadToLaneInstrs.push_back(CurrInstr);
    CurrInstr = MRI.getUniqueVRegDef(CurrInstr->getOperand(1).getReg());
  }

  // Sort the load instructions according to the lane.
  llvm::sort(LoadToLaneInstrs,
             [](const MachineInstr *A, const MachineInstr *B) {
               return A->getOperand(2).getImm() > B->getOperand(2).getImm();
             });

  // CurrInstr is now the SUBREG_TO_REG feeding lane 0; append the scalar
  // load that defines its input so the reversed view starts with it.
  MachineInstr *SubregToReg = CurrInstr;
  LoadToLaneInstrs.push_back(
      MRI.getUniqueVRegDef(SubregToReg->getOperand(1).getReg()));
  // Ascending order: initial scalar load first, then lane 1 .. lane N-1.
  auto LoadToLaneInstrsAscending = llvm::reverse(LoadToLaneInstrs);

  const TargetRegisterClass *FPR128RegClass =
      MRI.getRegClass(Root.getOperand(0).getReg());

  // Helper lambda to create a LD1 instruction.
  auto CreateLD1Instruction = [&](MachineInstr *OriginalInstr,
                                  Register SrcRegister, unsigned Lane,
                                  Register OffsetRegister,
                                  bool OffsetRegisterKillState) {
    auto NewRegister = MRI.createVirtualRegister(FPR128RegClass);
    MachineInstrBuilder LoadIndexIntoRegister =
        BuildMI(MF, MIMetadata(*OriginalInstr), TII->get(Root.getOpcode()),
                NewRegister)
            .addReg(SrcRegister)
            .addImm(Lane)
            .addReg(OffsetRegister, getKillRegState(OffsetRegisterKillState));
    InstrIdxForVirtReg.insert(std::make_pair(NewRegister, InsInstrs.size()));
    InsInstrs.push_back(LoadIndexIntoRegister);
    return NewRegister;
  };

  // Helper to create load instruction based on the NumLanes in the NEON
  // register we are rewriting.
  auto CreateLDRInstruction = [&](unsigned NumLanes, Register DestReg,
                                  Register OffsetReg,
                                  bool KillState) -> MachineInstrBuilder {
    // Scalar load width = 128 bits / NumLanes (S = 32, H = 16, B = 8 bits).
    unsigned Opcode;
    switch (NumLanes) {
    case 4:
      Opcode = AArch64::LDRSui;
      break;
    case 8:
      Opcode = AArch64::LDRHui;
      break;
    case 16:
      Opcode = AArch64::LDRBui;
      break;
    default:
      "Got unsupported number of lanes in machine-combiner gather pattern");
    }
    // Immediate offset load
    return BuildMI(MF, MIMetadata(Root), TII->get(Opcode), DestReg)
        .addReg(OffsetReg)
        .addImm(0);
  };

  // Load the remaining lanes into register 0.
  auto LanesToLoadToReg0 =
      llvm::make_range(LoadToLaneInstrsAscending.begin() + 1,
                       LoadToLaneInstrsAscending.begin() + NumLanes / 2);
  Register PrevReg = SubregToReg->getOperand(0).getReg();
  for (auto [Index, LoadInstr] : llvm::enumerate(LanesToLoadToReg0)) {
    const MachineOperand &OffsetRegOperand = LoadInstr->getOperand(3);
    PrevReg = CreateLD1Instruction(LoadInstr, PrevReg, Index + 1,
                                   OffsetRegOperand.getReg(),
                                   OffsetRegOperand.isKill());
    DelInstrs.push_back(LoadInstr);
  }
  Register LastLoadReg0 = PrevReg;

  // First load into register 1. Perform an integer load to zero out the upper
  // lanes in a single instruction.
  MachineInstr *Lane0Load = *LoadToLaneInstrsAscending.begin();
  MachineInstr *OriginalSplitLoad =
      *std::next(LoadToLaneInstrsAscending.begin(), NumLanes / 2);
  Register DestRegForMiddleIndex = MRI.createVirtualRegister(
      MRI.getRegClass(Lane0Load->getOperand(0).getReg()));

  const MachineOperand &OriginalSplitToLoadOffsetOperand =
      OriginalSplitLoad->getOperand(3);
  MachineInstrBuilder MiddleIndexLoadInstr =
      CreateLDRInstruction(NumLanes, DestRegForMiddleIndex,
                           OriginalSplitToLoadOffsetOperand.getReg(),
                           OriginalSplitToLoadOffsetOperand.isKill());

  InstrIdxForVirtReg.insert(
      std::make_pair(DestRegForMiddleIndex, InsInstrs.size()));
  InsInstrs.push_back(MiddleIndexLoadInstr);
  DelInstrs.push_back(OriginalSplitLoad);

  // Subreg To Reg instruction for register 1.
  Register DestRegForSubregToReg = MRI.createVirtualRegister(FPR128RegClass);
  // Pick the sub-register index matching the scalar element width.
  unsigned SubregType;
  switch (NumLanes) {
  case 4:
    SubregType = AArch64::ssub;
    break;
  case 8:
    SubregType = AArch64::hsub;
    break;
  case 16:
    SubregType = AArch64::bsub;
    break;
  default:
    "Got invalid NumLanes for machine-combiner gather pattern");
  }

  auto SubRegToRegInstr =
      BuildMI(MF, MIMetadata(Root), TII->get(SubregToReg->getOpcode()),
              DestRegForSubregToReg)
          .addReg(DestRegForMiddleIndex, getKillRegState(true))
          .addImm(SubregType);
  InstrIdxForVirtReg.insert(
      std::make_pair(DestRegForSubregToReg, InsInstrs.size()));
  InsInstrs.push_back(SubRegToRegInstr);

  // Load remaining lanes into register 1.
  auto LanesToLoadToReg1 =
      llvm::make_range(LoadToLaneInstrsAscending.begin() + NumLanes / 2 + 1,
                       LoadToLaneInstrsAscending.end());
  PrevReg = SubRegToRegInstr->getOperand(0).getReg();
  for (auto [Index, LoadInstr] : llvm::enumerate(LanesToLoadToReg1)) {
    const MachineOperand &OffsetRegOperand = LoadInstr->getOperand(3);
    PrevReg = CreateLD1Instruction(LoadInstr, PrevReg, Index + 1,
                                   OffsetRegOperand.getReg(),
                                   OffsetRegOperand.isKill());

    // Do not add the last reg to DelInstrs - it will be removed later.
    if (Index == NumLanes / 2 - 2) {
      break;
    }
    DelInstrs.push_back(LoadInstr);
  }
  Register LastLoadReg1 = PrevReg;

  // Create the final zip instruction to combine the results.
  MachineInstrBuilder ZipInstr =
      BuildMI(MF, MIMetadata(Root), TII->get(AArch64::ZIP1v2i64),
              Root.getOperand(0).getReg())
          .addReg(LastLoadReg0)
          .addReg(LastLoadReg1);
  InsInstrs.push_back(ZipInstr);
}
8323
8337
8338/// Return true when there is potentially a faster code sequence for an
8339/// instruction chain ending in \p Root. All potential patterns are listed in
8340/// the \p Pattern vector. Pattern should be sorted in priority order since the
8341/// pattern evaluator stops checking as soon as it finds a faster sequence.
8342
8343bool AArch64InstrInfo::getMachineCombinerPatterns(
8344 MachineInstr &Root, SmallVectorImpl<unsigned> &Patterns,
8345 bool DoRegPressureReduce) const {
8346 // Integer patterns
8347 if (getMaddPatterns(Root, Patterns))
8348 return true;
8349 // Floating point patterns
8350 if (getFMULPatterns(Root, Patterns))
8351 return true;
8352 if (getFMAPatterns(Root, Patterns))
8353 return true;
8354 if (getFNEGPatterns(Root, Patterns))
8355 return true;
8356
8357 // Other patterns
8358 if (getMiscPatterns(Root, Patterns))
8359 return true;
8360
8361 // Load patterns
8362 if (getLoadPatterns(Root, Patterns))
8363 return true;
8364
8365 return TargetInstrInfo::getMachineCombinerPatterns(Root, Patterns,
8366 DoRegPressureReduce);
8367}
8368
/// genFusedMultiply - Generate fused multiply instructions.
/// This function supports both integer and floating point instructions.
/// A typical example:
/// F|MUL I=A,B,0
/// F|ADD R,I,C
/// ==> F|MADD R,A,B,C
/// \param MF Containing MachineFunction
/// \param MRI Register information
/// \param TII Target information
/// \param Root is the F|ADD instruction
/// \param [out] InsInstrs is a vector of machine instructions and will
/// contain the generated madd instruction
/// \param IdxMulOpd is index of operand in Root that is the result of
/// the F|MUL. In the example above IdxMulOpd is 1.
/// \param MaddOpc the opcode of the f|madd instruction
/// \param RC Register class of operands
/// \param kind Kind of fma instruction (addressing mode) to be generated
/// \param ReplacedAddend is the result register from the instruction
/// replacing the non-combined operand, if any.
static MachineInstr *
    const TargetInstrInfo *TII, MachineInstr &Root,
    SmallVectorImpl<MachineInstr *> &InsInstrs, unsigned IdxMulOpd,
    unsigned MaddOpc, const TargetRegisterClass *RC,
    const Register *ReplacedAddend = nullptr) {
  assert(IdxMulOpd == 1 || IdxMulOpd == 2);

  // The addend is whichever Root operand is not the multiply result.
  unsigned IdxOtherOpd = IdxMulOpd == 1 ? 2 : 1;
  MachineInstr *MUL = MRI.getUniqueVRegDef(Root.getOperand(IdxMulOpd).getReg());
  Register ResultReg = Root.getOperand(0).getReg();
  Register SrcReg0 = MUL->getOperand(1).getReg();
  bool Src0IsKill = MUL->getOperand(1).isKill();
  Register SrcReg1 = MUL->getOperand(2).getReg();
  bool Src1IsKill = MUL->getOperand(2).isKill();

  Register SrcReg2;
  bool Src2IsKill;
  if (ReplacedAddend) {
    // If we just generated a new addend, we must be its only use.
    SrcReg2 = *ReplacedAddend;
    Src2IsKill = true;
  } else {
    SrcReg2 = Root.getOperand(IdxOtherOpd).getReg();
    Src2IsKill = Root.getOperand(IdxOtherOpd).isKill();
  }

  // Constrain all virtual registers involved to the requested class so the
  // new instruction's operand constraints are satisfied.
  if (ResultReg.isVirtual())
    MRI.constrainRegClass(ResultReg, RC);
  if (SrcReg0.isVirtual())
    MRI.constrainRegClass(SrcReg0, RC);
  if (SrcReg1.isVirtual())
    MRI.constrainRegClass(SrcReg1, RC);
  if (SrcReg2.isVirtual())
    MRI.constrainRegClass(SrcReg2, RC);

  // Operand order differs per kind: Default is (mul0, mul1, addend);
  // Indexed and Accumulator put the accumulator first, and Indexed also
  // copies the lane immediate from the original multiply.
  if (kind == FMAInstKind::Default)
    MIB = BuildMI(MF, MIMetadata(Root), TII->get(MaddOpc), ResultReg)
              .addReg(SrcReg0, getKillRegState(Src0IsKill))
              .addReg(SrcReg1, getKillRegState(Src1IsKill))
              .addReg(SrcReg2, getKillRegState(Src2IsKill));
  else if (kind == FMAInstKind::Indexed)
    MIB = BuildMI(MF, MIMetadata(Root), TII->get(MaddOpc), ResultReg)
              .addReg(SrcReg2, getKillRegState(Src2IsKill))
              .addReg(SrcReg0, getKillRegState(Src0IsKill))
              .addReg(SrcReg1, getKillRegState(Src1IsKill))
              .addImm(MUL->getOperand(3).getImm());
  else if (kind == FMAInstKind::Accumulator)
    MIB = BuildMI(MF, MIMetadata(Root), TII->get(MaddOpc), ResultReg)
              .addReg(SrcReg2, getKillRegState(Src2IsKill))
              .addReg(SrcReg0, getKillRegState(Src0IsKill))
              .addReg(SrcReg1, getKillRegState(Src1IsKill));
  else
    assert(false && "Invalid FMA instruction kind \n");
  // Insert the MADD (MADD, FMA, FMS, FMLA, FMSL)
  InsInstrs.push_back(MIB);
  return MUL;
}
8449
/// Generate a negated fused multiply-add. Root is expected to be the FNEG
/// matched by getFNEGPatterns, whose input is an FMADD: the pair is replaced
/// by a single scalar FNMADD. Returns the FMADD that was combined, or
/// nullptr if its register class is neither FPR32 nor FPR64.
static MachineInstr *
    const TargetInstrInfo *TII, MachineInstr &Root,
  MachineInstr *MAD = MRI.getUniqueVRegDef(Root.getOperand(1).getReg());

  unsigned Opc = 0;
  // Pick the FNMADD variant matching the FMADD's register width.
  const TargetRegisterClass *RC = MRI.getRegClass(MAD->getOperand(0).getReg());
  if (AArch64::FPR32RegClass.hasSubClassEq(RC))
    Opc = AArch64::FNMADDSrrr;
  else if (AArch64::FPR64RegClass.hasSubClassEq(RC))
    Opc = AArch64::FNMADDDrrr;
  else
    return nullptr;

  Register ResultReg = Root.getOperand(0).getReg();
  Register SrcReg0 = MAD->getOperand(1).getReg();
  Register SrcReg1 = MAD->getOperand(2).getReg();
  Register SrcReg2 = MAD->getOperand(3).getReg();
  bool Src0IsKill = MAD->getOperand(1).isKill();
  bool Src1IsKill = MAD->getOperand(2).isKill();
  bool Src2IsKill = MAD->getOperand(3).isKill();
  // Constrain every virtual register involved to the FMADD's class.
  if (ResultReg.isVirtual())
    MRI.constrainRegClass(ResultReg, RC);
  if (SrcReg0.isVirtual())
    MRI.constrainRegClass(SrcReg0, RC);
  if (SrcReg1.isVirtual())
    MRI.constrainRegClass(SrcReg1, RC);
  if (SrcReg2.isVirtual())
    MRI.constrainRegClass(SrcReg2, RC);

    BuildMI(MF, MIMetadata(Root), TII->get(Opc), ResultReg)
        .addReg(SrcReg0, getKillRegState(Src0IsKill))
        .addReg(SrcReg1, getKillRegState(Src1IsKill))
        .addReg(SrcReg2, getKillRegState(Src2IsKill));
  InsInstrs.push_back(MIB);

  return MAD;
}
8490
/// Fold (FMUL x (DUP y lane)) into (FMUL_indexed x y lane)
/// \param Root the FMUL instruction being rewritten
/// \param IdxDupOp index (1 or 2) of the Root operand defined by the DUP
/// \param MulOpc opcode of the indexed multiply to emit
/// \param RC register class to constrain the DUP source register to
/// \return &Root, the instruction being replaced
static MachineInstr *
    unsigned IdxDupOp, unsigned MulOpc,
    const TargetRegisterClass *RC, MachineRegisterInfo &MRI) {
  assert(((IdxDupOp == 1) || (IdxDupOp == 2)) &&
         "Invalid index of FMUL operand");

  MachineFunction &MF = *Root.getMF();

  MachineInstr *Dup =
      MF.getRegInfo().getUniqueVRegDef(Root.getOperand(IdxDupOp).getReg());

  // Look through a COPY of the DUP result.
  if (Dup->getOpcode() == TargetOpcode::COPY)
    Dup = MRI.getUniqueVRegDef(Dup->getOperand(1).getReg());

  // The DUP source now lives until the new multiply; any old kill flags on
  // it are stale.
  Register DupSrcReg = Dup->getOperand(1).getReg();
  MRI.clearKillFlags(DupSrcReg);
  MRI.constrainRegClass(DupSrcReg, RC);

  unsigned DupSrcLane = Dup->getOperand(2).getImm();

  // The multiply's other (non-DUP) operand is carried over unchanged.
  unsigned IdxMulOp = IdxDupOp == 1 ? 2 : 1;
  MachineOperand &MulOp = Root.getOperand(IdxMulOp);

  Register ResultReg = Root.getOperand(0).getReg();

  MIB = BuildMI(MF, MIMetadata(Root), TII->get(MulOpc), ResultReg)
            .add(MulOp)
            .addReg(DupSrcReg)
            .addImm(DupSrcLane);

  InsInstrs.push_back(MIB);
  return &Root;
}
8529
/// genFusedMultiplyAcc - Helper to generate fused multiply accumulate
/// instructions.
///
/// \see genFusedMultiply
    unsigned IdxMulOpd, unsigned MaddOpc, const TargetRegisterClass *RC) {
  // Thin wrapper: same as genFusedMultiply but fixed to the Accumulator
  // (accumulator-first) operand form.
  return genFusedMultiply(MF, MRI, TII, Root, InsInstrs, IdxMulOpd, MaddOpc, RC,
}
8541
/// genNeg - Helper to generate an intermediate negation of the second operand
/// of Root
/// \return the virtual register holding the negated value; the new negate
/// instruction is appended to \p InsInstrs and recorded at index 0 in
/// \p InstrIdxForVirtReg.
    const TargetInstrInfo *TII, MachineInstr &Root,
    DenseMap<Register, unsigned> &InstrIdxForVirtReg,
    unsigned MnegOpc, const TargetRegisterClass *RC) {
  Register NewVR = MRI.createVirtualRegister(RC);
    BuildMI(MF, MIMetadata(Root), TII->get(MnegOpc), NewVR)
        .add(Root.getOperand(2));
  InsInstrs.push_back(MIB);

  // This helper must run first: the negate is expected to be the only new
  // instruction recorded so far, hence index 0.
  assert(InstrIdxForVirtReg.empty());
  InstrIdxForVirtReg.insert(std::make_pair(NewVR, 0));

  return NewVR;
}
8560
/// genFusedMultiplyAccNeg - Helper to generate fused multiply accumulate
/// instructions with an additional negation of the accumulator
    DenseMap<Register, unsigned> &InstrIdxForVirtReg, unsigned IdxMulOpd,
    unsigned MaddOpc, unsigned MnegOpc, const TargetRegisterClass *RC) {
  // Only the form with the multiply as Root's first operand is supported.
  assert(IdxMulOpd == 1);

  // Negate the accumulator first, then fuse it with the multiply.
  Register NewVR =
      genNeg(MF, MRI, TII, Root, InsInstrs, InstrIdxForVirtReg, MnegOpc, RC);
  return genFusedMultiply(MF, MRI, TII, Root, InsInstrs, IdxMulOpd, MaddOpc, RC,
                          FMAInstKind::Accumulator, &NewVR);
}
8575
/// genFusedMultiplyIdx - Helper to generate fused multiply accumulate
/// instructions.
///
/// \see genFusedMultiply
    unsigned IdxMulOpd, unsigned MaddOpc, const TargetRegisterClass *RC) {
  // Thin wrapper: same as genFusedMultiply but fixed to the Indexed
  // (by-lane) operand form.
  return genFusedMultiply(MF, MRI, TII, Root, InsInstrs, IdxMulOpd, MaddOpc, RC,
}
8587
/// genFusedMultiplyIdxNeg - Helper to generate fused multiply accumulate
/// instructions (indexed form) with an additional negation of the
/// accumulator. (The header previously said "genFusedMultiplyAccNeg" — a
/// copy-paste of the Acc variant above; this helper uses FMAInstKind::Indexed.)
    DenseMap<Register, unsigned> &InstrIdxForVirtReg, unsigned IdxMulOpd,
    unsigned MaddOpc, unsigned MnegOpc, const TargetRegisterClass *RC) {
  // Only the form with the multiply as Root's first operand is supported.
  assert(IdxMulOpd == 1);

  // Negate the accumulator first, then fuse it with the indexed multiply.
  Register NewVR =
      genNeg(MF, MRI, TII, Root, InsInstrs, InstrIdxForVirtReg, MnegOpc, RC);

  return genFusedMultiply(MF, MRI, TII, Root, InsInstrs, IdxMulOpd, MaddOpc, RC,
                          FMAInstKind::Indexed, &NewVR);
}
8603
/// genMaddR - Generate madd instruction and combine mul and add using
/// an extra virtual register
/// Example - an ADD intermediate needs to be stored in a register:
/// MUL I=A,B,0
/// ADD R,I,Imm
/// ==> ORR V, ZR, Imm
/// ==> MADD R,A,B,V
/// \param MF Containing MachineFunction
/// \param MRI Register information
/// \param TII Target information
/// \param Root is the ADD instruction
/// \param [out] InsInstrs is a vector of machine instructions and will
/// contain the generated madd instruction
/// \param IdxMulOpd is index of operand in Root that is the result of
/// the MUL. In the example above IdxMulOpd is 1.
/// \param MaddOpc the opcode of the madd instruction
/// \param VR is a virtual register that holds the value of an ADD operand
/// (V in the example above).
/// \param RC Register class of operands
    const TargetInstrInfo *TII, MachineInstr &Root,
    unsigned IdxMulOpd, unsigned MaddOpc, unsigned VR,
    const TargetRegisterClass *RC) {
  assert(IdxMulOpd == 1 || IdxMulOpd == 2);

  MachineInstr *MUL = MRI.getUniqueVRegDef(Root.getOperand(IdxMulOpd).getReg());
  Register ResultReg = Root.getOperand(0).getReg();
  Register SrcReg0 = MUL->getOperand(1).getReg();
  bool Src0IsKill = MUL->getOperand(1).isKill();
  Register SrcReg1 = MUL->getOperand(2).getReg();
  bool Src1IsKill = MUL->getOperand(2).isKill();

  // Constrain all virtual registers involved to the requested class.
  if (ResultReg.isVirtual())
    MRI.constrainRegClass(ResultReg, RC);
  if (SrcReg0.isVirtual())
    MRI.constrainRegClass(SrcReg0, RC);
  if (SrcReg1.isVirtual())
    MRI.constrainRegClass(SrcReg1, RC);
    MRI.constrainRegClass(VR, RC);

    BuildMI(MF, MIMetadata(Root), TII->get(MaddOpc), ResultReg)
        .addReg(SrcReg0, getKillRegState(Src0IsKill))
        .addReg(SrcReg1, getKillRegState(Src1IsKill))
        .addReg(VR);
  // Insert the MADD
  InsInstrs.push_back(MIB);
  return MUL;
}
8655
/// Do the following transformation
/// A - (B + C) ==> (A - B) - C
/// A - (B + C) ==> (A - C) - B
/// \param IdxOpd1 index (1 or 2) of the ADD operand that is subtracted
/// first; the other ADD operand is subtracted second.
/// The two new SUBs go to \p InsInstrs; the original ADD and Root go to
/// \p DelInstrs.
    const TargetInstrInfo *TII, MachineInstr &Root,
    unsigned IdxOpd1,
    DenseMap<Register, unsigned> &InstrIdxForVirtReg) {
  assert(IdxOpd1 == 1 || IdxOpd1 == 2);
  unsigned IdxOtherOpd = IdxOpd1 == 1 ? 2 : 1;
  MachineInstr *AddMI = MRI.getUniqueVRegDef(Root.getOperand(2).getReg());

  Register ResultReg = Root.getOperand(0).getReg();
  Register RegA = Root.getOperand(1).getReg();
  bool RegAIsKill = Root.getOperand(1).isKill();
  Register RegB = AddMI->getOperand(IdxOpd1).getReg();
  bool RegBIsKill = AddMI->getOperand(IdxOpd1).isKill();
  Register RegC = AddMI->getOperand(IdxOtherOpd).getReg();
  bool RegCIsKill = AddMI->getOperand(IdxOtherOpd).isKill();
  Register NewVR =

  // Flag-setting roots are rewritten to the plain SUB; the pattern is only
  // matched when the NZCV def is dead (see getMiscPatterns).
  unsigned Opcode = Root.getOpcode();
  if (Opcode == AArch64::SUBSWrr)
    Opcode = AArch64::SUBWrr;
  else if (Opcode == AArch64::SUBSXrr)
    Opcode = AArch64::SUBXrr;
  else
    assert((Opcode == AArch64::SUBWrr || Opcode == AArch64::SUBXrr) &&
           "Unexpected instruction opcode.");

  // Reassociation changes the intermediate values, so the no-wrap flags of
  // the original instructions are no longer valid on the new ones.
  uint32_t Flags = Root.mergeFlagsWith(*AddMI);
  Flags &= ~MachineInstr::NoSWrap;
  Flags &= ~MachineInstr::NoUWrap;

  // NewVR = RegA - RegB
  MachineInstrBuilder MIB1 =
      BuildMI(MF, MIMetadata(Root), TII->get(Opcode), NewVR)
          .addReg(RegA, getKillRegState(RegAIsKill))
          .addReg(RegB, getKillRegState(RegBIsKill))
          .setMIFlags(Flags);
  // ResultReg = NewVR - RegC
  MachineInstrBuilder MIB2 =
      BuildMI(MF, MIMetadata(Root), TII->get(Opcode), ResultReg)
          .addReg(NewVR, getKillRegState(true))
          .addReg(RegC, getKillRegState(RegCIsKill))
          .setMIFlags(Flags);

  InstrIdxForVirtReg.insert(std::make_pair(NewVR, 0));
  InsInstrs.push_back(MIB1);
  InsInstrs.push_back(MIB2);
  DelInstrs.push_back(AddMI);
  DelInstrs.push_back(&Root);
}
8709
8710unsigned AArch64InstrInfo::getReduceOpcodeForAccumulator(
8711 unsigned int AccumulatorOpCode) const {
8712 switch (AccumulatorOpCode) {
8713 case AArch64::UABALB_ZZZ_D:
8714 case AArch64::SABALB_ZZZ_D:
8715 case AArch64::UABALT_ZZZ_D:
8716 case AArch64::SABALT_ZZZ_D:
8717 return AArch64::ADD_ZZZ_D;
8718 case AArch64::UABALB_ZZZ_H:
8719 case AArch64::SABALB_ZZZ_H:
8720 case AArch64::UABALT_ZZZ_H:
8721 case AArch64::SABALT_ZZZ_H:
8722 return AArch64::ADD_ZZZ_H;
8723 case AArch64::UABALB_ZZZ_S:
8724 case AArch64::SABALB_ZZZ_S:
8725 case AArch64::UABALT_ZZZ_S:
8726 case AArch64::SABALT_ZZZ_S:
8727 return AArch64::ADD_ZZZ_S;
8728 case AArch64::UABALv16i8_v8i16:
8729 case AArch64::SABALv8i8_v8i16:
8730 case AArch64::SABAv8i16:
8731 case AArch64::UABAv8i16:
8732 return AArch64::ADDv8i16;
8733 case AArch64::SABALv2i32_v2i64:
8734 case AArch64::UABALv2i32_v2i64:
8735 case AArch64::SABALv4i32_v2i64:
8736 return AArch64::ADDv2i64;
8737 case AArch64::UABALv4i16_v4i32:
8738 case AArch64::SABALv4i16_v4i32:
8739 case AArch64::SABALv8i16_v4i32:
8740 case AArch64::SABAv4i32:
8741 case AArch64::UABAv4i32:
8742 return AArch64::ADDv4i32;
8743 case AArch64::UABALv4i32_v2i64:
8744 return AArch64::ADDv2i64;
8745 case AArch64::UABALv8i16_v4i32:
8746 return AArch64::ADDv4i32;
8747 case AArch64::UABALv8i8_v8i16:
8748 case AArch64::SABALv16i8_v8i16:
8749 return AArch64::ADDv8i16;
8750 case AArch64::UABAv16i8:
8751 case AArch64::SABAv16i8:
8752 return AArch64::ADDv16i8;
8753 case AArch64::UABAv4i16:
8754 case AArch64::SABAv4i16:
8755 return AArch64::ADDv4i16;
8756 case AArch64::UABAv2i32:
8757 case AArch64::SABAv2i32:
8758 return AArch64::ADDv2i32;
8759 case AArch64::UABAv8i8:
8760 case AArch64::SABAv8i8:
8761 return AArch64::ADDv8i8;
8762 default:
8763 llvm_unreachable("Unknown accumulator opcode");
8764 }
8765}
8766
8767/// When getMachineCombinerPatterns() finds potential patterns,
8768/// this function generates the instructions that could replace the
8769/// original code sequence
8770void AArch64InstrInfo::genAlternativeCodeSequence(
8771 MachineInstr &Root, unsigned Pattern,
8774 DenseMap<Register, unsigned> &InstrIdxForVirtReg) const {
8775 MachineBasicBlock &MBB = *Root.getParent();
8776 MachineRegisterInfo &MRI = MBB.getParent()->getRegInfo();
8777 MachineFunction &MF = *MBB.getParent();
8778 const TargetInstrInfo *TII = MF.getSubtarget().getInstrInfo();
8779
8780 MachineInstr *MUL = nullptr;
8781 const TargetRegisterClass *RC;
8782 unsigned Opc;
8783 switch (Pattern) {
8784 default:
8785 // Reassociate instructions.
8786 TargetInstrInfo::genAlternativeCodeSequence(Root, Pattern, InsInstrs,
8787 DelInstrs, InstrIdxForVirtReg);
8788 return;
8790 // A - (B + C)
8791 // ==> (A - B) - C
8792 genSubAdd2SubSub(MF, MRI, TII, Root, InsInstrs, DelInstrs, 1,
8793 InstrIdxForVirtReg);
8794 return;
8796 // A - (B + C)
8797 // ==> (A - C) - B
8798 genSubAdd2SubSub(MF, MRI, TII, Root, InsInstrs, DelInstrs, 2,
8799 InstrIdxForVirtReg);
8800 return;
8803 // MUL I=A,B,0
8804 // ADD R,I,C
8805 // ==> MADD R,A,B,C
8806 // --- Create(MADD);
8808 Opc = AArch64::MADDWrrr;
8809 RC = &AArch64::GPR32RegClass;
8810 } else {
8811 Opc = AArch64::MADDXrrr;
8812 RC = &AArch64::GPR64RegClass;
8813 }
8814 MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 1, Opc, RC);
8815 break;
8818 // MUL I=A,B,0
8819 // ADD R,C,I
8820 // ==> MADD R,A,B,C
8821 // --- Create(MADD);
8823 Opc = AArch64::MADDWrrr;
8824 RC = &AArch64::GPR32RegClass;
8825 } else {
8826 Opc = AArch64::MADDXrrr;
8827 RC = &AArch64::GPR64RegClass;
8828 }
8829 MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 2, Opc, RC);
8830 break;
8835 // MUL I=A,B,0
8836 // ADD/SUB R,I,Imm
8837 // ==> MOV V, Imm/-Imm
8838 // ==> MADD R,A,B,V
8839 // --- Create(MADD);
8840 const TargetRegisterClass *RC;
8841 unsigned BitSize, MovImm;
8844 MovImm = AArch64::MOVi32imm;
8845 RC = &AArch64::GPR32spRegClass;
8846 BitSize = 32;
8847 Opc = AArch64::MADDWrrr;
8848 RC = &AArch64::GPR32RegClass;
8849 } else {
8850 MovImm = AArch64::MOVi64imm;
8851 RC = &AArch64::GPR64spRegClass;
8852 BitSize = 64;
8853 Opc = AArch64::MADDXrrr;
8854 RC = &AArch64::GPR64RegClass;
8855 }
8856 Register NewVR = MRI.createVirtualRegister(RC);
8857 uint64_t Imm = Root.getOperand(2).getImm();
8858
8859 if (Root.getOperand(3).isImm()) {
8860 unsigned Val = Root.getOperand(3).getImm();
8861 Imm = Imm << Val;
8862 }
8863 bool IsSub = Pattern == AArch64MachineCombinerPattern::MULSUBWI_OP1 ||
8865 uint64_t UImm = SignExtend64(IsSub ? -Imm : Imm, BitSize);
8866 // Check that the immediate can be composed via a single instruction.
8868 AArch64_IMM::expandMOVImm(UImm, BitSize, Insn);
8869 if (Insn.size() != 1)
8870 return;
8871 MachineInstrBuilder MIB1 =
8872 BuildMI(MF, MIMetadata(Root), TII->get(MovImm), NewVR)
8873 .addImm(IsSub ? -Imm : Imm);
8874 InsInstrs.push_back(MIB1);
8875 InstrIdxForVirtReg.insert(std::make_pair(NewVR, 0));
8876 MUL = genMaddR(MF, MRI, TII, Root, InsInstrs, 1, Opc, NewVR, RC);
8877 break;
8878 }
8881 // MUL I=A,B,0
8882 // SUB R,I, C
8883 // ==> SUB V, 0, C
8884 // ==> MADD R,A,B,V // = -C + A*B
8885 // --- Create(MADD);
8886 const TargetRegisterClass *SubRC;
8887 unsigned SubOpc, ZeroReg;
8889 SubOpc = AArch64::SUBWrr;
8890 SubRC = &AArch64::GPR32spRegClass;
8891 ZeroReg = AArch64::WZR;
8892 Opc = AArch64::MADDWrrr;
8893 RC = &AArch64::GPR32RegClass;
8894 } else {
8895 SubOpc = AArch64::SUBXrr;
8896 SubRC = &AArch64::GPR64spRegClass;
8897 ZeroReg = AArch64::XZR;
8898 Opc = AArch64::MADDXrrr;
8899 RC = &AArch64::GPR64RegClass;
8900 }
8901 Register NewVR = MRI.createVirtualRegister(SubRC);
8902 // SUB NewVR, 0, C
8903 MachineInstrBuilder MIB1 =
8904 BuildMI(MF, MIMetadata(Root), TII->get(SubOpc), NewVR)
8905 .addReg(ZeroReg)
8906 .add(Root.getOperand(2));
8907 InsInstrs.push_back(MIB1);
8908 InstrIdxForVirtReg.insert(std::make_pair(NewVR, 0));
8909 MUL = genMaddR(MF, MRI, TII, Root, InsInstrs, 1, Opc, NewVR, RC);
8910 break;
8911 }
8914 // MUL I=A,B,0
8915 // SUB R,C,I
8916 // ==> MSUB R,A,B,C (computes C - A*B)
8917 // --- Create(MSUB);
8919 Opc = AArch64::MSUBWrrr;
8920 RC = &AArch64::GPR32RegClass;
8921 } else {
8922 Opc = AArch64::MSUBXrrr;
8923 RC = &AArch64::GPR64RegClass;
8924 }
8925 MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 2, Opc, RC);
8926 break;
8928 Opc = AArch64::MLAv8i8;
8929 RC = &AArch64::FPR64RegClass;
8930 MUL = genFusedMultiplyAcc(MF, MRI, TII, Root, InsInstrs, 1, Opc, RC);
8931 break;
8933 Opc = AArch64::MLAv8i8;
8934 RC = &AArch64::FPR64RegClass;
8935 MUL = genFusedMultiplyAcc(MF, MRI, TII, Root, InsInstrs, 2, Opc, RC);
8936 break;
8938 Opc = AArch64::MLAv16i8;
8939 RC = &AArch64::FPR128RegClass;
8940 MUL = genFusedMultiplyAcc(MF, MRI, TII, Root, InsInstrs, 1, Opc, RC);
8941 break;
8943 Opc = AArch64::MLAv16i8;
8944 RC = &AArch64::FPR128RegClass;
8945 MUL = genFusedMultiplyAcc(MF, MRI, TII, Root, InsInstrs, 2, Opc, RC);
8946 break;
8948 Opc = AArch64::MLAv4i16;
8949 RC = &AArch64::FPR64RegClass;
8950 MUL = genFusedMultiplyAcc(MF, MRI, TII, Root, InsInstrs, 1, Opc, RC);
8951 break;
8953 Opc = AArch64::MLAv4i16;
8954 RC = &AArch64::FPR64RegClass;
8955 MUL = genFusedMultiplyAcc(MF, MRI, TII, Root, InsInstrs, 2, Opc, RC);
8956 break;
8958 Opc = AArch64::MLAv8i16;
8959 RC = &AArch64::FPR128RegClass;
8960 MUL = genFusedMultiplyAcc(MF, MRI, TII, Root, InsInstrs, 1, Opc, RC);
8961 break;
8963 Opc = AArch64::MLAv8i16;
8964 RC = &AArch64::FPR128RegClass;
8965 MUL = genFusedMultiplyAcc(MF, MRI, TII, Root, InsInstrs, 2, Opc, RC);
8966 break;
8968 Opc = AArch64::MLAv2i32;
8969 RC = &AArch64::FPR64RegClass;
8970 MUL = genFusedMultiplyAcc(MF, MRI, TII, Root, InsInstrs, 1, Opc, RC);
8971 break;
8973 Opc = AArch64::MLAv2i32;
8974 RC = &AArch64::FPR64RegClass;
8975 MUL = genFusedMultiplyAcc(MF, MRI, TII, Root, InsInstrs, 2, Opc, RC);
8976 break;
8978 Opc = AArch64::MLAv4i32;
8979 RC = &AArch64::FPR128RegClass;
8980 MUL = genFusedMultiplyAcc(MF, MRI, TII, Root, InsInstrs, 1, Opc, RC);
8981 break;
8983 Opc = AArch64::MLAv4i32;
8984 RC = &AArch64::FPR128RegClass;
8985 MUL = genFusedMultiplyAcc(MF, MRI, TII, Root, InsInstrs, 2, Opc, RC);
8986 break;
8987
8989 Opc = AArch64::MLAv8i8;
8990 RC = &AArch64::FPR64RegClass;
8991 MUL = genFusedMultiplyAccNeg(MF, MRI, TII, Root, InsInstrs,
8992 InstrIdxForVirtReg, 1, Opc, AArch64::NEGv8i8,
8993 RC);
8994 break;
8996 Opc = AArch64::MLSv8i8;
8997 RC = &AArch64::FPR64RegClass;
8998 MUL = genFusedMultiplyAcc(MF, MRI, TII, Root, InsInstrs, 2, Opc, RC);
8999 break;
9001 Opc = AArch64::MLAv16i8;
9002 RC = &AArch64::FPR128RegClass;
9003 MUL = genFusedMultiplyAccNeg(MF, MRI, TII, Root, InsInstrs,
9004 InstrIdxForVirtReg, 1, Opc, AArch64::NEGv16i8,
9005 RC);
9006 break;
9008 Opc = AArch64::MLSv16i8;
9009 RC = &AArch64::FPR128RegClass;
9010 MUL = genFusedMultiplyAcc(MF, MRI, TII, Root, InsInstrs, 2, Opc, RC);
9011 break;
9013 Opc = AArch64::MLAv4i16;
9014 RC = &AArch64::FPR64RegClass;
9015 MUL = genFusedMultiplyAccNeg(MF, MRI, TII, Root, InsInstrs,
9016 InstrIdxForVirtReg, 1, Opc, AArch64::NEGv4i16,
9017 RC);
9018 break;
9020 Opc = AArch64::MLSv4i16;
9021 RC = &AArch64::FPR64RegClass;
9022 MUL = genFusedMultiplyAcc(MF, MRI, TII, Root, InsInstrs, 2, Opc, RC);
9023 break;
9025 Opc = AArch64::MLAv8i16;
9026 RC = &AArch64::FPR128RegClass;
9027 MUL = genFusedMultiplyAccNeg(MF, MRI, TII, Root, InsInstrs,
9028 InstrIdxForVirtReg, 1, Opc, AArch64::NEGv8i16,
9029 RC);
9030 break;
9032 Opc = AArch64::MLSv8i16;
9033 RC = &AArch64::FPR128RegClass;
9034 MUL = genFusedMultiplyAcc(MF, MRI, TII, Root, InsInstrs, 2, Opc, RC);
9035 break;
9037 Opc = AArch64::MLAv2i32;
9038 RC = &AArch64::FPR64RegClass;
9039 MUL = genFusedMultiplyAccNeg(MF, MRI, TII, Root, InsInstrs,
9040 InstrIdxForVirtReg, 1, Opc, AArch64::NEGv2i32,
9041 RC);
9042 break;
9044 Opc = AArch64::MLSv2i32;
9045 RC = &AArch64::FPR64RegClass;
9046 MUL = genFusedMultiplyAcc(MF, MRI, TII, Root, InsInstrs, 2, Opc, RC);
9047 break;
9049 Opc = AArch64::MLAv4i32;
9050 RC = &AArch64::FPR128RegClass;
9051 MUL = genFusedMultiplyAccNeg(MF, MRI, TII, Root, InsInstrs,
9052 InstrIdxForVirtReg, 1, Opc, AArch64::NEGv4i32,
9053 RC);
9054 break;
9056 Opc = AArch64::MLSv4i32;
9057 RC = &AArch64::FPR128RegClass;
9058 MUL = genFusedMultiplyAcc(MF, MRI, TII, Root, InsInstrs, 2, Opc, RC);
9059 break;
9060
9062 Opc = AArch64::MLAv4i16_indexed;
9063 RC = &AArch64::FPR64RegClass;
9064 MUL = genFusedMultiplyIdx(MF, MRI, TII, Root, InsInstrs, 1, Opc, RC);
9065 break;
9067 Opc = AArch64::MLAv4i16_indexed;
9068 RC = &AArch64::FPR64RegClass;
9069 MUL = genFusedMultiplyIdx(MF, MRI, TII, Root, InsInstrs, 2, Opc, RC);
9070 break;
9072 Opc = AArch64::MLAv8i16_indexed;
9073 RC = &AArch64::FPR128RegClass;
9074 MUL = genFusedMultiplyIdx(MF, MRI, TII, Root, InsInstrs, 1, Opc, RC);
9075 break;
9077 Opc = AArch64::MLAv8i16_indexed;
9078 RC = &AArch64::FPR128RegClass;
9079 MUL = genFusedMultiplyIdx(MF, MRI, TII, Root, InsInstrs, 2, Opc, RC);
9080 break;
9082 Opc = AArch64::MLAv2i32_indexed;
9083 RC = &AArch64::FPR64RegClass;
9084 MUL = genFusedMultiplyIdx(MF, MRI, TII, Root, InsInstrs, 1, Opc, RC);
9085 break;
9087 Opc = AArch64::MLAv2i32_indexed;
9088 RC = &AArch64::FPR64RegClass;
9089 MUL = genFusedMultiplyIdx(MF, MRI, TII, Root, InsInstrs, 2, Opc, RC);
9090 break;
9092 Opc = AArch64::MLAv4i32_indexed;
9093 RC = &AArch64::FPR128RegClass;
9094 MUL = genFusedMultiplyIdx(MF, MRI, TII, Root, InsInstrs, 1, Opc, RC);
9095 break;
9097 Opc = AArch64::MLAv4i32_indexed;
9098 RC = &AArch64::FPR128RegClass;
9099 MUL = genFusedMultiplyIdx(MF, MRI, TII, Root, InsInstrs, 2, Opc, RC);
9100 break;
9101
9103 Opc = AArch64::MLAv4i16_indexed;
9104 RC = &AArch64::FPR64RegClass;
9105 MUL = genFusedMultiplyIdxNeg(MF, MRI, TII, Root, InsInstrs,
9106 InstrIdxForVirtReg, 1, Opc, AArch64::NEGv4i16,
9107 RC);
9108 break;
9110 Opc = AArch64::MLSv4i16_indexed;
9111 RC = &AArch64::FPR64RegClass;
9112 MUL = genFusedMultiplyIdx(MF, MRI, TII, Root, InsInstrs, 2, Opc, RC);
9113 break;
9115 Opc = AArch64::MLAv8i16_indexed;
9116 RC = &AArch64::FPR128RegClass;
9117 MUL = genFusedMultiplyIdxNeg(MF, MRI, TII, Root, InsInstrs,
9118 InstrIdxForVirtReg, 1, Opc, AArch64::NEGv8i16,
9119 RC);
9120 break;
9122 Opc = AArch64::MLSv8i16_indexed;
9123 RC = &AArch64::FPR128RegClass;
9124 MUL = genFusedMultiplyIdx(MF, MRI, TII, Root, InsInstrs, 2, Opc, RC);
9125 break;
9127 Opc = AArch64::MLAv2i32_indexed;
9128 RC = &AArch64::FPR64RegClass;
9129 MUL = genFusedMultiplyIdxNeg(MF, MRI, TII, Root, InsInstrs,
9130 InstrIdxForVirtReg, 1, Opc, AArch64::NEGv2i32,
9131 RC);
9132 break;
9134 Opc = AArch64::MLSv2i32_indexed;
9135 RC = &AArch64::FPR64RegClass;
9136 MUL = genFusedMultiplyIdx(MF, MRI, TII, Root, InsInstrs, 2, Opc, RC);
9137 break;
9139 Opc = AArch64::MLAv4i32_indexed;
9140 RC = &AArch64::FPR128RegClass;
9141 MUL = genFusedMultiplyIdxNeg(MF, MRI, TII, Root, InsInstrs,
9142 InstrIdxForVirtReg, 1, Opc, AArch64::NEGv4i32,
9143 RC);
9144 break;
9146 Opc = AArch64::MLSv4i32_indexed;
9147 RC = &AArch64::FPR128RegClass;
9148 MUL = genFusedMultiplyIdx(MF, MRI, TII, Root, InsInstrs, 2, Opc, RC);
9149 break;
9150
9151 // Floating Point Support
9153 Opc = AArch64::FMADDHrrr;
9154 RC = &AArch64::FPR16RegClass;
9155 MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 1, Opc, RC);
9156 break;
9158 Opc = AArch64::FMADDSrrr;
9159 RC = &AArch64::FPR32RegClass;
9160 MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 1, Opc, RC);
9161 break;
9163 Opc = AArch64::FMADDDrrr;
9164 RC = &AArch64::FPR64RegClass;
9165 MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 1, Opc, RC);
9166 break;
9167
9169 Opc = AArch64::FMADDHrrr;
9170 RC = &AArch64::FPR16RegClass;
9171 MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 2, Opc, RC);
9172 break;
9174 Opc = AArch64::FMADDSrrr;
9175 RC = &AArch64::FPR32RegClass;
9176 MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 2, Opc, RC);
9177 break;
9179 Opc = AArch64::FMADDDrrr;
9180 RC = &AArch64::FPR64RegClass;
9181 MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 2, Opc, RC);
9182 break;
9183
9185 Opc = AArch64::FMLAv1i32_indexed;
9186 RC = &AArch64::FPR32RegClass;
9187 MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 1, Opc, RC,
9189 break;
9191 Opc = AArch64::FMLAv1i32_indexed;
9192 RC = &AArch64::FPR32RegClass;
9193 MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 2, Opc, RC,
9195 break;
9196
9198 Opc = AArch64::FMLAv1i64_indexed;
9199 RC = &AArch64::FPR64RegClass;
9200 MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 1, Opc, RC,
9202 break;
9204 Opc = AArch64::FMLAv1i64_indexed;
9205 RC = &AArch64::FPR64RegClass;
9206 MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 2, Opc, RC,
9208 break;
9209
9211 RC = &AArch64::FPR64RegClass;
9212 Opc = AArch64::FMLAv4i16_indexed;
9213 MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 1, Opc, RC,
9215 break;
9217 RC = &AArch64::FPR64RegClass;
9218 Opc = AArch64::FMLAv4f16;
9219 MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 1, Opc, RC,
9221 break;
9223 RC = &AArch64::FPR64RegClass;
9224 Opc = AArch64::FMLAv4i16_indexed;
9225 MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 2, Opc, RC,
9227 break;
9229 RC = &AArch64::FPR64RegClass;
9230 Opc = AArch64::FMLAv4f16;
9231 MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 2, Opc, RC,
9233 break;
9234
9237 RC = &AArch64::FPR64RegClass;
9239 Opc = AArch64::FMLAv2i32_indexed;
9240 MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 1, Opc, RC,
9242 } else {
9243 Opc = AArch64::FMLAv2f32;
9244 MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 1, Opc, RC,
9246 }
9247 break;
9250 RC = &AArch64::FPR64RegClass;
9252 Opc = AArch64::FMLAv2i32_indexed;
9253 MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 2, Opc, RC,
9255 } else {
9256 Opc = AArch64::FMLAv2f32;
9257 MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 2, Opc, RC,
9259 }
9260 break;
9261
9263 RC = &AArch64::FPR128RegClass;
9264 Opc = AArch64::FMLAv8i16_indexed;
9265 MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 1, Opc, RC,
9267 break;
9269 RC = &AArch64::FPR128RegClass;
9270 Opc = AArch64::FMLAv8f16;
9271 MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 1, Opc, RC,
9273 break;
9275 RC = &AArch64::FPR128RegClass;
9276 Opc = AArch64::FMLAv8i16_indexed;
9277 MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 2, Opc, RC,
9279 break;
9281 RC = &AArch64::FPR128RegClass;
9282 Opc = AArch64::FMLAv8f16;
9283 MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 2, Opc, RC,
9285 break;
9286
9289 RC = &AArch64::FPR128RegClass;
9291 Opc = AArch64::FMLAv2i64_indexed;
9292 MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 1, Opc, RC,
9294 } else {
9295 Opc = AArch64::FMLAv2f64;
9296 MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 1, Opc, RC,
9298 }
9299 break;
9302 RC = &AArch64::FPR128RegClass;
9304 Opc = AArch64::FMLAv2i64_indexed;
9305 MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 2, Opc, RC,
9307 } else {
9308 Opc = AArch64::FMLAv2f64;
9309 MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 2, Opc, RC,
9311 }
9312 break;
9313
9316 RC = &AArch64::FPR128RegClass;
9318 Opc = AArch64::FMLAv4i32_indexed;
9319 MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 1, Opc, RC,
9321 } else {
9322 Opc = AArch64::FMLAv4f32;
9323 MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 1, Opc, RC,
9325 }
9326 break;
9327
9330 RC = &AArch64::FPR128RegClass;
9332 Opc = AArch64::FMLAv4i32_indexed;
9333 MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 2, Opc, RC,
9335 } else {
9336 Opc = AArch64::FMLAv4f32;
9337 MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 2, Opc, RC,
9339 }
9340 break;
9341
9343 Opc = AArch64::FNMSUBHrrr;
9344 RC = &AArch64::FPR16RegClass;
9345 MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 1, Opc, RC);
9346 break;
9348 Opc = AArch64::FNMSUBSrrr;
9349 RC = &AArch64::FPR32RegClass;
9350 MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 1, Opc, RC);
9351 break;
9353 Opc = AArch64::FNMSUBDrrr;
9354 RC = &AArch64::FPR64RegClass;
9355 MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 1, Opc, RC);
9356 break;
9357
9359 Opc = AArch64::FNMADDHrrr;
9360 RC = &AArch64::FPR16RegClass;
9361 MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 1, Opc, RC);
9362 break;
9364 Opc = AArch64::FNMADDSrrr;
9365 RC = &AArch64::FPR32RegClass;
9366 MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 1, Opc, RC);
9367 break;
9369 Opc = AArch64::FNMADDDrrr;
9370 RC = &AArch64::FPR64RegClass;
9371 MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 1, Opc, RC);
9372 break;
9373
9375 Opc = AArch64::FMSUBHrrr;
9376 RC = &AArch64::FPR16RegClass;
9377 MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 2, Opc, RC);
9378 break;
9380 Opc = AArch64::FMSUBSrrr;
9381 RC = &AArch64::FPR32RegClass;
9382 MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 2, Opc, RC);
9383 break;
9385 Opc = AArch64::FMSUBDrrr;
9386 RC = &AArch64::FPR64RegClass;
9387 MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 2, Opc, RC);
9388 break;
9389
9391 Opc = AArch64::FMLSv1i32_indexed;
9392 RC = &AArch64::FPR32RegClass;
9393 MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 2, Opc, RC,
9395 break;
9396
9398 Opc = AArch64::FMLSv1i64_indexed;
9399 RC = &AArch64::FPR64RegClass;
9400 MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 2, Opc, RC,
9402 break;
9403
9406 RC = &AArch64::FPR64RegClass;
9407 Register NewVR = MRI.createVirtualRegister(RC);
9408 MachineInstrBuilder MIB1 =
9409 BuildMI(MF, MIMetadata(Root), TII->get(AArch64::FNEGv4f16), NewVR)
9410 .add(Root.getOperand(2));
9411 InsInstrs.push_back(MIB1);
9412 InstrIdxForVirtReg.insert(std::make_pair(NewVR, 0));
9414 Opc = AArch64::FMLAv4f16;
9415 MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 1, Opc, RC,
9416 FMAInstKind::Accumulator, &NewVR);
9417 } else {
9418 Opc = AArch64::FMLAv4i16_indexed;
9419 MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 1, Opc, RC,
9420 FMAInstKind::Indexed, &NewVR);
9421 }
9422 break;
9423 }
9425 RC = &AArch64::FPR64RegClass;
9426 Opc = AArch64::FMLSv4f16;
9427 MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 2, Opc, RC,
9429 break;
9431 RC = &AArch64::FPR64RegClass;
9432 Opc = AArch64::FMLSv4i16_indexed;
9433 MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 2, Opc, RC,
9435 break;
9436
9439 RC = &AArch64::FPR64RegClass;
9441 Opc = AArch64::FMLSv2i32_indexed;
9442 MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 2, Opc, RC,
9444 } else {
9445 Opc = AArch64::FMLSv2f32;
9446 MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 2, Opc, RC,
9448 }
9449 break;
9450
9453 RC = &AArch64::FPR128RegClass;
9454 Register NewVR = MRI.createVirtualRegister(RC);
9455 MachineInstrBuilder MIB1 =
9456 BuildMI(MF, MIMetadata(Root), TII->get(AArch64::FNEGv8f16), NewVR)
9457 .add(Root.getOperand(2));
9458 InsInstrs.push_back(MIB1);
9459 InstrIdxForVirtReg.insert(std::make_pair(NewVR, 0));
9461 Opc = AArch64::FMLAv8f16;
9462 MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 1, Opc, RC,
9463 FMAInstKind::Accumulator, &NewVR);
9464 } else {
9465 Opc = AArch64::FMLAv8i16_indexed;
9466 MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 1, Opc, RC,
9467 FMAInstKind::Indexed, &NewVR);
9468 }
9469 break;
9470 }
9472 RC = &AArch64::FPR128RegClass;
9473 Opc = AArch64::FMLSv8f16;
9474 MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 2, Opc, RC,
9476 break;
9478 RC = &AArch64::FPR128RegClass;
9479 Opc = AArch64::FMLSv8i16_indexed;
9480 MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 2, Opc, RC,
9482 break;
9483
9486 RC = &AArch64::FPR128RegClass;
9488 Opc = AArch64::FMLSv2i64_indexed;
9489 MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 2, Opc, RC,
9491 } else {
9492 Opc = AArch64::FMLSv2f64;
9493 MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 2, Opc, RC,
9495 }
9496 break;
9497
9500 RC = &AArch64::FPR128RegClass;
9502 Opc = AArch64::FMLSv4i32_indexed;
9503 MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 2, Opc, RC,
9505 } else {
9506 Opc = AArch64::FMLSv4f32;
9507 MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 2, Opc, RC,
9509 }
9510 break;
9513 RC = &AArch64::FPR64RegClass;
9514 Register NewVR = MRI.createVirtualRegister(RC);
9515 MachineInstrBuilder MIB1 =
9516 BuildMI(MF, MIMetadata(Root), TII->get(AArch64::FNEGv2f32), NewVR)
9517 .add(Root.getOperand(2));
9518 InsInstrs.push_back(MIB1);
9519 InstrIdxForVirtReg.insert(std::make_pair(NewVR, 0));
9521 Opc = AArch64::FMLAv2i32_indexed;
9522 MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 1, Opc, RC,
9523 FMAInstKind::Indexed, &NewVR);
9524 } else {
9525 Opc = AArch64::FMLAv2f32;
9526 MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 1, Opc, RC,
9527 FMAInstKind::Accumulator, &NewVR);
9528 }
9529 break;
9530 }
9533 RC = &AArch64::FPR128RegClass;
9534 Register NewVR = MRI.createVirtualRegister(RC);
9535 MachineInstrBuilder MIB1 =
9536 BuildMI(MF, MIMetadata(Root), TII->get(AArch64::FNEGv4f32), NewVR)
9537 .add(Root.getOperand(2));
9538 InsInstrs.push_back(MIB1);
9539 InstrIdxForVirtReg.insert(std::make_pair(NewVR, 0));
9541 Opc = AArch64::FMLAv4i32_indexed;
9542 MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 1, Opc, RC,
9543 FMAInstKind::Indexed, &NewVR);
9544 } else {
9545 Opc = AArch64::FMLAv4f32;
9546 MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 1, Opc, RC,
9547 FMAInstKind::Accumulator, &NewVR);
9548 }
9549 break;
9550 }
9553 RC = &AArch64::FPR128RegClass;
9554 Register NewVR = MRI.createVirtualRegister(RC);
9555 MachineInstrBuilder MIB1 =
9556 BuildMI(MF, MIMetadata(Root), TII->get(AArch64::FNEGv2f64), NewVR)
9557 .add(Root.getOperand(2));
9558 InsInstrs.push_back(MIB1);
9559 InstrIdxForVirtReg.insert(std::make_pair(NewVR, 0));
9561 Opc = AArch64::FMLAv2i64_indexed;
9562 MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 1, Opc, RC,
9563 FMAInstKind::Indexed, &NewVR);
9564 } else {
9565 Opc = AArch64::FMLAv2f64;
9566 MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 1, Opc, RC,
9567 FMAInstKind::Accumulator, &NewVR);
9568 }
9569 break;
9570 }
9573 unsigned IdxDupOp =
9575 : 2;
9576 genIndexedMultiply(Root, InsInstrs, IdxDupOp, AArch64::FMULv2i32_indexed,
9577 &AArch64::FPR128RegClass, MRI);
9578 break;
9579 }
9582 unsigned IdxDupOp =
9584 : 2;
9585 genIndexedMultiply(Root, InsInstrs, IdxDupOp, AArch64::FMULv2i64_indexed,
9586 &AArch64::FPR128RegClass, MRI);
9587 break;
9588 }
9591 unsigned IdxDupOp =
9593 : 2;
9594 genIndexedMultiply(Root, InsInstrs, IdxDupOp, AArch64::FMULv4i16_indexed,
9595 &AArch64::FPR128_loRegClass, MRI);
9596 break;
9597 }
9600 unsigned IdxDupOp =
9602 : 2;
9603 genIndexedMultiply(Root, InsInstrs, IdxDupOp, AArch64::FMULv4i32_indexed,
9604 &AArch64::FPR128RegClass, MRI);
9605 break;
9606 }
9609 unsigned IdxDupOp =
9611 : 2;
9612 genIndexedMultiply(Root, InsInstrs, IdxDupOp, AArch64::FMULv8i16_indexed,
9613 &AArch64::FPR128_loRegClass, MRI);
9614 break;
9615 }
9617 MUL = genFNegatedMAD(MF, MRI, TII, Root, InsInstrs);
9618 break;
9619 }
9621 generateGatherLanePattern(Root, InsInstrs, DelInstrs, InstrIdxForVirtReg,
9622 Pattern, 4);
9623 break;
9624 }
9626 generateGatherLanePattern(Root, InsInstrs, DelInstrs, InstrIdxForVirtReg,
9627 Pattern, 8);
9628 break;
9629 }
9631 generateGatherLanePattern(Root, InsInstrs, DelInstrs, InstrIdxForVirtReg,
9632 Pattern, 16);
9633 break;
9634 }
9635
9636 } // end switch (Pattern)
9637 // Record MUL and ADD/SUB for deletion
9638 if (MUL)
9639 DelInstrs.push_back(MUL);
9640 DelInstrs.push_back(&Root);
9641
9642 // Set the flags on the inserted instructions to be the merged flags of the
9643 // instructions that we have combined.
9644 uint32_t Flags = Root.getFlags();
9645 if (MUL)
9646 Flags = Root.mergeFlagsWith(*MUL);
9647 for (auto *MI : InsInstrs)
9648 MI->setFlags(Flags);
9649}
9650
9651/// Replace csincr-branch sequence by simple conditional branch
9652///
9653/// Examples:
9654/// 1. \code
9655/// csinc w9, wzr, wzr, <condition code>
9656/// tbnz w9, #0, 0x44
9657/// \endcode
9658/// to
9659/// \code
9660/// b.<inverted condition code>
9661/// \endcode
9662///
9663/// 2. \code
9664/// csinc w9, wzr, wzr, <condition code>
9665/// tbz w9, #0, 0x44
9666/// \endcode
9667/// to
9668/// \code
9669/// b.<condition code>
9670/// \endcode
9671///
9672/// Replace compare and branch sequence by TBZ/TBNZ instruction when the
9673/// compare's constant operand is power of 2.
9674///
9675/// Examples:
9676/// \code
9677/// and w8, w8, #0x400
9678/// cbnz w8, L1
9679/// \endcode
9680/// to
9681/// \code
9682/// tbnz w8, #10, L1
9683/// \endcode
9684///
9685/// \param MI Conditional Branch
9686/// \return True when the simple conditional branch is generated
9687///
9689 bool IsNegativeBranch = false;
9690 bool IsTestAndBranch = false;
9691 unsigned TargetBBInMI = 0;
9692 switch (MI.getOpcode()) {
9693 default:
9694 llvm_unreachable("Unknown branch instruction?");
9695 case AArch64::Bcc:
9696 case AArch64::CBWPri:
9697 case AArch64::CBXPri:
9698 case AArch64::CBBAssertExt:
9699 case AArch64::CBHAssertExt:
9700 case AArch64::CBWPrr:
9701 case AArch64::CBXPrr:
9702 return false;
9703 case AArch64::CBZW:
9704 case AArch64::CBZX:
9705 TargetBBInMI = 1;
9706 break;
9707 case AArch64::CBNZW:
9708 case AArch64::CBNZX:
9709 TargetBBInMI = 1;
9710 IsNegativeBranch = true;
9711 break;
9712 case AArch64::TBZW:
9713 case AArch64::TBZX:
9714 TargetBBInMI = 2;
9715 IsTestAndBranch = true;
9716 break;
9717 case AArch64::TBNZW:
9718 case AArch64::TBNZX:
9719 TargetBBInMI = 2;
9720 IsNegativeBranch = true;
9721 IsTestAndBranch = true;
9722 break;
9723 }
9724 // So we increment a zero register and test for bits other
9725 // than bit 0? Conservatively bail out in case the verifier
9726 // missed this case.
9727 if (IsTestAndBranch && MI.getOperand(1).getImm())
9728 return false;
9729
9730 // Find Definition.
9731 assert(MI.getParent() && "Incomplete machine instruction\n");
9732 MachineBasicBlock *MBB = MI.getParent();
9733 MachineFunction *MF = MBB->getParent();
9734 MachineRegisterInfo *MRI = &MF->getRegInfo();
9735 Register VReg = MI.getOperand(0).getReg();
9736 if (!VReg.isVirtual())
9737 return false;
9738
9739 MachineInstr *DefMI = MRI->getVRegDef(VReg);
9740
9741 // Look through COPY instructions to find definition.
9742 while (DefMI->isCopy()) {
9743 Register CopyVReg = DefMI->getOperand(1).getReg();
9744 if (!MRI->hasOneNonDBGUse(CopyVReg))
9745 return false;
9746 if (!MRI->hasOneDef(CopyVReg))
9747 return false;
9748 DefMI = MRI->getVRegDef(CopyVReg);
9749 }
9750
9751 switch (DefMI->getOpcode()) {
9752 default:
9753 return false;
9754 // Fold AND into a TBZ/TBNZ if constant operand is power of 2.
9755 case AArch64::ANDWri:
9756 case AArch64::ANDXri: {
9757 if (IsTestAndBranch)
9758 return false;
9759 if (DefMI->getParent() != MBB)
9760 return false;
9761 if (!MRI->hasOneNonDBGUse(VReg))
9762 return false;
9763
9764 bool Is32Bit = (DefMI->getOpcode() == AArch64::ANDWri);
9766 DefMI->getOperand(2).getImm(), Is32Bit ? 32 : 64);
9767 if (!isPowerOf2_64(Mask))
9768 return false;
9769
9770 MachineOperand &MO = DefMI->getOperand(1);
9771 Register NewReg = MO.getReg();
9772 if (!NewReg.isVirtual())
9773 return false;
9774
9775 assert(!MRI->def_empty(NewReg) && "Register must be defined.");
9776
9777 MachineBasicBlock &RefToMBB = *MBB;
9778 MachineBasicBlock *TBB = MI.getOperand(1).getMBB();
9779 DebugLoc DL = MI.getDebugLoc();
9780 unsigned Imm = Log2_64(Mask);
9781 unsigned Opc = (Imm < 32)
9782 ? (IsNegativeBranch ? AArch64::TBNZW : AArch64::TBZW)
9783 : (IsNegativeBranch ? AArch64::TBNZX : AArch64::TBZX);
9784 MachineInstr *NewMI = BuildMI(RefToMBB, MI, DL, get(Opc))
9785 .addReg(NewReg)
9786 .addImm(Imm)
9787 .addMBB(TBB);
9788 // Register lives on to the CBZ now.
9789 MO.setIsKill(false);
9790
9791 // For immediate smaller than 32, we need to use the 32-bit
9792 // variant (W) in all cases. Indeed the 64-bit variant does not
9793 // allow to encode them.
9794 // Therefore, if the input register is 64-bit, we need to take the
9795 // 32-bit sub-part.
9796 if (!Is32Bit && Imm < 32)
9797 NewMI->getOperand(0).setSubReg(AArch64::sub_32);
9798 MI.eraseFromParent();
9799 return true;
9800 }
9801 // Look for CSINC
9802 case AArch64::CSINCWr:
9803 case AArch64::CSINCXr: {
9804 if (!(DefMI->getOperand(1).getReg() == AArch64::WZR &&
9805 DefMI->getOperand(2).getReg() == AArch64::WZR) &&
9806 !(DefMI->getOperand(1).getReg() == AArch64::XZR &&
9807 DefMI->getOperand(2).getReg() == AArch64::XZR))
9808 return false;
9809
9810 if (DefMI->findRegisterDefOperandIdx(AArch64::NZCV, /*TRI=*/nullptr,
9811 true) != -1)
9812 return false;
9813
9814 AArch64CC::CondCode CC = (AArch64CC::CondCode)DefMI->getOperand(3).getImm();
9815 // Convert only when the condition code is not modified between
9816 // the CSINC and the branch. The CC may be used by other
9817 // instructions in between.
9819 return false;
9820 MachineBasicBlock &RefToMBB = *MBB;
9821 MachineBasicBlock *TBB = MI.getOperand(TargetBBInMI).getMBB();
9822 DebugLoc DL = MI.getDebugLoc();
9823 if (IsNegativeBranch)
9825 BuildMI(RefToMBB, MI, DL, get(AArch64::Bcc)).addImm(CC).addMBB(TBB);
9826 MI.eraseFromParent();
9827 return true;
9828 }
9829 }
9830}
9831
9832std::pair<unsigned, unsigned>
9833AArch64InstrInfo::decomposeMachineOperandsTargetFlags(unsigned TF) const {
9834 const unsigned Mask = AArch64II::MO_FRAGMENT;
9835 return std::make_pair(TF & Mask, TF & ~Mask);
9836}
9837
9839AArch64InstrInfo::getSerializableDirectMachineOperandTargetFlags() const {
9840 using namespace AArch64II;
9841
9842 static const std::pair<unsigned, const char *> TargetFlags[] = {
9843 {MO_PAGE, "aarch64-page"}, {MO_PAGEOFF, "aarch64-pageoff"},
9844 {MO_G3, "aarch64-g3"}, {MO_G2, "aarch64-g2"},
9845 {MO_G1, "aarch64-g1"}, {MO_G0, "aarch64-g0"},
9846 {MO_HI12, "aarch64-hi12"}};
9847 return ArrayRef(TargetFlags);
9848}
9849
9851AArch64InstrInfo::getSerializableBitmaskMachineOperandTargetFlags() const {
9852 using namespace AArch64II;
9853
9854 static const std::pair<unsigned, const char *> TargetFlags[] = {
9855 {MO_COFFSTUB, "aarch64-coffstub"},
9856 {MO_GOT, "aarch64-got"},
9857 {MO_NC, "aarch64-nc"},
9858 {MO_S, "aarch64-s"},
9859 {MO_TLS, "aarch64-tls"},
9860 {MO_DLLIMPORT, "aarch64-dllimport"},
9861 {MO_PREL, "aarch64-prel"},
9862 {MO_TAGGED, "aarch64-tagged"},
9863 {MO_ARM64EC_CALLMANGLE, "aarch64-arm64ec-callmangle"},
9864 };
9865 return ArrayRef(TargetFlags);
9866}
9867
9869AArch64InstrInfo::getSerializableMachineMemOperandTargetFlags() const {
9870 static const std::pair<MachineMemOperand::Flags, const char *> TargetFlags[] =
9871 {{MOSuppressPair, "aarch64-suppress-pair"},
9872 {MOStridedAccess, "aarch64-strided-access"}};
9873 return ArrayRef(TargetFlags);
9874}
9875
9876/// Constants defining how certain sequences should be outlined.
9877/// This encompasses how an outlined function should be called, and what kind of
9878/// frame should be emitted for that outlined function.
9879///
9880/// \p MachineOutlinerDefault implies that the function should be called with
9881/// a save and restore of LR to the stack.
9882///
9883/// That is,
9884///
9885/// I1 Save LR OUTLINED_FUNCTION:
9886/// I2 --> BL OUTLINED_FUNCTION I1
9887/// I3 Restore LR I2
9888/// I3
9889/// RET
9890///
9891/// * Call construction overhead: 3 (save + BL + restore)
9892/// * Frame construction overhead: 1 (ret)
9893/// * Requires stack fixups? Yes
9894///
9895/// \p MachineOutlinerTailCall implies that the function is being created from
9896/// a sequence of instructions ending in a return.
9897///
9898/// That is,
9899///
9900/// I1 OUTLINED_FUNCTION:
9901/// I2 --> B OUTLINED_FUNCTION I1
9902/// RET I2
9903/// RET
9904///
9905/// * Call construction overhead: 1 (B)
9906/// * Frame construction overhead: 0 (Return included in sequence)
9907/// * Requires stack fixups? No
9908///
9909/// \p MachineOutlinerNoLRSave implies that the function should be called using
9910/// a BL instruction, but doesn't require LR to be saved and restored. This
9911/// happens when LR is known to be dead.
9912///
9913/// That is,
9914///
9915/// I1 OUTLINED_FUNCTION:
9916/// I2 --> BL OUTLINED_FUNCTION I1
9917/// I3 I2
9918/// I3
9919/// RET
9920///
9921/// * Call construction overhead: 1 (BL)
9922/// * Frame construction overhead: 1 (RET)
9923/// * Requires stack fixups? No
9924///
9925/// \p MachineOutlinerThunk implies that the function is being created from
9926/// a sequence of instructions ending in a call. The outlined function is
9927/// called with a BL instruction, and the outlined function tail-calls the
9928/// original call destination.
9929///
9930/// That is,
9931///
9932/// I1 OUTLINED_FUNCTION:
9933/// I2 --> BL OUTLINED_FUNCTION I1
9934/// BL f I2
9935/// B f
9936/// * Call construction overhead: 1 (BL)
9937/// * Frame construction overhead: 0
9938/// * Requires stack fixups? No
9939///
9940/// \p MachineOutlinerRegSave implies that the function should be called with a
9941/// save and restore of LR to an available register. This allows us to avoid
9942/// stack fixups. Note that this outlining variant is compatible with the
9943/// NoLRSave case.
9944///
9945/// That is,
9946///
9947/// I1 Save LR OUTLINED_FUNCTION:
9948/// I2 --> BL OUTLINED_FUNCTION I1
9949/// I3 Restore LR I2
9950/// I3
9951/// RET
9952///
9953/// * Call construction overhead: 3 (save + BL + restore)
9954/// * Frame construction overhead: 1 (ret)
9955/// * Requires stack fixups? No
enum MachineOutlinerClass {
  MachineOutlinerDefault,  /// Emit a save, restore, call, and return.
  MachineOutlinerTailCall, /// Only emit a branch.
  MachineOutlinerNoLRSave, /// Emit a call and return.
  MachineOutlinerThunk,    /// Emit a call and tail-call.
  MachineOutlinerRegSave   /// Same as default, but save to a register.
};
9963
9969
9971AArch64InstrInfo::findRegisterToSaveLRTo(outliner::Candidate &C) const {
9972 MachineFunction *MF = C.getMF();
9973 const TargetRegisterInfo &TRI = *MF->getSubtarget().getRegisterInfo();
9974 const AArch64RegisterInfo *ARI =
9975 static_cast<const AArch64RegisterInfo *>(&TRI);
9976 // Check if there is an available register across the sequence that we can
9977 // use.
9978 for (unsigned Reg : AArch64::GPR64RegClass) {
9979 if (!ARI->isReservedReg(*MF, Reg) &&
9980 Reg != AArch64::LR && // LR is not reserved, but don't use it.
9981 Reg != AArch64::X16 && // X16 is not guaranteed to be preserved.
9982 Reg != AArch64::X17 && // Ditto for X17.
9983 C.isAvailableAcrossAndOutOfSeq(Reg, TRI) &&
9984 C.isAvailableInsideSeq(Reg, TRI))
9985 return Reg;
9986 }
9987 return Register();
9988}
9989
9990static bool
9992 const outliner::Candidate &b) {
9993 const auto &MFIa = a.getMF()->getInfo<AArch64FunctionInfo>();
9994 const auto &MFIb = b.getMF()->getInfo<AArch64FunctionInfo>();
9995
9996 return MFIa->getSignReturnAddressCondition() ==
9998}
9999
10000static bool
10002 const outliner::Candidate &b) {
10003 const auto &MFIa = a.getMF()->getInfo<AArch64FunctionInfo>();
10004 const auto &MFIb = b.getMF()->getInfo<AArch64FunctionInfo>();
10005
10006 return MFIa->shouldSignWithBKey() == MFIb->shouldSignWithBKey();
10007}
10008
10010 const outliner::Candidate &b) {
10011 const AArch64Subtarget &SubtargetA =
10013 const AArch64Subtarget &SubtargetB =
10014 b.getMF()->getSubtarget<AArch64Subtarget>();
10015 return SubtargetA.hasV8_3aOps() == SubtargetB.hasV8_3aOps();
10016}
10017
10018std::optional<std::unique_ptr<outliner::OutlinedFunction>>
10019AArch64InstrInfo::getOutliningCandidateInfo(
10020 const MachineModuleInfo &MMI,
10021 std::vector<outliner::Candidate> &RepeatedSequenceLocs,
10022 unsigned MinRepeats) const {
10023 unsigned SequenceSize = 0;
10024 for (auto &MI : RepeatedSequenceLocs[0])
10025 SequenceSize += getInstSizeInBytes(MI);
10026
10027 unsigned NumBytesToCreateFrame = 0;
10028
10029 // Avoid splitting ADRP ADD/LDR pair into outlined functions.
10030 // These instructions are fused together by the scheduler.
10031 // Any candidate where ADRP is the last instruction should be rejected
10032 // as that will lead to splitting ADRP pair.
10033 MachineInstr &LastMI = RepeatedSequenceLocs[0].back();
10034 MachineInstr &FirstMI = RepeatedSequenceLocs[0].front();
10035 if (LastMI.getOpcode() == AArch64::ADRP &&
10036 (LastMI.getOperand(1).getTargetFlags() & AArch64II::MO_PAGE) != 0 &&
10037 (LastMI.getOperand(1).getTargetFlags() & AArch64II::MO_GOT) != 0) {
10038 return std::nullopt;
10039 }
10040
10041 // Similarly any candidate where the first instruction is ADD/LDR with a
10042 // page offset should be rejected to avoid ADRP splitting.
10043 if ((FirstMI.getOpcode() == AArch64::ADDXri ||
10044 FirstMI.getOpcode() == AArch64::LDRXui) &&
10045 (FirstMI.getOperand(2).getTargetFlags() & AArch64II::MO_PAGEOFF) != 0 &&
10046 (FirstMI.getOperand(2).getTargetFlags() & AArch64II::MO_GOT) != 0) {
10047 return std::nullopt;
10048 }
10049
10050 // We only allow outlining for functions having exactly matching return
10051 // address signing attributes, i.e., all share the same value for the
10052 // attribute "sign-return-address" and all share the same type of key they
10053 // are signed with.
10054 // Additionally we require all functions to simultaneously either support
10055 // v8.3a features or not. Otherwise an outlined function could get signed
10056 // using dedicated v8.3 instructions and a call from a function that doesn't
10057 // support v8.3 instructions would therefore be invalid.
10058 if (std::adjacent_find(
10059 RepeatedSequenceLocs.begin(), RepeatedSequenceLocs.end(),
10060 [](const outliner::Candidate &a, const outliner::Candidate &b) {
10061 // Return true if a and b are non-equal w.r.t. return address
10062 // signing or support of v8.3a features
10063 if (outliningCandidatesSigningScopeConsensus(a, b) &&
10064 outliningCandidatesSigningKeyConsensus(a, b) &&
10065 outliningCandidatesV8_3OpsConsensus(a, b)) {
10066 return false;
10067 }
10068 return true;
10069 }) != RepeatedSequenceLocs.end()) {
10070 return std::nullopt;
10071 }
10072
10073 // Since at this point all candidates agree on their return address signing
10074 // picking just one is fine. If the candidate functions potentially sign their
10075 // return addresses, the outlined function should do the same. Note that in
10076 // the case of "sign-return-address"="non-leaf" this is an assumption: It is
10077 // not certainly true that the outlined function will have to sign its return
10078 // address but this decision is made later, when the decision to outline
10079 // has already been made.
10080 // The same holds for the number of additional instructions we need: On
10081 // v8.3a RET can be replaced by RETAA/RETAB and no AUT instruction is
10082 // necessary. However, at this point we don't know if the outlined function
10083 // will have a RET instruction so we assume the worst.
10084 const TargetRegisterInfo &TRI = getRegisterInfo();
10085 // Performing a tail call may require extra checks when PAuth is enabled.
10086 // If PAuth is disabled, set it to zero for uniformity.
10087 unsigned NumBytesToCheckLRInTCEpilogue = 0;
10088 const auto RASignCondition = RepeatedSequenceLocs[0]
10089 .getMF()
10090 ->getInfo<AArch64FunctionInfo>()
10091 ->getSignReturnAddressCondition();
10092 if (RASignCondition != SignReturnAddress::None) {
10093 // One PAC and one AUT instructions
10094 NumBytesToCreateFrame += 8;
10095
10096 // PAuth is enabled - set extra tail call cost, if any.
10097 auto LRCheckMethod = Subtarget.getAuthenticatedLRCheckMethod(
10098 *RepeatedSequenceLocs[0].getMF());
10099 NumBytesToCheckLRInTCEpilogue =
10101 // Checking the authenticated LR value may significantly impact
10102 // SequenceSize, so account for it for more precise results.
10103 if (isTailCallReturnInst(RepeatedSequenceLocs[0].back()))
10104 SequenceSize += NumBytesToCheckLRInTCEpilogue;
10105
10106 // We have to check if sp modifying instructions would get outlined.
10107 // If so we only allow outlining if sp is unchanged overall, so matching
10108 // sub and add instructions are okay to outline, all other sp modifications
10109 // are not
10110 auto hasIllegalSPModification = [&TRI](outliner::Candidate &C) {
10111 int SPValue = 0;
10112 for (auto &MI : C) {
10113 if (MI.modifiesRegister(AArch64::SP, &TRI)) {
10114 switch (MI.getOpcode()) {
10115 case AArch64::ADDXri:
10116 case AArch64::ADDWri:
10117 assert(MI.getNumOperands() == 4 && "Wrong number of operands");
10118 assert(MI.getOperand(2).isImm() &&
10119 "Expected operand to be immediate");
10120 assert(MI.getOperand(1).isReg() &&
10121 "Expected operand to be a register");
10122 // Check if the add just increments sp. If so, we search for
10123 // matching sub instructions that decrement sp. If not, the
10124 // modification is illegal
10125 if (MI.getOperand(1).getReg() == AArch64::SP)
10126 SPValue += MI.getOperand(2).getImm();
10127 else
10128 return true;
10129 break;
10130 case AArch64::SUBXri:
10131 case AArch64::SUBWri:
10132 assert(MI.getNumOperands() == 4 && "Wrong number of operands");
10133 assert(MI.getOperand(2).isImm() &&
10134 "Expected operand to be immediate");
10135 assert(MI.getOperand(1).isReg() &&
10136 "Expected operand to be a register");
10137 // Check if the sub just decrements sp. If so, we search for
10138 // matching add instructions that increment sp. If not, the
10139 // modification is illegal
10140 if (MI.getOperand(1).getReg() == AArch64::SP)
10141 SPValue -= MI.getOperand(2).getImm();
10142 else
10143 return true;
10144 break;
10145 default:
10146 return true;
10147 }
10148 }
10149 }
10150 if (SPValue)
10151 return true;
10152 return false;
10153 };
10154 // Remove candidates with illegal stack modifying instructions
10155 llvm::erase_if(RepeatedSequenceLocs, hasIllegalSPModification);
10156
10157 // If the sequence doesn't have enough candidates left, then we're done.
10158 if (RepeatedSequenceLocs.size() < MinRepeats)
10159 return std::nullopt;
10160 }
10161
10162 // Properties about candidate MBBs that hold for all of them.
10163 unsigned FlagsSetInAll = 0xF;
10164
10165 // Compute liveness information for each candidate, and set FlagsSetInAll.
10166 for (outliner::Candidate &C : RepeatedSequenceLocs)
10167 FlagsSetInAll &= C.Flags;
10168
10169 unsigned LastInstrOpcode = RepeatedSequenceLocs[0].back().getOpcode();
10170
10171 // Helper lambda which sets call information for every candidate.
10172 auto SetCandidateCallInfo =
10173 [&RepeatedSequenceLocs](unsigned CallID, unsigned NumBytesForCall) {
10174 for (outliner::Candidate &C : RepeatedSequenceLocs)
10175 C.setCallInfo(CallID, NumBytesForCall);
10176 };
10177
10178 unsigned FrameID = MachineOutlinerDefault;
10179 NumBytesToCreateFrame += 4;
10180
10181 bool HasBTI = any_of(RepeatedSequenceLocs, [](outliner::Candidate &C) {
10182 return C.getMF()->getInfo<AArch64FunctionInfo>()->branchTargetEnforcement();
10183 });
10184
10185 // We check to see if CFI Instructions are present, and if they are
10186 // we find the number of CFI Instructions in the candidates.
10187 unsigned CFICount = 0;
10188 for (auto &I : RepeatedSequenceLocs[0]) {
10189 if (I.isCFIInstruction())
10190 CFICount++;
10191 }
10192
10193 // We compare the number of found CFI Instructions to the number of CFI
10194 // instructions in the parent function for each candidate. We must check this
10195 // since if we outline one of the CFI instructions in a function, we have to
10196 // outline them all for correctness. If we do not, the address offsets will be
10197 // incorrect between the two sections of the program.
10198 for (outliner::Candidate &C : RepeatedSequenceLocs) {
10199 std::vector<MCCFIInstruction> CFIInstructions =
10200 C.getMF()->getFrameInstructions();
10201
10202 if (CFICount > 0 && CFICount != CFIInstructions.size())
10203 return std::nullopt;
10204 }
10205
10206 // Returns true if an instructions is safe to fix up, false otherwise.
10207 auto IsSafeToFixup = [this, &TRI](MachineInstr &MI) {
10208 if (MI.isCall())
10209 return true;
10210
10211 if (!MI.modifiesRegister(AArch64::SP, &TRI) &&
10212 !MI.readsRegister(AArch64::SP, &TRI))
10213 return true;
10214
10215 // Any modification of SP will break our code to save/restore LR.
10216 // FIXME: We could handle some instructions which add a constant
10217 // offset to SP, with a bit more work.
10218 if (MI.modifiesRegister(AArch64::SP, &TRI))
10219 return false;
10220
10221 // At this point, we have a stack instruction that we might need to
10222 // fix up. We'll handle it if it's a load or store.
10223 if (MI.mayLoadOrStore()) {
10224 const MachineOperand *Base; // Filled with the base operand of MI.
10225 int64_t Offset; // Filled with the offset of MI.
10226 bool OffsetIsScalable;
10227
10228 // Does it allow us to offset the base operand and is the base the
10229 // register SP?
10230 if (!getMemOperandWithOffset(MI, Base, Offset, OffsetIsScalable, &TRI) ||
10231 !Base->isReg() || Base->getReg() != AArch64::SP)
10232 return false;
10233
 10234 // Fix-up code below assumes bytes.
10235 if (OffsetIsScalable)
10236 return false;
10237
10238 // Find the minimum/maximum offset for this instruction and check
10239 // if fixing it up would be in range.
10240 int64_t MinOffset,
10241 MaxOffset; // Unscaled offsets for the instruction.
10242 // The scale to multiply the offsets by.
10243 TypeSize Scale(0U, false), DummyWidth(0U, false);
10244 getMemOpInfo(MI.getOpcode(), Scale, DummyWidth, MinOffset, MaxOffset);
10245
10246 Offset += 16; // Update the offset to what it would be if we outlined.
10247 if (Offset < MinOffset * (int64_t)Scale.getFixedValue() ||
10248 Offset > MaxOffset * (int64_t)Scale.getFixedValue())
10249 return false;
10250
10251 // It's in range, so we can outline it.
10252 return true;
10253 }
10254
10255 // FIXME: Add handling for instructions like "add x0, sp, #8".
10256
10257 // We can't fix it up, so don't outline it.
10258 return false;
10259 };
10260
10261 // True if it's possible to fix up each stack instruction in this sequence.
10262 // Important for frames/call variants that modify the stack.
10263 bool AllStackInstrsSafe =
10264 llvm::all_of(RepeatedSequenceLocs[0], IsSafeToFixup);
10265
10266 // If the last instruction in any candidate is a terminator, then we should
10267 // tail call all of the candidates.
10268 if (RepeatedSequenceLocs[0].back().isTerminator()) {
10269 FrameID = MachineOutlinerTailCall;
10270 NumBytesToCreateFrame = 0;
10271 unsigned NumBytesForCall = 4 + NumBytesToCheckLRInTCEpilogue;
10272 SetCandidateCallInfo(MachineOutlinerTailCall, NumBytesForCall);
10273 }
10274
10275 else if (LastInstrOpcode == AArch64::BL ||
10276 ((LastInstrOpcode == AArch64::BLR ||
10277 LastInstrOpcode == AArch64::BLRNoIP) &&
10278 !HasBTI)) {
10279 // FIXME: Do we need to check if the code after this uses the value of LR?
10280 FrameID = MachineOutlinerThunk;
10281 NumBytesToCreateFrame = NumBytesToCheckLRInTCEpilogue;
10282 SetCandidateCallInfo(MachineOutlinerThunk, 4);
10283 }
10284
10285 else {
10286 // We need to decide how to emit calls + frames. We can always emit the same
10287 // frame if we don't need to save to the stack. If we have to save to the
10288 // stack, then we need a different frame.
10289 unsigned NumBytesNoStackCalls = 0;
10290 std::vector<outliner::Candidate> CandidatesWithoutStackFixups;
10291
10292 // Check if we have to save LR.
10293 for (outliner::Candidate &C : RepeatedSequenceLocs) {
10294 bool LRAvailable =
10296 ? C.isAvailableAcrossAndOutOfSeq(AArch64::LR, TRI)
10297 : true;
10298 // If we have a noreturn caller, then we're going to be conservative and
10299 // say that we have to save LR. If we don't have a ret at the end of the
10300 // block, then we can't reason about liveness accurately.
10301 //
10302 // FIXME: We can probably do better than always disabling this in
10303 // noreturn functions by fixing up the liveness info.
10304 bool IsNoReturn =
10305 C.getMF()->getFunction().hasFnAttribute(Attribute::NoReturn);
10306
10307 // Is LR available? If so, we don't need a save.
10308 if (LRAvailable && !IsNoReturn) {
10309 NumBytesNoStackCalls += 4;
10310 C.setCallInfo(MachineOutlinerNoLRSave, 4);
10311 CandidatesWithoutStackFixups.push_back(C);
10312 }
10313
10314 // Is an unused register available? If so, we won't modify the stack, so
10315 // we can outline with the same frame type as those that don't save LR.
10316 else if (findRegisterToSaveLRTo(C)) {
10317 NumBytesNoStackCalls += 12;
10318 C.setCallInfo(MachineOutlinerRegSave, 12);
10319 CandidatesWithoutStackFixups.push_back(C);
10320 }
10321
10322 // Is SP used in the sequence at all? If not, we don't have to modify
10323 // the stack, so we are guaranteed to get the same frame.
10324 else if (C.isAvailableInsideSeq(AArch64::SP, TRI)) {
10325 NumBytesNoStackCalls += 12;
10326 C.setCallInfo(MachineOutlinerDefault, 12);
10327 CandidatesWithoutStackFixups.push_back(C);
10328 }
10329
10330 // If we outline this, we need to modify the stack. Pretend we don't
10331 // outline this by saving all of its bytes.
10332 else {
10333 NumBytesNoStackCalls += SequenceSize;
10334 }
10335 }
10336
10337 // If there are no places where we have to save LR, then note that we
10338 // don't have to update the stack. Otherwise, give every candidate the
10339 // default call type, as long as it's safe to do so.
10340 if (!AllStackInstrsSafe ||
10341 NumBytesNoStackCalls <= RepeatedSequenceLocs.size() * 12) {
10342 RepeatedSequenceLocs = CandidatesWithoutStackFixups;
10343 FrameID = MachineOutlinerNoLRSave;
10344 if (RepeatedSequenceLocs.size() < MinRepeats)
10345 return std::nullopt;
10346 } else {
10347 SetCandidateCallInfo(MachineOutlinerDefault, 12);
10348
10349 // Bugzilla ID: 46767
10350 // TODO: Check if fixing up the stack more than once is safe so we can
10351 // outline these.
10352 //
10353 // An outline resulting in a caller that requires stack fixups at the
10354 // callsite to a callee that also requires stack fixups can happen when
10355 // there are no available registers at the candidate callsite for a
10356 // candidate that itself also has calls.
10357 //
10358 // In other words if function_containing_sequence in the following pseudo
10359 // assembly requires that we save LR at the point of the call, but there
10360 // are no available registers: in this case we save using SP and as a
10361 // result the SP offsets requires stack fixups by multiples of 16.
10362 //
10363 // function_containing_sequence:
10364 // ...
10365 // save LR to SP <- Requires stack instr fixups in OUTLINED_FUNCTION_N
10366 // call OUTLINED_FUNCTION_N
10367 // restore LR from SP
10368 // ...
10369 //
10370 // OUTLINED_FUNCTION_N:
10371 // save LR to SP <- Requires stack instr fixups in OUTLINED_FUNCTION_N
10372 // ...
10373 // bl foo
10374 // restore LR from SP
10375 // ret
10376 //
10377 // Because the code to handle more than one stack fixup does not
10378 // currently have the proper checks for legality, these cases will assert
10379 // in the AArch64 MachineOutliner. This is because the code to do this
10380 // needs more hardening, testing, better checks that generated code is
10381 // legal, etc and because it is only verified to handle a single pass of
10382 // stack fixup.
10383 //
10384 // The assert happens in AArch64InstrInfo::buildOutlinedFrame to catch
10385 // these cases until they are known to be handled. Bugzilla 46767 is
10386 // referenced in comments at the assert site.
10387 //
10388 // To avoid asserting (or generating non-legal code on noassert builds)
10389 // we remove all candidates which would need more than one stack fixup by
10390 // pruning the cases where the candidate has calls while also having no
10391 // available LR and having no available general purpose registers to copy
10392 // LR to (ie one extra stack save/restore).
10393 //
10394 if (FlagsSetInAll & MachineOutlinerMBBFlags::HasCalls) {
10395 erase_if(RepeatedSequenceLocs, [this, &TRI](outliner::Candidate &C) {
10396 auto IsCall = [](const MachineInstr &MI) { return MI.isCall(); };
10397 return (llvm::any_of(C, IsCall)) &&
10398 (!C.isAvailableAcrossAndOutOfSeq(AArch64::LR, TRI) ||
10399 !findRegisterToSaveLRTo(C));
10400 });
10401 }
10402 }
10403
10404 // If we dropped all of the candidates, bail out here.
10405 if (RepeatedSequenceLocs.size() < MinRepeats)
10406 return std::nullopt;
10407 }
10408
10409 // Does every candidate's MBB contain a call? If so, then we might have a call
10410 // in the range.
10411 if (FlagsSetInAll & MachineOutlinerMBBFlags::HasCalls) {
10412 // Check if the range contains a call. These require a save + restore of the
10413 // link register.
10414 outliner::Candidate &FirstCand = RepeatedSequenceLocs[0];
10415 bool ModStackToSaveLR = false;
10416 if (any_of(drop_end(FirstCand),
10417 [](const MachineInstr &MI) { return MI.isCall(); }))
10418 ModStackToSaveLR = true;
10419
10420 // Handle the last instruction separately. If this is a tail call, then the
10421 // last instruction is a call. We don't want to save + restore in this case.
10422 // However, it could be possible that the last instruction is a call without
10423 // it being valid to tail call this sequence. We should consider this as
10424 // well.
10425 else if (FrameID != MachineOutlinerThunk &&
10426 FrameID != MachineOutlinerTailCall && FirstCand.back().isCall())
10427 ModStackToSaveLR = true;
10428
10429 if (ModStackToSaveLR) {
10430 // We can't fix up the stack. Bail out.
10431 if (!AllStackInstrsSafe)
10432 return std::nullopt;
10433
10434 // Save + restore LR.
10435 NumBytesToCreateFrame += 8;
10436 }
10437 }
10438
10439 // If we have CFI instructions, we can only outline if the outlined section
10440 // can be a tail call
10441 if (FrameID != MachineOutlinerTailCall && CFICount > 0)
10442 return std::nullopt;
10443
10444 return std::make_unique<outliner::OutlinedFunction>(
10445 RepeatedSequenceLocs, SequenceSize, NumBytesToCreateFrame, FrameID);
10446}
10447
10448void AArch64InstrInfo::mergeOutliningCandidateAttributes(
10449 Function &F, std::vector<outliner::Candidate> &Candidates) const {
10450 // If a bunch of candidates reach this point they must agree on their return
10451 // address signing. It is therefore enough to just consider the signing
10452 // behaviour of one of them
10453 const auto &CFn = Candidates.front().getMF()->getFunction();
10454
10455 if (CFn.hasFnAttribute("ptrauth-returns"))
10456 F.addFnAttr(CFn.getFnAttribute("ptrauth-returns"));
10457 if (CFn.hasFnAttribute("ptrauth-auth-traps"))
10458 F.addFnAttr(CFn.getFnAttribute("ptrauth-auth-traps"));
10459 // Since all candidates belong to the same module, just copy the
10460 // function-level attributes of an arbitrary function.
10461 if (CFn.hasFnAttribute("sign-return-address"))
10462 F.addFnAttr(CFn.getFnAttribute("sign-return-address"));
10463 if (CFn.hasFnAttribute("sign-return-address-key"))
10464 F.addFnAttr(CFn.getFnAttribute("sign-return-address-key"));
10465
10466 AArch64GenInstrInfo::mergeOutliningCandidateAttributes(F, Candidates);
10467}
10468
10469bool AArch64InstrInfo::isFunctionSafeToOutlineFrom(
10470 MachineFunction &MF, bool OutlineFromLinkOnceODRs) const {
10471 const Function &F = MF.getFunction();
10472
10473 // Can F be deduplicated by the linker? If it can, don't outline from it.
10474 if (!OutlineFromLinkOnceODRs && F.hasLinkOnceODRLinkage())
10475 return false;
10476
10477 // Don't outline from functions with section markings; the program could
10478 // expect that all the code is in the named section.
10479 // FIXME: Allow outlining from multiple functions with the same section
10480 // marking.
10481 if (F.hasSection())
10482 return false;
10483
10484 // Outlining from functions with redzones is unsafe since the outliner may
10485 // modify the stack. Check if hasRedZone is true or unknown; if yes, don't
10486 // outline from it.
10487 AArch64FunctionInfo *AFI = MF.getInfo<AArch64FunctionInfo>();
10488 if (!AFI || AFI->hasRedZone().value_or(true))
10489 return false;
10490
10491 // FIXME: Determine whether it is safe to outline from functions which contain
10492 // streaming-mode changes. We may need to ensure any smstart/smstop pairs are
10493 // outlined together and ensure it is safe to outline with async unwind info,
10494 // required for saving & restoring VG around calls.
10495 if (AFI->hasStreamingModeChanges())
10496 return false;
10497
10498 // FIXME: Teach the outliner to generate/handle Windows unwind info.
10500 return false;
10501
10502 // It's safe to outline from MF.
10503 return true;
10504}
10505
10507AArch64InstrInfo::getOutlinableRanges(MachineBasicBlock &MBB,
10508 unsigned &Flags) const {
10510 "Must track liveness!");
10512 std::pair<MachineBasicBlock::iterator, MachineBasicBlock::iterator>>
10513 Ranges;
10514 // According to the AArch64 Procedure Call Standard, the following are
10515 // undefined on entry/exit from a function call:
10516 //
10517 // * Registers x16, x17, (and thus w16, w17)
10518 // * Condition codes (and thus the NZCV register)
10519 //
10520 // If any of these registers are used inside or live across an outlined
10521 // function, then they may be modified later, either by the compiler or
10522 // some other tool (like the linker).
10523 //
10524 // To avoid outlining in these situations, partition each block into ranges
10525 // where these registers are dead. We will only outline from those ranges.
10526 LiveRegUnits LRU(getRegisterInfo());
10527 auto AreAllUnsafeRegsDead = [&LRU]() {
10528 return LRU.available(AArch64::W16) && LRU.available(AArch64::W17) &&
10529 LRU.available(AArch64::NZCV);
10530 };
10531
10532 // We need to know if LR is live across an outlining boundary later on in
10533 // order to decide how we'll create the outlined call, frame, etc.
10534 //
10535 // It's pretty expensive to check this for *every candidate* within a block.
10536 // That's some potentially n^2 behaviour, since in the worst case, we'd need
10537 // to compute liveness from the end of the block for O(n) candidates within
10538 // the block.
10539 //
10540 // So, to improve the average case, let's keep track of liveness from the end
10541 // of the block to the beginning of *every outlinable range*. If we know that
10542 // LR is available in every range we could outline from, then we know that
10543 // we don't need to check liveness for any candidate within that range.
10544 bool LRAvailableEverywhere = true;
10545 // Compute liveness bottom-up.
10546 LRU.addLiveOuts(MBB);
10547 // Update flags that require info about the entire MBB.
10548 auto UpdateWholeMBBFlags = [&Flags](const MachineInstr &MI) {
10549 if (MI.isCall() && !MI.isTerminator())
10551 };
10552 // Range: [RangeBegin, RangeEnd)
10553 MachineBasicBlock::instr_iterator RangeBegin, RangeEnd;
10554 unsigned RangeLen;
10555 auto CreateNewRangeStartingAt =
10556 [&RangeBegin, &RangeEnd,
10557 &RangeLen](MachineBasicBlock::instr_iterator NewBegin) {
10558 RangeBegin = NewBegin;
10559 RangeEnd = std::next(RangeBegin);
10560 RangeLen = 0;
10561 };
10562 auto SaveRangeIfNonEmpty = [&RangeLen, &Ranges, &RangeBegin, &RangeEnd]() {
10563 // At least one unsafe register is not dead. We do not want to outline at
10564 // this point. If it is long enough to outline from and does not cross a
10565 // bundle boundary, save the range [RangeBegin, RangeEnd).
10566 if (RangeLen <= 1)
10567 return;
10568 if (!RangeBegin.isEnd() && RangeBegin->isBundledWithPred())
10569 return;
10570 if (!RangeEnd.isEnd() && RangeEnd->isBundledWithPred())
10571 return;
10572 Ranges.emplace_back(RangeBegin, RangeEnd);
10573 };
10574 // Find the first point where all unsafe registers are dead.
10575 // FIND: <safe instr> <-- end of first potential range
10576 // SKIP: <unsafe def>
10577 // SKIP: ... everything between ...
10578 // SKIP: <unsafe use>
10579 auto FirstPossibleEndPt = MBB.instr_rbegin();
10580 for (; FirstPossibleEndPt != MBB.instr_rend(); ++FirstPossibleEndPt) {
10581 LRU.stepBackward(*FirstPossibleEndPt);
10582 // Update flags that impact how we outline across the entire block,
10583 // regardless of safety.
10584 UpdateWholeMBBFlags(*FirstPossibleEndPt);
10585 if (AreAllUnsafeRegsDead())
10586 break;
10587 }
10588 // If we exhausted the entire block, we have no safe ranges to outline.
10589 if (FirstPossibleEndPt == MBB.instr_rend())
10590 return Ranges;
10591 // Current range.
10592 CreateNewRangeStartingAt(FirstPossibleEndPt->getIterator());
10593 // StartPt points to the first place where all unsafe registers
10594 // are dead (if there is any such point). Begin partitioning the MBB into
10595 // ranges.
10596 for (auto &MI : make_range(FirstPossibleEndPt, MBB.instr_rend())) {
10597 LRU.stepBackward(MI);
10598 UpdateWholeMBBFlags(MI);
10599 if (!AreAllUnsafeRegsDead()) {
10600 SaveRangeIfNonEmpty();
10601 CreateNewRangeStartingAt(MI.getIterator());
10602 continue;
10603 }
10604 LRAvailableEverywhere &= LRU.available(AArch64::LR);
10605 RangeBegin = MI.getIterator();
10606 ++RangeLen;
10607 }
10608 // Above loop misses the last (or only) range. If we are still safe, then
10609 // let's save the range.
10610 if (AreAllUnsafeRegsDead())
10611 SaveRangeIfNonEmpty();
10612 if (Ranges.empty())
10613 return Ranges;
10614 // We found the ranges bottom-up. Mapping expects the top-down. Reverse
10615 // the order.
10616 std::reverse(Ranges.begin(), Ranges.end());
10617 // If there is at least one outlinable range where LR is unavailable
10618 // somewhere, remember that.
10619 if (!LRAvailableEverywhere)
10621 return Ranges;
10622}
10623
10625AArch64InstrInfo::getOutliningTypeImpl(const MachineModuleInfo &MMI,
10627 unsigned Flags) const {
10628 MachineInstr &MI = *MIT;
10629
10630 // Don't outline anything used for return address signing. The outlined
10631 // function will get signed later if needed
10632 switch (MI.getOpcode()) {
10633 case AArch64::PACM:
10634 case AArch64::PACIASP:
10635 case AArch64::PACIBSP:
10636 case AArch64::PACIASPPC:
10637 case AArch64::PACIBSPPC:
10638 case AArch64::AUTIASP:
10639 case AArch64::AUTIBSP:
10640 case AArch64::AUTIASPPCi:
10641 case AArch64::AUTIASPPCr:
10642 case AArch64::AUTIBSPPCi:
10643 case AArch64::AUTIBSPPCr:
10644 case AArch64::RETAA:
10645 case AArch64::RETAB:
10646 case AArch64::RETAASPPCi:
10647 case AArch64::RETAASPPCr:
10648 case AArch64::RETABSPPCi:
10649 case AArch64::RETABSPPCr:
10650 case AArch64::EMITBKEY:
10651 case AArch64::PAUTH_PROLOGUE:
10652 case AArch64::PAUTH_EPILOGUE:
10654 }
10655
10656 // We can only outline these if we will tail call the outlined function, or
10657 // fix up the CFI offsets. Currently, CFI instructions are outlined only if
10658 // in a tail call.
10659 //
10660 // FIXME: If the proper fixups for the offset are implemented, this should be
10661 // possible.
10662 if (MI.isCFIInstruction())
10664
10665 // Is this a terminator for a basic block?
10666 if (MI.isTerminator())
10667 // TargetInstrInfo::getOutliningType has already filtered out anything
10668 // that would break this, so we can allow it here.
10670
10671 // Make sure none of the operands are un-outlinable.
10672 for (const MachineOperand &MOP : MI.operands()) {
10673 // A check preventing CFI indices was here before, but only CFI
10674 // instructions should have those.
10675 assert(!MOP.isCFIIndex());
10676
10677 // If it uses LR or W30 explicitly, then don't touch it.
10678 if (MOP.isReg() && !MOP.isImplicit() &&
10679 (MOP.getReg() == AArch64::LR || MOP.getReg() == AArch64::W30))
10681 }
10682
10683 // Special cases for instructions that can always be outlined, but will fail
10684 // the later tests. e.g, ADRPs, which are PC-relative use LR, but can always
10685 // be outlined because they don't require a *specific* value to be in LR.
10686 if (MI.getOpcode() == AArch64::ADRP)
10688
10689 // If MI is a call we might be able to outline it. We don't want to outline
10690 // any calls that rely on the position of items on the stack. When we outline
10691 // something containing a call, we have to emit a save and restore of LR in
10692 // the outlined function. Currently, this always happens by saving LR to the
10693 // stack. Thus, if we outline, say, half the parameters for a function call
10694 // plus the call, then we'll break the callee's expectations for the layout
10695 // of the stack.
10696 //
10697 // FIXME: Allow calls to functions which construct a stack frame, as long
10698 // as they don't access arguments on the stack.
10699 // FIXME: Figure out some way to analyze functions defined in other modules.
10700 // We should be able to compute the memory usage based on the IR calling
10701 // convention, even if we can't see the definition.
10702 if (MI.isCall()) {
10703 // Get the function associated with the call. Look at each operand and find
10704 // the one that represents the callee and get its name.
10705 const Function *Callee = nullptr;
10706 for (const MachineOperand &MOP : MI.operands()) {
10707 if (MOP.isGlobal()) {
10708 Callee = dyn_cast<Function>(MOP.getGlobal());
10709 break;
10710 }
10711 }
10712
10713 // Never outline calls to mcount. There isn't any rule that would require
10714 // this, but the Linux kernel's "ftrace" feature depends on it.
10715 if (Callee && Callee->getName() == "\01_mcount")
10717
10718 // If we don't know anything about the callee, assume it depends on the
10719 // stack layout of the caller. In that case, it's only legal to outline
10720 // as a tail-call. Explicitly list the call instructions we know about so we
10721 // don't get unexpected results with call pseudo-instructions.
10722 auto UnknownCallOutlineType = outliner::InstrType::Illegal;
10723 if (MI.getOpcode() == AArch64::BLR ||
10724 MI.getOpcode() == AArch64::BLRNoIP || MI.getOpcode() == AArch64::BL)
10725 UnknownCallOutlineType = outliner::InstrType::LegalTerminator;
10726
10727 if (!Callee)
10728 return UnknownCallOutlineType;
10729
10730 // We have a function we have information about. Check it if it's something
10731 // can safely outline.
10732 MachineFunction *CalleeMF = MMI.getMachineFunction(*Callee);
10733
10734 // We don't know what's going on with the callee at all. Don't touch it.
10735 if (!CalleeMF)
10736 return UnknownCallOutlineType;
10737
10738 // Check if we know anything about the callee saves on the function. If we
10739 // don't, then don't touch it, since that implies that we haven't
10740 // computed anything about its stack frame yet.
10741 MachineFrameInfo &MFI = CalleeMF->getFrameInfo();
10742 if (!MFI.isCalleeSavedInfoValid() || MFI.getStackSize() > 0 ||
10743 MFI.getNumObjects() > 0)
10744 return UnknownCallOutlineType;
10745
10746 // At this point, we can say that CalleeMF ought to not pass anything on the
10747 // stack. Therefore, we can outline it.
10749 }
10750
10751 // Don't touch the link register or W30.
10752 if (MI.readsRegister(AArch64::W30, &getRegisterInfo()) ||
10753 MI.modifiesRegister(AArch64::W30, &getRegisterInfo()))
10755
10756 // Don't outline BTI instructions, because that will prevent the outlining
10757 // site from being indirectly callable.
10758 if (hasBTISemantics(MI))
10760
10762}
10763
10764void AArch64InstrInfo::fixupPostOutline(MachineBasicBlock &MBB) const {
10765 for (MachineInstr &MI : MBB) {
10766 const MachineOperand *Base;
10767 TypeSize Width(0, false);
10768 int64_t Offset;
10769 bool OffsetIsScalable;
10770
10771 // Is this a load or store with an immediate offset with SP as the base?
10772 if (!MI.mayLoadOrStore() ||
10773 !getMemOperandWithOffsetWidth(MI, Base, Offset, OffsetIsScalable, Width,
10774 &RI) ||
10775 (Base->isReg() && Base->getReg() != AArch64::SP))
10776 continue;
10777
10778 // It is, so we have to fix it up.
10779 TypeSize Scale(0U, false);
10780 int64_t Dummy1, Dummy2;
10781
10782 MachineOperand &StackOffsetOperand = getMemOpBaseRegImmOfsOffsetOperand(MI);
10783 assert(StackOffsetOperand.isImm() && "Stack offset wasn't immediate!");
10784 getMemOpInfo(MI.getOpcode(), Scale, Width, Dummy1, Dummy2);
10785 assert(Scale != 0 && "Unexpected opcode!");
10786 assert(!OffsetIsScalable && "Expected offset to be a byte offset");
10787
10788 // We've pushed the return address to the stack, so add 16 to the offset.
10789 // This is safe, since we already checked if it would overflow when we
10790 // checked if this instruction was legal to outline.
10791 int64_t NewImm = (Offset + 16) / (int64_t)Scale.getFixedValue();
10792 StackOffsetOperand.setImm(NewImm);
10793 }
10794}
10795
10797 const AArch64InstrInfo *TII,
10798 bool ShouldSignReturnAddr) {
10799 if (!ShouldSignReturnAddr)
10800 return;
10801
10802 BuildMI(MBB, MBB.begin(), DebugLoc(), TII->get(AArch64::PAUTH_PROLOGUE))
10804 BuildMI(MBB, MBB.getFirstInstrTerminator(), DebugLoc(),
10805 TII->get(AArch64::PAUTH_EPILOGUE))
10807}
10808
// Build the frame of an outlined function:
//  - tail-call candidates just get the "Tail Call" style recorded;
//  - thunks have their trailing BL/BLR rewritten into a tail-call;
//  - if the outlined range contains a non-tail call, LR is saved/restored
//    around the region (STRXpre/LDRXpost on SP) and CFI is emitted;
//  - non-tail-call functions get an explicit RET on LR;
//  - finally, return-address signing pseudos are inserted when required.
// NOTE(review): the second signature line (MF/MBB parameters) and the
// declarations of the insertion iterators It/Et are missing from this
// extraction of the file.
10809void AArch64InstrInfo::buildOutlinedFrame(
10811     const outliner::OutlinedFunction &OF) const {
10812
10813   AArch64FunctionInfo *FI = MF.getInfo<AArch64FunctionInfo>();
10814
10815   if (OF.FrameConstructionID == MachineOutlinerTailCall)
10816     FI->setOutliningStyle("Tail Call");
10817   else if (OF.FrameConstructionID == MachineOutlinerThunk) {
10818     // For thunk outlining, rewrite the last instruction from a call to a
10819     // tail-call.
10820     MachineInstr *Call = &*--MBB.instr_end();
10821     unsigned TailOpcode;
10822     if (Call->getOpcode() == AArch64::BL) {
10823       TailOpcode = AArch64::TCRETURNdi;
10824     } else {
10825       assert(Call->getOpcode() == AArch64::BLR ||
10826              Call->getOpcode() == AArch64::BLRNoIP);
10827       TailOpcode = AArch64::TCRETURNriALL;
10828     }
10829     MachineInstr *TC = BuildMI(MF, DebugLoc(), get(TailOpcode))
10830                            .add(Call->getOperand(0))
10831                            .addImm(0);
10832     MBB.insert(MBB.end(), TC);
10834
10835     FI->setOutliningStyle("Thunk");
10836   }
10837
10838   bool IsLeafFunction = true;
10839
10840   // Is there a call in the outlined range?
10841   auto IsNonTailCall = [](const MachineInstr &MI) {
10842     return MI.isCall() && !MI.isReturn();
10843   };
10844
10845   if (llvm::any_of(MBB.instrs(), IsNonTailCall)) {
10846     // Fix up the instructions in the range, since we're going to modify the
10847     // stack.
10848
10849     // Bugzilla ID: 46767
10850     // TODO: Check if fixing up twice is safe so we can outline these.
10851     assert(OF.FrameConstructionID != MachineOutlinerDefault &&
10852            "Can only fix up stack references once");
10853     fixupPostOutline(MBB);
10854
10855     IsLeafFunction = false;
10856
10857     // LR has to be a live in so that we can save it.
10858     if (!MBB.isLiveIn(AArch64::LR))
10859       MBB.addLiveIn(AArch64::LR);
10860
    // NOTE(review): the declarations of It/Et (insertion points for the
    // save/restore) are missing from this view.
10863
10864     if (OF.FrameConstructionID == MachineOutlinerTailCall ||
10865         OF.FrameConstructionID == MachineOutlinerThunk)
10866       Et = std::prev(MBB.end());
10867
10868     // Insert a save before the outlined region
10869     MachineInstr *STRXpre = BuildMI(MF, DebugLoc(), get(AArch64::STRXpre))
10870                                 .addReg(AArch64::SP, RegState::Define)
10871                                 .addReg(AArch64::LR)
10872                                 .addReg(AArch64::SP)
10873                                 .addImm(-16);
10874     It = MBB.insert(It, STRXpre);
10875
10876     if (MF.getInfo<AArch64FunctionInfo>()->needsDwarfUnwindInfo(MF)) {
10877       CFIInstBuilder CFIBuilder(MBB, It, MachineInstr::FrameSetup);
10878
10879       // Add a CFI saying the stack was moved 16 B down.
10880       CFIBuilder.buildDefCFAOffset(16);
10881
10882       // Add a CFI saying that the LR that we want to find is now 16 B higher
10883       // than before.
10884       CFIBuilder.buildOffset(AArch64::LR, -16);
10885     }
10886
10887     // Insert a restore before the terminator for the function.
10888     MachineInstr *LDRXpost = BuildMI(MF, DebugLoc(), get(AArch64::LDRXpost))
10889                                  .addReg(AArch64::SP, RegState::Define)
10890                                  .addReg(AArch64::LR, RegState::Define)
10891                                  .addReg(AArch64::SP)
10892                                  .addImm(16);
10893     Et = MBB.insert(Et, LDRXpost);
10894   }
10895
10896   auto RASignCondition = FI->getSignReturnAddressCondition();
10897   bool ShouldSignReturnAddr = AArch64FunctionInfo::shouldSignReturnAddress(
10898       RASignCondition, !IsLeafFunction);
10899
10900   // If this is a tail call outlined function, then there's already a return.
10901   if (OF.FrameConstructionID == MachineOutlinerTailCall ||
10902       OF.FrameConstructionID == MachineOutlinerThunk) {
10903     signOutlinedFunction(MF, MBB, this, ShouldSignReturnAddr);
10904     return;
10905   }
10906
10907   // It's not a tail call, so we have to insert the return ourselves.
10908
10909   // LR has to be a live in so that we can return to it.
10910   if (!MBB.isLiveIn(AArch64::LR))
10911     MBB.addLiveIn(AArch64::LR);
10912
10913   MachineInstr *ret = BuildMI(MF, DebugLoc(), get(AArch64::RET))
10914                           .addReg(AArch64::LR);
10915   MBB.insert(MBB.end(), ret);
10916
10917   signOutlinedFunction(MF, MBB, this, ShouldSignReturnAddr);
10918
10919   FI->setOutliningStyle("Function");
10920
10921   // Did we have to modify the stack by saving the link register?
10922   if (OF.FrameConstructionID != MachineOutlinerDefault)
10923     return;
10924
10925   // We modified the stack.
10926   // Walk over the basic block and fix up all the stack accesses.
10927   fixupPostOutline(MBB);
10928 }
10929
// Insert a call to the outlined function at \p It:
//  - tail-call candidates branch via TCRETURNdi;
//  - no-LR-save and thunk candidates use a plain BL;
//  - otherwise LR is preserved around the BL, either copied into a free
//    register (ORRXrs from XZR) or pushed/popped on the stack
//    (STRXpre/LDRXpost with a 16-byte slot).
// Returns an iterator to the inserted call.
// NOTE(review): the parameter lines of the signature and the declaration of
// CallPt are missing from this extraction of the file.
10930MachineBasicBlock::iterator AArch64InstrInfo::insertOutlinedCall(
10933
10934   // Are we tail calling?
10935   if (C.CallConstructionID == MachineOutlinerTailCall) {
10936     // If yes, then we can just branch to the label.
10937     It = MBB.insert(It, BuildMI(MF, DebugLoc(), get(AArch64::TCRETURNdi))
10938                             .addGlobalAddress(M.getNamedValue(MF.getName()))
10939                             .addImm(0));
10940     return It;
10941   }
10942
10943   // Are we saving the link register?
10944   if (C.CallConstructionID == MachineOutlinerNoLRSave ||
10945       C.CallConstructionID == MachineOutlinerThunk) {
10946     // No, so just insert the call.
10947     It = MBB.insert(It, BuildMI(MF, DebugLoc(), get(AArch64::BL))
10948                             .addGlobalAddress(M.getNamedValue(MF.getName())));
10949     return It;
10950   }
10951
10952   // We want to return the spot where we inserted the call.
10954
10955   // Instructions for saving and restoring LR around the call instruction we're
10956   // going to insert.
10957   MachineInstr *Save;
10958   MachineInstr *Restore;
10959   // Can we save to a register?
10960   if (C.CallConstructionID == MachineOutlinerRegSave) {
10961     // FIXME: This logic should be sunk into a target-specific interface so that
10962     // we don't have to recompute the register.
10963     Register Reg = findRegisterToSaveLRTo(C);
10964     assert(Reg && "No callee-saved register available?");
10965
10966     // LR has to be a live in so that we can save it.
10967     if (!MBB.isLiveIn(AArch64::LR))
10968       MBB.addLiveIn(AArch64::LR);
10969
10970     // Save and restore LR from Reg.
10971     Save = BuildMI(MF, DebugLoc(), get(AArch64::ORRXrs), Reg)
10972                .addReg(AArch64::XZR)
10973                .addReg(AArch64::LR)
10974                .addImm(0);
10975     Restore = BuildMI(MF, DebugLoc(), get(AArch64::ORRXrs), AArch64::LR)
10976                   .addReg(AArch64::XZR)
10977                   .addReg(Reg)
10978                   .addImm(0);
10979   } else {
10980     // We have the default case. Save and restore from SP.
10981     Save = BuildMI(MF, DebugLoc(), get(AArch64::STRXpre))
10982                .addReg(AArch64::SP, RegState::Define)
10983                .addReg(AArch64::LR)
10984                .addReg(AArch64::SP)
10985                .addImm(-16);
10986     Restore = BuildMI(MF, DebugLoc(), get(AArch64::LDRXpost))
10987                   .addReg(AArch64::SP, RegState::Define)
10988                   .addReg(AArch64::LR, RegState::Define)
10989                   .addReg(AArch64::SP)
10990                   .addImm(16);
10991   }
10992
10993   It = MBB.insert(It, Save);
10994   It++;
10995
10996   // Insert the call.
10997   It = MBB.insert(It, BuildMI(MF, DebugLoc(), get(AArch64::BL))
10998                           .addGlobalAddress(M.getNamedValue(MF.getName())));
10999   CallPt = It;
11000   It++;
11001
11002   It = MBB.insert(It, Restore);
11003   return CallPt;
11004 }
11005
11006bool AArch64InstrInfo::shouldOutlineFromFunctionByDefault(
11007 MachineFunction &MF) const {
11008 return MF.getFunction().hasMinSize();
11009}
11010
// Emit the cheapest available zeroing instruction for \p Reg:
//  - GPRs:               MOVZXi #0
//  - SVE/streaming SVE:  DUP_ZI_D #0
//  - NEON:               MOVIv2d_ns #0
//  - streaming-compatible without SVE: FMOVD0 on the 64-bit sub-register
// NOTE(review): the iterator parameter line of the signature is missing
// from this extraction of the file.
11011void AArch64InstrInfo::buildClearRegister(Register Reg, MachineBasicBlock &MBB,
11013                                           DebugLoc &DL,
11014                                           bool AllowSideEffects) const {
11015   const MachineFunction &MF = *MBB.getParent();
11016   const AArch64Subtarget &STI = MF.getSubtarget<AArch64Subtarget>();
11017   const AArch64RegisterInfo &TRI = *STI.getRegisterInfo();
11018
11019   if (TRI.isGeneralPurposeRegister(MF, Reg)) {
11020     BuildMI(MBB, Iter, DL, get(AArch64::MOVZXi), Reg).addImm(0).addImm(0);
11021   } else if (STI.isSVEorStreamingSVEAvailable()) {
11022     BuildMI(MBB, Iter, DL, get(AArch64::DUP_ZI_D), Reg)
11023         .addImm(0)
11024         .addImm(0);
11025   } else if (STI.isNeonAvailable()) {
11026     BuildMI(MBB, Iter, DL, get(AArch64::MOVIv2d_ns), Reg)
11027         .addImm(0);
11028   } else {
11029     // This is a streaming-compatible function without SVE. We don't have full
11030     // Neon (just FPRs), so we can at most use the first 64-bit sub-register.
11031     // So given `movi v..` would be illegal use `fmov d..` instead.
11032     assert(STI.hasNEON() && "Expected to have NEON.");
11033     Register Reg64 = TRI.getSubReg(Reg, AArch64::dsub);
11034     BuildMI(MBB, Iter, DL, get(AArch64::FMOVD0), Reg64);
11035   }
11036 }
11037
// Recognize ORRWrs/ORRWrr/ORRXrs used as register-move aliases (zero source
// register WZR/XZR, zero shift) and return the destination/source pair.
// The extra sub-register checks reject a w->w ORR that is actually being
// used as a zero-extending w->x move, which must not be treated as a plain
// copy. Returns std::nullopt for everything else.
// NOTE(review): the line carrying this method's name and signature is
// missing from this extraction of the file.
11038 std::optional<DestSourcePair>
11040
11041   // AArch64::ORRWrs and AArch64::ORRXrs with WZR/XZR reg
11042   // and zero immediate operands used as an alias for mov instruction.
11043   if (((MI.getOpcode() == AArch64::ORRWrs &&
11044         MI.getOperand(1).getReg() == AArch64::WZR &&
11045         MI.getOperand(3).getImm() == 0x0) ||
11046        (MI.getOpcode() == AArch64::ORRWrr &&
11047         MI.getOperand(1).getReg() == AArch64::WZR)) &&
11048       // Check that the w->w move is not a zero-extending w->x mov.
11049       (!MI.getOperand(0).getReg().isVirtual() ||
11050        MI.getOperand(0).getSubReg() == 0) &&
11051       (!MI.getOperand(0).getReg().isPhysical() ||
11052        MI.findRegisterDefOperandIdx(getXRegFromWReg(MI.getOperand(0).getReg()),
11053                                     /*TRI=*/nullptr) == -1))
11054     return DestSourcePair{MI.getOperand(0), MI.getOperand(2)};
11055
11056   if (MI.getOpcode() == AArch64::ORRXrs &&
11057       MI.getOperand(1).getReg() == AArch64::XZR &&
11058       MI.getOperand(3).getImm() == 0x0)
11059     return DestSourcePair{MI.getOperand(0), MI.getOperand(2)};
11060
11061   return std::nullopt;
11062 }
11063
// Like isCopyInstrImpl, but without the zero-extension sub-register checks:
// any ORRWrs (WZR source, zero shift) or ORRWrr (WZR source) is reported as
// copy-like. Returns std::nullopt otherwise.
// NOTE(review): the signature line of this method is missing from this
// extraction of the file.
11064 std::optional<DestSourcePair>
11066   if ((MI.getOpcode() == AArch64::ORRWrs &&
11067        MI.getOperand(1).getReg() == AArch64::WZR &&
11068        MI.getOperand(3).getImm() == 0x0) ||
11069       (MI.getOpcode() == AArch64::ORRWrr &&
11070        MI.getOperand(1).getReg() == AArch64::WZR))
11071     return DestSourcePair{MI.getOperand(0), MI.getOperand(2)};
11072   return std::nullopt;
11073 }
11074
11075std::optional<RegImmPair>
11076AArch64InstrInfo::isAddImmediate(const MachineInstr &MI, Register Reg) const {
11077 int Sign = 1;
11078 int64_t Offset = 0;
11079
11080 // TODO: Handle cases where Reg is a super- or sub-register of the
11081 // destination register.
11082 const MachineOperand &Op0 = MI.getOperand(0);
11083 if (!Op0.isReg() || Reg != Op0.getReg())
11084 return std::nullopt;
11085
11086 switch (MI.getOpcode()) {
11087 default:
11088 return std::nullopt;
11089 case AArch64::SUBWri:
11090 case AArch64::SUBXri:
11091 case AArch64::SUBSWri:
11092 case AArch64::SUBSXri:
11093 Sign *= -1;
11094 [[fallthrough]];
11095 case AArch64::ADDSWri:
11096 case AArch64::ADDSXri:
11097 case AArch64::ADDWri:
11098 case AArch64::ADDXri: {
11099 // TODO: Third operand can be global address (usually some string).
11100 if (!MI.getOperand(0).isReg() || !MI.getOperand(1).isReg() ||
11101 !MI.getOperand(2).isImm())
11102 return std::nullopt;
11103 int Shift = MI.getOperand(3).getImm();
11104 assert((Shift == 0 || Shift == 12) && "Shift can be either 0 or 12");
11105 Offset = Sign * (MI.getOperand(2).getImm() << Shift);
11106 }
11107 }
11108 return RegImmPair{MI.getOperand(1).getReg(), Offset};
11109}
11110
11111/// If the given ORR instruction is a copy, and \p DescribedReg overlaps with
11112/// the destination register then, if possible, describe the value in terms of
11113/// the source register.
11114static std::optional<ParamLoadedValue>
11116 const TargetInstrInfo *TII,
11117 const TargetRegisterInfo *TRI) {
11118 auto DestSrc = TII->isCopyLikeInstr(MI);
11119 if (!DestSrc)
11120 return std::nullopt;
11121
11122 Register DestReg = DestSrc->Destination->getReg();
11123 Register SrcReg = DestSrc->Source->getReg();
11124
11125 if (!DestReg.isValid() || !SrcReg.isValid())
11126 return std::nullopt;
11127
11128 auto Expr = DIExpression::get(MI.getMF()->getFunction().getContext(), {});
11129
11130 // If the described register is the destination, just return the source.
11131 if (DestReg == DescribedReg)
11132 return ParamLoadedValue(MachineOperand::CreateReg(SrcReg, false), Expr);
11133
11134 // ORRWrs zero-extends to 64-bits, so we need to consider such cases.
11135 if (MI.getOpcode() == AArch64::ORRWrs &&
11136 TRI->isSuperRegister(DestReg, DescribedReg))
11137 return ParamLoadedValue(MachineOperand::CreateReg(SrcReg, false), Expr);
11138
11139 // We may need to describe the lower part of a ORRXrs move.
11140 if (MI.getOpcode() == AArch64::ORRXrs &&
11141 TRI->isSubRegister(DestReg, DescribedReg)) {
11142 Register SrcSubReg = TRI->getSubReg(SrcReg, AArch64::sub_32);
11143 return ParamLoadedValue(MachineOperand::CreateReg(SrcSubReg, false), Expr);
11144 }
11145
11146 assert(!TRI->isSuperOrSubRegisterEq(DestReg, DescribedReg) &&
11147 "Unhandled ORR[XW]rs copy case");
11148
11149 return std::nullopt;
11150}
11151
// A function with a red zone (or where red-zone usage is unknown) must not
// be split across sections; everything else defers to the base class.
// NOTE(review): the final return statement of this function is missing from
// this extraction of the file — only the red-zone early-exit is visible.
11152 bool AArch64InstrInfo::isFunctionSafeToSplit(const MachineFunction &MF) const {
11153   // Functions cannot be split to different sections on AArch64 if they have
11154   // a red zone. This is because relaxing a cross-section branch may require
11155   // incrementing the stack pointer to spill a register, which would overwrite
11156   // the red zone.
11157   if (MF.getInfo<AArch64FunctionInfo>()->hasRedZone().value_or(true))
11158     return false;
11159
11161 }
11162
11163bool AArch64InstrInfo::isMBBSafeToSplitToCold(
11164 const MachineBasicBlock &MBB) const {
11165 // Asm Goto blocks can contain conditional branches to goto labels, which can
11166 // get moved out of range of the branch instruction.
11167 auto isAsmGoto = [](const MachineInstr &MI) {
11168 return MI.getOpcode() == AArch64::INLINEASM_BR;
11169 };
11170 if (llvm::any_of(MBB, isAsmGoto) || MBB.isInlineAsmBrIndirectTarget())
11171 return false;
11172
11173 // Because jump tables are label-relative instead of table-relative, they all
11174 // must be in the same section or relocation fixup handling will fail.
11175
11176 // Check if MBB is a jump table target
11177 const MachineJumpTableInfo *MJTI = MBB.getParent()->getJumpTableInfo();
11178 auto containsMBB = [&MBB](const MachineJumpTableEntry &JTE) {
11179 return llvm::is_contained(JTE.MBBs, &MBB);
11180 };
11181 if (MJTI != nullptr && llvm::any_of(MJTI->getJumpTables(), containsMBB))
11182 return false;
11183
11184 // Check if MBB contains a jump table lookup
11185 for (const MachineInstr &MI : MBB) {
11186 switch (MI.getOpcode()) {
11187 case TargetOpcode::G_BRJT:
11188 case AArch64::JumpTableDest32:
11189 case AArch64::JumpTableDest16:
11190 case AArch64::JumpTableDest8:
11191 return false;
11192 default:
11193 continue;
11194 }
11195 }
11196
11197 // MBB isn't a special case, so it's safe to be split to the cold section.
11198 return true;
11199}
11200
// Describe the value loaded into \p Reg by \p MI for debug-info purposes:
// MOVZ immediates become a constant operand (with super-register handling),
// ORR copies are delegated to describeORRLoadedValue.
// NOTE(review): the fall-through return after the switch (delegating to the
// base-class implementation) is missing from this extraction of the file.
11201 std::optional<ParamLoadedValue>
11202 AArch64InstrInfo::describeLoadedValue(const MachineInstr &MI,
11203                                       Register Reg) const {
11204   const MachineFunction *MF = MI.getMF();
11205   const TargetRegisterInfo *TRI = MF->getSubtarget().getRegisterInfo();
11206   switch (MI.getOpcode()) {
11207   case AArch64::MOVZWi:
11208   case AArch64::MOVZXi: {
11209     // MOVZWi may be used for producing zero-extended 32-bit immediates in
11210     // 64-bit parameters, so we need to consider super-registers.
11211     if (!TRI->isSuperRegisterEq(MI.getOperand(0).getReg(), Reg))
11212       return std::nullopt;
11213
11214     if (!MI.getOperand(1).isImm())
11215       return std::nullopt;
11216     int64_t Immediate = MI.getOperand(1).getImm();
11217     int Shift = MI.getOperand(2).getImm();
11218     return ParamLoadedValue(MachineOperand::CreateImm(Immediate << Shift),
11219                             nullptr);
11220   }
11221   case AArch64::ORRWrs:
11222   case AArch64::ORRXrs:
11223     return describeORRLoadedValue(MI, Reg, this, TRI);
11224   }
11225
11227 }
11228
11229bool AArch64InstrInfo::isExtendLikelyToBeFolded(
11230 MachineInstr &ExtMI, MachineRegisterInfo &MRI) const {
11231 assert(ExtMI.getOpcode() == TargetOpcode::G_SEXT ||
11232 ExtMI.getOpcode() == TargetOpcode::G_ZEXT ||
11233 ExtMI.getOpcode() == TargetOpcode::G_ANYEXT);
11234
11235 // Anyexts are nops.
11236 if (ExtMI.getOpcode() == TargetOpcode::G_ANYEXT)
11237 return true;
11238
11239 Register DefReg = ExtMI.getOperand(0).getReg();
11240 if (!MRI.hasOneNonDBGUse(DefReg))
11241 return false;
11242
11243 // It's likely that a sext/zext as a G_PTR_ADD offset will be folded into an
11244 // addressing mode.
11245 auto *UserMI = &*MRI.use_instr_nodbg_begin(DefReg);
11246 return UserMI->getOpcode() == TargetOpcode::G_PTR_ADD;
11247}
11248
11249uint64_t AArch64InstrInfo::getElementSizeForOpcode(unsigned Opc) const {
11250 return get(Opc).TSFlags & AArch64::ElementSizeMask;
11251}
11252
11253bool AArch64InstrInfo::isPTestLikeOpcode(unsigned Opc) const {
11254 return get(Opc).TSFlags & AArch64::InstrFlagIsPTestLike;
11255}
11256
11257bool AArch64InstrInfo::isWhileOpcode(unsigned Opc) const {
11258 return get(Opc).TSFlags & AArch64::InstrFlagIsWhile;
11259}
11260
11261unsigned int
11262AArch64InstrInfo::getTailDuplicateSize(CodeGenOptLevel OptLevel) const {
11263 return OptLevel >= CodeGenOptLevel::Aggressive ? 6 : 2;
11264}
11265
11266bool AArch64InstrInfo::isLegalAddressingMode(unsigned NumBytes, int64_t Offset,
11267 unsigned Scale) const {
11268 if (Offset && Scale)
11269 return false;
11270
11271 // Check Reg + Imm
11272 if (!Scale) {
11273 // 9-bit signed offset
11274 if (isInt<9>(Offset))
11275 return true;
11276
11277 // 12-bit unsigned offset
11278 unsigned Shift = Log2_64(NumBytes);
11279 if (NumBytes && Offset > 0 && (Offset / NumBytes) <= (1LL << 12) - 1 &&
11280 // Must be a multiple of NumBytes (NumBytes is a power of 2)
11281 (Offset >> Shift) << Shift == Offset)
11282 return true;
11283 return false;
11284 }
11285
11286 // Check reg1 + SIZE_IN_BYTES * reg2 and reg1 + reg2
11287 return Scale == 1 || (Scale > 0 && Scale == NumBytes);
11288}
11289
// Return the opcode to use for an indirect call: BLRNoIP when SLS BLR
// hardening is enabled on this subtarget, plain BLR otherwise.
// NOTE(review): the signature line of this function is missing from this
// extraction of the file.
11291   if (MF.getSubtarget<AArch64Subtarget>().hardenSlsBlr())
11292     return AArch64::BLRNoIP;
11293   else
11294     return AArch64::BLR;
11295 }
11296
// Emit a stack-probing loop that lowers SP to TargetReg in ProbeSize steps,
// touching each page on the way down. Structure:
//   LoopTest:  SP -= ProbeSize; CMP SP, TargetReg; B.<cond> LoopExit
//   LoopBody:  LDR XZR, [SP]; B LoopTest
//   LoopExit:  MOV SP, TargetReg; LDR XZR, [SP]
// The remainder of the original block is spliced into ExitMBB, successors
// are rewired, and live-ins are recomputed when register info is frozen.
// NOTE(review): the signature's first lines, the MIFlag initializer, the
// condition-code/shift operand lines of several BuildMI chains, and the
// memory-operand lines of the probe load are missing from this extraction.
11299                               Register TargetReg, bool FrameSetup) const {
11300   assert(TargetReg != AArch64::SP && "New top of stack cannot already be in SP");
11301
11302   MachineBasicBlock &MBB = *MBBI->getParent();
11303   MachineFunction &MF = *MBB.getParent();
11304   const AArch64InstrInfo *TII =
11305       MF.getSubtarget<AArch64Subtarget>().getInstrInfo();
11306   int64_t ProbeSize = MF.getInfo<AArch64FunctionInfo>()->getStackProbeSize();
11307   DebugLoc DL = MBB.findDebugLoc(MBBI);
11308
11309   MachineFunction::iterator MBBInsertPoint = std::next(MBB.getIterator());
11310   MachineBasicBlock *LoopTestMBB =
11311       MF.CreateMachineBasicBlock(MBB.getBasicBlock());
11312   MF.insert(MBBInsertPoint, LoopTestMBB);
11313   MachineBasicBlock *LoopBodyMBB =
11314       MF.CreateMachineBasicBlock(MBB.getBasicBlock());
11315   MF.insert(MBBInsertPoint, LoopBodyMBB);
11316   MachineBasicBlock *ExitMBB = MF.CreateMachineBasicBlock(MBB.getBasicBlock());
11317   MF.insert(MBBInsertPoint, ExitMBB);
11318   MachineInstr::MIFlag Flags =
11320
11321   // LoopTest:
11322   //   SUB  SP, SP, #ProbeSize
11323   emitFrameOffset(*LoopTestMBB, LoopTestMBB->end(), DL, AArch64::SP,
11324                   AArch64::SP, StackOffset::getFixed(-ProbeSize), TII, Flags);
11325
11326   // CMP SP, TargetReg
11327   BuildMI(*LoopTestMBB, LoopTestMBB->end(), DL, TII->get(AArch64::SUBSXrx64),
11328           AArch64::XZR)
11329       .addReg(AArch64::SP)
11330       .addReg(TargetReg)
11332       .setMIFlags(Flags);
11333
11334   // B.<Cond> LoopExit
11335   BuildMI(*LoopTestMBB, LoopTestMBB->end(), DL, TII->get(AArch64::Bcc))
11337       .addMBB(ExitMBB)
11338       .setMIFlags(Flags);
11339
11340   // LDR XZR, [SP]
11341   BuildMI(*LoopBodyMBB, LoopBodyMBB->end(), DL, TII->get(AArch64::LDRXui))
11342       .addDef(AArch64::XZR)
11343       .addReg(AArch64::SP)
11344       .addImm(0)
11348                                           Align(8)))
11349       .setMIFlags(Flags);
11350
11351   // B loop
11352   BuildMI(*LoopBodyMBB, LoopBodyMBB->end(), DL, TII->get(AArch64::B))
11353       .addMBB(LoopTestMBB)
11354       .setMIFlags(Flags);
11355
11356   // LoopExit:
11357   //   MOV  SP, TargetReg
11358   BuildMI(*ExitMBB, ExitMBB->end(), DL, TII->get(AArch64::ADDXri), AArch64::SP)
11359       .addReg(TargetReg)
11360       .addImm(0)
11362       .setMIFlags(Flags);
11363
11364   // LDR XZR, [SP]
11365   BuildMI(*ExitMBB, ExitMBB->end(), DL, TII->get(AArch64::LDRXui))
11366       .addReg(AArch64::XZR, RegState::Define)
11367       .addReg(AArch64::SP)
11368       .addImm(0)
11369       .setMIFlags(Flags);
11370
11371   ExitMBB->splice(ExitMBB->end(), &MBB, std::next(MBBI), MBB.end());
11373
11374   LoopTestMBB->addSuccessor(ExitMBB);
11375   LoopTestMBB->addSuccessor(LoopBodyMBB);
11376   LoopBodyMBB->addSuccessor(LoopTestMBB);
11377   MBB.addSuccessor(LoopTestMBB);
11378
11379   // Update liveins.
11380   if (MF.getRegInfo().reservedRegsFrozen())
11381     fullyRecomputeLiveIns({ExitMBB, LoopBodyMBB, LoopTestMBB});
11382
11383   return ExitMBB->begin();
11384 }
11385
// Target hooks used by the MachinePipeliner for single-block AArch64 loops.
// Captures the loop's conditional branch, the NZCV-setting compare, the
// counter-update instruction, and the normalized exit condition, so that
// trip-count guard conditions can be synthesized during software pipelining.
// NOTE(review): the declarations of the MRI member, the Cond member, and the
// constructor's Cond parameter are missing from this extraction of the file.
11386 namespace {
11387 class AArch64PipelinerLoopInfo : public TargetInstrInfo::PipelinerLoopInfo {
11388   MachineFunction *MF;
11389   const TargetInstrInfo *TII;
11390   const TargetRegisterInfo *TRI;
11392
11393   /// The block of the loop
11394   MachineBasicBlock *LoopBB;
11395   /// The conditional branch of the loop
11396   MachineInstr *CondBranch;
11397   /// The compare instruction for loop control
11398   MachineInstr *Comp;
11399   /// The number of the operand of the loop counter value in Comp
11400   unsigned CompCounterOprNum;
11401   /// The instruction that updates the loop counter value
11402   MachineInstr *Update;
11403   /// The number of the operand of the loop counter value in Update
11404   unsigned UpdateCounterOprNum;
11405   /// The initial value of the loop counter
11406   Register Init;
11407   /// True iff Update is a predecessor of Comp
11408   bool IsUpdatePriorComp;
11409
11410   /// The normalized condition used by createTripCountGreaterCondition()
11412
11413 public:
11414   AArch64PipelinerLoopInfo(MachineBasicBlock *LoopBB, MachineInstr *CondBranch,
11415                            MachineInstr *Comp, unsigned CompCounterOprNum,
11416                            MachineInstr *Update, unsigned UpdateCounterOprNum,
11417                            Register Init, bool IsUpdatePriorComp,
11419       : MF(Comp->getParent()->getParent()),
11420         TII(MF->getSubtarget().getInstrInfo()),
11421         TRI(MF->getSubtarget().getRegisterInfo()), MRI(MF->getRegInfo()),
11422         LoopBB(LoopBB), CondBranch(CondBranch), Comp(Comp),
11423         CompCounterOprNum(CompCounterOprNum), Update(Update),
11424         UpdateCounterOprNum(UpdateCounterOprNum), Init(Init),
11425         IsUpdatePriorComp(IsUpdatePriorComp), Cond(Cond.begin(), Cond.end()) {}
11426
11427   bool shouldIgnoreForPipelining(const MachineInstr *MI) const override {
11428     // Make the instructions for loop control be placed in stage 0.
11429     // The predecessors of Comp are considered by the caller.
11430     return MI == Comp;
11431   }
11432
11433   std::optional<bool> createTripCountGreaterCondition(
11434       int TC, MachineBasicBlock &MBB,
11435       SmallVectorImpl<MachineOperand> &CondParam) override {
11436     // A branch instruction will be inserted as "if (Cond) goto epilogue".
11437     // Cond is normalized for such use.
11438     // The predecessors of the branch are assumed to have already been inserted.
11439     CondParam = Cond;
11440     return {};
11441   }
11442
11443   void createRemainingIterationsGreaterCondition(
11444       int TC, MachineBasicBlock &MBB, SmallVectorImpl<MachineOperand> &Cond,
11445       DenseMap<MachineInstr *, MachineInstr *> &LastStage0Insts) override;
11446
11447   void setPreheader(MachineBasicBlock *NewPreheader) override {}
11448
11449   void adjustTripCount(int TripCountAdjust) override {}
11450
11451   bool isMVEExpanderSupported() override { return true; }
11452 };
11453 } // namespace
11454
11455/// Clone an instruction from MI. The register of ReplaceOprNum-th operand
11456/// is replaced by ReplaceReg. The output register is newly created.
11457/// The other operands are unchanged from MI.
11458static Register cloneInstr(const MachineInstr *MI, unsigned ReplaceOprNum,
11459 Register ReplaceReg, MachineBasicBlock &MBB,
11460 MachineBasicBlock::iterator InsertTo) {
11461 MachineRegisterInfo &MRI = MBB.getParent()->getRegInfo();
11462 const TargetInstrInfo *TII = MBB.getParent()->getSubtarget().getInstrInfo();
11463 MachineInstr *NewMI = MBB.getParent()->CloneMachineInstr(MI);
11464 Register Result = 0;
11465 for (unsigned I = 0; I < NewMI->getNumOperands(); ++I) {
11466 if (I == 0 && NewMI->getOperand(0).getReg().isVirtual()) {
11467 Result = MRI.createVirtualRegister(
11468 MRI.getRegClass(NewMI->getOperand(0).getReg()));
11469 NewMI->getOperand(I).setReg(Result);
11470 } else if (I == ReplaceOprNum) {
11471 MRI.constrainRegClass(ReplaceReg, TII->getRegClass(NewMI->getDesc(), I));
11472 NewMI->getOperand(I).setReg(ReplaceReg);
11473 }
11474 }
11475 MBB.insert(InsertTo, NewMI);
11476 return Result;
11477}
11478
// Synthesize "remaining iterations > TC" by cloning the loop's compare and
// update instructions TC+1 times and accumulating the exit condition into a
// register via CSINC chains; the final SUBSXri sets NZCV so the caller can
// branch on the result (AccCond == 0 means the remainder is greater than TC).
// NOTE(review): this definition's parameter lines, the CC declaration and
// its inversion, the lambda's CC parameter, the CSINC condition-code operand
// and the final Cond push_back are missing from this extraction of the file.
11479 void AArch64PipelinerLoopInfo::createRemainingIterationsGreaterCondition(
11482   // Create and accumulate conditions for next TC iterations.
11483   // Example:
11484   //   SUBSXrr N, counter, implicit-def $nzcv # compare instruction for the last
11485   //                                          # iteration of the kernel
11486   //
11487   //   # insert the following instructions
11488   //   cond = CSINCXr 0, 0, C, implicit $nzcv
11489   //   counter = ADDXri counter, 1            # clone from this->Update
11490   //   SUBSXrr n, counter, implicit-def $nzcv # clone from this->Comp
11491   //   cond = CSINCXr cond, cond, C, implicit $nzcv
11492   //   ... (repeat TC times)
11493   //   SUBSXri cond, 0, implicit-def $nzcv
11494
11495   assert(CondBranch->getOpcode() == AArch64::Bcc);
11496   // CondCode to exit the loop
11498       (AArch64CC::CondCode)CondBranch->getOperand(0).getImm();
11499   if (CondBranch->getOperand(1).getMBB() == LoopBB)
11501
11502   // Accumulate conditions to exit the loop
11503   Register AccCond = AArch64::XZR;
11504
11505   // If CC holds, CurCond+1 is returned; otherwise CurCond is returned.
11506   auto AccumulateCond = [&](Register CurCond,
11508     Register NewCond = MRI.createVirtualRegister(&AArch64::GPR64commonRegClass);
11509     BuildMI(MBB, MBB.end(), Comp->getDebugLoc(), TII->get(AArch64::CSINCXr))
11510         .addReg(NewCond, RegState::Define)
11511         .addReg(CurCond)
11512         .addReg(CurCond)
11514     return NewCond;
11515   };
11516
11517   if (!LastStage0Insts.empty() && LastStage0Insts[Comp]->getParent() == &MBB) {
11518     // Update and Comp for I==0 are already exists in MBB
11519     // (MBB is an unrolled kernel)
11520     Register Counter;
11521     for (int I = 0; I <= TC; ++I) {
11522       Register NextCounter;
11523       if (I != 0)
11524         NextCounter =
11525             cloneInstr(Comp, CompCounterOprNum, Counter, MBB, MBB.end());
11526
11527       AccCond = AccumulateCond(AccCond, CC);
11528
11529       if (I != TC) {
11530         if (I == 0) {
11531           if (Update != Comp && IsUpdatePriorComp) {
11532             Counter =
11533                 LastStage0Insts[Comp]->getOperand(CompCounterOprNum).getReg();
11534             NextCounter = cloneInstr(Update, UpdateCounterOprNum, Counter, MBB,
11535                                      MBB.end());
11536           } else {
11537             // can use already calculated value
11538             NextCounter = LastStage0Insts[Update]->getOperand(0).getReg();
11539           }
11540         } else if (Update != Comp) {
11541           NextCounter =
11542               cloneInstr(Update, UpdateCounterOprNum, Counter, MBB, MBB.end());
11543         }
11544       }
11545       Counter = NextCounter;
11546     }
11547   } else {
11548     Register Counter;
11549     if (LastStage0Insts.empty()) {
11550       // use initial counter value (testing if the trip count is sufficient to
11551       // be executed by pipelined code)
11552       Counter = Init;
11553       if (IsUpdatePriorComp)
11554         Counter =
11555             cloneInstr(Update, UpdateCounterOprNum, Counter, MBB, MBB.end());
11556     } else {
11557       // MBB is an epilogue block. LastStage0Insts[Comp] is in the kernel block.
11558       Counter = LastStage0Insts[Comp]->getOperand(CompCounterOprNum).getReg();
11559     }
11560
11561     for (int I = 0; I <= TC; ++I) {
11562       Register NextCounter;
11563       NextCounter =
11564           cloneInstr(Comp, CompCounterOprNum, Counter, MBB, MBB.end());
11565       AccCond = AccumulateCond(AccCond, CC);
11566       if (I != TC && Update != Comp)
11567         NextCounter =
11568             cloneInstr(Update, UpdateCounterOprNum, Counter, MBB, MBB.end());
11569       Counter = NextCounter;
11570     }
11571   }
11572
11573   // If AccCond == 0, the remainder is greater than TC.
11574   BuildMI(MBB, MBB.end(), Comp->getDebugLoc(), TII->get(AArch64::SUBSXri))
11575       .addReg(AArch64::XZR, RegState::Define | RegState::Dead)
11576       .addReg(AccCond)
11577       .addImm(0)
11578       .addImm(0);
11579   Cond.clear();
11581 }
11582
11583static void extractPhiReg(const MachineInstr &Phi, const MachineBasicBlock *MBB,
11584 Register &RegMBB, Register &RegOther) {
11585 assert(Phi.getNumOperands() == 5);
11586 if (Phi.getOperand(2).getMBB() == MBB) {
11587 RegMBB = Phi.getOperand(1).getReg();
11588 RegOther = Phi.getOperand(3).getReg();
11589 } else {
11590 assert(Phi.getOperand(4).getMBB() == MBB);
11591 RegMBB = Phi.getOperand(3).getReg();
11592 RegOther = Phi.getOperand(1).getReg();
11593 }
11594}
11595
// Return true if the virtual register's defining instruction lives in a
// block other than \p BB (i.e. the value is loop-invariant with respect to
// BB); physical registers always report false.
// NOTE(review): the signature line of this helper is missing from this
// extraction of the file.
11597   if (!Reg.isVirtual())
11598     return false;
11599   const MachineRegisterInfo &MRI = BB->getParent()->getRegInfo();
11600   return MRI.getVRegDef(Reg)->getParent() != BB;
11601 }
11602
11603/// If Reg is an induction variable, return true and set some parameters
11604static bool getIndVarInfo(Register Reg, const MachineBasicBlock *LoopBB,
11605 MachineInstr *&UpdateInst,
11606 unsigned &UpdateCounterOprNum, Register &InitReg,
11607 bool &IsUpdatePriorComp) {
11608 // Example:
11609 //
11610 // Preheader:
11611 // InitReg = ...
11612 // LoopBB:
11613 // Reg0 = PHI (InitReg, Preheader), (Reg1, LoopBB)
11614 // Reg = COPY Reg0 ; COPY is ignored.
11615 // Reg1 = ADD Reg, #1; UpdateInst. Incremented by a loop invariant value.
11616 // ; Reg is the value calculated in the previous
11617 // ; iteration, so IsUpdatePriorComp == false.
11618
11619 if (LoopBB->pred_size() != 2)
11620 return false;
11621 if (!Reg.isVirtual())
11622 return false;
11623 const MachineRegisterInfo &MRI = LoopBB->getParent()->getRegInfo();
11624 UpdateInst = nullptr;
11625 UpdateCounterOprNum = 0;
11626 InitReg = 0;
11627 IsUpdatePriorComp = true;
11628 Register CurReg = Reg;
11629 while (true) {
11630 MachineInstr *Def = MRI.getVRegDef(CurReg);
11631 if (Def->getParent() != LoopBB)
11632 return false;
11633 if (Def->isCopy()) {
11634 // Ignore copy instructions unless they contain subregisters
11635 if (Def->getOperand(0).getSubReg() || Def->getOperand(1).getSubReg())
11636 return false;
11637 CurReg = Def->getOperand(1).getReg();
11638 } else if (Def->isPHI()) {
11639 if (InitReg != 0)
11640 return false;
11641 if (!UpdateInst)
11642 IsUpdatePriorComp = false;
11643 extractPhiReg(*Def, LoopBB, CurReg, InitReg);
11644 } else {
11645 if (UpdateInst)
11646 return false;
11647 switch (Def->getOpcode()) {
11648 case AArch64::ADDSXri:
11649 case AArch64::ADDSWri:
11650 case AArch64::SUBSXri:
11651 case AArch64::SUBSWri:
11652 case AArch64::ADDXri:
11653 case AArch64::ADDWri:
11654 case AArch64::SUBXri:
11655 case AArch64::SUBWri:
11656 UpdateInst = Def;
11657 UpdateCounterOprNum = 1;
11658 break;
11659 case AArch64::ADDSXrr:
11660 case AArch64::ADDSWrr:
11661 case AArch64::SUBSXrr:
11662 case AArch64::SUBSWrr:
11663 case AArch64::ADDXrr:
11664 case AArch64::ADDWrr:
11665 case AArch64::SUBXrr:
11666 case AArch64::SUBWrr:
11667 UpdateInst = Def;
11668 if (isDefinedOutside(Def->getOperand(2).getReg(), LoopBB))
11669 UpdateCounterOprNum = 1;
11670 else if (isDefinedOutside(Def->getOperand(1).getReg(), LoopBB))
11671 UpdateCounterOprNum = 2;
11672 else
11673 return false;
11674 break;
11675 default:
11676 return false;
11677 }
11678 CurReg = Def->getOperand(UpdateCounterOprNum).getReg();
11679 }
11680
11681 if (!CurReg.isVirtual())
11682 return false;
11683 if (Reg == CurReg)
11684 break;
11685 }
11686
11687 if (!UpdateInst)
11688 return false;
11689
11690 return true;
11691}
11692
// Decide whether the single-block loop is pipelinable and, if so, package
// the branch/compare/update information into an AArch64PipelinerLoopInfo.
// NOTE(review): the signature line, the Cond vector declaration, a line
// after the terminator lookup, and the Cond-normalization statement are
// missing from this extraction of the file.
11693 std::unique_ptr<TargetInstrInfo::PipelinerLoopInfo>
11695   // Accept loops that meet the following conditions
11696   // * The conditional branch is BCC
11697   // * The compare instruction is ADDS/SUBS/WHILEXX
11698   // * One operand of the compare is an induction variable and the other is a
11699   //   loop invariant value
11700   // * The induction variable is incremented/decremented by a single instruction
11701   // * Does not contain CALL or instructions which have unmodeled side effects
11702
11703   for (MachineInstr &MI : *LoopBB)
11704     if (MI.isCall() || MI.hasUnmodeledSideEffects())
11705       // This instruction may use NZCV, which interferes with the instruction to
11706       // be inserted for loop control.
11707       return nullptr;
11708
11709   MachineBasicBlock *TBB = nullptr, *FBB = nullptr;
11711   if (analyzeBranch(*LoopBB, TBB, FBB, Cond))
11712     return nullptr;
11713
11714   // Infinite loops are not supported
11715   if (TBB == LoopBB && FBB == LoopBB)
11716     return nullptr;
11717
11718   // Must be conditional branch
11719   if (TBB != LoopBB && FBB == nullptr)
11720     return nullptr;
11721
11722   assert((TBB == LoopBB || FBB == LoopBB) &&
11723          "The Loop must be a single-basic-block loop");
11724
11725   MachineInstr *CondBranch = &*LoopBB->getFirstTerminator();
11727
11728   if (CondBranch->getOpcode() != AArch64::Bcc)
11729     return nullptr;
11730
11731   // Normalization for createTripCountGreaterCondition()
11732   if (TBB == LoopBB)
11734
// Find the NZCV-defining compare closest to the branch and work out which
// of its operands carries the loop counter (the other must be invariant).
11735   MachineInstr *Comp = nullptr;
11736   unsigned CompCounterOprNum = 0;
11737   for (MachineInstr &MI : reverse(*LoopBB)) {
11738     if (MI.modifiesRegister(AArch64::NZCV, &TRI)) {
11739       // Guarantee that the compare is SUBS/ADDS/WHILEXX and that one of the
11740       // operands is a loop invariant value
11741
11742       switch (MI.getOpcode()) {
11743       case AArch64::SUBSXri:
11744       case AArch64::SUBSWri:
11745       case AArch64::ADDSXri:
11746       case AArch64::ADDSWri:
11747         Comp = &MI;
11748         CompCounterOprNum = 1;
11749         break;
11750       case AArch64::ADDSWrr:
11751       case AArch64::ADDSXrr:
11752       case AArch64::SUBSWrr:
11753       case AArch64::SUBSXrr:
11754         Comp = &MI;
11755         break;
11756       default:
11757         if (isWhileOpcode(MI.getOpcode())) {
11758           Comp = &MI;
11759           break;
11760         }
11761         return nullptr;
11762       }
11763
11764       if (CompCounterOprNum == 0) {
11765         if (isDefinedOutside(Comp->getOperand(1).getReg(), LoopBB))
11766           CompCounterOprNum = 2;
11767         else if (isDefinedOutside(Comp->getOperand(2).getReg(), LoopBB))
11768           CompCounterOprNum = 1;
11769         else
11770           return nullptr;
11771       }
11772       break;
11773     }
11774   }
11775   if (!Comp)
11776     return nullptr;
11777
11778   MachineInstr *Update = nullptr;
11779   Register Init;
11780   bool IsUpdatePriorComp;
11781   unsigned UpdateCounterOprNum;
11782   if (!getIndVarInfo(Comp->getOperand(CompCounterOprNum).getReg(), LoopBB,
11783                      Update, UpdateCounterOprNum, Init, IsUpdatePriorComp))
11784     return nullptr;
11785
11786   return std::make_unique<AArch64PipelinerLoopInfo>(
11787       LoopBB, CondBranch, Comp, CompCounterOprNum, Update, UpdateCounterOprNum,
11788       Init, IsUpdatePriorComp, Cond);
11789 }
11790
11791/// verifyInstruction - Perform target specific instruction verification.
11792bool AArch64InstrInfo::verifyInstruction(const MachineInstr &MI,
11793 StringRef &ErrInfo) const {
11794 // Verify that immediate offsets on load/store instructions are within range.
11795 // Stack objects with an FI operand are excluded as they can be fixed up
11796 // during PEI.
11797 TypeSize Scale(0U, false), Width(0U, false);
11798 int64_t MinOffset, MaxOffset;
11799 if (getMemOpInfo(MI.getOpcode(), Scale, Width, MinOffset, MaxOffset)) {
11800 unsigned ImmIdx = getLoadStoreImmIdx(MI.getOpcode());
11801 if (MI.getOperand(ImmIdx).isImm() && !MI.getOperand(ImmIdx - 1).isFI()) {
11802 int64_t Imm = MI.getOperand(ImmIdx).getImm();
11803 if (Imm < MinOffset || Imm > MaxOffset) {
11804 ErrInfo = "Unexpected immediate on load/store instruction";
11805 return false;
11806 }
11807 }
11808 }
11809
11810 const MCInstrDesc &MCID = MI.getDesc();
11811 for (unsigned Op = 0; Op < MCID.getNumOperands(); Op++) {
11812 const MachineOperand &MO = MI.getOperand(Op);
11813 switch (MCID.operands()[Op].OperandType) {
11815 if (!MO.isImm() || MO.getImm() != 0) {
11816 ErrInfo = "OPERAND_IMPLICIT_IMM_0 should be 0";
11817 return false;
11818 }
11819 break;
11821 if (!MO.isImm() ||
11823 (AArch64_AM::getShiftValue(MO.getImm()) != 8 &&
11824 AArch64_AM::getShiftValue(MO.getImm()) != 16)) {
11825 ErrInfo = "OPERAND_SHIFT_MSL should be msl shift of 8 or 16";
11826 return false;
11827 }
11828 break;
11829 default:
11830 break;
11831 }
11832 }
11833 return true;
11834}
11835
11836#define GET_INSTRINFO_HELPERS
11837#define GET_INSTRMAP_INFO
11838#include "AArch64GenInstrInfo.inc"
MachineInstrBuilder & UseMI
MachineInstrBuilder MachineInstrBuilder & DefMI
static bool forwardCopyWillClobberTuple(unsigned DestReg, unsigned SrcReg, unsigned NumRegs)
static cl::opt< unsigned > BCCDisplacementBits("aarch64-bcc-offset-bits", cl::Hidden, cl::init(19), cl::desc("Restrict range of Bcc instructions (DEBUG)"))
static Register genNeg(MachineFunction &MF, MachineRegisterInfo &MRI, const TargetInstrInfo *TII, MachineInstr &Root, SmallVectorImpl< MachineInstr * > &InsInstrs, DenseMap< Register, unsigned > &InstrIdxForVirtReg, unsigned MnegOpc, const TargetRegisterClass *RC)
genNeg - Helper to generate an intermediate negation of the second operand of Root
static bool isFrameStoreOpcode(int Opcode)
static cl::opt< unsigned > GatherOptSearchLimit("aarch64-search-limit", cl::Hidden, cl::init(2048), cl::desc("Restrict range of instructions to search for the " "machine-combiner gather pattern optimization"))
static bool getMaddPatterns(MachineInstr &Root, SmallVectorImpl< unsigned > &Patterns)
Find instructions that can be turned into madd.
static AArch64CC::CondCode findCondCodeUsedByInstr(const MachineInstr &Instr)
Find a condition code used by the instruction.
static MachineInstr * genFusedMultiplyAcc(MachineFunction &MF, MachineRegisterInfo &MRI, const TargetInstrInfo *TII, MachineInstr &Root, SmallVectorImpl< MachineInstr * > &InsInstrs, unsigned IdxMulOpd, unsigned MaddOpc, const TargetRegisterClass *RC)
genFusedMultiplyAcc - Helper to generate fused multiply accumulate instructions.
static MachineInstr * genFusedMultiplyAccNeg(MachineFunction &MF, MachineRegisterInfo &MRI, const TargetInstrInfo *TII, MachineInstr &Root, SmallVectorImpl< MachineInstr * > &InsInstrs, DenseMap< Register, unsigned > &InstrIdxForVirtReg, unsigned IdxMulOpd, unsigned MaddOpc, unsigned MnegOpc, const TargetRegisterClass *RC)
genFusedMultiplyAccNeg - Helper to generate fused multiply accumulate instructions with an additional...
static bool isCombineInstrCandidate64(unsigned Opc)
static bool isFrameLoadOpcode(int Opcode)
static unsigned removeCopies(const MachineRegisterInfo &MRI, unsigned VReg)
static bool areCFlagsAccessedBetweenInstrs(MachineBasicBlock::iterator From, MachineBasicBlock::iterator To, const TargetRegisterInfo *TRI, const AccessKind AccessToCheck=AK_All)
True when condition flags are accessed (either by writing or reading) on the instruction trace starti...
static bool getFMAPatterns(MachineInstr &Root, SmallVectorImpl< unsigned > &Patterns)
Floating-Point Support.
static bool isADDSRegImm(unsigned Opcode)
static bool isCheapCopy(const MachineInstr &MI, const AArch64RegisterInfo &RI)
static bool isANDOpcode(MachineInstr &MI)
static void appendOffsetComment(int NumBytes, llvm::raw_string_ostream &Comment, StringRef RegScale={})
static unsigned sForm(MachineInstr &Instr)
Get opcode of S version of Instr.
static bool isCombineInstrSettingFlag(unsigned Opc)
static bool getFNEGPatterns(MachineInstr &Root, SmallVectorImpl< unsigned > &Patterns)
static bool getIndVarInfo(Register Reg, const MachineBasicBlock *LoopBB, MachineInstr *&UpdateInst, unsigned &UpdateCounterOprNum, Register &InitReg, bool &IsUpdatePriorComp)
If Reg is an induction variable, return true and set some parameters.
static bool canPairLdStOpc(unsigned FirstOpc, unsigned SecondOpc)
static bool mustAvoidNeonAtMBBI(const AArch64Subtarget &Subtarget, MachineBasicBlock &MBB, MachineBasicBlock::iterator I)
Returns true if in a streaming call site region without SME-FA64.
static bool isPostIndexLdStOpcode(unsigned Opcode)
Return true if the opcode is a post-index ld/st instruction, which really loads from base+0.
static unsigned getBranchDisplacementBits(unsigned Opc)
static cl::opt< unsigned > CBDisplacementBits("aarch64-cb-offset-bits", cl::Hidden, cl::init(9), cl::desc("Restrict range of CB instructions (DEBUG)"))
static std::optional< ParamLoadedValue > describeORRLoadedValue(const MachineInstr &MI, Register DescribedReg, const TargetInstrInfo *TII, const TargetRegisterInfo *TRI)
If the given ORR instruction is a copy, and DescribedReg overlaps with the destination register then,...
static bool getFMULPatterns(MachineInstr &Root, SmallVectorImpl< unsigned > &Patterns)
static void appendReadRegExpr(SmallVectorImpl< char > &Expr, unsigned RegNum)
static MachineInstr * genMaddR(MachineFunction &MF, MachineRegisterInfo &MRI, const TargetInstrInfo *TII, MachineInstr &Root, SmallVectorImpl< MachineInstr * > &InsInstrs, unsigned IdxMulOpd, unsigned MaddOpc, unsigned VR, const TargetRegisterClass *RC)
genMaddR - Generate madd instruction and combine mul and add using an extra virtual register Example ...
static Register cloneInstr(const MachineInstr *MI, unsigned ReplaceOprNum, Register ReplaceReg, MachineBasicBlock &MBB, MachineBasicBlock::iterator InsertTo)
Clone an instruction from MI.
static bool scaleOffset(unsigned Opc, int64_t &Offset)
static bool canCombineWithFMUL(MachineBasicBlock &MBB, MachineOperand &MO, unsigned MulOpc)
unsigned scaledOffsetOpcode(unsigned Opcode, unsigned &Scale)
static MachineInstr * genFusedMultiplyIdx(MachineFunction &MF, MachineRegisterInfo &MRI, const TargetInstrInfo *TII, MachineInstr &Root, SmallVectorImpl< MachineInstr * > &InsInstrs, unsigned IdxMulOpd, unsigned MaddOpc, const TargetRegisterClass *RC)
genFusedMultiplyIdx - Helper to generate fused multiply accumulate instructions.
static MachineInstr * genIndexedMultiply(MachineInstr &Root, SmallVectorImpl< MachineInstr * > &InsInstrs, unsigned IdxDupOp, unsigned MulOpc, const TargetRegisterClass *RC, MachineRegisterInfo &MRI)
Fold (FMUL x (DUP y lane)) into (FMUL_indexed x y lane)
static bool isSUBSRegImm(unsigned Opcode)
static bool UpdateOperandRegClass(MachineInstr &Instr)
static const TargetRegisterClass * getRegClass(const MachineInstr &MI, Register Reg)
static bool isInStreamingCallSiteRegion(MachineBasicBlock &MBB, MachineBasicBlock::iterator I)
Returns true if the instruction at I is in a streaming call site region, within a single basic block.
static bool canCmpInstrBeRemoved(MachineInstr &MI, MachineInstr &CmpInstr, int CmpValue, const TargetRegisterInfo &TRI, SmallVectorImpl< MachineInstr * > &CCUseInstrs, bool &IsInvertCC)
unsigned unscaledOffsetOpcode(unsigned Opcode)
static bool getLoadPatterns(MachineInstr &Root, SmallVectorImpl< unsigned > &Patterns)
Search for patterns of LD instructions we can optimize.
static bool canInstrSubstituteCmpInstr(MachineInstr &MI, MachineInstr &CmpInstr, const TargetRegisterInfo &TRI)
Check if CmpInstr can be substituted by MI.
static UsedNZCV getUsedNZCV(AArch64CC::CondCode CC)
static bool isCombineInstrCandidateFP(const MachineInstr &Inst)
static void appendLoadRegExpr(SmallVectorImpl< char > &Expr, int64_t OffsetFromDefCFA)
static void appendConstantExpr(SmallVectorImpl< char > &Expr, int64_t Constant, dwarf::LocationAtom Operation)
static unsigned convertToNonFlagSettingOpc(const MachineInstr &MI)
Return the opcode that does not set flags when possible - otherwise return the original opcode.
static bool outliningCandidatesV8_3OpsConsensus(const outliner::Candidate &a, const outliner::Candidate &b)
static bool isCombineInstrCandidate32(unsigned Opc)
static void parseCondBranch(MachineInstr *LastInst, MachineBasicBlock *&Target, SmallVectorImpl< MachineOperand > &Cond)
static unsigned offsetExtendOpcode(unsigned Opcode)
MachineOutlinerMBBFlags
@ LRUnavailableSomewhere
@ UnsafeRegsDead
static void loadRegPairFromStackSlot(const TargetRegisterInfo &TRI, MachineBasicBlock &MBB, MachineBasicBlock::iterator InsertBefore, const MCInstrDesc &MCID, Register DestReg, unsigned SubIdx0, unsigned SubIdx1, int FI, MachineMemOperand *MMO)
static void generateGatherLanePattern(MachineInstr &Root, SmallVectorImpl< MachineInstr * > &InsInstrs, SmallVectorImpl< MachineInstr * > &DelInstrs, DenseMap< Register, unsigned > &InstrIdxForVirtReg, unsigned Pattern, unsigned NumLanes)
Generate optimized instruction sequence for gather load patterns to improve Memory-Level Parallelism ...
static bool getMiscPatterns(MachineInstr &Root, SmallVectorImpl< unsigned > &Patterns)
Find other MI combine patterns.
static bool outliningCandidatesSigningKeyConsensus(const outliner::Candidate &a, const outliner::Candidate &b)
static const MachineInstrBuilder & AddSubReg(const MachineInstrBuilder &MIB, MCRegister Reg, unsigned SubIdx, RegState State, const TargetRegisterInfo *TRI)
static bool outliningCandidatesSigningScopeConsensus(const outliner::Candidate &a, const outliner::Candidate &b)
static bool shouldClusterFI(const MachineFrameInfo &MFI, int FI1, int64_t Offset1, unsigned Opcode1, int FI2, int64_t Offset2, unsigned Opcode2)
static cl::opt< unsigned > TBZDisplacementBits("aarch64-tbz-offset-bits", cl::Hidden, cl::init(14), cl::desc("Restrict range of TB[N]Z instructions (DEBUG)"))
static void extractPhiReg(const MachineInstr &Phi, const MachineBasicBlock *MBB, Register &RegMBB, Register &RegOther)
static MCCFIInstruction createDefCFAExpression(const TargetRegisterInfo &TRI, unsigned Reg, const StackOffset &Offset)
static bool isDefinedOutside(Register Reg, const MachineBasicBlock *BB)
static MachineInstr * genFusedMultiply(MachineFunction &MF, MachineRegisterInfo &MRI, const TargetInstrInfo *TII, MachineInstr &Root, SmallVectorImpl< MachineInstr * > &InsInstrs, unsigned IdxMulOpd, unsigned MaddOpc, const TargetRegisterClass *RC, FMAInstKind kind=FMAInstKind::Default, const Register *ReplacedAddend=nullptr)
genFusedMultiply - Generate fused multiply instructions.
static bool getGatherLanePattern(MachineInstr &Root, SmallVectorImpl< unsigned > &Patterns, unsigned LoadLaneOpCode, unsigned NumLanes)
Check if the given instruction forms a gather load pattern that can be optimized for better Memory-Le...
static MachineInstr * genFusedMultiplyIdxNeg(MachineFunction &MF, MachineRegisterInfo &MRI, const TargetInstrInfo *TII, MachineInstr &Root, SmallVectorImpl< MachineInstr * > &InsInstrs, DenseMap< Register, unsigned > &InstrIdxForVirtReg, unsigned IdxMulOpd, unsigned MaddOpc, unsigned MnegOpc, const TargetRegisterClass *RC)
genFusedMultiplyIdxNeg - Helper to generate fused multiply accumulate instructions with an additional...
static bool isCombineInstrCandidate(unsigned Opc)
static unsigned regOffsetOpcode(unsigned Opcode)
MachineOutlinerClass
Constants defining how certain sequences should be outlined.
@ MachineOutlinerTailCall
Only emit a branch.
@ MachineOutlinerRegSave
Same as default, but save to a register.
@ MachineOutlinerNoLRSave
Emit a call and return.
@ MachineOutlinerThunk
Emit a call and tail-call.
@ MachineOutlinerDefault
static cl::opt< unsigned > BDisplacementBits("aarch64-b-offset-bits", cl::Hidden, cl::init(26), cl::desc("Restrict range of B instructions (DEBUG)"))
static bool areCFlagsAliveInSuccessors(const MachineBasicBlock *MBB)
Check if AArch64::NZCV should be alive in successors of MBB.
static void emitFrameOffsetAdj(MachineBasicBlock &MBB, MachineBasicBlock::iterator MBBI, const DebugLoc &DL, unsigned DestReg, unsigned SrcReg, int64_t Offset, unsigned Opc, const TargetInstrInfo *TII, MachineInstr::MIFlag Flag, bool NeedsWinCFI, bool *HasWinCFI, bool EmitCFAOffset, StackOffset CFAOffset, unsigned FrameReg)
static bool isCheapImmediate(const MachineInstr &MI, unsigned BitSize)
static cl::opt< unsigned > CBZDisplacementBits("aarch64-cbz-offset-bits", cl::Hidden, cl::init(19), cl::desc("Restrict range of CB[N]Z instructions (DEBUG)"))
static void genSubAdd2SubSub(MachineFunction &MF, MachineRegisterInfo &MRI, const TargetInstrInfo *TII, MachineInstr &Root, SmallVectorImpl< MachineInstr * > &InsInstrs, SmallVectorImpl< MachineInstr * > &DelInstrs, unsigned IdxOpd1, DenseMap< Register, unsigned > &InstrIdxForVirtReg)
Do the following transformation A - (B + C) ==> (A - B) - C A - (B + C) ==> (A - C) - B.
static unsigned canFoldIntoCSel(const MachineRegisterInfo &MRI, unsigned VReg, unsigned *NewReg=nullptr)
static void signOutlinedFunction(MachineFunction &MF, MachineBasicBlock &MBB, const AArch64InstrInfo *TII, bool ShouldSignReturnAddr)
static MachineInstr * genFNegatedMAD(MachineFunction &MF, MachineRegisterInfo &MRI, const TargetInstrInfo *TII, MachineInstr &Root, SmallVectorImpl< MachineInstr * > &InsInstrs)
static bool canCombineWithMUL(MachineBasicBlock &MBB, MachineOperand &MO, unsigned MulOpc, unsigned ZeroReg)
static void storeRegPairToStackSlot(const TargetRegisterInfo &TRI, MachineBasicBlock &MBB, MachineBasicBlock::iterator InsertBefore, const MCInstrDesc &MCID, Register SrcReg, bool IsKill, unsigned SubIdx0, unsigned SubIdx1, int FI, MachineMemOperand *MMO)
assert(UImm &&(UImm !=~static_cast< T >(0)) &&"Invalid immediate!")
MachineBasicBlock & MBB
MachineBasicBlock MachineBasicBlock::iterator DebugLoc DL
MachineBasicBlock MachineBasicBlock::iterator MBBI
static const Function * getParent(const Value *V)
static GCRegistry::Add< ErlangGC > A("erlang", "erlang-compatible garbage collector")
static GCRegistry::Add< OcamlGC > B("ocaml", "ocaml 3.10-compatible GC")
DXIL Forward Handle Accesses
@ Default
const HexagonInstrInfo * TII
IRTranslator LLVM IR MI
Module.h This file contains the declarations for the Module class.
const AbstractManglingParser< Derived, Alloc >::OperatorInfo AbstractManglingParser< Derived, Alloc >::Ops[]
static LVOptions Options
Definition LVOptions.cpp:25
This file implements the LivePhysRegs utility for tracking liveness of physical registers.
#define F(x, y, z)
Definition MD5.cpp:54
#define I(x, y, z)
Definition MD5.cpp:57
Machine Check Debug Module
Register Reg
Register const TargetRegisterInfo * TRI
Promote Memory to Register
Definition Mem2Reg.cpp:110
static MCRegister getReg(const MCDisassembler *D, unsigned RC, unsigned RegNo)
MachineInstr unsigned OpIdx
ConstantRange Range(APInt(BitWidth, Low), APInt(BitWidth, High))
PowerPC Reduce CR logical Operation
const SmallVectorImpl< MachineOperand > MachineBasicBlock * TBB
const SmallVectorImpl< MachineOperand > & Cond
This file declares the machine register scavenger class.
This file contains some templates that are useful if you are working with the STL at all.
static bool contains(SmallPtrSetImpl< ConstantExpr * > &Cache, ConstantExpr *Expr, Constant *C)
Definition Value.cpp:487
This file defines the SmallSet class.
This file defines the SmallVector class.
This file defines the 'Statistic' class, which is designed to be an easy way to expose various metric...
#define STATISTIC(VARNAME, DESC)
Definition Statistic.h:171
#define DEBUG_WITH_TYPE(TYPE,...)
DEBUG_WITH_TYPE macro - This macro should be used by passes to emit debug information.
Definition Debug.h:72
static bool canCombine(MachineBasicBlock &MBB, MachineOperand &MO, unsigned CombineOpc=0)
AArch64FunctionInfo - This class is derived from MachineFunctionInfo and contains private AArch64-spe...
SignReturnAddress getSignReturnAddressCondition() const
void setOutliningStyle(const std::string &Style)
std::optional< bool > hasRedZone() const
static bool shouldSignReturnAddress(SignReturnAddress Condition, bool IsLRSpilled)
static bool isHForm(const MachineInstr &MI)
Returns whether the instruction is in H form (16 bit operands)
void insertSelect(MachineBasicBlock &MBB, MachineBasicBlock::iterator MI, const DebugLoc &DL, Register DstReg, ArrayRef< MachineOperand > Cond, Register TrueReg, Register FalseReg) const override
static bool hasBTISemantics(const MachineInstr &MI)
Returns whether the instruction can be compatible with non-zero BTYPE.
static bool isQForm(const MachineInstr &MI)
Returns whether the instruction is in Q form (128 bit operands)
static bool getMemOpInfo(unsigned Opcode, TypeSize &Scale, TypeSize &Width, int64_t &MinOffset, int64_t &MaxOffset)
Returns true if opcode Opc is a memory operation.
static bool isTailCallReturnInst(const MachineInstr &MI)
Returns true if MI is one of the TCRETURN* instructions.
static int findCondCodeUseOperandIdxForBranchOrSelect(const MachineInstr &Instr)
static bool isFPRCopy(const MachineInstr &MI)
Does this instruction rename an FPR without modifying bits?
MachineInstr * emitLdStWithAddr(MachineInstr &MemI, const ExtAddrMode &AM) const override
std::optional< DestSourcePair > isCopyInstrImpl(const MachineInstr &MI) const override
If the specific machine instruction is an instruction that moves/copies value from one register to an...
MachineBasicBlock * getBranchDestBlock(const MachineInstr &MI) const override
unsigned getInstSizeInBytes(const MachineInstr &MI) const override
GetInstSize - Return the number of bytes of code the specified instruction may be.
static bool isZExtLoad(const MachineInstr &MI)
Returns whether the instruction is a zero-extending load.
bool areMemAccessesTriviallyDisjoint(const MachineInstr &MIa, const MachineInstr &MIb) const override
void copyPhysReg(MachineBasicBlock &MBB, MachineBasicBlock::iterator I, const DebugLoc &DL, Register DestReg, Register SrcReg, bool KillSrc, bool RenamableDest=false, bool RenamableSrc=false) const override
static bool isGPRCopy(const MachineInstr &MI)
Does this instruction rename a GPR without modifying bits?
static unsigned convertToFlagSettingOpc(unsigned Opc)
Return the opcode that set flags when possible.
bool isBranchOffsetInRange(unsigned BranchOpc, int64_t BrOffset) const override
bool canInsertSelect(const MachineBasicBlock &, ArrayRef< MachineOperand > Cond, Register, Register, Register, int &, int &, int &) const override
Register isLoadFromStackSlotPostFE(const MachineInstr &MI, int &FrameIndex) const override
Check for post-frame ptr elimination stack locations as well.
static const MachineOperand & getLdStOffsetOp(const MachineInstr &MI)
Returns the immediate offset operator of a load/store.
bool isCoalescableExtInstr(const MachineInstr &MI, Register &SrcReg, Register &DstReg, unsigned &SubIdx) const override
static std::optional< unsigned > getUnscaledLdSt(unsigned Opc)
Returns the unscaled load/store for the scaled load/store opcode, if there is a corresponding unscale...
static bool hasUnscaledLdStOffset(unsigned Opc)
Return true if it has an unscaled load/store offset.
static const MachineOperand & getLdStAmountOp(const MachineInstr &MI)
Returns the shift amount operator of a load/store.
static bool isPreLdSt(const MachineInstr &MI)
Returns whether the instruction is a pre-indexed load/store.
std::optional< ExtAddrMode > getAddrModeFromMemoryOp(const MachineInstr &MemI, const TargetRegisterInfo *TRI) const override
bool getMemOperandsWithOffsetWidth(const MachineInstr &MI, SmallVectorImpl< const MachineOperand * > &BaseOps, int64_t &Offset, bool &OffsetIsScalable, LocationSize &Width, const TargetRegisterInfo *TRI) const override
bool analyzeBranchPredicate(MachineBasicBlock &MBB, MachineBranchPredicate &MBP, bool AllowModify) const override
void insertIndirectBranch(MachineBasicBlock &MBB, MachineBasicBlock &NewDestBB, MachineBasicBlock &RestoreBB, const DebugLoc &DL, int64_t BrOffset, RegScavenger *RS) const override
void loadRegFromStackSlot(MachineBasicBlock &MBB, MachineBasicBlock::iterator MBBI, Register DestReg, int FrameIndex, const TargetRegisterClass *RC, Register VReg, unsigned SubReg=0, MachineInstr::MIFlag Flags=MachineInstr::NoFlags) const override
static bool isPairableLdStInst(const MachineInstr &MI)
Return true if pairing the given load or store may be paired with another.
void storeRegToStackSlot(MachineBasicBlock &MBB, MachineBasicBlock::iterator MBBI, Register SrcReg, bool isKill, int FrameIndex, const TargetRegisterClass *RC, Register VReg, MachineInstr::MIFlag Flags=MachineInstr::NoFlags) const override
static bool isSExtLoad(const MachineInstr &MI)
Returns whether the instruction is a sign-extending load.
const AArch64RegisterInfo & getRegisterInfo() const
getRegisterInfo - TargetInstrInfo is a superset of MRegister info.
static bool isPreSt(const MachineInstr &MI)
Returns whether the instruction is a pre-indexed store.
MachineInstr * foldMemoryOperandImpl(MachineFunction &MF, MachineInstr &MI, ArrayRef< unsigned > Ops, MachineBasicBlock::iterator InsertPt, int FrameIndex, LiveIntervals *LIS=nullptr, VirtRegMap *VRM=nullptr) const override
void insertNoop(MachineBasicBlock &MBB, MachineBasicBlock::iterator MI) const override
AArch64InstrInfo(const AArch64Subtarget &STI)
static bool isPairedLdSt(const MachineInstr &MI)
Returns whether the instruction is a paired load/store.
bool getMemOperandWithOffsetWidth(const MachineInstr &MI, const MachineOperand *&BaseOp, int64_t &Offset, bool &OffsetIsScalable, TypeSize &Width, const TargetRegisterInfo *TRI) const
If OffsetIsScalable is set to 'true', the offset is scaled by vscale.
Register isLoadFromStackSlot(const MachineInstr &MI, int &FrameIndex) const override
bool reverseBranchCondition(SmallVectorImpl< MachineOperand > &Cond) const override
static bool isStridedAccess(const MachineInstr &MI)
Return true if the given load or store is a strided memory access.
bool shouldClusterMemOps(ArrayRef< const MachineOperand * > BaseOps1, int64_t Offset1, bool OffsetIsScalable1, ArrayRef< const MachineOperand * > BaseOps2, int64_t Offset2, bool OffsetIsScalable2, unsigned ClusterSize, unsigned NumBytes) const override
Detect opportunities for ldp/stp formation.
unsigned removeBranch(MachineBasicBlock &MBB, int *BytesRemoved=nullptr) const override
bool isThroughputPattern(unsigned Pattern) const override
Return true when a code sequence can improve throughput.
MachineOperand & getMemOpBaseRegImmOfsOffsetOperand(MachineInstr &LdSt) const
Return the immediate offset of the base register in a load/store LdSt.
bool analyzeBranch(MachineBasicBlock &MBB, MachineBasicBlock *&TBB, MachineBasicBlock *&FBB, SmallVectorImpl< MachineOperand > &Cond, bool AllowModify=false) const override
bool canFoldIntoAddrMode(const MachineInstr &MemI, Register Reg, const MachineInstr &AddrI, ExtAddrMode &AM) const override
static bool isLdStPairSuppressed(const MachineInstr &MI)
Return true if pairing the given load or store is hinted to be unprofitable.
Register isStoreToStackSlotPostFE(const MachineInstr &MI, int &FrameIndex) const override
Check for post-frame ptr elimination stack locations as well.
std::unique_ptr< TargetInstrInfo::PipelinerLoopInfo > analyzeLoopForPipelining(MachineBasicBlock *LoopBB) const override
bool isSchedulingBoundary(const MachineInstr &MI, const MachineBasicBlock *MBB, const MachineFunction &MF) const override
unsigned insertBranch(MachineBasicBlock &MBB, MachineBasicBlock *TBB, MachineBasicBlock *FBB, ArrayRef< MachineOperand > Cond, const DebugLoc &DL, int *BytesAdded=nullptr) const override
MachineBasicBlock::iterator probedStackAlloc(MachineBasicBlock::iterator MBBI, Register TargetReg, bool FrameSetup) const
Return true when there is potentially a faster code sequence for an instruction chain ending in Root.
bool optimizeCompareInstr(MachineInstr &CmpInstr, Register SrcReg, Register SrcReg2, int64_t CmpMask, int64_t CmpValue, const MachineRegisterInfo *MRI) const override
optimizeCompareInstr - Convert the instruction supplying the argument to the comparison into one that...
static unsigned getLoadStoreImmIdx(unsigned Opc)
Returns the index for the immediate for a given instruction.
static bool isGPRZero(const MachineInstr &MI)
Does this instruction set its full destination register to zero?
void copyGPRRegTuple(MachineBasicBlock &MBB, MachineBasicBlock::iterator I, const DebugLoc &DL, MCRegister DestReg, MCRegister SrcReg, bool KillSrc, unsigned Opcode, unsigned ZeroReg, llvm::ArrayRef< unsigned > Indices) const
bool analyzeCompare(const MachineInstr &MI, Register &SrcReg, Register &SrcReg2, int64_t &CmpMask, int64_t &CmpValue) const override
analyzeCompare - For a comparison instruction, return the source registers in SrcReg and SrcReg2,...
CombinerObjective getCombinerObjective(unsigned Pattern) const override
static bool isFpOrNEON(Register Reg)
Returns whether the physical register is FP or NEON.
bool isAsCheapAsAMove(const MachineInstr &MI) const override
std::optional< DestSourcePair > isCopyLikeInstrImpl(const MachineInstr &MI) const override
static void suppressLdStPair(MachineInstr &MI)
Hint that pairing the given load or store is unprofitable.
Register isStoreToStackSlot(const MachineInstr &MI, int &FrameIndex) const override
static bool isPreLd(const MachineInstr &MI)
Returns whether the instruction is a pre-indexed load.
void copyPhysRegTuple(MachineBasicBlock &MBB, MachineBasicBlock::iterator I, const DebugLoc &DL, MCRegister DestReg, MCRegister SrcReg, bool KillSrc, unsigned Opcode, llvm::ArrayRef< unsigned > Indices) const
bool optimizeCondBranch(MachineInstr &MI) const override
Replace csincr-branch sequence by simple conditional branch.
static int getMemScale(unsigned Opc)
Scaling factor for (scaled or unscaled) load or store.
bool isCandidateToMergeOrPair(const MachineInstr &MI) const
Return true if this is a load/store that can be potentially paired/merged.
MCInst getNop() const override
static const MachineOperand & getLdStBaseOp(const MachineInstr &MI)
Returns the base register operator of a load/store.
bool isReservedReg(const MachineFunction &MF, MCRegister Reg) const
const AArch64RegisterInfo * getRegisterInfo() const override
bool isNeonAvailable() const
Returns true if the target has NEON and the function at runtime is known to have NEON enabled (e....
bool isSVEorStreamingSVEAvailable() const
Returns true if the target has access to either the full range of SVE instructions,...
ArrayRef - Represent a constant reference to an array (0 or more elements consecutively in memory),...
Definition ArrayRef.h:40
const T & front() const
front - Get the first element.
Definition ArrayRef.h:145
size_t size() const
size - Get the array size.
Definition ArrayRef.h:142
This is an important base class in LLVM.
Definition Constant.h:43
A debug info location.
Definition DebugLoc.h:123
bool empty() const
Definition DenseMap.h:109
std::pair< iterator, bool > insert(const std::pair< KeyT, ValueT > &KV)
Definition DenseMap.h:241
bool hasOptSize() const
Optimize this function for size (-Os) or minimum size (-Oz).
Definition Function.h:714
bool hasMinSize() const
Optimize this function for minimum size (-Oz).
Definition Function.h:711
Module * getParent()
Get the module that this global value is contained inside of...
LLVM_ABI InstListType::iterator eraseFromParent()
This method unlinks 'this' from the containing basic block and deletes it.
unsigned getOpcode() const
Returns a member of one of the enums like Instruction::Add.
static LocationSize precise(uint64_t Value)
This class is intended to be used as a base class for asm properties and features specific to the tar...
Definition MCAsmInfo.h:64
bool usesWindowsCFI() const
Definition MCAsmInfo.h:655
static MCCFIInstruction cfiDefCfa(MCSymbol *L, unsigned Register, int64_t Offset, SMLoc Loc={})
.cfi_def_cfa defines a rule for computing CFA as: take address from Register and add Offset to it.
Definition MCDwarf.h:576
static MCCFIInstruction createOffset(MCSymbol *L, unsigned Register, int64_t Offset, SMLoc Loc={})
.cfi_offset Previous value of Register is saved at offset Offset from CFA.
Definition MCDwarf.h:618
static MCCFIInstruction cfiDefCfaOffset(MCSymbol *L, int64_t Offset, SMLoc Loc={})
.cfi_def_cfa_offset modifies a rule for computing CFA.
Definition MCDwarf.h:591
static MCCFIInstruction createEscape(MCSymbol *L, StringRef Vals, SMLoc Loc={}, StringRef Comment="")
.cfi_escape Allows the user to add arbitrary bytes to the unwind info.
Definition MCDwarf.h:688
Instances of this class represent a single low-level machine instruction.
Definition MCInst.h:188
Describe properties that are true of each instruction in the target description file.
unsigned getNumOperands() const
Return the number of declared MachineOperands for this MachineInstruction.
ArrayRef< MCOperandInfo > operands() const
Wrapper class representing physical registers. Should be passed by value.
Definition MCRegister.h:41
constexpr bool isValid() const
Definition MCRegister.h:84
static constexpr unsigned NoRegister
Definition MCRegister.h:60
static MDTuple * get(LLVMContext &Context, ArrayRef< Metadata * > MDs)
Definition Metadata.h:1572
Set of metadata that should be preserved when using BuildMI().
bool isInlineAsmBrIndirectTarget() const
Returns true if this is the indirect dest of an INLINEASM_BR.
LLVM_ABI void transferSuccessorsAndUpdatePHIs(MachineBasicBlock *FromMBB)
Transfers all the successors, as in transferSuccessors, and update PHI operands in the successor bloc...
LLVM_ABI instr_iterator insert(instr_iterator I, MachineInstr *M)
Insert MI into the instruction list before I, possibly inside a bundle.
reverse_instr_iterator instr_rbegin()
LLVM_ABI iterator getFirstTerminator()
Returns an iterator to the first terminator instruction of this basic block.
LLVM_ABI void addSuccessor(MachineBasicBlock *Succ, BranchProbability Prob=BranchProbability::getUnknown())
Add Succ as a successor of this MachineBasicBlock.
reverse_instr_iterator instr_rend()
Instructions::iterator instr_iterator
Instructions::const_iterator const_instr_iterator
void addLiveIn(MCRegister PhysReg, LaneBitmask LaneMask=LaneBitmask::getAll())
Adds the specified register as a live in.
const MachineFunction * getParent() const
Return the MachineFunction containing this basic block.
LLVM_ABI instr_iterator erase(instr_iterator I)
Remove an instruction from the instruction list and delete it.
void splice(iterator Where, MachineBasicBlock *Other, iterator From)
Take an instruction from MBB 'Other' at the position From, and insert it into this MBB right before '...
MachineInstrBundleIterator< MachineInstr > iterator
void setMachineBlockAddressTaken()
Set this block to indicate that its address is used as something other than the target of a terminato...
LLVM_ABI bool isLiveIn(MCRegister Reg, LaneBitmask LaneMask=LaneBitmask::getAll()) const
Return true if the specified register is in the live in set.
The MachineFrameInfo class represents an abstract stack frame until prolog/epilog code is inserted.
uint64_t getStackSize() const
Return the number of bytes that must be allocated to hold all of the fixed size frame objects.
void setStackID(int ObjectIdx, uint8_t ID)
bool isCalleeSavedInfoValid() const
Has the callee saved info been calculated yet?
Align getObjectAlign(int ObjectIdx) const
Return the alignment of the specified stack object.
int64_t getObjectSize(int ObjectIdx) const
Return the size of the specified object.
unsigned getNumObjects() const
Return the number of objects.
int64_t getObjectOffset(int ObjectIdx) const
Return the assigned stack offset of the specified object from the incoming stack pointer.
bool isFixedObjectIndex(int ObjectIdx) const
Returns true if the specified index corresponds to a fixed stack object.
unsigned addFrameInst(const MCCFIInstruction &Inst)
const TargetSubtargetInfo & getSubtarget() const
getSubtarget - Return the subtarget for which this machine code is being compiled.
StringRef getName() const
getName - Return the name of the corresponding LLVM function.
MachineMemOperand * getMachineMemOperand(MachinePointerInfo PtrInfo, MachineMemOperand::Flags f, LLT MemTy, Align base_alignment, const AAMDNodes &AAInfo=AAMDNodes(), const MDNode *Ranges=nullptr, SyncScope::ID SSID=SyncScope::System, AtomicOrdering Ordering=AtomicOrdering::NotAtomic, AtomicOrdering FailureOrdering=AtomicOrdering::NotAtomic)
getMachineMemOperand - Allocate a new MachineMemOperand.
MachineFrameInfo & getFrameInfo()
getFrameInfo - Return the frame info object for the current function.
MachineRegisterInfo & getRegInfo()
getRegInfo - Return information about the registers currently in use.
Function & getFunction()
Return the LLVM function that this machine code represents.
BasicBlockListType::iterator iterator
Ty * getInfo()
getInfo - Keep track of various per-function pieces of information for backends that would like to do...
const MachineJumpTableInfo * getJumpTableInfo() const
getJumpTableInfo - Return the jump table info object for the current function.
MachineBasicBlock * CreateMachineBasicBlock(const BasicBlock *BB=nullptr, std::optional< UniqueBBID > BBID=std::nullopt)
CreateMachineInstr - Allocate a new MachineInstr.
void insert(iterator MBBI, MachineBasicBlock *MBB)
const TargetMachine & getTarget() const
getTarget - Return the target machine this machine code is compiled with
const MachineInstrBuilder & setMemRefs(ArrayRef< MachineMemOperand * > MMOs) const
const MachineInstrBuilder & addCFIIndex(unsigned CFIIndex) const
const MachineInstrBuilder & addUse(Register RegNo, RegState Flags={}, unsigned SubReg=0) const
Add a virtual register use operand.
const MachineInstrBuilder & addReg(Register RegNo, RegState Flags={}, unsigned SubReg=0) const
Add a new virtual register operand.
const MachineInstrBuilder & setMIFlag(MachineInstr::MIFlag Flag) const
const MachineInstrBuilder & addImm(int64_t Val) const
Add a new immediate operand.
const MachineInstrBuilder & add(const MachineOperand &MO) const
const MachineInstrBuilder & addSym(MCSymbol *Sym, unsigned char TargetFlags=0) const
const MachineInstrBuilder & addFrameIndex(int Idx) const
const MachineInstrBuilder & addGlobalAddress(const GlobalValue *GV, int64_t Offset=0, unsigned TargetFlags=0) const
const MachineInstrBuilder & addMBB(MachineBasicBlock *MBB, unsigned TargetFlags=0) const
const MachineInstrBuilder & addDef(Register RegNo, RegState Flags={}, unsigned SubReg=0) const
Add a virtual register definition operand.
const MachineInstrBuilder & setMIFlags(unsigned Flags) const
const MachineInstrBuilder & addMemOperand(MachineMemOperand *MMO) const
reverse_iterator getReverse() const
Get a reverse iterator to the same node.
Representation of each machine instruction.
unsigned getOpcode() const
Returns the opcode of this MachineInstr.
bool mayLoadOrStore(QueryType Type=AnyInBundle) const
Return true if this instruction could possibly read or modify memory.
bool isCopy() const
const MachineBasicBlock * getParent() const
bool isCall(QueryType Type=AnyInBundle) const
bool getFlag(MIFlag Flag) const
Return whether an MI flag is set.
LLVM_ABI uint32_t mergeFlagsWith(const MachineInstr &Other) const
Return the MIFlags which represent both MachineInstrs.
unsigned getNumOperands() const
Returns the total number of operands.
LLVM_ABI unsigned getNumExplicitOperands() const
Returns the number of non-implicit operands.
bool mayLoad(QueryType Type=AnyInBundle) const
Return true if this instruction could possibly read memory.
const MCInstrDesc & getDesc() const
Returns the target instruction descriptor of this MachineInstr.
LLVM_ABI bool hasUnmodeledSideEffects() const
Return true if this instruction has side effects that are not modeled by mayLoad / mayStore,...
bool registerDefIsDead(Register Reg, const TargetRegisterInfo *TRI) const
Returns true if the register is dead in this machine instruction.
bool definesRegister(Register Reg, const TargetRegisterInfo *TRI) const
Return true if the MachineInstr fully defines the specified register.
LLVM_ABI void setDesc(const MCInstrDesc &TID)
Replace the instruction descriptor (thus opcode) of the current instruction with a new one.
LLVM_ABI bool hasOrderedMemoryRef() const
Return true if this instruction may have an ordered or volatile memory reference, or if the informati...
LLVM_ABI const MachineFunction * getMF() const
Return the function that contains the basic block that this instruction belongs to.
ArrayRef< MachineMemOperand * > memoperands() const
Access to memory operands of the instruction.
LLVM_ABI bool isLoadFoldBarrier() const
Returns true if it is illegal to fold a load across this instruction.
const DebugLoc & getDebugLoc() const
Returns the debug location id of this MachineInstr.
LLVM_ABI void removeOperand(unsigned OpNo)
Erase an operand from an instruction, leaving it with one fewer operand than it started with.
LLVM_ABI void addRegisterDefined(Register Reg, const TargetRegisterInfo *RegInfo=nullptr)
We have determined MI defines a register.
const MachineOperand & getOperand(unsigned i) const
uint32_t getFlags() const
Return the MI flags bitvector.
LLVM_ABI int findRegisterDefOperandIdx(Register Reg, const TargetRegisterInfo *TRI, bool isDead=false, bool Overlap=false) const
Returns the operand index that is a def of the specified register or -1 if it is not found.
LLVM_ABI MachineInstrBundleIterator< MachineInstr > eraseFromParent()
Unlink 'this' from the containing basic block and delete it.
const std::vector< MachineJumpTableEntry > & getJumpTables() const
A description of a memory reference used in the backend.
@ MOVolatile
The memory access is volatile.
@ MOLoad
The memory access reads data.
@ MOStore
The memory access writes data.
This class contains meta information specific to a module.
LLVM_ABI MachineFunction * getMachineFunction(const Function &F) const
Returns the MachineFunction associated to IR function F if there is one, otherwise nullptr.
MachineOperand class - Representation of each machine instruction operand.
void setSubReg(unsigned subReg)
unsigned getSubReg() const
void setImm(int64_t immVal)
int64_t getImm() const
bool isReg() const
isReg - Tests if this is a MO_Register operand.
MachineBasicBlock * getMBB() const
void setIsDead(bool Val=true)
LLVM_ABI void setReg(Register Reg)
Change the register this operand corresponds to.
bool isImm() const
isImm - Tests if this is a MO_Immediate operand.
void setIsKill(bool Val=true)
MachineInstr * getParent()
getParent - Return the instruction that this operand belongs to.
unsigned getTargetFlags() const
static MachineOperand CreateImm(int64_t Val)
MachineOperandType getType() const
getType - Returns the MachineOperandType for this operand.
void setIsUndef(bool Val=true)
Register getReg() const
getReg - Returns the register number.
bool isFI() const
isFI - Tests if this is a MO_FrameIndex operand.
LLVM_ABI bool isIdenticalTo(const MachineOperand &Other) const
Returns true if this operand is identical to the specified operand except for liveness related flags ...
static MachineOperand CreateReg(Register Reg, bool isDef, bool isImp=false, bool isKill=false, bool isDead=false, bool isUndef=false, bool isEarlyClobber=false, unsigned SubReg=0, bool isDebug=false, bool isInternalRead=false, bool isRenamable=false)
MachineRegisterInfo - Keep track of information for virtual and physical registers,...
LLVM_ABI bool hasOneNonDBGUse(Register RegNo) const
hasOneNonDBGUse - Return true if there is exactly one non-Debug use of the specified register.
bool tracksLiveness() const
tracksLiveness - Returns true when tracking register liveness accurately.
const TargetRegisterClass * getRegClass(Register Reg) const
Return the register class of the specified virtual register.
LLVM_ABI void clearKillFlags(Register Reg) const
clearKillFlags - Iterate over all the uses of the given register and clear the kill flag from the Mac...
LLVM_ABI MachineInstr * getVRegDef(Register Reg) const
getVRegDef - Return the machine instr that defines the specified virtual register or null if none is ...
bool use_nodbg_empty(Register RegNo) const
use_nodbg_empty - Return true if there are no non-Debug instructions using the specified register.
LLVM_ABI Register createVirtualRegister(const TargetRegisterClass *RegClass, StringRef Name="")
createVirtualRegister - Create and return a new virtual register in the function with the specified r...
bool reservedRegsFrozen() const
reservedRegsFrozen - Returns true after freezeReservedRegs() was called to ensure the set of reserved...
bool def_empty(Register RegNo) const
def_empty - Return true if there are no instructions defining the specified register (it may be live-...
use_instr_nodbg_iterator use_instr_nodbg_begin(Register RegNo) const
bool hasOneDef(Register RegNo) const
Return true if there is exactly one operand defining the specified register.
const TargetRegisterClass * getRegClassOrNull(Register Reg) const
Return the register class of Reg, or null if Reg has not been assigned a register class yet.
LLVM_ABI const TargetRegisterClass * constrainRegClass(Register Reg, const TargetRegisterClass *RC, unsigned MinNumRegs=0)
constrainRegClass - Constrain the register class of the specified virtual register to be a common sub...
LLVM_ABI MachineInstr * getUniqueVRegDef(Register Reg) const
getUniqueVRegDef - Return the unique machine instr that defines the specified virtual register or nul...
A Module instance is used to store all the information related to an LLVM module.
Definition Module.h:67
MI-level patchpoint operands.
Definition StackMaps.h:77
uint32_t getNumPatchBytes() const
Return the number of patchable bytes the given patchpoint should emit.
Definition StackMaps.h:105
Wrapper class representing virtual and physical registers.
Definition Register.h:20
MCRegister asMCReg() const
Utility to check-convert this value to a MCRegister.
Definition Register.h:107
constexpr bool isValid() const
Definition Register.h:112
constexpr bool isVirtual() const
Return true if the specified register number is in the virtual register namespace.
Definition Register.h:79
static constexpr bool isVirtualRegister(unsigned Reg)
Return true if the specified register number is in the virtual register namespace.
Definition Register.h:66
constexpr bool isPhysical() const
Return true if the specified register number is in the physical register namespace.
Definition Register.h:83
Represents a location in source code.
Definition SMLoc.h:22
bool erase(PtrType Ptr)
Remove pointer from the set.
std::pair< iterator, bool > insert(PtrType Ptr)
Inserts Ptr if and only if there is no element in the container equal to Ptr.
SmallPtrSet - This class implements a set which is optimized for holding SmallSize or less elements.
SmallSet - This maintains a set of unique values, optimizing for the case when the set is small (less...
Definition SmallSet.h:134
bool empty() const
Definition SmallSet.h:169
bool erase(const T &V)
Definition SmallSet.h:200
SmallString - A SmallString is just a SmallVector with methods and accessors that make it work better...
Definition SmallString.h:26
void append(StringRef RHS)
Append from a StringRef.
Definition SmallString.h:68
StringRef str() const
Explicit conversion to StringRef.
This class consists of common code factored out of the SmallVector class to reduce code duplication b...
void push_back(const T &Elt)
This is a 'vector' (really, a variable-sized array), optimized for the case when the array is small.
MI-level stackmap operands.
Definition StackMaps.h:36
uint32_t getNumPatchBytes() const
Return the number of patchable bytes the given stackmap should emit.
Definition StackMaps.h:51
StackOffset holds a fixed and a scalable offset in bytes.
Definition TypeSize.h:30
int64_t getFixed() const
Returns the fixed component of the stack.
Definition TypeSize.h:46
int64_t getScalable() const
Returns the scalable component of the stack.
Definition TypeSize.h:49
static StackOffset get(int64_t Fixed, int64_t Scalable)
Definition TypeSize.h:41
static StackOffset getScalable(int64_t Scalable)
Definition TypeSize.h:40
static StackOffset getFixed(int64_t Fixed)
Definition TypeSize.h:39
MI-level Statepoint operands.
Definition StackMaps.h:159
uint32_t getNumPatchBytes() const
Return the number of patchable bytes the given statepoint should emit.
Definition StackMaps.h:208
StringRef - Represent a constant reference to a string, i.e.
Definition StringRef.h:55
Object returned by analyzeLoopForPipelining.
TargetInstrInfo - Interface to description of machine instruction set.
virtual void genAlternativeCodeSequence(MachineInstr &Root, unsigned Pattern, SmallVectorImpl< MachineInstr * > &InsInstrs, SmallVectorImpl< MachineInstr * > &DelInstrs, DenseMap< Register, unsigned > &InstIdxForVirtReg) const
When getMachineCombinerPatterns() finds patterns, this function generates the instructions that could...
virtual std::optional< ParamLoadedValue > describeLoadedValue(const MachineInstr &MI, Register Reg) const
Produce the expression describing the MI loading a value into the physical register Reg.
virtual bool getMachineCombinerPatterns(MachineInstr &Root, SmallVectorImpl< unsigned > &Patterns, bool DoRegPressureReduce) const
Return true when there is potentially a faster code sequence for an instruction chain ending in Root.
virtual bool isSchedulingBoundary(const MachineInstr &MI, const MachineBasicBlock *MBB, const MachineFunction &MF) const
Test if the given instruction should be considered a scheduling boundary.
virtual CombinerObjective getCombinerObjective(unsigned Pattern) const
Return the objective of a combiner pattern.
virtual bool isFunctionSafeToSplit(const MachineFunction &MF) const
Return true if the function is a viable candidate for machine function splitting.
TargetOptions Options
CodeModel::Model getCodeModel() const
Returns the code model.
const MCAsmInfo * getMCAsmInfo() const
Return target specific asm information.
bool contains(Register Reg) const
Return true if the specified register is included in this register class.
bool hasSubClassEq(const TargetRegisterClass *RC) const
Returns true if RC is a sub-class of or equal to this class.
bool hasSuperClassEq(const TargetRegisterClass *RC) const
Returns true if RC is a super-class of or equal to this class.
TargetRegisterInfo base class - We assume that the target defines a static array of TargetRegisterDes...
TargetSubtargetInfo - Generic base class for all target subtargets.
virtual const TargetInstrInfo * getInstrInfo() const
virtual const TargetRegisterInfo * getRegisterInfo() const =0
Return the target's register information.
Target - Wrapper for Target specific information.
static constexpr TypeSize getFixed(ScalarTy ExactSize)
Definition TypeSize.h:343
static constexpr TypeSize getScalable(ScalarTy MinimumSize)
Definition TypeSize.h:346
Value * getOperand(unsigned i) const
Definition User.h:207
constexpr bool isScalable() const
Returns whether the quantity is scaled by a runtime quantity (vscale).
Definition TypeSize.h:168
constexpr ScalarTy getKnownMinValue() const
Returns the minimum value this quantity can represent.
Definition TypeSize.h:165
self_iterator getIterator()
Definition ilist_node.h:123
A raw_ostream that writes to an std::string.
CallInst * Call
#define llvm_unreachable(msg)
Marks that the current location is not supposed to be reachable.
static CondCode getInvertedCondCode(CondCode Code)
@ MO_DLLIMPORT
MO_DLLIMPORT - On a symbol operand, this represents that the reference to the symbol is for an import...
@ MO_NC
MO_NC - Indicates whether the linker is expected to check the symbol reference for overflow.
@ MO_G1
MO_G1 - A symbol operand with this flag (granule 1) represents the bits 16-31 of a 64-bit address,...
@ MO_S
MO_S - Indicates that the bits of the symbol operand represented by MO_G0 etc are signed.
@ MO_PAGEOFF
MO_PAGEOFF - A symbol operand with this flag represents the offset of that symbol within a 4K page.
@ MO_GOT
MO_GOT - This flag indicates that a symbol operand represents the address of the GOT entry for the sy...
@ MO_PREL
MO_PREL - Indicates that the bits of the symbol operand represented by MO_G0 etc are PC relative.
@ MO_G0
MO_G0 - A symbol operand with this flag (granule 0) represents the bits 0-15 of a 64-bit address,...
@ MO_ARM64EC_CALLMANGLE
MO_ARM64EC_CALLMANGLE - Operand refers to the Arm64EC-mangled version of a symbol,...
@ MO_PAGE
MO_PAGE - A symbol operand with this flag represents the pc-relative offset of the 4K page containing...
@ MO_HI12
MO_HI12 - This flag indicates that a symbol operand represents the bits 13-24 of a 64-bit address,...
@ MO_TLS
MO_TLS - Indicates that the operand being accessed is some kind of thread-local symbol.
@ MO_G2
MO_G2 - A symbol operand with this flag (granule 2) represents the bits 32-47 of a 64-bit address,...
@ MO_TAGGED
MO_TAGGED - With MO_PAGE, indicates that the page includes a memory tag in bits 56-63.
@ MO_G3
MO_G3 - A symbol operand with this flag (granule 3) represents the high 16-bits of a 64-bit address,...
@ MO_COFFSTUB
MO_COFFSTUB - On a symbol operand "FOO", this indicates that the reference is actually to the "....
unsigned getCheckerSizeInBytes(AuthCheckMethod Method)
Returns the number of bytes added by checkAuthenticatedRegister.
static uint64_t decodeLogicalImmediate(uint64_t val, unsigned regSize)
decodeLogicalImmediate - Decode a logical immediate value in the form "N:immr:imms" (where the immr a...
static unsigned getShiftValue(unsigned Imm)
getShiftValue - Extract the shift value.
static unsigned getArithExtendImm(AArch64_AM::ShiftExtendType ET, unsigned Imm)
getArithExtendImm - Encode the extend type and shift amount for an arithmetic instruction: imm: 3-bit...
static unsigned getArithShiftValue(unsigned Imm)
getArithShiftValue - get the arithmetic shift value.
static uint64_t encodeLogicalImmediate(uint64_t imm, unsigned regSize)
encodeLogicalImmediate - Return the encoded immediate value for a logical immediate instruction of th...
static AArch64_AM::ShiftExtendType getExtendType(unsigned Imm)
getExtendType - Extract the extend type for operands of arithmetic ops.
static AArch64_AM::ShiftExtendType getArithExtendType(unsigned Imm)
static AArch64_AM::ShiftExtendType getShiftType(unsigned Imm)
getShiftType - Extract the shift type.
static unsigned getShifterImm(AArch64_AM::ShiftExtendType ST, unsigned Imm)
getShifterImm - Encode the shift type and amount: imm: 6-bit shift amount shifter: 000 ==> lsl 001 ==...
void expandMOVImm(uint64_t Imm, unsigned BitSize, SmallVectorImpl< ImmInsnModel > &Insn)
Expand a MOVi32imm or MOVi64imm pseudo instruction to one or more real move-immediate instructions to...
static const uint64_t InstrFlagIsWhile
static const uint64_t InstrFlagIsPTestLike
constexpr std::underlying_type_t< E > Mask()
Get a bitmask with 1s in all places up to the high-order bit of E's largest value.
@ C
The default llvm calling convention, compatible with C.
Definition CallingConv.h:34
initializer< Ty > init(const Ty &Val)
constexpr double e
InstrType
Represents how an instruction should be mapped by the outliner.
LLVM_ABI Instruction & back() const
This is an optimization pass for GlobalISel generic memory operations.
auto drop_begin(T &&RangeOrContainer, size_t N=1)
Return a range covering RangeOrContainer with the first N elements excluded.
Definition STLExtras.h:316
@ Offset
Definition DWP.cpp:532
bool all_of(R &&range, UnaryPredicate P)
Provide wrappers to std::all_of which take ranges instead of having to pass begin/end explicitly.
Definition STLExtras.h:1739
static bool isCondBranchOpcode(int Opc)
MCCFIInstruction createDefCFA(const TargetRegisterInfo &TRI, unsigned FrameReg, unsigned Reg, const StackOffset &Offset, bool LastAdjustmentWasScalable=true)
static bool isPTrueOpcode(unsigned Opc)
MachineInstrBuilder BuildMI(MachineFunction &MF, const MIMetadata &MIMD, const MCInstrDesc &MCID)
Builder interface. Specify how to create the initial instruction itself.
bool succeeded(LogicalResult Result)
Utility function that returns true if the provided LogicalResult corresponds to a success value.
int isAArch64FrameOffsetLegal(const MachineInstr &MI, StackOffset &Offset, bool *OutUseUnscaledOp=nullptr, unsigned *OutUnscaledOp=nullptr, int64_t *EmittableOffset=nullptr)
Check if the Offset is a valid frame offset for MI.
constexpr bool isInt(int64_t x)
Checks if an integer fits into the given bit width.
Definition MathExtras.h:165
RegState
Flags to represent properties of register accesses.
@ Implicit
Not emitted register (e.g. carry, or temporary result).
@ Dead
Unused definition.
@ Kill
The last use of a register.
@ Undef
Value of the register doesn't matter.
@ Define
Register definition.
@ Renamable
Register that may be renamed.
auto enumerate(FirstRange &&First, RestRanges &&...Rest)
Given two or more input ranges, returns a new range whose values are tuples (A, B,...
Definition STLExtras.h:2554
constexpr RegState getKillRegState(bool B)
decltype(auto) dyn_cast(const From &Val)
dyn_cast<X> - Return the argument parameter cast to the specified type.
Definition Casting.h:643
static bool isIndirectBranchOpcode(int Opc)
iterator_range< T > make_range(T x, T y)
Convenience function for iterating over sub-ranges.
unsigned getBLRCallOpcode(const MachineFunction &MF)
Return opcode to be used for indirect calls.
@ AArch64FrameOffsetIsLegal
Offset is legal.
@ AArch64FrameOffsetCanUpdate
Offset can apply, at least partly.
@ AArch64FrameOffsetCannotUpdate
Offset cannot apply.
constexpr bool isPowerOf2_64(uint64_t Value)
Return true if the argument is a power of two > 0 (64 bit edition.)
Definition MathExtras.h:284
Op::Description Desc
unsigned Log2_64(uint64_t Value)
Return the floor log base 2 of the specified value, -1 if the value is zero.
Definition MathExtras.h:337
static bool isSEHInstruction(const MachineInstr &MI)
bool any_of(R &&range, UnaryPredicate P)
Provide wrappers to std::any_of which take ranges instead of having to pass begin/end explicitly.
Definition STLExtras.h:1746
auto reverse(ContainerTy &&C)
Definition STLExtras.h:408
MachineInstr * getImm(const MachineOperand &MO, const MachineRegisterInfo *MRI)
decltype(auto) get(const PointerIntPair< PointerTy, IntBits, IntType, PtrTraits, Info > &Pair)
void sort(IteratorTy Start, IteratorTy End)
Definition STLExtras.h:1636
AArch64MachineCombinerPattern
@ MULSUBv8i16_OP2
@ FMULv4i16_indexed_OP1
@ FMLSv1i32_indexed_OP2
@ MULSUBv2i32_indexed_OP1
@ FMLAv2i32_indexed_OP2
@ MULADDv4i16_indexed_OP2
@ FMLAv1i64_indexed_OP1
@ MULSUBv16i8_OP1
@ FMLAv8i16_indexed_OP2
@ FMULv2i32_indexed_OP1
@ MULSUBv8i16_indexed_OP2
@ FMLAv1i64_indexed_OP2
@ MULSUBv4i16_indexed_OP2
@ FMLAv1i32_indexed_OP1
@ FMLAv2i64_indexed_OP2
@ FMLSv8i16_indexed_OP1
@ MULSUBv2i32_OP1
@ FMULv4i16_indexed_OP2
@ MULSUBv4i32_indexed_OP2
@ FMULv2i64_indexed_OP2
@ FMLAv4i32_indexed_OP1
@ MULADDv4i16_OP2
@ FMULv8i16_indexed_OP2
@ MULSUBv4i16_OP1
@ MULADDv4i32_OP2
@ MULADDv2i32_OP2
@ MULADDv16i8_OP2
@ FMLSv4i16_indexed_OP1
@ MULADDv16i8_OP1
@ FMLAv2i64_indexed_OP1
@ FMLAv1i32_indexed_OP2
@ FMLSv2i64_indexed_OP2
@ MULADDv2i32_OP1
@ MULADDv4i32_OP1
@ MULADDv2i32_indexed_OP1
@ MULSUBv16i8_OP2
@ MULADDv4i32_indexed_OP1
@ MULADDv2i32_indexed_OP2
@ FMLAv4i16_indexed_OP2
@ MULSUBv8i16_OP1
@ FMULv2i32_indexed_OP2
@ FMLSv2i32_indexed_OP2
@ FMLSv4i32_indexed_OP1
@ FMULv2i64_indexed_OP1
@ MULSUBv4i16_OP2
@ FMLSv4i16_indexed_OP2
@ FMLAv2i32_indexed_OP1
@ FMLSv2i32_indexed_OP1
@ FMLAv8i16_indexed_OP1
@ MULSUBv4i16_indexed_OP1
@ FMLSv4i32_indexed_OP2
@ MULADDv4i32_indexed_OP2
@ MULSUBv4i32_OP2
@ MULSUBv8i16_indexed_OP1
@ MULADDv8i16_OP2
@ MULSUBv2i32_indexed_OP2
@ FMULv4i32_indexed_OP2
@ FMLSv2i64_indexed_OP1
@ MULADDv4i16_OP1
@ FMLAv4i32_indexed_OP2
@ MULADDv8i16_indexed_OP1
@ FMULv4i32_indexed_OP1
@ FMLAv4i16_indexed_OP1
@ FMULv8i16_indexed_OP1
@ MULADDv8i16_OP1
@ MULSUBv4i32_indexed_OP1
@ MULSUBv4i32_OP1
@ FMLSv8i16_indexed_OP2
@ MULADDv8i16_indexed_OP2
@ MULSUBv2i32_OP2
@ FMLSv1i64_indexed_OP2
@ MULADDv4i16_indexed_OP1
LLVM_ABI raw_ostream & dbgs()
dbgs() - This returns a reference to a raw_ostream for debugging messages.
Definition Debug.cpp:207
void emitFrameOffset(MachineBasicBlock &MBB, MachineBasicBlock::iterator MBBI, const DebugLoc &DL, unsigned DestReg, unsigned SrcReg, StackOffset Offset, const TargetInstrInfo *TII, MachineInstr::MIFlag=MachineInstr::NoFlags, bool SetNZCV=false, bool NeedsWinCFI=false, bool *HasWinCFI=nullptr, bool EmitCFAOffset=false, StackOffset InitialOffset={}, unsigned FrameReg=AArch64::SP)
emitFrameOffset - Emit instructions as needed to set DestReg to SrcReg plus Offset.
LLVM_ABI void report_fatal_error(Error Err, bool gen_crash_diag=true)
Definition Error.cpp:163
constexpr RegState getDefRegState(bool B)
CombinerObjective
The combiner's goal may differ based on which pattern it is attempting to optimize.
std::optional< UsedNZCV > examineCFlagsUse(MachineInstr &MI, MachineInstr &CmpInstr, const TargetRegisterInfo &TRI, SmallVectorImpl< MachineInstr * > *CCUseInstrs=nullptr)
CodeGenOptLevel
Code generation optimization level.
Definition CodeGen.h:82
class LLVM_GSL_OWNER SmallVector
Forward declaration of SmallVector so that calculateSmallVectorDefaultInlinedElements can reference s...
auto instructionsWithoutDebug(IterT It, IterT End, bool SkipPseudoOp=true)
Construct a range iterator which begins at It and moves forwards until End is reached,...
LLVM_ABI raw_fd_ostream & errs()
This returns a reference to a raw_ostream for standard error.
auto drop_end(T &&RangeOrContainer, size_t N=1)
Return a range covering RangeOrContainer with the last N elements excluded.
Definition STLExtras.h:323
static MCRegister getXRegFromWReg(MCRegister Reg)
MCCFIInstruction createCFAOffset(const TargetRegisterInfo &MRI, unsigned Reg, const StackOffset &OffsetFromDefCFA, std::optional< int64_t > IncomingVGOffsetFromDefCFA)
FunctionAddr VTableAddr Next
Definition InstrProf.h:141
DWARFExpression::Operation Op
ArrayRef(const T &OneElt) -> ArrayRef< T >
static bool isUncondBranchOpcode(int Opc)
decltype(auto) cast(const From &Val)
cast<X> - Return the argument parameter cast to the specified type.
Definition Casting.h:559
void erase_if(Container &C, UnaryPredicate P)
Provide a container algorithm similar to C++ Library Fundamentals v2's erase_if which is equivalent t...
Definition STLExtras.h:2192
constexpr bool isIntN(unsigned N, int64_t x)
Checks if a signed integer fits into the given (dynamic) bit width.
Definition MathExtras.h:248
bool rewriteAArch64FrameIndex(MachineInstr &MI, unsigned FrameRegIdx, unsigned FrameReg, StackOffset &Offset, const AArch64InstrInfo *TII)
rewriteAArch64FrameIndex - Rewrite MI to access 'Offset' bytes from the FP.
bool is_contained(R &&Range, const E &Element)
Returns true if Element is found in Range.
Definition STLExtras.h:1947
static const MachineMemOperand::Flags MOSuppressPair
constexpr int64_t SignExtend64(uint64_t x)
Sign-extend the number in the bottom B bits of X to a 64-bit integer.
Definition MathExtras.h:572
void appendLEB128(SmallVectorImpl< U > &Buffer, T Value)
Definition LEB128.h:236
bool optimizeTerminators(MachineBasicBlock *MBB, const TargetInstrInfo &TII)
std::pair< MachineOperand, DIExpression * > ParamLoadedValue
bool isNZCVTouchedInInstructionRange(const MachineInstr &DefMI, const MachineInstr &UseMI, const TargetRegisterInfo *TRI)
Return true if there is an instruction /after/ DefMI and before UseMI which either reads or clobbers ...
auto seq(T Begin, T End)
Iterate over an integral type from Begin up to - but not including - End.
Definition Sequence.h:305
static const MachineMemOperand::Flags MOStridedAccess
constexpr RegState getUndefRegState(bool B)
void fullyRecomputeLiveIns(ArrayRef< MachineBasicBlock * > MBBs)
Convenience function for recomputing live-in's for a set of MBBs until the computation converges.
LLVM_ABI Printable printReg(Register Reg, const TargetRegisterInfo *TRI=nullptr, unsigned SubIdx=0, const MachineRegisterInfo *MRI=nullptr)
Prints virtual and physical registers with or without a TRI instance.
This struct is a compact representation of a valid (non-zero power of two) alignment.
Definition Alignment.h:39
Used to describe addressing mode similar to ExtAddrMode in CodeGenPrepare.
LLVM_ABI static const MBBSectionID ColdSectionID
This class contains a discriminated union of information about pointers in memory operands,...
static LLVM_ABI MachinePointerInfo getUnknownStack(MachineFunction &MF)
Stack memory without other information.
static LLVM_ABI MachinePointerInfo getFixedStack(MachineFunction &MF, int FI, int64_t Offset=0)
Return a MachinePointerInfo record that refers to the specified FrameIndex.
An individual sequence of instructions to be replaced with a call to an outlined function.
MachineFunction * getMF() const
The information necessary to create an outlined function for some class of candidate.