//===- AArch64InstrInfo.cpp - AArch64 Instruction Information -------------===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
// This file contains the AArch64 implementation of the TargetInstrInfo class.
//
//===----------------------------------------------------------------------===//

#include "AArch64InstrInfo.h"
#include "AArch64ExpandImm.h"
#include "AArch64PointerAuth.h"
#include "AArch64Subtarget.h"
#include "llvm/ADT/ArrayRef.h"
#include "llvm/ADT/STLExtras.h"
#include "llvm/ADT/SmallSet.h"
#include "llvm/ADT/Statistic.h"
#include "llvm/IR/DebugLoc.h"
#include "llvm/IR/GlobalValue.h"
#include "llvm/IR/Module.h"
#include "llvm/MC/MCAsmInfo.h"
#include "llvm/MC/MCInst.h"
#include "llvm/MC/MCInstrDesc.h"
#include "llvm/Support/LEB128.h"
#include <cassert>
#include <cstdint>
#include <iterator>
#include <utility>

using namespace llvm;

#define GET_INSTRINFO_CTOR_DTOR
#include "AArch64GenInstrInfo.inc"

#define DEBUG_TYPE "AArch64InstrInfo"

STATISTIC(NumCopyInstrs, "Number of COPY instructions expanded");
STATISTIC(NumZCRegMoveInstrsGPR, "Number of zero-cycle GPR register move "
                                 "instructions expanded from canonical COPY");
STATISTIC(NumZCRegMoveInstrsFPR, "Number of zero-cycle FPR register move "
                                 "instructions expanded from canonical COPY");
STATISTIC(NumZCZeroingInstrsGPR, "Number of zero-cycle GPR zeroing "
                                 "instructions expanded from canonical COPY");
// NumZCZeroingInstrsFPR is counted in AArch64AsmPrinter.

static cl::opt<unsigned>
    CBDisplacementBits("aarch64-cb-offset-bits", cl::Hidden, cl::init(9),
                       cl::desc("Restrict range of CB instructions (DEBUG)"));

static cl::opt<unsigned> TBZDisplacementBits(
    "aarch64-tbz-offset-bits", cl::Hidden, cl::init(14),
    cl::desc("Restrict range of TB[N]Z instructions (DEBUG)"));

static cl::opt<unsigned> CBZDisplacementBits(
    "aarch64-cbz-offset-bits", cl::Hidden, cl::init(19),
    cl::desc("Restrict range of CB[N]Z instructions (DEBUG)"));

static cl::opt<unsigned>
    BCCDisplacementBits("aarch64-bcc-offset-bits", cl::Hidden, cl::init(19),
                        cl::desc("Restrict range of Bcc instructions (DEBUG)"));

static cl::opt<unsigned>
    BDisplacementBits("aarch64-b-offset-bits", cl::Hidden, cl::init(26),
                      cl::desc("Restrict range of B instructions (DEBUG)"));

static cl::opt<unsigned> GatherOptSearchLimit(
    "aarch64-search-limit", cl::Hidden, cl::init(2048),
    cl::desc("Restrict range of instructions to search for the "
             "machine-combiner gather pattern optimization"));

AArch64InstrInfo::AArch64InstrInfo(const AArch64Subtarget &STI)
    : AArch64GenInstrInfo(STI, RI, AArch64::ADJCALLSTACKDOWN,
                          AArch64::ADJCALLSTACKUP, AArch64::CATCHRET),
      RI(STI.getTargetTriple(), STI.getHwMode()), Subtarget(STI) {}

/// Return the maximum number of bytes of code the specified instruction may be
/// after LFI rewriting. If the instruction is not rewritten, std::nullopt is
/// returned (use default sizing).
///
/// NOTE: the size estimates here must be kept in sync with the rewrites in
/// AArch64MCLFIRewriter.cpp. Sizes may be overestimates of the rewritten
/// instruction sequences.
static std::optional<unsigned> getLFIInstSizeInBytes(const MachineInstr &MI) {
  switch (MI.getOpcode()) {
  case AArch64::SVC:
    // SVC expands to 4 instructions.
    return 16;
  case AArch64::BR:
  case AArch64::BLR:
    // Indirect branches/calls expand to 2 instructions (guard + br/blr).
    return 8;
  case AArch64::RET:
    // RET through LR is not rewritten, but RET through another register
    // expands to 2 instructions (guard + ret).
    if (MI.getOperand(0).getReg() != AArch64::LR)
      return 8;
    return 4;
  default:
    break;
  }

  // Instructions that explicitly modify LR expand to 2 instructions.
  for (const MachineOperand &MO : MI.explicit_operands())
    if (MO.isReg() && MO.isDef() && MO.getReg() == AArch64::LR)
      return 8;

  // Default case: instructions that don't cause expansion.
  // - TP accesses in LFI are a single load/store, so no expansion.
  // - All remaining instructions are not rewritten.
  return std::nullopt;
}
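
// Worked example of the sizing above (illustrative only; the authoritative
// rewritten sequences live in AArch64MCLFIRewriter.cpp):
//   blr x8      -> guard + blr              = 8 bytes
//   ret x17     -> guard + ret (non-LR)     = 8 bytes
//   ret         -> not rewritten            = 4 bytes
//   mov lr, x0  -> explicit LR def, guarded = 8 bytes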

/// GetInstSize - Return the number of bytes of code the specified
/// instruction may be. This returns the maximum number of bytes.
unsigned AArch64InstrInfo::getInstSizeInBytes(const MachineInstr &MI) const {
  const MachineBasicBlock &MBB = *MI.getParent();
  const MachineFunction *MF = MBB.getParent();
  const Function &F = MF->getFunction();
  const MCAsmInfo &MAI = *MF->getTarget().getMCAsmInfo();

  {
    auto Op = MI.getOpcode();
    if (Op == AArch64::INLINEASM || Op == AArch64::INLINEASM_BR)
      return getInlineAsmLength(MI.getOperand(0).getSymbolName(), MAI);
  }

  // Meta-instructions emit no code.
  if (MI.isMetaInstruction())
    return 0;

  // FIXME: We currently only handle pseudoinstructions that don't get expanded
  // before the assembly printer.
  unsigned NumBytes = 0;
  const MCInstrDesc &Desc = MI.getDesc();

  // LFI rewriter expansions that supersede normal sizing.
  const auto &STI = MF->getSubtarget<AArch64Subtarget>();
  if (STI.isLFI())
    if (auto Size = getLFIInstSizeInBytes(MI))
      return *Size;

  if (!MI.isBundle() && isTailCallReturnInst(MI)) {
    NumBytes = Desc.getSize() ? Desc.getSize() : 4;

    const auto *MFI = MF->getInfo<AArch64FunctionInfo>();
    if (!MFI->shouldSignReturnAddress(*MF))
      return NumBytes;

    auto Method = STI.getAuthenticatedLRCheckMethod(*MF);
    NumBytes += AArch64PAuth::getCheckerSizeInBytes(Method);
    return NumBytes;
  }

  // The size should preferably be set in
  // llvm/lib/Target/AArch64/AArch64InstrInfo.td (default case).
  // The specific cases below handle instructions of variable size.
  switch (Desc.getOpcode()) {
  default:
    if (Desc.getSize())
      return Desc.getSize();

    // Anything not explicitly designated otherwise (i.e. pseudo-instructions
    // with fixed constant size but not specified in .td file) is a normal
    // 4-byte insn.
    NumBytes = 4;
    break;
  case TargetOpcode::STACKMAP:
    // The upper bound for a stackmap intrinsic is the full length of its shadow
    NumBytes = StackMapOpers(&MI).getNumPatchBytes();
    assert(NumBytes % 4 == 0 && "Invalid number of NOP bytes requested!");
    break;
  case TargetOpcode::PATCHPOINT:
    // The size of the patchpoint intrinsic is the number of bytes requested
    NumBytes = PatchPointOpers(&MI).getNumPatchBytes();
    assert(NumBytes % 4 == 0 && "Invalid number of NOP bytes requested!");
    break;
  case TargetOpcode::STATEPOINT:
    NumBytes = StatepointOpers(&MI).getNumPatchBytes();
    assert(NumBytes % 4 == 0 && "Invalid number of NOP bytes requested!");
    // No patch bytes means a normal call inst is emitted
    if (NumBytes == 0)
      NumBytes = 4;
    break;
  case TargetOpcode::PATCHABLE_FUNCTION_ENTER:
    // If `patchable-function-entry` is set, PATCHABLE_FUNCTION_ENTER
    // instructions are expanded to the specified number of NOPs. Otherwise,
    // they are expanded to 36-byte XRay sleds.
    NumBytes =
        F.getFnAttributeAsParsedInteger("patchable-function-entry", 9) * 4;
    break;
  case TargetOpcode::PATCHABLE_FUNCTION_EXIT:
  case TargetOpcode::PATCHABLE_TAIL_CALL:
  case TargetOpcode::PATCHABLE_TYPED_EVENT_CALL:
    // An XRay sled can be 4 bytes of alignment plus a 32-byte block.
    NumBytes = 36;
    break;
  case TargetOpcode::PATCHABLE_EVENT_CALL:
    // EVENT_CALL XRay sleds are exactly 6 instructions long (no alignment).
    NumBytes = 24;
    break;

  case AArch64::SPACE:
    NumBytes = MI.getOperand(1).getImm();
    break;
  case AArch64::MOVaddr:
  case AArch64::MOVaddrJT:
  case AArch64::MOVaddrCP:
  case AArch64::MOVaddrBA:
  case AArch64::MOVaddrTLS:
  case AArch64::MOVaddrEXT: {
    // Use the same logic as the pseudo expansion to count instructions.
    MI.getOperand(1).getTargetFlags(),
    Subtarget.isTargetMachO(), Insn);
    NumBytes = Insn.size() * 4;
    break;
  }

  case AArch64::MOVi32imm:
  case AArch64::MOVi64imm: {
    // Use the same logic as the pseudo expansion to count instructions.
    unsigned BitSize = Desc.getOpcode() == AArch64::MOVi32imm ? 32 : 64;
    SmallVector<AArch64_IMM::ImmInsnModel, 4> Insn;
    AArch64_IMM::expandMOVImm(MI.getOperand(1).getImm(), BitSize, Insn);
    NumBytes = Insn.size() * 4;
    break;
  }

  case TargetOpcode::BUNDLE:
    NumBytes = getInstBundleSize(MI);
    break;
  }

  return NumBytes;
}
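
// Example of the variable-size cases above (illustrative): MOVi64imm counts
// the instructions the immediate expansion would emit, so a constant such as
// 0x0000000012345678 (MOVZ + MOVK) is reported as 8 bytes, while an arbitrary
// 64-bit constant needing MOVZ plus three MOVKs is reported as 16 bytes.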

static void parseCondBranch(MachineInstr *LastInst, MachineBasicBlock *&Target,
                            SmallVectorImpl<MachineOperand> &Cond) {
  // Block ends with fall-through condbranch.
  switch (LastInst->getOpcode()) {
  default:
    llvm_unreachable("Unknown branch instruction?");
  case AArch64::Bcc:
    Target = LastInst->getOperand(1).getMBB();
    Cond.push_back(LastInst->getOperand(0));
    break;
  case AArch64::CBZW:
  case AArch64::CBZX:
  case AArch64::CBNZW:
  case AArch64::CBNZX:
    Target = LastInst->getOperand(1).getMBB();
    Cond.push_back(MachineOperand::CreateImm(-1));
    Cond.push_back(MachineOperand::CreateImm(LastInst->getOpcode()));
    Cond.push_back(LastInst->getOperand(0));
    break;
  case AArch64::TBZW:
  case AArch64::TBZX:
  case AArch64::TBNZW:
  case AArch64::TBNZX:
    Target = LastInst->getOperand(2).getMBB();
    Cond.push_back(MachineOperand::CreateImm(-1));
    Cond.push_back(MachineOperand::CreateImm(LastInst->getOpcode()));
    Cond.push_back(LastInst->getOperand(0));
    Cond.push_back(LastInst->getOperand(1));
    break;
  case AArch64::CBWPri:
  case AArch64::CBXPri:
  case AArch64::CBWPrr:
  case AArch64::CBXPrr:
    Target = LastInst->getOperand(3).getMBB();
    Cond.push_back(MachineOperand::CreateImm(-1));
    Cond.push_back(MachineOperand::CreateImm(LastInst->getOpcode()));
    Cond.push_back(LastInst->getOperand(0));
    Cond.push_back(LastInst->getOperand(1));
    Cond.push_back(LastInst->getOperand(2));
    break;
  case AArch64::CBBAssertExt:
  case AArch64::CBHAssertExt:
    Target = LastInst->getOperand(3).getMBB();
    Cond.push_back(MachineOperand::CreateImm(-1));                    // -1
    Cond.push_back(MachineOperand::CreateImm(LastInst->getOpcode())); // Opc
    Cond.push_back(LastInst->getOperand(0));                          // Cond
    Cond.push_back(LastInst->getOperand(1));                          // Op0
    Cond.push_back(LastInst->getOperand(2));                          // Op1
    Cond.push_back(LastInst->getOperand(4));                          // Ext0
    Cond.push_back(LastInst->getOperand(5));                          // Ext1
    break;
  }
}
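
// The Cond vectors built above encode, for example (illustrative):
//   b.eq bb         -> Cond = { EQ }
//   cbz w0, bb      -> Cond = { -1, CBZW, w0 }
//   tbnz w0, #3, bb -> Cond = { -1, TBNZW, w0, 3 }
// The leading -1 marks a folded compare-and-branch; a plain Bcc stores the
// condition code itself as the first element.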

static unsigned getBranchDisplacementBits(unsigned Opc) {
  switch (Opc) {
  default:
    llvm_unreachable("unexpected opcode!");
  case AArch64::B:
    return BDisplacementBits;
  case AArch64::TBNZW:
  case AArch64::TBZW:
  case AArch64::TBNZX:
  case AArch64::TBZX:
    return TBZDisplacementBits;
  case AArch64::CBNZW:
  case AArch64::CBZW:
  case AArch64::CBNZX:
  case AArch64::CBZX:
    return CBZDisplacementBits;
  case AArch64::Bcc:
    return BCCDisplacementBits;
  case AArch64::CBWPri:
  case AArch64::CBXPri:
  case AArch64::CBBAssertExt:
  case AArch64::CBHAssertExt:
  case AArch64::CBWPrr:
  case AArch64::CBXPrr:
    return CBDisplacementBits;
  }
}

bool AArch64InstrInfo::isBranchOffsetInRange(unsigned BranchOp,
                                             int64_t BrOffset) const {
  unsigned Bits = getBranchDisplacementBits(BranchOp);
  assert(Bits >= 3 && "max branch displacement must be enough to jump "
                      "over conditional branch expansion");
  return isIntN(Bits, BrOffset / 4);
}
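
// For example (illustrative): with the default 19 displacement bits, a Bcc
// spans a signed 19-bit count of 4-byte units, i.e. just under +/-1 MiB; an
// offset of 1 MiB - 4 bytes is still in range, while 1 MiB is not and must
// be relaxed via insertIndirectBranch below.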

MachineBasicBlock *
AArch64InstrInfo::getBranchDestBlock(const MachineInstr &MI) const {
  switch (MI.getOpcode()) {
  default:
    llvm_unreachable("unexpected opcode!");
  case AArch64::B:
    return MI.getOperand(0).getMBB();
  case AArch64::TBZW:
  case AArch64::TBNZW:
  case AArch64::TBZX:
  case AArch64::TBNZX:
    return MI.getOperand(2).getMBB();
  case AArch64::CBZW:
  case AArch64::CBNZW:
  case AArch64::CBZX:
  case AArch64::CBNZX:
  case AArch64::Bcc:
    return MI.getOperand(1).getMBB();
  case AArch64::CBWPri:
  case AArch64::CBXPri:
  case AArch64::CBBAssertExt:
  case AArch64::CBHAssertExt:
  case AArch64::CBWPrr:
  case AArch64::CBXPrr:
    return MI.getOperand(3).getMBB();
  }
}

void AArch64InstrInfo::insertIndirectBranch(MachineBasicBlock &MBB,
                                            MachineBasicBlock &NewDestBB,
                                            MachineBasicBlock &RestoreBB,
                                            const DebugLoc &DL,
                                            int64_t BrOffset,
                                            RegScavenger *RS) const {
  assert(RS && "RegScavenger required for long branching");
  assert(MBB.empty() &&
         "new block should be inserted for expanding unconditional branch");
  assert(MBB.pred_size() == 1);
  assert(RestoreBB.empty() &&
         "restore block should be inserted for restoring clobbered registers");

  auto buildIndirectBranch = [&](Register Reg, MachineBasicBlock &DestBB) {
    // Offsets outside of the signed 33-bit range are not supported for ADRP +
    // ADD.
    if (!isInt<33>(BrOffset))
      report_fatal_error(
          "Branch offsets outside of the signed 33-bit range not supported");

    BuildMI(MBB, MBB.end(), DL, get(AArch64::ADRP), Reg)
        .addSym(DestBB.getSymbol(), AArch64II::MO_PAGE);
    BuildMI(MBB, MBB.end(), DL, get(AArch64::ADDXri), Reg)
        .addReg(Reg)
        .addSym(DestBB.getSymbol(), AArch64II::MO_PAGEOFF | AArch64II::MO_NC)
        .addImm(0);
    BuildMI(MBB, MBB.end(), DL, get(AArch64::BR)).addReg(Reg);
  };

  RS->enterBasicBlockEnd(MBB);
  // If X16 is unused, we can rely on the linker to insert a range extension
  // thunk if NewDestBB is out of range of a single B instruction.
  constexpr Register Reg = AArch64::X16;
  if (!RS->isRegUsed(Reg)) {
    insertUnconditionalBranch(MBB, &NewDestBB, DL);
    RS->setRegUsed(Reg);
    return;
  }

  // In a cold block without BTI, insert the indirect branch if a register is
  // free. Skip this if BTI is enabled to avoid inserting a BTI at the target,
  // prioritizing a dynamic cost in cold code over a static cost in hot code.
  AArch64FunctionInfo *AFI = MBB.getParent()->getInfo<AArch64FunctionInfo>();
  bool HasBTI = AFI && AFI->branchTargetEnforcement();
  if (MBB.getSectionID() == MBBSectionID::ColdSectionID && !HasBTI) {
    Register Scavenged = RS->FindUnusedReg(&AArch64::GPR64RegClass);
    if (Scavenged != AArch64::NoRegister) {
      buildIndirectBranch(Scavenged, NewDestBB);
      RS->setRegUsed(Scavenged);
      return;
    }
  }

  // Note: Spilling X16 briefly moves the stack pointer, making it incompatible
  // with red zones.
  if (!AFI || AFI->hasRedZone().value_or(true))
    report_fatal_error(
        "Unable to insert indirect branch inside function that has red zone");

  // Otherwise, spill X16 and defer range extension to the linker.
  BuildMI(MBB, MBB.end(), DL, get(AArch64::STRXpre))
      .addReg(AArch64::SP, RegState::Define)
      .addReg(Reg)
      .addReg(AArch64::SP)
      .addImm(-16);

  BuildMI(MBB, MBB.end(), DL, get(AArch64::B)).addMBB(&RestoreBB);

  BuildMI(RestoreBB, RestoreBB.end(), DL, get(AArch64::LDRXpost))
      .addReg(AArch64::SP, RegState::Define)
      .addReg(Reg, RegState::Define)
      .addReg(AArch64::SP)
      .addImm(16);
}
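
// The spill path above emits the following sequence (illustrative):
//   MBB:       str x16, [sp, #-16]!  ; free x16 across the branch
//              b   RestoreBB          ; relaxed by the linker if out of range
//   RestoreBB: ldr x16, [sp], #16     ; restore x16 at the destination
// while the cold-section path materializes the target directly:
//              adrp xN, DestBB
//              add  xN, xN, :lo12:DestBB
//              br   xN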

// Branch analysis.
bool AArch64InstrInfo::analyzeBranch(MachineBasicBlock &MBB,
                                     MachineBasicBlock *&TBB,
                                     MachineBasicBlock *&FBB,
                                     SmallVectorImpl<MachineOperand> &Cond,
                                     bool AllowModify) const {
  // If the block has no terminators, it just falls into the block after it.
  MachineBasicBlock::iterator I = MBB.getLastNonDebugInstr();
  if (I == MBB.end())
    return false;

  // Skip over SpeculationBarrierEndBB terminators
  if (I->getOpcode() == AArch64::SpeculationBarrierISBDSBEndBB ||
      I->getOpcode() == AArch64::SpeculationBarrierSBEndBB) {
    --I;
  }

  if (!isUnpredicatedTerminator(*I))
    return false;

  // Get the last instruction in the block.
  MachineInstr *LastInst = &*I;

  // If there is only one terminator instruction, process it.
  unsigned LastOpc = LastInst->getOpcode();
  if (I == MBB.begin() || !isUnpredicatedTerminator(*--I)) {
    if (isUncondBranchOpcode(LastOpc)) {
      TBB = LastInst->getOperand(0).getMBB();
      return false;
    }
    if (isCondBranchOpcode(LastOpc)) {
      // Block ends with fall-through condbranch.
      parseCondBranch(LastInst, TBB, Cond);
      return false;
    }
    return true; // Can't handle indirect branch.
  }

  // Get the instruction before it if it is a terminator.
  MachineInstr *SecondLastInst = &*I;
  unsigned SecondLastOpc = SecondLastInst->getOpcode();

  // If AllowModify is true and the block ends with two or more unconditional
  // branches, delete all but the first unconditional branch.
  if (AllowModify && isUncondBranchOpcode(LastOpc)) {
    while (isUncondBranchOpcode(SecondLastOpc)) {
      LastInst->eraseFromParent();
      LastInst = SecondLastInst;
      LastOpc = LastInst->getOpcode();
      if (I == MBB.begin() || !isUnpredicatedTerminator(*--I)) {
        // Return now; the only remaining terminator is an unconditional
        // branch.
        TBB = LastInst->getOperand(0).getMBB();
        return false;
      }
      SecondLastInst = &*I;
      SecondLastOpc = SecondLastInst->getOpcode();
    }
  }

  // If we're allowed to modify and the block ends in an unconditional branch
  // that could simply fall through, remove the branch. (Note: This case only
  // matters when we can't understand the whole sequence, otherwise it's also
  // handled by BranchFolding.cpp.)
  if (AllowModify && isUncondBranchOpcode(LastOpc) &&
      MBB.isLayoutSuccessor(getBranchDestBlock(*LastInst))) {
    LastInst->eraseFromParent();
    LastInst = SecondLastInst;
    LastOpc = LastInst->getOpcode();
    if (I == MBB.begin() || !isUnpredicatedTerminator(*--I)) {
      assert(!isUncondBranchOpcode(LastOpc) &&
             "unreachable unconditional branches removed above");

      if (isCondBranchOpcode(LastOpc)) {
        // Block ends with fall-through condbranch.
        parseCondBranch(LastInst, TBB, Cond);
        return false;
      }
      return true; // Can't handle indirect branch.
    }
    SecondLastInst = &*I;
    SecondLastOpc = SecondLastInst->getOpcode();
  }

  // If there are three terminators, we don't know what sort of block this is.
  if (SecondLastInst && I != MBB.begin() && isUnpredicatedTerminator(*--I))
    return true;

  // If the block ends with a B and a Bcc, handle it.
  if (isCondBranchOpcode(SecondLastOpc) && isUncondBranchOpcode(LastOpc)) {
    parseCondBranch(SecondLastInst, TBB, Cond);
    FBB = LastInst->getOperand(0).getMBB();
    return false;
  }

  // If the block ends with two unconditional branches, handle it. The second
  // one is not executed, so remove it.
  if (isUncondBranchOpcode(SecondLastOpc) && isUncondBranchOpcode(LastOpc)) {
    TBB = SecondLastInst->getOperand(0).getMBB();
    I = LastInst;
    if (AllowModify)
      I->eraseFromParent();
    return false;
  }

  // ...likewise if it ends with an indirect branch followed by an unconditional
  // branch.
  if (isIndirectBranchOpcode(SecondLastOpc) && isUncondBranchOpcode(LastOpc)) {
    I = LastInst;
    if (AllowModify)
      I->eraseFromParent();
    return true;
  }

  // Otherwise, can't handle this.
  return true;
}
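
// For a block ending in (illustrative):
//   cbz w0, bbA
//   b   bbB
// analyzeBranch returns false (analyzable) with TBB = bbA, FBB = bbB and
// Cond = { -1, CBZW, w0 }; a lone fall-through conditional branch leaves FBB
// null instead.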

bool AArch64InstrInfo::analyzeBranchPredicate(MachineBasicBlock &MBB,
                                              MachineBranchPredicate &MBP,
                                              bool AllowModify) const {
  // Use analyzeBranch to validate the branch pattern.
  MachineBasicBlock *TBB = nullptr, *FBB = nullptr;
  SmallVector<MachineOperand, 4> Cond;
  if (analyzeBranch(MBB, TBB, FBB, Cond, AllowModify))
    return true;

  // analyzeBranch returns success with empty Cond for unconditional branches.
  if (Cond.empty())
    return true;

  MBP.TrueDest = TBB;
  assert(MBP.TrueDest && "expected!");
  MBP.FalseDest = FBB ? FBB : MBB.getNextNode();

  MBP.ConditionDef = nullptr;
  MBP.SingleUseCondition = false;

  // Find the conditional branch. After analyzeBranch succeeds with non-empty
  // Cond, there's exactly one conditional branch - either last (fallthrough)
  // or second-to-last (followed by unconditional B).
  MachineBasicBlock::iterator I = MBB.getLastNonDebugInstr();
  if (I == MBB.end())
    return true;

  if (isUncondBranchOpcode(I->getOpcode())) {
    if (I == MBB.begin())
      return true;
    --I;
  }

  MachineInstr *CondBranch = &*I;
  MachineRegisterInfo &MRI = MBB.getParent()->getRegInfo();

  switch (CondBranch->getOpcode()) {
  default:
    return true;

  case AArch64::Bcc:
    // Bcc takes the NZCV flag as the operand to branch on, walk up the
    // instruction stream to find the last instruction to define NZCV.
    for (MachineInstr &MI :
         reverse(make_range(MBB.instr_begin(), CondBranch->getIterator()))) {
      if (MI.modifiesRegister(AArch64::NZCV, /*TRI=*/nullptr)) {
        MBP.ConditionDef = &MI;
        break;
      }
    }
    return false;

  case AArch64::CBZW:
  case AArch64::CBZX:
  case AArch64::CBNZW:
  case AArch64::CBNZX: {
    MBP.LHS = CondBranch->getOperand(0);
    MBP.RHS = MachineOperand::CreateImm(0);
    unsigned Opc = CondBranch->getOpcode();
    MBP.Predicate = (Opc == AArch64::CBNZX || Opc == AArch64::CBNZW)
                        ? MachineBranchPredicate::PRED_NE
                        : MachineBranchPredicate::PRED_EQ;
    Register CondReg = MBP.LHS.getReg();
    if (CondReg.isVirtual())
      MBP.ConditionDef = MRI.getVRegDef(CondReg);
    return false;
  }

  case AArch64::TBZW:
  case AArch64::TBZX:
  case AArch64::TBNZW:
  case AArch64::TBNZX: {
    Register CondReg = CondBranch->getOperand(0).getReg();
    if (CondReg.isVirtual())
      MBP.ConditionDef = MRI.getVRegDef(CondReg);
    return false;
  }
  }
}

bool AArch64InstrInfo::reverseBranchCondition(
    SmallVectorImpl<MachineOperand> &Cond) const {
  if (Cond[0].getImm() != -1) {
    // Regular Bcc
    AArch64CC::CondCode CC = (AArch64CC::CondCode)(int)Cond[0].getImm();
    Cond[0].setImm(AArch64CC::getInvertedCondCode(CC));
  } else {
    // Folded compare-and-branch
    switch (Cond[1].getImm()) {
    default:
      llvm_unreachable("Unknown conditional branch!");
    case AArch64::CBZW:
      Cond[1].setImm(AArch64::CBNZW);
      break;
    case AArch64::CBNZW:
      Cond[1].setImm(AArch64::CBZW);
      break;
    case AArch64::CBZX:
      Cond[1].setImm(AArch64::CBNZX);
      break;
    case AArch64::CBNZX:
      Cond[1].setImm(AArch64::CBZX);
      break;
    case AArch64::TBZW:
      Cond[1].setImm(AArch64::TBNZW);
      break;
    case AArch64::TBNZW:
      Cond[1].setImm(AArch64::TBZW);
      break;
    case AArch64::TBZX:
      Cond[1].setImm(AArch64::TBNZX);
      break;
    case AArch64::TBNZX:
      Cond[1].setImm(AArch64::TBZX);
      break;

    // Cond is { -1, Opcode, CC, Op0, Op1, ... }
    case AArch64::CBWPri:
    case AArch64::CBXPri:
    case AArch64::CBBAssertExt:
    case AArch64::CBHAssertExt:
    case AArch64::CBWPrr:
    case AArch64::CBXPrr: {
      // Pseudos using standard 4bit Arm condition codes
      AArch64CC::CondCode CC =
          static_cast<AArch64CC::CondCode>(Cond[2].getImm());
      Cond[2].setImm(AArch64CC::getInvertedCondCode(CC));
    }
    }
  }

  return false;
}
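
// Reversal examples for the encodings above (illustrative): { EQ } becomes
// { NE }, { -1, CBZW, w0 } becomes { -1, CBNZW, w0 }, and for the CB pseudos
// only the embedded condition code in Cond[2] is inverted.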

unsigned AArch64InstrInfo::removeBranch(MachineBasicBlock &MBB,
                                        int *BytesRemoved) const {
  MachineBasicBlock::iterator I = MBB.getLastNonDebugInstr();
  if (I == MBB.end())
    return 0;

  if (!isUncondBranchOpcode(I->getOpcode()) &&
      !isCondBranchOpcode(I->getOpcode()))
    return 0;

  // Remove the branch.
  I->eraseFromParent();

  I = MBB.end();

  if (I == MBB.begin()) {
    if (BytesRemoved)
      *BytesRemoved = 4;
    return 1;
  }
  --I;
  if (!isCondBranchOpcode(I->getOpcode())) {
    if (BytesRemoved)
      *BytesRemoved = 4;
    return 1;
  }

  // Remove the branch.
  I->eraseFromParent();
  if (BytesRemoved)
    *BytesRemoved = 8;

  return 2;
}

void AArch64InstrInfo::instantiateCondBranch(
    MachineBasicBlock &MBB, const DebugLoc &DL, MachineBasicBlock *TBB,
    ArrayRef<MachineOperand> Cond) const {
  if (Cond[0].getImm() != -1) {
    // Regular Bcc
    BuildMI(&MBB, DL, get(AArch64::Bcc)).addImm(Cond[0].getImm()).addMBB(TBB);
  } else {
    // Folded compare-and-branch
    // Note that we use addOperand instead of addReg to keep the flags.

    // cbz, cbnz
    const MachineInstrBuilder MIB =
        BuildMI(&MBB, DL, get(Cond[1].getImm())).add(Cond[2]);

    // tbz/tbnz
    if (Cond.size() > 3)
      MIB.add(Cond[3]);

    // cb
    if (Cond.size() > 4)
      MIB.add(Cond[4]);

    MIB.addMBB(TBB);

    // cb[b,h]
    if (Cond.size() > 5) {
      MIB.addImm(Cond[5].getImm());
      MIB.addImm(Cond[6].getImm());
    }
  }
}

unsigned AArch64InstrInfo::insertBranch(
    MachineBasicBlock &MBB, MachineBasicBlock *TBB, MachineBasicBlock *FBB,
    ArrayRef<MachineOperand> Cond, const DebugLoc &DL, int *BytesAdded) const {
  // Shouldn't be a fall through.
  assert(TBB && "insertBranch must not be told to insert a fallthrough");

  if (!FBB) {
    if (Cond.empty()) // Unconditional branch?
      BuildMI(&MBB, DL, get(AArch64::B)).addMBB(TBB);
    else
      instantiateCondBranch(MBB, DL, TBB, Cond);

    if (BytesAdded)
      *BytesAdded = 4;

    return 1;
  }

  // Two-way conditional branch.
  instantiateCondBranch(MBB, DL, TBB, Cond);
  BuildMI(&MBB, DL, get(AArch64::B)).addMBB(FBB);

  if (BytesAdded)
    *BytesAdded = 8;

  return 2;
}
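
// Together with removeBranch above this implements the usual retargeting
// idiom (illustrative): a two-way "b.ne bbA; b bbB" costs 8 bytes and returns
// 2, while a lone conditional or unconditional branch returns 1 with
// *BytesAdded == 4.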

static bool optimizeTerminators(MachineBasicBlock *MBB,
                                const TargetInstrInfo &TII) {
  for (MachineInstr &MI : MBB->terminators()) {
    unsigned Opc = MI.getOpcode();
    switch (Opc) {
    case AArch64::CBZW:
    case AArch64::CBZX:
    case AArch64::TBZW:
    case AArch64::TBZX:
      // CBZ/TBZ with WZR/XZR -> unconditional B
      if (MI.getOperand(0).getReg() == AArch64::WZR ||
          MI.getOperand(0).getReg() == AArch64::XZR) {
        DEBUG_WITH_TYPE("optimizeTerminators",
                        dbgs() << "Removing always taken branch: " << MI);
        MachineBasicBlock *Target = TII.getBranchDestBlock(MI);
        SmallVector<MachineBasicBlock *> Succs(MBB->successors());
        for (auto *S : Succs)
          if (S != Target)
            MBB->removeSuccessor(S);
        DebugLoc DL = MI.getDebugLoc();
        while (&*MBB->rbegin() != &MI)
          MBB->rbegin()->eraseFromParent();
        MI.eraseFromParent();
        BuildMI(MBB, DL, TII.get(AArch64::B)).addMBB(Target);
        return true;
      }
      break;
    case AArch64::CBNZW:
    case AArch64::CBNZX:
    case AArch64::TBNZW:
    case AArch64::TBNZX:
      // CBNZ/TBNZ with WZR/XZR -> never taken, remove branch and successor
      if (MI.getOperand(0).getReg() == AArch64::WZR ||
          MI.getOperand(0).getReg() == AArch64::XZR) {
        DEBUG_WITH_TYPE("optimizeTerminators",
                        dbgs() << "Removing never taken branch: " << MI);
        MachineBasicBlock *Target = TII.getBranchDestBlock(MI);
        MI.getParent()->removeSuccessor(Target);
        MI.eraseFromParent();
        return true;
      }
      break;
    }
  }
  return false;
}

// Find the original register that VReg is copied from.
static unsigned removeCopies(const MachineRegisterInfo &MRI, unsigned VReg) {
  while (Register::isVirtualRegister(VReg)) {
    const MachineInstr *DefMI = MRI.getVRegDef(VReg);
    if (!DefMI->isFullCopy())
      return VReg;
    VReg = DefMI->getOperand(1).getReg();
  }
  return VReg;
}

// Determine if VReg is defined by an instruction that can be folded into a
// csel instruction. If so, return the folded opcode, and the replacement
// register.
static unsigned canFoldIntoCSel(const MachineRegisterInfo &MRI, unsigned VReg,
                                unsigned *NewReg = nullptr) {
  VReg = removeCopies(MRI, VReg);
  if (!Register::isVirtualRegister(VReg))
    return 0;

  bool Is64Bit = AArch64::GPR64allRegClass.hasSubClassEq(MRI.getRegClass(VReg));
  const MachineInstr *DefMI = MRI.getVRegDef(VReg);
  unsigned Opc = 0;
  unsigned SrcReg = 0;
  switch (DefMI->getOpcode()) {
  case AArch64::SUBREG_TO_REG:
    // Check for the following way to define a 64-bit immediate:
    //   %0:gpr32 = MOVi32imm 1
    //   %1:gpr64 = SUBREG_TO_REG %0:gpr32, %subreg.sub_32
    if (!DefMI->getOperand(1).isReg())
      return 0;
    if (!DefMI->getOperand(2).isImm() ||
        DefMI->getOperand(2).getImm() != AArch64::sub_32)
      return 0;
    DefMI = MRI.getVRegDef(DefMI->getOperand(1).getReg());
    if (DefMI->getOpcode() != AArch64::MOVi32imm)
      return 0;
    if (!DefMI->getOperand(1).isImm() || DefMI->getOperand(1).getImm() != 1)
      return 0;
    assert(Is64Bit);
    SrcReg = AArch64::XZR;
    Opc = AArch64::CSINCXr;
    break;

  case AArch64::MOVi32imm:
  case AArch64::MOVi64imm:
    if (!DefMI->getOperand(1).isImm() || DefMI->getOperand(1).getImm() != 1)
      return 0;
    SrcReg = Is64Bit ? AArch64::XZR : AArch64::WZR;
    Opc = Is64Bit ? AArch64::CSINCXr : AArch64::CSINCWr;
    break;

  case AArch64::ADDSXri:
  case AArch64::ADDSWri:
    // if NZCV is used, do not fold.
    if (DefMI->findRegisterDefOperandIdx(AArch64::NZCV, /*TRI=*/nullptr,
                                         true) == -1)
      return 0;
    // fall-through to ADDXri and ADDWri.
    [[fallthrough]];
  case AArch64::ADDXri:
  case AArch64::ADDWri:
    // add x, 1 -> csinc.
    if (!DefMI->getOperand(2).isImm() || DefMI->getOperand(2).getImm() != 1 ||
        DefMI->getOperand(3).getImm() != 0)
      return 0;
    SrcReg = DefMI->getOperand(1).getReg();
    Opc = Is64Bit ? AArch64::CSINCXr : AArch64::CSINCWr;
    break;

  case AArch64::ORNXrr:
  case AArch64::ORNWrr: {
    // not x -> csinv, represented as orn dst, xzr, src.
    unsigned ZReg = removeCopies(MRI, DefMI->getOperand(1).getReg());
    if (ZReg != AArch64::XZR && ZReg != AArch64::WZR)
      return 0;
    SrcReg = DefMI->getOperand(2).getReg();
    Opc = Is64Bit ? AArch64::CSINVXr : AArch64::CSINVWr;
    break;
  }

  case AArch64::SUBSXrr:
  case AArch64::SUBSWrr:
    // if NZCV is used, do not fold.
    if (DefMI->findRegisterDefOperandIdx(AArch64::NZCV, /*TRI=*/nullptr,
                                         true) == -1)
      return 0;
    // fall-through to SUBXrr and SUBWrr.
    [[fallthrough]];
  case AArch64::SUBXrr:
  case AArch64::SUBWrr: {
    // neg x -> csneg, represented as sub dst, xzr, src.
    unsigned ZReg = removeCopies(MRI, DefMI->getOperand(1).getReg());
    if (ZReg != AArch64::XZR && ZReg != AArch64::WZR)
      return 0;
    SrcReg = DefMI->getOperand(2).getReg();
    Opc = Is64Bit ? AArch64::CSNEGXr : AArch64::CSNEGWr;
    break;
  }
  default:
    return 0;
  }
  assert(Opc && SrcReg && "Missing parameters");

  if (NewReg)
    *NewReg = SrcReg;
  return Opc;
}
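
// Folding examples for the cases above (illustrative, SSA pseudocode):
//   %t = MOVi32imm 1      ; csel %d, %t, %f, cc -> csinc %d, %f, wzr, !cc
//   %t = ADDWri %y, 1, 0  ; csel %d, %t, %f, cc -> csinc %d, %f, %y, !cc
//   %t = ORNWrr wzr, %y   ; csel %d, %t, %f, cc -> csinv %d, %f, %y, !cc
//   %t = SUBWrr wzr, %y   ; csel %d, %t, %f, cc -> csneg %d, %f, %y, !cc
// The condition is inverted when the folded operand was the true value; see
// insertSelect below.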

bool AArch64InstrInfo::canInsertSelect(const MachineBasicBlock &MBB,
                                       ArrayRef<MachineOperand> Cond,
                                       Register DstReg, Register TrueReg,
                                       Register FalseReg, int &CondCycles,
                                       int &TrueCycles,
                                       int &FalseCycles) const {
  // Check register classes.
  const MachineRegisterInfo &MRI = MBB.getParent()->getRegInfo();
  const TargetRegisterClass *RC =
      RI.getCommonSubClass(MRI.getRegClass(TrueReg), MRI.getRegClass(FalseReg));
  if (!RC)
    return false;

  // Also need to check the dest regclass, in case we're trying to optimize
  // something like:
  //   %1(gpr) = PHI %2(fpr), bb1, %(fpr), bb2
  if (!RI.getCommonSubClass(RC, MRI.getRegClass(DstReg)))
    return false;

  // Expanding cbz/tbz requires an extra cycle of latency on the condition.
  unsigned ExtraCondLat = Cond.size() != 1;

  // GPRs are handled by csel.
  // FIXME: Fold in x+1, -x, and ~x when applicable.
  if (AArch64::GPR64allRegClass.hasSubClassEq(RC) ||
      AArch64::GPR32allRegClass.hasSubClassEq(RC)) {
    // Single-cycle csel, csinc, csinv, and csneg.
    CondCycles = 1 + ExtraCondLat;
    TrueCycles = FalseCycles = 1;
    if (canFoldIntoCSel(MRI, TrueReg))
      TrueCycles = 0;
    else if (canFoldIntoCSel(MRI, FalseReg))
      FalseCycles = 0;
    return true;
  }

  // Scalar floating point is handled by fcsel.
  // FIXME: Form fabs, fmin, and fmax when applicable.
  if (AArch64::FPR64RegClass.hasSubClassEq(RC) ||
      AArch64::FPR32RegClass.hasSubClassEq(RC)) {
    CondCycles = 5 + ExtraCondLat;
    TrueCycles = FalseCycles = 2;
    return true;
  }

  // Can't do vectors.
  return false;
}

void AArch64InstrInfo::insertSelect(MachineBasicBlock &MBB,
                                    MachineBasicBlock::iterator I,
                                    const DebugLoc &DL, Register DstReg,
                                    ArrayRef<MachineOperand> Cond,
                                    Register TrueReg, Register FalseReg) const {
  MachineRegisterInfo &MRI = MBB.getParent()->getRegInfo();

  // Parse the condition code, see parseCondBranch() above.
  AArch64CC::CondCode CC;
  switch (Cond.size()) {
  default:
    llvm_unreachable("Unknown condition opcode in Cond");
  case 1: // b.cc
    CC = AArch64CC::CondCode(Cond[0].getImm());
    break;
  case 3: { // cbz/cbnz
    // We must insert a compare against 0.
    bool Is64Bit;
    switch (Cond[1].getImm()) {
    default:
      llvm_unreachable("Unknown branch opcode in Cond");
    case AArch64::CBZW:
      Is64Bit = false;
      CC = AArch64CC::EQ;
      break;
    case AArch64::CBZX:
      Is64Bit = true;
      CC = AArch64CC::EQ;
      break;
    case AArch64::CBNZW:
      Is64Bit = false;
      CC = AArch64CC::NE;
      break;
    case AArch64::CBNZX:
      Is64Bit = true;
      CC = AArch64CC::NE;
      break;
    }
    Register SrcReg = Cond[2].getReg();
    if (Is64Bit) {
      // cmp reg, #0 is actually subs xzr, reg, #0.
      MRI.constrainRegClass(SrcReg, &AArch64::GPR64spRegClass);
      BuildMI(MBB, I, DL, get(AArch64::SUBSXri), AArch64::XZR)
          .addReg(SrcReg)
          .addImm(0)
          .addImm(0);
    } else {
      MRI.constrainRegClass(SrcReg, &AArch64::GPR32spRegClass);
      BuildMI(MBB, I, DL, get(AArch64::SUBSWri), AArch64::WZR)
          .addReg(SrcReg)
          .addImm(0)
          .addImm(0);
    }
    break;
  }
  case 4: { // tbz/tbnz
    // We must insert a tst instruction.
    switch (Cond[1].getImm()) {
    default:
      llvm_unreachable("Unknown branch opcode in Cond");
    case AArch64::TBZW:
    case AArch64::TBZX:
      CC = AArch64CC::EQ;
      break;
    case AArch64::TBNZW:
    case AArch64::TBNZX:
      CC = AArch64CC::NE;
      break;
    }
    // cmp reg, #foo is actually ands xzr, reg, #1<<foo.
    if (Cond[1].getImm() == AArch64::TBZW || Cond[1].getImm() == AArch64::TBNZW)
      BuildMI(MBB, I, DL, get(AArch64::ANDSWri), AArch64::WZR)
          .addReg(Cond[2].getReg())
          .addImm(
              AArch64_AM::encodeLogicalImmediate(1ull << Cond[3].getImm(), 32));
    else
      BuildMI(MBB, I, DL, get(AArch64::ANDSXri), AArch64::XZR)
          .addReg(Cond[2].getReg())
          .addImm(
              AArch64_AM::encodeLogicalImmediate(1ull << Cond[3].getImm(), 64));
    break;
  }
  case 5: { // cb
    // We must insert a cmp, that is a subs
    //            0   1       2   3    4
    // Cond is { -1, Opcode, CC, Op0, Op1 }

    unsigned SubsOpc, SubsDestReg;
    bool IsImm = false;
    CC = static_cast<AArch64CC::CondCode>(Cond[2].getImm());
    switch (Cond[1].getImm()) {
    default:
      llvm_unreachable("Unknown branch opcode in Cond");
    case AArch64::CBWPri:
      SubsOpc = AArch64::SUBSWri;
      SubsDestReg = AArch64::WZR;
      IsImm = true;
      break;
    case AArch64::CBXPri:
      SubsOpc = AArch64::SUBSXri;
      SubsDestReg = AArch64::XZR;
      IsImm = true;
      break;
    case AArch64::CBWPrr:
      SubsOpc = AArch64::SUBSWrr;
      SubsDestReg = AArch64::WZR;
      IsImm = false;
      break;
    case AArch64::CBXPrr:
      SubsOpc = AArch64::SUBSXrr;
      SubsDestReg = AArch64::XZR;
      IsImm = false;
      break;
    }

    if (IsImm)
      BuildMI(MBB, I, DL, get(SubsOpc), SubsDestReg)
          .addReg(Cond[3].getReg())
          .addImm(Cond[4].getImm())
          .addImm(0);
    else
      BuildMI(MBB, I, DL, get(SubsOpc), SubsDestReg)
          .addReg(Cond[3].getReg())
          .addReg(Cond[4].getReg());
  } break;
  case 7: { // cb[b,h]
    // We must insert a cmp, that is a subs, but also zero- or sign-extensions
    // that have been folded. For the first operand we codegen an explicit
    // extension, for the second operand we fold the extension into cmp.
    //            0   1       2   3    4    5     6
    // Cond is { -1, Opcode, CC, Op0, Op1, Ext0, Ext1 }

    // We need a new register for the now explicitly extended register
    Register Reg = Cond[4].getReg();
    if (Cond[5].getImm() != AArch64_AM::InvalidShiftExtend) {
      unsigned ExtOpc;
      unsigned ExtBits;
      AArch64_AM::ShiftExtendType ExtendType =
          static_cast<AArch64_AM::ShiftExtendType>(Cond[5].getImm());
      switch (ExtendType) {
      default:
        llvm_unreachable("Unknown shift-extend for CB instruction");
      case AArch64_AM::SXTB:
        assert(
            Cond[1].getImm() == AArch64::CBBAssertExt &&
            "Unexpected compare-and-branch instruction for SXTB shift-extend");
        ExtOpc = AArch64::SBFMWri;
        ExtBits = AArch64_AM::encodeLogicalImmediate(0xff, 32);
        break;
      case AArch64_AM::SXTH:
        assert(
            Cond[1].getImm() == AArch64::CBHAssertExt &&
            "Unexpected compare-and-branch instruction for SXTH shift-extend");
        ExtOpc = AArch64::SBFMWri;
        ExtBits = AArch64_AM::encodeLogicalImmediate(0xffff, 32);
        break;
      case AArch64_AM::UXTB:
        assert(
            Cond[1].getImm() == AArch64::CBBAssertExt &&
            "Unexpected compare-and-branch instruction for UXTB shift-extend");
        ExtOpc = AArch64::ANDWri;
        ExtBits = AArch64_AM::encodeLogicalImmediate(0xff, 32);
        break;
      case AArch64_AM::UXTH:
        assert(
            Cond[1].getImm() == AArch64::CBHAssertExt &&
            "Unexpected compare-and-branch instruction for UXTH shift-extend");
        ExtOpc = AArch64::ANDWri;
        ExtBits = AArch64_AM::encodeLogicalImmediate(0xffff, 32);
        break;
      }

      // Build the explicit extension of the first operand
      Reg = MRI.createVirtualRegister(&AArch64::GPR32spRegClass);
      MachineInstrBuilder MBBI =
          BuildMI(MBB, I, DL, get(ExtOpc), Reg).addReg(Cond[4].getReg());
      if (ExtOpc != AArch64::ANDWri)
        MBBI.addImm(0);
      MBBI.addImm(ExtBits);
    }

    // Now, subs with an extended second operand
    if (Cond[6].getImm() != AArch64_AM::InvalidShiftExtend) {
      AArch64_AM::ShiftExtendType ExtendType =
          static_cast<AArch64_AM::ShiftExtendType>(Cond[6].getImm());
      MRI.constrainRegClass(Reg, MRI.getRegClass(Cond[3].getReg()));
      MRI.constrainRegClass(Cond[3].getReg(), &AArch64::GPR32spRegClass);
      BuildMI(MBB, I, DL, get(AArch64::SUBSWrx), AArch64::WZR)
          .addReg(Cond[3].getReg())
          .addReg(Reg)
          .addImm(AArch64_AM::getArithExtendImm(ExtendType, 0));
    } // If no extension is needed, just a regular subs
    else {
      MRI.constrainRegClass(Reg, MRI.getRegClass(Cond[3].getReg()));
      MRI.constrainRegClass(Cond[3].getReg(), &AArch64::GPR32spRegClass);
      BuildMI(MBB, I, DL, get(AArch64::SUBSWrr), AArch64::WZR)
          .addReg(Cond[3].getReg())
          .addReg(Reg);
    }

    CC = static_cast<AArch64CC::CondCode>(Cond[2].getImm());
  } break;
  }

  unsigned Opc = 0;
  const TargetRegisterClass *RC = nullptr;
  bool TryFold = false;
  if (MRI.constrainRegClass(DstReg, &AArch64::GPR64RegClass)) {
    RC = &AArch64::GPR64RegClass;
    Opc = AArch64::CSELXr;
    TryFold = true;
  } else if (MRI.constrainRegClass(DstReg, &AArch64::GPR32RegClass)) {
    RC = &AArch64::GPR32RegClass;
    Opc = AArch64::CSELWr;
    TryFold = true;
  } else if (MRI.constrainRegClass(DstReg, &AArch64::FPR64RegClass)) {
    RC = &AArch64::FPR64RegClass;
    Opc = AArch64::FCSELDrrr;
  } else if (MRI.constrainRegClass(DstReg, &AArch64::FPR32RegClass)) {
    RC = &AArch64::FPR32RegClass;
    Opc = AArch64::FCSELSrrr;
  }
  assert(RC && "Unsupported regclass");

  // Try folding simple instructions into the csel.
  if (TryFold) {
    unsigned NewReg = 0;
    unsigned FoldedOpc = canFoldIntoCSel(MRI, TrueReg, &NewReg);
    if (FoldedOpc) {
      // The folded opcodes csinc, csinv and csneg apply the operation to
      // FalseReg, so we need to invert the condition.
      CC = AArch64CC::getInvertedCondCode(CC);
      TrueReg = FalseReg;
    } else
      FoldedOpc = canFoldIntoCSel(MRI, FalseReg, &NewReg);

    // Fold the operation. Leave any dead instructions for DCE to clean up.
    if (FoldedOpc) {
      FalseReg = NewReg;
      Opc = FoldedOpc;
      // Extend the live range of NewReg.
      MRI.clearKillFlags(NewReg);
    }
  }

  // Pull all virtual registers into the appropriate class.
  MRI.constrainRegClass(TrueReg, RC);
  // FalseReg might be WZR or XZR if the folded operand is a literal 1.
  assert(
      (FalseReg.isVirtual() || FalseReg == AArch64::WZR ||
       FalseReg == AArch64::XZR) &&
      "FalseReg was folded into a non-virtual register other than WZR or XZR");
  if (FalseReg.isVirtual())
    MRI.constrainRegClass(FalseReg, RC);

  // Insert the csel.
  BuildMI(MBB, I, DL, get(Opc), DstReg)
      .addReg(TrueReg)
      .addReg(FalseReg)
      .addImm(CC);
}
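
// End-to-end example (illustrative): selecting between %y + 1 and %f on EQ
// becomes "csinc %d, %f, %y, ne" - the add is folded away and the condition
// inverted because the folded operand was the true value.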

// Return true if Imm can be loaded into a register by a "cheap" sequence of
// instructions. For now, "cheap" means at most two instructions.
static bool isCheapImmediate(const MachineInstr &MI, unsigned BitSize) {
  if (BitSize == 32)
    return true;

  assert(BitSize == 64 && "Only bit sizes of 32 or 64 allowed");
  uint64_t Imm = static_cast<uint64_t>(MI.getOperand(1).getImm());
  SmallVector<AArch64_IMM::ImmInsnModel, 4> Is;
  AArch64_IMM::expandMOVImm(Imm, BitSize, Is);

  return Is.size() <= 2;
}
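
// Examples (illustrative): any 32-bit immediate is cheap; the 64-bit pattern
// 0x0000ffff0000ffff is a single ORRXri and 0x0000000012345678 is
// MOVZ + MOVK, so both pass, while a constant needing MOVZ plus three MOVKs
// expands to four instructions and is rejected.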

// Check if a COPY instruction is cheap.
static bool isCheapCopy(const MachineInstr &MI, const AArch64RegisterInfo &RI) {
  assert(MI.isCopy() && "Expected COPY instruction");
  const MachineRegisterInfo &MRI = MI.getMF()->getRegInfo();

  // Cross-bank copies (e.g., between GPR and FPR) are expensive on AArch64,
  // typically requiring an FMOV instruction with a 2-6 cycle latency.
  auto GetRegClass = [&](Register Reg) -> const TargetRegisterClass * {
    if (Reg.isVirtual())
      return MRI.getRegClass(Reg);
    if (Reg.isPhysical())
      return RI.getMinimalPhysRegClass(Reg);
    return nullptr;
  };
  const TargetRegisterClass *DstRC = GetRegClass(MI.getOperand(0).getReg());
  const TargetRegisterClass *SrcRC = GetRegClass(MI.getOperand(1).getReg());
  if (DstRC && SrcRC && !RI.getCommonSubClass(DstRC, SrcRC))
    return false;

  return MI.isAsCheapAsAMove();
}

// FIXME: this implementation should be micro-architecture dependent, so a
// micro-architecture target hook should be introduced here in future.
bool AArch64InstrInfo::isAsCheapAsAMove(const MachineInstr &MI) const {
  if (Subtarget.hasExynosCheapAsMoveHandling()) {
    if (isExynosCheapAsMove(MI))
      return true;
    return MI.isAsCheapAsAMove();
  }

  switch (MI.getOpcode()) {
  default:
    return MI.isAsCheapAsAMove();

  case TargetOpcode::COPY:
    return isCheapCopy(MI, RI);

  case AArch64::ADDWrs:
  case AArch64::ADDXrs:
  case AArch64::SUBWrs:
  case AArch64::SUBXrs:
    return Subtarget.hasALULSLFast() && MI.getOperand(3).getImm() <= 4;

  // If MOVi32imm or MOVi64imm can be expanded into ORRWri or
  // ORRXri, it is as cheap as MOV.
  // Likewise if it can be expanded to MOVZ/MOVN/MOVK.
  case AArch64::MOVi32imm:
    return isCheapImmediate(MI, 32);
  case AArch64::MOVi64imm:
    return isCheapImmediate(MI, 64);
  }
}

bool AArch64InstrInfo::isFalkorShiftExtFast(const MachineInstr &MI) {
  switch (MI.getOpcode()) {
  default:
    return false;

  case AArch64::ADDWrs:
  case AArch64::ADDXrs:
  case AArch64::ADDSWrs:
  case AArch64::ADDSXrs: {
    unsigned Imm = MI.getOperand(3).getImm();
    unsigned ShiftVal = AArch64_AM::getShiftValue(Imm);
    if (ShiftVal == 0)
      return true;
    return AArch64_AM::getShiftType(Imm) == AArch64_AM::LSL && ShiftVal <= 5;
  }

  case AArch64::ADDWrx:
  case AArch64::ADDXrx:
  case AArch64::ADDXrx64:
  case AArch64::ADDSWrx:
  case AArch64::ADDSXrx:
  case AArch64::ADDSXrx64: {
    unsigned Imm = MI.getOperand(3).getImm();
    switch (AArch64_AM::getArithExtendType(Imm)) {
    default:
      return false;
    case AArch64_AM::UXTB:
    case AArch64_AM::UXTH:
    case AArch64_AM::UXTW:
    case AArch64_AM::UXTX:
      return AArch64_AM::getArithShiftValue(Imm) <= 4;
    }
  }

  case AArch64::SUBWrs:
  case AArch64::SUBSWrs: {
    unsigned Imm = MI.getOperand(3).getImm();
    unsigned ShiftVal = AArch64_AM::getShiftValue(Imm);
    return ShiftVal == 0 ||
           (AArch64_AM::getShiftType(Imm) == AArch64_AM::ASR && ShiftVal == 31);
  }

  case AArch64::SUBXrs:
  case AArch64::SUBSXrs: {
    unsigned Imm = MI.getOperand(3).getImm();
    unsigned ShiftVal = AArch64_AM::getShiftValue(Imm);
    return ShiftVal == 0 ||
           (AArch64_AM::getShiftType(Imm) == AArch64_AM::ASR && ShiftVal == 63);
  }

  case AArch64::SUBWrx:
  case AArch64::SUBXrx:
  case AArch64::SUBXrx64:
  case AArch64::SUBSWrx:
  case AArch64::SUBSXrx:
  case AArch64::SUBSXrx64: {
    unsigned Imm = MI.getOperand(3).getImm();
    switch (AArch64_AM::getArithExtendType(Imm)) {
    default:
      return false;
    case AArch64_AM::UXTB:
    case AArch64_AM::UXTH:
    case AArch64_AM::UXTW:
    case AArch64_AM::UXTX:
      return AArch64_AM::getArithShiftValue(Imm) == 0;
    }
  }

  case AArch64::LDRBBroW:
  case AArch64::LDRBBroX:
  case AArch64::LDRBroW:
  case AArch64::LDRBroX:
  case AArch64::LDRDroW:
  case AArch64::LDRDroX:
  case AArch64::LDRHHroW:
  case AArch64::LDRHHroX:
  case AArch64::LDRHroW:
  case AArch64::LDRHroX:
  case AArch64::LDRQroW:
  case AArch64::LDRQroX:
  case AArch64::LDRSBWroW:
  case AArch64::LDRSBWroX:
  case AArch64::LDRSBXroW:
  case AArch64::LDRSBXroX:
  case AArch64::LDRSHWroW:
  case AArch64::LDRSHWroX:
  case AArch64::LDRSHXroW:
  case AArch64::LDRSHXroX:
  case AArch64::LDRSWroW:
  case AArch64::LDRSWroX:
  case AArch64::LDRSroW:
  case AArch64::LDRSroX:
  case AArch64::LDRWroW:
  case AArch64::LDRWroX:
  case AArch64::LDRXroW:
  case AArch64::LDRXroX:
  case AArch64::PRFMroW:
  case AArch64::PRFMroX:
  case AArch64::STRBBroW:
  case AArch64::STRBBroX:
  case AArch64::STRBroW:
  case AArch64::STRBroX:
  case AArch64::STRDroW:
  case AArch64::STRDroX:
  case AArch64::STRHHroW:
  case AArch64::STRHHroX:
  case AArch64::STRHroW:
  case AArch64::STRHroX:
  case AArch64::STRQroW:
  case AArch64::STRQroX:
  case AArch64::STRSroW:
  case AArch64::STRSroX:
  case AArch64::STRWroW:
  case AArch64::STRWroX:
  case AArch64::STRXroW:
  case AArch64::STRXroX: {
    unsigned IsSigned = MI.getOperand(3).getImm();
    return !IsSigned;
  }
  }
}

bool AArch64InstrInfo::isSEHInstruction(const MachineInstr &MI) {
  unsigned Opc = MI.getOpcode();
  switch (Opc) {
  default:
    return false;
  case AArch64::SEH_StackAlloc:
  case AArch64::SEH_SaveFPLR:
  case AArch64::SEH_SaveFPLR_X:
  case AArch64::SEH_SaveReg:
  case AArch64::SEH_SaveReg_X:
  case AArch64::SEH_SaveRegP:
  case AArch64::SEH_SaveRegP_X:
  case AArch64::SEH_SaveFReg:
  case AArch64::SEH_SaveFReg_X:
  case AArch64::SEH_SaveFRegP:
  case AArch64::SEH_SaveFRegP_X:
  case AArch64::SEH_SetFP:
  case AArch64::SEH_AddFP:
  case AArch64::SEH_Nop:
  case AArch64::SEH_PrologEnd:
  case AArch64::SEH_EpilogStart:
  case AArch64::SEH_EpilogEnd:
  case AArch64::SEH_PACSignLR:
  case AArch64::SEH_SaveAnyRegI:
  case AArch64::SEH_SaveAnyRegIP:
  case AArch64::SEH_SaveAnyRegQP:
  case AArch64::SEH_SaveAnyRegQPX:
  case AArch64::SEH_AllocZ:
  case AArch64::SEH_SaveZReg:
  case AArch64::SEH_SavePReg:
    return true;
  }
}

bool AArch64InstrInfo::isCoalescableExtInstr(const MachineInstr &MI,
                                             Register &SrcReg, Register &DstReg,
                                             unsigned &SubIdx) const {
  switch (MI.getOpcode()) {
  default:
    return false;
  case AArch64::SBFMXri: // aka sxtw
  case AArch64::UBFMXri: // aka uxtw
    // Check for the 32 -> 64 bit extension case, these instructions can do
    // much more.
    if (MI.getOperand(2).getImm() != 0 || MI.getOperand(3).getImm() != 31)
      return false;
    // This is a signed or unsigned 32 -> 64 bit extension.
    SrcReg = MI.getOperand(1).getReg();
    DstReg = MI.getOperand(0).getReg();
    SubIdx = AArch64::sub_32;
    return true;
  }
}

bool AArch64InstrInfo::areMemAccessesTriviallyDisjoint(
    const MachineInstr &MIa, const MachineInstr &MIb) const {
  const TargetRegisterInfo *TRI = &getRegisterInfo();
  const MachineOperand *BaseOpA = nullptr, *BaseOpB = nullptr;
  int64_t OffsetA = 0, OffsetB = 0;
  TypeSize WidthA(0, false), WidthB(0, false);
  bool OffsetAIsScalable = false, OffsetBIsScalable = false;

  assert(MIa.mayLoadOrStore() && "MIa must be a load or store.");
  assert(MIb.mayLoadOrStore() && "MIb must be a load or store.");

  if (MIa.hasUnmodeledSideEffects() || MIb.hasUnmodeledSideEffects() ||
      MIa.hasOrderedMemoryRef() || MIb.hasOrderedMemoryRef())
    return false;

  // Retrieve the base, offset from the base and width. Width
  // is the size of memory that is being loaded/stored (e.g. 1, 2, 4, 8). If
  // the bases are identical, and the offset of a lower memory access +
  // the width doesn't overlap the offset of a higher memory access,
  // then the memory accesses are different.
  // If OffsetAIsScalable and OffsetBIsScalable are both true, they
  // are assumed to have the same scale (vscale).
  if (getMemOperandWithOffsetWidth(MIa, BaseOpA, OffsetA, OffsetAIsScalable,
                                   WidthA, TRI) &&
      getMemOperandWithOffsetWidth(MIb, BaseOpB, OffsetB, OffsetBIsScalable,
                                   WidthB, TRI)) {
    if (BaseOpA->isIdenticalTo(*BaseOpB) &&
        OffsetAIsScalable == OffsetBIsScalable) {
      int LowOffset = OffsetA < OffsetB ? OffsetA : OffsetB;
      int HighOffset = OffsetA < OffsetB ? OffsetB : OffsetA;
      TypeSize LowWidth = (LowOffset == OffsetA) ? WidthA : WidthB;
      if (LowWidth.isScalable() == OffsetAIsScalable &&
          LowOffset + (int)LowWidth.getKnownMinValue() <= HighOffset)
        return true;
    }
  }
  return false;
}
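
// Example (illustrative): "str x0, [x1]" and "str x2, [x1, #8]" share the
// same base with offsets 0 and 8 and width 8, so 0 + 8 <= 8 proves them
// disjoint; accesses with different base registers are never disambiguated
// here.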

bool AArch64InstrInfo::isSchedulingBoundary(const MachineInstr &MI,
                                            const MachineBasicBlock *MBB,
                                            const MachineFunction &MF) const {
  if (TargetInstrInfo::isSchedulingBoundary(MI, MBB, MF))
    return true;

  // Do not move an instruction that can be recognized as a branch target.
  if (hasBTISemantics(MI))
    return true;

  switch (MI.getOpcode()) {
  case AArch64::HINT:
    // CSDB hints are scheduling barriers.
    if (MI.getOperand(0).getImm() == 0x14)
      return true;
    break;
  case AArch64::DSB:
  case AArch64::ISB:
    // DSB and ISB also are scheduling barriers.
    return true;
  case AArch64::MSRpstatesvcrImm1:
    // SMSTART and SMSTOP are also scheduling barriers.
    return true;
  default:;
  }
  if (isSEHInstruction(MI))
    return true;
  auto Next = std::next(MI.getIterator());
  return Next != MBB->end() && Next->isCFIInstruction();
}

/// analyzeCompare - For a comparison instruction, return the source registers
/// in SrcReg and SrcReg2, and the value it compares against in CmpValue.
/// Return true if the comparison instruction can be analyzed.
bool AArch64InstrInfo::analyzeCompare(const MachineInstr &MI, Register &SrcReg,
                                      Register &SrcReg2, int64_t &CmpMask,
                                      int64_t &CmpValue) const {
  // The first operand can be a frame index where we'd normally expect a
  // register.
  // FIXME: Pass subregisters out of analyzeCompare
  assert(MI.getNumOperands() >= 2 && "All AArch64 cmps should have 2 operands");
  if (!MI.getOperand(1).isReg() || MI.getOperand(1).getSubReg())
    return false;

  switch (MI.getOpcode()) {
  default:
    break;
  case AArch64::PTEST_PP:
  case AArch64::PTEST_PP_ANY:
  case AArch64::PTEST_PP_FIRST:
    SrcReg = MI.getOperand(0).getReg();
    SrcReg2 = MI.getOperand(1).getReg();
    if (MI.getOperand(2).getSubReg())
      return false;

    // Not sure about the mask and value for now...
    CmpMask = ~0;
    CmpValue = 0;
    return true;
  case AArch64::SUBSWrr:
  case AArch64::SUBSWrs:
  case AArch64::SUBSWrx:
  case AArch64::SUBSXrr:
  case AArch64::SUBSXrs:
  case AArch64::SUBSXrx:
  case AArch64::ADDSWrr:
  case AArch64::ADDSWrs:
  case AArch64::ADDSWrx:
  case AArch64::ADDSXrr:
  case AArch64::ADDSXrs:
  case AArch64::ADDSXrx:
    // Replace SUBSWrr with SUBWrr if NZCV is not used.
    SrcReg = MI.getOperand(1).getReg();
    SrcReg2 = MI.getOperand(2).getReg();

    // FIXME: Pass subregisters out of analyzeCompare
    if (MI.getOperand(2).getSubReg())
      return false;

    CmpMask = ~0;
    CmpValue = 0;
    return true;
  case AArch64::SUBSWri:
  case AArch64::ADDSWri:
  case AArch64::SUBSXri:
  case AArch64::ADDSXri:
    SrcReg = MI.getOperand(1).getReg();
    SrcReg2 = 0;
    CmpMask = ~0;
    CmpValue = MI.getOperand(2).getImm();
    return true;
  case AArch64::ANDSWri:
  case AArch64::ANDSXri:
    // ANDS does not use the same encoding scheme as the other xxxS
    // instructions.
    SrcReg = MI.getOperand(1).getReg();
    SrcReg2 = 0;
    CmpMask = ~0;
    CmpValue = AArch64_AM::decodeLogicalImmediate(
        MI.getOperand(2).getImm(),
        MI.getOpcode() == AArch64::ANDSWri ? 32 : 64);
    return true;
  }

  return false;
}
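
// Example (illustrative): for "subs x0, x1, #42" analyzeCompare yields
// SrcReg = x1, SrcReg2 = 0, CmpMask = ~0, CmpValue = 42; for ANDS the
// logical-immediate encoding is decoded first so CmpValue holds the actual
// mask value.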

static bool UpdateOperandRegClass(MachineInstr &Instr) {
  MachineBasicBlock *MBB = Instr.getParent();
  assert(MBB && "Can't get MachineBasicBlock here");
  MachineFunction *MF = MBB->getParent();
  assert(MF && "Can't get MachineFunction here");
  const TargetInstrInfo *TII = MF->getSubtarget().getInstrInfo();
  const TargetRegisterInfo *TRI = MF->getSubtarget().getRegisterInfo();
  MachineRegisterInfo *MRI = &MF->getRegInfo();

  for (unsigned OpIdx = 0, EndIdx = Instr.getNumOperands(); OpIdx < EndIdx;
       ++OpIdx) {
    MachineOperand &MO = Instr.getOperand(OpIdx);
    const TargetRegisterClass *OpRegCstraints =
        Instr.getRegClassConstraint(OpIdx, TII, TRI);

    // If there's no constraint, there's nothing to do.
    if (!OpRegCstraints)
      continue;
    // If the operand is a frame index, there's nothing to do here.
    // A frame index operand will resolve correctly during PEI.
    if (MO.isFI())
      continue;

    assert(MO.isReg() &&
           "Operand has register constraints without being a register!");

    Register Reg = MO.getReg();
    if (Reg.isPhysical()) {
      if (!OpRegCstraints->contains(Reg))
        return false;
    } else if (!OpRegCstraints->hasSubClassEq(MRI->getRegClass(Reg)) &&
               !MRI->constrainRegClass(Reg, OpRegCstraints))
      return false;
  }

  return true;
}

/// Return the opcode that does not set flags when possible - otherwise
/// return the original opcode. The caller is responsible to do the actual
/// substitution and legality checking.
static unsigned convertToNonFlagSettingOpc(const MachineInstr &MI) {
  // Don't convert all compare instructions, because for some the zero register
  // encoding becomes the sp register.
  bool MIDefinesZeroReg = false;
  if (MI.definesRegister(AArch64::WZR, /*TRI=*/nullptr) ||
      MI.definesRegister(AArch64::XZR, /*TRI=*/nullptr))
    MIDefinesZeroReg = true;

  switch (MI.getOpcode()) {
  default:
    return MI.getOpcode();
  case AArch64::ADDSWrr:
    return AArch64::ADDWrr;
  case AArch64::ADDSWri:
    return MIDefinesZeroReg ? AArch64::ADDSWri : AArch64::ADDWri;
  case AArch64::ADDSWrs:
    return MIDefinesZeroReg ? AArch64::ADDSWrs : AArch64::ADDWrs;
  case AArch64::ADDSWrx:
    return AArch64::ADDWrx;
  case AArch64::ADDSXrr:
    return AArch64::ADDXrr;
  case AArch64::ADDSXri:
    return MIDefinesZeroReg ? AArch64::ADDSXri : AArch64::ADDXri;
  case AArch64::ADDSXrs:
    return MIDefinesZeroReg ? AArch64::ADDSXrs : AArch64::ADDXrs;
  case AArch64::ADDSXrx:
    return AArch64::ADDXrx;
  case AArch64::SUBSWrr:
    return AArch64::SUBWrr;
  case AArch64::SUBSWri:
    return MIDefinesZeroReg ? AArch64::SUBSWri : AArch64::SUBWri;
  case AArch64::SUBSWrs:
    return MIDefinesZeroReg ? AArch64::SUBSWrs : AArch64::SUBWrs;
  case AArch64::SUBSWrx:
    return AArch64::SUBWrx;
  case AArch64::SUBSXrr:
    return AArch64::SUBXrr;
  case AArch64::SUBSXri:
    return MIDefinesZeroReg ? AArch64::SUBSXri : AArch64::SUBXri;
  case AArch64::SUBSXrs:
    return MIDefinesZeroReg ? AArch64::SUBSXrs : AArch64::SUBXrs;
  case AArch64::SUBSXrx:
    return AArch64::SUBXrx;
  }
}
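
// Example (illustrative): ADDSWrr always maps to ADDWrr, but ADDSWri is kept
// as ADDSWri when the instruction defines WZR/XZR, because dropping the S
// would re-interpret the zero-register encoding as sp.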

enum AccessKind { AK_Write = 0x01, AK_Read = 0x10, AK_All = 0x11 };

/// True when condition flags are accessed (either by writing or reading)
/// on the instruction trace starting at From and ending at To.
///
/// Note: If From and To are from different blocks it's assumed CC are accessed
/// on the path.
static bool areCFlagsAccessedBetweenInstrs(
    MachineBasicBlock::iterator From, MachineBasicBlock::iterator To,
    const TargetRegisterInfo *TRI, const AccessKind AccessToCheck = AK_All) {
  // Early exit if To is at the beginning of the BB.
  if (To == To->getParent()->begin())
    return true;

  // Check whether the instructions are in the same basic block
  // If not, assume the condition flags might get modified somewhere.
  if (To->getParent() != From->getParent())
    return true;

  // From must be above To.
  assert(std::any_of(
      ++To.getReverse(), To->getParent()->rend(),
      [From](MachineInstr &MI) { return MI.getIterator() == From; }));

  // We iterate backward starting at \p To until we hit \p From.
  for (const MachineInstr &Instr :
       instructionsWithoutDebug(++To.getReverse(), From.getReverse())) {
    if (((AccessToCheck & AK_Write) &&
         Instr.modifiesRegister(AArch64::NZCV, TRI)) ||
        ((AccessToCheck & AK_Read) && Instr.readsRegister(AArch64::NZCV, TRI)))
      return true;
  }
  return false;
}
1788
1789std::optional<unsigned>
1790AArch64InstrInfo::canRemovePTestInstr(MachineInstr *PTest, MachineInstr *Mask,
1791 MachineInstr *Pred,
1792 const MachineRegisterInfo *MRI) const {
1793 unsigned MaskOpcode = Mask->getOpcode();
1794 unsigned PredOpcode = Pred->getOpcode();
1795 bool PredIsPTestLike = isPTestLikeOpcode(PredOpcode);
1796 bool PredIsWhileLike = isWhileOpcode(PredOpcode);
1797
1798 if (PredIsWhileLike) {
1799 // For PTEST(PG, PG), PTEST is redundant when PG is the result of a WHILEcc
1800 // instruction and the condition is "any" since WHILcc does an implicit
1801 // PTEST(ALL, PG) check and PG is always a subset of ALL.
1802 if ((Mask == Pred) && PTest->getOpcode() == AArch64::PTEST_PP_ANY)
1803 return PredOpcode;
1804
1805 // For PTEST(PTRUE_ALL, WHILE), if the element size matches, the PTEST is
1806 // redundant since WHILE performs an implicit PTEST with an all active
1807 // mask.
1808 if (isPTrueOpcode(MaskOpcode) && Mask->getOperand(1).getImm() == 31 &&
1809 getElementSizeForOpcode(MaskOpcode) ==
1810 getElementSizeForOpcode(PredOpcode))
1811 return PredOpcode;
1812
1813 // For PTEST_FIRST(PTRUE_ALL, WHILE), the PTEST_FIRST is redundant since
1814 // WHILEcc performs an implicit PTEST with an all active mask, setting
1815 // the N flag as the PTEST_FIRST would.
1816 if (PTest->getOpcode() == AArch64::PTEST_PP_FIRST &&
1817 isPTrueOpcode(MaskOpcode) && Mask->getOperand(1).getImm() == 31)
1818 return PredOpcode;
1819
1820 return {};
1821 }
1822
1823 if (PredIsPTestLike) {
1824 // For PTEST(PG, PG), PTEST is redundant when PG is the result of an
1825 // instruction that sets the flags as PTEST would and the condition is
1826 // "any" since PG is always a subset of the governing predicate of the
1827 // ptest-like instruction.
1828 if ((Mask == Pred) && PTest->getOpcode() == AArch64::PTEST_PP_ANY)
1829 return PredOpcode;
1830
1831 auto PTestLikeMask = MRI->getUniqueVRegDef(Pred->getOperand(1).getReg());
1832
1833 // If the PTEST like instruction's general predicate is not `Mask`, attempt
1834 // to look through a copy and try again. This is because some instructions
1835 // take a predicate whose register class is a subset of its result class.
1836 if (Mask != PTestLikeMask && PTestLikeMask->isFullCopy() &&
1837 PTestLikeMask->getOperand(1).getReg().isVirtual())
1838 PTestLikeMask =
1839 MRI->getUniqueVRegDef(PTestLikeMask->getOperand(1).getReg());
1840
1841 // For PTEST(PTRUE_ALL, PTEST_LIKE), the PTEST is redundant if
1842 // the element size matches and either the PTEST_LIKE instruction uses
1843 // the same all active mask or the condition is "any".
1844 if (isPTrueOpcode(MaskOpcode) && Mask->getOperand(1).getImm() == 31 &&
1845 getElementSizeForOpcode(MaskOpcode) ==
1846 getElementSizeForOpcode(PredOpcode)) {
1847 if (Mask == PTestLikeMask || PTest->getOpcode() == AArch64::PTEST_PP_ANY)
1848 return PredOpcode;
1849 }
1850
1851 // For PTEST(PG, PTEST_LIKE(PG, ...)), the PTEST is redundant since the
1852 // flags are set based on the same mask 'PG', but PTEST_LIKE must operate
1853 // on 8-bit predicates like the PTEST. Otherwise, for instructions like
1854 // compare that also support 16/32/64-bit predicates, the implicit PTEST
1855 // performed by the compare could consider fewer lanes for these element
1856 // sizes.
1857 //
1858 // For example, consider
1859 //
1860 // ptrue p0.b ; P0=1111-1111-1111-1111
1861 // index z0.s, #0, #1 ; Z0=<0,1,2,3>
1862 // index z1.s, #1, #1 ; Z1=<1,2,3,4>
1863 // cmphi p1.s, p0/z, z1.s, z0.s ; P1=0001-0001-0001-0001
1864 // ; ^ last active
1865 // ptest p0, p1.b ; P1=0001-0001-0001-0001
1866 // ; ^ last active
1867 //
1868 // where the compare generates a canonical all active 32-bit predicate
1869 // (equivalent to 'ptrue p1.s, all'). The implicit PTEST sets the last
1870 // active flag, whereas the PTEST instruction with the same mask doesn't.
1871 // For PTEST_ANY this doesn't apply as the flags in this case would be
1872 // identical regardless of element size.
1873 uint64_t PredElementSize = getElementSizeForOpcode(PredOpcode);
1874 if (Mask == PTestLikeMask && (PredElementSize == AArch64::ElementSizeB ||
1875 PTest->getOpcode() == AArch64::PTEST_PP_ANY))
1876 return PredOpcode;
1877
1878 return {};
1879 }
1880
1881 // If OP in PTEST(PG, OP(PG, ...)) has a flag-setting variant change the
1882 // opcode so the PTEST becomes redundant.
1883 switch (PredOpcode) {
1884 case AArch64::AND_PPzPP:
1885 case AArch64::BIC_PPzPP:
1886 case AArch64::EOR_PPzPP:
1887 case AArch64::NAND_PPzPP:
1888 case AArch64::NOR_PPzPP:
1889 case AArch64::ORN_PPzPP:
1890 case AArch64::ORR_PPzPP:
1891 case AArch64::BRKA_PPzP:
1892 case AArch64::BRKPA_PPzPP:
1893 case AArch64::BRKB_PPzP:
1894 case AArch64::BRKPB_PPzPP:
1895 case AArch64::RDFFR_PPz: {
1896 // Check to see if our mask is the same. If not the resulting flag bits
1897 // may be different and we can't remove the ptest.
1898 auto *PredMask = MRI->getUniqueVRegDef(Pred->getOperand(1).getReg());
1899 if (Mask != PredMask)
1900 return {};
1901 break;
1902 }
1903 case AArch64::BRKN_PPzP: {
1904 // BRKN uses an all active implicit mask to set flags unlike the other
1905 // flag-setting instructions.
1906 // PTEST(PTRUE_B(31), BRKN(PG, A, B)) -> BRKNS(PG, A, B).
1907 if ((MaskOpcode != AArch64::PTRUE_B) ||
1908 (Mask->getOperand(1).getImm() != 31))
1909 return {};
1910 break;
1911 }
1912 case AArch64::PTRUE_B:
1913 // PTEST(OP=PTRUE_B(A), OP) -> PTRUES_B(A)
1914 break;
1915 default:
1916 // Bail out if we don't recognize the input
1917 return {};
1918 }
1919
1920 return convertToFlagSettingOpc(PredOpcode);
1921}
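//
// A sketch of the flag-setting conversion enabled by the switch above
// (illustrative SVE sequence, not from a specific test):
//
//   and   p1.b, p0/z, p2.b, p3.b
//   ptest p0, p1.b
//   b.ne  ...
// becomes
//   ands  p1.b, p0/z, p2.b, p3.b
//   b.ne  ...
//
// ANDS sets NZCV from the same governing predicate p0, making the PTEST
// redundant.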
1922
1923/// optimizePTestInstr - Attempt to remove a ptest of a predicate-generating
1924/// operation which could set the flags in an identical manner
1925bool AArch64InstrInfo::optimizePTestInstr(
1926 MachineInstr *PTest, unsigned MaskReg, unsigned PredReg,
1927 const MachineRegisterInfo *MRI) const {
1928 auto *Mask = MRI->getUniqueVRegDef(MaskReg);
1929 auto *Pred = MRI->getUniqueVRegDef(PredReg);
1930
1931 if (Pred->isCopy() && PTest->getOpcode() == AArch64::PTEST_PP_FIRST) {
1932 // Instructions which return a multi-vector (e.g. WHILECC_x2) require copies
1933 // before the branch to extract each subregister.
1934 auto Op = Pred->getOperand(1);
1935 if (Op.isReg() && Op.getReg().isVirtual() &&
1936 Op.getSubReg() == AArch64::psub0)
1937 Pred = MRI->getUniqueVRegDef(Op.getReg());
1938 }
1939
1940 unsigned PredOpcode = Pred->getOpcode();
1941 auto NewOp = canRemovePTestInstr(PTest, Mask, Pred, MRI);
1942 if (!NewOp)
1943 return false;
1944
1945 const TargetRegisterInfo *TRI = &getRegisterInfo();
1946
1947 // If another instruction between Pred and PTest accesses flags, don't remove
1948 // the ptest or update the earlier instruction to modify them.
1949 if (areCFlagsAccessedBetweenInstrs(Pred, PTest, TRI))
1950 return false;
1951
1952 // If we pass all the checks, it's safe to remove the PTEST and use the flags
1953 // as they are prior to PTEST. Sometimes this requires the tested PTEST
1954 // operand to be replaced with an equivalent instruction that also sets the
1955 // flags.
1956 PTest->eraseFromParent();
1957 if (*NewOp != PredOpcode) {
1958 Pred->setDesc(get(*NewOp));
1959 bool succeeded = UpdateOperandRegClass(*Pred);
1960 (void)succeeded;
1961 assert(succeeded && "Operands have incompatible register classes!");
1962 Pred->addRegisterDefined(AArch64::NZCV, TRI);
1963 }
1964
1965 // Ensure that the flags def is live.
1966 if (Pred->registerDefIsDead(AArch64::NZCV, TRI)) {
1967 unsigned i = 0, e = Pred->getNumOperands();
1968 for (; i != e; ++i) {
1969 MachineOperand &MO = Pred->getOperand(i);
1970 if (MO.isReg() && MO.isDef() && MO.getReg() == AArch64::NZCV) {
1971 MO.setIsDead(false);
1972 break;
1973 }
1974 }
1975 }
1976 return true;
1977}
1978
1979/// Try to optimize a compare instruction. A compare instruction is an
1980/// instruction which produces AArch64::NZCV. It is a true compare instruction
1981/// only when there are no uses of its destination register.
1983///
1984/// The following steps are tried in order:
1985/// 1. Convert CmpInstr into an unconditional version.
1986/// 2. Remove CmpInstr if above there is an instruction producing a needed
1987/// condition code or an instruction which can be converted into such an
1988/// instruction.
1989/// Only comparison with zero is supported.
1990bool AArch64InstrInfo::optimizeCompareInstr(
1991 MachineInstr &CmpInstr, Register SrcReg, Register SrcReg2, int64_t CmpMask,
1992 int64_t CmpValue, const MachineRegisterInfo *MRI) const {
1993 assert(CmpInstr.getParent());
1994 assert(MRI);
1995
1996 // Replace SUBSWrr with SUBWrr if NZCV is not used.
1997 int DeadNZCVIdx =
1998 CmpInstr.findRegisterDefOperandIdx(AArch64::NZCV, /*TRI=*/nullptr, true);
1999 if (DeadNZCVIdx != -1) {
2000 if (CmpInstr.definesRegister(AArch64::WZR, /*TRI=*/nullptr) ||
2001 CmpInstr.definesRegister(AArch64::XZR, /*TRI=*/nullptr)) {
2002 CmpInstr.eraseFromParent();
2003 return true;
2004 }
2005 unsigned Opc = CmpInstr.getOpcode();
2006 unsigned NewOpc = convertToNonFlagSettingOpc(CmpInstr);
2007 if (NewOpc == Opc)
2008 return false;
2009 const MCInstrDesc &MCID = get(NewOpc);
2010 CmpInstr.setDesc(MCID);
2011 CmpInstr.removeOperand(DeadNZCVIdx);
2012 bool succeeded = UpdateOperandRegClass(CmpInstr);
2013 (void)succeeded;
2014 assert(succeeded && "Some operands reg class are incompatible!");
2015 return true;
2016 }
2017
2018 if (CmpInstr.getOpcode() == AArch64::PTEST_PP ||
2019 CmpInstr.getOpcode() == AArch64::PTEST_PP_ANY ||
2020 CmpInstr.getOpcode() == AArch64::PTEST_PP_FIRST)
2021 return optimizePTestInstr(&CmpInstr, SrcReg, SrcReg2, MRI);
2022
2023 if (SrcReg2 != 0)
2024 return false;
2025
2026 // CmpInstr is a Compare instruction if destination register is not used.
2027 if (!MRI->use_nodbg_empty(CmpInstr.getOperand(0).getReg()))
2028 return false;
2029
2030 if (CmpValue == 0 && substituteCmpToZero(CmpInstr, SrcReg, *MRI))
2031 return true;
2032 return (CmpValue == 0 || CmpValue == 1) &&
2033 removeCmpToZeroOrOne(CmpInstr, SrcReg, CmpValue, *MRI);
2034}
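//
// For example (illustrative), when the NZCV def of a flag-setting compare is
// dead:
//
//   subs w0, w1, w2       // NZCV never read
// becomes
//   sub  w0, w1, w2
//
// and if the instruction only defined WZR/XZR alongside the dead flags, it is
// erased outright.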
2035
2036/// Get opcode of S version of Instr.
2037/// If Instr is S version its opcode is returned.
2038/// AArch64::INSTRUCTION_LIST_END is returned if Instr does not have S version
2039/// or we are not interested in it.
2040static unsigned sForm(MachineInstr &Instr) {
2041 switch (Instr.getOpcode()) {
2042 default:
2043 return AArch64::INSTRUCTION_LIST_END;
2044
2045 case AArch64::ADDSWrr:
2046 case AArch64::ADDSWri:
2047 case AArch64::ADDSXrr:
2048 case AArch64::ADDSXri:
2049 case AArch64::ADDSWrx:
2050 case AArch64::ADDSXrx:
2051 case AArch64::SUBSWrr:
2052 case AArch64::SUBSWri:
2053 case AArch64::SUBSWrx:
2054 case AArch64::SUBSXrr:
2055 case AArch64::SUBSXri:
2056 case AArch64::SUBSXrx:
2057 case AArch64::ANDSWri:
2058 case AArch64::ANDSWrr:
2059 case AArch64::ANDSWrs:
2060 case AArch64::ANDSXri:
2061 case AArch64::ANDSXrr:
2062 case AArch64::ANDSXrs:
2063 case AArch64::BICSWrr:
2064 case AArch64::BICSXrr:
2065 case AArch64::BICSWrs:
2066 case AArch64::BICSXrs:
2067 return Instr.getOpcode();
2068
2069 case AArch64::ADDWrr:
2070 return AArch64::ADDSWrr;
2071 case AArch64::ADDWri:
2072 return AArch64::ADDSWri;
2073 case AArch64::ADDXrr:
2074 return AArch64::ADDSXrr;
2075 case AArch64::ADDXri:
2076 return AArch64::ADDSXri;
2077 case AArch64::ADDWrx:
2078 return AArch64::ADDSWrx;
2079 case AArch64::ADDXrx:
2080 return AArch64::ADDSXrx;
2081 case AArch64::ADCWr:
2082 return AArch64::ADCSWr;
2083 case AArch64::ADCXr:
2084 return AArch64::ADCSXr;
2085 case AArch64::SUBWrr:
2086 return AArch64::SUBSWrr;
2087 case AArch64::SUBWri:
2088 return AArch64::SUBSWri;
2089 case AArch64::SUBXrr:
2090 return AArch64::SUBSXrr;
2091 case AArch64::SUBXri:
2092 return AArch64::SUBSXri;
2093 case AArch64::SUBWrx:
2094 return AArch64::SUBSWrx;
2095 case AArch64::SUBXrx:
2096 return AArch64::SUBSXrx;
2097 case AArch64::SBCWr:
2098 return AArch64::SBCSWr;
2099 case AArch64::SBCXr:
2100 return AArch64::SBCSXr;
2101 case AArch64::ANDWri:
2102 return AArch64::ANDSWri;
2103 case AArch64::ANDXri:
2104 return AArch64::ANDSXri;
2105 case AArch64::ANDWrr:
2106 return AArch64::ANDSWrr;
2107 case AArch64::ANDWrs:
2108 return AArch64::ANDSWrs;
2109 case AArch64::ANDXrr:
2110 return AArch64::ANDSXrr;
2111 case AArch64::ANDXrs:
2112 return AArch64::ANDSXrs;
2113 case AArch64::BICWrr:
2114 return AArch64::BICSWrr;
2115 case AArch64::BICXrr:
2116 return AArch64::BICSXrr;
2117 case AArch64::BICWrs:
2118 return AArch64::BICSWrs;
2119 case AArch64::BICXrs:
2120 return AArch64::BICSXrs;
2121 }
2122}
2123
2124/// Check if AArch64::NZCV should be alive in successors of MBB.
2125static bool areCFlagsAliveInSuccessors(const MachineBasicBlock *MBB) {
2126 for (auto *BB : MBB->successors())
2127 if (BB->isLiveIn(AArch64::NZCV))
2128 return true;
2129 return false;
2130}
2131
2132/// \returns The condition code operand index for \p Instr if it is a branch
2133/// or select and -1 otherwise.
2134int AArch64InstrInfo::findCondCodeUseOperandIdxForBranchOrSelect(
2135 const MachineInstr &Instr) {
2136 switch (Instr.getOpcode()) {
2137 default:
2138 return -1;
2139
2140 case AArch64::Bcc: {
2141 int Idx = Instr.findRegisterUseOperandIdx(AArch64::NZCV, /*TRI=*/nullptr);
2142 assert(Idx >= 2);
2143 return Idx - 2;
2144 }
2145
2146 case AArch64::CSINVWr:
2147 case AArch64::CSINVXr:
2148 case AArch64::CSINCWr:
2149 case AArch64::CSINCXr:
2150 case AArch64::CSELWr:
2151 case AArch64::CSELXr:
2152 case AArch64::CSNEGWr:
2153 case AArch64::CSNEGXr:
2154 case AArch64::FCSELSrrr:
2155 case AArch64::FCSELDrrr: {
2156 int Idx = Instr.findRegisterUseOperandIdx(AArch64::NZCV, /*TRI=*/nullptr);
2157 assert(Idx >= 1);
2158 return Idx - 1;
2159 }
2160 }
2161}
2162
2163/// Find a condition code used by the instruction.
2164/// Returns AArch64CC::Invalid if either the instruction does not use condition
2165/// codes or we don't optimize CmpInstr in the presence of such instructions.
2166static AArch64CC::CondCode findCondCodeUsedByInstr(const MachineInstr &Instr) {
2167 int CCIdx =
2168 AArch64InstrInfo::findCondCodeUseOperandIdxForBranchOrSelect(Instr);
2169 return CCIdx >= 0 ? static_cast<AArch64CC::CondCode>(
2170 Instr.getOperand(CCIdx).getImm())
2171 : AArch64CC::Invalid;
2172}
2173
2175UsedNZCV llvm::getUsedNZCV(AArch64CC::CondCode CC) {
2176 UsedNZCV UsedFlags;
2177 switch (CC) {
2178 default:
2179 break;
2180
2181 case AArch64CC::EQ: // Z set
2182 case AArch64CC::NE: // Z clear
2183 UsedFlags.Z = true;
2184 break;
2185
2186 case AArch64CC::HI: // Z clear and C set
2187 case AArch64CC::LS: // Z set or C clear
2188 UsedFlags.Z = true;
2189 [[fallthrough]];
2190 case AArch64CC::HS: // C set
2191 case AArch64CC::LO: // C clear
2192 UsedFlags.C = true;
2193 break;
2194
2195 case AArch64CC::MI: // N set
2196 case AArch64CC::PL: // N clear
2197 UsedFlags.N = true;
2198 break;
2199
2200 case AArch64CC::VS: // V set
2201 case AArch64CC::VC: // V clear
2202 UsedFlags.V = true;
2203 break;
2204
2205 case AArch64CC::GT: // Z clear, N and V the same
2206 case AArch64CC::LE: // Z set, N and V differ
2207 UsedFlags.Z = true;
2208 [[fallthrough]];
2209 case AArch64CC::GE: // N and V the same
2210 case AArch64CC::LT: // N and V differ
2211 UsedFlags.N = true;
2212 UsedFlags.V = true;
2213 break;
2214 }
2215 return UsedFlags;
2216}
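//
// For example, a conditional branch on GT depends on Z, N and V:
//
//   cmp  w0, w1
//   b.gt target           // taken when Z == 0 && N == V
//
// so getUsedNZCV(AArch64CC::GT) reports Z, N and V as used (illustrative).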
2217
2218/// \returns The condition flags used after \p CmpInstr in its MachineBB if the
2219/// NZCV flags are not alive in the successors of the block containing both
2220/// \p CmpInstr and \p MI; std::nullopt otherwise.
2221///
2222/// Collects the instructions using those flags in \p CCUseInstrs if provided.
2223std::optional<UsedNZCV>
2224llvm::examineCFlagsUse(MachineInstr &MI, MachineInstr &CmpInstr,
2225 const TargetRegisterInfo &TRI,
2226 SmallVectorImpl<MachineInstr *> *CCUseInstrs) {
2227 MachineBasicBlock *CmpParent = CmpInstr.getParent();
2228 if (MI.getParent() != CmpParent)
2229 return std::nullopt;
2230
2231 if (areCFlagsAliveInSuccessors(CmpParent))
2232 return std::nullopt;
2233
2234 UsedNZCV NZCVUsedAfterCmp;
2235 for (MachineInstr &Instr : instructionsWithoutDebug(
2236 std::next(CmpInstr.getIterator()), CmpParent->instr_end())) {
2237 if (Instr.readsRegister(AArch64::NZCV, &TRI)) {
2238 AArch64CC::CondCode CC = findCondCodeUsedByInstr(Instr);
2239 if (CC == AArch64CC::Invalid) // Unsupported conditional instruction
2240 return std::nullopt;
2241 NZCVUsedAfterCmp |= getUsedNZCV(CC);
2242 if (CCUseInstrs)
2243 CCUseInstrs->push_back(&Instr);
2244 }
2245 if (Instr.modifiesRegister(AArch64::NZCV, &TRI))
2246 break;
2247 }
2248 return NZCVUsedAfterCmp;
2249}
2250
2251static bool isADDSRegImm(unsigned Opcode) {
2252 return Opcode == AArch64::ADDSWri || Opcode == AArch64::ADDSXri;
2253}
2254
2255static bool isSUBSRegImm(unsigned Opcode) {
2256 return Opcode == AArch64::SUBSWri || Opcode == AArch64::SUBSXri;
2257}
2258
2259static bool isANDOpcode(MachineInstr &MI) {
2260 unsigned Opc = sForm(MI);
2261 switch (Opc) {
2262 case AArch64::ANDSWri:
2263 case AArch64::ANDSWrr:
2264 case AArch64::ANDSWrs:
2265 case AArch64::ANDSXri:
2266 case AArch64::ANDSXrr:
2267 case AArch64::ANDSXrs:
2268 case AArch64::BICSWrr:
2269 case AArch64::BICSXrr:
2270 case AArch64::BICSWrs:
2271 case AArch64::BICSXrs:
2272 return true;
2273 default:
2274 return false;
2275 }
2276}
2277
2278/// Check if CmpInstr can be substituted by MI.
2279///
2280/// CmpInstr can be substituted:
2281/// - CmpInstr is either 'ADDS %vreg, 0' or 'SUBS %vreg, 0'
2282/// - and, MI and CmpInstr are from the same MachineBB
2283/// - and, condition flags are not alive in successors of the CmpInstr parent
2284/// - and, if MI opcode is the S form there must be no defs of flags between
2285/// MI and CmpInstr
2286/// or if MI opcode is not the S form there must be neither defs of flags
2287/// nor uses of flags between MI and CmpInstr.
2288/// - and, if C/V flags are not used after CmpInstr
2289/// or if N flag is used but MI produces poison value if signed overflow
2290/// occurs.
2291static bool canInstrSubstituteCmpInstr(MachineInstr &MI, MachineInstr &CmpInstr,
2292 const TargetRegisterInfo &TRI) {
2293 // NOTE: this assertion guarantees that MI.getOpcode() is an add, subtract,
2294 // or logical operation that may or may not set flags.
2295 assert(sForm(MI) != AArch64::INSTRUCTION_LIST_END);
2296
2297 const unsigned CmpOpcode = CmpInstr.getOpcode();
2298 if (!isADDSRegImm(CmpOpcode) && !isSUBSRegImm(CmpOpcode))
2299 return false;
2300
2301 assert((CmpInstr.getOperand(2).isImm() &&
2302 CmpInstr.getOperand(2).getImm() == 0) &&
2303 "Caller guarantees that CmpInstr compares with constant 0");
2304
2305 std::optional<UsedNZCV> NZVCUsed = examineCFlagsUse(MI, CmpInstr, TRI);
2306 if (!NZVCUsed || NZVCUsed->C)
2307 return false;
2308
2309 // CmpInstr is either 'ADDS %vreg, 0' or 'SUBS %vreg, 0', and MI is either
2310 // '%vreg = add ...' or '%vreg = sub ...'.
2311 // Condition flag V is used to indicate signed overflow.
2312 // 1) MI and CmpInstr set N and V to the same value.
2313 // 2) If MI is add/sub with no-signed-wrap, it produces a poison value when
2314 // signed overflow occurs, so CmpInstr could still be simplified away.
2315 // Note that Ands and Bics instructions always clear the V flag.
2316 if (NZVCUsed->V && !MI.getFlag(MachineInstr::NoSWrap) && !isANDOpcode(MI))
2317 return false;
2318
2319 AccessKind AccessToCheck = AK_Write;
2320 if (sForm(MI) != MI.getOpcode())
2321 AccessToCheck = AK_All;
2322 return !areCFlagsAccessedBetweenInstrs(&MI, &CmpInstr, &TRI, AccessToCheck);
2323}
2324
2325/// Substitute an instruction comparing to zero with another instruction
2326/// which produces needed condition flags.
2327///
2328/// Return true on success.
2329bool AArch64InstrInfo::substituteCmpToZero(
2330 MachineInstr &CmpInstr, unsigned SrcReg,
2331 const MachineRegisterInfo &MRI) const {
2332 // Get the unique definition of SrcReg.
2333 MachineInstr *MI = MRI.getUniqueVRegDef(SrcReg);
2334 if (!MI)
2335 return false;
2336
2337 const TargetRegisterInfo &TRI = getRegisterInfo();
2338
2339 unsigned NewOpc = sForm(*MI);
2340 if (NewOpc == AArch64::INSTRUCTION_LIST_END)
2341 return false;
2342
2343 if (!canInstrSubstituteCmpInstr(*MI, CmpInstr, TRI))
2344 return false;
2345
2346 // Update the instruction to set NZCV.
2347 MI->setDesc(get(NewOpc));
2348 CmpInstr.eraseFromParent();
2349 bool succeeded = UpdateOperandRegClass(*MI);
2350 (void)succeeded;
2351 assert(succeeded && "Some operands reg class are incompatible!");
2352 MI->addRegisterDefined(AArch64::NZCV, &TRI);
2353 return true;
2354}
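//
// Illustrative example of the substitution (hypothetical registers):
//
//   sub w0, w1, w2
//   cmp w0, #0            // SUBSWri of 0
//   b.eq target
// becomes
//   subs w0, w1, w2
//   b.eq target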
2355
2356/// \returns True if \p CmpInstr can be removed.
2357///
2358/// \p IsInvertCC is true if, after removing \p CmpInstr, condition
2359/// codes used in \p CCUseInstrs must be inverted.
2360static bool canCmpInstrBeRemoved(MachineInstr &MI, MachineInstr &CmpInstr,
2361 int CmpValue, const TargetRegisterInfo &TRI,
2362 SmallVectorImpl<MachineInstr *> &CCUseInstrs,
2363 bool &IsInvertCC) {
2364 assert((CmpValue == 0 || CmpValue == 1) &&
2365 "Only comparisons to 0 or 1 considered for removal!");
2366
2367 // MI is 'CSINCWr %vreg, wzr, wzr, <cc>' or 'CSINCXr %vreg, xzr, xzr, <cc>'
2368 unsigned MIOpc = MI.getOpcode();
2369 if (MIOpc == AArch64::CSINCWr) {
2370 if (MI.getOperand(1).getReg() != AArch64::WZR ||
2371 MI.getOperand(2).getReg() != AArch64::WZR)
2372 return false;
2373 } else if (MIOpc == AArch64::CSINCXr) {
2374 if (MI.getOperand(1).getReg() != AArch64::XZR ||
2375 MI.getOperand(2).getReg() != AArch64::XZR)
2376 return false;
2377 } else {
2378 return false;
2379 }
2380 AArch64CC::CondCode MICC = findCondCodeUsedByInstr(MI);
2381 if (MICC == AArch64CC::Invalid)
2382 return false;
2383
2384 // NZCV needs to be defined
2385 if (MI.findRegisterDefOperandIdx(AArch64::NZCV, /*TRI=*/nullptr, true) != -1)
2386 return false;
2387
2388 // CmpInstr is 'ADDS %vreg, 0' or 'SUBS %vreg, 0' or 'SUBS %vreg, 1'
2389 const unsigned CmpOpcode = CmpInstr.getOpcode();
2390 bool IsSubsRegImm = isSUBSRegImm(CmpOpcode);
2391 if (CmpValue && !IsSubsRegImm)
2392 return false;
2393 if (!CmpValue && !IsSubsRegImm && !isADDSRegImm(CmpOpcode))
2394 return false;
2395
2396 // MI conditions allowed: eq, ne, mi, pl
2397 UsedNZCV MIUsedNZCV = getUsedNZCV(MICC);
2398 if (MIUsedNZCV.C || MIUsedNZCV.V)
2399 return false;
2400
2401 std::optional<UsedNZCV> NZCVUsedAfterCmp =
2402 examineCFlagsUse(MI, CmpInstr, TRI, &CCUseInstrs);
2403 // Condition flags must not be used in CmpInstr's basic block successors, and
2404 // only the Z or N flags may be used after CmpInstr within its basic block.
2405 if (!NZCVUsedAfterCmp || NZCVUsedAfterCmp->C || NZCVUsedAfterCmp->V)
2406 return false;
2407 // Z or N flag used after CmpInstr must correspond to the flag used in MI
2408 if ((MIUsedNZCV.Z && NZCVUsedAfterCmp->N) ||
2409 (MIUsedNZCV.N && NZCVUsedAfterCmp->Z))
2410 return false;
2411 // If CmpInstr is a comparison to zero, MI conditions are limited to eq, ne.
2412 if (MIUsedNZCV.N && !CmpValue)
2413 return false;
2414
2415 // There must be no defs of flags between MI and CmpInstr
2416 if (areCFlagsAccessedBetweenInstrs(&MI, &CmpInstr, &TRI, AK_Write))
2417 return false;
2418
2419 // Condition code is inverted in the following cases:
2420 // 1. MI condition is ne; CmpInstr is 'ADDS %vreg, 0' or 'SUBS %vreg, 0'
2421 // 2. MI condition is eq, pl; CmpInstr is 'SUBS %vreg, 1'
2422 IsInvertCC = (CmpValue && (MICC == AArch64CC::EQ || MICC == AArch64CC::PL)) ||
2423 (!CmpValue && MICC == AArch64CC::NE);
2424 return true;
2425}
2426
2427/// Remove comparison in csinc-cmp sequence
2428///
2429/// Examples:
2430/// 1. \code
2431/// csinc w9, wzr, wzr, ne
2432/// cmp w9, #0
2433/// b.eq
2434/// \endcode
2435/// to
2436/// \code
2437/// csinc w9, wzr, wzr, ne
2438/// b.ne
2439/// \endcode
2440///
2441/// 2. \code
2442/// csinc x2, xzr, xzr, mi
2443/// cmp x2, #1
2444/// b.pl
2445/// \endcode
2446/// to
2447/// \code
2448/// csinc x2, xzr, xzr, mi
2449/// b.pl
2450/// \endcode
2451///
2452/// \param CmpInstr comparison instruction
2453/// \return True when comparison removed
2454bool AArch64InstrInfo::removeCmpToZeroOrOne(
2455 MachineInstr &CmpInstr, unsigned SrcReg, int CmpValue,
2456 const MachineRegisterInfo &MRI) const {
2457 MachineInstr *MI = MRI.getUniqueVRegDef(SrcReg);
2458 if (!MI)
2459 return false;
2460 const TargetRegisterInfo &TRI = getRegisterInfo();
2461 SmallVector<MachineInstr *, 4> CCUseInstrs;
2462 bool IsInvertCC = false;
2463 if (!canCmpInstrBeRemoved(*MI, CmpInstr, CmpValue, TRI, CCUseInstrs,
2464 IsInvertCC))
2465 return false;
2466 // Make transformation
2467 CmpInstr.eraseFromParent();
2468 if (IsInvertCC) {
2469 // Invert condition codes in CmpInstr CC users
2470 for (MachineInstr *CCUseInstr : CCUseInstrs) {
2471 int Idx = findCondCodeUseOperandIdxForBranchOrSelect(*CCUseInstr);
2472 assert(Idx >= 0 && "Unexpected instruction using CC.");
2473 MachineOperand &CCOperand = CCUseInstr->getOperand(Idx);
2474 AArch64CC::CondCode CCUse = AArch64CC::getInvertedCondCode(
2475 static_cast<AArch64CC::CondCode>(CCOperand.getImm()));
2476 CCOperand.setImm(CCUse);
2477 }
2478 }
2479 return true;
2480}
2481
2482bool AArch64InstrInfo::expandPostRAPseudo(MachineInstr &MI) const {
2483 if (MI.getOpcode() != TargetOpcode::LOAD_STACK_GUARD &&
2484 MI.getOpcode() != AArch64::CATCHRET)
2485 return false;
2486
2487 MachineBasicBlock &MBB = *MI.getParent();
2488 auto &Subtarget = MBB.getParent()->getSubtarget<AArch64Subtarget>();
2489 auto TRI = Subtarget.getRegisterInfo();
2490 DebugLoc DL = MI.getDebugLoc();
2491
2492 if (MI.getOpcode() == AArch64::CATCHRET) {
2493 // Skip to the first instruction before the epilog.
2494 const TargetInstrInfo *TII =
2495 MBB.getParent()->getSubtarget().getInstrInfo();
2496 MachineBasicBlock *TargetMBB = MI.getOperand(0).getMBB();
2497 MachineBasicBlock::iterator MBBI = MachineBasicBlock::iterator(MI);
2498 MachineBasicBlock::iterator FirstEpilogSEH = std::prev(MBBI);
2499 while (FirstEpilogSEH->getFlag(MachineInstr::FrameDestroy) &&
2500 FirstEpilogSEH != MBB.begin())
2501 FirstEpilogSEH = std::prev(FirstEpilogSEH);
2502 if (FirstEpilogSEH != MBB.begin())
2503 FirstEpilogSEH = std::next(FirstEpilogSEH);
2504 BuildMI(MBB, FirstEpilogSEH, DL, TII->get(AArch64::ADRP))
2505 .addReg(AArch64::X0, RegState::Define)
2506 .addMBB(TargetMBB);
2507 BuildMI(MBB, FirstEpilogSEH, DL, TII->get(AArch64::ADDXri))
2508 .addReg(AArch64::X0, RegState::Define)
2509 .addReg(AArch64::X0)
2510 .addMBB(TargetMBB)
2511 .addImm(0);
2512 TargetMBB->setMachineBlockAddressTaken();
2513 return true;
2514 }
2515
2516 Register Reg = MI.getOperand(0).getReg();
2517 Module &M = *MBB.getParent()->getFunction().getParent();
2518 if (M.getStackProtectorGuard() == "sysreg") {
2519 const AArch64SysReg::SysReg *SrcReg =
2520 AArch64SysReg::lookupSysRegByName(M.getStackProtectorGuardReg());
2521 if (!SrcReg)
2522 report_fatal_error("Unknown SysReg for Stack Protector Guard Register");
2523
2524 // mrs xN, sysreg
2525 BuildMI(MBB, MI, DL, get(AArch64::MRS))
2526 .addDef(Reg)
2527 .addImm(SrcReg->Encoding);
2528 int Offset = M.getStackProtectorGuardOffset();
2529 if (Offset >= 0 && Offset <= 32760 && Offset % 8 == 0) {
2530 // ldr xN, [xN, #offset]
2531 BuildMI(MBB, MI, DL, get(AArch64::LDRXui))
2532 .addDef(Reg)
2533 .addUse(Reg, RegState::Kill)
2534 .addImm(Offset / 8);
2535 } else if (Offset >= -256 && Offset <= 255) {
2536 // ldur xN, [xN, #offset]
2537 BuildMI(MBB, MI, DL, get(AArch64::LDURXi))
2538 .addDef(Reg)
2539 .addUse(Reg, RegState::Kill)
2540 .addImm(Offset);
2541 } else if (Offset >= -4095 && Offset <= 4095) {
2542 if (Offset > 0) {
2543 // add xN, xN, #offset
2544 BuildMI(MBB, MI, DL, get(AArch64::ADDXri))
2545 .addDef(Reg)
2546 .addUse(Reg, RegState::Kill)
2547 .addImm(Offset)
2548 .addImm(0);
2549 } else {
2550 // sub xN, xN, #offset
2551 BuildMI(MBB, MI, DL, get(AArch64::SUBXri))
2552 .addDef(Reg)
2553 .addUse(Reg, RegState::Kill)
2554 .addImm(-Offset)
2555 .addImm(0);
2556 }
2557 // ldr xN, [xN]
2558 BuildMI(MBB, MI, DL, get(AArch64::LDRXui))
2559 .addDef(Reg)
2560 .addUse(Reg, RegState::Kill)
2561 .addImm(0);
2562 } else {
2563 // Cases that are larger than +/- 4095 and not a multiple of 8, or larger
2564 // than 32760.
2565 // It might be nice to use AArch64::MOVi32imm here, which would get
2566 // expanded in PreSched2 after PostRA, but our lone scratch Reg already
2567 // contains the MRS result. findScratchNonCalleeSaveRegister() in
2568 // AArch64FrameLowering might help us find such a scratch register
2569 // though. If we failed to find a scratch register, we could emit a
2570 // stream of add instructions to build up the immediate. Or, we could try
2571 // to insert a AArch64::MOVi32imm before register allocation so that we
2572 // didn't need to scavenge for a scratch register.
2573 report_fatal_error("Unable to encode Stack Protector Guard Offset");
2574 }
2575 MBB.erase(MI);
2576 return true;
2577 }
2578
2579 const GlobalValue *GV =
2580 cast<GlobalValue>((*MI.memoperands_begin())->getValue());
2581 const TargetMachine &TM = MBB.getParent()->getTarget();
2582 unsigned OpFlags = Subtarget.ClassifyGlobalReference(GV, TM);
2583 const unsigned char MO_NC = AArch64II::MO_NC;
2584
2585 unsigned GuardWidth = M.getStackProtectorGuardValueWidth().value_or(
2586 Subtarget.isTargetILP32() ? 4 : 8);
2587 if (GuardWidth != 4 && GuardWidth != 8)
2588 report_fatal_error("Unsupported stack protector value width");
2589 if ((OpFlags & AArch64II::MO_GOT) != 0) {
2590 BuildMI(MBB, MI, DL, get(AArch64::LOADgot), Reg)
2591 .addGlobalAddress(GV, 0, OpFlags);
2592 if (GuardWidth == 4) {
2593 unsigned Reg32 = TRI->getSubReg(Reg, AArch64::sub_32);
2594 BuildMI(MBB, MI, DL, get(AArch64::LDRWui))
2595 .addDef(Reg32, RegState::Dead)
2596 .addUse(Reg, RegState::Kill)
2597 .addImm(0)
2598 .addMemOperand(*MI.memoperands_begin())
2599 .addDef(Reg, RegState::Implicit);
2600 } else {
2601 BuildMI(MBB, MI, DL, get(AArch64::LDRXui), Reg)
2602 .addReg(Reg, RegState::Kill)
2603 .addImm(0)
2604 .addMemOperand(*MI.memoperands_begin());
2605 }
2606 } else if (TM.getCodeModel() == CodeModel::Large) {
2607 if (GuardWidth == 4)
2608 report_fatal_error("Large code model with 4-byte stack protector not yet "
2609 "supported");
2610 BuildMI(MBB, MI, DL, get(AArch64::MOVZXi), Reg)
2611 .addGlobalAddress(GV, 0, AArch64II::MO_G0 | MO_NC)
2612 .addImm(0);
2613 BuildMI(MBB, MI, DL, get(AArch64::MOVKXi), Reg)
2614 .addReg(Reg, RegState::Kill)
2615 .addGlobalAddress(GV, 0, AArch64II::MO_G1 | MO_NC)
2616 .addImm(16);
2617 BuildMI(MBB, MI, DL, get(AArch64::MOVKXi), Reg)
2618 .addReg(Reg, RegState::Kill)
2619 .addGlobalAddress(GV, 0, AArch64II::MO_G2 | MO_NC)
2620 .addImm(32);
2621 BuildMI(MBB, MI, DL, get(AArch64::MOVKXi), Reg)
2622 .addReg(Reg, RegState::Kill)
2623 .addGlobalAddress(GV, 0, AArch64II::MO_G3)
2624 .addImm(48);
2625 BuildMI(MBB, MI, DL, get(AArch64::LDRXui), Reg)
2626 .addReg(Reg, RegState::Kill)
2627 .addImm(0)
2628 .addMemOperand(*MI.memoperands_begin());
2629 } else if (TM.getCodeModel() == CodeModel::Tiny) {
2630 // FIXME: This is computing the stack protector value as a constant
2631 // pc-relative offset, not loading it from memory. Which is maybe
2632 // an interesting compromise in some environments, but it looks like it
2633 // was done accidentally. And it probably shouldn't be tied to the
2634 // code model.
2635 BuildMI(MBB, MI, DL, get(AArch64::ADR), Reg)
2636 .addGlobalAddress(GV, 0, OpFlags);
2637 } else {
2638 BuildMI(MBB, MI, DL, get(AArch64::ADRP), Reg)
2639 .addGlobalAddress(GV, 0, OpFlags | AArch64II::MO_PAGE);
2640 unsigned char LoFlags = OpFlags | AArch64II::MO_PAGEOFF | MO_NC;
2641 if (GuardWidth == 4) {
2642 unsigned Reg32 = TRI->getSubReg(Reg, AArch64::sub_32);
2643 BuildMI(MBB, MI, DL, get(AArch64::LDRWui))
2644 .addDef(Reg32, RegState::Dead)
2645 .addUse(Reg, RegState::Kill)
2646 .addGlobalAddress(GV, 0, LoFlags)
2647 .addMemOperand(*MI.memoperands_begin())
2648 .addDef(Reg, RegState::Implicit);
2649 } else {
2650 BuildMI(MBB, MI, DL, get(AArch64::LDRXui), Reg)
2651 .addReg(Reg, RegState::Kill)
2652 .addGlobalAddress(GV, 0, LoFlags)
2653 .addMemOperand(*MI.memoperands_begin());
2654 }
2655 }
2656
2657 MBB.erase(MI);
2658
2659 return true;
2660}
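//
// As a rough sketch (assuming -mstack-protector-guard=sysreg and a small
// 8-byte-aligned guard offset), the LOAD_STACK_GUARD pseudo expands to:
//
//   mrs x0, <sysreg>
//   ldr x0, [x0, #offset]
//
// Larger offsets fall back to LDUR or an ADD/SUB of the offset followed by a
// plain LDR, as implemented above.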
2661
2662// Return true if this instruction simply sets its single destination register
2663// to zero. This is equivalent to a register rename of the zero-register.
2664bool AArch64InstrInfo::isGPRZero(const MachineInstr &MI) {
2665 switch (MI.getOpcode()) {
2666 default:
2667 break;
2668 case AArch64::MOVZWi:
2669 case AArch64::MOVZXi: // movz Rd, #0 (LSL #0)
2670 if (MI.getOperand(1).isImm() && MI.getOperand(1).getImm() == 0) {
2671 assert(MI.getDesc().getNumOperands() == 3 &&
2672 MI.getOperand(2).getImm() == 0 && "invalid MOVZi operands");
2673 return true;
2674 }
2675 break;
2676 case AArch64::ANDWri: // and Rd, Rzr, #imm
2677 return MI.getOperand(1).getReg() == AArch64::WZR;
2678 case AArch64::ANDXri:
2679 return MI.getOperand(1).getReg() == AArch64::XZR;
2680 case TargetOpcode::COPY:
2681 return MI.getOperand(1).getReg() == AArch64::WZR;
2682 }
2683 return false;
2684}
2685
2686// Return true if this instruction simply renames a general register without
2687// modifying bits.
2688bool AArch64InstrInfo::isGPRCopy(const MachineInstr &MI) {
2689 switch (MI.getOpcode()) {
2690 default:
2691 break;
2692 case TargetOpcode::COPY: {
2693 // GPR32 copies will be lowered to ORRXrs
2694 Register DstReg = MI.getOperand(0).getReg();
2695 return (AArch64::GPR32RegClass.contains(DstReg) ||
2696 AArch64::GPR64RegClass.contains(DstReg));
2697 }
2698 case AArch64::ORRXrs: // orr Xd, Xzr, Xm (LSL #0)
2699 if (MI.getOperand(1).getReg() == AArch64::XZR) {
2700 assert(MI.getDesc().getNumOperands() == 4 &&
2701 MI.getOperand(3).getImm() == 0 && "invalid ORRrs operands");
2702 return true;
2703 }
2704 break;
2705 case AArch64::ADDXri: // add Xd, Xn, #0 (LSL #0)
2706 if (MI.getOperand(2).getImm() == 0) {
2707 assert(MI.getDesc().getNumOperands() == 4 &&
2708 MI.getOperand(3).getImm() == 0 && "invalid ADDXri operands");
2709 return true;
2710 }
2711 break;
2712 }
2713 return false;
2714}
2715
2716// Return true if this instruction simply renames a floating-point register
2717// without modifying bits.
2718bool AArch64InstrInfo::isFPRCopy(const MachineInstr &MI) {
2719 switch (MI.getOpcode()) {
2720 default:
2721 break;
2722 case TargetOpcode::COPY: {
2723 Register DstReg = MI.getOperand(0).getReg();
2724 return AArch64::FPR128RegClass.contains(DstReg);
2725 }
2726 case AArch64::ORRv16i8:
2727 if (MI.getOperand(1).getReg() == MI.getOperand(2).getReg()) {
2728 assert(MI.getDesc().getNumOperands() == 3 && MI.getOperand(0).isReg() &&
2729 "invalid ORRv16i8 operands");
2730 return true;
2731 }
2732 break;
2733 }
2734 return false;
2735}
2736
2737static bool isFrameLoadOpcode(int Opcode) {
2738 switch (Opcode) {
2739 default:
2740 return false;
2741 case AArch64::LDRWui:
2742 case AArch64::LDRXui:
2743 case AArch64::LDRBui:
2744 case AArch64::LDRHui:
2745 case AArch64::LDRSui:
2746 case AArch64::LDRDui:
2747 case AArch64::LDRQui:
2748 case AArch64::LDR_PXI:
2749 return true;
2750 }
2751}
2752
2753Register AArch64InstrInfo::isLoadFromStackSlot(const MachineInstr &MI,
2754 int &FrameIndex) const {
2755 if (!isFrameLoadOpcode(MI.getOpcode()))
2756 return Register();
2757
2758 if (MI.getOperand(0).getSubReg() == 0 && MI.getOperand(1).isFI() &&
2759 MI.getOperand(2).isImm() && MI.getOperand(2).getImm() == 0) {
2760 FrameIndex = MI.getOperand(1).getIndex();
2761 return MI.getOperand(0).getReg();
2762 }
2763 return Register();
2764}
2765
2766static bool isFrameStoreOpcode(int Opcode) {
2767 switch (Opcode) {
2768 default:
2769 return false;
2770 case AArch64::STRWui:
2771 case AArch64::STRXui:
2772 case AArch64::STRBui:
2773 case AArch64::STRHui:
2774 case AArch64::STRSui:
2775 case AArch64::STRDui:
2776 case AArch64::STRQui:
2777 case AArch64::STR_PXI:
2778 return true;
2779 }
2780}
2781
2782Register AArch64InstrInfo::isStoreToStackSlot(const MachineInstr &MI,
2783 int &FrameIndex) const {
2784 if (!isFrameStoreOpcode(MI.getOpcode()))
2785 return Register();
2786
2787 if (MI.getOperand(0).getSubReg() == 0 && MI.getOperand(1).isFI() &&
2788 MI.getOperand(2).isImm() && MI.getOperand(2).getImm() == 0) {
2789 FrameIndex = MI.getOperand(1).getIndex();
2790 return MI.getOperand(0).getReg();
2791 }
2792 return Register();
2793}
2794
2795Register AArch64InstrInfo::isStoreToStackSlotPostFE(const MachineInstr &MI,
2796 int &FrameIndex) const {
2797 if (!isFrameStoreOpcode(MI.getOpcode()))
2798 return Register();
2799
2800 if (Register Reg = isStoreToStackSlot(MI, FrameIndex))
2801 return Reg;
2802
2803 SmallVector<const MachineMemOperand *, 1> Accesses;
2804 if (hasStoreToStackSlot(MI, Accesses)) {
2805 if (Accesses.size() > 1)
2806 return Register();
2807
2808 FrameIndex =
2809 cast<FixedStackPseudoSourceValue>(Accesses.front()->getPseudoValue())
2810 ->getFrameIndex();
2811 return MI.getOperand(0).getReg();
2812 }
2813 return Register();
2814}
2815
2816Register AArch64InstrInfo::isLoadFromStackSlotPostFE(const MachineInstr &MI,
2817 int &FrameIndex) const {
2818 if (!isFrameLoadOpcode(MI.getOpcode()))
2819 return Register();
2820
2821 if (Register Reg = isLoadFromStackSlot(MI, FrameIndex))
2822 return Reg;
2823
2824 SmallVector<const MachineMemOperand *, 1> Accesses;
2825 if (hasLoadFromStackSlot(MI, Accesses)) {
2826 if (Accesses.size() > 1)
2827 return Register();
2828
2829 FrameIndex =
2830 cast<FixedStackPseudoSourceValue>(Accesses.front()->getPseudoValue())
2831 ->getFrameIndex();
2832 return MI.getOperand(0).getReg();
2833 }
2834 return Register();
2835}
2836
2837/// Check all MachineMemOperands for a hint to suppress pairing.
2838bool AArch64InstrInfo::isLdStPairSuppressed(const MachineInstr &MI) {
2839 return llvm::any_of(MI.memoperands(), [](MachineMemOperand *MMO) {
2840 return MMO->getFlags() & MOSuppressPair;
2841 });
2842}
2843
2844/// Set a flag on the first MachineMemOperand to suppress pairing.
2845void AArch64InstrInfo::suppressLdStPair(MachineInstr &MI) {
2846 if (MI.memoperands_empty())
2847 return;
2848 (*MI.memoperands_begin())->setFlags(MOSuppressPair);
2849}
2850
2851/// Check all MachineMemOperands for a hint that the load/store is strided.
2852bool AArch64InstrInfo::isStridedAccess(const MachineInstr &MI) {
2853 return llvm::any_of(MI.memoperands(), [](MachineMemOperand *MMO) {
2854 return MMO->getFlags() & MOStridedAccess;
2855 });
2856}
2857
2858bool AArch64InstrInfo::hasUnscaledLdStOffset(unsigned Opc) {
2859 switch (Opc) {
2860 default:
2861 return false;
2862 case AArch64::STURSi:
2863 case AArch64::STRSpre:
2864 case AArch64::STURDi:
2865 case AArch64::STRDpre:
2866 case AArch64::STURQi:
2867 case AArch64::STRQpre:
2868 case AArch64::STURBBi:
2869 case AArch64::STURHHi:
2870 case AArch64::STURWi:
2871 case AArch64::STRWpre:
2872 case AArch64::STURXi:
2873 case AArch64::STRXpre:
2874 case AArch64::LDURSi:
2875 case AArch64::LDRSpre:
2876 case AArch64::LDURDi:
2877 case AArch64::LDRDpre:
2878 case AArch64::LDURQi:
2879 case AArch64::LDRQpre:
2880 case AArch64::LDURWi:
2881 case AArch64::LDRWpre:
2882 case AArch64::LDURXi:
2883 case AArch64::LDRXpre:
2884 case AArch64::LDRSWpre:
2885 case AArch64::LDURSWi:
2886 case AArch64::LDURHHi:
2887 case AArch64::LDURBBi:
2888 case AArch64::LDURSBWi:
2889 case AArch64::LDURSHWi:
2890 return true;
2891 }
2892}
2893
2894std::optional<unsigned> AArch64InstrInfo::getUnscaledLdSt(unsigned Opc) {
2895 switch (Opc) {
2896 default: return {};
2897 case AArch64::PRFMui: return AArch64::PRFUMi;
2898 case AArch64::LDRXui: return AArch64::LDURXi;
2899 case AArch64::LDRWui: return AArch64::LDURWi;
2900 case AArch64::LDRBui: return AArch64::LDURBi;
2901 case AArch64::LDRHui: return AArch64::LDURHi;
2902 case AArch64::LDRSui: return AArch64::LDURSi;
2903 case AArch64::LDRDui: return AArch64::LDURDi;
2904 case AArch64::LDRQui: return AArch64::LDURQi;
2905 case AArch64::LDRBBui: return AArch64::LDURBBi;
2906 case AArch64::LDRHHui: return AArch64::LDURHHi;
2907 case AArch64::LDRSBXui: return AArch64::LDURSBXi;
2908 case AArch64::LDRSBWui: return AArch64::LDURSBWi;
2909 case AArch64::LDRSHXui: return AArch64::LDURSHXi;
2910 case AArch64::LDRSHWui: return AArch64::LDURSHWi;
2911 case AArch64::LDRSWui: return AArch64::LDURSWi;
2912 case AArch64::STRXui: return AArch64::STURXi;
2913 case AArch64::STRWui: return AArch64::STURWi;
2914 case AArch64::STRBui: return AArch64::STURBi;
2915 case AArch64::STRHui: return AArch64::STURHi;
2916 case AArch64::STRSui: return AArch64::STURSi;
2917 case AArch64::STRDui: return AArch64::STURDi;
2918 case AArch64::STRQui: return AArch64::STURQi;
2919 case AArch64::STRBBui: return AArch64::STURBBi;
2920 case AArch64::STRHHui: return AArch64::STURHHi;
2921 }
2922}
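//
// For example (illustrative), LDRXui scales its immediate by the access size
// while LDURXi takes a signed byte offset, so the same address can be encoded
// either way:
//
//   ldr  x0, [x1, #16]    // LDRXui, immediate operand = 2 (16 / 8)
//   ldur x0, [x1, #16]    // LDURXi, immediate operand = 16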
2923
2923unsigned AArch64InstrInfo::getLoadStoreImmIdx(unsigned Opc) {
2924 switch (Opc) {
2926 default:
2927 llvm_unreachable("Unhandled Opcode in getLoadStoreImmIdx");
2928 case AArch64::ADDG:
2929 case AArch64::LDAPURBi:
2930 case AArch64::LDAPURHi:
2931 case AArch64::LDAPURi:
2932 case AArch64::LDAPURSBWi:
2933 case AArch64::LDAPURSBXi:
2934 case AArch64::LDAPURSHWi:
2935 case AArch64::LDAPURSHXi:
2936 case AArch64::LDAPURSWi:
2937 case AArch64::LDAPURXi:
2938 case AArch64::LDR_PPXI:
2939 case AArch64::LDR_PXI:
2940 case AArch64::LDR_ZXI:
2941 case AArch64::LDR_ZZXI:
2942 case AArch64::LDR_ZZXI_STRIDED_CONTIGUOUS:
2943 case AArch64::LDR_ZZZXI:
2944 case AArch64::LDR_ZZZZXI:
2945 case AArch64::LDR_ZZZZXI_STRIDED_CONTIGUOUS:
2946 case AArch64::LDRBBui:
2947 case AArch64::LDRBui:
2948 case AArch64::LDRDui:
2949 case AArch64::LDRHHui:
2950 case AArch64::LDRHui:
2951 case AArch64::LDRQui:
2952 case AArch64::LDRSBWui:
2953 case AArch64::LDRSBXui:
2954 case AArch64::LDRSHWui:
2955 case AArch64::LDRSHXui:
2956 case AArch64::LDRSui:
2957 case AArch64::LDRSWui:
2958 case AArch64::LDRWui:
2959 case AArch64::LDRXui:
2960 case AArch64::LDURBBi:
2961 case AArch64::LDURBi:
2962 case AArch64::LDURDi:
2963 case AArch64::LDURHHi:
2964 case AArch64::LDURHi:
2965 case AArch64::LDURQi:
2966 case AArch64::LDURSBWi:
2967 case AArch64::LDURSBXi:
2968 case AArch64::LDURSHWi:
2969 case AArch64::LDURSHXi:
2970 case AArch64::LDURSi:
2971 case AArch64::LDURSWi:
2972 case AArch64::LDURWi:
2973 case AArch64::LDURXi:
2974 case AArch64::PRFMui:
2975 case AArch64::PRFUMi:
2976 case AArch64::ST2Gi:
2977 case AArch64::STGi:
2978 case AArch64::STLURBi:
2979 case AArch64::STLURHi:
2980 case AArch64::STLURWi:
2981 case AArch64::STLURXi:
2982 case AArch64::StoreSwiftAsyncContext:
2983 case AArch64::STR_PPXI:
2984 case AArch64::STR_PXI:
2985 case AArch64::STR_ZXI:
2986 case AArch64::STR_ZZXI:
2987 case AArch64::STR_ZZXI_STRIDED_CONTIGUOUS:
2988 case AArch64::STR_ZZZXI:
2989 case AArch64::STR_ZZZZXI:
2990 case AArch64::STR_ZZZZXI_STRIDED_CONTIGUOUS:
2991 case AArch64::STRBBui:
2992 case AArch64::STRBui:
2993 case AArch64::STRDui:
2994 case AArch64::STRHHui:
2995 case AArch64::STRHui:
2996 case AArch64::STRQui:
2997 case AArch64::STRSui:
2998 case AArch64::STRWui:
2999 case AArch64::STRXui:
3000 case AArch64::STURBBi:
3001 case AArch64::STURBi:
3002 case AArch64::STURDi:
3003 case AArch64::STURHHi:
3004 case AArch64::STURHi:
3005 case AArch64::STURQi:
3006 case AArch64::STURSi:
3007 case AArch64::STURWi:
3008 case AArch64::STURXi:
3009 case AArch64::STZ2Gi:
3010 case AArch64::STZGi:
3011 case AArch64::TAGPstack:
3012 return 2;
3013 case AArch64::LD1B_D_IMM:
3014 case AArch64::LD1B_H_IMM:
3015 case AArch64::LD1B_IMM:
3016 case AArch64::LD1B_S_IMM:
3017 case AArch64::LD1D_IMM:
3018 case AArch64::LD1H_D_IMM:
3019 case AArch64::LD1H_IMM:
3020 case AArch64::LD1H_S_IMM:
3021 case AArch64::LD1RB_D_IMM:
3022 case AArch64::LD1RB_H_IMM:
3023 case AArch64::LD1RB_IMM:
3024 case AArch64::LD1RB_S_IMM:
3025 case AArch64::LD1RD_IMM:
3026 case AArch64::LD1RH_D_IMM:
3027 case AArch64::LD1RH_IMM:
3028 case AArch64::LD1RH_S_IMM:
3029 case AArch64::LD1RSB_D_IMM:
3030 case AArch64::LD1RSB_H_IMM:
3031 case AArch64::LD1RSB_S_IMM:
3032 case AArch64::LD1RSH_D_IMM:
3033 case AArch64::LD1RSH_S_IMM:
3034 case AArch64::LD1RSW_IMM:
3035 case AArch64::LD1RW_D_IMM:
3036 case AArch64::LD1RW_IMM:
3037 case AArch64::LD1SB_D_IMM:
3038 case AArch64::LD1SB_H_IMM:
3039 case AArch64::LD1SB_S_IMM:
3040 case AArch64::LD1SH_D_IMM:
3041 case AArch64::LD1SH_S_IMM:
3042 case AArch64::LD1SW_D_IMM:
3043 case AArch64::LD1W_D_IMM:
3044 case AArch64::LD1W_IMM:
3045 case AArch64::LD2B_IMM:
3046 case AArch64::LD2D_IMM:
3047 case AArch64::LD2H_IMM:
3048 case AArch64::LD2W_IMM:
3049 case AArch64::LD3B_IMM:
3050 case AArch64::LD3D_IMM:
3051 case AArch64::LD3H_IMM:
3052 case AArch64::LD3W_IMM:
3053 case AArch64::LD4B_IMM:
3054 case AArch64::LD4D_IMM:
3055 case AArch64::LD4H_IMM:
3056 case AArch64::LD4W_IMM:
3057 case AArch64::LDG:
3058 case AArch64::LDNF1B_D_IMM:
3059 case AArch64::LDNF1B_H_IMM:
3060 case AArch64::LDNF1B_IMM:
3061 case AArch64::LDNF1B_S_IMM:
3062 case AArch64::LDNF1D_IMM:
3063 case AArch64::LDNF1H_D_IMM:
3064 case AArch64::LDNF1H_IMM:
3065 case AArch64::LDNF1H_S_IMM:
3066 case AArch64::LDNF1SB_D_IMM:
3067 case AArch64::LDNF1SB_H_IMM:
3068 case AArch64::LDNF1SB_S_IMM:
3069 case AArch64::LDNF1SH_D_IMM:
3070 case AArch64::LDNF1SH_S_IMM:
3071 case AArch64::LDNF1SW_D_IMM:
3072 case AArch64::LDNF1W_D_IMM:
3073 case AArch64::LDNF1W_IMM:
3074 case AArch64::LDNPDi:
3075 case AArch64::LDNPQi:
3076 case AArch64::LDNPSi:
3077 case AArch64::LDNPWi:
3078 case AArch64::LDNPXi:
3079 case AArch64::LDNT1B_ZRI:
3080 case AArch64::LDNT1D_ZRI:
3081 case AArch64::LDNT1H_ZRI:
3082 case AArch64::LDNT1W_ZRI:
3083 case AArch64::LDPDi:
3084 case AArch64::LDPQi:
3085 case AArch64::LDPSi:
3086 case AArch64::LDPWi:
3087 case AArch64::LDPXi:
3088 case AArch64::LDRBBpost:
3089 case AArch64::LDRBBpre:
3090 case AArch64::LDRBpost:
3091 case AArch64::LDRBpre:
3092 case AArch64::LDRDpost:
3093 case AArch64::LDRDpre:
3094 case AArch64::LDRHHpost:
3095 case AArch64::LDRHHpre:
3096 case AArch64::LDRHpost:
3097 case AArch64::LDRHpre:
3098 case AArch64::LDRQpost:
3099 case AArch64::LDRQpre:
3100 case AArch64::LDRSpost:
3101 case AArch64::LDRSpre:
3102 case AArch64::LDRWpost:
3103 case AArch64::LDRWpre:
3104 case AArch64::LDRXpost:
3105 case AArch64::LDRXpre:
3106 case AArch64::ST1B_D_IMM:
3107 case AArch64::ST1B_H_IMM:
3108 case AArch64::ST1B_IMM:
3109 case AArch64::ST1B_S_IMM:
3110 case AArch64::ST1D_IMM:
3111 case AArch64::ST1H_D_IMM:
3112 case AArch64::ST1H_IMM:
3113 case AArch64::ST1H_S_IMM:
3114 case AArch64::ST1W_D_IMM:
3115 case AArch64::ST1W_IMM:
3116 case AArch64::ST2B_IMM:
3117 case AArch64::ST2D_IMM:
3118 case AArch64::ST2H_IMM:
3119 case AArch64::ST2W_IMM:
3120 case AArch64::ST3B_IMM:
3121 case AArch64::ST3D_IMM:
3122 case AArch64::ST3H_IMM:
3123 case AArch64::ST3W_IMM:
3124 case AArch64::ST4B_IMM:
3125 case AArch64::ST4D_IMM:
3126 case AArch64::ST4H_IMM:
3127 case AArch64::ST4W_IMM:
3128 case AArch64::STGPi:
3129 case AArch64::STGPreIndex:
3130 case AArch64::STZGPreIndex:
3131 case AArch64::ST2GPreIndex:
3132 case AArch64::STZ2GPreIndex:
3133 case AArch64::STGPostIndex:
3134 case AArch64::STZGPostIndex:
3135 case AArch64::ST2GPostIndex:
3136 case AArch64::STZ2GPostIndex:
3137 case AArch64::STNPDi:
3138 case AArch64::STNPQi:
3139 case AArch64::STNPSi:
3140 case AArch64::STNPWi:
3141 case AArch64::STNPXi:
3142 case AArch64::STNT1B_ZRI:
3143 case AArch64::STNT1D_ZRI:
3144 case AArch64::STNT1H_ZRI:
3145 case AArch64::STNT1W_ZRI:
3146 case AArch64::STPDi:
3147 case AArch64::STPQi:
3148 case AArch64::STPSi:
3149 case AArch64::STPWi:
3150 case AArch64::STPXi:
3151 case AArch64::STRBBpost:
3152 case AArch64::STRBBpre:
3153 case AArch64::STRBpost:
3154 case AArch64::STRBpre:
3155 case AArch64::STRDpost:
3156 case AArch64::STRDpre:
3157 case AArch64::STRHHpost:
3158 case AArch64::STRHHpre:
3159 case AArch64::STRHpost:
3160 case AArch64::STRHpre:
3161 case AArch64::STRQpost:
3162 case AArch64::STRQpre:
3163 case AArch64::STRSpost:
3164 case AArch64::STRSpre:
3165 case AArch64::STRWpost:
3166 case AArch64::STRWpre:
3167 case AArch64::STRXpost:
3168 case AArch64::STRXpre:
3169 return 3;
3170 case AArch64::LDPDpost:
3171 case AArch64::LDPDpre:
3172 case AArch64::LDPQpost:
3173 case AArch64::LDPQpre:
3174 case AArch64::LDPSpost:
3175 case AArch64::LDPSpre:
3176 case AArch64::LDPWpost:
3177 case AArch64::LDPWpre:
3178 case AArch64::LDPXpost:
3179 case AArch64::LDPXpre:
3180 case AArch64::STGPpre:
3181 case AArch64::STGPpost:
3182 case AArch64::STPDpost:
3183 case AArch64::STPDpre:
3184 case AArch64::STPQpost:
3185 case AArch64::STPQpre:
3186 case AArch64::STPSpost:
3187 case AArch64::STPSpre:
3188 case AArch64::STPWpost:
3189 case AArch64::STPWpre:
3190 case AArch64::STPXpost:
3191 case AArch64::STPXpre:
3192 return 4;
3193 }
3194}
3195
3196bool AArch64InstrInfo::isPairableLdStInst(const MachineInstr &MI) {
3197 switch (MI.getOpcode()) {
3198 default:
3199 return false;
3200 // Scaled instructions.
3201 case AArch64::STRSui:
3202 case AArch64::STRDui:
3203 case AArch64::STRQui:
3204 case AArch64::STRXui:
3205 case AArch64::STRWui:
3206 case AArch64::LDRSui:
3207 case AArch64::LDRDui:
3208 case AArch64::LDRQui:
3209 case AArch64::LDRXui:
3210 case AArch64::LDRWui:
3211 case AArch64::LDRSWui:
3212 // Unscaled instructions.
3213 case AArch64::STURSi:
3214 case AArch64::STRSpre:
3215 case AArch64::STURDi:
3216 case AArch64::STRDpre:
3217 case AArch64::STURQi:
3218 case AArch64::STRQpre:
3219 case AArch64::STURWi:
3220 case AArch64::STRWpre:
3221 case AArch64::STURXi:
3222 case AArch64::STRXpre:
3223 case AArch64::LDURSi:
3224 case AArch64::LDRSpre:
3225 case AArch64::LDURDi:
3226 case AArch64::LDRDpre:
3227 case AArch64::LDURQi:
3228 case AArch64::LDRQpre:
3229 case AArch64::LDURWi:
3230 case AArch64::LDRWpre:
3231 case AArch64::LDURXi:
3232 case AArch64::LDRXpre:
3233 case AArch64::LDURSWi:
3234 case AArch64::LDRSWpre:
3235 // SVE instructions.
3236 case AArch64::LDR_ZXI:
3237 case AArch64::STR_ZXI:
3238 return true;
3239 }
3240}
3241
3242bool AArch64InstrInfo::isTailCallReturnInst(const MachineInstr &MI) {
3243 switch (MI.getOpcode()) {
3244 default:
3245 assert((!MI.isCall() || !MI.isReturn()) &&
3246 "Unexpected instruction - was a new tail call opcode introduced?");
3247 return false;
3248 case AArch64::TCRETURNdi:
3249 case AArch64::TCRETURNri:
3250 case AArch64::TCRETURNrix16x17:
3251 case AArch64::TCRETURNrix17:
3252 case AArch64::TCRETURNrinotx16:
3253 case AArch64::TCRETURNriALL:
3254 case AArch64::AUTH_TCRETURN:
3255 case AArch64::AUTH_TCRETURN_BTI:
3256 return true;
3257 }
3258}
3259
3260unsigned AArch64InstrInfo::convertToFlagSettingOpc(unsigned Opc) {
3261 switch (Opc) {
3262 default:
3263 llvm_unreachable("Opcode has no flag setting equivalent!");
3264 // 32-bit cases:
3265 case AArch64::ADDWri:
3266 return AArch64::ADDSWri;
3267 case AArch64::ADDWrr:
3268 return AArch64::ADDSWrr;
3269 case AArch64::ADDWrs:
3270 return AArch64::ADDSWrs;
3271 case AArch64::ADDWrx:
3272 return AArch64::ADDSWrx;
3273 case AArch64::ANDWri:
3274 return AArch64::ANDSWri;
3275 case AArch64::ANDWrr:
3276 return AArch64::ANDSWrr;
3277 case AArch64::ANDWrs:
3278 return AArch64::ANDSWrs;
3279 case AArch64::BICWrr:
3280 return AArch64::BICSWrr;
3281 case AArch64::BICWrs:
3282 return AArch64::BICSWrs;
3283 case AArch64::SUBWri:
3284 return AArch64::SUBSWri;
3285 case AArch64::SUBWrr:
3286 return AArch64::SUBSWrr;
3287 case AArch64::SUBWrs:
3288 return AArch64::SUBSWrs;
3289 case AArch64::SUBWrx:
3290 return AArch64::SUBSWrx;
3291 // 64-bit cases:
3292 case AArch64::ADDXri:
3293 return AArch64::ADDSXri;
3294 case AArch64::ADDXrr:
3295 return AArch64::ADDSXrr;
3296 case AArch64::ADDXrs:
3297 return AArch64::ADDSXrs;
3298 case AArch64::ADDXrx:
3299 return AArch64::ADDSXrx;
3300 case AArch64::ANDXri:
3301 return AArch64::ANDSXri;
3302 case AArch64::ANDXrr:
3303 return AArch64::ANDSXrr;
3304 case AArch64::ANDXrs:
3305 return AArch64::ANDSXrs;
3306 case AArch64::BICXrr:
3307 return AArch64::BICSXrr;
3308 case AArch64::BICXrs:
3309 return AArch64::BICSXrs;
3310 case AArch64::SUBXri:
3311 return AArch64::SUBSXri;
3312 case AArch64::SUBXrr:
3313 return AArch64::SUBSXrr;
3314 case AArch64::SUBXrs:
3315 return AArch64::SUBSXrs;
3316 case AArch64::SUBXrx:
3317 return AArch64::SUBSXrx;
3318 // SVE instructions:
3319 case AArch64::AND_PPzPP:
3320 return AArch64::ANDS_PPzPP;
3321 case AArch64::BIC_PPzPP:
3322 return AArch64::BICS_PPzPP;
3323 case AArch64::EOR_PPzPP:
3324 return AArch64::EORS_PPzPP;
3325 case AArch64::NAND_PPzPP:
3326 return AArch64::NANDS_PPzPP;
3327 case AArch64::NOR_PPzPP:
3328 return AArch64::NORS_PPzPP;
3329 case AArch64::ORN_PPzPP:
3330 return AArch64::ORNS_PPzPP;
3331 case AArch64::ORR_PPzPP:
3332 return AArch64::ORRS_PPzPP;
3333 case AArch64::BRKA_PPzP:
3334 return AArch64::BRKAS_PPzP;
3335 case AArch64::BRKPA_PPzPP:
3336 return AArch64::BRKPAS_PPzPP;
3337 case AArch64::BRKB_PPzP:
3338 return AArch64::BRKBS_PPzP;
3339 case AArch64::BRKPB_PPzPP:
3340 return AArch64::BRKPBS_PPzPP;
3341 case AArch64::BRKN_PPzP:
3342 return AArch64::BRKNS_PPzP;
3343 case AArch64::RDFFR_PPz:
3344 return AArch64::RDFFRS_PPz;
3345 case AArch64::PTRUE_B:
3346 return AArch64::PTRUES_B;
3347 }
3348}
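//
// This mapping lets an optimization replace an instruction with its
// flag-setting twin instead of keeping a separate compare, e.g.
// (illustrative):
//
//   and  w8, w9, #0xff    // ANDWri
// becomes
//   ands w8, w9, #0xff    // ANDSWri, additionally defines NZCV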
3349
3350// Is this a candidate for ld/st merging or pairing? For example, we don't
3351// touch volatiles or load/stores that have a hint to avoid pair formation.
3352bool AArch64InstrInfo::isCandidateToMergeOrPair(const MachineInstr &MI) const {
3353
3354 bool IsPreLdSt = isPreLdSt(MI);
3355
3356 // If this is a volatile load/store, don't mess with it.
3357 if (MI.hasOrderedMemoryRef())
3358 return false;
3359
3360 // Make sure this is a reg/fi+imm (as opposed to an address reloc).
3361 // For Pre-inc LD/ST, the operand is shifted by one.
3362 assert((MI.getOperand(IsPreLdSt ? 2 : 1).isReg() ||
3363 MI.getOperand(IsPreLdSt ? 2 : 1).isFI()) &&
3364 "Expected a reg or frame index operand.");
3365
3366 // For Pre-indexed addressing quadword instructions, the third operand is the
3367 // immediate value.
3368 bool IsImmPreLdSt = IsPreLdSt && MI.getOperand(3).isImm();
3369
3370 if (!MI.getOperand(2).isImm() && !IsImmPreLdSt)
3371 return false;
3372
3373 // Can't merge/pair if the instruction modifies the base register.
3374 // e.g., ldr x0, [x0]
3375 // This case will never occur with an FI base.
3376 // However, if the instruction is an LDR<S,D,Q,W,X,SW>pre or
3377 // STR<S,D,Q,W,X>pre, it can be merged.
3378 // For example:
3379 // ldr q0, [x11, #32]!
3380 // ldr q1, [x11, #16]
3381 // to
3382 // ldp q0, q1, [x11, #32]!
3383 if (MI.getOperand(1).isReg() && !IsPreLdSt) {
3384 Register BaseReg = MI.getOperand(1).getReg();
3385 const TargetRegisterInfo *TRI = &getRegisterInfo();
3386 if (MI.modifiesRegister(BaseReg, TRI))
3387 return false;
3388 }
3389
3390 // Pairing SVE fills/spills is only valid for little-endian targets that
3391 // implement VLS 128.
3392 switch (MI.getOpcode()) {
3393 default:
3394 break;
3395 case AArch64::LDR_ZXI:
3396 case AArch64::STR_ZXI:
3397 if (!Subtarget.isLittleEndian() ||
3398 Subtarget.getSVEVectorSizeInBits() != 128)
3399 return false;
3400 }
3401
3402 // Check if this load/store has a hint to avoid pair formation.
3403 // MachineMemOperands hints are set by the AArch64StorePairSuppress pass.
3404 if (isLdStPairSuppressed(MI))
3405 return false;
3406
3407 // Do not pair any callee-save store/reload instructions in the
3408 // prologue/epilogue if the CFI information encoded the operations as separate
3409 // instructions, as that will cause the size of the actual prologue to mismatch
3410 // with the prologue size recorded in the Windows CFI.
3411 const MCAsmInfo &MAI = MI.getMF()->getTarget().getMCAsmInfo();
3412 bool NeedsWinCFI =
3413 MAI.usesWindowsCFI() && MI.getMF()->getFunction().needsUnwindTableEntry();
3414 if (NeedsWinCFI && (MI.getFlag(MachineInstr::FrameSetup) ||
3415 MI.getFlag(MachineInstr::FrameDestroy)))
3416 return false;
3417
3418 // On some CPUs quad load/store pairs are slower than two single load/stores.
3419 if (Subtarget.isPaired128Slow()) {
3420 switch (MI.getOpcode()) {
3421 default:
3422 break;
3423 case AArch64::LDURQi:
3424 case AArch64::STURQi:
3425 case AArch64::LDRQui:
3426 case AArch64::STRQui:
3427 return false;
3428 }
3429 }
3430
3431 return true;
3432}
3433
3434bool AArch64InstrInfo::getMemOperandsWithOffsetWidth(
3435 const MachineInstr &LdSt, SmallVectorImpl<const MachineOperand *> &BaseOps,
3436 int64_t &Offset, bool &OffsetIsScalable, LocationSize &Width,
3437 const TargetRegisterInfo *TRI) const {
3438 if (!LdSt.mayLoadOrStore())
3439 return false;
3440
3441 const MachineOperand *BaseOp;
3442 TypeSize WidthN(0, false);
3443 if (!getMemOperandWithOffsetWidth(LdSt, BaseOp, Offset, OffsetIsScalable,
3444 WidthN, TRI))
3445 return false;
3446 // The maximum vscale is 16 under AArch64; return the maximal extent for the
3447 // vector.
3448 Width = LocationSize::precise(WidthN);
3449 BaseOps.push_back(BaseOp);
3450 return true;
3451}
3452
3453std::optional<ExtAddrMode>
3454AArch64InstrInfo::getAddrModeFromMemoryOp(const MachineInstr &MemI,
3455 const TargetRegisterInfo *TRI) const {
3456 const MachineOperand *Base; // Filled with the base operand of MI.
3457 int64_t Offset; // Filled with the offset of MI.
3458 bool OffsetIsScalable;
3459 if (!getMemOperandWithOffset(MemI, Base, Offset, OffsetIsScalable, TRI))
3460 return std::nullopt;
3461
3462 if (!Base->isReg())
3463 return std::nullopt;
3464 ExtAddrMode AM;
3465 AM.BaseReg = Base->getReg();
3466 AM.Displacement = Offset;
3467 AM.ScaledReg = 0;
3468 AM.Scale = 0;
3469 return AM;
3470}
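//
// For example (illustrative), for a load such as:
//
//   ldr x0, [x1, #16]
//
// the returned ExtAddrMode has BaseReg = x1, Displacement = 16, and no scaled
// register (ScaledReg = 0, Scale = 0).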
3471
3472bool AArch64InstrInfo::canFoldIntoAddrMode(const MachineInstr &MemI,
3473 Register Reg,
3474 const MachineInstr &AddrI,
3475 ExtAddrMode &AM) const {
3476 // Filter out instructions into which we cannot fold.
3477 unsigned NumBytes;
3478 int64_t OffsetScale = 1;
3479 switch (MemI.getOpcode()) {
3480 default:
3481 return false;
3482
3483 case AArch64::LDURQi:
3484 case AArch64::STURQi:
3485 NumBytes = 16;
3486 break;
3487
3488 case AArch64::LDURDi:
3489 case AArch64::STURDi:
3490 case AArch64::LDURXi:
3491 case AArch64::STURXi:
3492 NumBytes = 8;
3493 break;
3494
3495 case AArch64::LDURWi:
3496 case AArch64::LDURSWi:
3497 case AArch64::STURWi:
3498 NumBytes = 4;
3499 break;
3500
3501 case AArch64::LDURHi:
3502 case AArch64::STURHi:
3503 case AArch64::LDURHHi:
3504 case AArch64::STURHHi:
3505 case AArch64::LDURSHXi:
3506 case AArch64::LDURSHWi:
3507 NumBytes = 2;
3508 break;
3509
3510 case AArch64::LDRBroX:
3511 case AArch64::LDRBBroX:
3512 case AArch64::LDRSBXroX:
3513 case AArch64::LDRSBWroX:
3514 case AArch64::STRBroX:
3515 case AArch64::STRBBroX:
3516 case AArch64::LDURBi:
3517 case AArch64::LDURBBi:
3518 case AArch64::LDURSBXi:
3519 case AArch64::LDURSBWi:
3520 case AArch64::STURBi:
3521 case AArch64::STURBBi:
3522 case AArch64::LDRBui:
3523 case AArch64::LDRBBui:
3524 case AArch64::LDRSBXui:
3525 case AArch64::LDRSBWui:
3526 case AArch64::STRBui:
3527 case AArch64::STRBBui:
3528 NumBytes = 1;
3529 break;
3530
3531 case AArch64::LDRQroX:
3532 case AArch64::STRQroX:
3533 case AArch64::LDRQui:
3534 case AArch64::STRQui:
3535 NumBytes = 16;
3536 OffsetScale = 16;
3537 break;
3538
3539 case AArch64::LDRDroX:
3540 case AArch64::STRDroX:
3541 case AArch64::LDRXroX:
3542 case AArch64::STRXroX:
3543 case AArch64::LDRDui:
3544 case AArch64::STRDui:
3545 case AArch64::LDRXui:
3546 case AArch64::STRXui:
3547 NumBytes = 8;
3548 OffsetScale = 8;
3549 break;
3550
3551 case AArch64::LDRWroX:
3552 case AArch64::LDRSWroX:
3553 case AArch64::STRWroX:
3554 case AArch64::LDRWui:
3555 case AArch64::LDRSWui:
3556 case AArch64::STRWui:
3557 NumBytes = 4;
3558 OffsetScale = 4;
3559 break;
3560
3561 case AArch64::LDRHroX:
3562 case AArch64::STRHroX:
3563 case AArch64::LDRHHroX:
3564 case AArch64::STRHHroX:
3565 case AArch64::LDRSHXroX:
3566 case AArch64::LDRSHWroX:
3567 case AArch64::LDRHui:
3568 case AArch64::STRHui:
3569 case AArch64::LDRHHui:
3570 case AArch64::STRHHui:
3571 case AArch64::LDRSHXui:
3572 case AArch64::LDRSHWui:
3573 NumBytes = 2;
3574 OffsetScale = 2;
3575 break;
3576 }
3577
3578 // Check the fold operand is not the loaded/stored value.
3579 const MachineOperand &BaseRegOp = MemI.getOperand(0);
3580 if (BaseRegOp.isReg() && BaseRegOp.getReg() == Reg)
3581 return false;
3582
3583 // Handle memory instructions with a [Reg, Reg] addressing mode.
3584 if (MemI.getOperand(2).isReg()) {
3585 // Bail if the addressing mode already includes extension of the offset
3586 // register.
3587 if (MemI.getOperand(3).getImm())
3588 return false;
3589
3590 // Check if we actually have a scaled offset.
3591 if (MemI.getOperand(4).getImm() == 0)
3592 OffsetScale = 1;
3593
3594 // If the address instruction is folded into the base register, then the
3595 // addressing mode must not have a scale. Then we can swap the base and the
3596 // scaled registers.
3597 if (MemI.getOperand(1).getReg() == Reg && OffsetScale != 1)
3598 return false;
3599
3600 switch (AddrI.getOpcode()) {
3601 default:
3602 return false;
3603
3604 case AArch64::SBFMXri:
3605 // sxtw Xa, Wm
3606 // ldr Xd, [Xn, Xa, lsl #N]
3607 // ->
3608 // ldr Xd, [Xn, Wm, sxtw #N]
3609 if (AddrI.getOperand(2).getImm() != 0 ||
3610 AddrI.getOperand(3).getImm() != 31)
3611 return false;
3612
3613 AM.BaseReg = MemI.getOperand(1).getReg();
3614 if (AM.BaseReg == Reg)
3615 AM.BaseReg = MemI.getOperand(2).getReg();
3616 AM.ScaledReg = AddrI.getOperand(1).getReg();
3617 AM.Scale = OffsetScale;
3618 AM.Displacement = 0;
3619 AM.Form = ExtAddrMode::Formula::SExtScaledReg;
3620 return true;
3621
3622 case TargetOpcode::SUBREG_TO_REG: {
3623 // mov Wa, Wm
3624 // ldr Xd, [Xn, Xa, lsl #N]
3625 // ->
3626 // ldr Xd, [Xn, Wm, uxtw #N]
3627
3628 // Zero-extension looks like an ORRWrs followed by a SUBREG_TO_REG.
3629 if (AddrI.getOperand(2).getImm() != AArch64::sub_32)
3630 return false;
3631
3632 const MachineRegisterInfo &MRI = AddrI.getMF()->getRegInfo();
3633 Register OffsetReg = AddrI.getOperand(1).getReg();
3634 if (!OffsetReg.isVirtual() || !MRI.hasOneNonDBGUse(OffsetReg))
3635 return false;
3636
3637 const MachineInstr &DefMI = *MRI.getVRegDef(OffsetReg);
3638 if (DefMI.getOpcode() != AArch64::ORRWrs ||
3639 DefMI.getOperand(1).getReg() != AArch64::WZR ||
3640 DefMI.getOperand(3).getImm() != 0)
3641 return false;
3642
3643 AM.BaseReg = MemI.getOperand(1).getReg();
3644 if (AM.BaseReg == Reg)
3645 AM.BaseReg = MemI.getOperand(2).getReg();
3646 AM.ScaledReg = DefMI.getOperand(2).getReg();
3647 AM.Scale = OffsetScale;
3648 AM.Displacement = 0;
3649 AM.Form = ExtAddrMode::Formula::ZExtScaledReg;
3650 return true;
3651 }
3652 }
3653 }
3654
3655 // Handle memory instructions with a [Reg, #Imm] addressing mode.
3656
3657 // Check we are not breaking a potential conversion to an LDP.
3658 auto validateOffsetForLDP = [](unsigned NumBytes, int64_t OldOffset,
3659 int64_t NewOffset) -> bool {
3660 int64_t MinOffset, MaxOffset;
3661 switch (NumBytes) {
3662 default:
3663 return true;
3664 case 4:
3665 MinOffset = -256;
3666 MaxOffset = 252;
3667 break;
3668 case 8:
3669 MinOffset = -512;
3670 MaxOffset = 504;
3671 break;
3672 case 16:
3673 MinOffset = -1024;
3674 MaxOffset = 1008;
3675 break;
3676 }
3677 return OldOffset < MinOffset || OldOffset > MaxOffset ||
3678 (NewOffset >= MinOffset && NewOffset <= MaxOffset);
3679 };
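// A worked example of the check above (values are illustrative): for an
// 8-byte access the LDP immediate window is [-512, 504] bytes. An old offset
// of 40 lies inside it, so a fold pushing the new offset to, say, 4096 is
// rejected; an old offset of 4104 was never LDP-able, so any otherwise legal
// new offset is accepted.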
3680 auto canFoldAddSubImmIntoAddrMode = [&](int64_t Disp) -> bool {
3681 int64_t OldOffset = MemI.getOperand(2).getImm() * OffsetScale;
3682 int64_t NewOffset = OldOffset + Disp;
3683 if (!isLegalAddressingMode(NumBytes, NewOffset, /* Scale */ 0))
3684 return false;
3685 // If the old offset would fit into an LDP, but the new offset wouldn't,
3686 // bail out.
3687 if (!validateOffsetForLDP(NumBytes, OldOffset, NewOffset))
3688 return false;
3689 AM.BaseReg = AddrI.getOperand(1).getReg();
3690 AM.ScaledReg = 0;
3691 AM.Scale = 0;
3692 AM.Displacement = NewOffset;
3693 AM.Form = ExtAddrMode::Formula::Basic;
3694 return true;
3695 };
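// For instance (hypothetical registers): folding "add x1, x0, #16" into
// "ldr x2, [x1, #8]" calls this with Disp == 16; the old byte offset is
// 1 * OffsetScale == 8, so the result is AM == {BaseReg x0, Displacement 24},
// i.e. "ldr x2, [x0, #24]". Both 8 and 24 stay inside the LDP window above,
// so the fold is allowed.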
3696
3697 auto canFoldAddRegIntoAddrMode =
3698 [&](int64_t Scale,
3699 ExtAddrMode::Formula Form = ExtAddrMode::Formula::Basic) -> bool {
3700 if (MemI.getOperand(2).getImm() != 0)
3701 return false;
3702 if ((unsigned)Scale != Scale)
3703 return false;
3704 if (!isLegalAddressingMode(NumBytes, /* Offset */ 0, Scale))
3705 return false;
3706 AM.BaseReg = AddrI.getOperand(1).getReg();
3707 AM.ScaledReg = AddrI.getOperand(2).getReg();
3708 AM.Scale = Scale;
3709 AM.Displacement = 0;
3710 AM.Form = Form;
3711 return true;
3712 };
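// For instance (hypothetical registers): folding "add x1, x0, x2, lsl #3"
// into "ldr x3, [x1]" calls this with Scale == 8 and yields AM ==
// {BaseReg x0, ScaledReg x2, Scale 8, Displacement 0}, i.e. the
// register-offset form "ldr x3, [x0, x2, lsl #3]".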
3713
3714 auto avoidSlowSTRQ = [&](const MachineInstr &MemI) {
3715 unsigned Opcode = MemI.getOpcode();
3716 return (Opcode == AArch64::STURQi || Opcode == AArch64::STRQui) &&
3717 Subtarget.isSTRQroSlow();
3718 };
3719
3720 int64_t Disp = 0;
3721 const bool OptSize = MemI.getMF()->getFunction().hasOptSize();
3722 switch (AddrI.getOpcode()) {
3723 default:
3724 return false;
3725
3726 case AArch64::ADDXri:
3727 // add Xa, Xn, #N
3728 // ldr Xd, [Xa, #M]
3729 // ->
3730 // ldr Xd, [Xn, #N'+M]
3731 Disp = AddrI.getOperand(2).getImm() << AddrI.getOperand(3).getImm();
3732 return canFoldAddSubImmIntoAddrMode(Disp);
3733
3734 case AArch64::SUBXri:
3735 // sub Xa, Xn, #N
3736 // ldr Xd, [Xa, #M]
3737 // ->
3738 // ldr Xd, [Xn, #N'+M]
3739 Disp = AddrI.getOperand(2).getImm() << AddrI.getOperand(3).getImm();
3740 return canFoldAddSubImmIntoAddrMode(-Disp);
3741
3742 case AArch64::ADDXrs: {
3743 // add Xa, Xn, Xm, lsl #N
3744 // ldr Xd, [Xa]
3745 // ->
3746 // ldr Xd, [Xn, Xm, lsl #N]
3747
3748 // Don't fold the add if the result would be slower, unless optimising for
3749 // size.
3750 unsigned Shift = static_cast<unsigned>(AddrI.getOperand(3).getImm());
3751 if (AArch64_AM::getShiftType(Shift) != AArch64_AM::LSL)
3752 return false;
3753 Shift = AArch64_AM::getShiftValue(Shift);
3754 if (!OptSize) {
3755 if (Shift != 2 && Shift != 3 && Subtarget.hasAddrLSLSlow14())
3756 return false;
3757 if (avoidSlowSTRQ(MemI))
3758 return false;
3759 }
3760 return canFoldAddRegIntoAddrMode(1ULL << Shift);
3761 }
3762
3763 case AArch64::ADDXrr:
3764 // add Xa, Xn, Xm
3765 // ldr Xd, [Xa]
3766 // ->
3767 // ldr Xd, [Xn, Xm, lsl #0]
3768
3769 // Don't fold the add if the result would be slower, unless optimising for
3770 // size.
3771 if (!OptSize && avoidSlowSTRQ(MemI))
3772 return false;
3773 return canFoldAddRegIntoAddrMode(1);
3774
3775 case AArch64::ADDXrx:
3776 // add Xa, Xn, Wm, {s,u}xtw #N
3777 // ldr Xd, [Xa]
3778 // ->
3779 // ldr Xd, [Xn, Wm, {s,u}xtw #N]
3780
3781 // Don't fold the add if the result would be slower, unless optimising for
3782 // size.
3783 if (!OptSize && avoidSlowSTRQ(MemI))
3784 return false;
3785
3786 // Can fold only sign-/zero-extend of a word.
3787 unsigned Imm = static_cast<unsigned>(AddrI.getOperand(3).getImm());
3788 AArch64_AM::ShiftExtendType Extend = AArch64_AM::getArithExtendType(Imm);
3789 if (Extend != AArch64_AM::UXTW && Extend != AArch64_AM::SXTW)
3790 return false;
3791
3792 return canFoldAddRegIntoAddrMode(
3793 1ULL << AArch64_AM::getArithShiftValue(Imm),
3794 (Extend == AArch64_AM::SXTW) ? ExtAddrMode::Formula::SExtScaledReg
3795 : ExtAddrMode::Formula::ZExtScaledReg);
3796 }
3797}
3798
3799// Given an opcode for an instruction with a [Reg, #Imm] addressing mode,
3800// return the opcode of an instruction performing the same operation, but using
3801// the [Reg, Reg] addressing mode.
3802static unsigned regOffsetOpcode(unsigned Opcode) {
3803 switch (Opcode) {
3804 default:
3805 llvm_unreachable("Address folding not implemented for instruction");
3806
3807 case AArch64::LDURQi:
3808 case AArch64::LDRQui:
3809 return AArch64::LDRQroX;
3810 case AArch64::STURQi:
3811 case AArch64::STRQui:
3812 return AArch64::STRQroX;
3813 case AArch64::LDURDi:
3814 case AArch64::LDRDui:
3815 return AArch64::LDRDroX;
3816 case AArch64::STURDi:
3817 case AArch64::STRDui:
3818 return AArch64::STRDroX;
3819 case AArch64::LDURXi:
3820 case AArch64::LDRXui:
3821 return AArch64::LDRXroX;
3822 case AArch64::STURXi:
3823 case AArch64::STRXui:
3824 return AArch64::STRXroX;
3825 case AArch64::LDURWi:
3826 case AArch64::LDRWui:
3827 return AArch64::LDRWroX;
3828 case AArch64::LDURSWi:
3829 case AArch64::LDRSWui:
3830 return AArch64::LDRSWroX;
3831 case AArch64::STURWi:
3832 case AArch64::STRWui:
3833 return AArch64::STRWroX;
3834 case AArch64::LDURHi:
3835 case AArch64::LDRHui:
3836 return AArch64::LDRHroX;
3837 case AArch64::STURHi:
3838 case AArch64::STRHui:
3839 return AArch64::STRHroX;
3840 case AArch64::LDURHHi:
3841 case AArch64::LDRHHui:
3842 return AArch64::LDRHHroX;
3843 case AArch64::STURHHi:
3844 case AArch64::STRHHui:
3845 return AArch64::STRHHroX;
3846 case AArch64::LDURSHXi:
3847 case AArch64::LDRSHXui:
3848 return AArch64::LDRSHXroX;
3849 case AArch64::LDURSHWi:
3850 case AArch64::LDRSHWui:
3851 return AArch64::LDRSHWroX;
3852 case AArch64::LDURBi:
3853 case AArch64::LDRBui:
3854 return AArch64::LDRBroX;
3855 case AArch64::LDURBBi:
3856 case AArch64::LDRBBui:
3857 return AArch64::LDRBBroX;
3858 case AArch64::LDURSBXi:
3859 case AArch64::LDRSBXui:
3860 return AArch64::LDRSBXroX;
3861 case AArch64::LDURSBWi:
3862 case AArch64::LDRSBWui:
3863 return AArch64::LDRSBWroX;
3864 case AArch64::STURBi:
3865 case AArch64::STRBui:
3866 return AArch64::STRBroX;
3867 case AArch64::STURBBi:
3868 case AArch64::STRBBui:
3869 return AArch64::STRBBroX;
3870 }
3871}
3872
3873// Given an opcode for an instruction with a [Reg, #Imm] addressing mode, return
3874// the opcode of an instruction performing the same operation, but using the
3875// [Reg, #Imm] addressing mode with scaled offset.
3876unsigned scaledOffsetOpcode(unsigned Opcode, unsigned &Scale) {
3877 switch (Opcode) {
3878 default:
3879 llvm_unreachable("Address folding not implemented for instruction");
3880
3881 case AArch64::LDURQi:
3882 Scale = 16;
3883 return AArch64::LDRQui;
3884 case AArch64::STURQi:
3885 Scale = 16;
3886 return AArch64::STRQui;
3887 case AArch64::LDURDi:
3888 Scale = 8;
3889 return AArch64::LDRDui;
3890 case AArch64::STURDi:
3891 Scale = 8;
3892 return AArch64::STRDui;
3893 case AArch64::LDURXi:
3894 Scale = 8;
3895 return AArch64::LDRXui;
3896 case AArch64::STURXi:
3897 Scale = 8;
3898 return AArch64::STRXui;
3899 case AArch64::LDURWi:
3900 Scale = 4;
3901 return AArch64::LDRWui;
3902 case AArch64::LDURSWi:
3903 Scale = 4;
3904 return AArch64::LDRSWui;
3905 case AArch64::STURWi:
3906 Scale = 4;
3907 return AArch64::STRWui;
3908 case AArch64::LDURHi:
3909 Scale = 2;
3910 return AArch64::LDRHui;
3911 case AArch64::STURHi:
3912 Scale = 2;
3913 return AArch64::STRHui;
3914 case AArch64::LDURHHi:
3915 Scale = 2;
3916 return AArch64::LDRHHui;
3917 case AArch64::STURHHi:
3918 Scale = 2;
3919 return AArch64::STRHHui;
3920 case AArch64::LDURSHXi:
3921 Scale = 2;
3922 return AArch64::LDRSHXui;
3923 case AArch64::LDURSHWi:
3924 Scale = 2;
3925 return AArch64::LDRSHWui;
3926 case AArch64::LDURBi:
3927 Scale = 1;
3928 return AArch64::LDRBui;
3929 case AArch64::LDURBBi:
3930 Scale = 1;
3931 return AArch64::LDRBBui;
3932 case AArch64::LDURSBXi:
3933 Scale = 1;
3934 return AArch64::LDRSBXui;
3935 case AArch64::LDURSBWi:
3936 Scale = 1;
3937 return AArch64::LDRSBWui;
3938 case AArch64::STURBi:
3939 Scale = 1;
3940 return AArch64::STRBui;
3941 case AArch64::STURBBi:
3942 Scale = 1;
3943 return AArch64::STRBBui;
3944 case AArch64::LDRQui:
3945 case AArch64::STRQui:
3946 Scale = 16;
3947 return Opcode;
3948 case AArch64::LDRDui:
3949 case AArch64::STRDui:
3950 case AArch64::LDRXui:
3951 case AArch64::STRXui:
3952 Scale = 8;
3953 return Opcode;
3954 case AArch64::LDRWui:
3955 case AArch64::LDRSWui:
3956 case AArch64::STRWui:
3957 Scale = 4;
3958 return Opcode;
3959 case AArch64::LDRHui:
3960 case AArch64::STRHui:
3961 case AArch64::LDRHHui:
3962 case AArch64::STRHHui:
3963 case AArch64::LDRSHXui:
3964 case AArch64::LDRSHWui:
3965 Scale = 2;
3966 return Opcode;
3967 case AArch64::LDRBui:
3968 case AArch64::LDRBBui:
3969 case AArch64::LDRSBXui:
3970 case AArch64::LDRSBWui:
3971 case AArch64::STRBui:
3972 case AArch64::STRBBui:
3973 Scale = 1;
3974 return Opcode;
3975 }
3976}
3977
3978// Given an opcode for an instruction with a [Reg, #Imm] addressing mode, return
3979// the opcode of an instruction performing the same operation, but using the
3980// [Reg, #Imm] addressing mode with unscaled offset.
3981unsigned unscaledOffsetOpcode(unsigned Opcode) {
3982 switch (Opcode) {
3983 default:
3984 llvm_unreachable("Address folding not implemented for instruction");
3985
3986 case AArch64::LDURQi:
3987 case AArch64::STURQi:
3988 case AArch64::LDURDi:
3989 case AArch64::STURDi:
3990 case AArch64::LDURXi:
3991 case AArch64::STURXi:
3992 case AArch64::LDURWi:
3993 case AArch64::LDURSWi:
3994 case AArch64::STURWi:
3995 case AArch64::LDURHi:
3996 case AArch64::STURHi:
3997 case AArch64::LDURHHi:
3998 case AArch64::STURHHi:
3999 case AArch64::LDURSHXi:
4000 case AArch64::LDURSHWi:
4001 case AArch64::LDURBi:
4002 case AArch64::STURBi:
4003 case AArch64::LDURBBi:
4004 case AArch64::STURBBi:
4005 case AArch64::LDURSBWi:
4006 case AArch64::LDURSBXi:
4007 return Opcode;
4008 case AArch64::LDRQui:
4009 return AArch64::LDURQi;
4010 case AArch64::STRQui:
4011 return AArch64::STURQi;
4012 case AArch64::LDRDui:
4013 return AArch64::LDURDi;
4014 case AArch64::STRDui:
4015 return AArch64::STURDi;
4016 case AArch64::LDRXui:
4017 return AArch64::LDURXi;
4018 case AArch64::STRXui:
4019 return AArch64::STURXi;
4020 case AArch64::LDRWui:
4021 return AArch64::LDURWi;
4022 case AArch64::LDRSWui:
4023 return AArch64::LDURSWi;
4024 case AArch64::STRWui:
4025 return AArch64::STURWi;
4026 case AArch64::LDRHui:
4027 return AArch64::LDURHi;
4028 case AArch64::STRHui:
4029 return AArch64::STURHi;
4030 case AArch64::LDRHHui:
4031 return AArch64::LDURHHi;
4032 case AArch64::STRHHui:
4033 return AArch64::STURHHi;
4034 case AArch64::LDRSHXui:
4035 return AArch64::LDURSHXi;
4036 case AArch64::LDRSHWui:
4037 return AArch64::LDURSHWi;
4038 case AArch64::LDRBBui:
4039 return AArch64::LDURBBi;
4040 case AArch64::LDRBui:
4041 return AArch64::LDURBi;
4042 case AArch64::STRBBui:
4043 return AArch64::STURBBi;
4044 case AArch64::STRBui:
4045 return AArch64::STURBi;
4046 case AArch64::LDRSBWui:
4047 return AArch64::LDURSBWi;
4048 case AArch64::LDRSBXui:
4049 return AArch64::LDURSBXi;
4050 }
4051}
4052
4053// Given the opcode of a memory load/store instruction, return the opcode of an
4054// instruction performing the same operation, but using
4055// the [Reg, Reg, {s,u}xtw #N] addressing mode with sign-/zero-extend of the
4056// offset register.
4057static unsigned offsetExtendOpcode(unsigned Opcode) {
4058 switch (Opcode) {
4059 default:
4060 llvm_unreachable("Address folding not implemented for instruction");
4061
4062 case AArch64::LDRQroX:
4063 case AArch64::LDURQi:
4064 case AArch64::LDRQui:
4065 return AArch64::LDRQroW;
4066 case AArch64::STRQroX:
4067 case AArch64::STURQi:
4068 case AArch64::STRQui:
4069 return AArch64::STRQroW;
4070 case AArch64::LDRDroX:
4071 case AArch64::LDURDi:
4072 case AArch64::LDRDui:
4073 return AArch64::LDRDroW;
4074 case AArch64::STRDroX:
4075 case AArch64::STURDi:
4076 case AArch64::STRDui:
4077 return AArch64::STRDroW;
4078 case AArch64::LDRXroX:
4079 case AArch64::LDURXi:
4080 case AArch64::LDRXui:
4081 return AArch64::LDRXroW;
4082 case AArch64::STRXroX:
4083 case AArch64::STURXi:
4084 case AArch64::STRXui:
4085 return AArch64::STRXroW;
4086 case AArch64::LDRWroX:
4087 case AArch64::LDURWi:
4088 case AArch64::LDRWui:
4089 return AArch64::LDRWroW;
4090 case AArch64::LDRSWroX:
4091 case AArch64::LDURSWi:
4092 case AArch64::LDRSWui:
4093 return AArch64::LDRSWroW;
4094 case AArch64::STRWroX:
4095 case AArch64::STURWi:
4096 case AArch64::STRWui:
4097 return AArch64::STRWroW;
4098 case AArch64::LDRHroX:
4099 case AArch64::LDURHi:
4100 case AArch64::LDRHui:
4101 return AArch64::LDRHroW;
4102 case AArch64::STRHroX:
4103 case AArch64::STURHi:
4104 case AArch64::STRHui:
4105 return AArch64::STRHroW;
4106 case AArch64::LDRHHroX:
4107 case AArch64::LDURHHi:
4108 case AArch64::LDRHHui:
4109 return AArch64::LDRHHroW;
4110 case AArch64::STRHHroX:
4111 case AArch64::STURHHi:
4112 case AArch64::STRHHui:
4113 return AArch64::STRHHroW;
4114 case AArch64::LDRSHXroX:
4115 case AArch64::LDURSHXi:
4116 case AArch64::LDRSHXui:
4117 return AArch64::LDRSHXroW;
4118 case AArch64::LDRSHWroX:
4119 case AArch64::LDURSHWi:
4120 case AArch64::LDRSHWui:
4121 return AArch64::LDRSHWroW;
4122 case AArch64::LDRBroX:
4123 case AArch64::LDURBi:
4124 case AArch64::LDRBui:
4125 return AArch64::LDRBroW;
4126 case AArch64::LDRBBroX:
4127 case AArch64::LDURBBi:
4128 case AArch64::LDRBBui:
4129 return AArch64::LDRBBroW;
4130 case AArch64::LDRSBXroX:
4131 case AArch64::LDURSBXi:
4132 case AArch64::LDRSBXui:
4133 return AArch64::LDRSBXroW;
4134 case AArch64::LDRSBWroX:
4135 case AArch64::LDURSBWi:
4136 case AArch64::LDRSBWui:
4137 return AArch64::LDRSBWroW;
4138 case AArch64::STRBroX:
4139 case AArch64::STURBi:
4140 case AArch64::STRBui:
4141 return AArch64::STRBroW;
4142 case AArch64::STRBBroX:
4143 case AArch64::STURBBi:
4144 case AArch64::STRBBui:
4145 return AArch64::STRBBroW;
4146 }
4147}
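// For instance, offsetExtendOpcode(AArch64::LDRXroX) == AArch64::LDRXroW, so
// "ldr x0, [x1, x2, lsl #3]" becomes "ldr x0, [x1, w2, sxtw #3]" (or
// uxtw #3) once the sign-/zero-extend of the offset register is folded into
// the addressing mode.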
4148
4149 MachineInstr *AArch64InstrInfo::emitLdStWithAddr(MachineInstr &MemI,
4150 const ExtAddrMode &AM) const {
4151
4152 const DebugLoc &DL = MemI.getDebugLoc();
4153 MachineBasicBlock &MBB = *MemI.getParent();
4154 MachineRegisterInfo &MRI = MemI.getMF()->getRegInfo();
4155
4156 if (AM.Form == ExtAddrMode::Formula::Basic) {
4157 if (AM.ScaledReg) {
4158 // The new instruction will be in the form `ldr Rt, [Xn, Xm, lsl #imm]`.
4159 unsigned Opcode = regOffsetOpcode(MemI.getOpcode());
4160 MRI.constrainRegClass(AM.BaseReg, &AArch64::GPR64spRegClass);
4161 auto B = BuildMI(MBB, MemI, DL, get(Opcode))
4162 .addReg(MemI.getOperand(0).getReg(),
4163 getDefRegState(MemI.mayLoad()))
4164 .addReg(AM.BaseReg)
4165 .addReg(AM.ScaledReg)
4166 .addImm(0)
4167 .addImm(AM.Scale > 1)
4168 .setMemRefs(MemI.memoperands())
4169 .setMIFlags(MemI.getFlags());
4170 return B.getInstr();
4171 }
4172
4173 assert(AM.ScaledReg == 0 && AM.Scale == 0 &&
4174 "Addressing mode not supported for folding");
4175
4176 // The new instruction will be in the form `ld[u]r Rt, [Xn, #imm]`.
4177 unsigned Scale = 1;
4178 unsigned Opcode = MemI.getOpcode();
4179 if (isInt<9>(AM.Displacement))
4180 Opcode = unscaledOffsetOpcode(Opcode);
4181 else
4182 Opcode = scaledOffsetOpcode(Opcode, Scale);
4183
4184 auto B =
4185 BuildMI(MBB, MemI, DL, get(Opcode))
4186 .addReg(MemI.getOperand(0).getReg(), getDefRegState(MemI.mayLoad()))
4187 .addReg(AM.BaseReg)
4188 .addImm(AM.Displacement / Scale)
4189 .setMemRefs(MemI.memoperands())
4190 .setMIFlags(MemI.getFlags());
4191 return B.getInstr();
4192 }
4193
4194 if (AM.Form == ExtAddrMode::Formula::SExtScaledReg ||
4195 AM.Form == ExtAddrMode::Formula::ZExtScaledReg) {
4196 // The new instruction will be in the form `ldr Rt, [Xn, Wm, {s,u}xtw #N]`.
4197 assert(AM.ScaledReg && !AM.Displacement &&
4198 "Address offset can be a register or an immediate, but not both");
4199 unsigned Opcode = offsetExtendOpcode(MemI.getOpcode());
4200 MRI.constrainRegClass(AM.BaseReg, &AArch64::GPR64spRegClass);
4201 // Make sure the offset register is in the correct register class.
4202 Register OffsetReg = AM.ScaledReg;
4203 const TargetRegisterClass *RC = MRI.getRegClass(OffsetReg);
4204 if (RC->hasSuperClassEq(&AArch64::GPR64RegClass)) {
4205 OffsetReg = MRI.createVirtualRegister(&AArch64::GPR32RegClass);
4206 BuildMI(MBB, MemI, DL, get(TargetOpcode::COPY), OffsetReg)
4207 .addReg(AM.ScaledReg, {}, AArch64::sub_32);
4208 }
4209 auto B =
4210 BuildMI(MBB, MemI, DL, get(Opcode))
4211 .addReg(MemI.getOperand(0).getReg(), getDefRegState(MemI.mayLoad()))
4212 .addReg(AM.BaseReg)
4213 .addReg(OffsetReg)
4214 .addImm(AM.Form == ExtAddrMode::Formula::SExtScaledReg)
4215 .addImm(AM.Scale != 1)
4216 .setMemRefs(MemI.memoperands())
4217 .setMIFlags(MemI.getFlags());
4218
4219 return B.getInstr();
4220 }
4221
4223 "Function must not be called with an addressing mode it can't handle");
4224}
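// As a sketch of the whole fold (hypothetical registers): for
//   add x1, x0, #16
//   ldr x2, [x1, #8]
// canFoldIntoAddrMode yields AM == {BaseReg x0, Displacement 24}, and since
// 24 fits in a signed 9-bit immediate this emits the unscaled form
// "ldur x2, [x0, #24]"; a displacement of e.g. 4096 would instead use
// LDRXui with the scaled immediate 4096 / 8 == 512.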
4225
4226 /// Return true if the opcode is a post-index ld/st instruction, which really
4227 /// accesses its memory operand at base+0.
4228static bool isPostIndexLdStOpcode(unsigned Opcode) {
4229 switch (Opcode) {
4230 default:
4231 return false;
4232 case AArch64::LD1Fourv16b_POST:
4233 case AArch64::LD1Fourv1d_POST:
4234 case AArch64::LD1Fourv2d_POST:
4235 case AArch64::LD1Fourv2s_POST:
4236 case AArch64::LD1Fourv4h_POST:
4237 case AArch64::LD1Fourv4s_POST:
4238 case AArch64::LD1Fourv8b_POST:
4239 case AArch64::LD1Fourv8h_POST:
4240 case AArch64::LD1Onev16b_POST:
4241 case AArch64::LD1Onev1d_POST:
4242 case AArch64::LD1Onev2d_POST:
4243 case AArch64::LD1Onev2s_POST:
4244 case AArch64::LD1Onev4h_POST:
4245 case AArch64::LD1Onev4s_POST:
4246 case AArch64::LD1Onev8b_POST:
4247 case AArch64::LD1Onev8h_POST:
4248 case AArch64::LD1Rv16b_POST:
4249 case AArch64::LD1Rv1d_POST:
4250 case AArch64::LD1Rv2d_POST:
4251 case AArch64::LD1Rv2s_POST:
4252 case AArch64::LD1Rv4h_POST:
4253 case AArch64::LD1Rv4s_POST:
4254 case AArch64::LD1Rv8b_POST:
4255 case AArch64::LD1Rv8h_POST:
4256 case AArch64::LD1Threev16b_POST:
4257 case AArch64::LD1Threev1d_POST:
4258 case AArch64::LD1Threev2d_POST:
4259 case AArch64::LD1Threev2s_POST:
4260 case AArch64::LD1Threev4h_POST:
4261 case AArch64::LD1Threev4s_POST:
4262 case AArch64::LD1Threev8b_POST:
4263 case AArch64::LD1Threev8h_POST:
4264 case AArch64::LD1Twov16b_POST:
4265 case AArch64::LD1Twov1d_POST:
4266 case AArch64::LD1Twov2d_POST:
4267 case AArch64::LD1Twov2s_POST:
4268 case AArch64::LD1Twov4h_POST:
4269 case AArch64::LD1Twov4s_POST:
4270 case AArch64::LD1Twov8b_POST:
4271 case AArch64::LD1Twov8h_POST:
4272 case AArch64::LD1i16_POST:
4273 case AArch64::LD1i32_POST:
4274 case AArch64::LD1i64_POST:
4275 case AArch64::LD1i8_POST:
4276 case AArch64::LD2Rv16b_POST:
4277 case AArch64::LD2Rv1d_POST:
4278 case AArch64::LD2Rv2d_POST:
4279 case AArch64::LD2Rv2s_POST:
4280 case AArch64::LD2Rv4h_POST:
4281 case AArch64::LD2Rv4s_POST:
4282 case AArch64::LD2Rv8b_POST:
4283 case AArch64::LD2Rv8h_POST:
4284 case AArch64::LD2Twov16b_POST:
4285 case AArch64::LD2Twov2d_POST:
4286 case AArch64::LD2Twov2s_POST:
4287 case AArch64::LD2Twov4h_POST:
4288 case AArch64::LD2Twov4s_POST:
4289 case AArch64::LD2Twov8b_POST:
4290 case AArch64::LD2Twov8h_POST:
4291 case AArch64::LD2i16_POST:
4292 case AArch64::LD2i32_POST:
4293 case AArch64::LD2i64_POST:
4294 case AArch64::LD2i8_POST:
4295 case AArch64::LD3Rv16b_POST:
4296 case AArch64::LD3Rv1d_POST:
4297 case AArch64::LD3Rv2d_POST:
4298 case AArch64::LD3Rv2s_POST:
4299 case AArch64::LD3Rv4h_POST:
4300 case AArch64::LD3Rv4s_POST:
4301 case AArch64::LD3Rv8b_POST:
4302 case AArch64::LD3Rv8h_POST:
4303 case AArch64::LD3Threev16b_POST:
4304 case AArch64::LD3Threev2d_POST:
4305 case AArch64::LD3Threev2s_POST:
4306 case AArch64::LD3Threev4h_POST:
4307 case AArch64::LD3Threev4s_POST:
4308 case AArch64::LD3Threev8b_POST:
4309 case AArch64::LD3Threev8h_POST:
4310 case AArch64::LD3i16_POST:
4311 case AArch64::LD3i32_POST:
4312 case AArch64::LD3i64_POST:
4313 case AArch64::LD3i8_POST:
4314 case AArch64::LD4Fourv16b_POST:
4315 case AArch64::LD4Fourv2d_POST:
4316 case AArch64::LD4Fourv2s_POST:
4317 case AArch64::LD4Fourv4h_POST:
4318 case AArch64::LD4Fourv4s_POST:
4319 case AArch64::LD4Fourv8b_POST:
4320 case AArch64::LD4Fourv8h_POST:
4321 case AArch64::LD4Rv16b_POST:
4322 case AArch64::LD4Rv1d_POST:
4323 case AArch64::LD4Rv2d_POST:
4324 case AArch64::LD4Rv2s_POST:
4325 case AArch64::LD4Rv4h_POST:
4326 case AArch64::LD4Rv4s_POST:
4327 case AArch64::LD4Rv8b_POST:
4328 case AArch64::LD4Rv8h_POST:
4329 case AArch64::LD4i16_POST:
4330 case AArch64::LD4i32_POST:
4331 case AArch64::LD4i64_POST:
4332 case AArch64::LD4i8_POST:
4333 case AArch64::LDAPRWpost:
4334 case AArch64::LDAPRXpost:
4335 case AArch64::LDIAPPWpost:
4336 case AArch64::LDIAPPXpost:
4337 case AArch64::LDPDpost:
4338 case AArch64::LDPQpost:
4339 case AArch64::LDPSWpost:
4340 case AArch64::LDPSpost:
4341 case AArch64::LDPWpost:
4342 case AArch64::LDPXpost:
4343 case AArch64::LDRBBpost:
4344 case AArch64::LDRBpost:
4345 case AArch64::LDRDpost:
4346 case AArch64::LDRHHpost:
4347 case AArch64::LDRHpost:
4348 case AArch64::LDRQpost:
4349 case AArch64::LDRSBWpost:
4350 case AArch64::LDRSBXpost:
4351 case AArch64::LDRSHWpost:
4352 case AArch64::LDRSHXpost:
4353 case AArch64::LDRSWpost:
4354 case AArch64::LDRSpost:
4355 case AArch64::LDRWpost:
4356 case AArch64::LDRXpost:
4357 case AArch64::ST1Fourv16b_POST:
4358 case AArch64::ST1Fourv1d_POST:
4359 case AArch64::ST1Fourv2d_POST:
4360 case AArch64::ST1Fourv2s_POST:
4361 case AArch64::ST1Fourv4h_POST:
4362 case AArch64::ST1Fourv4s_POST:
4363 case AArch64::ST1Fourv8b_POST:
4364 case AArch64::ST1Fourv8h_POST:
4365 case AArch64::ST1Onev16b_POST:
4366 case AArch64::ST1Onev1d_POST:
4367 case AArch64::ST1Onev2d_POST:
4368 case AArch64::ST1Onev2s_POST:
4369 case AArch64::ST1Onev4h_POST:
4370 case AArch64::ST1Onev4s_POST:
4371 case AArch64::ST1Onev8b_POST:
4372 case AArch64::ST1Onev8h_POST:
4373 case AArch64::ST1Threev16b_POST:
4374 case AArch64::ST1Threev1d_POST:
4375 case AArch64::ST1Threev2d_POST:
4376 case AArch64::ST1Threev2s_POST:
4377 case AArch64::ST1Threev4h_POST:
4378 case AArch64::ST1Threev4s_POST:
4379 case AArch64::ST1Threev8b_POST:
4380 case AArch64::ST1Threev8h_POST:
4381 case AArch64::ST1Twov16b_POST:
4382 case AArch64::ST1Twov1d_POST:
4383 case AArch64::ST1Twov2d_POST:
4384 case AArch64::ST1Twov2s_POST:
4385 case AArch64::ST1Twov4h_POST:
4386 case AArch64::ST1Twov4s_POST:
4387 case AArch64::ST1Twov8b_POST:
4388 case AArch64::ST1Twov8h_POST:
4389 case AArch64::ST1i16_POST:
4390 case AArch64::ST1i32_POST:
4391 case AArch64::ST1i64_POST:
4392 case AArch64::ST1i8_POST:
4393 case AArch64::ST2GPostIndex:
4394 case AArch64::ST2Twov16b_POST:
4395 case AArch64::ST2Twov2d_POST:
4396 case AArch64::ST2Twov2s_POST:
4397 case AArch64::ST2Twov4h_POST:
4398 case AArch64::ST2Twov4s_POST:
4399 case AArch64::ST2Twov8b_POST:
4400 case AArch64::ST2Twov8h_POST:
4401 case AArch64::ST2i16_POST:
4402 case AArch64::ST2i32_POST:
4403 case AArch64::ST2i64_POST:
4404 case AArch64::ST2i8_POST:
4405 case AArch64::ST3Threev16b_POST:
4406 case AArch64::ST3Threev2d_POST:
4407 case AArch64::ST3Threev2s_POST:
4408 case AArch64::ST3Threev4h_POST:
4409 case AArch64::ST3Threev4s_POST:
4410 case AArch64::ST3Threev8b_POST:
4411 case AArch64::ST3Threev8h_POST:
4412 case AArch64::ST3i16_POST:
4413 case AArch64::ST3i32_POST:
4414 case AArch64::ST3i64_POST:
4415 case AArch64::ST3i8_POST:
4416 case AArch64::ST4Fourv16b_POST:
4417 case AArch64::ST4Fourv2d_POST:
4418 case AArch64::ST4Fourv2s_POST:
4419 case AArch64::ST4Fourv4h_POST:
4420 case AArch64::ST4Fourv4s_POST:
4421 case AArch64::ST4Fourv8b_POST:
4422 case AArch64::ST4Fourv8h_POST:
4423 case AArch64::ST4i16_POST:
4424 case AArch64::ST4i32_POST:
4425 case AArch64::ST4i64_POST:
4426 case AArch64::ST4i8_POST:
4427 case AArch64::STGPostIndex:
4428 case AArch64::STGPpost:
4429 case AArch64::STPDpost:
4430 case AArch64::STPQpost:
4431 case AArch64::STPSpost:
4432 case AArch64::STPWpost:
4433 case AArch64::STPXpost:
4434 case AArch64::STRBBpost:
4435 case AArch64::STRBpost:
4436 case AArch64::STRDpost:
4437 case AArch64::STRHHpost:
4438 case AArch64::STRHpost:
4439 case AArch64::STRQpost:
4440 case AArch64::STRSpost:
4441 case AArch64::STRWpost:
4442 case AArch64::STRXpost:
4443 case AArch64::STZ2GPostIndex:
4444 case AArch64::STZGPostIndex:
4445 return true;
4446 }
4447}
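// For instance, "ldr x0, [x1], #8" (AArch64::LDRXpost) reads memory at
// [x1 + 0] and only then advances x1 by 8, which is why the opcodes above
// report an access offset of 0 in getMemOperandWithOffsetWidth below.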
4448
4449 bool AArch64InstrInfo::getMemOperandWithOffsetWidth(
4450 const MachineInstr &LdSt, const MachineOperand *&BaseOp, int64_t &Offset,
4451 bool &OffsetIsScalable, TypeSize &Width,
4452 const TargetRegisterInfo *TRI) const {
4453 assert(LdSt.mayLoadOrStore() && "Expected a memory operation.");
4454 // Handle only loads/stores with base register followed by immediate offset.
4455 if (LdSt.getNumExplicitOperands() == 3) {
4456 // Non-paired instruction (e.g., ldr x1, [x0, #8]).
4457 if ((!LdSt.getOperand(1).isReg() && !LdSt.getOperand(1).isFI()) ||
4458 !LdSt.getOperand(2).isImm())
4459 return false;
4460 } else if (LdSt.getNumExplicitOperands() == 4) {
4461 // Paired instruction (e.g., ldp x1, x2, [x0, #8]).
4462 if (!LdSt.getOperand(1).isReg() ||
4463 (!LdSt.getOperand(2).isReg() && !LdSt.getOperand(2).isFI()) ||
4464 !LdSt.getOperand(3).isImm())
4465 return false;
4466 } else
4467 return false;
4468
4469 // Get the scaling factor for the instruction and set the width of the
4470 // memory access.
4471 TypeSize Scale(0U, false);
4472 int64_t Dummy1, Dummy2;
4473
4474 // If this returns false, then it's an instruction we don't want to handle.
4475 if (!getMemOpInfo(LdSt.getOpcode(), Scale, Width, Dummy1, Dummy2))
4476 return false;
4477
4478 // Compute the offset. The offset is the immediate operand multiplied by the
4479 // scaling factor; unscaled instructions have a scaling factor of 1.
4480 // Post-index instructions are a special case and have an offset of 0.
4481 if (isPostIndexLdStOpcode(LdSt.getOpcode())) {
4482 BaseOp = &LdSt.getOperand(2);
4483 Offset = 0;
4484 } else if (LdSt.getNumExplicitOperands() == 3) {
4485 BaseOp = &LdSt.getOperand(1);
4486 Offset = LdSt.getOperand(2).getImm() * Scale.getKnownMinValue();
4487 } else {
4488 assert(LdSt.getNumExplicitOperands() == 4 && "invalid number of operands");
4489 BaseOp = &LdSt.getOperand(2);
4490 Offset = LdSt.getOperand(3).getImm() * Scale.getKnownMinValue();
4491 }
4492 OffsetIsScalable = Scale.isScalable();
4493
4494 return BaseOp->isReg() || BaseOp->isFI();
4495}
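// For instance, for "ldp x1, x2, [x0, #16]" (LDPXi, four explicit operands,
// Scale == 8, encoded immediate 2) this reports BaseOp == x0,
// Offset == 2 * 8 == 16 bytes, Width == 16 and OffsetIsScalable == false.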
4496
4497 MachineOperand &
4498 AArch64InstrInfo::getMemOpBaseRegImmOfsOffsetOperand(MachineInstr &LdSt) const {
4499 assert(LdSt.mayLoadOrStore() && "Expected a memory operation.");
4500 MachineOperand &OfsOp = LdSt.getOperand(LdSt.getNumExplicitOperands() - 1);
4501 assert(OfsOp.isImm() && "Offset operand wasn't immediate.");
4502 return OfsOp;
4503}
4504
4505bool AArch64InstrInfo::getMemOpInfo(unsigned Opcode, TypeSize &Scale,
4506 TypeSize &Width, int64_t &MinOffset,
4507 int64_t &MaxOffset) {
4508 switch (Opcode) {
4509 // Not a memory operation or something we want to handle.
4510 default:
4511 Scale = TypeSize::getFixed(0);
4512 Width = TypeSize::getFixed(0);
4513 MinOffset = MaxOffset = 0;
4514 return false;
4515 // LDR / STR
4516 case AArch64::LDRQui:
4517 case AArch64::STRQui:
4518 Scale = TypeSize::getFixed(16);
4519 Width = TypeSize::getFixed(16);
4520 MinOffset = 0;
4521 MaxOffset = 4095;
4522 break;
4523 case AArch64::LDRXui:
4524 case AArch64::LDRDui:
4525 case AArch64::STRXui:
4526 case AArch64::STRDui:
4527 case AArch64::PRFMui:
4528 Scale = TypeSize::getFixed(8);
4529 Width = TypeSize::getFixed(8);
4530 MinOffset = 0;
4531 MaxOffset = 4095;
4532 break;
4533 case AArch64::LDRWui:
4534 case AArch64::LDRSui:
4535 case AArch64::LDRSWui:
4536 case AArch64::STRWui:
4537 case AArch64::STRSui:
4538 Scale = TypeSize::getFixed(4);
4539 Width = TypeSize::getFixed(4);
4540 MinOffset = 0;
4541 MaxOffset = 4095;
4542 break;
4543 case AArch64::LDRHui:
4544 case AArch64::LDRHHui:
4545 case AArch64::LDRSHWui:
4546 case AArch64::LDRSHXui:
4547 case AArch64::STRHui:
4548 case AArch64::STRHHui:
4549 Scale = TypeSize::getFixed(2);
4550 Width = TypeSize::getFixed(2);
4551 MinOffset = 0;
4552 MaxOffset = 4095;
4553 break;
4554 case AArch64::LDRBui:
4555 case AArch64::LDRBBui:
4556 case AArch64::LDRSBWui:
4557 case AArch64::LDRSBXui:
4558 case AArch64::STRBui:
4559 case AArch64::STRBBui:
4560 Scale = TypeSize::getFixed(1);
4561 Width = TypeSize::getFixed(1);
4562 MinOffset = 0;
4563 MaxOffset = 4095;
4564 break;
4565 // post/pre inc
4566 case AArch64::STRQpre:
4567 case AArch64::LDRQpost:
4568 Scale = TypeSize::getFixed(1);
4569 Width = TypeSize::getFixed(16);
4570 MinOffset = -256;
4571 MaxOffset = 255;
4572 break;
4573 case AArch64::LDRDpost:
4574 case AArch64::LDRDpre:
4575 case AArch64::LDRXpost:
4576 case AArch64::LDRXpre:
4577 case AArch64::STRDpost:
4578 case AArch64::STRDpre:
4579 case AArch64::STRXpost:
4580 case AArch64::STRXpre:
4581 Scale = TypeSize::getFixed(1);
4582 Width = TypeSize::getFixed(8);
4583 MinOffset = -256;
4584 MaxOffset = 255;
4585 break;
4586 case AArch64::STRWpost:
4587 case AArch64::STRWpre:
4588 case AArch64::LDRWpost:
4589 case AArch64::LDRWpre:
4590 case AArch64::STRSpost:
4591 case AArch64::STRSpre:
4592 case AArch64::LDRSpost:
4593 case AArch64::LDRSpre:
4594 Scale = TypeSize::getFixed(1);
4595 Width = TypeSize::getFixed(4);
4596 MinOffset = -256;
4597 MaxOffset = 255;
4598 break;
4599 case AArch64::LDRHpost:
4600 case AArch64::LDRHpre:
4601 case AArch64::STRHpost:
4602 case AArch64::STRHpre:
4603 case AArch64::LDRHHpost:
4604 case AArch64::LDRHHpre:
4605 case AArch64::STRHHpost:
4606 case AArch64::STRHHpre:
4607 Scale = TypeSize::getFixed(1);
4608 Width = TypeSize::getFixed(2);
4609 MinOffset = -256;
4610 MaxOffset = 255;
4611 break;
4612 case AArch64::LDRBpost:
4613 case AArch64::LDRBpre:
4614 case AArch64::STRBpost:
4615 case AArch64::STRBpre:
4616 case AArch64::LDRBBpost:
4617 case AArch64::LDRBBpre:
4618 case AArch64::STRBBpost:
4619 case AArch64::STRBBpre:
4620 Scale = TypeSize::getFixed(1);
4621 Width = TypeSize::getFixed(1);
4622 MinOffset = -256;
4623 MaxOffset = 255;
4624 break;
4625 // Unscaled
4626 case AArch64::LDURQi:
4627 case AArch64::STURQi:
4628 Scale = TypeSize::getFixed(1);
4629 Width = TypeSize::getFixed(16);
4630 MinOffset = -256;
4631 MaxOffset = 255;
4632 break;
4633 case AArch64::LDURXi:
4634 case AArch64::LDURDi:
4635 case AArch64::LDAPURXi:
4636 case AArch64::STURXi:
4637 case AArch64::STURDi:
4638 case AArch64::STLURXi:
4639 case AArch64::PRFUMi:
4640 Scale = TypeSize::getFixed(1);
4641 Width = TypeSize::getFixed(8);
4642 MinOffset = -256;
4643 MaxOffset = 255;
4644 break;
4645 case AArch64::LDURWi:
4646 case AArch64::LDURSi:
4647 case AArch64::LDURSWi:
4648 case AArch64::LDAPURi:
4649 case AArch64::LDAPURSWi:
4650 case AArch64::STURWi:
4651 case AArch64::STURSi:
4652 case AArch64::STLURWi:
4653 Scale = TypeSize::getFixed(1);
4654 Width = TypeSize::getFixed(4);
4655 MinOffset = -256;
4656 MaxOffset = 255;
4657 break;
4658 case AArch64::LDURHi:
4659 case AArch64::LDURHHi:
4660 case AArch64::LDURSHXi:
4661 case AArch64::LDURSHWi:
4662 case AArch64::LDAPURHi:
4663 case AArch64::LDAPURSHWi:
4664 case AArch64::LDAPURSHXi:
4665 case AArch64::STURHi:
4666 case AArch64::STURHHi:
4667 case AArch64::STLURHi:
4668 Scale = TypeSize::getFixed(1);
4669 Width = TypeSize::getFixed(2);
4670 MinOffset = -256;
4671 MaxOffset = 255;
4672 break;
4673 case AArch64::LDURBi:
4674 case AArch64::LDURBBi:
4675 case AArch64::LDURSBXi:
4676 case AArch64::LDURSBWi:
4677 case AArch64::LDAPURBi:
4678 case AArch64::LDAPURSBWi:
4679 case AArch64::LDAPURSBXi:
4680 case AArch64::STURBi:
4681 case AArch64::STURBBi:
4682 case AArch64::STLURBi:
4683 Scale = TypeSize::getFixed(1);
4684 Width = TypeSize::getFixed(1);
4685 MinOffset = -256;
4686 MaxOffset = 255;
4687 break;
4688 // LDP / STP (including pre/post inc)
4689 case AArch64::LDPQi:
4690 case AArch64::LDNPQi:
4691 case AArch64::STPQi:
4692 case AArch64::STNPQi:
4693 case AArch64::LDPQpost:
4694 case AArch64::LDPQpre:
4695 case AArch64::STPQpost:
4696 case AArch64::STPQpre:
4697 Scale = TypeSize::getFixed(16);
4698 Width = TypeSize::getFixed(16 * 2);
4699 MinOffset = -64;
4700 MaxOffset = 63;
4701 break;
4702 case AArch64::LDPXi:
4703 case AArch64::LDPDi:
4704 case AArch64::LDNPXi:
4705 case AArch64::LDNPDi:
4706 case AArch64::STPXi:
4707 case AArch64::STPDi:
4708 case AArch64::STNPXi:
4709 case AArch64::STNPDi:
4710 case AArch64::LDPDpost:
4711 case AArch64::LDPDpre:
4712 case AArch64::LDPXpost:
4713 case AArch64::LDPXpre:
4714 case AArch64::STPDpost:
4715 case AArch64::STPDpre:
4716 case AArch64::STPXpost:
4717 case AArch64::STPXpre:
4718 Scale = TypeSize::getFixed(8);
4719 Width = TypeSize::getFixed(8 * 2);
4720 MinOffset = -64;
4721 MaxOffset = 63;
4722 break;
4723 case AArch64::LDPWi:
4724 case AArch64::LDPSi:
4725 case AArch64::LDNPWi:
4726 case AArch64::LDNPSi:
4727 case AArch64::STPWi:
4728 case AArch64::STPSi:
4729 case AArch64::STNPWi:
4730 case AArch64::STNPSi:
4731 case AArch64::LDPSpost:
4732 case AArch64::LDPSpre:
4733 case AArch64::LDPWpost:
4734 case AArch64::LDPWpre:
4735 case AArch64::STPSpost:
4736 case AArch64::STPSpre:
4737 case AArch64::STPWpost:
4738 case AArch64::STPWpre:
4739 Scale = TypeSize::getFixed(4);
4740 Width = TypeSize::getFixed(4 * 2);
4741 MinOffset = -64;
4742 MaxOffset = 63;
4743 break;
4744 case AArch64::StoreSwiftAsyncContext:
4745 // Store is an STRXui, but there might be an ADDXri in the expansion too.
4746 Scale = TypeSize::getFixed(1);
4747 Width = TypeSize::getFixed(8);
4748 MinOffset = 0;
4749 MaxOffset = 4095;
4750 break;
4751 case AArch64::ADDG:
4752 Scale = TypeSize::getFixed(16);
4753 Width = TypeSize::getFixed(0);
4754 MinOffset = 0;
4755 MaxOffset = 63;
4756 break;
4757 case AArch64::TAGPstack:
4758 Scale = TypeSize::getFixed(16);
4759 Width = TypeSize::getFixed(0);
4760 // TAGP with a negative offset turns into SUBP, which has a maximum offset
4761 // of 63 (not 64!).
4762 MinOffset = -63;
4763 MaxOffset = 63;
4764 break;
4765 case AArch64::LDG:
4766 case AArch64::STGi:
4767 case AArch64::STGPreIndex:
4768 case AArch64::STGPostIndex:
4769 case AArch64::STZGi:
4770 case AArch64::STZGPreIndex:
4771 case AArch64::STZGPostIndex:
4772 Scale = TypeSize::getFixed(16);
4773 Width = TypeSize::getFixed(16);
4774 MinOffset = -256;
4775 MaxOffset = 255;
4776 break;
4777 // SVE
4778 case AArch64::STR_ZZZZXI:
4779 case AArch64::STR_ZZZZXI_STRIDED_CONTIGUOUS:
4780 case AArch64::LDR_ZZZZXI:
4781 case AArch64::LDR_ZZZZXI_STRIDED_CONTIGUOUS:
4782 Scale = TypeSize::getScalable(16);
4783 Width = TypeSize::getScalable(16 * 4);
4784 MinOffset = -256;
4785 MaxOffset = 252;
4786 break;
4787 case AArch64::STR_ZZZXI:
4788 case AArch64::LDR_ZZZXI:
4789 Scale = TypeSize::getScalable(16);
4790 Width = TypeSize::getScalable(16 * 3);
4791 MinOffset = -256;
4792 MaxOffset = 253;
4793 break;
4794 case AArch64::STR_ZZXI:
4795 case AArch64::STR_ZZXI_STRIDED_CONTIGUOUS:
4796 case AArch64::LDR_ZZXI:
4797 case AArch64::LDR_ZZXI_STRIDED_CONTIGUOUS:
4798 Scale = TypeSize::getScalable(16);
4799 Width = TypeSize::getScalable(16 * 2);
4800 MinOffset = -256;
4801 MaxOffset = 254;
4802 break;
4803 case AArch64::LDR_PXI:
4804 case AArch64::STR_PXI:
4805 Scale = TypeSize::getScalable(2);
4806 Width = TypeSize::getScalable(2);
4807 MinOffset = -256;
4808 MaxOffset = 255;
4809 break;
4810 case AArch64::LDR_PPXI:
4811 case AArch64::STR_PPXI:
4812 Scale = TypeSize::getScalable(2);
4813 Width = TypeSize::getScalable(2 * 2);
4814 MinOffset = -256;
4815 MaxOffset = 254;
4816 break;
4817 case AArch64::LDR_ZXI:
4818 case AArch64::STR_ZXI:
4819 Scale = TypeSize::getScalable(16);
4820 Width = TypeSize::getScalable(16);
4821 MinOffset = -256;
4822 MaxOffset = 255;
4823 break;
4824 case AArch64::LD1B_IMM:
4825 case AArch64::LD1H_IMM:
4826 case AArch64::LD1W_IMM:
4827 case AArch64::LD1D_IMM:
4828 case AArch64::LDNT1B_ZRI:
4829 case AArch64::LDNT1H_ZRI:
4830 case AArch64::LDNT1W_ZRI:
4831 case AArch64::LDNT1D_ZRI:
4832 case AArch64::ST1B_IMM:
4833 case AArch64::ST1H_IMM:
4834 case AArch64::ST1W_IMM:
4835 case AArch64::ST1D_IMM:
4836 case AArch64::STNT1B_ZRI:
4837 case AArch64::STNT1H_ZRI:
4838 case AArch64::STNT1W_ZRI:
4839 case AArch64::STNT1D_ZRI:
4840 case AArch64::LDNF1B_IMM:
4841 case AArch64::LDNF1H_IMM:
4842 case AArch64::LDNF1W_IMM:
4843 case AArch64::LDNF1D_IMM:
4844 // A full vector's worth of data
4845 // Width = mbytes * elements
4846 Scale = TypeSize::getScalable(16);
4847 Width = TypeSize::getScalable(16);
4848 MinOffset = -8;
4849 MaxOffset = 7;
4850 break;
4851 case AArch64::LD2B_IMM:
4852 case AArch64::LD2H_IMM:
4853 case AArch64::LD2W_IMM:
4854 case AArch64::LD2D_IMM:
4855 case AArch64::ST2B_IMM:
4856 case AArch64::ST2H_IMM:
4857 case AArch64::ST2W_IMM:
4858 case AArch64::ST2D_IMM:
4859 Scale = TypeSize::getScalable(32);
4860 Width = TypeSize::getScalable(16 * 2);
4861 MinOffset = -8;
4862 MaxOffset = 7;
4863 break;
4864 case AArch64::LD3B_IMM:
4865 case AArch64::LD3H_IMM:
4866 case AArch64::LD3W_IMM:
4867 case AArch64::LD3D_IMM:
4868 case AArch64::ST3B_IMM:
4869 case AArch64::ST3H_IMM:
4870 case AArch64::ST3W_IMM:
4871 case AArch64::ST3D_IMM:
4872 Scale = TypeSize::getScalable(48);
4873 Width = TypeSize::getScalable(16 * 3);
4874 MinOffset = -8;
4875 MaxOffset = 7;
4876 break;
4877 case AArch64::LD4B_IMM:
4878 case AArch64::LD4H_IMM:
4879 case AArch64::LD4W_IMM:
4880 case AArch64::LD4D_IMM:
4881 case AArch64::ST4B_IMM:
4882 case AArch64::ST4H_IMM:
4883 case AArch64::ST4W_IMM:
4884 case AArch64::ST4D_IMM:
4885 Scale = TypeSize::getScalable(64);
4886 Width = TypeSize::getScalable(16 * 4);
4887 MinOffset = -8;
4888 MaxOffset = 7;
4889 break;
4890 case AArch64::LD1B_H_IMM:
4891 case AArch64::LD1SB_H_IMM:
4892 case AArch64::LD1H_S_IMM:
4893 case AArch64::LD1SH_S_IMM:
4894 case AArch64::LD1W_D_IMM:
4895 case AArch64::LD1SW_D_IMM:
4896 case AArch64::ST1B_H_IMM:
4897 case AArch64::ST1H_S_IMM:
4898 case AArch64::ST1W_D_IMM:
4899 case AArch64::LDNF1B_H_IMM:
4900 case AArch64::LDNF1SB_H_IMM:
4901 case AArch64::LDNF1H_S_IMM:
4902 case AArch64::LDNF1SH_S_IMM:
4903 case AArch64::LDNF1W_D_IMM:
4904 case AArch64::LDNF1SW_D_IMM:
4905 // A half vector's worth of data
4906 // Width = mbytes * elements
4907 Scale = TypeSize::getScalable(8);
4908 Width = TypeSize::getScalable(8);
4909 MinOffset = -8;
4910 MaxOffset = 7;
4911 break;
4912 case AArch64::LD1B_S_IMM:
4913 case AArch64::LD1SB_S_IMM:
4914 case AArch64::LD1H_D_IMM:
4915 case AArch64::LD1SH_D_IMM:
4916 case AArch64::ST1B_S_IMM:
4917 case AArch64::ST1H_D_IMM:
4918 case AArch64::LDNF1B_S_IMM:
4919 case AArch64::LDNF1SB_S_IMM:
4920 case AArch64::LDNF1H_D_IMM:
4921 case AArch64::LDNF1SH_D_IMM:
4922 // A quarter vector's worth of data
4923 // Width = mbytes * elements
4924 Scale = TypeSize::getScalable(4);
4925 Width = TypeSize::getScalable(4);
4926 MinOffset = -8;
4927 MaxOffset = 7;
4928 break;
4929 case AArch64::LD1B_D_IMM:
4930 case AArch64::LD1SB_D_IMM:
4931 case AArch64::ST1B_D_IMM:
4932 case AArch64::LDNF1B_D_IMM:
4933 case AArch64::LDNF1SB_D_IMM:
4934 // An eighth vector's worth of data
4935 // Width = mbytes * elements
4936 Scale = TypeSize::getScalable(2);
4937 Width = TypeSize::getScalable(2);
4938 MinOffset = -8;
4939 MaxOffset = 7;
4940 break;
4941 case AArch64::ST2Gi:
4942 case AArch64::ST2GPreIndex:
4943 case AArch64::ST2GPostIndex:
4944 case AArch64::STZ2Gi:
4945 case AArch64::STZ2GPreIndex:
4946 case AArch64::STZ2GPostIndex:
4947 Scale = TypeSize::getFixed(16);
4948 Width = TypeSize::getFixed(32);
4949 MinOffset = -256;
4950 MaxOffset = 255;
4951 break;
4952 case AArch64::STGPi:
4953 case AArch64::STGPpost:
4954 case AArch64::STGPpre:
4955 Scale = TypeSize::getFixed(16);
4956 Width = TypeSize::getFixed(16);
4957 MinOffset = -64;
4958 MaxOffset = 63;
4959 break;
4960 case AArch64::LD1RB_IMM:
4961 case AArch64::LD1RB_H_IMM:
4962 case AArch64::LD1RB_S_IMM:
4963 case AArch64::LD1RB_D_IMM:
4964 case AArch64::LD1RSB_H_IMM:
4965 case AArch64::LD1RSB_S_IMM:
4966 case AArch64::LD1RSB_D_IMM:
4967 Scale = TypeSize::getFixed(1);
4968 Width = TypeSize::getFixed(1);
4969 MinOffset = 0;
4970 MaxOffset = 63;
4971 break;
4972 case AArch64::LD1RH_IMM:
4973 case AArch64::LD1RH_S_IMM:
4974 case AArch64::LD1RH_D_IMM:
4975 case AArch64::LD1RSH_S_IMM:
4976 case AArch64::LD1RSH_D_IMM:
4977 Scale = TypeSize::getFixed(2);
4978 Width = TypeSize::getFixed(2);
4979 MinOffset = 0;
4980 MaxOffset = 63;
4981 break;
4982 case AArch64::LD1RW_IMM:
4983 case AArch64::LD1RW_D_IMM:
4984 case AArch64::LD1RSW_IMM:
4985 Scale = TypeSize::getFixed(4);
4986 Width = TypeSize::getFixed(4);
4987 MinOffset = 0;
4988 MaxOffset = 63;
4989 break;
4990 case AArch64::LD1RD_IMM:
4991 Scale = TypeSize::getFixed(8);
4992 Width = TypeSize::getFixed(8);
4993 MinOffset = 0;
4994 MaxOffset = 63;
4995 break;
4996 }
4997
4998 return true;
4999}
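// For instance, getMemOpInfo(AArch64::LDPXi, ...) yields Scale == 8,
// Width == 16 and an immediate range of [-64, 63], i.e. reachable byte
// offsets of [-512, 504] in steps of 8 once the immediate is scaled.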
5000
5001// Scaling factor for unscaled load or store.
5002 int AArch64InstrInfo::getMemScale(unsigned Opc) {
5003 switch (Opc) {
5004 default:
5005 llvm_unreachable("Opcode has unknown scale!");
5006 case AArch64::LDRBui:
5007 case AArch64::LDRBBui:
5008 case AArch64::LDURBBi:
5009 case AArch64::LDRSBWui:
5010 case AArch64::LDURSBWi:
5011 case AArch64::STRBui:
5012 case AArch64::STRBBui:
5013 case AArch64::STURBBi:
5014 return 1;
5015 case AArch64::LDRHui:
5016 case AArch64::LDRHHui:
5017 case AArch64::LDURHHi:
5018 case AArch64::LDRSHWui:
5019 case AArch64::LDURSHWi:
5020 case AArch64::STRHui:
5021 case AArch64::STRHHui:
5022 case AArch64::STURHHi:
5023 return 2;
5024 case AArch64::LDRSui:
5025 case AArch64::LDURSi:
5026 case AArch64::LDRSpre:
5027 case AArch64::LDRSWui:
5028 case AArch64::LDURSWi:
5029 case AArch64::LDRSWpre:
5030 case AArch64::LDRWpre:
5031 case AArch64::LDRWui:
5032 case AArch64::LDURWi:
5033 case AArch64::STRSui:
5034 case AArch64::STURSi:
5035 case AArch64::STRSpre:
5036 case AArch64::STRWui:
5037 case AArch64::STURWi:
5038 case AArch64::STRWpre:
5039 case AArch64::LDPSi:
5040 case AArch64::LDPSWi:
5041 case AArch64::LDPWi:
5042 case AArch64::STPSi:
5043 case AArch64::STPWi:
5044 return 4;
5045 case AArch64::LDRDui:
5046 case AArch64::LDURDi:
5047 case AArch64::LDRDpre:
5048 case AArch64::LDRXui:
5049 case AArch64::LDURXi:
5050 case AArch64::LDRXpre:
5051 case AArch64::STRDui:
5052 case AArch64::STURDi:
5053 case AArch64::STRDpre:
5054 case AArch64::STRXui:
5055 case AArch64::STURXi:
5056 case AArch64::STRXpre:
5057 case AArch64::LDPDi:
5058 case AArch64::LDPXi:
5059 case AArch64::STPDi:
5060 case AArch64::STPXi:
5061 return 8;
5062 case AArch64::LDRQui:
5063 case AArch64::LDURQi:
5064 case AArch64::STRQui:
5065 case AArch64::STURQi:
5066 case AArch64::STRQpre:
5067 case AArch64::LDPQi:
5068 case AArch64::LDRQpre:
5069 case AArch64::STPQi:
5070 case AArch64::STGi:
5071 case AArch64::STZGi:
5072 case AArch64::ST2Gi:
5073 case AArch64::STZ2Gi:
5074 case AArch64::STGPi:
5075 return 16;
5076 }
5077}
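// For instance, getMemScale(AArch64::STURXi) == 8 even though STURXi itself
// encodes an unscaled byte offset; callers such as scaleOffset() below use
// the value to convert byte offsets into LDP/STP element offsets.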
5078
5079 bool AArch64InstrInfo::isPreLd(const MachineInstr &MI) {
5080 switch (MI.getOpcode()) {
5081 default:
5082 return false;
5083 case AArch64::LDRWpre:
5084 case AArch64::LDRXpre:
5085 case AArch64::LDRSWpre:
5086 case AArch64::LDRSpre:
5087 case AArch64::LDRDpre:
5088 case AArch64::LDRQpre:
5089 return true;
5090 }
5091}
5092
5093 bool AArch64InstrInfo::isPreSt(const MachineInstr &MI) {
5094 switch (MI.getOpcode()) {
5095 default:
5096 return false;
5097 case AArch64::STRWpre:
5098 case AArch64::STRXpre:
5099 case AArch64::STRSpre:
5100 case AArch64::STRDpre:
5101 case AArch64::STRQpre:
5102 return true;
5103 }
5104}
5105
5106 bool AArch64InstrInfo::isPreLdSt(const MachineInstr &MI) {
5107 return isPreLd(MI) || isPreSt(MI);
5108}
5109
5111 switch (MI.getOpcode()) {
5112 default:
5113 return false;
5114 case AArch64::LDURBBi:
5115 case AArch64::LDURHHi:
5116 case AArch64::LDURWi:
5117 case AArch64::LDRBBui:
5118 case AArch64::LDRHHui:
5119 case AArch64::LDRWui:
5120 case AArch64::LDRBBroX:
5121 case AArch64::LDRHHroX:
5122 case AArch64::LDRWroX:
5123 case AArch64::LDRBBroW:
5124 case AArch64::LDRHHroW:
5125 case AArch64::LDRWroW:
5126 return true;
5127 }
5128}
5129
5131 switch (MI.getOpcode()) {
5132 default:
5133 return false;
5134 case AArch64::LDURSBWi:
5135 case AArch64::LDURSHWi:
5136 case AArch64::LDURSBXi:
5137 case AArch64::LDURSHXi:
5138 case AArch64::LDURSWi:
5139 case AArch64::LDRSBWui:
5140 case AArch64::LDRSHWui:
5141 case AArch64::LDRSBXui:
5142 case AArch64::LDRSHXui:
5143 case AArch64::LDRSWui:
5144 case AArch64::LDRSBWroX:
5145 case AArch64::LDRSHWroX:
5146 case AArch64::LDRSBXroX:
5147 case AArch64::LDRSHXroX:
5148 case AArch64::LDRSWroX:
5149 case AArch64::LDRSBWroW:
5150 case AArch64::LDRSHWroW:
5151 case AArch64::LDRSBXroW:
5152 case AArch64::LDRSHXroW:
5153 case AArch64::LDRSWroW:
5154 return true;
5155 }
5156}
5157
5158 bool AArch64InstrInfo::isPairedLdSt(const MachineInstr &MI) {
5159 switch (MI.getOpcode()) {
5160 default:
5161 return false;
5162 case AArch64::LDPSi:
5163 case AArch64::LDPSWi:
5164 case AArch64::LDPDi:
5165 case AArch64::LDPQi:
5166 case AArch64::LDPWi:
5167 case AArch64::LDPXi:
5168 case AArch64::STPSi:
5169 case AArch64::STPDi:
5170 case AArch64::STPQi:
5171 case AArch64::STPWi:
5172 case AArch64::STPXi:
5173 case AArch64::STGPi:
5174 return true;
5175 }
5176}
5177
5178 const MachineOperand &AArch64InstrInfo::getLdStBaseOp(const MachineInstr &MI) {
5179 assert(MI.mayLoadOrStore() && "Load or store instruction expected");
5180 unsigned Idx =
5181 AArch64InstrInfo::isPairedLdSt(MI) || AArch64InstrInfo::isPreLdSt(MI) ? 2
5182 : 1;
5183 return MI.getOperand(Idx);
5184}
5185
5186const MachineOperand &
5187 AArch64InstrInfo::getLdStOffsetOp(const MachineInstr &MI) {
5188 assert(MI.mayLoadOrStore() && "Load or store instruction expected");
5189 unsigned Idx =
5190 AArch64InstrInfo::isPairedLdSt(MI) || AArch64InstrInfo::isPreLdSt(MI) ? 3
5191 : 2;
5192 return MI.getOperand(Idx);
5193}
5194
5195const MachineOperand &
5196 AArch64InstrInfo::getLdStAmountOp(const MachineInstr &MI) {
5197 switch (MI.getOpcode()) {
5198 default:
5199 llvm_unreachable("Unexpected opcode");
5200 case AArch64::LDRBroX:
5201 case AArch64::LDRBBroX:
5202 case AArch64::LDRSBXroX:
5203 case AArch64::LDRSBWroX:
5204 case AArch64::LDRHroX:
5205 case AArch64::LDRHHroX:
5206 case AArch64::LDRSHXroX:
5207 case AArch64::LDRSHWroX:
5208 case AArch64::LDRWroX:
5209 case AArch64::LDRSroX:
5210 case AArch64::LDRSWroX:
5211 case AArch64::LDRDroX:
5212 case AArch64::LDRXroX:
5213 case AArch64::LDRQroX:
5214 return MI.getOperand(4);
5215 }
5216}
5217
5217
5218 static const TargetRegisterClass *getRegClass(const MachineInstr &MI,
5219 Register Reg) {
5220 if (MI.getParent() == nullptr)
5221 return nullptr;
5222 const MachineFunction *MF = MI.getParent()->getParent();
5223 return MF ? MF->getRegInfo().getRegClassOrNull(Reg) : nullptr;
5224}
5225
5226 bool AArch64InstrInfo::isHForm(const MachineInstr &MI) {
5227 auto IsHFPR = [&](const MachineOperand &Op) {
5228 if (!Op.isReg())
5229 return false;
5230 auto Reg = Op.getReg();
5231 if (Reg.isPhysical())
5232 return AArch64::FPR16RegClass.contains(Reg);
5233 const TargetRegisterClass *TRC = ::getRegClass(MI, Reg);
5234 return TRC == &AArch64::FPR16RegClass ||
5235 TRC == &AArch64::FPR16_loRegClass;
5236 };
5237 return llvm::any_of(MI.operands(), IsHFPR);
5238}
5239
5240 bool AArch64InstrInfo::isQForm(const MachineInstr &MI) {
5241 auto IsQFPR = [&](const MachineOperand &Op) {
5242 if (!Op.isReg())
5243 return false;
5244 auto Reg = Op.getReg();
5245 if (Reg.isPhysical())
5246 return AArch64::FPR128RegClass.contains(Reg);
5247 const TargetRegisterClass *TRC = ::getRegClass(MI, Reg);
5248 return TRC == &AArch64::FPR128RegClass ||
5249 TRC == &AArch64::FPR128_loRegClass;
5250 };
5251 return llvm::any_of(MI.operands(), IsQFPR);
5252}
5253
5254 bool AArch64InstrInfo::hasBTISemantics(const MachineInstr &MI) {
5255 switch (MI.getOpcode()) {
5256 case AArch64::BRK:
5257 case AArch64::HLT:
5258 case AArch64::PACIASP:
5259 case AArch64::PACIBSP:
5260 // Implicit BTI behavior.
5261 return true;
5262 case AArch64::PAUTH_PROLOGUE:
5263 // PAUTH_PROLOGUE expands to PACI(A|B)SP.
5264 return true;
5265 case AArch64::HINT: {
5266 unsigned Imm = MI.getOperand(0).getImm();
5267 // Explicit BTI instruction.
5268 if (Imm == 32 || Imm == 34 || Imm == 36 || Imm == 38)
5269 return true;
5270 // PACI(A|B)SP instructions.
5271 if (Imm == 25 || Imm == 27)
5272 return true;
5273 return false;
5274 }
5275 default:
5276 return false;
5277 }
5278}
5279
5280 bool AArch64InstrInfo::isFpOrNEON(Register Reg) {
5281 if (Reg == 0)
5282 return false;
5283 assert(Reg.isPhysical() && "Expected physical register in isFpOrNEON");
5284 return AArch64::FPR128RegClass.contains(Reg) ||
5285 AArch64::FPR64RegClass.contains(Reg) ||
5286 AArch64::FPR32RegClass.contains(Reg) ||
5287 AArch64::FPR16RegClass.contains(Reg) ||
5288 AArch64::FPR8RegClass.contains(Reg);
5289}
5290
5291 bool AArch64InstrInfo::isFpOrNEON(const MachineInstr &MI) {
5292 auto IsFPR = [&](const MachineOperand &Op) {
5293 if (!Op.isReg())
5294 return false;
5295 auto Reg = Op.getReg();
5296 if (Reg.isPhysical())
5297 return isFpOrNEON(Reg);
5298
5299 const TargetRegisterClass *TRC = ::getRegClass(MI, Reg);
5300 return TRC == &AArch64::FPR128RegClass ||
5301 TRC == &AArch64::FPR128_loRegClass ||
5302 TRC == &AArch64::FPR64RegClass ||
5303 TRC == &AArch64::FPR64_loRegClass ||
5304 TRC == &AArch64::FPR32RegClass || TRC == &AArch64::FPR16RegClass ||
5305 TRC == &AArch64::FPR8RegClass;
5306 };
5307 return llvm::any_of(MI.operands(), IsFPR);
5308}
5309
5310// Scale the unscaled offsets. Returns false if the unscaled offset can't be
5311// scaled.
5312static bool scaleOffset(unsigned Opc, int64_t &Offset) {
5313 int Scale = AArch64InstrInfo::getMemScale(Opc);
5314
5315 // If the byte-offset isn't a multiple of the stride, we can't scale this
5316 // offset.
5317 if (Offset % Scale != 0)
5318 return false;
5319
5320 // Convert the byte-offset used by unscaled into an "element" offset used
5321 // by the scaled pair load/store instructions.
5322 Offset /= Scale;
5323 return true;
5324}
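// For instance, with Opc == AArch64::STURXi (stride 8) an Offset of 24 is
// rewritten to 3, while an Offset of 20 is rejected because 20 % 8 != 0 and
// could not be expressed by a scaled pair instruction.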
5325
5326static bool canPairLdStOpc(unsigned FirstOpc, unsigned SecondOpc) {
5327 if (FirstOpc == SecondOpc)
5328 return true;
5329 // We can also pair sign-ext and zero-ext instructions.
5330 switch (FirstOpc) {
5331 default:
5332 return false;
5333 case AArch64::STRSui:
5334 case AArch64::STURSi:
5335 return SecondOpc == AArch64::STRSui || SecondOpc == AArch64::STURSi;
5336 case AArch64::STRDui:
5337 case AArch64::STURDi:
5338 return SecondOpc == AArch64::STRDui || SecondOpc == AArch64::STURDi;
5339 case AArch64::STRQui:
5340 case AArch64::STURQi:
5341 return SecondOpc == AArch64::STRQui || SecondOpc == AArch64::STURQi;
5342 case AArch64::STRWui:
5343 case AArch64::STURWi:
5344 return SecondOpc == AArch64::STRWui || SecondOpc == AArch64::STURWi;
5345 case AArch64::STRXui:
5346 case AArch64::STURXi:
5347 return SecondOpc == AArch64::STRXui || SecondOpc == AArch64::STURXi;
5348 case AArch64::LDRSui:
5349 case AArch64::LDURSi:
5350 return SecondOpc == AArch64::LDRSui || SecondOpc == AArch64::LDURSi;
5351 case AArch64::LDRDui:
5352 case AArch64::LDURDi:
5353 return SecondOpc == AArch64::LDRDui || SecondOpc == AArch64::LDURDi;
5354 case AArch64::LDRQui:
5355 case AArch64::LDURQi:
5356 return SecondOpc == AArch64::LDRQui || SecondOpc == AArch64::LDURQi;
5357 case AArch64::LDRWui:
5358 case AArch64::LDURWi:
5359 return SecondOpc == AArch64::LDRSWui || SecondOpc == AArch64::LDURSWi;
5360 case AArch64::LDRSWui:
5361 case AArch64::LDURSWi:
5362 return SecondOpc == AArch64::LDRWui || SecondOpc == AArch64::LDURWi;
5363 case AArch64::LDRXui:
5364 case AArch64::LDURXi:
5365 return SecondOpc == AArch64::LDRXui || SecondOpc == AArch64::LDURXi;
5366 }
5367 // These instructions can't be paired based on their opcodes.
5368 return false;
5369}
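// For instance, canPairLdStOpc(AArch64::LDRWui, AArch64::LDURSWi) is true
// (a zero-extending and a sign-extending 32-bit load can still pair), while
// mixing access sizes such as LDRWui with LDRXui cannot.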
5370
5371static bool shouldClusterFI(const MachineFrameInfo &MFI, int FI1,
5372 int64_t Offset1, unsigned Opcode1, int FI2,
5373 int64_t Offset2, unsigned Opcode2) {
5374 // Accesses through fixed stack object frame indices may access a different
5375 // fixed stack slot. Check that the object offsets + offsets match.
5376 if (MFI.isFixedObjectIndex(FI1) && MFI.isFixedObjectIndex(FI2)) {
5377 int64_t ObjectOffset1 = MFI.getObjectOffset(FI1);
5378 int64_t ObjectOffset2 = MFI.getObjectOffset(FI2);
5379 assert(ObjectOffset1 <= ObjectOffset2 && "Object offsets are not ordered.");
5380 // Convert to scaled object offsets.
5381 int Scale1 = AArch64InstrInfo::getMemScale(Opcode1);
5382 if (ObjectOffset1 % Scale1 != 0)
5383 return false;
5384 ObjectOffset1 /= Scale1;
5385 int Scale2 = AArch64InstrInfo::getMemScale(Opcode2);
5386 if (ObjectOffset2 % Scale2 != 0)
5387 return false;
5388 ObjectOffset2 /= Scale2;
5389 ObjectOffset1 += Offset1;
5390 ObjectOffset2 += Offset2;
5391 return ObjectOffset1 + 1 == ObjectOffset2;
5392 }
5393
5394 return FI1 == FI2;
5395}
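// For instance (illustrative values): two fixed 8-byte stack objects at
// object offsets 0 and 8, each accessed by an STRXui with instruction offset
// 0, scale to element offsets 0 and 1; since 0 + 1 == 1 they are adjacent
// and may be clustered. For non-fixed objects the test is simply FI1 == FI2.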
5396
5397/// Detect opportunities for ldp/stp formation.
5398///
5399/// Only called for LdSt for which getMemOperandWithOffset returns true.
5400 bool AArch64InstrInfo::shouldClusterMemOps(
5401 ArrayRef<const MachineOperand *> BaseOps1, int64_t OpOffset1,
5402 bool OffsetIsScalable1, ArrayRef<const MachineOperand *> BaseOps2,
5403 int64_t OpOffset2, bool OffsetIsScalable2, unsigned ClusterSize,
5404 unsigned NumBytes) const {
5405 assert(BaseOps1.size() == 1 && BaseOps2.size() == 1);
5406 const MachineOperand &BaseOp1 = *BaseOps1.front();
5407 const MachineOperand &BaseOp2 = *BaseOps2.front();
5408 const MachineInstr &FirstLdSt = *BaseOp1.getParent();
5409 const MachineInstr &SecondLdSt = *BaseOp2.getParent();
5410 if (BaseOp1.getType() != BaseOp2.getType())
5411 return false;
5412
5413 assert((BaseOp1.isReg() || BaseOp1.isFI()) &&
5414 "Only base registers and frame indices are supported.");
5415
5416 // Check for both base regs and base FI.
5417 if (BaseOp1.isReg() && BaseOp1.getReg() != BaseOp2.getReg())
5418 return false;
5419
5420 // Only cluster up to a single pair.
5421 if (ClusterSize > 2)
5422 return false;
5423
5424 if (!isPairableLdStInst(FirstLdSt) || !isPairableLdStInst(SecondLdSt))
5425 return false;
5426
5427 // Can we pair these instructions based on their opcodes?
5428 unsigned FirstOpc = FirstLdSt.getOpcode();
5429 unsigned SecondOpc = SecondLdSt.getOpcode();
5430 if (!canPairLdStOpc(FirstOpc, SecondOpc))
5431 return false;
5432
5433 // Can't merge volatiles or load/stores that have a hint to avoid pair
5434 // formation, for example.
5435 if (!isCandidateToMergeOrPair(FirstLdSt) ||
5436 !isCandidateToMergeOrPair(SecondLdSt))
5437 return false;
5438
5439 // isCandidateToMergeOrPair guarantees that operand 2 is an immediate.
5440 int64_t Offset1 = FirstLdSt.getOperand(2).getImm();
5441 if (hasUnscaledLdStOffset(FirstOpc) && !scaleOffset(FirstOpc, Offset1))
5442 return false;
5443
5444 int64_t Offset2 = SecondLdSt.getOperand(2).getImm();
5445 if (hasUnscaledLdStOffset(SecondOpc) && !scaleOffset(SecondOpc, Offset2))
5446 return false;
5447
5448 // Pairwise instructions have a 7-bit signed offset field.
5449 if (Offset1 > 63 || Offset1 < -64)
5450 return false;
5451
5452 // The caller should already have ordered First/SecondLdSt by offset
5453 // (except when the frame-index bases are not equal).
5454 if (BaseOp1.isFI()) {
5455 assert((!BaseOp1.isIdenticalTo(BaseOp2) || Offset1 <= Offset2) &&
5456 "Caller should have ordered offsets.");
5457
5458 const MachineFrameInfo &MFI =
5459 FirstLdSt.getParent()->getParent()->getFrameInfo();
5460 return shouldClusterFI(MFI, BaseOp1.getIndex(), Offset1, FirstOpc,
5461 BaseOp2.getIndex(), Offset2, SecondOpc);
5462 }
5463
5464 assert(Offset1 <= Offset2 && "Caller should have ordered offsets.");
5465
5466 return Offset1 + 1 == Offset2;
5467}
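// For instance (hypothetical registers), "ldr x1, [x0, #8]" followed by
// "ldr x2, [x0, #16]" (LDRXui, element offsets 1 and 2) satisfies
// Offset1 + 1 == Offset2 and can later be rewritten as
// "ldp x1, x2, [x0, #8]".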
5468
5469 static const MachineInstrBuilder &AddSubReg(const MachineInstrBuilder &MIB,
5470 MCRegister Reg, unsigned SubIdx,
5471 RegState State,
5472 const TargetRegisterInfo *TRI) {
5473 if (!SubIdx)
5474 return MIB.addReg(Reg, State);
5475
5476 if (Reg.isPhysical())
5477 return MIB.addReg(TRI->getSubReg(Reg, SubIdx), State);
5478 return MIB.addReg(Reg, State, SubIdx);
5479}
5480
5481static bool forwardCopyWillClobberTuple(unsigned DestReg, unsigned SrcReg,
5482 unsigned NumRegs) {
5483 // We really want the positive remainder mod 32 here, which happens to be
5484 // easily obtainable with a mask.
5485 return ((DestReg - SrcReg) & 0x1f) < NumRegs;
5486}
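// For instance, copying a two-register tuple from {d0, d1} to {d1, d2} gives
// ((1 - 0) & 0x1f) == 1 < 2, so a forward sub-register copy would overwrite
// d1 before it is read as a source; copyPhysRegTuple below then iterates in
// reverse instead.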
5487
5488 void AArch64InstrInfo::copyPhysRegTuple(MachineBasicBlock &MBB,
5489 MachineBasicBlock::iterator I,
5490 const DebugLoc &DL, MCRegister DestReg,
5491 MCRegister SrcReg, bool KillSrc,
5492 unsigned Opcode,
5493 ArrayRef<unsigned> Indices) const {
5494 assert(Subtarget.hasNEON() && "Unexpected register copy without NEON");
5495 const TargetRegisterInfo *TRI = &getRegisterInfo();
5496 uint16_t DestEncoding = TRI->getEncodingValue(DestReg);
5497 uint16_t SrcEncoding = TRI->getEncodingValue(SrcReg);
5498 unsigned NumRegs = Indices.size();
5499
5500 int SubReg = 0, End = NumRegs, Incr = 1;
5501 if (forwardCopyWillClobberTuple(DestEncoding, SrcEncoding, NumRegs)) {
5502 SubReg = NumRegs - 1;
5503 End = -1;
5504 Incr = -1;
5505 }
5506
5507 for (; SubReg != End; SubReg += Incr) {
5508 const MachineInstrBuilder MIB = BuildMI(MBB, I, DL, get(Opcode));
5509 AddSubReg(MIB, DestReg, Indices[SubReg], RegState::Define, TRI);
5510 AddSubReg(MIB, SrcReg, Indices[SubReg], {}, TRI);
5511 AddSubReg(MIB, SrcReg, Indices[SubReg], getKillRegState(KillSrc), TRI);
5512 }
5513}
5514
5515 void AArch64InstrInfo::copyGPRRegTuple(MachineBasicBlock &MBB,
5516 MachineBasicBlock::iterator I,
5517 const DebugLoc &DL, MCRegister DestReg,
5518 MCRegister SrcReg, bool KillSrc,
5519 unsigned Opcode, unsigned ZeroReg,
5520 llvm::ArrayRef<unsigned> Indices) const {
5522 unsigned NumRegs = Indices.size();
5523
5524#ifndef NDEBUG
5525 uint16_t DestEncoding = TRI->getEncodingValue(DestReg);
5526 uint16_t SrcEncoding = TRI->getEncodingValue(SrcReg);
5527 assert(DestEncoding % NumRegs == 0 && SrcEncoding % NumRegs == 0 &&
5528 "GPR reg sequences should not be able to overlap");
5529#endif
5530
5531 for (unsigned SubReg = 0; SubReg != NumRegs; ++SubReg) {
5532 const MachineInstrBuilder MIB = BuildMI(MBB, I, DL, get(Opcode));
5533 AddSubReg(MIB, DestReg, Indices[SubReg], RegState::Define, TRI);
5534 MIB.addReg(ZeroReg);
5535 AddSubReg(MIB, SrcReg, Indices[SubReg], getKillRegState(KillSrc), TRI);
5536 MIB.addImm(0);
5537 }
5538}
5539
5540 /// Returns true if the instruction at I is in a call site streaming region,
5541 /// considering only the containing basic block.
5542 /// A "call site streaming region" starts after the smstart and ends at the
5543 /// smstop that bracket a call to a streaming function. This walks backward from I.
5544 static bool isInStreamingCallSiteRegion(MachineBasicBlock &MBB,
5545 MachineBasicBlock::iterator I) {
5546 MachineFunction &MF = *MBB.getParent();
5547 AArch64FunctionInfo *AFI = MF.getInfo<AArch64FunctionInfo>();
5548 if (!AFI->hasStreamingModeChanges())
5549 return false;
5550 // Walk backwards to find smstart/smstop
5551 for (MachineInstr &MI : reverse(make_range(MBB.begin(), I))) {
5552 unsigned Opc = MI.getOpcode();
5553 if (Opc == AArch64::MSRpstatesvcrImm1 || Opc == AArch64::MSRpstatePseudo) {
5554 // Check if this is an SM change (not a ZA-only change).
5555 int64_t PState = MI.getOperand(0).getImm();
5556 if (PState == AArch64SVCR::SVCRSM || PState == AArch64SVCR::SVCRSMZA) {
5557 // Operand 1 is 1 for start, 0 for stop
5558 return MI.getOperand(1).getImm() == 1;
5559 }
5560 }
5561 }
5562 return false;
5563}
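// Illustrative scenario (not from the source): for "smstart sm; bl callee;
// smstop sm" within a single block, a COPY expanded between the smstart and
// the call lies inside the streaming region, so without +sme-fa64 the
// NEON-free expansions below must be used.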
5564
5565 /// Returns true if I lies in a call site streaming region and SME-FA64 is unavailable.
5566static bool mustAvoidNeonAtMBBI(const AArch64Subtarget &Subtarget,
5567 MachineBasicBlock &MBB,
5568 MachineBasicBlock::iterator I) {
5569 return !Subtarget.hasSMEFA64() && isInStreamingCallSiteRegion(MBB, I);
5570}
5571
5572 void AArch64InstrInfo::copyPhysReg(MachineBasicBlock &MBB,
5573 MachineBasicBlock::iterator I,
5574 const DebugLoc &DL, Register DestReg,
5575 Register SrcReg, bool KillSrc,
5576 bool RenamableDest,
5577 bool RenamableSrc) const {
5578 ++NumCopyInstrs;
5579 if (AArch64::GPR32spRegClass.contains(DestReg) &&
5580 AArch64::GPR32spRegClass.contains(SrcReg)) {
5581 if (DestReg == AArch64::WSP || SrcReg == AArch64::WSP) {
5582 // If either operand is WSP, expand to ADD #0.
5583 if (Subtarget.hasZeroCycleRegMoveGPR64() &&
5584 !Subtarget.hasZeroCycleRegMoveGPR32()) {
5585 // Cyclone recognizes "ADD Xd, Xn, #0" as a zero-cycle register move.
5586 MCRegister DestRegX = RI.getMatchingSuperReg(DestReg, AArch64::sub_32,
5587 &AArch64::GPR64spRegClass);
5588 MCRegister SrcRegX = RI.getMatchingSuperReg(SrcReg, AArch64::sub_32,
5589 &AArch64::GPR64spRegClass);
5590 // This instruction is reading and writing X registers. This may upset
5591 // the register scavenger and machine verifier, so we need to indicate
5592 // that we are reading an undefined value from SrcRegX, but a proper
5593 // value from SrcReg.
5594 BuildMI(MBB, I, DL, get(AArch64::ADDXri), DestRegX)
5595 .addReg(SrcRegX, RegState::Undef)
5596 .addImm(0)
5597 .addImm(AArch64_AM::getShifterImm(AArch64_AM::LSL, 0))
5598 .addReg(SrcReg, RegState::Implicit | getKillRegState(KillSrc));
5599 ++NumZCRegMoveInstrsGPR;
5600 } else {
5601 BuildMI(MBB, I, DL, get(AArch64::ADDWri), DestReg)
5602 .addReg(SrcReg, getKillRegState(KillSrc))
5603 .addImm(0)
5604 .addImm(AArch64_AM::getShifterImm(AArch64_AM::LSL, 0));
5605 if (Subtarget.hasZeroCycleRegMoveGPR32())
5606 ++NumZCRegMoveInstrsGPR;
5607 }
5608 } else if (Subtarget.hasZeroCycleRegMoveGPR64() &&
5609 !Subtarget.hasZeroCycleRegMoveGPR32()) {
5610 // Cyclone recognizes "ORR Xd, XZR, Xm" as a zero-cycle register move.
5611 MCRegister DestRegX = RI.getMatchingSuperReg(DestReg, AArch64::sub_32,
5612 &AArch64::GPR64spRegClass);
5613 assert(DestRegX.isValid() && "Destination super-reg not valid");
5614 MCRegister SrcRegX = RI.getMatchingSuperReg(SrcReg, AArch64::sub_32,
5615 &AArch64::GPR64spRegClass);
5616 assert(SrcRegX.isValid() && "Source super-reg not valid");
5617 // This instruction is reading and writing X registers. This may upset
5618 // the register scavenger and machine verifier, so we need to indicate
5619 // that we are reading an undefined value from SrcRegX, but a proper
5620 // value from SrcReg.
5621 BuildMI(MBB, I, DL, get(AArch64::ORRXrr), DestRegX)
5622 .addReg(AArch64::XZR)
5623 .addReg(SrcRegX, RegState::Undef)
5624 .addReg(SrcReg, RegState::Implicit | getKillRegState(KillSrc));
5625 ++NumZCRegMoveInstrsGPR;
5626 } else {
5627 // Otherwise, expand to ORR WZR.
5628 BuildMI(MBB, I, DL, get(AArch64::ORRWrr), DestReg)
5629 .addReg(AArch64::WZR)
5630 .addReg(SrcReg, getKillRegState(KillSrc));
5631 if (Subtarget.hasZeroCycleRegMoveGPR32())
5632 ++NumZCRegMoveInstrsGPR;
5633 }
5634 return;
5635 }
5636
5637 // GPR32 zeroing
5638 if (AArch64::GPR32spRegClass.contains(DestReg) && SrcReg == AArch64::WZR) {
5639 if (Subtarget.hasZeroCycleZeroingGPR64() &&
5640 !Subtarget.hasZeroCycleZeroingGPR32()) {
5641 MCRegister DestRegX = RI.getMatchingSuperReg(DestReg, AArch64::sub_32,
5642 &AArch64::GPR64spRegClass);
5643 assert(DestRegX.isValid() && "Destination super-reg not valid");
5644 BuildMI(MBB, I, DL, get(AArch64::MOVZXi), DestRegX)
5645 .addImm(0)
5646 .addImm(AArch64_AM::getShifterImm(AArch64_AM::LSL, 0));
5647 ++NumZCZeroingInstrsGPR;
5648 } else if (Subtarget.hasZeroCycleZeroingGPR32()) {
5649 BuildMI(MBB, I, DL, get(AArch64::MOVZWi), DestReg)
5650 .addImm(0)
5651 .addImm(AArch64_AM::getShifterImm(AArch64_AM::LSL, 0));
5652 ++NumZCZeroingInstrsGPR;
5653 } else {
5654 BuildMI(MBB, I, DL, get(AArch64::ORRWrr), DestReg)
5655 .addReg(AArch64::WZR)
5656 .addReg(AArch64::WZR);
5657 }
5658 return;
5659 }
5660
5661 if (AArch64::GPR64spRegClass.contains(DestReg) &&
5662 AArch64::GPR64spRegClass.contains(SrcReg)) {
5663 if (DestReg == AArch64::SP || SrcReg == AArch64::SP) {
5664 // If either operand is SP, expand to ADD #0.
5665 BuildMI(MBB, I, DL, get(AArch64::ADDXri), DestReg)
5666 .addReg(SrcReg, getKillRegState(KillSrc))
5667 .addImm(0)
5668 .addImm(AArch64_AM::getShifterImm(AArch64_AM::LSL, 0));
5669 if (Subtarget.hasZeroCycleRegMoveGPR64())
5670 ++NumZCRegMoveInstrsGPR;
5671 } else {
5672 // Otherwise, expand to ORR XZR.
5673 BuildMI(MBB, I, DL, get(AArch64::ORRXrr), DestReg)
5674 .addReg(AArch64::XZR)
5675 .addReg(SrcReg, getKillRegState(KillSrc));
5676 if (Subtarget.hasZeroCycleRegMoveGPR64())
5677 ++NumZCRegMoveInstrsGPR;
5678 }
5679 return;
5680 }
5681
5682 // GPR64 zeroing
5683 if (AArch64::GPR64spRegClass.contains(DestReg) && SrcReg == AArch64::XZR) {
5684 if (Subtarget.hasZeroCycleZeroingGPR64()) {
5685 BuildMI(MBB, I, DL, get(AArch64::MOVZXi), DestReg)
5686 .addImm(0)
5687 .addImm(AArch64_AM::getShifterImm(AArch64_AM::LSL, 0));
5688 ++NumZCZeroingInstrsGPR;
5689 } else {
5690 BuildMI(MBB, I, DL, get(AArch64::ORRXrr), DestReg)
5691 .addReg(AArch64::XZR)
5692 .addReg(AArch64::XZR);
5693 }
5694 return;
5695 }
5696
5697 // Copy a Predicate register by ORRing with itself.
5698 if (AArch64::PPRRegClass.contains(DestReg) &&
5699 AArch64::PPRRegClass.contains(SrcReg)) {
5700 assert(Subtarget.isSVEorStreamingSVEAvailable() &&
5701 "Unexpected SVE register.");
5702 BuildMI(MBB, I, DL, get(AArch64::ORR_PPzPP), DestReg)
5703 .addReg(SrcReg) // Pg
5704 .addReg(SrcReg)
5705 .addReg(SrcReg, getKillRegState(KillSrc));
5706 return;
5707 }
5708
5709 // Copy a predicate-as-counter register by ORRing with itself as if it
5710 // were a regular predicate (mask) register.
5711 bool DestIsPNR = AArch64::PNRRegClass.contains(DestReg);
5712 bool SrcIsPNR = AArch64::PNRRegClass.contains(SrcReg);
5713 if (DestIsPNR || SrcIsPNR) {
5714 auto ToPPR = [](MCRegister R) -> MCRegister {
5715 return (R - AArch64::PN0) + AArch64::P0;
5716 };
5717 MCRegister PPRSrcReg = SrcIsPNR ? ToPPR(SrcReg) : SrcReg.asMCReg();
5718 MCRegister PPRDestReg = DestIsPNR ? ToPPR(DestReg) : DestReg.asMCReg();
5719
5720 if (PPRSrcReg != PPRDestReg) {
5721 auto NewMI = BuildMI(MBB, I, DL, get(AArch64::ORR_PPzPP), PPRDestReg)
5722 .addReg(PPRSrcReg) // Pg
5723 .addReg(PPRSrcReg)
5724 .addReg(PPRSrcReg, getKillRegState(KillSrc));
5725 if (DestIsPNR)
5726 NewMI.addDef(DestReg, RegState::Implicit);
5727 }
5728 return;
5729 }
5730
5731 // Copy a Z register by ORRing with itself.
5732 if (AArch64::ZPRRegClass.contains(DestReg) &&
5733 AArch64::ZPRRegClass.contains(SrcReg)) {
5734 assert(Subtarget.isSVEorStreamingSVEAvailable() &&
5735 "Unexpected SVE register.");
5736 BuildMI(MBB, I, DL, get(AArch64::ORR_ZZZ), DestReg)
5737 .addReg(SrcReg)
5738 .addReg(SrcReg, getKillRegState(KillSrc));
5739 return;
5740 }
5741
5742 // Copy a Z register pair by copying the individual sub-registers.
5743 if ((AArch64::ZPR2RegClass.contains(DestReg) ||
5744 AArch64::ZPR2StridedOrContiguousRegClass.contains(DestReg)) &&
5745 (AArch64::ZPR2RegClass.contains(SrcReg) ||
5746 AArch64::ZPR2StridedOrContiguousRegClass.contains(SrcReg))) {
5747 assert(Subtarget.isSVEorStreamingSVEAvailable() &&
5748 "Unexpected SVE register.");
5749 static const unsigned Indices[] = {AArch64::zsub0, AArch64::zsub1};
5750 copyPhysRegTuple(MBB, I, DL, DestReg, SrcReg, KillSrc, AArch64::ORR_ZZZ,
5751 Indices);
5752 return;
5753 }
5754
5755 // Copy a Z register triple by copying the individual sub-registers.
5756 if (AArch64::ZPR3RegClass.contains(DestReg) &&
5757 AArch64::ZPR3RegClass.contains(SrcReg)) {
5758 assert(Subtarget.isSVEorStreamingSVEAvailable() &&
5759 "Unexpected SVE register.");
5760 static const unsigned Indices[] = {AArch64::zsub0, AArch64::zsub1,
5761 AArch64::zsub2};
5762 copyPhysRegTuple(MBB, I, DL, DestReg, SrcReg, KillSrc, AArch64::ORR_ZZZ,
5763 Indices);
5764 return;
5765 }
5766
5767 // Copy a Z register quad by copying the individual sub-registers.
5768 if ((AArch64::ZPR4RegClass.contains(DestReg) ||
5769 AArch64::ZPR4StridedOrContiguousRegClass.contains(DestReg)) &&
5770 (AArch64::ZPR4RegClass.contains(SrcReg) ||
5771 AArch64::ZPR4StridedOrContiguousRegClass.contains(SrcReg))) {
5772 assert(Subtarget.isSVEorStreamingSVEAvailable() &&
5773 "Unexpected SVE register.");
5774 static const unsigned Indices[] = {AArch64::zsub0, AArch64::zsub1,
5775 AArch64::zsub2, AArch64::zsub3};
5776 copyPhysRegTuple(MBB, I, DL, DestReg, SrcReg, KillSrc, AArch64::ORR_ZZZ,
5777 Indices);
5778 return;
5779 }
5780
5781 // Copy a DDDD register quad by copying the individual sub-registers.
5782 if (AArch64::DDDDRegClass.contains(DestReg) &&
5783 AArch64::DDDDRegClass.contains(SrcReg)) {
5784 static const unsigned Indices[] = {AArch64::dsub0, AArch64::dsub1,
5785 AArch64::dsub2, AArch64::dsub3};
5786 copyPhysRegTuple(MBB, I, DL, DestReg, SrcReg, KillSrc, AArch64::ORRv8i8,
5787 Indices);
5788 return;
5789 }
5790
5791 // Copy a DDD register triple by copying the individual sub-registers.
5792 if (AArch64::DDDRegClass.contains(DestReg) &&
5793 AArch64::DDDRegClass.contains(SrcReg)) {
5794 static const unsigned Indices[] = {AArch64::dsub0, AArch64::dsub1,
5795 AArch64::dsub2};
5796 copyPhysRegTuple(MBB, I, DL, DestReg, SrcReg, KillSrc, AArch64::ORRv8i8,
5797 Indices);
5798 return;
5799 }
5800
5801 // Copy a DD register pair by copying the individual sub-registers.
5802 if (AArch64::DDRegClass.contains(DestReg) &&
5803 AArch64::DDRegClass.contains(SrcReg)) {
5804 static const unsigned Indices[] = {AArch64::dsub0, AArch64::dsub1};
5805 copyPhysRegTuple(MBB, I, DL, DestReg, SrcReg, KillSrc, AArch64::ORRv8i8,
5806 Indices);
5807 return;
5808 }
5809
5810 // Copy a QQQQ register quad by copying the individual sub-registers.
5811 if (AArch64::QQQQRegClass.contains(DestReg) &&
5812 AArch64::QQQQRegClass.contains(SrcReg)) {
5813 static const unsigned Indices[] = {AArch64::qsub0, AArch64::qsub1,
5814 AArch64::qsub2, AArch64::qsub3};
5815 copyPhysRegTuple(MBB, I, DL, DestReg, SrcReg, KillSrc, AArch64::ORRv16i8,
5816 Indices);
5817 return;
5818 }
5819
5820 // Copy a QQQ register triple by copying the individual sub-registers.
5821 if (AArch64::QQQRegClass.contains(DestReg) &&
5822 AArch64::QQQRegClass.contains(SrcReg)) {
5823 static const unsigned Indices[] = {AArch64::qsub0, AArch64::qsub1,
5824 AArch64::qsub2};
5825 copyPhysRegTuple(MBB, I, DL, DestReg, SrcReg, KillSrc, AArch64::ORRv16i8,
5826 Indices);
5827 return;
5828 }
5829
5830 // Copy a QQ register pair by copying the individual sub-registers.
5831 if (AArch64::QQRegClass.contains(DestReg) &&
5832 AArch64::QQRegClass.contains(SrcReg)) {
5833 static const unsigned Indices[] = {AArch64::qsub0, AArch64::qsub1};
5834 copyPhysRegTuple(MBB, I, DL, DestReg, SrcReg, KillSrc, AArch64::ORRv16i8,
5835 Indices);
5836 return;
5837 }
5838
5839 if (AArch64::XSeqPairsClassRegClass.contains(DestReg) &&
5840 AArch64::XSeqPairsClassRegClass.contains(SrcReg)) {
5841 static const unsigned Indices[] = {AArch64::sube64, AArch64::subo64};
5842 copyGPRRegTuple(MBB, I, DL, DestReg, SrcReg, KillSrc, AArch64::ORRXrs,
5843 AArch64::XZR, Indices);
5844 return;
5845 }
5846
5847 if (AArch64::WSeqPairsClassRegClass.contains(DestReg) &&
5848 AArch64::WSeqPairsClassRegClass.contains(SrcReg)) {
5849 static const unsigned Indices[] = {AArch64::sube32, AArch64::subo32};
5850 copyGPRRegTuple(MBB, I, DL, DestReg, SrcReg, KillSrc, AArch64::ORRWrs,
5851 AArch64::WZR, Indices);
5852 return;
5853 }
5854
5855 if (AArch64::FPR128RegClass.contains(DestReg) &&
5856 AArch64::FPR128RegClass.contains(SrcReg)) {
5857 // In streaming regions, NEON is illegal but streaming-SVE is available.
5858 // Use SVE for copies if we're in a streaming region and SME is available.
5859 // With +sme-fa64, NEON is legal in streaming mode so we can use it.
5860 if ((Subtarget.isSVEorStreamingSVEAvailable() &&
5861 !Subtarget.isNeonAvailable()) ||
5862 mustAvoidNeonAtMBBI(Subtarget, MBB, I)) {
5863 BuildMI(MBB, I, DL, get(AArch64::ORR_ZZZ))
5864 .addReg(AArch64::Z0 + (DestReg - AArch64::Q0), RegState::Define)
5865 .addReg(AArch64::Z0 + (SrcReg - AArch64::Q0))
5866 .addReg(AArch64::Z0 + (SrcReg - AArch64::Q0));
5867 } else if (Subtarget.isNeonAvailable()) {
5868 BuildMI(MBB, I, DL, get(AArch64::ORRv16i8), DestReg)
5869 .addReg(SrcReg)
5870 .addReg(SrcReg, getKillRegState(KillSrc));
5871 if (Subtarget.hasZeroCycleRegMoveFPR128())
5872 ++NumZCRegMoveInstrsFPR;
5873 } else {
5874 BuildMI(MBB, I, DL, get(AArch64::STRQpre))
5875 .addReg(AArch64::SP, RegState::Define)
5876 .addReg(SrcReg, getKillRegState(KillSrc))
5877 .addReg(AArch64::SP)
5878 .addImm(-16);
5879 BuildMI(MBB, I, DL, get(AArch64::LDRQpost))
5880 .addReg(AArch64::SP, RegState::Define)
5881 .addReg(DestReg, RegState::Define)
5882 .addReg(AArch64::SP)
5883 .addImm(16);
5884 }
5885 return;
5886 }
5887
5888 if (AArch64::FPR64RegClass.contains(DestReg) &&
5889 AArch64::FPR64RegClass.contains(SrcReg)) {
5890 if (Subtarget.hasZeroCycleRegMoveFPR128() &&
5891 !Subtarget.hasZeroCycleRegMoveFPR64() &&
5892 !Subtarget.hasZeroCycleRegMoveFPR32() && Subtarget.isNeonAvailable() &&
5893 !mustAvoidNeonAtMBBI(Subtarget, MBB, I)) {
5894 MCRegister DestRegQ = RI.getMatchingSuperReg(DestReg, AArch64::dsub,
5895 &AArch64::FPR128RegClass);
5896 MCRegister SrcRegQ = RI.getMatchingSuperReg(SrcReg, AArch64::dsub,
5897 &AArch64::FPR128RegClass);
5898 // This instruction is reading and writing Q registers. This may upset
5899 // the register scavenger and machine verifier, so we need to indicate
5900 // that we are reading an undefined value from SrcRegQ, but a proper
5901 // value from SrcReg.
5902 BuildMI(MBB, I, DL, get(AArch64::ORRv16i8), DestRegQ)
5903 .addReg(SrcRegQ, RegState::Undef)
5904 .addReg(SrcRegQ, RegState::Undef)
5905 .addReg(SrcReg, RegState::Implicit | getKillRegState(KillSrc));
5906 ++NumZCRegMoveInstrsFPR;
5907 } else {
5908 BuildMI(MBB, I, DL, get(AArch64::FMOVDr), DestReg)
5909 .addReg(SrcReg, getKillRegState(KillSrc));
5910 if (Subtarget.hasZeroCycleRegMoveFPR64())
5911 ++NumZCRegMoveInstrsFPR;
5912 }
5913 return;
5914 }
5915
5916 if (AArch64::FPR32RegClass.contains(DestReg) &&
5917 AArch64::FPR32RegClass.contains(SrcReg)) {
5918 if (Subtarget.hasZeroCycleRegMoveFPR128() &&
5919 !Subtarget.hasZeroCycleRegMoveFPR64() &&
5920 !Subtarget.hasZeroCycleRegMoveFPR32() && Subtarget.isNeonAvailable() &&
5921 !mustAvoidNeonAtMBBI(Subtarget, MBB, I)) {
5922 MCRegister DestRegQ = RI.getMatchingSuperReg(DestReg, AArch64::ssub,
5923 &AArch64::FPR128RegClass);
5924 MCRegister SrcRegQ = RI.getMatchingSuperReg(SrcReg, AArch64::ssub,
5925 &AArch64::FPR128RegClass);
5926 // This instruction is reading and writing Q registers. This may upset
5927 // the register scavenger and machine verifier, so we need to indicate
5928 // that we are reading an undefined value from SrcRegQ, but a proper
5929 // value from SrcReg.
5930 BuildMI(MBB, I, DL, get(AArch64::ORRv16i8), DestRegQ)
5931 .addReg(SrcRegQ, RegState::Undef)
5932 .addReg(SrcRegQ, RegState::Undef)
5933 .addReg(SrcReg, RegState::Implicit | getKillRegState(KillSrc));
5934 ++NumZCRegMoveInstrsFPR;
5935 } else if (Subtarget.hasZeroCycleRegMoveFPR64() &&
5936 !Subtarget.hasZeroCycleRegMoveFPR32()) {
5937 MCRegister DestRegD = RI.getMatchingSuperReg(DestReg, AArch64::ssub,
5938 &AArch64::FPR64RegClass);
5939 MCRegister SrcRegD = RI.getMatchingSuperReg(SrcReg, AArch64::ssub,
5940 &AArch64::FPR64RegClass);
5941 // This instruction is reading and writing D registers. This may upset
5942 // the register scavenger and machine verifier, so we need to indicate
5943 // that we are reading an undefined value from SrcRegD, but a proper
5944 // value from SrcReg.
5945 BuildMI(MBB, I, DL, get(AArch64::FMOVDr), DestRegD)
5946 .addReg(SrcRegD, RegState::Undef)
5947 .addReg(SrcReg, RegState::Implicit | getKillRegState(KillSrc));
5948 ++NumZCRegMoveInstrsFPR;
5949 } else {
5950 BuildMI(MBB, I, DL, get(AArch64::FMOVSr), DestReg)
5951 .addReg(SrcReg, getKillRegState(KillSrc));
5952 if (Subtarget.hasZeroCycleRegMoveFPR32())
5953 ++NumZCRegMoveInstrsFPR;
5954 }
5955 return;
5956 }
5957
5958 if (AArch64::FPR16RegClass.contains(DestReg) &&
5959 AArch64::FPR16RegClass.contains(SrcReg)) {
5960 if (Subtarget.hasZeroCycleRegMoveFPR128() &&
5961 !Subtarget.hasZeroCycleRegMoveFPR64() &&
5962 !Subtarget.hasZeroCycleRegMoveFPR32() && Subtarget.isNeonAvailable() &&
5963 !mustAvoidNeonAtMBBI(Subtarget, MBB, I)) {
5964 MCRegister DestRegQ = RI.getMatchingSuperReg(DestReg, AArch64::hsub,
5965 &AArch64::FPR128RegClass);
5966 MCRegister SrcRegQ = RI.getMatchingSuperReg(SrcReg, AArch64::hsub,
5967 &AArch64::FPR128RegClass);
5968 // This instruction is reading and writing Q registers. This may upset
5969 // the register scavenger and machine verifier, so we need to indicate
5970 // that we are reading an undefined value from SrcRegQ, but a proper
5971 // value from SrcReg.
5972 BuildMI(MBB, I, DL, get(AArch64::ORRv16i8), DestRegQ)
5973 .addReg(SrcRegQ, RegState::Undef)
5974 .addReg(SrcRegQ, RegState::Undef)
5975 .addReg(SrcReg, RegState::Implicit | getKillRegState(KillSrc));
5976 } else if (Subtarget.hasZeroCycleRegMoveFPR64() &&
5977 !Subtarget.hasZeroCycleRegMoveFPR32()) {
5978 MCRegister DestRegD = RI.getMatchingSuperReg(DestReg, AArch64::hsub,
5979 &AArch64::FPR64RegClass);
5980 MCRegister SrcRegD = RI.getMatchingSuperReg(SrcReg, AArch64::hsub,
5981 &AArch64::FPR64RegClass);
5982 // This instruction is reading and writing D registers. This may upset
5983 // the register scavenger and machine verifier, so we need to indicate
5984 // that we are reading an undefined value from SrcRegD, but a proper
5985 // value from SrcReg.
5986 BuildMI(MBB, I, DL, get(AArch64::FMOVDr), DestRegD)
5987 .addReg(SrcRegD, RegState::Undef)
5988 .addReg(SrcReg, RegState::Implicit | getKillRegState(KillSrc));
5989 } else {
5990 DestReg = RI.getMatchingSuperReg(DestReg, AArch64::hsub,
5991 &AArch64::FPR32RegClass);
5992 SrcReg = RI.getMatchingSuperReg(SrcReg, AArch64::hsub,
5993 &AArch64::FPR32RegClass);
5994 BuildMI(MBB, I, DL, get(AArch64::FMOVSr), DestReg)
5995 .addReg(SrcReg, getKillRegState(KillSrc));
5996 }
5997 return;
5998 }
5999
6000 if (AArch64::FPR8RegClass.contains(DestReg) &&
6001 AArch64::FPR8RegClass.contains(SrcReg)) {
6002 if (Subtarget.hasZeroCycleRegMoveFPR128() &&
6003 !Subtarget.hasZeroCycleRegMoveFPR64() &&
6004 !Subtarget.hasZeroCycleRegMoveFPR32() && Subtarget.isNeonAvailable() &&
6005 !mustAvoidNeonAtMBBI(Subtarget, MBB, I)) {
6006 MCRegister DestRegQ = RI.getMatchingSuperReg(DestReg, AArch64::bsub,
6007 &AArch64::FPR128RegClass);
6008 MCRegister SrcRegQ = RI.getMatchingSuperReg(SrcReg, AArch64::bsub,
6009 &AArch64::FPR128RegClass);
6010 // This instruction is reading and writing Q registers. This may upset
6011 // the register scavenger and machine verifier, so we need to indicate
6012 // that we are reading an undefined value from SrcRegQ, but a proper
6013 // value from SrcReg.
6014 BuildMI(MBB, I, DL, get(AArch64::ORRv16i8), DestRegQ)
6015 .addReg(SrcRegQ, RegState::Undef)
6016 .addReg(SrcRegQ, RegState::Undef)
6017 .addReg(SrcReg, RegState::Implicit | getKillRegState(KillSrc));
6018 } else if (Subtarget.hasZeroCycleRegMoveFPR64() &&
6019 !Subtarget.hasZeroCycleRegMoveFPR32()) {
6020 MCRegister DestRegD = RI.getMatchingSuperReg(DestReg, AArch64::bsub,
6021 &AArch64::FPR64RegClass);
6022 MCRegister SrcRegD = RI.getMatchingSuperReg(SrcReg, AArch64::bsub,
6023 &AArch64::FPR64RegClass);
6024 // This instruction is reading and writing D registers. This may upset
6025 // the register scavenger and machine verifier, so we need to indicate
6026 // that we are reading an undefined value from SrcRegD, but a proper
6027 // value from SrcReg.
6028 BuildMI(MBB, I, DL, get(AArch64::FMOVDr), DestRegD)
6029 .addReg(SrcRegD, RegState::Undef)
6030 .addReg(SrcReg, RegState::Implicit | getKillRegState(KillSrc));
6031 } else {
6032 DestReg = RI.getMatchingSuperReg(DestReg, AArch64::bsub,
6033 &AArch64::FPR32RegClass);
6034 SrcReg = RI.getMatchingSuperReg(SrcReg, AArch64::bsub,
6035 &AArch64::FPR32RegClass);
6036 BuildMI(MBB, I, DL, get(AArch64::FMOVSr), DestReg)
6037 .addReg(SrcReg, getKillRegState(KillSrc));
6038 }
6039 return;
6040 }
6041
6042 // Copies between GPR64 and FPR64.
6043 if (AArch64::FPR64RegClass.contains(DestReg) &&
6044 AArch64::GPR64RegClass.contains(SrcReg)) {
6045 if (AArch64::XZR == SrcReg) {
6046 BuildMI(MBB, I, DL, get(AArch64::FMOVD0), DestReg);
6047 } else {
6048 BuildMI(MBB, I, DL, get(AArch64::FMOVXDr), DestReg)
6049 .addReg(SrcReg, getKillRegState(KillSrc));
6050 }
6051 return;
6052 }
6053 if (AArch64::GPR64RegClass.contains(DestReg) &&
6054 AArch64::FPR64RegClass.contains(SrcReg)) {
6055 BuildMI(MBB, I, DL, get(AArch64::FMOVDXr), DestReg)
6056 .addReg(SrcReg, getKillRegState(KillSrc));
6057 return;
6058 }
6059 // Copies between GPR32 and FPR32.
6060 if (AArch64::FPR32RegClass.contains(DestReg) &&
6061 AArch64::GPR32RegClass.contains(SrcReg)) {
6062 if (AArch64::WZR == SrcReg) {
6063 BuildMI(MBB, I, DL, get(AArch64::FMOVS0), DestReg);
6064 } else {
6065 BuildMI(MBB, I, DL, get(AArch64::FMOVWSr), DestReg)
6066 .addReg(SrcReg, getKillRegState(KillSrc));
6067 }
6068 return;
6069 }
6070 if (AArch64::GPR32RegClass.contains(DestReg) &&
6071 AArch64::FPR32RegClass.contains(SrcReg)) {
6072 BuildMI(MBB, I, DL, get(AArch64::FMOVSWr), DestReg)
6073 .addReg(SrcReg, getKillRegState(KillSrc));
6074 return;
6075 }
6076
6077 if (DestReg == AArch64::NZCV) {
6078 assert(AArch64::GPR64RegClass.contains(SrcReg) && "Invalid NZCV copy");
6079 BuildMI(MBB, I, DL, get(AArch64::MSR))
6080 .addImm(AArch64SysReg::NZCV)
6081 .addReg(SrcReg, getKillRegState(KillSrc))
6082 .addReg(AArch64::NZCV, RegState::Implicit | RegState::Define);
6083 return;
6084 }
6085
6086 if (SrcReg == AArch64::NZCV) {
6087 assert(AArch64::GPR64RegClass.contains(DestReg) && "Invalid NZCV copy");
6088 BuildMI(MBB, I, DL, get(AArch64::MRS), DestReg)
6089 .addImm(AArch64SysReg::NZCV)
6090 .addReg(AArch64::NZCV, RegState::Implicit | getKillRegState(KillSrc));
6091 return;
6092 }
6093
6094#ifndef NDEBUG
6095 errs() << RI.getRegAsmName(DestReg) << " = COPY " << RI.getRegAsmName(SrcReg)
6096 << "\n";
6097#endif
6098 llvm_unreachable("unimplemented reg-to-reg copy");
6099}
6100
6101 static void storeRegPairToStackSlot(const TargetRegisterInfo &TRI,
6102 MachineBasicBlock &MBB,
6103 MachineBasicBlock::iterator InsertBefore,
6104 const MCInstrDesc &MCID,
6105 Register SrcReg, bool IsKill,
6106 unsigned SubIdx0, unsigned SubIdx1, int FI,
6107 MachineMemOperand *MMO) {
6108 Register SrcReg0 = SrcReg;
6109 Register SrcReg1 = SrcReg;
6110 if (SrcReg.isPhysical()) {
6111 SrcReg0 = TRI.getSubReg(SrcReg, SubIdx0);
6112 SubIdx0 = 0;
6113 SrcReg1 = TRI.getSubReg(SrcReg, SubIdx1);
6114 SubIdx1 = 0;
6115 }
6116 BuildMI(MBB, InsertBefore, DebugLoc(), MCID)
6117 .addReg(SrcReg0, getKillRegState(IsKill), SubIdx0)
6118 .addReg(SrcReg1, getKillRegState(IsKill), SubIdx1)
6119 .addFrameIndex(FI)
6120 .addImm(0)
6121 .addMemOperand(MMO);
6122}
6123
6124 void AArch64InstrInfo::storeRegToStackSlot(MachineBasicBlock &MBB,
6125 MachineBasicBlock::iterator MBBI,
6126 Register SrcReg, bool isKill, int FI,
6127 const TargetRegisterClass *RC,
6128 Register VReg,
6129 MachineInstr::MIFlag Flags) const {
6130 MachineFunction &MF = *MBB.getParent();
6131 MachineFrameInfo &MFI = MF.getFrameInfo();
6132
6133 MachinePointerInfo PtrInfo = MachinePointerInfo::getFixedStack(MF, FI);
6134 MachineMemOperand *MMO =
6135 MF.getMachineMemOperand(PtrInfo, MachineMemOperand::MOStore,
6136 MFI.getObjectSize(FI), MFI.getObjectAlign(FI));
6137 unsigned Opc = 0;
6138 bool Offset = true;
6139 MCRegister PNRReg = MCRegister::NoRegister;
6140 unsigned StackID = TargetStackID::Default;
6141 switch (RI.getSpillSize(*RC)) {
6142 case 1:
6143 if (AArch64::FPR8RegClass.hasSubClassEq(RC))
6144 Opc = AArch64::STRBui;
6145 break;
6146 case 2: {
6147 if (AArch64::FPR16RegClass.hasSubClassEq(RC))
6148 Opc = AArch64::STRHui;
6149 else if (AArch64::PNRRegClass.hasSubClassEq(RC) ||
6150 AArch64::PPRRegClass.hasSubClassEq(RC)) {
6151 assert(Subtarget.isSVEorStreamingSVEAvailable() &&
6152 "Unexpected register store without SVE store instructions");
6153 Opc = AArch64::STR_PXI;
6154 StackID = TargetStackID::ScalableVector;
6155 }
6156 break;
6157 }
6158 case 4:
6159 if (AArch64::GPR32allRegClass.hasSubClassEq(RC)) {
6160 Opc = AArch64::STRWui;
6161 if (SrcReg.isVirtual())
6162 MF.getRegInfo().constrainRegClass(SrcReg, &AArch64::GPR32RegClass);
6163 else
6164 assert(SrcReg != AArch64::WSP);
6165 } else if (AArch64::FPR32RegClass.hasSubClassEq(RC))
6166 Opc = AArch64::STRSui;
6167 else if (AArch64::PPR2RegClass.hasSubClassEq(RC)) {
6168 Opc = AArch64::STR_PPXI;
6169 StackID = TargetStackID::ScalableVector;
6170 }
6171 break;
6172 case 8:
6173 if (AArch64::GPR64allRegClass.hasSubClassEq(RC)) {
6174 Opc = AArch64::STRXui;
6175 if (SrcReg.isVirtual())
6176 MF.getRegInfo().constrainRegClass(SrcReg, &AArch64::GPR64RegClass);
6177 else
6178 assert(SrcReg != AArch64::SP);
6179 } else if (AArch64::FPR64RegClass.hasSubClassEq(RC)) {
6180 Opc = AArch64::STRDui;
6181 } else if (AArch64::WSeqPairsClassRegClass.hasSubClassEq(RC)) {
6182 storeRegPairToStackSlot(getRegisterInfo(), MBB, MBBI,
6183 get(AArch64::STPWi), SrcReg, isKill,
6184 AArch64::sube32, AArch64::subo32, FI, MMO);
6185 return;
6186 }
6187 break;
6188 case 16:
6189 if (AArch64::FPR128RegClass.hasSubClassEq(RC))
6190 Opc = AArch64::STRQui;
6191 else if (AArch64::DDRegClass.hasSubClassEq(RC)) {
6192 assert(Subtarget.hasNEON() && "Unexpected register store without NEON");
6193 Opc = AArch64::ST1Twov1d;
6194 Offset = false;
6195 } else if (AArch64::XSeqPairsClassRegClass.hasSubClassEq(RC)) {
6196 storeRegPairToStackSlot(getRegisterInfo(), MBB, MBBI,
6197 get(AArch64::STPXi), SrcReg, isKill,
6198 AArch64::sube64, AArch64::subo64, FI, MMO);
6199 return;
6200 } else if (AArch64::ZPRRegClass.hasSubClassEq(RC)) {
6201 assert(Subtarget.isSVEorStreamingSVEAvailable() &&
6202 "Unexpected register store without SVE store instructions");
6203 Opc = AArch64::STR_ZXI;
6204 StackID = TargetStackID::ScalableVector;
6205 }
6206 break;
6207 case 24:
6208 if (AArch64::DDDRegClass.hasSubClassEq(RC)) {
6209 assert(Subtarget.hasNEON() && "Unexpected register store without NEON");
6210 Opc = AArch64::ST1Threev1d;
6211 Offset = false;
6212 }
6213 break;
6214 case 32:
6215 if (AArch64::DDDDRegClass.hasSubClassEq(RC)) {
6216 assert(Subtarget.hasNEON() && "Unexpected register store without NEON");
6217 Opc = AArch64::ST1Fourv1d;
6218 Offset = false;
6219 } else if (AArch64::QQRegClass.hasSubClassEq(RC)) {
6220 assert(Subtarget.hasNEON() && "Unexpected register store without NEON");
6221 Opc = AArch64::ST1Twov2d;
6222 Offset = false;
6223 } else if (AArch64::ZPR2StridedOrContiguousRegClass.hasSubClassEq(RC)) {
6224 assert(Subtarget.isSVEorStreamingSVEAvailable() &&
6225 "Unexpected register store without SVE store instructions");
6226 Opc = AArch64::STR_ZZXI_STRIDED_CONTIGUOUS;
6227 StackID = TargetStackID::ScalableVector;
6228 } else if (AArch64::ZPR2RegClass.hasSubClassEq(RC)) {
6229 assert(Subtarget.isSVEorStreamingSVEAvailable() &&
6230 "Unexpected register store without SVE store instructions");
6231 Opc = AArch64::STR_ZZXI;
6232 StackID = TargetStackID::ScalableVector;
6233 }
6234 break;
6235 case 48:
6236 if (AArch64::QQQRegClass.hasSubClassEq(RC)) {
6237 assert(Subtarget.hasNEON() && "Unexpected register store without NEON");
6238 Opc = AArch64::ST1Threev2d;
6239 Offset = false;
6240 } else if (AArch64::ZPR3RegClass.hasSubClassEq(RC)) {
6241 assert(Subtarget.isSVEorStreamingSVEAvailable() &&
6242 "Unexpected register store without SVE store instructions");
6243 Opc = AArch64::STR_ZZZXI;
6244 StackID = TargetStackID::ScalableVector;
6245 }
6246 break;
6247 case 64:
6248 if (AArch64::QQQQRegClass.hasSubClassEq(RC)) {
6249 assert(Subtarget.hasNEON() && "Unexpected register store without NEON");
6250 Opc = AArch64::ST1Fourv2d;
6251 Offset = false;
6252 } else if (AArch64::ZPR4StridedOrContiguousRegClass.hasSubClassEq(RC)) {
6253 assert(Subtarget.isSVEorStreamingSVEAvailable() &&
6254 "Unexpected register store without SVE store instructions");
6255 Opc = AArch64::STR_ZZZZXI_STRIDED_CONTIGUOUS;
6256 StackID = TargetStackID::ScalableVector;
6257 } else if (AArch64::ZPR4RegClass.hasSubClassEq(RC)) {
6258 assert(Subtarget.isSVEorStreamingSVEAvailable() &&
6259 "Unexpected register store without SVE store instructions");
6260 Opc = AArch64::STR_ZZZZXI;
6261 StackID = TargetStackID::ScalableVector;
6262 }
6263 break;
6264 }
6265 assert(Opc && "Unknown register class");
6266 MFI.setStackID(FI, StackID);
6267
6268 const MachineInstrBuilder MI = BuildMI(MBB, MBBI, DebugLoc(), get(Opc))
6269 .addReg(SrcReg, getKillRegState(isKill))
6270 .addFrameIndex(FI);
6271
6272 if (Offset)
6273 MI.addImm(0);
6274 if (PNRReg.isValid())
6275 MI.addDef(PNRReg, RegState::Implicit);
6276 MI.addMemOperand(MMO);
6277}
6278
6279 static void loadRegPairFromStackSlot(const TargetRegisterInfo &TRI,
6280 MachineBasicBlock &MBB,
6281 MachineBasicBlock::iterator InsertBefore,
6282 const MCInstrDesc &MCID,
6283 Register DestReg, unsigned SubIdx0,
6284 unsigned SubIdx1, int FI,
6285 MachineMemOperand *MMO) {
6286 Register DestReg0 = DestReg;
6287 Register DestReg1 = DestReg;
6288 bool IsUndef = true;
6289 if (DestReg.isPhysical()) {
6290 DestReg0 = TRI.getSubReg(DestReg, SubIdx0);
6291 SubIdx0 = 0;
6292 DestReg1 = TRI.getSubReg(DestReg, SubIdx1);
6293 SubIdx1 = 0;
6294 IsUndef = false;
6295 }
6296 BuildMI(MBB, InsertBefore, DebugLoc(), MCID)
6297 .addReg(DestReg0, RegState::Define | getUndefRegState(IsUndef), SubIdx0)
6298 .addReg(DestReg1, RegState::Define | getUndefRegState(IsUndef), SubIdx1)
6299 .addFrameIndex(FI)
6300 .addImm(0)
6301 .addMemOperand(MMO);
6302}
6303
6304 void AArch64InstrInfo::loadRegFromStackSlot(MachineBasicBlock &MBB,
6305 MachineBasicBlock::iterator MBBI,
6306 Register DestReg, int FI,
6307 const TargetRegisterClass *RC,
6308 Register VReg, unsigned SubReg,
6309 MachineInstr::MIFlag Flags) const {
6310 MachineFunction &MF = *MBB.getParent();
6311 MachineFrameInfo &MFI = MF.getFrameInfo();
6312 MachinePointerInfo PtrInfo = MachinePointerInfo::getFixedStack(MF, FI);
6313 MachineMemOperand *MMO =
6314 MF.getMachineMemOperand(PtrInfo, MachineMemOperand::MOLoad,
6315 MFI.getObjectSize(FI), MFI.getObjectAlign(FI));
6316
6317 unsigned Opc = 0;
6318 bool Offset = true;
6319 unsigned StackID = TargetStackID::Default;
6320 Register PNRReg = MCRegister::NoRegister;
6321 switch (TRI.getSpillSize(*RC)) {
6322 case 1:
6323 if (AArch64::FPR8RegClass.hasSubClassEq(RC))
6324 Opc = AArch64::LDRBui;
6325 break;
6326 case 2: {
6327 bool IsPNR = AArch64::PNRRegClass.hasSubClassEq(RC);
6328 if (AArch64::FPR16RegClass.hasSubClassEq(RC))
6329 Opc = AArch64::LDRHui;
6330 else if (IsPNR || AArch64::PPRRegClass.hasSubClassEq(RC)) {
6331 assert(Subtarget.isSVEorStreamingSVEAvailable() &&
6332 "Unexpected register load without SVE load instructions");
6333 if (IsPNR)
6334 PNRReg = DestReg;
6335 Opc = AArch64::LDR_PXI;
6336 StackID = TargetStackID::ScalableVector;
6337 }
6338 break;
6339 }
6340 case 4:
6341 if (AArch64::GPR32allRegClass.hasSubClassEq(RC)) {
6342 Opc = AArch64::LDRWui;
6343 if (DestReg.isVirtual())
6344 MF.getRegInfo().constrainRegClass(DestReg, &AArch64::GPR32RegClass);
6345 else
6346 assert(DestReg != AArch64::WSP);
6347 } else if (AArch64::FPR32RegClass.hasSubClassEq(RC))
6348 Opc = AArch64::LDRSui;
6349 else if (AArch64::PPR2RegClass.hasSubClassEq(RC)) {
6350 Opc = AArch64::LDR_PPXI;
6351 StackID = TargetStackID::ScalableVector;
6352 }
6353 break;
6354 case 8:
6355 if (AArch64::GPR64allRegClass.hasSubClassEq(RC)) {
6356 Opc = AArch64::LDRXui;
6357 if (DestReg.isVirtual())
6358 MF.getRegInfo().constrainRegClass(DestReg, &AArch64::GPR64RegClass);
6359 else
6360 assert(DestReg != AArch64::SP);
6361 } else if (AArch64::FPR64RegClass.hasSubClassEq(RC)) {
6362 Opc = AArch64::LDRDui;
6363 } else if (AArch64::WSeqPairsClassRegClass.hasSubClassEq(RC)) {
6364 loadRegPairFromStackSlot(getRegisterInfo(), MBB, MBBI,
6365 get(AArch64::LDPWi), DestReg, AArch64::sube32,
6366 AArch64::subo32, FI, MMO);
6367 return;
6368 }
6369 break;
6370 case 16:
6371 if (AArch64::FPR128RegClass.hasSubClassEq(RC))
6372 Opc = AArch64::LDRQui;
6373 else if (AArch64::DDRegClass.hasSubClassEq(RC)) {
6374 assert(Subtarget.hasNEON() && "Unexpected register load without NEON");
6375 Opc = AArch64::LD1Twov1d;
6376 Offset = false;
6377 } else if (AArch64::XSeqPairsClassRegClass.hasSubClassEq(RC)) {
6378 loadRegPairFromStackSlot(getRegisterInfo(), MBB, MBBI,
6379 get(AArch64::LDPXi), DestReg, AArch64::sube64,
6380 AArch64::subo64, FI, MMO);
6381 return;
6382 } else if (AArch64::ZPRRegClass.hasSubClassEq(RC)) {
6383 assert(Subtarget.isSVEorStreamingSVEAvailable() &&
6384 "Unexpected register load without SVE load instructions");
6385 Opc = AArch64::LDR_ZXI;
6386 StackID = TargetStackID::ScalableVector;
6387 }
6388 break;
6389 case 24:
6390 if (AArch64::DDDRegClass.hasSubClassEq(RC)) {
6391 assert(Subtarget.hasNEON() && "Unexpected register load without NEON");
6392 Opc = AArch64::LD1Threev1d;
6393 Offset = false;
6394 }
6395 break;
6396 case 32:
6397 if (AArch64::DDDDRegClass.hasSubClassEq(RC)) {
6398 assert(Subtarget.hasNEON() && "Unexpected register load without NEON");
6399 Opc = AArch64::LD1Fourv1d;
6400 Offset = false;
6401 } else if (AArch64::QQRegClass.hasSubClassEq(RC)) {
6402 assert(Subtarget.hasNEON() && "Unexpected register load without NEON");
6403 Opc = AArch64::LD1Twov2d;
6404 Offset = false;
6405 } else if (AArch64::ZPR2StridedOrContiguousRegClass.hasSubClassEq(RC)) {
6406 assert(Subtarget.isSVEorStreamingSVEAvailable() &&
6407 "Unexpected register load without SVE load instructions");
6408 Opc = AArch64::LDR_ZZXI_STRIDED_CONTIGUOUS;
6409 StackID = TargetStackID::ScalableVector;
6410 } else if (AArch64::ZPR2RegClass.hasSubClassEq(RC)) {
6411 assert(Subtarget.isSVEorStreamingSVEAvailable() &&
6412 "Unexpected register load without SVE load instructions");
6413 Opc = AArch64::LDR_ZZXI;
6414 StackID = TargetStackID::ScalableVector;
6415 }
6416 break;
6417 case 48:
6418 if (AArch64::QQQRegClass.hasSubClassEq(RC)) {
6419 assert(Subtarget.hasNEON() && "Unexpected register load without NEON");
6420 Opc = AArch64::LD1Threev2d;
6421 Offset = false;
6422 } else if (AArch64::ZPR3RegClass.hasSubClassEq(RC)) {
6423 assert(Subtarget.isSVEorStreamingSVEAvailable() &&
6424 "Unexpected register load without SVE load instructions");
6425 Opc = AArch64::LDR_ZZZXI;
6426 StackID = TargetStackID::ScalableVector;
6427 }
6428 break;
6429 case 64:
6430 if (AArch64::QQQQRegClass.hasSubClassEq(RC)) {
6431 assert(Subtarget.hasNEON() && "Unexpected register load without NEON");
6432 Opc = AArch64::LD1Fourv2d;
6433 Offset = false;
6434 } else if (AArch64::ZPR4StridedOrContiguousRegClass.hasSubClassEq(RC)) {
6435 assert(Subtarget.isSVEorStreamingSVEAvailable() &&
6436 "Unexpected register load without SVE load instructions");
6437 Opc = AArch64::LDR_ZZZZXI_STRIDED_CONTIGUOUS;
6438 StackID = TargetStackID::ScalableVector;
6439 } else if (AArch64::ZPR4RegClass.hasSubClassEq(RC)) {
6440 assert(Subtarget.isSVEorStreamingSVEAvailable() &&
6441 "Unexpected register load without SVE load instructions");
6442 Opc = AArch64::LDR_ZZZZXI;
6443 StackID = TargetStackID::ScalableVector;
6444 }
6445 break;
6446 }
6447
6448 assert(Opc && "Unknown register class");
6449 MFI.setStackID(FI, StackID);
6450
6451 const MachineInstrBuilder MI = BuildMI(MBB, MBBI, DebugLoc(), get(Opc))
6452 .addReg(DestReg, getDefRegState(true))
6453 .addFrameIndex(FI);
6454 if (Offset)
6455 MI.addImm(0);
6456 if (PNRReg.isValid() && !PNRReg.isVirtual())
6457 MI.addDef(PNRReg, RegState::Implicit);
6458 MI.addMemOperand(MMO);
6459}
6460
6461 bool llvm::isNZCVTouchedInInstructionRange(const MachineInstr &DefMI,
6462 const MachineInstr &UseMI,
6463 const TargetRegisterInfo *TRI) {
6464 return any_of(instructionsWithoutDebug(std::next(DefMI.getIterator()),
6465 UseMI.getIterator()),
6466 [TRI](const MachineInstr &I) {
6467 return I.modifiesRegister(AArch64::NZCV, TRI) ||
6468 I.readsRegister(AArch64::NZCV, TRI);
6469 });
6470}
6471
6472void AArch64InstrInfo::decomposeStackOffsetForDwarfOffsets(
6473 const StackOffset &Offset, int64_t &ByteSized, int64_t &VGSized) {
6474 // The smallest scalable elements supported by scaled SVE addressing
6475 // modes are predicates, which are 2 scalable bytes in size. So the scalable
6476 // byte offset must always be a multiple of 2.
6477 assert(Offset.getScalable() % 2 == 0 && "Invalid frame offset");
6478
6479 // VGSized offsets are divided by '2', because the VG register is the
6480 // number of 64-bit granules, as opposed to 128-bit vector chunks,
6481 // which is how the 'n' in e.g. MVT::nxv1i8 is modelled.
6482 // So, for a stack offset of 16 MVT::nxv1i8's, the size is n x 16 bytes.
6483 // VG = n * 2 and the dwarf offset must be VG * 8 bytes.
6484 ByteSized = Offset.getFixed();
6485 VGSized = Offset.getScalable() / 2;
6486}
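// Worked example (illustrative): a StackOffset of 16 fixed and 32 scalable
// bytes decomposes to ByteSized = 16 and VGSized = 16, i.e. the DWARF
// expression describes a byte offset of 16 + 16 * VG.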
6487
6488 /// Decomposes the given frame offset into the parts needed to describe a
6489 /// frame offset (bytes, predicate vectors, and data vectors).
6490/// For non-scalable offsets this is simply its byte size.
6491void AArch64InstrInfo::decomposeStackOffsetForFrameOffsets(
6492 const StackOffset &Offset, int64_t &NumBytes, int64_t &NumPredicateVectors,
6493 int64_t &NumDataVectors) {
6494 // The smallest scalable elements supported by scaled SVE addressing
6495 // modes are predicates, which are 2 scalable bytes in size. So the scalable
6496 // byte offset must always be a multiple of 2.
6497 assert(Offset.getScalable() % 2 == 0 && "Invalid frame offset");
6498
6499 NumBytes = Offset.getFixed();
6500 NumDataVectors = 0;
6501 NumPredicateVectors = Offset.getScalable() / 2;
6502 // This method is used to get the offsets to adjust the frame offset.
6503 // If the function requires ADDPL to be used and needs more than two ADDPL
6504 // instructions, part of the offset is folded into NumDataVectors so that it
6505 // uses ADDVL for part of it, reducing the number of ADDPL instructions.
6506 if (NumPredicateVectors % 8 == 0 || NumPredicateVectors < -64 ||
6507 NumPredicateVectors > 62) {
6508 NumDataVectors = NumPredicateVectors / 8;
6509 NumPredicateVectors -= NumDataVectors * 8;
6510 }
6511}
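// Worked example (illustrative): 144 scalable bytes is 72 predicate
// vectors; since 72 % 8 == 0, this folds to NumDataVectors = 9 (a single
// ADDVL #9), whereas 18 scalable bytes stays NumPredicateVectors = 9 (a
// single ADDPL #9).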
6512
6513// Convenience function to create a DWARF expression for: Constant `Operation`.
6514 // This helper emits compact sequences for common cases. For example, for
6515 // `-15 DW_OP_plus`, this helper would create DW_OP_lit15 DW_OP_minus.
6516 static void appendConstantExpr(SmallVectorImpl<char> &Expr, int64_t Constant,
6517 dwarf::LocationAtom Operation) {
6518 if (Operation == dwarf::DW_OP_plus && Constant < 0 && -Constant <= 31) {
6519 // -Constant (1 to 31)
6520 Expr.push_back(dwarf::DW_OP_lit0 - Constant);
6521 Operation = dwarf::DW_OP_minus;
6522 } else if (Constant >= 0 && Constant <= 31) {
6523 // Literal value 0 to 31
6524 Expr.push_back(dwarf::DW_OP_lit0 + Constant);
6525 } else {
6526 // Signed constant
6527 Expr.push_back(dwarf::DW_OP_consts);
6528 appendLEB128<LEB128Sign::Signed>(Expr, Constant);
6529 }
6530 return Expr.push_back(Operation);
6531}
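// For example (illustrative): appendConstantExpr(Expr, 8, DW_OP_mul) emits
// DW_OP_lit8 DW_OP_mul, while a large constant such as 100 falls back to
// DW_OP_consts 100 (SLEB128) followed by the operation.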
6532
6533// Convenience function to create a DWARF expression for a register.
6534static void appendReadRegExpr(SmallVectorImpl<char> &Expr, unsigned RegNum) {
6535 Expr.push_back((char)dwarf::DW_OP_bregx);
6536 appendLEB128<LEB128Sign::Unsigned>(Expr, RegNum);
6537 Expr.push_back(0);
6538}
6539
6540// Convenience function to create a DWARF expression for loading a register from
6541// a CFA offset.
6542 static void appendLoadRegExpr(SmallVectorImpl<char> &Expr,
6543 int64_t OffsetFromDefCFA) {
6544 // This assumes the top of the DWARF stack contains the CFA.
6545 Expr.push_back(dwarf::DW_OP_dup);
6546 // Add the offset to the register.
6547 appendConstantExpr(Expr, OffsetFromDefCFA, dwarf::DW_OP_plus);
6549 // Dereference the address (loads a 64-bit value).
6549 Expr.push_back(dwarf::DW_OP_deref);
6550}
6551
6552// Convenience function to create a comment for
6553// (+/-) NumBytes (* RegScale)?
6554static void appendOffsetComment(int NumBytes, llvm::raw_string_ostream &Comment,
6555 StringRef RegScale = {}) {
6556 if (NumBytes) {
6557 Comment << (NumBytes < 0 ? " - " : " + ") << std::abs(NumBytes);
6558 if (!RegScale.empty())
6559 Comment << ' ' << RegScale;
6560 }
6561}
6562
6563// Creates an MCCFIInstruction:
6564// { DW_CFA_def_cfa_expression, ULEB128 (sizeof expr), expr }
6565 static MCCFIInstruction createDefCFAExpression(const TargetRegisterInfo &TRI,
6566 unsigned Reg,
6567 const StackOffset &Offset) {
6568 int64_t NumBytes, NumVGScaledBytes;
6569 AArch64InstrInfo::decomposeStackOffsetForDwarfOffsets(Offset, NumBytes,
6570 NumVGScaledBytes);
6571 std::string CommentBuffer;
6572 llvm::raw_string_ostream Comment(CommentBuffer);
6573
6574 if (Reg == AArch64::SP)
6575 Comment << "sp";
6576 else if (Reg == AArch64::FP)
6577 Comment << "fp";
6578 else
6579 Comment << printReg(Reg, &TRI);
6580
6581 // Build up the expression (Reg + NumBytes + VG * NumVGScaledBytes)
6582 SmallString<64> Expr;
6583 unsigned DwarfReg = TRI.getDwarfRegNum(Reg, true);
6584 assert(DwarfReg <= 31 && "DwarfReg out of bounds (0..31)");
6585 // Reg + NumBytes
6586 Expr.push_back(dwarf::DW_OP_breg0 + DwarfReg);
6587 appendLEB128<LEB128Sign::Signed>(Expr, NumBytes);
6588 appendOffsetComment(NumBytes, Comment);
6589 if (NumVGScaledBytes) {
6590 // + VG * NumVGScaledBytes
6591 appendOffsetComment(NumVGScaledBytes, Comment, "* VG");
6592 appendReadRegExpr(Expr, TRI.getDwarfRegNum(AArch64::VG, true));
6593 appendConstantExpr(Expr, NumVGScaledBytes, dwarf::DW_OP_mul);
6594 Expr.push_back(dwarf::DW_OP_plus);
6595 }
6596
6597 // Wrap this into DW_CFA_def_cfa.
6598 SmallString<64> DefCfaExpr;
6599 DefCfaExpr.push_back(dwarf::DW_CFA_def_cfa_expression);
6600 appendLEB128<LEB128Sign::Unsigned>(DefCfaExpr, Expr.size());
6601 DefCfaExpr.append(Expr.str());
6602 return MCCFIInstruction::createEscape(nullptr, DefCfaExpr.str(), SMLoc(),
6603 Comment.str());
6604}
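// Worked example (illustrative): a CFA of sp + 16 plus 16 scalable bytes
// yields the expression DW_OP_breg31 16, DW_OP_bregx VG 0, DW_OP_lit8,
// DW_OP_mul, DW_OP_plus, with the comment "sp + 16 + 8 * VG".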
6605
6606 MCCFIInstruction llvm::createDefCFA(const TargetRegisterInfo &TRI,
6607 unsigned FrameReg, unsigned Reg,
6608 const StackOffset &Offset,
6609 bool LastAdjustmentWasScalable) {
6610 if (Offset.getScalable())
6611 return createDefCFAExpression(TRI, Reg, Offset);
6612
6613 if (FrameReg == Reg && !LastAdjustmentWasScalable)
6614 return MCCFIInstruction::cfiDefCfaOffset(nullptr, int(Offset.getFixed()));
6615
6616 unsigned DwarfReg = TRI.getDwarfRegNum(Reg, true);
6617 return MCCFIInstruction::cfiDefCfa(nullptr, DwarfReg, (int)Offset.getFixed());
6618}
6619
6620 MCCFIInstruction llvm::createCFAOffset(const TargetRegisterInfo &TRI,
6621 unsigned Reg,
6622 const StackOffset &OffsetFromDefCFA,
6623 std::optional<int64_t> IncomingVGOffsetFromDefCFA) {
6624 int64_t NumBytes, NumVGScaledBytes;
6625 AArch64InstrInfo::decomposeStackOffsetForDwarfOffsets(
6626 OffsetFromDefCFA, NumBytes, NumVGScaledBytes);
6627
6628 unsigned DwarfReg = TRI.getDwarfRegNum(Reg, true);
6629
6630 // Non-scalable offsets can use DW_CFA_offset directly.
6631 if (!NumVGScaledBytes)
6632 return MCCFIInstruction::createOffset(nullptr, DwarfReg, NumBytes);
6633
6634 std::string CommentBuffer;
6635 llvm::raw_string_ostream Comment(CommentBuffer);
6636 Comment << printReg(Reg, &TRI) << " @ cfa";
6637
6638 // Build up expression (CFA + VG * NumVGScaledBytes + NumBytes)
6639 assert(NumVGScaledBytes && "Expected scalable offset");
6640 SmallString<64> OffsetExpr;
6641 // + VG * NumVGScaledBytes
6642 StringRef VGRegScale;
6643 if (IncomingVGOffsetFromDefCFA) {
6644 appendLoadRegExpr(OffsetExpr, *IncomingVGOffsetFromDefCFA);
6645 VGRegScale = "* IncomingVG";
6646 } else {
6647 appendReadRegExpr(OffsetExpr, TRI.getDwarfRegNum(AArch64::VG, true));
6648 VGRegScale = "* VG";
6649 }
6650 appendConstantExpr(OffsetExpr, NumVGScaledBytes, dwarf::DW_OP_mul);
6651 appendOffsetComment(NumVGScaledBytes, Comment, VGRegScale);
6652 OffsetExpr.push_back(dwarf::DW_OP_plus);
6653 if (NumBytes) {
6654 // + NumBytes
6655 appendOffsetComment(NumBytes, Comment);
6656 appendConstantExpr(OffsetExpr, NumBytes, dwarf::DW_OP_plus);
6657 }
6658
6659 // Wrap this into DW_CFA_expression
6660 SmallString<64> CfaExpr;
6661 CfaExpr.push_back(dwarf::DW_CFA_expression);
6662 appendLEB128<LEB128Sign::Unsigned>(CfaExpr, DwarfReg);
6663 appendLEB128<LEB128Sign::Unsigned>(CfaExpr, OffsetExpr.size());
6664 CfaExpr.append(OffsetExpr.str());
6665
6666 return MCCFIInstruction::createEscape(nullptr, CfaExpr.str(), SMLoc(),
6667 Comment.str());
6668}
6669
6670// Helper function to emit a frame offset adjustment from a given
6671// pointer (SrcReg), stored into DestReg. This function is explicit
6672// in that it requires the opcode.
6673 static void emitFrameOffsetAdj(MachineBasicBlock &MBB,
6674 MachineBasicBlock::iterator MBBI,
6675 const DebugLoc &DL, unsigned DestReg,
6676 unsigned SrcReg, int64_t Offset, unsigned Opc,
6677 const TargetInstrInfo *TII,
6678 MachineInstr::MIFlag Flag, bool NeedsWinCFI,
6679 bool *HasWinCFI, bool EmitCFAOffset,
6680 StackOffset CFAOffset, unsigned FrameReg) {
6681 int Sign = 1;
6682 unsigned MaxEncoding, ShiftSize;
6683 switch (Opc) {
6684 case AArch64::ADDXri:
6685 case AArch64::ADDSXri:
6686 case AArch64::SUBXri:
6687 case AArch64::SUBSXri:
6688 MaxEncoding = 0xfff;
6689 ShiftSize = 12;
6690 break;
6691 case AArch64::ADDVL_XXI:
6692 case AArch64::ADDPL_XXI:
6693 case AArch64::ADDSVL_XXI:
6694 case AArch64::ADDSPL_XXI:
6695 MaxEncoding = 31;
6696 ShiftSize = 0;
6697 if (Offset < 0) {
6698 MaxEncoding = 32;
6699 Sign = -1;
6700 Offset = -Offset;
6701 }
6702 break;
6703 default:
6704 llvm_unreachable("Unsupported opcode");
6705 }
6706
6707 // `Offset` can be in bytes or in "scalable bytes".
6708 int VScale = 1;
6709 if (Opc == AArch64::ADDVL_XXI || Opc == AArch64::ADDSVL_XXI)
6710 VScale = 16;
6711 else if (Opc == AArch64::ADDPL_XXI || Opc == AArch64::ADDSPL_XXI)
6712 VScale = 2;
6713
6714 // FIXME: If the offset won't fit in 24-bits, compute the offset into a
6715 // scratch register. If DestReg is a virtual register, use it as the
6716 // scratch register; otherwise, create a new virtual register (to be
6717 // replaced by the scavenger at the end of PEI). That case can be optimized
6718 // slightly if DestReg is SP which is always 16-byte aligned, so the scratch
6719 // register can be loaded with offset%8 and the add/sub can use an extending
6720 // instruction with LSL#3.
6721 // Currently the function handles arbitrary offsets but may generate a poor
6722 // sequence of code.
6723 // assert(Offset < (1 << 24) && "unimplemented reg plus immediate");
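// Worked example (illustrative): ADDXri with Offset = 4100 exceeds the
// 12-bit immediate (4095), so the loop below first emits
// "add xd, xn, #1, lsl #12" (4096) and then "add xd, xd, #4" for the rest.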
6724
6725 const unsigned MaxEncodableValue = MaxEncoding << ShiftSize;
6726 Register TmpReg = DestReg;
6727 if (TmpReg == AArch64::XZR)
6728 TmpReg = MBB.getParent()->getRegInfo().createVirtualRegister(
6729 &AArch64::GPR64RegClass);
6730 do {
6731 uint64_t ThisVal = std::min<uint64_t>(Offset, MaxEncodableValue);
6732 unsigned LocalShiftSize = 0;
6733 if (ThisVal > MaxEncoding) {
6734 ThisVal = ThisVal >> ShiftSize;
6735 LocalShiftSize = ShiftSize;
6736 }
6737 assert((ThisVal >> ShiftSize) <= MaxEncoding &&
6738 "Encoding cannot handle value that big");
6739
6740 Offset -= ThisVal << LocalShiftSize;
6741 if (Offset == 0)
6742 TmpReg = DestReg;
6743 auto MBI = BuildMI(MBB, MBBI, DL, TII->get(Opc), TmpReg)
6744 .addReg(SrcReg)
6745 .addImm(Sign * (int)ThisVal);
6746 if (ShiftSize)
6747 MBI = MBI.addImm(
6748 AArch64_AM::getShifterImm(AArch64_AM::LSL, LocalShiftSize));
6749 MBI = MBI.setMIFlag(Flag);
6750
6751 auto Change =
6752 VScale == 1
6753 ? StackOffset::getFixed(ThisVal << LocalShiftSize)
6754 : StackOffset::getScalable(VScale * (ThisVal << LocalShiftSize));
6755 if (Sign == -1 || Opc == AArch64::SUBXri || Opc == AArch64::SUBSXri)
6756 CFAOffset += Change;
6757 else
6758 CFAOffset -= Change;
6759 if (EmitCFAOffset && DestReg == TmpReg) {
6760 MachineFunction &MF = *MBB.getParent();
6761 const TargetSubtargetInfo &STI = MF.getSubtarget();
6762 const TargetRegisterInfo &TRI = *STI.getRegisterInfo();
6763
6764 unsigned CFIIndex = MF.addFrameInst(
6765 createDefCFA(TRI, FrameReg, DestReg, CFAOffset, VScale != 1));
6766 BuildMI(MBB, MBBI, DL, TII->get(TargetOpcode::CFI_INSTRUCTION))
6767 .addCFIIndex(CFIIndex)
6768 .setMIFlags(Flag);
6769 }
6770
6771 if (NeedsWinCFI) {
6772 int Imm = (int)(ThisVal << LocalShiftSize);
6773 if (VScale != 1 && DestReg == AArch64::SP) {
6774 if (HasWinCFI)
6775 *HasWinCFI = true;
6776 BuildMI(MBB, MBBI, DL, TII->get(AArch64::SEH_AllocZ))
6777 .addImm(ThisVal)
6778 .setMIFlag(Flag);
6779 } else if ((DestReg == AArch64::FP && SrcReg == AArch64::SP) ||
6780 (SrcReg == AArch64::FP && DestReg == AArch64::SP)) {
6781 assert(VScale == 1 && "Expected non-scalable operation");
6782 if (HasWinCFI)
6783 *HasWinCFI = true;
6784 if (Imm == 0)
6785 BuildMI(MBB, MBBI, DL, TII->get(AArch64::SEH_SetFP)).setMIFlag(Flag);
6786 else
6787 BuildMI(MBB, MBBI, DL, TII->get(AArch64::SEH_AddFP))
6788 .addImm(Imm)
6789 .setMIFlag(Flag);
6790 assert(Offset == 0 && "Expected remaining offset to be zero to "
6791 "emit a single SEH directive");
6792 } else if (DestReg == AArch64::SP) {
6793 assert(VScale == 1 && "Expected non-scalable operation");
6794 if (HasWinCFI)
6795 *HasWinCFI = true;
6796 assert(SrcReg == AArch64::SP && "Unexpected SrcReg for SEH_StackAlloc");
6797 BuildMI(MBB, MBBI, DL, TII->get(AArch64::SEH_StackAlloc))
6798 .addImm(Imm)
6799 .setMIFlag(Flag);
6800 }
6801 }
6802
6803 SrcReg = TmpReg;
6804 } while (Offset);
6805}
6806
6807 void llvm::emitFrameOffset(MachineBasicBlock &MBB,
6808 MachineBasicBlock::iterator MBBI, const DebugLoc &DL,
6809 unsigned DestReg, unsigned SrcReg,
6810 StackOffset Offset, const TargetInstrInfo *TII,
6811 MachineInstr::MIFlag Flag, bool SetNZCV,
6812 bool NeedsWinCFI, bool *HasWinCFI,
6813 bool EmitCFAOffset, StackOffset CFAOffset,
6814 unsigned FrameReg) {
6815 // If a function is marked as arm_locally_streaming, then the runtime value of
6816 // vscale in the prologue/epilogue is different from the runtime value of vscale
6817 // in the function's body. To avoid having to consider multiple vscales,
6818 // we can use `addsvl` to allocate any scalable stack-slots, which under
6819 // most circumstances will be only locals, not callee-save slots.
6820 const Function &F = MBB.getParent()->getFunction();
6821 bool UseSVL = F.hasFnAttribute("aarch64_pstate_sm_body");
6822
6823 int64_t Bytes, NumPredicateVectors, NumDataVectors;
6824 AArch64InstrInfo::decomposeStackOffsetForFrameOffsets(
6825 Offset, Bytes, NumPredicateVectors, NumDataVectors);
6826
6827 // Insert ADDSXri for scalable offset at the end.
6828 bool NeedsFinalDefNZCV = SetNZCV && (NumPredicateVectors || NumDataVectors);
6829 if (NeedsFinalDefNZCV)
6830 SetNZCV = false;
6831
6832 // First emit non-scalable frame offsets, or a simple 'mov'.
6833 if (Bytes || (!Offset && SrcReg != DestReg)) {
6834 assert((DestReg != AArch64::SP || Bytes % 8 == 0) &&
6835 "SP increment/decrement not 8-byte aligned");
6836 unsigned Opc = SetNZCV ? AArch64::ADDSXri : AArch64::ADDXri;
6837 if (Bytes < 0) {
6838 Bytes = -Bytes;
6839 Opc = SetNZCV ? AArch64::SUBSXri : AArch64::SUBXri;
6840 }
6841 emitFrameOffsetAdj(MBB, MBBI, DL, DestReg, SrcReg, Bytes, Opc, TII, Flag,
6842 NeedsWinCFI, HasWinCFI, EmitCFAOffset, CFAOffset,
6843 FrameReg);
6844 CFAOffset += (Opc == AArch64::ADDXri || Opc == AArch64::ADDSXri)
6845 ? StackOffset::getFixed(-Bytes)
6846 : StackOffset::getFixed(Bytes);
6847 SrcReg = DestReg;
6848 FrameReg = DestReg;
6849 }
6850
6851 assert(!(NeedsWinCFI && NumPredicateVectors) &&
6852 "WinCFI can't allocate fractions of an SVE data vector");
6853
6854 if (NumDataVectors) {
6855 emitFrameOffsetAdj(MBB, MBBI, DL, DestReg, SrcReg, NumDataVectors,
6856 UseSVL ? AArch64::ADDSVL_XXI : AArch64::ADDVL_XXI, TII,
6857 Flag, NeedsWinCFI, HasWinCFI, EmitCFAOffset, CFAOffset,
6858 FrameReg);
6859 CFAOffset += StackOffset::getScalable(-NumDataVectors * 16);
6860 SrcReg = DestReg;
6861 }
6862
6863 if (NumPredicateVectors) {
6864 assert(DestReg != AArch64::SP && "Unaligned access to SP");
6865 emitFrameOffsetAdj(MBB, MBBI, DL, DestReg, SrcReg, NumPredicateVectors,
6866 UseSVL ? AArch64::ADDSPL_XXI : AArch64::ADDPL_XXI, TII,
6867 Flag, NeedsWinCFI, HasWinCFI, EmitCFAOffset, CFAOffset,
6868 FrameReg);
6869 }
6870
6871 if (NeedsFinalDefNZCV)
6872 BuildMI(MBB, MBBI, DL, TII->get(AArch64::ADDSXri), DestReg)
6873 .addReg(DestReg)
6874 .addImm(0)
6875 .addImm(0);
6876}
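// Worked example (illustrative): a 48-byte fixed plus 32-scalable-byte
// Offset emits an "add ..., #48" followed by "addvl ..., #2", since the 16
// predicate vectors fold evenly into 2 data vectors.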
6877
6878 MachineInstr *AArch64InstrInfo::foldMemoryOperandImpl(
6879 MachineFunction &MF, MachineInstr &MI, ArrayRef<unsigned> Ops, MachineBasicBlock::iterator InsertPt,
6880 int FrameIndex, MachineInstr *&CopyMI, LiveIntervals *LIS,
6881 VirtRegMap *VRM) const {
6883 // This is a bit of a hack. Consider this instruction:
6884 //
6885 // %0 = COPY %sp; GPR64all:%0
6886 //
6887 // We explicitly chose GPR64all for the virtual register so such a copy might
6888 // be eliminated by RegisterCoalescer. However, that may not be possible, and
6889 // %0 may even spill. We can't spill %sp, and since it is in the GPR64all
6890 // register class, TargetInstrInfo::foldMemoryOperand() is going to try.
6891 //
6892 // To prevent that, we are going to constrain the %0 register class here.
6893 if (MI.isFullCopy()) {
6894 Register DstReg = MI.getOperand(0).getReg();
6895 Register SrcReg = MI.getOperand(1).getReg();
6896 if (SrcReg == AArch64::SP && DstReg.isVirtual()) {
6897 MF.getRegInfo().constrainRegClass(DstReg, &AArch64::GPR64RegClass);
6898 return nullptr;
6899 }
6900 if (DstReg == AArch64::SP && SrcReg.isVirtual()) {
6901 MF.getRegInfo().constrainRegClass(SrcReg, &AArch64::GPR64RegClass);
6902 return nullptr;
6903 }
6904 // Nothing can be folded with a copy from/to NZCV.
6905 if (SrcReg == AArch64::NZCV || DstReg == AArch64::NZCV)
6906 return nullptr;
6907 }
6908
6909 // Handle the case where a copy is being spilled or filled but the source
6910 // and destination register classes don't match. For example:
6911 //
6912 // %0 = COPY %xzr; GPR64common:%0
6913 //
6914 // In this case we can still safely fold away the COPY and generate the
6915 // following spill code:
6916 //
6917 // STRXui %xzr, %stack.0
6918 //
6919 // This also eliminates spilled cross register class COPYs (e.g. between x and
6920 // d regs) of the same size. For example:
6921 //
6922 // %0 = COPY %1; GPR64:%0, FPR64:%1
6923 //
6924 // will be filled as
6925 //
6926 // LDRDui %0, fi<#0>
6927 //
6928 // instead of
6929 //
6930 // LDRXui %Temp, fi<#0>
6931 // %0 = FMOV %Temp
6932 //
6933 if (MI.isCopy() && Ops.size() == 1 &&
6934 // Make sure we're only folding the explicit COPY defs/uses.
6935 (Ops[0] == 0 || Ops[0] == 1)) {
6936 bool IsSpill = Ops[0] == 0;
6937 bool IsFill = !IsSpill;
6938 const TargetRegisterInfo &TRI = getRegisterInfo();
6939 const MachineRegisterInfo &MRI = MF.getRegInfo();
6940 MachineBasicBlock &MBB = *MI.getParent();
6941 const MachineOperand &DstMO = MI.getOperand(0);
6942 const MachineOperand &SrcMO = MI.getOperand(1);
6943 Register DstReg = DstMO.getReg();
6944 Register SrcReg = SrcMO.getReg();
6945 // This is slightly expensive to compute for physical regs since
6946 // getMinimalPhysRegClass is slow.
6947 auto getRegClass = [&](unsigned Reg) {
6948 return Register::isVirtualRegister(Reg) ? MRI.getRegClass(Reg)
6949 : TRI.getMinimalPhysRegClass(Reg);
6950 };
6951
6952 if (DstMO.getSubReg() == 0 && SrcMO.getSubReg() == 0) {
6953 assert(TRI.getRegSizeInBits(*getRegClass(DstReg)) ==
6954 TRI.getRegSizeInBits(*getRegClass(SrcReg)) &&
6955 "Mismatched register size in non subreg COPY");
6956 if (IsSpill)
6957 storeRegToStackSlot(MBB, InsertPt, SrcReg, SrcMO.isKill(), FrameIndex,
6958 getRegClass(SrcReg), Register());
6959 else
6960 loadRegFromStackSlot(MBB, InsertPt, DstReg, FrameIndex,
6961 getRegClass(DstReg), Register());
6962 return &*--InsertPt;
6963 }
6964
6965 // Handle cases like spilling def of:
6966 //
6967 // %0:sub_32<def,read-undef> = COPY %wzr; GPR64common:%0
6968 //
6969 // where the physical register source can be widened and stored to the full
6970 // virtual reg destination stack slot, in this case producing:
6971 //
6972 // STRXui %xzr, %stack.0
6973 //
6974 if (IsSpill && DstMO.isUndef() && SrcReg == AArch64::WZR &&
6975 TRI.getRegSizeInBits(*getRegClass(DstReg)) == 64) {
6976 assert(SrcMO.getSubReg() == 0 &&
6977 "Unexpected subreg on physical register");
6978 storeRegToStackSlot(MBB, InsertPt, AArch64::XZR, SrcMO.isKill(),
6979 FrameIndex, &AArch64::GPR64RegClass, Register());
6980 return &*--InsertPt;
6981 }
6982
6983 // Handle cases like filling use of:
6984 //
6985 // %0:sub_32<def,read-undef> = COPY %1; GPR64:%0, GPR32:%1
6986 //
6987 // where we can load the full virtual reg source stack slot into the subreg
6988 // destination, in this case producing:
6989 //
6990 // LDRWui %0:sub_32<def,read-undef>, %stack.0
6991 //
6992 if (IsFill && SrcMO.getSubReg() == 0 && DstMO.isUndef()) {
6993 const TargetRegisterClass *FillRC = nullptr;
6994 switch (DstMO.getSubReg()) {
6995 default:
6996 break;
6997 case AArch64::sub_32:
6998 if (AArch64::GPR64RegClass.hasSubClassEq(getRegClass(DstReg)))
6999 FillRC = &AArch64::GPR32RegClass;
7000 break;
7001 case AArch64::ssub:
7002 FillRC = &AArch64::FPR32RegClass;
7003 break;
7004 case AArch64::dsub:
7005 FillRC = &AArch64::FPR64RegClass;
7006 break;
7007 }
7008
7009 if (FillRC) {
7010 assert(TRI.getRegSizeInBits(*getRegClass(SrcReg)) ==
7011 TRI.getRegSizeInBits(*FillRC) &&
7012 "Mismatched regclass size on folded subreg COPY");
7013 loadRegFromStackSlot(MBB, InsertPt, DstReg, FrameIndex, FillRC,
7014 Register());
7015 MachineInstr &LoadMI = *--InsertPt;
7016 MachineOperand &LoadDst = LoadMI.getOperand(0);
7017 assert(LoadDst.getSubReg() == 0 && "unexpected subreg on fill load");
7018 LoadDst.setSubReg(DstMO.getSubReg());
7019 LoadDst.setIsUndef();
7020 return &LoadMI;
7021 }
7022 }
7023 }
7024
7025 // Cannot fold.
7026 return nullptr;
7027}
7028
7029 int llvm::isAArch64FrameOffsetLegal(const MachineInstr &MI,
7030 StackOffset &SOffset,
7031 bool *OutUseUnscaledOp,
7032 unsigned *OutUnscaledOp,
7033 int64_t *EmittableOffset) {
7034 // Set output values in case of early exit.
7035 if (EmittableOffset)
7036 *EmittableOffset = 0;
7037 if (OutUseUnscaledOp)
7038 *OutUseUnscaledOp = false;
7039 if (OutUnscaledOp)
7040 *OutUnscaledOp = 0;
7041
7042 // Exit early for structured vector spills/fills as they can't take an
7043 // immediate offset.
7044 switch (MI.getOpcode()) {
7045 default:
7046 break;
7047 case AArch64::LD1Rv1d:
7048 case AArch64::LD1Rv2s:
7049 case AArch64::LD1Rv2d:
7050 case AArch64::LD1Rv4h:
7051 case AArch64::LD1Rv4s:
7052 case AArch64::LD1Rv8b:
7053 case AArch64::LD1Rv8h:
7054 case AArch64::LD1Rv16b:
7055 case AArch64::LD1Twov2d:
7056 case AArch64::LD1Threev2d:
7057 case AArch64::LD1Fourv2d:
7058 case AArch64::LD1Twov1d:
7059 case AArch64::LD1Threev1d:
7060 case AArch64::LD1Fourv1d:
7061 case AArch64::ST1Twov2d:
7062 case AArch64::ST1Threev2d:
7063 case AArch64::ST1Fourv2d:
7064 case AArch64::ST1Twov1d:
7065 case AArch64::ST1Threev1d:
7066 case AArch64::ST1Fourv1d:
7067 case AArch64::ST1i8:
7068 case AArch64::ST1i16:
7069 case AArch64::ST1i32:
7070 case AArch64::ST1i64:
7071 case AArch64::IRG:
7072 case AArch64::IRGstack:
7073 case AArch64::STGloop:
7074 case AArch64::STZGloop:
7075 return AArch64FrameOffsetCannotUpdate;
7076 }
7077
7078 // Get the min/max offset and the scale.
7079 TypeSize ScaleValue(0U, false), Width(0U, false);
7080 int64_t MinOff, MaxOff;
7081 if (!AArch64InstrInfo::getMemOpInfo(MI.getOpcode(), ScaleValue, Width, MinOff,
7082 MaxOff))
7083 llvm_unreachable("unhandled opcode in isAArch64FrameOffsetLegal");
7084
7085 // Construct the complete offset.
7086 bool IsMulVL = ScaleValue.isScalable();
7087 unsigned Scale = ScaleValue.getKnownMinValue();
7088 int64_t Offset = IsMulVL ? SOffset.getScalable() : SOffset.getFixed();
7089
7090 const MachineOperand &ImmOpnd =
7091 MI.getOperand(AArch64InstrInfo::getLoadStoreImmIdx(MI.getOpcode()));
7092 Offset += ImmOpnd.getImm() * Scale;
7093
7094 // If the offset doesn't match the scale, we rewrite the instruction to
7095 // use the unscaled instruction instead. Likewise, if we have a negative
7096 // offset and there is an unscaled op to use.
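 // For example (illustrative values): an LDRXui (scale 8) with a byte offset
 // of 12 cannot encode 12 / 8, so it is rewritten to the unscaled LDURXi,
 // whose signed 9-bit immediate takes the byte offset 12 directly.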
7097 std::optional<unsigned> UnscaledOp =
7098 AArch64InstrInfo::getUnscaledLdSt(MI.getOpcode());
7099 bool useUnscaledOp = UnscaledOp && (Offset % Scale || Offset < 0);
7100 if (useUnscaledOp &&
7101 !AArch64InstrInfo::getMemOpInfo(*UnscaledOp, ScaleValue, Width, MinOff,
7102 MaxOff))
7103 llvm_unreachable("unhandled opcode in isAArch64FrameOffsetLegal");
7104
7105 Scale = ScaleValue.getKnownMinValue();
7106 assert(IsMulVL == ScaleValue.isScalable() &&
7107 "Unscaled opcode has different value for scalable");
7108
7109 int64_t Remainder = Offset % Scale;
7110 assert(!(Remainder && useUnscaledOp) &&
7111 "Cannot have remainder when using unscaled op");
7112
7113 assert(MinOff < MaxOff && "Unexpected Min/Max offsets");
7114 int64_t NewOffset = Offset / Scale;
7115 if (MinOff <= NewOffset && NewOffset <= MaxOff)
7116 Offset = Remainder;
7117 else {
7118 NewOffset = NewOffset < 0 ? MinOff : MaxOff;
7119 Offset = Offset - (NewOffset * Scale);
7120 }
7121
7122 if (EmittableOffset)
7123 *EmittableOffset = NewOffset;
7124 if (OutUseUnscaledOp)
7125 *OutUseUnscaledOp = useUnscaledOp;
7126 if (OutUnscaledOp && UnscaledOp)
7127 *OutUnscaledOp = *UnscaledOp;
7128
7129 if (IsMulVL)
7130 SOffset = StackOffset::get(SOffset.getFixed(), Offset);
7131 else
7132 SOffset = StackOffset::get(Offset, SOffset.getScalable());
7133 return AArch64FrameOffsetCanUpdate |
7134 (SOffset ? 0 : AArch64FrameOffsetIsLegal);
7135}
7136
7137bool llvm::rewriteAArch64FrameIndex(MachineInstr &MI, unsigned FrameRegIdx,
7138 unsigned FrameReg, StackOffset &Offset,
7139 const AArch64InstrInfo *TII) {
7140 unsigned Opcode = MI.getOpcode();
7141 unsigned ImmIdx = FrameRegIdx + 1;
7142
7143 if (Opcode == AArch64::ADDSXri || Opcode == AArch64::ADDXri) {
7144 Offset += StackOffset::getFixed(MI.getOperand(ImmIdx).getImm());
7145 emitFrameOffset(*MI.getParent(), MI, MI.getDebugLoc(),
7146 MI.getOperand(0).getReg(), FrameReg, Offset, TII,
7147 MachineInstr::NoFlags, (Opcode == AArch64::ADDSXri));
7148 MI.eraseFromParent();
7149 Offset = StackOffset();
7150 return true;
7151 }
7152
7153 int64_t NewOffset;
7154 unsigned UnscaledOp;
7155 bool UseUnscaledOp;
7156 int Status = isAArch64FrameOffsetLegal(MI, Offset, &UseUnscaledOp,
7157 &UnscaledOp, &NewOffset);
7158 if (Status & AArch64FrameOffsetCanUpdate) {
7159 if (Status & AArch64FrameOffsetIsLegal)
7160 // Replace the FrameIndex with FrameReg.
7161 MI.getOperand(FrameRegIdx).ChangeToRegister(FrameReg, false);
7162 if (UseUnscaledOp)
7163 MI.setDesc(TII->get(UnscaledOp));
7164
7165 MI.getOperand(ImmIdx).ChangeToImmediate(NewOffset);
7166 return !Offset;
7167 }
7168
7169 return false;
7170}
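// Illustrative walk-through of the ADDXri path above (operands assumed):
//   %x1 = ADDXri %stack.0, 12, 0
// rewritten against FrameReg = $sp with an incoming fixed offset of 32 folds
// the two constants and re-emits the add via emitFrameOffset as
//   $x1 = ADDXri $sp, 44, 0
// (or a longer sequence when the combined offset is not encodable).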
7171
7172void AArch64InstrInfo::insertNoop(MachineBasicBlock &MBB,
7173 MachineBasicBlock::iterator MI) const {
7174 DebugLoc DL;
7175 BuildMI(MBB, MI, DL, get(AArch64::HINT)).addImm(0);
7176}
7177
7178MCInst AArch64InstrInfo::getNop() const { return MCInstBuilder(AArch64::NOP); }
7179
7180// AArch64 supports MachineCombiner.
7181bool AArch64InstrInfo::useMachineCombiner() const { return true; }
7182
7183// True when Opc sets flag
7184static bool isCombineInstrSettingFlag(unsigned Opc) {
7185 switch (Opc) {
7186 case AArch64::ADDSWrr:
7187 case AArch64::ADDSWri:
7188 case AArch64::ADDSXrr:
7189 case AArch64::ADDSXri:
7190 case AArch64::SUBSWrr:
7191 case AArch64::SUBSXrr:
7192 // Note: MSUB Wd,Wn,Wm,Wi -> Wd = Wi - WnxWm, not Wd=WnxWm - Wi.
7193 case AArch64::SUBSWri:
7194 case AArch64::SUBSXri:
7195 return true;
7196 default:
7197 break;
7198 }
7199 return false;
7200}
7201
7202// 32b Opcodes that can be combined with a MUL
7203static bool isCombineInstrCandidate32(unsigned Opc) {
7204 switch (Opc) {
7205 case AArch64::ADDWrr:
7206 case AArch64::ADDWri:
7207 case AArch64::SUBWrr:
7208 case AArch64::ADDSWrr:
7209 case AArch64::ADDSWri:
7210 case AArch64::SUBSWrr:
7211 // Note: MSUB Wd,Wn,Wm,Wi -> Wd = Wi - WnxWm, not Wd=WnxWm - Wi.
7212 case AArch64::SUBWri:
7213 case AArch64::SUBSWri:
7214 return true;
7215 default:
7216 break;
7217 }
7218 return false;
7219}
7220
7221// 64b Opcodes that can be combined with a MUL
7222static bool isCombineInstrCandidate64(unsigned Opc) {
7223 switch (Opc) {
7224 case AArch64::ADDXrr:
7225 case AArch64::ADDXri:
7226 case AArch64::SUBXrr:
7227 case AArch64::ADDSXrr:
7228 case AArch64::ADDSXri:
7229 case AArch64::SUBSXrr:
7230 // Note: MSUB Wd,Wn,Wm,Wi -> Wd = Wi - WnxWm, not Wd=WnxWm - Wi.
7231 case AArch64::SUBXri:
7232 case AArch64::SUBSXri:
7233 case AArch64::ADDv8i8:
7234 case AArch64::ADDv16i8:
7235 case AArch64::ADDv4i16:
7236 case AArch64::ADDv8i16:
7237 case AArch64::ADDv2i32:
7238 case AArch64::ADDv4i32:
7239 case AArch64::SUBv8i8:
7240 case AArch64::SUBv16i8:
7241 case AArch64::SUBv4i16:
7242 case AArch64::SUBv8i16:
7243 case AArch64::SUBv2i32:
7244 case AArch64::SUBv4i32:
7245 return true;
7246 default:
7247 break;
7248 }
7249 return false;
7250}
7251
7252// FP Opcodes that can be combined with a FMUL.
7253static bool isCombineInstrCandidateFP(const MachineInstr &Inst) {
7254 switch (Inst.getOpcode()) {
7255 default:
7256 break;
7257 case AArch64::FADDHrr:
7258 case AArch64::FADDSrr:
7259 case AArch64::FADDDrr:
7260 case AArch64::FADDv4f16:
7261 case AArch64::FADDv8f16:
7262 case AArch64::FADDv2f32:
7263 case AArch64::FADDv2f64:
7264 case AArch64::FADDv4f32:
7265 case AArch64::FSUBHrr:
7266 case AArch64::FSUBSrr:
7267 case AArch64::FSUBDrr:
7268 case AArch64::FSUBv4f16:
7269 case AArch64::FSUBv8f16:
7270 case AArch64::FSUBv2f32:
7271 case AArch64::FSUBv2f64:
7272 case AArch64::FSUBv4f32:
7273 TargetOptions Options = Inst.getParent()->getParent()->getTarget().Options;
7274 // We can fuse FADD/FSUB with FMUL, if fusion is either allowed globally by
7275 // the target options or if FADD/FSUB has the contract fast-math flag.
7276 return Options.AllowFPOpFusion == FPOpFusion::Fast ||
7277 Inst.getFlag(MachineInstr::FmContract);
7278 }
7279 return false;
7280}
7281
7282// Opcodes that can be combined with a MUL
7283static bool isCombineInstrCandidate(unsigned Opc) {
7284 return (isCombineInstrCandidate32(Opc) || isCombineInstrCandidate64(Opc));
7285}
7286
7287//
7288// Utility routine that checks if \param MO is defined by an
7289// \param CombineOpc instruction in the basic block \param MBB
7290static bool canCombine(MachineBasicBlock &MBB, MachineOperand &MO,
7291 unsigned CombineOpc, unsigned ZeroReg = 0,
7292 bool CheckZeroReg = false) {
7293 MachineRegisterInfo &MRI = MBB.getParent()->getRegInfo();
7294 MachineInstr *MI = nullptr;
7295
7296 if (MO.isReg() && MO.getReg().isVirtual())
7297 MI = MRI.getUniqueVRegDef(MO.getReg());
7298 // And it needs to be in the trace (otherwise, it won't have a depth).
7299 if (!MI || MI->getParent() != &MBB || MI->getOpcode() != CombineOpc)
7300 return false;
7301 // Must only be used by the user we combine with.
7302 if (!MRI.hasOneNonDBGUse(MI->getOperand(0).getReg()))
7303 return false;
7304
7305 if (CheckZeroReg) {
7306 assert(MI->getNumOperands() >= 4 && MI->getOperand(0).isReg() &&
7307 MI->getOperand(1).isReg() && MI->getOperand(2).isReg() &&
7308 MI->getOperand(3).isReg() && "MAdd/MSub must have at least 4 regs");
7309 // The third input reg must be zero.
7310 if (MI->getOperand(3).getReg() != ZeroReg)
7311 return false;
7312 }
7313
7314 if (isCombineInstrSettingFlag(CombineOpc) &&
7315 MI->findRegisterDefOperandIdx(AArch64::NZCV, /*TRI=*/nullptr, true) == -1)
7316 return false;
7317
7318 return true;
7319}
7320
7321//
7322// Is \param MO defined by an integer multiply and can be combined?
7323static bool canCombineWithMUL(MachineBasicBlock &MBB, MachineOperand &MO,
7324 unsigned MulOpc, unsigned ZeroReg) {
7325 return canCombine(MBB, MO, MulOpc, ZeroReg, true);
7326}
7327
7328//
7329// Is \param MO defined by a floating-point multiply and can be combined?
7330static bool canCombineWithFMUL(MachineBasicBlock &MBB, MachineOperand &MO,
7331 unsigned MulOpc) {
7332 return canCombine(MBB, MO, MulOpc);
7333}
7334
7335// TODO: There are many more machine instruction opcodes to match:
7336// 1. Other data types (integer, vectors)
7337// 2. Other math / logic operations (xor, or)
7338// 3. Other forms of the same operation (intrinsics and other variants)
7339bool AArch64InstrInfo::isAssociativeAndCommutative(const MachineInstr &Inst,
7340 bool Invert) const {
7341 if (Invert)
7342 return false;
7343 switch (Inst.getOpcode()) {
7344 // == Floating-point types ==
7345 // -- Floating-point instructions --
7346 case AArch64::FADDHrr:
7347 case AArch64::FADDSrr:
7348 case AArch64::FADDDrr:
7349 case AArch64::FMULHrr:
7350 case AArch64::FMULSrr:
7351 case AArch64::FMULDrr:
7352 case AArch64::FMULX16:
7353 case AArch64::FMULX32:
7354 case AArch64::FMULX64:
7355 // -- Advanced SIMD instructions --
7356 case AArch64::FADDv4f16:
7357 case AArch64::FADDv8f16:
7358 case AArch64::FADDv2f32:
7359 case AArch64::FADDv4f32:
7360 case AArch64::FADDv2f64:
7361 case AArch64::FMULv4f16:
7362 case AArch64::FMULv8f16:
7363 case AArch64::FMULv2f32:
7364 case AArch64::FMULv4f32:
7365 case AArch64::FMULv2f64:
7366 case AArch64::FMULXv4f16:
7367 case AArch64::FMULXv8f16:
7368 case AArch64::FMULXv2f32:
7369 case AArch64::FMULXv4f32:
7370 case AArch64::FMULXv2f64:
7371 // -- SVE instructions --
7372 // Opcodes FMULX_ZZZ_? don't exist because there is no unpredicated FMULX
7373 // in the SVE instruction set (though there are predicated ones).
7374 case AArch64::FADD_ZZZ_H:
7375 case AArch64::FADD_ZZZ_S:
7376 case AArch64::FADD_ZZZ_D:
7377 case AArch64::FMUL_ZZZ_H:
7378 case AArch64::FMUL_ZZZ_S:
7379 case AArch64::FMUL_ZZZ_D:
7380 return Inst.getFlag(MachineInstr::MIFlag::FmReassoc) &&
7381 Inst.getFlag(MachineInstr::MIFlag::FmNsz);
7382
7383 // == Integer types ==
7384 // -- Base instructions --
7385 // Opcodes MULWrr and MULXrr don't exist because
7386 // `MUL <Wd>, <Wn>, <Wm>` and `MUL <Xd>, <Xn>, <Xm>` are aliases of
7387 // `MADD <Wd>, <Wn>, <Wm>, WZR` and `MADD <Xd>, <Xn>, <Xm>, XZR` respectively.
7388 // The machine-combiner does not support three-source-operand machine
7389 // instructions. So we cannot reassociate MULs.
7390 case AArch64::ADDWrr:
7391 case AArch64::ADDXrr:
7392 case AArch64::ANDWrr:
7393 case AArch64::ANDXrr:
7394 case AArch64::ORRWrr:
7395 case AArch64::ORRXrr:
7396 case AArch64::EORWrr:
7397 case AArch64::EORXrr:
7398 case AArch64::EONWrr:
7399 case AArch64::EONXrr:
7400 // -- Advanced SIMD instructions --
7401 // Opcodes MULv1i64 and MULv2i64 don't exist because there is no 64-bit MUL
7402 // in the Advanced SIMD instruction set.
7403 case AArch64::ADDv8i8:
7404 case AArch64::ADDv16i8:
7405 case AArch64::ADDv4i16:
7406 case AArch64::ADDv8i16:
7407 case AArch64::ADDv2i32:
7408 case AArch64::ADDv4i32:
7409 case AArch64::ADDv1i64:
7410 case AArch64::ADDv2i64:
7411 case AArch64::MULv8i8:
7412 case AArch64::MULv16i8:
7413 case AArch64::MULv4i16:
7414 case AArch64::MULv8i16:
7415 case AArch64::MULv2i32:
7416 case AArch64::MULv4i32:
7417 case AArch64::ANDv8i8:
7418 case AArch64::ANDv16i8:
7419 case AArch64::ORRv8i8:
7420 case AArch64::ORRv16i8:
7421 case AArch64::EORv8i8:
7422 case AArch64::EORv16i8:
7423 // -- SVE instructions --
7424 case AArch64::ADD_ZZZ_B:
7425 case AArch64::ADD_ZZZ_H:
7426 case AArch64::ADD_ZZZ_S:
7427 case AArch64::ADD_ZZZ_D:
7428 case AArch64::MUL_ZZZ_B:
7429 case AArch64::MUL_ZZZ_H:
7430 case AArch64::MUL_ZZZ_S:
7431 case AArch64::MUL_ZZZ_D:
7432 case AArch64::AND_ZZZ:
7433 case AArch64::ORR_ZZZ:
7434 case AArch64::EOR_ZZZ:
7435 return true;
7436
7437 default:
7438 return false;
7439 }
7440}
7441
7442/// Find instructions that can be turned into madd.
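/// A sketch of the basic case, with illustrative virtual registers:
///   %3:gpr32 = MADDWrrr %1, %2, $wzr   ; canonical form of MUL %1, %2
///   %4:gpr32 = ADDWrr %3, %0
/// is recorded as MULADDW_OP1 and later rewritten to
///   %4:gpr32 = MADDWrrr %1, %2, %0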
7443static bool getMaddPatterns(MachineInstr &Root,
7444 SmallVectorImpl<unsigned> &Patterns) {
7445 unsigned Opc = Root.getOpcode();
7446 MachineBasicBlock &MBB = *Root.getParent();
7447 bool Found = false;
7448
7449 if (!isCombineInstrCandidate(Opc))
7450 return false;
7451 if (isCombineInstrSettingFlag(Opc)) {
7452 int Cmp_NZCV =
7453 Root.findRegisterDefOperandIdx(AArch64::NZCV, /*TRI=*/nullptr, true);
7454 // When NZCV is live, bail out.
7455 if (Cmp_NZCV == -1)
7456 return false;
7457 unsigned NewOpc = convertToNonFlagSettingOpc(Root);
7458 // When the opcode can't change, bail out.
7459 // CHECKME: do we miss any cases for opcode conversion?
7460 if (NewOpc == Opc)
7461 return false;
7462 Opc = NewOpc;
7463 }
7464
7465 auto setFound = [&](int Opcode, int Operand, unsigned ZeroReg,
7466 unsigned Pattern) {
7467 if (canCombineWithMUL(MBB, Root.getOperand(Operand), Opcode, ZeroReg)) {
7468 Patterns.push_back(Pattern);
7469 Found = true;
7470 }
7471 };
7472
7473 auto setVFound = [&](int Opcode, int Operand, unsigned Pattern) {
7474 if (canCombine(MBB, Root.getOperand(Operand), Opcode)) {
7475 Patterns.push_back(Pattern);
7476 Found = true;
7477 }
7478 };
7479
7479
7480 typedef AArch64MachineCombinerPattern MCP;
7481
7482 switch (Opc) {
7483 default:
7484 break;
7485 case AArch64::ADDWrr:
7486 assert(Root.getOperand(1).isReg() && Root.getOperand(2).isReg() &&
7487 "ADDWrr does not have register operands");
7488 setFound(AArch64::MADDWrrr, 1, AArch64::WZR, MCP::MULADDW_OP1);
7489 setFound(AArch64::MADDWrrr, 2, AArch64::WZR, MCP::MULADDW_OP2);
7490 break;
7491 case AArch64::ADDXrr:
7492 setFound(AArch64::MADDXrrr, 1, AArch64::XZR, MCP::MULADDX_OP1);
7493 setFound(AArch64::MADDXrrr, 2, AArch64::XZR, MCP::MULADDX_OP2);
7494 break;
7495 case AArch64::SUBWrr:
7496 setFound(AArch64::MADDWrrr, 2, AArch64::WZR, MCP::MULSUBW_OP2);
7497 setFound(AArch64::MADDWrrr, 1, AArch64::WZR, MCP::MULSUBW_OP1);
7498 break;
7499 case AArch64::SUBXrr:
7500 setFound(AArch64::MADDXrrr, 2, AArch64::XZR, MCP::MULSUBX_OP2);
7501 setFound(AArch64::MADDXrrr, 1, AArch64::XZR, MCP::MULSUBX_OP1);
7502 break;
7503 case AArch64::ADDWri:
7504 setFound(AArch64::MADDWrrr, 1, AArch64::WZR, MCP::MULADDWI_OP1);
7505 break;
7506 case AArch64::ADDXri:
7507 setFound(AArch64::MADDXrrr, 1, AArch64::XZR, MCP::MULADDXI_OP1);
7508 break;
7509 case AArch64::SUBWri:
7510 setFound(AArch64::MADDWrrr, 1, AArch64::WZR, MCP::MULSUBWI_OP1);
7511 break;
7512 case AArch64::SUBXri:
7513 setFound(AArch64::MADDXrrr, 1, AArch64::XZR, MCP::MULSUBXI_OP1);
7514 break;
7515 case AArch64::ADDv8i8:
7516 setVFound(AArch64::MULv8i8, 1, MCP::MULADDv8i8_OP1);
7517 setVFound(AArch64::MULv8i8, 2, MCP::MULADDv8i8_OP2);
7518 break;
7519 case AArch64::ADDv16i8:
7520 setVFound(AArch64::MULv16i8, 1, MCP::MULADDv16i8_OP1);
7521 setVFound(AArch64::MULv16i8, 2, MCP::MULADDv16i8_OP2);
7522 break;
7523 case AArch64::ADDv4i16:
7524 setVFound(AArch64::MULv4i16, 1, MCP::MULADDv4i16_OP1);
7525 setVFound(AArch64::MULv4i16, 2, MCP::MULADDv4i16_OP2);
7526 setVFound(AArch64::MULv4i16_indexed, 1, MCP::MULADDv4i16_indexed_OP1);
7527 setVFound(AArch64::MULv4i16_indexed, 2, MCP::MULADDv4i16_indexed_OP2);
7528 break;
7529 case AArch64::ADDv8i16:
7530 setVFound(AArch64::MULv8i16, 1, MCP::MULADDv8i16_OP1);
7531 setVFound(AArch64::MULv8i16, 2, MCP::MULADDv8i16_OP2);
7532 setVFound(AArch64::MULv8i16_indexed, 1, MCP::MULADDv8i16_indexed_OP1);
7533 setVFound(AArch64::MULv8i16_indexed, 2, MCP::MULADDv8i16_indexed_OP2);
7534 break;
7535 case AArch64::ADDv2i32:
7536 setVFound(AArch64::MULv2i32, 1, MCP::MULADDv2i32_OP1);
7537 setVFound(AArch64::MULv2i32, 2, MCP::MULADDv2i32_OP2);
7538 setVFound(AArch64::MULv2i32_indexed, 1, MCP::MULADDv2i32_indexed_OP1);
7539 setVFound(AArch64::MULv2i32_indexed, 2, MCP::MULADDv2i32_indexed_OP2);
7540 break;
7541 case AArch64::ADDv4i32:
7542 setVFound(AArch64::MULv4i32, 1, MCP::MULADDv4i32_OP1);
7543 setVFound(AArch64::MULv4i32, 2, MCP::MULADDv4i32_OP2);
7544 setVFound(AArch64::MULv4i32_indexed, 1, MCP::MULADDv4i32_indexed_OP1);
7545 setVFound(AArch64::MULv4i32_indexed, 2, MCP::MULADDv4i32_indexed_OP2);
7546 break;
7547 case AArch64::SUBv8i8:
7548 setVFound(AArch64::MULv8i8, 1, MCP::MULSUBv8i8_OP1);
7549 setVFound(AArch64::MULv8i8, 2, MCP::MULSUBv8i8_OP2);
7550 break;
7551 case AArch64::SUBv16i8:
7552 setVFound(AArch64::MULv16i8, 1, MCP::MULSUBv16i8_OP1);
7553 setVFound(AArch64::MULv16i8, 2, MCP::MULSUBv16i8_OP2);
7554 break;
7555 case AArch64::SUBv4i16:
7556 setVFound(AArch64::MULv4i16, 1, MCP::MULSUBv4i16_OP1);
7557 setVFound(AArch64::MULv4i16, 2, MCP::MULSUBv4i16_OP2);
7558 setVFound(AArch64::MULv4i16_indexed, 1, MCP::MULSUBv4i16_indexed_OP1);
7559 setVFound(AArch64::MULv4i16_indexed, 2, MCP::MULSUBv4i16_indexed_OP2);
7560 break;
7561 case AArch64::SUBv8i16:
7562 setVFound(AArch64::MULv8i16, 1, MCP::MULSUBv8i16_OP1);
7563 setVFound(AArch64::MULv8i16, 2, MCP::MULSUBv8i16_OP2);
7564 setVFound(AArch64::MULv8i16_indexed, 1, MCP::MULSUBv8i16_indexed_OP1);
7565 setVFound(AArch64::MULv8i16_indexed, 2, MCP::MULSUBv8i16_indexed_OP2);
7566 break;
7567 case AArch64::SUBv2i32:
7568 setVFound(AArch64::MULv2i32, 1, MCP::MULSUBv2i32_OP1);
7569 setVFound(AArch64::MULv2i32, 2, MCP::MULSUBv2i32_OP2);
7570 setVFound(AArch64::MULv2i32_indexed, 1, MCP::MULSUBv2i32_indexed_OP1);
7571 setVFound(AArch64::MULv2i32_indexed, 2, MCP::MULSUBv2i32_indexed_OP2);
7572 break;
7573 case AArch64::SUBv4i32:
7574 setVFound(AArch64::MULv4i32, 1, MCP::MULSUBv4i32_OP1);
7575 setVFound(AArch64::MULv4i32, 2, MCP::MULSUBv4i32_OP2);
7576 setVFound(AArch64::MULv4i32_indexed, 1, MCP::MULSUBv4i32_indexed_OP1);
7577 setVFound(AArch64::MULv4i32_indexed, 2, MCP::MULSUBv4i32_indexed_OP2);
7578 break;
7579 }
7580 return Found;
7581}
7582
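// The ABA/ABAL-family opcodes below both read and write their destination
// (they accumulate an absolute difference into it), which is what makes them
// candidates for the generic accumulator-chain reassociation.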
7583bool AArch64InstrInfo::isAccumulationOpcode(unsigned Opcode) const {
7584 switch (Opcode) {
7585 default:
7586 break;
7587 case AArch64::UABALB_ZZZ_D:
7588 case AArch64::UABALB_ZZZ_H:
7589 case AArch64::UABALB_ZZZ_S:
7590 case AArch64::UABALT_ZZZ_D:
7591 case AArch64::UABALT_ZZZ_H:
7592 case AArch64::UABALT_ZZZ_S:
7593 case AArch64::SABALB_ZZZ_D:
7594 case AArch64::SABALB_ZZZ_S:
7595 case AArch64::SABALB_ZZZ_H:
7596 case AArch64::SABALT_ZZZ_D:
7597 case AArch64::SABALT_ZZZ_S:
7598 case AArch64::SABALT_ZZZ_H:
7599 case AArch64::UABALv16i8_v8i16:
7600 case AArch64::UABALv2i32_v2i64:
7601 case AArch64::UABALv4i16_v4i32:
7602 case AArch64::UABALv4i32_v2i64:
7603 case AArch64::UABALv8i16_v4i32:
7604 case AArch64::UABALv8i8_v8i16:
7605 case AArch64::UABAv16i8:
7606 case AArch64::UABAv2i32:
7607 case AArch64::UABAv4i16:
7608 case AArch64::UABAv4i32:
7609 case AArch64::UABAv8i16:
7610 case AArch64::UABAv8i8:
7611 case AArch64::SABALv16i8_v8i16:
7612 case AArch64::SABALv2i32_v2i64:
7613 case AArch64::SABALv4i16_v4i32:
7614 case AArch64::SABALv4i32_v2i64:
7615 case AArch64::SABALv8i16_v4i32:
7616 case AArch64::SABALv8i8_v8i16:
7617 case AArch64::SABAv16i8:
7618 case AArch64::SABAv2i32:
7619 case AArch64::SABAv4i16:
7620 case AArch64::SABAv4i32:
7621 case AArch64::SABAv8i16:
7622 case AArch64::SABAv8i8:
7623 return true;
7624 }
7625
7626 return false;
7627}
7628
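// Maps an accumulating opcode to the non-accumulating instruction that can
// start a fresh chain, e.g. UABALv8i8_v8i16 (accumulate) maps to
// UABDLv8i8_v8i16 (plain absolute difference).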
7629unsigned AArch64InstrInfo::getAccumulationStartOpcode(
7630 unsigned AccumulationOpcode) const {
7631 switch (AccumulationOpcode) {
7632 default:
7633 llvm_unreachable("Unsupported accumulation Opcode!");
7634 case AArch64::UABALB_ZZZ_D:
7635 return AArch64::UABDLB_ZZZ_D;
7636 case AArch64::UABALB_ZZZ_H:
7637 return AArch64::UABDLB_ZZZ_H;
7638 case AArch64::UABALB_ZZZ_S:
7639 return AArch64::UABDLB_ZZZ_S;
7640 case AArch64::UABALT_ZZZ_D:
7641 return AArch64::UABDLT_ZZZ_D;
7642 case AArch64::UABALT_ZZZ_H:
7643 return AArch64::UABDLT_ZZZ_H;
7644 case AArch64::UABALT_ZZZ_S:
7645 return AArch64::UABDLT_ZZZ_S;
7646 case AArch64::UABALv16i8_v8i16:
7647 return AArch64::UABDLv16i8_v8i16;
7648 case AArch64::UABALv2i32_v2i64:
7649 return AArch64::UABDLv2i32_v2i64;
7650 case AArch64::UABALv4i16_v4i32:
7651 return AArch64::UABDLv4i16_v4i32;
7652 case AArch64::UABALv4i32_v2i64:
7653 return AArch64::UABDLv4i32_v2i64;
7654 case AArch64::UABALv8i16_v4i32:
7655 return AArch64::UABDLv8i16_v4i32;
7656 case AArch64::UABALv8i8_v8i16:
7657 return AArch64::UABDLv8i8_v8i16;
7658 case AArch64::UABAv16i8:
7659 return AArch64::UABDv16i8;
7660 case AArch64::UABAv2i32:
7661 return AArch64::UABDv2i32;
7662 case AArch64::UABAv4i16:
7663 return AArch64::UABDv4i16;
7664 case AArch64::UABAv4i32:
7665 return AArch64::UABDv4i32;
7666 case AArch64::UABAv8i16:
7667 return AArch64::UABDv8i16;
7668 case AArch64::UABAv8i8:
7669 return AArch64::UABDv8i8;
7670 case AArch64::SABALB_ZZZ_D:
7671 return AArch64::SABDLB_ZZZ_D;
7672 case AArch64::SABALB_ZZZ_S:
7673 return AArch64::SABDLB_ZZZ_S;
7674 case AArch64::SABALB_ZZZ_H:
7675 return AArch64::SABDLB_ZZZ_H;
7676 case AArch64::SABALT_ZZZ_D:
7677 return AArch64::SABDLT_ZZZ_D;
7678 case AArch64::SABALT_ZZZ_S:
7679 return AArch64::SABDLT_ZZZ_S;
7680 case AArch64::SABALT_ZZZ_H:
7681 return AArch64::SABDLT_ZZZ_H;
7682 case AArch64::SABALv16i8_v8i16:
7683 return AArch64::SABDLv16i8_v8i16;
7684 case AArch64::SABALv2i32_v2i64:
7685 return AArch64::SABDLv2i32_v2i64;
7686 case AArch64::SABALv4i16_v4i32:
7687 return AArch64::SABDLv4i16_v4i32;
7688 case AArch64::SABALv4i32_v2i64:
7689 return AArch64::SABDLv4i32_v2i64;
7690 case AArch64::SABALv8i16_v4i32:
7691 return AArch64::SABDLv8i16_v4i32;
7692 case AArch64::SABALv8i8_v8i16:
7693 return AArch64::SABDLv8i8_v8i16;
7694 case AArch64::SABAv16i8:
7695 return AArch64::SABDv16i8;
7696 case AArch64::SABAv2i32:
7697 return AArch64::SABDv2i32;
7698 case AArch64::SABAv4i16:
7699 return AArch64::SABDv4i16;
7700 case AArch64::SABAv4i32:
7701 return AArch64::SABDv4i32;
7702 case AArch64::SABAv8i16:
7703 return AArch64::SABDv8i16;
7704 case AArch64::SABAv8i8:
7705 return AArch64::SABDv8i8;
7706 }
7707}
7708
7709/// Floating-Point Support
7710
7711/// Find instructions that can be turned into fmadd.
7712static bool getFMAPatterns(MachineInstr &Root,
7713 SmallVectorImpl<unsigned> &Patterns) {
7714
7715 if (!isCombineInstrCandidateFP(Root))
7716 return false;
7717
7718 MachineBasicBlock &MBB = *Root.getParent();
7719 bool Found = false;
7720
7721 auto Match = [&](int Opcode, int Operand, unsigned Pattern) -> bool {
7722 if (canCombineWithFMUL(MBB, Root.getOperand(Operand), Opcode)) {
7723 Patterns.push_back(Pattern);
7724 return true;
7725 }
7726 return false;
7727 };
7728
7729 typedef AArch64MachineCombinerPattern MCP;
7730
7731 switch (Root.getOpcode()) {
7732 default:
7733 assert(false && "Unsupported FP instruction in combiner\n");
7734 break;
7735 case AArch64::FADDHrr:
7736 assert(Root.getOperand(1).isReg() && Root.getOperand(2).isReg() &&
7737 "FADDHrr does not have register operands");
7738
7739 Found = Match(AArch64::FMULHrr, 1, MCP::FMULADDH_OP1);
7740 Found |= Match(AArch64::FMULHrr, 2, MCP::FMULADDH_OP2);
7741 break;
7742 case AArch64::FADDSrr:
7743 assert(Root.getOperand(1).isReg() && Root.getOperand(2).isReg() &&
7744 "FADDSrr does not have register operands");
7745
7746 Found |= Match(AArch64::FMULSrr, 1, MCP::FMULADDS_OP1) ||
7747 Match(AArch64::FMULv1i32_indexed, 1, MCP::FMLAv1i32_indexed_OP1);
7748
7749 Found |= Match(AArch64::FMULSrr, 2, MCP::FMULADDS_OP2) ||
7750 Match(AArch64::FMULv1i32_indexed, 2, MCP::FMLAv1i32_indexed_OP2);
7751 break;
7752 case AArch64::FADDDrr:
7753 Found |= Match(AArch64::FMULDrr, 1, MCP::FMULADDD_OP1) ||
7754 Match(AArch64::FMULv1i64_indexed, 1, MCP::FMLAv1i64_indexed_OP1);
7755
7756 Found |= Match(AArch64::FMULDrr, 2, MCP::FMULADDD_OP2) ||
7757 Match(AArch64::FMULv1i64_indexed, 2, MCP::FMLAv1i64_indexed_OP2);
7758 break;
7759 case AArch64::FADDv4f16:
7760 Found |= Match(AArch64::FMULv4i16_indexed, 1, MCP::FMLAv4i16_indexed_OP1) ||
7761 Match(AArch64::FMULv4f16, 1, MCP::FMLAv4f16_OP1);
7762
7763 Found |= Match(AArch64::FMULv4i16_indexed, 2, MCP::FMLAv4i16_indexed_OP2) ||
7764 Match(AArch64::FMULv4f16, 2, MCP::FMLAv4f16_OP2);
7765 break;
7766 case AArch64::FADDv8f16:
7767 Found |= Match(AArch64::FMULv8i16_indexed, 1, MCP::FMLAv8i16_indexed_OP1) ||
7768 Match(AArch64::FMULv8f16, 1, MCP::FMLAv8f16_OP1);
7769
7770 Found |= Match(AArch64::FMULv8i16_indexed, 2, MCP::FMLAv8i16_indexed_OP2) ||
7771 Match(AArch64::FMULv8f16, 2, MCP::FMLAv8f16_OP2);
7772 break;
7773 case AArch64::FADDv2f32:
7774 Found |= Match(AArch64::FMULv2i32_indexed, 1, MCP::FMLAv2i32_indexed_OP1) ||
7775 Match(AArch64::FMULv2f32, 1, MCP::FMLAv2f32_OP1);
7776
7777 Found |= Match(AArch64::FMULv2i32_indexed, 2, MCP::FMLAv2i32_indexed_OP2) ||
7778 Match(AArch64::FMULv2f32, 2, MCP::FMLAv2f32_OP2);
7779 break;
7780 case AArch64::FADDv2f64:
7781 Found |= Match(AArch64::FMULv2i64_indexed, 1, MCP::FMLAv2i64_indexed_OP1) ||
7782 Match(AArch64::FMULv2f64, 1, MCP::FMLAv2f64_OP1);
7783
7784 Found |= Match(AArch64::FMULv2i64_indexed, 2, MCP::FMLAv2i64_indexed_OP2) ||
7785 Match(AArch64::FMULv2f64, 2, MCP::FMLAv2f64_OP2);
7786 break;
7787 case AArch64::FADDv4f32:
7788 Found |= Match(AArch64::FMULv4i32_indexed, 1, MCP::FMLAv4i32_indexed_OP1) ||
7789 Match(AArch64::FMULv4f32, 1, MCP::FMLAv4f32_OP1);
7790
7791 Found |= Match(AArch64::FMULv4i32_indexed, 2, MCP::FMLAv4i32_indexed_OP2) ||
7792 Match(AArch64::FMULv4f32, 2, MCP::FMLAv4f32_OP2);
7793 break;
7794 case AArch64::FSUBHrr:
7795 Found = Match(AArch64::FMULHrr, 1, MCP::FMULSUBH_OP1);
7796 Found |= Match(AArch64::FMULHrr, 2, MCP::FMULSUBH_OP2);
7797 Found |= Match(AArch64::FNMULHrr, 1, MCP::FNMULSUBH_OP1);
7798 break;
7799 case AArch64::FSUBSrr:
7800 Found = Match(AArch64::FMULSrr, 1, MCP::FMULSUBS_OP1);
7801
7802 Found |= Match(AArch64::FMULSrr, 2, MCP::FMULSUBS_OP2) ||
7803 Match(AArch64::FMULv1i32_indexed, 2, MCP::FMLSv1i32_indexed_OP2);
7804
7805 Found |= Match(AArch64::FNMULSrr, 1, MCP::FNMULSUBS_OP1);
7806 break;
7807 case AArch64::FSUBDrr:
7808 Found = Match(AArch64::FMULDrr, 1, MCP::FMULSUBD_OP1);
7809
7810 Found |= Match(AArch64::FMULDrr, 2, MCP::FMULSUBD_OP2) ||
7811 Match(AArch64::FMULv1i64_indexed, 2, MCP::FMLSv1i64_indexed_OP2);
7812
7813 Found |= Match(AArch64::FNMULDrr, 1, MCP::FNMULSUBD_OP1);
7814 break;
7815 case AArch64::FSUBv4f16:
7816 Found |= Match(AArch64::FMULv4i16_indexed, 2, MCP::FMLSv4i16_indexed_OP2) ||
7817 Match(AArch64::FMULv4f16, 2, MCP::FMLSv4f16_OP2);
7818
7819 Found |= Match(AArch64::FMULv4i16_indexed, 1, MCP::FMLSv4i16_indexed_OP1) ||
7820 Match(AArch64::FMULv4f16, 1, MCP::FMLSv4f16_OP1);
7821 break;
7822 case AArch64::FSUBv8f16:
7823 Found |= Match(AArch64::FMULv8i16_indexed, 2, MCP::FMLSv8i16_indexed_OP2) ||
7824 Match(AArch64::FMULv8f16, 2, MCP::FMLSv8f16_OP2);
7825
7826 Found |= Match(AArch64::FMULv8i16_indexed, 1, MCP::FMLSv8i16_indexed_OP1) ||
7827 Match(AArch64::FMULv8f16, 1, MCP::FMLSv8f16_OP1);
7828 break;
7829 case AArch64::FSUBv2f32:
7830 Found |= Match(AArch64::FMULv2i32_indexed, 2, MCP::FMLSv2i32_indexed_OP2) ||
7831 Match(AArch64::FMULv2f32, 2, MCP::FMLSv2f32_OP2);
7832
7833 Found |= Match(AArch64::FMULv2i32_indexed, 1, MCP::FMLSv2i32_indexed_OP1) ||
7834 Match(AArch64::FMULv2f32, 1, MCP::FMLSv2f32_OP1);
7835 break;
7836 case AArch64::FSUBv2f64:
7837 Found |= Match(AArch64::FMULv2i64_indexed, 2, MCP::FMLSv2i64_indexed_OP2) ||
7838 Match(AArch64::FMULv2f64, 2, MCP::FMLSv2f64_OP2);
7839
7840 Found |= Match(AArch64::FMULv2i64_indexed, 1, MCP::FMLSv2i64_indexed_OP1) ||
7841 Match(AArch64::FMULv2f64, 1, MCP::FMLSv2f64_OP1);
7842 break;
7843 case AArch64::FSUBv4f32:
7844 Found |= Match(AArch64::FMULv4i32_indexed, 2, MCP::FMLSv4i32_indexed_OP2) ||
7845 Match(AArch64::FMULv4f32, 2, MCP::FMLSv4f32_OP2);
7846
7847 Found |= Match(AArch64::FMULv4i32_indexed, 1, MCP::FMLSv4i32_indexed_OP1) ||
7848 Match(AArch64::FMULv4f32, 1, MCP::FMLSv4f32_OP1);
7849 break;
7850 }
7851 return Found;
7852}
7853
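/// Find FMUL instructions with a lane-DUP operand that can instead be
/// selected as an indexed FMUL, e.g. an FMULv2f32 fed by a DUPv2i32lane.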
7854static bool getFMULPatterns(MachineInstr &Root,
7855 SmallVectorImpl<unsigned> &Patterns) {
7856 MachineBasicBlock &MBB = *Root.getParent();
7857 bool Found = false;
7858
7859 auto Match = [&](unsigned Opcode, int Operand, unsigned Pattern) -> bool {
7860 MachineRegisterInfo &MRI = MBB.getParent()->getRegInfo();
7861 MachineOperand &MO = Root.getOperand(Operand);
7862 MachineInstr *MI = nullptr;
7863 if (MO.isReg() && MO.getReg().isVirtual())
7864 MI = MRI.getUniqueVRegDef(MO.getReg());
7865 // Ignore No-op COPYs in FMUL(COPY(DUP(..)))
7866 if (MI && MI->getOpcode() == TargetOpcode::COPY &&
7867 MI->getOperand(1).getReg().isVirtual())
7868 MI = MRI.getUniqueVRegDef(MI->getOperand(1).getReg());
7869 if (MI && MI->getOpcode() == Opcode) {
7870 Patterns.push_back(Pattern);
7871 return true;
7872 }
7873 return false;
7874 };
7875
7876 typedef AArch64MachineCombinerPattern MCP;
7877
7878 switch (Root.getOpcode()) {
7879 default:
7880 return false;
7881 case AArch64::FMULv2f32:
7882 Found = Match(AArch64::DUPv2i32lane, 1, MCP::FMULv2i32_indexed_OP1);
7883 Found |= Match(AArch64::DUPv2i32lane, 2, MCP::FMULv2i32_indexed_OP2);
7884 break;
7885 case AArch64::FMULv2f64:
7886 Found = Match(AArch64::DUPv2i64lane, 1, MCP::FMULv2i64_indexed_OP1);
7887 Found |= Match(AArch64::DUPv2i64lane, 2, MCP::FMULv2i64_indexed_OP2);
7888 break;
7889 case AArch64::FMULv4f16:
7890 Found = Match(AArch64::DUPv4i16lane, 1, MCP::FMULv4i16_indexed_OP1);
7891 Found |= Match(AArch64::DUPv4i16lane, 2, MCP::FMULv4i16_indexed_OP2);
7892 break;
7893 case AArch64::FMULv4f32:
7894 Found = Match(AArch64::DUPv4i32lane, 1, MCP::FMULv4i32_indexed_OP1);
7895 Found |= Match(AArch64::DUPv4i32lane, 2, MCP::FMULv4i32_indexed_OP2);
7896 break;
7897 case AArch64::FMULv8f16:
7898 Found = Match(AArch64::DUPv8i16lane, 1, MCP::FMULv8i16_indexed_OP1);
7899 Found |= Match(AArch64::DUPv8i16lane, 2, MCP::FMULv8i16_indexed_OP2);
7900 break;
7901 }
7902
7903 return Found;
7904}
7905
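/// Match an FNEG of an FMADD (both carrying the contract and nsz fast-math
/// flags) so the pair can become one FNMADD. Sketch with illustrative
/// registers:
///   %2 = FMADDSrrr %a, %b, %c
///   %3 = FNEGSr %2
/// ==> %3 = FNMADDSrrr %a, %b, %c   ; computes -(%a * %b + %c)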
7906static bool getFNEGPatterns(MachineInstr &Root,
7907 SmallVectorImpl<unsigned> &Patterns) {
7908 unsigned Opc = Root.getOpcode();
7909 MachineBasicBlock &MBB = *Root.getParent();
7910 MachineRegisterInfo &MRI = MBB.getParent()->getRegInfo();
7911
7912 auto Match = [&](unsigned Opcode, unsigned Pattern) -> bool {
7913 MachineOperand &MO = Root.getOperand(1);
7914 MachineInstr *MI = MRI.getUniqueVRegDef(MO.getReg());
7915 if (MI != nullptr && (MI->getOpcode() == Opcode) &&
7916 MRI.hasOneNonDBGUse(MI->getOperand(0).getReg()) &&
7917 Root.getFlag(MachineInstr::MIFlag::FmContract) &&
7918 Root.getFlag(MachineInstr::MIFlag::FmNsz) &&
7919 MI->getFlag(MachineInstr::MIFlag::FmContract) &&
7920 MI->getFlag(MachineInstr::MIFlag::FmNsz)) {
7921 Patterns.push_back(Pattern);
7922 return true;
7923 }
7924 return false;
7925 };
7926
7927 switch (Opc) {
7928 default:
7929 break;
7930 case AArch64::FNEGDr:
7931 return Match(AArch64::FMADDDrrr, AArch64MachineCombinerPattern::FNMADD);
7932 case AArch64::FNEGSr:
7933 return Match(AArch64::FMADDSrrr, AArch64MachineCombinerPattern::FNMADD);
7934 }
7935
7936 return false;
7937}
7938
7939/// Return true when a code sequence can improve throughput. It
7940/// should be called only for instructions in loops.
7941/// \param Pattern - combiner pattern
7942static bool isThroughputPattern(unsigned Pattern) {
7943 switch (Pattern) {
7944 default:
7945 break;
8051 return true;
8052 } // end switch (Pattern)
8053 return false;
8054}
8055
8056/// Find other MI combine patterns.
8057static bool getMiscPatterns(MachineInstr &Root,
8058 SmallVectorImpl<unsigned> &Patterns) {
8059 // A - (B + C) ==> (A - B) - C or (A - C) - B
8060 unsigned Opc = Root.getOpcode();
8061 MachineBasicBlock &MBB = *Root.getParent();
8062
8063 switch (Opc) {
8064 case AArch64::SUBWrr:
8065 case AArch64::SUBSWrr:
8066 case AArch64::SUBXrr:
8067 case AArch64::SUBSXrr:
8068 // Found candidate root.
8069 break;
8070 default:
8071 return false;
8072 }
8073
8074 if (isCombineInstrSettingFlag(Opc) &&
8075 Root.findRegisterDefOperandIdx(AArch64::NZCV, /*TRI=*/nullptr, true) ==
8076 -1)
8077 return false;
8078
8079 if (canCombine(MBB, Root.getOperand(2), AArch64::ADDWrr) ||
8080 canCombine(MBB, Root.getOperand(2), AArch64::ADDSWrr) ||
8081 canCombine(MBB, Root.getOperand(2), AArch64::ADDXrr) ||
8082 canCombine(MBB, Root.getOperand(2), AArch64::ADDSXrr)) {
8083 Patterns.push_back(AArch64MachineCombinerPattern::SUBADD_OP1);
8084 Patterns.push_back(AArch64MachineCombinerPattern::SUBADD_OP2);
8085 return true;
8086 }
8087
8088 return false;
8089}
8090
8091/// Check if the given instruction forms a gather load pattern that can be
8092/// optimized for better Memory-Level Parallelism (MLP). This function
8093/// identifies chains of NEON lane load instructions that load data from
8094/// different memory addresses into individual lanes of a 128-bit vector
8095/// register, then attempts to split the pattern into parallel loads to break
8096/// the serial dependency between instructions.
8097///
8098/// Pattern Matched:
8099/// Initial scalar load -> SUBREG_TO_REG (lane 0) -> LD1i* (lane 1) ->
8100/// LD1i* (lane 2) -> ... -> LD1i* (lane N-1, Root)
8101///
8102/// Transformed Into:
8103/// Two parallel vector loads using fewer lanes each, followed by ZIP1v2i64
8104/// to combine the results, enabling better memory-level parallelism.
8105///
8106/// Supported Element Types:
8107/// - 32-bit elements (LD1i32, 4 lanes total)
8108/// - 16-bit elements (LD1i16, 8 lanes total)
8109/// - 8-bit elements (LD1i8, 16 lanes total)
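/// A concrete 32-bit sketch (pointers %p0..%p3 and vregs are illustrative):
///   %s:fpr32   = LDRSui %p0, 0
///   %v0:fpr128 = SUBREG_TO_REG 0, %s, %subreg.ssub
///   %v1 = LD1i32 %v0, 1, %p1
///   %v2 = LD1i32 %v1, 2, %p2
///   %v3 = LD1i32 %v2, 3, %p3      ; Root, lane NumLanes-1
/// which the rewrite splits into two independent two-lane chains joined by a
/// ZIP1v2i64.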
8110static bool getGatherLanePattern(MachineInstr &Root,
8111 SmallVectorImpl<unsigned> &Patterns,
8112 unsigned LoadLaneOpCode, unsigned NumLanes) {
8113 const MachineFunction *MF = Root.getMF();
8114
8115 // Early exit if optimizing for size.
8116 if (MF->getFunction().hasMinSize())
8117 return false;
8118
8119 const MachineRegisterInfo &MRI = MF->getRegInfo();
8120 const TargetRegisterInfo *TRI = MF->getSubtarget().getRegisterInfo();
8121
8122 // The root of the pattern must load into the last lane of the vector.
8123 if (Root.getOperand(2).getImm() != NumLanes - 1)
8124 return false;
8125
8126 // Check that we have load into all lanes except lane 0.
8127 // For each load we also want to check that:
8128 // 1. It has a single non-debug use (since we will be replacing the virtual
8129 // register)
8130 // 2. That the addressing mode only uses a single pointer operand
8131 auto *CurrInstr = MRI.getUniqueVRegDef(Root.getOperand(1).getReg());
8132 auto Range = llvm::seq<unsigned>(1, NumLanes - 1);
8133 SmallSet<unsigned, 16> RemainingLanes(Range.begin(), Range.end());
8134 SmallVector<const MachineInstr *, 16> LoadInstrs;
8135 while (!RemainingLanes.empty() && CurrInstr &&
8136 CurrInstr->getOpcode() == LoadLaneOpCode &&
8137 MRI.hasOneNonDBGUse(CurrInstr->getOperand(0).getReg()) &&
8138 CurrInstr->getNumOperands() == 4) {
8139 RemainingLanes.erase(CurrInstr->getOperand(2).getImm());
8140 LoadInstrs.push_back(CurrInstr);
8141 CurrInstr = MRI.getUniqueVRegDef(CurrInstr->getOperand(1).getReg());
8142 }
8143
8144 // Check that we have found a match for lanes N-1.. 1.
8145 if (!RemainingLanes.empty())
8146 return false;
8147
8148 // Match the SUBREG_TO_REG sequence.
8149 if (CurrInstr->getOpcode() != TargetOpcode::SUBREG_TO_REG)
8150 return false;
8151
8152 // Verify that the subreg to reg loads an integer into the first lane.
8153 auto Lane0LoadReg = CurrInstr->getOperand(1).getReg();
8154 unsigned SingleLaneSizeInBits = 128 / NumLanes;
8155 if (TRI->getRegSizeInBits(Lane0LoadReg, MRI) != SingleLaneSizeInBits)
8156 return false;
8157
8158 // Verify that it also has a single non debug use.
8159 if (!MRI.hasOneNonDBGUse(Lane0LoadReg))
8160 return false;
8161
8162 LoadInstrs.push_back(MRI.getUniqueVRegDef(Lane0LoadReg));
8163
8164 // If there is any chance of aliasing, do not apply the pattern.
8165 // Walk backward through the MBB starting from Root.
8166 // Exit early if we've encountered all load instructions or hit the search
8167 // limit.
8168 auto MBBItr = Root.getIterator();
8169 unsigned RemainingSteps = GatherOptSearchLimit;
8170 SmallPtrSet<const MachineInstr *, 16> RemainingLoadInstrs;
8171 RemainingLoadInstrs.insert(LoadInstrs.begin(), LoadInstrs.end());
8172 const MachineBasicBlock *MBB = Root.getParent();
8173
8174 for (; MBBItr != MBB->begin() && RemainingSteps > 0 &&
8175 !RemainingLoadInstrs.empty();
8176 --MBBItr, --RemainingSteps) {
8177 const MachineInstr &CurrInstr = *MBBItr;
8178
8179 // Remove this instruction from remaining loads if it's one we're tracking.
8180 RemainingLoadInstrs.erase(&CurrInstr);
8181
8182 // Check for potential aliasing with any of the load instructions to
8183 // optimize.
8184 if (CurrInstr.isLoadFoldBarrier())
8185 return false;
8186 }
8187
8188 // If we hit the search limit without finding all load instructions,
8189 // don't match the pattern.
8190 if (RemainingSteps == 0 && !RemainingLoadInstrs.empty())
8191 return false;
8192
8193 switch (NumLanes) {
8194 case 4:
8195 Patterns.push_back(AArch64MachineCombinerPattern::GATHER_LANE_i32);
8196 break;
8197 case 8:
8198 Patterns.push_back(AArch64MachineCombinerPattern::GATHER_LANE_i16);
8199 break;
8200 case 16:
8201 Patterns.push_back(AArch64MachineCombinerPattern::GATHER_LANE_i8);
8202 break;
8203 default:
8204 llvm_unreachable("Got bad number of lanes for gather pattern.");
8205 }
8206
8207 return true;
8208}
8209
8210/// Search for patterns of LD instructions we can optimize.
8211static bool getLoadPatterns(MachineInstr &Root,
8212 SmallVectorImpl<unsigned> &Patterns) {
8213
8214 // The pattern searches for loads into single lanes.
8215 switch (Root.getOpcode()) {
8216 case AArch64::LD1i32:
8217 return getGatherLanePattern(Root, Patterns, Root.getOpcode(), 4);
8218 case AArch64::LD1i16:
8219 return getGatherLanePattern(Root, Patterns, Root.getOpcode(), 8);
8220 case AArch64::LD1i8:
8221 return getGatherLanePattern(Root, Patterns, Root.getOpcode(), 16);
8222 default:
8223 return false;
8224 }
8225}
8226
8227/// Generate optimized instruction sequence for gather load patterns to improve
8228/// Memory-Level Parallelism (MLP). This function transforms a chain of
8229/// sequential NEON lane loads into parallel vector loads that can execute
8230/// concurrently.
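// In outline: lanes 1..N/2-1 stay chained onto the original lane-0 value
// (register 0), lane N/2 is re-fetched with a plain scalar load that also
// zeroes the upper lanes, lanes N/2+1..N-1 are chained onto it (register 1),
// and a final ZIP1v2i64 concatenates the two halves.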
8231static void
8232generateGatherLanePattern(MachineInstr &Root,
8233 SmallVectorImpl<MachineInstr *> &InsInstrs,
8234 SmallVectorImpl<MachineInstr *> &DelInstrs,
8235 DenseMap<Register, unsigned> &InstrIdxForVirtReg,
8236 unsigned Pattern, unsigned NumLanes) {
8237 MachineFunction &MF = *Root.getParent()->getParent();
8238 MachineRegisterInfo &MRI = MF.getRegInfo();
8239 const TargetInstrInfo *TII = MF.getSubtarget().getInstrInfo();
8240
8241 // Gather the initial load instructions to build the pattern.
8242 SmallVector<MachineInstr *, 16> LoadToLaneInstrs;
8243 MachineInstr *CurrInstr = &Root;
8244 for (unsigned i = 0; i < NumLanes - 1; ++i) {
8245 LoadToLaneInstrs.push_back(CurrInstr);
8246 CurrInstr = MRI.getUniqueVRegDef(CurrInstr->getOperand(1).getReg());
8247 }
8248
8249 // Sort the load instructions according to the lane.
8250 llvm::sort(LoadToLaneInstrs,
8251 [](const MachineInstr *A, const MachineInstr *B) {
8252 return A->getOperand(2).getImm() > B->getOperand(2).getImm();
8253 });
8254
8255 MachineInstr *SubregToReg = CurrInstr;
8256 LoadToLaneInstrs.push_back(
8257 MRI.getUniqueVRegDef(SubregToReg->getOperand(1).getReg()));
8258 auto LoadToLaneInstrsAscending = llvm::reverse(LoadToLaneInstrs);
8259
8260 const TargetRegisterClass *FPR128RegClass =
8261 MRI.getRegClass(Root.getOperand(0).getReg());
8262
8263 // Helper lambda to create a LD1 instruction.
8264 auto CreateLD1Instruction = [&](MachineInstr *OriginalInstr,
8265 Register SrcRegister, unsigned Lane,
8266 Register OffsetRegister,
8267 bool OffsetRegisterKillState) {
8268 auto NewRegister = MRI.createVirtualRegister(FPR128RegClass);
8269 MachineInstrBuilder LoadIndexIntoRegister =
8270 BuildMI(MF, MIMetadata(*OriginalInstr), TII->get(Root.getOpcode()),
8271 NewRegister)
8272 .addReg(SrcRegister)
8273 .addImm(Lane)
8274 .addReg(OffsetRegister, getKillRegState(OffsetRegisterKillState))
8275 .setMemRefs(OriginalInstr->memoperands());
8276 InstrIdxForVirtReg.insert(std::make_pair(NewRegister, InsInstrs.size()));
8277 InsInstrs.push_back(LoadIndexIntoRegister);
8278 return NewRegister;
8279 };
8280
8281 // Helper to create load instruction based on the NumLanes in the NEON
8282 // register we are rewriting.
8283 auto CreateLDRInstruction =
8284 [&](unsigned NumLanes, Register DestReg, Register OffsetReg,
8285 ArrayRef<MachineMemOperand *> MMOs) -> MachineInstrBuilder {
8286 unsigned Opcode;
8287 switch (NumLanes) {
8288 case 4:
8289 Opcode = AArch64::LDRSui;
8290 break;
8291 case 8:
8292 Opcode = AArch64::LDRHui;
8293 break;
8294 case 16:
8295 Opcode = AArch64::LDRBui;
8296 break;
8297 default:
8298 llvm_unreachable(
8299 "Got unsupported number of lanes in machine-combiner gather pattern");
8300 }
8301 // Immediate offset load
8302 return BuildMI(MF, MIMetadata(Root), TII->get(Opcode), DestReg)
8303 .addReg(OffsetReg)
8304 .addImm(0)
8305 .setMemRefs(MMOs);
8306 };
8307
8308 // Load the remaining lanes into register 0.
8309 auto LanesToLoadToReg0 =
8310 llvm::make_range(LoadToLaneInstrsAscending.begin() + 1,
8311 LoadToLaneInstrsAscending.begin() + NumLanes / 2);
8312 Register PrevReg = SubregToReg->getOperand(0).getReg();
8313 for (auto [Index, LoadInstr] : llvm::enumerate(LanesToLoadToReg0)) {
8314 const MachineOperand &OffsetRegOperand = LoadInstr->getOperand(3);
8315 PrevReg = CreateLD1Instruction(LoadInstr, PrevReg, Index + 1,
8316 OffsetRegOperand.getReg(),
8317 OffsetRegOperand.isKill());
8318 DelInstrs.push_back(LoadInstr);
8319 }
8320 Register LastLoadReg0 = PrevReg;
8321
8322 // First load into register 1. Perform an integer load to zero out the upper
8323 // lanes in a single instruction.
8324 MachineInstr *Lane0Load = *LoadToLaneInstrsAscending.begin();
8325 MachineInstr *OriginalSplitLoad =
8326 *std::next(LoadToLaneInstrsAscending.begin(), NumLanes / 2);
8327 Register DestRegForMiddleIndex = MRI.createVirtualRegister(
8328 MRI.getRegClass(Lane0Load->getOperand(0).getReg()));
8329
8330 const MachineOperand &OriginalSplitToLoadOffsetOperand =
8331 OriginalSplitLoad->getOperand(3);
8332 MachineInstrBuilder MiddleIndexLoadInstr =
8333 CreateLDRInstruction(NumLanes, DestRegForMiddleIndex,
8334 OriginalSplitToLoadOffsetOperand.getReg(),
8335 OriginalSplitLoad->memoperands());
8336
8337 InstrIdxForVirtReg.insert(
8338 std::make_pair(DestRegForMiddleIndex, InsInstrs.size()));
8339 InsInstrs.push_back(MiddleIndexLoadInstr);
8340 DelInstrs.push_back(OriginalSplitLoad);
8341
8342 // Subreg To Reg instruction for register 1.
8343 Register DestRegForSubregToReg = MRI.createVirtualRegister(FPR128RegClass);
8344 unsigned SubregType;
8345 switch (NumLanes) {
8346 case 4:
8347 SubregType = AArch64::ssub;
8348 break;
8349 case 8:
8350 SubregType = AArch64::hsub;
8351 break;
8352 case 16:
8353 SubregType = AArch64::bsub;
8354 break;
8355 default:
8356 llvm_unreachable(
8357 "Got invalid NumLanes for machine-combiner gather pattern");
8358 }
8359
8360 auto SubRegToRegInstr =
8361 BuildMI(MF, MIMetadata(Root), TII->get(SubregToReg->getOpcode()),
8362 DestRegForSubregToReg)
8363 .addReg(DestRegForMiddleIndex, getKillRegState(true))
8364 .addImm(SubregType);
8365 InstrIdxForVirtReg.insert(
8366 std::make_pair(DestRegForSubregToReg, InsInstrs.size()));
8367 InsInstrs.push_back(SubRegToRegInstr);
8368
8369 // Load remaining lanes into register 1.
8370 auto LanesToLoadToReg1 =
8371 llvm::make_range(LoadToLaneInstrsAscending.begin() + NumLanes / 2 + 1,
8372 LoadToLaneInstrsAscending.end());
8373 PrevReg = SubRegToRegInstr->getOperand(0).getReg();
8374 for (auto [Index, LoadInstr] : llvm::enumerate(LanesToLoadToReg1)) {
8375 const MachineOperand &OffsetRegOperand = LoadInstr->getOperand(3);
8376 PrevReg = CreateLD1Instruction(LoadInstr, PrevReg, Index + 1,
8377 OffsetRegOperand.getReg(),
8378 OffsetRegOperand.isKill());
8379
8380 // Do not add the last reg to DelInstrs - it will be removed later.
8381 if (Index == NumLanes / 2 - 2) {
8382 break;
8383 }
8384 DelInstrs.push_back(LoadInstr);
8385 }
8386 Register LastLoadReg1 = PrevReg;
8387
8388 // Create the final zip instruction to combine the results.
8389 MachineInstrBuilder ZipInstr =
8390 BuildMI(MF, MIMetadata(Root), TII->get(AArch64::ZIP1v2i64),
8391 Root.getOperand(0).getReg())
8392 .addReg(LastLoadReg0)
8393 .addReg(LastLoadReg1);
8394 InsInstrs.push_back(ZipInstr);
8395}
8396
8410
8411/// Return true when there is potentially a faster code sequence for an
8412/// instruction chain ending in \p Root. All potential patterns are listed in
8413/// the \p Pattern vector. Pattern should be sorted in priority order since the
8414/// pattern evaluator stops checking as soon as it finds a faster sequence.
8415
8416bool AArch64InstrInfo::getMachineCombinerPatterns(
8417 MachineInstr &Root, SmallVectorImpl<unsigned> &Patterns,
8418 bool DoRegPressureReduce) const {
8419 // Integer patterns
8420 if (getMaddPatterns(Root, Patterns))
8421 return true;
8422 // Floating point patterns
8423 if (getFMULPatterns(Root, Patterns))
8424 return true;
8425 if (getFMAPatterns(Root, Patterns))
8426 return true;
8427 if (getFNEGPatterns(Root, Patterns))
8428 return true;
8429
8430 // Other patterns
8431 if (getMiscPatterns(Root, Patterns))
8432 return true;
8433
8434 // Load patterns
8435 if (getLoadPatterns(Root, Patterns))
8436 return true;
8437
8438 return TargetInstrInfo::getMachineCombinerPatterns(Root, Patterns,
8439 DoRegPressureReduce);
8440}
8441
8442enum class FMAInstKind { Default, Indexed, Accumulator };
8443/// genFusedMultiply - Generate fused multiply instructions.
8444/// This function supports both integer and floating point instructions.
8445/// A typical example:
8446/// F|MUL I=A,B,0
8447/// F|ADD R,I,C
8448/// ==> F|MADD R,A,B,C
8449/// \param MF Containing MachineFunction
8450/// \param MRI Register information
8451/// \param TII Target information
8452/// \param Root is the F|ADD instruction
8453/// \param [out] InsInstrs is a vector of machine instructions and will
8454/// contain the generated madd instruction
8455/// \param IdxMulOpd is index of operand in Root that is the result of
8456/// the F|MUL. In the example above IdxMulOpd is 1.
8457/// \param MaddOpc the opcode of the f|madd instruction
8458/// \param RC Register class of operands
8459/// \param kind Kind of fma instruction (addressing mode) to be generated
8460/// \param ReplacedAddend is the result register from the instruction
8461/// replacing the non-combined operand, if any.
8462static MachineInstr *
8463genFusedMultiply(MachineFunction &MF, MachineRegisterInfo &MRI,
8464 const TargetInstrInfo *TII, MachineInstr &Root,
8465 SmallVectorImpl<MachineInstr *> &InsInstrs, unsigned IdxMulOpd,
8466 unsigned MaddOpc, const TargetRegisterClass *RC,
8467 FMAInstKind kind = FMAInstKind::Default,
8468 const Register *ReplacedAddend = nullptr) {
8469 assert(IdxMulOpd == 1 || IdxMulOpd == 2);
8470
8471 unsigned IdxOtherOpd = IdxMulOpd == 1 ? 2 : 1;
8472 MachineInstr *MUL = MRI.getUniqueVRegDef(Root.getOperand(IdxMulOpd).getReg());
8473 Register ResultReg = Root.getOperand(0).getReg();
8474 Register SrcReg0 = MUL->getOperand(1).getReg();
8475 bool Src0IsKill = MUL->getOperand(1).isKill();
8476 Register SrcReg1 = MUL->getOperand(2).getReg();
8477 bool Src1IsKill = MUL->getOperand(2).isKill();
8478
8479 Register SrcReg2;
8480 bool Src2IsKill;
8481 if (ReplacedAddend) {
8482 // If we just generated a new addend, we must be its only use.
8483 SrcReg2 = *ReplacedAddend;
8484 Src2IsKill = true;
8485 } else {
8486 SrcReg2 = Root.getOperand(IdxOtherOpd).getReg();
8487 Src2IsKill = Root.getOperand(IdxOtherOpd).isKill();
8488 }
8489
8490 if (ResultReg.isVirtual())
8491 MRI.constrainRegClass(ResultReg, RC);
8492 if (SrcReg0.isVirtual())
8493 MRI.constrainRegClass(SrcReg0, RC);
8494 if (SrcReg1.isVirtual())
8495 MRI.constrainRegClass(SrcReg1, RC);
8496 if (SrcReg2.isVirtual())
8497 MRI.constrainRegClass(SrcReg2, RC);
8498
8499 MachineInstrBuilder MIB;
8500 if (kind == FMAInstKind::Default)
8501 MIB = BuildMI(MF, MIMetadata(Root), TII->get(MaddOpc), ResultReg)
8502 .addReg(SrcReg0, getKillRegState(Src0IsKill))
8503 .addReg(SrcReg1, getKillRegState(Src1IsKill))
8504 .addReg(SrcReg2, getKillRegState(Src2IsKill));
8505 else if (kind == FMAInstKind::Indexed)
8506 MIB = BuildMI(MF, MIMetadata(Root), TII->get(MaddOpc), ResultReg)
8507 .addReg(SrcReg2, getKillRegState(Src2IsKill))
8508 .addReg(SrcReg0, getKillRegState(Src0IsKill))
8509 .addReg(SrcReg1, getKillRegState(Src1IsKill))
8510 .addImm(MUL->getOperand(3).getImm());
8511 else if (kind == FMAInstKind::Accumulator)
8512 MIB = BuildMI(MF, MIMetadata(Root), TII->get(MaddOpc), ResultReg)
8513 .addReg(SrcReg2, getKillRegState(Src2IsKill))
8514 .addReg(SrcReg0, getKillRegState(Src0IsKill))
8515 .addReg(SrcReg1, getKillRegState(Src1IsKill));
8516 else
8517 assert(false && "Invalid FMA instruction kind \n");
8518 // Insert the MADD (MADD, FMA, FMS, FMLA, FMSL)
8519 InsInstrs.push_back(MIB);
8520 return MUL;
8521}
8522
8523static MachineInstr *
8524genFNegatedMAD(MachineFunction &MF, MachineRegisterInfo &MRI,
8525 const TargetInstrInfo *TII, MachineInstr &Root,
8526 SmallVectorImpl<MachineInstr *> &InsInstrs) {
8527 MachineInstr *MAD = MRI.getUniqueVRegDef(Root.getOperand(1).getReg());
8528
8529 unsigned Opc = 0;
8530 const TargetRegisterClass *RC = MRI.getRegClass(MAD->getOperand(0).getReg());
8531 if (AArch64::FPR32RegClass.hasSubClassEq(RC))
8532 Opc = AArch64::FNMADDSrrr;
8533 else if (AArch64::FPR64RegClass.hasSubClassEq(RC))
8534 Opc = AArch64::FNMADDDrrr;
8535 else
8536 return nullptr;
8537
8538 Register ResultReg = Root.getOperand(0).getReg();
8539 Register SrcReg0 = MAD->getOperand(1).getReg();
8540 Register SrcReg1 = MAD->getOperand(2).getReg();
8541 Register SrcReg2 = MAD->getOperand(3).getReg();
8542 bool Src0IsKill = MAD->getOperand(1).isKill();
8543 bool Src1IsKill = MAD->getOperand(2).isKill();
8544 bool Src2IsKill = MAD->getOperand(3).isKill();
8545 if (ResultReg.isVirtual())
8546 MRI.constrainRegClass(ResultReg, RC);
8547 if (SrcReg0.isVirtual())
8548 MRI.constrainRegClass(SrcReg0, RC);
8549 if (SrcReg1.isVirtual())
8550 MRI.constrainRegClass(SrcReg1, RC);
8551 if (SrcReg2.isVirtual())
8552 MRI.constrainRegClass(SrcReg2, RC);
8553
8554 MachineInstrBuilder MIB =
8555 BuildMI(MF, MIMetadata(Root), TII->get(Opc), ResultReg)
8556 .addReg(SrcReg0, getKillRegState(Src0IsKill))
8557 .addReg(SrcReg1, getKillRegState(Src1IsKill))
8558 .addReg(SrcReg2, getKillRegState(Src2IsKill));
8559 InsInstrs.push_back(MIB);
8560
8561 return MAD;
8562}
8563
8564/// Fold (FMUL x (DUP y lane)) into (FMUL_indexed x y lane)
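/// e.g., with illustrative virtual registers:
///   %2 = DUPv4i32lane %1, 1
///   %3 = FMULv4f32 %0, %2
/// ==> %3 = FMULv4i32_indexed %0, %1, 1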
8565static MachineInstr *
8566genIndexedMultiply(MachineInstr &Root,
8567 SmallVectorImpl<MachineInstr *> &InsInstrs,
8568 unsigned IdxDupOp, unsigned MulOpc,
8569 const TargetRegisterClass *RC, MachineRegisterInfo &MRI) {
8570 assert(((IdxDupOp == 1) || (IdxDupOp == 2)) &&
8571 "Invalid index of FMUL operand");
8572
8573 MachineFunction &MF = *Root.getMF();
8574 const TargetInstrInfo *TII = MF.getSubtarget().getInstrInfo();
8575
8576 MachineInstr *Dup =
8577 MF.getRegInfo().getUniqueVRegDef(Root.getOperand(IdxDupOp).getReg());
8578
8579 if (Dup->getOpcode() == TargetOpcode::COPY)
8580 Dup = MRI.getUniqueVRegDef(Dup->getOperand(1).getReg());
8581
8582 Register DupSrcReg = Dup->getOperand(1).getReg();
8583 MRI.clearKillFlags(DupSrcReg);
8584 MRI.constrainRegClass(DupSrcReg, RC);
8585
8586 unsigned DupSrcLane = Dup->getOperand(2).getImm();
8587
8588 unsigned IdxMulOp = IdxDupOp == 1 ? 2 : 1;
8589 MachineOperand &MulOp = Root.getOperand(IdxMulOp);
8590
8591 Register ResultReg = Root.getOperand(0).getReg();
8592
8594 MIB = BuildMI(MF, MIMetadata(Root), TII->get(MulOpc), ResultReg)
8595 .add(MulOp)
8596 .addReg(DupSrcReg)
8597 .addImm(DupSrcLane);
8598
8599 InsInstrs.push_back(MIB);
8600 return &Root;
8601}
8602
8603/// genFusedMultiplyAcc - Helper to generate fused multiply accumulate
8604/// instructions.
8605///
8606/// \see genFusedMultiply
8607static MachineInstr *genFusedMultiplyAcc(
8608 MachineFunction &MF, MachineRegisterInfo &MRI, const TargetInstrInfo *TII,
8609 MachineInstr &Root, SmallVectorImpl<MachineInstr *> &InsInstrs,
8610 unsigned IdxMulOpd, unsigned MaddOpc, const TargetRegisterClass *RC) {
8611 return genFusedMultiply(MF, MRI, TII, Root, InsInstrs, IdxMulOpd, MaddOpc, RC,
8612 FMAInstKind::Accumulator);
8613}
8614
8615/// genNeg - Helper to generate an intermediate negation of the second operand
8616/// of Root
8617static Register genNeg(MachineFunction &MF, MachineRegisterInfo &MRI,
8618 const TargetInstrInfo *TII, MachineInstr &Root,
8619 SmallVectorImpl<MachineInstr *> &InsInstrs,
8620 DenseMap<Register, unsigned> &InstrIdxForVirtReg,
8621 unsigned MnegOpc, const TargetRegisterClass *RC) {
8622 Register NewVR = MRI.createVirtualRegister(RC);
8623 MachineInstrBuilder MIB =
8624 BuildMI(MF, MIMetadata(Root), TII->get(MnegOpc), NewVR)
8625 .add(Root.getOperand(2));
8626 InsInstrs.push_back(MIB);
8627
8628 assert(InstrIdxForVirtReg.empty());
8629 InstrIdxForVirtReg.insert(std::make_pair(NewVR, 0));
8630
8631 return NewVR;
8632}
8633
8634/// genFusedMultiplyAccNeg - Helper to generate fused multiply accumulate
8635/// instructions with an additional negation of the accumulator
8636static MachineInstr *genFusedMultiplyAccNeg(
8637 MachineFunction &MF, MachineRegisterInfo &MRI, const TargetInstrInfo *TII,
8638 MachineInstr &Root, SmallVectorImpl<MachineInstr *> &InsInstrs,
8639 DenseMap<Register, unsigned> &InstrIdxForVirtReg, unsigned IdxMulOpd,
8640 unsigned MaddOpc, unsigned MnegOpc, const TargetRegisterClass *RC) {
8641 assert(IdxMulOpd == 1);
8642
8643 Register NewVR =
8644 genNeg(MF, MRI, TII, Root, InsInstrs, InstrIdxForVirtReg, MnegOpc, RC);
8645 return genFusedMultiply(MF, MRI, TII, Root, InsInstrs, IdxMulOpd, MaddOpc, RC,
8646 FMAInstKind::Accumulator, &NewVR);
8647}
8648
8649/// genFusedMultiplyIdx - Helper to generate fused multiply accumulate
8650/// instructions.
8651///
8652/// \see genFusedMultiply
8653static MachineInstr *genFusedMultiplyIdx(
8654 MachineFunction &MF, MachineRegisterInfo &MRI, const TargetInstrInfo *TII,
8655 MachineInstr &Root, SmallVectorImpl<MachineInstr *> &InsInstrs,
8656 unsigned IdxMulOpd, unsigned MaddOpc, const TargetRegisterClass *RC) {
8657 return genFusedMultiply(MF, MRI, TII, Root, InsInstrs, IdxMulOpd, MaddOpc, RC,
8658 FMAInstKind::Indexed);
8659}
8660
8661/// genFusedMultiplyIdxNeg - Helper to generate fused multiply accumulate
8662/// instructions with an additional negation of the accumulator
8663static MachineInstr *genFusedMultiplyIdxNeg(
8664 MachineFunction &MF, MachineRegisterInfo &MRI, const TargetInstrInfo *TII,
8665 MachineInstr &Root, SmallVectorImpl<MachineInstr *> &InsInstrs,
8666 DenseMap<Register, unsigned> &InstrIdxForVirtReg, unsigned IdxMulOpd,
8667 unsigned MaddOpc, unsigned MnegOpc, const TargetRegisterClass *RC) {
8668 assert(IdxMulOpd == 1);
8669
8670 Register NewVR =
8671 genNeg(MF, MRI, TII, Root, InsInstrs, InstrIdxForVirtReg, MnegOpc, RC);
8672
8673 return genFusedMultiply(MF, MRI, TII, Root, InsInstrs, IdxMulOpd, MaddOpc, RC,
8674 FMAInstKind::Indexed, &NewVR);
8675}
8676
8677/// genMaddR - Generate madd instruction and combine mul and add using
8678/// an extra virtual register
8679/// Example - an ADD intermediate needs to be stored in a register:
8680/// MUL I=A,B,0
8681/// ADD R,I,Imm
8682/// ==> ORR V, ZR, Imm
8683/// ==> MADD R,A,B,V
8684/// \param MF Containing MachineFunction
8685/// \param MRI Register information
8686/// \param TII Target information
8687/// \param Root is the ADD instruction
8688/// \param [out] InsInstrs is a vector of machine instructions and will
8689/// contain the generated madd instruction
8690/// \param IdxMulOpd is index of operand in Root that is the result of
8691/// the MUL. In the example above IdxMulOpd is 1.
8692/// \param MaddOpc the opcode of the madd instruction
8693/// \param VR is a virtual register that holds the value of an ADD operand
8694/// (V in the example above).
8695/// \param RC Register class of operands
8696static MachineInstr *genMaddR(MachineFunction &MF, MachineRegisterInfo &MRI,
8697 const TargetInstrInfo *TII, MachineInstr &Root,
8698 SmallVectorImpl<MachineInstr *> &InsInstrs,
8699 unsigned IdxMulOpd, unsigned MaddOpc, unsigned VR,
8700 const TargetRegisterClass *RC) {
8701 assert(IdxMulOpd == 1 || IdxMulOpd == 2);
8702
8703 MachineInstr *MUL = MRI.getUniqueVRegDef(Root.getOperand(IdxMulOpd).getReg());
8704 Register ResultReg = Root.getOperand(0).getReg();
8705 Register SrcReg0 = MUL->getOperand(1).getReg();
8706 bool Src0IsKill = MUL->getOperand(1).isKill();
8707 Register SrcReg1 = MUL->getOperand(2).getReg();
8708 bool Src1IsKill = MUL->getOperand(2).isKill();
8709
8710 if (ResultReg.isVirtual())
8711 MRI.constrainRegClass(ResultReg, RC);
8712 if (SrcReg0.isVirtual())
8713 MRI.constrainRegClass(SrcReg0, RC);
8714 if (SrcReg1.isVirtual())
8715 MRI.constrainRegClass(SrcReg1, RC);
8716 if (Register(VR).isVirtual())
8717 MRI.constrainRegClass(VR, RC);
8718
8719 MachineInstrBuilder MIB =
8720 BuildMI(MF, MIMetadata(Root), TII->get(MaddOpc), ResultReg)
8721 .addReg(SrcReg0, getKillRegState(Src0IsKill))
8722 .addReg(SrcReg1, getKillRegState(Src1IsKill))
8723 .addReg(VR);
8724 // Insert the MADD
8725 InsInstrs.push_back(MIB);
8726 return MUL;
8727}
8728
8729/// Do the following transformation
8730/// A - (B + C) ==> (A - B) - C
8731/// A - (B + C) ==> (A - C) - B
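/// Sketch for IdxOpd1 == 1, with illustrative virtual registers:
///   %3 = ADDWrr %1, %2
///   %4 = SUBWrr %0, %3
/// ==>
///   %5 = SUBWrr %0, %1
///   %4 = SUBWrr %5, %2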
8732static void genSubAdd2SubSub(MachineFunction &MF, MachineRegisterInfo &MRI,
8733 const TargetInstrInfo *TII, MachineInstr &Root,
8734 SmallVectorImpl<MachineInstr *> &InsInstrs,
8735 SmallVectorImpl<MachineInstr *> &DelInstrs,
8736 unsigned IdxOpd1,
8737 DenseMap<Register, unsigned> &InstrIdxForVirtReg) {
8738 assert(IdxOpd1 == 1 || IdxOpd1 == 2);
8739 unsigned IdxOtherOpd = IdxOpd1 == 1 ? 2 : 1;
8740 MachineInstr *AddMI = MRI.getUniqueVRegDef(Root.getOperand(2).getReg());
8741
8742 Register ResultReg = Root.getOperand(0).getReg();
8743 Register RegA = Root.getOperand(1).getReg();
8744 bool RegAIsKill = Root.getOperand(1).isKill();
8745 Register RegB = AddMI->getOperand(IdxOpd1).getReg();
8746 bool RegBIsKill = AddMI->getOperand(IdxOpd1).isKill();
8747 Register RegC = AddMI->getOperand(IdxOtherOpd).getReg();
8748 bool RegCIsKill = AddMI->getOperand(IdxOtherOpd).isKill();
8749 Register NewVR =
8750 MRI.createVirtualRegister(MRI.getRegClass(Root.getOperand(2).getReg()));
8751
8752 unsigned Opcode = Root.getOpcode();
8753 if (Opcode == AArch64::SUBSWrr)
8754 Opcode = AArch64::SUBWrr;
8755 else if (Opcode == AArch64::SUBSXrr)
8756 Opcode = AArch64::SUBXrr;
8757 else
8758 assert((Opcode == AArch64::SUBWrr || Opcode == AArch64::SUBXrr) &&
8759 "Unexpected instruction opcode.");
8760
8761 uint32_t Flags = Root.mergeFlagsWith(*AddMI);
8762 Flags &= ~MachineInstr::NoSWrap;
8763 Flags &= ~MachineInstr::NoUWrap;
8764
8765 MachineInstrBuilder MIB1 =
8766 BuildMI(MF, MIMetadata(Root), TII->get(Opcode), NewVR)
8767 .addReg(RegA, getKillRegState(RegAIsKill))
8768 .addReg(RegB, getKillRegState(RegBIsKill))
8769 .setMIFlags(Flags);
8770 MachineInstrBuilder MIB2 =
8771 BuildMI(MF, MIMetadata(Root), TII->get(Opcode), ResultReg)
8772 .addReg(NewVR, getKillRegState(true))
8773 .addReg(RegC, getKillRegState(RegCIsKill))
8774 .setMIFlags(Flags);
8775
8776 InstrIdxForVirtReg.insert(std::make_pair(NewVR, 0));
8777 InsInstrs.push_back(MIB1);
8778 InsInstrs.push_back(MIB2);
8779 DelInstrs.push_back(AddMI);
8780 DelInstrs.push_back(&Root);
8781}
8782
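// Returns the plain vector/SVE ADD with the same result type as the given
// accumulating opcode; the combiner uses it to sum the partial accumulators
// produced when an ABA/ABAL chain is split.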
8783unsigned AArch64InstrInfo::getReduceOpcodeForAccumulator(
8784 unsigned int AccumulatorOpCode) const {
8785 switch (AccumulatorOpCode) {
8786 case AArch64::UABALB_ZZZ_D:
8787 case AArch64::SABALB_ZZZ_D:
8788 case AArch64::UABALT_ZZZ_D:
8789 case AArch64::SABALT_ZZZ_D:
8790 return AArch64::ADD_ZZZ_D;
8791 case AArch64::UABALB_ZZZ_H:
8792 case AArch64::SABALB_ZZZ_H:
8793 case AArch64::UABALT_ZZZ_H:
8794 case AArch64::SABALT_ZZZ_H:
8795 return AArch64::ADD_ZZZ_H;
8796 case AArch64::UABALB_ZZZ_S:
8797 case AArch64::SABALB_ZZZ_S:
8798 case AArch64::UABALT_ZZZ_S:
8799 case AArch64::SABALT_ZZZ_S:
8800 return AArch64::ADD_ZZZ_S;
8801 case AArch64::UABALv16i8_v8i16:
8802 case AArch64::SABALv8i8_v8i16:
8803 case AArch64::SABAv8i16:
8804 case AArch64::UABAv8i16:
8805 return AArch64::ADDv8i16;
8806 case AArch64::SABALv2i32_v2i64:
8807 case AArch64::UABALv2i32_v2i64:
8808 case AArch64::SABALv4i32_v2i64:
8809 return AArch64::ADDv2i64;
8810 case AArch64::UABALv4i16_v4i32:
8811 case AArch64::SABALv4i16_v4i32:
8812 case AArch64::SABALv8i16_v4i32:
8813 case AArch64::SABAv4i32:
8814 case AArch64::UABAv4i32:
8815 return AArch64::ADDv4i32;
8816 case AArch64::UABALv4i32_v2i64:
8817 return AArch64::ADDv2i64;
8818 case AArch64::UABALv8i16_v4i32:
8819 return AArch64::ADDv4i32;
8820 case AArch64::UABALv8i8_v8i16:
8821 case AArch64::SABALv16i8_v8i16:
8822 return AArch64::ADDv8i16;
8823 case AArch64::UABAv16i8:
8824 case AArch64::SABAv16i8:
8825 return AArch64::ADDv16i8;
8826 case AArch64::UABAv4i16:
8827 case AArch64::SABAv4i16:
8828 return AArch64::ADDv4i16;
8829 case AArch64::UABAv2i32:
8830 case AArch64::SABAv2i32:
8831 return AArch64::ADDv2i32;
8832 case AArch64::UABAv8i8:
8833 case AArch64::SABAv8i8:
8834 return AArch64::ADDv8i8;
8835 default:
8836 llvm_unreachable("Unknown accumulator opcode");
8837 }
8838}
8839
8840/// When getMachineCombinerPatterns() finds potential patterns,
8841/// this function generates the instructions that could replace the
8842 /// original code sequence.
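/// For example (an illustrative sketch), the multiply-add pattern
/// \code
///   mul w8, w0, w1
///   add w0, w8, w2
/// \endcode
/// is replaced by
/// \code
///   madd w0, w0, w1, w2
/// \endcode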
8843void AArch64InstrInfo::genAlternativeCodeSequence(
8844 MachineInstr &Root, unsigned Pattern,
8847 DenseMap<Register, unsigned> &InstrIdxForVirtReg) const {
8848 MachineBasicBlock &MBB = *Root.getParent();
8849 MachineRegisterInfo &MRI = MBB.getParent()->getRegInfo();
8850 MachineFunction &MF = *MBB.getParent();
8851 const TargetInstrInfo *TII = MF.getSubtarget().getInstrInfo();
8852
8853 MachineInstr *MUL = nullptr;
8854 const TargetRegisterClass *RC;
8855 unsigned Opc;
8856 switch (Pattern) {
8857 default:
8858 // Reassociate instructions.
8859 TargetInstrInfo::genAlternativeCodeSequence(Root, Pattern, InsInstrs,
8860 DelInstrs, InstrIdxForVirtReg);
8861 return;
8862 case AArch64MachineCombinerPattern::SUBADD_OP1:
8863 // A - (B + C)
8864 // ==> (A - B) - C
8865 genSubAdd2SubSub(MF, MRI, TII, Root, InsInstrs, DelInstrs, 1,
8866 InstrIdxForVirtReg);
8867 return;
8868 case AArch64MachineCombinerPattern::SUBADD_OP2:
8869 // A - (B + C)
8870 // ==> (A - C) - B
8871 genSubAdd2SubSub(MF, MRI, TII, Root, InsInstrs, DelInstrs, 2,
8872 InstrIdxForVirtReg);
8873 return;
8874 case AArch64MachineCombinerPattern::MULADDW_OP1:
8875 case AArch64MachineCombinerPattern::MULADDX_OP1:
8876 // MUL I=A,B,0
8877 // ADD R,I,C
8878 // ==> MADD R,A,B,C
8879 // --- Create(MADD);
8880 if (Pattern == AArch64MachineCombinerPattern::MULADDW_OP1) {
8881 Opc = AArch64::MADDWrrr;
8882 RC = &AArch64::GPR32RegClass;
8883 } else {
8884 Opc = AArch64::MADDXrrr;
8885 RC = &AArch64::GPR64RegClass;
8886 }
8887 MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 1, Opc, RC);
8888 break;
8889 case AArch64MachineCombinerPattern::MULADDW_OP2:
8890 case AArch64MachineCombinerPattern::MULADDX_OP2:
8891 // MUL I=A,B,0
8892 // ADD R,C,I
8893 // ==> MADD R,A,B,C
8894 // --- Create(MADD);
8895 if (Pattern == AArch64MachineCombinerPattern::MULADDW_OP2) {
8896 Opc = AArch64::MADDWrrr;
8897 RC = &AArch64::GPR32RegClass;
8898 } else {
8899 Opc = AArch64::MADDXrrr;
8900 RC = &AArch64::GPR64RegClass;
8901 }
8902 MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 2, Opc, RC);
8903 break;
8904 case AArch64MachineCombinerPattern::MULADDWI_OP1:
8905 case AArch64MachineCombinerPattern::MULADDXI_OP1:
8906 case AArch64MachineCombinerPattern::MULSUBWI_OP1:
8907 case AArch64MachineCombinerPattern::MULSUBXI_OP1: {
8908 // MUL I=A,B,0
8909 // ADD/SUB R,I,Imm
8910 // ==> MOV V, Imm/-Imm
8911 // ==> MADD R,A,B,V
8912 // --- Create(MADD);
8913 const TargetRegisterClass *RC;
8914 unsigned BitSize, MovImm;
8915 if (Pattern == AArch64MachineCombinerPattern::MULADDWI_OP1 ||
8916 Pattern == AArch64MachineCombinerPattern::MULSUBWI_OP1) {
8917 MovImm = AArch64::MOVi32imm;
8918 RC = &AArch64::GPR32spRegClass;
8919 BitSize = 32;
8920 Opc = AArch64::MADDWrrr;
8921 RC = &AArch64::GPR32RegClass;
8922 } else {
8923 MovImm = AArch64::MOVi64imm;
8924 RC = &AArch64::GPR64spRegClass;
8925 BitSize = 64;
8926 Opc = AArch64::MADDXrrr;
8927 RC = &AArch64::GPR64RegClass;
8928 }
8929 Register NewVR = MRI.createVirtualRegister(RC);
8930 uint64_t Imm = Root.getOperand(2).getImm();
8931
8932 if (Root.getOperand(3).isImm()) {
8933 unsigned Val = Root.getOperand(3).getImm();
8934 Imm = Imm << Val;
8935 }
8936 bool IsSub = Pattern == AArch64MachineCombinerPattern::MULSUBWI_OP1 ||
8937 Pattern == AArch64MachineCombinerPattern::MULSUBXI_OP1;
8938 uint64_t UImm = SignExtend64(IsSub ? -Imm : Imm, BitSize);
8939 // Check that the immediate can be composed via a single instruction.
8940 SmallVector<AArch64_IMM::ImmInsnModel, 4> Insn;
8941 AArch64_IMM::expandMOVImm(UImm, BitSize, Insn);
8942 if (Insn.size() != 1)
8943 return;
8944 MachineInstrBuilder MIB1 =
8945 BuildMI(MF, MIMetadata(Root), TII->get(MovImm), NewVR)
8946 .addImm(IsSub ? -Imm : Imm);
8947 InsInstrs.push_back(MIB1);
8948 InstrIdxForVirtReg.insert(std::make_pair(NewVR, 0));
8949 MUL = genMaddR(MF, MRI, TII, Root, InsInstrs, 1, Opc, NewVR, RC);
8950 break;
8951 }
8952 case AArch64MachineCombinerPattern::MULSUBW_OP1:
8953 case AArch64MachineCombinerPattern::MULSUBX_OP1: {
8954 // MUL I=A,B,0
8955 // SUB R,I,C
8956 // ==> SUB V, 0, C
8957 // ==> MADD R,A,B,V // = -C + A*B
8958 // --- Create(MADD);
8959 const TargetRegisterClass *SubRC;
8960 unsigned SubOpc, ZeroReg;
8961 if (Pattern == AArch64MachineCombinerPattern::MULSUBW_OP1) {
8962 SubOpc = AArch64::SUBWrr;
8963 SubRC = &AArch64::GPR32spRegClass;
8964 ZeroReg = AArch64::WZR;
8965 Opc = AArch64::MADDWrrr;
8966 RC = &AArch64::GPR32RegClass;
8967 } else {
8968 SubOpc = AArch64::SUBXrr;
8969 SubRC = &AArch64::GPR64spRegClass;
8970 ZeroReg = AArch64::XZR;
8971 Opc = AArch64::MADDXrrr;
8972 RC = &AArch64::GPR64RegClass;
8973 }
8974 Register NewVR = MRI.createVirtualRegister(SubRC);
8975 // SUB NewVR, 0, C
8976 MachineInstrBuilder MIB1 =
8977 BuildMI(MF, MIMetadata(Root), TII->get(SubOpc), NewVR)
8978 .addReg(ZeroReg)
8979 .add(Root.getOperand(2));
8980 InsInstrs.push_back(MIB1);
8981 InstrIdxForVirtReg.insert(std::make_pair(NewVR, 0));
8982 MUL = genMaddR(MF, MRI, TII, Root, InsInstrs, 1, Opc, NewVR, RC);
8983 break;
8984 }
8985 case AArch64MachineCombinerPattern::MULSUBW_OP2:
8986 case AArch64MachineCombinerPattern::MULSUBX_OP2:
8987 // MUL I=A,B,0
8988 // SUB R,C,I
8989 // ==> MSUB R,A,B,C (computes C - A*B)
8990 // --- Create(MSUB);
8991 if (Pattern == AArch64MachineCombinerPattern::MULSUBW_OP2) {
8992 Opc = AArch64::MSUBWrrr;
8993 RC = &AArch64::GPR32RegClass;
8994 } else {
8995 Opc = AArch64::MSUBXrrr;
8996 RC = &AArch64::GPR64RegClass;
8997 }
8998 MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 2, Opc, RC);
8999 break;
9000 case AArch64MachineCombinerPattern::MLAv8i8_OP1:
9001 Opc = AArch64::MLAv8i8;
9002 RC = &AArch64::FPR64RegClass;
9003 MUL = genFusedMultiplyAcc(MF, MRI, TII, Root, InsInstrs, 1, Opc, RC);
9004 break;
9005 case AArch64MachineCombinerPattern::MLAv8i8_OP2:
9006 Opc = AArch64::MLAv8i8;
9007 RC = &AArch64::FPR64RegClass;
9008 MUL = genFusedMultiplyAcc(MF, MRI, TII, Root, InsInstrs, 2, Opc, RC);
9009 break;
9010 case AArch64MachineCombinerPattern::MLAv16i8_OP1:
9011 Opc = AArch64::MLAv16i8;
9012 RC = &AArch64::FPR128RegClass;
9013 MUL = genFusedMultiplyAcc(MF, MRI, TII, Root, InsInstrs, 1, Opc, RC);
9014 break;
9015 case AArch64MachineCombinerPattern::MLAv16i8_OP2:
9016 Opc = AArch64::MLAv16i8;
9017 RC = &AArch64::FPR128RegClass;
9018 MUL = genFusedMultiplyAcc(MF, MRI, TII, Root, InsInstrs, 2, Opc, RC);
9019 break;
9020 case AArch64MachineCombinerPattern::MLAv4i16_OP1:
9021 Opc = AArch64::MLAv4i16;
9022 RC = &AArch64::FPR64RegClass;
9023 MUL = genFusedMultiplyAcc(MF, MRI, TII, Root, InsInstrs, 1, Opc, RC);
9024 break;
9025 case AArch64MachineCombinerPattern::MLAv4i16_OP2:
9026 Opc = AArch64::MLAv4i16;
9027 RC = &AArch64::FPR64RegClass;
9028 MUL = genFusedMultiplyAcc(MF, MRI, TII, Root, InsInstrs, 2, Opc, RC);
9029 break;
9030 case AArch64MachineCombinerPattern::MLAv8i16_OP1:
9031 Opc = AArch64::MLAv8i16;
9032 RC = &AArch64::FPR128RegClass;
9033 MUL = genFusedMultiplyAcc(MF, MRI, TII, Root, InsInstrs, 1, Opc, RC);
9034 break;
9035 case AArch64MachineCombinerPattern::MLAv8i16_OP2:
9036 Opc = AArch64::MLAv8i16;
9037 RC = &AArch64::FPR128RegClass;
9038 MUL = genFusedMultiplyAcc(MF, MRI, TII, Root, InsInstrs, 2, Opc, RC);
9039 break;
9040 case AArch64MachineCombinerPattern::MLAv2i32_OP1:
9041 Opc = AArch64::MLAv2i32;
9042 RC = &AArch64::FPR64RegClass;
9043 MUL = genFusedMultiplyAcc(MF, MRI, TII, Root, InsInstrs, 1, Opc, RC);
9044 break;
9045 case AArch64MachineCombinerPattern::MLAv2i32_OP2:
9046 Opc = AArch64::MLAv2i32;
9047 RC = &AArch64::FPR64RegClass;
9048 MUL = genFusedMultiplyAcc(MF, MRI, TII, Root, InsInstrs, 2, Opc, RC);
9049 break;
9050 case AArch64MachineCombinerPattern::MLAv4i32_OP1:
9051 Opc = AArch64::MLAv4i32;
9052 RC = &AArch64::FPR128RegClass;
9053 MUL = genFusedMultiplyAcc(MF, MRI, TII, Root, InsInstrs, 1, Opc, RC);
9054 break;
9055 case AArch64MachineCombinerPattern::MLAv4i32_OP2:
9056 Opc = AArch64::MLAv4i32;
9057 RC = &AArch64::FPR128RegClass;
9058 MUL = genFusedMultiplyAcc(MF, MRI, TII, Root, InsInstrs, 2, Opc, RC);
9059 break;
9060
9061 case AArch64MachineCombinerPattern::MLSv8i8_OP1:
9062 Opc = AArch64::MLAv8i8;
9063 RC = &AArch64::FPR64RegClass;
9064 MUL = genFusedMultiplyAccNeg(MF, MRI, TII, Root, InsInstrs,
9065 InstrIdxForVirtReg, 1, Opc, AArch64::NEGv8i8,
9066 RC);
9067 break;
9068 case AArch64MachineCombinerPattern::MLSv8i8_OP2:
9069 Opc = AArch64::MLSv8i8;
9070 RC = &AArch64::FPR64RegClass;
9071 MUL = genFusedMultiplyAcc(MF, MRI, TII, Root, InsInstrs, 2, Opc, RC);
9072 break;
9073 case AArch64MachineCombinerPattern::MLSv16i8_OP1:
9074 Opc = AArch64::MLAv16i8;
9075 RC = &AArch64::FPR128RegClass;
9076 MUL = genFusedMultiplyAccNeg(MF, MRI, TII, Root, InsInstrs,
9077 InstrIdxForVirtReg, 1, Opc, AArch64::NEGv16i8,
9078 RC);
9079 break;
9080 case AArch64MachineCombinerPattern::MLSv16i8_OP2:
9081 Opc = AArch64::MLSv16i8;
9082 RC = &AArch64::FPR128RegClass;
9083 MUL = genFusedMultiplyAcc(MF, MRI, TII, Root, InsInstrs, 2, Opc, RC);
9084 break;
9085 case AArch64MachineCombinerPattern::MLSv4i16_OP1:
9086 Opc = AArch64::MLAv4i16;
9087 RC = &AArch64::FPR64RegClass;
9088 MUL = genFusedMultiplyAccNeg(MF, MRI, TII, Root, InsInstrs,
9089 InstrIdxForVirtReg, 1, Opc, AArch64::NEGv4i16,
9090 RC);
9091 break;
9092 case AArch64MachineCombinerPattern::MLSv4i16_OP2:
9093 Opc = AArch64::MLSv4i16;
9094 RC = &AArch64::FPR64RegClass;
9095 MUL = genFusedMultiplyAcc(MF, MRI, TII, Root, InsInstrs, 2, Opc, RC);
9096 break;
9097 case AArch64MachineCombinerPattern::MLSv8i16_OP1:
9098 Opc = AArch64::MLAv8i16;
9099 RC = &AArch64::FPR128RegClass;
9100 MUL = genFusedMultiplyAccNeg(MF, MRI, TII, Root, InsInstrs,
9101 InstrIdxForVirtReg, 1, Opc, AArch64::NEGv8i16,
9102 RC);
9103 break;
9104 case AArch64MachineCombinerPattern::MLSv8i16_OP2:
9105 Opc = AArch64::MLSv8i16;
9106 RC = &AArch64::FPR128RegClass;
9107 MUL = genFusedMultiplyAcc(MF, MRI, TII, Root, InsInstrs, 2, Opc, RC);
9108 break;
9109 case AArch64MachineCombinerPattern::MLSv2i32_OP1:
9110 Opc = AArch64::MLAv2i32;
9111 RC = &AArch64::FPR64RegClass;
9112 MUL = genFusedMultiplyAccNeg(MF, MRI, TII, Root, InsInstrs,
9113 InstrIdxForVirtReg, 1, Opc, AArch64::NEGv2i32,
9114 RC);
9115 break;
9116 case AArch64MachineCombinerPattern::MLSv2i32_OP2:
9117 Opc = AArch64::MLSv2i32;
9118 RC = &AArch64::FPR64RegClass;
9119 MUL = genFusedMultiplyAcc(MF, MRI, TII, Root, InsInstrs, 2, Opc, RC);
9120 break;
9121 case AArch64MachineCombinerPattern::MLSv4i32_OP1:
9122 Opc = AArch64::MLAv4i32;
9123 RC = &AArch64::FPR128RegClass;
9124 MUL = genFusedMultiplyAccNeg(MF, MRI, TII, Root, InsInstrs,
9125 InstrIdxForVirtReg, 1, Opc, AArch64::NEGv4i32,
9126 RC);
9127 break;
9128 case AArch64MachineCombinerPattern::MLSv4i32_OP2:
9129 Opc = AArch64::MLSv4i32;
9130 RC = &AArch64::FPR128RegClass;
9131 MUL = genFusedMultiplyAcc(MF, MRI, TII, Root, InsInstrs, 2, Opc, RC);
9132 break;
9133
9134 case AArch64MachineCombinerPattern::MLAv4i16_indexed_OP1:
9135 Opc = AArch64::MLAv4i16_indexed;
9136 RC = &AArch64::FPR64RegClass;
9137 MUL = genFusedMultiplyIdx(MF, MRI, TII, Root, InsInstrs, 1, Opc, RC);
9138 break;
9139 case AArch64MachineCombinerPattern::MLAv4i16_indexed_OP2:
9140 Opc = AArch64::MLAv4i16_indexed;
9141 RC = &AArch64::FPR64RegClass;
9142 MUL = genFusedMultiplyIdx(MF, MRI, TII, Root, InsInstrs, 2, Opc, RC);
9143 break;
9144 case AArch64MachineCombinerPattern::MLAv8i16_indexed_OP1:
9145 Opc = AArch64::MLAv8i16_indexed;
9146 RC = &AArch64::FPR128RegClass;
9147 MUL = genFusedMultiplyIdx(MF, MRI, TII, Root, InsInstrs, 1, Opc, RC);
9148 break;
9149 case AArch64MachineCombinerPattern::MLAv8i16_indexed_OP2:
9150 Opc = AArch64::MLAv8i16_indexed;
9151 RC = &AArch64::FPR128RegClass;
9152 MUL = genFusedMultiplyIdx(MF, MRI, TII, Root, InsInstrs, 2, Opc, RC);
9153 break;
9154 case AArch64MachineCombinerPattern::MLAv2i32_indexed_OP1:
9155 Opc = AArch64::MLAv2i32_indexed;
9156 RC = &AArch64::FPR64RegClass;
9157 MUL = genFusedMultiplyIdx(MF, MRI, TII, Root, InsInstrs, 1, Opc, RC);
9158 break;
9159 case AArch64MachineCombinerPattern::MLAv2i32_indexed_OP2:
9160 Opc = AArch64::MLAv2i32_indexed;
9161 RC = &AArch64::FPR64RegClass;
9162 MUL = genFusedMultiplyIdx(MF, MRI, TII, Root, InsInstrs, 2, Opc, RC);
9163 break;
9164 case AArch64MachineCombinerPattern::MLAv4i32_indexed_OP1:
9165 Opc = AArch64::MLAv4i32_indexed;
9166 RC = &AArch64::FPR128RegClass;
9167 MUL = genFusedMultiplyIdx(MF, MRI, TII, Root, InsInstrs, 1, Opc, RC);
9168 break;
9169 case AArch64MachineCombinerPattern::MLAv4i32_indexed_OP2:
9170 Opc = AArch64::MLAv4i32_indexed;
9171 RC = &AArch64::FPR128RegClass;
9172 MUL = genFusedMultiplyIdx(MF, MRI, TII, Root, InsInstrs, 2, Opc, RC);
9173 break;
9174
9175 case AArch64MachineCombinerPattern::MLSv4i16_indexed_OP1:
9176 Opc = AArch64::MLAv4i16_indexed;
9177 RC = &AArch64::FPR64RegClass;
9178 MUL = genFusedMultiplyIdxNeg(MF, MRI, TII, Root, InsInstrs,
9179 InstrIdxForVirtReg, 1, Opc, AArch64::NEGv4i16,
9180 RC);
9181 break;
9182 case AArch64MachineCombinerPattern::MLSv4i16_indexed_OP2:
9183 Opc = AArch64::MLSv4i16_indexed;
9184 RC = &AArch64::FPR64RegClass;
9185 MUL = genFusedMultiplyIdx(MF, MRI, TII, Root, InsInstrs, 2, Opc, RC);
9186 break;
9187 case AArch64MachineCombinerPattern::MLSv8i16_indexed_OP1:
9188 Opc = AArch64::MLAv8i16_indexed;
9189 RC = &AArch64::FPR128RegClass;
9190 MUL = genFusedMultiplyIdxNeg(MF, MRI, TII, Root, InsInstrs,
9191 InstrIdxForVirtReg, 1, Opc, AArch64::NEGv8i16,
9192 RC);
9193 break;
9194 case AArch64MachineCombinerPattern::MLSv8i16_indexed_OP2:
9195 Opc = AArch64::MLSv8i16_indexed;
9196 RC = &AArch64::FPR128RegClass;
9197 MUL = genFusedMultiplyIdx(MF, MRI, TII, Root, InsInstrs, 2, Opc, RC);
9198 break;
9199 case AArch64MachineCombinerPattern::MLSv2i32_indexed_OP1:
9200 Opc = AArch64::MLAv2i32_indexed;
9201 RC = &AArch64::FPR64RegClass;
9202 MUL = genFusedMultiplyIdxNeg(MF, MRI, TII, Root, InsInstrs,
9203 InstrIdxForVirtReg, 1, Opc, AArch64::NEGv2i32,
9204 RC);
9205 break;
9206 case AArch64MachineCombinerPattern::MLSv2i32_indexed_OP2:
9207 Opc = AArch64::MLSv2i32_indexed;
9208 RC = &AArch64::FPR64RegClass;
9209 MUL = genFusedMultiplyIdx(MF, MRI, TII, Root, InsInstrs, 2, Opc, RC);
9210 break;
9211 case AArch64MachineCombinerPattern::MLSv4i32_indexed_OP1:
9212 Opc = AArch64::MLAv4i32_indexed;
9213 RC = &AArch64::FPR128RegClass;
9214 MUL = genFusedMultiplyIdxNeg(MF, MRI, TII, Root, InsInstrs,
9215 InstrIdxForVirtReg, 1, Opc, AArch64::NEGv4i32,
9216 RC);
9217 break;
9218 case AArch64MachineCombinerPattern::MLSv4i32_indexed_OP2:
9219 Opc = AArch64::MLSv4i32_indexed;
9220 RC = &AArch64::FPR128RegClass;
9221 MUL = genFusedMultiplyIdx(MF, MRI, TII, Root, InsInstrs, 2, Opc, RC);
9222 break;
9223
9224 // Floating Point Support
9225 case AArch64MachineCombinerPattern::FMULADDH_OP1:
9226 Opc = AArch64::FMADDHrrr;
9227 RC = &AArch64::FPR16RegClass;
9228 MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 1, Opc, RC);
9229 break;
9230 case AArch64MachineCombinerPattern::FMULADDS_OP1:
9231 Opc = AArch64::FMADDSrrr;
9232 RC = &AArch64::FPR32RegClass;
9233 MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 1, Opc, RC);
9234 break;
9235 case AArch64MachineCombinerPattern::FMULADDD_OP1:
9236 Opc = AArch64::FMADDDrrr;
9237 RC = &AArch64::FPR64RegClass;
9238 MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 1, Opc, RC);
9239 break;
9240
9241 case AArch64MachineCombinerPattern::FMULADDH_OP2:
9242 Opc = AArch64::FMADDHrrr;
9243 RC = &AArch64::FPR16RegClass;
9244 MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 2, Opc, RC);
9245 break;
9246 case AArch64MachineCombinerPattern::FMULADDS_OP2:
9247 Opc = AArch64::FMADDSrrr;
9248 RC = &AArch64::FPR32RegClass;
9249 MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 2, Opc, RC);
9250 break;
9251 case AArch64MachineCombinerPattern::FMULADDD_OP2:
9252 Opc = AArch64::FMADDDrrr;
9253 RC = &AArch64::FPR64RegClass;
9254 MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 2, Opc, RC);
9255 break;
9256
9257 case AArch64MachineCombinerPattern::FMLAv1i32_indexed_OP1:
9258 Opc = AArch64::FMLAv1i32_indexed;
9259 RC = &AArch64::FPR32RegClass;
9260 MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 1, Opc, RC,
9261 FMAInstKind::Indexed);
9262 break;
9263 case AArch64MachineCombinerPattern::FMLAv1i32_indexed_OP2:
9264 Opc = AArch64::FMLAv1i32_indexed;
9265 RC = &AArch64::FPR32RegClass;
9266 MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 2, Opc, RC,
9267 FMAInstKind::Indexed);
9268 break;
9269
9270 case AArch64MachineCombinerPattern::FMLAv1i64_indexed_OP1:
9271 Opc = AArch64::FMLAv1i64_indexed;
9272 RC = &AArch64::FPR64RegClass;
9273 MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 1, Opc, RC,
9274 FMAInstKind::Indexed);
9275 break;
9276 case AArch64MachineCombinerPattern::FMLAv1i64_indexed_OP2:
9277 Opc = AArch64::FMLAv1i64_indexed;
9278 RC = &AArch64::FPR64RegClass;
9279 MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 2, Opc, RC,
9280 FMAInstKind::Indexed);
9281 break;
9282
9283 case AArch64MachineCombinerPattern::FMLAv4i16_indexed_OP1:
9284 RC = &AArch64::FPR64RegClass;
9285 Opc = AArch64::FMLAv4i16_indexed;
9286 MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 1, Opc, RC,
9287 FMAInstKind::Indexed);
9288 break;
9289 case AArch64MachineCombinerPattern::FMLAv4f16_OP1:
9290 RC = &AArch64::FPR64RegClass;
9291 Opc = AArch64::FMLAv4f16;
9292 MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 1, Opc, RC,
9293 FMAInstKind::Accumulator);
9294 break;
9295 case AArch64MachineCombinerPattern::FMLAv4i16_indexed_OP2:
9296 RC = &AArch64::FPR64RegClass;
9297 Opc = AArch64::FMLAv4i16_indexed;
9298 MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 2, Opc, RC,
9299 FMAInstKind::Indexed);
9300 break;
9301 case AArch64MachineCombinerPattern::FMLAv4f16_OP2:
9302 RC = &AArch64::FPR64RegClass;
9303 Opc = AArch64::FMLAv4f16;
9304 MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 2, Opc, RC,
9305 FMAInstKind::Accumulator);
9306 break;
9307
9308 case AArch64MachineCombinerPattern::FMLAv2i32_indexed_OP1:
9309 case AArch64MachineCombinerPattern::FMLAv2f32_OP1:
9310 RC = &AArch64::FPR64RegClass;
9311 if (Pattern == AArch64MachineCombinerPattern::FMLAv2i32_indexed_OP1) {
9312 Opc = AArch64::FMLAv2i32_indexed;
9313 MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 1, Opc, RC,
9314 FMAInstKind::Indexed);
9315 } else {
9316 Opc = AArch64::FMLAv2f32;
9317 MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 1, Opc, RC,
9318 FMAInstKind::Accumulator);
9319 }
9320 break;
9321 case AArch64MachineCombinerPattern::FMLAv2i32_indexed_OP2:
9322 case AArch64MachineCombinerPattern::FMLAv2f32_OP2:
9323 RC = &AArch64::FPR64RegClass;
9324 if (Pattern == AArch64MachineCombinerPattern::FMLAv2i32_indexed_OP2) {
9325 Opc = AArch64::FMLAv2i32_indexed;
9326 MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 2, Opc, RC,
9327 FMAInstKind::Indexed);
9328 } else {
9329 Opc = AArch64::FMLAv2f32;
9330 MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 2, Opc, RC,
9331 FMAInstKind::Accumulator);
9332 }
9333 break;
9334
9335 case AArch64MachineCombinerPattern::FMLAv8i16_indexed_OP1:
9336 RC = &AArch64::FPR128RegClass;
9337 Opc = AArch64::FMLAv8i16_indexed;
9338 MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 1, Opc, RC,
9339 FMAInstKind::Indexed);
9340 break;
9341 case AArch64MachineCombinerPattern::FMLAv8f16_OP1:
9342 RC = &AArch64::FPR128RegClass;
9343 Opc = AArch64::FMLAv8f16;
9344 MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 1, Opc, RC,
9345 FMAInstKind::Accumulator);
9346 break;
9347 case AArch64MachineCombinerPattern::FMLAv8i16_indexed_OP2:
9348 RC = &AArch64::FPR128RegClass;
9349 Opc = AArch64::FMLAv8i16_indexed;
9350 MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 2, Opc, RC,
9351 FMAInstKind::Indexed);
9352 break;
9353 case AArch64MachineCombinerPattern::FMLAv8f16_OP2:
9354 RC = &AArch64::FPR128RegClass;
9355 Opc = AArch64::FMLAv8f16;
9356 MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 2, Opc, RC,
9357 FMAInstKind::Accumulator);
9358 break;
9359
9360 case AArch64MachineCombinerPattern::FMLAv2i64_indexed_OP1:
9361 case AArch64MachineCombinerPattern::FMLAv2f64_OP1:
9362 RC = &AArch64::FPR128RegClass;
9363 if (Pattern == AArch64MachineCombinerPattern::FMLAv2i64_indexed_OP1) {
9364 Opc = AArch64::FMLAv2i64_indexed;
9365 MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 1, Opc, RC,
9366 FMAInstKind::Indexed);
9367 } else {
9368 Opc = AArch64::FMLAv2f64;
9369 MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 1, Opc, RC,
9370 FMAInstKind::Accumulator);
9371 }
9372 break;
9373 case AArch64MachineCombinerPattern::FMLAv2i64_indexed_OP2:
9374 case AArch64MachineCombinerPattern::FMLAv2f64_OP2:
9375 RC = &AArch64::FPR128RegClass;
9376 if (Pattern == AArch64MachineCombinerPattern::FMLAv2i64_indexed_OP2) {
9377 Opc = AArch64::FMLAv2i64_indexed;
9378 MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 2, Opc, RC,
9379 FMAInstKind::Indexed);
9380 } else {
9381 Opc = AArch64::FMLAv2f64;
9382 MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 2, Opc, RC,
9383 FMAInstKind::Accumulator);
9384 }
9385 break;
9386
9387 case AArch64MachineCombinerPattern::FMLAv4i32_indexed_OP1:
9388 case AArch64MachineCombinerPattern::FMLAv4f32_OP1:
9389 RC = &AArch64::FPR128RegClass;
9390 if (Pattern == AArch64MachineCombinerPattern::FMLAv4i32_indexed_OP1) {
9391 Opc = AArch64::FMLAv4i32_indexed;
9392 MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 1, Opc, RC,
9393 FMAInstKind::Indexed);
9394 } else {
9395 Opc = AArch64::FMLAv4f32;
9396 MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 1, Opc, RC,
9397 FMAInstKind::Accumulator);
9398 }
9399 break;
9400
9401 case AArch64MachineCombinerPattern::FMLAv4i32_indexed_OP2:
9402 case AArch64MachineCombinerPattern::FMLAv4f32_OP2:
9403 RC = &AArch64::FPR128RegClass;
9404 if (Pattern == AArch64MachineCombinerPattern::FMLAv4i32_indexed_OP2) {
9405 Opc = AArch64::FMLAv4i32_indexed;
9406 MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 2, Opc, RC,
9407 FMAInstKind::Indexed);
9408 } else {
9409 Opc = AArch64::FMLAv4f32;
9410 MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 2, Opc, RC,
9411 FMAInstKind::Accumulator);
9412 }
9413 break;
9414
9415 case AArch64MachineCombinerPattern::FMULSUBH_OP1:
9416 Opc = AArch64::FNMSUBHrrr;
9417 RC = &AArch64::FPR16RegClass;
9418 MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 1, Opc, RC);
9419 break;
9420 case AArch64MachineCombinerPattern::FMULSUBS_OP1:
9421 Opc = AArch64::FNMSUBSrrr;
9422 RC = &AArch64::FPR32RegClass;
9423 MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 1, Opc, RC);
9424 break;
9425 case AArch64MachineCombinerPattern::FMULSUBD_OP1:
9426 Opc = AArch64::FNMSUBDrrr;
9427 RC = &AArch64::FPR64RegClass;
9428 MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 1, Opc, RC);
9429 break;
9430
9431 case AArch64MachineCombinerPattern::FNMULSUBH_OP1:
9432 Opc = AArch64::FNMADDHrrr;
9433 RC = &AArch64::FPR16RegClass;
9434 MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 1, Opc, RC);
9435 break;
9436 case AArch64MachineCombinerPattern::FNMULSUBS_OP1:
9437 Opc = AArch64::FNMADDSrrr;
9438 RC = &AArch64::FPR32RegClass;
9439 MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 1, Opc, RC);
9440 break;
9441 case AArch64MachineCombinerPattern::FNMULSUBD_OP1:
9442 Opc = AArch64::FNMADDDrrr;
9443 RC = &AArch64::FPR64RegClass;
9444 MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 1, Opc, RC);
9445 break;
9446
9447 case AArch64MachineCombinerPattern::FMULSUBH_OP2:
9448 Opc = AArch64::FMSUBHrrr;
9449 RC = &AArch64::FPR16RegClass;
9450 MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 2, Opc, RC);
9451 break;
9452 case AArch64MachineCombinerPattern::FMULSUBS_OP2:
9453 Opc = AArch64::FMSUBSrrr;
9454 RC = &AArch64::FPR32RegClass;
9455 MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 2, Opc, RC);
9456 break;
9457 case AArch64MachineCombinerPattern::FMULSUBD_OP2:
9458 Opc = AArch64::FMSUBDrrr;
9459 RC = &AArch64::FPR64RegClass;
9460 MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 2, Opc, RC);
9461 break;
9462
9463 case AArch64MachineCombinerPattern::FMLSv1i32_indexed_OP2:
9464 Opc = AArch64::FMLSv1i32_indexed;
9465 RC = &AArch64::FPR32RegClass;
9466 MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 2, Opc, RC,
9467 FMAInstKind::Indexed);
9468 break;
9469
9470 case AArch64MachineCombinerPattern::FMLSv1i64_indexed_OP2:
9471 Opc = AArch64::FMLSv1i64_indexed;
9472 RC = &AArch64::FPR64RegClass;
9473 MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 2, Opc, RC,
9474 FMAInstKind::Indexed);
9475 break;
9476
9477 case AArch64MachineCombinerPattern::FMLSv4f16_OP1:
9478 case AArch64MachineCombinerPattern::FMLSv4i16_indexed_OP1: {
9479 RC = &AArch64::FPR64RegClass;
9480 Register NewVR = MRI.createVirtualRegister(RC);
9481 MachineInstrBuilder MIB1 =
9482 BuildMI(MF, MIMetadata(Root), TII->get(AArch64::FNEGv4f16), NewVR)
9483 .add(Root.getOperand(2));
9484 InsInstrs.push_back(MIB1);
9485 InstrIdxForVirtReg.insert(std::make_pair(NewVR, 0));
9486 if (Pattern == AArch64MachineCombinerPattern::FMLSv4f16_OP1) {
9487 Opc = AArch64::FMLAv4f16;
9488 MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 1, Opc, RC,
9489 FMAInstKind::Accumulator, &NewVR);
9490 } else {
9491 Opc = AArch64::FMLAv4i16_indexed;
9492 MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 1, Opc, RC,
9493 FMAInstKind::Indexed, &NewVR);
9494 }
9495 break;
9496 }
9497 case AArch64MachineCombinerPattern::FMLSv4f16_OP2:
9498 RC = &AArch64::FPR64RegClass;
9499 Opc = AArch64::FMLSv4f16;
9500 MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 2, Opc, RC,
9501 FMAInstKind::Accumulator);
9502 break;
9503 case AArch64MachineCombinerPattern::FMLSv4i16_indexed_OP2:
9504 RC = &AArch64::FPR64RegClass;
9505 Opc = AArch64::FMLSv4i16_indexed;
9506 MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 2, Opc, RC,
9507 FMAInstKind::Indexed);
9508 break;
9509
9510 case AArch64MachineCombinerPattern::FMLSv2f32_OP2:
9511 case AArch64MachineCombinerPattern::FMLSv2i32_indexed_OP2:
9512 RC = &AArch64::FPR64RegClass;
9513 if (Pattern == AArch64MachineCombinerPattern::FMLSv2i32_indexed_OP2) {
9514 Opc = AArch64::FMLSv2i32_indexed;
9515 MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 2, Opc, RC,
9516 FMAInstKind::Indexed);
9517 } else {
9518 Opc = AArch64::FMLSv2f32;
9519 MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 2, Opc, RC,
9520 FMAInstKind::Accumulator);
9521 }
9522 break;
9523
9524 case AArch64MachineCombinerPattern::FMLSv8f16_OP1:
9525 case AArch64MachineCombinerPattern::FMLSv8i16_indexed_OP1: {
9526 RC = &AArch64::FPR128RegClass;
9527 Register NewVR = MRI.createVirtualRegister(RC);
9528 MachineInstrBuilder MIB1 =
9529 BuildMI(MF, MIMetadata(Root), TII->get(AArch64::FNEGv8f16), NewVR)
9530 .add(Root.getOperand(2));
9531 InsInstrs.push_back(MIB1);
9532 InstrIdxForVirtReg.insert(std::make_pair(NewVR, 0));
9533 if (Pattern == AArch64MachineCombinerPattern::FMLSv8f16_OP1) {
9534 Opc = AArch64::FMLAv8f16;
9535 MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 1, Opc, RC,
9536 FMAInstKind::Accumulator, &NewVR);
9537 } else {
9538 Opc = AArch64::FMLAv8i16_indexed;
9539 MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 1, Opc, RC,
9540 FMAInstKind::Indexed, &NewVR);
9541 }
9542 break;
9543 }
9544 case AArch64MachineCombinerPattern::FMLSv8f16_OP2:
9545 RC = &AArch64::FPR128RegClass;
9546 Opc = AArch64::FMLSv8f16;
9547 MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 2, Opc, RC,
9548 FMAInstKind::Accumulator);
9549 break;
9550 case AArch64MachineCombinerPattern::FMLSv8i16_indexed_OP2:
9551 RC = &AArch64::FPR128RegClass;
9552 Opc = AArch64::FMLSv8i16_indexed;
9553 MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 2, Opc, RC,
9554 FMAInstKind::Indexed);
9555 break;
9556
9557 case AArch64MachineCombinerPattern::FMLSv2f64_OP2:
9558 case AArch64MachineCombinerPattern::FMLSv2i64_indexed_OP2:
9559 RC = &AArch64::FPR128RegClass;
9560 if (Pattern == AArch64MachineCombinerPattern::FMLSv2i64_indexed_OP2) {
9561 Opc = AArch64::FMLSv2i64_indexed;
9562 MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 2, Opc, RC,
9563 FMAInstKind::Indexed);
9564 } else {
9565 Opc = AArch64::FMLSv2f64;
9566 MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 2, Opc, RC,
9567 FMAInstKind::Accumulator);
9568 }
9569 break;
9570
9571 case AArch64MachineCombinerPattern::FMLSv4f32_OP2:
9572 case AArch64MachineCombinerPattern::FMLSv4i32_indexed_OP2:
9573 RC = &AArch64::FPR128RegClass;
9574 if (Pattern == AArch64MachineCombinerPattern::FMLSv4i32_indexed_OP2) {
9575 Opc = AArch64::FMLSv4i32_indexed;
9576 MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 2, Opc, RC,
9577 FMAInstKind::Indexed);
9578 } else {
9579 Opc = AArch64::FMLSv4f32;
9580 MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 2, Opc, RC,
9581 FMAInstKind::Accumulator);
9582 }
9583 break;
9584 case AArch64MachineCombinerPattern::FMLSv2f32_OP1:
9585 case AArch64MachineCombinerPattern::FMLSv2i32_indexed_OP1: {
9586 RC = &AArch64::FPR64RegClass;
9587 Register NewVR = MRI.createVirtualRegister(RC);
9588 MachineInstrBuilder MIB1 =
9589 BuildMI(MF, MIMetadata(Root), TII->get(AArch64::FNEGv2f32), NewVR)
9590 .add(Root.getOperand(2));
9591 InsInstrs.push_back(MIB1);
9592 InstrIdxForVirtReg.insert(std::make_pair(NewVR, 0));
9593 if (Pattern == AArch64MachineCombinerPattern::FMLSv2i32_indexed_OP1) {
9594 Opc = AArch64::FMLAv2i32_indexed;
9595 MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 1, Opc, RC,
9596 FMAInstKind::Indexed, &NewVR);
9597 } else {
9598 Opc = AArch64::FMLAv2f32;
9599 MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 1, Opc, RC,
9600 FMAInstKind::Accumulator, &NewVR);
9601 }
9602 break;
9603 }
9604 case AArch64MachineCombinerPattern::FMLSv4f32_OP1:
9605 case AArch64MachineCombinerPattern::FMLSv4i32_indexed_OP1: {
9606 RC = &AArch64::FPR128RegClass;
9607 Register NewVR = MRI.createVirtualRegister(RC);
9608 MachineInstrBuilder MIB1 =
9609 BuildMI(MF, MIMetadata(Root), TII->get(AArch64::FNEGv4f32), NewVR)
9610 .add(Root.getOperand(2));
9611 InsInstrs.push_back(MIB1);
9612 InstrIdxForVirtReg.insert(std::make_pair(NewVR, 0));
9613 if (Pattern == AArch64MachineCombinerPattern::FMLSv4i32_indexed_OP1) {
9614 Opc = AArch64::FMLAv4i32_indexed;
9615 MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 1, Opc, RC,
9616 FMAInstKind::Indexed, &NewVR);
9617 } else {
9618 Opc = AArch64::FMLAv4f32;
9619 MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 1, Opc, RC,
9620 FMAInstKind::Accumulator, &NewVR);
9621 }
9622 break;
9623 }
9624 case AArch64MachineCombinerPattern::FMLSv2f64_OP1:
9625 case AArch64MachineCombinerPattern::FMLSv2i64_indexed_OP1: {
9626 RC = &AArch64::FPR128RegClass;
9627 Register NewVR = MRI.createVirtualRegister(RC);
9628 MachineInstrBuilder MIB1 =
9629 BuildMI(MF, MIMetadata(Root), TII->get(AArch64::FNEGv2f64), NewVR)
9630 .add(Root.getOperand(2));
9631 InsInstrs.push_back(MIB1);
9632 InstrIdxForVirtReg.insert(std::make_pair(NewVR, 0));
9633 if (Pattern == AArch64MachineCombinerPattern::FMLSv2i64_indexed_OP1) {
9634 Opc = AArch64::FMLAv2i64_indexed;
9635 MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 1, Opc, RC,
9636 FMAInstKind::Indexed, &NewVR);
9637 } else {
9638 Opc = AArch64::FMLAv2f64;
9639 MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 1, Opc, RC,
9640 FMAInstKind::Accumulator, &NewVR);
9641 }
9642 break;
9643 }
9644 case AArch64MachineCombinerPattern::FMULv2i32_indexed_OP1:
9645 case AArch64MachineCombinerPattern::FMULv2i32_indexed_OP2: {
9646 unsigned IdxDupOp =
9647 Pattern == AArch64MachineCombinerPattern::FMULv2i32_indexed_OP1 ? 1
9648 : 2;
9649 genIndexedMultiply(Root, InsInstrs, IdxDupOp, AArch64::FMULv2i32_indexed,
9650 &AArch64::FPR128RegClass, MRI);
9651 break;
9652 }
9653 case AArch64MachineCombinerPattern::FMULv2i64_indexed_OP1:
9654 case AArch64MachineCombinerPattern::FMULv2i64_indexed_OP2: {
9655 unsigned IdxDupOp =
9656 Pattern == AArch64MachineCombinerPattern::FMULv2i64_indexed_OP1 ? 1
9657 : 2;
9658 genIndexedMultiply(Root, InsInstrs, IdxDupOp, AArch64::FMULv2i64_indexed,
9659 &AArch64::FPR128RegClass, MRI);
9660 break;
9661 }
9662 case AArch64MachineCombinerPattern::FMULv4i16_indexed_OP1:
9663 case AArch64MachineCombinerPattern::FMULv4i16_indexed_OP2: {
9664 unsigned IdxDupOp =
9665 Pattern == AArch64MachineCombinerPattern::FMULv4i16_indexed_OP1 ? 1
9666 : 2;
9667 genIndexedMultiply(Root, InsInstrs, IdxDupOp, AArch64::FMULv4i16_indexed,
9668 &AArch64::FPR128_loRegClass, MRI);
9669 break;
9670 }
9671 case AArch64MachineCombinerPattern::FMULv4i32_indexed_OP1:
9672 case AArch64MachineCombinerPattern::FMULv4i32_indexed_OP2: {
9673 unsigned IdxDupOp =
9674 Pattern == AArch64MachineCombinerPattern::FMULv4i32_indexed_OP1 ? 1
9675 : 2;
9676 genIndexedMultiply(Root, InsInstrs, IdxDupOp, AArch64::FMULv4i32_indexed,
9677 &AArch64::FPR128RegClass, MRI);
9678 break;
9679 }
9680 case AArch64MachineCombinerPattern::FMULv8i16_indexed_OP1:
9681 case AArch64MachineCombinerPattern::FMULv8i16_indexed_OP2: {
9682 unsigned IdxDupOp =
9683 Pattern == AArch64MachineCombinerPattern::FMULv8i16_indexed_OP1 ? 1
9684 : 2;
9685 genIndexedMultiply(Root, InsInstrs, IdxDupOp, AArch64::FMULv8i16_indexed,
9686 &AArch64::FPR128_loRegClass, MRI);
9687 break;
9688 }
9689 case AArch64MachineCombinerPattern::FNMADD: {
9690 MUL = genFNegatedMAD(MF, MRI, TII, Root, InsInstrs);
9691 break;
9692 }
9693 case AArch64MachineCombinerPattern::GATHER_LANE_i32: {
9694 generateGatherLanePattern(Root, InsInstrs, DelInstrs, InstrIdxForVirtReg,
9695 Pattern, 4);
9696 break;
9697 }
9698 case AArch64MachineCombinerPattern::GATHER_LANE_i16: {
9699 generateGatherLanePattern(Root, InsInstrs, DelInstrs, InstrIdxForVirtReg,
9700 Pattern, 8);
9701 break;
9702 }
9703 case AArch64MachineCombinerPattern::GATHER_LANE_i8: {
9704 generateGatherLanePattern(Root, InsInstrs, DelInstrs, InstrIdxForVirtReg,
9705 Pattern, 16);
9706 break;
9707 }
9708
9709 } // end switch (Pattern)
9710 // Record MUL and ADD/SUB for deletion
9711 if (MUL)
9712 DelInstrs.push_back(MUL);
9713 DelInstrs.push_back(&Root);
9714
9715 // Set the flags on the inserted instructions to be the merged flags of the
9716 // instructions that we have combined.
9717 uint32_t Flags = Root.getFlags();
9718 if (MUL)
9719 Flags = Root.mergeFlagsWith(*MUL);
9720 for (auto *MI : InsInstrs)
9721 MI->setFlags(Flags);
9722}
9723
9724 /// Replace csinc-branch sequence by simple conditional branch
9725///
9726/// Examples:
9727/// 1. \code
9728/// csinc w9, wzr, wzr, <condition code>
9729/// tbnz w9, #0, 0x44
9730/// \endcode
9731/// to
9732/// \code
9733/// b.<inverted condition code>
9734/// \endcode
9735///
9736/// 2. \code
9737/// csinc w9, wzr, wzr, <condition code>
9738/// tbz w9, #0, 0x44
9739/// \endcode
9740/// to
9741/// \code
9742/// b.<condition code>
9743/// \endcode
9744///
9745/// Replace compare and branch sequence by TBZ/TBNZ instruction when the
9746/// compare's constant operand is power of 2.
9747///
9748/// Examples:
9749/// \code
9750/// and w8, w8, #0x400
9751/// cbnz w8, L1
9752/// \endcode
9753/// to
9754/// \code
9755/// tbnz w8, #10, L1
9756/// \endcode
9757///
9758/// \param MI Conditional Branch
9759/// \return True when the simple conditional branch is generated
9760///
9761 bool AArch64InstrInfo::optimizeCondBranch(MachineInstr &MI) const {
9762 bool IsNegativeBranch = false;
9763 bool IsTestAndBranch = false;
9764 unsigned TargetBBInMI = 0;
9765 switch (MI.getOpcode()) {
9766 default:
9767 llvm_unreachable("Unknown branch instruction?");
9768 case AArch64::Bcc:
9769 case AArch64::CBWPri:
9770 case AArch64::CBXPri:
9771 case AArch64::CBBAssertExt:
9772 case AArch64::CBHAssertExt:
9773 case AArch64::CBWPrr:
9774 case AArch64::CBXPrr:
9775 return false;
9776 case AArch64::CBZW:
9777 case AArch64::CBZX:
9778 TargetBBInMI = 1;
9779 break;
9780 case AArch64::CBNZW:
9781 case AArch64::CBNZX:
9782 TargetBBInMI = 1;
9783 IsNegativeBranch = true;
9784 break;
9785 case AArch64::TBZW:
9786 case AArch64::TBZX:
9787 TargetBBInMI = 2;
9788 IsTestAndBranch = true;
9789 break;
9790 case AArch64::TBNZW:
9791 case AArch64::TBNZX:
9792 TargetBBInMI = 2;
9793 IsNegativeBranch = true;
9794 IsTestAndBranch = true;
9795 break;
9796 }
9797 // So we increment a zero register and test for bits other
9798 // than bit 0? Conservatively bail out in case the verifier
9799 // missed this case.
9800 if (IsTestAndBranch && MI.getOperand(1).getImm())
9801 return false;
9802
9803 // Find Definition.
9804 assert(MI.getParent() && "Incomplete machine instruction\n");
9805 MachineBasicBlock *MBB = MI.getParent();
9806 MachineFunction *MF = MBB->getParent();
9807 MachineRegisterInfo *MRI = &MF->getRegInfo();
9808 Register VReg = MI.getOperand(0).getReg();
9809 if (!VReg.isVirtual())
9810 return false;
9811
9812 MachineInstr *DefMI = MRI->getVRegDef(VReg);
9813
9814 // Look through COPY instructions to find definition.
9815 while (DefMI->isCopy()) {
9816 Register CopyVReg = DefMI->getOperand(1).getReg();
9817 if (!MRI->hasOneNonDBGUse(CopyVReg))
9818 return false;
9819 if (!MRI->hasOneDef(CopyVReg))
9820 return false;
9821 DefMI = MRI->getVRegDef(CopyVReg);
9822 }
9823
9824 switch (DefMI->getOpcode()) {
9825 default:
9826 return false;
9827 // Fold AND into a TBZ/TBNZ if constant operand is power of 2.
9828 case AArch64::ANDWri:
9829 case AArch64::ANDXri: {
9830 if (IsTestAndBranch)
9831 return false;
9832 if (DefMI->getParent() != MBB)
9833 return false;
9834 if (!MRI->hasOneNonDBGUse(VReg))
9835 return false;
9836
9837 bool Is32Bit = (DefMI->getOpcode() == AArch64::ANDWri);
9838 uint64_t Mask = AArch64_AM::decodeLogicalImmediate(
9839 DefMI->getOperand(2).getImm(), Is32Bit ? 32 : 64);
9840 if (!isPowerOf2_64(Mask))
9841 return false;
9842
9843 MachineOperand &MO = DefMI->getOperand(1);
9844 Register NewReg = MO.getReg();
9845 if (!NewReg.isVirtual())
9846 return false;
9847
9848 assert(!MRI->def_empty(NewReg) && "Register must be defined.");
9849
9850 MachineBasicBlock &RefToMBB = *MBB;
9851 MachineBasicBlock *TBB = MI.getOperand(1).getMBB();
9852 DebugLoc DL = MI.getDebugLoc();
9853 unsigned Imm = Log2_64(Mask);
9854 unsigned Opc = (Imm < 32)
9855 ? (IsNegativeBranch ? AArch64::TBNZW : AArch64::TBZW)
9856 : (IsNegativeBranch ? AArch64::TBNZX : AArch64::TBZX);
9857 MachineInstr *NewMI = BuildMI(RefToMBB, MI, DL, get(Opc))
9858 .addReg(NewReg)
9859 .addImm(Imm)
9860 .addMBB(TBB);
9861 // Register lives on to the TB(N)Z now.
9862 MO.setIsKill(false);
9863
9864 // For immediates smaller than 32, we need to use the 32-bit
9865 // variant (W) in all cases, because the 64-bit variant (X) cannot
9866 // encode them.
9867 // Therefore, if the input register is 64-bit, we need to take the
9868 // 32-bit sub-part.
9869 if (!Is32Bit && Imm < 32)
9870 NewMI->getOperand(0).setSubReg(AArch64::sub_32);
9871 MI.eraseFromParent();
9872 return true;
9873 }
9874 // Look for CSINC
9875 case AArch64::CSINCWr:
9876 case AArch64::CSINCXr: {
9877 if (!(DefMI->getOperand(1).getReg() == AArch64::WZR &&
9878 DefMI->getOperand(2).getReg() == AArch64::WZR) &&
9879 !(DefMI->getOperand(1).getReg() == AArch64::XZR &&
9880 DefMI->getOperand(2).getReg() == AArch64::XZR))
9881 return false;
9882
9883 if (DefMI->findRegisterDefOperandIdx(AArch64::NZCV, /*TRI=*/nullptr,
9884 true) != -1)
9885 return false;
9886
9887 AArch64CC::CondCode CC = (AArch64CC::CondCode)DefMI->getOperand(3).getImm();
9888 // Convert only when the condition code is not modified between
9889 // the CSINC and the branch. The CC may be used by other
9890 // instructions in between.
9891 if (areCFlagsAccessedBetweenInstrs(DefMI, MI, &getRegisterInfo()))
9892 return false;
9893 MachineBasicBlock &RefToMBB = *MBB;
9894 MachineBasicBlock *TBB = MI.getOperand(TargetBBInMI).getMBB();
9895 DebugLoc DL = MI.getDebugLoc();
9896 if (IsNegativeBranch)
9897 CC = AArch64CC::getInvertedCondCode(CC);
9898 BuildMI(RefToMBB, MI, DL, get(AArch64::Bcc)).addImm(CC).addMBB(TBB);
9899 MI.eraseFromParent();
9900 return true;
9901 }
9902 }
9903}
9904
9905std::pair<unsigned, unsigned>
9906AArch64InstrInfo::decomposeMachineOperandsTargetFlags(unsigned TF) const {
9907 const unsigned Mask = AArch64II::MO_FRAGMENT;
9908 return std::make_pair(TF & Mask, TF & ~Mask);
9909}
9910
9911 ArrayRef<std::pair<unsigned, const char *>>
9912 AArch64InstrInfo::getSerializableDirectMachineOperandTargetFlags() const {
9913 using namespace AArch64II;
9914
9915 static const std::pair<unsigned, const char *> TargetFlags[] = {
9916 {MO_PAGE, "aarch64-page"}, {MO_PAGEOFF, "aarch64-pageoff"},
9917 {MO_G3, "aarch64-g3"}, {MO_G2, "aarch64-g2"},
9918 {MO_G1, "aarch64-g1"}, {MO_G0, "aarch64-g0"},
9919 {MO_HI12, "aarch64-hi12"}};
9920 return ArrayRef(TargetFlags);
9921}
9922
9923 ArrayRef<std::pair<unsigned, const char *>>
9924 AArch64InstrInfo::getSerializableBitmaskMachineOperandTargetFlags() const {
9925 using namespace AArch64II;
9926
9927 static const std::pair<unsigned, const char *> TargetFlags[] = {
9928 {MO_COFFSTUB, "aarch64-coffstub"},
9929 {MO_GOT, "aarch64-got"},
9930 {MO_NC, "aarch64-nc"},
9931 {MO_S, "aarch64-s"},
9932 {MO_TLS, "aarch64-tls"},
9933 {MO_DLLIMPORT, "aarch64-dllimport"},
9934 {MO_PREL, "aarch64-prel"},
9935 {MO_TAGGED, "aarch64-tagged"},
9936 {MO_ARM64EC_CALLMANGLE, "aarch64-arm64ec-callmangle"},
9937 };
9938 return ArrayRef(TargetFlags);
9939}
9940
9941 ArrayRef<std::pair<MachineMemOperand::Flags, const char *>>
9942 AArch64InstrInfo::getSerializableMachineMemOperandTargetFlags() const {
9943 static const std::pair<MachineMemOperand::Flags, const char *> TargetFlags[] =
9944 {{MOSuppressPair, "aarch64-suppress-pair"},
9945 {MOStridedAccess, "aarch64-strided-access"}};
9946 return ArrayRef(TargetFlags);
9947}
9948
9949/// Constants defining how certain sequences should be outlined.
9950/// This encompasses how an outlined function should be called, and what kind of
9951/// frame should be emitted for that outlined function.
9952///
9953/// \p MachineOutlinerDefault implies that the function should be called with
9954/// a save and restore of LR to the stack.
9955///
9956/// That is,
9957///
9958/// I1 Save LR OUTLINED_FUNCTION:
9959/// I2 --> BL OUTLINED_FUNCTION I1
9960/// I3 Restore LR I2
9961/// I3
9962/// RET
9963///
9964/// * Call construction overhead: 3 (save + BL + restore)
9965/// * Frame construction overhead: 1 (ret)
9966/// * Requires stack fixups? Yes
9967///
9968/// \p MachineOutlinerTailCall implies that the function is being created from
9969/// a sequence of instructions ending in a return.
9970///
9971/// That is,
9972///
9973/// I1 OUTLINED_FUNCTION:
9974/// I2 --> B OUTLINED_FUNCTION I1
9975/// RET I2
9976/// RET
9977///
9978/// * Call construction overhead: 1 (B)
9979/// * Frame construction overhead: 0 (Return included in sequence)
9980/// * Requires stack fixups? No
9981///
9982/// \p MachineOutlinerNoLRSave implies that the function should be called using
9983/// a BL instruction, but doesn't require LR to be saved and restored. This
9984/// happens when LR is known to be dead.
9985///
9986/// That is,
9987///
9988/// I1 OUTLINED_FUNCTION:
9989/// I2 --> BL OUTLINED_FUNCTION I1
9990/// I3 I2
9991/// I3
9992/// RET
9993///
9994/// * Call construction overhead: 1 (BL)
9995/// * Frame construction overhead: 1 (RET)
9996/// * Requires stack fixups? No
9997///
9998/// \p MachineOutlinerThunk implies that the function is being created from
9999/// a sequence of instructions ending in a call. The outlined function is
10000/// called with a BL instruction, and the outlined function tail-calls the
10001/// original call destination.
10002///
10003/// That is,
10004///
10005/// I1 OUTLINED_FUNCTION:
10006/// I2 --> BL OUTLINED_FUNCTION I1
10007/// BL f I2
10008/// B f
10009/// * Call construction overhead: 1 (BL)
10010/// * Frame construction overhead: 0
10011/// * Requires stack fixups? No
10012///
10013/// \p MachineOutlinerRegSave implies that the function should be called with a
10014/// save and restore of LR to an available register. This allows us to avoid
10015/// stack fixups. Note that this outlining variant is compatible with the
10016/// NoLRSave case.
10017///
10018/// That is,
10019///
10020/// I1 Save LR OUTLINED_FUNCTION:
10021/// I2 --> BL OUTLINED_FUNCTION I1
10022/// I3 Restore LR I2
10023/// I3
10024/// RET
10025///
10026/// * Call construction overhead: 3 (save + BL + restore)
10027/// * Frame construction overhead: 1 (ret)
10028/// * Requires stack fixups? No
10029 enum MachineOutlinerClass {
10030 MachineOutlinerDefault, /// Emit a save, restore, call, and return.
10031 MachineOutlinerTailCall, /// Only emit a branch.
10032 MachineOutlinerNoLRSave, /// Emit a call and return.
10033 MachineOutlinerThunk, /// Emit a call and tail-call.
10034 MachineOutlinerRegSave /// Same as default, but save to a register.
10035};
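// An illustrative cost comparison (numbers follow the overheads quoted
// above): outlining a 3-instruction sequence that occurs twice with
// MachineOutlinerDefault costs 2 calls x 3 + 3 outlined + 1 ret = 10
// instructions versus the original 6, so it is not profitable; with
// MachineOutlinerTailCall it costs 2 x 1 + 3 + 0 = 5, which is a win.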
10036
10037 enum MachineOutlinerMBBFlags {
10038 LRUnavailableSomewhere = 0x2,
10039 HasCalls = 0x4,
10040 UnsafeRegsDead = 0x8
10041 };
10042
10043 Register
10044 AArch64InstrInfo::findRegisterToSaveLRTo(outliner::Candidate &C) const {
10044AArch64InstrInfo::findRegisterToSaveLRTo(outliner::Candidate &C) const {
10045 MachineFunction *MF = C.getMF();
10046 const TargetRegisterInfo &TRI = *MF->getSubtarget().getRegisterInfo();
10047 const AArch64RegisterInfo *ARI =
10048 static_cast<const AArch64RegisterInfo *>(&TRI);
10049 // Check if there is an available register across the sequence that we can
10050 // use.
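 // For example, x20 typically qualifies when it is not reserved, not live
 // across the candidate, and unused inside it; LR, x16, and x17 are always
 // excluded by the checks below.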
10051 for (unsigned Reg : AArch64::GPR64RegClass) {
10052 if (!ARI->isReservedReg(*MF, Reg) &&
10053 Reg != AArch64::LR && // LR is not reserved, but don't use it.
10054 Reg != AArch64::X16 && // X16 is not guaranteed to be preserved.
10055 Reg != AArch64::X17 && // Ditto for X17.
10056 C.isAvailableAcrossAndOutOfSeq(Reg, TRI) &&
10057 C.isAvailableInsideSeq(Reg, TRI))
10058 return Reg;
10059 }
10060 return Register();
10061}
10062
10063static bool
10064 outliningCandidatesSigningScopeConsensus(const outliner::Candidate &a,
10065 const outliner::Candidate &b) {
10066 const auto &MFIa = a.getMF()->getInfo<AArch64FunctionInfo>();
10067 const auto &MFIb = b.getMF()->getInfo<AArch64FunctionInfo>();
10068
10069 return MFIa->getSignReturnAddressCondition() ==
10070 MFIb->getSignReturnAddressCondition();
10071}
10072
10073static bool
10074 outliningCandidatesSigningKeyConsensus(const outliner::Candidate &a,
10075 const outliner::Candidate &b) {
10076 const auto &MFIa = a.getMF()->getInfo<AArch64FunctionInfo>();
10077 const auto &MFIb = b.getMF()->getInfo<AArch64FunctionInfo>();
10078
10079 return MFIa->shouldSignWithBKey() == MFIb->shouldSignWithBKey();
10080}
10081
10082 static bool outliningCandidatesV8_3OpsConsensus(const outliner::Candidate &a,
10083 const outliner::Candidate &b) {
10084 const AArch64Subtarget &SubtargetA =
10085 a.getMF()->getSubtarget<AArch64Subtarget>();
10086 const AArch64Subtarget &SubtargetB =
10087 b.getMF()->getSubtarget<AArch64Subtarget>();
10088 return SubtargetA.hasV8_3aOps() == SubtargetB.hasV8_3aOps();
10089}
10090
10091std::optional<std::unique_ptr<outliner::OutlinedFunction>>
10092AArch64InstrInfo::getOutliningCandidateInfo(
10093 const MachineModuleInfo &MMI,
10094 std::vector<outliner::Candidate> &RepeatedSequenceLocs,
10095 unsigned MinRepeats) const {
10096 unsigned SequenceSize = 0;
10097 for (auto &MI : RepeatedSequenceLocs[0])
10098 SequenceSize += getInstSizeInBytes(MI);
10099
10100 unsigned NumBytesToCreateFrame = 0;
10101
10102 // Avoid splitting an ADRP + ADD/LDR pair into separate outlined functions;
10103 // these instructions are fused together by the scheduler.
10104 // Any candidate where ADRP is the last instruction should be rejected,
10105 // as outlining it would split the pair.
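 // For example (illustrative), a GOT access such as
 //   adrp x8, :got:sym
 //   ldr  x8, [x8, :got_lo12:sym]
 // must stay together, so candidates that would separate the two halves are
 // rejected below.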
10106 MachineInstr &LastMI = RepeatedSequenceLocs[0].back();
10107 MachineInstr &FirstMI = RepeatedSequenceLocs[0].front();
10108 if (LastMI.getOpcode() == AArch64::ADRP &&
10109 (LastMI.getOperand(1).getTargetFlags() & AArch64II::MO_PAGE) != 0 &&
10110 (LastMI.getOperand(1).getTargetFlags() & AArch64II::MO_GOT) != 0) {
10111 return std::nullopt;
10112 }
10113
10114 // Similarly any candidate where the first instruction is ADD/LDR with a
10115 // page offset should be rejected to avoid ADRP splitting.
10116 if ((FirstMI.getOpcode() == AArch64::ADDXri ||
10117 FirstMI.getOpcode() == AArch64::LDRXui) &&
10118 (FirstMI.getOperand(2).getTargetFlags() & AArch64II::MO_PAGEOFF) != 0 &&
10119 (FirstMI.getOperand(2).getTargetFlags() & AArch64II::MO_GOT) != 0) {
10120 return std::nullopt;
10121 }
10122
10123 // We only allow outlining for functions having exactly matching return
10124 // address signing attributes, i.e., all share the same value for the
10125 // attribute "sign-return-address" and all share the same type of key they
10126 // are signed with.
10127 // Additionally we require all functions to simultaneously either support
10128 // v8.3a features or not. Otherwise an outlined function could get signed
10129 // using dedicated v8.3 instructions and a call from a function that doesn't
10130 // support v8.3 instructions would therefore be invalid.
10131 if (std::adjacent_find(
10132 RepeatedSequenceLocs.begin(), RepeatedSequenceLocs.end(),
10133 [](const outliner::Candidate &a, const outliner::Candidate &b) {
10134 // Return true if a and b are non-equal w.r.t. return address
10135 // signing or support of v8.3a features
10136 if (outliningCandidatesSigningScopeConsensus(a, b) &&
10137 outliningCandidatesSigningKeyConsensus(a, b) &&
10138 outliningCandidatesV8_3OpsConsensus(a, b)) {
10139 return false;
10140 }
10141 return true;
10142 }) != RepeatedSequenceLocs.end()) {
10143 return std::nullopt;
10144 }
10145
10146 // Since at this point all candidates agree on their return address signing
10147 // picking just one is fine. If the candidate functions potentially sign their
10148 // return addresses, the outlined function should do the same. Note that in
10149 // the case of "sign-return-address"="non-leaf" this is an assumption: it is
10150 // not certain that the outlined function will have to sign its return
10151 // address, but that decision is made later, once the decision to outline
10152 // has already been made.
10153 // The same holds for the number of additional instructions we need: On
10154 // v8.3a RET can be replaced by RETAA/RETAB and no AUT instruction is
10155 // necessary. However, at this point we don't know if the outlined function
10156 // will have a RET instruction so we assume the worst.
10157 const TargetRegisterInfo &TRI = getRegisterInfo();
10158 // Performing a tail call may require extra checks when PAuth is enabled.
10159 // If PAuth is disabled, set it to zero for uniformity.
10160 unsigned NumBytesToCheckLRInTCEpilogue = 0;
10161 const auto RASignCondition = RepeatedSequenceLocs[0]
10162 .getMF()
10163 ->getInfo<AArch64FunctionInfo>()
10164 ->getSignReturnAddressCondition();
10165 if (RASignCondition != SignReturnAddress::None) {
10166 // One PAC and one AUT instructions
10167 NumBytesToCreateFrame += 8;
10168
10169 // PAuth is enabled - set extra tail call cost, if any.
10170 auto LRCheckMethod = Subtarget.getAuthenticatedLRCheckMethod(
10171 *RepeatedSequenceLocs[0].getMF());
10172 NumBytesToCheckLRInTCEpilogue =
10173 AArch64PAuth::getCheckerSizeInBytes(LRCheckMethod);
10174 // Checking the authenticated LR value may significantly impact
10175 // SequenceSize, so account for it for more precise results.
10176 if (isTailCallReturnInst(RepeatedSequenceLocs[0].back()))
10177 SequenceSize += NumBytesToCheckLRInTCEpilogue;
10178
10179 // We have to check if sp-modifying instructions would get outlined.
10180 // If so, we only allow outlining if sp is unchanged overall: matching
10181 // sub and add instructions are okay to outline, but all other sp
10182 // modifications are not.
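 // For example (illustrative): a candidate containing the balanced pair
 //   sub sp, sp, #16 ... add sp, sp, #16
 // nets out to zero and is acceptable, while an unmatched sp increment or
 // any other write to sp causes the candidate to be removed.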
10183 auto hasIllegalSPModification = [&TRI](outliner::Candidate &C) {
10184 int SPValue = 0;
10185 for (auto &MI : C) {
10186 if (MI.modifiesRegister(AArch64::SP, &TRI)) {
10187 switch (MI.getOpcode()) {
10188 case AArch64::ADDXri:
10189 case AArch64::ADDWri:
10190 assert(MI.getNumOperands() == 4 && "Wrong number of operands");
10191 assert(MI.getOperand(2).isImm() &&
10192 "Expected operand to be immediate");
10193 assert(MI.getOperand(1).isReg() &&
10194 "Expected operand to be a register");
10195 // Check if the add just increments sp. If so, we search for
10196 // matching sub instructions that decrement sp. If not, the
10197 // modification is illegal
10198 if (MI.getOperand(1).getReg() == AArch64::SP)
10199 SPValue += MI.getOperand(2).getImm();
10200 else
10201 return true;
10202 break;
10203 case AArch64::SUBXri:
10204 case AArch64::SUBWri:
10205 assert(MI.getNumOperands() == 4 && "Wrong number of operands");
10206 assert(MI.getOperand(2).isImm() &&
10207 "Expected operand to be immediate");
10208 assert(MI.getOperand(1).isReg() &&
10209 "Expected operand to be a register");
10210 // Check if the sub just decrements sp. If so, we search for
10211 // matching add instructions that increment sp. If not, the
10212 // modification is illegal
10213 if (MI.getOperand(1).getReg() == AArch64::SP)
10214 SPValue -= MI.getOperand(2).getImm();
10215 else
10216 return true;
10217 break;
10218 default:
10219 return true;
10220 }
10221 }
10222 }
10223 if (SPValue)
10224 return true;
10225 return false;
10226 };
10227 // Remove candidates with illegal stack modifying instructions
10228 llvm::erase_if(RepeatedSequenceLocs, hasIllegalSPModification);
10229
10230 // If the sequence doesn't have enough candidates left, then we're done.
10231 if (RepeatedSequenceLocs.size() < MinRepeats)
10232 return std::nullopt;
10233 }
10234
10235 // Properties about candidate MBBs that hold for all of them.
10236 unsigned FlagsSetInAll = 0xF;
10237
10238 // Compute liveness information for each candidate, and set FlagsSetInAll.
10239 for (outliner::Candidate &C : RepeatedSequenceLocs)
10240 FlagsSetInAll &= C.Flags;
10241
10242 unsigned LastInstrOpcode = RepeatedSequenceLocs[0].back().getOpcode();
10243
10244 // Helper lambda which sets call information for every candidate.
10245 auto SetCandidateCallInfo =
10246 [&RepeatedSequenceLocs](unsigned CallID, unsigned NumBytesForCall) {
10247 for (outliner::Candidate &C : RepeatedSequenceLocs)
10248 C.setCallInfo(CallID, NumBytesForCall);
10249 };
10250
10251 unsigned FrameID = MachineOutlinerDefault;
10252 NumBytesToCreateFrame += 4;
10253
10254 bool HasBTI = any_of(RepeatedSequenceLocs, [](outliner::Candidate &C) {
10255 return C.getMF()->getInfo<AArch64FunctionInfo>()->branchTargetEnforcement();
10256 });
10257
10258 // We check to see if CFI Instructions are present, and if they are
10259 // we find the number of CFI Instructions in the candidates.
10260 unsigned CFICount = 0;
10261 for (auto &I : RepeatedSequenceLocs[0]) {
10262 if (I.isCFIInstruction())
10263 CFICount++;
10264 }
10265
10266 // We compare the number of found CFI Instructions to the number of CFI
10267 // instructions in the parent function for each candidate. We must check this
10268 // since if we outline one of the CFI instructions in a function, we have to
10269 // outline them all for correctness. If we do not, the address offsets will be
10270 // incorrect between the two sections of the program.
10271 for (outliner::Candidate &C : RepeatedSequenceLocs) {
10272 std::vector<MCCFIInstruction> CFIInstructions =
10273 C.getMF()->getFrameInstructions();
10274
10275 if (CFICount > 0 && CFICount != CFIInstructions.size())
10276 return std::nullopt;
10277 }
10278
10279 // Returns true if an instruction is safe to fix up, false otherwise.
10280 auto IsSafeToFixup = [this, &TRI](MachineInstr &MI) {
10281 if (MI.isCall())
10282 return true;
10283
10284 if (!MI.modifiesRegister(AArch64::SP, &TRI) &&
10285 !MI.readsRegister(AArch64::SP, &TRI))
10286 return true;
10287
10288 // Any modification of SP will break our code to save/restore LR.
10289 // FIXME: We could handle some instructions which add a constant
10290 // offset to SP, with a bit more work.
10291 if (MI.modifiesRegister(AArch64::SP, &TRI))
10292 return false;
10293
10294 // At this point, we have a stack instruction that we might need to
10295 // fix up. We'll handle it if it's a load or store.
10296 if (MI.mayLoadOrStore()) {
10297 const MachineOperand *Base; // Filled with the base operand of MI.
10298 int64_t Offset; // Filled with the offset of MI.
10299 bool OffsetIsScalable;
10300
10301 // Does it allow us to offset the base operand and is the base the
10302 // register SP?
10303 if (!getMemOperandWithOffset(MI, Base, Offset, OffsetIsScalable, &TRI) ||
10304 !Base->isReg() || Base->getReg() != AArch64::SP)
10305 return false;
10306
10307 // Fix-up code below assumes byte offsets.
10308 if (OffsetIsScalable)
10309 return false;
10310
10311 // Find the minimum/maximum offset for this instruction and check
10312 // if fixing it up would be in range.
10313 int64_t MinOffset,
10314 MaxOffset; // Unscaled offsets for the instruction.
10315 // The scale to multiply the offsets by.
10316 TypeSize Scale(0U, false), DummyWidth(0U, false);
10317 getMemOpInfo(MI.getOpcode(), Scale, DummyWidth, MinOffset, MaxOffset);
10318
10319 Offset += 16; // Update the offset to what it would be if we outlined.
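 // For example (illustrative): "ldr x0, [sp, #8]" inside the candidate must
 // become "ldr x0, [sp, #24]" in the outlined function, because the outlined
 // frame saves LR and moves sp by 16 bytes.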
10320 if (Offset < MinOffset * (int64_t)Scale.getFixedValue() ||
10321 Offset > MaxOffset * (int64_t)Scale.getFixedValue())
10322 return false;
10323
10324 // It's in range, so we can outline it.
10325 return true;
10326 }
10327
10328 // FIXME: Add handling for instructions like "add x0, sp, #8".
10329
10330 // We can't fix it up, so don't outline it.
10331 return false;
10332 };
10333
10334 // True if it's possible to fix up each stack instruction in this sequence.
10335 // Important for frames/call variants that modify the stack.
10336 bool AllStackInstrsSafe =
10337 llvm::all_of(RepeatedSequenceLocs[0], IsSafeToFixup);
10338
10339 // If the last instruction in any candidate is a terminator, then we should
10340 // tail call all of the candidates.
10341 if (RepeatedSequenceLocs[0].back().isTerminator()) {
10342 FrameID = MachineOutlinerTailCall;
10343 NumBytesToCreateFrame = 0;
10344 unsigned NumBytesForCall = 4 + NumBytesToCheckLRInTCEpilogue;
10345 SetCandidateCallInfo(MachineOutlinerTailCall, NumBytesForCall);
10346 }
10347
10348 else if (LastInstrOpcode == AArch64::BL ||
10349 ((LastInstrOpcode == AArch64::BLR ||
10350 LastInstrOpcode == AArch64::BLRNoIP) &&
10351 !HasBTI)) {
10352 // FIXME: Do we need to check if the code after this uses the value of LR?
10353 FrameID = MachineOutlinerThunk;
10354 NumBytesToCreateFrame = NumBytesToCheckLRInTCEpilogue;
10355 SetCandidateCallInfo(MachineOutlinerThunk, 4);
10356 }
10357
10358 else {
10359 // We need to decide how to emit calls + frames. We can always emit the same
10360 // frame if we don't need to save to the stack. If we have to save to the
10361 // stack, then we need a different frame.
10362 unsigned NumBytesNoStackCalls = 0;
10363 std::vector<outliner::Candidate> CandidatesWithoutStackFixups;
10364
10365 // Check if we have to save LR.
10366 for (outliner::Candidate &C : RepeatedSequenceLocs) {
10367 bool LRAvailable =
10368 (C.Flags & MachineOutlinerMBBFlags::LRUnavailableSomewhere)
10369 ? C.isAvailableAcrossAndOutOfSeq(AArch64::LR, TRI)
10370 : true;
10371 // If we have a noreturn caller, then we're going to be conservative and
10372 // say that we have to save LR. If we don't have a ret at the end of the
10373 // block, then we can't reason about liveness accurately.
10374 //
10375 // FIXME: We can probably do better than always disabling this in
10376 // noreturn functions by fixing up the liveness info.
10377 bool IsNoReturn =
10378 C.getMF()->getFunction().hasFnAttribute(Attribute::NoReturn);
10379
10380 // Is LR available? If so, we don't need a save.
10381 if (LRAvailable && !IsNoReturn) {
10382 NumBytesNoStackCalls += 4;
10383 C.setCallInfo(MachineOutlinerNoLRSave, 4);
10384 CandidatesWithoutStackFixups.push_back(C);
10385 }
10386
10387 // Is an unused register available? If so, we won't modify the stack, so
10388 // we can outline with the same frame type as those that don't save LR.
10389 else if (findRegisterToSaveLRTo(C)) {
10390 NumBytesNoStackCalls += 12;
10391 C.setCallInfo(MachineOutlinerRegSave, 12);
10392 CandidatesWithoutStackFixups.push_back(C);
10393 }
10394
10395 // Is SP used in the sequence at all? If not, we don't have to modify
10396 // the stack, so we are guaranteed to get the same frame.
10397 else if (C.isAvailableInsideSeq(AArch64::SP, TRI)) {
10398 NumBytesNoStackCalls += 12;
10399 C.setCallInfo(MachineOutlinerDefault, 12);
10400 CandidatesWithoutStackFixups.push_back(C);
10401 }
10402
10403 // If we outline this, we need to modify the stack. Pretend we don't
10404 // outline this by saving all of its bytes.
10405 else {
10406 NumBytesNoStackCalls += SequenceSize;
10407 }
10408 }
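 // For example (an illustrative lowering): with LR live across the candidate
 // but x20 free, the call becomes
 //   mov x20, lr ; bl OUTLINED_FUNCTION ; mov lr, x20
 // which is the 12-byte MachineOutlinerRegSave cost accounted above.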
10409
10410 // If there are no places where we have to save LR, then note that we
10411 // don't have to update the stack. Otherwise, give every candidate the
10412 // default call type, as long as it's safe to do so.
10413 if (!AllStackInstrsSafe ||
10414 NumBytesNoStackCalls <= RepeatedSequenceLocs.size() * 12) {
10415 RepeatedSequenceLocs = CandidatesWithoutStackFixups;
10416 FrameID = MachineOutlinerNoLRSave;
10417 if (RepeatedSequenceLocs.size() < MinRepeats)
10418 return std::nullopt;
10419 } else {
10420 SetCandidateCallInfo(MachineOutlinerDefault, 12);
10421
10422 // Bugzilla ID: 46767
10423 // TODO: Check if fixing up the stack more than once is safe so we can
10424 // outline these.
10425 //
10426 // An outline resulting in a caller that requires stack fixups at the
10427 // callsite to a callee that also requires stack fixups can happen when
10428 // there are no available registers at the candidate callsite for a
10429 // candidate that itself also has calls.
10430 //
10431 // In other words, if function_containing_sequence in the following pseudo-
10432 // assembly requires that we save LR at the point of the call, but there
10433 // are no available registers, we save LR to the stack, and as a result
10434 // the SP offsets require fixing up by multiples of 16.
10435 //
10436 // function_containing_sequence:
10437 // ...
10438 // save LR to SP <- Requires stack instr fixups in OUTLINED_FUNCTION_N
10439 // call OUTLINED_FUNCTION_N
10440 // restore LR from SP
10441 // ...
10442 //
10443 // OUTLINED_FUNCTION_N:
10444 // save LR to SP <- Requires stack instr fixups in OUTLINED_FUNCTION_N
10445 // ...
10446 // bl foo
10447 // restore LR from SP
10448 // ret
10449 //
10450 // Because the code to handle more than one stack fixup does not
10451 // currently have the proper checks for legality, these cases will assert
10452 // in the AArch64 MachineOutliner: the code needs more hardening, testing,
10453 // and better checks that the generated code is legal, and it is only
10454 // verified to handle a single pass of stack fixup.
10456 //
10457 // The assert happens in AArch64InstrInfo::buildOutlinedFrame to catch
10458 // these cases until they are known to be handled. Bugzilla 46767 is
10459 // referenced in comments at the assert site.
10460 //
10461 // To avoid asserting (or generating illegal code on no-assert builds),
10462 // we remove all candidates which would need more than one stack fixup by
10463 // pruning the cases where the candidate has calls while also having no
10464 // available LR and no available general-purpose registers to copy
10465 // LR to (i.e. one extra stack save/restore).
10466 //
10467 if (FlagsSetInAll & MachineOutlinerMBBFlags::HasCalls) {
10468 erase_if(RepeatedSequenceLocs, [this, &TRI](outliner::Candidate &C) {
10469 auto IsCall = [](const MachineInstr &MI) { return MI.isCall(); };
10470 return (llvm::any_of(C, IsCall)) &&
10471 (!C.isAvailableAcrossAndOutOfSeq(AArch64::LR, TRI) ||
10472 !findRegisterToSaveLRTo(C));
10473 });
10474 }
10475 }
10476
10477 // If we dropped all of the candidates, bail out here.
10478 if (RepeatedSequenceLocs.size() < MinRepeats)
10479 return std::nullopt;
10480 }
10481
10482 // Does every candidate's MBB contain a call? If so, then we might have a call
10483 // in the range.
10484 if (FlagsSetInAll & MachineOutlinerMBBFlags::HasCalls) {
10485 // Check if the range contains a call. These require a save + restore of the
10486 // link register.
10487 outliner::Candidate &FirstCand = RepeatedSequenceLocs[0];
10488 bool ModStackToSaveLR = false;
10489 if (any_of(drop_end(FirstCand),
10490 [](const MachineInstr &MI) { return MI.isCall(); }))
10491 ModStackToSaveLR = true;
10492
10493 // Handle the last instruction separately. If this is a tail call, then the
10494 // last instruction is a call. We don't want to save + restore in this case.
10495 // However, it could be possible that the last instruction is a call without
10496 // it being valid to tail call this sequence. We should consider this as
10497 // well.
10498 else if (FrameID != MachineOutlinerThunk &&
10499 FrameID != MachineOutlinerTailCall && FirstCand.back().isCall())
10500 ModStackToSaveLR = true;
10501
10502 if (ModStackToSaveLR) {
10503 // We can't fix up the stack. Bail out.
10504 if (!AllStackInstrsSafe)
10505 return std::nullopt;
10506
10507 // Save + restore LR.
10508 NumBytesToCreateFrame += 8;
10509 }
10510 }
10511
10512 // If we have CFI instructions, we can only outline if the outlined section
10513 // can be a tail call
10514 if (FrameID != MachineOutlinerTailCall && CFICount > 0)
10515 return std::nullopt;
10516
10517 return std::make_unique<outliner::OutlinedFunction>(
10518 RepeatedSequenceLocs, SequenceSize, NumBytesToCreateFrame, FrameID);
10519}
10520
10521void AArch64InstrInfo::mergeOutliningCandidateAttributes(
10522 Function &F, std::vector<outliner::Candidate> &Candidates) const {
10523 // If multiple candidates reach this point, they must agree on their return
10524 // address signing. It is therefore enough to just consider the signing
10525 // behaviour of one of them.
10526 const auto &CFn = Candidates.front().getMF()->getFunction();
10527
10528 if (CFn.hasFnAttribute("ptrauth-returns"))
10529 F.addFnAttr(CFn.getFnAttribute("ptrauth-returns"));
10530 if (CFn.hasFnAttribute("ptrauth-auth-traps"))
10531 F.addFnAttr(CFn.getFnAttribute("ptrauth-auth-traps"));
10532 // Since all candidates belong to the same module, just copy the
10533 // function-level attributes of an arbitrary function.
10534 if (CFn.hasFnAttribute("sign-return-address"))
10535 F.addFnAttr(CFn.getFnAttribute("sign-return-address"));
10536 if (CFn.hasFnAttribute("sign-return-address-key"))
10537 F.addFnAttr(CFn.getFnAttribute("sign-return-address-key"));
10538
10539 AArch64GenInstrInfo::mergeOutliningCandidateAttributes(F, Candidates);
10540}
10541
10542bool AArch64InstrInfo::isFunctionSafeToOutlineFrom(
10543 MachineFunction &MF, bool OutlineFromLinkOnceODRs) const {
10544 const Function &F = MF.getFunction();
10545
10546 // Can F be deduplicated by the linker? If it can, don't outline from it.
10547 if (!OutlineFromLinkOnceODRs && F.hasLinkOnceODRLinkage())
10548 return false;
10549
10550 // Don't outline from functions with section markings; the program could
10551 // expect that all the code is in the named section.
10552 // FIXME: Allow outlining from multiple functions with the same section
10553 // marking.
10554 if (F.hasSection())
10555 return false;
10556
10557 // Outlining from functions with redzones is unsafe since the outliner may
10558 // modify the stack. Check if hasRedZone is true or unknown; if yes, don't
10559 // outline from it.
10560 AArch64FunctionInfo *AFI = MF.getInfo<AArch64FunctionInfo>();
10561 if (!AFI || AFI->hasRedZone().value_or(true))
10562 return false;
10563
10564 // FIXME: Determine whether it is safe to outline from functions which contain
10565 // streaming-mode changes. We may need to ensure any smstart/smstop pairs are
10566 // outlined together and ensure it is safe to outline with async unwind info,
10567 // required for saving & restoring VG around calls.
10568 if (AFI->hasStreamingModeChanges())
10569 return false;
10570
10571 // FIXME: Teach the outliner to generate/handle Windows unwind info.
10572 if (MF.getTarget().getMCAsmInfo()->usesWindowsCFI())
10573 return false;
10574
10575 // It's safe to outline from MF.
10576 return true;
10577}
10578
10579SmallVector<std::pair<MachineBasicBlock::iterator, MachineBasicBlock::iterator>>
10580AArch64InstrInfo::getOutlinableRanges(MachineBasicBlock &MBB,
10581 unsigned &Flags) const {
10583 "Must track liveness!");
10584 SmallVector<
10585 std::pair<MachineBasicBlock::iterator, MachineBasicBlock::iterator>>
10586 Ranges;
10587 // According to the AArch64 Procedure Call Standard, the following are
10588 // undefined on entry/exit from a function call:
10589 //
10590 // * Registers x16, x17, (and thus w16, w17)
10591 // * Condition codes (and thus the NZCV register)
10592 //
10593 // If any of these registers are used inside or live across an outlined
10594 // function, then they may be modified later, either by the compiler or
10595 // some other tool (like the linker).
10596 //
10597 // To avoid outlining in these situations, partition each block into ranges
10598 // where these registers are dead. We will only outline from those ranges.
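// An illustrative partition (example only, not from the source):
//   mov w16, #1          <- w16 live: unsafe, not outlinable
//   add w0, w0, w1       \
//   eor w0, w0, w2        > x16/x17/NZCV all dead: outlinable range
//   str w0, [x19]        /
//   subs w2, w2, #1      <- NZCV live from here to the branch below
//   b.ne loop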
10599 LiveRegUnits LRU(getRegisterInfo());
10600 auto AreAllUnsafeRegsDead = [&LRU]() {
10601 return LRU.available(AArch64::W16) && LRU.available(AArch64::W17) &&
10602 LRU.available(AArch64::NZCV);
10603 };
10604
10605 // We need to know if LR is live across an outlining boundary later on in
10606 // order to decide how we'll create the outlined call, frame, etc.
10607 //
10608 // It's pretty expensive to check this for *every candidate* within a block.
10609 // That's some potentially n^2 behaviour, since in the worst case, we'd need
10610 // to compute liveness from the end of the block for O(n) candidates within
10611 // the block.
10612 //
10613 // So, to improve the average case, let's keep track of liveness from the end
10614 // of the block to the beginning of *every outlinable range*. If we know that
10615 // LR is available in every range we could outline from, then we know that
10616 // we don't need to check liveness for any candidate within that range.
10617 bool LRAvailableEverywhere = true;
10618 // Compute liveness bottom-up.
10619 LRU.addLiveOuts(MBB);
10620 // Update flags that require info about the entire MBB.
10621 auto UpdateWholeMBBFlags = [&Flags](const MachineInstr &MI) {
10622 if (MI.isCall() && !MI.isTerminator())
10623 Flags |= MachineOutlinerMBBFlags::HasCalls;
10624 };
10625 // Range: [RangeBegin, RangeEnd)
10626 MachineBasicBlock::instr_iterator RangeBegin, RangeEnd;
10627 unsigned RangeLen;
10628 auto CreateNewRangeStartingAt =
10629 [&RangeBegin, &RangeEnd,
10630 &RangeLen](MachineBasicBlock::instr_iterator NewBegin) {
10631 RangeBegin = NewBegin;
10632 RangeEnd = std::next(RangeBegin);
10633 RangeLen = 0;
10634 };
10635 auto SaveRangeIfNonEmpty = [&RangeLen, &Ranges, &RangeBegin, &RangeEnd]() {
10636 // At least one unsafe register is not dead. We do not want to outline at
10637 // this point. If it is long enough to outline from and does not cross a
10638 // bundle boundary, save the range [RangeBegin, RangeEnd).
10639 if (RangeLen <= 1)
10640 return;
10641 if (!RangeBegin.isEnd() && RangeBegin->isBundledWithPred())
10642 return;
10643 if (!RangeEnd.isEnd() && RangeEnd->isBundledWithPred())
10644 return;
10645 Ranges.emplace_back(RangeBegin, RangeEnd);
10646 };
10647 // Find the first point where all unsafe registers are dead.
10648 // FIND: <safe instr> <-- end of first potential range
10649 // SKIP: <unsafe def>
10650 // SKIP: ... everything between ...
10651 // SKIP: <unsafe use>
10652 auto FirstPossibleEndPt = MBB.instr_rbegin();
10653 for (; FirstPossibleEndPt != MBB.instr_rend(); ++FirstPossibleEndPt) {
10654 if (!FirstPossibleEndPt->isDebugInstr())
10655 LRU.stepBackward(*FirstPossibleEndPt);
10656 // Update flags that impact how we outline across the entire block,
10657 // regardless of safety.
10658 UpdateWholeMBBFlags(*FirstPossibleEndPt);
10659 if (AreAllUnsafeRegsDead())
10660 break;
10661 }
10662 // If we exhausted the entire block, we have no safe ranges to outline.
10663 if (FirstPossibleEndPt == MBB.instr_rend())
10664 return Ranges;
10665 // Current range.
10666 CreateNewRangeStartingAt(FirstPossibleEndPt->getIterator());
10667 // StartPt points to the first place where all unsafe registers
10668 // are dead (if there is any such point). Begin partitioning the MBB into
10669 // ranges.
10670 for (auto &MI : make_range(FirstPossibleEndPt, MBB.instr_rend())) {
10671 if (!MI.isDebugInstr())
10672 LRU.stepBackward(MI);
10673 UpdateWholeMBBFlags(MI);
10674 if (!AreAllUnsafeRegsDead()) {
10675 SaveRangeIfNonEmpty();
10676 CreateNewRangeStartingAt(MI.getIterator());
10677 continue;
10678 }
10679 LRAvailableEverywhere &= LRU.available(AArch64::LR);
10680 RangeBegin = MI.getIterator();
10681 ++RangeLen;
10682 }
10683 // Above loop misses the last (or only) range. If we are still safe, then
10684 // let's save the range.
10685 if (AreAllUnsafeRegsDead())
10686 SaveRangeIfNonEmpty();
10687 if (Ranges.empty())
10688 return Ranges;
10689 // We found the ranges bottom-up. Mapping expects the top-down. Reverse
10690 // the order.
10691 std::reverse(Ranges.begin(), Ranges.end());
10692 // If there is at least one outlinable range where LR is unavailable
10693 // somewhere, remember that.
10694 if (!LRAvailableEverywhere)
10695 Flags |= MachineOutlinerMBBFlags::LRUnavailableSomewhere;
10696 return Ranges;
10697}
10698
10699outliner::InstrType
10700AArch64InstrInfo::getOutliningTypeImpl(const MachineModuleInfo &MMI,
10701 MachineBasicBlock::iterator &MIT,
10702 unsigned Flags) const {
10703 MachineInstr &MI = *MIT;
10704
10705 // Don't outline anything used for return address signing. The outlined
10706 // function will get signed later if needed.
10707 switch (MI.getOpcode()) {
10708 case AArch64::PACM:
10709 case AArch64::PACIASP:
10710 case AArch64::PACIBSP:
10711 case AArch64::PACIASPPC:
10712 case AArch64::PACIBSPPC:
10713 case AArch64::AUTIASP:
10714 case AArch64::AUTIBSP:
10715 case AArch64::AUTIASPPCi:
10716 case AArch64::AUTIASPPCr:
10717 case AArch64::AUTIBSPPCi:
10718 case AArch64::AUTIBSPPCr:
10719 case AArch64::RETAA:
10720 case AArch64::RETAB:
10721 case AArch64::RETAASPPCi:
10722 case AArch64::RETAASPPCr:
10723 case AArch64::RETABSPPCi:
10724 case AArch64::RETABSPPCr:
10725 case AArch64::EMITBKEY:
10726 case AArch64::PAUTH_PROLOGUE:
10727 case AArch64::PAUTH_EPILOGUE:
10728 return outliner::InstrType::Illegal;
10729 }
10730
10731 // We can only outline these if we will tail call the outlined function, or
10732 // fix up the CFI offsets. Currently, CFI instructions are outlined only if
10733 // in a tail call.
10734 //
10735 // FIXME: If the proper fixups for the offset are implemented, this should be
10736 // possible.
10737 if (MI.isCFIInstruction())
10738 return outliner::InstrType::Legal;
10739
10740 // Is this a terminator for a basic block?
10741 if (MI.isTerminator())
10742 // TargetInstrInfo::getOutliningType has already filtered out anything
10743 // that would break this, so we can allow it here.
10744 return outliner::InstrType::Legal;
10745
10746 // Make sure none of the operands are un-outlinable.
10747 for (const MachineOperand &MOP : MI.operands()) {
10748 // A check preventing CFI indices was here before, but only CFI
10749 // instructions should have those.
10750 assert(!MOP.isCFIIndex());
10751
10752 // If it uses LR or W30 explicitly, then don't touch it.
10753 if (MOP.isReg() && !MOP.isImplicit() &&
10754 (MOP.getReg() == AArch64::LR || MOP.getReg() == AArch64::W30))
10755 return outliner::InstrType::Illegal;
10756 }
10757
10758 // Special cases for instructions that can always be outlined, but will fail
10759 // the later tests, e.g., ADRPs, which are PC-relative and use LR, but can
10760 // always be outlined because they don't require a *specific* value to be in LR.
10761 if (MI.getOpcode() == AArch64::ADRP)
10762 return outliner::InstrType::Legal;
10763
10764 // If MI is a call we might be able to outline it. We don't want to outline
10765 // any calls that rely on the position of items on the stack. When we outline
10766 // something containing a call, we have to emit a save and restore of LR in
10767 // the outlined function. Currently, this always happens by saving LR to the
10768 // stack. Thus, if we outline, say, half the parameters for a function call
10769 // plus the call, then we'll break the callee's expectations for the layout
10770 // of the stack.
10771 //
10772 // FIXME: Allow calls to functions which construct a stack frame, as long
10773 // as they don't access arguments on the stack.
10774 // FIXME: Figure out some way to analyze functions defined in other modules.
10775 // We should be able to compute the memory usage based on the IR calling
10776 // convention, even if we can't see the definition.
10777 if (MI.isCall()) {
10778 // Get the function associated with the call. Look at each operand and find
10779 // the one that represents the callee and get its name.
10780 const Function *Callee = nullptr;
10781 for (const MachineOperand &MOP : MI.operands()) {
10782 if (MOP.isGlobal()) {
10783 Callee = dyn_cast<Function>(MOP.getGlobal());
10784 break;
10785 }
10786 }
10787
10788 // Never outline calls to mcount. There isn't any rule that would require
10789 // this, but the Linux kernel's "ftrace" feature depends on it.
10790 if (Callee && Callee->getName() == "\01_mcount")
10791 return outliner::InstrType::Illegal;
10792
10793 // If we don't know anything about the callee, assume it depends on the
10794 // stack layout of the caller. In that case, it's only legal to outline
10795 // as a tail-call. Explicitly list the call instructions we know about so we
10796 // don't get unexpected results with call pseudo-instructions.
10797 auto UnknownCallOutlineType = outliner::InstrType::Illegal;
10798 if (MI.getOpcode() == AArch64::BLR ||
10799 MI.getOpcode() == AArch64::BLRNoIP || MI.getOpcode() == AArch64::BL)
10800 UnknownCallOutlineType = outliner::InstrType::LegalTerminator;
10801
10802 if (!Callee)
10803 return UnknownCallOutlineType;
10804
10805 // We have a function we have information about. Check if it's something we
10806 // can safely outline.
10807 MachineFunction *CalleeMF = MMI.getMachineFunction(*Callee);
10808
10809 // We don't know what's going on with the callee at all. Don't touch it.
10810 if (!CalleeMF)
10811 return UnknownCallOutlineType;
10812
10813 // Check if we know anything about the callee saves on the function. If we
10814 // don't, then don't touch it, since that implies that we haven't
10815 // computed anything about its stack frame yet.
10816 MachineFrameInfo &MFI = CalleeMF->getFrameInfo();
10817 if (!MFI.isCalleeSavedInfoValid() || MFI.getStackSize() > 0 ||
10818 MFI.getNumObjects() > 0)
10819 return UnknownCallOutlineType;
10820
10821 // At this point, we can say that CalleeMF ought to not pass anything on the
10822 // stack. Therefore, we can outline it.
10823 return outliner::InstrType::Legal;
10824 }
10825
10826 // Don't touch the link register or W30.
10827 if (MI.readsRegister(AArch64::W30, &getRegisterInfo()) ||
10828 MI.modifiesRegister(AArch64::W30, &getRegisterInfo()))
10829 return outliner::InstrType::Illegal;
10830
10831 // Don't outline BTI instructions, because that will prevent the outlining
10832 // site from being indirectly callable.
10833 if (hasBTISemantics(MI))
10834 return outliner::InstrType::Illegal;
10835
10836 return outliner::InstrType::Legal;
10837}
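// An illustrative hazard (sketch, not from the source): outlining
//   str x0, [sp]   ; argument passed to the callee on the stack
//   bl callee
// would be unsound unless the callee provably takes nothing on the stack,
// because the LR save emitted in the outlined function ("str x30,
// [sp, #-16]!") shifts the SP-relative offset the callee expects.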
10838
10839void AArch64InstrInfo::fixupPostOutline(MachineBasicBlock &MBB) const {
10840 for (MachineInstr &MI : MBB) {
10841 const MachineOperand *Base;
10842 TypeSize Width(0, false);
10843 int64_t Offset;
10844 bool OffsetIsScalable;
10845
10846 // Is this a load or store with an immediate offset with SP as the base?
10847 if (!MI.mayLoadOrStore() ||
10848 !getMemOperandWithOffsetWidth(MI, Base, Offset, OffsetIsScalable, Width,
10849 &RI) ||
10850 (Base->isReg() && Base->getReg() != AArch64::SP))
10851 continue;
10852
10853 // It is, so we have to fix it up.
10854 TypeSize Scale(0U, false);
10855 int64_t Dummy1, Dummy2;
10856
10857 MachineOperand &StackOffsetOperand = getMemOpBaseRegImmOfsOffsetOperand(MI);
10858 assert(StackOffsetOperand.isImm() && "Stack offset wasn't immediate!");
10859 getMemOpInfo(MI.getOpcode(), Scale, Width, Dummy1, Dummy2);
10860 assert(Scale != 0 && "Unexpected opcode!");
10861 assert(!OffsetIsScalable && "Expected offset to be a byte offset");
10862
10863 // We've pushed the return address to the stack, so add 16 to the offset.
10864 // This is safe, since we already checked if it would overflow when we
10865 // checked if this instruction was legal to outline.
10866 int64_t NewImm = (Offset + 16) / (int64_t)Scale.getFixedValue();
10867 StackOffsetOperand.setImm(NewImm);
10868 }
10869}
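// Worked example (illustrative): LDRXui has Scale == 8, so an access like
//   ldr x0, [sp, #8]    ; immediate operand 1
// in the outlined body is rewritten to
//   ldr x0, [sp, #24]   ; immediate operand (8 + 16) / 8 == 3
// to account for the 16-byte LR save below the original SP.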
10870
10871static void signOutlinedFunction(MachineFunction &MF, MachineBasicBlock &MBB,
10872 const AArch64InstrInfo *TII,
10873 bool ShouldSignReturnAddr) {
10874 if (!ShouldSignReturnAddr)
10875 return;
10876
10877 BuildMI(MBB, MBB.begin(), DebugLoc(), TII->get(AArch64::PAUTH_PROLOGUE))
10878 .setMIFlag(MachineInstr::FrameSetup);
10879 TII->createPauthEpilogueInstr(MBB, DebugLoc());
10880}
10881
10882void AArch64InstrInfo::buildOutlinedFrame(
10883 MachineBasicBlock &MBB, MachineFunction &MF,
10884 const outliner::OutlinedFunction &OF) const {
10885
10886 AArch64FunctionInfo *FI = MF.getInfo<AArch64FunctionInfo>();
10887
10888 if (OF.FrameConstructionID == MachineOutlinerTailCall)
10889 FI->setOutliningStyle("Tail Call");
10890 else if (OF.FrameConstructionID == MachineOutlinerThunk) {
10891 // For thunk outlining, rewrite the last instruction from a call to a
10892 // tail-call.
10893 MachineInstr *Call = &*--MBB.instr_end();
10894 unsigned TailOpcode;
10895 if (Call->getOpcode() == AArch64::BL) {
10896 TailOpcode = AArch64::TCRETURNdi;
10897 } else {
10898 assert(Call->getOpcode() == AArch64::BLR ||
10899 Call->getOpcode() == AArch64::BLRNoIP);
10900 TailOpcode = AArch64::TCRETURNriALL;
10901 }
10902 MachineInstr *TC = BuildMI(MF, DebugLoc(), get(TailOpcode))
10903 .add(Call->getOperand(0))
10904 .addImm(0);
10905 MBB.insert(MBB.end(), TC);
10906 Call->eraseFromParent();
10907
10908 FI->setOutliningStyle("Thunk");
10909 }
10910
10911 bool IsLeafFunction = true;
10912
10913 // Is there a call in the outlined range?
10914 auto IsNonTailCall = [](const MachineInstr &MI) {
10915 return MI.isCall() && !MI.isReturn();
10916 };
10917
10918 if (llvm::any_of(MBB.instrs(), IsNonTailCall)) {
10919 // Fix up the instructions in the range, since we're going to modify the
10920 // stack.
10921
10922 // Bugzilla ID: 46767
10923 // TODO: Check if fixing up twice is safe so we can outline these.
10924 assert(OF.FrameConstructionID != MachineOutlinerDefault &&
10925 "Can only fix up stack references once");
10926 fixupPostOutline(MBB);
10927
10928 IsLeafFunction = false;
10929
10930 // LR has to be a live in so that we can save it.
10931 if (!MBB.isLiveIn(AArch64::LR))
10932 MBB.addLiveIn(AArch64::LR);
10933
10933
10934 MachineBasicBlock::iterator It = MBB.begin();
10935 MachineBasicBlock::iterator Et = MBB.end();
10936
10937 if (OF.FrameConstructionID == MachineOutlinerTailCall ||
10938 OF.FrameConstructionID == MachineOutlinerThunk)
10939 Et = std::prev(MBB.end());
10940
10941 // Insert a save before the outlined region
10942 MachineInstr *STRXpre = BuildMI(MF, DebugLoc(), get(AArch64::STRXpre))
10943 .addReg(AArch64::SP, RegState::Define)
10944 .addReg(AArch64::LR)
10945 .addReg(AArch64::SP)
10946 .addImm(-16);
10947 It = MBB.insert(It, STRXpre);
10948
10949 if (MF.getInfo<AArch64FunctionInfo>()->needsDwarfUnwindInfo(MF)) {
10950 CFIInstBuilder CFIBuilder(MBB, It, MachineInstr::FrameSetup);
10951
10952 // Add a CFI saying the stack was moved 16 B down.
10953 CFIBuilder.buildDefCFAOffset(16);
10954
10955 // Add a CFI saying that the LR that we want to find is now 16 B higher
10956 // than before.
10957 CFIBuilder.buildOffset(AArch64::LR, -16);
10958 }
10959
10960 // Insert a restore before the terminator for the function.
10961 MachineInstr *LDRXpost = BuildMI(MF, DebugLoc(), get(AArch64::LDRXpost))
10962 .addReg(AArch64::SP, RegState::Define)
10963 .addReg(AArch64::LR, RegState::Define)
10964 .addReg(AArch64::SP)
10965 .addImm(16);
10966 Et = MBB.insert(Et, LDRXpost);
10967 }
10968
10969 auto RASignCondition = FI->getSignReturnAddressCondition();
10970 bool ShouldSignReturnAddr = AArch64FunctionInfo::shouldSignReturnAddress(
10971 RASignCondition, !IsLeafFunction);
10972
10973 // If this is a tail call outlined function, then there's already a return.
10974 if (OF.FrameConstructionID == MachineOutlinerTailCall ||
10975 OF.FrameConstructionID == MachineOutlinerThunk) {
10976 signOutlinedFunction(MF, MBB, this, ShouldSignReturnAddr);
10977 return;
10978 }
10979
10980 // It's not a tail call, so we have to insert the return ourselves.
10981
10982 // LR has to be a live in so that we can return to it.
10983 if (!MBB.isLiveIn(AArch64::LR))
10984 MBB.addLiveIn(AArch64::LR);
10985
10986 MachineInstr *ret = BuildMI(MF, DebugLoc(), get(AArch64::RET))
10987 .addReg(AArch64::LR);
10988 MBB.insert(MBB.end(), ret);
10989
10990 signOutlinedFunction(MF, MBB, this, ShouldSignReturnAddr);
10991
10992 FI->setOutliningStyle("Function");
10993
10994 // Did we have to modify the stack by saving the link register?
10995 if (OF.FrameConstructionID != MachineOutlinerDefault)
10996 return;
10997
10998 // We modified the stack.
10999 // Walk over the basic block and fix up all the stack accesses.
11000 fixupPostOutline(MBB);
11001}
11002
11003MachineBasicBlock::iterator AArch64InstrInfo::insertOutlinedCall(
11004 Module &M, MachineBasicBlock &MBB, MachineBasicBlock::iterator &It,
11005 MachineFunction &MF, outliner::Candidate &C) const {
11006
11007 // Are we tail calling?
11008 if (C.CallConstructionID == MachineOutlinerTailCall) {
11009 // If yes, then we can just branch to the label.
11010 It = MBB.insert(It, BuildMI(MF, DebugLoc(), get(AArch64::TCRETURNdi))
11011 .addGlobalAddress(M.getNamedValue(MF.getName()))
11012 .addImm(0));
11013 return It;
11014 }
11015
11016 // Are we saving the link register?
11017 if (C.CallConstructionID == MachineOutlinerNoLRSave ||
11018 C.CallConstructionID == MachineOutlinerThunk) {
11019 // No, so just insert the call.
11020 It = MBB.insert(It, BuildMI(MF, DebugLoc(), get(AArch64::BL))
11021 .addGlobalAddress(M.getNamedValue(MF.getName())));
11022 return It;
11023 }
11024
11025 // We want to return the spot where we inserted the call.
11026 MachineBasicBlock::iterator CallPt;
11027
11028 // Instructions for saving and restoring LR around the call instruction we're
11029 // going to insert.
11030 MachineInstr *Save;
11031 MachineInstr *Restore;
11032 // Can we save to a register?
11033 if (C.CallConstructionID == MachineOutlinerRegSave) {
11034 // FIXME: This logic should be sunk into a target-specific interface so that
11035 // we don't have to recompute the register.
11036 Register Reg = findRegisterToSaveLRTo(C);
11037 assert(Reg && "No callee-saved register available?");
11038
11039 // LR has to be a live in so that we can save it.
11040 if (!MBB.isLiveIn(AArch64::LR))
11041 MBB.addLiveIn(AArch64::LR);
11042
11043 // Save and restore LR from Reg.
11044 Save = BuildMI(MF, DebugLoc(), get(AArch64::ORRXrs), Reg)
11045 .addReg(AArch64::XZR)
11046 .addReg(AArch64::LR)
11047 .addImm(0);
11048 Restore = BuildMI(MF, DebugLoc(), get(AArch64::ORRXrs), AArch64::LR)
11049 .addReg(AArch64::XZR)
11050 .addReg(Reg)
11051 .addImm(0);
11052 } else {
11053 // We have the default case. Save and restore from SP.
11054 Save = BuildMI(MF, DebugLoc(), get(AArch64::STRXpre))
11055 .addReg(AArch64::SP, RegState::Define)
11056 .addReg(AArch64::LR)
11057 .addReg(AArch64::SP)
11058 .addImm(-16);
11059 Restore = BuildMI(MF, DebugLoc(), get(AArch64::LDRXpost))
11060 .addReg(AArch64::SP, RegState::Define)
11061 .addReg(AArch64::LR, RegState::Define)
11062 .addReg(AArch64::SP)
11063 .addImm(16);
11064 }
11065
11066 It = MBB.insert(It, Save);
11067 It++;
11068
11069 // Insert the call.
11070 It = MBB.insert(It, BuildMI(MF, DebugLoc(), get(AArch64::BL))
11071 .addGlobalAddress(M.getNamedValue(MF.getName())));
11072 CallPt = It;
11073 It++;
11074
11075 It = MBB.insert(It, Restore);
11076 return CallPt;
11077}
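// Illustrative call-site shapes (a sketch; the save register is just an
// example, the actual one comes from findRegisterToSaveLRTo):
//   MachineOutlinerTailCall:       b  OUTLINED_FUNCTION_N
//   MachineOutlinerNoLRSave/Thunk: bl OUTLINED_FUNCTION_N
//   MachineOutlinerRegSave:        mov x20, x30 ; bl ... ; mov x30, x20
//   MachineOutlinerDefault:        str x30, [sp, #-16]! ; bl ... ; ldr x30, [sp], #16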
11078
11079bool AArch64InstrInfo::shouldOutlineFromFunctionByDefault(
11080 MachineFunction &MF) const {
11081 return MF.getFunction().hasMinSize();
11082}
11083
11084void AArch64InstrInfo::buildClearRegister(Register Reg, MachineBasicBlock &MBB,
11085 MachineBasicBlock::iterator Iter,
11086 DebugLoc &DL,
11087 bool AllowSideEffects) const {
11088 const MachineFunction &MF = *MBB.getParent();
11089 const AArch64Subtarget &STI = MF.getSubtarget<AArch64Subtarget>();
11090 const AArch64RegisterInfo &TRI = *STI.getRegisterInfo();
11091
11092 if (TRI.isGeneralPurposeRegister(MF, Reg)) {
11093 BuildMI(MBB, Iter, DL, get(AArch64::MOVZXi), Reg).addImm(0).addImm(0);
11094 } else if (STI.isSVEorStreamingSVEAvailable()) {
11095 BuildMI(MBB, Iter, DL, get(AArch64::DUP_ZI_D), Reg)
11096 .addImm(0)
11097 .addImm(0);
11098 } else if (STI.isNeonAvailable()) {
11099 BuildMI(MBB, Iter, DL, get(AArch64::MOVIv2d_ns), Reg)
11100 .addImm(0);
11101 } else {
11102 // This is a streaming-compatible function without SVE. We don't have full
11103 // Neon (just FPRs), so we can at most use the first 64-bit sub-register.
11104 // Since `movi v..` would be illegal here, use `fmov d..` instead.
11105 assert(STI.hasNEON() && "Expected to have NEON.");
11106 Register Reg64 = TRI.getSubReg(Reg, AArch64::dsub);
11107 BuildMI(MBB, Iter, DL, get(AArch64::FMOVD0), Reg64);
11108 }
11109}
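// Illustrative expansions (sketch) for clearing x0 / z0 / q0 / d0 along the
// branches above; the exact encodings come from the pseudos used:
//   movz x0, #0        ; general-purpose register
//   dup  z0.d, #0      ; SVE or streaming SVE available
//   movi v0.2d, #0     ; NEON available
//   fmov d0, xzr       ; streaming-compatible fallback (FMOVD0, typically)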
11110
11111std::optional<DestSourcePair>
11112AArch64InstrInfo::isCopyInstrImpl(const MachineInstr &MI) const {
11113
11114 // AArch64::ORRWrs and AArch64::ORRXrs with WZR/XZR reg
11115 // and zero immediate operands used as an alias for mov instruction.
11116 if (((MI.getOpcode() == AArch64::ORRWrs &&
11117 MI.getOperand(1).getReg() == AArch64::WZR &&
11118 MI.getOperand(3).getImm() == 0x0) ||
11119 (MI.getOpcode() == AArch64::ORRWrr &&
11120 MI.getOperand(1).getReg() == AArch64::WZR)) &&
11121 // Check that the w->w move is not a zero-extending w->x mov.
11122 (!MI.getOperand(0).getReg().isVirtual() ||
11123 MI.getOperand(0).getSubReg() == 0) &&
11124 (!MI.getOperand(0).getReg().isPhysical() ||
11125 MI.findRegisterDefOperandIdx(getXRegFromWReg(MI.getOperand(0).getReg()),
11126 /*TRI=*/nullptr) == -1))
11127 return DestSourcePair{MI.getOperand(0), MI.getOperand(2)};
11128
11129 if (MI.getOpcode() == AArch64::ORRXrs &&
11130 MI.getOperand(1).getReg() == AArch64::XZR &&
11131 MI.getOperand(3).getImm() == 0x0)
11132 return DestSourcePair{MI.getOperand(0), MI.getOperand(2)};
11133
11134 return std::nullopt;
11135}
11136
11137std::optional<DestSourcePair>
11138AArch64InstrInfo::isCopyLikeInstrImpl(const MachineInstr &MI) const {
11139 if ((MI.getOpcode() == AArch64::ORRWrs &&
11140 MI.getOperand(1).getReg() == AArch64::WZR &&
11141 MI.getOperand(3).getImm() == 0x0) ||
11142 (MI.getOpcode() == AArch64::ORRWrr &&
11143 MI.getOperand(1).getReg() == AArch64::WZR))
11144 return DestSourcePair{MI.getOperand(0), MI.getOperand(2)};
11145 return std::nullopt;
11146}
11147
11148std::optional<RegImmPair>
11149AArch64InstrInfo::isAddImmediate(const MachineInstr &MI, Register Reg) const {
11150 int Sign = 1;
11151 int64_t Offset = 0;
11152
11153 // TODO: Handle cases where Reg is a super- or sub-register of the
11154 // destination register.
11155 const MachineOperand &Op0 = MI.getOperand(0);
11156 if (!Op0.isReg() || Reg != Op0.getReg())
11157 return std::nullopt;
11158
11159 switch (MI.getOpcode()) {
11160 default:
11161 return std::nullopt;
11162 case AArch64::SUBWri:
11163 case AArch64::SUBXri:
11164 case AArch64::SUBSWri:
11165 case AArch64::SUBSXri:
11166 Sign *= -1;
11167 [[fallthrough]];
11168 case AArch64::ADDSWri:
11169 case AArch64::ADDSXri:
11170 case AArch64::ADDWri:
11171 case AArch64::ADDXri: {
11172 // TODO: Third operand can be global address (usually some string).
11173 if (!MI.getOperand(0).isReg() || !MI.getOperand(1).isReg() ||
11174 !MI.getOperand(2).isImm())
11175 return std::nullopt;
11176 int Shift = MI.getOperand(3).getImm();
11177 assert((Shift == 0 || Shift == 12) && "Shift can be either 0 or 12");
11178 Offset = Sign * (MI.getOperand(2).getImm() << Shift);
11179 }
11180 }
11181 return RegImmPair{MI.getOperand(1).getReg(), Offset};
11182}
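// Worked example (illustrative): "add x1, x2, #3, lsl #12" yields
// RegImmPair{x2, 3 << 12} == {x2, 12288}; "sub x1, x2, #16" yields
// {x2, -16} via the negated Sign above.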
11183
11184/// If the given ORR instruction is a copy, and \p DescribedReg overlaps with
11185/// the destination register then, if possible, describe the value in terms of
11186/// the source register.
11187static std::optional<ParamLoadedValue>
11188describeORRLoadedValue(const MachineInstr &MI, Register DescribedReg,
11189 const TargetInstrInfo *TII,
11190 const TargetRegisterInfo *TRI) {
11191 auto DestSrc = TII->isCopyLikeInstr(MI);
11192 if (!DestSrc)
11193 return std::nullopt;
11194
11195 Register DestReg = DestSrc->Destination->getReg();
11196 Register SrcReg = DestSrc->Source->getReg();
11197
11198 if (!DestReg.isValid() || !SrcReg.isValid())
11199 return std::nullopt;
11200
11201 auto Expr = DIExpression::get(MI.getMF()->getFunction().getContext(), {});
11202
11203 // If the described register is the destination, just return the source.
11204 if (DestReg == DescribedReg)
11205 return ParamLoadedValue(MachineOperand::CreateReg(SrcReg, false), Expr);
11206
11207 // ORRWrs zero-extends to 64-bits, so we need to consider such cases.
11208 if (MI.getOpcode() == AArch64::ORRWrs &&
11209 TRI->isSuperRegister(DestReg, DescribedReg))
11210 return ParamLoadedValue(MachineOperand::CreateReg(SrcReg, false), Expr);
11211
11212 // We may need to describe the lower part of a ORRXrs move.
11213 if (MI.getOpcode() == AArch64::ORRXrs &&
11214 TRI->isSubRegister(DestReg, DescribedReg)) {
11215 Register SrcSubReg = TRI->getSubReg(SrcReg, AArch64::sub_32);
11216 return ParamLoadedValue(MachineOperand::CreateReg(SrcSubReg, false), Expr);
11217 }
11218
11219 assert(!TRI->isSuperOrSubRegisterEq(DestReg, DescribedReg) &&
11220 "Unhandled ORR[XW]rs copy case");
11221
11222 return std::nullopt;
11223}
11224
11225bool AArch64InstrInfo::isFunctionSafeToSplit(const MachineFunction &MF) const {
11226 // Functions cannot be split to different sections on AArch64 if they have
11227 // a red zone. This is because relaxing a cross-section branch may require
11228 // incrementing the stack pointer to spill a register, which would overwrite
11229 // the red zone.
11230 if (MF.getInfo<AArch64FunctionInfo>()->hasRedZone().value_or(true))
11231 return false;
11232
11233 return TargetInstrInfo::isFunctionSafeToSplit(MF);
11234}
11235
11236bool AArch64InstrInfo::isMBBSafeToSplitToCold(
11237 const MachineBasicBlock &MBB) const {
11238 // Asm Goto blocks can contain conditional branches to goto labels, which can
11239 // get moved out of range of the branch instruction.
11240 auto isAsmGoto = [](const MachineInstr &MI) {
11241 return MI.getOpcode() == AArch64::INLINEASM_BR;
11242 };
11243 if (llvm::any_of(MBB, isAsmGoto) || MBB.isInlineAsmBrIndirectTarget())
11244 return false;
11245
11246 // Because jump tables are label-relative instead of table-relative, they all
11247 // must be in the same section or relocation fixup handling will fail.
11248
11249 // Check if MBB is a jump table target
11250 const MachineJumpTableInfo *MJTI = MBB.getParent()->getJumpTableInfo();
11251 auto containsMBB = [&MBB](const MachineJumpTableEntry &JTE) {
11252 return llvm::is_contained(JTE.MBBs, &MBB);
11253 };
11254 if (MJTI != nullptr && llvm::any_of(MJTI->getJumpTables(), containsMBB))
11255 return false;
11256
11257 // Check if MBB contains a jump table lookup
11258 for (const MachineInstr &MI : MBB) {
11259 switch (MI.getOpcode()) {
11260 case TargetOpcode::G_BRJT:
11261 case AArch64::JumpTableDest32:
11262 case AArch64::JumpTableDest16:
11263 case AArch64::JumpTableDest8:
11264 return false;
11265 default:
11266 continue;
11267 }
11268 }
11269
11270 // MBB isn't a special case, so it's safe to be split to the cold section.
11271 return true;
11272}
11273
11274std::optional<ParamLoadedValue>
11275AArch64InstrInfo::describeLoadedValue(const MachineInstr &MI,
11276 Register Reg) const {
11277 const MachineFunction *MF = MI.getMF();
11278 const TargetRegisterInfo *TRI = MF->getSubtarget().getRegisterInfo();
11279 switch (MI.getOpcode()) {
11280 case AArch64::MOVZWi:
11281 case AArch64::MOVZXi: {
11282 // MOVZWi may be used for producing zero-extended 32-bit immediates in
11283 // 64-bit parameters, so we need to consider super-registers.
11284 if (!TRI->isSuperRegisterEq(MI.getOperand(0).getReg(), Reg))
11285 return std::nullopt;
11286
11287 if (!MI.getOperand(1).isImm())
11288 return std::nullopt;
11289 int64_t Immediate = MI.getOperand(1).getImm();
11290 int Shift = MI.getOperand(2).getImm();
11291 return ParamLoadedValue(MachineOperand::CreateImm(Immediate << Shift),
11292 nullptr);
11293 }
11294 case AArch64::ORRWrs:
11295 case AArch64::ORRXrs:
11296 return describeORRLoadedValue(MI, Reg, this, TRI);
11297 }
11298
11299 return TargetInstrInfo::describeLoadedValue(MI, Reg);
11300}
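// Example (illustrative): for "movz w0, #42, lsl #16" the loaded value is
// the immediate 42 << 16 == 2752512, and the super-register check above
// lets the same description answer a query about x0.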
11301
11302bool AArch64InstrInfo::isExtendLikelyToBeFolded(
11303 MachineInstr &ExtMI, MachineRegisterInfo &MRI) const {
11304 assert(ExtMI.getOpcode() == TargetOpcode::G_SEXT ||
11305 ExtMI.getOpcode() == TargetOpcode::G_ZEXT ||
11306 ExtMI.getOpcode() == TargetOpcode::G_ANYEXT);
11307
11308 // Anyexts are nops.
11309 if (ExtMI.getOpcode() == TargetOpcode::G_ANYEXT)
11310 return true;
11311
11312 Register DefReg = ExtMI.getOperand(0).getReg();
11313 if (!MRI.hasOneNonDBGUse(DefReg))
11314 return false;
11315
11316 // It's likely that a sext/zext as a G_PTR_ADD offset will be folded into an
11317 // addressing mode.
11318 auto *UserMI = &*MRI.use_instr_nodbg_begin(DefReg);
11319 return UserMI->getOpcode() == TargetOpcode::G_PTR_ADD;
11320}
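// Example (illustrative, GlobalISel): a zero-extend whose only use is a
// G_PTR_ADD offset,
//   %off:_(s64) = G_ZEXT %idx:_(s32)
//   %addr:_(p0) = G_PTR_ADD %base, %off
// is likely selected into an extended-register addressing mode such as
//   ldr x0, [x1, w2, uxtw]
// which is why it is reported as likely to be folded.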
11321
11322uint64_t AArch64InstrInfo::getElementSizeForOpcode(unsigned Opc) const {
11323 return get(Opc).TSFlags & AArch64::ElementSizeMask;
11324}
11325
11326bool AArch64InstrInfo::isPTestLikeOpcode(unsigned Opc) const {
11327 return get(Opc).TSFlags & AArch64::InstrFlagIsPTestLike;
11328}
11329
11330bool AArch64InstrInfo::isWhileOpcode(unsigned Opc) const {
11331 return get(Opc).TSFlags & AArch64::InstrFlagIsWhile;
11332}
11333
11334unsigned int
11335AArch64InstrInfo::getTailDuplicateSize(CodeGenOptLevel OptLevel) const {
11336 return OptLevel >= CodeGenOptLevel::Aggressive ? 6 : 2;
11337}
11338
11339bool AArch64InstrInfo::isLegalAddressingMode(unsigned NumBytes, int64_t Offset,
11340 unsigned Scale) const {
11341 if (Offset && Scale)
11342 return false;
11343
11344 // Check Reg + Imm
11345 if (!Scale) {
11346 // 9-bit signed offset
11347 if (isInt<9>(Offset))
11348 return true;
11349
11350 // 12-bit unsigned offset
11351 unsigned Shift = Log2_64(NumBytes);
11352 if (NumBytes && Offset > 0 && (Offset / NumBytes) <= (1LL << 12) - 1 &&
11353 // Must be a multiple of NumBytes (NumBytes is a power of 2)
11354 (Offset >> Shift) << Shift == Offset)
11355 return true;
11356 return false;
11357 }
11358
11359 // Check reg1 + SIZE_IN_BYTES * reg2 and reg1 + reg2
11360 return Scale == 1 || (Scale > 0 && Scale == NumBytes);
11361}
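// Worked examples (illustrative) for an 8-byte access (NumBytes == 8):
//   Offset == -8,    Scale == 0  -> legal (9-bit signed unscaled form)
//   Offset == 32760, Scale == 0  -> legal (32760 / 8 == 4095, the 12-bit max)
//   Offset == 32768, Scale == 0  -> illegal (scaled immediate out of range)
//   Offset == 0,     Scale == 8  -> legal (reg + reg scaled by access size)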
11362
11363unsigned llvm::getBLRCallOpcode(const MachineFunction &MF) {
11364 if (MF.getSubtarget<AArch64Subtarget>().hardenSlsBlr())
11365 return AArch64::BLRNoIP;
11366 else
11367 return AArch64::BLR;
11368}
11369
11370void AArch64InstrInfo::createPauthEpilogueInstr(MachineBasicBlock &MBB,
11371 DebugLoc DL) const {
11372 MachineBasicBlock::iterator InsertPt = MBB.getFirstTerminator();
11373 auto Builder = BuildMI(MBB, InsertPt, DL, get(AArch64::PAUTH_EPILOGUE))
11374 .setMIFlag(MachineInstr::FrameDestroy);
11375
11376 const auto *AFI = MBB.getParent()->getInfo<AArch64FunctionInfo>();
11377 if (AFI->branchProtectionPAuthLR() && !Subtarget.hasPAuthLR())
11378 Builder.addReg(AArch64::X16, RegState::ImplicitDefine);
11379}
11380
11381MachineBasicBlock::iterator
11382AArch64InstrInfo::probedStackAlloc(MachineBasicBlock::iterator MBBI,
11383 Register TargetReg, bool FrameSetup) const {
11384 assert(TargetReg != AArch64::SP && "New top of stack cannot already be in SP");
11385
11386 MachineBasicBlock &MBB = *MBBI->getParent();
11387 MachineFunction &MF = *MBB.getParent();
11388 const AArch64InstrInfo *TII =
11389 MF.getSubtarget<AArch64Subtarget>().getInstrInfo();
11390 int64_t ProbeSize = MF.getInfo<AArch64FunctionInfo>()->getStackProbeSize();
11391 DebugLoc DL = MBB.findDebugLoc(MBBI);
11392
11393 MachineFunction::iterator MBBInsertPoint = std::next(MBB.getIterator());
11394 MachineBasicBlock *LoopTestMBB =
11395 MF.CreateMachineBasicBlock(MBB.getBasicBlock());
11396 MF.insert(MBBInsertPoint, LoopTestMBB);
11397 MachineBasicBlock *LoopBodyMBB =
11398 MF.CreateMachineBasicBlock(MBB.getBasicBlock());
11399 MF.insert(MBBInsertPoint, LoopBodyMBB);
11400 MachineBasicBlock *ExitMBB = MF.CreateMachineBasicBlock(MBB.getBasicBlock());
11401 MF.insert(MBBInsertPoint, ExitMBB);
11402 MachineInstr::MIFlag Flags =
11403 FrameSetup ? MachineInstr::FrameSetup : MachineInstr::NoFlags;
11404
11405 // LoopTest:
11406 // SUB SP, SP, #ProbeSize
11407 emitFrameOffset(*LoopTestMBB, LoopTestMBB->end(), DL, AArch64::SP,
11408 AArch64::SP, StackOffset::getFixed(-ProbeSize), TII, Flags);
11409
11410 // CMP SP, TargetReg
11411 BuildMI(*LoopTestMBB, LoopTestMBB->end(), DL, TII->get(AArch64::SUBSXrx64),
11412 AArch64::XZR)
11413 .addReg(AArch64::SP)
11414 .addReg(TargetReg)
11415 .addImm(AArch64_AM::getArithExtendImm(AArch64_AM::UXTX, 0))
11416 .setMIFlags(Flags);
11417
11418 // B.<Cond> LoopExit
11419 BuildMI(*LoopTestMBB, LoopTestMBB->end(), DL, TII->get(AArch64::Bcc))
11420 .addImm(AArch64CC::LE)
11421 .addMBB(ExitMBB)
11422 .setMIFlags(Flags);
11423
11424 // LDR XZR, [SP]
11425 BuildMI(*LoopBodyMBB, LoopBodyMBB->end(), DL, TII->get(AArch64::LDRXui))
11426 .addDef(AArch64::XZR)
11427 .addReg(AArch64::SP)
11428 .addImm(0)
11429 .addMemOperand(MF.getMachineMemOperand(
11430 MachinePointerInfo::getUnknownStack(MF),
11431 MachineMemOperand::MOVolatile | MachineMemOperand::MOLoad, 8,
11432 Align(8)))
11433 .setMIFlags(Flags);
11434
11435 // B loop
11436 BuildMI(*LoopBodyMBB, LoopBodyMBB->end(), DL, TII->get(AArch64::B))
11437 .addMBB(LoopTestMBB)
11438 .setMIFlags(Flags);
11439
11440 // LoopExit:
11441 // MOV SP, TargetReg
11442 BuildMI(*ExitMBB, ExitMBB->end(), DL, TII->get(AArch64::ADDXri), AArch64::SP)
11443 .addReg(TargetReg)
11444 .addImm(0)
11445 .addImm(AArch64_AM::getShifterImm(AArch64_AM::LSL, 0))
11446 .setMIFlags(Flags);
11447
11448 // LDR XZR, [SP]
11449 BuildMI(*ExitMBB, ExitMBB->end(), DL, TII->get(AArch64::LDRXui))
11450 .addReg(AArch64::XZR, RegState::Define)
11451 .addReg(AArch64::SP)
11452 .addImm(0)
11453 .setMIFlags(Flags);
11454
11455 ExitMBB->splice(ExitMBB->end(), &MBB, std::next(MBBI), MBB.end());
11456 ExitMBB->transferSuccessorsAndUpdatePHIs(&MBB);
11457
11458 LoopTestMBB->addSuccessor(ExitMBB);
11459 LoopTestMBB->addSuccessor(LoopBodyMBB);
11460 LoopBodyMBB->addSuccessor(LoopTestMBB);
11461 MBB.addSuccessor(LoopTestMBB);
11462
11463 // Update liveins.
11464 if (MF.getRegInfo().reservedRegsFrozen())
11465 fullyRecomputeLiveIns({ExitMBB, LoopBodyMBB, LoopTestMBB});
11466
11467 return ExitMBB->begin();
11468}
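// Consolidated shape of the emitted probe loop (sketch of the above):
//   LoopTest: sub  sp, sp, #ProbeSize
//             cmp  sp, <TargetReg>
//             b.le LoopExit
//   LoopBody: ldr  xzr, [sp]        ; probe the newly allocated region
//             b    LoopTest
//   LoopExit: mov  sp, <TargetReg>
//             ldr  xzr, [sp]        ; probe the final SP value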
11469
11470namespace {
11471class AArch64PipelinerLoopInfo : public TargetInstrInfo::PipelinerLoopInfo {
11472 MachineFunction *MF;
11473 const TargetInstrInfo *TII;
11474 const TargetRegisterInfo *TRI;
11475 MachineRegisterInfo &MRI;
11476
11477 /// The block of the loop
11478 MachineBasicBlock *LoopBB;
11479 /// The conditional branch of the loop
11480 MachineInstr *CondBranch;
11481 /// The compare instruction for loop control
11482 MachineInstr *Comp;
11483 /// The number of the operand of the loop counter value in Comp
11484 unsigned CompCounterOprNum;
11485 /// The instruction that updates the loop counter value
11486 MachineInstr *Update;
11487 /// The number of the operand of the loop counter value in Update
11488 unsigned UpdateCounterOprNum;
11489 /// The initial value of the loop counter
11490 Register Init;
11491 /// True iff Update is a predecessor of Comp
11492 bool IsUpdatePriorComp;
11493
11494 /// The normalized condition used by createTripCountGreaterCondition()
11495 SmallVector<MachineOperand, 4> Cond;
11496
11497public:
11498 AArch64PipelinerLoopInfo(MachineBasicBlock *LoopBB, MachineInstr *CondBranch,
11499 MachineInstr *Comp, unsigned CompCounterOprNum,
11500 MachineInstr *Update, unsigned UpdateCounterOprNum,
11501 Register Init, bool IsUpdatePriorComp,
11502 const SmallVectorImpl<MachineOperand> &Cond)
11503 : MF(Comp->getParent()->getParent()),
11504 TII(MF->getSubtarget().getInstrInfo()),
11505 TRI(MF->getSubtarget().getRegisterInfo()), MRI(MF->getRegInfo()),
11506 LoopBB(LoopBB), CondBranch(CondBranch), Comp(Comp),
11507 CompCounterOprNum(CompCounterOprNum), Update(Update),
11508 UpdateCounterOprNum(UpdateCounterOprNum), Init(Init),
11509 IsUpdatePriorComp(IsUpdatePriorComp), Cond(Cond.begin(), Cond.end()) {}
11510
11511 bool shouldIgnoreForPipelining(const MachineInstr *MI) const override {
11512 // Make the instructions for loop control be placed in stage 0.
11513 // The predecessors of Comp are considered by the caller.
11514 return MI == Comp;
11515 }
11516
11517 std::optional<bool> createTripCountGreaterCondition(
11518 int TC, MachineBasicBlock &MBB,
11519 SmallVectorImpl<MachineOperand> &CondParam) override {
11520 // A branch instruction will be inserted as "if (Cond) goto epilogue".
11521 // Cond is normalized for such use.
11522 // The predecessors of the branch are assumed to have already been inserted.
11523 CondParam = Cond;
11524 return {};
11525 }
11526
11527 void createRemainingIterationsGreaterCondition(
11528 int TC, MachineBasicBlock &MBB, SmallVectorImpl<MachineOperand> &Cond,
11529 DenseMap<MachineInstr *, MachineInstr *> &LastStage0Insts) override;
11530
11531 void setPreheader(MachineBasicBlock *NewPreheader) override {}
11532
11533 void adjustTripCount(int TripCountAdjust) override {}
11534
11535 bool isMVEExpanderSupported() override { return true; }
11536};
11537} // namespace
11538
11539/// Clone an instruction from MI. The register of ReplaceOprNum-th operand
11540/// is replaced by ReplaceReg. The output register is newly created.
11541/// The other operands are unchanged from MI.
11542static Register cloneInstr(const MachineInstr *MI, unsigned ReplaceOprNum,
11543 Register ReplaceReg, MachineBasicBlock &MBB,
11544 MachineBasicBlock::iterator InsertTo) {
11545 MachineRegisterInfo &MRI = MBB.getParent()->getRegInfo();
11546 const TargetInstrInfo *TII = MBB.getParent()->getSubtarget().getInstrInfo();
11547 MachineInstr *NewMI = MBB.getParent()->CloneMachineInstr(MI);
11548 Register Result = 0;
11549 for (unsigned I = 0; I < NewMI->getNumOperands(); ++I) {
11550 if (I == 0 && NewMI->getOperand(0).getReg().isVirtual()) {
11551 Result = MRI.createVirtualRegister(
11552 MRI.getRegClass(NewMI->getOperand(0).getReg()));
11553 NewMI->getOperand(I).setReg(Result);
11554 } else if (I == ReplaceOprNum) {
11555 MRI.constrainRegClass(ReplaceReg, TII->getRegClass(NewMI->getDesc(), I));
11556 NewMI->getOperand(I).setReg(ReplaceReg);
11557 }
11558 }
11559 MBB.insert(InsertTo, NewMI);
11560 return Result;
11561}
11562
11563void AArch64PipelinerLoopInfo::createRemainingIterationsGreaterCondition(
11564 int TC, MachineBasicBlock &MBB, SmallVectorImpl<MachineOperand> &Cond,
11565 DenseMap<MachineInstr *, MachineInstr *> &LastStage0Insts) {
11566 // Create and accumulate conditions for next TC iterations.
11567 // Example:
11568 // SUBSXrr N, counter, implicit-def $nzcv # compare instruction for the last
11569 // # iteration of the kernel
11570 //
11571 // # insert the following instructions
11572 // cond = CSINCXr 0, 0, C, implicit $nzcv
11573 // counter = ADDXri counter, 1 # clone from this->Update
11574 // SUBSXrr n, counter, implicit-def $nzcv # clone from this->Comp
11575 // cond = CSINCXr cond, cond, C, implicit $nzcv
11576 // ... (repeat TC times)
11577 // SUBSXri cond, 0, implicit-def $nzcv
11578
11579 assert(CondBranch->getOpcode() == AArch64::Bcc);
11580 // CondCode to exit the loop
11581 AArch64CC::CondCode CC =
11582 (AArch64CC::CondCode)CondBranch->getOperand(0).getImm();
11583 if (CondBranch->getOperand(1).getMBB() == LoopBB)
11584 CC = AArch64CC::getInvertedCondCode(CC);
11585
11586 // Accumulate conditions to exit the loop
11587 Register AccCond = AArch64::XZR;
11588
11589 // If CC holds, CurCond+1 is returned; otherwise CurCond is returned.
11590 auto AccumulateCond = [&](Register CurCond,
11591 AArch64CC::CondCode CC) -> Register {
11592 Register NewCond = MRI.createVirtualRegister(&AArch64::GPR64commonRegClass);
11593 BuildMI(MBB, MBB.end(), Comp->getDebugLoc(), TII->get(AArch64::CSINCXr))
11594 .addReg(NewCond, RegState::Define)
11595 .addReg(CurCond)
11596 .addReg(CurCond)
11597 .addImm(AArch64CC::getInvertedCondCode(CC));
11598 return NewCond;
11599 };
11600
11601 if (!LastStage0Insts.empty() && LastStage0Insts[Comp]->getParent() == &MBB) {
11602 // Update and Comp for I==0 already exist in MBB
11603 // (MBB is an unrolled kernel)
11604 Register Counter;
11605 for (int I = 0; I <= TC; ++I) {
11606 Register NextCounter;
11607 if (I != 0)
11608 NextCounter =
11609 cloneInstr(Comp, CompCounterOprNum, Counter, MBB, MBB.end());
11610
11611 AccCond = AccumulateCond(AccCond, CC);
11612
11613 if (I != TC) {
11614 if (I == 0) {
11615 if (Update != Comp && IsUpdatePriorComp) {
11616 Counter =
11617 LastStage0Insts[Comp]->getOperand(CompCounterOprNum).getReg();
11618 NextCounter = cloneInstr(Update, UpdateCounterOprNum, Counter, MBB,
11619 MBB.end());
11620 } else {
11621 // We can use the already-calculated value.
11622 NextCounter = LastStage0Insts[Update]->getOperand(0).getReg();
11623 }
11624 } else if (Update != Comp) {
11625 NextCounter =
11626 cloneInstr(Update, UpdateCounterOprNum, Counter, MBB, MBB.end());
11627 }
11628 }
11629 Counter = NextCounter;
11630 }
11631 } else {
11632 Register Counter;
11633 if (LastStage0Insts.empty()) {
11634 // Use the initial counter value (testing if the trip count is sufficient
11635 // for the pipelined code to execute).
11636 Counter = Init;
11637 if (IsUpdatePriorComp)
11638 Counter =
11639 cloneInstr(Update, UpdateCounterOprNum, Counter, MBB, MBB.end());
11640 } else {
11641 // MBB is an epilogue block. LastStage0Insts[Comp] is in the kernel block.
11642 Counter = LastStage0Insts[Comp]->getOperand(CompCounterOprNum).getReg();
11643 }
11644
11645 for (int I = 0; I <= TC; ++I) {
11646 Register NextCounter;
11647 NextCounter =
11648 cloneInstr(Comp, CompCounterOprNum, Counter, MBB, MBB.end());
11649 AccCond = AccumulateCond(AccCond, CC);
11650 if (I != TC && Update != Comp)
11651 NextCounter =
11652 cloneInstr(Update, UpdateCounterOprNum, Counter, MBB, MBB.end());
11653 Counter = NextCounter;
11654 }
11655 }
11656
11657 // If AccCond == 0, the remainder is greater than TC.
11658 BuildMI(MBB, MBB.end(), Comp->getDebugLoc(), TII->get(AArch64::SUBSXri))
11659 .addReg(AArch64::XZR, RegState::Define | RegState::Dead)
11660 .addReg(AccCond)
11661 .addImm(0)
11662 .addImm(0);
11663 Cond.clear();
11664 Cond.push_back(MachineOperand::CreateImm(AArch64CC::EQ));
11665}
11666
11667static void extractPhiReg(const MachineInstr &Phi, const MachineBasicBlock *MBB,
11668 Register &RegMBB, Register &RegOther) {
11669 assert(Phi.getNumOperands() == 5);
11670 if (Phi.getOperand(2).getMBB() == MBB) {
11671 RegMBB = Phi.getOperand(1).getReg();
11672 RegOther = Phi.getOperand(3).getReg();
11673 } else {
11674 assert(Phi.getOperand(4).getMBB() == MBB);
11675 RegMBB = Phi.getOperand(3).getReg();
11676 RegOther = Phi.getOperand(1).getReg();
11677 }
11678}
11679
11680static bool isDefinedOutside(Register Reg, const MachineBasicBlock *BB) {
11681 if (!Reg.isVirtual())
11682 return false;
11683 const MachineRegisterInfo &MRI = BB->getParent()->getRegInfo();
11684 return MRI.getVRegDef(Reg)->getParent() != BB;
11685}
11686
11687/// If Reg is an induction variable, return true and set some parameters
11688static bool getIndVarInfo(Register Reg, const MachineBasicBlock *LoopBB,
11689 MachineInstr *&UpdateInst,
11690 unsigned &UpdateCounterOprNum, Register &InitReg,
11691 bool &IsUpdatePriorComp) {
11692 // Example:
11693 //
11694 // Preheader:
11695 // InitReg = ...
11696 // LoopBB:
11697 // Reg0 = PHI (InitReg, Preheader), (Reg1, LoopBB)
11698 // Reg = COPY Reg0 ; COPY is ignored.
11699 // Reg1 = ADD Reg, #1; UpdateInst. Incremented by a loop invariant value.
11700 // ; Reg is the value calculated in the previous
11701 // ; iteration, so IsUpdatePriorComp == false.
11702
11703 if (LoopBB->pred_size() != 2)
11704 return false;
11705 if (!Reg.isVirtual())
11706 return false;
11707 const MachineRegisterInfo &MRI = LoopBB->getParent()->getRegInfo();
11708 UpdateInst = nullptr;
11709 UpdateCounterOprNum = 0;
11710 InitReg = 0;
11711 IsUpdatePriorComp = true;
11712 Register CurReg = Reg;
11713 while (true) {
11714 MachineInstr *Def = MRI.getVRegDef(CurReg);
11715 if (Def->getParent() != LoopBB)
11716 return false;
11717 if (Def->isCopy()) {
11718 // Ignore copy instructions unless they contain subregisters
11719 if (Def->getOperand(0).getSubReg() || Def->getOperand(1).getSubReg())
11720 return false;
11721 CurReg = Def->getOperand(1).getReg();
11722 } else if (Def->isPHI()) {
11723 if (InitReg != 0)
11724 return false;
11725 if (!UpdateInst)
11726 IsUpdatePriorComp = false;
11727 extractPhiReg(*Def, LoopBB, CurReg, InitReg);
11728 } else {
11729 if (UpdateInst)
11730 return false;
11731 switch (Def->getOpcode()) {
11732 case AArch64::ADDSXri:
11733 case AArch64::ADDSWri:
11734 case AArch64::SUBSXri:
11735 case AArch64::SUBSWri:
11736 case AArch64::ADDXri:
11737 case AArch64::ADDWri:
11738 case AArch64::SUBXri:
11739 case AArch64::SUBWri:
11740 UpdateInst = Def;
11741 UpdateCounterOprNum = 1;
11742 break;
11743 case AArch64::ADDSXrr:
11744 case AArch64::ADDSWrr:
11745 case AArch64::SUBSXrr:
11746 case AArch64::SUBSWrr:
11747 case AArch64::ADDXrr:
11748 case AArch64::ADDWrr:
11749 case AArch64::SUBXrr:
11750 case AArch64::SUBWrr:
11751 UpdateInst = Def;
11752 if (isDefinedOutside(Def->getOperand(2).getReg(), LoopBB))
11753 UpdateCounterOprNum = 1;
11754 else if (isDefinedOutside(Def->getOperand(1).getReg(), LoopBB))
11755 UpdateCounterOprNum = 2;
11756 else
11757 return false;
11758 break;
11759 default:
11760 return false;
11761 }
11762 CurReg = Def->getOperand(UpdateCounterOprNum).getReg();
11763 }
11764
11765 if (!CurReg.isVirtual())
11766 return false;
11767 if (Reg == CurReg)
11768 break;
11769 }
11770
11771 if (!UpdateInst)
11772 return false;
11773
11774 return true;
11775}
11776
11777std::unique_ptr<TargetInstrInfo::PipelinerLoopInfo>
11778AArch64InstrInfo::analyzeLoopForPipelining(MachineBasicBlock *LoopBB) const {
11779 // Accept loops that meet the following conditions
11780 // * The conditional branch is BCC
11781 // * The compare instruction is ADDS/SUBS/WHILEXX
11782 // * One operand of the compare is an induction variable and the other is a
11783 // loop invariant value
11784 // * The induction variable is incremented/decremented by a single instruction
11785 // * Does not contain CALL or instructions which have unmodeled side effects
11786
11787 for (MachineInstr &MI : *LoopBB)
11788 if (MI.isCall() || MI.hasUnmodeledSideEffects())
11789 // This instruction may use NZCV, which interferes with the instruction to
11790 // be inserted for loop control.
11791 return nullptr;
11792
11793 MachineBasicBlock *TBB = nullptr, *FBB = nullptr;
11794 SmallVector<MachineOperand, 4> Cond;
11795 if (analyzeBranch(*LoopBB, TBB, FBB, Cond))
11796 return nullptr;
11797
11798 // Infinite loops are not supported
11799 if (TBB == LoopBB && FBB == LoopBB)
11800 return nullptr;
11801
11802 // Must be conditional branch
11803 if (TBB != LoopBB && FBB == nullptr)
11804 return nullptr;
11805
11806 assert((TBB == LoopBB || FBB == LoopBB) &&
11807 "The Loop must be a single-basic-block loop");
11808
11809 MachineInstr *CondBranch = &*LoopBB->getFirstTerminator();
11810 const TargetRegisterInfo &TRI = getRegisterInfo();
11811
11812 if (CondBranch->getOpcode() != AArch64::Bcc)
11813 return nullptr;
11814
11815 // Normalization for createTripCountGreaterCondition()
11816 if (TBB == LoopBB)
11817 reverseBranchCondition(Cond);
11818
11819 MachineInstr *Comp = nullptr;
11820 unsigned CompCounterOprNum = 0;
11821 for (MachineInstr &MI : reverse(*LoopBB)) {
11822 if (MI.modifiesRegister(AArch64::NZCV, &TRI)) {
11823 // Guarantee that the compare is SUBS/ADDS/WHILEXX and that one of the
11824 // operands is a loop invariant value
11825
11826 switch (MI.getOpcode()) {
11827 case AArch64::SUBSXri:
11828 case AArch64::SUBSWri:
11829 case AArch64::ADDSXri:
11830 case AArch64::ADDSWri:
11831 Comp = &MI;
11832 CompCounterOprNum = 1;
11833 break;
11834 case AArch64::ADDSWrr:
11835 case AArch64::ADDSXrr:
11836 case AArch64::SUBSWrr:
11837 case AArch64::SUBSXrr:
11838 Comp = &MI;
11839 break;
11840 default:
11841 if (isWhileOpcode(MI.getOpcode())) {
11842 Comp = &MI;
11843 break;
11844 }
11845 return nullptr;
11846 }
11847
11848 if (CompCounterOprNum == 0) {
11849 if (isDefinedOutside(Comp->getOperand(1).getReg(), LoopBB))
11850 CompCounterOprNum = 2;
11851 else if (isDefinedOutside(Comp->getOperand(2).getReg(), LoopBB))
11852 CompCounterOprNum = 1;
11853 else
11854 return nullptr;
11855 }
11856 break;
11857 }
11858 }
11859 if (!Comp)
11860 return nullptr;
11861
11862 MachineInstr *Update = nullptr;
11863 Register Init;
11864 bool IsUpdatePriorComp;
11865 unsigned UpdateCounterOprNum;
11866 if (!getIndVarInfo(Comp->getOperand(CompCounterOprNum).getReg(), LoopBB,
11867 Update, UpdateCounterOprNum, Init, IsUpdatePriorComp))
11868 return nullptr;
11869
11870 return std::make_unique<AArch64PipelinerLoopInfo>(
11871 LoopBB, CondBranch, Comp, CompCounterOprNum, Update, UpdateCounterOprNum,
11872 Init, IsUpdatePriorComp, Cond);
11873}
11874
11875/// verifyInstruction - Perform target specific instruction verification.
11876bool AArch64InstrInfo::verifyInstruction(const MachineInstr &MI,
11877 StringRef &ErrInfo) const {
11878 // Verify that immediate offsets on load/store instructions are within range.
11879 // Stack objects with an FI operand are excluded as they can be fixed up
11880 // during PEI.
11881 TypeSize Scale(0U, false), Width(0U, false);
11882 int64_t MinOffset, MaxOffset;
11883 if (getMemOpInfo(MI.getOpcode(), Scale, Width, MinOffset, MaxOffset)) {
11884 unsigned ImmIdx = getLoadStoreImmIdx(MI.getOpcode());
11885 if (MI.getOperand(ImmIdx).isImm() && !MI.getOperand(ImmIdx - 1).isFI()) {
11886 int64_t Imm = MI.getOperand(ImmIdx).getImm();
11887 if (Imm < MinOffset || Imm > MaxOffset) {
11888 ErrInfo = "Unexpected immediate on load/store instruction";
11889 return false;
11890 }
11891 }
11892 }
11893
11894 const MCInstrDesc &MCID = MI.getDesc();
11895 for (unsigned Op = 0; Op < MCID.getNumOperands(); Op++) {
11896 const MachineOperand &MO = MI.getOperand(Op);
11897 switch (MCID.operands()[Op].OperandType) {
11898 case AArch64::OPERAND_IMPLICIT_IMM_0:
11899 if (!MO.isImm() || MO.getImm() != 0) {
11900 ErrInfo = "OPERAND_IMPLICIT_IMM_0 should be 0";
11901 return false;
11902 }
11903 break;
11904 case AArch64::OPERAND_SHIFT_MSL:
11905 if (!MO.isImm() ||
11906 AArch64_AM::getShiftType(MO.getImm()) != AArch64_AM::MSL ||
11907 (AArch64_AM::getShiftValue(MO.getImm()) != 8 &&
11908 AArch64_AM::getShiftValue(MO.getImm()) != 16)) {
11909 ErrInfo = "OPERAND_SHIFT_MSL should be msl shift of 8 or 16";
11910 return false;
11911 }
11912 break;
11913 default:
11914 break;
11915 }
11916 }
11917 return true;
11918}
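// Example (illustrative): LDRXui scales its immediate by 8 and accepts
// [0, 4095], so an LDRXui with immediate operand 4096 (a 32768-byte
// offset) would be rejected here with the load/store range error above.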
11919
11920#define GET_INSTRINFO_HELPERS
11921#define GET_INSTRMAP_INFO
11922#include "AArch64GenInstrInfo.inc"
MachineInstrBuilder & UseMI
MachineInstrBuilder MachineInstrBuilder & DefMI
static bool forwardCopyWillClobberTuple(unsigned DestReg, unsigned SrcReg, unsigned NumRegs)
static cl::opt< unsigned > BCCDisplacementBits("aarch64-bcc-offset-bits", cl::Hidden, cl::init(19), cl::desc("Restrict range of Bcc instructions (DEBUG)"))
static Register genNeg(MachineFunction &MF, MachineRegisterInfo &MRI, const TargetInstrInfo *TII, MachineInstr &Root, SmallVectorImpl< MachineInstr * > &InsInstrs, DenseMap< Register, unsigned > &InstrIdxForVirtReg, unsigned MnegOpc, const TargetRegisterClass *RC)
genNeg - Helper to generate an intermediate negation of the second operand of Root
static bool isFrameStoreOpcode(int Opcode)
static cl::opt< unsigned > GatherOptSearchLimit("aarch64-search-limit", cl::Hidden, cl::init(2048), cl::desc("Restrict range of instructions to search for the " "machine-combiner gather pattern optimization"))
static bool getMaddPatterns(MachineInstr &Root, SmallVectorImpl< unsigned > &Patterns)
Find instructions that can be turned into madd.
static AArch64CC::CondCode findCondCodeUsedByInstr(const MachineInstr &Instr)
Find a condition code used by the instruction.
static MachineInstr * genFusedMultiplyAcc(MachineFunction &MF, MachineRegisterInfo &MRI, const TargetInstrInfo *TII, MachineInstr &Root, SmallVectorImpl< MachineInstr * > &InsInstrs, unsigned IdxMulOpd, unsigned MaddOpc, const TargetRegisterClass *RC)
genFusedMultiplyAcc - Helper to generate fused multiply accumulate instructions.
static MachineInstr * genFusedMultiplyAccNeg(MachineFunction &MF, MachineRegisterInfo &MRI, const TargetInstrInfo *TII, MachineInstr &Root, SmallVectorImpl< MachineInstr * > &InsInstrs, DenseMap< Register, unsigned > &InstrIdxForVirtReg, unsigned IdxMulOpd, unsigned MaddOpc, unsigned MnegOpc, const TargetRegisterClass *RC)
genFusedMultiplyAccNeg - Helper to generate fused multiply accumulate instructions with an additional...
static bool isCombineInstrCandidate64(unsigned Opc)
static bool isFrameLoadOpcode(int Opcode)
static unsigned removeCopies(const MachineRegisterInfo &MRI, unsigned VReg)
static bool areCFlagsAccessedBetweenInstrs(MachineBasicBlock::iterator From, MachineBasicBlock::iterator To, const TargetRegisterInfo *TRI, const AccessKind AccessToCheck=AK_All)
True when condition flags are accessed (either by writing or reading) on the instruction trace starti...
static bool getFMAPatterns(MachineInstr &Root, SmallVectorImpl< unsigned > &Patterns)
Floating-Point Support.
static bool isADDSRegImm(unsigned Opcode)
static bool isCheapCopy(const MachineInstr &MI, const AArch64RegisterInfo &RI)
static bool isANDOpcode(MachineInstr &MI)
static void appendOffsetComment(int NumBytes, llvm::raw_string_ostream &Comment, StringRef RegScale={})
static unsigned sForm(MachineInstr &Instr)
Get opcode of S version of Instr.
static bool isCombineInstrSettingFlag(unsigned Opc)
static bool getFNEGPatterns(MachineInstr &Root, SmallVectorImpl< unsigned > &Patterns)
static bool getIndVarInfo(Register Reg, const MachineBasicBlock *LoopBB, MachineInstr *&UpdateInst, unsigned &UpdateCounterOprNum, Register &InitReg, bool &IsUpdatePriorComp)
If Reg is an induction variable, return true and set some parameters.
static bool canPairLdStOpc(unsigned FirstOpc, unsigned SecondOpc)
static bool mustAvoidNeonAtMBBI(const AArch64Subtarget &Subtarget, MachineBasicBlock &MBB, MachineBasicBlock::iterator I)
Returns true if in a streaming call site region without SME-FA64.
static bool isPostIndexLdStOpcode(unsigned Opcode)
Return true if the opcode is a post-index ld/st instruction, which really loads from base+0.
static std::optional< unsigned > getLFIInstSizeInBytes(const MachineInstr &MI)
Return the maximum number of bytes of code the specified instruction may be after LFI rewriting.
static unsigned getBranchDisplacementBits(unsigned Opc)
static cl::opt< unsigned > CBDisplacementBits("aarch64-cb-offset-bits", cl::Hidden, cl::init(9), cl::desc("Restrict range of CB instructions (DEBUG)"))
static std::optional< ParamLoadedValue > describeORRLoadedValue(const MachineInstr &MI, Register DescribedReg, const TargetInstrInfo *TII, const TargetRegisterInfo *TRI)
If the given ORR instruction is a copy, and DescribedReg overlaps with the destination register then,...
static bool getFMULPatterns(MachineInstr &Root, SmallVectorImpl< unsigned > &Patterns)
static void appendReadRegExpr(SmallVectorImpl< char > &Expr, unsigned RegNum)
static MachineInstr * genMaddR(MachineFunction &MF, MachineRegisterInfo &MRI, const TargetInstrInfo *TII, MachineInstr &Root, SmallVectorImpl< MachineInstr * > &InsInstrs, unsigned IdxMulOpd, unsigned MaddOpc, unsigned VR, const TargetRegisterClass *RC)
genMaddR - Generate madd instruction and combine mul and add using an extra virtual register Example ...
static Register cloneInstr(const MachineInstr *MI, unsigned ReplaceOprNum, Register ReplaceReg, MachineBasicBlock &MBB, MachineBasicBlock::iterator InsertTo)
Clone an instruction from MI.
static bool scaleOffset(unsigned Opc, int64_t &Offset)
static bool canCombineWithFMUL(MachineBasicBlock &MBB, MachineOperand &MO, unsigned MulOpc)
unsigned scaledOffsetOpcode(unsigned Opcode, unsigned &Scale)
static MachineInstr * genFusedMultiplyIdx(MachineFunction &MF, MachineRegisterInfo &MRI, const TargetInstrInfo *TII, MachineInstr &Root, SmallVectorImpl< MachineInstr * > &InsInstrs, unsigned IdxMulOpd, unsigned MaddOpc, const TargetRegisterClass *RC)
genFusedMultiplyIdx - Helper to generate fused multiply accumulate instructions.
static MachineInstr * genIndexedMultiply(MachineInstr &Root, SmallVectorImpl< MachineInstr * > &InsInstrs, unsigned IdxDupOp, unsigned MulOpc, const TargetRegisterClass *RC, MachineRegisterInfo &MRI)
Fold (FMUL x (DUP y lane)) into (FMUL_indexed x y lane)
static bool isSUBSRegImm(unsigned Opcode)
static bool UpdateOperandRegClass(MachineInstr &Instr)
static const TargetRegisterClass * getRegClass(const MachineInstr &MI, Register Reg)
static bool isInStreamingCallSiteRegion(MachineBasicBlock &MBB, MachineBasicBlock::iterator I)
Returns true if the instruction at I is in a streaming call site region, within a single basic block.
static bool canCmpInstrBeRemoved(MachineInstr &MI, MachineInstr &CmpInstr, int CmpValue, const TargetRegisterInfo &TRI, SmallVectorImpl< MachineInstr * > &CCUseInstrs, bool &IsInvertCC)
unsigned unscaledOffsetOpcode(unsigned Opcode)
static bool getLoadPatterns(MachineInstr &Root, SmallVectorImpl< unsigned > &Patterns)
Search for patterns of LD instructions we can optimize.
static bool canInstrSubstituteCmpInstr(MachineInstr &MI, MachineInstr &CmpInstr, const TargetRegisterInfo &TRI)
Check if CmpInstr can be substituted by MI.
static UsedNZCV getUsedNZCV(AArch64CC::CondCode CC)
static bool isCombineInstrCandidateFP(const MachineInstr &Inst)
static void appendLoadRegExpr(SmallVectorImpl< char > &Expr, int64_t OffsetFromDefCFA)
static void appendConstantExpr(SmallVectorImpl< char > &Expr, int64_t Constant, dwarf::LocationAtom Operation)
static unsigned convertToNonFlagSettingOpc(const MachineInstr &MI)
Return the opcode that does not set flags when possible - otherwise return the original opcode.
static bool outliningCandidatesV8_3OpsConsensus(const outliner::Candidate &a, const outliner::Candidate &b)
static bool isCombineInstrCandidate32(unsigned Opc)
static void parseCondBranch(MachineInstr *LastInst, MachineBasicBlock *&Target, SmallVectorImpl< MachineOperand > &Cond)
static unsigned offsetExtendOpcode(unsigned Opcode)
MachineOutlinerMBBFlags
@ LRUnavailableSomewhere
@ UnsafeRegsDead
static void loadRegPairFromStackSlot(const TargetRegisterInfo &TRI, MachineBasicBlock &MBB, MachineBasicBlock::iterator InsertBefore, const MCInstrDesc &MCID, Register DestReg, unsigned SubIdx0, unsigned SubIdx1, int FI, MachineMemOperand *MMO)
static void generateGatherLanePattern(MachineInstr &Root, SmallVectorImpl< MachineInstr * > &InsInstrs, SmallVectorImpl< MachineInstr * > &DelInstrs, DenseMap< Register, unsigned > &InstrIdxForVirtReg, unsigned Pattern, unsigned NumLanes)
Generate optimized instruction sequence for gather load patterns to improve Memory-Level Parallelism ...
static bool getMiscPatterns(MachineInstr &Root, SmallVectorImpl< unsigned > &Patterns)
Find other MI combine patterns.
static bool outliningCandidatesSigningKeyConsensus(const outliner::Candidate &a, const outliner::Candidate &b)
static const MachineInstrBuilder & AddSubReg(const MachineInstrBuilder &MIB, MCRegister Reg, unsigned SubIdx, RegState State, const TargetRegisterInfo *TRI)
static bool outliningCandidatesSigningScopeConsensus(const outliner::Candidate &a, const outliner::Candidate &b)
static bool shouldClusterFI(const MachineFrameInfo &MFI, int FI1, int64_t Offset1, unsigned Opcode1, int FI2, int64_t Offset2, unsigned Opcode2)
static cl::opt< unsigned > TBZDisplacementBits("aarch64-tbz-offset-bits", cl::Hidden, cl::init(14), cl::desc("Restrict range of TB[N]Z instructions (DEBUG)"))
static void extractPhiReg(const MachineInstr &Phi, const MachineBasicBlock *MBB, Register &RegMBB, Register &RegOther)
static MCCFIInstruction createDefCFAExpression(const TargetRegisterInfo &TRI, unsigned Reg, const StackOffset &Offset)
static bool isDefinedOutside(Register Reg, const MachineBasicBlock *BB)
static MachineInstr * genFusedMultiply(MachineFunction &MF, MachineRegisterInfo &MRI, const TargetInstrInfo *TII, MachineInstr &Root, SmallVectorImpl< MachineInstr * > &InsInstrs, unsigned IdxMulOpd, unsigned MaddOpc, const TargetRegisterClass *RC, FMAInstKind kind=FMAInstKind::Default, const Register *ReplacedAddend=nullptr)
genFusedMultiply - Generate fused multiply instructions.
static bool getGatherLanePattern(MachineInstr &Root, SmallVectorImpl< unsigned > &Patterns, unsigned LoadLaneOpCode, unsigned NumLanes)
Check if the given instruction forms a gather load pattern that can be optimized for better Memory-Le...
static MachineInstr * genFusedMultiplyIdxNeg(MachineFunction &MF, MachineRegisterInfo &MRI, const TargetInstrInfo *TII, MachineInstr &Root, SmallVectorImpl< MachineInstr * > &InsInstrs, DenseMap< Register, unsigned > &InstrIdxForVirtReg, unsigned IdxMulOpd, unsigned MaddOpc, unsigned MnegOpc, const TargetRegisterClass *RC)
genFusedMultiplyIdxNeg - Helper to generate fused multiply accumulate instructions with an additional...
static bool isCombineInstrCandidate(unsigned Opc)
static unsigned regOffsetOpcode(unsigned Opcode)
MachineOutlinerClass
Constants defining how certain sequences should be outlined.
@ MachineOutlinerTailCall
Only emit a branch.
@ MachineOutlinerRegSave
Same as default, but save to a register.
@ MachineOutlinerNoLRSave
Emit a call and return.
@ MachineOutlinerThunk
Emit a call and tail-call.
@ MachineOutlinerDefault
Emit a save, restore, call, and return.
static cl::opt< unsigned > BDisplacementBits("aarch64-b-offset-bits", cl::Hidden, cl::init(26), cl::desc("Restrict range of B instructions (DEBUG)"))
static bool areCFlagsAliveInSuccessors(const MachineBasicBlock *MBB)
Check if AArch64::NZCV should be alive in successors of MBB.
static void emitFrameOffsetAdj(MachineBasicBlock &MBB, MachineBasicBlock::iterator MBBI, const DebugLoc &DL, unsigned DestReg, unsigned SrcReg, int64_t Offset, unsigned Opc, const TargetInstrInfo *TII, MachineInstr::MIFlag Flag, bool NeedsWinCFI, bool *HasWinCFI, bool EmitCFAOffset, StackOffset CFAOffset, unsigned FrameReg)
static bool isCheapImmediate(const MachineInstr &MI, unsigned BitSize)
static cl::opt< unsigned > CBZDisplacementBits("aarch64-cbz-offset-bits", cl::Hidden, cl::init(19), cl::desc("Restrict range of CB[N]Z instructions (DEBUG)"))
static void genSubAdd2SubSub(MachineFunction &MF, MachineRegisterInfo &MRI, const TargetInstrInfo *TII, MachineInstr &Root, SmallVectorImpl< MachineInstr * > &InsInstrs, SmallVectorImpl< MachineInstr * > &DelInstrs, unsigned IdxOpd1, DenseMap< Register, unsigned > &InstrIdxForVirtReg)
Do the following transformation: A - (B + C) ==> (A - B) - C, or A - (B + C) ==> (A - C) - B.
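A source-level illustration of the intent (a sketch, not code from this file): when B + C sits on the critical path, reassociating lets A - B issue before C is available.

    #include <cstdint>

    // Original: the outer subtraction must wait on (B + C).
    int64_t before(int64_t A, int64_t B, int64_t C) { return A - (B + C); }

    // Reassociated: A - B and the final subtraction form a shorter
    // dependency chain when C is produced late.
    int64_t after(int64_t A, int64_t B, int64_t C) { return (A - B) - C; }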
static unsigned canFoldIntoCSel(const MachineRegisterInfo &MRI, unsigned VReg, unsigned *NewReg=nullptr)
static void signOutlinedFunction(MachineFunction &MF, MachineBasicBlock &MBB, const AArch64InstrInfo *TII, bool ShouldSignReturnAddr)
static MachineInstr * genFNegatedMAD(MachineFunction &MF, MachineRegisterInfo &MRI, const TargetInstrInfo *TII, MachineInstr &Root, SmallVectorImpl< MachineInstr * > &InsInstrs)
static bool canCombineWithMUL(MachineBasicBlock &MBB, MachineOperand &MO, unsigned MulOpc, unsigned ZeroReg)
static void storeRegPairToStackSlot(const TargetRegisterInfo &TRI, MachineBasicBlock &MBB, MachineBasicBlock::iterator InsertBefore, const MCInstrDesc &MCID, Register SrcReg, bool IsKill, unsigned SubIdx0, unsigned SubIdx1, int FI, MachineMemOperand *MMO)
assert(UImm && (UImm != ~static_cast<T>(0)) && "Invalid immediate!")
Module.h This file contains the declarations for the Module class.
This file implements the LivePhysRegs utility for tracking liveness of physical registers.
This file declares the machine register scavenger class.
This file contains some templates that are useful if you are working with the STL at all.
This file defines the SmallSet class.
This file defines the SmallVector class.
This file defines the 'Statistic' class, which is designed to be an easy way to expose various metric...
#define STATISTIC(VARNAME, DESC)
Definition Statistic.h:171
#define DEBUG_WITH_TYPE(TYPE,...)
DEBUG_WITH_TYPE macro - This macro should be used by passes to emit debug information.
Definition Debug.h:72
static bool canCombine(MachineBasicBlock &MBB, MachineOperand &MO, unsigned CombineOpc=0)
AArch64FunctionInfo - This class is derived from MachineFunctionInfo and contains private AArch64-spe...
SignReturnAddress getSignReturnAddressCondition() const
void setOutliningStyle(const std::string &Style)
std::optional< bool > hasRedZone() const
static bool shouldSignReturnAddress(SignReturnAddress Condition, bool IsLRSpilled)
static bool isHForm(const MachineInstr &MI)
Returns whether the instruction is in H form (16 bit operands)
void insertSelect(MachineBasicBlock &MBB, MachineBasicBlock::iterator MI, const DebugLoc &DL, Register DstReg, ArrayRef< MachineOperand > Cond, Register TrueReg, Register FalseReg) const override
static bool hasBTISemantics(const MachineInstr &MI)
Returns whether the instruction can be compatible with non-zero BTYPE.
static bool isQForm(const MachineInstr &MI)
Returns whether the instruction is in Q form (128 bit operands)
static bool getMemOpInfo(unsigned Opcode, TypeSize &Scale, TypeSize &Width, int64_t &MinOffset, int64_t &MaxOffset)
Returns true if Opcode is a memory operation.
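A hedged usage sketch: query the scaled-offset limits of an opcode (here LDRXui, chosen purely for illustration) to decide whether an immediate offset is directly encodable.

    #include "AArch64InstrInfo.h"
    #include "llvm/Support/TypeSize.h"
    using namespace llvm;

    static bool fitsLdrXImm(int64_t Offset) {
      TypeSize Scale = TypeSize::getFixed(0), Width = TypeSize::getFixed(0);
      int64_t MinOffset = 0, MaxOffset = 0;
      if (!AArch64InstrInfo::getMemOpInfo(AArch64::LDRXui, Scale, Width,
                                          MinOffset, MaxOffset))
        return false;
      // Scaled forms encode Offset / Scale; it must divide evenly and
      // fall within [MinOffset, MaxOffset].
      int64_t S = Scale.getKnownMinValue();
      return Offset % S == 0 && Offset / S >= MinOffset &&
             Offset / S <= MaxOffset;
    }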
static bool isTailCallReturnInst(const MachineInstr &MI)
Returns true if MI is one of the TCRETURN* instructions.
static bool isFPRCopy(const MachineInstr &MI)
Does this instruction rename an FPR without modifying bits?
MachineInstr * emitLdStWithAddr(MachineInstr &MemI, const ExtAddrMode &AM) const override
std::optional< DestSourcePair > isCopyInstrImpl(const MachineInstr &MI) const override
If the specified machine instruction is an instruction that moves/copies value from one register to an...
MachineBasicBlock * getBranchDestBlock(const MachineInstr &MI) const override
unsigned getInstSizeInBytes(const MachineInstr &MI) const override
GetInstSize - Return the number of bytes of code the specified instruction may be.
static bool isZExtLoad(const MachineInstr &MI)
Returns whether the instruction is a zero-extending load.
bool areMemAccessesTriviallyDisjoint(const MachineInstr &MIa, const MachineInstr &MIb) const override
void copyPhysReg(MachineBasicBlock &MBB, MachineBasicBlock::iterator I, const DebugLoc &DL, Register DestReg, Register SrcReg, bool KillSrc, bool RenamableDest=false, bool RenamableSrc=false) const override
static bool isGPRCopy(const MachineInstr &MI)
Does this instruction rename a GPR without modifying bits?
static unsigned convertToFlagSettingOpc(unsigned Opc)
Return the opcode that sets flags when possible.
void createPauthEpilogueInstr(MachineBasicBlock &MBB, DebugLoc DL) const
bool getMachineCombinerPatterns(MachineInstr &Root, SmallVectorImpl< unsigned > &Patterns, bool DoRegPressureReduce) const override
Return true when there is potentially a faster code sequence for an instruction chain ending in Root.
bool isBranchOffsetInRange(unsigned BranchOpc, int64_t BrOffset) const override
bool canInsertSelect(const MachineBasicBlock &, ArrayRef< MachineOperand > Cond, Register, Register, Register, int &, int &, int &) const override
Register isLoadFromStackSlotPostFE(const MachineInstr &MI, int &FrameIndex) const override
Check for post-frame ptr elimination stack locations as well.
static const MachineOperand & getLdStOffsetOp(const MachineInstr &MI)
Returns the immediate offset operand of a load/store.
bool isCoalescableExtInstr(const MachineInstr &MI, Register &SrcReg, Register &DstReg, unsigned &SubIdx) const override
static std::optional< unsigned > getUnscaledLdSt(unsigned Opc)
Returns the unscaled load/store for the scaled load/store opcode, if there is a corresponding unscale...
static bool hasUnscaledLdStOffset(unsigned Opc)
Return true if it has an unscaled load/store offset.
static const MachineOperand & getLdStAmountOp(const MachineInstr &MI)
Returns the shift amount operand of a load/store.
static bool isPreLdSt(const MachineInstr &MI)
Returns whether the instruction is a pre-indexed load/store.
std::optional< ExtAddrMode > getAddrModeFromMemoryOp(const MachineInstr &MemI, const TargetRegisterInfo *TRI) const override
bool getMemOperandsWithOffsetWidth(const MachineInstr &MI, SmallVectorImpl< const MachineOperand * > &BaseOps, int64_t &Offset, bool &OffsetIsScalable, LocationSize &Width, const TargetRegisterInfo *TRI) const override
bool analyzeBranchPredicate(MachineBasicBlock &MBB, MachineBranchPredicate &MBP, bool AllowModify) const override
void insertIndirectBranch(MachineBasicBlock &MBB, MachineBasicBlock &NewDestBB, MachineBasicBlock &RestoreBB, const DebugLoc &DL, int64_t BrOffset, RegScavenger *RS) const override
void loadRegFromStackSlot(MachineBasicBlock &MBB, MachineBasicBlock::iterator MBBI, Register DestReg, int FrameIndex, const TargetRegisterClass *RC, Register VReg, unsigned SubReg=0, MachineInstr::MIFlag Flags=MachineInstr::NoFlags) const override
static bool isPairableLdStInst(const MachineInstr &MI)
Return true if the given load or store may be paired with another.
void storeRegToStackSlot(MachineBasicBlock &MBB, MachineBasicBlock::iterator MBBI, Register SrcReg, bool isKill, int FrameIndex, const TargetRegisterClass *RC, Register VReg, MachineInstr::MIFlag Flags=MachineInstr::NoFlags) const override
static bool isSExtLoad(const MachineInstr &MI)
Returns whether the instruction is a sign-extending load.
const AArch64RegisterInfo & getRegisterInfo() const
getRegisterInfo - TargetInstrInfo is a superset of MRegister info.
static bool isPreSt(const MachineInstr &MI)
Returns whether the instruction is a pre-indexed store.
void insertNoop(MachineBasicBlock &MBB, MachineBasicBlock::iterator MI) const override
AArch64InstrInfo(const AArch64Subtarget &STI)
static bool isPairedLdSt(const MachineInstr &MI)
Returns whether the instruction is a paired load/store.
MachineInstr * foldMemoryOperandImpl(MachineFunction &MF, MachineInstr &MI, ArrayRef< unsigned > Ops, int FrameIndex, MachineInstr *&CopyMI, LiveIntervals *LIS=nullptr, VirtRegMap *VRM=nullptr) const override
bool getMemOperandWithOffsetWidth(const MachineInstr &MI, const MachineOperand *&BaseOp, int64_t &Offset, bool &OffsetIsScalable, TypeSize &Width, const TargetRegisterInfo *TRI) const
If OffsetIsScalable is set to 'true', the offset is scaled by vscale.
Register isLoadFromStackSlot(const MachineInstr &MI, int &FrameIndex) const override
bool reverseBranchCondition(SmallVectorImpl< MachineOperand > &Cond) const override
static bool isStridedAccess(const MachineInstr &MI)
Return true if the given load or store is a strided memory access.
bool shouldClusterMemOps(ArrayRef< const MachineOperand * > BaseOps1, int64_t Offset1, bool OffsetIsScalable1, ArrayRef< const MachineOperand * > BaseOps2, int64_t Offset2, bool OffsetIsScalable2, unsigned ClusterSize, unsigned NumBytes) const override
Detect opportunities for ldp/stp formation.
unsigned removeBranch(MachineBasicBlock &MBB, int *BytesRemoved=nullptr) const override
bool isThroughputPattern(unsigned Pattern) const override
Return true when a code sequence can improve throughput.
MachineOperand & getMemOpBaseRegImmOfsOffsetOperand(MachineInstr &LdSt) const
Return the immediate offset of the base register in a load/store LdSt.
bool analyzeBranch(MachineBasicBlock &MBB, MachineBasicBlock *&TBB, MachineBasicBlock *&FBB, SmallVectorImpl< MachineOperand > &Cond, bool AllowModify=false) const override
bool canFoldIntoAddrMode(const MachineInstr &MemI, Register Reg, const MachineInstr &AddrI, ExtAddrMode &AM) const override
static bool isLdStPairSuppressed(const MachineInstr &MI)
Return true if pairing the given load or store is hinted to be unprofitable.
Register isStoreToStackSlotPostFE(const MachineInstr &MI, int &FrameIndex) const override
Check for post-frame ptr elimination stack locations as well.
std::unique_ptr< TargetInstrInfo::PipelinerLoopInfo > analyzeLoopForPipelining(MachineBasicBlock *LoopBB) const override
bool isSchedulingBoundary(const MachineInstr &MI, const MachineBasicBlock *MBB, const MachineFunction &MF) const override
unsigned insertBranch(MachineBasicBlock &MBB, MachineBasicBlock *TBB, MachineBasicBlock *FBB, ArrayRef< MachineOperand > Cond, const DebugLoc &DL, int *BytesAdded=nullptr) const override
bool optimizeCompareInstr(MachineInstr &CmpInstr, Register SrcReg, Register SrcReg2, int64_t CmpMask, int64_t CmpValue, const MachineRegisterInfo *MRI) const override
optimizeCompareInstr - Convert the instruction supplying the argument to the comparison into one that...
static unsigned getLoadStoreImmIdx(unsigned Opc)
Returns the index for the immediate for a given instruction.
static bool isGPRZero(const MachineInstr &MI)
Does this instruction set its full destination register to zero?
void copyGPRRegTuple(MachineBasicBlock &MBB, MachineBasicBlock::iterator I, const DebugLoc &DL, MCRegister DestReg, MCRegister SrcReg, bool KillSrc, unsigned Opcode, unsigned ZeroReg, llvm::ArrayRef< unsigned > Indices) const
bool analyzeCompare(const MachineInstr &MI, Register &SrcReg, Register &SrcReg2, int64_t &CmpMask, int64_t &CmpValue) const override
analyzeCompare - For a comparison instruction, return the source registers in SrcReg and SrcReg2,...
CombinerObjective getCombinerObjective(unsigned Pattern) const override
static bool isFpOrNEON(Register Reg)
Returns whether the physical register is FP or NEON.
bool isAsCheapAsAMove(const MachineInstr &MI) const override
std::optional< DestSourcePair > isCopyLikeInstrImpl(const MachineInstr &MI) const override
static void suppressLdStPair(MachineInstr &MI)
Hint that pairing the given load or store is unprofitable.
Register isStoreToStackSlot(const MachineInstr &MI, int &FrameIndex) const override
static bool isPreLd(const MachineInstr &MI)
Returns whether the instruction is a pre-indexed load.
void copyPhysRegTuple(MachineBasicBlock &MBB, MachineBasicBlock::iterator I, const DebugLoc &DL, MCRegister DestReg, MCRegister SrcReg, bool KillSrc, unsigned Opcode, llvm::ArrayRef< unsigned > Indices) const
bool optimizeCondBranch(MachineInstr &MI) const override
Replace a csinc-branch sequence by a simple conditional branch.
static int getMemScale(unsigned Opc)
Scaling factor for (scaled or unscaled) load or store.
bool isCandidateToMergeOrPair(const MachineInstr &MI) const
Return true if this is a load/store that can be potentially paired/merged.
MCInst getNop() const override
static const MachineOperand & getLdStBaseOp(const MachineInstr &MI)
Returns the base register operand of a load/store.
bool isReservedReg(const MachineFunction &MF, MCRegister Reg) const
const AArch64RegisterInfo * getRegisterInfo() const override
bool isNeonAvailable() const
Returns true if the target has NEON and the function at runtime is known to have NEON enabled (e....
bool isSVEorStreamingSVEAvailable() const
Returns true if the target has access to either the full range of SVE instructions,...
Represent a constant reference to an array (0 or more elements consecutively in memory),...
Definition ArrayRef.h:40
const T & front() const
Get the first element.
Definition ArrayRef.h:144
size_t size() const
Get the array size.
Definition ArrayRef.h:141
This is an important base class in LLVM.
Definition Constant.h:43
A debug info location.
Definition DebugLoc.h:123
bool empty() const
Definition DenseMap.h:109
std::pair< iterator, bool > insert(const std::pair< KeyT, ValueT > &KV)
Definition DenseMap.h:239
bool hasOptSize() const
Optimize this function for size (-Os) or minimum size (-Oz).
Definition Function.h:714
bool hasMinSize() const
Optimize this function for minimum size (-Oz).
Definition Function.h:711
Module * getParent()
Get the module that this global value is contained inside of...
LLVM_ABI InstListType::iterator eraseFromParent()
This method unlinks 'this' from the containing basic block and deletes it.
unsigned getOpcode() const
Returns a member of one of the enums like Instruction::Add.
static LocationSize precise(uint64_t Value)
This class is intended to be used as a base class for asm properties and features specific to the tar...
Definition MCAsmInfo.h:66
bool usesWindowsCFI() const
Definition MCAsmInfo.h:678
static MCCFIInstruction cfiDefCfa(MCSymbol *L, unsigned Register, int64_t Offset, SMLoc Loc={})
.cfi_def_cfa defines a rule for computing CFA as: take address from Register and add Offset to it.
Definition MCDwarf.h:615
static MCCFIInstruction createOffset(MCSymbol *L, unsigned Register, int64_t Offset, SMLoc Loc={})
.cfi_offset Previous value of Register is saved at offset Offset from CFA.
Definition MCDwarf.h:657
static MCCFIInstruction cfiDefCfaOffset(MCSymbol *L, int64_t Offset, SMLoc Loc={})
.cfi_def_cfa_offset modifies a rule for computing CFA.
Definition MCDwarf.h:630
static MCCFIInstruction createEscape(MCSymbol *L, StringRef Vals, SMLoc Loc={}, StringRef Comment="")
.cfi_escape Allows the user to add arbitrary bytes to the unwind info.
Definition MCDwarf.h:727
Instances of this class represent a single low-level machine instruction.
Definition MCInst.h:188
Describe properties that are true of each instruction in the target description file.
unsigned getNumOperands() const
Return the number of declared MachineOperands for this MachineInstr.
ArrayRef< MCOperandInfo > operands() const
Wrapper class representing physical registers. Should be passed by value.
Definition MCRegister.h:41
constexpr bool isValid() const
Definition MCRegister.h:84
static constexpr unsigned NoRegister
Definition MCRegister.h:60
static MDTuple * get(LLVMContext &Context, ArrayRef< Metadata * > MDs)
Definition Metadata.h:1572
Set of metadata that should be preserved when using BuildMI().
bool isInlineAsmBrIndirectTarget() const
Returns true if this is the indirect dest of an INLINEASM_BR.
LLVM_ABI void transferSuccessorsAndUpdatePHIs(MachineBasicBlock *FromMBB)
Transfers all the successors, as in transferSuccessors, and update PHI operands in the successor bloc...
LLVM_ABI instr_iterator insert(instr_iterator I, MachineInstr *M)
Insert MI into the instruction list before I, possibly inside a bundle.
reverse_instr_iterator instr_rbegin()
LLVM_ABI iterator getFirstTerminator()
Returns an iterator to the first terminator instruction of this basic block.
LLVM_ABI void addSuccessor(MachineBasicBlock *Succ, BranchProbability Prob=BranchProbability::getUnknown())
Add Succ as a successor of this MachineBasicBlock.
reverse_instr_iterator instr_rend()
Instructions::iterator instr_iterator
void addLiveIn(MCRegister PhysReg, LaneBitmask LaneMask=LaneBitmask::getAll())
Adds the specified register as a live in.
const MachineFunction * getParent() const
Return the MachineFunction containing this basic block.
LLVM_ABI instr_iterator erase(instr_iterator I)
Remove an instruction from the instruction list and delete it.
void splice(iterator Where, MachineBasicBlock *Other, iterator From)
Take an instruction from MBB 'Other' at the position From, and insert it into this MBB right before '...
MachineInstrBundleIterator< MachineInstr > iterator
void setMachineBlockAddressTaken()
Set this block to indicate that its address is used as something other than the target of a terminato...
LLVM_ABI bool isLiveIn(MCRegister Reg, LaneBitmask LaneMask=LaneBitmask::getAll()) const
Return true if the specified register is in the live in set.
The MachineFrameInfo class represents an abstract stack frame until prolog/epilog code is inserted.
uint64_t getStackSize() const
Return the number of bytes that must be allocated to hold all of the fixed size frame objects.
void setStackID(int ObjectIdx, uint8_t ID)
bool isCalleeSavedInfoValid() const
Has the callee saved info been calculated yet?
Align getObjectAlign(int ObjectIdx) const
Return the alignment of the specified stack object.
int64_t getObjectSize(int ObjectIdx) const
Return the size of the specified object.
unsigned getNumObjects() const
Return the number of objects.
int64_t getObjectOffset(int ObjectIdx) const
Return the assigned stack offset of the specified object from the incoming stack pointer.
bool isFixedObjectIndex(int ObjectIdx) const
Returns true if the specified index corresponds to a fixed stack object.
unsigned addFrameInst(const MCCFIInstruction &Inst)
const TargetSubtargetInfo & getSubtarget() const
getSubtarget - Return the subtarget for which this machine code is being compiled.
StringRef getName() const
getName - Return the name of the corresponding LLVM function.
MachineMemOperand * getMachineMemOperand(MachinePointerInfo PtrInfo, MachineMemOperand::Flags f, LLT MemTy, Align base_alignment, const AAMDNodes &AAInfo=AAMDNodes(), const MDNode *Ranges=nullptr, SyncScope::ID SSID=SyncScope::System, AtomicOrdering Ordering=AtomicOrdering::NotAtomic, AtomicOrdering FailureOrdering=AtomicOrdering::NotAtomic)
getMachineMemOperand - Allocate a new MachineMemOperand.
MachineFrameInfo & getFrameInfo()
getFrameInfo - Return the frame info object for the current function.
MachineRegisterInfo & getRegInfo()
getRegInfo - Return information about the registers currently in use.
Function & getFunction()
Return the LLVM function that this machine code represents.
BasicBlockListType::iterator iterator
Ty * getInfo()
getInfo - Keep track of various per-function pieces of information for backends that would like to do...
const MachineJumpTableInfo * getJumpTableInfo() const
getJumpTableInfo - Return the jump table info object for the current function.
MachineBasicBlock * CreateMachineBasicBlock(const BasicBlock *BB=nullptr, std::optional< UniqueBBID > BBID=std::nullopt)
CreateMachineBasicBlock - Allocate a new MachineBasicBlock.
void insert(iterator MBBI, MachineBasicBlock *MBB)
const TargetMachine & getTarget() const
getTarget - Return the target machine this machine code is compiled with
const MachineInstrBuilder & setMemRefs(ArrayRef< MachineMemOperand * > MMOs) const
const MachineInstrBuilder & addCFIIndex(unsigned CFIIndex) const
const MachineInstrBuilder & addUse(Register RegNo, RegState Flags={}, unsigned SubReg=0) const
Add a virtual register use operand.
const MachineInstrBuilder & addReg(Register RegNo, RegState Flags={}, unsigned SubReg=0) const
Add a new virtual register operand.
const MachineInstrBuilder & setMIFlag(MachineInstr::MIFlag Flag) const
const MachineInstrBuilder & addImm(int64_t Val) const
Add a new immediate operand.
const MachineInstrBuilder & add(const MachineOperand &MO) const
const MachineInstrBuilder & addSym(MCSymbol *Sym, unsigned char TargetFlags=0) const
const MachineInstrBuilder & addFrameIndex(int Idx) const
const MachineInstrBuilder & addGlobalAddress(const GlobalValue *GV, int64_t Offset=0, unsigned TargetFlags=0) const
const MachineInstrBuilder & addMBB(MachineBasicBlock *MBB, unsigned TargetFlags=0) const
const MachineInstrBuilder & addDef(Register RegNo, RegState Flags={}, unsigned SubReg=0) const
Add a virtual register definition operand.
const MachineInstrBuilder & setMIFlags(unsigned Flags) const
const MachineInstrBuilder & addMemOperand(MachineMemOperand *MMO) const
reverse_iterator getReverse() const
Get a reverse iterator to the same node.
Representation of each machine instruction.
unsigned getOpcode() const
Returns the opcode of this MachineInstr.
bool mayLoadOrStore(QueryType Type=AnyInBundle) const
Return true if this instruction could possibly read or modify memory.
bool isCopy() const
const MachineBasicBlock * getParent() const
bool isCall(QueryType Type=AnyInBundle) const
bool getFlag(MIFlag Flag) const
Return whether an MI flag is set.
LLVM_ABI uint32_t mergeFlagsWith(const MachineInstr &Other) const
Return the MIFlags which represent both MachineInstrs.
unsigned getNumOperands() const
Returns the total number of operands.
LLVM_ABI unsigned getNumExplicitOperands() const
Returns the number of non-implicit operands.
bool mayLoad(QueryType Type=AnyInBundle) const
Return true if this instruction could possibly read memory.
const MCInstrDesc & getDesc() const
Returns the target instruction descriptor of this MachineInstr.
LLVM_ABI bool hasUnmodeledSideEffects() const
Return true if this instruction has side effects that are not modeled by mayLoad / mayStore,...
bool registerDefIsDead(Register Reg, const TargetRegisterInfo *TRI) const
Returns true if the register is dead in this machine instruction.
bool definesRegister(Register Reg, const TargetRegisterInfo *TRI) const
Return true if the MachineInstr fully defines the specified register.
LLVM_ABI void setDesc(const MCInstrDesc &TID)
Replace the instruction descriptor (thus opcode) of the current instruction with a new one.
LLVM_ABI bool hasOrderedMemoryRef() const
Return true if this instruction may have an ordered or volatile memory reference, or if the informati...
LLVM_ABI const MachineFunction * getMF() const
Return the function that contains the basic block that this instruction belongs to.
ArrayRef< MachineMemOperand * > memoperands() const
Access to memory operands of the instruction.
LLVM_ABI bool isLoadFoldBarrier() const
Returns true if it is illegal to fold a load across this instruction.
const DebugLoc & getDebugLoc() const
Returns the debug location id of this MachineInstr.
LLVM_ABI void removeOperand(unsigned OpNo)
Erase an operand from an instruction, leaving it with one fewer operand than it started with.
LLVM_ABI void addRegisterDefined(Register Reg, const TargetRegisterInfo *RegInfo=nullptr)
We have determined MI defines a register.
const MachineOperand & getOperand(unsigned i) const
uint32_t getFlags() const
Return the MI flags bitvector.
LLVM_ABI int findRegisterDefOperandIdx(Register Reg, const TargetRegisterInfo *TRI, bool isDead=false, bool Overlap=false) const
Returns the operand index that is a def of the specified register or -1 if it is not found.
LLVM_ABI MachineInstrBundleIterator< MachineInstr > eraseFromParent()
Unlink 'this' from the containing basic block and delete it.
const std::vector< MachineJumpTableEntry > & getJumpTables() const
A description of a memory reference used in the backend.
@ MOVolatile
The memory access is volatile.
@ MOLoad
The memory access reads data.
@ MOStore
The memory access writes data.
This class contains meta information specific to a module.
LLVM_ABI MachineFunction * getMachineFunction(const Function &F) const
Returns the MachineFunction associated to IR function F if there is one, otherwise nullptr.
MachineOperand class - Representation of each machine instruction operand.
void setSubReg(unsigned subReg)
unsigned getSubReg() const
void setImm(int64_t immVal)
int64_t getImm() const
bool isReg() const
isReg - Tests if this is a MO_Register operand.
MachineBasicBlock * getMBB() const
void setIsDead(bool Val=true)
LLVM_ABI void setReg(Register Reg)
Change the register this operand corresponds to.
bool isImm() const
isImm - Tests if this is a MO_Immediate operand.
void setIsKill(bool Val=true)
MachineInstr * getParent()
getParent - Return the instruction that this operand belongs to.
unsigned getTargetFlags() const
static MachineOperand CreateImm(int64_t Val)
MachineOperandType getType() const
getType - Returns the MachineOperandType for this operand.
void setIsUndef(bool Val=true)
Register getReg() const
getReg - Returns the register number.
bool isFI() const
isFI - Tests if this is a MO_FrameIndex operand.
LLVM_ABI bool isIdenticalTo(const MachineOperand &Other) const
Returns true if this operand is identical to the specified operand except for liveness related flags ...
static MachineOperand CreateReg(Register Reg, bool isDef, bool isImp=false, bool isKill=false, bool isDead=false, bool isUndef=false, bool isEarlyClobber=false, unsigned SubReg=0, bool isDebug=false, bool isInternalRead=false, bool isRenamable=false)
MachineRegisterInfo - Keep track of information for virtual and physical registers,...
LLVM_ABI bool hasOneNonDBGUse(Register RegNo) const
hasOneNonDBGUse - Return true if there is exactly one non-Debug use of the specified register.
bool tracksLiveness() const
tracksLiveness - Returns true when tracking register liveness accurately.
const TargetRegisterClass * getRegClass(Register Reg) const
Return the register class of the specified virtual register.
LLVM_ABI void clearKillFlags(Register Reg) const
clearKillFlags - Iterate over all the uses of the given register and clear the kill flag from the Mac...
LLVM_ABI MachineInstr * getVRegDef(Register Reg) const
getVRegDef - Return the machine instr that defines the specified virtual register or null if none is ...
bool use_nodbg_empty(Register RegNo) const
use_nodbg_empty - Return true if there are no non-Debug instructions using the specified register.
LLVM_ABI Register createVirtualRegister(const TargetRegisterClass *RegClass, StringRef Name="")
createVirtualRegister - Create and return a new virtual register in the function with the specified r...
bool reservedRegsFrozen() const
reservedRegsFrozen - Returns true after freezeReservedRegs() was called to ensure the set of reserved...
bool def_empty(Register RegNo) const
def_empty - Return true if there are no instructions defining the specified register (it may be live-...
use_instr_nodbg_iterator use_instr_nodbg_begin(Register RegNo) const
bool hasOneDef(Register RegNo) const
Return true if there is exactly one operand defining the specified register.
const TargetRegisterClass * getRegClassOrNull(Register Reg) const
Return the register class of Reg, or null if Reg has not been assigned a register class yet.
LLVM_ABI const TargetRegisterClass * constrainRegClass(Register Reg, const TargetRegisterClass *RC, unsigned MinNumRegs=0)
constrainRegClass - Constrain the register class of the specified virtual register to be a common sub...
LLVM_ABI MachineInstr * getUniqueVRegDef(Register Reg) const
getUniqueVRegDef - Return the unique machine instr that defines the specified virtual register or nul...
A Module instance is used to store all the information related to an LLVM module.
Definition Module.h:67
MI-level patchpoint operands.
Definition StackMaps.h:77
uint32_t getNumPatchBytes() const
Return the number of patchable bytes the given patchpoint should emit.
Definition StackMaps.h:105
Wrapper class representing virtual and physical registers.
Definition Register.h:20
MCRegister asMCReg() const
Utility to check-convert this value to a MCRegister.
Definition Register.h:107
constexpr bool isValid() const
Definition Register.h:112
constexpr bool isVirtual() const
Return true if the specified register number is in the virtual register namespace.
Definition Register.h:79
static constexpr bool isVirtualRegister(unsigned Reg)
Return true if the specified register number is in the virtual register namespace.
Definition Register.h:66
constexpr bool isPhysical() const
Return true if the specified register number is in the physical register namespace.
Definition Register.h:83
Represents a location in source code.
Definition SMLoc.h:22
bool erase(PtrType Ptr)
Remove pointer from the set.
std::pair< iterator, bool > insert(PtrType Ptr)
Inserts Ptr if and only if there is no element in the container equal to Ptr.
SmallPtrSet - This class implements a set which is optimized for holding SmallSize or less elements.
SmallSet - This maintains a set of unique values, optimizing for the case when the set is small (less...
Definition SmallSet.h:134
bool empty() const
Definition SmallSet.h:169
bool erase(const T &V)
Definition SmallSet.h:200
SmallString - A SmallString is just a SmallVector with methods and accessors that make it work better...
Definition SmallString.h:26
void append(StringRef RHS)
Append from a StringRef.
Definition SmallString.h:68
StringRef str() const
Explicit conversion to StringRef.
This class consists of common code factored out of the SmallVector class to reduce code duplication b...
void push_back(const T &Elt)
This is a 'vector' (really, a variable-sized array), optimized for the case when the array is small.
MI-level stackmap operands.
Definition StackMaps.h:36
uint32_t getNumPatchBytes() const
Return the number of patchable bytes the given stackmap should emit.
Definition StackMaps.h:51
StackOffset holds a fixed and a scalable offset in bytes.
Definition TypeSize.h:30
int64_t getFixed() const
Returns the fixed component of the stack.
Definition TypeSize.h:46
int64_t getScalable() const
Returns the scalable component of the stack.
Definition TypeSize.h:49
static StackOffset get(int64_t Fixed, int64_t Scalable)
Definition TypeSize.h:41
static StackOffset getScalable(int64_t Scalable)
Definition TypeSize.h:40
static StackOffset getFixed(int64_t Fixed)
Definition TypeSize.h:39
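For example, a frame region holding 16 fixed bytes plus two 16-byte-granule SVE objects could be described as follows (illustrative only):

    #include "llvm/Support/TypeSize.h"
    using namespace llvm;

    StackOffset Off = StackOffset::get(16, 32); // fixed + scalable parts
    int64_t Fixed = Off.getFixed();             // 16 bytes
    int64_t Scalable = Off.getScalable();       // 32 bytes, times vscale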
MI-level Statepoint operands.
Definition StackMaps.h:159
uint32_t getNumPatchBytes() const
Return the number of patchable bytes the given statepoint should emit.
Definition StackMaps.h:208
Represent a constant reference to a string, i.e.
Definition StringRef.h:56
Object returned by analyzeLoopForPipelining.
TargetInstrInfo - Interface to description of machine instruction set.
virtual void genAlternativeCodeSequence(MachineInstr &Root, unsigned Pattern, SmallVectorImpl< MachineInstr * > &InsInstrs, SmallVectorImpl< MachineInstr * > &DelInstrs, DenseMap< Register, unsigned > &InstIdxForVirtReg) const
When getMachineCombinerPatterns() finds patterns, this function generates the instructions that could...
virtual std::optional< ParamLoadedValue > describeLoadedValue(const MachineInstr &MI, Register Reg) const
Produce the expression describing the MI loading a value into the physical register Reg.
virtual bool getMachineCombinerPatterns(MachineInstr &Root, SmallVectorImpl< unsigned > &Patterns, bool DoRegPressureReduce) const
Return true when there is potentially a faster code sequence for an instruction chain ending in Root.
virtual bool isSchedulingBoundary(const MachineInstr &MI, const MachineBasicBlock *MBB, const MachineFunction &MF) const
Test if the given instruction should be considered a scheduling boundary.
virtual CombinerObjective getCombinerObjective(unsigned Pattern) const
Return the objective of a combiner pattern.
virtual bool isFunctionSafeToSplit(const MachineFunction &MF) const
Return true if the function is a viable candidate for machine function splitting.
const MCAsmInfo & getMCAsmInfo() const
Return target specific asm information.
TargetOptions Options
CodeModel::Model getCodeModel() const
Returns the code model.
bool contains(Register Reg) const
Return true if the specified register is included in this register class.
bool hasSubClassEq(const TargetRegisterClass *RC) const
Returns true if RC is a sub-class of or equal to this class.
bool hasSuperClassEq(const TargetRegisterClass *RC) const
Returns true if RC is a super-class of or equal to this class.
TargetRegisterInfo base class - We assume that the target defines a static array of TargetRegisterDes...
TargetSubtargetInfo - Generic base class for all target subtargets.
virtual const TargetInstrInfo * getInstrInfo() const
virtual const TargetRegisterInfo * getRegisterInfo() const =0
Return the target's register information.
Target - Wrapper for Target specific information.
static constexpr TypeSize getFixed(ScalarTy ExactSize)
Definition TypeSize.h:343
static constexpr TypeSize getScalable(ScalarTy MinimumSize)
Definition TypeSize.h:346
Value * getOperand(unsigned i) const
Definition User.h:207
constexpr bool isScalable() const
Returns whether the quantity is scaled by a runtime quantity (vscale).
Definition TypeSize.h:168
constexpr ScalarTy getKnownMinValue() const
Returns the minimum value this quantity can represent.
Definition TypeSize.h:165
self_iterator getIterator()
Definition ilist_node.h:123
A raw_ostream that writes to an std::string.
#define llvm_unreachable(msg)
Marks that the current location is not supposed to be reachable.
static CondCode getInvertedCondCode(CondCode Code)
@ MO_DLLIMPORT
MO_DLLIMPORT - On a symbol operand, this represents that the reference to the symbol is for an import...
@ MO_NC
MO_NC - Indicates whether the linker is expected to check the symbol reference for overflow.
@ MO_G1
MO_G1 - A symbol operand with this flag (granule 1) represents the bits 16-31 of a 64-bit address,...
@ MO_S
MO_S - Indicates that the bits of the symbol operand represented by MO_G0 etc are signed.
@ MO_PAGEOFF
MO_PAGEOFF - A symbol operand with this flag represents the offset of that symbol within a 4K page.
@ MO_GOT
MO_GOT - This flag indicates that a symbol operand represents the address of the GOT entry for the sy...
@ MO_PREL
MO_PREL - Indicates that the bits of the symbol operand represented by MO_G0 etc are PC relative.
@ MO_G0
MO_G0 - A symbol operand with this flag (granule 0) represents the bits 0-15 of a 64-bit address,...
@ MO_ARM64EC_CALLMANGLE
MO_ARM64EC_CALLMANGLE - Operand refers to the Arm64EC-mangled version of a symbol,...
@ MO_PAGE
MO_PAGE - A symbol operand with this flag represents the pc-relative offset of the 4K page containing...
@ MO_HI12
MO_HI12 - This flag indicates that a symbol operand represents the bits 13-24 of a 64-bit address,...
@ MO_TLS
MO_TLS - Indicates that the operand being accessed is some kind of thread-local symbol.
@ MO_G2
MO_G2 - A symbol operand with this flag (granule 2) represents the bits 32-47 of a 64-bit address,...
@ MO_TAGGED
MO_TAGGED - With MO_PAGE, indicates that the page includes a memory tag in bits 56-63.
@ MO_G3
MO_G3 - A symbol operand with this flag (granule 3) represents the high 16-bits of a 64-bit address,...
@ MO_COFFSTUB
MO_COFFSTUB - On a symbol operand "FOO", this indicates that the reference is actually to the "....
unsigned getCheckerSizeInBytes(AuthCheckMethod Method)
Returns the number of bytes added by checkAuthenticatedRegister.
static uint64_t decodeLogicalImmediate(uint64_t val, unsigned regSize)
decodeLogicalImmediate - Decode a logical immediate value in the form "N:immr:imms" (where the immr a...
static unsigned getShiftValue(unsigned Imm)
getShiftValue - Extract the shift value.
static unsigned getArithExtendImm(AArch64_AM::ShiftExtendType ET, unsigned Imm)
getArithExtendImm - Encode the extend type and shift amount for an arithmetic instruction: imm: 3-bit...
static unsigned getArithShiftValue(unsigned Imm)
getArithShiftValue - get the arithmetic shift value.
static uint64_t encodeLogicalImmediate(uint64_t imm, unsigned regSize)
encodeLogicalImmediate - Return the encoded immediate value for a logical immediate instruction of th...
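A sketch of the round trip through the N:immr:imms form, assuming the value is first checked with AArch64_AM::isLogicalImmediate (declared alongside these helpers):

    #include "MCTargetDesc/AArch64AddressingModes.h"
    #include <cassert>
    using namespace llvm;

    uint64_t Imm = 0x00FF00FF00FF00FFULL; // repeating pattern, encodable
    if (AArch64_AM::isLogicalImmediate(Imm, 64)) {
      uint64_t Enc = AArch64_AM::encodeLogicalImmediate(Imm, 64);
      // Decoding the encoded form recovers the original value.
      assert(AArch64_AM::decodeLogicalImmediate(Enc, 64) == Imm);
    }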
static AArch64_AM::ShiftExtendType getExtendType(unsigned Imm)
getExtendType - Extract the extend type for operands of arithmetic ops.
static AArch64_AM::ShiftExtendType getArithExtendType(unsigned Imm)
static AArch64_AM::ShiftExtendType getShiftType(unsigned Imm)
getShiftType - Extract the shift type.
static unsigned getShifterImm(AArch64_AM::ShiftExtendType ST, unsigned Imm)
getShifterImm - Encode the shift type and amount: imm: 6-bit shift amount shifter: 000 ==> lsl 001 ==...
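For instance, the "LSL #12" form used by ADD/SUB immediate operands round-trips through these helpers (a sketch):

    #include "MCTargetDesc/AArch64AddressingModes.h"
    using namespace llvm;

    unsigned ShiftImm = AArch64_AM::getShifterImm(AArch64_AM::LSL, 12);
    AArch64_AM::ShiftExtendType ST = AArch64_AM::getShiftType(ShiftImm); // LSL
    unsigned Amount = AArch64_AM::getShiftValue(ShiftImm);               // 12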
void expandMOVAddr(unsigned Opcode, unsigned TargetFlags, bool IsTargetMachO, SmallVectorImpl< AddrInsnModel > &Insn)
void expandMOVImm(uint64_t Imm, unsigned BitSize, SmallVectorImpl< ImmInsnModel > &Insn)
Expand a MOVi32imm or MOVi64imm pseudo instruction to one or more real move-immediate instructions to...
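A usage sketch (the ImmInsnModel field name is assumed to follow AArch64ExpandImm.h; the constant is arbitrary): the helper fills Insn with a MOVZ/MOVN/MOVK/ORR sequence for the requested width.

    #include "AArch64ExpandImm.h"
    #include "llvm/ADT/SmallVector.h"
    using namespace llvm;

    SmallVector<AArch64_IMM::ImmInsnModel, 4> Insn;
    AArch64_IMM::expandMOVImm(0x123456789ABCDEF0ULL, 64, Insn);
    for (const auto &I : Insn) {
      // Each entry carries a real opcode (e.g. AArch64::MOVZXi) and its
      // immediate operand(s).
      (void)I.Opcode;
    }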
static const uint64_t InstrFlagIsWhile
static const uint64_t InstrFlagIsPTestLike
constexpr std::underlying_type_t< E > Mask()
Get a bitmask with 1s in all places up to the high-order bit of E's largest value.
@ C
The default llvm calling convention, compatible with C.
Definition CallingConv.h:34
initializer< Ty > init(const Ty &Val)
InstrType
Represents how an instruction should be mapped by the outliner.
iterator end() const
Definition BasicBlock.h:89
LLVM_ABI Instruction & back() const
LLVM_ABI iterator begin() const
auto drop_begin(T &&RangeOrContainer, size_t N=1)
Return a range covering RangeOrContainer with the first N elements excluded.
Definition STLExtras.h:315
bool all_of(R &&range, UnaryPredicate P)
Provide wrappers to std::all_of which take ranges instead of having to pass begin/end explicitly.
Definition STLExtras.h:1738
static bool isCondBranchOpcode(int Opc)
MCCFIInstruction createDefCFA(const TargetRegisterInfo &TRI, unsigned FrameReg, unsigned Reg, const StackOffset &Offset, bool LastAdjustmentWasScalable=true)
static bool isPTrueOpcode(unsigned Opc)
MachineInstrBuilder BuildMI(MachineFunction &MF, const MIMetadata &MIMD, const MCInstrDesc &MCID)
Builder interface. Specify how to create the initial instruction itself.
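A representative use, assuming MBB, MBBI, DL, and TII are in scope (a sketch, not code from this file):

    // Emit "add x0, x1, #16" before MBBI, marking x1 as killed.
    BuildMI(MBB, MBBI, DL, TII->get(AArch64::ADDXri), AArch64::X0)
        .addReg(AArch64::X1, getKillRegState(true))
        .addImm(16)
        .addImm(0); // shift amount: getShifterImm(AArch64_AM::LSL, 0)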
bool succeeded(LogicalResult Result)
Utility function that returns true if the provided LogicalResult corresponds to a success value.
int isAArch64FrameOffsetLegal(const MachineInstr &MI, StackOffset &Offset, bool *OutUseUnscaledOp=nullptr, unsigned *OutUnscaledOp=nullptr, int64_t *EmittableOffset=nullptr)
Check if the Offset is a valid frame offset for MI.
constexpr bool isInt(int64_t x)
Checks if an integer fits into the given bit width.
Definition MathExtras.h:165
RegState
Flags to represent properties of register accesses.
@ Implicit
Not emitted register (e.g. carry, or temporary result).
@ Dead
Unused definition.
@ Kill
The last use of a register.
@ Undef
Value of the register doesn't matter.
@ Define
Register definition.
@ Renamable
Register that may be renamed.
auto enumerate(FirstRange &&First, RestRanges &&...Rest)
Given two or more input ranges, returns a new range whose values are tuples (A, B,...
Definition STLExtras.h:2553
constexpr RegState getKillRegState(bool B)
decltype(auto) dyn_cast(const From &Val)
dyn_cast<X> - Return the argument parameter cast to the specified type.
Definition Casting.h:643
static bool isIndirectBranchOpcode(int Opc)
iterator_range< T > make_range(T x, T y)
Convenience function for iterating over sub-ranges.
unsigned getBLRCallOpcode(const MachineFunction &MF)
Return opcode to be used for indirect calls.
@ AArch64FrameOffsetIsLegal
Offset is legal.
@ AArch64FrameOffsetCanUpdate
Offset can apply, at least partly.
@ AArch64FrameOffsetCannotUpdate
Offset cannot apply.
constexpr bool isPowerOf2_64(uint64_t Value)
Return true if the argument is a power of two > 0 (64-bit edition).
Definition MathExtras.h:284
unsigned Log2_64(uint64_t Value)
Return the floor log base 2 of the specified value, -1 if the value is zero.
Definition MathExtras.h:337
static bool isSEHInstruction(const MachineInstr &MI)
bool any_of(R &&range, UnaryPredicate P)
Provide wrappers to std::any_of which take ranges instead of having to pass begin/end explicitly.
Definition STLExtras.h:1745
auto reverse(ContainerTy &&C)
Definition STLExtras.h:407
MachineInstr * getImm(const MachineOperand &MO, const MachineRegisterInfo *MRI)
decltype(auto) get(const PointerIntPair< PointerTy, IntBits, IntType, PtrTraits, Info > &Pair)
void sort(IteratorTy Start, IteratorTy End)
Definition STLExtras.h:1635
AArch64MachineCombinerPattern
@ MULSUBv8i16_OP2
@ FMULv4i16_indexed_OP1
@ FMLSv1i32_indexed_OP2
@ MULSUBv2i32_indexed_OP1
@ FMLAv2i32_indexed_OP2
@ MULADDv4i16_indexed_OP2
@ FMLAv1i64_indexed_OP1
@ MULSUBv16i8_OP1
@ FMLAv8i16_indexed_OP2
@ FMULv2i32_indexed_OP1
@ MULSUBv8i16_indexed_OP2
@ FMLAv1i64_indexed_OP2
@ MULSUBv4i16_indexed_OP2
@ FMLAv1i32_indexed_OP1
@ FMLAv2i64_indexed_OP2
@ FMLSv8i16_indexed_OP1
@ MULSUBv2i32_OP1
@ FMULv4i16_indexed_OP2
@ MULSUBv4i32_indexed_OP2
@ FMULv2i64_indexed_OP2
@ FMLAv4i32_indexed_OP1
@ MULADDv4i16_OP2
@ FMULv8i16_indexed_OP2
@ MULSUBv4i16_OP1
@ MULADDv4i32_OP2
@ MULADDv2i32_OP2
@ MULADDv16i8_OP2
@ FMLSv4i16_indexed_OP1
@ MULADDv16i8_OP1
@ FMLAv2i64_indexed_OP1
@ FMLAv1i32_indexed_OP2
@ FMLSv2i64_indexed_OP2
@ MULADDv2i32_OP1
@ MULADDv4i32_OP1
@ MULADDv2i32_indexed_OP1
@ MULSUBv16i8_OP2
@ MULADDv4i32_indexed_OP1
@ MULADDv2i32_indexed_OP2
@ FMLAv4i16_indexed_OP2
@ MULSUBv8i16_OP1
@ FMULv2i32_indexed_OP2
@ FMLSv2i32_indexed_OP2
@ FMLSv4i32_indexed_OP1
@ FMULv2i64_indexed_OP1
@ MULSUBv4i16_OP2
@ FMLSv4i16_indexed_OP2
@ FMLAv2i32_indexed_OP1
@ FMLSv2i32_indexed_OP1
@ FMLAv8i16_indexed_OP1
@ MULSUBv4i16_indexed_OP1
@ FMLSv4i32_indexed_OP2
@ MULADDv4i32_indexed_OP2
@ MULSUBv4i32_OP2
@ MULSUBv8i16_indexed_OP1
@ MULADDv8i16_OP2
@ MULSUBv2i32_indexed_OP2
@ FMULv4i32_indexed_OP2
@ FMLSv2i64_indexed_OP1
@ MULADDv4i16_OP1
@ FMLAv4i32_indexed_OP2
@ MULADDv8i16_indexed_OP1
@ FMULv4i32_indexed_OP1
@ FMLAv4i16_indexed_OP1
@ FMULv8i16_indexed_OP1
@ MULADDv8i16_OP1
@ MULSUBv4i32_indexed_OP1
@ MULSUBv4i32_OP1
@ FMLSv8i16_indexed_OP2
@ MULADDv8i16_indexed_OP2
@ MULSUBv2i32_OP2
@ FMLSv1i64_indexed_OP2
@ MULADDv4i16_indexed_OP1
LLVM_ABI raw_ostream & dbgs()
dbgs() - This returns a reference to a raw_ostream for debugging messages.
Definition Debug.cpp:209
void emitFrameOffset(MachineBasicBlock &MBB, MachineBasicBlock::iterator MBBI, const DebugLoc &DL, unsigned DestReg, unsigned SrcReg, StackOffset Offset, const TargetInstrInfo *TII, MachineInstr::MIFlag=MachineInstr::NoFlags, bool SetNZCV=false, bool NeedsWinCFI=false, bool *HasWinCFI=nullptr, bool EmitCFAOffset=false, StackOffset InitialOffset={}, unsigned FrameReg=AArch64::SP)
emitFrameOffset - Emit instructions as needed to set DestReg to SrcReg plus Offset.
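A hedged sketch of a typical call, e.g. allocating 16 bytes of stack during frame setup (MBB, MBBI, DL, and TII assumed in scope):

    // SP = SP + (-16); may expand to one SUB or a longer sequence.
    emitFrameOffset(MBB, MBBI, DL, AArch64::SP, AArch64::SP,
                    StackOffset::getFixed(-16), TII,
                    MachineInstr::FrameSetup);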
LLVM_ABI void report_fatal_error(Error Err, bool gen_crash_diag=true)
Definition Error.cpp:163
constexpr RegState getDefRegState(bool B)
CombinerObjective
The combiner's goal may differ based on which pattern it is attempting to optimize.
std::optional< UsedNZCV > examineCFlagsUse(MachineInstr &MI, MachineInstr &CmpInstr, const TargetRegisterInfo &TRI, SmallVectorImpl< MachineInstr * > *CCUseInstrs=nullptr)
CodeGenOptLevel
Code generation optimization level.
Definition CodeGen.h:82
class LLVM_GSL_OWNER SmallVector
Forward declaration of SmallVector so that calculateSmallVectorDefaultInlinedElements can reference s...
auto instructionsWithoutDebug(IterT It, IterT End, bool SkipPseudoOp=true)
Construct a range iterator which begins at It and moves forwards until End is reached,...
LLVM_ABI raw_fd_ostream & errs()
This returns a reference to a raw_ostream for standard error.
auto drop_end(T &&RangeOrContainer, size_t N=1)
Return a range covering RangeOrContainer with the last N elements excluded.
Definition STLExtras.h:322
static MCRegister getXRegFromWReg(MCRegister Reg)
MCCFIInstruction createCFAOffset(const TargetRegisterInfo &MRI, unsigned Reg, const StackOffset &OffsetFromDefCFA, std::optional< int64_t > IncomingVGOffsetFromDefCFA)
DWARFExpression::Operation Op
ArrayRef(const T &OneElt) -> ArrayRef< T >
static bool isUncondBranchOpcode(int Opc)
decltype(auto) cast(const From &Val)
cast<X> - Return the argument parameter cast to the specified type.
Definition Casting.h:559
void erase_if(Container &C, UnaryPredicate P)
Provide a container algorithm similar to C++ Library Fundamentals v2's erase_if which is equivalent t...
Definition STLExtras.h:2191
constexpr bool isIntN(unsigned N, int64_t x)
Checks if a signed integer fits into the given (dynamic) bit width.
Definition MathExtras.h:248
bool rewriteAArch64FrameIndex(MachineInstr &MI, unsigned FrameRegIdx, unsigned FrameReg, StackOffset &Offset, const AArch64InstrInfo *TII)
rewriteAArch64FrameIndex - Rewrite MI to access 'Offset' bytes from the FP.
bool is_contained(R &&Range, const E &Element)
Returns true if Element is found in Range.
Definition STLExtras.h:1946
static const MachineMemOperand::Flags MOSuppressPair
constexpr int64_t SignExtend64(uint64_t x)
Sign-extend the number in the bottom B bits of X to a 64-bit integer.
Definition MathExtras.h:572
void appendLEB128(SmallVectorImpl< U > &Buffer, T Value)
Definition LEB128.h:236
bool optimizeTerminators(MachineBasicBlock *MBB, const TargetInstrInfo &TII)
std::pair< MachineOperand, DIExpression * > ParamLoadedValue
bool isNZCVTouchedInInstructionRange(const MachineInstr &DefMI, const MachineInstr &UseMI, const TargetRegisterInfo *TRI)
Return true if there is an instruction /after/ DefMI and before UseMI which either reads or clobbers ...
auto seq(T Begin, T End)
Iterate over an integral type from Begin up to - but not including - End.
Definition Sequence.h:305
static const MachineMemOperand::Flags MOStridedAccess
constexpr RegState getUndefRegState(bool B)
void fullyRecomputeLiveIns(ArrayRef< MachineBasicBlock * > MBBs)
Convenience function for recomputing live-in's for a set of MBBs until the computation converges.
LLVM_ABI Printable printReg(Register Reg, const TargetRegisterInfo *TRI=nullptr, unsigned SubIdx=0, const MachineRegisterInfo *MRI=nullptr)
Prints virtual and physical registers with or without a TRI instance.
This struct is a compact representation of a valid (non-zero power of two) alignment.
Definition Alignment.h:39
Used to describe addressing mode similar to ExtAddrMode in CodeGenPrepare.
LLVM_ABI static const MBBSectionID ColdSectionID
This class contains a discriminated union of information about pointers in memory operands,...
static LLVM_ABI MachinePointerInfo getUnknownStack(MachineFunction &MF)
Stack memory without other information.
static LLVM_ABI MachinePointerInfo getFixedStack(MachineFunction &MF, int FI, int64_t Offset=0)
Return a MachinePointerInfo record that refers to the specified FrameIndex.
An individual sequence of instructions to be replaced with a call to an outlined function.
MachineFunction * getMF() const
The information necessary to create an outlined function for some class of candidate.