#include <unordered_map>

#define DEBUG_TYPE "aarch64-simdinstr-opt"

STATISTIC(NumModifiedInstr,
          "Number of SIMD instructions modified");

#define AARCH64_VECTOR_BY_ELEMENT_OPT_NAME                                     \
  "AArch64 SIMD instructions optimization pass"
  // Cache of the profitability decision for each (opcode, subtarget CPU)
  // pair, so that the scheduling model is queried only once per instruction
  // kind on a given CPU.
  std::map<std::pair<unsigned, std::string>, bool> SIMDInstrTable;
  // Cache, per subtarget CPU, of whether the interleaved-store subpass can
  // be skipped entirely for that CPU.
  std::unordered_map<std::string, bool> InterlEarlyExit;

  std::vector<unsigned> ReplOpc;
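
// Helper macros for building the interleaved-store rewrite rules below: each
// rule lists the original ST2/ST4 opcode, the opcodes of its replacement
// instructions, and the register class used for the new virtual registers.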
#define RuleST2(OpcOrg, OpcR0, OpcR1, OpcR2, RC)                               \
  {OpcOrg, {OpcR0, OpcR1, OpcR2}, RC}
#define RuleST4(OpcOrg, OpcR0, OpcR1, OpcR2, OpcR3, OpcR4, OpcR5, OpcR6,       \
                OpcR7, OpcR8, OpcR9, RC)                                       \
  {OpcOrg,                                                                     \
   {OpcR0, OpcR1, OpcR2, OpcR3, OpcR4, OpcR5, OpcR6, OpcR7, OpcR8, OpcR9}, RC}
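
  // The rewrite table for interleaved stores: each ST2 entry expands into two
  // ZIP instructions followed by one STP, and each ST4 entry expands into
  // eight ZIP instructions followed by two STPs.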
  std::vector<InstReplInfo> IRT = {
    // ST2 instructions
    RuleST2(AArch64::ST2Twov2d, AArch64::ZIP1v2i64, AArch64::ZIP2v2i64,
            AArch64::STPQi, AArch64::FPR128RegClass),
    RuleST2(AArch64::ST2Twov4s, AArch64::ZIP1v4i32, AArch64::ZIP2v4i32,
            AArch64::STPQi, AArch64::FPR128RegClass),
    RuleST2(AArch64::ST2Twov2s, AArch64::ZIP1v2i32, AArch64::ZIP2v2i32,
            AArch64::STPDi, AArch64::FPR64RegClass),
    RuleST2(AArch64::ST2Twov8h, AArch64::ZIP1v8i16, AArch64::ZIP2v8i16,
            AArch64::STPQi, AArch64::FPR128RegClass),
    RuleST2(AArch64::ST2Twov4h, AArch64::ZIP1v4i16, AArch64::ZIP2v4i16,
            AArch64::STPDi, AArch64::FPR64RegClass),
    RuleST2(AArch64::ST2Twov16b, AArch64::ZIP1v16i8, AArch64::ZIP2v16i8,
            AArch64::STPQi, AArch64::FPR128RegClass),
    RuleST2(AArch64::ST2Twov8b, AArch64::ZIP1v8i8, AArch64::ZIP2v8i8,
            AArch64::STPDi, AArch64::FPR64RegClass),

    // ST4 instructions
    RuleST4(AArch64::ST4Fourv2d, AArch64::ZIP1v2i64, AArch64::ZIP2v2i64,
            AArch64::ZIP1v2i64, AArch64::ZIP2v2i64, AArch64::ZIP1v2i64,
            AArch64::ZIP2v2i64, AArch64::ZIP1v2i64, AArch64::ZIP2v2i64,
            AArch64::STPQi, AArch64::STPQi, AArch64::FPR128RegClass),
    RuleST4(AArch64::ST4Fourv4s, AArch64::ZIP1v4i32, AArch64::ZIP2v4i32,
            AArch64::ZIP1v4i32, AArch64::ZIP2v4i32, AArch64::ZIP1v4i32,
            AArch64::ZIP2v4i32, AArch64::ZIP1v4i32, AArch64::ZIP2v4i32,
            AArch64::STPQi, AArch64::STPQi, AArch64::FPR128RegClass),
    RuleST4(AArch64::ST4Fourv2s, AArch64::ZIP1v2i32, AArch64::ZIP2v2i32,
            AArch64::ZIP1v2i32, AArch64::ZIP2v2i32, AArch64::ZIP1v2i32,
            AArch64::ZIP2v2i32, AArch64::ZIP1v2i32, AArch64::ZIP2v2i32,
            AArch64::STPDi, AArch64::STPDi, AArch64::FPR64RegClass),
    RuleST4(AArch64::ST4Fourv8h, AArch64::ZIP1v8i16, AArch64::ZIP2v8i16,
            AArch64::ZIP1v8i16, AArch64::ZIP2v8i16, AArch64::ZIP1v8i16,
            AArch64::ZIP2v8i16, AArch64::ZIP1v8i16, AArch64::ZIP2v8i16,
            AArch64::STPQi, AArch64::STPQi, AArch64::FPR128RegClass),
    RuleST4(AArch64::ST4Fourv4h, AArch64::ZIP1v4i16, AArch64::ZIP2v4i16,
            AArch64::ZIP1v4i16, AArch64::ZIP2v4i16, AArch64::ZIP1v4i16,
            AArch64::ZIP2v4i16, AArch64::ZIP1v4i16, AArch64::ZIP2v4i16,
            AArch64::STPDi, AArch64::STPDi, AArch64::FPR64RegClass),
    RuleST4(AArch64::ST4Fourv16b, AArch64::ZIP1v16i8, AArch64::ZIP2v16i8,
            AArch64::ZIP1v16i8, AArch64::ZIP2v16i8, AArch64::ZIP1v16i8,
            AArch64::ZIP2v16i8, AArch64::ZIP1v16i8, AArch64::ZIP2v16i8,
            AArch64::STPQi, AArch64::STPQi, AArch64::FPR128RegClass),
    RuleST4(AArch64::ST4Fourv8b, AArch64::ZIP1v8i8, AArch64::ZIP2v8i8,
            AArch64::ZIP1v8i8, AArch64::ZIP2v8i8, AArch64::ZIP1v8i8,
            AArch64::ZIP2v8i8, AArch64::ZIP1v8i8, AArch64::ZIP2v8i8,
            AArch64::STPDi, AArch64::STPDi, AArch64::FPR64RegClass)
  };
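
  // A costly instruction is replaced by at most MaxNumRepl cheaper
  // instructions (10, matching the ST4 rules above); this bounds the small
  // vectors that hold the replacement instruction descriptors.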
  static const unsigned MaxNumRepl = 10;

  AArch64SIMDInstrOpt() : MachineFunctionPass(ID) {}

  bool shouldReplaceInst(MachineFunction *MF, const MCInstrDesc *InstDesc,
                         SmallVectorImpl<const MCInstrDesc *> &ReplInstrMCID);
  bool shouldExitEarly(MachineFunction *MF, Subpass SP);
  bool reuseDUP(MachineInstr &MI, unsigned DupOpcode, unsigned SrcReg,
                unsigned LaneNumber, unsigned *DestReg) const;
  bool optimizeVectElement(MachineInstr &MI);
  bool processSeqRegInst(MachineInstr *DefiningMI, unsigned *StReg,
                         unsigned *StRegKill, unsigned NumArg) const;
  bool optimizeLdStInterleave(MachineInstr &MI);
  unsigned determineSrcReg(MachineInstr &MI) const;
  bool runOnMachineFunction(MachineFunction &Fn) override;

  StringRef getPassName() const override {
    return AARCH64_VECTOR_BY_ELEMENT_OPT_NAME;
  }

char AArch64SIMDInstrOpt::ID = 0;
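
// shouldReplaceInst - Return true if it is profitable, on the current
// subtarget, to replace the instruction described by InstDesc with the
// sequence of instructions in InstDescRepl, i.e. when the sum of the
// replacement latencies is smaller than the latency of the original
// instruction. The decision is cached in SIMDInstrTable, keyed by
// (opcode, CPU name).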
bool AArch64SIMDInstrOpt::
shouldReplaceInst(MachineFunction *MF, const MCInstrDesc *InstDesc,
                  SmallVectorImpl<const MCInstrDesc *> &InstDescRepl) {
  // Check if the replacement decision is already cached for this opcode on
  // this subtarget; if so, return it.
  std::string Subtarget = std::string(SchedModel.getSubtargetInfo()->getCPU());
  auto InstID = std::make_pair(InstDesc->getOpcode(), Subtarget);
  auto It = SIMDInstrTable.find(InstID);
  if (It != SIMDInstrTable.end())
    return It->second;

  unsigned SCIdx = InstDesc->getSchedClass();
  const MCSchedClassDesc *SCDesc =
      SchedModel.getMCSchedModel()->getSchedClassDesc(SCIdx);

  // If the subtarget does not define scheduling resources for the original or
  // the replacement instructions, then do not replace.
  const MCSchedClassDesc *SCDescRepl;
  if (!SCDesc->isValid() || SCDesc->isVariant()) {
    SIMDInstrTable[InstID] = false;
    return false;
  }
  for (const auto *IDesc : InstDescRepl) {
    SCDescRepl = SchedModel.getMCSchedModel()->getSchedClassDesc(
        IDesc->getSchedClass());
    if (!SCDescRepl->isValid() || SCDescRepl->isVariant()) {
      SIMDInstrTable[InstID] = false;
      return false;
    }
  }

  // Replacement cost: the sum of the latencies of the replacement
  // instructions.
  unsigned ReplCost = 0;
  for (const auto *IDesc : InstDescRepl)
    ReplCost += SchedModel.computeInstrLatency(IDesc->getOpcode());

  if (SchedModel.computeInstrLatency(InstDesc->getOpcode()) > ReplCost) {
    SIMDInstrTable[InstID] = true;
    return true;
  }
  SIMDInstrTable[InstID] = false;
  return false;
}
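
// shouldExitEarly - Return true if a whole subpass cannot pay off for this
// function. For VectorElem a single representative instruction
// (FMLAv4i32_indexed vs. DUP + FMLAv4f32) is checked; for Interleave every
// rule in IRT is checked and the verdict is cached per CPU in
// InterlEarlyExit.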
bool AArch64SIMDInstrOpt::shouldExitEarly(MachineFunction *MF, Subpass SP) {
  const MCInstrDesc *OriginalMCID;
  SmallVector<const MCInstrDesc *, MaxNumRepl> ReplInstrMCID;

  switch (SP) {
  // For the vector-element subpass, compare the latency of one representative
  // instruction against its replacement sequence.
  case VectorElem:
    OriginalMCID = &TII->get(AArch64::FMLAv4i32_indexed);
    ReplInstrMCID.push_back(&TII->get(AArch64::DUPv4i32lane));
    ReplInstrMCID.push_back(&TII->get(AArch64::FMLAv4f32));
    if (shouldReplaceInst(MF, OriginalMCID, ReplInstrMCID))
      return false;
    break;

  // For the interleaved-store subpass, check all concerned instructions and
  // cache the verdict for this subtarget.
  case Interleave: {
    std::string Subtarget =
        std::string(SchedModel.getSubtargetInfo()->getCPU());
    auto It = InterlEarlyExit.find(Subtarget);
    if (It != InterlEarlyExit.end())
      return It->second;

    for (auto &I : IRT) {
      OriginalMCID = &TII->get(I.OrigOpc);
      for (auto &Repl : I.ReplOpc)
        ReplInstrMCID.push_back(&TII->get(Repl));
      if (shouldReplaceInst(MF, OriginalMCID, ReplInstrMCID)) {
        InterlEarlyExit[Subtarget] = false;
        return false;
      }
      ReplInstrMCID.clear();
    }
    InterlEarlyExit[Subtarget] = true;
    break;
  }
  }

  return true;
}
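
// reuseDUP - Scan the instructions preceding MI in the basic block for a DUP
// of the same source register and lane; if one is found, return its
// destination register through DestReg so that an identical DUP does not
// have to be emitted again.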
bool AArch64SIMDInstrOpt::reuseDUP(MachineInstr &MI, unsigned DupOpcode,
                                   unsigned SrcReg, unsigned LaneNumber,
                                   unsigned *DestReg) const {
  // Walk backwards from MI over the instructions already emitted in this
  // basic block.
  for (MachineBasicBlock::iterator MII = MI, MIE = MI.getParent()->begin();
       MII != MIE;) {
    MII--;
    MachineInstr *CurrentMI = &*MII;

    if (CurrentMI->getOpcode() == DupOpcode &&
        CurrentMI->getNumOperands() == 3 &&
        CurrentMI->getOperand(1).getReg() == SrcReg &&
        CurrentMI->getOperand(2).getImm() == LaneNumber) {
      *DestReg = CurrentMI->getOperand(0).getReg();
      return true;
    }
  }

  return false;
}
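
// optimizeVectElement - Rewrite an indexed SIMD multiply or
// multiply-accumulate into a DUP of the selected lane followed by the plain
// vector form of the operation, e.g.
//   fmla v0.4s, v1.4s, v2.s[1]
// is rewritten into
//   dup  v3.4s, v2.s[1]
//   fmla v0.4s, v1.4s, v3.4s
// whenever shouldReplaceInst() reports the replacement sequence as cheaper.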
bool AArch64SIMDInstrOpt::optimizeVectElement(MachineInstr &MI) {
  const MCInstrDesc *MulMCID, *DupMCID;
  const TargetRegisterClass *RC = &AArch64::FPR128RegClass;

  switch (MI.getOpcode()) {
  default:
    return false;

  // 4X32 instructions
  case AArch64::FMLAv4i32_indexed:
    DupMCID = &TII->get(AArch64::DUPv4i32lane);
    MulMCID = &TII->get(AArch64::FMLAv4f32);
    break;
  case AArch64::FMLSv4i32_indexed:
    DupMCID = &TII->get(AArch64::DUPv4i32lane);
    MulMCID = &TII->get(AArch64::FMLSv4f32);
    break;
  case AArch64::FMULXv4i32_indexed:
    DupMCID = &TII->get(AArch64::DUPv4i32lane);
    MulMCID = &TII->get(AArch64::FMULXv4f32);
    break;
  case AArch64::FMULv4i32_indexed:
    DupMCID = &TII->get(AArch64::DUPv4i32lane);
    MulMCID = &TII->get(AArch64::FMULv4f32);
    break;
  // 2X64 instructions
  case AArch64::FMLAv2i64_indexed:
    DupMCID = &TII->get(AArch64::DUPv2i64lane);
    MulMCID = &TII->get(AArch64::FMLAv2f64);
    break;
  case AArch64::FMLSv2i64_indexed:
    DupMCID = &TII->get(AArch64::DUPv2i64lane);
    MulMCID = &TII->get(AArch64::FMLSv2f64);
    break;
  case AArch64::FMULXv2i64_indexed:
    DupMCID = &TII->get(AArch64::DUPv2i64lane);
    MulMCID = &TII->get(AArch64::FMULXv2f64);
    break;
  case AArch64::FMULv2i64_indexed:
    DupMCID = &TII->get(AArch64::DUPv2i64lane);
    MulMCID = &TII->get(AArch64::FMULv2f64);
    break;
  // 2X32 instructions
  case AArch64::FMLAv2i32_indexed:
    RC = &AArch64::FPR64RegClass;
    DupMCID = &TII->get(AArch64::DUPv2i32lane);
    MulMCID = &TII->get(AArch64::FMLAv2f32);
    break;
  case AArch64::FMLSv2i32_indexed:
    RC = &AArch64::FPR64RegClass;
    DupMCID = &TII->get(AArch64::DUPv2i32lane);
    MulMCID = &TII->get(AArch64::FMLSv2f32);
    break;
  case AArch64::FMULXv2i32_indexed:
    RC = &AArch64::FPR64RegClass;
    DupMCID = &TII->get(AArch64::DUPv2i32lane);
    MulMCID = &TII->get(AArch64::FMULXv2f32);
    break;
  case AArch64::FMULv2i32_indexed:
    RC = &AArch64::FPR64RegClass;
    DupMCID = &TII->get(AArch64::DUPv2i32lane);
    MulMCID = &TII->get(AArch64::FMULv2f32);
    break;
  }

  SmallVector<const MCInstrDesc *, 2> ReplInstrMCID;
  ReplInstrMCID.push_back(DupMCID);
  ReplInstrMCID.push_back(MulMCID);
  if (!shouldReplaceInst(MI.getParent()->getParent(), &TII->get(MI.getOpcode()),
                         ReplInstrMCID))
    return false;

  MachineBasicBlock &MBB = *MI.getParent();
  MachineRegisterInfo &MRI = MBB.getParent()->getRegInfo();

  // Get the operands of the current SIMD arithmetic instruction.
  Register MulDest = MI.getOperand(0).getReg();
  Register SrcReg0 = MI.getOperand(1).getReg();
  unsigned Src0IsKill = getKillRegState(MI.getOperand(1).isKill());
  Register SrcReg1 = MI.getOperand(2).getReg();
  unsigned Src1IsKill = getKillRegState(MI.getOperand(2).isKill());
  unsigned DupDest;

  // Instructions of interest have either 4 (FMUL, FMULX) or 5 (FMLA, FMLS)
  // operands.
  if (MI.getNumOperands() == 5) {
    Register SrcReg2 = MI.getOperand(3).getReg();
    unsigned Src2IsKill = getKillRegState(MI.getOperand(3).isKill());
    unsigned LaneNumber = MI.getOperand(4).getImm();
    // Create a new DUP instruction unless an equivalent one already exists
    // earlier in the block (see reuseDUP).
    if (!reuseDUP(MI, DupMCID->getOpcode(), SrcReg2, LaneNumber, &DupDest)) {
      DupDest = MRI.createVirtualRegister(RC);
      BuildMI(MBB, MI, MI.getDebugLoc(), *DupMCID, DupDest)
          .addReg(SrcReg2, Src2IsKill)
          .addImm(LaneNumber);
    }
    BuildMI(MBB, MI, MI.getDebugLoc(), *MulMCID, MulDest)
        .addReg(SrcReg0, Src0IsKill)
        .addReg(SrcReg1, Src1IsKill)
        .addReg(DupDest, Src2IsKill);
  } else if (MI.getNumOperands() == 4) {
    unsigned LaneNumber = MI.getOperand(3).getImm();
    if (!reuseDUP(MI, DupMCID->getOpcode(), SrcReg1, LaneNumber, &DupDest)) {
      DupDest = MRI.createVirtualRegister(RC);
      BuildMI(MBB, MI, MI.getDebugLoc(), *DupMCID, DupDest)
          .addReg(SrcReg1, Src1IsKill)
          .addImm(LaneNumber);
    }
    BuildMI(MBB, MI, MI.getDebugLoc(), *MulMCID, MulDest)
        .addReg(SrcReg0, Src0IsKill)
        .addReg(DupDest, Src1IsKill);
  }

  ++NumModifiedInstr;
  return true;
}
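
// optimizeLdStInterleave - Rewrite an interleaved store (ST2/ST4) into ZIP
// instructions followed by paired stores, e.g.
//   st2 {v0.4s, v1.4s}, [addr]
// is rewritten into
//   zip1 v2.4s, v0.4s, v1.4s
//   zip2 v3.4s, v0.4s, v1.4s
//   stp  q2, q3, [addr]
// using the rules collected in IRT and the same profitability check as above.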
bool AArch64SIMDInstrOpt::optimizeLdStInterleave(MachineInstr &MI) {

  unsigned SeqReg, AddrReg;
  unsigned StReg[4], StRegKill[4];
  MachineInstr *DefiningMI;
  MachineBasicBlock &MBB = *MI.getParent();
  SmallVector<unsigned, MaxNumRepl> ZipDest;
  SmallVector<const MCInstrDesc *, MaxNumRepl> ReplInstrMCID;

  // If the current instruction matches one of the rewriting rules, gather the
  // parameters of the replacement instructions.
  bool Match = false;
  for (auto &I : IRT) {
    if (MI.getOpcode() == I.OrigOpc) {
      SeqReg  = MI.getOperand(0).getReg();
      AddrReg = MI.getOperand(1).getReg();
      DefiningMI = MRI->getUniqueVRegDef(SeqReg);
      unsigned NumReg = determineSrcReg(MI);
      if (!processSeqRegInst(DefiningMI, StReg, StRegKill, NumReg))
        return false;

      for (auto &Repl : I.ReplOpc) {
        ReplInstrMCID.push_back(&TII->get(Repl));
        // Create destination registers, but only for the ZIP instructions
        // (the paired stores do not define a register).
        if (Repl != AArch64::STPQi && Repl != AArch64::STPDi)
          ZipDest.push_back(MRI->createVirtualRegister(&I.RC));
      }
      Match = true;
      break;
    }
  }

  if (!Match)
    return false;

  // Check whether it is profitable to replace MI with the instruction
  // sequence gathered in ReplInstrMCID.
  if (!shouldReplaceInst(MI.getParent()->getParent(), &TII->get(MI.getOpcode()),
                         ReplInstrMCID))
    return false;

  // Generate the replacement sequence: ZIP1/ZIP2 instructions followed by the
  // paired stores.
  switch (MI.getOpcode()) {
  default:
    return false;

  case AArch64::ST2Twov16b:
  case AArch64::ST2Twov8b:
  case AArch64::ST2Twov8h:
  case AArch64::ST2Twov4h:
  case AArch64::ST2Twov4s:
  case AArch64::ST2Twov2s:
  case AArch64::ST2Twov2d:
    // ZIP instructions
    BuildMI(MBB, MI, MI.getDebugLoc(), *ReplInstrMCID[0], ZipDest[0])
        .addReg(StReg[0], StRegKill[0])
        .addReg(StReg[1], StRegKill[1]);
    BuildMI(MBB, MI, MI.getDebugLoc(), *ReplInstrMCID[1], ZipDest[1])
        .addReg(StReg[0], StRegKill[0])
        .addReg(StReg[1], StRegKill[1]);
    // STP instruction
    BuildMI(MBB, MI, MI.getDebugLoc(), *ReplInstrMCID[2])
        .addReg(ZipDest[0])
        .addReg(ZipDest[1])
        .addReg(AddrReg)
        .addImm(0);
    break;

  case AArch64::ST4Fourv16b:
  case AArch64::ST4Fourv8b:
  case AArch64::ST4Fourv8h:
  case AArch64::ST4Fourv4h:
  case AArch64::ST4Fourv4s:
  case AArch64::ST4Fourv2s:
  case AArch64::ST4Fourv2d:
    // ZIP instructions
    BuildMI(MBB, MI, MI.getDebugLoc(), *ReplInstrMCID[0], ZipDest[0])
        .addReg(StReg[0], StRegKill[0])
        .addReg(StReg[2], StRegKill[2]);
    BuildMI(MBB, MI, MI.getDebugLoc(), *ReplInstrMCID[1], ZipDest[1])
        .addReg(StReg[0], StRegKill[0])
        .addReg(StReg[2], StRegKill[2]);
    BuildMI(MBB, MI, MI.getDebugLoc(), *ReplInstrMCID[2], ZipDest[2])
        .addReg(StReg[1], StRegKill[1])
        .addReg(StReg[3], StRegKill[3]);
    BuildMI(MBB, MI, MI.getDebugLoc(), *ReplInstrMCID[3], ZipDest[3])
        .addReg(StReg[1], StRegKill[1])
        .addReg(StReg[3], StRegKill[3]);
    BuildMI(MBB, MI, MI.getDebugLoc(), *ReplInstrMCID[4], ZipDest[4])
        .addReg(ZipDest[0])
        .addReg(ZipDest[2]);
    BuildMI(MBB, MI, MI.getDebugLoc(), *ReplInstrMCID[5], ZipDest[5])
        .addReg(ZipDest[0])
        .addReg(ZipDest[2]);
    BuildMI(MBB, MI, MI.getDebugLoc(), *ReplInstrMCID[6], ZipDest[6])
        .addReg(ZipDest[1])
        .addReg(ZipDest[3]);
    BuildMI(MBB, MI, MI.getDebugLoc(), *ReplInstrMCID[7], ZipDest[7])
        .addReg(ZipDest[1])
        .addReg(ZipDest[3]);
    // STP instructions
    BuildMI(MBB, MI, MI.getDebugLoc(), *ReplInstrMCID[8])
        .addReg(ZipDest[4])
        .addReg(ZipDest[5])
        .addReg(AddrReg)
        .addImm(0);
    BuildMI(MBB, MI, MI.getDebugLoc(), *ReplInstrMCID[9])
        .addReg(ZipDest[6])
        .addReg(ZipDest[7])
        .addReg(AddrReg)
        .addImm(2);
    break;
  }

  ++NumModifiedInstr;
  return true;
}
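
// processSeqRegInst - Collect the source registers and their kill flags from
// the REG_SEQUENCE instruction that feeds the interleaved store; return false
// if the defining instruction does not have the expected form.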
bool AArch64SIMDInstrOpt::processSeqRegInst(MachineInstr *DefiningMI,
     unsigned *StReg, unsigned *StRegKill, unsigned NumArg) const {
  assert(DefiningMI != nullptr);
  if (DefiningMI->getOpcode() != AArch64::REG_SEQUENCE)
    return false;

  for (unsigned i = 0; i < NumArg; i++) {
    StReg[i]     = DefiningMI->getOperand(2*i+1).getReg();
    StRegKill[i] = getKillRegState(DefiningMI->getOperand(2*i+1).isKill());
    // Validation check: the remaining operands must be sub-register indices.
    if (!DefiningMI->getOperand(2*i+2).isImm())
      return false;
    switch (DefiningMI->getOperand(2*i+2).getImm()) {
    default:
      return false;
    case AArch64::dsub0: case AArch64::dsub1:
    case AArch64::dsub2: case AArch64::dsub3:
    case AArch64::qsub0: case AArch64::qsub1:
    case AArch64::qsub2: case AArch64::qsub3:
      break;
    }
  }
  return true;
}
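
// determineSrcReg - Return the number of registers interleaved by the store:
// 2 for the ST2 variants and 4 for the ST4 variants.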
unsigned AArch64SIMDInstrOpt::determineSrcReg(MachineInstr &MI) const {
  switch (MI.getOpcode()) {
  default:
    llvm_unreachable("Unsupported instruction for this pass");

  case AArch64::ST2Twov16b:
  case AArch64::ST2Twov8b:
  case AArch64::ST2Twov8h:
  case AArch64::ST2Twov4h:
  case AArch64::ST2Twov4s:
  case AArch64::ST2Twov2s:
  case AArch64::ST2Twov2d:
    return 2;

  case AArch64::ST4Fourv16b:
  case AArch64::ST4Fourv8b:
  case AArch64::ST4Fourv8h:
  case AArch64::ST4Fourv4h:
  case AArch64::ST4Fourv4s:
  case AArch64::ST4Fourv2s:
  case AArch64::ST4Fourv2d:
    return 4;
  }
}
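
// runOnMachineFunction - Run both subpasses over every instruction of the
// function, skipping a subpass entirely when shouldExitEarly() shows it
// cannot be profitable on this subtarget. Rewritten instructions are
// collected and erased only after the scan so that iteration stays valid.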
bool AArch64SIMDInstrOpt::runOnMachineFunction(MachineFunction &MF) {
  if (skipFunction(MF.getFunction()))
    return false;

  const TargetSubtargetInfo &ST = MF.getSubtarget();
  TII = ST.getInstrInfo();
  MRI = &MF.getRegInfo();
  SchedModel.init(&ST);
  if (!SchedModel.hasInstrSchedModel())
    return false;

  bool Changed = false;
  for (auto OptimizationKind : {VectorElem, Interleave}) {
    if (!shouldExitEarly(&MF, OptimizationKind)) {
      SmallVector<MachineInstr *, 8> RemoveMIs;
      for (MachineBasicBlock &MBB : MF) {
        for (MachineInstr &MI : MBB) {
          bool InstRewrite;
          if (OptimizationKind == VectorElem)
            InstRewrite = optimizeVectElement(MI);
          else
            InstRewrite = optimizeLdStInterleave(MI);
          if (InstRewrite) {
            // Record the rewritten instruction so it can be erased once the
            // scan of this block is done.
            RemoveMIs.push_back(&MI);
            Changed = true;
          }
        }
      }
      for (MachineInstr *MI : RemoveMIs)
        MI->eraseFromParent();
    }
  }

  return Changed;
}

// Returns an instance of the high cost ASIMD instruction replacement
// optimization pass.
FunctionPass *llvm::createAArch64SIMDInstrOptPass() {
  return new AArch64SIMDInstrOpt();
}