//===- X86ISelDAGToDAG.cpp - A DAG pattern matching inst selector for X86 -===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
// This file defines a DAG pattern matching instruction selector for X86,
// converting from a legalized dag to an X86 dag.
//
//===----------------------------------------------------------------------===//

#include "X86ISelDAGToDAG.h"
#include "X86.h"
#include "X86RegisterInfo.h"
#include "X86Subtarget.h"
#include "X86TargetMachine.h"
#include "llvm/ADT/Statistic.h"
#include "llvm/CodeGen/MachineModuleInfo.h"
#include "llvm/CodeGen/SelectionDAGISel.h"
#include "llvm/Config/llvm-config.h"
#include "llvm/IR/ConstantRange.h"
#include "llvm/IR/Function.h"
#include "llvm/IR/Instructions.h"
#include "llvm/IR/Intrinsics.h"
#include "llvm/IR/IntrinsicsX86.h"
#include "llvm/IR/Module.h"
#include "llvm/IR/Type.h"
#include "llvm/Support/Debug.h"
#include "llvm/Support/ErrorHandling.h"
#include "llvm/Support/KnownBits.h"
#include "llvm/Support/MathExtras.h"
#include <cstdint>

using namespace llvm;

#define DEBUG_TYPE "x86-isel"
#define PASS_NAME "X86 DAG->DAG Instruction Selection"

STATISTIC(NumLoadMoved, "Number of loads moved below TokenFactor");

static cl::opt<bool> AndImmShrink("x86-and-imm-shrink", cl::init(true),
    cl::desc("Enable setting constant bits to reduce size of mask immediates"),
    cl::Hidden);

static cl::opt<bool> EnablePromoteAnyextLoad(
    "x86-promote-anyext-load", cl::init(true),
    cl::desc("Enable promoting aligned anyext load to wider load"), cl::Hidden);

extern cl::opt<bool> IndirectBranchTracking;

//===----------------------------------------------------------------------===//
// Pattern Matcher Implementation
//===----------------------------------------------------------------------===//

namespace {
  /// This corresponds to X86AddressMode, but uses SDValue's instead of register
  /// numbers for the leaves of the matched tree.
  struct X86ISelAddressMode {
    enum {
      RegBase,
      FrameIndexBase
    } BaseType = RegBase;

    // This is really a union, discriminated by BaseType!
    SDValue Base_Reg;
    int Base_FrameIndex = 0;

    unsigned Scale = 1;
    SDValue IndexReg;
    int32_t Disp = 0;
    SDValue Segment;
    const GlobalValue *GV = nullptr;
    const Constant *CP = nullptr;
    const BlockAddress *BlockAddr = nullptr;
    const char *ES = nullptr;
    MCSymbol *MCSym = nullptr;
    int JT = -1;
    Align Alignment; // CP alignment.
    unsigned char SymbolFlags = X86II::MO_NO_FLAG; // X86II::MO_*
    bool NegateIndex = false;
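
    // Together these fields describe an x86 memory operand of the general
    // form Segment:[Base + Scale*Index + Disp (+ symbol)]. For example,
    // "movl 16(%rax,%rbx,4), %ecx" would match with Base_Reg = RAX,
    // IndexReg = RBX, Scale = 4 and Disp = 16.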

    X86ISelAddressMode() = default;

    bool hasSymbolicDisplacement() const {
      return GV != nullptr || CP != nullptr || ES != nullptr ||
             MCSym != nullptr || JT != -1 || BlockAddr != nullptr;
    }

    bool hasBaseOrIndexReg() const {
      return BaseType == FrameIndexBase ||
             IndexReg.getNode() != nullptr || Base_Reg.getNode() != nullptr;
    }

    /// Return true if this addressing mode is already RIP-relative.
    bool isRIPRelative() const {
      if (BaseType != RegBase) return false;
      if (RegisterSDNode *RegNode =
            dyn_cast_or_null<RegisterSDNode>(Base_Reg.getNode()))
        return RegNode->getReg() == X86::RIP;
      return false;
    }

    void setBaseReg(SDValue Reg) {
      BaseType = RegBase;
      Base_Reg = Reg;
    }

#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
    void dump(SelectionDAG *DAG = nullptr) {
      dbgs() << "X86ISelAddressMode " << this << '\n';
      dbgs() << "Base_Reg ";
      if (Base_Reg.getNode())
        Base_Reg.getNode()->dump(DAG);
      else
        dbgs() << "nul\n";
      if (BaseType == FrameIndexBase)
        dbgs() << " Base.FrameIndex " << Base_FrameIndex << '\n';
      dbgs() << " Scale " << Scale << '\n'
             << "IndexReg ";
      if (NegateIndex)
        dbgs() << "negate ";
      if (IndexReg.getNode())
        IndexReg.getNode()->dump(DAG);
      else
        dbgs() << "nul\n";
      dbgs() << " Disp " << Disp << '\n'
             << "GV ";
      if (GV)
        GV->dump();
      else
        dbgs() << "nul";
      dbgs() << " CP ";
      if (CP)
        CP->dump();
      else
        dbgs() << "nul";
      dbgs() << '\n'
             << "ES ";
      if (ES)
        dbgs() << ES;
      else
        dbgs() << "nul";
      dbgs() << " MCSym ";
      if (MCSym)
        dbgs() << MCSym;
      else
        dbgs() << "nul";
      dbgs() << " JT" << JT << " Align" << Alignment.value() << '\n';
    }
#endif
  };
}

namespace {
  //===--------------------------------------------------------------------===//
  /// ISel - X86-specific code to select X86 machine instructions for
  /// SelectionDAG operations.
  ///
  class X86DAGToDAGISel final : public SelectionDAGISel {
    /// Keep a pointer to the X86Subtarget around so that we can
    /// make the right decision when generating code for different targets.
    const X86Subtarget *Subtarget;

    /// If true, selector should try to optimize for minimum code size.
    bool OptForMinSize;

    /// Disable direct TLS access through segment registers.
    bool IndirectTlsSegRefs;

  public:
    X86DAGToDAGISel() = delete;

    explicit X86DAGToDAGISel(X86TargetMachine &tm, CodeGenOptLevel OptLevel)
        : SelectionDAGISel(tm, OptLevel), Subtarget(nullptr),
          OptForMinSize(false), IndirectTlsSegRefs(false) {}

    bool runOnMachineFunction(MachineFunction &MF) override {
      // Reset the subtarget each time through.
      Subtarget = &MF.getSubtarget<X86Subtarget>();
      IndirectTlsSegRefs = MF.getFunction().hasFnAttribute(
                             "indirect-tls-seg-refs");

      // OptFor[Min]Size are used in pattern predicates that isel is matching.
      OptForMinSize = MF.getFunction().hasMinSize();
      return SelectionDAGISel::runOnMachineFunction(MF);
    }

    void emitFunctionEntryCode() override;

    bool IsProfitableToFold(SDValue N, SDNode *U, SDNode *Root) const override;

    void PreprocessISelDAG() override;
    void PostprocessISelDAG() override;

// Include the pieces autogenerated from the target description.
#include "X86GenDAGISel.inc"

  private:
    void Select(SDNode *N) override;

    bool foldOffsetIntoAddress(uint64_t Offset, X86ISelAddressMode &AM);
    bool matchLoadInAddress(LoadSDNode *N, X86ISelAddressMode &AM,
                            bool AllowSegmentRegForX32 = false);
    bool matchWrapper(SDValue N, X86ISelAddressMode &AM);
    bool matchAddress(SDValue N, X86ISelAddressMode &AM);
    bool matchVectorAddress(SDValue N, X86ISelAddressMode &AM);
    bool matchAdd(SDValue &N, X86ISelAddressMode &AM, unsigned Depth);
    SDValue matchIndexRecursively(SDValue N, X86ISelAddressMode &AM,
                                  unsigned Depth);
    bool matchAddressRecursively(SDValue N, X86ISelAddressMode &AM,
                                 unsigned Depth);
    bool matchVectorAddressRecursively(SDValue N, X86ISelAddressMode &AM,
                                       unsigned Depth);
    bool matchAddressBase(SDValue N, X86ISelAddressMode &AM);
    bool selectAddr(SDNode *Parent, SDValue N, SDValue &Base, SDValue &Scale,
                    SDValue &Index, SDValue &Disp, SDValue &Segment,
                    bool HasNDDM = true);
    bool selectNDDAddr(SDNode *Parent, SDValue N, SDValue &Base, SDValue &Scale,
                       SDValue &Index, SDValue &Disp, SDValue &Segment);
    bool selectVectorAddr(MemSDNode *Parent, SDValue BasePtr, SDValue IndexOp,
                          SDValue ScaleOp, SDValue &Base, SDValue &Scale,
                          SDValue &Index, SDValue &Disp, SDValue &Segment);
    bool selectMOV64Imm32(SDValue N, SDValue &Imm);
    bool selectLEAAddr(SDValue N, SDValue &Base,
                       SDValue &Scale, SDValue &Index, SDValue &Disp,
                       SDValue &Segment);
    bool selectLEA64_Addr(SDValue N, SDValue &Base, SDValue &Scale,
                          SDValue &Index, SDValue &Disp, SDValue &Segment);
    bool selectTLSADDRAddr(SDValue N, SDValue &Base,
                           SDValue &Scale, SDValue &Index, SDValue &Disp,
                           SDValue &Segment);
    bool selectRelocImm(SDValue N, SDValue &Op);

    bool tryFoldLoad(SDNode *Root, SDNode *P, SDValue N,
                     SDValue &Base, SDValue &Scale,
                     SDValue &Index, SDValue &Disp,
                     SDValue &Segment);

    // Convenience method where P is also root.
    bool tryFoldLoad(SDNode *P, SDValue N,
                     SDValue &Base, SDValue &Scale,
                     SDValue &Index, SDValue &Disp,
                     SDValue &Segment) {
      return tryFoldLoad(P, P, N, Base, Scale, Index, Disp, Segment);
    }

    bool tryFoldBroadcast(SDNode *Root, SDNode *P, SDValue N,
                          SDValue &Base, SDValue &Scale,
                          SDValue &Index, SDValue &Disp,
                          SDValue &Segment);

    bool isProfitableToFormMaskedOp(SDNode *N) const;

    /// Implement addressing mode selection for inline asm expressions.
    bool SelectInlineAsmMemoryOperand(const SDValue &Op,
                                      InlineAsm::ConstraintCode ConstraintID,
                                      std::vector<SDValue> &OutOps) override;

    void emitSpecialCodeForMain();

    inline void getAddressOperands(X86ISelAddressMode &AM, const SDLoc &DL,
                                   MVT VT, SDValue &Base, SDValue &Scale,
                                   SDValue &Index, SDValue &Disp,
                                   SDValue &Segment) {
      if (AM.BaseType == X86ISelAddressMode::FrameIndexBase)
        Base = CurDAG->getTargetFrameIndex(
            AM.Base_FrameIndex, TLI->getPointerTy(CurDAG->getDataLayout()));
      else if (AM.Base_Reg.getNode())
        Base = AM.Base_Reg;
      else
        Base = CurDAG->getRegister(0, VT);

      Scale = getI8Imm(AM.Scale, DL);

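// The _ND opcode variants are the APX "new data destination" forms, which
// write their result to a separate destination register instead of
// clobbering an input; prefer them when the subtarget supports NDD.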
#define GET_ND_IF_ENABLED(OPC) (Subtarget->hasNDD() ? OPC##_ND : OPC)
#define GET_NDM_IF_ENABLED(OPC)                                                \
  (Subtarget->hasNDD() && Subtarget->hasNDDM() ? OPC##_ND : OPC)
      // Negate the index if needed.
      if (AM.NegateIndex) {
        unsigned NegOpc;
        switch (VT.SimpleTy) {
        default:
          llvm_unreachable("Unsupported VT!");
        case MVT::i64:
          NegOpc = GET_ND_IF_ENABLED(X86::NEG64r);
          break;
        case MVT::i32:
          NegOpc = GET_ND_IF_ENABLED(X86::NEG32r);
          break;
        case MVT::i16:
          NegOpc = GET_ND_IF_ENABLED(X86::NEG16r);
          break;
        case MVT::i8:
          NegOpc = GET_ND_IF_ENABLED(X86::NEG8r);
          break;
        }
        SDValue Neg = SDValue(CurDAG->getMachineNode(NegOpc, DL, VT, MVT::i32,
                                                     AM.IndexReg), 0);
        AM.IndexReg = Neg;
      }

      if (AM.IndexReg.getNode())
        Index = AM.IndexReg;
      else
        Index = CurDAG->getRegister(0, VT);

      // These are 32-bit even in 64-bit mode since RIP-relative offset
      // is 32-bit.
      if (AM.GV)
        Disp = CurDAG->getTargetGlobalAddress(AM.GV, SDLoc(),
                                              MVT::i32, AM.Disp,
                                              AM.SymbolFlags);
      else if (AM.CP)
        Disp = CurDAG->getTargetConstantPool(AM.CP, MVT::i32, AM.Alignment,
                                             AM.Disp, AM.SymbolFlags);
      else if (AM.ES) {
        assert(!AM.Disp && "Non-zero displacement is ignored with ES.");
        Disp = CurDAG->getTargetExternalSymbol(AM.ES, MVT::i32, AM.SymbolFlags);
      } else if (AM.MCSym) {
        assert(!AM.Disp && "Non-zero displacement is ignored with MCSym.");
        assert(AM.SymbolFlags == 0 && "oo");
        Disp = CurDAG->getMCSymbol(AM.MCSym, MVT::i32);
      } else if (AM.JT != -1) {
        assert(!AM.Disp && "Non-zero displacement is ignored with JT.");
        Disp = CurDAG->getTargetJumpTable(AM.JT, MVT::i32, AM.SymbolFlags);
      } else if (AM.BlockAddr)
        Disp = CurDAG->getTargetBlockAddress(AM.BlockAddr, MVT::i32, AM.Disp,
                                             AM.SymbolFlags);
      else
        Disp = CurDAG->getSignedTargetConstant(AM.Disp, DL, MVT::i32);

      if (AM.Segment.getNode())
        Segment = AM.Segment;
      else
        Segment = CurDAG->getRegister(0, MVT::i16);
    }

    // Utility function to determine whether this is an AMX SDNode right after
    // lowering but before ISel.
    bool isAMXSDNode(SDNode *N) const {
      // Check if N is an AMX SDNode:
      // 1. check result type;
      // 2. check operand type;
      for (unsigned Idx = 0, E = N->getNumValues(); Idx != E; ++Idx) {
        if (N->getValueType(Idx) == MVT::x86amx)
          return true;
      }
      for (unsigned Idx = 0, E = N->getNumOperands(); Idx != E; ++Idx) {
        SDValue Op = N->getOperand(Idx);
        if (Op.getValueType() == MVT::x86amx)
          return true;
      }
      return false;
    }

    // Utility function to determine whether we should avoid selecting
    // immediate forms of instructions for better code size.
    // At a high level, we'd like to avoid such instructions when
    // we have similar constants used within the same basic block
    // that can be kept in a register.
    //
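    // For example, at -Oz a 32-bit immediate used by two ALU instructions
    // costs four encoded bytes per use; materializing it once in a register
    // and using register-register forms can be smaller overall.
    //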
    bool shouldAvoidImmediateInstFormsForSize(SDNode *N) const {
      uint32_t UseCount = 0;

      // Do not want to hoist if we're not optimizing for size.
      // TODO: We'd like to remove this restriction.
      // See the comment in X86InstrInfo.td for more info.
      if (!CurDAG->shouldOptForSize())
        return false;

      // Walk all the users of the immediate.
      for (const SDNode *User : N->users()) {
        if (UseCount >= 2)
          break;

        // This user is already selected. Count it as a legitimate use and
        // move on.
        if (User->isMachineOpcode()) {
          UseCount++;
          continue;
        }

        // We want to count stores of immediates as real uses.
        if (User->getOpcode() == ISD::STORE &&
            User->getOperand(1).getNode() == N) {
          UseCount++;
          continue;
        }

        // We don't currently match users that have > 2 operands (except
        // for stores, which are handled above).
        // Those instructions won't match in ISel, for now, and would
        // be counted incorrectly.
        // This may change in the future as we add additional instruction
        // types.
        if (User->getNumOperands() != 2)
          continue;

        // If this is a sign-extended 8-bit integer immediate used in an ALU
        // instruction, there is probably an opcode encoding to save space.
        auto *C = dyn_cast<ConstantSDNode>(N);
        if (C && isInt<8>(C->getSExtValue()))
          continue;

        // Immediates that are used for offsets as part of stack
        // manipulation should be left alone. These are typically
        // used to indicate SP offsets for argument passing and
        // will get pulled into stores/pushes (implicitly).
        if (User->getOpcode() == X86ISD::ADD ||
            User->getOpcode() == ISD::ADD ||
            User->getOpcode() == X86ISD::SUB ||
            User->getOpcode() == ISD::SUB) {

          // Find the other operand of the add/sub.
          SDValue OtherOp = User->getOperand(0);
          if (OtherOp.getNode() == N)
            OtherOp = User->getOperand(1);

          // Don't count if the other operand is SP.
          RegisterSDNode *RegNode;
          if (OtherOp->getOpcode() == ISD::CopyFromReg &&
              (RegNode = dyn_cast_or_null<RegisterSDNode>(
                   OtherOp->getOperand(1).getNode())))
            if ((RegNode->getReg() == X86::ESP) ||
                (RegNode->getReg() == X86::RSP))
              continue;
        }

        // ... otherwise, count this and move on.
        UseCount++;
      }

      // If we have more than 1 use, then recommend for hoisting.
      return (UseCount > 1);
    }

    /// Return a target constant with the specified value of type i8.
    inline SDValue getI8Imm(unsigned Imm, const SDLoc &DL) {
      return CurDAG->getTargetConstant(Imm, DL, MVT::i8);
    }

    /// Return a target constant with the specified value, of type i32.
    inline SDValue getI32Imm(unsigned Imm, const SDLoc &DL) {
      return CurDAG->getTargetConstant(Imm, DL, MVT::i32);
    }

    /// Return a target constant with the specified value, of type i64.
    inline SDValue getI64Imm(uint64_t Imm, const SDLoc &DL) {
      return CurDAG->getTargetConstant(Imm, DL, MVT::i64);
    }

    SDValue getExtractVEXTRACTImmediate(SDNode *N, unsigned VecWidth,
                                        const SDLoc &DL) {
      assert((VecWidth == 128 || VecWidth == 256) && "Unexpected vector width");
      uint64_t Index = N->getConstantOperandVal(1);
      MVT VecVT = N->getOperand(0).getSimpleValueType();
      return getI8Imm((Index * VecVT.getScalarSizeInBits()) / VecWidth, DL);
    }
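
    // For example, extracting the upper 128-bit half of a v8i32 vector
    // (element index 4, 32-bit elements, VecWidth = 128) gives
    // (4 * 32) / 128 = 1, the subvector immediate VEXTRACT expects.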

    SDValue getInsertVINSERTImmediate(SDNode *N, unsigned VecWidth,
                                      const SDLoc &DL) {
      assert((VecWidth == 128 || VecWidth == 256) && "Unexpected vector width");
      uint64_t Index = N->getConstantOperandVal(2);
      MVT VecVT = N->getSimpleValueType(0);
      return getI8Imm((Index * VecVT.getScalarSizeInBits()) / VecWidth, DL);
    }

    SDValue getPermuteVINSERTCommutedImmediate(SDNode *N, unsigned VecWidth,
                                               const SDLoc &DL) {
      assert(VecWidth == 128 && "Unexpected vector width");
      uint64_t Index = N->getConstantOperandVal(2);
      MVT VecVT = N->getSimpleValueType(0);
      uint64_t InsertIdx = (Index * VecVT.getScalarSizeInBits()) / VecWidth;
      assert((InsertIdx == 0 || InsertIdx == 1) && "Bad insertf128 index");
      // vinsert(0,sub,vec) -> [sub0][vec1] -> vperm2x128(0x30,vec,sub)
      // vinsert(1,sub,vec) -> [vec0][sub0] -> vperm2x128(0x02,vec,sub)
      return getI8Imm(InsertIdx ? 0x02 : 0x30, DL);
    }

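    // Materialize the carry flag into a register: an SBB of a zeroed register
    // with itself computes 0 - CF, i.e. 0 when CF is clear and all-ones when
    // CF is set.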
    SDValue getSBBZero(SDNode *N) {
      SDLoc dl(N);
      MVT VT = N->getSimpleValueType(0);

      // Create zero.
      SDVTList VTs = CurDAG->getVTList(MVT::i32, MVT::i32);
      SDValue Zero =
          SDValue(CurDAG->getMachineNode(X86::MOV32r0, dl, VTs, {}), 0);
      if (VT == MVT::i64) {
        Zero = SDValue(
            CurDAG->getMachineNode(
                TargetOpcode::SUBREG_TO_REG, dl, MVT::i64, Zero,
                CurDAG->getTargetConstant(X86::sub_32bit, dl, MVT::i32)),
            0);
      }

      // Copy flags to the EFLAGS register and glue it to next node.
      unsigned Opcode = N->getOpcode();
      assert((Opcode == X86ISD::SBB || Opcode == X86ISD::SETCC_CARRY) &&
             "Unexpected opcode for SBB materialization");
      unsigned FlagOpIndex = Opcode == X86ISD::SBB ? 2 : 1;
      SDValue EFLAGS =
          CurDAG->getCopyToReg(CurDAG->getEntryNode(), dl, X86::EFLAGS,
                               N->getOperand(FlagOpIndex), SDValue());

      // Create a 64-bit instruction if the result is 64-bits otherwise use the
      // 32-bit version.
      unsigned Opc = VT == MVT::i64 ? X86::SBB64rr : X86::SBB32rr;
      MVT SBBVT = VT == MVT::i64 ? MVT::i64 : MVT::i32;
      VTs = CurDAG->getVTList(SBBVT, MVT::i32);
      return SDValue(
          CurDAG->getMachineNode(Opc, dl, VTs,
                                 {Zero, Zero, EFLAGS, EFLAGS.getValue(1)}),
          0);
    }

    // Helper to detect unneeded AND instructions on shift amounts. Called
    // from PatFrags in tablegen.
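    // For example, in (shl X, (and Y, 31)) with Width = 5 (32-bit shifts),
    // the AND is redundant because the shift instruction only reads the low
    // five bits of its amount anyway.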
    bool isUnneededShiftMask(SDNode *N, unsigned Width) const {
      assert(N->getOpcode() == ISD::AND && "Unexpected opcode");
      const APInt &Val = N->getConstantOperandAPInt(1);

      if (Val.countr_one() >= Width)
        return true;

      APInt Mask = Val | CurDAG->computeKnownBits(N->getOperand(0)).Zero;
      return Mask.countr_one() >= Width;
    }

    /// Return an SDNode that returns the value of the global base register.
    /// Output instructions required to initialize the global base register,
    /// if necessary.
    SDNode *getGlobalBaseReg();

    /// Return a reference to the TargetMachine, casted to the target-specific
    /// type.
    const X86TargetMachine &getTargetMachine() const {
      return static_cast<const X86TargetMachine &>(TM);
    }

    /// Return a reference to the TargetInstrInfo, casted to the target-specific
    /// type.
    const X86InstrInfo *getInstrInfo() const {
      return Subtarget->getInstrInfo();
    }

    /// Return the condition code of the given SDNode.
    X86::CondCode getCondFromNode(SDNode *N) const;

    /// Address-mode matching performs shift-of-and to and-of-shift
    /// reassociation in order to expose more scaled addressing
    /// opportunities.
    bool ComplexPatternFuncMutatesDAG() const override {
      return true;
    }

    bool isSExtAbsoluteSymbolRef(unsigned Width, SDNode *N) const;

    // Indicates we should prefer to use a non-temporal load for this load.
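    // Non-temporal loads (MOVNTDQA and friends) are meant to minimize cache
    // pollution; the aligned 16/32/64-byte forms require SSE4.1/AVX2/AVX512
    // respectively, which is what the switch below checks.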
    bool useNonTemporalLoad(LoadSDNode *N) const {
      if (!N->isNonTemporal())
        return false;

      unsigned StoreSize = N->getMemoryVT().getStoreSize();

      if (N->getAlign().value() < StoreSize)
        return false;

      switch (StoreSize) {
      default: llvm_unreachable("Unsupported store size");
      case 4:
      case 8:
        return false;
      case 16:
        return Subtarget->hasSSE41();
      case 32:
        return Subtarget->hasAVX2();
      case 64:
        return Subtarget->hasAVX512();
      }
    }

    bool foldLoadStoreIntoMemOperand(SDNode *Node);
    MachineSDNode *matchBEXTRFromAndImm(SDNode *Node);
    bool matchBitExtract(SDNode *Node);
    bool shrinkAndImmediate(SDNode *N);
    bool isMaskZeroExtended(SDNode *N) const;
    bool tryShiftAmountMod(SDNode *N);
    bool tryShrinkShlLogicImm(SDNode *N);
    bool tryVPTERNLOG(SDNode *N);
    bool matchVPTERNLOG(SDNode *Root, SDNode *ParentA, SDNode *ParentB,
                        SDNode *ParentC, SDValue A, SDValue B, SDValue C,
                        uint8_t Imm);
    bool tryVPTESTM(SDNode *Root, SDValue Setcc, SDValue Mask);
    bool tryMatchBitSelect(SDNode *N);

    MachineSDNode *emitPCMPISTR(unsigned ROpc, unsigned MOpc, bool MayFoldLoad,
                                const SDLoc &dl, MVT VT, SDNode *Node);
    MachineSDNode *emitPCMPESTR(unsigned ROpc, unsigned MOpc, bool MayFoldLoad,
                                const SDLoc &dl, MVT VT, SDNode *Node,
                                SDValue &InGlue);

    bool tryOptimizeRem8Extend(SDNode *N);

    bool onlyUsesZeroFlag(SDValue Flags) const;
    bool hasNoSignFlagUses(SDValue Flags) const;
    bool hasNoCarryFlagUses(SDValue Flags) const;
    bool checkTCRetEnoughRegs(SDNode *N) const;
  };

  class X86DAGToDAGISelLegacy : public SelectionDAGISelLegacy {
  public:
    static char ID;
    explicit X86DAGToDAGISelLegacy(X86TargetMachine &tm,
                                   CodeGenOptLevel OptLevel)
        : SelectionDAGISelLegacy(
              ID, std::make_unique<X86DAGToDAGISel>(tm, OptLevel)) {}
  };
}

char X86DAGToDAGISelLegacy::ID = 0;

INITIALIZE_PASS(X86DAGToDAGISelLegacy, DEBUG_TYPE, PASS_NAME, false, false)

// Returns true if this masked compare can be implemented legally with this
// type.
static bool isLegalMaskCompare(SDNode *N, const X86Subtarget *Subtarget) {
  unsigned Opcode = N->getOpcode();
  if (Opcode == X86ISD::CMPM || Opcode == X86ISD::CMPMM ||
      Opcode == X86ISD::STRICT_CMPM || Opcode == ISD::SETCC ||
      Opcode == X86ISD::CMPMM_SAE || Opcode == X86ISD::VFPCLASS) {
    // We can get 256-bit 8 element types here without VLX being enabled. When
    // this happens we will use 512-bit operations and the mask will not be
    // zero extended.
    EVT OpVT = N->getOperand(0).getValueType();
    // The first operand of X86ISD::STRICT_CMPM is chain, so we need to get the
    // second operand.
    if (Opcode == X86ISD::STRICT_CMPM)
      OpVT = N->getOperand(1).getValueType();
    if (OpVT.is256BitVector() || OpVT.is128BitVector())
      return Subtarget->hasVLX();

    return true;
  }
  // Scalar opcodes use 128 bit registers, but aren't subject to the VLX check.
  if (Opcode == X86ISD::VFPCLASSS || Opcode == X86ISD::FSETCCM ||
      Opcode == X86ISD::FSETCCM_SAE)
    return true;

  return false;
}

// Returns true if we can assume the writer of the mask has zero extended it
// for us.
bool X86DAGToDAGISel::isMaskZeroExtended(SDNode *N) const {
  // If this is an AND, check if we have a compare on either side. As long as
  // one side guarantees the mask is zero extended, the AND will preserve those
  // zeros.
  if (N->getOpcode() == ISD::AND)
    return isLegalMaskCompare(N->getOperand(0).getNode(), Subtarget) ||
           isLegalMaskCompare(N->getOperand(1).getNode(), Subtarget);

  return isLegalMaskCompare(N, Subtarget);
}

bool
X86DAGToDAGISel::IsProfitableToFold(SDValue N, SDNode *U, SDNode *Root) const {
  if (OptLevel == CodeGenOptLevel::None)
    return false;

  if (!N.hasOneUse())
    return false;

  if (N.getOpcode() != ISD::LOAD)
    return true;

  // Don't fold non-temporal loads if we have an instruction for them.
  if (useNonTemporalLoad(cast<LoadSDNode>(N)))
    return false;

  // If N is a load, do additional profitability checks.
  if (U == Root) {
    switch (U->getOpcode()) {
    default: break;
    case X86ISD::ADD:
    case X86ISD::ADC:
    case X86ISD::SUB:
    case X86ISD::SBB:
    case X86ISD::AND:
    case X86ISD::XOR:
    case X86ISD::OR:
    case ISD::ADD:
    case ISD::UADDO_CARRY:
    case ISD::AND:
    case ISD::OR:
    case ISD::XOR: {
      SDValue Op1 = U->getOperand(1);

      // If the other operand is an 8-bit immediate we should fold the
      // immediate instead. This reduces code size.
      // e.g.
      // movl 4(%esp), %eax
      // addl $4, %eax
      // vs.
      // movl $4, %eax
      // addl 4(%esp), %eax
      // The former is 2 bytes shorter. In case where the increment is 1, then
      // the saving can be 4 bytes (by using incl %eax).
      if (auto *Imm = dyn_cast<ConstantSDNode>(Op1)) {
        if (Imm->getAPIntValue().isSignedIntN(8))
          return false;

        // If this is a 64-bit AND with an immediate that fits in 32-bits,
        // prefer using the smaller AND over folding the load. This is needed
        // to make sure immediates created by shrinkAndImmediate are always
        // folded. Ideally we would narrow the load during DAG combine and get
        // the best of both worlds.
        if (U->getOpcode() == ISD::AND &&
            Imm->getAPIntValue().getBitWidth() == 64 &&
            Imm->getAPIntValue().isIntN(32))
          return false;

        // If this is really a zext_inreg that can be represented with a movzx
        // instruction, prefer that.
        // TODO: We could shrink the load and fold if it is non-volatile.
        if (U->getOpcode() == ISD::AND &&
            (Imm->getAPIntValue() == UINT8_MAX ||
             Imm->getAPIntValue() == UINT16_MAX ||
             Imm->getAPIntValue() == UINT32_MAX))
          return false;

        // ADD/SUB can negate the immediate and use the opposite operation
        // to fit 128 into a sign-extended 8-bit immediate.
        if ((U->getOpcode() == ISD::ADD || U->getOpcode() == ISD::SUB) &&
            (-Imm->getAPIntValue()).isSignedIntN(8))
          return false;

        if ((U->getOpcode() == X86ISD::ADD || U->getOpcode() == X86ISD::SUB) &&
            (-Imm->getAPIntValue()).isSignedIntN(8) &&
            hasNoCarryFlagUses(SDValue(U, 1)))
          return false;
      }

      // If the other operand is a TLS address, we should fold it instead.
      // This produces
      // movl %gs:0, %eax
      // leal i@NTPOFF(%eax), %eax
      // instead of
      // movl $i@NTPOFF, %eax
      // addl %gs:0, %eax
      // if the block also has an access to a second TLS address this will save
      // a load.
      // FIXME: This is probably also true for non-TLS addresses.
      if (Op1.getOpcode() == X86ISD::Wrapper) {
        SDValue Val = Op1.getOperand(0);
        if (Val.getOpcode() == ISD::TargetGlobalTLSAddress)
          return false;
      }

      // Don't fold load if this matches the BTS/BTR/BTC patterns.
      // BTS: (or X, (shl 1, n))
      // BTR: (and X, (rotl -2, n))
      // BTC: (xor X, (shl 1, n))
      if (U->getOpcode() == ISD::OR || U->getOpcode() == ISD::XOR) {
        if (U->getOperand(0).getOpcode() == ISD::SHL &&
            isOneConstant(U->getOperand(0).getOperand(0)))
          return false;

        if (U->getOperand(1).getOpcode() == ISD::SHL &&
            isOneConstant(U->getOperand(1).getOperand(0)))
          return false;
      }
      if (U->getOpcode() == ISD::AND) {
        SDValue U0 = U->getOperand(0);
        SDValue U1 = U->getOperand(1);
        if (U0.getOpcode() == ISD::ROTL) {
          auto *C = dyn_cast<ConstantSDNode>(U0.getOperand(0));
          if (C && C->getSExtValue() == -2)
            return false;
        }

        if (U1.getOpcode() == ISD::ROTL) {
          auto *C = dyn_cast<ConstantSDNode>(U1.getOperand(0));
          if (C && C->getSExtValue() == -2)
            return false;
        }
      }

      break;
    }
    case ISD::SHL:
    case ISD::SRA:
    case ISD::SRL:
      // Don't fold a load into a shift by immediate. The BMI2 instructions
      // support folding a load, but not an immediate. The legacy instructions
      // support folding an immediate, but can't fold a load. Folding an
      // immediate is preferable to folding a load.
      if (isa<ConstantSDNode>(U->getOperand(1)))
        return false;

      break;
    }
  }

  // Prevent folding a load if this can be implemented with an insert_subreg or
  // a move that implicitly zeroes.
  if (Root->getOpcode() == ISD::INSERT_SUBVECTOR &&
      isNullConstant(Root->getOperand(2)) &&
      (Root->getOperand(0).isUndef() ||
       ISD::isBuildVectorAllZeros(Root->getOperand(0).getNode())))
    return false;

  return true;
}

// Indicates it is profitable to form an AVX512 masked operation. Returning
// false will favor a masked register-register move or vblendm and the
// operation will be selected separately.
bool X86DAGToDAGISel::isProfitableToFormMaskedOp(SDNode *N) const {
  assert(
      (N->getOpcode() == ISD::VSELECT || N->getOpcode() == X86ISD::SELECTS) &&
      "Unexpected opcode!");

  // If the operation has additional users, the operation will be duplicated.
  // Check the use count to prevent that.
  // FIXME: Are there cheap opcodes we might want to duplicate?
  return N->getOperand(1).hasOneUse();
}

/// Replace the original chain operand of the call with
/// load's chain operand and move load below the call's chain operand.
static void moveBelowOrigChain(SelectionDAG *CurDAG, SDValue Load,
                               SDValue Call, SDValue OrigChain) {
  SmallVector<SDValue, 8> Ops;
  SDValue Chain = OrigChain.getOperand(0);
  if (Chain.getNode() == Load.getNode())
    Ops.push_back(Load.getOperand(0));
  else {
    assert(Chain.getOpcode() == ISD::TokenFactor &&
           "Unexpected chain operand");
    for (unsigned i = 0, e = Chain.getNumOperands(); i != e; ++i)
      if (Chain.getOperand(i).getNode() == Load.getNode())
        Ops.push_back(Load.getOperand(0));
      else
        Ops.push_back(Chain.getOperand(i));
    SDValue NewChain =
        CurDAG->getNode(ISD::TokenFactor, SDLoc(Load), MVT::Other, Ops);
    Ops.clear();
    Ops.push_back(NewChain);
  }
  Ops.append(OrigChain->op_begin() + 1, OrigChain->op_end());
  CurDAG->UpdateNodeOperands(OrigChain.getNode(), Ops);
  CurDAG->UpdateNodeOperands(Load.getNode(), Call.getOperand(0),
                             Load.getOperand(1), Load.getOperand(2));

  Ops.clear();
  Ops.push_back(SDValue(Load.getNode(), 1));
  Ops.append(Call->op_begin() + 1, Call->op_end());
  CurDAG->UpdateNodeOperands(Call.getNode(), Ops);
}

/// Return true if call address is a load and it can be
/// moved below CALLSEQ_START and the chains leading up to the call.
/// Return the CALLSEQ_START by reference as a second output.
/// In the case of a tail call, there isn't a callseq node between the call
/// chain and the load.
static bool isCalleeLoad(SDValue Callee, SDValue &Chain, bool HasCallSeq) {
  // The transformation is somewhat dangerous if the call's chain was glued to
  // the call. After moveBelowOrigChain the load is moved between the call and
  // the chain; this can create a cycle if the load is not folded. So it is
  // *really* important that we are sure the load will be folded.
  if (Callee.getNode() == Chain.getNode() || !Callee.hasOneUse())
    return false;
  auto *LD = dyn_cast<LoadSDNode>(Callee.getNode());
  if (!LD ||
      !LD->isSimple() ||
      LD->getAddressingMode() != ISD::UNINDEXED ||
      LD->getExtensionType() != ISD::NON_EXTLOAD)
    return false;

  // If the load's outgoing chain has more than one use, we can't (currently)
  // move the load since we'd most likely create a loop. TODO: Maybe it could
  // work if moveBelowOrigChain() updated *all* the chain users.
  if (!Callee.getValue(1).hasOneUse())
    return false;

  // Now let's find the callseq_start.
  while (HasCallSeq && Chain.getOpcode() != ISD::CALLSEQ_START) {
    if (!Chain.hasOneUse())
      return false;
    Chain = Chain.getOperand(0);
  }

  while (true) {
    if (!Chain.getNumOperands())
      return false;

    // It's not safe to move the callee (a load) across e.g. a store.
    // Conservatively abort if the chain contains a node other than the ones
    // below.
    switch (Chain.getNode()->getOpcode()) {
    case ISD::CALLSEQ_START:
    case ISD::CopyToReg:
    case ISD::LOAD:
      break;
    default:
      return false;
    }

    if (Chain.getOperand(0).getNode() == Callee.getNode())
      return true;
    if (Chain.getOperand(0).getOpcode() == ISD::TokenFactor &&
        Chain.getOperand(0).getValue(0).hasOneUse() &&
        Callee.getValue(1).isOperandOf(Chain.getOperand(0).getNode()) &&
        Callee.getValue(1).hasOneUse())
      return true;

    // Look past CopyToRegs. We only walk one path, so the chain mustn't branch.
    if (Chain.getOperand(0).getOpcode() == ISD::CopyToReg &&
        Chain.getOperand(0).getValue(0).hasOneUse()) {
      Chain = Chain.getOperand(0);
      continue;
    }

    return false;
  }
}

static bool isEndbrImm64(uint64_t Imm) {
  // There may be some other prefix bytes between 0xF3 and 0x0F1EFA,
  // e.g. 0xF3660F1EFA or 0xF3670F1EFA.
  if ((Imm & 0x00FFFFFF) != 0x0F1EFA)
    return false;

  uint8_t OptionalPrefixBytes [] = {0x26, 0x2e, 0x36, 0x3e, 0x64,
                                    0x65, 0x66, 0x67, 0xf0, 0xf2};
  int i = 24; // The low 24 bits (0x0F1EFA) have already matched.
  while (i < 64) {
    uint8_t Byte = (Imm >> i) & 0xFF;
    if (Byte == 0xF3)
      return true;
    if (!llvm::is_contained(OptionalPrefixBytes, Byte))
      return false;
    i += 8;
  }

  return false;
}

static bool needBWI(MVT VT) {
  return (VT == MVT::v32i16 || VT == MVT::v32f16 || VT == MVT::v64i8);
}

void X86DAGToDAGISel::PreprocessISelDAG() {
  bool MadeChange = false;
  for (SelectionDAG::allnodes_iterator I = CurDAG->allnodes_begin(),
       E = CurDAG->allnodes_end(); I != E; ) {
    SDNode *N = &*I++; // Preincrement iterator to avoid invalidation issues.

    // This is for CET enhancement.
    //
    // ENDBR32 and ENDBR64 have specific opcodes:
    // ENDBR32: F3 0F 1E FB
    // ENDBR64: F3 0F 1E FA
    // We want to make sure attackers cannot find unintended ENDBR32/64
    // opcode matches in the binary.
    // Here's an example:
    // If the compiler had to generate asm for the following code:
    // a = 0xF30F1EFA
    // it could, for example, generate:
    // mov 0xF30F1EFA, dword ptr[a]
    // In such a case, the binary would include a gadget that starts
    // with a fake ENDBR64 opcode. Therefore, we split such generation
    // into multiple operations so that the constant does not show up
    // in the binary.
    if (N->getOpcode() == ISD::Constant) {
      MVT VT = N->getSimpleValueType(0);
      int64_t Imm = cast<ConstantSDNode>(N)->getSExtValue();
      int32_t EndbrImm = Subtarget->is64Bit() ? 0xF30F1EFA : 0xF30F1EFB;
      if (Imm == EndbrImm || isEndbrImm64(Imm)) {
        // Check that cf-protection-branch is enabled.
        Metadata *CFProtectionBranch =
            MF->getFunction().getParent()->getModuleFlag(
                "cf-protection-branch");
        if (CFProtectionBranch || IndirectBranchTracking) {
          SDLoc dl(N);
          SDValue Complement = CurDAG->getConstant(~Imm, dl, VT, false, true);
          Complement = CurDAG->getNOT(dl, Complement, VT);
          --I;
          CurDAG->ReplaceAllUsesOfValueWith(SDValue(N, 0), Complement);
          ++I;
          MadeChange = true;
          continue;
        }
      }
    }

    // If this is a target specific AND node with no flag usages, turn it back
    // into ISD::AND to enable test instruction matching.
    if (N->getOpcode() == X86ISD::AND && !N->hasAnyUseOfValue(1)) {
      SDValue Res = CurDAG->getNode(ISD::AND, SDLoc(N), N->getValueType(0),
                                    N->getOperand(0), N->getOperand(1));
      --I;
      CurDAG->ReplaceAllUsesOfValueWith(SDValue(N, 0), Res);
      ++I;
      MadeChange = true;
      continue;
    }

    // Convert vector increment or decrement to sub/add with an all-ones
    // constant:
    // add X, <1, 1...> --> sub X, <-1, -1...>
    // sub X, <1, 1...> --> add X, <-1, -1...>
    // The all-ones vector constant can be materialized using a pcmpeq
    // instruction that is commonly recognized as an idiom (has no register
    // dependency), so that's better/smaller than loading a splat 1 constant.
    //
    // But don't do this if it would inhibit a potentially profitable load
    // folding opportunity for the other operand. That only occurs with the
    // intersection of:
    // (1) The other operand (op0) is load foldable.
    // (2) The op is an add (otherwise, we are *creating* an add and can still
    //     load fold the other op).
    // (3) The target has AVX (otherwise, we have a destructive add and can't
    //     load fold the other op without killing the constant op).
    // (4) The constant 1 vector has multiple uses (so it is profitable to load
    //     into a register anyway).
    auto mayPreventLoadFold = [&]() {
      return X86::mayFoldLoad(N->getOperand(0), *Subtarget) &&
             N->getOpcode() == ISD::ADD && Subtarget->hasAVX() &&
             !N->getOperand(1).hasOneUse();
    };
    if ((N->getOpcode() == ISD::ADD || N->getOpcode() == ISD::SUB) &&
        N->getSimpleValueType(0).isVector() && !mayPreventLoadFold()) {
      APInt SplatVal;
      if (!ISD::isConstantSplatVectorAllZeros(
              peekThroughBitcasts(N->getOperand(0)).getNode()) &&
          X86::isConstantSplat(N->getOperand(1), SplatVal) &&
          SplatVal.isOne()) {
        SDLoc DL(N);

        MVT VT = N->getSimpleValueType(0);
        unsigned NumElts = VT.getSizeInBits() / 32;
        SDValue AllOnes =
            CurDAG->getAllOnesConstant(DL, MVT::getVectorVT(MVT::i32, NumElts));
        AllOnes = CurDAG->getBitcast(VT, AllOnes);

        unsigned NewOpcode = N->getOpcode() == ISD::ADD ? ISD::SUB : ISD::ADD;
        SDValue Res =
            CurDAG->getNode(NewOpcode, DL, VT, N->getOperand(0), AllOnes);
        --I;
        CurDAG->ReplaceAllUsesWith(N, Res.getNode());
        ++I;
        MadeChange = true;
        continue;
      }
    }

    switch (N->getOpcode()) {
    case X86ISD::VBROADCAST: {
      MVT VT = N->getSimpleValueType(0);
      // Emulate v32i16/v64i8 broadcast without BWI.
      if (!Subtarget->hasBWI() && needBWI(VT)) {
        MVT NarrowVT = VT.getHalfNumVectorElementsVT();
        SDLoc dl(N);
        SDValue NarrowBCast =
            CurDAG->getNode(X86ISD::VBROADCAST, dl, NarrowVT, N->getOperand(0));
        SDValue Res =
            CurDAG->getNode(ISD::INSERT_SUBVECTOR, dl, VT, CurDAG->getUNDEF(VT),
                            NarrowBCast, CurDAG->getIntPtrConstant(0, dl));
        unsigned Index = NarrowVT.getVectorMinNumElements();
        Res = CurDAG->getNode(ISD::INSERT_SUBVECTOR, dl, VT, Res, NarrowBCast,
                              CurDAG->getIntPtrConstant(Index, dl));

        --I;
        CurDAG->ReplaceAllUsesWith(N, Res.getNode());
        ++I;
        MadeChange = true;
        continue;
      }

      break;
    }
    case X86ISD::VBROADCAST_LOAD: {
      MVT VT = N->getSimpleValueType(0);
      // Emulate v32i16/v64i8 broadcast without BWI.
      if (!Subtarget->hasBWI() && needBWI(VT)) {
        MVT NarrowVT = VT.getHalfNumVectorElementsVT();
        auto *MemNode = cast<MemSDNode>(N);
        SDLoc dl(N);
        SDVTList VTs = CurDAG->getVTList(NarrowVT, MVT::Other);
        SDValue Ops[] = {MemNode->getChain(), MemNode->getBasePtr()};
        SDValue NarrowBCast = CurDAG->getMemIntrinsicNode(
            X86ISD::VBROADCAST_LOAD, dl, VTs, Ops, MemNode->getMemoryVT(),
            MemNode->getMemOperand());
        SDValue Res =
            CurDAG->getNode(ISD::INSERT_SUBVECTOR, dl, VT, CurDAG->getUNDEF(VT),
                            NarrowBCast, CurDAG->getIntPtrConstant(0, dl));
        unsigned Index = NarrowVT.getVectorMinNumElements();
        Res = CurDAG->getNode(ISD::INSERT_SUBVECTOR, dl, VT, Res, NarrowBCast,
                              CurDAG->getIntPtrConstant(Index, dl));

        --I;
        SDValue To[] = {Res, NarrowBCast.getValue(1)};
        CurDAG->ReplaceAllUsesWith(N, To);
        ++I;
        MadeChange = true;
        continue;
      }

      break;
    }
    case ISD::LOAD: {
      // If this is an XMM/YMM load of the same lower bits as another YMM/ZMM
      // load, then just extract the lower subvector and avoid the second load.
      auto *Ld = cast<LoadSDNode>(N);
      MVT VT = N->getSimpleValueType(0);
      if (!ISD::isNormalLoad(Ld) || !Ld->isSimple() ||
          !(VT.is128BitVector() || VT.is256BitVector()))
        break;

      MVT MaxVT = VT;
      SDNode *MaxLd = nullptr;
      SDValue Ptr = Ld->getBasePtr();
      SDValue Chain = Ld->getChain();
      for (SDNode *User : Ptr->users()) {
        auto *UserLd = dyn_cast<LoadSDNode>(User);
        MVT UserVT = User->getSimpleValueType(0);
        if (User != N && UserLd && ISD::isNormalLoad(User) &&
            UserLd->getBasePtr() == Ptr && UserLd->getChain() == Chain &&
            !User->hasAnyUseOfValue(1) &&
            (UserVT.is256BitVector() || UserVT.is512BitVector()) &&
            UserVT.getSizeInBits() > VT.getSizeInBits() &&
            (!MaxLd || UserVT.getSizeInBits() > MaxVT.getSizeInBits())) {
          MaxLd = User;
          MaxVT = UserVT;
        }
      }
      if (MaxLd) {
        SDLoc dl(N);
        unsigned NumSubElts = VT.getSizeInBits() / MaxVT.getScalarSizeInBits();
        MVT SubVT = MVT::getVectorVT(MaxVT.getScalarType(), NumSubElts);
        SDValue Extract = CurDAG->getNode(ISD::EXTRACT_SUBVECTOR, dl, SubVT,
                                          SDValue(MaxLd, 0),
                                          CurDAG->getIntPtrConstant(0, dl));
        SDValue Res = CurDAG->getBitcast(VT, Extract);

        --I;
        SDValue To[] = {Res, SDValue(MaxLd, 1)};
        CurDAG->ReplaceAllUsesWith(N, To);
        ++I;
        MadeChange = true;
        continue;
      }
      break;
    }
    case ISD::VSELECT: {
      // Replace VSELECT with non-mask conditions with BLENDV/VPTERNLOG.
      EVT EleVT = N->getOperand(0).getValueType().getVectorElementType();
      if (EleVT == MVT::i1)
        break;

      assert(Subtarget->hasSSE41() && "Expected SSE4.1 support!");
      assert(N->getValueType(0).getVectorElementType() != MVT::i16 &&
             "We can't replace VSELECT with BLENDV in vXi16!");
      SDValue R;
      if (Subtarget->hasVLX() && CurDAG->ComputeNumSignBits(N->getOperand(0)) ==
                                     EleVT.getSizeInBits()) {
        R = CurDAG->getNode(X86ISD::VPTERNLOG, SDLoc(N), N->getValueType(0),
                            N->getOperand(0), N->getOperand(1), N->getOperand(2),
                            CurDAG->getTargetConstant(0xCA, SDLoc(N), MVT::i8));
      } else {
        R = CurDAG->getNode(X86ISD::BLENDV, SDLoc(N), N->getValueType(0),
                            N->getOperand(0), N->getOperand(1),
                            N->getOperand(2));
      }
      --I;
      CurDAG->ReplaceAllUsesWith(N, R.getNode());
      ++I;
      MadeChange = true;
      continue;
    }
    case ISD::FP_ROUND:
    case ISD::STRICT_FP_ROUND:
    case ISD::FP_TO_SINT:
    case ISD::FP_TO_UINT:
    case ISD::STRICT_FP_TO_SINT:
    case ISD::STRICT_FP_TO_UINT: {
      // Replace vector fp_to_s/uint with their X86 specific equivalent so we
      // don't need 2 sets of patterns.
      if (!N->getSimpleValueType(0).isVector())
        break;

      unsigned NewOpc;
      switch (N->getOpcode()) {
      default: llvm_unreachable("Unexpected opcode!");
      case ISD::FP_ROUND: NewOpc = X86ISD::VFPROUND; break;
      case ISD::STRICT_FP_ROUND: NewOpc = X86ISD::STRICT_VFPROUND; break;
      case ISD::STRICT_FP_TO_SINT: NewOpc = X86ISD::STRICT_CVTTP2SI; break;
      case ISD::FP_TO_SINT: NewOpc = X86ISD::CVTTP2SI; break;
      case ISD::STRICT_FP_TO_UINT: NewOpc = X86ISD::STRICT_CVTTP2UI; break;
      case ISD::FP_TO_UINT: NewOpc = X86ISD::CVTTP2UI; break;
      }
      SDValue Res;
      if (N->isStrictFPOpcode())
        Res =
            CurDAG->getNode(NewOpc, SDLoc(N), {N->getValueType(0), MVT::Other},
                            {N->getOperand(0), N->getOperand(1)});
      else
        Res =
            CurDAG->getNode(NewOpc, SDLoc(N), N->getValueType(0),
                            N->getOperand(0));
      --I;
      CurDAG->ReplaceAllUsesWith(N, Res.getNode());
      ++I;
      MadeChange = true;
      continue;
    }
    case ISD::SHL:
    case ISD::SRA:
    case ISD::SRL: {
      // Replace vector shifts with their X86 specific equivalent so we don't
      // need 2 sets of patterns.
      if (!N->getValueType(0).isVector())
        break;

      unsigned NewOpc;
      switch (N->getOpcode()) {
      default: llvm_unreachable("Unexpected opcode!");
      case ISD::SHL: NewOpc = X86ISD::VSHLV; break;
      case ISD::SRA: NewOpc = X86ISD::VSRAV; break;
      case ISD::SRL: NewOpc = X86ISD::VSRLV; break;
      }
      SDValue Res = CurDAG->getNode(NewOpc, SDLoc(N), N->getValueType(0),
                                    N->getOperand(0), N->getOperand(1));
      --I;
      CurDAG->ReplaceAllUsesOfValueWith(SDValue(N, 0), Res);
      ++I;
      MadeChange = true;
      continue;
    }
    case ISD::ANY_EXTEND:
    case ISD::ANY_EXTEND_VECTOR_INREG: {
      // Replace vector any extend with the zero extend equivalents so we don't
      // need 2 sets of patterns. Ignore vXi1 extensions.
      if (!N->getValueType(0).isVector())
        break;

      unsigned NewOpc;
      if (N->getOperand(0).getScalarValueSizeInBits() == 1) {
        assert(N->getOpcode() == ISD::ANY_EXTEND &&
               "Unexpected opcode for mask vector!");
        NewOpc = ISD::SIGN_EXTEND;
      } else {
        NewOpc = N->getOpcode() == ISD::ANY_EXTEND
                     ? ISD::ZERO_EXTEND
                     : ISD::ZERO_EXTEND_VECTOR_INREG;
      }

      SDValue Res = CurDAG->getNode(NewOpc, SDLoc(N), N->getValueType(0),
                                    N->getOperand(0));
      --I;
      CurDAG->ReplaceAllUsesOfValueWith(SDValue(N, 0), Res);
      ++I;
      MadeChange = true;
      continue;
    }
    case ISD::FCEIL:
    case ISD::STRICT_FCEIL:
    case ISD::FFLOOR:
    case ISD::STRICT_FFLOOR:
    case ISD::FTRUNC:
    case ISD::STRICT_FTRUNC:
    case ISD::FROUNDEVEN:
    case ISD::STRICT_FROUNDEVEN:
    case ISD::FNEARBYINT:
    case ISD::STRICT_FNEARBYINT:
    case ISD::FRINT:
    case ISD::STRICT_FRINT: {
      // Replace fp rounding with their X86 specific equivalent so we don't
      // need 2 sets of patterns.
      unsigned Imm;
      switch (N->getOpcode()) {
      default: llvm_unreachable("Unexpected opcode!");
      case ISD::STRICT_FCEIL:
      case ISD::FCEIL: Imm = 0xA; break;
      case ISD::STRICT_FFLOOR:
      case ISD::FFLOOR: Imm = 0x9; break;
      case ISD::STRICT_FTRUNC:
      case ISD::FTRUNC: Imm = 0xB; break;
      case ISD::STRICT_FROUNDEVEN:
      case ISD::FROUNDEVEN: Imm = 0x8; break;
      case ISD::STRICT_FNEARBYINT:
      case ISD::FNEARBYINT: Imm = 0xC; break;
      case ISD::STRICT_FRINT:
      case ISD::FRINT: Imm = 0x4; break;
      }
      SDLoc dl(N);
      bool IsStrict = N->isStrictFPOpcode();
      SDValue Res;
      if (IsStrict)
        Res = CurDAG->getNode(X86ISD::STRICT_VRNDSCALE, dl,
                              {N->getValueType(0), MVT::Other},
                              {N->getOperand(0), N->getOperand(1),
                               CurDAG->getTargetConstant(Imm, dl, MVT::i32)});
      else
        Res = CurDAG->getNode(X86ISD::VRNDSCALE, dl, N->getValueType(0),
                              N->getOperand(0),
                              CurDAG->getTargetConstant(Imm, dl, MVT::i32));
      --I;
      CurDAG->ReplaceAllUsesWith(N, Res.getNode());
      ++I;
      MadeChange = true;
      continue;
    }
    case X86ISD::FANDN:
    case X86ISD::FAND:
    case X86ISD::FOR:
    case X86ISD::FXOR: {
      // Widen scalar fp logic ops to vector to reduce isel patterns.
      // FIXME: Can we do this during lowering/combine.
      MVT VT = N->getSimpleValueType(0);
      if (VT.isVector() || VT == MVT::f128)
        break;

      MVT VecVT = VT == MVT::f64   ? MVT::v2f64
                  : VT == MVT::f32 ? MVT::v4f32
                                   : MVT::v8f16;

      SDLoc dl(N);
      SDValue Op0 = CurDAG->getNode(ISD::SCALAR_TO_VECTOR, dl, VecVT,
                                    N->getOperand(0));
      SDValue Op1 = CurDAG->getNode(ISD::SCALAR_TO_VECTOR, dl, VecVT,
                                    N->getOperand(1));

      SDValue Res;
      if (Subtarget->hasSSE2()) {
        EVT IntVT = EVT(VecVT).changeVectorElementTypeToInteger();
        Op0 = CurDAG->getNode(ISD::BITCAST, dl, IntVT, Op0);
        Op1 = CurDAG->getNode(ISD::BITCAST, dl, IntVT, Op1);
        unsigned Opc;
        switch (N->getOpcode()) {
        default: llvm_unreachable("Unexpected opcode!");
        case X86ISD::FANDN: Opc = X86ISD::ANDNP; break;
        case X86ISD::FAND: Opc = ISD::AND; break;
        case X86ISD::FOR: Opc = ISD::OR; break;
        case X86ISD::FXOR: Opc = ISD::XOR; break;
        }
        Res = CurDAG->getNode(Opc, dl, IntVT, Op0, Op1);
        Res = CurDAG->getNode(ISD::BITCAST, dl, VecVT, Res);
      } else {
        Res = CurDAG->getNode(N->getOpcode(), dl, VecVT, Op0, Op1);
      }
      Res = CurDAG->getNode(ISD::EXTRACT_VECTOR_ELT, dl, VT, Res,
                            CurDAG->getIntPtrConstant(0, dl));
      --I;
      CurDAG->ReplaceAllUsesOfValueWith(SDValue(N, 0), Res);
      ++I;
      MadeChange = true;
      continue;
    }
    }

    if (OptLevel != CodeGenOptLevel::None &&
        // Only do this when the target can fold the load into the call or
        // jmp.
        !Subtarget->useIndirectThunkCalls() &&
        ((N->getOpcode() == X86ISD::CALL && !Subtarget->slowTwoMemOps() &&
          !Subtarget->slowIndirectCall()) ||
         (N->getOpcode() == X86ISD::TC_RETURN &&
          (Subtarget->is64Bit() ||
           !getTargetMachine().isPositionIndependent())))) {
      /// Also try moving call address load from outside callseq_start to just
      /// before the call to allow it to be folded.
      ///
      ///     [Load chain]
      ///         ^
      ///         |
      ///       [Load]
      ///       ^    ^
      ///       |    |
      ///      /      \--
      ///     /          |
      ///[CALLSEQ_START] |
      ///     ^          |
      ///     |          |
      /// [LOAD/C2Reg]   |
      ///     |          |
      ///      \        /
      ///       \      /
      ///       [CALL]
      bool HasCallSeq = N->getOpcode() == X86ISD::CALL;
      SDValue Chain = N->getOperand(0);
      SDValue Load = N->getOperand(1);
      if (!isCalleeLoad(Load, Chain, HasCallSeq))
        continue;
      if (N->getOpcode() == X86ISD::TC_RETURN && !checkTCRetEnoughRegs(N))
        continue;
      moveBelowOrigChain(CurDAG, Load, SDValue(N, 0), Chain);
      ++NumLoadMoved;
      MadeChange = true;
      continue;
    }

    // Lower fpround and fpextend nodes that target the FP stack to be a store
    // and load to the stack. This is a gross hack. We would like to simply
    // mark these as being illegal, but when we do that, legalize produces
    // these when it expands calls, then expands these in the same legalize
    // pass. We would like dag combine to be able to hack on these between the
    // call expansion and the node legalization. As such this pass basically
    // does "really late" legalization of these inline with the X86 isel pass.
    // FIXME: This should only happen when not compiled with -O0.
    switch (N->getOpcode()) {
    default: continue;
    case ISD::FP_ROUND:
    case ISD::FP_EXTEND:
      {
        MVT SrcVT = N->getOperand(0).getSimpleValueType();
        MVT DstVT = N->getSimpleValueType(0);

        // If any of the sources are vectors, no fp stack involved.
        if (SrcVT.isVector() || DstVT.isVector())
          continue;

        // If the source and destination are SSE registers, then this is a legal
        // conversion that should not be lowered.
        const X86TargetLowering *X86Lowering =
            static_cast<const X86TargetLowering *>(TLI);
        bool SrcIsSSE = X86Lowering->isScalarFPTypeInSSEReg(SrcVT);
        bool DstIsSSE = X86Lowering->isScalarFPTypeInSSEReg(DstVT);
        if (SrcIsSSE && DstIsSSE)
          continue;

        if (!SrcIsSSE && !DstIsSSE) {
          // If this is an FPStack extension, it is a noop.
          if (N->getOpcode() == ISD::FP_EXTEND)
            continue;
          // If this is a value-preserving FPStack truncation, it is a noop.
          if (N->getConstantOperandVal(1))
            continue;
        }

        // Here we could have an FP stack truncation or an FPStack <-> SSE convert.
        // FPStack has extload and truncstore. SSE can fold direct loads into other
        // operations. Based on this, decide what we want to do.
        MVT MemVT = (N->getOpcode() == ISD::FP_ROUND) ? DstVT : SrcVT;
        SDValue MemTmp = CurDAG->CreateStackTemporary(MemVT);
        int SPFI = cast<FrameIndexSDNode>(MemTmp)->getIndex();
        MachinePointerInfo MPI =
            MachinePointerInfo::getFixedStack(CurDAG->getMachineFunction(), SPFI);
        SDLoc dl(N);

        // FIXME: optimize the case where the src/dest is a load or store?

        SDValue Store = CurDAG->getTruncStore(
            CurDAG->getEntryNode(), dl, N->getOperand(0), MemTmp, MPI, MemVT);
        SDValue Result = CurDAG->getExtLoad(ISD::EXTLOAD, dl, DstVT, Store,
                                            MemTmp, MPI, MemVT);

        // We're about to replace all uses of the FP_ROUND/FP_EXTEND with the
        // extload we created. This will cause general havoc on the dag because
        // anything below the conversion could be folded into other existing nodes.
        // To avoid invalidating 'I', back it up to the convert node.
        --I;
        CurDAG->ReplaceAllUsesOfValueWith(SDValue(N, 0), Result);
        break;
      }

    // The sequence of events for lowering STRICT_FP versions of these nodes
    // requires dealing with the chain differently, as there is already a
    // preexisting chain.
    case ISD::STRICT_FP_ROUND:
    case ISD::STRICT_FP_EXTEND:
      {
        MVT SrcVT = N->getOperand(1).getSimpleValueType();
        MVT DstVT = N->getSimpleValueType(0);

        // If any of the sources are vectors, no fp stack involved.
        if (SrcVT.isVector() || DstVT.isVector())
          continue;

        // If the source and destination are SSE registers, then this is a legal
        // conversion that should not be lowered.
        const X86TargetLowering *X86Lowering =
            static_cast<const X86TargetLowering *>(TLI);
        bool SrcIsSSE = X86Lowering->isScalarFPTypeInSSEReg(SrcVT);
        bool DstIsSSE = X86Lowering->isScalarFPTypeInSSEReg(DstVT);
        if (SrcIsSSE && DstIsSSE)
          continue;

        if (!SrcIsSSE && !DstIsSSE) {
          // If this is an FPStack extension, it is a noop.
          if (N->getOpcode() == ISD::STRICT_FP_EXTEND)
            continue;
          // If this is a value-preserving FPStack truncation, it is a noop.
          if (N->getConstantOperandVal(2))
            continue;
        }

        // Here we could have an FP stack truncation or an FPStack <-> SSE convert.
        // FPStack has extload and truncstore. SSE can fold direct loads into other
        // operations. Based on this, decide what we want to do.
        MVT MemVT = (N->getOpcode() == ISD::STRICT_FP_ROUND) ? DstVT : SrcVT;
        SDValue MemTmp = CurDAG->CreateStackTemporary(MemVT);
        int SPFI = cast<FrameIndexSDNode>(MemTmp)->getIndex();
        MachinePointerInfo MPI =
            MachinePointerInfo::getFixedStack(CurDAG->getMachineFunction(), SPFI);
        SDLoc dl(N);

        // FIXME: optimize the case where the src/dest is a load or store?

        // Since the operation is StrictFP, use the preexisting chain.
        SDValue Store, Result;
        if (!SrcIsSSE) {
          SDVTList VTs = CurDAG->getVTList(MVT::Other);
          SDValue Ops[] = {N->getOperand(0), N->getOperand(1), MemTmp};
          Store = CurDAG->getMemIntrinsicNode(X86ISD::FST, dl, VTs, Ops, MemVT,
                                              MPI, /*Align*/ std::nullopt,
                                              MachineMemOperand::MOStore);
          if (N->getFlags().hasNoFPExcept()) {
            SDNodeFlags Flags = Store->getFlags();
            Flags.setNoFPExcept(true);
            Store->setFlags(Flags);
          }
        } else {
          assert(SrcVT == MemVT && "Unexpected VT!");
          Store = CurDAG->getStore(N->getOperand(0), dl, N->getOperand(1), MemTmp,
                                   MPI);
        }

        if (!DstIsSSE) {
          SDVTList VTs = CurDAG->getVTList(DstVT, MVT::Other);
          SDValue Ops[] = {Store, MemTmp};
          Result = CurDAG->getMemIntrinsicNode(
              X86ISD::FLD, dl, VTs, Ops, MemVT, MPI,
              /*Align*/ std::nullopt, MachineMemOperand::MOLoad);
          if (N->getFlags().hasNoFPExcept()) {
            SDNodeFlags Flags = Result->getFlags();
            Flags.setNoFPExcept(true);
            Result->setFlags(Flags);
          }
        } else {
          assert(DstVT == MemVT && "Unexpected VT!");
          Result = CurDAG->getLoad(DstVT, dl, Store, MemTmp, MPI);
        }

        // We're about to replace all uses of the FP_ROUND/FP_EXTEND with the
        // extload we created. This will cause general havoc on the dag because
        // anything below the conversion could be folded into other existing nodes.
        // To avoid invalidating 'I', back it up to the convert node.
        --I;
        CurDAG->ReplaceAllUsesWith(N, Result.getNode());
        break;
      }
    }


    // Now that we did that, the node is dead. Increment the iterator to the
    // next node to process, then delete N.
    ++I;
    MadeChange = true;
  }

  // Remove any dead nodes that may have been left behind.
  if (MadeChange)
    CurDAG->RemoveDeadNodes();
}

// Look for a redundant movzx/movsx that can occur after an 8-bit divrem.
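// (An 8-bit DIV leaves the remainder in AH, which isel extracts with a
// MOVZX32rr8_NOREX/MOVSX32rr8_NOREX of the sub-register, so a second extend
// of that result is redundant.)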
bool X86DAGToDAGISel::tryOptimizeRem8Extend(SDNode *N) {
  unsigned Opc = N->getMachineOpcode();
  if (Opc != X86::MOVZX32rr8 && Opc != X86::MOVSX32rr8 &&
      Opc != X86::MOVSX64rr8)
    return false;

  SDValue N0 = N->getOperand(0);

  // We need to be extracting the low byte of an extend.
  if (!N0.isMachineOpcode() ||
      N0.getMachineOpcode() != TargetOpcode::EXTRACT_SUBREG ||
      N0.getConstantOperandVal(1) != X86::sub_8bit)
    return false;

  // We're looking for either a movsx or movzx to match the original opcode.
  unsigned ExpectedOpc = Opc == X86::MOVZX32rr8 ? X86::MOVZX32rr8_NOREX
                                                : X86::MOVSX32rr8_NOREX;
  SDValue N00 = N0.getOperand(0);
  if (!N00.isMachineOpcode() || N00.getMachineOpcode() != ExpectedOpc)
    return false;

  if (Opc == X86::MOVSX64rr8) {
    // We had a sign extend from 8 to 64 bits, so we still need to go from 32
    // to 64.
    MachineSDNode *Extend = CurDAG->getMachineNode(X86::MOVSX64rr32, SDLoc(N),
                                                   MVT::i64, N00);
    ReplaceUses(N, Extend);
  } else {
    // Ok we can drop this extend and just use the original extend.
    ReplaceUses(N, N00.getNode());
  }

  return true;
}

void X86DAGToDAGISel::PostprocessISelDAG() {
  // Skip peepholes at -O0.
  if (TM.getOptLevel() == CodeGenOptLevel::None)
    return;

  SelectionDAG::allnodes_iterator Position = CurDAG->allnodes_end();

  bool MadeChange = false;
  while (Position != CurDAG->allnodes_begin()) {
    SDNode *N = &*--Position;
    // Skip dead nodes and any non-machine opcodes.
    if (N->use_empty() || !N->isMachineOpcode())
      continue;

    if (tryOptimizeRem8Extend(N)) {
      MadeChange = true;
      continue;
    }

    unsigned Opc = N->getMachineOpcode();
    switch (Opc) {
    default:
      continue;
    // ANDrr/rm + TESTrr -> TESTrr/TESTmr
1628 case X86::TEST8rr:
1629 case X86::TEST16rr:
1630 case X86::TEST32rr:
1631 case X86::TEST64rr:
1632 // ANDrr/rm + CTESTrr -> CTESTrr/CTESTmr
1633 case X86::CTEST8rr:
1634 case X86::CTEST16rr:
1635 case X86::CTEST32rr:
1636 case X86::CTEST64rr: {
1637 auto &Op0 = N->getOperand(0);
1638 if (Op0 != N->getOperand(1) || !Op0->hasNUsesOfValue(2, Op0.getResNo()) ||
1639 !Op0.isMachineOpcode())
1640 continue;
1641 SDValue And = N->getOperand(0);
1642#define CASE_ND(OP) \
1643 case X86::OP: \
1644 case X86::OP##_ND:
1645 switch (And.getMachineOpcode()) {
1646 default:
1647 continue;
1648 CASE_ND(AND8rr)
1649 CASE_ND(AND16rr)
1650 CASE_ND(AND32rr)
1651 CASE_ND(AND64rr) {
1652 if (And->hasAnyUseOfValue(1))
1653 continue;
1654 SmallVector<SDValue> Ops(N->op_values());
1655 Ops[0] = And.getOperand(0);
1656 Ops[1] = And.getOperand(1);
1657 MachineSDNode *Test =
1658 CurDAG->getMachineNode(Opc, SDLoc(N), MVT::i32, Ops);
1659 ReplaceUses(N, Test);
1660 MadeChange = true;
1661 continue;
1662 }
1663 CASE_ND(AND8rm)
1664 CASE_ND(AND16rm)
1665 CASE_ND(AND32rm)
1666 CASE_ND(AND64rm) {
1667 if (And->hasAnyUseOfValue(1))
1668 continue;
1669 unsigned NewOpc;
1670 bool IsCTESTCC = X86::isCTESTCC(Opc);
1671#define FROM_TO(A, B) \
1672 CASE_ND(A) NewOpc = IsCTESTCC ? X86::C##B : X86::B; \
1673 break;
1674 switch (And.getMachineOpcode()) {
1675 FROM_TO(AND8rm, TEST8mr);
1676 FROM_TO(AND16rm, TEST16mr);
1677 FROM_TO(AND32rm, TEST32mr);
1678 FROM_TO(AND64rm, TEST64mr);
1679 }
1680#undef FROM_TO
1681#undef CASE_ND
1682 // Need to swap the memory and register operand.
1683 SmallVector<SDValue> Ops = {And.getOperand(1), And.getOperand(2),
1684 And.getOperand(3), And.getOperand(4),
1685 And.getOperand(5), And.getOperand(0)};
1686 // CC, Cflags.
1687 if (IsCTESTCC) {
1688 Ops.push_back(N->getOperand(2));
1689 Ops.push_back(N->getOperand(3));
1690 }
1691 // Chain of memory load
1692 Ops.push_back(And.getOperand(6));
1693 // Glue
1694 if (IsCTESTCC)
1695 Ops.push_back(N->getOperand(4));
1696
1697 MachineSDNode *Test = CurDAG->getMachineNode(
1698 NewOpc, SDLoc(N), MVT::i32, MVT::Other, Ops);
1699 CurDAG->setNodeMemRefs(
1700 Test, cast<MachineSDNode>(And.getNode())->memoperands());
1701 ReplaceUses(And.getValue(2), SDValue(Test, 1));
1702 ReplaceUses(SDValue(N, 0), SDValue(Test, 0));
1703 MadeChange = true;
1704 continue;
1705 }
1706 }
1707 }
1708 // Look for a KAND+KORTEST and turn it into KTEST if only the zero flag is
1709 // used. We're doing this late so we can prefer to fold the AND into masked
1710 // comparisons. Doing that can be better for the live range of the mask
1711 // register.
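// Illustrative sketch: with k = (KANDWkk a, b), a (KORTESTWkk k, k) whose
// flag result is only consumed for ZF can be rewritten as (KTESTWkk a, b),
// letting the KAND die.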
1712 case X86::KORTESTBkk:
1713 case X86::KORTESTWkk:
1714 case X86::KORTESTDkk:
1715 case X86::KORTESTQkk: {
1716 SDValue Op0 = N->getOperand(0);
1717 if (Op0 != N->getOperand(1) || !N->isOnlyUserOf(Op0.getNode()) ||
1718 !Op0.isMachineOpcode() || !onlyUsesZeroFlag(SDValue(N, 0)))
1719 continue;
1720#define CASE(A) \
1721 case X86::A: \
1722 break;
1723 switch (Op0.getMachineOpcode()) {
1724 default:
1725 continue;
1726 CASE(KANDBkk)
1727 CASE(KANDWkk)
1728 CASE(KANDDkk)
1729 CASE(KANDQkk)
1730 }
1731 unsigned NewOpc;
1732#define FROM_TO(A, B) \
1733 case X86::A: \
1734 NewOpc = X86::B; \
1735 break;
1736 switch (Opc) {
1737 FROM_TO(KORTESTBkk, KTESTBkk)
1738 FROM_TO(KORTESTWkk, KTESTWkk)
1739 FROM_TO(KORTESTDkk, KTESTDkk)
1740 FROM_TO(KORTESTQkk, KTESTQkk)
1741 }
1742 // KANDW is legal with AVX512F, but KTESTW requires AVX512DQ. The other
1743 // KAND instructions and KTEST use the same ISA feature.
1744 if (NewOpc == X86::KTESTWkk && !Subtarget->hasDQI())
1745 continue;
1746#undef FROM_TO
1747 MachineSDNode *KTest = CurDAG->getMachineNode(
1748 NewOpc, SDLoc(N), MVT::i32, Op0.getOperand(0), Op0.getOperand(1));
1749 ReplaceUses(N, KTest);
1750 MadeChange = true;
1751 continue;
1752 }
1753 // Attempt to remove vector moves that were inserted to zero upper bits.
1754 case TargetOpcode::SUBREG_TO_REG: {
1755 unsigned SubRegIdx = N->getConstantOperandVal(1);
1756 if (SubRegIdx != X86::sub_xmm && SubRegIdx != X86::sub_ymm)
1757 continue;
1758
1759 SDValue Move = N->getOperand(0);
1760 if (!Move.isMachineOpcode())
1761 continue;
1762
1763 // Make sure it's one of the move opcodes we recognize.
1764 switch (Move.getMachineOpcode()) {
1765 default:
1766 continue;
1767 CASE(VMOVAPDrr) CASE(VMOVUPDrr)
1768 CASE(VMOVAPSrr) CASE(VMOVUPSrr)
1769 CASE(VMOVDQArr) CASE(VMOVDQUrr)
1770 CASE(VMOVAPDYrr) CASE(VMOVUPDYrr)
1771 CASE(VMOVAPSYrr) CASE(VMOVUPSYrr)
1772 CASE(VMOVDQAYrr) CASE(VMOVDQUYrr)
1773 CASE(VMOVAPDZ128rr) CASE(VMOVUPDZ128rr)
1774 CASE(VMOVAPSZ128rr) CASE(VMOVUPSZ128rr)
1775 CASE(VMOVDQA32Z128rr) CASE(VMOVDQU32Z128rr)
1776 CASE(VMOVDQA64Z128rr) CASE(VMOVDQU64Z128rr)
1777 CASE(VMOVAPDZ256rr) CASE(VMOVUPDZ256rr)
1778 CASE(VMOVAPSZ256rr) CASE(VMOVUPSZ256rr)
1779 CASE(VMOVDQA32Z256rr) CASE(VMOVDQU32Z256rr)
1780 CASE(VMOVDQA64Z256rr) CASE(VMOVDQU64Z256rr)
1781 }
1782#undef CASE
1783
1784 SDValue In = Move.getOperand(0);
1785 if (!In.isMachineOpcode() ||
1786 In.getMachineOpcode() <= TargetOpcode::GENERIC_OP_END)
1787 continue;
1788
1789 // Make sure the instruction has a VEX, XOP, or EVEX prefix. This excludes
1790 // the SHA instructions, which use a legacy encoding.
1791 uint64_t TSFlags = getInstrInfo()->get(In.getMachineOpcode()).TSFlags;
1792 if ((TSFlags & X86II::EncodingMask) != X86II::VEX &&
1793 (TSFlags & X86II::EncodingMask) != X86II::EVEX &&
1794 (TSFlags & X86II::EncodingMask) != X86II::XOP)
1795 continue;
1796
1797 // Producing instruction is another vector instruction. We can drop the
1798 // move.
1799 CurDAG->UpdateNodeOperands(N, In, N->getOperand(1));
1800 MadeChange = true;
1801 }
1802 }
1803 }
1804
1805 if (MadeChange)
1806 CurDAG->RemoveDeadNodes();
1807}
1808
1809
1810/// Emit any code that needs to be executed only in the main function.
1811void X86DAGToDAGISel::emitSpecialCodeForMain() {
1812 if (Subtarget->isTargetCygMing()) {
1813 TargetLowering::ArgListTy Args;
1814 auto &DL = CurDAG->getDataLayout();
1815
1816 TargetLowering::CallLoweringInfo CLI(*CurDAG);
1817 CLI.setChain(CurDAG->getRoot())
1818 .setCallee(CallingConv::C, Type::getVoidTy(*CurDAG->getContext()),
1819 CurDAG->getExternalSymbol("__main", TLI->getPointerTy(DL)),
1820 std::move(Args));
1821 const TargetLowering &TLI = CurDAG->getTargetLoweringInfo();
1822 std::pair<SDValue, SDValue> Result = TLI.LowerCallTo(CLI);
1823 CurDAG->setRoot(Result.second);
1824 }
1825}
1826
1827void X86DAGToDAGISel::emitFunctionEntryCode() {
1828 // If this is main, emit special code for main.
1829 const Function &F = MF->getFunction();
1830 if (F.hasExternalLinkage() && F.getName() == "main")
1831 emitSpecialCodeForMain();
1832}
1833
1834static bool isDispSafeForFrameIndexOrRegBase(int64_t Val) {
1835 // We can run into an issue where a frame index or a register base
1836 // includes a displacement that, when added to the explicit displacement,
1837 // will overflow the displacement field. Assuming that the
1838 // displacement fits into a 31-bit integer (which is only slightly more
1839 // aggressive than the current fundamental assumption that it fits into
1840 // a 32-bit integer), a 31-bit disp should always be safe.
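 // Worked example (illustrative): two values that each pass isInt<31> are at
 // most 0x3fffffff, so their sum is at most 0x7ffffffe and still fits the
 // signed 32-bit displacement field without wrapping.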
1841 return isInt<31>(Val);
1842}
1843
1844bool X86DAGToDAGISel::foldOffsetIntoAddress(uint64_t Offset,
1845 X86ISelAddressMode &AM) {
1846 // We may have already matched a displacement and the caller just added the
1847 // symbolic displacement. So we still need to do the checks even if Offset
1848 // is zero.
1849
1850 int64_t Val = AM.Disp + Offset;
1851
1852 // Cannot combine ExternalSymbol displacements with integer offsets.
1853 if (Val != 0 && (AM.ES || AM.MCSym))
1854 return true;
1855
1856 CodeModel::Model M = TM.getCodeModel();
1857 if (Subtarget->is64Bit()) {
1858 if (Val != 0 &&
1859 !X86::isOffsetSuitableForCodeModel(Val, M,
1860 AM.hasSymbolicDisplacement()))
1861 return true;
1862 // In addition to the checks required for a register base, check that
1863 // we do not try to use an unsafe Disp with a frame index.
1864 if (AM.BaseType == X86ISelAddressMode::FrameIndexBase &&
1865 !isDispSafeForFrameIndexOrRegBase(Val))
1866 return true;
1867 // In ILP32 (x32) mode, pointers are 32 bits and need to be zero-extended to
1868 // 64 bits. Instructions with 32-bit register addresses perform this zero
1869 // extension for us and we can safely ignore the high bits of Offset.
1870 // Instructions with only a 32-bit immediate address do not, though: they
1871 // sign extend instead. This means only the low 2GB of the address space
1872 // is directly addressable; we need indirect addressing for the high 2GB of
1873 // address space.
1874 // TODO: Some of the earlier checks may be relaxed for ILP32 mode as the
1875 // implicit zero extension of instructions would cover up any problem.
1876 // However, we have asserts elsewhere that get triggered if we do, so keep
1877 // the checks for now.
1878 // TODO: We would actually be able to accept these, as well as the same
1879 // addresses in LP64 mode, by adding the EIZ pseudo-register as an operand
1880 // to get an address size override to be emitted. However, this
1881 // pseudo-register is not part of any register class and therefore causes
1882 // MIR verification to fail.
1883 if (Subtarget->isTarget64BitILP32() &&
1884 !isDispSafeForFrameIndexOrRegBase((uint32_t)Val) &&
1885 !AM.hasBaseOrIndexReg())
1886 return true;
1887 } else if (Subtarget->is16Bit()) {
1888 // In 16-bit mode, displacements are limited to [-65535,65535] for FK_Data_2
1889 // fixups of unknown signedness. See X86AsmBackend::applyFixup.
1890 if (Val < -(int64_t)UINT16_MAX || Val > (int64_t)UINT16_MAX)
1891 return true;
1892 } else if (AM.hasBaseOrIndexReg() && !isDispSafeForFrameIndexOrRegBase(Val))
1893 // For 32-bit X86, make sure the displacement still isn't close to the
1894 // expressible limit.
1895 return true;
1896 AM.Disp = Val;
1897 return false;
1898}
1899
1900bool X86DAGToDAGISel::matchLoadInAddress(LoadSDNode *N, X86ISelAddressMode &AM,
1901 bool AllowSegmentRegForX32) {
1902 SDValue Address = N->getOperand(1);
1903
1904 // load gs:0 -> GS segment register.
1905 // load fs:0 -> FS segment register.
1906 //
1907 // This optimization is generally valid because the GNU TLS model defines that
1908 // gs:0 (or fs:0 on X86-64) contains its own address. However, for X86-64 mode
1909 // with 32-bit registers, as we get in ILP32 mode, those registers are first
1910 // zero-extended to 64 bits and then added to the base address, which gives
1911 // unwanted results when the register holds a negative value.
1912 // For more information see http://people.redhat.com/drepper/tls.pdf
1913 if (isNullConstant(Address) && AM.Segment.getNode() == nullptr &&
1914 !IndirectTlsSegRefs &&
1915 (Subtarget->isTargetGlibc() || Subtarget->isTargetMusl() ||
1916 Subtarget->isTargetAndroid() || Subtarget->isTargetFuchsia())) {
1917 if (Subtarget->isTarget64BitILP32() && !AllowSegmentRegForX32)
1918 return true;
1919 switch (N->getPointerInfo().getAddrSpace()) {
1920 case X86AS::GS:
1921 AM.Segment = CurDAG->getRegister(X86::GS, MVT::i16);
1922 return false;
1923 case X86AS::FS:
1924 AM.Segment = CurDAG->getRegister(X86::FS, MVT::i16);
1925 return false;
1926 // Address space X86AS::SS is not handled here, because it is not used to
1927 // address TLS areas.
1928 }
1929 }
1930
1931 return true;
1932}
1933
1934/// Try to match X86ISD::Wrapper and X86ISD::WrapperRIP nodes into an addressing
1935/// mode. These wrap things that will resolve down into a symbol reference.
1936/// If no match is possible, this returns true, otherwise it returns false.
1937bool X86DAGToDAGISel::matchWrapper(SDValue N, X86ISelAddressMode &AM) {
1938 // If the addressing mode already has a symbol as the displacement, we can
1939 // never match another symbol.
1940 if (AM.hasSymbolicDisplacement())
1941 return true;
1942
1943 bool IsRIPRelTLS = false;
1944 bool IsRIPRel = N.getOpcode() == X86ISD::WrapperRIP;
1945 if (IsRIPRel) {
1946 SDValue Val = N.getOperand(0);
1947 if (Val.getOpcode() == ISD::TargetGlobalTLSAddress)
1948 IsRIPRelTLS = true;
1949 }
1950
1951 // We can't use an addressing mode in the 64-bit large code model.
1952 // Global TLS addressing is an exception. In the medium code model,
1953 // we can use a mode when RIP wrappers are present.
1954 // That signifies access to globals that are known to be "near",
1955 // such as the GOT itself.
1956 CodeModel::Model M = TM.getCodeModel();
1957 if (Subtarget->is64Bit() && M == CodeModel::Large && !IsRIPRelTLS)
1958 return true;
1959
1960 // Base and index reg must be 0 in order to use %rip as base.
1961 if (IsRIPRel && AM.hasBaseOrIndexReg())
1962 return true;
1963
1964 // Make a local copy in case we can't do this fold.
1965 X86ISelAddressMode Backup = AM;
1966
1967 int64_t Offset = 0;
1968 SDValue N0 = N.getOperand(0);
1969 if (auto *G = dyn_cast<GlobalAddressSDNode>(N0)) {
1970 AM.GV = G->getGlobal();
1971 AM.SymbolFlags = G->getTargetFlags();
1972 Offset = G->getOffset();
1973 } else if (auto *CP = dyn_cast<ConstantPoolSDNode>(N0)) {
1974 AM.CP = CP->getConstVal();
1975 AM.Alignment = CP->getAlign();
1976 AM.SymbolFlags = CP->getTargetFlags();
1977 Offset = CP->getOffset();
1978 } else if (auto *S = dyn_cast<ExternalSymbolSDNode>(N0)) {
1979 AM.ES = S->getSymbol();
1980 AM.SymbolFlags = S->getTargetFlags();
1981 } else if (auto *S = dyn_cast<MCSymbolSDNode>(N0)) {
1982 AM.MCSym = S->getMCSymbol();
1983 } else if (auto *J = dyn_cast<JumpTableSDNode>(N0)) {
1984 AM.JT = J->getIndex();
1985 AM.SymbolFlags = J->getTargetFlags();
1986 } else if (auto *BA = dyn_cast<BlockAddressSDNode>(N0)) {
1987 AM.BlockAddr = BA->getBlockAddress();
1988 AM.SymbolFlags = BA->getTargetFlags();
1989 Offset = BA->getOffset();
1990 } else
1991 llvm_unreachable("Unhandled symbol reference node.");
1992
1993 // Can't use an addressing mode with large globals.
1994 if (Subtarget->is64Bit() && !IsRIPRel && AM.GV &&
1995 TM.isLargeGlobalValue(AM.GV)) {
1996 AM = Backup;
1997 return true;
1998 }
1999
2000 if (foldOffsetIntoAddress(Offset, AM)) {
2001 AM = Backup;
2002 return true;
2003 }
2004
2005 if (IsRIPRel)
2006 AM.setBaseReg(CurDAG->getRegister(X86::RIP, MVT::i64));
2007
2008 // Commit the changes now that we know this fold is safe.
2009 return false;
2010}
2011
2012/// Add the specified node to the specified addressing mode, returning true if
2013/// it cannot be done. This just pattern matches for the addressing mode.
2014bool X86DAGToDAGISel::matchAddress(SDValue N, X86ISelAddressMode &AM) {
2015 if (matchAddressRecursively(N, AM, 0))
2016 return true;
2017
2018 // Post-processing: Make a second attempt to fold a load, if we now know
2019 // that there will not be any other register. This is only performed for
2020 // 64-bit ILP32 mode since 32-bit mode and 64-bit LP64 mode will have folded
2021 // any foldable load the first time.
2022 if (Subtarget->isTarget64BitILP32() &&
2023 AM.BaseType == X86ISelAddressMode::RegBase &&
2024 AM.Base_Reg.getNode() != nullptr && AM.IndexReg.getNode() == nullptr) {
2025 SDValue Save_Base_Reg = AM.Base_Reg;
2026 if (auto *LoadN = dyn_cast<LoadSDNode>(Save_Base_Reg)) {
2027 AM.Base_Reg = SDValue();
2028 if (matchLoadInAddress(LoadN, AM, /*AllowSegmentRegForX32=*/true))
2029 AM.Base_Reg = Save_Base_Reg;
2030 }
2031 }
2032
2033 // Post-processing: Convert lea(,%reg,2) to lea(%reg,%reg), which has
2034 // a smaller encoding and avoids a scaled index.
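 // Illustrative sketch: "leal (,%ecx,2), %eax" becomes
 // "leal (%ecx,%ecx), %eax"; dropping the scaled index avoids the disp32 that
 // a base-less SIB encoding requires.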
2035 if (AM.Scale == 2 &&
2036 AM.BaseType == X86ISelAddressMode::RegBase &&
2037 AM.Base_Reg.getNode() == nullptr) {
2038 AM.Base_Reg = AM.IndexReg;
2039 AM.Scale = 1;
2040 }
2041
2042 // Post-processing: Convert foo to foo(%rip), even in non-PIC mode,
2043 // because it has a smaller encoding.
2044 if (TM.getCodeModel() != CodeModel::Large &&
2045 (!AM.GV || !TM.isLargeGlobalValue(AM.GV)) && Subtarget->is64Bit() &&
2046 AM.Scale == 1 && AM.BaseType == X86ISelAddressMode::RegBase &&
2047 AM.Base_Reg.getNode() == nullptr && AM.IndexReg.getNode() == nullptr &&
2048 AM.SymbolFlags == X86II::MO_NO_FLAG && AM.hasSymbolicDisplacement()) {
2049 // However, when GV is a local function symbol and in the same section as
2050 // the current instruction, and AM.Disp is negative and near INT32_MIN,
2051 // referencing GV+Disp generates a relocation referencing the section symbol
2052 // with an even smaller offset, which might underflow. We should bail out if
2053 // the negative offset is too close to INT32_MIN. Actually, we are more
2054 // conservative here, using a smaller magic number also used by
2055 // isOffsetSuitableForCodeModel.
2056 if (isa_and_nonnull<Function>(AM.GV) && AM.Disp < -16 * 1024 * 1024)
2057 return true;
2058
2059 AM.Base_Reg = CurDAG->getRegister(X86::RIP, MVT::i64);
2060 }
2061
2062 return false;
2063}
2064
2065bool X86DAGToDAGISel::matchAdd(SDValue &N, X86ISelAddressMode &AM,
2066 unsigned Depth) {
2067 // Add an artificial use to this node so that we can keep track of
2068 // it if it gets CSE'd with a different node.
2069 HandleSDNode Handle(N);
2070
2071 X86ISelAddressMode Backup = AM;
2072 if (!matchAddressRecursively(N.getOperand(0), AM, Depth+1) &&
2073 !matchAddressRecursively(Handle.getValue().getOperand(1), AM, Depth+1))
2074 return false;
2075 AM = Backup;
2076
2077 // Try again after commuting the operands.
2078 if (!matchAddressRecursively(Handle.getValue().getOperand(1), AM,
2079 Depth + 1) &&
2080 !matchAddressRecursively(Handle.getValue().getOperand(0), AM, Depth + 1))
2081 return false;
2082 AM = Backup;
2083
2084 // If we couldn't fold both operands into the address at the same time,
2085 // see if we can just put each operand into a register and fold at least
2086 // the add.
2087 if (AM.BaseType == X86ISelAddressMode::RegBase &&
2088 !AM.Base_Reg.getNode() &&
2089 !AM.IndexReg.getNode()) {
2090 N = Handle.getValue();
2091 AM.Base_Reg = N.getOperand(0);
2092 AM.IndexReg = N.getOperand(1);
2093 AM.Scale = 1;
2094 return false;
2095 }
2096 N = Handle.getValue();
2097 return true;
2098}
2099
2100// Insert a node into the DAG at least before the Pos node's position. This
2101// will reposition the node as needed, and will assign it a node ID that is <=
2102// the Pos node's ID. Note that this does *not* preserve the uniqueness of node
2103// IDs! The selection DAG must no longer depend on their uniqueness when this
2104// is used.
2105static void insertDAGNode(SelectionDAG &DAG, SDValue Pos, SDValue N) {
2106 if (N->getNodeId() == -1 ||
2107 (SelectionDAGISel::getUninvalidatedNodeId(N.getNode()) >
2108 SelectionDAGISel::getUninvalidatedNodeId(Pos.getNode()))) {
2109 DAG.RepositionNode(Pos->getIterator(), N.getNode());
2110 // Mark Node as invalid for pruning as after this it may be a successor to a
2111 // selected node but otherwise be in the same position as Pos.
2112 // Conservatively mark it with the same -abs(Id) to assure node id
2113 // invariant is preserved.
2114 N->setNodeId(Pos->getNodeId());
2115 SelectionDAGISel::InvalidateNodeId(N.getNode());
2116 }
2117}
2118
2119// Transform "(X >> (8-C1)) & (0xff << C1)" to "((X >> 8) & 0xff) << C1" if
2120// safe. This allows us to convert the shift and and into an h-register
2121// extract and a scaled index. Returns false if the simplification is
2122// performed.
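// Illustrative sketch: with C1 == 2, "(X >> 6) & 0x3fc" becomes
// "((X >> 8) & 0xff) << 2", i.e. an AH-style byte extract used as an index
// with scale 4.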
2123 static bool foldMaskAndShiftToExtract(SelectionDAG &DAG, SDValue N,
2124 uint64_t Mask,
2125 SDValue Shift, SDValue X,
2126 X86ISelAddressMode &AM) {
2127 if (Shift.getOpcode() != ISD::SRL ||
2128 !isa<ConstantSDNode>(Shift.getOperand(1)) ||
2129 !Shift.hasOneUse())
2130 return true;
2131
2132 int ScaleLog = 8 - Shift.getConstantOperandVal(1);
2133 if (ScaleLog <= 0 || ScaleLog >= 4 ||
2134 Mask != (0xffu << ScaleLog))
2135 return true;
2136
2137 MVT XVT = X.getSimpleValueType();
2138 MVT VT = N.getSimpleValueType();
2139 SDLoc DL(N);
2140 SDValue Eight = DAG.getConstant(8, DL, MVT::i8);
2141 SDValue NewMask = DAG.getConstant(0xff, DL, XVT);
2142 SDValue Srl = DAG.getNode(ISD::SRL, DL, XVT, X, Eight);
2143 SDValue And = DAG.getNode(ISD::AND, DL, XVT, Srl, NewMask);
2144 SDValue Ext = DAG.getZExtOrTrunc(And, DL, VT);
2145 SDValue ShlCount = DAG.getConstant(ScaleLog, DL, MVT::i8);
2146 SDValue Shl = DAG.getNode(ISD::SHL, DL, VT, Ext, ShlCount);
2147
2148 // Insert the new nodes into the topological ordering. We must do this in
2149 // a valid topological ordering as nothing is going to go back and re-sort
2150 // these nodes. We continually insert before 'N' in sequence as this is
2151 // essentially a pre-flattened and pre-sorted sequence of nodes. There is no
2152 // hierarchy left to express.
2153 insertDAGNode(DAG, N, Eight);
2154 insertDAGNode(DAG, N, NewMask);
2155 insertDAGNode(DAG, N, Srl);
2156 insertDAGNode(DAG, N, And);
2157 insertDAGNode(DAG, N, Ext);
2158 insertDAGNode(DAG, N, ShlCount);
2159 insertDAGNode(DAG, N, Shl);
2160 DAG.ReplaceAllUsesWith(N, Shl);
2161 DAG.RemoveDeadNode(N.getNode());
2162 AM.IndexReg = Ext;
2163 AM.Scale = (1 << ScaleLog);
2164 return false;
2165}
2166
2167// Transforms "(X << C1) & C2" to "(X & (C2>>C1)) << C1" if safe and if this
2168// allows us to fold the shift into this addressing mode. Returns false if the
2169// transform succeeded.
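// Illustrative sketch: "(X << 2) & 0x3fc" becomes "(X & 0xff) << 2"; the shl
// is then absorbed into the addressing mode as scale 4, and the narrower
// mask may get a shorter immediate encoding.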
2170 static bool foldMaskedShiftToScaledMask(SelectionDAG &DAG, SDValue N,
2171 X86ISelAddressMode &AM) {
2172 SDValue Shift = N.getOperand(0);
2173
2174 // Use a signed mask so that shifting right will insert sign bits. These
2175 // bits will be removed when we shift the result left so it doesn't matter
2176 // what we use. This might allow a smaller immediate encoding.
2177 int64_t Mask = cast<ConstantSDNode>(N->getOperand(1))->getSExtValue();
2178
2179 // If we have an any_extend feeding the AND, look through it to see if there
2180 // is a shift behind it. But only if the AND doesn't use the extended bits.
2181 // FIXME: Generalize this to other ANY_EXTEND than i32 to i64?
2182 bool FoundAnyExtend = false;
2183 if (Shift.getOpcode() == ISD::ANY_EXTEND && Shift.hasOneUse() &&
2184 Shift.getOperand(0).getSimpleValueType() == MVT::i32 &&
2185 isUInt<32>(Mask)) {
2186 FoundAnyExtend = true;
2187 Shift = Shift.getOperand(0);
2188 }
2189
2190 if (Shift.getOpcode() != ISD::SHL ||
2191 !isa<ConstantSDNode>(Shift.getOperand(1)))
2192 return true;
2193
2194 SDValue X = Shift.getOperand(0);
2195
2196 // Not likely to be profitable if either the AND or SHIFT node has more
2197 // than one use (unless all uses are for address computation). Besides,
2198 // the isel mechanism requires their node IDs to be reused.
2199 if (!N.hasOneUse() || !Shift.hasOneUse())
2200 return true;
2201
2202 // Verify that the shift amount is something we can fold.
2203 unsigned ShiftAmt = Shift.getConstantOperandVal(1);
2204 if (ShiftAmt != 1 && ShiftAmt != 2 && ShiftAmt != 3)
2205 return true;
2206
2207 MVT VT = N.getSimpleValueType();
2208 SDLoc DL(N);
2209 if (FoundAnyExtend) {
2210 SDValue NewX = DAG.getNode(ISD::ANY_EXTEND, DL, VT, X);
2211 insertDAGNode(DAG, N, NewX);
2212 X = NewX;
2213 }
2214
2215 SDValue NewMask = DAG.getSignedConstant(Mask >> ShiftAmt, DL, VT);
2216 SDValue NewAnd = DAG.getNode(ISD::AND, DL, VT, X, NewMask);
2217 SDValue NewShift = DAG.getNode(ISD::SHL, DL, VT, NewAnd, Shift.getOperand(1));
2218
2219 // Insert the new nodes into the topological ordering. We must do this in
2220 // a valid topological ordering as nothing is going to go back and re-sort
2221 // these nodes. We continually insert before 'N' in sequence as this is
2222 // essentially a pre-flattened and pre-sorted sequence of nodes. There is no
2223 // hierarchy left to express.
2224 insertDAGNode(DAG, N, NewMask);
2225 insertDAGNode(DAG, N, NewAnd);
2226 insertDAGNode(DAG, N, NewShift);
2227 DAG.ReplaceAllUsesWith(N, NewShift);
2228 DAG.RemoveDeadNode(N.getNode());
2229
2230 AM.Scale = 1 << ShiftAmt;
2231 AM.IndexReg = NewAnd;
2232 return false;
2233}
2234
2235// Implement some heroics to detect shifts of masked values where the mask can
2236// be replaced by extending the shift and undoing that in the addressing mode
2237// scale. Patterns such as (shl (srl x, c1), c2) are canonicalized into (and
2238// (srl x, SHIFT), MASK) by DAGCombines that don't know the shl can be done in
2239// the addressing mode. This results in code such as:
2240//
2241// int f(short *y, int *lookup_table) {
2242// ...
2243// return *y + lookup_table[*y >> 11];
2244// }
2245//
2246// Turning into:
2247// movzwl (%rdi), %eax
2248// movl %eax, %ecx
2249// shrl $11, %ecx
2250// addl (%rsi,%rcx,4), %eax
2251//
2252// Instead of:
2253// movzwl (%rdi), %eax
2254// movl %eax, %ecx
2255// shrl $9, %ecx
2256// andl $124, %rcx
2257// addl (%rsi,%rcx), %eax
2258//
2259// Note that this function assumes the mask is provided as a mask *after* the
2260// value is shifted. The input chain may or may not match that, but computing
2261// such a mask is trivial.
2262 static bool foldMaskAndShiftToScale(SelectionDAG &DAG, SDValue N,
2263 uint64_t Mask,
2264 SDValue Shift, SDValue X,
2265 X86ISelAddressMode &AM) {
2266 if (Shift.getOpcode() != ISD::SRL || !Shift.hasOneUse() ||
2267 !isa<ConstantSDNode>(Shift.getOperand(1)))
2268 return true;
2269
2270 // We need to ensure that the mask is a contiguous run of bits.
2271 unsigned MaskIdx, MaskLen;
2272 if (!isShiftedMask_64(Mask, MaskIdx, MaskLen))
2273 return true;
2274 unsigned MaskLZ = 64 - (MaskIdx + MaskLen);
2275
2276 unsigned ShiftAmt = Shift.getConstantOperandVal(1);
2277
2278 // The amount of shift we're trying to fit into the addressing mode is taken
2279 // from the shifted mask index (number of trailing zeros of the mask).
2280 unsigned AMShiftAmt = MaskIdx;
2281
2282 // There is nothing we can do here unless the mask is removing some bits.
2283 // Also, the addressing mode can only represent shifts of 1, 2, or 3 bits.
2284 if (AMShiftAmt == 0 || AMShiftAmt > 3) return true;
2285
2286 // Scale the leading zero count down based on the actual size of the value.
2287 // Also scale it down based on the size of the shift.
2288 unsigned ScaleDown = (64 - X.getSimpleValueType().getSizeInBits()) + ShiftAmt;
2289 if (MaskLZ < ScaleDown)
2290 return true;
2291 MaskLZ -= ScaleDown;
2292
2293 // The final check is to ensure that any masked out high bits of X are
2294 // already known to be zero. Otherwise, the mask has a semantic impact
2295 // other than masking out a couple of low bits. Unfortunately, because of
2296 // the mask, zero extensions will be removed from operands in some cases.
2297 // This code works extra hard to look through extensions because we can
2298 // replace them with zero extensions cheaply if necessary.
2299 bool ReplacingAnyExtend = false;
2300 if (X.getOpcode() == ISD::ANY_EXTEND) {
2301 unsigned ExtendBits = X.getSimpleValueType().getSizeInBits() -
2302 X.getOperand(0).getSimpleValueType().getSizeInBits();
2303 // Assume that we'll replace the any-extend with a zero-extend, and
2304 // narrow the search to the extended value.
2305 X = X.getOperand(0);
2306 MaskLZ = ExtendBits > MaskLZ ? 0 : MaskLZ - ExtendBits;
2307 ReplacingAnyExtend = true;
2308 }
2309 APInt MaskedHighBits =
2310 APInt::getHighBitsSet(X.getSimpleValueType().getSizeInBits(), MaskLZ);
2311 if (!DAG.MaskedValueIsZero(X, MaskedHighBits))
2312 return true;
2313
2314 // We've identified a pattern that can be transformed into a single shift
2315 // and an addressing mode. Make it so.
2316 MVT VT = N.getSimpleValueType();
2317 if (ReplacingAnyExtend) {
2318 assert(X.getValueType() != VT);
2319 // We looked through an ANY_EXTEND node, insert a ZERO_EXTEND.
2320 SDValue NewX = DAG.getNode(ISD::ZERO_EXTEND, SDLoc(X), VT, X);
2321 insertDAGNode(DAG, N, NewX);
2322 X = NewX;
2323 }
2324
2325 MVT XVT = X.getSimpleValueType();
2326 SDLoc DL(N);
2327 SDValue NewSRLAmt = DAG.getConstant(ShiftAmt + AMShiftAmt, DL, MVT::i8);
2328 SDValue NewSRL = DAG.getNode(ISD::SRL, DL, XVT, X, NewSRLAmt);
2329 SDValue NewExt = DAG.getZExtOrTrunc(NewSRL, DL, VT);
2330 SDValue NewSHLAmt = DAG.getConstant(AMShiftAmt, DL, MVT::i8);
2331 SDValue NewSHL = DAG.getNode(ISD::SHL, DL, VT, NewExt, NewSHLAmt);
2332
2333 // Insert the new nodes into the topological ordering. We must do this in
2334 // a valid topological ordering as nothing is going to go back and re-sort
2335 // these nodes. We continually insert before 'N' in sequence as this is
2336 // essentially a pre-flattened and pre-sorted sequence of nodes. There is no
2337 // hierarchy left to express.
2338 insertDAGNode(DAG, N, NewSRLAmt);
2339 insertDAGNode(DAG, N, NewSRL);
2340 insertDAGNode(DAG, N, NewExt);
2341 insertDAGNode(DAG, N, NewSHLAmt);
2342 insertDAGNode(DAG, N, NewSHL);
2343 DAG.ReplaceAllUsesWith(N, NewSHL);
2344 DAG.RemoveDeadNode(N.getNode());
2345
2346 AM.Scale = 1 << AMShiftAmt;
2347 AM.IndexReg = NewExt;
2348 return false;
2349}
2350
2351// Transform "(X >> SHIFT) & (MASK << C1)" to
2352// "((X >> (SHIFT + C1)) & (MASK)) << C1". Everything before the SHL will be
2353// matched to a BEXTR later. Returns false if the simplification is performed.
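// Illustrative sketch: with SHIFT == 4, MASK == 0xff and C1 == 2,
// "(X >> 4) & 0x3fc" becomes "((X >> 6) & 0xff) << 2"; the srl+and pair is a
// BEXTR candidate and the final shl becomes scale 4.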
2354 static bool foldMaskedShiftToBEXTR(SelectionDAG &DAG, SDValue N,
2355 uint64_t Mask,
2356 SDValue Shift, SDValue X,
2357 X86ISelAddressMode &AM,
2358 const X86Subtarget &Subtarget) {
2359 if (Shift.getOpcode() != ISD::SRL ||
2360 !isa<ConstantSDNode>(Shift.getOperand(1)) ||
2361 !Shift.hasOneUse() || !N.hasOneUse())
2362 return true;
2363
2364 // Only do this if BEXTR will be matched by matchBEXTRFromAndImm.
2365 if (!Subtarget.hasTBM() &&
2366 !(Subtarget.hasBMI() && Subtarget.hasFastBEXTR()))
2367 return true;
2368
2369 // We need to ensure that the mask is a contiguous run of bits.
2370 unsigned MaskIdx, MaskLen;
2371 if (!isShiftedMask_64(Mask, MaskIdx, MaskLen))
2372 return true;
2373
2374 unsigned ShiftAmt = Shift.getConstantOperandVal(1);
2375
2376 // The amount of shift we're trying to fit into the addressing mode is taken
2377 // from the shifted mask index (number of trailing zeros of the mask).
2378 unsigned AMShiftAmt = MaskIdx;
2379
2380 // There is nothing we can do here unless the mask is removing some bits.
2381 // Also, the addressing mode can only represent shifts of 1, 2, or 3 bits.
2382 if (AMShiftAmt == 0 || AMShiftAmt > 3) return true;
2383
2384 MVT XVT = X.getSimpleValueType();
2385 MVT VT = N.getSimpleValueType();
2386 SDLoc DL(N);
2387 SDValue NewSRLAmt = DAG.getConstant(ShiftAmt + AMShiftAmt, DL, MVT::i8);
2388 SDValue NewSRL = DAG.getNode(ISD::SRL, DL, XVT, X, NewSRLAmt);
2389 SDValue NewMask = DAG.getConstant(Mask >> AMShiftAmt, DL, XVT);
2390 SDValue NewAnd = DAG.getNode(ISD::AND, DL, XVT, NewSRL, NewMask);
2391 SDValue NewExt = DAG.getZExtOrTrunc(NewAnd, DL, VT);
2392 SDValue NewSHLAmt = DAG.getConstant(AMShiftAmt, DL, MVT::i8);
2393 SDValue NewSHL = DAG.getNode(ISD::SHL, DL, VT, NewExt, NewSHLAmt);
2394
2395 // Insert the new nodes into the topological ordering. We must do this in
2396 // a valid topological ordering as nothing is going to go back and re-sort
2397 // these nodes. We continually insert before 'N' in sequence as this is
2398 // essentially a pre-flattened and pre-sorted sequence of nodes. There is no
2399 // hierarchy left to express.
2400 insertDAGNode(DAG, N, NewSRLAmt);
2401 insertDAGNode(DAG, N, NewSRL);
2402 insertDAGNode(DAG, N, NewMask);
2403 insertDAGNode(DAG, N, NewAnd);
2404 insertDAGNode(DAG, N, NewExt);
2405 insertDAGNode(DAG, N, NewSHLAmt);
2406 insertDAGNode(DAG, N, NewSHL);
2407 DAG.ReplaceAllUsesWith(N, NewSHL);
2408 DAG.RemoveDeadNode(N.getNode());
2409
2410 AM.Scale = 1 << AMShiftAmt;
2411 AM.IndexReg = NewExt;
2412 return false;
2413}
2414
2415// Attempt to peek further into a scaled index register, collecting additional
2416 // extensions / offsets / etc. Returns \p N if we can't peek any further.
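// Illustrative sketch: asked to match "add %x, 4" while AM.Scale == 2, this
// returns %x and folds 4 * 2 == 8 into AM.Disp; "add %x, %x" instead doubles
// the scale to 4.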
2417SDValue X86DAGToDAGISel::matchIndexRecursively(SDValue N,
2418 X86ISelAddressMode &AM,
2419 unsigned Depth) {
2420 assert(AM.IndexReg.getNode() == nullptr && "IndexReg already matched");
2421 assert((AM.Scale == 1 || AM.Scale == 2 || AM.Scale == 4 || AM.Scale == 8) &&
2422 "Illegal index scale");
2423
2424 // Limit recursion.
2425 if (Depth >= SelectionDAG::MaxRecursionDepth)
2426 return N;
2427
2428 EVT VT = N.getValueType();
2429 unsigned Opc = N.getOpcode();
2430
2431 // index: add(x,c) -> index: x, disp + c
2432 if (CurDAG->isBaseWithConstantOffset(N)) {
2433 auto *AddVal = cast<ConstantSDNode>(N.getOperand(1));
2434 uint64_t Offset = (uint64_t)AddVal->getSExtValue() * AM.Scale;
2435 if (!foldOffsetIntoAddress(Offset, AM))
2436 return matchIndexRecursively(N.getOperand(0), AM, Depth + 1);
2437 }
2438
2439 // index: add(x,x) -> index: x, scale * 2
2440 if (Opc == ISD::ADD && N.getOperand(0) == N.getOperand(1)) {
2441 if (AM.Scale <= 4) {
2442 AM.Scale *= 2;
2443 return matchIndexRecursively(N.getOperand(0), AM, Depth + 1);
2444 }
2445 }
2446
2447 // index: shl(x,i) -> index: x, scale * (1 << i)
2448 if (Opc == X86ISD::VSHLI) {
2449 uint64_t ShiftAmt = N.getConstantOperandVal(1);
2450 uint64_t ScaleAmt = 1ULL << ShiftAmt;
2451 if ((AM.Scale * ScaleAmt) <= 8) {
2452 AM.Scale *= ScaleAmt;
2453 return matchIndexRecursively(N.getOperand(0), AM, Depth + 1);
2454 }
2455 }
2456
2457 // index: sext(add_nsw(x,c)) -> index: sext(x), disp + sext(c)
2458 // TODO: call matchIndexRecursively(AddSrc) if we won't corrupt sext?
2459 if (Opc == ISD::SIGN_EXTEND && !VT.isVector() && N.hasOneUse()) {
2460 SDValue Src = N.getOperand(0);
2461 if (Src.getOpcode() == ISD::ADD && Src->getFlags().hasNoSignedWrap() &&
2462 Src.hasOneUse()) {
2463 if (CurDAG->isBaseWithConstantOffset(Src)) {
2464 SDValue AddSrc = Src.getOperand(0);
2465 auto *AddVal = cast<ConstantSDNode>(Src.getOperand(1));
2466 int64_t Offset = AddVal->getSExtValue();
2467 if (!foldOffsetIntoAddress((uint64_t)Offset * AM.Scale, AM)) {
2468 SDLoc DL(N);
2469 SDValue ExtSrc = CurDAG->getNode(Opc, DL, VT, AddSrc);
2470 SDValue ExtVal = CurDAG->getSignedConstant(Offset, DL, VT);
2471 SDValue ExtAdd = CurDAG->getNode(ISD::ADD, DL, VT, ExtSrc, ExtVal);
2472 insertDAGNode(*CurDAG, N, ExtSrc);
2473 insertDAGNode(*CurDAG, N, ExtVal);
2474 insertDAGNode(*CurDAG, N, ExtAdd);
2475 CurDAG->ReplaceAllUsesWith(N, ExtAdd);
2476 CurDAG->RemoveDeadNode(N.getNode());
2477 return ExtSrc;
2478 }
2479 }
2480 }
2481 }
2482
2483 // index: zext(add_nuw(x,c)) -> index: zext(x), disp + zext(c)
2484 // index: zext(addlike(x,c)) -> index: zext(x), disp + zext(c)
2485 // TODO: call matchIndexRecursively(AddSrc) if we won't corrupt sext?
2486 if (Opc == ISD::ZERO_EXTEND && !VT.isVector() && N.hasOneUse()) {
2487 SDValue Src = N.getOperand(0);
2488 unsigned SrcOpc = Src.getOpcode();
2489 if (((SrcOpc == ISD::ADD && Src->getFlags().hasNoUnsignedWrap()) ||
2490 CurDAG->isADDLike(Src, /*NoWrap=*/true)) &&
2491 Src.hasOneUse()) {
2492 if (CurDAG->isBaseWithConstantOffset(Src)) {
2493 SDValue AddSrc = Src.getOperand(0);
2494 uint64_t Offset = Src.getConstantOperandVal(1);
2495 if (!foldOffsetIntoAddress(Offset * AM.Scale, AM)) {
2496 SDLoc DL(N);
2497 SDValue Res;
2498 // If we're also scaling, see if we can use that as well.
2499 if (AddSrc.getOpcode() == ISD::SHL &&
2500 isa<ConstantSDNode>(AddSrc.getOperand(1))) {
2501 SDValue ShVal = AddSrc.getOperand(0);
2502 uint64_t ShAmt = AddSrc.getConstantOperandVal(1);
2503 APInt HiBits =
2504 APInt::getHighBitsSet(AddSrc.getScalarValueSizeInBits(), ShAmt);
2505 uint64_t ScaleAmt = 1ULL << ShAmt;
2506 if ((AM.Scale * ScaleAmt) <= 8 &&
2507 (AddSrc->getFlags().hasNoUnsignedWrap() ||
2508 CurDAG->MaskedValueIsZero(ShVal, HiBits))) {
2509 AM.Scale *= ScaleAmt;
2510 SDValue ExtShVal = CurDAG->getNode(Opc, DL, VT, ShVal);
2511 SDValue ExtShift = CurDAG->getNode(ISD::SHL, DL, VT, ExtShVal,
2512 AddSrc.getOperand(1));
2513 insertDAGNode(*CurDAG, N, ExtShVal);
2514 insertDAGNode(*CurDAG, N, ExtShift);
2515 AddSrc = ExtShift;
2516 Res = ExtShVal;
2517 }
2518 }
2519 SDValue ExtSrc = CurDAG->getNode(Opc, DL, VT, AddSrc);
2520 SDValue ExtVal = CurDAG->getConstant(Offset, DL, VT);
2521 SDValue ExtAdd = CurDAG->getNode(SrcOpc, DL, VT, ExtSrc, ExtVal);
2522 insertDAGNode(*CurDAG, N, ExtSrc);
2523 insertDAGNode(*CurDAG, N, ExtVal);
2524 insertDAGNode(*CurDAG, N, ExtAdd);
2525 CurDAG->ReplaceAllUsesWith(N, ExtAdd);
2526 CurDAG->RemoveDeadNode(N.getNode());
2527 return Res ? Res : ExtSrc;
2528 }
2529 }
2530 }
2531 }
2532
2533 // TODO: Handle extensions, shifted masks etc.
2534 return N;
2535}
2536
2537bool X86DAGToDAGISel::matchAddressRecursively(SDValue N, X86ISelAddressMode &AM,
2538 unsigned Depth) {
2539 LLVM_DEBUG({
2540 dbgs() << "MatchAddress: ";
2541 AM.dump(CurDAG);
2542 });
2543 // Limit recursion.
2544 if (Depth >= SelectionDAG::MaxRecursionDepth)
2545 return matchAddressBase(N, AM);
2546
2547 // If this is already a %rip relative address, we can only merge immediates
2548 // into it. Instead of handling this in every case, we handle it here.
2549 // RIP relative addressing: %rip + 32-bit displacement!
2550 if (AM.isRIPRelative()) {
2551 // FIXME: JumpTable and ExternalSymbol addresses currently don't like
2552 // displacements. It isn't very important, but this should be fixed for
2553 // consistency.
2554 if (!(AM.ES || AM.MCSym) && AM.JT != -1)
2555 return true;
2556
2557 if (auto *Cst = dyn_cast<ConstantSDNode>(N))
2558 if (!foldOffsetIntoAddress(Cst->getSExtValue(), AM))
2559 return false;
2560 return true;
2561 }
2562
2563 switch (N.getOpcode()) {
2564 default: break;
2565 case ISD::LOCAL_RECOVER: {
2566 if (!AM.hasSymbolicDisplacement() && AM.Disp == 0)
2567 if (const auto *ESNode = dyn_cast<MCSymbolSDNode>(N.getOperand(0))) {
2568 // Use the symbol and don't prefix it.
2569 AM.MCSym = ESNode->getMCSymbol();
2570 return false;
2571 }
2572 break;
2573 }
2574 case ISD::Constant: {
2575 uint64_t Val = cast<ConstantSDNode>(N)->getSExtValue();
2576 if (!foldOffsetIntoAddress(Val, AM))
2577 return false;
2578 break;
2579 }
2580
2581 case X86ISD::Wrapper:
2582 case X86ISD::WrapperRIP:
2583 if (!matchWrapper(N, AM))
2584 return false;
2585 break;
2586
2587 case ISD::LOAD:
2588 if (!matchLoadInAddress(cast<LoadSDNode>(N), AM))
2589 return false;
2590 break;
2591
2592 case ISD::FrameIndex:
2593 if (AM.BaseType == X86ISelAddressMode::RegBase &&
2594 AM.Base_Reg.getNode() == nullptr &&
2595 (!Subtarget->is64Bit() || isDispSafeForFrameIndexOrRegBase(AM.Disp))) {
2596 AM.BaseType = X86ISelAddressMode::FrameIndexBase;
2597 AM.Base_FrameIndex = cast<FrameIndexSDNode>(N)->getIndex();
2598 return false;
2599 }
2600 break;
2601
2602 case ISD::SHL:
2603 if (AM.IndexReg.getNode() != nullptr || AM.Scale != 1)
2604 break;
2605
2606 if (auto *CN = dyn_cast<ConstantSDNode>(N.getOperand(1))) {
2607 unsigned Val = CN->getZExtValue();
2608 // Note that we handle x<<1 as (,x,2) rather than (x,x) here so
2609 // that the base operand remains free for further matching. If
2610 // the base doesn't end up getting used, a post-processing step
2611 // in MatchAddress turns (,x,2) into (x,x), which is cheaper.
2612 if (Val == 1 || Val == 2 || Val == 3) {
2613 SDValue ShVal = N.getOperand(0);
2614 AM.Scale = 1 << Val;
2615 AM.IndexReg = matchIndexRecursively(ShVal, AM, Depth + 1);
2616 return false;
2617 }
2618 }
2619 break;
2620
2621 case ISD::SRL: {
2622 // Scale must not be used already.
2623 if (AM.IndexReg.getNode() != nullptr || AM.Scale != 1) break;
2624
2625 // We only handle up to 64-bit values here as those are what matter for
2626 // addressing mode optimizations.
2627 assert(N.getSimpleValueType().getSizeInBits() <= 64 &&
2628 "Unexpected value size!");
2629
2630 SDValue And = N.getOperand(0);
2631 if (And.getOpcode() != ISD::AND) break;
2632 SDValue X = And.getOperand(0);
2633
2634 // The mask used for the transform is expected to be post-shift, but we
2635 // found the shift first so just apply the shift to the mask before passing
2636 // it down.
2637 if (!isa<ConstantSDNode>(N.getOperand(1)) ||
2638 !isa<ConstantSDNode>(And.getOperand(1)))
2639 break;
2640 uint64_t Mask = And.getConstantOperandVal(1) >> N.getConstantOperandVal(1);
2641
2642 // Try to fold the mask and shift into the scale, and return false if we
2643 // succeed.
2644 if (!foldMaskAndShiftToScale(*CurDAG, N, Mask, N, X, AM))
2645 return false;
2646 break;
2647 }
2648
2649 case ISD::SMUL_LOHI:
2650 case ISD::UMUL_LOHI:
2651 // A mul_lohi where we need the low part can be folded as a plain multiply.
2652 if (N.getResNo() != 0) break;
2653 [[fallthrough]];
2654 case ISD::MUL:
2655 case X86ISD::MUL_IMM:
2656 // X*[3,5,9] -> X+X*[2,4,8]
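// Illustrative sketch: x*5 selects to "leal (%x,%x,4)", i.e. base and index
// are both %x with scale 4.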
2657 if (AM.BaseType == X86ISelAddressMode::RegBase &&
2658 AM.Base_Reg.getNode() == nullptr &&
2659 AM.IndexReg.getNode() == nullptr) {
2660 if (auto *CN = dyn_cast<ConstantSDNode>(N.getOperand(1)))
2661 if (CN->getZExtValue() == 3 || CN->getZExtValue() == 5 ||
2662 CN->getZExtValue() == 9) {
2663 AM.Scale = unsigned(CN->getZExtValue())-1;
2664
2665 SDValue MulVal = N.getOperand(0);
2666 SDValue Reg;
2667
2668 // Okay, we know that we have a scale by now. However, if the scaled
2669 // value is an add of something and a constant, we can fold the
2670 // constant into the disp field here.
2671 if (MulVal.getNode()->getOpcode() == ISD::ADD && MulVal.hasOneUse() &&
2672 isa<ConstantSDNode>(MulVal.getOperand(1))) {
2673 Reg = MulVal.getOperand(0);
2674 auto *AddVal = cast<ConstantSDNode>(MulVal.getOperand(1));
2675 uint64_t Disp = AddVal->getSExtValue() * CN->getZExtValue();
2676 if (foldOffsetIntoAddress(Disp, AM))
2677 Reg = N.getOperand(0);
2678 } else {
2679 Reg = N.getOperand(0);
2680 }
2681
2682 AM.IndexReg = AM.Base_Reg = Reg;
2683 return false;
2684 }
2685 }
2686 break;
2687
2688 case ISD::SUB: {
2689 // Given A-B, if A can be completely folded into the address, with
2690 // the index field left unused, use -B as the index.
2691 // This is a win if A has multiple parts that can be folded into
2692 // the address. Also, this saves a mov if the base register has
2693 // other uses, since it avoids a two-address sub instruction; however,
2694 // it costs an additional mov if the index register has other uses.
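// Illustrative sketch: for "gv + 8 - %b" the LHS folds into the address, so
// -%b can serve as the index, e.g. "negl %b; leaq gv+8(,%b,1), %dst"
// (register names hypothetical); the neg is emitted later, once the LEA is
// known to be worthwhile.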
2695
2696 // Add an artificial use to this node so that we can keep track of
2697 // it if it gets CSE'd with a different node.
2698 HandleSDNode Handle(N);
2699
2700 // Test if the LHS of the sub can be folded.
2701 X86ISelAddressMode Backup = AM;
2702 if (matchAddressRecursively(N.getOperand(0), AM, Depth+1)) {
2703 N = Handle.getValue();
2704 AM = Backup;
2705 break;
2706 }
2707 N = Handle.getValue();
2708 // Test if the index field is free for use.
2709 if (AM.IndexReg.getNode() || AM.isRIPRelative()) {
2710 AM = Backup;
2711 break;
2712 }
2713
2714 int Cost = 0;
2715 SDValue RHS = N.getOperand(1);
2716 // If the RHS involves a register with multiple uses, this
2717 // transformation incurs an extra mov, due to the neg instruction
2718 // clobbering its operand.
2719 if (!RHS.getNode()->hasOneUse() ||
2720 RHS.getNode()->getOpcode() == ISD::CopyFromReg ||
2721 RHS.getNode()->getOpcode() == ISD::TRUNCATE ||
2722 RHS.getNode()->getOpcode() == ISD::ANY_EXTEND ||
2723 (RHS.getNode()->getOpcode() == ISD::ZERO_EXTEND &&
2724 RHS.getOperand(0).getValueType() == MVT::i32))
2725 ++Cost;
2726 // If the base is a register with multiple uses, this
2727 // transformation may save a mov.
2728 if ((AM.BaseType == X86ISelAddressMode::RegBase && AM.Base_Reg.getNode() &&
2729 !AM.Base_Reg.getNode()->hasOneUse()) ||
2730 AM.BaseType == X86ISelAddressMode::FrameIndexBase)
2731 --Cost;
2732 // If the folded LHS was interesting, this transformation saves
2733 // address arithmetic.
2734 if ((AM.hasSymbolicDisplacement() && !Backup.hasSymbolicDisplacement()) +
2735 ((AM.Disp != 0) && (Backup.Disp == 0)) +
2736 (AM.Segment.getNode() && !Backup.Segment.getNode()) >= 2)
2737 --Cost;
2738 // If it doesn't look like it may be an overall win, don't do it.
2739 if (Cost >= 0) {
2740 AM = Backup;
2741 break;
2742 }
2743
2744 // Ok, the transformation is legal and appears profitable. Go for it.
2745 // Negation will be emitted later to avoid creating dangling nodes if this
2746 // was an unprofitable LEA.
2747 AM.IndexReg = RHS;
2748 AM.NegateIndex = true;
2749 AM.Scale = 1;
2750 return false;
2751 }
2752
2753 case ISD::OR:
2754 case ISD::XOR:
2755 // See if we can treat the OR/XOR node as an ADD node.
2756 if (!CurDAG->isADDLike(N))
2757 break;
2758 [[fallthrough]];
2759 case ISD::ADD:
2760 if (!matchAdd(N, AM, Depth))
2761 return false;
2762 break;
2763
2764 case ISD::AND: {
2765 // Perform some heroic transforms on an and of a constant-count shift
2766 // with a constant to enable use of the scaled offset field.
2767
2768 // Scale must not be used already.
2769 if (AM.IndexReg.getNode() != nullptr || AM.Scale != 1) break;
2770
2771 // We only handle up to 64-bit values here as those are what matter for
2772 // addressing mode optimizations.
2773 assert(N.getSimpleValueType().getSizeInBits() <= 64 &&
2774 "Unexpected value size!");
2775
2776 if (!isa<ConstantSDNode>(N.getOperand(1)))
2777 break;
2778
2779 if (N.getOperand(0).getOpcode() == ISD::SRL) {
2780 SDValue Shift = N.getOperand(0);
2781 SDValue X = Shift.getOperand(0);
2782
2783 uint64_t Mask = N.getConstantOperandVal(1);
2784
2785 // Try to fold the mask and shift into an extract and scale.
2786 if (!foldMaskAndShiftToExtract(*CurDAG, N, Mask, Shift, X, AM))
2787 return false;
2788
2789 // Try to fold the mask and shift directly into the scale.
2790 if (!foldMaskAndShiftToScale(*CurDAG, N, Mask, Shift, X, AM))
2791 return false;
2792
2793 // Try to fold the mask and shift into BEXTR and scale.
2794 if (!foldMaskedShiftToBEXTR(*CurDAG, N, Mask, Shift, X, AM, *Subtarget))
2795 return false;
2796 }
2797
2798 // Try to swap the mask and shift to place shifts which can be done as
2799 // a scale on the outside of the mask.
2800 if (!foldMaskedShiftToScaledMask(*CurDAG, N, AM))
2801 return false;
2802
2803 break;
2804 }
2805 case ISD::ZERO_EXTEND: {
2806 // Try to widen a zexted shift left to the same size as its use, so we can
2807 // match the shift as a scale factor.
2808 if (AM.IndexReg.getNode() != nullptr || AM.Scale != 1)
2809 break;
2810
2811 SDValue Src = N.getOperand(0);
2812
2813 // See if we can match a zext(addlike(x,c)).
2814 // TODO: Move more ZERO_EXTEND patterns into matchIndexRecursively.
2815 if (Src.getOpcode() == ISD::ADD || Src.getOpcode() == ISD::OR)
2816 if (SDValue Index = matchIndexRecursively(N, AM, Depth + 1))
2817 if (Index != N) {
2818 AM.IndexReg = Index;
2819 return false;
2820 }
2821
2822 // Peek through mask: zext(and(shl(x,c1),c2))
2823 APInt Mask = APInt::getAllOnes(Src.getScalarValueSizeInBits());
2824 if (Src.getOpcode() == ISD::AND && Src.hasOneUse())
2825 if (auto *MaskC = dyn_cast<ConstantSDNode>(Src.getOperand(1))) {
2826 Mask = MaskC->getAPIntValue();
2827 Src = Src.getOperand(0);
2828 }
2829
2830 if (Src.getOpcode() == ISD::SHL && Src.hasOneUse() && N->hasOneUse()) {
2831 // Give up if the shift is not a valid scale factor [1,2,3].
2832 SDValue ShlSrc = Src.getOperand(0);
2833 SDValue ShlAmt = Src.getOperand(1);
2834 auto *ShAmtC = dyn_cast<ConstantSDNode>(ShlAmt);
2835 if (!ShAmtC)
2836 break;
2837 unsigned ShAmtV = ShAmtC->getZExtValue();
2838 if (ShAmtV > 3)
2839 break;
2840
2841 // The narrow shift must only shift out zero bits (it must be 'nuw').
2842 // That makes it safe to widen to the destination type.
2843 APInt HighZeros =
2844 APInt::getHighBitsSet(ShlSrc.getValueSizeInBits(), ShAmtV);
2845 if (!Src->getFlags().hasNoUnsignedWrap() &&
2846 !CurDAG->MaskedValueIsZero(ShlSrc, HighZeros & Mask))
2847 break;
2848
2849 // zext (shl nuw i8 %x, C1) to i32
2850 // --> shl (zext i8 %x to i32), (zext C1)
2851 // zext (and (shl nuw i8 %x, C1), C2) to i32
2852 // --> shl (zext i8 (and %x, C2 >> C1) to i32), (zext C1)
2853 MVT SrcVT = ShlSrc.getSimpleValueType();
2854 MVT VT = N.getSimpleValueType();
2855 SDLoc DL(N);
2856
2857 SDValue Res = ShlSrc;
2858 if (!Mask.isAllOnes()) {
2859 Res = CurDAG->getConstant(Mask.lshr(ShAmtV), DL, SrcVT);
2860 insertDAGNode(*CurDAG, N, Res);
2861 Res = CurDAG->getNode(ISD::AND, DL, SrcVT, ShlSrc, Res);
2862 insertDAGNode(*CurDAG, N, Res);
2863 }
2864 SDValue Zext = CurDAG->getNode(ISD::ZERO_EXTEND, DL, VT, Res);
2865 insertDAGNode(*CurDAG, N, Zext);
2866 SDValue NewShl = CurDAG->getNode(ISD::SHL, DL, VT, Zext, ShlAmt);
2867 insertDAGNode(*CurDAG, N, NewShl);
2868 CurDAG->ReplaceAllUsesWith(N, NewShl);
2869 CurDAG->RemoveDeadNode(N.getNode());
2870
2871 // Convert the shift to scale factor.
2872 AM.Scale = 1 << ShAmtV;
2873 // If matchIndexRecursively were not called here, Zext could be replaced
2874 // by other nodes but still be handed to a builder method later, leaving
2875 // a stale reference.
2876 AM.IndexReg = matchIndexRecursively(Zext, AM, Depth + 1);
2877 return false;
2878 }
2879
2880 if (Src.getOpcode() == ISD::SRL && !Mask.isAllOnes()) {
2881 // Try to fold the mask and shift into an extract and scale.
2882 if (!foldMaskAndShiftToExtract(*CurDAG, N, Mask.getZExtValue(), Src,
2883 Src.getOperand(0), AM))
2884 return false;
2885
2886 // Try to fold the mask and shift directly into the scale.
2887 if (!foldMaskAndShiftToScale(*CurDAG, N, Mask.getZExtValue(), Src,
2888 Src.getOperand(0), AM))
2889 return false;
2890
2891 // Try to fold the mask and shift into BEXTR and scale.
2892 if (!foldMaskedShiftToBEXTR(*CurDAG, N, Mask.getZExtValue(), Src,
2893 Src.getOperand(0), AM, *Subtarget))
2894 return false;
2895 }
2896
2897 break;
2898 }
2899 }
2900
2901 return matchAddressBase(N, AM);
2902}
2903
2904/// Helper for MatchAddress. Add the specified node to the
2905/// specified addressing mode without any further recursion.
2906bool X86DAGToDAGISel::matchAddressBase(SDValue N, X86ISelAddressMode &AM) {
2907 // Is the base register already occupied?
2908 if (AM.BaseType != X86ISelAddressMode::RegBase || AM.Base_Reg.getNode()) {
2909 // If so, check to see if the scale index register is set.
2910 if (!AM.IndexReg.getNode()) {
2911 AM.IndexReg = N;
2912 AM.Scale = 1;
2913 return false;
2914 }
2915
2916 // Otherwise, we cannot select it.
2917 return true;
2918 }
2919
2920 // Default, generate it as a register.
2921 AM.BaseType = X86ISelAddressMode::RegBase;
2922 AM.Base_Reg = N;
2923 return false;
2924}
2925
2926bool X86DAGToDAGISel::matchVectorAddressRecursively(SDValue N,
2927 X86ISelAddressMode &AM,
2928 unsigned Depth) {
2929 LLVM_DEBUG({
2930 dbgs() << "MatchVectorAddress: ";
2931 AM.dump(CurDAG);
2932 });
2933 // Limit recursion.
2934 if (Depth >= SelectionDAG::MaxRecursionDepth)
2935 return matchAddressBase(N, AM);
2936
2937 // TODO: Support other operations.
2938 switch (N.getOpcode()) {
2939 case ISD::Constant: {
2940 uint64_t Val = cast<ConstantSDNode>(N)->getSExtValue();
2941 if (!foldOffsetIntoAddress(Val, AM))
2942 return false;
2943 break;
2944 }
2945 case X86ISD::Wrapper:
2946 if (!matchWrapper(N, AM))
2947 return false;
2948 break;
2949 case ISD::ADD: {
2950 // Add an artificial use to this node so that we can keep track of
2951 // it if it gets CSE'd with a different node.
2952 HandleSDNode Handle(N);
2953
2954 X86ISelAddressMode Backup = AM;
2955 if (!matchVectorAddressRecursively(N.getOperand(0), AM, Depth + 1) &&
2956 !matchVectorAddressRecursively(Handle.getValue().getOperand(1), AM,
2957 Depth + 1))
2958 return false;
2959 AM = Backup;
2960
2961 // Try again after commuting the operands.
2962 if (!matchVectorAddressRecursively(Handle.getValue().getOperand(1), AM,
2963 Depth + 1) &&
2964 !matchVectorAddressRecursively(Handle.getValue().getOperand(0), AM,
2965 Depth + 1))
2966 return false;
2967 AM = Backup;
2968
2969 N = Handle.getValue();
2970 break;
2971 }
2972 }
2973
2974 return matchAddressBase(N, AM);
2975}
2976
2977/// Helper for selectVectorAddr. Handles things that can be folded into a
2978/// gather/scatter address. The index register and scale should have already
2979/// been handled.
2980bool X86DAGToDAGISel::matchVectorAddress(SDValue N, X86ISelAddressMode &AM) {
2981 return matchVectorAddressRecursively(N, AM, 0);
2982}
2983
2984bool X86DAGToDAGISel::selectVectorAddr(MemSDNode *Parent, SDValue BasePtr,
2985 SDValue IndexOp, SDValue ScaleOp,
2986 SDValue &Base, SDValue &Scale,
2987 SDValue &Index, SDValue &Disp,
2988 SDValue &Segment) {
2989 X86ISelAddressMode AM;
2990 AM.Scale = ScaleOp->getAsZExtVal();
2991
2992 // Attempt to match index patterns, as long as we're not relying on implicit
2993 // sign-extension, which is performed BEFORE scale.
2994 if (IndexOp.getScalarValueSizeInBits() == BasePtr.getScalarValueSizeInBits())
2995 AM.IndexReg = matchIndexRecursively(IndexOp, AM, 0);
2996 else
2997 AM.IndexReg = IndexOp;
2998
2999 unsigned AddrSpace = Parent->getPointerInfo().getAddrSpace();
3000 if (AddrSpace == X86AS::GS)
3001 AM.Segment = CurDAG->getRegister(X86::GS, MVT::i16);
3002 if (AddrSpace == X86AS::FS)
3003 AM.Segment = CurDAG->getRegister(X86::FS, MVT::i16);
3004 if (AddrSpace == X86AS::SS)
3005 AM.Segment = CurDAG->getRegister(X86::SS, MVT::i16);
3006
3007 SDLoc DL(BasePtr);
3008 MVT VT = BasePtr.getSimpleValueType();
3009
3010 // Try to match into the base and displacement fields.
3011 if (matchVectorAddress(BasePtr, AM))
3012 return false;
3013
3014 getAddressOperands(AM, DL, VT, Base, Scale, Index, Disp, Segment);
3015 return true;
3016}
3017
3018/// Returns true if it is able to pattern match an addressing mode.
3019/// It returns the operands which make up the maximal addressing mode it can
3020/// match by reference.
3021///
3022/// Parent is the parent node of the addr operand that is being matched. It
3023/// is always a load, store, atomic node, or null. It is only null when
3024/// checking memory operands for inline asm nodes.
3025bool X86DAGToDAGISel::selectAddr(SDNode *Parent, SDValue N, SDValue &Base,
3026 SDValue &Scale, SDValue &Index, SDValue &Disp,
3027 SDValue &Segment, bool HasNDDM) {
3028 X86ISelAddressMode AM;
3029
3030 if (Parent &&
3031 // This list of opcodes covers all the nodes that have an "addr:$ptr" operand
3032 // but are not a MemSDNode, and thus don't have proper addrspace info.
3033 Parent->getOpcode() != ISD::INTRINSIC_W_CHAIN && // unaligned loads, fixme
3034 Parent->getOpcode() != ISD::INTRINSIC_VOID && // nontemporal stores
3035 Parent->getOpcode() != X86ISD::TLSCALL && // Fixme
3036 Parent->getOpcode() != X86ISD::ENQCMD && // Fixme
3037 Parent->getOpcode() != X86ISD::ENQCMDS && // Fixme
3038 Parent->getOpcode() != X86ISD::EH_SJLJ_SETJMP && // setjmp
3039 Parent->getOpcode() != X86ISD::EH_SJLJ_LONGJMP) { // longjmp
3040 unsigned AddrSpace =
3041 cast<MemSDNode>(Parent)->getPointerInfo().getAddrSpace();
3042 if (AddrSpace == X86AS::GS)
3043 AM.Segment = CurDAG->getRegister(X86::GS, MVT::i16);
3044 if (AddrSpace == X86AS::FS)
3045 AM.Segment = CurDAG->getRegister(X86::FS, MVT::i16);
3046 if (AddrSpace == X86AS::SS)
3047 AM.Segment = CurDAG->getRegister(X86::SS, MVT::i16);
3048 }
3049
3050 // Save the DL and VT before calling matchAddress, it can invalidate N.
3051 SDLoc DL(N);
3052 MVT VT = N.getSimpleValueType();
3053
3054 if (matchAddress(N, AM))
3055 return false;
3056
3057 if (!HasNDDM && !AM.isRIPRelative())
3058 return false;
3059
3060 getAddressOperands(AM, DL, VT, Base, Scale, Index, Disp, Segment);
3061 return true;
3062}
3063
3064bool X86DAGToDAGISel::selectNDDAddr(SDNode *Parent, SDValue N, SDValue &Base,
3065 SDValue &Scale, SDValue &Index,
3066 SDValue &Disp, SDValue &Segment) {
3067 return selectAddr(Parent, N, Base, Scale, Index, Disp, Segment,
3068 Subtarget->hasNDDM());
3069}
3070
3071bool X86DAGToDAGISel::selectMOV64Imm32(SDValue N, SDValue &Imm) {
3072 // Cannot use 32 bit constants to reference objects in kernel/large code
3073 // model.
3074 if (TM.getCodeModel() == CodeModel::Kernel ||
3075 TM.getCodeModel() == CodeModel::Large)
3076 return false;
3077
3078 // In static codegen with small code model, we can get the address of a label
3079 // into a register with 'movl'.
3080 if (N->getOpcode() != X86ISD::Wrapper)
3081 return false;
3082
3083 N = N.getOperand(0);
3084
3085 // At least GNU as does not accept 'movl' for TPOFF relocations.
3086 // FIXME: We could use 'movl' when we know we are targeting MC.
3087 if (N->getOpcode() == ISD::TargetGlobalTLSAddress)
3088 return false;
3089
3090 Imm = N;
3091 // Small/medium code model can reference non-TargetGlobalAddress objects with
3092 // 32 bit constants.
3093 if (N->getOpcode() != ISD::TargetGlobalAddress) {
3094 return TM.getCodeModel() == CodeModel::Small ||
3095 TM.getCodeModel() == CodeModel::Medium;
3096 }
3097
3098 const GlobalValue *GV = cast<GlobalAddressSDNode>(N)->getGlobal();
3099 if (std::optional<ConstantRange> CR = GV->getAbsoluteSymbolRange())
3100 return CR->getUnsignedMax().ult(1ull << 32);
3101
3102 return !TM.isLargeGlobalValue(GV);
3103}
3104
3105bool X86DAGToDAGISel::selectLEA64_Addr(SDValue N, SDValue &Base, SDValue &Scale,
3106 SDValue &Index, SDValue &Disp,
3107 SDValue &Segment) {
3108 // Save the debug loc before calling selectLEAAddr, in case it invalidates N.
3109 SDLoc DL(N);
3110
3111 if (!selectLEAAddr(N, Base, Scale, Index, Disp, Segment))
3112 return false;
3113
3114 EVT BaseType = Base.getValueType();
3115 unsigned SubReg;
3116 if (BaseType == MVT::i8)
3117 SubReg = X86::sub_8bit;
3118 else if (BaseType == MVT::i16)
3119 SubReg = X86::sub_16bit;
3120 else
3121 SubReg = X86::sub_32bit;
3122
3123 auto *RN = dyn_cast<RegisterSDNode>(Base);
3124 if (RN && RN->getReg() == 0)
3125 Base = CurDAG->getRegister(0, MVT::i64);
3126 else if ((BaseType == MVT::i8 || BaseType == MVT::i16 ||
3127 BaseType == MVT::i32) &&
3128 !isa<FrameIndexSDNode>(Base)) {
3129 // Base could already be %rip, particularly in the x32 ABI.
3130 SDValue ImplDef = SDValue(CurDAG->getMachineNode(X86::IMPLICIT_DEF, DL,
3131 MVT::i64), 0);
3132 Base = CurDAG->getTargetInsertSubreg(SubReg, DL, MVT::i64, ImplDef, Base);
3133 }
3134
3135 [[maybe_unused]] EVT IndexType = Index.getValueType();
3136 RN = dyn_cast<RegisterSDNode>(Index);
3137 if (RN && RN->getReg() == 0)
3138 Index = CurDAG->getRegister(0, MVT::i64);
3139 else {
3140 assert((IndexType == BaseType) &&
3141 "Expect to be extending 8/16/32-bit registers for use in LEA");
3142 SDValue ImplDef = SDValue(CurDAG->getMachineNode(X86::IMPLICIT_DEF, DL,
3143 MVT::i64), 0);
3144 Index = CurDAG->getTargetInsertSubreg(SubReg, DL, MVT::i64, ImplDef, Index);
3145 }
3146
3147 return true;
3148}
3149
3150/// Calls SelectAddr and determines if the maximal addressing
3151/// mode it matches can be cost effectively emitted as an LEA instruction.
3152bool X86DAGToDAGISel::selectLEAAddr(SDValue N,
3153 SDValue &Base, SDValue &Scale,
3154 SDValue &Index, SDValue &Disp,
3155 SDValue &Segment) {
3156 X86ISelAddressMode AM;
3157
3158 // Save the DL and VT before calling matchAddress; it can invalidate N.
3159 SDLoc DL(N);
3160 MVT VT = N.getSimpleValueType();
3161
3162 // Set AM.Segment to prevent MatchAddress from using one. LEA doesn't support
3163 // segments.
3164 SDValue Copy = AM.Segment;
3165 SDValue T = CurDAG->getRegister(0, MVT::i32);
3166 AM.Segment = T;
3167 if (matchAddress(N, AM))
3168 return false;
3169 assert(T == AM.Segment);
3170 AM.Segment = Copy;
3171
3172 unsigned Complexity = 0;
3173 if (AM.BaseType == X86ISelAddressMode::RegBase && AM.Base_Reg.getNode())
3174 Complexity = 1;
3175 else if (AM.BaseType == X86ISelAddressMode::FrameIndexBase)
3176 Complexity = 4;
3177
3178 if (AM.IndexReg.getNode())
3179 Complexity++;
3180
3181 // Don't match just leal(,%reg,2). It's cheaper to do addl %reg, %reg, or with
3182 // a simple shift.
3183 if (AM.Scale > 1)
3184 Complexity++;
3185
3186 // FIXME: We are artificially lowering the criteria to turn ADD %reg, $GA
3187 // to a LEA. This is determined with some experimentation but is by no means
3188 // optimal (especially for code size consideration). LEA is nice because of
3189 // its three-address nature. Tweak the cost function again when we can run
3190 // convertToThreeAddress() at register allocation time.
3191 if (AM.hasSymbolicDisplacement()) {
3192 // For X86-64, always use LEA to materialize RIP-relative addresses.
3193 if (Subtarget->is64Bit())
3194 Complexity = 4;
3195 else
3196 Complexity += 2;
3197 }
3198
3199 // Heuristic: try harder to form an LEA from ADD if the operands set flags.
3200 // Unlike ADD, LEA does not affect flags, so we will be less likely to require
3201 // duplicating flag-producing instructions later in the pipeline.
3202 if (N.getOpcode() == ISD::ADD) {
3203 auto isMathWithFlags = [](SDValue V) {
3204 switch (V.getOpcode()) {
3205 case X86ISD::ADD:
3206 case X86ISD::SUB:
3207 case X86ISD::ADC:
3208 case X86ISD::SBB:
3209 case X86ISD::SMUL:
3210 case X86ISD::UMUL:
3211 /* TODO: These opcodes can be added safely, but we may want to justify
3212 their inclusion for different reasons (better for reg-alloc).
3213 case X86ISD::OR:
3214 case X86ISD::XOR:
3215 case X86ISD::AND:
3216 */
3217 // Value 1 is the flag output of the node - verify it's not dead.
3218 return !SDValue(V.getNode(), 1).use_empty();
3219 default:
3220 return false;
3221 }
3222 };
3223 // TODO: We might want to factor in whether there's a load folding
3224 // opportunity for the math op that disappears with LEA.
3225 if (isMathWithFlags(N.getOperand(0)) || isMathWithFlags(N.getOperand(1)))
3226 Complexity++;
3227 }
3228
3229 if (AM.Disp)
3230 Complexity++;
3231
3232 // If it isn't worth using an LEA, reject it.
3233 if (Complexity <= 2)
3234 return false;
3235
3236 getAddressOperands(AM, DL, VT, Base, Scale, Index, Disp, Segment);
3237 return true;
3238}
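// [Editorial illustration, not part of the original source.] How the
// complexity score above plays out on simple inputs:
//   x + y        -> base + index, score 2: rejected, stays addq %rsi, %rdi
//   x + 8        -> base + disp,  score 2: rejected, stays addq $8, %rdi
//   x + 4*y + 8  -> base + index + scale + disp, score 4:
//                   accepted, becomes leaq 8(%rdi,%rsi,4), %rax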
3239
3240/// This is only run on TargetGlobalTLSAddress nodes.
3241bool X86DAGToDAGISel::selectTLSADDRAddr(SDValue N, SDValue &Base,
3242 SDValue &Scale, SDValue &Index,
3243 SDValue &Disp, SDValue &Segment) {
3244 assert(N.getOpcode() == ISD::TargetGlobalTLSAddress ||
3245 N.getOpcode() == ISD::TargetExternalSymbol);
3246
3247 X86ISelAddressMode AM;
3248 if (auto *GA = dyn_cast<GlobalAddressSDNode>(N)) {
3249 AM.GV = GA->getGlobal();
3250 AM.Disp += GA->getOffset();
3251 AM.SymbolFlags = GA->getTargetFlags();
3252 } else {
3253 auto *SA = cast<ExternalSymbolSDNode>(N);
3254 AM.ES = SA->getSymbol();
3255 AM.SymbolFlags = SA->getTargetFlags();
3256 }
3257
3258 if (Subtarget->is32Bit()) {
3259 AM.Scale = 1;
3260 AM.IndexReg = CurDAG->getRegister(X86::EBX, MVT::i32);
3261 }
3262
3263 MVT VT = N.getSimpleValueType();
3264 getAddressOperands(AM, SDLoc(N), VT, Base, Scale, Index, Disp, Segment);
3265 return true;
3266}
3267
3268bool X86DAGToDAGISel::selectRelocImm(SDValue N, SDValue &Op) {
3269 // Keep track of the original value type and whether this value was
3270 // truncated. If we see a truncation from pointer type to VT that truncates
3271 // bits that are known to be zero, we can use a narrow reference.
3272 EVT VT = N.getValueType();
3273 bool WasTruncated = false;
3274 if (N.getOpcode() == ISD::TRUNCATE) {
3275 WasTruncated = true;
3276 N = N.getOperand(0);
3277 }
3278
3279 if (N.getOpcode() != X86ISD::Wrapper)
3280 return false;
3281
3282 // We can only use non-GlobalValues as immediates if they were not truncated,
3283 // as we do not have any range information. If we have a GlobalValue and the
3284 // address was not truncated, we can select it as an operand directly.
3285 unsigned Opc = N.getOperand(0)->getOpcode();
3286 if (Opc != ISD::TargetGlobalAddress || !WasTruncated) {
3287 Op = N.getOperand(0);
3288 // We can only select the operand directly if we didn't have to look past a
3289 // truncate.
3290 return !WasTruncated;
3291 }
3292
3293 // Check that the global's range fits into VT.
3294 auto *GA = cast<GlobalAddressSDNode>(N.getOperand(0));
3295 std::optional<ConstantRange> CR = GA->getGlobal()->getAbsoluteSymbolRange();
3296 if (!CR || CR->getUnsignedMax().uge(1ull << VT.getSizeInBits()))
3297 return false;
3298
3299 // Okay, we can use a narrow reference.
3300 Op = CurDAG->getTargetGlobalAddress(GA->getGlobal(), SDLoc(N), VT,
3301 GA->getOffset(), GA->getTargetFlags());
3302 return true;
3303}
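// [Editorial illustration, not part of the original source.] The narrow
// reference relies on !absolute_symbol range metadata, e.g. with
//   @flag = external global i8, !absolute_symbol !0
//   !0 = !{i64 0, i64 256}
// a (trunc (ptrtoint @flag to i64) to i8) may be folded to an 8-bit
// immediate operand, since the dropped high bits are provably zero.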
3304
3305bool X86DAGToDAGISel::tryFoldLoad(SDNode *Root, SDNode *P, SDValue N,
3306 SDValue &Base, SDValue &Scale,
3307 SDValue &Index, SDValue &Disp,
3308 SDValue &Segment) {
3309 assert(Root && P && "Unknown root/parent nodes");
3310 if (!ISD::isNON_EXTLoad(N.getNode()) ||
3311 !IsProfitableToFold(N, P, Root) ||
3312 !IsLegalToFold(N, P, Root, OptLevel))
3313 return false;
3314
3315 return selectAddr(N.getNode(),
3316 N.getOperand(1), Base, Scale, Index, Disp, Segment);
3317}
3318
3319bool X86DAGToDAGISel::tryFoldBroadcast(SDNode *Root, SDNode *P, SDValue N,
3320 SDValue &Base, SDValue &Scale,
3321 SDValue &Index, SDValue &Disp,
3322 SDValue &Segment) {
3323 assert(Root && P && "Unknown root/parent nodes");
3324 if (N->getOpcode() != X86ISD::VBROADCAST_LOAD ||
3325 !IsProfitableToFold(N, P, Root) ||
3326 !IsLegalToFold(N, P, Root, OptLevel))
3327 return false;
3328
3329 return selectAddr(N.getNode(),
3330 N.getOperand(1), Base, Scale, Index, Disp, Segment);
3331}
3332
3333/// Return an SDNode that returns the value of the global base register.
3334/// Output instructions required to initialize the global base register,
3335/// if necessary.
3336SDNode *X86DAGToDAGISel::getGlobalBaseReg() {
3337 Register GlobalBaseReg = getInstrInfo()->getGlobalBaseReg(MF);
3338 auto &DL = MF->getDataLayout();
3339 return CurDAG->getRegister(GlobalBaseReg, TLI->getPointerTy(DL)).getNode();
3340}
3341
3342bool X86DAGToDAGISel::isSExtAbsoluteSymbolRef(unsigned Width, SDNode *N) const {
3343 if (N->getOpcode() == ISD::TRUNCATE)
3344 N = N->getOperand(0).getNode();
3345 if (N->getOpcode() != X86ISD::Wrapper)
3346 return false;
3347
3348 auto *GA = dyn_cast<GlobalAddressSDNode>(N->getOperand(0));
3349 if (!GA)
3350 return false;
3351
3352 auto *GV = GA->getGlobal();
3353 std::optional<ConstantRange> CR = GV->getAbsoluteSymbolRange();
3354 if (CR)
3355 return CR->getSignedMin().sge(-1ull << Width) &&
3356 CR->getSignedMax().slt(1ull << Width);
3357 // In the kernel code model, globals are in the negative 2GB of the address
3358 // space, so globals can be a sign extended 32-bit immediate.
3359 // In other code models, small globals are in the low 2GB of the address
3360 // space, so sign extending them is equivalent to zero extending them.
3361 return TM.getCodeModel() != CodeModel::Large && Width == 32 &&
3362 !TM.isLargeGlobalValue(GV);
3363}
3364
3365X86::CondCode X86DAGToDAGISel::getCondFromNode(SDNode *N) const {
3366 assert(N->isMachineOpcode() && "Unexpected node");
3367 unsigned Opc = N->getMachineOpcode();
3368 const MCInstrDesc &MCID = getInstrInfo()->get(Opc);
3369 int CondNo = X86::getCondSrcNoFromDesc(MCID);
3370 if (CondNo < 0)
3371 return X86::COND_INVALID;
3372
3373 return static_cast<X86::CondCode>(N->getConstantOperandVal(CondNo));
3374}
3375
3376/// Test whether the given X86ISD::CMP node has any users that use a flag
3377/// other than ZF.
3378bool X86DAGToDAGISel::onlyUsesZeroFlag(SDValue Flags) const {
3379 // Examine each user of the node.
3380 for (SDUse &Use : Flags->uses()) {
3381 // Only check things that use the flags.
3382 if (Use.getResNo() != Flags.getResNo())
3383 continue;
3384 SDNode *User = Use.getUser();
3385 // Only examine CopyToReg uses that copy to EFLAGS.
3386 if (User->getOpcode() != ISD::CopyToReg ||
3387 cast<RegisterSDNode>(User->getOperand(1))->getReg() != X86::EFLAGS)
3388 return false;
3389 // Examine each user of the CopyToReg use.
3390 for (SDUse &FlagUse : User->uses()) {
3391 // Only examine the Flag result.
3392 if (FlagUse.getResNo() != 1)
3393 continue;
3394 // Anything unusual: assume conservatively.
3395 if (!FlagUse.getUser()->isMachineOpcode())
3396 return false;
3397 // Examine the condition code of the user.
3398 X86::CondCode CC = getCondFromNode(FlagUse.getUser());
3399
3400 switch (CC) {
3401 // Comparisons which only use the zero flag.
3402 case X86::COND_E: case X86::COND_NE:
3403 continue;
3404 // Anything else: assume conservatively.
3405 default:
3406 return false;
3407 }
3408 }
3409 }
3410 return true;
3411}
3412
3413/// Test whether the given X86ISD::CMP node has any uses which require the SF
3414/// flag to be accurate.
3415bool X86DAGToDAGISel::hasNoSignFlagUses(SDValue Flags) const {
3416 // Examine each user of the node.
3417 for (SDUse &Use : Flags->uses()) {
3418 // Only check things that use the flags.
3419 if (Use.getResNo() != Flags.getResNo())
3420 continue;
3421 SDNode *User = Use.getUser();
3422 // Only examine CopyToReg uses that copy to EFLAGS.
3423 if (User->getOpcode() != ISD::CopyToReg ||
3424 cast<RegisterSDNode>(User->getOperand(1))->getReg() != X86::EFLAGS)
3425 return false;
3426 // Examine each user of the CopyToReg use.
3427 for (SDUse &FlagUse : User->uses()) {
3428 // Only examine the Flag result.
3429 if (FlagUse.getResNo() != 1)
3430 continue;
3431 // Anything unusual: assume conservatively.
3432 if (!FlagUse.getUser()->isMachineOpcode())
3433 return false;
3434 // Examine the condition code of the user.
3435 X86::CondCode CC = getCondFromNode(FlagUse.getUser());
3436
3437 switch (CC) {
3438 // Comparisons which don't examine the SF flag.
3439 case X86::COND_A: case X86::COND_AE:
3440 case X86::COND_B: case X86::COND_BE:
3441 case X86::COND_E: case X86::COND_NE:
3442 case X86::COND_O: case X86::COND_NO:
3443 case X86::COND_P: case X86::COND_NP:
3444 continue;
3445 // Anything else: assume conservatively.
3446 default:
3447 return false;
3448 }
3449 }
3450 }
3451 return true;
3452}
3453
3454 static bool mayUseCarryFlag(X86::CondCode CC) {
3455 switch (CC) {
3456 // Comparisons which don't examine the CF flag.
3457 case X86::COND_O: case X86::COND_NO:
3458 case X86::COND_E: case X86::COND_NE:
3459 case X86::COND_S: case X86::COND_NS:
3460 case X86::COND_P: case X86::COND_NP:
3461 case X86::COND_L: case X86::COND_GE:
3462 case X86::COND_G: case X86::COND_LE:
3463 return false;
3464 // Anything else: assume conservatively.
3465 default:
3466 return true;
3467 }
3468}
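// [Editorial illustration, not part of the original source.] Concretely,
// INC/DEC leave CF untouched, so a memory `subl $1, (%rdi)` followed by
// `jb` (COND_B reads CF) must not be rewritten as `decl (%rdi)`, while the
// same pattern followed by `je`/`jne` (ZF only) safely can be. The walks
// below use mayUseCarryFlag to make exactly that distinction.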
3469
3470/// Test whether the given node which sets flags has any uses which require the
3471/// CF flag to be accurate.
3472 bool X86DAGToDAGISel::hasNoCarryFlagUses(SDValue Flags) const {
3473 // Examine each user of the node.
3474 for (SDUse &Use : Flags->uses()) {
3475 // Only check things that use the flags.
3476 if (Use.getResNo() != Flags.getResNo())
3477 continue;
3478
3479 SDNode *User = Use.getUser();
3480 unsigned UserOpc = User->getOpcode();
3481
3482 if (UserOpc == ISD::CopyToReg) {
3483 // Only examine CopyToReg uses that copy to EFLAGS.
3484 if (cast<RegisterSDNode>(User->getOperand(1))->getReg() != X86::EFLAGS)
3485 return false;
3486 // Examine each user of the CopyToReg use.
3487 for (SDUse &FlagUse : User->uses()) {
3488 // Only examine the Flag result.
3489 if (FlagUse.getResNo() != 1)
3490 continue;
3491 // Anything unusual: assume conservatively.
3492 if (!FlagUse.getUser()->isMachineOpcode())
3493 return false;
3494 // Examine the condition code of the user.
3495 X86::CondCode CC = getCondFromNode(FlagUse.getUser());
3496
3497 if (mayUseCarryFlag(CC))
3498 return false;
3499 }
3500
3501 // This CopyToReg is ok. Move on to the next user.
3502 continue;
3503 }
3504
3505 // This might be an unselected node. So look for the pre-isel opcodes that
3506 // use flags.
3507 unsigned CCOpNo;
3508 switch (UserOpc) {
3509 default:
3510 // Something unusual. Be conservative.
3511 return false;
3512 case X86ISD::SETCC: CCOpNo = 0; break;
3513 case X86ISD::SETCC_CARRY: CCOpNo = 0; break;
3514 case X86ISD::CMOV: CCOpNo = 2; break;
3515 case X86ISD::BRCOND: CCOpNo = 2; break;
3516 }
3517
3518 X86::CondCode CC = (X86::CondCode)User->getConstantOperandVal(CCOpNo);
3519 if (mayUseCarryFlag(CC))
3520 return false;
3521 }
3522 return true;
3523}
3524
3525bool X86DAGToDAGISel::checkTCRetEnoughRegs(SDNode *N) const {
3526 // Check that there are enough volatile registers to load the callee address.
3527
3528 const X86RegisterInfo *RI = Subtarget->getRegisterInfo();
3529 unsigned AvailGPRs;
3530 // The register classes below must stay in sync with what's used for
3531 // TCRETURNri, TCRETURN_HIPE32ri, TCRETURN_WIN64ri, etc.
3532 if (Subtarget->is64Bit()) {
3533 const TargetRegisterClass *TCGPRs =
3534 Subtarget->isCallingConvWin64(MF->getFunction().getCallingConv())
3535 ? &X86::GR64_TCW64RegClass
3536 : &X86::GR64_TCRegClass;
3537 // Can't use RSP or RIP for the load in general.
3538 assert(TCGPRs->contains(X86::RSP));
3539 assert(TCGPRs->contains(X86::RIP));
3540 AvailGPRs = TCGPRs->getNumRegs() - 2;
3541 } else {
3542 const TargetRegisterClass *TCGPRs =
3543 MF->getFunction().getCallingConv() == CallingConv::HiPE
3544 ? &X86::GR32RegClass
3545 : &X86::GR32_TCRegClass;
3546 // Can't use ESP for the address in general.
3547 assert(TCGPRs->contains(X86::ESP));
3548 AvailGPRs = TCGPRs->getNumRegs() - 1;
3549 }
3550
3551 // The load's base and index need up to two registers.
3552 unsigned LoadGPRs = 2;
3553
3554 assert(N->getOpcode() == X86ISD::TC_RETURN);
3555 // X86tcret args: (*chain, ptr, imm, regs..., glue)
3556
3557 if (Subtarget->is32Bit()) {
3558 // FIXME: This was carried from X86tcret_1reg which was used for 32-bit,
3559 // but it could apply to 64-bit too.
3560 const SDValue &BasePtr = cast<LoadSDNode>(N->getOperand(1))->getBasePtr();
3561 if (isa<FrameIndexSDNode>(BasePtr)) {
3562 LoadGPRs -= 2; // Base is fixed index off ESP; no regs needed.
3563 } else if (BasePtr.getOpcode() == X86ISD::Wrapper &&
3564 isa<GlobalAddressSDNode>(BasePtr->getOperand(0))) {
3565 assert(!getTargetMachine().isPositionIndependent());
3566 LoadGPRs -= 1; // Base is a global (immediate since this is non-PIC), no
3567 // reg needed.
3568 }
3569 }
3570
3571 unsigned ArgGPRs = 0;
3572 for (unsigned I = 3, E = N->getNumOperands(); I != E; ++I) {
3573 if (const auto *RN = dyn_cast<RegisterSDNode>(N->getOperand(I))) {
3574 if (!RI->isGeneralPurposeRegister(*MF, RN->getReg()))
3575 continue;
3576 if (++ArgGPRs + LoadGPRs > AvailGPRs)
3577 return false;
3578 }
3579 }
3580
3581 return true;
3582}
3583
3584/// Check whether or not the chain ending in StoreNode is suitable for doing
3585/// the {load; op; store} to modify transformation.
3586 static bool isFusableLoadOpStorePattern(StoreSDNode *StoreNode,
3587 SDValue StoredVal, SelectionDAG *CurDAG,
3588 unsigned LoadOpNo,
3589 LoadSDNode *&LoadNode,
3590 SDValue &InputChain) {
3591 // Is the stored value result 0 of the operation?
3592 if (StoredVal.getResNo() != 0) return false;
3593
3594 // Are there other uses of the operation other than the store?
3595 if (!StoredVal.getNode()->hasNUsesOfValue(1, 0)) return false;
3596
3597 // Is the store non-extending and non-indexed?
3598 if (!ISD::isNormalStore(StoreNode) || StoreNode->isNonTemporal())
3599 return false;
3600
3601 SDValue Load = StoredVal->getOperand(LoadOpNo);
3602 // Is the stored value a non-extending and non-indexed load?
3603 if (!ISD::isNormalLoad(Load.getNode())) return false;
3604
3605 // Return LoadNode by reference.
3606 LoadNode = cast<LoadSDNode>(Load);
3607
3608 // Is the store the only read of the loaded value?
3609 if (!Load.hasOneUse())
3610 return false;
3611
3612 // Is the address of the store the same as the load?
3613 if (LoadNode->getBasePtr() != StoreNode->getBasePtr() ||
3614 LoadNode->getOffset() != StoreNode->getOffset())
3615 return false;
3616
3617 bool FoundLoad = false;
3618 SmallVector<SDValue, 4> ChainOps;
3619 SmallVector<const SDNode *, 4> LoopWorklist;
3620 SmallPtrSet<const SDNode *, 16> Visited;
3621 const unsigned int Max = 1024;
3622
3623 // Visualization of Load-Op-Store fusion:
3624 // -------------------------
3625 // Legend:
3626 // *-lines = Chain operand dependencies.
3627 // |-lines = Normal operand dependencies.
3628 // Dependencies flow down and right. n-suffix references multiple nodes.
3629 //
3630 // C Xn C
3631 // * * *
3632 // * * *
3633 // Xn A-LD Yn TF Yn
3634 // * * \ | * |
3635 // * * \ | * |
3636 // * * \ | => A--LD_OP_ST
3637 // * * \| \
3638 // TF OP \
3639 // * | \ Zn
3640 // * | \
3641 // A-ST Zn
3642 //
3643
3644 // This merge induces dependencies from: #1: Xn -> LD, OP, Zn
3645 // #2: Yn -> LD
3646 // #3: ST -> Zn
3647
3648 // Ensure the transform is safe by checking for the dual
3649 // dependencies to make sure we do not induce a loop.
3650
3651 // As LD is a predecessor to both OP and ST we can do this by checking:
3652 // a). if LD is a predecessor to a member of Xn or Yn.
3653 // b). if a Zn is a predecessor to ST.
3654
3655 // However, (b) can only occur through being a chain predecessor to
3656 // ST, which is the same as Zn being a member or predecessor of Xn,
3657 // which is a subset of LD being a predecessor of Xn. So it's
3658 // subsumed by check (a).
3659
3660 SDValue Chain = StoreNode->getChain();
3661
3662 // Gather X elements in ChainOps.
3663 if (Chain == Load.getValue(1)) {
3664 FoundLoad = true;
3665 ChainOps.push_back(Load.getOperand(0));
3666 } else if (Chain.getOpcode() == ISD::TokenFactor) {
3667 for (unsigned i = 0, e = Chain.getNumOperands(); i != e; ++i) {
3668 SDValue Op = Chain.getOperand(i);
3669 if (Op == Load.getValue(1)) {
3670 FoundLoad = true;
3671 // Drop Load, but keep its chain. No cycle check necessary.
3672 ChainOps.push_back(Load.getOperand(0));
3673 continue;
3674 }
3675 LoopWorklist.push_back(Op.getNode());
3676 ChainOps.push_back(Op);
3677 }
3678 }
3679
3680 if (!FoundLoad)
3681 return false;
3682
3683 // Worklist is currently Xn. Add Yn to worklist.
3684 for (SDValue Op : StoredVal->ops())
3685 if (Op.getNode() != LoadNode)
3686 LoopWorklist.push_back(Op.getNode());
3687
3688 // Check (a) if Load is a predecessor to Xn + Yn
3689 if (SDNode::hasPredecessorHelper(Load.getNode(), Visited, LoopWorklist, Max,
3690 true))
3691 return false;
3692
3693 InputChain =
3694 CurDAG->getNode(ISD::TokenFactor, SDLoc(Chain), MVT::Other, ChainOps);
3695 return true;
3696}
3697
3698// Change a chain of {load; op; store} of the same value into a simple op
3699// through memory of that value, if the uses of the modified value and its
3700// address are suitable.
3701//
3702 // The tablegen memory operand pattern is currently not able to match
3703// the case where the EFLAGS on the original operation are used.
3704//
3705// To move this to tablegen, we'll need to improve tablegen to allow flags to
3706// be transferred from a node in the pattern to the result node, probably with
3707// a new keyword. For example, we have this
3708// def DEC64m : RI<0xFF, MRM1m, (outs), (ins i64mem:$dst), "dec{q}\t$dst",
3709// [(store (add (loadi64 addr:$dst), -1), addr:$dst)]>;
3710// but maybe need something like this
3711// def DEC64m : RI<0xFF, MRM1m, (outs), (ins i64mem:$dst), "dec{q}\t$dst",
3712// [(store (X86add_flag (loadi64 addr:$dst), -1), addr:$dst),
3713// (transferrable EFLAGS)]>;
3714//
3715// Until then, we manually fold these and instruction select the operation
3716// here.
3717bool X86DAGToDAGISel::foldLoadStoreIntoMemOperand(SDNode *Node) {
3718 auto *StoreNode = cast<StoreSDNode>(Node);
3719 SDValue StoredVal = StoreNode->getOperand(1);
3720 unsigned Opc = StoredVal->getOpcode();
3721
3722 // Before we try to select anything, make sure this is memory operand size
3723 // and opcode we can handle. Note that this must match the code below that
3724 // actually lowers the opcodes.
3725 EVT MemVT = StoreNode->getMemoryVT();
3726 if (MemVT != MVT::i64 && MemVT != MVT::i32 && MemVT != MVT::i16 &&
3727 MemVT != MVT::i8)
3728 return false;
3729
3730 bool IsCommutable = false;
3731 bool IsNegate = false;
3732 switch (Opc) {
3733 default:
3734 return false;
3735 case X86ISD::SUB:
3736 IsNegate = isNullConstant(StoredVal.getOperand(0));
3737 break;
3738 case X86ISD::SBB:
3739 break;
3740 case X86ISD::ADD:
3741 case X86ISD::ADC:
3742 case X86ISD::AND:
3743 case X86ISD::OR:
3744 case X86ISD::XOR:
3745 IsCommutable = true;
3746 break;
3747 }
3748
3749 unsigned LoadOpNo = IsNegate ? 1 : 0;
3750 LoadSDNode *LoadNode = nullptr;
3751 SDValue InputChain;
3752 if (!isFusableLoadOpStorePattern(StoreNode, StoredVal, CurDAG, LoadOpNo,
3753 LoadNode, InputChain)) {
3754 if (!IsCommutable)
3755 return false;
3756
3757 // This operation is commutable, try the other operand.
3758 LoadOpNo = 1;
3759 if (!isFusableLoadOpStorePattern(StoreNode, StoredVal, CurDAG, LoadOpNo,
3760 LoadNode, InputChain))
3761 return false;
3762 }
3763
3764 SDValue Base, Scale, Index, Disp, Segment;
3765 if (!selectAddr(LoadNode, LoadNode->getBasePtr(), Base, Scale, Index, Disp,
3766 Segment))
3767 return false;
3768
3769 auto SelectOpcode = [&](unsigned Opc64, unsigned Opc32, unsigned Opc16,
3770 unsigned Opc8) {
3771 switch (MemVT.getSimpleVT().SimpleTy) {
3772 case MVT::i64:
3773 return Opc64;
3774 case MVT::i32:
3775 return Opc32;
3776 case MVT::i16:
3777 return Opc16;
3778 case MVT::i8:
3779 return Opc8;
3780 default:
3781 llvm_unreachable("Invalid size!");
3782 }
3783 };
3784
3785 MachineSDNode *Result;
3786 switch (Opc) {
3787 case X86ISD::SUB:
3788 // Handle negate.
3789 if (IsNegate) {
3790 unsigned NewOpc = SelectOpcode(X86::NEG64m, X86::NEG32m, X86::NEG16m,
3791 X86::NEG8m);
3792 const SDValue Ops[] = {Base, Scale, Index, Disp, Segment, InputChain};
3793 Result = CurDAG->getMachineNode(NewOpc, SDLoc(Node), MVT::i32,
3794 MVT::Other, Ops);
3795 break;
3796 }
3797 [[fallthrough]];
3798 case X86ISD::ADD:
3799 // Try to match inc/dec.
3800 if (!Subtarget->slowIncDec() || CurDAG->shouldOptForSize()) {
3801 bool IsOne = isOneConstant(StoredVal.getOperand(1));
3802 bool IsNegOne = isAllOnesConstant(StoredVal.getOperand(1));
3803 // An ADD/SUB with 1/-1 whose carry flag isn't used can use inc/dec.
3804 if ((IsOne || IsNegOne) && hasNoCarryFlagUses(StoredVal.getValue(1))) {
3805 unsigned NewOpc =
3806 ((Opc == X86ISD::ADD) == IsOne)
3807 ? SelectOpcode(X86::INC64m, X86::INC32m, X86::INC16m, X86::INC8m)
3808 : SelectOpcode(X86::DEC64m, X86::DEC32m, X86::DEC16m, X86::DEC8m);
3809 const SDValue Ops[] = {Base, Scale, Index, Disp, Segment, InputChain};
3810 Result = CurDAG->getMachineNode(NewOpc, SDLoc(Node), MVT::i32,
3811 MVT::Other, Ops);
3812 break;
3813 }
3814 }
3815 [[fallthrough]];
3816 case X86ISD::ADC:
3817 case X86ISD::SBB:
3818 case X86ISD::AND:
3819 case X86ISD::OR:
3820 case X86ISD::XOR: {
3821 auto SelectRegOpcode = [SelectOpcode](unsigned Opc) {
3822 switch (Opc) {
3823 case X86ISD::ADD:
3824 return SelectOpcode(X86::ADD64mr, X86::ADD32mr, X86::ADD16mr,
3825 X86::ADD8mr);
3826 case X86ISD::ADC:
3827 return SelectOpcode(X86::ADC64mr, X86::ADC32mr, X86::ADC16mr,
3828 X86::ADC8mr);
3829 case X86ISD::SUB:
3830 return SelectOpcode(X86::SUB64mr, X86::SUB32mr, X86::SUB16mr,
3831 X86::SUB8mr);
3832 case X86ISD::SBB:
3833 return SelectOpcode(X86::SBB64mr, X86::SBB32mr, X86::SBB16mr,
3834 X86::SBB8mr);
3835 case X86ISD::AND:
3836 return SelectOpcode(X86::AND64mr, X86::AND32mr, X86::AND16mr,
3837 X86::AND8mr);
3838 case X86ISD::OR:
3839 return SelectOpcode(X86::OR64mr, X86::OR32mr, X86::OR16mr, X86::OR8mr);
3840 case X86ISD::XOR:
3841 return SelectOpcode(X86::XOR64mr, X86::XOR32mr, X86::XOR16mr,
3842 X86::XOR8mr);
3843 default:
3844 llvm_unreachable("Invalid opcode!");
3845 }
3846 };
3847 auto SelectImmOpcode = [SelectOpcode](unsigned Opc) {
3848 switch (Opc) {
3849 case X86ISD::ADD:
3850 return SelectOpcode(X86::ADD64mi32, X86::ADD32mi, X86::ADD16mi,
3851 X86::ADD8mi);
3852 case X86ISD::ADC:
3853 return SelectOpcode(X86::ADC64mi32, X86::ADC32mi, X86::ADC16mi,
3854 X86::ADC8mi);
3855 case X86ISD::SUB:
3856 return SelectOpcode(X86::SUB64mi32, X86::SUB32mi, X86::SUB16mi,
3857 X86::SUB8mi);
3858 case X86ISD::SBB:
3859 return SelectOpcode(X86::SBB64mi32, X86::SBB32mi, X86::SBB16mi,
3860 X86::SBB8mi);
3861 case X86ISD::AND:
3862 return SelectOpcode(X86::AND64mi32, X86::AND32mi, X86::AND16mi,
3863 X86::AND8mi);
3864 case X86ISD::OR:
3865 return SelectOpcode(X86::OR64mi32, X86::OR32mi, X86::OR16mi,
3866 X86::OR8mi);
3867 case X86ISD::XOR:
3868 return SelectOpcode(X86::XOR64mi32, X86::XOR32mi, X86::XOR16mi,
3869 X86::XOR8mi);
3870 default:
3871 llvm_unreachable("Invalid opcode!");
3872 }
3873 };
3874
3875 unsigned NewOpc = SelectRegOpcode(Opc);
3876 SDValue Operand = StoredVal->getOperand(1-LoadOpNo);
3877
3878 // See if the operand is a constant that we can fold into an immediate
3879 // operand.
3880 if (auto *OperandC = dyn_cast<ConstantSDNode>(Operand)) {
3881 int64_t OperandV = OperandC->getSExtValue();
3882
3883 // Check if we can shrink the operand enough to fit in an immediate (or
3884 // fit into a smaller immediate) by negating it and switching the
3885 // operation.
3886 if ((Opc == X86ISD::ADD || Opc == X86ISD::SUB) &&
3887 ((MemVT != MVT::i8 && !isInt<8>(OperandV) && isInt<8>(-OperandV)) ||
3888 (MemVT == MVT::i64 && !isInt<32>(OperandV) &&
3889 isInt<32>(-OperandV))) &&
3890 hasNoCarryFlagUses(StoredVal.getValue(1))) {
3891 OperandV = -OperandV;
3892 Opc = Opc == X86ISD::ADD ? X86ISD::SUB : X86ISD::ADD;
3893 }
3894
3895 if (MemVT != MVT::i64 || isInt<32>(OperandV)) {
3896 Operand = CurDAG->getSignedTargetConstant(OperandV, SDLoc(Node), MemVT);
3897 NewOpc = SelectImmOpcode(Opc);
3898 }
3899 }
3900
3901 if (Opc == X86ISD::ADC || Opc == X86ISD::SBB) {
3902 SDValue CopyTo =
3903 CurDAG->getCopyToReg(InputChain, SDLoc(Node), X86::EFLAGS,
3904 StoredVal.getOperand(2), SDValue());
3905
3906 const SDValue Ops[] = {Base, Scale, Index, Disp,
3907 Segment, Operand, CopyTo, CopyTo.getValue(1)};
3908 Result = CurDAG->getMachineNode(NewOpc, SDLoc(Node), MVT::i32, MVT::Other,
3909 Ops);
3910 } else {
3911 const SDValue Ops[] = {Base, Scale, Index, Disp,
3912 Segment, Operand, InputChain};
3913 Result = CurDAG->getMachineNode(NewOpc, SDLoc(Node), MVT::i32, MVT::Other,
3914 Ops);
3915 }
3916 break;
3917 }
3918 default:
3919 llvm_unreachable("Invalid opcode!");
3920 }
3921
3922 MachineMemOperand *MemOps[] = {StoreNode->getMemOperand(),
3923 LoadNode->getMemOperand()};
3924 CurDAG->setNodeMemRefs(Result, MemOps);
3925
3926 // Update Load Chain uses as well.
3927 ReplaceUses(SDValue(LoadNode, 1), SDValue(Result, 1));
3928 ReplaceUses(SDValue(StoreNode, 0), SDValue(Result, 1));
3929 ReplaceUses(SDValue(StoredVal.getNode(), 1), SDValue(Result, 0));
3930 CurDAG->RemoveDeadNode(Node);
3931 return true;
3932}
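// [Editorial illustration, not part of the original source.] The fold above
// turns a load/op/store chain over one address into a single RMW form:
//   movl (%rdi), %eax
//   addl $7, %eax
//   movl %eax, (%rdi)
// becomes
//   addl $7, (%rdi)
// provided the loaded value has no other users and both memory operands use
// the same base pointer and offset.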
3933
3934// See if this is an X & Mask that we can match to BEXTR/BZHI.
3935// Where Mask is one of the following patterns:
3936// a) x & (1 << nbits) - 1
3937// b) x & ~(-1 << nbits)
3938// c) x & (-1 >> (32 - y))
3939// d) x << (32 - y) >> (32 - y)
3940// e) (1 << nbits) - 1
3941bool X86DAGToDAGISel::matchBitExtract(SDNode *Node) {
3942 assert(
3943 (Node->getOpcode() == ISD::ADD || Node->getOpcode() == ISD::AND ||
3944 Node->getOpcode() == ISD::SRL) &&
3945 "Should be either an and-mask, or right-shift after clearing high bits.");
3946
3947 // BEXTR is a BMI instruction, BZHI is a BMI2 instruction. We need at least one.
3948 if (!Subtarget->hasBMI() && !Subtarget->hasBMI2())
3949 return false;
3950
3951 MVT NVT = Node->getSimpleValueType(0);
3952
3953 // Only supported for 32 and 64 bits.
3954 if (NVT != MVT::i32 && NVT != MVT::i64)
3955 return false;
3956
3957 SDValue NBits;
3958 bool NegateNBits;
3959
3960 // If we have BMI2's BZHI, we are OK with multi-use patterns.
3961 // Otherwise, if we only have BMI1's BEXTR, we require single-use operands.
3962 const bool AllowExtraUsesByDefault = Subtarget->hasBMI2();
3963 auto checkUses = [AllowExtraUsesByDefault](
3964 SDValue Op, unsigned NUses,
3965 std::optional<bool> AllowExtraUses) {
3966 return AllowExtraUses.value_or(AllowExtraUsesByDefault) ||
3967 Op.getNode()->hasNUsesOfValue(NUses, Op.getResNo());
3968 };
3969 auto checkOneUse = [checkUses](SDValue Op,
3970 std::optional<bool> AllowExtraUses =
3971 std::nullopt) {
3972 return checkUses(Op, 1, AllowExtraUses);
3973 };
3974 auto checkTwoUse = [checkUses](SDValue Op,
3975 std::optional<bool> AllowExtraUses =
3976 std::nullopt) {
3977 return checkUses(Op, 2, AllowExtraUses);
3978 };
3979
3980 auto peekThroughOneUseTruncation = [checkOneUse](SDValue V) {
3981 if (V->getOpcode() == ISD::TRUNCATE && checkOneUse(V)) {
3982 assert(V.getSimpleValueType() == MVT::i32 &&
3983 V.getOperand(0).getSimpleValueType() == MVT::i64 &&
3984 "Expected i64 -> i32 truncation");
3985 V = V.getOperand(0);
3986 }
3987 return V;
3988 };
3989
3990 // a) x & ((1 << nbits) + (-1))
3991 auto matchPatternA = [checkOneUse, peekThroughOneUseTruncation, &NBits,
3992 &NegateNBits](SDValue Mask) -> bool {
3993 // Match `add`. Must only have one use!
3994 if (Mask->getOpcode() != ISD::ADD || !checkOneUse(Mask))
3995 return false;
3996 // We should be adding all-ones constant (i.e. subtracting one.)
3997 if (!isAllOnesConstant(Mask->getOperand(1)))
3998 return false;
3999 // Match `1 << nbits`. Might be truncated. Must only have one use!
4000 SDValue M0 = peekThroughOneUseTruncation(Mask->getOperand(0));
4001 if (M0->getOpcode() != ISD::SHL || !checkOneUse(M0))
4002 return false;
4003 if (!isOneConstant(M0->getOperand(0)))
4004 return false;
4005 NBits = M0->getOperand(1);
4006 NegateNBits = false;
4007 return true;
4008 };
4009
4010 auto isAllOnes = [this, peekThroughOneUseTruncation, NVT](SDValue V) {
4011 V = peekThroughOneUseTruncation(V);
4012 return CurDAG->MaskedValueIsAllOnes(
4013 V, APInt::getLowBitsSet(V.getSimpleValueType().getSizeInBits(),
4014 NVT.getSizeInBits()));
4015 };
4016
4017 // b) x & ~(-1 << nbits)
4018 auto matchPatternB = [checkOneUse, isAllOnes, peekThroughOneUseTruncation,
4019 &NBits, &NegateNBits](SDValue Mask) -> bool {
4020 // Match `~()`. Must only have one use!
4021 if (Mask.getOpcode() != ISD::XOR || !checkOneUse(Mask))
4022 return false;
4023 // The -1 only has to be all-ones for the final Node's NVT.
4024 if (!isAllOnes(Mask->getOperand(1)))
4025 return false;
4026 // Match `-1 << nbits`. Might be truncated. Must only have one use!
4027 SDValue M0 = peekThroughOneUseTruncation(Mask->getOperand(0));
4028 if (M0->getOpcode() != ISD::SHL || !checkOneUse(M0))
4029 return false;
4030 // The -1 only has to be all-ones for the final Node's NVT.
4031 if (!isAllOnes(M0->getOperand(0)))
4032 return false;
4033 NBits = M0->getOperand(1);
4034 NegateNBits = false;
4035 return true;
4036 };
4037
4038 // Try to match potentially-truncated shift amount as `(bitwidth - y)`,
4039 // or leave the shift amount as-is, but then we'll have to negate it.
4040 auto canonicalizeShiftAmt = [&NBits, &NegateNBits](SDValue ShiftAmt,
4041 unsigned Bitwidth) {
4042 NBits = ShiftAmt;
4043 NegateNBits = true;
4044 // Skip over a truncate of the shift amount, if any.
4045 if (NBits.getOpcode() == ISD::TRUNCATE)
4046 NBits = NBits.getOperand(0);
4047 // Try to match the shift amount as (bitwidth - y). It should go away, too.
4048 // If it doesn't match, that's fine, we'll just negate it ourselves.
4049 if (NBits.getOpcode() != ISD::SUB)
4050 return;
4051 auto *V0 = dyn_cast<ConstantSDNode>(NBits.getOperand(0));
4052 if (!V0 || V0->getZExtValue() != Bitwidth)
4053 return;
4054 NBits = NBits.getOperand(1);
4055 NegateNBits = false;
4056 };
4057
4058 // c) x & (-1 >> z) but then we'll have to subtract z from bitwidth
4059 // or
4060 // c) x & (-1 >> (32 - y))
4061 auto matchPatternC = [checkOneUse, peekThroughOneUseTruncation, &NegateNBits,
4062 canonicalizeShiftAmt](SDValue Mask) -> bool {
4063 // The mask itself may be truncated.
4064 Mask = peekThroughOneUseTruncation(Mask);
4065 unsigned Bitwidth = Mask.getSimpleValueType().getSizeInBits();
4066 // Match `l>>`. Must only have one use!
4067 if (Mask.getOpcode() != ISD::SRL || !checkOneUse(Mask))
4068 return false;
4069 // We should be shifting truly all-ones constant.
4070 if (!isAllOnesConstant(Mask.getOperand(0)))
4071 return false;
4072 SDValue M1 = Mask.getOperand(1);
4073 // The shift amount should not be used externally.
4074 if (!checkOneUse(M1))
4075 return false;
4076 canonicalizeShiftAmt(M1, Bitwidth);
4077 // Pattern c. is non-canonical, and is expanded into pattern d. iff there
4078 // is no extra use of the mask. Clearly, there was one since we are here.
4079 // But at the same time, if we need to negate the shift amount,
4080 // then we don't want the mask to stick around, else it's unprofitable.
4081 return !NegateNBits;
4082 };
4083
4084 SDValue X;
4085
4086 // d) x << z >> z but then we'll have to subtract z from bitwidth
4087 // or
4088 // d) x << (32 - y) >> (32 - y)
4089 auto matchPatternD = [checkOneUse, checkTwoUse, canonicalizeShiftAmt,
4090 AllowExtraUsesByDefault, &NegateNBits,
4091 &X](SDNode *Node) -> bool {
4092 if (Node->getOpcode() != ISD::SRL)
4093 return false;
4094 SDValue N0 = Node->getOperand(0);
4095 if (N0->getOpcode() != ISD::SHL)
4096 return false;
4097 unsigned Bitwidth = N0.getSimpleValueType().getSizeInBits();
4098 SDValue N1 = Node->getOperand(1);
4099 SDValue N01 = N0->getOperand(1);
4100 // Both of the shifts must be by the exact same value.
4101 if (N1 != N01)
4102 return false;
4103 canonicalizeShiftAmt(N1, Bitwidth);
4104 // There should not be any external uses of the inner shift / shift amount.
4105 // Note that while we are generally okay with external uses given BMI2,
4106 // iff we need to negate the shift amount, we are not okay with extra uses.
4107 const bool AllowExtraUses = AllowExtraUsesByDefault && !NegateNBits;
4108 if (!checkOneUse(N0, AllowExtraUses) || !checkTwoUse(N1, AllowExtraUses))
4109 return false;
4110 X = N0->getOperand(0);
4111 return true;
4112 };
4113
4114 auto matchLowBitMask = [matchPatternA, matchPatternB,
4115 matchPatternC](SDValue Mask) -> bool {
4116 return matchPatternA(Mask) || matchPatternB(Mask) || matchPatternC(Mask);
4117 };
4118
4119 if (Node->getOpcode() == ISD::AND) {
4120 X = Node->getOperand(0);
4121 SDValue Mask = Node->getOperand(1);
4122
4123 if (matchLowBitMask(Mask)) {
4124 // Great.
4125 } else {
4126 std::swap(X, Mask);
4127 if (!matchLowBitMask(Mask))
4128 return false;
4129 }
4130 } else if (matchLowBitMask(SDValue(Node, 0))) {
4131 X = CurDAG->getAllOnesConstant(SDLoc(Node), NVT);
4132 } else if (!matchPatternD(Node))
4133 return false;
4134
4135 // If we need to negate the shift amount, require BMI2 BZHI support.
4136 // It's just too unprofitable for BMI1 BEXTR.
4137 if (NegateNBits && !Subtarget->hasBMI2())
4138 return false;
4139
4140 SDLoc DL(Node);
4141
4142 if (NBits.getSimpleValueType() != MVT::i8) {
4143 // Truncate the shift amount.
4144 NBits = CurDAG->getNode(ISD::TRUNCATE, DL, MVT::i8, NBits);
4145 insertDAGNode(*CurDAG, SDValue(Node, 0), NBits);
4146 }
4147
4148 // Turn (i32)(x & imm8) into (i32)x & imm32.
4149 ConstantSDNode *Imm = nullptr;
4150 if (NBits->getOpcode() == ISD::AND)
4151 if ((Imm = dyn_cast<ConstantSDNode>(NBits->getOperand(1))))
4152 NBits = NBits->getOperand(0);
4153
4154 // Insert 8-bit NBits into lowest 8 bits of 32-bit register.
4155 // All the other bits are undefined, we do not care about them.
4156 SDValue ImplDef = SDValue(
4157 CurDAG->getMachineNode(TargetOpcode::IMPLICIT_DEF, DL, MVT::i32), 0);
4158 insertDAGNode(*CurDAG, SDValue(Node, 0), ImplDef);
4159
4160 SDValue SRIdxVal = CurDAG->getTargetConstant(X86::sub_8bit, DL, MVT::i32);
4161 insertDAGNode(*CurDAG, SDValue(Node, 0), SRIdxVal);
4162 NBits = SDValue(CurDAG->getMachineNode(TargetOpcode::INSERT_SUBREG, DL,
4163 MVT::i32, ImplDef, NBits, SRIdxVal),
4164 0);
4165 insertDAGNode(*CurDAG, SDValue(Node, 0), NBits);
4166
4167 if (Imm) {
4168 NBits =
4169 CurDAG->getNode(ISD::AND, DL, MVT::i32, NBits,
4170 CurDAG->getConstant(Imm->getZExtValue(), DL, MVT::i32));
4171 insertDAGNode(*CurDAG, SDValue(Node, 0), NBits);
4172 }
4173
4174 // We might have matched the amount of high bits to be cleared,
4175 // but we want the amount of low bits to be kept, so negate it then.
4176 if (NegateNBits) {
4177 SDValue BitWidthC = CurDAG->getConstant(NVT.getSizeInBits(), DL, MVT::i32);
4178 insertDAGNode(*CurDAG, SDValue(Node, 0), BitWidthC);
4179
4180 NBits = CurDAG->getNode(ISD::SUB, DL, MVT::i32, BitWidthC, NBits);
4181 insertDAGNode(*CurDAG, SDValue(Node, 0), NBits);
4182 }
4183
4184 if (Subtarget->hasBMI2()) {
4185 // Great, just emit the BZHI.
4186 if (NVT != MVT::i32) {
4187 // But have to place the bit count into the wide-enough register first.
4188 NBits = CurDAG->getNode(ISD::ANY_EXTEND, DL, NVT, NBits);
4189 insertDAGNode(*CurDAG, SDValue(Node, 0), NBits);
4190 }
4191
4192 SDValue Extract = CurDAG->getNode(X86ISD::BZHI, DL, NVT, X, NBits);
4193 ReplaceNode(Node, Extract.getNode());
4194 SelectCode(Extract.getNode());
4195 return true;
4196 }
4197
4198 // Else, if we do *NOT* have BMI2, let's find out if 'X' is
4199 // *logically* shifted (potentially with a one-use trunc in between),
4200 // and if the truncation was the only use of the shift,
4201 // and if so, look past the one-use truncation.
4202 {
4203 SDValue RealX = peekThroughOneUseTruncation(X);
4204 // FIXME: only if the shift is one-use?
4205 if (RealX != X && RealX.getOpcode() == ISD::SRL)
4206 X = RealX;
4207 }
4208
4209 MVT XVT = X.getSimpleValueType();
4210
4211 // Else, emitting BEXTR requires one more step.
4212 // The 'control' of BEXTR has the pattern of:
4213 // [15...8 bit][ 7...0 bit] location
4214 // [ bit count][ shift] name
4215 // I.e. 0b00000010'00000001 means (x >> 0b1) & 0b11
4216
4217 // Shift NBits left by 8 bits, thus producing 'control'.
4218 // This makes the low 8 bits to be zero.
4219 SDValue C8 = CurDAG->getConstant(8, DL, MVT::i8);
4220 insertDAGNode(*CurDAG, SDValue(Node, 0), C8);
4221 SDValue Control = CurDAG->getNode(ISD::SHL, DL, MVT::i32, NBits, C8);
4222 insertDAGNode(*CurDAG, SDValue(Node, 0), Control);
4223
4224 // If the 'X' is *logically* shifted, we can fold that shift into 'control'.
4225 // FIXME: only if the shift is one-use?
4226 if (X.getOpcode() == ISD::SRL) {
4227 SDValue ShiftAmt = X.getOperand(1);
4228 X = X.getOperand(0);
4229
4230 assert(ShiftAmt.getValueType() == MVT::i8 &&
4231 "Expected shift amount to be i8");
4232
4233 // Now, *zero*-extend the shift amount. The bits 8...15 *must* be zero!
4234 // We could zext to i16 in some form, but we intentionally don't do that.
4235 SDValue OrigShiftAmt = ShiftAmt;
4236 ShiftAmt = CurDAG->getNode(ISD::ZERO_EXTEND, DL, MVT::i32, ShiftAmt);
4237 insertDAGNode(*CurDAG, OrigShiftAmt, ShiftAmt);
4238
4239 // And now 'or' these low 8 bits of shift amount into the 'control'.
4240 Control = CurDAG->getNode(ISD::OR, DL, MVT::i32, Control, ShiftAmt);
4241 insertDAGNode(*CurDAG, SDValue(Node, 0), Control);
4242 }
4243
4244 // But have to place the 'control' into the wide-enough register first.
4245 if (XVT != MVT::i32) {
4246 Control = CurDAG->getNode(ISD::ANY_EXTEND, DL, XVT, Control);
4247 insertDAGNode(*CurDAG, SDValue(Node, 0), Control);
4248 }
4249
4250 // And finally, form the BEXTR itself.
4251 SDValue Extract = CurDAG->getNode(X86ISD::BEXTR, DL, XVT, X, Control);
4252
4253 // The 'X' was originally truncated. Do that now.
4254 if (XVT != NVT) {
4255 insertDAGNode(*CurDAG, SDValue(Node, 0), Extract);
4256 Extract = CurDAG->getNode(ISD::TRUNCATE, DL, NVT, Extract);
4257 }
4258
4259 ReplaceNode(Node, Extract.getNode());
4260 SelectCode(Extract.getNode());
4261
4262 return true;
4263}
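// [Editorial illustration, not part of the original source.] Pattern a)
// with BMI2 available:
//   %m  = shl i32 1, %n
//   %m1 = add i32 %m, -1
//   %r  = and i32 %x, %m1        ; x & ((1 << n) - 1)
// selects to a single `bzhil %ecx, %edi, %eax` (zero the bits from index n
// up). Without BMI2 the same extract becomes a BEXTR whose control word has
// the bit count n in bits [15:8] and a zero shift amount in bits [7:0].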
4264
4265// See if this is an (X >> C1) & C2 that we can match to BEXTR/BEXTRI.
4266MachineSDNode *X86DAGToDAGISel::matchBEXTRFromAndImm(SDNode *Node) {
4267 MVT NVT = Node->getSimpleValueType(0);
4268 SDLoc dl(Node);
4269
4270 SDValue N0 = Node->getOperand(0);
4271 SDValue N1 = Node->getOperand(1);
4272
4273 // If we have TBM we can use an immediate for the control. If we have BMI
4274 // we should only do this if the BEXTR instruction is implemented well.
4275 // Otherwise moving the control into a register makes this more costly.
4276 // TODO: Maybe load folding, greater than 32-bit masks, or a guarantee of LICM
4277 // hoisting the move immediate would make it worthwhile with a less optimal
4278 // BEXTR?
4279 bool PreferBEXTR =
4280 Subtarget->hasTBM() || (Subtarget->hasBMI() && Subtarget->hasFastBEXTR());
4281 if (!PreferBEXTR && !Subtarget->hasBMI2())
4282 return nullptr;
4283
4284 // Must have a shift right.
4285 if (N0->getOpcode() != ISD::SRL && N0->getOpcode() != ISD::SRA)
4286 return nullptr;
4287
4288 // Shift can't have additional users.
4289 if (!N0->hasOneUse())
4290 return nullptr;
4291
4292 // Only supported for 32 and 64 bits.
4293 if (NVT != MVT::i32 && NVT != MVT::i64)
4294 return nullptr;
4295
4296 // Shift amount and RHS of and must be constant.
4297 auto *MaskCst = dyn_cast<ConstantSDNode>(N1);
4298 auto *ShiftCst = dyn_cast<ConstantSDNode>(N0->getOperand(1));
4299 if (!MaskCst || !ShiftCst)
4300 return nullptr;
4301
4302 // And RHS must be a mask.
4303 uint64_t Mask = MaskCst->getZExtValue();
4304 if (!isMask_64(Mask))
4305 return nullptr;
4306
4307 uint64_t Shift = ShiftCst->getZExtValue();
4308 uint64_t MaskSize = llvm::popcount(Mask);
4309
4310 // Don't interfere with something that can be handled by extracting AH.
4311 // TODO: If we are able to fold a load, BEXTR might still be better than AH.
4312 if (Shift == 8 && MaskSize == 8)
4313 return nullptr;
4314
4315 // Make sure we are only using bits that were in the original value, not
4316 // shifted in.
4317 if (Shift + MaskSize > NVT.getSizeInBits())
4318 return nullptr;
4319
4320 // BZHI, if available, is always fast, unlike BEXTR. But even if we decide
4321 // that we can't use BEXTR, it is only worthwhile using BZHI if the mask
4322 // does not fit into 32 bits. Load folding is not a sufficient reason.
4323 if (!PreferBEXTR && MaskSize <= 32)
4324 return nullptr;
4325
4326 SDValue Control;
4327 unsigned ROpc, MOpc;
4328
4329#define GET_EGPR_IF_ENABLED(OPC) (Subtarget->hasEGPR() ? OPC##_EVEX : OPC)
4330 if (!PreferBEXTR) {
4331 assert(Subtarget->hasBMI2() && "We must have BMI2's BZHI then.");
4332 // If we can't make use of BEXTR then we can't fuse shift+mask stages.
4333 // Let's perform the mask first, and apply shift later. Note that we need to
4334 // widen the mask to account for the fact that we'll apply shift afterwards!
4335 Control = CurDAG->getTargetConstant(Shift + MaskSize, dl, NVT);
4336 ROpc = NVT == MVT::i64 ? GET_EGPR_IF_ENABLED(X86::BZHI64rr)
4337 : GET_EGPR_IF_ENABLED(X86::BZHI32rr);
4338 MOpc = NVT == MVT::i64 ? GET_EGPR_IF_ENABLED(X86::BZHI64rm)
4339 : GET_EGPR_IF_ENABLED(X86::BZHI32rm);
4340 unsigned NewOpc = NVT == MVT::i64 ? X86::MOV32ri64 : X86::MOV32ri;
4341 Control = SDValue(CurDAG->getMachineNode(NewOpc, dl, NVT, Control), 0);
4342 } else {
4343 // The 'control' of BEXTR has the pattern of:
4344 // [15...8 bit][ 7...0 bit] location
4345 // [ bit count][ shift] name
4346 // I.e. 0b00000010'00000001 means (x >> 0b1) & 0b11
4347 Control = CurDAG->getTargetConstant(Shift | (MaskSize << 8), dl, NVT);
4348 if (Subtarget->hasTBM()) {
4349 ROpc = NVT == MVT::i64 ? X86::BEXTRI64ri : X86::BEXTRI32ri;
4350 MOpc = NVT == MVT::i64 ? X86::BEXTRI64mi : X86::BEXTRI32mi;
4351 } else {
4352 assert(Subtarget->hasBMI() && "We must have BMI1's BEXTR then.");
4353 // BMI requires the immediate to be placed in a register.
4354 ROpc = NVT == MVT::i64 ? GET_EGPR_IF_ENABLED(X86::BEXTR64rr)
4355 : GET_EGPR_IF_ENABLED(X86::BEXTR32rr);
4356 MOpc = NVT == MVT::i64 ? GET_EGPR_IF_ENABLED(X86::BEXTR64rm)
4357 : GET_EGPR_IF_ENABLED(X86::BEXTR32rm);
4358 unsigned NewOpc = NVT == MVT::i64 ? X86::MOV32ri64 : X86::MOV32ri;
4359 Control = SDValue(CurDAG->getMachineNode(NewOpc, dl, NVT, Control), 0);
4360 }
4361 }
4362
4363 MachineSDNode *NewNode;
4364 SDValue Input = N0->getOperand(0);
4365 SDValue Tmp0, Tmp1, Tmp2, Tmp3, Tmp4;
4366 if (tryFoldLoad(Node, N0.getNode(), Input, Tmp0, Tmp1, Tmp2, Tmp3, Tmp4)) {
4367 SDValue Ops[] = {
4368 Tmp0, Tmp1, Tmp2, Tmp3, Tmp4, Control, Input.getOperand(0)};
4369 SDVTList VTs = CurDAG->getVTList(NVT, MVT::i32, MVT::Other);
4370 NewNode = CurDAG->getMachineNode(MOpc, dl, VTs, Ops);
4371 // Update the chain.
4372 ReplaceUses(Input.getValue(1), SDValue(NewNode, 2));
4373 // Record the mem-refs
4374 CurDAG->setNodeMemRefs(NewNode, {cast<LoadSDNode>(Input)->getMemOperand()});
4375 } else {
4376 NewNode = CurDAG->getMachineNode(ROpc, dl, NVT, MVT::i32, Input, Control);
4377 }
4378
4379 if (!PreferBEXTR) {
4380 // We still need to apply the shift.
4381 SDValue ShAmt = CurDAG->getTargetConstant(Shift, dl, NVT);
4382 unsigned NewOpc = NVT == MVT::i64 ? GET_ND_IF_ENABLED(X86::SHR64ri)
4383 : GET_ND_IF_ENABLED(X86::SHR32ri);
4384 NewNode =
4385 CurDAG->getMachineNode(NewOpc, dl, NVT, SDValue(NewNode, 0), ShAmt);
4386 }
4387
4388 return NewNode;
4389}
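// [Editorial illustration, not part of the original source.] For
// `(x >> 4) & 0xfff` we get Shift = 4 and MaskSize = 12, so with TBM this
// selects to
//   bextr $0x0c04, %edi, %eax    ; control = 4 | (12 << 8)
// while the BZHI fallback masks first with bound Shift + MaskSize = 16 and
// then applies `shrl $4` to the result.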
4390
4391 // Emit a PCMPISTR(I/M) instruction.
4392MachineSDNode *X86DAGToDAGISel::emitPCMPISTR(unsigned ROpc, unsigned MOpc,
4393 bool MayFoldLoad, const SDLoc &dl,
4394 MVT VT, SDNode *Node) {
4395 SDValue N0 = Node->getOperand(0);
4396 SDValue N1 = Node->getOperand(1);
4397 SDValue Imm = Node->getOperand(2);
4398 auto *Val = cast<ConstantSDNode>(Imm)->getConstantIntValue();
4399 Imm = CurDAG->getTargetConstant(*Val, SDLoc(Node), Imm.getValueType());
4400
4401 // Try to fold a load. No need to check alignment.
4402 SDValue Tmp0, Tmp1, Tmp2, Tmp3, Tmp4;
4403 if (MayFoldLoad && tryFoldLoad(Node, N1, Tmp0, Tmp1, Tmp2, Tmp3, Tmp4)) {
4404 SDValue Ops[] = { N0, Tmp0, Tmp1, Tmp2, Tmp3, Tmp4, Imm,
4405 N1.getOperand(0) };
4406 SDVTList VTs = CurDAG->getVTList(VT, MVT::i32, MVT::Other);
4407 MachineSDNode *CNode = CurDAG->getMachineNode(MOpc, dl, VTs, Ops);
4408 // Update the chain.
4409 ReplaceUses(N1.getValue(1), SDValue(CNode, 2));
4410 // Record the mem-refs
4411 CurDAG->setNodeMemRefs(CNode, {cast<LoadSDNode>(N1)->getMemOperand()});
4412 return CNode;
4413 }
4414
4415 SDValue Ops[] = { N0, N1, Imm };
4416 SDVTList VTs = CurDAG->getVTList(VT, MVT::i32);
4417 MachineSDNode *CNode = CurDAG->getMachineNode(ROpc, dl, VTs, Ops);
4418 return CNode;
4419}
4420
4421 // Emit a PCMPESTR(I/M) instruction. Also return the Glue result in case we need
4422// to emit a second instruction after this one. This is needed since we have two
4423// copyToReg nodes glued before this and we need to continue that glue through.
4424MachineSDNode *X86DAGToDAGISel::emitPCMPESTR(unsigned ROpc, unsigned MOpc,
4425 bool MayFoldLoad, const SDLoc &dl,
4426 MVT VT, SDNode *Node,
4427 SDValue &InGlue) {
4428 SDValue N0 = Node->getOperand(0);
4429 SDValue N2 = Node->getOperand(2);
4430 SDValue Imm = Node->getOperand(4);
4431 auto *Val = cast<ConstantSDNode>(Imm)->getConstantIntValue();
4432 Imm = CurDAG->getTargetConstant(*Val, SDLoc(Node), Imm.getValueType());
4433
4434 // Try to fold a load. No need to check alignment.
4435 SDValue Tmp0, Tmp1, Tmp2, Tmp3, Tmp4;
4436 if (MayFoldLoad && tryFoldLoad(Node, N2, Tmp0, Tmp1, Tmp2, Tmp3, Tmp4)) {
4437 SDValue Ops[] = { N0, Tmp0, Tmp1, Tmp2, Tmp3, Tmp4, Imm,
4438 N2.getOperand(0), InGlue };
4439 SDVTList VTs = CurDAG->getVTList(VT, MVT::i32, MVT::Other, MVT::Glue);
4440 MachineSDNode *CNode = CurDAG->getMachineNode(MOpc, dl, VTs, Ops);
4441 InGlue = SDValue(CNode, 3);
4442 // Update the chain.
4443 ReplaceUses(N2.getValue(1), SDValue(CNode, 2));
4444 // Record the mem-refs
4445 CurDAG->setNodeMemRefs(CNode, {cast<LoadSDNode>(N2)->getMemOperand()});
4446 return CNode;
4447 }
4448
4449 SDValue Ops[] = { N0, N2, Imm, InGlue };
4450 SDVTList VTs = CurDAG->getVTList(VT, MVT::i32, MVT::Glue);
4451 MachineSDNode *CNode = CurDAG->getMachineNode(ROpc, dl, VTs, Ops);
4452 InGlue = SDValue(CNode, 2);
4453 return CNode;
4454}
4455
4456bool X86DAGToDAGISel::tryShiftAmountMod(SDNode *N) {
4457 EVT VT = N->getValueType(0);
4458
4459 // Only handle scalar shifts.
4460 if (VT.isVector())
4461 return false;
4462
4463 // Narrower shifts only mask to 5 bits in hardware.
4464 unsigned Size = VT == MVT::i64 ? 64 : 32;
4465
4466 SDValue OrigShiftAmt = N->getOperand(1);
4467 SDValue ShiftAmt = OrigShiftAmt;
4468 SDLoc DL(N);
4469
4470 // Skip over a truncate of the shift amount.
4471 if (ShiftAmt->getOpcode() == ISD::TRUNCATE)
4472 ShiftAmt = ShiftAmt->getOperand(0);
4473
4474 // This function is called after X86DAGToDAGISel::matchBitExtract(),
4475 // so we are not afraid that we might mess up BZHI/BEXTR pattern.
4476
4477 SDValue NewShiftAmt;
4478 if (ShiftAmt->getOpcode() == ISD::ADD || ShiftAmt->getOpcode() == ISD::SUB ||
4479 ShiftAmt->getOpcode() == ISD::XOR) {
4480 SDValue Add0 = ShiftAmt->getOperand(0);
4481 SDValue Add1 = ShiftAmt->getOperand(1);
4482 auto *Add0C = dyn_cast<ConstantSDNode>(Add0);
4483 auto *Add1C = dyn_cast<ConstantSDNode>(Add1);
4484 // If we are shifting by X+/-/^N where N == 0 mod Size, then just shift by X
4485 // to avoid the ADD/SUB/XOR.
4486 if (Add1C && Add1C->getAPIntValue().urem(Size) == 0) {
4487 NewShiftAmt = Add0;
4488
4489 } else if (ShiftAmt->getOpcode() != ISD::ADD && ShiftAmt.hasOneUse() &&
4490 ((Add0C && Add0C->getAPIntValue().urem(Size) == Size - 1) ||
4491 (Add1C && Add1C->getAPIntValue().urem(Size) == Size - 1))) {
4492 // If we are doing a NOT on just the lower bits with (Size*N-1) -/^ X
4493 // we can replace it with a NOT. In the XOR case it may save some code
4494 // size, in the SUB case it also may save a move.
4495 assert(Add0C == nullptr || Add1C == nullptr);
4496
4497 // We can only do N-X, not X-N
4498 if (ShiftAmt->getOpcode() == ISD::SUB && Add0C == nullptr)
4499 return false;
4500
4501 EVT OpVT = ShiftAmt.getValueType();
4502
4503 SDValue AllOnes = CurDAG->getAllOnesConstant(DL, OpVT);
4504 NewShiftAmt = CurDAG->getNode(ISD::XOR, DL, OpVT,
4505 Add0C == nullptr ? Add0 : Add1, AllOnes);
4506 insertDAGNode(*CurDAG, OrigShiftAmt, AllOnes);
4507 insertDAGNode(*CurDAG, OrigShiftAmt, NewShiftAmt);
4508 // If we are shifting by N-X where N == 0 mod Size, then just shift by
4509 // -X to generate a NEG instead of a SUB of a constant.
4510 } else if (ShiftAmt->getOpcode() == ISD::SUB && Add0C &&
4511 Add0C->getZExtValue() != 0) {
4512 EVT SubVT = ShiftAmt.getValueType();
4513 SDValue X;
4514 if (Add0C->getZExtValue() % Size == 0)
4515 X = Add1;
4516 else if (ShiftAmt.hasOneUse() && Size == 64 &&
4517 Add0C->getZExtValue() % 32 == 0) {
4518 // We have a 64-bit shift by (n*32-x), turn it into -(x+n*32).
4519 // This is mainly beneficial if we already compute (x+n*32).
4520 if (Add1.getOpcode() == ISD::TRUNCATE) {
4521 Add1 = Add1.getOperand(0);
4522 SubVT = Add1.getValueType();
4523 }
4524 if (Add0.getValueType() != SubVT) {
4525 Add0 = CurDAG->getZExtOrTrunc(Add0, DL, SubVT);
4526 insertDAGNode(*CurDAG, OrigShiftAmt, Add0);
4527 }
4528
4529 X = CurDAG->getNode(ISD::ADD, DL, SubVT, Add1, Add0);
4530 insertDAGNode(*CurDAG, OrigShiftAmt, X);
4531 } else
4532 return false;
4533 // Insert a negate op.
4534 // TODO: This isn't guaranteed to replace the sub if there is a logic cone
4535 // that uses it that's not a shift.
4536 SDValue Zero = CurDAG->getConstant(0, DL, SubVT);
4537 SDValue Neg = CurDAG->getNode(ISD::SUB, DL, SubVT, Zero, X);
4538 NewShiftAmt = Neg;
4539
4540 // Insert these operands into a valid topological order so they can
4541 // get selected independently.
4542 insertDAGNode(*CurDAG, OrigShiftAmt, Zero);
4543 insertDAGNode(*CurDAG, OrigShiftAmt, Neg);
4544 } else
4545 return false;
4546 } else
4547 return false;
4548
4549 if (NewShiftAmt.getValueType() != MVT::i8) {
4550 // Need to truncate the shift amount.
4551 NewShiftAmt = CurDAG->getNode(ISD::TRUNCATE, DL, MVT::i8, NewShiftAmt);
4552 // Add to a correct topological ordering.
4553 insertDAGNode(*CurDAG, OrigShiftAmt, NewShiftAmt);
4554 }
4555
4556 // Insert a new mask to keep the shift amount legal. This should be removed
4557 // by isel patterns.
4558 NewShiftAmt = CurDAG->getNode(ISD::AND, DL, MVT::i8, NewShiftAmt,
4559 CurDAG->getConstant(Size - 1, DL, MVT::i8));
4560 // Place in a correct topological ordering.
4561 insertDAGNode(*CurDAG, OrigShiftAmt, NewShiftAmt);
4562
4563 SDNode *UpdatedNode = CurDAG->UpdateNodeOperands(N, N->getOperand(0),
4564 NewShiftAmt);
4565 if (UpdatedNode != N) {
4566 // If we found an existing node, we should replace ourselves with that node
4567 // and wait for it to be selected after its other users.
4568 ReplaceNode(N, UpdatedNode);
4569 return true;
4570 }
4571
4572 // If the original shift amount is now dead, delete it so that we don't run
4573 // it through isel.
4574 if (OrigShiftAmt.getNode()->use_empty())
4575 CurDAG->RemoveDeadNode(OrigShiftAmt.getNode());
4576
4577 // Now that we've optimized the shift amount, defer to normal isel to get
4578 // load folding and legacy vs BMI2 selection without repeating it here.
4579 SelectCode(N);
4580 return true;
4581}
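// [Editorial illustration, not part of the original source.] These rewrites
// lean on the hardware masking the shift amount to Size-1. For 64-bit
// shifts, (64 - n) & 63 == (-n) & 63, so `x << (64 - n)` needs only a
// negate of n before the shift, and `x << (n + 64)` needs no adjustment at
// all since (n + 64) & 63 == n & 63.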
4582
4583bool X86DAGToDAGISel::tryShrinkShlLogicImm(SDNode *N) {
4584 MVT NVT = N->getSimpleValueType(0);
4585 unsigned Opcode = N->getOpcode();
4586 SDLoc dl(N);
4587
4588 // For operations of the form (x << C1) op C2, check if we can use a smaller
4589 // encoding for C2 by transforming it into (x op (C2>>C1)) << C1.
4590 SDValue Shift = N->getOperand(0);
4591 SDValue N1 = N->getOperand(1);
4592
4593 auto *Cst = dyn_cast<ConstantSDNode>(N1);
4594 if (!Cst)
4595 return false;
4596
4597 int64_t Val = Cst->getSExtValue();
4598
4599 // If we have an any_extend feeding the AND, look through it to see if there
4600 // is a shift behind it. But only if the AND doesn't use the extended bits.
4601 // FIXME: Generalize this to other ANY_EXTEND than i32 to i64?
4602 bool FoundAnyExtend = false;
4603 if (Shift.getOpcode() == ISD::ANY_EXTEND && Shift.hasOneUse() &&
4604 Shift.getOperand(0).getSimpleValueType() == MVT::i32 &&
4605 isUInt<32>(Val)) {
4606 FoundAnyExtend = true;
4607 Shift = Shift.getOperand(0);
4608 }
4609
4610 if (Shift.getOpcode() != ISD::SHL || !Shift.hasOneUse())
4611 return false;
4612
4613 // i8 is unshrinkable, i16 should be promoted to i32.
4614 if (NVT != MVT::i32 && NVT != MVT::i64)
4615 return false;
4616
4617 auto *ShlCst = dyn_cast<ConstantSDNode>(Shift.getOperand(1));
4618 if (!ShlCst)
4619 return false;
4620
4621 uint64_t ShAmt = ShlCst->getZExtValue();
4622
4623 // Make sure that we don't change the operation by removing bits.
4624 // This only matters for OR and XOR, AND is unaffected.
4625 uint64_t RemovedBitsMask = (1ULL << ShAmt) - 1;
4626 if (Opcode != ISD::AND && (Val & RemovedBitsMask) != 0)
4627 return false;
4628
4629 // Check the minimum bitwidth for the new constant.
4630 // TODO: Using 16 and 8 bit operations is also possible for or32 & xor32.
4631 auto CanShrinkImmediate = [&](int64_t &ShiftedVal) {
4632 if (Opcode == ISD::AND) {
4633 // AND32ri is the same as AND64ri32 with zext imm.
4634 // Try this before sign extended immediates below.
4635 ShiftedVal = (uint64_t)Val >> ShAmt;
4636 if (NVT == MVT::i64 && !isUInt<32>(Val) && isUInt<32>(ShiftedVal))
4637 return true;
4638 // Also swap order when the AND can become MOVZX.
4639 if (ShiftedVal == UINT8_MAX || ShiftedVal == UINT16_MAX)
4640 return true;
4641 }
4642 ShiftedVal = Val >> ShAmt;
4643 if ((!isInt<8>(Val) && isInt<8>(ShiftedVal)) ||
4644 (!isInt<32>(Val) && isInt<32>(ShiftedVal)))
4645 return true;
4646 if (Opcode != ISD::AND) {
4647 // MOV32ri+OR64r/XOR64r is cheaper than MOV64ri64+OR64rr/XOR64rr
4648 ShiftedVal = (uint64_t)Val >> ShAmt;
4649 if (NVT == MVT::i64 && !isUInt<32>(Val) && isUInt<32>(ShiftedVal))
4650 return true;
4651 }
4652 return false;
4653 };
4654
4655 int64_t ShiftedVal;
4656 if (!CanShrinkImmediate(ShiftedVal))
4657 return false;
4658
4659 // Ok, we can reorder to get a smaller immediate.
4660
4661 // But it's possible the original immediate allowed an AND to become MOVZX.
4662 // Do this late to defer the MaskedValueIsZero call as long as
4663 // possible.
4664 if (Opcode == ISD::AND) {
4665 // Find the smallest zext this could possibly be.
4666 unsigned ZExtWidth = Cst->getAPIntValue().getActiveBits();
4667 ZExtWidth = llvm::bit_ceil(std::max(ZExtWidth, 8U));
4668
4669 // Figure out which bits need to be zero to achieve that mask.
4670 APInt NeededMask = APInt::getLowBitsSet(NVT.getSizeInBits(),
4671 ZExtWidth);
4672 NeededMask &= ~Cst->getAPIntValue();
4673
4674 if (CurDAG->MaskedValueIsZero(N->getOperand(0), NeededMask))
4675 return false;
4676 }
4677
4678 SDValue X = Shift.getOperand(0);
4679 if (FoundAnyExtend) {
4680 SDValue NewX = CurDAG->getNode(ISD::ANY_EXTEND, dl, NVT, X);
4681 insertDAGNode(*CurDAG, SDValue(N, 0), NewX);
4682 X = NewX;
4683 }
4684
4685 SDValue NewCst = CurDAG->getSignedConstant(ShiftedVal, dl, NVT);
4686 insertDAGNode(*CurDAG, SDValue(N, 0), NewCst);
4687 SDValue NewBinOp = CurDAG->getNode(Opcode, dl, NVT, X, NewCst);
4688 insertDAGNode(*CurDAG, SDValue(N, 0), NewBinOp);
4689 SDValue NewSHL = CurDAG->getNode(ISD::SHL, dl, NVT, NewBinOp,
4690 Shift.getOperand(1));
4691 ReplaceNode(N, NewSHL.getNode());
4692 SelectCode(NewSHL.getNode());
4693 return true;
4694}
4695
4696bool X86DAGToDAGISel::matchVPTERNLOG(SDNode *Root, SDNode *ParentA,
4697 SDNode *ParentB, SDNode *ParentC,
4698                                      SDValue A, SDValue B, SDValue C,
4699                                      uint8_t Imm) {
4700 assert(A.isOperandOf(ParentA) && B.isOperandOf(ParentB) &&
4701 C.isOperandOf(ParentC) && "Incorrect parent node");
4702
4703 auto tryFoldLoadOrBCast =
4704 [this](SDNode *Root, SDNode *P, SDValue &L, SDValue &Base, SDValue &Scale,
4705 SDValue &Index, SDValue &Disp, SDValue &Segment) {
4706 if (tryFoldLoad(Root, P, L, Base, Scale, Index, Disp, Segment))
4707 return true;
4708
4709 // Not a load, check for broadcast which may be behind a bitcast.
4710 if (L.getOpcode() == ISD::BITCAST && L.hasOneUse()) {
4711 P = L.getNode();
4712 L = L.getOperand(0);
4713 }
4714
4715 if (L.getOpcode() != X86ISD::VBROADCAST_LOAD)
4716 return false;
4717
4718 // Only 32 and 64 bit broadcasts are supported.
4719 auto *MemIntr = cast<MemIntrinsicSDNode>(L);
4720 unsigned Size = MemIntr->getMemoryVT().getSizeInBits();
4721 if (Size != 32 && Size != 64)
4722 return false;
4723
4724 return tryFoldBroadcast(Root, P, L, Base, Scale, Index, Disp, Segment);
4725 };
4726
4727 bool FoldedLoad = false;
4728 SDValue Tmp0, Tmp1, Tmp2, Tmp3, Tmp4;
4729 if (tryFoldLoadOrBCast(Root, ParentC, C, Tmp0, Tmp1, Tmp2, Tmp3, Tmp4)) {
4730 FoldedLoad = true;
4731 } else if (tryFoldLoadOrBCast(Root, ParentA, A, Tmp0, Tmp1, Tmp2, Tmp3,
4732 Tmp4)) {
4733 FoldedLoad = true;
4734 std::swap(A, C);
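    // The ternlog immediate is a truth table indexed by (A<<2 | B<<1 | C), so
    // exchanging A and C swaps the entries whose indices differ under that
    // permutation: 0b001<->0b100 (bits 1/4) and 0b011<->0b110 (bits 3/6);
    // indices 0, 2, 5 and 7 (mask 0xa5) are fixed. The B/C swap below is
    // analogous.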
4735 // Swap bits 1/4 and 3/6.
4736 uint8_t OldImm = Imm;
4737 Imm = OldImm & 0xa5;
4738 if (OldImm & 0x02) Imm |= 0x10;
4739 if (OldImm & 0x10) Imm |= 0x02;
4740 if (OldImm & 0x08) Imm |= 0x40;
4741 if (OldImm & 0x40) Imm |= 0x08;
4742 } else if (tryFoldLoadOrBCast(Root, ParentB, B, Tmp0, Tmp1, Tmp2, Tmp3,
4743 Tmp4)) {
4744 FoldedLoad = true;
4745 std::swap(B, C);
4746 // Swap bits 1/2 and 5/6.
4747 uint8_t OldImm = Imm;
4748 Imm = OldImm & 0x99;
4749 if (OldImm & 0x02) Imm |= 0x04;
4750 if (OldImm & 0x04) Imm |= 0x02;
4751 if (OldImm & 0x20) Imm |= 0x40;
4752 if (OldImm & 0x40) Imm |= 0x20;
4753 }
4754
4755 SDLoc DL(Root);
4756
4757 SDValue TImm = CurDAG->getTargetConstant(Imm, DL, MVT::i8);
4758
4759 MVT NVT = Root->getSimpleValueType(0);
4760
4761 MachineSDNode *MNode;
4762 if (FoldedLoad) {
4763 SDVTList VTs = CurDAG->getVTList(NVT, MVT::Other);
4764
4765 unsigned Opc;
4766 if (C.getOpcode() == X86ISD::VBROADCAST_LOAD) {
4767 auto *MemIntr = cast<MemIntrinsicSDNode>(C);
4768 unsigned EltSize = MemIntr->getMemoryVT().getSizeInBits();
4769 assert((EltSize == 32 || EltSize == 64) && "Unexpected broadcast size!");
4770
4771 bool UseD = EltSize == 32;
4772 if (NVT.is128BitVector())
4773 Opc = UseD ? X86::VPTERNLOGDZ128rmbi : X86::VPTERNLOGQZ128rmbi;
4774 else if (NVT.is256BitVector())
4775 Opc = UseD ? X86::VPTERNLOGDZ256rmbi : X86::VPTERNLOGQZ256rmbi;
4776 else if (NVT.is512BitVector())
4777 Opc = UseD ? X86::VPTERNLOGDZrmbi : X86::VPTERNLOGQZrmbi;
4778 else
4779 llvm_unreachable("Unexpected vector size!");
4780 } else {
4781 bool UseD = NVT.getVectorElementType() == MVT::i32;
4782 if (NVT.is128BitVector())
4783 Opc = UseD ? X86::VPTERNLOGDZ128rmi : X86::VPTERNLOGQZ128rmi;
4784 else if (NVT.is256BitVector())
4785 Opc = UseD ? X86::VPTERNLOGDZ256rmi : X86::VPTERNLOGQZ256rmi;
4786 else if (NVT.is512BitVector())
4787 Opc = UseD ? X86::VPTERNLOGDZrmi : X86::VPTERNLOGQZrmi;
4788 else
4789 llvm_unreachable("Unexpected vector size!");
4790 }
4791
4792 SDValue Ops[] = {A, B, Tmp0, Tmp1, Tmp2, Tmp3, Tmp4, TImm, C.getOperand(0)};
4793 MNode = CurDAG->getMachineNode(Opc, DL, VTs, Ops);
4794
4795 // Update the chain.
4796 ReplaceUses(C.getValue(1), SDValue(MNode, 1));
4797 // Record the mem-refs
4798 CurDAG->setNodeMemRefs(MNode, {cast<MemSDNode>(C)->getMemOperand()});
4799 } else {
4800 bool UseD = NVT.getVectorElementType() == MVT::i32;
4801 unsigned Opc;
4802 if (NVT.is128BitVector())
4803 Opc = UseD ? X86::VPTERNLOGDZ128rri : X86::VPTERNLOGQZ128rri;
4804 else if (NVT.is256BitVector())
4805 Opc = UseD ? X86::VPTERNLOGDZ256rri : X86::VPTERNLOGQZ256rri;
4806 else if (NVT.is512BitVector())
4807 Opc = UseD ? X86::VPTERNLOGDZrri : X86::VPTERNLOGQZrri;
4808 else
4809 llvm_unreachable("Unexpected vector size!");
4810
4811 MNode = CurDAG->getMachineNode(Opc, DL, NVT, {A, B, C, TImm});
4812 }
4813
4814 ReplaceUses(SDValue(Root, 0), SDValue(MNode, 0));
4815 CurDAG->RemoveDeadNode(Root);
4816 return true;
4817}
4818
4819// Try to match two logic ops to a VPTERNLOG.
4820// FIXME: Handle more complex patterns that use an operand more than once?
4821bool X86DAGToDAGISel::tryVPTERNLOG(SDNode *N) {
4822 MVT NVT = N->getSimpleValueType(0);
4823
4824 // Make sure we support VPTERNLOG.
4825 if (!NVT.isVector() || !Subtarget->hasAVX512() ||
4826 NVT.getVectorElementType() == MVT::i1)
4827 return false;
4828
4829 // We need VLX for 128/256-bit.
4830 if (!(Subtarget->hasVLX() || NVT.is512BitVector()))
4831 return false;
4832
4833 auto getFoldableLogicOp = [](SDValue Op) {
4834 // Peek through single use bitcast.
4835 if (Op.getOpcode() == ISD::BITCAST && Op.hasOneUse())
4836 Op = Op.getOperand(0);
4837
4838 if (!Op.hasOneUse())
4839 return SDValue();
4840
4841 unsigned Opc = Op.getOpcode();
4842 if (Opc == ISD::AND || Opc == ISD::OR || Opc == ISD::XOR ||
4843 Opc == X86ISD::ANDNP)
4844 return Op;
4845
4846 return SDValue();
4847 };
4848
4849 SDValue N0, N1, A, FoldableOp;
4850
4851 // Identify and (optionally) peel an outer NOT that wraps a pure logic tree
4852 auto tryPeelOuterNotWrappingLogic = [&](SDNode *Op) {
4853 if (Op->getOpcode() == ISD::XOR && Op->hasOneUse() &&
4854 ISD::isBuildVectorAllOnes(Op->getOperand(1).getNode())) {
4855 SDValue InnerOp = getFoldableLogicOp(Op->getOperand(0));
4856
4857 if (!InnerOp)
4858 return SDValue();
4859
4860 N0 = InnerOp.getOperand(0);
4861 N1 = InnerOp.getOperand(1);
4862 if ((FoldableOp = getFoldableLogicOp(N1))) {
4863 A = N0;
4864 return InnerOp;
4865 }
4866 if ((FoldableOp = getFoldableLogicOp(N0))) {
4867 A = N1;
4868 return InnerOp;
4869 }
4870 }
4871 return SDValue();
4872 };
4873
4874 bool PeeledOuterNot = false;
4875 SDNode *OriN = N;
4876 if (SDValue InnerOp = tryPeelOuterNotWrappingLogic(N)) {
4877 PeeledOuterNot = true;
4878 N = InnerOp.getNode();
4879 } else {
4880 N0 = N->getOperand(0);
4881 N1 = N->getOperand(1);
4882
4883 if ((FoldableOp = getFoldableLogicOp(N1)))
4884 A = N0;
4885 else if ((FoldableOp = getFoldableLogicOp(N0)))
4886 A = N1;
4887 else
4888 return false;
4889 }
4890
4891 SDValue B = FoldableOp.getOperand(0);
4892 SDValue C = FoldableOp.getOperand(1);
4893 SDNode *ParentA = N;
4894 SDNode *ParentB = FoldableOp.getNode();
4895 SDNode *ParentC = FoldableOp.getNode();
4896
4897 // We can build the appropriate control immediate by performing the logic
4898 // operation we're matching using these constants for A, B, and C.
4899 uint8_t TernlogMagicA = 0xf0;
4900 uint8_t TernlogMagicB = 0xcc;
4901 uint8_t TernlogMagicC = 0xaa;
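  // These constants are the truth tables of the projections A, B and C
  // themselves: bit k of 0xf0/0xcc/0xaa is set iff bit 2/1/0 of the index k
  // is set. Evaluating the matched expression on them therefore yields the
  // immediate directly; e.g. A & (B | C) gives 0xf0 & (0xcc | 0xaa) = 0xe0.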
4902
4903 // Some of the inputs may be inverted, peek through them and invert the
4904 // magic values accordingly.
4905 // TODO: There may be a bitcast before the xor that we should peek through.
4906 auto PeekThroughNot = [](SDValue &Op, SDNode *&Parent, uint8_t &Magic) {
4907 if (Op.getOpcode() == ISD::XOR && Op.hasOneUse() &&
4908 ISD::isBuildVectorAllOnes(Op.getOperand(1).getNode())) {
4909 Magic = ~Magic;
4910 Parent = Op.getNode();
4911 Op = Op.getOperand(0);
4912 }
4913 };
4914
4915 PeekThroughNot(A, ParentA, TernlogMagicA);
4916 PeekThroughNot(B, ParentB, TernlogMagicB);
4917 PeekThroughNot(C, ParentC, TernlogMagicC);
4918
4919 uint8_t Imm;
4920 switch (FoldableOp.getOpcode()) {
4921 default: llvm_unreachable("Unexpected opcode!");
4922 case ISD::AND: Imm = TernlogMagicB & TernlogMagicC; break;
4923 case ISD::OR: Imm = TernlogMagicB | TernlogMagicC; break;
4924 case ISD::XOR: Imm = TernlogMagicB ^ TernlogMagicC; break;
4925 case X86ISD::ANDNP: Imm = ~(TernlogMagicB) & TernlogMagicC; break;
4926 }
4927
4928 switch (N->getOpcode()) {
4929 default: llvm_unreachable("Unexpected opcode!");
4930 case X86ISD::ANDNP:
4931 if (A == N0)
4932 Imm &= ~TernlogMagicA;
4933 else
4934 Imm = ~(Imm) & TernlogMagicA;
4935 break;
4936 case ISD::AND: Imm &= TernlogMagicA; break;
4937 case ISD::OR: Imm |= TernlogMagicA; break;
4938 case ISD::XOR: Imm ^= TernlogMagicA; break;
4939 }
4940
4941 if (PeeledOuterNot)
4942 Imm = ~Imm;
4943
4944 return matchVPTERNLOG(OriN, ParentA, ParentB, ParentC, A, B, C, Imm);
4945}
4946
4947/// If the high bits of an 'and' operand are known zero, try setting the
4948/// high bits of an 'and' constant operand to produce a smaller encoding by
4949/// creating a small, sign-extended negative immediate rather than a large
4950/// positive one. This reverses a transform in SimplifyDemandedBits that
4951/// shrinks mask constants by clearing bits. There is also a possibility that
4952/// the 'and' mask can be made -1, so the 'and' itself is unnecessary. In that
4953/// case, just replace the 'and'. Return 'true' if the node is replaced.
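/// For instance, if the operand's top 16 bits are known zero, an i64 mask of
/// 0x0000FFFFFFFFFFF0 (which would need a movabsq) can become
/// 0xFFFFFFFFFFFFFFF0 (-16), which fits in a sign-extended 8-bit immediate.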
4954bool X86DAGToDAGISel::shrinkAndImmediate(SDNode *And) {
4955 // i8 is unshrinkable, i16 should be promoted to i32, and vector ops don't
4956 // have immediate operands.
4957 MVT VT = And->getSimpleValueType(0);
4958 if (VT != MVT::i32 && VT != MVT::i64)
4959 return false;
4960
4961 auto *And1C = dyn_cast<ConstantSDNode>(And->getOperand(1));
4962 if (!And1C)
4963 return false;
4964
4965   // Bail out if the mask constant is already negative; it can't shrink any
4966   // further. If the upper 32 bits of a 64-bit mask are all zeros, we have
4967   // special isel patterns that use a 32-bit AND instead of a 64-bit AND by
4968   // relying on the implicit zeroing of 32-bit ops, so we must also check
4969   // whether the lower 32 bits are negative.
4970 APInt MaskVal = And1C->getAPIntValue();
4971 unsigned MaskLZ = MaskVal.countl_zero();
4972 if (!MaskLZ || (VT == MVT::i64 && MaskLZ == 32))
4973 return false;
4974
4975 // Don't extend into the upper 32 bits of a 64 bit mask.
4976 if (VT == MVT::i64 && MaskLZ >= 32) {
4977 MaskLZ -= 32;
4978 MaskVal = MaskVal.trunc(32);
4979 }
4980
4981 SDValue And0 = And->getOperand(0);
4982 APInt HighZeros = APInt::getHighBitsSet(MaskVal.getBitWidth(), MaskLZ);
4983 APInt NegMaskVal = MaskVal | HighZeros;
4984
4985 // If a negative constant would not allow a smaller encoding, there's no need
4986 // to continue. Only change the constant when we know it's a win.
4987 unsigned MinWidth = NegMaskVal.getSignificantBits();
4988 if (MinWidth > 32 || (MinWidth > 8 && MaskVal.getSignificantBits() <= 32))
4989 return false;
4990
4991 // Extend masks if we truncated above.
4992 if (VT == MVT::i64 && MaskVal.getBitWidth() < 64) {
4993 NegMaskVal = NegMaskVal.zext(64);
4994 HighZeros = HighZeros.zext(64);
4995 }
4996
4997 // The variable operand must be all zeros in the top bits to allow using the
4998 // new, negative constant as the mask.
4999 // TODO: Handle constant folding?
5000 KnownBits Known0 = CurDAG->computeKnownBits(And0);
5001 if (Known0.isConstant() || !HighZeros.isSubsetOf(Known0.Zero))
5002 return false;
5003
5004 // Check if the mask is -1. In that case, this is an unnecessary instruction
5005 // that escaped earlier analysis.
5006 if (NegMaskVal.isAllOnes()) {
5007 ReplaceNode(And, And0.getNode());
5008 return true;
5009 }
5010
5011 // A negative mask allows a smaller encoding. Create a new 'and' node.
5012 SDValue NewMask = CurDAG->getConstant(NegMaskVal, SDLoc(And), VT);
5013 insertDAGNode(*CurDAG, SDValue(And, 0), NewMask);
5014 SDValue NewAnd = CurDAG->getNode(ISD::AND, SDLoc(And), VT, And0, NewMask);
5015 ReplaceNode(And, NewAnd.getNode());
5016 SelectCode(NewAnd.getNode());
5017 return true;
5018}
5019
5020static unsigned getVPTESTMOpc(MVT TestVT, bool IsTestN, bool FoldedLoad,
5021 bool FoldedBCast, bool Masked) {
5022#define VPTESTM_CASE(VT, SUFFIX) \
5023case MVT::VT: \
5024 if (Masked) \
5025 return IsTestN ? X86::VPTESTNM##SUFFIX##k: X86::VPTESTM##SUFFIX##k; \
5026 return IsTestN ? X86::VPTESTNM##SUFFIX : X86::VPTESTM##SUFFIX;
5027
5028
5029#define VPTESTM_BROADCAST_CASES(SUFFIX) \
5030default: llvm_unreachable("Unexpected VT!"); \
5031VPTESTM_CASE(v4i32, DZ128##SUFFIX) \
5032VPTESTM_CASE(v2i64, QZ128##SUFFIX) \
5033VPTESTM_CASE(v8i32, DZ256##SUFFIX) \
5034VPTESTM_CASE(v4i64, QZ256##SUFFIX) \
5035VPTESTM_CASE(v16i32, DZ##SUFFIX) \
5036VPTESTM_CASE(v8i64, QZ##SUFFIX)
5037
5038#define VPTESTM_FULL_CASES(SUFFIX) \
5039VPTESTM_BROADCAST_CASES(SUFFIX) \
5040VPTESTM_CASE(v16i8, BZ128##SUFFIX) \
5041VPTESTM_CASE(v8i16, WZ128##SUFFIX) \
5042VPTESTM_CASE(v32i8, BZ256##SUFFIX) \
5043VPTESTM_CASE(v16i16, WZ256##SUFFIX) \
5044VPTESTM_CASE(v64i8, BZ##SUFFIX) \
5045VPTESTM_CASE(v32i16, WZ##SUFFIX)
5046
5047 if (FoldedBCast) {
5048 switch (TestVT.SimpleTy) {
5049     VPTESTM_BROADCAST_CASES(rmb)
5050     }
5051 }
5052
5053 if (FoldedLoad) {
5054 switch (TestVT.SimpleTy) {
5055     VPTESTM_FULL_CASES(rm)
5056     }
5057 }
5058
5059 switch (TestVT.SimpleTy) {
5060   VPTESTM_FULL_CASES(rr)
5061   }
5062
5063#undef VPTESTM_FULL_CASES
5064#undef VPTESTM_BROADCAST_CASES
5065#undef VPTESTM_CASE
5066}
5067
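// Reorder the multiply operands so that an operand already copied from the
// implicit low register (AL/AX/EAX/RAX, or RDX for MULX) lands in N0, which
// the callers below copy into LoReg; this avoids an extra
// register-to-register move.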
5068static void orderRegForMul(SDValue &N0, SDValue &N1, const unsigned LoReg,
5069 const MachineRegisterInfo &MRI) {
5070 auto GetPhysReg = [&](SDValue V) -> Register {
5071 if (V.getOpcode() != ISD::CopyFromReg)
5072 return Register();
5073 Register Reg = cast<RegisterSDNode>(V.getOperand(1))->getReg();
5074 if (Reg.isVirtual())
5075 return MRI.getLiveInPhysReg(Reg);
5076 return Reg;
5077 };
5078
5079 if (GetPhysReg(N1) == LoReg && GetPhysReg(N0) != LoReg)
5080 std::swap(N0, N1);
5081}
5082
5083// Try to create VPTESTM instruction. If InMask is not null, it will be used
5084// to form a masked operation.
5085bool X86DAGToDAGISel::tryVPTESTM(SDNode *Root, SDValue Setcc,
5086 SDValue InMask) {
5087 assert(Subtarget->hasAVX512() && "Expected AVX512!");
5088 assert(Setcc.getSimpleValueType().getVectorElementType() == MVT::i1 &&
5089 "Unexpected VT!");
5090
5091 // Look for equal and not equal compares.
5092 ISD::CondCode CC = cast<CondCodeSDNode>(Setcc.getOperand(2))->get();
5093 if (CC != ISD::SETEQ && CC != ISD::SETNE)
5094 return false;
5095
5096 SDValue SetccOp0 = Setcc.getOperand(0);
5097 SDValue SetccOp1 = Setcc.getOperand(1);
5098
5099 // Canonicalize the all zero vector to the RHS.
5100 if (ISD::isBuildVectorAllZeros(SetccOp0.getNode()))
5101 std::swap(SetccOp0, SetccOp1);
5102
5103 // See if we're comparing against zero.
5104 if (!ISD::isBuildVectorAllZeros(SetccOp1.getNode()))
5105 return false;
5106
5107 SDValue N0 = SetccOp0;
5108
5109 MVT CmpVT = N0.getSimpleValueType();
5110 MVT CmpSVT = CmpVT.getVectorElementType();
5111
5112 // Start with both operands the same. We'll try to refine this.
5113 SDValue Src0 = N0;
5114 SDValue Src1 = N0;
5115
5116 {
5117 // Look through single use bitcasts.
5118 SDValue N0Temp = N0;
5119 if (N0Temp.getOpcode() == ISD::BITCAST && N0Temp.hasOneUse())
5120 N0Temp = N0.getOperand(0);
5121
5122 // Look for single use AND.
5123 if (N0Temp.getOpcode() == ISD::AND && N0Temp.hasOneUse()) {
5124 Src0 = N0Temp.getOperand(0);
5125 Src1 = N0Temp.getOperand(1);
5126 }
5127 }
5128
5129 // Without VLX we need to widen the operation.
5130 bool Widen = !Subtarget->hasVLX() && !CmpVT.is512BitVector();
5131
5132 auto tryFoldLoadOrBCast = [&](SDNode *Root, SDNode *P, SDValue &L,
5133 SDValue &Base, SDValue &Scale, SDValue &Index,
5134 SDValue &Disp, SDValue &Segment) {
5135 // If we need to widen, we can't fold the load.
5136 if (!Widen)
5137 if (tryFoldLoad(Root, P, L, Base, Scale, Index, Disp, Segment))
5138 return true;
5139
5140 // If we didn't fold a load, try to match broadcast. No widening limitation
5141 // for this. But only 32 and 64 bit types are supported.
5142 if (CmpSVT != MVT::i32 && CmpSVT != MVT::i64)
5143 return false;
5144
5145 // Look through single use bitcasts.
5146 if (L.getOpcode() == ISD::BITCAST && L.hasOneUse()) {
5147 P = L.getNode();
5148 L = L.getOperand(0);
5149 }
5150
5151 if (L.getOpcode() != X86ISD::VBROADCAST_LOAD)
5152 return false;
5153
5154 auto *MemIntr = cast<MemIntrinsicSDNode>(L);
5155 if (MemIntr->getMemoryVT().getSizeInBits() != CmpSVT.getSizeInBits())
5156 return false;
5157
5158 return tryFoldBroadcast(Root, P, L, Base, Scale, Index, Disp, Segment);
5159 };
5160
5161 // We can only fold loads if the sources are unique.
5162 bool CanFoldLoads = Src0 != Src1;
5163
5164 bool FoldedLoad = false;
5165 SDValue Tmp0, Tmp1, Tmp2, Tmp3, Tmp4;
5166 if (CanFoldLoads) {
5167 FoldedLoad = tryFoldLoadOrBCast(Root, N0.getNode(), Src1, Tmp0, Tmp1, Tmp2,
5168 Tmp3, Tmp4);
5169 if (!FoldedLoad) {
5170 // And is commutative.
5171 FoldedLoad = tryFoldLoadOrBCast(Root, N0.getNode(), Src0, Tmp0, Tmp1,
5172 Tmp2, Tmp3, Tmp4);
5173 if (FoldedLoad)
5174 std::swap(Src0, Src1);
5175 }
5176 }
5177
5178 bool FoldedBCast = FoldedLoad && Src1.getOpcode() == X86ISD::VBROADCAST_LOAD;
5179
5180 bool IsMasked = InMask.getNode() != nullptr;
5181
5182 SDLoc dl(Root);
5183
5184 MVT ResVT = Setcc.getSimpleValueType();
5185 MVT MaskVT = ResVT;
5186 if (Widen) {
5187 // Widen the inputs using insert_subreg or copy_to_regclass.
5188 unsigned Scale = CmpVT.is128BitVector() ? 4 : 2;
5189 unsigned SubReg = CmpVT.is128BitVector() ? X86::sub_xmm : X86::sub_ymm;
5190 unsigned NumElts = CmpVT.getVectorNumElements() * Scale;
5191 CmpVT = MVT::getVectorVT(CmpSVT, NumElts);
5192 MaskVT = MVT::getVectorVT(MVT::i1, NumElts);
5193 SDValue ImplDef = SDValue(CurDAG->getMachineNode(X86::IMPLICIT_DEF, dl,
5194 CmpVT), 0);
5195 Src0 = CurDAG->getTargetInsertSubreg(SubReg, dl, CmpVT, ImplDef, Src0);
5196
5197 if (!FoldedBCast)
5198 Src1 = CurDAG->getTargetInsertSubreg(SubReg, dl, CmpVT, ImplDef, Src1);
5199
5200 if (IsMasked) {
5201 // Widen the mask.
5202 unsigned RegClass = TLI->getRegClassFor(MaskVT)->getID();
5203 SDValue RC = CurDAG->getTargetConstant(RegClass, dl, MVT::i32);
5204 InMask = SDValue(CurDAG->getMachineNode(TargetOpcode::COPY_TO_REGCLASS,
5205 dl, MaskVT, InMask, RC), 0);
5206 }
5207 }
5208
5209 bool IsTestN = CC == ISD::SETEQ;
5210 unsigned Opc = getVPTESTMOpc(CmpVT, IsTestN, FoldedLoad, FoldedBCast,
5211 IsMasked);
5212
5213 MachineSDNode *CNode;
5214 if (FoldedLoad) {
5215 SDVTList VTs = CurDAG->getVTList(MaskVT, MVT::Other);
5216
5217 if (IsMasked) {
5218 SDValue Ops[] = { InMask, Src0, Tmp0, Tmp1, Tmp2, Tmp3, Tmp4,
5219 Src1.getOperand(0) };
5220 CNode = CurDAG->getMachineNode(Opc, dl, VTs, Ops);
5221 } else {
5222 SDValue Ops[] = { Src0, Tmp0, Tmp1, Tmp2, Tmp3, Tmp4,
5223 Src1.getOperand(0) };
5224 CNode = CurDAG->getMachineNode(Opc, dl, VTs, Ops);
5225 }
5226
5227 // Update the chain.
5228 ReplaceUses(Src1.getValue(1), SDValue(CNode, 1));
5229 // Record the mem-refs
5230 CurDAG->setNodeMemRefs(CNode, {cast<MemSDNode>(Src1)->getMemOperand()});
5231 } else {
5232 if (IsMasked)
5233 CNode = CurDAG->getMachineNode(Opc, dl, MaskVT, InMask, Src0, Src1);
5234 else
5235 CNode = CurDAG->getMachineNode(Opc, dl, MaskVT, Src0, Src1);
5236 }
5237
5238 // If we widened, we need to shrink the mask VT.
5239 if (Widen) {
5240 unsigned RegClass = TLI->getRegClassFor(ResVT)->getID();
5241 SDValue RC = CurDAG->getTargetConstant(RegClass, dl, MVT::i32);
5242 CNode = CurDAG->getMachineNode(TargetOpcode::COPY_TO_REGCLASS,
5243 dl, ResVT, SDValue(CNode, 0), RC);
5244 }
5245
5246 ReplaceUses(SDValue(Root, 0), SDValue(CNode, 0));
5247 CurDAG->RemoveDeadNode(Root);
5248 return true;
5249}
5250
5251// Try to match the bitselect pattern (or (and A, B), (andn A, C)). Turn it
5252// into vpternlog.
5253bool X86DAGToDAGISel::tryMatchBitSelect(SDNode *N) {
5254 assert(N->getOpcode() == ISD::OR && "Unexpected opcode!");
5255
5256 MVT NVT = N->getSimpleValueType(0);
5257
5258 // Make sure we support VPTERNLOG.
5259 if (!NVT.isVector() || !Subtarget->hasAVX512())
5260 return false;
5261
5262 // We need VLX for 128/256-bit.
5263 if (!(Subtarget->hasVLX() || NVT.is512BitVector()))
5264 return false;
5265
5266 SDValue N0 = N->getOperand(0);
5267 SDValue N1 = N->getOperand(1);
5268
5269 // Canonicalize AND to LHS.
5270 if (N1.getOpcode() == ISD::AND)
5271 std::swap(N0, N1);
5272
5273 if (N0.getOpcode() != ISD::AND ||
5274 N1.getOpcode() != X86ISD::ANDNP ||
5275 !N0.hasOneUse() || !N1.hasOneUse())
5276 return false;
5277
5278   // ANDN is not commutative, so use it to pin down A and C.
5279 SDValue A = N1.getOperand(0);
5280 SDValue C = N1.getOperand(1);
5281
5282 // AND is commutable, if one operand matches A, the other operand is B.
5283 // Otherwise this isn't a match.
5284 SDValue B;
5285 if (N0.getOperand(0) == A)
5286 B = N0.getOperand(1);
5287 else if (N0.getOperand(1) == A)
5288 B = N0.getOperand(0);
5289 else
5290 return false;
5291
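  // 0xCA is the ternlog truth table for "A ? B : C": with the immediate bit
  // index formed as (A<<2 | B<<1 | C), the result is 1 at indices 7,6
  // (A and B set) and 3,1 (A clear, C set), i.e. 0b11001010.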
5292 SDLoc dl(N);
5293 SDValue Imm = CurDAG->getTargetConstant(0xCA, dl, MVT::i8);
5294 SDValue Ternlog = CurDAG->getNode(X86ISD::VPTERNLOG, dl, NVT, A, B, C, Imm);
5295 ReplaceNode(N, Ternlog.getNode());
5296
5297 return matchVPTERNLOG(Ternlog.getNode(), Ternlog.getNode(), Ternlog.getNode(),
5298 Ternlog.getNode(), A, B, C, 0xCA);
5299}
5300
5301void X86DAGToDAGISel::Select(SDNode *Node) {
5302 MVT NVT = Node->getSimpleValueType(0);
5303 unsigned Opcode = Node->getOpcode();
5304 SDLoc dl(Node);
5305
5306 if (Node->isMachineOpcode()) {
5307 LLVM_DEBUG(dbgs() << "== "; Node->dump(CurDAG); dbgs() << '\n');
5308 Node->setNodeId(-1);
5309 return; // Already selected.
5310 }
5311
5312 switch (Opcode) {
5313 default: break;
5314   case ISD::INTRINSIC_W_CHAIN: {
5315     unsigned IntNo = Node->getConstantOperandVal(1);
5316 switch (IntNo) {
5317 default: break;
5318 case Intrinsic::x86_encodekey128:
5319 case Intrinsic::x86_encodekey256: {
5320 if (!Subtarget->hasKL())
5321 break;
5322
5323 unsigned Opcode;
5324 switch (IntNo) {
5325 default: llvm_unreachable("Impossible intrinsic");
5326 case Intrinsic::x86_encodekey128:
5327 Opcode = X86::ENCODEKEY128;
5328 break;
5329 case Intrinsic::x86_encodekey256:
5330 Opcode = X86::ENCODEKEY256;
5331 break;
5332 }
5333
5334 SDValue Chain = Node->getOperand(0);
5335 Chain = CurDAG->getCopyToReg(Chain, dl, X86::XMM0, Node->getOperand(3),
5336 SDValue());
5337 if (Opcode == X86::ENCODEKEY256)
5338 Chain = CurDAG->getCopyToReg(Chain, dl, X86::XMM1, Node->getOperand(4),
5339 Chain.getValue(1));
5340
5341 MachineSDNode *Res = CurDAG->getMachineNode(
5342 Opcode, dl, Node->getVTList(),
5343 {Node->getOperand(2), Chain, Chain.getValue(1)});
5344 ReplaceNode(Node, Res);
5345 return;
5346 }
5347 case Intrinsic::x86_tileloaddrs64_internal:
5348 case Intrinsic::x86_tileloaddrst164_internal:
5349 if (!Subtarget->hasAMXMOVRS())
5350 break;
5351 [[fallthrough]];
5352 case Intrinsic::x86_tileloadd64_internal:
5353 case Intrinsic::x86_tileloaddt164_internal: {
5354 if (!Subtarget->hasAMXTILE())
5355 break;
5356 auto *MFI =
5357 CurDAG->getMachineFunction().getInfo<X86MachineFunctionInfo>();
5358 MFI->setAMXProgModel(AMXProgModelEnum::ManagedRA);
5359 unsigned Opc;
5360 switch (IntNo) {
5361 default:
5362 llvm_unreachable("Unexpected intrinsic!");
5363 case Intrinsic::x86_tileloaddrs64_internal:
5364 Opc = X86::PTILELOADDRSV;
5365 break;
5366 case Intrinsic::x86_tileloaddrst164_internal:
5367 Opc = X86::PTILELOADDRST1V;
5368 break;
5369 case Intrinsic::x86_tileloadd64_internal:
5370 Opc = X86::PTILELOADDV;
5371 break;
5372 case Intrinsic::x86_tileloaddt164_internal:
5373 Opc = X86::PTILELOADDT1V;
5374 break;
5375 }
5376 // _tile_loadd_internal(row, col, buf, STRIDE)
5377 SDValue Base = Node->getOperand(4);
5378 SDValue Scale = getI8Imm(1, dl);
5379 SDValue Index = Node->getOperand(5);
5380 SDValue Disp = CurDAG->getTargetConstant(0, dl, MVT::i32);
5381 SDValue Segment = CurDAG->getRegister(0, MVT::i16);
5382 SDValue Chain = Node->getOperand(0);
5383 MachineSDNode *CNode;
5384 SDValue Ops[] = {Node->getOperand(2),
5385 Node->getOperand(3),
5386 Base,
5387 Scale,
5388 Index,
5389 Disp,
5390 Segment,
5391 Chain};
5392 CNode = CurDAG->getMachineNode(Opc, dl, {MVT::x86amx, MVT::Other}, Ops);
5393 ReplaceNode(Node, CNode);
5394 return;
5395 }
5396 }
5397 break;
5398 }
5399 case ISD::INTRINSIC_VOID: {
5400 unsigned IntNo = Node->getConstantOperandVal(1);
5401 switch (IntNo) {
5402 default: break;
5403 case Intrinsic::x86_sse3_monitor:
5404 case Intrinsic::x86_monitorx:
5405 case Intrinsic::x86_clzero: {
5406 bool Use64BitPtr = Node->getOperand(2).getValueType() == MVT::i64;
5407
5408 unsigned Opc = 0;
5409 switch (IntNo) {
5410 default: llvm_unreachable("Unexpected intrinsic!");
5411 case Intrinsic::x86_sse3_monitor:
5412 if (!Subtarget->hasSSE3())
5413 break;
5414 Opc = Use64BitPtr ? X86::MONITOR64rrr : X86::MONITOR32rrr;
5415 break;
5416 case Intrinsic::x86_monitorx:
5417 if (!Subtarget->hasMWAITX())
5418 break;
5419 Opc = Use64BitPtr ? X86::MONITORX64rrr : X86::MONITORX32rrr;
5420 break;
5421 case Intrinsic::x86_clzero:
5422 if (!Subtarget->hasCLZERO())
5423 break;
5424 Opc = Use64BitPtr ? X86::CLZERO64r : X86::CLZERO32r;
5425 break;
5426 }
5427
5428 if (Opc) {
5429 unsigned PtrReg = Use64BitPtr ? X86::RAX : X86::EAX;
5430 SDValue Chain = CurDAG->getCopyToReg(Node->getOperand(0), dl, PtrReg,
5431 Node->getOperand(2), SDValue());
5432 SDValue InGlue = Chain.getValue(1);
5433
5434 if (IntNo == Intrinsic::x86_sse3_monitor ||
5435 IntNo == Intrinsic::x86_monitorx) {
5436 // Copy the other two operands to ECX and EDX.
5437 Chain = CurDAG->getCopyToReg(Chain, dl, X86::ECX, Node->getOperand(3),
5438 InGlue);
5439 InGlue = Chain.getValue(1);
5440 Chain = CurDAG->getCopyToReg(Chain, dl, X86::EDX, Node->getOperand(4),
5441 InGlue);
5442 InGlue = Chain.getValue(1);
5443 }
5444
5445 MachineSDNode *CNode = CurDAG->getMachineNode(Opc, dl, MVT::Other,
5446 { Chain, InGlue});
5447 ReplaceNode(Node, CNode);
5448 return;
5449 }
5450
5451 break;
5452 }
5453 case Intrinsic::x86_tilestored64_internal: {
5454 auto *MFI =
5455 CurDAG->getMachineFunction().getInfo<X86MachineFunctionInfo>();
5456 MFI->setAMXProgModel(AMXProgModelEnum::ManagedRA);
5457 unsigned Opc = X86::PTILESTOREDV;
5458 // _tile_stored_internal(row, col, buf, STRIDE, c)
5459 SDValue Base = Node->getOperand(4);
5460 SDValue Scale = getI8Imm(1, dl);
5461 SDValue Index = Node->getOperand(5);
5462 SDValue Disp = CurDAG->getTargetConstant(0, dl, MVT::i32);
5463 SDValue Segment = CurDAG->getRegister(0, MVT::i16);
5464 SDValue Chain = Node->getOperand(0);
5465 MachineSDNode *CNode;
5466 SDValue Ops[] = {Node->getOperand(2),
5467 Node->getOperand(3),
5468 Base,
5469 Scale,
5470 Index,
5471 Disp,
5472 Segment,
5473 Node->getOperand(6),
5474 Chain};
5475 CNode = CurDAG->getMachineNode(Opc, dl, MVT::Other, Ops);
5476 ReplaceNode(Node, CNode);
5477 return;
5478 }
5479 case Intrinsic::x86_tileloaddrs64:
5480 case Intrinsic::x86_tileloaddrst164:
5481 if (!Subtarget->hasAMXMOVRS())
5482 break;
5483 [[fallthrough]];
5484 case Intrinsic::x86_tileloadd64:
5485 case Intrinsic::x86_tileloaddt164:
5486 case Intrinsic::x86_tilestored64: {
5487 if (!Subtarget->hasAMXTILE())
5488 break;
5489 auto *MFI =
5490 CurDAG->getMachineFunction().getInfo<X86MachineFunctionInfo>();
5491 MFI->setAMXProgModel(AMXProgModelEnum::DirectReg);
5492 unsigned Opc;
5493 switch (IntNo) {
5494 default: llvm_unreachable("Unexpected intrinsic!");
5495 case Intrinsic::x86_tileloadd64: Opc = X86::PTILELOADD; break;
5496 case Intrinsic::x86_tileloaddrs64:
5497 Opc = X86::PTILELOADDRS;
5498 break;
5499 case Intrinsic::x86_tileloaddt164: Opc = X86::PTILELOADDT1; break;
5500 case Intrinsic::x86_tileloaddrst164:
5501 Opc = X86::PTILELOADDRST1;
5502 break;
5503 case Intrinsic::x86_tilestored64: Opc = X86::PTILESTORED; break;
5504 }
5505 // FIXME: Match displacement and scale.
5506 unsigned TIndex = Node->getConstantOperandVal(2);
5507 SDValue TReg = getI8Imm(TIndex, dl);
5508 SDValue Base = Node->getOperand(3);
5509 SDValue Scale = getI8Imm(1, dl);
5510 SDValue Index = Node->getOperand(4);
5511 SDValue Disp = CurDAG->getTargetConstant(0, dl, MVT::i32);
5512 SDValue Segment = CurDAG->getRegister(0, MVT::i16);
5513 SDValue Chain = Node->getOperand(0);
5514 MachineSDNode *CNode;
5515 if (Opc == X86::PTILESTORED) {
5516 SDValue Ops[] = { Base, Scale, Index, Disp, Segment, TReg, Chain };
5517 CNode = CurDAG->getMachineNode(Opc, dl, MVT::Other, Ops);
5518 } else {
5519 SDValue Ops[] = { TReg, Base, Scale, Index, Disp, Segment, Chain };
5520 CNode = CurDAG->getMachineNode(Opc, dl, MVT::Other, Ops);
5521 }
5522 ReplaceNode(Node, CNode);
5523 return;
5524 }
5525 }
5526 break;
5527 }
5528 case ISD::BRIND:
5529 case X86ISD::NT_BRIND: {
5530 if (Subtarget->isTarget64BitILP32()) {
5531 // Converts a 32-bit register to a 64-bit, zero-extended version of
5532 // it. This is needed because x86-64 can do many things, but jmp %r32
5533 // ain't one of them.
5534 SDValue Target = Node->getOperand(1);
5535 assert(Target.getValueType() == MVT::i32 && "Unexpected VT!");
5536 SDValue ZextTarget = CurDAG->getZExtOrTrunc(Target, dl, MVT::i64);
5537 SDValue Brind = CurDAG->getNode(Opcode, dl, MVT::Other,
5538 Node->getOperand(0), ZextTarget);
5539 ReplaceNode(Node, Brind.getNode());
5540 SelectCode(ZextTarget.getNode());
5541 SelectCode(Brind.getNode());
5542 return;
5543 }
5544 break;
5545 }
5546   case X86ISD::GlobalBaseReg:
5547     ReplaceNode(Node, getGlobalBaseReg());
5548 return;
5549
5550 case ISD::BITCAST:
5551 // Just drop all 128/256/512-bit bitcasts.
5552 if (NVT.is512BitVector() || NVT.is256BitVector() || NVT.is128BitVector() ||
5553 NVT == MVT::f128) {
5554 ReplaceUses(SDValue(Node, 0), Node->getOperand(0));
5555 CurDAG->RemoveDeadNode(Node);
5556 return;
5557 }
5558 break;
5559
5560 case ISD::SRL:
5561 if (matchBitExtract(Node))
5562 return;
5563 [[fallthrough]];
5564 case ISD::SRA:
5565 case ISD::SHL:
5566 if (tryShiftAmountMod(Node))
5567 return;
5568 break;
5569
5570 case X86ISD::VPTERNLOG: {
5571 uint8_t Imm = Node->getConstantOperandVal(3);
5572 if (matchVPTERNLOG(Node, Node, Node, Node, Node->getOperand(0),
5573 Node->getOperand(1), Node->getOperand(2), Imm))
5574 return;
5575 break;
5576 }
5577
5578 case X86ISD::ANDNP:
5579 if (tryVPTERNLOG(Node))
5580 return;
5581 break;
5582
5583 case ISD::AND:
5584 if (NVT.isVector() && NVT.getVectorElementType() == MVT::i1) {
5585 // Try to form a masked VPTESTM. Operands can be in either order.
5586 SDValue N0 = Node->getOperand(0);
5587 SDValue N1 = Node->getOperand(1);
5588 if (N0.getOpcode() == ISD::SETCC && N0.hasOneUse() &&
5589 tryVPTESTM(Node, N0, N1))
5590 return;
5591 if (N1.getOpcode() == ISD::SETCC && N1.hasOneUse() &&
5592 tryVPTESTM(Node, N1, N0))
5593 return;
5594 }
5595
5596 if (MachineSDNode *NewNode = matchBEXTRFromAndImm(Node)) {
5597 ReplaceUses(SDValue(Node, 0), SDValue(NewNode, 0));
5598 CurDAG->RemoveDeadNode(Node);
5599 return;
5600 }
5601 if (matchBitExtract(Node))
5602 return;
5603 if (AndImmShrink && shrinkAndImmediate(Node))
5604 return;
5605
5606 [[fallthrough]];
5607 case ISD::OR:
5608 case ISD::XOR:
5609 if (tryShrinkShlLogicImm(Node))
5610 return;
5611 if (Opcode == ISD::OR && tryMatchBitSelect(Node))
5612 return;
5613 if (tryVPTERNLOG(Node))
5614 return;
5615
5616 [[fallthrough]];
5617 case ISD::ADD:
5618 if (Opcode == ISD::ADD && matchBitExtract(Node))
5619 return;
5620 [[fallthrough]];
5621 case ISD::SUB: {
5622     // Try to avoid folding immediates with multiple uses when optimizing for
5623     // size. This code selects the register form directly to avoid going
5624     // through the isel table, which might fold the immediate. We can't change
5625     // the add/sub/and/or/xor-with-immediate patterns in the tablegen files to
5626     // check the immediate use count without making the patterns unavailable to
5627     // the fast-isel table.
5628 if (!CurDAG->shouldOptForSize())
5629 break;
5630
5631 // Only handle i8/i16/i32/i64.
5632 if (NVT != MVT::i8 && NVT != MVT::i16 && NVT != MVT::i32 && NVT != MVT::i64)
5633 break;
5634
5635 SDValue N0 = Node->getOperand(0);
5636 SDValue N1 = Node->getOperand(1);
5637
5638 auto *Cst = dyn_cast<ConstantSDNode>(N1);
5639 if (!Cst)
5640 break;
5641
5642 int64_t Val = Cst->getSExtValue();
5643
5644     // Make sure it's an immediate that is considered foldable.
5645 // FIXME: Handle unsigned 32 bit immediates for 64-bit AND.
5646 if (!isInt<8>(Val) && !isInt<32>(Val))
5647 break;
5648
5649 // If this can match to INC/DEC, let it go.
5650 if (Opcode == ISD::ADD && (Val == 1 || Val == -1))
5651 break;
5652
5653 // Check if we should avoid folding this immediate.
5654 if (!shouldAvoidImmediateInstFormsForSize(N1.getNode()))
5655 break;
5656
5657 // We should not fold the immediate. So we need a register form instead.
5658 unsigned ROpc, MOpc;
5659 switch (NVT.SimpleTy) {
5660 default: llvm_unreachable("Unexpected VT!");
5661 case MVT::i8:
5662 switch (Opcode) {
5663 default: llvm_unreachable("Unexpected opcode!");
5664 case ISD::ADD:
5665 ROpc = GET_ND_IF_ENABLED(X86::ADD8rr);
5666 MOpc = GET_NDM_IF_ENABLED(X86::ADD8rm);
5667 break;
5668 case ISD::SUB:
5669 ROpc = GET_ND_IF_ENABLED(X86::SUB8rr);
5670 MOpc = GET_NDM_IF_ENABLED(X86::SUB8rm);
5671 break;
5672 case ISD::AND:
5673 ROpc = GET_ND_IF_ENABLED(X86::AND8rr);
5674 MOpc = GET_NDM_IF_ENABLED(X86::AND8rm);
5675 break;
5676 case ISD::OR:
5677 ROpc = GET_ND_IF_ENABLED(X86::OR8rr);
5678 MOpc = GET_NDM_IF_ENABLED(X86::OR8rm);
5679 break;
5680 case ISD::XOR:
5681 ROpc = GET_ND_IF_ENABLED(X86::XOR8rr);
5682 MOpc = GET_NDM_IF_ENABLED(X86::XOR8rm);
5683 break;
5684 }
5685 break;
5686 case MVT::i16:
5687 switch (Opcode) {
5688 default: llvm_unreachable("Unexpected opcode!");
5689 case ISD::ADD:
5690 ROpc = GET_ND_IF_ENABLED(X86::ADD16rr);
5691 MOpc = GET_NDM_IF_ENABLED(X86::ADD16rm);
5692 break;
5693 case ISD::SUB:
5694 ROpc = GET_ND_IF_ENABLED(X86::SUB16rr);
5695 MOpc = GET_NDM_IF_ENABLED(X86::SUB16rm);
5696 break;
5697 case ISD::AND:
5698 ROpc = GET_ND_IF_ENABLED(X86::AND16rr);
5699 MOpc = GET_NDM_IF_ENABLED(X86::AND16rm);
5700 break;
5701 case ISD::OR:
5702 ROpc = GET_ND_IF_ENABLED(X86::OR16rr);
5703 MOpc = GET_NDM_IF_ENABLED(X86::OR16rm);
5704 break;
5705 case ISD::XOR:
5706 ROpc = GET_ND_IF_ENABLED(X86::XOR16rr);
5707 MOpc = GET_NDM_IF_ENABLED(X86::XOR16rm);
5708 break;
5709 }
5710 break;
5711 case MVT::i32:
5712 switch (Opcode) {
5713 default: llvm_unreachable("Unexpected opcode!");
5714 case ISD::ADD:
5715 ROpc = GET_ND_IF_ENABLED(X86::ADD32rr);
5716 MOpc = GET_NDM_IF_ENABLED(X86::ADD32rm);
5717 break;
5718 case ISD::SUB:
5719 ROpc = GET_ND_IF_ENABLED(X86::SUB32rr);
5720 MOpc = GET_NDM_IF_ENABLED(X86::SUB32rm);
5721 break;
5722 case ISD::AND:
5723 ROpc = GET_ND_IF_ENABLED(X86::AND32rr);
5724 MOpc = GET_NDM_IF_ENABLED(X86::AND32rm);
5725 break;
5726 case ISD::OR:
5727 ROpc = GET_ND_IF_ENABLED(X86::OR32rr);
5728 MOpc = GET_NDM_IF_ENABLED(X86::OR32rm);
5729 break;
5730 case ISD::XOR:
5731 ROpc = GET_ND_IF_ENABLED(X86::XOR32rr);
5732 MOpc = GET_NDM_IF_ENABLED(X86::XOR32rm);
5733 break;
5734 }
5735 break;
5736 case MVT::i64:
5737 switch (Opcode) {
5738 default: llvm_unreachable("Unexpected opcode!");
5739 case ISD::ADD:
5740 ROpc = GET_ND_IF_ENABLED(X86::ADD64rr);
5741 MOpc = GET_NDM_IF_ENABLED(X86::ADD64rm);
5742 break;
5743 case ISD::SUB:
5744 ROpc = GET_ND_IF_ENABLED(X86::SUB64rr);
5745 MOpc = GET_NDM_IF_ENABLED(X86::SUB64rm);
5746 break;
5747 case ISD::AND:
5748 ROpc = GET_ND_IF_ENABLED(X86::AND64rr);
5749 MOpc = GET_NDM_IF_ENABLED(X86::AND64rm);
5750 break;
5751 case ISD::OR:
5752 ROpc = GET_ND_IF_ENABLED(X86::OR64rr);
5753 MOpc = GET_NDM_IF_ENABLED(X86::OR64rm);
5754 break;
5755 case ISD::XOR:
5756 ROpc = GET_ND_IF_ENABLED(X86::XOR64rr);
5757 MOpc = GET_NDM_IF_ENABLED(X86::XOR64rm);
5758 break;
5759 }
5760 break;
5761 }
5762
5763     // OK, this is an AND/OR/XOR/ADD/SUB with a constant.
5764
5765     // If this is not a subtract, we can still try to fold a load.
5766 if (Opcode != ISD::SUB) {
5767 SDValue Tmp0, Tmp1, Tmp2, Tmp3, Tmp4;
5768 if (tryFoldLoad(Node, N0, Tmp0, Tmp1, Tmp2, Tmp3, Tmp4)) {
5769 SDValue Ops[] = { N1, Tmp0, Tmp1, Tmp2, Tmp3, Tmp4, N0.getOperand(0) };
5770 SDVTList VTs = CurDAG->getVTList(NVT, MVT::i32, MVT::Other);
5771 MachineSDNode *CNode = CurDAG->getMachineNode(MOpc, dl, VTs, Ops);
5772 // Update the chain.
5773 ReplaceUses(N0.getValue(1), SDValue(CNode, 2));
5774 // Record the mem-refs
5775 CurDAG->setNodeMemRefs(CNode, {cast<LoadSDNode>(N0)->getMemOperand()});
5776 ReplaceUses(SDValue(Node, 0), SDValue(CNode, 0));
5777 CurDAG->RemoveDeadNode(Node);
5778 return;
5779 }
5780 }
5781
5782 CurDAG->SelectNodeTo(Node, ROpc, NVT, MVT::i32, N0, N1);
5783 return;
5784 }
5785
5786 case X86ISD::SMUL:
5787 // i16/i32/i64 are handled with isel patterns.
5788 if (NVT != MVT::i8)
5789 break;
5790 [[fallthrough]];
5791 case X86ISD::UMUL: {
5792 SDValue N0 = Node->getOperand(0);
5793 SDValue N1 = Node->getOperand(1);
5794
5795 unsigned LoReg, ROpc, MOpc;
5796 switch (NVT.SimpleTy) {
5797 default: llvm_unreachable("Unsupported VT!");
5798 case MVT::i8:
5799 LoReg = X86::AL;
5800 ROpc = Opcode == X86ISD::SMUL ? X86::IMUL8r : X86::MUL8r;
5801 MOpc = Opcode == X86ISD::SMUL ? X86::IMUL8m : X86::MUL8m;
5802 break;
5803 case MVT::i16:
5804 LoReg = X86::AX;
5805 ROpc = X86::MUL16r;
5806 MOpc = X86::MUL16m;
5807 break;
5808 case MVT::i32:
5809 LoReg = X86::EAX;
5810 ROpc = X86::MUL32r;
5811 MOpc = X86::MUL32m;
5812 break;
5813 case MVT::i64:
5814 LoReg = X86::RAX;
5815 ROpc = X86::MUL64r;
5816 MOpc = X86::MUL64m;
5817 break;
5818 }
5819
5820 SDValue Tmp0, Tmp1, Tmp2, Tmp3, Tmp4;
5821 bool FoldedLoad = tryFoldLoad(Node, N1, Tmp0, Tmp1, Tmp2, Tmp3, Tmp4);
5822 // Multiply is commutative.
5823 if (!FoldedLoad) {
5824 FoldedLoad = tryFoldLoad(Node, N0, Tmp0, Tmp1, Tmp2, Tmp3, Tmp4);
5825 if (FoldedLoad)
5826 std::swap(N0, N1);
5827 }
5828
5829 // UMUL/SMUL have an implicit source in LoReg (AL/AX/EAX/RAX). Prefer the
5830 // operand that's already there to avoid an extra register-to-register move.
5831 if (!FoldedLoad)
5832 orderRegForMul(N0, N1, LoReg, CurDAG->getMachineFunction().getRegInfo());
5833
5834 SDValue InGlue = CurDAG->getCopyToReg(CurDAG->getEntryNode(), dl, LoReg,
5835 N0, SDValue()).getValue(1);
5836
5837 MachineSDNode *CNode;
5838 if (FoldedLoad) {
5839 // i16/i32/i64 use an instruction that produces a low and high result even
5840 // though only the low result is used.
5841 SDVTList VTs;
5842 if (NVT == MVT::i8)
5843 VTs = CurDAG->getVTList(NVT, MVT::i32, MVT::Other);
5844 else
5845 VTs = CurDAG->getVTList(NVT, NVT, MVT::i32, MVT::Other);
5846
5847 SDValue Ops[] = { Tmp0, Tmp1, Tmp2, Tmp3, Tmp4, N1.getOperand(0),
5848 InGlue };
5849 CNode = CurDAG->getMachineNode(MOpc, dl, VTs, Ops);
5850
5851 // Update the chain.
5852 ReplaceUses(N1.getValue(1), SDValue(CNode, NVT == MVT::i8 ? 2 : 3));
5853 // Record the mem-refs
5854 CurDAG->setNodeMemRefs(CNode, {cast<LoadSDNode>(N1)->getMemOperand()});
5855 } else {
5856 // i16/i32/i64 use an instruction that produces a low and high result even
5857 // though only the low result is used.
5858 SDVTList VTs;
5859 if (NVT == MVT::i8)
5860 VTs = CurDAG->getVTList(NVT, MVT::i32);
5861 else
5862 VTs = CurDAG->getVTList(NVT, NVT, MVT::i32);
5863
5864 CNode = CurDAG->getMachineNode(ROpc, dl, VTs, {N1, InGlue});
5865 }
5866
5867 ReplaceUses(SDValue(Node, 0), SDValue(CNode, 0));
5868 ReplaceUses(SDValue(Node, 1), SDValue(CNode, NVT == MVT::i8 ? 1 : 2));
5869 CurDAG->RemoveDeadNode(Node);
5870 return;
5871 }
5872
5873 case ISD::SMUL_LOHI:
5874 case ISD::UMUL_LOHI: {
5875 SDValue N0 = Node->getOperand(0);
5876 SDValue N1 = Node->getOperand(1);
5877
5878 unsigned Opc, MOpc;
5879 unsigned LoReg, HiReg;
5880 bool IsSigned = Opcode == ISD::SMUL_LOHI;
5881 bool UseMULX = !IsSigned && Subtarget->hasBMI2();
5882 bool UseMULXHi = UseMULX && SDValue(Node, 0).use_empty();
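    // MULX (BMI2) takes its implicit source in EDX/RDX, writes the high and
    // low halves to two explicit destinations, and does not modify EFLAGS,
    // so we prefer it for unsigned multiplies when available.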
5883 switch (NVT.SimpleTy) {
5884 default: llvm_unreachable("Unsupported VT!");
5885 case MVT::i32:
5886 Opc = UseMULXHi ? X86::MULX32Hrr
5887 : UseMULX ? GET_EGPR_IF_ENABLED(X86::MULX32rr)
5888 : IsSigned ? X86::IMUL32r
5889 : X86::MUL32r;
5890 MOpc = UseMULXHi ? X86::MULX32Hrm
5891 : UseMULX ? GET_EGPR_IF_ENABLED(X86::MULX32rm)
5892 : IsSigned ? X86::IMUL32m
5893 : X86::MUL32m;
5894 LoReg = UseMULX ? X86::EDX : X86::EAX;
5895 HiReg = X86::EDX;
5896 break;
5897 case MVT::i64:
5898 Opc = UseMULXHi ? X86::MULX64Hrr
5899 : UseMULX ? GET_EGPR_IF_ENABLED(X86::MULX64rr)
5900 : IsSigned ? X86::IMUL64r
5901 : X86::MUL64r;
5902 MOpc = UseMULXHi ? X86::MULX64Hrm
5903 : UseMULX ? GET_EGPR_IF_ENABLED(X86::MULX64rm)
5904 : IsSigned ? X86::IMUL64m
5905 : X86::MUL64m;
5906 LoReg = UseMULX ? X86::RDX : X86::RAX;
5907 HiReg = X86::RDX;
5908 break;
5909 }
5910
5911 SDValue Tmp0, Tmp1, Tmp2, Tmp3, Tmp4;
5912 bool foldedLoad = tryFoldLoad(Node, N1, Tmp0, Tmp1, Tmp2, Tmp3, Tmp4);
5913 // Multiply is commutative.
5914 if (!foldedLoad) {
5915 foldedLoad = tryFoldLoad(Node, N0, Tmp0, Tmp1, Tmp2, Tmp3, Tmp4);
5916 if (foldedLoad)
5917 std::swap(N0, N1);
5918 }
5919
5920     // SMUL_LOHI/UMUL_LOHI have an implicit source in LoReg (RDX for MULX,
5921     // RAX for MUL/IMUL). Prefer the operand that's already there.
5922 if (!foldedLoad)
5923 orderRegForMul(N0, N1, LoReg, CurDAG->getMachineFunction().getRegInfo());
5924
5925 SDValue InGlue = CurDAG->getCopyToReg(CurDAG->getEntryNode(), dl, LoReg,
5926 N0, SDValue()).getValue(1);
5927 SDValue ResHi, ResLo;
5928 if (foldedLoad) {
5929 SDValue Chain;
5930 MachineSDNode *CNode = nullptr;
5931 SDValue Ops[] = { Tmp0, Tmp1, Tmp2, Tmp3, Tmp4, N1.getOperand(0),
5932 InGlue };
5933 if (UseMULXHi) {
5934 SDVTList VTs = CurDAG->getVTList(NVT, MVT::Other);
5935 CNode = CurDAG->getMachineNode(MOpc, dl, VTs, Ops);
5936 ResHi = SDValue(CNode, 0);
5937 Chain = SDValue(CNode, 1);
5938 } else if (UseMULX) {
5939 SDVTList VTs = CurDAG->getVTList(NVT, NVT, MVT::Other);
5940 CNode = CurDAG->getMachineNode(MOpc, dl, VTs, Ops);
5941 ResHi = SDValue(CNode, 0);
5942 ResLo = SDValue(CNode, 1);
5943 Chain = SDValue(CNode, 2);
5944 } else {
5945 SDVTList VTs = CurDAG->getVTList(MVT::Other, MVT::Glue);
5946 CNode = CurDAG->getMachineNode(MOpc, dl, VTs, Ops);
5947 Chain = SDValue(CNode, 0);
5948 InGlue = SDValue(CNode, 1);
5949 }
5950
5951 // Update the chain.
5952 ReplaceUses(N1.getValue(1), Chain);
5953 // Record the mem-refs
5954 CurDAG->setNodeMemRefs(CNode, {cast<LoadSDNode>(N1)->getMemOperand()});
5955 } else {
5956 SDValue Ops[] = { N1, InGlue };
5957 if (UseMULXHi) {
5958 SDVTList VTs = CurDAG->getVTList(NVT);
5959 SDNode *CNode = CurDAG->getMachineNode(Opc, dl, VTs, Ops);
5960 ResHi = SDValue(CNode, 0);
5961 } else if (UseMULX) {
5962 SDVTList VTs = CurDAG->getVTList(NVT, NVT);
5963 SDNode *CNode = CurDAG->getMachineNode(Opc, dl, VTs, Ops);
5964 ResHi = SDValue(CNode, 0);
5965 ResLo = SDValue(CNode, 1);
5966 } else {
5967 SDVTList VTs = CurDAG->getVTList(MVT::Glue);
5968 SDNode *CNode = CurDAG->getMachineNode(Opc, dl, VTs, Ops);
5969 InGlue = SDValue(CNode, 0);
5970 }
5971 }
5972
5973 // Copy the low half of the result, if it is needed.
5974 if (!SDValue(Node, 0).use_empty()) {
5975 if (!ResLo) {
5976 assert(LoReg && "Register for low half is not defined!");
5977 ResLo = CurDAG->getCopyFromReg(CurDAG->getEntryNode(), dl, LoReg,
5978 NVT, InGlue);
5979 InGlue = ResLo.getValue(2);
5980 }
5981 ReplaceUses(SDValue(Node, 0), ResLo);
5982 LLVM_DEBUG(dbgs() << "=> "; ResLo.getNode()->dump(CurDAG);
5983 dbgs() << '\n');
5984 }
5985 // Copy the high half of the result, if it is needed.
5986 if (!SDValue(Node, 1).use_empty()) {
5987 if (!ResHi) {
5988 assert(HiReg && "Register for high half is not defined!");
5989 ResHi = CurDAG->getCopyFromReg(CurDAG->getEntryNode(), dl, HiReg,
5990 NVT, InGlue);
5991 InGlue = ResHi.getValue(2);
5992 }
5993 ReplaceUses(SDValue(Node, 1), ResHi);
5994 LLVM_DEBUG(dbgs() << "=> "; ResHi.getNode()->dump(CurDAG);
5995 dbgs() << '\n');
5996 }
5997
5998 CurDAG->RemoveDeadNode(Node);
5999 return;
6000 }
6001
6002 case ISD::SDIVREM:
6003 case ISD::UDIVREM: {
6004 SDValue N0 = Node->getOperand(0);
6005 SDValue N1 = Node->getOperand(1);
6006
6007 unsigned ROpc, MOpc;
6008 bool isSigned = Opcode == ISD::SDIVREM;
6009 if (!isSigned) {
6010 switch (NVT.SimpleTy) {
6011 default: llvm_unreachable("Unsupported VT!");
6012 case MVT::i8: ROpc = X86::DIV8r; MOpc = X86::DIV8m; break;
6013 case MVT::i16: ROpc = X86::DIV16r; MOpc = X86::DIV16m; break;
6014 case MVT::i32: ROpc = X86::DIV32r; MOpc = X86::DIV32m; break;
6015 case MVT::i64: ROpc = X86::DIV64r; MOpc = X86::DIV64m; break;
6016 }
6017 } else {
6018 switch (NVT.SimpleTy) {
6019 default: llvm_unreachable("Unsupported VT!");
6020 case MVT::i8: ROpc = X86::IDIV8r; MOpc = X86::IDIV8m; break;
6021 case MVT::i16: ROpc = X86::IDIV16r; MOpc = X86::IDIV16m; break;
6022 case MVT::i32: ROpc = X86::IDIV32r; MOpc = X86::IDIV32m; break;
6023 case MVT::i64: ROpc = X86::IDIV64r; MOpc = X86::IDIV64m; break;
6024 }
6025 }
6026
6027 unsigned LoReg, HiReg, ClrReg;
6028 unsigned SExtOpcode;
6029 switch (NVT.SimpleTy) {
6030 default: llvm_unreachable("Unsupported VT!");
6031 case MVT::i8:
6032 LoReg = X86::AL; ClrReg = HiReg = X86::AH;
6033 SExtOpcode = 0; // Not used.
6034 break;
6035 case MVT::i16:
6036 LoReg = X86::AX; HiReg = X86::DX;
6037 ClrReg = X86::DX;
6038 SExtOpcode = X86::CWD;
6039 break;
6040 case MVT::i32:
6041 LoReg = X86::EAX; ClrReg = HiReg = X86::EDX;
6042 SExtOpcode = X86::CDQ;
6043 break;
6044 case MVT::i64:
6045 LoReg = X86::RAX; ClrReg = HiReg = X86::RDX;
6046 SExtOpcode = X86::CQO;
6047 break;
6048 }
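    // For example, a 32-bit signed divide copies the dividend into EAX, uses
    // CDQ to sign-extend it into EDX:EAX, and IDIV then leaves the quotient
    // in EAX and the remainder in EDX.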
6049
6050 SDValue Tmp0, Tmp1, Tmp2, Tmp3, Tmp4;
6051 bool foldedLoad = tryFoldLoad(Node, N1, Tmp0, Tmp1, Tmp2, Tmp3, Tmp4);
6052 bool signBitIsZero = CurDAG->SignBitIsZero(N0);
6053
6054 SDValue InGlue;
6055 if (NVT == MVT::i8) {
6056 // Special case for div8, just use a move with zero extension to AX to
6057 // clear the upper 8 bits (AH).
6058 SDValue Tmp0, Tmp1, Tmp2, Tmp3, Tmp4, Chain;
6059 MachineSDNode *Move;
6060 if (tryFoldLoad(Node, N0, Tmp0, Tmp1, Tmp2, Tmp3, Tmp4)) {
6061 SDValue Ops[] = { Tmp0, Tmp1, Tmp2, Tmp3, Tmp4, N0.getOperand(0) };
6062 unsigned Opc = (isSigned && !signBitIsZero) ? X86::MOVSX16rm8
6063 : X86::MOVZX16rm8;
6064 Move = CurDAG->getMachineNode(Opc, dl, MVT::i16, MVT::Other, Ops);
6065 Chain = SDValue(Move, 1);
6066 ReplaceUses(N0.getValue(1), Chain);
6067 // Record the mem-refs
6068 CurDAG->setNodeMemRefs(Move, {cast<LoadSDNode>(N0)->getMemOperand()});
6069 } else {
6070 unsigned Opc = (isSigned && !signBitIsZero) ? X86::MOVSX16rr8
6071 : X86::MOVZX16rr8;
6072 Move = CurDAG->getMachineNode(Opc, dl, MVT::i16, N0);
6073 Chain = CurDAG->getEntryNode();
6074 }
6075 Chain = CurDAG->getCopyToReg(Chain, dl, X86::AX, SDValue(Move, 0),
6076 SDValue());
6077 InGlue = Chain.getValue(1);
6078 } else {
6079 InGlue =
6080 CurDAG->getCopyToReg(CurDAG->getEntryNode(), dl,
6081 LoReg, N0, SDValue()).getValue(1);
6082 if (isSigned && !signBitIsZero) {
6083 // Sign extend the low part into the high part.
6084 InGlue =
6085 SDValue(CurDAG->getMachineNode(SExtOpcode, dl, MVT::Glue, InGlue),0);
6086 } else {
6087 // Zero out the high part, effectively zero extending the input.
6088 SDVTList VTs = CurDAG->getVTList(MVT::i32, MVT::i32);
6089 SDValue ClrNode =
6090 SDValue(CurDAG->getMachineNode(X86::MOV32r0, dl, VTs, {}), 0);
6091 switch (NVT.SimpleTy) {
6092 case MVT::i16:
6093 ClrNode =
6094 SDValue(CurDAG->getMachineNode(
6095 TargetOpcode::EXTRACT_SUBREG, dl, MVT::i16, ClrNode,
6096 CurDAG->getTargetConstant(X86::sub_16bit, dl,
6097 MVT::i32)),
6098 0);
6099 break;
6100 case MVT::i32:
6101 break;
6102 case MVT::i64:
6103 ClrNode = SDValue(
6104 CurDAG->getMachineNode(
6105 TargetOpcode::SUBREG_TO_REG, dl, MVT::i64, ClrNode,
6106 CurDAG->getTargetConstant(X86::sub_32bit, dl, MVT::i32)),
6107 0);
6108 break;
6109 default:
6110 llvm_unreachable("Unexpected division source");
6111 }
6112
6113 InGlue = CurDAG->getCopyToReg(CurDAG->getEntryNode(), dl, ClrReg,
6114 ClrNode, InGlue).getValue(1);
6115 }
6116 }
6117
6118 if (foldedLoad) {
6119 SDValue Ops[] = { Tmp0, Tmp1, Tmp2, Tmp3, Tmp4, N1.getOperand(0),
6120 InGlue };
6121 MachineSDNode *CNode =
6122 CurDAG->getMachineNode(MOpc, dl, MVT::Other, MVT::Glue, Ops);
6123 InGlue = SDValue(CNode, 1);
6124 // Update the chain.
6125 ReplaceUses(N1.getValue(1), SDValue(CNode, 0));
6126 // Record the mem-refs
6127 CurDAG->setNodeMemRefs(CNode, {cast<LoadSDNode>(N1)->getMemOperand()});
6128 } else {
6129 InGlue =
6130 SDValue(CurDAG->getMachineNode(ROpc, dl, MVT::Glue, N1, InGlue), 0);
6131 }
6132
6133 // Prevent use of AH in a REX instruction by explicitly copying it to
6134 // an ABCD_L register.
6135 //
6136 // The current assumption of the register allocator is that isel
6137 // won't generate explicit references to the GR8_ABCD_H registers. If
6138 // the allocator and/or the backend get enhanced to be more robust in
6139 // that regard, this can be, and should be, removed.
6140 if (HiReg == X86::AH && !SDValue(Node, 1).use_empty()) {
6141 SDValue AHCopy = CurDAG->getRegister(X86::AH, MVT::i8);
6142 unsigned AHExtOpcode =
6143 isSigned ? X86::MOVSX32rr8_NOREX : X86::MOVZX32rr8_NOREX;
6144
6145 SDNode *RNode = CurDAG->getMachineNode(AHExtOpcode, dl, MVT::i32,
6146 MVT::Glue, AHCopy, InGlue);
6147 SDValue Result(RNode, 0);
6148 InGlue = SDValue(RNode, 1);
6149
6150 Result =
6151 CurDAG->getTargetExtractSubreg(X86::sub_8bit, dl, MVT::i8, Result);
6152
6153 ReplaceUses(SDValue(Node, 1), Result);
6154 LLVM_DEBUG(dbgs() << "=> "; Result.getNode()->dump(CurDAG);
6155 dbgs() << '\n');
6156 }
6157 // Copy the division (low) result, if it is needed.
6158 if (!SDValue(Node, 0).use_empty()) {
6159 SDValue Result = CurDAG->getCopyFromReg(CurDAG->getEntryNode(), dl,
6160 LoReg, NVT, InGlue);
6161 InGlue = Result.getValue(2);
6162 ReplaceUses(SDValue(Node, 0), Result);
6163 LLVM_DEBUG(dbgs() << "=> "; Result.getNode()->dump(CurDAG);
6164 dbgs() << '\n');
6165 }
6166 // Copy the remainder (high) result, if it is needed.
6167 if (!SDValue(Node, 1).use_empty()) {
6168 SDValue Result = CurDAG->getCopyFromReg(CurDAG->getEntryNode(), dl,
6169 HiReg, NVT, InGlue);
6170 InGlue = Result.getValue(2);
6171 ReplaceUses(SDValue(Node, 1), Result);
6172 LLVM_DEBUG(dbgs() << "=> "; Result.getNode()->dump(CurDAG);
6173 dbgs() << '\n');
6174 }
6175 CurDAG->RemoveDeadNode(Node);
6176 return;
6177 }
6178
6179 case X86ISD::FCMP:
6180 case X86ISD::STRICT_FCMP:
6181 case X86ISD::STRICT_FCMPS: {
6182 bool IsStrictCmp = Node->getOpcode() == X86ISD::STRICT_FCMP ||
6183 Node->getOpcode() == X86ISD::STRICT_FCMPS;
6184 SDValue N0 = Node->getOperand(IsStrictCmp ? 1 : 0);
6185 SDValue N1 = Node->getOperand(IsStrictCmp ? 2 : 1);
6186
6187 // Save the original VT of the compare.
6188 MVT CmpVT = N0.getSimpleValueType();
6189
6190 // Floating point needs special handling if we don't have FCOMI.
6191 if (Subtarget->canUseCMOV())
6192 break;
6193
6194 bool IsSignaling = Node->getOpcode() == X86ISD::STRICT_FCMPS;
6195
6196 unsigned Opc;
6197 switch (CmpVT.SimpleTy) {
6198 default: llvm_unreachable("Unexpected type!");
6199 case MVT::f32:
6200 Opc = IsSignaling ? X86::COM_Fpr32 : X86::UCOM_Fpr32;
6201 break;
6202 case MVT::f64:
6203 Opc = IsSignaling ? X86::COM_Fpr64 : X86::UCOM_Fpr64;
6204 break;
6205 case MVT::f80:
6206 Opc = IsSignaling ? X86::COM_Fpr80 : X86::UCOM_Fpr80;
6207 break;
6208 }
6209
6210 SDValue Chain =
6211 IsStrictCmp ? Node->getOperand(0) : CurDAG->getEntryNode();
6212 SDValue Glue;
6213 if (IsStrictCmp) {
6214 SDVTList VTs = CurDAG->getVTList(MVT::Other, MVT::Glue);
6215 Chain = SDValue(CurDAG->getMachineNode(Opc, dl, VTs, {N0, N1, Chain}), 0);
6216 Glue = Chain.getValue(1);
6217 } else {
6218 Glue = SDValue(CurDAG->getMachineNode(Opc, dl, MVT::Glue, N0, N1), 0);
6219 }
6220
6221 // Move FPSW to AX.
6222 SDValue FNSTSW =
6223 SDValue(CurDAG->getMachineNode(X86::FNSTSW16r, dl, MVT::i16, Glue), 0);
6224
6225 // Extract upper 8-bits of AX.
6226 SDValue Extract =
6227 CurDAG->getTargetExtractSubreg(X86::sub_8bit_hi, dl, MVT::i8, FNSTSW);
6228
6229 // Move AH into flags.
6230 // Some 64-bit targets lack SAHF support, but they do support FCOMI.
6231 assert(Subtarget->canUseLAHFSAHF() &&
6232 "Target doesn't support SAHF or FCOMI?");
6233 SDValue AH = CurDAG->getCopyToReg(Chain, dl, X86::AH, Extract, SDValue());
6234 Chain = AH;
6235 SDValue SAHF = SDValue(
6236 CurDAG->getMachineNode(X86::SAHF, dl, MVT::i32, AH.getValue(1)), 0);
6237
6238 if (IsStrictCmp)
6239 ReplaceUses(SDValue(Node, 1), Chain);
6240
6241 ReplaceUses(SDValue(Node, 0), SAHF);
6242 CurDAG->RemoveDeadNode(Node);
6243 return;
6244 }
6245
6246 case X86ISD::CMP: {
6247 SDValue N0 = Node->getOperand(0);
6248 SDValue N1 = Node->getOperand(1);
6249
6250 // Optimizations for TEST compares.
6251 if (!isNullConstant(N1))
6252 break;
6253
6254 // Save the original VT of the compare.
6255 MVT CmpVT = N0.getSimpleValueType();
6256
6257     // If we are comparing (and (shr X, C), Mask) with 0, emit a BEXTR followed
6258 // by a test instruction. The test should be removed later by
6259 // analyzeCompare if we are using only the zero flag.
6260 // TODO: Should we check the users and use the BEXTR flags directly?
6261 if (N0.getOpcode() == ISD::AND && N0.hasOneUse()) {
6262 if (MachineSDNode *NewNode = matchBEXTRFromAndImm(N0.getNode())) {
6263 unsigned TestOpc = CmpVT == MVT::i64 ? X86::TEST64rr
6264 : X86::TEST32rr;
6265 SDValue BEXTR = SDValue(NewNode, 0);
6266 NewNode = CurDAG->getMachineNode(TestOpc, dl, MVT::i32, BEXTR, BEXTR);
6267 ReplaceUses(SDValue(Node, 0), SDValue(NewNode, 0));
6268 CurDAG->RemoveDeadNode(Node);
6269 return;
6270 }
6271 }
6272
6273 // We can peek through truncates, but we need to be careful below.
6274 if (N0.getOpcode() == ISD::TRUNCATE && N0.hasOneUse())
6275 N0 = N0.getOperand(0);
6276
6277 // Look for (X86cmp (and $op, $imm), 0) and see if we can convert it to
6278 // use a smaller encoding.
6279 // Look past the truncate if CMP is the only use of it.
6280 if (N0.getOpcode() == ISD::AND && N0.getNode()->hasOneUse() &&
6281 N0.getValueType() != MVT::i8) {
6282 auto *MaskC = dyn_cast<ConstantSDNode>(N0.getOperand(1));
6283 if (!MaskC)
6284 break;
6285
6286 // We may have looked through a truncate so mask off any bits that
6287 // shouldn't be part of the compare.
6288 uint64_t Mask = MaskC->getZExtValue();
6289       Mask &= maskTrailingOnes<uint64_t>(CmpVT.getScalarSizeInBits());
6290 
6291 // Check if we can replace AND+IMM{32,64} with a shift. This is possible
6292 // for masks like 0xFF000000 or 0x00FFFFFF and if we care only about the
6293 // zero flag.
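      // e.g. "movabsq $0xFF00000000, %rcx; testq %rcx, %rax" can become
      // "shrq $32, %rax; testb %al, %al" when only ZF is consumed.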
6294 if (CmpVT == MVT::i64 && !isInt<8>(Mask) && isShiftedMask_64(Mask) &&
6295 onlyUsesZeroFlag(SDValue(Node, 0))) {
6296 unsigned ShiftOpcode = ISD::DELETED_NODE;
6297 unsigned ShiftAmt;
6298 unsigned SubRegIdx;
6299 MVT SubRegVT;
6300 unsigned TestOpcode;
6301 unsigned LeadingZeros = llvm::countl_zero(Mask);
6302 unsigned TrailingZeros = llvm::countr_zero(Mask);
6303
6304 // With leading/trailing zeros, the transform is profitable if we can
6305 // eliminate a movabsq or shrink a 32-bit immediate to 8-bit without
6306 // incurring any extra register moves.
6307 bool SavesBytes = !isInt<32>(Mask) || N0.getOperand(0).hasOneUse();
6308 if (LeadingZeros == 0 && SavesBytes) {
6309 // If the mask covers the most significant bit, then we can replace
6310 // TEST+AND with a SHR and check eflags.
6311 // This emits a redundant TEST which is subsequently eliminated.
6312 ShiftOpcode = GET_ND_IF_ENABLED(X86::SHR64ri);
6313 ShiftAmt = TrailingZeros;
6314 SubRegIdx = 0;
6315 TestOpcode = X86::TEST64rr;
6316 } else if (TrailingZeros == 0 && SavesBytes) {
6317 // If the mask covers the least significant bit, then we can replace
6318 // TEST+AND with a SHL and check eflags.
6319 // This emits a redundant TEST which is subsequently eliminated.
6320 ShiftOpcode = GET_ND_IF_ENABLED(X86::SHL64ri);
6321 ShiftAmt = LeadingZeros;
6322 SubRegIdx = 0;
6323 TestOpcode = X86::TEST64rr;
6324 } else if (MaskC->hasOneUse() && !isInt<32>(Mask)) {
6325 // If the shifted mask extends into the high half and is 8/16/32 bits
6326 // wide, then replace it with a SHR and a TEST8rr/TEST16rr/TEST32rr.
6327 unsigned PopCount = 64 - LeadingZeros - TrailingZeros;
6328 if (PopCount == 8) {
6329 ShiftOpcode = GET_ND_IF_ENABLED(X86::SHR64ri);
6330 ShiftAmt = TrailingZeros;
6331 SubRegIdx = X86::sub_8bit;
6332 SubRegVT = MVT::i8;
6333 TestOpcode = X86::TEST8rr;
6334 } else if (PopCount == 16) {
6335 ShiftOpcode = GET_ND_IF_ENABLED(X86::SHR64ri);
6336 ShiftAmt = TrailingZeros;
6337 SubRegIdx = X86::sub_16bit;
6338 SubRegVT = MVT::i16;
6339 TestOpcode = X86::TEST16rr;
6340 } else if (PopCount == 32) {
6341 ShiftOpcode = GET_ND_IF_ENABLED(X86::SHR64ri);
6342 ShiftAmt = TrailingZeros;
6343 SubRegIdx = X86::sub_32bit;
6344 SubRegVT = MVT::i32;
6345 TestOpcode = X86::TEST32rr;
6346 }
6347 }
6348 if (ShiftOpcode != ISD::DELETED_NODE) {
6349 SDValue ShiftC = CurDAG->getTargetConstant(ShiftAmt, dl, MVT::i64);
6350 SDValue Shift = SDValue(
6351 CurDAG->getMachineNode(ShiftOpcode, dl, MVT::i64, MVT::i32,
6352 N0.getOperand(0), ShiftC),
6353 0);
6354 if (SubRegIdx != 0) {
6355 Shift =
6356 CurDAG->getTargetExtractSubreg(SubRegIdx, dl, SubRegVT, Shift);
6357 }
6358 MachineSDNode *Test =
6359 CurDAG->getMachineNode(TestOpcode, dl, MVT::i32, Shift, Shift);
6360 ReplaceNode(Node, Test);
6361 return;
6362 }
6363 }
6364
6365 MVT VT;
6366 int SubRegOp;
6367 unsigned ROpc, MOpc;
6368
6369 // For each of these checks we need to be careful if the sign flag is
6370 // being used. It is only safe to use the sign flag under two conditions:
6371 // either the sign bit in the shrunken mask is zero, or the final test
6372 // size is equal to the original compare size.
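// e.g. shrinking "testl %eax, $0x80" to "testb %al, $0x80" moves the
// sign bit of the result from bit 31 to bit 7, changing what SF means;
// since that mask has bit 7 set, it requires i8 or no SF uses.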
6373
6374 if (isUInt<8>(Mask) &&
6375 (!(Mask & 0x80) || CmpVT == MVT::i8 ||
6376 hasNoSignFlagUses(SDValue(Node, 0)))) {
6377 // For example, convert "testl %eax, $8" to "testb %al, $8"
6378 VT = MVT::i8;
6379 SubRegOp = X86::sub_8bit;
6380 ROpc = X86::TEST8ri;
6381 MOpc = X86::TEST8mi;
6382 } else if (OptForMinSize && isUInt<16>(Mask) &&
6383 (!(Mask & 0x8000) || CmpVT == MVT::i16 ||
6384 hasNoSignFlagUses(SDValue(Node, 0)))) {
6385 // For example, "testl %eax, $32776" to "testw %ax, $32776".
6386 // NOTE: We only want to form TESTW instructions if optimizing for
6387 // min size. Otherwise we only save one byte and possibly get a length
6388 // changing prefix penalty in the decoders.
6389 VT = MVT::i16;
6390 SubRegOp = X86::sub_16bit;
6391 ROpc = X86::TEST16ri;
6392 MOpc = X86::TEST16mi;
6393 } else if (isUInt<32>(Mask) && N0.getValueType() != MVT::i16 &&
6394 ((!(Mask & 0x80000000) &&
6395 // Without minsize 16-bit Cmps can get here so we need to
6396 // be sure we calculate the correct sign flag if needed.
6397 (CmpVT != MVT::i16 || !(Mask & 0x8000))) ||
6398 CmpVT == MVT::i32 ||
6399 hasNoSignFlagUses(SDValue(Node, 0)))) {
6400 // For example, "testq %rax, $268468232" to "testl %eax, $268468232".
6401 // NOTE: We only want to run that transform if N0 is 32 or 64 bits.
6402 // Otherwize, we find ourselves in a position where we have to do
6403 // promotion. If previous passes did not promote the and, we assume
6404 // they had a good reason not to and do not promote here.
6405 VT = MVT::i32;
6406 SubRegOp = X86::sub_32bit;
6407 ROpc = X86::TEST32ri;
6408 MOpc = X86::TEST32mi;
6409 } else {
6410 // No eligible transformation was found.
6411 break;
6412 }
6413
6414 SDValue Imm = CurDAG->getTargetConstant(Mask, dl, VT);
6415 SDValue Reg = N0.getOperand(0);
6416
6417 // Emit a testb, testw, or testl.
6418 MachineSDNode *NewNode;
6419 SDValue Tmp0, Tmp1, Tmp2, Tmp3, Tmp4;
6420 if (tryFoldLoad(Node, N0.getNode(), Reg, Tmp0, Tmp1, Tmp2, Tmp3, Tmp4)) {
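// Do not narrow the access width of a volatile/atomic load: fold the
// test into memory only when it matches the original load size.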
6421 if (auto *LoadN = dyn_cast<LoadSDNode>(N0.getOperand(0).getNode())) {
6422 if (!LoadN->isSimple()) {
6423 unsigned NumVolBits = LoadN->getValueType(0).getSizeInBits();
6424 if ((MOpc == X86::TEST8mi && NumVolBits != 8) ||
6425 (MOpc == X86::TEST16mi && NumVolBits != 16) ||
6426 (MOpc == X86::TEST32mi && NumVolBits != 32))
6427 break;
6428 }
6429 }
6430 SDValue Ops[] = { Tmp0, Tmp1, Tmp2, Tmp3, Tmp4, Imm,
6431 Reg.getOperand(0) };
6432 NewNode = CurDAG->getMachineNode(MOpc, dl, MVT::i32, MVT::Other, Ops);
6433 // Update the chain.
6434 ReplaceUses(Reg.getValue(1), SDValue(NewNode, 1));
6435 // Record the mem-refs
6436 CurDAG->setNodeMemRefs(NewNode,
6437 {cast<LoadSDNode>(Reg)->getMemOperand()});
6438 } else {
6439 // Extract the subregister if necessary.
6440 if (N0.getValueType() != VT)
6441 Reg = CurDAG->getTargetExtractSubreg(SubRegOp, dl, VT, Reg);
6442
6443 NewNode = CurDAG->getMachineNode(ROpc, dl, MVT::i32, Reg, Imm);
6444 }
6445 // Replace CMP with TEST.
6446 ReplaceNode(Node, NewNode);
6447 return;
6448 }
6449 break;
6450 }
6451 case X86ISD::PCMPISTR: {
6452 if (!Subtarget->hasSSE42())
6453 break;
6454
6455 bool NeedIndex = !SDValue(Node, 0).use_empty();
6456 bool NeedMask = !SDValue(Node, 1).use_empty();
6457 // We can't fold a load if we are going to make two instructions.
6458 bool MayFoldLoad = !NeedIndex || !NeedMask;
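// (Folding the same load into both the index and mask forms would
// duplicate the memory access, so it is only done for a single use.)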
6459
6460 MachineSDNode *CNode;
6461 if (NeedMask) {
6462 unsigned ROpc =
6463 Subtarget->hasAVX() ? X86::VPCMPISTRMrri : X86::PCMPISTRMrri;
6464 unsigned MOpc =
6465 Subtarget->hasAVX() ? X86::VPCMPISTRMrmi : X86::PCMPISTRMrmi;
6466 CNode = emitPCMPISTR(ROpc, MOpc, MayFoldLoad, dl, MVT::v16i8, Node);
6467 ReplaceUses(SDValue(Node, 1), SDValue(CNode, 0));
6468 }
6469 if (NeedIndex || !NeedMask) {
6470 unsigned ROpc =
6471 Subtarget->hasAVX() ? X86::VPCMPISTRIrri : X86::PCMPISTRIrri;
6472 unsigned MOpc =
6473 Subtarget->hasAVX() ? X86::VPCMPISTRIrmi : X86::PCMPISTRIrmi;
6474 CNode = emitPCMPISTR(ROpc, MOpc, MayFoldLoad, dl, MVT::i32, Node);
6475 ReplaceUses(SDValue(Node, 0), SDValue(CNode, 0));
6476 }
6477
6478 // Connect the flag usage to the last instruction created.
6479 ReplaceUses(SDValue(Node, 2), SDValue(CNode, 1));
6480 CurDAG->RemoveDeadNode(Node);
6481 return;
6482 }
6483 case X86ISD::PCMPESTR: {
6484 if (!Subtarget->hasSSE42())
6485 break;
6486
6487 // Copy the two implicit register inputs.
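// PCMPESTR takes explicit string lengths: EAX for the first operand
// and EDX for the second.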
6488 SDValue InGlue = CurDAG->getCopyToReg(CurDAG->getEntryNode(), dl, X86::EAX,
6489 Node->getOperand(1),
6490 SDValue()).getValue(1);
6491 InGlue = CurDAG->getCopyToReg(CurDAG->getEntryNode(), dl, X86::EDX,
6492 Node->getOperand(3), InGlue).getValue(1);
6493
6494 bool NeedIndex = !SDValue(Node, 0).use_empty();
6495 bool NeedMask = !SDValue(Node, 1).use_empty();
6496 // We can't fold a load if we are going to make two instructions.
6497 bool MayFoldLoad = !NeedIndex || !NeedMask;
6498
6499 MachineSDNode *CNode;
6500 if (NeedMask) {
6501 unsigned ROpc =
6502 Subtarget->hasAVX() ? X86::VPCMPESTRMrri : X86::PCMPESTRMrri;
6503 unsigned MOpc =
6504 Subtarget->hasAVX() ? X86::VPCMPESTRMrmi : X86::PCMPESTRMrmi;
6505 CNode =
6506 emitPCMPESTR(ROpc, MOpc, MayFoldLoad, dl, MVT::v16i8, Node, InGlue);
6507 ReplaceUses(SDValue(Node, 1), SDValue(CNode, 0));
6508 }
6509 if (NeedIndex || !NeedMask) {
6510 unsigned ROpc =
6511 Subtarget->hasAVX() ? X86::VPCMPESTRIrri : X86::PCMPESTRIrri;
6512 unsigned MOpc =
6513 Subtarget->hasAVX() ? X86::VPCMPESTRIrmi : X86::PCMPESTRIrmi;
6514 CNode = emitPCMPESTR(ROpc, MOpc, MayFoldLoad, dl, MVT::i32, Node, InGlue);
6515 ReplaceUses(SDValue(Node, 0), SDValue(CNode, 0));
6516 }
6517 // Connect the flag usage to the last instruction created.
6518 ReplaceUses(SDValue(Node, 2), SDValue(CNode, 1));
6519 CurDAG->RemoveDeadNode(Node);
6520 return;
6521 }
6522
6523 case ISD::SETCC: {
6524 if (NVT.isVector() && tryVPTESTM(Node, SDValue(Node, 0), SDValue()))
6525 return;
6526
6527 break;
6528 }
6529
6530 case ISD::STORE:
6531 if (foldLoadStoreIntoMemOperand(Node))
6532 return;
6533 break;
6534
6535 case X86ISD::SETCC_CARRY: {
6536 MVT VT = Node->getSimpleValueType(0);
6537 SDValue Result;
6538 if (Subtarget->hasSBBDepBreaking()) {
6539 // We have to do this manually because tblgen will put the eflags copy in
6540 // the wrong place if we use an extract_subreg in the pattern.
6541 // Copy flags to the EFLAGS register and glue it to next node.
6542 SDValue EFLAGS =
6543 CurDAG->getCopyToReg(CurDAG->getEntryNode(), dl, X86::EFLAGS,
6544 Node->getOperand(1), SDValue());
6545
6546 // Create a 64-bit instruction if the result is 64 bits; otherwise use
6547 // the 32-bit version.
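// SETB_C expands post-RA to "sbb reg, reg", which materializes
// all-ones when CF is set and zero otherwise.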
6548 unsigned Opc = VT == MVT::i64 ? X86::SETB_C64r : X86::SETB_C32r;
6549 MVT SetVT = VT == MVT::i64 ? MVT::i64 : MVT::i32;
6550 Result = SDValue(
6551 CurDAG->getMachineNode(Opc, dl, SetVT, EFLAGS, EFLAGS.getValue(1)),
6552 0);
6553 } else {
6554 // The target does not recognize sbb with the same reg operand as a
6555 // no-source idiom, so we explicitly zero the input values.
6556 Result = getSBBZero(Node);
6557 }
6558
6559 // For results narrower than 32 bits we need to extract from the 32-bit node.
6560 if (VT == MVT::i8 || VT == MVT::i16) {
6561 int SubIndex = VT == MVT::i16 ? X86::sub_16bit : X86::sub_8bit;
6562 Result = CurDAG->getTargetExtractSubreg(SubIndex, dl, VT, Result);
6563 }
6564
6565 ReplaceUses(SDValue(Node, 0), Result);
6566 CurDAG->RemoveDeadNode(Node);
6567 return;
6568 }
6569 case X86ISD::SBB: {
6570 if (isNullConstant(Node->getOperand(0)) &&
6571 isNullConstant(Node->getOperand(1))) {
6572 SDValue Result = getSBBZero(Node);
6573
6574 // Replace the flag use.
6575 ReplaceUses(SDValue(Node, 1), Result.getValue(1));
6576
6577 // Replace the result use.
6578 if (!SDValue(Node, 0).use_empty()) {
6579 // For results narrower than 32 bits we need to extract from the 32-bit node.
6580 MVT VT = Node->getSimpleValueType(0);
6581 if (VT == MVT::i8 || VT == MVT::i16) {
6582 int SubIndex = VT == MVT::i16 ? X86::sub_16bit : X86::sub_8bit;
6583 Result = CurDAG->getTargetExtractSubreg(SubIndex, dl, VT, Result);
6584 }
6585 ReplaceUses(SDValue(Node, 0), Result);
6586 }
6587
6588 CurDAG->RemoveDeadNode(Node);
6589 return;
6590 }
6591 break;
6592 }
6593 case X86ISD::MGATHER: {
6594 auto *Mgt = cast<X86MaskedGatherSDNode>(Node);
6595 SDValue IndexOp = Mgt->getIndex();
6596 SDValue Mask = Mgt->getMask();
6597 MVT IndexVT = IndexOp.getSimpleValueType();
6598 MVT ValueVT = Node->getSimpleValueType(0);
6599 MVT MaskVT = Mask.getSimpleValueType();
6600
6601 // This is just to prevent crashes if the nodes are malformed somehow. We're
6602 // otherwise only doing loose type checking in here, based on what a type
6603 // constraint would say, just like table-based isel.
6604 if (!ValueVT.isVector() || !MaskVT.isVector())
6605 break;
6606
6607 unsigned NumElts = ValueVT.getVectorNumElements();
6608 MVT ValueSVT = ValueVT.getVectorElementType();
6609
6610 bool IsFP = ValueSVT.isFloatingPoint();
6611 unsigned EltSize = ValueSVT.getSizeInBits();
6612
6613 unsigned Opc = 0;
6614 bool AVX512Gather = MaskVT.getVectorElementType() == MVT::i1;
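// Opcode naming, for orientation: the letter after GATHER gives the
// index width (D = dword, Q = qword) and the rest the element type
// (PS/PD float, D/Q integer); Z128/Z256/Z select the EVEX length.
// e.g. v8f32 data with v8i32 indices selects VGATHERDPSZ256rm.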
6615 if (AVX512Gather) {
6616 if (IndexVT == MVT::v4i32 && NumElts == 4 && EltSize == 32)
6617 Opc = IsFP ? X86::VGATHERDPSZ128rm : X86::VPGATHERDDZ128rm;
6618 else if (IndexVT == MVT::v8i32 && NumElts == 8 && EltSize == 32)
6619 Opc = IsFP ? X86::VGATHERDPSZ256rm : X86::VPGATHERDDZ256rm;
6620 else if (IndexVT == MVT::v16i32 && NumElts == 16 && EltSize == 32)
6621 Opc = IsFP ? X86::VGATHERDPSZrm : X86::VPGATHERDDZrm;
6622 else if (IndexVT == MVT::v4i32 && NumElts == 2 && EltSize == 64)
6623 Opc = IsFP ? X86::VGATHERDPDZ128rm : X86::VPGATHERDQZ128rm;
6624 else if (IndexVT == MVT::v4i32 && NumElts == 4 && EltSize == 64)
6625 Opc = IsFP ? X86::VGATHERDPDZ256rm : X86::VPGATHERDQZ256rm;
6626 else if (IndexVT == MVT::v8i32 && NumElts == 8 && EltSize == 64)
6627 Opc = IsFP ? X86::VGATHERDPDZrm : X86::VPGATHERDQZrm;
6628 else if (IndexVT == MVT::v2i64 && NumElts == 4 && EltSize == 32)
6629 Opc = IsFP ? X86::VGATHERQPSZ128rm : X86::VPGATHERQDZ128rm;
6630 else if (IndexVT == MVT::v4i64 && NumElts == 4 && EltSize == 32)
6631 Opc = IsFP ? X86::VGATHERQPSZ256rm : X86::VPGATHERQDZ256rm;
6632 else if (IndexVT == MVT::v8i64 && NumElts == 8 && EltSize == 32)
6633 Opc = IsFP ? X86::VGATHERQPSZrm : X86::VPGATHERQDZrm;
6634 else if (IndexVT == MVT::v2i64 && NumElts == 2 && EltSize == 64)
6635 Opc = IsFP ? X86::VGATHERQPDZ128rm : X86::VPGATHERQQZ128rm;
6636 else if (IndexVT == MVT::v4i64 && NumElts == 4 && EltSize == 64)
6637 Opc = IsFP ? X86::VGATHERQPDZ256rm : X86::VPGATHERQQZ256rm;
6638 else if (IndexVT == MVT::v8i64 && NumElts == 8 && EltSize == 64)
6639 Opc = IsFP ? X86::VGATHERQPDZrm : X86::VPGATHERQQZrm;
6640 } else {
6641 assert(EVT(MaskVT) == EVT(ValueVT).changeVectorElementTypeToInteger() &&
6642 "Unexpected mask VT!");
6643 if (IndexVT == MVT::v4i32 && NumElts == 4 && EltSize == 32)
6644 Opc = IsFP ? X86::VGATHERDPSrm : X86::VPGATHERDDrm;
6645 else if (IndexVT == MVT::v8i32 && NumElts == 8 && EltSize == 32)
6646 Opc = IsFP ? X86::VGATHERDPSYrm : X86::VPGATHERDDYrm;
6647 else if (IndexVT == MVT::v4i32 && NumElts == 2 && EltSize == 64)
6648 Opc = IsFP ? X86::VGATHERDPDrm : X86::VPGATHERDQrm;
6649 else if (IndexVT == MVT::v4i32 && NumElts == 4 && EltSize == 64)
6650 Opc = IsFP ? X86::VGATHERDPDYrm : X86::VPGATHERDQYrm;
6651 else if (IndexVT == MVT::v2i64 && NumElts == 4 && EltSize == 32)
6652 Opc = IsFP ? X86::VGATHERQPSrm : X86::VPGATHERQDrm;
6653 else if (IndexVT == MVT::v4i64 && NumElts == 4 && EltSize == 32)
6654 Opc = IsFP ? X86::VGATHERQPSYrm : X86::VPGATHERQDYrm;
6655 else if (IndexVT == MVT::v2i64 && NumElts == 2 && EltSize == 64)
6656 Opc = IsFP ? X86::VGATHERQPDrm : X86::VPGATHERQQrm;
6657 else if (IndexVT == MVT::v4i64 && NumElts == 4 && EltSize == 64)
6658 Opc = IsFP ? X86::VGATHERQPDYrm : X86::VPGATHERQQYrm;
6659 }
6660
6661 if (!Opc)
6662 break;
6663
6664 SDValue Base, Scale, Index, Disp, Segment;
6665 if (!selectVectorAddr(Mgt, Mgt->getBasePtr(), IndexOp, Mgt->getScale(),
6666 Base, Scale, Index, Disp, Segment))
6667 break;
6668
6669 SDValue PassThru = Mgt->getPassThru();
6670 SDValue Chain = Mgt->getChain();
6671 // Gather instructions have a mask output that is not in the ISD node.
6672 SDVTList VTs = CurDAG->getVTList(ValueVT, MaskVT, MVT::Other);
6673
6674 MachineSDNode *NewNode;
6675 if (AVX512Gather) {
6676 SDValue Ops[] = {PassThru, Mask, Base, Scale,
6677 Index, Disp, Segment, Chain};
6678 NewNode = CurDAG->getMachineNode(Opc, SDLoc(dl), VTs, Ops);
6679 } else {
6680 SDValue Ops[] = {PassThru, Base, Scale, Index,
6681 Disp, Segment, Mask, Chain};
6682 NewNode = CurDAG->getMachineNode(Opc, SDLoc(dl), VTs, Ops);
6683 }
6684 CurDAG->setNodeMemRefs(NewNode, {Mgt->getMemOperand()});
6685 ReplaceUses(SDValue(Node, 0), SDValue(NewNode, 0));
6686 ReplaceUses(SDValue(Node, 1), SDValue(NewNode, 2));
6687 CurDAG->RemoveDeadNode(Node);
6688 return;
6689 }
6690 case X86ISD::MSCATTER: {
6691 auto *Sc = cast<X86MaskedScatterSDNode>(Node);
6692 SDValue Value = Sc->getValue();
6693 SDValue IndexOp = Sc->getIndex();
6694 MVT IndexVT = IndexOp.getSimpleValueType();
6695 MVT ValueVT = Value.getSimpleValueType();
6696
6697 // This is just to prevent crashes if the nodes are malformed somehow. We're
6698 // otherwise only doing loose type checking in here, based on what a type
6699 // constraint would say, just like table-based isel.
6700 if (!ValueVT.isVector())
6701 break;
6702
6703 unsigned NumElts = ValueVT.getVectorNumElements();
6704 MVT ValueSVT = ValueVT.getVectorElementType();
6705
6706 bool IsFP = ValueSVT.isFloatingPoint();
6707 unsigned EltSize = ValueSVT.getSizeInBits();
6708
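// e.g. scattering v16f32 data with v16i32 indices selects the 512-bit
// form VSCATTERDPSZmr below.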
6709 unsigned Opc;
6710 if (IndexVT == MVT::v4i32 && NumElts == 4 && EltSize == 32)
6711 Opc = IsFP ? X86::VSCATTERDPSZ128mr : X86::VPSCATTERDDZ128mr;
6712 else if (IndexVT == MVT::v8i32 && NumElts == 8 && EltSize == 32)
6713 Opc = IsFP ? X86::VSCATTERDPSZ256mr : X86::VPSCATTERDDZ256mr;
6714 else if (IndexVT == MVT::v16i32 && NumElts == 16 && EltSize == 32)
6715 Opc = IsFP ? X86::VSCATTERDPSZmr : X86::VPSCATTERDDZmr;
6716 else if (IndexVT == MVT::v4i32 && NumElts == 2 && EltSize == 64)
6717 Opc = IsFP ? X86::VSCATTERDPDZ128mr : X86::VPSCATTERDQZ128mr;
6718 else if (IndexVT == MVT::v4i32 && NumElts == 4 && EltSize == 64)
6719 Opc = IsFP ? X86::VSCATTERDPDZ256mr : X86::VPSCATTERDQZ256mr;
6720 else if (IndexVT == MVT::v8i32 && NumElts == 8 && EltSize == 64)
6721 Opc = IsFP ? X86::VSCATTERDPDZmr : X86::VPSCATTERDQZmr;
6722 else if (IndexVT == MVT::v2i64 && NumElts == 4 && EltSize == 32)
6723 Opc = IsFP ? X86::VSCATTERQPSZ128mr : X86::VPSCATTERQDZ128mr;
6724 else if (IndexVT == MVT::v4i64 && NumElts == 4 && EltSize == 32)
6725 Opc = IsFP ? X86::VSCATTERQPSZ256mr : X86::VPSCATTERQDZ256mr;
6726 else if (IndexVT == MVT::v8i64 && NumElts == 8 && EltSize == 32)
6727 Opc = IsFP ? X86::VSCATTERQPSZmr : X86::VPSCATTERQDZmr;
6728 else if (IndexVT == MVT::v2i64 && NumElts == 2 && EltSize == 64)
6729 Opc = IsFP ? X86::VSCATTERQPDZ128mr : X86::VPSCATTERQQZ128mr;
6730 else if (IndexVT == MVT::v4i64 && NumElts == 4 && EltSize == 64)
6731 Opc = IsFP ? X86::VSCATTERQPDZ256mr : X86::VPSCATTERQQZ256mr;
6732 else if (IndexVT == MVT::v8i64 && NumElts == 8 && EltSize == 64)
6733 Opc = IsFP ? X86::VSCATTERQPDZmr : X86::VPSCATTERQQZmr;
6734 else
6735 break;
6736
6737 SDValue Base, Scale, Index, Disp, Segment;
6738 if (!selectVectorAddr(Sc, Sc->getBasePtr(), IndexOp, Sc->getScale(),
6739 Base, Scale, Index, Disp, Segment))
6740 break;
6741
6742 SDValue Mask = Sc->getMask();
6743 SDValue Chain = Sc->getChain();
6744 // Scatter instructions have a mask output that is not in the ISD node.
6745 SDVTList VTs = CurDAG->getVTList(Mask.getValueType(), MVT::Other);
6746 SDValue Ops[] = {Base, Scale, Index, Disp, Segment, Mask, Value, Chain};
6747
6748 MachineSDNode *NewNode = CurDAG->getMachineNode(Opc, SDLoc(dl), VTs, Ops);
6749 CurDAG->setNodeMemRefs(NewNode, {Sc->getMemOperand()});
6750 ReplaceUses(SDValue(Node, 0), SDValue(NewNode, 1));
6751 CurDAG->RemoveDeadNode(Node);
6752 return;
6753 }
6754 case ISD::PREALLOCATED_SETUP: {
6755 auto *MFI = CurDAG->getMachineFunction().getInfo<X86MachineFunctionInfo>();
6756 auto CallId = MFI->getPreallocatedIdForCallSite(
6757 cast<SrcValueSDNode>(Node->getOperand(1))->getValue());
6758 SDValue Chain = Node->getOperand(0);
6759 SDValue CallIdValue = CurDAG->getTargetConstant(CallId, dl, MVT::i32);
6760 MachineSDNode *New = CurDAG->getMachineNode(
6761 TargetOpcode::PREALLOCATED_SETUP, dl, MVT::Other, CallIdValue, Chain);
6762 ReplaceUses(SDValue(Node, 0), SDValue(New, 0)); // Chain
6763 CurDAG->RemoveDeadNode(Node);
6764 return;
6765 }
6766 case ISD::PREALLOCATED_ARG: {
6767 auto *MFI = CurDAG->getMachineFunction().getInfo<X86MachineFunctionInfo>();
6768 auto CallId = MFI->getPreallocatedIdForCallSite(
6769 cast<SrcValueSDNode>(Node->getOperand(1))->getValue());
6770 SDValue Chain = Node->getOperand(0);
6771 SDValue CallIdValue = CurDAG->getTargetConstant(CallId, dl, MVT::i32);
6772 SDValue ArgIndex = Node->getOperand(2);
6773 SDValue Ops[3];
6774 Ops[0] = CallIdValue;
6775 Ops[1] = ArgIndex;
6776 Ops[2] = Chain;
6777 MachineSDNode *New = CurDAG->getMachineNode(
6778 TargetOpcode::PREALLOCATED_ARG, dl,
6779 CurDAG->getVTList(TLI->getPointerTy(CurDAG->getDataLayout()),
6780 MVT::Other),
6781 Ops);
6782 ReplaceUses(SDValue(Node, 0), SDValue(New, 0)); // Arg pointer
6783 ReplaceUses(SDValue(Node, 1), SDValue(New, 1)); // Chain
6784 CurDAG->RemoveDeadNode(Node);
6785 return;
6786 }
6787 case X86ISD::AESENCWIDE128KL:
6788 case X86ISD::AESDECWIDE128KL:
6789 case X86ISD::AESENCWIDE256KL:
6790 case X86ISD::AESDECWIDE256KL: {
6791 if (!Subtarget->hasWIDEKL())
6792 break;
6793
6794 unsigned Opcode;
6795 switch (Node->getOpcode()) {
6796 default:
6797 llvm_unreachable("Unexpected opcode!");
6798 case X86ISD::AESENCWIDE128KL:
6799 Opcode = X86::AESENCWIDE128KL;
6800 break;
6801 case X86ISD::AESDECWIDE128KL:
6802 Opcode = X86::AESDECWIDE128KL;
6803 break;
6804 case X86ISD::AESENCWIDE256KL:
6805 Opcode = X86::AESENCWIDE256KL;
6806 break;
6807 case X86ISD::AESDECWIDE256KL:
6808 Opcode = X86::AESDECWIDE256KL;
6809 break;
6810 }
6811
6812 SDValue Chain = Node->getOperand(0);
6813 SDValue Addr = Node->getOperand(1);
6814
6815 SDValue Base, Scale, Index, Disp, Segment;
6816 if (!selectAddr(Node, Addr, Base, Scale, Index, Disp, Segment))
6817 break;
6818
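// The WIDEKL instructions process eight 128-bit blocks implicitly in
// XMM0-XMM7, so each value operand is copied into place on a glued
// chain before issuing the instruction.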
6819 Chain = CurDAG->getCopyToReg(Chain, dl, X86::XMM0, Node->getOperand(2),
6820 SDValue());
6821 Chain = CurDAG->getCopyToReg(Chain, dl, X86::XMM1, Node->getOperand(3),
6822 Chain.getValue(1));
6823 Chain = CurDAG->getCopyToReg(Chain, dl, X86::XMM2, Node->getOperand(4),
6824 Chain.getValue(1));
6825 Chain = CurDAG->getCopyToReg(Chain, dl, X86::XMM3, Node->getOperand(5),
6826 Chain.getValue(1));
6827 Chain = CurDAG->getCopyToReg(Chain, dl, X86::XMM4, Node->getOperand(6),
6828 Chain.getValue(1));
6829 Chain = CurDAG->getCopyToReg(Chain, dl, X86::XMM5, Node->getOperand(7),
6830 Chain.getValue(1));
6831 Chain = CurDAG->getCopyToReg(Chain, dl, X86::XMM6, Node->getOperand(8),
6832 Chain.getValue(1));
6833 Chain = CurDAG->getCopyToReg(Chain, dl, X86::XMM7, Node->getOperand(9),
6834 Chain.getValue(1));
6835
6836 MachineSDNode *Res = CurDAG->getMachineNode(
6837 Opcode, dl, Node->getVTList(),
6838 {Base, Scale, Index, Disp, Segment, Chain, Chain.getValue(1)});
6839 CurDAG->setNodeMemRefs(Res, cast<MemSDNode>(Node)->getMemOperand());
6840 ReplaceNode(Node, Res);
6841 return;
6842 }
6843 case X86ISD::POP_FROM_X87_REG: {
6844 SDValue Chain = Node->getOperand(0);
6845 Register Reg = cast<RegisterSDNode>(Node->getOperand(1))->getReg();
6846 SDValue Glue;
6847 if (Node->getNumValues() == 3)
6848 Glue = Node->getOperand(2);
6849 SDValue Copy =
6850 CurDAG->getCopyFromReg(Chain, dl, Reg, Node->getValueType(0), Glue);
6851 ReplaceNode(Node, Copy.getNode());
6852 return;
6853 }
6854 }
6855
6856 SelectCode(Node);
6857}
6858
6859bool X86DAGToDAGISel::SelectInlineAsmMemoryOperand(
6860 const SDValue &Op, InlineAsm::ConstraintCode ConstraintID,
6861 std::vector<SDValue> &OutOps) {
6862 SDValue Op0, Op1, Op2, Op3, Op4;
6863 switch (ConstraintID) {
6864 default:
6865 llvm_unreachable("Unexpected asm memory constraint");
6866 case InlineAsm::ConstraintCode::o: // offsetable ??
6867 case InlineAsm::ConstraintCode::v: // not offsetable ??
6868 case InlineAsm::ConstraintCode::m: // memory
6869 case InlineAsm::ConstraintCode::X:
6870 case InlineAsm::ConstraintCode::p: // address
6871 if (!selectAddr(nullptr, Op, Op0, Op1, Op2, Op3, Op4))
6872 return true;
6873 break;
6874 }
6875
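// A successful selectAddr yields the standard five-operand x86 memory
// reference pushed below: base, scale, index, displacement, segment.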
6876 OutOps.push_back(Op0);
6877 OutOps.push_back(Op1);
6878 OutOps.push_back(Op2);
6879 OutOps.push_back(Op3);
6880 OutOps.push_back(Op4);
6881 return false;
6882}
6883
6884 X86ISelDAGToDAGPass::X86ISelDAGToDAGPass(X86TargetMachine &TM)
6885 : SelectionDAGISelPass(
6886 std::make_unique<X86DAGToDAGISel>(TM, TM.getOptLevel())) {}
6887
6888 /// This pass converts a legalized DAG into an X86-specific DAG,
6889/// ready for instruction scheduling.
6890 FunctionPass *llvm::createX86ISelDag(X86TargetMachine &TM,
6891 CodeGenOptLevel OptLevel) {
6892 return new X86DAGToDAGISelLegacy(TM, OptLevel);
6893}