1//===- X86ISelDAGToDAG.cpp - A DAG pattern matching inst selector for X86 -===//
2//
3// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4// See https://llvm.org/LICENSE.txt for license information.
5// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6//
7//===----------------------------------------------------------------------===//
8//
9// This file defines a DAG pattern matching instruction selector for X86,
10// converting from a legalized dag to a X86 dag.
11//
12//===----------------------------------------------------------------------===//
13
14#include "X86ISelDAGToDAG.h"
15#include "X86.h"
17#include "X86Subtarget.h"
18#include "X86TargetMachine.h"
19#include "llvm/ADT/Statistic.h"
22#include "llvm/Config/llvm-config.h"
24#include "llvm/IR/Function.h"
26#include "llvm/IR/Intrinsics.h"
27#include "llvm/IR/IntrinsicsX86.h"
28#include "llvm/IR/Module.h"
29#include "llvm/IR/Type.h"
30#include "llvm/Support/Debug.h"
34#include <cstdint>
35
36using namespace llvm;
37
38#define DEBUG_TYPE "x86-isel"
39#define PASS_NAME "X86 DAG->DAG Instruction Selection"
40
41STATISTIC(NumLoadMoved, "Number of loads moved below TokenFactor");
42
43static cl::opt<bool> AndImmShrink("x86-and-imm-shrink", cl::init(true),
44 cl::desc("Enable setting constant bits to reduce size of mask immediates"),
45 cl::Hidden);
46
48 "x86-promote-anyext-load", cl::init(true),
49 cl::desc("Enable promoting aligned anyext load to wider load"), cl::Hidden);
50
51extern cl::opt<bool> IndirectBranchTracking;
52
53//===----------------------------------------------------------------------===//
54// Pattern Matcher Implementation
55//===----------------------------------------------------------------------===//
56
57namespace {
58 /// This corresponds to X86AddressMode, but uses SDValues instead of register
59 /// numbers for the leaves of the matched tree.
60 struct X86ISelAddressMode {
61 enum {
62 RegBase,
63 FrameIndexBase
64 } BaseType = RegBase;
65
66 // This is really a union, discriminated by BaseType!
67 SDValue Base_Reg;
68 int Base_FrameIndex = 0;
69
70 unsigned Scale = 1;
71 SDValue IndexReg;
72 int32_t Disp = 0;
73 SDValue Segment;
74 const GlobalValue *GV = nullptr;
75 const Constant *CP = nullptr;
76 const BlockAddress *BlockAddr = nullptr;
77 const char *ES = nullptr;
78 MCSymbol *MCSym = nullptr;
79 int JT = -1;
80 Align Alignment; // CP alignment.
81 unsigned char SymbolFlags = X86II::MO_NO_FLAG; // X86II::MO_*
82 bool NegateIndex = false;
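 // Taken together these fields describe one x86 memory operand of the form
 // Segment:[Base + Scale*Index + Disp], where the symbolic operands above
 // (GV, CP, BlockAddr, ES, MCSym, JT) may supply or augment the displacement.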
83
84 X86ISelAddressMode() = default;
85
86 bool hasSymbolicDisplacement() const {
87 return GV != nullptr || CP != nullptr || ES != nullptr ||
88 MCSym != nullptr || JT != -1 || BlockAddr != nullptr;
89 }
90
91 bool hasBaseOrIndexReg() const {
92 return BaseType == FrameIndexBase ||
93 IndexReg.getNode() != nullptr || Base_Reg.getNode() != nullptr;
94 }
95
96 /// Return true if this addressing mode is already RIP-relative.
97 bool isRIPRelative() const {
98 if (BaseType != RegBase) return false;
99 if (RegisterSDNode *RegNode =
100 dyn_cast_or_null<RegisterSDNode>(Base_Reg.getNode()))
101 return RegNode->getReg() == X86::RIP;
102 return false;
103 }
104
105 void setBaseReg(SDValue Reg) {
106 BaseType = RegBase;
107 Base_Reg = Reg;
108 }
109
110#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
111 void dump(SelectionDAG *DAG = nullptr) {
112 dbgs() << "X86ISelAddressMode " << this << '\n';
113 dbgs() << "Base_Reg ";
114 if (Base_Reg.getNode())
115 Base_Reg.getNode()->dump(DAG);
116 else
117 dbgs() << "nul\n";
118 if (BaseType == FrameIndexBase)
119 dbgs() << " Base.FrameIndex " << Base_FrameIndex << '\n';
120 dbgs() << " Scale " << Scale << '\n'
121 << "IndexReg ";
122 if (NegateIndex)
123 dbgs() << "negate ";
124 if (IndexReg.getNode())
125 IndexReg.getNode()->dump(DAG);
126 else
127 dbgs() << "nul\n";
128 dbgs() << " Disp " << Disp << '\n'
129 << "GV ";
130 if (GV)
131 GV->dump();
132 else
133 dbgs() << "nul";
134 dbgs() << " CP ";
135 if (CP)
136 CP->dump();
137 else
138 dbgs() << "nul";
139 dbgs() << '\n'
140 << "ES ";
141 if (ES)
142 dbgs() << ES;
143 else
144 dbgs() << "nul";
145 dbgs() << " MCSym ";
146 if (MCSym)
147 dbgs() << MCSym;
148 else
149 dbgs() << "nul";
150 dbgs() << " JT" << JT << " Align" << Alignment.value() << '\n';
151 }
152#endif
153 };
154}
155
156namespace {
157 //===--------------------------------------------------------------------===//
158 /// ISel - X86-specific code to select X86 machine instructions for
159 /// SelectionDAG operations.
160 ///
161 class X86DAGToDAGISel final : public SelectionDAGISel {
162 /// Keep a pointer to the X86Subtarget around so that we can
163 /// make the right decision when generating code for different targets.
164 const X86Subtarget *Subtarget;
165
166 /// If true, selector should try to optimize for minimum code size.
167 bool OptForMinSize;
168
169 /// Disable direct TLS access through segment registers.
170 bool IndirectTlsSegRefs;
171
172 public:
173 X86DAGToDAGISel() = delete;
174
175 explicit X86DAGToDAGISel(X86TargetMachine &tm, CodeGenOptLevel OptLevel)
176 : SelectionDAGISel(tm, OptLevel), Subtarget(nullptr),
177 OptForMinSize(false), IndirectTlsSegRefs(false) {}
178
179 bool runOnMachineFunction(MachineFunction &MF) override {
180 // Reset the subtarget each time through.
181 Subtarget = &MF.getSubtarget<X86Subtarget>();
182 IndirectTlsSegRefs = MF.getFunction().hasFnAttribute(
183 "indirect-tls-seg-refs");
184
185 // OptFor[Min]Size are used in pattern predicates that isel is matching.
186 OptForMinSize = MF.getFunction().hasMinSize();
187 return SelectionDAGISel::runOnMachineFunction(MF);
188 }
189
190 void emitFunctionEntryCode() override;
191
192 bool IsProfitableToFold(SDValue N, SDNode *U, SDNode *Root) const override;
193
194 void PreprocessISelDAG() override;
195 void PostprocessISelDAG() override;
196
197// Include the pieces autogenerated from the target description.
198#include "X86GenDAGISel.inc"
199
200 private:
201 void Select(SDNode *N) override;
202
203 bool foldOffsetIntoAddress(uint64_t Offset, X86ISelAddressMode &AM);
204 bool matchLoadInAddress(LoadSDNode *N, X86ISelAddressMode &AM,
205 bool AllowSegmentRegForX32 = false);
206 bool matchWrapper(SDValue N, X86ISelAddressMode &AM);
207 bool matchAddress(SDValue N, X86ISelAddressMode &AM);
208 bool matchVectorAddress(SDValue N, X86ISelAddressMode &AM);
209 bool matchAdd(SDValue &N, X86ISelAddressMode &AM, unsigned Depth);
210 SDValue matchIndexRecursively(SDValue N, X86ISelAddressMode &AM,
211 unsigned Depth);
212 bool matchAddressRecursively(SDValue N, X86ISelAddressMode &AM,
213 unsigned Depth);
214 bool matchVectorAddressRecursively(SDValue N, X86ISelAddressMode &AM,
215 unsigned Depth);
216 bool matchAddressBase(SDValue N, X86ISelAddressMode &AM);
217 bool selectAddr(SDNode *Parent, SDValue N, SDValue &Base,
218 SDValue &Scale, SDValue &Index, SDValue &Disp,
219 SDValue &Segment);
220 bool selectVectorAddr(MemSDNode *Parent, SDValue BasePtr, SDValue IndexOp,
221 SDValue ScaleOp, SDValue &Base, SDValue &Scale,
222 SDValue &Index, SDValue &Disp, SDValue &Segment);
223 bool selectMOV64Imm32(SDValue N, SDValue &Imm);
224 bool selectLEAAddr(SDValue N, SDValue &Base,
225 SDValue &Scale, SDValue &Index, SDValue &Disp,
226 SDValue &Segment);
227 bool selectLEA64_Addr(SDValue N, SDValue &Base, SDValue &Scale,
228 SDValue &Index, SDValue &Disp, SDValue &Segment);
229 bool selectTLSADDRAddr(SDValue N, SDValue &Base,
230 SDValue &Scale, SDValue &Index, SDValue &Disp,
231 SDValue &Segment);
232 bool selectRelocImm(SDValue N, SDValue &Op);
233
234 bool tryFoldLoad(SDNode *Root, SDNode *P, SDValue N,
235 SDValue &Base, SDValue &Scale,
236 SDValue &Index, SDValue &Disp,
237 SDValue &Segment);
238
239 // Convenience method where P is also root.
240 bool tryFoldLoad(SDNode *P, SDValue N,
241 SDValue &Base, SDValue &Scale,
242 SDValue &Index, SDValue &Disp,
243 SDValue &Segment) {
244 return tryFoldLoad(P, P, N, Base, Scale, Index, Disp, Segment);
245 }
246
247 bool tryFoldBroadcast(SDNode *Root, SDNode *P, SDValue N,
248 SDValue &Base, SDValue &Scale,
249 SDValue &Index, SDValue &Disp,
250 SDValue &Segment);
251
252 bool isProfitableToFormMaskedOp(SDNode *N) const;
253
254 /// Implement addressing mode selection for inline asm expressions.
255 bool SelectInlineAsmMemoryOperand(const SDValue &Op,
256 InlineAsm::ConstraintCode ConstraintID,
257 std::vector<SDValue> &OutOps) override;
258
259 void emitSpecialCodeForMain();
260
261 inline void getAddressOperands(X86ISelAddressMode &AM, const SDLoc &DL,
262 MVT VT, SDValue &Base, SDValue &Scale,
263 SDValue &Index, SDValue &Disp,
264 SDValue &Segment) {
265 if (AM.BaseType == X86ISelAddressMode::FrameIndexBase)
266 Base = CurDAG->getTargetFrameIndex(
267 AM.Base_FrameIndex, TLI->getPointerTy(CurDAG->getDataLayout()));
268 else if (AM.Base_Reg.getNode())
269 Base = AM.Base_Reg;
270 else
271 Base = CurDAG->getRegister(0, VT);
272
273 Scale = getI8Imm(AM.Scale, DL);
274
275#define GET_ND_IF_ENABLED(OPC) (Subtarget->hasNDD() ? OPC##_ND : OPC)
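 // APX "new data destination" (NDD) forms write their result to a separate
 // destination register; the macro above simply picks the _ND flavor of an
 // opcode whenever the subtarget supports NDD.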
276 // Negate the index if needed.
277 if (AM.NegateIndex) {
278 unsigned NegOpc;
279 switch (VT.SimpleTy) {
280 default:
281 llvm_unreachable("Unsupported VT!");
282 case MVT::i64:
283 NegOpc = GET_ND_IF_ENABLED(X86::NEG64r);
284 break;
285 case MVT::i32:
286 NegOpc = GET_ND_IF_ENABLED(X86::NEG32r);
287 break;
288 case MVT::i16:
289 NegOpc = GET_ND_IF_ENABLED(X86::NEG16r);
290 break;
291 case MVT::i8:
292 NegOpc = GET_ND_IF_ENABLED(X86::NEG8r);
293 break;
294 }
295 SDValue Neg = SDValue(CurDAG->getMachineNode(NegOpc, DL, VT, MVT::i32,
296 AM.IndexReg), 0);
297 AM.IndexReg = Neg;
298 }
299
300 if (AM.IndexReg.getNode())
301 Index = AM.IndexReg;
302 else
303 Index = CurDAG->getRegister(0, VT);
304
305 // These are 32-bit even in 64-bit mode since RIP-relative offset
306 // is 32-bit.
307 if (AM.GV)
308 Disp = CurDAG->getTargetGlobalAddress(AM.GV, SDLoc(),
309 MVT::i32, AM.Disp,
310 AM.SymbolFlags);
311 else if (AM.CP)
312 Disp = CurDAG->getTargetConstantPool(AM.CP, MVT::i32, AM.Alignment,
313 AM.Disp, AM.SymbolFlags);
314 else if (AM.ES) {
315 assert(!AM.Disp && "Non-zero displacement is ignored with ES.");
316 Disp = CurDAG->getTargetExternalSymbol(AM.ES, MVT::i32, AM.SymbolFlags);
317 } else if (AM.MCSym) {
318 assert(!AM.Disp && "Non-zero displacement is ignored with MCSym.");
319 assert(AM.SymbolFlags == 0 && "oo");
320 Disp = CurDAG->getMCSymbol(AM.MCSym, MVT::i32);
321 } else if (AM.JT != -1) {
322 assert(!AM.Disp && "Non-zero displacement is ignored with JT.");
323 Disp = CurDAG->getTargetJumpTable(AM.JT, MVT::i32, AM.SymbolFlags);
324 } else if (AM.BlockAddr)
325 Disp = CurDAG->getTargetBlockAddress(AM.BlockAddr, MVT::i32, AM.Disp,
326 AM.SymbolFlags);
327 else
328 Disp = CurDAG->getSignedTargetConstant(AM.Disp, DL, MVT::i32);
329
330 if (AM.Segment.getNode())
331 Segment = AM.Segment;
332 else
333 Segment = CurDAG->getRegister(0, MVT::i16);
334 }
335
336 // Utility function to determine whether N is an AMX SDNode right after
337 // lowering but before ISEL.
338 bool isAMXSDNode(SDNode *N) const {
339 // Check if N is AMX SDNode:
340 // 1. check result type;
341 // 2. check operand type;
342 for (unsigned Idx = 0, E = N->getNumValues(); Idx != E; ++Idx) {
343 if (N->getValueType(Idx) == MVT::x86amx)
344 return true;
345 }
346 for (unsigned Idx = 0, E = N->getNumOperands(); Idx != E; ++Idx) {
347 SDValue Op = N->getOperand(Idx);
348 if (Op.getValueType() == MVT::x86amx)
349 return true;
350 }
351 return false;
352 }
353
354 // Utility function to determine whether we should avoid selecting
355 // immediate forms of instructions in order to reduce code size.
356 // At a high level, we'd like to avoid such instructions when
357 // we have similar constants used within the same basic block
358 // that can be kept in a register.
359 //
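 // For example, at minsize, if the same 32-bit immediate feeds two or more
 // ALU instructions in a block, materializing it once in a register and
 // using register-register forms is usually smaller than re-encoding the
 // 4-byte immediate in every instruction.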
360 bool shouldAvoidImmediateInstFormsForSize(SDNode *N) const {
361 uint32_t UseCount = 0;
362
363 // Do not want to hoist if we're not optimizing for size.
364 // TODO: We'd like to remove this restriction.
365 // See the comment in X86InstrInfo.td for more info.
366 if (!CurDAG->shouldOptForSize())
367 return false;
368
369 // Walk all the users of the immediate.
370 for (const SDNode *User : N->users()) {
371 if (UseCount >= 2)
372 break;
373
374 // This user is already selected. Count it as a legitimate use and
375 // move on.
376 if (User->isMachineOpcode()) {
377 UseCount++;
378 continue;
379 }
380
381 // We want to count stores of immediates as real uses.
382 if (User->getOpcode() == ISD::STORE &&
383 User->getOperand(1).getNode() == N) {
384 UseCount++;
385 continue;
386 }
387
388 // We don't currently match users that have > 2 operands (except
389 // for stores, which are handled above).
390 // Those instructions won't match in ISEL, for now, and would
391 // be counted incorrectly.
392 // This may change in the future as we add additional instruction
393 // types.
394 if (User->getNumOperands() != 2)
395 continue;
396
397 // If this is a sign-extended 8-bit integer immediate used in an ALU
398 // instruction, there is probably an opcode encoding to save space.
399 auto *C = dyn_cast<ConstantSDNode>(N);
400 if (C && isInt<8>(C->getSExtValue()))
401 continue;
402
403 // Immediates that are used for offsets as part of stack
404 // manipulation should be left alone. These are typically
405 // used to indicate SP offsets for argument passing and
406 // will get pulled into stores/pushes (implicitly).
407 if (User->getOpcode() == X86ISD::ADD ||
408 User->getOpcode() == ISD::ADD ||
409 User->getOpcode() == X86ISD::SUB ||
410 User->getOpcode() == ISD::SUB) {
411
412 // Find the other operand of the add/sub.
413 SDValue OtherOp = User->getOperand(0);
414 if (OtherOp.getNode() == N)
415 OtherOp = User->getOperand(1);
416
417 // Don't count if the other operand is SP.
418 RegisterSDNode *RegNode;
419 if (OtherOp->getOpcode() == ISD::CopyFromReg &&
420 (RegNode = dyn_cast_or_null<RegisterSDNode>(
421 OtherOp->getOperand(1).getNode())))
422 if ((RegNode->getReg() == X86::ESP) ||
423 (RegNode->getReg() == X86::RSP))
424 continue;
425 }
426
427 // ... otherwise, count this and move on.
428 UseCount++;
429 }
430
431 // If we have more than 1 use, then recommend for hoisting.
432 return (UseCount > 1);
433 }
434
435 /// Return a target constant with the specified value of type i8.
436 inline SDValue getI8Imm(unsigned Imm, const SDLoc &DL) {
437 return CurDAG->getTargetConstant(Imm, DL, MVT::i8);
438 }
439
440 /// Return a target constant with the specified value, of type i32.
441 inline SDValue getI32Imm(unsigned Imm, const SDLoc &DL) {
442 return CurDAG->getTargetConstant(Imm, DL, MVT::i32);
443 }
444
445 /// Return a target constant with the specified value, of type i64.
446 inline SDValue getI64Imm(uint64_t Imm, const SDLoc &DL) {
447 return CurDAG->getTargetConstant(Imm, DL, MVT::i64);
448 }
449
450 SDValue getExtractVEXTRACTImmediate(SDNode *N, unsigned VecWidth,
451 const SDLoc &DL) {
452 assert((VecWidth == 128 || VecWidth == 256) && "Unexpected vector width");
453 uint64_t Index = N->getConstantOperandVal(1);
454 MVT VecVT = N->getOperand(0).getSimpleValueType();
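 // e.g. extracting the 128-bit subvector that starts at element 4 of a
 // v8i32 source yields (4 * 32) / 128 = 1, i.e. the upper half.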
455 return getI8Imm((Index * VecVT.getScalarSizeInBits()) / VecWidth, DL);
456 }
457
458 SDValue getInsertVINSERTImmediate(SDNode *N, unsigned VecWidth,
459 const SDLoc &DL) {
460 assert((VecWidth == 128 || VecWidth == 256) && "Unexpected vector width");
461 uint64_t Index = N->getConstantOperandVal(2);
462 MVT VecVT = N->getSimpleValueType(0);
463 return getI8Imm((Index * VecVT.getScalarSizeInBits()) / VecWidth, DL);
464 }
465
466 SDValue getPermuteVINSERTCommutedImmediate(SDNode *N, unsigned VecWidth,
467 const SDLoc &DL) {
468 assert(VecWidth == 128 && "Unexpected vector width");
469 uint64_t Index = N->getConstantOperandVal(2);
470 MVT VecVT = N->getSimpleValueType(0);
471 uint64_t InsertIdx = (Index * VecVT.getScalarSizeInBits()) / VecWidth;
472 assert((InsertIdx == 0 || InsertIdx == 1) && "Bad insertf128 index");
473 // vinsert(0,sub,vec) -> [sub0][vec1] -> vperm2x128(0x30,vec,sub)
474 // vinsert(1,sub,vec) -> [vec0][sub0] -> vperm2x128(0x02,vec,sub)
475 return getI8Imm(InsertIdx ? 0x02 : 0x30, DL);
476 }
477
478 SDValue getSBBZero(SDNode *N) {
479 SDLoc dl(N);
480 MVT VT = N->getSimpleValueType(0);
481
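 // This materializes SBB dst, dst with a zeroed input, i.e. 0 - 0 - CF,
 // which is all-ones when the incoming carry/condition flag is set and
 // zero otherwise.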
482 // Create zero.
483 SDVTList VTs = CurDAG->getVTList(MVT::i32, MVT::i32);
484 SDValue Zero =
485 SDValue(CurDAG->getMachineNode(X86::MOV32r0, dl, VTs, {}), 0);
486 if (VT == MVT::i64) {
487 Zero = SDValue(
488 CurDAG->getMachineNode(
489 TargetOpcode::SUBREG_TO_REG, dl, MVT::i64,
490 CurDAG->getTargetConstant(0, dl, MVT::i64), Zero,
491 CurDAG->getTargetConstant(X86::sub_32bit, dl, MVT::i32)),
492 0);
493 }
494
495 // Copy flags to the EFLAGS register and glue it to next node.
496 unsigned Opcode = N->getOpcode();
497 assert((Opcode == X86ISD::SBB || Opcode == X86ISD::SETCC_CARRY) &&
498 "Unexpected opcode for SBB materialization");
499 unsigned FlagOpIndex = Opcode == X86ISD::SBB ? 2 : 1;
500 SDValue EFLAGS =
501 CurDAG->getCopyToReg(CurDAG->getEntryNode(), dl, X86::EFLAGS,
502 N->getOperand(FlagOpIndex), SDValue());
503
504 // Create a 64-bit instruction if the result is 64 bits, otherwise use the
505 // 32-bit version.
506 unsigned Opc = VT == MVT::i64 ? X86::SBB64rr : X86::SBB32rr;
507 MVT SBBVT = VT == MVT::i64 ? MVT::i64 : MVT::i32;
508 VTs = CurDAG->getVTList(SBBVT, MVT::i32);
509 return SDValue(
510 CurDAG->getMachineNode(Opc, dl, VTs,
511 {Zero, Zero, EFLAGS, EFLAGS.getValue(1)}),
512 0);
513 }
514
515 // Helper to detect unneeded AND instructions on shift amounts. Called
516 // from PatFrags in tablegen.
517 bool isUnneededShiftMask(SDNode *N, unsigned Width) const {
518 assert(N->getOpcode() == ISD::AND && "Unexpected opcode");
519 const APInt &Val = N->getConstantOperandAPInt(1);
520
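 // e.g. for a 32-bit shift Width is 5, so an AND with 31 (five trailing
 // ones) only clears bits the shift ignores anyway; bits already known to
 // be zero in the shift amount are credited below as well.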
521 if (Val.countr_one() >= Width)
522 return true;
523
524 APInt Mask = Val | CurDAG->computeKnownBits(N->getOperand(0)).Zero;
525 return Mask.countr_one() >= Width;
526 }
527
528 /// Return an SDNode that returns the value of the global base register.
529 /// Output instructions required to initialize the global base register,
530 /// if necessary.
531 SDNode *getGlobalBaseReg();
532
533 /// Return a reference to the TargetMachine, casted to the target-specific
534 /// type.
535 const X86TargetMachine &getTargetMachine() const {
536 return static_cast<const X86TargetMachine &>(TM);
537 }
538
539 /// Return a reference to the TargetInstrInfo, casted to the target-specific
540 /// type.
541 const X86InstrInfo *getInstrInfo() const {
542 return Subtarget->getInstrInfo();
543 }
544
545 /// Return the condition code of the given SDNode.
546 X86::CondCode getCondFromNode(SDNode *N) const;
547
548 /// Address-mode matching performs shift-of-and to and-of-shift
549 /// reassociation in order to expose more scaled addressing
550 /// opportunities.
551 bool ComplexPatternFuncMutatesDAG() const override {
552 return true;
553 }
554
555 bool isSExtAbsoluteSymbolRef(unsigned Width, SDNode *N) const;
556
557 // Indicates we should prefer to use a non-temporal load for this load.
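 // Non-temporal vector loads map to (V)MOVNTDQA, which requires SSE4.1 for
 // 16-byte, AVX2 for 32-byte, and AVX512 for 64-byte accesses; there is no
 // non-temporal load instruction for smaller sizes.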
558 bool useNonTemporalLoad(LoadSDNode *N) const {
559 if (!N->isNonTemporal())
560 return false;
561
562 unsigned StoreSize = N->getMemoryVT().getStoreSize();
563
564 if (N->getAlign().value() < StoreSize)
565 return false;
566
567 switch (StoreSize) {
568 default: llvm_unreachable("Unsupported store size");
569 case 4:
570 case 8:
571 return false;
572 case 16:
573 return Subtarget->hasSSE41();
574 case 32:
575 return Subtarget->hasAVX2();
576 case 64:
577 return Subtarget->hasAVX512();
578 }
579 }
580
581 bool foldLoadStoreIntoMemOperand(SDNode *Node);
582 MachineSDNode *matchBEXTRFromAndImm(SDNode *Node);
583 bool matchBitExtract(SDNode *Node);
584 bool shrinkAndImmediate(SDNode *N);
585 bool isMaskZeroExtended(SDNode *N) const;
586 bool tryShiftAmountMod(SDNode *N);
587 bool tryShrinkShlLogicImm(SDNode *N);
588 bool tryVPTERNLOG(SDNode *N);
589 bool matchVPTERNLOG(SDNode *Root, SDNode *ParentA, SDNode *ParentB,
590 SDNode *ParentC, SDValue A, SDValue B, SDValue C,
591 uint8_t Imm);
592 bool tryVPTESTM(SDNode *Root, SDValue Setcc, SDValue Mask);
593 bool tryMatchBitSelect(SDNode *N);
594
595 MachineSDNode *emitPCMPISTR(unsigned ROpc, unsigned MOpc, bool MayFoldLoad,
596 const SDLoc &dl, MVT VT, SDNode *Node);
597 MachineSDNode *emitPCMPESTR(unsigned ROpc, unsigned MOpc, bool MayFoldLoad,
598 const SDLoc &dl, MVT VT, SDNode *Node,
599 SDValue &InGlue);
600
601 bool tryOptimizeRem8Extend(SDNode *N);
602
603 bool onlyUsesZeroFlag(SDValue Flags) const;
604 bool hasNoSignFlagUses(SDValue Flags) const;
605 bool hasNoCarryFlagUses(SDValue Flags) const;
606 };
607
608 class X86DAGToDAGISelLegacy : public SelectionDAGISelLegacy {
609 public:
610 static char ID;
611 explicit X86DAGToDAGISelLegacy(X86TargetMachine &tm,
612 CodeGenOptLevel OptLevel)
613 : SelectionDAGISelLegacy(
614 ID, std::make_unique<X86DAGToDAGISel>(tm, OptLevel)) {}
615 };
616}
617
618char X86DAGToDAGISelLegacy::ID = 0;
619
620INITIALIZE_PASS(X86DAGToDAGISelLegacy, DEBUG_TYPE, PASS_NAME, false, false)
621
622// Returns true if this masked compare can be implemented legally with this
623// type.
624static bool isLegalMaskCompare(SDNode *N, const X86Subtarget *Subtarget) {
625 unsigned Opcode = N->getOpcode();
626 if (Opcode == X86ISD::CMPM || Opcode == X86ISD::CMPMM ||
627 Opcode == X86ISD::STRICT_CMPM || Opcode == ISD::SETCC ||
628 Opcode == X86ISD::CMPMM_SAE || Opcode == X86ISD::VFPCLASS) {
629 // We can get 256-bit 8 element types here without VLX being enabled. When
630 // this happens we will use 512-bit operations and the mask will not be
631 // zero extended.
632 EVT OpVT = N->getOperand(0).getValueType();
633 // The first operand of X86ISD::STRICT_CMPM is chain, so we need to get the
634 // second operand.
635 if (Opcode == X86ISD::STRICT_CMPM)
636 OpVT = N->getOperand(1).getValueType();
637 if (OpVT.is256BitVector() || OpVT.is128BitVector())
638 return Subtarget->hasVLX();
639
640 return true;
641 }
642 // Scalar opcodes use 128 bit registers, but aren't subject to the VLX check.
643 if (Opcode == X86ISD::VFPCLASSS || Opcode == X86ISD::FSETCCM ||
644 Opcode == X86ISD::FSETCCM_SAE)
645 return true;
646
647 return false;
648}
649
650// Returns true if we can assume the writer of the mask has zero extended it
651// for us.
652bool X86DAGToDAGISel::isMaskZeroExtended(SDNode *N) const {
653 // If this is an AND, check if we have a compare on either side. As long as
654 // one side guarantees the mask is zero extended, the AND will preserve those
655 // zeros.
656 if (N->getOpcode() == ISD::AND)
657 return isLegalMaskCompare(N->getOperand(0).getNode(), Subtarget) ||
658 isLegalMaskCompare(N->getOperand(1).getNode(), Subtarget);
659
660 return isLegalMaskCompare(N, Subtarget);
661}
662
663bool
664X86DAGToDAGISel::IsProfitableToFold(SDValue N, SDNode *U, SDNode *Root) const {
665 if (OptLevel == CodeGenOptLevel::None)
666 return false;
667
668 if (!N.hasOneUse())
669 return false;
670
671 if (N.getOpcode() != ISD::LOAD)
672 return true;
673
674 // Don't fold non-temporal loads if we have an instruction for them.
675 if (useNonTemporalLoad(cast<LoadSDNode>(N)))
676 return false;
677
678 // If N is a load, do additional profitability checks.
679 if (U == Root) {
680 switch (U->getOpcode()) {
681 default: break;
682 case X86ISD::ADD:
683 case X86ISD::ADC:
684 case X86ISD::SUB:
685 case X86ISD::SBB:
686 case X86ISD::AND:
687 case X86ISD::XOR:
688 case X86ISD::OR:
689 case ISD::ADD:
690 case ISD::UADDO_CARRY:
691 case ISD::AND:
692 case ISD::OR:
693 case ISD::XOR: {
694 SDValue Op1 = U->getOperand(1);
695
696 // If the other operand is an 8-bit immediate we should fold the immediate
697 // instead. This reduces code size.
698 // e.g.
699 // movl 4(%esp), %eax
700 // addl $4, %eax
701 // vs.
702 // movl $4, %eax
703 // addl 4(%esp), %eax
704 // The former is 2 bytes shorter. In the case where the increment is 1,
705 // the saving can be 4 bytes (by using incl %eax).
706 if (auto *Imm = dyn_cast<ConstantSDNode>(Op1)) {
707 if (Imm->getAPIntValue().isSignedIntN(8))
708 return false;
709
710 // If this is a 64-bit AND with an immediate that fits in 32-bits,
711 // prefer using the smaller AND over folding the load. This is needed to
712 // make sure immediates created by shrinkAndImmediate are always folded.
713 // Ideally we would narrow the load during DAG combine and get the
714 // best of both worlds.
715 if (U->getOpcode() == ISD::AND &&
716 Imm->getAPIntValue().getBitWidth() == 64 &&
717 Imm->getAPIntValue().isIntN(32))
718 return false;
719
720 // If this is really a zext_inreg that can be represented with a movzx
721 // instruction, prefer that.
722 // TODO: We could shrink the load and fold if it is non-volatile.
723 if (U->getOpcode() == ISD::AND &&
724 (Imm->getAPIntValue() == UINT8_MAX ||
725 Imm->getAPIntValue() == UINT16_MAX ||
726 Imm->getAPIntValue() == UINT32_MAX))
727 return false;
728
729 // ADD/SUB can negate the immediate and use the opposite operation
730 // to fit 128 into a sign-extended 8-bit immediate.
731 if ((U->getOpcode() == ISD::ADD || U->getOpcode() == ISD::SUB) &&
732 (-Imm->getAPIntValue()).isSignedIntN(8))
733 return false;
734
735 if ((U->getOpcode() == X86ISD::ADD || U->getOpcode() == X86ISD::SUB) &&
736 (-Imm->getAPIntValue()).isSignedIntN(8) &&
737 hasNoCarryFlagUses(SDValue(U, 1)))
738 return false;
739 }
740
741 // If the other operand is a TLS address, we should fold it instead.
742 // This produces
743 // movl %gs:0, %eax
744 // leal i@NTPOFF(%eax), %eax
745 // instead of
746 // movl $i@NTPOFF, %eax
747 // addl %gs:0, %eax
748 // if the block also has an access to a second TLS address this will save
749 // a load.
750 // FIXME: This is probably also true for non-TLS addresses.
751 if (Op1.getOpcode() == X86ISD::Wrapper) {
752 SDValue Val = Op1.getOperand(0);
753 if (Val.getOpcode() == ISD::TargetGlobalTLSAddress)
754 return false;
755 }
756
757 // Don't fold load if this matches the BTS/BTR/BTC patterns.
758 // BTS: (or X, (shl 1, n))
759 // BTR: (and X, (rotl -2, n))
760 // BTC: (xor X, (shl 1, n))
761 if (U->getOpcode() == ISD::OR || U->getOpcode() == ISD::XOR) {
762 if (U->getOperand(0).getOpcode() == ISD::SHL &&
763 isOneConstant(U->getOperand(0).getOperand(0)))
764 return false;
765
766 if (U->getOperand(1).getOpcode() == ISD::SHL &&
767 isOneConstant(U->getOperand(1).getOperand(0)))
768 return false;
769 }
770 if (U->getOpcode() == ISD::AND) {
771 SDValue U0 = U->getOperand(0);
772 SDValue U1 = U->getOperand(1);
773 if (U0.getOpcode() == ISD::ROTL) {
774 auto *C = dyn_cast<ConstantSDNode>(U0.getOperand(0));
775 if (C && C->getSExtValue() == -2)
776 return false;
777 }
778
779 if (U1.getOpcode() == ISD::ROTL) {
780 auto *C = dyn_cast<ConstantSDNode>(U1.getOperand(0));
781 if (C && C->getSExtValue() == -2)
782 return false;
783 }
784 }
785
786 break;
787 }
788 case ISD::SHL:
789 case ISD::SRA:
790 case ISD::SRL:
791 // Don't fold a load into a shift by immediate. The BMI2 instructions
792 // support folding a load, but not an immediate. The legacy instructions
793 // support folding an immediate, but can't fold a load. Folding an
794 // immediate is preferable to folding a load.
795 if (isa<ConstantSDNode>(U->getOperand(1)))
796 return false;
797
798 break;
799 }
800 }
801
802 // Prevent folding a load if this can be implemented with an insert_subreg or
803 // a move that implicitly zeroes.
804 if (Root->getOpcode() == ISD::INSERT_SUBVECTOR &&
805 isNullConstant(Root->getOperand(2)) &&
806 (Root->getOperand(0).isUndef() ||
807 ISD::isBuildVectorAllZeros(Root->getOperand(0).getNode())))
808 return false;
809
810 return true;
811}
812
813// Indicates it is profitable to form an AVX512 masked operation. Returning
814// false will favor a register-register masked move or vblendm and the
815// operation will be selected separately.
816bool X86DAGToDAGISel::isProfitableToFormMaskedOp(SDNode *N) const {
817 assert(
818 (N->getOpcode() == ISD::VSELECT || N->getOpcode() == X86ISD::SELECTS) &&
819 "Unexpected opcode!");
820
821 // If the operation has additional users, the operation will be duplicated.
822 // Check the use count to prevent that.
823 // FIXME: Are there cheap opcodes we might want to duplicate?
824 return N->getOperand(1).hasOneUse();
825}
826
827/// Replace the original chain operand of the call with
828/// load's chain operand and move load below the call's chain operand.
829static void moveBelowOrigChain(SelectionDAG *CurDAG, SDValue Load,
830 SDValue Call, SDValue OrigChain) {
831 SmallVector<SDValue, 8> Ops;
832 SDValue Chain = OrigChain.getOperand(0);
833 if (Chain.getNode() == Load.getNode())
834 Ops.push_back(Load.getOperand(0));
835 else {
836 assert(Chain.getOpcode() == ISD::TokenFactor &&
837 "Unexpected chain operand");
838 for (unsigned i = 0, e = Chain.getNumOperands(); i != e; ++i)
839 if (Chain.getOperand(i).getNode() == Load.getNode())
840 Ops.push_back(Load.getOperand(0));
841 else
842 Ops.push_back(Chain.getOperand(i));
843 SDValue NewChain =
844 CurDAG->getNode(ISD::TokenFactor, SDLoc(Load), MVT::Other, Ops);
845 Ops.clear();
846 Ops.push_back(NewChain);
847 }
848 Ops.append(OrigChain->op_begin() + 1, OrigChain->op_end());
849 CurDAG->UpdateNodeOperands(OrigChain.getNode(), Ops);
850 CurDAG->UpdateNodeOperands(Load.getNode(), Call.getOperand(0),
851 Load.getOperand(1), Load.getOperand(2));
852
853 Ops.clear();
854 Ops.push_back(SDValue(Load.getNode(), 1));
855 Ops.append(Call->op_begin() + 1, Call->op_end());
856 CurDAG->UpdateNodeOperands(Call.getNode(), Ops);
857}
858
859/// Return true if call address is a load and it can be
860/// moved below CALLSEQ_START and the chains leading up to the call.
861/// Return the CALLSEQ_START by reference as a second output.
862/// In the case of a tail call, there isn't a callseq node between the call
863/// chain and the load.
864static bool isCalleeLoad(SDValue Callee, SDValue &Chain, bool HasCallSeq) {
865 // The transformation is somewhat dangerous if the call's chain was glued to
866 // the call. After moveBelowOrigChain the load is moved between the call and
867 // the chain; this can create a cycle if the load is not folded. So it is
868 // *really* important that we are sure the load will be folded.
869 if (Callee.getNode() == Chain.getNode() || !Callee.hasOneUse())
870 return false;
871 auto *LD = dyn_cast<LoadSDNode>(Callee.getNode());
872 if (!LD ||
873 !LD->isSimple() ||
874 LD->getAddressingMode() != ISD::UNINDEXED ||
875 LD->getExtensionType() != ISD::NON_EXTLOAD)
876 return false;
877
878 // Now let's find the callseq_start.
879 while (HasCallSeq && Chain.getOpcode() != ISD::CALLSEQ_START) {
880 if (!Chain.hasOneUse())
881 return false;
882 Chain = Chain.getOperand(0);
883 }
884
885 if (!Chain.getNumOperands())
886 return false;
887 // Since we are not checking for AA here, conservatively abort if the chain
888 // writes to memory. It's not safe to move the callee (a load) across a store.
889 if (isa<MemSDNode>(Chain.getNode()) &&
890 cast<MemSDNode>(Chain.getNode())->writeMem())
891 return false;
892 if (Chain.getOperand(0).getNode() == Callee.getNode())
893 return true;
894 if (Chain.getOperand(0).getOpcode() == ISD::TokenFactor &&
895 Callee.getValue(1).isOperandOf(Chain.getOperand(0).getNode()) &&
896 Callee.getValue(1).hasOneUse())
897 return true;
898 return false;
899}
900
901static bool isEndbrImm64(uint64_t Imm) {
902// There may be some other prefix bytes between 0xF3 and 0x0F1EFA,
903// e.g. 0xF3660F1EFA, 0xF3670F1EFA.
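// The check below requires the low three bytes to be 0F 1E FA and then walks
// the remaining bytes upward, accepting only optional legacy prefix bytes
// until the mandatory F3 is found.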
904 if ((Imm & 0x00FFFFFF) != 0x0F1EFA)
905 return false;
906
907 uint8_t OptionalPrefixBytes [] = {0x26, 0x2e, 0x36, 0x3e, 0x64,
908 0x65, 0x66, 0x67, 0xf0, 0xf2};
909 int i = 24; // the low 24 bits (0x0F1EFA) have already been matched
910 while (i < 64) {
911 uint8_t Byte = (Imm >> i) & 0xFF;
912 if (Byte == 0xF3)
913 return true;
914 if (!llvm::is_contained(OptionalPrefixBytes, Byte))
915 return false;
916 i += 8;
917 }
918
919 return false;
920}
921
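// 512-bit vectors with 8-bit or 16-bit elements are only directly usable with
// AVX512BW; without it the broadcasts below are emulated on 256-bit halves and
// reassembled with INSERT_SUBVECTOR.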
922static bool needBWI(MVT VT) {
923 return (VT == MVT::v32i16 || VT == MVT::v32f16 || VT == MVT::v64i8);
924}
925
926void X86DAGToDAGISel::PreprocessISelDAG() {
927 bool MadeChange = false;
928 for (SelectionDAG::allnodes_iterator I = CurDAG->allnodes_begin(),
929 E = CurDAG->allnodes_end(); I != E; ) {
930 SDNode *N = &*I++; // Preincrement iterator to avoid invalidation issues.
931
932 // This is for CET enhancement.
933 //
934 // ENDBR32 and ENDBR64 have specific opcodes:
935 // ENDBR32: F3 0F 1E FB
936 // ENDBR64: F3 0F 1E FA
937 // And we want to make sure that attackers won't find unintended ENDBR32/64
938 // opcode matches in the binary.
939 // Here's an example:
940 // If the compiler had to generate asm for the following code:
941 // a = 0xF30F1EFA
942 // it could, for example, generate:
943 // mov 0xF30F1EFA, dword ptr[a]
944 // In such a case, the binary would include a gadget that starts
945 // with a fake ENDBR64 opcode. Therefore, we split such generation
946 // into multiple operations so that it does not show up in the binary.
947 if (N->getOpcode() == ISD::Constant) {
948 MVT VT = N->getSimpleValueType(0);
949 int64_t Imm = cast<ConstantSDNode>(N)->getSExtValue();
950 int32_t EndbrImm = Subtarget->is64Bit() ? 0xF30F1EFA : 0xF30F1EFB;
951 if (Imm == EndbrImm || isEndbrImm64(Imm)) {
952 // Check that the cf-protection-branch is enabled.
953 Metadata *CFProtectionBranch =
955 "cf-protection-branch");
956 if (CFProtectionBranch || IndirectBranchTracking) {
957 SDLoc dl(N);
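 // Materialize the value as NOT(~Imm): the complemented constant is what
 // ends up in the instruction stream, so the literal ENDBR byte pattern is
 // never encoded as an immediate.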
958 SDValue Complement = CurDAG->getConstant(~Imm, dl, VT, false, true);
959 Complement = CurDAG->getNOT(dl, Complement, VT);
960 --I;
961 CurDAG->ReplaceAllUsesOfValueWith(SDValue(N, 0), Complement);
962 ++I;
963 MadeChange = true;
964 continue;
965 }
966 }
967 }
968
969 // If this is a target specific AND node with no flag usages, turn it back
970 // into ISD::AND to enable test instruction matching.
971 if (N->getOpcode() == X86ISD::AND && !N->hasAnyUseOfValue(1)) {
972 SDValue Res = CurDAG->getNode(ISD::AND, SDLoc(N), N->getValueType(0),
973 N->getOperand(0), N->getOperand(1));
974 --I;
975 CurDAG->ReplaceAllUsesOfValueWith(SDValue(N, 0), Res);
976 ++I;
977 MadeChange = true;
978 continue;
979 }
980
981 // Convert vector increment or decrement to sub/add with an all-ones
982 // constant:
983 // add X, <1, 1...> --> sub X, <-1, -1...>
984 // sub X, <1, 1...> --> add X, <-1, -1...>
985 // The all-ones vector constant can be materialized using a pcmpeq
986 // instruction that is commonly recognized as an idiom (has no register
987 // dependency), so that's better/smaller than loading a splat 1 constant.
988 //
989 // But don't do this if it would inhibit a potentially profitable load
990 // folding opportunity for the other operand. That only occurs with the
991 // intersection of:
992 // (1) The other operand (op0) is load foldable.
993 // (2) The op is an add (otherwise, we are *creating* an add and can still
994 // load fold the other op).
995 // (3) The target has AVX (otherwise, we have a destructive add and can't
996 // load fold the other op without killing the constant op).
997 // (4) The constant 1 vector has multiple uses (so it is profitable to load
998 // into a register anyway).
999 auto mayPreventLoadFold = [&]() {
1000 return X86::mayFoldLoad(N->getOperand(0), *Subtarget) &&
1001 N->getOpcode() == ISD::ADD && Subtarget->hasAVX() &&
1002 !N->getOperand(1).hasOneUse();
1003 };
1004 if ((N->getOpcode() == ISD::ADD || N->getOpcode() == ISD::SUB) &&
1005 N->getSimpleValueType(0).isVector() && !mayPreventLoadFold()) {
1006 APInt SplatVal;
1007 if (!ISD::isFreezeUndef(
1008 peekThroughBitcasts(N->getOperand(0)).getNode()) &&
1009 X86::isConstantSplat(N->getOperand(1), SplatVal) &&
1010 SplatVal.isOne()) {
1011 SDLoc DL(N);
1012
1013 MVT VT = N->getSimpleValueType(0);
1014 unsigned NumElts = VT.getSizeInBits() / 32;
1015 SDValue AllOnes =
1016 CurDAG->getAllOnesConstant(DL, MVT::getVectorVT(MVT::i32, NumElts));
1017 AllOnes = CurDAG->getBitcast(VT, AllOnes);
1018
1019 unsigned NewOpcode = N->getOpcode() == ISD::ADD ? ISD::SUB : ISD::ADD;
1020 SDValue Res =
1021 CurDAG->getNode(NewOpcode, DL, VT, N->getOperand(0), AllOnes);
1022 --I;
1023 CurDAG->ReplaceAllUsesWith(N, Res.getNode());
1024 ++I;
1025 MadeChange = true;
1026 continue;
1027 }
1028 }
1029
1030 switch (N->getOpcode()) {
1031 case X86ISD::VBROADCAST: {
1032 MVT VT = N->getSimpleValueType(0);
1033 // Emulate v32i16/v64i8 broadcast without BWI.
1034 if (!Subtarget->hasBWI() && needBWI(VT)) {
1035 MVT NarrowVT = VT.getHalfNumVectorElementsVT();
1036 SDLoc dl(N);
1037 SDValue NarrowBCast =
1038 CurDAG->getNode(X86ISD::VBROADCAST, dl, NarrowVT, N->getOperand(0));
1039 SDValue Res =
1040 CurDAG->getNode(ISD::INSERT_SUBVECTOR, dl, VT, CurDAG->getUNDEF(VT),
1041 NarrowBCast, CurDAG->getIntPtrConstant(0, dl));
1042 unsigned Index = NarrowVT.getVectorMinNumElements();
1043 Res = CurDAG->getNode(ISD::INSERT_SUBVECTOR, dl, VT, Res, NarrowBCast,
1044 CurDAG->getIntPtrConstant(Index, dl));
1045
1046 --I;
1047 CurDAG->ReplaceAllUsesWith(N, Res.getNode());
1048 ++I;
1049 MadeChange = true;
1050 continue;
1051 }
1052
1053 break;
1054 }
1055 case X86ISD::VBROADCAST_LOAD: {
1056 MVT VT = N->getSimpleValueType(0);
1057 // Emulate v32i16/v64i8 broadcast without BWI.
1058 if (!Subtarget->hasBWI() && needBWI(VT)) {
1059 MVT NarrowVT = VT.getHalfNumVectorElementsVT();
1060 auto *MemNode = cast<MemSDNode>(N);
1061 SDLoc dl(N);
1062 SDVTList VTs = CurDAG->getVTList(NarrowVT, MVT::Other);
1063 SDValue Ops[] = {MemNode->getChain(), MemNode->getBasePtr()};
1064 SDValue NarrowBCast = CurDAG->getMemIntrinsicNode(
1065 X86ISD::VBROADCAST_LOAD, dl, VTs, Ops, MemNode->getMemoryVT(),
1066 MemNode->getMemOperand());
1067 SDValue Res =
1068 CurDAG->getNode(ISD::INSERT_SUBVECTOR, dl, VT, CurDAG->getUNDEF(VT),
1069 NarrowBCast, CurDAG->getIntPtrConstant(0, dl));
1070 unsigned Index = NarrowVT.getVectorMinNumElements();
1071 Res = CurDAG->getNode(ISD::INSERT_SUBVECTOR, dl, VT, Res, NarrowBCast,
1072 CurDAG->getIntPtrConstant(Index, dl));
1073
1074 --I;
1075 SDValue To[] = {Res, NarrowBCast.getValue(1)};
1076 CurDAG->ReplaceAllUsesWith(N, To);
1077 ++I;
1078 MadeChange = true;
1079 continue;
1080 }
1081
1082 break;
1083 }
1084 case ISD::LOAD: {
1085 // If this is a XMM/YMM load of the same lower bits as another YMM/ZMM
1086 // load, then just extract the lower subvector and avoid the second load.
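 // That is, if another user of the same pointer already loads 256 or 512
 // bits on the same chain, reuse the widest such load and take an
 // EXTRACT_SUBVECTOR at index 0 instead of issuing this narrower load.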
1087 auto *Ld = cast<LoadSDNode>(N);
1088 MVT VT = N->getSimpleValueType(0);
1089 if (!ISD::isNormalLoad(Ld) || !Ld->isSimple() ||
1090 !(VT.is128BitVector() || VT.is256BitVector()))
1091 break;
1092
1093 MVT MaxVT = VT;
1094 SDNode *MaxLd = nullptr;
1095 SDValue Ptr = Ld->getBasePtr();
1096 SDValue Chain = Ld->getChain();
1097 for (SDNode *User : Ptr->users()) {
1098 auto *UserLd = dyn_cast<LoadSDNode>(User);
1099 MVT UserVT = User->getSimpleValueType(0);
1100 if (User != N && UserLd && ISD::isNormalLoad(User) &&
1101 UserLd->getBasePtr() == Ptr && UserLd->getChain() == Chain &&
1102 !User->hasAnyUseOfValue(1) &&
1103 (UserVT.is256BitVector() || UserVT.is512BitVector()) &&
1104 UserVT.getSizeInBits() > VT.getSizeInBits() &&
1105 (!MaxLd || UserVT.getSizeInBits() > MaxVT.getSizeInBits())) {
1106 MaxLd = User;
1107 MaxVT = UserVT;
1108 }
1109 }
1110 if (MaxLd) {
1111 SDLoc dl(N);
1112 unsigned NumSubElts = VT.getSizeInBits() / MaxVT.getScalarSizeInBits();
1113 MVT SubVT = MVT::getVectorVT(MaxVT.getScalarType(), NumSubElts);
1114 SDValue Extract = CurDAG->getNode(ISD::EXTRACT_SUBVECTOR, dl, SubVT,
1115 SDValue(MaxLd, 0),
1116 CurDAG->getIntPtrConstant(0, dl));
1117 SDValue Res = CurDAG->getBitcast(VT, Extract);
1118
1119 --I;
1120 SDValue To[] = {Res, SDValue(MaxLd, 1)};
1121 CurDAG->ReplaceAllUsesWith(N, To);
1122 ++I;
1123 MadeChange = true;
1124 continue;
1125 }
1126 break;
1127 }
1128 case ISD::VSELECT: {
1129 // Replace VSELECT with non-mask conditions with BLENDV/VPTERNLOG.
1130 EVT EleVT = N->getOperand(0).getValueType().getVectorElementType();
1131 if (EleVT == MVT::i1)
1132 break;
1133
1134 assert(Subtarget->hasSSE41() && "Expected SSE4.1 support!");
1135 assert(N->getValueType(0).getVectorElementType() != MVT::i16 &&
1136 "We can't replace VSELECT with BLENDV in vXi16!");
1137 SDValue R;
1138 if (Subtarget->hasVLX() && CurDAG->ComputeNumSignBits(N->getOperand(0)) ==
1139 EleVT.getSizeInBits()) {
1140 R = CurDAG->getNode(X86ISD::VPTERNLOG, SDLoc(N), N->getValueType(0),
1141 N->getOperand(0), N->getOperand(1), N->getOperand(2),
1142 CurDAG->getTargetConstant(0xCA, SDLoc(N), MVT::i8));
1143 } else {
1144 R = CurDAG->getNode(X86ISD::BLENDV, SDLoc(N), N->getValueType(0),
1145 N->getOperand(0), N->getOperand(1),
1146 N->getOperand(2));
1147 }
1148 --I;
1149 CurDAG->ReplaceAllUsesWith(N, R.getNode());
1150 ++I;
1151 MadeChange = true;
1152 continue;
1153 }
1154 case ISD::FP_ROUND:
1155 case ISD::STRICT_FP_ROUND:
1156 case ISD::FP_TO_SINT:
1157 case ISD::FP_TO_UINT:
1158 case ISD::STRICT_FP_TO_SINT:
1159 case ISD::STRICT_FP_TO_UINT: {
1160 // Replace vector fp_to_s/uint with their X86 specific equivalent so we
1161 // don't need 2 sets of patterns.
1162 if (!N->getSimpleValueType(0).isVector())
1163 break;
1164
1165 unsigned NewOpc;
1166 switch (N->getOpcode()) {
1167 default: llvm_unreachable("Unexpected opcode!");
1168 case ISD::FP_ROUND: NewOpc = X86ISD::VFPROUND; break;
1169 case ISD::STRICT_FP_ROUND: NewOpc = X86ISD::STRICT_VFPROUND; break;
1170 case ISD::STRICT_FP_TO_SINT: NewOpc = X86ISD::STRICT_CVTTP2SI; break;
1171 case ISD::FP_TO_SINT: NewOpc = X86ISD::CVTTP2SI; break;
1172 case ISD::STRICT_FP_TO_UINT: NewOpc = X86ISD::STRICT_CVTTP2UI; break;
1173 case ISD::FP_TO_UINT: NewOpc = X86ISD::CVTTP2UI; break;
1174 }
1175 SDValue Res;
1176 if (N->isStrictFPOpcode())
1177 Res =
1178 CurDAG->getNode(NewOpc, SDLoc(N), {N->getValueType(0), MVT::Other},
1179 {N->getOperand(0), N->getOperand(1)});
1180 else
1181 Res =
1182 CurDAG->getNode(NewOpc, SDLoc(N), N->getValueType(0),
1183 N->getOperand(0));
1184 --I;
1185 CurDAG->ReplaceAllUsesWith(N, Res.getNode());
1186 ++I;
1187 MadeChange = true;
1188 continue;
1189 }
1190 case ISD::SHL:
1191 case ISD::SRA:
1192 case ISD::SRL: {
1193 // Replace vector shifts with their X86 specific equivalent so we don't
1194 // need 2 sets of patterns.
1195 if (!N->getValueType(0).isVector())
1196 break;
1197
1198 unsigned NewOpc;
1199 switch (N->getOpcode()) {
1200 default: llvm_unreachable("Unexpected opcode!");
1201 case ISD::SHL: NewOpc = X86ISD::VSHLV; break;
1202 case ISD::SRA: NewOpc = X86ISD::VSRAV; break;
1203 case ISD::SRL: NewOpc = X86ISD::VSRLV; break;
1204 }
1205 SDValue Res = CurDAG->getNode(NewOpc, SDLoc(N), N->getValueType(0),
1206 N->getOperand(0), N->getOperand(1));
1207 --I;
1208 CurDAG->ReplaceAllUsesOfValueWith(SDValue(N, 0), Res);
1209 ++I;
1210 MadeChange = true;
1211 continue;
1212 }
1213 case ISD::ANY_EXTEND:
1214 case ISD::ANY_EXTEND_VECTOR_INREG: {
1215 // Replace vector any extend with the zero extend equivalents so we don't
1216 // need 2 sets of patterns. Ignore vXi1 extensions.
1217 if (!N->getValueType(0).isVector())
1218 break;
1219
1220 unsigned NewOpc;
1221 if (N->getOperand(0).getScalarValueSizeInBits() == 1) {
1222 assert(N->getOpcode() == ISD::ANY_EXTEND &&
1223 "Unexpected opcode for mask vector!");
1224 NewOpc = ISD::SIGN_EXTEND;
1225 } else {
1226 NewOpc = N->getOpcode() == ISD::ANY_EXTEND
1227 ? ISD::ZERO_EXTEND
1228 : ISD::ZERO_EXTEND_VECTOR_INREG;
1229 }
1230
1231 SDValue Res = CurDAG->getNode(NewOpc, SDLoc(N), N->getValueType(0),
1232 N->getOperand(0));
1233 --I;
1234 CurDAG->ReplaceAllUsesOfValueWith(SDValue(N, 0), Res);
1235 ++I;
1236 MadeChange = true;
1237 continue;
1238 }
1239 case ISD::FCEIL:
1240 case ISD::STRICT_FCEIL:
1241 case ISD::FFLOOR:
1242 case ISD::STRICT_FFLOOR:
1243 case ISD::FTRUNC:
1244 case ISD::STRICT_FTRUNC:
1245 case ISD::FROUNDEVEN:
1246 case ISD::STRICT_FROUNDEVEN:
1247 case ISD::FNEARBYINT:
1248 case ISD::STRICT_FNEARBYINT:
1249 case ISD::FRINT:
1250 case ISD::STRICT_FRINT: {
1251 // Replace fp rounding with their X86 specific equivalent so we don't
1252 // need 2 sets of patterns.
1253 unsigned Imm;
1254 switch (N->getOpcode()) {
1255 default: llvm_unreachable("Unexpected opcode!");
1256 case ISD::STRICT_FCEIL:
1257 case ISD::FCEIL: Imm = 0xA; break;
1258 case ISD::STRICT_FFLOOR:
1259 case ISD::FFLOOR: Imm = 0x9; break;
1260 case ISD::STRICT_FTRUNC:
1261 case ISD::FTRUNC: Imm = 0xB; break;
1262 case ISD::STRICT_FROUNDEVEN:
1263 case ISD::FROUNDEVEN: Imm = 0x8; break;
1264 case ISD::STRICT_FNEARBYINT:
1265 case ISD::FNEARBYINT: Imm = 0xC; break;
1266 case ISD::STRICT_FRINT:
1267 case ISD::FRINT: Imm = 0x4; break;
1268 }
1269 SDLoc dl(N);
1270 bool IsStrict = N->isStrictFPOpcode();
1271 SDValue Res;
1272 if (IsStrict)
1273 Res = CurDAG->getNode(X86ISD::STRICT_VRNDSCALE, dl,
1274 {N->getValueType(0), MVT::Other},
1275 {N->getOperand(0), N->getOperand(1),
1276 CurDAG->getTargetConstant(Imm, dl, MVT::i32)});
1277 else
1278 Res = CurDAG->getNode(X86ISD::VRNDSCALE, dl, N->getValueType(0),
1279 N->getOperand(0),
1280 CurDAG->getTargetConstant(Imm, dl, MVT::i32));
1281 --I;
1282 CurDAG->ReplaceAllUsesWith(N, Res.getNode());
1283 ++I;
1284 MadeChange = true;
1285 continue;
1286 }
1287 case X86ISD::FANDN:
1288 case X86ISD::FAND:
1289 case X86ISD::FOR:
1290 case X86ISD::FXOR: {
1291 // Widen scalar fp logic ops to vector to reduce isel patterns.
1292 // FIXME: Can we do this during lowering/combine?
1293 MVT VT = N->getSimpleValueType(0);
1294 if (VT.isVector() || VT == MVT::f128)
1295 break;
1296
1297 MVT VecVT = VT == MVT::f64 ? MVT::v2f64
1298 : VT == MVT::f32 ? MVT::v4f32
1299 : MVT::v8f16;
1300
1301 SDLoc dl(N);
1302 SDValue Op0 = CurDAG->getNode(ISD::SCALAR_TO_VECTOR, dl, VecVT,
1303 N->getOperand(0));
1304 SDValue Op1 = CurDAG->getNode(ISD::SCALAR_TO_VECTOR, dl, VecVT,
1305 N->getOperand(1));
1306
1307 SDValue Res;
1308 if (Subtarget->hasSSE2()) {
1309 EVT IntVT = EVT(VecVT).changeVectorElementTypeToInteger();
1310 Op0 = CurDAG->getNode(ISD::BITCAST, dl, IntVT, Op0);
1311 Op1 = CurDAG->getNode(ISD::BITCAST, dl, IntVT, Op1);
1312 unsigned Opc;
1313 switch (N->getOpcode()) {
1314 default: llvm_unreachable("Unexpected opcode!");
1315 case X86ISD::FANDN: Opc = X86ISD::ANDNP; break;
1316 case X86ISD::FAND: Opc = ISD::AND; break;
1317 case X86ISD::FOR: Opc = ISD::OR; break;
1318 case X86ISD::FXOR: Opc = ISD::XOR; break;
1319 }
1320 Res = CurDAG->getNode(Opc, dl, IntVT, Op0, Op1);
1321 Res = CurDAG->getNode(ISD::BITCAST, dl, VecVT, Res);
1322 } else {
1323 Res = CurDAG->getNode(N->getOpcode(), dl, VecVT, Op0, Op1);
1324 }
1325 Res = CurDAG->getNode(ISD::EXTRACT_VECTOR_ELT, dl, VT, Res,
1326 CurDAG->getIntPtrConstant(0, dl));
1327 --I;
1328 CurDAG->ReplaceAllUsesOfValueWith(SDValue(N, 0), Res);
1329 ++I;
1330 MadeChange = true;
1331 continue;
1332 }
1333 }
1334
1335 if (OptLevel != CodeGenOptLevel::None &&
1336 // Only do this when the target can fold the load into the call or
1337 // jmp.
1338 !Subtarget->useIndirectThunkCalls() &&
1339 ((N->getOpcode() == X86ISD::CALL && !Subtarget->slowTwoMemOps()) ||
1340 (N->getOpcode() == X86ISD::TC_RETURN &&
1341 (Subtarget->is64Bit() ||
1342 !getTargetMachine().isPositionIndependent())))) {
1343 /// Also try moving call address load from outside callseq_start to just
1344 /// before the call to allow it to be folded.
1345 ///
1346 /// [Load chain]
1347 /// ^
1348 /// |
1349 /// [Load]
1350 /// ^ ^
1351 /// | |
1352 /// / \--
1353 /// / |
1354 ///[CALLSEQ_START] |
1355 /// ^ |
1356 /// | |
1357 /// [LOAD/C2Reg] |
1358 /// | |
1359 /// \ /
1360 /// \ /
1361 /// [CALL]
1362 bool HasCallSeq = N->getOpcode() == X86ISD::CALL;
1363 SDValue Chain = N->getOperand(0);
1364 SDValue Load = N->getOperand(1);
1365 if (!isCalleeLoad(Load, Chain, HasCallSeq))
1366 continue;
1367 moveBelowOrigChain(CurDAG, Load, SDValue(N, 0), Chain);
1368 ++NumLoadMoved;
1369 MadeChange = true;
1370 continue;
1371 }
1372
1373 // Lower fpround and fpextend nodes that target the FP stack to be a store and
1374 // load to the stack. This is a gross hack. We would like to simply mark
1375 // these as being illegal, but when we do that, legalize produces these when
1376 // it expands calls, then expands these in the same legalize pass. We would
1377 // like dag combine to be able to hack on these between the call expansion
1378 // and the node legalization. As such this pass basically does "really
1379 // late" legalization of these inline with the X86 isel pass.
1380 // FIXME: This should only happen when not compiled with -O0.
1381 switch (N->getOpcode()) {
1382 default: continue;
1383 case ISD::FP_ROUND:
1384 case ISD::FP_EXTEND:
1385 {
1386 MVT SrcVT = N->getOperand(0).getSimpleValueType();
1387 MVT DstVT = N->getSimpleValueType(0);
1388
1389 // If any of the sources are vectors, no fp stack involved.
1390 if (SrcVT.isVector() || DstVT.isVector())
1391 continue;
1392
1393 // If the source and destination are SSE registers, then this is a legal
1394 // conversion that should not be lowered.
1395 const X86TargetLowering *X86Lowering =
1396 static_cast<const X86TargetLowering *>(TLI);
1397 bool SrcIsSSE = X86Lowering->isScalarFPTypeInSSEReg(SrcVT);
1398 bool DstIsSSE = X86Lowering->isScalarFPTypeInSSEReg(DstVT);
1399 if (SrcIsSSE && DstIsSSE)
1400 continue;
1401
1402 if (!SrcIsSSE && !DstIsSSE) {
1403 // If this is an FPStack extension, it is a noop.
1404 if (N->getOpcode() == ISD::FP_EXTEND)
1405 continue;
1406 // If this is a value-preserving FPStack truncation, it is a noop.
1407 if (N->getConstantOperandVal(1))
1408 continue;
1409 }
1410
1411 // Here we could have an FP stack truncation or an FPStack <-> SSE convert.
1412 // FPStack has extload and truncstore. SSE can fold direct loads into other
1413 // operations. Based on this, decide what we want to do.
1414 MVT MemVT = (N->getOpcode() == ISD::FP_ROUND) ? DstVT : SrcVT;
1415 SDValue MemTmp = CurDAG->CreateStackTemporary(MemVT);
1416 int SPFI = cast<FrameIndexSDNode>(MemTmp)->getIndex();
1417 MachinePointerInfo MPI =
1418 MachinePointerInfo::getFixedStack(CurDAG->getMachineFunction(), SPFI);
1419 SDLoc dl(N);
1420
1421 // FIXME: optimize the case where the src/dest is a load or store?
1422
1423 SDValue Store = CurDAG->getTruncStore(
1424 CurDAG->getEntryNode(), dl, N->getOperand(0), MemTmp, MPI, MemVT);
1425 SDValue Result = CurDAG->getExtLoad(ISD::EXTLOAD, dl, DstVT, Store,
1426 MemTmp, MPI, MemVT);
1427
1428 // We're about to replace all uses of the FP_ROUND/FP_EXTEND with the
1429 // extload we created. This will cause general havoc on the DAG because
1430 // anything below the conversion could be folded into other existing nodes.
1431 // To avoid invalidating 'I', back it up to the convert node.
1432 --I;
1433 CurDAG->ReplaceAllUsesOfValueWith(SDValue(N, 0), Result);
1434 break;
1435 }
1436
1437 // The sequence of events for lowering STRICT_FP versions of these nodes requires
1438 // dealing with the chain differently, as there is already a preexisting chain.
1439 case ISD::STRICT_FP_ROUND:
1440 case ISD::STRICT_FP_EXTEND:
1441 {
1442 MVT SrcVT = N->getOperand(1).getSimpleValueType();
1443 MVT DstVT = N->getSimpleValueType(0);
1444
1445 // If any of the sources are vectors, no fp stack involved.
1446 if (SrcVT.isVector() || DstVT.isVector())
1447 continue;
1448
1449 // If the source and destination are SSE registers, then this is a legal
1450 // conversion that should not be lowered.
1451 const X86TargetLowering *X86Lowering =
1452 static_cast<const X86TargetLowering *>(TLI);
1453 bool SrcIsSSE = X86Lowering->isScalarFPTypeInSSEReg(SrcVT);
1454 bool DstIsSSE = X86Lowering->isScalarFPTypeInSSEReg(DstVT);
1455 if (SrcIsSSE && DstIsSSE)
1456 continue;
1457
1458 if (!SrcIsSSE && !DstIsSSE) {
1459 // If this is an FPStack extension, it is a noop.
1460 if (N->getOpcode() == ISD::STRICT_FP_EXTEND)
1461 continue;
1462 // If this is a value-preserving FPStack truncation, it is a noop.
1463 if (N->getConstantOperandVal(2))
1464 continue;
1465 }
1466
1467 // Here we could have an FP stack truncation or an FPStack <-> SSE convert.
1468 // FPStack has extload and truncstore. SSE can fold direct loads into other
1469 // operations. Based on this, decide what we want to do.
1470 MVT MemVT = (N->getOpcode() == ISD::STRICT_FP_ROUND) ? DstVT : SrcVT;
1471 SDValue MemTmp = CurDAG->CreateStackTemporary(MemVT);
1472 int SPFI = cast<FrameIndexSDNode>(MemTmp)->getIndex();
1473 MachinePointerInfo MPI =
1474 MachinePointerInfo::getFixedStack(CurDAG->getMachineFunction(), SPFI);
1475 SDLoc dl(N);
1476
1477 // FIXME: optimize the case where the src/dest is a load or store?
1478
1479 // Since the operation is StrictFP, use the preexisting chain.
1480 SDValue Store, Result;
1481 if (!SrcIsSSE) {
1482 SDVTList VTs = CurDAG->getVTList(MVT::Other);
1483 SDValue Ops[] = {N->getOperand(0), N->getOperand(1), MemTmp};
1484 Store = CurDAG->getMemIntrinsicNode(X86ISD::FST, dl, VTs, Ops, MemVT,
1485 MPI, /*Align*/ std::nullopt,
1486 MachineMemOperand::MOStore);
1487 if (N->getFlags().hasNoFPExcept()) {
1488 SDNodeFlags Flags = Store->getFlags();
1489 Flags.setNoFPExcept(true);
1490 Store->setFlags(Flags);
1491 }
1492 } else {
1493 assert(SrcVT == MemVT && "Unexpected VT!");
1494 Store = CurDAG->getStore(N->getOperand(0), dl, N->getOperand(1), MemTmp,
1495 MPI);
1496 }
1497
1498 if (!DstIsSSE) {
1499 SDVTList VTs = CurDAG->getVTList(DstVT, MVT::Other);
1500 SDValue Ops[] = {Store, MemTmp};
1501 Result = CurDAG->getMemIntrinsicNode(
1502 X86ISD::FLD, dl, VTs, Ops, MemVT, MPI,
1503 /*Align*/ std::nullopt, MachineMemOperand::MOLoad);
1504 if (N->getFlags().hasNoFPExcept()) {
1505 SDNodeFlags Flags = Result->getFlags();
1506 Flags.setNoFPExcept(true);
1507 Result->setFlags(Flags);
1508 }
1509 } else {
1510 assert(DstVT == MemVT && "Unexpected VT!");
1511 Result = CurDAG->getLoad(DstVT, dl, Store, MemTmp, MPI);
1512 }
1513
1514 // We're about to replace all uses of the FP_ROUND/FP_EXTEND with the
1515 // extload we created. This will cause general havoc on the DAG because
1516 // anything below the conversion could be folded into other existing nodes.
1517 // To avoid invalidating 'I', back it up to the convert node.
1518 --I;
1519 CurDAG->ReplaceAllUsesWith(N, Result.getNode());
1520 break;
1521 }
1522 }
1523
1524
1525 // Now that we did that, the node is dead. Increment the iterator to the
1526 // next node to process, then delete N.
1527 ++I;
1528 MadeChange = true;
1529 }
1530
1531 // Remove any dead nodes that may have been left behind.
1532 if (MadeChange)
1533 CurDAG->RemoveDeadNodes();
1534}
1535
1536// Look for a redundant movzx/movsx that can occur after an 8-bit divrem.
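// The 8-bit divrem lowering copies AH out with a MOVZX/MOVSX32rr8_NOREX and an
// EXTRACT_SUBREG of sub_8bit, so a second 8->32/64 extend of that value is
// redundant; reuse the existing extend (adding only a 32->64 step for
// MOVSX64rr8).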
1537bool X86DAGToDAGISel::tryOptimizeRem8Extend(SDNode *N) {
1538 unsigned Opc = N->getMachineOpcode();
1539 if (Opc != X86::MOVZX32rr8 && Opc != X86::MOVSX32rr8 &&
1540 Opc != X86::MOVSX64rr8)
1541 return false;
1542
1543 SDValue N0 = N->getOperand(0);
1544
1545 // We need to be extracting the low 8 bits of an extend.
1546 if (!N0.isMachineOpcode() ||
1547 N0.getMachineOpcode() != TargetOpcode::EXTRACT_SUBREG ||
1548 N0.getConstantOperandVal(1) != X86::sub_8bit)
1549 return false;
1550
1551 // We're looking for either a movsx or movzx to match the original opcode.
1552 unsigned ExpectedOpc = Opc == X86::MOVZX32rr8 ? X86::MOVZX32rr8_NOREX
1553 : X86::MOVSX32rr8_NOREX;
1554 SDValue N00 = N0.getOperand(0);
1555 if (!N00.isMachineOpcode() || N00.getMachineOpcode() != ExpectedOpc)
1556 return false;
1557
1558 if (Opc == X86::MOVSX64rr8) {
1559 // If we had a sign extend from 8 to 64 bits, we still need to go from 32
1560 // to 64.
1561 MachineSDNode *Extend = CurDAG->getMachineNode(X86::MOVSX64rr32, SDLoc(N),
1562 MVT::i64, N00);
1563 ReplaceUses(N, Extend);
1564 } else {
1565 // Ok we can drop this extend and just use the original extend.
1566 ReplaceUses(N, N00.getNode());
1567 }
1568
1569 return true;
1570}
1571
1572void X86DAGToDAGISel::PostprocessISelDAG() {
1573 // Skip peepholes at -O0.
1574 if (TM.getOptLevel() == CodeGenOptLevel::None)
1575 return;
1576
1577 SelectionDAG::allnodes_iterator Position = CurDAG->allnodes_end();
1578
1579 bool MadeChange = false;
1580 while (Position != CurDAG->allnodes_begin()) {
1581 SDNode *N = &*--Position;
1582 // Skip dead nodes and any non-machine opcodes.
1583 if (N->use_empty() || !N->isMachineOpcode())
1584 continue;
1585
1586 if (tryOptimizeRem8Extend(N)) {
1587 MadeChange = true;
1588 continue;
1589 }
1590
1591 unsigned Opc = N->getMachineOpcode();
1592 switch (Opc) {
1593 default:
1594 continue;
1595 // ANDrr/rm + TESTrr+ -> TESTrr/TESTmr
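 // TEST computes flags from op0 & op1, so when both TEST operands are the
 // same AND result and the AND's own flag output is unused, the flags can be
 // produced directly from the AND's inputs (using the mr form when the AND
 // folded a load).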
1596 case X86::TEST8rr:
1597 case X86::TEST16rr:
1598 case X86::TEST32rr:
1599 case X86::TEST64rr:
1600 // ANDrr/rm + CTESTrr -> CTESTrr/CTESTmr
1601 case X86::CTEST8rr:
1602 case X86::CTEST16rr:
1603 case X86::CTEST32rr:
1604 case X86::CTEST64rr: {
1605 auto &Op0 = N->getOperand(0);
1606 if (Op0 != N->getOperand(1) || !Op0->hasNUsesOfValue(2, Op0.getResNo()) ||
1607 !Op0.isMachineOpcode())
1608 continue;
1609 SDValue And = N->getOperand(0);
1610#define CASE_ND(OP) \
1611 case X86::OP: \
1612 case X86::OP##_ND:
1613 switch (And.getMachineOpcode()) {
1614 default:
1615 continue;
1616 CASE_ND(AND8rr)
1617 CASE_ND(AND16rr)
1618 CASE_ND(AND32rr)
1619 CASE_ND(AND64rr) {
1620 if (And->hasAnyUseOfValue(1))
1621 continue;
1622 SmallVector<SDValue> Ops(N->op_values());
1623 Ops[0] = And.getOperand(0);
1624 Ops[1] = And.getOperand(1);
1625 MachineSDNode *Test =
1626 CurDAG->getMachineNode(Opc, SDLoc(N), MVT::i32, Ops);
1627 ReplaceUses(N, Test);
1628 MadeChange = true;
1629 continue;
1630 }
1631 CASE_ND(AND8rm)
1632 CASE_ND(AND16rm)
1633 CASE_ND(AND32rm)
1634 CASE_ND(AND64rm) {
1635 if (And->hasAnyUseOfValue(1))
1636 continue;
1637 unsigned NewOpc;
1638 bool IsCTESTCC = X86::isCTESTCC(Opc);
1639#define FROM_TO(A, B) \
1640 CASE_ND(A) NewOpc = IsCTESTCC ? X86::C##B : X86::B; \
1641 break;
1642 switch (And.getMachineOpcode()) {
1643 FROM_TO(AND8rm, TEST8mr);
1644 FROM_TO(AND16rm, TEST16mr);
1645 FROM_TO(AND32rm, TEST32mr);
1646 FROM_TO(AND64rm, TEST64mr);
1647 }
1648#undef FROM_TO
1649#undef CASE_ND
1650       // Need to swap the memory and register operands.
1651 SmallVector<SDValue> Ops = {And.getOperand(1), And.getOperand(2),
1652 And.getOperand(3), And.getOperand(4),
1653 And.getOperand(5), And.getOperand(0)};
1654 // CC, Cflags.
1655 if (IsCTESTCC) {
1656 Ops.push_back(N->getOperand(2));
1657 Ops.push_back(N->getOperand(3));
1658 }
1659 // Chain of memory load
1660 Ops.push_back(And.getOperand(6));
1661 // Glue
1662 if (IsCTESTCC)
1663 Ops.push_back(N->getOperand(4));
1664
1665 MachineSDNode *Test = CurDAG->getMachineNode(
1666 NewOpc, SDLoc(N), MVT::i32, MVT::Other, Ops);
1667 CurDAG->setNodeMemRefs(
1668 Test, cast<MachineSDNode>(And.getNode())->memoperands());
1669 ReplaceUses(And.getValue(2), SDValue(Test, 1));
1670 ReplaceUses(SDValue(N, 0), SDValue(Test, 0));
1671 MadeChange = true;
1672 continue;
1673 }
1674 }
1675 }
1676 // Look for a KAND+KORTEST and turn it into KTEST if only the zero flag is
1677 // used. We're doing this late so we can prefer to fold the AND into masked
1678 // comparisons. Doing that can be better for the live range of the mask
1679 // register.
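    // For example, when only ZF is consumed,
    //   kandw %k2, %k1, %k0 ; kortestw %k0, %k0
    // can instead be selected as "ktestw %k2, %k1".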
1680 case X86::KORTESTBkk:
1681 case X86::KORTESTWkk:
1682 case X86::KORTESTDkk:
1683 case X86::KORTESTQkk: {
1684 SDValue Op0 = N->getOperand(0);
1685 if (Op0 != N->getOperand(1) || !N->isOnlyUserOf(Op0.getNode()) ||
1686 !Op0.isMachineOpcode() || !onlyUsesZeroFlag(SDValue(N, 0)))
1687 continue;
1688#define CASE(A) \
1689 case X86::A: \
1690 break;
1691 switch (Op0.getMachineOpcode()) {
1692 default:
1693 continue;
1694 CASE(KANDBkk)
1695 CASE(KANDWkk)
1696 CASE(KANDDkk)
1697 CASE(KANDQkk)
1698 }
1699 unsigned NewOpc;
1700#define FROM_TO(A, B) \
1701 case X86::A: \
1702 NewOpc = X86::B; \
1703 break;
1704 switch (Opc) {
1705 FROM_TO(KORTESTBkk, KTESTBkk)
1706 FROM_TO(KORTESTWkk, KTESTWkk)
1707 FROM_TO(KORTESTDkk, KTESTDkk)
1708 FROM_TO(KORTESTQkk, KTESTQkk)
1709 }
1710 // KANDW is legal with AVX512F, but KTESTW requires AVX512DQ. The other
1711 // KAND instructions and KTEST use the same ISA feature.
1712 if (NewOpc == X86::KTESTWkk && !Subtarget->hasDQI())
1713 continue;
1714#undef FROM_TO
1715 MachineSDNode *KTest = CurDAG->getMachineNode(
1716 NewOpc, SDLoc(N), MVT::i32, Op0.getOperand(0), Op0.getOperand(1));
1717 ReplaceUses(N, KTest);
1718 MadeChange = true;
1719 continue;
1720 }
1721     // Attempt to remove vector moves that were inserted to zero upper bits.
1722 case TargetOpcode::SUBREG_TO_REG: {
1723 unsigned SubRegIdx = N->getConstantOperandVal(2);
1724 if (SubRegIdx != X86::sub_xmm && SubRegIdx != X86::sub_ymm)
1725 continue;
1726
1727 SDValue Move = N->getOperand(1);
1728 if (!Move.isMachineOpcode())
1729 continue;
1730
1731       // Make sure it's one of the move opcodes we recognize.
1732 switch (Move.getMachineOpcode()) {
1733 default:
1734 continue;
1735 CASE(VMOVAPDrr) CASE(VMOVUPDrr)
1736 CASE(VMOVAPSrr) CASE(VMOVUPSrr)
1737 CASE(VMOVDQArr) CASE(VMOVDQUrr)
1738 CASE(VMOVAPDYrr) CASE(VMOVUPDYrr)
1739 CASE(VMOVAPSYrr) CASE(VMOVUPSYrr)
1740 CASE(VMOVDQAYrr) CASE(VMOVDQUYrr)
1741 CASE(VMOVAPDZ128rr) CASE(VMOVUPDZ128rr)
1742 CASE(VMOVAPSZ128rr) CASE(VMOVUPSZ128rr)
1743 CASE(VMOVDQA32Z128rr) CASE(VMOVDQU32Z128rr)
1744 CASE(VMOVDQA64Z128rr) CASE(VMOVDQU64Z128rr)
1745 CASE(VMOVAPDZ256rr) CASE(VMOVUPDZ256rr)
1746 CASE(VMOVAPSZ256rr) CASE(VMOVUPSZ256rr)
1747 CASE(VMOVDQA32Z256rr) CASE(VMOVDQU32Z256rr)
1748 CASE(VMOVDQA64Z256rr) CASE(VMOVDQU64Z256rr)
1749 }
1750#undef CASE
1751
1752 SDValue In = Move.getOperand(0);
1753 if (!In.isMachineOpcode() ||
1754 In.getMachineOpcode() <= TargetOpcode::GENERIC_OP_END)
1755 continue;
1756
1757       // Make sure the instruction has a VEX, XOP, or EVEX prefix. This excludes
1758       // the SHA instructions, which use a legacy encoding.
1759 uint64_t TSFlags = getInstrInfo()->get(In.getMachineOpcode()).TSFlags;
1760 if ((TSFlags & X86II::EncodingMask) != X86II::VEX &&
1761 (TSFlags & X86II::EncodingMask) != X86II::EVEX &&
1762 (TSFlags & X86II::EncodingMask) != X86II::XOP)
1763 continue;
1764
1765       // The producing instruction is another vector instruction, so we can drop
1766       // the move.
1767 CurDAG->UpdateNodeOperands(N, N->getOperand(0), In, N->getOperand(2));
1768 MadeChange = true;
1769 }
1770 }
1771 }
1772
1773 if (MadeChange)
1774 CurDAG->RemoveDeadNodes();
1775}
1776
1777
1778/// Emit any code that needs to be executed only in the main function.
1779void X86DAGToDAGISel::emitSpecialCodeForMain() {
1780 if (Subtarget->isTargetCygMing()) {
1781 TargetLowering::ArgListTy Args;
1782 auto &DL = CurDAG->getDataLayout();
1783
1784 TargetLowering::CallLoweringInfo CLI(*CurDAG);
1785 CLI.setChain(CurDAG->getRoot())
1786 .setCallee(CallingConv::C, Type::getVoidTy(*CurDAG->getContext()),
1787 CurDAG->getExternalSymbol("__main", TLI->getPointerTy(DL)),
1788 std::move(Args));
1789 const TargetLowering &TLI = CurDAG->getTargetLoweringInfo();
1790 std::pair<SDValue, SDValue> Result = TLI.LowerCallTo(CLI);
1791 CurDAG->setRoot(Result.second);
1792 }
1793}
1794
1795void X86DAGToDAGISel::emitFunctionEntryCode() {
1796 // If this is main, emit special code for main.
1797 const Function &F = MF->getFunction();
1798 if (F.hasExternalLinkage() && F.getName() == "main")
1799 emitSpecialCodeForMain();
1800}
1801
1802static bool isDispSafeForFrameIndexOrRegBase(int64_t Val) {
1803 // We can run into an issue where a frame index or a register base
1804 // includes a displacement that, when added to the explicit displacement,
1805 // will overflow the displacement field. Assuming that the
1806 // displacement fits into a 31-bit integer (which is only slightly more
1807 // aggressive than the current fundamental assumption that it fits into
1808 // a 32-bit integer), a 31-bit disp should always be safe.
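  // For example, an explicit displacement of 0x7fffff00 is itself a valid
  // 32-bit immediate, but once a frame object's own offset (say 0x200) is
  // added during frame lowering the sum no longer fits in the signed 32-bit
  // disp field; limiting the explicit displacement to 31 bits leaves room for
  // such adjustments.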
1809 return isInt<31>(Val);
1810}
1811
1812bool X86DAGToDAGISel::foldOffsetIntoAddress(uint64_t Offset,
1813 X86ISelAddressMode &AM) {
1814 // We may have already matched a displacement and the caller just added the
1815 // symbolic displacement. So we still need to do the checks even if Offset
1816 // is zero.
1817
1818 int64_t Val = AM.Disp + Offset;
1819
1820 // Cannot combine ExternalSymbol displacements with integer offsets.
1821 if (Val != 0 && (AM.ES || AM.MCSym))
1822 return true;
1823
1824 CodeModel::Model M = TM.getCodeModel();
1825 if (Subtarget->is64Bit()) {
1826 if (Val != 0 &&
1827         !X86::isOffsetSuitableForCodeModel(Val, M,
1828                                            AM.hasSymbolicDisplacement()))
1829 return true;
1830 // In addition to the checks required for a register base, check that
1831 // we do not try to use an unsafe Disp with a frame index.
1832 if (AM.BaseType == X86ISelAddressMode::FrameIndexBase &&
1833         !isDispSafeForFrameIndexOrRegBase(Val))
1834       return true;
1835 // In ILP32 (x32) mode, pointers are 32 bits and need to be zero-extended to
1836 // 64 bits. Instructions with 32-bit register addresses perform this zero
1837 // extension for us and we can safely ignore the high bits of Offset.
1838 // Instructions with only a 32-bit immediate address do not, though: they
1839     // sign extend instead. This means only the low 2GB of the address space is
1840     // directly addressable; we need indirect addressing for the high 2GB of
1841     // address space.
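    // For example, the absolute address 0x80000010 fits in 32 bits, but used
    // as a bare disp32 it would be sign-extended to 0xffffffff80000010, so it
    // can only be folded when a (zero-extending) base or index register is
    // present.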
1842 // TODO: Some of the earlier checks may be relaxed for ILP32 mode as the
1843 // implicit zero extension of instructions would cover up any problem.
1844 // However, we have asserts elsewhere that get triggered if we do, so keep
1845 // the checks for now.
1846 // TODO: We would actually be able to accept these, as well as the same
1847 // addresses in LP64 mode, by adding the EIZ pseudo-register as an operand
1848 // to get an address size override to be emitted. However, this
1849 // pseudo-register is not part of any register class and therefore causes
1850 // MIR verification to fail.
1851 if (Subtarget->isTarget64BitILP32() &&
1852 !isDispSafeForFrameIndexOrRegBase((uint32_t)Val) &&
1853 !AM.hasBaseOrIndexReg())
1854 return true;
1855 } else if (AM.hasBaseOrIndexReg() && !isDispSafeForFrameIndexOrRegBase(Val))
1856 // For 32-bit X86, make sure the displacement still isn't close to the
1857 // expressible limit.
1858 return true;
1859 AM.Disp = Val;
1860 return false;
1861}
1862
1863bool X86DAGToDAGISel::matchLoadInAddress(LoadSDNode *N, X86ISelAddressMode &AM,
1864 bool AllowSegmentRegForX32) {
1865 SDValue Address = N->getOperand(1);
1866
1867 // load gs:0 -> GS segment register.
1868 // load fs:0 -> FS segment register.
1869 //
1870 // This optimization is generally valid because the GNU TLS model defines that
1871 // gs:0 (or fs:0 on X86-64) contains its own address. However, for X86-64 mode
1872 // with 32-bit registers, as we get in ILP32 mode, those registers are first
1873   // zero-extended to 64 bits and then added to the base address, which gives
1874 // unwanted results when the register holds a negative value.
1875 // For more information see http://people.redhat.com/drepper/tls.pdf
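  // For example, a load of a null pointer in address space 256 (GS) or 257
  // (FS) can be selected directly as a segment-relative load of offset 0,
  // e.g. "movq %fs:0, %rax" on x86-64.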
1876 if (isNullConstant(Address) && AM.Segment.getNode() == nullptr &&
1877 !IndirectTlsSegRefs &&
1878 (Subtarget->isTargetGlibc() || Subtarget->isTargetMusl() ||
1879 Subtarget->isTargetAndroid() || Subtarget->isTargetFuchsia())) {
1880 if (Subtarget->isTarget64BitILP32() && !AllowSegmentRegForX32)
1881 return true;
1882 switch (N->getPointerInfo().getAddrSpace()) {
1883 case X86AS::GS:
1884 AM.Segment = CurDAG->getRegister(X86::GS, MVT::i16);
1885 return false;
1886 case X86AS::FS:
1887 AM.Segment = CurDAG->getRegister(X86::FS, MVT::i16);
1888 return false;
1889 // Address space X86AS::SS is not handled here, because it is not used to
1890 // address TLS areas.
1891 }
1892 }
1893
1894 return true;
1895}
1896
1897/// Try to match X86ISD::Wrapper and X86ISD::WrapperRIP nodes into an addressing
1898/// mode. These wrap things that will resolve down into a symbol reference.
1899/// If no match is possible, this returns true, otherwise it returns false.
1900bool X86DAGToDAGISel::matchWrapper(SDValue N, X86ISelAddressMode &AM) {
1901 // If the addressing mode already has a symbol as the displacement, we can
1902 // never match another symbol.
1903 if (AM.hasSymbolicDisplacement())
1904 return true;
1905
1906 bool IsRIPRelTLS = false;
1907 bool IsRIPRel = N.getOpcode() == X86ISD::WrapperRIP;
1908 if (IsRIPRel) {
1909 SDValue Val = N.getOperand(0);
1910     if (Val.getOpcode() == ISD::TargetGlobalTLSAddress)
1911       IsRIPRelTLS = true;
1912 }
1913
1914 // We can't use an addressing mode in the 64-bit large code model.
1915 // Global TLS addressing is an exception. In the medium code model,
1916   // we can use such a mode when RIP wrappers are present.
1917 // That signifies access to globals that are known to be "near",
1918 // such as the GOT itself.
1919 CodeModel::Model M = TM.getCodeModel();
1920 if (Subtarget->is64Bit() && M == CodeModel::Large && !IsRIPRelTLS)
1921 return true;
1922
1923 // Base and index reg must be 0 in order to use %rip as base.
1924 if (IsRIPRel && AM.hasBaseOrIndexReg())
1925 return true;
1926
1927 // Make a local copy in case we can't do this fold.
1928 X86ISelAddressMode Backup = AM;
1929
1930 int64_t Offset = 0;
1931 SDValue N0 = N.getOperand(0);
1932 if (auto *G = dyn_cast<GlobalAddressSDNode>(N0)) {
1933 AM.GV = G->getGlobal();
1934 AM.SymbolFlags = G->getTargetFlags();
1935 Offset = G->getOffset();
1936 } else if (auto *CP = dyn_cast<ConstantPoolSDNode>(N0)) {
1937 AM.CP = CP->getConstVal();
1938 AM.Alignment = CP->getAlign();
1939 AM.SymbolFlags = CP->getTargetFlags();
1940 Offset = CP->getOffset();
1941 } else if (auto *S = dyn_cast<ExternalSymbolSDNode>(N0)) {
1942 AM.ES = S->getSymbol();
1943 AM.SymbolFlags = S->getTargetFlags();
1944 } else if (auto *S = dyn_cast<MCSymbolSDNode>(N0)) {
1945 AM.MCSym = S->getMCSymbol();
1946 } else if (auto *J = dyn_cast<JumpTableSDNode>(N0)) {
1947 AM.JT = J->getIndex();
1948 AM.SymbolFlags = J->getTargetFlags();
1949 } else if (auto *BA = dyn_cast<BlockAddressSDNode>(N0)) {
1950 AM.BlockAddr = BA->getBlockAddress();
1951 AM.SymbolFlags = BA->getTargetFlags();
1952 Offset = BA->getOffset();
1953 } else
1954 llvm_unreachable("Unhandled symbol reference node.");
1955
1956 // Can't use an addressing mode with large globals.
1957 if (Subtarget->is64Bit() && !IsRIPRel && AM.GV &&
1958 TM.isLargeGlobalValue(AM.GV)) {
1959 AM = Backup;
1960 return true;
1961 }
1962
1963 if (foldOffsetIntoAddress(Offset, AM)) {
1964 AM = Backup;
1965 return true;
1966 }
1967
1968 if (IsRIPRel)
1969 AM.setBaseReg(CurDAG->getRegister(X86::RIP, MVT::i64));
1970
1971 // Commit the changes now that we know this fold is safe.
1972 return false;
1973}
1974
1975/// Add the specified node to the specified addressing mode, returning true if
1976/// it cannot be done. This just pattern matches for the addressing mode.
1977bool X86DAGToDAGISel::matchAddress(SDValue N, X86ISelAddressMode &AM) {
1978 if (matchAddressRecursively(N, AM, 0))
1979 return true;
1980
1981 // Post-processing: Make a second attempt to fold a load, if we now know
1982 // that there will not be any other register. This is only performed for
1983 // 64-bit ILP32 mode since 32-bit mode and 64-bit LP64 mode will have folded
1984 // any foldable load the first time.
1985 if (Subtarget->isTarget64BitILP32() &&
1986 AM.BaseType == X86ISelAddressMode::RegBase &&
1987 AM.Base_Reg.getNode() != nullptr && AM.IndexReg.getNode() == nullptr) {
1988 SDValue Save_Base_Reg = AM.Base_Reg;
1989 if (auto *LoadN = dyn_cast<LoadSDNode>(Save_Base_Reg)) {
1990 AM.Base_Reg = SDValue();
1991 if (matchLoadInAddress(LoadN, AM, /*AllowSegmentRegForX32=*/true))
1992 AM.Base_Reg = Save_Base_Reg;
1993 }
1994 }
1995
1996 // Post-processing: Convert lea(,%reg,2) to lea(%reg,%reg), which has
1997 // a smaller encoding and avoids a scaled-index.
1998 if (AM.Scale == 2 &&
1999 AM.BaseType == X86ISelAddressMode::RegBase &&
2000 AM.Base_Reg.getNode() == nullptr) {
2001 AM.Base_Reg = AM.IndexReg;
2002 AM.Scale = 1;
2003 }
2004
2005 // Post-processing: Convert foo to foo(%rip), even in non-PIC mode,
2006 // because it has a smaller encoding.
2007 if (TM.getCodeModel() != CodeModel::Large &&
2008 (!AM.GV || !TM.isLargeGlobalValue(AM.GV)) && Subtarget->is64Bit() &&
2009 AM.Scale == 1 && AM.BaseType == X86ISelAddressMode::RegBase &&
2010 AM.Base_Reg.getNode() == nullptr && AM.IndexReg.getNode() == nullptr &&
2011 AM.SymbolFlags == X86II::MO_NO_FLAG && AM.hasSymbolicDisplacement()) {
2012 // However, when GV is a local function symbol and in the same section as
2013 // the current instruction, and AM.Disp is negative and near INT32_MIN,
2014 // referencing GV+Disp generates a relocation referencing the section symbol
2015 // with an even smaller offset, which might underflow. We should bail out if
2016 // the negative offset is too close to INT32_MIN. Actually, we are more
2017 // conservative here, using a smaller magic number also used by
2018 // isOffsetSuitableForCodeModel.
2019 if (isa_and_nonnull<Function>(AM.GV) && AM.Disp < -16 * 1024 * 1024)
2020 return true;
2021
2022 AM.Base_Reg = CurDAG->getRegister(X86::RIP, MVT::i64);
2023 }
2024
2025 return false;
2026}
2027
2028bool X86DAGToDAGISel::matchAdd(SDValue &N, X86ISelAddressMode &AM,
2029 unsigned Depth) {
2030 // Add an artificial use to this node so that we can keep track of
2031 // it if it gets CSE'd with a different node.
2032 HandleSDNode Handle(N);
2033
2034 X86ISelAddressMode Backup = AM;
2035 if (!matchAddressRecursively(N.getOperand(0), AM, Depth+1) &&
2036 !matchAddressRecursively(Handle.getValue().getOperand(1), AM, Depth+1))
2037 return false;
2038 AM = Backup;
2039
2040 // Try again after commutating the operands.
2041 if (!matchAddressRecursively(Handle.getValue().getOperand(1), AM,
2042 Depth + 1) &&
2043 !matchAddressRecursively(Handle.getValue().getOperand(0), AM, Depth + 1))
2044 return false;
2045 AM = Backup;
2046
2047 // If we couldn't fold both operands into the address at the same time,
2048 // see if we can just put each operand into a register and fold at least
2049 // the add.
2050 if (AM.BaseType == X86ISelAddressMode::RegBase &&
2051 !AM.Base_Reg.getNode() &&
2052 !AM.IndexReg.getNode()) {
2053 N = Handle.getValue();
2054 AM.Base_Reg = N.getOperand(0);
2055 AM.IndexReg = N.getOperand(1);
2056 AM.Scale = 1;
2057 return false;
2058 }
2059 N = Handle.getValue();
2060 return true;
2061}
2062
2063// Insert a node into the DAG at least before the Pos node's position. This
2064// will reposition the node as needed, and will assign it a node ID that is <=
2065// the Pos node's ID. Note that this does *not* preserve the uniqueness of node
2066// IDs! The selection DAG must no longer depend on their uniqueness when this
2067// is used.
2068static void insertDAGNode(SelectionDAG &DAG, SDValue Pos, SDValue N) {
2069 if (N->getNodeId() == -1 ||
2070       (SelectionDAGISel::getUninvalidatedNodeId(N.getNode()) >
2071        SelectionDAGISel::getUninvalidatedNodeId(Pos.getNode()))) {
2072     DAG.RepositionNode(Pos->getIterator(), N.getNode());
2073 // Mark Node as invalid for pruning as after this it may be a successor to a
2074 // selected node but otherwise be in the same position of Pos.
2075 // Conservatively mark it with the same -abs(Id) to assure node id
2076 // invariant is preserved.
2077 N->setNodeId(Pos->getNodeId());
2078     SelectionDAGISel::InvalidateNodeId(N.getNode());
2079   }
2080}
2081
2082 // Transform "(X >> (8-C1)) & (0xff << C1)" to "((X >> 8) & 0xff) << C1" if
2083 // safe. This allows us to convert the shift and AND into an h-register
2084 // extract and a scaled index. Returns false if the simplification is
2085 // performed.
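// For example, with C1 == 2: "(X >> 6) & 0x3fc" becomes "((X >> 8) & 0xff) << 2",
// where "(X >> 8) & 0xff" maps onto an h-register extract and the final "<< 2"
// is absorbed into the addressing mode as a scale of 4.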
2086 static bool foldMaskAndShiftToExtract(SelectionDAG &DAG, SDValue N,
2087                                       uint64_t Mask,
2088 SDValue Shift, SDValue X,
2089 X86ISelAddressMode &AM) {
2090 if (Shift.getOpcode() != ISD::SRL ||
2091 !isa<ConstantSDNode>(Shift.getOperand(1)) ||
2092 !Shift.hasOneUse())
2093 return true;
2094
2095 int ScaleLog = 8 - Shift.getConstantOperandVal(1);
2096 if (ScaleLog <= 0 || ScaleLog >= 4 ||
2097 Mask != (0xffu << ScaleLog))
2098 return true;
2099
2100 MVT XVT = X.getSimpleValueType();
2101 MVT VT = N.getSimpleValueType();
2102 SDLoc DL(N);
2103 SDValue Eight = DAG.getConstant(8, DL, MVT::i8);
2104 SDValue NewMask = DAG.getConstant(0xff, DL, XVT);
2105 SDValue Srl = DAG.getNode(ISD::SRL, DL, XVT, X, Eight);
2106 SDValue And = DAG.getNode(ISD::AND, DL, XVT, Srl, NewMask);
2107 SDValue Ext = DAG.getZExtOrTrunc(And, DL, VT);
2108 SDValue ShlCount = DAG.getConstant(ScaleLog, DL, MVT::i8);
2109 SDValue Shl = DAG.getNode(ISD::SHL, DL, VT, Ext, ShlCount);
2110
2111 // Insert the new nodes into the topological ordering. We must do this in
2112 // a valid topological ordering as nothing is going to go back and re-sort
2113 // these nodes. We continually insert before 'N' in sequence as this is
2114 // essentially a pre-flattened and pre-sorted sequence of nodes. There is no
2115 // hierarchy left to express.
2116 insertDAGNode(DAG, N, Eight);
2117 insertDAGNode(DAG, N, NewMask);
2118 insertDAGNode(DAG, N, Srl);
2119 insertDAGNode(DAG, N, And);
2120 insertDAGNode(DAG, N, Ext);
2121 insertDAGNode(DAG, N, ShlCount);
2122 insertDAGNode(DAG, N, Shl);
2123 DAG.ReplaceAllUsesWith(N, Shl);
2124 DAG.RemoveDeadNode(N.getNode());
2125 AM.IndexReg = Ext;
2126 AM.Scale = (1 << ScaleLog);
2127 return false;
2128}
2129
2130// Transforms "(X << C1) & C2" to "(X & (C2>>C1)) << C1" if safe and if this
2131// allows us to fold the shift into this addressing mode. Returns false if the
2132// transform succeeded.
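// For example, "(X << 1) & 0x1e" becomes "(X & 0xf) << 1"; the new shift can
// then be matched as a scale of 2 while the narrower AND stays on the index.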
2133 static bool foldMaskedShiftToScaledMask(SelectionDAG &DAG, SDValue N,
2134                                         X86ISelAddressMode &AM) {
2135 SDValue Shift = N.getOperand(0);
2136
2137 // Use a signed mask so that shifting right will insert sign bits. These
2138 // bits will be removed when we shift the result left so it doesn't matter
2139 // what we use. This might allow a smaller immediate encoding.
2140 int64_t Mask = cast<ConstantSDNode>(N->getOperand(1))->getSExtValue();
2141
2142 // If we have an any_extend feeding the AND, look through it to see if there
2143 // is a shift behind it. But only if the AND doesn't use the extended bits.
2144 // FIXME: Generalize this to other ANY_EXTEND than i32 to i64?
2145 bool FoundAnyExtend = false;
2146 if (Shift.getOpcode() == ISD::ANY_EXTEND && Shift.hasOneUse() &&
2147 Shift.getOperand(0).getSimpleValueType() == MVT::i32 &&
2148 isUInt<32>(Mask)) {
2149 FoundAnyExtend = true;
2150 Shift = Shift.getOperand(0);
2151 }
2152
2153 if (Shift.getOpcode() != ISD::SHL ||
2154       !isa<ConstantSDNode>(Shift.getOperand(1)))
2155     return true;
2156
2157 SDValue X = Shift.getOperand(0);
2158
2159 // Not likely to be profitable if either the AND or SHIFT node has more
2160 // than one use (unless all uses are for address computation). Besides,
2161 // isel mechanism requires their node ids to be reused.
2162 if (!N.hasOneUse() || !Shift.hasOneUse())
2163 return true;
2164
2165 // Verify that the shift amount is something we can fold.
2166 unsigned ShiftAmt = Shift.getConstantOperandVal(1);
2167 if (ShiftAmt != 1 && ShiftAmt != 2 && ShiftAmt != 3)
2168 return true;
2169
2170 MVT VT = N.getSimpleValueType();
2171 SDLoc DL(N);
2172 if (FoundAnyExtend) {
2173 SDValue NewX = DAG.getNode(ISD::ANY_EXTEND, DL, VT, X);
2174 insertDAGNode(DAG, N, NewX);
2175 X = NewX;
2176 }
2177
2178 SDValue NewMask = DAG.getSignedConstant(Mask >> ShiftAmt, DL, VT);
2179 SDValue NewAnd = DAG.getNode(ISD::AND, DL, VT, X, NewMask);
2180 SDValue NewShift = DAG.getNode(ISD::SHL, DL, VT, NewAnd, Shift.getOperand(1));
2181
2182 // Insert the new nodes into the topological ordering. We must do this in
2183 // a valid topological ordering as nothing is going to go back and re-sort
2184 // these nodes. We continually insert before 'N' in sequence as this is
2185 // essentially a pre-flattened and pre-sorted sequence of nodes. There is no
2186 // hierarchy left to express.
2187 insertDAGNode(DAG, N, NewMask);
2188 insertDAGNode(DAG, N, NewAnd);
2189 insertDAGNode(DAG, N, NewShift);
2190 DAG.ReplaceAllUsesWith(N, NewShift);
2191 DAG.RemoveDeadNode(N.getNode());
2192
2193 AM.Scale = 1 << ShiftAmt;
2194 AM.IndexReg = NewAnd;
2195 return false;
2196}
2197
2198// Implement some heroics to detect shifts of masked values where the mask can
2199// be replaced by extending the shift and undoing that in the addressing mode
2200// scale. Patterns such as (shl (srl x, c1), c2) are canonicalized into (and
2201// (srl x, SHIFT), MASK) by DAGCombines that don't know the shl can be done in
2202// the addressing mode. This results in code such as:
2203//
2204// int f(short *y, int *lookup_table) {
2205// ...
2206// return *y + lookup_table[*y >> 11];
2207// }
2208//
2209// Turning into:
2210// movzwl (%rdi), %eax
2211// movl %eax, %ecx
2212// shrl $11, %ecx
2213// addl (%rsi,%rcx,4), %eax
2214//
2215// Instead of:
2216// movzwl (%rdi), %eax
2217// movl %eax, %ecx
2218// shrl $9, %ecx
2219// andl $124, %rcx
2220// addl (%rsi,%rcx), %eax
2221//
2222// Note that this function assumes the mask is provided as a mask *after* the
2223// value is shifted. The input chain may or may not match that, but computing
2224// such a mask is trivial.
2225 static bool foldMaskAndShiftToScale(SelectionDAG &DAG, SDValue N,
2226                                     uint64_t Mask,
2227 SDValue Shift, SDValue X,
2228 X86ISelAddressMode &AM) {
2229 if (Shift.getOpcode() != ISD::SRL || !Shift.hasOneUse() ||
2230       !isa<ConstantSDNode>(Shift.getOperand(1)))
2231     return true;
2232
2233   // We need to ensure that the mask is a contiguous run of bits.
2234 unsigned MaskIdx, MaskLen;
2235 if (!isShiftedMask_64(Mask, MaskIdx, MaskLen))
2236 return true;
2237 unsigned MaskLZ = 64 - (MaskIdx + MaskLen);
2238
2239 unsigned ShiftAmt = Shift.getConstantOperandVal(1);
2240
2241 // The amount of shift we're trying to fit into the addressing mode is taken
2242 // from the shifted mask index (number of trailing zeros of the mask).
2243 unsigned AMShiftAmt = MaskIdx;
2244
2245 // There is nothing we can do here unless the mask is removing some bits.
2246 // Also, the addressing mode can only represent shifts of 1, 2, or 3 bits.
2247 if (AMShiftAmt == 0 || AMShiftAmt > 3) return true;
2248
2249 // Scale the leading zero count down based on the actual size of the value.
2250 // Also scale it down based on the size of the shift.
2251 unsigned ScaleDown = (64 - X.getSimpleValueType().getSizeInBits()) + ShiftAmt;
2252 if (MaskLZ < ScaleDown)
2253 return true;
2254 MaskLZ -= ScaleDown;
2255
2256 // The final check is to ensure that any masked out high bits of X are
2257 // already known to be zero. Otherwise, the mask has a semantic impact
2258 // other than masking out a couple of low bits. Unfortunately, because of
2259 // the mask, zero extensions will be removed from operands in some cases.
2260 // This code works extra hard to look through extensions because we can
2261 // replace them with zero extensions cheaply if necessary.
2262 bool ReplacingAnyExtend = false;
2263 if (X.getOpcode() == ISD::ANY_EXTEND) {
2264 unsigned ExtendBits = X.getSimpleValueType().getSizeInBits() -
2265 X.getOperand(0).getSimpleValueType().getSizeInBits();
2266 // Assume that we'll replace the any-extend with a zero-extend, and
2267 // narrow the search to the extended value.
2268 X = X.getOperand(0);
2269 MaskLZ = ExtendBits > MaskLZ ? 0 : MaskLZ - ExtendBits;
2270 ReplacingAnyExtend = true;
2271 }
2272 APInt MaskedHighBits =
2273 APInt::getHighBitsSet(X.getSimpleValueType().getSizeInBits(), MaskLZ);
2274 if (!DAG.MaskedValueIsZero(X, MaskedHighBits))
2275 return true;
2276
2277 // We've identified a pattern that can be transformed into a single shift
2278 // and an addressing mode. Make it so.
2279 MVT VT = N.getSimpleValueType();
2280 if (ReplacingAnyExtend) {
2281 assert(X.getValueType() != VT);
2282 // We looked through an ANY_EXTEND node, insert a ZERO_EXTEND.
2283 SDValue NewX = DAG.getNode(ISD::ZERO_EXTEND, SDLoc(X), VT, X);
2284 insertDAGNode(DAG, N, NewX);
2285 X = NewX;
2286 }
2287
2288 MVT XVT = X.getSimpleValueType();
2289 SDLoc DL(N);
2290 SDValue NewSRLAmt = DAG.getConstant(ShiftAmt + AMShiftAmt, DL, MVT::i8);
2291 SDValue NewSRL = DAG.getNode(ISD::SRL, DL, XVT, X, NewSRLAmt);
2292 SDValue NewExt = DAG.getZExtOrTrunc(NewSRL, DL, VT);
2293 SDValue NewSHLAmt = DAG.getConstant(AMShiftAmt, DL, MVT::i8);
2294 SDValue NewSHL = DAG.getNode(ISD::SHL, DL, VT, NewExt, NewSHLAmt);
2295
2296 // Insert the new nodes into the topological ordering. We must do this in
2297 // a valid topological ordering as nothing is going to go back and re-sort
2298 // these nodes. We continually insert before 'N' in sequence as this is
2299 // essentially a pre-flattened and pre-sorted sequence of nodes. There is no
2300 // hierarchy left to express.
2301 insertDAGNode(DAG, N, NewSRLAmt);
2302 insertDAGNode(DAG, N, NewSRL);
2303 insertDAGNode(DAG, N, NewExt);
2304 insertDAGNode(DAG, N, NewSHLAmt);
2305 insertDAGNode(DAG, N, NewSHL);
2306 DAG.ReplaceAllUsesWith(N, NewSHL);
2307 DAG.RemoveDeadNode(N.getNode());
2308
2309 AM.Scale = 1 << AMShiftAmt;
2310 AM.IndexReg = NewExt;
2311 return false;
2312}
2313
2314// Transform "(X >> SHIFT) & (MASK << C1)" to
2315// "((X >> (SHIFT + C1)) & (MASK)) << C1". Everything before the SHL will be
2316// matched to a BEXTR later. Returns false if the simplification is performed.
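// For example, "(X >> 4) & (0xff << 2)" becomes "((X >> 6) & 0xff) << 2"; the
// "(X >> 6) & 0xff" part is a candidate for BEXTR and the "<< 2" becomes a
// scale of 4.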
2317 static bool foldMaskedShiftToBEXTR(SelectionDAG &DAG, SDValue N,
2318                                    uint64_t Mask,
2319 SDValue Shift, SDValue X,
2320 X86ISelAddressMode &AM,
2321 const X86Subtarget &Subtarget) {
2322 if (Shift.getOpcode() != ISD::SRL ||
2323 !isa<ConstantSDNode>(Shift.getOperand(1)) ||
2324 !Shift.hasOneUse() || !N.hasOneUse())
2325 return true;
2326
2327 // Only do this if BEXTR will be matched by matchBEXTRFromAndImm.
2328 if (!Subtarget.hasTBM() &&
2329 !(Subtarget.hasBMI() && Subtarget.hasFastBEXTR()))
2330 return true;
2331
2332   // We need to ensure that the mask is a contiguous run of bits.
2333 unsigned MaskIdx, MaskLen;
2334 if (!isShiftedMask_64(Mask, MaskIdx, MaskLen))
2335 return true;
2336
2337 unsigned ShiftAmt = Shift.getConstantOperandVal(1);
2338
2339 // The amount of shift we're trying to fit into the addressing mode is taken
2340 // from the shifted mask index (number of trailing zeros of the mask).
2341 unsigned AMShiftAmt = MaskIdx;
2342
2343 // There is nothing we can do here unless the mask is removing some bits.
2344 // Also, the addressing mode can only represent shifts of 1, 2, or 3 bits.
2345 if (AMShiftAmt == 0 || AMShiftAmt > 3) return true;
2346
2347 MVT XVT = X.getSimpleValueType();
2348 MVT VT = N.getSimpleValueType();
2349 SDLoc DL(N);
2350 SDValue NewSRLAmt = DAG.getConstant(ShiftAmt + AMShiftAmt, DL, MVT::i8);
2351 SDValue NewSRL = DAG.getNode(ISD::SRL, DL, XVT, X, NewSRLAmt);
2352 SDValue NewMask = DAG.getConstant(Mask >> AMShiftAmt, DL, XVT);
2353 SDValue NewAnd = DAG.getNode(ISD::AND, DL, XVT, NewSRL, NewMask);
2354 SDValue NewExt = DAG.getZExtOrTrunc(NewAnd, DL, VT);
2355 SDValue NewSHLAmt = DAG.getConstant(AMShiftAmt, DL, MVT::i8);
2356 SDValue NewSHL = DAG.getNode(ISD::SHL, DL, VT, NewExt, NewSHLAmt);
2357
2358 // Insert the new nodes into the topological ordering. We must do this in
2359 // a valid topological ordering as nothing is going to go back and re-sort
2360 // these nodes. We continually insert before 'N' in sequence as this is
2361 // essentially a pre-flattened and pre-sorted sequence of nodes. There is no
2362 // hierarchy left to express.
2363 insertDAGNode(DAG, N, NewSRLAmt);
2364 insertDAGNode(DAG, N, NewSRL);
2365 insertDAGNode(DAG, N, NewMask);
2366 insertDAGNode(DAG, N, NewAnd);
2367 insertDAGNode(DAG, N, NewExt);
2368 insertDAGNode(DAG, N, NewSHLAmt);
2369 insertDAGNode(DAG, N, NewSHL);
2370 DAG.ReplaceAllUsesWith(N, NewSHL);
2371 DAG.RemoveDeadNode(N.getNode());
2372
2373 AM.Scale = 1 << AMShiftAmt;
2374 AM.IndexReg = NewExt;
2375 return false;
2376}
2377
2378// Attempt to peek further into a scaled index register, collecting additional
2379 // extensions / offsets / etc. Returns \p N if we can't peek any further.
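// For example, with an existing scale of 8, an index of the form add(%x, 4)
// is peeled to index %x with 32 (4 * 8; constants are pre-scaled) folded into
// the displacement, and add(%x, %x) doubles the scale as long as the result
// is still a legal scale (<= 8).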
2380SDValue X86DAGToDAGISel::matchIndexRecursively(SDValue N,
2381 X86ISelAddressMode &AM,
2382 unsigned Depth) {
2383 assert(AM.IndexReg.getNode() == nullptr && "IndexReg already matched");
2384 assert((AM.Scale == 1 || AM.Scale == 2 || AM.Scale == 4 || AM.Scale == 8) &&
2385 "Illegal index scale");
2386
2387 // Limit recursion.
2388   if (Depth >= SelectionDAG::MaxRecursionDepth)
2389     return N;
2390
2391 EVT VT = N.getValueType();
2392 unsigned Opc = N.getOpcode();
2393
2394 // index: add(x,c) -> index: x, disp + c
2395 if (CurDAG->isBaseWithConstantOffset(N)) {
2396 auto *AddVal = cast<ConstantSDNode>(N.getOperand(1));
2397 uint64_t Offset = (uint64_t)AddVal->getSExtValue() * AM.Scale;
2398 if (!foldOffsetIntoAddress(Offset, AM))
2399 return matchIndexRecursively(N.getOperand(0), AM, Depth + 1);
2400 }
2401
2402 // index: add(x,x) -> index: x, scale * 2
2403 if (Opc == ISD::ADD && N.getOperand(0) == N.getOperand(1)) {
2404 if (AM.Scale <= 4) {
2405 AM.Scale *= 2;
2406 return matchIndexRecursively(N.getOperand(0), AM, Depth + 1);
2407 }
2408 }
2409
2410 // index: shl(x,i) -> index: x, scale * (1 << i)
2411 if (Opc == X86ISD::VSHLI) {
2412 uint64_t ShiftAmt = N.getConstantOperandVal(1);
2413 uint64_t ScaleAmt = 1ULL << ShiftAmt;
2414 if ((AM.Scale * ScaleAmt) <= 8) {
2415 AM.Scale *= ScaleAmt;
2416 return matchIndexRecursively(N.getOperand(0), AM, Depth + 1);
2417 }
2418 }
2419
2420 // index: sext(add_nsw(x,c)) -> index: sext(x), disp + sext(c)
2421 // TODO: call matchIndexRecursively(AddSrc) if we won't corrupt sext?
2422 if (Opc == ISD::SIGN_EXTEND && !VT.isVector() && N.hasOneUse()) {
2423 SDValue Src = N.getOperand(0);
2424 if (Src.getOpcode() == ISD::ADD && Src->getFlags().hasNoSignedWrap() &&
2425 Src.hasOneUse()) {
2426 if (CurDAG->isBaseWithConstantOffset(Src)) {
2427 SDValue AddSrc = Src.getOperand(0);
2428 auto *AddVal = cast<ConstantSDNode>(Src.getOperand(1));
2429 int64_t Offset = AddVal->getSExtValue();
2430 if (!foldOffsetIntoAddress((uint64_t)Offset * AM.Scale, AM)) {
2431 SDLoc DL(N);
2432 SDValue ExtSrc = CurDAG->getNode(Opc, DL, VT, AddSrc);
2433 SDValue ExtVal = CurDAG->getSignedConstant(Offset, DL, VT);
2434 SDValue ExtAdd = CurDAG->getNode(ISD::ADD, DL, VT, ExtSrc, ExtVal);
2435 insertDAGNode(*CurDAG, N, ExtSrc);
2436 insertDAGNode(*CurDAG, N, ExtVal);
2437 insertDAGNode(*CurDAG, N, ExtAdd);
2438 CurDAG->ReplaceAllUsesWith(N, ExtAdd);
2439 CurDAG->RemoveDeadNode(N.getNode());
2440 return ExtSrc;
2441 }
2442 }
2443 }
2444 }
2445
2446 // index: zext(add_nuw(x,c)) -> index: zext(x), disp + zext(c)
2447 // index: zext(addlike(x,c)) -> index: zext(x), disp + zext(c)
2448 // TODO: call matchIndexRecursively(AddSrc) if we won't corrupt sext?
2449 if (Opc == ISD::ZERO_EXTEND && !VT.isVector() && N.hasOneUse()) {
2450 SDValue Src = N.getOperand(0);
2451 unsigned SrcOpc = Src.getOpcode();
2452 if (((SrcOpc == ISD::ADD && Src->getFlags().hasNoUnsignedWrap()) ||
2453 CurDAG->isADDLike(Src, /*NoWrap=*/true)) &&
2454 Src.hasOneUse()) {
2455 if (CurDAG->isBaseWithConstantOffset(Src)) {
2456 SDValue AddSrc = Src.getOperand(0);
2457 uint64_t Offset = Src.getConstantOperandVal(1);
2458 if (!foldOffsetIntoAddress(Offset * AM.Scale, AM)) {
2459 SDLoc DL(N);
2460 SDValue Res;
2461 // If we're also scaling, see if we can use that as well.
2462 if (AddSrc.getOpcode() == ISD::SHL &&
2463 isa<ConstantSDNode>(AddSrc.getOperand(1))) {
2464 SDValue ShVal = AddSrc.getOperand(0);
2465 uint64_t ShAmt = AddSrc.getConstantOperandVal(1);
2466 APInt HiBits =
2467               APInt::getHighBitsSet(ShVal.getValueSizeInBits(), ShAmt);
2468           uint64_t ScaleAmt = 1ULL << ShAmt;
2469 if ((AM.Scale * ScaleAmt) <= 8 &&
2470 (AddSrc->getFlags().hasNoUnsignedWrap() ||
2471 CurDAG->MaskedValueIsZero(ShVal, HiBits))) {
2472 AM.Scale *= ScaleAmt;
2473 SDValue ExtShVal = CurDAG->getNode(Opc, DL, VT, ShVal);
2474 SDValue ExtShift = CurDAG->getNode(ISD::SHL, DL, VT, ExtShVal,
2475 AddSrc.getOperand(1));
2476 insertDAGNode(*CurDAG, N, ExtShVal);
2477 insertDAGNode(*CurDAG, N, ExtShift);
2478 AddSrc = ExtShift;
2479 Res = ExtShVal;
2480 }
2481 }
2482 SDValue ExtSrc = CurDAG->getNode(Opc, DL, VT, AddSrc);
2483 SDValue ExtVal = CurDAG->getConstant(Offset, DL, VT);
2484 SDValue ExtAdd = CurDAG->getNode(SrcOpc, DL, VT, ExtSrc, ExtVal);
2485 insertDAGNode(*CurDAG, N, ExtSrc);
2486 insertDAGNode(*CurDAG, N, ExtVal);
2487 insertDAGNode(*CurDAG, N, ExtAdd);
2488 CurDAG->ReplaceAllUsesWith(N, ExtAdd);
2489 CurDAG->RemoveDeadNode(N.getNode());
2490 return Res ? Res : ExtSrc;
2491 }
2492 }
2493 }
2494 }
2495
2496 // TODO: Handle extensions, shifted masks etc.
2497 return N;
2498}
2499
2500bool X86DAGToDAGISel::matchAddressRecursively(SDValue N, X86ISelAddressMode &AM,
2501 unsigned Depth) {
2502 LLVM_DEBUG({
2503 dbgs() << "MatchAddress: ";
2504 AM.dump(CurDAG);
2505 });
2506 // Limit recursion.
2507   if (Depth >= SelectionDAG::MaxRecursionDepth)
2508     return matchAddressBase(N, AM);
2509
2510 // If this is already a %rip relative address, we can only merge immediates
2511 // into it. Instead of handling this in every case, we handle it here.
2512 // RIP relative addressing: %rip + 32-bit displacement!
2513 if (AM.isRIPRelative()) {
2514 // FIXME: JumpTable and ExternalSymbol address currently don't like
2515 // displacements. It isn't very important, but this should be fixed for
2516 // consistency.
2517 if (!(AM.ES || AM.MCSym) && AM.JT != -1)
2518 return true;
2519
2520 if (auto *Cst = dyn_cast<ConstantSDNode>(N))
2521 if (!foldOffsetIntoAddress(Cst->getSExtValue(), AM))
2522 return false;
2523 return true;
2524 }
2525
2526 switch (N.getOpcode()) {
2527 default: break;
2528 case ISD::LOCAL_RECOVER: {
2529 if (!AM.hasSymbolicDisplacement() && AM.Disp == 0)
2530 if (const auto *ESNode = dyn_cast<MCSymbolSDNode>(N.getOperand(0))) {
2531 // Use the symbol and don't prefix it.
2532 AM.MCSym = ESNode->getMCSymbol();
2533 return false;
2534 }
2535 break;
2536 }
2537 case ISD::Constant: {
2538 uint64_t Val = cast<ConstantSDNode>(N)->getSExtValue();
2539 if (!foldOffsetIntoAddress(Val, AM))
2540 return false;
2541 break;
2542 }
2543
2544 case X86ISD::Wrapper:
2545 case X86ISD::WrapperRIP:
2546 if (!matchWrapper(N, AM))
2547 return false;
2548 break;
2549
2550 case ISD::LOAD:
2551 if (!matchLoadInAddress(cast<LoadSDNode>(N), AM))
2552 return false;
2553 break;
2554
2555 case ISD::FrameIndex:
2556 if (AM.BaseType == X86ISelAddressMode::RegBase &&
2557 AM.Base_Reg.getNode() == nullptr &&
2558 (!Subtarget->is64Bit() || isDispSafeForFrameIndexOrRegBase(AM.Disp))) {
2559 AM.BaseType = X86ISelAddressMode::FrameIndexBase;
2560 AM.Base_FrameIndex = cast<FrameIndexSDNode>(N)->getIndex();
2561 return false;
2562 }
2563 break;
2564
2565 case ISD::SHL:
2566 if (AM.IndexReg.getNode() != nullptr || AM.Scale != 1)
2567 break;
2568
2569 if (auto *CN = dyn_cast<ConstantSDNode>(N.getOperand(1))) {
2570 unsigned Val = CN->getZExtValue();
2571 // Note that we handle x<<1 as (,x,2) rather than (x,x) here so
2572 // that the base operand remains free for further matching. If
2573 // the base doesn't end up getting used, a post-processing step
2574 // in MatchAddress turns (,x,2) into (x,x), which is cheaper.
2575 if (Val == 1 || Val == 2 || Val == 3) {
2576 SDValue ShVal = N.getOperand(0);
2577 AM.Scale = 1 << Val;
2578 AM.IndexReg = matchIndexRecursively(ShVal, AM, Depth + 1);
2579 return false;
2580 }
2581 }
2582 break;
2583
2584 case ISD::SRL: {
2585 // Scale must not be used already.
2586 if (AM.IndexReg.getNode() != nullptr || AM.Scale != 1) break;
2587
2588 // We only handle up to 64-bit values here as those are what matter for
2589 // addressing mode optimizations.
2590 assert(N.getSimpleValueType().getSizeInBits() <= 64 &&
2591 "Unexpected value size!");
2592
2593 SDValue And = N.getOperand(0);
2594 if (And.getOpcode() != ISD::AND) break;
2595 SDValue X = And.getOperand(0);
2596
2597 // The mask used for the transform is expected to be post-shift, but we
2598 // found the shift first so just apply the shift to the mask before passing
2599 // it down.
2600 if (!isa<ConstantSDNode>(N.getOperand(1)) ||
2601 !isa<ConstantSDNode>(And.getOperand(1)))
2602 break;
2603 uint64_t Mask = And.getConstantOperandVal(1) >> N.getConstantOperandVal(1);
2604
2605 // Try to fold the mask and shift into the scale, and return false if we
2606 // succeed.
2607 if (!foldMaskAndShiftToScale(*CurDAG, N, Mask, N, X, AM))
2608 return false;
2609 break;
2610 }
2611
2612 case ISD::SMUL_LOHI:
2613 case ISD::UMUL_LOHI:
2614 // A mul_lohi where we need the low part can be folded as a plain multiply.
2615 if (N.getResNo() != 0) break;
2616 [[fallthrough]];
2617 case ISD::MUL:
2618 case X86ISD::MUL_IMM:
2619 // X*[3,5,9] -> X+X*[2,4,8]
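    // For example, X*5 can be selected as "leal (%x,%x,4), %dst": Base and
    // IndexReg are both X and the scale is 4.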
2620 if (AM.BaseType == X86ISelAddressMode::RegBase &&
2621 AM.Base_Reg.getNode() == nullptr &&
2622 AM.IndexReg.getNode() == nullptr) {
2623 if (auto *CN = dyn_cast<ConstantSDNode>(N.getOperand(1)))
2624 if (CN->getZExtValue() == 3 || CN->getZExtValue() == 5 ||
2625 CN->getZExtValue() == 9) {
2626 AM.Scale = unsigned(CN->getZExtValue())-1;
2627
2628 SDValue MulVal = N.getOperand(0);
2629 SDValue Reg;
2630
2631 // Okay, we know that we have a scale by now. However, if the scaled
2632 // value is an add of something and a constant, we can fold the
2633 // constant into the disp field here.
2634 if (MulVal.getNode()->getOpcode() == ISD::ADD && MulVal.hasOneUse() &&
2635 isa<ConstantSDNode>(MulVal.getOperand(1))) {
2636 Reg = MulVal.getOperand(0);
2637 auto *AddVal = cast<ConstantSDNode>(MulVal.getOperand(1));
2638 uint64_t Disp = AddVal->getSExtValue() * CN->getZExtValue();
2639 if (foldOffsetIntoAddress(Disp, AM))
2640 Reg = N.getOperand(0);
2641 } else {
2642 Reg = N.getOperand(0);
2643 }
2644
2645 AM.IndexReg = AM.Base_Reg = Reg;
2646 return false;
2647 }
2648 }
2649 break;
2650
2651 case ISD::SUB: {
2652     // Given A-B, if A can be completely folded into the address while leaving
2653     // the index field unused, use -B as the index.
2654     // This is a win if A has multiple parts that can be folded into
2655     // the address. Also, this saves a mov if the base register has
2656     // other uses, since it avoids a two-address sub instruction; however,
2657     // it costs an additional mov if the index register has other uses.
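    // For example, for "GV - B" the GV side folds into the symbolic
    // displacement, so (after negating B into a scratch register) the whole
    // expression can be addressed as "GV(,%b,1)" instead of materializing GV
    // and subtracting.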
2658
2659 // Add an artificial use to this node so that we can keep track of
2660 // it if it gets CSE'd with a different node.
2661 HandleSDNode Handle(N);
2662
2663 // Test if the LHS of the sub can be folded.
2664 X86ISelAddressMode Backup = AM;
2665 if (matchAddressRecursively(N.getOperand(0), AM, Depth+1)) {
2666 N = Handle.getValue();
2667 AM = Backup;
2668 break;
2669 }
2670 N = Handle.getValue();
2671 // Test if the index field is free for use.
2672 if (AM.IndexReg.getNode() || AM.isRIPRelative()) {
2673 AM = Backup;
2674 break;
2675 }
2676
2677 int Cost = 0;
2678 SDValue RHS = N.getOperand(1);
2679 // If the RHS involves a register with multiple uses, this
2680 // transformation incurs an extra mov, due to the neg instruction
2681 // clobbering its operand.
2682 if (!RHS.getNode()->hasOneUse() ||
2683 RHS.getNode()->getOpcode() == ISD::CopyFromReg ||
2684 RHS.getNode()->getOpcode() == ISD::TRUNCATE ||
2685 RHS.getNode()->getOpcode() == ISD::ANY_EXTEND ||
2686 (RHS.getNode()->getOpcode() == ISD::ZERO_EXTEND &&
2687 RHS.getOperand(0).getValueType() == MVT::i32))
2688 ++Cost;
2689 // If the base is a register with multiple uses, this
2690 // transformation may save a mov.
2691 if ((AM.BaseType == X86ISelAddressMode::RegBase && AM.Base_Reg.getNode() &&
2692 !AM.Base_Reg.getNode()->hasOneUse()) ||
2693 AM.BaseType == X86ISelAddressMode::FrameIndexBase)
2694 --Cost;
2695 // If the folded LHS was interesting, this transformation saves
2696 // address arithmetic.
2697 if ((AM.hasSymbolicDisplacement() && !Backup.hasSymbolicDisplacement()) +
2698 ((AM.Disp != 0) && (Backup.Disp == 0)) +
2699 (AM.Segment.getNode() && !Backup.Segment.getNode()) >= 2)
2700 --Cost;
2701 // If it doesn't look like it may be an overall win, don't do it.
2702 if (Cost >= 0) {
2703 AM = Backup;
2704 break;
2705 }
2706
2707 // Ok, the transformation is legal and appears profitable. Go for it.
2708 // Negation will be emitted later to avoid creating dangling nodes if this
2709 // was an unprofitable LEA.
2710 AM.IndexReg = RHS;
2711 AM.NegateIndex = true;
2712 AM.Scale = 1;
2713 return false;
2714 }
2715
2716 case ISD::OR:
2717 case ISD::XOR:
2718 // See if we can treat the OR/XOR node as an ADD node.
2719 if (!CurDAG->isADDLike(N))
2720 break;
2721 [[fallthrough]];
2722 case ISD::ADD:
2723 if (!matchAdd(N, AM, Depth))
2724 return false;
2725 break;
2726
2727 case ISD::AND: {
2728 // Perform some heroic transforms on an and of a constant-count shift
2729 // with a constant to enable use of the scaled offset field.
2730
2731 // Scale must not be used already.
2732 if (AM.IndexReg.getNode() != nullptr || AM.Scale != 1) break;
2733
2734 // We only handle up to 64-bit values here as those are what matter for
2735 // addressing mode optimizations.
2736 assert(N.getSimpleValueType().getSizeInBits() <= 64 &&
2737 "Unexpected value size!");
2738
2739 if (!isa<ConstantSDNode>(N.getOperand(1)))
2740 break;
2741
2742 if (N.getOperand(0).getOpcode() == ISD::SRL) {
2743 SDValue Shift = N.getOperand(0);
2744 SDValue X = Shift.getOperand(0);
2745
2746 uint64_t Mask = N.getConstantOperandVal(1);
2747
2748 // Try to fold the mask and shift into an extract and scale.
2749 if (!foldMaskAndShiftToExtract(*CurDAG, N, Mask, Shift, X, AM))
2750 return false;
2751
2752 // Try to fold the mask and shift directly into the scale.
2753 if (!foldMaskAndShiftToScale(*CurDAG, N, Mask, Shift, X, AM))
2754 return false;
2755
2756 // Try to fold the mask and shift into BEXTR and scale.
2757 if (!foldMaskedShiftToBEXTR(*CurDAG, N, Mask, Shift, X, AM, *Subtarget))
2758 return false;
2759 }
2760
2761 // Try to swap the mask and shift to place shifts which can be done as
2762 // a scale on the outside of the mask.
2763 if (!foldMaskedShiftToScaledMask(*CurDAG, N, AM))
2764 return false;
2765
2766 break;
2767 }
2768 case ISD::ZERO_EXTEND: {
2769 // Try to widen a zexted shift left to the same size as its use, so we can
2770 // match the shift as a scale factor.
2771 if (AM.IndexReg.getNode() != nullptr || AM.Scale != 1)
2772 break;
2773
2774 SDValue Src = N.getOperand(0);
2775
2776 // See if we can match a zext(addlike(x,c)).
2777 // TODO: Move more ZERO_EXTEND patterns into matchIndexRecursively.
2778 if (Src.getOpcode() == ISD::ADD || Src.getOpcode() == ISD::OR)
2779 if (SDValue Index = matchIndexRecursively(N, AM, Depth + 1))
2780 if (Index != N) {
2781 AM.IndexReg = Index;
2782 return false;
2783 }
2784
2785 // Peek through mask: zext(and(shl(x,c1),c2))
2786 APInt Mask = APInt::getAllOnes(Src.getScalarValueSizeInBits());
2787 if (Src.getOpcode() == ISD::AND && Src.hasOneUse())
2788 if (auto *MaskC = dyn_cast<ConstantSDNode>(Src.getOperand(1))) {
2789 Mask = MaskC->getAPIntValue();
2790 Src = Src.getOperand(0);
2791 }
2792
2793 if (Src.getOpcode() == ISD::SHL && Src.hasOneUse() && N->hasOneUse()) {
2794 // Give up if the shift is not a valid scale factor [1,2,3].
2795 SDValue ShlSrc = Src.getOperand(0);
2796 SDValue ShlAmt = Src.getOperand(1);
2797 auto *ShAmtC = dyn_cast<ConstantSDNode>(ShlAmt);
2798 if (!ShAmtC)
2799 break;
2800 unsigned ShAmtV = ShAmtC->getZExtValue();
2801 if (ShAmtV > 3)
2802 break;
2803
2804 // The narrow shift must only shift out zero bits (it must be 'nuw').
2805 // That makes it safe to widen to the destination type.
2806 APInt HighZeros =
2807 APInt::getHighBitsSet(ShlSrc.getValueSizeInBits(), ShAmtV);
2808 if (!Src->getFlags().hasNoUnsignedWrap() &&
2809 !CurDAG->MaskedValueIsZero(ShlSrc, HighZeros & Mask))
2810 break;
2811
2812 // zext (shl nuw i8 %x, C1) to i32
2813 // --> shl (zext i8 %x to i32), (zext C1)
2814 // zext (and (shl nuw i8 %x, C1), C2) to i32
2815 // --> shl (zext i8 (and %x, C2 >> C1) to i32), (zext C1)
2816 MVT SrcVT = ShlSrc.getSimpleValueType();
2817 MVT VT = N.getSimpleValueType();
2818 SDLoc DL(N);
2819
2820 SDValue Res = ShlSrc;
2821 if (!Mask.isAllOnes()) {
2822 Res = CurDAG->getConstant(Mask.lshr(ShAmtV), DL, SrcVT);
2823 insertDAGNode(*CurDAG, N, Res);
2824 Res = CurDAG->getNode(ISD::AND, DL, SrcVT, ShlSrc, Res);
2825 insertDAGNode(*CurDAG, N, Res);
2826 }
2827 SDValue Zext = CurDAG->getNode(ISD::ZERO_EXTEND, DL, VT, Res);
2828 insertDAGNode(*CurDAG, N, Zext);
2829 SDValue NewShl = CurDAG->getNode(ISD::SHL, DL, VT, Zext, ShlAmt);
2830 insertDAGNode(*CurDAG, N, NewShl);
2831 CurDAG->ReplaceAllUsesWith(N, NewShl);
2832 CurDAG->RemoveDeadNode(N.getNode());
2833
2834 // Convert the shift to scale factor.
2835 AM.Scale = 1 << ShAmtV;
2836       // matchIndexRecursively must be called here; otherwise Zext may be
2837       // replaced by other nodes but still be used later when a builder
2838       // method is called.
2839 AM.IndexReg = matchIndexRecursively(Zext, AM, Depth + 1);
2840 return false;
2841 }
2842
2843 if (Src.getOpcode() == ISD::SRL && !Mask.isAllOnes()) {
2844 // Try to fold the mask and shift into an extract and scale.
2845 if (!foldMaskAndShiftToExtract(*CurDAG, N, Mask.getZExtValue(), Src,
2846 Src.getOperand(0), AM))
2847 return false;
2848
2849 // Try to fold the mask and shift directly into the scale.
2850 if (!foldMaskAndShiftToScale(*CurDAG, N, Mask.getZExtValue(), Src,
2851 Src.getOperand(0), AM))
2852 return false;
2853
2854 // Try to fold the mask and shift into BEXTR and scale.
2855 if (!foldMaskedShiftToBEXTR(*CurDAG, N, Mask.getZExtValue(), Src,
2856 Src.getOperand(0), AM, *Subtarget))
2857 return false;
2858 }
2859
2860 break;
2861 }
2862 }
2863
2864 return matchAddressBase(N, AM);
2865}
2866
2867/// Helper for MatchAddress. Add the specified node to the
2868/// specified addressing mode without any further recursion.
2869bool X86DAGToDAGISel::matchAddressBase(SDValue N, X86ISelAddressMode &AM) {
2870 // Is the base register already occupied?
2871 if (AM.BaseType != X86ISelAddressMode::RegBase || AM.Base_Reg.getNode()) {
2872 // If so, check to see if the scale index register is set.
2873 if (!AM.IndexReg.getNode()) {
2874 AM.IndexReg = N;
2875 AM.Scale = 1;
2876 return false;
2877 }
2878
2879 // Otherwise, we cannot select it.
2880 return true;
2881 }
2882
2883 // Default, generate it as a register.
2884 AM.BaseType = X86ISelAddressMode::RegBase;
2885 AM.Base_Reg = N;
2886 return false;
2887}
2888
2889bool X86DAGToDAGISel::matchVectorAddressRecursively(SDValue N,
2890 X86ISelAddressMode &AM,
2891 unsigned Depth) {
2892 LLVM_DEBUG({
2893 dbgs() << "MatchVectorAddress: ";
2894 AM.dump(CurDAG);
2895 });
2896 // Limit recursion.
2897   if (Depth >= SelectionDAG::MaxRecursionDepth)
2898     return matchAddressBase(N, AM);
2899
2900 // TODO: Support other operations.
2901 switch (N.getOpcode()) {
2902 case ISD::Constant: {
2903 uint64_t Val = cast<ConstantSDNode>(N)->getSExtValue();
2904 if (!foldOffsetIntoAddress(Val, AM))
2905 return false;
2906 break;
2907 }
2908 case X86ISD::Wrapper:
2909 if (!matchWrapper(N, AM))
2910 return false;
2911 break;
2912 case ISD::ADD: {
2913 // Add an artificial use to this node so that we can keep track of
2914 // it if it gets CSE'd with a different node.
2915 HandleSDNode Handle(N);
2916
2917 X86ISelAddressMode Backup = AM;
2918 if (!matchVectorAddressRecursively(N.getOperand(0), AM, Depth + 1) &&
2919 !matchVectorAddressRecursively(Handle.getValue().getOperand(1), AM,
2920 Depth + 1))
2921 return false;
2922 AM = Backup;
2923
2924 // Try again after commuting the operands.
2925 if (!matchVectorAddressRecursively(Handle.getValue().getOperand(1), AM,
2926 Depth + 1) &&
2927 !matchVectorAddressRecursively(Handle.getValue().getOperand(0), AM,
2928 Depth + 1))
2929 return false;
2930 AM = Backup;
2931
2932 N = Handle.getValue();
2933 break;
2934 }
2935 }
2936
2937 return matchAddressBase(N, AM);
2938}
2939
2940/// Helper for selectVectorAddr. Handles things that can be folded into a
2941/// gather/scatter address. The index register and scale should have already
2942/// been handled.
2943bool X86DAGToDAGISel::matchVectorAddress(SDValue N, X86ISelAddressMode &AM) {
2944 return matchVectorAddressRecursively(N, AM, 0);
2945}
2946
2947bool X86DAGToDAGISel::selectVectorAddr(MemSDNode *Parent, SDValue BasePtr,
2948 SDValue IndexOp, SDValue ScaleOp,
2949 SDValue &Base, SDValue &Scale,
2950 SDValue &Index, SDValue &Disp,
2951 SDValue &Segment) {
2952 X86ISelAddressMode AM;
2953 AM.Scale = ScaleOp->getAsZExtVal();
2954
2955 // Attempt to match index patterns, as long as we're not relying on implicit
2956 // sign-extension, which is performed BEFORE scale.
2957 if (IndexOp.getScalarValueSizeInBits() == BasePtr.getScalarValueSizeInBits())
2958 AM.IndexReg = matchIndexRecursively(IndexOp, AM, 0);
2959 else
2960 AM.IndexReg = IndexOp;
2961
2962 unsigned AddrSpace = Parent->getPointerInfo().getAddrSpace();
2963 if (AddrSpace == X86AS::GS)
2964 AM.Segment = CurDAG->getRegister(X86::GS, MVT::i16);
2965 if (AddrSpace == X86AS::FS)
2966 AM.Segment = CurDAG->getRegister(X86::FS, MVT::i16);
2967 if (AddrSpace == X86AS::SS)
2968 AM.Segment = CurDAG->getRegister(X86::SS, MVT::i16);
2969
2970 SDLoc DL(BasePtr);
2971 MVT VT = BasePtr.getSimpleValueType();
2972
2973 // Try to match into the base and displacement fields.
2974 if (matchVectorAddress(BasePtr, AM))
2975 return false;
2976
2977 getAddressOperands(AM, DL, VT, Base, Scale, Index, Disp, Segment);
2978 return true;
2979}
2980
2981/// Returns true if it is able to pattern match an addressing mode.
2982 /// It returns, by reference, the operands which make up the maximal
2983 /// addressing mode it can match.
2984///
2985/// Parent is the parent node of the addr operand that is being matched. It
2986/// is always a load, store, atomic node, or null. It is only null when
2987/// checking memory operands for inline asm nodes.
2988bool X86DAGToDAGISel::selectAddr(SDNode *Parent, SDValue N, SDValue &Base,
2989 SDValue &Scale, SDValue &Index,
2990 SDValue &Disp, SDValue &Segment) {
2991 X86ISelAddressMode AM;
2992
2993 if (Parent &&
2994       // This list of opcodes covers all the nodes that have an "addr:$ptr" operand
2995       // but are not a MemSDNode, and thus don't have proper addrspace info.
2996 Parent->getOpcode() != ISD::INTRINSIC_W_CHAIN && // unaligned loads, fixme
2997 Parent->getOpcode() != ISD::INTRINSIC_VOID && // nontemporal stores
2998 Parent->getOpcode() != X86ISD::TLSCALL && // Fixme
2999 Parent->getOpcode() != X86ISD::ENQCMD && // Fixme
3000 Parent->getOpcode() != X86ISD::ENQCMDS && // Fixme
3001 Parent->getOpcode() != X86ISD::EH_SJLJ_SETJMP && // setjmp
3002 Parent->getOpcode() != X86ISD::EH_SJLJ_LONGJMP) { // longjmp
3003 unsigned AddrSpace =
3004 cast<MemSDNode>(Parent)->getPointerInfo().getAddrSpace();
3005 if (AddrSpace == X86AS::GS)
3006 AM.Segment = CurDAG->getRegister(X86::GS, MVT::i16);
3007 if (AddrSpace == X86AS::FS)
3008 AM.Segment = CurDAG->getRegister(X86::FS, MVT::i16);
3009 if (AddrSpace == X86AS::SS)
3010 AM.Segment = CurDAG->getRegister(X86::SS, MVT::i16);
3011 }
3012
3013 // Save the DL and VT before calling matchAddress, it can invalidate N.
3014 SDLoc DL(N);
3015 MVT VT = N.getSimpleValueType();
3016
3017 if (matchAddress(N, AM))
3018 return false;
3019
3020 getAddressOperands(AM, DL, VT, Base, Scale, Index, Disp, Segment);
3021 return true;
3022}
3023
3024bool X86DAGToDAGISel::selectMOV64Imm32(SDValue N, SDValue &Imm) {
3025 // Cannot use 32 bit constants to reference objects in kernel/large code
3026 // model.
3027 if (TM.getCodeModel() == CodeModel::Kernel ||
3028 TM.getCodeModel() == CodeModel::Large)
3029 return false;
3030
3031 // In static codegen with small code model, we can get the address of a label
3032 // into a register with 'movl'
3033 if (N->getOpcode() != X86ISD::Wrapper)
3034 return false;
3035
3036 N = N.getOperand(0);
3037
3038 // At least GNU as does not accept 'movl' for TPOFF relocations.
3039 // FIXME: We could use 'movl' when we know we are targeting MC.
3040 if (N->getOpcode() == ISD::TargetGlobalTLSAddress)
3041 return false;
3042
3043 Imm = N;
3044 // Small/medium code model can reference non-TargetGlobalAddress objects with
3045 // 32 bit constants.
3046 if (N->getOpcode() != ISD::TargetGlobalAddress) {
3047 return TM.getCodeModel() == CodeModel::Small ||
3048 TM.getCodeModel() == CodeModel::Medium;
3049 }
3050
3051 const GlobalValue *GV = cast<GlobalAddressSDNode>(N)->getGlobal();
3052 if (std::optional<ConstantRange> CR = GV->getAbsoluteSymbolRange())
3053 return CR->getUnsignedMax().ult(1ull << 32);
3054
3055 return !TM.isLargeGlobalValue(GV);
3056}
3057
3058bool X86DAGToDAGISel::selectLEA64_Addr(SDValue N, SDValue &Base, SDValue &Scale,
3059 SDValue &Index, SDValue &Disp,
3060 SDValue &Segment) {
3061 // Save the debug loc before calling selectLEAAddr, in case it invalidates N.
3062 SDLoc DL(N);
3063
3064 if (!selectLEAAddr(N, Base, Scale, Index, Disp, Segment))
3065 return false;
3066
3067 EVT BaseType = Base.getValueType();
3068 unsigned SubReg;
3069 if (BaseType == MVT::i8)
3070 SubReg = X86::sub_8bit;
3071 else if (BaseType == MVT::i16)
3072 SubReg = X86::sub_16bit;
3073 else
3074 SubReg = X86::sub_32bit;
3075
3076   auto *RN = dyn_cast<RegisterSDNode>(Base);
3077   if (RN && RN->getReg() == 0)
3078 Base = CurDAG->getRegister(0, MVT::i64);
3079 else if ((BaseType == MVT::i8 || BaseType == MVT::i16 ||
3080 BaseType == MVT::i32) &&
3081            !isa<FrameIndexSDNode>(Base)) {
3082     // Base could already be %rip, particularly in the x32 ABI.
3083 SDValue ImplDef = SDValue(CurDAG->getMachineNode(X86::IMPLICIT_DEF, DL,
3084 MVT::i64), 0);
3085 Base = CurDAG->getTargetInsertSubreg(SubReg, DL, MVT::i64, ImplDef, Base);
3086 }
3087
3088 [[maybe_unused]] EVT IndexType = Index.getValueType();
3089   RN = dyn_cast<RegisterSDNode>(Index);
3090   if (RN && RN->getReg() == 0)
3091 Index = CurDAG->getRegister(0, MVT::i64);
3092 else {
3093 assert((IndexType == BaseType) &&
3094 "Expect to be extending 8/16/32-bit registers for use in LEA");
3095 SDValue ImplDef = SDValue(CurDAG->getMachineNode(X86::IMPLICIT_DEF, DL,
3096 MVT::i64), 0);
3097 Index = CurDAG->getTargetInsertSubreg(SubReg, DL, MVT::i64, ImplDef, Index);
3098 }
3099
3100 return true;
3101}
3102
3103/// Calls SelectAddr and determines if the maximal addressing
3104/// mode it matches can be cost effectively emitted as an LEA instruction.
3105bool X86DAGToDAGISel::selectLEAAddr(SDValue N,
3106 SDValue &Base, SDValue &Scale,
3107 SDValue &Index, SDValue &Disp,
3108 SDValue &Segment) {
3109 X86ISelAddressMode AM;
3110
3111 // Save the DL and VT before calling matchAddress, it can invalidate N.
3112 SDLoc DL(N);
3113 MVT VT = N.getSimpleValueType();
3114
3115 // Set AM.Segment to prevent MatchAddress from using one. LEA doesn't support
3116 // segments.
3117 SDValue Copy = AM.Segment;
3118 SDValue T = CurDAG->getRegister(0, MVT::i32);
3119 AM.Segment = T;
3120 if (matchAddress(N, AM))
3121 return false;
3122 assert (T == AM.Segment);
3123 AM.Segment = Copy;
3124
3125 unsigned Complexity = 0;
3126 if (AM.BaseType == X86ISelAddressMode::RegBase && AM.Base_Reg.getNode())
3127 Complexity = 1;
3128 else if (AM.BaseType == X86ISelAddressMode::FrameIndexBase)
3129 Complexity = 4;
3130
3131 if (AM.IndexReg.getNode())
3132 Complexity++;
3133
3134 // Don't match just leal(,%reg,2). It's cheaper to do addl %reg, %reg, or with
3135 // a simple shift.
3136 if (AM.Scale > 1)
3137 Complexity++;
3138
3139 // FIXME: We are artificially lowering the criteria to turn ADD %reg, $GA
3140 // to a LEA. This is determined with some experimentation but is by no means
3141 // optimal (especially for code size consideration). LEA is nice because of
3142 // its three-address nature. Tweak the cost function again when we can run
3143 // convertToThreeAddress() at register allocation time.
3144 if (AM.hasSymbolicDisplacement()) {
3145 // For X86-64, always use LEA to materialize RIP-relative addresses.
3146 if (Subtarget->is64Bit())
3147 Complexity = 4;
3148 else
3149 Complexity += 2;
3150 }
3151
3152 // Heuristic: try harder to form an LEA from ADD if the operands set flags.
3153 // Unlike ADD, LEA does not affect flags, so we will be less likely to require
3154 // duplicating flag-producing instructions later in the pipeline.
3155 if (N.getOpcode() == ISD::ADD) {
3156 auto isMathWithFlags = [](SDValue V) {
3157 switch (V.getOpcode()) {
3158 case X86ISD::ADD:
3159 case X86ISD::SUB:
3160 case X86ISD::ADC:
3161 case X86ISD::SBB:
3162 case X86ISD::SMUL:
3163 case X86ISD::UMUL:
3164 /* TODO: These opcodes can be added safely, but we may want to justify
3165 their inclusion for different reasons (better for reg-alloc).
3166 case X86ISD::OR:
3167 case X86ISD::XOR:
3168 case X86ISD::AND:
3169 */
3170 // Value 1 is the flag output of the node - verify it's not dead.
3171 return !SDValue(V.getNode(), 1).use_empty();
3172 default:
3173 return false;
3174 }
3175 };
3176 // TODO: We might want to factor in whether there's a load folding
3177 // opportunity for the math op that disappears with LEA.
3178 if (isMathWithFlags(N.getOperand(0)) || isMathWithFlags(N.getOperand(1)))
3179 Complexity++;
3180 }
3181
3182 if (AM.Disp)
3183 Complexity++;
3184
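// For instance, an address such as (add (add %reg1, (shl %reg2, 2)), 8) would
// typically score Base(1) + Index(1) + Scale>1(1) + Disp(1) = 4 and could be
// emitted as something like "leaq 8(%rdi,%rsi,4), %rax".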
3185 // If it isn't worth using an LEA, reject it.
3186 if (Complexity <= 2)
3187 return false;
3188
3189 getAddressOperands(AM, DL, VT, Base, Scale, Index, Disp, Segment);
3190 return true;
3191}
3192
3193 /// This is only run on TargetGlobalTLSAddress and TargetExternalSymbol nodes.
3194bool X86DAGToDAGISel::selectTLSADDRAddr(SDValue N, SDValue &Base,
3195 SDValue &Scale, SDValue &Index,
3196 SDValue &Disp, SDValue &Segment) {
3197 assert(N.getOpcode() == ISD::TargetGlobalTLSAddress ||
3198 N.getOpcode() == ISD::TargetExternalSymbol);
3199
3200 X86ISelAddressMode AM;
3201 if (auto *GA = dyn_cast<GlobalAddressSDNode>(N)) {
3202 AM.GV = GA->getGlobal();
3203 AM.Disp += GA->getOffset();
3204 AM.SymbolFlags = GA->getTargetFlags();
3205 } else {
3206 auto *SA = cast<ExternalSymbolSDNode>(N);
3207 AM.ES = SA->getSymbol();
3208 AM.SymbolFlags = SA->getTargetFlags();
3209 }
3210
3211 if (Subtarget->is32Bit()) {
3212 AM.Scale = 1;
3213 AM.IndexReg = CurDAG->getRegister(X86::EBX, MVT::i32);
3214 }
3215
3216 MVT VT = N.getSimpleValueType();
3217 getAddressOperands(AM, SDLoc(N), VT, Base, Scale, Index, Disp, Segment);
3218 return true;
3219}
3220
3221bool X86DAGToDAGISel::selectRelocImm(SDValue N, SDValue &Op) {
3222 // Keep track of the original value type and whether this value was
3223 // truncated. If we see a truncation from pointer type to VT that truncates
3224 // bits that are known to be zero, we can use a narrow reference.
3225 EVT VT = N.getValueType();
3226 bool WasTruncated = false;
3227 if (N.getOpcode() == ISD::TRUNCATE) {
3228 WasTruncated = true;
3229 N = N.getOperand(0);
3230 }
3231
3232 if (N.getOpcode() != X86ISD::Wrapper)
3233 return false;
3234
3235 // We can only use non-GlobalValues as immediates if they were not truncated,
3236 // as we do not have any range information. If we have a GlobalValue and the
3237 // address was not truncated, we can select it as an operand directly.
3238 unsigned Opc = N.getOperand(0)->getOpcode();
3239 if (Opc != ISD::TargetGlobalAddress || !WasTruncated) {
3240 Op = N.getOperand(0);
3241 // We can only select the operand directly if we didn't have to look past a
3242 // truncate.
3243 return !WasTruncated;
3244 }
3245
3246 // Check that the global's range fits into VT.
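// For instance, a global whose !absolute_symbol range is [0, 0x100) has an
// unsigned max of 0xff, so it can still be referenced as an i8 immediate even
// though the original pointer value was truncated.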
3247 auto *GA = cast<GlobalAddressSDNode>(N.getOperand(0));
3248 std::optional<ConstantRange> CR = GA->getGlobal()->getAbsoluteSymbolRange();
3249 if (!CR || CR->getUnsignedMax().uge(1ull << VT.getSizeInBits()))
3250 return false;
3251
3252 // Okay, we can use a narrow reference.
3253 Op = CurDAG->getTargetGlobalAddress(GA->getGlobal(), SDLoc(N), VT,
3254 GA->getOffset(), GA->getTargetFlags());
3255 return true;
3256}
3257
3258bool X86DAGToDAGISel::tryFoldLoad(SDNode *Root, SDNode *P, SDValue N,
3259 SDValue &Base, SDValue &Scale,
3260 SDValue &Index, SDValue &Disp,
3261 SDValue &Segment) {
3262 assert(Root && P && "Unknown root/parent nodes");
3263 if (!ISD::isNON_EXTLoad(N.getNode()) ||
3264 !IsProfitableToFold(N, P, Root) ||
3265 !IsLegalToFold(N, P, Root, OptLevel))
3266 return false;
3267
3268 return selectAddr(N.getNode(),
3269 N.getOperand(1), Base, Scale, Index, Disp, Segment);
3270}
3271
3272bool X86DAGToDAGISel::tryFoldBroadcast(SDNode *Root, SDNode *P, SDValue N,
3273 SDValue &Base, SDValue &Scale,
3274 SDValue &Index, SDValue &Disp,
3275 SDValue &Segment) {
3276 assert(Root && P && "Unknown root/parent nodes");
3277 if (N->getOpcode() != X86ISD::VBROADCAST_LOAD ||
3278 !IsProfitableToFold(N, P, Root) ||
3279 !IsLegalToFold(N, P, Root, OptLevel))
3280 return false;
3281
3282 return selectAddr(N.getNode(),
3283 N.getOperand(1), Base, Scale, Index, Disp, Segment);
3284}
3285
3286/// Return an SDNode that returns the value of the global base register.
3287/// Output instructions required to initialize the global base register,
3288/// if necessary.
3289SDNode *X86DAGToDAGISel::getGlobalBaseReg() {
3290 Register GlobalBaseReg = getInstrInfo()->getGlobalBaseReg(MF);
3291 auto &DL = MF->getDataLayout();
3292 return CurDAG->getRegister(GlobalBaseReg, TLI->getPointerTy(DL)).getNode();
3293}
3294
3295bool X86DAGToDAGISel::isSExtAbsoluteSymbolRef(unsigned Width, SDNode *N) const {
3296 if (N->getOpcode() == ISD::TRUNCATE)
3297 N = N->getOperand(0).getNode();
3298 if (N->getOpcode() != X86ISD::Wrapper)
3299 return false;
3300
3301 auto *GA = dyn_cast<GlobalAddressSDNode>(N->getOperand(0));
3302 if (!GA)
3303 return false;
3304
3305 auto *GV = GA->getGlobal();
3306 std::optional<ConstantRange> CR = GV->getAbsoluteSymbolRange();
3307 if (CR)
3308 return CR->getSignedMin().sge(-1ull << Width) &&
3309 CR->getSignedMax().slt(1ull << Width);
3310 // In the kernel code model, globals are in the negative 2GB of the address
3311 // space, so globals can be a sign extended 32-bit immediate.
3312 // In other code models, small globals are in the low 2GB of the address
3313 // space, so sign extending them is equivalent to zero extending them.
3314 return TM.getCodeModel() != CodeModel::Large && Width == 32 &&
3315 !TM.isLargeGlobalValue(GV);
3316}
3317
3318X86::CondCode X86DAGToDAGISel::getCondFromNode(SDNode *N) const {
3319 assert(N->isMachineOpcode() && "Unexpected node");
3320 unsigned Opc = N->getMachineOpcode();
3321 const MCInstrDesc &MCID = getInstrInfo()->get(Opc);
3322 int CondNo = X86::getCondSrcNoFromDesc(MCID);
3323 if (CondNo < 0)
3324 return X86::COND_INVALID;
3325
3326 return static_cast<X86::CondCode>(N->getConstantOperandVal(CondNo));
3327}
3328
3329 /// Return true if the given X86ISD::CMP node has no users that use a flag
3330 /// other than ZF.
3331bool X86DAGToDAGISel::onlyUsesZeroFlag(SDValue Flags) const {
3332 // Examine each user of the node.
3333 for (SDUse &Use : Flags->uses()) {
3334 // Only check things that use the flags.
3335 if (Use.getResNo() != Flags.getResNo())
3336 continue;
3337 SDNode *User = Use.getUser();
3338 // Only examine CopyToReg uses that copy to EFLAGS.
3339 if (User->getOpcode() != ISD::CopyToReg ||
3340 cast<RegisterSDNode>(User->getOperand(1))->getReg() != X86::EFLAGS)
3341 return false;
3342 // Examine each user of the CopyToReg use.
3343 for (SDUse &FlagUse : User->uses()) {
3344 // Only examine the Flag result.
3345 if (FlagUse.getResNo() != 1)
3346 continue;
3347 // Anything unusual: assume conservatively.
3348 if (!FlagUse.getUser()->isMachineOpcode())
3349 return false;
3350 // Examine the condition code of the user.
3351 X86::CondCode CC = getCondFromNode(FlagUse.getUser());
3352
3353 switch (CC) {
3354 // Comparisons which only use the zero flag.
3355 case X86::COND_E: case X86::COND_NE:
3356 continue;
3357 // Anything else: assume conservatively.
3358 default:
3359 return false;
3360 }
3361 }
3362 }
3363 return true;
3364}
3365
3366 /// Return true if the given X86ISD::CMP node has no uses which require the SF
3367 /// flag to be accurate.
3368bool X86DAGToDAGISel::hasNoSignFlagUses(SDValue Flags) const {
3369 // Examine each user of the node.
3370 for (SDUse &Use : Flags->uses()) {
3371 // Only check things that use the flags.
3372 if (Use.getResNo() != Flags.getResNo())
3373 continue;
3374 SDNode *User = Use.getUser();
3375 // Only examine CopyToReg uses that copy to EFLAGS.
3376 if (User->getOpcode() != ISD::CopyToReg ||
3377 cast<RegisterSDNode>(User->getOperand(1))->getReg() != X86::EFLAGS)
3378 return false;
3379 // Examine each user of the CopyToReg use.
3380 for (SDUse &FlagUse : User->uses()) {
3381 // Only examine the Flag result.
3382 if (FlagUse.getResNo() != 1)
3383 continue;
3384 // Anything unusual: assume conservatively.
3385 if (!FlagUse.getUser()->isMachineOpcode())
3386 return false;
3387 // Examine the condition code of the user.
3388 X86::CondCode CC = getCondFromNode(FlagUse.getUser());
3389
3390 switch (CC) {
3391 // Comparisons which don't examine the SF flag.
3392 case X86::COND_A: case X86::COND_AE:
3393 case X86::COND_B: case X86::COND_BE:
3394 case X86::COND_E: case X86::COND_NE:
3395 case X86::COND_O: case X86::COND_NO:
3396 case X86::COND_P: case X86::COND_NP:
3397 continue;
3398 // Anything else: assume conservatively.
3399 default:
3400 return false;
3401 }
3402 }
3403 }
3404 return true;
3405}
3406
3407 static bool mayUseCarryFlag(X86::CondCode CC) {
3408 switch (CC) {
3409 // Comparisons which don't examine the CF flag.
3410 case X86::COND_O: case X86::COND_NO:
3411 case X86::COND_E: case X86::COND_NE:
3412 case X86::COND_S: case X86::COND_NS:
3413 case X86::COND_P: case X86::COND_NP:
3414 case X86::COND_L: case X86::COND_GE:
3415 case X86::COND_G: case X86::COND_LE:
3416 return false;
3417 // Anything else: assume conservatively.
3418 default:
3419 return true;
3420 }
3421}
3422
3423 /// Return true if the given node which sets flags has no uses which require
3424 /// the CF flag to be accurate.
3425 bool X86DAGToDAGISel::hasNoCarryFlagUses(SDValue Flags) const {
3426 // Examine each user of the node.
3427 for (SDUse &Use : Flags->uses()) {
3428 // Only check things that use the flags.
3429 if (Use.getResNo() != Flags.getResNo())
3430 continue;
3431
3432 SDNode *User = Use.getUser();
3433 unsigned UserOpc = User->getOpcode();
3434
3435 if (UserOpc == ISD::CopyToReg) {
3436 // Only examine CopyToReg uses that copy to EFLAGS.
3437 if (cast<RegisterSDNode>(User->getOperand(1))->getReg() != X86::EFLAGS)
3438 return false;
3439 // Examine each user of the CopyToReg use.
3440 for (SDUse &FlagUse : User->uses()) {
3441 // Only examine the Flag result.
3442 if (FlagUse.getResNo() != 1)
3443 continue;
3444 // Anything unusual: assume conservatively.
3445 if (!FlagUse.getUser()->isMachineOpcode())
3446 return false;
3447 // Examine the condition code of the user.
3448 X86::CondCode CC = getCondFromNode(FlagUse.getUser());
3449
3450 if (mayUseCarryFlag(CC))
3451 return false;
3452 }
3453
3454 // This CopyToReg is ok. Move on to the next user.
3455 continue;
3456 }
3457
3458 // This might be an unselected node. So look for the pre-isel opcodes that
3459 // use flags.
3460 unsigned CCOpNo;
3461 switch (UserOpc) {
3462 default:
3463 // Something unusual. Be conservative.
3464 return false;
3465 case X86ISD::SETCC: CCOpNo = 0; break;
3466 case X86ISD::SETCC_CARRY: CCOpNo = 0; break;
3467 case X86ISD::CMOV: CCOpNo = 2; break;
3468 case X86ISD::BRCOND: CCOpNo = 2; break;
3469 }
3470
3471 X86::CondCode CC = (X86::CondCode)User->getConstantOperandVal(CCOpNo);
3472 if (mayUseCarryFlag(CC))
3473 return false;
3474 }
3475 return true;
3476}
3477
3478/// Check whether or not the chain ending in StoreNode is suitable for doing
3479/// the {load; op; store} to modify transformation.
3481 SDValue StoredVal, SelectionDAG *CurDAG,
3482 unsigned LoadOpNo,
3483 LoadSDNode *&LoadNode,
3484 SDValue &InputChain) {
3485 // Is the stored value result 0 of the operation?
3486 if (StoredVal.getResNo() != 0) return false;
3487
3488 // Are there other uses of the operation other than the store?
3489 if (!StoredVal.getNode()->hasNUsesOfValue(1, 0)) return false;
3490
3491 // Is the store non-extending and non-indexed?
3492 if (!ISD::isNormalStore(StoreNode) || StoreNode->isNonTemporal())
3493 return false;
3494
3495 SDValue Load = StoredVal->getOperand(LoadOpNo);
3496 // Is the stored value a non-extending and non-indexed load?
3497 if (!ISD::isNormalLoad(Load.getNode())) return false;
3498
3499 // Return LoadNode by reference.
3500 LoadNode = cast<LoadSDNode>(Load);
3501
3502 // Is store the only read of the loaded value?
3503 if (!Load.hasOneUse())
3504 return false;
3505
3506 // Is the address of the store the same as the load?
3507 if (LoadNode->getBasePtr() != StoreNode->getBasePtr() ||
3508 LoadNode->getOffset() != StoreNode->getOffset())
3509 return false;
3510
3511 bool FoundLoad = false;
3512 SmallVector<SDValue, 4> ChainOps;
3513 SmallVector<const SDNode *, 4> LoopWorklist;
3514 SmallPtrSet<const SDNode *, 16> Visited;
3515 const unsigned int Max = 1024;
3516
3517 // Visualization of Load-Op-Store fusion:
3518 // -------------------------
3519 // Legend:
3520 // *-lines = Chain operand dependencies.
3521 // |-lines = Normal operand dependencies.
3522 // Dependencies flow down and right. n-suffix references multiple nodes.
3523 //
3524 // C Xn C
3525 // * * *
3526 // * * *
3527 // Xn A-LD Yn TF Yn
3528 // * * \ | * |
3529 // * * \ | * |
3530 // * * \ | => A--LD_OP_ST
3531 // * * \| \
3532 // TF OP \
3533 // * | \ Zn
3534 // * | \
3535 // A-ST Zn
3536 //
3537
3538 // This merge induces dependences from: #1: Xn -> LD, OP, Zn
3539 // #2: Yn -> LD
3540 // #3: ST -> Zn
3541
3542 // Ensure the transform is safe by checking for the dual
3543 // dependencies to make sure we do not induce a loop.
3544
3545 // As LD is a predecessor to both OP and ST we can do this by checking:
3546 // a). if LD is a predecessor to a member of Xn or Yn.
3547 // b). if a Zn is a predecessor to ST.
3548
3549 // However, (b) can only occur through being a chain predecessor to
3550 // ST, which is the same as Zn being a member or predecessor of Xn,
3551 // which is a subset of LD being a predecessor of Xn. So it's
3552 // subsumed by check (a).
3553
3554 SDValue Chain = StoreNode->getChain();
3555
3556 // Gather X elements in ChainOps.
3557 if (Chain == Load.getValue(1)) {
3558 FoundLoad = true;
3559 ChainOps.push_back(Load.getOperand(0));
3560 } else if (Chain.getOpcode() == ISD::TokenFactor) {
3561 for (unsigned i = 0, e = Chain.getNumOperands(); i != e; ++i) {
3562 SDValue Op = Chain.getOperand(i);
3563 if (Op == Load.getValue(1)) {
3564 FoundLoad = true;
3565 // Drop Load, but keep its chain. No cycle check necessary.
3566 ChainOps.push_back(Load.getOperand(0));
3567 continue;
3568 }
3569 LoopWorklist.push_back(Op.getNode());
3570 ChainOps.push_back(Op);
3571 }
3572 }
3573
3574 if (!FoundLoad)
3575 return false;
3576
3577 // Worklist is currently Xn. Add Yn to worklist.
3578 for (SDValue Op : StoredVal->ops())
3579 if (Op.getNode() != LoadNode)
3580 LoopWorklist.push_back(Op.getNode());
3581
3582 // Check (a) if Load is a predecessor to Xn + Yn
3583 if (SDNode::hasPredecessorHelper(Load.getNode(), Visited, LoopWorklist, Max,
3584 true))
3585 return false;
3586
3587 InputChain =
3588 CurDAG->getNode(ISD::TokenFactor, SDLoc(Chain), MVT::Other, ChainOps);
3589 return true;
3590}
3591
3592// Change a chain of {load; op; store} of the same value into a simple op
3593// through memory of that value, if the uses of the modified value and its
3594// address are suitable.
3595//
3596 // The tablegen memory operand pattern is currently not able to match
3597// the case where the EFLAGS on the original operation are used.
3598//
3599// To move this to tablegen, we'll need to improve tablegen to allow flags to
3600// be transferred from a node in the pattern to the result node, probably with
3601// a new keyword. For example, we have this
3602// def DEC64m : RI<0xFF, MRM1m, (outs), (ins i64mem:$dst), "dec{q}\t$dst",
3603// [(store (add (loadi64 addr:$dst), -1), addr:$dst)]>;
3604// but maybe need something like this
3605// def DEC64m : RI<0xFF, MRM1m, (outs), (ins i64mem:$dst), "dec{q}\t$dst",
3606// [(store (X86add_flag (loadi64 addr:$dst), -1), addr:$dst),
3607// (transferrable EFLAGS)]>;
3608//
3609// Until then, we manually fold these and instruction select the operation
3610// here.
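// As a rough illustration of the overall effect, a chain such as
//   t = load [p]; t2 = add t, 5; store t2, [p]
// is folded here into a single read-modify-write instruction, e.g.
// "addl $5, (%rdi)" (ADD32mi carrying the memory operands of the load/store).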
3611bool X86DAGToDAGISel::foldLoadStoreIntoMemOperand(SDNode *Node) {
3612 auto *StoreNode = cast<StoreSDNode>(Node);
3613 SDValue StoredVal = StoreNode->getOperand(1);
3614 unsigned Opc = StoredVal->getOpcode();
3615
3616 // Before we try to select anything, make sure this is a memory operand size
3617 // and opcode we can handle. Note that this must match the code below that
3618 // actually lowers the opcodes.
3619 EVT MemVT = StoreNode->getMemoryVT();
3620 if (MemVT != MVT::i64 && MemVT != MVT::i32 && MemVT != MVT::i16 &&
3621 MemVT != MVT::i8)
3622 return false;
3623
3624 bool IsCommutable = false;
3625 bool IsNegate = false;
3626 switch (Opc) {
3627 default:
3628 return false;
3629 case X86ISD::SUB:
3630 IsNegate = isNullConstant(StoredVal.getOperand(0));
3631 break;
3632 case X86ISD::SBB:
3633 break;
3634 case X86ISD::ADD:
3635 case X86ISD::ADC:
3636 case X86ISD::AND:
3637 case X86ISD::OR:
3638 case X86ISD::XOR:
3639 IsCommutable = true;
3640 break;
3641 }
3642
3643 unsigned LoadOpNo = IsNegate ? 1 : 0;
3644 LoadSDNode *LoadNode = nullptr;
3645 SDValue InputChain;
3646 if (!isFusableLoadOpStorePattern(StoreNode, StoredVal, CurDAG, LoadOpNo,
3647 LoadNode, InputChain)) {
3648 if (!IsCommutable)
3649 return false;
3650
3651 // This operation is commutable, try the other operand.
3652 LoadOpNo = 1;
3653 if (!isFusableLoadOpStorePattern(StoreNode, StoredVal, CurDAG, LoadOpNo,
3654 LoadNode, InputChain))
3655 return false;
3656 }
3657
3658 SDValue Base, Scale, Index, Disp, Segment;
3659 if (!selectAddr(LoadNode, LoadNode->getBasePtr(), Base, Scale, Index, Disp,
3660 Segment))
3661 return false;
3662
3663 auto SelectOpcode = [&](unsigned Opc64, unsigned Opc32, unsigned Opc16,
3664 unsigned Opc8) {
3665 switch (MemVT.getSimpleVT().SimpleTy) {
3666 case MVT::i64:
3667 return Opc64;
3668 case MVT::i32:
3669 return Opc32;
3670 case MVT::i16:
3671 return Opc16;
3672 case MVT::i8:
3673 return Opc8;
3674 default:
3675 llvm_unreachable("Invalid size!");
3676 }
3677 };
3678
3679 MachineSDNode *Result;
3680 switch (Opc) {
3681 case X86ISD::SUB:
3682 // Handle negate.
3683 if (IsNegate) {
3684 unsigned NewOpc = SelectOpcode(X86::NEG64m, X86::NEG32m, X86::NEG16m,
3685 X86::NEG8m);
3686 const SDValue Ops[] = {Base, Scale, Index, Disp, Segment, InputChain};
3687 Result = CurDAG->getMachineNode(NewOpc, SDLoc(Node), MVT::i32,
3688 MVT::Other, Ops);
3689 break;
3690 }
3691 [[fallthrough]];
3692 case X86ISD::ADD:
3693 // Try to match inc/dec.
3694 if (!Subtarget->slowIncDec() || CurDAG->shouldOptForSize()) {
3695 bool IsOne = isOneConstant(StoredVal.getOperand(1));
3696 bool IsNegOne = isAllOnesConstant(StoredVal.getOperand(1));
3697 // ADD/SUB with 1/-1 can use INC/DEC when the carry flag isn't used.
3698 if ((IsOne || IsNegOne) && hasNoCarryFlagUses(StoredVal.getValue(1))) {
3699 unsigned NewOpc =
3700 ((Opc == X86ISD::ADD) == IsOne)
3701 ? SelectOpcode(X86::INC64m, X86::INC32m, X86::INC16m, X86::INC8m)
3702 : SelectOpcode(X86::DEC64m, X86::DEC32m, X86::DEC16m, X86::DEC8m);
3703 const SDValue Ops[] = {Base, Scale, Index, Disp, Segment, InputChain};
3704 Result = CurDAG->getMachineNode(NewOpc, SDLoc(Node), MVT::i32,
3705 MVT::Other, Ops);
3706 break;
3707 }
3708 }
3709 [[fallthrough]];
3710 case X86ISD::ADC:
3711 case X86ISD::SBB:
3712 case X86ISD::AND:
3713 case X86ISD::OR:
3714 case X86ISD::XOR: {
3715 auto SelectRegOpcode = [SelectOpcode](unsigned Opc) {
3716 switch (Opc) {
3717 case X86ISD::ADD:
3718 return SelectOpcode(X86::ADD64mr, X86::ADD32mr, X86::ADD16mr,
3719 X86::ADD8mr);
3720 case X86ISD::ADC:
3721 return SelectOpcode(X86::ADC64mr, X86::ADC32mr, X86::ADC16mr,
3722 X86::ADC8mr);
3723 case X86ISD::SUB:
3724 return SelectOpcode(X86::SUB64mr, X86::SUB32mr, X86::SUB16mr,
3725 X86::SUB8mr);
3726 case X86ISD::SBB:
3727 return SelectOpcode(X86::SBB64mr, X86::SBB32mr, X86::SBB16mr,
3728 X86::SBB8mr);
3729 case X86ISD::AND:
3730 return SelectOpcode(X86::AND64mr, X86::AND32mr, X86::AND16mr,
3731 X86::AND8mr);
3732 case X86ISD::OR:
3733 return SelectOpcode(X86::OR64mr, X86::OR32mr, X86::OR16mr, X86::OR8mr);
3734 case X86ISD::XOR:
3735 return SelectOpcode(X86::XOR64mr, X86::XOR32mr, X86::XOR16mr,
3736 X86::XOR8mr);
3737 default:
3738 llvm_unreachable("Invalid opcode!");
3739 }
3740 };
3741 auto SelectImmOpcode = [SelectOpcode](unsigned Opc) {
3742 switch (Opc) {
3743 case X86ISD::ADD:
3744 return SelectOpcode(X86::ADD64mi32, X86::ADD32mi, X86::ADD16mi,
3745 X86::ADD8mi);
3746 case X86ISD::ADC:
3747 return SelectOpcode(X86::ADC64mi32, X86::ADC32mi, X86::ADC16mi,
3748 X86::ADC8mi);
3749 case X86ISD::SUB:
3750 return SelectOpcode(X86::SUB64mi32, X86::SUB32mi, X86::SUB16mi,
3751 X86::SUB8mi);
3752 case X86ISD::SBB:
3753 return SelectOpcode(X86::SBB64mi32, X86::SBB32mi, X86::SBB16mi,
3754 X86::SBB8mi);
3755 case X86ISD::AND:
3756 return SelectOpcode(X86::AND64mi32, X86::AND32mi, X86::AND16mi,
3757 X86::AND8mi);
3758 case X86ISD::OR:
3759 return SelectOpcode(X86::OR64mi32, X86::OR32mi, X86::OR16mi,
3760 X86::OR8mi);
3761 case X86ISD::XOR:
3762 return SelectOpcode(X86::XOR64mi32, X86::XOR32mi, X86::XOR16mi,
3763 X86::XOR8mi);
3764 default:
3765 llvm_unreachable("Invalid opcode!");
3766 }
3767 };
3768
3769 unsigned NewOpc = SelectRegOpcode(Opc);
3770 SDValue Operand = StoredVal->getOperand(1-LoadOpNo);
3771
3772 // See if the operand is a constant that we can fold into an immediate
3773 // operand.
3774 if (auto *OperandC = dyn_cast<ConstantSDNode>(Operand)) {
3775 int64_t OperandV = OperandC->getSExtValue();
3776
3777 // Check if we can shrink the operand enough to fit in an immediate (or
3778 // fit into a smaller immediate) by negating it and switching the
3779 // operation.
3780 if ((Opc == X86ISD::ADD || Opc == X86ISD::SUB) &&
3781 ((MemVT != MVT::i8 && !isInt<8>(OperandV) && isInt<8>(-OperandV)) ||
3782 (MemVT == MVT::i64 && !isInt<32>(OperandV) &&
3783 isInt<32>(-OperandV))) &&
3784 hasNoCarryFlagUses(StoredVal.getValue(1))) {
3785 OperandV = -OperandV;
3786 Opc = Opc == X86ISD::ADD ? X86ISD::SUB : X86ISD::ADD;
3787 }
3788
3789 if (MemVT != MVT::i64 || isInt<32>(OperandV)) {
3790 Operand = CurDAG->getSignedTargetConstant(OperandV, SDLoc(Node), MemVT);
3791 NewOpc = SelectImmOpcode(Opc);
3792 }
3793 }
3794
3795 if (Opc == X86ISD::ADC || Opc == X86ISD::SBB) {
3796 SDValue CopyTo =
3797 CurDAG->getCopyToReg(InputChain, SDLoc(Node), X86::EFLAGS,
3798 StoredVal.getOperand(2), SDValue());
3799
3800 const SDValue Ops[] = {Base, Scale, Index, Disp,
3801 Segment, Operand, CopyTo, CopyTo.getValue(1)};
3802 Result = CurDAG->getMachineNode(NewOpc, SDLoc(Node), MVT::i32, MVT::Other,
3803 Ops);
3804 } else {
3805 const SDValue Ops[] = {Base, Scale, Index, Disp,
3806 Segment, Operand, InputChain};
3807 Result = CurDAG->getMachineNode(NewOpc, SDLoc(Node), MVT::i32, MVT::Other,
3808 Ops);
3809 }
3810 break;
3811 }
3812 default:
3813 llvm_unreachable("Invalid opcode!");
3814 }
3815
3816 MachineMemOperand *MemOps[] = {StoreNode->getMemOperand(),
3817 LoadNode->getMemOperand()};
3818 CurDAG->setNodeMemRefs(Result, MemOps);
3819
3820 // Update Load Chain uses as well.
3821 ReplaceUses(SDValue(LoadNode, 1), SDValue(Result, 1));
3822 ReplaceUses(SDValue(StoreNode, 0), SDValue(Result, 1));
3823 ReplaceUses(SDValue(StoredVal.getNode(), 1), SDValue(Result, 0));
3824 CurDAG->RemoveDeadNode(Node);
3825 return true;
3826}
3827
3828// See if this is an X & Mask that we can match to BEXTR/BZHI.
3829// Where Mask is one of the following patterns:
3830// a) x & (1 << nbits) - 1
3831// b) x & ~(-1 << nbits)
3832// c) x & (-1 >> (32 - y))
3833// d) x << (32 - y) >> (32 - y)
3834// e) (1 << nbits) - 1
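// For instance, an i32 (and X, (add (shl 1, N), -1)) matches pattern a) with
// NBits = N; with BMI2 this becomes a BZHI (keep the low N bits), and with
// only BMI1 it becomes a BEXTR whose control is roughly (N << 8).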
3835bool X86DAGToDAGISel::matchBitExtract(SDNode *Node) {
3836 assert(
3837 (Node->getOpcode() == ISD::ADD || Node->getOpcode() == ISD::AND ||
3838 Node->getOpcode() == ISD::SRL) &&
3839 "Should be either an and-mask, or right-shift after clearing high bits.");
3840
3841 // BEXTR is a BMI instruction, BZHI is a BMI2 instruction. We need at least one.
3842 if (!Subtarget->hasBMI() && !Subtarget->hasBMI2())
3843 return false;
3844
3845 MVT NVT = Node->getSimpleValueType(0);
3846
3847 // Only supported for 32 and 64 bits.
3848 if (NVT != MVT::i32 && NVT != MVT::i64)
3849 return false;
3850
3851 SDValue NBits;
3852 bool NegateNBits;
3853
3854 // If we have BMI2's BZHI, we are ok with multi-use patterns.
3855 // Else, if we only have BMI1's BEXTR, we require one-use.
3856 const bool AllowExtraUsesByDefault = Subtarget->hasBMI2();
3857 auto checkUses = [AllowExtraUsesByDefault](
3858 SDValue Op, unsigned NUses,
3859 std::optional<bool> AllowExtraUses) {
3860 return AllowExtraUses.value_or(AllowExtraUsesByDefault) ||
3861 Op.getNode()->hasNUsesOfValue(NUses, Op.getResNo());
3862 };
3863 auto checkOneUse = [checkUses](SDValue Op,
3864 std::optional<bool> AllowExtraUses =
3865 std::nullopt) {
3866 return checkUses(Op, 1, AllowExtraUses);
3867 };
3868 auto checkTwoUse = [checkUses](SDValue Op,
3869 std::optional<bool> AllowExtraUses =
3870 std::nullopt) {
3871 return checkUses(Op, 2, AllowExtraUses);
3872 };
3873
3874 auto peekThroughOneUseTruncation = [checkOneUse](SDValue V) {
3875 if (V->getOpcode() == ISD::TRUNCATE && checkOneUse(V)) {
3876 assert(V.getSimpleValueType() == MVT::i32 &&
3877 V.getOperand(0).getSimpleValueType() == MVT::i64 &&
3878 "Expected i64 -> i32 truncation");
3879 V = V.getOperand(0);
3880 }
3881 return V;
3882 };
3883
3884 // a) x & ((1 << nbits) + (-1))
3885 auto matchPatternA = [checkOneUse, peekThroughOneUseTruncation, &NBits,
3886 &NegateNBits](SDValue Mask) -> bool {
3887 // Match `add`. Must only have one use!
3888 if (Mask->getOpcode() != ISD::ADD || !checkOneUse(Mask))
3889 return false;
3890 // We should be adding an all-ones constant (i.e. subtracting one).
3891 if (!isAllOnesConstant(Mask->getOperand(1)))
3892 return false;
3893 // Match `1 << nbits`. Might be truncated. Must only have one use!
3894 SDValue M0 = peekThroughOneUseTruncation(Mask->getOperand(0));
3895 if (M0->getOpcode() != ISD::SHL || !checkOneUse(M0))
3896 return false;
3897 if (!isOneConstant(M0->getOperand(0)))
3898 return false;
3899 NBits = M0->getOperand(1);
3900 NegateNBits = false;
3901 return true;
3902 };
3903
3904 auto isAllOnes = [this, peekThroughOneUseTruncation, NVT](SDValue V) {
3905 V = peekThroughOneUseTruncation(V);
3906 return CurDAG->MaskedValueIsAllOnes(
3907 V, APInt::getLowBitsSet(V.getSimpleValueType().getSizeInBits(),
3908 NVT.getSizeInBits()));
3909 };
3910
3911 // b) x & ~(-1 << nbits)
3912 auto matchPatternB = [checkOneUse, isAllOnes, peekThroughOneUseTruncation,
3913 &NBits, &NegateNBits](SDValue Mask) -> bool {
3914 // Match `~()`. Must only have one use!
3915 if (Mask.getOpcode() != ISD::XOR || !checkOneUse(Mask))
3916 return false;
3917 // The -1 only has to be all-ones for the final Node's NVT.
3918 if (!isAllOnes(Mask->getOperand(1)))
3919 return false;
3920 // Match `-1 << nbits`. Might be truncated. Must only have one use!
3921 SDValue M0 = peekThroughOneUseTruncation(Mask->getOperand(0));
3922 if (M0->getOpcode() != ISD::SHL || !checkOneUse(M0))
3923 return false;
3924 // The -1 only has to be all-ones for the final Node's NVT.
3925 if (!isAllOnes(M0->getOperand(0)))
3926 return false;
3927 NBits = M0->getOperand(1);
3928 NegateNBits = false;
3929 return true;
3930 };
3931
3932 // Try to match potentially-truncated shift amount as `(bitwidth - y)`,
3933 // or leave the shift amount as-is, but then we'll have to negate it.
3934 auto canonicalizeShiftAmt = [&NBits, &NegateNBits](SDValue ShiftAmt,
3935 unsigned Bitwidth) {
3936 NBits = ShiftAmt;
3937 NegateNBits = true;
3938 // Skip over a truncate of the shift amount, if any.
3939 if (NBits.getOpcode() == ISD::TRUNCATE)
3940 NBits = NBits.getOperand(0);
3941 // Try to match the shift amount as (bitwidth - y). It should go away, too.
3942 // If it doesn't match, that's fine, we'll just negate it ourselves.
3943 if (NBits.getOpcode() != ISD::SUB)
3944 return;
3945 auto *V0 = dyn_cast<ConstantSDNode>(NBits.getOperand(0));
3946 if (!V0 || V0->getZExtValue() != Bitwidth)
3947 return;
3948 NBits = NBits.getOperand(1);
3949 NegateNBits = false;
3950 };
3951
3952 // c) x & (-1 >> z) but then we'll have to subtract z from bitwidth
3953 // or
3954 // c) x & (-1 >> (32 - y))
3955 auto matchPatternC = [checkOneUse, peekThroughOneUseTruncation, &NegateNBits,
3956 canonicalizeShiftAmt](SDValue Mask) -> bool {
3957 // The mask itself may be truncated.
3958 Mask = peekThroughOneUseTruncation(Mask);
3959 unsigned Bitwidth = Mask.getSimpleValueType().getSizeInBits();
3960 // Match `l>>`. Must only have one use!
3961 if (Mask.getOpcode() != ISD::SRL || !checkOneUse(Mask))
3962 return false;
3963 // We should be shifting truly all-ones constant.
3964 if (!isAllOnesConstant(Mask.getOperand(0)))
3965 return false;
3966 SDValue M1 = Mask.getOperand(1);
3967 // The shift amount should not be used externally.
3968 if (!checkOneUse(M1))
3969 return false;
3970 canonicalizeShiftAmt(M1, Bitwidth);
3971 // Pattern c. is non-canonical, and is expanded into pattern d. iff there
3972 // is no extra use of the mask. Clearly, there was one since we are here.
3973 // But at the same time, if we need to negate the shift amount,
3974 // then we don't want the mask to stick around, else it's unprofitable.
3975 return !NegateNBits;
3976 };
3977
3978 SDValue X;
3979
3980 // d) x << z >> z but then we'll have to subtract z from bitwidth
3981 // or
3982 // d) x << (32 - y) >> (32 - y)
3983 auto matchPatternD = [checkOneUse, checkTwoUse, canonicalizeShiftAmt,
3984 AllowExtraUsesByDefault, &NegateNBits,
3985 &X](SDNode *Node) -> bool {
3986 if (Node->getOpcode() != ISD::SRL)
3987 return false;
3988 SDValue N0 = Node->getOperand(0);
3989 if (N0->getOpcode() != ISD::SHL)
3990 return false;
3991 unsigned Bitwidth = N0.getSimpleValueType().getSizeInBits();
3992 SDValue N1 = Node->getOperand(1);
3993 SDValue N01 = N0->getOperand(1);
3994 // Both of the shifts must be by the exact same value.
3995 if (N1 != N01)
3996 return false;
3997 canonicalizeShiftAmt(N1, Bitwidth);
3998 // There should not be any external uses of the inner shift / shift amount.
3999 // Note that while we are generally okay with external uses given BMI2,
4000 // iff we need to negate the shift amount, we are not okay with extra uses.
4001 const bool AllowExtraUses = AllowExtraUsesByDefault && !NegateNBits;
4002 if (!checkOneUse(N0, AllowExtraUses) || !checkTwoUse(N1, AllowExtraUses))
4003 return false;
4004 X = N0->getOperand(0);
4005 return true;
4006 };
4007
4008 auto matchLowBitMask = [matchPatternA, matchPatternB,
4009 matchPatternC](SDValue Mask) -> bool {
4010 return matchPatternA(Mask) || matchPatternB(Mask) || matchPatternC(Mask);
4011 };
4012
4013 if (Node->getOpcode() == ISD::AND) {
4014 X = Node->getOperand(0);
4015 SDValue Mask = Node->getOperand(1);
4016
4017 if (matchLowBitMask(Mask)) {
4018 // Great.
4019 } else {
4020 std::swap(X, Mask);
4021 if (!matchLowBitMask(Mask))
4022 return false;
4023 }
4024 } else if (matchLowBitMask(SDValue(Node, 0))) {
4025 X = CurDAG->getAllOnesConstant(SDLoc(Node), NVT);
4026 } else if (!matchPatternD(Node))
4027 return false;
4028
4029 // If we need to negate the shift amount, require BMI2 BZHI support.
4030 // It's just too unprofitable for BMI1 BEXTR.
4031 if (NegateNBits && !Subtarget->hasBMI2())
4032 return false;
4033
4034 SDLoc DL(Node);
4035
4036 // Truncate the shift amount.
4037 NBits = CurDAG->getNode(ISD::TRUNCATE, DL, MVT::i8, NBits);
4038 insertDAGNode(*CurDAG, SDValue(Node, 0), NBits);
4039
4040 // Insert 8-bit NBits into lowest 8 bits of 32-bit register.
4041 // All the other bits are undefined, we do not care about them.
4042 SDValue ImplDef = SDValue(
4043 CurDAG->getMachineNode(TargetOpcode::IMPLICIT_DEF, DL, MVT::i32), 0);
4044 insertDAGNode(*CurDAG, SDValue(Node, 0), ImplDef);
4045
4046 SDValue SRIdxVal = CurDAG->getTargetConstant(X86::sub_8bit, DL, MVT::i32);
4047 insertDAGNode(*CurDAG, SDValue(Node, 0), SRIdxVal);
4048 NBits = SDValue(CurDAG->getMachineNode(TargetOpcode::INSERT_SUBREG, DL,
4049 MVT::i32, ImplDef, NBits, SRIdxVal),
4050 0);
4051 insertDAGNode(*CurDAG, SDValue(Node, 0), NBits);
4052
4053 // We might have matched the number of high bits to be cleared,
4054 // but we want the number of low bits to be kept, so negate it then.
4055 if (NegateNBits) {
4056 SDValue BitWidthC = CurDAG->getConstant(NVT.getSizeInBits(), DL, MVT::i32);
4057 insertDAGNode(*CurDAG, SDValue(Node, 0), BitWidthC);
4058
4059 NBits = CurDAG->getNode(ISD::SUB, DL, MVT::i32, BitWidthC, NBits);
4060 insertDAGNode(*CurDAG, SDValue(Node, 0), NBits);
4061 }
4062
4063 if (Subtarget->hasBMI2()) {
4064 // Great, just emit the BZHI..
4065 if (NVT != MVT::i32) {
4066 // But we have to place the bit count into the wide-enough register first.
4067 NBits = CurDAG->getNode(ISD::ANY_EXTEND, DL, NVT, NBits);
4068 insertDAGNode(*CurDAG, SDValue(Node, 0), NBits);
4069 }
4070
4071 SDValue Extract = CurDAG->getNode(X86ISD::BZHI, DL, NVT, X, NBits);
4072 ReplaceNode(Node, Extract.getNode());
4073 SelectCode(Extract.getNode());
4074 return true;
4075 }
4076
4077 // Else, if we do *NOT* have BMI2, let's find out if the 'X' is *logically*
4078 // shifted (potentially with a one-use trunc in between),
4079 // and the truncation was the only use of the shift,
4080 // and if so look past the one-use truncation.
4081 {
4082 SDValue RealX = peekThroughOneUseTruncation(X);
4083 // FIXME: only if the shift is one-use?
4084 if (RealX != X && RealX.getOpcode() == ISD::SRL)
4085 X = RealX;
4086 }
4087
4088 MVT XVT = X.getSimpleValueType();
4089
4090 // Else, emitting BEXTR requires one more step.
4091 // The 'control' of BEXTR has the pattern of:
4092 // [15...8 bit][ 7...0 bit] location
4093 // [ bit count][ shift] name
4094 // I.e. 0b00000010'00000001 means (x >> 0b1) & 0b11
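// For example, a control of 0x0805 would select (x >> 5) & 0xff, i.e. the
// 8 bits starting at bit 5.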
4095
4096 // Shift NBits left by 8 bits, thus producing 'control'.
4097 // This leaves the low 8 bits zero.
4098 SDValue C8 = CurDAG->getConstant(8, DL, MVT::i8);
4099 insertDAGNode(*CurDAG, SDValue(Node, 0), C8);
4100 SDValue Control = CurDAG->getNode(ISD::SHL, DL, MVT::i32, NBits, C8);
4101 insertDAGNode(*CurDAG, SDValue(Node, 0), Control);
4102
4103 // If the 'X' is *logically* shifted, we can fold that shift into 'control'.
4104 // FIXME: only if the shift is one-use?
4105 if (X.getOpcode() == ISD::SRL) {
4106 SDValue ShiftAmt = X.getOperand(1);
4107 X = X.getOperand(0);
4108
4109 assert(ShiftAmt.getValueType() == MVT::i8 &&
4110 "Expected shift amount to be i8");
4111
4112 // Now, *zero*-extend the shift amount. The bits 8...15 *must* be zero!
4113 // We could zext to i16 in some form, but we intentionally don't do that.
4114 SDValue OrigShiftAmt = ShiftAmt;
4115 ShiftAmt = CurDAG->getNode(ISD::ZERO_EXTEND, DL, MVT::i32, ShiftAmt);
4116 insertDAGNode(*CurDAG, OrigShiftAmt, ShiftAmt);
4117
4118 // And now 'or' these low 8 bits of shift amount into the 'control'.
4119 Control = CurDAG->getNode(ISD::OR, DL, MVT::i32, Control, ShiftAmt);
4120 insertDAGNode(*CurDAG, SDValue(Node, 0), Control);
4121 }
4122
4123 // But we have to place the 'control' into the wide-enough register first.
4124 if (XVT != MVT::i32) {
4125 Control = CurDAG->getNode(ISD::ANY_EXTEND, DL, XVT, Control);
4126 insertDAGNode(*CurDAG, SDValue(Node, 0), Control);
4127 }
4128
4129 // And finally, form the BEXTR itself.
4130 SDValue Extract = CurDAG->getNode(X86ISD::BEXTR, DL, XVT, X, Control);
4131
4132 // The 'X' was originally truncated. Re-apply that truncation now.
4133 if (XVT != NVT) {
4134 insertDAGNode(*CurDAG, SDValue(Node, 0), Extract);
4135 Extract = CurDAG->getNode(ISD::TRUNCATE, DL, NVT, Extract);
4136 }
4137
4138 ReplaceNode(Node, Extract.getNode());
4139 SelectCode(Extract.getNode());
4140
4141 return true;
4142}
4143
4144// See if this is an (X >> C1) & C2 that we can match to BEXTR/BEXTRI.
4145MachineSDNode *X86DAGToDAGISel::matchBEXTRFromAndImm(SDNode *Node) {
4146 MVT NVT = Node->getSimpleValueType(0);
4147 SDLoc dl(Node);
4148
4149 SDValue N0 = Node->getOperand(0);
4150 SDValue N1 = Node->getOperand(1);
4151
4152 // If we have TBM we can use an immediate for the control. If we have BMI
4153 // we should only do this if the BEXTR instruction is implemented well.
4154 // Otherwise moving the control into a register makes this more costly.
4155 // TODO: Maybe load folding, greater than 32-bit masks, or a guarantee of LICM
4156 // hoisting the move immediate would make it worthwhile with a less optimal
4157 // BEXTR?
4158 bool PreferBEXTR =
4159 Subtarget->hasTBM() || (Subtarget->hasBMI() && Subtarget->hasFastBEXTR());
4160 if (!PreferBEXTR && !Subtarget->hasBMI2())
4161 return nullptr;
4162
4163 // Must have a shift right.
4164 if (N0->getOpcode() != ISD::SRL && N0->getOpcode() != ISD::SRA)
4165 return nullptr;
4166
4167 // Shift can't have additional users.
4168 if (!N0->hasOneUse())
4169 return nullptr;
4170
4171 // Only supported for 32 and 64 bits.
4172 if (NVT != MVT::i32 && NVT != MVT::i64)
4173 return nullptr;
4174
4175 // Shift amount and RHS of and must be constant.
4176 auto *MaskCst = dyn_cast<ConstantSDNode>(N1);
4177 auto *ShiftCst = dyn_cast<ConstantSDNode>(N0->getOperand(1));
4178 if (!MaskCst || !ShiftCst)
4179 return nullptr;
4180
4181 // And RHS must be a mask.
4182 uint64_t Mask = MaskCst->getZExtValue();
4183 if (!isMask_64(Mask))
4184 return nullptr;
4185
4186 uint64_t Shift = ShiftCst->getZExtValue();
4187 uint64_t MaskSize = llvm::popcount(Mask);
4188
4189 // Don't interfere with something that can be handled by extracting AH.
4190 // TODO: If we are able to fold a load, BEXTR might still be better than AH.
4191 if (Shift == 8 && MaskSize == 8)
4192 return nullptr;
4193
4194 // Make sure we are only using bits that were in the original value, not
4195 // shifted in.
4196 if (Shift + MaskSize > NVT.getSizeInBits())
4197 return nullptr;
4198
4199 // BZHI, if available, is always fast, unlike BEXTR. But even if we decide
4200 // that we can't use BEXTR, it is only worthwhile using BZHI if the mask
4201 // does not fit into 32 bits. Load folding is not a sufficient reason.
4202 if (!PreferBEXTR && MaskSize <= 32)
4203 return nullptr;
4204
4205 SDValue Control;
4206 unsigned ROpc, MOpc;
4207
4208#define GET_EGPR_IF_ENABLED(OPC) (Subtarget->hasEGPR() ? OPC##_EVEX : OPC)
4209 if (!PreferBEXTR) {
4210 assert(Subtarget->hasBMI2() && "We must have BMI2's BZHI then.");
4211 // If we can't make use of BEXTR then we can't fuse shift+mask stages.
4212 // Let's perform the mask first, and apply shift later. Note that we need to
4213 // widen the mask to account for the fact that we'll apply shift afterwards!
4214 Control = CurDAG->getTargetConstant(Shift + MaskSize, dl, NVT);
4215 ROpc = NVT == MVT::i64 ? GET_EGPR_IF_ENABLED(X86::BZHI64rr)
4216 : GET_EGPR_IF_ENABLED(X86::BZHI32rr);
4217 MOpc = NVT == MVT::i64 ? GET_EGPR_IF_ENABLED(X86::BZHI64rm)
4218 : GET_EGPR_IF_ENABLED(X86::BZHI32rm);
4219 unsigned NewOpc = NVT == MVT::i64 ? X86::MOV32ri64 : X86::MOV32ri;
4220 Control = SDValue(CurDAG->getMachineNode(NewOpc, dl, NVT, Control), 0);
4221 } else {
4222 // The 'control' of BEXTR has the pattern of:
4223 // [15...8 bit][ 7...0 bit] location
4224 // [ bit count][ shift] name
4225 // I.e. 0b00000010'00000001 means (x >> 0b1) & 0b11
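// For instance, (x >> 4) & 0xfff gives Shift = 4 and MaskSize = 12, so the
// control is 0xC04, which with TBM would select a BEXTRI32ri/BEXTRI64ri.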
4226 Control = CurDAG->getTargetConstant(Shift | (MaskSize << 8), dl, NVT);
4227 if (Subtarget->hasTBM()) {
4228 ROpc = NVT == MVT::i64 ? X86::BEXTRI64ri : X86::BEXTRI32ri;
4229 MOpc = NVT == MVT::i64 ? X86::BEXTRI64mi : X86::BEXTRI32mi;
4230 } else {
4231 assert(Subtarget->hasBMI() && "We must have BMI1's BEXTR then.");
4232 // BMI requires the immediate to be placed in a register.
4233 ROpc = NVT == MVT::i64 ? GET_EGPR_IF_ENABLED(X86::BEXTR64rr)
4234 : GET_EGPR_IF_ENABLED(X86::BEXTR32rr);
4235 MOpc = NVT == MVT::i64 ? GET_EGPR_IF_ENABLED(X86::BEXTR64rm)
4236 : GET_EGPR_IF_ENABLED(X86::BEXTR32rm);
4237 unsigned NewOpc = NVT == MVT::i64 ? X86::MOV32ri64 : X86::MOV32ri;
4238 Control = SDValue(CurDAG->getMachineNode(NewOpc, dl, NVT, Control), 0);
4239 }
4240 }
4241
4242 MachineSDNode *NewNode;
4243 SDValue Input = N0->getOperand(0);
4244 SDValue Tmp0, Tmp1, Tmp2, Tmp3, Tmp4;
4245 if (tryFoldLoad(Node, N0.getNode(), Input, Tmp0, Tmp1, Tmp2, Tmp3, Tmp4)) {
4246 SDValue Ops[] = {
4247 Tmp0, Tmp1, Tmp2, Tmp3, Tmp4, Control, Input.getOperand(0)};
4248 SDVTList VTs = CurDAG->getVTList(NVT, MVT::i32, MVT::Other);
4249 NewNode = CurDAG->getMachineNode(MOpc, dl, VTs, Ops);
4250 // Update the chain.
4251 ReplaceUses(Input.getValue(1), SDValue(NewNode, 2));
4252 // Record the mem-refs
4253 CurDAG->setNodeMemRefs(NewNode, {cast<LoadSDNode>(Input)->getMemOperand()});
4254 } else {
4255 NewNode = CurDAG->getMachineNode(ROpc, dl, NVT, MVT::i32, Input, Control);
4256 }
4257
4258 if (!PreferBEXTR) {
4259 // We still need to apply the shift.
4260 SDValue ShAmt = CurDAG->getTargetConstant(Shift, dl, NVT);
4261 unsigned NewOpc = NVT == MVT::i64 ? GET_ND_IF_ENABLED(X86::SHR64ri)
4262 : GET_ND_IF_ENABLED(X86::SHR32ri);
4263 NewNode =
4264 CurDAG->getMachineNode(NewOpc, dl, NVT, SDValue(NewNode, 0), ShAmt);
4265 }
4266
4267 return NewNode;
4268}
4269
4270 // Emit a PCMPISTR(I/M) instruction.
4271MachineSDNode *X86DAGToDAGISel::emitPCMPISTR(unsigned ROpc, unsigned MOpc,
4272 bool MayFoldLoad, const SDLoc &dl,
4273 MVT VT, SDNode *Node) {
4274 SDValue N0 = Node->getOperand(0);
4275 SDValue N1 = Node->getOperand(1);
4276 SDValue Imm = Node->getOperand(2);
4277 auto *Val = cast<ConstantSDNode>(Imm)->getConstantIntValue();
4278 Imm = CurDAG->getTargetConstant(*Val, SDLoc(Node), Imm.getValueType());
4279
4280 // Try to fold a load. No need to check alignment.
4281 SDValue Tmp0, Tmp1, Tmp2, Tmp3, Tmp4;
4282 if (MayFoldLoad && tryFoldLoad(Node, N1, Tmp0, Tmp1, Tmp2, Tmp3, Tmp4)) {
4283 SDValue Ops[] = { N0, Tmp0, Tmp1, Tmp2, Tmp3, Tmp4, Imm,
4284 N1.getOperand(0) };
4285 SDVTList VTs = CurDAG->getVTList(VT, MVT::i32, MVT::Other);
4286 MachineSDNode *CNode = CurDAG->getMachineNode(MOpc, dl, VTs, Ops);
4287 // Update the chain.
4288 ReplaceUses(N1.getValue(1), SDValue(CNode, 2));
4289 // Record the mem-refs
4290 CurDAG->setNodeMemRefs(CNode, {cast<LoadSDNode>(N1)->getMemOperand()});
4291 return CNode;
4292 }
4293
4294 SDValue Ops[] = { N0, N1, Imm };
4295 SDVTList VTs = CurDAG->getVTList(VT, MVT::i32);
4296 MachineSDNode *CNode = CurDAG->getMachineNode(ROpc, dl, VTs, Ops);
4297 return CNode;
4298}
4299
4300 // Emit a PCMPESTR(I/M) instruction. Also return the Glue result in case we need
4301// to emit a second instruction after this one. This is needed since we have two
4302// copyToReg nodes glued before this and we need to continue that glue through.
4303MachineSDNode *X86DAGToDAGISel::emitPCMPESTR(unsigned ROpc, unsigned MOpc,
4304 bool MayFoldLoad, const SDLoc &dl,
4305 MVT VT, SDNode *Node,
4306 SDValue &InGlue) {
4307 SDValue N0 = Node->getOperand(0);
4308 SDValue N2 = Node->getOperand(2);
4309 SDValue Imm = Node->getOperand(4);
4310 auto *Val = cast<ConstantSDNode>(Imm)->getConstantIntValue();
4311 Imm = CurDAG->getTargetConstant(*Val, SDLoc(Node), Imm.getValueType());
4312
4313 // Try to fold a load. No need to check alignment.
4314 SDValue Tmp0, Tmp1, Tmp2, Tmp3, Tmp4;
4315 if (MayFoldLoad && tryFoldLoad(Node, N2, Tmp0, Tmp1, Tmp2, Tmp3, Tmp4)) {
4316 SDValue Ops[] = { N0, Tmp0, Tmp1, Tmp2, Tmp3, Tmp4, Imm,
4317 N2.getOperand(0), InGlue };
4318 SDVTList VTs = CurDAG->getVTList(VT, MVT::i32, MVT::Other, MVT::Glue);
4319 MachineSDNode *CNode = CurDAG->getMachineNode(MOpc, dl, VTs, Ops);
4320 InGlue = SDValue(CNode, 3);
4321 // Update the chain.
4322 ReplaceUses(N2.getValue(1), SDValue(CNode, 2));
4323 // Record the mem-refs
4324 CurDAG->setNodeMemRefs(CNode, {cast<LoadSDNode>(N2)->getMemOperand()});
4325 return CNode;
4326 }
4327
4328 SDValue Ops[] = { N0, N2, Imm, InGlue };
4329 SDVTList VTs = CurDAG->getVTList(VT, MVT::i32, MVT::Glue);
4330 MachineSDNode *CNode = CurDAG->getMachineNode(ROpc, dl, VTs, Ops);
4331 InGlue = SDValue(CNode, 2);
4332 return CNode;
4333}
4334
4335bool X86DAGToDAGISel::tryShiftAmountMod(SDNode *N) {
4336 EVT VT = N->getValueType(0);
4337
4338 // Only handle scalar shifts.
4339 if (VT.isVector())
4340 return false;
4341
4342 // Narrower shifts only mask to 5 bits in hardware.
4343 unsigned Size = VT == MVT::i64 ? 64 : 32;
4344
4345 SDValue OrigShiftAmt = N->getOperand(1);
4346 SDValue ShiftAmt = OrigShiftAmt;
4347 SDLoc DL(N);
4348
4349 // Skip over a truncate of the shift amount.
4350 if (ShiftAmt->getOpcode() == ISD::TRUNCATE)
4351 ShiftAmt = ShiftAmt->getOperand(0);
4352
4353 // This function is called after X86DAGToDAGISel::matchBitExtract(),
4354 // so we are not afraid that we might mess up BZHI/BEXTR pattern.
4355
4356 SDValue NewShiftAmt;
4357 if (ShiftAmt->getOpcode() == ISD::ADD || ShiftAmt->getOpcode() == ISD::SUB ||
4358 ShiftAmt->getOpcode() == ISD::XOR) {
4359 SDValue Add0 = ShiftAmt->getOperand(0);
4360 SDValue Add1 = ShiftAmt->getOperand(1);
4361 auto *Add0C = dyn_cast<ConstantSDNode>(Add0);
4362 auto *Add1C = dyn_cast<ConstantSDNode>(Add1);
4363 // If we are shifting by X+/-/^N where N == 0 mod Size, then just shift by X
4364 // to avoid the ADD/SUB/XOR.
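// For instance, an i64 shift by (add %x, 64) can simply shift by %x, since
// 64 % 64 == 0 and the hardware (plus the mask added below) only looks at
// the low bits of the amount anyway.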
4365 if (Add1C && Add1C->getAPIntValue().urem(Size) == 0) {
4366 NewShiftAmt = Add0;
4367
4368 } else if (ShiftAmt->getOpcode() != ISD::ADD && ShiftAmt.hasOneUse() &&
4369 ((Add0C && Add0C->getAPIntValue().urem(Size) == Size - 1) ||
4370 (Add1C && Add1C->getAPIntValue().urem(Size) == Size - 1))) {
4371 // If we are doing a NOT on just the lower bits with (Size*N-1) -/^ X
4372 // we can replace it with a NOT. In the XOR case it may save some code
4373 // size, in the SUB case it also may save a move.
4374 assert(Add0C == nullptr || Add1C == nullptr);
4375
4376 // We can only do N-X, not X-N
4377 if (ShiftAmt->getOpcode() == ISD::SUB && Add0C == nullptr)
4378 return false;
4379
4380 EVT OpVT = ShiftAmt.getValueType();
4381
4382 SDValue AllOnes = CurDAG->getAllOnesConstant(DL, OpVT);
4383 NewShiftAmt = CurDAG->getNode(ISD::XOR, DL, OpVT,
4384 Add0C == nullptr ? Add0 : Add1, AllOnes);
4385 insertDAGNode(*CurDAG, OrigShiftAmt, AllOnes);
4386 insertDAGNode(*CurDAG, OrigShiftAmt, NewShiftAmt);
4387 // If we are shifting by N-X where N == 0 mod Size, then just shift by
4388 // -X to generate a NEG instead of a SUB of a constant.
4389 } else if (ShiftAmt->getOpcode() == ISD::SUB && Add0C &&
4390 Add0C->getZExtValue() != 0) {
4391 EVT SubVT = ShiftAmt.getValueType();
4392 SDValue X;
4393 if (Add0C->getZExtValue() % Size == 0)
4394 X = Add1;
4395 else if (ShiftAmt.hasOneUse() && Size == 64 &&
4396 Add0C->getZExtValue() % 32 == 0) {
4397 // We have a 64-bit shift by (n*32-x), turn it into -(x+n*32).
4398 // This is mainly beneficial if we already compute (x+n*32).
4399 if (Add1.getOpcode() == ISD::TRUNCATE) {
4400 Add1 = Add1.getOperand(0);
4401 SubVT = Add1.getValueType();
4402 }
4403 if (Add0.getValueType() != SubVT) {
4404 Add0 = CurDAG->getZExtOrTrunc(Add0, DL, SubVT);
4405 insertDAGNode(*CurDAG, OrigShiftAmt, Add0);
4406 }
4407
4408 X = CurDAG->getNode(ISD::ADD, DL, SubVT, Add1, Add0);
4409 insertDAGNode(*CurDAG, OrigShiftAmt, X);
4410 } else
4411 return false;
4412 // Insert a negate op.
4413 // TODO: This isn't guaranteed to replace the sub if there is a logic cone
4414 // that uses it that's not a shift.
4415 SDValue Zero = CurDAG->getConstant(0, DL, SubVT);
4416 SDValue Neg = CurDAG->getNode(ISD::SUB, DL, SubVT, Zero, X);
4417 NewShiftAmt = Neg;
4418
4419 // Insert these operands into a valid topological order so they can
4420 // get selected independently.
4421 insertDAGNode(*CurDAG, OrigShiftAmt, Zero);
4422 insertDAGNode(*CurDAG, OrigShiftAmt, Neg);
4423 } else
4424 return false;
4425 } else
4426 return false;
4427
4428 if (NewShiftAmt.getValueType() != MVT::i8) {
4429 // Need to truncate the shift amount.
4430 NewShiftAmt = CurDAG->getNode(ISD::TRUNCATE, DL, MVT::i8, NewShiftAmt);
4431 // Add to a correct topological ordering.
4432 insertDAGNode(*CurDAG, OrigShiftAmt, NewShiftAmt);
4433 }
4434
4435 // Insert a new mask to keep the shift amount legal. This should be removed
4436 // by isel patterns.
4437 NewShiftAmt = CurDAG->getNode(ISD::AND, DL, MVT::i8, NewShiftAmt,
4438 CurDAG->getConstant(Size - 1, DL, MVT::i8));
4439 // Place in a correct topological ordering.
4440 insertDAGNode(*CurDAG, OrigShiftAmt, NewShiftAmt);
4441
4442 SDNode *UpdatedNode = CurDAG->UpdateNodeOperands(N, N->getOperand(0),
4443 NewShiftAmt);
4444 if (UpdatedNode != N) {
4445 // If we found an existing node, we should replace ourselves with that node
4446 // and wait for it to be selected after its other users.
4447 ReplaceNode(N, UpdatedNode);
4448 return true;
4449 }
4450
4451 // If the original shift amount is now dead, delete it so that we don't run
4452 // it through isel.
4453 if (OrigShiftAmt.getNode()->use_empty())
4454 CurDAG->RemoveDeadNode(OrigShiftAmt.getNode());
4455
4456 // Now that we've optimized the shift amount, defer to normal isel to get
4457 // load folding and legacy vs BMI2 selection without repeating it here.
4458 SelectCode(N);
4459 return true;
4460}
4461
4462bool X86DAGToDAGISel::tryShrinkShlLogicImm(SDNode *N) {
4463 MVT NVT = N->getSimpleValueType(0);
4464 unsigned Opcode = N->getOpcode();
4465 SDLoc dl(N);
4466
4467 // For operations of the form (x << C1) op C2, check if we can use a smaller
4468 // encoding for C2 by transforming it into (x op (C2>>C1)) << C1.
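// For instance, (or (shl X, 8), 0x1f00) can become (shl (or X, 0x1f), 8),
// letting the OR use an 8-bit immediate encoding instead of a 32-bit one.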
4469 SDValue Shift = N->getOperand(0);
4470 SDValue N1 = N->getOperand(1);
4471
4472 auto *Cst = dyn_cast<ConstantSDNode>(N1);
4473 if (!Cst)
4474 return false;
4475
4476 int64_t Val = Cst->getSExtValue();
4477
4478 // If we have an any_extend feeding the AND, look through it to see if there
4479 // is a shift behind it. But only if the AND doesn't use the extended bits.
4480 // FIXME: Generalize this to other ANY_EXTEND than i32 to i64?
4481 bool FoundAnyExtend = false;
4482 if (Shift.getOpcode() == ISD::ANY_EXTEND && Shift.hasOneUse() &&
4483 Shift.getOperand(0).getSimpleValueType() == MVT::i32 &&
4484 isUInt<32>(Val)) {
4485 FoundAnyExtend = true;
4486 Shift = Shift.getOperand(0);
4487 }
4488
4489 if (Shift.getOpcode() != ISD::SHL || !Shift.hasOneUse())
4490 return false;
4491
4492 // i8 is unshrinkable, i16 should be promoted to i32.
4493 if (NVT != MVT::i32 && NVT != MVT::i64)
4494 return false;
4495
4496 auto *ShlCst = dyn_cast<ConstantSDNode>(Shift.getOperand(1));
4497 if (!ShlCst)
4498 return false;
4499
4500 uint64_t ShAmt = ShlCst->getZExtValue();
4501
4502 // Make sure that we don't change the operation by removing bits.
4503 // This only matters for OR and XOR, AND is unaffected.
4504 uint64_t RemovedBitsMask = (1ULL << ShAmt) - 1;
4505 if (Opcode != ISD::AND && (Val & RemovedBitsMask) != 0)
4506 return false;
4507
4508 // Check the minimum bitwidth for the new constant.
4509 // TODO: Using 16 and 8 bit operations is also possible for or32 & xor32.
4510 auto CanShrinkImmediate = [&](int64_t &ShiftedVal) {
4511 if (Opcode == ISD::AND) {
4512 // AND32ri is the same as AND64ri32 with zext imm.
4513 // Try this before sign extended immediates below.
4514 ShiftedVal = (uint64_t)Val >> ShAmt;
4515 if (NVT == MVT::i64 && !isUInt<32>(Val) && isUInt<32>(ShiftedVal))
4516 return true;
4517 // Also swap order when the AND can become MOVZX.
4518 if (ShiftedVal == UINT8_MAX || ShiftedVal == UINT16_MAX)
4519 return true;
4520 }
4521 ShiftedVal = Val >> ShAmt;
4522 if ((!isInt<8>(Val) && isInt<8>(ShiftedVal)) ||
4523 (!isInt<32>(Val) && isInt<32>(ShiftedVal)))
4524 return true;
4525 if (Opcode != ISD::AND) {
4526 // MOV32ri+OR64r/XOR64r is cheaper than MOV64ri64+OR64rr/XOR64rr
4527 ShiftedVal = (uint64_t)Val >> ShAmt;
4528 if (NVT == MVT::i64 && !isUInt<32>(Val) && isUInt<32>(ShiftedVal))
4529 return true;
4530 }
4531 return false;
4532 };
4533
4534 int64_t ShiftedVal;
4535 if (!CanShrinkImmediate(ShiftedVal))
4536 return false;
4537
4538 // Ok, we can reorder to get a smaller immediate.
4539
4540 // But it's possible the original immediate allowed an AND to become MOVZX.
4541 // We do this late to delay the MaskedValueIsZero call as long as
4542 // possible.
4543 if (Opcode == ISD::AND) {
4544 // Find the smallest zext this could possibly be.
4545 unsigned ZExtWidth = Cst->getAPIntValue().getActiveBits();
4546 ZExtWidth = llvm::bit_ceil(std::max(ZExtWidth, 8U));
4547
4548 // Figure out which bits need to be zero to achieve that mask.
4549 APInt NeededMask = APInt::getLowBitsSet(NVT.getSizeInBits(),
4550 ZExtWidth);
4551 NeededMask &= ~Cst->getAPIntValue();
4552
4553 if (CurDAG->MaskedValueIsZero(N->getOperand(0), NeededMask))
4554 return false;
4555 }
4556
4557 SDValue X = Shift.getOperand(0);
4558 if (FoundAnyExtend) {
4559 SDValue NewX = CurDAG->getNode(ISD::ANY_EXTEND, dl, NVT, X);
4560 insertDAGNode(*CurDAG, SDValue(N, 0), NewX);
4561 X = NewX;
4562 }
4563
4564 SDValue NewCst = CurDAG->getSignedConstant(ShiftedVal, dl, NVT);
4565 insertDAGNode(*CurDAG, SDValue(N, 0), NewCst);
4566 SDValue NewBinOp = CurDAG->getNode(Opcode, dl, NVT, X, NewCst);
4567 insertDAGNode(*CurDAG, SDValue(N, 0), NewBinOp);
4568 SDValue NewSHL = CurDAG->getNode(ISD::SHL, dl, NVT, NewBinOp,
4569 Shift.getOperand(1));
4570 ReplaceNode(N, NewSHL.getNode());
4571 SelectCode(NewSHL.getNode());
4572 return true;
4573}
4574
4575bool X86DAGToDAGISel::matchVPTERNLOG(SDNode *Root, SDNode *ParentA,
4576 SDNode *ParentB, SDNode *ParentC,
4577 SDValue A, SDValue B, SDValue C,
4578 uint8_t Imm) {
4579 assert(A.isOperandOf(ParentA) && B.isOperandOf(ParentB) &&
4580 C.isOperandOf(ParentC) && "Incorrect parent node");
4581
4582 auto tryFoldLoadOrBCast =
4583 [this](SDNode *Root, SDNode *P, SDValue &L, SDValue &Base, SDValue &Scale,
4584 SDValue &Index, SDValue &Disp, SDValue &Segment) {
4585 if (tryFoldLoad(Root, P, L, Base, Scale, Index, Disp, Segment))
4586 return true;
4587
4588 // Not a load, check for broadcast which may be behind a bitcast.
4589 if (L.getOpcode() == ISD::BITCAST && L.hasOneUse()) {
4590 P = L.getNode();
4591 L = L.getOperand(0);
4592 }
4593
4594 if (L.getOpcode() != X86ISD::VBROADCAST_LOAD)
4595 return false;
4596
4597 // Only 32 and 64 bit broadcasts are supported.
4598 auto *MemIntr = cast<MemIntrinsicSDNode>(L);
4599 unsigned Size = MemIntr->getMemoryVT().getSizeInBits();
4600 if (Size != 32 && Size != 64)
4601 return false;
4602
4603 return tryFoldBroadcast(Root, P, L, Base, Scale, Index, Disp, Segment);
4604 };
4605
4606 bool FoldedLoad = false;
4607 SDValue Tmp0, Tmp1, Tmp2, Tmp3, Tmp4;
4608 if (tryFoldLoadOrBCast(Root, ParentC, C, Tmp0, Tmp1, Tmp2, Tmp3, Tmp4)) {
4609 FoldedLoad = true;
4610 } else if (tryFoldLoadOrBCast(Root, ParentA, A, Tmp0, Tmp1, Tmp2, Tmp3,
4611 Tmp4)) {
4612 FoldedLoad = true;
4613 std::swap(A, C);
4614 // Swap bits 1/4 and 3/6.
4615 uint8_t OldImm = Imm;
4616 Imm = OldImm & 0xa5;
4617 if (OldImm & 0x02) Imm |= 0x10;
4618 if (OldImm & 0x10) Imm |= 0x02;
4619 if (OldImm & 0x08) Imm |= 0x40;
4620 if (OldImm & 0x40) Imm |= 0x08;
4621 } else if (tryFoldLoadOrBCast(Root, ParentB, B, Tmp0, Tmp1, Tmp2, Tmp3,
4622 Tmp4)) {
4623 FoldedLoad = true;
4624 std::swap(B, C);
4625 // Swap bits 1/2 and 5/6.
4626 uint8_t OldImm = Imm;
4627 Imm = OldImm & 0x99;
4628 if (OldImm & 0x02) Imm |= 0x04;
4629 if (OldImm & 0x04) Imm |= 0x02;
4630 if (OldImm & 0x20) Imm |= 0x40;
4631 if (OldImm & 0x40) Imm |= 0x20;
4632 }
4633
4634 SDLoc DL(Root);
4635
4636 SDValue TImm = CurDAG->getTargetConstant(Imm, DL, MVT::i8);
4637
4638 MVT NVT = Root->getSimpleValueType(0);
4639
4640 MachineSDNode *MNode;
4641 if (FoldedLoad) {
4642 SDVTList VTs = CurDAG->getVTList(NVT, MVT::Other);
4643
4644 unsigned Opc;
4645 if (C.getOpcode() == X86ISD::VBROADCAST_LOAD) {
4646 auto *MemIntr = cast<MemIntrinsicSDNode>(C);
4647 unsigned EltSize = MemIntr->getMemoryVT().getSizeInBits();
4648 assert((EltSize == 32 || EltSize == 64) && "Unexpected broadcast size!");
4649
4650 bool UseD = EltSize == 32;
4651 if (NVT.is128BitVector())
4652 Opc = UseD ? X86::VPTERNLOGDZ128rmbi : X86::VPTERNLOGQZ128rmbi;
4653 else if (NVT.is256BitVector())
4654 Opc = UseD ? X86::VPTERNLOGDZ256rmbi : X86::VPTERNLOGQZ256rmbi;
4655 else if (NVT.is512BitVector())
4656 Opc = UseD ? X86::VPTERNLOGDZrmbi : X86::VPTERNLOGQZrmbi;
4657 else
4658 llvm_unreachable("Unexpected vector size!");
4659 } else {
4660 bool UseD = NVT.getVectorElementType() == MVT::i32;
4661 if (NVT.is128BitVector())
4662 Opc = UseD ? X86::VPTERNLOGDZ128rmi : X86::VPTERNLOGQZ128rmi;
4663 else if (NVT.is256BitVector())
4664 Opc = UseD ? X86::VPTERNLOGDZ256rmi : X86::VPTERNLOGQZ256rmi;
4665 else if (NVT.is512BitVector())
4666 Opc = UseD ? X86::VPTERNLOGDZrmi : X86::VPTERNLOGQZrmi;
4667 else
4668 llvm_unreachable("Unexpected vector size!");
4669 }
4670
4671 SDValue Ops[] = {A, B, Tmp0, Tmp1, Tmp2, Tmp3, Tmp4, TImm, C.getOperand(0)};
4672 MNode = CurDAG->getMachineNode(Opc, DL, VTs, Ops);
4673
4674 // Update the chain.
4675 ReplaceUses(C.getValue(1), SDValue(MNode, 1));
4676 // Record the mem-refs
4677 CurDAG->setNodeMemRefs(MNode, {cast<MemSDNode>(C)->getMemOperand()});
4678 } else {
4679 bool UseD = NVT.getVectorElementType() == MVT::i32;
4680 unsigned Opc;
4681 if (NVT.is128BitVector())
4682 Opc = UseD ? X86::VPTERNLOGDZ128rri : X86::VPTERNLOGQZ128rri;
4683 else if (NVT.is256BitVector())
4684 Opc = UseD ? X86::VPTERNLOGDZ256rri : X86::VPTERNLOGQZ256rri;
4685 else if (NVT.is512BitVector())
4686 Opc = UseD ? X86::VPTERNLOGDZrri : X86::VPTERNLOGQZrri;
4687 else
4688 llvm_unreachable("Unexpected vector size!");
4689
4690 MNode = CurDAG->getMachineNode(Opc, DL, NVT, {A, B, C, TImm});
4691 }
4692
4693 ReplaceUses(SDValue(Root, 0), SDValue(MNode, 0));
4694 CurDAG->RemoveDeadNode(Root);
4695 return true;
4696}
4697
4698// Try to match two logic ops to a VPTERNLOG.
4699// FIXME: Handle more complex patterns that use an operand more than once?
4700bool X86DAGToDAGISel::tryVPTERNLOG(SDNode *N) {
4701 MVT NVT = N->getSimpleValueType(0);
4702
4703 // Make sure we support VPTERNLOG.
4704 if (!NVT.isVector() || !Subtarget->hasAVX512() ||
4705 NVT.getVectorElementType() == MVT::i1)
4706 return false;
4707
4708 // We need VLX for 128/256-bit.
4709 if (!(Subtarget->hasVLX() || NVT.is512BitVector()))
4710 return false;
4711
4712 auto getFoldableLogicOp = [](SDValue Op) {
4713 // Peek through single use bitcast.
4714 if (Op.getOpcode() == ISD::BITCAST && Op.hasOneUse())
4715 Op = Op.getOperand(0);
4716
4717 if (!Op.hasOneUse())
4718 return SDValue();
4719
4720 unsigned Opc = Op.getOpcode();
4721 if (Opc == ISD::AND || Opc == ISD::OR || Opc == ISD::XOR ||
4722 Opc == X86ISD::ANDNP)
4723 return Op;
4724
4725 return SDValue();
4726 };
4727
4728 SDValue N0, N1, A, FoldableOp;
4729
4730 // Identify and (optionally) peel an outer NOT that wraps a pure logic tree
4731 auto tryPeelOuterNotWrappingLogic = [&](SDNode *Op) {
4732 if (Op->getOpcode() == ISD::XOR && Op->hasOneUse() &&
4733 ISD::isBuildVectorAllOnes(Op->getOperand(1).getNode())) {
4734 SDValue InnerOp = getFoldableLogicOp(Op->getOperand(0));
4735
4736 if (!InnerOp)
4737 return SDValue();
4738
4739 N0 = InnerOp.getOperand(0);
4740 N1 = InnerOp.getOperand(1);
4741 if ((FoldableOp = getFoldableLogicOp(N1))) {
4742 A = N0;
4743 return InnerOp;
4744 }
4745 if ((FoldableOp = getFoldableLogicOp(N0))) {
4746 A = N1;
4747 return InnerOp;
4748 }
4749 }
4750 return SDValue();
4751 };
4752
4753 bool PeeledOuterNot = false;
4754 SDNode *OriN = N;
4755 if (SDValue InnerOp = tryPeelOuterNotWrappingLogic(N)) {
4756 PeeledOuterNot = true;
4757 N = InnerOp.getNode();
4758 } else {
4759 N0 = N->getOperand(0);
4760 N1 = N->getOperand(1);
4761
4762 if ((FoldableOp = getFoldableLogicOp(N1)))
4763 A = N0;
4764 else if ((FoldableOp = getFoldableLogicOp(N0)))
4765 A = N1;
4766 else
4767 return false;
4768 }
4769
4770 SDValue B = FoldableOp.getOperand(0);
4771 SDValue C = FoldableOp.getOperand(1);
4772 SDNode *ParentA = N;
4773 SDNode *ParentB = FoldableOp.getNode();
4774 SDNode *ParentC = FoldableOp.getNode();
4775
4776 // We can build the appropriate control immediate by performing the logic
4777 // operation we're matching using these constants for A, B, and C.
4778 uint8_t TernlogMagicA = 0xf0;
4779 uint8_t TernlogMagicB = 0xcc;
4780 uint8_t TernlogMagicC = 0xaa;
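 // Bit i of a VPTERNLOG immediate is the result for inputs a = bit 2 of i,
 // b = bit 1 of i, c = bit 0 of i; 0xf0, 0xcc and 0xaa are simply the truth
 // tables of A, B and C themselves under that indexing.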
4781
4782 // Some of the inputs may be inverted, peek through them and invert the
4783 // magic values accordingly.
4784 // TODO: There may be a bitcast before the xor that we should peek through.
4785 auto PeekThroughNot = [](SDValue &Op, SDNode *&Parent, uint8_t &Magic) {
4786 if (Op.getOpcode() == ISD::XOR && Op.hasOneUse() &&
4787 ISD::isBuildVectorAllOnes(Op.getOperand(1).getNode())) {
4788 Magic = ~Magic;
4789 Parent = Op.getNode();
4790 Op = Op.getOperand(0);
4791 }
4792 };
4793
4794 PeekThroughNot(A, ParentA, TernlogMagicA);
4795 PeekThroughNot(B, ParentB, TernlogMagicB);
4796 PeekThroughNot(C, ParentC, TernlogMagicC);
4797
4798 uint8_t Imm;
4799 switch (FoldableOp.getOpcode()) {
4800 default: llvm_unreachable("Unexpected opcode!");
4801 case ISD::AND: Imm = TernlogMagicB & TernlogMagicC; break;
4802 case ISD::OR: Imm = TernlogMagicB | TernlogMagicC; break;
4803 case ISD::XOR: Imm = TernlogMagicB ^ TernlogMagicC; break;
4804 case X86ISD::ANDNP: Imm = ~(TernlogMagicB) & TernlogMagicC; break;
4805 }
4806
4807 switch (N->getOpcode()) {
4808 default: llvm_unreachable("Unexpected opcode!");
4809 case X86ISD::ANDNP:
4810 if (A == N0)
4811 Imm &= ~TernlogMagicA;
4812 else
4813 Imm = ~(Imm) & TernlogMagicA;
4814 break;
4815 case ISD::AND: Imm &= TernlogMagicA; break;
4816 case ISD::OR: Imm |= TernlogMagicA; break;
4817 case ISD::XOR: Imm ^= TernlogMagicA; break;
4818 }
4819
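 // A peeled outer NOT complements the whole ternary function, which is the
 // same as complementing its truth-table immediate.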
4820 if (PeeledOuterNot)
4821 Imm = ~Imm;
4822
4823 return matchVPTERNLOG(OriN, ParentA, ParentB, ParentC, A, B, C, Imm);
4824}
4825
4826/// If the high bits of an 'and' operand are known zero, try setting the
4827/// high bits of an 'and' constant operand to produce a smaller encoding by
4828/// creating a small, sign-extended negative immediate rather than a large
4829/// positive one. This reverses a transform in SimplifyDemandedBits that
4830/// shrinks mask constants by clearing bits. There is also a possibility that
4831/// the 'and' mask can be made -1, so the 'and' itself is unnecessary. In that
4832/// case, just replace the 'and'. Return 'true' if the node is replaced.
4833bool X86DAGToDAGISel::shrinkAndImmediate(SDNode *And) {
4834 // i8 is unshrinkable, i16 should be promoted to i32, and vector ops don't
4835 // have immediate operands.
4836 MVT VT = And->getSimpleValueType(0);
4837 if (VT != MVT::i32 && VT != MVT::i64)
4838 return false;
4839
4840 auto *And1C = dyn_cast<ConstantSDNode>(And->getOperand(1));
4841 if (!And1C)
4842 return false;
4843
4844 // Bail out if the mask constant is already negative. It can't shrink any further.
4845 // If the upper 32 bits of a 64 bit mask are all zeros, we have special isel
4846 // patterns to use a 32-bit and instead of a 64-bit and by relying on the
4847 // implicit zeroing of 32 bit ops. So we should check if the lower 32 bits
4848 // are negative too.
4849 APInt MaskVal = And1C->getAPIntValue();
4850 unsigned MaskLZ = MaskVal.countl_zero();
4851 if (!MaskLZ || (VT == MVT::i64 && MaskLZ == 32))
4852 return false;
4853
4854 // Don't extend into the upper 32 bits of a 64 bit mask.
4855 if (VT == MVT::i64 && MaskLZ >= 32) {
4856 MaskLZ -= 32;
4857 MaskVal = MaskVal.trunc(32);
4858 }
4859
4860 SDValue And0 = And->getOperand(0);
4861 APInt HighZeros = APInt::getHighBitsSet(MaskVal.getBitWidth(), MaskLZ);
4862 APInt NegMaskVal = MaskVal | HighZeros;
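 // For example, an i32 mask of 0x0FFFFFF0 becomes 0xFFFFFFF0 (-16), which
 // encodes as a sign-extended imm8 instead of an imm32, provided the top
 // four bits of the other operand are known to be zero.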
4863
4864 // If a negative constant would not allow a smaller encoding, there's no need
4865 // to continue. Only change the constant when we know it's a win.
4866 unsigned MinWidth = NegMaskVal.getSignificantBits();
4867 if (MinWidth > 32 || (MinWidth > 8 && MaskVal.getSignificantBits() <= 32))
4868 return false;
4869
4870 // Extend masks if we truncated above.
4871 if (VT == MVT::i64 && MaskVal.getBitWidth() < 64) {
4872 NegMaskVal = NegMaskVal.zext(64);
4873 HighZeros = HighZeros.zext(64);
4874 }
4875
4876 // The variable operand must be all zeros in the top bits to allow using the
4877 // new, negative constant as the mask.
4878 // TODO: Handle constant folding?
4879 KnownBits Known0 = CurDAG->computeKnownBits(And0);
4880 if (Known0.isConstant() || !HighZeros.isSubsetOf(Known0.Zero))
4881 return false;
4882
4883 // Check if the mask is -1. In that case, this is an unnecessary instruction
4884 // that escaped earlier analysis.
4885 if (NegMaskVal.isAllOnes()) {
4886 ReplaceNode(And, And0.getNode());
4887 return true;
4888 }
4889
4890 // A negative mask allows a smaller encoding. Create a new 'and' node.
4891 SDValue NewMask = CurDAG->getConstant(NegMaskVal, SDLoc(And), VT);
4892 insertDAGNode(*CurDAG, SDValue(And, 0), NewMask);
4893 SDValue NewAnd = CurDAG->getNode(ISD::AND, SDLoc(And), VT, And0, NewMask);
4894 ReplaceNode(And, NewAnd.getNode());
4895 SelectCode(NewAnd.getNode());
4896 return true;
4897}
4898
4899static unsigned getVPTESTMOpc(MVT TestVT, bool IsTestN, bool FoldedLoad,
4900 bool FoldedBCast, bool Masked) {
4901#define VPTESTM_CASE(VT, SUFFIX) \
4902case MVT::VT: \
4903 if (Masked) \
4904 return IsTestN ? X86::VPTESTNM##SUFFIX##k: X86::VPTESTM##SUFFIX##k; \
4905 return IsTestN ? X86::VPTESTNM##SUFFIX : X86::VPTESTM##SUFFIX;
4906
4907
4908#define VPTESTM_BROADCAST_CASES(SUFFIX) \
4909default: llvm_unreachable("Unexpected VT!"); \
4910VPTESTM_CASE(v4i32, DZ128##SUFFIX) \
4911VPTESTM_CASE(v2i64, QZ128##SUFFIX) \
4912VPTESTM_CASE(v8i32, DZ256##SUFFIX) \
4913VPTESTM_CASE(v4i64, QZ256##SUFFIX) \
4914VPTESTM_CASE(v16i32, DZ##SUFFIX) \
4915VPTESTM_CASE(v8i64, QZ##SUFFIX)
4916
4917#define VPTESTM_FULL_CASES(SUFFIX) \
4918VPTESTM_BROADCAST_CASES(SUFFIX) \
4919VPTESTM_CASE(v16i8, BZ128##SUFFIX) \
4920VPTESTM_CASE(v8i16, WZ128##SUFFIX) \
4921VPTESTM_CASE(v32i8, BZ256##SUFFIX) \
4922VPTESTM_CASE(v16i16, WZ256##SUFFIX) \
4923VPTESTM_CASE(v64i8, BZ##SUFFIX) \
4924VPTESTM_CASE(v32i16, WZ##SUFFIX)
4925
4926 if (FoldedBCast) {
4927 switch (TestVT.SimpleTy) {
4928 VPTESTM_BROADCAST_CASES(rmb)
4929 }
4930 }
4931
4932 if (FoldedLoad) {
4933 switch (TestVT.SimpleTy) {
4934 VPTESTM_FULL_CASES(rm)
4935 }
4936 }
4937
4938 switch (TestVT.SimpleTy) {
4939 VPTESTM_FULL_CASES(rr)
4940 }
4941
4942#undef VPTESTM_FULL_CASES
4943#undef VPTESTM_BROADCAST_CASES
4944#undef VPTESTM_CASE
4945}
4946
4947 // Try to create a VPTESTM instruction. If InMask is not null, it will be used
4948// to form a masked operation.
4949bool X86DAGToDAGISel::tryVPTESTM(SDNode *Root, SDValue Setcc,
4950 SDValue InMask) {
4951 assert(Subtarget->hasAVX512() && "Expected AVX512!");
4952 assert(Setcc.getSimpleValueType().getVectorElementType() == MVT::i1 &&
4953 "Unexpected VT!");
4954
4955 // Look for equal and not equal compares.
4956 ISD::CondCode CC = cast<CondCodeSDNode>(Setcc.getOperand(2))->get();
4957 if (CC != ISD::SETEQ && CC != ISD::SETNE)
4958 return false;
4959
4960 SDValue SetccOp0 = Setcc.getOperand(0);
4961 SDValue SetccOp1 = Setcc.getOperand(1);
4962
4963 // Canonicalize the all zero vector to the RHS.
4964 if (ISD::isBuildVectorAllZeros(SetccOp0.getNode()))
4965 std::swap(SetccOp0, SetccOp1);
4966
4967 // See if we're comparing against zero.
4968 if (!ISD::isBuildVectorAllZeros(SetccOp1.getNode()))
4969 return false;
4970
4971 SDValue N0 = SetccOp0;
4972
4973 MVT CmpVT = N0.getSimpleValueType();
4974 MVT CmpSVT = CmpVT.getVectorElementType();
4975
4976 // Start with both operands the same. We'll try to refine this.
4977 SDValue Src0 = N0;
4978 SDValue Src1 = N0;
4979
4980 {
4981 // Look through single use bitcasts.
4982 SDValue N0Temp = N0;
4983 if (N0Temp.getOpcode() == ISD::BITCAST && N0Temp.hasOneUse())
4984 N0Temp = N0.getOperand(0);
4985
4986 // Look for single use AND.
4987 if (N0Temp.getOpcode() == ISD::AND && N0Temp.hasOneUse()) {
4988 Src0 = N0Temp.getOperand(0);
4989 Src1 = N0Temp.getOperand(1);
4990 }
4991 }
4992
4993 // Without VLX we need to widen the operation.
4994 bool Widen = !Subtarget->hasVLX() && !CmpVT.is512BitVector();
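 // Only the 512-bit VPTESTM/VPTESTNM forms are available without AVX512VL,
 // so smaller inputs are inserted into a 512-bit vector below and the
 // resulting mask is narrowed back afterwards with COPY_TO_REGCLASS.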
4995
4996 auto tryFoldLoadOrBCast = [&](SDNode *Root, SDNode *P, SDValue &L,
4997 SDValue &Base, SDValue &Scale, SDValue &Index,
4998 SDValue &Disp, SDValue &Segment) {
4999 // If we need to widen, we can't fold the load.
5000 if (!Widen)
5001 if (tryFoldLoad(Root, P, L, Base, Scale, Index, Disp, Segment))
5002 return true;
5003
5004 // If we didn't fold a load, try to match a broadcast. Widening is not a
5005 // limitation here, but only 32- and 64-bit element types are supported.
5006 if (CmpSVT != MVT::i32 && CmpSVT != MVT::i64)
5007 return false;
5008
5009 // Look through single use bitcasts.
5010 if (L.getOpcode() == ISD::BITCAST && L.hasOneUse()) {
5011 P = L.getNode();
5012 L = L.getOperand(0);
5013 }
5014
5015 if (L.getOpcode() != X86ISD::VBROADCAST_LOAD)
5016 return false;
5017
5018 auto *MemIntr = cast<MemIntrinsicSDNode>(L);
5019 if (MemIntr->getMemoryVT().getSizeInBits() != CmpSVT.getSizeInBits())
5020 return false;
5021
5022 return tryFoldBroadcast(Root, P, L, Base, Scale, Index, Disp, Segment);
5023 };
5024
5025 // We can only fold loads if the sources are unique.
5026 bool CanFoldLoads = Src0 != Src1;
5027
5028 bool FoldedLoad = false;
5029 SDValue Tmp0, Tmp1, Tmp2, Tmp3, Tmp4;
5030 if (CanFoldLoads) {
5031 FoldedLoad = tryFoldLoadOrBCast(Root, N0.getNode(), Src1, Tmp0, Tmp1, Tmp2,
5032 Tmp3, Tmp4);
5033 if (!FoldedLoad) {
5034 // And is commutative.
5035 FoldedLoad = tryFoldLoadOrBCast(Root, N0.getNode(), Src0, Tmp0, Tmp1,
5036 Tmp2, Tmp3, Tmp4);
5037 if (FoldedLoad)
5038 std::swap(Src0, Src1);
5039 }
5040 }
5041
5042 bool FoldedBCast = FoldedLoad && Src1.getOpcode() == X86ISD::VBROADCAST_LOAD;
5043
5044 bool IsMasked = InMask.getNode() != nullptr;
5045
5046 SDLoc dl(Root);
5047
5048 MVT ResVT = Setcc.getSimpleValueType();
5049 MVT MaskVT = ResVT;
5050 if (Widen) {
5051 // Widen the inputs using insert_subreg or copy_to_regclass.
5052 unsigned Scale = CmpVT.is128BitVector() ? 4 : 2;
5053 unsigned SubReg = CmpVT.is128BitVector() ? X86::sub_xmm : X86::sub_ymm;
5054 unsigned NumElts = CmpVT.getVectorNumElements() * Scale;
5055 CmpVT = MVT::getVectorVT(CmpSVT, NumElts);
5056 MaskVT = MVT::getVectorVT(MVT::i1, NumElts);
5057 SDValue ImplDef = SDValue(CurDAG->getMachineNode(X86::IMPLICIT_DEF, dl,
5058 CmpVT), 0);
5059 Src0 = CurDAG->getTargetInsertSubreg(SubReg, dl, CmpVT, ImplDef, Src0);
5060
5061 if (!FoldedBCast)
5062 Src1 = CurDAG->getTargetInsertSubreg(SubReg, dl, CmpVT, ImplDef, Src1);
5063
5064 if (IsMasked) {
5065 // Widen the mask.
5066 unsigned RegClass = TLI->getRegClassFor(MaskVT)->getID();
5067 SDValue RC = CurDAG->getTargetConstant(RegClass, dl, MVT::i32);
5068 InMask = SDValue(CurDAG->getMachineNode(TargetOpcode::COPY_TO_REGCLASS,
5069 dl, MaskVT, InMask, RC), 0);
5070 }
5071 }
5072
5073 bool IsTestN = CC == ISD::SETEQ;
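 // VPTESTM sets a mask bit when (Src0 & Src1) is nonzero and VPTESTNM when
 // it is zero, so an equality compare against the zero vector selects the
 // NM form.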
5074 unsigned Opc = getVPTESTMOpc(CmpVT, IsTestN, FoldedLoad, FoldedBCast,
5075 IsMasked);
5076
5077 MachineSDNode *CNode;
5078 if (FoldedLoad) {
5079 SDVTList VTs = CurDAG->getVTList(MaskVT, MVT::Other);
5080
5081 if (IsMasked) {
5082 SDValue Ops[] = { InMask, Src0, Tmp0, Tmp1, Tmp2, Tmp3, Tmp4,
5083 Src1.getOperand(0) };
5084 CNode = CurDAG->getMachineNode(Opc, dl, VTs, Ops);
5085 } else {
5086 SDValue Ops[] = { Src0, Tmp0, Tmp1, Tmp2, Tmp3, Tmp4,
5087 Src1.getOperand(0) };
5088 CNode = CurDAG->getMachineNode(Opc, dl, VTs, Ops);
5089 }
5090
5091 // Update the chain.
5092 ReplaceUses(Src1.getValue(1), SDValue(CNode, 1));
5093 // Record the mem-refs
5094 CurDAG->setNodeMemRefs(CNode, {cast<MemSDNode>(Src1)->getMemOperand()});
5095 } else {
5096 if (IsMasked)
5097 CNode = CurDAG->getMachineNode(Opc, dl, MaskVT, InMask, Src0, Src1);
5098 else
5099 CNode = CurDAG->getMachineNode(Opc, dl, MaskVT, Src0, Src1);
5100 }
5101
5102 // If we widened, we need to shrink the mask VT.
5103 if (Widen) {
5104 unsigned RegClass = TLI->getRegClassFor(ResVT)->getID();
5105 SDValue RC = CurDAG->getTargetConstant(RegClass, dl, MVT::i32);
5106 CNode = CurDAG->getMachineNode(TargetOpcode::COPY_TO_REGCLASS,
5107 dl, ResVT, SDValue(CNode, 0), RC);
5108 }
5109
5110 ReplaceUses(SDValue(Root, 0), SDValue(CNode, 0));
5111 CurDAG->RemoveDeadNode(Root);
5112 return true;
5113}
5114
5115// Try to match the bitselect pattern (or (and A, B), (andn A, C)). Turn it
5116// into vpternlog.
5117bool X86DAGToDAGISel::tryMatchBitSelect(SDNode *N) {
5118 assert(N->getOpcode() == ISD::OR && "Unexpected opcode!");
5119
5120 MVT NVT = N->getSimpleValueType(0);
5121
5122 // Make sure we support VPTERNLOG.
5123 if (!NVT.isVector() || !Subtarget->hasAVX512())
5124 return false;
5125
5126 // We need VLX for 128/256-bit.
5127 if (!(Subtarget->hasVLX() || NVT.is512BitVector()))
5128 return false;
5129
5130 SDValue N0 = N->getOperand(0);
5131 SDValue N1 = N->getOperand(1);
5132
5133 // Canonicalize AND to LHS.
5134 if (N1.getOpcode() == ISD::AND)
5135 std::swap(N0, N1);
5136
5137 if (N0.getOpcode() != ISD::AND ||
5138 N1.getOpcode() != X86ISD::ANDNP ||
5139 !N0.hasOneUse() || !N1.hasOneUse())
5140 return false;
5141
5142 // ANDN is not commutable; use it to pin down A and C.
5143 SDValue A = N1.getOperand(0);
5144 SDValue C = N1.getOperand(1);
5145
5146 // AND is commutable, if one operand matches A, the other operand is B.
5147 // Otherwise this isn't a match.
5148 SDValue B;
5149 if (N0.getOperand(0) == A)
5150 B = N0.getOperand(1);
5151 else if (N0.getOperand(1) == A)
5152 B = N0.getOperand(0);
5153 else
5154 return false;
5155
5156 SDLoc dl(N);
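 // 0xCA is the truth table of (A & B) | (~A & C): each immediate bit holds
 // B when A is set and C when it is clear, i.e. a bitwise select on A.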
5157 SDValue Imm = CurDAG->getTargetConstant(0xCA, dl, MVT::i8);
5158 SDValue Ternlog = CurDAG->getNode(X86ISD::VPTERNLOG, dl, NVT, A, B, C, Imm);
5159 ReplaceNode(N, Ternlog.getNode());
5160
5161 return matchVPTERNLOG(Ternlog.getNode(), Ternlog.getNode(), Ternlog.getNode(),
5162 Ternlog.getNode(), A, B, C, 0xCA);
5163}
5164
5165void X86DAGToDAGISel::Select(SDNode *Node) {
5166 MVT NVT = Node->getSimpleValueType(0);
5167 unsigned Opcode = Node->getOpcode();
5168 SDLoc dl(Node);
5169
5170 if (Node->isMachineOpcode()) {
5171 LLVM_DEBUG(dbgs() << "== "; Node->dump(CurDAG); dbgs() << '\n');
5172 Node->setNodeId(-1);
5173 return; // Already selected.
5174 }
5175
5176 switch (Opcode) {
5177 default: break;
5178 case ISD::INTRINSIC_W_CHAIN: {
5179 unsigned IntNo = Node->getConstantOperandVal(1);
5180 switch (IntNo) {
5181 default: break;
5182 case Intrinsic::x86_encodekey128:
5183 case Intrinsic::x86_encodekey256: {
5184 if (!Subtarget->hasKL())
5185 break;
5186
5187 unsigned Opcode;
5188 switch (IntNo) {
5189 default: llvm_unreachable("Impossible intrinsic");
5190 case Intrinsic::x86_encodekey128:
5191 Opcode = X86::ENCODEKEY128;
5192 break;
5193 case Intrinsic::x86_encodekey256:
5194 Opcode = X86::ENCODEKEY256;
5195 break;
5196 }
5197
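 // The input key is passed implicitly in XMM0 (plus XMM1 for the 256-bit
 // variant), so copy the operands there and glue the copies to the
 // instruction.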
5198 SDValue Chain = Node->getOperand(0);
5199 Chain = CurDAG->getCopyToReg(Chain, dl, X86::XMM0, Node->getOperand(3),
5200 SDValue());
5201 if (Opcode == X86::ENCODEKEY256)
5202 Chain = CurDAG->getCopyToReg(Chain, dl, X86::XMM1, Node->getOperand(4),
5203 Chain.getValue(1));
5204
5205 MachineSDNode *Res = CurDAG->getMachineNode(
5206 Opcode, dl, Node->getVTList(),
5207 {Node->getOperand(2), Chain, Chain.getValue(1)});
5208 ReplaceNode(Node, Res);
5209 return;
5210 }
5211 case Intrinsic::x86_tileloaddrs64_internal:
5212 case Intrinsic::x86_tileloaddrst164_internal:
5213 if (!Subtarget->hasAMXMOVRS())
5214 break;
5215 [[fallthrough]];
5216 case Intrinsic::x86_tileloadd64_internal:
5217 case Intrinsic::x86_tileloaddt164_internal: {
5218 if (!Subtarget->hasAMXTILE())
5219 break;
5220 auto *MFI =
5221 CurDAG->getMachineFunction().getInfo<X86MachineFunctionInfo>();
5222 MFI->setAMXProgModel(AMXProgModelEnum::ManagedRA);
5223 unsigned Opc;
5224 switch (IntNo) {
5225 default:
5226 llvm_unreachable("Unexpected intrinsic!");
5227 case Intrinsic::x86_tileloaddrs64_internal:
5228 Opc = X86::PTILELOADDRSV;
5229 break;
5230 case Intrinsic::x86_tileloaddrst164_internal:
5231 Opc = X86::PTILELOADDRST1V;
5232 break;
5233 case Intrinsic::x86_tileloadd64_internal:
5234 Opc = X86::PTILELOADDV;
5235 break;
5236 case Intrinsic::x86_tileloaddt164_internal:
5237 Opc = X86::PTILELOADDT1V;
5238 break;
5239 }
5240 // _tile_loadd_internal(row, col, buf, STRIDE)
5241 SDValue Base = Node->getOperand(4);
5242 SDValue Scale = getI8Imm(1, dl);
5243 SDValue Index = Node->getOperand(5);
5244 SDValue Disp = CurDAG->getTargetConstant(0, dl, MVT::i32);
5245 SDValue Segment = CurDAG->getRegister(0, MVT::i16);
5246 SDValue Chain = Node->getOperand(0);
5247 MachineSDNode *CNode;
5248 SDValue Ops[] = {Node->getOperand(2),
5249 Node->getOperand(3),
5250 Base,
5251 Scale,
5252 Index,
5253 Disp,
5254 Segment,
5255 Chain};
5256 CNode = CurDAG->getMachineNode(Opc, dl, {MVT::x86amx, MVT::Other}, Ops);
5257 ReplaceNode(Node, CNode);
5258 return;
5259 }
5260 }
5261 break;
5262 }
5263 case ISD::INTRINSIC_VOID: {
5264 unsigned IntNo = Node->getConstantOperandVal(1);
5265 switch (IntNo) {
5266 default: break;
5267 case Intrinsic::x86_sse3_monitor:
5268 case Intrinsic::x86_monitorx:
5269 case Intrinsic::x86_clzero: {
5270 bool Use64BitPtr = Node->getOperand(2).getValueType() == MVT::i64;
5271
5272 unsigned Opc = 0;
5273 switch (IntNo) {
5274 default: llvm_unreachable("Unexpected intrinsic!");
5275 case Intrinsic::x86_sse3_monitor:
5276 if (!Subtarget->hasSSE3())
5277 break;
5278 Opc = Use64BitPtr ? X86::MONITOR64rrr : X86::MONITOR32rrr;
5279 break;
5280 case Intrinsic::x86_monitorx:
5281 if (!Subtarget->hasMWAITX())
5282 break;
5283 Opc = Use64BitPtr ? X86::MONITORX64rrr : X86::MONITORX32rrr;
5284 break;
5285 case Intrinsic::x86_clzero:
5286 if (!Subtarget->hasCLZERO())
5287 break;
5288 Opc = Use64BitPtr ? X86::CLZERO64r : X86::CLZERO32r;
5289 break;
5290 }
5291
5292 if (Opc) {
5293 unsigned PtrReg = Use64BitPtr ? X86::RAX : X86::EAX;
5294 SDValue Chain = CurDAG->getCopyToReg(Node->getOperand(0), dl, PtrReg,
5295 Node->getOperand(2), SDValue());
5296 SDValue InGlue = Chain.getValue(1);
5297
5298 if (IntNo == Intrinsic::x86_sse3_monitor ||
5299 IntNo == Intrinsic::x86_monitorx) {
5300 // Copy the other two operands to ECX and EDX.
5301 Chain = CurDAG->getCopyToReg(Chain, dl, X86::ECX, Node->getOperand(3),
5302 InGlue);
5303 InGlue = Chain.getValue(1);
5304 Chain = CurDAG->getCopyToReg(Chain, dl, X86::EDX, Node->getOperand(4),
5305 InGlue);
5306 InGlue = Chain.getValue(1);
5307 }
5308
5309 MachineSDNode *CNode = CurDAG->getMachineNode(Opc, dl, MVT::Other,
5310 { Chain, InGlue});
5311 ReplaceNode(Node, CNode);
5312 return;
5313 }
5314
5315 break;
5316 }
5317 case Intrinsic::x86_tilestored64_internal: {
5318 auto *MFI =
5319 CurDAG->getMachineFunction().getInfo<X86MachineFunctionInfo>();
5320 MFI->setAMXProgModel(AMXProgModelEnum::ManagedRA);
5321 unsigned Opc = X86::PTILESTOREDV;
5322 // _tile_stored_internal(row, col, buf, STRIDE, c)
5323 SDValue Base = Node->getOperand(4);
5324 SDValue Scale = getI8Imm(1, dl);
5325 SDValue Index = Node->getOperand(5);
5326 SDValue Disp = CurDAG->getTargetConstant(0, dl, MVT::i32);
5327 SDValue Segment = CurDAG->getRegister(0, MVT::i16);
5328 SDValue Chain = Node->getOperand(0);
5329 MachineSDNode *CNode;
5330 SDValue Ops[] = {Node->getOperand(2),
5331 Node->getOperand(3),
5332 Base,
5333 Scale,
5334 Index,
5335 Disp,
5336 Segment,
5337 Node->getOperand(6),
5338 Chain};
5339 CNode = CurDAG->getMachineNode(Opc, dl, MVT::Other, Ops);
5340 ReplaceNode(Node, CNode);
5341 return;
5342 }
5343 case Intrinsic::x86_tileloaddrs64:
5344 case Intrinsic::x86_tileloaddrst164:
5345 if (!Subtarget->hasAMXMOVRS())
5346 break;
5347 [[fallthrough]];
5348 case Intrinsic::x86_tileloadd64:
5349 case Intrinsic::x86_tileloaddt164:
5350 case Intrinsic::x86_tilestored64: {
5351 if (!Subtarget->hasAMXTILE())
5352 break;
5353 auto *MFI =
5354 CurDAG->getMachineFunction().getInfo<X86MachineFunctionInfo>();
5355 MFI->setAMXProgModel(AMXProgModelEnum::DirectReg);
5356 unsigned Opc;
5357 switch (IntNo) {
5358 default: llvm_unreachable("Unexpected intrinsic!");
5359 case Intrinsic::x86_tileloadd64: Opc = X86::PTILELOADD; break;
5360 case Intrinsic::x86_tileloaddrs64:
5361 Opc = X86::PTILELOADDRS;
5362 break;
5363 case Intrinsic::x86_tileloaddt164: Opc = X86::PTILELOADDT1; break;
5364 case Intrinsic::x86_tileloaddrst164:
5365 Opc = X86::PTILELOADDRST1;
5366 break;
5367 case Intrinsic::x86_tilestored64: Opc = X86::PTILESTORED; break;
5368 }
5369 // FIXME: Match displacement and scale.
5370 unsigned TIndex = Node->getConstantOperandVal(2);
5371 SDValue TReg = getI8Imm(TIndex, dl);
5372 SDValue Base = Node->getOperand(3);
5373 SDValue Scale = getI8Imm(1, dl);
5374 SDValue Index = Node->getOperand(4);
5375 SDValue Disp = CurDAG->getTargetConstant(0, dl, MVT::i32);
5376 SDValue Segment = CurDAG->getRegister(0, MVT::i16);
5377 SDValue Chain = Node->getOperand(0);
5378 MachineSDNode *CNode;
5379 if (Opc == X86::PTILESTORED) {
5380 SDValue Ops[] = { Base, Scale, Index, Disp, Segment, TReg, Chain };
5381 CNode = CurDAG->getMachineNode(Opc, dl, MVT::Other, Ops);
5382 } else {
5383 SDValue Ops[] = { TReg, Base, Scale, Index, Disp, Segment, Chain };
5384 CNode = CurDAG->getMachineNode(Opc, dl, MVT::Other, Ops);
5385 }
5386 ReplaceNode(Node, CNode);
5387 return;
5388 }
5389 }
5390 break;
5391 }
5392 case ISD::BRIND:
5393 case X86ISD::NT_BRIND: {
5394 if (Subtarget->isTarget64BitILP32()) {
5395 // Converts a 32-bit register to a 64-bit, zero-extended version of
5396 // it. This is needed because x86-64 can do many things, but jmp %r32
5397 // ain't one of them.
5398 SDValue Target = Node->getOperand(1);
5399 assert(Target.getValueType() == MVT::i32 && "Unexpected VT!");
5400 SDValue ZextTarget = CurDAG->getZExtOrTrunc(Target, dl, MVT::i64);
5401 SDValue Brind = CurDAG->getNode(Opcode, dl, MVT::Other,
5402 Node->getOperand(0), ZextTarget);
5403 ReplaceNode(Node, Brind.getNode());
5404 SelectCode(ZextTarget.getNode());
5405 SelectCode(Brind.getNode());
5406 return;
5407 }
5408 break;
5409 }
5410 case X86ISD::GlobalBaseReg:
5411 ReplaceNode(Node, getGlobalBaseReg());
5412 return;
5413
5414 case ISD::BITCAST:
5415 // Just drop all 128/256/512-bit bitcasts.
5416 if (NVT.is512BitVector() || NVT.is256BitVector() || NVT.is128BitVector() ||
5417 NVT == MVT::f128) {
5418 ReplaceUses(SDValue(Node, 0), Node->getOperand(0));
5419 CurDAG->RemoveDeadNode(Node);
5420 return;
5421 }
5422 break;
5423
5424 case ISD::SRL:
5425 if (matchBitExtract(Node))
5426 return;
5427 [[fallthrough]];
5428 case ISD::SRA:
5429 case ISD::SHL:
5430 if (tryShiftAmountMod(Node))
5431 return;
5432 break;
5433
5434 case X86ISD::VPTERNLOG: {
5435 uint8_t Imm = Node->getConstantOperandVal(3);
5436 if (matchVPTERNLOG(Node, Node, Node, Node, Node->getOperand(0),
5437 Node->getOperand(1), Node->getOperand(2), Imm))
5438 return;
5439 break;
5440 }
5441
5442 case X86ISD::ANDNP:
5443 if (tryVPTERNLOG(Node))
5444 return;
5445 break;
5446
5447 case ISD::AND:
5448 if (NVT.isVector() && NVT.getVectorElementType() == MVT::i1) {
5449 // Try to form a masked VPTESTM. Operands can be in either order.
5450 SDValue N0 = Node->getOperand(0);
5451 SDValue N1 = Node->getOperand(1);
5452 if (N0.getOpcode() == ISD::SETCC && N0.hasOneUse() &&
5453 tryVPTESTM(Node, N0, N1))
5454 return;
5455 if (N1.getOpcode() == ISD::SETCC && N1.hasOneUse() &&
5456 tryVPTESTM(Node, N1, N0))
5457 return;
5458 }
5459
5460 if (MachineSDNode *NewNode = matchBEXTRFromAndImm(Node)) {
5461 ReplaceUses(SDValue(Node, 0), SDValue(NewNode, 0));
5462 CurDAG->RemoveDeadNode(Node);
5463 return;
5464 }
5465 if (matchBitExtract(Node))
5466 return;
5467 if (AndImmShrink && shrinkAndImmediate(Node))
5468 return;
5469
5470 [[fallthrough]];
5471 case ISD::OR:
5472 case ISD::XOR:
5473 if (tryShrinkShlLogicImm(Node))
5474 return;
5475 if (Opcode == ISD::OR && tryMatchBitSelect(Node))
5476 return;
5477 if (tryVPTERNLOG(Node))
5478 return;
5479
5480 [[fallthrough]];
5481 case ISD::ADD:
5482 if (Opcode == ISD::ADD && matchBitExtract(Node))
5483 return;
5484 [[fallthrough]];
5485 case ISD::SUB: {
5486 // Try to avoid folding immediates with multiple uses for optsize.
5487 // This code tries to select to register form directly to avoid going
5488 // through the isel table which might fold the immediate. We can't change
5489 // the add/sub/and/or/xor-with-immediate patterns in the tablegen files
5490 // to check the immediate use count without making the patterns
5491 // unavailable to the fast-isel table.
5492 if (!CurDAG->shouldOptForSize())
5493 break;
5494
5495 // Only handle i8/i16/i32/i64.
5496 if (NVT != MVT::i8 && NVT != MVT::i16 && NVT != MVT::i32 && NVT != MVT::i64)
5497 break;
5498
5499 SDValue N0 = Node->getOperand(0);
5500 SDValue N1 = Node->getOperand(1);
5501
5502 auto *Cst = dyn_cast<ConstantSDNode>(N1);
5503 if (!Cst)
5504 break;
5505
5506 int64_t Val = Cst->getSExtValue();
5507
5508 // Make sure it's an immediate that is considered foldable.
5509 // FIXME: Handle unsigned 32 bit immediates for 64-bit AND.
5510 if (!isInt<8>(Val) && !isInt<32>(Val))
5511 break;
5512
5513 // If this can match to INC/DEC, let it go.
5514 if (Opcode == ISD::ADD && (Val == 1 || Val == -1))
5515 break;
5516
5517 // Check if we should avoid folding this immediate.
5518 if (!shouldAvoidImmediateInstFormsForSize(N1.getNode()))
5519 break;
5520
5521 // We should not fold the immediate. So we need a register form instead.
5522 unsigned ROpc, MOpc;
5523 switch (NVT.SimpleTy) {
5524 default: llvm_unreachable("Unexpected VT!");
5525 case MVT::i8:
5526 switch (Opcode) {
5527 default: llvm_unreachable("Unexpected opcode!");
5528 case ISD::ADD:
5529 ROpc = GET_ND_IF_ENABLED(X86::ADD8rr);
5530 MOpc = GET_ND_IF_ENABLED(X86::ADD8rm);
5531 break;
5532 case ISD::SUB:
5533 ROpc = GET_ND_IF_ENABLED(X86::SUB8rr);
5534 MOpc = GET_ND_IF_ENABLED(X86::SUB8rm);
5535 break;
5536 case ISD::AND:
5537 ROpc = GET_ND_IF_ENABLED(X86::AND8rr);
5538 MOpc = GET_ND_IF_ENABLED(X86::AND8rm);
5539 break;
5540 case ISD::OR:
5541 ROpc = GET_ND_IF_ENABLED(X86::OR8rr);
5542 MOpc = GET_ND_IF_ENABLED(X86::OR8rm);
5543 break;
5544 case ISD::XOR:
5545 ROpc = GET_ND_IF_ENABLED(X86::XOR8rr);
5546 MOpc = GET_ND_IF_ENABLED(X86::XOR8rm);
5547 break;
5548 }
5549 break;
5550 case MVT::i16:
5551 switch (Opcode) {
5552 default: llvm_unreachable("Unexpected opcode!");
5553 case ISD::ADD:
5554 ROpc = GET_ND_IF_ENABLED(X86::ADD16rr);
5555 MOpc = GET_ND_IF_ENABLED(X86::ADD16rm);
5556 break;
5557 case ISD::SUB:
5558 ROpc = GET_ND_IF_ENABLED(X86::SUB16rr);
5559 MOpc = GET_ND_IF_ENABLED(X86::SUB16rm);
5560 break;
5561 case ISD::AND:
5562 ROpc = GET_ND_IF_ENABLED(X86::AND16rr);
5563 MOpc = GET_ND_IF_ENABLED(X86::AND16rm);
5564 break;
5565 case ISD::OR:
5566 ROpc = GET_ND_IF_ENABLED(X86::OR16rr);
5567 MOpc = GET_ND_IF_ENABLED(X86::OR16rm);
5568 break;
5569 case ISD::XOR:
5570 ROpc = GET_ND_IF_ENABLED(X86::XOR16rr);
5571 MOpc = GET_ND_IF_ENABLED(X86::XOR16rm);
5572 break;
5573 }
5574 break;
5575 case MVT::i32:
5576 switch (Opcode) {
5577 default: llvm_unreachable("Unexpected opcode!");
5578 case ISD::ADD:
5579 ROpc = GET_ND_IF_ENABLED(X86::ADD32rr);
5580 MOpc = GET_ND_IF_ENABLED(X86::ADD32rm);
5581 break;
5582 case ISD::SUB:
5583 ROpc = GET_ND_IF_ENABLED(X86::SUB32rr);
5584 MOpc = GET_ND_IF_ENABLED(X86::SUB32rm);
5585 break;
5586 case ISD::AND:
5587 ROpc = GET_ND_IF_ENABLED(X86::AND32rr);
5588 MOpc = GET_ND_IF_ENABLED(X86::AND32rm);
5589 break;
5590 case ISD::OR:
5591 ROpc = GET_ND_IF_ENABLED(X86::OR32rr);
5592 MOpc = GET_ND_IF_ENABLED(X86::OR32rm);
5593 break;
5594 case ISD::XOR:
5595 ROpc = GET_ND_IF_ENABLED(X86::XOR32rr);
5596 MOpc = GET_ND_IF_ENABLED(X86::XOR32rm);
5597 break;
5598 }
5599 break;
5600 case MVT::i64:
5601 switch (Opcode) {
5602 default: llvm_unreachable("Unexpected opcode!");
5603 case ISD::ADD:
5604 ROpc = GET_ND_IF_ENABLED(X86::ADD64rr);
5605 MOpc = GET_ND_IF_ENABLED(X86::ADD64rm);
5606 break;
5607 case ISD::SUB:
5608 ROpc = GET_ND_IF_ENABLED(X86::SUB64rr);
5609 MOpc = GET_ND_IF_ENABLED(X86::SUB64rm);
5610 break;
5611 case ISD::AND:
5612 ROpc = GET_ND_IF_ENABLED(X86::AND64rr);
5613 MOpc = GET_ND_IF_ENABLED(X86::AND64rm);
5614 break;
5615 case ISD::OR:
5616 ROpc = GET_ND_IF_ENABLED(X86::OR64rr);
5617 MOpc = GET_ND_IF_ENABLED(X86::OR64rm);
5618 break;
5619 case ISD::XOR:
5620 ROpc = GET_ND_IF_ENABLED(X86::XOR64rr);
5621 MOpc = GET_ND_IF_ENABLED(X86::XOR64rm);
5622 break;
5623 }
5624 break;
5625 }
5626
5627 // OK, this is an AND/OR/XOR/ADD/SUB with a constant operand.
5628
5629 // If this is not a subtract, we can still try to fold a load.
5630 if (Opcode != ISD::SUB) {
5631 SDValue Tmp0, Tmp1, Tmp2, Tmp3, Tmp4;
5632 if (tryFoldLoad(Node, N0, Tmp0, Tmp1, Tmp2, Tmp3, Tmp4)) {
5633 SDValue Ops[] = { N1, Tmp0, Tmp1, Tmp2, Tmp3, Tmp4, N0.getOperand(0) };
5634 SDVTList VTs = CurDAG->getVTList(NVT, MVT::i32, MVT::Other);
5635 MachineSDNode *CNode = CurDAG->getMachineNode(MOpc, dl, VTs, Ops);
5636 // Update the chain.
5637 ReplaceUses(N0.getValue(1), SDValue(CNode, 2));
5638 // Record the mem-refs
5639 CurDAG->setNodeMemRefs(CNode, {cast<LoadSDNode>(N0)->getMemOperand()});
5640 ReplaceUses(SDValue(Node, 0), SDValue(CNode, 0));
5641 CurDAG->RemoveDeadNode(Node);
5642 return;
5643 }
5644 }
5645
5646 CurDAG->SelectNodeTo(Node, ROpc, NVT, MVT::i32, N0, N1);
5647 return;
5648 }
5649
5650 case X86ISD::SMUL:
5651 // i16/i32/i64 are handled with isel patterns.
5652 if (NVT != MVT::i8)
5653 break;
5654 [[fallthrough]];
5655 case X86ISD::UMUL: {
5656 SDValue N0 = Node->getOperand(0);
5657 SDValue N1 = Node->getOperand(1);
5658
5659 unsigned LoReg, ROpc, MOpc;
5660 switch (NVT.SimpleTy) {
5661 default: llvm_unreachable("Unsupported VT!");
5662 case MVT::i8:
5663 LoReg = X86::AL;
5664 ROpc = Opcode == X86ISD::SMUL ? X86::IMUL8r : X86::MUL8r;
5665 MOpc = Opcode == X86ISD::SMUL ? X86::IMUL8m : X86::MUL8m;
5666 break;
5667 case MVT::i16:
5668 LoReg = X86::AX;
5669 ROpc = X86::MUL16r;
5670 MOpc = X86::MUL16m;
5671 break;
5672 case MVT::i32:
5673 LoReg = X86::EAX;
5674 ROpc = X86::MUL32r;
5675 MOpc = X86::MUL32m;
5676 break;
5677 case MVT::i64:
5678 LoReg = X86::RAX;
5679 ROpc = X86::MUL64r;
5680 MOpc = X86::MUL64m;
5681 break;
5682 }
5683
5684 SDValue Tmp0, Tmp1, Tmp2, Tmp3, Tmp4;
5685 bool FoldedLoad = tryFoldLoad(Node, N1, Tmp0, Tmp1, Tmp2, Tmp3, Tmp4);
5686 // Multiply is commutative.
5687 if (!FoldedLoad) {
5688 FoldedLoad = tryFoldLoad(Node, N0, Tmp0, Tmp1, Tmp2, Tmp3, Tmp4);
5689 if (FoldedLoad)
5690 std::swap(N0, N1);
5691 }
5692
5693 SDValue InGlue = CurDAG->getCopyToReg(CurDAG->getEntryNode(), dl, LoReg,
5694 N0, SDValue()).getValue(1);
5695
5696 MachineSDNode *CNode;
5697 if (FoldedLoad) {
5698 // i16/i32/i64 use an instruction that produces a low and high result even
5699 // though only the low result is used.
5700 SDVTList VTs;
5701 if (NVT == MVT::i8)
5702 VTs = CurDAG->getVTList(NVT, MVT::i32, MVT::Other);
5703 else
5704 VTs = CurDAG->getVTList(NVT, NVT, MVT::i32, MVT::Other);
5705
5706 SDValue Ops[] = { Tmp0, Tmp1, Tmp2, Tmp3, Tmp4, N1.getOperand(0),
5707 InGlue };
5708 CNode = CurDAG->getMachineNode(MOpc, dl, VTs, Ops);
5709
5710 // Update the chain.
5711 ReplaceUses(N1.getValue(1), SDValue(CNode, NVT == MVT::i8 ? 2 : 3));
5712 // Record the mem-refs
5713 CurDAG->setNodeMemRefs(CNode, {cast<LoadSDNode>(N1)->getMemOperand()});
5714 } else {
5715 // i16/i32/i64 use an instruction that produces a low and high result even
5716 // though only the low result is used.
5717 SDVTList VTs;
5718 if (NVT == MVT::i8)
5719 VTs = CurDAG->getVTList(NVT, MVT::i32);
5720 else
5721 VTs = CurDAG->getVTList(NVT, NVT, MVT::i32);
5722
5723 CNode = CurDAG->getMachineNode(ROpc, dl, VTs, {N1, InGlue});
5724 }
5725
5726 ReplaceUses(SDValue(Node, 0), SDValue(CNode, 0));
5727 ReplaceUses(SDValue(Node, 1), SDValue(CNode, NVT == MVT::i8 ? 1 : 2));
5728 CurDAG->RemoveDeadNode(Node);
5729 return;
5730 }
5731
5732 case ISD::SMUL_LOHI:
5733 case ISD::UMUL_LOHI: {
5734 SDValue N0 = Node->getOperand(0);
5735 SDValue N1 = Node->getOperand(1);
5736
5737 unsigned Opc, MOpc;
5738 unsigned LoReg, HiReg;
5739 bool IsSigned = Opcode == ISD::SMUL_LOHI;
5740 bool UseMULX = !IsSigned && Subtarget->hasBMI2();
5741 bool UseMULXHi = UseMULX && SDValue(Node, 0).use_empty();
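 // MULX (BMI2) takes one source implicitly in EDX/RDX, writes the low and
 // high products to two explicit destinations and leaves EFLAGS untouched;
 // the MULX*H variants below are used when only the high half has uses.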
5742 switch (NVT.SimpleTy) {
5743 default: llvm_unreachable("Unsupported VT!");
5744 case MVT::i32:
5745 Opc = UseMULXHi ? X86::MULX32Hrr
5746 : UseMULX ? GET_EGPR_IF_ENABLED(X86::MULX32rr)
5747 : IsSigned ? X86::IMUL32r
5748 : X86::MUL32r;
5749 MOpc = UseMULXHi ? X86::MULX32Hrm
5750 : UseMULX ? GET_EGPR_IF_ENABLED(X86::MULX32rm)
5751 : IsSigned ? X86::IMUL32m
5752 : X86::MUL32m;
5753 LoReg = UseMULX ? X86::EDX : X86::EAX;
5754 HiReg = X86::EDX;
5755 break;
5756 case MVT::i64:
5757 Opc = UseMULXHi ? X86::MULX64Hrr
5758 : UseMULX ? GET_EGPR_IF_ENABLED(X86::MULX64rr)
5759 : IsSigned ? X86::IMUL64r
5760 : X86::MUL64r;
5761 MOpc = UseMULXHi ? X86::MULX64Hrm
5762 : UseMULX ? GET_EGPR_IF_ENABLED(X86::MULX64rm)
5763 : IsSigned ? X86::IMUL64m
5764 : X86::MUL64m;
5765 LoReg = UseMULX ? X86::RDX : X86::RAX;
5766 HiReg = X86::RDX;
5767 break;
5768 }
5769
5770 SDValue Tmp0, Tmp1, Tmp2, Tmp3, Tmp4;
5771 bool foldedLoad = tryFoldLoad(Node, N1, Tmp0, Tmp1, Tmp2, Tmp3, Tmp4);
5772 // Multiply is commutative.
5773 if (!foldedLoad) {
5774 foldedLoad = tryFoldLoad(Node, N0, Tmp0, Tmp1, Tmp2, Tmp3, Tmp4);
5775 if (foldedLoad)
5776 std::swap(N0, N1);
5777 }
5778
5779 SDValue InGlue = CurDAG->getCopyToReg(CurDAG->getEntryNode(), dl, LoReg,
5780 N0, SDValue()).getValue(1);
5781 SDValue ResHi, ResLo;
5782 if (foldedLoad) {
5783 SDValue Chain;
5784 MachineSDNode *CNode = nullptr;
5785 SDValue Ops[] = { Tmp0, Tmp1, Tmp2, Tmp3, Tmp4, N1.getOperand(0),
5786 InGlue };
5787 if (UseMULXHi) {
5788 SDVTList VTs = CurDAG->getVTList(NVT, MVT::Other);
5789 CNode = CurDAG->getMachineNode(MOpc, dl, VTs, Ops);
5790 ResHi = SDValue(CNode, 0);
5791 Chain = SDValue(CNode, 1);
5792 } else if (UseMULX) {
5793 SDVTList VTs = CurDAG->getVTList(NVT, NVT, MVT::Other);
5794 CNode = CurDAG->getMachineNode(MOpc, dl, VTs, Ops);
5795 ResHi = SDValue(CNode, 0);
5796 ResLo = SDValue(CNode, 1);
5797 Chain = SDValue(CNode, 2);
5798 } else {
5799 SDVTList VTs = CurDAG->getVTList(MVT::Other, MVT::Glue);
5800 CNode = CurDAG->getMachineNode(MOpc, dl, VTs, Ops);
5801 Chain = SDValue(CNode, 0);
5802 InGlue = SDValue(CNode, 1);
5803 }
5804
5805 // Update the chain.
5806 ReplaceUses(N1.getValue(1), Chain);
5807 // Record the mem-refs
5808 CurDAG->setNodeMemRefs(CNode, {cast<LoadSDNode>(N1)->getMemOperand()});
5809 } else {
5810 SDValue Ops[] = { N1, InGlue };
5811 if (UseMULXHi) {
5812 SDVTList VTs = CurDAG->getVTList(NVT);
5813 SDNode *CNode = CurDAG->getMachineNode(Opc, dl, VTs, Ops);
5814 ResHi = SDValue(CNode, 0);
5815 } else if (UseMULX) {
5816 SDVTList VTs = CurDAG->getVTList(NVT, NVT);
5817 SDNode *CNode = CurDAG->getMachineNode(Opc, dl, VTs, Ops);
5818 ResHi = SDValue(CNode, 0);
5819 ResLo = SDValue(CNode, 1);
5820 } else {
5821 SDVTList VTs = CurDAG->getVTList(MVT::Glue);
5822 SDNode *CNode = CurDAG->getMachineNode(Opc, dl, VTs, Ops);
5823 InGlue = SDValue(CNode, 0);
5824 }
5825 }
5826
5827 // Copy the low half of the result, if it is needed.
5828 if (!SDValue(Node, 0).use_empty()) {
5829 if (!ResLo) {
5830 assert(LoReg && "Register for low half is not defined!");
5831 ResLo = CurDAG->getCopyFromReg(CurDAG->getEntryNode(), dl, LoReg,
5832 NVT, InGlue);
5833 InGlue = ResLo.getValue(2);
5834 }
5835 ReplaceUses(SDValue(Node, 0), ResLo);
5836 LLVM_DEBUG(dbgs() << "=> "; ResLo.getNode()->dump(CurDAG);
5837 dbgs() << '\n');
5838 }
5839 // Copy the high half of the result, if it is needed.
5840 if (!SDValue(Node, 1).use_empty()) {
5841 if (!ResHi) {
5842 assert(HiReg && "Register for high half is not defined!");
5843 ResHi = CurDAG->getCopyFromReg(CurDAG->getEntryNode(), dl, HiReg,
5844 NVT, InGlue);
5845 InGlue = ResHi.getValue(2);
5846 }
5847 ReplaceUses(SDValue(Node, 1), ResHi);
5848 LLVM_DEBUG(dbgs() << "=> "; ResHi.getNode()->dump(CurDAG);
5849 dbgs() << '\n');
5850 }
5851
5852 CurDAG->RemoveDeadNode(Node);
5853 return;
5854 }
5855
5856 case ISD::SDIVREM:
5857 case ISD::UDIVREM: {
5858 SDValue N0 = Node->getOperand(0);
5859 SDValue N1 = Node->getOperand(1);
5860
5861 unsigned ROpc, MOpc;
5862 bool isSigned = Opcode == ISD::SDIVREM;
5863 if (!isSigned) {
5864 switch (NVT.SimpleTy) {
5865 default: llvm_unreachable("Unsupported VT!");
5866 case MVT::i8: ROpc = X86::DIV8r; MOpc = X86::DIV8m; break;
5867 case MVT::i16: ROpc = X86::DIV16r; MOpc = X86::DIV16m; break;
5868 case MVT::i32: ROpc = X86::DIV32r; MOpc = X86::DIV32m; break;
5869 case MVT::i64: ROpc = X86::DIV64r; MOpc = X86::DIV64m; break;
5870 }
5871 } else {
5872 switch (NVT.SimpleTy) {
5873 default: llvm_unreachable("Unsupported VT!");
5874 case MVT::i8: ROpc = X86::IDIV8r; MOpc = X86::IDIV8m; break;
5875 case MVT::i16: ROpc = X86::IDIV16r; MOpc = X86::IDIV16m; break;
5876 case MVT::i32: ROpc = X86::IDIV32r; MOpc = X86::IDIV32m; break;
5877 case MVT::i64: ROpc = X86::IDIV64r; MOpc = X86::IDIV64m; break;
5878 }
5879 }
5880
5881 unsigned LoReg, HiReg, ClrReg;
5882 unsigned SExtOpcode;
5883 switch (NVT.SimpleTy) {
5884 default: llvm_unreachable("Unsupported VT!");
5885 case MVT::i8:
5886 LoReg = X86::AL; ClrReg = HiReg = X86::AH;
5887 SExtOpcode = 0; // Not used.
5888 break;
5889 case MVT::i16:
5890 LoReg = X86::AX; HiReg = X86::DX;
5891 ClrReg = X86::DX;
5892 SExtOpcode = X86::CWD;
5893 break;
5894 case MVT::i32:
5895 LoReg = X86::EAX; ClrReg = HiReg = X86::EDX;
5896 SExtOpcode = X86::CDQ;
5897 break;
5898 case MVT::i64:
5899 LoReg = X86::RAX; ClrReg = HiReg = X86::RDX;
5900 SExtOpcode = X86::CQO;
5901 break;
5902 }
5903
5904 SDValue Tmp0, Tmp1, Tmp2, Tmp3, Tmp4;
5905 bool foldedLoad = tryFoldLoad(Node, N1, Tmp0, Tmp1, Tmp2, Tmp3, Tmp4);
5906 bool signBitIsZero = CurDAG->SignBitIsZero(N0);
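 // If the dividend's sign bit is known zero, a plain zero-extension of the
 // low part is sufficient even for signed division.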
5907
5908 SDValue InGlue;
5909 if (NVT == MVT::i8) {
5910 // Special case for div8, just use a move with zero extension to AX to
5911 // clear the upper 8 bits (AH).
5912 SDValue Tmp0, Tmp1, Tmp2, Tmp3, Tmp4, Chain;
5913 MachineSDNode *Move;
5914 if (tryFoldLoad(Node, N0, Tmp0, Tmp1, Tmp2, Tmp3, Tmp4)) {
5915 SDValue Ops[] = { Tmp0, Tmp1, Tmp2, Tmp3, Tmp4, N0.getOperand(0) };
5916 unsigned Opc = (isSigned && !signBitIsZero) ? X86::MOVSX16rm8
5917 : X86::MOVZX16rm8;
5918 Move = CurDAG->getMachineNode(Opc, dl, MVT::i16, MVT::Other, Ops);
5919 Chain = SDValue(Move, 1);
5920 ReplaceUses(N0.getValue(1), Chain);
5921 // Record the mem-refs
5922 CurDAG->setNodeMemRefs(Move, {cast<LoadSDNode>(N0)->getMemOperand()});
5923 } else {
5924 unsigned Opc = (isSigned && !signBitIsZero) ? X86::MOVSX16rr8
5925 : X86::MOVZX16rr8;
5926 Move = CurDAG->getMachineNode(Opc, dl, MVT::i16, N0);
5927 Chain = CurDAG->getEntryNode();
5928 }
5929 Chain = CurDAG->getCopyToReg(Chain, dl, X86::AX, SDValue(Move, 0),
5930 SDValue());
5931 InGlue = Chain.getValue(1);
5932 } else {
5933 InGlue =
5934 CurDAG->getCopyToReg(CurDAG->getEntryNode(), dl,
5935 LoReg, N0, SDValue()).getValue(1);
5936 if (isSigned && !signBitIsZero) {
5937 // Sign extend the low part into the high part.
5938 InGlue =
5939 SDValue(CurDAG->getMachineNode(SExtOpcode, dl, MVT::Glue, InGlue),0);
5940 } else {
5941 // Zero out the high part, effectively zero extending the input.
5942 SDVTList VTs = CurDAG->getVTList(MVT::i32, MVT::i32);
5943 SDValue ClrNode =
5944 SDValue(CurDAG->getMachineNode(X86::MOV32r0, dl, VTs, {}), 0);
5945 switch (NVT.SimpleTy) {
5946 case MVT::i16:
5947 ClrNode =
5948 SDValue(CurDAG->getMachineNode(
5949 TargetOpcode::EXTRACT_SUBREG, dl, MVT::i16, ClrNode,
5950 CurDAG->getTargetConstant(X86::sub_16bit, dl,
5951 MVT::i32)),
5952 0);
5953 break;
5954 case MVT::i32:
5955 break;
5956 case MVT::i64:
5957 ClrNode =
5958 SDValue(CurDAG->getMachineNode(
5959 TargetOpcode::SUBREG_TO_REG, dl, MVT::i64,
5960 CurDAG->getTargetConstant(0, dl, MVT::i64), ClrNode,
5961 CurDAG->getTargetConstant(X86::sub_32bit, dl,
5962 MVT::i32)),
5963 0);
5964 break;
5965 default:
5966 llvm_unreachable("Unexpected division source");
5967 }
5968
5969 InGlue = CurDAG->getCopyToReg(CurDAG->getEntryNode(), dl, ClrReg,
5970 ClrNode, InGlue).getValue(1);
5971 }
5972 }
5973
5974 if (foldedLoad) {
5975 SDValue Ops[] = { Tmp0, Tmp1, Tmp2, Tmp3, Tmp4, N1.getOperand(0),
5976 InGlue };
5977 MachineSDNode *CNode =
5978 CurDAG->getMachineNode(MOpc, dl, MVT::Other, MVT::Glue, Ops);
5979 InGlue = SDValue(CNode, 1);
5980 // Update the chain.
5981 ReplaceUses(N1.getValue(1), SDValue(CNode, 0));
5982 // Record the mem-refs
5983 CurDAG->setNodeMemRefs(CNode, {cast<LoadSDNode>(N1)->getMemOperand()});
5984 } else {
5985 InGlue =
5986 SDValue(CurDAG->getMachineNode(ROpc, dl, MVT::Glue, N1, InGlue), 0);
5987 }
5988
5989 // Prevent use of AH in a REX instruction by explicitly copying it to
5990 // an ABCD_L register.
5991 //
5992 // The current assumption of the register allocator is that isel
5993 // won't generate explicit references to the GR8_ABCD_H registers. If
5994 // the allocator and/or the backend get enhanced to be more robust in
5995 // that regard, this can be, and should be, removed.
5996 if (HiReg == X86::AH && !SDValue(Node, 1).use_empty()) {
5997 SDValue AHCopy = CurDAG->getRegister(X86::AH, MVT::i8);
5998 unsigned AHExtOpcode =
5999 isSigned ? X86::MOVSX32rr8_NOREX : X86::MOVZX32rr8_NOREX;
6000
6001 SDNode *RNode = CurDAG->getMachineNode(AHExtOpcode, dl, MVT::i32,
6002 MVT::Glue, AHCopy, InGlue);
6003 SDValue Result(RNode, 0);
6004 InGlue = SDValue(RNode, 1);
6005
6006 Result =
6007 CurDAG->getTargetExtractSubreg(X86::sub_8bit, dl, MVT::i8, Result);
6008
6009 ReplaceUses(SDValue(Node, 1), Result);
6010 LLVM_DEBUG(dbgs() << "=> "; Result.getNode()->dump(CurDAG);
6011 dbgs() << '\n');
6012 }
6013 // Copy the division (low) result, if it is needed.
6014 if (!SDValue(Node, 0).use_empty()) {
6015 SDValue Result = CurDAG->getCopyFromReg(CurDAG->getEntryNode(), dl,
6016 LoReg, NVT, InGlue);
6017 InGlue = Result.getValue(2);
6018 ReplaceUses(SDValue(Node, 0), Result);
6019 LLVM_DEBUG(dbgs() << "=> "; Result.getNode()->dump(CurDAG);
6020 dbgs() << '\n');
6021 }
6022 // Copy the remainder (high) result, if it is needed.
6023 if (!SDValue(Node, 1).use_empty()) {
6024 SDValue Result = CurDAG->getCopyFromReg(CurDAG->getEntryNode(), dl,
6025 HiReg, NVT, InGlue);
6026 InGlue = Result.getValue(2);
6027 ReplaceUses(SDValue(Node, 1), Result);
6028 LLVM_DEBUG(dbgs() << "=> "; Result.getNode()->dump(CurDAG);
6029 dbgs() << '\n');
6030 }
6031 CurDAG->RemoveDeadNode(Node);
6032 return;
6033 }
6034
6035 case X86ISD::FCMP:
6036 case X86ISD::STRICT_FCMP:
6037 case X86ISD::STRICT_FCMPS: {
6038 bool IsStrictCmp = Node->getOpcode() == X86ISD::STRICT_FCMP ||
6039 Node->getOpcode() == X86ISD::STRICT_FCMPS;
6040 SDValue N0 = Node->getOperand(IsStrictCmp ? 1 : 0);
6041 SDValue N1 = Node->getOperand(IsStrictCmp ? 2 : 1);
6042
6043 // Save the original VT of the compare.
6044 MVT CmpVT = N0.getSimpleValueType();
6045
6046 // Floating point needs special handling if we don't have FCOMI.
6047 if (Subtarget->canUseCMOV())
6048 break;
6049
6050 bool IsSignaling = Node->getOpcode() == X86ISD::STRICT_FCMPS;
6051
6052 unsigned Opc;
6053 switch (CmpVT.SimpleTy) {
6054 default: llvm_unreachable("Unexpected type!");
6055 case MVT::f32:
6056 Opc = IsSignaling ? X86::COM_Fpr32 : X86::UCOM_Fpr32;
6057 break;
6058 case MVT::f64:
6059 Opc = IsSignaling ? X86::COM_Fpr64 : X86::UCOM_Fpr64;
6060 break;
6061 case MVT::f80:
6062 Opc = IsSignaling ? X86::COM_Fpr80 : X86::UCOM_Fpr80;
6063 break;
6064 }
6065
6066 SDValue Chain =
6067 IsStrictCmp ? Node->getOperand(0) : CurDAG->getEntryNode();
6068 SDValue Glue;
6069 if (IsStrictCmp) {
6070 SDVTList VTs = CurDAG->getVTList(MVT::Other, MVT::Glue);
6071 Chain = SDValue(CurDAG->getMachineNode(Opc, dl, VTs, {N0, N1, Chain}), 0);
6072 Glue = Chain.getValue(1);
6073 } else {
6074 Glue = SDValue(CurDAG->getMachineNode(Opc, dl, MVT::Glue, N0, N1), 0);
6075 }
6076
6077 // Move FPSW to AX.
6078 SDValue FNSTSW =
6079 SDValue(CurDAG->getMachineNode(X86::FNSTSW16r, dl, MVT::i16, Glue), 0);
6080
6081 // Extract upper 8-bits of AX.
6082 SDValue Extract =
6083 CurDAG->getTargetExtractSubreg(X86::sub_8bit_hi, dl, MVT::i8, FNSTSW);
6084
6085 // Move AH into flags.
6086 // Some 64-bit targets lack SAHF support, but they do support FCOMI.
6087 assert(Subtarget->canUseLAHFSAHF() &&
6088 "Target doesn't support SAHF or FCOMI?");
6089 SDValue AH = CurDAG->getCopyToReg(Chain, dl, X86::AH, Extract, SDValue());
6090 Chain = AH;
6091 SDValue SAHF = SDValue(
6092 CurDAG->getMachineNode(X86::SAHF, dl, MVT::i32, AH.getValue(1)), 0);
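 // After FNSTSW the x87 condition codes C0/C2/C3 sit in AH; SAHF copies
 // them into CF/PF/ZF so the ordinary integer condition-code patterns can
 // test the comparison result.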
6093
6094 if (IsStrictCmp)
6095 ReplaceUses(SDValue(Node, 1), Chain);
6096
6097 ReplaceUses(SDValue(Node, 0), SAHF);
6098 CurDAG->RemoveDeadNode(Node);
6099 return;
6100 }
6101
6102 case X86ISD::CMP: {
6103 SDValue N0 = Node->getOperand(0);
6104 SDValue N1 = Node->getOperand(1);
6105
6106 // Optimizations for TEST compares.
6107 if (!isNullConstant(N1))
6108 break;
6109
6110 // Save the original VT of the compare.
6111 MVT CmpVT = N0.getSimpleValueType();
6112
6113 // If we are comparing (and (shr X, C, Mask) with 0, emit a BEXTR followed
6114 // by a test instruction. The test should be removed later by
6115 // analyzeCompare if we are using only the zero flag.
6116 // TODO: Should we check the users and use the BEXTR flags directly?
6117 if (N0.getOpcode() == ISD::AND && N0.hasOneUse()) {
6118 if (MachineSDNode *NewNode = matchBEXTRFromAndImm(N0.getNode())) {
6119 unsigned TestOpc = CmpVT == MVT::i64 ? X86::TEST64rr
6120 : X86::TEST32rr;
6121 SDValue BEXTR = SDValue(NewNode, 0);
6122 NewNode = CurDAG->getMachineNode(TestOpc, dl, MVT::i32, BEXTR, BEXTR);
6123 ReplaceUses(SDValue(Node, 0), SDValue(NewNode, 0));
6124 CurDAG->RemoveDeadNode(Node);
6125 return;
6126 }
6127 }
6128
6129 // We can peek through truncates, but we need to be careful below.
6130 if (N0.getOpcode() == ISD::TRUNCATE && N0.hasOneUse())
6131 N0 = N0.getOperand(0);
6132
6133 // Look for (X86cmp (and $op, $imm), 0) and see if we can convert it to
6134 // use a smaller encoding.
6135 // Look past the truncate if CMP is the only use of it.
6136 if (N0.getOpcode() == ISD::AND && N0.getNode()->hasOneUse() &&
6137 N0.getValueType() != MVT::i8) {
6138 auto *MaskC = dyn_cast<ConstantSDNode>(N0.getOperand(1));
6139 if (!MaskC)
6140 break;
6141
6142 // We may have looked through a truncate so mask off any bits that
6143 // shouldn't be part of the compare.
6144 uint64_t Mask = MaskC->getZExtValue();
6145 Mask &= maskTrailingOnes<uint64_t>(CmpVT.getScalarSizeInBits());
6146
6147 // Check if we can replace AND+IMM{32,64} with a shift. This is possible
6148 // for masks like 0xFF000000 or 0x00FFFFFF and if we care only about the
6149 // zero flag.
6150 if (CmpVT == MVT::i64 && !isInt<8>(Mask) && isShiftedMask_64(Mask) &&
6151 onlyUsesZeroFlag(SDValue(Node, 0))) {
6152 unsigned ShiftOpcode = ISD::DELETED_NODE;
6153 unsigned ShiftAmt;
6154 unsigned SubRegIdx;
6155 MVT SubRegVT;
6156 unsigned TestOpcode;
6157 unsigned LeadingZeros = llvm::countl_zero(Mask);
6158 unsigned TrailingZeros = llvm::countr_zero(Mask);
6159
6160 // With leading/trailing zeros, the transform is profitable if we can
6161 // eliminate a movabsq or shrink a 32-bit immediate to 8-bit without
6162 // incurring any extra register moves.
6163 bool SavesBytes = !isInt<32>(Mask) || N0.getOperand(0).hasOneUse();
6164 if (LeadingZeros == 0 && SavesBytes) {
6165 // If the mask covers the most significant bit, then we can replace
6166 // TEST+AND with a SHR and check eflags.
6167 // This emits a redundant TEST which is subsequently eliminated.
6168 ShiftOpcode = GET_ND_IF_ENABLED(X86::SHR64ri);
6169 ShiftAmt = TrailingZeros;
6170 SubRegIdx = 0;
6171 TestOpcode = X86::TEST64rr;
6172 } else if (TrailingZeros == 0 && SavesBytes) {
6173 // If the mask covers the least significant bit, then we can replace
6174 // TEST+AND with a SHL and check eflags.
6175 // This emits a redundant TEST which is subsequently eliminated.
6176 ShiftOpcode = GET_ND_IF_ENABLED(X86::SHL64ri);
6177 ShiftAmt = LeadingZeros;
6178 SubRegIdx = 0;
6179 TestOpcode = X86::TEST64rr;
6180 } else if (MaskC->hasOneUse() && !isInt<32>(Mask)) {
6181 // If the shifted mask extends into the high half and is 8/16/32 bits
6182 // wide, then replace it with a SHR and a TEST8rr/TEST16rr/TEST32rr.
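 // For example, testing (X & 0x0000FFFF00000000) for zero becomes a
 // SHR64ri by 32 followed by TEST16rr on the 16-bit subregister, avoiding
 // a movabsq of the mask.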
6183 unsigned PopCount = 64 - LeadingZeros - TrailingZeros;
6184 if (PopCount == 8) {
6185 ShiftOpcode = GET_ND_IF_ENABLED(X86::SHR64ri);
6186 ShiftAmt = TrailingZeros;
6187 SubRegIdx = X86::sub_8bit;
6188 SubRegVT = MVT::i8;
6189 TestOpcode = X86::TEST8rr;
6190 } else if (PopCount == 16) {
6191 ShiftOpcode = GET_ND_IF_ENABLED(X86::SHR64ri);
6192 ShiftAmt = TrailingZeros;
6193 SubRegIdx = X86::sub_16bit;
6194 SubRegVT = MVT::i16;
6195 TestOpcode = X86::TEST16rr;
6196 } else if (PopCount == 32) {
6197 ShiftOpcode = GET_ND_IF_ENABLED(X86::SHR64ri);
6198 ShiftAmt = TrailingZeros;
6199 SubRegIdx = X86::sub_32bit;
6200 SubRegVT = MVT::i32;
6201 TestOpcode = X86::TEST32rr;
6202 }
6203 }
6204 if (ShiftOpcode != ISD::DELETED_NODE) {
6205 SDValue ShiftC = CurDAG->getTargetConstant(ShiftAmt, dl, MVT::i64);
6206 SDValue Shift = SDValue(
6207 CurDAG->getMachineNode(ShiftOpcode, dl, MVT::i64, MVT::i32,
6208 N0.getOperand(0), ShiftC),
6209 0);
6210 if (SubRegIdx != 0) {
6211 Shift =
6212 CurDAG->getTargetExtractSubreg(SubRegIdx, dl, SubRegVT, Shift);
6213 }
6214 MachineSDNode *Test =
6215 CurDAG->getMachineNode(TestOpcode, dl, MVT::i32, Shift, Shift);
6216 ReplaceNode(Node, Test);
6217 return;
6218 }
6219 }
6220
6221 MVT VT;
6222 int SubRegOp;
6223 unsigned ROpc, MOpc;
6224
6225 // For each of these checks we need to be careful if the sign flag is
6226 // being used. It is only safe to use the sign flag in two cases: either
6227 // the sign bit in the shrunken mask is zero, or the final test size is
6228 // equal to the original compare size.
6229
6230 if (isUInt<8>(Mask) &&
6231 (!(Mask & 0x80) || CmpVT == MVT::i8 ||
6232 hasNoSignFlagUses(SDValue(Node, 0)))) {
6233 // For example, convert "testl %eax, $8" to "testb %al, $8"
6234 VT = MVT::i8;
6235 SubRegOp = X86::sub_8bit;
6236 ROpc = X86::TEST8ri;
6237 MOpc = X86::TEST8mi;
6238 } else if (OptForMinSize && isUInt<16>(Mask) &&
6239 (!(Mask & 0x8000) || CmpVT == MVT::i16 ||
6240 hasNoSignFlagUses(SDValue(Node, 0)))) {
6241 // For example, "testl %eax, $32776" to "testw %ax, $32776".
6242 // NOTE: We only want to form TESTW instructions if optimizing for
6243 // min size. Otherwise we only save one byte and possibly get a length
6244 // changing prefix penalty in the decoders.
6245 VT = MVT::i16;
6246 SubRegOp = X86::sub_16bit;
6247 ROpc = X86::TEST16ri;
6248 MOpc = X86::TEST16mi;
6249 } else if (isUInt<32>(Mask) && N0.getValueType() != MVT::i16 &&
6250 ((!(Mask & 0x80000000) &&
6251 // Without minsize, 16-bit compares can get here, so we need to
6252 // be sure we calculate the correct sign flag if needed.
6253 (CmpVT != MVT::i16 || !(Mask & 0x8000))) ||
6254 CmpVT == MVT::i32 ||
6255 hasNoSignFlagUses(SDValue(Node, 0)))) {
6256 // For example, "testq %rax, $268468232" to "testl %eax, $268468232".
6257 // NOTE: We only want to run that transform if N0 is 32 or 64 bits.
6258 // Otherwise, we find ourselves in a position where we have to do
6259 // promotion. If previous passes did not promote the and, we assume
6260 // they had a good reason not to and do not promote here.
6261 VT = MVT::i32;
6262 SubRegOp = X86::sub_32bit;
6263 ROpc = X86::TEST32ri;
6264 MOpc = X86::TEST32mi;
6265 } else {
6266 // No eligible transformation was found.
6267 break;
6268 }
6269
6270 SDValue Imm = CurDAG->getTargetConstant(Mask, dl, VT);
6271 SDValue Reg = N0.getOperand(0);
6272
6273 // Emit a testl or testw.
6274 MachineSDNode *NewNode;
6275 SDValue Tmp0, Tmp1, Tmp2, Tmp3, Tmp4;
6276 if (tryFoldLoad(Node, N0.getNode(), Reg, Tmp0, Tmp1, Tmp2, Tmp3, Tmp4)) {
6277 if (auto *LoadN = dyn_cast<LoadSDNode>(N0.getOperand(0).getNode())) {
6278 if (!LoadN->isSimple()) {
6279 unsigned NumVolBits = LoadN->getValueType(0).getSizeInBits();
6280 if ((MOpc == X86::TEST8mi && NumVolBits != 8) ||
6281 (MOpc == X86::TEST16mi && NumVolBits != 16) ||
6282 (MOpc == X86::TEST32mi && NumVolBits != 32))
6283 break;
6284 }
6285 }
6286 SDValue Ops[] = { Tmp0, Tmp1, Tmp2, Tmp3, Tmp4, Imm,
6287 Reg.getOperand(0) };
6288 NewNode = CurDAG->getMachineNode(MOpc, dl, MVT::i32, MVT::Other, Ops);
6289 // Update the chain.
6290 ReplaceUses(Reg.getValue(1), SDValue(NewNode, 1));
6291 // Record the mem-refs
6292 CurDAG->setNodeMemRefs(NewNode,
6293 {cast<LoadSDNode>(Reg)->getMemOperand()});
6294 } else {
6295 // Extract the subregister if necessary.
6296 if (N0.getValueType() != VT)
6297 Reg = CurDAG->getTargetExtractSubreg(SubRegOp, dl, VT, Reg);
6298
6299 NewNode = CurDAG->getMachineNode(ROpc, dl, MVT::i32, Reg, Imm);
6300 }
6301 // Replace CMP with TEST.
6302 ReplaceNode(Node, NewNode);
6303 return;
6304 }
6305 break;
6306 }
6307 case X86ISD::PCMPISTR: {
6308 if (!Subtarget->hasSSE42())
6309 break;
6310
6311 bool NeedIndex = !SDValue(Node, 0).use_empty();
6312 bool NeedMask = !SDValue(Node, 1).use_empty();
6313 // We can't fold a load if we are going to make two instructions.
6314 bool MayFoldLoad = !NeedIndex || !NeedMask;
6315
6316 MachineSDNode *CNode;
6317 if (NeedMask) {
6318 unsigned ROpc =
6319 Subtarget->hasAVX() ? X86::VPCMPISTRMrri : X86::PCMPISTRMrri;
6320 unsigned MOpc =
6321 Subtarget->hasAVX() ? X86::VPCMPISTRMrmi : X86::PCMPISTRMrmi;
6322 CNode = emitPCMPISTR(ROpc, MOpc, MayFoldLoad, dl, MVT::v16i8, Node);
6323 ReplaceUses(SDValue(Node, 1), SDValue(CNode, 0));
6324 }
6325 if (NeedIndex || !NeedMask) {
6326 unsigned ROpc =
6327 Subtarget->hasAVX() ? X86::VPCMPISTRIrri : X86::PCMPISTRIrri;
6328 unsigned MOpc =
6329 Subtarget->hasAVX() ? X86::VPCMPISTRIrmi : X86::PCMPISTRIrmi;
6330 CNode = emitPCMPISTR(ROpc, MOpc, MayFoldLoad, dl, MVT::i32, Node);
6331 ReplaceUses(SDValue(Node, 0), SDValue(CNode, 0));
6332 }
6333
6334 // Connect the flag usage to the last instruction created.
6335 ReplaceUses(SDValue(Node, 2), SDValue(CNode, 1));
6336 CurDAG->RemoveDeadNode(Node);
6337 return;
6338 }
6339 case X86ISD::PCMPESTR: {
6340 if (!Subtarget->hasSSE42())
6341 break;
6342
6343 // Copy the two implicit register inputs.
6344 SDValue InGlue = CurDAG->getCopyToReg(CurDAG->getEntryNode(), dl, X86::EAX,
6345 Node->getOperand(1),
6346 SDValue()).getValue(1);
6347 InGlue = CurDAG->getCopyToReg(CurDAG->getEntryNode(), dl, X86::EDX,
6348 Node->getOperand(3), InGlue).getValue(1);
6349
6350 bool NeedIndex = !SDValue(Node, 0).use_empty();
6351 bool NeedMask = !SDValue(Node, 1).use_empty();
6352 // We can't fold a load if we are going to make two instructions.
6353 bool MayFoldLoad = !NeedIndex || !NeedMask;
6354
6355 MachineSDNode *CNode;
6356 if (NeedMask) {
6357 unsigned ROpc =
6358 Subtarget->hasAVX() ? X86::VPCMPESTRMrri : X86::PCMPESTRMrri;
6359 unsigned MOpc =
6360 Subtarget->hasAVX() ? X86::VPCMPESTRMrmi : X86::PCMPESTRMrmi;
6361 CNode =
6362 emitPCMPESTR(ROpc, MOpc, MayFoldLoad, dl, MVT::v16i8, Node, InGlue);
6363 ReplaceUses(SDValue(Node, 1), SDValue(CNode, 0));
6364 }
6365 if (NeedIndex || !NeedMask) {
6366 unsigned ROpc =
6367 Subtarget->hasAVX() ? X86::VPCMPESTRIrri : X86::PCMPESTRIrri;
6368 unsigned MOpc =
6369 Subtarget->hasAVX() ? X86::VPCMPESTRIrmi : X86::PCMPESTRIrmi;
6370 CNode = emitPCMPESTR(ROpc, MOpc, MayFoldLoad, dl, MVT::i32, Node, InGlue);
6371 ReplaceUses(SDValue(Node, 0), SDValue(CNode, 0));
6372 }
6373 // Connect the flag usage to the last instruction created.
6374 ReplaceUses(SDValue(Node, 2), SDValue(CNode, 1));
6375 CurDAG->RemoveDeadNode(Node);
6376 return;
6377 }
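Both string-compare cases above emit up to two machine instructions, one producing the index (in ECX) and one producing the mask (in XMM0), and only fold the memory operand when a single instruction is needed. A tiny standalone sketch of that decision; mayFoldLoad here is a hypothetical helper, not LLVM code.

#include <cstdio>

// A load can feed only one machine instruction, so folding is legal only when
// exactly one of the two results (index or mask) is actually used.
static bool mayFoldLoad(bool NeedIndex, bool NeedMask) {
  return !NeedIndex || !NeedMask;
}

int main() {
  printf("%d\n", mayFoldLoad(true, false));  // index only  -> 1 (fold OK)
  printf("%d\n", mayFoldLoad(false, true));  // mask only   -> 1 (fold OK)
  printf("%d\n", mayFoldLoad(true, true));   // both needed -> 0 (no fold)
  return 0;
}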
6378
6379 case ISD::SETCC: {
6380 if (NVT.isVector() && tryVPTESTM(Node, SDValue(Node, 0), SDValue()))
6381 return;
6382
6383 break;
6384 }
6385
6386 case ISD::STORE:
6387 if (foldLoadStoreIntoMemOperand(Node))
6388 return;
6389 break;
6390
6391 case X86ISD::SETCC_CARRY: {
6392 MVT VT = Node->getSimpleValueType(0);
6393 SDValue Result;
6394 if (Subtarget->hasSBBDepBreaking()) {
6395 // We have to do this manually because tblgen will put the eflags copy in
6396 // the wrong place if we use an extract_subreg in the pattern.
6397 // Copy the flags to the EFLAGS register and glue it to the next node.
6398 SDValue EFLAGS =
6399 CurDAG->getCopyToReg(CurDAG->getEntryNode(), dl, X86::EFLAGS,
6400 Node->getOperand(1), SDValue());
6401
6402 // Create a 64-bit instruction if the result is 64 bits; otherwise use
6403 // the 32-bit version.
6404 unsigned Opc = VT == MVT::i64 ? X86::SETB_C64r : X86::SETB_C32r;
6405 MVT SetVT = VT == MVT::i64 ? MVT::i64 : MVT::i32;
6406 Result = SDValue(
6407 CurDAG->getMachineNode(Opc, dl, SetVT, EFLAGS, EFLAGS.getValue(1)),
6408 0);
6409 } else {
6410 // The target does not recognize sbb with the same reg operand as a
6411 // no-source idiom, so we explicitly zero the input values.
6412 Result = getSBBZero(Node);
6413 }
6414
6415 // For results narrower than 32 bits we need to extract from the 32-bit node.
6416 if (VT == MVT::i8 || VT == MVT::i16) {
6417 int SubIndex = VT == MVT::i16 ? X86::sub_16bit : X86::sub_8bit;
6418 Result = CurDAG->getTargetExtractSubreg(SubIndex, dl, VT, Result);
6419 }
6420
6421 ReplaceUses(SDValue(Node, 0), Result);
6422 CurDAG->RemoveDeadNode(Node);
6423 return;
6424 }
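SETCC_CARRY materializes an all-zeros or all-ones value from the carry flag, which is exactly what the SETB_C32r/SETB_C64r pseudos (an sbb of a register with itself) compute. The following is a semantic model only, not the actual lowering; setccCarryModel is a hypothetical name.

#include <cstdint>
#include <cstdio>

// EAX - EAX - CF is 0 when CF is clear and all-ones when CF is set; narrower
// results are taken as a subregister of the 32-bit value, modeled here by
// masking to the requested width.
static uint64_t setccCarryModel(bool CarryFlag, unsigned Bits) {
  uint64_t Value = CarryFlag ? ~uint64_t(0) : 0;
  if (Bits < 64)
    Value &= (uint64_t(1) << Bits) - 1;
  return Value;
}

int main() {
  printf("%llx\n", (unsigned long long)setccCarryModel(true, 32));   // ffffffff
  printf("%llx\n", (unsigned long long)setccCarryModel(false, 32));  // 0
  printf("%llx\n", (unsigned long long)setccCarryModel(true, 8));    // ff
  return 0;
}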
6425 case X86ISD::SBB: {
6426 if (isNullConstant(Node->getOperand(0)) &&
6427 isNullConstant(Node->getOperand(1))) {
6428 SDValue Result = getSBBZero(Node);
6429
6430 // Replace the flag use.
6431 ReplaceUses(SDValue(Node, 1), Result.getValue(1));
6432
6433 // Replace the result use.
6434 if (!SDValue(Node, 0).use_empty()) {
6435 // For results narrower than 32 bits we need to extract from the 32-bit node.
6436 MVT VT = Node->getSimpleValueType(0);
6437 if (VT == MVT::i8 || VT == MVT::i16) {
6438 int SubIndex = VT == MVT::i16 ? X86::sub_16bit : X86::sub_8bit;
6439 Result = CurDAG->getTargetExtractSubreg(SubIndex, dl, VT, Result);
6440 }
6441 ReplaceUses(SDValue(Node, 0), Result);
6442 }
6443
6444 CurDAG->RemoveDeadNode(Node);
6445 return;
6446 }
6447 break;
6448 }
6449 case X86ISD::MGATHER: {
6450 auto *Mgt = cast<X86MaskedGatherSDNode>(Node);
6451 SDValue IndexOp = Mgt->getIndex();
6452 SDValue Mask = Mgt->getMask();
6453 MVT IndexVT = IndexOp.getSimpleValueType();
6454 MVT ValueVT = Node->getSimpleValueType(0);
6455 MVT MaskVT = Mask.getSimpleValueType();
6456
6457 // This is just to prevent crashes if the nodes are malformed somehow. We're
6458 // otherwise only doing loose type checking here, based on what a type
6459 // constraint would say, just like table-based isel.
6460 if (!ValueVT.isVector() || !MaskVT.isVector())
6461 break;
6462
6463 unsigned NumElts = ValueVT.getVectorNumElements();
6464 MVT ValueSVT = ValueVT.getVectorElementType();
6465
6466 bool IsFP = ValueSVT.isFloatingPoint();
6467 unsigned EltSize = ValueSVT.getSizeInBits();
6468
6469 unsigned Opc = 0;
6470 bool AVX512Gather = MaskVT.getVectorElementType() == MVT::i1;
6471 if (AVX512Gather) {
6472 if (IndexVT == MVT::v4i32 && NumElts == 4 && EltSize == 32)
6473 Opc = IsFP ? X86::VGATHERDPSZ128rm : X86::VPGATHERDDZ128rm;
6474 else if (IndexVT == MVT::v8i32 && NumElts == 8 && EltSize == 32)
6475 Opc = IsFP ? X86::VGATHERDPSZ256rm : X86::VPGATHERDDZ256rm;
6476 else if (IndexVT == MVT::v16i32 && NumElts == 16 && EltSize == 32)
6477 Opc = IsFP ? X86::VGATHERDPSZrm : X86::VPGATHERDDZrm;
6478 else if (IndexVT == MVT::v4i32 && NumElts == 2 && EltSize == 64)
6479 Opc = IsFP ? X86::VGATHERDPDZ128rm : X86::VPGATHERDQZ128rm;
6480 else if (IndexVT == MVT::v4i32 && NumElts == 4 && EltSize == 64)
6481 Opc = IsFP ? X86::VGATHERDPDZ256rm : X86::VPGATHERDQZ256rm;
6482 else if (IndexVT == MVT::v8i32 && NumElts == 8 && EltSize == 64)
6483 Opc = IsFP ? X86::VGATHERDPDZrm : X86::VPGATHERDQZrm;
6484 else if (IndexVT == MVT::v2i64 && NumElts == 4 && EltSize == 32)
6485 Opc = IsFP ? X86::VGATHERQPSZ128rm : X86::VPGATHERQDZ128rm;
6486 else if (IndexVT == MVT::v4i64 && NumElts == 4 && EltSize == 32)
6487 Opc = IsFP ? X86::VGATHERQPSZ256rm : X86::VPGATHERQDZ256rm;
6488 else if (IndexVT == MVT::v8i64 && NumElts == 8 && EltSize == 32)
6489 Opc = IsFP ? X86::VGATHERQPSZrm : X86::VPGATHERQDZrm;
6490 else if (IndexVT == MVT::v2i64 && NumElts == 2 && EltSize == 64)
6491 Opc = IsFP ? X86::VGATHERQPDZ128rm : X86::VPGATHERQQZ128rm;
6492 else if (IndexVT == MVT::v4i64 && NumElts == 4 && EltSize == 64)
6493 Opc = IsFP ? X86::VGATHERQPDZ256rm : X86::VPGATHERQQZ256rm;
6494 else if (IndexVT == MVT::v8i64 && NumElts == 8 && EltSize == 64)
6495 Opc = IsFP ? X86::VGATHERQPDZrm : X86::VPGATHERQQZrm;
6496 } else {
6497 assert(EVT(MaskVT) == EVT(ValueVT).changeVectorElementTypeToInteger() &&
6498 "Unexpected mask VT!");
6499 if (IndexVT == MVT::v4i32 && NumElts == 4 && EltSize == 32)
6500 Opc = IsFP ? X86::VGATHERDPSrm : X86::VPGATHERDDrm;
6501 else if (IndexVT == MVT::v8i32 && NumElts == 8 && EltSize == 32)
6502 Opc = IsFP ? X86::VGATHERDPSYrm : X86::VPGATHERDDYrm;
6503 else if (IndexVT == MVT::v4i32 && NumElts == 2 && EltSize == 64)
6504 Opc = IsFP ? X86::VGATHERDPDrm : X86::VPGATHERDQrm;
6505 else if (IndexVT == MVT::v4i32 && NumElts == 4 && EltSize == 64)
6506 Opc = IsFP ? X86::VGATHERDPDYrm : X86::VPGATHERDQYrm;
6507 else if (IndexVT == MVT::v2i64 && NumElts == 4 && EltSize == 32)
6508 Opc = IsFP ? X86::VGATHERQPSrm : X86::VPGATHERQDrm;
6509 else if (IndexVT == MVT::v4i64 && NumElts == 4 && EltSize == 32)
6510 Opc = IsFP ? X86::VGATHERQPSYrm : X86::VPGATHERQDYrm;
6511 else if (IndexVT == MVT::v2i64 && NumElts == 2 && EltSize == 64)
6512 Opc = IsFP ? X86::VGATHERQPDrm : X86::VPGATHERQQrm;
6513 else if (IndexVT == MVT::v4i64 && NumElts == 4 && EltSize == 64)
6514 Opc = IsFP ? X86::VGATHERQPDYrm : X86::VPGATHERQQYrm;
6515 }
6516
6517 if (!Opc)
6518 break;
6519
6520 SDValue Base, Scale, Index, Disp, Segment;
6521 if (!selectVectorAddr(Mgt, Mgt->getBasePtr(), IndexOp, Mgt->getScale(),
6522 Base, Scale, Index, Disp, Segment))
6523 break;
6524
6525 SDValue PassThru = Mgt->getPassThru();
6526 SDValue Chain = Mgt->getChain();
6527 // Gather instructions have a mask output not in the ISD node.
6528 SDVTList VTs = CurDAG->getVTList(ValueVT, MaskVT, MVT::Other);
6529
6530 MachineSDNode *NewNode;
6531 if (AVX512Gather) {
6532 SDValue Ops[] = {PassThru, Mask, Base, Scale,
6533 Index, Disp, Segment, Chain};
6534 NewNode = CurDAG->getMachineNode(Opc, SDLoc(dl), VTs, Ops);
6535 } else {
6536 SDValue Ops[] = {PassThru, Base, Scale, Index,
6537 Disp, Segment, Mask, Chain};
6538 NewNode = CurDAG->getMachineNode(Opc, SDLoc(dl), VTs, Ops);
6539 }
6540 CurDAG->setNodeMemRefs(NewNode, {Mgt->getMemOperand()});
6541 ReplaceUses(SDValue(Node, 0), SDValue(NewNode, 0));
6542 ReplaceUses(SDValue(Node, 1), SDValue(NewNode, 2));
6543 CurDAG->RemoveDeadNode(Node);
6544 return;
6545 }
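The opcode ladder above only selects an encoding from the index type, element count, element size, and the FP/integer distinction; the instruction itself performs a per-lane masked load. Below is a scalar model of the semantics for illustration only; gatherModel is a hypothetical name and only the i32-element case is shown.

#include <cstdint>
#include <cstdio>
#include <vector>

// Each active lane loads from Base + Index[i] * Scale; inactive lanes keep the
// pass-through value.
static std::vector<int32_t> gatherModel(const int32_t *Base,
                                        const std::vector<int32_t> &Index,
                                        int Scale,
                                        const std::vector<bool> &Mask,
                                        const std::vector<int32_t> &PassThru) {
  std::vector<int32_t> Result(PassThru);
  for (size_t I = 0; I < Index.size(); ++I)
    if (Mask[I])
      Result[I] = *reinterpret_cast<const int32_t *>(
          reinterpret_cast<const char *>(Base) + int64_t(Index[I]) * Scale);
  return Result;
}

int main() {
  int32_t Mem[8] = {10, 11, 12, 13, 14, 15, 16, 17};
  std::vector<int32_t> Index = {0, 2, 4, 6};       // element indices
  std::vector<bool> Mask = {true, false, true, true};
  std::vector<int32_t> PassThru = {-1, -1, -1, -1};
  // A scale of 4 bytes turns element indices into byte offsets.
  std::vector<int32_t> R = gatherModel(Mem, Index, 4, Mask, PassThru);
  for (int32_t V : R)
    printf("%d ", V);                              // 10 -1 14 16
  printf("\n");
  return 0;
}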
6546 case X86ISD::MSCATTER: {
6547 auto *Sc = cast<X86MaskedScatterSDNode>(Node);
6548 SDValue Value = Sc->getValue();
6549 SDValue IndexOp = Sc->getIndex();
6550 MVT IndexVT = IndexOp.getSimpleValueType();
6551 MVT ValueVT = Value.getSimpleValueType();
6552
6553 // This is just to prevent crashes if the nodes are malformed somehow. We're
6554 // otherwise only doing loose type checking here, based on what a type
6555 // constraint would say, just like table-based isel.
6556 if (!ValueVT.isVector())
6557 break;
6558
6559 unsigned NumElts = ValueVT.getVectorNumElements();
6560 MVT ValueSVT = ValueVT.getVectorElementType();
6561
6562 bool IsFP = ValueSVT.isFloatingPoint();
6563 unsigned EltSize = ValueSVT.getSizeInBits();
6564
6565 unsigned Opc;
6566 if (IndexVT == MVT::v4i32 && NumElts == 4 && EltSize == 32)
6567 Opc = IsFP ? X86::VSCATTERDPSZ128mr : X86::VPSCATTERDDZ128mr;
6568 else if (IndexVT == MVT::v8i32 && NumElts == 8 && EltSize == 32)
6569 Opc = IsFP ? X86::VSCATTERDPSZ256mr : X86::VPSCATTERDDZ256mr;
6570 else if (IndexVT == MVT::v16i32 && NumElts == 16 && EltSize == 32)
6571 Opc = IsFP ? X86::VSCATTERDPSZmr : X86::VPSCATTERDDZmr;
6572 else if (IndexVT == MVT::v4i32 && NumElts == 2 && EltSize == 64)
6573 Opc = IsFP ? X86::VSCATTERDPDZ128mr : X86::VPSCATTERDQZ128mr;
6574 else if (IndexVT == MVT::v4i32 && NumElts == 4 && EltSize == 64)
6575 Opc = IsFP ? X86::VSCATTERDPDZ256mr : X86::VPSCATTERDQZ256mr;
6576 else if (IndexVT == MVT::v8i32 && NumElts == 8 && EltSize == 64)
6577 Opc = IsFP ? X86::VSCATTERDPDZmr : X86::VPSCATTERDQZmr;
6578 else if (IndexVT == MVT::v2i64 && NumElts == 4 && EltSize == 32)
6579 Opc = IsFP ? X86::VSCATTERQPSZ128mr : X86::VPSCATTERQDZ128mr;
6580 else if (IndexVT == MVT::v4i64 && NumElts == 4 && EltSize == 32)
6581 Opc = IsFP ? X86::VSCATTERQPSZ256mr : X86::VPSCATTERQDZ256mr;
6582 else if (IndexVT == MVT::v8i64 && NumElts == 8 && EltSize == 32)
6583 Opc = IsFP ? X86::VSCATTERQPSZmr : X86::VPSCATTERQDZmr;
6584 else if (IndexVT == MVT::v2i64 && NumElts == 2 && EltSize == 64)
6585 Opc = IsFP ? X86::VSCATTERQPDZ128mr : X86::VPSCATTERQQZ128mr;
6586 else if (IndexVT == MVT::v4i64 && NumElts == 4 && EltSize == 64)
6587 Opc = IsFP ? X86::VSCATTERQPDZ256mr : X86::VPSCATTERQQZ256mr;
6588 else if (IndexVT == MVT::v8i64 && NumElts == 8 && EltSize == 64)
6589 Opc = IsFP ? X86::VSCATTERQPDZmr : X86::VPSCATTERQQZmr;
6590 else
6591 break;
6592
6593 SDValue Base, Scale, Index, Disp, Segment;
6594 if (!selectVectorAddr(Sc, Sc->getBasePtr(), IndexOp, Sc->getScale(),
6595 Base, Scale, Index, Disp, Segment))
6596 break;
6597
6598 SDValue Mask = Sc->getMask();
6599 SDValue Chain = Sc->getChain();
6600 // Scatter instructions have a mask output not in the ISD node.
6601 SDVTList VTs = CurDAG->getVTList(Mask.getValueType(), MVT::Other);
6602 SDValue Ops[] = {Base, Scale, Index, Disp, Segment, Mask, Value, Chain};
6603
6604 MachineSDNode *NewNode = CurDAG->getMachineNode(Opc, SDLoc(dl), VTs, Ops);
6605 CurDAG->setNodeMemRefs(NewNode, {Sc->getMemOperand()});
6606 ReplaceUses(SDValue(Node, 0), SDValue(NewNode, 1));
6607 CurDAG->RemoveDeadNode(Node);
6608 return;
6609 }
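Scatter is the mirror image of gather: each active lane stores its element to Base + Index[i] * Scale. Again, this is a scalar model for illustration only; scatterModel is a hypothetical name.

#include <cstdint>
#include <cstdio>

// Each active lane stores Value[i] to Base + Index[i] * Scale; inactive lanes
// write nothing.
static void scatterModel(int32_t *Base, const int32_t *Index, int Scale,
                         const bool *Mask, const int32_t *Value,
                         unsigned NumElts) {
  for (unsigned I = 0; I != NumElts; ++I)
    if (Mask[I])
      *reinterpret_cast<int32_t *>(reinterpret_cast<char *>(Base) +
                                   int64_t(Index[I]) * Scale) = Value[I];
}

int main() {
  int32_t Mem[4] = {0, 0, 0, 0};
  int32_t Index[2] = {3, 1};
  bool Mask[2] = {true, false};
  int32_t Value[2] = {42, 99};
  scatterModel(Mem, Index, /*Scale=*/4, Mask, Value, 2);
  printf("%d %d %d %d\n", Mem[0], Mem[1], Mem[2], Mem[3]);  // 0 0 0 42
  return 0;
}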
6610 case ISD::PREALLOCATED_SETUP: {
6611 auto *MFI = CurDAG->getMachineFunction().getInfo<X86MachineFunctionInfo>();
6612 auto CallId = MFI->getPreallocatedIdForCallSite(
6613 cast<SrcValueSDNode>(Node->getOperand(1))->getValue());
6614 SDValue Chain = Node->getOperand(0);
6615 SDValue CallIdValue = CurDAG->getTargetConstant(CallId, dl, MVT::i32);
6616 MachineSDNode *New = CurDAG->getMachineNode(
6617 TargetOpcode::PREALLOCATED_SETUP, dl, MVT::Other, CallIdValue, Chain);
6618 ReplaceUses(SDValue(Node, 0), SDValue(New, 0)); // Chain
6619 CurDAG->RemoveDeadNode(Node);
6620 return;
6621 }
6622 case ISD::PREALLOCATED_ARG: {
6623 auto *MFI = CurDAG->getMachineFunction().getInfo<X86MachineFunctionInfo>();
6624 auto CallId = MFI->getPreallocatedIdForCallSite(
6625 cast<SrcValueSDNode>(Node->getOperand(1))->getValue());
6626 SDValue Chain = Node->getOperand(0);
6627 SDValue CallIdValue = CurDAG->getTargetConstant(CallId, dl, MVT::i32);
6628 SDValue ArgIndex = Node->getOperand(2);
6629 SDValue Ops[3];
6630 Ops[0] = CallIdValue;
6631 Ops[1] = ArgIndex;
6632 Ops[2] = Chain;
6633 MachineSDNode *New = CurDAG->getMachineNode(
6634 TargetOpcode::PREALLOCATED_ARG, dl,
6635 CurDAG->getVTList(TLI->getPointerTy(CurDAG->getDataLayout()),
6636 MVT::Other),
6637 Ops);
6638 ReplaceUses(SDValue(Node, 0), SDValue(New, 0)); // Arg pointer
6639 ReplaceUses(SDValue(Node, 1), SDValue(New, 1)); // Chain
6640 CurDAG->RemoveDeadNode(Node);
6641 return;
6642 }
6643 case X86ISD::AESENCWIDE128KL:
6644 case X86ISD::AESDECWIDE128KL:
6645 case X86ISD::AESENCWIDE256KL:
6646 case X86ISD::AESDECWIDE256KL: {
6647 if (!Subtarget->hasWIDEKL())
6648 break;
6649
6650 unsigned Opcode;
6651 switch (Node->getOpcode()) {
6652 default:
6653 llvm_unreachable("Unexpected opcode!");
6654 case X86ISD::AESENCWIDE128KL:
6655 Opcode = X86::AESENCWIDE128KL;
6656 break;
6657 case X86ISD::AESDECWIDE128KL:
6658 Opcode = X86::AESDECWIDE128KL;
6659 break;
6660 case X86ISD::AESENCWIDE256KL:
6661 Opcode = X86::AESENCWIDE256KL;
6662 break;
6663 case X86ISD::AESDECWIDE256KL:
6664 Opcode = X86::AESDECWIDE256KL;
6665 break;
6666 }
6667
6668 SDValue Chain = Node->getOperand(0);
6669 SDValue Addr = Node->getOperand(1);
6670
6671 SDValue Base, Scale, Index, Disp, Segment;
6672 if (!selectAddr(Node, Addr, Base, Scale, Index, Disp, Segment))
6673 break;
6674
6675 Chain = CurDAG->getCopyToReg(Chain, dl, X86::XMM0, Node->getOperand(2),
6676 SDValue());
6677 Chain = CurDAG->getCopyToReg(Chain, dl, X86::XMM1, Node->getOperand(3),
6678 Chain.getValue(1));
6679 Chain = CurDAG->getCopyToReg(Chain, dl, X86::XMM2, Node->getOperand(4),
6680 Chain.getValue(1));
6681 Chain = CurDAG->getCopyToReg(Chain, dl, X86::XMM3, Node->getOperand(5),
6682 Chain.getValue(1));
6683 Chain = CurDAG->getCopyToReg(Chain, dl, X86::XMM4, Node->getOperand(6),
6684 Chain.getValue(1));
6685 Chain = CurDAG->getCopyToReg(Chain, dl, X86::XMM5, Node->getOperand(7),
6686 Chain.getValue(1));
6687 Chain = CurDAG->getCopyToReg(Chain, dl, X86::XMM6, Node->getOperand(8),
6688 Chain.getValue(1));
6689 Chain = CurDAG->getCopyToReg(Chain, dl, X86::XMM7, Node->getOperand(9),
6690 Chain.getValue(1));
6691
6692 MachineSDNode *Res = CurDAG->getMachineNode(
6693 Opcode, dl, Node->getVTList(),
6694 {Base, Scale, Index, Disp, Segment, Chain, Chain.getValue(1)});
6695 CurDAG->setNodeMemRefs(Res, cast<MemSDNode>(Node)->getMemOperand());
6696 ReplaceNode(Node, Res);
6697 return;
6698 }
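The WIDEKL forms operate on eight 128-bit blocks that are passed implicitly in XMM0 through XMM7, which is why the selector glues eight CopyToReg nodes onto the chain before issuing the instruction. A trivial standalone illustration of that operand-to-register pinning; this is not LLVM code.

#include <cstdio>

int main() {
  // Node operands 2..9 above are copied, in order, into XMM0..XMM7.
  for (unsigned OpIdx = 2; OpIdx <= 9; ++OpIdx)
    printf("operand %u -> XMM%u\n", OpIdx, OpIdx - 2);
  return 0;
}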
6699 case X86ISD::POP_FROM_X87_REG: {
6700 SDValue Chain = Node->getOperand(0);
6701 Register Reg = cast<RegisterSDNode>(Node->getOperand(1))->getReg();
6702 SDValue Glue;
6703 if (Node->getNumValues() == 3)
6704 Glue = Node->getOperand(2);
6705 SDValue Copy =
6706 CurDAG->getCopyFromReg(Chain, dl, Reg, Node->getValueType(0), Glue);
6707 ReplaceNode(Node, Copy.getNode());
6708 return;
6709 }
6710 }
6711
6712 SelectCode(Node);
6713}
6714
6715bool X86DAGToDAGISel::SelectInlineAsmMemoryOperand(
6716 const SDValue &Op, InlineAsm::ConstraintCode ConstraintID,
6717 std::vector<SDValue> &OutOps) {
6718 SDValue Op0, Op1, Op2, Op3, Op4;
6719 switch (ConstraintID) {
6720 default:
6721 llvm_unreachable("Unexpected asm memory constraint");
6722 case InlineAsm::ConstraintCode::o: // offsetable ??
6723 case InlineAsm::ConstraintCode::v: // not offsetable ??
6724 case InlineAsm::ConstraintCode::m: // memory
6725 case InlineAsm::ConstraintCode::X:
6726 case InlineAsm::ConstraintCode::p: // address
6727 if (!selectAddr(nullptr, Op, Op0, Op1, Op2, Op3, Op4))
6728 return true;
6729 break;
6730 }
6731
6732 OutOps.push_back(Op0);
6733 OutOps.push_back(Op1);
6734 OutOps.push_back(Op2);
6735 OutOps.push_back(Op3);
6736 OutOps.push_back(Op4);
6737 return false;
6738}
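The five operands pushed into OutOps describe a complete x86 memory reference. Ignoring the segment, the effective address they denote is Base + Index * Scale + Disp; here is a minimal standalone illustration, where effectiveAddress is a hypothetical name and not LLVM code.

#include <cstdint>
#include <cstdio>

// Computes the x86 effective address Base + Index * Scale + Disp.
static uint64_t effectiveAddress(uint64_t Base, uint64_t Index, unsigned Scale,
                                 int32_t Disp) {
  return Base + Index * Scale + int64_t(Disp);
}

int main() {
  // Something like "movl 16(%rdi,%rcx,4), %eax" with %rdi=0x1000, %rcx=3.
  printf("0x%llx\n",
         (unsigned long long)effectiveAddress(0x1000, 3, 4, 16));  // 0x101c
  return 0;
}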
6739
6740 X86ISelDAGToDAGPass::X86ISelDAGToDAGPass(X86TargetMachine &TM)
6741     : SelectionDAGISelPass(
6742           std::make_unique<X86DAGToDAGISel>(TM, TM.getOptLevel())) {}
6743
6744/// This pass converts a legalized DAG into an X86-specific DAG,
6745/// ready for instruction scheduling.
6746 FunctionPass *llvm::createX86ISelDag(X86TargetMachine &TM,
6747                                      CodeGenOptLevel OptLevel) {
6748 return new X86DAGToDAGISelLegacy(TM, OptLevel);
6749}