LLVM 23.0.0git
AMDGPUISelDAGToDAG.cpp
Go to the documentation of this file.
1//===-- AMDGPUISelDAGToDAG.cpp - A dag to dag inst selector for AMDGPU ----===//
2//
3// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4// See https://llvm.org/LICENSE.txt for license information.
5// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6//
7//==-----------------------------------------------------------------------===//
8//
9/// \file
10/// Defines an instruction selector for the AMDGPU target.
11//
12//===----------------------------------------------------------------------===//
13
14#include "AMDGPUISelDAGToDAG.h"
15#include "AMDGPU.h"
16#include "AMDGPUInstrInfo.h"
17#include "AMDGPUSubtarget.h"
18#include "AMDGPUTargetMachine.h"
21#include "R600RegisterInfo.h"
22#include "SIISelLowering.h"
29#include "llvm/IR/IntrinsicsAMDGPU.h"
32
33#ifdef EXPENSIVE_CHECKS
35#include "llvm/IR/Dominators.h"
36#endif
37
38#define DEBUG_TYPE "amdgpu-isel"
39
40using namespace llvm;
41
42//===----------------------------------------------------------------------===//
43// Instruction Selector Implementation
44//===----------------------------------------------------------------------===//
45
46namespace {
47static SDValue stripBitcast(SDValue Val) {
48 return Val.getOpcode() == ISD::BITCAST ? Val.getOperand(0) : Val;
49}
50
51// Figure out if this is really an extract of the high 16-bits of a dword.
52static bool isExtractHiElt(SDValue In, SDValue &Out) {
53 In = stripBitcast(In);
54
55 if (In.getOpcode() == ISD::EXTRACT_VECTOR_ELT) {
56 if (ConstantSDNode *Idx = dyn_cast<ConstantSDNode>(In.getOperand(1))) {
57 if (!Idx->isOne())
58 return false;
59 Out = In.getOperand(0);
60 return true;
61 }
62 }
63
64 if (In.getOpcode() != ISD::TRUNCATE)
65 return false;
66
67 SDValue Srl = In.getOperand(0);
68 if (Srl.getOpcode() == ISD::SRL) {
69 if (ConstantSDNode *ShiftAmt = dyn_cast<ConstantSDNode>(Srl.getOperand(1))) {
70 if (ShiftAmt->getZExtValue() == 16) {
71 Out = stripBitcast(Srl.getOperand(0));
72 return true;
73 }
74 }
75 }
76
77 return false;
78}
79
80static SDValue createVOP3PSrc32FromLo16(SDValue Lo, SDValue Src,
81 llvm::SelectionDAG *CurDAG,
82 const GCNSubtarget *Subtarget) {
83 if (!Subtarget->useRealTrue16Insts()) {
84 return Lo;
85 }
86
87 SDValue NewSrc;
88 SDLoc SL(Lo);
89
90 if (Lo->isDivergent()) {
91 SDValue Undef = SDValue(CurDAG->getMachineNode(TargetOpcode::IMPLICIT_DEF,
92 SL, Lo.getValueType()),
93 0);
94 const SDValue Ops[] = {
95 CurDAG->getTargetConstant(AMDGPU::VGPR_32RegClassID, SL, MVT::i32), Lo,
96 CurDAG->getTargetConstant(AMDGPU::lo16, SL, MVT::i16), Undef,
97 CurDAG->getTargetConstant(AMDGPU::hi16, SL, MVT::i16)};
98
99 NewSrc = SDValue(CurDAG->getMachineNode(TargetOpcode::REG_SEQUENCE, SL,
100 Src.getValueType(), Ops),
101 0);
102 } else {
103 // the S_MOV is needed since the Lo could still be a VGPR16.
104 // With S_MOV, isel insert a "sgpr32 = copy vgpr16" and we reply on
105 // the fixvgpr2sgprcopy pass to legalize it
106 NewSrc = SDValue(
107 CurDAG->getMachineNode(AMDGPU::S_MOV_B32, SL, Src.getValueType(), Lo),
108 0);
109 }
110
111 return NewSrc;
112}
113
114// Look through operations that obscure just looking at the low 16-bits of the
115// same register.
116static SDValue stripExtractLoElt(SDValue In) {
117 if (In.getOpcode() == ISD::EXTRACT_VECTOR_ELT) {
118 SDValue Idx = In.getOperand(1);
119 if (isNullConstant(Idx) && In.getValueSizeInBits() <= 32)
120 return In.getOperand(0);
121 }
122
123 if (In.getOpcode() == ISD::TRUNCATE) {
124 SDValue Src = In.getOperand(0);
125 if (Src.getValueType().getSizeInBits() == 32)
126 return stripBitcast(Src);
127 }
128
129 return In;
130}
131
132static SDValue emitRegSequence(llvm::SelectionDAG &CurDAG, unsigned DstRegClass,
133 EVT DstTy, ArrayRef<SDValue> Elts,
134 ArrayRef<unsigned> SubRegClass,
135 const SDLoc &DL) {
136 assert(Elts.size() == SubRegClass.size() && "array size mismatch");
137 unsigned NumElts = Elts.size();
138 SmallVector<SDValue, 17> Ops(2 * NumElts + 1);
139 Ops[0] = (CurDAG.getTargetConstant(DstRegClass, DL, MVT::i32));
140 for (unsigned i = 0; i < NumElts; ++i) {
141 Ops[2 * i + 1] = Elts[i];
142 Ops[2 * i + 2] = CurDAG.getTargetConstant(SubRegClass[i], DL, MVT::i32);
143 }
144 return SDValue(
145 CurDAG.getMachineNode(TargetOpcode::REG_SEQUENCE, DL, DstTy, Ops), 0);
146}
147
148} // end anonymous namespace
149
151 "AMDGPU DAG->DAG Pattern Instruction Selection", false,
152 false)
153INITIALIZE_PASS_DEPENDENCY(AMDGPUPerfHintAnalysisLegacy)
155#ifdef EXPENSIVE_CHECKS
158#endif
160 "AMDGPU DAG->DAG Pattern Instruction Selection", false,
161 false)
162
163/// This pass converts a legalized DAG into a AMDGPU-specific
164// DAG, ready for instruction scheduling.
166 CodeGenOptLevel OptLevel) {
167 return new AMDGPUDAGToDAGISelLegacy(TM, OptLevel);
168}
169
173
175 Subtarget = &MF.getSubtarget<GCNSubtarget>();
176 Subtarget->checkSubtargetFeatures(MF.getFunction());
177 Mode = SIModeRegisterDefaults(MF.getFunction(), *Subtarget);
179}
180
181bool AMDGPUDAGToDAGISel::fp16SrcZerosHighBits(unsigned Opc) const {
182 // XXX - only need to list legal operations.
183 switch (Opc) {
184 case ISD::FADD:
185 case ISD::FSUB:
186 case ISD::FMUL:
187 case ISD::FDIV:
188 case ISD::FREM:
190 case ISD::UINT_TO_FP:
191 case ISD::SINT_TO_FP:
192 case ISD::FABS:
193 // Fabs is lowered to a bit operation, but it's an and which will clear the
194 // high bits anyway.
195 case ISD::FSQRT:
196 case ISD::FSIN:
197 case ISD::FCOS:
198 case ISD::FPOWI:
199 case ISD::FPOW:
200 case ISD::FLOG:
201 case ISD::FLOG2:
202 case ISD::FLOG10:
203 case ISD::FEXP:
204 case ISD::FEXP2:
205 case ISD::FCEIL:
206 case ISD::FTRUNC:
207 case ISD::FRINT:
208 case ISD::FNEARBYINT:
209 case ISD::FROUNDEVEN:
210 case ISD::FROUND:
211 case ISD::FFLOOR:
212 case ISD::FMINNUM:
213 case ISD::FMAXNUM:
214 case ISD::FLDEXP:
215 case AMDGPUISD::FRACT:
216 case AMDGPUISD::CLAMP:
217 case AMDGPUISD::COS_HW:
218 case AMDGPUISD::SIN_HW:
219 case AMDGPUISD::FMIN3:
220 case AMDGPUISD::FMAX3:
221 case AMDGPUISD::FMED3:
222 case AMDGPUISD::FMAD_FTZ:
223 case AMDGPUISD::RCP:
224 case AMDGPUISD::RSQ:
225 case AMDGPUISD::RCP_IFLAG:
226 // On gfx10, all 16-bit instructions preserve the high bits.
227 return Subtarget->getGeneration() <= AMDGPUSubtarget::GFX9;
228 case ISD::FP_ROUND:
229 // We may select fptrunc (fma/mad) to mad_mixlo, which does not zero the
230 // high bits on gfx9.
231 // TODO: If we had the source node we could see if the source was fma/mad
233 case ISD::FMA:
234 case ISD::FMAD:
235 case AMDGPUISD::DIV_FIXUP:
237 default:
238 // fcopysign, select and others may be lowered to 32-bit bit operations
239 // which don't zero the high bits.
240 return false;
241 }
242}
243
245#ifdef EXPENSIVE_CHECKS
247 LoopInfo *LI = &getAnalysis<LoopInfoWrapperPass>().getLoopInfo();
248 for (auto &L : LI->getLoopsInPreorder()) {
249 assert(L->isLCSSAForm(DT));
250 }
251#endif
253}
254
263
265 assert(Subtarget->d16PreservesUnusedBits());
266 MVT VT = N->getValueType(0).getSimpleVT();
267 if (VT != MVT::v2i16 && VT != MVT::v2f16)
268 return false;
269
270 SDValue Lo = N->getOperand(0);
271 SDValue Hi = N->getOperand(1);
272
273 LoadSDNode *LdHi = dyn_cast<LoadSDNode>(stripBitcast(Hi));
274
275 // build_vector lo, (load ptr) -> load_d16_hi ptr, lo
276 // build_vector lo, (zextload ptr from i8) -> load_d16_hi_u8 ptr, lo
277 // build_vector lo, (sextload ptr from i8) -> load_d16_hi_i8 ptr, lo
278
279 // Need to check for possible indirect dependencies on the other half of the
280 // vector to avoid introducing a cycle.
281 if (LdHi && Hi.hasOneUse() && !LdHi->isPredecessorOf(Lo.getNode())) {
282 SDVTList VTList = CurDAG->getVTList(VT, MVT::Other);
283
284 SDValue TiedIn = CurDAG->getNode(ISD::SCALAR_TO_VECTOR, SDLoc(N), VT, Lo);
285 SDValue Ops[] = {
286 LdHi->getChain(), LdHi->getBasePtr(), TiedIn
287 };
288
289 unsigned LoadOp = AMDGPUISD::LOAD_D16_HI;
290 if (LdHi->getMemoryVT() == MVT::i8) {
291 LoadOp = LdHi->getExtensionType() == ISD::SEXTLOAD ?
292 AMDGPUISD::LOAD_D16_HI_I8 : AMDGPUISD::LOAD_D16_HI_U8;
293 } else {
294 assert(LdHi->getMemoryVT() == MVT::i16);
295 }
296
297 SDValue NewLoadHi =
298 CurDAG->getMemIntrinsicNode(LoadOp, SDLoc(LdHi), VTList,
299 Ops, LdHi->getMemoryVT(),
300 LdHi->getMemOperand());
301
302 CurDAG->ReplaceAllUsesOfValueWith(SDValue(N, 0), NewLoadHi);
303 CurDAG->ReplaceAllUsesOfValueWith(SDValue(LdHi, 1), NewLoadHi.getValue(1));
304 return true;
305 }
306
307 // build_vector (load ptr), hi -> load_d16_lo ptr, hi
308 // build_vector (zextload ptr from i8), hi -> load_d16_lo_u8 ptr, hi
309 // build_vector (sextload ptr from i8), hi -> load_d16_lo_i8 ptr, hi
310 LoadSDNode *LdLo = dyn_cast<LoadSDNode>(stripBitcast(Lo));
311 if (LdLo && Lo.hasOneUse()) {
312 SDValue TiedIn = getHi16Elt(Hi);
313 if (!TiedIn || LdLo->isPredecessorOf(TiedIn.getNode()))
314 return false;
315
316 SDVTList VTList = CurDAG->getVTList(VT, MVT::Other);
317 unsigned LoadOp = AMDGPUISD::LOAD_D16_LO;
318 if (LdLo->getMemoryVT() == MVT::i8) {
319 LoadOp = LdLo->getExtensionType() == ISD::SEXTLOAD ?
320 AMDGPUISD::LOAD_D16_LO_I8 : AMDGPUISD::LOAD_D16_LO_U8;
321 } else {
322 assert(LdLo->getMemoryVT() == MVT::i16);
323 }
324
325 TiedIn = CurDAG->getNode(ISD::BITCAST, SDLoc(N), VT, TiedIn);
326
327 SDValue Ops[] = {
328 LdLo->getChain(), LdLo->getBasePtr(), TiedIn
329 };
330
331 SDValue NewLoadLo =
332 CurDAG->getMemIntrinsicNode(LoadOp, SDLoc(LdLo), VTList,
333 Ops, LdLo->getMemoryVT(),
334 LdLo->getMemOperand());
335
336 CurDAG->ReplaceAllUsesOfValueWith(SDValue(N, 0), NewLoadLo);
337 CurDAG->ReplaceAllUsesOfValueWith(SDValue(LdLo, 1), NewLoadLo.getValue(1));
338 return true;
339 }
340
341 return false;
342}
343
345 if (!Subtarget->d16PreservesUnusedBits())
346 return;
347
348 SelectionDAG::allnodes_iterator Position = CurDAG->allnodes_end();
349
350 bool MadeChange = false;
351 while (Position != CurDAG->allnodes_begin()) {
352 SDNode *N = &*--Position;
353 if (N->use_empty())
354 continue;
355
356 switch (N->getOpcode()) {
358 // TODO: Match load d16 from shl (extload:i16), 16
359 MadeChange |= matchLoadD16FromBuildVector(N);
360 break;
361 default:
362 break;
363 }
364 }
365
366 if (MadeChange) {
367 CurDAG->RemoveDeadNodes();
368 LLVM_DEBUG(dbgs() << "After PreProcess:\n";
369 CurDAG->dump(););
370 }
371}
372
373bool AMDGPUDAGToDAGISel::isInlineImmediate(const SDNode *N) const {
374 if (N->isUndef())
375 return true;
376
377 const SIInstrInfo *TII = Subtarget->getInstrInfo();
379 return TII->isInlineConstant(C->getAPIntValue());
380
382 return TII->isInlineConstant(C->getValueAPF());
383
384 return false;
385}
386
387/// Determine the register class for \p OpNo
388/// \returns The register class of the virtual register that will be used for
389/// the given operand number \OpNo or NULL if the register class cannot be
390/// determined.
391const TargetRegisterClass *AMDGPUDAGToDAGISel::getOperandRegClass(SDNode *N,
392 unsigned OpNo) const {
393 if (!N->isMachineOpcode()) {
394 if (N->getOpcode() == ISD::CopyToReg) {
395 Register Reg = cast<RegisterSDNode>(N->getOperand(1))->getReg();
396 if (Reg.isVirtual()) {
398 return MRI.getRegClass(Reg);
399 }
400
401 const SIRegisterInfo *TRI = Subtarget->getRegisterInfo();
402 return TRI->getPhysRegBaseClass(Reg);
403 }
404
405 return nullptr;
406 }
407
408 switch (N->getMachineOpcode()) {
409 default: {
410 const SIInstrInfo *TII = Subtarget->getInstrInfo();
411 const MCInstrDesc &Desc = TII->get(N->getMachineOpcode());
412 unsigned OpIdx = Desc.getNumDefs() + OpNo;
413 if (OpIdx >= Desc.getNumOperands())
414 return nullptr;
415
416 int16_t RegClass = TII->getOpRegClassID(Desc.operands()[OpIdx]);
417 if (RegClass == -1)
418 return nullptr;
419
420 return Subtarget->getRegisterInfo()->getRegClass(RegClass);
421 }
422 case AMDGPU::REG_SEQUENCE: {
423 unsigned RCID = N->getConstantOperandVal(0);
424 const TargetRegisterClass *SuperRC =
425 Subtarget->getRegisterInfo()->getRegClass(RCID);
426
427 SDValue SubRegOp = N->getOperand(OpNo + 1);
428 unsigned SubRegIdx = SubRegOp->getAsZExtVal();
429 return Subtarget->getRegisterInfo()->getSubClassWithSubReg(SuperRC,
430 SubRegIdx);
431 }
432 }
433}
434
435SDNode *AMDGPUDAGToDAGISel::glueCopyToOp(SDNode *N, SDValue NewChain,
436 SDValue Glue) const {
438 Ops.push_back(NewChain); // Replace the chain.
439 for (unsigned i = 1, e = N->getNumOperands(); i != e; ++i)
440 Ops.push_back(N->getOperand(i));
441
442 Ops.push_back(Glue);
443 return CurDAG->MorphNodeTo(N, N->getOpcode(), N->getVTList(), Ops);
444}
445
446SDNode *AMDGPUDAGToDAGISel::glueCopyToM0(SDNode *N, SDValue Val) const {
447 const SITargetLowering& Lowering =
448 *static_cast<const SITargetLowering*>(getTargetLowering());
449
450 assert(N->getOperand(0).getValueType() == MVT::Other && "Expected chain");
451
452 SDValue M0 = Lowering.copyToM0(*CurDAG, N->getOperand(0), SDLoc(N), Val);
453 return glueCopyToOp(N, M0, M0.getValue(1));
454}
455
456SDNode *AMDGPUDAGToDAGISel::glueCopyToM0LDSInit(SDNode *N) const {
457 unsigned AS = cast<MemSDNode>(N)->getAddressSpace();
458 if (AS == AMDGPUAS::LOCAL_ADDRESS) {
459 if (Subtarget->ldsRequiresM0Init())
460 return glueCopyToM0(
461 N, CurDAG->getSignedTargetConstant(-1, SDLoc(N), MVT::i32));
462 } else if (AS == AMDGPUAS::REGION_ADDRESS) {
463 MachineFunction &MF = CurDAG->getMachineFunction();
464 unsigned Value = MF.getInfo<SIMachineFunctionInfo>()->getGDSSize();
465 return
466 glueCopyToM0(N, CurDAG->getTargetConstant(Value, SDLoc(N), MVT::i32));
467 }
468 return N;
469}
470
471MachineSDNode *AMDGPUDAGToDAGISel::buildSMovImm64(SDLoc &DL, uint64_t Imm,
472 EVT VT) const {
473 SDNode *Lo = CurDAG->getMachineNode(
474 AMDGPU::S_MOV_B32, DL, MVT::i32,
475 CurDAG->getTargetConstant(Lo_32(Imm), DL, MVT::i32));
476 SDNode *Hi = CurDAG->getMachineNode(
477 AMDGPU::S_MOV_B32, DL, MVT::i32,
478 CurDAG->getTargetConstant(Hi_32(Imm), DL, MVT::i32));
479 const SDValue Ops[] = {
480 CurDAG->getTargetConstant(AMDGPU::SReg_64RegClassID, DL, MVT::i32),
481 SDValue(Lo, 0), CurDAG->getTargetConstant(AMDGPU::sub0, DL, MVT::i32),
482 SDValue(Hi, 0), CurDAG->getTargetConstant(AMDGPU::sub1, DL, MVT::i32)};
483
484 return CurDAG->getMachineNode(TargetOpcode::REG_SEQUENCE, DL, VT, Ops);
485}
486
487SDNode *AMDGPUDAGToDAGISel::packConstantV2I16(const SDNode *N,
488 SelectionDAG &DAG) const {
489 // TODO: Handle undef as zero
490
491 assert(N->getOpcode() == ISD::BUILD_VECTOR && N->getNumOperands() == 2);
492 uint32_t LHSVal, RHSVal;
493 if (getConstantValue(N->getOperand(0), LHSVal) &&
494 getConstantValue(N->getOperand(1), RHSVal)) {
495 SDLoc SL(N);
496 uint32_t K = (LHSVal & 0xffff) | (RHSVal << 16);
497 return DAG.getMachineNode(
498 isVGPRImm(N) ? AMDGPU::V_MOV_B32_e32 : AMDGPU::S_MOV_B32, SL,
499 N->getValueType(0), DAG.getTargetConstant(K, SL, MVT::i32));
500 }
501
502 return nullptr;
503}
504
505void AMDGPUDAGToDAGISel::SelectBuildVector(SDNode *N, unsigned RegClassID) {
506 EVT VT = N->getValueType(0);
507 unsigned NumVectorElts = VT.getVectorNumElements();
508 EVT EltVT = VT.getVectorElementType();
509 SDLoc DL(N);
510 SDValue RegClass = CurDAG->getTargetConstant(RegClassID, DL, MVT::i32);
511
512 if (NumVectorElts == 1) {
513 CurDAG->SelectNodeTo(N, AMDGPU::COPY_TO_REGCLASS, EltVT, N->getOperand(0),
514 RegClass);
515 return;
516 }
517
518 bool IsGCN = CurDAG->getSubtarget().getTargetTriple().isAMDGCN();
519 if (IsGCN && Subtarget->has64BitLiterals() && VT.getSizeInBits() == 64 &&
520 CurDAG->isConstantValueOfAnyType(SDValue(N, 0))) {
521 uint64_t C = 0;
522 bool AllConst = true;
523 unsigned EltSize = EltVT.getSizeInBits();
524 for (unsigned I = 0; I < NumVectorElts; ++I) {
525 SDValue Op = N->getOperand(I);
526 if (Op.isUndef()) {
527 AllConst = false;
528 break;
529 }
530 uint64_t Val;
532 Val = CF->getValueAPF().bitcastToAPInt().getZExtValue();
533 } else
534 Val = cast<ConstantSDNode>(Op)->getZExtValue();
535 C |= Val << (EltSize * I);
536 }
537 if (AllConst) {
538 SDValue CV = CurDAG->getTargetConstant(C, DL, MVT::i64);
539 MachineSDNode *Copy =
540 CurDAG->getMachineNode(AMDGPU::S_MOV_B64_IMM_PSEUDO, DL, VT, CV);
541 CurDAG->SelectNodeTo(N, AMDGPU::COPY_TO_REGCLASS, VT, SDValue(Copy, 0),
542 RegClass);
543 return;
544 }
545 }
546
547 assert(NumVectorElts <= 32 && "Vectors with more than 32 elements not "
548 "supported yet");
549 // 32 = Max Num Vector Elements
550 // 2 = 2 REG_SEQUENCE operands per element (value, subreg index)
551 // 1 = Vector Register Class
552 SmallVector<SDValue, 32 * 2 + 1> RegSeqArgs(NumVectorElts * 2 + 1);
553
554 RegSeqArgs[0] = CurDAG->getTargetConstant(RegClassID, DL, MVT::i32);
555 bool IsRegSeq = true;
556 unsigned NOps = N->getNumOperands();
557 for (unsigned i = 0; i < NOps; i++) {
558 // XXX: Why is this here?
559 if (isa<RegisterSDNode>(N->getOperand(i))) {
560 IsRegSeq = false;
561 break;
562 }
563 unsigned Sub = IsGCN ? SIRegisterInfo::getSubRegFromChannel(i)
565 RegSeqArgs[1 + (2 * i)] = N->getOperand(i);
566 RegSeqArgs[1 + (2 * i) + 1] = CurDAG->getTargetConstant(Sub, DL, MVT::i32);
567 }
568 if (NOps != NumVectorElts) {
569 // Fill in the missing undef elements if this was a scalar_to_vector.
570 assert(N->getOpcode() == ISD::SCALAR_TO_VECTOR && NOps < NumVectorElts);
571 MachineSDNode *ImpDef = CurDAG->getMachineNode(TargetOpcode::IMPLICIT_DEF,
572 DL, EltVT);
573 for (unsigned i = NOps; i < NumVectorElts; ++i) {
574 unsigned Sub = IsGCN ? SIRegisterInfo::getSubRegFromChannel(i)
576 RegSeqArgs[1 + (2 * i)] = SDValue(ImpDef, 0);
577 RegSeqArgs[1 + (2 * i) + 1] =
578 CurDAG->getTargetConstant(Sub, DL, MVT::i32);
579 }
580 }
581
582 if (!IsRegSeq)
583 SelectCode(N);
584 CurDAG->SelectNodeTo(N, AMDGPU::REG_SEQUENCE, N->getVTList(), RegSeqArgs);
585}
586
588 EVT VT = N->getValueType(0);
589 EVT EltVT = VT.getVectorElementType();
590
591 // TODO: Handle 16-bit element vectors with even aligned masks.
592 if (!Subtarget->hasPkMovB32() || !EltVT.bitsEq(MVT::i32) ||
593 VT.getVectorNumElements() != 2) {
594 SelectCode(N);
595 return;
596 }
597
598 auto *SVN = cast<ShuffleVectorSDNode>(N);
599
600 SDValue Src0 = SVN->getOperand(0);
601 SDValue Src1 = SVN->getOperand(1);
602 ArrayRef<int> Mask = SVN->getMask();
603 SDLoc DL(N);
604
605 assert(Src0.getValueType().getVectorNumElements() == 2 && Mask.size() == 2 &&
606 Mask[0] < 4 && Mask[1] < 4);
607
608 SDValue VSrc0 = Mask[0] < 2 ? Src0 : Src1;
609 SDValue VSrc1 = Mask[1] < 2 ? Src0 : Src1;
610 unsigned Src0SubReg = Mask[0] & 1 ? AMDGPU::sub1 : AMDGPU::sub0;
611 unsigned Src1SubReg = Mask[1] & 1 ? AMDGPU::sub1 : AMDGPU::sub0;
612
613 if (Mask[0] < 0) {
614 Src0SubReg = Src1SubReg;
615 MachineSDNode *ImpDef =
616 CurDAG->getMachineNode(TargetOpcode::IMPLICIT_DEF, DL, VT);
617 VSrc0 = SDValue(ImpDef, 0);
618 }
619
620 if (Mask[1] < 0) {
621 Src1SubReg = Src0SubReg;
622 MachineSDNode *ImpDef =
623 CurDAG->getMachineNode(TargetOpcode::IMPLICIT_DEF, DL, VT);
624 VSrc1 = SDValue(ImpDef, 0);
625 }
626
627 // SGPR case needs to lower to copies.
628 //
629 // Also use subregister extract when we can directly blend the registers with
630 // a simple subregister copy.
631 //
632 // TODO: Maybe we should fold this out earlier
633 if (N->isDivergent() && Src0SubReg == AMDGPU::sub1 &&
634 Src1SubReg == AMDGPU::sub0) {
635 // The low element of the result always comes from src0.
636 // The high element of the result always comes from src1.
637 // op_sel selects the high half of src0.
638 // op_sel_hi selects the high half of src1.
639
640 unsigned Src0OpSel =
641 Src0SubReg == AMDGPU::sub1 ? SISrcMods::OP_SEL_0 : SISrcMods::NONE;
642 unsigned Src1OpSel =
643 Src1SubReg == AMDGPU::sub1 ? SISrcMods::OP_SEL_0 : SISrcMods::NONE;
644
645 // Enable op_sel_hi to avoid printing it. This should have no effect on the
646 // result.
647 Src0OpSel |= SISrcMods::OP_SEL_1;
648 Src1OpSel |= SISrcMods::OP_SEL_1;
649
650 SDValue Src0OpSelVal = CurDAG->getTargetConstant(Src0OpSel, DL, MVT::i32);
651 SDValue Src1OpSelVal = CurDAG->getTargetConstant(Src1OpSel, DL, MVT::i32);
652 SDValue ZeroMods = CurDAG->getTargetConstant(0, DL, MVT::i32);
653
654 CurDAG->SelectNodeTo(N, AMDGPU::V_PK_MOV_B32, N->getVTList(),
655 {Src0OpSelVal, VSrc0, Src1OpSelVal, VSrc1,
656 ZeroMods, // clamp
657 ZeroMods, // op_sel
658 ZeroMods, // op_sel_hi
659 ZeroMods, // neg_lo
660 ZeroMods}); // neg_hi
661 return;
662 }
663
664 SDValue ResultElt0 =
665 CurDAG->getTargetExtractSubreg(Src0SubReg, DL, EltVT, VSrc0);
666 SDValue ResultElt1 =
667 CurDAG->getTargetExtractSubreg(Src1SubReg, DL, EltVT, VSrc1);
668
669 const SDValue Ops[] = {
670 CurDAG->getTargetConstant(AMDGPU::SReg_64RegClassID, DL, MVT::i32),
671 ResultElt0, CurDAG->getTargetConstant(AMDGPU::sub0, DL, MVT::i32),
672 ResultElt1, CurDAG->getTargetConstant(AMDGPU::sub1, DL, MVT::i32)};
673 CurDAG->SelectNodeTo(N, TargetOpcode::REG_SEQUENCE, VT, Ops);
674}
675
677 unsigned int Opc = N->getOpcode();
678 if (N->isMachineOpcode()) {
679 N->setNodeId(-1);
680 return; // Already selected.
681 }
682
683 // isa<MemSDNode> almost works but is slightly too permissive for some DS
684 // intrinsics.
685 if (Opc == ISD::LOAD || Opc == ISD::STORE || isa<AtomicSDNode>(N)) {
686 N = glueCopyToM0LDSInit(N);
687 SelectCode(N);
688 return;
689 }
690
691 switch (Opc) {
692 default:
693 break;
694 // We are selecting i64 ADD here instead of custom lower it during
695 // DAG legalization, so we can fold some i64 ADDs used for address
696 // calculation into the LOAD and STORE instructions.
697 case ISD::ADDC:
698 case ISD::ADDE:
699 case ISD::SUBC:
700 case ISD::SUBE: {
701 if (N->getValueType(0) != MVT::i64)
702 break;
703
704 SelectADD_SUB_I64(N);
705 return;
706 }
707 case ISD::UADDO_CARRY:
708 case ISD::USUBO_CARRY:
709 if (N->getValueType(0) != MVT::i32)
710 break;
711
712 SelectAddcSubb(N);
713 return;
714 case ISD::UADDO:
715 case ISD::USUBO: {
716 SelectUADDO_USUBO(N);
717 return;
718 }
719 case AMDGPUISD::FMUL_W_CHAIN: {
720 SelectFMUL_W_CHAIN(N);
721 return;
722 }
723 case AMDGPUISD::FMA_W_CHAIN: {
724 SelectFMA_W_CHAIN(N);
725 return;
726 }
727
729 case ISD::BUILD_VECTOR: {
730 EVT VT = N->getValueType(0);
731 unsigned NumVectorElts = VT.getVectorNumElements();
732 if (VT.getScalarSizeInBits() == 16) {
733 if (Opc == ISD::BUILD_VECTOR && NumVectorElts == 2) {
734 if (SDNode *Packed = packConstantV2I16(N, *CurDAG)) {
735 ReplaceNode(N, Packed);
736 return;
737 }
738 }
739
740 break;
741 }
742
743 const SIRegisterInfo *TRI = Subtarget->getRegisterInfo();
744 assert(VT.getVectorElementType().bitsEq(MVT::i32));
745 const TargetRegisterClass *RegClass =
746 N->isDivergent()
747 ? TRI->getDefaultVectorSuperClassForBitWidth(NumVectorElts * 32)
748 : SIRegisterInfo::getSGPRClassForBitWidth(NumVectorElts * 32);
749
750 SelectBuildVector(N, RegClass->getID());
751 return;
752 }
755 return;
756 case ISD::BUILD_PAIR: {
757 SDValue RC, SubReg0, SubReg1;
758 SDLoc DL(N);
759 if (N->getValueType(0) == MVT::i128) {
760 RC = CurDAG->getTargetConstant(AMDGPU::SGPR_128RegClassID, DL, MVT::i32);
761 SubReg0 = CurDAG->getTargetConstant(AMDGPU::sub0_sub1, DL, MVT::i32);
762 SubReg1 = CurDAG->getTargetConstant(AMDGPU::sub2_sub3, DL, MVT::i32);
763 } else if (N->getValueType(0) == MVT::i64) {
764 RC = CurDAG->getTargetConstant(AMDGPU::SReg_64RegClassID, DL, MVT::i32);
765 SubReg0 = CurDAG->getTargetConstant(AMDGPU::sub0, DL, MVT::i32);
766 SubReg1 = CurDAG->getTargetConstant(AMDGPU::sub1, DL, MVT::i32);
767 } else {
768 llvm_unreachable("Unhandled value type for BUILD_PAIR");
769 }
770 const SDValue Ops[] = { RC, N->getOperand(0), SubReg0,
771 N->getOperand(1), SubReg1 };
772 ReplaceNode(N, CurDAG->getMachineNode(TargetOpcode::REG_SEQUENCE, DL,
773 N->getValueType(0), Ops));
774 return;
775 }
776
777 case ISD::Constant:
778 case ISD::ConstantFP: {
779 if (N->getValueType(0).getSizeInBits() != 64 || isInlineImmediate(N) ||
780 Subtarget->has64BitLiterals())
781 break;
782
783 uint64_t Imm;
785 Imm = FP->getValueAPF().bitcastToAPInt().getZExtValue();
786 if (AMDGPU::isValid32BitLiteral(Imm, true))
787 break;
788 } else {
790 Imm = C->getZExtValue();
791 if (AMDGPU::isValid32BitLiteral(Imm, false))
792 break;
793 }
794
795 SDLoc DL(N);
796 ReplaceNode(N, buildSMovImm64(DL, Imm, N->getValueType(0)));
797 return;
798 }
799 case AMDGPUISD::BFE_I32:
800 case AMDGPUISD::BFE_U32: {
801 // There is a scalar version available, but unlike the vector version which
802 // has a separate operand for the offset and width, the scalar version packs
803 // the width and offset into a single operand. Try to move to the scalar
804 // version if the offsets are constant, so that we can try to keep extended
805 // loads of kernel arguments in SGPRs.
806
807 // TODO: Technically we could try to pattern match scalar bitshifts of
808 // dynamic values, but it's probably not useful.
810 if (!Offset)
811 break;
812
813 ConstantSDNode *Width = dyn_cast<ConstantSDNode>(N->getOperand(2));
814 if (!Width)
815 break;
816
817 bool Signed = Opc == AMDGPUISD::BFE_I32;
818
819 uint32_t OffsetVal = Offset->getZExtValue();
820 uint32_t WidthVal = Width->getZExtValue();
821
822 ReplaceNode(N, getBFE32(Signed, SDLoc(N), N->getOperand(0), OffsetVal,
823 WidthVal));
824 return;
825 }
826 case AMDGPUISD::DIV_SCALE: {
827 SelectDIV_SCALE(N);
828 return;
829 }
832 SelectMAD_64_32(N);
833 return;
834 }
835 case ISD::SMUL_LOHI:
836 case ISD::UMUL_LOHI:
837 return SelectMUL_LOHI(N);
838 case ISD::CopyToReg: {
840 *static_cast<const SITargetLowering*>(getTargetLowering());
841 N = Lowering.legalizeTargetIndependentNode(N, *CurDAG);
842 break;
843 }
844 case ISD::AND:
845 case ISD::SRL:
846 case ISD::SRA:
848 if (N->getValueType(0) != MVT::i32)
849 break;
850
851 SelectS_BFE(N);
852 return;
853 case ISD::BRCOND:
854 SelectBRCOND(N);
855 return;
856 case ISD::FP_EXTEND:
857 SelectFP_EXTEND(N);
858 return;
859 case AMDGPUISD::CVT_PKRTZ_F16_F32:
860 case AMDGPUISD::CVT_PKNORM_I16_F32:
861 case AMDGPUISD::CVT_PKNORM_U16_F32:
862 case AMDGPUISD::CVT_PK_U16_U32:
863 case AMDGPUISD::CVT_PK_I16_I32: {
864 // Hack around using a legal type if f16 is illegal.
865 if (N->getValueType(0) == MVT::i32) {
866 MVT NewVT = Opc == AMDGPUISD::CVT_PKRTZ_F16_F32 ? MVT::v2f16 : MVT::v2i16;
867 N = CurDAG->MorphNodeTo(N, N->getOpcode(), CurDAG->getVTList(NewVT),
868 { N->getOperand(0), N->getOperand(1) });
869 SelectCode(N);
870 return;
871 }
872
873 break;
874 }
876 SelectINTRINSIC_W_CHAIN(N);
877 return;
878 }
880 SelectINTRINSIC_WO_CHAIN(N);
881 return;
882 }
883 case ISD::INTRINSIC_VOID: {
884 SelectINTRINSIC_VOID(N);
885 return;
886 }
888 SelectWAVE_ADDRESS(N);
889 return;
890 }
891 case ISD::STACKRESTORE: {
892 SelectSTACKRESTORE(N);
893 return;
894 }
895 }
896
897 SelectCode(N);
898}
899
900bool AMDGPUDAGToDAGISel::isUniformBr(const SDNode *N) const {
901 const BasicBlock *BB = FuncInfo->MBB->getBasicBlock();
902 const Instruction *Term = BB->getTerminator();
903 return Term->getMetadata("amdgpu.uniform") ||
904 Term->getMetadata("structurizecfg.uniform");
905}
906
907bool AMDGPUDAGToDAGISel::isUnneededShiftMask(const SDNode *N,
908 unsigned ShAmtBits) const {
909 assert(N->getOpcode() == ISD::AND);
910
911 const APInt &RHS = N->getConstantOperandAPInt(1);
912 if (RHS.countr_one() >= ShAmtBits)
913 return true;
914
915 const APInt &LHSKnownZeros = CurDAG->computeKnownBits(N->getOperand(0)).Zero;
916 return (LHSKnownZeros | RHS).countr_one() >= ShAmtBits;
917}
918
920 SDValue &N0, SDValue &N1) {
921 if (Addr.getValueType() == MVT::i64 && Addr.getOpcode() == ISD::BITCAST &&
923 // As we split 64-bit `or` earlier, it's complicated pattern to match, i.e.
924 // (i64 (bitcast (v2i32 (build_vector
925 // (or (extract_vector_elt V, 0), OFFSET),
926 // (extract_vector_elt V, 1)))))
927 SDValue Lo = Addr.getOperand(0).getOperand(0);
928 if (Lo.getOpcode() == ISD::OR && DAG.isBaseWithConstantOffset(Lo)) {
929 SDValue BaseLo = Lo.getOperand(0);
930 SDValue BaseHi = Addr.getOperand(0).getOperand(1);
931 // Check that split base (Lo and Hi) are extracted from the same one.
932 if (BaseLo.getOpcode() == ISD::EXTRACT_VECTOR_ELT &&
934 BaseLo.getOperand(0) == BaseHi.getOperand(0) &&
935 // Lo is statically extracted from index 0.
936 isa<ConstantSDNode>(BaseLo.getOperand(1)) &&
937 BaseLo.getConstantOperandVal(1) == 0 &&
938 // Hi is statically extracted from index 0.
939 isa<ConstantSDNode>(BaseHi.getOperand(1)) &&
940 BaseHi.getConstantOperandVal(1) == 1) {
941 N0 = BaseLo.getOperand(0).getOperand(0);
942 N1 = Lo.getOperand(1);
943 return true;
944 }
945 }
946 }
947 return false;
948}
949
950bool AMDGPUDAGToDAGISel::isBaseWithConstantOffset64(SDValue Addr, SDValue &LHS,
951 SDValue &RHS) const {
952 if (CurDAG->isBaseWithConstantOffset(Addr)) {
953 LHS = Addr.getOperand(0);
954 RHS = Addr.getOperand(1);
955 return true;
956 }
957
960 return true;
961 }
962
963 return false;
964}
965
967 return "AMDGPU DAG->DAG Pattern Instruction Selection";
968}
969
973
977#ifdef EXPENSIVE_CHECKS
979 .getManager();
980 auto &F = MF.getFunction();
981 DominatorTree &DT = FAM.getResult<DominatorTreeAnalysis>(F);
982 LoopInfo &LI = FAM.getResult<LoopAnalysis>(F);
983 for (auto &L : LI.getLoopsInPreorder())
984 assert(L->isLCSSAForm(DT) && "Loop is not in LCSSA form!");
985#endif
986 return SelectionDAGISelPass::run(MF, MFAM);
987}
988
989//===----------------------------------------------------------------------===//
990// Complex Patterns
991//===----------------------------------------------------------------------===//
992
/// Complex-pattern hook that never matches on this selector: Base and Offset
/// are left untouched and selection falls through to other addressing
/// patterns.
bool AMDGPUDAGToDAGISel::SelectADDRVTX_READ(SDValue Addr, SDValue &Base,
                                            SDValue &Offset) {
  return false;
}
997
998bool AMDGPUDAGToDAGISel::SelectADDRIndirect(SDValue Addr, SDValue &Base,
999 SDValue &Offset) {
1001 SDLoc DL(Addr);
1002
1003 if ((C = dyn_cast<ConstantSDNode>(Addr))) {
1004 Base = CurDAG->getRegister(R600::INDIRECT_BASE_ADDR, MVT::i32);
1005 Offset = CurDAG->getTargetConstant(C->getZExtValue(), DL, MVT::i32);
1006 } else if ((Addr.getOpcode() == AMDGPUISD::DWORDADDR) &&
1007 (C = dyn_cast<ConstantSDNode>(Addr.getOperand(0)))) {
1008 Base = CurDAG->getRegister(R600::INDIRECT_BASE_ADDR, MVT::i32);
1009 Offset = CurDAG->getTargetConstant(C->getZExtValue(), DL, MVT::i32);
1010 } else if ((Addr.getOpcode() == ISD::ADD || Addr.getOpcode() == ISD::OR) &&
1011 (C = dyn_cast<ConstantSDNode>(Addr.getOperand(1)))) {
1012 Base = Addr.getOperand(0);
1013 Offset = CurDAG->getTargetConstant(C->getZExtValue(), DL, MVT::i32);
1014 } else {
1015 Base = Addr;
1016 Offset = CurDAG->getTargetConstant(0, DL, MVT::i32);
1017 }
1018
1019 return true;
1020}
1021
1022SDValue AMDGPUDAGToDAGISel::getMaterializedScalarImm32(int64_t Val,
1023 const SDLoc &DL) const {
1024 SDNode *Mov = CurDAG->getMachineNode(
1025 AMDGPU::S_MOV_B32, DL, MVT::i32,
1026 CurDAG->getTargetConstant(Val, DL, MVT::i32));
1027 return SDValue(Mov, 0);
1028}
1029
1030// FIXME: Should only handle uaddo_carry/usubo_carry
1031void AMDGPUDAGToDAGISel::SelectADD_SUB_I64(SDNode *N) {
1032 SDLoc DL(N);
1033 SDValue LHS = N->getOperand(0);
1034 SDValue RHS = N->getOperand(1);
1035
1036 unsigned Opcode = N->getOpcode();
1037 bool ConsumeCarry = (Opcode == ISD::ADDE || Opcode == ISD::SUBE);
1038 bool ProduceCarry =
1039 ConsumeCarry || Opcode == ISD::ADDC || Opcode == ISD::SUBC;
1040 bool IsAdd = Opcode == ISD::ADD || Opcode == ISD::ADDC || Opcode == ISD::ADDE;
1041
1042 SDValue Sub0 = CurDAG->getTargetConstant(AMDGPU::sub0, DL, MVT::i32);
1043 SDValue Sub1 = CurDAG->getTargetConstant(AMDGPU::sub1, DL, MVT::i32);
1044
1045 SDNode *Lo0 = CurDAG->getMachineNode(TargetOpcode::EXTRACT_SUBREG,
1046 DL, MVT::i32, LHS, Sub0);
1047 SDNode *Hi0 = CurDAG->getMachineNode(TargetOpcode::EXTRACT_SUBREG,
1048 DL, MVT::i32, LHS, Sub1);
1049
1050 SDNode *Lo1 = CurDAG->getMachineNode(TargetOpcode::EXTRACT_SUBREG,
1051 DL, MVT::i32, RHS, Sub0);
1052 SDNode *Hi1 = CurDAG->getMachineNode(TargetOpcode::EXTRACT_SUBREG,
1053 DL, MVT::i32, RHS, Sub1);
1054
1055 SDVTList VTList = CurDAG->getVTList(MVT::i32, MVT::Glue);
1056
1057 static const unsigned OpcMap[2][2][2] = {
1058 {{AMDGPU::S_SUB_U32, AMDGPU::S_ADD_U32},
1059 {AMDGPU::V_SUB_CO_U32_e32, AMDGPU::V_ADD_CO_U32_e32}},
1060 {{AMDGPU::S_SUBB_U32, AMDGPU::S_ADDC_U32},
1061 {AMDGPU::V_SUBB_U32_e32, AMDGPU::V_ADDC_U32_e32}}};
1062
1063 unsigned Opc = OpcMap[0][N->isDivergent()][IsAdd];
1064 unsigned CarryOpc = OpcMap[1][N->isDivergent()][IsAdd];
1065
1066 SDNode *AddLo;
1067 if (!ConsumeCarry) {
1068 SDValue Args[] = { SDValue(Lo0, 0), SDValue(Lo1, 0) };
1069 AddLo = CurDAG->getMachineNode(Opc, DL, VTList, Args);
1070 } else {
1071 SDValue Args[] = { SDValue(Lo0, 0), SDValue(Lo1, 0), N->getOperand(2) };
1072 AddLo = CurDAG->getMachineNode(CarryOpc, DL, VTList, Args);
1073 }
1074 SDValue AddHiArgs[] = {
1075 SDValue(Hi0, 0),
1076 SDValue(Hi1, 0),
1077 SDValue(AddLo, 1)
1078 };
1079 SDNode *AddHi = CurDAG->getMachineNode(CarryOpc, DL, VTList, AddHiArgs);
1080
1081 SDValue RegSequenceArgs[] = {
1082 CurDAG->getTargetConstant(AMDGPU::SReg_64RegClassID, DL, MVT::i32),
1083 SDValue(AddLo,0),
1084 Sub0,
1085 SDValue(AddHi,0),
1086 Sub1,
1087 };
1088 SDNode *RegSequence = CurDAG->getMachineNode(AMDGPU::REG_SEQUENCE, DL,
1089 MVT::i64, RegSequenceArgs);
1090
1091 if (ProduceCarry) {
1092 // Replace the carry-use
1093 ReplaceUses(SDValue(N, 1), SDValue(AddHi, 1));
1094 }
1095
1096 // Replace the remaining uses.
1097 ReplaceNode(N, RegSequence);
1098}
1099
1100void AMDGPUDAGToDAGISel::SelectAddcSubb(SDNode *N) {
1101 SDValue LHS = N->getOperand(0);
1102 SDValue RHS = N->getOperand(1);
1103 SDValue CI = N->getOperand(2);
1104
1105 if (N->isDivergent()) {
1106 unsigned Opc = N->getOpcode() == ISD::UADDO_CARRY ? AMDGPU::V_ADDC_U32_e64
1107 : AMDGPU::V_SUBB_U32_e64;
1108 CurDAG->SelectNodeTo(
1109 N, Opc, N->getVTList(),
1110 {LHS, RHS, CI,
1111 CurDAG->getTargetConstant(0, {}, MVT::i1) /*clamp bit*/});
1112 } else {
1113 unsigned Opc = N->getOpcode() == ISD::UADDO_CARRY ? AMDGPU::S_ADD_CO_PSEUDO
1114 : AMDGPU::S_SUB_CO_PSEUDO;
1115 CurDAG->SelectNodeTo(N, Opc, N->getVTList(), {LHS, RHS, CI});
1116 }
1117}
1118
1119void AMDGPUDAGToDAGISel::SelectUADDO_USUBO(SDNode *N) {
1120 // The name of the opcodes are misleading. v_add_i32/v_sub_i32 have unsigned
1121 // carry out despite the _i32 name. These were renamed in VI to _U32.
1122 // FIXME: We should probably rename the opcodes here.
1123 bool IsAdd = N->getOpcode() == ISD::UADDO;
1124 bool IsVALU = N->isDivergent();
1125
1126 for (SDNode::user_iterator UI = N->user_begin(), E = N->user_end(); UI != E;
1127 ++UI)
1128 if (UI.getUse().getResNo() == 1) {
1129 if (UI->isMachineOpcode()) {
1130 if (UI->getMachineOpcode() !=
1131 (IsAdd ? AMDGPU::S_ADD_CO_PSEUDO : AMDGPU::S_SUB_CO_PSEUDO)) {
1132 IsVALU = true;
1133 break;
1134 }
1135 } else {
1136 if (UI->getOpcode() != (IsAdd ? ISD::UADDO_CARRY : ISD::USUBO_CARRY)) {
1137 IsVALU = true;
1138 break;
1139 }
1140 }
1141 }
1142
1143 if (IsVALU) {
1144 unsigned Opc = IsAdd ? AMDGPU::V_ADD_CO_U32_e64 : AMDGPU::V_SUB_CO_U32_e64;
1145
1146 CurDAG->SelectNodeTo(
1147 N, Opc, N->getVTList(),
1148 {N->getOperand(0), N->getOperand(1),
1149 CurDAG->getTargetConstant(0, {}, MVT::i1) /*clamp bit*/});
1150 } else {
1151 unsigned Opc = IsAdd ? AMDGPU::S_UADDO_PSEUDO : AMDGPU::S_USUBO_PSEUDO;
1152
1153 CurDAG->SelectNodeTo(N, Opc, N->getVTList(),
1154 {N->getOperand(0), N->getOperand(1)});
1155 }
1156}
1157
1158void AMDGPUDAGToDAGISel::SelectFMA_W_CHAIN(SDNode *N) {
1159 // src0_modifiers, src0, src1_modifiers, src1, src2_modifiers, src2, clamp, omod
1160 SDValue Ops[10];
1161
1162 SelectVOP3Mods0(N->getOperand(1), Ops[1], Ops[0], Ops[6], Ops[7]);
1163 SelectVOP3Mods(N->getOperand(2), Ops[3], Ops[2]);
1164 SelectVOP3Mods(N->getOperand(3), Ops[5], Ops[4]);
1165 Ops[8] = N->getOperand(0);
1166 Ops[9] = N->getOperand(4);
1167
1168 // If there are no source modifiers, prefer fmac over fma because it can use
1169 // the smaller VOP2 encoding.
1170 bool UseFMAC = Subtarget->hasDLInsts() &&
1171 cast<ConstantSDNode>(Ops[0])->isZero() &&
1172 cast<ConstantSDNode>(Ops[2])->isZero() &&
1173 cast<ConstantSDNode>(Ops[4])->isZero();
1174 unsigned Opcode = UseFMAC ? AMDGPU::V_FMAC_F32_e64 : AMDGPU::V_FMA_F32_e64;
1175 CurDAG->SelectNodeTo(N, Opcode, N->getVTList(), Ops);
1176}
1177
1178void AMDGPUDAGToDAGISel::SelectFMUL_W_CHAIN(SDNode *N) {
1179 // src0_modifiers, src0, src1_modifiers, src1, clamp, omod
1180 SDValue Ops[8];
1181
1182 SelectVOP3Mods0(N->getOperand(1), Ops[1], Ops[0], Ops[4], Ops[5]);
1183 SelectVOP3Mods(N->getOperand(2), Ops[3], Ops[2]);
1184 Ops[6] = N->getOperand(0);
1185 Ops[7] = N->getOperand(3);
1186
1187 CurDAG->SelectNodeTo(N, AMDGPU::V_MUL_F32_e64, N->getVTList(), Ops);
1188}
1189
1190// We need to handle this here because tablegen doesn't support matching
1191// instructions with multiple outputs.
1192void AMDGPUDAGToDAGISel::SelectDIV_SCALE(SDNode *N) {
1193 EVT VT = N->getValueType(0);
1194
1195 assert(VT == MVT::f32 || VT == MVT::f64);
1196
1197 unsigned Opc
1198 = (VT == MVT::f64) ? AMDGPU::V_DIV_SCALE_F64_e64 : AMDGPU::V_DIV_SCALE_F32_e64;
1199
1200 // src0_modifiers, src0, src1_modifiers, src1, src2_modifiers, src2, clamp,
1201 // omod
1202 SDValue Ops[8];
1203 SelectVOP3BMods0(N->getOperand(0), Ops[1], Ops[0], Ops[6], Ops[7]);
1204 SelectVOP3BMods(N->getOperand(1), Ops[3], Ops[2]);
1205 SelectVOP3BMods(N->getOperand(2), Ops[5], Ops[4]);
1206 CurDAG->SelectNodeTo(N, Opc, N->getVTList(), Ops);
1207}
1208
1209// We need to handle this here because tablegen doesn't support matching
1210// instructions with multiple outputs.
1211void AMDGPUDAGToDAGISel::SelectMAD_64_32(SDNode *N) {
1212 SDLoc SL(N);
1213 bool Signed = N->getOpcode() == AMDGPUISD::MAD_I64_I32;
1214 unsigned Opc;
1215 bool UseNoCarry = Subtarget->hasMadU64U32NoCarry() && !N->hasAnyUseOfValue(1);
1216 if (Subtarget->hasMADIntraFwdBug())
1217 Opc = Signed ? AMDGPU::V_MAD_I64_I32_gfx11_e64
1218 : AMDGPU::V_MAD_U64_U32_gfx11_e64;
1219 else if (UseNoCarry)
1220 Opc = Signed ? AMDGPU::V_MAD_NC_I64_I32_e64 : AMDGPU::V_MAD_NC_U64_U32_e64;
1221 else
1222 Opc = Signed ? AMDGPU::V_MAD_I64_I32_e64 : AMDGPU::V_MAD_U64_U32_e64;
1223
1224 SDValue Clamp = CurDAG->getTargetConstant(0, SL, MVT::i1);
1225 SDValue Ops[] = { N->getOperand(0), N->getOperand(1), N->getOperand(2),
1226 Clamp };
1227
1228 if (UseNoCarry) {
1229 MachineSDNode *Mad = CurDAG->getMachineNode(Opc, SL, MVT::i64, Ops);
1230 ReplaceUses(SDValue(N, 0), SDValue(Mad, 0));
1231 CurDAG->RemoveDeadNode(N);
1232 return;
1233 }
1234
1235 CurDAG->SelectNodeTo(N, Opc, N->getVTList(), Ops);
1236}
1237
1238// We need to handle this here because tablegen doesn't support matching
1239// instructions with multiple outputs.
1240void AMDGPUDAGToDAGISel::SelectMUL_LOHI(SDNode *N) {
1241 SDLoc SL(N);
1242 bool Signed = N->getOpcode() == ISD::SMUL_LOHI;
1243 SDVTList VTList;
1244 unsigned Opc;
1245 if (Subtarget->hasMadU64U32NoCarry()) {
1246 VTList = CurDAG->getVTList(MVT::i64);
1247 Opc = Signed ? AMDGPU::V_MAD_NC_I64_I32_e64 : AMDGPU::V_MAD_NC_U64_U32_e64;
1248 } else {
1249 VTList = CurDAG->getVTList(MVT::i64, MVT::i1);
1250 if (Subtarget->hasMADIntraFwdBug()) {
1251 Opc = Signed ? AMDGPU::V_MAD_I64_I32_gfx11_e64
1252 : AMDGPU::V_MAD_U64_U32_gfx11_e64;
1253 } else {
1254 Opc = Signed ? AMDGPU::V_MAD_I64_I32_e64 : AMDGPU::V_MAD_U64_U32_e64;
1255 }
1256 }
1257
1258 SDValue Zero = CurDAG->getTargetConstant(0, SL, MVT::i64);
1259 SDValue Clamp = CurDAG->getTargetConstant(0, SL, MVT::i1);
1260 SDValue Ops[] = {N->getOperand(0), N->getOperand(1), Zero, Clamp};
1261 SDNode *Mad = CurDAG->getMachineNode(Opc, SL, VTList, Ops);
1262 if (!SDValue(N, 0).use_empty()) {
1263 SDValue Sub0 = CurDAG->getTargetConstant(AMDGPU::sub0, SL, MVT::i32);
1264 SDNode *Lo = CurDAG->getMachineNode(TargetOpcode::EXTRACT_SUBREG, SL,
1265 MVT::i32, SDValue(Mad, 0), Sub0);
1266 ReplaceUses(SDValue(N, 0), SDValue(Lo, 0));
1267 }
1268 if (!SDValue(N, 1).use_empty()) {
1269 SDValue Sub1 = CurDAG->getTargetConstant(AMDGPU::sub1, SL, MVT::i32);
1270 SDNode *Hi = CurDAG->getMachineNode(TargetOpcode::EXTRACT_SUBREG, SL,
1271 MVT::i32, SDValue(Mad, 0), Sub1);
1272 ReplaceUses(SDValue(N, 1), SDValue(Hi, 0));
1273 }
1274 CurDAG->RemoveDeadNode(N);
1275}
1276
1277bool AMDGPUDAGToDAGISel::isDSOffsetLegal(SDValue Base, unsigned Offset) const {
1278 if (!isUInt<16>(Offset))
1279 return false;
1280
1281 if (!Base || Subtarget->hasUsableDSOffset() ||
1282 Subtarget->unsafeDSOffsetFoldingEnabled())
1283 return true;
1284
1285 // On Southern Islands instruction with a negative base value and an offset
1286 // don't seem to work.
1287 return CurDAG->SignBitIsZero(Base);
1288}
1289
1290bool AMDGPUDAGToDAGISel::SelectDS1Addr1Offset(SDValue Addr, SDValue &Base,
1291 SDValue &Offset) const {
1292 SDLoc DL(Addr);
1293 if (CurDAG->isBaseWithConstantOffset(Addr)) {
1294 SDValue N0 = Addr.getOperand(0);
1295 SDValue N1 = Addr.getOperand(1);
1296 ConstantSDNode *C1 = cast<ConstantSDNode>(N1);
1297 if (isDSOffsetLegal(N0, C1->getSExtValue())) {
1298 // (add n0, c0)
1299 Base = N0;
1300 Offset = CurDAG->getTargetConstant(C1->getZExtValue(), DL, MVT::i16);
1301 return true;
1302 }
1303 } else if (Addr.getOpcode() == ISD::SUB) {
1304 // sub C, x -> add (sub 0, x), C
1305 if (const ConstantSDNode *C = dyn_cast<ConstantSDNode>(Addr.getOperand(0))) {
1306 int64_t ByteOffset = C->getSExtValue();
1307 if (isDSOffsetLegal(SDValue(), ByteOffset)) {
1308 SDValue Zero = CurDAG->getTargetConstant(0, DL, MVT::i32);
1309
1310 // XXX - This is kind of hacky. Create a dummy sub node so we can check
1311 // the known bits in isDSOffsetLegal. We need to emit the selected node
1312 // here, so this is thrown away.
1313 SDValue Sub = CurDAG->getNode(ISD::SUB, DL, MVT::i32,
1314 Zero, Addr.getOperand(1));
1315
1316 if (isDSOffsetLegal(Sub, ByteOffset)) {
1318 Opnds.push_back(Zero);
1319 Opnds.push_back(Addr.getOperand(1));
1320
1321 // FIXME: Select to VOP3 version for with-carry.
1322 unsigned SubOp = AMDGPU::V_SUB_CO_U32_e32;
1323 if (Subtarget->hasAddNoCarryInsts()) {
1324 SubOp = AMDGPU::V_SUB_U32_e64;
1325 Opnds.push_back(
1326 CurDAG->getTargetConstant(0, {}, MVT::i1)); // clamp bit
1327 }
1328
1329 MachineSDNode *MachineSub =
1330 CurDAG->getMachineNode(SubOp, DL, MVT::i32, Opnds);
1331
1332 Base = SDValue(MachineSub, 0);
1333 Offset = CurDAG->getTargetConstant(ByteOffset, DL, MVT::i16);
1334 return true;
1335 }
1336 }
1337 }
1338 } else if (const ConstantSDNode *CAddr = dyn_cast<ConstantSDNode>(Addr)) {
1339 // If we have a constant address, prefer to put the constant into the
1340 // offset. This can save moves to load the constant address since multiple
1341 // operations can share the zero base address register, and enables merging
1342 // into read2 / write2 instructions.
1343
1344 SDLoc DL(Addr);
1345
1346 if (isDSOffsetLegal(SDValue(), CAddr->getZExtValue())) {
1347 SDValue Zero = CurDAG->getTargetConstant(0, DL, MVT::i32);
1348 MachineSDNode *MovZero = CurDAG->getMachineNode(AMDGPU::V_MOV_B32_e32,
1349 DL, MVT::i32, Zero);
1350 Base = SDValue(MovZero, 0);
1351 Offset = CurDAG->getTargetConstant(CAddr->getZExtValue(), DL, MVT::i16);
1352 return true;
1353 }
1354 }
1355
1356 // default case
1357 Base = Addr;
1358 Offset = CurDAG->getTargetConstant(0, SDLoc(Addr), MVT::i16);
1359 return true;
1360}
1361
1362bool AMDGPUDAGToDAGISel::isDSOffset2Legal(SDValue Base, unsigned Offset0,
1363 unsigned Offset1,
1364 unsigned Size) const {
1365 if (Offset0 % Size != 0 || Offset1 % Size != 0)
1366 return false;
1367 if (!isUInt<8>(Offset0 / Size) || !isUInt<8>(Offset1 / Size))
1368 return false;
1369
1370 if (!Base || Subtarget->hasUsableDSOffset() ||
1371 Subtarget->unsafeDSOffsetFoldingEnabled())
1372 return true;
1373
1374 // On Southern Islands instruction with a negative base value and an offset
1375 // don't seem to work.
1376 return CurDAG->SignBitIsZero(Base);
1377}
1378
1379// Return whether the operation has NoUnsignedWrap property.
1380static bool isNoUnsignedWrap(SDValue Addr) {
1381 return (Addr.getOpcode() == ISD::ADD &&
1382 Addr->getFlags().hasNoUnsignedWrap()) ||
1383 Addr->getOpcode() == ISD::OR;
1384}
1385
1386// Check that the base address of flat scratch load/store in the form of `base +
1387// offset` is legal to be put in SGPR/VGPR (i.e. unsigned per hardware
1388// requirement). We always treat the first operand as the base address here.
1389bool AMDGPUDAGToDAGISel::isFlatScratchBaseLegal(SDValue Addr) const {
1390 if (isNoUnsignedWrap(Addr))
1391 return true;
1392
1393 // Starting with GFX12, VADDR and SADDR fields in VSCRATCH can use negative
1394 // values.
1395 if (Subtarget->hasSignedScratchOffsets())
1396 return true;
1397
1398 auto LHS = Addr.getOperand(0);
1399 auto RHS = Addr.getOperand(1);
1400
1401 // If the immediate offset is negative and within certain range, the base
1402 // address cannot also be negative. If the base is also negative, the sum
1403 // would be either negative or much larger than the valid range of scratch
1404 // memory a thread can access.
1405 ConstantSDNode *ImmOp = nullptr;
1406 if (Addr.getOpcode() == ISD::ADD && (ImmOp = dyn_cast<ConstantSDNode>(RHS))) {
1407 if (ImmOp->getSExtValue() < 0 && ImmOp->getSExtValue() > -0x40000000)
1408 return true;
1409 }
1410
1411 return CurDAG->SignBitIsZero(LHS);
1412}
1413
1414// Check address value in SGPR/VGPR are legal for flat scratch in the form
1415// of: SGPR + VGPR.
1416bool AMDGPUDAGToDAGISel::isFlatScratchBaseLegalSV(SDValue Addr) const {
1417 if (isNoUnsignedWrap(Addr))
1418 return true;
1419
1420 // Starting with GFX12, VADDR and SADDR fields in VSCRATCH can use negative
1421 // values.
1422 if (Subtarget->hasSignedScratchOffsets())
1423 return true;
1424
1425 auto LHS = Addr.getOperand(0);
1426 auto RHS = Addr.getOperand(1);
1427 return CurDAG->SignBitIsZero(RHS) && CurDAG->SignBitIsZero(LHS);
1428}
1429
1430// Check address value in SGPR/VGPR are legal for flat scratch in the form
1431// of: SGPR + VGPR + Imm.
1432bool AMDGPUDAGToDAGISel::isFlatScratchBaseLegalSVImm(SDValue Addr) const {
1433 // Starting with GFX12, VADDR and SADDR fields in VSCRATCH can use negative
1434 // values.
1435 if (AMDGPU::isGFX12Plus(*Subtarget))
1436 return true;
1437
1438 auto Base = Addr.getOperand(0);
1439 auto *RHSImm = cast<ConstantSDNode>(Addr.getOperand(1));
1440 // If the immediate offset is negative and within certain range, the base
1441 // address cannot also be negative. If the base is also negative, the sum
1442 // would be either negative or much larger than the valid range of scratch
1443 // memory a thread can access.
1444 if (isNoUnsignedWrap(Base) &&
1445 (isNoUnsignedWrap(Addr) ||
1446 (RHSImm->getSExtValue() < 0 && RHSImm->getSExtValue() > -0x40000000)))
1447 return true;
1448
1449 auto LHS = Base.getOperand(0);
1450 auto RHS = Base.getOperand(1);
1451 return CurDAG->SignBitIsZero(RHS) && CurDAG->SignBitIsZero(LHS);
1452}
1453
1454// TODO: If offset is too big, put low 16-bit into offset.
1455bool AMDGPUDAGToDAGISel::SelectDS64Bit4ByteAligned(SDValue Addr, SDValue &Base,
1456 SDValue &Offset0,
1457 SDValue &Offset1) const {
1458 return SelectDSReadWrite2(Addr, Base, Offset0, Offset1, 4);
1459}
1460
1461bool AMDGPUDAGToDAGISel::SelectDS128Bit8ByteAligned(SDValue Addr, SDValue &Base,
1462 SDValue &Offset0,
1463 SDValue &Offset1) const {
1464 return SelectDSReadWrite2(Addr, Base, Offset0, Offset1, 8);
1465}
1466
1467bool AMDGPUDAGToDAGISel::SelectDSReadWrite2(SDValue Addr, SDValue &Base,
1468 SDValue &Offset0, SDValue &Offset1,
1469 unsigned Size) const {
1470 SDLoc DL(Addr);
1471
1472 if (CurDAG->isBaseWithConstantOffset(Addr)) {
1473 SDValue N0 = Addr.getOperand(0);
1474 SDValue N1 = Addr.getOperand(1);
1475 ConstantSDNode *C1 = cast<ConstantSDNode>(N1);
1476 unsigned OffsetValue0 = C1->getZExtValue();
1477 unsigned OffsetValue1 = OffsetValue0 + Size;
1478
1479 // (add n0, c0)
1480 if (isDSOffset2Legal(N0, OffsetValue0, OffsetValue1, Size)) {
1481 Base = N0;
1482 Offset0 = CurDAG->getTargetConstant(OffsetValue0 / Size, DL, MVT::i32);
1483 Offset1 = CurDAG->getTargetConstant(OffsetValue1 / Size, DL, MVT::i32);
1484 return true;
1485 }
1486 } else if (Addr.getOpcode() == ISD::SUB) {
1487 // sub C, x -> add (sub 0, x), C
1488 if (const ConstantSDNode *C =
1490 unsigned OffsetValue0 = C->getZExtValue();
1491 unsigned OffsetValue1 = OffsetValue0 + Size;
1492
1493 if (isDSOffset2Legal(SDValue(), OffsetValue0, OffsetValue1, Size)) {
1494 SDLoc DL(Addr);
1495 SDValue Zero = CurDAG->getTargetConstant(0, DL, MVT::i32);
1496
1497 // XXX - This is kind of hacky. Create a dummy sub node so we can check
1498 // the known bits in isDSOffsetLegal. We need to emit the selected node
1499 // here, so this is thrown away.
1500 SDValue Sub =
1501 CurDAG->getNode(ISD::SUB, DL, MVT::i32, Zero, Addr.getOperand(1));
1502
1503 if (isDSOffset2Legal(Sub, OffsetValue0, OffsetValue1, Size)) {
1505 Opnds.push_back(Zero);
1506 Opnds.push_back(Addr.getOperand(1));
1507 unsigned SubOp = AMDGPU::V_SUB_CO_U32_e32;
1508 if (Subtarget->hasAddNoCarryInsts()) {
1509 SubOp = AMDGPU::V_SUB_U32_e64;
1510 Opnds.push_back(
1511 CurDAG->getTargetConstant(0, {}, MVT::i1)); // clamp bit
1512 }
1513
1514 MachineSDNode *MachineSub = CurDAG->getMachineNode(
1515 SubOp, DL, MVT::getIntegerVT(Size * 8), Opnds);
1516
1517 Base = SDValue(MachineSub, 0);
1518 Offset0 =
1519 CurDAG->getTargetConstant(OffsetValue0 / Size, DL, MVT::i32);
1520 Offset1 =
1521 CurDAG->getTargetConstant(OffsetValue1 / Size, DL, MVT::i32);
1522 return true;
1523 }
1524 }
1525 }
1526 } else if (const ConstantSDNode *CAddr = dyn_cast<ConstantSDNode>(Addr)) {
1527 unsigned OffsetValue0 = CAddr->getZExtValue();
1528 unsigned OffsetValue1 = OffsetValue0 + Size;
1529
1530 if (isDSOffset2Legal(SDValue(), OffsetValue0, OffsetValue1, Size)) {
1531 SDValue Zero = CurDAG->getTargetConstant(0, DL, MVT::i32);
1532 MachineSDNode *MovZero =
1533 CurDAG->getMachineNode(AMDGPU::V_MOV_B32_e32, DL, MVT::i32, Zero);
1534 Base = SDValue(MovZero, 0);
1535 Offset0 = CurDAG->getTargetConstant(OffsetValue0 / Size, DL, MVT::i32);
1536 Offset1 = CurDAG->getTargetConstant(OffsetValue1 / Size, DL, MVT::i32);
1537 return true;
1538 }
1539 }
1540
1541 // default case
1542
1543 Base = Addr;
1544 Offset0 = CurDAG->getTargetConstant(0, DL, MVT::i32);
1545 Offset1 = CurDAG->getTargetConstant(1, DL, MVT::i32);
1546 return true;
1547}
1548
1549bool AMDGPUDAGToDAGISel::SelectMUBUF(SDValue Addr, SDValue &Ptr, SDValue &VAddr,
1550 SDValue &SOffset, SDValue &Offset,
1551 SDValue &Offen, SDValue &Idxen,
1552 SDValue &Addr64) const {
1553 // Subtarget prefers to use flat instruction
1554 // FIXME: This should be a pattern predicate and not reach here
1555 if (Subtarget->useFlatForGlobal())
1556 return false;
1557
1558 SDLoc DL(Addr);
1559
1560 Idxen = CurDAG->getTargetConstant(0, DL, MVT::i1);
1561 Offen = CurDAG->getTargetConstant(0, DL, MVT::i1);
1562 Addr64 = CurDAG->getTargetConstant(0, DL, MVT::i1);
1563 SOffset = Subtarget->hasRestrictedSOffset()
1564 ? CurDAG->getRegister(AMDGPU::SGPR_NULL, MVT::i32)
1565 : CurDAG->getTargetConstant(0, DL, MVT::i32);
1566
1567 ConstantSDNode *C1 = nullptr;
1568 SDValue N0 = Addr;
1569 if (CurDAG->isBaseWithConstantOffset(Addr)) {
1570 C1 = cast<ConstantSDNode>(Addr.getOperand(1));
1571 if (isUInt<32>(C1->getZExtValue()))
1572 N0 = Addr.getOperand(0);
1573 else
1574 C1 = nullptr;
1575 }
1576
1577 if (N0->isAnyAdd()) {
1578 // (add N2, N3) -> addr64, or
1579 // (add (add N2, N3), C1) -> addr64
1580 SDValue N2 = N0.getOperand(0);
1581 SDValue N3 = N0.getOperand(1);
1582 Addr64 = CurDAG->getTargetConstant(1, DL, MVT::i1);
1583
1584 if (N2->isDivergent()) {
1585 if (N3->isDivergent()) {
1586 // Both N2 and N3 are divergent. Use N0 (the result of the add) as the
1587 // addr64, and construct the resource from a 0 address.
1588 Ptr = SDValue(buildSMovImm64(DL, 0, MVT::v2i32), 0);
1589 VAddr = N0;
1590 } else {
1591 // N2 is divergent, N3 is not.
1592 Ptr = N3;
1593 VAddr = N2;
1594 }
1595 } else {
1596 // N2 is not divergent.
1597 Ptr = N2;
1598 VAddr = N3;
1599 }
1600 Offset = CurDAG->getTargetConstant(0, DL, MVT::i32);
1601 } else if (N0->isDivergent()) {
1602 // N0 is divergent. Use it as the addr64, and construct the resource from a
1603 // 0 address.
1604 Ptr = SDValue(buildSMovImm64(DL, 0, MVT::v2i32), 0);
1605 VAddr = N0;
1606 Addr64 = CurDAG->getTargetConstant(1, DL, MVT::i1);
1607 } else {
1608 // N0 -> offset, or
1609 // (N0 + C1) -> offset
1610 VAddr = CurDAG->getTargetConstant(0, DL, MVT::i32);
1611 Ptr = N0;
1612 }
1613
1614 if (!C1) {
1615 // No offset.
1616 Offset = CurDAG->getTargetConstant(0, DL, MVT::i32);
1617 return true;
1618 }
1619
1620 const SIInstrInfo *TII = Subtarget->getInstrInfo();
1621 if (TII->isLegalMUBUFImmOffset(C1->getZExtValue())) {
1622 // Legal offset for instruction.
1623 Offset = CurDAG->getTargetConstant(C1->getZExtValue(), DL, MVT::i32);
1624 return true;
1625 }
1626
1627 // Illegal offset, store it in soffset.
1628 Offset = CurDAG->getTargetConstant(0, DL, MVT::i32);
1629 SOffset =
1630 SDValue(CurDAG->getMachineNode(
1631 AMDGPU::S_MOV_B32, DL, MVT::i32,
1632 CurDAG->getTargetConstant(C1->getZExtValue(), DL, MVT::i32)),
1633 0);
1634 return true;
1635}
1636
1637bool AMDGPUDAGToDAGISel::SelectMUBUFAddr64(SDValue Addr, SDValue &SRsrc,
1638 SDValue &VAddr, SDValue &SOffset,
1639 SDValue &Offset) const {
1640 SDValue Ptr, Offen, Idxen, Addr64;
1641
1642 // addr64 bit was removed for volcanic islands.
1643 // FIXME: This should be a pattern predicate and not reach here
1644 if (!Subtarget->hasAddr64())
1645 return false;
1646
1647 if (!SelectMUBUF(Addr, Ptr, VAddr, SOffset, Offset, Offen, Idxen, Addr64))
1648 return false;
1649
1650 ConstantSDNode *C = cast<ConstantSDNode>(Addr64);
1651 if (C->getSExtValue()) {
1652 SDLoc DL(Addr);
1653
1654 const SITargetLowering& Lowering =
1655 *static_cast<const SITargetLowering*>(getTargetLowering());
1656
1657 SRsrc = SDValue(Lowering.wrapAddr64Rsrc(*CurDAG, DL, Ptr), 0);
1658 return true;
1659 }
1660
1661 return false;
1662}
1663
1664std::pair<SDValue, SDValue> AMDGPUDAGToDAGISel::foldFrameIndex(SDValue N) const {
1665 SDLoc DL(N);
1666
1667 auto *FI = dyn_cast<FrameIndexSDNode>(N);
1668 SDValue TFI =
1669 FI ? CurDAG->getTargetFrameIndex(FI->getIndex(), FI->getValueType(0)) : N;
1670
1671 // We rebase the base address into an absolute stack address and hence
1672 // use constant 0 for soffset. This value must be retained until
1673 // frame elimination and eliminateFrameIndex will choose the appropriate
1674 // frame register if need be.
1675 return std::pair(TFI, CurDAG->getTargetConstant(0, DL, MVT::i32));
1676}
1677
1678bool AMDGPUDAGToDAGISel::SelectMUBUFScratchOffen(SDNode *Parent,
1679 SDValue Addr, SDValue &Rsrc,
1680 SDValue &VAddr, SDValue &SOffset,
1681 SDValue &ImmOffset) const {
1682
1683 SDLoc DL(Addr);
1684 MachineFunction &MF = CurDAG->getMachineFunction();
1685 const SIMachineFunctionInfo *Info = MF.getInfo<SIMachineFunctionInfo>();
1686
1687 Rsrc = CurDAG->getRegister(Info->getScratchRSrcReg(), MVT::v4i32);
1688
1689 if (ConstantSDNode *CAddr = dyn_cast<ConstantSDNode>(Addr)) {
1690 int64_t Imm = CAddr->getSExtValue();
1691 const int64_t NullPtr =
1693 // Don't fold null pointer.
1694 if (Imm != NullPtr) {
1695 const uint32_t MaxOffset = SIInstrInfo::getMaxMUBUFImmOffset(*Subtarget);
1696 SDValue HighBits =
1697 CurDAG->getTargetConstant(Imm & ~MaxOffset, DL, MVT::i32);
1698 MachineSDNode *MovHighBits = CurDAG->getMachineNode(
1699 AMDGPU::V_MOV_B32_e32, DL, MVT::i32, HighBits);
1700 VAddr = SDValue(MovHighBits, 0);
1701
1702 SOffset = CurDAG->getTargetConstant(0, DL, MVT::i32);
1703 ImmOffset = CurDAG->getTargetConstant(Imm & MaxOffset, DL, MVT::i32);
1704 return true;
1705 }
1706 }
1707
1708 if (CurDAG->isBaseWithConstantOffset(Addr)) {
1709 // (add n0, c1)
1710
1711 SDValue N0 = Addr.getOperand(0);
1712 uint64_t C1 = Addr.getConstantOperandVal(1);
1713
1714 // Offsets in vaddr must be positive if range checking is enabled.
1715 //
1716 // The total computation of vaddr + soffset + offset must not overflow. If
1717 // vaddr is negative, even if offset is 0 the sgpr offset add will end up
1718 // overflowing.
1719 //
1720 // Prior to gfx9, MUBUF instructions with the vaddr offset enabled would
1721 // always perform a range check. If a negative vaddr base index was used,
1722 // this would fail the range check. The overall address computation would
1723 // compute a valid address, but this doesn't happen due to the range
1724 // check. For out-of-bounds MUBUF loads, a 0 is returned.
1725 //
1726 // Therefore it should be safe to fold any VGPR offset on gfx9 into the
1727 // MUBUF vaddr, but not on older subtargets which can only do this if the
1728 // sign bit is known 0.
1729 const SIInstrInfo *TII = Subtarget->getInstrInfo();
1730 if (TII->isLegalMUBUFImmOffset(C1) &&
1731 (!Subtarget->privateMemoryResourceIsRangeChecked() ||
1732 CurDAG->SignBitIsZero(N0))) {
1733 std::tie(VAddr, SOffset) = foldFrameIndex(N0);
1734 ImmOffset = CurDAG->getTargetConstant(C1, DL, MVT::i32);
1735 return true;
1736 }
1737 }
1738
1739 // (node)
1740 std::tie(VAddr, SOffset) = foldFrameIndex(Addr);
1741 ImmOffset = CurDAG->getTargetConstant(0, DL, MVT::i32);
1742 return true;
1743}
1744
1745static bool IsCopyFromSGPR(const SIRegisterInfo &TRI, SDValue Val) {
1746 if (Val.getOpcode() != ISD::CopyFromReg)
1747 return false;
1748 auto Reg = cast<RegisterSDNode>(Val.getOperand(1))->getReg();
1749 if (!Reg.isPhysical())
1750 return false;
1751 const auto *RC = TRI.getPhysRegBaseClass(Reg);
1752 return RC && TRI.isSGPRClass(RC);
1753}
1754
1755bool AMDGPUDAGToDAGISel::SelectMUBUFScratchOffset(SDNode *Parent,
1756 SDValue Addr,
1757 SDValue &SRsrc,
1758 SDValue &SOffset,
1759 SDValue &Offset) const {
1760 const SIRegisterInfo *TRI = Subtarget->getRegisterInfo();
1761 const SIInstrInfo *TII = Subtarget->getInstrInfo();
1762 MachineFunction &MF = CurDAG->getMachineFunction();
1763 const SIMachineFunctionInfo *Info = MF.getInfo<SIMachineFunctionInfo>();
1764 SDLoc DL(Addr);
1765
1766 // CopyFromReg <sgpr>
1767 if (IsCopyFromSGPR(*TRI, Addr)) {
1768 SRsrc = CurDAG->getRegister(Info->getScratchRSrcReg(), MVT::v4i32);
1769 SOffset = Addr;
1770 Offset = CurDAG->getTargetConstant(0, DL, MVT::i32);
1771 return true;
1772 }
1773
1774 ConstantSDNode *CAddr;
1775 if (Addr.getOpcode() == ISD::ADD) {
1776 // Add (CopyFromReg <sgpr>) <constant>
1777 CAddr = dyn_cast<ConstantSDNode>(Addr.getOperand(1));
1778 if (!CAddr || !TII->isLegalMUBUFImmOffset(CAddr->getZExtValue()))
1779 return false;
1780 if (!IsCopyFromSGPR(*TRI, Addr.getOperand(0)))
1781 return false;
1782
1783 SOffset = Addr.getOperand(0);
1784 } else if ((CAddr = dyn_cast<ConstantSDNode>(Addr)) &&
1785 TII->isLegalMUBUFImmOffset(CAddr->getZExtValue())) {
1786 // <constant>
1787 SOffset = CurDAG->getTargetConstant(0, DL, MVT::i32);
1788 } else {
1789 return false;
1790 }
1791
1792 SRsrc = CurDAG->getRegister(Info->getScratchRSrcReg(), MVT::v4i32);
1793
1794 Offset = CurDAG->getTargetConstant(CAddr->getZExtValue(), DL, MVT::i32);
1795 return true;
1796}
1797
1798bool AMDGPUDAGToDAGISel::SelectMUBUFOffset(SDValue Addr, SDValue &SRsrc,
1799 SDValue &SOffset, SDValue &Offset
1800 ) const {
1801 SDValue Ptr, VAddr, Offen, Idxen, Addr64;
1802 const SIInstrInfo *TII = Subtarget->getInstrInfo();
1803
1804 if (!SelectMUBUF(Addr, Ptr, VAddr, SOffset, Offset, Offen, Idxen, Addr64))
1805 return false;
1806
1807 if (!cast<ConstantSDNode>(Offen)->getSExtValue() &&
1808 !cast<ConstantSDNode>(Idxen)->getSExtValue() &&
1809 !cast<ConstantSDNode>(Addr64)->getSExtValue()) {
1810 uint64_t Rsrc = TII->getDefaultRsrcDataFormat() |
1811 maskTrailingOnes<uint64_t>(32); // Size
1812 SDLoc DL(Addr);
1813
1814 const SITargetLowering& Lowering =
1815 *static_cast<const SITargetLowering*>(getTargetLowering());
1816
1817 SRsrc = SDValue(Lowering.buildRSRC(*CurDAG, DL, Ptr, 0, Rsrc), 0);
1818 return true;
1819 }
1820 return false;
1821}
1822
1823bool AMDGPUDAGToDAGISel::SelectBUFSOffset(SDValue ByteOffsetNode,
1824 SDValue &SOffset) const {
1825 if (Subtarget->hasRestrictedSOffset() && isNullConstant(ByteOffsetNode)) {
1826 SOffset = CurDAG->getRegister(AMDGPU::SGPR_NULL, MVT::i32);
1827 return true;
1828 }
1829
1830 SOffset = ByteOffsetNode;
1831 return true;
1832}
1833
1834// Find a load or store from corresponding pattern root.
1835// Roots may be build_vector, bitconvert or their combinations.
1838 if (MemSDNode *MN = dyn_cast<MemSDNode>(N))
1839 return MN;
1841 for (SDValue V : N->op_values())
1842 if (MemSDNode *MN =
1844 return MN;
1845 llvm_unreachable("cannot find MemSDNode in the pattern!");
1846}
1847
1848bool AMDGPUDAGToDAGISel::SelectFlatOffsetImpl(SDNode *N, SDValue Addr,
1849 SDValue &VAddr, SDValue &Offset,
1850 uint64_t FlatVariant) const {
1851 int64_t OffsetVal = 0;
1852
1853 unsigned AS = findMemSDNode(N)->getAddressSpace();
1854
1855 bool CanHaveFlatSegmentOffsetBug =
1856 Subtarget->hasFlatSegmentOffsetBug() &&
1857 FlatVariant == SIInstrFlags::FLAT &&
1859
1860 if (Subtarget->hasFlatInstOffsets() && !CanHaveFlatSegmentOffsetBug) {
1861 SDValue N0, N1;
1862 if (isBaseWithConstantOffset64(Addr, N0, N1) &&
1863 (FlatVariant != SIInstrFlags::FlatScratch ||
1864 isFlatScratchBaseLegal(Addr))) {
1865 int64_t COffsetVal = cast<ConstantSDNode>(N1)->getSExtValue();
1866
1867 // Adding the offset to the base address in a FLAT instruction must not
1868 // change the memory aperture in which the address falls. Therefore we can
1869 // only fold offsets from inbounds GEPs into FLAT instructions.
1870 bool IsInBounds =
1871 Addr.getOpcode() == ISD::PTRADD && Addr->getFlags().hasInBounds();
1872 if (COffsetVal == 0 || FlatVariant != SIInstrFlags::FLAT || IsInBounds) {
1873 const SIInstrInfo *TII = Subtarget->getInstrInfo();
1874 if (TII->isLegalFLATOffset(COffsetVal, AS, FlatVariant)) {
1875 Addr = N0;
1876 OffsetVal = COffsetVal;
1877 } else {
1878 // If the offset doesn't fit, put the low bits into the offset field
1879 // and add the rest.
1880 //
1881 // For a FLAT instruction the hardware decides whether to access
1882 // global/scratch/shared memory based on the high bits of vaddr,
1883 // ignoring the offset field, so we have to ensure that when we add
1884 // remainder to vaddr it still points into the same underlying object.
1885 // The easiest way to do that is to make sure that we split the offset
1886 // into two pieces that are both >= 0 or both <= 0.
1887
1888 SDLoc DL(N);
1889 uint64_t RemainderOffset;
1890
1891 std::tie(OffsetVal, RemainderOffset) =
1892 TII->splitFlatOffset(COffsetVal, AS, FlatVariant);
1893
1894 SDValue AddOffsetLo =
1895 getMaterializedScalarImm32(Lo_32(RemainderOffset), DL);
1896 SDValue Clamp = CurDAG->getTargetConstant(0, DL, MVT::i1);
1897
1898 if (Addr.getValueType().getSizeInBits() == 32) {
1900 Opnds.push_back(N0);
1901 Opnds.push_back(AddOffsetLo);
1902 unsigned AddOp = AMDGPU::V_ADD_CO_U32_e32;
1903 if (Subtarget->hasAddNoCarryInsts()) {
1904 AddOp = AMDGPU::V_ADD_U32_e64;
1905 Opnds.push_back(Clamp);
1906 }
1907 Addr =
1908 SDValue(CurDAG->getMachineNode(AddOp, DL, MVT::i32, Opnds), 0);
1909 } else {
1910 // TODO: Should this try to use a scalar add pseudo if the base
1911 // address is uniform and saddr is usable?
1912 SDValue Sub0 =
1913 CurDAG->getTargetConstant(AMDGPU::sub0, DL, MVT::i32);
1914 SDValue Sub1 =
1915 CurDAG->getTargetConstant(AMDGPU::sub1, DL, MVT::i32);
1916
1917 SDNode *N0Lo = CurDAG->getMachineNode(TargetOpcode::EXTRACT_SUBREG,
1918 DL, MVT::i32, N0, Sub0);
1919 SDNode *N0Hi = CurDAG->getMachineNode(TargetOpcode::EXTRACT_SUBREG,
1920 DL, MVT::i32, N0, Sub1);
1921
1922 SDValue AddOffsetHi =
1923 getMaterializedScalarImm32(Hi_32(RemainderOffset), DL);
1924
1925 SDVTList VTs = CurDAG->getVTList(MVT::i32, MVT::i1);
1926
1927 SDNode *Add =
1928 CurDAG->getMachineNode(AMDGPU::V_ADD_CO_U32_e64, DL, VTs,
1929 {AddOffsetLo, SDValue(N0Lo, 0), Clamp});
1930
1931 SDNode *Addc = CurDAG->getMachineNode(
1932 AMDGPU::V_ADDC_U32_e64, DL, VTs,
1933 {AddOffsetHi, SDValue(N0Hi, 0), SDValue(Add, 1), Clamp});
1934
1935 SDValue RegSequenceArgs[] = {
1936 CurDAG->getTargetConstant(AMDGPU::VReg_64RegClassID, DL,
1937 MVT::i32),
1938 SDValue(Add, 0), Sub0, SDValue(Addc, 0), Sub1};
1939
1940 Addr = SDValue(CurDAG->getMachineNode(AMDGPU::REG_SEQUENCE, DL,
1941 MVT::i64, RegSequenceArgs),
1942 0);
1943 }
1944 }
1945 }
1946 }
1947 }
1948
1949 VAddr = Addr;
1950 Offset = CurDAG->getSignedTargetConstant(OffsetVal, SDLoc(), MVT::i32);
1951 return true;
1952}
1953
1954bool AMDGPUDAGToDAGISel::SelectFlatOffset(SDNode *N, SDValue Addr,
1955 SDValue &VAddr,
1956 SDValue &Offset) const {
1957 return SelectFlatOffsetImpl(N, Addr, VAddr, Offset, SIInstrFlags::FLAT);
1958}
1959
1960bool AMDGPUDAGToDAGISel::SelectGlobalOffset(SDNode *N, SDValue Addr,
1961 SDValue &VAddr,
1962 SDValue &Offset) const {
1963 return SelectFlatOffsetImpl(N, Addr, VAddr, Offset, SIInstrFlags::FlatGlobal);
1964}
1965
1966bool AMDGPUDAGToDAGISel::SelectScratchOffset(SDNode *N, SDValue Addr,
1967 SDValue &VAddr,
1968 SDValue &Offset) const {
1969 return SelectFlatOffsetImpl(N, Addr, VAddr, Offset,
1971}
1972
1973// If this matches *_extend i32:x, return x
1974// Otherwise if the value is I32 returns x.
1976 const SelectionDAG *DAG) {
1977 if (Op.getValueType() == MVT::i32)
1978 return Op;
1979
1980 if (Op.getOpcode() != (IsSigned ? ISD::SIGN_EXTEND : ISD::ZERO_EXTEND) &&
1981 Op.getOpcode() != ISD::ANY_EXTEND &&
1982 !(DAG->SignBitIsZero(Op) &&
1983 Op.getOpcode() == (IsSigned ? ISD::ZERO_EXTEND : ISD::SIGN_EXTEND)))
1984 return SDValue();
1985
1986 SDValue ExtSrc = Op.getOperand(0);
1987 return (ExtSrc.getValueType() == MVT::i32) ? ExtSrc : SDValue();
1988}
1989
// Match (64-bit SGPR base) + (zext vgpr offset) + sext(imm offset)
// or (64-bit SGPR base) + (sext vgpr offset) + sext(imm offset)
//
// On success fills in SAddr (scalar base), VOffset (32-bit VGPR offset),
// Offset (immediate operand) and ScaleOffset (whether the pre-scaled
// offset form was matched). When NeedIOffset is false no immediate offset
// is folded. Returns false if no saddr-form address can be formed.
bool AMDGPUDAGToDAGISel::SelectGlobalSAddr(SDNode *N, SDValue Addr,
                                           SDValue &SAddr, SDValue &VOffset,
                                           SDValue &Offset, bool &ScaleOffset,
                                           bool NeedIOffset) const {
  int64_t ImmOffset = 0;
  ScaleOffset = false;

  // Match the immediate offset first, which canonically is moved as low as
  // possible.

  SDValue LHS, RHS;
  if (isBaseWithConstantOffset64(Addr, LHS, RHS)) {
    int64_t COffsetVal = cast<ConstantSDNode>(RHS)->getSExtValue();
    const SIInstrInfo *TII = Subtarget->getInstrInfo();

    // NOTE(review): a continuation line of this condition (the flat-offset
    // flags argument) was lost in extraction — confirm against upstream.
    if (NeedIOffset &&
        TII->isLegalFLATOffset(COffsetVal, AMDGPUAS::GLOBAL_ADDRESS,
      Addr = LHS;
      ImmOffset = COffsetVal;
    } else if (!LHS->isDivergent()) {
      if (COffsetVal > 0) {
        SDLoc SL(N);
        // saddr + large_offset -> saddr +
        //                         (voffset = large_offset & ~MaxOffset) +
        //                         (large_offset & MaxOffset);
        int64_t SplitImmOffset = 0, RemainderOffset = COffsetVal;
        if (NeedIOffset) {
          // NOTE(review): the argument line of this call was lost in
          // extraction.
          std::tie(SplitImmOffset, RemainderOffset) = TII->splitFlatOffset(
        }

        if (Subtarget->hasSignedGVSOffset() ? isInt<32>(RemainderOffset)
                                            : isUInt<32>(RemainderOffset)) {
          // Materialize the out-of-range remainder in a VGPR; the split
          // immediate rides in the instruction's offset field.
          SDNode *VMov = CurDAG->getMachineNode(
              AMDGPU::V_MOV_B32_e32, SL, MVT::i32,
              CurDAG->getTargetConstant(RemainderOffset, SDLoc(), MVT::i32));
          VOffset = SDValue(VMov, 0);
          SAddr = LHS;
          Offset = CurDAG->getTargetConstant(SplitImmOffset, SDLoc(), MVT::i32);
          return true;
        }
      }

      // We are adding a 64 bit SGPR and a constant. If constant bus limit
      // is 1 we would need to perform 1 or 2 extra moves for each half of
      // the constant and it is better to do a scalar add and then issue a
      // single VALU instruction to materialize zero. Otherwise it is less
      // instructions to perform VALU adds with immediates or inline literals.
      unsigned NumLiterals =
          !TII->isInlineConstant(APInt(32, Lo_32(COffsetVal))) +
          !TII->isInlineConstant(APInt(32, Hi_32(COffsetVal)));
      if (Subtarget->getConstantBusLimit(AMDGPU::V_ADD_U32_e64) > NumLiterals)
        return false;
    }
  }

  // Match the variable offset.
  if (Addr->isAnyAdd()) {
    LHS = Addr.getOperand(0);

    if (!LHS->isDivergent()) {
      // add (i64 sgpr), (*_extend (i32 vgpr))
      RHS = Addr.getOperand(1);
      ScaleOffset = SelectScaleOffset(N, RHS, Subtarget->hasSignedGVSOffset());
      if (SDValue ExtRHS = matchExtFromI32orI32(
              RHS, Subtarget->hasSignedGVSOffset(), CurDAG)) {
        SAddr = LHS;
        VOffset = ExtRHS;
      }
    }

    RHS = Addr.getOperand(1);
    // Try the commuted form only if the first form did not match.
    if (!SAddr && !RHS->isDivergent()) {
      // add (*_extend (i32 vgpr)), (i64 sgpr)
      ScaleOffset = SelectScaleOffset(N, LHS, Subtarget->hasSignedGVSOffset());
      if (SDValue ExtLHS = matchExtFromI32orI32(
              LHS, Subtarget->hasSignedGVSOffset(), CurDAG)) {
        SAddr = RHS;
        VOffset = ExtLHS;
      }
    }

    if (SAddr) {
      Offset = CurDAG->getSignedTargetConstant(ImmOffset, SDLoc(), MVT::i32);
      return true;
    }
  }

  // NOTE(review): two continuation lines of this condition were lost in
  // extraction (the opcode chosen for the signed case, and a check on
  // operand 1) — confirm against upstream before relying on this text.
  if (Subtarget->hasScaleOffset() &&
      (Addr.getOpcode() == (Subtarget->hasSignedGVSOffset()
       (Addr.getOpcode() == AMDGPUISD::MAD_U64_U32 &&
        CurDAG->SignBitIsZero(Addr.getOperand(0)))) &&
      Addr.getOperand(0)->isDivergent() &&
      !Addr.getOperand(2)->isDivergent()) {
    // mad_u64_u32 (i32 vgpr), (i32 c), (i64 sgpr)
    unsigned Size =
        (unsigned)cast<MemSDNode>(N)->getMemoryVT().getFixedSizeInBits() / 8;
    // The multiplier must equal the access size for the scaled form.
    ScaleOffset = Addr.getConstantOperandVal(1) == Size;
    if (ScaleOffset) {
      SAddr = Addr.getOperand(2);
      VOffset = Addr.getOperand(0);
      Offset = CurDAG->getTargetConstant(ImmOffset, SDLoc(), MVT::i32);
      return true;
    }
  }

  if (Addr->isDivergent() || Addr.getOpcode() == ISD::UNDEF ||
      isa<ConstantSDNode>(Addr))
    return false;

  // It's cheaper to materialize a single 32-bit zero for vaddr than the two
  // moves required to copy a 64-bit SGPR to VGPR.
  SAddr = Addr;
  SDNode *VMov =
      CurDAG->getMachineNode(AMDGPU::V_MOV_B32_e32, SDLoc(Addr), MVT::i32,
                             CurDAG->getTargetConstant(0, SDLoc(), MVT::i32));
  VOffset = SDValue(VMov, 0);
  Offset = CurDAG->getSignedTargetConstant(ImmOffset, SDLoc(), MVT::i32);
  return true;
}
2116
2117bool AMDGPUDAGToDAGISel::SelectGlobalSAddr(SDNode *N, SDValue Addr,
2118 SDValue &SAddr, SDValue &VOffset,
2119 SDValue &Offset,
2120 SDValue &CPol) const {
2121 bool ScaleOffset;
2122 if (!SelectGlobalSAddr(N, Addr, SAddr, VOffset, Offset, ScaleOffset))
2123 return false;
2124
2125 CPol = CurDAG->getTargetConstant(ScaleOffset ? AMDGPU::CPol::SCAL : 0,
2126 SDLoc(), MVT::i32);
2127 return true;
2128}
2129
2130bool AMDGPUDAGToDAGISel::SelectGlobalSAddrCPol(SDNode *N, SDValue Addr,
2131 SDValue &SAddr, SDValue &VOffset,
2132 SDValue &Offset,
2133 SDValue &CPol) const {
2134 bool ScaleOffset;
2135 if (!SelectGlobalSAddr(N, Addr, SAddr, VOffset, Offset, ScaleOffset))
2136 return false;
2137
2138 // We are assuming CPol is always the last operand of the intrinsic.
2139 auto PassedCPol =
2140 N->getConstantOperandVal(N->getNumOperands() - 1) & ~AMDGPU::CPol::SCAL;
2141 CPol = CurDAG->getTargetConstant(
2142 (ScaleOffset ? AMDGPU::CPol::SCAL : 0) | PassedCPol, SDLoc(), MVT::i32);
2143 return true;
2144}
2145
2146bool AMDGPUDAGToDAGISel::SelectGlobalSAddrCPolM0(SDNode *N, SDValue Addr,
2147 SDValue &SAddr,
2148 SDValue &VOffset,
2149 SDValue &Offset,
2150 SDValue &CPol) const {
2151 bool ScaleOffset;
2152 if (!SelectGlobalSAddr(N, Addr, SAddr, VOffset, Offset, ScaleOffset))
2153 return false;
2154
2155 // We are assuming CPol is second from last operand of the intrinsic.
2156 auto PassedCPol =
2157 N->getConstantOperandVal(N->getNumOperands() - 2) & ~AMDGPU::CPol::SCAL;
2158 CPol = CurDAG->getTargetConstant(
2159 (ScaleOffset ? AMDGPU::CPol::SCAL : 0) | PassedCPol, SDLoc(), MVT::i32);
2160 return true;
2161}
2162
2163bool AMDGPUDAGToDAGISel::SelectGlobalSAddrGLC(SDNode *N, SDValue Addr,
2164 SDValue &SAddr, SDValue &VOffset,
2165 SDValue &Offset,
2166 SDValue &CPol) const {
2167 bool ScaleOffset;
2168 if (!SelectGlobalSAddr(N, Addr, SAddr, VOffset, Offset, ScaleOffset))
2169 return false;
2170
2171 unsigned CPolVal = (ScaleOffset ? AMDGPU::CPol::SCAL : 0) | AMDGPU::CPol::GLC;
2172 CPol = CurDAG->getTargetConstant(CPolVal, SDLoc(), MVT::i32);
2173 return true;
2174}
2175
2176bool AMDGPUDAGToDAGISel::SelectGlobalSAddrNoIOffset(SDNode *N, SDValue Addr,
2177 SDValue &SAddr,
2178 SDValue &VOffset,
2179 SDValue &CPol) const {
2180 bool ScaleOffset;
2181 SDValue DummyOffset;
2182 if (!SelectGlobalSAddr(N, Addr, SAddr, VOffset, DummyOffset, ScaleOffset,
2183 false))
2184 return false;
2185
2186 // We are assuming CPol is always the last operand of the intrinsic.
2187 auto PassedCPol =
2188 N->getConstantOperandVal(N->getNumOperands() - 1) & ~AMDGPU::CPol::SCAL;
2189 CPol = CurDAG->getTargetConstant(
2190 (ScaleOffset ? AMDGPU::CPol::SCAL : 0) | PassedCPol, SDLoc(), MVT::i32);
2191 return true;
2192}
2193
2194bool AMDGPUDAGToDAGISel::SelectGlobalSAddrNoIOffsetM0(SDNode *N, SDValue Addr,
2195 SDValue &SAddr,
2196 SDValue &VOffset,
2197 SDValue &CPol) const {
2198 bool ScaleOffset;
2199 SDValue DummyOffset;
2200 if (!SelectGlobalSAddr(N, Addr, SAddr, VOffset, DummyOffset, ScaleOffset,
2201 false))
2202 return false;
2203
2204 // We are assuming CPol is second from last operand of the intrinsic.
2205 auto PassedCPol =
2206 N->getConstantOperandVal(N->getNumOperands() - 2) & ~AMDGPU::CPol::SCAL;
2207 CPol = CurDAG->getTargetConstant(
2208 (ScaleOffset ? AMDGPU::CPol::SCAL : 0) | PassedCPol, SDLoc(), MVT::i32);
2209 return true;
2210}
2211
2213 if (auto *FI = dyn_cast<FrameIndexSDNode>(SAddr)) {
2214 SAddr = CurDAG->getTargetFrameIndex(FI->getIndex(), FI->getValueType(0));
2215 } else if (SAddr.getOpcode() == ISD::ADD &&
2217 // Materialize this into a scalar move for scalar address to avoid
2218 // readfirstlane.
2219 auto *FI = cast<FrameIndexSDNode>(SAddr.getOperand(0));
2220 SDValue TFI = CurDAG->getTargetFrameIndex(FI->getIndex(),
2221 FI->getValueType(0));
2222 SAddr = SDValue(CurDAG->getMachineNode(AMDGPU::S_ADD_I32, SDLoc(SAddr),
2223 MVT::i32, TFI, SAddr.getOperand(1)),
2224 0);
2225 }
2226
2227 return SAddr;
2228}
2229
// Match (32-bit SGPR base) + sext(imm offset)
//
// On success fills in SAddr (uniform base, frame indexes folded) and
// Offset (immediate operand). Returns false only for divergent addresses.
bool AMDGPUDAGToDAGISel::SelectScratchSAddr(SDNode *Parent, SDValue Addr,
                                            SDValue &SAddr,
                                            SDValue &Offset) const {
  // The saddr form requires a uniform address.
  if (Addr->isDivergent())
    return false;

  SDLoc DL(Addr);

  int64_t COffsetVal = 0;

  // Peel a constant offset off the base when the base stays legal.
  if (CurDAG->isBaseWithConstantOffset(Addr) && isFlatScratchBaseLegal(Addr)) {
    COffsetVal = cast<ConstantSDNode>(Addr.getOperand(1))->getSExtValue();
    SAddr = Addr.getOperand(0);
  } else {
    SAddr = Addr;
  }

  // Fold a frame index (or FI + x) into the scalar base.
  SAddr = SelectSAddrFI(CurDAG, SAddr);

  const SIInstrInfo *TII = Subtarget->getInstrInfo();

  // NOTE(review): continuation lines of this condition and of the two
  // calls below were lost in extraction (the flags / split arguments) —
  // confirm against upstream.
  if (!TII->isLegalFLATOffset(COffsetVal, AMDGPUAS::PRIVATE_ADDRESS,
    int64_t SplitImmOffset, RemainderOffset;
    std::tie(SplitImmOffset, RemainderOffset) = TII->splitFlatOffset(

    COffsetVal = SplitImmOffset;

    // Fold the out-of-range part of the offset into the base with an
    // S_ADD_I32.
    SDValue AddOffset =
            ? getMaterializedScalarImm32(Lo_32(RemainderOffset), DL)
            : CurDAG->getSignedTargetConstant(RemainderOffset, DL, MVT::i32);
    SAddr = SDValue(CurDAG->getMachineNode(AMDGPU::S_ADD_I32, DL, MVT::i32,
                                           SAddr, AddOffset),
                    0);
  }

  Offset = CurDAG->getSignedTargetConstant(COffsetVal, DL, MVT::i32);

  return true;
}
2273
2274// Check whether the flat scratch SVS swizzle bug affects this access.
2275bool AMDGPUDAGToDAGISel::checkFlatScratchSVSSwizzleBug(
2276 SDValue VAddr, SDValue SAddr, uint64_t ImmOffset) const {
2277 if (!Subtarget->hasFlatScratchSVSSwizzleBug())
2278 return false;
2279
2280 // The bug affects the swizzling of SVS accesses if there is any carry out
2281 // from the two low order bits (i.e. from bit 1 into bit 2) when adding
2282 // voffset to (soffset + inst_offset).
2283 KnownBits VKnown = CurDAG->computeKnownBits(VAddr);
2284 KnownBits SKnown =
2285 KnownBits::add(CurDAG->computeKnownBits(SAddr),
2286 KnownBits::makeConstant(APInt(32, ImmOffset,
2287 /*isSigned=*/true)));
2288 uint64_t VMax = VKnown.getMaxValue().getZExtValue();
2289 uint64_t SMax = SKnown.getMaxValue().getZExtValue();
2290 return (VMax & 3) + (SMax & 3) >= 4;
2291}
2292
// Match a scratch (private) address in the SVS form:
// (VGPR offset) + (SGPR base) + immediate offset.
bool AMDGPUDAGToDAGISel::SelectScratchSVAddr(SDNode *N, SDValue Addr,
                                             SDValue &VAddr, SDValue &SAddr,
                                             SDValue &Offset,
                                             SDValue &CPol) const {
  int64_t ImmOffset = 0;

  SDValue LHS, RHS;
  // Remember the unpeeled address for the base-legality checks below.
  SDValue OrigAddr = Addr;
  if (isBaseWithConstantOffset64(Addr, LHS, RHS)) {
    int64_t COffsetVal = cast<ConstantSDNode>(RHS)->getSExtValue();
    const SIInstrInfo *TII = Subtarget->getInstrInfo();

    // NOTE(review): a continuation line of this condition (the flat-offset
    // flags argument) was lost in extraction — confirm against upstream.
    if (TII->isLegalFLATOffset(COffsetVal, AMDGPUAS::PRIVATE_ADDRESS,
      Addr = LHS;
      ImmOffset = COffsetVal;
    } else if (!LHS->isDivergent() && COffsetVal > 0) {
      SDLoc SL(N);
      // saddr + large_offset -> saddr + (vaddr = large_offset & ~MaxOffset) +
      //                         (large_offset & MaxOffset);
      int64_t SplitImmOffset, RemainderOffset;
      // NOTE(review): the argument line of this call was lost in extraction.
      std::tie(SplitImmOffset, RemainderOffset) = TII->splitFlatOffset(

      if (isUInt<32>(RemainderOffset)) {
        // Materialize the out-of-range remainder of the offset in a VGPR.
        SDNode *VMov = CurDAG->getMachineNode(
            AMDGPU::V_MOV_B32_e32, SL, MVT::i32,
            CurDAG->getTargetConstant(RemainderOffset, SDLoc(), MVT::i32));
        VAddr = SDValue(VMov, 0);
        SAddr = LHS;
        if (!isFlatScratchBaseLegal(Addr))
          return false;
        if (checkFlatScratchSVSSwizzleBug(VAddr, SAddr, SplitImmOffset))
          return false;
        Offset = CurDAG->getTargetConstant(SplitImmOffset, SDLoc(), MVT::i32);
        CPol = CurDAG->getTargetConstant(0, SDLoc(), MVT::i32);
        return true;
      }
    }
  }

  if (Addr.getOpcode() != ISD::ADD)
    return false;

  LHS = Addr.getOperand(0);
  RHS = Addr.getOperand(1);

  // One operand must be uniform (the SGPR base) and the other divergent
  // (the VGPR offset); either order is accepted.
  if (!LHS->isDivergent() && RHS->isDivergent()) {
    SAddr = LHS;
    VAddr = RHS;
  } else if (!RHS->isDivergent() && LHS->isDivergent()) {
    SAddr = RHS;
    VAddr = LHS;
  } else {
    return false;
  }

  // Base-legality is checked on the original address when an immediate
  // offset was peeled off above.
  if (OrigAddr != Addr) {
    if (!isFlatScratchBaseLegalSVImm(OrigAddr))
      return false;
  } else {
    if (!isFlatScratchBaseLegalSV(OrigAddr))
      return false;
  }

  if (checkFlatScratchSVSSwizzleBug(VAddr, SAddr, ImmOffset))
    return false;
  SAddr = SelectSAddrFI(CurDAG, SAddr);
  Offset = CurDAG->getSignedTargetConstant(ImmOffset, SDLoc(), MVT::i32);

  // Encode the scale-offset decision in the cache-policy operand.
  bool ScaleOffset = SelectScaleOffset(N, VAddr, true /* IsSigned */);
  CPol = CurDAG->getTargetConstant(ScaleOffset ? AMDGPU::CPol::SCAL : 0,
                                   SDLoc(), MVT::i32);
  return true;
}
2368
2369// For unbuffered smem loads, it is illegal for the Immediate Offset to be
2370// negative if the resulting (Offset + (M0 or SOffset or zero) is negative.
2371// Handle the case where the Immediate Offset + SOffset is negative.
2372bool AMDGPUDAGToDAGISel::isSOffsetLegalWithImmOffset(SDValue *SOffset,
2373 bool Imm32Only,
2374 bool IsBuffer,
2375 int64_t ImmOffset) const {
2376 if (!IsBuffer && !Imm32Only && ImmOffset < 0 &&
2377 AMDGPU::hasSMRDSignedImmOffset(*Subtarget)) {
2378 KnownBits SKnown = CurDAG->computeKnownBits(*SOffset);
2379 if (ImmOffset + SKnown.getMinValue().getSExtValue() < 0)
2380 return false;
2381 }
2382
2383 return true;
2384}
2385
// Given \p Offset and load node \p N check if an \p Offset is a multiple of
// the load byte size. If it is update \p Offset to a pre-scaled value and
// return true.
bool AMDGPUDAGToDAGISel::SelectScaleOffset(SDNode *N, SDValue &Offset,
                                           bool IsSigned) const {
  bool ScaleOffset = false;
  // Pre-scaled offsets are only available on subtargets with the feature.
  if (!Subtarget->hasScaleOffset() || !Offset)
    return false;

  // Byte size of the memory access being selected.
  unsigned Size =
      (unsigned)cast<MemSDNode>(N)->getMemoryVT().getFixedSizeInBits() / 8;

  // Look through a 32-bit extend of the offset, if any.
  SDValue Off = Offset;
  if (SDValue Ext = matchExtFromI32orI32(Offset, IsSigned, CurDAG))
    Off = Ext;

  // shl x, log2(Size) is a multiplication by the access size.
  if (isPowerOf2_32(Size) && Off.getOpcode() == ISD::SHL) {
    if (auto *C = dyn_cast<ConstantSDNode>(Off.getOperand(1)))
      ScaleOffset = C->getZExtValue() == Log2_32(Size);
  } else if (Offset.getOpcode() == ISD::MUL ||
             (IsSigned && Offset.getOpcode() == AMDGPUISD::MUL_I24) ||
             Offset.getOpcode() == AMDGPUISD::MUL_U24 ||
             (Offset.isMachineOpcode() &&
              Offset.getMachineOpcode() ==
                  (IsSigned ? AMDGPU::S_MUL_I64_I32_PSEUDO
                            : AMDGPU::S_MUL_U64_U32_PSEUDO))) {
    // An explicit multiply by the access size also qualifies. Note this
    // branch matches on Offset (pre-extend), not Off.
    if (auto *C = dyn_cast<ConstantSDNode>(Offset.getOperand(1)))
      ScaleOffset = C->getZExtValue() == Size;
  }

  // Strip the scaling operation so the hardware can re-apply it.
  if (ScaleOffset)
    Offset = Off.getOperand(0);

  return ScaleOffset;
}
2421
// Match an immediate (if Offset is not null) or an SGPR (if SOffset is
// not null) offset. If Imm32Only is true, match only 32-bit immediate
// offsets available on CI.
bool AMDGPUDAGToDAGISel::SelectSMRDOffset(SDNode *N, SDValue ByteOffsetNode,
                                          SDValue *SOffset, SDValue *Offset,
                                          bool Imm32Only, bool IsBuffer,
                                          bool HasSOffset, int64_t ImmOffset,
                                          bool *ScaleOffset) const {
  assert((!SOffset || !Offset) &&
         "Cannot match both soffset and offset at the same time!");

  if (ScaleOffset) {
    assert(N && SOffset);

    *ScaleOffset = SelectScaleOffset(N, ByteOffsetNode, false /* IsSigned */);
  }

  ConstantSDNode *C = dyn_cast<ConstantSDNode>(ByteOffsetNode);
  if (!C) {
    // A non-constant offset can only be selected into SOffset.
    if (!SOffset)
      return false;

    // A plain 32-bit scalar integer can be used directly.
    if (ByteOffsetNode.getValueType().isScalarInteger() &&
        ByteOffsetNode.getValueType().getSizeInBits() == 32) {
      *SOffset = ByteOffsetNode;
      return isSOffsetLegalWithImmOffset(SOffset, Imm32Only, IsBuffer,
                                         ImmOffset);
    }
    // Otherwise look through a zero-extend from 32 bits.
    if (ByteOffsetNode.getOpcode() == ISD::ZERO_EXTEND) {
      if (ByteOffsetNode.getOperand(0).getValueType().getSizeInBits() == 32) {
        *SOffset = ByteOffsetNode.getOperand(0);
        return isSOffsetLegalWithImmOffset(SOffset, Imm32Only, IsBuffer,
                                           ImmOffset);
      }
    }
    return false;
  }

  SDLoc SL(ByteOffsetNode);

  // GFX9 and GFX10 have signed byte immediate offsets. The immediate
  // offset for S_BUFFER instructions is unsigned.
  int64_t ByteOffset = IsBuffer ? C->getZExtValue() : C->getSExtValue();
  std::optional<int64_t> EncodedOffset = AMDGPU::getSMRDEncodedOffset(
      *Subtarget, ByteOffset, IsBuffer, HasSOffset);
  if (EncodedOffset && Offset && !Imm32Only) {
    *Offset = CurDAG->getSignedTargetConstant(*EncodedOffset, SL, MVT::i32);
    return true;
  }

  // SGPR and literal offsets are unsigned.
  if (ByteOffset < 0)
    return false;

  // Try the CI-only 32-bit literal encoding when requested.
  EncodedOffset = AMDGPU::getSMRDEncodedLiteralOffset32(*Subtarget, ByteOffset);
  if (EncodedOffset && Offset && Imm32Only) {
    *Offset = CurDAG->getTargetConstant(*EncodedOffset, SL, MVT::i32);
    return true;
  }

  if (!isUInt<32>(ByteOffset) && !isInt<32>(ByteOffset))
    return false;

  if (SOffset) {
    // Last resort: materialize the constant into an SGPR via S_MOV_B32.
    SDValue C32Bit = CurDAG->getTargetConstant(ByteOffset, SL, MVT::i32);
    *SOffset = SDValue(
        CurDAG->getMachineNode(AMDGPU::S_MOV_B32, SL, MVT::i32, C32Bit), 0);
    return true;
  }

  return false;
}
2494
2495SDValue AMDGPUDAGToDAGISel::Expand32BitAddress(SDValue Addr) const {
2496 if (Addr.getValueType() != MVT::i32)
2497 return Addr;
2498
2499 // Zero-extend a 32-bit address.
2500 SDLoc SL(Addr);
2501
2502 const MachineFunction &MF = CurDAG->getMachineFunction();
2503 const SIMachineFunctionInfo *Info = MF.getInfo<SIMachineFunctionInfo>();
2504 unsigned AddrHiVal = Info->get32BitAddressHighBits();
2505 SDValue AddrHi = CurDAG->getTargetConstant(AddrHiVal, SL, MVT::i32);
2506
2507 const SDValue Ops[] = {
2508 CurDAG->getTargetConstant(AMDGPU::SReg_64_XEXECRegClassID, SL, MVT::i32),
2509 Addr,
2510 CurDAG->getTargetConstant(AMDGPU::sub0, SL, MVT::i32),
2511 SDValue(CurDAG->getMachineNode(AMDGPU::S_MOV_B32, SL, MVT::i32, AddrHi),
2512 0),
2513 CurDAG->getTargetConstant(AMDGPU::sub1, SL, MVT::i32),
2514 };
2515
2516 return SDValue(CurDAG->getMachineNode(AMDGPU::REG_SEQUENCE, SL, MVT::i64,
2517 Ops), 0);
2518}
2519
// Match a base and an immediate (if Offset is not null) or an SGPR (if
// SOffset is not null) or an immediate+SGPR offset. If Imm32Only is
// true, match only 32-bit immediate offsets available on CI.
bool AMDGPUDAGToDAGISel::SelectSMRDBaseOffset(SDNode *N, SDValue Addr,
                                              SDValue &SBase, SDValue *SOffset,
                                              SDValue *Offset, bool Imm32Only,
                                              bool IsBuffer, bool HasSOffset,
                                              int64_t ImmOffset,
                                              bool *ScaleOffset) const {
  if (SOffset && Offset) {
    assert(!Imm32Only && !IsBuffer);
    SDValue B;

    // Combined form: first peel an immediate offset, then recurse to
    // match an SGPR offset on the remaining base.
    if (!SelectSMRDBaseOffset(N, Addr, B, nullptr, Offset, false, false, true))
      return false;

    int64_t ImmOff = 0;
    if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(*Offset))
      ImmOff = C->getSExtValue();

    return SelectSMRDBaseOffset(N, B, SBase, SOffset, nullptr, false, false,
                                true, ImmOff, ScaleOffset);
  }

  // A 32-bit (address + offset) should not cause unsigned 32-bit integer
  // wraparound, because s_load instructions perform the addition in 64 bits.
  if (Addr.getValueType() == MVT::i32 && Addr.getOpcode() == ISD::ADD &&
      !Addr->getFlags().hasNoUnsignedWrap())
    return false;

  SDValue N0, N1;
  // Extract the base and offset if possible.
  if (Addr->isAnyAdd() || CurDAG->isADDLike(Addr)) {
    N0 = Addr.getOperand(0);
    N1 = Addr.getOperand(1);
  } else if (getBaseWithOffsetUsingSplitOR(*CurDAG, Addr, N0, N1)) {
    assert(N0 && N1 && isa<ConstantSDNode>(N1));
  }
  if (!N0 || !N1)
    return false;

  // Try either operand as the offset; whichever matches leaves the other
  // as the base.
  if (SelectSMRDOffset(N, N1, SOffset, Offset, Imm32Only, IsBuffer, HasSOffset,
                       ImmOffset, ScaleOffset)) {
    SBase = N0;
    return true;
  }
  if (SelectSMRDOffset(N, N0, SOffset, Offset, Imm32Only, IsBuffer, HasSOffset,
                       ImmOffset, ScaleOffset)) {
    SBase = N1;
    return true;
  }
  return false;
}
2573
2574bool AMDGPUDAGToDAGISel::SelectSMRD(SDNode *N, SDValue Addr, SDValue &SBase,
2575 SDValue *SOffset, SDValue *Offset,
2576 bool Imm32Only, bool *ScaleOffset) const {
2577 if (SelectSMRDBaseOffset(N, Addr, SBase, SOffset, Offset, Imm32Only,
2578 /* IsBuffer */ false, /* HasSOffset */ false,
2579 /* ImmOffset */ 0, ScaleOffset)) {
2580 SBase = Expand32BitAddress(SBase);
2581 return true;
2582 }
2583
2584 if (Addr.getValueType() == MVT::i32 && Offset && !SOffset) {
2585 SBase = Expand32BitAddress(Addr);
2586 *Offset = CurDAG->getTargetConstant(0, SDLoc(Addr), MVT::i32);
2587 return true;
2588 }
2589
2590 return false;
2591}
2592
2593bool AMDGPUDAGToDAGISel::SelectSMRDImm(SDValue Addr, SDValue &SBase,
2594 SDValue &Offset) const {
2595 return SelectSMRD(/* N */ nullptr, Addr, SBase, /* SOffset */ nullptr,
2596 &Offset);
2597}
2598
2599bool AMDGPUDAGToDAGISel::SelectSMRDImm32(SDValue Addr, SDValue &SBase,
2600 SDValue &Offset) const {
2601 assert(Subtarget->getGeneration() == AMDGPUSubtarget::SEA_ISLANDS);
2602 return SelectSMRD(/* N */ nullptr, Addr, SBase, /* SOffset */ nullptr,
2603 &Offset, /* Imm32Only */ true);
2604}
2605
2606bool AMDGPUDAGToDAGISel::SelectSMRDSgpr(SDNode *N, SDValue Addr, SDValue &SBase,
2607 SDValue &SOffset, SDValue &CPol) const {
2608 bool ScaleOffset;
2609 if (!SelectSMRD(N, Addr, SBase, &SOffset, /* Offset */ nullptr,
2610 /* Imm32Only */ false, &ScaleOffset))
2611 return false;
2612
2613 CPol = CurDAG->getTargetConstant(ScaleOffset ? AMDGPU::CPol::SCAL : 0,
2614 SDLoc(N), MVT::i32);
2615 return true;
2616}
2617
2618bool AMDGPUDAGToDAGISel::SelectSMRDSgprImm(SDNode *N, SDValue Addr,
2619 SDValue &SBase, SDValue &SOffset,
2620 SDValue &Offset,
2621 SDValue &CPol) const {
2622 bool ScaleOffset;
2623 if (!SelectSMRD(N, Addr, SBase, &SOffset, &Offset, false, &ScaleOffset))
2624 return false;
2625
2626 CPol = CurDAG->getTargetConstant(ScaleOffset ? AMDGPU::CPol::SCAL : 0,
2627 SDLoc(N), MVT::i32);
2628 return true;
2629}
2630
2631bool AMDGPUDAGToDAGISel::SelectSMRDBufferImm(SDValue N, SDValue &Offset) const {
2632 return SelectSMRDOffset(/* N */ nullptr, N, /* SOffset */ nullptr, &Offset,
2633 /* Imm32Only */ false, /* IsBuffer */ true);
2634}
2635
2636bool AMDGPUDAGToDAGISel::SelectSMRDBufferImm32(SDValue N,
2637 SDValue &Offset) const {
2638 assert(Subtarget->getGeneration() == AMDGPUSubtarget::SEA_ISLANDS);
2639 return SelectSMRDOffset(/* N */ nullptr, N, /* SOffset */ nullptr, &Offset,
2640 /* Imm32Only */ true, /* IsBuffer */ true);
2641}
2642
2643bool AMDGPUDAGToDAGISel::SelectSMRDBufferSgprImm(SDValue N, SDValue &SOffset,
2644 SDValue &Offset) const {
2645 // Match the (soffset + offset) pair as a 32-bit register base and
2646 // an immediate offset.
2647 return N.getValueType() == MVT::i32 &&
2648 SelectSMRDBaseOffset(/* N */ nullptr, N, /* SBase */ SOffset,
2649 /* SOffset*/ nullptr, &Offset,
2650 /* Imm32Only */ false, /* IsBuffer */ true);
2651}
2652
2653bool AMDGPUDAGToDAGISel::SelectMOVRELOffset(SDValue Index,
2654 SDValue &Base,
2655 SDValue &Offset) const {
2656 SDLoc DL(Index);
2657
2658 if (CurDAG->isBaseWithConstantOffset(Index)) {
2659 SDValue N0 = Index.getOperand(0);
2660 SDValue N1 = Index.getOperand(1);
2661 ConstantSDNode *C1 = cast<ConstantSDNode>(N1);
2662
2663 // (add n0, c0)
2664 // Don't peel off the offset (c0) if doing so could possibly lead
2665 // the base (n0) to be negative.
2666 // (or n0, |c0|) can never change a sign given isBaseWithConstantOffset.
2667 if (C1->getSExtValue() <= 0 || CurDAG->SignBitIsZero(N0) ||
2668 (Index->getOpcode() == ISD::OR && C1->getSExtValue() >= 0)) {
2669 Base = N0;
2670 Offset = CurDAG->getTargetConstant(C1->getZExtValue(), DL, MVT::i32);
2671 return true;
2672 }
2673 }
2674
2675 if (isa<ConstantSDNode>(Index))
2676 return false;
2677
2678 Base = Index;
2679 Offset = CurDAG->getTargetConstant(0, DL, MVT::i32);
2680 return true;
2681}
2682
2683SDNode *AMDGPUDAGToDAGISel::getBFE32(bool IsSigned, const SDLoc &DL,
2684 SDValue Val, uint32_t Offset,
2685 uint32_t Width) {
2686 if (Val->isDivergent()) {
2687 unsigned Opcode = IsSigned ? AMDGPU::V_BFE_I32_e64 : AMDGPU::V_BFE_U32_e64;
2688 SDValue Off = CurDAG->getTargetConstant(Offset, DL, MVT::i32);
2689 SDValue W = CurDAG->getTargetConstant(Width, DL, MVT::i32);
2690
2691 return CurDAG->getMachineNode(Opcode, DL, MVT::i32, Val, Off, W);
2692 }
2693 unsigned Opcode = IsSigned ? AMDGPU::S_BFE_I32 : AMDGPU::S_BFE_U32;
2694 // Transformation function, pack the offset and width of a BFE into
2695 // the format expected by the S_BFE_I32 / S_BFE_U32. In the second
2696 // source, bits [5:0] contain the offset and bits [22:16] the width.
2697 uint32_t PackedVal = Offset | (Width << 16);
2698 SDValue PackedConst = CurDAG->getTargetConstant(PackedVal, DL, MVT::i32);
2699
2700 return CurDAG->getMachineNode(Opcode, DL, MVT::i32, Val, PackedConst);
2701}
2702
2703void AMDGPUDAGToDAGISel::SelectS_BFEFromShifts(SDNode *N) {
2704 // "(a << b) srl c)" ---> "BFE_U32 a, (c-b), (32-c)
2705 // "(a << b) sra c)" ---> "BFE_I32 a, (c-b), (32-c)
2706 // Predicate: 0 < b <= c < 32
2707
2708 const SDValue &Shl = N->getOperand(0);
2709 ConstantSDNode *B = dyn_cast<ConstantSDNode>(Shl->getOperand(1));
2710 ConstantSDNode *C = dyn_cast<ConstantSDNode>(N->getOperand(1));
2711
2712 if (B && C) {
2713 uint32_t BVal = B->getZExtValue();
2714 uint32_t CVal = C->getZExtValue();
2715
2716 if (0 < BVal && BVal <= CVal && CVal < 32) {
2717 bool Signed = N->getOpcode() == ISD::SRA;
2718 ReplaceNode(N, getBFE32(Signed, SDLoc(N), Shl.getOperand(0), CVal - BVal,
2719 32 - CVal));
2720 return;
2721 }
2722 }
2723 SelectCode(N);
2724}
2725
// Try to select AND/SRL/SRA (and a sign-extend-in-reg case) patterns as
// S_BFE/V_BFE bitfield extracts; otherwise fall back to normal selection.
void AMDGPUDAGToDAGISel::SelectS_BFE(SDNode *N) {
  switch (N->getOpcode()) {
  case ISD::AND:
    if (N->getOperand(0).getOpcode() == ISD::SRL) {
      // "(a srl b) & mask" ---> "BFE_U32 a, b, popcount(mask)"
      // Predicate: isMask(mask)
      const SDValue &Srl = N->getOperand(0);
      ConstantSDNode *Shift = dyn_cast<ConstantSDNode>(Srl.getOperand(1));
      ConstantSDNode *Mask = dyn_cast<ConstantSDNode>(N->getOperand(1));

      if (Shift && Mask) {
        uint32_t ShiftVal = Shift->getZExtValue();
        uint32_t MaskVal = Mask->getZExtValue();

        if (isMask_32(MaskVal)) {
          // Field width is the number of set bits in the mask.
          uint32_t WidthVal = llvm::popcount(MaskVal);
          ReplaceNode(N, getBFE32(false, SDLoc(N), Srl.getOperand(0), ShiftVal,
                                  WidthVal));
          return;
        }
      }
    }
    break;
  case ISD::SRL:
    if (N->getOperand(0).getOpcode() == ISD::AND) {
      // "(a & mask) srl b)" ---> "BFE_U32 a, b, popcount(mask >> b)"
      // Predicate: isMask(mask >> b)
      const SDValue &And = N->getOperand(0);
      ConstantSDNode *Shift = dyn_cast<ConstantSDNode>(N->getOperand(1));
      ConstantSDNode *Mask = dyn_cast<ConstantSDNode>(And->getOperand(1));

      if (Shift && Mask) {
        uint32_t ShiftVal = Shift->getZExtValue();
        uint32_t MaskVal = Mask->getZExtValue() >> ShiftVal;

        if (isMask_32(MaskVal)) {
          uint32_t WidthVal = llvm::popcount(MaskVal);
          ReplaceNode(N, getBFE32(false, SDLoc(N), And.getOperand(0), ShiftVal,
                                  WidthVal));
          return;
        }
      }
    } else if (N->getOperand(0).getOpcode() == ISD::SHL) {
      SelectS_BFEFromShifts(N);
      return;
    }
    break;
  case ISD::SRA:
    if (N->getOperand(0).getOpcode() == ISD::SHL) {
      SelectS_BFEFromShifts(N);
      return;
    }
    break;

  // NOTE(review): the case-label line here (presumably
  // ISD::SIGN_EXTEND_INREG, with an opening brace) was lost in
  // extraction — confirm against upstream.
    // sext_inreg (srl x, 16), i8 -> bfe_i32 x, 16, 8
    SDValue Src = N->getOperand(0);
    if (Src.getOpcode() != ISD::SRL)
      break;

    const ConstantSDNode *Amt = dyn_cast<ConstantSDNode>(Src.getOperand(1));
    if (!Amt)
      break;

    // Width comes from the VT being extended from.
    unsigned Width = cast<VTSDNode>(N->getOperand(1))->getVT().getSizeInBits();
    ReplaceNode(N, getBFE32(true, SDLoc(N), Src.getOperand(0),
                            Amt->getZExtValue(), Width));
    return;
  }
  }

  // No BFE pattern matched: use the table-generated selector.
  SelectCode(N);
}
2799
2800bool AMDGPUDAGToDAGISel::isCBranchSCC(const SDNode *N) const {
2801 assert(N->getOpcode() == ISD::BRCOND);
2802 if (!N->hasOneUse())
2803 return false;
2804
2805 SDValue Cond = N->getOperand(1);
2806 if (Cond.getOpcode() == ISD::CopyToReg)
2807 Cond = Cond.getOperand(2);
2808
2809 if (Cond.getOpcode() != ISD::SETCC || !Cond.hasOneUse())
2810 return false;
2811
2812 MVT VT = Cond.getOperand(0).getSimpleValueType();
2813 if (VT == MVT::i32)
2814 return true;
2815
2816 if (VT == MVT::i64) {
2817 ISD::CondCode CC = cast<CondCodeSDNode>(Cond.getOperand(2))->get();
2818 return (CC == ISD::SETEQ || CC == ISD::SETNE) &&
2819 Subtarget->hasScalarCompareEq64();
2820 }
2821
2822 if ((VT == MVT::f16 || VT == MVT::f32) && Subtarget->hasSALUFloatInsts())
2823 return true;
2824
2825 return false;
2826}
2827
2828static SDValue combineBallotPattern(SDValue VCMP, bool &Negate) {
2829 assert(VCMP->getOpcode() == AMDGPUISD::SETCC);
2830 // Special case for amdgcn.ballot:
2831 // %Cond = i1 (and/or combination of i1 ISD::SETCCs)
2832 // %VCMP = i(WaveSize) AMDGPUISD::SETCC (ext %Cond), 0, setne/seteq
2833 // =>
2834 // Use i1 %Cond value instead of i(WaveSize) %VCMP.
2835 // This is possible because divergent ISD::SETCC is selected as V_CMP and
2836 // Cond becomes a i(WaveSize) full mask value.
2837 // Note that ballot doesn't use SETEQ condition but its easy to support it
2838 // here for completeness, so in this case Negate is set true on return.
2839 auto VCMP_CC = cast<CondCodeSDNode>(VCMP.getOperand(2))->get();
2840 if ((VCMP_CC == ISD::SETEQ || VCMP_CC == ISD::SETNE) &&
2841 isNullConstant(VCMP.getOperand(1))) {
2842
2843 auto Cond = VCMP.getOperand(0);
2844 if (ISD::isExtOpcode(Cond->getOpcode())) // Skip extension.
2845 Cond = Cond.getOperand(0);
2846
2847 if (isBoolSGPR(Cond)) {
2848 Negate = VCMP_CC == ISD::SETEQ;
2849 return Cond;
2850 }
2851 }
2852 return SDValue();
2853}
2854
// Select BRCOND into one of the scalar conditional branches. Uniform branches
// whose condition is in SCC use S_CBRANCH_SCC0/1; divergent (or VCC-resident)
// conditions use S_CBRANCH_VCCZ/VCCNZ, masking the condition with EXEC when
// we cannot prove the bits for inactive lanes are already zero.
void AMDGPUDAGToDAGISel::SelectBRCOND(SDNode *N) {
  SDValue Cond = N->getOperand(1);

  if (Cond.isUndef()) {
    // Undef condition: emit the pseudo that later lowers to an unconditional
    // or removable branch.
    CurDAG->SelectNodeTo(N, AMDGPU::SI_BR_UNDEF, MVT::Other,
                         N->getOperand(2), N->getOperand(0));
    return;
  }

  const SIRegisterInfo *TRI = Subtarget->getRegisterInfo();

  // UseSCCBr: branch on SCC (uniform path). AndExec: need to AND the
  // condition with EXEC before branching on VCC. Negate: branch on the
  // inverted condition (SCC0/VCCZ instead of SCC1/VCCNZ).
  bool UseSCCBr = isCBranchSCC(N) && isUniformBr(N);
  bool AndExec = !UseSCCBr;
  bool Negate = false;

  if (Cond.getOpcode() == ISD::SETCC &&
      Cond->getOperand(0)->getOpcode() == AMDGPUISD::SETCC) {
    SDValue VCMP = Cond->getOperand(0);
    auto CC = cast<CondCodeSDNode>(Cond->getOperand(2))->get();
    if ((CC == ISD::SETEQ || CC == ISD::SETNE) &&
        isNullConstant(Cond->getOperand(1)) &&
        // We may encounter ballot.i64 in wave32 mode on -O0.
        VCMP.getValueType().getSizeInBits() == Subtarget->getWavefrontSize()) {
      // %VCMP = i(WaveSize) AMDGPUISD::SETCC ...
      // %C = i1 ISD::SETCC %VCMP, 0, setne/seteq
      // BRCOND i1 %C, %BB
      // =>
      // %VCMP = i(WaveSize) AMDGPUISD::SETCC ...
      // VCC = COPY i(WaveSize) %VCMP
      // S_CBRANCH_VCCNZ/VCCZ %BB
      Negate = CC == ISD::SETEQ;
      bool NegatedBallot = false;
      if (auto BallotCond = combineBallotPattern(VCMP, NegatedBallot)) {
        Cond = BallotCond;
        UseSCCBr = !BallotCond->isDivergent();
        // Two possible negations (outer seteq and inverted ballot) cancel.
        Negate = Negate ^ NegatedBallot;
      } else {
        // TODO: don't use SCC here assuming that AMDGPUISD::SETCC is always
        // selected as V_CMP, but this may change for uniform condition.
        Cond = VCMP;
        UseSCCBr = false;
      }
    }
    // Cond is either V_CMP resulted from AMDGPUISD::SETCC or a combination of
    // V_CMPs resulted from ballot or ballot has uniform condition and SCC is
    // used.
    AndExec = false;
  }

  unsigned BrOp =
      UseSCCBr ? (Negate ? AMDGPU::S_CBRANCH_SCC0 : AMDGPU::S_CBRANCH_SCC1)
               : (Negate ? AMDGPU::S_CBRANCH_VCCZ : AMDGPU::S_CBRANCH_VCCNZ);
  Register CondReg = UseSCCBr ? AMDGPU::SCC : TRI->getVCC();
  SDLoc SL(N);

  if (AndExec) {
    // This is the case that we are selecting to S_CBRANCH_VCCNZ. We have not
    // analyzed what generates the vcc value, so we do not know whether vcc
    // bits for disabled lanes are 0. Thus we need to mask out bits for
    // disabled lanes.
    //
    // For the case that we select S_CBRANCH_SCC1 and it gets
    // changed to S_CBRANCH_VCCNZ in SIFixSGPRCopies, SIFixSGPRCopies calls
    // SIInstrInfo::moveToVALU which inserts the S_AND).
    //
    // We could add an analysis of what generates the vcc value here and omit
    // the S_AND when is unnecessary. But it would be better to add a separate
    // pass after SIFixSGPRCopies to do the unnecessary S_AND removal, so it
    // catches both cases.
    Cond = SDValue(
        CurDAG->getMachineNode(
            Subtarget->isWave32() ? AMDGPU::S_AND_B32 : AMDGPU::S_AND_B64, SL,
            MVT::i1,
            CurDAG->getRegister(Subtarget->isWave32() ? AMDGPU::EXEC_LO
                                                      : AMDGPU::EXEC,
                                MVT::i1),
            Cond),
        0);
  }

  // Copy the condition into SCC/VCC and emit the branch consuming it.
  SDValue VCC = CurDAG->getCopyToReg(N->getOperand(0), SL, CondReg, Cond);
  CurDAG->SelectNodeTo(N, BrOp, MVT::Other,
                       N->getOperand(2), // Basic Block
                       VCC.getValue(0));
}
2940
2941void AMDGPUDAGToDAGISel::SelectFP_EXTEND(SDNode *N) {
2942 if (Subtarget->hasSALUFloatInsts() && N->getValueType(0) == MVT::f32 &&
2943 !N->isDivergent()) {
2944 SDValue Src = N->getOperand(0);
2945 if (Src.getValueType() == MVT::f16) {
2946 if (isExtractHiElt(Src, Src)) {
2947 CurDAG->SelectNodeTo(N, AMDGPU::S_CVT_HI_F32_F16, N->getVTList(),
2948 {Src});
2949 return;
2950 }
2951 }
2952 }
2953
2954 SelectCode(N);
2955}
2956
2957void AMDGPUDAGToDAGISel::SelectDSAppendConsume(SDNode *N, unsigned IntrID) {
2958 // The address is assumed to be uniform, so if it ends up in a VGPR, it will
2959 // be copied to an SGPR with readfirstlane.
2960 unsigned Opc = IntrID == Intrinsic::amdgcn_ds_append ?
2961 AMDGPU::DS_APPEND : AMDGPU::DS_CONSUME;
2962
2963 SDValue Chain = N->getOperand(0);
2964 SDValue Ptr = N->getOperand(2);
2965 MemIntrinsicSDNode *M = cast<MemIntrinsicSDNode>(N);
2966 MachineMemOperand *MMO = M->getMemOperand();
2967 bool IsGDS = M->getAddressSpace() == AMDGPUAS::REGION_ADDRESS;
2968
2970 if (CurDAG->isBaseWithConstantOffset(Ptr)) {
2971 SDValue PtrBase = Ptr.getOperand(0);
2972 SDValue PtrOffset = Ptr.getOperand(1);
2973
2974 const APInt &OffsetVal = PtrOffset->getAsAPIntVal();
2975 if (isDSOffsetLegal(PtrBase, OffsetVal.getZExtValue())) {
2976 N = glueCopyToM0(N, PtrBase);
2977 Offset = CurDAG->getTargetConstant(OffsetVal, SDLoc(), MVT::i32);
2978 }
2979 }
2980
2981 if (!Offset) {
2982 N = glueCopyToM0(N, Ptr);
2983 Offset = CurDAG->getTargetConstant(0, SDLoc(), MVT::i32);
2984 }
2985
2986 SDValue Ops[] = {
2987 Offset,
2988 CurDAG->getTargetConstant(IsGDS, SDLoc(), MVT::i32),
2989 Chain,
2990 N->getOperand(N->getNumOperands() - 1) // New glue
2991 };
2992
2993 SDNode *Selected = CurDAG->SelectNodeTo(N, Opc, N->getVTList(), Ops);
2994 CurDAG->setNodeMemRefs(cast<MachineSDNode>(Selected), {MMO});
2995}
2996
2997// We need to handle this here because tablegen doesn't support matching
2998// instructions with multiple outputs.
2999void AMDGPUDAGToDAGISel::SelectDSBvhStackIntrinsic(SDNode *N, unsigned IntrID) {
3000 unsigned Opc;
3001 switch (IntrID) {
3002 case Intrinsic::amdgcn_ds_bvh_stack_rtn:
3003 case Intrinsic::amdgcn_ds_bvh_stack_push4_pop1_rtn:
3004 Opc = AMDGPU::DS_BVH_STACK_RTN_B32;
3005 break;
3006 case Intrinsic::amdgcn_ds_bvh_stack_push8_pop1_rtn:
3007 Opc = AMDGPU::DS_BVH_STACK_PUSH8_POP1_RTN_B32;
3008 break;
3009 case Intrinsic::amdgcn_ds_bvh_stack_push8_pop2_rtn:
3010 Opc = AMDGPU::DS_BVH_STACK_PUSH8_POP2_RTN_B64;
3011 break;
3012 }
3013 SDValue Ops[] = {N->getOperand(2), N->getOperand(3), N->getOperand(4),
3014 N->getOperand(5), N->getOperand(0)};
3015
3016 MemIntrinsicSDNode *M = cast<MemIntrinsicSDNode>(N);
3017 MachineMemOperand *MMO = M->getMemOperand();
3018 SDNode *Selected = CurDAG->SelectNodeTo(N, Opc, N->getVTList(), Ops);
3019 CurDAG->setNodeMemRefs(cast<MachineSDNode>(Selected), {MMO});
3020}
3021
3022void AMDGPUDAGToDAGISel::SelectTensorLoadStore(SDNode *N, unsigned IntrID) {
3023 bool IsLoad = IntrID == Intrinsic::amdgcn_tensor_load_to_lds;
3024 unsigned Opc =
3025 IsLoad ? AMDGPU::TENSOR_LOAD_TO_LDS_d4 : AMDGPU::TENSOR_STORE_FROM_LDS_d4;
3026
3027 SmallVector<SDValue, 7> TensorOps;
3028 // First two groups
3029 TensorOps.push_back(N->getOperand(2)); // D# group 0
3030 TensorOps.push_back(N->getOperand(3)); // D# group 1
3031
3032 // Use _D2 version if both group 2 and 3 are zero-initialized.
3033 SDValue Group2 = N->getOperand(4);
3034 SDValue Group3 = N->getOperand(5);
3035 if (ISD::isBuildVectorAllZeros(Group2.getNode()) &&
3037 Opc = IsLoad ? AMDGPU::TENSOR_LOAD_TO_LDS_d2
3038 : AMDGPU::TENSOR_STORE_FROM_LDS_d2;
3039 } else { // Has at least 4 groups
3040 TensorOps.push_back(Group2); // D# group 2
3041 TensorOps.push_back(Group3); // D# group 3
3042 }
3043
3044 // TODO: Handle the fifth group: N->getOperand(6), which is silently ignored
3045 // for now because all existing targets only support up to 4 groups.
3046 TensorOps.push_back(CurDAG->getTargetConstant(0, SDLoc(N), MVT::i1)); // r128
3047 TensorOps.push_back(N->getOperand(7)); // cache policy
3048 TensorOps.push_back(N->getOperand(0)); // chain
3049
3050 (void)CurDAG->SelectNodeTo(N, Opc, MVT::Other, TensorOps);
3051}
3052
3053static unsigned gwsIntrinToOpcode(unsigned IntrID) {
3054 switch (IntrID) {
3055 case Intrinsic::amdgcn_ds_gws_init:
3056 return AMDGPU::DS_GWS_INIT;
3057 case Intrinsic::amdgcn_ds_gws_barrier:
3058 return AMDGPU::DS_GWS_BARRIER;
3059 case Intrinsic::amdgcn_ds_gws_sema_v:
3060 return AMDGPU::DS_GWS_SEMA_V;
3061 case Intrinsic::amdgcn_ds_gws_sema_br:
3062 return AMDGPU::DS_GWS_SEMA_BR;
3063 case Intrinsic::amdgcn_ds_gws_sema_p:
3064 return AMDGPU::DS_GWS_SEMA_P;
3065 case Intrinsic::amdgcn_ds_gws_sema_release_all:
3066 return AMDGPU::DS_GWS_SEMA_RELEASE_ALL;
3067 default:
3068 llvm_unreachable("not a gws intrinsic");
3069 }
3070}
3071
3072void AMDGPUDAGToDAGISel::SelectDS_GWS(SDNode *N, unsigned IntrID) {
3073 if (!Subtarget->hasGWS() ||
3074 (IntrID == Intrinsic::amdgcn_ds_gws_sema_release_all &&
3075 !Subtarget->hasGWSSemaReleaseAll())) {
3076 // Let this error.
3077 SelectCode(N);
3078 return;
3079 }
3080
3081 // Chain, intrinsic ID, vsrc, offset
3082 const bool HasVSrc = N->getNumOperands() == 4;
3083 assert(HasVSrc || N->getNumOperands() == 3);
3084
3085 SDLoc SL(N);
3086 SDValue BaseOffset = N->getOperand(HasVSrc ? 3 : 2);
3087 int ImmOffset = 0;
3088 MemIntrinsicSDNode *M = cast<MemIntrinsicSDNode>(N);
3089 MachineMemOperand *MMO = M->getMemOperand();
3090
3091 // Don't worry if the offset ends up in a VGPR. Only one lane will have
3092 // effect, so SIFixSGPRCopies will validly insert readfirstlane.
3093
3094 // The resource id offset is computed as (<isa opaque base> + M0[21:16] +
3095 // offset field) % 64. Some versions of the programming guide omit the m0
3096 // part, or claim it's from offset 0.
3097 if (ConstantSDNode *ConstOffset = dyn_cast<ConstantSDNode>(BaseOffset)) {
3098 // If we have a constant offset, try to use the 0 in m0 as the base.
3099 // TODO: Look into changing the default m0 initialization value. If the
3100 // default -1 only set the low 16-bits, we could leave it as-is and add 1 to
3101 // the immediate offset.
3102 glueCopyToM0(N, CurDAG->getTargetConstant(0, SL, MVT::i32));
3103 ImmOffset = ConstOffset->getZExtValue();
3104 } else {
3105 if (CurDAG->isBaseWithConstantOffset(BaseOffset)) {
3106 ImmOffset = BaseOffset.getConstantOperandVal(1);
3107 BaseOffset = BaseOffset.getOperand(0);
3108 }
3109
3110 // Prefer to do the shift in an SGPR since it should be possible to use m0
3111 // as the result directly. If it's already an SGPR, it will be eliminated
3112 // later.
3113 SDNode *SGPROffset
3114 = CurDAG->getMachineNode(AMDGPU::V_READFIRSTLANE_B32, SL, MVT::i32,
3115 BaseOffset);
3116 // Shift to offset in m0
3117 SDNode *M0Base
3118 = CurDAG->getMachineNode(AMDGPU::S_LSHL_B32, SL, MVT::i32,
3119 SDValue(SGPROffset, 0),
3120 CurDAG->getTargetConstant(16, SL, MVT::i32));
3121 glueCopyToM0(N, SDValue(M0Base, 0));
3122 }
3123
3124 SDValue Chain = N->getOperand(0);
3125 SDValue OffsetField = CurDAG->getTargetConstant(ImmOffset, SL, MVT::i32);
3126
3127 const unsigned Opc = gwsIntrinToOpcode(IntrID);
3128
3129 const MCInstrDesc &InstrDesc = TII->get(Opc);
3130 int Data0Idx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::data0);
3131
3132 const TargetRegisterClass *DataRC = TII->getRegClass(InstrDesc, Data0Idx);
3133
3135 if (HasVSrc) {
3136 const SIRegisterInfo *TRI = Subtarget->getRegisterInfo();
3137
3138 SDValue Data = N->getOperand(2);
3139 MVT DataVT = Data.getValueType().getSimpleVT();
3140 if (TRI->isTypeLegalForClass(*DataRC, DataVT)) {
3141 // Normal 32-bit case.
3142 Ops.push_back(N->getOperand(2));
3143 } else {
3144 // Operand is really 32-bits, but requires 64-bit alignment, so use the
3145 // even aligned 64-bit register class.
3146 const SDValue RegSeqOps[] = {
3147 CurDAG->getTargetConstant(DataRC->getID(), SL, MVT::i32), Data,
3148 CurDAG->getTargetConstant(AMDGPU::sub0, SL, MVT::i32),
3149 SDValue(
3150 CurDAG->getMachineNode(TargetOpcode::IMPLICIT_DEF, SL, MVT::i32),
3151 0),
3152 CurDAG->getTargetConstant(AMDGPU::sub1, SL, MVT::i32)};
3153
3154 Ops.push_back(SDValue(CurDAG->getMachineNode(TargetOpcode::REG_SEQUENCE,
3155 SL, MVT::v2i32, RegSeqOps),
3156 0));
3157 }
3158 }
3159
3160 Ops.push_back(OffsetField);
3161 Ops.push_back(Chain);
3162
3163 SDNode *Selected = CurDAG->SelectNodeTo(N, Opc, N->getVTList(), Ops);
3164 CurDAG->setNodeMemRefs(cast<MachineSDNode>(Selected), {MMO});
3165}
3166
// Custom selection for amdgcn.interp.p1.f16 on 16-bank-LDS subtargets, where
// the operation needs two instructions (V_INTERP_MOV_F32 feeding
// V_INTERP_P1LV_F16) chained through M0 by explicit glue.
void AMDGPUDAGToDAGISel::SelectInterpP1F16(SDNode *N) {
  if (Subtarget->getLDSBankCount() != 16) {
    // This is a single instruction with a pattern.
    SelectCode(N);
    return;
  }

  SDLoc DL(N);

  // This requires 2 instructions. It is possible to write a pattern to support
  // this, but the generated isel emitter doesn't correctly deal with multiple
  // output instructions using the same physical register input. The copy to m0
  // is incorrectly placed before the second instruction.
  //
  // TODO: Match source modifiers.
  //
  // def : Pat <
  //   (int_amdgcn_interp_p1_f16
  //                             (VOP3Mods f32:$src0, i32:$src0_modifiers),
  //                                (i32 timm:$attrchan), (i32 timm:$attr),
  //                                (i1 timm:$high), M0),
  //   (V_INTERP_P1LV_F16 $src0_modifiers, VGPR_32:$src0, timm:$attr,
  //       timm:$attrchan, 0,
  //       (V_INTERP_MOV_F32 2, timm:$attr, timm:$attrchan), timm:$high)> {
  //   let Predicates = [has16BankLDS];
  // }

  // 16 bank LDS
  // Copy the M0 operand (operand 5) into the physical M0 register; the glue
  // result keeps the copy ordered immediately before its consumers.
  SDValue ToM0 = CurDAG->getCopyToReg(CurDAG->getEntryNode(), DL, AMDGPU::M0,
                                      N->getOperand(5), SDValue());

  SDVTList VTs = CurDAG->getVTList(MVT::f32, MVT::Other);

  SDNode *InterpMov =
    CurDAG->getMachineNode(AMDGPU::V_INTERP_MOV_F32, DL, VTs, {
        CurDAG->getTargetConstant(2, DL, MVT::i32), // P0
        N->getOperand(3),  // Attr
        N->getOperand(2),  // Attrchan
        ToM0.getValue(1) // In glue
  });

  SDNode *InterpP1LV =
    CurDAG->getMachineNode(AMDGPU::V_INTERP_P1LV_F16, DL, MVT::f32, {
        CurDAG->getTargetConstant(0, DL, MVT::i32), // $src0_modifiers
        N->getOperand(1), // Src0
        N->getOperand(3), // Attr
        N->getOperand(2), // Attrchan
        CurDAG->getTargetConstant(0, DL, MVT::i32), // $src2_modifiers
        SDValue(InterpMov, 0), // Src2 - holds two f16 values selected by high
        N->getOperand(4), // high
        CurDAG->getTargetConstant(0, DL, MVT::i1), // $clamp
        CurDAG->getTargetConstant(0, DL, MVT::i32), // $omod
        SDValue(InterpMov, 1)
  });

  // Replace only value 0; the original node is left to be cleaned up by the
  // DAG once unreferenced.
  CurDAG->ReplaceAllUsesOfValueWith(SDValue(N, 0), SDValue(InterpP1LV, 0));
}
3224
3225void AMDGPUDAGToDAGISel::SelectINTRINSIC_W_CHAIN(SDNode *N) {
3226 unsigned IntrID = N->getConstantOperandVal(1);
3227 switch (IntrID) {
3228 case Intrinsic::amdgcn_ds_append:
3229 case Intrinsic::amdgcn_ds_consume: {
3230 if (N->getValueType(0) != MVT::i32)
3231 break;
3232 SelectDSAppendConsume(N, IntrID);
3233 return;
3234 }
3235 case Intrinsic::amdgcn_ds_bvh_stack_rtn:
3236 case Intrinsic::amdgcn_ds_bvh_stack_push4_pop1_rtn:
3237 case Intrinsic::amdgcn_ds_bvh_stack_push8_pop1_rtn:
3238 case Intrinsic::amdgcn_ds_bvh_stack_push8_pop2_rtn:
3239 SelectDSBvhStackIntrinsic(N, IntrID);
3240 return;
3241 case Intrinsic::amdgcn_init_whole_wave:
3242 CurDAG->getMachineFunction()
3243 .getInfo<SIMachineFunctionInfo>()
3244 ->setInitWholeWave();
3245 break;
3246 }
3247
3248 SelectCode(N);
3249}
3250
// Custom selection for chainless intrinsics. Handles the WQM/WWM family,
// f16 interpolation, and permlane swaps, taking care to preserve any
// convergence-control glue attached to the node.
void AMDGPUDAGToDAGISel::SelectINTRINSIC_WO_CHAIN(SDNode *N) {
  unsigned IntrID = N->getConstantOperandVal(0);
  unsigned Opcode = AMDGPU::INSTRUCTION_LIST_END;
  SDNode *ConvGlueNode = N->getGluedNode();
  if (ConvGlueNode) {
    // FIXME: Possibly iterate over multiple glue nodes?
    assert(ConvGlueNode->getOpcode() == ISD::CONVERGENCECTRL_GLUE);
    // Rebuild the glue as a machine-level CONVERGENCECTRL_GLUE so it survives
    // instruction selection; it is re-attached to the selected node below.
    ConvGlueNode = ConvGlueNode->getOperand(0).getNode();
    ConvGlueNode =
        CurDAG->getMachineNode(TargetOpcode::CONVERGENCECTRL_GLUE, {},
                               MVT::Glue, SDValue(ConvGlueNode, 0));
  } else {
    ConvGlueNode = nullptr;
  }
  switch (IntrID) {
  case Intrinsic::amdgcn_wqm:
    Opcode = AMDGPU::WQM;
    break;
  case Intrinsic::amdgcn_softwqm:
    Opcode = AMDGPU::SOFT_WQM;
    break;
  case Intrinsic::amdgcn_wwm:
  case Intrinsic::amdgcn_strict_wwm:
    Opcode = AMDGPU::STRICT_WWM;
    break;
  case Intrinsic::amdgcn_strict_wqm:
    Opcode = AMDGPU::STRICT_WQM;
    break;
  case Intrinsic::amdgcn_interp_p1_f16:
    SelectInterpP1F16(N);
    return;
  case Intrinsic::amdgcn_permlane16_swap:
  case Intrinsic::amdgcn_permlane32_swap: {
    if ((IntrID == Intrinsic::amdgcn_permlane16_swap &&
         !Subtarget->hasPermlane16Swap()) ||
        (IntrID == Intrinsic::amdgcn_permlane32_swap &&
         !Subtarget->hasPermlane32Swap())) {
      SelectCode(N); // Hit the default error
      return;
    }

    Opcode = IntrID == Intrinsic::amdgcn_permlane16_swap
                 ? AMDGPU::V_PERMLANE16_SWAP_B32_e64
                 : AMDGPU::V_PERMLANE32_SWAP_B32_e64;

    // Drop the intrinsic-ID operand (operand 0) and append the rebuilt glue.
    SmallVector<SDValue, 4> NewOps(N->op_begin() + 1, N->op_end());
    if (ConvGlueNode)
      NewOps.push_back(SDValue(ConvGlueNode, 0));

    // Translate the boolean fi operand into the DPP fi immediate encoding.
    bool FI = N->getConstantOperandVal(3);
    NewOps[2] = CurDAG->getTargetConstant(
        FI ? AMDGPU::DPP::DPP_FI_1 : AMDGPU::DPP::DPP_FI_0, SDLoc(), MVT::i32);

    CurDAG->SelectNodeTo(N, Opcode, N->getVTList(), NewOps);
    return;
  }
  default:
    SelectCode(N);
    break;
  }

  if (Opcode != AMDGPU::INSTRUCTION_LIST_END) {
    // Single-source pseudo (WQM/WWM family): select in place.
    SDValue Src = N->getOperand(1);
    CurDAG->SelectNodeTo(N, Opcode, N->getVTList(), {Src});
  }

  if (ConvGlueNode) {
    // Re-attach the convergence glue to whatever node N was morphed into.
    SmallVector<SDValue, 4> NewOps(N->ops());
    NewOps.push_back(SDValue(ConvGlueNode, 0));
    CurDAG->MorphNodeTo(N, N->getOpcode(), N->getVTList(), NewOps);
  }
}
3323
3324void AMDGPUDAGToDAGISel::SelectINTRINSIC_VOID(SDNode *N) {
3325 unsigned IntrID = N->getConstantOperandVal(1);
3326 switch (IntrID) {
3327 case Intrinsic::amdgcn_ds_gws_init:
3328 case Intrinsic::amdgcn_ds_gws_barrier:
3329 case Intrinsic::amdgcn_ds_gws_sema_v:
3330 case Intrinsic::amdgcn_ds_gws_sema_br:
3331 case Intrinsic::amdgcn_ds_gws_sema_p:
3332 case Intrinsic::amdgcn_ds_gws_sema_release_all:
3333 SelectDS_GWS(N, IntrID);
3334 return;
3335 case Intrinsic::amdgcn_tensor_load_to_lds:
3336 case Intrinsic::amdgcn_tensor_store_from_lds:
3337 SelectTensorLoadStore(N, IntrID);
3338 return;
3339 default:
3340 break;
3341 }
3342
3343 SelectCode(N);
3344}
3345
3346void AMDGPUDAGToDAGISel::SelectWAVE_ADDRESS(SDNode *N) {
3347 SDValue Log2WaveSize =
3348 CurDAG->getTargetConstant(Subtarget->getWavefrontSizeLog2(), SDLoc(N), MVT::i32);
3349 CurDAG->SelectNodeTo(N, AMDGPU::S_LSHR_B32, N->getVTList(),
3350 {N->getOperand(0), Log2WaveSize});
3351}
3352
3353void AMDGPUDAGToDAGISel::SelectSTACKRESTORE(SDNode *N) {
3354 SDValue SrcVal = N->getOperand(1);
3355 if (SrcVal.getValueType() != MVT::i32) {
3356 SelectCode(N); // Emit default error
3357 return;
3358 }
3359
3360 SDValue CopyVal;
3361 Register SP = TLI->getStackPointerRegisterToSaveRestore();
3362 SDLoc SL(N);
3363
3364 if (SrcVal.getOpcode() == AMDGPUISD::WAVE_ADDRESS) {
3365 CopyVal = SrcVal.getOperand(0);
3366 } else {
3367 SDValue Log2WaveSize = CurDAG->getTargetConstant(
3368 Subtarget->getWavefrontSizeLog2(), SL, MVT::i32);
3369
3370 if (N->isDivergent()) {
3371 SrcVal = SDValue(CurDAG->getMachineNode(AMDGPU::V_READFIRSTLANE_B32, SL,
3372 MVT::i32, SrcVal),
3373 0);
3374 }
3375
3376 CopyVal = SDValue(CurDAG->getMachineNode(AMDGPU::S_LSHL_B32, SL, MVT::i32,
3377 {SrcVal, Log2WaveSize}),
3378 0);
3379 }
3380
3381 SDValue CopyToSP = CurDAG->getCopyToReg(N->getOperand(0), SL, SP, CopyVal);
3382 CurDAG->ReplaceAllUsesOfValueWith(SDValue(N, 0), CopyToSP);
3383}
3384
3385bool AMDGPUDAGToDAGISel::SelectVOP3ModsImpl(SDValue In, SDValue &Src,
3386 unsigned &Mods,
3387 bool IsCanonicalizing,
3388 bool AllowAbs) const {
3389 Mods = SISrcMods::NONE;
3390 Src = In;
3391
3392 if (Src.getOpcode() == ISD::FNEG) {
3393 Mods |= SISrcMods::NEG;
3394 Src = Src.getOperand(0);
3395 } else if (Src.getOpcode() == ISD::FSUB && IsCanonicalizing) {
3396 // Fold fsub [+-]0 into fneg. This may not have folded depending on the
3397 // denormal mode, but we're implicitly canonicalizing in a source operand.
3398 auto *LHS = dyn_cast<ConstantFPSDNode>(Src.getOperand(0));
3399 if (LHS && LHS->isZero()) {
3400 Mods |= SISrcMods::NEG;
3401 Src = Src.getOperand(1);
3402 }
3403 }
3404
3405 if (AllowAbs && Src.getOpcode() == ISD::FABS) {
3406 Mods |= SISrcMods::ABS;
3407 Src = Src.getOperand(0);
3408 }
3409
3410 if (Mods != SISrcMods::NONE)
3411 return true;
3412
3413 // Convert various sign-bit masks on integers to src mods. Currently disabled
3414 // for 16-bit types as the codegen replaces the operand without adding a
3415 // srcmod. This is intentionally finding the cases where we are performing
3416 // float neg and abs on int types, the goal is not to obtain two's complement
3417 // neg or abs. Limit converison to select operands via the nonCanonalizing
3418 // pattern.
3419 // TODO: Add 16-bit support.
3420 if (IsCanonicalizing)
3421 return true;
3422
3423 // v2i32 xor/or/and are legal. A vselect using these instructions as operands
3424 // is scalarised into two selects with EXTRACT_VECTOR_ELT operands. Peek
3425 // through the extract to the bitwise op.
3426 SDValue PeekSrc =
3427 Src->getOpcode() == ISD::EXTRACT_VECTOR_ELT ? Src->getOperand(0) : Src;
3428 // Convert various sign-bit masks to src mods. Currently disabled for 16-bit
3429 // types as the codegen replaces the operand without adding a srcmod.
3430 // This is intentionally finding the cases where we are performing float neg
3431 // and abs on int types, the goal is not to obtain two's complement neg or
3432 // abs.
3433 // TODO: Add 16-bit support.
3434 unsigned Opc = PeekSrc.getOpcode();
3435 EVT VT = Src.getValueType();
3436 if ((Opc != ISD::AND && Opc != ISD::OR && Opc != ISD::XOR) ||
3437 (VT != MVT::i32 && VT != MVT::v2i32 && VT != MVT::i64))
3438 return true;
3439
3440 ConstantSDNode *CRHS = isConstOrConstSplat(PeekSrc->getOperand(1));
3441 if (!CRHS)
3442 return true;
3443
3444 auto ReplaceSrc = [&]() -> SDValue {
3445 if (Src->getOpcode() != ISD::EXTRACT_VECTOR_ELT)
3446 return Src.getOperand(0);
3447
3448 SDValue LHS = PeekSrc->getOperand(0);
3449 SDValue Index = Src->getOperand(1);
3450 return CurDAG->getNode(ISD::EXTRACT_VECTOR_ELT, SDLoc(Src),
3451 Src.getValueType(), LHS, Index);
3452 };
3453
3454 // Recognise Srcmods:
3455 // (xor a, 0x80000000) or v2i32 (xor a, {0x80000000,0x80000000}) as NEG.
3456 // (and a, 0x7fffffff) or v2i32 (and a, {0x7fffffff,0x7fffffff}) as ABS.
3457 // (or a, 0x80000000) or v2i32 (or a, {0x80000000,0x80000000}) as NEG+ABS
3458 // SrcModifiers.
3459 if (Opc == ISD::XOR && CRHS->getAPIntValue().isSignMask()) {
3460 Mods |= SISrcMods::NEG;
3461 Src = ReplaceSrc();
3462 } else if (Opc == ISD::AND && AllowAbs &&
3463 CRHS->getAPIntValue().isMaxSignedValue()) {
3464 Mods |= SISrcMods::ABS;
3465 Src = ReplaceSrc();
3466 } else if (Opc == ISD::OR && AllowAbs && CRHS->getAPIntValue().isSignMask()) {
3468 Src = ReplaceSrc();
3469 }
3470
3471 return true;
3472}
3473
3474bool AMDGPUDAGToDAGISel::SelectVOP3Mods(SDValue In, SDValue &Src,
3475 SDValue &SrcMods) const {
3476 unsigned Mods;
3477 if (SelectVOP3ModsImpl(In, Src, Mods, /*IsCanonicalizing=*/true,
3478 /*AllowAbs=*/true)) {
3479 SrcMods = CurDAG->getTargetConstant(Mods, SDLoc(In), MVT::i32);
3480 return true;
3481 }
3482
3483 return false;
3484}
3485
3486bool AMDGPUDAGToDAGISel::SelectVOP3ModsNonCanonicalizing(
3487 SDValue In, SDValue &Src, SDValue &SrcMods) const {
3488 unsigned Mods;
3489 if (SelectVOP3ModsImpl(In, Src, Mods, /*IsCanonicalizing=*/false,
3490 /*AllowAbs=*/true)) {
3491 SrcMods = CurDAG->getTargetConstant(Mods, SDLoc(In), MVT::i32);
3492 return true;
3493 }
3494
3495 return false;
3496}
3497
3498bool AMDGPUDAGToDAGISel::SelectVOP3BMods(SDValue In, SDValue &Src,
3499 SDValue &SrcMods) const {
3500 unsigned Mods;
3501 if (SelectVOP3ModsImpl(In, Src, Mods,
3502 /*IsCanonicalizing=*/true,
3503 /*AllowAbs=*/false)) {
3504 SrcMods = CurDAG->getTargetConstant(Mods, SDLoc(In), MVT::i32);
3505 return true;
3506 }
3507
3508 return false;
3509}
3510
3511bool AMDGPUDAGToDAGISel::SelectVOP3NoMods(SDValue In, SDValue &Src) const {
3512 if (In.getOpcode() == ISD::FABS || In.getOpcode() == ISD::FNEG)
3513 return false;
3514
3515 Src = In;
3516 return true;
3517}
3518
3519bool AMDGPUDAGToDAGISel::SelectVINTERPModsImpl(SDValue In, SDValue &Src,
3520 SDValue &SrcMods,
3521 bool OpSel) const {
3522 unsigned Mods;
3523 if (SelectVOP3ModsImpl(In, Src, Mods,
3524 /*IsCanonicalizing=*/true,
3525 /*AllowAbs=*/false)) {
3526 if (OpSel)
3527 Mods |= SISrcMods::OP_SEL_0;
3528 SrcMods = CurDAG->getTargetConstant(Mods, SDLoc(In), MVT::i32);
3529 return true;
3530 }
3531
3532 return false;
3533}
3534
3535bool AMDGPUDAGToDAGISel::SelectVINTERPMods(SDValue In, SDValue &Src,
3536 SDValue &SrcMods) const {
3537 return SelectVINTERPModsImpl(In, Src, SrcMods, /* OpSel */ false);
3538}
3539
3540bool AMDGPUDAGToDAGISel::SelectVINTERPModsHi(SDValue In, SDValue &Src,
3541 SDValue &SrcMods) const {
3542 return SelectVINTERPModsImpl(In, Src, SrcMods, /* OpSel */ true);
3543}
3544
3545bool AMDGPUDAGToDAGISel::SelectVOP3Mods0(SDValue In, SDValue &Src,
3546 SDValue &SrcMods, SDValue &Clamp,
3547 SDValue &Omod) const {
3548 SDLoc DL(In);
3549 Clamp = CurDAG->getTargetConstant(0, DL, MVT::i1);
3550 Omod = CurDAG->getTargetConstant(0, DL, MVT::i1);
3551
3552 return SelectVOP3Mods(In, Src, SrcMods);
3553}
3554
3555bool AMDGPUDAGToDAGISel::SelectVOP3BMods0(SDValue In, SDValue &Src,
3556 SDValue &SrcMods, SDValue &Clamp,
3557 SDValue &Omod) const {
3558 SDLoc DL(In);
3559 Clamp = CurDAG->getTargetConstant(0, DL, MVT::i1);
3560 Omod = CurDAG->getTargetConstant(0, DL, MVT::i1);
3561
3562 return SelectVOP3BMods(In, Src, SrcMods);
3563}
3564
3565bool AMDGPUDAGToDAGISel::SelectVOP3OMods(SDValue In, SDValue &Src,
3566 SDValue &Clamp, SDValue &Omod) const {
3567 Src = In;
3568
3569 SDLoc DL(In);
3570 Clamp = CurDAG->getTargetConstant(0, DL, MVT::i1);
3571 Omod = CurDAG->getTargetConstant(0, DL, MVT::i1);
3572
3573 return true;
3574}
3575
3576bool AMDGPUDAGToDAGISel::SelectVOP3PMods(SDValue In, SDValue &Src,
3577 SDValue &SrcMods, bool IsDOT) const {
3578 unsigned Mods = SISrcMods::NONE;
3579 Src = In;
3580
3581 // TODO: Handle G_FSUB 0 as fneg
3582 if (Src.getOpcode() == ISD::FNEG) {
3584 Src = Src.getOperand(0);
3585 }
3586
3587 if (Src.getOpcode() == ISD::BUILD_VECTOR && Src.getNumOperands() == 2 &&
3588 (!IsDOT || !Subtarget->hasDOTOpSelHazard())) {
3589 unsigned VecMods = Mods;
3590
3591 SDValue Lo = stripBitcast(Src.getOperand(0));
3592 SDValue Hi = stripBitcast(Src.getOperand(1));
3593
3594 if (Lo.getOpcode() == ISD::FNEG) {
3595 Lo = stripBitcast(Lo.getOperand(0));
3596 Mods ^= SISrcMods::NEG;
3597 }
3598
3599 if (Hi.getOpcode() == ISD::FNEG) {
3600 Hi = stripBitcast(Hi.getOperand(0));
3601 Mods ^= SISrcMods::NEG_HI;
3602 }
3603
3604 if (isExtractHiElt(Lo, Lo))
3605 Mods |= SISrcMods::OP_SEL_0;
3606
3607 if (isExtractHiElt(Hi, Hi))
3608 Mods |= SISrcMods::OP_SEL_1;
3609
3610 unsigned VecSize = Src.getValueSizeInBits();
3611 Lo = stripExtractLoElt(Lo);
3612 Hi = stripExtractLoElt(Hi);
3613
3614 if (Lo.getValueSizeInBits() > VecSize) {
3615 Lo = CurDAG->getTargetExtractSubreg(
3616 (VecSize > 32) ? AMDGPU::sub0_sub1 : AMDGPU::sub0, SDLoc(In),
3617 MVT::getIntegerVT(VecSize), Lo);
3618 }
3619
3620 if (Hi.getValueSizeInBits() > VecSize) {
3621 Hi = CurDAG->getTargetExtractSubreg(
3622 (VecSize > 32) ? AMDGPU::sub0_sub1 : AMDGPU::sub0, SDLoc(In),
3623 MVT::getIntegerVT(VecSize), Hi);
3624 }
3625
3626 assert(Lo.getValueSizeInBits() <= VecSize &&
3627 Hi.getValueSizeInBits() <= VecSize);
3628
3629 if (Lo == Hi && !isInlineImmediate(Lo.getNode())) {
3630 // Really a scalar input. Just select from the low half of the register to
3631 // avoid packing.
3632
3633 if (VecSize == Lo.getValueSizeInBits()) {
3634 Src = Lo;
3635 } else if (VecSize == 32) {
3636 Src = createVOP3PSrc32FromLo16(Lo, Src, CurDAG, Subtarget);
3637 } else {
3638 assert(Lo.getValueSizeInBits() == 32 && VecSize == 64);
3639
3640 SDLoc SL(In);
3642 CurDAG->getMachineNode(TargetOpcode::IMPLICIT_DEF, SL,
3643 Lo.getValueType()), 0);
3644 auto RC = Lo->isDivergent() ? AMDGPU::VReg_64RegClassID
3645 : AMDGPU::SReg_64RegClassID;
3646 const SDValue Ops[] = {
3647 CurDAG->getTargetConstant(RC, SL, MVT::i32),
3648 Lo, CurDAG->getTargetConstant(AMDGPU::sub0, SL, MVT::i32),
3649 Undef, CurDAG->getTargetConstant(AMDGPU::sub1, SL, MVT::i32) };
3650
3651 Src = SDValue(CurDAG->getMachineNode(TargetOpcode::REG_SEQUENCE, SL,
3652 Src.getValueType(), Ops), 0);
3653 }
3654 SrcMods = CurDAG->getTargetConstant(Mods, SDLoc(In), MVT::i32);
3655 return true;
3656 }
3657
3658 if (VecSize == 64 && Lo == Hi && isa<ConstantFPSDNode>(Lo)) {
3659 uint64_t Lit = cast<ConstantFPSDNode>(Lo)->getValueAPF()
3660 .bitcastToAPInt().getZExtValue();
3661 if (AMDGPU::isInlinableLiteral32(Lit, Subtarget->hasInv2PiInlineImm())) {
3662 Src = CurDAG->getTargetConstant(Lit, SDLoc(In), MVT::i64);
3663 SrcMods = CurDAG->getTargetConstant(Mods, SDLoc(In), MVT::i32);
3664 return true;
3665 }
3666 }
3667
3668 Mods = VecMods;
3669 } else if (Src.getOpcode() == ISD::VECTOR_SHUFFLE &&
3670 Src.getNumOperands() == 2) {
3671
3672 // TODO: We should repeat the build_vector source check above for the
3673 // vector_shuffle for negates and casts of individual elements.
3674
3675 auto *SVN = cast<ShuffleVectorSDNode>(Src);
3676 ArrayRef<int> Mask = SVN->getMask();
3677
3678 if (Mask[0] < 2 && Mask[1] < 2) {
3679 // src1 should be undef.
3680 SDValue ShuffleSrc = SVN->getOperand(0);
3681
3682 if (ShuffleSrc.getOpcode() == ISD::FNEG) {
3683 ShuffleSrc = ShuffleSrc.getOperand(0);
3685 }
3686
3687 if (Mask[0] == 1)
3688 Mods |= SISrcMods::OP_SEL_0;
3689 if (Mask[1] == 1)
3690 Mods |= SISrcMods::OP_SEL_1;
3691
3692 Src = ShuffleSrc;
3693 SrcMods = CurDAG->getTargetConstant(Mods, SDLoc(In), MVT::i32);
3694 return true;
3695 }
3696 }
3697
3698 // Packed instructions do not have abs modifiers.
3699 Mods |= SISrcMods::OP_SEL_1;
3700
3701 SrcMods = CurDAG->getTargetConstant(Mods, SDLoc(In), MVT::i32);
3702 return true;
3703}
3704
3705bool AMDGPUDAGToDAGISel::SelectVOP3PModsDOT(SDValue In, SDValue &Src,
3706 SDValue &SrcMods) const {
3707 return SelectVOP3PMods(In, Src, SrcMods, true);
3708}
3709
3710bool AMDGPUDAGToDAGISel::SelectVOP3PNoModsDOT(SDValue In, SDValue &Src) const {
3711 SDValue SrcTmp, SrcModsTmp;
3712 SelectVOP3PMods(In, SrcTmp, SrcModsTmp, true);
3713 if (cast<ConstantSDNode>(SrcModsTmp)->getZExtValue() == SISrcMods::OP_SEL_1) {
3714 Src = SrcTmp;
3715 return true;
3716 }
3717
3718 return false;
3719}
3720
3721bool AMDGPUDAGToDAGISel::SelectVOP3PModsF32(SDValue In, SDValue &Src,
3722 SDValue &SrcMods) const {
3723 SelectVOP3Mods(In, Src, SrcMods);
3724 unsigned Mods = SISrcMods::OP_SEL_1;
3725 Mods |= cast<ConstantSDNode>(SrcMods)->getZExtValue();
3726 SrcMods = CurDAG->getTargetConstant(Mods, SDLoc(In), MVT::i32);
3727 return true;
3728}
3729
3730bool AMDGPUDAGToDAGISel::SelectVOP3PNoModsF32(SDValue In, SDValue &Src) const {
3731 SDValue SrcTmp, SrcModsTmp;
3732 SelectVOP3PModsF32(In, SrcTmp, SrcModsTmp);
3733 if (cast<ConstantSDNode>(SrcModsTmp)->getZExtValue() == SISrcMods::OP_SEL_1) {
3734 Src = SrcTmp;
3735 return true;
3736 }
3737
3738 return false;
3739}
3740
3741bool AMDGPUDAGToDAGISel::SelectWMMAOpSelVOP3PMods(SDValue In,
3742 SDValue &Src) const {
3743 const ConstantSDNode *C = cast<ConstantSDNode>(In);
3744 assert(C->getAPIntValue().getBitWidth() == 1 && "expected i1 value");
3745
3746 unsigned Mods = SISrcMods::OP_SEL_1;
3747 unsigned SrcVal = C->getZExtValue();
3748 if (SrcVal == 1)
3749 Mods |= SISrcMods::OP_SEL_0;
3750
3751 Src = CurDAG->getTargetConstant(Mods, SDLoc(In), MVT::i32);
3752 return true;
3753}
3754
3756AMDGPUDAGToDAGISel::buildRegSequence32(SmallVectorImpl<SDValue> &Elts,
3757 const SDLoc &DL) const {
3758 unsigned DstRegClass;
3759 EVT DstTy;
3760 switch (Elts.size()) {
3761 case 8:
3762 DstRegClass = AMDGPU::VReg_256RegClassID;
3763 DstTy = MVT::v8i32;
3764 break;
3765 case 4:
3766 DstRegClass = AMDGPU::VReg_128RegClassID;
3767 DstTy = MVT::v4i32;
3768 break;
3769 case 2:
3770 DstRegClass = AMDGPU::VReg_64RegClassID;
3771 DstTy = MVT::v2i32;
3772 break;
3773 default:
3774 llvm_unreachable("unhandled Reg sequence size");
3775 }
3776
3778 Ops.push_back(CurDAG->getTargetConstant(DstRegClass, DL, MVT::i32));
3779 for (unsigned i = 0; i < Elts.size(); ++i) {
3780 Ops.push_back(Elts[i]);
3781 Ops.push_back(CurDAG->getTargetConstant(
3783 }
3784 return CurDAG->getMachineNode(TargetOpcode::REG_SEQUENCE, DL, DstTy, Ops);
3785}
3786
3788AMDGPUDAGToDAGISel::buildRegSequence16(SmallVectorImpl<SDValue> &Elts,
3789 const SDLoc &DL) const {
3790 SmallVector<SDValue, 8> PackedElts;
3791 assert("unhandled Reg sequence size" &&
3792 (Elts.size() == 8 || Elts.size() == 16));
3793
3794 // Pack 16-bit elements in pairs into 32-bit register. If both elements are
3795 // unpacked from 32-bit source use it, otherwise pack them using v_perm.
3796 for (unsigned i = 0; i < Elts.size(); i += 2) {
3797 SDValue LoSrc = stripExtractLoElt(stripBitcast(Elts[i]));
3798 SDValue HiSrc;
3799 if (isExtractHiElt(Elts[i + 1], HiSrc) && LoSrc == HiSrc) {
3800 PackedElts.push_back(HiSrc);
3801 } else {
3802 if (Subtarget->useRealTrue16Insts()) {
3803 // FIXME-TRUE16. For now pack VGPR_32 for 16-bit source before
3804 // passing to v_perm_b32. Eventually we should use replace v_perm_b32
3805 // by reg_sequence.
3807 CurDAG->getMachineNode(TargetOpcode::IMPLICIT_DEF, DL, MVT::i16),
3808 0);
3809 Elts[i] =
3810 emitRegSequence(*CurDAG, AMDGPU::VGPR_32RegClassID, MVT::i32,
3811 {Elts[i], Undef}, {AMDGPU::lo16, AMDGPU::hi16}, DL);
3812 Elts[i + 1] = emitRegSequence(*CurDAG, AMDGPU::VGPR_32RegClassID,
3813 MVT::i32, {Elts[i + 1], Undef},
3814 {AMDGPU::lo16, AMDGPU::hi16}, DL);
3815 }
3816 SDValue PackLoLo = CurDAG->getTargetConstant(0x05040100, DL, MVT::i32);
3817 MachineSDNode *Packed =
3818 CurDAG->getMachineNode(AMDGPU::V_PERM_B32_e64, DL, MVT::i32,
3819 {Elts[i + 1], Elts[i], PackLoLo});
3820 PackedElts.push_back(SDValue(Packed, 0));
3821 }
3822 }
3823 return buildRegSequence32(PackedElts, DL);
3824}
3825
3827AMDGPUDAGToDAGISel::buildRegSequence(SmallVectorImpl<SDValue> &Elts,
3828 const SDLoc &DL,
3829 unsigned ElementSize) const {
3830 if (ElementSize == 16)
3831 return buildRegSequence16(Elts, DL);
3832 if (ElementSize == 32)
3833 return buildRegSequence32(Elts, DL);
3834 llvm_unreachable("Unhandled element size");
3835}
3836
3837void AMDGPUDAGToDAGISel::selectWMMAModsNegAbs(unsigned ModOpcode,
3838 unsigned &Mods,
3840 SDValue &Src, const SDLoc &DL,
3841 unsigned ElementSize) const {
3842 if (ModOpcode == ISD::FNEG) {
3843 Mods |= SISrcMods::NEG;
3844 // Check if all elements also have abs modifier
3845 SmallVector<SDValue, 8> NegAbsElts;
3846 for (auto El : Elts) {
3847 if (El.getOpcode() != ISD::FABS)
3848 break;
3849 NegAbsElts.push_back(El->getOperand(0));
3850 }
3851 if (Elts.size() != NegAbsElts.size()) {
3852 // Neg
3853 Src = SDValue(buildRegSequence(Elts, DL, ElementSize), 0);
3854 } else {
3855 // Neg and Abs
3856 Mods |= SISrcMods::NEG_HI;
3857 Src = SDValue(buildRegSequence(NegAbsElts, DL, ElementSize), 0);
3858 }
3859 } else {
3860 assert(ModOpcode == ISD::FABS);
3861 // Abs
3862 Mods |= SISrcMods::NEG_HI;
3863 Src = SDValue(buildRegSequence(Elts, DL, ElementSize), 0);
3864 }
3865}
3866
3867// Check all f16 elements for modifiers while looking through b32 and v2b16
3868// build vector, stop if element does not satisfy ModifierCheck.
3869static void
3871 std::function<bool(SDValue)> ModifierCheck) {
3872 for (unsigned i = 0; i < BV->getNumOperands(); ++i) {
3873 if (auto *F16Pair =
3874 dyn_cast<BuildVectorSDNode>(stripBitcast(BV->getOperand(i)))) {
3875 for (unsigned i = 0; i < F16Pair->getNumOperands(); ++i) {
3876 SDValue ElF16 = stripBitcast(F16Pair->getOperand(i));
3877 if (!ModifierCheck(ElF16))
3878 break;
3879 }
3880 }
3881 }
3882}
3883
3884bool AMDGPUDAGToDAGISel::SelectWMMAModsF16Neg(SDValue In, SDValue &Src,
3885 SDValue &SrcMods) const {
3886 Src = In;
3887 unsigned Mods = SISrcMods::OP_SEL_1;
3888
3889 // mods are on f16 elements
3890 if (auto *BV = dyn_cast<BuildVectorSDNode>(stripBitcast(In))) {
3892
3893 checkWMMAElementsModifiersF16(BV, [&](SDValue Element) -> bool {
3894 if (Element.getOpcode() != ISD::FNEG)
3895 return false;
3896 EltsF16.push_back(Element.getOperand(0));
3897 return true;
3898 });
3899
3900 // All elements have neg modifier
3901 if (BV->getNumOperands() * 2 == EltsF16.size()) {
3902 Src = SDValue(buildRegSequence16(EltsF16, SDLoc(In)), 0);
3903 Mods |= SISrcMods::NEG;
3904 Mods |= SISrcMods::NEG_HI;
3905 }
3906 }
3907
3908 // mods are on v2f16 elements
3909 if (auto *BV = dyn_cast<BuildVectorSDNode>(stripBitcast(In))) {
3910 SmallVector<SDValue, 8> EltsV2F16;
3911 for (unsigned i = 0; i < BV->getNumOperands(); ++i) {
3912 SDValue ElV2f16 = stripBitcast(BV->getOperand(i));
3913 // Based on first element decide which mod we match, neg or abs
3914 if (ElV2f16.getOpcode() != ISD::FNEG)
3915 break;
3916 EltsV2F16.push_back(ElV2f16.getOperand(0));
3917 }
3918
3919 // All pairs of elements have neg modifier
3920 if (BV->getNumOperands() == EltsV2F16.size()) {
3921 Src = SDValue(buildRegSequence32(EltsV2F16, SDLoc(In)), 0);
3922 Mods |= SISrcMods::NEG;
3923 Mods |= SISrcMods::NEG_HI;
3924 }
3925 }
3926
3927 SrcMods = CurDAG->getTargetConstant(Mods, SDLoc(In), MVT::i32);
3928 return true;
3929}
3930
3931bool AMDGPUDAGToDAGISel::SelectWMMAModsF16NegAbs(SDValue In, SDValue &Src,
3932 SDValue &SrcMods) const {
3933 Src = In;
3934 unsigned Mods = SISrcMods::OP_SEL_1;
3935 unsigned ModOpcode;
3936
3937 // mods are on f16 elements
3938 if (auto *BV = dyn_cast<BuildVectorSDNode>(stripBitcast(In))) {
3940 checkWMMAElementsModifiersF16(BV, [&](SDValue ElF16) -> bool {
3941 // Based on first element decide which mod we match, neg or abs
3942 if (EltsF16.empty())
3943 ModOpcode = (ElF16.getOpcode() == ISD::FNEG) ? ISD::FNEG : ISD::FABS;
3944 if (ElF16.getOpcode() != ModOpcode)
3945 return false;
3946 EltsF16.push_back(ElF16.getOperand(0));
3947 return true;
3948 });
3949
3950 // All elements have ModOpcode modifier
3951 if (BV->getNumOperands() * 2 == EltsF16.size())
3952 selectWMMAModsNegAbs(ModOpcode, Mods, EltsF16, Src, SDLoc(In), 16);
3953 }
3954
3955 // mods are on v2f16 elements
3956 if (auto *BV = dyn_cast<BuildVectorSDNode>(stripBitcast(In))) {
3957 SmallVector<SDValue, 8> EltsV2F16;
3958
3959 for (unsigned i = 0; i < BV->getNumOperands(); ++i) {
3960 SDValue ElV2f16 = stripBitcast(BV->getOperand(i));
3961 // Based on first element decide which mod we match, neg or abs
3962 if (EltsV2F16.empty())
3963 ModOpcode = (ElV2f16.getOpcode() == ISD::FNEG) ? ISD::FNEG : ISD::FABS;
3964 if (ElV2f16->getOpcode() != ModOpcode)
3965 break;
3966 EltsV2F16.push_back(ElV2f16->getOperand(0));
3967 }
3968
3969 // All elements have ModOpcode modifier
3970 if (BV->getNumOperands() == EltsV2F16.size())
3971 selectWMMAModsNegAbs(ModOpcode, Mods, EltsV2F16, Src, SDLoc(In), 32);
3972 }
3973
3974 SrcMods = CurDAG->getTargetConstant(Mods, SDLoc(In), MVT::i32);
3975 return true;
3976}
3977
3978bool AMDGPUDAGToDAGISel::SelectWMMAModsF32NegAbs(SDValue In, SDValue &Src,
3979 SDValue &SrcMods) const {
3980 Src = In;
3981 unsigned Mods = SISrcMods::OP_SEL_1;
3983
3984 if (auto *BV = dyn_cast<BuildVectorSDNode>(stripBitcast(In))) {
3985 assert(BV->getNumOperands() > 0);
3986 // Based on first element decide which mod we match, neg or abs
3987 SDValue ElF32 = stripBitcast(BV->getOperand(0));
3988 unsigned ModOpcode =
3989 (ElF32.getOpcode() == ISD::FNEG) ? ISD::FNEG : ISD::FABS;
3990 for (unsigned i = 0; i < BV->getNumOperands(); ++i) {
3991 SDValue ElF32 = stripBitcast(BV->getOperand(i));
3992 if (ElF32.getOpcode() != ModOpcode)
3993 break;
3994 EltsF32.push_back(ElF32.getOperand(0));
3995 }
3996
3997 // All elements had ModOpcode modifier
3998 if (BV->getNumOperands() == EltsF32.size())
3999 selectWMMAModsNegAbs(ModOpcode, Mods, EltsF32, Src, SDLoc(In), 32);
4000 }
4001
4002 SrcMods = CurDAG->getTargetConstant(Mods, SDLoc(In), MVT::i32);
4003 return true;
4004}
4005
4006bool AMDGPUDAGToDAGISel::SelectWMMAVISrc(SDValue In, SDValue &Src) const {
4007 if (auto *BV = dyn_cast<BuildVectorSDNode>(In)) {
4008 BitVector UndefElements;
4009 if (SDValue Splat = BV->getSplatValue(&UndefElements))
4010 if (isInlineImmediate(Splat.getNode())) {
4011 if (const ConstantSDNode *C = dyn_cast<ConstantSDNode>(Splat)) {
4012 unsigned Imm = C->getAPIntValue().getSExtValue();
4013 Src = CurDAG->getTargetConstant(Imm, SDLoc(In), MVT::i32);
4014 return true;
4015 }
4016 if (const ConstantFPSDNode *C = dyn_cast<ConstantFPSDNode>(Splat)) {
4017 unsigned Imm = C->getValueAPF().bitcastToAPInt().getSExtValue();
4018 Src = CurDAG->getTargetConstant(Imm, SDLoc(In), MVT::i32);
4019 return true;
4020 }
4021 llvm_unreachable("unhandled Constant node");
4022 }
4023 }
4024
4025 // 16 bit splat
4026 SDValue SplatSrc32 = stripBitcast(In);
4027 if (auto *SplatSrc32BV = dyn_cast<BuildVectorSDNode>(SplatSrc32))
4028 if (SDValue Splat32 = SplatSrc32BV->getSplatValue()) {
4029 SDValue SplatSrc16 = stripBitcast(Splat32);
4030 if (auto *SplatSrc16BV = dyn_cast<BuildVectorSDNode>(SplatSrc16))
4031 if (SDValue Splat = SplatSrc16BV->getSplatValue()) {
4032 const SIInstrInfo *TII = Subtarget->getInstrInfo();
4033 std::optional<APInt> RawValue;
4034 if (const ConstantFPSDNode *C = dyn_cast<ConstantFPSDNode>(Splat))
4035 RawValue = C->getValueAPF().bitcastToAPInt();
4036 else if (const ConstantSDNode *C = dyn_cast<ConstantSDNode>(Splat))
4037 RawValue = C->getAPIntValue();
4038
4039 if (RawValue.has_value()) {
4040 EVT VT = In.getValueType().getScalarType();
4041 if (VT.getSimpleVT() == MVT::f16 || VT.getSimpleVT() == MVT::bf16) {
4042 APFloat FloatVal(VT.getSimpleVT() == MVT::f16
4045 RawValue.value());
4046 if (TII->isInlineConstant(FloatVal)) {
4047 Src = CurDAG->getTargetConstant(RawValue.value(), SDLoc(In),
4048 MVT::i16);
4049 return true;
4050 }
4051 } else if (VT.getSimpleVT() == MVT::i16) {
4052 if (TII->isInlineConstant(RawValue.value())) {
4053 Src = CurDAG->getTargetConstant(RawValue.value(), SDLoc(In),
4054 MVT::i16);
4055 return true;
4056 }
4057 } else
4058 llvm_unreachable("unknown 16-bit type");
4059 }
4060 }
4061 }
4062
4063 return false;
4064}
4065
4066bool AMDGPUDAGToDAGISel::SelectSWMMACIndex8(SDValue In, SDValue &Src,
4067 SDValue &IndexKey) const {
4068 unsigned Key = 0;
4069 Src = In;
4070
4071 if (In.getOpcode() == ISD::SRL) {
4072 const llvm::SDValue &ShiftSrc = In.getOperand(0);
4073 ConstantSDNode *ShiftAmt = dyn_cast<ConstantSDNode>(In.getOperand(1));
4074 if (ShiftSrc.getValueType().getSizeInBits() == 32 && ShiftAmt &&
4075 ShiftAmt->getZExtValue() % 8 == 0) {
4076 Key = ShiftAmt->getZExtValue() / 8;
4077 Src = ShiftSrc;
4078 }
4079 }
4080
4081 IndexKey = CurDAG->getTargetConstant(Key, SDLoc(In), MVT::i32);
4082 return true;
4083}
4084
4085bool AMDGPUDAGToDAGISel::SelectSWMMACIndex16(SDValue In, SDValue &Src,
4086 SDValue &IndexKey) const {
4087 unsigned Key = 0;
4088 Src = In;
4089
4090 if (In.getOpcode() == ISD::SRL) {
4091 const llvm::SDValue &ShiftSrc = In.getOperand(0);
4092 ConstantSDNode *ShiftAmt = dyn_cast<ConstantSDNode>(In.getOperand(1));
4093 if (ShiftSrc.getValueType().getSizeInBits() == 32 && ShiftAmt &&
4094 ShiftAmt->getZExtValue() == 16) {
4095 Key = 1;
4096 Src = ShiftSrc;
4097 }
4098 }
4099
4100 IndexKey = CurDAG->getTargetConstant(Key, SDLoc(In), MVT::i32);
4101 return true;
4102}
4103
4104bool AMDGPUDAGToDAGISel::SelectSWMMACIndex32(SDValue In, SDValue &Src,
4105 SDValue &IndexKey) const {
4106 unsigned Key = 0;
4107 Src = In;
4108
4109 SDValue InI32;
4110
4111 if (In.getOpcode() == ISD::ANY_EXTEND || In.getOpcode() == ISD::ZERO_EXTEND) {
4112 const SDValue &ExtendSrc = In.getOperand(0);
4113 if (ExtendSrc.getValueSizeInBits() == 32)
4114 InI32 = ExtendSrc;
4115 } else if (In->getOpcode() == ISD::BITCAST) {
4116 const SDValue &CastSrc = In.getOperand(0);
4117 if (CastSrc.getOpcode() == ISD::BUILD_VECTOR &&
4118 CastSrc.getOperand(0).getValueSizeInBits() == 32) {
4119 ConstantSDNode *Zero = dyn_cast<ConstantSDNode>(CastSrc.getOperand(1));
4120 if (Zero && Zero->getZExtValue() == 0)
4121 InI32 = CastSrc.getOperand(0);
4122 }
4123 }
4124
4125 if (InI32 && InI32.getOpcode() == ISD::EXTRACT_VECTOR_ELT) {
4126 const SDValue &ExtractVecEltSrc = InI32.getOperand(0);
4127 ConstantSDNode *EltIdx = dyn_cast<ConstantSDNode>(InI32.getOperand(1));
4128 if (ExtractVecEltSrc.getValueSizeInBits() == 64 && EltIdx &&
4129 EltIdx->getZExtValue() == 1) {
4130 Key = 1;
4131 Src = ExtractVecEltSrc;
4132 }
4133 }
4134
4135 IndexKey = CurDAG->getTargetConstant(Key, SDLoc(In), MVT::i32);
4136 return true;
4137}
4138
4139bool AMDGPUDAGToDAGISel::SelectVOP3OpSel(SDValue In, SDValue &Src,
4140 SDValue &SrcMods) const {
4141 Src = In;
4142 // FIXME: Handle op_sel
4143 SrcMods = CurDAG->getTargetConstant(0, SDLoc(In), MVT::i32);
4144 return true;
4145}
4146
4147bool AMDGPUDAGToDAGISel::SelectVOP3OpSelMods(SDValue In, SDValue &Src,
4148 SDValue &SrcMods) const {
4149 // FIXME: Handle op_sel
4150 return SelectVOP3Mods(In, Src, SrcMods);
4151}
4152
4153// Match lowered fpext from bf16 to f32. This is a bit operation extending
4154// a 16-bit value with 16-bit of zeroes at LSB:
4155//
4156// 1. (f32 (bitcast (build_vector (i16 0), (i16 (bitcast bf16:val)))))
4157// 2. (f32 (bitcast (and i32:val, 0xffff0000))) -> IsExtractHigh = true
4158// 3. (f32 (bitcast (shl i32:va, 16) -> IsExtractHigh = false
4159static SDValue matchBF16FPExtendLike(SDValue Op, bool &IsExtractHigh) {
4160 if (Op.getValueType() != MVT::f32 || Op.getOpcode() != ISD::BITCAST)
4161 return SDValue();
4162 Op = Op.getOperand(0);
4163
4164 IsExtractHigh = false;
4165 if (Op.getValueType() == MVT::v2i16 && Op.getOpcode() == ISD::BUILD_VECTOR) {
4166 auto Low16 = dyn_cast<ConstantSDNode>(Op.getOperand(0));
4167 if (!Low16 || !Low16->isZero())
4168 return SDValue();
4169 Op = stripBitcast(Op.getOperand(1));
4170 if (Op.getValueType() != MVT::bf16)
4171 return SDValue();
4172 return Op;
4173 }
4174
4175 if (Op.getValueType() != MVT::i32)
4176 return SDValue();
4177
4178 if (Op.getOpcode() == ISD::AND) {
4179 if (auto Mask = dyn_cast<ConstantSDNode>(Op.getOperand(1))) {
4180 if (Mask->getZExtValue() == 0xffff0000) {
4181 IsExtractHigh = true;
4182 return Op.getOperand(0);
4183 }
4184 }
4185 return SDValue();
4186 }
4187
4188 if (Op.getOpcode() == ISD::SHL) {
4189 if (auto Amt = dyn_cast<ConstantSDNode>(Op.getOperand(1))) {
4190 if (Amt->getZExtValue() == 16)
4191 return Op.getOperand(0);
4192 }
4193 }
4194
4195 return SDValue();
4196}
4197
4198// The return value is not whether the match is possible (which it always is),
4199// but whether or not it a conversion is really used.
// Fold source modifiers and a 16->32-bit extension into a mad-mix operand.
// Returns true only when a conversion (fp_extend, or for bf16 a lowered
// bit-pattern extend) is actually folded; Mods receives the combined
// NEG/ABS/OP_SEL bits. VT selects whether f16 or bf16 sources are matched.
bool AMDGPUDAGToDAGISel::SelectVOP3PMadMixModsImpl(SDValue In, SDValue &Src,
                                                   unsigned &Mods,
                                                   MVT VT) const {
  Mods = 0;
  // First peel any outer fneg/fabs into Mods.
  SelectVOP3ModsImpl(In, Src, Mods);

  bool IsExtractHigh = false;
  if (Src.getOpcode() == ISD::FP_EXTEND) {
    Src = Src.getOperand(0);
  } else if (VT == MVT::bf16) {
    // bf16 extends are lowered to integer bit manipulation; recognize them.
    SDValue B16 = matchBF16FPExtendLike(Src, IsExtractHigh);
    if (!B16)
      return false;
    Src = B16;
  } else
    return false;

  // The extended value must have the requested type (bf16 additionally
  // allows the i32 payload produced by matchBF16FPExtendLike).
  if (Src.getValueType() != VT &&
      (VT != MVT::bf16 || Src.getValueType() != MVT::i32))
    return false;

  Src = stripBitcast(Src);

  // Be careful about folding modifiers if we already have an abs. fneg is
  // applied last, so we don't want to apply an earlier fneg.
  if ((Mods & SISrcMods::ABS) == 0) {
    unsigned ModsTmp;
    SelectVOP3ModsImpl(Src, Src, ModsTmp);

    // Inner neg toggles (two negs cancel); inner abs simply adds.
    if ((ModsTmp & SISrcMods::NEG) != 0)
      Mods ^= SISrcMods::NEG;

    if ((ModsTmp & SISrcMods::ABS) != 0)
      Mods |= SISrcMods::ABS;
  }

  // op_sel/op_sel_hi decide the source type and source.
  // If the source's op_sel_hi is set, it indicates to do a conversion from
  // fp16. If the sources's op_sel is set, it picks the high half of the source
  // register.

  Mods |= SISrcMods::OP_SEL_1;
  if (Src.getValueSizeInBits() == 16) {
    if (isExtractHiElt(Src, Src)) {
      // 16-bit value lives in the high half of its 32-bit container.
      Mods |= SISrcMods::OP_SEL_0;

      // TODO: Should we try to look for neg/abs here?
      return true;
    }

    // A truncate of an i32 can use the i32 directly (low half selected).
    if (Src.getOpcode() == ISD::TRUNCATE &&
        Src.getOperand(0).getValueType() == MVT::i32) {
      Src = Src.getOperand(0);
      return true;
    }

    if (Subtarget->useRealTrue16Insts())
      // In true16 mode, pack src to a 32bit
      Src = createVOP3PSrc32FromLo16(Src, In, CurDAG, Subtarget);
  } else if (IsExtractHigh)
    Mods |= SISrcMods::OP_SEL_0;

  return true;
}
4264
4265bool AMDGPUDAGToDAGISel::SelectVOP3PMadMixModsExt(SDValue In, SDValue &Src,
4266 SDValue &SrcMods) const {
4267 unsigned Mods = 0;
4268 if (!SelectVOP3PMadMixModsImpl(In, Src, Mods, MVT::f16))
4269 return false;
4270 SrcMods = CurDAG->getTargetConstant(Mods, SDLoc(In), MVT::i32);
4271 return true;
4272}
4273
4274bool AMDGPUDAGToDAGISel::SelectVOP3PMadMixMods(SDValue In, SDValue &Src,
4275 SDValue &SrcMods) const {
4276 unsigned Mods = 0;
4277 SelectVOP3PMadMixModsImpl(In, Src, Mods, MVT::f16);
4278 SrcMods = CurDAG->getTargetConstant(Mods, SDLoc(In), MVT::i32);
4279 return true;
4280}
4281
4282bool AMDGPUDAGToDAGISel::SelectVOP3PMadMixBF16ModsExt(SDValue In, SDValue &Src,
4283 SDValue &SrcMods) const {
4284 unsigned Mods = 0;
4285 if (!SelectVOP3PMadMixModsImpl(In, Src, Mods, MVT::bf16))
4286 return false;
4287 SrcMods = CurDAG->getTargetConstant(Mods, SDLoc(In), MVT::i32);
4288 return true;
4289}
4290
4291bool AMDGPUDAGToDAGISel::SelectVOP3PMadMixBF16Mods(SDValue In, SDValue &Src,
4292 SDValue &SrcMods) const {
4293 unsigned Mods = 0;
4294 SelectVOP3PMadMixModsImpl(In, Src, Mods, MVT::bf16);
4295 SrcMods = CurDAG->getTargetConstant(Mods, SDLoc(In), MVT::i32);
4296 return true;
4297}
4298
4299// Match BITOP3 operation and return a number of matched instructions plus
4300// truth table.
4301static std::pair<unsigned, uint8_t> BitOp3_Op(SDValue In,
4303 unsigned NumOpcodes = 0;
4304 uint8_t LHSBits, RHSBits;
4305
4306 auto getOperandBits = [&Src, In](SDValue Op, uint8_t &Bits) -> bool {
4307 // Define truth table given Src0, Src1, Src2 bits permutations:
4308 // 0 0 0
4309 // 0 0 1
4310 // 0 1 0
4311 // 0 1 1
4312 // 1 0 0
4313 // 1 0 1
4314 // 1 1 0
4315 // 1 1 1
4316 const uint8_t SrcBits[3] = { 0xf0, 0xcc, 0xaa };
4317
4318 if (auto *C = dyn_cast<ConstantSDNode>(Op)) {
4319 if (C->isAllOnes()) {
4320 Bits = 0xff;
4321 return true;
4322 }
4323 if (C->isZero()) {
4324 Bits = 0;
4325 return true;
4326 }
4327 }
4328
4329 for (unsigned I = 0; I < Src.size(); ++I) {
4330 // Try to find existing reused operand
4331 if (Src[I] == Op) {
4332 Bits = SrcBits[I];
4333 return true;
4334 }
4335 // Try to replace parent operator
4336 if (Src[I] == In) {
4337 Bits = SrcBits[I];
4338 Src[I] = Op;
4339 return true;
4340 }
4341 }
4342
4343 if (Src.size() == 3) {
4344 // No room left for operands. Try one last time, there can be a 'not' of
4345 // one of our source operands. In this case we can compute the bits
4346 // without growing Src vector.
4347 if (Op.getOpcode() == ISD::XOR) {
4348 if (auto *C = dyn_cast<ConstantSDNode>(Op.getOperand(1))) {
4349 if (C->isAllOnes()) {
4350 SDValue LHS = Op.getOperand(0);
4351 for (unsigned I = 0; I < Src.size(); ++I) {
4352 if (Src[I] == LHS) {
4353 Bits = ~SrcBits[I];
4354 return true;
4355 }
4356 }
4357 }
4358 }
4359 }
4360
4361 return false;
4362 }
4363
4364 Bits = SrcBits[Src.size()];
4365 Src.push_back(Op);
4366 return true;
4367 };
4368
4369 switch (In.getOpcode()) {
4370 case ISD::AND:
4371 case ISD::OR:
4372 case ISD::XOR: {
4373 SDValue LHS = In.getOperand(0);
4374 SDValue RHS = In.getOperand(1);
4375
4376 SmallVector<SDValue, 3> Backup(Src.begin(), Src.end());
4377 if (!getOperandBits(LHS, LHSBits) ||
4378 !getOperandBits(RHS, RHSBits)) {
4379 Src = std::move(Backup);
4380 return std::make_pair(0, 0);
4381 }
4382
4383 // Recursion is naturally limited by the size of the operand vector.
4384 auto Op = BitOp3_Op(LHS, Src);
4385 if (Op.first) {
4386 NumOpcodes += Op.first;
4387 LHSBits = Op.second;
4388 }
4389
4390 Op = BitOp3_Op(RHS, Src);
4391 if (Op.first) {
4392 NumOpcodes += Op.first;
4393 RHSBits = Op.second;
4394 }
4395 break;
4396 }
4397 default:
4398 return std::make_pair(0, 0);
4399 }
4400
4401 uint8_t TTbl;
4402 switch (In.getOpcode()) {
4403 case ISD::AND:
4404 TTbl = LHSBits & RHSBits;
4405 break;
4406 case ISD::OR:
4407 TTbl = LHSBits | RHSBits;
4408 break;
4409 case ISD::XOR:
4410 TTbl = LHSBits ^ RHSBits;
4411 break;
4412 default:
4413 break;
4414 }
4415
4416 return std::make_pair(NumOpcodes + 1, TTbl);
4417}
4418
4419bool AMDGPUDAGToDAGISel::SelectBITOP3(SDValue In, SDValue &Src0, SDValue &Src1,
4420 SDValue &Src2, SDValue &Tbl) const {
4422 uint8_t TTbl;
4423 unsigned NumOpcodes;
4424
4425 std::tie(NumOpcodes, TTbl) = BitOp3_Op(In, Src);
4426
4427 // Src.empty() case can happen if all operands are all zero or all ones.
4428 // Normally it shall be optimized out before reaching this.
4429 if (NumOpcodes < 2 || Src.empty())
4430 return false;
4431
4432 // For a uniform case threshold should be higher to account for moves between
4433 // VGPRs and SGPRs. It needs one operand in a VGPR, rest two can be in SGPRs
4434 // and a readtfirstlane after.
4435 if (NumOpcodes < 4 && !In->isDivergent())
4436 return false;
4437
4438 if (NumOpcodes == 2 && In.getValueType() == MVT::i32) {
4439 // Avoid using BITOP3 for OR3, XOR3, AND_OR. This is not faster but makes
4440 // asm more readable. This cannot be modeled with AddedComplexity because
4441 // selector does not know how many operations did we match.
4442 if ((In.getOpcode() == ISD::XOR || In.getOpcode() == ISD::OR) &&
4443 (In.getOperand(0).getOpcode() == In.getOpcode() ||
4444 In.getOperand(1).getOpcode() == In.getOpcode()))
4445 return false;
4446
4447 if (In.getOpcode() == ISD::OR &&
4448 (In.getOperand(0).getOpcode() == ISD::AND ||
4449 In.getOperand(1).getOpcode() == ISD::AND))
4450 return false;
4451 }
4452
4453 // Last operand can be ignored, turning a ternary operation into a binary.
4454 // For example: (~a & b & c) | (~a & b & ~c) -> (~a & b). We can replace
4455 // 'c' with 'a' here without changing the answer. In some pathological
4456 // cases it should be possible to get an operation with a single operand
4457 // too if optimizer would not catch it.
4458 while (Src.size() < 3)
4459 Src.push_back(Src[0]);
4460
4461 Src0 = Src[0];
4462 Src1 = Src[1];
4463 Src2 = Src[2];
4464
4465 Tbl = CurDAG->getTargetConstant(TTbl, SDLoc(In), MVT::i32);
4466 return true;
4467}
4468
4469SDValue AMDGPUDAGToDAGISel::getHi16Elt(SDValue In) const {
4470 if (In.isUndef())
4471 return CurDAG->getUNDEF(MVT::i32);
4472
4473 if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(In)) {
4474 SDLoc SL(In);
4475 return CurDAG->getConstant(C->getZExtValue() << 16, SL, MVT::i32);
4476 }
4477
4478 if (ConstantFPSDNode *C = dyn_cast<ConstantFPSDNode>(In)) {
4479 SDLoc SL(In);
4480 return CurDAG->getConstant(
4481 C->getValueAPF().bitcastToAPInt().getZExtValue() << 16, SL, MVT::i32);
4482 }
4483
4484 SDValue Src;
4485 if (isExtractHiElt(In, Src))
4486 return Src;
4487
4488 return SDValue();
4489}
4490
// Decide whether an immediate should be materialized in a VGPR: true when at
// least one of the first few users strictly requires a VGPR operand (and
// cannot be commuted into an SGPR-accepting slot). Scanning stops after 10
// uses; with more uses an SGPR materialization is assumed profitable.
bool AMDGPUDAGToDAGISel::isVGPRImm(const SDNode * N) const {
  assert(CurDAG->getTarget().getTargetTriple().isAMDGCN());

  const SIRegisterInfo *SIRI = Subtarget->getRegisterInfo();
  const SIInstrInfo *SII = Subtarget->getInstrInfo();

  unsigned Limit = 0;
  bool AllUsesAcceptSReg = true;
  for (SDNode::use_iterator U = N->use_begin(), E = SDNode::use_end();
       Limit < 10 && U != E; ++U, ++Limit) {
    const TargetRegisterClass *RC =
        getOperandRegClass(U->getUser(), U->getOperandNo());

    // If the register class is unknown, it could be an unknown
    // register class that needs to be an SGPR, e.g. an inline asm
    // constraint
    if (!RC || SIRI->isSGPRClass(RC))
      return false;

    if (RC != &AMDGPU::VS_32RegClass && RC != &AMDGPU::VS_64RegClass &&
        RC != &AMDGPU::VS_64_Align2RegClass) {
      // This operand slot does not accept an SGPR directly; see whether
      // commuting the user would move the value into a VS slot.
      AllUsesAcceptSReg = false;
      SDNode *User = U->getUser();
      if (User->isMachineOpcode()) {
        unsigned Opc = User->getMachineOpcode();
        const MCInstrDesc &Desc = SII->get(Opc);
        if (Desc.isCommutable()) {
          unsigned OpIdx = Desc.getNumDefs() + U->getOperandNo();
          unsigned CommuteIdx1 = TargetInstrInfo::CommuteAnyOperandIndex;
          if (SII->findCommutedOpIndices(Desc, OpIdx, CommuteIdx1)) {
            unsigned CommutedOpNo = CommuteIdx1 - Desc.getNumDefs();
            const TargetRegisterClass *CommutedRC =
                getOperandRegClass(U->getUser(), CommutedOpNo);
            if (CommutedRC == &AMDGPU::VS_32RegClass ||
                CommutedRC == &AMDGPU::VS_64RegClass ||
                CommutedRC == &AMDGPU::VS_64_Align2RegClass)
              AllUsesAcceptSReg = true;
          }
        }
      }
      // If "AllUsesAcceptSReg == false" so far we haven't succeeded
      // commuting current user. This means have at least one use
      // that strictly require VGPR. Thus, we will not attempt to commute
      // other user instructions.
      if (!AllUsesAcceptSReg)
        break;
    }
  }
  // Only trust the result if the scan was not cut off by the use limit.
  return !AllUsesAcceptSReg && (Limit < 10);
}
4541
// Return true if this load may be selected as a scalar (SMEM) load: it must
// be uniform, sufficiently aligned, and read memory that cannot be clobbered
// between kernel launch and use (invariant, constant address space, or a
// provably unclobbered simple global load when scalarization is enabled).
bool AMDGPUDAGToDAGISel::isUniformLoad(const SDNode *N) const {
  const auto *Ld = cast<LoadSDNode>(N);
  const MachineMemOperand *MMO = Ld->getMemOperand();

  // FIXME: We ought to able able to take the direct isDivergent result. We
  // cannot rely on the MMO for a uniformity check, and should stop using
  // it. This is a hack for 2 ways that the IR divergence analysis is superior
  // to the DAG divergence: Recognizing shift-of-workitem-id as always
  // uniform, and isSingleLaneExecution. These should be handled in the DAG
  // version, and then this can be dropped.
  if (Ld->isDivergent() && !AMDGPU::isUniformMMO(MMO))
    return false;

  // Alignment requirement: at least the access size, capped at 4 bytes.
  return MMO->getSize().hasValue() &&
         Ld->getAlign() >=
             Align(std::min(MMO->getSize().getValue().getKnownMinValue(),
                            uint64_t(4))) &&
         (MMO->isInvariant() ||
          (Ld->getAddressSpace() == AMDGPUAS::CONSTANT_ADDRESS ||
           Ld->getAddressSpace() == AMDGPUAS::CONSTANT_ADDRESS_32BIT) ||
          (Subtarget->getScalarizeGlobalBehavior() &&
           Ld->getAddressSpace() == AMDGPUAS::GLOBAL_ADDRESS &&
           Ld->isSimple() &&
           static_cast<const SITargetLowering *>(getTargetLowering())
               ->isMemOpHasNoClobberedMemOperand(N)));
}
4568
4571 *static_cast<const AMDGPUTargetLowering*>(getTargetLowering());
4572 bool IsModified = false;
4573 do {
4574 IsModified = false;
4575
4576 // Go over all selected nodes and try to fold them a bit more
4577 SelectionDAG::allnodes_iterator Position = CurDAG->allnodes_begin();
4578 while (Position != CurDAG->allnodes_end()) {
4579 SDNode *Node = &*Position++;
4581 if (!MachineNode)
4582 continue;
4583
4584 SDNode *ResNode = Lowering.PostISelFolding(MachineNode, *CurDAG);
4585 if (ResNode != Node) {
4586 if (ResNode)
4587 ReplaceUses(Node, ResNode);
4588 IsModified = true;
4589 }
4590 }
4591 CurDAG->RemoveDeadNodes();
4592 } while (IsModified);
4593}
4594
4599
return SDValue()
assert(UImm &&(UImm !=~static_cast< T >(0)) &&"Invalid immediate!")
static bool getBaseWithOffsetUsingSplitOR(SelectionDAG &DAG, SDValue Addr, SDValue &N0, SDValue &N1)
static SDValue SelectSAddrFI(SelectionDAG *CurDAG, SDValue SAddr)
static SDValue matchExtFromI32orI32(SDValue Op, bool IsSigned, const SelectionDAG *DAG)
static MemSDNode * findMemSDNode(SDNode *N)
static bool IsCopyFromSGPR(const SIRegisterInfo &TRI, SDValue Val)
static SDValue combineBallotPattern(SDValue VCMP, bool &Negate)
static SDValue matchBF16FPExtendLike(SDValue Op, bool &IsExtractHigh)
static void checkWMMAElementsModifiersF16(BuildVectorSDNode *BV, std::function< bool(SDValue)> ModifierCheck)
Defines an instruction selector for the AMDGPU target.
Contains the definition of a TargetInstrInfo class that is common to all AMD GPUs.
static bool isNoUnsignedWrap(MachineInstr *Addr)
static bool isExtractHiElt(MachineRegisterInfo &MRI, Register In, Register &Out)
static std::pair< unsigned, uint8_t > BitOp3_Op(Register R, SmallVectorImpl< Register > &Src, const MachineRegisterInfo &MRI)
static unsigned gwsIntrinToOpcode(unsigned IntrID)
Provides AMDGPU specific target descriptions.
Base class for AMDGPU specific classes of TargetSubtarget.
The AMDGPU TargetMachine interface definition for hw codegen targets.
MachineBasicBlock MachineBasicBlock::iterator DebugLoc DL
static GCRegistry::Add< CoreCLRGC > E("coreclr", "CoreCLR-compatible GC")
static GCRegistry::Add< OcamlGC > B("ocaml", "ocaml 3.10-compatible GC")
const HexagonInstrInfo * TII
const AbstractManglingParser< Derived, Alloc >::OperatorInfo AbstractManglingParser< Derived, Alloc >::Ops[]
#define F(x, y, z)
Definition MD5.cpp:54
#define I(x, y, z)
Definition MD5.cpp:57
Register Reg
Register const TargetRegisterInfo * TRI
Promote Memory to Register
Definition Mem2Reg.cpp:110
MachineInstr unsigned OpIdx
FunctionAnalysisManager FAM
#define INITIALIZE_PASS_DEPENDENCY(depName)
Definition PassSupport.h:42
#define INITIALIZE_PASS_END(passName, arg, name, cfg, analysis)
Definition PassSupport.h:44
#define INITIALIZE_PASS_BEGIN(passName, arg, name, cfg, analysis)
Definition PassSupport.h:39
Provides R600 specific target descriptions.
Interface definition for R600RegisterInfo.
const SmallVectorImpl< MachineOperand > & Cond
SI DAG Lowering interface definition.
#define LLVM_DEBUG(...)
Definition Debug.h:114
LLVM IR instance of the generic uniformity analysis.
Value * RHS
Value * LHS
void getAnalysisUsage(AnalysisUsage &AU) const override
getAnalysisUsage - This function should be overriden by passes that need analysis information to do t...
AMDGPUDAGToDAGISelLegacy(TargetMachine &TM, CodeGenOptLevel OptLevel)
bool runOnMachineFunction(MachineFunction &MF) override
runOnMachineFunction - This method must be overloaded to perform the desired machine code transformat...
StringRef getPassName() const override
getPassName - Return a nice clean name for a pass.
AMDGPU specific code to select AMDGPU machine instructions for SelectionDAG operations.
void SelectBuildVector(SDNode *N, unsigned RegClassID)
void Select(SDNode *N) override
Main hook for targets to transform nodes into machine nodes.
bool runOnMachineFunction(MachineFunction &MF) override
void PreprocessISelDAG() override
PreprocessISelDAG - This hook allows targets to hack on the graph before instruction selection starts...
void PostprocessISelDAG() override
PostprocessISelDAG() - This hook allows the target to hack on the graph right after selection.
bool matchLoadD16FromBuildVector(SDNode *N) const
PreservedAnalyses run(MachineFunction &MF, MachineFunctionAnalysisManager &MFAM)
AMDGPUISelDAGToDAGPass(TargetMachine &TM)
static SDValue stripBitcast(SDValue Val)
static const fltSemantics & BFloat()
Definition APFloat.h:295
static const fltSemantics & IEEEhalf()
Definition APFloat.h:294
Class for arbitrary precision integers.
Definition APInt.h:78
uint64_t getZExtValue() const
Get zero extended value.
Definition APInt.h:1555
bool isSignMask() const
Check if the APInt's value is returned by getSignMask.
Definition APInt.h:467
bool isMaxSignedValue() const
Determine if this is the largest signed value.
Definition APInt.h:406
int64_t getSExtValue() const
Get sign extended value.
Definition APInt.h:1577
unsigned countr_one() const
Count the number of trailing one bits.
Definition APInt.h:1671
PassT::Result & getResult(IRUnitT &IR, ExtraArgTs... ExtraArgs)
Get the result of an analysis pass for a given IR unit.
Represent the analysis usage information of a pass.
AnalysisUsage & addRequired()
ArrayRef - Represent a constant reference to an array (0 or more elements consecutively in memory),...
Definition ArrayRef.h:40
size_t size() const
size - Get the array size.
Definition ArrayRef.h:142
LLVM Basic Block Representation.
Definition BasicBlock.h:62
const Instruction * getTerminator() const LLVM_READONLY
Returns the terminator instruction; assumes that the block is well-formed.
Definition BasicBlock.h:237
A "pseudo-class" with methods for operating on BUILD_VECTORs.
LLVM_ABI SDValue getSplatValue(const APInt &DemandedElts, BitVector *UndefElements=nullptr) const
Returns the demanded splatted value or a null value if this is not a splat.
uint64_t getZExtValue() const
const APInt & getAPIntValue() const
int64_t getSExtValue() const
Analysis pass which computes a DominatorTree.
Definition Dominators.h:278
Legacy analysis pass which computes a DominatorTree.
Definition Dominators.h:316
Concrete subclass of DominatorTreeBase that is used to compute a normal dominator tree.
Definition Dominators.h:159
FunctionPass class - This class is used to implement most global optimizations.
Definition Pass.h:314
const SIInstrInfo * getInstrInfo() const override
bool useRealTrue16Insts() const
Return true if real (non-fake) variants of True16 instructions using 16-bit registers should be code-...
Generation getGeneration() const
void checkSubtargetFeatures(const Function &F) const
Diagnose inconsistent subtarget features before attempting to codegen function F.
This class is used to represent ISD::LOAD nodes.
const SDValue & getBasePtr() const
ISD::LoadExtType getExtensionType() const
Return whether this is a plain node, or one of the varieties of value-extending loads.
bool hasValue() const
TypeSize getValue() const
Analysis pass that exposes the LoopInfo for a function.
Definition LoopInfo.h:569
SmallVector< LoopT *, 4 > getLoopsInPreorder() const
Return all of the loops in the function in preorder across the loop nests, with siblings in forward p...
The legacy pass manager's analysis pass to compute loop information.
Definition LoopInfo.h:596
Machine Value Type.
static MVT getIntegerVT(unsigned BitWidth)
MachineRegisterInfo & getRegInfo()
getRegInfo - Return information about the registers currently in use.
Function & getFunction()
Return the LLVM function that this machine code represents.
LocationSize getSize() const
Return the size in bytes of the memory reference.
MachineRegisterInfo - Keep track of information for virtual and physical registers,...
const TargetRegisterClass * getRegClass(Register Reg) const
Return the register class of the specified virtual register.
An SDNode that represents everything that will be needed to construct a MachineInstr.
This is an abstract virtual class for memory operations.
unsigned getAddressSpace() const
Return the address space for the associated pointer.
MachineMemOperand * getMemOperand() const
Return the unique MachineMemOperand object describing the memory reference performed by operation.
const SDValue & getChain() const
EVT getMemoryVT() const
Return the type of the in-memory value.
AnalysisType & getAnalysis() const
getAnalysis<AnalysisType>() - This function is used by subclasses to get to the analysis information ...
A set of analyses that are preserved following a run of a transformation pass.
Definition Analysis.h:112
Wrapper class representing virtual and physical registers.
Definition Register.h:20
Wrapper class for IR location info (IR ordering and DebugLoc) to be passed into SDNode creation funct...
Represents one node in the SelectionDAG.
const APInt & getAsAPIntVal() const
Helper method returns the APInt value of a ConstantSDNode.
unsigned getOpcode() const
Return the SelectionDAG opcode value for this node.
bool isDivergent() const
SDNodeFlags getFlags() const
uint64_t getAsZExtVal() const
Helper method returns the zero-extended integer value of a ConstantSDNode.
unsigned getNumOperands() const
Return the number of values used by this operation.
const SDValue & getOperand(unsigned Num) const
uint64_t getConstantOperandVal(unsigned Num) const
Helper method returns the integer value of a ConstantSDNode operand.
bool isPredecessorOf(const SDNode *N) const
Return true if this node is a predecessor of N.
bool isAnyAdd() const
Returns true if the node type is ADD or PTRADD.
static use_iterator use_end()
Unlike LLVM values, Selection DAG nodes may return multiple values as the result of a computation.
SDNode * getNode() const
get the SDNode which holds the desired result
SDValue getValue(unsigned R) const
EVT getValueType() const
Return the ValueType of the referenced return value.
TypeSize getValueSizeInBits() const
Returns the size of the value in bits.
const SDValue & getOperand(unsigned i) const
uint64_t getConstantOperandVal(unsigned i) const
unsigned getOpcode() const
static unsigned getMaxMUBUFImmOffset(const GCNSubtarget &ST)
bool findCommutedOpIndices(const MachineInstr &MI, unsigned &SrcOpIdx0, unsigned &SrcOpIdx1) const override
static unsigned getSubRegFromChannel(unsigned Channel, unsigned NumRegs=1)
static LLVM_READONLY const TargetRegisterClass * getSGPRClassForBitWidth(unsigned BitWidth)
static bool isSGPRClass(const TargetRegisterClass *RC)
bool runOnMachineFunction(MachineFunction &MF) override
runOnMachineFunction - This method must be overloaded to perform the desired machine code transformat...
void getAnalysisUsage(AnalysisUsage &AU) const override
getAnalysisUsage - Subclasses that override getAnalysisUsage must call this.
SelectionDAGISelLegacy(char &ID, std::unique_ptr< SelectionDAGISel > S)
SelectionDAGISelPass(std::unique_ptr< SelectionDAGISel > Selector)
PreservedAnalyses run(MachineFunction &MF, MachineFunctionAnalysisManager &MFAM)
std::unique_ptr< FunctionLoweringInfo > FuncInfo
const TargetLowering * TLI
const TargetInstrInfo * TII
void ReplaceUses(SDValue F, SDValue T)
ReplaceUses - replace all uses of the old node F with the use of the new node T.
void ReplaceNode(SDNode *F, SDNode *T)
Replace all uses of F with T, then remove F from the DAG.
SelectionDAGISel(TargetMachine &tm, CodeGenOptLevel OL=CodeGenOptLevel::Default)
virtual bool runOnMachineFunction(MachineFunction &mf)
const TargetLowering * getTargetLowering() const
This is used to represent a portion of an LLVM function in a low-level Data Dependence DAG representa...
LLVM_ABI MachineSDNode * getMachineNode(unsigned Opcode, const SDLoc &dl, EVT VT)
These are used for target selectors to create a new node with specified return type(s),...
LLVM_ABI SDValue getRegister(Register Reg, EVT VT)
SDValue getTargetFrameIndex(int FI, EVT VT)
LLVM_ABI bool SignBitIsZero(SDValue Op, unsigned Depth=0) const
Return true if the sign bit of Op is known to be zero.
SDValue getTargetConstant(uint64_t Val, const SDLoc &DL, EVT VT, bool isOpaque=false)
LLVM_ABI bool isBaseWithConstantOffset(SDValue Op) const
Return true if the specified operand is an ISD::ADD with a ConstantSDNode on the right-hand side,...
MachineFunction & getMachineFunction() const
LLVM_ABI KnownBits computeKnownBits(SDValue Op, unsigned Depth=0) const
Determine which bits of Op are known to be either zero or one and return them in Known.
ilist< SDNode >::iterator allnodes_iterator
This class consists of common code factored out of the SmallVector class to reduce code duplication b...
void push_back(const T &Elt)
This is a 'vector' (really, a variable-sized array), optimized for the case when the array is small.
StringRef - Represent a constant reference to a string, i.e.
Definition StringRef.h:55
static const unsigned CommuteAnyOperandIndex
Primary interface to the complete machine description for the target machine.
unsigned getID() const
Return the register class ID number.
Legacy analysis pass which computes a CycleInfo.
constexpr ScalarTy getKnownMinValue() const
Returns the minimum value this quantity can represent.
Definition TypeSize.h:165
#define llvm_unreachable(msg)
Marks that the current location is not supposed to be reachable.
@ CONSTANT_ADDRESS_32BIT
Address space for 32-bit constant memory.
@ REGION_ADDRESS
Address space for region memory. (GDS)
@ LOCAL_ADDRESS
Address space for local memory.
@ CONSTANT_ADDRESS
Address space for constant memory (VTX2).
@ FLAT_ADDRESS
Address space for flat memory.
@ GLOBAL_ADDRESS
Address space for global memory (RAT0, VTX0).
@ PRIVATE_ADDRESS
Address space for private memory.
constexpr char Align[]
Key for Kernel::Arg::Metadata::mAlign.
constexpr char Args[]
Key for Kernel::Metadata::mArgs.
std::optional< int64_t > getSMRDEncodedLiteralOffset32(const MCSubtargetInfo &ST, int64_t ByteOffset)
bool isGFX12Plus(const MCSubtargetInfo &STI)
constexpr int64_t getNullPointerValue(unsigned AS)
Get the null pointer value for the given address space.
bool isValid32BitLiteral(uint64_t Val, bool IsFP64)
bool isInlinableLiteral32(int32_t Literal, bool HasInv2Pi)
bool hasSMRDSignedImmOffset(const MCSubtargetInfo &ST)
std::optional< int64_t > getSMRDEncodedOffset(const MCSubtargetInfo &ST, int64_t ByteOffset, bool IsBuffer, bool HasSOffset)
bool isUniformMMO(const MachineMemOperand *MMO)
constexpr std::underlying_type_t< E > Mask()
Get a bitmask with 1s in all places up to the high-order bit of E's largest value.
@ C
The default llvm calling convention, compatible with C.
Definition CallingConv.h:34
@ SETCC
SetCC operator - This evaluates to a true value iff the condition is true.
Definition ISDOpcodes.h:819
@ STACKRESTORE
STACKRESTORE has two operands, an input chain and a pointer to restore to it returns an output chain.
@ PTRADD
PTRADD represents pointer arithmetic semantics, for targets that opt in using shouldPreservePtrArith(...
@ SMUL_LOHI
SMUL_LOHI/UMUL_LOHI - Multiply two integers of type iN, producing a signed/unsigned value of type i[2...
Definition ISDOpcodes.h:275
@ ADDC
Carry-setting nodes for multiple precision addition and subtraction.
Definition ISDOpcodes.h:294
@ FMAD
FMAD - Perform a * b + c, while getting the same result as the separately rounded operations.
Definition ISDOpcodes.h:522
@ ADD
Simple integer binary arithmetic operators.
Definition ISDOpcodes.h:264
@ LOAD
LOAD and STORE have token chains as their first operand, then the same operands as an LLVM load/store...
@ ANY_EXTEND
ANY_EXTEND - Used for integer types. The high bits are undefined.
Definition ISDOpcodes.h:853
@ FMA
FMA - Perform a * b + c with no intermediate rounding step.
Definition ISDOpcodes.h:518
@ INTRINSIC_VOID
OUTCHAIN = INTRINSIC_VOID(INCHAIN, INTRINSICID, arg1, arg2, ...) This node represents a target intrin...
Definition ISDOpcodes.h:220
@ SINT_TO_FP
[SU]INT_TO_FP - These operators convert integers (whose interpreted sign depends on the first letter)...
Definition ISDOpcodes.h:880
@ FADD
Simple binary floating point operators.
Definition ISDOpcodes.h:417
@ BITCAST
BITCAST - This operator converts between integer, vector and FP values, as if the value was stored to...
Definition ISDOpcodes.h:993
@ BUILD_PAIR
BUILD_PAIR - This is the opposite of EXTRACT_ELEMENT in some ways.
Definition ISDOpcodes.h:254
@ FLDEXP
FLDEXP - ldexp, inspired by libm (op0 * 2**op1).
@ CONVERGENCECTRL_GLUE
This does not correspond to any convergence control intrinsic.
@ SIGN_EXTEND
Conversion operators.
Definition ISDOpcodes.h:844
@ SCALAR_TO_VECTOR
SCALAR_TO_VECTOR(VAL) - This represents the operation of loading a scalar value into element 0 of the...
Definition ISDOpcodes.h:665
@ FNEG
Perform various unary floating-point operations inspired by libm.
@ FCANONICALIZE
Returns platform specific canonical encoding of a floating point number.
Definition ISDOpcodes.h:541
@ UNDEF
UNDEF - An undefined node.
Definition ISDOpcodes.h:233
@ CopyFromReg
CopyFromReg - This node indicates that the input value is a virtual or physical register that is defi...
Definition ISDOpcodes.h:230
@ SHL
Shift and rotation operations.
Definition ISDOpcodes.h:765
@ VECTOR_SHUFFLE
VECTOR_SHUFFLE(VEC1, VEC2) - Returns a vector, of the same type as VEC1/VEC2.
Definition ISDOpcodes.h:649
@ EXTRACT_VECTOR_ELT
EXTRACT_VECTOR_ELT(VECTOR, IDX) - Returns a single element from VECTOR identified by the (potentially...
Definition ISDOpcodes.h:576
@ CopyToReg
CopyToReg - This node has three operands: a chain, a register number to set to this value,...
Definition ISDOpcodes.h:224
@ ZERO_EXTEND
ZERO_EXTEND - Used for integer types, zeroing the new bits.
Definition ISDOpcodes.h:850
@ FMINNUM
FMINNUM/FMAXNUM - Perform floating-point minimum maximum on two values, following IEEE-754 definition...
@ TargetFrameIndex
Definition ISDOpcodes.h:187
@ SIGN_EXTEND_INREG
SIGN_EXTEND_INREG - This operator atomically performs a SHL/SRA pair to sign extend a small value in ...
Definition ISDOpcodes.h:888
@ FP_EXTEND
X = FP_EXTEND(Y) - Extend a smaller FP type into a larger FP type.
Definition ISDOpcodes.h:978
@ UADDO_CARRY
Carry-using nodes for multiple precision addition and subtraction.
Definition ISDOpcodes.h:328
@ AND
Bitwise operators - logical and, logical or, logical xor.
Definition ISDOpcodes.h:739
@ INTRINSIC_WO_CHAIN
RESULT = INTRINSIC_WO_CHAIN(INTRINSICID, arg1, arg2, ...) This node represents a target intrinsic fun...
Definition ISDOpcodes.h:205
@ ADDE
Carry-using nodes for multiple precision addition and subtraction.
Definition ISDOpcodes.h:304
@ FP_ROUND
X = FP_ROUND(Y, TRUNC) - Rounding 'Y' from a larger floating point type down to the precision of the ...
Definition ISDOpcodes.h:959
@ TRUNCATE
TRUNCATE - Completely drop the high bits.
Definition ISDOpcodes.h:856
@ BRCOND
BRCOND - Conditional branch.
@ INTRINSIC_W_CHAIN
RESULT,OUTCHAIN = INTRINSIC_W_CHAIN(INCHAIN, INTRINSICID, arg1, ...) This node represents a target in...
Definition ISDOpcodes.h:213
@ BUILD_VECTOR
BUILD_VECTOR(ELT0, ELT1, ELT2, ELT3,...) - Return a fixed-width vector with the specified,...
Definition ISDOpcodes.h:556
bool isExtOpcode(unsigned Opcode)
LLVM_ABI bool isBuildVectorAllZeros(const SDNode *N)
Return true if the specified node is a BUILD_VECTOR where all of the elements are 0 or undef.
CondCode
ISD::CondCode enum - These are ordered carefully to make the bitfields below work out,...
@ User
could "use" a pointer
This is an optimization pass for GlobalISel generic memory operations.
@ Offset
Definition DWP.cpp:532
FunctionAddr VTableAddr Value
Definition InstrProf.h:137
constexpr bool isInt(int64_t x)
Checks if an integer fits into the given bit width.
Definition MathExtras.h:165
LLVM_ABI bool isNullConstant(SDValue V)
Returns true if V is a constant integer zero.
@ Undef
Value of the register doesn't matter.
decltype(auto) dyn_cast(const From &Val)
dyn_cast<X> - Return the argument parameter cast to the specified type.
Definition Casting.h:643
constexpr bool isMask_32(uint32_t Value)
Return true if the argument is a non-empty sequence of ones starting at the least significant bit wit...
Definition MathExtras.h:255
AnalysisManager< MachineFunction > MachineFunctionAnalysisManager
Op::Description Desc
constexpr int popcount(T Value) noexcept
Count the number of set bits in a value.
Definition bit.h:154
unsigned Log2_32(uint32_t Value)
Return the floor log base 2 of the specified value, -1 if the value is zero.
Definition MathExtras.h:331
bool isBoolSGPR(SDValue V)
constexpr bool isPowerOf2_32(uint32_t Value)
Return true if the argument is a power of two > 0.
Definition MathExtras.h:279
constexpr uint32_t Hi_32(uint64_t Value)
Return the high 32 bits of a 64 bit value.
Definition MathExtras.h:150
LLVM_ABI raw_ostream & dbgs()
dbgs() - This returns a reference to a raw_ostream for debugging messages.
Definition Debug.cpp:207
static bool getConstantValue(SDValue N, uint32_t &Out)
constexpr bool isUInt(uint64_t x)
Checks if an unsigned integer fits into the given bit width.
Definition MathExtras.h:189
CodeGenOptLevel
Code generation optimization level.
Definition CodeGen.h:82
class LLVM_GSL_OWNER SmallVector
Forward declaration of SmallVector so that calculateSmallVectorDefaultInlinedElements can reference s...
constexpr uint32_t Lo_32(uint64_t Value)
Return the low 32 bits of a 64 bit value.
Definition MathExtras.h:155
bool isa(const From &Val)
isa<X> - Return true if the parameter to the template is an instance of one of the template type argu...
Definition Casting.h:547
LLVM_ATTRIBUTE_VISIBILITY_DEFAULT AnalysisKey InnerAnalysisManagerProxy< AnalysisManagerT, IRUnitT, ExtraArgTs... >::Key
FunctionAddr VTableAddr uintptr_t uintptr_t Data
Definition InstrProf.h:221
FunctionPass * createAMDGPUISelDag(TargetMachine &TM, CodeGenOptLevel OptLevel)
This pass converts a legalized DAG into a AMDGPU-specific.
@ SMax
Signed integer max implemented in terms of select(cmp()).
@ And
Bitwise or logical AND of integers.
@ Sub
Subtraction of integers.
@ Add
Sum of integers.
DWARFExpression::Operation Op
unsigned M0(unsigned Val)
Definition VE.h:376
LLVM_ABI ConstantSDNode * isConstOrConstSplat(SDValue N, bool AllowUndefs=false, bool AllowTruncation=false)
Returns the SDNode if it is a constant splat BuildVector or constant int.
decltype(auto) cast(const From &Val)
cast<X> - Return the argument parameter cast to the specified type.
Definition Casting.h:559
constexpr T maskTrailingOnes(unsigned N)
Create a bitmask with the N right-most bits set to 1, and all other bits set to 0.
Definition MathExtras.h:77
Implement std::hash so that hash_code can be used in STL containers.
Definition BitVector.h:870
#define N
Extended Value Type.
Definition ValueTypes.h:35
TypeSize getSizeInBits() const
Return the size of the specified value type in bits.
Definition ValueTypes.h:381
uint64_t getScalarSizeInBits() const
Definition ValueTypes.h:393
MVT getSimpleVT() const
Return the SimpleValueType held in the specified simple EVT.
Definition ValueTypes.h:324
bool bitsEq(EVT VT) const
Return true if this has the same number of bits as VT.
Definition ValueTypes.h:264
EVT getVectorElementType() const
Given a vector type, return the type of each element.
Definition ValueTypes.h:336
bool isScalarInteger() const
Return true if this is an integer, but not a vector.
Definition ValueTypes.h:165
unsigned getVectorNumElements() const
Given a vector type, return the number of elements it contains.
Definition ValueTypes.h:344
static KnownBits makeConstant(const APInt &C)
Create known bits from a known constant.
Definition KnownBits.h:317
static KnownBits add(const KnownBits &LHS, const KnownBits &RHS, bool NSW=false, bool NUW=false, bool SelfAdd=false)
Compute knownbits resulting from addition of LHS and RHS.
Definition KnownBits.h:363
APInt getMaxValue() const
Return the maximal unsigned value possible given these KnownBits.
Definition KnownBits.h:148
APInt getMinValue() const
Return the minimal unsigned value possible given these KnownBits.
Definition KnownBits.h:132
static unsigned getSubRegFromChannel(unsigned Channel)
bool hasNoUnsignedWrap() const
This represents a list of ValueType's that has been intern'd by a SelectionDAG.