LLVM 23.0.0git
AMDGPUISelDAGToDAG.cpp
Go to the documentation of this file.
1//===-- AMDGPUISelDAGToDAG.cpp - A dag to dag inst selector for AMDGPU ----===//
2//
3// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4// See https://llvm.org/LICENSE.txt for license information.
5// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6//
7//==-----------------------------------------------------------------------===//
8//
9/// \file
10/// Defines an instruction selector for the AMDGPU target.
11//
12//===----------------------------------------------------------------------===//
13
14#include "AMDGPUISelDAGToDAG.h"
15#include "AMDGPU.h"
16#include "AMDGPUInstrInfo.h"
17#include "AMDGPUSubtarget.h"
18#include "AMDGPUTargetMachine.h"
21#include "R600RegisterInfo.h"
22#include "SIISelLowering.h"
29#include "llvm/IR/IntrinsicsAMDGPU.h"
32
33#ifdef EXPENSIVE_CHECKS
35#include "llvm/IR/Dominators.h"
36#endif
37
38#define DEBUG_TYPE "amdgpu-isel"
39
40using namespace llvm;
41
42//===----------------------------------------------------------------------===//
43// Instruction Selector Implementation
44//===----------------------------------------------------------------------===//
45
46namespace {
47static SDValue stripBitcast(SDValue Val) {
48 return Val.getOpcode() == ISD::BITCAST ? Val.getOperand(0) : Val;
49}
50
51// Figure out if this is really an extract of the high 16-bits of a dword.
52static bool isExtractHiElt(SDValue In, SDValue &Out) {
53 In = stripBitcast(In);
54
55 if (In.getOpcode() == ISD::EXTRACT_VECTOR_ELT) {
56 if (ConstantSDNode *Idx = dyn_cast<ConstantSDNode>(In.getOperand(1))) {
57 if (!Idx->isOne())
58 return false;
59 Out = In.getOperand(0);
60 return true;
61 }
62 }
63
64 if (In.getOpcode() != ISD::TRUNCATE)
65 return false;
66
67 SDValue Srl = In.getOperand(0);
68 if (Srl.getOpcode() == ISD::SRL) {
69 if (ConstantSDNode *ShiftAmt = dyn_cast<ConstantSDNode>(Srl.getOperand(1))) {
70 if (ShiftAmt->getZExtValue() == 16) {
71 Out = stripBitcast(Srl.getOperand(0));
72 return true;
73 }
74 }
75 }
76
77 return false;
78}
79
80static SDValue createVOP3PSrc32FromLo16(SDValue Lo, SDValue Src,
81 llvm::SelectionDAG *CurDAG,
82 const GCNSubtarget *Subtarget) {
83 if (!Subtarget->useRealTrue16Insts()) {
84 return Lo;
85 }
86
87 SDValue NewSrc;
88 SDLoc SL(Lo);
89
90 if (Lo->isDivergent()) {
91 SDValue Undef = SDValue(CurDAG->getMachineNode(TargetOpcode::IMPLICIT_DEF,
92 SL, Lo.getValueType()),
93 0);
94 const SDValue Ops[] = {
95 CurDAG->getTargetConstant(AMDGPU::VGPR_32RegClassID, SL, MVT::i32), Lo,
96 CurDAG->getTargetConstant(AMDGPU::lo16, SL, MVT::i16), Undef,
97 CurDAG->getTargetConstant(AMDGPU::hi16, SL, MVT::i16)};
98
99 NewSrc = SDValue(CurDAG->getMachineNode(TargetOpcode::REG_SEQUENCE, SL,
100 Src.getValueType(), Ops),
101 0);
102 } else {
103 // the S_MOV is needed since the Lo could still be a VGPR16.
104 // With S_MOV, isel insert a "sgpr32 = copy vgpr16" and we reply on
105 // the fixvgpr2sgprcopy pass to legalize it
106 NewSrc = SDValue(
107 CurDAG->getMachineNode(AMDGPU::S_MOV_B32, SL, Src.getValueType(), Lo),
108 0);
109 }
110
111 return NewSrc;
112}
113
114// Look through operations that obscure just looking at the low 16-bits of the
115// same register.
116static SDValue stripExtractLoElt(SDValue In) {
117 if (In.getOpcode() == ISD::EXTRACT_VECTOR_ELT) {
118 SDValue Idx = In.getOperand(1);
119 if (isNullConstant(Idx) && In.getValueSizeInBits() <= 32)
120 return In.getOperand(0);
121 }
122
123 if (In.getOpcode() == ISD::TRUNCATE) {
124 SDValue Src = In.getOperand(0);
125 if (Src.getValueType().getSizeInBits() == 32)
126 return stripBitcast(Src);
127 }
128
129 return In;
130}
131
132static SDValue emitRegSequence(llvm::SelectionDAG &CurDAG, unsigned DstRegClass,
133 EVT DstTy, ArrayRef<SDValue> Elts,
134 ArrayRef<unsigned> SubRegClass,
135 const SDLoc &DL) {
136 assert(Elts.size() == SubRegClass.size() && "array size mismatch");
137 unsigned NumElts = Elts.size();
138 SmallVector<SDValue, 17> Ops(2 * NumElts + 1);
139 Ops[0] = (CurDAG.getTargetConstant(DstRegClass, DL, MVT::i32));
140 for (unsigned i = 0; i < NumElts; ++i) {
141 Ops[2 * i + 1] = Elts[i];
142 Ops[2 * i + 2] = CurDAG.getTargetConstant(SubRegClass[i], DL, MVT::i32);
143 }
144 return SDValue(
145 CurDAG.getMachineNode(TargetOpcode::REG_SEQUENCE, DL, DstTy, Ops), 0);
146}
147
148} // end anonymous namespace
149
151 "AMDGPU DAG->DAG Pattern Instruction Selection", false,
152 false)
153INITIALIZE_PASS_DEPENDENCY(AMDGPUPerfHintAnalysisLegacy)
155#ifdef EXPENSIVE_CHECKS
158#endif
160 "AMDGPU DAG->DAG Pattern Instruction Selection", false,
161 false)
162
163/// This pass converts a legalized DAG into a AMDGPU-specific
164// DAG, ready for instruction scheduling.
166 CodeGenOptLevel OptLevel) {
167 return new AMDGPUDAGToDAGISelLegacy(TM, OptLevel);
168}
169
173
175 Subtarget = &MF.getSubtarget<GCNSubtarget>();
176 Subtarget->checkSubtargetFeatures(MF.getFunction());
177 Mode = SIModeRegisterDefaults(MF.getFunction(), *Subtarget);
179}
180
181bool AMDGPUDAGToDAGISel::fp16SrcZerosHighBits(unsigned Opc) const {
182 // XXX - only need to list legal operations.
183 switch (Opc) {
184 case ISD::FADD:
185 case ISD::FSUB:
186 case ISD::FMUL:
187 case ISD::FDIV:
188 case ISD::FREM:
190 case ISD::UINT_TO_FP:
191 case ISD::SINT_TO_FP:
192 case ISD::FABS:
193 // Fabs is lowered to a bit operation, but it's an and which will clear the
194 // high bits anyway.
195 case ISD::FSQRT:
196 case ISD::FSIN:
197 case ISD::FCOS:
198 case ISD::FPOWI:
199 case ISD::FPOW:
200 case ISD::FLOG:
201 case ISD::FLOG2:
202 case ISD::FLOG10:
203 case ISD::FEXP:
204 case ISD::FEXP2:
205 case ISD::FCEIL:
206 case ISD::FTRUNC:
207 case ISD::FRINT:
208 case ISD::FNEARBYINT:
209 case ISD::FROUNDEVEN:
210 case ISD::FROUND:
211 case ISD::FFLOOR:
212 case ISD::FMINNUM:
213 case ISD::FMAXNUM:
214 case ISD::FLDEXP:
215 case AMDGPUISD::FRACT:
216 case AMDGPUISD::CLAMP:
217 case AMDGPUISD::COS_HW:
218 case AMDGPUISD::SIN_HW:
219 case AMDGPUISD::FMIN3:
220 case AMDGPUISD::FMAX3:
221 case AMDGPUISD::FMED3:
222 case AMDGPUISD::FMAD_FTZ:
223 case AMDGPUISD::RCP:
224 case AMDGPUISD::RSQ:
225 case AMDGPUISD::RCP_IFLAG:
226 // On gfx10, all 16-bit instructions preserve the high bits.
227 return Subtarget->getGeneration() <= AMDGPUSubtarget::GFX9;
228 case ISD::FP_ROUND:
229 // We may select fptrunc (fma/mad) to mad_mixlo, which does not zero the
230 // high bits on gfx9.
231 // TODO: If we had the source node we could see if the source was fma/mad
233 case ISD::FMA:
234 case ISD::FMAD:
235 case AMDGPUISD::DIV_FIXUP:
237 default:
238 // fcopysign, select and others may be lowered to 32-bit bit operations
239 // which don't zero the high bits.
240 return false;
241 }
242}
243
245#ifdef EXPENSIVE_CHECKS
247 LoopInfo *LI = &getAnalysis<LoopInfoWrapperPass>().getLoopInfo();
248 for (auto &L : LI->getLoopsInPreorder()) {
249 assert(L->isLCSSAForm(DT));
250 }
251#endif
253}
254
263
265 assert(Subtarget->d16PreservesUnusedBits());
266 MVT VT = N->getValueType(0).getSimpleVT();
267 if (VT != MVT::v2i16 && VT != MVT::v2f16)
268 return false;
269
270 SDValue Lo = N->getOperand(0);
271 SDValue Hi = N->getOperand(1);
272
273 LoadSDNode *LdHi = dyn_cast<LoadSDNode>(stripBitcast(Hi));
274
275 // build_vector lo, (load ptr) -> load_d16_hi ptr, lo
276 // build_vector lo, (zextload ptr from i8) -> load_d16_hi_u8 ptr, lo
277 // build_vector lo, (sextload ptr from i8) -> load_d16_hi_i8 ptr, lo
278
279 // Need to check for possible indirect dependencies on the other half of the
280 // vector to avoid introducing a cycle.
281 if (LdHi && Hi.hasOneUse() && !LdHi->isPredecessorOf(Lo.getNode())) {
282 SDVTList VTList = CurDAG->getVTList(VT, MVT::Other);
283
284 SDValue TiedIn = CurDAG->getNode(ISD::SCALAR_TO_VECTOR, SDLoc(N), VT, Lo);
285 SDValue Ops[] = {
286 LdHi->getChain(), LdHi->getBasePtr(), TiedIn
287 };
288
289 unsigned LoadOp = AMDGPUISD::LOAD_D16_HI;
290 if (LdHi->getMemoryVT() == MVT::i8) {
291 LoadOp = LdHi->getExtensionType() == ISD::SEXTLOAD ?
292 AMDGPUISD::LOAD_D16_HI_I8 : AMDGPUISD::LOAD_D16_HI_U8;
293 } else {
294 assert(LdHi->getMemoryVT() == MVT::i16);
295 }
296
297 SDValue NewLoadHi =
298 CurDAG->getMemIntrinsicNode(LoadOp, SDLoc(LdHi), VTList,
299 Ops, LdHi->getMemoryVT(),
300 LdHi->getMemOperand());
301
302 CurDAG->ReplaceAllUsesOfValueWith(SDValue(N, 0), NewLoadHi);
303 CurDAG->ReplaceAllUsesOfValueWith(SDValue(LdHi, 1), NewLoadHi.getValue(1));
304 return true;
305 }
306
307 // build_vector (load ptr), hi -> load_d16_lo ptr, hi
308 // build_vector (zextload ptr from i8), hi -> load_d16_lo_u8 ptr, hi
309 // build_vector (sextload ptr from i8), hi -> load_d16_lo_i8 ptr, hi
310 LoadSDNode *LdLo = dyn_cast<LoadSDNode>(stripBitcast(Lo));
311 if (LdLo && Lo.hasOneUse()) {
312 SDValue TiedIn = getHi16Elt(Hi);
313 if (!TiedIn || LdLo->isPredecessorOf(TiedIn.getNode()))
314 return false;
315
316 SDVTList VTList = CurDAG->getVTList(VT, MVT::Other);
317 unsigned LoadOp = AMDGPUISD::LOAD_D16_LO;
318 if (LdLo->getMemoryVT() == MVT::i8) {
319 LoadOp = LdLo->getExtensionType() == ISD::SEXTLOAD ?
320 AMDGPUISD::LOAD_D16_LO_I8 : AMDGPUISD::LOAD_D16_LO_U8;
321 } else {
322 assert(LdLo->getMemoryVT() == MVT::i16);
323 }
324
325 TiedIn = CurDAG->getNode(ISD::BITCAST, SDLoc(N), VT, TiedIn);
326
327 SDValue Ops[] = {
328 LdLo->getChain(), LdLo->getBasePtr(), TiedIn
329 };
330
331 SDValue NewLoadLo =
332 CurDAG->getMemIntrinsicNode(LoadOp, SDLoc(LdLo), VTList,
333 Ops, LdLo->getMemoryVT(),
334 LdLo->getMemOperand());
335
336 CurDAG->ReplaceAllUsesOfValueWith(SDValue(N, 0), NewLoadLo);
337 CurDAG->ReplaceAllUsesOfValueWith(SDValue(LdLo, 1), NewLoadLo.getValue(1));
338 return true;
339 }
340
341 return false;
342}
343
345 if (!Subtarget->d16PreservesUnusedBits())
346 return;
347
348 SelectionDAG::allnodes_iterator Position = CurDAG->allnodes_end();
349
350 bool MadeChange = false;
351 while (Position != CurDAG->allnodes_begin()) {
352 SDNode *N = &*--Position;
353 if (N->use_empty())
354 continue;
355
356 switch (N->getOpcode()) {
358 // TODO: Match load d16 from shl (extload:i16), 16
359 MadeChange |= matchLoadD16FromBuildVector(N);
360 break;
361 default:
362 break;
363 }
364 }
365
366 if (MadeChange) {
367 CurDAG->RemoveDeadNodes();
368 LLVM_DEBUG(dbgs() << "After PreProcess:\n";
369 CurDAG->dump(););
370 }
371}
372
373bool AMDGPUDAGToDAGISel::isInlineImmediate(const SDNode *N) const {
374 if (N->isUndef())
375 return true;
376
377 const SIInstrInfo *TII = Subtarget->getInstrInfo();
379 return TII->isInlineConstant(C->getAPIntValue());
380
382 return TII->isInlineConstant(C->getValueAPF());
383
384 return false;
385}
386
387/// Determine the register class for \p OpNo
388/// \returns The register class of the virtual register that will be used for
389/// the given operand number \OpNo or NULL if the register class cannot be
390/// determined.
391const TargetRegisterClass *AMDGPUDAGToDAGISel::getOperandRegClass(SDNode *N,
392 unsigned OpNo) const {
393 if (!N->isMachineOpcode()) {
394 if (N->getOpcode() == ISD::CopyToReg) {
395 Register Reg = cast<RegisterSDNode>(N->getOperand(1))->getReg();
396 if (Reg.isVirtual()) {
398 return MRI.getRegClass(Reg);
399 }
400
401 const SIRegisterInfo *TRI = Subtarget->getRegisterInfo();
402 return TRI->getPhysRegBaseClass(Reg);
403 }
404
405 return nullptr;
406 }
407
408 switch (N->getMachineOpcode()) {
409 default: {
410 const SIInstrInfo *TII = Subtarget->getInstrInfo();
411 const MCInstrDesc &Desc = TII->get(N->getMachineOpcode());
412 unsigned OpIdx = Desc.getNumDefs() + OpNo;
413 if (OpIdx >= Desc.getNumOperands())
414 return nullptr;
415
416 int16_t RegClass = TII->getOpRegClassID(Desc.operands()[OpIdx]);
417 if (RegClass == -1)
418 return nullptr;
419
420 return Subtarget->getRegisterInfo()->getRegClass(RegClass);
421 }
422 case AMDGPU::REG_SEQUENCE: {
423 unsigned RCID = N->getConstantOperandVal(0);
424 const TargetRegisterClass *SuperRC =
425 Subtarget->getRegisterInfo()->getRegClass(RCID);
426
427 SDValue SubRegOp = N->getOperand(OpNo + 1);
428 unsigned SubRegIdx = SubRegOp->getAsZExtVal();
429 return Subtarget->getRegisterInfo()->getSubClassWithSubReg(SuperRC,
430 SubRegIdx);
431 }
432 }
433}
434
435SDNode *AMDGPUDAGToDAGISel::glueCopyToOp(SDNode *N, SDValue NewChain,
436 SDValue Glue) const {
438 Ops.push_back(NewChain); // Replace the chain.
439 for (unsigned i = 1, e = N->getNumOperands(); i != e; ++i)
440 Ops.push_back(N->getOperand(i));
441
442 Ops.push_back(Glue);
443 return CurDAG->MorphNodeTo(N, N->getOpcode(), N->getVTList(), Ops);
444}
445
446SDNode *AMDGPUDAGToDAGISel::glueCopyToM0(SDNode *N, SDValue Val) const {
447 const SITargetLowering& Lowering =
448 *static_cast<const SITargetLowering*>(getTargetLowering());
449
450 assert(N->getOperand(0).getValueType() == MVT::Other && "Expected chain");
451
452 SDValue M0 = Lowering.copyToM0(*CurDAG, N->getOperand(0), SDLoc(N), Val);
453 return glueCopyToOp(N, M0, M0.getValue(1));
454}
455
456SDNode *AMDGPUDAGToDAGISel::glueCopyToM0LDSInit(SDNode *N) const {
457 unsigned AS = cast<MemSDNode>(N)->getAddressSpace();
458 if (AS == AMDGPUAS::LOCAL_ADDRESS) {
459 if (Subtarget->ldsRequiresM0Init())
460 return glueCopyToM0(
461 N, CurDAG->getSignedTargetConstant(-1, SDLoc(N), MVT::i32));
462 } else if (AS == AMDGPUAS::REGION_ADDRESS) {
463 MachineFunction &MF = CurDAG->getMachineFunction();
464 unsigned Value = MF.getInfo<SIMachineFunctionInfo>()->getGDSSize();
465 return
466 glueCopyToM0(N, CurDAG->getTargetConstant(Value, SDLoc(N), MVT::i32));
467 }
468 return N;
469}
470
471MachineSDNode *AMDGPUDAGToDAGISel::buildSMovImm64(SDLoc &DL, uint64_t Imm,
472 EVT VT) const {
473 SDNode *Lo = CurDAG->getMachineNode(
474 AMDGPU::S_MOV_B32, DL, MVT::i32,
475 CurDAG->getTargetConstant(Lo_32(Imm), DL, MVT::i32));
476 SDNode *Hi = CurDAG->getMachineNode(
477 AMDGPU::S_MOV_B32, DL, MVT::i32,
478 CurDAG->getTargetConstant(Hi_32(Imm), DL, MVT::i32));
479 const SDValue Ops[] = {
480 CurDAG->getTargetConstant(AMDGPU::SReg_64RegClassID, DL, MVT::i32),
481 SDValue(Lo, 0), CurDAG->getTargetConstant(AMDGPU::sub0, DL, MVT::i32),
482 SDValue(Hi, 0), CurDAG->getTargetConstant(AMDGPU::sub1, DL, MVT::i32)};
483
484 return CurDAG->getMachineNode(TargetOpcode::REG_SEQUENCE, DL, VT, Ops);
485}
486
487SDNode *AMDGPUDAGToDAGISel::packConstantV2I16(const SDNode *N,
488 SelectionDAG &DAG) const {
489 // TODO: Handle undef as zero
490
491 assert(N->getOpcode() == ISD::BUILD_VECTOR && N->getNumOperands() == 2);
492 uint32_t LHSVal, RHSVal;
493 if (getConstantValue(N->getOperand(0), LHSVal) &&
494 getConstantValue(N->getOperand(1), RHSVal)) {
495 SDLoc SL(N);
496 uint32_t K = (LHSVal & 0xffff) | (RHSVal << 16);
497 return DAG.getMachineNode(
498 isVGPRImm(N) ? AMDGPU::V_MOV_B32_e32 : AMDGPU::S_MOV_B32, SL,
499 N->getValueType(0), DAG.getTargetConstant(K, SL, MVT::i32));
500 }
501
502 return nullptr;
503}
504
505void AMDGPUDAGToDAGISel::SelectBuildVector(SDNode *N, unsigned RegClassID) {
506 EVT VT = N->getValueType(0);
507 unsigned NumVectorElts = VT.getVectorNumElements();
508 EVT EltVT = VT.getVectorElementType();
509 SDLoc DL(N);
510 SDValue RegClass = CurDAG->getTargetConstant(RegClassID, DL, MVT::i32);
511
512 if (NumVectorElts == 1) {
513 CurDAG->SelectNodeTo(N, AMDGPU::COPY_TO_REGCLASS, EltVT, N->getOperand(0),
514 RegClass);
515 return;
516 }
517
518 bool IsGCN = CurDAG->getSubtarget().getTargetTriple().isAMDGCN();
519 if (IsGCN && Subtarget->has64BitLiterals() && VT.getSizeInBits() == 64 &&
520 CurDAG->isConstantValueOfAnyType(SDValue(N, 0))) {
521 uint64_t C = 0;
522 bool AllConst = true;
523 unsigned EltSize = EltVT.getSizeInBits();
524 for (unsigned I = 0; I < NumVectorElts; ++I) {
525 SDValue Op = N->getOperand(I);
526 if (Op.isUndef()) {
527 AllConst = false;
528 break;
529 }
530 uint64_t Val;
532 Val = CF->getValueAPF().bitcastToAPInt().getZExtValue();
533 } else
534 Val = cast<ConstantSDNode>(Op)->getZExtValue();
535 C |= Val << (EltSize * I);
536 }
537 if (AllConst) {
538 SDValue CV = CurDAG->getTargetConstant(C, DL, MVT::i64);
539 MachineSDNode *Copy =
540 CurDAG->getMachineNode(AMDGPU::S_MOV_B64_IMM_PSEUDO, DL, VT, CV);
541 CurDAG->SelectNodeTo(N, AMDGPU::COPY_TO_REGCLASS, VT, SDValue(Copy, 0),
542 RegClass);
543 return;
544 }
545 }
546
547 assert(NumVectorElts <= 32 && "Vectors with more than 32 elements not "
548 "supported yet");
549 // 32 = Max Num Vector Elements
550 // 2 = 2 REG_SEQUENCE operands per element (value, subreg index)
551 // 1 = Vector Register Class
552 SmallVector<SDValue, 32 * 2 + 1> RegSeqArgs(NumVectorElts * 2 + 1);
553
554 RegSeqArgs[0] = CurDAG->getTargetConstant(RegClassID, DL, MVT::i32);
555 bool IsRegSeq = true;
556 unsigned NOps = N->getNumOperands();
557 for (unsigned i = 0; i < NOps; i++) {
558 // XXX: Why is this here?
559 if (isa<RegisterSDNode>(N->getOperand(i))) {
560 IsRegSeq = false;
561 break;
562 }
563 unsigned Sub = IsGCN ? SIRegisterInfo::getSubRegFromChannel(i)
565 RegSeqArgs[1 + (2 * i)] = N->getOperand(i);
566 RegSeqArgs[1 + (2 * i) + 1] = CurDAG->getTargetConstant(Sub, DL, MVT::i32);
567 }
568 if (NOps != NumVectorElts) {
569 // Fill in the missing undef elements if this was a scalar_to_vector.
570 assert(N->getOpcode() == ISD::SCALAR_TO_VECTOR && NOps < NumVectorElts);
571 MachineSDNode *ImpDef = CurDAG->getMachineNode(TargetOpcode::IMPLICIT_DEF,
572 DL, EltVT);
573 for (unsigned i = NOps; i < NumVectorElts; ++i) {
574 unsigned Sub = IsGCN ? SIRegisterInfo::getSubRegFromChannel(i)
576 RegSeqArgs[1 + (2 * i)] = SDValue(ImpDef, 0);
577 RegSeqArgs[1 + (2 * i) + 1] =
578 CurDAG->getTargetConstant(Sub, DL, MVT::i32);
579 }
580 }
581
582 if (!IsRegSeq)
583 SelectCode(N);
584 CurDAG->SelectNodeTo(N, AMDGPU::REG_SEQUENCE, N->getVTList(), RegSeqArgs);
585}
586
588 EVT VT = N->getValueType(0);
589 EVT EltVT = VT.getVectorElementType();
590
591 // TODO: Handle 16-bit element vectors with even aligned masks.
592 if (!Subtarget->hasPkMovB32() || !EltVT.bitsEq(MVT::i32) ||
593 VT.getVectorNumElements() != 2) {
594 SelectCode(N);
595 return;
596 }
597
598 auto *SVN = cast<ShuffleVectorSDNode>(N);
599
600 SDValue Src0 = SVN->getOperand(0);
601 SDValue Src1 = SVN->getOperand(1);
602 ArrayRef<int> Mask = SVN->getMask();
603 SDLoc DL(N);
604
605 assert(Src0.getValueType().getVectorNumElements() == 2 && Mask.size() == 2 &&
606 Mask[0] < 4 && Mask[1] < 4);
607
608 SDValue VSrc0 = Mask[0] < 2 ? Src0 : Src1;
609 SDValue VSrc1 = Mask[1] < 2 ? Src0 : Src1;
610 unsigned Src0SubReg = Mask[0] & 1 ? AMDGPU::sub1 : AMDGPU::sub0;
611 unsigned Src1SubReg = Mask[1] & 1 ? AMDGPU::sub1 : AMDGPU::sub0;
612
613 if (Mask[0] < 0) {
614 Src0SubReg = Src1SubReg;
615 MachineSDNode *ImpDef =
616 CurDAG->getMachineNode(TargetOpcode::IMPLICIT_DEF, DL, VT);
617 VSrc0 = SDValue(ImpDef, 0);
618 }
619
620 if (Mask[1] < 0) {
621 Src1SubReg = Src0SubReg;
622 MachineSDNode *ImpDef =
623 CurDAG->getMachineNode(TargetOpcode::IMPLICIT_DEF, DL, VT);
624 VSrc1 = SDValue(ImpDef, 0);
625 }
626
627 // SGPR case needs to lower to copies.
628 //
629 // Also use subregister extract when we can directly blend the registers with
630 // a simple subregister copy.
631 //
632 // TODO: Maybe we should fold this out earlier
633 if (N->isDivergent() && Src0SubReg == AMDGPU::sub1 &&
634 Src1SubReg == AMDGPU::sub0) {
635 // The low element of the result always comes from src0.
636 // The high element of the result always comes from src1.
637 // op_sel selects the high half of src0.
638 // op_sel_hi selects the high half of src1.
639
640 unsigned Src0OpSel =
641 Src0SubReg == AMDGPU::sub1 ? SISrcMods::OP_SEL_0 : SISrcMods::NONE;
642 unsigned Src1OpSel =
643 Src1SubReg == AMDGPU::sub1 ? SISrcMods::OP_SEL_0 : SISrcMods::NONE;
644
645 // Enable op_sel_hi to avoid printing it. This should have no effect on the
646 // result.
647 Src0OpSel |= SISrcMods::OP_SEL_1;
648 Src1OpSel |= SISrcMods::OP_SEL_1;
649
650 SDValue Src0OpSelVal = CurDAG->getTargetConstant(Src0OpSel, DL, MVT::i32);
651 SDValue Src1OpSelVal = CurDAG->getTargetConstant(Src1OpSel, DL, MVT::i32);
652 SDValue ZeroMods = CurDAG->getTargetConstant(0, DL, MVT::i32);
653
654 CurDAG->SelectNodeTo(N, AMDGPU::V_PK_MOV_B32, N->getVTList(),
655 {Src0OpSelVal, VSrc0, Src1OpSelVal, VSrc1,
656 ZeroMods, // clamp
657 ZeroMods, // op_sel
658 ZeroMods, // op_sel_hi
659 ZeroMods, // neg_lo
660 ZeroMods}); // neg_hi
661 return;
662 }
663
664 SDValue ResultElt0 =
665 CurDAG->getTargetExtractSubreg(Src0SubReg, DL, EltVT, VSrc0);
666 SDValue ResultElt1 =
667 CurDAG->getTargetExtractSubreg(Src1SubReg, DL, EltVT, VSrc1);
668
669 const SDValue Ops[] = {
670 CurDAG->getTargetConstant(AMDGPU::SReg_64RegClassID, DL, MVT::i32),
671 ResultElt0, CurDAG->getTargetConstant(AMDGPU::sub0, DL, MVT::i32),
672 ResultElt1, CurDAG->getTargetConstant(AMDGPU::sub1, DL, MVT::i32)};
673 CurDAG->SelectNodeTo(N, TargetOpcode::REG_SEQUENCE, VT, Ops);
674}
675
677 unsigned int Opc = N->getOpcode();
678 if (N->isMachineOpcode()) {
679 N->setNodeId(-1);
680 return; // Already selected.
681 }
682
683 // isa<MemSDNode> almost works but is slightly too permissive for some DS
684 // intrinsics.
685 if (Opc == ISD::LOAD || Opc == ISD::STORE || isa<AtomicSDNode>(N)) {
686 N = glueCopyToM0LDSInit(N);
687 SelectCode(N);
688 return;
689 }
690
691 switch (Opc) {
692 default:
693 break;
694 // We are selecting i64 ADD here instead of custom lower it during
695 // DAG legalization, so we can fold some i64 ADDs used for address
696 // calculation into the LOAD and STORE instructions.
697 case ISD::ADDC:
698 case ISD::ADDE:
699 case ISD::SUBC:
700 case ISD::SUBE: {
701 if (N->getValueType(0) != MVT::i64)
702 break;
703
704 SelectADD_SUB_I64(N);
705 return;
706 }
707 case ISD::UADDO_CARRY:
708 case ISD::USUBO_CARRY:
709 if (N->getValueType(0) != MVT::i32)
710 break;
711
712 SelectAddcSubb(N);
713 return;
714 case ISD::UADDO:
715 case ISD::USUBO: {
716 SelectUADDO_USUBO(N);
717 return;
718 }
719 case AMDGPUISD::FMUL_W_CHAIN: {
720 SelectFMUL_W_CHAIN(N);
721 return;
722 }
723 case AMDGPUISD::FMA_W_CHAIN: {
724 SelectFMA_W_CHAIN(N);
725 return;
726 }
727
729 case ISD::BUILD_VECTOR: {
730 EVT VT = N->getValueType(0);
731 unsigned NumVectorElts = VT.getVectorNumElements();
732 if (VT.getScalarSizeInBits() == 16) {
733 if (Opc == ISD::BUILD_VECTOR && NumVectorElts == 2) {
734 if (SDNode *Packed = packConstantV2I16(N, *CurDAG)) {
735 ReplaceNode(N, Packed);
736 return;
737 }
738 }
739
740 break;
741 }
742
743 const SIRegisterInfo *TRI = Subtarget->getRegisterInfo();
744 assert(VT.getVectorElementType().bitsEq(MVT::i32));
745 const TargetRegisterClass *RegClass =
746 N->isDivergent()
747 ? TRI->getDefaultVectorSuperClassForBitWidth(NumVectorElts * 32)
748 : SIRegisterInfo::getSGPRClassForBitWidth(NumVectorElts * 32);
749
750 SelectBuildVector(N, RegClass->getID());
751 return;
752 }
755 return;
756 case ISD::BUILD_PAIR: {
757 SDValue RC, SubReg0, SubReg1;
758 SDLoc DL(N);
759 if (N->getValueType(0) == MVT::i128) {
760 RC = CurDAG->getTargetConstant(AMDGPU::SGPR_128RegClassID, DL, MVT::i32);
761 SubReg0 = CurDAG->getTargetConstant(AMDGPU::sub0_sub1, DL, MVT::i32);
762 SubReg1 = CurDAG->getTargetConstant(AMDGPU::sub2_sub3, DL, MVT::i32);
763 } else if (N->getValueType(0) == MVT::i64) {
764 RC = CurDAG->getTargetConstant(AMDGPU::SReg_64RegClassID, DL, MVT::i32);
765 SubReg0 = CurDAG->getTargetConstant(AMDGPU::sub0, DL, MVT::i32);
766 SubReg1 = CurDAG->getTargetConstant(AMDGPU::sub1, DL, MVT::i32);
767 } else {
768 llvm_unreachable("Unhandled value type for BUILD_PAIR");
769 }
770 const SDValue Ops[] = { RC, N->getOperand(0), SubReg0,
771 N->getOperand(1), SubReg1 };
772 ReplaceNode(N, CurDAG->getMachineNode(TargetOpcode::REG_SEQUENCE, DL,
773 N->getValueType(0), Ops));
774 return;
775 }
776
777 case ISD::Constant:
778 case ISD::ConstantFP: {
779 if (N->getValueType(0).getSizeInBits() != 64 || isInlineImmediate(N) ||
780 Subtarget->has64BitLiterals())
781 break;
782
783 uint64_t Imm;
785 Imm = FP->getValueAPF().bitcastToAPInt().getZExtValue();
786 if (AMDGPU::isValid32BitLiteral(Imm, true))
787 break;
788 } else {
790 Imm = C->getZExtValue();
791 if (AMDGPU::isValid32BitLiteral(Imm, false))
792 break;
793 }
794
795 SDLoc DL(N);
796 ReplaceNode(N, buildSMovImm64(DL, Imm, N->getValueType(0)));
797 return;
798 }
799 case AMDGPUISD::BFE_I32:
800 case AMDGPUISD::BFE_U32: {
801 // There is a scalar version available, but unlike the vector version which
802 // has a separate operand for the offset and width, the scalar version packs
803 // the width and offset into a single operand. Try to move to the scalar
804 // version if the offsets are constant, so that we can try to keep extended
805 // loads of kernel arguments in SGPRs.
806
807 // TODO: Technically we could try to pattern match scalar bitshifts of
808 // dynamic values, but it's probably not useful.
810 if (!Offset)
811 break;
812
813 ConstantSDNode *Width = dyn_cast<ConstantSDNode>(N->getOperand(2));
814 if (!Width)
815 break;
816
817 bool Signed = Opc == AMDGPUISD::BFE_I32;
818
819 uint32_t OffsetVal = Offset->getZExtValue();
820 uint32_t WidthVal = Width->getZExtValue();
821
822 ReplaceNode(N, getBFE32(Signed, SDLoc(N), N->getOperand(0), OffsetVal,
823 WidthVal));
824 return;
825 }
826 case AMDGPUISD::DIV_SCALE: {
827 SelectDIV_SCALE(N);
828 return;
829 }
832 SelectMAD_64_32(N);
833 return;
834 }
835 case ISD::SMUL_LOHI:
836 case ISD::UMUL_LOHI:
837 return SelectMUL_LOHI(N);
838 case ISD::CopyToReg: {
840 *static_cast<const SITargetLowering*>(getTargetLowering());
841 N = Lowering.legalizeTargetIndependentNode(N, *CurDAG);
842 break;
843 }
844 case ISD::AND:
845 case ISD::SRL:
846 case ISD::SRA:
848 if (N->getValueType(0) != MVT::i32)
849 break;
850
851 SelectS_BFE(N);
852 return;
853 case ISD::BRCOND:
854 SelectBRCOND(N);
855 return;
856 case ISD::FP_EXTEND:
857 SelectFP_EXTEND(N);
858 return;
859 case AMDGPUISD::CVT_PKRTZ_F16_F32:
860 case AMDGPUISD::CVT_PKNORM_I16_F32:
861 case AMDGPUISD::CVT_PKNORM_U16_F32:
862 case AMDGPUISD::CVT_PK_U16_U32:
863 case AMDGPUISD::CVT_PK_I16_I32: {
864 // Hack around using a legal type if f16 is illegal.
865 if (N->getValueType(0) == MVT::i32) {
866 MVT NewVT = Opc == AMDGPUISD::CVT_PKRTZ_F16_F32 ? MVT::v2f16 : MVT::v2i16;
867 N = CurDAG->MorphNodeTo(N, N->getOpcode(), CurDAG->getVTList(NewVT),
868 { N->getOperand(0), N->getOperand(1) });
869 SelectCode(N);
870 return;
871 }
872
873 break;
874 }
876 SelectINTRINSIC_W_CHAIN(N);
877 return;
878 }
880 SelectINTRINSIC_WO_CHAIN(N);
881 return;
882 }
883 case ISD::INTRINSIC_VOID: {
884 SelectINTRINSIC_VOID(N);
885 return;
886 }
888 SelectWAVE_ADDRESS(N);
889 return;
890 }
891 case ISD::STACKRESTORE: {
892 SelectSTACKRESTORE(N);
893 return;
894 }
895 }
896
897 SelectCode(N);
898}
899
900bool AMDGPUDAGToDAGISel::isUniformBr(const SDNode *N) const {
901 const BasicBlock *BB = FuncInfo->MBB->getBasicBlock();
902 const Instruction *Term = BB->getTerminator();
903 return Term->getMetadata("amdgpu.uniform") ||
904 Term->getMetadata("structurizecfg.uniform");
905}
906
907bool AMDGPUDAGToDAGISel::isUnneededShiftMask(const SDNode *N,
908 unsigned ShAmtBits) const {
909 assert(N->getOpcode() == ISD::AND);
910
911 const APInt &RHS = N->getConstantOperandAPInt(1);
912 if (RHS.countr_one() >= ShAmtBits)
913 return true;
914
915 const APInt &LHSKnownZeros = CurDAG->computeKnownBits(N->getOperand(0)).Zero;
916 return (LHSKnownZeros | RHS).countr_one() >= ShAmtBits;
917}
918
920 SDValue &N0, SDValue &N1) {
921 if (Addr.getValueType() == MVT::i64 && Addr.getOpcode() == ISD::BITCAST &&
923 // As we split 64-bit `or` earlier, it's complicated pattern to match, i.e.
924 // (i64 (bitcast (v2i32 (build_vector
925 // (or (extract_vector_elt V, 0), OFFSET),
926 // (extract_vector_elt V, 1)))))
927 SDValue Lo = Addr.getOperand(0).getOperand(0);
928 if (Lo.getOpcode() == ISD::OR && DAG.isBaseWithConstantOffset(Lo)) {
929 SDValue BaseLo = Lo.getOperand(0);
930 SDValue BaseHi = Addr.getOperand(0).getOperand(1);
931 // Check that split base (Lo and Hi) are extracted from the same one.
932 if (BaseLo.getOpcode() == ISD::EXTRACT_VECTOR_ELT &&
934 BaseLo.getOperand(0) == BaseHi.getOperand(0) &&
935 // Lo is statically extracted from index 0.
936 isa<ConstantSDNode>(BaseLo.getOperand(1)) &&
937 BaseLo.getConstantOperandVal(1) == 0 &&
938 // Hi is statically extracted from index 0.
939 isa<ConstantSDNode>(BaseHi.getOperand(1)) &&
940 BaseHi.getConstantOperandVal(1) == 1) {
941 N0 = BaseLo.getOperand(0).getOperand(0);
942 N1 = Lo.getOperand(1);
943 return true;
944 }
945 }
946 }
947 return false;
948}
949
950bool AMDGPUDAGToDAGISel::isBaseWithConstantOffset64(SDValue Addr, SDValue &LHS,
951 SDValue &RHS) const {
952 if (CurDAG->isBaseWithConstantOffset(Addr)) {
953 LHS = Addr.getOperand(0);
954 RHS = Addr.getOperand(1);
955 return true;
956 }
957
960 return true;
961 }
962
963 return false;
964}
965
967 return "AMDGPU DAG->DAG Pattern Instruction Selection";
968}
969
973
977#ifdef EXPENSIVE_CHECKS
979 .getManager();
980 auto &F = MF.getFunction();
981 DominatorTree &DT = FAM.getResult<DominatorTreeAnalysis>(F);
982 LoopInfo &LI = FAM.getResult<LoopAnalysis>(F);
983 for (auto &L : LI.getLoopsInPreorder())
984 assert(L->isLCSSAForm(DT) && "Loop is not in LCSSA form!");
985#endif
986 return SelectionDAGISelPass::run(MF, MFAM);
987}
988
989//===----------------------------------------------------------------------===//
990// Complex Patterns
991//===----------------------------------------------------------------------===//
992
/// Complex-pattern hook that never matches on this selector: Base and Offset
/// are left untouched and selection falls through to other addressing
/// patterns.
bool AMDGPUDAGToDAGISel::SelectADDRVTX_READ(SDValue Addr, SDValue &Base,
                                            SDValue &Offset) {
  return false;
}
997
998bool AMDGPUDAGToDAGISel::SelectADDRIndirect(SDValue Addr, SDValue &Base,
999 SDValue &Offset) {
1001 SDLoc DL(Addr);
1002
1003 if ((C = dyn_cast<ConstantSDNode>(Addr))) {
1004 Base = CurDAG->getRegister(R600::INDIRECT_BASE_ADDR, MVT::i32);
1005 Offset = CurDAG->getTargetConstant(C->getZExtValue(), DL, MVT::i32);
1006 } else if ((Addr.getOpcode() == AMDGPUISD::DWORDADDR) &&
1007 (C = dyn_cast<ConstantSDNode>(Addr.getOperand(0)))) {
1008 Base = CurDAG->getRegister(R600::INDIRECT_BASE_ADDR, MVT::i32);
1009 Offset = CurDAG->getTargetConstant(C->getZExtValue(), DL, MVT::i32);
1010 } else if ((Addr.getOpcode() == ISD::ADD || Addr.getOpcode() == ISD::OR) &&
1011 (C = dyn_cast<ConstantSDNode>(Addr.getOperand(1)))) {
1012 Base = Addr.getOperand(0);
1013 Offset = CurDAG->getTargetConstant(C->getZExtValue(), DL, MVT::i32);
1014 } else {
1015 Base = Addr;
1016 Offset = CurDAG->getTargetConstant(0, DL, MVT::i32);
1017 }
1018
1019 return true;
1020}
1021
1022SDValue AMDGPUDAGToDAGISel::getMaterializedScalarImm32(int64_t Val,
1023 const SDLoc &DL) const {
1024 SDNode *Mov = CurDAG->getMachineNode(
1025 AMDGPU::S_MOV_B32, DL, MVT::i32,
1026 CurDAG->getTargetConstant(Val, DL, MVT::i32));
1027 return SDValue(Mov, 0);
1028}
1029
1030// FIXME: Should only handle uaddo_carry/usubo_carry
1031void AMDGPUDAGToDAGISel::SelectADD_SUB_I64(SDNode *N) {
1032 SDLoc DL(N);
1033 SDValue LHS = N->getOperand(0);
1034 SDValue RHS = N->getOperand(1);
1035
1036 unsigned Opcode = N->getOpcode();
1037 bool ConsumeCarry = (Opcode == ISD::ADDE || Opcode == ISD::SUBE);
1038 bool ProduceCarry =
1039 ConsumeCarry || Opcode == ISD::ADDC || Opcode == ISD::SUBC;
1040 bool IsAdd = Opcode == ISD::ADD || Opcode == ISD::ADDC || Opcode == ISD::ADDE;
1041
1042 SDValue Sub0 = CurDAG->getTargetConstant(AMDGPU::sub0, DL, MVT::i32);
1043 SDValue Sub1 = CurDAG->getTargetConstant(AMDGPU::sub1, DL, MVT::i32);
1044
1045 SDNode *Lo0 = CurDAG->getMachineNode(TargetOpcode::EXTRACT_SUBREG,
1046 DL, MVT::i32, LHS, Sub0);
1047 SDNode *Hi0 = CurDAG->getMachineNode(TargetOpcode::EXTRACT_SUBREG,
1048 DL, MVT::i32, LHS, Sub1);
1049
1050 SDNode *Lo1 = CurDAG->getMachineNode(TargetOpcode::EXTRACT_SUBREG,
1051 DL, MVT::i32, RHS, Sub0);
1052 SDNode *Hi1 = CurDAG->getMachineNode(TargetOpcode::EXTRACT_SUBREG,
1053 DL, MVT::i32, RHS, Sub1);
1054
1055 SDVTList VTList = CurDAG->getVTList(MVT::i32, MVT::Glue);
1056
1057 static const unsigned OpcMap[2][2][2] = {
1058 {{AMDGPU::S_SUB_U32, AMDGPU::S_ADD_U32},
1059 {AMDGPU::V_SUB_CO_U32_e32, AMDGPU::V_ADD_CO_U32_e32}},
1060 {{AMDGPU::S_SUBB_U32, AMDGPU::S_ADDC_U32},
1061 {AMDGPU::V_SUBB_U32_e32, AMDGPU::V_ADDC_U32_e32}}};
1062
1063 unsigned Opc = OpcMap[0][N->isDivergent()][IsAdd];
1064 unsigned CarryOpc = OpcMap[1][N->isDivergent()][IsAdd];
1065
1066 SDNode *AddLo;
1067 if (!ConsumeCarry) {
1068 SDValue Args[] = { SDValue(Lo0, 0), SDValue(Lo1, 0) };
1069 AddLo = CurDAG->getMachineNode(Opc, DL, VTList, Args);
1070 } else {
1071 SDValue Args[] = { SDValue(Lo0, 0), SDValue(Lo1, 0), N->getOperand(2) };
1072 AddLo = CurDAG->getMachineNode(CarryOpc, DL, VTList, Args);
1073 }
1074 SDValue AddHiArgs[] = {
1075 SDValue(Hi0, 0),
1076 SDValue(Hi1, 0),
1077 SDValue(AddLo, 1)
1078 };
1079 SDNode *AddHi = CurDAG->getMachineNode(CarryOpc, DL, VTList, AddHiArgs);
1080
1081 SDValue RegSequenceArgs[] = {
1082 CurDAG->getTargetConstant(AMDGPU::SReg_64RegClassID, DL, MVT::i32),
1083 SDValue(AddLo,0),
1084 Sub0,
1085 SDValue(AddHi,0),
1086 Sub1,
1087 };
1088 SDNode *RegSequence = CurDAG->getMachineNode(AMDGPU::REG_SEQUENCE, DL,
1089 MVT::i64, RegSequenceArgs);
1090
1091 if (ProduceCarry) {
1092 // Replace the carry-use
1093 ReplaceUses(SDValue(N, 1), SDValue(AddHi, 1));
1094 }
1095
1096 // Replace the remaining uses.
1097 ReplaceNode(N, RegSequence);
1098}
1099
1100void AMDGPUDAGToDAGISel::SelectAddcSubb(SDNode *N) {
1101 SDValue LHS = N->getOperand(0);
1102 SDValue RHS = N->getOperand(1);
1103 SDValue CI = N->getOperand(2);
1104
1105 if (N->isDivergent()) {
1106 unsigned Opc = N->getOpcode() == ISD::UADDO_CARRY ? AMDGPU::V_ADDC_U32_e64
1107 : AMDGPU::V_SUBB_U32_e64;
1108 CurDAG->SelectNodeTo(
1109 N, Opc, N->getVTList(),
1110 {LHS, RHS, CI,
1111 CurDAG->getTargetConstant(0, {}, MVT::i1) /*clamp bit*/});
1112 } else {
1113 unsigned Opc = N->getOpcode() == ISD::UADDO_CARRY ? AMDGPU::S_ADD_CO_PSEUDO
1114 : AMDGPU::S_SUB_CO_PSEUDO;
1115 CurDAG->SelectNodeTo(N, Opc, N->getVTList(), {LHS, RHS, CI});
1116 }
1117}
1118
1119void AMDGPUDAGToDAGISel::SelectUADDO_USUBO(SDNode *N) {
1120 // The name of the opcodes are misleading. v_add_i32/v_sub_i32 have unsigned
1121 // carry out despite the _i32 name. These were renamed in VI to _U32.
1122 // FIXME: We should probably rename the opcodes here.
1123 bool IsAdd = N->getOpcode() == ISD::UADDO;
1124 bool IsVALU = N->isDivergent();
1125
1126 for (SDNode::user_iterator UI = N->user_begin(), E = N->user_end(); UI != E;
1127 ++UI)
1128 if (UI.getUse().getResNo() == 1) {
1129 if (UI->isMachineOpcode()) {
1130 if (UI->getMachineOpcode() !=
1131 (IsAdd ? AMDGPU::S_ADD_CO_PSEUDO : AMDGPU::S_SUB_CO_PSEUDO)) {
1132 IsVALU = true;
1133 break;
1134 }
1135 } else {
1136 if (UI->getOpcode() != (IsAdd ? ISD::UADDO_CARRY : ISD::USUBO_CARRY)) {
1137 IsVALU = true;
1138 break;
1139 }
1140 }
1141 }
1142
1143 if (IsVALU) {
1144 unsigned Opc = IsAdd ? AMDGPU::V_ADD_CO_U32_e64 : AMDGPU::V_SUB_CO_U32_e64;
1145
1146 CurDAG->SelectNodeTo(
1147 N, Opc, N->getVTList(),
1148 {N->getOperand(0), N->getOperand(1),
1149 CurDAG->getTargetConstant(0, {}, MVT::i1) /*clamp bit*/});
1150 } else {
1151 unsigned Opc = IsAdd ? AMDGPU::S_UADDO_PSEUDO : AMDGPU::S_USUBO_PSEUDO;
1152
1153 CurDAG->SelectNodeTo(N, Opc, N->getVTList(),
1154 {N->getOperand(0), N->getOperand(1)});
1155 }
1156}
1157
1158void AMDGPUDAGToDAGISel::SelectFMA_W_CHAIN(SDNode *N) {
1159 // src0_modifiers, src0, src1_modifiers, src1, src2_modifiers, src2, clamp, omod
1160 SDValue Ops[10];
1161
1162 SelectVOP3Mods0(N->getOperand(1), Ops[1], Ops[0], Ops[6], Ops[7]);
1163 SelectVOP3Mods(N->getOperand(2), Ops[3], Ops[2]);
1164 SelectVOP3Mods(N->getOperand(3), Ops[5], Ops[4]);
1165 Ops[8] = N->getOperand(0);
1166 Ops[9] = N->getOperand(4);
1167
1168 // If there are no source modifiers, prefer fmac over fma because it can use
1169 // the smaller VOP2 encoding.
1170 bool UseFMAC = Subtarget->hasDLInsts() &&
1171 cast<ConstantSDNode>(Ops[0])->isZero() &&
1172 cast<ConstantSDNode>(Ops[2])->isZero() &&
1173 cast<ConstantSDNode>(Ops[4])->isZero();
1174 unsigned Opcode = UseFMAC ? AMDGPU::V_FMAC_F32_e64 : AMDGPU::V_FMA_F32_e64;
1175 CurDAG->SelectNodeTo(N, Opcode, N->getVTList(), Ops);
1176}
1177
1178void AMDGPUDAGToDAGISel::SelectFMUL_W_CHAIN(SDNode *N) {
1179 // src0_modifiers, src0, src1_modifiers, src1, clamp, omod
1180 SDValue Ops[8];
1181
1182 SelectVOP3Mods0(N->getOperand(1), Ops[1], Ops[0], Ops[4], Ops[5]);
1183 SelectVOP3Mods(N->getOperand(2), Ops[3], Ops[2]);
1184 Ops[6] = N->getOperand(0);
1185 Ops[7] = N->getOperand(3);
1186
1187 CurDAG->SelectNodeTo(N, AMDGPU::V_MUL_F32_e64, N->getVTList(), Ops);
1188}
1189
1190// We need to handle this here because tablegen doesn't support matching
1191// instructions with multiple outputs.
1192void AMDGPUDAGToDAGISel::SelectDIV_SCALE(SDNode *N) {
1193 EVT VT = N->getValueType(0);
1194
1195 assert(VT == MVT::f32 || VT == MVT::f64);
1196
1197 unsigned Opc
1198 = (VT == MVT::f64) ? AMDGPU::V_DIV_SCALE_F64_e64 : AMDGPU::V_DIV_SCALE_F32_e64;
1199
1200 // src0_modifiers, src0, src1_modifiers, src1, src2_modifiers, src2, clamp,
1201 // omod
1202 SDValue Ops[8];
1203 SelectVOP3BMods0(N->getOperand(0), Ops[1], Ops[0], Ops[6], Ops[7]);
1204 SelectVOP3BMods(N->getOperand(1), Ops[3], Ops[2]);
1205 SelectVOP3BMods(N->getOperand(2), Ops[5], Ops[4]);
1206 CurDAG->SelectNodeTo(N, Opc, N->getVTList(), Ops);
1207}
1208
1209// We need to handle this here because tablegen doesn't support matching
1210// instructions with multiple outputs.
1211void AMDGPUDAGToDAGISel::SelectMAD_64_32(SDNode *N) {
1212 SDLoc SL(N);
1213 bool Signed = N->getOpcode() == AMDGPUISD::MAD_I64_I32;
1214 unsigned Opc;
1215 bool UseNoCarry = Subtarget->hasMadU64U32NoCarry() && !N->hasAnyUseOfValue(1);
1216 if (Subtarget->hasMADIntraFwdBug())
1217 Opc = Signed ? AMDGPU::V_MAD_I64_I32_gfx11_e64
1218 : AMDGPU::V_MAD_U64_U32_gfx11_e64;
1219 else if (UseNoCarry)
1220 Opc = Signed ? AMDGPU::V_MAD_NC_I64_I32_e64 : AMDGPU::V_MAD_NC_U64_U32_e64;
1221 else
1222 Opc = Signed ? AMDGPU::V_MAD_I64_I32_e64 : AMDGPU::V_MAD_U64_U32_e64;
1223
1224 SDValue Clamp = CurDAG->getTargetConstant(0, SL, MVT::i1);
1225 SDValue Ops[] = { N->getOperand(0), N->getOperand(1), N->getOperand(2),
1226 Clamp };
1227
1228 if (UseNoCarry) {
1229 MachineSDNode *Mad = CurDAG->getMachineNode(Opc, SL, MVT::i64, Ops);
1230 ReplaceUses(SDValue(N, 0), SDValue(Mad, 0));
1231 CurDAG->RemoveDeadNode(N);
1232 return;
1233 }
1234
1235 CurDAG->SelectNodeTo(N, Opc, N->getVTList(), Ops);
1236}
1237
1238// We need to handle this here because tablegen doesn't support matching
1239// instructions with multiple outputs.
1240void AMDGPUDAGToDAGISel::SelectMUL_LOHI(SDNode *N) {
1241 SDLoc SL(N);
1242 bool Signed = N->getOpcode() == ISD::SMUL_LOHI;
1243 SDVTList VTList;
1244 unsigned Opc;
1245 if (Subtarget->hasMadU64U32NoCarry()) {
1246 VTList = CurDAG->getVTList(MVT::i64);
1247 Opc = Signed ? AMDGPU::V_MAD_NC_I64_I32_e64 : AMDGPU::V_MAD_NC_U64_U32_e64;
1248 } else {
1249 VTList = CurDAG->getVTList(MVT::i64, MVT::i1);
1250 if (Subtarget->hasMADIntraFwdBug()) {
1251 Opc = Signed ? AMDGPU::V_MAD_I64_I32_gfx11_e64
1252 : AMDGPU::V_MAD_U64_U32_gfx11_e64;
1253 } else {
1254 Opc = Signed ? AMDGPU::V_MAD_I64_I32_e64 : AMDGPU::V_MAD_U64_U32_e64;
1255 }
1256 }
1257
1258 SDValue Zero = CurDAG->getTargetConstant(0, SL, MVT::i64);
1259 SDValue Clamp = CurDAG->getTargetConstant(0, SL, MVT::i1);
1260 SDValue Ops[] = {N->getOperand(0), N->getOperand(1), Zero, Clamp};
1261 SDNode *Mad = CurDAG->getMachineNode(Opc, SL, VTList, Ops);
1262 if (!SDValue(N, 0).use_empty()) {
1263 SDValue Sub0 = CurDAG->getTargetConstant(AMDGPU::sub0, SL, MVT::i32);
1264 SDNode *Lo = CurDAG->getMachineNode(TargetOpcode::EXTRACT_SUBREG, SL,
1265 MVT::i32, SDValue(Mad, 0), Sub0);
1266 ReplaceUses(SDValue(N, 0), SDValue(Lo, 0));
1267 }
1268 if (!SDValue(N, 1).use_empty()) {
1269 SDValue Sub1 = CurDAG->getTargetConstant(AMDGPU::sub1, SL, MVT::i32);
1270 SDNode *Hi = CurDAG->getMachineNode(TargetOpcode::EXTRACT_SUBREG, SL,
1271 MVT::i32, SDValue(Mad, 0), Sub1);
1272 ReplaceUses(SDValue(N, 1), SDValue(Hi, 0));
1273 }
1274 CurDAG->RemoveDeadNode(N);
1275}
1276
1277bool AMDGPUDAGToDAGISel::isDSOffsetLegal(SDValue Base, unsigned Offset) const {
1278 if (!isUInt<16>(Offset))
1279 return false;
1280
1281 if (!Base || Subtarget->hasUsableDSOffset() ||
1282 Subtarget->unsafeDSOffsetFoldingEnabled())
1283 return true;
1284
1285 // On Southern Islands instruction with a negative base value and an offset
1286 // don't seem to work.
1287 return CurDAG->SignBitIsZero(Base);
1288}
1289
1290bool AMDGPUDAGToDAGISel::SelectDS1Addr1Offset(SDValue Addr, SDValue &Base,
1291 SDValue &Offset) const {
1292 SDLoc DL(Addr);
1293 if (CurDAG->isBaseWithConstantOffset(Addr)) {
1294 SDValue N0 = Addr.getOperand(0);
1295 SDValue N1 = Addr.getOperand(1);
1296 ConstantSDNode *C1 = cast<ConstantSDNode>(N1);
1297 if (isDSOffsetLegal(N0, C1->getSExtValue())) {
1298 // (add n0, c0)
1299 Base = N0;
1300 Offset = CurDAG->getTargetConstant(C1->getZExtValue(), DL, MVT::i16);
1301 return true;
1302 }
1303 } else if (Addr.getOpcode() == ISD::SUB) {
1304 // sub C, x -> add (sub 0, x), C
1305 if (const ConstantSDNode *C = dyn_cast<ConstantSDNode>(Addr.getOperand(0))) {
1306 int64_t ByteOffset = C->getSExtValue();
1307 if (isDSOffsetLegal(SDValue(), ByteOffset)) {
1308 SDValue Zero = CurDAG->getTargetConstant(0, DL, MVT::i32);
1309
1310 // XXX - This is kind of hacky. Create a dummy sub node so we can check
1311 // the known bits in isDSOffsetLegal. We need to emit the selected node
1312 // here, so this is thrown away.
1313 SDValue Sub = CurDAG->getNode(ISD::SUB, DL, MVT::i32,
1314 Zero, Addr.getOperand(1));
1315
1316 if (isDSOffsetLegal(Sub, ByteOffset)) {
1318 Opnds.push_back(Zero);
1319 Opnds.push_back(Addr.getOperand(1));
1320
1321 // FIXME: Select to VOP3 version for with-carry.
1322 unsigned SubOp = AMDGPU::V_SUB_CO_U32_e32;
1323 if (Subtarget->hasAddNoCarryInsts()) {
1324 SubOp = AMDGPU::V_SUB_U32_e64;
1325 Opnds.push_back(
1326 CurDAG->getTargetConstant(0, {}, MVT::i1)); // clamp bit
1327 }
1328
1329 MachineSDNode *MachineSub =
1330 CurDAG->getMachineNode(SubOp, DL, MVT::i32, Opnds);
1331
1332 Base = SDValue(MachineSub, 0);
1333 Offset = CurDAG->getTargetConstant(ByteOffset, DL, MVT::i16);
1334 return true;
1335 }
1336 }
1337 }
1338 } else if (const ConstantSDNode *CAddr = dyn_cast<ConstantSDNode>(Addr)) {
1339 // If we have a constant address, prefer to put the constant into the
1340 // offset. This can save moves to load the constant address since multiple
1341 // operations can share the zero base address register, and enables merging
1342 // into read2 / write2 instructions.
1343
1344 SDLoc DL(Addr);
1345
1346 if (isDSOffsetLegal(SDValue(), CAddr->getZExtValue())) {
1347 SDValue Zero = CurDAG->getTargetConstant(0, DL, MVT::i32);
1348 MachineSDNode *MovZero = CurDAG->getMachineNode(AMDGPU::V_MOV_B32_e32,
1349 DL, MVT::i32, Zero);
1350 Base = SDValue(MovZero, 0);
1351 Offset = CurDAG->getTargetConstant(CAddr->getZExtValue(), DL, MVT::i16);
1352 return true;
1353 }
1354 }
1355
1356 // default case
1357 Base = Addr;
1358 Offset = CurDAG->getTargetConstant(0, SDLoc(Addr), MVT::i16);
1359 return true;
1360}
1361
1362bool AMDGPUDAGToDAGISel::isDSOffset2Legal(SDValue Base, unsigned Offset0,
1363 unsigned Offset1,
1364 unsigned Size) const {
1365 if (Offset0 % Size != 0 || Offset1 % Size != 0)
1366 return false;
1367 if (!isUInt<8>(Offset0 / Size) || !isUInt<8>(Offset1 / Size))
1368 return false;
1369
1370 if (!Base || Subtarget->hasUsableDSOffset() ||
1371 Subtarget->unsafeDSOffsetFoldingEnabled())
1372 return true;
1373
1374 // On Southern Islands instruction with a negative base value and an offset
1375 // don't seem to work.
1376 return CurDAG->SignBitIsZero(Base);
1377}
1378
1379// Return whether the operation has NoUnsignedWrap property.
1380static bool isNoUnsignedWrap(SDValue Addr) {
1381 return (Addr.getOpcode() == ISD::ADD &&
1382 Addr->getFlags().hasNoUnsignedWrap()) ||
1383 Addr->getOpcode() == ISD::OR;
1384}
1385
1386// Check that the base address of flat scratch load/store in the form of `base +
1387// offset` is legal to be put in SGPR/VGPR (i.e. unsigned per hardware
1388// requirement). We always treat the first operand as the base address here.
1389bool AMDGPUDAGToDAGISel::isFlatScratchBaseLegal(SDValue Addr) const {
1390 if (isNoUnsignedWrap(Addr))
1391 return true;
1392
1393 // Starting with GFX12, VADDR and SADDR fields in VSCRATCH can use negative
1394 // values.
1395 if (Subtarget->hasSignedScratchOffsets())
1396 return true;
1397
1398 auto LHS = Addr.getOperand(0);
1399 auto RHS = Addr.getOperand(1);
1400
1401 // If the immediate offset is negative and within certain range, the base
1402 // address cannot also be negative. If the base is also negative, the sum
1403 // would be either negative or much larger than the valid range of scratch
1404 // memory a thread can access.
1405 ConstantSDNode *ImmOp = nullptr;
1406 if (Addr.getOpcode() == ISD::ADD && (ImmOp = dyn_cast<ConstantSDNode>(RHS))) {
1407 if (ImmOp->getSExtValue() < 0 && ImmOp->getSExtValue() > -0x40000000)
1408 return true;
1409 }
1410
1411 return CurDAG->SignBitIsZero(LHS);
1412}
1413
1414// Check address value in SGPR/VGPR are legal for flat scratch in the form
1415// of: SGPR + VGPR.
1416bool AMDGPUDAGToDAGISel::isFlatScratchBaseLegalSV(SDValue Addr) const {
1417 if (isNoUnsignedWrap(Addr))
1418 return true;
1419
1420 // Starting with GFX12, VADDR and SADDR fields in VSCRATCH can use negative
1421 // values.
1422 if (Subtarget->hasSignedScratchOffsets())
1423 return true;
1424
1425 auto LHS = Addr.getOperand(0);
1426 auto RHS = Addr.getOperand(1);
1427 return CurDAG->SignBitIsZero(RHS) && CurDAG->SignBitIsZero(LHS);
1428}
1429
1430// Check address value in SGPR/VGPR are legal for flat scratch in the form
1431// of: SGPR + VGPR + Imm.
1432bool AMDGPUDAGToDAGISel::isFlatScratchBaseLegalSVImm(SDValue Addr) const {
1433 // Starting with GFX12, VADDR and SADDR fields in VSCRATCH can use negative
1434 // values.
1435 if (AMDGPU::isGFX12Plus(*Subtarget))
1436 return true;
1437
1438 auto Base = Addr.getOperand(0);
1439 auto *RHSImm = cast<ConstantSDNode>(Addr.getOperand(1));
1440 // If the immediate offset is negative and within certain range, the base
1441 // address cannot also be negative. If the base is also negative, the sum
1442 // would be either negative or much larger than the valid range of scratch
1443 // memory a thread can access.
1444 if (isNoUnsignedWrap(Base) &&
1445 (isNoUnsignedWrap(Addr) ||
1446 (RHSImm->getSExtValue() < 0 && RHSImm->getSExtValue() > -0x40000000)))
1447 return true;
1448
1449 auto LHS = Base.getOperand(0);
1450 auto RHS = Base.getOperand(1);
1451 return CurDAG->SignBitIsZero(RHS) && CurDAG->SignBitIsZero(LHS);
1452}
1453
1454// TODO: If offset is too big, put low 16-bit into offset.
1455bool AMDGPUDAGToDAGISel::SelectDS64Bit4ByteAligned(SDValue Addr, SDValue &Base,
1456 SDValue &Offset0,
1457 SDValue &Offset1) const {
1458 return SelectDSReadWrite2(Addr, Base, Offset0, Offset1, 4);
1459}
1460
1461bool AMDGPUDAGToDAGISel::SelectDS128Bit8ByteAligned(SDValue Addr, SDValue &Base,
1462 SDValue &Offset0,
1463 SDValue &Offset1) const {
1464 return SelectDSReadWrite2(Addr, Base, Offset0, Offset1, 8);
1465}
1466
1467bool AMDGPUDAGToDAGISel::SelectDSReadWrite2(SDValue Addr, SDValue &Base,
1468 SDValue &Offset0, SDValue &Offset1,
1469 unsigned Size) const {
1470 SDLoc DL(Addr);
1471
1472 if (CurDAG->isBaseWithConstantOffset(Addr)) {
1473 SDValue N0 = Addr.getOperand(0);
1474 SDValue N1 = Addr.getOperand(1);
1475 ConstantSDNode *C1 = cast<ConstantSDNode>(N1);
1476 unsigned OffsetValue0 = C1->getZExtValue();
1477 unsigned OffsetValue1 = OffsetValue0 + Size;
1478
1479 // (add n0, c0)
1480 if (isDSOffset2Legal(N0, OffsetValue0, OffsetValue1, Size)) {
1481 Base = N0;
1482 Offset0 = CurDAG->getTargetConstant(OffsetValue0 / Size, DL, MVT::i32);
1483 Offset1 = CurDAG->getTargetConstant(OffsetValue1 / Size, DL, MVT::i32);
1484 return true;
1485 }
1486 } else if (Addr.getOpcode() == ISD::SUB) {
1487 // sub C, x -> add (sub 0, x), C
1488 if (const ConstantSDNode *C =
1490 unsigned OffsetValue0 = C->getZExtValue();
1491 unsigned OffsetValue1 = OffsetValue0 + Size;
1492
1493 if (isDSOffset2Legal(SDValue(), OffsetValue0, OffsetValue1, Size)) {
1494 SDLoc DL(Addr);
1495 SDValue Zero = CurDAG->getTargetConstant(0, DL, MVT::i32);
1496
1497 // XXX - This is kind of hacky. Create a dummy sub node so we can check
1498 // the known bits in isDSOffsetLegal. We need to emit the selected node
1499 // here, so this is thrown away.
1500 SDValue Sub =
1501 CurDAG->getNode(ISD::SUB, DL, MVT::i32, Zero, Addr.getOperand(1));
1502
1503 if (isDSOffset2Legal(Sub, OffsetValue0, OffsetValue1, Size)) {
1505 Opnds.push_back(Zero);
1506 Opnds.push_back(Addr.getOperand(1));
1507 unsigned SubOp = AMDGPU::V_SUB_CO_U32_e32;
1508 if (Subtarget->hasAddNoCarryInsts()) {
1509 SubOp = AMDGPU::V_SUB_U32_e64;
1510 Opnds.push_back(
1511 CurDAG->getTargetConstant(0, {}, MVT::i1)); // clamp bit
1512 }
1513
1514 MachineSDNode *MachineSub = CurDAG->getMachineNode(
1515 SubOp, DL, MVT::getIntegerVT(Size * 8), Opnds);
1516
1517 Base = SDValue(MachineSub, 0);
1518 Offset0 =
1519 CurDAG->getTargetConstant(OffsetValue0 / Size, DL, MVT::i32);
1520 Offset1 =
1521 CurDAG->getTargetConstant(OffsetValue1 / Size, DL, MVT::i32);
1522 return true;
1523 }
1524 }
1525 }
1526 } else if (const ConstantSDNode *CAddr = dyn_cast<ConstantSDNode>(Addr)) {
1527 unsigned OffsetValue0 = CAddr->getZExtValue();
1528 unsigned OffsetValue1 = OffsetValue0 + Size;
1529
1530 if (isDSOffset2Legal(SDValue(), OffsetValue0, OffsetValue1, Size)) {
1531 SDValue Zero = CurDAG->getTargetConstant(0, DL, MVT::i32);
1532 MachineSDNode *MovZero =
1533 CurDAG->getMachineNode(AMDGPU::V_MOV_B32_e32, DL, MVT::i32, Zero);
1534 Base = SDValue(MovZero, 0);
1535 Offset0 = CurDAG->getTargetConstant(OffsetValue0 / Size, DL, MVT::i32);
1536 Offset1 = CurDAG->getTargetConstant(OffsetValue1 / Size, DL, MVT::i32);
1537 return true;
1538 }
1539 }
1540
1541 // default case
1542
1543 Base = Addr;
1544 Offset0 = CurDAG->getTargetConstant(0, DL, MVT::i32);
1545 Offset1 = CurDAG->getTargetConstant(1, DL, MVT::i32);
1546 return true;
1547}
1548
1549bool AMDGPUDAGToDAGISel::SelectMUBUF(SDValue Addr, SDValue &Ptr, SDValue &VAddr,
1550 SDValue &SOffset, SDValue &Offset,
1551 SDValue &Offen, SDValue &Idxen,
1552 SDValue &Addr64) const {
1553 // Subtarget prefers to use flat instruction
1554 // FIXME: This should be a pattern predicate and not reach here
1555 if (Subtarget->useFlatForGlobal())
1556 return false;
1557
1558 SDLoc DL(Addr);
1559
1560 Idxen = CurDAG->getTargetConstant(0, DL, MVT::i1);
1561 Offen = CurDAG->getTargetConstant(0, DL, MVT::i1);
1562 Addr64 = CurDAG->getTargetConstant(0, DL, MVT::i1);
1563 SOffset = Subtarget->hasRestrictedSOffset()
1564 ? CurDAG->getRegister(AMDGPU::SGPR_NULL, MVT::i32)
1565 : CurDAG->getTargetConstant(0, DL, MVT::i32);
1566
1567 ConstantSDNode *C1 = nullptr;
1568 SDValue N0 = Addr;
1569 if (CurDAG->isBaseWithConstantOffset(Addr)) {
1570 C1 = cast<ConstantSDNode>(Addr.getOperand(1));
1571 if (isUInt<32>(C1->getZExtValue()))
1572 N0 = Addr.getOperand(0);
1573 else
1574 C1 = nullptr;
1575 }
1576
1577 if (N0->isAnyAdd()) {
1578 // (add N2, N3) -> addr64, or
1579 // (add (add N2, N3), C1) -> addr64
1580 SDValue N2 = N0.getOperand(0);
1581 SDValue N3 = N0.getOperand(1);
1582 Addr64 = CurDAG->getTargetConstant(1, DL, MVT::i1);
1583
1584 if (N2->isDivergent()) {
1585 if (N3->isDivergent()) {
1586 // Both N2 and N3 are divergent. Use N0 (the result of the add) as the
1587 // addr64, and construct the resource from a 0 address.
1588 Ptr = SDValue(buildSMovImm64(DL, 0, MVT::v2i32), 0);
1589 VAddr = N0;
1590 } else {
1591 // N2 is divergent, N3 is not.
1592 Ptr = N3;
1593 VAddr = N2;
1594 }
1595 } else {
1596 // N2 is not divergent.
1597 Ptr = N2;
1598 VAddr = N3;
1599 }
1600 Offset = CurDAG->getTargetConstant(0, DL, MVT::i32);
1601 } else if (N0->isDivergent()) {
1602 // N0 is divergent. Use it as the addr64, and construct the resource from a
1603 // 0 address.
1604 Ptr = SDValue(buildSMovImm64(DL, 0, MVT::v2i32), 0);
1605 VAddr = N0;
1606 Addr64 = CurDAG->getTargetConstant(1, DL, MVT::i1);
1607 } else {
1608 // N0 -> offset, or
1609 // (N0 + C1) -> offset
1610 VAddr = CurDAG->getTargetConstant(0, DL, MVT::i32);
1611 Ptr = N0;
1612 }
1613
1614 if (!C1) {
1615 // No offset.
1616 Offset = CurDAG->getTargetConstant(0, DL, MVT::i32);
1617 return true;
1618 }
1619
1620 const SIInstrInfo *TII = Subtarget->getInstrInfo();
1621 if (TII->isLegalMUBUFImmOffset(C1->getZExtValue())) {
1622 // Legal offset for instruction.
1623 Offset = CurDAG->getTargetConstant(C1->getZExtValue(), DL, MVT::i32);
1624 return true;
1625 }
1626
1627 // Illegal offset, store it in soffset.
1628 Offset = CurDAG->getTargetConstant(0, DL, MVT::i32);
1629 SOffset =
1630 SDValue(CurDAG->getMachineNode(
1631 AMDGPU::S_MOV_B32, DL, MVT::i32,
1632 CurDAG->getTargetConstant(C1->getZExtValue(), DL, MVT::i32)),
1633 0);
1634 return true;
1635}
1636
1637bool AMDGPUDAGToDAGISel::SelectMUBUFAddr64(SDValue Addr, SDValue &SRsrc,
1638 SDValue &VAddr, SDValue &SOffset,
1639 SDValue &Offset) const {
1640 SDValue Ptr, Offen, Idxen, Addr64;
1641
1642 // addr64 bit was removed for volcanic islands.
1643 // FIXME: This should be a pattern predicate and not reach here
1644 if (!Subtarget->hasAddr64())
1645 return false;
1646
1647 if (!SelectMUBUF(Addr, Ptr, VAddr, SOffset, Offset, Offen, Idxen, Addr64))
1648 return false;
1649
1650 ConstantSDNode *C = cast<ConstantSDNode>(Addr64);
1651 if (C->getSExtValue()) {
1652 SDLoc DL(Addr);
1653
1654 const SITargetLowering& Lowering =
1655 *static_cast<const SITargetLowering*>(getTargetLowering());
1656
1657 SRsrc = SDValue(Lowering.wrapAddr64Rsrc(*CurDAG, DL, Ptr), 0);
1658 return true;
1659 }
1660
1661 return false;
1662}
1663
1664std::pair<SDValue, SDValue> AMDGPUDAGToDAGISel::foldFrameIndex(SDValue N) const {
1665 SDLoc DL(N);
1666
1667 auto *FI = dyn_cast<FrameIndexSDNode>(N);
1668 SDValue TFI =
1669 FI ? CurDAG->getTargetFrameIndex(FI->getIndex(), FI->getValueType(0)) : N;
1670
1671 // We rebase the base address into an absolute stack address and hence
1672 // use constant 0 for soffset. This value must be retained until
1673 // frame elimination and eliminateFrameIndex will choose the appropriate
1674 // frame register if need be.
1675 return std::pair(TFI, CurDAG->getTargetConstant(0, DL, MVT::i32));
1676}
1677
1678bool AMDGPUDAGToDAGISel::SelectMUBUFScratchOffen(SDNode *Parent,
1679 SDValue Addr, SDValue &Rsrc,
1680 SDValue &VAddr, SDValue &SOffset,
1681 SDValue &ImmOffset) const {
1682
1683 SDLoc DL(Addr);
1684 MachineFunction &MF = CurDAG->getMachineFunction();
1685 const SIMachineFunctionInfo *Info = MF.getInfo<SIMachineFunctionInfo>();
1686
1687 Rsrc = CurDAG->getRegister(Info->getScratchRSrcReg(), MVT::v4i32);
1688
1689 if (ConstantSDNode *CAddr = dyn_cast<ConstantSDNode>(Addr)) {
1690 int64_t Imm = CAddr->getSExtValue();
1691 const int64_t NullPtr =
1693 // Don't fold null pointer.
1694 if (Imm != NullPtr) {
1695 const uint32_t MaxOffset = SIInstrInfo::getMaxMUBUFImmOffset(*Subtarget);
1696 SDValue HighBits =
1697 CurDAG->getTargetConstant(Imm & ~MaxOffset, DL, MVT::i32);
1698 MachineSDNode *MovHighBits = CurDAG->getMachineNode(
1699 AMDGPU::V_MOV_B32_e32, DL, MVT::i32, HighBits);
1700 VAddr = SDValue(MovHighBits, 0);
1701
1702 SOffset = CurDAG->getTargetConstant(0, DL, MVT::i32);
1703 ImmOffset = CurDAG->getTargetConstant(Imm & MaxOffset, DL, MVT::i32);
1704 return true;
1705 }
1706 }
1707
1708 if (CurDAG->isBaseWithConstantOffset(Addr)) {
1709 // (add n0, c1)
1710
1711 SDValue N0 = Addr.getOperand(0);
1712 uint64_t C1 = Addr.getConstantOperandVal(1);
1713
1714 // Offsets in vaddr must be positive if range checking is enabled.
1715 //
1716 // The total computation of vaddr + soffset + offset must not overflow. If
1717 // vaddr is negative, even if offset is 0 the sgpr offset add will end up
1718 // overflowing.
1719 //
1720 // Prior to gfx9, MUBUF instructions with the vaddr offset enabled would
1721 // always perform a range check. If a negative vaddr base index was used,
1722 // this would fail the range check. The overall address computation would
1723 // compute a valid address, but this doesn't happen due to the range
1724 // check. For out-of-bounds MUBUF loads, a 0 is returned.
1725 //
1726 // Therefore it should be safe to fold any VGPR offset on gfx9 into the
1727 // MUBUF vaddr, but not on older subtargets which can only do this if the
1728 // sign bit is known 0.
1729 const SIInstrInfo *TII = Subtarget->getInstrInfo();
1730 if (TII->isLegalMUBUFImmOffset(C1) &&
1731 (!Subtarget->privateMemoryResourceIsRangeChecked() ||
1732 CurDAG->SignBitIsZero(N0))) {
1733 std::tie(VAddr, SOffset) = foldFrameIndex(N0);
1734 ImmOffset = CurDAG->getTargetConstant(C1, DL, MVT::i32);
1735 return true;
1736 }
1737 }
1738
1739 // (node)
1740 std::tie(VAddr, SOffset) = foldFrameIndex(Addr);
1741 ImmOffset = CurDAG->getTargetConstant(0, DL, MVT::i32);
1742 return true;
1743}
1744
1745static bool IsCopyFromSGPR(const SIRegisterInfo &TRI, SDValue Val) {
1746 if (Val.getOpcode() != ISD::CopyFromReg)
1747 return false;
1748 auto Reg = cast<RegisterSDNode>(Val.getOperand(1))->getReg();
1749 if (!Reg.isPhysical())
1750 return false;
1751 const auto *RC = TRI.getPhysRegBaseClass(Reg);
1752 return RC && TRI.isSGPRClass(RC);
1753}
1754
1755bool AMDGPUDAGToDAGISel::SelectMUBUFScratchOffset(SDNode *Parent,
1756 SDValue Addr,
1757 SDValue &SRsrc,
1758 SDValue &SOffset,
1759 SDValue &Offset) const {
1760 const SIRegisterInfo *TRI = Subtarget->getRegisterInfo();
1761 const SIInstrInfo *TII = Subtarget->getInstrInfo();
1762 MachineFunction &MF = CurDAG->getMachineFunction();
1763 const SIMachineFunctionInfo *Info = MF.getInfo<SIMachineFunctionInfo>();
1764 SDLoc DL(Addr);
1765
1766 // CopyFromReg <sgpr>
1767 if (IsCopyFromSGPR(*TRI, Addr)) {
1768 SRsrc = CurDAG->getRegister(Info->getScratchRSrcReg(), MVT::v4i32);
1769 SOffset = Addr;
1770 Offset = CurDAG->getTargetConstant(0, DL, MVT::i32);
1771 return true;
1772 }
1773
1774 ConstantSDNode *CAddr;
1775 if (Addr.getOpcode() == ISD::ADD) {
1776 // Add (CopyFromReg <sgpr>) <constant>
1777 CAddr = dyn_cast<ConstantSDNode>(Addr.getOperand(1));
1778 if (!CAddr || !TII->isLegalMUBUFImmOffset(CAddr->getZExtValue()))
1779 return false;
1780 if (!IsCopyFromSGPR(*TRI, Addr.getOperand(0)))
1781 return false;
1782
1783 SOffset = Addr.getOperand(0);
1784 } else if ((CAddr = dyn_cast<ConstantSDNode>(Addr)) &&
1785 TII->isLegalMUBUFImmOffset(CAddr->getZExtValue())) {
1786 // <constant>
1787 SOffset = CurDAG->getTargetConstant(0, DL, MVT::i32);
1788 } else {
1789 return false;
1790 }
1791
1792 SRsrc = CurDAG->getRegister(Info->getScratchRSrcReg(), MVT::v4i32);
1793
1794 Offset = CurDAG->getTargetConstant(CAddr->getZExtValue(), DL, MVT::i32);
1795 return true;
1796}
1797
1798bool AMDGPUDAGToDAGISel::SelectMUBUFOffset(SDValue Addr, SDValue &SRsrc,
1799 SDValue &SOffset, SDValue &Offset
1800 ) const {
1801 SDValue Ptr, VAddr, Offen, Idxen, Addr64;
1802 const SIInstrInfo *TII = Subtarget->getInstrInfo();
1803
1804 if (!SelectMUBUF(Addr, Ptr, VAddr, SOffset, Offset, Offen, Idxen, Addr64))
1805 return false;
1806
1807 if (!cast<ConstantSDNode>(Offen)->getSExtValue() &&
1808 !cast<ConstantSDNode>(Idxen)->getSExtValue() &&
1809 !cast<ConstantSDNode>(Addr64)->getSExtValue()) {
1810 uint64_t Rsrc = TII->getDefaultRsrcDataFormat() |
1811 maskTrailingOnes<uint64_t>(32); // Size
1812 SDLoc DL(Addr);
1813
1814 const SITargetLowering& Lowering =
1815 *static_cast<const SITargetLowering*>(getTargetLowering());
1816
1817 SRsrc = SDValue(Lowering.buildRSRC(*CurDAG, DL, Ptr, 0, Rsrc), 0);
1818 return true;
1819 }
1820 return false;
1821}
1822
1823bool AMDGPUDAGToDAGISel::SelectBUFSOffset(SDValue ByteOffsetNode,
1824 SDValue &SOffset) const {
1825 if (Subtarget->hasRestrictedSOffset() && isNullConstant(ByteOffsetNode)) {
1826 SOffset = CurDAG->getRegister(AMDGPU::SGPR_NULL, MVT::i32);
1827 return true;
1828 }
1829
1830 SOffset = ByteOffsetNode;
1831 return true;
1832}
1833
1834// Find a load or store from corresponding pattern root.
1835// Roots may be build_vector, bitconvert or their combinations.
1838 if (MemSDNode *MN = dyn_cast<MemSDNode>(N))
1839 return MN;
1841 for (SDValue V : N->op_values())
1842 if (MemSDNode *MN =
1844 return MN;
1845 llvm_unreachable("cannot find MemSDNode in the pattern!");
1846}
1847
1848bool AMDGPUDAGToDAGISel::SelectFlatOffsetImpl(SDNode *N, SDValue Addr,
1849 SDValue &VAddr, SDValue &Offset,
1850 uint64_t FlatVariant) const {
1851 int64_t OffsetVal = 0;
1852
1853 unsigned AS = findMemSDNode(N)->getAddressSpace();
1854
1855 bool CanHaveFlatSegmentOffsetBug =
1856 Subtarget->hasFlatSegmentOffsetBug() &&
1857 FlatVariant == SIInstrFlags::FLAT &&
1859
1860 if (Subtarget->hasFlatInstOffsets() && !CanHaveFlatSegmentOffsetBug) {
1861 SDValue N0, N1;
1862 if (isBaseWithConstantOffset64(Addr, N0, N1) &&
1863 (FlatVariant != SIInstrFlags::FlatScratch ||
1864 isFlatScratchBaseLegal(Addr))) {
1865 int64_t COffsetVal = cast<ConstantSDNode>(N1)->getSExtValue();
1866
1867 // Adding the offset to the base address in a FLAT instruction must not
1868 // change the memory aperture in which the address falls. Therefore we can
1869 // only fold offsets from inbounds GEPs into FLAT instructions.
1870 bool IsInBounds =
1871 Addr.getOpcode() == ISD::PTRADD && Addr->getFlags().hasInBounds();
1872 if (COffsetVal == 0 || FlatVariant != SIInstrFlags::FLAT || IsInBounds) {
1873 const SIInstrInfo *TII = Subtarget->getInstrInfo();
1874 if (TII->isLegalFLATOffset(COffsetVal, AS, FlatVariant)) {
1875 Addr = N0;
1876 OffsetVal = COffsetVal;
1877 } else {
1878 // If the offset doesn't fit, put the low bits into the offset field
1879 // and add the rest.
1880 //
1881 // For a FLAT instruction the hardware decides whether to access
1882 // global/scratch/shared memory based on the high bits of vaddr,
1883 // ignoring the offset field, so we have to ensure that when we add
1884 // remainder to vaddr it still points into the same underlying object.
1885 // The easiest way to do that is to make sure that we split the offset
1886 // into two pieces that are both >= 0 or both <= 0.
1887
1888 SDLoc DL(N);
1889 uint64_t RemainderOffset;
1890
1891 std::tie(OffsetVal, RemainderOffset) =
1892 TII->splitFlatOffset(COffsetVal, AS, FlatVariant);
1893
1894 SDValue AddOffsetLo =
1895 getMaterializedScalarImm32(Lo_32(RemainderOffset), DL);
1896 SDValue Clamp = CurDAG->getTargetConstant(0, DL, MVT::i1);
1897
1898 if (Addr.getValueType().getSizeInBits() == 32) {
1900 Opnds.push_back(N0);
1901 Opnds.push_back(AddOffsetLo);
1902 unsigned AddOp = AMDGPU::V_ADD_CO_U32_e32;
1903 if (Subtarget->hasAddNoCarryInsts()) {
1904 AddOp = AMDGPU::V_ADD_U32_e64;
1905 Opnds.push_back(Clamp);
1906 }
1907 Addr =
1908 SDValue(CurDAG->getMachineNode(AddOp, DL, MVT::i32, Opnds), 0);
1909 } else {
1910 // TODO: Should this try to use a scalar add pseudo if the base
1911 // address is uniform and saddr is usable?
1912 SDValue Sub0 =
1913 CurDAG->getTargetConstant(AMDGPU::sub0, DL, MVT::i32);
1914 SDValue Sub1 =
1915 CurDAG->getTargetConstant(AMDGPU::sub1, DL, MVT::i32);
1916
1917 SDNode *N0Lo = CurDAG->getMachineNode(TargetOpcode::EXTRACT_SUBREG,
1918 DL, MVT::i32, N0, Sub0);
1919 SDNode *N0Hi = CurDAG->getMachineNode(TargetOpcode::EXTRACT_SUBREG,
1920 DL, MVT::i32, N0, Sub1);
1921
1922 SDValue AddOffsetHi =
1923 getMaterializedScalarImm32(Hi_32(RemainderOffset), DL);
1924
1925 SDVTList VTs = CurDAG->getVTList(MVT::i32, MVT::i1);
1926
1927 SDNode *Add =
1928 CurDAG->getMachineNode(AMDGPU::V_ADD_CO_U32_e64, DL, VTs,
1929 {AddOffsetLo, SDValue(N0Lo, 0), Clamp});
1930
1931 SDNode *Addc = CurDAG->getMachineNode(
1932 AMDGPU::V_ADDC_U32_e64, DL, VTs,
1933 {AddOffsetHi, SDValue(N0Hi, 0), SDValue(Add, 1), Clamp});
1934
1935 SDValue RegSequenceArgs[] = {
1936 CurDAG->getTargetConstant(AMDGPU::VReg_64RegClassID, DL,
1937 MVT::i32),
1938 SDValue(Add, 0), Sub0, SDValue(Addc, 0), Sub1};
1939
1940 Addr = SDValue(CurDAG->getMachineNode(AMDGPU::REG_SEQUENCE, DL,
1941 MVT::i64, RegSequenceArgs),
1942 0);
1943 }
1944 }
1945 }
1946 }
1947 }
1948
1949 VAddr = Addr;
1950 Offset = CurDAG->getSignedTargetConstant(OffsetVal, SDLoc(), MVT::i32);
1951 return true;
1952}
1953
1954bool AMDGPUDAGToDAGISel::SelectFlatOffset(SDNode *N, SDValue Addr,
1955 SDValue &VAddr,
1956 SDValue &Offset) const {
1957 return SelectFlatOffsetImpl(N, Addr, VAddr, Offset, SIInstrFlags::FLAT);
1958}
1959
1960bool AMDGPUDAGToDAGISel::SelectGlobalOffset(SDNode *N, SDValue Addr,
1961 SDValue &VAddr,
1962 SDValue &Offset) const {
1963 return SelectFlatOffsetImpl(N, Addr, VAddr, Offset, SIInstrFlags::FlatGlobal);
1964}
1965
1966bool AMDGPUDAGToDAGISel::SelectScratchOffset(SDNode *N, SDValue Addr,
1967 SDValue &VAddr,
1968 SDValue &Offset) const {
1969 return SelectFlatOffsetImpl(N, Addr, VAddr, Offset,
1971}
1972
1973// If this matches *_extend i32:x, return x
1974// Otherwise if the value is I32 returns x.
1976 const SelectionDAG *DAG) {
1977 if (Op.getValueType() == MVT::i32)
1978 return Op;
1979
1980 if (Op.getOpcode() != (IsSigned ? ISD::SIGN_EXTEND : ISD::ZERO_EXTEND) &&
1981 Op.getOpcode() != ISD::ANY_EXTEND &&
1982 !(DAG->SignBitIsZero(Op) &&
1983 Op.getOpcode() == (IsSigned ? ISD::ZERO_EXTEND : ISD::SIGN_EXTEND)))
1984 return SDValue();
1985
1986 SDValue ExtSrc = Op.getOperand(0);
1987 return (ExtSrc.getValueType() == MVT::i32) ? ExtSrc : SDValue();
1988}
1989
// Match (64-bit SGPR base) + (zext vgpr offset) + sext(imm offset)
// or (64-bit SGPR base) + (sext vgpr offset) + sext(imm offset)
//
// On success fills in SAddr (scalar base), VOffset (32-bit VGPR offset),
// Offset (immediate operand) and ScaleOffset (whether the pre-scaled
// offset form was matched). When NeedIOffset is false no immediate offset
// is folded. Returns false if no saddr-form address can be formed.
bool AMDGPUDAGToDAGISel::SelectGlobalSAddr(SDNode *N, SDValue Addr,
                                           SDValue &SAddr, SDValue &VOffset,
                                           SDValue &Offset, bool &ScaleOffset,
                                           bool NeedIOffset) const {
  int64_t ImmOffset = 0;
  ScaleOffset = false;

  // Match the immediate offset first, which canonically is moved as low as
  // possible.

  SDValue LHS, RHS;
  if (isBaseWithConstantOffset64(Addr, LHS, RHS)) {
    int64_t COffsetVal = cast<ConstantSDNode>(RHS)->getSExtValue();
    const SIInstrInfo *TII = Subtarget->getInstrInfo();

    // NOTE(review): a continuation line of this condition (the flat-offset
    // flags argument) was lost in extraction — confirm against upstream.
    if (NeedIOffset &&
        TII->isLegalFLATOffset(COffsetVal, AMDGPUAS::GLOBAL_ADDRESS,
      Addr = LHS;
      ImmOffset = COffsetVal;
    } else if (!LHS->isDivergent()) {
      if (COffsetVal > 0) {
        SDLoc SL(N);
        // saddr + large_offset -> saddr +
        //                         (voffset = large_offset & ~MaxOffset) +
        //                         (large_offset & MaxOffset);
        int64_t SplitImmOffset = 0, RemainderOffset = COffsetVal;
        if (NeedIOffset) {
          // NOTE(review): the argument line of this call was lost in
          // extraction.
          std::tie(SplitImmOffset, RemainderOffset) = TII->splitFlatOffset(
        }

        if (Subtarget->hasSignedGVSOffset() ? isInt<32>(RemainderOffset)
                                            : isUInt<32>(RemainderOffset)) {
          // Materialize the out-of-range remainder in a VGPR; the split
          // immediate rides in the instruction's offset field.
          SDNode *VMov = CurDAG->getMachineNode(
              AMDGPU::V_MOV_B32_e32, SL, MVT::i32,
              CurDAG->getTargetConstant(RemainderOffset, SDLoc(), MVT::i32));
          VOffset = SDValue(VMov, 0);
          SAddr = LHS;
          Offset = CurDAG->getTargetConstant(SplitImmOffset, SDLoc(), MVT::i32);
          return true;
        }
      }

      // We are adding a 64 bit SGPR and a constant. If constant bus limit
      // is 1 we would need to perform 1 or 2 extra moves for each half of
      // the constant and it is better to do a scalar add and then issue a
      // single VALU instruction to materialize zero. Otherwise it is less
      // instructions to perform VALU adds with immediates or inline literals.
      unsigned NumLiterals =
          !TII->isInlineConstant(APInt(32, Lo_32(COffsetVal))) +
          !TII->isInlineConstant(APInt(32, Hi_32(COffsetVal)));
      if (Subtarget->getConstantBusLimit(AMDGPU::V_ADD_U32_e64) > NumLiterals)
        return false;
    }
  }

  // Match the variable offset.
  if (Addr->isAnyAdd()) {
    LHS = Addr.getOperand(0);

    if (!LHS->isDivergent()) {
      // add (i64 sgpr), (*_extend (i32 vgpr))
      RHS = Addr.getOperand(1);
      ScaleOffset = SelectScaleOffset(N, RHS, Subtarget->hasSignedGVSOffset());
      if (SDValue ExtRHS = matchExtFromI32orI32(
              RHS, Subtarget->hasSignedGVSOffset(), CurDAG)) {
        SAddr = LHS;
        VOffset = ExtRHS;
      }
    }

    RHS = Addr.getOperand(1);
    // Try the commuted form only if the first form did not match.
    if (!SAddr && !RHS->isDivergent()) {
      // add (*_extend (i32 vgpr)), (i64 sgpr)
      ScaleOffset = SelectScaleOffset(N, LHS, Subtarget->hasSignedGVSOffset());
      if (SDValue ExtLHS = matchExtFromI32orI32(
              LHS, Subtarget->hasSignedGVSOffset(), CurDAG)) {
        SAddr = RHS;
        VOffset = ExtLHS;
      }
    }

    if (SAddr) {
      Offset = CurDAG->getSignedTargetConstant(ImmOffset, SDLoc(), MVT::i32);
      return true;
    }
  }

  // NOTE(review): two continuation lines of this condition were lost in
  // extraction (the opcode chosen for the signed case, and a check on
  // operand 1) — confirm against upstream before relying on this text.
  if (Subtarget->hasScaleOffset() &&
      (Addr.getOpcode() == (Subtarget->hasSignedGVSOffset()
       (Addr.getOpcode() == AMDGPUISD::MAD_U64_U32 &&
        CurDAG->SignBitIsZero(Addr.getOperand(0)))) &&
      Addr.getOperand(0)->isDivergent() &&
      !Addr.getOperand(2)->isDivergent()) {
    // mad_u64_u32 (i32 vgpr), (i32 c), (i64 sgpr)
    unsigned Size =
        (unsigned)cast<MemSDNode>(N)->getMemoryVT().getFixedSizeInBits() / 8;
    // The multiplier must equal the access size for the scaled form.
    ScaleOffset = Addr.getConstantOperandVal(1) == Size;
    if (ScaleOffset) {
      SAddr = Addr.getOperand(2);
      VOffset = Addr.getOperand(0);
      Offset = CurDAG->getTargetConstant(ImmOffset, SDLoc(), MVT::i32);
      return true;
    }
  }

  if (Addr->isDivergent() || Addr.getOpcode() == ISD::UNDEF ||
      isa<ConstantSDNode>(Addr))
    return false;

  // It's cheaper to materialize a single 32-bit zero for vaddr than the two
  // moves required to copy a 64-bit SGPR to VGPR.
  SAddr = Addr;
  SDNode *VMov =
      CurDAG->getMachineNode(AMDGPU::V_MOV_B32_e32, SDLoc(Addr), MVT::i32,
                             CurDAG->getTargetConstant(0, SDLoc(), MVT::i32));
  VOffset = SDValue(VMov, 0);
  Offset = CurDAG->getSignedTargetConstant(ImmOffset, SDLoc(), MVT::i32);
  return true;
}
2116
2117bool AMDGPUDAGToDAGISel::SelectGlobalSAddr(SDNode *N, SDValue Addr,
2118 SDValue &SAddr, SDValue &VOffset,
2119 SDValue &Offset,
2120 SDValue &CPol) const {
2121 bool ScaleOffset;
2122 if (!SelectGlobalSAddr(N, Addr, SAddr, VOffset, Offset, ScaleOffset))
2123 return false;
2124
2125 CPol = CurDAG->getTargetConstant(ScaleOffset ? AMDGPU::CPol::SCAL : 0,
2126 SDLoc(), MVT::i32);
2127 return true;
2128}
2129
2130bool AMDGPUDAGToDAGISel::SelectGlobalSAddrCPol(SDNode *N, SDValue Addr,
2131 SDValue &SAddr, SDValue &VOffset,
2132 SDValue &Offset,
2133 SDValue &CPol) const {
2134 bool ScaleOffset;
2135 if (!SelectGlobalSAddr(N, Addr, SAddr, VOffset, Offset, ScaleOffset))
2136 return false;
2137
2138 // We are assuming CPol is always the last operand of the intrinsic.
2139 auto PassedCPol =
2140 N->getConstantOperandVal(N->getNumOperands() - 1) & ~AMDGPU::CPol::SCAL;
2141 CPol = CurDAG->getTargetConstant(
2142 (ScaleOffset ? AMDGPU::CPol::SCAL : 0) | PassedCPol, SDLoc(), MVT::i32);
2143 return true;
2144}
2145
2146bool AMDGPUDAGToDAGISel::SelectGlobalSAddrCPolM0(SDNode *N, SDValue Addr,
2147 SDValue &SAddr,
2148 SDValue &VOffset,
2149 SDValue &Offset,
2150 SDValue &CPol) const {
2151 bool ScaleOffset;
2152 if (!SelectGlobalSAddr(N, Addr, SAddr, VOffset, Offset, ScaleOffset))
2153 return false;
2154
2155 // We are assuming CPol is second from last operand of the intrinsic.
2156 auto PassedCPol =
2157 N->getConstantOperandVal(N->getNumOperands() - 2) & ~AMDGPU::CPol::SCAL;
2158 CPol = CurDAG->getTargetConstant(
2159 (ScaleOffset ? AMDGPU::CPol::SCAL : 0) | PassedCPol, SDLoc(), MVT::i32);
2160 return true;
2161}
2162
2163bool AMDGPUDAGToDAGISel::SelectGlobalSAddrGLC(SDNode *N, SDValue Addr,
2164 SDValue &SAddr, SDValue &VOffset,
2165 SDValue &Offset,
2166 SDValue &CPol) const {
2167 bool ScaleOffset;
2168 if (!SelectGlobalSAddr(N, Addr, SAddr, VOffset, Offset, ScaleOffset))
2169 return false;
2170
2171 unsigned CPolVal = (ScaleOffset ? AMDGPU::CPol::SCAL : 0) | AMDGPU::CPol::GLC;
2172 CPol = CurDAG->getTargetConstant(CPolVal, SDLoc(), MVT::i32);
2173 return true;
2174}
2175
2176bool AMDGPUDAGToDAGISel::SelectGlobalSAddrNoIOffset(SDNode *N, SDValue Addr,
2177 SDValue &SAddr,
2178 SDValue &VOffset,
2179 SDValue &CPol) const {
2180 bool ScaleOffset;
2181 SDValue DummyOffset;
2182 if (!SelectGlobalSAddr(N, Addr, SAddr, VOffset, DummyOffset, ScaleOffset,
2183 false))
2184 return false;
2185
2186 // We are assuming CPol is always the last operand of the intrinsic.
2187 auto PassedCPol =
2188 N->getConstantOperandVal(N->getNumOperands() - 1) & ~AMDGPU::CPol::SCAL;
2189 CPol = CurDAG->getTargetConstant(
2190 (ScaleOffset ? AMDGPU::CPol::SCAL : 0) | PassedCPol, SDLoc(), MVT::i32);
2191 return true;
2192}
2193
2194bool AMDGPUDAGToDAGISel::SelectGlobalSAddrNoIOffsetM0(SDNode *N, SDValue Addr,
2195 SDValue &SAddr,
2196 SDValue &VOffset,
2197 SDValue &CPol) const {
2198 bool ScaleOffset;
2199 SDValue DummyOffset;
2200 if (!SelectGlobalSAddr(N, Addr, SAddr, VOffset, DummyOffset, ScaleOffset,
2201 false))
2202 return false;
2203
2204 // We are assuming CPol is second from last operand of the intrinsic.
2205 auto PassedCPol =
2206 N->getConstantOperandVal(N->getNumOperands() - 2) & ~AMDGPU::CPol::SCAL;
2207 CPol = CurDAG->getTargetConstant(
2208 (ScaleOffset ? AMDGPU::CPol::SCAL : 0) | PassedCPol, SDLoc(), MVT::i32);
2209 return true;
2210}
2211
2213 if (auto *FI = dyn_cast<FrameIndexSDNode>(SAddr)) {
2214 SAddr = CurDAG->getTargetFrameIndex(FI->getIndex(), FI->getValueType(0));
2215 } else if (SAddr.getOpcode() == ISD::ADD &&
2217 // Materialize this into a scalar move for scalar address to avoid
2218 // readfirstlane.
2219 auto *FI = cast<FrameIndexSDNode>(SAddr.getOperand(0));
2220 SDValue TFI = CurDAG->getTargetFrameIndex(FI->getIndex(),
2221 FI->getValueType(0));
2222 SAddr = SDValue(CurDAG->getMachineNode(AMDGPU::S_ADD_I32, SDLoc(SAddr),
2223 MVT::i32, TFI, SAddr.getOperand(1)),
2224 0);
2225 }
2226
2227 return SAddr;
2228}
2229
// Match (32-bit SGPR base) + sext(imm offset)
//
// On success fills in SAddr (uniform base, frame indexes folded) and
// Offset (immediate operand). Returns false only for divergent addresses.
bool AMDGPUDAGToDAGISel::SelectScratchSAddr(SDNode *Parent, SDValue Addr,
                                            SDValue &SAddr,
                                            SDValue &Offset) const {
  // The saddr form requires a uniform address.
  if (Addr->isDivergent())
    return false;

  SDLoc DL(Addr);

  int64_t COffsetVal = 0;

  // Peel a constant offset off the base when the base stays legal.
  if (CurDAG->isBaseWithConstantOffset(Addr) && isFlatScratchBaseLegal(Addr)) {
    COffsetVal = cast<ConstantSDNode>(Addr.getOperand(1))->getSExtValue();
    SAddr = Addr.getOperand(0);
  } else {
    SAddr = Addr;
  }

  // Fold a frame index (or FI + x) into the scalar base.
  SAddr = SelectSAddrFI(CurDAG, SAddr);

  const SIInstrInfo *TII = Subtarget->getInstrInfo();

  // NOTE(review): continuation lines of this condition and of the two
  // calls below were lost in extraction (the flags / split arguments) —
  // confirm against upstream.
  if (!TII->isLegalFLATOffset(COffsetVal, AMDGPUAS::PRIVATE_ADDRESS,
    int64_t SplitImmOffset, RemainderOffset;
    std::tie(SplitImmOffset, RemainderOffset) = TII->splitFlatOffset(

    COffsetVal = SplitImmOffset;

    // Fold the out-of-range part of the offset into the base with an
    // S_ADD_I32.
    SDValue AddOffset =
            ? getMaterializedScalarImm32(Lo_32(RemainderOffset), DL)
            : CurDAG->getSignedTargetConstant(RemainderOffset, DL, MVT::i32);
    SAddr = SDValue(CurDAG->getMachineNode(AMDGPU::S_ADD_I32, DL, MVT::i32,
                                           SAddr, AddOffset),
                    0);
  }

  Offset = CurDAG->getSignedTargetConstant(COffsetVal, DL, MVT::i32);

  return true;
}
2273
2274// Check whether the flat scratch SVS swizzle bug affects this access.
2275bool AMDGPUDAGToDAGISel::checkFlatScratchSVSSwizzleBug(
2276 SDValue VAddr, SDValue SAddr, uint64_t ImmOffset) const {
2277 if (!Subtarget->hasFlatScratchSVSSwizzleBug())
2278 return false;
2279
2280 // The bug affects the swizzling of SVS accesses if there is any carry out
2281 // from the two low order bits (i.e. from bit 1 into bit 2) when adding
2282 // voffset to (soffset + inst_offset).
2283 KnownBits VKnown = CurDAG->computeKnownBits(VAddr);
2284 KnownBits SKnown =
2285 KnownBits::add(CurDAG->computeKnownBits(SAddr),
2286 KnownBits::makeConstant(APInt(32, ImmOffset,
2287 /*isSigned=*/true)));
2288 uint64_t VMax = VKnown.getMaxValue().getZExtValue();
2289 uint64_t SMax = SKnown.getMaxValue().getZExtValue();
2290 return (VMax & 3) + (SMax & 3) >= 4;
2291}
2292
// Match a scratch (private) address in the SVS form:
// (VGPR offset) + (SGPR base) + immediate offset.
bool AMDGPUDAGToDAGISel::SelectScratchSVAddr(SDNode *N, SDValue Addr,
                                             SDValue &VAddr, SDValue &SAddr,
                                             SDValue &Offset,
                                             SDValue &CPol) const {
  int64_t ImmOffset = 0;

  SDValue LHS, RHS;
  // Remember the unpeeled address for the base-legality checks below.
  SDValue OrigAddr = Addr;
  if (isBaseWithConstantOffset64(Addr, LHS, RHS)) {
    int64_t COffsetVal = cast<ConstantSDNode>(RHS)->getSExtValue();
    const SIInstrInfo *TII = Subtarget->getInstrInfo();

    // NOTE(review): a continuation line of this condition (the flat-offset
    // flags argument) was lost in extraction — confirm against upstream.
    if (TII->isLegalFLATOffset(COffsetVal, AMDGPUAS::PRIVATE_ADDRESS,
      Addr = LHS;
      ImmOffset = COffsetVal;
    } else if (!LHS->isDivergent() && COffsetVal > 0) {
      SDLoc SL(N);
      // saddr + large_offset -> saddr + (vaddr = large_offset & ~MaxOffset) +
      //                         (large_offset & MaxOffset);
      int64_t SplitImmOffset, RemainderOffset;
      // NOTE(review): the argument line of this call was lost in extraction.
      std::tie(SplitImmOffset, RemainderOffset) = TII->splitFlatOffset(

      if (isUInt<32>(RemainderOffset)) {
        // Materialize the out-of-range remainder of the offset in a VGPR.
        SDNode *VMov = CurDAG->getMachineNode(
            AMDGPU::V_MOV_B32_e32, SL, MVT::i32,
            CurDAG->getTargetConstant(RemainderOffset, SDLoc(), MVT::i32));
        VAddr = SDValue(VMov, 0);
        SAddr = LHS;
        if (!isFlatScratchBaseLegal(Addr))
          return false;
        if (checkFlatScratchSVSSwizzleBug(VAddr, SAddr, SplitImmOffset))
          return false;
        Offset = CurDAG->getTargetConstant(SplitImmOffset, SDLoc(), MVT::i32);
        CPol = CurDAG->getTargetConstant(0, SDLoc(), MVT::i32);
        return true;
      }
    }
  }

  if (Addr.getOpcode() != ISD::ADD)
    return false;

  LHS = Addr.getOperand(0);
  RHS = Addr.getOperand(1);

  // One operand must be uniform (the SGPR base) and the other divergent
  // (the VGPR offset); either order is accepted.
  if (!LHS->isDivergent() && RHS->isDivergent()) {
    SAddr = LHS;
    VAddr = RHS;
  } else if (!RHS->isDivergent() && LHS->isDivergent()) {
    SAddr = RHS;
    VAddr = LHS;
  } else {
    return false;
  }

  // Base-legality is checked on the original address when an immediate
  // offset was peeled off above.
  if (OrigAddr != Addr) {
    if (!isFlatScratchBaseLegalSVImm(OrigAddr))
      return false;
  } else {
    if (!isFlatScratchBaseLegalSV(OrigAddr))
      return false;
  }

  if (checkFlatScratchSVSSwizzleBug(VAddr, SAddr, ImmOffset))
    return false;
  SAddr = SelectSAddrFI(CurDAG, SAddr);
  Offset = CurDAG->getSignedTargetConstant(ImmOffset, SDLoc(), MVT::i32);

  // Encode the scale-offset decision in the cache-policy operand.
  bool ScaleOffset = SelectScaleOffset(N, VAddr, true /* IsSigned */);
  CPol = CurDAG->getTargetConstant(ScaleOffset ? AMDGPU::CPol::SCAL : 0,
                                   SDLoc(), MVT::i32);
  return true;
}
2368
2369// For unbuffered smem loads, it is illegal for the Immediate Offset to be
2370// negative if the resulting (Offset + (M0 or SOffset or zero) is negative.
2371// Handle the case where the Immediate Offset + SOffset is negative.
2372bool AMDGPUDAGToDAGISel::isSOffsetLegalWithImmOffset(SDValue *SOffset,
2373 bool Imm32Only,
2374 bool IsBuffer,
2375 int64_t ImmOffset) const {
2376 if (!IsBuffer && !Imm32Only && ImmOffset < 0 &&
2377 AMDGPU::hasSMRDSignedImmOffset(*Subtarget)) {
2378 KnownBits SKnown = CurDAG->computeKnownBits(*SOffset);
2379 if (ImmOffset + SKnown.getMinValue().getSExtValue() < 0)
2380 return false;
2381 }
2382
2383 return true;
2384}
2385
// Given \p Offset and load node \p N check if an \p Offset is a multiple of
// the load byte size. If it is update \p Offset to a pre-scaled value and
// return true.
bool AMDGPUDAGToDAGISel::SelectScaleOffset(SDNode *N, SDValue &Offset,
                                           bool IsSigned) const {
  bool ScaleOffset = false;
  // Pre-scaled offsets are only available on subtargets with the feature.
  if (!Subtarget->hasScaleOffset() || !Offset)
    return false;

  // Byte size of the memory access being selected.
  unsigned Size =
      (unsigned)cast<MemSDNode>(N)->getMemoryVT().getFixedSizeInBits() / 8;

  // Look through a 32-bit extend of the offset, if any.
  SDValue Off = Offset;
  if (SDValue Ext = matchExtFromI32orI32(Offset, IsSigned, CurDAG))
    Off = Ext;

  // shl x, log2(Size) is a multiplication by the access size.
  if (isPowerOf2_32(Size) && Off.getOpcode() == ISD::SHL) {
    if (auto *C = dyn_cast<ConstantSDNode>(Off.getOperand(1)))
      ScaleOffset = C->getZExtValue() == Log2_32(Size);
  } else if (Offset.getOpcode() == ISD::MUL ||
             (IsSigned && Offset.getOpcode() == AMDGPUISD::MUL_I24) ||
             Offset.getOpcode() == AMDGPUISD::MUL_U24 ||
             (Offset.isMachineOpcode() &&
              Offset.getMachineOpcode() ==
                  (IsSigned ? AMDGPU::S_MUL_I64_I32_PSEUDO
                            : AMDGPU::S_MUL_U64_U32_PSEUDO))) {
    // An explicit multiply by the access size also qualifies. Note this
    // branch matches on Offset (pre-extend), not Off.
    if (auto *C = dyn_cast<ConstantSDNode>(Offset.getOperand(1)))
      ScaleOffset = C->getZExtValue() == Size;
  }

  // Strip the scaling operation so the hardware can re-apply it.
  if (ScaleOffset)
    Offset = Off.getOperand(0);

  return ScaleOffset;
}
2421
// Match an immediate (if Offset is not null) or an SGPR (if SOffset is
// not null) offset. If Imm32Only is true, match only 32-bit immediate
// offsets available on CI.
bool AMDGPUDAGToDAGISel::SelectSMRDOffset(SDNode *N, SDValue ByteOffsetNode,
                                          SDValue *SOffset, SDValue *Offset,
                                          bool Imm32Only, bool IsBuffer,
                                          bool HasSOffset, int64_t ImmOffset,
                                          bool *ScaleOffset) const {
  assert((!SOffset || !Offset) &&
         "Cannot match both soffset and offset at the same time!");

  if (ScaleOffset) {
    assert(N && SOffset);

    *ScaleOffset = SelectScaleOffset(N, ByteOffsetNode, false /* IsSigned */);
  }

  ConstantSDNode *C = dyn_cast<ConstantSDNode>(ByteOffsetNode);
  if (!C) {
    // A non-constant offset can only be selected into SOffset.
    if (!SOffset)
      return false;

    // A plain 32-bit scalar integer can be used directly.
    if (ByteOffsetNode.getValueType().isScalarInteger() &&
        ByteOffsetNode.getValueType().getSizeInBits() == 32) {
      *SOffset = ByteOffsetNode;
      return isSOffsetLegalWithImmOffset(SOffset, Imm32Only, IsBuffer,
                                         ImmOffset);
    }
    // Otherwise look through a zero-extend from 32 bits.
    if (ByteOffsetNode.getOpcode() == ISD::ZERO_EXTEND) {
      if (ByteOffsetNode.getOperand(0).getValueType().getSizeInBits() == 32) {
        *SOffset = ByteOffsetNode.getOperand(0);
        return isSOffsetLegalWithImmOffset(SOffset, Imm32Only, IsBuffer,
                                           ImmOffset);
      }
    }
    return false;
  }

  SDLoc SL(ByteOffsetNode);

  // GFX9 and GFX10 have signed byte immediate offsets. The immediate
  // offset for S_BUFFER instructions is unsigned.
  int64_t ByteOffset = IsBuffer ? C->getZExtValue() : C->getSExtValue();
  std::optional<int64_t> EncodedOffset = AMDGPU::getSMRDEncodedOffset(
      *Subtarget, ByteOffset, IsBuffer, HasSOffset);
  if (EncodedOffset && Offset && !Imm32Only) {
    *Offset = CurDAG->getSignedTargetConstant(*EncodedOffset, SL, MVT::i32);
    return true;
  }

  // SGPR and literal offsets are unsigned.
  if (ByteOffset < 0)
    return false;

  // Try the CI-only 32-bit literal encoding when requested.
  EncodedOffset = AMDGPU::getSMRDEncodedLiteralOffset32(*Subtarget, ByteOffset);
  if (EncodedOffset && Offset && Imm32Only) {
    *Offset = CurDAG->getTargetConstant(*EncodedOffset, SL, MVT::i32);
    return true;
  }

  if (!isUInt<32>(ByteOffset) && !isInt<32>(ByteOffset))
    return false;

  if (SOffset) {
    // Last resort: materialize the constant into an SGPR via S_MOV_B32.
    SDValue C32Bit = CurDAG->getTargetConstant(ByteOffset, SL, MVT::i32);
    *SOffset = SDValue(
        CurDAG->getMachineNode(AMDGPU::S_MOV_B32, SL, MVT::i32, C32Bit), 0);
    return true;
  }

  return false;
}
2494
2495SDValue AMDGPUDAGToDAGISel::Expand32BitAddress(SDValue Addr) const {
2496 if (Addr.getValueType() != MVT::i32)
2497 return Addr;
2498
2499 // Zero-extend a 32-bit address.
2500 SDLoc SL(Addr);
2501
2502 const MachineFunction &MF = CurDAG->getMachineFunction();
2503 const SIMachineFunctionInfo *Info = MF.getInfo<SIMachineFunctionInfo>();
2504 unsigned AddrHiVal = Info->get32BitAddressHighBits();
2505 SDValue AddrHi = CurDAG->getTargetConstant(AddrHiVal, SL, MVT::i32);
2506
2507 const SDValue Ops[] = {
2508 CurDAG->getTargetConstant(AMDGPU::SReg_64_XEXECRegClassID, SL, MVT::i32),
2509 Addr,
2510 CurDAG->getTargetConstant(AMDGPU::sub0, SL, MVT::i32),
2511 SDValue(CurDAG->getMachineNode(AMDGPU::S_MOV_B32, SL, MVT::i32, AddrHi),
2512 0),
2513 CurDAG->getTargetConstant(AMDGPU::sub1, SL, MVT::i32),
2514 };
2515
2516 return SDValue(CurDAG->getMachineNode(AMDGPU::REG_SEQUENCE, SL, MVT::i64,
2517 Ops), 0);
2518}
2519
// Match a base and an immediate (if Offset is not null) or an SGPR (if
// SOffset is not null) or an immediate+SGPR offset. If Imm32Only is
// true, match only 32-bit immediate offsets available on CI.
bool AMDGPUDAGToDAGISel::SelectSMRDBaseOffset(SDNode *N, SDValue Addr,
                                              SDValue &SBase, SDValue *SOffset,
                                              SDValue *Offset, bool Imm32Only,
                                              bool IsBuffer, bool HasSOffset,
                                              int64_t ImmOffset,
                                              bool *ScaleOffset) const {
  if (SOffset && Offset) {
    assert(!Imm32Only && !IsBuffer);
    SDValue B;

    // Combined form: first peel an immediate offset, then recurse to
    // match an SGPR offset on the remaining base.
    if (!SelectSMRDBaseOffset(N, Addr, B, nullptr, Offset, false, false, true))
      return false;

    int64_t ImmOff = 0;
    if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(*Offset))
      ImmOff = C->getSExtValue();

    return SelectSMRDBaseOffset(N, B, SBase, SOffset, nullptr, false, false,
                                true, ImmOff, ScaleOffset);
  }

  // A 32-bit (address + offset) should not cause unsigned 32-bit integer
  // wraparound, because s_load instructions perform the addition in 64 bits.
  if (Addr.getValueType() == MVT::i32 && Addr.getOpcode() == ISD::ADD &&
      !Addr->getFlags().hasNoUnsignedWrap())
    return false;

  SDValue N0, N1;
  // Extract the base and offset if possible.
  if (Addr->isAnyAdd() || CurDAG->isADDLike(Addr)) {
    N0 = Addr.getOperand(0);
    N1 = Addr.getOperand(1);
  } else if (getBaseWithOffsetUsingSplitOR(*CurDAG, Addr, N0, N1)) {
    assert(N0 && N1 && isa<ConstantSDNode>(N1));
  }
  if (!N0 || !N1)
    return false;

  // Try either operand as the offset; whichever matches leaves the other
  // as the base.
  if (SelectSMRDOffset(N, N1, SOffset, Offset, Imm32Only, IsBuffer, HasSOffset,
                       ImmOffset, ScaleOffset)) {
    SBase = N0;
    return true;
  }
  if (SelectSMRDOffset(N, N0, SOffset, Offset, Imm32Only, IsBuffer, HasSOffset,
                       ImmOffset, ScaleOffset)) {
    SBase = N1;
    return true;
  }
  return false;
}
2573
2574bool AMDGPUDAGToDAGISel::SelectSMRD(SDNode *N, SDValue Addr, SDValue &SBase,
2575 SDValue *SOffset, SDValue *Offset,
2576 bool Imm32Only, bool *ScaleOffset) const {
2577 if (SelectSMRDBaseOffset(N, Addr, SBase, SOffset, Offset, Imm32Only,
2578 /* IsBuffer */ false, /* HasSOffset */ false,
2579 /* ImmOffset */ 0, ScaleOffset)) {
2580 SBase = Expand32BitAddress(SBase);
2581 return true;
2582 }
2583
2584 if (Addr.getValueType() == MVT::i32 && Offset && !SOffset) {
2585 SBase = Expand32BitAddress(Addr);
2586 *Offset = CurDAG->getTargetConstant(0, SDLoc(Addr), MVT::i32);
2587 return true;
2588 }
2589
2590 return false;
2591}
2592
2593bool AMDGPUDAGToDAGISel::SelectSMRDImm(SDValue Addr, SDValue &SBase,
2594 SDValue &Offset) const {
2595 return SelectSMRD(/* N */ nullptr, Addr, SBase, /* SOffset */ nullptr,
2596 &Offset);
2597}
2598
2599bool AMDGPUDAGToDAGISel::SelectSMRDImm32(SDValue Addr, SDValue &SBase,
2600 SDValue &Offset) const {
2601 assert(Subtarget->getGeneration() == AMDGPUSubtarget::SEA_ISLANDS);
2602 return SelectSMRD(/* N */ nullptr, Addr, SBase, /* SOffset */ nullptr,
2603 &Offset, /* Imm32Only */ true);
2604}
2605
2606bool AMDGPUDAGToDAGISel::SelectSMRDSgpr(SDNode *N, SDValue Addr, SDValue &SBase,
2607 SDValue &SOffset, SDValue &CPol) const {
2608 bool ScaleOffset;
2609 if (!SelectSMRD(N, Addr, SBase, &SOffset, /* Offset */ nullptr,
2610 /* Imm32Only */ false, &ScaleOffset))
2611 return false;
2612
2613 CPol = CurDAG->getTargetConstant(ScaleOffset ? AMDGPU::CPol::SCAL : 0,
2614 SDLoc(N), MVT::i32);
2615 return true;
2616}
2617
2618bool AMDGPUDAGToDAGISel::SelectSMRDSgprImm(SDNode *N, SDValue Addr,
2619 SDValue &SBase, SDValue &SOffset,
2620 SDValue &Offset,
2621 SDValue &CPol) const {
2622 bool ScaleOffset;
2623 if (!SelectSMRD(N, Addr, SBase, &SOffset, &Offset, false, &ScaleOffset))
2624 return false;
2625
2626 CPol = CurDAG->getTargetConstant(ScaleOffset ? AMDGPU::CPol::SCAL : 0,
2627 SDLoc(N), MVT::i32);
2628 return true;
2629}
2630
2631bool AMDGPUDAGToDAGISel::SelectSMRDBufferImm(SDValue N, SDValue &Offset) const {
2632 return SelectSMRDOffset(/* N */ nullptr, N, /* SOffset */ nullptr, &Offset,
2633 /* Imm32Only */ false, /* IsBuffer */ true);
2634}
2635
2636bool AMDGPUDAGToDAGISel::SelectSMRDBufferImm32(SDValue N,
2637 SDValue &Offset) const {
2638 assert(Subtarget->getGeneration() == AMDGPUSubtarget::SEA_ISLANDS);
2639 return SelectSMRDOffset(/* N */ nullptr, N, /* SOffset */ nullptr, &Offset,
2640 /* Imm32Only */ true, /* IsBuffer */ true);
2641}
2642
2643bool AMDGPUDAGToDAGISel::SelectSMRDBufferSgprImm(SDValue N, SDValue &SOffset,
2644 SDValue &Offset) const {
2645 // Match the (soffset + offset) pair as a 32-bit register base and
2646 // an immediate offset.
2647 return N.getValueType() == MVT::i32 &&
2648 SelectSMRDBaseOffset(/* N */ nullptr, N, /* SBase */ SOffset,
2649 /* SOffset*/ nullptr, &Offset,
2650 /* Imm32Only */ false, /* IsBuffer */ true);
2651}
2652
2653bool AMDGPUDAGToDAGISel::SelectMOVRELOffset(SDValue Index,
2654 SDValue &Base,
2655 SDValue &Offset) const {
2656 SDLoc DL(Index);
2657
2658 if (CurDAG->isBaseWithConstantOffset(Index)) {
2659 SDValue N0 = Index.getOperand(0);
2660 SDValue N1 = Index.getOperand(1);
2661 ConstantSDNode *C1 = cast<ConstantSDNode>(N1);
2662
2663 // (add n0, c0)
2664 // Don't peel off the offset (c0) if doing so could possibly lead
2665 // the base (n0) to be negative.
2666 // (or n0, |c0|) can never change a sign given isBaseWithConstantOffset.
2667 if (C1->getSExtValue() <= 0 || CurDAG->SignBitIsZero(N0) ||
2668 (Index->getOpcode() == ISD::OR && C1->getSExtValue() >= 0)) {
2669 Base = N0;
2670 Offset = CurDAG->getTargetConstant(C1->getZExtValue(), DL, MVT::i32);
2671 return true;
2672 }
2673 }
2674
2675 if (isa<ConstantSDNode>(Index))
2676 return false;
2677
2678 Base = Index;
2679 Offset = CurDAG->getTargetConstant(0, DL, MVT::i32);
2680 return true;
2681}
2682
2683SDNode *AMDGPUDAGToDAGISel::getBFE32(bool IsSigned, const SDLoc &DL,
2684 SDValue Val, uint32_t Offset,
2685 uint32_t Width) {
2686 if (Val->isDivergent()) {
2687 unsigned Opcode = IsSigned ? AMDGPU::V_BFE_I32_e64 : AMDGPU::V_BFE_U32_e64;
2688 SDValue Off = CurDAG->getTargetConstant(Offset, DL, MVT::i32);
2689 SDValue W = CurDAG->getTargetConstant(Width, DL, MVT::i32);
2690
2691 return CurDAG->getMachineNode(Opcode, DL, MVT::i32, Val, Off, W);
2692 }
2693 unsigned Opcode = IsSigned ? AMDGPU::S_BFE_I32 : AMDGPU::S_BFE_U32;
2694 // Transformation function, pack the offset and width of a BFE into
2695 // the format expected by the S_BFE_I32 / S_BFE_U32. In the second
2696 // source, bits [5:0] contain the offset and bits [22:16] the width.
2697 uint32_t PackedVal = Offset | (Width << 16);
2698 SDValue PackedConst = CurDAG->getTargetConstant(PackedVal, DL, MVT::i32);
2699
2700 return CurDAG->getMachineNode(Opcode, DL, MVT::i32, Val, PackedConst);
2701}
2702
2703void AMDGPUDAGToDAGISel::SelectS_BFEFromShifts(SDNode *N) {
2704 // "(a << b) srl c)" ---> "BFE_U32 a, (c-b), (32-c)
2705 // "(a << b) sra c)" ---> "BFE_I32 a, (c-b), (32-c)
2706 // Predicate: 0 < b <= c < 32
2707
2708 const SDValue &Shl = N->getOperand(0);
2709 ConstantSDNode *B = dyn_cast<ConstantSDNode>(Shl->getOperand(1));
2710 ConstantSDNode *C = dyn_cast<ConstantSDNode>(N->getOperand(1));
2711
2712 if (B && C) {
2713 uint32_t BVal = B->getZExtValue();
2714 uint32_t CVal = C->getZExtValue();
2715
2716 if (0 < BVal && BVal <= CVal && CVal < 32) {
2717 bool Signed = N->getOpcode() == ISD::SRA;
2718 ReplaceNode(N, getBFE32(Signed, SDLoc(N), Shl.getOperand(0), CVal - BVal,
2719 32 - CVal));
2720 return;
2721 }
2722 }
2723 SelectCode(N);
2724}
2725
// Try to select AND/SRL/SRA (and a sign-extend-in-reg case) patterns as
// S_BFE/V_BFE bitfield extracts; otherwise fall back to normal selection.
void AMDGPUDAGToDAGISel::SelectS_BFE(SDNode *N) {
  switch (N->getOpcode()) {
  case ISD::AND:
    if (N->getOperand(0).getOpcode() == ISD::SRL) {
      // "(a srl b) & mask" ---> "BFE_U32 a, b, popcount(mask)"
      // Predicate: isMask(mask)
      const SDValue &Srl = N->getOperand(0);
      ConstantSDNode *Shift = dyn_cast<ConstantSDNode>(Srl.getOperand(1));
      ConstantSDNode *Mask = dyn_cast<ConstantSDNode>(N->getOperand(1));

      if (Shift && Mask) {
        uint32_t ShiftVal = Shift->getZExtValue();
        uint32_t MaskVal = Mask->getZExtValue();

        if (isMask_32(MaskVal)) {
          // Field width is the number of set bits in the mask.
          uint32_t WidthVal = llvm::popcount(MaskVal);
          ReplaceNode(N, getBFE32(false, SDLoc(N), Srl.getOperand(0), ShiftVal,
                                  WidthVal));
          return;
        }
      }
    }
    break;
  case ISD::SRL:
    if (N->getOperand(0).getOpcode() == ISD::AND) {
      // "(a & mask) srl b)" ---> "BFE_U32 a, b, popcount(mask >> b)"
      // Predicate: isMask(mask >> b)
      const SDValue &And = N->getOperand(0);
      ConstantSDNode *Shift = dyn_cast<ConstantSDNode>(N->getOperand(1));
      ConstantSDNode *Mask = dyn_cast<ConstantSDNode>(And->getOperand(1));

      if (Shift && Mask) {
        uint32_t ShiftVal = Shift->getZExtValue();
        uint32_t MaskVal = Mask->getZExtValue() >> ShiftVal;

        if (isMask_32(MaskVal)) {
          uint32_t WidthVal = llvm::popcount(MaskVal);
          ReplaceNode(N, getBFE32(false, SDLoc(N), And.getOperand(0), ShiftVal,
                                  WidthVal));
          return;
        }
      }
    } else if (N->getOperand(0).getOpcode() == ISD::SHL) {
      SelectS_BFEFromShifts(N);
      return;
    }
    break;
  case ISD::SRA:
    if (N->getOperand(0).getOpcode() == ISD::SHL) {
      SelectS_BFEFromShifts(N);
      return;
    }
    break;

  // NOTE(review): the case-label line here (presumably
  // ISD::SIGN_EXTEND_INREG, with an opening brace) was lost in
  // extraction — confirm against upstream.
    // sext_inreg (srl x, 16), i8 -> bfe_i32 x, 16, 8
    SDValue Src = N->getOperand(0);
    if (Src.getOpcode() != ISD::SRL)
      break;

    const ConstantSDNode *Amt = dyn_cast<ConstantSDNode>(Src.getOperand(1));
    if (!Amt)
      break;

    // Width comes from the VT being extended from.
    unsigned Width = cast<VTSDNode>(N->getOperand(1))->getVT().getSizeInBits();
    ReplaceNode(N, getBFE32(true, SDLoc(N), Src.getOperand(0),
                            Amt->getZExtValue(), Width));
    return;
  }
  }

  // No BFE pattern matched: use the table-generated selector.
  SelectCode(N);
}
2799
2800bool AMDGPUDAGToDAGISel::isCBranchSCC(const SDNode *N) const {
2801 assert(N->getOpcode() == ISD::BRCOND);
2802 if (!N->hasOneUse())
2803 return false;
2804
2805 SDValue Cond = N->getOperand(1);
2806 if (Cond.getOpcode() == ISD::CopyToReg)
2807 Cond = Cond.getOperand(2);
2808
2809 if (Cond.getOpcode() != ISD::SETCC || !Cond.hasOneUse())
2810 return false;
2811
2812 MVT VT = Cond.getOperand(0).getSimpleValueType();
2813 if (VT == MVT::i32)
2814 return true;
2815
2816 if (VT == MVT::i64) {
2817 ISD::CondCode CC = cast<CondCodeSDNode>(Cond.getOperand(2))->get();
2818 return (CC == ISD::SETEQ || CC == ISD::SETNE) &&
2819 Subtarget->hasScalarCompareEq64();
2820 }
2821
2822 if ((VT == MVT::f16 || VT == MVT::f32) && Subtarget->hasSALUFloatInsts())
2823 return true;
2824
2825 return false;
2826}
2827
2828static SDValue combineBallotPattern(SDValue VCMP, bool &Negate) {
2829 assert(VCMP->getOpcode() == AMDGPUISD::SETCC);
2830 // Special case for amdgcn.ballot:
2831 // %Cond = i1 (and/or combination of i1 ISD::SETCCs)
2832 // %VCMP = i(WaveSize) AMDGPUISD::SETCC (ext %Cond), 0, setne/seteq
2833 // =>
2834 // Use i1 %Cond value instead of i(WaveSize) %VCMP.
2835 // This is possible because divergent ISD::SETCC is selected as V_CMP and
2836 // Cond becomes a i(WaveSize) full mask value.
2837 // Note that ballot doesn't use SETEQ condition but its easy to support it
2838 // here for completeness, so in this case Negate is set true on return.
2839 auto VCMP_CC = cast<CondCodeSDNode>(VCMP.getOperand(2))->get();
2840 if ((VCMP_CC == ISD::SETEQ || VCMP_CC == ISD::SETNE) &&
2841 isNullConstant(VCMP.getOperand(1))) {
2842
2843 auto Cond = VCMP.getOperand(0);
2844 if (ISD::isExtOpcode(Cond->getOpcode())) // Skip extension.
2845 Cond = Cond.getOperand(0);
2846
2847 if (isBoolSGPR(Cond)) {
2848 Negate = VCMP_CC == ISD::SETEQ;
2849 return Cond;
2850 }
2851 }
2852 return SDValue();
2853}
2854
// Select BRCOND into one of the scalar conditional branches. Uniform branches
// whose condition is in SCC use S_CBRANCH_SCC0/1; divergent (or VCC-resident)
// conditions use S_CBRANCH_VCCZ/VCCNZ, masking the condition with EXEC when
// we cannot prove the bits for inactive lanes are already zero.
void AMDGPUDAGToDAGISel::SelectBRCOND(SDNode *N) {
  SDValue Cond = N->getOperand(1);

  if (Cond.isUndef()) {
    // Undef condition: emit the pseudo that later lowers to an unconditional
    // or removable branch.
    CurDAG->SelectNodeTo(N, AMDGPU::SI_BR_UNDEF, MVT::Other,
                         N->getOperand(2), N->getOperand(0));
    return;
  }

  const SIRegisterInfo *TRI = Subtarget->getRegisterInfo();

  // UseSCCBr: branch on SCC (uniform path). AndExec: need to AND the
  // condition with EXEC before branching on VCC. Negate: branch on the
  // inverted condition (SCC0/VCCZ instead of SCC1/VCCNZ).
  bool UseSCCBr = isCBranchSCC(N) && isUniformBr(N);
  bool AndExec = !UseSCCBr;
  bool Negate = false;

  if (Cond.getOpcode() == ISD::SETCC &&
      Cond->getOperand(0)->getOpcode() == AMDGPUISD::SETCC) {
    SDValue VCMP = Cond->getOperand(0);
    auto CC = cast<CondCodeSDNode>(Cond->getOperand(2))->get();
    if ((CC == ISD::SETEQ || CC == ISD::SETNE) &&
        isNullConstant(Cond->getOperand(1)) &&
        // We may encounter ballot.i64 in wave32 mode on -O0.
        VCMP.getValueType().getSizeInBits() == Subtarget->getWavefrontSize()) {
      // %VCMP = i(WaveSize) AMDGPUISD::SETCC ...
      // %C = i1 ISD::SETCC %VCMP, 0, setne/seteq
      // BRCOND i1 %C, %BB
      // =>
      // %VCMP = i(WaveSize) AMDGPUISD::SETCC ...
      // VCC = COPY i(WaveSize) %VCMP
      // S_CBRANCH_VCCNZ/VCCZ %BB
      Negate = CC == ISD::SETEQ;
      bool NegatedBallot = false;
      if (auto BallotCond = combineBallotPattern(VCMP, NegatedBallot)) {
        Cond = BallotCond;
        UseSCCBr = !BallotCond->isDivergent();
        // Two possible negations (outer seteq and inverted ballot) cancel.
        Negate = Negate ^ NegatedBallot;
      } else {
        // TODO: don't use SCC here assuming that AMDGPUISD::SETCC is always
        // selected as V_CMP, but this may change for uniform condition.
        Cond = VCMP;
        UseSCCBr = false;
      }
    }
    // Cond is either V_CMP resulted from AMDGPUISD::SETCC or a combination of
    // V_CMPs resulted from ballot or ballot has uniform condition and SCC is
    // used.
    AndExec = false;
  }

  unsigned BrOp =
      UseSCCBr ? (Negate ? AMDGPU::S_CBRANCH_SCC0 : AMDGPU::S_CBRANCH_SCC1)
               : (Negate ? AMDGPU::S_CBRANCH_VCCZ : AMDGPU::S_CBRANCH_VCCNZ);
  Register CondReg = UseSCCBr ? AMDGPU::SCC : TRI->getVCC();
  SDLoc SL(N);

  if (AndExec) {
    // This is the case that we are selecting to S_CBRANCH_VCCNZ. We have not
    // analyzed what generates the vcc value, so we do not know whether vcc
    // bits for disabled lanes are 0. Thus we need to mask out bits for
    // disabled lanes.
    //
    // For the case that we select S_CBRANCH_SCC1 and it gets
    // changed to S_CBRANCH_VCCNZ in SIFixSGPRCopies, SIFixSGPRCopies calls
    // SIInstrInfo::moveToVALU which inserts the S_AND).
    //
    // We could add an analysis of what generates the vcc value here and omit
    // the S_AND when is unnecessary. But it would be better to add a separate
    // pass after SIFixSGPRCopies to do the unnecessary S_AND removal, so it
    // catches both cases.
    Cond = SDValue(
        CurDAG->getMachineNode(
            Subtarget->isWave32() ? AMDGPU::S_AND_B32 : AMDGPU::S_AND_B64, SL,
            MVT::i1,
            CurDAG->getRegister(Subtarget->isWave32() ? AMDGPU::EXEC_LO
                                                      : AMDGPU::EXEC,
                                MVT::i1),
            Cond),
        0);
  }

  // Copy the condition into SCC/VCC and emit the branch consuming it.
  SDValue VCC = CurDAG->getCopyToReg(N->getOperand(0), SL, CondReg, Cond);
  CurDAG->SelectNodeTo(N, BrOp, MVT::Other,
                       N->getOperand(2), // Basic Block
                       VCC.getValue(0));
}
2940
2941void AMDGPUDAGToDAGISel::SelectFP_EXTEND(SDNode *N) {
2942 if (Subtarget->hasSALUFloatInsts() && N->getValueType(0) == MVT::f32 &&
2943 !N->isDivergent()) {
2944 SDValue Src = N->getOperand(0);
2945 if (Src.getValueType() == MVT::f16) {
2946 if (isExtractHiElt(Src, Src)) {
2947 CurDAG->SelectNodeTo(N, AMDGPU::S_CVT_HI_F32_F16, N->getVTList(),
2948 {Src});
2949 return;
2950 }
2951 }
2952 }
2953
2954 SelectCode(N);
2955}
2956
2957void AMDGPUDAGToDAGISel::SelectDSAppendConsume(SDNode *N, unsigned IntrID) {
2958 // The address is assumed to be uniform, so if it ends up in a VGPR, it will
2959 // be copied to an SGPR with readfirstlane.
2960 unsigned Opc = IntrID == Intrinsic::amdgcn_ds_append ?
2961 AMDGPU::DS_APPEND : AMDGPU::DS_CONSUME;
2962
2963 SDValue Chain = N->getOperand(0);
2964 SDValue Ptr = N->getOperand(2);
2965 MemIntrinsicSDNode *M = cast<MemIntrinsicSDNode>(N);
2966 MachineMemOperand *MMO = M->getMemOperand();
2967 bool IsGDS = M->getAddressSpace() == AMDGPUAS::REGION_ADDRESS;
2968
2970 if (CurDAG->isBaseWithConstantOffset(Ptr)) {
2971 SDValue PtrBase = Ptr.getOperand(0);
2972 SDValue PtrOffset = Ptr.getOperand(1);
2973
2974 const APInt &OffsetVal = PtrOffset->getAsAPIntVal();
2975 if (isDSOffsetLegal(PtrBase, OffsetVal.getZExtValue())) {
2976 N = glueCopyToM0(N, PtrBase);
2977 Offset = CurDAG->getTargetConstant(OffsetVal, SDLoc(), MVT::i32);
2978 }
2979 }
2980
2981 if (!Offset) {
2982 N = glueCopyToM0(N, Ptr);
2983 Offset = CurDAG->getTargetConstant(0, SDLoc(), MVT::i32);
2984 }
2985
2986 SDValue Ops[] = {
2987 Offset,
2988 CurDAG->getTargetConstant(IsGDS, SDLoc(), MVT::i32),
2989 Chain,
2990 N->getOperand(N->getNumOperands() - 1) // New glue
2991 };
2992
2993 SDNode *Selected = CurDAG->SelectNodeTo(N, Opc, N->getVTList(), Ops);
2994 CurDAG->setNodeMemRefs(cast<MachineSDNode>(Selected), {MMO});
2995}
2996
2997// We need to handle this here because tablegen doesn't support matching
2998// instructions with multiple outputs.
2999void AMDGPUDAGToDAGISel::SelectDSBvhStackIntrinsic(SDNode *N, unsigned IntrID) {
3000 unsigned Opc;
3001 switch (IntrID) {
3002 case Intrinsic::amdgcn_ds_bvh_stack_rtn:
3003 case Intrinsic::amdgcn_ds_bvh_stack_push4_pop1_rtn:
3004 Opc = AMDGPU::DS_BVH_STACK_RTN_B32;
3005 break;
3006 case Intrinsic::amdgcn_ds_bvh_stack_push8_pop1_rtn:
3007 Opc = AMDGPU::DS_BVH_STACK_PUSH8_POP1_RTN_B32;
3008 break;
3009 case Intrinsic::amdgcn_ds_bvh_stack_push8_pop2_rtn:
3010 Opc = AMDGPU::DS_BVH_STACK_PUSH8_POP2_RTN_B64;
3011 break;
3012 }
3013 SDValue Ops[] = {N->getOperand(2), N->getOperand(3), N->getOperand(4),
3014 N->getOperand(5), N->getOperand(0)};
3015
3016 MemIntrinsicSDNode *M = cast<MemIntrinsicSDNode>(N);
3017 MachineMemOperand *MMO = M->getMemOperand();
3018 SDNode *Selected = CurDAG->SelectNodeTo(N, Opc, N->getVTList(), Ops);
3019 CurDAG->setNodeMemRefs(cast<MachineSDNode>(Selected), {MMO});
3020}
3021
3022void AMDGPUDAGToDAGISel::SelectTensorLoadStore(SDNode *N, unsigned IntrID) {
3023 bool IsLoad = IntrID == Intrinsic::amdgcn_tensor_load_to_lds;
3024 unsigned Opc =
3025 IsLoad ? AMDGPU::TENSOR_LOAD_TO_LDS_d4 : AMDGPU::TENSOR_STORE_FROM_LDS_d4;
3026
3027 SmallVector<SDValue, 7> TensorOps;
3028 // First two groups
3029 TensorOps.push_back(N->getOperand(2)); // D# group 0
3030 TensorOps.push_back(N->getOperand(3)); // D# group 1
3031
3032 // Use _D2 version if both group 2 and 3 are zero-initialized.
3033 SDValue Group2 = N->getOperand(4);
3034 SDValue Group3 = N->getOperand(5);
3035 if (ISD::isBuildVectorAllZeros(Group2.getNode()) &&
3037 Opc = IsLoad ? AMDGPU::TENSOR_LOAD_TO_LDS_d2
3038 : AMDGPU::TENSOR_STORE_FROM_LDS_d2;
3039 } else { // Has at least 4 groups
3040 TensorOps.push_back(Group2); // D# group 2
3041 TensorOps.push_back(Group3); // D# group 3
3042 }
3043
3044 // TODO: Handle the fifth group: N->getOperand(6), which is silently ignored
3045 // for now because all existing targets only support up to 4 groups.
3046 TensorOps.push_back(CurDAG->getTargetConstant(0, SDLoc(N), MVT::i1)); // r128
3047 TensorOps.push_back(N->getOperand(7)); // cache policy
3048 TensorOps.push_back(N->getOperand(0)); // chain
3049
3050 (void)CurDAG->SelectNodeTo(N, Opc, MVT::Other, TensorOps);
3051}
3052
3053static unsigned gwsIntrinToOpcode(unsigned IntrID) {
3054 switch (IntrID) {
3055 case Intrinsic::amdgcn_ds_gws_init:
3056 return AMDGPU::DS_GWS_INIT;
3057 case Intrinsic::amdgcn_ds_gws_barrier:
3058 return AMDGPU::DS_GWS_BARRIER;
3059 case Intrinsic::amdgcn_ds_gws_sema_v:
3060 return AMDGPU::DS_GWS_SEMA_V;
3061 case Intrinsic::amdgcn_ds_gws_sema_br:
3062 return AMDGPU::DS_GWS_SEMA_BR;
3063 case Intrinsic::amdgcn_ds_gws_sema_p:
3064 return AMDGPU::DS_GWS_SEMA_P;
3065 case Intrinsic::amdgcn_ds_gws_sema_release_all:
3066 return AMDGPU::DS_GWS_SEMA_RELEASE_ALL;
3067 default:
3068 llvm_unreachable("not a gws intrinsic");
3069 }
3070}
3071
3072void AMDGPUDAGToDAGISel::SelectDS_GWS(SDNode *N, unsigned IntrID) {
3073 if (!Subtarget->hasGWS() ||
3074 (IntrID == Intrinsic::amdgcn_ds_gws_sema_release_all &&
3075 !Subtarget->hasGWSSemaReleaseAll())) {
3076 // Let this error.
3077 SelectCode(N);
3078 return;
3079 }
3080
3081 // Chain, intrinsic ID, vsrc, offset
3082 const bool HasVSrc = N->getNumOperands() == 4;
3083 assert(HasVSrc || N->getNumOperands() == 3);
3084
3085 SDLoc SL(N);
3086 SDValue BaseOffset = N->getOperand(HasVSrc ? 3 : 2);
3087 int ImmOffset = 0;
3088 MemIntrinsicSDNode *M = cast<MemIntrinsicSDNode>(N);
3089 MachineMemOperand *MMO = M->getMemOperand();
3090
3091 // Don't worry if the offset ends up in a VGPR. Only one lane will have
3092 // effect, so SIFixSGPRCopies will validly insert readfirstlane.
3093
3094 // The resource id offset is computed as (<isa opaque base> + M0[21:16] +
3095 // offset field) % 64. Some versions of the programming guide omit the m0
3096 // part, or claim it's from offset 0.
3097 if (ConstantSDNode *ConstOffset = dyn_cast<ConstantSDNode>(BaseOffset)) {
3098 // If we have a constant offset, try to use the 0 in m0 as the base.
3099 // TODO: Look into changing the default m0 initialization value. If the
3100 // default -1 only set the low 16-bits, we could leave it as-is and add 1 to
3101 // the immediate offset.
3102 glueCopyToM0(N, CurDAG->getTargetConstant(0, SL, MVT::i32));
3103 ImmOffset = ConstOffset->getZExtValue();
3104 } else {
3105 if (CurDAG->isBaseWithConstantOffset(BaseOffset)) {
3106 ImmOffset = BaseOffset.getConstantOperandVal(1);
3107 BaseOffset = BaseOffset.getOperand(0);
3108 }
3109
3110 // Prefer to do the shift in an SGPR since it should be possible to use m0
3111 // as the result directly. If it's already an SGPR, it will be eliminated
3112 // later.
3113 SDNode *SGPROffset
3114 = CurDAG->getMachineNode(AMDGPU::V_READFIRSTLANE_B32, SL, MVT::i32,
3115 BaseOffset);
3116 // Shift to offset in m0
3117 SDNode *M0Base
3118 = CurDAG->getMachineNode(AMDGPU::S_LSHL_B32, SL, MVT::i32,
3119 SDValue(SGPROffset, 0),
3120 CurDAG->getTargetConstant(16, SL, MVT::i32));
3121 glueCopyToM0(N, SDValue(M0Base, 0));
3122 }
3123
3124 SDValue Chain = N->getOperand(0);
3125 SDValue OffsetField = CurDAG->getTargetConstant(ImmOffset, SL, MVT::i32);
3126
3127 const unsigned Opc = gwsIntrinToOpcode(IntrID);
3128
3129 const MCInstrDesc &InstrDesc = TII->get(Opc);
3130 int Data0Idx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::data0);
3131
3132 const TargetRegisterClass *DataRC = TII->getRegClass(InstrDesc, Data0Idx);
3133
3135 if (HasVSrc) {
3136 const SIRegisterInfo *TRI = Subtarget->getRegisterInfo();
3137
3138 SDValue Data = N->getOperand(2);
3139 MVT DataVT = Data.getValueType().getSimpleVT();
3140 if (TRI->isTypeLegalForClass(*DataRC, DataVT)) {
3141 // Normal 32-bit case.
3142 Ops.push_back(N->getOperand(2));
3143 } else {
3144 // Operand is really 32-bits, but requires 64-bit alignment, so use the
3145 // even aligned 64-bit register class.
3146 const SDValue RegSeqOps[] = {
3147 CurDAG->getTargetConstant(DataRC->getID(), SL, MVT::i32), Data,
3148 CurDAG->getTargetConstant(AMDGPU::sub0, SL, MVT::i32),
3149 SDValue(
3150 CurDAG->getMachineNode(TargetOpcode::IMPLICIT_DEF, SL, MVT::i32),
3151 0),
3152 CurDAG->getTargetConstant(AMDGPU::sub1, SL, MVT::i32)};
3153
3154 Ops.push_back(SDValue(CurDAG->getMachineNode(TargetOpcode::REG_SEQUENCE,
3155 SL, MVT::v2i32, RegSeqOps),
3156 0));
3157 }
3158 }
3159
3160 Ops.push_back(OffsetField);
3161 Ops.push_back(Chain);
3162
3163 SDNode *Selected = CurDAG->SelectNodeTo(N, Opc, N->getVTList(), Ops);
3164 CurDAG->setNodeMemRefs(cast<MachineSDNode>(Selected), {MMO});
3165}
3166
// Custom selection for amdgcn.interp.p1.f16 on 16-bank-LDS subtargets, where
// the operation needs two instructions (V_INTERP_MOV_F32 feeding
// V_INTERP_P1LV_F16) chained through M0 by explicit glue.
void AMDGPUDAGToDAGISel::SelectInterpP1F16(SDNode *N) {
  if (Subtarget->getLDSBankCount() != 16) {
    // This is a single instruction with a pattern.
    SelectCode(N);
    return;
  }

  SDLoc DL(N);

  // This requires 2 instructions. It is possible to write a pattern to support
  // this, but the generated isel emitter doesn't correctly deal with multiple
  // output instructions using the same physical register input. The copy to m0
  // is incorrectly placed before the second instruction.
  //
  // TODO: Match source modifiers.
  //
  // def : Pat <
  //   (int_amdgcn_interp_p1_f16
  //                             (VOP3Mods f32:$src0, i32:$src0_modifiers),
  //                                (i32 timm:$attrchan), (i32 timm:$attr),
  //                                (i1 timm:$high), M0),
  //   (V_INTERP_P1LV_F16 $src0_modifiers, VGPR_32:$src0, timm:$attr,
  //       timm:$attrchan, 0,
  //       (V_INTERP_MOV_F32 2, timm:$attr, timm:$attrchan), timm:$high)> {
  //   let Predicates = [has16BankLDS];
  // }

  // 16 bank LDS
  // Copy the M0 operand (operand 5) into the physical M0 register; the glue
  // result keeps the copy ordered immediately before its consumers.
  SDValue ToM0 = CurDAG->getCopyToReg(CurDAG->getEntryNode(), DL, AMDGPU::M0,
                                      N->getOperand(5), SDValue());

  SDVTList VTs = CurDAG->getVTList(MVT::f32, MVT::Other);

  SDNode *InterpMov =
    CurDAG->getMachineNode(AMDGPU::V_INTERP_MOV_F32, DL, VTs, {
        CurDAG->getTargetConstant(2, DL, MVT::i32), // P0
        N->getOperand(3),  // Attr
        N->getOperand(2),  // Attrchan
        ToM0.getValue(1) // In glue
  });

  SDNode *InterpP1LV =
    CurDAG->getMachineNode(AMDGPU::V_INTERP_P1LV_F16, DL, MVT::f32, {
        CurDAG->getTargetConstant(0, DL, MVT::i32), // $src0_modifiers
        N->getOperand(1), // Src0
        N->getOperand(3), // Attr
        N->getOperand(2), // Attrchan
        CurDAG->getTargetConstant(0, DL, MVT::i32), // $src2_modifiers
        SDValue(InterpMov, 0), // Src2 - holds two f16 values selected by high
        N->getOperand(4), // high
        CurDAG->getTargetConstant(0, DL, MVT::i1), // $clamp
        CurDAG->getTargetConstant(0, DL, MVT::i32), // $omod
        SDValue(InterpMov, 1)
  });

  // Replace only value 0; the original node is left to be cleaned up by the
  // DAG once unreferenced.
  CurDAG->ReplaceAllUsesOfValueWith(SDValue(N, 0), SDValue(InterpP1LV, 0));
}
3224
3225void AMDGPUDAGToDAGISel::SelectINTRINSIC_W_CHAIN(SDNode *N) {
3226 unsigned IntrID = N->getConstantOperandVal(1);
3227 switch (IntrID) {
3228 case Intrinsic::amdgcn_ds_append:
3229 case Intrinsic::amdgcn_ds_consume: {
3230 if (N->getValueType(0) != MVT::i32)
3231 break;
3232 SelectDSAppendConsume(N, IntrID);
3233 return;
3234 }
3235 case Intrinsic::amdgcn_ds_bvh_stack_rtn:
3236 case Intrinsic::amdgcn_ds_bvh_stack_push4_pop1_rtn:
3237 case Intrinsic::amdgcn_ds_bvh_stack_push8_pop1_rtn:
3238 case Intrinsic::amdgcn_ds_bvh_stack_push8_pop2_rtn:
3239 SelectDSBvhStackIntrinsic(N, IntrID);
3240 return;
3241 case Intrinsic::amdgcn_init_whole_wave:
3242 CurDAG->getMachineFunction()
3243 .getInfo<SIMachineFunctionInfo>()
3244 ->setInitWholeWave();
3245 break;
3246 }
3247
3248 SelectCode(N);
3249}
3250
// Custom selection for chainless intrinsics. Handles the WQM/WWM family,
// f16 interpolation, and permlane swaps, taking care to preserve any
// convergence-control glue attached to the node.
void AMDGPUDAGToDAGISel::SelectINTRINSIC_WO_CHAIN(SDNode *N) {
  unsigned IntrID = N->getConstantOperandVal(0);
  unsigned Opcode = AMDGPU::INSTRUCTION_LIST_END;
  SDNode *ConvGlueNode = N->getGluedNode();
  if (ConvGlueNode) {
    // FIXME: Possibly iterate over multiple glue nodes?
    assert(ConvGlueNode->getOpcode() == ISD::CONVERGENCECTRL_GLUE);
    // Rebuild the glue as a machine-level CONVERGENCECTRL_GLUE so it survives
    // instruction selection; it is re-attached to the selected node below.
    ConvGlueNode = ConvGlueNode->getOperand(0).getNode();
    ConvGlueNode =
        CurDAG->getMachineNode(TargetOpcode::CONVERGENCECTRL_GLUE, {},
                               MVT::Glue, SDValue(ConvGlueNode, 0));
  } else {
    ConvGlueNode = nullptr;
  }
  switch (IntrID) {
  case Intrinsic::amdgcn_wqm:
    Opcode = AMDGPU::WQM;
    break;
  case Intrinsic::amdgcn_softwqm:
    Opcode = AMDGPU::SOFT_WQM;
    break;
  case Intrinsic::amdgcn_wwm:
  case Intrinsic::amdgcn_strict_wwm:
    Opcode = AMDGPU::STRICT_WWM;
    break;
  case Intrinsic::amdgcn_strict_wqm:
    Opcode = AMDGPU::STRICT_WQM;
    break;
  case Intrinsic::amdgcn_interp_p1_f16:
    SelectInterpP1F16(N);
    return;
  case Intrinsic::amdgcn_permlane16_swap:
  case Intrinsic::amdgcn_permlane32_swap: {
    if ((IntrID == Intrinsic::amdgcn_permlane16_swap &&
         !Subtarget->hasPermlane16Swap()) ||
        (IntrID == Intrinsic::amdgcn_permlane32_swap &&
         !Subtarget->hasPermlane32Swap())) {
      SelectCode(N); // Hit the default error
      return;
    }

    Opcode = IntrID == Intrinsic::amdgcn_permlane16_swap
                 ? AMDGPU::V_PERMLANE16_SWAP_B32_e64
                 : AMDGPU::V_PERMLANE32_SWAP_B32_e64;

    // Drop the intrinsic-ID operand (operand 0) and append the rebuilt glue.
    SmallVector<SDValue, 4> NewOps(N->op_begin() + 1, N->op_end());
    if (ConvGlueNode)
      NewOps.push_back(SDValue(ConvGlueNode, 0));

    // Translate the boolean fi operand into the DPP fi immediate encoding.
    bool FI = N->getConstantOperandVal(3);
    NewOps[2] = CurDAG->getTargetConstant(
        FI ? AMDGPU::DPP::DPP_FI_1 : AMDGPU::DPP::DPP_FI_0, SDLoc(), MVT::i32);

    CurDAG->SelectNodeTo(N, Opcode, N->getVTList(), NewOps);
    return;
  }
  default:
    SelectCode(N);
    break;
  }

  if (Opcode != AMDGPU::INSTRUCTION_LIST_END) {
    // Single-source pseudo (WQM/WWM family): select in place.
    SDValue Src = N->getOperand(1);
    CurDAG->SelectNodeTo(N, Opcode, N->getVTList(), {Src});
  }

  if (ConvGlueNode) {
    // Re-attach the convergence glue to whatever node N was morphed into.
    SmallVector<SDValue, 4> NewOps(N->ops());
    NewOps.push_back(SDValue(ConvGlueNode, 0));
    CurDAG->MorphNodeTo(N, N->getOpcode(), N->getVTList(), NewOps);
  }
}
3323
3324void AMDGPUDAGToDAGISel::SelectINTRINSIC_VOID(SDNode *N) {
3325 unsigned IntrID = N->getConstantOperandVal(1);
3326 switch (IntrID) {
3327 case Intrinsic::amdgcn_ds_gws_init:
3328 case Intrinsic::amdgcn_ds_gws_barrier:
3329 case Intrinsic::amdgcn_ds_gws_sema_v:
3330 case Intrinsic::amdgcn_ds_gws_sema_br:
3331 case Intrinsic::amdgcn_ds_gws_sema_p:
3332 case Intrinsic::amdgcn_ds_gws_sema_release_all:
3333 SelectDS_GWS(N, IntrID);
3334 return;
3335 case Intrinsic::amdgcn_tensor_load_to_lds:
3336 case Intrinsic::amdgcn_tensor_store_from_lds:
3337 SelectTensorLoadStore(N, IntrID);
3338 return;
3339 default:
3340 break;
3341 }
3342
3343 SelectCode(N);
3344}
3345
3346void AMDGPUDAGToDAGISel::SelectWAVE_ADDRESS(SDNode *N) {
3347 SDValue Log2WaveSize =
3348 CurDAG->getTargetConstant(Subtarget->getWavefrontSizeLog2(), SDLoc(N), MVT::i32);
3349 CurDAG->SelectNodeTo(N, AMDGPU::S_LSHR_B32, N->getVTList(),
3350 {N->getOperand(0), Log2WaveSize});
3351}
3352
3353void AMDGPUDAGToDAGISel::SelectSTACKRESTORE(SDNode *N) {
3354 SDValue SrcVal = N->getOperand(1);
3355 if (SrcVal.getValueType() != MVT::i32) {
3356 SelectCode(N); // Emit default error
3357 return;
3358 }
3359
3360 SDValue CopyVal;
3361 Register SP = TLI->getStackPointerRegisterToSaveRestore();
3362 SDLoc SL(N);
3363
3364 if (SrcVal.getOpcode() == AMDGPUISD::WAVE_ADDRESS) {
3365 CopyVal = SrcVal.getOperand(0);
3366 } else {
3367 SDValue Log2WaveSize = CurDAG->getTargetConstant(
3368 Subtarget->getWavefrontSizeLog2(), SL, MVT::i32);
3369
3370 if (N->isDivergent()) {
3371 SrcVal = SDValue(CurDAG->getMachineNode(AMDGPU::V_READFIRSTLANE_B32, SL,
3372 MVT::i32, SrcVal),
3373 0);
3374 }
3375
3376 CopyVal = SDValue(CurDAG->getMachineNode(AMDGPU::S_LSHL_B32, SL, MVT::i32,
3377 {SrcVal, Log2WaveSize}),
3378 0);
3379 }
3380
3381 SDValue CopyToSP = CurDAG->getCopyToReg(N->getOperand(0), SL, SP, CopyVal);
3382 CurDAG->ReplaceAllUsesOfValueWith(SDValue(N, 0), CopyToSP);
3383}
3384
3385bool AMDGPUDAGToDAGISel::SelectVOP3ModsImpl(SDValue In, SDValue &Src,
3386 unsigned &Mods,
3387 bool IsCanonicalizing,
3388 bool AllowAbs) const {
3389 Mods = SISrcMods::NONE;
3390 Src = In;
3391
3392 if (Src.getOpcode() == ISD::FNEG) {
3393 Mods |= SISrcMods::NEG;
3394 Src = Src.getOperand(0);
3395 } else if (Src.getOpcode() == ISD::FSUB && IsCanonicalizing) {
3396 // Fold fsub [+-]0 into fneg. This may not have folded depending on the
3397 // denormal mode, but we're implicitly canonicalizing in a source operand.
3398 auto *LHS = dyn_cast<ConstantFPSDNode>(Src.getOperand(0));
3399 if (LHS && LHS->isZero()) {
3400 Mods |= SISrcMods::NEG;
3401 Src = Src.getOperand(1);
3402 }
3403 }
3404
3405 if (AllowAbs && Src.getOpcode() == ISD::FABS) {
3406 Mods |= SISrcMods::ABS;
3407 Src = Src.getOperand(0);
3408 }
3409
3410 if (Mods != SISrcMods::NONE)
3411 return true;
3412
3413 // Convert various sign-bit masks on integers to src mods. Currently disabled
3414 // for 16-bit types as the codegen replaces the operand without adding a
3415 // srcmod. This is intentionally finding the cases where we are performing
3416 // float neg and abs on int types, the goal is not to obtain two's complement
3417 // neg or abs. Limit converison to select operands via the nonCanonalizing
3418 // pattern.
3419 // TODO: Add 16-bit support.
3420 if (IsCanonicalizing)
3421 return true;
3422
3423 // v2i32 xor/or/and are legal. A vselect using these instructions as operands
3424 // is scalarised into two selects with EXTRACT_VECTOR_ELT operands. Peek
3425 // through the extract to the bitwise op.
3426 SDValue PeekSrc =
3427 Src->getOpcode() == ISD::EXTRACT_VECTOR_ELT ? Src->getOperand(0) : Src;
3428 // Convert various sign-bit masks to src mods. Currently disabled for 16-bit
3429 // types as the codegen replaces the operand without adding a srcmod.
3430 // This is intentionally finding the cases where we are performing float neg
3431 // and abs on int types, the goal is not to obtain two's complement neg or
3432 // abs.
3433 // TODO: Add 16-bit support.
3434 unsigned Opc = PeekSrc.getOpcode();
3435 EVT VT = Src.getValueType();
3436 if ((Opc != ISD::AND && Opc != ISD::OR && Opc != ISD::XOR) ||
3437 (VT != MVT::i32 && VT != MVT::v2i32 && VT != MVT::i64))
3438 return true;
3439
3440 ConstantSDNode *CRHS = isConstOrConstSplat(PeekSrc->getOperand(1));
3441 if (!CRHS)
3442 return true;
3443
3444 auto ReplaceSrc = [&]() -> SDValue {
3445 if (Src->getOpcode() != ISD::EXTRACT_VECTOR_ELT)
3446 return Src.getOperand(0);
3447
3448 SDValue LHS = PeekSrc->getOperand(0);
3449 SDValue Index = Src->getOperand(1);
3450 return CurDAG->getNode(ISD::EXTRACT_VECTOR_ELT, SDLoc(Src),
3451 Src.getValueType(), LHS, Index);
3452 };
3453
3454 // Recognise Srcmods:
3455 // (xor a, 0x80000000) or v2i32 (xor a, {0x80000000,0x80000000}) as NEG.
3456 // (and a, 0x7fffffff) or v2i32 (and a, {0x7fffffff,0x7fffffff}) as ABS.
3457 // (or a, 0x80000000) or v2i32 (or a, {0x80000000,0x80000000}) as NEG+ABS
3458 // SrcModifiers.
3459 if (Opc == ISD::XOR && CRHS->getAPIntValue().isSignMask()) {
3460 Mods |= SISrcMods::NEG;
3461 Src = ReplaceSrc();
3462 } else if (Opc == ISD::AND && AllowAbs &&
3463 CRHS->getAPIntValue().isMaxSignedValue()) {
3464 Mods |= SISrcMods::ABS;
3465 Src = ReplaceSrc();
3466 } else if (Opc == ISD::OR && AllowAbs && CRHS->getAPIntValue().isSignMask()) {
3468 Src = ReplaceSrc();
3469 }
3470
3471 return true;
3472}
3473
3474bool AMDGPUDAGToDAGISel::SelectVOP3Mods(SDValue In, SDValue &Src,
3475 SDValue &SrcMods) const {
3476 unsigned Mods;
3477 if (SelectVOP3ModsImpl(In, Src, Mods, /*IsCanonicalizing=*/true,
3478 /*AllowAbs=*/true)) {
3479 SrcMods = CurDAG->getTargetConstant(Mods, SDLoc(In), MVT::i32);
3480 return true;
3481 }
3482
3483 return false;
3484}
3485
3486bool AMDGPUDAGToDAGISel::SelectVOP3ModsNonCanonicalizing(
3487 SDValue In, SDValue &Src, SDValue &SrcMods) const {
3488 unsigned Mods;
3489 if (SelectVOP3ModsImpl(In, Src, Mods, /*IsCanonicalizing=*/false,
3490 /*AllowAbs=*/true)) {
3491 SrcMods = CurDAG->getTargetConstant(Mods, SDLoc(In), MVT::i32);
3492 return true;
3493 }
3494
3495 return false;
3496}
3497
3498bool AMDGPUDAGToDAGISel::SelectVOP3BMods(SDValue In, SDValue &Src,
3499 SDValue &SrcMods) const {
3500 unsigned Mods;
3501 if (SelectVOP3ModsImpl(In, Src, Mods,
3502 /*IsCanonicalizing=*/true,
3503 /*AllowAbs=*/false)) {
3504 SrcMods = CurDAG->getTargetConstant(Mods, SDLoc(In), MVT::i32);
3505 return true;
3506 }
3507
3508 return false;
3509}
3510
3511bool AMDGPUDAGToDAGISel::SelectVOP3NoMods(SDValue In, SDValue &Src) const {
3512 if (In.getOpcode() == ISD::FABS || In.getOpcode() == ISD::FNEG)
3513 return false;
3514
3515 Src = In;
3516 return true;
3517}
3518
3519bool AMDGPUDAGToDAGISel::SelectVINTERPModsImpl(SDValue In, SDValue &Src,
3520 SDValue &SrcMods,
3521 bool OpSel) const {
3522 unsigned Mods;
3523 if (SelectVOP3ModsImpl(In, Src, Mods,
3524 /*IsCanonicalizing=*/true,
3525 /*AllowAbs=*/false)) {
3526 if (OpSel)
3527 Mods |= SISrcMods::OP_SEL_0;
3528 SrcMods = CurDAG->getTargetConstant(Mods, SDLoc(In), MVT::i32);
3529 return true;
3530 }
3531
3532 return false;
3533}
3534
3535bool AMDGPUDAGToDAGISel::SelectVINTERPMods(SDValue In, SDValue &Src,
3536 SDValue &SrcMods) const {
3537 return SelectVINTERPModsImpl(In, Src, SrcMods, /* OpSel */ false);
3538}
3539
3540bool AMDGPUDAGToDAGISel::SelectVINTERPModsHi(SDValue In, SDValue &Src,
3541 SDValue &SrcMods) const {
3542 return SelectVINTERPModsImpl(In, Src, SrcMods, /* OpSel */ true);
3543}
3544
3545bool AMDGPUDAGToDAGISel::SelectVOP3Mods0(SDValue In, SDValue &Src,
3546 SDValue &SrcMods, SDValue &Clamp,
3547 SDValue &Omod) const {
3548 SDLoc DL(In);
3549 Clamp = CurDAG->getTargetConstant(0, DL, MVT::i1);
3550 Omod = CurDAG->getTargetConstant(0, DL, MVT::i1);
3551
3552 return SelectVOP3Mods(In, Src, SrcMods);
3553}
3554
3555bool AMDGPUDAGToDAGISel::SelectVOP3BMods0(SDValue In, SDValue &Src,
3556 SDValue &SrcMods, SDValue &Clamp,
3557 SDValue &Omod) const {
3558 SDLoc DL(In);
3559 Clamp = CurDAG->getTargetConstant(0, DL, MVT::i1);
3560 Omod = CurDAG->getTargetConstant(0, DL, MVT::i1);
3561
3562 return SelectVOP3BMods(In, Src, SrcMods);
3563}
3564
3565bool AMDGPUDAGToDAGISel::SelectVOP3OMods(SDValue In, SDValue &Src,
3566 SDValue &Clamp, SDValue &Omod) const {
3567 Src = In;
3568
3569 SDLoc DL(In);
3570 Clamp = CurDAG->getTargetConstant(0, DL, MVT::i1);
3571 Omod = CurDAG->getTargetConstant(0, DL, MVT::i1);
3572
3573 return true;
3574}
3575
3576bool AMDGPUDAGToDAGISel::SelectVOP3PMods(SDValue In, SDValue &Src,
3577 SDValue &SrcMods, bool IsDOT) const {
3578 unsigned Mods = SISrcMods::NONE;
3579 Src = In;
3580
3581 // TODO: Handle G_FSUB 0 as fneg
3582 if (Src.getOpcode() == ISD::FNEG) {
3584 Src = Src.getOperand(0);
3585 }
3586
3587 if (Src.getOpcode() == ISD::BUILD_VECTOR && Src.getNumOperands() == 2 &&
3588 (!IsDOT || !Subtarget->hasDOTOpSelHazard())) {
3589 unsigned VecMods = Mods;
3590
3591 SDValue Lo = stripBitcast(Src.getOperand(0));
3592 SDValue Hi = stripBitcast(Src.getOperand(1));
3593
3594 if (Lo.getOpcode() == ISD::FNEG) {
3595 Lo = stripBitcast(Lo.getOperand(0));
3596 Mods ^= SISrcMods::NEG;
3597 }
3598
3599 if (Hi.getOpcode() == ISD::FNEG) {
3600 Hi = stripBitcast(Hi.getOperand(0));
3601 Mods ^= SISrcMods::NEG_HI;
3602 }
3603
3604 if (isExtractHiElt(Lo, Lo))
3605 Mods |= SISrcMods::OP_SEL_0;
3606
3607 if (isExtractHiElt(Hi, Hi))
3608 Mods |= SISrcMods::OP_SEL_1;
3609
3610 unsigned VecSize = Src.getValueSizeInBits();
3611 Lo = stripExtractLoElt(Lo);
3612 Hi = stripExtractLoElt(Hi);
3613
3614 if (Lo.getValueSizeInBits() > VecSize) {
3615 Lo = CurDAG->getTargetExtractSubreg(
3616 (VecSize > 32) ? AMDGPU::sub0_sub1 : AMDGPU::sub0, SDLoc(In),
3617 MVT::getIntegerVT(VecSize), Lo);
3618 }
3619
3620 if (Hi.getValueSizeInBits() > VecSize) {
3621 Hi = CurDAG->getTargetExtractSubreg(
3622 (VecSize > 32) ? AMDGPU::sub0_sub1 : AMDGPU::sub0, SDLoc(In),
3623 MVT::getIntegerVT(VecSize), Hi);
3624 }
3625
3626 assert(Lo.getValueSizeInBits() <= VecSize &&
3627 Hi.getValueSizeInBits() <= VecSize);
3628
3629 if (Lo == Hi && !isInlineImmediate(Lo.getNode())) {
3630 // Really a scalar input. Just select from the low half of the register to
3631 // avoid packing.
3632
3633 if (VecSize == Lo.getValueSizeInBits()) {
3634 Src = Lo;
3635 } else if (VecSize == 32) {
3636 Src = createVOP3PSrc32FromLo16(Lo, Src, CurDAG, Subtarget);
3637 } else {
3638 assert(Lo.getValueSizeInBits() == 32 && VecSize == 64);
3639
3640 SDLoc SL(In);
3642 CurDAG->getMachineNode(TargetOpcode::IMPLICIT_DEF, SL,
3643 Lo.getValueType()), 0);
3644 auto RC = Lo->isDivergent() ? AMDGPU::VReg_64RegClassID
3645 : AMDGPU::SReg_64RegClassID;
3646 const SDValue Ops[] = {
3647 CurDAG->getTargetConstant(RC, SL, MVT::i32),
3648 Lo, CurDAG->getTargetConstant(AMDGPU::sub0, SL, MVT::i32),
3649 Undef, CurDAG->getTargetConstant(AMDGPU::sub1, SL, MVT::i32) };
3650
3651 Src = SDValue(CurDAG->getMachineNode(TargetOpcode::REG_SEQUENCE, SL,
3652 Src.getValueType(), Ops), 0);
3653 }
3654 SrcMods = CurDAG->getTargetConstant(Mods, SDLoc(In), MVT::i32);
3655 return true;
3656 }
3657
3658 if (VecSize == 64 && Lo == Hi && isa<ConstantFPSDNode>(Lo)) {
3659 uint64_t Lit = cast<ConstantFPSDNode>(Lo)->getValueAPF()
3660 .bitcastToAPInt().getZExtValue();
3661 if (AMDGPU::isInlinableLiteral32(Lit, Subtarget->hasInv2PiInlineImm())) {
3662 Src = CurDAG->getTargetConstant(Lit, SDLoc(In), MVT::i64);
3663 SrcMods = CurDAG->getTargetConstant(Mods, SDLoc(In), MVT::i32);
3664 return true;
3665 }
3666 }
3667
3668 Mods = VecMods;
3669 } else if (Src.getOpcode() == ISD::VECTOR_SHUFFLE &&
3670 Src.getNumOperands() == 2) {
3671
3672 // TODO: We should repeat the build_vector source check above for the
3673 // vector_shuffle for negates and casts of individual elements.
3674
3675 auto *SVN = cast<ShuffleVectorSDNode>(Src);
3676 ArrayRef<int> Mask = SVN->getMask();
3677
3678 if (Mask[0] < 2 && Mask[1] < 2) {
3679 // src1 should be undef.
3680 SDValue ShuffleSrc = SVN->getOperand(0);
3681
3682 if (ShuffleSrc.getOpcode() == ISD::FNEG) {
3683 ShuffleSrc = ShuffleSrc.getOperand(0);
3685 }
3686
3687 if (Mask[0] == 1)
3688 Mods |= SISrcMods::OP_SEL_0;
3689 if (Mask[1] == 1)
3690 Mods |= SISrcMods::OP_SEL_1;
3691
3692 Src = ShuffleSrc;
3693 SrcMods = CurDAG->getTargetConstant(Mods, SDLoc(In), MVT::i32);
3694 return true;
3695 }
3696 }
3697
3698 // Packed instructions do not have abs modifiers.
3699 Mods |= SISrcMods::OP_SEL_1;
3700
3701 SrcMods = CurDAG->getTargetConstant(Mods, SDLoc(In), MVT::i32);
3702 return true;
3703}
3704
3705bool AMDGPUDAGToDAGISel::SelectVOP3PModsDOT(SDValue In, SDValue &Src,
3706 SDValue &SrcMods) const {
3707 return SelectVOP3PMods(In, Src, SrcMods, true);
3708}
3709
3710bool AMDGPUDAGToDAGISel::SelectVOP3PNoModsDOT(SDValue In, SDValue &Src) const {
3711 SDValue SrcTmp, SrcModsTmp;
3712 SelectVOP3PMods(In, SrcTmp, SrcModsTmp, true);
3713 if (cast<ConstantSDNode>(SrcModsTmp)->getZExtValue() == SISrcMods::OP_SEL_1) {
3714 Src = SrcTmp;
3715 return true;
3716 }
3717
3718 return false;
3719}
3720
3721bool AMDGPUDAGToDAGISel::SelectVOP3PModsF32(SDValue In, SDValue &Src,
3722 SDValue &SrcMods) const {
3723 SelectVOP3Mods(In, Src, SrcMods);
3724 unsigned Mods = SISrcMods::OP_SEL_1;
3725 Mods |= cast<ConstantSDNode>(SrcMods)->getZExtValue();
3726 SrcMods = CurDAG->getTargetConstant(Mods, SDLoc(In), MVT::i32);
3727 return true;
3728}
3729
3730bool AMDGPUDAGToDAGISel::SelectVOP3PNoModsF32(SDValue In, SDValue &Src) const {
3731 SDValue SrcTmp, SrcModsTmp;
3732 SelectVOP3PModsF32(In, SrcTmp, SrcModsTmp);
3733 if (cast<ConstantSDNode>(SrcModsTmp)->getZExtValue() == SISrcMods::OP_SEL_1) {
3734 Src = SrcTmp;
3735 return true;
3736 }
3737
3738 return false;
3739}
3740
3741bool AMDGPUDAGToDAGISel::SelectWMMAOpSelVOP3PMods(SDValue In,
3742 SDValue &Src) const {
3743 const ConstantSDNode *C = cast<ConstantSDNode>(In);
3744 assert(C->getAPIntValue().getBitWidth() == 1 && "expected i1 value");
3745
3746 unsigned Mods = SISrcMods::OP_SEL_1;
3747 unsigned SrcVal = C->getZExtValue();
3748 if (SrcVal == 1)
3749 Mods |= SISrcMods::OP_SEL_0;
3750
3751 Src = CurDAG->getTargetConstant(Mods, SDLoc(In), MVT::i32);
3752 return true;
3753}
3754
3756AMDGPUDAGToDAGISel::buildRegSequence32(SmallVectorImpl<SDValue> &Elts,
3757 const SDLoc &DL) const {
3758 unsigned DstRegClass;
3759 EVT DstTy;
3760 switch (Elts.size()) {
3761 case 8:
3762 DstRegClass = AMDGPU::VReg_256RegClassID;
3763 DstTy = MVT::v8i32;
3764 break;
3765 case 4:
3766 DstRegClass = AMDGPU::VReg_128RegClassID;
3767 DstTy = MVT::v4i32;
3768 break;
3769 case 2:
3770 DstRegClass = AMDGPU::VReg_64RegClassID;
3771 DstTy = MVT::v2i32;
3772 break;
3773 default:
3774 llvm_unreachable("unhandled Reg sequence size");
3775 }
3776
3778 Ops.push_back(CurDAG->getTargetConstant(DstRegClass, DL, MVT::i32));
3779 for (unsigned i = 0; i < Elts.size(); ++i) {
3780 Ops.push_back(Elts[i]);
3781 Ops.push_back(CurDAG->getTargetConstant(
3783 }
3784 return CurDAG->getMachineNode(TargetOpcode::REG_SEQUENCE, DL, DstTy, Ops);
3785}
3786
3788AMDGPUDAGToDAGISel::buildRegSequence16(SmallVectorImpl<SDValue> &Elts,
3789 const SDLoc &DL) const {
3790 SmallVector<SDValue, 8> PackedElts;
3791 assert("unhandled Reg sequence size" &&
3792 (Elts.size() == 8 || Elts.size() == 16));
3793
3794 // Pack 16-bit elements in pairs into 32-bit register. If both elements are
3795 // unpacked from 32-bit source use it, otherwise pack them using v_perm.
3796 for (unsigned i = 0; i < Elts.size(); i += 2) {
3797 SDValue LoSrc = stripExtractLoElt(stripBitcast(Elts[i]));
3798 SDValue HiSrc;
3799 if (isExtractHiElt(Elts[i + 1], HiSrc) && LoSrc == HiSrc) {
3800 PackedElts.push_back(HiSrc);
3801 } else {
3802 if (Subtarget->useRealTrue16Insts()) {
3803 // FIXME-TRUE16. For now pack VGPR_32 for 16-bit source before
3804 // passing to v_perm_b32. Eventually we should use replace v_perm_b32
3805 // by reg_sequence.
3807 CurDAG->getMachineNode(TargetOpcode::IMPLICIT_DEF, DL, MVT::i16),
3808 0);
3809 Elts[i] =
3810 emitRegSequence(*CurDAG, AMDGPU::VGPR_32RegClassID, MVT::i32,
3811 {Elts[i], Undef}, {AMDGPU::lo16, AMDGPU::hi16}, DL);
3812 Elts[i + 1] = emitRegSequence(*CurDAG, AMDGPU::VGPR_32RegClassID,
3813 MVT::i32, {Elts[i + 1], Undef},
3814 {AMDGPU::lo16, AMDGPU::hi16}, DL);
3815 }
3816 SDValue PackLoLo = CurDAG->getTargetConstant(0x05040100, DL, MVT::i32);
3817 MachineSDNode *Packed =
3818 CurDAG->getMachineNode(AMDGPU::V_PERM_B32_e64, DL, MVT::i32,
3819 {Elts[i + 1], Elts[i], PackLoLo});
3820 PackedElts.push_back(SDValue(Packed, 0));
3821 }
3822 }
3823 return buildRegSequence32(PackedElts, DL);
3824}
3825
3827AMDGPUDAGToDAGISel::buildRegSequence(SmallVectorImpl<SDValue> &Elts,
3828 const SDLoc &DL,
3829 unsigned ElementSize) const {
3830 if (ElementSize == 16)
3831 return buildRegSequence16(Elts, DL);
3832 if (ElementSize == 32)
3833 return buildRegSequence32(Elts, DL);
3834 llvm_unreachable("Unhandled element size");
3835}
3836
3837void AMDGPUDAGToDAGISel::selectWMMAModsNegAbs(unsigned ModOpcode,
3838 unsigned &Mods,
3840 SDValue &Src, const SDLoc &DL,
3841 unsigned ElementSize) const {
3842 if (ModOpcode == ISD::FNEG) {
3843 Mods |= SISrcMods::NEG;
3844 // Check if all elements also have abs modifier
3845 SmallVector<SDValue, 8> NegAbsElts;
3846 for (auto El : Elts) {
3847 if (El.getOpcode() != ISD::FABS)
3848 break;
3849 NegAbsElts.push_back(El->getOperand(0));
3850 }
3851 if (Elts.size() != NegAbsElts.size()) {
3852 // Neg
3853 Src = SDValue(buildRegSequence(Elts, DL, ElementSize), 0);
3854 } else {
3855 // Neg and Abs
3856 Mods |= SISrcMods::NEG_HI;
3857 Src = SDValue(buildRegSequence(NegAbsElts, DL, ElementSize), 0);
3858 }
3859 } else {
3860 assert(ModOpcode == ISD::FABS);
3861 // Abs
3862 Mods |= SISrcMods::NEG_HI;
3863 Src = SDValue(buildRegSequence(Elts, DL, ElementSize), 0);
3864 }
3865}
3866
3867// Check all f16 elements for modifiers while looking through b32 and v2b16
3868// build vector, stop if element does not satisfy ModifierCheck.
3869static void
3871 std::function<bool(SDValue)> ModifierCheck) {
3872 for (unsigned i = 0; i < BV->getNumOperands(); ++i) {
3873 if (auto *F16Pair =
3874 dyn_cast<BuildVectorSDNode>(stripBitcast(BV->getOperand(i)))) {
3875 for (unsigned i = 0; i < F16Pair->getNumOperands(); ++i) {
3876 SDValue ElF16 = stripBitcast(F16Pair->getOperand(i));
3877 if (!ModifierCheck(ElF16))
3878 break;
3879 }
3880 }
3881 }
3882}
3883
3884bool AMDGPUDAGToDAGISel::SelectWMMAModsF16Neg(SDValue In, SDValue &Src,
3885 SDValue &SrcMods) const {
3886 Src = In;
3887 unsigned Mods = SISrcMods::OP_SEL_1;
3888
3889 // mods are on f16 elements
3890 if (auto *BV = dyn_cast<BuildVectorSDNode>(stripBitcast(In))) {
3892
3893 checkWMMAElementsModifiersF16(BV, [&](SDValue Element) -> bool {
3894 if (Element.getOpcode() != ISD::FNEG)
3895 return false;
3896 EltsF16.push_back(Element.getOperand(0));
3897 return true;
3898 });
3899
3900 // All elements have neg modifier
3901 if (BV->getNumOperands() * 2 == EltsF16.size()) {
3902 Src = SDValue(buildRegSequence16(EltsF16, SDLoc(In)), 0);
3903 Mods |= SISrcMods::NEG;
3904 Mods |= SISrcMods::NEG_HI;
3905 }
3906 }
3907
3908 // mods are on v2f16 elements
3909 if (auto *BV = dyn_cast<BuildVectorSDNode>(stripBitcast(In))) {
3910 SmallVector<SDValue, 8> EltsV2F16;
3911 for (unsigned i = 0; i < BV->getNumOperands(); ++i) {
3912 SDValue ElV2f16 = stripBitcast(BV->getOperand(i));
3913 // Based on first element decide which mod we match, neg or abs
3914 if (ElV2f16.getOpcode() != ISD::FNEG)
3915 break;
3916 EltsV2F16.push_back(ElV2f16.getOperand(0));
3917 }
3918
3919 // All pairs of elements have neg modifier
3920 if (BV->getNumOperands() == EltsV2F16.size()) {
3921 Src = SDValue(buildRegSequence32(EltsV2F16, SDLoc(In)), 0);
3922 Mods |= SISrcMods::NEG;
3923 Mods |= SISrcMods::NEG_HI;
3924 }
3925 }
3926
3927 SrcMods = CurDAG->getTargetConstant(Mods, SDLoc(In), MVT::i32);
3928 return true;
3929}
3930
3931bool AMDGPUDAGToDAGISel::SelectWMMAModsF16NegAbs(SDValue In, SDValue &Src,
3932 SDValue &SrcMods) const {
3933 Src = In;
3934 unsigned Mods = SISrcMods::OP_SEL_1;
3935 unsigned ModOpcode;
3936
3937 // mods are on f16 elements
3938 if (auto *BV = dyn_cast<BuildVectorSDNode>(stripBitcast(In))) {
3940 checkWMMAElementsModifiersF16(BV, [&](SDValue ElF16) -> bool {
3941 // Based on first element decide which mod we match, neg or abs
3942 if (EltsF16.empty())
3943 ModOpcode = (ElF16.getOpcode() == ISD::FNEG) ? ISD::FNEG : ISD::FABS;
3944 if (ElF16.getOpcode() != ModOpcode)
3945 return false;
3946 EltsF16.push_back(ElF16.getOperand(0));
3947 return true;
3948 });
3949
3950 // All elements have ModOpcode modifier
3951 if (BV->getNumOperands() * 2 == EltsF16.size())
3952 selectWMMAModsNegAbs(ModOpcode, Mods, EltsF16, Src, SDLoc(In), 16);
3953 }
3954
3955 // mods are on v2f16 elements
3956 if (auto *BV = dyn_cast<BuildVectorSDNode>(stripBitcast(In))) {
3957 SmallVector<SDValue, 8> EltsV2F16;
3958
3959 for (unsigned i = 0; i < BV->getNumOperands(); ++i) {
3960 SDValue ElV2f16 = stripBitcast(BV->getOperand(i));
3961 // Based on first element decide which mod we match, neg or abs
3962 if (EltsV2F16.empty())
3963 ModOpcode = (ElV2f16.getOpcode() == ISD::FNEG) ? ISD::FNEG : ISD::FABS;
3964 if (ElV2f16->getOpcode() != ModOpcode)
3965 break;
3966 EltsV2F16.push_back(ElV2f16->getOperand(0));
3967 }
3968
3969 // All elements have ModOpcode modifier
3970 if (BV->getNumOperands() == EltsV2F16.size())
3971 selectWMMAModsNegAbs(ModOpcode, Mods, EltsV2F16, Src, SDLoc(In), 32);
3972 }
3973
3974 SrcMods = CurDAG->getTargetConstant(Mods, SDLoc(In), MVT::i32);
3975 return true;
3976}
3977
3978bool AMDGPUDAGToDAGISel::SelectWMMAModsF32NegAbs(SDValue In, SDValue &Src,
3979 SDValue &SrcMods) const {
3980 Src = In;
3981 unsigned Mods = SISrcMods::OP_SEL_1;
3983
3984 if (auto *BV = dyn_cast<BuildVectorSDNode>(stripBitcast(In))) {
3985 assert(BV->getNumOperands() > 0);
3986 // Based on first element decide which mod we match, neg or abs
3987 SDValue ElF32 = stripBitcast(BV->getOperand(0));
3988 unsigned ModOpcode =
3989 (ElF32.getOpcode() == ISD::FNEG) ? ISD::FNEG : ISD::FABS;
3990 for (unsigned i = 0; i < BV->getNumOperands(); ++i) {
3991 SDValue ElF32 = stripBitcast(BV->getOperand(i));
3992 if (ElF32.getOpcode() != ModOpcode)
3993 break;
3994 EltsF32.push_back(ElF32.getOperand(0));
3995 }
3996
3997 // All elements had ModOpcode modifier
3998 if (BV->getNumOperands() == EltsF32.size())
3999 selectWMMAModsNegAbs(ModOpcode, Mods, EltsF32, Src, SDLoc(In), 32);
4000 }
4001
4002 SrcMods = CurDAG->getTargetConstant(Mods, SDLoc(In), MVT::i32);
4003 return true;
4004}
4005
4006bool AMDGPUDAGToDAGISel::SelectWMMAVISrc(SDValue In, SDValue &Src) const {
4007 if (auto *BV = dyn_cast<BuildVectorSDNode>(In)) {
4008 BitVector UndefElements;
4009 if (SDValue Splat = BV->getSplatValue(&UndefElements))
4010 if (isInlineImmediate(Splat.getNode())) {
4011 if (const ConstantSDNode *C = dyn_cast<ConstantSDNode>(Splat)) {
4012 unsigned Imm = C->getAPIntValue().getSExtValue();
4013 Src = CurDAG->getTargetConstant(Imm, SDLoc(In), MVT::i32);
4014 return true;
4015 }
4016 if (const ConstantFPSDNode *C = dyn_cast<ConstantFPSDNode>(Splat)) {
4017 unsigned Imm = C->getValueAPF().bitcastToAPInt().getSExtValue();
4018 Src = CurDAG->getTargetConstant(Imm, SDLoc(In), MVT::i32);
4019 return true;
4020 }
4021 llvm_unreachable("unhandled Constant node");
4022 }
4023 }
4024
4025 // 16 bit splat
4026 SDValue SplatSrc32 = stripBitcast(In);
4027 if (auto *SplatSrc32BV = dyn_cast<BuildVectorSDNode>(SplatSrc32))
4028 if (SDValue Splat32 = SplatSrc32BV->getSplatValue()) {
4029 SDValue SplatSrc16 = stripBitcast(Splat32);
4030 if (auto *SplatSrc16BV = dyn_cast<BuildVectorSDNode>(SplatSrc16))
4031 if (SDValue Splat = SplatSrc16BV->getSplatValue()) {
4032 const SIInstrInfo *TII = Subtarget->getInstrInfo();
4033 std::optional<APInt> RawValue;
4034 if (const ConstantFPSDNode *C = dyn_cast<ConstantFPSDNode>(Splat))
4035 RawValue = C->getValueAPF().bitcastToAPInt();
4036 else if (const ConstantSDNode *C = dyn_cast<ConstantSDNode>(Splat))
4037 RawValue = C->getAPIntValue();
4038
4039 if (RawValue.has_value()) {
4040 EVT VT = In.getValueType().getScalarType();
4041 if (VT.getSimpleVT() == MVT::f16 || VT.getSimpleVT() == MVT::bf16) {
4042 APFloat FloatVal(VT.getSimpleVT() == MVT::f16
4045 RawValue.value());
4046 if (TII->isInlineConstant(FloatVal)) {
4047 Src = CurDAG->getTargetConstant(RawValue.value(), SDLoc(In),
4048 MVT::i16);
4049 return true;
4050 }
4051 } else if (VT.getSimpleVT() == MVT::i16) {
4052 if (TII->isInlineConstant(RawValue.value())) {
4053 Src = CurDAG->getTargetConstant(RawValue.value(), SDLoc(In),
4054 MVT::i16);
4055 return true;
4056 }
4057 } else
4058 llvm_unreachable("unknown 16-bit type");
4059 }
4060 }
4061 }
4062
4063 return false;
4064}
4065
4066bool AMDGPUDAGToDAGISel::SelectSWMMACIndex8(SDValue In, SDValue &Src,
4067 SDValue &IndexKey) const {
4068 unsigned Key = 0;
4069 Src = In;
4070
4071 if (In.getOpcode() == ISD::SRL) {
4072 const llvm::SDValue &ShiftSrc = In.getOperand(0);
4073 ConstantSDNode *ShiftAmt = dyn_cast<ConstantSDNode>(In.getOperand(1));
4074 if (ShiftSrc.getValueType().getSizeInBits() == 32 && ShiftAmt &&
4075 ShiftAmt->getZExtValue() % 8 == 0) {
4076 Key = ShiftAmt->getZExtValue() / 8;
4077 Src = ShiftSrc;
4078 }
4079 }
4080
4081 IndexKey = CurDAG->getTargetConstant(Key, SDLoc(In), MVT::i32);
4082 return true;
4083}
4084
4085bool AMDGPUDAGToDAGISel::SelectSWMMACIndex16(SDValue In, SDValue &Src,
4086 SDValue &IndexKey) const {
4087 unsigned Key = 0;
4088 Src = In;
4089
4090 if (In.getOpcode() == ISD::SRL) {
4091 const llvm::SDValue &ShiftSrc = In.getOperand(0);
4092 ConstantSDNode *ShiftAmt = dyn_cast<ConstantSDNode>(In.getOperand(1));
4093 if (ShiftSrc.getValueType().getSizeInBits() == 32 && ShiftAmt &&
4094 ShiftAmt->getZExtValue() == 16) {
4095 Key = 1;
4096 Src = ShiftSrc;
4097 }
4098 }
4099
4100 IndexKey = CurDAG->getTargetConstant(Key, SDLoc(In), MVT::i32);
4101 return true;
4102}
4103
4104bool AMDGPUDAGToDAGISel::SelectSWMMACIndex32(SDValue In, SDValue &Src,
4105 SDValue &IndexKey) const {
4106 unsigned Key = 0;
4107 Src = In;
4108
4109 SDValue InI32;
4110
4111 if (In.getOpcode() == ISD::ANY_EXTEND || In.getOpcode() == ISD::ZERO_EXTEND) {
4112 const SDValue &ExtendSrc = In.getOperand(0);
4113 if (ExtendSrc.getValueSizeInBits() == 32)
4114 InI32 = ExtendSrc;
4115 } else if (In->getOpcode() == ISD::BITCAST) {
4116 const SDValue &CastSrc = In.getOperand(0);
4117 if (CastSrc.getOpcode() == ISD::BUILD_VECTOR &&
4118 CastSrc.getOperand(0).getValueSizeInBits() == 32) {
4119 ConstantSDNode *Zero = dyn_cast<ConstantSDNode>(CastSrc.getOperand(1));
4120 if (Zero && Zero->getZExtValue() == 0)
4121 InI32 = CastSrc.getOperand(0);
4122 }
4123 }
4124
4125 if (InI32 && InI32.getOpcode() == ISD::EXTRACT_VECTOR_ELT) {
4126 const SDValue &ExtractVecEltSrc = InI32.getOperand(0);
4127 ConstantSDNode *EltIdx = dyn_cast<ConstantSDNode>(InI32.getOperand(1));
4128 if (ExtractVecEltSrc.getValueSizeInBits() == 64 && EltIdx &&
4129 EltIdx->getZExtValue() == 1) {
4130 Key = 1;
4131 Src = ExtractVecEltSrc;
4132 }
4133 }
4134
4135 IndexKey = CurDAG->getTargetConstant(Key, SDLoc(In), MVT::i32);
4136 return true;
4137}
4138
4139bool AMDGPUDAGToDAGISel::SelectVOP3OpSel(SDValue In, SDValue &Src,
4140 SDValue &SrcMods) const {
4141 Src = In;
4142 // FIXME: Handle op_sel
4143 SrcMods = CurDAG->getTargetConstant(0, SDLoc(In), MVT::i32);
4144 return true;
4145}
4146
4147bool AMDGPUDAGToDAGISel::SelectVOP3OpSelMods(SDValue In, SDValue &Src,
4148 SDValue &SrcMods) const {
4149 // FIXME: Handle op_sel
4150 return SelectVOP3Mods(In, Src, SrcMods);
4151}
4152
4153// Match lowered fpext from bf16 to f32. This is a bit operation extending
4154// a 16-bit value with 16-bit of zeroes at LSB:
4155//
4156// 1. (f32 (bitcast (build_vector (i16 0), (i16 (bitcast bf16:val)))))
4157// 2. (f32 (bitcast (and i32:val, 0xffff0000))) -> IsExtractHigh = true
4158// 3. (f32 (bitcast (shl i32:va, 16) -> IsExtractHigh = false
4159static SDValue matchBF16FPExtendLike(SDValue Op, bool &IsExtractHigh) {
4160 if (Op.getValueType() != MVT::f32 || Op.getOpcode() != ISD::BITCAST)
4161 return SDValue();
4162 Op = Op.getOperand(0);
4163
4164 IsExtractHigh = false;
4165 if (Op.getValueType() == MVT::v2i16 && Op.getOpcode() == ISD::BUILD_VECTOR) {
4166 auto Low16 = dyn_cast<ConstantSDNode>(Op.getOperand(0));
4167 if (!Low16 || !Low16->isZero())
4168 return SDValue();
4169 Op = stripBitcast(Op.getOperand(1));
4170 if (Op.getValueType() != MVT::bf16)
4171 return SDValue();
4172 return Op;
4173 }
4174
4175 if (Op.getValueType() != MVT::i32)
4176 return SDValue();
4177
4178 if (Op.getOpcode() == ISD::AND) {
4179 if (auto Mask = dyn_cast<ConstantSDNode>(Op.getOperand(1))) {
4180 if (Mask->getZExtValue() == 0xffff0000) {
4181 IsExtractHigh = true;
4182 return Op.getOperand(0);
4183 }
4184 }
4185 return SDValue();
4186 }
4187
4188 if (Op.getOpcode() == ISD::SHL) {
4189 if (auto Amt = dyn_cast<ConstantSDNode>(Op.getOperand(1))) {
4190 if (Amt->getZExtValue() == 16)
4191 return Op.getOperand(0);
4192 }
4193 }
4194
4195 return SDValue();
4196}
4197
4198// The return value is not whether the match is possible (which it always is),
4199// but whether or not it a conversion is really used.
// Fold source modifiers and a 16->32-bit extension into a mad-mix operand.
// Returns true only when a conversion (fp_extend, or for bf16 a lowered
// bit-pattern extend) is actually folded; Mods receives the combined
// NEG/ABS/OP_SEL bits. VT selects whether f16 or bf16 sources are matched.
bool AMDGPUDAGToDAGISel::SelectVOP3PMadMixModsImpl(SDValue In, SDValue &Src,
                                                   unsigned &Mods,
                                                   MVT VT) const {
  Mods = 0;
  // First peel any outer fneg/fabs into Mods.
  SelectVOP3ModsImpl(In, Src, Mods);

  bool IsExtractHigh = false;
  if (Src.getOpcode() == ISD::FP_EXTEND) {
    Src = Src.getOperand(0);
  } else if (VT == MVT::bf16) {
    // bf16 extends are lowered to integer bit manipulation; recognize them.
    SDValue B16 = matchBF16FPExtendLike(Src, IsExtractHigh);
    if (!B16)
      return false;
    Src = B16;
  } else
    return false;

  // The extended value must have the requested type (bf16 additionally
  // allows the i32 payload produced by matchBF16FPExtendLike).
  if (Src.getValueType() != VT &&
      (VT != MVT::bf16 || Src.getValueType() != MVT::i32))
    return false;

  Src = stripBitcast(Src);

  // Be careful about folding modifiers if we already have an abs. fneg is
  // applied last, so we don't want to apply an earlier fneg.
  if ((Mods & SISrcMods::ABS) == 0) {
    unsigned ModsTmp;
    SelectVOP3ModsImpl(Src, Src, ModsTmp);

    // Inner neg toggles (two negs cancel); inner abs simply adds.
    if ((ModsTmp & SISrcMods::NEG) != 0)
      Mods ^= SISrcMods::NEG;

    if ((ModsTmp & SISrcMods::ABS) != 0)
      Mods |= SISrcMods::ABS;
  }

  // op_sel/op_sel_hi decide the source type and source.
  // If the source's op_sel_hi is set, it indicates to do a conversion from
  // fp16. If the sources's op_sel is set, it picks the high half of the source
  // register.

  Mods |= SISrcMods::OP_SEL_1;
  if (Src.getValueSizeInBits() == 16) {
    if (isExtractHiElt(Src, Src)) {
      // 16-bit value lives in the high half of its 32-bit container.
      Mods |= SISrcMods::OP_SEL_0;

      // TODO: Should we try to look for neg/abs here?
      return true;
    }

    // A truncate of an i32 can use the i32 directly (low half selected).
    if (Src.getOpcode() == ISD::TRUNCATE &&
        Src.getOperand(0).getValueType() == MVT::i32) {
      Src = Src.getOperand(0);
      return true;
    }

    if (Subtarget->useRealTrue16Insts())
      // In true16 mode, pack src to a 32bit
      Src = createVOP3PSrc32FromLo16(Src, In, CurDAG, Subtarget);
  } else if (IsExtractHigh)
    Mods |= SISrcMods::OP_SEL_0;

  return true;
}
4264
4265bool AMDGPUDAGToDAGISel::SelectVOP3PMadMixModsExt(SDValue In, SDValue &Src,
4266 SDValue &SrcMods) const {
4267 unsigned Mods = 0;
4268 if (!SelectVOP3PMadMixModsImpl(In, Src, Mods, MVT::f16))
4269 return false;
4270 SrcMods = CurDAG->getTargetConstant(Mods, SDLoc(In), MVT::i32);
4271 return true;
4272}
4273
4274bool AMDGPUDAGToDAGISel::SelectVOP3PMadMixMods(SDValue In, SDValue &Src,
4275 SDValue &SrcMods) const {
4276 unsigned Mods = 0;
4277 SelectVOP3PMadMixModsImpl(In, Src, Mods, MVT::f16);
4278 SrcMods = CurDAG->getTargetConstant(Mods, SDLoc(In), MVT::i32);
4279 return true;
4280}
4281
4282bool AMDGPUDAGToDAGISel::SelectVOP3PMadMixBF16ModsExt(SDValue In, SDValue &Src,
4283 SDValue &SrcMods) const {
4284 unsigned Mods = 0;
4285 if (!SelectVOP3PMadMixModsImpl(In, Src, Mods, MVT::bf16))
4286 return false;
4287 SrcMods = CurDAG->getTargetConstant(Mods, SDLoc(In), MVT::i32);
4288 return true;
4289}
4290
4291bool AMDGPUDAGToDAGISel::SelectVOP3PMadMixBF16Mods(SDValue In, SDValue &Src,
4292 SDValue &SrcMods) const {
4293 unsigned Mods = 0;
4294 SelectVOP3PMadMixModsImpl(In, Src, Mods, MVT::bf16);
4295 SrcMods = CurDAG->getTargetConstant(Mods, SDLoc(In), MVT::i32);
4296 return true;
4297}
4298
4299// Match BITOP3 operation and return a number of matched instructions plus
4300// truth table.
4301static std::pair<unsigned, uint8_t> BitOp3_Op(SDValue In,
4303 unsigned NumOpcodes = 0;
4304 uint8_t LHSBits, RHSBits;
4305
4306 auto getOperandBits = [&Src, In](SDValue Op, uint8_t &Bits) -> bool {
4307 // Define truth table given Src0, Src1, Src2 bits permutations:
4308 // 0 0 0
4309 // 0 0 1
4310 // 0 1 0
4311 // 0 1 1
4312 // 1 0 0
4313 // 1 0 1
4314 // 1 1 0
4315 // 1 1 1
4316 const uint8_t SrcBits[3] = { 0xf0, 0xcc, 0xaa };
4317
4318 if (auto *C = dyn_cast<ConstantSDNode>(Op)) {
4319 if (C->isAllOnes()) {
4320 Bits = 0xff;
4321 return true;
4322 }
4323 if (C->isZero()) {
4324 Bits = 0;
4325 return true;
4326 }
4327 }
4328
4329 for (unsigned I = 0; I < Src.size(); ++I) {
4330 // Try to find existing reused operand
4331 if (Src[I] == Op) {
4332 Bits = SrcBits[I];
4333 return true;
4334 }
4335 // Try to replace parent operator
4336 if (Src[I] == In) {
4337 Bits = SrcBits[I];
4338 Src[I] = Op;
4339 return true;
4340 }
4341 }
4342
4343 if (Src.size() == 3) {
4344 // No room left for operands. Try one last time, there can be a 'not' of
4345 // one of our source operands. In this case we can compute the bits
4346 // without growing Src vector.
4347 if (Op.getOpcode() == ISD::XOR) {
4348 if (auto *C = dyn_cast<ConstantSDNode>(Op.getOperand(1))) {
4349 if (C->isAllOnes()) {
4350 SDValue LHS = Op.getOperand(0);
4351 for (unsigned I = 0; I < Src.size(); ++I) {
4352 if (Src[I] == LHS) {
4353 Bits = ~SrcBits[I];
4354 return true;
4355 }
4356 }
4357 }
4358 }
4359 }
4360
4361 return false;
4362 }
4363
4364 Bits = SrcBits[Src.size()];
4365 Src.push_back(Op);
4366 return true;
4367 };
4368
4369 switch (In.getOpcode()) {
4370 case ISD::AND:
4371 case ISD::OR:
4372 case ISD::XOR: {
4373 SDValue LHS = In.getOperand(0);
4374 SDValue RHS = In.getOperand(1);
4375
4376 SmallVector<SDValue, 3> Backup(Src.begin(), Src.end());
4377 if (!getOperandBits(LHS, LHSBits) ||
4378 !getOperandBits(RHS, RHSBits)) {
4379 Src = std::move(Backup);
4380 return std::make_pair(0, 0);
4381 }
4382
4383 // Recursion is naturally limited by the size of the operand vector.
4384 auto Op = BitOp3_Op(LHS, Src);
4385 if (Op.first) {
4386 NumOpcodes += Op.first;
4387 LHSBits = Op.second;
4388 }
4389
4390 Op = BitOp3_Op(RHS, Src);
4391 if (Op.first) {
4392 NumOpcodes += Op.first;
4393 RHSBits = Op.second;
4394 }
4395 break;
4396 }
4397 default:
4398 return std::make_pair(0, 0);
4399 }
4400
4401 uint8_t TTbl;
4402 switch (In.getOpcode()) {
4403 case ISD::AND:
4404 TTbl = LHSBits & RHSBits;
4405 break;
4406 case ISD::OR:
4407 TTbl = LHSBits | RHSBits;
4408 break;
4409 case ISD::XOR:
4410 TTbl = LHSBits ^ RHSBits;
4411 break;
4412 default:
4413 break;
4414 }
4415
4416 return std::make_pair(NumOpcodes + 1, TTbl);
4417}
4418
4419bool AMDGPUDAGToDAGISel::SelectBITOP3(SDValue In, SDValue &Src0, SDValue &Src1,
4420 SDValue &Src2, SDValue &Tbl) const {
4422 uint8_t TTbl;
4423 unsigned NumOpcodes;
4424
4425 std::tie(NumOpcodes, TTbl) = BitOp3_Op(In, Src);
4426
4427 // Src.empty() case can happen if all operands are all zero or all ones.
4428 // Normally it shall be optimized out before reaching this.
4429 if (NumOpcodes < 2 || Src.empty())
4430 return false;
4431
4432 // For a uniform case threshold should be higher to account for moves between
4433 // VGPRs and SGPRs. It needs one operand in a VGPR, rest two can be in SGPRs
4434 // and a readtfirstlane after.
4435 if (NumOpcodes < 4 && !In->isDivergent())
4436 return false;
4437
4438 if (NumOpcodes == 2 && In.getValueType() == MVT::i32) {
4439 // Avoid using BITOP3 for OR3, XOR3, AND_OR. This is not faster but makes
4440 // asm more readable. This cannot be modeled with AddedComplexity because
4441 // selector does not know how many operations did we match.
4442 if ((In.getOpcode() == ISD::XOR || In.getOpcode() == ISD::OR) &&
4443 (In.getOperand(0).getOpcode() == In.getOpcode() ||
4444 In.getOperand(1).getOpcode() == In.getOpcode()))
4445 return false;
4446
4447 if (In.getOpcode() == ISD::OR &&
4448 (In.getOperand(0).getOpcode() == ISD::AND ||
4449 In.getOperand(1).getOpcode() == ISD::AND))
4450 return false;
4451 }
4452
4453 // Last operand can be ignored, turning a ternary operation into a binary.
4454 // For example: (~a & b & c) | (~a & b & ~c) -> (~a & b). We can replace
4455 // 'c' with 'a' here without changing the answer. In some pathological
4456 // cases it should be possible to get an operation with a single operand
4457 // too if optimizer would not catch it.
4458 while (Src.size() < 3)
4459 Src.push_back(Src[0]);
4460
4461 Src0 = Src[0];
4462 Src1 = Src[1];
4463 Src2 = Src[2];
4464
4465 Tbl = CurDAG->getTargetConstant(TTbl, SDLoc(In), MVT::i32);
4466 return true;
4467}
4468
4469SDValue AMDGPUDAGToDAGISel::getHi16Elt(SDValue In) const {
4470 if (In.isUndef())
4471 return CurDAG->getUNDEF(MVT::i32);
4472
4473 if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(In)) {
4474 SDLoc SL(In);
4475 return CurDAG->getConstant(C->getZExtValue() << 16, SL, MVT::i32);
4476 }
4477
4478 if (ConstantFPSDNode *C = dyn_cast<ConstantFPSDNode>(In)) {
4479 SDLoc SL(In);
4480 return CurDAG->getConstant(
4481 C->getValueAPF().bitcastToAPInt().getZExtValue() << 16, SL, MVT::i32);
4482 }
4483
4484 SDValue Src;
4485 if (isExtractHiElt(In, Src))
4486 return Src;
4487
4488 return SDValue();
4489}
4490
// Decide whether an immediate should be materialized in a VGPR: true when at
// least one of the first few users strictly requires a VGPR operand (and
// cannot be commuted into an SGPR-accepting slot). Scanning stops after 10
// uses; with more uses an SGPR materialization is assumed profitable.
bool AMDGPUDAGToDAGISel::isVGPRImm(const SDNode * N) const {
  assert(CurDAG->getTarget().getTargetTriple().isAMDGCN());

  const SIRegisterInfo *SIRI = Subtarget->getRegisterInfo();
  const SIInstrInfo *SII = Subtarget->getInstrInfo();

  unsigned Limit = 0;
  bool AllUsesAcceptSReg = true;
  for (SDNode::use_iterator U = N->use_begin(), E = SDNode::use_end();
       Limit < 10 && U != E; ++U, ++Limit) {
    const TargetRegisterClass *RC =
        getOperandRegClass(U->getUser(), U->getOperandNo());

    // If the register class is unknown, it could be an unknown
    // register class that needs to be an SGPR, e.g. an inline asm
    // constraint
    if (!RC || SIRI->isSGPRClass(RC))
      return false;

    if (RC != &AMDGPU::VS_32RegClass && RC != &AMDGPU::VS_64RegClass &&
        RC != &AMDGPU::VS_64_Align2RegClass) {
      // This operand slot does not accept an SGPR directly; see whether
      // commuting the user would move the value into a VS slot.
      AllUsesAcceptSReg = false;
      SDNode *User = U->getUser();
      if (User->isMachineOpcode()) {
        unsigned Opc = User->getMachineOpcode();
        const MCInstrDesc &Desc = SII->get(Opc);
        if (Desc.isCommutable()) {
          unsigned OpIdx = Desc.getNumDefs() + U->getOperandNo();
          unsigned CommuteIdx1 = TargetInstrInfo::CommuteAnyOperandIndex;
          if (SII->findCommutedOpIndices(Desc, OpIdx, CommuteIdx1)) {
            unsigned CommutedOpNo = CommuteIdx1 - Desc.getNumDefs();
            const TargetRegisterClass *CommutedRC =
                getOperandRegClass(U->getUser(), CommutedOpNo);
            if (CommutedRC == &AMDGPU::VS_32RegClass ||
                CommutedRC == &AMDGPU::VS_64RegClass ||
                CommutedRC == &AMDGPU::VS_64_Align2RegClass)
              AllUsesAcceptSReg = true;
          }
        }
      }
      // If "AllUsesAcceptSReg == false" so far we haven't succeeded
      // commuting current user. This means have at least one use
      // that strictly require VGPR. Thus, we will not attempt to commute
      // other user instructions.
      if (!AllUsesAcceptSReg)
        break;
    }
  }
  // Only trust the result if the scan was not cut off by the use limit.
  return !AllUsesAcceptSReg && (Limit < 10);
}
4541
// Return true if this load may be selected as a scalar (SMEM) load: it must
// be uniform, sufficiently aligned, and read memory that cannot be clobbered
// between kernel launch and use (invariant, constant address space, or a
// provably unclobbered simple global load when scalarization is enabled).
bool AMDGPUDAGToDAGISel::isUniformLoad(const SDNode *N) const {
  const auto *Ld = cast<LoadSDNode>(N);
  const MachineMemOperand *MMO = Ld->getMemOperand();

  // FIXME: We ought to able able to take the direct isDivergent result. We
  // cannot rely on the MMO for a uniformity check, and should stop using
  // it. This is a hack for 2 ways that the IR divergence analysis is superior
  // to the DAG divergence: Recognizing shift-of-workitem-id as always
  // uniform, and isSingleLaneExecution. These should be handled in the DAG
  // version, and then this can be dropped.
  if (Ld->isDivergent() && !AMDGPU::isUniformMMO(MMO))
    return false;

  // Alignment requirement: at least the access size, capped at 4 bytes.
  return MMO->getSize().hasValue() &&
         Ld->getAlign() >=
             Align(std::min(MMO->getSize().getValue().getKnownMinValue(),
                            uint64_t(4))) &&
         (MMO->isInvariant() ||
          (Ld->getAddressSpace() == AMDGPUAS::CONSTANT_ADDRESS ||
           Ld->getAddressSpace() == AMDGPUAS::CONSTANT_ADDRESS_32BIT) ||
          (Subtarget->getScalarizeGlobalBehavior() &&
           Ld->getAddressSpace() == AMDGPUAS::GLOBAL_ADDRESS &&
           Ld->isSimple() &&
           static_cast<const SITargetLowering *>(getTargetLowering())
               ->isMemOpHasNoClobberedMemOperand(N)));
}
4568
4571 *static_cast<const AMDGPUTargetLowering*>(getTargetLowering());
4572 bool IsModified = false;
4573 do {
4574 IsModified = false;
4575
4576 // Go over all selected nodes and try to fold them a bit more
4577 SelectionDAG::allnodes_iterator Position = CurDAG->allnodes_begin();
4578 while (Position != CurDAG->allnodes_end()) {
4579 SDNode *Node = &*Position++;
4581 if (!MachineNode)
4582 continue;
4583
4584 SDNode *ResNode = Lowering.PostISelFolding(MachineNode, *CurDAG);
4585 if (ResNode != Node) {
4586 if (ResNode)
4587 ReplaceUses(Node, ResNode);
4588 IsModified = true;
4589 }
4590 }
4591 CurDAG->RemoveDeadNodes();
4592 } while (IsModified);
4593}
4594
4599
return SDValue()
assert(UImm &&(UImm !=~static_cast< T >(0)) &&"Invalid immediate!")
static bool getBaseWithOffsetUsingSplitOR(SelectionDAG &DAG, SDValue Addr, SDValue &N0, SDValue &N1)
static SDValue SelectSAddrFI(SelectionDAG *CurDAG, SDValue SAddr)
static SDValue matchExtFromI32orI32(SDValue Op, bool IsSigned, const SelectionDAG *DAG)
static MemSDNode * findMemSDNode(SDNode *N)
static bool IsCopyFromSGPR(const SIRegisterInfo &TRI, SDValue Val)
static SDValue combineBallotPattern(SDValue VCMP, bool &Negate)
static SDValue matchBF16FPExtendLike(SDValue Op, bool &IsExtractHigh)
static void checkWMMAElementsModifiersF16(BuildVectorSDNode *BV, std::function< bool(SDValue)> ModifierCheck)
Defines an instruction selector for the AMDGPU target.
Contains the definition of a TargetInstrInfo class that is common to all AMD GPUs.
static bool isNoUnsignedWrap(MachineInstr *Addr)
static bool isExtractHiElt(MachineRegisterInfo &MRI, Register In, Register &Out)
static std::pair< unsigned, uint8_t > BitOp3_Op(Register R, SmallVectorImpl< Register > &Src, const MachineRegisterInfo &MRI)
static unsigned gwsIntrinToOpcode(unsigned IntrID)
Provides AMDGPU specific target descriptions.
Base class for AMDGPU specific classes of TargetSubtarget.
The AMDGPU TargetMachine interface definition for hw codegen targets.
MachineBasicBlock MachineBasicBlock::iterator DebugLoc DL
static GCRegistry::Add< CoreCLRGC > E("coreclr", "CoreCLR-compatible GC")
static GCRegistry::Add< OcamlGC > B("ocaml", "ocaml 3.10-compatible GC")
const HexagonInstrInfo * TII
const AbstractManglingParser< Derived, Alloc >::OperatorInfo AbstractManglingParser< Derived, Alloc >::Ops[]
#define F(x, y, z)
Definition MD5.cpp:54
#define I(x, y, z)
Definition MD5.cpp:57
Register Reg
Register const TargetRegisterInfo * TRI
Promote Memory to Register
Definition Mem2Reg.cpp:110
MachineInstr unsigned OpIdx
FunctionAnalysisManager FAM
#define INITIALIZE_PASS_DEPENDENCY(depName)
Definition PassSupport.h:42
#define INITIALIZE_PASS_END(passName, arg, name, cfg, analysis)
Definition PassSupport.h:44
#define INITIALIZE_PASS_BEGIN(passName, arg, name, cfg, analysis)
Definition PassSupport.h:39
Provides R600 specific target descriptions.
Interface definition for R600RegisterInfo.
const SmallVectorImpl< MachineOperand > & Cond
SI DAG Lowering interface definition.
#define LLVM_DEBUG(...)
Definition Debug.h:114
LLVM IR instance of the generic uniformity analysis.
Value * RHS
Value * LHS
void getAnalysisUsage(AnalysisUsage &AU) const override
getAnalysisUsage - This function should be overriden by passes that need analysis information to do t...
AMDGPUDAGToDAGISelLegacy(TargetMachine &TM, CodeGenOptLevel OptLevel)
bool runOnMachineFunction(MachineFunction &MF) override
runOnMachineFunction - This method must be overloaded to perform the desired machine code transformat...
StringRef getPassName() const override
getPassName - Return a nice clean name for a pass.
AMDGPU specific code to select AMDGPU machine instructions for SelectionDAG operations.
void SelectBuildVector(SDNode *N, unsigned RegClassID)
void Select(SDNode *N) override
Main hook for targets to transform nodes into machine nodes.
bool runOnMachineFunction(MachineFunction &MF) override
void PreprocessISelDAG() override
PreprocessISelDAG - This hook allows targets to hack on the graph before instruction selection starts...
void PostprocessISelDAG() override
PostprocessISelDAG() - This hook allows the target to hack on the graph right after selection.
bool matchLoadD16FromBuildVector(SDNode *N) const
PreservedAnalyses run(MachineFunction &MF, MachineFunctionAnalysisManager &MFAM)
AMDGPUISelDAGToDAGPass(TargetMachine &TM)
static SDValue stripBitcast(SDValue Val)
static const fltSemantics & BFloat()
Definition APFloat.h:295
static const fltSemantics & IEEEhalf()
Definition APFloat.h:294
Class for arbitrary precision integers.
Definition APInt.h:78
uint64_t getZExtValue() const
Get zero extended value.
Definition APInt.h:1555
bool isSignMask() const
Check if the APInt's value is returned by getSignMask.
Definition APInt.h:467
bool isMaxSignedValue() const
Determine if this is the largest signed value.
Definition APInt.h:406
int64_t getSExtValue() const
Get sign extended value.
Definition APInt.h:1577
unsigned countr_one() const
Count the number of trailing one bits.
Definition APInt.h:1671
PassT::Result & getResult(IRUnitT &IR, ExtraArgTs... ExtraArgs)
Get the result of an analysis pass for a given IR unit.
Represent the analysis usage information of a pass.
AnalysisUsage & addRequired()
ArrayRef - Represent a constant reference to an array (0 or more elements consecutively in memory),...
Definition ArrayRef.h:40
size_t size() const
size - Get the array size.
Definition ArrayRef.h:142
LLVM Basic Block Representation.
Definition BasicBlock.h:62
const Instruction * getTerminator() const LLVM_READONLY
Returns the terminator instruction; assumes that the block is well-formed.
Definition BasicBlock.h:237
A "pseudo-class" with methods for operating on BUILD_VECTORs.
LLVM_ABI SDValue getSplatValue(const APInt &DemandedElts, BitVector *UndefElements=nullptr) const
Returns the demanded splatted value or a null value if this is not a splat.
uint64_t getZExtValue() const
const APInt & getAPIntValue() const
int64_t getSExtValue() const
Analysis pass which computes a DominatorTree.
Definition Dominators.h:278
Legacy analysis pass which computes a DominatorTree.
Definition Dominators.h:316
Concrete subclass of DominatorTreeBase that is used to compute a normal dominator tree.
Definition Dominators.h:159
FunctionPass class - This class is used to implement most global optimizations.
Definition Pass.h:314
const SIInstrInfo * getInstrInfo() const override
bool useRealTrue16Insts() const
Return true if real (non-fake) variants of True16 instructions using 16-bit registers should be code-...
Generation getGeneration() const
void checkSubtargetFeatures(const Function &F) const
Diagnose inconsistent subtarget features before attempting to codegen function F.
This class is used to represent ISD::LOAD nodes.
const SDValue & getBasePtr() const
ISD::LoadExtType getExtensionType() const
Return whether this is a plain node, or one of the varieties of value-extending loads.
bool hasValue() const
TypeSize getValue() const
Analysis pass that exposes the LoopInfo for a function.
Definition LoopInfo.h:569
SmallVector< LoopT *, 4 > getLoopsInPreorder() const
Return all of the loops in the function in preorder across the loop nests, with siblings in forward p...
The legacy pass manager's analysis pass to compute loop information.
Definition LoopInfo.h:596
Machine Value Type.
static MVT getIntegerVT(unsigned BitWidth)
MachineRegisterInfo & getRegInfo()
getRegInfo - Return information about the registers currently in use.
Function & getFunction()
Return the LLVM function that this machine code represents.
LocationSize getSize() const
Return the size in bytes of the memory reference.
MachineRegisterInfo - Keep track of information for virtual and physical registers,...
const TargetRegisterClass * getRegClass(Register Reg) const
Return the register class of the specified virtual register.
An SDNode that represents everything that will be needed to construct a MachineInstr.
This is an abstract virtual class for memory operations.
unsigned getAddressSpace() const
Return the address space for the associated pointer.
MachineMemOperand * getMemOperand() const
Return the unique MachineMemOperand object describing the memory reference performed by operation.
const SDValue & getChain() const
EVT getMemoryVT() const
Return the type of the in-memory value.
AnalysisType & getAnalysis() const
getAnalysis<AnalysisType>() - This function is used by subclasses to get to the analysis information ...
A set of analyses that are preserved following a run of a transformation pass.
Definition Analysis.h:112
Wrapper class representing virtual and physical registers.
Definition Register.h:20
Wrapper class for IR location info (IR ordering and DebugLoc) to be passed into SDNode creation funct...
Represents one node in the SelectionDAG.
const APInt & getAsAPIntVal() const
Helper method returns the APInt value of a ConstantSDNode.
unsigned getOpcode() const
Return the SelectionDAG opcode value for this node.
bool isDivergent() const
SDNodeFlags getFlags() const
uint64_t getAsZExtVal() const
Helper method returns the zero-extended integer value of a ConstantSDNode.
unsigned getNumOperands() const
Return the number of values used by this operation.
const SDValue & getOperand(unsigned Num) const
uint64_t getConstantOperandVal(unsigned Num) const
Helper method returns the integer value of a ConstantSDNode operand.
bool isPredecessorOf(const SDNode *N) const
Return true if this node is a predecessor of N.
bool isAnyAdd() const
Returns true if the node type is ADD or PTRADD.
static use_iterator use_end()
Unlike LLVM values, Selection DAG nodes may return multiple values as the result of a computation.
SDNode * getNode() const
get the SDNode which holds the desired result
SDValue getValue(unsigned R) const
EVT getValueType() const
Return the ValueType of the referenced return value.
TypeSize getValueSizeInBits() const
Returns the size of the value in bits.
const SDValue & getOperand(unsigned i) const
uint64_t getConstantOperandVal(unsigned i) const
unsigned getOpcode() const
static unsigned getMaxMUBUFImmOffset(const GCNSubtarget &ST)
bool findCommutedOpIndices(const MachineInstr &MI, unsigned &SrcOpIdx0, unsigned &SrcOpIdx1) const override
static unsigned getSubRegFromChannel(unsigned Channel, unsigned NumRegs=1)
static LLVM_READONLY const TargetRegisterClass * getSGPRClassForBitWidth(unsigned BitWidth)
static bool isSGPRClass(const TargetRegisterClass *RC)
bool runOnMachineFunction(MachineFunction &MF) override
runOnMachineFunction - This method must be overloaded to perform the desired machine code transformat...
void getAnalysisUsage(AnalysisUsage &AU) const override
getAnalysisUsage - Subclasses that override getAnalysisUsage must call this.
SelectionDAGISelLegacy(char &ID, std::unique_ptr< SelectionDAGISel > S)
SelectionDAGISelPass(std::unique_ptr< SelectionDAGISel > Selector)
PreservedAnalyses run(MachineFunction &MF, MachineFunctionAnalysisManager &MFAM)
std::unique_ptr< FunctionLoweringInfo > FuncInfo
const TargetLowering * TLI
const TargetInstrInfo * TII
void ReplaceUses(SDValue F, SDValue T)
ReplaceUses - replace all uses of the old node F with the use of the new node T.
void ReplaceNode(SDNode *F, SDNode *T)
Replace all uses of F with T, then remove F from the DAG.
SelectionDAGISel(TargetMachine &tm, CodeGenOptLevel OL=CodeGenOptLevel::Default)
virtual bool runOnMachineFunction(MachineFunction &mf)
const TargetLowering * getTargetLowering() const
This is used to represent a portion of an LLVM function in a low-level Data Dependence DAG representa...
LLVM_ABI MachineSDNode * getMachineNode(unsigned Opcode, const SDLoc &dl, EVT VT)
These are used for target selectors to create a new node with specified return type(s),...
LLVM_ABI SDValue getRegister(Register Reg, EVT VT)
SDValue getTargetFrameIndex(int FI, EVT VT)
LLVM_ABI bool SignBitIsZero(SDValue Op, unsigned Depth=0) const
Return true if the sign bit of Op is known to be zero.
SDValue getTargetConstant(uint64_t Val, const SDLoc &DL, EVT VT, bool isOpaque=false)
LLVM_ABI bool isBaseWithConstantOffset(SDValue Op) const
Return true if the specified operand is an ISD::ADD with a ConstantSDNode on the right-hand side,...
MachineFunction & getMachineFunction() const
LLVM_ABI KnownBits computeKnownBits(SDValue Op, unsigned Depth=0) const
Determine which bits of Op are known to be either zero or one and return them in Known.
ilist< SDNode >::iterator allnodes_iterator
This class consists of common code factored out of the SmallVector class to reduce code duplication b...
void push_back(const T &Elt)
This is a 'vector' (really, a variable-sized array), optimized for the case when the array is small.
StringRef - Represent a constant reference to a string, i.e.
Definition StringRef.h:55
static const unsigned CommuteAnyOperandIndex
Primary interface to the complete machine description for the target machine.
unsigned getID() const
Return the register class ID number.
Legacy analysis pass which computes a CycleInfo.
constexpr ScalarTy getKnownMinValue() const
Returns the minimum value this quantity can represent.
Definition TypeSize.h:165
#define llvm_unreachable(msg)
Marks that the current location is not supposed to be reachable.
@ CONSTANT_ADDRESS_32BIT
Address space for 32-bit constant memory.
@ REGION_ADDRESS
Address space for region memory. (GDS)
@ LOCAL_ADDRESS
Address space for local memory.
@ CONSTANT_ADDRESS
Address space for constant memory (VTX2).
@ FLAT_ADDRESS
Address space for flat memory.
@ GLOBAL_ADDRESS
Address space for global memory (RAT0, VTX0).
@ PRIVATE_ADDRESS
Address space for private memory.
constexpr char Align[]
Key for Kernel::Arg::Metadata::mAlign.
constexpr char Args[]
Key for Kernel::Metadata::mArgs.
std::optional< int64_t > getSMRDEncodedLiteralOffset32(const MCSubtargetInfo &ST, int64_t ByteOffset)
bool isGFX12Plus(const MCSubtargetInfo &STI)
constexpr int64_t getNullPointerValue(unsigned AS)
Get the null pointer value for the given address space.
bool isValid32BitLiteral(uint64_t Val, bool IsFP64)
bool isInlinableLiteral32(int32_t Literal, bool HasInv2Pi)
bool hasSMRDSignedImmOffset(const MCSubtargetInfo &ST)
std::optional< int64_t > getSMRDEncodedOffset(const MCSubtargetInfo &ST, int64_t ByteOffset, bool IsBuffer, bool HasSOffset)
bool isUniformMMO(const MachineMemOperand *MMO)
constexpr std::underlying_type_t< E > Mask()
Get a bitmask with 1s in all places up to the high-order bit of E's largest value.
@ C
The default llvm calling convention, compatible with C.
Definition CallingConv.h:34
@ SETCC
SetCC operator - This evaluates to a true value iff the condition is true.
Definition ISDOpcodes.h:819
@ STACKRESTORE
STACKRESTORE has two operands, an input chain and a pointer to restore to it returns an output chain.
@ PTRADD
PTRADD represents pointer arithmetic semantics, for targets that opt in using shouldPreservePtrArith(...
@ SMUL_LOHI
SMUL_LOHI/UMUL_LOHI - Multiply two integers of type iN, producing a signed/unsigned value of type i[2...
Definition ISDOpcodes.h:275
@ ADDC
Carry-setting nodes for multiple precision addition and subtraction.
Definition ISDOpcodes.h:294
@ FMAD
FMAD - Perform a * b + c, while getting the same result as the separately rounded operations.
Definition ISDOpcodes.h:522
@ ADD
Simple integer binary arithmetic operators.
Definition ISDOpcodes.h:264
@ LOAD
LOAD and STORE have token chains as their first operand, then the same operands as an LLVM load/store...
@ ANY_EXTEND
ANY_EXTEND - Used for integer types. The high bits are undefined.
Definition ISDOpcodes.h:853
@ FMA
FMA - Perform a * b + c with no intermediate rounding step.
Definition ISDOpcodes.h:518
@ INTRINSIC_VOID
OUTCHAIN = INTRINSIC_VOID(INCHAIN, INTRINSICID, arg1, arg2, ...) This node represents a target intrin...
Definition ISDOpcodes.h:220
@ SINT_TO_FP
[SU]INT_TO_FP - These operators convert integers (whose interpreted sign depends on the first letter)...
Definition ISDOpcodes.h:880
@ FADD
Simple binary floating point operators.
Definition ISDOpcodes.h:417
@ BITCAST
BITCAST - This operator converts between integer, vector and FP values, as if the value was stored to...
Definition ISDOpcodes.h:993
@ BUILD_PAIR
BUILD_PAIR - This is the opposite of EXTRACT_ELEMENT in some ways.
Definition ISDOpcodes.h:254
@ FLDEXP
FLDEXP - ldexp, inspired by libm (op0 * 2**op1).
@ CONVERGENCECTRL_GLUE
This does not correspond to any convergence control intrinsic.
@ SIGN_EXTEND
Conversion operators.
Definition ISDOpcodes.h:844
@ SCALAR_TO_VECTOR
SCALAR_TO_VECTOR(VAL) - This represents the operation of loading a scalar value into element 0 of the...
Definition ISDOpcodes.h:665
@ FNEG
Perform various unary floating-point operations inspired by libm.
@ FCANONICALIZE
Returns platform specific canonical encoding of a floating point number.
Definition ISDOpcodes.h:541
@ UNDEF
UNDEF - An undefined node.
Definition ISDOpcodes.h:233
@ CopyFromReg
CopyFromReg - This node indicates that the input value is a virtual or physical register that is defi...
Definition ISDOpcodes.h:230
@ SHL
Shift and rotation operations.
Definition ISDOpcodes.h:765
@ VECTOR_SHUFFLE
VECTOR_SHUFFLE(VEC1, VEC2) - Returns a vector, of the same type as VEC1/VEC2.
Definition ISDOpcodes.h:649
@ EXTRACT_VECTOR_ELT
EXTRACT_VECTOR_ELT(VECTOR, IDX) - Returns a single element from VECTOR identified by the (potentially...
Definition ISDOpcodes.h:576
@ CopyToReg
CopyToReg - This node has three operands: a chain, a register number to set to this value,...
Definition ISDOpcodes.h:224
@ ZERO_EXTEND
ZERO_EXTEND - Used for integer types, zeroing the new bits.
Definition ISDOpcodes.h:850
@ FMINNUM
FMINNUM/FMAXNUM - Perform floating-point minimum maximum on two values, following IEEE-754 definition...
@ TargetFrameIndex
Definition ISDOpcodes.h:187
@ SIGN_EXTEND_INREG
SIGN_EXTEND_INREG - This operator atomically performs a SHL/SRA pair to sign extend a small value in ...
Definition ISDOpcodes.h:888
@ FP_EXTEND
X = FP_EXTEND(Y) - Extend a smaller FP type into a larger FP type.
Definition ISDOpcodes.h:978
@ UADDO_CARRY
Carry-using nodes for multiple precision addition and subtraction.
Definition ISDOpcodes.h:328
@ AND
Bitwise operators - logical and, logical or, logical xor.
Definition ISDOpcodes.h:739
@ INTRINSIC_WO_CHAIN
RESULT = INTRINSIC_WO_CHAIN(INTRINSICID, arg1, arg2, ...) This node represents a target intrinsic fun...
Definition ISDOpcodes.h:205
@ ADDE
Carry-using nodes for multiple precision addition and subtraction.
Definition ISDOpcodes.h:304
@ FP_ROUND
X = FP_ROUND(Y, TRUNC) - Rounding 'Y' from a larger floating point type down to the precision of the ...
Definition ISDOpcodes.h:959
@ TRUNCATE
TRUNCATE - Completely drop the high bits.
Definition ISDOpcodes.h:856
@ BRCOND
BRCOND - Conditional branch.
@ INTRINSIC_W_CHAIN
RESULT,OUTCHAIN = INTRINSIC_W_CHAIN(INCHAIN, INTRINSICID, arg1, ...) This node represents a target in...
Definition ISDOpcodes.h:213
@ BUILD_VECTOR
BUILD_VECTOR(ELT0, ELT1, ELT2, ELT3,...) - Return a fixed-width vector with the specified,...
Definition ISDOpcodes.h:556
bool isExtOpcode(unsigned Opcode)
LLVM_ABI bool isBuildVectorAllZeros(const SDNode *N)
Return true if the specified node is a BUILD_VECTOR where all of the elements are 0 or undef.
CondCode
ISD::CondCode enum - These are ordered carefully to make the bitfields below work out,...
@ User
could "use" a pointer
This is an optimization pass for GlobalISel generic memory operations.
@ Offset
Definition DWP.cpp:532
FunctionAddr VTableAddr Value
Definition InstrProf.h:137
constexpr bool isInt(int64_t x)
Checks if an integer fits into the given bit width.
Definition MathExtras.h:165
LLVM_ABI bool isNullConstant(SDValue V)
Returns true if V is a constant integer zero.
@ Undef
Value of the register doesn't matter.
decltype(auto) dyn_cast(const From &Val)
dyn_cast<X> - Return the argument parameter cast to the specified type.
Definition Casting.h:643
constexpr bool isMask_32(uint32_t Value)
Return true if the argument is a non-empty sequence of ones starting at the least significant bit wit...
Definition MathExtras.h:255
AnalysisManager< MachineFunction > MachineFunctionAnalysisManager
Op::Description Desc
constexpr int popcount(T Value) noexcept
Count the number of set bits in a value.
Definition bit.h:154
unsigned Log2_32(uint32_t Value)
Return the floor log base 2 of the specified value, -1 if the value is zero.
Definition MathExtras.h:331
bool isBoolSGPR(SDValue V)
constexpr bool isPowerOf2_32(uint32_t Value)
Return true if the argument is a power of two > 0.
Definition MathExtras.h:279
constexpr uint32_t Hi_32(uint64_t Value)
Return the high 32 bits of a 64 bit value.
Definition MathExtras.h:150
LLVM_ABI raw_ostream & dbgs()
dbgs() - This returns a reference to a raw_ostream for debugging messages.
Definition Debug.cpp:207
static bool getConstantValue(SDValue N, uint32_t &Out)
constexpr bool isUInt(uint64_t x)
Checks if an unsigned integer fits into the given bit width.
Definition MathExtras.h:189
CodeGenOptLevel
Code generation optimization level.
Definition CodeGen.h:82
class LLVM_GSL_OWNER SmallVector
Forward declaration of SmallVector so that calculateSmallVectorDefaultInlinedElements can reference s...
constexpr uint32_t Lo_32(uint64_t Value)
Return the low 32 bits of a 64 bit value.
Definition MathExtras.h:155
bool isa(const From &Val)
isa<X> - Return true if the parameter to the template is an instance of one of the template type argu...
Definition Casting.h:547
LLVM_ATTRIBUTE_VISIBILITY_DEFAULT AnalysisKey InnerAnalysisManagerProxy< AnalysisManagerT, IRUnitT, ExtraArgTs... >::Key
FunctionAddr VTableAddr uintptr_t uintptr_t Data
Definition InstrProf.h:221
FunctionPass * createAMDGPUISelDag(TargetMachine &TM, CodeGenOptLevel OptLevel)
This pass converts a legalized DAG into a AMDGPU-specific.
@ SMax
Signed integer max implemented in terms of select(cmp()).
@ And
Bitwise or logical AND of integers.
@ Sub
Subtraction of integers.
@ Add
Sum of integers.
DWARFExpression::Operation Op
unsigned M0(unsigned Val)
Definition VE.h:376
LLVM_ABI ConstantSDNode * isConstOrConstSplat(SDValue N, bool AllowUndefs=false, bool AllowTruncation=false)
Returns the SDNode if it is a constant splat BuildVector or constant int.
decltype(auto) cast(const From &Val)
cast<X> - Return the argument parameter cast to the specified type.
Definition Casting.h:559
constexpr T maskTrailingOnes(unsigned N)
Create a bitmask with the N right-most bits set to 1, and all other bits set to 0.
Definition MathExtras.h:77
Implement std::hash so that hash_code can be used in STL containers.
Definition BitVector.h:870
#define N
Extended Value Type.
Definition ValueTypes.h:35
TypeSize getSizeInBits() const
Return the size of the specified value type in bits.
Definition ValueTypes.h:381
uint64_t getScalarSizeInBits() const
Definition ValueTypes.h:393
MVT getSimpleVT() const
Return the SimpleValueType held in the specified simple EVT.
Definition ValueTypes.h:324
bool bitsEq(EVT VT) const
Return true if this has the same number of bits as VT.
Definition ValueTypes.h:264
EVT getVectorElementType() const
Given a vector type, return the type of each element.
Definition ValueTypes.h:336
bool isScalarInteger() const
Return true if this is an integer, but not a vector.
Definition ValueTypes.h:165
unsigned getVectorNumElements() const
Given a vector type, return the number of elements it contains.
Definition ValueTypes.h:344
static KnownBits makeConstant(const APInt &C)
Create known bits from a known constant.
Definition KnownBits.h:317
static KnownBits add(const KnownBits &LHS, const KnownBits &RHS, bool NSW=false, bool NUW=false, bool SelfAdd=false)
Compute knownbits resulting from addition of LHS and RHS.
Definition KnownBits.h:363
APInt getMaxValue() const
Return the maximal unsigned value possible given these KnownBits.
Definition KnownBits.h:148
APInt getMinValue() const
Return the minimal unsigned value possible given these KnownBits.
Definition KnownBits.h:132
static unsigned getSubRegFromChannel(unsigned Channel)
bool hasNoUnsignedWrap() const
This represents a list of ValueType's that has been intern'd by a SelectionDAG.