SIISelLowering.cpp
1//===-- SIISelLowering.cpp - SI DAG Lowering Implementation ---------------===//
2//
3// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4// See https://llvm.org/LICENSE.txt for license information.
5// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6//
7//===----------------------------------------------------------------------===//
8//
9/// \file
10/// Custom DAG lowering for SI
11//
12//===----------------------------------------------------------------------===//
13
14#include "SIISelLowering.h"
15#include "AMDGPU.h"
16#include "AMDGPUInstrInfo.h"
17#include "AMDGPULaneMaskUtils.h"
19#include "AMDGPUTargetMachine.h"
20#include "GCNSubtarget.h"
23#include "SIRegisterInfo.h"
24#include "llvm/ADT/APInt.h"
26#include "llvm/ADT/Statistic.h"
41#include "llvm/IR/IRBuilder.h"
43#include "llvm/IR/IntrinsicsAMDGPU.h"
44#include "llvm/IR/IntrinsicsR600.h"
45#include "llvm/IR/MDBuilder.h"
48#include "llvm/Support/ModRef.h"
50#include <optional>
51
52using namespace llvm;
53using namespace llvm::SDPatternMatch;
54
55#define DEBUG_TYPE "si-lower"
56
57STATISTIC(NumTailCalls, "Number of tail calls");
58
59static cl::opt<bool>
60 DisableLoopAlignment("amdgpu-disable-loop-alignment",
61 cl::desc("Do not align and prefetch loops"),
62 cl::init(false));
63
65 "amdgpu-use-divergent-register-indexing", cl::Hidden,
66 cl::desc("Use indirect register addressing for divergent indexes"),
67 cl::init(false));
68
73
78
79static unsigned findFirstFreeSGPR(CCState &CCInfo) {
80 unsigned NumSGPRs = AMDGPU::SGPR_32RegClass.getNumRegs();
81 for (unsigned Reg = 0; Reg < NumSGPRs; ++Reg) {
82 if (!CCInfo.isAllocated(AMDGPU::SGPR0 + Reg)) {
83 return AMDGPU::SGPR0 + Reg;
84 }
85 }
86 llvm_unreachable("Cannot allocate sgpr");
87}
88
90 const GCNSubtarget &STI)
91 : AMDGPUTargetLowering(TM, STI, STI), Subtarget(&STI) {
92 addRegisterClass(MVT::i1, &AMDGPU::VReg_1RegClass);
93 addRegisterClass(MVT::i64, &AMDGPU::SReg_64RegClass);
94
95 addRegisterClass(MVT::i32, &AMDGPU::SReg_32RegClass);
96
97 const SIRegisterInfo *TRI = STI.getRegisterInfo();
98 const TargetRegisterClass *V32RegClass =
99 TRI->getDefaultVectorSuperClassForBitWidth(32);
100 addRegisterClass(MVT::f32, V32RegClass);
101
102 addRegisterClass(MVT::v2i32, &AMDGPU::SReg_64RegClass);
103
104 const TargetRegisterClass *V64RegClass =
105 TRI->getDefaultVectorSuperClassForBitWidth(64);
106
107 addRegisterClass(MVT::f64, V64RegClass);
108 addRegisterClass(MVT::v2f32, V64RegClass);
109 addRegisterClass(MVT::Untyped, V64RegClass);
110
111 addRegisterClass(MVT::v3i32, &AMDGPU::SGPR_96RegClass);
112 addRegisterClass(MVT::v3f32, TRI->getDefaultVectorSuperClassForBitWidth(96));
113
114 addRegisterClass(MVT::v2i64, &AMDGPU::SGPR_128RegClass);
115 addRegisterClass(MVT::v2f64, &AMDGPU::SGPR_128RegClass);
116
117 addRegisterClass(MVT::v4i32, &AMDGPU::SGPR_128RegClass);
118 addRegisterClass(MVT::v4f32, TRI->getDefaultVectorSuperClassForBitWidth(128));
119
120 addRegisterClass(MVT::v5i32, &AMDGPU::SGPR_160RegClass);
121 addRegisterClass(MVT::v5f32, TRI->getDefaultVectorSuperClassForBitWidth(160));
122
123 addRegisterClass(MVT::v6i32, &AMDGPU::SGPR_192RegClass);
124 addRegisterClass(MVT::v6f32, TRI->getDefaultVectorSuperClassForBitWidth(192));
125
126 addRegisterClass(MVT::v3i64, &AMDGPU::SGPR_192RegClass);
127 addRegisterClass(MVT::v3f64, TRI->getDefaultVectorSuperClassForBitWidth(192));
128
129 addRegisterClass(MVT::v7i32, &AMDGPU::SGPR_224RegClass);
130 addRegisterClass(MVT::v7f32, TRI->getDefaultVectorSuperClassForBitWidth(224));
131
132 addRegisterClass(MVT::v8i32, &AMDGPU::SGPR_256RegClass);
133 addRegisterClass(MVT::v8f32, TRI->getDefaultVectorSuperClassForBitWidth(256));
134
135 addRegisterClass(MVT::v4i64, &AMDGPU::SGPR_256RegClass);
136 addRegisterClass(MVT::v4f64, TRI->getDefaultVectorSuperClassForBitWidth(256));
137
138 addRegisterClass(MVT::v9i32, &AMDGPU::SGPR_288RegClass);
139 addRegisterClass(MVT::v9f32, TRI->getDefaultVectorSuperClassForBitWidth(288));
140
141 addRegisterClass(MVT::v10i32, &AMDGPU::SGPR_320RegClass);
142 addRegisterClass(MVT::v10f32,
143 TRI->getDefaultVectorSuperClassForBitWidth(320));
144
145 addRegisterClass(MVT::v11i32, &AMDGPU::SGPR_352RegClass);
146 addRegisterClass(MVT::v11f32,
147 TRI->getDefaultVectorSuperClassForBitWidth(352));
148
149 addRegisterClass(MVT::v12i32, &AMDGPU::SGPR_384RegClass);
150 addRegisterClass(MVT::v12f32,
151 TRI->getDefaultVectorSuperClassForBitWidth(384));
152
153 addRegisterClass(MVT::v16i32, &AMDGPU::SGPR_512RegClass);
154 addRegisterClass(MVT::v16f32,
155 TRI->getDefaultVectorSuperClassForBitWidth(512));
156
157 addRegisterClass(MVT::v8i64, &AMDGPU::SGPR_512RegClass);
158 addRegisterClass(MVT::v8f64, TRI->getDefaultVectorSuperClassForBitWidth(512));
159
160 addRegisterClass(MVT::v16i64, &AMDGPU::SGPR_1024RegClass);
161 addRegisterClass(MVT::v16f64,
162 TRI->getDefaultVectorSuperClassForBitWidth(1024));
163
164 if (Subtarget->has16BitInsts()) {
165 if (Subtarget->useRealTrue16Insts()) {
166 addRegisterClass(MVT::i16, &AMDGPU::VGPR_16RegClass);
167 addRegisterClass(MVT::f16, &AMDGPU::VGPR_16RegClass);
168 addRegisterClass(MVT::bf16, &AMDGPU::VGPR_16RegClass);
169 } else {
170 addRegisterClass(MVT::i16, &AMDGPU::SReg_32RegClass);
171 addRegisterClass(MVT::f16, &AMDGPU::SReg_32RegClass);
172 addRegisterClass(MVT::bf16, &AMDGPU::SReg_32RegClass);
173 }
174
175 // Unless there are also VOP3P operations, no operations are really legal.
176 addRegisterClass(MVT::v2i16, &AMDGPU::SReg_32RegClass);
177 addRegisterClass(MVT::v2f16, &AMDGPU::SReg_32RegClass);
178 addRegisterClass(MVT::v2bf16, &AMDGPU::SReg_32RegClass);
179 addRegisterClass(MVT::v4i16, &AMDGPU::SReg_64RegClass);
180 addRegisterClass(MVT::v4f16, &AMDGPU::SReg_64RegClass);
181 addRegisterClass(MVT::v4bf16, &AMDGPU::SReg_64RegClass);
182 addRegisterClass(MVT::v8i16, &AMDGPU::SGPR_128RegClass);
183 addRegisterClass(MVT::v8f16, &AMDGPU::SGPR_128RegClass);
184 addRegisterClass(MVT::v8bf16, &AMDGPU::SGPR_128RegClass);
185 addRegisterClass(MVT::v16i16, &AMDGPU::SGPR_256RegClass);
186 addRegisterClass(MVT::v16f16, &AMDGPU::SGPR_256RegClass);
187 addRegisterClass(MVT::v16bf16, &AMDGPU::SGPR_256RegClass);
188 addRegisterClass(MVT::v32i16, &AMDGPU::SGPR_512RegClass);
189 addRegisterClass(MVT::v32f16, &AMDGPU::SGPR_512RegClass);
190 addRegisterClass(MVT::v32bf16, &AMDGPU::SGPR_512RegClass);
191 }
192
193 addRegisterClass(MVT::v32i32, &AMDGPU::VReg_1024RegClass);
194 addRegisterClass(MVT::v32f32,
195 TRI->getDefaultVectorSuperClassForBitWidth(1024));
196
197 computeRegisterProperties(Subtarget->getRegisterInfo());
198
201
202 // The boolean content concept here is too inflexible. Compares only ever
203 // really produce a 1-bit result. Any copy/extend from these will turn into a
204 // select, and zext/1 or sext/-1 are equally cheap. Arbitrarily choose 0/1, as
205 // it's what most targets use.
208
209 // We need to custom lower vector stores from local memory
211 {MVT::v2i32, MVT::v3i32, MVT::v4i32, MVT::v5i32,
212 MVT::v6i32, MVT::v7i32, MVT::v8i32, MVT::v9i32,
213 MVT::v10i32, MVT::v11i32, MVT::v12i32, MVT::v16i32,
214 MVT::i1, MVT::v32i32},
215 Custom);
216
218 {MVT::v2i32, MVT::v3i32, MVT::v4i32, MVT::v5i32,
219 MVT::v6i32, MVT::v7i32, MVT::v8i32, MVT::v9i32,
220 MVT::v10i32, MVT::v11i32, MVT::v12i32, MVT::v16i32,
221 MVT::i1, MVT::v32i32},
222 Custom);
223
224 if (isTypeLegal(MVT::bf16)) {
225 for (unsigned Opc :
234 ISD::SETCC}) {
235 setOperationAction(Opc, MVT::bf16, Promote);
236 }
237
239
241 AddPromotedToType(ISD::SELECT, MVT::bf16, MVT::i16);
242
246
247 // We only need to custom lower because we can't specify an action for bf16
248 // sources.
251 }
252
253 setTruncStoreAction(MVT::v2i32, MVT::v2i16, Expand);
254 setTruncStoreAction(MVT::v3i32, MVT::v3i16, Expand);
255 setTruncStoreAction(MVT::v4i32, MVT::v4i16, Expand);
256 setTruncStoreAction(MVT::v8i32, MVT::v8i16, Expand);
257 setTruncStoreAction(MVT::v16i32, MVT::v16i16, Expand);
258 setTruncStoreAction(MVT::v32i32, MVT::v32i16, Expand);
259 setTruncStoreAction(MVT::v2i32, MVT::v2i8, Expand);
260 setTruncStoreAction(MVT::v4i32, MVT::v4i8, Expand);
261 setTruncStoreAction(MVT::v8i32, MVT::v8i8, Expand);
262 setTruncStoreAction(MVT::v16i32, MVT::v16i8, Expand);
263 setTruncStoreAction(MVT::v32i32, MVT::v32i8, Expand);
264 setTruncStoreAction(MVT::v2i16, MVT::v2i8, Expand);
265 setTruncStoreAction(MVT::v4i16, MVT::v4i8, Expand);
266 setTruncStoreAction(MVT::v8i16, MVT::v8i8, Expand);
267 setTruncStoreAction(MVT::v16i16, MVT::v16i8, Expand);
268 setTruncStoreAction(MVT::v32i16, MVT::v32i8, Expand);
269
270 setTruncStoreAction(MVT::v3i64, MVT::v3i16, Expand);
271 setTruncStoreAction(MVT::v3i64, MVT::v3i32, Expand);
272 setTruncStoreAction(MVT::v4i64, MVT::v4i8, Expand);
273 setTruncStoreAction(MVT::v8i64, MVT::v8i8, Expand);
274 setTruncStoreAction(MVT::v8i64, MVT::v8i16, Expand);
275 setTruncStoreAction(MVT::v8i64, MVT::v8i32, Expand);
276 setTruncStoreAction(MVT::v16i64, MVT::v16i32, Expand);
277
278 setOperationAction(ISD::GlobalAddress, {MVT::i32, MVT::i64}, Custom);
279 setOperationAction(ISD::ExternalSymbol, {MVT::i32, MVT::i64}, Custom);
280
284 AddPromotedToType(ISD::SELECT, MVT::f64, MVT::i64);
285
286 setOperationAction(ISD::FSQRT, {MVT::f32, MVT::f64}, Custom);
287
289 {MVT::f32, MVT::i32, MVT::i64, MVT::f64, MVT::i1}, Expand);
290
292 setOperationAction(ISD::SETCC, {MVT::v2i1, MVT::v4i1}, Expand);
293 AddPromotedToType(ISD::SETCC, MVT::i1, MVT::i32);
294
296 {MVT::v2i32, MVT::v3i32, MVT::v4i32, MVT::v5i32,
297 MVT::v6i32, MVT::v7i32, MVT::v8i32, MVT::v9i32,
298 MVT::v10i32, MVT::v11i32, MVT::v12i32, MVT::v16i32},
299 Expand);
301 {MVT::v2f32, MVT::v3f32, MVT::v4f32, MVT::v5f32,
302 MVT::v6f32, MVT::v7f32, MVT::v8f32, MVT::v9f32,
303 MVT::v10f32, MVT::v11f32, MVT::v12f32, MVT::v16f32},
304 Expand);
305
307 {MVT::v2i1, MVT::v4i1, MVT::v2i8, MVT::v4i8, MVT::v2i16,
308 MVT::v3i16, MVT::v4i16, MVT::Other},
309 Custom);
310
313 {MVT::i1, MVT::i32, MVT::i64, MVT::f32, MVT::f64}, Expand);
314
316
318
320 Expand);
321
323
324#if 0
326#endif
327
328 // We only support LOAD/STORE and vector manipulation ops for vectors
329 // with > 4 elements.
330 for (MVT VT :
331 {MVT::v8i32, MVT::v8f32, MVT::v9i32, MVT::v9f32, MVT::v10i32,
332 MVT::v10f32, MVT::v11i32, MVT::v11f32, MVT::v12i32, MVT::v12f32,
333 MVT::v16i32, MVT::v16f32, MVT::v2i64, MVT::v2f64, MVT::v4i16,
334 MVT::v4f16, MVT::v4bf16, MVT::v3i64, MVT::v3f64, MVT::v6i32,
335 MVT::v6f32, MVT::v4i64, MVT::v4f64, MVT::v8i64, MVT::v8f64,
336 MVT::v8i16, MVT::v8f16, MVT::v8bf16, MVT::v16i16, MVT::v16f16,
337 MVT::v16bf16, MVT::v16i64, MVT::v16f64, MVT::v32i32, MVT::v32f32,
338 MVT::v32i16, MVT::v32f16, MVT::v32bf16}) {
339 for (unsigned Op = 0; Op < ISD::BUILTIN_OP_END; ++Op) {
340 switch (Op) {
341 case ISD::LOAD:
342 case ISD::STORE:
344 case ISD::BITCAST:
345 case ISD::UNDEF:
349 case ISD::IS_FPCLASS:
350 break;
355 break;
356 default:
358 break;
359 }
360 }
361 }
362
364
365 // TODO: For dynamic 64-bit vector inserts/extracts, should emit a pseudo that
366 // is expanded to avoid having two separate loops in case the index is a VGPR.
367
368 // Most operations are naturally 32-bit vector operations. We only support
369 // load and store of i64 vectors, so promote v2i64 vector operations to v4i32.
370 for (MVT Vec64 : {MVT::v2i64, MVT::v2f64}) {
372 AddPromotedToType(ISD::BUILD_VECTOR, Vec64, MVT::v4i32);
373
375 AddPromotedToType(ISD::EXTRACT_VECTOR_ELT, Vec64, MVT::v4i32);
376
378 AddPromotedToType(ISD::INSERT_VECTOR_ELT, Vec64, MVT::v4i32);
379
381 AddPromotedToType(ISD::SCALAR_TO_VECTOR, Vec64, MVT::v4i32);
382 }
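
  // For example, a BUILD_VECTOR of v2i64 is promoted to the equivalent v4i32
  // build, with each 64-bit element handled as two 32-bit halves; the same
  // applies to the extract/insert/scalar_to_vector cases promoted above.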
383
384 for (MVT Vec64 : {MVT::v3i64, MVT::v3f64}) {
386 AddPromotedToType(ISD::BUILD_VECTOR, Vec64, MVT::v6i32);
387
389 AddPromotedToType(ISD::EXTRACT_VECTOR_ELT, Vec64, MVT::v6i32);
390
392 AddPromotedToType(ISD::INSERT_VECTOR_ELT, Vec64, MVT::v6i32);
393
395 AddPromotedToType(ISD::SCALAR_TO_VECTOR, Vec64, MVT::v6i32);
396 }
397
398 for (MVT Vec64 : {MVT::v4i64, MVT::v4f64}) {
400 AddPromotedToType(ISD::BUILD_VECTOR, Vec64, MVT::v8i32);
401
403 AddPromotedToType(ISD::EXTRACT_VECTOR_ELT, Vec64, MVT::v8i32);
404
406 AddPromotedToType(ISD::INSERT_VECTOR_ELT, Vec64, MVT::v8i32);
407
409 AddPromotedToType(ISD::SCALAR_TO_VECTOR, Vec64, MVT::v8i32);
410 }
411
412 for (MVT Vec64 : {MVT::v8i64, MVT::v8f64}) {
414 AddPromotedToType(ISD::BUILD_VECTOR, Vec64, MVT::v16i32);
415
417 AddPromotedToType(ISD::EXTRACT_VECTOR_ELT, Vec64, MVT::v16i32);
418
420 AddPromotedToType(ISD::INSERT_VECTOR_ELT, Vec64, MVT::v16i32);
421
423 AddPromotedToType(ISD::SCALAR_TO_VECTOR, Vec64, MVT::v16i32);
424 }
425
426 for (MVT Vec64 : {MVT::v16i64, MVT::v16f64}) {
428 AddPromotedToType(ISD::BUILD_VECTOR, Vec64, MVT::v32i32);
429
431 AddPromotedToType(ISD::EXTRACT_VECTOR_ELT, Vec64, MVT::v32i32);
432
434 AddPromotedToType(ISD::INSERT_VECTOR_ELT, Vec64, MVT::v32i32);
435
437 AddPromotedToType(ISD::SCALAR_TO_VECTOR, Vec64, MVT::v32i32);
438 }
439
441 {MVT::v4i32, MVT::v4f32, MVT::v8i32, MVT::v8f32,
442 MVT::v16i32, MVT::v16f32, MVT::v32i32, MVT::v32f32},
443 Custom);
444
445 if (Subtarget->hasPkMovB32()) {
446 // TODO: 16-bit element vectors should be legal with even aligned elements.
447 // TODO: Can be legal with wider source types than the result with
448 // subregister extracts.
449 setOperationAction(ISD::VECTOR_SHUFFLE, {MVT::v2i32, MVT::v2f32}, Legal);
450 }
451
453 // Prevent SELECT v2i32 from being implemented with the above bitwise ops and
454 // instead lower to cndmask in SITargetLowering::LowerSELECT().
456 // Enable MatchRotate to produce ISD::ROTR, which is later transformed to
457 // alignbit.
458 setOperationAction(ISD::ROTR, MVT::v2i32, Custom);
459
460 setOperationAction(ISD::BUILD_VECTOR, {MVT::v4f16, MVT::v4i16, MVT::v4bf16},
461 Custom);
462
463 // Avoid stack access for these.
464 // TODO: Generalize to more vector types.
466 {MVT::v2i16, MVT::v2f16, MVT::v2bf16, MVT::v2i8, MVT::v4i8,
467 MVT::v8i8, MVT::v4i16, MVT::v4f16, MVT::v4bf16},
468 Custom);
469
470 // Deal with vec3 vector operations when widened to vec4.
472 {MVT::v3i32, MVT::v3f32, MVT::v4i32, MVT::v4f32}, Custom);
473
474 // Deal with vec5/6/7 vector operations when widened to vec8.
476 {MVT::v5i32, MVT::v5f32, MVT::v6i32, MVT::v6f32,
477 MVT::v7i32, MVT::v7f32, MVT::v8i32, MVT::v8f32,
478 MVT::v9i32, MVT::v9f32, MVT::v10i32, MVT::v10f32,
479 MVT::v11i32, MVT::v11f32, MVT::v12i32, MVT::v12f32},
480 Custom);
481
482 // BUFFER/FLAT_ATOMIC_CMP_SWAP on GCN GPUs needs input marshalling,
483 // and output demarshalling
484 setOperationAction(ISD::ATOMIC_CMP_SWAP, {MVT::i32, MVT::i64}, Custom);
485
486 // We can't return success/failure, only the old value,
487 // let LLVM add the comparison
489 Expand);
490
491 setOperationAction(ISD::ADDRSPACECAST, {MVT::i32, MVT::i64}, Custom);
492
493 setOperationAction(ISD::BITREVERSE, {MVT::i32, MVT::i64}, Legal);
494
495 // FIXME: This should be narrowed to i32, but that only happens if i64 is
496 // illegal.
497 // FIXME: Should lower sub-i32 bswaps to bit-ops without v_perm_b32.
498 setOperationAction(ISD::BSWAP, {MVT::i64, MVT::i32}, Legal);
499
500 // This is s_memtime on SI and s_memrealtime on VI.
502
503 if (Subtarget->hasSMemRealTime() ||
504 Subtarget->getGeneration() >= AMDGPUSubtarget::GFX11)
507
508 if (Subtarget->has16BitInsts()) {
511 setOperationAction(ISD::IS_FPCLASS, {MVT::f16, MVT::f32, MVT::f64}, Legal);
514 } else {
516 }
517
518 if (Subtarget->hasMadMacF32Insts())
520
524
525 // We only really have 32-bit BFE instructions (and 16-bit on VI).
526 //
527 // On SI+ there are 64-bit BFEs, but they are scalar only and there isn't any
528 // effort to match them now. We want this to be false for i64 cases when the
529 // extraction isn't restricted to the upper or lower half. Ideally we would
530 // have some pass reduce 64-bit extracts to 32-bit if possible. Extracts that
531 // span the midpoint are probably relatively rare, so don't worry about them
532 // for now.
534
535 // Clamp modifier on add/sub
536 if (Subtarget->hasIntClamp())
538
539 if (Subtarget->hasAddNoCarryInsts())
540 setOperationAction({ISD::SADDSAT, ISD::SSUBSAT}, {MVT::i16, MVT::i32},
541 Legal);
542
545 {MVT::f32, MVT::f64}, Custom);
546
547 // These are really only legal for ieee_mode functions. We should be avoiding
548 // them for functions that don't have ieee_mode enabled, so just say they are
549 // legal.
551 {MVT::f32, MVT::f64}, Legal);
552
553 if (Subtarget->haveRoundOpsF64())
555 Legal);
556 else
558 MVT::f64, Custom);
559
561 setOperationAction({ISD::FLDEXP, ISD::STRICT_FLDEXP}, {MVT::f32, MVT::f64},
562 Legal);
563 setOperationAction(ISD::FFREXP, {MVT::f32, MVT::f64}, Custom);
564
567
568 setOperationAction(ISD::BF16_TO_FP, {MVT::i16, MVT::f32, MVT::f64}, Expand);
569 setOperationAction(ISD::FP_TO_BF16, {MVT::i16, MVT::f32, MVT::f64}, Expand);
570
572 Custom);
574 Custom);
576 Custom);
577
578 // Custom lower these because we can't specify a rule based on an illegal
579 // source bf16.
582
583 if (Subtarget->has16BitInsts()) {
586 MVT::i16, Legal);
587
588 AddPromotedToType(ISD::SIGN_EXTEND, MVT::i16, MVT::i32);
589
591 MVT::i16, Expand);
592
596 ISD::CTPOP},
597 MVT::i16, Promote);
598
600
601 setTruncStoreAction(MVT::i64, MVT::i16, Expand);
602
604 AddPromotedToType(ISD::FP16_TO_FP, MVT::i16, MVT::i32);
606 AddPromotedToType(ISD::FP_TO_FP16, MVT::i16, MVT::i32);
607
611
613
614 // F16 - Constant Actions.
617
618 // F16 - Load/Store Actions.
620 AddPromotedToType(ISD::LOAD, MVT::f16, MVT::i16);
622 AddPromotedToType(ISD::STORE, MVT::f16, MVT::i16);
623
624 // BF16 - Load/Store Actions.
626 AddPromotedToType(ISD::LOAD, MVT::bf16, MVT::i16);
628 AddPromotedToType(ISD::STORE, MVT::bf16, MVT::i16);
629
630 // F16 - VOP1 Actions.
633 MVT::f16, Custom);
634
635 // BF16 - VOP1 Actions.
636 if (Subtarget->hasBF16TransInsts())
638
641 MVT::f16, Promote);
644 MVT::bf16, Promote);
645
646 // F16 - VOP2 Actions.
647 setOperationAction({ISD::BR_CC, ISD::SELECT_CC}, {MVT::f16, MVT::bf16},
648 Expand);
652
653 // F16 - VOP3 Actions.
655 if (STI.hasMadF16())
657
658 for (MVT VT :
659 {MVT::v2i16, MVT::v2f16, MVT::v2bf16, MVT::v4i16, MVT::v4f16,
660 MVT::v4bf16, MVT::v8i16, MVT::v8f16, MVT::v8bf16, MVT::v16i16,
661 MVT::v16f16, MVT::v16bf16, MVT::v32i16, MVT::v32f16}) {
662 for (unsigned Op = 0; Op < ISD::BUILTIN_OP_END; ++Op) {
663 switch (Op) {
664 case ISD::LOAD:
665 case ISD::STORE:
667 case ISD::BITCAST:
668 case ISD::UNDEF:
673 case ISD::IS_FPCLASS:
674 break;
677 case ISD::FSIN:
678 case ISD::FCOS:
680 break;
681 default:
683 break;
684 }
685 }
686 }
687
688 // v_perm_b32 can handle either of these.
689 setOperationAction(ISD::BSWAP, {MVT::i16, MVT::v2i16}, Legal);
691
692 // XXX - Do these do anything? Vector constants turn into build_vector.
693 setOperationAction(ISD::Constant, {MVT::v2i16, MVT::v2f16}, Legal);
694
695 setOperationAction(ISD::UNDEF, {MVT::v2i16, MVT::v2f16, MVT::v2bf16},
696 Legal);
697
699 AddPromotedToType(ISD::STORE, MVT::v2i16, MVT::i32);
701 AddPromotedToType(ISD::STORE, MVT::v2f16, MVT::i32);
702
704 AddPromotedToType(ISD::LOAD, MVT::v2i16, MVT::i32);
706 AddPromotedToType(ISD::LOAD, MVT::v2f16, MVT::i32);
707
709 AddPromotedToType(ISD::ATOMIC_LOAD, MVT::v2i16, MVT::i32);
711 AddPromotedToType(ISD::ATOMIC_LOAD, MVT::v2f16, MVT::i32);
712
714 AddPromotedToType(ISD::ATOMIC_STORE, MVT::v2i16, MVT::i32);
716 AddPromotedToType(ISD::ATOMIC_STORE, MVT::v2f16, MVT::i32);
717
718 setOperationAction(ISD::AND, MVT::v2i16, Promote);
719 AddPromotedToType(ISD::AND, MVT::v2i16, MVT::i32);
720 setOperationAction(ISD::OR, MVT::v2i16, Promote);
721 AddPromotedToType(ISD::OR, MVT::v2i16, MVT::i32);
722 setOperationAction(ISD::XOR, MVT::v2i16, Promote);
723 AddPromotedToType(ISD::XOR, MVT::v2i16, MVT::i32);
724
726 AddPromotedToType(ISD::LOAD, MVT::v4i16, MVT::v2i32);
728 AddPromotedToType(ISD::LOAD, MVT::v4f16, MVT::v2i32);
729 setOperationAction(ISD::LOAD, MVT::v4bf16, Promote);
730 AddPromotedToType(ISD::LOAD, MVT::v4bf16, MVT::v2i32);
731
733 AddPromotedToType(ISD::ATOMIC_LOAD, MVT::v4i16, MVT::i64);
735 AddPromotedToType(ISD::ATOMIC_LOAD, MVT::v4f16, MVT::i64);
736
738 AddPromotedToType(ISD::ATOMIC_STORE, MVT::v4i16, MVT::i64);
740 AddPromotedToType(ISD::ATOMIC_STORE, MVT::v4f16, MVT::i64);
741
743 AddPromotedToType(ISD::STORE, MVT::v4i16, MVT::v2i32);
745 AddPromotedToType(ISD::STORE, MVT::v4f16, MVT::v2i32);
747 AddPromotedToType(ISD::STORE, MVT::v4bf16, MVT::v2i32);
748
750 AddPromotedToType(ISD::LOAD, MVT::v8i16, MVT::v4i32);
752 AddPromotedToType(ISD::LOAD, MVT::v8f16, MVT::v4i32);
753 setOperationAction(ISD::LOAD, MVT::v8bf16, Promote);
754 AddPromotedToType(ISD::LOAD, MVT::v8bf16, MVT::v4i32);
755
757 AddPromotedToType(ISD::STORE, MVT::v4i16, MVT::v2i32);
759 AddPromotedToType(ISD::STORE, MVT::v4f16, MVT::v2i32);
760
762 AddPromotedToType(ISD::STORE, MVT::v8i16, MVT::v4i32);
764 AddPromotedToType(ISD::STORE, MVT::v8f16, MVT::v4i32);
766 AddPromotedToType(ISD::STORE, MVT::v8bf16, MVT::v4i32);
767
768 setOperationAction(ISD::LOAD, MVT::v16i16, Promote);
769 AddPromotedToType(ISD::LOAD, MVT::v16i16, MVT::v8i32);
770 setOperationAction(ISD::LOAD, MVT::v16f16, Promote);
771 AddPromotedToType(ISD::LOAD, MVT::v16f16, MVT::v8i32);
772 setOperationAction(ISD::LOAD, MVT::v16bf16, Promote);
773 AddPromotedToType(ISD::LOAD, MVT::v16bf16, MVT::v8i32);
774
776 AddPromotedToType(ISD::STORE, MVT::v16i16, MVT::v8i32);
778 AddPromotedToType(ISD::STORE, MVT::v16f16, MVT::v8i32);
779 setOperationAction(ISD::STORE, MVT::v16bf16, Promote);
780 AddPromotedToType(ISD::STORE, MVT::v16bf16, MVT::v8i32);
781
782 setOperationAction(ISD::LOAD, MVT::v32i16, Promote);
783 AddPromotedToType(ISD::LOAD, MVT::v32i16, MVT::v16i32);
784 setOperationAction(ISD::LOAD, MVT::v32f16, Promote);
785 AddPromotedToType(ISD::LOAD, MVT::v32f16, MVT::v16i32);
786 setOperationAction(ISD::LOAD, MVT::v32bf16, Promote);
787 AddPromotedToType(ISD::LOAD, MVT::v32bf16, MVT::v16i32);
788
790 AddPromotedToType(ISD::STORE, MVT::v32i16, MVT::v16i32);
792 AddPromotedToType(ISD::STORE, MVT::v32f16, MVT::v16i32);
793 setOperationAction(ISD::STORE, MVT::v32bf16, Promote);
794 AddPromotedToType(ISD::STORE, MVT::v32bf16, MVT::v16i32);
795
797 MVT::v2i32, Expand);
799
801 MVT::v4i32, Expand);
802
804 MVT::v8i32, Expand);
805
806 setOperationAction(ISD::BUILD_VECTOR, {MVT::v2i16, MVT::v2f16, MVT::v2bf16},
807 Subtarget->hasVOP3PInsts() ? Legal : Custom);
808
809 setOperationAction(ISD::FNEG, {MVT::v2f16, MVT::v2bf16}, Legal);
810 // This isn't really legal, but this avoids the legalizer unrolling it (and
811 // allows matching fneg (fabs x) patterns)
812 setOperationAction(ISD::FABS, {MVT::v2f16, MVT::v2bf16}, Legal);
813
814 // Can do this in one BFI plus a constant materialize.
816 {MVT::v2f16, MVT::v2bf16, MVT::v4f16, MVT::v4bf16,
817 MVT::v8f16, MVT::v8bf16, MVT::v16f16, MVT::v16bf16,
818 MVT::v32f16, MVT::v32bf16},
819 Custom);
820
823 MVT::f16, Custom);
825
828 {MVT::v4f16, MVT::v8f16, MVT::v16f16, MVT::v32f16},
829 Custom);
830
832 {MVT::v4f16, MVT::v8f16, MVT::v16f16, MVT::v32f16},
833 Expand);
834
835 for (MVT Vec16 :
836 {MVT::v8i16, MVT::v8f16, MVT::v8bf16, MVT::v16i16, MVT::v16f16,
837 MVT::v16bf16, MVT::v32i16, MVT::v32f16, MVT::v32bf16}) {
840 Vec16, Custom);
842 }
843 }
844
845 if (Subtarget->hasVOP3PInsts()) {
849 MVT::v2i16, Legal);
850
854 MVT::v2f16, Legal);
855
857 {MVT::v2i16, MVT::v2f16, MVT::v2bf16}, Custom);
858
860 {MVT::v4f16, MVT::v4i16, MVT::v4bf16, MVT::v8f16,
861 MVT::v8i16, MVT::v8bf16, MVT::v16f16, MVT::v16i16,
862 MVT::v16bf16, MVT::v32f16, MVT::v32i16, MVT::v32bf16},
863 Custom);
864
865 for (MVT VT : {MVT::v4i16, MVT::v8i16, MVT::v16i16, MVT::v32i16})
866 // Split vector operations.
871 VT, Custom);
872
873 for (MVT VT : {MVT::v4f16, MVT::v8f16, MVT::v16f16, MVT::v32f16})
874 // Split vector operations.
877 VT, Custom);
878
881 {MVT::v2f16, MVT::v4f16}, Custom);
882
883 setOperationAction(ISD::FEXP, MVT::v2f16, Custom);
884 setOperationAction(ISD::SELECT, {MVT::v4i16, MVT::v4f16, MVT::v4bf16},
885 Custom);
886
887 if (Subtarget->hasBF16PackedInsts()) {
890 MVT::v2bf16, Legal);
891
892 for (MVT VT : {MVT::v4bf16, MVT::v8bf16, MVT::v16bf16, MVT::v32bf16})
893 // Split vector operations.
896 VT, Custom);
897 }
898
899 if (Subtarget->hasPackedFP32Ops()) {
901 MVT::v2f32, Legal);
903 {MVT::v4f32, MVT::v8f32, MVT::v16f32, MVT::v32f32},
904 Custom);
905 }
906 }
907
909
910 if (Subtarget->has16BitInsts()) {
912 AddPromotedToType(ISD::SELECT, MVT::v2i16, MVT::i32);
914 AddPromotedToType(ISD::SELECT, MVT::v2f16, MVT::i32);
915 } else {
916 // Legalization hack.
917 setOperationAction(ISD::SELECT, {MVT::v2i16, MVT::v2f16}, Custom);
918
920 }
921
923 {MVT::v4i16, MVT::v4f16, MVT::v4bf16, MVT::v2i8, MVT::v4i8,
924 MVT::v8i8, MVT::v8i16, MVT::v8f16, MVT::v8bf16,
925 MVT::v16i16, MVT::v16f16, MVT::v16bf16, MVT::v32i16,
926 MVT::v32f16, MVT::v32bf16},
927 Custom);
928
930
931 if (Subtarget->hasVMulU64Inst())
933 else if (Subtarget->hasScalarSMulU64())
935
936 if (Subtarget->hasMad64_32())
938
939 if (Subtarget->hasSafeSmemPrefetch() || Subtarget->hasVmemPrefInsts())
941
942 if (Subtarget->hasIEEEMinimumMaximumInsts()) {
944 {MVT::f16, MVT::f32, MVT::f64, MVT::v2f16}, Legal);
945 } else {
946 // FIXME: For nnan fmaximum, emit the fmaximum3 instead of fmaxnum
947 if (Subtarget->hasMinimum3Maximum3F32())
949
950 if (Subtarget->hasMinimum3Maximum3PKF16()) {
952
953 // If only the vector form is available, we need to widen to a vector.
954 if (!Subtarget->hasMinimum3Maximum3F16())
956 }
957 }
958
959 if (Subtarget->hasVOP3PInsts()) {
960 // We want to break these into v2f16 pieces, not scalarize.
962 {MVT::v4f16, MVT::v8f16, MVT::v16f16, MVT::v32f16},
963 Custom);
964 }
965
966 if (Subtarget->hasIntMinMax64())
968 Legal);
969
971 {MVT::Other, MVT::f32, MVT::v4f32, MVT::i16, MVT::f16,
972 MVT::bf16, MVT::v2i16, MVT::v2f16, MVT::v2bf16, MVT::i128,
973 MVT::i8},
974 Custom);
975
977 {MVT::v2f16, MVT::v2i16, MVT::v2bf16, MVT::v3f16,
978 MVT::v3i16, MVT::v4f16, MVT::v4i16, MVT::v4bf16,
979 MVT::v8i16, MVT::v8f16, MVT::v8bf16, MVT::Other, MVT::f16,
980 MVT::i16, MVT::bf16, MVT::i8, MVT::i128},
981 Custom);
982
984 {MVT::Other, MVT::v2i16, MVT::v2f16, MVT::v2bf16,
985 MVT::v3i16, MVT::v3f16, MVT::v4f16, MVT::v4i16,
986 MVT::v4bf16, MVT::v8i16, MVT::v8f16, MVT::v8bf16,
987 MVT::f16, MVT::i16, MVT::bf16, MVT::i8, MVT::i128},
988 Custom);
989
995
996 // TODO: Could move this to custom lowering, could benefit from combines on
997 // extract of relevant bits.
999
1001
1002 if (Subtarget->hasBF16ConversionInsts()) {
1003 setOperationAction(ISD::FP_ROUND, {MVT::bf16, MVT::v2bf16}, Custom);
1005 }
1006
1007 if (Subtarget->hasBF16TransInsts()) {
1009 }
1010
1011 if (Subtarget->hasCvtPkF16F32Inst()) {
1013 {MVT::v2f16, MVT::v4f16, MVT::v8f16, MVT::v16f16},
1014 Custom);
1015 }
1016
1020 ISD::SUB,
1022 ISD::MUL,
1023 ISD::FADD,
1024 ISD::FSUB,
1025 ISD::FDIV,
1026 ISD::FMUL,
1035 ISD::FMA,
1036 ISD::SMIN,
1037 ISD::SMAX,
1038 ISD::UMIN,
1039 ISD::UMAX,
1040 ISD::SETCC,
1042 ISD::SMIN,
1043 ISD::SMAX,
1044 ISD::UMIN,
1045 ISD::UMAX,
1046 ISD::AND,
1047 ISD::OR,
1048 ISD::XOR,
1049 ISD::SHL,
1050 ISD::SRL,
1051 ISD::SRA,
1052 ISD::FSHR,
1063
1064 if (Subtarget->has16BitInsts() && !Subtarget->hasMed3_16())
1066
1067 // All memory operations. Some folding on the pointer operand is done to help
1068 // match the constant offsets in the addressing modes.
1070 ISD::STORE,
1095
1096 // FIXME: In other contexts we pretend this is a per-function property.
1098
1100}
1101
1102const GCNSubtarget *SITargetLowering::getSubtarget() const { return Subtarget; }
1103
1105 static const MCPhysReg RCRegs[] = {AMDGPU::MODE};
1106 return RCRegs;
1107}
1108
1109//===----------------------------------------------------------------------===//
1110// TargetLowering queries
1111//===----------------------------------------------------------------------===//
1112
1113// v_mad_mix* support a conversion from f16 to f32.
1114//
1115// There is only one special case, when denormals are enabled, that we don't
1116// currently handle where this would still be OK to use.
1117bool SITargetLowering::isFPExtFoldable(const SelectionDAG &DAG, unsigned Opcode,
1118 EVT DestVT, EVT SrcVT) const {
1119 return DestVT.getScalarType() == MVT::f32 &&
1120 ((((Opcode == ISD::FMAD && Subtarget->hasMadMixInsts()) ||
1121 (Opcode == ISD::FMA && Subtarget->hasFmaMixInsts())) &&
1122 SrcVT.getScalarType() == MVT::f16) ||
1123 (Opcode == ISD::FMA && Subtarget->hasFmaMixBF16Insts() &&
1124 SrcVT.getScalarType() == MVT::bf16)) &&
1125 // TODO: This probably only requires no input flushing?
1127}
1128
1130 LLT DestTy, LLT SrcTy) const {
1131 return ((Opcode == TargetOpcode::G_FMAD && Subtarget->hasMadMixInsts()) ||
1132 (Opcode == TargetOpcode::G_FMA && Subtarget->hasFmaMixInsts())) &&
1133 DestTy.getScalarSizeInBits() == 32 &&
1134 SrcTy.getScalarSizeInBits() == 16 &&
1135 // TODO: This probably only requires no input flushing?
1136 denormalModeIsFlushAllF32(*MI.getMF());
1137}
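
// For example, with mad-mix/fma-mix available and f32 denormals flushed, a
// pattern like (fma (fpext f16:$a), (fpext f16:$b), f32:$c) can keep the
// fpext folded into the mixed-precision instruction (e.g. v_fma_mix_f32)
// instead of emitting separate conversions.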
1138
1140 // SI has some legal vector types, but no legal vector operations. Say no
1141 // shuffles are legal in order to prefer scalarizing some vector operations.
1142 return false;
1143}
1144
1146 CallingConv::ID CC,
1147 EVT VT) const {
1149 return TargetLowering::getRegisterTypeForCallingConv(Context, CC, VT);
1150
1151 if (VT.isVector()) {
1152 EVT ScalarVT = VT.getScalarType();
1153 unsigned Size = ScalarVT.getSizeInBits();
1154 if (Size == 16) {
1155 return Subtarget->has16BitInsts()
1156 ? MVT::getVectorVT(ScalarVT.getSimpleVT(), 2)
1157 : MVT::i32;
1158 }
1159
1160 if (Size < 16)
1161 return Subtarget->has16BitInsts() ? MVT::i16 : MVT::i32;
1162 return Size == 32 ? ScalarVT.getSimpleVT() : MVT::i32;
1163 }
1164
1165 if (!Subtarget->has16BitInsts() && VT.getSizeInBits() == 16)
1166 return MVT::i32;
1167
1168 if (VT.getSizeInBits() > 32)
1169 return MVT::i32;
1170
1171 return TargetLowering::getRegisterTypeForCallingConv(Context, CC, VT);
1172}
1173
1175 CallingConv::ID CC,
1176 EVT VT) const {
1178 return TargetLowering::getNumRegistersForCallingConv(Context, CC, VT);
1179
1180 if (VT.isVector()) {
1181 unsigned NumElts = VT.getVectorNumElements();
1182 EVT ScalarVT = VT.getScalarType();
1183 unsigned Size = ScalarVT.getSizeInBits();
1184
1185 // FIXME: Should probably promote 8-bit vectors to i16.
1186 if (Size == 16)
1187 return (NumElts + 1) / 2;
1188
1189 if (Size <= 32)
1190 return NumElts;
1191
1192 if (Size > 32)
1193 return NumElts * ((Size + 31) / 32);
1194 } else if (VT.getSizeInBits() > 32)
1195 return (VT.getSizeInBits() + 31) / 32;
1196
1197 return TargetLowering::getNumRegistersForCallingConv(Context, CC, VT);
1198}
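
// For example, for a v3f16 argument in a non-kernel calling convention: with
// 16-bit instructions the register type is v2f16 and (3 + 1) / 2 = 2 registers
// are used; without them, each piece is widened to i32 instead.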
1199
1201 LLVMContext &Context, CallingConv::ID CC, EVT VT, EVT &IntermediateVT,
1202 unsigned &NumIntermediates, MVT &RegisterVT) const {
1203 if (CC != CallingConv::AMDGPU_KERNEL && VT.isVector()) {
1204 unsigned NumElts = VT.getVectorNumElements();
1205 EVT ScalarVT = VT.getScalarType();
1206 unsigned Size = ScalarVT.getSizeInBits();
1207 // FIXME: We should fix the ABI to be the same on targets without 16-bit
1208 // support, but unless we can properly handle 3-vectors, it will still be
1209 // inconsistent.
1210 if (Size == 16) {
1211 MVT SimpleIntermediateVT =
1213 IntermediateVT = SimpleIntermediateVT;
1214 RegisterVT = Subtarget->has16BitInsts() ? SimpleIntermediateVT : MVT::i32;
1215 NumIntermediates = (NumElts + 1) / 2;
1216 return (NumElts + 1) / 2;
1217 }
1218
1219 if (Size == 32) {
1220 RegisterVT = ScalarVT.getSimpleVT();
1221 IntermediateVT = RegisterVT;
1222 NumIntermediates = NumElts;
1223 return NumIntermediates;
1224 }
1225
1226 if (Size < 16 && Subtarget->has16BitInsts()) {
1227 // FIXME: Should probably form v2i16 pieces
1228 RegisterVT = MVT::i16;
1229 IntermediateVT = ScalarVT;
1230 NumIntermediates = NumElts;
1231 return NumIntermediates;
1232 }
1233
1234 if (Size != 16 && Size <= 32) {
1235 RegisterVT = MVT::i32;
1236 IntermediateVT = ScalarVT;
1237 NumIntermediates = NumElts;
1238 return NumIntermediates;
1239 }
1240
1241 if (Size > 32) {
1242 RegisterVT = MVT::i32;
1243 IntermediateVT = RegisterVT;
1244 NumIntermediates = NumElts * ((Size + 31) / 32);
1245 return NumIntermediates;
1246 }
1247 }
1248
1250 Context, CC, VT, IntermediateVT, NumIntermediates, RegisterVT);
1251}
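
// For example, breaking down a v5f16 argument with 16-bit instructions:
// IntermediateVT and RegisterVT are the paired 2-element type (v2f16) and
// NumIntermediates is (5 + 1) / 2 = 3; a v5f32 argument instead yields 5 f32
// intermediates.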
1252
1254 const DataLayout &DL, Type *Ty,
1255 unsigned MaxNumLanes) {
1256 assert(MaxNumLanes != 0);
1257
1258 LLVMContext &Ctx = Ty->getContext();
1259 if (auto *VT = dyn_cast<FixedVectorType>(Ty)) {
1260 unsigned NumElts = std::min(MaxNumLanes, VT->getNumElements());
1261 return EVT::getVectorVT(Ctx, TLI.getValueType(DL, VT->getElementType()),
1262 NumElts);
1263 }
1264
1265 return TLI.getValueType(DL, Ty);
1266}
1267
1268// Peek through TFE struct returns to only use the data size.
1270 const DataLayout &DL, Type *Ty,
1271 unsigned MaxNumLanes) {
1272 auto *ST = dyn_cast<StructType>(Ty);
1273 if (!ST)
1274 return memVTFromLoadIntrData(TLI, DL, Ty, MaxNumLanes);
1275
1276 // TFE intrinsics return an aggregate type.
1277 assert(ST->getNumContainedTypes() == 2 &&
1278 ST->getContainedType(1)->isIntegerTy(32));
1279 return memVTFromLoadIntrData(TLI, DL, ST->getContainedType(0), MaxNumLanes);
1280}
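
// For example, a TFE/LWE load intrinsic returning { <4 x float>, i32 } only
// contributes the <4 x float> data member to the memory VT; the trailing i32
// status word is not counted.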
1281
1282/// Map address space 7 to MVT::amdgpuBufferFatPointer because that's its
1283/// in-memory representation. This return value is a custom type because there
1284/// is no MVT::i160 and adding one breaks integer promotion logic. While this
1285/// could cause issues during codegen, these address space 7 pointers will be
1286/// rewritten away by then. Therefore, we can return MVT::amdgpuBufferFatPointer
1287/// in order to allow pre-codegen passes that query TargetTransformInfo, often
1288/// for cost modeling, to work. (This also sets us up decently for doing the
1289/// buffer lowering in GlobalISel if SelectionDAG ever goes away.)
1291 if (AMDGPUAS::BUFFER_FAT_POINTER == AS && DL.getPointerSizeInBits(AS) == 160)
1292 return MVT::amdgpuBufferFatPointer;
1294 DL.getPointerSizeInBits(AS) == 192)
1295 return MVT::amdgpuBufferStridedPointer;
1297}
1298/// Similarly, the in-memory representation of a p7 is {p8, i32}, aka
1299/// v8i32 when padding is added.
1300/// The in-memory representation of a p9 is {p8, i32, i32}, which is
1301/// also v8i32 with padding.
1303 if ((AMDGPUAS::BUFFER_FAT_POINTER == AS &&
1304 DL.getPointerSizeInBits(AS) == 160) ||
1306 DL.getPointerSizeInBits(AS) == 192))
1307 return MVT::v8i32;
1309}
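
// For example, a load of a ptr addrspace(7) value is typed as
// MVT::amdgpuBufferFatPointer in the DAG but uses v8i32 (the padded {p8, i32}
// representation) as its in-memory type.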
1310
1311static unsigned getIntrMemWidth(unsigned IntrID) {
1312 switch (IntrID) {
1313 case Intrinsic::amdgcn_global_load_async_to_lds_b8:
1314 case Intrinsic::amdgcn_cluster_load_async_to_lds_b8:
1315 case Intrinsic::amdgcn_global_store_async_from_lds_b8:
1316 return 8;
1317 case Intrinsic::amdgcn_global_load_async_to_lds_b32:
1318 case Intrinsic::amdgcn_cluster_load_async_to_lds_b32:
1319 case Intrinsic::amdgcn_global_store_async_from_lds_b32:
1320 case Intrinsic::amdgcn_cooperative_atomic_load_32x4B:
1321 case Intrinsic::amdgcn_cooperative_atomic_store_32x4B:
1322 case Intrinsic::amdgcn_flat_load_monitor_b32:
1323 case Intrinsic::amdgcn_global_load_monitor_b32:
1324 return 32;
1325 case Intrinsic::amdgcn_global_load_async_to_lds_b64:
1326 case Intrinsic::amdgcn_cluster_load_async_to_lds_b64:
1327 case Intrinsic::amdgcn_global_store_async_from_lds_b64:
1328 case Intrinsic::amdgcn_cooperative_atomic_load_16x8B:
1329 case Intrinsic::amdgcn_cooperative_atomic_store_16x8B:
1330 case Intrinsic::amdgcn_flat_load_monitor_b64:
1331 case Intrinsic::amdgcn_global_load_monitor_b64:
1332 return 64;
1333 case Intrinsic::amdgcn_global_load_async_to_lds_b128:
1334 case Intrinsic::amdgcn_cluster_load_async_to_lds_b128:
1335 case Intrinsic::amdgcn_global_store_async_from_lds_b128:
1336 case Intrinsic::amdgcn_cooperative_atomic_load_8x16B:
1337 case Intrinsic::amdgcn_cooperative_atomic_store_8x16B:
1338 case Intrinsic::amdgcn_flat_load_monitor_b128:
1339 case Intrinsic::amdgcn_global_load_monitor_b128:
1340 return 128;
1341 default:
1342 llvm_unreachable("Unknown width");
1343 }
1344}
1345
1347 unsigned ArgIdx) {
1348 Value *OrderingArg = CI.getArgOperand(ArgIdx);
1349 unsigned Ord = cast<ConstantInt>(OrderingArg)->getZExtValue();
1350 switch (AtomicOrderingCABI(Ord)) {
1353 break;
1356 break;
1359 break;
1360 default:
1362 }
1363}
1364
1365static unsigned parseSyncscopeMDArg(const CallBase &CI, unsigned ArgIdx) {
1366 MDNode *ScopeMD = cast<MDNode>(
1367 cast<MetadataAsValue>(CI.getArgOperand(ArgIdx))->getMetadata());
1368 StringRef Scope = cast<MDString>(ScopeMD->getOperand(0))->getString();
1369 return CI.getContext().getOrInsertSyncScopeID(Scope);
1370}
1371
1373 const CallBase &CI,
1374 MachineFunction &MF,
1375 unsigned IntrID) const {
1377 if (CI.hasMetadata(LLVMContext::MD_invariant_load))
1379 if (CI.hasMetadata(LLVMContext::MD_nontemporal))
1381 Flags |= getTargetMMOFlags(CI);
1382
1383 if (const AMDGPU::RsrcIntrinsic *RsrcIntr =
1385 AttributeSet Attr =
1387 MemoryEffects ME = Attr.getMemoryEffects();
1388 if (ME.doesNotAccessMemory())
1389 return;
1390
1391 bool IsSPrefetch = IntrID == Intrinsic::amdgcn_s_buffer_prefetch_data;
1392 if (!IsSPrefetch) {
1393 auto *Aux = cast<ConstantInt>(CI.getArgOperand(CI.arg_size() - 1));
1394 if (Aux->getZExtValue() & AMDGPU::CPol::VOLATILE)
1396 }
1398
1399 IntrinsicInfo Info;
1400 // TODO: Should images get their own address space?
1402
1403 const AMDGPU::MIMGBaseOpcodeInfo *BaseOpcode = nullptr;
1404 if (RsrcIntr->IsImage) {
1405 const AMDGPU::ImageDimIntrinsicInfo *Intr =
1407 BaseOpcode = AMDGPU::getMIMGBaseOpcodeInfo(Intr->BaseOpcode);
1408 Info.align.reset();
1409 }
1410
1411 Value *RsrcArg = CI.getArgOperand(RsrcIntr->RsrcArg);
1412 if (auto *RsrcPtrTy = dyn_cast<PointerType>(RsrcArg->getType())) {
1413 if (RsrcPtrTy->getAddressSpace() == AMDGPUAS::BUFFER_RESOURCE)
1414 // We conservatively set the memory operand of a buffer intrinsic to the
1415 // base resource pointer, so that we can access alias information about
1416 // those pointers. Cases like "this points at the same value
1417 // but with a different offset" are handled in
1418 // areMemAccessesTriviallyDisjoint.
1419 Info.ptrVal = RsrcArg;
1420 }
1421
1422 if (ME.onlyReadsMemory()) {
1423 if (RsrcIntr->IsImage) {
1424 unsigned MaxNumLanes = 4;
1425
1426 if (!BaseOpcode->Gather4) {
1427 // If this isn't a gather, we may have excess loaded elements in the
1428 // IR type. Check the dmask for the real number of elements loaded.
1429 unsigned DMask =
1430 cast<ConstantInt>(CI.getArgOperand(0))->getZExtValue();
1431 MaxNumLanes = DMask == 0 ? 1 : llvm::popcount(DMask);
1432 }
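        // For example, a dmask of 0b0101 loads only 2 components even if the
        // IR return type is <4 x float>.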
1433
1434 Info.memVT = memVTFromLoadIntrReturn(*this, MF.getDataLayout(),
1435 CI.getType(), MaxNumLanes);
1436 } else {
1437 Info.memVT =
1439 std::numeric_limits<unsigned>::max());
1440 }
1441
1442 // FIXME: What does alignment mean for an image?
1443 Info.opc = ISD::INTRINSIC_W_CHAIN;
1444 Info.flags = Flags | MachineMemOperand::MOLoad;
1445 } else if (ME.onlyWritesMemory()) {
1446 Info.opc = ISD::INTRINSIC_VOID;
1447
1448 Type *DataTy = CI.getArgOperand(0)->getType();
1449 if (RsrcIntr->IsImage) {
1450 unsigned DMask = cast<ConstantInt>(CI.getArgOperand(1))->getZExtValue();
1451 unsigned DMaskLanes = DMask == 0 ? 1 : llvm::popcount(DMask);
1452 Info.memVT = memVTFromLoadIntrData(*this, MF.getDataLayout(), DataTy,
1453 DMaskLanes);
1454 } else
1455 Info.memVT = getValueType(MF.getDataLayout(), DataTy);
1456
1457 Info.flags = Flags | MachineMemOperand::MOStore;
1458 } else {
1459 // Atomic, NoReturn Sampler or prefetch
1460 Info.opc = CI.getType()->isVoidTy() ? ISD::INTRINSIC_VOID
1462
1463 switch (IntrID) {
1464 default:
1465 Info.flags = Flags | MachineMemOperand::MOLoad;
1466 if (!IsSPrefetch)
1467 Info.flags |= MachineMemOperand::MOStore;
1468
1469 if ((RsrcIntr->IsImage && BaseOpcode->NoReturn) || IsSPrefetch) {
1470 // Fake memory access type for no return sampler intrinsics
1471 Info.memVT = MVT::i32;
1472 } else {
1473 // XXX - Should this be volatile without known ordering?
1474 Info.flags |= MachineMemOperand::MOVolatile;
1475 Info.memVT = MVT::getVT(CI.getArgOperand(0)->getType());
1476 }
1477 break;
1478 case Intrinsic::amdgcn_raw_buffer_load_lds:
1479 case Intrinsic::amdgcn_raw_buffer_load_async_lds:
1480 case Intrinsic::amdgcn_raw_ptr_buffer_load_lds:
1481 case Intrinsic::amdgcn_raw_ptr_buffer_load_async_lds:
1482 case Intrinsic::amdgcn_struct_buffer_load_lds:
1483 case Intrinsic::amdgcn_struct_buffer_load_async_lds:
1484 case Intrinsic::amdgcn_struct_ptr_buffer_load_lds:
1485 case Intrinsic::amdgcn_struct_ptr_buffer_load_async_lds: {
1486 unsigned Width = cast<ConstantInt>(CI.getArgOperand(2))->getZExtValue();
1487
1488 // Entry 0: Load from buffer.
1489 // Don't set an offset, since the pointer value always represents the
1490 // base of the buffer.
1491 Info.memVT = EVT::getIntegerVT(CI.getContext(), Width * 8);
1492 Info.flags = Flags | MachineMemOperand::MOLoad;
1493 Infos.push_back(Info);
1494
1495 // Entry 1: Store to LDS.
1496 // The instruction offset is applied, and an additional per-lane offset is
1497 // added, which we simulate using a larger memory type.
1498 Info.memVT = EVT::getIntegerVT(
1499 CI.getContext(), Width * 8 * Subtarget->getWavefrontSize());
1500 Info.ptrVal = CI.getArgOperand(1); // LDS destination pointer
1501 Info.offset = cast<ConstantInt>(CI.getArgOperand(CI.arg_size() - 2))
1502 ->getZExtValue();
1503 Info.fallbackAddressSpace = AMDGPUAS::LOCAL_ADDRESS;
1504 Info.flags = Flags | MachineMemOperand::MOStore;
1505 Infos.push_back(Info);
1506 return;
1507 }
1508 case Intrinsic::amdgcn_raw_atomic_buffer_load:
1509 case Intrinsic::amdgcn_raw_ptr_atomic_buffer_load:
1510 case Intrinsic::amdgcn_struct_atomic_buffer_load:
1511 case Intrinsic::amdgcn_struct_ptr_atomic_buffer_load: {
1512 Info.memVT =
1514 std::numeric_limits<unsigned>::max());
1515 Info.flags = Flags | MachineMemOperand::MOLoad;
1516 Infos.push_back(Info);
1517 return;
1518 }
1519 }
1520 }
1521 Infos.push_back(Info);
1522 return;
1523 }
1524
1525 IntrinsicInfo Info;
1526 switch (IntrID) {
1527 case Intrinsic::amdgcn_ds_ordered_add:
1528 case Intrinsic::amdgcn_ds_ordered_swap: {
1529 Info.opc = ISD::INTRINSIC_W_CHAIN;
1530 Info.memVT = MVT::getVT(CI.getType());
1531 Info.ptrVal = CI.getOperand(0);
1532 Info.align.reset();
1534
1535 const ConstantInt *Vol = cast<ConstantInt>(CI.getOperand(4));
1536 if (!Vol->isZero())
1537 Info.flags |= MachineMemOperand::MOVolatile;
1538
1539 Infos.push_back(Info);
1540 return;
1541 }
1542 case Intrinsic::amdgcn_ds_add_gs_reg_rtn:
1543 case Intrinsic::amdgcn_ds_sub_gs_reg_rtn: {
1544 Info.opc = ISD::INTRINSIC_W_CHAIN;
1545 Info.memVT = MVT::getVT(CI.getOperand(0)->getType());
1546 Info.ptrVal = nullptr;
1547 Info.fallbackAddressSpace = AMDGPUAS::STREAMOUT_REGISTER;
1549 Infos.push_back(Info);
1550 return;
1551 }
1552 case Intrinsic::amdgcn_ds_append:
1553 case Intrinsic::amdgcn_ds_consume: {
1554 Info.opc = ISD::INTRINSIC_W_CHAIN;
1555 Info.memVT = MVT::getVT(CI.getType());
1556 Info.ptrVal = CI.getOperand(0);
1557 Info.align.reset();
1559
1560 const ConstantInt *Vol = cast<ConstantInt>(CI.getOperand(1));
1561 if (!Vol->isZero())
1562 Info.flags |= MachineMemOperand::MOVolatile;
1563
1564 Infos.push_back(Info);
1565 return;
1566 }
1567 case Intrinsic::amdgcn_ds_atomic_async_barrier_arrive_b64:
1568 case Intrinsic::amdgcn_ds_atomic_barrier_arrive_rtn_b64: {
1569 Info.opc = (IntrID == Intrinsic::amdgcn_ds_atomic_barrier_arrive_rtn_b64)
1572 Info.memVT = MVT::getVT(CI.getType());
1573 Info.ptrVal = CI.getOperand(0);
1574 Info.memVT = MVT::i64;
1575 Info.size = 8;
1576 Info.align.reset();
1578 Info.order = AtomicOrdering::Monotonic;
1579 Infos.push_back(Info);
1580 return;
1581 }
1582 case Intrinsic::amdgcn_image_bvh_dual_intersect_ray:
1583 case Intrinsic::amdgcn_image_bvh_intersect_ray:
1584 case Intrinsic::amdgcn_image_bvh8_intersect_ray: {
1585 Info.opc = ISD::INTRINSIC_W_CHAIN;
1586 Info.memVT =
1587 MVT::getVT(IntrID == Intrinsic::amdgcn_image_bvh_intersect_ray
1588 ? CI.getType()
1590 ->getElementType(0)); // XXX: what is correct VT?
1591
1592 Info.fallbackAddressSpace = AMDGPUAS::BUFFER_RESOURCE;
1593 Info.align.reset();
1594 Info.flags = Flags | MachineMemOperand::MOLoad |
1596 Infos.push_back(Info);
1597 return;
1598 }
1599 case Intrinsic::amdgcn_global_atomic_fmin_num:
1600 case Intrinsic::amdgcn_global_atomic_fmax_num:
1601 case Intrinsic::amdgcn_global_atomic_ordered_add_b64:
1602 case Intrinsic::amdgcn_flat_atomic_fmin_num:
1603 case Intrinsic::amdgcn_flat_atomic_fmax_num: {
1604 Info.opc = ISD::INTRINSIC_W_CHAIN;
1605 Info.memVT = MVT::getVT(CI.getType());
1606 Info.ptrVal = CI.getOperand(0);
1607 Info.align.reset();
1608 Info.flags =
1611 Infos.push_back(Info);
1612 return;
1613 }
1614 case Intrinsic::amdgcn_cluster_load_b32:
1615 case Intrinsic::amdgcn_cluster_load_b64:
1616 case Intrinsic::amdgcn_cluster_load_b128:
1617 case Intrinsic::amdgcn_ds_load_tr6_b96:
1618 case Intrinsic::amdgcn_ds_load_tr4_b64:
1619 case Intrinsic::amdgcn_ds_load_tr8_b64:
1620 case Intrinsic::amdgcn_ds_load_tr16_b128:
1621 case Intrinsic::amdgcn_global_load_tr6_b96:
1622 case Intrinsic::amdgcn_global_load_tr4_b64:
1623 case Intrinsic::amdgcn_global_load_tr_b64:
1624 case Intrinsic::amdgcn_global_load_tr_b128:
1625 case Intrinsic::amdgcn_ds_read_tr4_b64:
1626 case Intrinsic::amdgcn_ds_read_tr6_b96:
1627 case Intrinsic::amdgcn_ds_read_tr8_b64:
1628 case Intrinsic::amdgcn_ds_read_tr16_b64: {
1629 Info.opc = ISD::INTRINSIC_W_CHAIN;
1630 Info.memVT = MVT::getVT(CI.getType());
1631 Info.ptrVal = CI.getOperand(0);
1632 Info.align.reset();
1633 Info.flags = Flags | MachineMemOperand::MOLoad;
1634 Infos.push_back(Info);
1635 return;
1636 }
1637 case Intrinsic::amdgcn_flat_load_monitor_b32:
1638 case Intrinsic::amdgcn_flat_load_monitor_b64:
1639 case Intrinsic::amdgcn_flat_load_monitor_b128:
1640 case Intrinsic::amdgcn_global_load_monitor_b32:
1641 case Intrinsic::amdgcn_global_load_monitor_b64:
1642 case Intrinsic::amdgcn_global_load_monitor_b128: {
1643 Info.opc = ISD::INTRINSIC_W_CHAIN;
1644 Info.memVT = EVT::getIntegerVT(CI.getContext(), getIntrMemWidth(IntrID));
1645 Info.ptrVal = CI.getOperand(0);
1646 Info.align.reset();
1647 Info.flags = MachineMemOperand::MOLoad;
1648 Info.order = parseAtomicOrderingCABIArg(CI, 1);
1649 Info.ssid = parseSyncscopeMDArg(CI, 2);
1650 Infos.push_back(Info);
1651 return;
1652 }
1653 case Intrinsic::amdgcn_cooperative_atomic_load_32x4B:
1654 case Intrinsic::amdgcn_cooperative_atomic_load_16x8B:
1655 case Intrinsic::amdgcn_cooperative_atomic_load_8x16B: {
1656 Info.opc = ISD::INTRINSIC_W_CHAIN;
1657 Info.memVT = EVT::getIntegerVT(CI.getContext(), getIntrMemWidth(IntrID));
1658 Info.ptrVal = CI.getOperand(0);
1659 Info.align.reset();
1661 Info.order = parseAtomicOrderingCABIArg(CI, 1);
1662 Info.ssid = parseSyncscopeMDArg(CI, 2);
1663 Infos.push_back(Info);
1664 return;
1665 }
1666 case Intrinsic::amdgcn_cooperative_atomic_store_32x4B:
1667 case Intrinsic::amdgcn_cooperative_atomic_store_16x8B:
1668 case Intrinsic::amdgcn_cooperative_atomic_store_8x16B: {
1669 Info.opc = ISD::INTRINSIC_VOID;
1670 Info.memVT = EVT::getIntegerVT(CI.getContext(), getIntrMemWidth(IntrID));
1671 Info.ptrVal = CI.getArgOperand(0);
1672 Info.align.reset();
1674 Info.order = parseAtomicOrderingCABIArg(CI, 2);
1675 Info.ssid = parseSyncscopeMDArg(CI, 3);
1676 Infos.push_back(Info);
1677 return;
1678 }
1679 case Intrinsic::amdgcn_ds_gws_init:
1680 case Intrinsic::amdgcn_ds_gws_barrier:
1681 case Intrinsic::amdgcn_ds_gws_sema_v:
1682 case Intrinsic::amdgcn_ds_gws_sema_br:
1683 case Intrinsic::amdgcn_ds_gws_sema_p:
1684 case Intrinsic::amdgcn_ds_gws_sema_release_all: {
1685 Info.opc = ISD::INTRINSIC_VOID;
1686
1687 const GCNTargetMachine &TM =
1688 static_cast<const GCNTargetMachine &>(getTargetMachine());
1689
1691 Info.ptrVal = MFI->getGWSPSV(TM);
1692
1693 // This is an abstract access, but we need to specify a type and size.
1694 Info.memVT = MVT::i32;
1695 Info.size = 4;
1696 Info.align = Align(4);
1697
1698 if (IntrID == Intrinsic::amdgcn_ds_gws_barrier)
1699 Info.flags = Flags | MachineMemOperand::MOLoad;
1700 else
1701 Info.flags = Flags | MachineMemOperand::MOStore;
1702 Infos.push_back(Info);
1703 return;
1704 }
1705 case Intrinsic::amdgcn_global_load_async_to_lds_b8:
1706 case Intrinsic::amdgcn_global_load_async_to_lds_b32:
1707 case Intrinsic::amdgcn_global_load_async_to_lds_b64:
1708 case Intrinsic::amdgcn_global_load_async_to_lds_b128:
1709 case Intrinsic::amdgcn_cluster_load_async_to_lds_b8:
1710 case Intrinsic::amdgcn_cluster_load_async_to_lds_b32:
1711 case Intrinsic::amdgcn_cluster_load_async_to_lds_b64:
1712 case Intrinsic::amdgcn_cluster_load_async_to_lds_b128: {
1713 // Entry 0: Load from source (global/flat).
1714 Info.opc = ISD::INTRINSIC_VOID;
1715 Info.memVT = EVT::getIntegerVT(CI.getContext(), getIntrMemWidth(IntrID));
1716 Info.ptrVal = CI.getArgOperand(0); // Global pointer
1717 Info.offset = cast<ConstantInt>(CI.getArgOperand(2))->getSExtValue();
1718 Info.flags = Flags | MachineMemOperand::MOLoad;
1719 Infos.push_back(Info);
1720
1721 // Entry 1: Store to LDS (same offset).
1722 Info.flags = Flags | MachineMemOperand::MOStore;
1723 Info.ptrVal = CI.getArgOperand(1); // LDS pointer
1724 Infos.push_back(Info);
1725 return;
1726 }
1727 case Intrinsic::amdgcn_global_store_async_from_lds_b8:
1728 case Intrinsic::amdgcn_global_store_async_from_lds_b32:
1729 case Intrinsic::amdgcn_global_store_async_from_lds_b64:
1730 case Intrinsic::amdgcn_global_store_async_from_lds_b128: {
1731 // Entry 0: Load from LDS.
1732 Info.opc = ISD::INTRINSIC_VOID;
1733 Info.memVT = EVT::getIntegerVT(CI.getContext(), getIntrMemWidth(IntrID));
1734 Info.ptrVal = CI.getArgOperand(1); // LDS pointer
1735 Info.offset = cast<ConstantInt>(CI.getArgOperand(2))->getSExtValue();
1736 Info.flags = Flags | MachineMemOperand::MOLoad;
1737 Infos.push_back(Info);
1738
1739 // Entry 1: Store to global (same offset).
1740 Info.flags = Flags | MachineMemOperand::MOStore;
1741 Info.ptrVal = CI.getArgOperand(0); // Global pointer
1742 Infos.push_back(Info);
1743 return;
1744 }
1745 case Intrinsic::amdgcn_load_to_lds:
1746 case Intrinsic::amdgcn_load_async_to_lds:
1747 case Intrinsic::amdgcn_global_load_lds:
1748 case Intrinsic::amdgcn_global_load_async_lds: {
1749 unsigned Width = cast<ConstantInt>(CI.getArgOperand(2))->getZExtValue();
1750 auto *Aux = cast<ConstantInt>(CI.getArgOperand(CI.arg_size() - 1));
1751 bool IsVolatile = Aux->getZExtValue() & AMDGPU::CPol::VOLATILE;
1752 if (IsVolatile)
1754
1755 // Entry 0: Load from source (global/flat).
1756 Info.opc = ISD::INTRINSIC_VOID;
1757 Info.memVT = EVT::getIntegerVT(CI.getContext(), Width * 8);
1758 Info.ptrVal = CI.getArgOperand(0); // Source pointer
1759 Info.offset = cast<ConstantInt>(CI.getArgOperand(3))->getSExtValue();
1760 Info.flags = Flags | MachineMemOperand::MOLoad;
1761 Infos.push_back(Info);
1762
1763 // Entry 1: Store to LDS.
1764 // Same offset from the instruction, but an additional per-lane offset is
1765 // added. Represent that using a wider memory type.
1766 Info.memVT = EVT::getIntegerVT(CI.getContext(),
1767 Width * 8 * Subtarget->getWavefrontSize());
1768 Info.ptrVal = CI.getArgOperand(1); // LDS destination pointer
1769 Info.flags = Flags | MachineMemOperand::MOStore;
1770 Infos.push_back(Info);
1771 return;
1772 }
1773 case Intrinsic::amdgcn_ds_bvh_stack_rtn:
1774 case Intrinsic::amdgcn_ds_bvh_stack_push4_pop1_rtn:
1775 case Intrinsic::amdgcn_ds_bvh_stack_push8_pop1_rtn:
1776 case Intrinsic::amdgcn_ds_bvh_stack_push8_pop2_rtn: {
1777 Info.opc = ISD::INTRINSIC_W_CHAIN;
1778
1779 const GCNTargetMachine &TM =
1780 static_cast<const GCNTargetMachine &>(getTargetMachine());
1781
1783 Info.ptrVal = MFI->getGWSPSV(TM);
1784
1785 // This is an abstract access, but we need to specify a type and size.
1786 Info.memVT = MVT::i32;
1787 Info.size = 4;
1788 Info.align = Align(4);
1789
1791 Infos.push_back(Info);
1792 return;
1793 }
1794 case Intrinsic::amdgcn_s_prefetch_data:
1795 case Intrinsic::amdgcn_flat_prefetch:
1796 case Intrinsic::amdgcn_global_prefetch: {
1797 Info.opc = ISD::INTRINSIC_VOID;
1798 Info.memVT = EVT::getIntegerVT(CI.getContext(), 8);
1799 Info.ptrVal = CI.getArgOperand(0);
1800 Info.flags = Flags | MachineMemOperand::MOLoad;
1801 Infos.push_back(Info);
1802 return;
1803 }
1804 default:
1805 return;
1806 }
1807}
1808
1810 const CallInst &I, SmallVectorImpl<SDValue> &Ops, SelectionDAG &DAG) const {
1812 case Intrinsic::amdgcn_addrspacecast_nonnull: {
1813 // The DAG's ValueType loses the addrspaces.
1814 // Add them as 2 extra Constant operands "from" and "to".
1815 unsigned SrcAS = I.getOperand(0)->getType()->getPointerAddressSpace();
1816 unsigned DstAS = I.getType()->getPointerAddressSpace();
1817 Ops.push_back(DAG.getTargetConstant(SrcAS, SDLoc(), MVT::i32));
1818 Ops.push_back(DAG.getTargetConstant(DstAS, SDLoc(), MVT::i32));
1819 break;
1820 }
1821 default:
1822 break;
1823 }
1824}
1825
1828 Type *&AccessTy) const {
1829 Value *Ptr = nullptr;
1830 switch (II->getIntrinsicID()) {
1831 case Intrinsic::amdgcn_cluster_load_b128:
1832 case Intrinsic::amdgcn_cluster_load_b64:
1833 case Intrinsic::amdgcn_cluster_load_b32:
1834 case Intrinsic::amdgcn_ds_append:
1835 case Intrinsic::amdgcn_ds_consume:
1836 case Intrinsic::amdgcn_ds_load_tr8_b64:
1837 case Intrinsic::amdgcn_ds_load_tr16_b128:
1838 case Intrinsic::amdgcn_ds_load_tr4_b64:
1839 case Intrinsic::amdgcn_ds_load_tr6_b96:
1840 case Intrinsic::amdgcn_ds_read_tr4_b64:
1841 case Intrinsic::amdgcn_ds_read_tr6_b96:
1842 case Intrinsic::amdgcn_ds_read_tr8_b64:
1843 case Intrinsic::amdgcn_ds_read_tr16_b64:
1844 case Intrinsic::amdgcn_ds_ordered_add:
1845 case Intrinsic::amdgcn_ds_ordered_swap:
1846 case Intrinsic::amdgcn_ds_atomic_async_barrier_arrive_b64:
1847 case Intrinsic::amdgcn_ds_atomic_barrier_arrive_rtn_b64:
1848 case Intrinsic::amdgcn_flat_atomic_fmax_num:
1849 case Intrinsic::amdgcn_flat_atomic_fmin_num:
1850 case Intrinsic::amdgcn_global_atomic_fmax_num:
1851 case Intrinsic::amdgcn_global_atomic_fmin_num:
1852 case Intrinsic::amdgcn_global_atomic_ordered_add_b64:
1853 case Intrinsic::amdgcn_global_load_tr_b64:
1854 case Intrinsic::amdgcn_global_load_tr_b128:
1855 case Intrinsic::amdgcn_global_load_tr4_b64:
1856 case Intrinsic::amdgcn_global_load_tr6_b96:
1857 case Intrinsic::amdgcn_global_store_async_from_lds_b8:
1858 case Intrinsic::amdgcn_global_store_async_from_lds_b32:
1859 case Intrinsic::amdgcn_global_store_async_from_lds_b64:
1860 case Intrinsic::amdgcn_global_store_async_from_lds_b128:
1861 Ptr = II->getArgOperand(0);
1862 break;
1863 case Intrinsic::amdgcn_load_to_lds:
1864 case Intrinsic::amdgcn_load_async_to_lds:
1865 case Intrinsic::amdgcn_global_load_lds:
1866 case Intrinsic::amdgcn_global_load_async_lds:
1867 case Intrinsic::amdgcn_global_load_async_to_lds_b8:
1868 case Intrinsic::amdgcn_global_load_async_to_lds_b32:
1869 case Intrinsic::amdgcn_global_load_async_to_lds_b64:
1870 case Intrinsic::amdgcn_global_load_async_to_lds_b128:
1871 case Intrinsic::amdgcn_cluster_load_async_to_lds_b8:
1872 case Intrinsic::amdgcn_cluster_load_async_to_lds_b32:
1873 case Intrinsic::amdgcn_cluster_load_async_to_lds_b64:
1874 case Intrinsic::amdgcn_cluster_load_async_to_lds_b128:
1875 Ptr = II->getArgOperand(1);
1876 break;
1877 default:
1878 return false;
1879 }
1880 AccessTy = II->getType();
1881 Ops.push_back(Ptr);
1882 return true;
1883}
1884
1886 unsigned AddrSpace) const {
1887 if (!Subtarget->hasFlatInstOffsets()) {
1888 // Flat instructions do not have offsets, and only have the register
1889 // address.
1890 return AM.BaseOffs == 0 && AM.Scale == 0;
1891 }
1892
1893 decltype(SIInstrFlags::FLAT) FlatVariant =
1897
1898 return AM.Scale == 0 &&
1899 (AM.BaseOffs == 0 || Subtarget->getInstrInfo()->isLegalFLATOffset(
1900 AM.BaseOffs, AddrSpace, FlatVariant));
1901}
1902
1904 if (Subtarget->hasFlatGlobalInsts())
1906
1907 if (!Subtarget->hasAddr64() || Subtarget->useFlatForGlobal()) {
1908 // Assume we will use FLAT for all global memory accesses
1909 // on VI.
1910 // FIXME: This assumption is currently wrong. On VI we still use
1911 // MUBUF instructions for the r + i addressing mode. As currently
1912 // implemented, the MUBUF instructions only work on buffers < 4GB.
1913 // It may be possible to support > 4GB buffers with MUBUF instructions,
1914 // by setting the stride value in the resource descriptor which would
1915 // increase the size limit to (stride * 4GB). However, this is risky,
1916 // because it has never been validated.
1918 }
1919
1920 return isLegalMUBUFAddressingMode(AM);
1921}
1922
1923bool SITargetLowering::isLegalMUBUFAddressingMode(const AddrMode &AM) const {
1924 // MUBUF / MTBUF instructions have a 12-bit unsigned byte offset, and
1925 // additionally can do r + r + i with addr64. 32-bit has more addressing
1926 // mode options. Depending on the resource constant, it can also do
1927 // (i64 r0) + (i32 r1) * (i14 i).
1928 //
1929 // Private arrays end up using a scratch buffer most of the time, so also
1930 // assume those use MUBUF instructions. Scratch loads / stores are currently
1931 // implemented as mubuf instructions with offen bit set, so they are slightly
1932 // different from the normal addr64.
1933 const SIInstrInfo *TII = Subtarget->getInstrInfo();
1934 if (!TII->isLegalMUBUFImmOffset(AM.BaseOffs))
1935 return false;
1936
1937 // FIXME: Since we can split immediate into soffset and immediate offset,
1938 // would it make sense to allow any immediate?
1939
1940 switch (AM.Scale) {
1941 case 0: // r + i or just i, depending on HasBaseReg.
1942 return true;
1943 case 1:
1944 return true; // We have r + r or r + i.
1945 case 2:
1946 if (AM.HasBaseReg) {
1947 // Reject 2 * r + r.
1948 return false;
1949 }
1950
1951 // Allow 2 * r as r + r
1952 // Or 2 * r + i is allowed as r + r + i.
1953 return true;
1954 default: // Don't allow n * r
1955 return false;
1956 }
1957}
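// Illustrative examples of the cases above: {BaseOffs=16, Scale=0} is r + i
// (or just i) and is accepted; {Scale=1} is r + r or r + i and is accepted;
// {Scale=2, HasBaseReg=true} would be 2 * r + r and is rejected, while
// {Scale=2, HasBaseReg=false} is accepted because 2 * r folds into r + r.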
1958
1959bool SITargetLowering::isLegalAddressingMode(const DataLayout &DL,
1960 const AddrMode &AM, Type *Ty,
1961 unsigned AS,
1962 Instruction *I) const {
1963 // No global is ever allowed as a base.
1964 if (AM.BaseGV)
1965 return false;
1966
1967 if (AS == AMDGPUAS::GLOBAL_ADDRESS)
1968 return isLegalGlobalAddressingMode(AM);
1969
1970 if (AS == AMDGPUAS::CONSTANT_ADDRESS ||
1971 AS == AMDGPUAS::CONSTANT_ADDRESS_32BIT ||
1972 AS == AMDGPUAS::BUFFER_FAT_POINTER || AS == AMDGPUAS::BUFFER_RESOURCE ||
1973 AS == AMDGPUAS::BUFFER_STRIDED_POINTER) {
1974 // If the offset isn't a multiple of 4, it probably isn't going to be
1975 // correctly aligned.
1976 // FIXME: Can we get the real alignment here?
1977 if (AM.BaseOffs % 4 != 0)
1978 return isLegalMUBUFAddressingMode(AM);
1979
1980 if (!Subtarget->hasScalarSubwordLoads()) {
1981 // There are no SMRD extloads, so if we have to do a small type access we
1982 // will use a MUBUF load.
1983 // FIXME?: We also need to do this if unaligned, but we don't know the
1984 // alignment here.
1985 if (Ty->isSized() && DL.getTypeStoreSize(Ty) < 4)
1986 return isLegalGlobalAddressingMode(AM);
1987 }
1988
1989 if (Subtarget->getGeneration() == AMDGPUSubtarget::SOUTHERN_ISLANDS) {
1990 // SMRD instructions have an 8-bit, dword offset on SI.
1991 if (!isUInt<8>(AM.BaseOffs / 4))
1992 return false;
1993 } else if (Subtarget->getGeneration() == AMDGPUSubtarget::SEA_ISLANDS) {
1994 // On CI+, this can also be a 32-bit literal constant offset. If it fits
1995 // in 8-bits, it can use a smaller encoding.
1996 if (!isUInt<32>(AM.BaseOffs / 4))
1997 return false;
1998 } else if (Subtarget->getGeneration() < AMDGPUSubtarget::GFX9) {
1999 // On VI, these use the SMEM format and the offset is 20-bit in bytes.
2000 if (!isUInt<20>(AM.BaseOffs))
2001 return false;
2002 } else if (Subtarget->getGeneration() < AMDGPUSubtarget::GFX12) {
2003 // On GFX9 the offset is signed 21-bit in bytes (but must not be negative
2004 // for S_BUFFER_* instructions).
2005 if (!isInt<21>(AM.BaseOffs))
2006 return false;
2007 } else {
2008 // On GFX12, all offsets are signed 24-bit in bytes.
2009 if (!isInt<24>(AM.BaseOffs))
2010 return false;
2011 }
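// In summary, the SMRD immediate offset checked above is: an unsigned 8-bit
// dword offset on SI, an unsigned 32-bit dword offset on CI, an unsigned
// 20-bit byte offset on VI, a signed 21-bit byte offset on GFX9-GFX11, and a
// signed 24-bit byte offset on GFX12.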
2012
2013 if ((AS == AMDGPUAS::CONSTANT_ADDRESS ||
2014 AS == AMDGPUAS::CONSTANT_ADDRESS_32BIT) &&
2015 AM.BaseOffs < 0) {
2016 // Scalar (non-buffer) loads can only use a negative offset if
2017 // soffset+offset is non-negative. Since the compiler can only prove that
2018 // in a few special cases, it is safer to claim that negative offsets are
2019 // not supported.
2020 return false;
2021 }
2022
2023 if (AM.Scale == 0) // r + i or just i, depending on HasBaseReg.
2024 return true;
2025
2026 if (AM.Scale == 1 && AM.HasBaseReg)
2027 return true;
2028
2029 return false;
2030 }
2031
2032 if (AS == AMDGPUAS::PRIVATE_ADDRESS)
2033 return Subtarget->hasFlatScratchEnabled()
2034 ? isLegalFlatAddressingMode(AM, AMDGPUAS::PRIVATE_ADDRESS)
2035 : isLegalMUBUFAddressingMode(AM);
2036
2037 if (AS == AMDGPUAS::LOCAL_ADDRESS ||
2038 (AS == AMDGPUAS::REGION_ADDRESS && Subtarget->hasGDS())) {
2039 // Basic, single offset DS instructions allow a 16-bit unsigned immediate
2040 // field.
2041 // XXX - If doing a 4-byte aligned 8-byte type access, we effectively have
2042 // an 8-bit dword offset but we don't know the alignment here.
2043 if (!isUInt<16>(AM.BaseOffs))
2044 return false;
2045
2046 if (AM.Scale == 0) // r + i or just i, depending on HasBaseReg.
2047 return true;
2048
2049 if (AM.Scale == 1 && AM.HasBaseReg)
2050 return true;
2051
2052 return false;
2053 }
2054
2055 if (AS == AMDGPUAS::FLAT_ADDRESS || AS == AMDGPUAS::UNKNOWN_ADDRESS_SPACE) {
2056 // For an unknown address space, this usually means that this is for some
2057 // reason being used for pure arithmetic, and not based on some addressing
2058 // computation. We don't have instructions that compute pointers with any
2059 // addressing modes, so treat them as having no offset like flat
2060 // instructions.
2061 return isLegalFlatAddressingMode(AM, AMDGPUAS::FLAT_ADDRESS);
2062 }
2063
2064 // Assume a user alias of global for unknown address spaces.
2065 return isLegalGlobalAddressingMode(AM);
2066}
2067
2068bool SITargetLowering::canMergeStoresTo(unsigned AS, EVT MemVT,
2069 const MachineFunction &MF) const {
2070 if (AS == AMDGPUAS::GLOBAL_ADDRESS || AS == AMDGPUAS::FLAT_ADDRESS)
2071 return (MemVT.getSizeInBits() <= 4 * 32);
2072 if (AS == AMDGPUAS::PRIVATE_ADDRESS) {
2073 unsigned MaxPrivateBits = 8 * getSubtarget()->getMaxPrivateElementSize();
2074 return (MemVT.getSizeInBits() <= MaxPrivateBits);
2075 }
2076 if (AS == AMDGPUAS::LOCAL_ADDRESS || AS == AMDGPUAS::REGION_ADDRESS)
2077 return (MemVT.getSizeInBits() <= 2 * 32);
2078 return true;
2079}
2080
2081bool SITargetLowering::allowsMisalignedMemoryAccessesImpl(
2082 unsigned Size, unsigned AddrSpace, Align Alignment,
2083 MachineMemOperand::Flags Flags, unsigned *IsFast) const {
2084 if (IsFast)
2085 *IsFast = 0;
2086
2087 if (AddrSpace == AMDGPUAS::LOCAL_ADDRESS ||
2088 AddrSpace == AMDGPUAS::REGION_ADDRESS) {
2089 // Check if alignment requirements for ds_read/write instructions are
2090 // disabled.
2091 if (!Subtarget->hasUnalignedDSAccessEnabled() && Alignment < Align(4))
2092 return false;
2093
2094 Align RequiredAlignment(
2095 PowerOf2Ceil(divideCeil(Size, 8))); // Natural alignment.
2096 if (Subtarget->hasLDSMisalignedBugInWGPMode() && Size > 32 &&
2097 Alignment < RequiredAlignment)
2098 return false;
2099
2100 // Either the alignment requirements are "enabled", or there is an
2101 // unaligned LDS access related hardware bug even though alignment
2102 // requirements are "disabled". In either case, we need to check for proper
2103 // alignment requirements.
2104 //
2105 switch (Size) {
2106 case 64:
2107 // SI has a hardware bug in the LDS / GDS bounds checking: if the base
2108 // address is negative, then the instruction is incorrectly treated as
2109 // out-of-bounds even if base + offsets is in bounds. Split vectorized
2110 // loads here to avoid emitting ds_read2_b32. We may re-combine the
2111 // load later in the SILoadStoreOptimizer.
2112 if (!Subtarget->hasUsableDSOffset() && Alignment < Align(8))
2113 return false;
2114
2115 // 8-byte accesses via ds_read/write_b64 require 8-byte alignment, but we
2116 // can do a 4-byte aligned, 8-byte access in a single operation using
2117 // ds_read2/write2_b32 with adjacent offsets.
2118 RequiredAlignment = Align(4);
2119
2120 if (Subtarget->hasUnalignedDSAccessEnabled()) {
2121 // We will either select ds_read_b64/ds_write_b64 or ds_read2_b32/
2122 // ds_write2_b32 depending on the alignment. In either case with either
2123 // alignment there is no faster way of doing this.
2124
2125 // The numbers returned here and below are not additive, it is a 'speed
2126 // rank'. They are just meant to be compared to decide if a certain way
2127 // of lowering an operation is faster than another. For that purpose a
2128 // naturally aligned operation gets its bitsize to indicate that "it
2129 // operates with a speed comparable to an N-bit wide load". With the full
2130 // alignment ds128 is slower than ds96 for example. If underaligned it
2131 // is comparable to the speed of a single dword access, which would then
2132 // mean 32 < 128 and it is faster to issue a wide load regardless.
2133 // 1 is simply "slow, don't do it". I.e. when comparing an aligned load to
2134 // a wider load which will not be aligned anymore, the latter is slower.
2135 if (IsFast)
2136 *IsFast = (Alignment >= RequiredAlignment) ? 64
2137 : (Alignment < Align(4)) ? 32
2138 : 1;
2139 return true;
2140 }
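// For example, a 64-bit LDS access that is at least 4-byte aligned reports a
// speed rank of 64 (ds_read_b64 or an equivalent ds_read2_b32 pair), while
// one aligned below 4 bytes reports 32, i.e. single-dword speed.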
2141
2142 break;
2143 case 96:
2144 if (!Subtarget->hasDS96AndDS128())
2145 return false;
2146
2147 // 12-byte accesses via ds_read/write_b96 require 16-byte alignment on
2148 // gfx8 and older.
2149
2150 if (Subtarget->hasUnalignedDSAccessEnabled()) {
2151 // Naturally aligned access is fastest. However, also report it is Fast
2152 // if memory is aligned to less than a dword. A narrow load or store will
2153 // be as slow as a single ds_read_b96/ds_write_b96, but there will
2154 // be more of them, so overall we will pay less penalty issuing a single
2155 // instruction.
2156
2157 // See comment on the values above.
2158 if (IsFast)
2159 *IsFast = (Alignment >= RequiredAlignment) ? 96
2160 : (Alignment < Align(4)) ? 32
2161 : 1;
2162 return true;
2163 }
2164
2165 break;
2166 case 128:
2167 if (!Subtarget->hasDS96AndDS128() || !Subtarget->useDS128())
2168 return false;
2169
2170 // 16-byte accesses via ds_read/write_b128 require 16-byte alignment on
2171 // gfx8 and older, but we can do an 8-byte aligned, 16-byte access in a
2172 // single operation using ds_read2/write2_b64.
2173 RequiredAlignment = Align(8);
2174
2175 if (Subtarget->hasUnalignedDSAccessEnabled()) {
2176 // Naturally aligned access is fastest. However, also report it is Fast
2177 // if memory is aligned to less than a dword. A narrow load or store will
2178 // be as slow as a single ds_read_b128/ds_write_b128, but there
2179 // will be more of them, so overall we will pay less penalty issuing a
2180 // single instruction.
2181
2182 // See comment on the values above.
2183 if (IsFast)
2184 *IsFast = (Alignment >= RequiredAlignment) ? 128
2185 : (Alignment < Align(4)) ? 32
2186 : 1;
2187 return true;
2188 }
2189
2190 break;
2191 default:
2192 if (Size > 32)
2193 return false;
2194
2195 break;
2196 }
2197
2198 // See comment on the values above.
2199 // Note that we have a single-dword or sub-dword here, so if underaligned
2200 // it is a slowest possible access, hence returned value is 0.
2201 if (IsFast)
2202 *IsFast = (Alignment >= RequiredAlignment) ? Size : 0;
2203
2204 return Alignment >= RequiredAlignment ||
2205 Subtarget->hasUnalignedDSAccessEnabled();
2206 }
2207
2208 // FIXME: We have to be conservative here and assume that flat operations
2209 // will access scratch. If we had access to the IR function, then we
2210 // could determine if any private memory was used in the function.
2211 if (AddrSpace == AMDGPUAS::PRIVATE_ADDRESS ||
2212 AddrSpace == AMDGPUAS::FLAT_ADDRESS) {
2213 bool AlignedBy4 = Alignment >= Align(4);
2214 if (Subtarget->hasUnalignedScratchAccessEnabled()) {
2215 if (IsFast)
2216 *IsFast = AlignedBy4 ? Size : 1;
2217 return true;
2218 }
2219
2220 if (IsFast)
2221 *IsFast = AlignedBy4;
2222
2223 return AlignedBy4;
2224 }
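// E.g. with unaligned scratch access enabled, a 4-byte aligned flat or
// private access reports a speed rank equal to its size and an under-aligned
// one reports 1; without it, under-aligned accesses are rejected outright.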
2225
2226 // So long as they are correct, wide global memory operations perform better
2227 // than multiple smaller memory ops -- even when misaligned
2228 if (AMDGPU::isExtendedGlobalAddrSpace(AddrSpace)) {
2229 if (IsFast)
2230 *IsFast = Size;
2231
2232 return Alignment >= Align(4) ||
2233 Subtarget->hasUnalignedBufferAccessEnabled();
2234 }
2235
2236 // Ensure robust out-of-bounds guarantees for buffer accesses are met if
2237 // RelaxedBufferOOBMode is disabled. Normally hardware will ensure proper
2238 // out-of-bounds behavior, but in the edge case where an access starts
2239 // out-of-bounds and then enters in-bounds, the entire access would be treated
2240 // as out-of-bounds. Prevent misaligned memory accesses by requiring the
2241 // natural alignment of buffer accesses.
2242 if (AddrSpace == AMDGPUAS::BUFFER_FAT_POINTER ||
2243 AddrSpace == AMDGPUAS::BUFFER_RESOURCE ||
2244 AddrSpace == AMDGPUAS::BUFFER_STRIDED_POINTER) {
2245 if (!Subtarget->hasRelaxedBufferOOBMode() &&
2246 Alignment < Align(PowerOf2Ceil(divideCeil(Size, 8))))
2247 return false;
2248 }
2249
2250 // Smaller than dword value must be aligned.
2251 if (Size < 32)
2252 return false;
2253
2254 // 8.1.6 - For Dword or larger reads or writes, the two LSBs of the
2255 // byte-address are ignored, thus forcing Dword alignment.
2256 // This applies to private, global, and constant memory.
2257 if (IsFast)
2258 *IsFast = 1;
2259
2260 return Size >= 32 && Alignment >= Align(4);
2261}
2262
2263bool SITargetLowering::allowsMisalignedMemoryAccesses(
2264 EVT VT, unsigned AddrSpace, Align Alignment, MachineMemOperand::Flags Flags,
2265 unsigned *IsFast) const {
2266 return allowsMisalignedMemoryAccessesImpl(VT.getSizeInBits(), AddrSpace,
2267 Alignment, Flags, IsFast);
2268}
2269
2270EVT SITargetLowering::getOptimalMemOpType(
2271 LLVMContext &Context, const MemOp &Op,
2272 const AttributeList &FuncAttributes) const {
2273 // FIXME: Should account for address space here.
2274
2275 // The default fallback uses the private pointer size as a guess for a type to
2276 // use. Make sure we switch these to 64-bit accesses.
2277
2278 if (Op.size() >= 16 &&
2279 Op.isDstAligned(Align(4))) // XXX: Should only do for global
2280 return MVT::v4i32;
2281
2282 if (Op.size() >= 8 && Op.isDstAligned(Align(4)))
2283 return MVT::v2i32;
2284
2285 // Use the default.
2286 return MVT::Other;
2287}
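// For example, a 32-byte memcpy whose destination is at least 4-byte aligned
// is widened to v4i32 accesses here, and an 8-byte copy with the same
// alignment uses v2i32, instead of the default private-pointer-sized type.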
2288
2289bool SITargetLowering::isMemOpHasNoClobberedMemOperand(const SDNode *N) const {
2290 const MemSDNode *MemNode = cast<MemSDNode>(N);
2291 return MemNode->getMemOperand()->getFlags() & MONoClobber;
2292}
2293
2298
2299bool SITargetLowering::isFreeAddrSpaceCast(unsigned SrcAS,
2300 unsigned DestAS) const {
2301 if (SrcAS == AMDGPUAS::FLAT_ADDRESS) {
2302 if (DestAS == AMDGPUAS::PRIVATE_ADDRESS &&
2303 Subtarget->hasGloballyAddressableScratch()) {
2304 // Flat -> private requires subtracting src_flat_scratch_base_lo.
2305 return false;
2306 }
2307
2308 // Flat -> private/local is a simple truncate.
2309 // Flat -> global is no-op
2310 return true;
2311 }
2312
2313 const GCNTargetMachine &TM =
2314 static_cast<const GCNTargetMachine &>(getTargetMachine());
2315 return TM.isNoopAddrSpaceCast(SrcAS, DestAS);
2316}
2317
2325
2326bool SITargetLowering::shouldConvertConstantLoadToIntImm(const APInt &Imm,
2327 Type *Ty) const {
2328 // FIXME: Could be smarter if called for vector constants.
2329 return true;
2330}
2331
2332bool SITargetLowering::isExtractSubvectorCheap(EVT ResVT, EVT SrcVT,
2333 unsigned Index) const {
2334 if (!isOperationLegalOrCustom(ISD::EXTRACT_SUBVECTOR, ResVT))
2335 return false;
2336
2337 // TODO: Add more cases that are cheap.
2338 return Index == 0;
2339}
2340
2341bool SITargetLowering::isExtractVecEltCheap(EVT VT, unsigned Index) const {
2342 // TODO: This should be more aggressive, particular for 16-bit element
2343 // vectors. However there are some mixed improvements and regressions.
2344 EVT EltTy = VT.getVectorElementType();
2345 unsigned MinAlign = Subtarget->useRealTrue16Insts() ? 16 : 32;
2346 return EltTy.getSizeInBits() % MinAlign == 0;
2347}
2348
2349bool SITargetLowering::isTypeDesirableForOp(unsigned Op, EVT VT) const {
2350 if (Subtarget->has16BitInsts() && VT == MVT::i16) {
2351 switch (Op) {
2352 case ISD::LOAD:
2353 case ISD::STORE:
2354 return true;
2355 default:
2356 return false;
2357 }
2358 }
2359
2360 // SimplifySetCC uses this function to determine whether or not it should
2361 // create setcc with i1 operands. We don't have instructions for i1 setcc.
2362 if (VT == MVT::i1 && Op == ISD::SETCC)
2363 return false;
2364
2365 return TargetLowering::isTypeDesirableForOp(Op, VT);
2366}
2367
2370 // This isn't really a constant pool but close enough.
2373 return PtrInfo;
2374}
2375
2376SDValue SITargetLowering::lowerKernArgParameterPtr(SelectionDAG &DAG,
2377 const SDLoc &SL,
2378 SDValue Chain,
2379 uint64_t Offset) const {
2380 const DataLayout &DL = DAG.getDataLayout();
2381 MachineFunction &MF = DAG.getMachineFunction();
2382 const SIMachineFunctionInfo *Info = MF.getInfo<SIMachineFunctionInfo>();
2383 MVT PtrVT = getPointerTy(DL, AMDGPUAS::CONSTANT_ADDRESS);
2384
2385 auto [InputPtrReg, RC, ArgTy] =
2386 Info->getPreloadedValue(AMDGPUFunctionArgInfo::KERNARG_SEGMENT_PTR);
2387
2388 // We may not have the kernarg segment argument if we have no kernel
2389 // arguments.
2390 if (!InputPtrReg)
2391 return DAG.getConstant(Offset, SL, PtrVT);
2392
2394 SDValue BasePtr = DAG.getCopyFromReg(
2395 Chain, SL, MRI.getLiveInVirtReg(InputPtrReg->getRegister()), PtrVT);
2396
2397 return DAG.getObjectPtrOffset(SL, BasePtr, TypeSize::getFixed(Offset));
2398}
2399
2400SDValue SITargetLowering::getImplicitArgPtr(SelectionDAG &DAG,
2401 const SDLoc &SL) const {
2402 uint64_t Offset = getImplicitParameterOffset(DAG.getMachineFunction(),
2403 FIRST_IMPLICIT);
2404 return lowerKernArgParameterPtr(DAG, SL, DAG.getEntryNode(), Offset);
2405}
2406
2407SDValue SITargetLowering::getLDSKernelId(SelectionDAG &DAG,
2408 const SDLoc &SL) const {
2409
2410 const Function &F = DAG.getMachineFunction().getFunction();
2411 std::optional<uint32_t> KnownSize =
2412 AMDGPUMachineFunction::getLDSKernelIdMetadata(F);
2413 if (KnownSize.has_value())
2414 return DAG.getConstant(*KnownSize, SL, MVT::i32);
2415 return SDValue();
2416}
2417
2418SDValue SITargetLowering::convertArgType(SelectionDAG &DAG, EVT VT, EVT MemVT,
2419 const SDLoc &SL, SDValue Val,
2420 bool Signed,
2421 const ISD::InputArg *Arg) const {
2422 // First, if it is a widened vector, narrow it.
2423 if (VT.isVector() &&
2424 VT.getVectorNumElements() != MemVT.getVectorNumElements()) {
2425 EVT NarrowedVT =
2426 EVT::getVectorVT(*DAG.getContext(), MemVT.getVectorElementType(),
2427 VT.getVectorNumElements());
2428 Val = DAG.getNode(ISD::EXTRACT_SUBVECTOR, SL, NarrowedVT, Val,
2429 DAG.getConstant(0, SL, MVT::i32));
2430 }
2431
2432 // Then convert the vector elements or scalar value.
2433 if (Arg && (Arg->Flags.isSExt() || Arg->Flags.isZExt()) && VT.bitsLT(MemVT)) {
2434 unsigned Opc = Arg->Flags.isZExt() ? ISD::AssertZext : ISD::AssertSext;
2435 Val = DAG.getNode(Opc, SL, MemVT, Val, DAG.getValueType(VT));
2436 }
2437
2438 if (MemVT.isFloatingPoint()) {
2439 if (VT.isFloatingPoint()) {
2440 Val = getFPExtOrFPRound(DAG, Val, SL, VT);
2441 } else {
2442 assert(!MemVT.isVector());
2443 EVT IntVT = EVT::getIntegerVT(*DAG.getContext(), MemVT.getSizeInBits());
2444 SDValue Cast = DAG.getBitcast(IntVT, Val);
2445 Val = DAG.getAnyExtOrTrunc(Cast, SL, VT);
2446 }
2447 } else if (Signed)
2448 Val = DAG.getSExtOrTrunc(Val, SL, VT);
2449 else
2450 Val = DAG.getZExtOrTrunc(Val, SL, VT);
2451
2452 return Val;
2453}
2454
2455SDValue SITargetLowering::lowerKernargMemParameter(
2456 SelectionDAG &DAG, EVT VT, EVT MemVT, const SDLoc &SL, SDValue Chain,
2457 uint64_t Offset, Align Alignment, bool Signed,
2458 const ISD::InputArg *Arg) const {
2459
2460 MachinePointerInfo PtrInfo =
2462
2463 // Try to avoid using an extload by loading earlier than the argument address,
2464 // and extracting the relevant bits. The load should hopefully be merged with
2465 // the previous argument.
2466 if (MemVT.getStoreSize() < 4 && Alignment < 4) {
2467 // TODO: Handle align < 4 and size >= 4 (can happen with packed structs).
2468 int64_t AlignDownOffset = alignDown(Offset, 4);
2469 int64_t OffsetDiff = Offset - AlignDownOffset;
2470
2471 EVT IntVT = MemVT.changeTypeToInteger();
2472
2473 // TODO: If we passed in the base kernel offset we could have a better
2474 // alignment than 4, but we don't really need it.
2475 SDValue Ptr = lowerKernArgParameterPtr(DAG, SL, Chain, AlignDownOffset);
2476 SDValue Load = DAG.getLoad(MVT::i32, SL, Chain, Ptr,
2477 PtrInfo.getWithOffset(AlignDownOffset), Align(4),
2478 MachineMemOperand::MODereferenceable |
2479 MachineMemOperand::MOInvariant);
2480
2481 SDValue ShiftAmt = DAG.getConstant(OffsetDiff * 8, SL, MVT::i32);
2482 SDValue Extract = DAG.getNode(ISD::SRL, SL, MVT::i32, Load, ShiftAmt);
2483
2484 SDValue ArgVal = DAG.getNode(ISD::TRUNCATE, SL, IntVT, Extract);
2485 ArgVal = DAG.getNode(ISD::BITCAST, SL, MemVT, ArgVal);
2486 ArgVal = convertArgType(DAG, VT, MemVT, SL, ArgVal, Signed, Arg);
2487
2488 return DAG.getMergeValues({ArgVal, Load.getValue(1)}, SL);
2489 }
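// Worked example: an i16 argument at byte offset 2 gets AlignDownOffset = 0
// and OffsetDiff = 2, so the dword at offset 0 is loaded and shifted right by
// 16 bits before being truncated to the 16-bit argument value.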
2490
2491 SDValue Ptr = lowerKernArgParameterPtr(DAG, SL, Chain, Offset);
2492 SDValue Load = DAG.getLoad(
2493 MemVT, SL, Chain, Ptr, PtrInfo.getWithOffset(Offset), Alignment,
2494 MachineMemOperand::MODereferenceable | MachineMemOperand::MOInvariant);
2495
2496 SDValue Val = convertArgType(DAG, VT, MemVT, SL, Load, Signed, Arg);
2497 return DAG.getMergeValues({Val, Load.getValue(1)}, SL);
2498}
2499
2500/// Coerce an argument which was passed in a different ABI type to the original
2501/// expected value type.
2502SDValue SITargetLowering::convertABITypeToValueType(SelectionDAG &DAG,
2503 SDValue Val,
2504 CCValAssign &VA,
2505 const SDLoc &SL) const {
2506 EVT ValVT = VA.getValVT();
2507
2508 // If this is an 8 or 16-bit value, it is really passed promoted
2509 // to 32 bits. Insert an assert[sz]ext to capture this, then
2510 // truncate to the right size.
2511 switch (VA.getLocInfo()) {
2512 case CCValAssign::Full:
2513 return Val;
2514 case CCValAssign::BCvt:
2515 return DAG.getNode(ISD::BITCAST, SL, ValVT, Val);
2516 case CCValAssign::SExt:
2517 Val = DAG.getNode(ISD::AssertSext, SL, VA.getLocVT(), Val,
2518 DAG.getValueType(ValVT));
2519 return DAG.getNode(ISD::TRUNCATE, SL, ValVT, Val);
2520 case CCValAssign::ZExt:
2521 Val = DAG.getNode(ISD::AssertZext, SL, VA.getLocVT(), Val,
2522 DAG.getValueType(ValVT));
2523 return DAG.getNode(ISD::TRUNCATE, SL, ValVT, Val);
2524 case CCValAssign::AExt:
2525 return DAG.getNode(ISD::TRUNCATE, SL, ValVT, Val);
2526 default:
2527 llvm_unreachable("Unknown loc info!");
2528 }
2529}
2530
2531SDValue SITargetLowering::lowerStackParameter(SelectionDAG &DAG,
2532 CCValAssign &VA, const SDLoc &SL,
2533 SDValue Chain,
2534 const ISD::InputArg &Arg) const {
2535 MachineFunction &MF = DAG.getMachineFunction();
2536 MachineFrameInfo &MFI = MF.getFrameInfo();
2537
2538 if (Arg.Flags.isByVal()) {
2539 unsigned Size = Arg.Flags.getByValSize();
2540 int FrameIdx = MFI.CreateFixedObject(Size, VA.getLocMemOffset(), false);
2541 return DAG.getFrameIndex(FrameIdx, MVT::i32);
2542 }
2543
2544 unsigned ArgOffset = VA.getLocMemOffset();
2545 unsigned ArgSize = VA.getValVT().getStoreSize();
2546
2547 int FI = MFI.CreateFixedObject(ArgSize, ArgOffset, true);
2548
2549 // Create load nodes to retrieve arguments from the stack.
2550 SDValue FIN = DAG.getFrameIndex(FI, MVT::i32);
2551
2552 // For NON_EXTLOAD, generic code in getLoad asserts that ValVT == MemVT.
2553 ISD::LoadExtType ExtType = ISD::NON_EXTLOAD;
2554 MVT MemVT = VA.getValVT();
2555
2556 switch (VA.getLocInfo()) {
2557 default:
2558 break;
2559 case CCValAssign::BCvt:
2560 MemVT = VA.getLocVT();
2561 break;
2562 case CCValAssign::SExt:
2563 ExtType = ISD::SEXTLOAD;
2564 break;
2565 case CCValAssign::ZExt:
2566 ExtType = ISD::ZEXTLOAD;
2567 break;
2568 case CCValAssign::AExt:
2569 ExtType = ISD::EXTLOAD;
2570 break;
2571 }
2572
2573 SDValue ArgValue = DAG.getExtLoad(
2574 ExtType, SL, VA.getLocVT(), Chain, FIN,
2575 MachinePointerInfo::getFixedStack(DAG.getMachineFunction(), FI), MemVT);
2576
2577 SDValue ConvertedVal = convertABITypeToValueType(DAG, ArgValue, VA, SL);
2578 if (ConvertedVal == ArgValue)
2579 return ConvertedVal;
2580
2581 return DAG.getMergeValues({ConvertedVal, ArgValue.getValue(1)}, SL);
2582}
2583
2584SDValue SITargetLowering::lowerWorkGroupId(
2585 SelectionDAG &DAG, const SIMachineFunctionInfo &MFI, EVT VT,
2586 AMDGPUFunctionArgInfo::PreloadedValue WorkGroupIdPV,
2587 AMDGPUFunctionArgInfo::PreloadedValue ClusterMaxIdPV,
2588 AMDGPUFunctionArgInfo::PreloadedValue ClusterWorkGroupIdPV) const {
2589 if (!Subtarget->hasClusters())
2590 return getPreloadedValue(DAG, MFI, VT, WorkGroupIdPV);
2591
2592 // Clusters are supported. Return the global position in the grid. If clusters
2593 // are enabled, WorkGroupIdPV returns the cluster ID not the workgroup ID.
2594
2595 // WorkGroupIdXYZ = ClusterId == 0 ?
2596 // ClusterIdXYZ :
2597 // ClusterIdXYZ * (ClusterMaxIdXYZ + 1) + ClusterWorkGroupIdXYZ
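// For example, with 4 workgroups per cluster in X (ClusterMaxIdX = 3), cluster
// id 2 and cluster-local workgroup id 1 yield a global workgroup id of
// 2 * (3 + 1) + 1 = 9.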
2598 SDValue ClusterIdXYZ = getPreloadedValue(DAG, MFI, VT, WorkGroupIdPV);
2599 SDLoc SL(ClusterIdXYZ);
2600 SDValue ClusterMaxIdXYZ = getPreloadedValue(DAG, MFI, VT, ClusterMaxIdPV);
2601 SDValue One = DAG.getConstant(1, SL, VT);
2602 SDValue ClusterSizeXYZ = DAG.getNode(ISD::ADD, SL, VT, ClusterMaxIdXYZ, One);
2603 SDValue ClusterWorkGroupIdXYZ =
2604 getPreloadedValue(DAG, MFI, VT, ClusterWorkGroupIdPV);
2605 SDValue GlobalIdXYZ =
2606 DAG.getNode(ISD::ADD, SL, VT, ClusterWorkGroupIdXYZ,
2607 DAG.getNode(ISD::MUL, SL, VT, ClusterIdXYZ, ClusterSizeXYZ));
2608
2609 switch (MFI.getClusterDims().getKind()) {
2612 return GlobalIdXYZ;
2614 return ClusterIdXYZ;
2616 using namespace AMDGPU::Hwreg;
2617 SDValue ClusterIdField =
2618 DAG.getTargetConstant(HwregEncoding::encode(ID_IB_STS2, 6, 4), SL, VT);
2619 SDNode *GetReg =
2620 DAG.getMachineNode(AMDGPU::S_GETREG_B32_const, SL, VT, ClusterIdField);
2621 SDValue ClusterId(GetReg, 0);
2622 SDValue Zero = DAG.getConstant(0, SL, VT);
2623 return DAG.getNode(ISD::SELECT_CC, SL, VT, ClusterId, Zero, ClusterIdXYZ,
2624 GlobalIdXYZ, DAG.getCondCode(ISD::SETEQ));
2625 }
2626 }
2627
2628 llvm_unreachable("nothing should reach here");
2629}
2630
2631SDValue SITargetLowering::getPreloadedValue(
2632 SelectionDAG &DAG, const SIMachineFunctionInfo &MFI, EVT VT,
2634 const ArgDescriptor *Reg = nullptr;
2635 const TargetRegisterClass *RC;
2636 LLT Ty;
2637
2639 const ArgDescriptor WorkGroupIDX =
2640 ArgDescriptor::createRegister(AMDGPU::TTMP9);
2641 // If GridZ is not programmed in an entry function then the hardware will set
2642 // it to all zeros, so there is no need to mask the GridY value in the low
2643 // order bits.
2644 const ArgDescriptor WorkGroupIDY = ArgDescriptor::createRegister(
2645 AMDGPU::TTMP7,
2646 AMDGPU::isEntryFunctionCC(CC) && !MFI.hasWorkGroupIDZ() ? ~0u : 0xFFFFu);
2647 const ArgDescriptor WorkGroupIDZ =
2648 ArgDescriptor::createRegister(AMDGPU::TTMP7, 0xFFFF0000u);
2649 const ArgDescriptor ClusterWorkGroupIDX =
2650 ArgDescriptor::createRegister(AMDGPU::TTMP6, 0x0000000Fu);
2651 const ArgDescriptor ClusterWorkGroupIDY =
2652 ArgDescriptor::createRegister(AMDGPU::TTMP6, 0x000000F0u);
2653 const ArgDescriptor ClusterWorkGroupIDZ =
2654 ArgDescriptor::createRegister(AMDGPU::TTMP6, 0x00000F00u);
2655 const ArgDescriptor ClusterWorkGroupMaxIDX =
2656 ArgDescriptor::createRegister(AMDGPU::TTMP6, 0x0000F000u);
2657 const ArgDescriptor ClusterWorkGroupMaxIDY =
2658 ArgDescriptor::createRegister(AMDGPU::TTMP6, 0x000F0000u);
2659 const ArgDescriptor ClusterWorkGroupMaxIDZ =
2660 ArgDescriptor::createRegister(AMDGPU::TTMP6, 0x00F00000u);
2661 const ArgDescriptor ClusterWorkGroupMaxFlatID =
2662 ArgDescriptor::createRegister(AMDGPU::TTMP6, 0x0F000000u);
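// In this layout TTMP9 holds the X workgroup id, TTMP7 packs Y in its low 16
// bits and Z in its high 16 bits, and TTMP6 packs the cluster-local workgroup
// ids and their maxima as consecutive 4-bit fields, with the max flat id in
// bits [27:24].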
2663
2664 auto LoadConstant = [&](unsigned N) {
2665 return DAG.getConstant(N, SDLoc(), VT);
2666 };
2667
2668 if (Subtarget->hasArchitectedSGPRs() &&
2670 AMDGPU::ClusterDimsAttr ClusterDims = MFI.getClusterDims();
2671 bool HasFixedDims = ClusterDims.isFixedDims();
2672
2673 switch (PVID) {
2674 case AMDGPUFunctionArgInfo::WORKGROUP_ID_X:
2675 Reg = &WorkGroupIDX;
2676 RC = &AMDGPU::SReg_32RegClass;
2677 Ty = LLT::scalar(32);
2678 break;
2679 case AMDGPUFunctionArgInfo::WORKGROUP_ID_Y:
2680 Reg = &WorkGroupIDY;
2681 RC = &AMDGPU::SReg_32RegClass;
2682 Ty = LLT::scalar(32);
2683 break;
2684 case AMDGPUFunctionArgInfo::WORKGROUP_ID_Z:
2685 Reg = &WorkGroupIDZ;
2686 RC = &AMDGPU::SReg_32RegClass;
2687 Ty = LLT::scalar(32);
2688 break;
2690 if (HasFixedDims && ClusterDims.getDims()[0] == 1)
2691 return LoadConstant(0);
2692 Reg = &ClusterWorkGroupIDX;
2693 RC = &AMDGPU::SReg_32RegClass;
2694 Ty = LLT::scalar(32);
2695 break;
2697 if (HasFixedDims && ClusterDims.getDims()[1] == 1)
2698 return LoadConstant(0);
2699 Reg = &ClusterWorkGroupIDY;
2700 RC = &AMDGPU::SReg_32RegClass;
2701 Ty = LLT::scalar(32);
2702 break;
2704 if (HasFixedDims && ClusterDims.getDims()[2] == 1)
2705 return LoadConstant(0);
2706 Reg = &ClusterWorkGroupIDZ;
2707 RC = &AMDGPU::SReg_32RegClass;
2708 Ty = LLT::scalar(32);
2709 break;
2711 if (HasFixedDims)
2712 return LoadConstant(ClusterDims.getDims()[0] - 1);
2713 Reg = &ClusterWorkGroupMaxIDX;
2714 RC = &AMDGPU::SReg_32RegClass;
2715 Ty = LLT::scalar(32);
2716 break;
2718 if (HasFixedDims)
2719 return LoadConstant(ClusterDims.getDims()[1] - 1);
2720 Reg = &ClusterWorkGroupMaxIDY;
2721 RC = &AMDGPU::SReg_32RegClass;
2722 Ty = LLT::scalar(32);
2723 break;
2725 if (HasFixedDims)
2726 return LoadConstant(ClusterDims.getDims()[2] - 1);
2727 Reg = &ClusterWorkGroupMaxIDZ;
2728 RC = &AMDGPU::SReg_32RegClass;
2729 Ty = LLT::scalar(32);
2730 break;
2732 Reg = &ClusterWorkGroupMaxFlatID;
2733 RC = &AMDGPU::SReg_32RegClass;
2734 Ty = LLT::scalar(32);
2735 break;
2736 default:
2737 break;
2738 }
2739 }
2740
2741 if (!Reg)
2742 std::tie(Reg, RC, Ty) = MFI.getPreloadedValue(PVID);
2743 if (!Reg) {
2744 if (PVID == AMDGPUFunctionArgInfo::KERNARG_SEGMENT_PTR) {
2745 // It's possible for a kernarg intrinsic call to appear in a kernel with
2746 // no allocated segment, in which case we do not add the user sgpr
2747 // argument, so just return null.
2748 return DAG.getConstant(0, SDLoc(), VT);
2749 }
2750
2751 // It's undefined behavior if a function marked with the amdgpu-no-*
2752 // attributes uses the corresponding intrinsic.
2753 return DAG.getPOISON(VT);
2754 }
2755
2756 return loadInputValue(DAG, RC, VT, SDLoc(DAG.getEntryNode()), *Reg);
2757}
2758
2759static void processPSInputArgs(SmallVectorImpl<ISD::InputArg> &Splits,
2760 CallingConv::ID CallConv,
2761 ArrayRef<ISD::InputArg> Ins, BitVector &Skipped,
2762 FunctionType *FType,
2763 SIMachineFunctionInfo *Info) {
2764 for (unsigned I = 0, E = Ins.size(), PSInputNum = 0; I != E; ++I) {
2765 const ISD::InputArg *Arg = &Ins[I];
2766
2767 assert((!Arg->VT.isVector() || Arg->VT.getScalarSizeInBits() == 16) &&
2768 "vector type argument should have been split");
2769
2770 // First check if it's a PS input addr.
2771 if (CallConv == CallingConv::AMDGPU_PS && !Arg->Flags.isInReg() &&
2772 PSInputNum <= 15) {
2773 bool SkipArg = !Arg->Used && !Info->isPSInputAllocated(PSInputNum);
2774
2775 // Inconveniently only the first part of the split is marked as isSplit,
2776 // so skip to the end. We only want to increment PSInputNum once for the
2777 // entire split argument.
2778 if (Arg->Flags.isSplit()) {
2779 while (!Arg->Flags.isSplitEnd()) {
2780 assert((!Arg->VT.isVector() || Arg->VT.getScalarSizeInBits() == 16) &&
2781 "unexpected vector split in ps argument type");
2782 if (!SkipArg)
2783 Splits.push_back(*Arg);
2784 Arg = &Ins[++I];
2785 }
2786 }
2787
2788 if (SkipArg) {
2789 // We can safely skip PS inputs.
2790 Skipped.set(Arg->getOrigArgIndex());
2791 ++PSInputNum;
2792 continue;
2793 }
2794
2795 Info->markPSInputAllocated(PSInputNum);
2796 if (Arg->Used)
2797 Info->markPSInputEnabled(PSInputNum);
2798
2799 ++PSInputNum;
2800 }
2801
2802 Splits.push_back(*Arg);
2803 }
2804}
2805
2806// Allocate special inputs passed in VGPRs.
2807void SITargetLowering::allocateSpecialEntryInputVGPRs(
2808 CCState &CCInfo, MachineFunction &MF, const SIRegisterInfo &TRI,
2809 SIMachineFunctionInfo &Info) const {
2810 const LLT S32 = LLT::scalar(32);
2811 MachineRegisterInfo &MRI = MF.getRegInfo();
2812
2813 if (Info.hasWorkItemIDX()) {
2814 Register Reg = AMDGPU::VGPR0;
2815 MRI.setType(MF.addLiveIn(Reg, &AMDGPU::VGPR_32RegClass), S32);
2816
2817 CCInfo.AllocateReg(Reg);
2818 unsigned Mask =
2819 (Subtarget->hasPackedTID() && Info.hasWorkItemIDY()) ? 0x3ff : ~0u;
2820 Info.setWorkItemIDX(ArgDescriptor::createRegister(Reg, Mask));
2821 }
2822
2823 if (Info.hasWorkItemIDY()) {
2824 assert(Info.hasWorkItemIDX());
2825 if (Subtarget->hasPackedTID()) {
2826 Info.setWorkItemIDY(
2827 ArgDescriptor::createRegister(AMDGPU::VGPR0, 0x3ff << 10));
2828 } else {
2829 unsigned Reg = AMDGPU::VGPR1;
2830 MRI.setType(MF.addLiveIn(Reg, &AMDGPU::VGPR_32RegClass), S32);
2831
2832 CCInfo.AllocateReg(Reg);
2833 Info.setWorkItemIDY(ArgDescriptor::createRegister(Reg));
2834 }
2835 }
2836
2837 if (Info.hasWorkItemIDZ()) {
2838 assert(Info.hasWorkItemIDX() && Info.hasWorkItemIDY());
2839 if (Subtarget->hasPackedTID()) {
2840 Info.setWorkItemIDZ(
2841 ArgDescriptor::createRegister(AMDGPU::VGPR0, 0x3ff << 20));
2842 } else {
2843 unsigned Reg = AMDGPU::VGPR2;
2844 MRI.setType(MF.addLiveIn(Reg, &AMDGPU::VGPR_32RegClass), S32);
2845
2846 CCInfo.AllocateReg(Reg);
2847 Info.setWorkItemIDZ(ArgDescriptor::createRegister(Reg));
2848 }
2849 }
2850}
2851
2852// Try to allocate a VGPR at the end of the argument list, or if no argument
2853// VGPRs are left, allocate a stack slot.
2854// If \p Mask is given it indicates the bitfield position in the register.
2855// If \p Arg is given, use it with the new \p Mask instead of allocating a new one.
2856static ArgDescriptor allocateVGPR32Input(CCState &CCInfo, unsigned Mask = ~0u,
2857 ArgDescriptor Arg = ArgDescriptor()) {
2858 if (Arg.isSet())
2859 return ArgDescriptor::createArg(Arg, Mask);
2860
2861 ArrayRef<MCPhysReg> ArgVGPRs = ArrayRef(AMDGPU::VGPR_32RegClass.begin(), 32);
2862 unsigned RegIdx = CCInfo.getFirstUnallocated(ArgVGPRs);
2863 if (RegIdx == ArgVGPRs.size()) {
2864 // Spill to stack required.
2865 int64_t Offset = CCInfo.AllocateStack(4, Align(4));
2866
2867 return ArgDescriptor::createStack(Offset, Mask);
2868 }
2869
2870 unsigned Reg = ArgVGPRs[RegIdx];
2871 Reg = CCInfo.AllocateReg(Reg);
2872 assert(Reg != AMDGPU::NoRegister);
2873
2874 MachineFunction &MF = CCInfo.getMachineFunction();
2875 Register LiveInVReg = MF.addLiveIn(Reg, &AMDGPU::VGPR_32RegClass);
2876 MF.getRegInfo().setType(LiveInVReg, LLT::scalar(32));
2877 return ArgDescriptor::createRegister(Reg, Mask);
2878}
2879
2880static ArgDescriptor allocateSGPR32InputImpl(CCState &CCInfo,
2881 const TargetRegisterClass *RC,
2882 unsigned NumArgRegs) {
2883 ArrayRef<MCPhysReg> ArgSGPRs = ArrayRef(RC->begin(), 32);
2884 unsigned RegIdx = CCInfo.getFirstUnallocated(ArgSGPRs);
2885 if (RegIdx == ArgSGPRs.size())
2886 report_fatal_error("ran out of SGPRs for arguments");
2887
2888 unsigned Reg = ArgSGPRs[RegIdx];
2889 Reg = CCInfo.AllocateReg(Reg);
2890 assert(Reg != AMDGPU::NoRegister);
2891
2892 MachineFunction &MF = CCInfo.getMachineFunction();
2893 MF.addLiveIn(Reg, RC);
2894 return ArgDescriptor::createRegister(Reg);
2895}
2896
2897// If this has a fixed position, we still should allocate the register in the
2898// CCInfo state. Technically we could get away with this for values passed
2899// outside of the normal argument range.
2900static void allocateFixedSGPRInputImpl(CCState &CCInfo,
2901 const TargetRegisterClass *RC,
2902 MCRegister Reg) {
2903 Reg = CCInfo.AllocateReg(Reg);
2904 assert(Reg != AMDGPU::NoRegister);
2905 MachineFunction &MF = CCInfo.getMachineFunction();
2906 MF.addLiveIn(Reg, RC);
2907}
2908
2909static void allocateSGPR32Input(CCState &CCInfo, ArgDescriptor &Arg) {
2910 if (Arg) {
2911 allocateFixedSGPRInputImpl(CCInfo, &AMDGPU::SGPR_32RegClass,
2912 Arg.getRegister());
2913 } else
2914 Arg = allocateSGPR32InputImpl(CCInfo, &AMDGPU::SGPR_32RegClass, 32);
2915}
2916
2917static void allocateSGPR64Input(CCState &CCInfo, ArgDescriptor &Arg) {
2918 if (Arg) {
2919 allocateFixedSGPRInputImpl(CCInfo, &AMDGPU::SGPR_64RegClass,
2920 Arg.getRegister());
2921 } else
2922 Arg = allocateSGPR32InputImpl(CCInfo, &AMDGPU::SGPR_64RegClass, 16);
2923}
2924
2925/// Allocate implicit function VGPR arguments at the end of allocated user
2926/// arguments.
2927void SITargetLowering::allocateSpecialInputVGPRs(
2928 CCState &CCInfo, MachineFunction &MF, const SIRegisterInfo &TRI,
2929 SIMachineFunctionInfo &Info) const {
2930 const unsigned Mask = 0x3ff;
2931 ArgDescriptor Arg;
2932
2933 if (Info.hasWorkItemIDX()) {
2934 Arg = allocateVGPR32Input(CCInfo, Mask);
2935 Info.setWorkItemIDX(Arg);
2936 }
2937
2938 if (Info.hasWorkItemIDY()) {
2939 Arg = allocateVGPR32Input(CCInfo, Mask << 10, Arg);
2940 Info.setWorkItemIDY(Arg);
2941 }
2942
2943 if (Info.hasWorkItemIDZ())
2944 Info.setWorkItemIDZ(allocateVGPR32Input(CCInfo, Mask << 20, Arg));
2945}
2946
2947/// Allocate implicit function VGPR arguments in fixed registers.
2948void SITargetLowering::allocateSpecialInputVGPRsFixed(
2949 CCState &CCInfo, MachineFunction &MF, const SIRegisterInfo &TRI,
2950 SIMachineFunctionInfo &Info) const {
2951 Register Reg = CCInfo.AllocateReg(AMDGPU::VGPR31);
2952 if (!Reg)
2953 report_fatal_error("failed to allocate VGPR for implicit arguments");
2954
2955 const unsigned Mask = 0x3ff;
2956 Info.setWorkItemIDX(ArgDescriptor::createRegister(Reg, Mask));
2957 Info.setWorkItemIDY(ArgDescriptor::createRegister(Reg, Mask << 10));
2958 Info.setWorkItemIDZ(ArgDescriptor::createRegister(Reg, Mask << 20));
2959}
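// With this fixed layout all three workitem ids share VGPR31: X occupies bits
// [9:0], Y bits [19:10] and Z bits [29:20], matching the masks above.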
2960
2961void SITargetLowering::allocateSpecialInputSGPRs(
2962 CCState &CCInfo, MachineFunction &MF, const SIRegisterInfo &TRI,
2963 SIMachineFunctionInfo &Info) const {
2964 auto &ArgInfo = Info.getArgInfo();
2965 const GCNUserSGPRUsageInfo &UserSGPRInfo = Info.getUserSGPRInfo();
2966
2967 // TODO: Unify handling with private memory pointers.
2968 if (UserSGPRInfo.hasDispatchPtr())
2969 allocateSGPR64Input(CCInfo, ArgInfo.DispatchPtr);
2970
2971 if (UserSGPRInfo.hasQueuePtr())
2972 allocateSGPR64Input(CCInfo, ArgInfo.QueuePtr);
2973
2974 // Implicit arg ptr takes the place of the kernarg segment pointer. This is a
2975 // constant offset from the kernarg segment.
2976 if (Info.hasImplicitArgPtr())
2977 allocateSGPR64Input(CCInfo, ArgInfo.ImplicitArgPtr);
2978
2979 if (UserSGPRInfo.hasDispatchID())
2980 allocateSGPR64Input(CCInfo, ArgInfo.DispatchID);
2981
2982 // flat_scratch_init is not applicable for non-kernel functions.
2983
2984 if (Info.hasWorkGroupIDX())
2985 allocateSGPR32Input(CCInfo, ArgInfo.WorkGroupIDX);
2986
2987 if (Info.hasWorkGroupIDY())
2988 allocateSGPR32Input(CCInfo, ArgInfo.WorkGroupIDY);
2989
2990 if (Info.hasWorkGroupIDZ())
2991 allocateSGPR32Input(CCInfo, ArgInfo.WorkGroupIDZ);
2992
2993 if (Info.hasLDSKernelId())
2994 allocateSGPR32Input(CCInfo, ArgInfo.LDSKernelId);
2995}
2996
2997// Allocate special inputs passed in user SGPRs.
2998void SITargetLowering::allocateHSAUserSGPRs(CCState &CCInfo,
2999 MachineFunction &MF,
3000 const SIRegisterInfo &TRI,
3001 SIMachineFunctionInfo &Info) const {
3002 const GCNUserSGPRUsageInfo &UserSGPRInfo = Info.getUserSGPRInfo();
3003 if (UserSGPRInfo.hasImplicitBufferPtr()) {
3004 Register ImplicitBufferPtrReg = Info.addImplicitBufferPtr(TRI);
3005 MF.addLiveIn(ImplicitBufferPtrReg, &AMDGPU::SGPR_64RegClass);
3006 CCInfo.AllocateReg(ImplicitBufferPtrReg);
3007 }
3008
3009 // FIXME: How should these inputs interact with inreg / custom SGPR inputs?
3010 if (UserSGPRInfo.hasPrivateSegmentBuffer()) {
3011 Register PrivateSegmentBufferReg = Info.addPrivateSegmentBuffer(TRI);
3012 MF.addLiveIn(PrivateSegmentBufferReg, &AMDGPU::SGPR_128RegClass);
3013 CCInfo.AllocateReg(PrivateSegmentBufferReg);
3014 }
3015
3016 if (UserSGPRInfo.hasDispatchPtr()) {
3017 Register DispatchPtrReg = Info.addDispatchPtr(TRI);
3018 MF.addLiveIn(DispatchPtrReg, &AMDGPU::SGPR_64RegClass);
3019 CCInfo.AllocateReg(DispatchPtrReg);
3020 }
3021
3022 if (UserSGPRInfo.hasQueuePtr()) {
3023 Register QueuePtrReg = Info.addQueuePtr(TRI);
3024 MF.addLiveIn(QueuePtrReg, &AMDGPU::SGPR_64RegClass);
3025 CCInfo.AllocateReg(QueuePtrReg);
3026 }
3027
3028 if (UserSGPRInfo.hasKernargSegmentPtr()) {
3029 MachineRegisterInfo &MRI = MF.getRegInfo();
3030 Register InputPtrReg = Info.addKernargSegmentPtr(TRI);
3031 CCInfo.AllocateReg(InputPtrReg);
3032
3033 Register VReg = MF.addLiveIn(InputPtrReg, &AMDGPU::SGPR_64RegClass);
3034 MRI.setType(VReg, LLT::pointer(AMDGPUAS::CONSTANT_ADDRESS, 64));
3035 }
3036
3037 if (UserSGPRInfo.hasDispatchID()) {
3038 Register DispatchIDReg = Info.addDispatchID(TRI);
3039 MF.addLiveIn(DispatchIDReg, &AMDGPU::SGPR_64RegClass);
3040 CCInfo.AllocateReg(DispatchIDReg);
3041 }
3042
3043 if (UserSGPRInfo.hasFlatScratchInit() && !getSubtarget()->isAmdPalOS()) {
3044 Register FlatScratchInitReg = Info.addFlatScratchInit(TRI);
3045 MF.addLiveIn(FlatScratchInitReg, &AMDGPU::SGPR_64RegClass);
3046 CCInfo.AllocateReg(FlatScratchInitReg);
3047 }
3048
3049 if (UserSGPRInfo.hasPrivateSegmentSize()) {
3050 Register PrivateSegmentSizeReg = Info.addPrivateSegmentSize(TRI);
3051 MF.addLiveIn(PrivateSegmentSizeReg, &AMDGPU::SGPR_32RegClass);
3052 CCInfo.AllocateReg(PrivateSegmentSizeReg);
3053 }
3054
3055 // TODO: Add GridWorkGroupCount user SGPRs when used. For now with HSA we read
3056 // these from the dispatch pointer.
3057}
3058
3059// Allocate pre-loaded kernel arguments. Arguments to be preloaded must be
3060// sequential starting from the first argument.
3061void SITargetLowering::allocatePreloadKernArgSGPRs(
3062 CCState &CCInfo, SmallVectorImpl<CCValAssign> &ArgLocs,
3063 const SmallVectorImpl<ISD::InputArg> &Ins, MachineFunction &MF,
3064 const SIRegisterInfo &TRI, SIMachineFunctionInfo &Info) const {
3065 Function &F = MF.getFunction();
3066 unsigned LastExplicitArgOffset = Subtarget->getExplicitKernelArgOffset();
3067 GCNUserSGPRUsageInfo &SGPRInfo = Info.getUserSGPRInfo();
3068 bool InPreloadSequence = true;
3069 unsigned InIdx = 0;
3070 bool AlignedForImplictArgs = false;
3071 unsigned ImplicitArgOffset = 0;
3072 for (auto &Arg : F.args()) {
3073 if (!InPreloadSequence || !Arg.hasInRegAttr())
3074 break;
3075
3076 unsigned ArgIdx = Arg.getArgNo();
3077 // Don't preload non-original args or parts not in the current preload
3078 // sequence.
3079 if (InIdx < Ins.size() &&
3080 (!Ins[InIdx].isOrigArg() || Ins[InIdx].getOrigArgIndex() != ArgIdx))
3081 break;
3082
3083 for (; InIdx < Ins.size() && Ins[InIdx].isOrigArg() &&
3084 Ins[InIdx].getOrigArgIndex() == ArgIdx;
3085 InIdx++) {
3086 assert(ArgLocs[ArgIdx].isMemLoc());
3087 auto &ArgLoc = ArgLocs[InIdx];
3088 const Align KernelArgBaseAlign = Align(16);
3089 unsigned ArgOffset = ArgLoc.getLocMemOffset();
3090 Align Alignment = commonAlignment(KernelArgBaseAlign, ArgOffset);
3091 unsigned NumAllocSGPRs =
3092 alignTo(ArgLoc.getLocVT().getFixedSizeInBits(), 32) / 32;
3093
3094 // Fix alignment for hidden arguments.
3095 if (Arg.hasAttribute("amdgpu-hidden-argument")) {
3096 if (!AlignedForImplictArgs) {
3097 ImplicitArgOffset =
3098 alignTo(LastExplicitArgOffset,
3099 Subtarget->getAlignmentForImplicitArgPtr()) -
3100 LastExplicitArgOffset;
3101 AlignedForImplictArgs = true;
3102 }
3103 ArgOffset += ImplicitArgOffset;
3104 }
3105
3106 // Arg is preloaded into the previous SGPR.
3107 if (ArgLoc.getLocVT().getStoreSize() < 4 && Alignment < 4) {
3108 assert(InIdx >= 1 && "No previous SGPR");
3109 Info.getArgInfo().PreloadKernArgs[InIdx].Regs.push_back(
3110 Info.getArgInfo().PreloadKernArgs[InIdx - 1].Regs[0]);
3111 continue;
3112 }
3113
3114 unsigned Padding = ArgOffset - LastExplicitArgOffset;
3115 unsigned PaddingSGPRs = alignTo(Padding, 4) / 4;
3116 // Check for free user SGPRs for preloading.
3117 if (PaddingSGPRs + NumAllocSGPRs > SGPRInfo.getNumFreeUserSGPRs()) {
3118 InPreloadSequence = false;
3119 break;
3120 }
3121
3122 // Preload this argument.
3123 const TargetRegisterClass *RC =
3124 TRI.getSGPRClassForBitWidth(NumAllocSGPRs * 32);
3125 SmallVectorImpl<MCRegister> *PreloadRegs =
3126 Info.addPreloadedKernArg(TRI, RC, NumAllocSGPRs, InIdx, PaddingSGPRs);
3127
3128 if (PreloadRegs->size() > 1)
3129 RC = &AMDGPU::SGPR_32RegClass;
3130 for (auto &Reg : *PreloadRegs) {
3131 assert(Reg);
3132 MF.addLiveIn(Reg, RC);
3133 CCInfo.AllocateReg(Reg);
3134 }
3135
3136 LastExplicitArgOffset = NumAllocSGPRs * 4 + ArgOffset;
3137 }
3138 }
3139}
3140
3141void SITargetLowering::allocateLDSKernelId(CCState &CCInfo, MachineFunction &MF,
3142 const SIRegisterInfo &TRI,
3143 SIMachineFunctionInfo &Info) const {
3144 // Always allocate this last since it is a synthetic preload.
3145 if (Info.hasLDSKernelId()) {
3146 Register Reg = Info.addLDSKernelId();
3147 MF.addLiveIn(Reg, &AMDGPU::SGPR_32RegClass);
3148 CCInfo.AllocateReg(Reg);
3149 }
3150}
3151
3152// Allocate special input registers that are initialized per-wave.
3155 CallingConv::ID CallConv,
3156 bool IsShader) const {
3157 bool HasArchitectedSGPRs = Subtarget->hasArchitectedSGPRs();
3158 if (Subtarget->hasUserSGPRInit16BugInWave32() && !IsShader) {
3159 // Note: user SGPRs are handled by the front-end for graphics shaders
3160 // Pad up the used user SGPRs with dead inputs.
3161
3162 // TODO: NumRequiredSystemSGPRs computation should be adjusted appropriately
3163 // before enabling architected SGPRs for workgroup IDs.
3164 assert(!HasArchitectedSGPRs && "Unhandled feature for the subtarget");
3165
3166 unsigned CurrentUserSGPRs = Info.getNumUserSGPRs();
3167 // Note we do not count the PrivateSegmentWaveByteOffset. We do not want to
3168 // rely on it to reach 16 since if we end up having no stack usage, it will
3169 // not really be added.
3170 unsigned NumRequiredSystemSGPRs =
3171 Info.hasWorkGroupIDX() + Info.hasWorkGroupIDY() +
3172 Info.hasWorkGroupIDZ() + Info.hasWorkGroupInfo();
3173 for (unsigned i = NumRequiredSystemSGPRs + CurrentUserSGPRs; i < 16; ++i) {
3174 Register Reg = Info.addReservedUserSGPR();
3175 MF.addLiveIn(Reg, &AMDGPU::SGPR_32RegClass);
3176 CCInfo.AllocateReg(Reg);
3177 }
3178 }
3179
3180 if (!HasArchitectedSGPRs) {
3181 if (Info.hasWorkGroupIDX()) {
3182 Register Reg = Info.addWorkGroupIDX();
3183 MF.addLiveIn(Reg, &AMDGPU::SGPR_32RegClass);
3184 CCInfo.AllocateReg(Reg);
3185 }
3186
3187 if (Info.hasWorkGroupIDY()) {
3188 Register Reg = Info.addWorkGroupIDY();
3189 MF.addLiveIn(Reg, &AMDGPU::SGPR_32RegClass);
3190 CCInfo.AllocateReg(Reg);
3191 }
3192
3193 if (Info.hasWorkGroupIDZ()) {
3194 Register Reg = Info.addWorkGroupIDZ();
3195 MF.addLiveIn(Reg, &AMDGPU::SGPR_32RegClass);
3196 CCInfo.AllocateReg(Reg);
3197 }
3198 }
3199
3200 if (Info.hasWorkGroupInfo()) {
3201 Register Reg = Info.addWorkGroupInfo();
3202 MF.addLiveIn(Reg, &AMDGPU::SGPR_32RegClass);
3203 CCInfo.AllocateReg(Reg);
3204 }
3205
3206 if (Info.hasPrivateSegmentWaveByteOffset()) {
3207 // Scratch wave offset passed in system SGPR.
3208 unsigned PrivateSegmentWaveByteOffsetReg;
3209
3210 if (IsShader) {
3211 PrivateSegmentWaveByteOffsetReg =
3212 Info.getPrivateSegmentWaveByteOffsetSystemSGPR();
3213
3214 // This is true if the scratch wave byte offset doesn't have a fixed
3215 // location.
3216 if (PrivateSegmentWaveByteOffsetReg == AMDGPU::NoRegister) {
3217 PrivateSegmentWaveByteOffsetReg = findFirstFreeSGPR(CCInfo);
3218 Info.setPrivateSegmentWaveByteOffset(PrivateSegmentWaveByteOffsetReg);
3219 }
3220 } else
3221 PrivateSegmentWaveByteOffsetReg = Info.addPrivateSegmentWaveByteOffset();
3222
3223 MF.addLiveIn(PrivateSegmentWaveByteOffsetReg, &AMDGPU::SGPR_32RegClass);
3224 CCInfo.AllocateReg(PrivateSegmentWaveByteOffsetReg);
3225 }
3226
3227 assert(!Subtarget->hasUserSGPRInit16BugInWave32() || IsShader ||
3228 Info.getNumPreloadedSGPRs() >= 16);
3229}
3230
3231static void reservePrivateMemoryRegs(const TargetMachine &TM,
3232 MachineFunction &MF,
3233 const SIRegisterInfo &TRI,
3234 SIMachineFunctionInfo &Info) {
3235 // Now that we've figured out where the scratch register inputs are, see if
3236 // we should reserve the arguments and use them directly.
3237 MachineFrameInfo &MFI = MF.getFrameInfo();
3238 bool HasStackObjects = MFI.hasStackObjects();
3239 const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>();
3240
3241 // Record that we know we have non-spill stack objects so we don't need to
3242 // check all stack objects later.
3243 if (HasStackObjects)
3244 Info.setHasNonSpillStackObjects(true);
3245
3246 // Everything live out of a block is spilled with fast regalloc, so it's
3247 // almost certain that spilling will be required.
3248 if (TM.getOptLevel() == CodeGenOptLevel::None)
3249 HasStackObjects = true;
3250
3251 // For now assume stack access is needed in any callee functions, so we need
3252 // the scratch registers to pass in.
3253 bool RequiresStackAccess = HasStackObjects || MFI.hasCalls();
3254
3255 if (!ST.hasFlatScratchEnabled()) {
3256 if (RequiresStackAccess && ST.isAmdHsaOrMesa(MF.getFunction())) {
3257 // If we have stack objects, we unquestionably need the private buffer
3258 // resource. For the Code Object V2 ABI, this will be the first 4 user
3259 // SGPR inputs. We can reserve those and use them directly.
3260
3261 Register PrivateSegmentBufferReg =
3262 Info.getPreloadedReg(AMDGPUFunctionArgInfo::PRIVATE_SEGMENT_BUFFER);
3263 Info.setScratchRSrcReg(PrivateSegmentBufferReg);
3264 } else {
3265 unsigned ReservedBufferReg = TRI.reservedPrivateSegmentBufferReg(MF);
3266 // We tentatively reserve the last registers (skipping the last registers
3267 // which may contain VCC, FLAT_SCR, and XNACK). After register allocation,
3268 // we'll replace these with the ones immediately after those which were
3269 // really allocated. In the prologue copies will be inserted from the
3270 // argument to these reserved registers.
3271
3272 // Without HSA, relocations are used for the scratch pointer and the
3273 // buffer resource setup is always inserted in the prologue. Scratch wave
3274 // offset is still in an input SGPR.
3275 Info.setScratchRSrcReg(ReservedBufferReg);
3276 }
3277 }
3278
3279 MachineRegisterInfo &MRI = MF.getRegInfo();
3280
3281 // For entry functions we have to set up the stack pointer if we use it,
3282 // whereas non-entry functions get this "for free". This means there is no
3283 // intrinsic advantage to using S32 over S34 in cases where we do not have
3284 // calls but do need a frame pointer (i.e. if we are requested to have one
3285 // because frame pointer elimination is disabled). To keep things simple we
3286 // only ever use S32 as the call ABI stack pointer, and so using it does not
3287 // imply we need a separate frame pointer.
3288 //
3289 // Try to use s32 as the SP, but move it if it would interfere with input
3290 // arguments. This won't work with calls though.
3291 //
3292 // FIXME: Move SP to avoid any possible inputs, or find a way to spill input
3293 // registers.
3294 if (!MRI.isLiveIn(AMDGPU::SGPR32)) {
3295 Info.setStackPtrOffsetReg(AMDGPU::SGPR32);
3296 } else {
3298
3299 if (MFI.hasCalls())
3300 report_fatal_error("call in graphics shader with too many input SGPRs");
3301
3302 for (unsigned Reg : AMDGPU::SGPR_32RegClass) {
3303 if (!MRI.isLiveIn(Reg)) {
3304 Info.setStackPtrOffsetReg(Reg);
3305 break;
3306 }
3307 }
3308
3309 if (Info.getStackPtrOffsetReg() == AMDGPU::SP_REG)
3310 report_fatal_error("failed to find register for SP");
3311 }
3312
3313 // hasFP should be accurate for entry functions even before the frame is
3314 // finalized, because it does not rely on the known stack size, only
3315 // properties like whether variable sized objects are present.
3316 if (ST.getFrameLowering()->hasFP(MF)) {
3317 Info.setFrameOffsetReg(AMDGPU::SGPR33);
3318 }
3319}
3320
3323 return !Info->isEntryFunction();
3324}
3325
3327
3328void SITargetLowering::insertCopiesSplitCSR(
3329 MachineBasicBlock *Entry,
3330 const SmallVectorImpl<MachineBasicBlock *> &Exits) const {
3331 const SIRegisterInfo *TRI = Subtarget->getRegisterInfo();
3332
3333 const MCPhysReg *IStart = TRI->getCalleeSavedRegsViaCopy(Entry->getParent());
3334 if (!IStart)
3335 return;
3336
3337 const TargetInstrInfo *TII = Subtarget->getInstrInfo();
3338 MachineRegisterInfo *MRI = &Entry->getParent()->getRegInfo();
3339 MachineBasicBlock::iterator MBBI = Entry->begin();
3340 for (const MCPhysReg *I = IStart; *I; ++I) {
3341 const TargetRegisterClass *RC = nullptr;
3342 if (AMDGPU::SReg_64RegClass.contains(*I))
3343 RC = &AMDGPU::SGPR_64RegClass;
3344 else if (AMDGPU::SReg_32RegClass.contains(*I))
3345 RC = &AMDGPU::SGPR_32RegClass;
3346 else
3347 llvm_unreachable("Unexpected register class in CSRsViaCopy!");
3348
3349 Register NewVR = MRI->createVirtualRegister(RC);
3350 // Create copy from CSR to a virtual register.
3351 Entry->addLiveIn(*I);
3352 BuildMI(*Entry, MBBI, DebugLoc(), TII->get(TargetOpcode::COPY), NewVR)
3353 .addReg(*I);
3354
3355 // Insert the copy-back instructions right before the terminator.
3356 for (auto *Exit : Exits)
3357 BuildMI(*Exit, Exit->getFirstTerminator(), DebugLoc(),
3358 TII->get(TargetOpcode::COPY), *I)
3359 .addReg(NewVR);
3360 }
3361}
3362
3363SDValue SITargetLowering::LowerFormalArguments(
3364 SDValue Chain, CallingConv::ID CallConv, bool isVarArg,
3365 const SmallVectorImpl<ISD::InputArg> &Ins, const SDLoc &DL,
3366 SelectionDAG &DAG, SmallVectorImpl<SDValue> &InVals) const {
3367 const SIRegisterInfo *TRI = Subtarget->getRegisterInfo();
3368
3369 MachineFunction &MF = DAG.getMachineFunction();
3370 const Function &Fn = MF.getFunction();
3371 FunctionType *FType = MF.getFunction().getFunctionType();
3372 SIMachineFunctionInfo *Info = MF.getInfo<SIMachineFunctionInfo>();
3373 bool IsError = false;
3374
3375 if (Subtarget->isAmdHsaOS() && AMDGPU::isGraphics(CallConv)) {
3377 Fn, "unsupported non-compute shaders with HSA", DL.getDebugLoc()));
3378 IsError = true;
3379 }
3380
3381 SmallVector<ISD::InputArg, 16> Splits;
3382 SmallVector<CCValAssign, 16> ArgLocs;
3383 BitVector Skipped(Ins.size());
3384 CCState CCInfo(CallConv, isVarArg, DAG.getMachineFunction(), ArgLocs,
3385 *DAG.getContext());
3386
3387 bool IsGraphics = AMDGPU::isGraphics(CallConv);
3388 bool IsKernel = AMDGPU::isKernel(CallConv);
3389 bool IsEntryFunc = AMDGPU::isEntryFunctionCC(CallConv);
3390
3391 if (IsGraphics) {
3392 const GCNUserSGPRUsageInfo &UserSGPRInfo = Info->getUserSGPRInfo();
3393 assert(!UserSGPRInfo.hasDispatchPtr() &&
3394 !UserSGPRInfo.hasKernargSegmentPtr() && !Info->hasWorkGroupInfo() &&
3395 !Info->hasLDSKernelId() && !Info->hasWorkItemIDX() &&
3396 !Info->hasWorkItemIDY() && !Info->hasWorkItemIDZ());
3397 (void)UserSGPRInfo;
3398 if (!Subtarget->hasFlatScratchEnabled())
3399 assert(!UserSGPRInfo.hasFlatScratchInit());
3400 if ((CallConv != CallingConv::AMDGPU_CS &&
3401 CallConv != CallingConv::AMDGPU_Gfx &&
3402 CallConv != CallingConv::AMDGPU_Gfx_WholeWave) ||
3403 !Subtarget->hasArchitectedSGPRs())
3404 assert(!Info->hasWorkGroupIDX() && !Info->hasWorkGroupIDY() &&
3405 !Info->hasWorkGroupIDZ());
3406 }
3407
3408 bool IsWholeWaveFunc = Info->isWholeWaveFunction();
3409
3410 if (CallConv == CallingConv::AMDGPU_PS) {
3411 processPSInputArgs(Splits, CallConv, Ins, Skipped, FType, Info);
3412
3413 // At least one interpolation mode must be enabled or else the GPU will
3414 // hang.
3415 //
3416 // Check PSInputAddr instead of PSInputEnable. The idea is that if the user
3417 // set PSInputAddr, the user wants to enable some bits after the compilation
3418 // based on run-time states. Since we can't know what the final PSInputEna
3419 // will look like, we shouldn't do anything here and the user should take
3420 // responsibility for the correct programming.
3421 //
3422 // Otherwise, the following restrictions apply:
3423 // - At least one of PERSP_* (0xF) or LINEAR_* (0x70) must be enabled.
3424 // - If POS_W_FLOAT (11) is enabled, at least one of PERSP_* must be
3425 // enabled too.
3426 if ((Info->getPSInputAddr() & 0x7F) == 0 ||
3427 ((Info->getPSInputAddr() & 0xF) == 0 && Info->isPSInputAllocated(11))) {
3428 CCInfo.AllocateReg(AMDGPU::VGPR0);
3429 CCInfo.AllocateReg(AMDGPU::VGPR1);
3430 Info->markPSInputAllocated(0);
3431 Info->markPSInputEnabled(0);
3432 }
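// For example, a pixel shader with no PERSP_* or LINEAR_* inputs enabled gets
// input 0 (PERSP_SAMPLE) force-enabled here and VGPR0/VGPR1 reserved, so the
// hardware sees at least one enabled interpolant.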
3433 if (Subtarget->isAmdPalOS()) {
3434 // For isAmdPalOS, the user does not enable some bits after compilation
3435 // based on run-time states; the register values being generated here are
3436 // the final ones set in hardware. Therefore we need to apply the
3437 // workaround to PSInputAddr and PSInputEnable together. (The case where
3438 // a bit is set in PSInputAddr but not PSInputEnable is where the
3439 // frontend set up an input arg for a particular interpolation mode, but
3440 // nothing uses that input arg. Really we should have an earlier pass
3441 // that removes such an arg.)
3442 unsigned PsInputBits = Info->getPSInputAddr() & Info->getPSInputEnable();
3443 if ((PsInputBits & 0x7F) == 0 ||
3444 ((PsInputBits & 0xF) == 0 && (PsInputBits >> 11 & 1)))
3445 Info->markPSInputEnabled(llvm::countr_zero(Info->getPSInputAddr()));
3446 }
3447 } else if (IsKernel) {
3448 assert(Info->hasWorkGroupIDX() && Info->hasWorkItemIDX());
3449 } else {
3450 Splits.append(IsWholeWaveFunc ? std::next(Ins.begin()) : Ins.begin(),
3451 Ins.end());
3452 }
3453
3454 if (IsKernel)
3455 analyzeFormalArgumentsCompute(CCInfo, Ins);
3456
3457 if (IsEntryFunc) {
3458 allocateSpecialEntryInputVGPRs(CCInfo, MF, *TRI, *Info);
3459 allocateHSAUserSGPRs(CCInfo, MF, *TRI, *Info);
3460 if (IsKernel && Subtarget->hasKernargPreload())
3461 allocatePreloadKernArgSGPRs(CCInfo, ArgLocs, Ins, MF, *TRI, *Info);
3462
3463 allocateLDSKernelId(CCInfo, MF, *TRI, *Info);
3464 } else if (!IsGraphics) {
3465 // For the fixed ABI, pass workitem IDs in the last argument register.
3466 allocateSpecialInputVGPRsFixed(CCInfo, MF, *TRI, *Info);
3467
3468 // FIXME: Sink this into allocateSpecialInputSGPRs
3469 if (!Subtarget->hasFlatScratchEnabled())
3470 CCInfo.AllocateReg(Info->getScratchRSrcReg());
3471
3472 allocateSpecialInputSGPRs(CCInfo, MF, *TRI, *Info);
3473 }
3474
3475 if (!IsKernel) {
3476 CCAssignFn *AssignFn = CCAssignFnForCall(CallConv, isVarArg);
3477 CCInfo.AnalyzeFormalArguments(Splits, AssignFn);
3478
3479 // This assumes the registers are allocated by CCInfo in ascending order
3480 // with no gaps.
3481 Info->setNumWaveDispatchSGPRs(
3482 CCInfo.getFirstUnallocated(AMDGPU::SGPR_32RegClass.getRegisters()));
3483 Info->setNumWaveDispatchVGPRs(
3484 CCInfo.getFirstUnallocated(AMDGPU::VGPR_32RegClass.getRegisters()));
3485 } else if (Info->getNumKernargPreloadedSGPRs()) {
3486 Info->setNumWaveDispatchSGPRs(Info->getNumUserSGPRs());
3487 }
3488
3489 SmallVector<SDValue, 16> Chains;
3490
3491 if (IsWholeWaveFunc) {
3492 SDValue Setup = DAG.getNode(AMDGPUISD::WHOLE_WAVE_SETUP, DL,
3493 {MVT::i1, MVT::Other}, Chain);
3494 InVals.push_back(Setup.getValue(0));
3495 Chains.push_back(Setup.getValue(1));
3496 }
3497
3498 // FIXME: This is the minimum kernel argument alignment. We should improve
3499 // this to the maximum alignment of the arguments.
3500 //
3501 // FIXME: Alignment of explicit arguments totally broken with non-0 explicit
3502 // kern arg offset.
3503 const Align KernelArgBaseAlign = Align(16);
3504
3505 for (unsigned i = IsWholeWaveFunc ? 1 : 0, e = Ins.size(), ArgIdx = 0; i != e;
3506 ++i) {
3507 const ISD::InputArg &Arg = Ins[i];
3508 if ((Arg.isOrigArg() && Skipped[Arg.getOrigArgIndex()]) || IsError) {
3509 InVals.push_back(DAG.getPOISON(Arg.VT));
3510 continue;
3511 }
3512
3513 CCValAssign &VA = ArgLocs[ArgIdx++];
3514 MVT VT = VA.getLocVT();
3515
3516 if (IsEntryFunc && VA.isMemLoc()) {
3517 VT = Ins[i].VT;
3518 EVT MemVT = VA.getLocVT();
3519
3520 const uint64_t Offset = VA.getLocMemOffset();
3521 Align Alignment = commonAlignment(KernelArgBaseAlign, Offset);
3522
3523 if (Arg.Flags.isByRef()) {
3524 SDValue Ptr = lowerKernArgParameterPtr(DAG, DL, Chain, Offset);
3525
3526 const GCNTargetMachine &TM =
3527 static_cast<const GCNTargetMachine &>(getTargetMachine());
3528 if (!TM.isNoopAddrSpaceCast(AMDGPUAS::CONSTANT_ADDRESS,
3529 Arg.Flags.getPointerAddrSpace())) {
3530 Ptr = DAG.getAddrSpaceCast(DL, Arg.VT, Ptr, AMDGPUAS::CONSTANT_ADDRESS,
3531 Arg.Flags.getPointerAddrSpace());
3532 }
3533
3534 InVals.push_back(Ptr);
3535 continue;
3536 }
3537
3538 SDValue NewArg;
3539 if (Arg.isOrigArg() && Info->getArgInfo().PreloadKernArgs.count(i)) {
3540 if (MemVT.getStoreSize() < 4 && Alignment < 4) {
3541 // In this case the argument is packed into the previous preload SGPR.
3542 int64_t AlignDownOffset = alignDown(Offset, 4);
3543 int64_t OffsetDiff = Offset - AlignDownOffset;
3544 EVT IntVT = MemVT.changeTypeToInteger();
3545
3546 const SIMachineFunctionInfo *Info =
3549 Register Reg =
3550 Info->getArgInfo().PreloadKernArgs.find(i)->getSecond().Regs[0];
3551
3552 assert(Reg);
3553 Register VReg = MRI.getLiveInVirtReg(Reg);
3554 SDValue Copy = DAG.getCopyFromReg(Chain, DL, VReg, MVT::i32);
3555
3556 SDValue ShiftAmt = DAG.getConstant(OffsetDiff * 8, DL, MVT::i32);
3557 SDValue Extract = DAG.getNode(ISD::SRL, DL, MVT::i32, Copy, ShiftAmt);
3558
3559 SDValue ArgVal = DAG.getNode(ISD::TRUNCATE, DL, IntVT, Extract);
3560 ArgVal = DAG.getNode(ISD::BITCAST, DL, MemVT, ArgVal);
3561 NewArg = convertArgType(DAG, VT, MemVT, DL, ArgVal,
3562 Ins[i].Flags.isSExt(), &Ins[i]);
3563
3564 NewArg = DAG.getMergeValues({NewArg, Copy.getValue(1)}, DL);
3565 } else {
3566 const SIMachineFunctionInfo *Info =
3569 const SmallVectorImpl<MCRegister> &PreloadRegs =
3570 Info->getArgInfo().PreloadKernArgs.find(i)->getSecond().Regs;
3571
3572 SDValue Copy;
3573 if (PreloadRegs.size() == 1) {
3574 Register VReg = MRI.getLiveInVirtReg(PreloadRegs[0]);
3575 const TargetRegisterClass *RC = MRI.getRegClass(VReg);
3576 NewArg = DAG.getCopyFromReg(
3577 Chain, DL, VReg,
3579 TRI->getRegSizeInBits(*RC)));
3580
3581 } else {
3582 // If the kernarg alignment does not match the alignment of the SGPR
3583 // tuple RC that can accommodate this argument, it will be built up
3584 // via copies from the individual SGPRs that the argument was
3585 // preloaded to.
3587 for (auto Reg : PreloadRegs) {
3588 Register VReg = MRI.getLiveInVirtReg(Reg);
3589 Copy = DAG.getCopyFromReg(Chain, DL, VReg, MVT::i32);
3590 Elts.push_back(Copy);
3591 }
3592 NewArg =
3593 DAG.getBuildVector(EVT::getVectorVT(*DAG.getContext(), MVT::i32,
3594 PreloadRegs.size()),
3595 DL, Elts);
3596 }
3597
3598 // If the argument was preloaded to multiple consecutive 32-bit
3599 // registers because of misalignment between addressable SGPR tuples
3600 // and the argument size, we can still assume, because of kernarg
3601 // segment alignment restrictions, that NewArg's size is the same as
3602 // MemVT and just do a bitcast. If MemVT is less than 32 bits, we add a
3603 // truncate, since we cannot preload to less than a single SGPR and
3604 // MemVT may be smaller.
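          // Rough example (hypothetical layout): an i64 argument preloaded into
          // two consecutive SGPRs that do not form an addressable 64-bit tuple
          // is rebuilt as a v2i32 build_vector and bitcast to i64; no truncate
          // is needed there because MemVT is not smaller than the vector.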
3605 EVT MemVTInt =
3607 if (MemVT.bitsLT(NewArg.getSimpleValueType()))
3608 NewArg = DAG.getNode(ISD::TRUNCATE, DL, MemVTInt, NewArg);
3609
3610 NewArg = DAG.getBitcast(MemVT, NewArg);
3611 NewArg = convertArgType(DAG, VT, MemVT, DL, NewArg,
3612 Ins[i].Flags.isSExt(), &Ins[i]);
3613 NewArg = DAG.getMergeValues({NewArg, Chain}, DL);
3614 }
3615 } else {
3616 // Hidden arguments that are in the kernel signature must be preloaded
3617 // to user SGPRs. Print a diagnostic error if a hidden argument is in
3618 // the argument list and is not preloaded.
3619 if (Arg.isOrigArg()) {
3620 Argument *OrigArg = Fn.getArg(Arg.getOrigArgIndex());
3621 if (OrigArg->hasAttribute("amdgpu-hidden-argument")) {
3623 *OrigArg->getParent(),
3624 "hidden argument in kernel signature was not preloaded",
3625 DL.getDebugLoc()));
3626 }
3627 }
3628
3629 NewArg =
3630 lowerKernargMemParameter(DAG, VT, MemVT, DL, Chain, Offset,
3631 Alignment, Ins[i].Flags.isSExt(), &Ins[i]);
3632 }
3633 Chains.push_back(NewArg.getValue(1));
3634
3635 auto *ParamTy =
3636 dyn_cast<PointerType>(FType->getParamType(Ins[i].getOrigArgIndex()));
3637 if (Subtarget->getGeneration() == AMDGPUSubtarget::SOUTHERN_ISLANDS &&
3638 ParamTy &&
3639 (ParamTy->getAddressSpace() == AMDGPUAS::LOCAL_ADDRESS ||
3640 ParamTy->getAddressSpace() == AMDGPUAS::REGION_ADDRESS)) {
3641 // On SI, local pointers are just offsets into LDS, so they are always
3642 // less than 16 bits. On CI and newer they could potentially be
3643 // real pointers, so we can't guarantee their size.
3644 NewArg = DAG.getNode(ISD::AssertZext, DL, NewArg.getValueType(), NewArg,
3645 DAG.getValueType(MVT::i16));
3646 }
3647
3648 InVals.push_back(NewArg);
3649 continue;
3650 }
3651 if (!IsEntryFunc && VA.isMemLoc()) {
3652 SDValue Val = lowerStackParameter(DAG, VA, DL, Chain, Arg);
3653 InVals.push_back(Val);
3654 if (!Arg.Flags.isByVal())
3655 Chains.push_back(Val.getValue(1));
3656 continue;
3657 }
3658
3659 assert(VA.isRegLoc() && "Parameter must be in a register!");
3660
3661 Register Reg = VA.getLocReg();
3662 const TargetRegisterClass *RC = nullptr;
3663 if (AMDGPU::VGPR_32RegClass.contains(Reg))
3664 RC = &AMDGPU::VGPR_32RegClass;
3665 else if (AMDGPU::SGPR_32RegClass.contains(Reg))
3666 RC = &AMDGPU::SGPR_32RegClass;
3667 else
3668 llvm_unreachable("Unexpected register class in LowerFormalArguments!");
3669
3670 Reg = MF.addLiveIn(Reg, RC);
3671 SDValue Val = DAG.getCopyFromReg(Chain, DL, Reg, VT);
3672 if (Arg.Flags.isInReg() && RC == &AMDGPU::VGPR_32RegClass) {
3673 // FIXME: Need to forward the chains created by `CopyFromReg`s, make sure
3674 // they will read physical regs before any side effect instructions.
3675 SDValue ReadFirstLane =
3676 DAG.getTargetConstant(Intrinsic::amdgcn_readfirstlane, DL, MVT::i32);
3678 ReadFirstLane, Val);
3679 }
3680
3681 if (Arg.Flags.isSRet()) {
3682 // The return object should be reasonably addressable.
3683
3684 // FIXME: This helps when the return is a real sret. If it is an
3685 // automatically inserted sret (i.e. CanLowerReturn returns false), an
3686 // extra copy is inserted in SelectionDAGBuilder which obscures this.
3687 unsigned NumBits =
3689 Val = DAG.getNode(
3690 ISD::AssertZext, DL, VT, Val,
3691 DAG.getValueType(EVT::getIntegerVT(*DAG.getContext(), NumBits)));
3692 }
3693
3694 Val = convertABITypeToValueType(DAG, Val, VA, DL);
3695 InVals.push_back(Val);
3696 }
3697
3698 // Start adding system SGPRs.
3699 if (IsEntryFunc)
3700 allocateSystemSGPRs(CCInfo, MF, *Info, CallConv, IsGraphics);
3701
3702 unsigned StackArgSize = CCInfo.getStackSize();
3703 Info->setBytesInStackArgArea(StackArgSize);
3704
3705 return Chains.empty() ? Chain
3706 : DAG.getNode(ISD::TokenFactor, DL, MVT::Other, Chains);
3707}
3708
3709// TODO: If return values can't fit in registers, we should return as many as
3710// possible in registers before passing the rest on the stack.
3712 CallingConv::ID CallConv, MachineFunction &MF, bool IsVarArg,
3713 const SmallVectorImpl<ISD::OutputArg> &Outs, LLVMContext &Context,
3714 const Type *RetTy) const {
3715 // Replacing returns with sret/stack usage doesn't make sense for shaders.
3716 // FIXME: Also sort of a workaround for custom vector splitting in LowerReturn
3717 // for shaders. Vector types should be explicitly handled by CC.
3718 if (AMDGPU::isEntryFunctionCC(CallConv))
3719 return true;
3720
3722 CCState CCInfo(CallConv, IsVarArg, MF, RVLocs, Context);
3723 if (!CCInfo.CheckReturn(Outs, CCAssignFnForReturn(CallConv, IsVarArg)))
3724 return false;
3725
3726 // We must use the stack if return would require unavailable registers.
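  // For example (sketch): if this function is limited to 32 VGPRs but the
  // return value assignment reached VGPR32 or above, those registers exist in
  // the class yet are unavailable here, so the return is demoted to sret.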
3727 unsigned MaxNumVGPRs = Subtarget->getMaxNumVGPRs(MF);
3728 unsigned TotalNumVGPRs = Subtarget->getAddressableNumArchVGPRs();
3729 for (unsigned i = MaxNumVGPRs; i < TotalNumVGPRs; ++i)
3730 if (CCInfo.isAllocated(AMDGPU::VGPR_32RegClass.getRegister(i)))
3731 return false;
3732
3733 return true;
3734}
3735
3736SDValue
3738 bool isVarArg,
3740 const SmallVectorImpl<SDValue> &OutVals,
3741 const SDLoc &DL, SelectionDAG &DAG) const {
3745
3746 if (AMDGPU::isKernel(CallConv)) {
3747 return AMDGPUTargetLowering::LowerReturn(Chain, CallConv, isVarArg, Outs,
3748 OutVals, DL, DAG);
3749 }
3750
3751 bool IsShader = AMDGPU::isShader(CallConv);
3752
3753 Info->setIfReturnsVoid(Outs.empty());
3754 bool IsWaveEnd = Info->returnsVoid() && IsShader;
3755
3756 // CCValAssign - represent the assignment of the return value to a location.
3758
3759 // CCState - Info about the registers and stack slots.
3760 CCState CCInfo(CallConv, isVarArg, DAG.getMachineFunction(), RVLocs,
3761 *DAG.getContext());
3762
3763 // Analyze outgoing return values.
3764 CCInfo.AnalyzeReturn(Outs, CCAssignFnForReturn(CallConv, isVarArg));
3765
3766 SDValue Glue;
3768 RetOps.push_back(Chain); // Operand #0 = Chain (updated below)
3769
3770 SDValue ReadFirstLane =
3771 DAG.getTargetConstant(Intrinsic::amdgcn_readfirstlane, DL, MVT::i32);
3772 // Copy the result values into the output registers.
3773 for (unsigned I = 0, RealRVLocIdx = 0, E = RVLocs.size(); I != E;
3774 ++I, ++RealRVLocIdx) {
3775 CCValAssign &VA = RVLocs[I];
3776 assert(VA.isRegLoc() && "Can only return in registers!");
3777 // TODO: Partially return in registers if return values don't fit.
3778 SDValue Arg = OutVals[RealRVLocIdx];
3779
3780 // Copied from other backends.
3781 switch (VA.getLocInfo()) {
3782 case CCValAssign::Full:
3783 break;
3784 case CCValAssign::BCvt:
3785 Arg = DAG.getNode(ISD::BITCAST, DL, VA.getLocVT(), Arg);
3786 break;
3787 case CCValAssign::SExt:
3788 Arg = DAG.getNode(ISD::SIGN_EXTEND, DL, VA.getLocVT(), Arg);
3789 break;
3790 case CCValAssign::ZExt:
3791 Arg = DAG.getNode(ISD::ZERO_EXTEND, DL, VA.getLocVT(), Arg);
3792 break;
3793 case CCValAssign::AExt:
3794 Arg = DAG.getNode(ISD::ANY_EXTEND, DL, VA.getLocVT(), Arg);
3795 break;
3796 default:
3797 llvm_unreachable("Unknown loc info!");
3798 }
3799 if (TRI->isSGPRPhysReg(VA.getLocReg()))
3801 ReadFirstLane, Arg);
3802 Chain = DAG.getCopyToReg(Chain, DL, VA.getLocReg(), Arg, Glue);
3803 Glue = Chain.getValue(1);
3804 RetOps.push_back(DAG.getRegister(VA.getLocReg(), VA.getLocVT()));
3805 }
3806
3807 // FIXME: Does sret work properly?
3808 if (!Info->isEntryFunction()) {
3809 const SIRegisterInfo *TRI = Subtarget->getRegisterInfo();
3810 const MCPhysReg *I =
3811 TRI->getCalleeSavedRegsViaCopy(&DAG.getMachineFunction());
3812 if (I) {
3813 for (; *I; ++I) {
3814 if (AMDGPU::SReg_64RegClass.contains(*I))
3815 RetOps.push_back(DAG.getRegister(*I, MVT::i64));
3816 else if (AMDGPU::SReg_32RegClass.contains(*I))
3817 RetOps.push_back(DAG.getRegister(*I, MVT::i32));
3818 else
3819 llvm_unreachable("Unexpected register class in CSRsViaCopy!");
3820 }
3821 }
3822 }
3823
3824 // Update chain and glue.
3825 RetOps[0] = Chain;
3826 if (Glue.getNode())
3827 RetOps.push_back(Glue);
3828
3829 unsigned Opc = AMDGPUISD::ENDPGM;
3830 if (!IsWaveEnd)
3831 Opc = Info->isWholeWaveFunction() ? AMDGPUISD::WHOLE_WAVE_RETURN
3832 : IsShader ? AMDGPUISD::RETURN_TO_EPILOG
3833 : AMDGPUISD::RET_GLUE;
3834 return DAG.getNode(Opc, DL, MVT::Other, RetOps);
3835}
3836
3838 SDValue Chain, SDValue InGlue, CallingConv::ID CallConv, bool IsVarArg,
3839 const SmallVectorImpl<ISD::InputArg> &Ins, const SDLoc &DL,
3840 SelectionDAG &DAG, SmallVectorImpl<SDValue> &InVals, bool IsThisReturn,
3841 SDValue ThisVal) const {
3842 CCAssignFn *RetCC = CCAssignFnForReturn(CallConv, IsVarArg);
3843
3844 // Assign locations to each value returned by this call.
3846 CCState CCInfo(CallConv, IsVarArg, DAG.getMachineFunction(), RVLocs,
3847 *DAG.getContext());
3848 CCInfo.AnalyzeCallResult(Ins, RetCC);
3849
3850 // Copy all of the result registers out of their specified physreg.
3851 for (CCValAssign VA : RVLocs) {
3852 SDValue Val;
3853
3854 if (VA.isRegLoc()) {
3855 Val =
3856 DAG.getCopyFromReg(Chain, DL, VA.getLocReg(), VA.getLocVT(), InGlue);
3857 Chain = Val.getValue(1);
3858 InGlue = Val.getValue(2);
3859 } else if (VA.isMemLoc()) {
3860 report_fatal_error("TODO: return values in memory");
3861 } else
3862 llvm_unreachable("unknown argument location type");
3863
3864 switch (VA.getLocInfo()) {
3865 case CCValAssign::Full:
3866 break;
3867 case CCValAssign::BCvt:
3868 Val = DAG.getNode(ISD::BITCAST, DL, VA.getValVT(), Val);
3869 break;
3870 case CCValAssign::ZExt:
3871 Val = DAG.getNode(ISD::AssertZext, DL, VA.getLocVT(), Val,
3872 DAG.getValueType(VA.getValVT()));
3873 Val = DAG.getNode(ISD::TRUNCATE, DL, VA.getValVT(), Val);
3874 break;
3875 case CCValAssign::SExt:
3876 Val = DAG.getNode(ISD::AssertSext, DL, VA.getLocVT(), Val,
3877 DAG.getValueType(VA.getValVT()));
3878 Val = DAG.getNode(ISD::TRUNCATE, DL, VA.getValVT(), Val);
3879 break;
3880 case CCValAssign::AExt:
3881 Val = DAG.getNode(ISD::TRUNCATE, DL, VA.getValVT(), Val);
3882 break;
3883 default:
3884 llvm_unreachable("Unknown loc info!");
3885 }
3886
3887 InVals.push_back(Val);
3888 }
3889
3890 return Chain;
3891}
3892
3893// Add code to pass special inputs that are required depending on the features
3894// used, separate from the explicit user arguments present in the IR.
3896 CallLoweringInfo &CLI, CCState &CCInfo, const SIMachineFunctionInfo &Info,
3897 SmallVectorImpl<std::pair<unsigned, SDValue>> &RegsToPass,
3898 SmallVectorImpl<SDValue> &MemOpChains, SDValue Chain) const {
3899 // If we don't have a call site, this was a call inserted by
3900 // legalization. These can never use special inputs.
3901 if (!CLI.CB)
3902 return;
3903
3904 SelectionDAG &DAG = CLI.DAG;
3905 const SDLoc &DL = CLI.DL;
3906 const Function &F = DAG.getMachineFunction().getFunction();
3907
3908 const SIRegisterInfo *TRI = Subtarget->getRegisterInfo();
3909 const AMDGPUFunctionArgInfo &CallerArgInfo = Info.getArgInfo();
3910
3911 const AMDGPUFunctionArgInfo &CalleeArgInfo =
3913
3914 // TODO: Unify with private memory register handling. This is complicated by
3915 // the fact that at least in kernels, the input argument is not necessarily
3916 // in the same location as the input.
3917 // clang-format off
3918 static constexpr std::pair<AMDGPUFunctionArgInfo::PreloadedValue,
3919 std::array<StringLiteral, 2>> ImplicitAttrs[] = {
3920 {AMDGPUFunctionArgInfo::DISPATCH_PTR, {"amdgpu-no-dispatch-ptr", ""}},
3921 {AMDGPUFunctionArgInfo::QUEUE_PTR, {"amdgpu-no-queue-ptr", ""}},
3922 {AMDGPUFunctionArgInfo::IMPLICIT_ARG_PTR, {"amdgpu-no-implicitarg-ptr", ""}},
3923 {AMDGPUFunctionArgInfo::DISPATCH_ID, {"amdgpu-no-dispatch-id", ""}},
3924 {AMDGPUFunctionArgInfo::WORKGROUP_ID_X, {"amdgpu-no-workgroup-id-x", "amdgpu-no-cluster-id-x"}},
3925 {AMDGPUFunctionArgInfo::WORKGROUP_ID_Y, {"amdgpu-no-workgroup-id-y", "amdgpu-no-cluster-id-y"}},
3926 {AMDGPUFunctionArgInfo::WORKGROUP_ID_Z, {"amdgpu-no-workgroup-id-z", "amdgpu-no-cluster-id-z"}},
3927 {AMDGPUFunctionArgInfo::LDS_KERNEL_ID, {"amdgpu-no-lds-kernel-id", ""}},
3928 };
3929 // clang-format on
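  // For example, a call site carrying both "amdgpu-no-workgroup-id-x" and
  // "amdgpu-no-cluster-id-x" lets us skip forwarding WORKGROUP_ID_X entirely;
  // entries whose second attribute is empty only require the first one.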
3930
3931 for (auto [InputID, Attrs] : ImplicitAttrs) {
3932 // If the callee does not use the attribute value, skip copying the value.
3933 if (all_of(Attrs, [&](StringRef Attr) {
3934 return Attr.empty() || CLI.CB->hasFnAttr(Attr);
3935 }))
3936 continue;
3937
3938 const auto [OutgoingArg, ArgRC, ArgTy] =
3939 CalleeArgInfo.getPreloadedValue(InputID);
3940 if (!OutgoingArg)
3941 continue;
3942
3943 const auto [IncomingArg, IncomingArgRC, Ty] =
3944 CallerArgInfo.getPreloadedValue(InputID);
3945 assert(IncomingArgRC == ArgRC);
3946
3947 // All special arguments are ints for now.
3948 EVT ArgVT = TRI->getSpillSize(*ArgRC) == 8 ? MVT::i64 : MVT::i32;
3949 SDValue InputReg;
3950
3951 if (IncomingArg) {
3952 InputReg = loadInputValue(DAG, ArgRC, ArgVT, DL, *IncomingArg);
3953 } else if (InputID == AMDGPUFunctionArgInfo::IMPLICIT_ARG_PTR) {
3954 // The implicit arg ptr is special because it doesn't have a corresponding
3955 // input for kernels, and is computed from the kernarg segment pointer.
3956 InputReg = getImplicitArgPtr(DAG, DL);
3957 } else if (InputID == AMDGPUFunctionArgInfo::LDS_KERNEL_ID) {
3958 std::optional<uint32_t> Id =
3960 if (Id.has_value()) {
3961 InputReg = DAG.getConstant(*Id, DL, ArgVT);
3962 } else {
3963 InputReg = DAG.getPOISON(ArgVT);
3964 }
3965 } else {
3966 // We may have proven the input wasn't needed, although the ABI still
3967 // requires it. We just need to allocate the register appropriately.
3968 InputReg = DAG.getPOISON(ArgVT);
3969 }
3970
3971 if (OutgoingArg->isRegister()) {
3972 RegsToPass.emplace_back(OutgoingArg->getRegister(), InputReg);
3973 if (!CCInfo.AllocateReg(OutgoingArg->getRegister()))
3974 report_fatal_error("failed to allocate implicit input argument");
3975 } else {
3976 unsigned SpecialArgOffset =
3977 CCInfo.AllocateStack(ArgVT.getStoreSize(), Align(4));
3978 SDValue ArgStore =
3979 storeStackInputValue(DAG, DL, Chain, InputReg, SpecialArgOffset);
3980 MemOpChains.push_back(ArgStore);
3981 }
3982 }
3983
3984 // Pack the workitem IDs into a single register, or pass them as-is if they
3985 // are already packed.
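  // The packed layout assumed below (matching the 10- and 20-bit shifts) is,
  // roughly:
  //   bits [9:0]   workitem ID X
  //   bits [19:10] workitem ID Y
  //   bits [29:20] workitem ID Z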
3986
3987 auto [OutgoingArg, ArgRC, Ty] =
3989 if (!OutgoingArg)
3990 std::tie(OutgoingArg, ArgRC, Ty) =
3992 if (!OutgoingArg)
3993 std::tie(OutgoingArg, ArgRC, Ty) =
3995 if (!OutgoingArg)
3996 return;
3997
3998 const ArgDescriptor *IncomingArgX = std::get<0>(
4000 const ArgDescriptor *IncomingArgY = std::get<0>(
4002 const ArgDescriptor *IncomingArgZ = std::get<0>(
4004
4005 SDValue InputReg;
4006 SDLoc SL;
4007
4008 const bool NeedWorkItemIDX = !CLI.CB->hasFnAttr("amdgpu-no-workitem-id-x");
4009 const bool NeedWorkItemIDY = !CLI.CB->hasFnAttr("amdgpu-no-workitem-id-y");
4010 const bool NeedWorkItemIDZ = !CLI.CB->hasFnAttr("amdgpu-no-workitem-id-z");
4011
4012 // If the incoming IDs are not packed, we need to pack them.
4013 if (IncomingArgX && !IncomingArgX->isMasked() && CalleeArgInfo.WorkItemIDX &&
4014 NeedWorkItemIDX) {
4015 if (Subtarget->getMaxWorkitemID(F, 0) != 0) {
4016 InputReg = loadInputValue(DAG, ArgRC, MVT::i32, DL, *IncomingArgX);
4017 } else {
4018 InputReg = DAG.getConstant(0, DL, MVT::i32);
4019 }
4020 }
4021
4022 if (IncomingArgY && !IncomingArgY->isMasked() && CalleeArgInfo.WorkItemIDY &&
4023 NeedWorkItemIDY && Subtarget->getMaxWorkitemID(F, 1) != 0) {
4024 SDValue Y = loadInputValue(DAG, ArgRC, MVT::i32, DL, *IncomingArgY);
4025 Y = DAG.getNode(ISD::SHL, SL, MVT::i32, Y,
4026 DAG.getShiftAmountConstant(10, MVT::i32, SL));
4027 InputReg = InputReg.getNode()
4028 ? DAG.getNode(ISD::OR, SL, MVT::i32, InputReg, Y)
4029 : Y;
4030 }
4031
4032 if (IncomingArgZ && !IncomingArgZ->isMasked() && CalleeArgInfo.WorkItemIDZ &&
4033 NeedWorkItemIDZ && Subtarget->getMaxWorkitemID(F, 2) != 0) {
4034 SDValue Z = loadInputValue(DAG, ArgRC, MVT::i32, DL, *IncomingArgZ);
4035 Z = DAG.getNode(ISD::SHL, SL, MVT::i32, Z,
4036 DAG.getShiftAmountConstant(20, MVT::i32, SL));
4037 InputReg = InputReg.getNode()
4038 ? DAG.getNode(ISD::OR, SL, MVT::i32, InputReg, Z)
4039 : Z;
4040 }
4041
4042 if (!InputReg && (NeedWorkItemIDX || NeedWorkItemIDY || NeedWorkItemIDZ)) {
4043 if (!IncomingArgX && !IncomingArgY && !IncomingArgZ) {
4044 // We're in a situation where the outgoing function requires the workitem
4045 // ID, but the calling function does not have it (e.g. a graphics function
4046 // calling a C calling convention function). This is illegal, but we need
4047 // to produce something.
4048 InputReg = DAG.getPOISON(MVT::i32);
4049 } else {
4050 // The workitem IDs are already packed, so any of the present incoming
4051 // arguments will carry all of the required fields.
4052 ArgDescriptor IncomingArg =
4053 ArgDescriptor::createArg(IncomingArgX ? *IncomingArgX
4054 : IncomingArgY ? *IncomingArgY
4055 : *IncomingArgZ,
4056 ~0u);
4057 InputReg = loadInputValue(DAG, ArgRC, MVT::i32, DL, IncomingArg);
4058 }
4059 }
4060
4061 if (OutgoingArg->isRegister()) {
4062 if (InputReg)
4063 RegsToPass.emplace_back(OutgoingArg->getRegister(), InputReg);
4064
4065 CCInfo.AllocateReg(OutgoingArg->getRegister());
4066 } else {
4067 unsigned SpecialArgOffset = CCInfo.AllocateStack(4, Align(4));
4068 if (InputReg) {
4069 SDValue ArgStore =
4070 storeStackInputValue(DAG, DL, Chain, InputReg, SpecialArgOffset);
4071 MemOpChains.push_back(ArgStore);
4072 }
4073 }
4074}
4075
4077 SDValue Callee, CallingConv::ID CalleeCC, bool IsVarArg,
4079 const SmallVectorImpl<SDValue> &OutVals,
4080 const SmallVectorImpl<ISD::InputArg> &Ins, SelectionDAG &DAG) const {
4081 if (AMDGPU::isChainCC(CalleeCC))
4082 return true;
4083
4084 if (!AMDGPU::mayTailCallThisCC(CalleeCC))
4085 return false;
4086
4087 // For a divergent call target, we need to do a waterfall loop over the
4088 // possible callees which precludes us from using a simple jump.
4089 if (Callee->isDivergent())
4090 return false;
4091
4093 const Function &CallerF = MF.getFunction();
4094 CallingConv::ID CallerCC = CallerF.getCallingConv();
4096 const uint32_t *CallerPreserved = TRI->getCallPreservedMask(MF, CallerCC);
4097
4098 // Kernels aren't callable, and don't have a live-in return address, so it
4099 // doesn't make sense to do a tail call with entry functions.
4100 if (!CallerPreserved)
4101 return false;
4102
4103 bool CCMatch = CallerCC == CalleeCC;
4104
4106 if (AMDGPU::canGuaranteeTCO(CalleeCC) && CCMatch)
4107 return true;
4108 return false;
4109 }
4110
4111 // TODO: Can we handle var args?
4112 if (IsVarArg)
4113 return false;
4114
4115 for (const Argument &Arg : CallerF.args()) {
4116 if (Arg.hasByValAttr())
4117 return false;
4118 }
4119
4120 LLVMContext &Ctx = *DAG.getContext();
4121
4122 // Check that the call results are passed in the same way.
4123 if (!CCState::resultsCompatible(CalleeCC, CallerCC, MF, Ctx, Ins,
4124 CCAssignFnForCall(CalleeCC, IsVarArg),
4125 CCAssignFnForCall(CallerCC, IsVarArg)))
4126 return false;
4127
4128 // The callee has to preserve all registers the caller needs to preserve.
4129 if (!CCMatch) {
4130 const uint32_t *CalleePreserved = TRI->getCallPreservedMask(MF, CalleeCC);
4131 if (!TRI->regmaskSubsetEqual(CallerPreserved, CalleePreserved))
4132 return false;
4133 }
4134
4135 // Nothing more to check if the callee is taking no arguments.
4136 if (Outs.empty())
4137 return true;
4138
4140 CCState CCInfo(CalleeCC, IsVarArg, MF, ArgLocs, Ctx);
4141
4142 // FIXME: We are not allocating special input registers, so we will be
4143 // deciding based on incorrect register assignments.
4144 CCInfo.AnalyzeCallOperands(Outs, CCAssignFnForCall(CalleeCC, IsVarArg));
4145
4146 const SIMachineFunctionInfo *FuncInfo = MF.getInfo<SIMachineFunctionInfo>();
4147 // If the stack arguments for this call do not fit into our own save area,
4148 // then the call cannot be made a tail call.
4149 // TODO: Is this really necessary?
4150 if (CCInfo.getStackSize() > FuncInfo->getBytesInStackArgArea())
4151 return false;
4152
4153 for (const auto &[CCVA, ArgVal] : zip_equal(ArgLocs, OutVals)) {
4154 // FIXME: What about inreg arguments that end up passed in memory?
4155 if (!CCVA.isRegLoc())
4156 continue;
4157
4158 // If we are passing an argument in an SGPR, and the value is divergent,
4159 // this call requires a waterfall loop.
4160 if (ArgVal->isDivergent() && TRI->isSGPRPhysReg(CCVA.getLocReg())) {
4161 LLVM_DEBUG(
4162 dbgs() << "Cannot tail call due to divergent outgoing argument in "
4163 << printReg(CCVA.getLocReg(), TRI) << '\n');
4164 return false;
4165 }
4166 }
4167
4168 const MachineRegisterInfo &MRI = MF.getRegInfo();
4169 return parametersInCSRMatch(MRI, CallerPreserved, ArgLocs, OutVals);
4170}
4171
4173 if (!CI->isTailCall())
4174 return false;
4175
4176 const Function *ParentFn = CI->getFunction();
4178 return false;
4179 return true;
4180}
4181
4182namespace {
4183// Chain calls have special arguments that we need to handle. These are
4184// appended at the end of the argument list, after the SGPR and VGPR
4185// arguments (indices 0 and 1 respectively).
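//
// As a rough illustration (positions only, values are hypothetical), a
// dynamic-VGPR chain call's argument list looks like:
//   [0] SGPR args, [1] VGPR args, [2] EXEC mask, [3] flags (bit 0 set),
//   [4] number of VGPRs, [5] fallback EXEC, [6] fallback callee
// With flags == 0, everything from index 4 onwards must be absent.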
4186enum ChainCallArgIdx {
4187 Exec = 2,
4188 Flags,
4189 NumVGPRs,
4190 FallbackExec,
4191 FallbackCallee
4192};
4193} // anonymous namespace
4194
4195// The wave scratch offset register is used as the global base pointer.
4197 SmallVectorImpl<SDValue> &InVals) const {
4198 CallingConv::ID CallConv = CLI.CallConv;
4199 bool IsChainCallConv = AMDGPU::isChainCC(CallConv);
4200
4201 SelectionDAG &DAG = CLI.DAG;
4202
4203 const SDLoc &DL = CLI.DL;
4204 SDValue Chain = CLI.Chain;
4205 SDValue Callee = CLI.Callee;
4206
4207 llvm::SmallVector<SDValue, 6> ChainCallSpecialArgs;
4208 bool UsesDynamicVGPRs = false;
4209 if (IsChainCallConv) {
4210 // The last arguments should be the value that we need to put in EXEC,
4211 // followed by the flags and any other arguments with special meanings.
4212 // Pop them out of CLI.Outs and CLI.OutVals before we do any processing so
4213 // we don't treat them like the "real" arguments.
4214 auto RequestedExecIt =
4215 llvm::find_if(CLI.Outs, [](const ISD::OutputArg &Arg) {
4216 return Arg.OrigArgIndex == 2;
4217 });
4218 assert(RequestedExecIt != CLI.Outs.end() && "No node for EXEC");
4219
4220 size_t SpecialArgsBeginIdx = RequestedExecIt - CLI.Outs.begin();
4221 CLI.OutVals.erase(CLI.OutVals.begin() + SpecialArgsBeginIdx,
4222 CLI.OutVals.end());
4223 CLI.Outs.erase(RequestedExecIt, CLI.Outs.end());
4224
4225 assert(CLI.Outs.back().OrigArgIndex < 2 &&
4226 "Haven't popped all the special args");
4227
4228 TargetLowering::ArgListEntry RequestedExecArg =
4229 CLI.Args[ChainCallArgIdx::Exec];
4230 if (!RequestedExecArg.Ty->isIntegerTy(Subtarget->getWavefrontSize()))
4231 return lowerUnhandledCall(CLI, InVals, "Invalid value for EXEC");
4232
4233 // Convert constants into TargetConstants, so they become immediate operands
4234 // instead of being selected into S_MOV.
4235 auto PushNodeOrTargetConstant = [&](TargetLowering::ArgListEntry Arg) {
4236 if (const auto *ArgNode = dyn_cast<ConstantSDNode>(Arg.Node)) {
4237 ChainCallSpecialArgs.push_back(DAG.getTargetConstant(
4238 ArgNode->getAPIntValue(), DL, ArgNode->getValueType(0)));
4239 } else
4240 ChainCallSpecialArgs.push_back(Arg.Node);
4241 };
4242
4243 PushNodeOrTargetConstant(RequestedExecArg);
4244
4245 // Process any other special arguments depending on the value of the flags.
4246 TargetLowering::ArgListEntry Flags = CLI.Args[ChainCallArgIdx::Flags];
4247
4248 const APInt &FlagsValue = cast<ConstantSDNode>(Flags.Node)->getAPIntValue();
4249 if (FlagsValue.isZero()) {
4250 if (CLI.Args.size() > ChainCallArgIdx::Flags + 1)
4251 return lowerUnhandledCall(CLI, InVals,
4252 "no additional args allowed if flags == 0");
4253 } else if (FlagsValue.isOneBitSet(0)) {
4254 if (CLI.Args.size() != ChainCallArgIdx::FallbackCallee + 1) {
4255 return lowerUnhandledCall(CLI, InVals, "expected 3 additional args");
4256 }
4257
4258 if (!Subtarget->isWave32()) {
4259 return lowerUnhandledCall(
4260 CLI, InVals, "dynamic VGPR mode is only supported for wave32");
4261 }
4262
4263 UsesDynamicVGPRs = true;
4264 std::for_each(CLI.Args.begin() + ChainCallArgIdx::NumVGPRs,
4265 CLI.Args.end(), PushNodeOrTargetConstant);
4266 }
4267 }
4268
4270 SmallVector<SDValue, 32> &OutVals = CLI.OutVals;
4272 bool &IsTailCall = CLI.IsTailCall;
4273 bool IsVarArg = CLI.IsVarArg;
4274 bool IsSibCall = false;
4276
4277 if (Callee.isUndef() || isNullConstant(Callee)) {
4278 if (!CLI.IsTailCall) {
4279 for (ISD::InputArg &Arg : CLI.Ins)
4280 InVals.push_back(DAG.getPOISON(Arg.VT));
4281 }
4282
4283 return Chain;
4284 }
4285
4286 if (IsVarArg) {
4287 return lowerUnhandledCall(CLI, InVals,
4288 "unsupported call to variadic function ");
4289 }
4290
4291 if (!CLI.CB)
4292 return lowerUnhandledCall(CLI, InVals, "unsupported libcall legalization");
4293
4294 if (IsTailCall && MF.getTarget().Options.GuaranteedTailCallOpt) {
4295 return lowerUnhandledCall(CLI, InVals,
4296 "unsupported required tail call to function ");
4297 }
4298
4299 if (IsTailCall) {
4300 IsTailCall = isEligibleForTailCallOptimization(Callee, CallConv, IsVarArg,
4301 Outs, OutVals, Ins, DAG);
4302 if (!IsTailCall &&
4303 ((CLI.CB && CLI.CB->isMustTailCall()) || IsChainCallConv)) {
4304 report_fatal_error("failed to perform tail call elimination on a call "
4305 "site marked musttail or on llvm.amdgcn.cs.chain");
4306 }
4307
4308 bool TailCallOpt = MF.getTarget().Options.GuaranteedTailCallOpt;
4309
4310 // A sibling call is one where we're under the usual C ABI and not planning
4311 // to change that but can still do a tail call:
4312 if (!TailCallOpt && IsTailCall)
4313 IsSibCall = true;
4314
4315 if (IsTailCall)
4316 ++NumTailCalls;
4317 }
4318
4321 SmallVector<SDValue, 8> MemOpChains;
4322
4323 // Analyze operands of the call, assigning locations to each operand.
4325 CCState CCInfo(CallConv, IsVarArg, MF, ArgLocs, *DAG.getContext());
4326 CCAssignFn *AssignFn = CCAssignFnForCall(CallConv, IsVarArg);
4327
4328 if (CallConv != CallingConv::AMDGPU_Gfx && !AMDGPU::isChainCC(CallConv) &&
4330 // With a fixed ABI, allocate fixed registers before user arguments.
4331 passSpecialInputs(CLI, CCInfo, *Info, RegsToPass, MemOpChains, Chain);
4332 }
4333
4334 // Mark the scratch resource descriptor as allocated so the CC analysis
4335 // does not assign user arguments to these registers, matching the callee.
4336 if (!Subtarget->hasFlatScratchEnabled())
4337 CCInfo.AllocateReg(Info->getScratchRSrcReg());
4338
4339 CCInfo.AnalyzeCallOperands(Outs, AssignFn);
4340
4341 // Get a count of how many bytes are to be pushed on the stack.
4342 unsigned NumBytes = CCInfo.getStackSize();
4343
4344 if (IsSibCall) {
4345 // Since we're not changing the ABI to make this a tail call, the memory
4346 // operands are already available in the caller's incoming argument space.
4347 NumBytes = 0;
4348 }
4349
4350 // FPDiff is the byte offset of the call's argument area from the callee's.
4351 // Stores to callee stack arguments will be placed in FixedStackSlots offset
4352 // by this amount for a tail call. In a sibling call it must be 0 because the
4353 // caller will deallocate the entire stack and the callee still expects its
4354 // arguments to begin at SP+0. Completely unused for non-tail calls.
4355 int32_t FPDiff = 0;
4356 MachineFrameInfo &MFI = MF.getFrameInfo();
4357 auto *TRI = Subtarget->getRegisterInfo();
4358
4359 // Adjust the stack pointer for the new arguments...
4360 // These operations are automatically eliminated by the prolog/epilog pass
4361 if (!IsSibCall)
4362 Chain = DAG.getCALLSEQ_START(Chain, 0, 0, DL);
4363
4364 if (!IsSibCall || IsChainCallConv) {
4365 if (!Subtarget->hasFlatScratchEnabled()) {
4366 SmallVector<SDValue, 4> CopyFromChains;
4367
4368 // In the HSA case, this should be an identity copy.
4369 SDValue ScratchRSrcReg =
4370 DAG.getCopyFromReg(Chain, DL, Info->getScratchRSrcReg(), MVT::v4i32);
4371 RegsToPass.emplace_back(IsChainCallConv
4372 ? AMDGPU::SGPR48_SGPR49_SGPR50_SGPR51
4373 : AMDGPU::SGPR0_SGPR1_SGPR2_SGPR3,
4374 ScratchRSrcReg);
4375 CopyFromChains.push_back(ScratchRSrcReg.getValue(1));
4376 Chain = DAG.getTokenFactor(DL, CopyFromChains);
4377 }
4378 }
4379
4380 const unsigned NumSpecialInputs = RegsToPass.size();
4381
4382 MVT PtrVT = MVT::i32;
4383
4384 // Walk the register/memloc assignments, inserting copies/loads.
4385 for (unsigned i = 0, e = ArgLocs.size(); i != e; ++i) {
4386 CCValAssign &VA = ArgLocs[i];
4387 SDValue Arg = OutVals[i];
4388
4389 // Promote the value if needed.
4390 switch (VA.getLocInfo()) {
4391 case CCValAssign::Full:
4392 break;
4393 case CCValAssign::BCvt:
4394 Arg = DAG.getNode(ISD::BITCAST, DL, VA.getLocVT(), Arg);
4395 break;
4396 case CCValAssign::ZExt:
4397 Arg = DAG.getNode(ISD::ZERO_EXTEND, DL, VA.getLocVT(), Arg);
4398 break;
4399 case CCValAssign::SExt:
4400 Arg = DAG.getNode(ISD::SIGN_EXTEND, DL, VA.getLocVT(), Arg);
4401 break;
4402 case CCValAssign::AExt:
4403 Arg = DAG.getNode(ISD::ANY_EXTEND, DL, VA.getLocVT(), Arg);
4404 break;
4405 case CCValAssign::FPExt:
4406 Arg = DAG.getNode(ISD::FP_EXTEND, DL, VA.getLocVT(), Arg);
4407 break;
4408 default:
4409 llvm_unreachable("Unknown loc info!");
4410 }
4411
4412 if (VA.isRegLoc()) {
4413 RegsToPass.push_back(std::pair(VA.getLocReg(), Arg));
4414 } else {
4415 assert(VA.isMemLoc());
4416
4417 SDValue DstAddr;
4418 MachinePointerInfo DstInfo;
4419
4420 unsigned LocMemOffset = VA.getLocMemOffset();
4421 int32_t Offset = LocMemOffset;
4422
4423 SDValue PtrOff = DAG.getConstant(Offset, DL, PtrVT);
4424 MaybeAlign Alignment;
4425
4426 if (IsTailCall) {
4427 ISD::ArgFlagsTy Flags = Outs[i].Flags;
4428 unsigned OpSize = Flags.isByVal() ? Flags.getByValSize()
4429 : VA.getValVT().getStoreSize();
4430
4431 // FIXME: We can have better than the minimum byval required alignment.
4432 Alignment =
4433 Flags.isByVal()
4434 ? Flags.getNonZeroByValAlign()
4435 : commonAlignment(Subtarget->getStackAlignment(), Offset);
4436
4437 Offset = Offset + FPDiff;
4438 int FI = MFI.CreateFixedObject(OpSize, Offset, true);
4439
4440 DstAddr = DAG.getFrameIndex(FI, PtrVT);
4441 DstInfo = MachinePointerInfo::getFixedStack(MF, FI);
4442
4443 // Make sure any stack arguments overlapping with where we're storing
4444 // are loaded before this eventual operation. Otherwise they'll be
4445 // clobbered.
4446
4447 // FIXME: Why is this really necessary? This seems to just result in a
4448 // lot of code to copy the stack and write them back to the same
4449 // locations, which are supposed to be immutable?
4450 Chain = addTokenForArgument(Chain, DAG, MFI, FI);
4451 } else {
4452 // Stores to the argument stack area are relative to the stack pointer.
4453 SDValue SP = DAG.getCopyFromReg(Chain, DL, Info->getStackPtrOffsetReg(),
4454 MVT::i32);
4455 DstAddr = DAG.getNode(ISD::ADD, DL, MVT::i32, SP, PtrOff);
4456 DstInfo = MachinePointerInfo::getStack(MF, LocMemOffset);
4457 Alignment =
4458 commonAlignment(Subtarget->getStackAlignment(), LocMemOffset);
4459 }
4460
4461 if (Outs[i].Flags.isByVal()) {
4462 SDValue SizeNode =
4463 DAG.getConstant(Outs[i].Flags.getByValSize(), DL, MVT::i32);
4464 SDValue Cpy =
4465 DAG.getMemcpy(Chain, DL, DstAddr, Arg, SizeNode,
4466 Outs[i].Flags.getNonZeroByValAlign(),
4467 /*isVol = */ false, /*AlwaysInline = */ true,
4468 /*CI=*/nullptr, std::nullopt, DstInfo,
4470
4471 MemOpChains.push_back(Cpy);
4472 } else {
4473 SDValue Store =
4474 DAG.getStore(Chain, DL, Arg, DstAddr, DstInfo, Alignment);
4475 MemOpChains.push_back(Store);
4476 }
4477 }
4478 }
4479
4480 if (!MemOpChains.empty())
4481 Chain = DAG.getNode(ISD::TokenFactor, DL, MVT::Other, MemOpChains);
4482
4483 SDValue ReadFirstLaneID =
4484 DAG.getTargetConstant(Intrinsic::amdgcn_readfirstlane, DL, MVT::i32);
4485
4486 SDValue TokenGlue;
4487 if (CLI.ConvergenceControlToken) {
4488 TokenGlue = DAG.getNode(ISD::CONVERGENCECTRL_GLUE, DL, MVT::Glue,
4490 }
4491
4492 // Build a sequence of copy-to-reg nodes chained together with token chain
4493 // and flag operands which copy the outgoing args into the appropriate regs.
4494 SDValue InGlue;
4495
4496 unsigned ArgIdx = 0;
4497 for (auto [Reg, Val] : RegsToPass) {
4498 if (ArgIdx++ >= NumSpecialInputs &&
4499 (IsChainCallConv || !Val->isDivergent()) && TRI->isSGPRPhysReg(Reg)) {
4500 // For chain calls, the inreg arguments are required to be
4501 // uniform. Speculatively insert a readfirstlane in case we cannot prove
4502 // they are uniform.
4503 //
4504 // For other calls, if an inreg argument is known to be uniform,
4505 // speculatively insert a readfirstlane in case it is in a VGPR.
4506 //
4507 // FIXME: We need to execute this in a waterfall loop if it is a divergent
4508 // value, so let that continue to produce invalid code.
4509
4510 SmallVector<SDValue, 3> ReadfirstlaneArgs({ReadFirstLaneID, Val});
4511 if (TokenGlue)
4512 ReadfirstlaneArgs.push_back(TokenGlue);
4514 ReadfirstlaneArgs);
4515 }
4516
4517 Chain = DAG.getCopyToReg(Chain, DL, Reg, Val, InGlue);
4518 InGlue = Chain.getValue(1);
4519 }
4520
4521 // We don't usually want to end the call-sequence here because we would tidy
4522 // the frame up *after* the call, however in the ABI-changing tail-call case
4523 // we've carefully laid out the parameters so that when sp is reset they'll be
4524 // in the correct location.
4525 if (IsTailCall && !IsSibCall) {
4526 Chain = DAG.getCALLSEQ_END(Chain, NumBytes, 0, InGlue, DL);
4527 InGlue = Chain.getValue(1);
4528 }
4529
4530 std::vector<SDValue> Ops({Chain});
4531
4532 // Add a redundant copy of the callee global which will not be legalized, as
4533 // we need direct access to the callee later.
4535 const GlobalValue *GV = GSD->getGlobal();
4536 Ops.push_back(Callee);
4537 Ops.push_back(DAG.getTargetGlobalAddress(GV, DL, MVT::i64));
4538 } else {
4539 if (IsTailCall) {
4540 // isEligibleForTailCallOptimization considered whether the call target is
4541 // divergent, but we may still end up with a uniform value in a VGPR.
4542 // Insert a readfirstlane just in case.
4543 SDValue ReadFirstLaneID =
4544 DAG.getTargetConstant(Intrinsic::amdgcn_readfirstlane, DL, MVT::i32);
4545
4546 SmallVector<SDValue, 3> ReadfirstlaneArgs({ReadFirstLaneID, Callee});
4547 if (TokenGlue)
4548 ReadfirstlaneArgs.push_back(TokenGlue); // Wire up convergence token.
4549 Callee = DAG.getNode(ISD::INTRINSIC_WO_CHAIN, DL, Callee.getValueType(),
4550 ReadfirstlaneArgs);
4551 }
4552
4553 Ops.push_back(Callee);
4554 Ops.push_back(DAG.getTargetConstant(0, DL, MVT::i64));
4555 }
4556
4557 if (IsTailCall) {
4558 // Each tail call may have to adjust the stack by a different amount, so
4559 // this information must travel along with the operation for eventual
4560 // consumption by emitEpilogue.
4561 Ops.push_back(DAG.getTargetConstant(FPDiff, DL, MVT::i32));
4562 }
4563
4564 if (IsChainCallConv)
4565 llvm::append_range(Ops, ChainCallSpecialArgs);
4566
4567 // Add argument registers to the end of the list so that they are known live
4568 // into the call.
4569 for (auto &[Reg, Val] : RegsToPass)
4570 Ops.push_back(DAG.getRegister(Reg, Val.getValueType()));
4571
4572 // Add a register mask operand representing the call-preserved registers.
4573 const uint32_t *Mask = TRI->getCallPreservedMask(MF, CallConv);
4574 assert(Mask && "Missing call preserved mask for calling convention");
4575 Ops.push_back(DAG.getRegisterMask(Mask));
4576
4577 if (SDValue Token = CLI.ConvergenceControlToken) {
4579 GlueOps.push_back(Token);
4580 if (InGlue)
4581 GlueOps.push_back(InGlue);
4582
4583 InGlue = SDValue(DAG.getMachineNode(TargetOpcode::CONVERGENCECTRL_GLUE, DL,
4584 MVT::Glue, GlueOps),
4585 0);
4586 }
4587
4588 if (InGlue)
4589 Ops.push_back(InGlue);
4590
4591 // If we're doing a tail call, use a TC_RETURN here rather than an
4592 // actual call instruction.
4593 if (IsTailCall) {
4594 MFI.setHasTailCall();
4595 unsigned OPC = AMDGPUISD::TC_RETURN;
4596 switch (CallConv) {
4598 OPC = AMDGPUISD::TC_RETURN_GFX;
4599 break;
4602 OPC = UsesDynamicVGPRs ? AMDGPUISD::TC_RETURN_CHAIN_DVGPR
4603 : AMDGPUISD::TC_RETURN_CHAIN;
4604 break;
4605 }
4606
4607 // If the caller is a whole wave function, we need to use a special opcode
4608 // so we can patch up EXEC.
4609 if (Info->isWholeWaveFunction())
4610 OPC = AMDGPUISD::TC_RETURN_GFX_WholeWave;
4611
4612 return DAG.getNode(OPC, DL, MVT::Other, Ops);
4613 }
4614
4615 // Returns a chain and a flag for retval copy to use.
4616 SDValue Call = DAG.getNode(AMDGPUISD::CALL, DL, {MVT::Other, MVT::Glue}, Ops);
4617 Chain = Call.getValue(0);
4618 InGlue = Call.getValue(1);
4619
4620 uint64_t CalleePopBytes = NumBytes;
4621 Chain = DAG.getCALLSEQ_END(Chain, 0, CalleePopBytes, InGlue, DL);
4622 if (!Ins.empty())
4623 InGlue = Chain.getValue(1);
4624
4625 // Handle result values, copying them out of physregs into vregs that we
4626 // return.
4627 return LowerCallResult(Chain, InGlue, CallConv, IsVarArg, Ins, DL, DAG,
4628 InVals, /*IsThisReturn=*/false, SDValue());
4629}
4630
4631// This is similar to the default implementation in ExpandDYNAMIC_STACKALLOC,
4632// except for:
4633// 1. Stack growth direction (default: downwards, AMDGPU: upwards), and
4634// 2. Scaled size, where scaled size = wave-reduction(alloca-size) * wave-size.
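// For instance (illustrative numbers): on a wave64 target, a per-lane alloca of
// 16 bytes advances the stack pointer by 16 << 6 = 1024 bytes for the whole
// wave; a divergent size is first reduced to the wave-wide maximum.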
4636 SelectionDAG &DAG) const {
4637 const MachineFunction &MF = DAG.getMachineFunction();
4639
4640 SDLoc dl(Op);
4641 EVT VT = Op.getValueType();
4642 SDValue Chain = Op.getOperand(0);
4643 Register SPReg = Info->getStackPtrOffsetReg();
4644
4645 // Chain the dynamic stack allocation so that it doesn't modify the stack
4646 // pointer when other instructions are using the stack.
4647 Chain = DAG.getCALLSEQ_START(Chain, 0, 0, dl);
4648
4649 SDValue Size = Op.getOperand(1);
4650 SDValue BaseAddr = DAG.getCopyFromReg(Chain, dl, SPReg, VT);
4651 Align Alignment = cast<ConstantSDNode>(Op.getOperand(2))->getAlignValue();
4652
4653 const TargetFrameLowering *TFL = Subtarget->getFrameLowering();
4655 "Stack grows upwards for AMDGPU");
4656
4657 Chain = BaseAddr.getValue(1);
4658 Align StackAlign = TFL->getStackAlign();
4659 if (Alignment > StackAlign) {
4660 uint64_t ScaledAlignment = Alignment.value()
4661 << Subtarget->getWavefrontSizeLog2();
4662 uint64_t StackAlignMask = ScaledAlignment - 1;
4663 SDValue TmpAddr = DAG.getNode(ISD::ADD, dl, VT, BaseAddr,
4664 DAG.getConstant(StackAlignMask, dl, VT));
4665 BaseAddr = DAG.getNode(ISD::AND, dl, VT, TmpAddr,
4666 DAG.getSignedConstant(-ScaledAlignment, dl, VT));
4667 }
4668
4669 assert(Size.getValueType() == MVT::i32 && "Size must be 32-bit");
4670 SDValue NewSP;
4672 // For constant sized alloca, scale alloca size by wave-size
4673 SDValue ScaledSize = DAG.getNode(
4674 ISD::SHL, dl, VT, Size,
4675 DAG.getConstant(Subtarget->getWavefrontSizeLog2(), dl, MVT::i32));
4676 NewSP = DAG.getNode(ISD::ADD, dl, VT, BaseAddr, ScaledSize); // Value
4677 } else {
4678 // For dynamic sized alloca, perform wave-wide reduction to get max of
4679 // alloca size(divergent) and then scale it by wave-size
4680 SDValue WaveReduction =
4681 DAG.getTargetConstant(Intrinsic::amdgcn_wave_reduce_umax, dl, MVT::i32);
4682 Size = DAG.getNode(ISD::INTRINSIC_WO_CHAIN, dl, MVT::i32, WaveReduction,
4683 Size, DAG.getTargetConstant(0, dl, MVT::i32));
4684 SDValue ScaledSize = DAG.getNode(
4685 ISD::SHL, dl, VT, Size,
4686 DAG.getConstant(Subtarget->getWavefrontSizeLog2(), dl, MVT::i32));
4687 NewSP =
4688 DAG.getNode(ISD::ADD, dl, VT, BaseAddr, ScaledSize); // Value in vgpr.
4689 SDValue ReadFirstLaneID =
4690 DAG.getTargetConstant(Intrinsic::amdgcn_readfirstlane, dl, MVT::i32);
4691 NewSP = DAG.getNode(ISD::INTRINSIC_WO_CHAIN, dl, MVT::i32, ReadFirstLaneID,
4692 NewSP);
4693 }
4694
4695 Chain = DAG.getCopyToReg(Chain, dl, SPReg, NewSP); // Output chain
4696 SDValue CallSeqEnd = DAG.getCALLSEQ_END(Chain, 0, 0, SDValue(), dl);
4697
4698 return DAG.getMergeValues({BaseAddr, CallSeqEnd}, dl);
4699}
4700
4702 if (Op.getValueType() != MVT::i32)
4703 return Op; // Defer to cannot select error.
4704
4706 SDLoc SL(Op);
4707
4708 SDValue CopyFromSP = DAG.getCopyFromReg(Op->getOperand(0), SL, SP, MVT::i32);
4709
4710 // Convert from wave uniform to swizzled vector address. This should protect
4711 // from any edge cases where the stacksave result isn't directly used with
4712 // stackrestore.
4713 SDValue VectorAddress =
4714 DAG.getNode(AMDGPUISD::WAVE_ADDRESS, SL, MVT::i32, CopyFromSP);
4715 return DAG.getMergeValues({VectorAddress, CopyFromSP.getValue(1)}, SL);
4716}
4717
4719 SelectionDAG &DAG) const {
4720 SDLoc SL(Op);
4721 assert(Op.getValueType() == MVT::i32);
4722
4723 uint32_t BothRoundHwReg =
4725 SDValue GetRoundBothImm = DAG.getTargetConstant(BothRoundHwReg, SL, MVT::i32);
4726
4727 SDValue IntrinID =
4728 DAG.getTargetConstant(Intrinsic::amdgcn_s_getreg, SL, MVT::i32);
4729 SDValue GetReg = DAG.getNode(ISD::INTRINSIC_W_CHAIN, SL, Op->getVTList(),
4730 Op.getOperand(0), IntrinID, GetRoundBothImm);
4731
4732 // There are two rounding modes, one for f32 and one for f64/f16. We only
4733 // report in the standard value range if both are the same.
4734 //
4735 // The raw values also differ from the expected FLT_ROUNDS values. Nearest
4736 // ties away from zero is not supported, and the other values are rotated by
4737 // 1.
4738 //
4739 // If the two rounding modes are not the same, report a target defined value.
4740
4741 // Mode register rounding mode fields:
4742 //
4743 // [1:0] Single-precision round mode.
4744 // [3:2] Double/Half-precision round mode.
4745 //
4746 // 0 = nearest even; 1 = +infinity; 2 = -infinity; 3 = toward zero.
4747 //
4748 //                Hardware   Spec
4749 // Toward-0          3        0
4750 // Nearest Even      0        1
4751 // +Inf              1        2
4752 // -Inf              2        3
4753 // NearestAway0     N/A       4
4754 //
4755 // We have to handle 16 permutations of a 4-bit value, so we create a 64-bit
4756 // table we can index by the raw hardware mode.
4757 //
4758 // (trunc (FltRoundConversionTable >> MODE.fp_round)) & 0xf
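  //
  // For example (sketch): if both fields are "nearest even", the raw mode is
  // 0b0000, so the low nibble of the table is selected, which holds the
  // standard FLT_ROUNDS value 1 (round to nearest).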
4759
4760 SDValue BitTable =
4762
4763 SDValue Two = DAG.getConstant(2, SL, MVT::i32);
4764 SDValue RoundModeTimesNumBits =
4765 DAG.getNode(ISD::SHL, SL, MVT::i32, GetReg, Two);
4766
4767 // TODO: We could possibly avoid a 64-bit shift and use a simpler table if we
4768 // knew only one mode was demanded.
4769 SDValue TableValue =
4770 DAG.getNode(ISD::SRL, SL, MVT::i64, BitTable, RoundModeTimesNumBits);
4771 SDValue TruncTable = DAG.getNode(ISD::TRUNCATE, SL, MVT::i32, TableValue);
4772
4773 SDValue EntryMask = DAG.getConstant(0xf, SL, MVT::i32);
4774 SDValue TableEntry =
4775 DAG.getNode(ISD::AND, SL, MVT::i32, TruncTable, EntryMask);
4776
4777 // There's a gap in the 4-bit encoded table and actual enum values, so offset
4778 // if it's an extended value.
4779 SDValue Four = DAG.getConstant(4, SL, MVT::i32);
4780 SDValue IsStandardValue =
4781 DAG.getSetCC(SL, MVT::i1, TableEntry, Four, ISD::SETULT);
4782 SDValue EnumOffset = DAG.getNode(ISD::ADD, SL, MVT::i32, TableEntry, Four);
4783 SDValue Result = DAG.getNode(ISD::SELECT, SL, MVT::i32, IsStandardValue,
4784 TableEntry, EnumOffset);
4785
4786 return DAG.getMergeValues({Result, GetReg.getValue(1)}, SL);
4787}
4788
4790 SelectionDAG &DAG) const {
4791 SDLoc SL(Op);
4792
4793 SDValue NewMode = Op.getOperand(1);
4794 assert(NewMode.getValueType() == MVT::i32);
4795
4796 // Index a table of 4-bit entries mapping from the C FLT_ROUNDS values to the
4797 // hardware MODE.fp_round values.
4798 if (auto *ConstMode = dyn_cast<ConstantSDNode>(NewMode)) {
4799 uint32_t ClampedVal = std::min(
4800 static_cast<uint32_t>(ConstMode->getZExtValue()),
4802 NewMode = DAG.getConstant(
4803 AMDGPU::decodeFltRoundToHWConversionTable(ClampedVal), SL, MVT::i32);
4804 } else {
4805 // If we know the input can only be one of the supported standard modes in
4806 // the range 0-3, we can use a simplified mapping to hardware values.
4807 KnownBits KB = DAG.computeKnownBits(NewMode);
4808 const bool UseReducedTable = KB.countMinLeadingZeros() >= 30;
4809 // The supported standard values are 0-3. The extended values start at 8. We
4810 // need to offset by 4 if the value is in the extended range.
4811
4812 if (UseReducedTable) {
4813 // Truncate to the low 32-bits.
4814 SDValue BitTable = DAG.getConstant(
4815 AMDGPU::FltRoundToHWConversionTable & 0xffff, SL, MVT::i32);
4816
4817 SDValue Two = DAG.getConstant(2, SL, MVT::i32);
4818 SDValue RoundModeTimesNumBits =
4819 DAG.getNode(ISD::SHL, SL, MVT::i32, NewMode, Two);
4820
4821 NewMode =
4822 DAG.getNode(ISD::SRL, SL, MVT::i32, BitTable, RoundModeTimesNumBits);
4823
4824 // TODO: SimplifyDemandedBits on the setreg source here can likely reduce
4825 // the table extracted bits into inline immediates.
4826 } else {
4827 // table_index = umin(value, value - 4)
4828 // MODE.fp_round = (bit_table >> (table_index << 2)) & 0xf
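      //
      // For example (sketch): FLT_ROUNDS value 2 (+inf) gives
      // umin(2, 2 - 4) = 2 because of unsigned wrap, so nibble 2 of the table
      // is used; an extended value such as 8 gives umin(8, 4) = 4, folding the
      // gap between the standard and extended ranges.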
4829 SDValue BitTable =
4831
4832 SDValue Four = DAG.getConstant(4, SL, MVT::i32);
4833 SDValue OffsetEnum = DAG.getNode(ISD::SUB, SL, MVT::i32, NewMode, Four);
4834 SDValue IndexVal =
4835 DAG.getNode(ISD::UMIN, SL, MVT::i32, NewMode, OffsetEnum);
4836
4837 SDValue Two = DAG.getConstant(2, SL, MVT::i32);
4838 SDValue RoundModeTimesNumBits =
4839 DAG.getNode(ISD::SHL, SL, MVT::i32, IndexVal, Two);
4840
4841 SDValue TableValue =
4842 DAG.getNode(ISD::SRL, SL, MVT::i64, BitTable, RoundModeTimesNumBits);
4843 SDValue TruncTable = DAG.getNode(ISD::TRUNCATE, SL, MVT::i32, TableValue);
4844
4845 // No need to mask out the high bits since the setreg will ignore them
4846 // anyway.
4847 NewMode = TruncTable;
4848 }
4849
4850 // Insert a readfirstlane in case the value is in a VGPR. We could do this
4851 // earlier and keep more operations scalar, but that interferes with
4852 // combining the source.
4853 SDValue ReadFirstLaneID =
4854 DAG.getTargetConstant(Intrinsic::amdgcn_readfirstlane, SL, MVT::i32);
4855 NewMode = DAG.getNode(ISD::INTRINSIC_WO_CHAIN, SL, MVT::i32,
4856 ReadFirstLaneID, NewMode);
4857 }
4858
4859 // N.B. The setreg will be later folded into s_round_mode on supported
4860 // targets.
4861 SDValue IntrinID =
4862 DAG.getTargetConstant(Intrinsic::amdgcn_s_setreg, SL, MVT::i32);
4863 uint32_t BothRoundHwReg =
4865 SDValue RoundBothImm = DAG.getTargetConstant(BothRoundHwReg, SL, MVT::i32);
4866
4867 SDValue SetReg =
4868 DAG.getNode(ISD::INTRINSIC_VOID, SL, Op->getVTList(), Op.getOperand(0),
4869 IntrinID, RoundBothImm, NewMode);
4870
4871 return SetReg;
4872}
4873
4875 if (Op->isDivergent() &&
4876 (!Subtarget->hasVmemPrefInsts() || !Op.getConstantOperandVal(4)))
4877 // Cannot do I$ prefetch with divergent pointer.
4878 return SDValue();
4879
4880 switch (cast<MemSDNode>(Op)->getAddressSpace()) {
4884 break;
4886 if (Subtarget->hasSafeSmemPrefetch())
4887 break;
4888 [[fallthrough]];
4889 default:
4890 return SDValue();
4891 }
4892
4893 // I$ prefetch
4894 if (!Subtarget->hasSafeSmemPrefetch() && !Op.getConstantOperandVal(4))
4895 return SDValue();
4896
4897 return Op;
4898}
4899
4900// Work around DAG legality rules only based on the result type.
4902 bool IsStrict = Op.getOpcode() == ISD::STRICT_FP_EXTEND;
4903 SDValue Src = Op.getOperand(IsStrict ? 1 : 0);
4904 EVT SrcVT = Src.getValueType();
4905
4906 if (SrcVT.getScalarType() != MVT::bf16)
4907 return Op;
4908
4909 SDLoc SL(Op);
4910 SDValue BitCast =
4911 DAG.getNode(ISD::BITCAST, SL, SrcVT.changeTypeToInteger(), Src);
4912
4913 EVT DstVT = Op.getValueType();
4914 if (IsStrict)
4915 llvm_unreachable("Need STRICT_BF16_TO_FP");
4916
4917 return DAG.getNode(ISD::BF16_TO_FP, SL, DstVT, BitCast);
4918}
4919
4921 SDLoc SL(Op);
4922 if (Op.getValueType() != MVT::i64)
4923 return Op;
4924
4925 uint32_t ModeHwReg =
4927 SDValue ModeHwRegImm = DAG.getTargetConstant(ModeHwReg, SL, MVT::i32);
4928 uint32_t TrapHwReg =
4930 SDValue TrapHwRegImm = DAG.getTargetConstant(TrapHwReg, SL, MVT::i32);
4931
4932 SDVTList VTList = DAG.getVTList(MVT::i32, MVT::Other);
4933 SDValue IntrinID =
4934 DAG.getTargetConstant(Intrinsic::amdgcn_s_getreg, SL, MVT::i32);
4935 SDValue GetModeReg = DAG.getNode(ISD::INTRINSIC_W_CHAIN, SL, VTList,
4936 Op.getOperand(0), IntrinID, ModeHwRegImm);
4937 SDValue GetTrapReg = DAG.getNode(ISD::INTRINSIC_W_CHAIN, SL, VTList,
4938 Op.getOperand(0), IntrinID, TrapHwRegImm);
4939 SDValue TokenReg =
4940 DAG.getNode(ISD::TokenFactor, SL, MVT::Other, GetModeReg.getValue(1),
4941 GetTrapReg.getValue(1));
4942
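  // Pack the two registers into one i64. On this little-endian target,
  // element 0 of the v2i32 (the MODE register) lands in the low 32 bits and
  // element 1 (the trap status) in the high 32 bits after the bitcast.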
4943 SDValue CvtPtr =
4944 DAG.getNode(ISD::BUILD_VECTOR, SL, MVT::v2i32, GetModeReg, GetTrapReg);
4945 SDValue Result = DAG.getNode(ISD::BITCAST, SL, MVT::i64, CvtPtr);
4946
4947 return DAG.getMergeValues({Result, TokenReg}, SL);
4948}
4949
4951 SDLoc SL(Op);
4952 if (Op.getOperand(1).getValueType() != MVT::i64)
4953 return Op;
4954
4955 SDValue Input = DAG.getNode(ISD::BITCAST, SL, MVT::v2i32, Op.getOperand(1));
4956 SDValue NewModeReg = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SL, MVT::i32, Input,
4957 DAG.getConstant(0, SL, MVT::i32));
4958 SDValue NewTrapReg = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SL, MVT::i32, Input,
4959 DAG.getConstant(1, SL, MVT::i32));
4960
4961 SDValue ReadFirstLaneID =
4962 DAG.getTargetConstant(Intrinsic::amdgcn_readfirstlane, SL, MVT::i32);
4963 NewModeReg = DAG.getNode(ISD::INTRINSIC_WO_CHAIN, SL, MVT::i32,
4964 ReadFirstLaneID, NewModeReg);
4965 NewTrapReg = DAG.getNode(ISD::INTRINSIC_WO_CHAIN, SL, MVT::i32,
4966 ReadFirstLaneID, NewTrapReg);
4967
4968 unsigned ModeHwReg =
4970 SDValue ModeHwRegImm = DAG.getTargetConstant(ModeHwReg, SL, MVT::i32);
4971 unsigned TrapHwReg =
4973 SDValue TrapHwRegImm = DAG.getTargetConstant(TrapHwReg, SL, MVT::i32);
4974
4975 SDValue IntrinID =
4976 DAG.getTargetConstant(Intrinsic::amdgcn_s_setreg, SL, MVT::i32);
4977 SDValue SetModeReg =
4978 DAG.getNode(ISD::INTRINSIC_VOID, SL, MVT::Other, Op.getOperand(0),
4979 IntrinID, ModeHwRegImm, NewModeReg);
4980 SDValue SetTrapReg =
4981 DAG.getNode(ISD::INTRINSIC_VOID, SL, MVT::Other, Op.getOperand(0),
4982 IntrinID, TrapHwRegImm, NewTrapReg);
4983 return DAG.getNode(ISD::TokenFactor, SL, MVT::Other, SetTrapReg, SetModeReg);
4984}
4985
4987 const MachineFunction &MF) const {
4988 const Function &Fn = MF.getFunction();
4989
4991 .Case("m0", AMDGPU::M0)
4992 .Case("exec", AMDGPU::EXEC)
4993 .Case("exec_lo", AMDGPU::EXEC_LO)
4994 .Case("exec_hi", AMDGPU::EXEC_HI)
4995 .Case("flat_scratch", AMDGPU::FLAT_SCR)
4996 .Case("flat_scratch_lo", AMDGPU::FLAT_SCR_LO)
4997 .Case("flat_scratch_hi", AMDGPU::FLAT_SCR_HI)
4998 .Default(Register());
4999 if (!Reg)
5000 return Reg;
5001
5002 if (!Subtarget->hasFlatScrRegister() &&
5003 Subtarget->getRegisterInfo()->regsOverlap(Reg, AMDGPU::FLAT_SCR)) {
5004 Fn.getContext().emitError(Twine("invalid register \"" + StringRef(RegName) +
5005 "\" for subtarget."));
5006 }
5007
5008 switch (Reg) {
5009 case AMDGPU::M0:
5010 case AMDGPU::EXEC_LO:
5011 case AMDGPU::EXEC_HI:
5012 case AMDGPU::FLAT_SCR_LO:
5013 case AMDGPU::FLAT_SCR_HI:
5014 if (VT.getSizeInBits() == 32)
5015 return Reg;
5016 break;
5017 case AMDGPU::EXEC:
5018 case AMDGPU::FLAT_SCR:
5019 if (VT.getSizeInBits() == 64)
5020 return Reg;
5021 break;
5022 default:
5023 llvm_unreachable("missing register type checking");
5024 }
5025
5027 Twine("invalid type for register \"" + StringRef(RegName) + "\"."));
5028}
5029
5030// If kill is not the last instruction, split the block so kill is always a
5031// proper terminator.
5034 MachineBasicBlock *BB) const {
5035 MachineBasicBlock *SplitBB = BB->splitAt(MI, /*UpdateLiveIns=*/true);
5037 MI.setDesc(TII->getKillTerminatorFromPseudo(MI.getOpcode()));
5038 return SplitBB;
5039}
5040
5041// Split block \p MBB at \p MI, so as to insert a loop. If \p InstInLoop is true,
5042// \p MI will be the only instruction in the loop body block. Otherwise, it will
5043// be the first instruction in the remainder block.
5044//
5045/// \returns { LoopBody, Remainder }
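//
// Resulting CFG, roughly: MBB -> LoopBB; LoopBB -> {LoopBB, RemainderBB};
// RemainderBB inherits MBB's original successors.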
5046static std::pair<MachineBasicBlock *, MachineBasicBlock *>
5048 MachineFunction *MF = MBB.getParent();
5050
5051 // To insert the loop we need to split the block. Move everything after this
5052 // point to a new block, and insert a new empty block between the two.
5054 MachineBasicBlock *RemainderBB = MF->CreateMachineBasicBlock();
5056 ++MBBI;
5057
5058 MF->insert(MBBI, LoopBB);
5059 MF->insert(MBBI, RemainderBB);
5060
5061 LoopBB->addSuccessor(LoopBB);
5062 LoopBB->addSuccessor(RemainderBB);
5063
5064 // Move the rest of the block into a new block.
5065 RemainderBB->transferSuccessorsAndUpdatePHIs(&MBB);
5066
5067 if (InstInLoop) {
5068 auto Next = std::next(I);
5069
5070 // Move instruction to loop body.
5071 LoopBB->splice(LoopBB->begin(), &MBB, I, Next);
5072
5073 // Move the rest of the block.
5074 RemainderBB->splice(RemainderBB->begin(), &MBB, Next, MBB.end());
5075 } else {
5076 RemainderBB->splice(RemainderBB->begin(), &MBB, I, MBB.end());
5077 }
5078
5079 MBB.addSuccessor(LoopBB);
5080
5081 return std::pair(LoopBB, RemainderBB);
5082}
5083
5084/// Insert \p MI into a BUNDLE with an S_WAITCNT 0 immediately following it.
5086 MachineBasicBlock *MBB = MI.getParent();
5088 auto I = MI.getIterator();
5089 auto E = std::next(I);
5090
5091 // clang-format off
5092 BuildMI(*MBB, E, MI.getDebugLoc(), TII->get(AMDGPU::S_WAITCNT))
5093 .addImm(0);
5094 // clang-format on
5095
5096 MIBundleBuilder Bundler(*MBB, I, E);
5097 finalizeBundle(*MBB, Bundler.begin());
5098}
5099
5102 MachineBasicBlock *BB) const {
5103 const DebugLoc &DL = MI.getDebugLoc();
5104
5106
5108
5109 // Apparently kill flags are only valid if the def is in the same block?
5110 if (MachineOperand *Src = TII->getNamedOperand(MI, AMDGPU::OpName::data0))
5111 Src->setIsKill(false);
5112
5113 auto [LoopBB, RemainderBB] = splitBlockForLoop(MI, *BB, true);
5114
5115 MachineBasicBlock::iterator I = LoopBB->end();
5116
5117 const unsigned EncodedReg = AMDGPU::Hwreg::HwregEncoding::encode(
5119
5120 // Clear TRAP_STS.MEM_VIOL
5121 BuildMI(*LoopBB, LoopBB->begin(), DL, TII->get(AMDGPU::S_SETREG_IMM32_B32))
5122 .addImm(0)
5123 .addImm(EncodedReg);
5124
5126
5127 Register Reg = MRI.createVirtualRegister(&AMDGPU::SReg_32_XM0RegClass);
5128
5129 // Load and check TRAP_STS.MEM_VIOL
5130 BuildMI(*LoopBB, I, DL, TII->get(AMDGPU::S_GETREG_B32), Reg)
5131 .addImm(EncodedReg);
5132
5133 // FIXME: Do we need to use an isel pseudo that may clobber scc?
5134 BuildMI(*LoopBB, I, DL, TII->get(AMDGPU::S_CMP_LG_U32))
5135 .addReg(Reg, RegState::Kill)
5136 .addImm(0);
5137 // clang-format off
5138 BuildMI(*LoopBB, I, DL, TII->get(AMDGPU::S_CBRANCH_SCC1))
5139 .addMBB(LoopBB);
5140 // clang-format on
5141
5142 return RemainderBB;
5143}
5144
5145// Do a v_movrels_b32 or v_movreld_b32 for each unique value of \p IdxReg in the
5146// wavefront. If the value is uniform and just happens to be in a VGPR, this
5147// will only do one iteration. In the worst case, this will loop 64 times.
5148//
5149// TODO: Just use v_readlane_b32 if we know the VGPR has a uniform value.
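//
// Schematically, the emitted loop is (register names are placeholders):
//   loop:
//     %cur  = v_readfirstlane_b32 %idx
//     %cond = v_cmp_eq_u32 %cur, %idx
//     %old  = s_and_saveexec %cond   ; exec &= %cond, %old = previous exec
//     ... move %cur (plus Offset) into M0 or SGPRIdxReg ...
//     exec  = exec ^ %old            ; leave only the unhandled lanes active
//     s_cbranch_execnz loop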
5152 MachineBasicBlock &OrigBB, MachineBasicBlock &LoopBB,
5153 const DebugLoc &DL, const MachineOperand &Idx,
5154 unsigned InitReg, unsigned ResultReg, unsigned PhiReg,
5155 unsigned InitSaveExecReg, int Offset, bool UseGPRIdxMode,
5156 Register &SGPRIdxReg) {
5157
5158 MachineFunction *MF = OrigBB.getParent();
5159 const GCNSubtarget &ST = MF->getSubtarget<GCNSubtarget>();
5160 const SIRegisterInfo *TRI = ST.getRegisterInfo();
5163
5164 const TargetRegisterClass *BoolRC = TRI->getBoolRC();
5165 Register PhiExec = MRI.createVirtualRegister(BoolRC);
5166 Register NewExec = MRI.createVirtualRegister(BoolRC);
5167 Register CurrentIdxReg =
5168 MRI.createVirtualRegister(&AMDGPU::SReg_32_XM0RegClass);
5169 Register CondReg = MRI.createVirtualRegister(BoolRC);
5170
5171 BuildMI(LoopBB, I, DL, TII->get(TargetOpcode::PHI), PhiReg)
5172 .addReg(InitReg)
5173 .addMBB(&OrigBB)
5174 .addReg(ResultReg)
5175 .addMBB(&LoopBB);
5176
5177 BuildMI(LoopBB, I, DL, TII->get(TargetOpcode::PHI), PhiExec)
5178 .addReg(InitSaveExecReg)
5179 .addMBB(&OrigBB)
5180 .addReg(NewExec)
5181 .addMBB(&LoopBB);
5182
5183 // Read the next variant <- also loop target.
5184 BuildMI(LoopBB, I, DL, TII->get(AMDGPU::V_READFIRSTLANE_B32), CurrentIdxReg)
5185 .addReg(Idx.getReg(), getUndefRegState(Idx.isUndef()));
5186
5187 // Compare the just-read scalar index value against each lane's Idx value.
5188 BuildMI(LoopBB, I, DL, TII->get(AMDGPU::V_CMP_EQ_U32_e64), CondReg)
5189 .addReg(CurrentIdxReg)
5190 .addReg(Idx.getReg(), {}, Idx.getSubReg());
5191
5192 // Update EXEC, saving the original EXEC value.
5193 BuildMI(LoopBB, I, DL, TII->get(LMC.AndSaveExecOpc), NewExec)
5194 .addReg(CondReg, RegState::Kill);
5195
5196 MRI.setSimpleHint(NewExec, CondReg);
5197
5198 if (UseGPRIdxMode) {
5199 if (Offset == 0) {
5200 SGPRIdxReg = CurrentIdxReg;
5201 } else {
5202 SGPRIdxReg = MRI.createVirtualRegister(&AMDGPU::SGPR_32RegClass);
5203 BuildMI(LoopBB, I, DL, TII->get(AMDGPU::S_ADD_I32), SGPRIdxReg)
5204 .addReg(CurrentIdxReg, RegState::Kill)
5205 .addImm(Offset);
5206 }
5207 } else {
5208 // Move the index (plus any offset) into M0.
5209 if (Offset == 0) {
5210 BuildMI(LoopBB, I, DL, TII->get(AMDGPU::COPY), AMDGPU::M0)
5211 .addReg(CurrentIdxReg, RegState::Kill);
5212 } else {
5213 BuildMI(LoopBB, I, DL, TII->get(AMDGPU::S_ADD_I32), AMDGPU::M0)
5214 .addReg(CurrentIdxReg, RegState::Kill)
5215 .addImm(Offset);
5216 }
5217 }
5218
5219 // Update EXEC, switch all done bits to 0 and all todo bits to 1.
5220 MachineInstr *InsertPt =
5221 BuildMI(LoopBB, I, DL, TII->get(LMC.XorTermOpc), LMC.ExecReg)
5222 .addReg(LMC.ExecReg)
5223 .addReg(NewExec);
5224
5225 // XXX - s_xor_b64 sets scc to 1 if the result is nonzero, so can we use
5226 // s_cbranch_scc0?
5227
5228 // Loop back to V_READFIRSTLANE_B32 if there are still variants to cover.
5229 // clang-format off
5230 BuildMI(LoopBB, I, DL, TII->get(AMDGPU::S_CBRANCH_EXECNZ))
5231 .addMBB(&LoopBB);
5232 // clang-format on
5233
5234 return InsertPt->getIterator();
5235}
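// A rough sketch of the waterfall loop body built above (names illustrative):
//   loop:
//     %cur  = V_READFIRSTLANE_B32 %idx       ; pick one lane's index value
//     %cond = V_CMP_EQ_U32 %cur, %idx        ; all lanes holding that index
//     %old  = S_AND_SAVEEXEC %cond           ; restrict EXEC to those lanes
//     M0 (or the returned SGPR index) = %cur [+ offset]
//     <the indirect access is inserted here by the caller>
//     EXEC = EXEC ^ %old                     ; only unhandled lanes remain
//     S_CBRANCH_EXECNZ loop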
5236
5237// This has slightly sub-optimal register allocation when the source vector is
5238// killed by the read. The register allocator does not understand that the kill
5239// is per-workitem, so the source is kept alive for the whole loop and we end up
5240// not reusing a subregister from it, using one more VGPR than necessary. This
5241// extra VGPR was saved when this was expanded after register allocation.
5244 unsigned InitResultReg, unsigned PhiReg, int Offset,
5245 bool UseGPRIdxMode, Register &SGPRIdxReg) {
5246 MachineFunction *MF = MBB.getParent();
5247 const GCNSubtarget &ST = MF->getSubtarget<GCNSubtarget>();
5248 const SIRegisterInfo *TRI = ST.getRegisterInfo();
5249 MachineRegisterInfo &MRI = MF->getRegInfo();
5250 const DebugLoc &DL = MI.getDebugLoc();
5252
5253 const auto *BoolXExecRC = TRI->getWaveMaskRegClass();
5254 Register DstReg = MI.getOperand(0).getReg();
5255 Register SaveExec = MRI.createVirtualRegister(BoolXExecRC);
5256 Register TmpExec = MRI.createVirtualRegister(BoolXExecRC);
5258
5259 BuildMI(MBB, I, DL, TII->get(TargetOpcode::IMPLICIT_DEF), TmpExec);
5260
5261 // Save the EXEC mask
5262 // clang-format off
5263 BuildMI(MBB, I, DL, TII->get(LMC.MovOpc), SaveExec)
5264 .addReg(LMC.ExecReg);
5265 // clang-format on
5266
5267 auto [LoopBB, RemainderBB] = splitBlockForLoop(MI, MBB, false);
5268
5269 const MachineOperand *Idx = TII->getNamedOperand(MI, AMDGPU::OpName::idx);
5270
5271 auto InsPt = emitLoadM0FromVGPRLoop(TII, MRI, MBB, *LoopBB, DL, *Idx,
5272 InitResultReg, DstReg, PhiReg, TmpExec,
5273 Offset, UseGPRIdxMode, SGPRIdxReg);
5274
5275 MachineBasicBlock *LandingPad = MF->CreateMachineBasicBlock();
5277 ++MBBI;
5278 MF->insert(MBBI, LandingPad);
5279 LoopBB->removeSuccessor(RemainderBB);
5280 LandingPad->addSuccessor(RemainderBB);
5281 LoopBB->addSuccessor(LandingPad);
5282 MachineBasicBlock::iterator First = LandingPad->begin();
5283 // clang-format off
5284 BuildMI(*LandingPad, First, DL, TII->get(LMC.MovOpc), LMC.ExecReg)
5285 .addReg(SaveExec);
5286 // clang-format on
5287
5288 return InsPt;
5289}
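// The resulting CFG is roughly entry -> loop -> landing pad -> remainder,
// where the landing pad restores the EXEC mask saved in the entry block.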
5290
5291// Returns the subregister index and the remaining dynamic offset.
5292static std::pair<unsigned, int>
5294 const TargetRegisterClass *SuperRC, unsigned VecReg,
5295 int Offset) {
5296 int NumElts = TRI.getRegSizeInBits(*SuperRC) / 32;
5297
5298 // Skip out of bounds offsets, or else we would end up using an undefined
5299 // register.
5300 if (Offset >= NumElts || Offset < 0)
5301 return std::pair(AMDGPU::sub0, Offset);
5302
5303 return std::pair(SIRegisterInfo::getSubRegFromChannel(Offset), 0);
5304}
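// For example, with a 128-bit (4 x 32-bit) vector register: Offset == 2 is in
// range and yields {sub2, 0}, folding the offset into the subregister, while
// Offset == 7 is out of range and yields {sub0, 7}, leaving the offset to the
// dynamic index.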
5305
5308 int Offset) {
5309 MachineBasicBlock *MBB = MI.getParent();
5310 const DebugLoc &DL = MI.getDebugLoc();
5312
5313 const MachineOperand *Idx = TII->getNamedOperand(MI, AMDGPU::OpName::idx);
5314
5315 assert(Idx->getReg() != AMDGPU::NoRegister);
5316
5317 if (Offset == 0) {
5318 // clang-format off
5319 BuildMI(*MBB, I, DL, TII->get(AMDGPU::COPY), AMDGPU::M0)
5320 .add(*Idx);
5321 // clang-format on
5322 } else {
5323 BuildMI(*MBB, I, DL, TII->get(AMDGPU::S_ADD_I32), AMDGPU::M0)
5324 .add(*Idx)
5325 .addImm(Offset);
5326 }
5327}
5328
5331 int Offset) {
5332 MachineBasicBlock *MBB = MI.getParent();
5333 const DebugLoc &DL = MI.getDebugLoc();
5335
5336 const MachineOperand *Idx = TII->getNamedOperand(MI, AMDGPU::OpName::idx);
5337
5338 if (Offset == 0)
5339 return Idx->getReg();
5340
5341 Register Tmp = MRI.createVirtualRegister(&AMDGPU::SReg_32_XM0RegClass);
5342 BuildMI(*MBB, I, DL, TII->get(AMDGPU::S_ADD_I32), Tmp)
5343 .add(*Idx)
5344 .addImm(Offset);
5345 return Tmp;
5346}
5347
5350 const GCNSubtarget &ST) {
5351 const SIInstrInfo *TII = ST.getInstrInfo();
5352 const SIRegisterInfo &TRI = TII->getRegisterInfo();
5353 MachineFunction *MF = MBB.getParent();
5354 MachineRegisterInfo &MRI = MF->getRegInfo();
5355
5356 Register Dst = MI.getOperand(0).getReg();
5357 const MachineOperand *Idx = TII->getNamedOperand(MI, AMDGPU::OpName::idx);
5358 Register SrcReg = TII->getNamedOperand(MI, AMDGPU::OpName::src)->getReg();
5359 int Offset = TII->getNamedOperand(MI, AMDGPU::OpName::offset)->getImm();
5360
5361 const TargetRegisterClass *VecRC = MRI.getRegClass(SrcReg);
5362 const TargetRegisterClass *IdxRC = MRI.getRegClass(Idx->getReg());
5363
5364 unsigned SubReg;
5365 std::tie(SubReg, Offset) =
5366 computeIndirectRegAndOffset(TRI, VecRC, SrcReg, Offset);
5367
5368 const bool UseGPRIdxMode = ST.useVGPRIndexMode();
5369
5370 // Check for a SGPR index.
5371 if (TII->getRegisterInfo().isSGPRClass(IdxRC)) {
5373 const DebugLoc &DL = MI.getDebugLoc();
5374
5375 if (UseGPRIdxMode) {
5376 // TODO: Look at the uses to avoid the copy. This may require rescheduling
5377 // to avoid interfering with other uses, so probably requires a new
5378 // optimization pass.
5379 Register Idx = getIndirectSGPRIdx(TII, MRI, MI, Offset);
5380
5381 const MCInstrDesc &GPRIDXDesc =
5382 TII->getIndirectGPRIDXPseudo(TRI.getRegSizeInBits(*VecRC), true);
5383 BuildMI(MBB, I, DL, GPRIDXDesc, Dst)
5384 .addReg(SrcReg)
5385 .addReg(Idx)
5386 .addImm(SubReg);
5387 } else {
5389
5390 BuildMI(MBB, I, DL, TII->get(AMDGPU::V_MOVRELS_B32_e32), Dst)
5391 .addReg(SrcReg, {}, SubReg)
5392 .addReg(SrcReg, RegState::Implicit);
5393 }
5394
5395 MI.eraseFromParent();
5396
5397 return &MBB;
5398 }
5399
5400 // Control flow needs to be inserted if indexing with a VGPR.
5401 const DebugLoc &DL = MI.getDebugLoc();
5403
5404 Register PhiReg = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
5405 Register InitReg = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
5406
5407 BuildMI(MBB, I, DL, TII->get(TargetOpcode::IMPLICIT_DEF), InitReg);
5408
5409 Register SGPRIdxReg;
5410 auto InsPt = loadM0FromVGPR(TII, MBB, MI, InitReg, PhiReg, Offset,
5411 UseGPRIdxMode, SGPRIdxReg);
5412
5413 MachineBasicBlock *LoopBB = InsPt->getParent();
5414
5415 if (UseGPRIdxMode) {
5416 const MCInstrDesc &GPRIDXDesc =
5417 TII->getIndirectGPRIDXPseudo(TRI.getRegSizeInBits(*VecRC), true);
5418
5419 BuildMI(*LoopBB, InsPt, DL, GPRIDXDesc, Dst)
5420 .addReg(SrcReg)
5421 .addReg(SGPRIdxReg)
5422 .addImm(SubReg);
5423 } else {
5424 BuildMI(*LoopBB, InsPt, DL, TII->get(AMDGPU::V_MOVRELS_B32_e32), Dst)
5425 .addReg(SrcReg, {}, SubReg)
5426 .addReg(SrcReg, RegState::Implicit);
5427 }
5428
5429 MI.eraseFromParent();
5430
5431 return LoopBB;
5432}
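// In short: a uniform (SGPR) index becomes a single V_MOVRELS_B32 or GPR-idx
// pseudo, while a divergent (VGPR) index wraps the same read in the waterfall
// loop built by loadM0FromVGPR above.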
5433
5436 const GCNSubtarget &ST) {
5437 const SIInstrInfo *TII = ST.getInstrInfo();
5438 const SIRegisterInfo &TRI = TII->getRegisterInfo();
5439 MachineFunction *MF = MBB.getParent();
5440 MachineRegisterInfo &MRI = MF->getRegInfo();
5441
5442 Register Dst = MI.getOperand(0).getReg();
5443 const MachineOperand *SrcVec = TII->getNamedOperand(MI, AMDGPU::OpName::src);
5444 const MachineOperand *Idx = TII->getNamedOperand(MI, AMDGPU::OpName::idx);
5445 const MachineOperand *Val = TII->getNamedOperand(MI, AMDGPU::OpName::val);
5446 int Offset = TII->getNamedOperand(MI, AMDGPU::OpName::offset)->getImm();
5447 const TargetRegisterClass *VecRC = MRI.getRegClass(SrcVec->getReg());
5448 const TargetRegisterClass *IdxRC = MRI.getRegClass(Idx->getReg());
5449
5450 // This can be an immediate, but will be folded later.
5451 assert(Val->getReg());
5452
5453 unsigned SubReg;
5454 std::tie(SubReg, Offset) =
5455 computeIndirectRegAndOffset(TRI, VecRC, SrcVec->getReg(), Offset);
5456 const bool UseGPRIdxMode = ST.useVGPRIndexMode();
5457
5458 if (Idx->getReg() == AMDGPU::NoRegister) {
5460 const DebugLoc &DL = MI.getDebugLoc();
5461
5462 assert(Offset == 0);
5463
5464 BuildMI(MBB, I, DL, TII->get(TargetOpcode::INSERT_SUBREG), Dst)
5465 .add(*SrcVec)
5466 .add(*Val)
5467 .addImm(SubReg);
5468
5469 MI.eraseFromParent();
5470 return &MBB;
5471 }
5472
5473 // Check for a SGPR index.
5474 if (TII->getRegisterInfo().isSGPRClass(IdxRC)) {
5476 const DebugLoc &DL = MI.getDebugLoc();
5477
5478 if (UseGPRIdxMode) {
5479 Register Idx = getIndirectSGPRIdx(TII, MRI, MI, Offset);
5480
5481 const MCInstrDesc &GPRIDXDesc =
5482 TII->getIndirectGPRIDXPseudo(TRI.getRegSizeInBits(*VecRC), false);
5483 BuildMI(MBB, I, DL, GPRIDXDesc, Dst)
5484 .addReg(SrcVec->getReg())
5485 .add(*Val)
5486 .addReg(Idx)
5487 .addImm(SubReg);
5488 } else {
5490
5491 const MCInstrDesc &MovRelDesc = TII->getIndirectRegWriteMovRelPseudo(
5492 TRI.getRegSizeInBits(*VecRC), 32, false);
5493 BuildMI(MBB, I, DL, MovRelDesc, Dst)
5494 .addReg(SrcVec->getReg())
5495 .add(*Val)
5496 .addImm(SubReg);
5497 }
5498 MI.eraseFromParent();
5499 return &MBB;
5500 }
5501
5502 // Control flow needs to be inserted if indexing with a VGPR.
5503 if (Val->isReg())
5504 MRI.clearKillFlags(Val->getReg());
5505
5506 const DebugLoc &DL = MI.getDebugLoc();
5507
5508 Register PhiReg = MRI.createVirtualRegister(VecRC);
5509
5510 Register SGPRIdxReg;
5511 auto InsPt = loadM0FromVGPR(TII, MBB, MI, SrcVec->getReg(), PhiReg, Offset,
5512 UseGPRIdxMode, SGPRIdxReg);
5513 MachineBasicBlock *LoopBB = InsPt->getParent();
5514
5515 if (UseGPRIdxMode) {
5516 const MCInstrDesc &GPRIDXDesc =
5517 TII->getIndirectGPRIDXPseudo(TRI.getRegSizeInBits(*VecRC), false);
5518
5519 BuildMI(*LoopBB, InsPt, DL, GPRIDXDesc, Dst)
5520 .addReg(PhiReg)
5521 .add(*Val)
5522 .addReg(SGPRIdxReg)
5523 .addImm(SubReg);
5524 } else {
5525 const MCInstrDesc &MovRelDesc = TII->getIndirectRegWriteMovRelPseudo(
5526 TRI.getRegSizeInBits(*VecRC), 32, false);
5527 BuildMI(*LoopBB, InsPt, DL, MovRelDesc, Dst)
5528 .addReg(PhiReg)
5529 .add(*Val)
5530 .addImm(SubReg);
5531 }
5532
5533 MI.eraseFromParent();
5534 return LoopBB;
5535}
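// As with the extract case above: a uniform index becomes a single indexed
// write (a MovRel or GPR-idx-mode pseudo), while a divergent index reuses the
// waterfall loop, carrying the partially updated vector in PhiReg.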
5536
5538 MachineBasicBlock *BB) {
5539 // For targets older than GFX12, we emit a sequence of 32-bit operations.
5540 // For GFX12, we emit s_add_u64 and s_sub_u64.
5541 MachineFunction *MF = BB->getParent();
5542 const SIInstrInfo *TII = MF->getSubtarget<GCNSubtarget>().getInstrInfo();
5543 const GCNSubtarget &ST = MF->getSubtarget<GCNSubtarget>();
5545 const DebugLoc &DL = MI.getDebugLoc();
5546 MachineOperand &Dest = MI.getOperand(0);
5547 MachineOperand &Src0 = MI.getOperand(1);
5548 MachineOperand &Src1 = MI.getOperand(2);
5549 bool IsAdd = (MI.getOpcode() == AMDGPU::S_ADD_U64_PSEUDO);
5550 if (ST.hasScalarAddSub64()) {
5551 unsigned Opc = IsAdd ? AMDGPU::S_ADD_U64 : AMDGPU::S_SUB_U64;
5552 // clang-format off
5553 BuildMI(*BB, MI, DL, TII->get(Opc), Dest.getReg())
5554 .add(Src0)
5555 .add(Src1);
5556 // clang-format on
5557 } else {
5558 const SIRegisterInfo *TRI = ST.getRegisterInfo();
5559 const TargetRegisterClass *BoolRC = TRI->getBoolRC();
5560
5561 Register DestSub0 = MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass);
5562 Register DestSub1 = MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass);
5563
5564 MachineOperand Src0Sub0 = TII->buildExtractSubRegOrImm(
5565 MI, MRI, Src0, BoolRC, AMDGPU::sub0, &AMDGPU::SReg_32RegClass);
5566 MachineOperand Src0Sub1 = TII->buildExtractSubRegOrImm(
5567 MI, MRI, Src0, BoolRC, AMDGPU::sub1, &AMDGPU::SReg_32RegClass);
5568
5569 MachineOperand Src1Sub0 = TII->buildExtractSubRegOrImm(
5570 MI, MRI, Src1, BoolRC, AMDGPU::sub0, &AMDGPU::SReg_32RegClass);
5571 MachineOperand Src1Sub1 = TII->buildExtractSubRegOrImm(
5572 MI, MRI, Src1, BoolRC, AMDGPU::sub1, &AMDGPU::SReg_32RegClass);
5573
5574 unsigned LoOpc = IsAdd ? AMDGPU::S_ADD_U32 : AMDGPU::S_SUB_U32;
5575 unsigned HiOpc = IsAdd ? AMDGPU::S_ADDC_U32 : AMDGPU::S_SUBB_U32;
5576 BuildMI(*BB, MI, DL, TII->get(LoOpc), DestSub0).add(Src0Sub0).add(Src1Sub0);
5577 BuildMI(*BB, MI, DL, TII->get(HiOpc), DestSub1).add(Src0Sub1).add(Src1Sub1);
5578 BuildMI(*BB, MI, DL, TII->get(TargetOpcode::REG_SEQUENCE), Dest.getReg())
5579 .addReg(DestSub0)
5580 .addImm(AMDGPU::sub0)
5581 .addReg(DestSub1)
5582 .addImm(AMDGPU::sub1);
5583 }
5584 MI.eraseFromParent();
5585 return BB;
5586}
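// For targets without s_add_u64/s_sub_u64, a 64-bit scalar add expands to,
// roughly:
//   s_add_u32  dst.sub0, src0.sub0, src1.sub0
//   s_addc_u32 dst.sub1, src0.sub1, src1.sub1
// followed by a REG_SEQUENCE that reassembles the 64-bit result (s_sub_u32 /
// s_subb_u32 for the subtract case).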
5587
5589 MachineFunction *MF = BB->getParent();
5590 const GCNSubtarget &ST = MF->getSubtarget<GCNSubtarget>();
5591 const SIInstrInfo *TII = ST.getInstrInfo();
5592 const SIRegisterInfo *TRI = ST.getRegisterInfo();
5593 MachineRegisterInfo &MRI = MF->getRegInfo();
5594 const DebugLoc &DL = MI.getDebugLoc();
5595 Register Dst = MI.getOperand(0).getReg();
5596 const MachineOperand &Src0 = MI.getOperand(1);
5597 const MachineOperand &Src1 = MI.getOperand(2);
5598 Register SrcCond = MI.getOperand(3).getReg();
5599
5600 Register DstLo = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
5601 Register DstHi = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
5602 const TargetRegisterClass *CondRC = TRI->getWaveMaskRegClass();
5603 Register SrcCondCopy = MRI.createVirtualRegister(CondRC);
5604
5605 int Src0Idx =
5606 AMDGPU::getNamedOperandIdx(MI.getOpcode(), AMDGPU::OpName::src0);
5607 int Src1Idx =
5608 AMDGPU::getNamedOperandIdx(MI.getOpcode(), AMDGPU::OpName::src1);
5609 const TargetRegisterClass *Src0RC =
5610 TRI->getAllocatableClass(TII->getRegClass(MI.getDesc(), Src0Idx));
5611 const TargetRegisterClass *Src1RC =
5612 TRI->getAllocatableClass(TII->getRegClass(MI.getDesc(), Src1Idx));
5613
5614 const TargetRegisterClass *Src0SubRC =
5615 TRI->getSubRegisterClass(Src0RC, AMDGPU::sub0);
5616 const TargetRegisterClass *Src1SubRC =
5617 TRI->getSubRegisterClass(Src1RC, AMDGPU::sub1);
5618
5619 MachineOperand Src0Sub0 = TII->buildExtractSubRegOrImm(
5620 MI, MRI, Src0, Src0RC, AMDGPU::sub0, Src0SubRC);
5621 MachineOperand Src1Sub0 = TII->buildExtractSubRegOrImm(
5622 MI, MRI, Src1, Src1RC, AMDGPU::sub0, Src1SubRC);
5623
5624 MachineOperand Src0Sub1 = TII->buildExtractSubRegOrImm(
5625 MI, MRI, Src0, Src0RC, AMDGPU::sub1, Src0SubRC);
5626 MachineOperand Src1Sub1 = TII->buildExtractSubRegOrImm(
5627 MI, MRI, Src1, Src1RC, AMDGPU::sub1, Src1SubRC);
5628
5629 BuildMI(*BB, MI, DL, TII->get(AMDGPU::COPY), SrcCondCopy).addReg(SrcCond);
5630 BuildMI(*BB, MI, DL, TII->get(AMDGPU::V_CNDMASK_B32_e64), DstLo)
5631 .addImm(0)
5632 .add(Src0Sub0)
5633 .addImm(0)
5634 .add(Src1Sub0)
5635 .addReg(SrcCondCopy);
5636
5637 BuildMI(*BB, MI, DL, TII->get(AMDGPU::V_CNDMASK_B32_e64), DstHi)
5638 .addImm(0)
5639 .add(Src0Sub1)
5640 .addImm(0)
5641 .add(Src1Sub1)
5642 .addReg(SrcCondCopy);
5643
5644 BuildMI(*BB, MI, DL, TII->get(AMDGPU::REG_SEQUENCE), Dst)
5645 .addReg(DstLo)
5646 .addImm(AMDGPU::sub0)
5647 .addReg(DstHi)
5648 .addImm(AMDGPU::sub1);
5649 MI.eraseFromParent();
5650}
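// That is, a 64-bit select is expanded into two V_CNDMASK_B32_e64 operations
// on the sub0/sub1 halves, sharing one copy of the condition, plus a
// REG_SEQUENCE to rebuild the 64-bit result.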
5651
5653 switch (Opc) {
5654 case AMDGPU::S_MIN_U32:
5655 return std::numeric_limits<uint32_t>::max();
5656 case AMDGPU::S_MIN_I32:
5657 return std::numeric_limits<int32_t>::max();
5658 case AMDGPU::S_MAX_U32:
5659 return std::numeric_limits<uint32_t>::min();
5660 case AMDGPU::S_MAX_I32:
5661 return std::numeric_limits<int32_t>::min();
5662 case AMDGPU::V_ADD_F32_e64: // -0.0
5663 return 0x80000000;
5664 case AMDGPU::V_SUB_F32_e64: // +0.0
5665 return 0x0;
5666 case AMDGPU::S_ADD_I32:
5667 case AMDGPU::S_SUB_I32:
5668 case AMDGPU::S_OR_B32:
5669 case AMDGPU::S_XOR_B32:
5670 return std::numeric_limits<uint32_t>::min();
5671 case AMDGPU::S_AND_B32:
5672 return std::numeric_limits<uint32_t>::max();
5673 case AMDGPU::V_MIN_F32_e64:
5674 case AMDGPU::V_MAX_F32_e64:
5675 return 0x7fc00000; // qNAN
5676 case AMDGPU::V_CMP_LT_U64_e64: // umin.u64
5677 return std::numeric_limits<uint64_t>::max();
5678 case AMDGPU::V_CMP_LT_I64_e64: // min.i64
5679 return std::numeric_limits<int64_t>::max();
5680 case AMDGPU::V_CMP_GT_U64_e64: // umax.u64
5681 return std::numeric_limits<uint64_t>::min();
5682 case AMDGPU::V_CMP_GT_I64_e64: // max.i64
5683 return std::numeric_limits<int64_t>::min();
5684 case AMDGPU::V_MIN_F64_e64:
5685 case AMDGPU::V_MAX_F64_e64:
5686 case AMDGPU::V_MIN_NUM_F64_e64:
5687 case AMDGPU::V_MAX_NUM_F64_e64:
5688 return 0x7FF8000000000000; // qNAN
5689 case AMDGPU::S_ADD_U64_PSEUDO:
5690 case AMDGPU::S_SUB_U64_PSEUDO:
5691 case AMDGPU::S_OR_B64:
5692 case AMDGPU::S_XOR_B64:
5693 return std::numeric_limits<uint64_t>::min();
5694 case AMDGPU::S_AND_B64:
5695 return std::numeric_limits<uint64_t>::max();
5696 case AMDGPU::V_ADD_F64_e64:
5697 case AMDGPU::V_ADD_F64_pseudo_e64:
5698 return 0x8000000000000000; // -0.0
5699 default:
5700 llvm_unreachable("Unexpected opcode in getIdentityValueForWaveReduction");
5701 }
5702}
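// Note: -0.0 is the neutral element of fadd (x + -0.0 == x for every x,
// including +0.0), and a quiet NaN is neutral for the min/max reductions
// because minNum/maxNum-style operations return the non-NaN operand.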
5703
5704static bool is32bitWaveReduceOperation(unsigned Opc) {
5705 return Opc == AMDGPU::S_MIN_U32 || Opc == AMDGPU::S_MIN_I32 ||
5706 Opc == AMDGPU::S_MAX_U32 || Opc == AMDGPU::S_MAX_I32 ||
5707 Opc == AMDGPU::S_ADD_I32 || Opc == AMDGPU::S_SUB_I32 ||
5708 Opc == AMDGPU::S_AND_B32 || Opc == AMDGPU::S_OR_B32 ||
5709 Opc == AMDGPU::S_XOR_B32 || Opc == AMDGPU::V_MIN_F32_e64 ||
5710 Opc == AMDGPU::V_MAX_F32_e64 || Opc == AMDGPU::V_ADD_F32_e64 ||
5711 Opc == AMDGPU::V_SUB_F32_e64;
5712}
5713
5715 return Opc == AMDGPU::V_MIN_F32_e64 || Opc == AMDGPU::V_MAX_F32_e64 ||
5716 Opc == AMDGPU::V_ADD_F32_e64 || Opc == AMDGPU::V_SUB_F32_e64 ||
5717 Opc == AMDGPU::V_MIN_F64_e64 || Opc == AMDGPU::V_MAX_F64_e64 ||
5718 Opc == AMDGPU::V_MIN_NUM_F64_e64 || Opc == AMDGPU::V_MAX_NUM_F64_e64 ||
5719 Opc == AMDGPU::V_ADD_F64_e64 || Opc == AMDGPU::V_ADD_F64_pseudo_e64;
5720}
5721
5722static std::tuple<unsigned, unsigned>
5724 unsigned DPPOpc;
5725 switch (Opc) {
5726 case AMDGPU::S_MIN_U32:
5727 DPPOpc = AMDGPU::V_MIN_U32_dpp;
5728 break;
5729 case AMDGPU::S_MIN_I32:
5730 DPPOpc = AMDGPU::V_MIN_I32_dpp;
5731 break;
5732 case AMDGPU::S_MAX_U32:
5733 DPPOpc = AMDGPU::V_MAX_U32_dpp;
5734 break;
5735 case AMDGPU::S_MAX_I32:
5736 DPPOpc = AMDGPU::V_MAX_I32_dpp;
5737 break;
5738 case AMDGPU::S_ADD_I32:
5739 case AMDGPU::S_SUB_I32:
5740 DPPOpc = ST.hasAddNoCarryInsts() ? AMDGPU::V_ADD_U32_dpp
5741 : AMDGPU::V_ADD_CO_U32_dpp;
5742 break;
5743 case AMDGPU::S_AND_B32:
5744 DPPOpc = AMDGPU::V_AND_B32_dpp;
5745 break;
5746 case AMDGPU::S_OR_B32:
5747 DPPOpc = AMDGPU::V_OR_B32_dpp;
5748 break;
5749 case AMDGPU::S_XOR_B32:
5750 DPPOpc = AMDGPU::V_XOR_B32_dpp;
5751 break;
5752 case AMDGPU::V_ADD_F32_e64:
5753 case AMDGPU::V_SUB_F32_e64:
5754 DPPOpc = AMDGPU::V_ADD_F32_dpp;
5755 break;
5756 case AMDGPU::V_MIN_F32_e64:
5757 DPPOpc = AMDGPU::V_MIN_F32_dpp;
5758 break;
5759 case AMDGPU::V_MAX_F32_e64:
5760 DPPOpc = AMDGPU::V_MAX_F32_dpp;
5761 break;
5762 case AMDGPU::V_CMP_LT_U64_e64: // umin.u64
5763 case AMDGPU::V_CMP_LT_I64_e64: // min.i64
5764 case AMDGPU::V_CMP_GT_U64_e64: // umax.u64
5765 case AMDGPU::V_CMP_GT_I64_e64: // max.i64
5766 case AMDGPU::S_ADD_U64_PSEUDO:
5767 case AMDGPU::S_SUB_U64_PSEUDO:
5768 case AMDGPU::S_AND_B64:
5769 case AMDGPU::S_OR_B64:
5770 case AMDGPU::S_XOR_B64:
5771 case AMDGPU::V_MIN_NUM_F64_e64:
5772 case AMDGPU::V_MIN_F64_e64:
5773 case AMDGPU::V_MAX_NUM_F64_e64:
5774 case AMDGPU::V_MAX_F64_e64:
5775 case AMDGPU::V_ADD_F64_pseudo_e64:
5776 case AMDGPU::V_ADD_F64_e64:
5777 DPPOpc = AMDGPU::V_MOV_B64_DPP_PSEUDO;
5778 break;
5779 default:
5780 llvm_unreachable("unhandled lane op");
5781 }
5782 unsigned ClampOpc = Opc;
5783 if (!ST.getInstrInfo()->isVALU(Opc)) {
5784 if (Opc == AMDGPU::S_SUB_I32)
5785 ClampOpc = AMDGPU::S_ADD_I32;
5786 if (Opc == AMDGPU::S_ADD_U64_PSEUDO || Opc == AMDGPU::S_SUB_U64_PSEUDO)
5787 ClampOpc = AMDGPU::V_ADD_CO_U32_e64;
5788 else if (Opc == AMDGPU::S_AND_B64)
5789 ClampOpc = AMDGPU::V_AND_B32_e64;
5790 else if (Opc == AMDGPU::S_OR_B64)
5791 ClampOpc = AMDGPU::V_OR_B32_e64;
5792 else if (Opc == AMDGPU::S_XOR_B64)
5793 ClampOpc = AMDGPU::V_XOR_B32_e64;
5794 else
5795 ClampOpc = ST.getInstrInfo()->getVALUOp(ClampOpc);
5796 }
5797 return {DPPOpc, ClampOpc};
5798}
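// DPPOpc performs one step of the DPP scan. ClampOpc is the ordinary VALU
// opcode used when a step's combine cannot be folded into the DPP op itself,
// e.g. for the 64-bit types (where the DPP op is just a move) and for the
// ds_swizzle / ds_permute broadcast fallbacks in lowerWaveReduce.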
5799
5800static std::pair<Register, Register>
5802 const TargetRegisterClass *SrcRC, const GCNSubtarget &ST,
5803 MachineRegisterInfo &MRI) {
5804 const SIRegisterInfo *TRI = ST.getRegisterInfo();
5805 const SIInstrInfo *TII = ST.getInstrInfo();
5806 const TargetRegisterClass *SrcSubRC =
5807 TRI->getSubRegisterClass(SrcRC, AMDGPU::sub0);
5808 Register Op1L =
5809 TII->buildExtractSubReg(MI, MRI, Op, SrcRC, AMDGPU::sub0, SrcSubRC);
5810 Register Op1H =
5811 TII->buildExtractSubReg(MI, MRI, Op, SrcRC, AMDGPU::sub1, SrcSubRC);
5812 return {Op1L, Op1H};
5813}
5814
5817 const GCNSubtarget &ST,
5818 unsigned Opc) {
5820 const SIRegisterInfo *TRI = ST.getRegisterInfo();
5821 const DebugLoc &DL = MI.getDebugLoc();
5822 const SIInstrInfo *TII = ST.getInstrInfo();
5823
5824 // Reduction operations depend on whether the input operand is SGPR or VGPR.
5825 Register SrcReg = MI.getOperand(1).getReg();
5826 bool isSGPR = TRI->isSGPRClass(MRI.getRegClass(SrcReg));
5827 Register DstReg = MI.getOperand(0).getReg();
5828 unsigned Strategy = static_cast<unsigned>(MI.getOperand(2).getImm());
5829 enum WAVE_REDUCE_STRATEGY : unsigned { DEFAULT = 0, ITERATIVE = 1, DPP = 2 };
5830 MachineBasicBlock *RetBB = nullptr;
5831 unsigned MIOpc = MI.getOpcode();
5832 auto BuildRegSequence = [&](MachineBasicBlock &BB,
5834 Register Src0, Register Src1) {
5835 auto RegSequence =
5836 BuildMI(BB, MI, DL, TII->get(TargetOpcode::REG_SEQUENCE), Dst)
5837 .addReg(Src0)
5838 .addImm(AMDGPU::sub0)
5839 .addReg(Src1)
5840 .addImm(AMDGPU::sub1);
5841 return RegSequence;
5842 };
5843 if (isSGPR) {
5844 switch (Opc) {
5845 case AMDGPU::S_MIN_U32:
5846 case AMDGPU::S_MIN_I32:
5847 case AMDGPU::V_MIN_F32_e64:
5848 case AMDGPU::S_MAX_U32:
5849 case AMDGPU::S_MAX_I32:
5850 case AMDGPU::V_MAX_F32_e64:
5851 case AMDGPU::S_AND_B32:
5852 case AMDGPU::S_OR_B32: {
5853 // Idempotent operations.
5854 BuildMI(BB, MI, DL, TII->get(AMDGPU::S_MOV_B32), DstReg).addReg(SrcReg);
5855 RetBB = &BB;
5856 break;
5857 }
5858 case AMDGPU::V_CMP_LT_U64_e64: // umin
5859 case AMDGPU::V_CMP_LT_I64_e64: // min
5860 case AMDGPU::V_CMP_GT_U64_e64: // umax
5861 case AMDGPU::V_CMP_GT_I64_e64: // max
5862 case AMDGPU::V_MIN_F64_e64:
5863 case AMDGPU::V_MIN_NUM_F64_e64:
5864 case AMDGPU::V_MAX_F64_e64:
5865 case AMDGPU::V_MAX_NUM_F64_e64:
5866 case AMDGPU::S_AND_B64:
5867 case AMDGPU::S_OR_B64: {
5868 // Idempotent operations.
5869 BuildMI(BB, MI, DL, TII->get(AMDGPU::S_MOV_B64), DstReg).addReg(SrcReg);
5870 RetBB = &BB;
5871 break;
5872 }
5873 case AMDGPU::S_XOR_B32:
5874 case AMDGPU::S_XOR_B64:
5875 case AMDGPU::S_ADD_I32:
5876 case AMDGPU::S_ADD_U64_PSEUDO:
5877 case AMDGPU::V_ADD_F32_e64:
5878 case AMDGPU::V_ADD_F64_e64:
5879 case AMDGPU::V_ADD_F64_pseudo_e64:
5880 case AMDGPU::S_SUB_I32:
5881 case AMDGPU::S_SUB_U64_PSEUDO:
5882 case AMDGPU::V_SUB_F32_e64: {
5883 const TargetRegisterClass *WaveMaskRegClass = TRI->getWaveMaskRegClass();
5884 const TargetRegisterClass *DstRegClass = MRI.getRegClass(DstReg);
5885 Register ExecMask = MRI.createVirtualRegister(WaveMaskRegClass);
5886 Register NumActiveLanes =
5887 MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass);
5888
5889 bool IsWave32 = ST.isWave32();
5890 unsigned MovOpc = IsWave32 ? AMDGPU::S_MOV_B32 : AMDGPU::S_MOV_B64;
5891 MCRegister ExecReg = IsWave32 ? AMDGPU::EXEC_LO : AMDGPU::EXEC;
5892 unsigned BitCountOpc =
5893 IsWave32 ? AMDGPU::S_BCNT1_I32_B32 : AMDGPU::S_BCNT1_I32_B64;
5894
5895 BuildMI(BB, MI, DL, TII->get(MovOpc), ExecMask).addReg(ExecReg);
5896
5897 auto NewAccumulator =
5898 BuildMI(BB, MI, DL, TII->get(BitCountOpc), NumActiveLanes)
5899 .addReg(ExecMask);
5900
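// For a uniform input the reduction collapses to simple arithmetic on the
// active-lane count: add/sub become Src * popcount(EXEC) (negated for sub)
// and xor becomes Src * (popcount(EXEC) & 1).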
5901 switch (Opc) {
5902 case AMDGPU::S_XOR_B32:
5903 case AMDGPU::S_XOR_B64: {
5904 // Performing an XOR operation on a uniform value
5905 // depends on the parity of the number of active lanes.
5906 // For even parity, the result will be 0, for odd
5907 // parity the result will be the same as the input value.
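// E.g. with five active lanes x^x^x^x^x == x; with four, the result is 0.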
5908 Register ParityRegister =
5909 MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass);
5910
5911 BuildMI(BB, MI, DL, TII->get(AMDGPU::S_AND_B32), ParityRegister)
5912 .addReg(NewAccumulator->getOperand(0).getReg())
5913 .addImm(1)
5914 .setOperandDead(3); // Dead scc
5915 if (Opc == AMDGPU::S_XOR_B32) {
5916 BuildMI(BB, MI, DL, TII->get(AMDGPU::S_MUL_I32), DstReg)
5917 .addReg(SrcReg)
5918 .addReg(ParityRegister);
5919 } else {
5920 Register DestSub0 =
5921 MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass);
5922 Register DestSub1 =
5923 MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass);
5924 auto [Op1L, Op1H] = ExtractSubRegs(MI, MI.getOperand(1),
5925 MRI.getRegClass(SrcReg), ST, MRI);
5926 BuildMI(BB, MI, DL, TII->get(AMDGPU::S_MUL_I32), DestSub0)
5927 .addReg(Op1L)
5928 .addReg(ParityRegister);
5929 BuildMI(BB, MI, DL, TII->get(AMDGPU::S_MUL_I32), DestSub1)
5930 .addReg(Op1H)
5931 .addReg(ParityRegister);
5932 BuildRegSequence(BB, MI, DstReg, DestSub0, DestSub1);
5933 }
5934 break;
5935 }
5936 case AMDGPU::S_SUB_I32: {
5937 Register NegatedVal = MRI.createVirtualRegister(DstRegClass);
5938
5939 // Take the negation of the source operand.
5940 BuildMI(BB, MI, DL, TII->get(AMDGPU::S_SUB_I32), NegatedVal)
5941 .addImm(0)
5942 .addReg(SrcReg);
5943 BuildMI(BB, MI, DL, TII->get(AMDGPU::S_MUL_I32), DstReg)
5944 .addReg(NegatedVal)
5945 .addReg(NewAccumulator->getOperand(0).getReg());
5946 break;
5947 }
5948 case AMDGPU::S_ADD_I32: {
5949 BuildMI(BB, MI, DL, TII->get(AMDGPU::S_MUL_I32), DstReg)
5950 .addReg(SrcReg)
5951 .addReg(NewAccumulator->getOperand(0).getReg());
5952 break;
5953 }
5954 case AMDGPU::S_ADD_U64_PSEUDO:
5955 case AMDGPU::S_SUB_U64_PSEUDO: {
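// The 64-bit result is Src * N, where N is popcount(EXEC) (negated and
// sign-extended for the sub case), assembled from 32-bit pieces:
//   lo = Op1L * N
//   hi = mulhi_u32(Op1L, N) + Op1H * N   [+ Op1L * hi(N) for sub]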
5956 Register DestSub0 = MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass);
5957 Register DestSub1 = MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass);
5958 Register Op1H_Op0L_Reg =
5959 MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass);
5960 Register Op1L_Op0H_Reg =
5961 MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass);
5962 Register CarryReg = MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass);
5963 Register AddReg = MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass);
5964 Register NegatedValLo =
5965 MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass);
5966 Register NegatedValHi =
5967 MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass);
5968 auto [Op1L, Op1H] = ExtractSubRegs(MI, MI.getOperand(1),
5969 MRI.getRegClass(SrcReg), ST, MRI);
5970 if (Opc == AMDGPU::S_SUB_U64_PSEUDO) {
5971 BuildMI(BB, MI, DL, TII->get(AMDGPU::S_SUB_I32), NegatedValLo)
5972 .addImm(0)
5973 .addReg(NewAccumulator->getOperand(0).getReg())
5974 .setOperandDead(3); // Dead scc
5975 BuildMI(BB, MI, DL, TII->get(AMDGPU::S_ASHR_I32), NegatedValHi)
5976 .addReg(NegatedValLo)
5977 .addImm(31)
5978 .setOperandDead(3); // Dead scc
5979 BuildMI(BB, MI, DL, TII->get(AMDGPU::S_MUL_I32), Op1L_Op0H_Reg)
5980 .addReg(Op1L)
5981 .addReg(NegatedValHi);
5982 }
5983 Register LowOpcode = Opc == AMDGPU::S_SUB_U64_PSEUDO
5984 ? NegatedValLo
5985 : NewAccumulator->getOperand(0).getReg();
5986 BuildMI(BB, MI, DL, TII->get(AMDGPU::S_MUL_I32), DestSub0)
5987 .addReg(Op1L)
5988 .addReg(LowOpcode);
5989 BuildMI(BB, MI, DL, TII->get(AMDGPU::S_MUL_HI_U32), CarryReg)
5990 .addReg(Op1L)
5991 .addReg(LowOpcode);
5992 BuildMI(BB, MI, DL, TII->get(AMDGPU::S_MUL_I32), Op1H_Op0L_Reg)
5993 .addReg(Op1H)
5994 .addReg(LowOpcode);
5995
5996 Register HiVal = Opc == AMDGPU::S_SUB_U64_PSEUDO ? AddReg : DestSub1;
5997 BuildMI(BB, MI, DL, TII->get(AMDGPU::S_ADD_U32), HiVal)
5998 .addReg(CarryReg)
5999 .addReg(Op1H_Op0L_Reg)
6000 .setOperandDead(3); // Dead scc
6001
6002 if (Opc == AMDGPU::S_SUB_U64_PSEUDO) {
6003 BuildMI(BB, MI, DL, TII->get(AMDGPU::S_ADD_U32), DestSub1)
6004 .addReg(HiVal)
6005 .addReg(Op1L_Op0H_Reg)
6006 .setOperandDead(3); // Dead scc
6007 }
6008 BuildRegSequence(BB, MI, DstReg, DestSub0, DestSub1);
6009 break;
6010 }
6011 case AMDGPU::V_ADD_F32_e64:
6012 case AMDGPU::V_ADD_F64_e64:
6013 case AMDGPU::V_ADD_F64_pseudo_e64:
6014 case AMDGPU::V_SUB_F32_e64: {
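// A uniform fadd/fsub reduction is Src * float(popcount(EXEC)); the sub
// variants negate Src via the src0 modifier on the multiply.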
6015 bool is32BitOpc = is32bitWaveReduceOperation(Opc);
6016 const TargetRegisterClass *VregRC = TII->getRegClass(TII->get(Opc), 0);
6017 Register ActiveLanesVreg = MRI.createVirtualRegister(VregRC);
6018 Register DstVreg = MRI.createVirtualRegister(VregRC);
6019 // Get number of active lanes as a float val.
6020 BuildMI(BB, MI, DL,
6021 TII->get(is32BitOpc ? AMDGPU::V_CVT_F32_I32_e64
6022 : AMDGPU::V_CVT_F64_I32_e64),
6023 ActiveLanesVreg)
6024 .addReg(NewAccumulator->getOperand(0).getReg())
6025 .addImm(0) // clamp
6026 .addImm(0); // output-modifier
6027
6028 // Take the negation of the input for SUB reductions.
6029 unsigned srcMod = (MIOpc == AMDGPU::WAVE_REDUCE_FSUB_PSEUDO_F32 ||
6030 MIOpc == AMDGPU::WAVE_REDUCE_FSUB_PSEUDO_F64)
6033 unsigned MulOpc = is32BitOpc ? AMDGPU::V_MUL_F32_e64
6034 : ST.getGeneration() >= AMDGPUSubtarget::GFX12
6035 ? AMDGPU::V_MUL_F64_pseudo_e64
6036 : AMDGPU::V_MUL_F64_e64;
6037 auto DestVregInst = BuildMI(BB, MI, DL, TII->get(MulOpc),
6038 DstVreg)
6039 .addImm(srcMod) // src0 modifier
6040 .addReg(SrcReg)
6041 .addImm(SISrcMods::NONE) // src1 modifier
6042 .addReg(ActiveLanesVreg)
6043 .addImm(SISrcMods::NONE) // clamp
6044 .addImm(SISrcMods::NONE); // output-mod
6045 if (is32BitOpc) {
6046 BuildMI(BB, MI, DL, TII->get(AMDGPU::V_READFIRSTLANE_B32), DstReg)
6047 .addReg(DstVreg);
6048 } else {
6049 Register LaneValueLoReg =
6050 MRI.createVirtualRegister(&AMDGPU::SReg_32_XM0RegClass);
6051 Register LaneValueHiReg =
6052 MRI.createVirtualRegister(&AMDGPU::SReg_32_XM0RegClass);
6053 auto [Op1L, Op1H] =
6054 ExtractSubRegs(MI, DestVregInst->getOperand(0), VregRC, ST, MRI);
6055 // The lane value input needs to be in an SGPR.
6056 BuildMI(BB, MI, DL, TII->get(AMDGPU::V_READFIRSTLANE_B32),
6057 LaneValueLoReg)
6058 .addReg(Op1L);
6059 BuildMI(BB, MI, DL, TII->get(AMDGPU::V_READFIRSTLANE_B32),
6060 LaneValueHiReg)
6061 .addReg(Op1H);
6062 NewAccumulator =
6063 BuildRegSequence(BB, MI, DstReg, LaneValueLoReg, LaneValueHiReg);
6064 }
6065 }
6066 }
6067 RetBB = &BB;
6068 }
6069 }
6070 } else {
6072 Register SrcReg = MI.getOperand(1).getReg();
6073 bool is32BitOpc = is32bitWaveReduceOperation(Opc);
6075 bool NeedsMovDPP = !is32BitOpc;
6076 // Create virtual registers required for lowering.
6077 const TargetRegisterClass *WaveMaskRegClass = TRI->getWaveMaskRegClass();
6078 const TargetRegisterClass *DstRegClass = MRI.getRegClass(DstReg);
6079 const TargetRegisterClass *SrcRegClass = MRI.getRegClass(SrcReg);
6080 bool IsWave32 = ST.isWave32();
6081 unsigned MovOpcForExec = IsWave32 ? AMDGPU::S_MOV_B32 : AMDGPU::S_MOV_B64;
6082 unsigned ExecReg = IsWave32 ? AMDGPU::EXEC_LO : AMDGPU::EXEC;
6083 if (Strategy == WAVE_REDUCE_STRATEGY::ITERATIVE ||
6084 !ST.hasDPP()) { // If the target doesn't support DPP operations,
6085 // default to the iterative strategy.
6086
6087 // To reduce the VGPR using the iterative approach, we need to iterate over
6088 // all the active lanes. The lowering consists of a ComputeLoop that iterates
6089 // over only the active lanes. We use a copy of the EXEC register as the
6090 // induction variable, and every iteration clears the just-processed lane's
6091 // bit with bitset0 so that the next iteration picks the next active lane.
6092
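// Roughly:
//   acc = identity; bits = EXEC;
//   do {
//     lane = s_ff1(bits);
//     acc  = op(acc, readlane(src, lane));
//     bits = bitset0(bits, lane);
//   } while (bits != 0);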
6093 // Create the control flow for the loop: split MI's basic block into the
6094 // loop body (ComputeLoop) and the continuation block (ComputeEnd).
6095 auto [ComputeLoop, ComputeEnd] = splitBlockForLoop(MI, BB, true);
6096
6097 Register LoopIterator = MRI.createVirtualRegister(WaveMaskRegClass);
6098 Register IdentityValReg = MRI.createVirtualRegister(DstRegClass);
6099 Register AccumulatorReg = MRI.createVirtualRegister(DstRegClass);
6100 Register ActiveBitsReg = MRI.createVirtualRegister(WaveMaskRegClass);
6101 Register NewActiveBitsReg = MRI.createVirtualRegister(WaveMaskRegClass);
6102 Register FF1Reg = MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass);
6103 Register LaneValueReg = MRI.createVirtualRegister(DstRegClass);
6104
6105 // Create the initial values of the induction variable (a copy of EXEC) and
6106 // the accumulator, and branch to the newly created ComputeLoop block.
6107 BuildMI(BB, I, DL, TII->get(MovOpcForExec), LoopIterator).addReg(ExecReg);
6108 uint64_t IdentityValue =
6109 MI.getOpcode() == AMDGPU::WAVE_REDUCE_FSUB_PSEUDO_F64
6110 ? 0x0 // +0.0 for double sub reduction
6112 BuildMI(BB, I, DL,
6113 TII->get(is32BitOpc ? AMDGPU::S_MOV_B32
6114 : AMDGPU::S_MOV_B64_IMM_PSEUDO),
6115 IdentityValReg)
6116 .addImm(IdentityValue);
6117 // clang-format off
6118 BuildMI(BB, I, DL, TII->get(AMDGPU::S_BRANCH))
6119 .addMBB(ComputeLoop);
6120 // clang-format on
6121
6122 // Start constructing ComputeLoop
6123 I = ComputeLoop->begin();
6124 auto Accumulator =
6125 BuildMI(*ComputeLoop, I, DL, TII->get(AMDGPU::PHI), AccumulatorReg)
6126 .addReg(IdentityValReg)
6127 .addMBB(&BB);
6128 auto ActiveBits =
6129 BuildMI(*ComputeLoop, I, DL, TII->get(AMDGPU::PHI), ActiveBitsReg)
6130 .addReg(LoopIterator)
6131 .addMBB(&BB);
6132
6133 I = ComputeLoop->end();
6134 MachineInstr *NewAccumulator;
6135 // Perform the computations
6136 unsigned SFFOpc =
6137 IsWave32 ? AMDGPU::S_FF1_I32_B32 : AMDGPU::S_FF1_I32_B64;
6138 BuildMI(*ComputeLoop, I, DL, TII->get(SFFOpc), FF1Reg)
6139 .addReg(ActiveBitsReg);
6140 if (is32BitOpc) {
6141 BuildMI(*ComputeLoop, I, DL, TII->get(AMDGPU::V_READLANE_B32),
6142 LaneValueReg)
6143 .addReg(SrcReg)
6144 .addReg(FF1Reg);
6145 if (isFPOp) {
6146 Register LaneValVreg =
6147 MRI.createVirtualRegister(MRI.getRegClass(SrcReg));
6148 Register DstVreg = MRI.createVirtualRegister(MRI.getRegClass(SrcReg));
6149 // Move the lane value into a VGPR to avoid the constant bus restriction.
6150 BuildMI(*ComputeLoop, I, DL, TII->get(AMDGPU::V_MOV_B32_e32),
6151 LaneValVreg)
6152 .addReg(LaneValueReg);
6153 BuildMI(*ComputeLoop, I, DL, TII->get(Opc), DstVreg)
6154 .addImm(0) // src0 modifier
6155 .addReg(Accumulator->getOperand(0).getReg())
6156 .addImm(0) // src1 modifier
6157 .addReg(LaneValVreg)
6158 .addImm(0) // clamp
6159 .addImm(0); // omod
6160 NewAccumulator =
6161 BuildMI(*ComputeLoop, I, DL,
6162 TII->get(AMDGPU::V_READFIRSTLANE_B32), DstReg)
6163 .addReg(DstVreg);
6164 } else {
6165 NewAccumulator = BuildMI(*ComputeLoop, I, DL, TII->get(Opc), DstReg)
6166 .addReg(Accumulator->getOperand(0).getReg())
6167 .addReg(LaneValueReg);
6168 }
6169 } else {
6170 Register LaneValueLoReg =
6171 MRI.createVirtualRegister(&AMDGPU::SReg_32_XM0RegClass);
6172 Register LaneValueHiReg =
6173 MRI.createVirtualRegister(&AMDGPU::SReg_32_XM0RegClass);
6174 Register LaneValReg =
6175 MRI.createVirtualRegister(&AMDGPU::SReg_64RegClass);
6176 auto [Op1L, Op1H] = ExtractSubRegs(MI, MI.getOperand(1),
6177 MRI.getRegClass(SrcReg), ST, MRI);
6178 // The lane value input needs to be in an SGPR.
6179 BuildMI(*ComputeLoop, I, DL, TII->get(AMDGPU::V_READLANE_B32),
6180 LaneValueLoReg)
6181 .addReg(Op1L)
6182 .addReg(FF1Reg);
6183 BuildMI(*ComputeLoop, I, DL, TII->get(AMDGPU::V_READLANE_B32),
6184 LaneValueHiReg)
6185 .addReg(Op1H)
6186 .addReg(FF1Reg);
6187 auto LaneValue = BuildRegSequence(*ComputeLoop, I, LaneValReg,
6188 LaneValueLoReg, LaneValueHiReg);
6189 switch (Opc) {
6190 case AMDGPU::S_OR_B64:
6191 case AMDGPU::S_AND_B64:
6192 case AMDGPU::S_XOR_B64: {
6193 NewAccumulator = BuildMI(*ComputeLoop, I, DL, TII->get(Opc), DstReg)
6194 .addReg(Accumulator->getOperand(0).getReg())
6195 .addReg(LaneValue->getOperand(0).getReg())
6196 .setOperandDead(3); // Dead scc
6197 break;
6198 }
6199 case AMDGPU::V_CMP_GT_I64_e64:
6200 case AMDGPU::V_CMP_GT_U64_e64:
6201 case AMDGPU::V_CMP_LT_I64_e64:
6202 case AMDGPU::V_CMP_LT_U64_e64: {
6203 Register LaneMaskReg = MRI.createVirtualRegister(WaveMaskRegClass);
6204 Register ComparisonResultReg =
6205 MRI.createVirtualRegister(WaveMaskRegClass);
6206 int SrcIdx =
6207 AMDGPU::getNamedOperandIdx(MI.getOpcode(), AMDGPU::OpName::src);
6208 const TargetRegisterClass *VregClass =
6209 TRI->getAllocatableClass(TII->getRegClass(MI.getDesc(), SrcIdx));
6210 Register AccumulatorVReg = MRI.createVirtualRegister(VregClass);
6211 auto [SrcReg0Sub0, SrcReg0Sub1] = ExtractSubRegs(
6212 MI, Accumulator->getOperand(0), VregClass, ST, MRI);
6213 BuildRegSequence(*ComputeLoop, I, AccumulatorVReg, SrcReg0Sub0,
6214 SrcReg0Sub1);
6215 BuildMI(*ComputeLoop, I, DL, TII->get(Opc), LaneMaskReg)
6216 .addReg(LaneValue->getOperand(0).getReg())
6217 .addReg(AccumulatorVReg);
6218
6219 unsigned AndOpc = IsWave32 ? AMDGPU::S_AND_B32 : AMDGPU::S_AND_B64;
6220 BuildMI(*ComputeLoop, I, DL, TII->get(AndOpc), ComparisonResultReg)
6221 .addReg(LaneMaskReg)
6222 .addReg(ActiveBitsReg);
6223
6224 NewAccumulator = BuildMI(*ComputeLoop, I, DL,
6225 TII->get(AMDGPU::S_CSELECT_B64), DstReg)
6226 .addReg(LaneValue->getOperand(0).getReg())
6227 .addReg(Accumulator->getOperand(0).getReg());
6228 break;
6229 }
6230 case AMDGPU::V_MIN_F64_e64:
6231 case AMDGPU::V_MIN_NUM_F64_e64:
6232 case AMDGPU::V_MAX_F64_e64:
6233 case AMDGPU::V_MAX_NUM_F64_e64:
6234 case AMDGPU::V_ADD_F64_e64:
6235 case AMDGPU::V_ADD_F64_pseudo_e64: {
6236 int SrcIdx =
6237 AMDGPU::getNamedOperandIdx(MI.getOpcode(), AMDGPU::OpName::src);
6238 const TargetRegisterClass *VregRC =
6239 TRI->getAllocatableClass(TII->getRegClass(MI.getDesc(), SrcIdx));
6240 Register AccumulatorVReg = MRI.createVirtualRegister(VregRC);
6241 Register DstVreg = MRI.createVirtualRegister(VregRC);
6242 Register LaneValLo =
6243 MRI.createVirtualRegister(&AMDGPU::SReg_32_XM0RegClass);
6244 Register LaneValHi =
6245 MRI.createVirtualRegister(&AMDGPU::SReg_32_XM0RegClass);
6246 BuildMI(*ComputeLoop, I, DL, TII->get(AMDGPU::COPY), AccumulatorVReg)
6247 .addReg(Accumulator->getOperand(0).getReg());
6248 unsigned Modifier =
6249 MI.getOpcode() == AMDGPU::WAVE_REDUCE_FSUB_PSEUDO_F64
6252 auto DstVregInst =
6253 BuildMI(*ComputeLoop, I, DL, TII->get(Opc), DstVreg)
6254 .addImm(Modifier) // src0 modifiers
6255 .addReg(LaneValue->getOperand(0).getReg())
6256 .addImm(SISrcMods::NONE) // src1 modifiers
6257 .addReg(AccumulatorVReg)
6258 .addImm(SISrcMods::NONE) // clamp
6259 .addImm(SISrcMods::NONE); // omod
6260 auto ReadLaneLo =
6261 BuildMI(*ComputeLoop, I, DL,
6262 TII->get(AMDGPU::V_READFIRSTLANE_B32), LaneValLo);
6263 auto ReadLaneHi =
6264 BuildMI(*ComputeLoop, I, DL,
6265 TII->get(AMDGPU::V_READFIRSTLANE_B32), LaneValHi);
6266 MachineBasicBlock::iterator Iters = *ReadLaneLo;
6267 auto [Op1L, Op1H] = ExtractSubRegs(*Iters, DstVregInst->getOperand(0),
6268 VregRC, ST, MRI);
6269 ReadLaneLo.addReg(Op1L);
6270 ReadLaneHi.addReg(Op1H);
6271 NewAccumulator =
6272 BuildRegSequence(*ComputeLoop, I, DstReg, LaneValLo, LaneValHi);
6273 break;
6274 }
6275 case AMDGPU::S_ADD_U64_PSEUDO:
6276 case AMDGPU::S_SUB_U64_PSEUDO: {
6277 NewAccumulator = BuildMI(*ComputeLoop, I, DL, TII->get(Opc), DstReg)
6278 .addReg(Accumulator->getOperand(0).getReg())
6279 .addReg(LaneValue->getOperand(0).getReg());
6280 ComputeLoop =
6281 expand64BitScalarArithmetic(*NewAccumulator, ComputeLoop);
6282 break;
6283 }
6284 }
6285 }
6286 // Clear the just-processed lane's bit in the active-lane mask to advance to
6287 // the next active lane.
6287 unsigned BITSETOpc =
6288 IsWave32 ? AMDGPU::S_BITSET0_B32 : AMDGPU::S_BITSET0_B64;
6289 BuildMI(*ComputeLoop, I, DL, TII->get(BITSETOpc), NewActiveBitsReg)
6290 .addReg(FF1Reg)
6291 .addReg(ActiveBitsReg);
6292
6293 // Add phi nodes
6294 Accumulator.addReg(DstReg).addMBB(ComputeLoop);
6295 ActiveBits.addReg(NewActiveBitsReg).addMBB(ComputeLoop);
6296
6297 // Create the loop back-edge branch.
6298 unsigned CMPOpc = IsWave32 ? AMDGPU::S_CMP_LG_U32 : AMDGPU::S_CMP_LG_U64;
6299 BuildMI(*ComputeLoop, I, DL, TII->get(CMPOpc))
6300 .addReg(NewActiveBitsReg)
6301 .addImm(0);
6302 BuildMI(*ComputeLoop, I, DL, TII->get(AMDGPU::S_CBRANCH_SCC1))
6303 .addMBB(ComputeLoop);
6304
6305 RetBB = ComputeEnd;
6306 } else {
6307 assert(ST.hasDPP() && "Subtarget does not support DPP operations");
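// DPP strategy: set the inactive lanes to the identity value, then combine
// each lane with its neighbours via row_shr:1,2,4,8 followed by row
// broadcasts (or ds_swizzle / ds_permute fallbacks), so the last lane ends up
// holding the reduction of all active lanes, which is finally read back into
// an SGPR.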
6308 MachineBasicBlock *CurrBB = &BB;
6309 Register SrcWithIdentity = MRI.createVirtualRegister(SrcRegClass);
6310 Register IdentityVGPR = MRI.createVirtualRegister(SrcRegClass);
6311 Register IdentitySGPR = MRI.createVirtualRegister(DstRegClass);
6312 Register DPPRowShr1 = MRI.createVirtualRegister(SrcRegClass);
6313 Register DPPRowShr2 = MRI.createVirtualRegister(SrcRegClass);
6314 Register DPPRowShr4 = MRI.createVirtualRegister(SrcRegClass);
6315 Register DPPRowShr8 = MRI.createVirtualRegister(SrcRegClass);
6316 Register RowBcast15 = MRI.createVirtualRegister(SrcRegClass);
6317 Register ReducedValSGPR = MRI.createVirtualRegister(DstRegClass);
6318 Register NegatedReducedVal = MRI.createVirtualRegister(DstRegClass);
6319 Register RowBcast31 = MRI.createVirtualRegister(SrcRegClass);
6320 Register UndefExec = MRI.createVirtualRegister(WaveMaskRegClass);
6321 Register FinalDPPResult;
6322 MachineInstr *SrcWithIdentityInstr;
6323 MachineInstr *LastBcastInstr;
6324 BuildMI(*CurrBB, MI, DL, TII->get(AMDGPU::IMPLICIT_DEF), UndefExec);
6325
6327 BuildMI(*CurrBB, MI, DL,
6328 TII->get(is32BitOpc ? AMDGPU::S_MOV_B32
6329 : AMDGPU::S_MOV_B64_IMM_PSEUDO),
6330 IdentitySGPR)
6331 .addImm(IdentityValue);
6332 auto IdentityCopyInstr =
6333 BuildMI(*CurrBB, MI, DL, TII->get(AMDGPU::COPY), IdentityVGPR)
6334 .addReg(IdentitySGPR);
6335 auto DPPClampOpcPair = getDPPOpcForWaveReduction(Opc, ST);
6336 unsigned DPPOpc = std::get<0>(DPPClampOpcPair);
6337 unsigned ClampOpc = std::get<1>(DPPClampOpcPair);
6338 auto BuildSetInactiveInstr = [&](Register Dst, Register Src0,
6339 Register Src1) {
6340 return BuildMI(BB, MI, DL, TII->get(AMDGPU::V_SET_INACTIVE_B32),
6341 Dst)
6342 .addImm(0) // src0 modifiers
6343 .addReg(Src0) // src0
6344 .addImm(0) // src1 modifiers
6345 .addReg(Src1) // identity value for inactive lanes
6346 .addReg(UndefExec); // bool i1
6347 };
6348 auto BuildDPPMachineInstr = [&](Register Dst, Register Src,
6349 unsigned DPPCtrl) {
6350 auto DPPInstr =
6351 BuildMI(*CurrBB, MI, DL, TII->get(DPPOpc), Dst).addReg(Src); // old
6352 if (isFPOp && !NeedsMovDPP)
6353 DPPInstr.addImm(SISrcMods::NONE); // src0 modifier
6354 DPPInstr.addReg(Src); // src0
6355 if (isFPOp && !NeedsMovDPP)
6356 DPPInstr.addImm(SISrcMods::NONE); // src1 modifier
6357 if (!NeedsMovDPP)
6358 DPPInstr.addReg(Src); // src1
6359 if (AMDGPU::getNamedOperandIdx(DPPOpc, AMDGPU::OpName::clamp) >= 0)
6360 DPPInstr.addImm(0); // clamp
6361 DPPInstr
6362 .addImm(DPPCtrl) // dpp-ctrl
6363 .addImm(0xf) // row-mask
6364 .addImm(0xf) // bank-mask
6365 .addImm(0); // bound-control
6366 };
6367 auto BuildClampInstr = [&](Register Dst, Register Src0, Register Src1,
6368 bool isAddSub = false,
6369 bool needsCarryIn = false,
6370 Register CarryIn = Register()) {
6371 unsigned InstrOpc = ClampOpc;
6372 Register CarryOutReg = MRI.createVirtualRegister(WaveMaskRegClass);
6373 if (needsCarryIn)
6374 InstrOpc = AMDGPU::V_ADDC_U32_e64;
6375 auto ClampInstr = BuildMI(*CurrBB, MI, DL, TII->get(InstrOpc), Dst);
6376 if (isFPOp)
6377 ClampInstr.addImm(SISrcMods::NONE); // src0 mod
6378 if (isAddSub) {
6379 if (needsCarryIn)
6380 ClampInstr.addReg(CarryOutReg,
6382 RegState::Dead); // killed carry-out reg
6383 else
6384 ClampInstr.addReg(CarryOutReg, RegState::Define); // carry-out reg
6385 }
6386 ClampInstr.addReg(Src0); // src0
6387 if (isFPOp)
6388 ClampInstr.addImm(SISrcMods::NONE); // src1 mod
6389 ClampInstr.addReg(Src1); // src1
6390 if (needsCarryIn)
6391 ClampInstr.addReg(CarryIn, RegState::Kill); // carry-in reg
6392 if (AMDGPU::getNamedOperandIdx(InstrOpc, AMDGPU::OpName::clamp) >= 0)
6393 ClampInstr.addImm(0); // clamp
6394 if (isFPOp)
6395 ClampInstr.addImm(0); // omod
6396 LastBcastInstr = ClampInstr;
6397 return CarryOutReg;
6398 };
6399 auto BuildPostDPPInstr = [&](Register Src0, Register Src1) {
6400 bool isAddSubOpc =
6401 Opc == AMDGPU::S_ADD_U64_PSEUDO || Opc == AMDGPU::S_SUB_U64_PSEUDO;
6402 bool isBitWiseOpc = Opc == AMDGPU::S_AND_B64 ||
6403 Opc == AMDGPU::S_OR_B64 || Opc == AMDGPU::S_XOR_B64;
6404 Register ReturnReg = MRI.createVirtualRegister(SrcRegClass);
6405 if (isAddSubOpc || isBitWiseOpc) {
6406 Register ResLo = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
6407 Register ResHi = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
6408 MachineOperand Src0Operand =
6409 MachineOperand::CreateReg(Src0, /*isDef=*/false);
6410 MachineOperand Src1Operand =
6411 MachineOperand::CreateReg(Src1, /*isDef=*/false);
6412 auto [Src0Lo, Src0Hi] =
6413 ExtractSubRegs(MI, Src0Operand, SrcRegClass, ST, MRI);
6414 auto [Src1Lo, Src1Hi] =
6415 ExtractSubRegs(MI, Src1Operand, SrcRegClass, ST, MRI);
6416 Register CarryReg = BuildClampInstr(
6417 ResLo, Src0Lo, Src1Lo, isAddSubOpc, /*needsCarryIn*/ false);
6418 BuildClampInstr(ResHi, Src0Hi, Src1Hi, isAddSubOpc,
6419 /*needsCarryIn*/ isAddSubOpc, CarryReg);
6420 BuildRegSequence(*CurrBB, MI, ReturnReg, ResLo, ResHi);
6421 } else {
6422 if (isFPOp) {
6423 BuildMI(*CurrBB, MI, DL, TII->get(Opc), ReturnReg)
6424 .addImm(SISrcMods::NONE) // src0 modifiers
6425 .addReg(Src0)
6426 .addImm(SISrcMods::NONE) // src1 modifiers
6427 .addReg(Src1)
6428 .addImm(SISrcMods::NONE) // clamp
6429 .addImm(SISrcMods::NONE); // omod
6430 } else {
6431 Register CmpMaskReg = MRI.createVirtualRegister(WaveMaskRegClass);
6432 BuildMI(*CurrBB, MI, DL, TII->get(Opc), CmpMaskReg)
6433 .addReg(Src0) // src0
6434 .addReg(Src1); // src1
6435 LastBcastInstr =
6436 BuildMI(*CurrBB, MI, DL, TII->get(AMDGPU::V_CNDMASK_B64_PSEUDO),
6437 ReturnReg)
6438 .addReg(Src1) // src0
6439 .addReg(Src0) // src1
6440 .addReg(CmpMaskReg); // src2
6441 expand64BitV_CNDMASK(*LastBcastInstr, CurrBB);
6442 }
6443 }
6444 return ReturnReg;
6445 };
6446
6447 // Set inactive lanes to the identity value.
6448 if (is32BitOpc) {
6449 SrcWithIdentityInstr =
6450 BuildSetInactiveInstr(SrcWithIdentity, SrcReg, IdentityVGPR);
6451 } else {
6452 Register SrcWithIdentitylo =
6453 MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
6454 Register SrcWithIdentityhi =
6455 MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
6456 auto [Reg0Sub0, Reg0Sub1] = ExtractSubRegs(
6457 MI, IdentityCopyInstr->getOperand(0), SrcRegClass, ST, MRI);
6458 auto [SrcReg0Sub0, SrcReg0Sub1] =
6459 ExtractSubRegs(MI, MI.getOperand(1), SrcRegClass, ST, MRI);
6460 MachineInstr *SetInactiveLoInstr =
6461 BuildSetInactiveInstr(SrcWithIdentitylo, SrcReg0Sub0, Reg0Sub0);
6462 MachineInstr *SetInactiveHiInstr =
6463 BuildSetInactiveInstr(SrcWithIdentityhi, SrcReg0Sub1, Reg0Sub1);
6464 SrcWithIdentityInstr =
6465 BuildRegSequence(*CurrBB, MI, SrcWithIdentity,
6466 SetInactiveLoInstr->getOperand(0).getReg(),
6467 SetInactiveHiInstr->getOperand(0).getReg());
6468 }
6469 // DPP reduction
6470 Register SrcWithIdentityReg =
6471 SrcWithIdentityInstr->getOperand(0).getReg();
6472 BuildDPPMachineInstr(DPPRowShr1, SrcWithIdentityReg,
6474 if (NeedsMovDPP)
6475 DPPRowShr1 = BuildPostDPPInstr(SrcWithIdentityReg, DPPRowShr1);
6476
6477 BuildDPPMachineInstr(DPPRowShr2, DPPRowShr1,
6479 if (NeedsMovDPP)
6480 DPPRowShr2 = BuildPostDPPInstr(DPPRowShr1, DPPRowShr2);
6481
6482 BuildDPPMachineInstr(DPPRowShr4, DPPRowShr2,
6484 if (NeedsMovDPP)
6485 DPPRowShr4 = BuildPostDPPInstr(DPPRowShr2, DPPRowShr4);
6486
6487 BuildDPPMachineInstr(DPPRowShr8, DPPRowShr4,
6489 if (NeedsMovDPP)
6490 DPPRowShr8 = BuildPostDPPInstr(DPPRowShr4, DPPRowShr8);
6491
6492 if (ST.hasDPPBroadcasts()) {
6493 BuildDPPMachineInstr(RowBcast15, DPPRowShr8, AMDGPU::DPP::BCAST15);
6494 if (NeedsMovDPP)
6495 RowBcast15 = BuildPostDPPInstr(DPPRowShr8, RowBcast15);
6496 } else {
6497 // ds_swizzle magic constant: 0x1E0
6498 // BIT_MODE (bit-mask mode) : bit 15 = 0
6499 // XOR mask : bits [14:10] = 0
6500 // OR mask : bits [9:5] = 15
6501 // AND mask : bits [4:0] = 0
6502 if (is32BitOpc) {
6503 Register SwizzledValue =
6504 MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
6505 BuildMI(*CurrBB, MI, DL, TII->get(AMDGPU::DS_SWIZZLE_B32),
6506 SwizzledValue)
6507 .addReg(DPPRowShr8) // addr
6508 .addImm(0x1E0) // swizzle offset (i16)
6509 .addImm(0x0); // gds (i1)
6510 BuildClampInstr(RowBcast15, DPPRowShr8, SwizzledValue);
6511 } else {
6512 Register SwizzledValuelo =
6513 MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
6514 Register SwizzledValuehi =
6515 MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
6516 Register SwizzledValue64 = MRI.createVirtualRegister(SrcRegClass);
6517 MachineOperand DPPRowShr8Op =
6518 MachineOperand::CreateReg(DPPRowShr8, /*isDef=*/false);
6519 auto [Op1L, Op1H] =
6520 ExtractSubRegs(MI, DPPRowShr8Op, SrcRegClass, ST, MRI);
6521 BuildMI(*CurrBB, MI, DL, TII->get(AMDGPU::DS_SWIZZLE_B32),
6522 SwizzledValuelo)
6523 .addReg(Op1L) // addr
6524 .addImm(0x1E0) // swizzle offset (i16)
6525 .addImm(0x0); // gds (i1)
6526 BuildMI(*CurrBB, MI, DL, TII->get(AMDGPU::DS_SWIZZLE_B32),
6527 SwizzledValuehi)
6528 .addReg(Op1H) // addr
6529 .addImm(0x1E0) // swizzle offset (i16)
6530 .addImm(0x0); // gds (i1)
6531 BuildRegSequence(*CurrBB, MI, SwizzledValue64, SwizzledValuelo,
6532 SwizzledValuehi);
6533 if (NeedsMovDPP)
6534 RowBcast15 = BuildPostDPPInstr(DPPRowShr8, SwizzledValue64);
6535 else
6536 BuildClampInstr(RowBcast15, DPPRowShr8, SwizzledValue64);
6537 }
6538 }
6539 FinalDPPResult = RowBcast15;
6540 if (!IsWave32) {
6541 if (ST.hasDPPBroadcasts()) {
6542 BuildDPPMachineInstr(RowBcast31, RowBcast15, AMDGPU::DPP::BCAST31);
6543 if (NeedsMovDPP)
6544 RowBcast31 = BuildPostDPPInstr(RowBcast15, RowBcast31);
6545 } else {
6546 Register ShiftedThreadID =
6547 MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
6548 Register PermuteByteOffset =
6549 MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
6550 Register PermutedValue = MRI.createVirtualRegister(SrcRegClass);
6551 Register Lane32Offset =
6552 MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass);
6553 Register WordSizeConst =
6554 MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass);
6555 Register ThreadIDRegLo =
6556 MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
6557 Register ThreadIDReg =
6558 MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
6559 // Get the thread ID.
6560 BuildMI(*CurrBB, MI, DL, TII->get(AMDGPU::V_MBCNT_LO_U32_B32_e64),
6561 ThreadIDRegLo)
6562 .addImm(-1)
6563 .addImm(0);
6564 BuildMI(*CurrBB, MI, DL, TII->get(AMDGPU::V_MBCNT_HI_U32_B32_e64),
6565 ThreadIDReg)
6566 .addImm(-1)
6567 .addReg(ThreadIDRegLo);
6568 // Shift each lane index over by 32 positions, so the value in lane 31 ends
6569 // up in lane 63.
6570 BuildMI(*CurrBB, MI, DL, TII->get(AMDGPU::S_MOV_B32), Lane32Offset)
6571 .addImm(0x20);
6572 BuildMI(*CurrBB, MI, DL, TII->get(AMDGPU::V_ADD_U32_e64),
6573 ShiftedThreadID)
6574 .addReg(ThreadIDReg)
6575 .addReg(Lane32Offset)
6576 .addImm(0); // clamp
6577 // Multiply by the dword size (4 bytes) to form the ds_permute byte offset.
6578 BuildMI(*CurrBB, MI, DL, TII->get(AMDGPU::S_MOV_B32), WordSizeConst)
6579 .addImm(0x4);
6580 BuildMI(*CurrBB, MI, DL, TII->get(AMDGPU::V_MUL_LO_U32_e64),
6581 PermuteByteOffset)
6582 .addReg(WordSizeConst)
6583 .addReg(ShiftedThreadID);
6584 // Permute the lanes
6585 if (is32BitOpc) {
6586 BuildMI(*CurrBB, MI, DL, TII->get(AMDGPU::DS_PERMUTE_B32),
6587 PermutedValue)
6588 .addReg(PermuteByteOffset) // addr
6589 .addReg(RowBcast15) // data
6590 .addImm(0); // offset
6591 } else {
6592 Register PermutedValuelo =
6593 MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
6594 Register PermutedValuehi =
6595 MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
6596 MachineOperand RowBcast15Op =
6597 MachineOperand::CreateReg(RowBcast15, /*isDef=*/false);
6598 auto [RowBcast15Lo, RowBcast15Hi] =
6599 ExtractSubRegs(MI, RowBcast15Op, SrcRegClass, ST, MRI);
6600 BuildMI(*CurrBB, MI, DL, TII->get(AMDGPU::DS_PERMUTE_B32),
6601 PermutedValuelo)
6602 .addReg(PermuteByteOffset) // addr
6603 .addReg(RowBcast15Lo) // data
6604 .addImm(0x0); // offset
6605 BuildMI(*CurrBB, MI, DL, TII->get(AMDGPU::DS_PERMUTE_B32),
6606 PermutedValuehi)
6607 .addReg(PermuteByteOffset) // addr
6608 .addReg(RowBcast15Hi) // data
6609 .addImm(0x0); // offset
6610 BuildRegSequence(*CurrBB, MI, PermutedValue, PermutedValuelo,
6611 PermutedValuehi);
6612 }
6613 if (NeedsMovDPP)
6614 RowBcast31 = BuildPostDPPInstr(RowBcast15, PermutedValue);
6615 else
6616 BuildClampInstr(RowBcast31, RowBcast15, PermutedValue);
6617 }
6618 FinalDPPResult = RowBcast31;
6619 }
6620 if (MIOpc == AMDGPU::WAVE_REDUCE_FSUB_PSEUDO_F32 ||
6621 MIOpc == AMDGPU::WAVE_REDUCE_FSUB_PSEUDO_F64) {
6622 Register NegatedValVGPR = MRI.createVirtualRegister(SrcRegClass);
6623 // Opc for f32 reduction is V_SUB_F32.
6624 // For f64, there is no equivalent V_SUB_F64 opcode, so use
6625 // V_ADD_F64/V_ADD_F64_pseudo, and negate the second operand.
6626 BuildMI(*CurrBB, MI, DL, TII->get(Opc),
6627 NegatedValVGPR)
6628 .addImm(SISrcMods::NONE) // src0 mods
6629 .addReg(IdentityVGPR) // src0
6630 .addImm(is32BitOpc ? SISrcMods::NONE : SISrcMods::NEG) // src1 mods
6631 .addReg(IsWave32 ? RowBcast15 : RowBcast31) // src1
6632 .addImm(SISrcMods::NONE) // clamp
6633 .addImm(SISrcMods::NONE); // omod
6634 FinalDPPResult = NegatedValVGPR;
6635 }
6636 // The final reduced value is in the last lane.
6637 if (is32BitOpc) {
6638 BuildMI(*CurrBB, MI, DL, TII->get(AMDGPU::V_READLANE_B32),
6639 ReducedValSGPR)
6640 .addReg(FinalDPPResult)
6641 .addImm(ST.getWavefrontSize() - 1);
6642 } else {
6643 Register LaneValueLoReg =
6644 MRI.createVirtualRegister(&AMDGPU::SReg_32_XM0RegClass);
6645 Register LaneValueHiReg =
6646 MRI.createVirtualRegister(&AMDGPU::SReg_32_XM0RegClass);
6647 const TargetRegisterClass *SrcRC = MRI.getRegClass(SrcReg);
6648 MachineOperand FinalDPPResultOperand =
6649 MachineOperand::CreateReg(FinalDPPResult, /*isDef=*/false);
6650 auto [Op1L, Op1H] =
6651 ExtractSubRegs(MI, FinalDPPResultOperand, SrcRC, ST, MRI);
6652 // The lane value input needs to be in an SGPR.
6653 BuildMI(*CurrBB, MI, DL, TII->get(AMDGPU::V_READLANE_B32),
6654 LaneValueLoReg)
6655 .addReg(Op1L)
6656 .addImm(ST.getWavefrontSize() - 1);
6657 BuildMI(*CurrBB, MI, DL, TII->get(AMDGPU::V_READLANE_B32),
6658 LaneValueHiReg)
6659 .addReg(Op1H)
6660 .addImm(ST.getWavefrontSize() - 1);
6661 BuildRegSequence(*CurrBB, MI, ReducedValSGPR, LaneValueLoReg,
6662 LaneValueHiReg);
6663 }
6664 if (Opc == AMDGPU::S_SUB_I32) {
6665 BuildMI(*CurrBB, MI, DL, TII->get(AMDGPU::S_SUB_I32), NegatedReducedVal)
6666 .addImm(0)
6667 .addReg(ReducedValSGPR);
6668 } else if (Opc == AMDGPU::S_SUB_U64_PSEUDO) {
6669 auto NegatedValInstr =
6670 BuildMI(*CurrBB, MI, DL, TII->get(Opc), NegatedReducedVal)
6671 .addImm(0)
6672 .addReg(ReducedValSGPR);
6673 CurrBB = expand64BitScalarArithmetic(*NegatedValInstr, CurrBB);
6674 }
6675 // Mark the final result as a whole-wave-mode calculation.
6676 BuildMI(*CurrBB, MI, DL, TII->get(AMDGPU::STRICT_WWM), DstReg)
6677 .addReg(Opc == AMDGPU::S_SUB_I32 || Opc == AMDGPU::S_SUB_U64_PSEUDO
6678 ? NegatedReducedVal
6679 : ReducedValSGPR);
6680 RetBB = CurrBB;
6681 }
6682 }
6683 MI.eraseFromParent();
6684 return RetBB;
6685}
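// This lowers the llvm.amdgcn.wave.reduce.* pseudos; at the IR level the
// corresponding intrinsic call looks roughly like
//   %r = call i32 @llvm.amdgcn.wave.reduce.umin.i32(i32 %v, i32 0)
// where the trailing immediate selects the strategy (0 = default,
// 1 = iterative, 2 = DPP), matching the enum above.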
6686
6689 MachineBasicBlock *BB) const {
6690 MachineFunction *MF = BB->getParent();
6692 const GCNSubtarget &ST = MF->getSubtarget<GCNSubtarget>();
6694 const SIRegisterInfo *TRI = Subtarget->getRegisterInfo();
6695 MachineRegisterInfo &MRI = MF->getRegInfo();
6696 const DebugLoc &DL = MI.getDebugLoc();
6697
6698 switch (MI.getOpcode()) {
6699 case AMDGPU::WAVE_REDUCE_UMIN_PSEUDO_U32:
6700 return lowerWaveReduce(MI, *BB, *getSubtarget(), AMDGPU::S_MIN_U32);
6701 case AMDGPU::WAVE_REDUCE_UMIN_PSEUDO_U64:
6702 return lowerWaveReduce(MI, *BB, *getSubtarget(), AMDGPU::V_CMP_LT_U64_e64);
6703 case AMDGPU::WAVE_REDUCE_MIN_PSEUDO_I32:
6704 return lowerWaveReduce(MI, *BB, *getSubtarget(), AMDGPU::S_MIN_I32);
6705 case AMDGPU::WAVE_REDUCE_MIN_PSEUDO_I64:
6706 return lowerWaveReduce(MI, *BB, *getSubtarget(), AMDGPU::V_CMP_LT_I64_e64);
6707 case AMDGPU::WAVE_REDUCE_FMIN_PSEUDO_F32:
6708 return lowerWaveReduce(MI, *BB, *getSubtarget(), AMDGPU::V_MIN_F32_e64);
6709 case AMDGPU::WAVE_REDUCE_FMIN_PSEUDO_F64:
6710 return lowerWaveReduce(MI, *BB, *getSubtarget(),
6711 ST.getGeneration() >= AMDGPUSubtarget::GFX12
6712 ? AMDGPU::V_MIN_NUM_F64_e64
6713 : AMDGPU::V_MIN_F64_e64);
6714 case AMDGPU::WAVE_REDUCE_UMAX_PSEUDO_U32:
6715 return lowerWaveReduce(MI, *BB, *getSubtarget(), AMDGPU::S_MAX_U32);
6716 case AMDGPU::WAVE_REDUCE_UMAX_PSEUDO_U64:
6717 return lowerWaveReduce(MI, *BB, *getSubtarget(), AMDGPU::V_CMP_GT_U64_e64);
6718 case AMDGPU::WAVE_REDUCE_MAX_PSEUDO_I32:
6719 return lowerWaveReduce(MI, *BB, *getSubtarget(), AMDGPU::S_MAX_I32);
6720 case AMDGPU::WAVE_REDUCE_MAX_PSEUDO_I64:
6721 return lowerWaveReduce(MI, *BB, *getSubtarget(), AMDGPU::V_CMP_GT_I64_e64);
6722 case AMDGPU::WAVE_REDUCE_FMAX_PSEUDO_F32:
6723 return lowerWaveReduce(MI, *BB, *getSubtarget(), AMDGPU::V_MAX_F32_e64);
6724 case AMDGPU::WAVE_REDUCE_FMAX_PSEUDO_F64:
6725 return lowerWaveReduce(MI, *BB, *getSubtarget(),
6726 ST.getGeneration() >= AMDGPUSubtarget::GFX12
6727 ? AMDGPU::V_MAX_NUM_F64_e64
6728 : AMDGPU::V_MAX_F64_e64);
6729 case AMDGPU::WAVE_REDUCE_ADD_PSEUDO_I32:
6730 return lowerWaveReduce(MI, *BB, *getSubtarget(), AMDGPU::S_ADD_I32);
6731 case AMDGPU::WAVE_REDUCE_ADD_PSEUDO_U64:
6732 return lowerWaveReduce(MI, *BB, *getSubtarget(), AMDGPU::S_ADD_U64_PSEUDO);
6733 case AMDGPU::WAVE_REDUCE_FADD_PSEUDO_F32:
6734 return lowerWaveReduce(MI, *BB, *getSubtarget(), AMDGPU::V_ADD_F32_e64);
6735 case AMDGPU::WAVE_REDUCE_FADD_PSEUDO_F64:
6736 return lowerWaveReduce(MI, *BB, *getSubtarget(),
6737 ST.getGeneration() >= AMDGPUSubtarget::GFX12
6738 ? AMDGPU::V_ADD_F64_pseudo_e64
6739 : AMDGPU::V_ADD_F64_e64);
6740 case AMDGPU::WAVE_REDUCE_SUB_PSEUDO_I32:
6741 return lowerWaveReduce(MI, *BB, *getSubtarget(), AMDGPU::S_SUB_I32);
6742 case AMDGPU::WAVE_REDUCE_SUB_PSEUDO_U64:
6743 return lowerWaveReduce(MI, *BB, *getSubtarget(), AMDGPU::S_SUB_U64_PSEUDO);
6744 case AMDGPU::WAVE_REDUCE_FSUB_PSEUDO_F32:
6745 return lowerWaveReduce(MI, *BB, *getSubtarget(), AMDGPU::V_SUB_F32_e64);
6746 case AMDGPU::WAVE_REDUCE_FSUB_PSEUDO_F64:
6747 // There is no S/V_SUB_F64 opcode. Double type subtraction is expanded as
6748 // fadd + neg, by setting the NEG bit in the instruction.
6749 return lowerWaveReduce(MI, *BB, *getSubtarget(),
6750 ST.getGeneration() >= AMDGPUSubtarget::GFX12
6751 ? AMDGPU::V_ADD_F64_pseudo_e64
6752 : AMDGPU::V_ADD_F64_e64);
6753 case AMDGPU::WAVE_REDUCE_AND_PSEUDO_B32:
6754 return lowerWaveReduce(MI, *BB, *getSubtarget(), AMDGPU::S_AND_B32);
6755 case AMDGPU::WAVE_REDUCE_AND_PSEUDO_B64:
6756 return lowerWaveReduce(MI, *BB, *getSubtarget(), AMDGPU::S_AND_B64);
6757 case AMDGPU::WAVE_REDUCE_OR_PSEUDO_B32:
6758 return lowerWaveReduce(MI, *BB, *getSubtarget(), AMDGPU::S_OR_B32);
6759 case AMDGPU::WAVE_REDUCE_OR_PSEUDO_B64:
6760 return lowerWaveReduce(MI, *BB, *getSubtarget(), AMDGPU::S_OR_B64);
6761 case AMDGPU::WAVE_REDUCE_XOR_PSEUDO_B32:
6762 return lowerWaveReduce(MI, *BB, *getSubtarget(), AMDGPU::S_XOR_B32);
6763 case AMDGPU::WAVE_REDUCE_XOR_PSEUDO_B64:
6764 return lowerWaveReduce(MI, *BB, *getSubtarget(), AMDGPU::S_XOR_B64);
6765 case AMDGPU::S_UADDO_PSEUDO:
6766 case AMDGPU::S_USUBO_PSEUDO: {
6767 MachineOperand &Dest0 = MI.getOperand(0);
6768 MachineOperand &Dest1 = MI.getOperand(1);
6769 MachineOperand &Src0 = MI.getOperand(2);
6770 MachineOperand &Src1 = MI.getOperand(3);
6771
6772 unsigned Opc = (MI.getOpcode() == AMDGPU::S_UADDO_PSEUDO)
6773 ? AMDGPU::S_ADD_U32
6774 : AMDGPU::S_SUB_U32;
6775 // clang-format off
6776 BuildMI(*BB, MI, DL, TII->get(Opc), Dest0.getReg())
6777 .add(Src0)
6778 .add(Src1);
6779 // clang-format on
6780
6781 unsigned SelOpc =
6782 Subtarget->isWave64() ? AMDGPU::S_CSELECT_B64 : AMDGPU::S_CSELECT_B32;
6783 BuildMI(*BB, MI, DL, TII->get(SelOpc), Dest1.getReg()).addImm(-1).addImm(0);
6784
6785 MI.eraseFromParent();
6786 return BB;
6787 }
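// Rough shape of the expansion above (placeholder register names, wave64
// shown): S_UADDO_PSEUDO %sum, %ovf, %a, %b becomes approximately
//   %sum:sreg_32 = S_ADD_U32 %a, %b     ; sets SCC on unsigned carry-out
//   %ovf:sreg_64 = S_CSELECT_B64 -1, 0  ; materialize SCC as a lane mask
// with S_SUB_U32 used for the subtract pseudo and S_CSELECT_B32 on wave32.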
6788 case AMDGPU::S_ADD_U64_PSEUDO:
6789 case AMDGPU::S_SUB_U64_PSEUDO: {
6790 return expand64BitScalarArithmetic(MI, BB);
6791 }
6792 case AMDGPU::V_ADD_U64_PSEUDO:
6793 case AMDGPU::V_SUB_U64_PSEUDO: {
6794 bool IsAdd = (MI.getOpcode() == AMDGPU::V_ADD_U64_PSEUDO);
6795
6796 MachineOperand &Dest = MI.getOperand(0);
6797 MachineOperand &Src0 = MI.getOperand(1);
6798 MachineOperand &Src1 = MI.getOperand(2);
6799
6800 if (ST.hasAddSubU64Insts()) {
6801 auto I = BuildMI(*BB, MI, DL,
6802 TII->get(IsAdd ? AMDGPU::V_ADD_U64_e64
6803 : AMDGPU::V_SUB_U64_e64),
6804 Dest.getReg())
6805 .add(Src0)
6806 .add(Src1)
6807 .addImm(0); // clamp
6808 TII->legalizeOperands(*I);
6809 MI.eraseFromParent();
6810 return BB;
6811 }
6812
6813 if (IsAdd && ST.hasLshlAddU64Inst()) {
6814 auto Add = BuildMI(*BB, MI, DL, TII->get(AMDGPU::V_LSHL_ADD_U64_e64),
6815 Dest.getReg())
6816 .add(Src0)
6817 .addImm(0)
6818 .add(Src1);
6819 TII->legalizeOperands(*Add);
6820 MI.eraseFromParent();
6821 return BB;
6822 }
6823
6824 const auto *CarryRC = TRI->getWaveMaskRegClass();
6825
6826 Register DestSub0 = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
6827 Register DestSub1 = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
6828
6829 Register CarryReg = MRI.createVirtualRegister(CarryRC);
6830 Register DeadCarryReg = MRI.createVirtualRegister(CarryRC);
6831
6832 const TargetRegisterClass *Src0RC = Src0.isReg()
6833 ? MRI.getRegClass(Src0.getReg())
6834 : &AMDGPU::VReg_64RegClass;
6835 const TargetRegisterClass *Src1RC = Src1.isReg()
6836 ? MRI.getRegClass(Src1.getReg())
6837 : &AMDGPU::VReg_64RegClass;
6838
6839 const TargetRegisterClass *Src0SubRC =
6840 TRI->getSubRegisterClass(Src0RC, AMDGPU::sub0);
6841 const TargetRegisterClass *Src1SubRC =
6842 TRI->getSubRegisterClass(Src1RC, AMDGPU::sub1);
6843
6844 MachineOperand SrcReg0Sub0 = TII->buildExtractSubRegOrImm(
6845 MI, MRI, Src0, Src0RC, AMDGPU::sub0, Src0SubRC);
6846 MachineOperand SrcReg1Sub0 = TII->buildExtractSubRegOrImm(
6847 MI, MRI, Src1, Src1RC, AMDGPU::sub0, Src1SubRC);
6848
6849 MachineOperand SrcReg0Sub1 = TII->buildExtractSubRegOrImm(
6850 MI, MRI, Src0, Src0RC, AMDGPU::sub1, Src0SubRC);
6851 MachineOperand SrcReg1Sub1 = TII->buildExtractSubRegOrImm(
6852 MI, MRI, Src1, Src1RC, AMDGPU::sub1, Src1SubRC);
6853
6854 unsigned LoOpc =
6855 IsAdd ? AMDGPU::V_ADD_CO_U32_e64 : AMDGPU::V_SUB_CO_U32_e64;
6856 MachineInstr *LoHalf = BuildMI(*BB, MI, DL, TII->get(LoOpc), DestSub0)
6857 .addReg(CarryReg, RegState::Define)
6858 .add(SrcReg0Sub0)
6859 .add(SrcReg1Sub0)
6860 .addImm(0); // clamp bit
6861
6862 unsigned HiOpc = IsAdd ? AMDGPU::V_ADDC_U32_e64 : AMDGPU::V_SUBB_U32_e64;
6863 MachineInstr *HiHalf =
6864 BuildMI(*BB, MI, DL, TII->get(HiOpc), DestSub1)
6865 .addReg(DeadCarryReg, RegState::Define | RegState::Dead)
6866 .add(SrcReg0Sub1)
6867 .add(SrcReg1Sub1)
6868 .addReg(CarryReg, RegState::Kill)
6869 .addImm(0); // clamp bit
6870
6871 BuildMI(*BB, MI, DL, TII->get(TargetOpcode::REG_SEQUENCE), Dest.getReg())
6872 .addReg(DestSub0)
6873 .addImm(AMDGPU::sub0)
6874 .addReg(DestSub1)
6875 .addImm(AMDGPU::sub1);
6876 TII->legalizeOperands(*LoHalf);
6877 TII->legalizeOperands(*HiHalf);
6878 MI.eraseFromParent();
6879 return BB;
6880 }
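// Rough shape of the carry-chain fallback above (placeholder registers):
//   %lo:vgpr_32, %carry = V_ADD_CO_U32_e64 %a.sub0, %b.sub0, 0
//   %hi:vgpr_32, %dead  = V_ADDC_U32_e64  %a.sub1, %b.sub1, %carry, 0
//   %dst:vreg_64        = REG_SEQUENCE %lo, %subreg.sub0, %hi, %subreg.sub1
// The subtract pseudo uses V_SUB_CO_U32_e64 / V_SUBB_U32_e64 instead.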
6881 case AMDGPU::S_ADD_CO_PSEUDO:
6882 case AMDGPU::S_SUB_CO_PSEUDO: {
6883 // This pseudo can only be selected from a uniform add/subcarry node,
6884 // so all of its VGPR operands are therefore assumed to be splat
6885 // (wave-uniform) values.
6887 MachineOperand &Dest = MI.getOperand(0);
6888 MachineOperand &CarryDest = MI.getOperand(1);
6889 MachineOperand &Src0 = MI.getOperand(2);
6890 MachineOperand &Src1 = MI.getOperand(3);
6891 MachineOperand &Src2 = MI.getOperand(4);
6892 if (Src0.isReg() && TRI->isVectorRegister(MRI, Src0.getReg())) {
6893 Register RegOp0 = MRI.createVirtualRegister(&AMDGPU::SReg_32_XM0RegClass);
6894 BuildMI(*BB, MII, DL, TII->get(AMDGPU::V_READFIRSTLANE_B32), RegOp0)
6895 .addReg(Src0.getReg());
6896 Src0.setReg(RegOp0);
6897 }
6898 if (Src1.isReg() && TRI->isVectorRegister(MRI, Src1.getReg())) {
6899 Register RegOp1 = MRI.createVirtualRegister(&AMDGPU::SReg_32_XM0RegClass);
6900 BuildMI(*BB, MII, DL, TII->get(AMDGPU::V_READFIRSTLANE_B32), RegOp1)
6901 .addReg(Src1.getReg());
6902 Src1.setReg(RegOp1);
6903 }
6904 Register RegOp2 = MRI.createVirtualRegister(&AMDGPU::SReg_32_XM0RegClass);
6905 if (TRI->isVectorRegister(MRI, Src2.getReg())) {
6906 BuildMI(*BB, MII, DL, TII->get(AMDGPU::V_READFIRSTLANE_B32), RegOp2)
6907 .addReg(Src2.getReg());
6908 Src2.setReg(RegOp2);
6909 }
6910
6911 if (ST.isWave64()) {
6912 if (ST.hasScalarCompareEq64()) {
6913 BuildMI(*BB, MII, DL, TII->get(AMDGPU::S_CMP_LG_U64))
6914 .addReg(Src2.getReg())
6915 .addImm(0);
6916 } else {
6917 const TargetRegisterClass *Src2RC = MRI.getRegClass(Src2.getReg());
6918 const TargetRegisterClass *SubRC =
6919 TRI->getSubRegisterClass(Src2RC, AMDGPU::sub0);
6920 MachineOperand Src2Sub0 = TII->buildExtractSubRegOrImm(
6921 MII, MRI, Src2, Src2RC, AMDGPU::sub0, SubRC);
6922 MachineOperand Src2Sub1 = TII->buildExtractSubRegOrImm(
6923 MII, MRI, Src2, Src2RC, AMDGPU::sub1, SubRC);
6924 Register Src2_32 = MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass);
6925
6926 BuildMI(*BB, MII, DL, TII->get(AMDGPU::S_OR_B32), Src2_32)
6927 .add(Src2Sub0)
6928 .add(Src2Sub1);
6929
6930 BuildMI(*BB, MII, DL, TII->get(AMDGPU::S_CMP_LG_U32))
6931 .addReg(Src2_32, RegState::Kill)
6932 .addImm(0);
6933 }
6934 } else {
6935 BuildMI(*BB, MII, DL, TII->get(AMDGPU::S_CMP_LG_U32))
6936 .addReg(Src2.getReg())
6937 .addImm(0);
6938 }
6939
6940 unsigned Opc = MI.getOpcode() == AMDGPU::S_ADD_CO_PSEUDO
6941 ? AMDGPU::S_ADDC_U32
6942 : AMDGPU::S_SUBB_U32;
6943
6944 BuildMI(*BB, MII, DL, TII->get(Opc), Dest.getReg()).add(Src0).add(Src1);
6945
6946 unsigned SelOpc =
6947 ST.isWave64() ? AMDGPU::S_CSELECT_B64 : AMDGPU::S_CSELECT_B32;
6948
6949 BuildMI(*BB, MII, DL, TII->get(SelOpc), CarryDest.getReg())
6950 .addImm(-1)
6951 .addImm(0);
6952
6953 MI.eraseFromParent();
6954 return BB;
6955 }
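// Rough shape of the expansion above (placeholder registers): any VGPR
// inputs are first copied to SGPRs with V_READFIRSTLANE_B32, then
//   S_CMP_LG_U32 %carry_in, 0           ; set SCC from the carry-in mask
//   %dst       = S_ADDC_U32 %a, %b      ; consumes and redefines SCC
//   %carry_out = S_CSELECT_B64 -1, 0    ; S_CSELECT_B32 on wave32
// The wave64 carry-in compare may need the S_OR_B32 split shown above when
// 64-bit scalar compares are unavailable.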
6956 case AMDGPU::SI_INIT_M0: {
6957 MachineOperand &M0Init = MI.getOperand(0);
6958 BuildMI(*BB, MI.getIterator(), MI.getDebugLoc(),
6959 TII->get(M0Init.isReg() ? AMDGPU::COPY : AMDGPU::S_MOV_B32),
6960 AMDGPU::M0)
6961 .add(M0Init);
6962 MI.eraseFromParent();
6963 return BB;
6964 }
6965 case AMDGPU::S_BARRIER_SIGNAL_ISFIRST_IMM: {
6966 // Set SCC to true, in case the barrier instruction gets converted to a NOP.
6967 BuildMI(*BB, MI.getIterator(), MI.getDebugLoc(),
6968 TII->get(AMDGPU::S_CMP_EQ_U32))
6969 .addImm(0)
6970 .addImm(0);
6971 return BB;
6972 }
6973 case AMDGPU::GET_GROUPSTATICSIZE: {
6974 assert(getTargetMachine().getTargetTriple().getOS() == Triple::AMDHSA ||
6975 getTargetMachine().getTargetTriple().getOS() == Triple::AMDPAL);
6976 BuildMI(*BB, MI, DL, TII->get(AMDGPU::S_MOV_B32))
6977 .add(MI.getOperand(0))
6978 .addImm(MFI->getLDSSize());
6979 MI.eraseFromParent();
6980 return BB;
6981 }
6982 case AMDGPU::GET_SHADERCYCLESHILO: {
6983 assert(MF->getSubtarget<GCNSubtarget>().hasShaderCyclesHiLoRegisters());
6984 // The algorithm is:
6985 //
6986 // hi1 = getreg(SHADER_CYCLES_HI)
6987 // lo1 = getreg(SHADER_CYCLES_LO)
6988 // hi2 = getreg(SHADER_CYCLES_HI)
6989 //
6990 // If hi1 == hi2 then there was no overflow and the result is hi2:lo1.
6991 // Otherwise there was overflow and the result is hi2:0. In both cases the
6992 // result should represent the actual time at some point during the sequence
6993 // of three getregs.
6994 using namespace AMDGPU::Hwreg;
6995 Register RegHi1 = MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass);
6996 BuildMI(*BB, MI, DL, TII->get(AMDGPU::S_GETREG_B32), RegHi1)
6997 .addImm(HwregEncoding::encode(ID_SHADER_CYCLES_HI, 0, 32));
6998 Register RegLo1 = MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass);
6999 BuildMI(*BB, MI, DL, TII->get(AMDGPU::S_GETREG_B32), RegLo1)
7000 .addImm(HwregEncoding::encode(ID_SHADER_CYCLES, 0, 32));
7001 Register RegHi2 = MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass);
7002 BuildMI(*BB, MI, DL, TII->get(AMDGPU::S_GETREG_B32), RegHi2)
7003 .addImm(HwregEncoding::encode(ID_SHADER_CYCLES_HI, 0, 32));
7004 BuildMI(*BB, MI, DL, TII->get(AMDGPU::S_CMP_EQ_U32))
7005 .addReg(RegHi1)
7006 .addReg(RegHi2);
7007 Register RegLo = MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass);
7008 BuildMI(*BB, MI, DL, TII->get(AMDGPU::S_CSELECT_B32), RegLo)
7009 .addReg(RegLo1)
7010 .addImm(0);
7011 BuildMI(*BB, MI, DL, TII->get(AMDGPU::REG_SEQUENCE))
7012 .add(MI.getOperand(0))
7013 .addReg(RegLo)
7014 .addImm(AMDGPU::sub0)
7015 .addReg(RegHi2)
7016 .addImm(AMDGPU::sub1);
7017 MI.eraseFromParent();
7018 return BB;
7019 }
7020 case AMDGPU::SI_INDIRECT_SRC_V1:
7021 case AMDGPU::SI_INDIRECT_SRC_V2:
7022 case AMDGPU::SI_INDIRECT_SRC_V3:
7023 case AMDGPU::SI_INDIRECT_SRC_V4:
7024 case AMDGPU::SI_INDIRECT_SRC_V5:
7025 case AMDGPU::SI_INDIRECT_SRC_V6:
7026 case AMDGPU::SI_INDIRECT_SRC_V7:
7027 case AMDGPU::SI_INDIRECT_SRC_V8:
7028 case AMDGPU::SI_INDIRECT_SRC_V9:
7029 case AMDGPU::SI_INDIRECT_SRC_V10:
7030 case AMDGPU::SI_INDIRECT_SRC_V11:
7031 case AMDGPU::SI_INDIRECT_SRC_V12:
7032 case AMDGPU::SI_INDIRECT_SRC_V16:
7033 case AMDGPU::SI_INDIRECT_SRC_V32:
7034 return emitIndirectSrc(MI, *BB, *getSubtarget());
7035 case AMDGPU::SI_INDIRECT_DST_V1:
7036 case AMDGPU::SI_INDIRECT_DST_V2:
7037 case AMDGPU::SI_INDIRECT_DST_V3:
7038 case AMDGPU::SI_INDIRECT_DST_V4:
7039 case AMDGPU::SI_INDIRECT_DST_V5:
7040 case AMDGPU::SI_INDIRECT_DST_V6:
7041 case AMDGPU::SI_INDIRECT_DST_V7:
7042 case AMDGPU::SI_INDIRECT_DST_V8:
7043 case AMDGPU::SI_INDIRECT_DST_V9:
7044 case AMDGPU::SI_INDIRECT_DST_V10:
7045 case AMDGPU::SI_INDIRECT_DST_V11:
7046 case AMDGPU::SI_INDIRECT_DST_V12:
7047 case AMDGPU::SI_INDIRECT_DST_V16:
7048 case AMDGPU::SI_INDIRECT_DST_V32:
7049 return emitIndirectDst(MI, *BB, *getSubtarget());
7050 case AMDGPU::SI_KILL_F32_COND_IMM_PSEUDO:
7051 case AMDGPU::SI_KILL_I1_PSEUDO:
7052 return splitKillBlock(MI, BB);
7053 case AMDGPU::V_CNDMASK_B64_PSEUDO: {
7055 return BB;
7056 }
7057 case AMDGPU::SI_BR_UNDEF: {
7058 MachineInstr *Br = BuildMI(*BB, MI, DL, TII->get(AMDGPU::S_CBRANCH_SCC1))
7059 .add(MI.getOperand(0));
7060 Br->getOperand(1).setIsUndef(); // read undef SCC
7061 MI.eraseFromParent();
7062 return BB;
7063 }
7064 case AMDGPU::ADJCALLSTACKUP:
7065 case AMDGPU::ADJCALLSTACKDOWN: {
7066 const SIMachineFunctionInfo *Info = MF->getInfo<SIMachineFunctionInfo>();
7067 MachineInstrBuilder MIB(*MF, &MI);
7068 MIB.addReg(Info->getStackPtrOffsetReg(), RegState::ImplicitDefine)
7069 .addReg(Info->getStackPtrOffsetReg(), RegState::Implicit);
7070 return BB;
7071 }
7072 case AMDGPU::SI_CALL_ISEL: {
7073 unsigned ReturnAddrReg = TII->getRegisterInfo().getReturnAddressReg(*MF);
7074
7075 MachineInstrBuilder MIB;
7076 MIB = BuildMI(*BB, MI, DL, TII->get(AMDGPU::SI_CALL), ReturnAddrReg);
7077
7078 for (const MachineOperand &MO : MI.operands())
7079 MIB.add(MO);
7080
7081 MIB.cloneMemRefs(MI);
7082 MI.eraseFromParent();
7083 return BB;
7084 }
7085 case AMDGPU::V_ADD_CO_U32_e32:
7086 case AMDGPU::V_SUB_CO_U32_e32:
7087 case AMDGPU::V_SUBREV_CO_U32_e32: {
7088 // TODO: Define distinct V_*_I32_Pseudo instructions instead.
7089 unsigned Opc = MI.getOpcode();
7090
7091 bool NeedClampOperand = false;
7092 if (TII->pseudoToMCOpcode(Opc) == -1) {
7093 Opc = AMDGPU::getVOPe64(Opc);
7094 NeedClampOperand = true;
7095 }
7096
7097 auto I = BuildMI(*BB, MI, DL, TII->get(Opc), MI.getOperand(0).getReg());
7098 if (TII->isVOP3(*I)) {
7099 I.addReg(TRI->getVCC(), RegState::Define);
7100 }
7101 I.add(MI.getOperand(1)).add(MI.getOperand(2));
7102 if (NeedClampOperand)
7103 I.addImm(0); // clamp bit for e64 encoding
7104
7105 TII->legalizeOperands(*I);
7106
7107 MI.eraseFromParent();
7108 return BB;
7109 }
7110 case AMDGPU::V_ADDC_U32_e32:
7111 case AMDGPU::V_SUBB_U32_e32:
7112 case AMDGPU::V_SUBBREV_U32_e32:
7113 // These instructions have an implicit use of vcc which counts towards the
7114 // constant bus limit.
7115 TII->legalizeOperands(MI);
7116 return BB;
7117 case AMDGPU::DS_GWS_INIT:
7118 case AMDGPU::DS_GWS_SEMA_BR:
7119 case AMDGPU::DS_GWS_BARRIER:
7120 case AMDGPU::DS_GWS_SEMA_V:
7121 case AMDGPU::DS_GWS_SEMA_P:
7122 case AMDGPU::DS_GWS_SEMA_RELEASE_ALL:
7123 // An s_waitcnt 0 is required to be the instruction immediately following.
7124 if (getSubtarget()->hasGWSAutoReplay()) {
7126 return BB;
7127 }
7128
7129 return emitGWSMemViolTestLoop(MI, BB);
7130 case AMDGPU::S_SETREG_B32: {
7131 // Try to optimize cases that only set the denormal mode or rounding mode.
7132 //
7133 // If the s_setreg_b32 fully sets all of the bits in the rounding mode or
7134 // denormal mode to a constant, we can use s_round_mode or s_denorm_mode
7135 // instead.
7136 //
7137 // FIXME: This could be done with predicates on the immediate, but tablegen
7138 // doesn't allow a no-side-effect instruction in the output of a
7139 // side-effecting pattern.
7140 auto [ID, Offset, Width] =
7141 AMDGPU::Hwreg::HwregEncoding::decode(MI.getOperand(1).getImm());
7142 if (ID != AMDGPU::Hwreg::ID_MODE)
7143 return BB;
7144
7145 const unsigned WidthMask = maskTrailingOnes<unsigned>(Width);
7146 const unsigned SetMask = WidthMask << Offset;
7147
7148 if (getSubtarget()->hasDenormModeInst()) {
7149 unsigned SetDenormOp = 0;
7150 unsigned SetRoundOp = 0;
7151
7152 // The dedicated instructions can only set the whole denorm or round mode
7153 // at once, not a subset of bits in either.
7154 if (SetMask ==
7155 (AMDGPU::Hwreg::FP_ROUND_MASK | AMDGPU::Hwreg::FP_DENORM_MASK)) {
7156 // If this fully sets both the round and denorm mode, emit the two
7157 // dedicated instructions for these.
7158 SetRoundOp = AMDGPU::S_ROUND_MODE;
7159 SetDenormOp = AMDGPU::S_DENORM_MODE;
7160 } else if (SetMask == AMDGPU::Hwreg::FP_ROUND_MASK) {
7161 SetRoundOp = AMDGPU::S_ROUND_MODE;
7162 } else if (SetMask == AMDGPU::Hwreg::FP_DENORM_MASK) {
7163 SetDenormOp = AMDGPU::S_DENORM_MODE;
7164 }
7165
7166 if (SetRoundOp || SetDenormOp) {
7167 MachineInstr *Def = MRI.getVRegDef(MI.getOperand(0).getReg());
7168 if (Def && Def->isMoveImmediate() && Def->getOperand(1).isImm()) {
7169 unsigned ImmVal = Def->getOperand(1).getImm();
7170 if (SetRoundOp) {
7171 BuildMI(*BB, MI, MI.getDebugLoc(), TII->get(SetRoundOp))
7172 .addImm(ImmVal & 0xf);
7173
7174 // If we also have the denorm mode, get just the denorm mode bits.
7175 ImmVal >>= 4;
7176 }
7177
7178 if (SetDenormOp) {
7179 BuildMI(*BB, MI, MI.getDebugLoc(), TII->get(SetDenormOp))
7180 .addImm(ImmVal & 0xf);
7181 }
7182
7183 MI.eraseFromParent();
7184 return BB;
7185 }
7186 }
7187 }
7188
7189 // If only FP bits are touched, use the no-side-effects pseudo.
7190 if ((SetMask & (AMDGPU::Hwreg::FP_ROUND_MASK |
7191 AMDGPU::Hwreg::FP_DENORM_MASK)) == SetMask)
7192 MI.setDesc(TII->get(AMDGPU::S_SETREG_B32_mode));
7193
7194 return BB;
7195 }
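// Example of the rewrite above (assuming the mode operand is a materialized
// constant): an s_setreg_b32 that fully writes the 4-bit FP round field can
// be emitted as S_ROUND_MODE <imm & 0xf>, and one that writes both the round
// and denorm fields becomes S_ROUND_MODE followed by S_DENORM_MODE, avoiding
// a read-modify-write of the MODE register.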
7196 case AMDGPU::S_INVERSE_BALLOT_U32:
7197 case AMDGPU::S_INVERSE_BALLOT_U64:
7198 // These opcodes only exist to let SIFixSGPRCopies insert a readfirstlane if
7199 // necessary. After that they are equivalent to a COPY.
7200 MI.setDesc(TII->get(AMDGPU::COPY));
7201 return BB;
7202 case AMDGPU::ENDPGM_TRAP: {
7203 if (BB->succ_empty() && std::next(MI.getIterator()) == BB->end()) {
7204 MI.setDesc(TII->get(AMDGPU::S_ENDPGM));
7205 MI.addOperand(MachineOperand::CreateImm(0));
7206 return BB;
7207 }
7208
7209 // We need a block split to make the real endpgm a terminator. We also don't
7210 // want to break phis in successor blocks, so we can't just delete to the
7211 // end of the block.
7212
7213 MachineBasicBlock *SplitBB = BB->splitAt(MI, false /*UpdateLiveIns*/);
7214 MachineBasicBlock *TrapBB = MF->CreateMachineBasicBlock();
7215 MF->push_back(TrapBB);
7216 // clang-format off
7217 BuildMI(*TrapBB, TrapBB->end(), DL, TII->get(AMDGPU::S_ENDPGM))
7218 .addImm(0);
7219 BuildMI(*BB, &MI, DL, TII->get(AMDGPU::S_CBRANCH_EXECNZ))
7220 .addMBB(TrapBB);
7221 // clang-format on
7222
7223 BB->addSuccessor(TrapBB);
7224 MI.eraseFromParent();
7225 return SplitBB;
7226 }
7227 case AMDGPU::SIMULATED_TRAP: {
7228 assert(Subtarget->hasPrivEnabledTrap2NopBug());
7229 MachineBasicBlock *SplitBB =
7230 TII->insertSimulatedTrap(MRI, *BB, MI, MI.getDebugLoc());
7231 MI.eraseFromParent();
7232 return SplitBB;
7233 }
7234 case AMDGPU::SI_TCRETURN_GFX_WholeWave:
7235 case AMDGPU::SI_WHOLE_WAVE_FUNC_RETURN: {
7237
7238 // During ISel, it's difficult to propagate the original EXEC mask to use as
7239 // an input to SI_WHOLE_WAVE_FUNC_RETURN. Set it up here instead.
7240 MachineInstr *Setup = TII->getWholeWaveFunctionSetup(*BB->getParent());
7241 assert(Setup && "Couldn't find SI_SETUP_WHOLE_WAVE_FUNC");
7242 Register OriginalExec = Setup->getOperand(0).getReg();
7243 MF->getRegInfo().clearKillFlags(OriginalExec);
7244 MI.getOperand(0).setReg(OriginalExec);
7245 return BB;
7246 }
7247 default:
7248 if (TII->isImage(MI) || TII->isMUBUF(MI)) {
7249 if (!MI.mayStore())
7251 return BB;
7252 }
7253 return AMDGPUTargetLowering::EmitInstrWithCustomInserter(MI, BB);
7254 }
7255}
7256
7258 // This currently forces unfolding various combinations of fsub into fma with
7259 // free fneg'd operands. As long as we have fast FMA (controlled by
7260 // isFMAFasterThanFMulAndFAdd), we should perform these.
7261
7262 // When fma is quarter rate, for f64 where add / sub are at best half rate,
7263 // most of these combines appear to be cycle neutral but save on instruction
7264 // count / code size.
7265 return true;
7266}
7267
7269
7270EVT SITargetLowering::getSetCCResultType(const DataLayout &DL, LLVMContext &Ctx,
7271 EVT VT) const {
7272 if (!VT.isVector()) {
7273 return MVT::i1;
7274 }
7275 return EVT::getVectorVT(Ctx, MVT::i1, VT.getVectorNumElements());
7276}
7277
7279 // TODO: Should i16 be used always if legal? For now it would force VALU
7280 // shifts.
7281 return (VT == MVT::i16) ? MVT::i16 : MVT::i32;
7282}
7283
7285 return (Ty.getScalarSizeInBits() <= 16 && Subtarget->has16BitInsts())
7286 ? Ty.changeElementSize(16)
7287 : Ty.changeElementSize(32);
7288}
7289
7290// Answering this is somewhat tricky and depends on the specific device, since
7291// different devices have different rates for fma and for f64 operations.
7292//
7293// v_fma_f64 and v_mul_f64 always take the same number of cycles as each other
7294// regardless of which device (although the number of cycles differs between
7295// devices), so it is always profitable for f64.
7296//
7297// v_fma_f32 takes 4 or 16 cycles depending on the device, so it is profitable
7298// only on full rate devices. Normally, we should prefer selecting v_mad_f32
7299// which we can always do even without fused FP ops since it returns the same
7300// result as the separate operations and since it is always full
7301// rate. Therefore, we lie and report that it is not faster for f32. v_mad_f32
7302// however does not support denormals, so we do report fma as faster if we have
7303// a fast fma device and require denormals.
7304//
7305bool SITargetLowering::isFMAFasterThanFMulAndFAdd(const MachineFunction &MF,
7306 EVT VT) const {
7307 VT = VT.getScalarType();
7308
7309 switch (VT.getSimpleVT().SimpleTy) {
7310 case MVT::f32: {
7311 // If mad is not available this depends only on if f32 fma is full rate.
7312 if (!Subtarget->hasMadMacF32Insts())
7313 return Subtarget->hasFastFMAF32();
7314
7315 // Otherwise f32 mad is always full rate and returns the same result as
7316 // the separate operations, so it should be preferred over fma.
7317 // However, it does not support denormals.
7318 if (!denormalModeIsFlushAllF32(MF))
7319 return Subtarget->hasFastFMAF32() || Subtarget->hasDLInsts();
7320
7321 // If the subtarget has v_fmac_f32, that's just as good as v_mac_f32.
7322 return Subtarget->hasFastFMAF32() && Subtarget->hasDLInsts();
7323 }
7324 case MVT::f64:
7325 return true;
7326 case MVT::f16:
7327 case MVT::bf16:
7328 return Subtarget->has16BitInsts() && !denormalModeIsFlushAllF64F16(MF);
7329 default:
7330 break;
7331 }
7332
7333 return false;
7334}
7335
7336bool SITargetLowering::isFMAFasterThanFMulAndFAdd(const MachineFunction &MF,
7337 LLT Ty) const {
7338 switch (Ty.getScalarSizeInBits()) {
7339 case 16:
7340 return isFMAFasterThanFMulAndFAdd(MF, MVT::f16);
7341 case 32:
7342 return isFMAFasterThanFMulAndFAdd(MF, MVT::f32);
7343 case 64:
7344 return isFMAFasterThanFMulAndFAdd(MF, MVT::f64);
7345 default:
7346 break;
7347 }
7348
7349 return false;
7350}
7351
7352bool SITargetLowering::isFMADLegal(const MachineInstr &MI, LLT Ty) const {
7353 if (!Ty.isScalar())
7354 return false;
7355
7356 if (Ty.getScalarSizeInBits() == 16)
7357 return Subtarget->hasMadF16() && denormalModeIsFlushAllF64F16(*MI.getMF());
7358 if (Ty.getScalarSizeInBits() == 32)
7359 return Subtarget->hasMadMacF32Insts() &&
7360 denormalModeIsFlushAllF32(*MI.getMF());
7361
7362 return false;
7363}
7364
7365bool SITargetLowering::isFMADLegal(const SelectionDAG &DAG,
7366 const SDNode *N) const {
7367 // TODO: Check future ftz flag
7368 // v_mad_f32/v_mac_f32 do not support denormals.
7369 EVT VT = N->getValueType(0);
7370 if (VT == MVT::f32)
7371 return Subtarget->hasMadMacF32Insts() &&
7373 if (VT == MVT::f16) {
7374 return Subtarget->hasMadF16() &&
7376 }
7377
7378 return false;
7379}
7380
7381//===----------------------------------------------------------------------===//
7382// Custom DAG Lowering Operations
7383//===----------------------------------------------------------------------===//
7384
7385// Work around LegalizeDAG doing the wrong thing and fully scalarizing if the
7386// wider vector type is legal.
7387SDValue SITargetLowering::splitUnaryVectorOp(SDValue Op,
7388 SelectionDAG &DAG) const {
7389 unsigned Opc = Op.getOpcode();
7390 EVT VT = Op.getValueType();
7391 assert(VT == MVT::v4i16 || VT == MVT::v4f16 || VT == MVT::v4bf16 ||
7392 VT == MVT::v4f32 || VT == MVT::v8i16 || VT == MVT::v8f16 ||
7393 VT == MVT::v8bf16 || VT == MVT::v16i16 || VT == MVT::v16f16 ||
7394 VT == MVT::v16bf16 || VT == MVT::v8f32 || VT == MVT::v16f32 ||
7395 VT == MVT::v32f32 || VT == MVT::v32i16 || VT == MVT::v32f16 ||
7396 VT == MVT::v32bf16);
7397
7398 auto [Lo, Hi] = DAG.SplitVectorOperand(Op.getNode(), 0);
7399
7400 SDLoc SL(Op);
7401 SDValue OpLo = DAG.getNode(Opc, SL, Lo.getValueType(), Lo, Op->getFlags());
7402 SDValue OpHi = DAG.getNode(Opc, SL, Hi.getValueType(), Hi, Op->getFlags());
7403
7404 return DAG.getNode(ISD::CONCAT_VECTORS, SDLoc(Op), VT, OpLo, OpHi);
7405}
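// Illustrative example: (fneg v4f16 %x) is handled here roughly as
//   %lo = fneg (extract_subvector %x, 0)    ; v2f16
//   %hi = fneg (extract_subvector %x, 2)    ; v2f16
//   concat_vectors %lo, %hi                 ; v4f16
// instead of being scalarized into four f16 operations.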
7406
7407// Enable lowering of ROTR for vxi32 types. This is a workaround for a
7408// regression whereby extra unnecessary instructions were added to codegen
7409// for rotr operations, caused by legalizing v2i32 or. This resulted in extra
7410// instructions to extract the result from the vector.
7411SDValue SITargetLowering::lowerROTR(SDValue Op, SelectionDAG &DAG) const {
7412 [[maybe_unused]] EVT VT = Op.getValueType();
7413
7414 assert((VT == MVT::v2i32 || VT == MVT::v4i32 || VT == MVT::v8i32 ||
7415 VT == MVT::v16i32) &&
7416 "Unexpected ValueType.");
7417
7418 return DAG.UnrollVectorOp(Op.getNode());
7419}
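// Illustrative example: (rotr v2i32 %x, %y) is unrolled here into two scalar
// rotates that are rebuilt into a vector, roughly
//   %e0 = rotr (extractelt %x, 0), (extractelt %y, 0)
//   %e1 = rotr (extractelt %x, 1), (extractelt %y, 1)
//   build_vector %e0, %e1
// and each scalar rotr typically selects to a single v_alignbit_b32.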
7420
7421// Work around LegalizeDAG doing the wrong thing and fully scalarizing if the
7422// wider vector type is legal.
7423SDValue SITargetLowering::splitBinaryVectorOp(SDValue Op,
7424 SelectionDAG &DAG) const {
7425 unsigned Opc = Op.getOpcode();
7426 EVT VT = Op.getValueType();
7427 assert(VT == MVT::v4i16 || VT == MVT::v4f16 || VT == MVT::v4bf16 ||
7428 VT == MVT::v4f32 || VT == MVT::v8i16 || VT == MVT::v8f16 ||
7429 VT == MVT::v8bf16 || VT == MVT::v16i16 || VT == MVT::v16f16 ||
7430 VT == MVT::v16bf16 || VT == MVT::v8f32 || VT == MVT::v16f32 ||
7431 VT == MVT::v32f32 || VT == MVT::v32i16 || VT == MVT::v32f16 ||
7432 VT == MVT::v32bf16);
7433
7434 auto [Lo0, Hi0] = DAG.SplitVectorOperand(Op.getNode(), 0);
7435 auto [Lo1, Hi1] = DAG.SplitVectorOperand(Op.getNode(), 1);
7436
7437 SDLoc SL(Op);
7438
7439 SDValue OpLo =
7440 DAG.getNode(Opc, SL, Lo0.getValueType(), Lo0, Lo1, Op->getFlags());
7441 SDValue OpHi =
7442 DAG.getNode(Opc, SL, Hi0.getValueType(), Hi0, Hi1, Op->getFlags());
7443
7444 return DAG.getNode(ISD::CONCAT_VECTORS, SDLoc(Op), VT, OpLo, OpHi);
7445}
7446
7447SDValue SITargetLowering::splitTernaryVectorOp(SDValue Op,
7448 SelectionDAG &DAG) const {
7449 unsigned Opc = Op.getOpcode();
7450 EVT VT = Op.getValueType();
7451 assert(VT == MVT::v4i16 || VT == MVT::v4f16 || VT == MVT::v8i16 ||
7452 VT == MVT::v8f16 || VT == MVT::v4f32 || VT == MVT::v16i16 ||
7453 VT == MVT::v16f16 || VT == MVT::v8f32 || VT == MVT::v16f32 ||
7454 VT == MVT::v32f32 || VT == MVT::v32f16 || VT == MVT::v32i16 ||
7455 VT == MVT::v4bf16 || VT == MVT::v8bf16 || VT == MVT::v16bf16 ||
7456 VT == MVT::v32bf16);
7457
7458 SDValue Op0 = Op.getOperand(0);
7459 auto [Lo0, Hi0] = Op0.getValueType().isVector()
7460 ? DAG.SplitVectorOperand(Op.getNode(), 0)
7461 : std::pair(Op0, Op0);
7462
7463 auto [Lo1, Hi1] = DAG.SplitVectorOperand(Op.getNode(), 1);
7464 auto [Lo2, Hi2] = DAG.SplitVectorOperand(Op.getNode(), 2);
7465
7466 SDLoc SL(Op);
7467 auto ResVT = DAG.GetSplitDestVTs(VT);
7468
7469 SDValue OpLo =
7470 DAG.getNode(Opc, SL, ResVT.first, Lo0, Lo1, Lo2, Op->getFlags());
7471 SDValue OpHi =
7472 DAG.getNode(Opc, SL, ResVT.second, Hi0, Hi1, Hi2, Op->getFlags());
7473
7474 return DAG.getNode(ISD::CONCAT_VECTORS, SDLoc(Op), VT, OpLo, OpHi);
7475}
7476
7477SDValue SITargetLowering::LowerOperation(SDValue Op, SelectionDAG &DAG) const {
7478 switch (Op.getOpcode()) {
7479 default:
7480 return AMDGPUTargetLowering::LowerOperation(Op, DAG);
7481 case ISD::BRCOND:
7482 return LowerBRCOND(Op, DAG);
7483 case ISD::RETURNADDR:
7484 return LowerRETURNADDR(Op, DAG);
7485 case ISD::SPONENTRY:
7486 return LowerSPONENTRY(Op, DAG);
7487 case ISD::LOAD: {
7488 SDValue Result = LowerLOAD(Op, DAG);
7489 assert((!Result.getNode() || Result.getNode()->getNumValues() == 2) &&
7490 "Load should return a value and a chain");
7491 return Result;
7492 }
7493 case ISD::FSQRT: {
7494 EVT VT = Op.getValueType();
7495 if (VT == MVT::f32)
7496 return lowerFSQRTF32(Op, DAG);
7497 if (VT == MVT::f64)
7498 return lowerFSQRTF64(Op, DAG);
7499 return SDValue();
7500 }
7501 case ISD::FSIN:
7502 case ISD::FCOS:
7503 return LowerTrig(Op, DAG);
7504 case ISD::SELECT:
7505 return LowerSELECT(Op, DAG);
7506 case ISD::FDIV:
7507 return LowerFDIV(Op, DAG);
7508 case ISD::FFREXP:
7509 return LowerFFREXP(Op, DAG);
7510 case ISD::ATOMIC_CMP_SWAP:
7511 return LowerATOMIC_CMP_SWAP(Op, DAG);
7512 case ISD::STORE:
7513 return LowerSTORE(Op, DAG);
7514 case ISD::GlobalAddress: {
7517 return LowerGlobalAddress(MFI, Op, DAG);
7518 }
7519 case ISD::ExternalSymbol:
7520 return LowerExternalSymbol(Op, DAG);
7521 case ISD::INTRINSIC_WO_CHAIN:
7522 return LowerINTRINSIC_WO_CHAIN(Op, DAG);
7523 case ISD::INTRINSIC_W_CHAIN:
7524 return LowerINTRINSIC_W_CHAIN(Op, DAG);
7525 case ISD::INTRINSIC_VOID:
7526 return LowerINTRINSIC_VOID(Op, DAG);
7527 case ISD::ADDRSPACECAST:
7528 return lowerADDRSPACECAST(Op, DAG);
7529 case ISD::INSERT_SUBVECTOR:
7530 return lowerINSERT_SUBVECTOR(Op, DAG);
7531 case ISD::INSERT_VECTOR_ELT:
7532 return lowerINSERT_VECTOR_ELT(Op, DAG);
7533 case ISD::EXTRACT_VECTOR_ELT:
7534 return lowerEXTRACT_VECTOR_ELT(Op, DAG);
7535 case ISD::VECTOR_SHUFFLE:
7536 return lowerVECTOR_SHUFFLE(Op, DAG);
7537 case ISD::SCALAR_TO_VECTOR:
7538 return lowerSCALAR_TO_VECTOR(Op, DAG);
7539 case ISD::BUILD_VECTOR:
7540 return lowerBUILD_VECTOR(Op, DAG);
7541 case ISD::FP_ROUND:
7542 case ISD::STRICT_FP_ROUND:
7543 return lowerFP_ROUND(Op, DAG);
7544 case ISD::TRAP:
7545 return lowerTRAP(Op, DAG);
7546 case ISD::DEBUGTRAP:
7547 return lowerDEBUGTRAP(Op, DAG);
7548 case ISD::ABS:
7549 case ISD::FABS:
7550 case ISD::FNEG:
7551 case ISD::FCANONICALIZE:
7552 case ISD::BSWAP:
7553 return splitUnaryVectorOp(Op, DAG);
7554 case ISD::FMINNUM:
7555 case ISD::FMAXNUM:
7556 return lowerFMINNUM_FMAXNUM(Op, DAG);
7557 case ISD::FMINIMUMNUM:
7558 case ISD::FMAXIMUMNUM:
7559 return lowerFMINIMUMNUM_FMAXIMUMNUM(Op, DAG);
7560 case ISD::FMINIMUM:
7561 case ISD::FMAXIMUM:
7562 return lowerFMINIMUM_FMAXIMUM(Op, DAG);
7563 case ISD::FLDEXP:
7564 case ISD::STRICT_FLDEXP:
7565 return lowerFLDEXP(Op, DAG);
7566 case ISD::FMA:
7567 return splitTernaryVectorOp(Op, DAG);
7568 case ISD::FP_TO_SINT:
7569 case ISD::FP_TO_UINT:
7570 if (Subtarget->getGeneration() >= AMDGPUSubtarget::GFX11 &&
7571 Op.getValueType() == MVT::i16 &&
7572 Op.getOperand(0).getValueType() == MVT::f32) {
7573 // Make f32->i16 legal so we can select V_CVT_PK_[IU]16_F32.
7574 return Op;
7575 }
7576 return LowerFP_TO_INT(Op, DAG);
7577 case ISD::SHL:
7578 case ISD::SRA:
7579 case ISD::SRL:
7580 case ISD::ADD:
7581 case ISD::SUB:
7582 case ISD::SMIN:
7583 case ISD::SMAX:
7584 case ISD::UMIN:
7585 case ISD::UMAX:
7586 case ISD::FADD:
7587 case ISD::FMUL:
7588 case ISD::FMINNUM_IEEE:
7589 case ISD::FMAXNUM_IEEE:
7590 case ISD::UADDSAT:
7591 case ISD::USUBSAT:
7592 case ISD::SADDSAT:
7593 case ISD::SSUBSAT:
7594 return splitBinaryVectorOp(Op, DAG);
7595 case ISD::FCOPYSIGN:
7596 return lowerFCOPYSIGN(Op, DAG);
7597 case ISD::MUL:
7598 return lowerMUL(Op, DAG);
7599 case ISD::SMULO:
7600 case ISD::UMULO:
7601 return lowerXMULO(Op, DAG);
7602 case ISD::SMUL_LOHI:
7603 case ISD::UMUL_LOHI:
7604 return lowerXMUL_LOHI(Op, DAG);
7605 case ISD::DYNAMIC_STACKALLOC:
7606 return LowerDYNAMIC_STACKALLOC(Op, DAG);
7607 case ISD::STACKSAVE:
7608 return LowerSTACKSAVE(Op, DAG);
7609 case ISD::GET_ROUNDING:
7610 return lowerGET_ROUNDING(Op, DAG);
7611 case ISD::SET_ROUNDING:
7612 return lowerSET_ROUNDING(Op, DAG);
7613 case ISD::PREFETCH:
7614 return lowerPREFETCH(Op, DAG);
7615 case ISD::FP_EXTEND:
7616 case ISD::STRICT_FP_EXTEND:
7617 return lowerFP_EXTEND(Op, DAG);
7618 case ISD::GET_FPENV:
7619 return lowerGET_FPENV(Op, DAG);
7620 case ISD::SET_FPENV:
7621 return lowerSET_FPENV(Op, DAG);
7622 case ISD::ROTR:
7623 return lowerROTR(Op, DAG);
7624 case ISD::INLINEASM:
7625 return LowerINLINEASM(Op, DAG);
7626 }
7627 return SDValue();
7628}
7629
7630// Used for D16: Casts the result of an instruction into the right vector,
7631// packs values if loads return unpacked values.
7632static SDValue adjustLoadValueTypeImpl(SDValue Result, EVT LoadVT,
7633 const SDLoc &DL, SelectionDAG &DAG,
7634 bool Unpacked) {
7635 if (!LoadVT.isVector())
7636 return Result;
7637
7638 // Cast back to the original packed type or to a larger type that is a
7639 // multiple of 32 bits for D16. Widening the return type is required for
7640 // legalization.
7641 EVT FittingLoadVT = LoadVT;
7642 if ((LoadVT.getVectorNumElements() % 2) == 1) {
7643 FittingLoadVT =
7644 EVT::getVectorVT(*DAG.getContext(), LoadVT.getVectorElementType(),
7645 LoadVT.getVectorNumElements() + 1);
7646 }
7647
7648 if (Unpacked) { // From v2i32/v4i32 back to v2f16/v4f16.
7649 // Truncate to v2i16/v4i16.
7650 EVT IntLoadVT = FittingLoadVT.changeTypeToInteger();
7651
7652 // Work around the legalizer not scalarizing truncate after vector op
7653 // legalization by not creating an intermediate vector trunc.
7654 SmallVector<SDValue, 4> Elts;
7655 DAG.ExtractVectorElements(Result, Elts);
7656 for (SDValue &Elt : Elts)
7657 Elt = DAG.getNode(ISD::TRUNCATE, DL, MVT::i16, Elt);
7658
7659 // Pad illegal v1i16/v3f16 to v4i16
7660 if ((LoadVT.getVectorNumElements() % 2) == 1)
7661 Elts.push_back(DAG.getPOISON(MVT::i16));
7662
7663 Result = DAG.getBuildVector(IntLoadVT, DL, Elts);
7664
7665 // Bitcast to original type (v2f16/v4f16).
7666 return DAG.getNode(ISD::BITCAST, DL, FittingLoadVT, Result);
7667 }
7668
7669 // Cast back to the original packed type.
7670 return DAG.getNode(ISD::BITCAST, DL, FittingLoadVT, Result);
7671}
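// Illustrative example (unpacked D16 memory): a v3f16 load produces a v3i32
// result; the code above truncates each element to i16, pads the odd element
// count with a poison element, builds a v4i16, and bitcasts it to the widened
// v4f16 type that legalization expects.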
7672
7673SDValue SITargetLowering::adjustLoadValueType(unsigned Opcode, MemSDNode *M,
7674 SelectionDAG &DAG,
7675 ArrayRef<SDValue> Ops,
7676 bool IsIntrinsic) const {
7677 SDLoc DL(M);
7678
7679 bool Unpacked = Subtarget->hasUnpackedD16VMem();
7680 EVT LoadVT = M->getValueType(0);
7681
7682 EVT EquivLoadVT = LoadVT;
7683 if (LoadVT.isVector()) {
7684 if (Unpacked) {
7685 EquivLoadVT = EVT::getVectorVT(*DAG.getContext(), MVT::i32,
7686 LoadVT.getVectorNumElements());
7687 } else if ((LoadVT.getVectorNumElements() % 2) == 1) {
7688 // Widen v3f16 to legal type
7689 EquivLoadVT =
7690 EVT::getVectorVT(*DAG.getContext(), LoadVT.getVectorElementType(),
7691 LoadVT.getVectorNumElements() + 1);
7692 }
7693 }
7694
7695 // Change from v4f16/v2f16 to EquivLoadVT.
7696 SDVTList VTList = DAG.getVTList(EquivLoadVT, MVT::Other);
7697
7698 SDValue Load = DAG.getMemIntrinsicNode(
7699 IsIntrinsic ? (unsigned)ISD::INTRINSIC_W_CHAIN : Opcode, DL, VTList, Ops,
7700 M->getMemoryVT(), M->getMemOperand());
7701
7702 SDValue Adjusted = adjustLoadValueTypeImpl(Load, LoadVT, DL, DAG, Unpacked);
7703
7704 return DAG.getMergeValues({Adjusted, Load.getValue(1)}, DL);
7705}
7706
7707SDValue SITargetLowering::lowerIntrinsicLoad(MemSDNode *M, bool IsFormat,
7708 SelectionDAG &DAG,
7709 ArrayRef<SDValue> Ops) const {
7710 SDLoc DL(M);
7711 EVT LoadVT = M->getValueType(0);
7712 EVT EltType = LoadVT.getScalarType();
7713 EVT IntVT = LoadVT.changeTypeToInteger();
7714
7715 bool IsD16 = IsFormat && (EltType.getSizeInBits() == 16);
7716
7717 assert(M->getNumValues() == 2 || M->getNumValues() == 3);
7718 bool IsTFE = M->getNumValues() == 3;
7719
7720 unsigned Opc = IsFormat ? (IsTFE ? AMDGPUISD::BUFFER_LOAD_FORMAT_TFE
7721 : AMDGPUISD::BUFFER_LOAD_FORMAT)
7722 : IsTFE ? AMDGPUISD::BUFFER_LOAD_TFE
7723 : AMDGPUISD::BUFFER_LOAD;
7724
7725 if (IsD16) {
7726 return adjustLoadValueType(AMDGPUISD::BUFFER_LOAD_FORMAT_D16, M, DAG, Ops);
7727 }
7728
7729 // Handle BUFFER_LOAD_BYTE/UBYTE/SHORT/USHORT overloaded intrinsics
7730 if (!IsD16 && !LoadVT.isVector() && EltType.getSizeInBits() < 32)
7731 return handleByteShortBufferLoads(DAG, LoadVT, DL, Ops, M->getMemOperand(),
7732 IsTFE);
7733
7734 if (isTypeLegal(LoadVT)) {
7735 return getMemIntrinsicNode(Opc, DL, M->getVTList(), Ops, IntVT,
7736 M->getMemOperand(), DAG);
7737 }
7738
7739 EVT CastVT = getEquivalentMemType(*DAG.getContext(), LoadVT);
7740 SDVTList VTList = DAG.getVTList(CastVT, MVT::Other);
7741 SDValue MemNode = getMemIntrinsicNode(Opc, DL, VTList, Ops, CastVT,
7742 M->getMemOperand(), DAG);
7743 return DAG.getMergeValues(
7744 {DAG.getNode(ISD::BITCAST, DL, LoadVT, MemNode), MemNode.getValue(1)},
7745 DL);
7746}
7747
7749 SelectionDAG &DAG) {
7750 EVT VT = N->getValueType(0);
7751 unsigned CondCode = N->getConstantOperandVal(3);
7752 if (!ICmpInst::isIntPredicate(static_cast<ICmpInst::Predicate>(CondCode)))
7753 return DAG.getPOISON(VT);
7754
7755 ICmpInst::Predicate IcInput = static_cast<ICmpInst::Predicate>(CondCode);
7756
7757 SDValue LHS = N->getOperand(1);
7758 SDValue RHS = N->getOperand(2);
7759
7760 SDLoc DL(N);
7761
7762 EVT CmpVT = LHS.getValueType();
7763 if (CmpVT == MVT::i16 && !TLI.isTypeLegal(MVT::i16)) {
7765 unsigned PromoteOp = ICmpInst::isSigned(IcInput) ? ISD::SIGN_EXTEND : ISD::ZERO_EXTEND;
7766 LHS = DAG.getNode(PromoteOp, DL, MVT::i32, LHS);
7767 RHS = DAG.getNode(PromoteOp, DL, MVT::i32, RHS);
7768 }
7769
7770 ISD::CondCode CCOpcode = getICmpCondCode(IcInput);
7771
7772 unsigned WavefrontSize = TLI.getSubtarget()->getWavefrontSize();
7773 EVT CCVT = EVT::getIntegerVT(*DAG.getContext(), WavefrontSize);
7774
7775 SDValue SetCC = DAG.getNode(AMDGPUISD::SETCC, DL, CCVT, LHS, RHS,
7776 DAG.getCondCode(CCOpcode));
7777 if (VT.bitsEq(CCVT))
7778 return SetCC;
7779 return DAG.getZExtOrTrunc(SetCC, DL, VT);
7780}
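// Example of the lowering above (wave64): a call such as
//   %m = call i64 @llvm.amdgcn.icmp.i64.i32(i32 %a, i32 %b, i32 32)
// (predicate 32 is ICmpInst::ICMP_EQ) becomes
//   (AMDGPUISD::SETCC %a, %b, seteq)
// yielding an i64 lane mask; i16 operands are first sign- or zero-extended to
// i32, and the mask is zero-extended or truncated if the result type differs
// from the wavefront size.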
7781
7783 SelectionDAG &DAG) {
7784 EVT VT = N->getValueType(0);
7785
7786 unsigned CondCode = N->getConstantOperandVal(3);
7787 if (!FCmpInst::isFPPredicate(static_cast<FCmpInst::Predicate>(CondCode)))
7788 return DAG.getPOISON(VT);
7789
7790 SDValue Src0 = N->getOperand(1);
7791 SDValue Src1 = N->getOperand(2);
7792 EVT CmpVT = Src0.getValueType();
7793 SDLoc SL(N);
7794
7795 if (CmpVT == MVT::f16 && !TLI.isTypeLegal(CmpVT)) {
7796 Src0 = DAG.getNode(ISD::FP_EXTEND, SL, MVT::f32, Src0);
7797 Src1 = DAG.getNode(ISD::FP_EXTEND, SL, MVT::f32, Src1);
7798 }
7799
7800 FCmpInst::Predicate IcInput = static_cast<FCmpInst::Predicate>(CondCode);
7801 ISD::CondCode CCOpcode = getFCmpCondCode(IcInput);
7802 unsigned WavefrontSize = TLI.getSubtarget()->getWavefrontSize();
7803 EVT CCVT = EVT::getIntegerVT(*DAG.getContext(), WavefrontSize);
7804 SDValue SetCC = DAG.getNode(AMDGPUISD::SETCC, SL, CCVT, Src0, Src1,
7805 DAG.getCondCode(CCOpcode));
7806 if (VT.bitsEq(CCVT))
7807 return SetCC;
7808 return DAG.getZExtOrTrunc(SetCC, SL, VT);
7809}
7810
7812 SelectionDAG &DAG) {
7813 EVT VT = N->getValueType(0);
7814 SDValue Src = N->getOperand(1);
7815 SDLoc SL(N);
7816
7817 if (Src.getOpcode() == ISD::SETCC) {
7818 SDValue Op0 = Src.getOperand(0);
7819 SDValue Op1 = Src.getOperand(1);
7820 // Need to expand bfloat to float for comparison (setcc).
7821 if (Op0.getValueType() == MVT::bf16) {
7822 Op0 = DAG.getNode(ISD::FP_EXTEND, SL, MVT::f32, Op0);
7823 Op1 = DAG.getNode(ISD::FP_EXTEND, SL, MVT::f32, Op1);
7824 }
7825 // (ballot (ISD::SETCC ...)) -> (AMDGPUISD::SETCC ...)
7826 return DAG.getNode(AMDGPUISD::SETCC, SL, VT, Op0, Op1, Src.getOperand(2));
7827 }
7828 if (const ConstantSDNode *Arg = dyn_cast<ConstantSDNode>(Src)) {
7829 // (ballot 0) -> 0
7830 if (Arg->isZero())
7831 return DAG.getConstant(0, SL, VT);
7832
7833 // (ballot 1) -> EXEC/EXEC_LO
7834 if (Arg->isOne()) {
7835 Register Exec;
7836 if (VT.getScalarSizeInBits() == 32)
7837 Exec = AMDGPU::EXEC_LO;
7838 else if (VT.getScalarSizeInBits() == 64)
7839 Exec = AMDGPU::EXEC;
7840 else
7841 return SDValue();
7842
7843 return DAG.getCopyFromReg(DAG.getEntryNode(), SL, Exec, VT);
7844 }
7845 }
7846
7847 // (ballot (i1 $src)) -> (AMDGPUISD::SETCC (i32 (zext $src)) (i32 0)
7848 // ISD::SETNE)
7849 return DAG.getNode(
7850 AMDGPUISD::SETCC, SL, VT, DAG.getZExtOrTrunc(Src, SL, MVT::i32),
7851 DAG.getConstant(0, SL, MVT::i32), DAG.getCondCode(ISD::SETNE));
7852}
7853
7855 SelectionDAG &DAG) {
7856 EVT VT = N->getValueType(0);
7857 unsigned ValSize = VT.getSizeInBits();
7858 unsigned IID = N->getConstantOperandVal(0);
7859 bool IsPermLane16 = IID == Intrinsic::amdgcn_permlane16 ||
7860 IID == Intrinsic::amdgcn_permlanex16;
7861 bool IsSetInactive = IID == Intrinsic::amdgcn_set_inactive ||
7862 IID == Intrinsic::amdgcn_set_inactive_chain_arg;
7863 bool IsPermlaneShuffle = IID == Intrinsic::amdgcn_permlane_bcast ||
7864 IID == Intrinsic::amdgcn_permlane_up ||
7865 IID == Intrinsic::amdgcn_permlane_down ||
7866 IID == Intrinsic::amdgcn_permlane_xor;
7867 SDLoc SL(N);
7868 MVT IntVT = MVT::getIntegerVT(ValSize);
7869 const GCNSubtarget *ST = TLI.getSubtarget();
7870 unsigned SplitSize = 32;
7871 if (IID == Intrinsic::amdgcn_update_dpp && (ValSize % 64 == 0) &&
7872 ST->hasDPALU_DPP() &&
7873 AMDGPU::isLegalDPALU_DPPControl(*ST, N->getConstantOperandVal(3)))
7874 SplitSize = 64;
7875
7876 auto createLaneOp = [&DAG, &SL, N, IID](SDValue Src0, SDValue Src1,
7877 SDValue Src2, MVT ValT) -> SDValue {
7878 SmallVector<SDValue, 8> Operands;
7879 switch (IID) {
7880 case Intrinsic::amdgcn_permlane16:
7881 case Intrinsic::amdgcn_permlanex16:
7882 case Intrinsic::amdgcn_update_dpp:
7883 Operands.push_back(N->getOperand(6));
7884 Operands.push_back(N->getOperand(5));
7885 Operands.push_back(N->getOperand(4));
7886 [[fallthrough]];
7887 case Intrinsic::amdgcn_writelane:
7888 case Intrinsic::amdgcn_permlane_bcast:
7889 case Intrinsic::amdgcn_permlane_up:
7890 case Intrinsic::amdgcn_permlane_down:
7891 case Intrinsic::amdgcn_permlane_xor:
7892 Operands.push_back(Src2);
7893 [[fallthrough]];
7894 case Intrinsic::amdgcn_readlane:
7895 case Intrinsic::amdgcn_set_inactive:
7896 case Intrinsic::amdgcn_set_inactive_chain_arg:
7897 case Intrinsic::amdgcn_mov_dpp8:
7898 Operands.push_back(Src1);
7899 [[fallthrough]];
7900 case Intrinsic::amdgcn_readfirstlane:
7901 case Intrinsic::amdgcn_permlane64:
7902 Operands.push_back(Src0);
7903 break;
7904 default:
7905 llvm_unreachable("unhandled lane op");
7906 }
7907
7908 Operands.push_back(DAG.getTargetConstant(IID, SL, MVT::i32));
7909 std::reverse(Operands.begin(), Operands.end());
7910
7911 if (SDNode *GL = N->getGluedNode()) {
7912 assert(GL->getOpcode() == ISD::CONVERGENCECTRL_GLUE);
7913 GL = GL->getOperand(0).getNode();
7914 Operands.push_back(DAG.getNode(ISD::CONVERGENCECTRL_GLUE, SL, MVT::Glue,
7915 SDValue(GL, 0)));
7916 }
7917
7918 return DAG.getNode(ISD::INTRINSIC_WO_CHAIN, SL, ValT, Operands);
7919 };
7920
7921 SDValue Src0 = N->getOperand(1);
7922 SDValue Src1, Src2;
7923 if (IID == Intrinsic::amdgcn_readlane || IID == Intrinsic::amdgcn_writelane ||
7924 IID == Intrinsic::amdgcn_mov_dpp8 ||
7925 IID == Intrinsic::amdgcn_update_dpp || IsSetInactive || IsPermLane16 ||
7926 IsPermlaneShuffle) {
7927 Src1 = N->getOperand(2);
7928 if (IID == Intrinsic::amdgcn_writelane ||
7929 IID == Intrinsic::amdgcn_update_dpp || IsPermLane16 ||
7930 IsPermlaneShuffle)
7931 Src2 = N->getOperand(3);
7932 }
7933
7934 if (ValSize == SplitSize) {
7935 // Already legal
7936 return SDValue();
7937 }
7938
7939 if (ValSize < 32) {
7940 bool IsFloat = VT.isFloatingPoint();
7941 Src0 = DAG.getAnyExtOrTrunc(IsFloat ? DAG.getBitcast(IntVT, Src0) : Src0,
7942 SL, MVT::i32);
7943
7944 if (IID == Intrinsic::amdgcn_update_dpp || IsSetInactive || IsPermLane16) {
7945 Src1 = DAG.getAnyExtOrTrunc(IsFloat ? DAG.getBitcast(IntVT, Src1) : Src1,
7946 SL, MVT::i32);
7947 }
7948
7949 if (IID == Intrinsic::amdgcn_writelane) {
7950 Src2 = DAG.getAnyExtOrTrunc(IsFloat ? DAG.getBitcast(IntVT, Src2) : Src2,
7951 SL, MVT::i32);
7952 }
7953
7954 SDValue LaneOp = createLaneOp(Src0, Src1, Src2, MVT::i32);
7955 SDValue Trunc = DAG.getAnyExtOrTrunc(LaneOp, SL, IntVT);
7956 return IsFloat ? DAG.getBitcast(VT, Trunc) : Trunc;
7957 }
7958
7959 if (ValSize % SplitSize != 0)
7960 return SDValue();
7961
7962 auto unrollLaneOp = [&DAG, &SL](SDNode *N) -> SDValue {
7963 EVT VT = N->getValueType(0);
7964 unsigned NE = VT.getVectorNumElements();
7965 EVT EltVT = VT.getVectorElementType();
7966 SmallVector<SDValue, 8> Scalars;
7967 unsigned NumOperands = N->getNumOperands();
7968 SmallVector<SDValue, 4> Operands(NumOperands);
7969 SDNode *GL = N->getGluedNode();
7970
7971 // only handle convergencectrl_glue
7973
7974 for (unsigned i = 0; i != NE; ++i) {
7975 for (unsigned j = 0, e = GL ? NumOperands - 1 : NumOperands; j != e;
7976 ++j) {
7977 SDValue Operand = N->getOperand(j);
7978 EVT OperandVT = Operand.getValueType();
7979 if (OperandVT.isVector()) {
7980 // A vector operand; extract a single element.
7981 EVT OperandEltVT = OperandVT.getVectorElementType();
7982 Operands[j] = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SL, OperandEltVT,
7983 Operand, DAG.getVectorIdxConstant(i, SL));
7984 } else {
7985 // A scalar operand; just use it as is.
7986 Operands[j] = Operand;
7987 }
7988 }
7989
7990 if (GL)
7991 Operands[NumOperands - 1] =
7992 DAG.getNode(ISD::CONVERGENCECTRL_GLUE, SL, MVT::Glue,
7993 SDValue(GL->getOperand(0).getNode(), 0));
7994
7995 Scalars.push_back(DAG.getNode(N->getOpcode(), SL, EltVT, Operands));
7996 }
7997
7998 EVT VecVT = EVT::getVectorVT(*DAG.getContext(), EltVT, NE);
7999 return DAG.getBuildVector(VecVT, SL, Scalars);
8000 };
8001
8002 if (VT.isVector()) {
8003 switch (MVT::SimpleValueType EltTy =
8004 VT.getVectorElementType().getSimpleVT().SimpleTy) {
8005 case MVT::i32:
8006 case MVT::f32:
8007 if (SplitSize == 32) {
8008 SDValue LaneOp = createLaneOp(Src0, Src1, Src2, VT.getSimpleVT());
8009 return unrollLaneOp(LaneOp.getNode());
8010 }
8011 [[fallthrough]];
8012 case MVT::i16:
8013 case MVT::f16:
8014 case MVT::bf16: {
8015 unsigned SubVecNumElt =
8016 SplitSize / VT.getVectorElementType().getSizeInBits();
8017 MVT SubVecVT = MVT::getVectorVT(EltTy, SubVecNumElt);
8018 SmallVector<SDValue, 4> Pieces;
8019 SDValue Src0SubVec, Src1SubVec, Src2SubVec;
8020 for (unsigned i = 0, EltIdx = 0; i < ValSize / SplitSize; i++) {
8021 Src0SubVec = DAG.getNode(ISD::EXTRACT_SUBVECTOR, SL, SubVecVT, Src0,
8022 DAG.getConstant(EltIdx, SL, MVT::i32));
8023
8024 if (IID == Intrinsic::amdgcn_update_dpp || IsSetInactive ||
8025 IsPermLane16) {
8026 Src1SubVec = DAG.getNode(ISD::EXTRACT_SUBVECTOR, SL, SubVecVT, Src1,
8027 DAG.getConstant(EltIdx, SL, MVT::i32));
8028
8029 Pieces.push_back(
8030 createLaneOp(Src0SubVec, Src1SubVec, Src2, SubVecVT));
8031 } else if (IID == Intrinsic::amdgcn_writelane) {
8032 Src2SubVec = DAG.getNode(ISD::EXTRACT_SUBVECTOR, SL, SubVecVT, Src2,
8033 DAG.getConstant(EltIdx, SL, MVT::i32));
8034 Pieces.push_back(
8035 createLaneOp(Src0SubVec, Src1, Src2SubVec, SubVecVT));
8036 } else {
8037 Pieces.push_back(createLaneOp(Src0SubVec, Src1, Src2, SubVecVT));
8038 }
8039
8040 EltIdx += SubVecNumElt;
8041 }
8042 return DAG.getNode(ISD::CONCAT_VECTORS, SL, VT, Pieces);
8043 }
8044 default:
8045 // Handle all other cases by bitcasting to i32 vectors
8046 break;
8047 }
8048 }
8049
8050 MVT VecVT =
8051 MVT::getVectorVT(MVT::getIntegerVT(SplitSize), ValSize / SplitSize);
8052 Src0 = DAG.getBitcast(VecVT, Src0);
8053
8054 if (IID == Intrinsic::amdgcn_update_dpp || IsSetInactive || IsPermLane16)
8055 Src1 = DAG.getBitcast(VecVT, Src1);
8056
8057 if (IID == Intrinsic::amdgcn_writelane)
8058 Src2 = DAG.getBitcast(VecVT, Src2);
8059
8060 SDValue LaneOp = createLaneOp(Src0, Src1, Src2, VecVT);
8061 SDValue UnrolledLaneOp = unrollLaneOp(LaneOp.getNode());
8062 return DAG.getBitcast(VT, UnrolledLaneOp);
8063}
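// Illustrative example: a 64-bit readlane is split here into two 32-bit
// pieces, roughly
//   %v  = bitcast i64 %val to v2i32
//   %e0 = llvm.amdgcn.readlane(i32 (extractelt %v, 0), i32 %lane)
//   %e1 = llvm.amdgcn.readlane(i32 (extractelt %v, 1), i32 %lane)
//   bitcast (build_vector %e0, %e1) to i64
// while sub-32-bit values are any-extended to i32, processed, and truncated
// back to the original type.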
8064
8066 SelectionDAG &DAG) {
8067 EVT VT = N->getValueType(0);
8068
8069 if (VT.getSizeInBits() != 32)
8070 return SDValue();
8071
8072 SDLoc SL(N);
8073
8074 SDValue Value = N->getOperand(1);
8075 SDValue Index = N->getOperand(2);
8076
8077 // ds_bpermute requires index to be multiplied by 4
8078 SDValue ShiftAmount = DAG.getShiftAmountConstant(2, MVT::i32, SL);
8079 SDValue ShiftedIndex =
8080 DAG.getNode(ISD::SHL, SL, Index.getValueType(), Index, ShiftAmount);
8081
8082 // Intrinsics will require i32 to operate on
8083 SDValue ValueI32 = DAG.getBitcast(MVT::i32, Value);
8084
8085 auto MakeIntrinsic = [&DAG, &SL](unsigned IID, MVT RetVT,
8086 SmallVector<SDValue> IntrinArgs) -> SDValue {
8087 SmallVector<SDValue> Operands(1);
8088 Operands[0] = DAG.getTargetConstant(IID, SL, MVT::i32);
8089 Operands.append(IntrinArgs);
8090 return DAG.getNode(ISD::INTRINSIC_WO_CHAIN, SL, RetVT, Operands);
8091 };
8092
8093 // If we can bpermute across the whole wave, then just do that
8094 if (TLI.getSubtarget()->isWave32()) {
8095 SDValue BPermute = MakeIntrinsic(Intrinsic::amdgcn_ds_bpermute, MVT::i32,
8096 {ShiftedIndex, ValueI32});
8097 return DAG.getBitcast(VT, BPermute);
8098 }
8099
8100 assert(TLI.getSubtarget()->isWave64());
8101
8102 // Otherwise, we need to make use of whole wave mode
8103 SDValue PoisonVal = DAG.getPOISON(ValueI32->getValueType(0));
8104
8105 // Set inactive lanes to poison
8106 SDValue WWMValue = MakeIntrinsic(Intrinsic::amdgcn_set_inactive, MVT::i32,
8107 {ValueI32, PoisonVal});
8108 SDValue WWMIndex = MakeIntrinsic(Intrinsic::amdgcn_set_inactive, MVT::i32,
8109 {ShiftedIndex, PoisonVal});
8110
8111 SDValue Swapped =
8112 MakeIntrinsic(Intrinsic::amdgcn_permlane64, MVT::i32, {WWMValue});
8113
8114 // Get permutation of each half, then we'll select which one to use
8115 SDValue BPermSameHalf = MakeIntrinsic(Intrinsic::amdgcn_ds_bpermute, MVT::i32,
8116 {WWMIndex, WWMValue});
8117 SDValue BPermOtherHalf = MakeIntrinsic(Intrinsic::amdgcn_ds_bpermute,
8118 MVT::i32, {WWMIndex, Swapped});
8119 SDValue BPermOtherHalfWWM =
8120 MakeIntrinsic(Intrinsic::amdgcn_wwm, MVT::i32, {BPermOtherHalf});
8121
8122 // Select which side to take the permute from
8123 SDValue ThreadIDMask = DAG.getAllOnesConstant(SL, MVT::i32);
8124 // We can get away with only using mbcnt_lo here since we're only
8125 // trying to detect which side of 32 each lane is on, and mbcnt_lo
8126 // returns 32 for lanes 32-63.
8127 SDValue ThreadID =
8128 MakeIntrinsic(Intrinsic::amdgcn_mbcnt_lo, MVT::i32,
8129 {ThreadIDMask, DAG.getTargetConstant(0, SL, MVT::i32)});
8130
8131 SDValue SameOrOtherHalf =
8132 DAG.getNode(ISD::AND, SL, MVT::i32,
8133 DAG.getNode(ISD::XOR, SL, MVT::i32, ThreadID, Index),
8134 DAG.getTargetConstant(32, SL, MVT::i32));
8135 SDValue UseSameHalf =
8136 DAG.getSetCC(SL, MVT::i1, SameOrOtherHalf,
8137 DAG.getConstant(0, SL, MVT::i32), ISD::SETEQ);
8138 SDValue Result = DAG.getSelect(SL, MVT::i32, UseSameHalf, BPermSameHalf,
8139 BPermOtherHalfWWM);
8140 return DAG.getBitcast(VT, Result);
8141}
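// Summary of the wave64 emulation above (illustrative): each lane takes a
// ds_bpermute within its own 32-lane half and within the swapped half
// (obtained via permlane64 under whole-wave mode), then selects between the
// two results depending on whether bit 5 of (lane ^ index) is set, i.e.
// whether the requested source lane lives in the other half of the wave.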
8142
8143void SITargetLowering::ReplaceNodeResults(SDNode *N,
8144 SmallVectorImpl<SDValue> &Results,
8145 SelectionDAG &DAG) const {
8146 switch (N->getOpcode()) {
8147 case ISD::INSERT_VECTOR_ELT: {
8148 if (SDValue Res = lowerINSERT_VECTOR_ELT(SDValue(N, 0), DAG))
8149 Results.push_back(Res);
8150 return;
8151 }
8152 case ISD::EXTRACT_VECTOR_ELT: {
8153 if (SDValue Res = lowerEXTRACT_VECTOR_ELT(SDValue(N, 0), DAG))
8154 Results.push_back(Res);
8155 return;
8156 }
8157 case ISD::INTRINSIC_WO_CHAIN: {
8158 unsigned IID = N->getConstantOperandVal(0);
8159 switch (IID) {
8160 case Intrinsic::amdgcn_make_buffer_rsrc:
8161 Results.push_back(lowerPointerAsRsrcIntrin(N, DAG));
8162 return;
8163 case Intrinsic::amdgcn_cvt_pkrtz: {
8164 SDValue Src0 = N->getOperand(1);
8165 SDValue Src1 = N->getOperand(2);
8166 SDLoc SL(N);
8167 SDValue Cvt =
8168 DAG.getNode(AMDGPUISD::CVT_PKRTZ_F16_F32, SL, MVT::i32, Src0, Src1);
8169 Results.push_back(DAG.getNode(ISD::BITCAST, SL, MVT::v2f16, Cvt));
8170 return;
8171 }
8172 case Intrinsic::amdgcn_cvt_pknorm_i16:
8173 case Intrinsic::amdgcn_cvt_pknorm_u16:
8174 case Intrinsic::amdgcn_cvt_pk_i16:
8175 case Intrinsic::amdgcn_cvt_pk_u16: {
8176 SDValue Src0 = N->getOperand(1);
8177 SDValue Src1 = N->getOperand(2);
8178 SDLoc SL(N);
8179 unsigned Opcode;
8180
8181 if (IID == Intrinsic::amdgcn_cvt_pknorm_i16)
8182 Opcode = AMDGPUISD::CVT_PKNORM_I16_F32;
8183 else if (IID == Intrinsic::amdgcn_cvt_pknorm_u16)
8184 Opcode = AMDGPUISD::CVT_PKNORM_U16_F32;
8185 else if (IID == Intrinsic::amdgcn_cvt_pk_i16)
8186 Opcode = AMDGPUISD::CVT_PK_I16_I32;
8187 else
8188 Opcode = AMDGPUISD::CVT_PK_U16_U32;
8189
8190 EVT VT = N->getValueType(0);
8191 if (isTypeLegal(VT))
8192 Results.push_back(DAG.getNode(Opcode, SL, VT, Src0, Src1));
8193 else {
8194 SDValue Cvt = DAG.getNode(Opcode, SL, MVT::i32, Src0, Src1);
8195 Results.push_back(DAG.getNode(ISD::BITCAST, SL, MVT::v2i16, Cvt));
8196 }
8197 return;
8198 }
8199 case Intrinsic::amdgcn_s_buffer_load: {
8200 // Lower llvm.amdgcn.s.buffer.load.(i8, u8) intrinsics. First, we generate
8201 // s_buffer_load_u8 for signed and unsigned load instructions. Next, DAG
8202 // combiner tries to merge the s_buffer_load_u8 with a sext instruction
8203 // (performSignExtendInRegCombine()) and it replaces s_buffer_load_u8 with
8204 // s_buffer_load_i8.
8205 if (!Subtarget->hasScalarSubwordLoads())
8206 return;
8207 SDValue Op = SDValue(N, 0);
8208 SDValue Rsrc = Op.getOperand(1);
8209 SDValue Offset = Op.getOperand(2);
8210 SDValue CachePolicy = Op.getOperand(3);
8211 EVT VT = Op.getValueType();
8212 assert(VT == MVT::i8 && "Expected 8-bit s_buffer_load intrinsics.\n");
8213 SDLoc DL(Op);
8215 const DataLayout &DataLayout = DAG.getDataLayout();
8216 Align Alignment =
8222 VT.getStoreSize(), Alignment);
8223 SDValue LoadVal;
8224 if (!Offset->isDivergent()) {
8225 SDValue Ops[] = {Rsrc, // source register
8226 Offset, CachePolicy};
8227 SDValue BufferLoad =
8228 DAG.getMemIntrinsicNode(AMDGPUISD::SBUFFER_LOAD_UBYTE, DL,
8229 DAG.getVTList(MVT::i32), Ops, VT, MMO);
8230 LoadVal = DAG.getNode(ISD::TRUNCATE, DL, VT, BufferLoad);
8231 } else {
8232 SDValue Ops[] = {
8233 DAG.getEntryNode(), // Chain
8234 Rsrc, // rsrc
8235 DAG.getConstant(0, DL, MVT::i32), // vindex
8236 {}, // voffset
8237 {}, // soffset
8238 {}, // offset
8239 CachePolicy, // cachepolicy
8240 DAG.getTargetConstant(0, DL, MVT::i1), // idxen
8241 };
8242 setBufferOffsets(Offset, DAG, &Ops[3], Align(4));
8243 LoadVal = handleByteShortBufferLoads(DAG, VT, DL, Ops, MMO);
8244 }
8245 Results.push_back(LoadVal);
8246 return;
8247 }
8248 case Intrinsic::amdgcn_dead: {
8249 for (unsigned I = 0, E = N->getNumValues(); I < E; ++I)
8250 Results.push_back(DAG.getPOISON(N->getValueType(I)));
8251 return;
8252 }
8253 }
8254 break;
8255 }
8256 case ISD::INTRINSIC_W_CHAIN: {
8257 if (SDValue Res = LowerINTRINSIC_W_CHAIN(SDValue(N, 0), DAG)) {
8258 if (Res.getOpcode() == ISD::MERGE_VALUES) {
8259 // FIXME: Hacky
8260 for (unsigned I = 0; I < Res.getNumOperands(); I++) {
8261 Results.push_back(Res.getOperand(I));
8262 }
8263 } else {
8264 Results.push_back(Res);
8265 Results.push_back(Res.getValue(1));
8266 }
8267 return;
8268 }
8269
8270 break;
8271 }
8272 case ISD::SELECT: {
8273 SDLoc SL(N);
8274 EVT VT = N->getValueType(0);
8275 EVT NewVT = getEquivalentMemType(*DAG.getContext(), VT);
8276 SDValue LHS = DAG.getNode(ISD::BITCAST, SL, NewVT, N->getOperand(1));
8277 SDValue RHS = DAG.getNode(ISD::BITCAST, SL, NewVT, N->getOperand(2));
8278
8279 EVT SelectVT = NewVT;
8280 if (NewVT.bitsLT(MVT::i32)) {
8281 LHS = DAG.getNode(ISD::ANY_EXTEND, SL, MVT::i32, LHS);
8282 RHS = DAG.getNode(ISD::ANY_EXTEND, SL, MVT::i32, RHS);
8283 SelectVT = MVT::i32;
8284 }
8285
8286 SDValue NewSelect =
8287 DAG.getNode(ISD::SELECT, SL, SelectVT, N->getOperand(0), LHS, RHS);
8288
8289 if (NewVT != SelectVT)
8290 NewSelect = DAG.getNode(ISD::TRUNCATE, SL, NewVT, NewSelect);
8291 Results.push_back(DAG.getNode(ISD::BITCAST, SL, VT, NewSelect));
8292 return;
8293 }
8294 case ISD::FNEG: {
8295 if (N->getValueType(0) != MVT::v2f16)
8296 break;
8297
8298 SDLoc SL(N);
8299 SDValue BC = DAG.getNode(ISD::BITCAST, SL, MVT::i32, N->getOperand(0));
8300
8301 SDValue Op = DAG.getNode(ISD::XOR, SL, MVT::i32, BC,
8302 DAG.getConstant(0x80008000, SL, MVT::i32));
8303 Results.push_back(DAG.getNode(ISD::BITCAST, SL, MVT::v2f16, Op));
8304 return;
8305 }
8306 case ISD::FABS: {
8307 if (N->getValueType(0) != MVT::v2f16)
8308 break;
8309
8310 SDLoc SL(N);
8311 SDValue BC = DAG.getNode(ISD::BITCAST, SL, MVT::i32, N->getOperand(0));
8312
8313 SDValue Op = DAG.getNode(ISD::AND, SL, MVT::i32, BC,
8314 DAG.getConstant(0x7fff7fff, SL, MVT::i32));
8315 Results.push_back(DAG.getNode(ISD::BITCAST, SL, MVT::v2f16, Op));
8316 return;
8317 }
8318 case ISD::FSQRT: {
8319 if (N->getValueType(0) != MVT::f16)
8320 break;
8321 Results.push_back(lowerFSQRTF16(SDValue(N, 0), DAG));
8322 break;
8323 }
8324 default:
8325 AMDGPUTargetLowering::ReplaceNodeResults(N, Results, DAG);
8326 break;
8327 }
8328}
8329
8330/// Helper function for LowerBRCOND
8331static SDNode *findUser(SDValue Value, unsigned Opcode) {
8332
8333 for (SDUse &U : Value->uses()) {
8334 if (U.get() != Value)
8335 continue;
8336
8337 if (U.getUser()->getOpcode() == Opcode)
8338 return U.getUser();
8339 }
8340 return nullptr;
8341}
8342
8343unsigned SITargetLowering::isCFIntrinsic(const SDNode *Intr) const {
8344 if (Intr->getOpcode() == ISD::INTRINSIC_W_CHAIN) {
8345 switch (Intr->getConstantOperandVal(1)) {
8346 case Intrinsic::amdgcn_if:
8347 return AMDGPUISD::IF;
8348 case Intrinsic::amdgcn_else:
8349 return AMDGPUISD::ELSE;
8350 case Intrinsic::amdgcn_loop:
8351 return AMDGPUISD::LOOP;
8352 case Intrinsic::amdgcn_end_cf:
8353 llvm_unreachable("should not occur");
8354 default:
8355 return 0;
8356 }
8357 }
8358
8359 // break, if_break, else_break are all only used as inputs to loop, not
8360 // directly as branch conditions.
8361 return 0;
8362}
8363
8370
8372 if (Subtarget->isAmdPalOS() || Subtarget->isMesa3DOS())
8373 return false;
8374
8375 // FIXME: Either avoid relying on address space here or change the default
8376 // address space for functions to avoid the explicit check.
8377 return (GV->getValueType()->isFunctionTy() ||
8380}
8381
8383 return !shouldEmitFixup(GV) && !shouldEmitGOTReloc(GV);
8384}
8385
8387 if (!GV->hasExternalLinkage())
8388 return true;
8389
8390 // With object linking, external LDS declarations need relocations so the
8391 // linker can assign their offsets.
8393 if (const auto *GVar = dyn_cast<GlobalVariable>(GV)) {
8394 if (GVar->getAddressSpace() == AMDGPUAS::LOCAL_ADDRESS) {
8395 assert(GVar->isDeclaration() && "AS3 GVs should be declaration here "
8396 "when object linking is enabled");
8397 return false;
8398 }
8399 }
8400 }
8401
8402 const auto OS = getTargetMachine().getTargetTriple().getOS();
8403 return OS == Triple::AMDHSA || OS == Triple::AMDPAL;
8404}
8405
8406 /// This transforms the control flow intrinsics to get the branch destination as
8407 /// the last parameter, and also switches the branch target with BR if the need arises.
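/// For illustration (an assumed example, not taken from a test): a DAG of the form
///   brcond (setcc (int_amdgcn_if %cond), 1, setne), BB
/// is rebuilt as an AMDGPUISD::IF node whose operands are the intrinsic's
/// operands followed by BB as the branch destination. If the condition is not
/// negated, the destination is instead taken from the unconditional BR user,
/// and that BR is retargeted to BRCOND's original destination.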
8408SDValue SITargetLowering::LowerBRCOND(SDValue BRCOND, SelectionDAG &DAG) const {
8409 SDLoc DL(BRCOND);
8410
8411 SDNode *Intr = BRCOND.getOperand(1).getNode();
8412 SDValue Target = BRCOND.getOperand(2);
8413 SDNode *BR = nullptr;
8414 SDNode *SetCC = nullptr;
8415
8416 switch (Intr->getOpcode()) {
8417 case ISD::SETCC: {
8418 // As long as we negate the condition everything is fine
8419 SetCC = Intr;
8420 Intr = SetCC->getOperand(0).getNode();
8421 break;
8422 }
8423 case ISD::XOR: {
8424 // Similar to SETCC, if we have (xor c, -1), we will be fine.
8425 SDValue LHS = Intr->getOperand(0);
8426 SDValue RHS = Intr->getOperand(1);
8427 if (auto *C = dyn_cast<ConstantSDNode>(RHS); C && C->getZExtValue()) {
8428 Intr = LHS.getNode();
8429 break;
8430 }
8431 [[fallthrough]];
8432 }
8433 default: {
8434 // Get the target from BR if we don't negate the condition
8435 BR = findUser(BRCOND, ISD::BR);
8436 assert(BR && "brcond missing unconditional branch user");
8437 Target = BR->getOperand(1);
8438 }
8439 }
8440
8441 unsigned CFNode = isCFIntrinsic(Intr);
8442 if (CFNode == 0) {
8443 // This is a uniform branch so we don't need to legalize.
8444 return BRCOND;
8445 }
8446
8447 bool HaveChain = Intr->getOpcode() == ISD::INTRINSIC_VOID ||
8449
8450 assert(!SetCC ||
8451 (SetCC->getConstantOperandVal(1) == 1 &&
8452 cast<CondCodeSDNode>(SetCC->getOperand(2).getNode())->get() ==
8453 ISD::SETNE));
8454
8455 // operands of the new intrinsic call
8457 if (HaveChain)
8458 Ops.push_back(BRCOND.getOperand(0));
8459
8460 Ops.append(Intr->op_begin() + (HaveChain ? 2 : 1), Intr->op_end());
8461 Ops.push_back(Target);
8462
8463 ArrayRef<EVT> Res(Intr->value_begin() + 1, Intr->value_end());
8464
8465 // build the new intrinsic call
8466 SDNode *Result = DAG.getNode(CFNode, DL, DAG.getVTList(Res), Ops).getNode();
8467
8468 if (!HaveChain) {
8469 SDValue Ops[] = {SDValue(Result, 0), BRCOND.getOperand(0)};
8470
8472 }
8473
8474 if (BR) {
8475 // Give the branch instruction our target
8476 SDValue Ops[] = {BR->getOperand(0), BRCOND.getOperand(2)};
8477 SDValue NewBR = DAG.getNode(ISD::BR, DL, BR->getVTList(), Ops);
8478 DAG.ReplaceAllUsesWith(BR, NewBR.getNode());
8479 }
8480
8481 SDValue Chain = SDValue(Result, Result->getNumValues() - 1);
8482
8483 // Copy the intrinsic results to registers
8484 for (unsigned i = 1, e = Intr->getNumValues() - 1; i != e; ++i) {
8485 SDNode *CopyToReg = findUser(SDValue(Intr, i), ISD::CopyToReg);
8486 if (!CopyToReg)
8487 continue;
8488
8489 Chain = DAG.getCopyToReg(Chain, DL, CopyToReg->getOperand(1),
8490 SDValue(Result, i - 1), SDValue());
8491
8492 DAG.ReplaceAllUsesWith(SDValue(CopyToReg, 0), CopyToReg->getOperand(0));
8493 }
8494
8495 // Remove the old intrinsic from the chain
8496 DAG.ReplaceAllUsesOfValueWith(SDValue(Intr, Intr->getNumValues() - 1),
8497 Intr->getOperand(0));
8498
8499 return Chain;
8500}
8501
8502SDValue SITargetLowering::LowerRETURNADDR(SDValue Op, SelectionDAG &DAG) const {
8503 MVT VT = Op.getSimpleValueType();
8504 SDLoc DL(Op);
8505 // Checking the depth
8506 if (Op.getConstantOperandVal(0) != 0)
8507 return DAG.getConstant(0, DL, VT);
8508
8509 MachineFunction &MF = DAG.getMachineFunction();
8510 const SIMachineFunctionInfo *Info = MF.getInfo<SIMachineFunctionInfo>();
8511 // Check for kernel and shader functions
8512 if (Info->isEntryFunction())
8513 return DAG.getConstant(0, DL, VT);
8514
8515 MachineFrameInfo &MFI = MF.getFrameInfo();
8516 // There is a call to @llvm.returnaddress in this function
8517 MFI.setReturnAddressIsTaken(true);
8518
8519 const SIRegisterInfo *TRI = getSubtarget()->getRegisterInfo();
8520 // Get the return address reg and mark it as an implicit live-in
8521 Register Reg = MF.addLiveIn(TRI->getReturnAddressReg(MF),
8522 getRegClassFor(VT, Op.getNode()->isDivergent()));
8523
8524 return DAG.getCopyFromReg(DAG.getEntryNode(), DL, Reg, VT);
8525}
8526
8527SDValue SITargetLowering::LowerSPONENTRY(SDValue Op, SelectionDAG &DAG) const {
8528 MachineFunction &MF = DAG.getMachineFunction();
8529 SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();
8530
8531 // For functions that set up their own stack, select the GET_STACK_BASE
8532 // pseudo.
8533 if (MFI->isBottomOfStack())
8534 return Op;
8535
8536 // For everything else, create a dummy stack object.
8537 int FI = MF.getFrameInfo().CreateFixedObject(1, 0, /*IsImmutable=*/false);
8538 return DAG.getFrameIndex(FI, Op.getValueType());
8539}
8540
8541SDValue SITargetLowering::getFPExtOrFPRound(SelectionDAG &DAG, SDValue Op,
8542 const SDLoc &DL, EVT VT) const {
8543 return Op.getValueType().bitsLE(VT)
8544 ? DAG.getNode(ISD::FP_EXTEND, DL, VT, Op)
8545 : DAG.getNode(ISD::FP_ROUND, DL, VT, Op,
8546 DAG.getTargetConstant(0, DL, MVT::i32));
8547}
8548
8549SDValue SITargetLowering::splitFP_ROUNDVectorOp(SDValue Op,
8550 SelectionDAG &DAG) const {
8551 EVT DstVT = Op.getValueType();
8552 unsigned NumElts = DstVT.getVectorNumElements();
8553 assert(NumElts > 2 && isPowerOf2_32(NumElts));
8554
8555 auto [Lo, Hi] = DAG.SplitVectorOperand(Op.getNode(), 0);
8556
8557 SDLoc DL(Op);
8558 unsigned Opc = Op.getOpcode();
8559 SDValue Flags = Op.getOperand(1);
8560 EVT HalfDstVT =
8561 EVT::getVectorVT(*DAG.getContext(), DstVT.getScalarType(), NumElts / 2);
8562 SDValue OpLo = DAG.getNode(Opc, DL, HalfDstVT, Lo, Flags);
8563 SDValue OpHi = DAG.getNode(Opc, DL, HalfDstVT, Hi, Flags);
8564
8565 return DAG.getNode(ISD::CONCAT_VECTORS, DL, DstVT, OpLo, OpHi);
8566}
8567
8568SDValue SITargetLowering::lowerFP_ROUND(SDValue Op, SelectionDAG &DAG) const {
8569 SDValue Src = Op.getOperand(0);
8570 EVT SrcVT = Src.getValueType();
8571 EVT DstVT = Op.getValueType();
8572
8573 if (DstVT.isVector() && DstVT.getScalarType() == MVT::f16) {
8574 assert(Subtarget->hasCvtPkF16F32Inst() && "support v_cvt_pk_f16_f32");
8575 if (SrcVT.getScalarType() != MVT::f32)
8576 return SDValue();
8577 return SrcVT == MVT::v2f32 ? Op : splitFP_ROUNDVectorOp(Op, DAG);
8578 }
8579
8580 if (SrcVT.getScalarType() != MVT::f64)
8581 return Op;
8582
8583 SDLoc DL(Op);
8584 if (DstVT == MVT::f16) {
8585 // TODO: Handle strictfp
8586 if (Op.getOpcode() != ISD::FP_ROUND)
8587 return Op;
8588
8589 if (!Subtarget->has16BitInsts()) {
8590 SDValue FpToFp16 = DAG.getNode(ISD::FP_TO_FP16, DL, MVT::i32, Src);
8591 SDValue Trunc = DAG.getNode(ISD::TRUNCATE, DL, MVT::i16, FpToFp16);
8592 return DAG.getNode(ISD::BITCAST, DL, MVT::f16, Trunc);
8593 }
8594 if (Op->getFlags().hasApproximateFuncs()) {
8595 SDValue Flags = Op.getOperand(1);
8596 SDValue Src32 = DAG.getNode(ISD::FP_ROUND, DL, MVT::f32, Src, Flags);
8597 return DAG.getNode(ISD::FP_ROUND, DL, MVT::f16, Src32, Flags);
8598 }
8599 SDValue FpToFp16 = LowerF64ToF16Safe(Src, DL, DAG);
8600 SDValue Trunc = DAG.getNode(ISD::TRUNCATE, DL, MVT::i16, FpToFp16);
8601 return DAG.getNode(ISD::BITCAST, DL, MVT::f16, Trunc);
8602 }
8603
8604 assert(DstVT.getScalarType() == MVT::bf16 &&
8605 "custom lower FP_ROUND for f16 or bf16");
8606 assert(Subtarget->hasBF16ConversionInsts() && "f32 -> bf16 is legal");
8607
8608 // Round-inexact-to-odd f64 to f32, then do the final rounding using the
8609 // hardware f32 -> bf16 instruction.
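// Rounding to odd in the first step preserves the sticky information in the
// f32 intermediate, so the second, correctly rounded f32 -> bf16 step cannot
// double-round.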
8610 EVT F32VT = SrcVT.changeElementType(*DAG.getContext(), MVT::f32);
8611 SDValue Rod = expandRoundInexactToOdd(F32VT, Src, DL, DAG);
8612 return DAG.getNode(ISD::FP_ROUND, DL, DstVT, Rod,
8613 DAG.getTargetConstant(0, DL, MVT::i32));
8614}
8615
8616SDValue SITargetLowering::lowerFMINNUM_FMAXNUM(SDValue Op,
8617 SelectionDAG &DAG) const {
8618 EVT VT = Op.getValueType();
8619 const MachineFunction &MF = DAG.getMachineFunction();
8620 const SIMachineFunctionInfo *Info = MF.getInfo<SIMachineFunctionInfo>();
8621 bool IsIEEEMode = Info->getMode().IEEE;
8622
8623 // FIXME: Assert during selection that this is only selected for
8624 // ieee_mode. Currently a combine can produce the ieee version for non-ieee
8625 // mode functions, but this happens to be OK since it's only done in cases
8626 // where it is known that no sNaN can occur.
8627 if (IsIEEEMode)
8628 return expandFMINNUM_FMAXNUM(Op.getNode(), DAG);
8629
8630 if (VT == MVT::v4f16 || VT == MVT::v8f16 || VT == MVT::v16f16 ||
8631 VT == MVT::v16bf16)
8632 return splitBinaryVectorOp(Op, DAG);
8633 return Op;
8634}
8635
8636SDValue
8637SITargetLowering::lowerFMINIMUMNUM_FMAXIMUMNUM(SDValue Op,
8638 SelectionDAG &DAG) const {
8639 EVT VT = Op.getValueType();
8640 const MachineFunction &MF = DAG.getMachineFunction();
8641 const SIMachineFunctionInfo *Info = MF.getInfo<SIMachineFunctionInfo>();
8642 bool IsIEEEMode = Info->getMode().IEEE;
8643
8644 if (IsIEEEMode)
8645 return expandFMINIMUMNUM_FMAXIMUMNUM(Op.getNode(), DAG);
8646
8647 if (VT == MVT::v4f16 || VT == MVT::v8f16 || VT == MVT::v16f16 ||
8648 VT == MVT::v16bf16)
8649 return splitBinaryVectorOp(Op, DAG);
8650 return Op;
8651}
8652
8653SDValue SITargetLowering::lowerFMINIMUM_FMAXIMUM(SDValue Op,
8654 SelectionDAG &DAG) const {
8655 EVT VT = Op.getValueType();
8656 if (VT.isVector())
8657 return splitBinaryVectorOp(Op, DAG);
8658
8659 assert(!Subtarget->hasIEEEMinimumMaximumInsts() &&
8660 !Subtarget->hasMinimum3Maximum3F16() &&
8661 Subtarget->hasMinimum3Maximum3PKF16() && VT == MVT::f16 &&
8662 "should not need to widen f16 minimum/maximum to v2f16");
8663
8664 // Widen f16 operation to v2f16
8665
8666 // fminimum f16:x, f16:y ->
8667 // extract_vector_elt (fminimum (v2f16 (scalar_to_vector x))
8668 // (v2f16 (scalar_to_vector y))), 0
8669 SDLoc SL(Op);
8670 SDValue WideSrc0 =
8671 DAG.getNode(ISD::SCALAR_TO_VECTOR, SL, MVT::v2f16, Op.getOperand(0));
8672 SDValue WideSrc1 =
8673 DAG.getNode(ISD::SCALAR_TO_VECTOR, SL, MVT::v2f16, Op.getOperand(1));
8674
8675 SDValue Widened =
8676 DAG.getNode(Op.getOpcode(), SL, MVT::v2f16, WideSrc0, WideSrc1);
8677
8678 return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SL, MVT::f16, Widened,
8679 DAG.getConstant(0, SL, MVT::i32));
8680}
8681
8682SDValue SITargetLowering::lowerFLDEXP(SDValue Op, SelectionDAG &DAG) const {
8683 bool IsStrict = Op.getOpcode() == ISD::STRICT_FLDEXP;
8684 EVT VT = Op.getValueType();
8685 assert(VT == MVT::f16);
8686
8687 SDValue Exp = Op.getOperand(IsStrict ? 2 : 1);
8688 EVT ExpVT = Exp.getValueType();
8689 if (ExpVT == MVT::i16)
8690 return Op;
8691
8692 SDLoc DL(Op);
8693
8694 // Correct the exponent type for f16 to i16.
8695 // Clamp the range of the exponent to the instruction's range.
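// For example, an i32 exponent of 100000 clamps to 32767 and an exponent of
// -100000 clamps to -32768 before the truncate to i16.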
8696
8697 // TODO: This should be a generic narrowing legalization, and can easily be
8698 // done for GlobalISel as well.
8699
8700 SDValue MinExp = DAG.getSignedConstant(minIntN(16), DL, ExpVT);
8701 SDValue ClampMin = DAG.getNode(ISD::SMAX, DL, ExpVT, Exp, MinExp);
8702
8703 SDValue MaxExp = DAG.getSignedConstant(maxIntN(16), DL, ExpVT);
8704 SDValue Clamp = DAG.getNode(ISD::SMIN, DL, ExpVT, ClampMin, MaxExp);
8705
8706 SDValue TruncExp = DAG.getNode(ISD::TRUNCATE, DL, MVT::i16, Clamp);
8707
8708 if (IsStrict) {
8709 return DAG.getNode(ISD::STRICT_FLDEXP, DL, {VT, MVT::Other},
8710 {Op.getOperand(0), Op.getOperand(1), TruncExp});
8711 }
8712
8713 return DAG.getNode(ISD::FLDEXP, DL, VT, Op.getOperand(0), TruncExp);
8714}
8715
8717 switch (Op->getOpcode()) {
8718 case ISD::SRA:
8719 case ISD::SMIN:
8720 case ISD::SMAX:
8721 return ISD::SIGN_EXTEND;
8722 case ISD::SRL:
8723 case ISD::UMIN:
8724 case ISD::UMAX:
8725 return ISD::ZERO_EXTEND;
8726 case ISD::ADD:
8727 case ISD::SUB:
8728 case ISD::AND:
8729 case ISD::OR:
8730 case ISD::XOR:
8731 case ISD::SHL:
8732 case ISD::SELECT:
8733 case ISD::MUL:
8734 // The operation's result won't be influenced by garbage high bits.
8735 // TODO: are all of those cases correct, and are there more?
8736 return ISD::ANY_EXTEND;
8737 case ISD::SETCC: {
8738 ISD::CondCode CC = cast<CondCodeSDNode>(Op.getOperand(2))->get();
8740 }
8741 default:
8742 llvm_unreachable("unexpected opcode!");
8743 }
8744}
8745
8746SDValue SITargetLowering::promoteUniformOpToI32(SDValue Op,
8747 DAGCombinerInfo &DCI) const {
8748 const unsigned Opc = Op.getOpcode();
8749 assert(Opc == ISD::ADD || Opc == ISD::SUB || Opc == ISD::SHL ||
8750 Opc == ISD::SRL || Opc == ISD::SRA || Opc == ISD::AND ||
8751 Opc == ISD::OR || Opc == ISD::XOR || Opc == ISD::MUL ||
8752 Opc == ISD::SETCC || Opc == ISD::SELECT || Opc == ISD::SMIN ||
8753 Opc == ISD::SMAX || Opc == ISD::UMIN || Opc == ISD::UMAX);
8754
8755 EVT OpTy = (Opc != ISD::SETCC) ? Op.getValueType()
8756 : Op->getOperand(0).getValueType();
8757 auto &DAG = DCI.DAG;
8758 auto ExtTy = OpTy.changeElementType(*DAG.getContext(), MVT::i32);
8759
8760 if (DCI.isBeforeLegalizeOps() ||
8761 isNarrowingProfitable(Op.getNode(), ExtTy, OpTy))
8762 return SDValue();
8763
8764 SDLoc DL(Op);
8765 SDValue LHS;
8766 SDValue RHS;
8767 if (Opc == ISD::SELECT) {
8768 LHS = Op->getOperand(1);
8769 RHS = Op->getOperand(2);
8770 } else {
8771 LHS = Op->getOperand(0);
8772 RHS = Op->getOperand(1);
8773 }
8774
8775 const unsigned ExtOp = getExtOpcodeForPromotedOp(Op);
8776 LHS = DAG.getNode(ExtOp, DL, ExtTy, {LHS});
8777
8778 // Special case: for shifts, the RHS always needs a zext.
8779 if (Opc == ISD::SHL || Opc == ISD::SRL || Opc == ISD::SRA)
8780 RHS = DAG.getNode(ISD::ZERO_EXTEND, DL, ExtTy, {RHS});
8781 else
8782 RHS = DAG.getNode(ExtOp, DL, ExtTy, {RHS});
8783
8784 // setcc always returns i1 (or an i1 vector), so there is no need to truncate after.
8785 if (Opc == ISD::SETCC) {
8786 ISD::CondCode CC = cast<CondCodeSDNode>(Op.getOperand(2))->get();
8787 return DAG.getSetCC(DL, Op.getValueType(), LHS, RHS, CC);
8788 }
8789
8790 // For other ops, we extend the operation's return type as well so we need to
8791 // truncate back to the original type.
8792 SDValue NewVal;
8793 if (Opc == ISD::SELECT)
8794 NewVal = DAG.getNode(ISD::SELECT, DL, ExtTy, {Op->getOperand(0), LHS, RHS});
8795 else
8796 NewVal = DAG.getNode(Opc, DL, ExtTy, {LHS, RHS});
8797
8798 return DAG.getZExtOrTrunc(NewVal, DL, OpTy);
8799}
8800
8801SDValue SITargetLowering::lowerFCOPYSIGN(SDValue Op, SelectionDAG &DAG) const {
8802 SDValue Mag = Op.getOperand(0);
8803 EVT MagVT = Mag.getValueType();
8804
8805 if (MagVT.getVectorNumElements() > 2)
8806 return splitBinaryVectorOp(Op, DAG);
8807
8808 SDValue Sign = Op.getOperand(1);
8809 EVT SignVT = Sign.getValueType();
8810
8811 if (MagVT == SignVT)
8812 return Op;
8813
8814 // fcopysign v2f16:mag, v2f32:sign ->
8815 // fcopysign v2f16:mag, bitcast (trunc (bitcast sign to v2i32) to v2i16)
8816
8817 SDLoc SL(Op);
8818 SDValue SignAsInt32 = DAG.getNode(ISD::BITCAST, SL, MVT::v2i32, Sign);
8819 SDValue SignAsInt16 = DAG.getNode(ISD::TRUNCATE, SL, MVT::v2i16, SignAsInt32);
8820
8821 SDValue SignAsHalf16 = DAG.getNode(ISD::BITCAST, SL, MagVT, SignAsInt16);
8822
8823 return DAG.getNode(ISD::FCOPYSIGN, SL, MagVT, Mag, SignAsHalf16);
8824}
8825
8826// Custom lowering for vector multiplications and s_mul_u64.
8827SDValue SITargetLowering::lowerMUL(SDValue Op, SelectionDAG &DAG) const {
8828 EVT VT = Op.getValueType();
8829
8830 // Split vector operands.
8831 if (VT.isVector())
8832 return splitBinaryVectorOp(Op, DAG);
8833
8834 assert(VT == MVT::i64 && "The following code is a special for s_mul_u64");
8835
8836 // There are four ways to lower s_mul_u64:
8837 //
8838 // 1. If all the operands are uniform, then we lower it as it is.
8839 //
8840 // 2. If the operands are divergent, then we have to split s_mul_u64 into
8841 // 32-bit multiplications because there is no vector equivalent of s_mul_u64.
8842 //
8843 // 3. If the cost model decides that it is more efficient to use vector
8844 // registers, then we have to split s_mul_u64 into 32-bit multiplications.
8845 // This happens in splitScalarSMULU64() in SIInstrInfo.cpp.
8846 //
8847 // 4. If the cost model decides to use vector registers and both of the
8848 // operands are zero-extended/sign-extended from 32 bits, then we split the
8849 // s_mul_u64 into two 32-bit multiplications. The problem is that it is not
8850 // possible to check if the operands are zero-extended or sign-extended in
8851 // SIInstrInfo.cpp. For this reason, here, we replace s_mul_u64 with
8852 // s_mul_u64_u32_pseudo if both operands are zero-extended and we replace
8853 // s_mul_u64 with s_mul_i64_i32_pseudo if both operands are sign-extended.
8854 // If the cost model decides that we have to use vector registers, then
8855 // splitScalarSMulPseudo() (in SIInstrInfo.cpp) splits s_mul_u64_u32/
8856 // s_mul_i64_i32_pseudo into two vector multiplications. If the cost model
8857 // decides that we should use scalar registers, then s_mul_u64_u32_pseudo/
8858 // s_mul_i64_i32_pseudo is lowered as s_mul_u64 in expandPostRAPseudo() in
8859 // SIInstrInfo.cpp.
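// Illustrative example (not from this file): for a uniform i64 multiply
// whose operands are both zero-extended from i32, computeKnownBits reports
// at least 32 leading zero bits on each operand, so the node is rewritten to
// S_MUL_U64_U32_PSEUDO below; the sign-extended analogue is rewritten to
// S_MUL_I64_I32_PSEUDO.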
8860
8861 if (Op->isDivergent())
8862 return SDValue();
8863
8864 SDValue Op0 = Op.getOperand(0);
8865 SDValue Op1 = Op.getOperand(1);
8866 // If both operands are zero-extended from 32 bits, then we replace s_mul_u64
8867 // with s_mul_u64_u32_pseudo. If both operands are sign-extended from
8868 // 32 bits, then we replace s_mul_u64 with s_mul_i64_i32_pseudo.
8869 KnownBits Op0KnownBits = DAG.computeKnownBits(Op0);
8870 unsigned Op0LeadingZeros = Op0KnownBits.countMinLeadingZeros();
8871 KnownBits Op1KnownBits = DAG.computeKnownBits(Op1);
8872 unsigned Op1LeadingZeros = Op1KnownBits.countMinLeadingZeros();
8873 SDLoc SL(Op);
8874 if (Op0LeadingZeros >= 32 && Op1LeadingZeros >= 32)
8875 return SDValue(
8876 DAG.getMachineNode(AMDGPU::S_MUL_U64_U32_PSEUDO, SL, VT, Op0, Op1), 0);
8877 unsigned Op0SignBits = DAG.ComputeNumSignBits(Op0);
8878 unsigned Op1SignBits = DAG.ComputeNumSignBits(Op1);
8879 if (Op0SignBits >= 33 && Op1SignBits >= 33)
8880 return SDValue(
8881 DAG.getMachineNode(AMDGPU::S_MUL_I64_I32_PSEUDO, SL, VT, Op0, Op1), 0);
8882 // If all the operands are uniform, then we lower s_mul_u64 as it is.
8883 return Op;
8884}
8885
8886SDValue SITargetLowering::lowerXMULO(SDValue Op, SelectionDAG &DAG) const {
8887 EVT VT = Op.getValueType();
8888 SDLoc SL(Op);
8889 SDValue LHS = Op.getOperand(0);
8890 SDValue RHS = Op.getOperand(1);
8891 bool isSigned = Op.getOpcode() == ISD::SMULO;
8892
8893 if (ConstantSDNode *RHSC = isConstOrConstSplat(RHS)) {
8894 const APInt &C = RHSC->getAPIntValue();
8895 // mulo(X, 1 << S) -> { X << S, (X << S) >> S != X }
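// Worked example: for i32 umulo(x, 8), S = 3, so
//   Result   = x << 3
//   Overflow = ((x << 3) >> 3) != x   (logical shift right)
// which is true exactly when any of the top 3 bits of x are set, i.e. when
// the product does not fit in 32 bits.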
8896 if (C.isPowerOf2()) {
8897 // smulo(x, signed_min) is same as umulo(x, signed_min).
8898 bool UseArithShift = isSigned && !C.isMinSignedValue();
8899 SDValue ShiftAmt = DAG.getConstant(C.logBase2(), SL, MVT::i32);
8900 SDValue Result = DAG.getNode(ISD::SHL, SL, VT, LHS, ShiftAmt);
8901 SDValue Overflow =
8902 DAG.getSetCC(SL, MVT::i1,
8903 DAG.getNode(UseArithShift ? ISD::SRA : ISD::SRL, SL, VT,
8904 Result, ShiftAmt),
8905 LHS, ISD::SETNE);
8906 return DAG.getMergeValues({Result, Overflow}, SL);
8907 }
8908 }
8909
8910 SDValue Result = DAG.getNode(ISD::MUL, SL, VT, LHS, RHS);
8911 SDValue Top =
8912 DAG.getNode(isSigned ? ISD::MULHS : ISD::MULHU, SL, VT, LHS, RHS);
8913
8914 SDValue Sign = isSigned
8915 ? DAG.getNode(ISD::SRA, SL, VT, Result,
8916 DAG.getConstant(VT.getScalarSizeInBits() - 1,
8917 SL, MVT::i32))
8918 : DAG.getConstant(0, SL, VT);
8919 SDValue Overflow = DAG.getSetCC(SL, MVT::i1, Top, Sign, ISD::SETNE);
8920
8921 return DAG.getMergeValues({Result, Overflow}, SL);
8922}
8923
8924SDValue SITargetLowering::lowerXMUL_LOHI(SDValue Op, SelectionDAG &DAG) const {
8925 if (Op->isDivergent()) {
8926 // Select to V_MAD_[IU]64_[IU]32.
8927 return Op;
8928 }
8929 if (Subtarget->hasSMulHi()) {
8930 // Expand to S_MUL_I32 + S_MUL_HI_[IU]32.
8931 return SDValue();
8932 }
8933 // The multiply is uniform but we would have to use V_MUL_HI_[IU]32 to
8934 // calculate the high part, so we might as well do the whole thing with
8935 // V_MAD_[IU]64_[IU]32.
8936 return Op;
8937}
8938
8939SDValue SITargetLowering::lowerTRAP(SDValue Op, SelectionDAG &DAG) const {
8940 if (!Subtarget->hasTrapHandler() ||
8941 Subtarget->getTrapHandlerAbi() != GCNSubtarget::TrapHandlerAbi::AMDHSA)
8942 return lowerTrapEndpgm(Op, DAG);
8943
8944 return Subtarget->supportsGetDoorbellID() ? lowerTrapHsa(Op, DAG)
8945 : lowerTrapHsaQueuePtr(Op, DAG);
8946}
8947
8948SDValue SITargetLowering::lowerTrapEndpgm(SDValue Op, SelectionDAG &DAG) const {
8949 SDLoc SL(Op);
8950 SDValue Chain = Op.getOperand(0);
8951 return DAG.getNode(AMDGPUISD::ENDPGM_TRAP, SL, MVT::Other, Chain);
8952}
8953
8954SDValue
8955SITargetLowering::loadImplicitKernelArgument(SelectionDAG &DAG, MVT VT,
8956 const SDLoc &DL, Align Alignment,
8957 ImplicitParameter Param) const {
8958 MachineFunction &MF = DAG.getMachineFunction();
8959 uint64_t Offset = getImplicitParameterOffset(MF, Param);
8960 SDValue Ptr = lowerKernArgParameterPtr(DAG, DL, DAG.getEntryNode(), Offset);
8961 MachinePointerInfo PtrInfo =
8963 return DAG.getLoad(
8964 VT, DL, DAG.getEntryNode(), Ptr, PtrInfo.getWithOffset(Offset), Alignment,
8966}
8967
8968SDValue SITargetLowering::lowerTrapHsaQueuePtr(SDValue Op,
8969 SelectionDAG &DAG) const {
8970 SDLoc SL(Op);
8971 SDValue Chain = Op.getOperand(0);
8972
8973 SDValue QueuePtr;
8974 // For code object version 5, QueuePtr is passed through implicit kernarg.
8975 const Module *M = DAG.getMachineFunction().getFunction().getParent();
8977 QueuePtr =
8978 loadImplicitKernelArgument(DAG, MVT::i64, SL, Align(8), QUEUE_PTR);
8979 } else {
8980 MachineFunction &MF = DAG.getMachineFunction();
8981 SIMachineFunctionInfo *Info = MF.getInfo<SIMachineFunctionInfo>();
8982 Register UserSGPR = Info->getQueuePtrUserSGPR();
8983
8984 if (UserSGPR == AMDGPU::NoRegister) {
8985 // We probably are in a function incorrectly marked with
8986 // amdgpu-no-queue-ptr. This is undefined. We don't want to delete the
8987 // trap, so just use a null pointer.
8988 QueuePtr = DAG.getConstant(0, SL, MVT::i64);
8989 } else {
8990 QueuePtr = CreateLiveInRegister(DAG, &AMDGPU::SReg_64RegClass, UserSGPR,
8991 MVT::i64);
8992 }
8993 }
8994
8995 SDValue SGPR01 = DAG.getRegister(AMDGPU::SGPR0_SGPR1, MVT::i64);
8996 SDValue ToReg = DAG.getCopyToReg(Chain, SL, SGPR01, QueuePtr, SDValue());
8997
8998 uint64_t TrapID = static_cast<uint64_t>(GCNSubtarget::TrapID::LLVMAMDHSATrap);
8999 SDValue Ops[] = {ToReg, DAG.getTargetConstant(TrapID, SL, MVT::i16), SGPR01,
9000 ToReg.getValue(1)};
9001 return DAG.getNode(AMDGPUISD::TRAP, SL, MVT::Other, Ops);
9002}
9003
9004SDValue SITargetLowering::lowerTrapHsa(SDValue Op, SelectionDAG &DAG) const {
9005 SDLoc SL(Op);
9006 SDValue Chain = Op.getOperand(0);
9007
9008 // We need to simulate the 's_trap 2' instruction on targets that run in
9009 // PRIV=1 (where it is treated as a nop).
9010 if (Subtarget->hasPrivEnabledTrap2NopBug())
9011 return DAG.getNode(AMDGPUISD::SIMULATED_TRAP, SL, MVT::Other, Chain);
9012
9013 uint64_t TrapID = static_cast<uint64_t>(GCNSubtarget::TrapID::LLVMAMDHSATrap);
9014 SDValue Ops[] = {Chain, DAG.getTargetConstant(TrapID, SL, MVT::i16)};
9015 return DAG.getNode(AMDGPUISD::TRAP, SL, MVT::Other, Ops);
9016}
9017
9018SDValue SITargetLowering::lowerDEBUGTRAP(SDValue Op, SelectionDAG &DAG) const {
9019 SDLoc SL(Op);
9020 SDValue Chain = Op.getOperand(0);
9021 MachineFunction &MF = DAG.getMachineFunction();
9022
9023 if (!Subtarget->hasTrapHandler() ||
9024 Subtarget->getTrapHandlerAbi() != GCNSubtarget::TrapHandlerAbi::AMDHSA) {
9025 LLVMContext &Ctx = MF.getFunction().getContext();
9026 Ctx.diagnose(DiagnosticInfoUnsupported(MF.getFunction(),
9027 "debugtrap handler not supported",
9028 Op.getDebugLoc(), DS_Warning));
9029 return Chain;
9030 }
9031
9032 uint64_t TrapID =
9033 static_cast<uint64_t>(GCNSubtarget::TrapID::LLVMAMDHSADebugTrap);
9034 SDValue Ops[] = {Chain, DAG.getTargetConstant(TrapID, SL, MVT::i16)};
9035 return DAG.getNode(AMDGPUISD::TRAP, SL, MVT::Other, Ops);
9036}
9037
9038/// When a divergent value (in VGPR) is passed to an inline asm with an SGPR
9039/// constraint ('s'), we need to insert v_readfirstlane to move the value from
9040/// VGPR to SGPR. This is done by modifying the CopyToReg nodes in the glue
9041/// chain that feed into the INLINEASM node.
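/// A minimal illustrative case (assumed source, not from the tree):
///   asm volatile("; use %0" :: "s"(divergent_value));
/// Here the CopyToReg feeding the INLINEASM glue chain has its source wrapped
/// in llvm.amdgcn.readfirstlane so that a uniform value reaches the SGPR
/// input register.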
9042SDValue SITargetLowering::LowerINLINEASM(SDValue Op, SelectionDAG &DAG) const {
9043 unsigned NumOps = Op.getNumOperands();
9044
9045 const SIRegisterInfo *TRI = Subtarget->getRegisterInfo();
9046 SmallSet<Register, 8> SGPRInputRegs;
9047
9048 unsigned NumVals = 0;
9049 for (unsigned I = InlineAsm::Op_FirstOperand; I < NumOps - 1;
9050 I += 1 + NumVals) {
9051 const InlineAsm::Flag Flags(Op.getConstantOperandVal(I));
9052 NumVals = Flags.getNumOperandRegisters();
9053
9054 unsigned RCID;
9055 bool IsSGPRInput = Flags.getKind() == InlineAsm::Kind::RegUse &&
9056 NumVals > 0 && Flags.hasRegClassConstraint(RCID) &&
9057 TRI->isSGPRClass(TRI->getRegClass(RCID));
9058
9059 for (unsigned J = 0; J < NumVals; ++J) {
9060 SDValue Val = Op.getOperand(I + 1 + J);
9061 if (const RegisterSDNode *RegNode =
9063 Register Reg = RegNode->getReg();
9064 if (IsSGPRInput || (Reg.isPhysical() && TRI->isSGPRPhysReg(Reg)))
9065 SGPRInputRegs.insert(Reg);
9066 }
9067 }
9068 }
9069
9070 if (SGPRInputRegs.empty())
9071 return Op;
9072
9073 // Walk the glue chain and insert readfirstlane for divergent SGPR inputs.
9074 SDLoc DL(Op);
9075 SDNode *N = Op.getOperand(NumOps - 1).getNode();
9076
9077 while (N && N->getOpcode() == ISD::CopyToReg) {
9078 Register Reg = cast<RegisterSDNode>(N->getOperand(1))->getReg();
9079 SDValue SrcVal = N->getOperand(2);
9080
9081 // Insert readfirstlane if copying a divergent value to an SGPR input.
9082 if (SrcVal->isDivergent() && SGPRInputRegs.count(Reg)) {
9083 SDValue ReadFirstLaneID =
9084 DAG.getTargetConstant(Intrinsic::amdgcn_readfirstlane, DL, MVT::i32);
9085 SDValue ReadFirstLane =
9087 ReadFirstLaneID, SrcVal);
9088
9089 SmallVector<SDValue, 4> Ops = {N->getOperand(0), N->getOperand(1),
9090 ReadFirstLane};
9091 if (N->getNumOperands() > 3)
9092 Ops.push_back(N->getOperand(3)); // Glue input
9093
9094 DAG.UpdateNodeOperands(N, Ops);
9095 }
9096
9097 // Follow glue chain to next CopyToReg.
9098 SDNode *Next = nullptr;
9099 for (unsigned I = 0, E = N->getNumOperands(); I != E; ++I) {
9100 if (N->getOperand(I).getValueType() == MVT::Glue) {
9101 Next = N->getOperand(I).getNode();
9102 break;
9103 }
9104 }
9105 N = Next;
9106 }
9107
9108 return Op;
9109}
9110
9111SDValue SITargetLowering::getSegmentAperture(unsigned AS, const SDLoc &DL,
9112 SelectionDAG &DAG) const {
9113 if (Subtarget->hasApertureRegs()) {
9114 const unsigned ApertureRegNo = (AS == AMDGPUAS::LOCAL_ADDRESS)
9115 ? AMDGPU::SRC_SHARED_BASE
9116 : AMDGPU::SRC_PRIVATE_BASE;
9117 assert((ApertureRegNo != AMDGPU::SRC_PRIVATE_BASE ||
9118 !Subtarget->hasGloballyAddressableScratch()) &&
9119 "Cannot use src_private_base with globally addressable scratch!");
9120 // Note: this feature (register) is broken. When used as a 32-bit operand,
9121 // it returns a wrong value (all zeroes?). The real value is in the upper 32
9122 // bits.
9123 //
9124 // To work around the issue, emit a 64 bit copy from this register
9125 // then extract the high bits. Note that this shouldn't even result in a
9126 // shift being emitted and simply become a pair of registers (e.g.):
9127 // s_mov_b64 s[6:7], src_shared_base
9128 // v_mov_b32_e32 v1, s7
9129 SDValue Copy =
9130 DAG.getCopyFromReg(DAG.getEntryNode(), DL, ApertureRegNo, MVT::v2i32);
9131 return DAG.getExtractVectorElt(DL, MVT::i32, Copy, 1);
9132 }
9133
9134 // For code object version 5, private_base and shared_base are passed through
9135 // implicit kernargs.
9136 const Module *M = DAG.getMachineFunction().getFunction().getParent();
9140 return loadImplicitKernelArgument(DAG, MVT::i32, DL, Align(4), Param);
9141 }
9142
9143 MachineFunction &MF = DAG.getMachineFunction();
9144 SIMachineFunctionInfo *Info = MF.getInfo<SIMachineFunctionInfo>();
9145 Register UserSGPR = Info->getQueuePtrUserSGPR();
9146 if (UserSGPR == AMDGPU::NoRegister) {
9147 // We probably are in a function incorrectly marked with
9148 // amdgpu-no-queue-ptr. This is undefined.
9149 return DAG.getPOISON(MVT::i32);
9150 }
9151
9152 SDValue QueuePtr =
9153 CreateLiveInRegister(DAG, &AMDGPU::SReg_64RegClass, UserSGPR, MVT::i64);
9154
9155 // Offset into amd_queue_t for group_segment_aperture_base_hi /
9156 // private_segment_aperture_base_hi.
9157 uint32_t StructOffset = (AS == AMDGPUAS::LOCAL_ADDRESS) ? 0x40 : 0x44;
9158
9159 SDValue Ptr =
9160 DAG.getObjectPtrOffset(DL, QueuePtr, TypeSize::getFixed(StructOffset));
9161
9162 // TODO: Use custom target PseudoSourceValue.
9163 // TODO: We should use the value from the IR intrinsic call, but it might not
9164 // be available, and how would we get it?
9165 MachinePointerInfo PtrInfo(AMDGPUAS::CONSTANT_ADDRESS);
9166 return DAG.getLoad(MVT::i32, DL, QueuePtr.getValue(1), Ptr, PtrInfo,
9167 commonAlignment(Align(64), StructOffset),
9170}
9171
9172/// Return true if the value is a known valid address, such that a null check is
9173/// not necessary.
9175 const AMDGPUTargetMachine &TM, unsigned AddrSpace) {
9177 return true;
9178
9179 if (auto *ConstVal = dyn_cast<ConstantSDNode>(Val))
9180 return ConstVal->getSExtValue() != AMDGPU::getNullPointerValue(AddrSpace);
9181
9182 // TODO: Search through arithmetic, handle arguments and loads
9183 // marked nonnull.
9184 return false;
9185}
9186
9187SDValue SITargetLowering::lowerADDRSPACECAST(SDValue Op,
9188 SelectionDAG &DAG) const {
9189 SDLoc SL(Op);
9190
9191 const AMDGPUTargetMachine &TM =
9192 static_cast<const AMDGPUTargetMachine &>(getTargetMachine());
9193
9194 unsigned DestAS, SrcAS;
9195 SDValue Src;
9196 bool IsNonNull = false;
9197 if (const auto *ASC = dyn_cast<AddrSpaceCastSDNode>(Op)) {
9198 SrcAS = ASC->getSrcAddressSpace();
9199 Src = ASC->getOperand(0);
9200 DestAS = ASC->getDestAddressSpace();
9201 } else {
9202 assert(Op.getOpcode() == ISD::INTRINSIC_WO_CHAIN &&
9203 Op.getConstantOperandVal(0) ==
9204 Intrinsic::amdgcn_addrspacecast_nonnull);
9205 Src = Op->getOperand(1);
9206 SrcAS = Op->getConstantOperandVal(2);
9207 DestAS = Op->getConstantOperandVal(3);
9208 IsNonNull = true;
9209 }
9210
9211 SDValue FlatNullPtr = DAG.getConstant(0, SL, MVT::i64);
9212
9213 // flat -> local/private
9214 if (SrcAS == AMDGPUAS::FLAT_ADDRESS) {
9215 if (DestAS == AMDGPUAS::LOCAL_ADDRESS ||
9216 DestAS == AMDGPUAS::PRIVATE_ADDRESS) {
9217 SDValue Ptr = DAG.getNode(ISD::TRUNCATE, SL, MVT::i32, Src);
9218
9219 if (DestAS == AMDGPUAS::PRIVATE_ADDRESS &&
9220 Subtarget->hasGloballyAddressableScratch()) {
9221 // flat -> private with globally addressable scratch: subtract
9222 // src_flat_scratch_base_lo.
9223 SDValue FlatScratchBaseLo(
9224 DAG.getMachineNode(
9225 AMDGPU::S_MOV_B32, SL, MVT::i32,
9226 DAG.getRegister(AMDGPU::SRC_FLAT_SCRATCH_BASE_LO, MVT::i32)),
9227 0);
9228 Ptr = DAG.getNode(ISD::SUB, SL, MVT::i32, Ptr, FlatScratchBaseLo);
9229 }
9230
9231 if (IsNonNull || isKnownNonNull(Op, DAG, TM, SrcAS))
9232 return Ptr;
9233
9234 unsigned NullVal = AMDGPU::getNullPointerValue(DestAS);
9235 SDValue SegmentNullPtr = DAG.getConstant(NullVal, SL, MVT::i32);
9236 SDValue NonNull = DAG.getSetCC(SL, MVT::i1, Src, FlatNullPtr, ISD::SETNE);
9237
9238 return DAG.getNode(ISD::SELECT, SL, MVT::i32, NonNull, Ptr,
9239 SegmentNullPtr);
9240 }
9241 }
9242
9243 // local/private -> flat
9244 if (DestAS == AMDGPUAS::FLAT_ADDRESS) {
9245 if (SrcAS == AMDGPUAS::LOCAL_ADDRESS ||
9246 SrcAS == AMDGPUAS::PRIVATE_ADDRESS) {
9247 SDValue CvtPtr;
9248 if (SrcAS == AMDGPUAS::PRIVATE_ADDRESS &&
9249 Subtarget->hasGloballyAddressableScratch()) {
9250 // For wave32: Addr = (TID[4:0] << 52) + FLAT_SCRATCH_BASE + privateAddr
9251 // For wave64: Addr = (TID[5:0] << 51) + FLAT_SCRATCH_BASE + privateAddr
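// The high word of the address is therefore built below by shifting the lane
// id left by 57 - 32 - log2(wavefront size): 20 bits for wave32 (52 - 32) and
// 19 bits for wave64 (51 - 32).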
9252 SDValue AllOnes = DAG.getSignedTargetConstant(-1, SL, MVT::i32);
9253 SDValue ThreadID = DAG.getConstant(0, SL, MVT::i32);
9254 ThreadID = DAG.getNode(
9255 ISD::INTRINSIC_WO_CHAIN, SL, MVT::i32,
9256 DAG.getTargetConstant(Intrinsic::amdgcn_mbcnt_lo, SL, MVT::i32),
9257 AllOnes, ThreadID);
9258 if (Subtarget->isWave64())
9259 ThreadID = DAG.getNode(
9260 ISD::INTRINSIC_WO_CHAIN, SL, MVT::i32,
9261 DAG.getTargetConstant(Intrinsic::amdgcn_mbcnt_hi, SL, MVT::i32),
9262 AllOnes, ThreadID);
9263 SDValue ShAmt = DAG.getShiftAmountConstant(
9264 57 - 32 - Subtarget->getWavefrontSizeLog2(), MVT::i32, SL);
9265 SDValue SrcHi = DAG.getNode(ISD::SHL, SL, MVT::i32, ThreadID, ShAmt);
9266 CvtPtr = DAG.getNode(ISD::BUILD_VECTOR, SL, MVT::v2i32, Src, SrcHi);
9267 CvtPtr = DAG.getNode(ISD::BITCAST, SL, MVT::i64, CvtPtr);
9268 // Accessing src_flat_scratch_base_lo as a 64-bit operand gives the full
9269 // 64-bit hi:lo value.
9270 SDValue FlatScratchBase = {
9271 DAG.getMachineNode(
9272 AMDGPU::S_MOV_B64, SL, MVT::i64,
9273 DAG.getRegister(AMDGPU::SRC_FLAT_SCRATCH_BASE, MVT::i64)),
9274 0};
9275 CvtPtr = DAG.getNode(ISD::ADD, SL, MVT::i64, CvtPtr, FlatScratchBase);
9276 } else {
9277 SDValue Aperture = getSegmentAperture(SrcAS, SL, DAG);
9278 CvtPtr = DAG.getNode(ISD::BUILD_VECTOR, SL, MVT::v2i32, Src, Aperture);
9279 CvtPtr = DAG.getNode(ISD::BITCAST, SL, MVT::i64, CvtPtr);
9280 }
9281
9282 if (IsNonNull || isKnownNonNull(Op, DAG, TM, SrcAS))
9283 return CvtPtr;
9284
9285 unsigned NullVal = AMDGPU::getNullPointerValue(SrcAS);
9286 SDValue SegmentNullPtr = DAG.getConstant(NullVal, SL, MVT::i32);
9287
9288 SDValue NonNull =
9289 DAG.getSetCC(SL, MVT::i1, Src, SegmentNullPtr, ISD::SETNE);
9290
9291 return DAG.getNode(ISD::SELECT, SL, MVT::i64, NonNull, CvtPtr,
9292 FlatNullPtr);
9293 }
9294 }
9295
9296 if (SrcAS == AMDGPUAS::CONSTANT_ADDRESS_32BIT &&
9297 Op.getValueType() == MVT::i64) {
9298 const SIMachineFunctionInfo *Info =
9299 DAG.getMachineFunction().getInfo<SIMachineFunctionInfo>();
9300 if (Info->get32BitAddressHighBits() == 0)
9301 return DAG.getNode(ISD::ZERO_EXTEND, SL, MVT::i64, Src);
9302
9303 SDValue Hi = DAG.getConstant(Info->get32BitAddressHighBits(), SL, MVT::i32);
9304 SDValue Vec = DAG.getNode(ISD::BUILD_VECTOR, SL, MVT::v2i32, Src, Hi);
9305 return DAG.getNode(ISD::BITCAST, SL, MVT::i64, Vec);
9306 }
9307
9308 if (DestAS == AMDGPUAS::CONSTANT_ADDRESS_32BIT &&
9309 Src.getValueType() == MVT::i64)
9310 return DAG.getNode(ISD::TRUNCATE, SL, MVT::i32, Src);
9311
9312 // global <-> flat are no-ops and never emitted.
9313
9314 // Invalid casts are poison.
9315 return DAG.getPOISON(Op->getValueType(0));
9316}
9317
9318// This lowers an INSERT_SUBVECTOR by extracting the individual elements from
9319// the small vector and inserting them into the big vector. That is better than
9320// the default expansion of doing it via a stack slot. Even though the use of
9321// the stack slot would be optimized away afterwards, the stack slot itself
9322// remains.
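// Illustrative example (assumed types): inserting a v2i16 subvector into a
// v8i16 vector at even index 4 takes the 16-bit path below and becomes a
// single 32-bit element insert into the bitcast v4i32 vector; other cases
// fall back to per-element extract/insert pairs.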
9323SDValue SITargetLowering::lowerINSERT_SUBVECTOR(SDValue Op,
9324 SelectionDAG &DAG) const {
9325 SDValue Vec = Op.getOperand(0);
9326 SDValue Ins = Op.getOperand(1);
9327 SDValue Idx = Op.getOperand(2);
9328 EVT VecVT = Vec.getValueType();
9329 EVT InsVT = Ins.getValueType();
9330 EVT EltVT = VecVT.getVectorElementType();
9331 unsigned InsNumElts = InsVT.getVectorNumElements();
9332 unsigned IdxVal = Idx->getAsZExtVal();
9333 SDLoc SL(Op);
9334
9335 if (EltVT.getScalarSizeInBits() == 16 && IdxVal % 2 == 0) {
9336 // Insert 32-bit registers at a time.
9337 assert(InsNumElts % 2 == 0 && "expect legal vector types");
9338
9339 unsigned VecNumElts = VecVT.getVectorNumElements();
9340 EVT NewVecVT =
9341 EVT::getVectorVT(*DAG.getContext(), MVT::i32, VecNumElts / 2);
9342 EVT NewInsVT = InsNumElts == 2 ? MVT::i32
9344 MVT::i32, InsNumElts / 2);
9345
9346 Vec = DAG.getNode(ISD::BITCAST, SL, NewVecVT, Vec);
9347 Ins = DAG.getNode(ISD::BITCAST, SL, NewInsVT, Ins);
9348
9349 for (unsigned I = 0; I != InsNumElts / 2; ++I) {
9350 SDValue Elt;
9351 if (InsNumElts == 2) {
9352 Elt = Ins;
9353 } else {
9354 Elt = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SL, MVT::i32, Ins,
9355 DAG.getConstant(I, SL, MVT::i32));
9356 }
9357 Vec = DAG.getNode(ISD::INSERT_VECTOR_ELT, SL, NewVecVT, Vec, Elt,
9358 DAG.getConstant(IdxVal / 2 + I, SL, MVT::i32));
9359 }
9360
9361 return DAG.getNode(ISD::BITCAST, SL, VecVT, Vec);
9362 }
9363
9364 for (unsigned I = 0; I != InsNumElts; ++I) {
9365 SDValue Elt = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SL, EltVT, Ins,
9366 DAG.getConstant(I, SL, MVT::i32));
9367 Vec = DAG.getNode(ISD::INSERT_VECTOR_ELT, SL, VecVT, Vec, Elt,
9368 DAG.getConstant(IdxVal + I, SL, MVT::i32));
9369 }
9370 return Vec;
9371}
9372
9373SDValue SITargetLowering::lowerINSERT_VECTOR_ELT(SDValue Op,
9374 SelectionDAG &DAG) const {
9375 SDValue Vec = Op.getOperand(0);
9376 SDValue InsVal = Op.getOperand(1);
9377 SDValue Idx = Op.getOperand(2);
9378 EVT VecVT = Vec.getValueType();
9379 EVT EltVT = VecVT.getVectorElementType();
9380 unsigned VecSize = VecVT.getSizeInBits();
9381 unsigned EltSize = EltVT.getSizeInBits();
9382 SDLoc SL(Op);
9383
9384 // Specially handle the case of v4i16 with static indexing.
9385 unsigned NumElts = VecVT.getVectorNumElements();
9386 auto *KIdx = dyn_cast<ConstantSDNode>(Idx);
9387 if (NumElts == 4 && EltSize == 16 && KIdx) {
9388 SDValue BCVec = DAG.getNode(ISD::BITCAST, SL, MVT::v2i32, Vec);
9389
9390 SDValue LoHalf = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SL, MVT::i32, BCVec,
9391 DAG.getConstant(0, SL, MVT::i32));
9392 SDValue HiHalf = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SL, MVT::i32, BCVec,
9393 DAG.getConstant(1, SL, MVT::i32));
9394
9395 SDValue LoVec = DAG.getNode(ISD::BITCAST, SL, MVT::v2i16, LoHalf);
9396 SDValue HiVec = DAG.getNode(ISD::BITCAST, SL, MVT::v2i16, HiHalf);
9397
9398 unsigned Idx = KIdx->getZExtValue();
9399 bool InsertLo = Idx < 2;
9400 SDValue InsHalf = DAG.getNode(
9401 ISD::INSERT_VECTOR_ELT, SL, MVT::v2i16, InsertLo ? LoVec : HiVec,
9402 DAG.getNode(ISD::BITCAST, SL, MVT::i16, InsVal),
9403 DAG.getConstant(InsertLo ? Idx : (Idx - 2), SL, MVT::i32));
9404
9405 InsHalf = DAG.getNode(ISD::BITCAST, SL, MVT::i32, InsHalf);
9406
9407 SDValue Concat =
9408 InsertLo ? DAG.getBuildVector(MVT::v2i32, SL, {InsHalf, HiHalf})
9409 : DAG.getBuildVector(MVT::v2i32, SL, {LoHalf, InsHalf});
9410
9411 return DAG.getNode(ISD::BITCAST, SL, VecVT, Concat);
9412 }
9413
9414 // Static indexing does not lower to stack access, and hence there is no need
9415 // for special custom lowering to avoid stack access.
9416 if (isa<ConstantSDNode>(Idx))
9417 return SDValue();
9418
9419 // Avoid stack access for dynamic indexing by custom lowering to
9420 // v_bfi_b32 (v_bfm_b32 16, (shl idx, 16)), val, vec
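// For example, a dynamic insert into v4i16 at runtime index 2 gives
// ScaledIdx = 32 and BFM = 0xffff << 32, so only bits [47:32] of the bitcast
// i64 vector are replaced by the corresponding bits of the splatted value.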
9421
9422 assert(VecSize <= 64 && "Expected target vector size to be <= 64 bits");
9423
9424 MVT IntVT = MVT::getIntegerVT(VecSize);
9425
9426 // Convert vector index to bit-index and get the required bit mask.
9427 assert(isPowerOf2_32(EltSize));
9428 const auto EltMask = maskTrailingOnes<uint64_t>(EltSize);
9429 SDValue ScaleFactor = DAG.getConstant(Log2_32(EltSize), SL, MVT::i32);
9430 SDValue ScaledIdx = DAG.getNode(ISD::SHL, SL, MVT::i32, Idx, ScaleFactor);
9431 SDValue BFM = DAG.getNode(ISD::SHL, SL, IntVT,
9432 DAG.getConstant(EltMask, SL, IntVT), ScaledIdx);
9433
9434 // 1. Create a congruent vector with the target value in each element.
9435 SDValue ExtVal = DAG.getNode(ISD::BITCAST, SL, IntVT,
9436 DAG.getSplatBuildVector(VecVT, SL, InsVal));
9437
9438 // 2. Mask off all other indices except the required index within (1).
9439 SDValue LHS = DAG.getNode(ISD::AND, SL, IntVT, BFM, ExtVal);
9440
9441 // 3. Mask off the required index within the target vector.
9442 SDValue BCVec = DAG.getNode(ISD::BITCAST, SL, IntVT, Vec);
9443 SDValue RHS =
9444 DAG.getNode(ISD::AND, SL, IntVT, DAG.getNOT(SL, BFM, IntVT), BCVec);
9445
9446 // 4. Get (2) and (3) ORed into the target vector.
9447 SDValue BFI =
9448 DAG.getNode(ISD::OR, SL, IntVT, LHS, RHS, SDNodeFlags::Disjoint);
9449
9450 return DAG.getNode(ISD::BITCAST, SL, VecVT, BFI);
9451}
9452
9453SDValue SITargetLowering::lowerEXTRACT_VECTOR_ELT(SDValue Op,
9454 SelectionDAG &DAG) const {
9455 SDLoc SL(Op);
9456
9457 EVT ResultVT = Op.getValueType();
9458 SDValue Vec = Op.getOperand(0);
9459 SDValue Idx = Op.getOperand(1);
9460 EVT VecVT = Vec.getValueType();
9461 unsigned VecSize = VecVT.getSizeInBits();
9462 EVT EltVT = VecVT.getVectorElementType();
9463
9464 DAGCombinerInfo DCI(DAG, AfterLegalizeVectorOps, true, nullptr);
9465
9466 // Make sure we do any optimizations that will make it easier to fold
9467 // source modifiers before obscuring it with bit operations.
9468
9469 // XXX - Why doesn't this get called when vector_shuffle is expanded?
9470 if (SDValue Combined = performExtractVectorEltCombine(Op.getNode(), DCI))
9471 return Combined;
9472
9473 if (VecSize == 128 || VecSize == 256 || VecSize == 512) {
9474 SDValue Lo, Hi;
9475 auto [LoVT, HiVT] = DAG.GetSplitDestVTs(VecVT);
9476
9477 if (VecSize == 128) {
9478 SDValue V2 = DAG.getBitcast(MVT::v2i64, Vec);
9479 Lo = DAG.getBitcast(LoVT,
9480 DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SL, MVT::i64, V2,
9481 DAG.getConstant(0, SL, MVT::i32)));
9482 Hi = DAG.getBitcast(HiVT,
9483 DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SL, MVT::i64, V2,
9484 DAG.getConstant(1, SL, MVT::i32)));
9485 } else if (VecSize == 256) {
9486 SDValue V2 = DAG.getBitcast(MVT::v4i64, Vec);
9487 SDValue Parts[4];
9488 for (unsigned P = 0; P < 4; ++P) {
9489 Parts[P] = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SL, MVT::i64, V2,
9490 DAG.getConstant(P, SL, MVT::i32));
9491 }
9492
9493 Lo = DAG.getBitcast(LoVT, DAG.getNode(ISD::BUILD_VECTOR, SL, MVT::v2i64,
9494 Parts[0], Parts[1]));
9495 Hi = DAG.getBitcast(HiVT, DAG.getNode(ISD::BUILD_VECTOR, SL, MVT::v2i64,
9496 Parts[2], Parts[3]));
9497 } else {
9498 assert(VecSize == 512);
9499
9500 SDValue V2 = DAG.getBitcast(MVT::v8i64, Vec);
9501 SDValue Parts[8];
9502 for (unsigned P = 0; P < 8; ++P) {
9503 Parts[P] = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SL, MVT::i64, V2,
9504 DAG.getConstant(P, SL, MVT::i32));
9505 }
9506
9507 Lo = DAG.getBitcast(LoVT,
9508 DAG.getNode(ISD::BUILD_VECTOR, SL, MVT::v4i64,
9509 Parts[0], Parts[1], Parts[2], Parts[3]));
9510 Hi = DAG.getBitcast(HiVT,
9511 DAG.getNode(ISD::BUILD_VECTOR, SL, MVT::v4i64,
9512 Parts[4], Parts[5], Parts[6], Parts[7]));
9513 }
9514
9515 EVT IdxVT = Idx.getValueType();
9516 unsigned NElem = VecVT.getVectorNumElements();
9517 assert(isPowerOf2_32(NElem));
9518 SDValue IdxMask = DAG.getConstant(NElem / 2 - 1, SL, IdxVT);
9519 SDValue NewIdx = DAG.getNode(ISD::AND, SL, IdxVT, Idx, IdxMask);
9520 SDValue Half = DAG.getSelectCC(SL, Idx, IdxMask, Hi, Lo, ISD::SETUGT);
9521 return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SL, EltVT, Half, NewIdx);
9522 }
9523
9524 assert(VecSize <= 64);
9525
9526 MVT IntVT = MVT::getIntegerVT(VecSize);
9527
9528 // If Vec is just a SCALAR_TO_VECTOR, then use the scalar integer directly.
9529 SDValue VecBC = peekThroughBitcasts(Vec);
9530 if (VecBC.getOpcode() == ISD::SCALAR_TO_VECTOR) {
9531 SDValue Src = VecBC.getOperand(0);
9532 Src = DAG.getBitcast(Src.getValueType().changeTypeToInteger(), Src);
9533 Vec = DAG.getAnyExtOrTrunc(Src, SL, IntVT);
9534 }
9535
9536 unsigned EltSize = EltVT.getSizeInBits();
9537 assert(isPowerOf2_32(EltSize));
9538
9539 SDValue ScaleFactor = DAG.getConstant(Log2_32(EltSize), SL, MVT::i32);
9540
9541 // Convert vector index to bit-index (* EltSize)
9542 SDValue ScaledIdx = DAG.getNode(ISD::SHL, SL, MVT::i32, Idx, ScaleFactor);
9543
9544 SDValue BC = DAG.getNode(ISD::BITCAST, SL, IntVT, Vec);
9545 SDValue Elt = DAG.getNode(ISD::SRL, SL, IntVT, BC, ScaledIdx);
9546
9547 if (ResultVT == MVT::f16 || ResultVT == MVT::bf16) {
9548 SDValue Result = DAG.getNode(ISD::TRUNCATE, SL, MVT::i16, Elt);
9549 return DAG.getNode(ISD::BITCAST, SL, ResultVT, Result);
9550 }
9551
9552 return DAG.getAnyExtOrTrunc(Elt, SL, ResultVT);
9553}
9554
9555static bool elementPairIsContiguous(ArrayRef<int> Mask, int Elt) {
9556 assert(Elt % 2 == 0);
9557 return Mask[Elt + 1] == Mask[Elt] + 1 && (Mask[Elt] % 2 == 0);
9558}
9559
9560static bool elementPairIsOddToEven(ArrayRef<int> Mask, int Elt) {
9561 assert(Elt % 2 == 0);
9562 return Mask[Elt] >= 0 && Mask[Elt + 1] >= 0 && (Mask[Elt] & 1) &&
9563 !(Mask[Elt + 1] & 1);
9564}
9565
9566SDValue SITargetLowering::lowerVECTOR_SHUFFLE(SDValue Op,
9567 SelectionDAG &DAG) const {
9568 SDLoc SL(Op);
9569 EVT ResultVT = Op.getValueType();
9570 ShuffleVectorSDNode *SVN = cast<ShuffleVectorSDNode>(Op);
9571 MVT EltVT = ResultVT.getVectorElementType().getSimpleVT();
9572 const int NewSrcNumElts = 2;
9573 MVT PackVT = MVT::getVectorVT(EltVT, NewSrcNumElts);
9574 int SrcNumElts = Op.getOperand(0).getValueType().getVectorNumElements();
9575
9576 // Break up the shuffle into registers sized pieces.
9577 //
9578 // We're trying to form sub-shuffles that the register allocation pipeline
9579 // won't be able to figure out, like how to use v_pk_mov_b32 to do a register
9580 // blend or 16-bit op_sel. It should be able to figure out how to reassemble a
9581 // pair of copies into a consecutive register copy, so use the ordinary
9582 // extract_vector_elt lowering unless we can use the shuffle.
9583 //
9584 // TODO: This is a bit of a hack, and we should probably always use
9585 // extract_subvector for the largest possible subvector we can (or at least
9586 // use it for PackVT-aligned pieces). However, we have worse support for
9587 // combines on them and don't directly treat extract_subvector /
9588 // insert_subvector as legal. The DAG scheduler also ends up doing a worse
9589 // job with the extract_subvectors.
9590 const bool ShouldUseConsecutiveExtract = EltVT.getSizeInBits() == 16;
9591
9592 // vector_shuffle <0,1,6,7> lhs, rhs
9593 // -> concat_vectors (extract_subvector lhs, 0), (extract_subvector rhs, 2)
9594 //
9595 // vector_shuffle <6,7,2,3> lhs, rhs
9596 // -> concat_vectors (extract_subvector rhs, 2), (extract_subvector lhs, 2)
9597 //
9598 // vector_shuffle <6,7,0,1> lhs, rhs
9599 // -> concat_vectors (extract_subvector rhs, 2), (extract_subvector lhs, 0)
9600
9601 // Avoid scalarizing when both halves are reading from consecutive elements.
9602
9603 // If we're treating 2 element shuffles as legal, also create odd-to-even
9604 // shuffles of neighboring pairs.
9605 //
9606 // vector_shuffle <3,2,7,6> lhs, rhs
9607 // -> concat_vectors vector_shuffle <1, 0> (extract_subvector lhs, 0)
9608 // vector_shuffle <1, 0> (extract_subvector rhs, 2)
9609
9611 for (int I = 0, N = ResultVT.getVectorNumElements(); I != N; I += 2) {
9612 if (ShouldUseConsecutiveExtract &&
9614 const int Idx = SVN->getMaskElt(I);
9615 int VecIdx = Idx < SrcNumElts ? 0 : 1;
9616 int EltIdx = Idx < SrcNumElts ? Idx : Idx - SrcNumElts;
9617 SDValue SubVec = DAG.getNode(ISD::EXTRACT_SUBVECTOR, SL, PackVT,
9618 SVN->getOperand(VecIdx),
9619 DAG.getConstant(EltIdx, SL, MVT::i32));
9620 Pieces.push_back(SubVec);
9621 } else if (elementPairIsOddToEven(SVN->getMask(), I) &&
9623 int Idx0 = SVN->getMaskElt(I);
9624 int Idx1 = SVN->getMaskElt(I + 1);
9625
9626 SDValue SrcOp0 = SVN->getOperand(0);
9627 SDValue SrcOp1 = SrcOp0;
9628 if (Idx0 >= SrcNumElts) {
9629 SrcOp0 = SVN->getOperand(1);
9630 Idx0 -= SrcNumElts;
9631 }
9632
9633 if (Idx1 >= SrcNumElts) {
9634 SrcOp1 = SVN->getOperand(1);
9635 Idx1 -= SrcNumElts;
9636 }
9637
9638 int AlignedIdx0 = Idx0 & ~(NewSrcNumElts - 1);
9639 int AlignedIdx1 = Idx1 & ~(NewSrcNumElts - 1);
9640
9641 // Extract nearest even aligned piece.
9642 SDValue SubVec0 = DAG.getNode(ISD::EXTRACT_SUBVECTOR, SL, PackVT, SrcOp0,
9643 DAG.getConstant(AlignedIdx0, SL, MVT::i32));
9644 SDValue SubVec1 = DAG.getNode(ISD::EXTRACT_SUBVECTOR, SL, PackVT, SrcOp1,
9645 DAG.getConstant(AlignedIdx1, SL, MVT::i32));
9646
9647 int NewMaskIdx0 = Idx0 - AlignedIdx0;
9648 int NewMaskIdx1 = Idx1 - AlignedIdx1;
9649
9650 SDValue Result0 = SubVec0;
9651 SDValue Result1 = SubVec0;
9652
9653 if (SubVec0 != SubVec1) {
9654 NewMaskIdx1 += NewSrcNumElts;
9655 Result1 = SubVec1;
9656 } else {
9657 Result1 = DAG.getPOISON(PackVT);
9658 }
9659
9660 SDValue Shuf = DAG.getVectorShuffle(PackVT, SL, Result0, Result1,
9661 {NewMaskIdx0, NewMaskIdx1});
9662 Pieces.push_back(Shuf);
9663 } else {
9664 const int Idx0 = SVN->getMaskElt(I);
9665 const int Idx1 = SVN->getMaskElt(I + 1);
9666 int VecIdx0 = Idx0 < SrcNumElts ? 0 : 1;
9667 int VecIdx1 = Idx1 < SrcNumElts ? 0 : 1;
9668 int EltIdx0 = Idx0 < SrcNumElts ? Idx0 : Idx0 - SrcNumElts;
9669 int EltIdx1 = Idx1 < SrcNumElts ? Idx1 : Idx1 - SrcNumElts;
9670
9671 SDValue Vec0 = SVN->getOperand(VecIdx0);
9672 SDValue Elt0 = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SL, EltVT, Vec0,
9673 DAG.getSignedConstant(EltIdx0, SL, MVT::i32));
9674
9675 SDValue Vec1 = SVN->getOperand(VecIdx1);
9676 SDValue Elt1 = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SL, EltVT, Vec1,
9677 DAG.getSignedConstant(EltIdx1, SL, MVT::i32));
9678 Pieces.push_back(DAG.getBuildVector(PackVT, SL, {Elt0, Elt1}));
9679 }
9680 }
9681
9682 return DAG.getNode(ISD::CONCAT_VECTORS, SL, ResultVT, Pieces);
9683}
9684
9685SDValue SITargetLowering::lowerSCALAR_TO_VECTOR(SDValue Op,
9686 SelectionDAG &DAG) const {
9687 SDValue SVal = Op.getOperand(0);
9688 EVT ResultVT = Op.getValueType();
9689 EVT SValVT = SVal.getValueType();
9690 SDValue UndefVal = DAG.getPOISON(SValVT);
9691 SDLoc SL(Op);
9692
9694 VElts.push_back(SVal);
9695 for (int I = 1, E = ResultVT.getVectorNumElements(); I < E; ++I)
9696 VElts.push_back(UndefVal);
9697
9698 return DAG.getBuildVector(ResultVT, SL, VElts);
9699}
9700
9701SDValue SITargetLowering::lowerBUILD_VECTOR(SDValue Op,
9702 SelectionDAG &DAG) const {
9703 SDLoc SL(Op);
9704 EVT VT = Op.getValueType();
9705
9706 if (VT == MVT::v2f16 || VT == MVT::v2i16 || VT == MVT::v2bf16) {
9707 assert(!Subtarget->hasVOP3PInsts() && "this should be legal");
9708
9709 SDValue Lo = Op.getOperand(0);
9710 SDValue Hi = Op.getOperand(1);
9711
9712 // Avoid adding defined bits with the zero_extend.
9713 if (Hi.isUndef()) {
9714 Lo = DAG.getNode(ISD::BITCAST, SL, MVT::i16, Lo);
9715 SDValue ExtLo = DAG.getNode(ISD::ANY_EXTEND, SL, MVT::i32, Lo);
9716 return DAG.getNode(ISD::BITCAST, SL, VT, ExtLo);
9717 }
9718
9719 Hi = DAG.getNode(ISD::BITCAST, SL, MVT::i16, Hi);
9720 Hi = DAG.getNode(ISD::ZERO_EXTEND, SL, MVT::i32, Hi);
9721
9722 SDValue ShlHi = DAG.getNode(ISD::SHL, SL, MVT::i32, Hi,
9723 DAG.getConstant(16, SL, MVT::i32));
9724 if (Lo.isUndef())
9725 return DAG.getNode(ISD::BITCAST, SL, VT, ShlHi);
9726
9727 Lo = DAG.getNode(ISD::BITCAST, SL, MVT::i16, Lo);
9728 Lo = DAG.getNode(ISD::ZERO_EXTEND, SL, MVT::i32, Lo);
9729
9730 SDValue Or =
9731 DAG.getNode(ISD::OR, SL, MVT::i32, Lo, ShlHi, SDNodeFlags::Disjoint);
9732 return DAG.getNode(ISD::BITCAST, SL, VT, Or);
9733 }
9734
9735 // Split into 2-element chunks.
9736 const unsigned NumParts = VT.getVectorNumElements() / 2;
9737 EVT PartVT = MVT::getVectorVT(VT.getVectorElementType().getSimpleVT(), 2);
9738 MVT PartIntVT = MVT::getIntegerVT(PartVT.getSizeInBits());
9739
9741 for (unsigned P = 0; P < NumParts; ++P) {
9742 SDValue Vec = DAG.getBuildVector(
9743 PartVT, SL, {Op.getOperand(P * 2), Op.getOperand(P * 2 + 1)});
9744 Casts.push_back(DAG.getNode(ISD::BITCAST, SL, PartIntVT, Vec));
9745 }
9746
9747 SDValue Blend =
9748 DAG.getBuildVector(MVT::getVectorVT(PartIntVT, NumParts), SL, Casts);
9749 return DAG.getNode(ISD::BITCAST, SL, VT, Blend);
9750}
9751
9753 const GlobalAddressSDNode *GA) const {
9754 // OSes that use ELF REL relocations (instead of RELA) can only store a
9755 // 32-bit addend in the instruction, so it is not safe to allow offset folding
9756 // which can create arbitrary 64-bit addends. (This is only a problem for
9757 // R_AMDGPU_*32_HI relocations since other relocation types are unaffected by
9758 // the high 32 bits of the addend.)
9759 //
9760 // This should be kept in sync with how HasRelocationAddend is initialized in
9761 // the constructor of ELFAMDGPUAsmBackend.
9762 if (!Subtarget->isAmdHsaOS())
9763 return false;
9764
9765 // We can fold offsets for anything that doesn't require a GOT relocation.
9766 return (GA->getAddressSpace() == AMDGPUAS::GLOBAL_ADDRESS ||
9770}
9771
9772static SDValue
9774 const SDLoc &DL, int64_t Offset, EVT PtrVT,
9775 unsigned GAFlags = SIInstrInfo::MO_NONE) {
9776 assert(isInt<32>(Offset + 4) && "32-bit offset is expected!");
9777 // In order to support pc-relative addressing, the PC_ADD_REL_OFFSET SDNode is
9778 // lowered to the following code sequence:
9779 //
9780 // For constant address space:
9781 // s_getpc_b64 s[0:1]
9782 // s_add_u32 s0, s0, $symbol
9783 // s_addc_u32 s1, s1, 0
9784 //
9785 // s_getpc_b64 returns the address of the s_add_u32 instruction and then
9786 // a fixup or relocation is emitted to replace $symbol with a literal
9787 // constant, which is a pc-relative offset from the encoding of the $symbol
9788 // operand to the global variable.
9789 //
9790 // For global address space:
9791 // s_getpc_b64 s[0:1]
9792 // s_add_u32 s0, s0, $symbol@{gotpc}rel32@lo
9793 // s_addc_u32 s1, s1, $symbol@{gotpc}rel32@hi
9794 //
9795 // s_getpc_b64 returns the address of the s_add_u32 instruction and then
9796 // fixups or relocations are emitted to replace $symbol@*@lo and
9797 // $symbol@*@hi with lower 32 bits and higher 32 bits of a literal constant,
9798 // which is a 64-bit pc-relative offset from the encoding of the $symbol
9799 // operand to the global variable.
9800 if (((const GCNSubtarget &)DAG.getSubtarget()).has64BitLiterals()) {
9801 assert(GAFlags != SIInstrInfo::MO_NONE);
9802
9803 SDValue Ptr =
9804 DAG.getTargetGlobalAddress(GV, DL, MVT::i64, Offset, GAFlags + 2);
9805 return DAG.getNode(AMDGPUISD::PC_ADD_REL_OFFSET64, DL, PtrVT, Ptr);
9806 }
9807
9808 SDValue PtrLo = DAG.getTargetGlobalAddress(GV, DL, MVT::i32, Offset, GAFlags);
9809 SDValue PtrHi;
9810 if (GAFlags == SIInstrInfo::MO_NONE)
9811 PtrHi = DAG.getTargetConstant(0, DL, MVT::i32);
9812 else
9813 PtrHi = DAG.getTargetGlobalAddress(GV, DL, MVT::i32, Offset, GAFlags + 1);
9814 return DAG.getNode(AMDGPUISD::PC_ADD_REL_OFFSET, DL, PtrVT, PtrLo, PtrHi);
9815}
9816
9817SDValue SITargetLowering::LowerGlobalAddress(AMDGPUMachineFunctionInfo *MFI,
9818 SDValue Op,
9819 SelectionDAG &DAG) const {
9820 GlobalAddressSDNode *GSD = cast<GlobalAddressSDNode>(Op);
9821 SDLoc DL(GSD);
9822 EVT PtrVT = Op.getValueType();
9823
9824 const GlobalValue *GV = GSD->getGlobal();
9830 GV->hasExternalLinkage()) {
9831 const GlobalVariable &GVar = *cast<GlobalVariable>(GV);
9832 // HIP uses an unsized array `extern __shared__ T s[]` or a similar
9833 // zero-sized type in other languages to declare dynamic shared
9834 // memory whose size is not known at compile time. Such arrays are
9835 // allocated by the runtime and placed directly after the statically
9836 // allocated ones, so they all share the same offset.
9837 if (GVar.getGlobalSize(GVar.getDataLayout()) == 0) {
9838 assert(PtrVT == MVT::i32 && "32-bit pointer is expected.");
9839 // Adjust alignment for that dynamic shared memory array.
9841 MFI->setDynLDSAlign(F, GVar);
9842 MFI->setUsesDynamicLDS(true);
9843 return SDValue(
9844 DAG.getMachineNode(AMDGPU::GET_GROUPSTATICSIZE, DL, PtrVT), 0);
9845 }
9846 }
9848 }
9849
9851 SDValue GA = DAG.getTargetGlobalAddress(GV, DL, MVT::i32, GSD->getOffset(),
9853 return DAG.getNode(AMDGPUISD::LDS, DL, MVT::i32, GA);
9854 }
9855
9856 if (Subtarget->isAmdPalOS() || Subtarget->isMesa3DOS()) {
9857 if (Subtarget->has64BitLiterals()) {
9859 GV, DL, MVT::i64, GSD->getOffset(), SIInstrInfo::MO_ABS64);
9860 return SDValue(DAG.getMachineNode(AMDGPU::S_MOV_B64, DL, MVT::i64, Addr),
9861 0);
9862 }
9863
9864 SDValue AddrLo = DAG.getTargetGlobalAddress(
9865 GV, DL, MVT::i32, GSD->getOffset(), SIInstrInfo::MO_ABS32_LO);
9866 AddrLo = {DAG.getMachineNode(AMDGPU::S_MOV_B32, DL, MVT::i32, AddrLo), 0};
9867
9868 SDValue AddrHi = DAG.getTargetGlobalAddress(
9869 GV, DL, MVT::i32, GSD->getOffset(), SIInstrInfo::MO_ABS32_HI);
9870 AddrHi = {DAG.getMachineNode(AMDGPU::S_MOV_B32, DL, MVT::i32, AddrHi), 0};
9871
9872 return DAG.getNode(ISD::BUILD_PAIR, DL, MVT::i64, AddrLo, AddrHi);
9873 }
9874
9875 if (shouldEmitFixup(GV))
9876 return buildPCRelGlobalAddress(DAG, GV, DL, GSD->getOffset(), PtrVT);
9877
9878 if (shouldEmitPCReloc(GV))
9879 return buildPCRelGlobalAddress(DAG, GV, DL, GSD->getOffset(), PtrVT,
9881
9882 SDValue GOTAddr = buildPCRelGlobalAddress(DAG, GV, DL, 0, PtrVT,
9884 PointerType *PtrTy =
9886 const DataLayout &DataLayout = DAG.getDataLayout();
9887 Align Alignment = DataLayout.getABITypeAlign(PtrTy);
9888 MachinePointerInfo PtrInfo =
9890
9891 return DAG.getLoad(PtrVT, DL, DAG.getEntryNode(), GOTAddr, PtrInfo, Alignment,
9894}
9895
9896SDValue SITargetLowering::LowerExternalSymbol(SDValue Op,
9897 SelectionDAG &DAG) const {
9898 // TODO: Handle this. It should be mostly the same as LowerGlobalAddress.
9899 const Function &Fn = DAG.getMachineFunction().getFunction();
9900 DAG.getContext()->diagnose(DiagnosticInfoUnsupported(
9901 Fn, "unsupported external symbol", Op.getDebugLoc()));
9902 return DAG.getPOISON(Op.getValueType());
9903}
9904
9906 const SDLoc &DL, SDValue V) const {
9907 // We can't use S_MOV_B32 directly, because there is no way to specify m0 as
9908 // the destination register.
9909 //
9910 // We can't use CopyToReg, because MachineCSE won't combine COPY instructions,
9911 // so we will end up with redundant moves to m0.
9912 //
9913 // We use a pseudo to ensure we emit s_mov_b32 with m0 as the direct result.
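// Typical use (illustrative; see the ds_ordered_count lowering later in this
// file): callers thread the glue result into the node that reads m0, e.g.
//   SDValue Glue = copyToM0(DAG, Chain, DL, M0Val).getValue(1);
// where M0Val is just a placeholder name for the value being written to m0,
// so the m0 write is ordered immediately before the consuming instruction.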
9914
9915 // A Null SDValue creates a glue result.
9916 SDNode *M0 = DAG.getMachineNode(AMDGPU::SI_INIT_M0, DL, MVT::Other, MVT::Glue,
9917 V, Chain);
9918 return SDValue(M0, 0);
9919}
9920
9921SDValue SITargetLowering::lowerImplicitZextParam(SelectionDAG &DAG, SDValue Op,
9922 MVT VT,
9923 unsigned Offset) const {
9924 SDLoc SL(Op);
9925 SDValue Param = lowerKernargMemParameter(
9926 DAG, MVT::i32, MVT::i32, SL, DAG.getEntryNode(), Offset, Align(4), false);
9927 // The local size values will have the high 16 bits as zero.
9928 return DAG.getNode(ISD::AssertZext, SL, MVT::i32, Param,
9929 DAG.getValueType(VT));
9930}
9931
9933 EVT VT) {
9936 "non-hsa intrinsic with hsa target", DL.getDebugLoc()));
9937 return DAG.getPOISON(VT);
9938}
9939
9941 EVT VT) {
9944 "intrinsic not supported on subtarget", DL.getDebugLoc()));
9945 return DAG.getPOISON(VT);
9946}
9947
9949 ArrayRef<SDValue> Elts) {
9950 assert(!Elts.empty());
9951 MVT Type;
9952 unsigned NumElts = Elts.size();
9953
9954 if (NumElts <= 12) {
9955 Type = MVT::getVectorVT(MVT::f32, NumElts);
9956 } else {
9957 assert(Elts.size() <= 16);
9958 Type = MVT::v16f32;
9959 NumElts = 16;
9960 }
9961
9962 SmallVector<SDValue, 16> VecElts(NumElts);
9963 for (unsigned i = 0; i < Elts.size(); ++i) {
9964 SDValue Elt = Elts[i];
9965 if (Elt.getValueType() != MVT::f32)
9966 Elt = DAG.getBitcast(MVT::f32, Elt);
9967 VecElts[i] = Elt;
9968 }
9969 for (unsigned i = Elts.size(); i < NumElts; ++i)
9970 VecElts[i] = DAG.getPOISON(MVT::f32);
9971
9972 if (NumElts == 1)
9973 return VecElts[0];
9974 return DAG.getBuildVector(Type, DL, VecElts);
9975}
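// Behavior sketch (illustrative): five 32-bit operands yield a v5f32
// build_vector (non-f32 elements are bitcast to f32 first), while thirteen
// operands are padded with poison up to v16f32, matching the
// NumElts <= 12 / NumElts <= 16 buckets handled above.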
9976
9977static SDValue padEltsToUndef(SelectionDAG &DAG, const SDLoc &DL, EVT CastVT,
9978 SDValue Src, int ExtraElts) {
9979 EVT SrcVT = Src.getValueType();
9980
9982
9983 if (SrcVT.isVector())
9984 DAG.ExtractVectorElements(Src, Elts);
9985 else
9986 Elts.push_back(Src);
9987
9988 SDValue Undef = DAG.getPOISON(SrcVT.getScalarType());
9989 while (ExtraElts--)
9990 Elts.push_back(Undef);
9991
9992 return DAG.getBuildVector(CastVT, DL, Elts);
9993}
9994
9995// Reconstruct the required return value for an image load intrinsic.
9996// This is more complicated due to the optional use of TexFailCtrl, which
9997// means the required return type is an aggregate.
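// Worked example (illustrative): an image load with dmask 0b0111, a v3f32
// result, and TFE enabled is selected with NumVDataDwords = 4. Here
// DMaskPop = 3, so the first three dwords of the v4i32 machine result are
// extracted as a subvector and bitcast back to v3f32, the fourth dword
// becomes the TexFail value, and both are merged with the chain result.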
9999 ArrayRef<EVT> ResultTypes, bool IsTexFail,
10000 bool Unpacked, bool IsD16, int DMaskPop,
10001 int NumVDataDwords, bool IsAtomicPacked16Bit,
10002 const SDLoc &DL) {
10003 // Determine the required return type. This is the same regardless of the
10004 // IsTexFail flag.
10005 EVT ReqRetVT = ResultTypes[0];
10006 int ReqRetNumElts = ReqRetVT.isVector() ? ReqRetVT.getVectorNumElements() : 1;
10007 int NumDataDwords = ((IsD16 && !Unpacked) || IsAtomicPacked16Bit)
10008 ? (ReqRetNumElts + 1) / 2
10009 : ReqRetNumElts;
10010
10011 int MaskPopDwords = (!IsD16 || Unpacked) ? DMaskPop : (DMaskPop + 1) / 2;
10012
10013 MVT DataDwordVT =
10014 NumDataDwords == 1 ? MVT::i32 : MVT::getVectorVT(MVT::i32, NumDataDwords);
10015
10016 MVT MaskPopVT =
10017 MaskPopDwords == 1 ? MVT::i32 : MVT::getVectorVT(MVT::i32, MaskPopDwords);
10018
10019 SDValue Data(Result, 0);
10020 SDValue TexFail;
10021
10022 if (DMaskPop > 0 && Data.getValueType() != MaskPopVT) {
10023 SDValue ZeroIdx = DAG.getConstant(0, DL, MVT::i32);
10024 if (MaskPopVT.isVector()) {
10025 Data = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, MaskPopVT,
10026 SDValue(Result, 0), ZeroIdx);
10027 } else {
10028 Data = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MaskPopVT,
10029 SDValue(Result, 0), ZeroIdx);
10030 }
10031 }
10032
10033 if (DataDwordVT.isVector() && !IsAtomicPacked16Bit)
10034 Data = padEltsToUndef(DAG, DL, DataDwordVT, Data,
10035 NumDataDwords - MaskPopDwords);
10036
10037 if (IsD16)
10038 Data = adjustLoadValueTypeImpl(Data, ReqRetVT, DL, DAG, Unpacked);
10039
10040 EVT LegalReqRetVT = ReqRetVT;
10041 if (!ReqRetVT.isVector()) {
10042 if (!Data.getValueType().isInteger())
10043 Data = DAG.getNode(ISD::BITCAST, DL,
10044 Data.getValueType().changeTypeToInteger(), Data);
10045 Data = DAG.getNode(ISD::TRUNCATE, DL, ReqRetVT.changeTypeToInteger(), Data);
10046 } else {
10047 // We need to widen the return vector to a legal type
10048 if ((ReqRetVT.getVectorNumElements() % 2) == 1 &&
10049 ReqRetVT.getVectorElementType().getSizeInBits() == 16) {
10050 LegalReqRetVT =
10052 ReqRetVT.getVectorNumElements() + 1);
10053 }
10054 }
10055 Data = DAG.getNode(ISD::BITCAST, DL, LegalReqRetVT, Data);
10056
10057 if (IsTexFail) {
10058 TexFail =
10059 DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::i32, SDValue(Result, 0),
10060 DAG.getConstant(MaskPopDwords, DL, MVT::i32));
10061
10062 return DAG.getMergeValues({Data, TexFail, SDValue(Result, 1)}, DL);
10063 }
10064
10065 if (Result->getNumValues() == 1)
10066 return Data;
10067
10068 return DAG.getMergeValues({Data, SDValue(Result, 1)}, DL);
10069}
10070
10071static bool parseTexFail(SDValue TexFailCtrl, SelectionDAG &DAG, SDValue *TFE,
10072 SDValue *LWE, bool &IsTexFail) {
10073 auto *TexFailCtrlConst = cast<ConstantSDNode>(TexFailCtrl.getNode());
10074
10075 uint64_t Value = TexFailCtrlConst->getZExtValue();
10076 if (Value) {
10077 IsTexFail = true;
10078 }
10079
10080 SDLoc DL(TexFailCtrlConst);
10081 *TFE = DAG.getTargetConstant((Value & 0x1) ? 1 : 0, DL, MVT::i32);
10082 Value &= ~(uint64_t)0x1;
10083 *LWE = DAG.getTargetConstant((Value & 0x2) ? 1 : 0, DL, MVT::i32);
10084 Value &= ~(uint64_t)0x2;
10085
10086 return Value == 0;
10087}
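// Example (illustrative): texfailctrl = 3 yields TFE = 1 and LWE = 1 and
// returns true; texfailctrl = 4 sets neither bit, leaves a nonzero residue,
// and returns false, which makes the caller give up on lowering the image
// intrinsic.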
10088
10090 MVT PackVectorVT,
10091 SmallVectorImpl<SDValue> &PackedAddrs,
10092 unsigned DimIdx, unsigned EndIdx,
10093 unsigned NumGradients) {
10094 SDLoc DL(Op);
10095 for (unsigned I = DimIdx; I < EndIdx; I++) {
10096 SDValue Addr = Op.getOperand(I);
10097
10098 // Gradients are packed with undef for each coordinate.
10099 // In <hi 16 bit>,<lo 16 bit> notation, the registers look like this:
10100 // 1D: undef,dx/dh; undef,dx/dv
10101 // 2D: dy/dh,dx/dh; dy/dv,dx/dv
10102 // 3D: dy/dh,dx/dh; undef,dz/dh; dy/dv,dx/dv; undef,dz/dv
10103 if (((I + 1) >= EndIdx) ||
10104 ((NumGradients / 2) % 2 == 1 && (I == DimIdx + (NumGradients / 2) - 1 ||
10105 I == DimIdx + NumGradients - 1))) {
10106 if (Addr.getValueType() != MVT::i16)
10107 Addr = DAG.getBitcast(MVT::i16, Addr);
10108 Addr = DAG.getNode(ISD::ANY_EXTEND, DL, MVT::i32, Addr);
10109 } else {
10110 Addr = DAG.getBuildVector(PackVectorVT, DL, {Addr, Op.getOperand(I + 1)});
10111 I++;
10112 }
10113 Addr = DAG.getBitcast(MVT::f32, Addr);
10114 PackedAddrs.push_back(Addr);
10115 }
10116}
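// Worked example (illustrative): a 2D derivative with f16 gradients
// (NumGradients = 4) packs dx/dh,dy/dh and dx/dv,dy/dv into two v2f16
// dwords; for 3D (NumGradients = 6) the dz terms are the odd ones out and
// each gets its own dword with the high half left undefined, giving four
// dwords in total, per the layout comment inside the loop above.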
10117
10118SDValue SITargetLowering::lowerImage(SDValue Op,
10120 SelectionDAG &DAG, bool WithChain) const {
10121 SDLoc DL(Op);
10122 MachineFunction &MF = DAG.getMachineFunction();
10123 const GCNSubtarget *ST = &MF.getSubtarget<GCNSubtarget>();
10124 unsigned IntrOpcode = Intr->BaseOpcode;
10125 // For image atomic: use no-return opcode if result is unused.
10126 if (Intr->AtomicNoRetBaseOpcode != Intr->BaseOpcode &&
10127 !Op.getNode()->hasAnyUseOfValue(0))
10128 IntrOpcode = Intr->AtomicNoRetBaseOpcode;
10129 const AMDGPU::MIMGBaseOpcodeInfo *BaseOpcode =
10131 const AMDGPU::MIMGDimInfo *DimInfo = AMDGPU::getMIMGDimInfo(Intr->Dim);
10132 bool IsGFX10Plus = AMDGPU::isGFX10Plus(*Subtarget);
10133 bool IsGFX11Plus = AMDGPU::isGFX11Plus(*Subtarget);
10134 bool IsGFX12Plus = AMDGPU::isGFX12Plus(*Subtarget);
10135 bool IsGFX13 = AMDGPU::isGFX13(*Subtarget);
10136
10137 SmallVector<EVT, 3> ResultTypes(Op->values());
10138 SmallVector<EVT, 3> OrigResultTypes(Op->values());
10139 if (BaseOpcode->NoReturn && BaseOpcode->Atomic)
10140 ResultTypes.erase(&ResultTypes[0]);
10141
10142 bool IsD16 = false;
10143 bool IsG16 = false;
10144 bool IsA16 = false;
10145 SDValue VData;
10146 int NumVDataDwords = 0;
10147 bool AdjustRetType = false;
10148 bool IsAtomicPacked16Bit = false;
10149
10150 // Offset of intrinsic arguments
10151 const unsigned ArgOffset = WithChain ? 2 : 1;
10152
10153 unsigned DMask;
10154 unsigned DMaskLanes = 0;
10155
10156 if (BaseOpcode->Atomic) {
10157 VData = Op.getOperand(2);
10158
10159 IsAtomicPacked16Bit =
10160 (IntrOpcode == AMDGPU::IMAGE_ATOMIC_PK_ADD_F16 ||
10161 IntrOpcode == AMDGPU::IMAGE_ATOMIC_PK_ADD_F16_NORTN ||
10162 IntrOpcode == AMDGPU::IMAGE_ATOMIC_PK_ADD_BF16 ||
10163 IntrOpcode == AMDGPU::IMAGE_ATOMIC_PK_ADD_BF16_NORTN);
10164
10165 bool Is64Bit = VData.getValueSizeInBits() == 64;
10166 if (BaseOpcode->AtomicX2) {
10167 SDValue VData2 = Op.getOperand(3);
10168 VData = DAG.getBuildVector(Is64Bit ? MVT::v2i64 : MVT::v2i32, DL,
10169 {VData, VData2});
10170 if (Is64Bit)
10171 VData = DAG.getBitcast(MVT::v4i32, VData);
10172
10173 if (!BaseOpcode->NoReturn)
10174 ResultTypes[0] = Is64Bit ? MVT::v2i64 : MVT::v2i32;
10175
10176 DMask = Is64Bit ? 0xf : 0x3;
10177 NumVDataDwords = Is64Bit ? 4 : 2;
10178 } else {
10179 DMask = Is64Bit ? 0x3 : 0x1;
10180 NumVDataDwords = Is64Bit ? 2 : 1;
10181 }
10182 } else {
10183 DMask = Op->getConstantOperandVal(ArgOffset + Intr->DMaskIndex);
10184 DMaskLanes = BaseOpcode->Gather4 ? 4 : llvm::popcount(DMask);
10185
10186 if (BaseOpcode->Store) {
10187 VData = Op.getOperand(2);
10188
10189 MVT StoreVT = VData.getSimpleValueType();
10190 if (StoreVT.getScalarType() == MVT::f16) {
10191 if (!Subtarget->hasD16Images() || !BaseOpcode->HasD16)
10192 return Op; // D16 is unsupported for this instruction
10193
10194 IsD16 = true;
10195 VData = handleD16VData(VData, DAG, true);
10196 }
10197
10198 NumVDataDwords = (VData.getValueType().getSizeInBits() + 31) / 32;
10199 } else if (!BaseOpcode->NoReturn) {
10200 // Work out the number of dwords based on the dmask popcount, the
10201 // underlying type, and whether packing is supported.
10202 MVT LoadVT = ResultTypes[0].getSimpleVT();
10203 if (LoadVT.getScalarType() == MVT::f16) {
10204 if (!Subtarget->hasD16Images() || !BaseOpcode->HasD16)
10205 return Op; // D16 is unsupported for this instruction
10206
10207 IsD16 = true;
10208 }
10209
10210 // Confirm that the return type is large enough for the dmask specified
10211 if ((LoadVT.isVector() && LoadVT.getVectorNumElements() < DMaskLanes) ||
10212 (!LoadVT.isVector() && DMaskLanes > 1))
10213 return Op;
10214
10215 // The sq block of gfx8 and gfx9 does not estimate register use correctly
10216 // for d16 image_gather4, image_gather4_l, and image_gather4_lz
10217 // instructions.
10218 if (IsD16 && !Subtarget->hasUnpackedD16VMem() &&
10219 !(BaseOpcode->Gather4 && Subtarget->hasImageGather4D16Bug()))
10220 NumVDataDwords = (DMaskLanes + 1) / 2;
10221 else
10222 NumVDataDwords = DMaskLanes;
10223
10224 AdjustRetType = true;
10225 }
10226 }
10227
10228 unsigned VAddrEnd = ArgOffset + Intr->VAddrEnd;
10230
10231 // Check for 16-bit addresses or derivatives and pack them if so.
10232 MVT VAddrVT =
10233 Op.getOperand(ArgOffset + Intr->GradientStart).getSimpleValueType();
10234 MVT VAddrScalarVT = VAddrVT.getScalarType();
10235 MVT GradPackVectorVT = VAddrScalarVT == MVT::f16 ? MVT::v2f16 : MVT::v2i16;
10236 IsG16 = VAddrScalarVT == MVT::f16 || VAddrScalarVT == MVT::i16;
10237
10238 VAddrVT = Op.getOperand(ArgOffset + Intr->CoordStart).getSimpleValueType();
10239 VAddrScalarVT = VAddrVT.getScalarType();
10240 MVT AddrPackVectorVT = VAddrScalarVT == MVT::f16 ? MVT::v2f16 : MVT::v2i16;
10241 IsA16 = VAddrScalarVT == MVT::f16 || VAddrScalarVT == MVT::i16;
10242
10243 // Push back extra arguments.
10244 for (unsigned I = Intr->VAddrStart; I < Intr->GradientStart; I++) {
10245 if (IsA16 && (Op.getOperand(ArgOffset + I).getValueType() == MVT::f16)) {
10246 assert(I == Intr->BiasIndex && "Got unexpected 16-bit extra argument");
10247 // Special handling of the bias operand when A16 is on. The bias is of
10248 // type half but occupies a full 32-bit dword.
10249 SDValue Bias = DAG.getBuildVector(
10250 MVT::v2f16, DL,
10251 {Op.getOperand(ArgOffset + I), DAG.getPOISON(MVT::f16)});
10252 VAddrs.push_back(Bias);
10253 } else {
10254 assert((!IsA16 || Intr->NumBiasArgs == 0 || I != Intr->BiasIndex) &&
10255 "Bias needs to be converted to 16 bit in A16 mode");
10256 VAddrs.push_back(Op.getOperand(ArgOffset + I));
10257 }
10258 }
10259
10260 if (BaseOpcode->Gradients && !ST->hasG16() && (IsA16 != IsG16)) {
10261 // 16-bit gradients are supported, but are tied to the A16 control,
10262 // so both gradients and addresses must be 16-bit.
10263 LLVM_DEBUG(
10264 dbgs() << "Failed to lower image intrinsic: 16 bit addresses "
10265 "require 16 bit args for both gradients and addresses");
10266 return Op;
10267 }
10268
10269 if (IsA16) {
10270 if (!ST->hasA16()) {
10271 LLVM_DEBUG(dbgs() << "Failed to lower image intrinsic: Target does not "
10272 "support 16 bit addresses\n");
10273 return Op;
10274 }
10275 }
10276
10277 // We've dealt with incorrect input, so we know that if IsA16 or IsG16
10278 // is set then we have to compress/pack operands (either addresses,
10279 // gradients, or both).
10280 // In the case where A16 and gradients are tied (no G16 support), we
10281 // have already verified that both IsA16 and IsG16 are true.
10282 if (BaseOpcode->Gradients && IsG16 && ST->hasG16()) {
10283 // Activate g16
10284 const AMDGPU::MIMGG16MappingInfo *G16MappingInfo =
10286 IntrOpcode = G16MappingInfo->G16; // set new opcode to variant with _g16
10287 }
10288
10289 // Add gradients (packed or unpacked)
10290 if (IsG16) {
10291 // Pack the gradients
10292 // const int PackEndIdx = IsA16 ? VAddrEnd : (ArgOffset + Intr->CoordStart);
10293 packImage16bitOpsToDwords(DAG, Op, GradPackVectorVT, VAddrs,
10294 ArgOffset + Intr->GradientStart,
10295 ArgOffset + Intr->CoordStart, Intr->NumGradients);
10296 } else {
10297 for (unsigned I = ArgOffset + Intr->GradientStart;
10298 I < ArgOffset + Intr->CoordStart; I++)
10299 VAddrs.push_back(Op.getOperand(I));
10300 }
10301
10302 // Add addresses (packed or unpacked)
10303 if (IsA16) {
10304 packImage16bitOpsToDwords(DAG, Op, AddrPackVectorVT, VAddrs,
10305 ArgOffset + Intr->CoordStart, VAddrEnd,
10306 0 /* No gradients */);
10307 } else {
10308 // Add uncompressed address
10309 for (unsigned I = ArgOffset + Intr->CoordStart; I < VAddrEnd; I++)
10310 VAddrs.push_back(Op.getOperand(I));
10311 }
10312
10313 // If the register allocator cannot place the address registers contiguously
10314 // without introducing moves, then using the non-sequential address encoding
10315 // is always preferable, since it saves VALU instructions and is usually a
10316 // wash in terms of code size or even better.
10317 //
10318 // However, we currently have no way of hinting to the register allocator that
10319 // MIMG addresses should be placed contiguously when it is possible to do so,
10320 // so force non-NSA for the common 2-address case as a heuristic.
10321 //
10322 // SIShrinkInstructions will convert NSA encodings to non-NSA after register
10323 // allocation when possible.
10324 //
10325 // Partial NSA is allowed on GFX11+ where the final register is a contiguous
10326 // set of the remaining addresses.
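// Worked example (illustrative): with an NSA limit of 5 address registers
// and 7 address dwords on a target with partial NSA, UsePartialNSA is true,
// so the first 4 addresses stay as separate operands and the remaining 3
// are merged below into one contiguous vector register.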
10327 const unsigned NSAMaxSize = ST->getNSAMaxSize(BaseOpcode->Sampler);
10328 const bool HasPartialNSAEncoding = ST->hasPartialNSAEncoding();
10329 const bool UseNSA = ST->hasNSAEncoding() &&
10330 VAddrs.size() >= ST->getNSAThreshold(MF) &&
10331 (VAddrs.size() <= NSAMaxSize || HasPartialNSAEncoding);
10332 const bool UsePartialNSA =
10333 UseNSA && HasPartialNSAEncoding && VAddrs.size() > NSAMaxSize;
10334
10335 SDValue VAddr;
10336 if (UsePartialNSA) {
10337 VAddr = getBuildDwordsVector(DAG, DL,
10338 ArrayRef(VAddrs).drop_front(NSAMaxSize - 1));
10339 } else if (!UseNSA) {
10340 VAddr = getBuildDwordsVector(DAG, DL, VAddrs);
10341 }
10342
10343 SDValue True = DAG.getTargetConstant(1, DL, MVT::i1);
10344 SDValue False = DAG.getTargetConstant(0, DL, MVT::i1);
10345 SDValue Unorm;
10346 if (!BaseOpcode->Sampler) {
10347 Unorm = True;
10348 } else {
10349 uint64_t UnormConst =
10350 Op.getConstantOperandVal(ArgOffset + Intr->UnormIndex);
10351
10352 Unorm = UnormConst ? True : False;
10353 }
10354
10355 SDValue TFE;
10356 SDValue LWE;
10357 SDValue TexFail = Op.getOperand(ArgOffset + Intr->TexFailCtrlIndex);
10358 bool IsTexFail = false;
10359 if (!parseTexFail(TexFail, DAG, &TFE, &LWE, IsTexFail))
10360 return Op;
10361
10362 if (IsTexFail) {
10363 if (!DMaskLanes) {
10364 // Expecting to get an error flag since TFC is on and dmask is 0.
10365 // Force dmask to be at least 1, otherwise the instruction will fail.
10366 DMask = 0x1;
10367 DMaskLanes = 1;
10368 NumVDataDwords = 1;
10369 }
10370 NumVDataDwords += 1;
10371 AdjustRetType = true;
10372 }
10373
10374 // Something earlier has tagged that the return type needs adjusting.
10375 // This happens if the instruction is a load or has TexFailCtrl flags set.
10376 if (AdjustRetType) {
10377 // NumVDataDwords reflects the true number of dwords required in the return
10378 // type
10379 if (DMaskLanes == 0 && !BaseOpcode->Store) {
10380 // This is a no-op load. This can be eliminated
10381 SDValue Undef = DAG.getPOISON(Op.getValueType());
10382 if (isa<MemSDNode>(Op))
10383 return DAG.getMergeValues({Undef, Op.getOperand(0)}, DL);
10384 return Undef;
10385 }
10386
10387 EVT NewVT = NumVDataDwords > 1 ? EVT::getVectorVT(*DAG.getContext(),
10388 MVT::i32, NumVDataDwords)
10389 : MVT::i32;
10390
10391 ResultTypes[0] = NewVT;
10392 if (ResultTypes.size() == 3) {
10393 // The original result was an aggregate type used for TexFailCtrl results.
10394 // The actual instruction returns a vector type, which has now been
10395 // created. Remove the aggregate result.
10396 ResultTypes.erase(&ResultTypes[1]);
10397 }
10398 }
10399
10400 unsigned CPol = Op.getConstantOperandVal(ArgOffset + Intr->CachePolicyIndex);
10401 // Keep GLC only when the atomic's result is actually used.
10402 if (BaseOpcode->Atomic && !BaseOpcode->NoReturn)
10404 if (CPol & ~((IsGFX12Plus ? AMDGPU::CPol::ALL : AMDGPU::CPol::ALL_pregfx12) |
10406 return Op;
10407
10409 if (BaseOpcode->Store || BaseOpcode->Atomic)
10410 Ops.push_back(VData); // vdata
10411 if (UsePartialNSA) {
10412 append_range(Ops, ArrayRef(VAddrs).take_front(NSAMaxSize - 1));
10413 Ops.push_back(VAddr);
10414 } else if (UseNSA)
10415 append_range(Ops, VAddrs);
10416 else
10417 Ops.push_back(VAddr);
10418 SDValue Rsrc = Op.getOperand(ArgOffset + Intr->RsrcIndex);
10419 EVT RsrcVT = Rsrc.getValueType();
10420 if (RsrcVT != MVT::v4i32 && RsrcVT != MVT::v8i32)
10421 return Op;
10422 Ops.push_back(Rsrc);
10423 if (BaseOpcode->Sampler) {
10424 SDValue Samp = Op.getOperand(ArgOffset + Intr->SampIndex);
10425 if (Samp.getValueType() != MVT::v4i32)
10426 return Op;
10427 Ops.push_back(Samp);
10428 }
10429 Ops.push_back(DAG.getTargetConstant(DMask, DL, MVT::i32));
10430 if (IsGFX10Plus)
10431 Ops.push_back(DAG.getTargetConstant(DimInfo->Encoding, DL, MVT::i32));
10432 if (!IsGFX12Plus || BaseOpcode->Sampler || BaseOpcode->MSAA)
10433 Ops.push_back(Unorm);
10434 Ops.push_back(DAG.getTargetConstant(CPol, DL, MVT::i32));
10435 Ops.push_back(IsA16 && // r128, a16 for gfx9
10436 ST->hasFeature(AMDGPU::FeatureR128A16)
10437 ? True
10438 : False);
10439 if (IsGFX10Plus)
10440 Ops.push_back(IsA16 ? True : False);
10441
10442 if (!Subtarget->hasGFX90AInsts())
10443 Ops.push_back(TFE); // tfe
10444 else if (TFE->getAsZExtVal()) {
10445 DAG.getContext()->diagnose(DiagnosticInfoUnsupported(
10447 "TFE is not supported on this GPU", DL.getDebugLoc()));
10448 }
10449
10450 if (!IsGFX12Plus || BaseOpcode->Sampler || BaseOpcode->MSAA)
10451 Ops.push_back(LWE); // lwe
10452 if (!IsGFX10Plus)
10453 Ops.push_back(DimInfo->DA ? True : False);
10454 if (BaseOpcode->HasD16)
10455 Ops.push_back(IsD16 ? True : False);
10456 if (isa<MemSDNode>(Op))
10457 Ops.push_back(Op.getOperand(0)); // chain
10458
10459 int NumVAddrDwords =
10460 UseNSA ? VAddrs.size() : VAddr.getValueType().getSizeInBits() / 32;
10461 int Opcode = -1;
10462
10463 if (IsGFX13) {
10464 Opcode = AMDGPU::getMIMGOpcode(IntrOpcode, AMDGPU::MIMGEncGfx13,
10465 NumVDataDwords, NumVAddrDwords);
10466 } else if (IsGFX12Plus) {
10467 Opcode = AMDGPU::getMIMGOpcode(IntrOpcode, AMDGPU::MIMGEncGfx12,
10468 NumVDataDwords, NumVAddrDwords);
10469 } else if (IsGFX11Plus) {
10470 Opcode = AMDGPU::getMIMGOpcode(IntrOpcode,
10471 UseNSA ? AMDGPU::MIMGEncGfx11NSA
10472 : AMDGPU::MIMGEncGfx11Default,
10473 NumVDataDwords, NumVAddrDwords);
10474 } else if (IsGFX10Plus) {
10475 Opcode = AMDGPU::getMIMGOpcode(IntrOpcode,
10476 UseNSA ? AMDGPU::MIMGEncGfx10NSA
10477 : AMDGPU::MIMGEncGfx10Default,
10478 NumVDataDwords, NumVAddrDwords);
10479 } else {
10480 if (Subtarget->hasGFX90AInsts()) {
10481 Opcode = AMDGPU::getMIMGOpcode(IntrOpcode, AMDGPU::MIMGEncGfx90a,
10482 NumVDataDwords, NumVAddrDwords);
10483 if (Opcode == -1) {
10484 DAG.getContext()->diagnose(DiagnosticInfoUnsupported(
10486 "requested image instruction is not supported on this GPU",
10487 DL.getDebugLoc()));
10488
10489 unsigned Idx = 0;
10490 SmallVector<SDValue, 3> RetValues(OrigResultTypes.size());
10491 for (EVT VT : OrigResultTypes) {
10492 if (VT == MVT::Other)
10493 RetValues[Idx++] = Op.getOperand(0); // Chain
10494 else
10495 RetValues[Idx++] = DAG.getPOISON(VT);
10496 }
10497
10498 return DAG.getMergeValues(RetValues, DL);
10499 }
10500 }
10501 if (Opcode == -1 &&
10502 Subtarget->getGeneration() >= AMDGPUSubtarget::VOLCANIC_ISLANDS)
10503 Opcode = AMDGPU::getMIMGOpcode(IntrOpcode, AMDGPU::MIMGEncGfx8,
10504 NumVDataDwords, NumVAddrDwords);
10505 if (Opcode == -1)
10506 Opcode = AMDGPU::getMIMGOpcode(IntrOpcode, AMDGPU::MIMGEncGfx6,
10507 NumVDataDwords, NumVAddrDwords);
10508 }
10509 if (Opcode == -1)
10510 return Op;
10511
10512 MachineSDNode *NewNode = DAG.getMachineNode(Opcode, DL, ResultTypes, Ops);
10513 if (auto *MemOp = dyn_cast<MemSDNode>(Op)) {
10514 MachineMemOperand *MemRef = MemOp->getMemOperand();
10515 DAG.setNodeMemRefs(NewNode, {MemRef});
10516 }
10517
10518 if (BaseOpcode->NoReturn) {
10519 if (BaseOpcode->Atomic)
10520 return DAG.getMergeValues(
10521 {DAG.getPOISON(OrigResultTypes[0]), SDValue(NewNode, 0)}, DL);
10522
10523 return SDValue(NewNode, 0);
10524 }
10525
10526 if (BaseOpcode->AtomicX2) {
10528 DAG.ExtractVectorElements(SDValue(NewNode, 0), Elt, 0, 1);
10529 return DAG.getMergeValues({Elt[0], SDValue(NewNode, 1)}, DL);
10530 }
10531
10532 return constructRetValue(DAG, NewNode, OrigResultTypes, IsTexFail,
10533 Subtarget->hasUnpackedD16VMem(), IsD16, DMaskLanes,
10534 NumVDataDwords, IsAtomicPacked16Bit, DL);
10535}
10536
10537SDValue SITargetLowering::lowerSBuffer(EVT VT, SDLoc DL, SDValue Rsrc,
10538 SDValue Offset, SDValue CachePolicy,
10539 SelectionDAG &DAG) const {
10540 MachineFunction &MF = DAG.getMachineFunction();
10541
10542 const DataLayout &DataLayout = DAG.getDataLayout();
10543 Align Alignment =
10544 DataLayout.getABITypeAlign(VT.getTypeForEVT(*DAG.getContext()));
10545
10546 MachineMemOperand *MMO = MF.getMachineMemOperand(
10547 MachinePointerInfo(),
10550 VT.getStoreSize(), Alignment);
10551
10552 if (!Offset->isDivergent()) {
10553 SDValue Ops[] = {Rsrc, Offset, CachePolicy};
10554
10555 // Lower llvm.amdgcn.s.buffer.load.{i16, u16} intrinsics. Initially, the
10556 // s_buffer_load_u16 instruction is emitted for both signed and unsigned
10557 // loads. Later, the DAG combiner tries to combine s_buffer_load_u16 with sext
10558 // and generates s_buffer_load_i16 (performSignExtendInRegCombine).
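// For example (illustrative): an i16-typed s.buffer.load with a uniform
// offset becomes an SBUFFER_LOAD_USHORT node producing i32 and is truncated
// back to i16 below; a later sign-extending user then lets
// performSignExtendInRegCombine switch it to the signed s_buffer_load_i16.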
10559 if (VT == MVT::i16 && Subtarget->hasScalarSubwordLoads()) {
10560 SDValue BufferLoad =
10561 DAG.getMemIntrinsicNode(AMDGPUISD::SBUFFER_LOAD_USHORT, DL,
10562 DAG.getVTList(MVT::i32), Ops, VT, MMO);
10563 return DAG.getNode(ISD::TRUNCATE, DL, VT, BufferLoad);
10564 }
10565
10566 // Widen vec3 load to vec4.
10567 if (VT.isVector() && VT.getVectorNumElements() == 3 &&
10568 !Subtarget->hasScalarDwordx3Loads()) {
10569 EVT WidenedVT =
10571 auto WidenedOp = DAG.getMemIntrinsicNode(
10572 AMDGPUISD::SBUFFER_LOAD, DL, DAG.getVTList(WidenedVT), Ops, WidenedVT,
10573 MF.getMachineMemOperand(MMO, 0, WidenedVT.getStoreSize()));
10574 auto Subvector = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, VT, WidenedOp,
10575 DAG.getVectorIdxConstant(0, DL));
10576 return Subvector;
10577 }
10578
10579 return DAG.getMemIntrinsicNode(AMDGPUISD::SBUFFER_LOAD, DL,
10580 DAG.getVTList(VT), Ops, VT, MMO);
10581 }
10582
10583 // We have a divergent offset. Emit a MUBUF buffer load instead. We can
10584 // assume that the buffer is unswizzled.
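// For example (illustrative): an s.buffer.load of v8f32 with a divergent
// offset is split below into two BUFFER_LOAD nodes of v4f32 at instruction
// offsets 0 and 16, with the base offset aligned to 32 bytes so both
// immediates fit, and the pieces are rejoined with CONCAT_VECTORS.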
10585 SDValue Ops[] = {
10586 DAG.getEntryNode(), // Chain
10587 Rsrc, // rsrc
10588 DAG.getConstant(0, DL, MVT::i32), // vindex
10589 {}, // voffset
10590 {}, // soffset
10591 {}, // offset
10592 CachePolicy, // cachepolicy
10593 DAG.getTargetConstant(0, DL, MVT::i1), // idxen
10594 };
10595 if (VT == MVT::i16 && Subtarget->hasScalarSubwordLoads()) {
10596 setBufferOffsets(Offset, DAG, &Ops[3], Align(4));
10597 return handleByteShortBufferLoads(DAG, VT, DL, Ops, MMO);
10598 }
10599
10601 unsigned NumLoads = 1;
10602 MVT LoadVT = VT.getSimpleVT();
10603 unsigned NumElts = LoadVT.isVector() ? LoadVT.getVectorNumElements() : 1;
10604 assert((LoadVT.getScalarType() == MVT::i32 ||
10605 LoadVT.getScalarType() == MVT::f32));
10606
10607 if (NumElts == 8 || NumElts == 16) {
10608 NumLoads = NumElts / 4;
10609 LoadVT = MVT::getVectorVT(LoadVT.getScalarType(), 4);
10610 }
10611
10612 SDVTList VTList = DAG.getVTList({LoadVT, MVT::Other});
10613
10614 // Use the alignment to ensure that the required offsets will fit into the
10615 // immediate offsets.
10616 setBufferOffsets(Offset, DAG, &Ops[3],
10617 NumLoads > 1 ? Align(16 * NumLoads) : Align(4));
10618
10619 uint64_t InstOffset = Ops[5]->getAsZExtVal();
10620 unsigned LoadSize = LoadVT.getStoreSize();
10621 for (unsigned i = 0; i < NumLoads; ++i) {
10622 Ops[5] = DAG.getTargetConstant(InstOffset + 16 * i, DL, MVT::i32);
10623 MachineMemOperand *LoadMMO = MF.getMachineMemOperand(MMO, 16 * i, LoadSize);
10624 Loads.push_back(getMemIntrinsicNode(AMDGPUISD::BUFFER_LOAD, DL, VTList, Ops,
10625 LoadVT, LoadMMO, DAG));
10626 }
10627
10628 if (NumElts == 8 || NumElts == 16)
10629 return DAG.getNode(ISD::CONCAT_VECTORS, DL, VT, Loads);
10630
10631 return Loads[0];
10632}
10633
10634SDValue SITargetLowering::lowerWaveID(SelectionDAG &DAG, SDValue Op) const {
10635 // With architected SGPRs, waveIDinGroup is in TTMP8[29:25].
10636 if (!Subtarget->hasArchitectedSGPRs())
10637 return {};
10638 SDLoc SL(Op);
10639 MVT VT = MVT::i32;
10640 SDValue TTMP8 = DAG.getCopyFromReg(DAG.getEntryNode(), SL, AMDGPU::TTMP8, VT);
10641 return DAG.getNode(AMDGPUISD::BFE_U32, SL, VT, TTMP8,
10642 DAG.getConstant(25, SL, VT), DAG.getConstant(5, SL, VT));
10643}
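// Equivalently (illustrative): the BFE_U32 above computes
//   (ttmp8 >> 25) & 0x1f
// i.e. a zero-extended read of the 5-bit wave-in-group ID field.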
10644
10645SDValue SITargetLowering::lowerConstHwRegRead(SelectionDAG &DAG, SDValue Op,
10646 AMDGPU::Hwreg::Id HwReg,
10647 unsigned LowBit,
10648 unsigned Width) const {
10649 SDLoc SL(Op);
10650 using namespace AMDGPU::Hwreg;
10651 return {DAG.getMachineNode(
10652 AMDGPU::S_GETREG_B32_const, SL, MVT::i32,
10653 DAG.getTargetConstant(HwregEncoding::encode(HwReg, LowBit, Width),
10654 SL, MVT::i32)),
10655 0};
10656}
10657
10658SDValue SITargetLowering::lowerWorkitemID(SelectionDAG &DAG, SDValue Op,
10659 unsigned Dim,
10660 const ArgDescriptor &Arg) const {
10661 SDLoc SL(Op);
10662 MachineFunction &MF = DAG.getMachineFunction();
10663 unsigned MaxID = Subtarget->getMaxWorkitemID(MF.getFunction(), Dim);
10664 if (MaxID == 0)
10665 return DAG.getConstant(0, SL, MVT::i32);
10666
10667 // It's undefined behavior if a function marked with the amdgpu-no-*
10668 // attributes uses the corresponding intrinsic.
10669 if (!Arg)
10670 return DAG.getPOISON(Op->getValueType(0));
10671
10672 SDValue Val = loadInputValue(DAG, &AMDGPU::VGPR_32RegClass, MVT::i32,
10673 SDLoc(DAG.getEntryNode()), Arg);
10674
10675 // Don't bother inserting AssertZext for packed IDs since we're emitting the
10676 // masking operations anyway.
10677 //
10678 // TODO: We could assert the top bit is 0 for the source copy.
10679 if (Arg.isMasked())
10680 return Val;
10681
10682 // Preserve the known bits after expansion to a copy.
10683 EVT SmallVT = EVT::getIntegerVT(*DAG.getContext(), llvm::bit_width(MaxID));
10684 return DAG.getNode(ISD::AssertZext, SL, MVT::i32, Val,
10685 DAG.getValueType(SmallVT));
10686}
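// Worked example (illustrative): if the known work-group size bounds this
// dimension so that MaxID = 1023, then bit_width(1023) = 10 and the copy is
// wrapped in AssertZext with an i10 value type, telling later combines that
// bits 31..10 of the workitem ID are known to be zero.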
10687
10688SDValue SITargetLowering::LowerINTRINSIC_WO_CHAIN(SDValue Op,
10689 SelectionDAG &DAG) const {
10690 MachineFunction &MF = DAG.getMachineFunction();
10691 auto *MFI = MF.getInfo<SIMachineFunctionInfo>();
10692
10693 EVT VT = Op.getValueType();
10694 SDLoc DL(Op);
10695 unsigned IntrinsicID = Op.getConstantOperandVal(0);
10696
10697 // TODO: Should this propagate fast-math-flags?
10698
10699 switch (IntrinsicID) {
10700 case Intrinsic::amdgcn_implicit_buffer_ptr: {
10701 if (getSubtarget()->isAmdHsaOrMesa(MF.getFunction()))
10702 return emitNonHSAIntrinsicError(DAG, DL, VT);
10703 return getPreloadedValue(DAG, *MFI, VT,
10705 }
10706 case Intrinsic::amdgcn_dispatch_ptr:
10707 case Intrinsic::amdgcn_queue_ptr: {
10708 if (!Subtarget->isAmdHsaOrMesa(MF.getFunction())) {
10709 DAG.getContext()->diagnose(DiagnosticInfoUnsupported(
10710 MF.getFunction(), "unsupported hsa intrinsic without hsa target",
10711 DL.getDebugLoc()));
10712 return DAG.getPOISON(VT);
10713 }
10714
10715 auto RegID = IntrinsicID == Intrinsic::amdgcn_dispatch_ptr
10718 return getPreloadedValue(DAG, *MFI, VT, RegID);
10719 }
10720 case Intrinsic::amdgcn_implicitarg_ptr: {
10721 if (MFI->isEntryFunction())
10722 return getImplicitArgPtr(DAG, DL);
10723 return getPreloadedValue(DAG, *MFI, VT,
10725 }
10726 case Intrinsic::amdgcn_kernarg_segment_ptr: {
10727 if (!AMDGPU::isKernel(MF.getFunction())) {
10728 // This only makes sense to call in a kernel, so just lower to null.
10729 return DAG.getConstant(0, DL, VT);
10730 }
10731
10732 return getPreloadedValue(DAG, *MFI, VT,
10734 }
10735 case Intrinsic::amdgcn_dispatch_id: {
10736 return getPreloadedValue(DAG, *MFI, VT, AMDGPUFunctionArgInfo::DISPATCH_ID);
10737 }
10738 case Intrinsic::amdgcn_rcp:
10739 return DAG.getNode(AMDGPUISD::RCP, DL, VT, Op.getOperand(1));
10740 case Intrinsic::amdgcn_rsq:
10741 return DAG.getNode(AMDGPUISD::RSQ, DL, VT, Op.getOperand(1));
10742 case Intrinsic::amdgcn_rsq_legacy:
10743 if (Subtarget->getGeneration() >= AMDGPUSubtarget::VOLCANIC_ISLANDS)
10744 return emitRemovedIntrinsicError(DAG, DL, VT);
10745 return SDValue();
10746 case Intrinsic::amdgcn_rcp_legacy:
10747 if (Subtarget->getGeneration() >= AMDGPUSubtarget::VOLCANIC_ISLANDS)
10748 return emitRemovedIntrinsicError(DAG, DL, VT);
10749 return DAG.getNode(AMDGPUISD::RCP_LEGACY, DL, VT, Op.getOperand(1));
10750 case Intrinsic::amdgcn_rsq_clamp: {
10751 if (Subtarget->getGeneration() < AMDGPUSubtarget::VOLCANIC_ISLANDS)
10752 return DAG.getNode(AMDGPUISD::RSQ_CLAMP, DL, VT, Op.getOperand(1));
10753
10754 Type *Type = VT.getTypeForEVT(*DAG.getContext());
10755 APFloat Max = APFloat::getLargest(Type->getFltSemantics());
10756 APFloat Min = APFloat::getLargest(Type->getFltSemantics(), true);
10757
10758 SDValue Rsq = DAG.getNode(AMDGPUISD::RSQ, DL, VT, Op.getOperand(1));
10759 SDValue Tmp =
10760 DAG.getNode(ISD::FMINNUM, DL, VT, Rsq, DAG.getConstantFP(Max, DL, VT));
10761 return DAG.getNode(ISD::FMAXNUM, DL, VT, Tmp,
10762 DAG.getConstantFP(Min, DL, VT));
10763 }
10764 case Intrinsic::r600_read_ngroups_x:
10765 if (Subtarget->isAmdHsaOS())
10766 return emitNonHSAIntrinsicError(DAG, DL, VT);
10767
10768 return lowerKernargMemParameter(DAG, VT, VT, DL, DAG.getEntryNode(),
10770 false);
10771 case Intrinsic::r600_read_ngroups_y:
10772 if (Subtarget->isAmdHsaOS())
10773 return emitNonHSAIntrinsicError(DAG, DL, VT);
10774
10775 return lowerKernargMemParameter(DAG, VT, VT, DL, DAG.getEntryNode(),
10777 false);
10778 case Intrinsic::r600_read_ngroups_z:
10779 if (Subtarget->isAmdHsaOS())
10780 return emitNonHSAIntrinsicError(DAG, DL, VT);
10781
10782 return lowerKernargMemParameter(DAG, VT, VT, DL, DAG.getEntryNode(),
10784 false);
10785 case Intrinsic::r600_read_local_size_x:
10786 if (Subtarget->isAmdHsaOS())
10787 return emitNonHSAIntrinsicError(DAG, DL, VT);
10788
10789 return lowerImplicitZextParam(DAG, Op, MVT::i16,
10791 case Intrinsic::r600_read_local_size_y:
10792 if (Subtarget->isAmdHsaOS())
10793 return emitNonHSAIntrinsicError(DAG, DL, VT);
10794
10795 return lowerImplicitZextParam(DAG, Op, MVT::i16,
10797 case Intrinsic::r600_read_local_size_z:
10798 if (Subtarget->isAmdHsaOS())
10799 return emitNonHSAIntrinsicError(DAG, DL, VT);
10800
10801 return lowerImplicitZextParam(DAG, Op, MVT::i16,
10803 case Intrinsic::amdgcn_workgroup_id_x:
10804 return lowerWorkGroupId(DAG, *MFI, VT,
10808 case Intrinsic::amdgcn_workgroup_id_y:
10809 return lowerWorkGroupId(DAG, *MFI, VT,
10813 case Intrinsic::amdgcn_workgroup_id_z:
10814 return lowerWorkGroupId(DAG, *MFI, VT,
10818 case Intrinsic::amdgcn_cluster_id_x:
10819 return Subtarget->hasClusters()
10820 ? getPreloadedValue(DAG, *MFI, VT,
10822 : DAG.getPOISON(VT);
10823 case Intrinsic::amdgcn_cluster_id_y:
10824 return Subtarget->hasClusters()
10825 ? getPreloadedValue(DAG, *MFI, VT,
10827 : DAG.getPOISON(VT);
10828 case Intrinsic::amdgcn_cluster_id_z:
10829 return Subtarget->hasClusters()
10830 ? getPreloadedValue(DAG, *MFI, VT,
10832 : DAG.getPOISON(VT);
10833 case Intrinsic::amdgcn_cluster_workgroup_id_x:
10834 return Subtarget->hasClusters()
10835 ? getPreloadedValue(
10836 DAG, *MFI, VT,
10838 : DAG.getPOISON(VT);
10839 case Intrinsic::amdgcn_cluster_workgroup_id_y:
10840 return Subtarget->hasClusters()
10841 ? getPreloadedValue(
10842 DAG, *MFI, VT,
10844 : DAG.getPOISON(VT);
10845 case Intrinsic::amdgcn_cluster_workgroup_id_z:
10846 return Subtarget->hasClusters()
10847 ? getPreloadedValue(
10848 DAG, *MFI, VT,
10850 : DAG.getPOISON(VT);
10851 case Intrinsic::amdgcn_cluster_workgroup_flat_id:
10852 return Subtarget->hasClusters()
10853 ? lowerConstHwRegRead(DAG, Op, AMDGPU::Hwreg::ID_IB_STS2, 21, 4)
10854 : SDValue();
10855 case Intrinsic::amdgcn_cluster_workgroup_max_id_x:
10856 return Subtarget->hasClusters()
10857 ? getPreloadedValue(
10858 DAG, *MFI, VT,
10860 : DAG.getPOISON(VT);
10861 case Intrinsic::amdgcn_cluster_workgroup_max_id_y:
10862 return Subtarget->hasClusters()
10863 ? getPreloadedValue(
10864 DAG, *MFI, VT,
10866 : DAG.getPOISON(VT);
10867 case Intrinsic::amdgcn_cluster_workgroup_max_id_z:
10868 return Subtarget->hasClusters()
10869 ? getPreloadedValue(
10870 DAG, *MFI, VT,
10872 : DAG.getPOISON(VT);
10873 case Intrinsic::amdgcn_cluster_workgroup_max_flat_id:
10874 return Subtarget->hasClusters()
10875 ? getPreloadedValue(
10876 DAG, *MFI, VT,
10878 : DAG.getPOISON(VT);
10879 case Intrinsic::amdgcn_wave_id:
10880 return lowerWaveID(DAG, Op);
10881 case Intrinsic::amdgcn_lds_kernel_id: {
10882 if (MFI->isEntryFunction())
10883 return getLDSKernelId(DAG, DL);
10884 return getPreloadedValue(DAG, *MFI, VT,
10886 }
10887 case Intrinsic::amdgcn_workitem_id_x:
10888 return lowerWorkitemID(DAG, Op, 0, MFI->getArgInfo().WorkItemIDX);
10889 case Intrinsic::amdgcn_workitem_id_y:
10890 return lowerWorkitemID(DAG, Op, 1, MFI->getArgInfo().WorkItemIDY);
10891 case Intrinsic::amdgcn_workitem_id_z:
10892 return lowerWorkitemID(DAG, Op, 2, MFI->getArgInfo().WorkItemIDZ);
10893 case Intrinsic::amdgcn_wavefrontsize:
10894 return DAG.getConstant(MF.getSubtarget<GCNSubtarget>().getWavefrontSize(),
10895 SDLoc(Op), MVT::i32);
10896 case Intrinsic::amdgcn_s_buffer_load: {
10897 unsigned CPol = Op.getConstantOperandVal(3);
10898 // s_buffer_load, because of how it's optimized, can't be volatile,
10899 // so reject ones with the volatile bit set.
10900 if (CPol & ~((Subtarget->getGeneration() >= AMDGPUSubtarget::GFX12)
10903 return Op;
10904 return lowerSBuffer(VT, DL, Op.getOperand(1), Op.getOperand(2),
10905 Op.getOperand(3), DAG);
10906 }
10907 case Intrinsic::amdgcn_fdiv_fast:
10908 return lowerFDIV_FAST(Op, DAG);
10909 case Intrinsic::amdgcn_sin:
10910 return DAG.getNode(AMDGPUISD::SIN_HW, DL, VT, Op.getOperand(1));
10911
10912 case Intrinsic::amdgcn_cos:
10913 return DAG.getNode(AMDGPUISD::COS_HW, DL, VT, Op.getOperand(1));
10914
10915 case Intrinsic::amdgcn_mul_u24:
10916 return DAG.getNode(AMDGPUISD::MUL_U24, DL, VT, Op.getOperand(1),
10917 Op.getOperand(2));
10918 case Intrinsic::amdgcn_mul_i24:
10919 return DAG.getNode(AMDGPUISD::MUL_I24, DL, VT, Op.getOperand(1),
10920 Op.getOperand(2));
10921
10922 case Intrinsic::amdgcn_log_clamp: {
10923 if (Subtarget->getGeneration() < AMDGPUSubtarget::VOLCANIC_ISLANDS)
10924 return SDValue();
10925
10926 return emitRemovedIntrinsicError(DAG, DL, VT);
10927 }
10928 case Intrinsic::amdgcn_fract:
10929 return DAG.getNode(AMDGPUISD::FRACT, DL, VT, Op.getOperand(1));
10930
10931 case Intrinsic::amdgcn_class:
10932 return DAG.getNode(AMDGPUISD::FP_CLASS, DL, VT, Op.getOperand(1),
10933 Op.getOperand(2));
10934 case Intrinsic::amdgcn_div_fmas:
10935 return DAG.getNode(AMDGPUISD::DIV_FMAS, DL, VT, Op.getOperand(1),
10936 Op.getOperand(2), Op.getOperand(3), Op.getOperand(4));
10937
10938 case Intrinsic::amdgcn_div_fixup:
10939 return DAG.getNode(AMDGPUISD::DIV_FIXUP, DL, VT, Op.getOperand(1),
10940 Op.getOperand(2), Op.getOperand(3));
10941
10942 case Intrinsic::amdgcn_div_scale: {
10943 const ConstantSDNode *Param = cast<ConstantSDNode>(Op.getOperand(3));
10944
10945 // Translate to the operands expected by the machine instruction. The
10946 // first parameter must be the same as the first instruction.
10947 SDValue Numerator = Op.getOperand(1);
10948 SDValue Denominator = Op.getOperand(2);
10949
10950 // Note this order is opposite of the machine instruction's operations,
10951 // which is s0.f = Quotient, s1.f = Denominator, s2.f = Numerator. The
10952 // intrinsic has the numerator as the first operand to match a normal
10953 // division operation.
10954
10955 SDValue Src0 = Param->isAllOnes() ? Numerator : Denominator;
10956
10957 return DAG.getNode(AMDGPUISD::DIV_SCALE, DL, Op->getVTList(), Src0,
10958 Denominator, Numerator);
10959 }
10960 case Intrinsic::amdgcn_icmp: {
10961 // There is a Pat that handles this variant, so return it as-is.
10962 if (Op.getOperand(1).getValueType() == MVT::i1 &&
10963 Op.getConstantOperandVal(2) == 0 &&
10964 Op.getConstantOperandVal(3) == ICmpInst::Predicate::ICMP_NE)
10965 return Op;
10966 return lowerICMPIntrinsic(*this, Op.getNode(), DAG);
10967 }
10968 case Intrinsic::amdgcn_fcmp: {
10969 return lowerFCMPIntrinsic(*this, Op.getNode(), DAG);
10970 }
10971 case Intrinsic::amdgcn_ballot:
10972 return lowerBALLOTIntrinsic(*this, Op.getNode(), DAG);
10973 case Intrinsic::amdgcn_fmed3:
10974 return DAG.getNode(AMDGPUISD::FMED3, DL, VT, Op.getOperand(1),
10975 Op.getOperand(2), Op.getOperand(3));
10976 case Intrinsic::amdgcn_fdot2:
10977 return DAG.getNode(AMDGPUISD::FDOT2, DL, VT, Op.getOperand(1),
10978 Op.getOperand(2), Op.getOperand(3), Op.getOperand(4));
10979 case Intrinsic::amdgcn_fmul_legacy:
10980 return DAG.getNode(AMDGPUISD::FMUL_LEGACY, DL, VT, Op.getOperand(1),
10981 Op.getOperand(2));
10982 case Intrinsic::amdgcn_sbfe:
10983 return DAG.getNode(AMDGPUISD::BFE_I32, DL, VT, Op.getOperand(1),
10984 Op.getOperand(2), Op.getOperand(3));
10985 case Intrinsic::amdgcn_ubfe:
10986 return DAG.getNode(AMDGPUISD::BFE_U32, DL, VT, Op.getOperand(1),
10987 Op.getOperand(2), Op.getOperand(3));
10988 case Intrinsic::amdgcn_cvt_pkrtz:
10989 case Intrinsic::amdgcn_cvt_pknorm_i16:
10990 case Intrinsic::amdgcn_cvt_pknorm_u16:
10991 case Intrinsic::amdgcn_cvt_pk_i16:
10992 case Intrinsic::amdgcn_cvt_pk_u16: {
10993 // FIXME: Stop adding cast if v2f16/v2i16 are legal.
10994 EVT VT = Op.getValueType();
10995 unsigned Opcode;
10996
10997 if (IntrinsicID == Intrinsic::amdgcn_cvt_pkrtz)
10998 Opcode = AMDGPUISD::CVT_PKRTZ_F16_F32;
10999 else if (IntrinsicID == Intrinsic::amdgcn_cvt_pknorm_i16)
11000 Opcode = AMDGPUISD::CVT_PKNORM_I16_F32;
11001 else if (IntrinsicID == Intrinsic::amdgcn_cvt_pknorm_u16)
11002 Opcode = AMDGPUISD::CVT_PKNORM_U16_F32;
11003 else if (IntrinsicID == Intrinsic::amdgcn_cvt_pk_i16)
11004 Opcode = AMDGPUISD::CVT_PK_I16_I32;
11005 else
11006 Opcode = AMDGPUISD::CVT_PK_U16_U32;
11007
11008 if (isTypeLegal(VT))
11009 return DAG.getNode(Opcode, DL, VT, Op.getOperand(1), Op.getOperand(2));
11010
11011 SDValue Node =
11012 DAG.getNode(Opcode, DL, MVT::i32, Op.getOperand(1), Op.getOperand(2));
11013 return DAG.getNode(ISD::BITCAST, DL, VT, Node);
11014 }
11015 case Intrinsic::amdgcn_fmad_ftz:
11016 return DAG.getNode(AMDGPUISD::FMAD_FTZ, DL, VT, Op.getOperand(1),
11017 Op.getOperand(2), Op.getOperand(3));
11018
11019 case Intrinsic::amdgcn_if_break:
11020 return SDValue(DAG.getMachineNode(AMDGPU::SI_IF_BREAK, DL, VT,
11021 Op->getOperand(1), Op->getOperand(2)),
11022 0);
11023
11024 case Intrinsic::amdgcn_groupstaticsize: {
11026 if (OS == Triple::AMDHSA || OS == Triple::AMDPAL)
11027 return Op;
11028
11029 const Module *M = MF.getFunction().getParent();
11030 const GlobalValue *GV =
11031 Intrinsic::getDeclarationIfExists(M, Intrinsic::amdgcn_groupstaticsize);
11032 SDValue GA = DAG.getTargetGlobalAddress(GV, DL, MVT::i32, 0,
11034 return {DAG.getMachineNode(AMDGPU::S_MOV_B32, DL, MVT::i32, GA), 0};
11035 }
11036 case Intrinsic::amdgcn_is_shared:
11037 case Intrinsic::amdgcn_is_private: {
11038 SDLoc SL(Op);
11039 SDValue SrcVec =
11040 DAG.getNode(ISD::BITCAST, DL, MVT::v2i32, Op.getOperand(1));
11041 SDValue SrcHi = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SL, MVT::i32, SrcVec,
11042 DAG.getConstant(1, SL, MVT::i32));
11043
11044 unsigned AS = (IntrinsicID == Intrinsic::amdgcn_is_shared)
11046 : AMDGPUAS::PRIVATE_ADDRESS;
11047 if (AS == AMDGPUAS::PRIVATE_ADDRESS &&
11048 Subtarget->hasGloballyAddressableScratch()) {
11049 SDValue FlatScratchBaseHi(
11050 DAG.getMachineNode(
11051 AMDGPU::S_MOV_B32, DL, MVT::i32,
11052 DAG.getRegister(AMDGPU::SRC_FLAT_SCRATCH_BASE_HI, MVT::i32)),
11053 0);
11054 // Test bits 63..58 against the aperture address.
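// Equivalently (illustrative): (SrcHi ^ FlatScratchBaseHi) u< 2^26 holds
// exactly when bits 31..26 of the two high dwords match, i.e. bits 63..58
// of the pointer equal the scratch base, giving a branch-free
// same-aperture test.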
11055 return DAG.getSetCC(
11056 SL, MVT::i1,
11057 DAG.getNode(ISD::XOR, SL, MVT::i32, SrcHi, FlatScratchBaseHi),
11058 DAG.getConstant(1u << 26, SL, MVT::i32), ISD::SETULT);
11059 }
11060
11061 SDValue Aperture = getSegmentAperture(AS, SL, DAG);
11062 return DAG.getSetCC(SL, MVT::i1, SrcHi, Aperture, ISD::SETEQ);
11063 }
11064 case Intrinsic::amdgcn_perm:
11065 return DAG.getNode(AMDGPUISD::PERM, DL, MVT::i32, Op.getOperand(1),
11066 Op.getOperand(2), Op.getOperand(3));
11067 case Intrinsic::amdgcn_reloc_constant: {
11068 Module *M = MF.getFunction().getParent();
11069 const MDNode *Metadata = cast<MDNodeSDNode>(Op.getOperand(1))->getMD();
11070 auto SymbolName = cast<MDString>(Metadata->getOperand(0))->getString();
11071 auto *RelocSymbol = cast<GlobalVariable>(
11072 M->getOrInsertGlobal(SymbolName, Type::getInt32Ty(M->getContext())));
11073 SDValue GA = DAG.getTargetGlobalAddress(RelocSymbol, DL, MVT::i32, 0,
11075 return {DAG.getMachineNode(AMDGPU::S_MOV_B32, DL, MVT::i32, GA), 0};
11076 }
11077 case Intrinsic::amdgcn_swmmac_f16_16x16x32_f16:
11078 case Intrinsic::amdgcn_swmmac_bf16_16x16x32_bf16:
11079 case Intrinsic::amdgcn_swmmac_f32_16x16x32_bf16:
11080 case Intrinsic::amdgcn_swmmac_f32_16x16x32_f16:
11081 case Intrinsic::amdgcn_swmmac_f32_16x16x32_fp8_fp8:
11082 case Intrinsic::amdgcn_swmmac_f32_16x16x32_fp8_bf8:
11083 case Intrinsic::amdgcn_swmmac_f32_16x16x32_bf8_fp8:
11084 case Intrinsic::amdgcn_swmmac_f32_16x16x32_bf8_bf8: {
11085 if (Op.getOperand(4).getValueType() == MVT::i32)
11086 return SDValue();
11087
11088 SDLoc SL(Op);
11089 auto IndexKeyi32 = DAG.getAnyExtOrTrunc(Op.getOperand(4), SL, MVT::i32);
11090 return DAG.getNode(ISD::INTRINSIC_WO_CHAIN, SL, Op.getValueType(),
11091 Op.getOperand(0), Op.getOperand(1), Op.getOperand(2),
11092 Op.getOperand(3), IndexKeyi32);
11093 }
11094 case Intrinsic::amdgcn_swmmac_f32_16x16x128_fp8_fp8:
11095 case Intrinsic::amdgcn_swmmac_f32_16x16x128_fp8_bf8:
11096 case Intrinsic::amdgcn_swmmac_f32_16x16x128_bf8_fp8:
11097 case Intrinsic::amdgcn_swmmac_f32_16x16x128_bf8_bf8:
11098 case Intrinsic::amdgcn_swmmac_f16_16x16x128_fp8_fp8:
11099 case Intrinsic::amdgcn_swmmac_f16_16x16x128_fp8_bf8:
11100 case Intrinsic::amdgcn_swmmac_f16_16x16x128_bf8_fp8:
11101 case Intrinsic::amdgcn_swmmac_f16_16x16x128_bf8_bf8: {
11102 if (Op.getOperand(4).getValueType() == MVT::i64)
11103 return SDValue();
11104
11105 SDLoc SL(Op);
11106 auto IndexKeyi64 =
11107 Op.getOperand(4).getValueType() == MVT::v2i32
11108 ? DAG.getBitcast(MVT::i64, Op.getOperand(4))
11109 : DAG.getAnyExtOrTrunc(Op.getOperand(4), SL, MVT::i64);
11110 return DAG.getNode(ISD::INTRINSIC_WO_CHAIN, SL, Op.getValueType(),
11111 {Op.getOperand(0), Op.getOperand(1), Op.getOperand(2),
11112 Op.getOperand(3), IndexKeyi64, Op.getOperand(5),
11113 Op.getOperand(6)});
11114 }
11115 case Intrinsic::amdgcn_swmmac_f16_16x16x64_f16:
11116 case Intrinsic::amdgcn_swmmac_bf16_16x16x64_bf16:
11117 case Intrinsic::amdgcn_swmmac_f32_16x16x64_bf16:
11118 case Intrinsic::amdgcn_swmmac_bf16f32_16x16x64_bf16:
11119 case Intrinsic::amdgcn_swmmac_f32_16x16x64_f16:
11120 case Intrinsic::amdgcn_swmmac_i32_16x16x128_iu8: {
11121 EVT IndexKeyTy = IntrinsicID == Intrinsic::amdgcn_swmmac_i32_16x16x128_iu8
11122 ? MVT::i64
11123 : MVT::i32;
11124 if (Op.getOperand(6).getValueType() == IndexKeyTy)
11125 return SDValue();
11126
11127 SDLoc SL(Op);
11128 auto IndexKey =
11129 Op.getOperand(6).getValueType().isVector()
11130 ? DAG.getBitcast(IndexKeyTy, Op.getOperand(6))
11131 : DAG.getAnyExtOrTrunc(Op.getOperand(6), SL, IndexKeyTy);
11133 Op.getOperand(0), Op.getOperand(1), Op.getOperand(2),
11134 Op.getOperand(3), Op.getOperand(4), Op.getOperand(5),
11135 IndexKey, Op.getOperand(7), Op.getOperand(8)};
11136 if (IntrinsicID == Intrinsic::amdgcn_swmmac_i32_16x16x128_iu8)
11137 Args.push_back(Op.getOperand(9));
11138 return DAG.getNode(ISD::INTRINSIC_WO_CHAIN, SL, Op.getValueType(), Args);
11139 }
11140 case Intrinsic::amdgcn_swmmac_i32_16x16x32_iu4:
11141 case Intrinsic::amdgcn_swmmac_i32_16x16x32_iu8:
11142 case Intrinsic::amdgcn_swmmac_i32_16x16x64_iu4: {
11143 if (Op.getOperand(6).getValueType() == MVT::i32)
11144 return SDValue();
11145
11146 SDLoc SL(Op);
11147 auto IndexKeyi32 = DAG.getAnyExtOrTrunc(Op.getOperand(6), SL, MVT::i32);
11148 return DAG.getNode(ISD::INTRINSIC_WO_CHAIN, SL, Op.getValueType(),
11149 {Op.getOperand(0), Op.getOperand(1), Op.getOperand(2),
11150 Op.getOperand(3), Op.getOperand(4), Op.getOperand(5),
11151 IndexKeyi32, Op.getOperand(7)});
11152 }
11153 case Intrinsic::amdgcn_addrspacecast_nonnull:
11154 return lowerADDRSPACECAST(Op, DAG);
11155 case Intrinsic::amdgcn_readlane:
11156 case Intrinsic::amdgcn_readfirstlane:
11157 case Intrinsic::amdgcn_writelane:
11158 case Intrinsic::amdgcn_permlane16:
11159 case Intrinsic::amdgcn_permlanex16:
11160 case Intrinsic::amdgcn_permlane64:
11161 case Intrinsic::amdgcn_set_inactive:
11162 case Intrinsic::amdgcn_set_inactive_chain_arg:
11163 case Intrinsic::amdgcn_mov_dpp8:
11164 case Intrinsic::amdgcn_update_dpp:
11165 case Intrinsic::amdgcn_permlane_bcast:
11166 case Intrinsic::amdgcn_permlane_up:
11167 case Intrinsic::amdgcn_permlane_down:
11168 case Intrinsic::amdgcn_permlane_xor:
11169 return lowerLaneOp(*this, Op.getNode(), DAG);
11170 case Intrinsic::amdgcn_dead: {
11172 for (const EVT ValTy : Op.getNode()->values())
11173 Poisons.push_back(DAG.getPOISON(ValTy));
11174 return DAG.getMergeValues(Poisons, SDLoc(Op));
11175 }
11176 case Intrinsic::amdgcn_wave_shuffle:
11177 return lowerWaveShuffle(*this, Op.getNode(), DAG);
11178 default:
11179 if (const AMDGPU::ImageDimIntrinsicInfo *ImageDimIntr =
11181 return lowerImage(Op, ImageDimIntr, DAG, false);
11182
11183 return Op;
11184 }
11185}
11186
11187 // On targets that do not support a constant in the soffset field, turn a
11188 // zero offset into SGPR_NULL to avoid generating an extra s_mov of zero.
11190 const GCNSubtarget *Subtarget) {
11191 if (Subtarget->hasRestrictedSOffset() && isNullConstant(SOffset))
11192 return DAG.getRegister(AMDGPU::SGPR_NULL, MVT::i32);
11193 return SOffset;
11194}
11195
11196SDValue SITargetLowering::lowerRawBufferAtomicIntrin(SDValue Op,
11197 SelectionDAG &DAG,
11198 unsigned NewOpcode) const {
11199 SDLoc DL(Op);
11200
11201 SDValue VData = Op.getOperand(2);
11202 SDValue Rsrc = bufferRsrcPtrToVector(Op.getOperand(3), DAG);
11203 auto [VOffset, Offset] = splitBufferOffsets(Op.getOperand(4), DAG);
11204 auto SOffset = selectSOffset(Op.getOperand(5), DAG, Subtarget);
11205 SDValue Ops[] = {
11206 Op.getOperand(0), // Chain
11207 VData, // vdata
11208 Rsrc, // rsrc
11209 DAG.getConstant(0, DL, MVT::i32), // vindex
11210 VOffset, // voffset
11211 SOffset, // soffset
11212 Offset, // offset
11213 Op.getOperand(6), // cachepolicy
11214 DAG.getTargetConstant(0, DL, MVT::i1), // idxen
11215 };
11216
11217 auto *M = cast<MemSDNode>(Op);
11218
11219 EVT MemVT = VData.getValueType();
11220 return DAG.getMemIntrinsicNode(NewOpcode, DL, Op->getVTList(), Ops, MemVT,
11221 M->getMemOperand());
11222}
11223
11224SDValue
11225SITargetLowering::lowerStructBufferAtomicIntrin(SDValue Op, SelectionDAG &DAG,
11226 unsigned NewOpcode) const {
11227 SDLoc DL(Op);
11228
11229 SDValue VData = Op.getOperand(2);
11230 SDValue Rsrc = bufferRsrcPtrToVector(Op.getOperand(3), DAG);
11231 auto [VOffset, Offset] = splitBufferOffsets(Op.getOperand(5), DAG);
11232 auto SOffset = selectSOffset(Op.getOperand(6), DAG, Subtarget);
11233 SDValue Ops[] = {
11234 Op.getOperand(0), // Chain
11235 VData, // vdata
11236 Rsrc, // rsrc
11237 Op.getOperand(4), // vindex
11238 VOffset, // voffset
11239 SOffset, // soffset
11240 Offset, // offset
11241 Op.getOperand(7), // cachepolicy
11242 DAG.getTargetConstant(1, DL, MVT::i1), // idxen
11243 };
11244
11245 auto *M = cast<MemSDNode>(Op);
11246
11247 EVT MemVT = VData.getValueType();
11248 return DAG.getMemIntrinsicNode(NewOpcode, DL, Op->getVTList(), Ops, MemVT,
11249 M->getMemOperand());
11250}
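// Note (illustrative summary): the raw and struct atomic helpers above build
// the same operand list; the struct form takes its vindex from the intrinsic
// and sets idxen to 1 (shifting the offset, soffset, and cachepolicy operands
// down by one), while the raw form uses a constant-zero vindex with idxen 0.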
11251
11252SDValue SITargetLowering::LowerINTRINSIC_W_CHAIN(SDValue Op,
11253 SelectionDAG &DAG) const {
11254 unsigned IntrID = Op.getConstantOperandVal(1);
11255 SDLoc DL(Op);
11256
11257 switch (IntrID) {
11258 case Intrinsic::amdgcn_ds_ordered_add:
11259 case Intrinsic::amdgcn_ds_ordered_swap: {
11260 MemSDNode *M = cast<MemSDNode>(Op);
11261 SDValue Chain = M->getOperand(0);
11262 SDValue M0 = M->getOperand(2);
11263 SDValue Value = M->getOperand(3);
11264 unsigned IndexOperand = M->getConstantOperandVal(7);
11265 unsigned WaveRelease = M->getConstantOperandVal(8);
11266 unsigned WaveDone = M->getConstantOperandVal(9);
11267
11268 unsigned OrderedCountIndex = IndexOperand & 0x3f;
11269 IndexOperand &= ~0x3f;
11270 unsigned CountDw = 0;
11271
11272 if (Subtarget->getGeneration() >= AMDGPUSubtarget::GFX10) {
11273 CountDw = (IndexOperand >> 24) & 0xf;
11274 IndexOperand &= ~(0xf << 24);
11275
11276 if (CountDw < 1 || CountDw > 4) {
11277 const Function &Fn = DAG.getMachineFunction().getFunction();
11278 DAG.getContext()->diagnose(DiagnosticInfoUnsupported(
11279 Fn, "ds_ordered_count: dword count must be between 1 and 4",
11280 DL.getDebugLoc()));
11281 CountDw = 1;
11282 }
11283 }
11284
11285 if (IndexOperand) {
11286 const Function &Fn = DAG.getMachineFunction().getFunction();
11287 DAG.getContext()->diagnose(DiagnosticInfoUnsupported(
11288 Fn, "ds_ordered_count: bad index operand", DL.getDebugLoc()));
11289 }
11290
11291 if (WaveDone && !WaveRelease) {
11292 // TODO: Move this to IR verifier
11293 const Function &Fn = DAG.getMachineFunction().getFunction();
11294 DAG.getContext()->diagnose(DiagnosticInfoUnsupported(
11295 Fn, "ds_ordered_count: wave_done requires wave_release",
11296 DL.getDebugLoc()));
11297 }
11298
11299 unsigned Instruction = IntrID == Intrinsic::amdgcn_ds_ordered_add ? 0 : 1;
11300 unsigned ShaderType =
11302 unsigned Offset0 = OrderedCountIndex << 2;
11303 unsigned Offset1 = WaveRelease | (WaveDone << 1) | (Instruction << 4);
11304
11305 if (Subtarget->getGeneration() >= AMDGPUSubtarget::GFX10)
11306 Offset1 |= (CountDw - 1) << 6;
11307
11308 if (Subtarget->getGeneration() < AMDGPUSubtarget::GFX11)
11309 Offset1 |= ShaderType << 2;
11310
11311 unsigned Offset = Offset0 | (Offset1 << 8);
11312
11313 SDValue Ops[] = {
11314 Chain, Value, DAG.getTargetConstant(Offset, DL, MVT::i16),
11315 copyToM0(DAG, Chain, DL, M0).getValue(1), // Glue
11316 };
11317 return DAG.getMemIntrinsicNode(AMDGPUISD::DS_ORDERED_COUNT, DL,
11318 M->getVTList(), Ops, M->getMemoryVT(),
11319 M->getMemOperand());
11320 }
11321 case Intrinsic::amdgcn_raw_buffer_load:
11322 case Intrinsic::amdgcn_raw_ptr_buffer_load:
11323 case Intrinsic::amdgcn_raw_atomic_buffer_load:
11324 case Intrinsic::amdgcn_raw_ptr_atomic_buffer_load:
11325 case Intrinsic::amdgcn_raw_buffer_load_format:
11326 case Intrinsic::amdgcn_raw_ptr_buffer_load_format: {
11327 const bool IsFormat =
11328 IntrID == Intrinsic::amdgcn_raw_buffer_load_format ||
11329 IntrID == Intrinsic::amdgcn_raw_ptr_buffer_load_format;
11330
11331 SDValue Rsrc = bufferRsrcPtrToVector(Op.getOperand(2), DAG);
11332 auto [VOffset, Offset] = splitBufferOffsets(Op.getOperand(3), DAG);
11333 auto SOffset = selectSOffset(Op.getOperand(4), DAG, Subtarget);
11334 SDValue Ops[] = {
11335 Op.getOperand(0), // Chain
11336 Rsrc, // rsrc
11337 DAG.getConstant(0, DL, MVT::i32), // vindex
11338 VOffset, // voffset
11339 SOffset, // soffset
11340 Offset, // offset
11341 Op.getOperand(5), // cachepolicy, swizzled buffer
11342 DAG.getTargetConstant(0, DL, MVT::i1), // idxen
11343 };
11344
11345 auto *M = cast<MemSDNode>(Op);
11346 return lowerIntrinsicLoad(M, IsFormat, DAG, Ops);
11347 }
11348 case Intrinsic::amdgcn_struct_buffer_load:
11349 case Intrinsic::amdgcn_struct_ptr_buffer_load:
11350 case Intrinsic::amdgcn_struct_buffer_load_format:
11351 case Intrinsic::amdgcn_struct_ptr_buffer_load_format:
11352 case Intrinsic::amdgcn_struct_atomic_buffer_load:
11353 case Intrinsic::amdgcn_struct_ptr_atomic_buffer_load: {
11354 const bool IsFormat =
11355 IntrID == Intrinsic::amdgcn_struct_buffer_load_format ||
11356 IntrID == Intrinsic::amdgcn_struct_ptr_buffer_load_format;
11357
11358 SDValue Rsrc = bufferRsrcPtrToVector(Op.getOperand(2), DAG);
11359 auto [VOffset, Offset] = splitBufferOffsets(Op.getOperand(4), DAG);
11360 auto SOffset = selectSOffset(Op.getOperand(5), DAG, Subtarget);
11361 SDValue Ops[] = {
11362 Op.getOperand(0), // Chain
11363 Rsrc, // rsrc
11364 Op.getOperand(3), // vindex
11365 VOffset, // voffset
11366 SOffset, // soffset
11367 Offset, // offset
11368 Op.getOperand(6), // cachepolicy, swizzled buffer
11369 DAG.getTargetConstant(1, DL, MVT::i1), // idxen
11370 };
11371
11372 return lowerIntrinsicLoad(cast<MemSDNode>(Op), IsFormat, DAG, Ops);
11373 }
11374 case Intrinsic::amdgcn_raw_tbuffer_load:
11375 case Intrinsic::amdgcn_raw_ptr_tbuffer_load: {
11376 MemSDNode *M = cast<MemSDNode>(Op);
11377 EVT LoadVT = Op.getValueType();
11378 SDValue Rsrc = bufferRsrcPtrToVector(Op.getOperand(2), DAG);
11379 auto [VOffset, Offset] = splitBufferOffsets(Op.getOperand(3), DAG);
11380 auto SOffset = selectSOffset(Op.getOperand(4), DAG, Subtarget);
11381
11382 SDValue Ops[] = {
11383 Op.getOperand(0), // Chain
11384 Rsrc, // rsrc
11385 DAG.getConstant(0, DL, MVT::i32), // vindex
11386 VOffset, // voffset
11387 SOffset, // soffset
11388 Offset, // offset
11389 Op.getOperand(5), // format
11390 Op.getOperand(6), // cachepolicy, swizzled buffer
11391 DAG.getTargetConstant(0, DL, MVT::i1), // idxen
11392 };
11393
11394 if (LoadVT.getScalarType() == MVT::f16)
11395 return adjustLoadValueType(AMDGPUISD::TBUFFER_LOAD_FORMAT_D16, M, DAG,
11396 Ops);
11397 return getMemIntrinsicNode(AMDGPUISD::TBUFFER_LOAD_FORMAT, DL,
11398 Op->getVTList(), Ops, LoadVT, M->getMemOperand(),
11399 DAG);
11400 }
11401 case Intrinsic::amdgcn_struct_tbuffer_load:
11402 case Intrinsic::amdgcn_struct_ptr_tbuffer_load: {
11403 MemSDNode *M = cast<MemSDNode>(Op);
11404 EVT LoadVT = Op.getValueType();
11405 SDValue Rsrc = bufferRsrcPtrToVector(Op.getOperand(2), DAG);
11406 auto [VOffset, Offset] = splitBufferOffsets(Op.getOperand(4), DAG);
11407 auto SOffset = selectSOffset(Op.getOperand(5), DAG, Subtarget);
11408
11409 SDValue Ops[] = {
11410 Op.getOperand(0), // Chain
11411 Rsrc, // rsrc
11412 Op.getOperand(3), // vindex
11413 VOffset, // voffset
11414 SOffset, // soffset
11415 Offset, // offset
11416 Op.getOperand(6), // format
11417 Op.getOperand(7), // cachepolicy, swizzled buffer
11418 DAG.getTargetConstant(1, DL, MVT::i1), // idxen
11419 };
11420
11421 if (LoadVT.getScalarType() == MVT::f16)
11422 return adjustLoadValueType(AMDGPUISD::TBUFFER_LOAD_FORMAT_D16, M, DAG,
11423 Ops);
11424 return getMemIntrinsicNode(AMDGPUISD::TBUFFER_LOAD_FORMAT, DL,
11425 Op->getVTList(), Ops, LoadVT, M->getMemOperand(),
11426 DAG);
11427 }
11428 case Intrinsic::amdgcn_raw_buffer_atomic_fadd:
11429 case Intrinsic::amdgcn_raw_ptr_buffer_atomic_fadd:
11430 return lowerRawBufferAtomicIntrin(Op, DAG, AMDGPUISD::BUFFER_ATOMIC_FADD);
11431 case Intrinsic::amdgcn_struct_buffer_atomic_fadd:
11432 case Intrinsic::amdgcn_struct_ptr_buffer_atomic_fadd:
11433 return lowerStructBufferAtomicIntrin(Op, DAG,
11434 AMDGPUISD::BUFFER_ATOMIC_FADD);
11435 case Intrinsic::amdgcn_raw_buffer_atomic_fmin:
11436 case Intrinsic::amdgcn_raw_ptr_buffer_atomic_fmin:
11437 return lowerRawBufferAtomicIntrin(Op, DAG, AMDGPUISD::BUFFER_ATOMIC_FMIN);
11438 case Intrinsic::amdgcn_struct_buffer_atomic_fmin:
11439 case Intrinsic::amdgcn_struct_ptr_buffer_atomic_fmin:
11440 return lowerStructBufferAtomicIntrin(Op, DAG,
11441 AMDGPUISD::BUFFER_ATOMIC_FMIN);
11442 case Intrinsic::amdgcn_raw_buffer_atomic_fmax:
11443 case Intrinsic::amdgcn_raw_ptr_buffer_atomic_fmax:
11444 return lowerRawBufferAtomicIntrin(Op, DAG, AMDGPUISD::BUFFER_ATOMIC_FMAX);
11445 case Intrinsic::amdgcn_struct_buffer_atomic_fmax:
11446 case Intrinsic::amdgcn_struct_ptr_buffer_atomic_fmax:
11447 return lowerStructBufferAtomicIntrin(Op, DAG,
11448 AMDGPUISD::BUFFER_ATOMIC_FMAX);
11449 case Intrinsic::amdgcn_raw_buffer_atomic_swap:
11450 case Intrinsic::amdgcn_raw_ptr_buffer_atomic_swap:
11451 return lowerRawBufferAtomicIntrin(Op, DAG, AMDGPUISD::BUFFER_ATOMIC_SWAP);
11452 case Intrinsic::amdgcn_raw_buffer_atomic_add:
11453 case Intrinsic::amdgcn_raw_ptr_buffer_atomic_add:
11454 return lowerRawBufferAtomicIntrin(Op, DAG, AMDGPUISD::BUFFER_ATOMIC_ADD);
11455 case Intrinsic::amdgcn_raw_buffer_atomic_sub:
11456 case Intrinsic::amdgcn_raw_ptr_buffer_atomic_sub:
11457 return lowerRawBufferAtomicIntrin(Op, DAG, AMDGPUISD::BUFFER_ATOMIC_SUB);
11458 case Intrinsic::amdgcn_raw_buffer_atomic_smin:
11459 case Intrinsic::amdgcn_raw_ptr_buffer_atomic_smin:
11460 return lowerRawBufferAtomicIntrin(Op, DAG, AMDGPUISD::BUFFER_ATOMIC_SMIN);
11461 case Intrinsic::amdgcn_raw_buffer_atomic_umin:
11462 case Intrinsic::amdgcn_raw_ptr_buffer_atomic_umin:
11463 return lowerRawBufferAtomicIntrin(Op, DAG, AMDGPUISD::BUFFER_ATOMIC_UMIN);
11464 case Intrinsic::amdgcn_raw_buffer_atomic_smax:
11465 case Intrinsic::amdgcn_raw_ptr_buffer_atomic_smax:
11466 return lowerRawBufferAtomicIntrin(Op, DAG, AMDGPUISD::BUFFER_ATOMIC_SMAX);
11467 case Intrinsic::amdgcn_raw_buffer_atomic_umax:
11468 case Intrinsic::amdgcn_raw_ptr_buffer_atomic_umax:
11469 return lowerRawBufferAtomicIntrin(Op, DAG, AMDGPUISD::BUFFER_ATOMIC_UMAX);
11470 case Intrinsic::amdgcn_raw_buffer_atomic_and:
11471 case Intrinsic::amdgcn_raw_ptr_buffer_atomic_and:
11472 return lowerRawBufferAtomicIntrin(Op, DAG, AMDGPUISD::BUFFER_ATOMIC_AND);
11473 case Intrinsic::amdgcn_raw_buffer_atomic_or:
11474 case Intrinsic::amdgcn_raw_ptr_buffer_atomic_or:
11475 return lowerRawBufferAtomicIntrin(Op, DAG, AMDGPUISD::BUFFER_ATOMIC_OR);
11476 case Intrinsic::amdgcn_raw_buffer_atomic_xor:
11477 case Intrinsic::amdgcn_raw_ptr_buffer_atomic_xor:
11478 return lowerRawBufferAtomicIntrin(Op, DAG, AMDGPUISD::BUFFER_ATOMIC_XOR);
11479 case Intrinsic::amdgcn_raw_buffer_atomic_inc:
11480 case Intrinsic::amdgcn_raw_ptr_buffer_atomic_inc:
11481 return lowerRawBufferAtomicIntrin(Op, DAG, AMDGPUISD::BUFFER_ATOMIC_INC);
11482 case Intrinsic::amdgcn_raw_buffer_atomic_dec:
11483 case Intrinsic::amdgcn_raw_ptr_buffer_atomic_dec:
11484 return lowerRawBufferAtomicIntrin(Op, DAG, AMDGPUISD::BUFFER_ATOMIC_DEC);
11485 case Intrinsic::amdgcn_struct_buffer_atomic_swap:
11486 case Intrinsic::amdgcn_struct_ptr_buffer_atomic_swap:
11487 return lowerStructBufferAtomicIntrin(Op, DAG,
11488 AMDGPUISD::BUFFER_ATOMIC_SWAP);
11489 case Intrinsic::amdgcn_struct_buffer_atomic_add:
11490 case Intrinsic::amdgcn_struct_ptr_buffer_atomic_add:
11491 return lowerStructBufferAtomicIntrin(Op, DAG, AMDGPUISD::BUFFER_ATOMIC_ADD);
11492 case Intrinsic::amdgcn_struct_buffer_atomic_sub:
11493 case Intrinsic::amdgcn_struct_ptr_buffer_atomic_sub:
11494 return lowerStructBufferAtomicIntrin(Op, DAG, AMDGPUISD::BUFFER_ATOMIC_SUB);
11495 case Intrinsic::amdgcn_struct_buffer_atomic_smin:
11496 case Intrinsic::amdgcn_struct_ptr_buffer_atomic_smin:
11497 return lowerStructBufferAtomicIntrin(Op, DAG,
11498 AMDGPUISD::BUFFER_ATOMIC_SMIN);
11499 case Intrinsic::amdgcn_struct_buffer_atomic_umin:
11500 case Intrinsic::amdgcn_struct_ptr_buffer_atomic_umin:
11501 return lowerStructBufferAtomicIntrin(Op, DAG,
11502 AMDGPUISD::BUFFER_ATOMIC_UMIN);
11503 case Intrinsic::amdgcn_struct_buffer_atomic_smax:
11504 case Intrinsic::amdgcn_struct_ptr_buffer_atomic_smax:
11505 return lowerStructBufferAtomicIntrin(Op, DAG,
11506 AMDGPUISD::BUFFER_ATOMIC_SMAX);
11507 case Intrinsic::amdgcn_struct_buffer_atomic_umax:
11508 case Intrinsic::amdgcn_struct_ptr_buffer_atomic_umax:
11509 return lowerStructBufferAtomicIntrin(Op, DAG,
11510 AMDGPUISD::BUFFER_ATOMIC_UMAX);
11511 case Intrinsic::amdgcn_struct_buffer_atomic_and:
11512 case Intrinsic::amdgcn_struct_ptr_buffer_atomic_and:
11513 return lowerStructBufferAtomicIntrin(Op, DAG, AMDGPUISD::BUFFER_ATOMIC_AND);
11514 case Intrinsic::amdgcn_struct_buffer_atomic_or:
11515 case Intrinsic::amdgcn_struct_ptr_buffer_atomic_or:
11516 return lowerStructBufferAtomicIntrin(Op, DAG, AMDGPUISD::BUFFER_ATOMIC_OR);
11517 case Intrinsic::amdgcn_struct_buffer_atomic_xor:
11518 case Intrinsic::amdgcn_struct_ptr_buffer_atomic_xor:
11519 return lowerStructBufferAtomicIntrin(Op, DAG, AMDGPUISD::BUFFER_ATOMIC_XOR);
11520 case Intrinsic::amdgcn_struct_buffer_atomic_inc:
11521 case Intrinsic::amdgcn_struct_ptr_buffer_atomic_inc:
11522 return lowerStructBufferAtomicIntrin(Op, DAG, AMDGPUISD::BUFFER_ATOMIC_INC);
11523 case Intrinsic::amdgcn_struct_buffer_atomic_dec:
11524 case Intrinsic::amdgcn_struct_ptr_buffer_atomic_dec:
11525 return lowerStructBufferAtomicIntrin(Op, DAG, AMDGPUISD::BUFFER_ATOMIC_DEC);
11526 case Intrinsic::amdgcn_raw_buffer_atomic_sub_clamp_u32:
11527 case Intrinsic::amdgcn_raw_ptr_buffer_atomic_sub_clamp_u32:
11528 return lowerRawBufferAtomicIntrin(Op, DAG, AMDGPUISD::BUFFER_ATOMIC_CSUB);
11529 case Intrinsic::amdgcn_struct_buffer_atomic_sub_clamp_u32:
11530 case Intrinsic::amdgcn_struct_ptr_buffer_atomic_sub_clamp_u32:
11531 return lowerStructBufferAtomicIntrin(Op, DAG,
11532 AMDGPUISD::BUFFER_ATOMIC_CSUB);
11533 case Intrinsic::amdgcn_raw_buffer_atomic_cond_sub_u32:
11534 case Intrinsic::amdgcn_raw_ptr_buffer_atomic_cond_sub_u32:
11535 return lowerRawBufferAtomicIntrin(Op, DAG,
11536 AMDGPUISD::BUFFER_ATOMIC_COND_SUB_U32);
11537 case Intrinsic::amdgcn_struct_buffer_atomic_cond_sub_u32:
11538 case Intrinsic::amdgcn_struct_ptr_buffer_atomic_cond_sub_u32:
11539 return lowerStructBufferAtomicIntrin(Op, DAG,
11540 AMDGPUISD::BUFFER_ATOMIC_COND_SUB_U32);
11541 case Intrinsic::amdgcn_raw_buffer_atomic_cmpswap:
11542 case Intrinsic::amdgcn_raw_ptr_buffer_atomic_cmpswap: {
11543 SDValue Rsrc = bufferRsrcPtrToVector(Op.getOperand(4), DAG);
11544 auto [VOffset, Offset] = splitBufferOffsets(Op.getOperand(5), DAG);
11545 auto SOffset = selectSOffset(Op.getOperand(6), DAG, Subtarget);
11546 SDValue Ops[] = {
11547 Op.getOperand(0), // Chain
11548 Op.getOperand(2), // src
11549 Op.getOperand(3), // cmp
11550 Rsrc, // rsrc
11551 DAG.getConstant(0, DL, MVT::i32), // vindex
11552 VOffset, // voffset
11553 SOffset, // soffset
11554 Offset, // offset
11555 Op.getOperand(7), // cachepolicy
11556 DAG.getTargetConstant(0, DL, MVT::i1), // idxen
11557 };
11558 EVT VT = Op.getValueType();
11559 auto *M = cast<MemSDNode>(Op);
11560
11561 return DAG.getMemIntrinsicNode(AMDGPUISD::BUFFER_ATOMIC_CMPSWAP, DL,
11562 Op->getVTList(), Ops, VT,
11563 M->getMemOperand());
11564 }
11565 case Intrinsic::amdgcn_struct_buffer_atomic_cmpswap:
11566 case Intrinsic::amdgcn_struct_ptr_buffer_atomic_cmpswap: {
11567 SDValue Rsrc = bufferRsrcPtrToVector(Op->getOperand(4), DAG);
11568 auto [VOffset, Offset] = splitBufferOffsets(Op.getOperand(6), DAG);
11569 auto SOffset = selectSOffset(Op.getOperand(7), DAG, Subtarget);
11570 SDValue Ops[] = {
11571 Op.getOperand(0), // Chain
11572 Op.getOperand(2), // src
11573 Op.getOperand(3), // cmp
11574 Rsrc, // rsrc
11575 Op.getOperand(5), // vindex
11576 VOffset, // voffset
11577 SOffset, // soffset
11578 Offset, // offset
11579 Op.getOperand(8), // cachepolicy
11580 DAG.getTargetConstant(1, DL, MVT::i1), // idxen
11581 };
11582 EVT VT = Op.getValueType();
11583 auto *M = cast<MemSDNode>(Op);
11584
11585 return DAG.getMemIntrinsicNode(AMDGPUISD::BUFFER_ATOMIC_CMPSWAP, DL,
11586 Op->getVTList(), Ops, VT,
11587 M->getMemOperand());
11588 }
11589 case Intrinsic::amdgcn_image_bvh_dual_intersect_ray:
11590 case Intrinsic::amdgcn_image_bvh8_intersect_ray: {
11591 MemSDNode *M = cast<MemSDNode>(Op);
11592 SDValue NodePtr = M->getOperand(2);
11593 SDValue RayExtent = M->getOperand(3);
11594 SDValue InstanceMask = M->getOperand(4);
11595 SDValue RayOrigin = M->getOperand(5);
11596 SDValue RayDir = M->getOperand(6);
11597 SDValue Offsets = M->getOperand(7);
11598 SDValue TDescr = M->getOperand(8);
11599
11600 assert(NodePtr.getValueType() == MVT::i64);
11601 assert(RayDir.getValueType() == MVT::v3f32);
11602
11603 if (!Subtarget->hasBVHDualAndBVH8Insts()) {
11604 emitRemovedIntrinsicError(DAG, DL, Op.getValueType());
11605 return SDValue();
11606 }
11607
11608 bool IsBVH8 = IntrID == Intrinsic::amdgcn_image_bvh8_intersect_ray;
11609 const unsigned NumVDataDwords = 10;
11610 const unsigned NumVAddrDwords = IsBVH8 ? 11 : 12;
11611 int Opcode = AMDGPU::getMIMGOpcode(
11612 IsBVH8 ? AMDGPU::IMAGE_BVH8_INTERSECT_RAY
11613 : AMDGPU::IMAGE_BVH_DUAL_INTERSECT_RAY,
11614 AMDGPU::MIMGEncGfx12, NumVDataDwords, NumVAddrDwords);
11615 assert(Opcode != -1);
11616
11617 SmallVector<SDValue, 7> Ops;
11618 Ops.push_back(NodePtr);
11619 Ops.push_back(DAG.getBuildVector(
11620 MVT::v2i32, DL,
11621 {DAG.getBitcast(MVT::i32, RayExtent),
11622 DAG.getNode(ISD::ANY_EXTEND, DL, MVT::i32, InstanceMask)}));
11623 Ops.push_back(RayOrigin);
11624 Ops.push_back(RayDir);
11625 Ops.push_back(Offsets);
11626 Ops.push_back(TDescr);
11627 Ops.push_back(M->getChain());
11628
11629 auto *NewNode = DAG.getMachineNode(Opcode, DL, M->getVTList(), Ops);
11630 MachineMemOperand *MemRef = M->getMemOperand();
11631 DAG.setNodeMemRefs(NewNode, {MemRef});
11632 return SDValue(NewNode, 0);
11633 }
11634 case Intrinsic::amdgcn_image_bvh_intersect_ray: {
11635 MemSDNode *M = cast<MemSDNode>(Op);
11636 SDValue NodePtr = M->getOperand(2);
11637 SDValue RayExtent = M->getOperand(3);
11638 SDValue RayOrigin = M->getOperand(4);
11639 SDValue RayDir = M->getOperand(5);
11640 SDValue RayInvDir = M->getOperand(6);
11641 SDValue TDescr = M->getOperand(7);
11642
11643 assert(NodePtr.getValueType() == MVT::i32 ||
11644 NodePtr.getValueType() == MVT::i64);
11645 assert(RayDir.getValueType() == MVT::v3f16 ||
11646 RayDir.getValueType() == MVT::v3f32);
11647
11648 if (!Subtarget->hasGFX10_AEncoding()) {
11649 emitRemovedIntrinsicError(DAG, DL, Op.getValueType());
11650 return SDValue();
11651 }
11652
11653 const bool IsGFX11 = AMDGPU::isGFX11(*Subtarget);
11654 const bool IsGFX11Plus = AMDGPU::isGFX11Plus(*Subtarget);
11655 const bool IsGFX12Plus = AMDGPU::isGFX12Plus(*Subtarget);
11656 const bool IsA16 = RayDir.getValueType().getVectorElementType() == MVT::f16;
11657 const bool Is64 = NodePtr.getValueType() == MVT::i64;
11658 const unsigned NumVDataDwords = 4;
11659 const unsigned NumVAddrDwords = IsA16 ? (Is64 ? 9 : 8) : (Is64 ? 12 : 11);
11660 const unsigned NumVAddrs = IsGFX11Plus ? (IsA16 ? 4 : 5) : NumVAddrDwords;
11661 const bool UseNSA = (Subtarget->hasNSAEncoding() &&
11662 NumVAddrs <= Subtarget->getNSAMaxSize()) ||
11663 IsGFX12Plus;
11664 const unsigned BaseOpcodes[2][2] = {
11665 {AMDGPU::IMAGE_BVH_INTERSECT_RAY, AMDGPU::IMAGE_BVH_INTERSECT_RAY_a16},
11666 {AMDGPU::IMAGE_BVH64_INTERSECT_RAY,
11667 AMDGPU::IMAGE_BVH64_INTERSECT_RAY_a16}};
11668 int Opcode;
11669 if (UseNSA) {
11670 Opcode = AMDGPU::getMIMGOpcode(BaseOpcodes[Is64][IsA16],
11671 IsGFX12Plus ? AMDGPU::MIMGEncGfx12
11672 : IsGFX11 ? AMDGPU::MIMGEncGfx11NSA
11673 : AMDGPU::MIMGEncGfx10NSA,
11674 NumVDataDwords, NumVAddrDwords);
11675 } else {
11676 assert(!IsGFX12Plus);
11677 Opcode = AMDGPU::getMIMGOpcode(BaseOpcodes[Is64][IsA16],
11678 IsGFX11 ? AMDGPU::MIMGEncGfx11Default
11679 : AMDGPU::MIMGEncGfx10Default,
11680 NumVDataDwords, NumVAddrDwords);
11681 }
11682 assert(Opcode != -1);
11683
11684 SmallVector<SDValue, 16> Ops;
11685
11686 auto packLanes = [&DAG, &Ops, &DL](SDValue Op, bool IsAligned) {
11687 SmallVector<SDValue, 3> Lanes;
11688 DAG.ExtractVectorElements(Op, Lanes, 0, 3);
11689 if (Lanes[0].getValueSizeInBits() == 32) {
11690 for (unsigned I = 0; I < 3; ++I)
11691 Ops.push_back(DAG.getBitcast(MVT::i32, Lanes[I]));
11692 } else {
11693 if (IsAligned) {
11694 Ops.push_back(DAG.getBitcast(
11695 MVT::i32,
11696 DAG.getBuildVector(MVT::v2f16, DL, {Lanes[0], Lanes[1]})));
11697 Ops.push_back(Lanes[2]);
11698 } else {
11699 SDValue Elt0 = Ops.pop_back_val();
11700 Ops.push_back(DAG.getBitcast(
11701 MVT::i32, DAG.getBuildVector(MVT::v2f16, DL, {Elt0, Lanes[0]})));
11702 Ops.push_back(DAG.getBitcast(
11703 MVT::i32,
11704 DAG.getBuildVector(MVT::v2f16, DL, {Lanes[1], Lanes[2]})));
11705 }
11706 }
11707 };
11708
11709 if (UseNSA && IsGFX11Plus) {
11710 Ops.push_back(NodePtr);
11711 Ops.push_back(DAG.getBitcast(MVT::i32, RayExtent));
11712 Ops.push_back(RayOrigin);
11713 if (IsA16) {
11714 SmallVector<SDValue, 3> DirLanes, InvDirLanes, MergedLanes;
11715 DAG.ExtractVectorElements(RayDir, DirLanes, 0, 3);
11716 DAG.ExtractVectorElements(RayInvDir, InvDirLanes, 0, 3);
11717 for (unsigned I = 0; I < 3; ++I) {
11718 MergedLanes.push_back(DAG.getBitcast(
11719 MVT::i32, DAG.getBuildVector(MVT::v2f16, DL,
11720 {DirLanes[I], InvDirLanes[I]})));
11721 }
11722 Ops.push_back(DAG.getBuildVector(MVT::v3i32, DL, MergedLanes));
11723 } else {
11724 Ops.push_back(RayDir);
11725 Ops.push_back(RayInvDir);
11726 }
11727 } else {
11728 if (Is64)
11729 DAG.ExtractVectorElements(DAG.getBitcast(MVT::v2i32, NodePtr), Ops, 0,
11730 2);
11731 else
11732 Ops.push_back(NodePtr);
11733
11734 Ops.push_back(DAG.getBitcast(MVT::i32, RayExtent));
11735 packLanes(RayOrigin, true);
11736 packLanes(RayDir, true);
11737 packLanes(RayInvDir, false);
11738 }
11739
11740 if (!UseNSA) {
11741 // Build a single vector containing all the operands so far prepared.
11742 if (NumVAddrDwords > 12) {
11743 SDValue Undef = DAG.getPOISON(MVT::i32);
11744 Ops.append(16 - Ops.size(), Undef);
11745 }
11746 assert(Ops.size() >= 8 && Ops.size() <= 12);
11747 SDValue MergedOps =
11748 DAG.getBuildVector(MVT::getVectorVT(MVT::i32, Ops.size()), DL, Ops);
11749 Ops.clear();
11750 Ops.push_back(MergedOps);
11751 }
11752
11753 Ops.push_back(TDescr);
11754 Ops.push_back(DAG.getTargetConstant(IsA16, DL, MVT::i1));
11755 Ops.push_back(M->getChain());
11756
11757 auto *NewNode = DAG.getMachineNode(Opcode, DL, M->getVTList(), Ops);
11758 MachineMemOperand *MemRef = M->getMemOperand();
11759 DAG.setNodeMemRefs(NewNode, {MemRef});
11760 return SDValue(NewNode, 0);
11761 }
11762 case Intrinsic::amdgcn_global_atomic_fmin_num:
11763 case Intrinsic::amdgcn_global_atomic_fmax_num:
11764 case Intrinsic::amdgcn_flat_atomic_fmin_num:
11765 case Intrinsic::amdgcn_flat_atomic_fmax_num: {
11766 MemSDNode *M = cast<MemSDNode>(Op);
11767 SDValue Ops[] = {
11768 M->getOperand(0), // Chain
11769 M->getOperand(2), // Ptr
11770 M->getOperand(3) // Value
11771 };
11772 unsigned Opcode = 0;
11773 switch (IntrID) {
11774 case Intrinsic::amdgcn_global_atomic_fmin_num:
11775 case Intrinsic::amdgcn_flat_atomic_fmin_num: {
11776 Opcode = ISD::ATOMIC_LOAD_FMIN;
11777 break;
11778 }
11779 case Intrinsic::amdgcn_global_atomic_fmax_num:
11780 case Intrinsic::amdgcn_flat_atomic_fmax_num: {
11781 Opcode = ISD::ATOMIC_LOAD_FMAX;
11782 break;
11783 }
11784 default:
11785 llvm_unreachable("unhandled atomic opcode");
11786 }
11787 return DAG.getAtomic(Opcode, SDLoc(Op), M->getMemoryVT(), M->getVTList(),
11788 Ops, M->getMemOperand());
11789 }
11790 case Intrinsic::amdgcn_s_alloc_vgpr: {
11791 SDValue NumVGPRs = Op.getOperand(2);
11792 if (!NumVGPRs->isDivergent())
11793 return Op;
11794
11795 SDValue ReadFirstLaneID =
11796 DAG.getTargetConstant(Intrinsic::amdgcn_readfirstlane, DL, MVT::i32);
11797 NumVGPRs = DAG.getNode(ISD::INTRINSIC_WO_CHAIN, DL, MVT::i32,
11798 ReadFirstLaneID, NumVGPRs);
11799
11800 return DAG.getNode(ISD::INTRINSIC_W_CHAIN, DL, Op->getVTList(),
11801 Op.getOperand(0), Op.getOperand(1), NumVGPRs);
11802 }
11803 case Intrinsic::amdgcn_s_get_barrier_state:
11804 case Intrinsic::amdgcn_s_get_named_barrier_state: {
11805 SDValue Chain = Op->getOperand(0);
11806 SmallVector<SDValue, 2> Ops;
11807 unsigned Opc;
11808
11809 if (isa<ConstantSDNode>(Op->getOperand(2))) {
11810 uint64_t BarID = cast<ConstantSDNode>(Op->getOperand(2))->getZExtValue();
11811 if (IntrID == Intrinsic::amdgcn_s_get_named_barrier_state)
11812 BarID = (BarID >> 4) & 0x3F;
11813 Opc = AMDGPU::S_GET_BARRIER_STATE_IMM;
11814 SDValue K = DAG.getTargetConstant(BarID, DL, MVT::i32);
11815 Ops.push_back(K);
11816 Ops.push_back(Chain);
11817 } else {
11818 Opc = AMDGPU::S_GET_BARRIER_STATE_M0;
11819 if (IntrID == Intrinsic::amdgcn_s_get_named_barrier_state) {
11820 SDValue M0Val;
11821 M0Val = DAG.getNode(ISD::SRL, DL, MVT::i32, Op->getOperand(2),
11822 DAG.getShiftAmountConstant(4, MVT::i32, DL));
11823 M0Val = SDValue(
11824 DAG.getMachineNode(AMDGPU::S_AND_B32, DL, MVT::i32, M0Val,
11825 DAG.getTargetConstant(0x3F, DL, MVT::i32)),
11826 0);
11827 Ops.push_back(copyToM0(DAG, Chain, DL, M0Val).getValue(0));
11828 } else
11829 Ops.push_back(copyToM0(DAG, Chain, DL, Op->getOperand(2)).getValue(0));
11830 }
11831
11832 auto *NewMI = DAG.getMachineNode(Opc, DL, Op->getVTList(), Ops);
11833 return SDValue(NewMI, 0);
11834 }
11835 case Intrinsic::amdgcn_cooperative_atomic_load_32x4B:
11836 case Intrinsic::amdgcn_cooperative_atomic_load_16x8B:
11837 case Intrinsic::amdgcn_cooperative_atomic_load_8x16B: {
11838 MemIntrinsicSDNode *MII = cast<MemIntrinsicSDNode>(Op);
11839 SDValue Chain = Op->getOperand(0);
11840 SDValue Ptr = Op->getOperand(2);
11841 EVT VT = Op->getValueType(0);
11842 return DAG.getAtomicLoad(ISD::NON_EXTLOAD, DL, MII->getMemoryVT(), VT,
11843 Chain, Ptr, MII->getMemOperand());
11844 }
11845 case Intrinsic::amdgcn_flat_load_monitor_b32:
11846 case Intrinsic::amdgcn_flat_load_monitor_b64:
11847 case Intrinsic::amdgcn_flat_load_monitor_b128: {
11848 MemIntrinsicSDNode *MII = cast<MemIntrinsicSDNode>(Op);
11849 SDValue Chain = Op->getOperand(0);
11850 SDValue Ptr = Op->getOperand(2);
11851 return DAG.getMemIntrinsicNode(AMDGPUISD::FLAT_LOAD_MONITOR, DL,
11852 Op->getVTList(), {Chain, Ptr},
11853 MII->getMemoryVT(), MII->getMemOperand());
11854 }
11855 case Intrinsic::amdgcn_global_load_monitor_b32:
11856 case Intrinsic::amdgcn_global_load_monitor_b64:
11857 case Intrinsic::amdgcn_global_load_monitor_b128: {
11858 MemIntrinsicSDNode *MII = cast<MemIntrinsicSDNode>(Op);
11859 SDValue Chain = Op->getOperand(0);
11860 SDValue Ptr = Op->getOperand(2);
11861 return DAG.getMemIntrinsicNode(AMDGPUISD::GLOBAL_LOAD_MONITOR, DL,
11862 Op->getVTList(), {Chain, Ptr},
11863 MII->getMemoryVT(), MII->getMemOperand());
11864 }
11865 default:
11866
11867 if (const AMDGPU::ImageDimIntrinsicInfo *ImageDimIntr =
11868 AMDGPU::getImageDimIntrinsicInfo(IntrID))
11869 return lowerImage(Op, ImageDimIntr, DAG, true);
11870
11871 return SDValue();
11872 }
11873}
11874
11875// Call DAG.getMemIntrinsicNode for a load, but first widen a dwordx3 type to
11876// dwordx4 if on SI and handle TFE loads.
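// Illustrative sketch of the TFE handling below: a TFE load whose value type
// is v2f32 is emitted as a v3i32 memory intrinsic node; element 2 is peeled
// off as the status dword and elements 0..1 are bitcast back to v2f32 before
// being merged with the chain.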
11877SDValue SITargetLowering::getMemIntrinsicNode(unsigned Opcode, const SDLoc &DL,
11878 SDVTList VTList,
11879 ArrayRef<SDValue> Ops, EVT MemVT,
11880 MachineMemOperand *MMO,
11881 SelectionDAG &DAG) const {
11882 LLVMContext &C = *DAG.getContext();
11883 MachineFunction &MF = DAG.getMachineFunction();
11884 EVT VT = VTList.VTs[0];
11885
11886 assert(VTList.NumVTs == 2 || VTList.NumVTs == 3);
11887 bool IsTFE = VTList.NumVTs == 3;
11888 if (IsTFE) {
11889 unsigned NumValueDWords = divideCeil(VT.getSizeInBits(), 32);
11890 unsigned NumOpDWords = NumValueDWords + 1;
11891 EVT OpDWordsVT = EVT::getVectorVT(C, MVT::i32, NumOpDWords);
11892 SDVTList OpDWordsVTList = DAG.getVTList(OpDWordsVT, VTList.VTs[2]);
11893 MachineMemOperand *OpDWordsMMO =
11894 MF.getMachineMemOperand(MMO, 0, NumOpDWords * 4);
11895 SDValue Op = getMemIntrinsicNode(Opcode, DL, OpDWordsVTList, Ops,
11896 OpDWordsVT, OpDWordsMMO, DAG);
11897 SDValue Status = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::i32, Op,
11898 DAG.getVectorIdxConstant(NumValueDWords, DL));
11899 SDValue ZeroIdx = DAG.getVectorIdxConstant(0, DL);
11900 SDValue ValueDWords =
11901 NumValueDWords == 1
11902 ? DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::i32, Op, ZeroIdx)
11903 : DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL,
11904 EVT::getVectorVT(C, MVT::i32, NumValueDWords), Op,
11905 ZeroIdx);
11906 SDValue Value = DAG.getNode(ISD::BITCAST, DL, VT, ValueDWords);
11907 return DAG.getMergeValues({Value, Status, SDValue(Op.getNode(), 1)}, DL);
11908 }
11909
11910 if (!Subtarget->hasDwordx3LoadStores() &&
11911 (VT == MVT::v3i32 || VT == MVT::v3f32)) {
11912 EVT WidenedVT = EVT::getVectorVT(C, VT.getVectorElementType(), 4);
11913 EVT WidenedMemVT = EVT::getVectorVT(C, MemVT.getVectorElementType(), 4);
11914 MachineMemOperand *WidenedMMO = MF.getMachineMemOperand(MMO, 0, 16);
11915 SDVTList WidenedVTList = DAG.getVTList(WidenedVT, VTList.VTs[1]);
11916 SDValue Op = DAG.getMemIntrinsicNode(Opcode, DL, WidenedVTList, Ops,
11917 WidenedMemVT, WidenedMMO);
11918 SDValue Value = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, VT, Op,
11919 DAG.getVectorIdxConstant(0, DL));
11920 return DAG.getMergeValues({Value, SDValue(Op.getNode(), 1)}, DL);
11921 }
11922
11923 return DAG.getMemIntrinsicNode(Opcode, DL, VTList, Ops, MemVT, MMO);
11924}
11925
11926SDValue SITargetLowering::handleD16VData(SDValue VData, SelectionDAG &DAG,
11927 bool ImageStore) const {
11928 EVT StoreVT = VData.getValueType();
11929
11930 // No change for f16 and legal vector D16 types.
11931 if (!StoreVT.isVector())
11932 return VData;
11933
11934 SDLoc DL(VData);
11935 unsigned NumElements = StoreVT.getVectorNumElements();
11936
11937 if (Subtarget->hasUnpackedD16VMem()) {
11938 // We need to unpack the packed data to store.
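// For example (a sketch, not covering every type): a v2f16 value <a, b> is
// stored as two dwords <zext(bitcast a), zext(bitcast b)> instead of one
// packed dword, which is what the zero-extend plus unroll below produces.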
11939 EVT IntStoreVT = StoreVT.changeTypeToInteger();
11940 SDValue IntVData = DAG.getNode(ISD::BITCAST, DL, IntStoreVT, VData);
11941
11942 EVT EquivStoreVT =
11943 EVT::getVectorVT(*DAG.getContext(), MVT::i32, NumElements);
11944 SDValue ZExt = DAG.getNode(ISD::ZERO_EXTEND, DL, EquivStoreVT, IntVData);
11945 return DAG.UnrollVectorOp(ZExt.getNode());
11946 }
11947
11948 // The sq block of gfx8.1 does not estimate register use correctly for d16
11949 // image store instructions. The data operand is computed as if it were not a
11950 // d16 image instruction.
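// Rough illustration of the workaround: a v4f16 data operand is repacked as
// four dwords, the i32 bitcasts of its two v2i16 halves followed by two
// poison dwords, so that the operand occupies the register count the SQ
// block expects for a non-d16 store.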
11951 if (ImageStore && Subtarget->hasImageStoreD16Bug()) {
11952 // Bitcast to i16
11953 EVT IntStoreVT = StoreVT.changeTypeToInteger();
11954 SDValue IntVData = DAG.getNode(ISD::BITCAST, DL, IntStoreVT, VData);
11955
11956 // Decompose into scalars
11957 SmallVector<SDValue, 4> Elts;
11958 DAG.ExtractVectorElements(IntVData, Elts);
11959
11960 // Group pairs of i16 into v2i16 and bitcast to i32
11961 SmallVector<SDValue, 4> PackedElts;
11962 for (unsigned I = 0; I < Elts.size() / 2; I += 1) {
11963 SDValue Pair =
11964 DAG.getBuildVector(MVT::v2i16, DL, {Elts[I * 2], Elts[I * 2 + 1]});
11965 SDValue IntPair = DAG.getNode(ISD::BITCAST, DL, MVT::i32, Pair);
11966 PackedElts.push_back(IntPair);
11967 }
11968 if ((NumElements % 2) == 1) {
11969 // Handle v3i16
11970 unsigned I = Elts.size() / 2;
11971 SDValue Pair = DAG.getBuildVector(MVT::v2i16, DL,
11972 {Elts[I * 2], DAG.getPOISON(MVT::i16)});
11973 SDValue IntPair = DAG.getNode(ISD::BITCAST, DL, MVT::i32, Pair);
11974 PackedElts.push_back(IntPair);
11975 }
11976
11977 // Pad using POISON
11978 PackedElts.resize(Elts.size(), DAG.getPOISON(MVT::i32));
11979
11980 // Build final vector
11981 EVT VecVT =
11982 EVT::getVectorVT(*DAG.getContext(), MVT::i32, PackedElts.size());
11983 return DAG.getBuildVector(VecVT, DL, PackedElts);
11984 }
11985
11986 if (NumElements == 3) {
11987 EVT IntStoreVT =
11988 EVT::getIntegerVT(*DAG.getContext(), StoreVT.getStoreSizeInBits());
11989 SDValue IntVData = DAG.getNode(ISD::BITCAST, DL, IntStoreVT, VData);
11990
11991 EVT WidenedStoreVT = EVT::getVectorVT(
11992 *DAG.getContext(), StoreVT.getVectorElementType(), NumElements + 1);
11993 EVT WidenedIntVT = EVT::getIntegerVT(*DAG.getContext(),
11994 WidenedStoreVT.getStoreSizeInBits());
11995 SDValue ZExt = DAG.getNode(ISD::ZERO_EXTEND, DL, WidenedIntVT, IntVData);
11996 return DAG.getNode(ISD::BITCAST, DL, WidenedStoreVT, ZExt);
11997 }
11998
11999 assert(isTypeLegal(StoreVT));
12000 return VData;
12001}
12002
12003static bool isAsyncLDSDMA(Intrinsic::ID Intr) {
12004 switch (Intr) {
12005 case Intrinsic::amdgcn_raw_buffer_load_async_lds:
12006 case Intrinsic::amdgcn_raw_ptr_buffer_load_async_lds:
12007 case Intrinsic::amdgcn_struct_buffer_load_async_lds:
12008 case Intrinsic::amdgcn_struct_ptr_buffer_load_async_lds:
12009 case Intrinsic::amdgcn_load_async_to_lds:
12010 case Intrinsic::amdgcn_global_load_async_lds:
12011 return true;
12012 }
12013 return false;
12014}
12015
12016SDValue SITargetLowering::LowerINTRINSIC_VOID(SDValue Op,
12017 SelectionDAG &DAG) const {
12018 SDLoc DL(Op);
12019 SDValue Chain = Op.getOperand(0);
12020 unsigned IntrinsicID = Op.getConstantOperandVal(1);
12021
12022 switch (IntrinsicID) {
12023 case Intrinsic::amdgcn_exp_compr: {
12024 if (!Subtarget->hasCompressedExport()) {
12025 DAG.getContext()->diagnose(DiagnosticInfoUnsupported(
12027 "intrinsic not supported on subtarget", DL.getDebugLoc()));
12028 }
12029 SDValue Src0 = Op.getOperand(4);
12030 SDValue Src1 = Op.getOperand(5);
12031 // Hack around illegal type on SI by directly selecting it.
12032 if (isTypeLegal(Src0.getValueType()))
12033 return SDValue();
12034
12035 const ConstantSDNode *Done = cast<ConstantSDNode>(Op.getOperand(6));
12036 SDValue Undef = DAG.getPOISON(MVT::f32);
12037 const SDValue Ops[] = {
12038 Op.getOperand(2), // tgt
12039 DAG.getNode(ISD::BITCAST, DL, MVT::f32, Src0), // src0
12040 DAG.getNode(ISD::BITCAST, DL, MVT::f32, Src1), // src1
12041 Undef, // src2
12042 Undef, // src3
12043 Op.getOperand(7), // vm
12044 DAG.getTargetConstant(1, DL, MVT::i1), // compr
12045 Op.getOperand(3), // en
12046 Op.getOperand(0) // Chain
12047 };
12048
12049 unsigned Opc = Done->isZero() ? AMDGPU::EXP : AMDGPU::EXP_DONE;
12050 return SDValue(DAG.getMachineNode(Opc, DL, Op->getVTList(), Ops), 0);
12051 }
12052
12053 case Intrinsic::amdgcn_struct_tbuffer_store:
12054 case Intrinsic::amdgcn_struct_ptr_tbuffer_store: {
12055 SDValue VData = Op.getOperand(2);
12056 bool IsD16 = (VData.getValueType().getScalarType() == MVT::f16);
12057 if (IsD16)
12058 VData = handleD16VData(VData, DAG);
12059 SDValue Rsrc = bufferRsrcPtrToVector(Op.getOperand(3), DAG);
12060 auto [VOffset, Offset] = splitBufferOffsets(Op.getOperand(5), DAG);
12061 auto SOffset = selectSOffset(Op.getOperand(6), DAG, Subtarget);
12062 SDValue Ops[] = {
12063 Chain,
12064 VData, // vdata
12065 Rsrc, // rsrc
12066 Op.getOperand(4), // vindex
12067 VOffset, // voffset
12068 SOffset, // soffset
12069 Offset, // offset
12070 Op.getOperand(7), // format
12071 Op.getOperand(8), // cachepolicy, swizzled buffer
12072 DAG.getTargetConstant(1, DL, MVT::i1), // idxen
12073 };
12074 unsigned Opc = IsD16 ? AMDGPUISD::TBUFFER_STORE_FORMAT_D16
12075 : AMDGPUISD::TBUFFER_STORE_FORMAT;
12076 MemSDNode *M = cast<MemSDNode>(Op);
12077 return DAG.getMemIntrinsicNode(Opc, DL, Op->getVTList(), Ops,
12078 M->getMemoryVT(), M->getMemOperand());
12079 }
12080
12081 case Intrinsic::amdgcn_raw_tbuffer_store:
12082 case Intrinsic::amdgcn_raw_ptr_tbuffer_store: {
12083 SDValue VData = Op.getOperand(2);
12084 bool IsD16 = (VData.getValueType().getScalarType() == MVT::f16);
12085 if (IsD16)
12086 VData = handleD16VData(VData, DAG);
12087 SDValue Rsrc = bufferRsrcPtrToVector(Op.getOperand(3), DAG);
12088 auto [VOffset, Offset] = splitBufferOffsets(Op.getOperand(4), DAG);
12089 auto SOffset = selectSOffset(Op.getOperand(5), DAG, Subtarget);
12090 SDValue Ops[] = {
12091 Chain,
12092 VData, // vdata
12093 Rsrc, // rsrc
12094 DAG.getConstant(0, DL, MVT::i32), // vindex
12095 VOffset, // voffset
12096 SOffset, // soffset
12097 Offset, // offset
12098 Op.getOperand(6), // format
12099 Op.getOperand(7), // cachepolicy, swizzled buffer
12100 DAG.getTargetConstant(0, DL, MVT::i1), // idxen
12101 };
12102 unsigned Opc = IsD16 ? AMDGPUISD::TBUFFER_STORE_FORMAT_D16
12103 : AMDGPUISD::TBUFFER_STORE_FORMAT;
12104 MemSDNode *M = cast<MemSDNode>(Op);
12105 return DAG.getMemIntrinsicNode(Opc, DL, Op->getVTList(), Ops,
12106 M->getMemoryVT(), M->getMemOperand());
12107 }
12108
12109 case Intrinsic::amdgcn_raw_buffer_store:
12110 case Intrinsic::amdgcn_raw_ptr_buffer_store:
12111 case Intrinsic::amdgcn_raw_buffer_store_format:
12112 case Intrinsic::amdgcn_raw_ptr_buffer_store_format: {
12113 const bool IsFormat =
12114 IntrinsicID == Intrinsic::amdgcn_raw_buffer_store_format ||
12115 IntrinsicID == Intrinsic::amdgcn_raw_ptr_buffer_store_format;
12116
12117 SDValue VData = Op.getOperand(2);
12118 EVT VDataVT = VData.getValueType();
12119 EVT EltType = VDataVT.getScalarType();
12120 bool IsD16 = IsFormat && (EltType.getSizeInBits() == 16);
12121 if (IsD16) {
12122 VData = handleD16VData(VData, DAG);
12123 VDataVT = VData.getValueType();
12124 }
12125
12126 if (!isTypeLegal(VDataVT)) {
12127 VData =
12128 DAG.getNode(ISD::BITCAST, DL,
12129 getEquivalentMemType(*DAG.getContext(), VDataVT), VData);
12130 }
12131
12132 SDValue Rsrc = bufferRsrcPtrToVector(Op.getOperand(3), DAG);
12133 auto [VOffset, Offset] = splitBufferOffsets(Op.getOperand(4), DAG);
12134 auto SOffset = selectSOffset(Op.getOperand(5), DAG, Subtarget);
12135 SDValue Ops[] = {
12136 Chain,
12137 VData,
12138 Rsrc,
12139 DAG.getConstant(0, DL, MVT::i32), // vindex
12140 VOffset, // voffset
12141 SOffset, // soffset
12142 Offset, // offset
12143 Op.getOperand(6), // cachepolicy, swizzled buffer
12144 DAG.getTargetConstant(0, DL, MVT::i1), // idxen
12145 };
12146 unsigned Opc =
12147 IsFormat ? AMDGPUISD::BUFFER_STORE_FORMAT : AMDGPUISD::BUFFER_STORE;
12148 Opc = IsD16 ? AMDGPUISD::BUFFER_STORE_FORMAT_D16 : Opc;
12149 MemSDNode *M = cast<MemSDNode>(Op);
12150
12151 // Handle BUFFER_STORE_BYTE/SHORT overloaded intrinsics
12152 if (!IsD16 && !VDataVT.isVector() && EltType.getSizeInBits() < 32)
12153 return handleByteShortBufferStores(DAG, VDataVT, DL, Ops, M);
12154
12155 return DAG.getMemIntrinsicNode(Opc, DL, Op->getVTList(), Ops,
12156 M->getMemoryVT(), M->getMemOperand());
12157 }
12158
12159 case Intrinsic::amdgcn_struct_buffer_store:
12160 case Intrinsic::amdgcn_struct_ptr_buffer_store:
12161 case Intrinsic::amdgcn_struct_buffer_store_format:
12162 case Intrinsic::amdgcn_struct_ptr_buffer_store_format: {
12163 const bool IsFormat =
12164 IntrinsicID == Intrinsic::amdgcn_struct_buffer_store_format ||
12165 IntrinsicID == Intrinsic::amdgcn_struct_ptr_buffer_store_format;
12166
12167 SDValue VData = Op.getOperand(2);
12168 EVT VDataVT = VData.getValueType();
12169 EVT EltType = VDataVT.getScalarType();
12170 bool IsD16 = IsFormat && (EltType.getSizeInBits() == 16);
12171
12172 if (IsD16) {
12173 VData = handleD16VData(VData, DAG);
12174 VDataVT = VData.getValueType();
12175 }
12176
12177 if (!isTypeLegal(VDataVT)) {
12178 VData =
12179 DAG.getNode(ISD::BITCAST, DL,
12180 getEquivalentMemType(*DAG.getContext(), VDataVT), VData);
12181 }
12182
12183 auto Rsrc = bufferRsrcPtrToVector(Op.getOperand(3), DAG);
12184 auto [VOffset, Offset] = splitBufferOffsets(Op.getOperand(5), DAG);
12185 auto SOffset = selectSOffset(Op.getOperand(6), DAG, Subtarget);
12186 SDValue Ops[] = {
12187 Chain,
12188 VData,
12189 Rsrc,
12190 Op.getOperand(4), // vindex
12191 VOffset, // voffset
12192 SOffset, // soffset
12193 Offset, // offset
12194 Op.getOperand(7), // cachepolicy, swizzled buffer
12195 DAG.getTargetConstant(1, DL, MVT::i1), // idxen
12196 };
12197 unsigned Opc =
12198 !IsFormat ? AMDGPUISD::BUFFER_STORE : AMDGPUISD::BUFFER_STORE_FORMAT;
12199 Opc = IsD16 ? AMDGPUISD::BUFFER_STORE_FORMAT_D16 : Opc;
12200 MemSDNode *M = cast<MemSDNode>(Op);
12201
12202 // Handle BUFFER_STORE_BYTE/SHORT overloaded intrinsics
12203 EVT VDataType = VData.getValueType().getScalarType();
12204 if (!IsD16 && !VDataVT.isVector() && EltType.getSizeInBits() < 32)
12205 return handleByteShortBufferStores(DAG, VDataType, DL, Ops, M);
12206
12207 return DAG.getMemIntrinsicNode(Opc, DL, Op->getVTList(), Ops,
12208 M->getMemoryVT(), M->getMemOperand());
12209 }
12210 case Intrinsic::amdgcn_raw_buffer_load_lds:
12211 case Intrinsic::amdgcn_raw_buffer_load_async_lds:
12212 case Intrinsic::amdgcn_raw_ptr_buffer_load_lds:
12213 case Intrinsic::amdgcn_raw_ptr_buffer_load_async_lds:
12214 case Intrinsic::amdgcn_struct_buffer_load_lds:
12215 case Intrinsic::amdgcn_struct_buffer_load_async_lds:
12216 case Intrinsic::amdgcn_struct_ptr_buffer_load_lds:
12217 case Intrinsic::amdgcn_struct_ptr_buffer_load_async_lds: {
12218 if (!Subtarget->hasVMemToLDSLoad())
12219 return SDValue();
12220 unsigned Opc;
12221 bool HasVIndex =
12222 IntrinsicID == Intrinsic::amdgcn_struct_buffer_load_lds ||
12223 IntrinsicID == Intrinsic::amdgcn_struct_buffer_load_async_lds ||
12224 IntrinsicID == Intrinsic::amdgcn_struct_ptr_buffer_load_lds ||
12225 IntrinsicID == Intrinsic::amdgcn_struct_ptr_buffer_load_async_lds;
12226 unsigned OpOffset = HasVIndex ? 1 : 0;
12227 SDValue VOffset = Op.getOperand(5 + OpOffset);
12228 bool HasVOffset = !isNullConstant(VOffset);
12229 unsigned Size = Op->getConstantOperandVal(4);
12230
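// Pick the addressing-mode variant from the operands that are actually
// present: BOTHEN = vindex + voffset, IDXEN = vindex only, OFFEN = voffset
// only, OFFSET = neither (immediate/soffset addressing only).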
12231 switch (Size) {
12232 default:
12233 return SDValue();
12234 case 1:
12235 Opc = HasVIndex ? HasVOffset ? AMDGPU::BUFFER_LOAD_UBYTE_LDS_BOTHEN
12236 : AMDGPU::BUFFER_LOAD_UBYTE_LDS_IDXEN
12237 : HasVOffset ? AMDGPU::BUFFER_LOAD_UBYTE_LDS_OFFEN
12238 : AMDGPU::BUFFER_LOAD_UBYTE_LDS_OFFSET;
12239 break;
12240 case 2:
12241 Opc = HasVIndex ? HasVOffset ? AMDGPU::BUFFER_LOAD_USHORT_LDS_BOTHEN
12242 : AMDGPU::BUFFER_LOAD_USHORT_LDS_IDXEN
12243 : HasVOffset ? AMDGPU::BUFFER_LOAD_USHORT_LDS_OFFEN
12244 : AMDGPU::BUFFER_LOAD_USHORT_LDS_OFFSET;
12245 break;
12246 case 4:
12247 Opc = HasVIndex ? HasVOffset ? AMDGPU::BUFFER_LOAD_DWORD_LDS_BOTHEN
12248 : AMDGPU::BUFFER_LOAD_DWORD_LDS_IDXEN
12249 : HasVOffset ? AMDGPU::BUFFER_LOAD_DWORD_LDS_OFFEN
12250 : AMDGPU::BUFFER_LOAD_DWORD_LDS_OFFSET;
12251 break;
12252 case 12:
12253 if (!Subtarget->hasLDSLoadB96_B128())
12254 return SDValue();
12255 Opc = HasVIndex ? HasVOffset ? AMDGPU::BUFFER_LOAD_DWORDX3_LDS_BOTHEN
12256 : AMDGPU::BUFFER_LOAD_DWORDX3_LDS_IDXEN
12257 : HasVOffset ? AMDGPU::BUFFER_LOAD_DWORDX3_LDS_OFFEN
12258 : AMDGPU::BUFFER_LOAD_DWORDX3_LDS_OFFSET;
12259 break;
12260 case 16:
12261 if (!Subtarget->hasLDSLoadB96_B128())
12262 return SDValue();
12263 Opc = HasVIndex ? HasVOffset ? AMDGPU::BUFFER_LOAD_DWORDX4_LDS_BOTHEN
12264 : AMDGPU::BUFFER_LOAD_DWORDX4_LDS_IDXEN
12265 : HasVOffset ? AMDGPU::BUFFER_LOAD_DWORDX4_LDS_OFFEN
12266 : AMDGPU::BUFFER_LOAD_DWORDX4_LDS_OFFSET;
12267 break;
12268 }
12269
12270 SDValue M0Val = copyToM0(DAG, Chain, DL, Op.getOperand(3));
12271
12271
12272 SmallVector<SDValue, 8> Ops;
12273
12274 if (HasVIndex && HasVOffset)
12275 Ops.push_back(DAG.getBuildVector(MVT::v2i32, DL,
12276 {Op.getOperand(5), // VIndex
12277 VOffset}));
12278 else if (HasVIndex)
12279 Ops.push_back(Op.getOperand(5));
12280 else if (HasVOffset)
12281 Ops.push_back(VOffset);
12282
12283 SDValue Rsrc = bufferRsrcPtrToVector(Op.getOperand(2), DAG);
12284 Ops.push_back(Rsrc);
12285 Ops.push_back(Op.getOperand(6 + OpOffset)); // soffset
12286 Ops.push_back(Op.getOperand(7 + OpOffset)); // imm offset
12287 bool IsGFX12Plus = AMDGPU::isGFX12Plus(*Subtarget);
12288 unsigned Aux = Op.getConstantOperandVal(8 + OpOffset);
12289 Ops.push_back(DAG.getTargetConstant(
12290 Aux & (IsGFX12Plus ? AMDGPU::CPol::ALL : AMDGPU::CPol::ALL_pregfx12),
12291 DL, MVT::i8)); // cpol
12292 Ops.push_back(DAG.getTargetConstant(
12293 Aux & (IsGFX12Plus ? AMDGPU::CPol::SWZ : AMDGPU::CPol::SWZ_pregfx12)
12294 ? 1
12295 : 0,
12296 DL, MVT::i8)); // swz
12297 Ops.push_back(
12298 DAG.getTargetConstant(isAsyncLDSDMA(IntrinsicID), DL, MVT::i8));
12299 Ops.push_back(M0Val.getValue(0)); // Chain
12300 Ops.push_back(M0Val.getValue(1)); // Glue
12301
12302 auto *M = cast<MemSDNode>(Op);
12303 auto *Load = DAG.getMachineNode(Opc, DL, M->getVTList(), Ops);
12304 DAG.setNodeMemRefs(Load, M->memoperands());
12305
12306 return SDValue(Load, 0);
12307 }
12308 // Buffers are handled by LowerBufferFatPointers, and we're going to go
12309 // for "trust me" that the remaining cases are global pointers until
12310 // such time as we can put two mem operands on an intrinsic.
12311 case Intrinsic::amdgcn_load_to_lds:
12312 case Intrinsic::amdgcn_load_async_to_lds:
12313 case Intrinsic::amdgcn_global_load_lds:
12314 case Intrinsic::amdgcn_global_load_async_lds: {
12315 if (!Subtarget->hasVMemToLDSLoad())
12316 return SDValue();
12317
12318 unsigned Opc;
12319 unsigned Size = Op->getConstantOperandVal(4);
12320 switch (Size) {
12321 default:
12322 return SDValue();
12323 case 1:
12324 Opc = AMDGPU::GLOBAL_LOAD_LDS_UBYTE;
12325 break;
12326 case 2:
12327 Opc = AMDGPU::GLOBAL_LOAD_LDS_USHORT;
12328 break;
12329 case 4:
12330 Opc = AMDGPU::GLOBAL_LOAD_LDS_DWORD;
12331 break;
12332 case 12:
12333 if (!Subtarget->hasLDSLoadB96_B128())
12334 return SDValue();
12335 Opc = AMDGPU::GLOBAL_LOAD_LDS_DWORDX3;
12336 break;
12337 case 16:
12338 if (!Subtarget->hasLDSLoadB96_B128())
12339 return SDValue();
12340 Opc = AMDGPU::GLOBAL_LOAD_LDS_DWORDX4;
12341 break;
12342 }
12343
12344 SDValue M0Val = copyToM0(DAG, Chain, DL, Op.getOperand(3));
12345
12345
12346 SmallVector<SDValue, 6> Ops;
12347
12348 SDValue Addr = Op.getOperand(2); // Global ptr
12349 SDValue VOffset;
12350 // Try to split SAddr and VOffset. Global and LDS pointers share the same
12351 // immediate offset, so we cannot use a regular SelectGlobalSAddr().
12352 if (Addr->isDivergent() && Addr->isAnyAdd()) {
12353 SDValue LHS = Addr.getOperand(0);
12354 SDValue RHS = Addr.getOperand(1);
12355
12356 if (LHS->isDivergent())
12357 std::swap(LHS, RHS);
12358
12359 if (!LHS->isDivergent() && RHS.getOpcode() == ISD::ZERO_EXTEND &&
12360 RHS.getOperand(0).getValueType() == MVT::i32) {
12361 // add (i64 sgpr), (zero_extend (i32 vgpr))
12362 Addr = LHS;
12363 VOffset = RHS.getOperand(0);
12364 }
12365 }
12366
12367 Ops.push_back(Addr);
12368 if (!Addr->isDivergent()) {
12369 Opc = AMDGPU::getGlobalSaddrOp(Opc);
12370 if (!VOffset)
12371 VOffset =
12372 SDValue(DAG.getMachineNode(AMDGPU::V_MOV_B32_e32, DL, MVT::i32,
12373 DAG.getTargetConstant(0, DL, MVT::i32)),
12374 0);
12375 Ops.push_back(VOffset);
12376 }
12377
12378 Ops.push_back(Op.getOperand(5)); // Offset
12379
12380 unsigned Aux = Op.getConstantOperandVal(6);
12381 Ops.push_back(DAG.getTargetConstant(Aux & ~AMDGPU::CPol::VIRTUAL_BITS, DL,
12382 MVT::i32)); // CPol
12383 Ops.push_back(
12384 DAG.getTargetConstant(isAsyncLDSDMA(IntrinsicID), DL, MVT::i8));
12385
12386 Ops.push_back(M0Val.getValue(0)); // Chain
12387 Ops.push_back(M0Val.getValue(1)); // Glue
12388
12389 auto *M = cast<MemSDNode>(Op);
12390 auto *Load = DAG.getMachineNode(Opc, DL, Op->getVTList(), Ops);
12391 DAG.setNodeMemRefs(Load, M->memoperands());
12392
12393 return SDValue(Load, 0);
12394 }
12395 case Intrinsic::amdgcn_end_cf:
12396 return SDValue(DAG.getMachineNode(AMDGPU::SI_END_CF, DL, MVT::Other,
12397 Op->getOperand(2), Chain),
12398 0);
12399 case Intrinsic::amdgcn_s_barrier_signal_var: {
12400 // Member count of 0 means to re-use a previous member count,
12401 // which, if the named barrier is statically chosen, means we can use
12402 // the immarg form. Otherwise, fall through to constructing M0 as for
12403 // s_barrier_init.
12404 SDValue CntOp = Op->getOperand(3);
12405 auto *CntC = dyn_cast<ConstantSDNode>(CntOp);
12406 if (CntC && CntC->isZero()) {
12407 SDValue Chain = Op->getOperand(0);
12408 SDValue BarOp = Op->getOperand(2);
12409 SmallVector<SDValue, 2> Ops;
12410
12411 std::optional<uint64_t> BarVal;
12412 if (auto *C = dyn_cast<ConstantSDNode>(BarOp))
12413 BarVal = C->getZExtValue();
12414 else if (auto *GA = dyn_cast<GlobalAddressSDNode>(BarOp))
12416 *GA->getGlobal()))
12417 BarVal = *Addr + GA->getOffset();
12418
12419 if (BarVal) {
12420 unsigned BarID = (*BarVal >> 4) & 0x3F;
12421 Ops.push_back(DAG.getTargetConstant(BarID, DL, MVT::i32));
12422 Ops.push_back(Chain);
12423 auto *NewMI = DAG.getMachineNode(AMDGPU::S_BARRIER_SIGNAL_IMM, DL,
12424 Op->getVTList(), Ops);
12425 return SDValue(NewMI, 0);
12426 }
12427 }
12428 [[fallthrough]];
12429 }
12430 case Intrinsic::amdgcn_s_barrier_init: {
12431 // these two intrinsics have two operands: barrier pointer and member count
12432 SDValue Chain = Op->getOperand(0);
12433 SmallVector<SDValue, 2> Ops;
12434 SDValue BarOp = Op->getOperand(2);
12435 SDValue CntOp = Op->getOperand(3);
12436 SDValue M0Val;
12437 unsigned Opc = IntrinsicID == Intrinsic::amdgcn_s_barrier_init
12438 ? AMDGPU::S_BARRIER_INIT_M0
12439 : AMDGPU::S_BARRIER_SIGNAL_M0;
12440 // extract the BarrierID from bits 4-9 of BarOp
12441 SDValue BarID;
12442 BarID = DAG.getNode(ISD::SRL, DL, MVT::i32, BarOp,
12443 DAG.getShiftAmountConstant(4, MVT::i32, DL));
12444 BarID =
12445 SDValue(DAG.getMachineNode(AMDGPU::S_AND_B32, DL, MVT::i32, BarID,
12446 DAG.getTargetConstant(0x3F, DL, MVT::i32)),
12447 0);
12448 // Member count should be put into M0[ShAmt:+6]
12449 // Barrier ID should be put into M0[5:0]
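// For example, with this layout a barrier ID of 5 and a member count of 12
// would be encoded as M0 = (12 << 16) | 5.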
12450 M0Val =
12451 SDValue(DAG.getMachineNode(AMDGPU::S_AND_B32, DL, MVT::i32, CntOp,
12452 DAG.getTargetConstant(0x3F, DL, MVT::i32)),
12453 0);
12454 constexpr unsigned ShAmt = 16;
12455 M0Val = DAG.getNode(ISD::SHL, DL, MVT::i32, CntOp,
12456 DAG.getShiftAmountConstant(ShAmt, MVT::i32, DL));
12457
12458 M0Val = SDValue(
12459 DAG.getMachineNode(AMDGPU::S_OR_B32, DL, MVT::i32, M0Val, BarID), 0);
12460
12461 Ops.push_back(copyToM0(DAG, Chain, DL, M0Val).getValue(0));
12462
12463 auto *NewMI = DAG.getMachineNode(Opc, DL, Op->getVTList(), Ops);
12464 return SDValue(NewMI, 0);
12465 }
12466 case Intrinsic::amdgcn_s_wakeup_barrier: {
12467 if (!Subtarget->hasSWakeupBarrier())
12468 return SDValue();
12469 [[fallthrough]];
12470 }
12471 case Intrinsic::amdgcn_s_barrier_join: {
12472 // These intrinsics take a single operand: the barrier pointer.
12473 SDValue Chain = Op->getOperand(0);
12474 SmallVector<SDValue, 2> Ops;
12475 SDValue BarOp = Op->getOperand(2);
12476 unsigned Opc;
12477
12478 if (isa<ConstantSDNode>(BarOp)) {
12479 uint64_t BarVal = cast<ConstantSDNode>(BarOp)->getZExtValue();
12480 switch (IntrinsicID) {
12481 default:
12482 return SDValue();
12483 case Intrinsic::amdgcn_s_barrier_join:
12484 Opc = AMDGPU::S_BARRIER_JOIN_IMM;
12485 break;
12486 case Intrinsic::amdgcn_s_wakeup_barrier:
12487 Opc = AMDGPU::S_WAKEUP_BARRIER_IMM;
12488 break;
12489 }
12490 // extract the BarrierID from bits 4-9 of the immediate
12491 unsigned BarID = (BarVal >> 4) & 0x3F;
12492 SDValue K = DAG.getTargetConstant(BarID, DL, MVT::i32);
12493 Ops.push_back(K);
12494 Ops.push_back(Chain);
12495 } else {
12496 switch (IntrinsicID) {
12497 default:
12498 return SDValue();
12499 case Intrinsic::amdgcn_s_barrier_join:
12500 Opc = AMDGPU::S_BARRIER_JOIN_M0;
12501 break;
12502 case Intrinsic::amdgcn_s_wakeup_barrier:
12503 Opc = AMDGPU::S_WAKEUP_BARRIER_M0;
12504 break;
12505 }
12506 // extract the BarrierID from bits 4-9 of BarOp, copy to M0[5:0]
12507 SDValue M0Val;
12508 M0Val = DAG.getNode(ISD::SRL, DL, MVT::i32, BarOp,
12509 DAG.getShiftAmountConstant(4, MVT::i32, DL));
12510 M0Val =
12511 SDValue(DAG.getMachineNode(AMDGPU::S_AND_B32, DL, MVT::i32, M0Val,
12512 DAG.getTargetConstant(0x3F, DL, MVT::i32)),
12513 0);
12514 Ops.push_back(copyToM0(DAG, Chain, DL, M0Val).getValue(0));
12515 }
12516
12517 auto *NewMI = DAG.getMachineNode(Opc, DL, Op->getVTList(), Ops);
12518 return SDValue(NewMI, 0);
12519 }
12520 case Intrinsic::amdgcn_s_prefetch_data: {
12521 // For non-global address space preserve the chain and remove the call.
12522 if (!AMDGPU::isFlatGlobalAddrSpace(cast<MemSDNode>(Op)->getAddressSpace()))
12523 return Op.getOperand(0);
12524 return Op;
12525 }
12526 case Intrinsic::amdgcn_s_buffer_prefetch_data: {
12527 SDValue Ops[] = {
12528 Chain, bufferRsrcPtrToVector(Op.getOperand(2), DAG),
12529 Op.getOperand(3), // offset
12530 Op.getOperand(4), // length
12531 };
12532
12533 MemSDNode *M = cast<MemSDNode>(Op);
12534 return DAG.getMemIntrinsicNode(AMDGPUISD::SBUFFER_PREFETCH_DATA, DL,
12535 Op->getVTList(), Ops, M->getMemoryVT(),
12536 M->getMemOperand());
12537 }
12538 case Intrinsic::amdgcn_cooperative_atomic_store_32x4B:
12539 case Intrinsic::amdgcn_cooperative_atomic_store_16x8B:
12540 case Intrinsic::amdgcn_cooperative_atomic_store_8x16B: {
12541 MemIntrinsicSDNode *MII = cast<MemIntrinsicSDNode>(Op);
12542 SDValue Chain = Op->getOperand(0);
12543 SDValue Ptr = Op->getOperand(2);
12544 SDValue Val = Op->getOperand(3);
12545 return DAG.getAtomic(ISD::ATOMIC_STORE, DL, MII->getMemoryVT(), Chain, Val,
12546 Ptr, MII->getMemOperand());
12547 }
12548 default: {
12549 if (const AMDGPU::ImageDimIntrinsicInfo *ImageDimIntr =
12550 AMDGPU::getImageDimIntrinsicInfo(IntrinsicID))
12551 return lowerImage(Op, ImageDimIntr, DAG, true);
12552
12553 return Op;
12554 }
12555 }
12556}
12557
12558// Return whether the operation has NoUnsignedWrap property.
12559static bool isNoUnsignedWrap(SDValue Addr) {
12560 return (Addr.getOpcode() == ISD::ADD &&
12561 Addr->getFlags().hasNoUnsignedWrap()) ||
12562 Addr->getOpcode() == ISD::OR;
12563}
12564
12566 EVT PtrVT) const {
12567 return PtrVT == MVT::i64;
12568}
12569
12571 EVT PtrVT) const {
12572 return true;
12573}
12574
12575// The raw.(t)buffer and struct.(t)buffer intrinsics have two offset args:
12576// offset (the offset that is included in bounds checking and swizzling, to be
12577// split between the instruction's voffset and immoffset fields) and soffset
12578// (the offset that is excluded from bounds checking and swizzling, to go in
12579// the instruction's soffset field). This function takes the first kind of
12580// offset and figures out how to split it between voffset and immoffset.
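// Worked example (assuming a 12-bit immediate field, i.e. MaxImm == 4095 as
// on pre-GFX12 MUBUF): a combined offset of 4100 splits into Overflow = 4096,
// which is added into the voffset operand, and ImmOffset = 4 for the
// instruction's immoffset field.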
12581std::pair<SDValue, SDValue>
12582SITargetLowering::splitBufferOffsets(SDValue Offset, SelectionDAG &DAG) const {
12583 SDLoc DL(Offset);
12584 const unsigned MaxImm = SIInstrInfo::getMaxMUBUFImmOffset(*Subtarget);
12585 SDValue N0 = Offset;
12586 ConstantSDNode *C1 = nullptr;
12587
12588 if ((C1 = dyn_cast<ConstantSDNode>(N0)))
12589 N0 = SDValue();
12590 else if (DAG.isBaseWithConstantOffset(N0)) {
12591 // On GFX1250+, voffset and immoffset are zero-extended from 32 bits before
12592 // being added, so we can only safely match a 32-bit addition with no
12593 // unsigned overflow.
12594 bool CheckNUW = Subtarget->hasGFX1250Insts();
12595 if (!CheckNUW || isNoUnsignedWrap(N0)) {
12596 C1 = cast<ConstantSDNode>(N0.getOperand(1));
12597 N0 = N0.getOperand(0);
12598 }
12599 }
12600
12601 if (C1) {
12602 unsigned ImmOffset = C1->getZExtValue();
12603 // If the immediate value is too big for the immoffset field, put only bits
12604 // that would normally fit in the immoffset field. The remaining value that
12605 // is copied/added for the voffset field is a large power of 2, and it
12606 // stands more chance of being CSEd with the copy/add for another similar
12607 // load/store.
12608 // However, do not do that rounding down if that is a negative
12609 // number, as it appears to be illegal to have a negative offset in the
12610 // vgpr, even if adding the immediate offset makes it positive.
12611 unsigned Overflow = ImmOffset & ~MaxImm;
12612 ImmOffset -= Overflow;
12613 if ((int32_t)Overflow < 0) {
12614 Overflow += ImmOffset;
12615 ImmOffset = 0;
12616 }
12617 C1 = cast<ConstantSDNode>(DAG.getTargetConstant(ImmOffset, DL, MVT::i32));
12618 if (Overflow) {
12619 auto OverflowVal = DAG.getConstant(Overflow, DL, MVT::i32);
12620 if (!N0)
12621 N0 = OverflowVal;
12622 else {
12623 SDValue Ops[] = {N0, OverflowVal};
12624 N0 = DAG.getNode(ISD::ADD, DL, MVT::i32, Ops);
12625 }
12626 }
12627 }
12628 if (!N0)
12629 N0 = DAG.getConstant(0, DL, MVT::i32);
12630 if (!C1)
12631 C1 = cast<ConstantSDNode>(DAG.getTargetConstant(0, DL, MVT::i32));
12632 return {N0, SDValue(C1, 0)};
12633}
12634
12635// Analyze a combined offset from an amdgcn_s_buffer_load intrinsic and store
12636// the three offsets (voffset, soffset and instoffset) into the SDValue[3] array
12637// pointed to by Offsets.
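// For instance (a sketch; the exact split is decided by splitMUBUFOffset and
// the requested alignment): a constant combined offset of 0x20010 might
// become voffset = 0, soffset = 0x20000 and instoffset = 0x10, while a
// non-constant combined offset simply becomes the voffset with soffset and
// instoffset of zero.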
12638void SITargetLowering::setBufferOffsets(SDValue CombinedOffset,
12639 SelectionDAG &DAG, SDValue *Offsets,
12640 Align Alignment) const {
12641 const SIInstrInfo *TII = getSubtarget()->getInstrInfo();
12642 SDLoc DL(CombinedOffset);
12643 if (auto *C = dyn_cast<ConstantSDNode>(CombinedOffset)) {
12644 uint32_t Imm = C->getZExtValue();
12645 uint32_t SOffset, ImmOffset;
12646 if (TII->splitMUBUFOffset(Imm, SOffset, ImmOffset, Alignment)) {
12647 Offsets[0] = DAG.getConstant(0, DL, MVT::i32);
12648 Offsets[1] = DAG.getConstant(SOffset, DL, MVT::i32);
12649 Offsets[2] = DAG.getTargetConstant(ImmOffset, DL, MVT::i32);
12650 return;
12651 }
12652 }
12653 if (DAG.isBaseWithConstantOffset(CombinedOffset)) {
12654 // On GFX1250+, voffset and immoffset are zero-extended from 32 bits before
12655 // being added, so we can only safely match a 32-bit addition with no
12656 // unsigned overflow.
12657 bool CheckNUW = Subtarget->hasGFX1250Insts();
12658 SDValue N0 = CombinedOffset.getOperand(0);
12659 SDValue N1 = CombinedOffset.getOperand(1);
12660 uint32_t SOffset, ImmOffset;
12661 int Offset = cast<ConstantSDNode>(N1)->getSExtValue();
12662 if (Offset >= 0 && (!CheckNUW || isNoUnsignedWrap(CombinedOffset)) &&
12663 TII->splitMUBUFOffset(Offset, SOffset, ImmOffset, Alignment)) {
12664 Offsets[0] = N0;
12665 Offsets[1] = DAG.getConstant(SOffset, DL, MVT::i32);
12666 Offsets[2] = DAG.getTargetConstant(ImmOffset, DL, MVT::i32);
12667 return;
12668 }
12669 }
12670
12671 SDValue SOffsetZero = Subtarget->hasRestrictedSOffset()
12672 ? DAG.getRegister(AMDGPU::SGPR_NULL, MVT::i32)
12673 : DAG.getConstant(0, DL, MVT::i32);
12674
12675 Offsets[0] = CombinedOffset;
12676 Offsets[1] = SOffsetZero;
12677 Offsets[2] = DAG.getTargetConstant(0, DL, MVT::i32);
12678}
12679
12680SDValue SITargetLowering::bufferRsrcPtrToVector(SDValue MaybePointer,
12681 SelectionDAG &DAG) const {
12682 if (!MaybePointer.getValueType().isScalarInteger())
12683 return MaybePointer;
12684
12685 SDValue Rsrc = DAG.getBitcast(MVT::v4i32, MaybePointer);
12686 return Rsrc;
12687}
12688
12689// Wrap a global or flat pointer into a buffer intrinsic using the flags
12690// specified in the intrinsic.
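// Rough sketch of the descriptor assembled in the legacy (32-bit num_records)
// path below: word0 = base[31:0], word1 = (stride << 16) | base[47:32],
// word2 = num_records, word3 = flags. The 45-bit num_records form uses the
// wider layout described in the comments inside the function.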
12691SDValue SITargetLowering::lowerPointerAsRsrcIntrin(SDNode *Op,
12692 SelectionDAG &DAG) const {
12693 SDLoc Loc(Op);
12694
12695 SDValue Pointer = Op->getOperand(1);
12696 SDValue Stride = Op->getOperand(2);
12697 SDValue NumRecords = Op->getOperand(3);
12698 SDValue Flags = Op->getOperand(4);
12699
12700 SDValue ExtStride = DAG.getAnyExtOrTrunc(Stride, Loc, MVT::i32);
12701 SDValue Rsrc;
12702
12703 if (Subtarget->has45BitNumRecordsBufferResource()) {
12704 SDValue Zero = DAG.getConstant(0, Loc, MVT::i32);
12705 // Build the lower 64-bit value, which has a 57-bit base and the lower 7-bit
12706 // num_records.
12707 SDValue ExtPointer = DAG.getAnyExtOrTrunc(Pointer, Loc, MVT::i64);
12708 SDValue NumRecordsLHS =
12709 DAG.getNode(ISD::SHL, Loc, MVT::i64, NumRecords,
12710 DAG.getShiftAmountConstant(57, MVT::i32, Loc));
12711 SDValue LowHalf =
12712 DAG.getNode(ISD::OR, Loc, MVT::i64, ExtPointer, NumRecordsLHS);
12713
12714 // Build the higher 64-bit value, which has the higher 38-bit num_records,
12715 // 6-bit zero (omit), 16-bit stride and scale and 4-bit flag.
12716 SDValue NumRecordsRHS =
12717 DAG.getNode(ISD::SRL, Loc, MVT::i64, NumRecords,
12718 DAG.getShiftAmountConstant(7, MVT::i32, Loc));
12719 SDValue ShiftedStride =
12720 DAG.getNode(ISD::SHL, Loc, MVT::i32, ExtStride,
12721 DAG.getShiftAmountConstant(12, MVT::i32, Loc));
12722 SDValue ExtShiftedStrideVec =
12723 DAG.getNode(ISD::BUILD_VECTOR, Loc, MVT::v2i32, Zero, ShiftedStride);
12724 SDValue ExtShiftedStride =
12725 DAG.getNode(ISD::BITCAST, Loc, MVT::i64, ExtShiftedStrideVec);
12726 SDValue ShiftedFlags =
12727 DAG.getNode(ISD::SHL, Loc, MVT::i32, Flags,
12728 DAG.getShiftAmountConstant(28, MVT::i32, Loc));
12729 SDValue ExtShiftedFlagsVec =
12730 DAG.getNode(ISD::BUILD_VECTOR, Loc, MVT::v2i32, Zero, ShiftedFlags);
12731 SDValue ExtShiftedFlags =
12732 DAG.getNode(ISD::BITCAST, Loc, MVT::i64, ExtShiftedFlagsVec);
12733 SDValue CombinedFields =
12734 DAG.getNode(ISD::OR, Loc, MVT::i64, NumRecordsRHS, ExtShiftedStride);
12735 SDValue HighHalf =
12736 DAG.getNode(ISD::OR, Loc, MVT::i64, CombinedFields, ExtShiftedFlags);
12737
12738 Rsrc = DAG.getNode(ISD::BUILD_VECTOR, Loc, MVT::v2i64, LowHalf, HighHalf);
12739 } else {
12740 NumRecords = DAG.getAnyExtOrTrunc(NumRecords, Loc, MVT::i32);
12741 auto [LowHalf, HighHalf] =
12742 DAG.SplitScalar(Pointer, Loc, MVT::i32, MVT::i32);
12743 SDValue Mask = DAG.getConstant(0x0000ffff, Loc, MVT::i32);
12744 SDValue Masked = DAG.getNode(ISD::AND, Loc, MVT::i32, HighHalf, Mask);
12745 SDValue ShiftedStride =
12746 DAG.getNode(ISD::SHL, Loc, MVT::i32, ExtStride,
12747 DAG.getShiftAmountConstant(16, MVT::i32, Loc));
12748 SDValue NewHighHalf =
12749 DAG.getNode(ISD::OR, Loc, MVT::i32, Masked, ShiftedStride);
12750
12751 Rsrc = DAG.getNode(ISD::BUILD_VECTOR, Loc, MVT::v4i32, LowHalf, NewHighHalf,
12752 NumRecords, Flags);
12753 }
12754
12755 SDValue RsrcPtr = DAG.getNode(ISD::BITCAST, Loc, MVT::i128, Rsrc);
12756 return RsrcPtr;
12757}
12758
12759// Handle 8 bit and 16 bit buffer loads
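// A sub-dword result is produced via the 32-bit unsigned-load nodes below:
// the i32 result is truncated to the memory width and, for f16/bf16, bitcast
// back to the requested floating-point type.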
12760SDValue SITargetLowering::handleByteShortBufferLoads(SelectionDAG &DAG,
12761 EVT LoadVT, SDLoc DL,
12762 ArrayRef<SDValue> Ops,
12763 MachineMemOperand *MMO,
12764 bool IsTFE) const {
12765 EVT IntVT = LoadVT.changeTypeToInteger();
12766
12767 if (IsTFE) {
12768 unsigned Opc = (LoadVT.getScalarType() == MVT::i8)
12769 ? AMDGPUISD::BUFFER_LOAD_UBYTE_TFE
12770 : AMDGPUISD::BUFFER_LOAD_USHORT_TFE;
12771 MachineFunction &MF = DAG.getMachineFunction();
12772 MachineMemOperand *OpMMO = MF.getMachineMemOperand(MMO, 0, 8);
12773 SDVTList VTs = DAG.getVTList(MVT::v2i32, MVT::Other);
12774 SDValue Op = getMemIntrinsicNode(Opc, DL, VTs, Ops, MVT::v2i32, OpMMO, DAG);
12775 SDValue Status = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::i32, Op,
12776 DAG.getConstant(1, DL, MVT::i32));
12777 SDValue Data = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::i32, Op,
12778 DAG.getConstant(0, DL, MVT::i32));
12779 SDValue Trunc = DAG.getNode(ISD::TRUNCATE, DL, IntVT, Data);
12780 SDValue Value = DAG.getNode(ISD::BITCAST, DL, LoadVT, Trunc);
12781 return DAG.getMergeValues({Value, Status, SDValue(Op.getNode(), 1)}, DL);
12782 }
12783
12784 unsigned Opc = LoadVT.getScalarType() == MVT::i8
12785 ? AMDGPUISD::BUFFER_LOAD_UBYTE
12786 : AMDGPUISD::BUFFER_LOAD_USHORT;
12787
12788 SDVTList ResList = DAG.getVTList(MVT::i32, MVT::Other);
12789 SDValue BufferLoad =
12790 DAG.getMemIntrinsicNode(Opc, DL, ResList, Ops, IntVT, MMO);
12791 SDValue LoadVal = DAG.getNode(ISD::TRUNCATE, DL, IntVT, BufferLoad);
12792 LoadVal = DAG.getNode(ISD::BITCAST, DL, LoadVT, LoadVal);
12793
12794 return DAG.getMergeValues({LoadVal, BufferLoad.getValue(1)}, DL);
12795}
12796
12797// Handle 8 bit and 16 bit buffer stores
12798SDValue SITargetLowering::handleByteShortBufferStores(SelectionDAG &DAG,
12799 EVT VDataType, SDLoc DL,
12800 SDValue Ops[],
12801 MemSDNode *M) const {
12802 if (VDataType == MVT::f16 || VDataType == MVT::bf16)
12803 Ops[1] = DAG.getNode(ISD::BITCAST, DL, MVT::i16, Ops[1]);
12804
12805 SDValue BufferStoreExt = DAG.getNode(ISD::ANY_EXTEND, DL, MVT::i32, Ops[1]);
12806 Ops[1] = BufferStoreExt;
12807 unsigned Opc = (VDataType == MVT::i8) ? AMDGPUISD::BUFFER_STORE_BYTE
12808 : AMDGPUISD::BUFFER_STORE_SHORT;
12809 ArrayRef<SDValue> OpsRef = ArrayRef(&Ops[0], 9);
12810 return DAG.getMemIntrinsicNode(Opc, DL, M->getVTList(), OpsRef, VDataType,
12811 M->getMemOperand());
12812}
12813
12814 static SDValue getLoadExtOrTrunc(SelectionDAG &DAG, ISD::LoadExtType ExtType,
12815 SDValue Op, const SDLoc &SL, EVT VT) {
12816 if (VT.bitsLT(Op.getValueType()))
12817 return DAG.getNode(ISD::TRUNCATE, SL, VT, Op);
12818
12819 switch (ExtType) {
12820 case ISD::SEXTLOAD:
12821 return DAG.getNode(ISD::SIGN_EXTEND, SL, VT, Op);
12822 case ISD::ZEXTLOAD:
12823 return DAG.getNode(ISD::ZERO_EXTEND, SL, VT, Op);
12824 case ISD::EXTLOAD:
12825 return DAG.getNode(ISD::ANY_EXTEND, SL, VT, Op);
12826 case ISD::NON_EXTLOAD:
12827 return Op;
12828 }
12829
12830 llvm_unreachable("invalid ext type");
12831}
12832
12833// Try to turn 8 and 16-bit scalar loads into SMEM eligible 32-bit loads.
12834// TODO: Skip this on GFX12 which does have scalar sub-dword loads.
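// For example (a sketch of the intended transform): a uniform, 4-byte aligned
// zextload of i16 from the constant address space is rewritten as a 32-bit
// load followed by a zero-extend-in-reg of the low 16 bits, which makes it
// selectable as an SMEM load.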
12835SDValue SITargetLowering::widenLoad(LoadSDNode *Ld,
12836 DAGCombinerInfo &DCI) const {
12837 SelectionDAG &DAG = DCI.DAG;
12838 if (Ld->getAlign() < Align(4) || Ld->isDivergent())
12839 return SDValue();
12840
12841 // FIXME: Constant loads should all be marked invariant.
12842 unsigned AS = Ld->getAddressSpace();
12843 if (AS != AMDGPUAS::CONSTANT_ADDRESS &&
12844 AS != AMDGPUAS::CONSTANT_ADDRESS_32BIT &&
12845 (AS != AMDGPUAS::GLOBAL_ADDRESS || !Ld->isInvariant()))
12846 return SDValue();
12847
12848 // Don't do this early, since it may interfere with adjacent load merging for
12849 // illegal types. We can avoid losing alignment information for exotic types
12850 // pre-legalize.
12851 EVT MemVT = Ld->getMemoryVT();
12852 if ((MemVT.isSimple() && !DCI.isAfterLegalizeDAG()) ||
12853 MemVT.getSizeInBits() >= 32)
12854 return SDValue();
12855
12856 SDLoc SL(Ld);
12857
12858 assert((!MemVT.isVector() || Ld->getExtensionType() == ISD::NON_EXTLOAD) &&
12859 "unexpected vector extload");
12860
12861 // TODO: Drop only high part of range.
12862 SDValue Ptr = Ld->getBasePtr();
12863 SDValue NewLoad = DAG.getLoad(
12864 ISD::UNINDEXED, ISD::NON_EXTLOAD, MVT::i32, SL, Ld->getChain(), Ptr,
12865 Ld->getOffset(), Ld->getPointerInfo(), MVT::i32, Ld->getAlign(),
12866 Ld->getMemOperand()->getFlags(), Ld->getAAInfo(),
12867 nullptr); // Drop ranges
12868
12869 EVT TruncVT = EVT::getIntegerVT(*DAG.getContext(), MemVT.getSizeInBits());
12870 if (MemVT.isFloatingPoint()) {
12871 assert(Ld->getExtensionType() == ISD::NON_EXTLOAD &&
12872 "unexpected fp extload");
12873 TruncVT = MemVT.changeTypeToInteger();
12874 }
12875
12876 SDValue Cvt = NewLoad;
12877 if (Ld->getExtensionType() == ISD::SEXTLOAD) {
12878 Cvt = DAG.getNode(ISD::SIGN_EXTEND_INREG, SL, MVT::i32, NewLoad,
12879 DAG.getValueType(TruncVT));
12880 } else if (Ld->getExtensionType() == ISD::ZEXTLOAD ||
12881 Ld->getExtensionType() == ISD::EXTLOAD) {
12882 Cvt = DAG.getZeroExtendInReg(NewLoad, SL, TruncVT);
12883 } else {
12884 assert(Ld->getExtensionType() == ISD::NON_EXTLOAD);
12885 }
12886
12887 EVT VT = Ld->getValueType(0);
12888 EVT IntVT = EVT::getIntegerVT(*DAG.getContext(), VT.getSizeInBits());
12889
12890 DCI.AddToWorklist(Cvt.getNode());
12891
12892 // We may need to handle exotic cases, such as i16->i64 extloads, so insert
12893 // the appropriate extension from the 32-bit load.
12894 Cvt = getLoadExtOrTrunc(DAG, Ld->getExtensionType(), Cvt, SL, IntVT);
12895 DCI.AddToWorklist(Cvt.getNode());
12896
12897 // Handle conversion back to floating point if necessary.
12898 Cvt = DAG.getNode(ISD::BITCAST, SL, VT, Cvt);
12899
12900 return DAG.getMergeValues({Cvt, NewLoad.getValue(1)}, SL);
12901}
12902
12903static bool addressMayBeAccessedAsPrivate(const MachineMemOperand *MMO,
12904 const SIMachineFunctionInfo &Info) {
12905 // TODO: Should check if the address can definitely not access stack.
12906 if (Info.isEntryFunction())
12907 return Info.getUserSGPRInfo().hasFlatScratchInit();
12908 return true;
12909}
12910
12911SDValue SITargetLowering::LowerLOAD(SDValue Op, SelectionDAG &DAG) const {
12912 SDLoc DL(Op);
12913 LoadSDNode *Load = cast<LoadSDNode>(Op);
12914 ISD::LoadExtType ExtType = Load->getExtensionType();
12915 EVT MemVT = Load->getMemoryVT();
12916 MachineMemOperand *MMO = Load->getMemOperand();
12917
12918 if (ExtType == ISD::NON_EXTLOAD && MemVT.getSizeInBits() < 32) {
12919 if (MemVT == MVT::i16 && isTypeLegal(MVT::i16))
12920 return SDValue();
12921
12922 // FIXME: Copied from PPC
12923 // First, load into 32 bits, then truncate to 1 bit.
12924
12925 SDValue Chain = Load->getChain();
12926 SDValue BasePtr = Load->getBasePtr();
12927
12928 EVT RealMemVT = (MemVT == MVT::i1) ? MVT::i8 : MVT::i16;
12929
12930 SDValue NewLD = DAG.getExtLoad(ISD::EXTLOAD, DL, MVT::i32, Chain, BasePtr,
12931 RealMemVT, MMO);
12932
12933 if (!MemVT.isVector()) {
12934 SDValue Ops[] = {DAG.getNode(ISD::TRUNCATE, DL, MemVT, NewLD),
12935 NewLD.getValue(1)};
12936
12937 return DAG.getMergeValues(Ops, DL);
12938 }
12939
12940 SmallVector<SDValue, 4> Elts;
12941 for (unsigned I = 0, N = MemVT.getVectorNumElements(); I != N; ++I) {
12942 SDValue Elt = DAG.getNode(ISD::SRL, DL, MVT::i32, NewLD,
12943 DAG.getConstant(I, DL, MVT::i32));
12944
12945 Elts.push_back(DAG.getNode(ISD::TRUNCATE, DL, MVT::i1, Elt));
12946 }
12947
12948 SDValue Ops[] = {DAG.getBuildVector(MemVT, DL, Elts), NewLD.getValue(1)};
12949
12950 return DAG.getMergeValues(Ops, DL);
12951 }
12952
12953 if (!MemVT.isVector())
12954 return SDValue();
12955
12956 assert(Op.getValueType().getVectorElementType() == MVT::i32 &&
12957 "Custom lowering for non-i32 vectors hasn't been implemented.");
12958
12959 Align Alignment = Load->getAlign();
12960 unsigned AS = Load->getAddressSpace();
12961 if (Subtarget->hasLDSMisalignedBugInWGPMode() &&
12962 AS == AMDGPUAS::FLAT_ADDRESS &&
12963 Alignment.value() < MemVT.getStoreSize() && MemVT.getSizeInBits() > 32) {
12964 return SplitVectorLoad(Op, DAG);
12965 }
12966
12967 MachineFunction &MF = DAG.getMachineFunction();
12968 SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();
12969 // If there is a possibility that flat instructions access scratch memory,
12970 // then we need to use the same legalization rules we use for private.
12971 if (AS == AMDGPUAS::FLAT_ADDRESS &&
12972 !Subtarget->hasMultiDwordFlatScratchAddressing())
12973 AS = addressMayBeAccessedAsPrivate(Load->getMemOperand(), *MFI)
12974 ? AMDGPUAS::PRIVATE_ADDRESS
12975 : AMDGPUAS::GLOBAL_ADDRESS;
12976
12977 unsigned NumElements = MemVT.getVectorNumElements();
12978
12979 if (AS == AMDGPUAS::CONSTANT_ADDRESS ||
12980 AS == AMDGPUAS::CONSTANT_ADDRESS_32BIT ||
12981 (AS == AMDGPUAS::GLOBAL_ADDRESS &&
12982 Subtarget->getScalarizeGlobalBehavior() && Load->isSimple() &&
12983 (Load->isInvariant() || isMemOpHasNoClobberedMemOperand(Load)))) {
12984 if ((!Op->isDivergent() || AMDGPU::isUniformMMO(MMO)) &&
12985 Alignment >= Align(4) && NumElements < 32) {
12986 if (MemVT.isPow2VectorType() ||
12987 (Subtarget->hasScalarDwordx3Loads() && NumElements == 3))
12988 return SDValue();
12989 return WidenOrSplitVectorLoad(Op, DAG);
12990 }
12991 // Non-uniform loads will be selected to MUBUF instructions, so they
12992 // have the same legalization requirements as global and private
12993 // loads.
12994 //
12995 }
12996 if (AS == AMDGPUAS::CONSTANT_ADDRESS ||
12997 AS == AMDGPUAS::CONSTANT_ADDRESS_32BIT ||
12998 AS == AMDGPUAS::GLOBAL_ADDRESS || AS == AMDGPUAS::FLAT_ADDRESS) {
12999 if (NumElements > 4)
13000 return SplitVectorLoad(Op, DAG);
13001 // v3 loads not supported on SI.
13002 if (NumElements == 3 && !Subtarget->hasDwordx3LoadStores())
13003 return WidenOrSplitVectorLoad(Op, DAG);
13004
13005 // v3 and v4 loads are supported for private and global memory.
13006 return SDValue();
13007 }
13008 if (AS == AMDGPUAS::PRIVATE_ADDRESS) {
13009 // Depending on the setting of the private_element_size field in the
13010 // resource descriptor, we can only make private accesses up to a certain
13011 // size.
13012 switch (Subtarget->getMaxPrivateElementSize()) {
13013 case 4: {
13014 auto [Op0, Op1] = scalarizeVectorLoad(Load, DAG);
13015 return DAG.getMergeValues({Op0, Op1}, DL);
13016 }
13017 case 8:
13018 if (NumElements > 2)
13019 return SplitVectorLoad(Op, DAG);
13020 return SDValue();
13021 case 16:
13022 // Same as global/flat
13023 if (NumElements > 4)
13024 return SplitVectorLoad(Op, DAG);
13025 // v3 loads not supported on SI.
13026 if (NumElements == 3 && !Subtarget->hasDwordx3LoadStores())
13027 return WidenOrSplitVectorLoad(Op, DAG);
13028
13029 return SDValue();
13030 default:
13031 llvm_unreachable("unsupported private_element_size");
13032 }
13033 } else if (AS == AMDGPUAS::LOCAL_ADDRESS || AS == AMDGPUAS::REGION_ADDRESS) {
13034 unsigned Fast = 0;
13035 auto Flags = Load->getMemOperand()->getFlags();
13036 if (allowsMisalignedMemoryAccessesImpl(MemVT.getSizeInBits(), AS,
13037 Load->getAlign(), Flags, &Fast) &&
13038 Fast > 1)
13039 return SDValue();
13040
13041 if (MemVT.isVector())
13042 return SplitVectorLoad(Op, DAG);
13043 }
13044
13045 if (!allowsMemoryAccessForAlignment(*DAG.getContext(), DAG.getDataLayout(),
13046 MemVT, *Load->getMemOperand())) {
13047 auto [Op0, Op1] = expandUnalignedLoad(Load, DAG);
13048 return DAG.getMergeValues({Op0, Op1}, DL);
13049 }
13050
13051 return SDValue();
13052}
13053
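// 64-bit selects are lowered by freezing the condition, bitcasting both value
// operands to v2i32, selecting the low and high halves independently, and
// bitcasting the rebuilt vector back to the original type.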
13054SDValue SITargetLowering::LowerSELECT(SDValue Op, SelectionDAG &DAG) const {
13055 EVT VT = Op.getValueType();
13056 if (VT.getSizeInBits() == 128 || VT.getSizeInBits() == 256 ||
13057 VT.getSizeInBits() == 512)
13058 return splitTernaryVectorOp(Op, DAG);
13059
13060 assert(VT.getSizeInBits() == 64);
13061
13062 SDLoc DL(Op);
13063 SDValue Cond = DAG.getFreeze(Op.getOperand(0));
13064
13065 SDValue Zero = DAG.getConstant(0, DL, MVT::i32);
13066 SDValue One = DAG.getConstant(1, DL, MVT::i32);
13067
13068 SDValue LHS = DAG.getNode(ISD::BITCAST, DL, MVT::v2i32, Op.getOperand(1));
13069 SDValue RHS = DAG.getNode(ISD::BITCAST, DL, MVT::v2i32, Op.getOperand(2));
13070
13071 SDValue Lo0 = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::i32, LHS, Zero);
13072 SDValue Lo1 = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::i32, RHS, Zero);
13073
13074 SDValue Lo = DAG.getSelect(DL, MVT::i32, Cond, Lo0, Lo1);
13075
13076 SDValue Hi0 = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::i32, LHS, One);
13077 SDValue Hi1 = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::i32, RHS, One);
13078
13079 SDValue Hi = DAG.getSelect(DL, MVT::i32, Cond, Hi0, Hi1);
13080
13081 SDValue Res = DAG.getBuildVector(MVT::v2i32, DL, {Lo, Hi});
13082 return DAG.getNode(ISD::BITCAST, DL, VT, Res);
13083}
13084
13085// Catch division cases where we can use shortcuts with rcp and rsq
13086// instructions.
13087SDValue SITargetLowering::lowerFastUnsafeFDIV(SDValue Op,
13088 SelectionDAG &DAG) const {
13089 SDLoc SL(Op);
13090 SDValue LHS = Op.getOperand(0);
13091 SDValue RHS = Op.getOperand(1);
13092 EVT VT = Op.getValueType();
13093 const SDNodeFlags Flags = Op->getFlags();
13094
13095 bool AllowInaccurateRcp = Flags.hasApproximateFuncs();
13096
13097 if (const ConstantFPSDNode *CLHS = dyn_cast<ConstantFPSDNode>(LHS)) {
13098 // Without !fpmath accuracy information, we can't do more because we don't
13099 // know exactly whether rcp is accurate enough to meet !fpmath requirement.
13100 // f16 is always accurate enough
13101 if (!AllowInaccurateRcp && VT != MVT::f16 && VT != MVT::bf16)
13102 return SDValue();
13103
13104 if (CLHS->isExactlyValue(1.0)) {
13105 // v_rcp_f32 and v_rsq_f32 do not support denormals, and according to
13106 // the CI documentation has a worst case error of 1 ulp.
13107 // OpenCL requires <= 2.5 ulp for 1.0 / x, so it should always be OK to
13108 // use it as long as we aren't trying to use denormals.
13109 //
13110 // v_rcp_f16 and v_rsq_f16 DO support denormals and 0.51ulp.
13111
13112 // 1.0 / sqrt(x) -> rsq(x)
13113
13114 // XXX - Is afn sufficient to do this for f64? The maximum ULP
13115 // error seems really high at 2^29 ULP.
13116 // 1.0 / x -> rcp(x)
13117 return DAG.getNode(AMDGPUISD::RCP, SL, VT, RHS);
13118 }
13119
13120 // Same as for 1.0, but expand the sign out of the constant.
13121 if (CLHS->isExactlyValue(-1.0)) {
13122 // -1.0 / x -> rcp (fneg x)
13123 SDValue FNegRHS = DAG.getNode(ISD::FNEG, SL, VT, RHS);
13124 return DAG.getNode(AMDGPUISD::RCP, SL, VT, FNegRHS);
13125 }
13126 }
13127
13128 // For f16 and bf16 require afn or arcp.
13129 // For f32 require afn.
13130 if (!AllowInaccurateRcp &&
13131 ((VT != MVT::f16 && VT != MVT::bf16) || !Flags.hasAllowReciprocal()))
13132 return SDValue();
13133
13134 // Turn into multiply by the reciprocal.
13135 // x / y -> x * (1.0 / y)
13136 SDValue Recip = DAG.getNode(AMDGPUISD::RCP, SL, VT, RHS);
13137 return DAG.getNode(ISD::FMUL, SL, VT, LHS, Recip, Flags);
13138}
13139
13140SDValue SITargetLowering::lowerFastUnsafeFDIV64(SDValue Op,
13141 SelectionDAG &DAG) const {
13142 SDLoc SL(Op);
13143 SDValue X = Op.getOperand(0);
13144 SDValue Y = Op.getOperand(1);
13145 EVT VT = Op.getValueType();
13146 const SDNodeFlags Flags = Op->getFlags();
13147
13148 bool AllowInaccurateDiv = Flags.hasApproximateFuncs();
13149 if (!AllowInaccurateDiv)
13150 return SDValue();
13151
13152 const ConstantFPSDNode *CLHS = dyn_cast<ConstantFPSDNode>(X);
13153 bool IsNegRcp = CLHS && CLHS->isExactlyValue(-1.0);
13154
13155 // Pull out the negation so it folds for free into the source modifiers.
13156 if (IsNegRcp)
13157 X = DAG.getConstantFP(1.0, SL, VT);
13158
13159 SDValue NegY = IsNegRcp ? Y : DAG.getNode(ISD::FNEG, SL, VT, Y);
13160 SDValue One = DAG.getConstantFP(1.0, SL, VT);
13161
13162 SDValue R = DAG.getNode(AMDGPUISD::RCP, SL, VT, Y);
13163 if (IsNegRcp)
13164 R = DAG.getNode(ISD::FNEG, SL, VT, R);
13165
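// Refine the reciprocal estimate with two Newton-Raphson style steps, each
// built from a pair of FMAs: compute the error term e = 1 - y*r, then fold it
// back in with r = r + r*e.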
13166 SDValue Tmp0 = DAG.getNode(ISD::FMA, SL, VT, NegY, R, One);
13167
13168 R = DAG.getNode(ISD::FMA, SL, VT, Tmp0, R, R);
13169 SDValue Tmp1 = DAG.getNode(ISD::FMA, SL, VT, NegY, R, One);
13170 R = DAG.getNode(ISD::FMA, SL, VT, Tmp1, R, R);
13171
13172 // Skip the last 2 correction terms for reciprocal.
13173 if (IsNegRcp || (CLHS && CLHS->isExactlyValue(1.0)))
13174 return R;
13175
13176 SDValue Ret = DAG.getNode(ISD::FMUL, SL, VT, X, R);
13177 SDValue Tmp2 = DAG.getNode(ISD::FMA, SL, VT, NegY, Ret, X);
13178 return DAG.getNode(ISD::FMA, SL, VT, Tmp2, R, Ret);
13179}
13180
13181static SDValue getFPBinOp(SelectionDAG &DAG, unsigned Opcode, const SDLoc &SL,
13182 EVT VT, SDValue A, SDValue B, SDValue GlueChain,
13183 SDNodeFlags Flags) {
13184 if (GlueChain->getNumValues() <= 1) {
13185 return DAG.getNode(Opcode, SL, VT, A, B, Flags);
13186 }
13187
13188 assert(GlueChain->getNumValues() == 3);
13189
13190 SDVTList VTList = DAG.getVTList(VT, MVT::Other, MVT::Glue);
13191 switch (Opcode) {
13192 default:
13193 llvm_unreachable("no chain equivalent for opcode");
13194 case ISD::FMUL:
13195 Opcode = AMDGPUISD::FMUL_W_CHAIN;
13196 break;
13197 }
13198
13199 return DAG.getNode(Opcode, SL, VTList,
13200 {GlueChain.getValue(1), A, B, GlueChain.getValue(2)},
13201 Flags);
13202}
13203
13204static SDValue getFPTernOp(SelectionDAG &DAG, unsigned Opcode, const SDLoc &SL,
13205 EVT VT, SDValue A, SDValue B, SDValue C,
13206 SDValue GlueChain, SDNodeFlags Flags) {
13207 if (GlueChain->getNumValues() <= 1) {
13208 return DAG.getNode(Opcode, SL, VT, {A, B, C}, Flags);
13209 }
13210
13211 assert(GlueChain->getNumValues() == 3);
13212
13213 SDVTList VTList = DAG.getVTList(VT, MVT::Other, MVT::Glue);
13214 switch (Opcode) {
13215 default:
13216 llvm_unreachable("no chain equivalent for opcode");
13217 case ISD::FMA:
13218 Opcode = AMDGPUISD::FMA_W_CHAIN;
13219 break;
13220 }
13221
13222 return DAG.getNode(Opcode, SL, VTList,
13223 {GlueChain.getValue(1), A, B, C, GlueChain.getValue(2)},
13224 Flags);
13225}
13226
13227SDValue SITargetLowering::LowerFDIV16(SDValue Op, SelectionDAG &DAG) const {
13228 if (SDValue FastLowered = lowerFastUnsafeFDIV(Op, DAG))
13229 return FastLowered;
13230
13231 SDLoc SL(Op);
13232 EVT VT = Op.getValueType();
13233 SDValue LHS = Op.getOperand(0);
13234 SDValue RHS = Op.getOperand(1);
13235
13236 SDValue LHSExt = DAG.getNode(ISD::FP_EXTEND, SL, MVT::f32, LHS);
13237 SDValue RHSExt = DAG.getNode(ISD::FP_EXTEND, SL, MVT::f32, RHS);
13238
13239 if (VT == MVT::bf16) {
13240 SDValue ExtDiv =
13241 DAG.getNode(ISD::FDIV, SL, MVT::f32, LHSExt, RHSExt, Op->getFlags());
13242 return DAG.getNode(ISD::FP_ROUND, SL, MVT::bf16, ExtDiv,
13243 DAG.getTargetConstant(0, SL, MVT::i32));
13244 }
13245
13246 assert(VT == MVT::f16);
13247
13248 // a32.u = opx(V_CVT_F32_F16, a.u); // CVT to F32
13249 // b32.u = opx(V_CVT_F32_F16, b.u); // CVT to F32
13250 // r32.u = opx(V_RCP_F32, b32.u); // rcp = 1 / d
13251 // q32.u = opx(V_MUL_F32, a32.u, r32.u); // q = n * rcp
13252 // e32.u = opx(V_MAD_F32, (b32.u^_neg32), q32.u, a32.u); // err = -d * q + n
13253 // q32.u = opx(V_MAD_F32, e32.u, r32.u, q32.u); // q = n * rcp
13254 // e32.u = opx(V_MAD_F32, (b32.u^_neg32), q32.u, a32.u); // err = -d * q + n
13255 // tmp.u = opx(V_MUL_F32, e32.u, r32.u);
13256 // tmp.u = opx(V_AND_B32, tmp.u, 0xff800000)
13257 // q32.u = opx(V_ADD_F32, tmp.u, q32.u);
13258 // q16.u = opx(V_CVT_F16_F32, q32.u);
13259 // q16.u = opx(V_DIV_FIXUP_F16, q16.u, b.u, a.u); // q = touchup(q, d, n)
13260
13261 // We will use ISD::FMA on targets that don't support ISD::FMAD.
13262 unsigned FMADOpCode =
13263 isOperationLegal(ISD::FMAD, MVT::f32) ? ISD::FMAD : ISD::FMA;
13264 SDValue NegRHSExt = DAG.getNode(ISD::FNEG, SL, MVT::f32, RHSExt);
13265 SDValue Rcp =
13266 DAG.getNode(AMDGPUISD::RCP, SL, MVT::f32, RHSExt, Op->getFlags());
13267 SDValue Quot =
13268 DAG.getNode(ISD::FMUL, SL, MVT::f32, LHSExt, Rcp, Op->getFlags());
13269 SDValue Err = DAG.getNode(FMADOpCode, SL, MVT::f32, NegRHSExt, Quot, LHSExt,
13270 Op->getFlags());
13271 Quot = DAG.getNode(FMADOpCode, SL, MVT::f32, Err, Rcp, Quot, Op->getFlags());
13272 Err = DAG.getNode(FMADOpCode, SL, MVT::f32, NegRHSExt, Quot, LHSExt,
13273 Op->getFlags());
13274 SDValue Tmp = DAG.getNode(ISD::FMUL, SL, MVT::f32, Err, Rcp, Op->getFlags());
13275 SDValue TmpCast = DAG.getNode(ISD::BITCAST, SL, MVT::i32, Tmp);
13276 TmpCast = DAG.getNode(ISD::AND, SL, MVT::i32, TmpCast,
13277 DAG.getConstant(0xff800000, SL, MVT::i32));
13278 Tmp = DAG.getNode(ISD::BITCAST, SL, MVT::f32, TmpCast);
13279 Quot = DAG.getNode(ISD::FADD, SL, MVT::f32, Tmp, Quot, Op->getFlags());
13280 SDValue RDst = DAG.getNode(ISD::FP_ROUND, SL, MVT::f16, Quot,
13281 DAG.getTargetConstant(0, SL, MVT::i32));
13282 return DAG.getNode(AMDGPUISD::DIV_FIXUP, SL, MVT::f16, RDst, RHS, LHS,
13283 Op->getFlags());
13284}
13285
13286// Faster 2.5 ULP division that does not support denormals.
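// If |rhs| is very large (above 2^96), rhs is pre-scaled by 2^-32 before the
// rcp so the reciprocal stays in range, and the quotient is multiplied by the
// same factor afterwards, leaving lhs/rhs unchanged.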
13287SDValue SITargetLowering::lowerFDIV_FAST(SDValue Op, SelectionDAG &DAG) const {
13288 SDNodeFlags Flags = Op->getFlags();
13289 SDLoc SL(Op);
13290 SDValue LHS = Op.getOperand(1);
13291 SDValue RHS = Op.getOperand(2);
13292
13293 // TODO: The combiner should probably handle elimination of redundant fabs.
13295 ? RHS
13296 : DAG.getNode(ISD::FABS, SL, MVT::f32, RHS, Flags);
13297
13298 const APFloat K0Val(0x1p+96f);
13299 const SDValue K0 = DAG.getConstantFP(K0Val, SL, MVT::f32);
13300
13301 const APFloat K1Val(0x1p-32f);
13302 const SDValue K1 = DAG.getConstantFP(K1Val, SL, MVT::f32);
13303
13304 const SDValue One = DAG.getConstantFP(1.0, SL, MVT::f32);
13305
13306 EVT SetCCVT =
13307 getSetCCResultType(DAG.getDataLayout(), *DAG.getContext(), MVT::f32);
13308
13309 SDValue r2 = DAG.getSetCC(SL, SetCCVT, r1, K0, ISD::SETOGT);
13310
13311 SDValue r3 = DAG.getNode(ISD::SELECT, SL, MVT::f32, r2, K1, One, Flags);
13312
13313 r1 = DAG.getNode(ISD::FMUL, SL, MVT::f32, RHS, r3, Flags);
13314
13315 // rcp does not support denormals.
13316 SDValue r0 = DAG.getNode(AMDGPUISD::RCP, SL, MVT::f32, r1, Flags);
13317
13318 SDValue Mul = DAG.getNode(ISD::FMUL, SL, MVT::f32, LHS, r0, Flags);
13319
13320 return DAG.getNode(ISD::FMUL, SL, MVT::f32, r3, Mul, Flags);
13321}
13322
13323// Returns immediate value for setting the F32 denorm mode when using the
13324// S_DENORM_MODE instruction.
13325static SDValue getSPDenormModeValue(uint32_t SPDenormMode, SelectionDAG &DAG,
13326 const SIMachineFunctionInfo *Info,
13327 const GCNSubtarget *ST) {
13328 assert(ST->hasDenormModeInst() && "Requires S_DENORM_MODE");
13329 uint32_t DPDenormModeDefault = Info->getMode().fpDenormModeDPValue();
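 // The S_DENORM_MODE immediate packs the FP32 denorm setting in bits [1:0]
 // and the FP64/FP16 setting in bits [3:2]; only the FP32 field is changed
 // here, the FP64/FP16 field keeps its current value.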
13330 uint32_t Mode = SPDenormMode | (DPDenormModeDefault << 2);
13331 return DAG.getTargetConstant(Mode, SDLoc(), MVT::i32);
13332}
13333
13334SDValue SITargetLowering::LowerFDIV32(SDValue Op, SelectionDAG &DAG) const {
13335 if (SDValue FastLowered = lowerFastUnsafeFDIV(Op, DAG))
13336 return FastLowered;
13337
13338 // The selection matcher assumes anything with a chain selects to a
13339 // mayRaiseFPException machine instruction. Since we're introducing a chain
13340 // here, we need to explicitly report nofpexcept for the regular fdiv
13341 // lowering.
13342 SDNodeFlags Flags = Op->getFlags();
13343 Flags.setNoFPExcept(true);
13344
13345 SDLoc SL(Op);
13346 SDValue LHS = Op.getOperand(0);
13347 SDValue RHS = Op.getOperand(1);
13348
13349 const SDValue One = DAG.getConstantFP(1.0, SL, MVT::f32);
13350
13351 SDVTList ScaleVT = DAG.getVTList(MVT::f32, MVT::i1);
13352
13353 SDValue DenominatorScaled =
13354 DAG.getNode(AMDGPUISD::DIV_SCALE, SL, ScaleVT, {RHS, RHS, LHS}, Flags);
13355 SDValue NumeratorScaled =
13356 DAG.getNode(AMDGPUISD::DIV_SCALE, SL, ScaleVT, {LHS, RHS, LHS}, Flags);
13357
13358 // Denominator is scaled to not be denormal, so using rcp is ok.
13359 SDValue ApproxRcp =
13360 DAG.getNode(AMDGPUISD::RCP, SL, MVT::f32, DenominatorScaled, Flags);
13361 SDValue NegDivScale0 =
13362 DAG.getNode(ISD::FNEG, SL, MVT::f32, DenominatorScaled, Flags);
13363
13364 using namespace AMDGPU::Hwreg;
13365 const unsigned Denorm32Reg = HwregEncoding::encode(ID_MODE, 4, 2);
13366 const SDValue BitField = DAG.getTargetConstant(Denorm32Reg, SL, MVT::i32);
13367
13368 const MachineFunction &MF = DAG.getMachineFunction();
13369 const SIMachineFunctionInfo *Info = MF.getInfo<SIMachineFunctionInfo>();
13370 const DenormalMode DenormMode = Info->getMode().FP32Denormals;
13371
13372 const bool PreservesDenormals = DenormMode == DenormalMode::getIEEE();
13373 const bool HasDynamicDenormals =
13374 (DenormMode.Input == DenormalMode::Dynamic) ||
13375 (DenormMode.Output == DenormalMode::Dynamic);
13376
13377 SDValue SavedDenormMode;
13378
13379 if (!PreservesDenormals) {
13380 // Note we can't use the STRICT_FMA/STRICT_FMUL for the non-strict FDIV
13381 // lowering. The chain dependence is insufficient, and we need glue. We do
13382 // not need the glue variants in a strictfp function.
13383
13384 SDVTList BindParamVTs = DAG.getVTList(MVT::Other, MVT::Glue);
13385
13386 SDValue Glue = DAG.getEntryNode();
13387 if (HasDynamicDenormals) {
13388 SDNode *GetReg = DAG.getMachineNode(AMDGPU::S_GETREG_B32, SL,
13389 DAG.getVTList(MVT::i32, MVT::Glue),
13390 {BitField, Glue});
13391 SavedDenormMode = SDValue(GetReg, 0);
13392
13393 Glue = DAG.getMergeValues(
13394 {DAG.getEntryNode(), SDValue(GetReg, 0), SDValue(GetReg, 1)}, SL);
13395 }
13396
13397 SDNode *EnableDenorm;
13398 if (Subtarget->hasDenormModeInst()) {
13399 const SDValue EnableDenormValue =
13400 getSPDenormModeValue(FP_DENORM_FLUSH_NONE, DAG, Info, Subtarget);
13401
13402 EnableDenorm = DAG.getNode(AMDGPUISD::DENORM_MODE, SL, BindParamVTs, Glue,
13403 EnableDenormValue)
13404 .getNode();
13405 } else {
13406 const SDValue EnableDenormValue =
13407 DAG.getConstant(FP_DENORM_FLUSH_NONE, SL, MVT::i32);
13408 EnableDenorm = DAG.getMachineNode(AMDGPU::S_SETREG_B32, SL, BindParamVTs,
13409 {EnableDenormValue, BitField, Glue});
13410 }
13411
13412 SDValue Ops[3] = {NegDivScale0, SDValue(EnableDenorm, 0),
13413 SDValue(EnableDenorm, 1)};
13414
13415 NegDivScale0 = DAG.getMergeValues(Ops, SL);
13416 }
13417
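 // Refinement sequence: Fma0 = 1 - d*r is the error of the initial rcp, Fma1
 // is the corrected reciprocal, Mul is the first quotient estimate, Fma2 and
 // Fma4 are the residuals n - d*q, and Fma3 is the corrected quotient that
 // div_fmas/div_fixup finalize below.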
13418 SDValue Fma0 = getFPTernOp(DAG, ISD::FMA, SL, MVT::f32, NegDivScale0,
13419 ApproxRcp, One, NegDivScale0, Flags);
13420
13421 SDValue Fma1 = getFPTernOp(DAG, ISD::FMA, SL, MVT::f32, Fma0, ApproxRcp,
13422 ApproxRcp, Fma0, Flags);
13423
13424 SDValue Mul = getFPBinOp(DAG, ISD::FMUL, SL, MVT::f32, NumeratorScaled, Fma1,
13425 Fma1, Flags);
13426
13427 SDValue Fma2 = getFPTernOp(DAG, ISD::FMA, SL, MVT::f32, NegDivScale0, Mul,
13428 NumeratorScaled, Mul, Flags);
13429
13430 SDValue Fma3 =
13431 getFPTernOp(DAG, ISD::FMA, SL, MVT::f32, Fma2, Fma1, Mul, Fma2, Flags);
13432
13433 SDValue Fma4 = getFPTernOp(DAG, ISD::FMA, SL, MVT::f32, NegDivScale0, Fma3,
13434 NumeratorScaled, Fma3, Flags);
13435
13436 if (!PreservesDenormals) {
13437 SDNode *DisableDenorm;
13438 if (!HasDynamicDenormals && Subtarget->hasDenormModeInst()) {
13439 const SDValue DisableDenormValue = getSPDenormModeValue(
13440 FP_DENORM_FLUSH_IN_FLUSH_OUT, DAG, Info, Subtarget);
13441
13442 SDVTList BindParamVTs = DAG.getVTList(MVT::Other, MVT::Glue);
13443 DisableDenorm =
13444 DAG.getNode(AMDGPUISD::DENORM_MODE, SL, BindParamVTs,
13445 Fma4.getValue(1), DisableDenormValue, Fma4.getValue(2))
13446 .getNode();
13447 } else {
13448 assert(HasDynamicDenormals == (bool)SavedDenormMode);
13449 const SDValue DisableDenormValue =
13450 HasDynamicDenormals
13451 ? SavedDenormMode
13452 : DAG.getConstant(FP_DENORM_FLUSH_IN_FLUSH_OUT, SL, MVT::i32);
13453
13454 DisableDenorm = DAG.getMachineNode(
13455 AMDGPU::S_SETREG_B32, SL, MVT::Other,
13456 {DisableDenormValue, BitField, Fma4.getValue(1), Fma4.getValue(2)});
13457 }
13458
13459 SDValue OutputChain = DAG.getNode(ISD::TokenFactor, SL, MVT::Other,
13460 SDValue(DisableDenorm, 0), DAG.getRoot());
13461 DAG.setRoot(OutputChain);
13462 }
13463
13464 SDValue Scale = NumeratorScaled.getValue(1);
13465 SDValue Fmas = DAG.getNode(AMDGPUISD::DIV_FMAS, SL, MVT::f32,
13466 {Fma4, Fma1, Fma3, Scale}, Flags);
13467
13468 return DAG.getNode(AMDGPUISD::DIV_FIXUP, SL, MVT::f32, Fmas, RHS, LHS, Flags);
13469}
13470
13471SDValue SITargetLowering::LowerFDIV64(SDValue Op, SelectionDAG &DAG) const {
13472 if (SDValue FastLowered = lowerFastUnsafeFDIV64(Op, DAG))
13473 return FastLowered;
13474
13475 SDLoc SL(Op);
13476 SDValue X = Op.getOperand(0);
13477 SDValue Y = Op.getOperand(1);
13478
13479 const SDValue One = DAG.getConstantFP(1.0, SL, MVT::f64);
13480
13481 SDVTList ScaleVT = DAG.getVTList(MVT::f64, MVT::i1);
13482
13483 SDValue DivScale0 = DAG.getNode(AMDGPUISD::DIV_SCALE, SL, ScaleVT, Y, Y, X);
13484
13485 SDValue NegDivScale0 = DAG.getNode(ISD::FNEG, SL, MVT::f64, DivScale0);
13486
13487 SDValue Rcp = DAG.getNode(AMDGPUISD::RCP, SL, MVT::f64, DivScale0);
13488
13489 SDValue Fma0 = DAG.getNode(ISD::FMA, SL, MVT::f64, NegDivScale0, Rcp, One);
13490
13491 SDValue Fma1 = DAG.getNode(ISD::FMA, SL, MVT::f64, Rcp, Fma0, Rcp);
13492
13493 SDValue Fma2 = DAG.getNode(ISD::FMA, SL, MVT::f64, NegDivScale0, Fma1, One);
13494
13495 SDValue DivScale1 = DAG.getNode(AMDGPUISD::DIV_SCALE, SL, ScaleVT, X, Y, X);
13496
13497 SDValue Fma3 = DAG.getNode(ISD::FMA, SL, MVT::f64, Fma1, Fma2, Fma1);
13498 SDValue Mul = DAG.getNode(ISD::FMUL, SL, MVT::f64, DivScale1, Fma3);
13499
13500 SDValue Fma4 =
13501 DAG.getNode(ISD::FMA, SL, MVT::f64, NegDivScale0, Mul, DivScale1);
13502
13503 SDValue Scale;
13504
13505 if (!Subtarget->hasUsableDivScaleConditionOutput()) {
13506 // Workaround a hardware bug on SI where the condition output from div_scale
13507 // is not usable.
13508
13509 const SDValue Hi = DAG.getConstant(1, SL, MVT::i32);
13510
13511 // Figure out the scale to use for div_fmas.
13512 SDValue NumBC = DAG.getNode(ISD::BITCAST, SL, MVT::v2i32, X);
13513 SDValue DenBC = DAG.getNode(ISD::BITCAST, SL, MVT::v2i32, Y);
13514 SDValue Scale0BC = DAG.getNode(ISD::BITCAST, SL, MVT::v2i32, DivScale0);
13515 SDValue Scale1BC = DAG.getNode(ISD::BITCAST, SL, MVT::v2i32, DivScale1);
13516
13517 SDValue NumHi =
13518 DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SL, MVT::i32, NumBC, Hi);
13519 SDValue DenHi =
13520 DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SL, MVT::i32, DenBC, Hi);
13521
13522 SDValue Scale0Hi =
13523 DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SL, MVT::i32, Scale0BC, Hi);
13524 SDValue Scale1Hi =
13525 DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SL, MVT::i32, Scale1BC, Hi);
13526
13527 SDValue CmpDen = DAG.getSetCC(SL, MVT::i1, DenHi, Scale0Hi, ISD::SETEQ);
13528 SDValue CmpNum = DAG.getSetCC(SL, MVT::i1, NumHi, Scale1Hi, ISD::SETEQ);
13529 Scale = DAG.getNode(ISD::XOR, SL, MVT::i1, CmpNum, CmpDen);
13530 } else {
13531 Scale = DivScale1.getValue(1);
13532 }
13533
13534 SDValue Fmas =
13535 DAG.getNode(AMDGPUISD::DIV_FMAS, SL, MVT::f64, Fma4, Fma3, Mul, Scale);
13536
13537 return DAG.getNode(AMDGPUISD::DIV_FIXUP, SL, MVT::f64, Fmas, Y, X);
13538}
13539
13540SDValue SITargetLowering::LowerFDIV(SDValue Op, SelectionDAG &DAG) const {
13541 EVT VT = Op.getValueType();
13542
13543 if (VT == MVT::f32)
13544 return LowerFDIV32(Op, DAG);
13545
13546 if (VT == MVT::f64)
13547 return LowerFDIV64(Op, DAG);
13548
13549 if (VT == MVT::f16 || VT == MVT::bf16)
13550 return LowerFDIV16(Op, DAG);
13551
13552 llvm_unreachable("Unexpected type for fdiv");
13553}
13554
13555SDValue SITargetLowering::LowerFFREXP(SDValue Op, SelectionDAG &DAG) const {
13556 SDLoc dl(Op);
13557 SDValue Val = Op.getOperand(0);
13558 EVT VT = Val.getValueType();
13559 EVT ResultExpVT = Op->getValueType(1);
13560 EVT InstrExpVT = VT == MVT::f16 ? MVT::i16 : MVT::i32;
13561
13562 SDValue Mant = DAG.getNode(
13563 ISD::INTRINSIC_WO_CHAIN, dl, VT,
13564 DAG.getTargetConstant(Intrinsic::amdgcn_frexp_mant, dl, MVT::i32), Val);
13565
13566 SDValue Exp = DAG.getNode(
13567 ISD::INTRINSIC_WO_CHAIN, dl, InstrExpVT,
13568 DAG.getTargetConstant(Intrinsic::amdgcn_frexp_exp, dl, MVT::i32), Val);
13569
13570 if (Subtarget->hasFractBug()) {
13571 SDValue Fabs = DAG.getNode(ISD::FABS, dl, VT, Val);
13572 SDValue Inf =
13573 DAG.getConstantFP(APFloat::getInf(VT.getFltSemantics()), dl, VT);
13574
13575 SDValue IsFinite = DAG.getSetCC(dl, MVT::i1, Fabs, Inf, ISD::SETOLT);
13576 SDValue Zero = DAG.getConstant(0, dl, InstrExpVT);
13577 Exp = DAG.getNode(ISD::SELECT, dl, InstrExpVT, IsFinite, Exp, Zero);
13578 Mant = DAG.getNode(ISD::SELECT, dl, VT, IsFinite, Mant, Val);
13579 }
13580
13581 SDValue CastExp = DAG.getSExtOrTrunc(Exp, dl, ResultExpVT);
13582 return DAG.getMergeValues({Mant, CastExp}, dl);
13583}
13584
13585SDValue SITargetLowering::LowerSTORE(SDValue Op, SelectionDAG &DAG) const {
13586 SDLoc DL(Op);
13587 StoreSDNode *Store = cast<StoreSDNode>(Op);
13588 EVT VT = Store->getMemoryVT();
13589
13590 if (VT == MVT::i1) {
13591 return DAG.getTruncStore(
13592 Store->getChain(), DL,
13593 DAG.getSExtOrTrunc(Store->getValue(), DL, MVT::i32),
13594 Store->getBasePtr(), MVT::i1, Store->getMemOperand());
13595 }
13596
13597 assert(VT.isVector() &&
13598 Store->getValue().getValueType().getScalarType() == MVT::i32);
13599
13600 unsigned AS = Store->getAddressSpace();
13601 if (Subtarget->hasLDSMisalignedBugInWGPMode() &&
13602 AS == AMDGPUAS::FLAT_ADDRESS &&
13603 Store->getAlign().value() < VT.getStoreSize() &&
13604 VT.getSizeInBits() > 32) {
13605 return SplitVectorStore(Op, DAG);
13606 }
13607
13608 MachineFunction &MF = DAG.getMachineFunction();
13609 SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();
13610 // If there is a possibility that flat instructions access scratch memory,
13611 // then we need to use the same legalization rules we use for private.
13612 if (AS == AMDGPUAS::FLAT_ADDRESS &&
13613 !Subtarget->hasMultiDwordFlatScratchAddressing())
13614 AS = addressMayBeAccessedAsPrivate(Store->getMemOperand(), *MFI)
13615 ? AMDGPUAS::PRIVATE_ADDRESS
13616 : AMDGPUAS::GLOBAL_ADDRESS;
13617
13618 unsigned NumElements = VT.getVectorNumElements();
13619 if (AS == AMDGPUAS::GLOBAL_ADDRESS || AS == AMDGPUAS::FLAT_ADDRESS) {
13620 if (NumElements > 4)
13621 return SplitVectorStore(Op, DAG);
13622 // v3 stores not supported on SI.
13623 if (NumElements == 3 && !Subtarget->hasDwordx3LoadStores())
13624 return SplitVectorStore(Op, DAG);
13625
13626 if (!allowsMemoryAccessForAlignment(*DAG.getContext(), DAG.getDataLayout(),
13627 VT, *Store->getMemOperand()))
13628 return expandUnalignedStore(Store, DAG);
13629
13630 return SDValue();
13631 }
13632 if (AS == AMDGPUAS::PRIVATE_ADDRESS) {
13633 switch (Subtarget->getMaxPrivateElementSize()) {
13634 case 4:
13635 return scalarizeVectorStore(Store, DAG);
13636 case 8:
13637 if (NumElements > 2)
13638 return SplitVectorStore(Op, DAG);
13639 return SDValue();
13640 case 16:
13641 if (NumElements > 4 ||
13642 (NumElements == 3 && !Subtarget->hasFlatScratchEnabled()))
13643 return SplitVectorStore(Op, DAG);
13644 return SDValue();
13645 default:
13646 llvm_unreachable("unsupported private_element_size");
13647 }
13648 } else if (AS == AMDGPUAS::LOCAL_ADDRESS || AS == AMDGPUAS::REGION_ADDRESS) {
13649 unsigned Fast = 0;
13650 auto Flags = Store->getMemOperand()->getFlags();
13651 if (allowsMisalignedMemoryAccessesImpl(VT.getSizeInBits(), AS,
13652 Store->getAlign(), Flags, &Fast) &&
13653 Fast > 1)
13654 return SDValue();
13655
13656 if (VT.isVector())
13657 return SplitVectorStore(Op, DAG);
13658
13659 return expandUnalignedStore(Store, DAG);
13660 }
13661
13662 // Probably an invalid store. If so we'll end up emitting a selection error.
13663 return SDValue();
13664}
13665
13666// Avoid the full correct expansion for f32 sqrt when promoting from f16.
13667SDValue SITargetLowering::lowerFSQRTF16(SDValue Op, SelectionDAG &DAG) const {
13668 SDLoc SL(Op);
13669 assert(!Subtarget->has16BitInsts());
13670 SDNodeFlags Flags = Op->getFlags();
13671 SDValue Ext =
13672 DAG.getNode(ISD::FP_EXTEND, SL, MVT::f32, Op.getOperand(0), Flags);
13673
13674 SDValue SqrtID = DAG.getTargetConstant(Intrinsic::amdgcn_sqrt, SL, MVT::i32);
13675 SDValue Sqrt =
13676 DAG.getNode(ISD::INTRINSIC_WO_CHAIN, SL, MVT::f32, SqrtID, Ext, Flags);
13677
13678 return DAG.getNode(ISD::FP_ROUND, SL, MVT::f16, Sqrt,
13679 DAG.getTargetConstant(0, SL, MVT::i32), Flags);
13680}
13681
13682SDValue SITargetLowering::lowerFSQRTF32(SDValue Op, SelectionDAG &DAG) const {
13683 SDLoc DL(Op);
13684 SDNodeFlags Flags = Op->getFlags();
13685 MVT VT = Op.getValueType().getSimpleVT();
13686 const SDValue X = Op.getOperand(0);
13687
13688 if (allowApproxFunc(DAG, Flags)) {
13689 // Instruction is 1ulp but ignores denormals.
13690 return DAG.getNode(
13691 ISD::INTRINSIC_WO_CHAIN, DL, VT,
13692 DAG.getTargetConstant(Intrinsic::amdgcn_sqrt, DL, MVT::i32), X, Flags);
13693 }
13694
13695 SDValue ScaleThreshold = DAG.getConstantFP(0x1.0p-96f, DL, VT);
13696 SDValue NeedScale = DAG.getSetCC(DL, MVT::i1, X, ScaleThreshold, ISD::SETOLT);
13697
13698 SDValue ScaleUpFactor = DAG.getConstantFP(0x1.0p+32f, DL, VT);
13699
13700 SDValue ScaledX = DAG.getNode(ISD::FMUL, DL, VT, X, ScaleUpFactor, Flags);
13701
13702 SDValue SqrtX =
13703 DAG.getNode(ISD::SELECT, DL, VT, NeedScale, ScaledX, X, Flags);
13704
13705 SDValue SqrtS;
13706 if (needsDenormHandlingF32(DAG, X, Flags)) {
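 // The intrinsic result is then adjusted by at most one ulp: form the
 // neighbouring values one ulp below and above, and pick whichever candidate
 // leaves a residual x - s_adj*s of the right sign.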
13707 SDValue SqrtID =
13708 DAG.getTargetConstant(Intrinsic::amdgcn_sqrt, DL, MVT::i32);
13709 SqrtS = DAG.getNode(ISD::INTRINSIC_WO_CHAIN, DL, VT, SqrtID, SqrtX, Flags);
13710
13711 SDValue SqrtSAsInt = DAG.getNode(ISD::BITCAST, DL, MVT::i32, SqrtS);
13712 SDValue SqrtSNextDownInt =
13713 DAG.getNode(ISD::ADD, DL, MVT::i32, SqrtSAsInt,
13714 DAG.getAllOnesConstant(DL, MVT::i32));
13715 SDValue SqrtSNextDown = DAG.getNode(ISD::BITCAST, DL, VT, SqrtSNextDownInt);
13716
13717 SDValue NegSqrtSNextDown =
13718 DAG.getNode(ISD::FNEG, DL, VT, SqrtSNextDown, Flags);
13719
13720 SDValue SqrtVP =
13721 DAG.getNode(ISD::FMA, DL, VT, NegSqrtSNextDown, SqrtS, SqrtX, Flags);
13722
13723 SDValue SqrtSNextUpInt = DAG.getNode(ISD::ADD, DL, MVT::i32, SqrtSAsInt,
13724 DAG.getConstant(1, DL, MVT::i32));
13725 SDValue SqrtSNextUp = DAG.getNode(ISD::BITCAST, DL, VT, SqrtSNextUpInt);
13726
13727 SDValue NegSqrtSNextUp = DAG.getNode(ISD::FNEG, DL, VT, SqrtSNextUp, Flags);
13728 SDValue SqrtVS =
13729 DAG.getNode(ISD::FMA, DL, VT, NegSqrtSNextUp, SqrtS, SqrtX, Flags);
13730
13731 SDValue Zero = DAG.getConstantFP(0.0f, DL, VT);
13732 SDValue SqrtVPLE0 = DAG.getSetCC(DL, MVT::i1, SqrtVP, Zero, ISD::SETOLE);
13733
13734 SqrtS = DAG.getNode(ISD::SELECT, DL, VT, SqrtVPLE0, SqrtSNextDown, SqrtS,
13735 Flags);
13736
13737 SDValue SqrtVPVSGT0 = DAG.getSetCC(DL, MVT::i1, SqrtVS, Zero, ISD::SETOGT);
13738 SqrtS = DAG.getNode(ISD::SELECT, DL, VT, SqrtVPVSGT0, SqrtSNextUp, SqrtS,
13739 Flags);
13740 } else {
13741 SDValue SqrtR = DAG.getNode(AMDGPUISD::RSQ, DL, VT, SqrtX, Flags);
13742
13743 SqrtS = DAG.getNode(ISD::FMUL, DL, VT, SqrtX, SqrtR, Flags);
13744
13745 SDValue Half = DAG.getConstantFP(0.5f, DL, VT);
13746 SDValue SqrtH = DAG.getNode(ISD::FMUL, DL, VT, SqrtR, Half, Flags);
13747 SDValue NegSqrtH = DAG.getNode(ISD::FNEG, DL, VT, SqrtH, Flags);
13748
13749 SDValue SqrtE = DAG.getNode(ISD::FMA, DL, VT, NegSqrtH, SqrtS, Half, Flags);
13750 SqrtH = DAG.getNode(ISD::FMA, DL, VT, SqrtH, SqrtE, SqrtH, Flags);
13751 SqrtS = DAG.getNode(ISD::FMA, DL, VT, SqrtS, SqrtE, SqrtS, Flags);
13752
13753 SDValue NegSqrtS = DAG.getNode(ISD::FNEG, DL, VT, SqrtS, Flags);
13754 SDValue SqrtD =
13755 DAG.getNode(ISD::FMA, DL, VT, NegSqrtS, SqrtS, SqrtX, Flags);
13756 SqrtS = DAG.getNode(ISD::FMA, DL, VT, SqrtD, SqrtH, SqrtS, Flags);
13757 }
13758
13759 SDValue ScaleDownFactor = DAG.getConstantFP(0x1.0p-16f, DL, VT);
13760
13761 SDValue ScaledDown =
13762 DAG.getNode(ISD::FMUL, DL, VT, SqrtS, ScaleDownFactor, Flags);
13763
13764 SqrtS = DAG.getNode(ISD::SELECT, DL, VT, NeedScale, ScaledDown, SqrtS, Flags);
13765 SDValue IsZeroOrInf =
13766 DAG.getNode(ISD::IS_FPCLASS, DL, MVT::i1, SqrtX,
13767 DAG.getTargetConstant(fcZero | fcPosInf, DL, MVT::i32));
13768
13769 return DAG.getNode(ISD::SELECT, DL, VT, IsZeroOrInf, SqrtX, SqrtS, Flags);
13770}
13771
13772SDValue SITargetLowering::lowerFSQRTF64(SDValue Op, SelectionDAG &DAG) const {
13773 // For double type, the SQRT and RSQ instructions don't have the required
13774 // precision, so we apply Goldschmidt's algorithm to improve the result:
13775 //
13776 // y0 = rsq(x)
13777 // g0 = x * y0
13778 // h0 = 0.5 * y0
13779 //
13780 // r0 = 0.5 - h0 * g0
13781 // g1 = g0 * r0 + g0
13782 // h1 = h0 * r0 + h0
13783 //
13784 // r1 = 0.5 - h1 * g1 => d0 = x - g1 * g1
13785 // g2 = g1 * r1 + g1 g2 = d0 * h1 + g1
13786 // h2 = h1 * r1 + h1
13787 //
13788 // r2 = 0.5 - h2 * g2 => d1 = x - g2 * g2
13789 // g3 = g2 * r2 + g2 g3 = d1 * h1 + g2
13790 //
13791 // sqrt(x) = g3
13792
13793 SDNodeFlags Flags = Op->getFlags();
13794
13795 SDLoc DL(Op);
13796
13797 SDValue X = Op.getOperand(0);
13798 SDValue ZeroInt = DAG.getConstant(0, DL, MVT::i32);
13799
13800 SDValue SqrtX = X;
13801 SDValue Scaling;
13802 if (!Flags.hasApproximateFuncs()) {
13803 SDValue ScaleConstant = DAG.getConstantFP(0x1.0p-767, DL, MVT::f64);
13804 Scaling = DAG.getSetCC(DL, MVT::i1, X, ScaleConstant, ISD::SETOLT);
13805
13806 // Scale up input if it is too small.
13807 SDValue ScaleUpFactor = DAG.getConstant(256, DL, MVT::i32);
13808 SDValue ScaleUp =
13809 DAG.getNode(ISD::SELECT, DL, MVT::i32, Scaling, ScaleUpFactor, ZeroInt);
13810 SqrtX = DAG.getNode(ISD::FLDEXP, DL, MVT::f64, X, ScaleUp, Flags);
13811 }
13812
13813 SDValue SqrtY = DAG.getNode(AMDGPUISD::RSQ, DL, MVT::f64, SqrtX);
13814
13815 SDValue SqrtS0 = DAG.getNode(ISD::FMUL, DL, MVT::f64, SqrtX, SqrtY);
13816
13817 SDValue Half = DAG.getConstantFP(0.5, DL, MVT::f64);
13818 SDValue SqrtH0 = DAG.getNode(ISD::FMUL, DL, MVT::f64, SqrtY, Half);
13819
13820 SDValue NegSqrtH0 = DAG.getNode(ISD::FNEG, DL, MVT::f64, SqrtH0);
13821 SDValue SqrtR0 = DAG.getNode(ISD::FMA, DL, MVT::f64, NegSqrtH0, SqrtS0, Half);
13822
13823 SDValue SqrtH1 = DAG.getNode(ISD::FMA, DL, MVT::f64, SqrtH0, SqrtR0, SqrtH0);
13824
13825 SDValue SqrtS1 = DAG.getNode(ISD::FMA, DL, MVT::f64, SqrtS0, SqrtR0, SqrtS0);
13826
13827 SDValue NegSqrtS1 = DAG.getNode(ISD::FNEG, DL, MVT::f64, SqrtS1);
13828 SDValue SqrtD0 =
13829 DAG.getNode(ISD::FMA, DL, MVT::f64, NegSqrtS1, SqrtS1, SqrtX);
13830
13831 SDValue SqrtS2 = DAG.getNode(ISD::FMA, DL, MVT::f64, SqrtD0, SqrtH1, SqrtS1);
13832
13833 SDValue SqrtRet = SqrtS2;
13834 if (!Flags.hasApproximateFuncs()) {
13835 SDValue NegSqrtS2 = DAG.getNode(ISD::FNEG, DL, MVT::f64, SqrtS2);
13836 SDValue SqrtD1 =
13837 DAG.getNode(ISD::FMA, DL, MVT::f64, NegSqrtS2, SqrtS2, SqrtX);
13838
13839 SqrtRet = DAG.getNode(ISD::FMA, DL, MVT::f64, SqrtD1, SqrtH1, SqrtS2);
13840
13841 SDValue ScaleDownFactor = DAG.getSignedConstant(-128, DL, MVT::i32);
13842 SDValue ScaleDown = DAG.getNode(ISD::SELECT, DL, MVT::i32, Scaling,
13843 ScaleDownFactor, ZeroInt);
13844 SqrtRet = DAG.getNode(ISD::FLDEXP, DL, MVT::f64, SqrtRet, ScaleDown, Flags);
13845 }
13846
13847 // TODO: Check for DAZ and expand to subnormals
13848
13849 SDValue IsZeroOrInf;
13850 if (Flags.hasNoInfs()) {
13851 SDValue Zero = DAG.getConstantFP(0.0, DL, MVT::f64);
13852 IsZeroOrInf = DAG.getSetCC(DL, MVT::i1, SqrtX, Zero, ISD::SETOEQ);
13853 } else {
13854 IsZeroOrInf =
13855 DAG.getNode(ISD::IS_FPCLASS, DL, MVT::i1, SqrtX,
13856 DAG.getTargetConstant(fcZero | fcPosInf, DL, MVT::i32));
13857 }
13858
13859 // If x is +INF, +0, or -0, use its original value
13860 return DAG.getNode(ISD::SELECT, DL, MVT::f64, IsZeroOrInf, SqrtX, SqrtRet,
13861 Flags);
13862}
13863
13864SDValue SITargetLowering::LowerTrig(SDValue Op, SelectionDAG &DAG) const {
13865 SDLoc DL(Op);
13866 EVT VT = Op.getValueType();
13867 SDValue Arg = Op.getOperand(0);
13868 SDValue TrigVal;
13869
13870 // Propagate fast-math flags so that the multiply we introduce can be folded
13871 // if Arg is already the result of a multiply by constant.
13872 auto Flags = Op->getFlags();
13873
13874 // AMDGPUISD nodes of vector type must be unrolled here since
13875 // they will not be expanded elsewhere.
13876 auto UnrollIfVec = [&DAG](SDValue V) -> SDValue {
13877 if (!V.getValueType().isVector())
13878 return V;
13879
13880 return DAG.UnrollVectorOp(cast<SDNode>(V));
13881 };
13882
13883 SDValue OneOver2Pi = DAG.getConstantFP(0.5 * numbers::inv_pi, DL, VT);
13884
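 // The hardware sin/cos take their argument in units of revolutions, so the
 // input is scaled by 1/(2*pi) first; on subtargets with a reduced trig input
 // range, FRACT wraps the scaled value into [0, 1) before the trig op.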
13885 if (Subtarget->hasTrigReducedRange()) {
13886 SDValue MulVal = DAG.getNode(ISD::FMUL, DL, VT, Arg, OneOver2Pi, Flags);
13887 TrigVal = UnrollIfVec(DAG.getNode(AMDGPUISD::FRACT, DL, VT, MulVal, Flags));
13888 } else {
13889 TrigVal = DAG.getNode(ISD::FMUL, DL, VT, Arg, OneOver2Pi, Flags);
13890 }
13891
13892 switch (Op.getOpcode()) {
13893 case ISD::FCOS:
13894 TrigVal = DAG.getNode(AMDGPUISD::COS_HW, SDLoc(Op), VT, TrigVal, Flags);
13895 break;
13896 case ISD::FSIN:
13897 TrigVal = DAG.getNode(AMDGPUISD::SIN_HW, SDLoc(Op), VT, TrigVal, Flags);
13898 break;
13899 default:
13900 llvm_unreachable("Wrong trig opcode");
13901 }
13902
13903 return UnrollIfVec(TrigVal);
13904}
13905
13906SDValue SITargetLowering::LowerATOMIC_CMP_SWAP(SDValue Op,
13907 SelectionDAG &DAG) const {
13908 AtomicSDNode *AtomicNode = cast<AtomicSDNode>(Op);
13909 assert(AtomicNode->isCompareAndSwap());
13910 unsigned AS = AtomicNode->getAddressSpace();
13911
13912 // No custom lowering required for local address space
13913 if (!AMDGPU::isFlatGlobalAddrSpace(AS))
13914 return Op;
13915
13916 // Non-local address space requires custom lowering for atomic compare
13917 // and swap; cmp and swap should be in a v2i32 or v2i64 in case of _X2
13918 SDLoc DL(Op);
13919 SDValue ChainIn = Op.getOperand(0);
13920 SDValue Addr = Op.getOperand(1);
13921 SDValue Old = Op.getOperand(2);
13922 SDValue New = Op.getOperand(3);
13923 EVT VT = Op.getValueType();
13924 MVT SimpleVT = VT.getSimpleVT();
13925 MVT VecType = MVT::getVectorVT(SimpleVT, 2);
13926
13927 SDValue NewOld = DAG.getBuildVector(VecType, DL, {New, Old});
13928 SDValue Ops[] = {ChainIn, Addr, NewOld};
13929
13930 return DAG.getMemIntrinsicNode(AMDGPUISD::ATOMIC_CMP_SWAP, DL,
13931 Op->getVTList(), Ops, VT,
13932 AtomicNode->getMemOperand());
13933}
13934
13935//===----------------------------------------------------------------------===//
13936// Custom DAG optimizations
13937//===----------------------------------------------------------------------===//
13938
13939SDValue
13940SITargetLowering::performUCharToFloatCombine(SDNode *N,
13941 DAGCombinerInfo &DCI) const {
13942 EVT VT = N->getValueType(0);
13943 EVT ScalarVT = VT.getScalarType();
13944 if (ScalarVT != MVT::f32 && ScalarVT != MVT::f16)
13945 return SDValue();
13946
13947 SelectionDAG &DAG = DCI.DAG;
13948 SDLoc DL(N);
13949
13950 SDValue Src = N->getOperand(0);
13951 EVT SrcVT = Src.getValueType();
13952
13953 // TODO: We could try to match extracting the higher bytes, which would be
13954 // easier if i8 vectors weren't promoted to i32 vectors, particularly after
13955 // types are legalized. v4i8 -> v4f32 is probably the only case to worry
13956 // about in practice.
13957 if (DCI.isAfterLegalizeDAG() && SrcVT == MVT::i32) {
13958 if (DAG.MaskedValueIsZero(Src, APInt::getHighBitsSet(32, 24))) {
13959 SDValue Cvt = DAG.getNode(AMDGPUISD::CVT_F32_UBYTE0, DL, MVT::f32, Src);
13960 DCI.AddToWorklist(Cvt.getNode());
13961
13962 // For the f16 case, fold to a cast to f32 and then cast back to f16.
13963 if (ScalarVT != MVT::f32) {
13964 Cvt = DAG.getNode(ISD::FP_ROUND, DL, VT, Cvt,
13965 DAG.getTargetConstant(0, DL, MVT::i32));
13966 }
13967 return Cvt;
13968 }
13969 }
13970
13971 return SDValue();
13972}
13973
13974SDValue SITargetLowering::performFCopySignCombine(SDNode *N,
13975 DAGCombinerInfo &DCI) const {
13976 SDValue MagnitudeOp = N->getOperand(0);
13977 SDValue SignOp = N->getOperand(1);
13978
13979 // The generic combine for fcopysign + fp cast is too conservative with
13980 // vectors, and also gets confused by the splitting we will perform here, so
13981 // peek through FP casts.
13982 if (SignOp.getOpcode() == ISD::FP_EXTEND ||
13983 SignOp.getOpcode() == ISD::FP_ROUND)
13984 SignOp = SignOp.getOperand(0);
13985
13986 SelectionDAG &DAG = DCI.DAG;
13987 SDLoc DL(N);
13988 EVT SignVT = SignOp.getValueType();
13989
13990 // f64 fcopysign is really an f32 copysign on the high bits, so replace the
13991 // lower half with a copy.
13992 // fcopysign f64:x, _:y -> x.lo32, (fcopysign (f32 x.hi32), _:y)
13993 EVT MagVT = MagnitudeOp.getValueType();
13994
13995 unsigned NumElts = MagVT.isVector() ? MagVT.getVectorNumElements() : 1;
13996
13997 if (MagVT.getScalarType() == MVT::f64) {
13998 EVT F32VT = MagVT.isVector()
13999 ? EVT::getVectorVT(*DAG.getContext(), MVT::f32, 2 * NumElts)
14000 : MVT::v2f32;
14001
14002 SDValue MagAsVector = DAG.getNode(ISD::BITCAST, DL, F32VT, MagnitudeOp);
14003
14004 SmallVector<SDValue, 8> NewElts;
14005 for (unsigned I = 0; I != NumElts; ++I) {
14006 SDValue MagLo =
14007 DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::f32, MagAsVector,
14008 DAG.getConstant(2 * I, DL, MVT::i32));
14009 SDValue MagHi =
14010 DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::f32, MagAsVector,
14011 DAG.getConstant(2 * I + 1, DL, MVT::i32));
14012
14013 SDValue SignOpElt =
14014 MagVT.isVector()
14015 ? DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, SignVT.getScalarType(),
14016 SignOp, DAG.getConstant(I, DL, MVT::i32))
14017 : SignOp;
14018
14019 SDValue HiOp =
14020 DAG.getNode(ISD::FCOPYSIGN, DL, MVT::f32, MagHi, SignOpElt);
14021
14022 SDValue Vector =
14023 DAG.getNode(ISD::BUILD_VECTOR, DL, MVT::v2f32, MagLo, HiOp);
14024
14025 SDValue NewElt = DAG.getNode(ISD::BITCAST, DL, MVT::f64, Vector);
14026 NewElts.push_back(NewElt);
14027 }
14028
14029 if (NewElts.size() == 1)
14030 return NewElts[0];
14031
14032 return DAG.getNode(ISD::BUILD_VECTOR, DL, MagVT, NewElts);
14033 }
14034
14035 if (SignVT.getScalarType() != MVT::f64)
14036 return SDValue();
14037
14038 // Reduce width of sign operand, we only need the highest bit.
14039 //
14040 // fcopysign f64:x, f64:y ->
14041 // fcopysign f64:x, (extract_vector_elt (bitcast f64:y to v2f32), 1)
14042 // TODO: In some cases it might make sense to go all the way to f16.
14043
14044 EVT F32VT = MagVT.isVector()
14045 ? EVT::getVectorVT(*DAG.getContext(), MVT::f32, 2 * NumElts)
14046 : MVT::v2f32;
14047
14048 SDValue SignAsVector = DAG.getNode(ISD::BITCAST, DL, F32VT, SignOp);
14049
14050 SmallVector<SDValue, 8> F32Signs;
14051 for (unsigned I = 0; I != NumElts; ++I) {
14052 // Take sign from odd elements of cast vector
14053 SDValue SignAsF32 =
14054 DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::f32, SignAsVector,
14055 DAG.getConstant(2 * I + 1, DL, MVT::i32));
14056 F32Signs.push_back(SignAsF32);
14057 }
14058
14059 SDValue NewSign =
14060 NumElts == 1
14061 ? F32Signs.back()
14062 : DAG.getNode(ISD::BUILD_VECTOR, DL,
14063 EVT::getVectorVT(*DAG.getContext(), MVT::f32, NumElts),
14064 F32Signs);
14065
14066 return DAG.getNode(ISD::FCOPYSIGN, DL, N->getValueType(0), N->getOperand(0),
14067 NewSign);
14068}
14069
14070// (shl (add x, c1), c2) -> add (shl x, c2), (shl c1, c2)
14071// (shl (or x, c1), c2) -> add (shl x, c2), (shl c1, c2) iff x and c1 share no
14072// bits
14073
14074// This is a variant of
14075// (mul (add x, c1), c2) -> add (mul x, c2), (mul c1, c2),
14076//
14077// The normal DAG combiner will do this, but only if the add has one use, since
14078// otherwise it would increase the number of instructions.
14079//
14080// This prevents us from seeing a constant offset that can be folded into a
14081// memory instruction's addressing mode. If we know the resulting add offset of
14082// a pointer can be folded into an addressing offset, we can replace the pointer
14083// operand with the add of the new constant offset. This eliminates one of the uses,
14084// and may allow the remaining use to also be simplified.
14085//
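// For example, assuming the resulting offset is legal for the addressing mode:
//   (shl (add x, 8), 2) --> (add (shl x, 2), 32)
// so the 32 can later be folded into the memory instruction's offset field.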
14086SDValue SITargetLowering::performSHLPtrCombine(SDNode *N, unsigned AddrSpace,
14087 EVT MemVT,
14088 DAGCombinerInfo &DCI) const {
14089 SDValue N0 = N->getOperand(0);
14090 SDValue N1 = N->getOperand(1);
14091
14092 // We only do this to handle cases where it's profitable when there are
14093 // multiple uses of the add, so defer to the standard combine.
14094 if ((!N0->isAnyAdd() && N0.getOpcode() != ISD::OR) || N0->hasOneUse())
14095 return SDValue();
14096
14097 const ConstantSDNode *CN1 = dyn_cast<ConstantSDNode>(N1);
14098 if (!CN1)
14099 return SDValue();
14100
14101 const ConstantSDNode *CAdd = dyn_cast<ConstantSDNode>(N0.getOperand(1));
14102 if (!CAdd)
14103 return SDValue();
14104
14105 SelectionDAG &DAG = DCI.DAG;
14106
14107 if (N0->getOpcode() == ISD::OR &&
14108 !DAG.haveNoCommonBitsSet(N0.getOperand(0), N0.getOperand(1)))
14109 return SDValue();
14110
14111 // If the resulting offset is too large, we can't fold it into the
14112 // addressing mode offset.
14113 APInt Offset = CAdd->getAPIntValue() << CN1->getAPIntValue();
14114 Type *Ty = MemVT.getTypeForEVT(*DCI.DAG.getContext());
14115
14116 AddrMode AM;
14117 AM.HasBaseReg = true;
14118 AM.BaseOffs = Offset.getSExtValue();
14119 if (!isLegalAddressingMode(DCI.DAG.getDataLayout(), AM, Ty, AddrSpace))
14120 return SDValue();
14121
14122 SDLoc SL(N);
14123 EVT VT = N->getValueType(0);
14124
14125 SDValue ShlX = DAG.getNode(ISD::SHL, SL, VT, N0.getOperand(0), N1);
14126 SDValue COffset = DAG.getConstant(Offset, SL, VT);
14127
14128 SDNodeFlags Flags;
14129 Flags.setNoUnsignedWrap(
14130 N->getFlags().hasNoUnsignedWrap() &&
14131 (N0.getOpcode() == ISD::OR || N0->getFlags().hasNoUnsignedWrap()));
14132
14133 // Use ISD::ADD even if the original operation was ISD::PTRADD, since we can't
14134 // be sure that the new left operand is a proper base pointer.
14135 return DAG.getNode(ISD::ADD, SL, VT, ShlX, COffset, Flags);
14136}
14137
14138/// MemSDNode::getBasePtr() does not work for intrinsics, which needs to offset
14139/// by the chain and intrinsic ID. Theoretically we would also need to check the
14140/// specific intrinsic, but they all place the pointer operand first.
14141static unsigned getBasePtrIndex(const MemSDNode *N) {
14142 switch (N->getOpcode()) {
14143 case ISD::STORE:
14144 case ISD::INTRINSIC_W_CHAIN:
14145 case ISD::INTRINSIC_VOID:
14146 return 2;
14147 default:
14148 return 1;
14149 }
14150}
14151
14152SDValue SITargetLowering::performMemSDNodeCombine(MemSDNode *N,
14153 DAGCombinerInfo &DCI) const {
14154 SelectionDAG &DAG = DCI.DAG;
14155
14156 unsigned PtrIdx = getBasePtrIndex(N);
14157 SDValue Ptr = N->getOperand(PtrIdx);
14158
14159 // TODO: We could also do this for multiplies.
14160 if (Ptr.getOpcode() == ISD::SHL) {
14161 SDValue NewPtr = performSHLPtrCombine(Ptr.getNode(), N->getAddressSpace(),
14162 N->getMemoryVT(), DCI);
14163 if (NewPtr) {
14164 SmallVector<SDValue, 8> NewOps(N->ops());
14165
14166 NewOps[PtrIdx] = NewPtr;
14167 return SDValue(DAG.UpdateNodeOperands(N, NewOps), 0);
14168 }
14169 }
14170
14171 return SDValue();
14172}
14173
14174static bool bitOpWithConstantIsReducible(unsigned Opc, uint32_t Val) {
14175 return (Opc == ISD::AND && (Val == 0 || Val == 0xffffffff)) ||
14176 (Opc == ISD::OR && (Val == 0xffffffff || Val == 0)) ||
14177 (Opc == ISD::XOR && Val == 0);
14178}
14179
14180// Break up a 64-bit bitwise operation with a constant into two 32-bit and/or/xor. This
14181// will typically happen anyway for a VALU 64-bit and. This exposes other 32-bit
14182// integer combine opportunities since most 64-bit operations are decomposed
14183// this way. TODO: We won't want this for SALU, especially if it is an inline
14184// immediate.
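// For example, (and x:i64, 0x00000000ffffffff) is split into an AND of the low
// half with -1 and an AND of the high half with 0; both halves then fold to
// simple 32-bit values.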
14185SDValue SITargetLowering::splitBinaryBitConstantOp(
14186 DAGCombinerInfo &DCI, const SDLoc &SL, unsigned Opc, SDValue LHS,
14187 const ConstantSDNode *CRHS) const {
14188 uint64_t Val = CRHS->getZExtValue();
14189 uint32_t ValLo = Lo_32(Val);
14190 uint32_t ValHi = Hi_32(Val);
14191 const SIInstrInfo *TII = getSubtarget()->getInstrInfo();
14192
14193 if ((bitOpWithConstantIsReducible(Opc, ValLo) ||
14194 bitOpWithConstantIsReducible(Opc, ValHi) ||
14195 (CRHS->hasOneUse() && !TII->isInlineConstant(CRHS->getAPIntValue()))) {
14196 // We have 64-bit scalar and/or/xor, but do not have vector forms.
14197 if (Subtarget->has64BitLiterals() && CRHS->hasOneUse() &&
14198 !CRHS->user_begin()->isDivergent())
14199 return SDValue();
14200
14201 // If we need to materialize a 64-bit immediate, it will be split up later
14202 // anyway. Avoid creating the harder to understand 64-bit immediate
14203 // materialization.
14204 return splitBinaryBitConstantOpImpl(DCI, SL, Opc, LHS, ValLo, ValHi);
14205 }
14206
14207 return SDValue();
14208}
14209
14210bool llvm::isBoolSGPR(SDValue V) {
14211 if (V.getValueType() != MVT::i1)
14212 return false;
14213 switch (V.getOpcode()) {
14214 default:
14215 break;
14216 case ISD::SETCC:
14217 case ISD::IS_FPCLASS:
14218 case AMDGPUISD::FP_CLASS:
14219 return true;
14220 case ISD::AND:
14221 case ISD::OR:
14222 case ISD::XOR:
14223 return isBoolSGPR(V.getOperand(0)) && isBoolSGPR(V.getOperand(1));
14224 case ISD::SADDO:
14225 case ISD::UADDO:
14226 case ISD::SSUBO:
14227 case ISD::USUBO:
14228 case ISD::SMULO:
14229 case ISD::UMULO:
14230 return V.getResNo() == 1;
14231 case ISD::INTRINSIC_WO_CHAIN: {
14232 unsigned IntrinsicID = V.getConstantOperandVal(0);
14233 switch (IntrinsicID) {
14234 case Intrinsic::amdgcn_is_shared:
14235 case Intrinsic::amdgcn_is_private:
14236 return true;
14237 default:
14238 return false;
14239 }
14240
14241 return false;
14242 }
14243 }
14244 return false;
14245}
14246
14247// If a constant has all zeroes or all ones within each byte return it.
14248// Otherwise return 0.
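// For example, 0x00ff00ff is returned unchanged, while 0x00000f00 selects only
// part of a byte and therefore yields 0.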
14249static uint32_t getConstantPermuteMask(uint32_t C) {
14250 // 0xff for any zero byte in the mask
14251 uint32_t ZeroByteMask = 0;
14252 if (!(C & 0x000000ff))
14253 ZeroByteMask |= 0x000000ff;
14254 if (!(C & 0x0000ff00))
14255 ZeroByteMask |= 0x0000ff00;
14256 if (!(C & 0x00ff0000))
14257 ZeroByteMask |= 0x00ff0000;
14258 if (!(C & 0xff000000))
14259 ZeroByteMask |= 0xff000000;
14260 uint32_t NonZeroByteMask = ~ZeroByteMask; // 0xff for any non-zero byte
14261 if ((NonZeroByteMask & C) != NonZeroByteMask)
14262 return 0; // Partial bytes selected.
14263 return C;
14264}
14265
14266// Check if a node selects whole bytes from its operand 0 starting at a byte
14267// boundary while masking the rest. Returns select mask as in the v_perm_b32
14268// or -1 if not succeeded.
14269// Note byte select encoding:
14270// value 0-3 selects corresponding source byte;
14271// value 0xc selects zero;
14272// value 0xff selects 0xff.
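// For example, (shl x, 16) yields the select 0x01000c0c: the two high result
// bytes come from bytes 1 and 0 of the source and the two low bytes are zero.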
14273static uint32_t getPermuteMask(SDValue V) {
14274 assert(V.getValueSizeInBits() == 32);
14275
14276 if (V.getNumOperands() != 2)
14277 return ~0;
14278
14279 ConstantSDNode *N1 = dyn_cast<ConstantSDNode>(V.getOperand(1));
14280 if (!N1)
14281 return ~0;
14282
14283 uint32_t C = N1->getZExtValue();
14284
14285 switch (V.getOpcode()) {
14286 default:
14287 break;
14288 case ISD::AND:
14289 if (uint32_t ConstMask = getConstantPermuteMask(C))
14290 return (0x03020100 & ConstMask) | (0x0c0c0c0c & ~ConstMask);
14291 break;
14292
14293 case ISD::OR:
14294 if (uint32_t ConstMask = getConstantPermuteMask(C))
14295 return (0x03020100 & ~ConstMask) | ConstMask;
14296 break;
14297
14298 case ISD::SHL:
14299 if (C % 8)
14300 return ~0;
14301
14302 return uint32_t((0x030201000c0c0c0cull << C) >> 32);
14303
14304 case ISD::SRL:
14305 if (C % 8)
14306 return ~0;
14307
14308 return uint32_t(0x0c0c0c0c03020100ull >> C);
14309 }
14310
14311 return ~0;
14312}
14313
14314SDValue SITargetLowering::performAndCombine(SDNode *N,
14315 DAGCombinerInfo &DCI) const {
14316 if (DCI.isBeforeLegalize())
14317 return SDValue();
14318
14319 SelectionDAG &DAG = DCI.DAG;
14320 EVT VT = N->getValueType(0);
14321 SDValue LHS = N->getOperand(0);
14322 SDValue RHS = N->getOperand(1);
14323
14324 const ConstantSDNode *CRHS = dyn_cast<ConstantSDNode>(RHS);
14325 if (VT == MVT::i64 && CRHS) {
14326 if (SDValue Split =
14327 splitBinaryBitConstantOp(DCI, SDLoc(N), ISD::AND, LHS, CRHS))
14328 return Split;
14329 }
14330
14331 if (CRHS && VT == MVT::i32) {
14332 // and (srl x, c), mask => shl (bfe x, nb + c, mask >> nb), nb
14333 // nb = number of trailing zeroes in mask
14334 // It can be optimized out using SDWA for GFX8+ in the SDWA peephole pass,
14335 // given that we are selecting 8 or 16 bit fields starting at byte boundary.
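 // For example, masking (srl x, 4) with 0xff0 gives Bits = 8, NB = 4 and
 // Offset = 8, so it becomes (shl (bfe x, 8, 8), 4).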
14336 uint64_t Mask = CRHS->getZExtValue();
14337 unsigned Bits = llvm::popcount(Mask);
14338 if (getSubtarget()->hasSDWA() && LHS->getOpcode() == ISD::SRL &&
14339 (Bits == 8 || Bits == 16) && isShiftedMask_64(Mask) && !(Mask & 1)) {
14340 if (auto *CShift = dyn_cast<ConstantSDNode>(LHS->getOperand(1))) {
14341 unsigned Shift = CShift->getZExtValue();
14342 unsigned NB = CRHS->getAPIntValue().countr_zero();
14343 unsigned Offset = NB + Shift;
14344 if ((Offset & (Bits - 1)) == 0) { // Starts at a byte or word boundary.
14345 SDLoc SL(N);
14346 SDValue BFE =
14347 DAG.getNode(AMDGPUISD::BFE_U32, SL, MVT::i32, LHS->getOperand(0),
14348 DAG.getConstant(Offset, SL, MVT::i32),
14349 DAG.getConstant(Bits, SL, MVT::i32));
14350 EVT NarrowVT = EVT::getIntegerVT(*DAG.getContext(), Bits);
14351 SDValue Ext = DAG.getNode(ISD::AssertZext, SL, VT, BFE,
14352 DAG.getValueType(NarrowVT));
14353 SDValue Shl = DAG.getNode(ISD::SHL, SDLoc(LHS), VT, Ext,
14354 DAG.getConstant(NB, SDLoc(CRHS), MVT::i32));
14355 return Shl;
14356 }
14357 }
14358 }
14359
14360 // and (perm x, y, c1), c2 -> perm x, y, permute_mask(c1, c2)
14361 if (LHS.hasOneUse() && LHS.getOpcode() == AMDGPUISD::PERM &&
14362 isa<ConstantSDNode>(LHS.getOperand(2))) {
14363 uint32_t Sel = getConstantPermuteMask(Mask);
14364 if (!Sel)
14365 return SDValue();
14366
14367 // Select 0xc for all zero bytes
14368 Sel = (LHS.getConstantOperandVal(2) & Sel) | (~Sel & 0x0c0c0c0c);
14369 SDLoc DL(N);
14370 return DAG.getNode(AMDGPUISD::PERM, DL, MVT::i32, LHS.getOperand(0),
14371 LHS.getOperand(1), DAG.getConstant(Sel, DL, MVT::i32));
14372 }
14373 }
14374
14375 // (and (fcmp ord x, x), (fcmp une (fabs x), inf)) ->
14376 // fp_class x, ~(s_nan | q_nan | n_infinity | p_infinity)
14377 if (LHS.getOpcode() == ISD::SETCC && RHS.getOpcode() == ISD::SETCC) {
14378 ISD::CondCode LCC = cast<CondCodeSDNode>(LHS.getOperand(2))->get();
14379 ISD::CondCode RCC = cast<CondCodeSDNode>(RHS.getOperand(2))->get();
14380
14381 SDValue X = LHS.getOperand(0);
14382 SDValue Y = RHS.getOperand(0);
14383 if (Y.getOpcode() != ISD::FABS || Y.getOperand(0) != X ||
14384 !isTypeLegal(X.getValueType()))
14385 return SDValue();
14386
14387 if (LCC == ISD::SETO) {
14388 if (X != LHS.getOperand(1))
14389 return SDValue();
14390
14391 if (RCC == ISD::SETUNE) {
14392 const ConstantFPSDNode *C1 =
14393 dyn_cast<ConstantFPSDNode>(RHS.getOperand(1));
14394 if (!C1 || !C1->isInfinity() || C1->isNegative())
14395 return SDValue();
14396
14397 const uint32_t Mask = SIInstrFlags::N_NORMAL |
14398 SIInstrFlags::P_NORMAL | SIInstrFlags::N_ZERO |
14399 SIInstrFlags::P_ZERO | SIInstrFlags::N_SUBNORMAL |
14400 SIInstrFlags::P_SUBNORMAL;
14401
14402 static_assert(
14403 ((~(SIInstrFlags::S_NAN | SIInstrFlags::Q_NAN |
14404 SIInstrFlags::N_INFINITY | SIInstrFlags::P_INFINITY)) &
14405 0x3ff) == Mask,
14406 "mask not equal");
14406 "mask not equal");
14407
14408 SDLoc DL(N);
14409 return DAG.getNode(AMDGPUISD::FP_CLASS, DL, MVT::i1, X,
14410 DAG.getConstant(Mask, DL, MVT::i32));
14411 }
14412 }
14413 }
14414
14415 if (RHS.getOpcode() == ISD::SETCC && LHS.getOpcode() == AMDGPUISD::FP_CLASS)
14416 std::swap(LHS, RHS);
14417
14418 if (LHS.getOpcode() == ISD::SETCC && RHS.getOpcode() == AMDGPUISD::FP_CLASS &&
14419 RHS.hasOneUse()) {
14420 ISD::CondCode LCC = cast<CondCodeSDNode>(LHS.getOperand(2))->get();
14421 // and (fcmp seto), (fp_class x, mask) -> fp_class x, mask & ~(p_nan |
14422 // n_nan) and (fcmp setuo), (fp_class x, mask) -> fp_class x, mask & (p_nan
14423 // | n_nan)
14424 const ConstantSDNode *Mask = dyn_cast<ConstantSDNode>(RHS.getOperand(1));
14425 if ((LCC == ISD::SETO || LCC == ISD::SETUO) && Mask &&
14426 (RHS.getOperand(0) == LHS.getOperand(0) &&
14427 LHS.getOperand(0) == LHS.getOperand(1))) {
14428 const unsigned OrdMask = SIInstrFlags::S_NAN | SIInstrFlags::Q_NAN;
14429 unsigned NewMask = LCC == ISD::SETO ? Mask->getZExtValue() & ~OrdMask
14430 : Mask->getZExtValue() & OrdMask;
14431
14432 SDLoc DL(N);
14433 return DAG.getNode(AMDGPUISD::FP_CLASS, DL, MVT::i1, RHS.getOperand(0),
14434 DAG.getConstant(NewMask, DL, MVT::i32));
14435 }
14436 }
14437
14438 if (VT == MVT::i32 && (RHS.getOpcode() == ISD::SIGN_EXTEND ||
14439 LHS.getOpcode() == ISD::SIGN_EXTEND)) {
14440 // and x, (sext cc from i1) => select cc, x, 0
14441 if (RHS.getOpcode() != ISD::SIGN_EXTEND)
14442 std::swap(LHS, RHS);
14443 if (isBoolSGPR(RHS.getOperand(0)))
14444 return DAG.getSelect(SDLoc(N), MVT::i32, RHS.getOperand(0), LHS,
14445 DAG.getConstant(0, SDLoc(N), MVT::i32));
14446 }
14447
14448 // and (op x, c1), (op y, c2) -> perm x, y, permute_mask(c1, c2)
14449 const SIInstrInfo *TII = getSubtarget()->getInstrInfo();
14450 if (VT == MVT::i32 && LHS.hasOneUse() && RHS.hasOneUse() &&
14451 N->isDivergent() && TII->pseudoToMCOpcode(AMDGPU::V_PERM_B32_e64) != -1) {
14452 uint32_t LHSMask = getPermuteMask(LHS);
14453 uint32_t RHSMask = getPermuteMask(RHS);
14454 if (LHSMask != ~0u && RHSMask != ~0u) {
14455 // Canonicalize the expression in an attempt to have fewer unique masks
14456 // and therefore fewer registers used to hold the masks.
14457 if (LHSMask > RHSMask) {
14458 std::swap(LHSMask, RHSMask);
14459 std::swap(LHS, RHS);
14460 }
14461
14462 // Select 0xc for each lane used from the source operand: a zero byte has
14463 // 0xc in its mask, a 0xff byte has 0xff, and actual lanes use selectors 0-3.
14464 uint32_t LHSUsedLanes = ~(LHSMask & 0x0c0c0c0c) & 0x0c0c0c0c;
14465 uint32_t RHSUsedLanes = ~(RHSMask & 0x0c0c0c0c) & 0x0c0c0c0c;
14466
14467 // Check if we need to combine values from two sources within a byte.
14468 if (!(LHSUsedLanes & RHSUsedLanes) &&
14469 // If we select high and lower word keep it for SDWA.
14470 // TODO: teach SDWA to work with v_perm_b32 and remove the check.
14471 !(LHSUsedLanes == 0x0c0c0000 && RHSUsedLanes == 0x00000c0c)) {
14472 // Each byte in each mask is either a selector in the 0-3 range, or has
14473 // higher bits set: 0xff for a 0xff byte or 0x0c for a zero byte. If either
14474 // mask has 0x0c for a byte, the result byte must be 0x0c; otherwise the
14475 // mask that is not 0xff wins. ANDing both masks gives the correct result,
14476 // except that such bytes must be forced back to 0x0c.
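// For example, and (or x, 0x00ffffff), (or y, 0xff00ffff) has
// LHSMask == 0x03ffffff and RHSMask == 0xff02ffff; the used lanes do not
// overlap and the combined node is perm x, y, 0x0702ffff (byte 3 from x,
// byte 2 from y, bytes 0-1 forced to 0xff).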
14477 uint32_t Mask = LHSMask & RHSMask;
14478 for (unsigned I = 0; I < 32; I += 8) {
14479 uint32_t ByteSel = 0xff << I;
14480 if ((LHSMask & ByteSel) == 0x0c || (RHSMask & ByteSel) == 0x0c)
14481 Mask &= (0x0c << I) & 0xffffffff;
14482 }
14483
14484 // Add 4 to each active LHS lane. It will not affect any existing 0xff
14485 // or 0x0c.
14486 uint32_t Sel = Mask | (LHSUsedLanes & 0x04040404);
14487 SDLoc DL(N);
14488
14489 return DAG.getNode(AMDGPUISD::PERM, DL, MVT::i32, LHS.getOperand(0),
14490 RHS.getOperand(0),
14491 DAG.getConstant(Sel, DL, MVT::i32));
14492 }
14493 }
14494 }
14495
14496 return SDValue();
14497}
14498
14499// A key component of v_perm is a mapping between byte position of the src
14500// operands, and the byte position of the dest. To provide such, we need: 1. the
14501// node that provides x byte of the dest of the OR, and 2. the byte of the node
14502// used to provide that x byte. calculateByteProvider finds which node provides
14503// a certain byte of the dest of the OR, and calculateSrcByte takes that node,
14504// and finds an ultimate src and byte position. For example, the supported
14505// LoadCombine pattern for vector loads is as follows
14506// t1
14507// or
14508// / \
14509// t2 t3
14510// zext shl
14511// | | \
14512// t4 t5 16
14513// or anyext
14514// / \ |
14515// t6 t7 t8
14516// srl shl or
14517// / | / \ / \
14518// t9 t10 t11 t12 t13 t14
14519// trunc* 8 trunc* 8 and and
14520// | | / | | \
14521// t15 t16 t17 t18 t19 t20
14522// trunc* 255 srl -256
14523// | / \
14524// t15 t15 16
14525//
14526// *In this example, the truncs are from i32->i16
14527//
14528// calculateByteProvider would find t6, t7, t13, and t14 for bytes 0-3
14529// respectively. calculateSrcByte would find (given node) -> ultimate src &
14530// byte position: t6 -> t15 & 1, t7 -> t16 & 0, t13 -> t15 & 0, t14 -> t15 & 3.
14531// After finding the mapping, we can combine the tree into vperm t15, t16,
14532// 0x05000407
14533
14534// Find the source and byte position from a node.
14535// \p DestByte is the byte position of the dest of the or that the src
14536// ultimately provides. \p SrcIndex is the byte of the src that maps to this
14537// byte of the dest of the or. \p Depth tracks how many recursive iterations we
14538// performed.
14539static const std::optional<ByteProvider<SDValue>>
14540calculateSrcByte(const SDValue Op, uint64_t DestByte, uint64_t SrcIndex = 0,
14541 unsigned Depth = 0) {
14542 // We may need to recursively traverse a series of SRLs
14543 if (Depth >= 6)
14544 return std::nullopt;
14545
14546 if (Op.getValueSizeInBits() < 8)
14547 return std::nullopt;
14548
14549 if (Op.getValueType().isVector())
14550 return ByteProvider<SDValue>::getSrc(Op, DestByte, SrcIndex);
14551
14552 switch (Op->getOpcode()) {
14553 case ISD::TRUNCATE: {
14554 return calculateSrcByte(Op->getOperand(0), DestByte, SrcIndex, Depth + 1);
14555 }
14556
14557 case ISD::ANY_EXTEND:
14558 case ISD::SIGN_EXTEND:
14559 case ISD::ZERO_EXTEND:
14560 case ISD::SIGN_EXTEND_INREG: {
14561 SDValue NarrowOp = Op->getOperand(0);
14562 auto NarrowVT = NarrowOp.getValueType();
14563 if (Op->getOpcode() == ISD::SIGN_EXTEND_INREG) {
14564 auto *VTSign = cast<VTSDNode>(Op->getOperand(1));
14565 NarrowVT = VTSign->getVT();
14566 }
14567 if (!NarrowVT.isByteSized())
14568 return std::nullopt;
14569 uint64_t NarrowByteWidth = NarrowVT.getStoreSize();
14570
14571 if (SrcIndex >= NarrowByteWidth)
14572 return std::nullopt;
14573 return calculateSrcByte(Op->getOperand(0), DestByte, SrcIndex, Depth + 1);
14574 }
14575
14576 case ISD::SRA:
14577 case ISD::SRL: {
14578 auto *ShiftOp = dyn_cast<ConstantSDNode>(Op->getOperand(1));
14579 if (!ShiftOp)
14580 return std::nullopt;
14581
14582 uint64_t BitShift = ShiftOp->getZExtValue();
14583
14584 if (BitShift % 8 != 0)
14585 return std::nullopt;
14586
14587 SrcIndex += BitShift / 8;
14588
14589 return calculateSrcByte(Op->getOperand(0), DestByte, SrcIndex, Depth + 1);
14590 }
14591
14592 default: {
14593 return ByteProvider<SDValue>::getSrc(Op, DestByte, SrcIndex);
14594 }
14595 }
14596 llvm_unreachable("fully handled switch");
14597}
14598
14599// For a byte position in the result of an Or, traverse the tree and find the
14600// node (and the byte of the node) which ultimately provides this {Or,
14601// BytePosition}. \p Op is the operand we are currently examining. \p Index is
14602// the byte position of the Op that corresponds with the originally requested
14603// byte of the Or. \p Depth tracks how many recursive iterations we have
14604// performed. \p StartingIndex is the originally requested byte of the Or
14605static const std::optional<ByteProvider<SDValue>>
14606calculateByteProvider(const SDValue &Op, unsigned Index, unsigned Depth,
14607 unsigned StartingIndex = 0) {
14608 // Finding Src tree of RHS of or typically requires at least 1 additional
14609 // depth
14610 if (Depth > 6)
14611 return std::nullopt;
14612
14613 unsigned BitWidth = Op.getScalarValueSizeInBits();
14614 if (BitWidth % 8 != 0)
14615 return std::nullopt;
14616 if (Index > BitWidth / 8 - 1)
14617 return std::nullopt;
14618
14619 bool IsVec = Op.getValueType().isVector();
14620 switch (Op.getOpcode()) {
14621 case ISD::OR: {
14622 if (IsVec)
14623 return std::nullopt;
14624
14625 auto RHS = calculateByteProvider(Op.getOperand(1), Index, Depth + 1,
14626 StartingIndex);
14627 if (!RHS)
14628 return std::nullopt;
14629 auto LHS = calculateByteProvider(Op.getOperand(0), Index, Depth + 1,
14630 StartingIndex);
14631 if (!LHS)
14632 return std::nullopt;
14633 // A well formed Or will have two ByteProviders for each byte, one of which
14634 // is constant zero
14635 if (!LHS->isConstantZero() && !RHS->isConstantZero())
14636 return std::nullopt;
14637 if (!LHS || LHS->isConstantZero())
14638 return RHS;
14639 if (!RHS || RHS->isConstantZero())
14640 return LHS;
14641 return std::nullopt;
14642 }
14643
14644 case ISD::AND: {
14645 if (IsVec)
14646 return std::nullopt;
14647
14648 auto *BitMaskOp = dyn_cast<ConstantSDNode>(Op->getOperand(1));
14649 if (!BitMaskOp)
14650 return std::nullopt;
14651
14652 uint32_t BitMask = BitMaskOp->getZExtValue();
14653 // Bits we expect for our StartingIndex
14654 uint32_t IndexMask = 0xFF << (Index * 8);
14655
14656 if ((IndexMask & BitMask) != IndexMask) {
14657 // If the result of the and partially provides the byte, then it
14658 // is not well formatted
14659 if (IndexMask & BitMask)
14660 return std::nullopt;
14661 return ByteProvider<SDValue>::getConstantZero();
14662 }
14663
14664 return calculateSrcByte(Op->getOperand(0), StartingIndex, Index);
14665 }
14666
14667 case ISD::FSHR: {
14668 if (IsVec)
14669 return std::nullopt;
14670
14671 // fshr(X,Y,Z): (X << (BW - (Z % BW))) | (Y >> (Z % BW))
14672 auto *ShiftOp = dyn_cast<ConstantSDNode>(Op->getOperand(2));
14673 if (!ShiftOp || Op.getValueType().isVector())
14674 return std::nullopt;
14675
14676 uint64_t BitsProvided = Op.getValueSizeInBits();
14677 if (BitsProvided % 8 != 0)
14678 return std::nullopt;
14679
14680 uint64_t BitShift = ShiftOp->getAPIntValue().urem(BitsProvided);
14681 if (BitShift % 8)
14682 return std::nullopt;
14683
14684 uint64_t ConcatSizeInBytes = BitsProvided / 4;
14685 uint64_t ByteShift = BitShift / 8;
14686
14687 uint64_t NewIndex = (Index + ByteShift) % ConcatSizeInBytes;
14688 uint64_t BytesProvided = BitsProvided / 8;
14689 SDValue NextOp = Op.getOperand(NewIndex >= BytesProvided ? 0 : 1);
14690 NewIndex %= BytesProvided;
14691 return calculateByteProvider(NextOp, NewIndex, Depth + 1, StartingIndex);
14692 }
14693
14694 case ISD::SRA:
14695 case ISD::SRL: {
14696 if (IsVec)
14697 return std::nullopt;
14698
14699 auto *ShiftOp = dyn_cast<ConstantSDNode>(Op->getOperand(1));
14700 if (!ShiftOp)
14701 return std::nullopt;
14702
14703 uint64_t BitShift = ShiftOp->getZExtValue();
14704 if (BitShift % 8)
14705 return std::nullopt;
14706
14707 auto BitsProvided = Op.getScalarValueSizeInBits();
14708 if (BitsProvided % 8 != 0)
14709 return std::nullopt;
14710
14711 uint64_t BytesProvided = BitsProvided / 8;
14712 uint64_t ByteShift = BitShift / 8;
14713 // The dest of shift will have good [0 : (BytesProvided - ByteShift)] bytes.
14714 // If the byte we are trying to provide (as tracked by index) falls in this
14715 // range, then the SRL provides the byte. The byte of interest of the src of
14716 // the SRL is Index + ByteShift
14717 return BytesProvided - ByteShift > Index
14718 ? calculateSrcByte(Op->getOperand(0), StartingIndex,
14719 Index + ByteShift)
14720 : ByteProvider<SDValue>::getConstantZero();
14721 }
14722
14723 case ISD::SHL: {
14724 if (IsVec)
14725 return std::nullopt;
14726
14727 auto *ShiftOp = dyn_cast<ConstantSDNode>(Op->getOperand(1));
14728 if (!ShiftOp)
14729 return std::nullopt;
14730
14731 uint64_t BitShift = ShiftOp->getZExtValue();
14732 if (BitShift % 8 != 0)
14733 return std::nullopt;
14734 uint64_t ByteShift = BitShift / 8;
14735
14736 // If we are shifting by an amount greater than (or equal to)
14737 // the index we are trying to provide, then it provides 0s. If not,
14738 // then the byte is not definitively 0, and the corresponding byte
14739 // of interest is Index - ByteShift of the src.
14740 return Index < ByteShift
14741 ? ByteProvider<SDValue>::getConstantZero()
14742 : calculateByteProvider(Op.getOperand(0), Index - ByteShift,
14743 Depth + 1, StartingIndex);
14744 }
14745 case ISD::ANY_EXTEND:
14746 case ISD::SIGN_EXTEND:
14747 case ISD::ZERO_EXTEND:
14748 case ISD::SIGN_EXTEND_INREG:
14749 case ISD::AssertZext:
14750 case ISD::AssertSext: {
14751 if (IsVec)
14752 return std::nullopt;
14753
14754 SDValue NarrowOp = Op->getOperand(0);
14755 unsigned NarrowBitWidth = NarrowOp.getValueSizeInBits();
14756 if (Op->getOpcode() == ISD::SIGN_EXTEND_INREG ||
14757 Op->getOpcode() == ISD::AssertZext ||
14758 Op->getOpcode() == ISD::AssertSext) {
14759 auto *VTSign = cast<VTSDNode>(Op->getOperand(1));
14760 NarrowBitWidth = VTSign->getVT().getSizeInBits();
14761 }
14762 if (NarrowBitWidth % 8 != 0)
14763 return std::nullopt;
14764 uint64_t NarrowByteWidth = NarrowBitWidth / 8;
14765
14766 if (Index >= NarrowByteWidth)
14767 return Op.getOpcode() == ISD::ZERO_EXTEND
14768 ? std::optional<ByteProvider<SDValue>>(
14769 ByteProvider<SDValue>::getConstantZero())
14770 : std::nullopt;
14771 return calculateByteProvider(NarrowOp, Index, Depth + 1, StartingIndex);
14772 }
14773
14774 case ISD::TRUNCATE: {
14775 if (IsVec)
14776 return std::nullopt;
14777
14778 uint64_t NarrowByteWidth = BitWidth / 8;
14779
14780 if (NarrowByteWidth >= Index) {
14781 return calculateByteProvider(Op.getOperand(0), Index, Depth + 1,
14782 StartingIndex);
14783 }
14784
14785 return std::nullopt;
14786 }
14787
14788 case ISD::CopyFromReg: {
14789 if (BitWidth / 8 > Index)
14790 return calculateSrcByte(Op, StartingIndex, Index);
14791
14792 return std::nullopt;
14793 }
14794
14795 case ISD::LOAD: {
14796 auto *L = cast<LoadSDNode>(Op.getNode());
14797
14798 unsigned NarrowBitWidth = L->getMemoryVT().getSizeInBits();
14799 if (NarrowBitWidth % 8 != 0)
14800 return std::nullopt;
14801 uint64_t NarrowByteWidth = NarrowBitWidth / 8;
14802
14803 // If the width of the load does not reach the byte we are trying to provide
14804 // and it is not a ZEXTLOAD, then the load does not provide for the byte in
14805 // question
14806 if (Index >= NarrowByteWidth) {
14807 return L->getExtensionType() == ISD::ZEXTLOAD
14808 ? std::optional<ByteProvider<SDValue>>(
14809 ByteProvider<SDValue>::getConstantZero())
14810 : std::nullopt;
14811 }
14812
14813 if (NarrowByteWidth > Index) {
14814 return calculateSrcByte(Op, StartingIndex, Index);
14815 }
14816
14817 return std::nullopt;
14818 }
14819
14820 case ISD::BSWAP: {
14821 if (IsVec)
14822 return std::nullopt;
14823
14824 return calculateByteProvider(Op->getOperand(0), BitWidth / 8 - Index - 1,
14825 Depth + 1, StartingIndex);
14826 }
14827
14828 case ISD::EXTRACT_VECTOR_ELT: {
14829 auto *IdxOp = dyn_cast<ConstantSDNode>(Op->getOperand(1));
14830 if (!IdxOp)
14831 return std::nullopt;
14832 auto VecIdx = IdxOp->getZExtValue();
14833 auto ScalarSize = Op.getScalarValueSizeInBits();
14834 if (ScalarSize < 32)
14835 Index = ScalarSize == 8 ? VecIdx : VecIdx * 2 + Index;
14836 return calculateSrcByte(ScalarSize >= 32 ? Op : Op.getOperand(0),
14837 StartingIndex, Index);
14838 }
14839
14840 case AMDGPUISD::PERM: {
14841 if (IsVec)
14842 return std::nullopt;
14843
14844 auto *PermMask = dyn_cast<ConstantSDNode>(Op->getOperand(2));
14845 if (!PermMask)
14846 return std::nullopt;
14847
14848 auto IdxMask =
14849 (PermMask->getZExtValue() & (0xFF << (Index * 8))) >> (Index * 8);
14850 if (IdxMask > 0x07 && IdxMask != 0x0c)
14851 return std::nullopt;
14852
14853 auto NextOp = Op.getOperand(IdxMask > 0x03 ? 0 : 1);
14854 auto NextIndex = IdxMask > 0x03 ? IdxMask % 4 : IdxMask;
14855
14856 return IdxMask != 0x0c ? calculateSrcByte(NextOp, StartingIndex, NextIndex)
14857 : ByteProvider<SDValue>(
14858 ByteProvider<SDValue>::getConstantZero());
14859 }
14860
14861 default: {
14862 return std::nullopt;
14863 }
14864 }
14865
14866 llvm_unreachable("fully handled switch");
14867}
14868
14869// Returns true if \p Operand is a scalar extended from, or loaded as, 16 bits
14870static bool isExtendedFrom16Bits(SDValue &Operand) {
14871
14872 switch (Operand.getOpcode()) {
14873 case ISD::ANY_EXTEND:
14874 case ISD::SIGN_EXTEND:
14875 case ISD::ZERO_EXTEND: {
14876 auto OpVT = Operand.getOperand(0).getValueType();
14877 return !OpVT.isVector() && OpVT.getSizeInBits() == 16;
14878 }
14879 case ISD::LOAD: {
14880 LoadSDNode *L = cast<LoadSDNode>(Operand.getNode());
14881 auto ExtType = cast<LoadSDNode>(L)->getExtensionType();
14882 if (ExtType == ISD::ZEXTLOAD || ExtType == ISD::SEXTLOAD ||
14883 ExtType == ISD::EXTLOAD) {
14884 auto MemVT = L->getMemoryVT();
14885 return !MemVT.isVector() && MemVT.getSizeInBits() == 16;
14886 }
14887 return L->getMemoryVT().getSizeInBits() == 16;
14888 }
14889 default:
14890 return false;
14891 }
14892}
14893
14894// Returns true if the mask matches consecutive bytes, and the first byte
14895// begins at an even (16-bit aligned) byte offset from the 0th byte
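// For example, mask 0x0504 (bytes 4-5 of the concatenated sources) can be
// addressed as a single 16-bit word, while 0x0201 (bytes 1-2) cannot.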
14896static bool addresses16Bits(int Mask) {
14897 int Low8 = Mask & 0xff;
14898 int Hi8 = (Mask & 0xff00) >> 8;
14899
14900 assert(Low8 < 8 && Hi8 < 8);
14901 // Are the bytes contiguous in the order of increasing addresses.
14902 bool IsConsecutive = (Hi8 - Low8 == 1);
14903 // Is the first byte at a location that is aligned for 16 bit instructions.
14904 // A counter example is taking 2 consecutive bytes starting at the 8th bit.
14905 // In this case, we still need code to extract the 16 bit operand, so it
14906 // is better to use i8 v_perm
14907 bool Is16Aligned = !(Low8 % 2);
14908
14909 return IsConsecutive && Is16Aligned;
14910}
14911
14912// Do not lower into v_perm if the operands are actually 16 bit
14913// and the selected bits (based on PermMask) correspond with two
14914// easily addressable 16 bit operands.
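// For example, PermMask 0x05040100 takes the low 16 bits of OtherOp and the
// low 16 bits of Op as whole words, so a v_perm is not needed.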
14915static bool hasNon16BitAccesses(uint64_t PermMask, SDValue &Op,
14916 SDValue &OtherOp) {
14917 int Low16 = PermMask & 0xffff;
14918 int Hi16 = (PermMask & 0xffff0000) >> 16;
14919
14920 auto TempOp = peekThroughBitcasts(Op);
14921 auto TempOtherOp = peekThroughBitcasts(OtherOp);
14922
14923 auto OpIs16Bit =
14924 TempOtherOp.getValueSizeInBits() == 16 || isExtendedFrom16Bits(TempOp);
14925 if (!OpIs16Bit)
14926 return true;
14927
14928 auto OtherOpIs16Bit = TempOtherOp.getValueSizeInBits() == 16 ||
14929 isExtendedFrom16Bits(TempOtherOp);
14930 if (!OtherOpIs16Bit)
14931 return true;
14932
14933 // Do we cleanly address both 16-bit halves?
14934 return !addresses16Bits(Low16) || !addresses16Bits(Hi16);
14935}
14936
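// Extract the 32-bit dword at dword index \p DWordOffset from \p Src and
// return it as an i32.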
14937static SDValue getDWordFromOffset(SelectionDAG &DAG, SDLoc SL, SDValue Src,
14938 unsigned DWordOffset) {
14939 SDValue Ret;
14940
14941 auto TypeSize = Src.getValueSizeInBits().getFixedValue();
14942 // ByteProvider must be at least 8 bits
14943 assert(Src.getValueSizeInBits().isKnownMultipleOf(8));
14944
14945 if (TypeSize <= 32)
14946 return DAG.getBitcastedAnyExtOrTrunc(Src, SL, MVT::i32);
14947
14948 if (Src.getValueType().isVector()) {
14949 auto ScalarTySize = Src.getScalarValueSizeInBits();
14950 auto ScalarTy = Src.getValueType().getScalarType();
14951 if (ScalarTySize == 32) {
14952 return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SL, MVT::i32, Src,
14953 DAG.getConstant(DWordOffset, SL, MVT::i32));
14954 }
14955 if (ScalarTySize > 32) {
14956 Ret = DAG.getNode(
14957 ISD::EXTRACT_VECTOR_ELT, SL, ScalarTy, Src,
14958 DAG.getConstant(DWordOffset / (ScalarTySize / 32), SL, MVT::i32));
14959 auto ShiftVal = 32 * (DWordOffset % (ScalarTySize / 32));
14960 if (ShiftVal)
14961 Ret = DAG.getNode(ISD::SRL, SL, Ret.getValueType(), Ret,
14962 DAG.getConstant(ShiftVal, SL, MVT::i32));
14963 return DAG.getBitcastedAnyExtOrTrunc(Ret, SL, MVT::i32);
14964 }
14965
14966 assert(ScalarTySize < 32);
14967 auto NumElements = TypeSize / ScalarTySize;
14968 auto Trunc32Elements = (ScalarTySize * NumElements) / 32;
14969 auto NormalizedTrunc = Trunc32Elements * 32 / ScalarTySize;
14970 auto NumElementsIn32 = 32 / ScalarTySize;
14971 auto NumAvailElements = DWordOffset < Trunc32Elements
14972 ? NumElementsIn32
14973 : NumElements - NormalizedTrunc;
14974
14975 SmallVector<SDValue, 4> VecSrcs;
14976 DAG.ExtractVectorElements(Src, VecSrcs, DWordOffset * NumElementsIn32,
14977 NumAvailElements);
14978
14979 Ret = DAG.getBuildVector(
14980 MVT::getVectorVT(MVT::getIntegerVT(ScalarTySize), NumAvailElements), SL,
14981 VecSrcs);
14982 return Ret = DAG.getBitcastedAnyExtOrTrunc(Ret, SL, MVT::i32);
14983 }
14984
14985 /// Scalar Type
14986 auto ShiftVal = 32 * DWordOffset;
14987 Ret = DAG.getNode(ISD::SRL, SL, Src.getValueType(), Src,
14988 DAG.getConstant(ShiftVal, SL, MVT::i32));
14989 return DAG.getBitcastedAnyExtOrTrunc(Ret, SL, MVT::i32);
14990}
14991
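// Try to express the i32 value produced by \p N as a single v_perm of at most
// two 32-bit sources, using calculateByteProvider to find the origin of each
// of the four result bytes.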
14992static SDValue matchPERM(SDNode *N, TargetLowering::DAGCombinerInfo &DCI) {
14993 SelectionDAG &DAG = DCI.DAG;
14994 [[maybe_unused]] EVT VT = N->getValueType(0);
14995 SmallVector<ByteProvider<SDValue>, 8> PermNodes;
14996
14997 // VT is known to be MVT::i32, so we need to provide 4 bytes.
14998 assert(VT == MVT::i32);
14999 for (int i = 0; i < 4; i++) {
15000 // Find the ByteProvider that provides the ith byte of the result of OR
15001 std::optional<ByteProvider<SDValue>> P =
15002 calculateByteProvider(SDValue(N, 0), i, 0, /*StartingIndex = */ i);
15003 // TODO support constantZero
15004 if (!P || P->isConstantZero())
15005 return SDValue();
15006
15007 PermNodes.push_back(*P);
15008 }
15009 if (PermNodes.size() != 4)
15010 return SDValue();
15011
15012 std::pair<unsigned, unsigned> FirstSrc(0, PermNodes[0].SrcOffset / 4);
15013 std::optional<std::pair<unsigned, unsigned>> SecondSrc;
15014 uint64_t PermMask = 0x00000000;
15015 for (size_t i = 0; i < PermNodes.size(); i++) {
15016 auto PermOp = PermNodes[i];
15017 // Since the mask is applied to Src1:Src2, Src1 bytes must be offset
15018 // by sizeof(Src2) = 4
15019 int SrcByteAdjust = 4;
15020
15021 // If the Src uses a byte from a different DWORD, then it corresponds
15022 // with a different source
15023 if (!PermOp.hasSameSrc(PermNodes[FirstSrc.first]) ||
15024 ((PermOp.SrcOffset / 4) != FirstSrc.second)) {
15025 if (SecondSrc)
15026 if (!PermOp.hasSameSrc(PermNodes[SecondSrc->first]) ||
15027 ((PermOp.SrcOffset / 4) != SecondSrc->second))
15028 return SDValue();
15029
15030 // Set the index of the second distinct Src node
15031 SecondSrc = {i, PermNodes[i].SrcOffset / 4};
15032 assert(!(PermNodes[SecondSrc->first].Src->getValueSizeInBits() % 8));
15033 SrcByteAdjust = 0;
15034 }
15035 assert((PermOp.SrcOffset % 4) + SrcByteAdjust < 8);
15037 PermMask |= ((PermOp.SrcOffset % 4) + SrcByteAdjust) << (i * 8);
15038 }
15039 SDLoc DL(N);
15040 SDValue Op = *PermNodes[FirstSrc.first].Src;
15041 Op = getDWordFromOffset(DAG, DL, Op, FirstSrc.second);
15042 assert(Op.getValueSizeInBits() == 32);
15043
15044 // Check that we are not just extracting the bytes in order from an op
15045 if (!SecondSrc) {
15046 int Low16 = PermMask & 0xffff;
15047 int Hi16 = (PermMask & 0xffff0000) >> 16;
15048
15049 bool WellFormedLow = (Low16 == 0x0504) || (Low16 == 0x0100);
15050 bool WellFormedHi = (Hi16 == 0x0706) || (Hi16 == 0x0302);
15051
15052 // The perm op would really just produce Op. So combine into Op
15053 if (WellFormedLow && WellFormedHi)
15054 return DAG.getBitcast(MVT::getIntegerVT(32), Op);
15055 }
15056
15057 SDValue OtherOp = SecondSrc ? *PermNodes[SecondSrc->first].Src : Op;
15058
15059 if (SecondSrc) {
15060 OtherOp = getDWordFromOffset(DAG, DL, OtherOp, SecondSrc->second);
15061 assert(OtherOp.getValueSizeInBits() == 32);
15062 }
15063
15064 // Check that we haven't just recreated the same FSHR node.
15065 if (N->getOpcode() == ISD::FSHR &&
15066 (N->getOperand(0) == Op || N->getOperand(0) == OtherOp) &&
15067 (N->getOperand(1) == Op || N->getOperand(1) == OtherOp))
15068 return SDValue();
15069
15070 if (hasNon16BitAccesses(PermMask, Op, OtherOp)) {
15071
15072 assert(Op.getValueType().isByteSized() &&
15073 OtherOp.getValueType().isByteSized());
15074
15075 // If the ultimate src is less than 32 bits, then we will only be
15076 // using bytes 0: Op.getValueSizeInBytes() - 1 in the or.
15077 // CalculateByteProvider would not have returned Op as source if we
15078 // used a byte that is outside its ValueType. Thus, we are free to
15079 // ANY_EXTEND as the extended bits are don't-cares.
15080 Op = DAG.getBitcastedAnyExtOrTrunc(Op, DL, MVT::i32);
15081 OtherOp = DAG.getBitcastedAnyExtOrTrunc(OtherOp, DL, MVT::i32);
15082
15083 return DAG.getNode(AMDGPUISD::PERM, DL, MVT::i32, Op, OtherOp,
15084 DAG.getConstant(PermMask, DL, MVT::i32));
15085 }
15086 return SDValue();
15087}
15088
15089SDValue SITargetLowering::performOrCombine(SDNode *N,
15090 DAGCombinerInfo &DCI) const {
15091 SelectionDAG &DAG = DCI.DAG;
15092 SDValue LHS = N->getOperand(0);
15093 SDValue RHS = N->getOperand(1);
15094
15095 EVT VT = N->getValueType(0);
15096 if (VT == MVT::i1) {
15097 // or (fp_class x, c1), (fp_class x, c2) -> fp_class x, (c1 | c2)
15098 if (LHS.getOpcode() == AMDGPUISD::FP_CLASS &&
15099 RHS.getOpcode() == AMDGPUISD::FP_CLASS) {
15100 SDValue Src = LHS.getOperand(0);
15101 if (Src != RHS.getOperand(0))
15102 return SDValue();
15103
15104 const ConstantSDNode *CLHS = dyn_cast<ConstantSDNode>(LHS.getOperand(1));
15105 const ConstantSDNode *CRHS = dyn_cast<ConstantSDNode>(RHS.getOperand(1));
15106 if (!CLHS || !CRHS)
15107 return SDValue();
15108
15109 // Only 10 bits are used.
15110 static const uint32_t MaxMask = 0x3ff;
15111
15112 uint32_t NewMask =
15113 (CLHS->getZExtValue() | CRHS->getZExtValue()) & MaxMask;
15114 SDLoc DL(N);
15115 return DAG.getNode(AMDGPUISD::FP_CLASS, DL, MVT::i1, Src,
15116 DAG.getConstant(NewMask, DL, MVT::i32));
15117 }
15118
15119 return SDValue();
15120 }
15121
15122 // or (perm x, y, c1), c2 -> perm x, y, permute_mask(c1, c2)
15123 if (isa<ConstantSDNode>(RHS) && LHS.hasOneUse() &&
15124 LHS.getOpcode() == AMDGPUISD::PERM &&
15125 isa<ConstantSDNode>(LHS.getOperand(2))) {
15126 uint32_t Sel = getConstantPermuteMask(N->getConstantOperandVal(1));
15127 if (!Sel)
15128 return SDValue();
15129
15130 Sel |= LHS.getConstantOperandVal(2);
15131 SDLoc DL(N);
15132 return DAG.getNode(AMDGPUISD::PERM, DL, MVT::i32, LHS.getOperand(0),
15133 LHS.getOperand(1), DAG.getConstant(Sel, DL, MVT::i32));
15134 }
15135
15136 // or (op x, c1), (op y, c2) -> perm x, y, permute_mask(c1, c2)
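// For example, or (srl x, 8), (shl y, 24) has masks 0x0c030201 and 0x000c0c0c;
// after canonicalization this becomes perm y, x, 0x04030201, i.e. byte 3 of
// the result is byte 0 of y and bytes 2-0 are bytes 3-1 of x.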
15137 const SIInstrInfo *TII = getSubtarget()->getInstrInfo();
15138 if (VT == MVT::i32 && LHS.hasOneUse() && RHS.hasOneUse() &&
15139 N->isDivergent() && TII->pseudoToMCOpcode(AMDGPU::V_PERM_B32_e64) != -1) {
15140
15141 // If all the uses of an or need to extract the individual elements, do not
15142 // attempt to lower into v_perm
15143 auto usesCombinedOperand = [](SDNode *OrUse) {
15144 // If we have any non-vectorized use, then it is a candidate for v_perm
15145 if (OrUse->getOpcode() != ISD::BITCAST ||
15146 !OrUse->getValueType(0).isVector())
15147 return true;
15148
15149 // Otherwise check how the users of the vectorized bitcast use the value.
15150 for (auto *VUser : OrUse->users()) {
15151 if (!VUser->getValueType(0).isVector())
15152 return true;
15153
15154 // If the use of a vector is a store, then combining via a v_perm
15155 // is beneficial.
15156 // TODO -- whitelist more uses
15157 for (auto VectorwiseOp : {ISD::STORE, ISD::CopyToReg, ISD::CopyFromReg})
15158 if (VUser->getOpcode() == VectorwiseOp)
15159 return true;
15160 }
15161 return false;
15162 };
15163
15164 if (!any_of(N->users(), usesCombinedOperand))
15165 return SDValue();
15166
15167 uint32_t LHSMask = getPermuteMask(LHS);
15168 uint32_t RHSMask = getPermuteMask(RHS);
15169
15170 if (LHSMask != ~0u && RHSMask != ~0u) {
15171 // Canonicalize the expression in an attempt to have fewer unique masks
15172 // and therefore fewer registers used to hold the masks.
15173 if (LHSMask > RHSMask) {
15174 std::swap(LHSMask, RHSMask);
15175 std::swap(LHS, RHS);
15176 }
15177
15178 // Select 0xc for each lane used from the source operand: a zero byte has
15179 // 0xc in its mask, a 0xff byte has 0xff, and actual lanes use selectors 0-3.
15180 uint32_t LHSUsedLanes = ~(LHSMask & 0x0c0c0c0c) & 0x0c0c0c0c;
15181 uint32_t RHSUsedLanes = ~(RHSMask & 0x0c0c0c0c) & 0x0c0c0c0c;
15182
15183 // Check if we need to combine values from two sources within a byte.
15184 if (!(LHSUsedLanes & RHSUsedLanes) &&
15185 // If we select high and lower word keep it for SDWA.
15186 // TODO: teach SDWA to work with v_perm_b32 and remove the check.
15187 !(LHSUsedLanes == 0x0c0c0000 && RHSUsedLanes == 0x00000c0c)) {
15188 // Kill zero bytes selected by other mask. Zero value is 0xc.
15189 LHSMask &= ~RHSUsedLanes;
15190 RHSMask &= ~LHSUsedLanes;
15191 // Add 4 to each active LHS lane
15192 LHSMask |= LHSUsedLanes & 0x04040404;
15193 // Combine masks
15194 uint32_t Sel = LHSMask | RHSMask;
15195 SDLoc DL(N);
15196
15197 return DAG.getNode(AMDGPUISD::PERM, DL, MVT::i32, LHS.getOperand(0),
15198 RHS.getOperand(0),
15199 DAG.getConstant(Sel, DL, MVT::i32));
15200 }
15201 }
15202 if (LHSMask == ~0u || RHSMask == ~0u) {
15203 if (SDValue Perm = matchPERM(N, DCI))
15204 return Perm;
15205 }
15206 }
15207
15208 // Detect identity v2i32 OR and replace with identity source node.
15209 // Specifically an Or that has operands constructed from the same source node
15210 // via extract_vector_elt and build_vector. I.E.
15211 // v2i32 or(
15212 // v2i32 build_vector(
15213 // i32 extract_elt(%IdentitySrc, 0),
15214 // i32 0
15215 // ),
15216 // v2i32 build_vector(
15217 // i32 0,
15218 // i32 extract_elt(%IdentitySrc, 1)
15219 // ) )
15220 // =>
15221 // v2i32 %IdentitySrc
15222
15223 if (VT == MVT::v2i32 && LHS->getOpcode() == ISD::BUILD_VECTOR &&
15224 RHS->getOpcode() == ISD::BUILD_VECTOR) {
15225
15226 ConstantSDNode *LC = dyn_cast<ConstantSDNode>(LHS->getOperand(1));
15227 ConstantSDNode *RC = dyn_cast<ConstantSDNode>(RHS->getOperand(0));
15228
15229 // Test for and normalise build vectors.
15230 if (LC && RC && LC->getZExtValue() == 0 && RC->getZExtValue() == 0) {
15231
15232 // Get the extract_vector_element operands.
15233 SDValue LEVE = LHS->getOperand(0);
15234 SDValue REVE = RHS->getOperand(1);
15235
15236 if (LEVE->getOpcode() == ISD::EXTRACT_VECTOR_ELT &&
15237 REVE->getOpcode() == ISD::EXTRACT_VECTOR_ELT) {
15238 // Check that different elements from the same vector are
15239 // extracted.
15240 if (LEVE->getOperand(0) == REVE->getOperand(0) &&
15241 LEVE->getOperand(1) != REVE->getOperand(1)) {
15242 SDValue IdentitySrc = LEVE.getOperand(0);
15243 return IdentitySrc;
15244 }
15245 }
15246 }
15247 }
15248
15249 if (VT != MVT::i64 || DCI.isBeforeLegalizeOps())
15250 return SDValue();
15251
15252 // TODO: This could be a generic combine with a predicate for extracting the
15253 // high half of an integer being free.
15254
15255 // (or i64:x, (zero_extend i32:y)) ->
15256 // i64 (bitcast (v2i32 build_vector (or i32:y, lo_32(x)), hi_32(x)))
15257 if (LHS.getOpcode() == ISD::ZERO_EXTEND &&
15258 RHS.getOpcode() != ISD::ZERO_EXTEND)
15259 std::swap(LHS, RHS);
15260
15261 if (RHS.getOpcode() == ISD::ZERO_EXTEND) {
15262 SDValue ExtSrc = RHS.getOperand(0);
15263 EVT SrcVT = ExtSrc.getValueType();
15264 if (SrcVT == MVT::i32) {
15265 SDLoc SL(N);
15266 auto [LowLHS, HiBits] = split64BitValue(LHS, DAG);
15267 SDValue LowOr = DAG.getNode(ISD::OR, SL, MVT::i32, LowLHS, ExtSrc);
15268
15269 DCI.AddToWorklist(LowOr.getNode());
15270 DCI.AddToWorklist(HiBits.getNode());
15271
15272 SDValue Vec =
15273 DAG.getNode(ISD::BUILD_VECTOR, SL, MVT::v2i32, LowOr, HiBits);
15274 return DAG.getNode(ISD::BITCAST, SL, MVT::i64, Vec);
15275 }
15276 }
15277
15278 const ConstantSDNode *CRHS = dyn_cast<ConstantSDNode>(N->getOperand(1));
15279 if (CRHS) {
15280 if (SDValue Split = splitBinaryBitConstantOp(DCI, SDLoc(N), ISD::OR,
15281 N->getOperand(0), CRHS))
15282 return Split;
15283 }
15284
15285 return SDValue();
15286}
15287
15288SDValue SITargetLowering::performXorCombine(SDNode *N,
15289 DAGCombinerInfo &DCI) const {
15290 if (SDValue RV = reassociateScalarOps(N, DCI.DAG))
15291 return RV;
15292
15293 SDValue LHS = N->getOperand(0);
15294 SDValue RHS = N->getOperand(1);
15295
15296 const ConstantSDNode *CRHS = isConstOrConstSplat(RHS);
15297 SelectionDAG &DAG = DCI.DAG;
15298
15299 EVT VT = N->getValueType(0);
15300 if (CRHS && VT == MVT::i64) {
15301 if (SDValue Split =
15302 splitBinaryBitConstantOp(DCI, SDLoc(N), ISD::XOR, LHS, CRHS))
15303 return Split;
15304 }
15305
15306 // v2i32 (xor (vselect cc, x, y), K) ->
15307 // (v2i32 vselect cc, (xor x, K), (xor y, K)). This enables the xor to be
15308 // replaced with source modifiers when the select is lowered to CNDMASK.
15309 unsigned Opc = LHS.getOpcode();
15310 if (((Opc == ISD::VSELECT && VT == MVT::v2i32) ||
15311 (Opc == ISD::SELECT && VT == MVT::i64)) &&
15312 CRHS && CRHS->getAPIntValue().isSignMask()) {
15313 SDValue CC = LHS->getOperand(0);
15314 SDValue TRUE = LHS->getOperand(1);
15315 SDValue FALSE = LHS->getOperand(2);
15316 SDValue XTrue = DAG.getNode(ISD::XOR, SDLoc(N), VT, TRUE, RHS);
15317 SDValue XFalse = DAG.getNode(ISD::XOR, SDLoc(N), VT, FALSE, RHS);
15318 SDValue XSelect =
15319 DAG.getNode(ISD::VSELECT, SDLoc(N), VT, CC, XTrue, XFalse);
15320 return XSelect;
15321 }
15322
15323 // Make sure to apply the 64-bit constant splitting fold before trying to fold
15324 // fneg-like xors into 64-bit select.
15325 if (LHS.getOpcode() == ISD::SELECT && VT == MVT::i32) {
15326 // This looks like an fneg, try to fold as a source modifier.
15327 if (CRHS && CRHS->getAPIntValue().isSignMask() &&
15328 shouldFoldFNegIntoSrc(N, LHS)) {
15329 // xor (select c, a, b), 0x80000000 ->
15330 // bitcast (select c, (fneg (bitcast a)), (fneg (bitcast b)))
15331 SDLoc DL(N);
15332 SDValue CastLHS =
15333 DAG.getNode(ISD::BITCAST, DL, MVT::f32, LHS->getOperand(1));
15334 SDValue CastRHS =
15335 DAG.getNode(ISD::BITCAST, DL, MVT::f32, LHS->getOperand(2));
15336 SDValue FNegLHS = DAG.getNode(ISD::FNEG, DL, MVT::f32, CastLHS);
15337 SDValue FNegRHS = DAG.getNode(ISD::FNEG, DL, MVT::f32, CastRHS);
15338 SDValue NewSelect = DAG.getNode(ISD::SELECT, DL, MVT::f32,
15339 LHS->getOperand(0), FNegLHS, FNegRHS);
15340 return DAG.getNode(ISD::BITCAST, DL, VT, NewSelect);
15341 }
15342 }
15343
15344 return SDValue();
15345}
15346
15347SDValue
15348SITargetLowering::performZeroOrAnyExtendCombine(SDNode *N,
15349 DAGCombinerInfo &DCI) const {
15350 if (!Subtarget->has16BitInsts() ||
15351 DCI.getDAGCombineLevel() < AfterLegalizeTypes)
15352 return SDValue();
15353
15354 EVT VT = N->getValueType(0);
15355 if (VT != MVT::i32)
15356 return SDValue();
15357
15358 SDValue Src = N->getOperand(0);
15359 if (Src.getValueType() != MVT::i16)
15360 return SDValue();
15361
15362 if (!Src->hasOneUse())
15363 return SDValue();
15364
15365 // TODO: We bail out below if SrcOffset is not in the first dword (>= 4). It's
15366 // possible we're missing out on some combine opportunities, but we'd need to
15367 // weigh the cost of extracting the byte from the upper dwords.
15368
15369 std::optional<ByteProvider<SDValue>> BP0 =
15370 calculateByteProvider(SDValue(N, 0), 0, 0, 0);
15371 if (!BP0 || BP0->SrcOffset >= 4 || !BP0->Src)
15372 return SDValue();
15373 SDValue V0 = *BP0->Src;
15374
15375 std::optional<ByteProvider<SDValue>> BP1 =
15376 calculateByteProvider(SDValue(N, 0), 1, 0, 1);
15377 if (!BP1 || BP1->SrcOffset >= 4 || !BP1->Src)
15378 return SDValue();
15379
15380 SDValue V1 = *BP1->Src;
15381
15382 if (V0 == V1)
15383 return SDValue();
15384
15385 SelectionDAG &DAG = DCI.DAG;
15386 SDLoc DL(N);
15387 uint32_t PermMask = 0x0c0c0c0c;
15388 if (V0) {
15389 V0 = DAG.getBitcastedAnyExtOrTrunc(V0, DL, MVT::i32);
15390 PermMask = (PermMask & ~0xFF) | (BP0->SrcOffset + 4);
15391 }
15392
15393 if (V1) {
15394 V1 = DAG.getBitcastedAnyExtOrTrunc(V1, DL, MVT::i32);
15395 PermMask = (PermMask & ~(0xFF << 8)) | (BP1->SrcOffset << 8);
15396 }
15397
15398 return DAG.getNode(AMDGPUISD::PERM, DL, MVT::i32, V0, V1,
15399 DAG.getConstant(PermMask, DL, MVT::i32));
15400}
15401
15402SDValue
15403SITargetLowering::performSignExtendInRegCombine(SDNode *N,
15404 DAGCombinerInfo &DCI) const {
15405 SDValue Src = N->getOperand(0);
15406 auto *VTSign = cast<VTSDNode>(N->getOperand(1));
15407
15408 // Combine s_buffer_load_u8 or s_buffer_load_u16 with sext and replace them
15409 // with s_buffer_load_i8 and s_buffer_load_i16 respectively.
15410 if (((Src.getOpcode() == AMDGPUISD::SBUFFER_LOAD_UBYTE &&
15411 VTSign->getVT() == MVT::i8) ||
15412 (Src.getOpcode() == AMDGPUISD::SBUFFER_LOAD_USHORT &&
15413 VTSign->getVT() == MVT::i16))) {
15414 assert(Subtarget->hasScalarSubwordLoads() &&
15415 "s_buffer_load_{u8, i8} are supported "
15416 "in GFX12 (or newer) architectures.");
15417 EVT VT = Src.getValueType();
15418 unsigned Opc = (Src.getOpcode() == AMDGPUISD::SBUFFER_LOAD_UBYTE)
15419 ? AMDGPUISD::SBUFFER_LOAD_BYTE
15420 : AMDGPUISD::SBUFFER_LOAD_SHORT;
15421 SDLoc DL(N);
15422 SDVTList ResList = DCI.DAG.getVTList(MVT::i32);
15423 SDValue Ops[] = {
15424 Src.getOperand(0), // source register
15425 Src.getOperand(1), // offset
15426 Src.getOperand(2) // cachePolicy
15427 };
15428 auto *M = cast<MemSDNode>(Src);
15429 SDValue BufferLoad = DCI.DAG.getMemIntrinsicNode(
15430 Opc, DL, ResList, Ops, M->getMemoryVT(), M->getMemOperand());
15431 SDValue LoadVal = DCI.DAG.getNode(ISD::TRUNCATE, DL, VT, BufferLoad);
15432 return LoadVal;
15433 }
15434 if (((Src.getOpcode() == AMDGPUISD::BUFFER_LOAD_UBYTE &&
15435 VTSign->getVT() == MVT::i8) ||
15436 (Src.getOpcode() == AMDGPUISD::BUFFER_LOAD_USHORT &&
15437 VTSign->getVT() == MVT::i16)) &&
15438 Src.hasOneUse()) {
15439 auto *M = cast<MemSDNode>(Src);
15440 SDValue Ops[] = {Src.getOperand(0), // Chain
15441 Src.getOperand(1), // rsrc
15442 Src.getOperand(2), // vindex
15443 Src.getOperand(3), // voffset
15444 Src.getOperand(4), // soffset
15445 Src.getOperand(5), // offset
15446 Src.getOperand(6), Src.getOperand(7)};
15447 // replace with BUFFER_LOAD_BYTE/SHORT
15448 SDVTList ResList =
15449 DCI.DAG.getVTList(MVT::i32, Src.getOperand(0).getValueType());
15450 unsigned Opc = (Src.getOpcode() == AMDGPUISD::BUFFER_LOAD_UBYTE)
15451 ? AMDGPUISD::BUFFER_LOAD_BYTE
15452 : AMDGPUISD::BUFFER_LOAD_SHORT;
15453 SDValue BufferLoadSignExt = DCI.DAG.getMemIntrinsicNode(
15454 Opc, SDLoc(N), ResList, Ops, M->getMemoryVT(), M->getMemOperand());
15455 return DCI.DAG.getMergeValues(
15456 {BufferLoadSignExt, BufferLoadSignExt.getValue(1)}, SDLoc(N));
15457 }
15458 return SDValue();
15459}
15460
15461SDValue SITargetLowering::performClassCombine(SDNode *N,
15462 DAGCombinerInfo &DCI) const {
15463 SelectionDAG &DAG = DCI.DAG;
15464 SDValue Mask = N->getOperand(1);
15465
15466 // fp_class x, 0 -> false
15467 if (isNullConstant(Mask))
15468 return DAG.getConstant(0, SDLoc(N), MVT::i1);
15469
15470 if (N->getOperand(0).isUndef())
15471 return DAG.getUNDEF(MVT::i1);
15472
15473 return SDValue();
15474}
15475
15476SDValue SITargetLowering::performRcpCombine(SDNode *N,
15477 DAGCombinerInfo &DCI) const {
15478 EVT VT = N->getValueType(0);
15479 SDValue N0 = N->getOperand(0);
15480
15481 if (N0.isUndef()) {
15482 return DCI.DAG.getConstantFP(APFloat::getQNaN(VT.getFltSemantics()),
15483 SDLoc(N), VT);
15484 }
15485
15486 // TODO: Could handle f32 + amdgcn.sqrt but probably never reaches here.
15487 if ((VT == MVT::f16 && N0.getOpcode() == ISD::FSQRT) &&
15488 N->getFlags().hasAllowContract() && N0->getFlags().hasAllowContract()) {
15489 return DCI.DAG.getNode(AMDGPUISD::RSQ, SDLoc(N), VT, N0.getOperand(0),
15490 N->getFlags());
15491 }
15492
15493 return AMDGPUTargetLowering::performRcpCombine(N, DCI);
15494}
15495
15496bool SITargetLowering::isCanonicalized(SelectionDAG &DAG, SDValue Op,
15497 SDNodeFlags UserFlags,
15498 unsigned MaxDepth) const {
15499 unsigned Opcode = Op.getOpcode();
15500 if (Opcode == ISD::FCANONICALIZE)
15501 return true;
15502
15503 if (auto *CFP = dyn_cast<ConstantFPSDNode>(Op)) {
15504 const auto &F = CFP->getValueAPF();
15505 if (F.isNaN() && F.isSignaling())
15506 return false;
15507 if (!F.isDenormal())
15508 return true;
15509
15510 DenormalMode Mode =
15511 DAG.getMachineFunction().getDenormalMode(F.getSemantics());
15512 return Mode == DenormalMode::getIEEE();
15513 }
15514
15515 // If source is a result of another standard FP operation it is already in
15516 // canonical form.
15517 if (MaxDepth == 0)
15518 return false;
15519
15520 switch (Opcode) {
15521 // These will flush denorms if required.
15522 case ISD::FADD:
15523 case ISD::FSUB:
15524 case ISD::FMUL:
15525 case ISD::FCEIL:
15526 case ISD::FFLOOR:
15527 case ISD::FMA:
15528 case ISD::FMAD:
15529 case ISD::FSQRT:
15530 case ISD::FDIV:
15531 case ISD::FREM:
15532 case ISD::FP_ROUND:
15533 case ISD::FP_EXTEND:
15534 case ISD::FP16_TO_FP:
15535 case ISD::FP_TO_FP16:
15536 case ISD::BF16_TO_FP:
15537 case ISD::FP_TO_BF16:
15538 case ISD::FLDEXP:
15539 case AMDGPUISD::FMUL_LEGACY:
15540 case AMDGPUISD::FMAD_FTZ:
15541 case AMDGPUISD::RCP:
15542 case AMDGPUISD::RSQ:
15543 case AMDGPUISD::RSQ_CLAMP:
15544 case AMDGPUISD::RCP_LEGACY:
15545 case AMDGPUISD::RCP_IFLAG:
15546 case AMDGPUISD::LOG:
15547 case AMDGPUISD::EXP:
15548 case AMDGPUISD::DIV_SCALE:
15549 case AMDGPUISD::DIV_FMAS:
15550 case AMDGPUISD::DIV_FIXUP:
15551 case AMDGPUISD::FRACT:
15552 case AMDGPUISD::CVT_PKRTZ_F16_F32:
15553 case AMDGPUISD::CVT_F32_UBYTE0:
15554 case AMDGPUISD::CVT_F32_UBYTE1:
15555 case AMDGPUISD::CVT_F32_UBYTE2:
15556 case AMDGPUISD::CVT_F32_UBYTE3:
15557 case AMDGPUISD::FP_TO_FP16:
15558 case AMDGPUISD::SIN_HW:
15559 case AMDGPUISD::COS_HW:
15560 return true;
15561
15562 // It can/will be lowered or combined as a bit operation.
15563 // Need to check their input recursively to handle.
15564 case ISD::FNEG:
15565 case ISD::FABS:
15566 case ISD::FCOPYSIGN:
15567 return isCanonicalized(DAG, Op.getOperand(0), MaxDepth - 1);
15568
15569 case ISD::AND:
15570 if (Op.getValueType() == MVT::i32) {
15571 // Be careful as we only know it is a bitcast floating point type. It
15572 // could be f32, v2f16, we have no way of knowing. Luckily the constant
15573 // value that we optimize for, which comes up in fp32 to bf16 conversions,
15574 // is valid to optimize for all types.
15575 if (auto *RHS = dyn_cast<ConstantSDNode>(Op.getOperand(1))) {
15576 if (RHS->getZExtValue() == 0xffff0000) {
15577 return isCanonicalized(DAG, Op.getOperand(0), MaxDepth - 1);
15578 }
15579 }
15580 }
15581 break;
15582
15583 case ISD::FSIN:
15584 case ISD::FCOS:
15585 case ISD::FSINCOS:
15586 return Op.getValueType().getScalarType() != MVT::f16;
15587
15588 case ISD::FMINNUM:
15589 case ISD::FMAXNUM:
15590 case ISD::FMINNUM_IEEE:
15591 case ISD::FMAXNUM_IEEE:
15592 case ISD::FMINIMUM:
15593 case ISD::FMAXIMUM:
15594 case ISD::FMINIMUMNUM:
15595 case ISD::FMAXIMUMNUM:
15596 case AMDGPUISD::CLAMP:
15597 case AMDGPUISD::FMED3:
15598 case AMDGPUISD::FMAX3:
15599 case AMDGPUISD::FMIN3:
15600 case AMDGPUISD::FMAXIMUM3:
15601 case AMDGPUISD::FMINIMUM3: {
15602 // FIXME: Shouldn't treat the generic operations differently based on these.
15603 // However, we aren't really required to flush the result from
15604 // minnum/maxnum..
15605
15606 // snans will be quieted, so we only need to worry about denormals.
15607 if (Subtarget->supportsMinMaxDenormModes() ||
15608 // FIXME: denormalsEnabledForType is broken for dynamic
15609 denormalsEnabledForType(DAG, Op.getValueType()))
15610 return true;
15611
15612 // Flushing may be required.
15613 // In pre-GFX9 targets V_MIN_F32 and others do not flush denorms. For such
15614 // targets we need to check their inputs recursively.
15615
15616 // FIXME: Does this apply with clamp? It's implemented with max.
15617 for (unsigned I = 0, E = Op.getNumOperands(); I != E; ++I) {
15618 if (!isCanonicalized(DAG, Op.getOperand(I), MaxDepth - 1))
15619 return false;
15620 }
15621
15622 return true;
15623 }
15624 case ISD::SELECT: {
15625 return isCanonicalized(DAG, Op.getOperand(1), MaxDepth - 1) &&
15626 isCanonicalized(DAG, Op.getOperand(2), MaxDepth - 1);
15627 }
15628 case ISD::BUILD_VECTOR: {
15629 for (unsigned i = 0, e = Op.getNumOperands(); i != e; ++i) {
15630 SDValue SrcOp = Op.getOperand(i);
15631 if (!isCanonicalized(DAG, SrcOp, MaxDepth - 1))
15632 return false;
15633 }
15634
15635 return true;
15636 }
15637 case ISD::EXTRACT_VECTOR_ELT:
15638 case ISD::EXTRACT_SUBVECTOR: {
15639 return isCanonicalized(DAG, Op.getOperand(0), MaxDepth - 1);
15640 }
15641 case ISD::INSERT_VECTOR_ELT: {
15642 return isCanonicalized(DAG, Op.getOperand(0), MaxDepth - 1) &&
15643 isCanonicalized(DAG, Op.getOperand(1), MaxDepth - 1);
15644 }
15645 case ISD::UNDEF:
15646 // Could be anything.
15647 return false;
15648
15649 case ISD::BITCAST:
15650 // TODO: This is incorrect as it loses track of the operand's type. We may
15651 // end up effectively bitcasting from f32 to v2f16 or vice versa, and the
15652 // same bits that are canonicalized in one type need not be in the other.
15653 return isCanonicalized(DAG, Op.getOperand(0), MaxDepth - 1);
15654 case ISD::TRUNCATE: {
15655 // Hack around the mess we make when legalizing extract_vector_elt
15656 if (Op.getValueType() == MVT::i16) {
15657 SDValue TruncSrc = Op.getOperand(0);
15658 if (TruncSrc.getValueType() == MVT::i32 &&
15659 TruncSrc.getOpcode() == ISD::BITCAST &&
15660 TruncSrc.getOperand(0).getValueType() == MVT::v2f16) {
15661 return isCanonicalized(DAG, TruncSrc.getOperand(0), MaxDepth - 1);
15662 }
15663 }
15664 return false;
15665 }
15666 case ISD::INTRINSIC_WO_CHAIN: {
15667 unsigned IntrinsicID = Op.getConstantOperandVal(0);
15668 // TODO: Handle more intrinsics
15669 switch (IntrinsicID) {
15670 case Intrinsic::amdgcn_cvt_pkrtz:
15671 case Intrinsic::amdgcn_cubeid:
15672 case Intrinsic::amdgcn_frexp_mant:
15673 case Intrinsic::amdgcn_fdot2:
15674 case Intrinsic::amdgcn_rcp:
15675 case Intrinsic::amdgcn_rsq:
15676 case Intrinsic::amdgcn_rsq_clamp:
15677 case Intrinsic::amdgcn_rcp_legacy:
15678 case Intrinsic::amdgcn_rsq_legacy:
15679 case Intrinsic::amdgcn_trig_preop:
15680 case Intrinsic::amdgcn_tanh:
15681 case Intrinsic::amdgcn_log:
15682 case Intrinsic::amdgcn_exp2:
15683 case Intrinsic::amdgcn_sqrt:
15684 return true;
15685 default:
15686 break;
15687 }
15688
15689 break;
15690 }
15691 default:
15692 break;
15693 }
15694
15695 // FIXME: denormalsEnabledForType is broken for dynamic
15696 return denormalsEnabledForType(DAG, Op.getValueType()) &&
15697 (UserFlags.hasNoNaNs() || DAG.isKnownNeverSNaN(Op));
15698}
15699
15700bool SITargetLowering::isCanonicalized(Register Reg, const MachineFunction &MF,
15701 unsigned MaxDepth) const {
15702 const MachineRegisterInfo &MRI = MF.getRegInfo();
15703 MachineInstr *MI = MRI.getVRegDef(Reg);
15704 unsigned Opcode = MI->getOpcode();
15705
15706 if (Opcode == AMDGPU::G_FCANONICALIZE)
15707 return true;
15708
15709 std::optional<FPValueAndVReg> FCR;
15710 // Constant splat (can be padded with undef) or scalar constant.
15711 if (mi_match(Reg, MRI, MIPatternMatch::m_GFCstOrSplat(FCR))) {
15712 if (FCR->Value.isSignaling())
15713 return false;
15714 if (!FCR->Value.isDenormal())
15715 return true;
15716
15717 DenormalMode Mode = MF.getDenormalMode(FCR->Value.getSemantics());
15718 return Mode == DenormalMode::getIEEE();
15719 }
15720
15721 if (MaxDepth == 0)
15722 return false;
15723
15724 switch (Opcode) {
15725 case AMDGPU::G_FADD:
15726 case AMDGPU::G_FSUB:
15727 case AMDGPU::G_FMUL:
15728 case AMDGPU::G_FCEIL:
15729 case AMDGPU::G_FFLOOR:
15730 case AMDGPU::G_FRINT:
15731 case AMDGPU::G_FNEARBYINT:
15732 case AMDGPU::G_INTRINSIC_FPTRUNC_ROUND:
15733 case AMDGPU::G_INTRINSIC_TRUNC:
15734 case AMDGPU::G_INTRINSIC_ROUNDEVEN:
15735 case AMDGPU::G_FMA:
15736 case AMDGPU::G_FMAD:
15737 case AMDGPU::G_FSQRT:
15738 case AMDGPU::G_FDIV:
15739 case AMDGPU::G_FREM:
15740 case AMDGPU::G_FPOW:
15741 case AMDGPU::G_FPEXT:
15742 case AMDGPU::G_FLOG:
15743 case AMDGPU::G_FLOG2:
15744 case AMDGPU::G_FLOG10:
15745 case AMDGPU::G_FPTRUNC:
15746 case AMDGPU::G_AMDGPU_RCP_IFLAG:
15747 case AMDGPU::G_AMDGPU_CVT_F32_UBYTE0:
15748 case AMDGPU::G_AMDGPU_CVT_F32_UBYTE1:
15749 case AMDGPU::G_AMDGPU_CVT_F32_UBYTE2:
15750 case AMDGPU::G_AMDGPU_CVT_F32_UBYTE3:
15751 return true;
15752 case AMDGPU::G_FNEG:
15753 case AMDGPU::G_FABS:
15754 case AMDGPU::G_FCOPYSIGN:
15755 return isCanonicalized(MI->getOperand(1).getReg(), MF, MaxDepth - 1);
15756 case AMDGPU::G_FMINNUM:
15757 case AMDGPU::G_FMAXNUM:
15758 case AMDGPU::G_FMINNUM_IEEE:
15759 case AMDGPU::G_FMAXNUM_IEEE:
15760 case AMDGPU::G_FMINIMUM:
15761 case AMDGPU::G_FMAXIMUM:
15762 case AMDGPU::G_FMINIMUMNUM:
15763 case AMDGPU::G_FMAXIMUMNUM: {
15764 if (Subtarget->supportsMinMaxDenormModes() ||
15765 // FIXME: denormalsEnabledForType is broken for dynamic
15766 denormalsEnabledForType(MRI.getType(Reg), MF))
15767 return true;
15768
15769 [[fallthrough]];
15770 }
15771 case AMDGPU::G_BUILD_VECTOR:
15772 for (const MachineOperand &MO : llvm::drop_begin(MI->operands()))
15773 if (!isCanonicalized(MO.getReg(), MF, MaxDepth - 1))
15774 return false;
15775 return true;
15776 case AMDGPU::G_INTRINSIC:
15777 case AMDGPU::G_INTRINSIC_CONVERGENT:
15778 switch (cast<GIntrinsic>(MI)->getIntrinsicID()) {
15779 case Intrinsic::amdgcn_fmul_legacy:
15780 case Intrinsic::amdgcn_fmad_ftz:
15781 case Intrinsic::amdgcn_sqrt:
15782 case Intrinsic::amdgcn_fmed3:
15783 case Intrinsic::amdgcn_sin:
15784 case Intrinsic::amdgcn_cos:
15785 case Intrinsic::amdgcn_log:
15786 case Intrinsic::amdgcn_exp2:
15787 case Intrinsic::amdgcn_log_clamp:
15788 case Intrinsic::amdgcn_rcp:
15789 case Intrinsic::amdgcn_rcp_legacy:
15790 case Intrinsic::amdgcn_rsq:
15791 case Intrinsic::amdgcn_rsq_clamp:
15792 case Intrinsic::amdgcn_rsq_legacy:
15793 case Intrinsic::amdgcn_div_scale:
15794 case Intrinsic::amdgcn_div_fmas:
15795 case Intrinsic::amdgcn_div_fixup:
15796 case Intrinsic::amdgcn_fract:
15797 case Intrinsic::amdgcn_cvt_pkrtz:
15798 case Intrinsic::amdgcn_cubeid:
15799 case Intrinsic::amdgcn_cubema:
15800 case Intrinsic::amdgcn_cubesc:
15801 case Intrinsic::amdgcn_cubetc:
15802 case Intrinsic::amdgcn_frexp_mant:
15803 case Intrinsic::amdgcn_fdot2:
15804 case Intrinsic::amdgcn_trig_preop:
15805 case Intrinsic::amdgcn_tanh:
15806 return true;
15807 default:
15808 break;
15809 }
15810
15811 [[fallthrough]];
15812 default:
15813 return false;
15814 }
15815
15816 llvm_unreachable("invalid operation");
15817}
15818
15819// Constant fold canonicalize.
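// For example, with the preserve-sign denormal mode a positive denormal folds
// to +0.0, and a signaling NaN folds to the canonical quiet NaN.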
15820SDValue SITargetLowering::getCanonicalConstantFP(SelectionDAG &DAG,
15821 const SDLoc &SL, EVT VT,
15822 const APFloat &C) const {
15823 // Flush denormals to 0 if not enabled.
15824 if (C.isDenormal()) {
15825 DenormalMode Mode =
15826 DAG.getMachineFunction().getDenormalMode(C.getSemantics());
15827 if (Mode == DenormalMode::getPreserveSign()) {
15828 return DAG.getConstantFP(
15829 APFloat::getZero(C.getSemantics(), C.isNegative()), SL, VT);
15830 }
15831
15832 if (Mode != DenormalMode::getIEEE())
15833 return SDValue();
15834 }
15835
15836 if (C.isNaN()) {
15837 APFloat CanonicalQNaN = APFloat::getQNaN(C.getSemantics());
15838 if (C.isSignaling()) {
15839 // Quiet a signaling NaN.
15840 // FIXME: Is this supposed to preserve payload bits?
15841 return DAG.getConstantFP(CanonicalQNaN, SL, VT);
15842 }
15843
15844 // Make sure it is the canonical NaN bitpattern.
15845 //
15846 // TODO: Can we use -1 as the canonical NaN value since it's an inline
15847 // immediate?
15848 if (C.bitcastToAPInt() != CanonicalQNaN.bitcastToAPInt())
15849 return DAG.getConstantFP(CanonicalQNaN, SL, VT);
15850 }
15851
15852 // Already canonical.
15853 return DAG.getConstantFP(C, SL, VT);
15854}
15855
15856static bool vectorEltWillFoldAway(SDValue Op) {
15857 return Op.isUndef() || isa<ConstantFPSDNode>(Op);
15858}
15859
15860SDValue
15861SITargetLowering::performFCanonicalizeCombine(SDNode *N,
15862 DAGCombinerInfo &DCI) const {
15863 SelectionDAG &DAG = DCI.DAG;
15864 SDValue N0 = N->getOperand(0);
15865 EVT VT = N->getValueType(0);
15866
15867 // fcanonicalize undef -> qnan
15868 if (N0.isUndef()) {
15869 APFloat QNaN = APFloat::getQNaN(VT.getFltSemantics());
15870 return DAG.getConstantFP(QNaN, SDLoc(N), VT);
15871 }
15872
15873 if (ConstantFPSDNode *CFP = isConstOrConstSplatFP(N0)) {
15874 EVT VT = N->getValueType(0);
15875 return getCanonicalConstantFP(DAG, SDLoc(N), VT, CFP->getValueAPF());
15876 }
15877
15878 // fcanonicalize (build_vector x, k) -> build_vector (fcanonicalize x),
15879 // (fcanonicalize k)
15880 //
15881 // fcanonicalize (build_vector x, undef) -> build_vector (fcanonicalize x), 0
15882
15883 // TODO: This could be better with wider vectors that will be split to v2f16,
15884 // and to consider uses since there aren't that many packed operations.
15885 if (N0.getOpcode() == ISD::BUILD_VECTOR && VT == MVT::v2f16 &&
15886 isTypeLegal(MVT::v2f16)) {
15887 SDLoc SL(N);
15888 SDValue NewElts[2];
15889 SDValue Lo = N0.getOperand(0);
15890 SDValue Hi = N0.getOperand(1);
15891 EVT EltVT = Lo.getValueType();
15892
15893 if (vectorEltWillFoldAway(Lo) || vectorEltWillFoldAway(Hi)) {
15894 for (unsigned I = 0; I != 2; ++I) {
15895 SDValue Op = N0.getOperand(I);
15896 if (ConstantFPSDNode *CFP = dyn_cast<ConstantFPSDNode>(Op)) {
15897 NewElts[I] =
15898 getCanonicalConstantFP(DAG, SL, EltVT, CFP->getValueAPF());
15899 } else if (Op.isUndef()) {
15900 // Handled below based on what the other operand is.
15901 NewElts[I] = Op;
15902 } else {
15903 NewElts[I] = DAG.getNode(ISD::FCANONICALIZE, SL, EltVT, Op);
15904 }
15905 }
15906
15907 // If one half is undef, and one is constant, prefer a splat vector rather
15908 // than the normal qNaN. If it's a register, prefer 0.0 since that's
15909 // cheaper to use and may be free with a packed operation.
15910 if (NewElts[0].isUndef()) {
15911 if (isa<ConstantFPSDNode>(NewElts[1]))
15912 NewElts[0] = isa<ConstantFPSDNode>(NewElts[1])
15913 ? NewElts[1]
15914 : DAG.getConstantFP(0.0f, SL, EltVT);
15915 }
15916
15917 if (NewElts[1].isUndef()) {
15918 NewElts[1] = isa<ConstantFPSDNode>(NewElts[0])
15919 ? NewElts[0]
15920 : DAG.getConstantFP(0.0f, SL, EltVT);
15921 }
15922
15923 return DAG.getBuildVector(VT, SL, NewElts);
15924 }
15925 }
15926
15927 return SDValue();
15928}
15929
15930static unsigned minMaxOpcToMin3Max3Opc(unsigned Opc) {
15931 switch (Opc) {
15932 case ISD::FMAXNUM:
15933 case ISD::FMAXNUM_IEEE:
15934 case ISD::FMAXIMUMNUM:
15935 return AMDGPUISD::FMAX3;
15936 case ISD::FMAXIMUM:
15937 return AMDGPUISD::FMAXIMUM3;
15938 case ISD::SMAX:
15939 return AMDGPUISD::SMAX3;
15940 case ISD::UMAX:
15941 return AMDGPUISD::UMAX3;
15942 case ISD::FMINNUM:
15943 case ISD::FMINNUM_IEEE:
15944 case ISD::FMINIMUMNUM:
15945 return AMDGPUISD::FMIN3;
15946 case ISD::FMINIMUM:
15947 return AMDGPUISD::FMINIMUM3;
15948 case ISD::SMIN:
15949 return AMDGPUISD::SMIN3;
15950 case ISD::UMIN:
15951 return AMDGPUISD::UMIN3;
15952 default:
15953 llvm_unreachable("Not a min/max opcode");
15954 }
15955}
15956
15957SDValue SITargetLowering::performIntMed3ImmCombine(SelectionDAG &DAG,
15958 const SDLoc &SL, SDValue Src,
15959 SDValue MinVal,
15960 SDValue MaxVal,
15961 bool Signed) const {
15962
15963 // med3 comes from
15964 // min(max(x, K0), K1), K0 < K1
15965 // max(min(x, K0), K1), K1 < K0
15966 //
15967 // "MinVal" and "MaxVal" respectively refer to the rhs of the
15968 // min/max op.
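// For example, (smin (smax x, 0), 255) becomes (smed3 x, 0, 255), clamping x
// to [0, 255].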
15969 ConstantSDNode *MinK = dyn_cast<ConstantSDNode>(MinVal);
15970 ConstantSDNode *MaxK = dyn_cast<ConstantSDNode>(MaxVal);
15971
15972 if (!MinK || !MaxK)
15973 return SDValue();
15974
15975 if (Signed) {
15976 if (MaxK->getAPIntValue().sge(MinK->getAPIntValue()))
15977 return SDValue();
15978 } else {
15979 if (MaxK->getAPIntValue().uge(MinK->getAPIntValue()))
15980 return SDValue();
15981 }
15982
15983 EVT VT = MinK->getValueType(0);
15984 unsigned Med3Opc = Signed ? AMDGPUISD::SMED3 : AMDGPUISD::UMED3;
15985 if (VT == MVT::i32 || (VT == MVT::i16 && Subtarget->hasMed3_16()))
15986 return DAG.getNode(Med3Opc, SL, VT, Src, MaxVal, MinVal);
15987
15988 // Note: we could also extend to i32 and use i32 med3 if i16 med3 is
15989 // not available, but this is unlikely to be profitable as constants
15990 // will often need to be materialized & extended, especially on
15991 // pre-GFX10 where VOP3 instructions couldn't take literal operands.
15992 return SDValue();
15993}
15994
15995static ConstantFPSDNode *getSplatConstantFP(SDValue Op) {
15996 if (ConstantFPSDNode *C = dyn_cast<ConstantFPSDNode>(Op))
15997 return C;
15998
15999 if (auto *BV = dyn_cast<BuildVectorSDNode>(Op)) {
16000 if (ConstantFPSDNode *C = BV->getConstantFPSplatNode())
16001 return C;
16002 }
16003
16004 return nullptr;
16005}
16006
16007SDValue SITargetLowering::performFPMed3ImmCombine(SelectionDAG &DAG,
16008 const SDLoc &SL, SDValue Op0,
16009 SDValue Op1,
16010 bool IsKnownNoNaNs) const {
16011 ConstantFPSDNode *K1 = getSplatConstantFP(Op1);
16012 if (!K1)
16013 return SDValue();
16014
16015 ConstantFPSDNode *K0 = getSplatConstantFP(Op0.getOperand(1));
16016 if (!K0)
16017 return SDValue();
16018
16019 // Ordered >= (although NaN inputs should have folded away by now).
16020 if (K0->getValueAPF() > K1->getValueAPF())
16021 return SDValue();
16022
16023 // med3 with a nan input acts like
16024 // v_min_f32(v_min_f32(S0.f32, S1.f32), S2.f32)
16025 //
16026 // So the result depends on whether the IEEE mode bit is enabled or not with a
16027 // signaling nan input.
16028 // ieee=1
16029 // s0 snan: yields s2
16030 // s1 snan: yields s2
16031 // s2 snan: qnan
16032
16033 // s0 qnan: min(s1, s2)
16034 // s1 qnan: min(s0, s2)
16035 // s2 qnan: min(s0, s1)
16036
16037 // ieee=0
16038 // s0 snan: min(s1, s2)
16039 // s1 snan: min(s0, s2)
16040 // s2 snan: qnan
16041
16042 // s0 qnan: min(s1, s2)
16043 // s1 qnan: min(s0, s2)
16044 // s2 qnan: min(s0, s1)
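// In short: with DX10_CLAMP enabled, NaN inputs clamp to 0.0, so forming
// CLAMP below is safe even for signaling NaNs; the general fmed3 formation
// further down still has to prove the variable operand is not a signaling
// NaN (or that the expression is known NaN-free).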
16045 const MachineFunction &MF = DAG.getMachineFunction();
16046 const SIMachineFunctionInfo *Info = MF.getInfo<SIMachineFunctionInfo>();
16047
16048 // TODO: Check IEEE bit enabled. We can form fmed3 with IEEE=0 regardless of
16049 // whether the input is a signaling nan if op0 is fmaximum or fmaximumnum. We
16050 // can only form it from fmaxnum_ieee when IEEE=1.
16051 EVT VT = Op0.getValueType();
16052 if (Info->getMode().DX10Clamp) {
16053 // If dx10_clamp is enabled, NaNs clamp to 0.0. This is the same as the
16054 // hardware fmed3 behavior converting to a min.
16055 // FIXME: Should this be allowing -0.0?
16056 if (K1->isExactlyValue(1.0) && K0->isExactlyValue(0.0))
16057 return DAG.getNode(AMDGPUISD::CLAMP, SL, VT, Op0.getOperand(0));
16058 }
16059
16060 // med3 for f16 is only available on gfx9+, and not available for v2f16.
16061 if (VT == MVT::f32 || (VT == MVT::f16 && Subtarget->hasMed3_16())) {
16062 // This isn't safe with signaling NaNs because in IEEE mode, min/max on a
16063 // signaling NaN gives a quiet NaN. The quiet NaN input to the min would
16064 // then give the other result, which is different from med3 with a NaN
16065 // input.
16066 SDValue Var = Op0.getOperand(0);
16067 if (!IsKnownNoNaNs && !DAG.isKnownNeverSNaN(Var))
16068 return SDValue();
16069
16070 const SIInstrInfo *TII = getSubtarget()->getInstrInfo();
16071
16072 if ((!K0->hasOneUse() || TII->isInlineConstant(K0->getValueAPF())) &&
16073 (!K1->hasOneUse() || TII->isInlineConstant(K1->getValueAPF()))) {
16074 return DAG.getNode(AMDGPUISD::FMED3, SL, K0->getValueType(0), Var,
16075 SDValue(K0, 0), SDValue(K1, 0));
16076 }
16077 }
16078
16079 return SDValue();
16080}
16081
16082/// \return true if the subtarget supports minimum3 and maximum3 with the given
16083/// base min/max opcode \p Opc for type \p VT.
16084static bool supportsMin3Max3(const GCNSubtarget &Subtarget, unsigned Opc,
16085 EVT VT) {
16086 switch (Opc) {
16087 case ISD::FMINNUM:
16088 case ISD::FMAXNUM:
16089 case ISD::FMINNUM_IEEE:
16090 case ISD::FMAXNUM_IEEE:
16091 case ISD::FMINIMUMNUM:
16092 case ISD::FMAXIMUMNUM:
16093 case AMDGPUISD::FMIN_LEGACY:
16094 case AMDGPUISD::FMAX_LEGACY:
16095 return (VT == MVT::f32) || (VT == MVT::f16 && Subtarget.hasMin3Max3_16()) ||
16096 (VT == MVT::v2f16 && Subtarget.hasMin3Max3PKF16());
16097 case ISD::FMINIMUM:
16098 case ISD::FMAXIMUM:
16099 return (VT == MVT::f32 && Subtarget.hasMinimum3Maximum3F32()) ||
16100 (VT == MVT::f16 && Subtarget.hasMinimum3Maximum3F16()) ||
16101 (VT == MVT::v2f16 && Subtarget.hasMinimum3Maximum3PKF16());
16102 case ISD::SMAX:
16103 case ISD::SMIN:
16104 case ISD::UMAX:
16105 case ISD::UMIN:
16106 return (VT == MVT::i32) || (VT == MVT::i16 && Subtarget.hasMin3Max3_16());
16107 default:
16108 return false;
16109 }
16110
16111 llvm_unreachable("not a min/max opcode");
16112}
16113
16114SDValue SITargetLowering::performMinMaxCombine(SDNode *N,
16115 DAGCombinerInfo &DCI) const {
16116 SelectionDAG &DAG = DCI.DAG;
16117
16118 EVT VT = N->getValueType(0);
16119 unsigned Opc = N->getOpcode();
16120 SDValue Op0 = N->getOperand(0);
16121 SDValue Op1 = N->getOperand(1);
16122
16123 // Only do this if the inner op has one use since this will just increase
16124 // register pressure for no benefit.
16125
16126 if (supportsMin3Max3(*Subtarget, Opc, VT)) {
16127 auto IsTreeWithCombinableChildren = [Opc](SDValue Op) {
16128 return (Op.getOperand(0).getOpcode() == Opc &&
16129 Op.getOperand(0).hasOneUse()) ||
16130 (Op.getOperand(1).getOpcode() == Opc &&
16131 Op.getOperand(1).hasOneUse());
16132 };
16133
16134 bool CanTreeCombineApply = Op0.getOpcode() == Opc && Op0.hasOneUse() &&
16135 Op1.getOpcode() == Opc && Op1.hasOneUse();
16136 bool HasCombinableTreeChild =
16137 CanTreeCombineApply && (IsTreeWithCombinableChildren(Op0) ||
16138 IsTreeWithCombinableChildren(Op1));
16139
16140 // Tree reduction: when both operands are the same min/max op, restructure
16141 // to keep a 2-op node on top so higher tree levels can still combine.
16142 //
16143 // max(max(a, b), max(c, d)) -> max(max3(a, b, c), d)
16144 // min(min(a, b), min(c, d)) -> min(min3(a, b, c), d)
16145 //
16146 // Defer when either inner op is a tree node with combinable children.
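// For example (illustrative only), max(max(a, b), max(c, d)) becomes
// max(max3(a, b, c), d); the surviving two-operand max on top can itself be
// folded into a max3 when the combine later runs on its user.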
16147 if (CanTreeCombineApply && !HasCombinableTreeChild) {
16148 SDLoc DL(N);
16149 SDValue Inner =
16150 DAG.getNode(minMaxOpcToMin3Max3Opc(Opc), DL, VT, Op0.getOperand(0),
16151 Op0.getOperand(1), Op1.getOperand(0));
16152 return DAG.getNode(Opc, DL, VT, Inner, Op1.getOperand(1));
16153 }
16154
16155 // max(max(a, b), c) -> max3(a, b, c)
16156 // min(min(a, b), c) -> min3(a, b, c)
16157 // Deferred when Op0 is a tree node with combinable children.
16158 if (Op0.getOpcode() == Opc && Op0.hasOneUse() && !HasCombinableTreeChild) {
16159 SDLoc DL(N);
16160 return DAG.getNode(minMaxOpcToMin3Max3Opc(Opc), DL, N->getValueType(0),
16161 Op0.getOperand(0), Op0.getOperand(1), Op1);
16162 }
16163
16164 // Try commuted.
16165 // max(a, max(b, c)) -> max3(a, b, c)
16166 // min(a, min(b, c)) -> min3(a, b, c)
16167 // Deferred when Op1 is a tree node with combinable children.
16168 if (Op1.getOpcode() == Opc && Op1.hasOneUse() && !HasCombinableTreeChild) {
16169 SDLoc DL(N);
16170 return DAG.getNode(minMaxOpcToMin3Max3Opc(Opc), DL, N->getValueType(0),
16171 Op0, Op1.getOperand(0), Op1.getOperand(1));
16172 }
16173 }
16174
16175 // umin(sffbh(x), bitwidth) -> sffbh(x) if x is known to be not 0 or -1.
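// sffbh (s_flbit_i32) returns -1 (all ones) only for inputs 0 and -1; for any
// other input the result is already smaller than the bit width, so once 0 and
// -1 are ruled out the unsigned clamp is redundant.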
16176 SDValue FfbhSrc;
16177 uint64_t Clamp = 0;
16178 if (Opc == ISD::UMIN &&
16179 sd_match(Op0,
16180 m_OneUse(m_Node(AMDGPUISD::FFBH_I32, m_Value(FfbhSrc)))) &&
16181 sd_match(Op1, m_ConstInt(Clamp))) {
16182 unsigned BitWidth = FfbhSrc.getValueType().getScalarSizeInBits();
16183 if (Clamp >= BitWidth) {
16184 KnownBits Known = DAG.computeKnownBits(FfbhSrc);
16185 if (Known.isNonZero() && !Known.isAllOnes())
16186 return Op0;
16187 }
16188 }
16189
16190 // min(max(x, K0), K1), K0 < K1 -> med3(x, K0, K1)
16191 // max(min(x, K0), K1), K1 < K0 -> med3(x, K1, K0)
16192 if (Opc == ISD::SMIN && Op0.getOpcode() == ISD::SMAX && Op0.hasOneUse()) {
16193 if (SDValue Med3 = performIntMed3ImmCombine(
16194 DAG, SDLoc(N), Op0->getOperand(0), Op1, Op0->getOperand(1), true))
16195 return Med3;
16196 }
16197 if (Opc == ISD::SMAX && Op0.getOpcode() == ISD::SMIN && Op0.hasOneUse()) {
16198 if (SDValue Med3 = performIntMed3ImmCombine(
16199 DAG, SDLoc(N), Op0->getOperand(0), Op0->getOperand(1), Op1, true))
16200 return Med3;
16201 }
16202
16203 if (Opc == ISD::UMIN && Op0.getOpcode() == ISD::UMAX && Op0.hasOneUse()) {
16204 if (SDValue Med3 = performIntMed3ImmCombine(
16205 DAG, SDLoc(N), Op0->getOperand(0), Op1, Op0->getOperand(1), false))
16206 return Med3;
16207 }
16208 if (Opc == ISD::UMAX && Op0.getOpcode() == ISD::UMIN && Op0.hasOneUse()) {
16209 if (SDValue Med3 = performIntMed3ImmCombine(
16210 DAG, SDLoc(N), Op0->getOperand(0), Op0->getOperand(1), Op1, false))
16211 return Med3;
16212 }
16213
16214 // if !is_snan(x):
16215 // fminnum(fmaxnum(x, K0), K1), K0 < K1 -> fmed3(x, K0, K1)
16216 // fminnum_ieee(fmaxnum_ieee(x, K0), K1), K0 < K1 -> fmed3(x, K0, K1)
16217 // fminnumnum(fmaxnumnum(x, K0), K1), K0 < K1 -> fmed3(x, K0, K1)
16218 // fmin_legacy(fmax_legacy(x, K0), K1), K0 < K1 -> fmed3(x, K0, K1)
16219 if (((Opc == ISD::FMINNUM && Op0.getOpcode() == ISD::FMAXNUM) ||
16220 (Opc == ISD::FMINNUM_IEEE && Op0.getOpcode() == ISD::FMAXNUM_IEEE) ||
16221 (Opc == ISD::FMINIMUMNUM && Op0.getOpcode() == ISD::FMAXIMUMNUM) ||
16222 (Opc == AMDGPUISD::FMIN_LEGACY &&
16223 Op0.getOpcode() == AMDGPUISD::FMAX_LEGACY)) &&
16224 (VT == MVT::f32 || VT == MVT::f64 ||
16225 (VT == MVT::f16 && Subtarget->has16BitInsts()) ||
16226 (VT == MVT::bf16 && Subtarget->hasBF16PackedInsts()) ||
16227 (VT == MVT::v2bf16 && Subtarget->hasBF16PackedInsts()) ||
16228 (VT == MVT::v2f16 && Subtarget->hasVOP3PInsts())) &&
16229 Op0.hasOneUse()) {
16230 if (SDValue Res = performFPMed3ImmCombine(DAG, SDLoc(N), Op0, Op1,
16231 N->getFlags().hasNoNaNs()))
16232 return Res;
16233 }
16234
16235 // Prefer fminnum_ieee over fminimum. For gfx950, minimum/maximum are legal
16236 // for some types, but at a higher cost since it's implemented with a 3
16237 // operand form.
16238 const SDNodeFlags Flags = N->getFlags();
16239 if ((Opc == ISD::FMINIMUM || Opc == ISD::FMAXIMUM) && Flags.hasNoNaNs() &&
16240 !Subtarget->hasIEEEMinimumMaximumInsts() &&
16242 unsigned NewOpc =
16243 Opc == ISD::FMINIMUM ? ISD::FMINNUM_IEEE : ISD::FMAXNUM_IEEE;
16244 return DAG.getNode(NewOpc, SDLoc(N), VT, Op0, Op1, Flags);
16245 }
16246
16247 return SDValue();
16248}
16249
16250 static bool isClampZeroToOne(SDValue A, SDValue B) {
16251 if (ConstantFPSDNode *CA = dyn_cast<ConstantFPSDNode>(A)) {
16252 if (ConstantFPSDNode *CB = dyn_cast<ConstantFPSDNode>(B)) {
16253 // FIXME: Should this be allowing -0.0?
16254 return (CA->isExactlyValue(0.0) && CB->isExactlyValue(1.0)) ||
16255 (CA->isExactlyValue(1.0) && CB->isExactlyValue(0.0));
16256 }
16257 }
16258
16259 return false;
16260}
16261
16262// FIXME: Should only worry about snans for version with chain.
16263SDValue SITargetLowering::performFMed3Combine(SDNode *N,
16264 DAGCombinerInfo &DCI) const {
16265 EVT VT = N->getValueType(0);
16266 // v_med3_f32 and v_max_f32 behave identically wrt denorms, exceptions and
16267 // NaNs. With a NaN input, the order of the operands may change the result.
16268
16269 SelectionDAG &DAG = DCI.DAG;
16270 SDLoc SL(N);
16271
16272 SDValue Src0 = N->getOperand(0);
16273 SDValue Src1 = N->getOperand(1);
16274 SDValue Src2 = N->getOperand(2);
16275
16276 if (isClampZeroToOne(Src0, Src1)) {
16277 // const_a, const_b, x -> clamp is safe in all cases including signaling
16278 // nans.
16279 // FIXME: Should this be allowing -0.0?
16280 return DAG.getNode(AMDGPUISD::CLAMP, SL, VT, Src2);
16281 }
16282
16283 const MachineFunction &MF = DAG.getMachineFunction();
16284 const SIMachineFunctionInfo *Info = MF.getInfo<SIMachineFunctionInfo>();
16285
16286 // FIXME: dx10_clamp behavior assumed in instcombine. Should we really bother
16287 // handling no dx10-clamp?
16288 if (Info->getMode().DX10Clamp) {
16289 // If NaNs are clamped to 0, we are free to reorder the inputs.
16290
16291 if (isa<ConstantFPSDNode>(Src0) && !isa<ConstantFPSDNode>(Src1))
16292 std::swap(Src0, Src1);
16293
16294 if (isa<ConstantFPSDNode>(Src1) && !isa<ConstantFPSDNode>(Src2))
16295 std::swap(Src1, Src2);
16296
16297 if (isa<ConstantFPSDNode>(Src0) && !isa<ConstantFPSDNode>(Src1))
16298 std::swap(Src0, Src1);
16299
16300 if (isClampZeroToOne(Src1, Src2))
16301 return DAG.getNode(AMDGPUISD::CLAMP, SL, VT, Src0);
16302 }
16303
16304 return SDValue();
16305}
16306
16307SDValue SITargetLowering::performCvtPkRTZCombine(SDNode *N,
16308 DAGCombinerInfo &DCI) const {
16309 SDValue Src0 = N->getOperand(0);
16310 SDValue Src1 = N->getOperand(1);
16311 if (Src0.isUndef() && Src1.isUndef())
16312 return DCI.DAG.getUNDEF(N->getValueType(0));
16313 return SDValue();
16314}
16315
16316// Check if EXTRACT_VECTOR_ELT/INSERT_VECTOR_ELT (<n x e>, var-idx) should be
16317// expanded into a set of cmp/select instructions.
16318 bool SITargetLowering::shouldExpandVectorDynExt(unsigned EltSize,
16319 unsigned NumElem,
16320 bool IsDivergentIdx,
16321 const GCNSubtarget *Subtarget) {
16322 if (UseDivergentRegisterIndexing)
16323 return false;
16324
16325 unsigned VecSize = EltSize * NumElem;
16326
16327 // Sub-dword vectors of size 2 dword or less have better implementation.
16328 if (VecSize <= 64 && EltSize < 32)
16329 return false;
16330
16331 // Always expand the rest of sub-dword instructions, otherwise it will be
16332 // lowered via memory.
16333 if (EltSize < 32)
16334 return true;
16335
16336 // Always do this if var-idx is divergent, otherwise it will become a loop.
16337 if (IsDivergentIdx)
16338 return true;
16339
16340 // Large vectors would yield too many compares and v_cndmask_b32 instructions.
16341 unsigned NumInsts = NumElem /* Number of compares */ +
16342 ((EltSize + 31) / 32) * NumElem /* Number of cndmasks */;
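// For example, a uniform-index extract from v4f32 costs 4 compares plus
// 4 cndmasks (8 instructions) and is expanded; v8f64 costs 8 + 16 = 24 and
// is left to movrel / index-mode lowering instead.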
16343
16344 // On some architectures (GFX9) movrel is not available and it's better
16345 // to expand.
16346 if (Subtarget->useVGPRIndexMode())
16347 return NumInsts <= 16;
16348
16349 // If movrel is available, use it instead of expanding for vector of 8
16350 // elements.
16351 if (Subtarget->hasMovrel())
16352 return NumInsts <= 15;
16353
16354 return true;
16355}
16356
16357 bool SITargetLowering::shouldExpandVectorDynExt(SDNode *N) const {
16358 SDValue Idx = N->getOperand(N->getNumOperands() - 1);
16359 if (isa<ConstantSDNode>(Idx))
16360 return false;
16361
16362 SDValue Vec = N->getOperand(0);
16363 EVT VecVT = Vec.getValueType();
16364 EVT EltVT = VecVT.getVectorElementType();
16365 unsigned EltSize = EltVT.getSizeInBits();
16366 unsigned NumElem = VecVT.getVectorNumElements();
16367
16368 return SITargetLowering::shouldExpandVectorDynExt(
16369 EltSize, NumElem, Idx->isDivergent(), getSubtarget());
16370}
16371
16372SDValue
16373SITargetLowering::performExtractVectorEltCombine(SDNode *N,
16374 DAGCombinerInfo &DCI) const {
16375 SDValue Vec = N->getOperand(0);
16376 SelectionDAG &DAG = DCI.DAG;
16377
16378 EVT VecVT = Vec.getValueType();
16379 EVT VecEltVT = VecVT.getVectorElementType();
16380 EVT ResVT = N->getValueType(0);
16381
16382 unsigned VecSize = VecVT.getSizeInBits();
16383 unsigned VecEltSize = VecEltVT.getSizeInBits();
16384
16385 if ((Vec.getOpcode() == ISD::FNEG || Vec.getOpcode() == ISD::FABS) &&
16386 allUsesHaveSourceMods(N)) {
16387 SDLoc SL(N);
16388 SDValue Idx = N->getOperand(1);
16389 SDValue Elt =
16390 DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SL, ResVT, Vec.getOperand(0), Idx);
16391 return DAG.getNode(Vec.getOpcode(), SL, ResVT, Elt);
16392 }
16393
16394 // (extract_vector_element (and {y0, y1}, (build_vector 0x1f, 0x1f)), index)
16395 // -> (and (extract_vector_element {y0, y1}, index), 0x1f)
16396 // There are optimisations to transform 64-bit shifts into 32-bit shifts
16397 // depending on the shift operand. See e.g. performSraCombine().
16398 // This combine ensures that the optimisation is compatible with v2i32
16399 // legalised AND.
16400 if (VecVT == MVT::v2i32 && Vec->getOpcode() == ISD::AND &&
16401 Vec->getOperand(1)->getOpcode() == ISD::BUILD_VECTOR) {
16402
16403 auto *C = dyn_cast<ConstantSDNode>(Vec->getOperand(1)->getOperand(0));
16404 if (!C || C->getZExtValue() != 0x1f)
16405 return SDValue();
16406
16407 SDLoc SL(N);
16408 SDValue AndMask = DAG.getConstant(0x1f, SL, MVT::i32);
16409 SDValue EVE = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SL, MVT::i32,
16410 Vec->getOperand(0), N->getOperand(1));
16411 SDValue A = DAG.getNode(ISD::AND, SL, MVT::i32, EVE, AndMask);
16412 DAG.ReplaceAllUsesWith(N, A.getNode());
16413 }
16414
16415 // ScalarRes = EXTRACT_VECTOR_ELT ((vector-BINOP Vec1, Vec2), Idx)
16416 // =>
16417 // Vec1Elt = EXTRACT_VECTOR_ELT(Vec1, Idx)
16418 // Vec2Elt = EXTRACT_VECTOR_ELT(Vec2, Idx)
16419 // ScalarRes = scalar-BINOP Vec1Elt, Vec2Elt
16420 if (Vec.hasOneUse() && DCI.isBeforeLegalize() && VecEltVT == ResVT) {
16421 SDLoc SL(N);
16422 SDValue Idx = N->getOperand(1);
16423 unsigned Opc = Vec.getOpcode();
16424
16425 switch (Opc) {
16426 default:
16427 break;
16428 // TODO: Support other binary operations.
16429 case ISD::FADD:
16430 case ISD::FSUB:
16431 case ISD::FMUL:
16432 case ISD::ADD:
16433 case ISD::UMIN:
16434 case ISD::UMAX:
16435 case ISD::SMIN:
16436 case ISD::SMAX:
16437 case ISD::FMAXNUM:
16438 case ISD::FMINNUM:
16439 case ISD::FMAXNUM_IEEE:
16440 case ISD::FMINNUM_IEEE:
16441 case ISD::FMAXIMUM:
16442 case ISD::FMINIMUM: {
16443 SDValue Elt0 = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SL, ResVT,
16444 Vec.getOperand(0), Idx);
16445 SDValue Elt1 = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SL, ResVT,
16446 Vec.getOperand(1), Idx);
16447
16448 DCI.AddToWorklist(Elt0.getNode());
16449 DCI.AddToWorklist(Elt1.getNode());
16450 return DAG.getNode(Opc, SL, ResVT, Elt0, Elt1, Vec->getFlags());
16451 }
16452 }
16453 }
16454
16455 // EXTRACT_VECTOR_ELT (<n x e>, var-idx) => n x select (e, const-idx)
16456 if (shouldExpandVectorDynExt(N)) {
16457 SDLoc SL(N);
16458 SDValue Idx = N->getOperand(1);
16459 SDValue V;
16460 for (unsigned I = 0, E = VecVT.getVectorNumElements(); I < E; ++I) {
16461 SDValue IC = DAG.getVectorIdxConstant(I, SL);
16462 SDValue Elt = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SL, ResVT, Vec, IC);
16463 if (I == 0)
16464 V = Elt;
16465 else
16466 V = DAG.getSelectCC(SL, Idx, IC, Elt, V, ISD::SETEQ);
16467 }
16468 return V;
16469 }
16470
16471 // EXTRACT_VECTOR_ELT (v2i32 bitcast (i64/f64:k), Idx)
16472 // =>
16473 // i32:Lo(k) if Idx == 0, or
16474 // i32:Hi(k) if Idx == 1
16475 auto *Idx = dyn_cast<ConstantSDNode>(N->getOperand(1));
16476 if (Vec.getOpcode() == ISD::BITCAST && VecVT == MVT::v2i32 && Idx) {
16477 SDLoc SL(N);
16478 SDValue PeekThrough = Vec.getOperand(0);
16479 auto *KImm = dyn_cast<ConstantSDNode>(PeekThrough);
16480 if (KImm && KImm->getValueType(0).getSizeInBits() == 64) {
16481 uint64_t KImmValue = KImm->getZExtValue();
16482 return DAG.getConstant(
16483 (KImmValue >> (32 * Idx->getZExtValue())) & 0xffffffff, SL, MVT::i32);
16484 }
16485 auto *KFPImm = dyn_cast<ConstantFPSDNode>(PeekThrough);
16486 if (KFPImm && KFPImm->getValueType(0).getSizeInBits() == 64) {
16487 uint64_t KFPImmValue =
16488 KFPImm->getValueAPF().bitcastToAPInt().getZExtValue();
16489 return DAG.getConstant((KFPImmValue >> (32 * Idx->getZExtValue())) &
16490 0xffffffff,
16491 SL, MVT::i32);
16492 }
16493 }
16494
16495 if (!DCI.isBeforeLegalize())
16496 return SDValue();
16497
16498 // Try to turn sub-dword accesses of vectors into accesses of the same 32-bit
16499 // elements. This exposes more load reduction opportunities by replacing
16500 // multiple small extract_vector_elements with a single 32-bit extract.
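// For example, extracting element 5 of a loaded v8i8 becomes an extract of
// i32 element 1 of the bitcast vector, followed by a right shift by 8 and a
// truncate back to the element type.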
16501 if (isa<MemSDNode>(Vec) && VecEltSize <= 16 && VecEltVT.isByteSized() &&
16502 VecSize > 32 && VecSize % 32 == 0 && Idx) {
16503 EVT NewVT = getEquivalentMemType(*DAG.getContext(), VecVT);
16504
16505 unsigned BitIndex = Idx->getZExtValue() * VecEltSize;
16506 unsigned EltIdx = BitIndex / 32;
16507 unsigned LeftoverBitIdx = BitIndex % 32;
16508 SDLoc SL(N);
16509
16510 SDValue Cast = DAG.getNode(ISD::BITCAST, SL, NewVT, Vec);
16511 DCI.AddToWorklist(Cast.getNode());
16512
16513 SDValue Elt = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SL, MVT::i32, Cast,
16514 DAG.getConstant(EltIdx, SL, MVT::i32));
16515 DCI.AddToWorklist(Elt.getNode());
16516 SDValue Srl = DAG.getNode(ISD::SRL, SL, MVT::i32, Elt,
16517 DAG.getConstant(LeftoverBitIdx, SL, MVT::i32));
16518 DCI.AddToWorklist(Srl.getNode());
16519
16520 EVT VecEltAsIntVT = VecEltVT.changeTypeToInteger();
16521 SDValue Trunc = DAG.getNode(ISD::TRUNCATE, SL, VecEltAsIntVT, Srl);
16522 DCI.AddToWorklist(Trunc.getNode());
16523
16524 if (VecEltVT == ResVT) {
16525 return DAG.getNode(ISD::BITCAST, SL, VecEltVT, Trunc);
16526 }
16527
16528 assert(ResVT.isScalarInteger());
16529 return DAG.getAnyExtOrTrunc(Trunc, SL, ResVT);
16530 }
16531
16532 return SDValue();
16533}
16534
16535SDValue
16536SITargetLowering::performInsertVectorEltCombine(SDNode *N,
16537 DAGCombinerInfo &DCI) const {
16538 SDValue Vec = N->getOperand(0);
16539 SDValue Idx = N->getOperand(2);
16540 EVT VecVT = Vec.getValueType();
16541 EVT EltVT = VecVT.getVectorElementType();
16542
16543 // INSERT_VECTOR_ELT (<n x e>, var-idx)
16544 // => BUILD_VECTOR n x select (e, const-idx)
16545 if (!shouldExpandVectorDynExt(N))
16546 return SDValue();
16547
16548 SelectionDAG &DAG = DCI.DAG;
16549 SDLoc SL(N);
16550 SDValue Ins = N->getOperand(1);
16551 EVT IdxVT = Idx.getValueType();
16552
16553 SmallVector<SDValue, 16> Ops;
16554 for (unsigned I = 0, E = VecVT.getVectorNumElements(); I < E; ++I) {
16555 SDValue IC = DAG.getConstant(I, SL, IdxVT);
16556 SDValue Elt = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SL, EltVT, Vec, IC);
16557 SDValue V = DAG.getSelectCC(SL, Idx, IC, Ins, Elt, ISD::SETEQ);
16558 Ops.push_back(V);
16559 }
16560
16561 return DAG.getBuildVector(VecVT, SL, Ops);
16562}
16563
16564/// Return the source of an fp_extend from f16 to f32, or a converted FP
16565/// constant.
16566 static SDValue strictFPExtFromF16(SelectionDAG &DAG, SDValue Src) {
16567 if (Src.getOpcode() == ISD::FP_EXTEND &&
16568 Src.getOperand(0).getValueType() == MVT::f16) {
16569 return Src.getOperand(0);
16570 }
16571
16572 if (auto *CFP = dyn_cast<ConstantFPSDNode>(Src)) {
16573 APFloat Val = CFP->getValueAPF();
16574 bool LosesInfo = true;
16575 Val.convert(APFloat::IEEEhalf(), APFloat::rmNearestTiesToEven, &LosesInfo);
16576 if (!LosesInfo)
16577 return DAG.getConstantFP(Val, SDLoc(Src), MVT::f16);
16578 }
16579
16580 return SDValue();
16581}
16582
16583SDValue SITargetLowering::performFPRoundCombine(SDNode *N,
16584 DAGCombinerInfo &DCI) const {
16585 assert(Subtarget->has16BitInsts() && !Subtarget->hasMed3_16() &&
16586 "combine only useful on gfx8");
16587
16588 SDValue TruncSrc = N->getOperand(0);
16589 EVT VT = N->getValueType(0);
16590 if (VT != MVT::f16)
16591 return SDValue();
16592
16593 if (TruncSrc.getOpcode() != AMDGPUISD::FMED3 ||
16594 TruncSrc.getValueType() != MVT::f32 || !TruncSrc.hasOneUse())
16595 return SDValue();
16596
16597 SelectionDAG &DAG = DCI.DAG;
16598 SDLoc SL(N);
16599
16600 // Optimize f16 fmed3 pattern performed on f32. On gfx8 there is no f16 fmed3,
16601 // and expanding it with min/max saves 1 instruction vs. casting to f32 and
16602 // casting back.
16603
16604 // fptrunc (f32 (fmed3 (fpext f16:a, fpext f16:b, fpext f16:c))) =>
16605 // fmin(fmax(a, b), fmax(fmin(a, b), c))
16606 SDValue A = strictFPExtFromF16(DAG, TruncSrc.getOperand(0));
16607 if (!A)
16608 return SDValue();
16609
16610 SDValue B = strictFPExtFromF16(DAG, TruncSrc.getOperand(1));
16611 if (!B)
16612 return SDValue();
16613
16614 SDValue C = strictFPExtFromF16(DAG, TruncSrc.getOperand(2));
16615 if (!C)
16616 return SDValue();
16617
16618 // This changes signaling nan behavior. If an input is a signaling nan, it
16619 // would have been quieted by the fpext originally. We don't care because
16620 // these are unconstrained ops. If we needed to insert quieting canonicalizes
16621 // we would be worse off than just doing the promotion.
16622 SDValue A1 = DAG.getNode(ISD::FMINNUM_IEEE, SL, VT, A, B);
16623 SDValue B1 = DAG.getNode(ISD::FMAXNUM_IEEE, SL, VT, A, B);
16624 SDValue C1 = DAG.getNode(ISD::FMAXNUM_IEEE, SL, VT, A1, C);
16625 return DAG.getNode(ISD::FMINNUM_IEEE, SL, VT, B1, C1);
16626}
16627
16628unsigned SITargetLowering::getFusedOpcode(const SelectionDAG &DAG,
16629 const SDNode *N0,
16630 const SDNode *N1) const {
16631 EVT VT = N0->getValueType(0);
16632
16633 // Only do this if we are not trying to support denormals. v_mad_f32 does not
16634 // support denormals ever.
16635 if (((VT == MVT::f32 &&
16636 denormalModeIsFlushAllF32(DAG.getMachineFunction())) ||
16637 (VT == MVT::f16 && Subtarget->hasMadF16() &&
16638 denormalModeIsFlushAllF64F16(DAG.getMachineFunction()))) &&
16639 isOperationLegal(ISD::FMAD, VT))
16640 return ISD::FMAD;
16641
16642 const TargetOptions &Options = DAG.getTarget().Options;
16643 if ((Options.AllowFPOpFusion == FPOpFusion::Fast ||
16644 (N0->getFlags().hasAllowContract() &&
16645 N1->getFlags().hasAllowContract())) &&
16646 isFMAFasterThanFMulAndFAdd(DAG.getMachineFunction(), VT)) {
16647 return ISD::FMA;
16648 }
16649
16650 return 0;
16651}
16652
16653// For a reassociatable opcode perform:
16654// op x, (op y, z) -> op (op x, z), y, if x and z are uniform
16655SDValue SITargetLowering::reassociateScalarOps(SDNode *N,
16656 SelectionDAG &DAG) const {
16657 EVT VT = N->getValueType(0);
16658 if (VT != MVT::i32 && VT != MVT::i64)
16659 return SDValue();
16660
16661 if (DAG.isBaseWithConstantOffset(SDValue(N, 0)))
16662 return SDValue();
16663
16664 unsigned Opc = N->getOpcode();
16665 SDValue Op0 = N->getOperand(0);
16666 SDValue Op1 = N->getOperand(1);
16667
16668 if (!(Op0->isDivergent() ^ Op1->isDivergent()))
16669 return SDValue();
16670
16671 if (Op0->isDivergent())
16672 std::swap(Op0, Op1);
16673
16674 if (Op1.getOpcode() != Opc || !Op1.hasOneUse())
16675 return SDValue();
16676
16677 SDValue Op2 = Op1.getOperand(1);
16678 Op1 = Op1.getOperand(0);
16679 if (!(Op1->isDivergent() ^ Op2->isDivergent()))
16680 return SDValue();
16681
16682 if (Op1->isDivergent())
16683 std::swap(Op1, Op2);
16684
16685 SDLoc SL(N);
16686 SDValue Add1 = DAG.getNode(Opc, SL, VT, Op0, Op1);
16687 return DAG.getNode(Opc, SL, VT, Add1, Op2);
16688}
16689
16690static SDValue getMad64_32(SelectionDAG &DAG, const SDLoc &SL, EVT VT,
16691 SDValue N0, SDValue N1, SDValue N2, bool Signed) {
16692 unsigned MadOpc = Signed ? AMDGPUISD::MAD_I64_I32 : AMDGPUISD::MAD_U64_U32;
16693 SDVTList VTs = DAG.getVTList(MVT::i64, MVT::i1);
16694 SDValue Mad = DAG.getNode(MadOpc, SL, VTs, N0, N1, N2);
16695 return DAG.getNode(ISD::TRUNCATE, SL, VT, Mad);
16696}
16697
16698// Fold
16699// y = lshr i64 x, 32
16700// res = add (mul i64 y, Const), x where "Const" is a 64-bit constant
16701// with Const.hi == -1
16702// To
16703 // res = mad_u64_u32 y.lo, Const.lo, x.lo
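// Why this is valid: Const.hi == -1 means y * Const == y * Const.lo - (y << 32)
// (mod 2^64). Since y == x >> 32, the subtracted term cancels the high half of
// x, leaving y.lo * Const.lo + x.lo, which is exactly mad_u64_u32.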
16704 static SDValue tryFoldMADwithSRL(SelectionDAG &DAG, const SDLoc &SL,
16705 SDValue MulLHS, SDValue MulRHS,
16706 SDValue AddRHS) {
16707 if (MulRHS.getOpcode() == ISD::SRL)
16708 std::swap(MulLHS, MulRHS);
16709
16710 if (MulLHS.getValueType() != MVT::i64 || MulLHS.getOpcode() != ISD::SRL)
16711 return SDValue();
16712
16713 ConstantSDNode *ShiftVal = dyn_cast<ConstantSDNode>(MulLHS.getOperand(1));
16714 if (!ShiftVal || ShiftVal->getAsZExtVal() != 32 ||
16715 MulLHS.getOperand(0) != AddRHS)
16716 return SDValue();
16717
16718 ConstantSDNode *Const = dyn_cast<ConstantSDNode>(MulRHS);
16719 if (!Const || Hi_32(Const->getZExtValue()) != uint32_t(-1))
16720 return SDValue();
16721
16722 SDValue ConstMul =
16723 DAG.getConstant(Lo_32(Const->getZExtValue()), SL, MVT::i32);
16724 return getMad64_32(DAG, SL, MVT::i64,
16725 DAG.getNode(ISD::TRUNCATE, SL, MVT::i32, MulLHS), ConstMul,
16726 DAG.getZeroExtendInReg(AddRHS, SL, MVT::i32), false);
16727}
16728
16729// Fold (add (mul x, y), z) --> (mad_[iu]64_[iu]32 x, y, z) plus high
16730// multiplies, if any.
16731//
16732// Full 64-bit multiplies that feed into an addition are lowered here instead
16733// of using the generic expansion. The generic expansion ends up with
16734// a tree of ADD nodes that prevents us from using the "add" part of the
16735// MAD instruction. The expansion produced here results in a chain of ADDs
16736// instead of a tree.
16737SDValue SITargetLowering::tryFoldToMad64_32(SDNode *N,
16738 DAGCombinerInfo &DCI) const {
16739 assert(N->isAnyAdd());
16740
16741 SelectionDAG &DAG = DCI.DAG;
16742 EVT VT = N->getValueType(0);
16743 SDLoc SL(N);
16744 SDValue LHS = N->getOperand(0);
16745 SDValue RHS = N->getOperand(1);
16746
16747 if (VT.isVector())
16748 return SDValue();
16749
16750 // S_MUL_HI_[IU]32 was added in gfx9, which allows us to keep the overall
16751 // result in scalar registers for uniform values.
16752 if (!N->isDivergent() && Subtarget->hasSMulHi())
16753 return SDValue();
16754
16755 unsigned NumBits = VT.getScalarSizeInBits();
16756 if (NumBits <= 32 || NumBits > 64)
16757 return SDValue();
16758
16759 if (LHS.getOpcode() != ISD::MUL) {
16760 assert(RHS.getOpcode() == ISD::MUL);
16761 std::swap(LHS, RHS);
16762 }
16763
16764 // Avoid the fold if it would unduly increase the number of multiplies due to
16765 // multiple uses, except on hardware with full-rate multiply-add (which is
16766 // part of full-rate 64-bit ops).
16767 if (!Subtarget->hasFullRate64Ops()) {
16768 unsigned NumUsers = 0;
16769 for (SDNode *User : LHS->users()) {
16770 // There is a use that does not feed into addition, so the multiply can't
16771 // be removed. We prefer MUL + ADD + ADDC over MAD + MUL.
16772 if (!User->isAnyAdd())
16773 return SDValue();
16774
16775 // We prefer 2xMAD over MUL + 2xADD + 2xADDC (code density), and prefer
16776 // MUL + 3xADD + 3xADDC over 3xMAD.
16777 ++NumUsers;
16778 if (NumUsers >= 3)
16779 return SDValue();
16780 }
16781 }
16782
16783 SDValue MulLHS = LHS.getOperand(0);
16784 SDValue MulRHS = LHS.getOperand(1);
16785 SDValue AddRHS = RHS;
16786
16787 if (SDValue FoldedMAD = tryFoldMADwithSRL(DAG, SL, MulLHS, MulRHS, AddRHS))
16788 return FoldedMAD;
16789
16790 // Always check whether operands are small unsigned values, since that
16791 // knowledge is useful in more cases. Check for small signed values only if
16792 // doing so can unlock a shorter code sequence.
16793 bool MulLHSUnsigned32 = numBitsUnsigned(MulLHS, DAG) <= 32;
16794 bool MulRHSUnsigned32 = numBitsUnsigned(MulRHS, DAG) <= 32;
16795
16796 bool MulSignedLo = false;
16797 if (!MulLHSUnsigned32 || !MulRHSUnsigned32) {
16798 MulSignedLo =
16799 numBitsSigned(MulLHS, DAG) <= 32 && numBitsSigned(MulRHS, DAG) <= 32;
16800 }
16801
16802 // The operands and final result all have the same number of bits. If
16803 // operands need to be extended, they can be extended with garbage. The
16804 // resulting garbage in the high bits of the mad_[iu]64_[iu]32 result is
16805 // truncated away in the end.
16806 if (VT != MVT::i64) {
16807 MulLHS = DAG.getNode(ISD::ANY_EXTEND, SL, MVT::i64, MulLHS);
16808 MulRHS = DAG.getNode(ISD::ANY_EXTEND, SL, MVT::i64, MulRHS);
16809 AddRHS = DAG.getNode(ISD::ANY_EXTEND, SL, MVT::i64, AddRHS);
16810 }
16811
16812 // The basic code generated is conceptually straightforward. Pseudo code:
16813 //
16814 // accum = mad_64_32 lhs.lo, rhs.lo, accum
16815 // accum.hi = add (mul lhs.hi, rhs.lo), accum.hi
16816 // accum.hi = add (mul lhs.lo, rhs.hi), accum.hi
16817 //
16818 // The second and third lines are optional, depending on whether the factors
16819 // are {sign,zero}-extended or not.
16820 //
16821 // The actual DAG is noisier than the pseudo code, but only due to
16822 // instructions that disassemble values into low and high parts, and
16823 // assemble the final result.
16824 SDValue One = DAG.getConstant(1, SL, MVT::i32);
16825
16826 auto MulLHSLo = DAG.getNode(ISD::TRUNCATE, SL, MVT::i32, MulLHS);
16827 auto MulRHSLo = DAG.getNode(ISD::TRUNCATE, SL, MVT::i32, MulRHS);
16828 SDValue Accum =
16829 getMad64_32(DAG, SL, MVT::i64, MulLHSLo, MulRHSLo, AddRHS, MulSignedLo);
16830
16831 if (!MulSignedLo && (!MulLHSUnsigned32 || !MulRHSUnsigned32)) {
16832 auto [AccumLo, AccumHi] = DAG.SplitScalar(Accum, SL, MVT::i32, MVT::i32);
16833
16834 if (!MulLHSUnsigned32) {
16835 auto MulLHSHi =
16836 DAG.getNode(ISD::EXTRACT_ELEMENT, SL, MVT::i32, MulLHS, One);
16837 SDValue MulHi = DAG.getNode(ISD::MUL, SL, MVT::i32, MulLHSHi, MulRHSLo);
16838 AccumHi = DAG.getNode(ISD::ADD, SL, MVT::i32, MulHi, AccumHi);
16839 }
16840
16841 if (!MulRHSUnsigned32) {
16842 auto MulRHSHi =
16843 DAG.getNode(ISD::EXTRACT_ELEMENT, SL, MVT::i32, MulRHS, One);
16844 SDValue MulHi = DAG.getNode(ISD::MUL, SL, MVT::i32, MulLHSLo, MulRHSHi);
16845 AccumHi = DAG.getNode(ISD::ADD, SL, MVT::i32, MulHi, AccumHi);
16846 }
16847
16848 Accum = DAG.getBuildVector(MVT::v2i32, SL, {AccumLo, AccumHi});
16849 Accum = DAG.getBitcast(MVT::i64, Accum);
16850 }
16851
16852 if (VT != MVT::i64)
16853 Accum = DAG.getNode(ISD::TRUNCATE, SL, VT, Accum);
16854 return Accum;
16855}
16856
16857SDValue
16858SITargetLowering::foldAddSub64WithZeroLowBitsTo32(SDNode *N,
16859 DAGCombinerInfo &DCI) const {
16860 SDValue RHS = N->getOperand(1);
16861 auto *CRHS = dyn_cast<ConstantSDNode>(RHS);
16862 if (!CRHS)
16863 return SDValue();
16864
16865 // TODO: Worth using computeKnownBits? Maybe expensive since it's so
16866 // common.
16867 uint64_t Val = CRHS->getZExtValue();
16868 if (countr_zero(Val) >= 32) {
16869 SelectionDAG &DAG = DCI.DAG;
16870 SDLoc SL(N);
16871 SDValue LHS = N->getOperand(0);
16872
16873 // Avoid carry machinery if we know the low half of the add does not
16874 // contribute to the final result.
16875 //
16876 // add i64:x, K if computeTrailingZeros(K) >= 32
16877 // => build_pair (add x.hi, K.hi), x.lo
16878
16879 // Breaking the 64-bit add here with this strange constant is unlikely
16880 // to interfere with addressing mode patterns.
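// For example, add x, 0x123400000000 only needs a 32-bit add of 0x1234 into
// x.hi; x.lo is passed through unchanged.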
16881
16882 SDValue Hi = getHiHalf64(LHS, DAG);
16883 SDValue ConstHi32 = DAG.getConstant(Hi_32(Val), SL, MVT::i32);
16884 unsigned Opcode = N->getOpcode();
16885 if (Opcode == ISD::PTRADD)
16886 Opcode = ISD::ADD;
16887 SDValue AddHi =
16888 DAG.getNode(Opcode, SL, MVT::i32, Hi, ConstHi32, N->getFlags());
16889
16890 SDValue Lo = DAG.getNode(ISD::TRUNCATE, SL, MVT::i32, LHS);
16891 return DAG.getNode(ISD::BUILD_PAIR, SL, MVT::i64, Lo, AddHi);
16892 }
16893
16894 return SDValue();
16895}
16896
16897// Collect the ultimate src of each of the mul node's operands, and confirm
16898 // each operand is 8 bits wide.
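// For example, for mul (zext i8 %a to i32), (zext i8 %b to i32), byte 0 of each
// operand is provided by the original i8 value and byte 1 is a known zero, so
// both operands qualify.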
16899static std::optional<ByteProvider<SDValue>>
16900handleMulOperand(const SDValue &MulOperand) {
16901 auto Byte0 = calculateByteProvider(MulOperand, 0, 0);
16902 if (!Byte0 || Byte0->isConstantZero()) {
16903 return std::nullopt;
16904 }
16905 auto Byte1 = calculateByteProvider(MulOperand, 1, 0);
16906 if (Byte1 && !Byte1->isConstantZero()) {
16907 return std::nullopt;
16908 }
16909 return Byte0;
16910}
16911
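// Merge two v_perm selector masks. A selector byte of 0x0c reads a constant
// zero, so for every byte position at least one of the two masks must select
// zero (checked by the asserts below); the merged mask keeps whichever
// selector is not the constant-zero one.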
16912static unsigned addPermMasks(unsigned First, unsigned Second) {
16913 unsigned FirstCs = First & 0x0c0c0c0c;
16914 unsigned SecondCs = Second & 0x0c0c0c0c;
16915 unsigned FirstNoCs = First & ~0x0c0c0c0c;
16916 unsigned SecondNoCs = Second & ~0x0c0c0c0c;
16917
16918 assert((FirstCs & 0xFF) | (SecondCs & 0xFF));
16919 assert((FirstCs & 0xFF00) | (SecondCs & 0xFF00));
16920 assert((FirstCs & 0xFF0000) | (SecondCs & 0xFF0000));
16921 assert((FirstCs & 0xFF000000) | (SecondCs & 0xFF000000));
16922
16923 return (FirstNoCs | SecondNoCs) | (FirstCs & SecondCs);
16924}
16925
16926 struct DotSrc {
16927 SDValue SrcOp;
16928 int64_t PermMask;
16929 int64_t DWordOffset;
16930 };
16931
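// Record the byte providers for step 'Step' of the dot chain in Src0s/Src1s.
// Sources are keyed by (SDValue, dword offset); a new byte is merged into an
// existing entry's perm mask via addPermMasks, otherwise a new entry is added.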
16932 static void placeSources(ByteProvider<SDValue> &Src0,
16933 ByteProvider<SDValue> &Src1,
16934 SmallVectorImpl<DotSrc> &Src0s,
16935 SmallVectorImpl<DotSrc> &Src1s, int Step) {
16936
16937 assert(Src0.Src.has_value() && Src1.Src.has_value());
16938 // Src0s and Src1s are empty, just place arbitrarily.
16939 if (Step == 0) {
16940 Src0s.push_back({*Src0.Src, ((Src0.SrcOffset % 4) << 24) + 0x0c0c0c,
16941 Src0.SrcOffset / 4});
16942 Src1s.push_back({*Src1.Src, ((Src1.SrcOffset % 4) << 24) + 0x0c0c0c,
16943 Src1.SrcOffset / 4});
16944 return;
16945 }
16946
16947 for (int BPI = 0; BPI < 2; BPI++) {
16948 std::pair<ByteProvider<SDValue>, ByteProvider<SDValue>> BPP = {Src0, Src1};
16949 if (BPI == 1) {
16950 BPP = {Src1, Src0};
16951 }
16952 unsigned ZeroMask = 0x0c0c0c0c;
16953 unsigned FMask = 0xFF << (8 * (3 - Step));
16954
16955 unsigned FirstMask =
16956 (BPP.first.SrcOffset % 4) << (8 * (3 - Step)) | (ZeroMask & ~FMask);
16957 unsigned SecondMask =
16958 (BPP.second.SrcOffset % 4) << (8 * (3 - Step)) | (ZeroMask & ~FMask);
16959 // Attempt to find a Src vector which contains our SDValue; if so, add our
16960 // perm mask to the existing one. If we are unable to find a match for the
16961 // first SDValue, attempt to find a match for the second.
16962 int FirstGroup = -1;
16963 for (int I = 0; I < 2; I++) {
16964 SmallVectorImpl<DotSrc> &Srcs = I == 0 ? Src0s : Src1s;
16965 auto MatchesFirst = [&BPP](DotSrc &IterElt) {
16966 return IterElt.SrcOp == *BPP.first.Src &&
16967 (IterElt.DWordOffset == (BPP.first.SrcOffset / 4));
16968 };
16969
16970 auto *Match = llvm::find_if(Srcs, MatchesFirst);
16971 if (Match != Srcs.end()) {
16972 Match->PermMask = addPermMasks(FirstMask, Match->PermMask);
16973 FirstGroup = I;
16974 break;
16975 }
16976 }
16977 if (FirstGroup != -1) {
16978 SmallVectorImpl<DotSrc> &Srcs = FirstGroup == 1 ? Src0s : Src1s;
16979 auto MatchesSecond = [&BPP](DotSrc &IterElt) {
16980 return IterElt.SrcOp == *BPP.second.Src &&
16981 (IterElt.DWordOffset == (BPP.second.SrcOffset / 4));
16982 };
16983 auto *Match = llvm::find_if(Srcs, MatchesSecond);
16984 if (Match != Srcs.end()) {
16985 Match->PermMask = addPermMasks(SecondMask, Match->PermMask);
16986 } else
16987 Srcs.push_back({*BPP.second.Src, SecondMask, BPP.second.SrcOffset / 4});
16988 return;
16989 }
16990 }
16991
16992 // If we have made it here, then we could not find a match in Src0s or Src1s
16993 // for either Src0 or Src1, so just place them arbitrarily.
16994
16995 unsigned ZeroMask = 0x0c0c0c0c;
16996 unsigned FMask = 0xFF << (8 * (3 - Step));
16997
16998 Src0s.push_back(
16999 {*Src0.Src,
17000 ((Src0.SrcOffset % 4) << (8 * (3 - Step)) | (ZeroMask & ~FMask)),
17001 Src0.SrcOffset / 4});
17002 Src1s.push_back(
17003 {*Src1.Src,
17004 ((Src1.SrcOffset % 4) << (8 * (3 - Step)) | (ZeroMask & ~FMask)),
17005 Src1.SrcOffset / 4});
17006}
17007
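// Combine the collected sources into a single i32 dot operand: one source is
// handled with a single v_perm, and two or more sources are permuted pairwise
// and OR'd together.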
17008 static SDValue resolveSources(SelectionDAG &DAG, SDLoc SL,
17009 SmallVectorImpl<DotSrc> &Srcs, bool IsSigned,
17010 bool IsAny) {
17011
17012 // If we just have one source, just permute it accordingly.
17013 if (Srcs.size() == 1) {
17014 auto *Elt = Srcs.begin();
17015 auto EltOp = getDWordFromOffset(DAG, SL, Elt->SrcOp, Elt->DWordOffset);
17016
17017 // v_perm will produce the original value
17018 if (Elt->PermMask == 0x3020100)
17019 return EltOp;
17020
17021 return DAG.getNode(AMDGPUISD::PERM, SL, MVT::i32, EltOp, EltOp,
17022 DAG.getConstant(Elt->PermMask, SL, MVT::i32));
17023 }
17024
17025 auto *FirstElt = Srcs.begin();
17026 auto *SecondElt = std::next(FirstElt);
17027
17028 SmallVector<SDValue, 2> Perms;
17029 
17030 // If we have multiple sources in the chain, combine them via perms (using
17031 // calculated perm mask) and Ors.
17032 while (true) {
17033 auto FirstMask = FirstElt->PermMask;
17034 auto SecondMask = SecondElt->PermMask;
17035
17036 unsigned FirstCs = FirstMask & 0x0c0c0c0c;
17037 unsigned FirstPlusFour = FirstMask | 0x04040404;
17038 // 0x0c + 0x04 = 0x10, so anding with 0x0F will produce 0x00 for any
17039 // original 0x0C.
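// For example, a selector byte of 0x02 becomes 0x06 (same byte, taken from the
// other PERM operand), while 0x0c maps back to 0x0c and keeps selecting zero.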
17040 FirstMask = (FirstPlusFour & 0x0F0F0F0F) | FirstCs;
17041
17042 auto PermMask = addPermMasks(FirstMask, SecondMask);
17043 auto FirstVal =
17044 getDWordFromOffset(DAG, SL, FirstElt->SrcOp, FirstElt->DWordOffset);
17045 auto SecondVal =
17046 getDWordFromOffset(DAG, SL, SecondElt->SrcOp, SecondElt->DWordOffset);
17047
17048 Perms.push_back(DAG.getNode(AMDGPUISD::PERM, SL, MVT::i32, FirstVal,
17049 SecondVal,
17050 DAG.getConstant(PermMask, SL, MVT::i32)));
17051
17052 FirstElt = std::next(SecondElt);
17053 if (FirstElt == Srcs.end())
17054 break;
17055
17056 SecondElt = std::next(FirstElt);
17057 // If we only have a FirstElt, then just combine that into the cumulative
17058 // source node.
17059 if (SecondElt == Srcs.end()) {
17060 auto EltOp =
17061 getDWordFromOffset(DAG, SL, FirstElt->SrcOp, FirstElt->DWordOffset);
17062
17063 Perms.push_back(
17064 DAG.getNode(AMDGPUISD::PERM, SL, MVT::i32, EltOp, EltOp,
17065 DAG.getConstant(FirstElt->PermMask, SL, MVT::i32)));
17066 break;
17067 }
17068 }
17069
17070 assert(Perms.size() == 1 || Perms.size() == 2);
17071 return Perms.size() == 2
17072 ? DAG.getNode(ISD::OR, SL, MVT::i32, Perms[0], Perms[1])
17073 : Perms[0];
17074}
17075
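// For chains shorter than 4, the selectors were built against the high byte
// positions; shift them down so the live lanes occupy the low bytes and force
// the now-unused high bytes to 0x0c (constant zero).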
17076static void fixMasks(SmallVectorImpl<DotSrc> &Srcs, unsigned ChainLength) {
17077 for (auto &[EntryVal, EntryMask, EntryOffset] : Srcs) {
17078 EntryMask = EntryMask >> ((4 - ChainLength) * 8);
17079 auto ZeroMask = ChainLength == 2 ? 0x0c0c0000 : 0x0c000000;
17080 EntryMask += ZeroMask;
17081 }
17082}
17083
17084static bool isMul(const SDValue Op) {
17085 auto Opcode = Op.getOpcode();
17086
17087 return (Opcode == ISD::MUL || Opcode == AMDGPUISD::MUL_U24 ||
17088 Opcode == AMDGPUISD::MUL_I24);
17089}
17090
17091static std::optional<bool>
17092 checkDot4MulSignedness(const SDValue &N, ByteProvider<SDValue> &Src0,
17093 ByteProvider<SDValue> &Src1, const SDValue &S0Op,
17094 const SDValue &S1Op, const SelectionDAG &DAG) {
17095 // If both ops are i8s (pre legalize-dag), then the signedness semantics
17096 // of the dot4 is irrelevant.
17097 if (S0Op.getValueSizeInBits() == 8 && S1Op.getValueSizeInBits() == 8)
17098 return false;
17099
17100 auto Known0 = DAG.computeKnownBits(S0Op, 0);
17101 bool S0IsUnsigned = Known0.countMinLeadingZeros() > 0;
17102 bool S0IsSigned = Known0.countMinLeadingOnes() > 0;
17103 auto Known1 = DAG.computeKnownBits(S1Op, 0);
17104 bool S1IsUnsigned = Known1.countMinLeadingZeros() > 0;
17105 bool S1IsSigned = Known1.countMinLeadingOnes() > 0;
17106
17107 assert(!(S0IsUnsigned && S0IsSigned));
17108 assert(!(S1IsUnsigned && S1IsSigned));
17109
17110 // There are 9 possible permutations of
17111 // {S0IsUnsigned, S0IsSigned, S1IsUnsigned, S1IsSigned}
17112
17113 // In two permutations, the sign bits are known to be the same for both Ops,
17114 // so simply return Signed / Unsigned corresponding to the MSB
17115
17116 if ((S0IsUnsigned && S1IsUnsigned) || (S0IsSigned && S1IsSigned))
17117 return S0IsSigned;
17118
17119 // In another two permutations, the sign bits are known to be opposite. In
17120 // this case return std::nullopt to indicate a bad match.
17121
17122 if ((S0IsUnsigned && S1IsSigned) || (S0IsSigned && S1IsUnsigned))
17123 return std::nullopt;
17124
17125 // In the remaining five permutations, we don't know the value of the sign
17126 // bit for at least one Op. Since we have a valid ByteProvider, we know that
17127 // the upper bits must be extension bits. Thus, the only way for the sign
17128 // bit to be unknown is if it was sign extended from an unknown value, or if
17129 // it was any extended. In either case, it is correct to use the signed
17130 // version of the signedness semantics of dot4.
17131
17132 // In two of these permutations, we know the sign bit is set for
17133 // one op, and the other is unknown. It is okay to use the signed version of
17134 // dot4.
17135 if ((S0IsSigned && !(S1IsSigned || S1IsUnsigned)) ||
17136 ((S1IsSigned && !(S0IsSigned || S0IsUnsigned))))
17137 return true;
17138
17139 // In one such permutation, we don't know either of the sign bits. It is okay
17140 // to use the signed version of dot4.
17141 if ((!(S1IsSigned || S1IsUnsigned) && !(S0IsSigned || S0IsUnsigned)))
17142 return true;
17143
17144 // In two of these permutations, we know the sign bit is unset for
17145 // one op, and the other is unknown. Return std::nullopt to indicate a
17146 // bad match.
17147 if ((S0IsUnsigned && !(S1IsSigned || S1IsUnsigned)) ||
17148 ((S1IsUnsigned && !(S0IsSigned || S0IsUnsigned))))
17149 return std::nullopt;
17150
17151 llvm_unreachable("Fully covered condition");
17152}
17153
17154SDValue SITargetLowering::performAddCombine(SDNode *N,
17155 DAGCombinerInfo &DCI) const {
17156 SelectionDAG &DAG = DCI.DAG;
17157 EVT VT = N->getValueType(0);
17158 SDLoc SL(N);
17159 SDValue LHS = N->getOperand(0);
17160 SDValue RHS = N->getOperand(1);
17161
17162 if (LHS.getOpcode() == ISD::MUL || RHS.getOpcode() == ISD::MUL) {
17163 if (Subtarget->hasMad64_32()) {
17164 if (SDValue Folded = tryFoldToMad64_32(N, DCI))
17165 return Folded;
17166 }
17167 }
17168
17169 if (SDValue V = reassociateScalarOps(N, DAG)) {
17170 return V;
17171 }
17172
17173 if (VT == MVT::i64) {
17174 if (SDValue Folded = foldAddSub64WithZeroLowBitsTo32(N, DCI))
17175 return Folded;
17176 }
17177
17178 if ((isMul(LHS) || isMul(RHS)) && Subtarget->hasDot7Insts() &&
17179 (Subtarget->hasDot1Insts() || Subtarget->hasDot8Insts())) {
17180 SDValue TempNode(N, 0);
17181 std::optional<bool> IsSigned;
17182 SmallVector<DotSrc, 4> Src0s;
17183 SmallVector<DotSrc, 4> Src1s;
17184 SmallVector<SDValue, 4> Src2s;
17185 
17186 // Match the v_dot4 tree, while collecting src nodes.
17187 int ChainLength = 0;
17188 for (int I = 0; I < 4; I++) {
17189 auto MulIdx = isMul(LHS) ? 0 : isMul(RHS) ? 1 : -1;
17190 if (MulIdx == -1)
17191 break;
17192 auto Src0 = handleMulOperand(TempNode->getOperand(MulIdx)->getOperand(0));
17193 if (!Src0)
17194 break;
17195 auto Src1 = handleMulOperand(TempNode->getOperand(MulIdx)->getOperand(1));
17196 if (!Src1)
17197 break;
17198
17199 auto IterIsSigned = checkDot4MulSignedness(
17200 TempNode->getOperand(MulIdx), *Src0, *Src1,
17201 TempNode->getOperand(MulIdx)->getOperand(0),
17202 TempNode->getOperand(MulIdx)->getOperand(1), DAG);
17203 if (!IterIsSigned)
17204 break;
17205 if (!IsSigned)
17206 IsSigned = *IterIsSigned;
17207 if (*IterIsSigned != *IsSigned)
17208 break;
17209 placeSources(*Src0, *Src1, Src0s, Src1s, I);
17210 auto AddIdx = 1 - MulIdx;
17211 // Allow the special case where add (add (mul24, 0), mul24) has been
17212 // simplified to add (mul24, mul24).
17213 if (I == 2 && isMul(TempNode->getOperand(AddIdx))) {
17214 Src2s.push_back(TempNode->getOperand(AddIdx));
17215 auto Src0 =
17216 handleMulOperand(TempNode->getOperand(AddIdx)->getOperand(0));
17217 if (!Src0)
17218 break;
17219 auto Src1 =
17220 handleMulOperand(TempNode->getOperand(AddIdx)->getOperand(1));
17221 if (!Src1)
17222 break;
17223 auto IterIsSigned = checkDot4MulSignedness(
17224 TempNode->getOperand(AddIdx), *Src0, *Src1,
17225 TempNode->getOperand(AddIdx)->getOperand(0),
17226 TempNode->getOperand(AddIdx)->getOperand(1), DAG);
17227 if (!IterIsSigned)
17228 break;
17229 assert(IsSigned);
17230 if (*IterIsSigned != *IsSigned)
17231 break;
17232 placeSources(*Src0, *Src1, Src0s, Src1s, I + 1);
17233 Src2s.push_back(DAG.getConstant(0, SL, MVT::i32));
17234 ChainLength = I + 2;
17235 break;
17236 }
17237
17238 TempNode = TempNode->getOperand(AddIdx);
17239 Src2s.push_back(TempNode);
17240 ChainLength = I + 1;
17241 if (TempNode->getNumOperands() < 2)
17242 break;
17243 LHS = TempNode->getOperand(0);
17244 RHS = TempNode->getOperand(1);
17245 }
17246
17247 if (ChainLength < 2)
17248 return SDValue();
17249
17250 // Masks were constructed with the assumption that we would find a chain of
17251 // length 4. If not, we need to zero out the unused high bytes (via a perm
17252 // mask of 0x0c) so they do not affect the dot calculation.
17253 if (ChainLength < 4) {
17254 fixMasks(Src0s, ChainLength);
17255 fixMasks(Src1s, ChainLength);
17256 }
17257
17258 SDValue Src0, Src1;
17259
17260 // If we are just using a single source for both, and have permuted the
17261 // bytes consistently, we can just use the sources without permuting
17262 // (commutation).
17263 bool UseOriginalSrc = false;
17264 if (ChainLength == 4 && Src0s.size() == 1 && Src1s.size() == 1 &&
17265 Src0s.begin()->PermMask == Src1s.begin()->PermMask &&
17266 Src0s.begin()->SrcOp.getValueSizeInBits() >= 32 &&
17267 Src1s.begin()->SrcOp.getValueSizeInBits() >= 32) {
17268 SmallVector<unsigned, 4> SrcBytes;
17269 auto Src0Mask = Src0s.begin()->PermMask;
17270 SrcBytes.push_back(Src0Mask & 0xFF000000);
17271 bool UniqueEntries = true;
17272 for (auto I = 1; I < 4; I++) {
17273 auto NextByte = Src0Mask & (0xFF << ((3 - I) * 8));
17274
17275 if (is_contained(SrcBytes, NextByte)) {
17276 UniqueEntries = false;
17277 break;
17278 }
17279 SrcBytes.push_back(NextByte);
17280 }
17281
17282 if (UniqueEntries) {
17283 UseOriginalSrc = true;
17284
17285 auto *FirstElt = Src0s.begin();
17286 auto FirstEltOp =
17287 getDWordFromOffset(DAG, SL, FirstElt->SrcOp, FirstElt->DWordOffset);
17288
17289 auto *SecondElt = Src1s.begin();
17290 auto SecondEltOp = getDWordFromOffset(DAG, SL, SecondElt->SrcOp,
17291 SecondElt->DWordOffset);
17292
17293 Src0 = DAG.getBitcastedAnyExtOrTrunc(FirstEltOp, SL,
17294 MVT::getIntegerVT(32));
17295 Src1 = DAG.getBitcastedAnyExtOrTrunc(SecondEltOp, SL,
17296 MVT::getIntegerVT(32));
17297 }
17298 }
17299
17300 if (!UseOriginalSrc) {
17301 Src0 = resolveSources(DAG, SL, Src0s, false, true);
17302 Src1 = resolveSources(DAG, SL, Src1s, false, true);
17303 }
17304
17305 assert(IsSigned);
17306 SDValue Src2 =
17307 DAG.getExtOrTrunc(*IsSigned, Src2s[ChainLength - 1], SL, MVT::i32);
17308
17309 SDValue IID = DAG.getTargetConstant(*IsSigned ? Intrinsic::amdgcn_sdot4
17310 : Intrinsic::amdgcn_udot4,
17311 SL, MVT::i64);
17312
17313 assert(!VT.isVector());
17314 auto Dot = DAG.getNode(ISD::INTRINSIC_WO_CHAIN, SL, MVT::i32, IID, Src0,
17315 Src1, Src2, DAG.getTargetConstant(0, SL, MVT::i1));
17316
17317 return DAG.getExtOrTrunc(*IsSigned, Dot, SL, VT);
17318 }
17319
17320 if (VT != MVT::i32 || !DCI.isAfterLegalizeDAG())
17321 return SDValue();
17322
17323 // add x, zext (setcc) => uaddo_carry x, 0, setcc
17324 // add x, sext (setcc) => usubo_carry x, 0, setcc
17325 unsigned Opc = LHS.getOpcode();
17326 if (Opc == ISD::ZERO_EXTEND || Opc == ISD::SIGN_EXTEND ||
17327 Opc == ISD::ANY_EXTEND || Opc == ISD::UADDO_CARRY)
17328 std::swap(RHS, LHS);
17329
17330 Opc = RHS.getOpcode();
17331 switch (Opc) {
17332 default:
17333 break;
17334 case ISD::ZERO_EXTEND:
17335 case ISD::SIGN_EXTEND:
17336 case ISD::ANY_EXTEND: {
17337 auto Cond = RHS.getOperand(0);
17338 // If this won't be a real VOPC output, we would still need to insert an
17339 // extra instruction anyway.
17340 if (!isBoolSGPR(Cond))
17341 break;
17342 SDVTList VTList = DAG.getVTList(MVT::i32, MVT::i1);
17343 SDValue Args[] = {LHS, DAG.getConstant(0, SL, MVT::i32), Cond};
17344 Opc = (Opc == ISD::SIGN_EXTEND) ? ISD::USUBO_CARRY : ISD::UADDO_CARRY;
17345 return DAG.getNode(Opc, SL, VTList, Args);
17346 }
17347 case ISD::UADDO_CARRY: {
17348 // add x, (uaddo_carry y, 0, cc) => uaddo_carry x, y, cc
17349 if (!isNullConstant(RHS.getOperand(1)))
17350 break;
17351 SDValue Args[] = {LHS, RHS.getOperand(0), RHS.getOperand(2)};
17352 return DAG.getNode(ISD::UADDO_CARRY, SDLoc(N), RHS->getVTList(), Args);
17353 }
17354 }
17355 return SDValue();
17356}
17357
17358SDValue SITargetLowering::performPtrAddCombine(SDNode *N,
17359 DAGCombinerInfo &DCI) const {
17360 SelectionDAG &DAG = DCI.DAG;
17361 SDLoc DL(N);
17362 EVT VT = N->getValueType(0);
17363 SDValue N0 = N->getOperand(0);
17364 SDValue N1 = N->getOperand(1);
17365
17366 // The following folds transform PTRADDs into regular arithmetic in cases
17367 // where the PTRADD wouldn't be folded as an immediate offset into memory
17368 // instructions anyway. They are target-specific in that other targets might
17369 // prefer to not lose information about the pointer arithmetic.
17370
17371 // Fold (ptradd x, shl(0 - v, k)) -> sub(x, shl(v, k)).
17372 // Adapted from DAGCombiner::visitADDLikeCommutative.
17373 SDValue V, K;
17374 if (sd_match(N1, m_Shl(m_Neg(m_Value(V)), m_Value(K)))) {
17375 SDNodeFlags ShlFlags = N1->getFlags();
17376 // If the original shl is NUW and NSW, the first k+1 bits of 0-v are all 0,
17377 // so v is either 0 or the first k+1 bits of v are all 1 -> NSW can be
17378 // preserved.
17379 SDNodeFlags NewShlFlags =
17380 ShlFlags.hasNoUnsignedWrap() && ShlFlags.hasNoSignedWrap()
17381 ? SDNodeFlags::NoSignedWrap
17382 : SDNodeFlags();
17383 SDValue Inner = DAG.getNode(ISD::SHL, DL, VT, V, K, NewShlFlags);
17384 DCI.AddToWorklist(Inner.getNode());
17385 return DAG.getNode(ISD::SUB, DL, VT, N0, Inner);
17386 }
17387
17388 // Fold into Mad64 if the right-hand side is a MUL. Analogous to a fold in
17389 // performAddCombine.
17390 if (N1.getOpcode() == ISD::MUL) {
17391 if (Subtarget->hasMad64_32()) {
17392 if (SDValue Folded = tryFoldToMad64_32(N, DCI))
17393 return Folded;
17394 }
17395 }
17396
17397 // If the 32 low bits of the constant are all zero, there is nothing to fold
17398 // into an immediate offset, so it's better to eliminate the unnecessary
17399 // addition for the lower 32 bits than to preserve the PTRADD.
17400 // Analogous to a fold in performAddCombine.
17401 if (VT == MVT::i64) {
17402 if (SDValue Folded = foldAddSub64WithZeroLowBitsTo32(N, DCI))
17403 return Folded;
17404 }
17405
17406 if (N1.getOpcode() != ISD::ADD || !N1.hasOneUse())
17407 return SDValue();
17408
17409 SDValue X = N0;
17410 SDValue Y = N1.getOperand(0);
17411 SDValue Z = N1.getOperand(1);
17412 bool YIsConstant = DAG.isConstantIntBuildVectorOrConstantInt(Y);
17413 bool ZIsConstant = DAG.isConstantIntBuildVectorOrConstantInt(Z);
17414
17415 if (!YIsConstant && !ZIsConstant && !X->isDivergent() &&
17416 Y->isDivergent() != Z->isDivergent()) {
17417 // Reassociate (ptradd x, (add y, z)) -> (ptradd (ptradd x, y), z) if x and
17418 // y are uniform and z isn't.
17419 // Reassociate (ptradd x, (add y, z)) -> (ptradd (ptradd x, z), y) if x and
17420 // z are uniform and y isn't.
17421 // The goal is to push uniform operands up in the computation, so that they
17422 // can be handled with scalar operations. We can't use reassociateScalarOps
17423 // for this since it requires two identical commutative operations to
17424 // reassociate.
17425 if (Y->isDivergent())
17426 std::swap(Y, Z);
17427 // If both additions in the original were NUW, reassociation preserves that.
17428 SDNodeFlags ReassocFlags =
17429 (N->getFlags() & N1->getFlags()) & SDNodeFlags::NoUnsignedWrap;
17430 SDValue UniformInner = DAG.getMemBasePlusOffset(X, Y, DL, ReassocFlags);
17431 DCI.AddToWorklist(UniformInner.getNode());
17432 return DAG.getMemBasePlusOffset(UniformInner, Z, DL, ReassocFlags);
17433 }
17434
17435 return SDValue();
17436}
17437
17438static bool isCtlzOpc(unsigned Opc) {
17439 return Opc == ISD::CTLZ || Opc == ISD::CTLZ_ZERO_UNDEF;
17440}
17441
17442SDValue SITargetLowering::performSubCombine(SDNode *N,
17443 DAGCombinerInfo &DCI) const {
17444 SelectionDAG &DAG = DCI.DAG;
17445 EVT VT = N->getValueType(0);
17446
17447 if (VT == MVT::i64) {
17448 if (SDValue Folded = foldAddSub64WithZeroLowBitsTo32(N, DCI))
17449 return Folded;
17450 }
17451
17452 if (VT != MVT::i32)
17453 return SDValue();
17454
17455 SDLoc SL(N);
17456 SDValue LHS = N->getOperand(0);
17457 SDValue RHS = N->getOperand(1);
17458
17459 // sub x, zext (setcc) => usubo_carry x, 0, setcc
17460 // sub x, sext (setcc) => uaddo_carry x, 0, setcc
17461 unsigned Opc = RHS.getOpcode();
17462 switch (Opc) {
17463 default:
17464 break;
17465 case ISD::ZERO_EXTEND:
17466 case ISD::SIGN_EXTEND:
17467 case ISD::ANY_EXTEND: {
17468 auto Cond = RHS.getOperand(0);
17469 // If this won't be a real VOPC output, we would still need to insert an
17470 // extra instruction anyway.
17471 if (!isBoolSGPR(Cond))
17472 break;
17473 SDVTList VTList = DAG.getVTList(MVT::i32, MVT::i1);
17474 SDValue Args[] = {LHS, DAG.getConstant(0, SL, MVT::i32), Cond};
17475 Opc = (Opc == ISD::SIGN_EXTEND) ? ISD::UADDO_CARRY : ISD::USUBO_CARRY;
17476 return DAG.getNode(Opc, SL, VTList, Args);
17477 }
17478 }
17479
17480 if (LHS.getOpcode() == ISD::USUBO_CARRY) {
17481 // sub (usubo_carry x, 0, cc), y => usubo_carry x, y, cc
17482 if (!isNullConstant(LHS.getOperand(1)))
17483 return SDValue();
17484 SDValue Args[] = {LHS.getOperand(0), RHS, LHS.getOperand(2)};
17485 return DAG.getNode(ISD::USUBO_CARRY, SDLoc(N), LHS->getVTList(), Args);
17486 }
17487
17488 // sub (ctlz (xor x, (sra x, 31))), 1 -> ctls x.
17489 if (isOneConstant(RHS) && isCtlzOpc(LHS.getOpcode())) {
17490 SDValue CtlzSrc = LHS.getOperand(0);
17491 // Check for xor x, (sra x, 31) pattern.
17492 if (CtlzSrc.getOpcode() == ISD::XOR) {
17493 SDValue X = CtlzSrc.getOperand(0);
17494 SDValue SignExt = CtlzSrc.getOperand(1);
17495 // Try both ordering of XOR operands.
17496 if (SignExt.getOpcode() != ISD::SRA)
17497 std::swap(X, SignExt);
17498 if (SignExt.getOpcode() == ISD::SRA && SignExt.getOperand(0) == X) {
17499 ConstantSDNode *ShiftAmt =
17500 dyn_cast<ConstantSDNode>(SignExt.getOperand(1));
17501 unsigned BitWidth = X.getValueType().getScalarSizeInBits();
17502 if (ShiftAmt && ShiftAmt->getZExtValue() == BitWidth - 1)
17503 return DAG.getNode(ISD::CTLS, SL, VT, X);
17504 }
17505 }
17506 }
17507
17508 return SDValue();
17509}
17510
17511SDValue
17512SITargetLowering::performAddCarrySubCarryCombine(SDNode *N,
17513 DAGCombinerInfo &DCI) const {
17514
17515 if (N->getValueType(0) != MVT::i32)
17516 return SDValue();
17517
17518 if (!isNullConstant(N->getOperand(1)))
17519 return SDValue();
17520
17521 SelectionDAG &DAG = DCI.DAG;
17522 SDValue LHS = N->getOperand(0);
17523
17524 // uaddo_carry (add x, y), 0, cc => uaddo_carry x, y, cc
17525 // usubo_carry (sub x, y), 0, cc => usubo_carry x, y, cc
17526 unsigned LHSOpc = LHS.getOpcode();
17527 unsigned Opc = N->getOpcode();
17528 if ((LHSOpc == ISD::ADD && Opc == ISD::UADDO_CARRY) ||
17529 (LHSOpc == ISD::SUB && Opc == ISD::USUBO_CARRY)) {
17530 SDValue Args[] = {LHS.getOperand(0), LHS.getOperand(1), N->getOperand(2)};
17531 return DAG.getNode(Opc, SDLoc(N), N->getVTList(), Args);
17532 }
17533 return SDValue();
17534}
17535
17536SDValue SITargetLowering::performFAddCombine(SDNode *N,
17537 DAGCombinerInfo &DCI) const {
17538 if (DCI.getDAGCombineLevel() < AfterLegalizeDAG)
17539 return SDValue();
17540
17541 SelectionDAG &DAG = DCI.DAG;
17542 EVT VT = N->getValueType(0);
17543
17544 SDLoc SL(N);
17545 SDValue LHS = N->getOperand(0);
17546 SDValue RHS = N->getOperand(1);
17547
17548 // These should really be instruction patterns, but writing patterns with
17549 // source modifiers is a pain.
17550
17551 // fadd (fadd (a, a), b) -> mad 2.0, a, b
17552 if (LHS.getOpcode() == ISD::FADD) {
17553 SDValue A = LHS.getOperand(0);
17554 if (A == LHS.getOperand(1)) {
17555 unsigned FusedOp = getFusedOpcode(DAG, N, LHS.getNode());
17556 if (FusedOp != 0) {
17557 const SDValue Two = DAG.getConstantFP(2.0, SL, VT);
17558 return DAG.getNode(FusedOp, SL, VT, A, Two, RHS);
17559 }
17560 }
17561 }
17562
17563 // fadd (b, fadd (a, a)) -> mad 2.0, a, b
17564 if (RHS.getOpcode() == ISD::FADD) {
17565 SDValue A = RHS.getOperand(0);
17566 if (A == RHS.getOperand(1)) {
17567 unsigned FusedOp = getFusedOpcode(DAG, N, RHS.getNode());
17568 if (FusedOp != 0) {
17569 const SDValue Two = DAG.getConstantFP(2.0, SL, VT);
17570 return DAG.getNode(FusedOp, SL, VT, A, Two, LHS);
17571 }
17572 }
17573 }
17574
17575 return SDValue();
17576}
17577
17578SDValue SITargetLowering::performFSubCombine(SDNode *N,
17579 DAGCombinerInfo &DCI) const {
17580 if (DCI.getDAGCombineLevel() < AfterLegalizeDAG)
17581 return SDValue();
17582
17583 SelectionDAG &DAG = DCI.DAG;
17584 SDLoc SL(N);
17585 EVT VT = N->getValueType(0);
17586 assert(!VT.isVector());
17587
17588 // Try to get the fneg to fold into the source modifier. This undoes generic
17589 // DAG combines and folds them into the mad.
17590 //
17591 // Only do this if we are not trying to support denormals. v_mad_f32 does
17592 // not support denormals ever.
17593 SDValue LHS = N->getOperand(0);
17594 SDValue RHS = N->getOperand(1);
17595 if (LHS.getOpcode() == ISD::FADD) {
17596 // (fsub (fadd a, a), c) -> mad 2.0, a, (fneg c)
17597 SDValue A = LHS.getOperand(0);
17598 if (A == LHS.getOperand(1)) {
17599 unsigned FusedOp = getFusedOpcode(DAG, N, LHS.getNode());
17600 if (FusedOp != 0) {
17601 const SDValue Two = DAG.getConstantFP(2.0, SL, VT);
17602 SDValue NegRHS = DAG.getNode(ISD::FNEG, SL, VT, RHS);
17603
17604 return DAG.getNode(FusedOp, SL, VT, A, Two, NegRHS);
17605 }
17606 }
17607 }
17608
17609 if (RHS.getOpcode() == ISD::FADD) {
17610 // (fsub c, (fadd a, a)) -> mad -2.0, a, c
17611
17612 SDValue A = RHS.getOperand(0);
17613 if (A == RHS.getOperand(1)) {
17614 unsigned FusedOp = getFusedOpcode(DAG, N, RHS.getNode());
17615 if (FusedOp != 0) {
17616 const SDValue NegTwo = DAG.getConstantFP(-2.0, SL, VT);
17617 return DAG.getNode(FusedOp, SL, VT, A, NegTwo, LHS);
17618 }
17619 }
17620 }
17621
17622 return SDValue();
17623}
17624
17625SDValue SITargetLowering::performFDivCombine(SDNode *N,
17626 DAGCombinerInfo &DCI) const {
17627 SelectionDAG &DAG = DCI.DAG;
17628 SDLoc SL(N);
17629 EVT VT = N->getValueType(0);
17630
17631 // fsqrt legality correlates to rsq availability.
17632 if ((VT != MVT::f16 && VT != MVT::bf16) || !isOperationLegal(ISD::FSQRT, VT))
17633 return SDValue();
17634
17635 SDValue LHS = N->getOperand(0);
17636 SDValue RHS = N->getOperand(1);
17637
17638 SDNodeFlags Flags = N->getFlags();
17639 SDNodeFlags RHSFlags = RHS->getFlags();
17640 if (!Flags.hasAllowContract() || !RHSFlags.hasAllowContract() ||
17641 !RHS->hasOneUse())
17642 return SDValue();
17643
17644 if (const ConstantFPSDNode *CLHS = dyn_cast<ConstantFPSDNode>(LHS)) {
17645 bool IsNegative = false;
17646 if (CLHS->isExactlyValue(1.0) ||
17647 (IsNegative = CLHS->isExactlyValue(-1.0))) {
17648 // fdiv contract 1.0, (sqrt contract x) -> rsq for f16
17649 // fdiv contract -1.0, (sqrt contract x) -> fneg(rsq) for f16
17650 if (RHS.getOpcode() == ISD::FSQRT) {
17651 // TODO: Or in RHS flags, somehow missing from SDNodeFlags
17652 SDValue Rsq =
17653 DAG.getNode(AMDGPUISD::RSQ, SL, VT, RHS.getOperand(0), Flags);
17654 return IsNegative ? DAG.getNode(ISD::FNEG, SL, VT, Rsq, Flags) : Rsq;
17655 }
17656 }
17657 }
17658
17659 return SDValue();
17660}
17661
17662SDValue SITargetLowering::performFMulCombine(SDNode *N,
17663 DAGCombinerInfo &DCI) const {
17664 SelectionDAG &DAG = DCI.DAG;
17665 EVT VT = N->getValueType(0);
17666 EVT ScalarVT = VT.getScalarType();
17667 EVT IntVT = VT.changeElementType(*DAG.getContext(), MVT::i32);
17668
17669 if (!N->isDivergent() && getSubtarget()->hasSALUFloatInsts() &&
17670 (ScalarVT == MVT::f32 || ScalarVT == MVT::f16)) {
17671 // Prefer to use s_mul_f16/f32 instead of v_ldexp_f16/f32.
17672 return SDValue();
17673 }
17674
17675 SDValue LHS = N->getOperand(0);
17676 SDValue RHS = N->getOperand(1);
17677
17678 // It is cheaper to materialize i32 inline constants than f16 or f64
17679 // (or even non-inline f32) values; this is possible via ldexp, as shown
17680 // below:
17681 //
17682 // Given : A = 2^a & B = 2^b ; where a and b are integers.
17683 // fmul x, (select y, A, B) -> ldexp( x, (select i32 y, a, b) )
17684 // fmul x, (select y, -A, -B) -> ldexp( (fneg x), (select i32 y, a, b) )
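// Illustrative worked example (not part of the upstream source): with
// A = 8.0 = 2^3 and B = 0.25 = 2^-2,
//   fmul x, (select y, 8.0, 0.25)   -> ldexp(x, (select i32 y, 3, -2))
//   fmul x, (select y, -8.0, -0.25) -> ldexp(fneg(x), (select i32 y, 3, -2))
// so only the i32 exponents 3 and -2 need to be materialized, both of which
// are inline constants.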
17685 if ((ScalarVT == MVT::f64 || ScalarVT == MVT::f32 || ScalarVT == MVT::f16) &&
17686 (RHS.hasOneUse() && RHS.getOpcode() == ISD::SELECT)) {
17687 const ConstantFPSDNode *TrueNode = isConstOrConstSplatFP(RHS.getOperand(1));
17688 if (!TrueNode)
17689 return SDValue();
17690 const ConstantFPSDNode *FalseNode =
17691 isConstOrConstSplatFP(RHS.getOperand(2));
17692 if (!FalseNode)
17693 return SDValue();
17694
17695 if (TrueNode->isNegative() != FalseNode->isNegative())
17696 return SDValue();
17697
17698 // For f32, only non-inline constants should be transformed.
17699 const SIInstrInfo *TII = getSubtarget()->getInstrInfo();
17700 if (ScalarVT == MVT::f32 &&
17701 TII->isInlineConstant(TrueNode->getValueAPF()) &&
17702 TII->isInlineConstant(FalseNode->getValueAPF()))
17703 return SDValue();
17704
17705 int TrueNodeExpVal = TrueNode->getValueAPF().getExactLog2Abs();
17706 if (TrueNodeExpVal == INT_MIN)
17707 return SDValue();
17708 int FalseNodeExpVal = FalseNode->getValueAPF().getExactLog2Abs();
17709 if (FalseNodeExpVal == INT_MIN)
17710 return SDValue();
17711
17712 SDLoc SL(N);
17713 SDValue SelectNode =
17714 DAG.getNode(ISD::SELECT, SL, IntVT, RHS.getOperand(0),
17715 DAG.getSignedConstant(TrueNodeExpVal, SL, IntVT),
17716 DAG.getSignedConstant(FalseNodeExpVal, SL, IntVT));
17717
17718 LHS = TrueNode->isNegative()
17719 ? DAG.getNode(ISD::FNEG, SL, VT, LHS, LHS->getFlags())
17720 : LHS;
17721
17722 return DAG.getNode(ISD::FLDEXP, SL, VT, LHS, SelectNode, N->getFlags());
17723 }
17724
17725 return SDValue();
17726}
17727
17728SDValue SITargetLowering::performFMACombine(SDNode *N,
17729 DAGCombinerInfo &DCI) const {
17730 SelectionDAG &DAG = DCI.DAG;
17731 EVT VT = N->getValueType(0);
17732 SDLoc SL(N);
17733
17734 if (!Subtarget->hasDot10Insts() || VT != MVT::f32)
17735 return SDValue();
17736
17737 // FMA((F32)S0.x, (F32)S1.x, FMA((F32)S0.y, (F32)S1.y, (F32)z)) ->
17738 // FDOT2((V2F16)S0, (V2F16)S1, (F32)z))
17739 SDValue Op1 = N->getOperand(0);
17740 SDValue Op2 = N->getOperand(1);
17741 SDValue FMA = N->getOperand(2);
17742
17743 if (FMA.getOpcode() != ISD::FMA || Op1.getOpcode() != ISD::FP_EXTEND ||
17744 Op2.getOpcode() != ISD::FP_EXTEND)
17745 return SDValue();
17746
17747 // fdot2_f32_f16 always flushes fp32 denormal operand and output to zero,
17748 // regardless of the denorm mode setting. Therefore,
17749 // fp-contract is sufficient to allow generating fdot2.
17750 const TargetOptions &Options = DAG.getTarget().Options;
17751 if (Options.AllowFPOpFusion == FPOpFusion::Fast ||
17752 (N->getFlags().hasAllowContract() &&
17753 FMA->getFlags().hasAllowContract())) {
17754 Op1 = Op1.getOperand(0);
17755 Op2 = Op2.getOperand(0);
17756 if (Op1.getOpcode() != ISD::EXTRACT_VECTOR_ELT ||
17757 Op2.getOpcode() != ISD::EXTRACT_VECTOR_ELT)
17758 return SDValue();
17759
17760 SDValue Vec1 = Op1.getOperand(0);
17761 SDValue Idx1 = Op1.getOperand(1);
17762 SDValue Vec2 = Op2.getOperand(0);
17763
17764 SDValue FMAOp1 = FMA.getOperand(0);
17765 SDValue FMAOp2 = FMA.getOperand(1);
17766 SDValue FMAAcc = FMA.getOperand(2);
17767
17768 if (FMAOp1.getOpcode() != ISD::FP_EXTEND ||
17769 FMAOp2.getOpcode() != ISD::FP_EXTEND)
17770 return SDValue();
17771
17772 FMAOp1 = FMAOp1.getOperand(0);
17773 FMAOp2 = FMAOp2.getOperand(0);
17774 if (FMAOp1.getOpcode() != ISD::EXTRACT_VECTOR_ELT ||
17775 FMAOp2.getOpcode() != ISD::EXTRACT_VECTOR_ELT)
17776 return SDValue();
17777
17778 SDValue Vec3 = FMAOp1.getOperand(0);
17779 SDValue Vec4 = FMAOp2.getOperand(0);
17780 SDValue Idx2 = FMAOp1.getOperand(1);
17781
17782 if (Idx1 != Op2.getOperand(1) || Idx2 != FMAOp2.getOperand(1) ||
17783 // Idx1 and Idx2 cannot be the same.
17784 Idx1 == Idx2)
17785 return SDValue();
17786
17787 if (Vec1 == Vec2 || Vec3 == Vec4)
17788 return SDValue();
17789
17790 if (Vec1.getValueType() != MVT::v2f16 || Vec2.getValueType() != MVT::v2f16)
17791 return SDValue();
17792
17793 if ((Vec1 == Vec3 && Vec2 == Vec4) || (Vec1 == Vec4 && Vec2 == Vec3)) {
17794 return DAG.getNode(AMDGPUISD::FDOT2, SL, MVT::f32, Vec1, Vec2, FMAAcc,
17795 DAG.getTargetConstant(0, SL, MVT::i1));
17796 }
17797 }
17798 return SDValue();
17799}
17800
17801// Given a double-precision ordered or unordered comparison, return the
17802// condition code for an equivalent integral comparison of the operands' upper
17803// 32 bits, or `SETCC_INVALID` if not possible.
17804// For simplicity, no simplification occurs if the operands are not both known
17805// to have sign bit zero.
17806//
17807// EQ/NE:
17808// If LHS.lo32 == RHS.lo32:
17809// setcc LHS, RHS, eq/ne => setcc LHS.hi32, RHS.hi32, eq/ne
17810// If LHS.lo32 != RHS.lo32:
17811// setcc LHS, RHS, eq/ne => setcc LHS.hi32, RHS.hi32, false/true
17812// The reduction is not possible if operands may be +0 and -0.
17813// For ordered eq / unordered ne, at most one operand may be NaN.
17814// For unordered eq / ordered ne, neither operand can be NaN.
17815//
17816// LT/GE:
17817// If LHS.lo32 >= RHS.lo32 (unsigned):
17818// setcc LHS, RHS, [u]lt/ge => LHS.hi32, RHS.hi32, [u]lt/ge
17819// If LHS.lo32 < RHS.lo32 (unsigned):
17820// setcc LHS, RHS, [u]lt/ge => LHS.hi32, RHS.hi32, [u]le/gt
17821// The reduction is only supported if both operands are nonnegative.
17822// For ordered lt / unordered ge, the RHS cannot be NaN.
17823// For unordered lt / ordered ge, neither operand can be NaN.
17824//
17825// LE/GT:
17826// If LHS.lo32 > RHS.lo32 (unsigned):
17827// setcc LHS, RHS, [u]le/gt => LHS.hi32, RHS.hi32, [u]lt/ge
17828// If LHS.lo32 <= RHS.lo32 (unsigned):
17829// setcc LHS, RHS, [u]le/gt => LHS.hi32, RHS.hi32, [u]le/gt
17830// The reduction is only supported if both operands are nonnegative.
17831// For unordered le / ordered gt, the LHS cannot be NaN.
17832// For ordered le / unordered gt, neither operand can be NaN.
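// Illustrative example (not part of the upstream comment): if LHS and RHS are
// both known nonnegative f64 values, RHS cannot be NaN, and the low 32 bits of
// LHS are known, in the unsigned sense, to be >= the low 32 bits of RHS, then
// "setcc LHS, RHS, olt" reduces to an integer "setcc LHS.hi32, RHS.hi32, lt",
// because for nonnegative IEEE doubles the floating-point order matches the
// unsigned order of the underlying 64-bit bit patterns.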
17833 static ISD::CondCode tryReduceF64CompareToHiHalf(ISD::CondCode CC,
17834 const SDValue LHS,
17835 const SDValue RHS,
17836 const SelectionDAG &DAG) {
17837 EVT VT = LHS.getValueType();
17838 assert(VT == MVT::f64 && "Incorrect operand type!");
17839
17840 const KnownBits RHSBits = DAG.computeKnownBits(RHS);
17841 // Bail if RHS sign bit is not known to be zero.
17842 if (!RHSBits.Zero.isSignBitSet())
17843 return ISD::SETCC_INVALID;
17844
17845 const KnownBits RHSKnownLo32 = RHSBits.trunc(32);
17846 const KnownFPClass RHSFPClass =
17847 DAG.computeKnownFPClass(RHS);
17848 const bool RHSMaybeNaN = !RHSFPClass.isKnownNeverNaN();
17849
17850 const KnownBits LHSBits = DAG.computeKnownBits(LHS);
17851 const KnownBits LHSKnownLo32 = LHSBits.trunc(32);
17852 const KnownFPClass LHSFPClass =
17853 DAG.computeKnownFPClass(LHS);
17854 const bool LHSMaybeNaN = !LHSFPClass.isKnownNeverNaN();
17855
17856 // Bail if LHS sign bit is not known to be zero.
17857 if (!LHSBits.Zero.isSignBitSet())
17858 return ISD::SETCC_INVALID;
17859
17860 switch (CC) {
17861 default:
17862 break;
17863 case ISD::SETEQ:
17864 case ISD::SETOEQ:
17865 case ISD::SETUEQ:
17866 case ISD::SETONE:
17867 case ISD::SETUNE: {
17868 // OEQ should be false if either operand is NaN, so it suffices that at
17869 // least one operand is not NaN.
17870 if (CC == ISD::SETOEQ && LHSMaybeNaN && RHSMaybeNaN)
17871 break;
17872 // UEQ should be true if either operand is NaN, but this cannot be checked
17873 // on underlying bits.
17874 if (CC == ISD::SETUEQ && (LHSMaybeNaN || RHSMaybeNaN))
17875 break;
17876 // ONE should be false if either operand is NaN, but this cannot be
17877 // checked on underlying bits.
17878 if (CC == ISD::SETONE && (LHSMaybeNaN || RHSMaybeNaN))
17879 break;
17880 // UNE should be true if either operand is NaN, so it suffices that they
17881 // are not both NaN.
17882 if (CC == ISD::SETUNE && LHSMaybeNaN && RHSMaybeNaN)
17883 break;
17884
17885 const std::optional<bool> KnownEq =
17886 KnownBits::eq(LHSKnownLo32, RHSKnownLo32);
17887
17888 if (!KnownEq)
17889 break;
17890
17891 if (*KnownEq)
17892 return (CC == ISD::SETEQ || CC == ISD::SETOEQ || CC == ISD::SETUEQ)
17893 ? ISD::SETEQ
17894 : ISD::SETNE;
17895
17896 return (CC == ISD::SETEQ || CC == ISD::SETOEQ || CC == ISD::SETUEQ)
17897 ? ISD::SETFALSE
17898 : ISD::SETTRUE;
17899 }
17900 case ISD::SETLT:
17901 case ISD::SETOLT:
17902 case ISD::SETULT:
17903 case ISD::SETGE:
17904 case ISD::SETOGE:
17905 case ISD::SETUGE: {
17906 // OLT should be false if either operand is NaN.
17907 // Since NaNs have maximum exponent and nonzero mantissa, false positives
17908 // are only possible if the RHS is NaN. (No issue with RHS == +inf since
17909 // the inequality is strict)
17910 if (CC == ISD::SETOLT && RHSMaybeNaN)
17911 break;
17912 // ULT should be true if either operand is NaN, but this cannot be ensured
17913 // with a truncated comparison.
17914 if (CC == ISD::SETULT && (LHSMaybeNaN || RHSMaybeNaN))
17915 break;
17916 // OGE should be false if either operand is NaN, but this cannot be
17917 // ensured with a truncated comparison.
17918 if (CC == ISD::SETOGE && (LHSMaybeNaN || RHSMaybeNaN))
17919 break;
17920 // UGE should be true if either operand is NaN.
17921 // False negatives are only possible if the RHS is NaN.
17922 // (No issue with RHS == +inf since the inequality is inclusive)
17923 if (CC == ISD::SETUGE && RHSMaybeNaN)
17924 break;
17925
17926 const std::optional<bool> KnownUge =
17927 KnownBits::uge(LHSKnownLo32, RHSKnownLo32);
17928
17929 if (!KnownUge)
17930 break;
17931
17932 if (*KnownUge) {
17933 // LHS.lo32 uge RHS.lo32, so LHS >= RHS iff LHS.hi32 >= RHS.hi32
17934 return (CC == ISD::SETLT || CC == ISD::SETOLT || CC == ISD::SETULT)
17935 ? ISD::SETLT
17936 : ISD::SETGE;
17937 }
17938 // LHS.lo32 ult RHS.lo32, so LHS >= RHS iff LHS.hi32 > RHS.hi32
17939 return (CC == ISD::SETLT || CC == ISD::SETOLT || CC == ISD::SETULT)
17940 ? ISD::SETLE
17941 : ISD::SETGT;
17942 }
17943 case ISD::SETLE:
17944 case ISD::SETOLE:
17945 case ISD::SETULE:
17946 case ISD::SETGT:
17947 case ISD::SETOGT:
17948 case ISD::SETUGT: {
17949 // OLE should be false if either operand is NaN, but this cannot be
17950 // ensured with a truncated comparison.
17951 if (CC == ISD::SETOLE && (LHSMaybeNaN || RHSMaybeNaN))
17952 break;
17953 // ULE should be true if either operand is NaN.
17954 // False negatives are only possible if the LHS is NaN.
17955 // (No issue with LHS == +inf since the inequality is inclusive)
17956 if (CC == ISD::SETULE && LHSMaybeNaN)
17957 break;
17958 // OGT should be false if either operand is NaN.
17959 // False positives are only possible if the LHS is NaN.
17960 // (No issue with LHS == +inf since the inequality is strict)
17961 if (CC == ISD::SETOGT && LHSMaybeNaN)
17962 break;
17963 // UGT should be true if either operand is NaN, but this cannot be ensured
17964 // with a truncated comparison.
17965 if (CC == ISD::SETUGT && (LHSMaybeNaN || RHSMaybeNaN))
17966 break;
17967
17968 const std::optional<bool> KnownUle =
17969 KnownBits::ule(LHSKnownLo32, RHSKnownLo32);
17970
17971 if (!KnownUle)
17972 break;
17973
17974 if (*KnownUle) {
17975 // LHS.lo32 ule RHS.lo32, so LHS <= RHS iff LHS.hi32 <= RHS.hi32
17976 return (CC == ISD::SETLE || CC == ISD::SETOLE || CC == ISD::SETULE)
17977 ? ISD::SETLE
17978 : ISD::SETGT;
17979 }
17980 // LHS.lo32 ugt RHS.lo32, so LHS <= RHS iff LHS.hi32 < RHS.hi32
17981 return (CC == ISD::SETLE || CC == ISD::SETOLE || CC == ISD::SETULE)
17982 ? ISD::SETLT
17983 : ISD::SETGE;
17984 }
17985 }
17986
17987 return ISD::SETCC_INVALID;
17988}
17989
17990SDValue SITargetLowering::performSetCCCombine(SDNode *N,
17991 DAGCombinerInfo &DCI) const {
17992 SelectionDAG &DAG = DCI.DAG;
17993 SDLoc SL(N);
17994
17995 SDValue LHS = N->getOperand(0);
17996 SDValue RHS = N->getOperand(1);
17997 EVT VT = LHS.getValueType();
17998 ISD::CondCode CC = cast<CondCodeSDNode>(N->getOperand(2))->get();
17999
18000 auto *CRHS = dyn_cast<ConstantSDNode>(RHS);
18001 if (!CRHS) {
18002 CRHS = dyn_cast<ConstantSDNode>(LHS);
18003 if (CRHS) {
18004 std::swap(LHS, RHS);
18005 CC = getSetCCSwappedOperands(CC);
18006 }
18007 }
18008
18009 if (CRHS) {
18010 if (VT == MVT::i32 && LHS.getOpcode() == ISD::SIGN_EXTEND &&
18011 isBoolSGPR(LHS.getOperand(0))) {
18012 // setcc (sext from i1 cc), -1, ne|sgt|ult) => not cc => xor cc, -1
18013 // setcc (sext from i1 cc), -1, eq|sle|uge) => cc
18014 // setcc (sext from i1 cc), 0, eq|sge|ule) => not cc => xor cc, -1
18015 // setcc (sext from i1 cc), 0, ne|ugt|slt) => cc
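// Illustrative trace (not part of the upstream comment): sext i1 true is -1
// and sext i1 false is 0, so e.g. "setcc (sext i1 cc), -1, eq" is true exactly
// when cc is true and folds to cc, while "setcc (sext i1 cc), 0, eq" is the
// logical negation of cc and folds to xor(cc, -1).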
18016 if ((CRHS->isAllOnes() &&
18017 (CC == ISD::SETNE || CC == ISD::SETGT || CC == ISD::SETULT)) ||
18018 (CRHS->isZero() &&
18019 (CC == ISD::SETEQ || CC == ISD::SETGE || CC == ISD::SETULE)))
18020 return DAG.getNode(ISD::XOR, SL, MVT::i1, LHS.getOperand(0),
18021 DAG.getAllOnesConstant(SL, MVT::i1));
18022 if ((CRHS->isAllOnes() &&
18023 (CC == ISD::SETEQ || CC == ISD::SETLE || CC == ISD::SETUGE)) ||
18024 (CRHS->isZero() &&
18025 (CC == ISD::SETNE || CC == ISD::SETUGT || CC == ISD::SETLT)))
18026 return LHS.getOperand(0);
18027 }
18028
18029 const APInt &CRHSVal = CRHS->getAPIntValue();
18030 if ((CC == ISD::SETEQ || CC == ISD::SETNE) &&
18031 LHS.getOpcode() == ISD::SELECT &&
18032 isa<ConstantSDNode>(LHS.getOperand(1)) &&
18033 isa<ConstantSDNode>(LHS.getOperand(2)) &&
18034 isBoolSGPR(LHS.getOperand(0))) {
18035 // Given CT != FT:
18036 // setcc (select cc, CT, CF), CF, eq => xor cc, -1
18037 // setcc (select cc, CT, CF), CF, ne => cc
18038 // setcc (select cc, CT, CF), CT, ne => xor cc, -1
18039 // setcc (select cc, CT, CF), CT, eq => cc
18040 const APInt &CT = LHS.getConstantOperandAPInt(1);
18041 const APInt &CF = LHS.getConstantOperandAPInt(2);
18042
18043 if (CT != CF) {
18044 if ((CF == CRHSVal && CC == ISD::SETEQ) ||
18045 (CT == CRHSVal && CC == ISD::SETNE))
18046 return DAG.getNOT(SL, LHS.getOperand(0), MVT::i1);
18047 if ((CF == CRHSVal && CC == ISD::SETNE) ||
18048 (CT == CRHSVal && CC == ISD::SETEQ))
18049 return LHS.getOperand(0);
18050 }
18051 }
18052 }
18053
18054 // Truncate 64-bit setcc to test only upper 32-bits of its operands in the
18055 // following cases where information about the lower 32-bits of its operands
18056 // is known:
18057 //
18058 // If LHS.lo32 == RHS.lo32:
18059 // setcc LHS, RHS, eq/ne => setcc LHS.hi32, RHS.hi32, eq/ne
18060 // If LHS.lo32 != RHS.lo32:
18061 // setcc LHS, RHS, eq/ne => setcc LHS.hi32, RHS.hi32, false/true
18062 // If LHS.lo32 >= RHS.lo32 (unsigned):
18063 // setcc LHS, RHS, [u]lt/ge => LHS.hi32, RHS.hi32, [u]lt/ge
18064 // If LHS.lo32 > RHS.lo32 (unsigned):
18065 // setcc LHS, RHS, [u]le/gt => LHS.hi32, RHS.hi32, [u]lt/ge
18066 // If LHS.lo32 <= RHS.lo32 (unsigned):
18067 // setcc LHS, RHS, [u]le/gt => LHS.hi32, RHS.hi32, [u]le/gt
18068 // If LHS.lo32 < RHS.lo32 (unsigned):
18069 // setcc LHS, RHS, [u]lt/ge => LHS.hi32, RHS.hi32, [u]le/gt
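// Illustrative example (not part of the upstream comment): if the low 32 bits
// of LHS are known to be all ones and the low 32 bits of RHS are known to be
// zero, then KnownBits::uge proves LHS.lo32 >= RHS.lo32, so a 64-bit
// "setcc LHS, RHS, ult" can be answered by "setcc LHS.hi32, RHS.hi32, ult"
// alone.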
18070 if (VT == MVT::i64) {
18071 const KnownBits LHSKnownLo32 = DAG.computeKnownBits(LHS).trunc(32);
18072 const KnownBits RHSKnownLo32 = DAG.computeKnownBits(RHS).trunc(32);
18073
18074 // NewCC is valid iff we can truncate the setcc to only test the upper 32
18075 // bits
18076 ISD::CondCode NewCC = ISD::SETCC_INVALID;
18077
18078 switch (CC) {
18079 default:
18080 break;
18081 case ISD::SETEQ: {
18082 const std::optional<bool> KnownEq =
18083 KnownBits::eq(LHSKnownLo32, RHSKnownLo32);
18084 if (KnownEq)
18085 NewCC = *KnownEq ? ISD::SETEQ : ISD::SETFALSE;
18086
18087 break;
18088 }
18089 case ISD::SETNE: {
18090 const std::optional<bool> KnownEq =
18091 KnownBits::eq(LHSKnownLo32, RHSKnownLo32);
18092 if (KnownEq)
18093 NewCC = *KnownEq ? ISD::SETNE : ISD::SETTRUE;
18094
18095 break;
18096 }
18097 case ISD::SETULT:
18098 case ISD::SETUGE:
18099 case ISD::SETLT:
18100 case ISD::SETGE: {
18101 const std::optional<bool> KnownUge =
18102 KnownBits::uge(LHSKnownLo32, RHSKnownLo32);
18103 if (KnownUge) {
18104 if (*KnownUge) {
18105 // LHS.lo32 uge RHS.lo32, so LHS >= RHS iff LHS.hi32 >= RHS.hi32
18106 NewCC = CC;
18107 } else {
18108 // LHS.lo32 ult RHS.lo32, so LHS >= RHS iff LHS.hi32 > RHS.hi32
18109 NewCC = CC == ISD::SETULT ? ISD::SETULE
18110 : CC == ISD::SETUGE ? ISD::SETUGT
18111 : CC == ISD::SETLT ? ISD::SETLE
18112 : ISD::SETGT;
18113 }
18114 }
18115 break;
18116 }
18117 case ISD::SETULE:
18118 case ISD::SETUGT:
18119 case ISD::SETLE:
18120 case ISD::SETGT: {
18121 const std::optional<bool> KnownUle =
18122 KnownBits::ule(LHSKnownLo32, RHSKnownLo32);
18123 if (KnownUle) {
18124 if (*KnownUle) {
18125 // LHS.lo32 ule RHS.lo32, so LHS <= RHS iff LHS.hi32 <= RHS.hi32
18126 NewCC = CC;
18127 } else {
18128 // LHS.lo32 ugt RHS.lo32, so LHS <= RHS iff LHS.hi32 < RHS.hi32
18129 NewCC = CC == ISD::SETULE ? ISD::SETULT
18130 : CC == ISD::SETUGT ? ISD::SETUGE
18131 : CC == ISD::SETLE ? ISD::SETLT
18132 : ISD::SETGE;
18133 }
18134 }
18135 break;
18136 }
18137 }
18138
18139 if (NewCC != ISD::SETCC_INVALID)
18140 return DAG.getSetCC(SL, N->getValueType(0), getHiHalf64(LHS, DAG),
18141 getHiHalf64(RHS, DAG), NewCC);
18142 }
18143
18144 // Eliminate setcc by using carryout from add/sub instruction
18145
18146 // LHS = ADD i64 RHS, Z LHSlo = UADDO i32 RHSlo, Zlo
18147 // setcc LHS ult RHS -> LHSHi = UADDO_CARRY i32 RHShi, Zhi
18148 // similarly for subtraction
18149
18150 // LHS = ADD i64 Y, 1 LHSlo = UADDO i32 Ylo, 1
18151 // setcc LHS eq 0 -> LHSHi = UADDO_CARRY i32 Yhi, 0
18152
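// Illustrative example (not part of the upstream source): for
//   %sum = add i64 %rhs, %z
//   %cc  = icmp ult i64 %sum, %rhs
// the combine below rebuilds %sum from a 32-bit UADDO/UADDO_CARRY pair and
// returns the final carry-out directly as %cc, avoiding a separate 64-bit
// compare.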
18153 if (VT == MVT::i64 && ((CC == ISD::SETULT &&
18154 sd_match(LHS, m_Add(m_Specific(RHS), m_Value()))) ||
18155 (CC == ISD::SETUGT &&
18156 sd_match(LHS, m_Sub(m_Specific(RHS), m_Value()))) ||
18157 (CC == ISD::SETEQ && CRHS && CRHS->isZero() &&
18158 sd_match(LHS, m_Add(m_Value(), m_One()))))) {
18159 bool IsAdd = LHS.getOpcode() == ISD::ADD;
18160
18161 SDValue Op0 = LHS.getOperand(0);
18162 SDValue Op1 = LHS.getOperand(1);
18163
18164 SDValue Op0Lo = DAG.getNode(ISD::TRUNCATE, SL, MVT::i32, Op0);
18165 SDValue Op1Lo = DAG.getNode(ISD::TRUNCATE, SL, MVT::i32, Op1);
18166
18167 SDValue Op0Hi = getHiHalf64(Op0, DAG);
18168 SDValue Op1Hi = getHiHalf64(Op1, DAG);
18169
18170 SDValue NodeLo =
18171 DAG.getNode(IsAdd ? ISD::UADDO : ISD::USUBO, SL,
18172 DAG.getVTList(MVT::i32, MVT::i1), {Op0Lo, Op1Lo});
18173
18174 SDValue CarryInHi = NodeLo.getValue(1);
18175 SDValue NodeHi = DAG.getNode(IsAdd ? ISD::UADDO_CARRY : ISD::USUBO_CARRY,
18176 SL, DAG.getVTList(MVT::i32, MVT::i1),
18177 {Op0Hi, Op1Hi, CarryInHi});
18178
18179 SDValue ResultLo = NodeLo.getValue(0);
18180 SDValue ResultHi = NodeHi.getValue(0);
18181
18182 SDValue JoinedResult =
18183 DAG.getBuildVector(MVT::v2i32, SL, {ResultLo, ResultHi});
18184
18185 SDValue Result = DAG.getNode(ISD::BITCAST, SL, VT, JoinedResult);
18186 SDValue Overflow = NodeHi.getValue(1);
18187 DCI.CombineTo(LHS.getNode(), Result);
18188 return Overflow;
18189 }
18190
18191 if (VT != MVT::f32 && VT != MVT::f64 &&
18192 (!Subtarget->has16BitInsts() || VT != MVT::f16))
18193 return SDValue();
18194
18195 // Match isinf/isfinite pattern
18196 // (fcmp oeq (fabs x), inf) -> (fp_class x, (p_infinity | n_infinity))
18197 // (fcmp one (fabs x), inf) -> (fp_class x,
18198 // (p_normal | n_normal | p_subnormal | n_subnormal | p_zero | n_zero)
18199 if ((CC == ISD::SETOEQ || CC == ISD::SETONE) &&
18200 LHS.getOpcode() == ISD::FABS) {
18201 const ConstantFPSDNode *CRHS = dyn_cast<ConstantFPSDNode>(RHS);
18202 if (!CRHS)
18203 return SDValue();
18204
18205 const APFloat &APF = CRHS->getValueAPF();
18206 if (APF.isInfinity() && !APF.isNegative()) {
18207 const unsigned IsInfMask =
18208 SIInstrFlags::P_INFINITY | SIInstrFlags::N_INFINITY;
18209 const unsigned IsFiniteMask =
18210 SIInstrFlags::N_ZERO | SIInstrFlags::P_ZERO | SIInstrFlags::N_NORMAL |
18211 SIInstrFlags::P_NORMAL | SIInstrFlags::N_SUBNORMAL |
18212 SIInstrFlags::P_SUBNORMAL;
18213 unsigned Mask = CC == ISD::SETOEQ ? IsInfMask : IsFiniteMask;
18214 return DAG.getNode(AMDGPUISD::FP_CLASS, SL, MVT::i1, LHS.getOperand(0),
18215 DAG.getConstant(Mask, SL, MVT::i32));
18216 }
18217 }
18218
18219 if (VT == MVT::f64) {
18220 ISD::CondCode HiHalfCC = tryReduceF64CompareToHiHalf(CC, LHS, RHS, DAG);
18221 if (HiHalfCC != ISD::SETCC_INVALID)
18222 return DAG.getSetCC(SL, N->getValueType(0), getHiHalf64(LHS, DAG),
18223 getHiHalf64(RHS, DAG), HiHalfCC);
18224 }
18225
18226 return SDValue();
18227}
18228
18229SDValue
18230SITargetLowering::performCvtF32UByteNCombine(SDNode *N,
18231 DAGCombinerInfo &DCI) const {
18232 SelectionDAG &DAG = DCI.DAG;
18233 SDLoc SL(N);
18234 unsigned Offset = N->getOpcode() - AMDGPUISD::CVT_F32_UBYTE0;
18235
18236 SDValue Src = N->getOperand(0);
18237 SDValue Shift = N->getOperand(0);
18238
18239 // TODO: Extend type shouldn't matter (assuming legal types).
18240 if (Shift.getOpcode() == ISD::ZERO_EXTEND)
18241 Shift = Shift.getOperand(0);
18242
18243 if (Shift.getOpcode() == ISD::SRL || Shift.getOpcode() == ISD::SHL) {
18244 // cvt_f32_ubyte1 (shl x, 8) -> cvt_f32_ubyte0 x
18245 // cvt_f32_ubyte3 (shl x, 16) -> cvt_f32_ubyte1 x
18246 // cvt_f32_ubyte0 (srl x, 16) -> cvt_f32_ubyte2 x
18247 // cvt_f32_ubyte1 (srl x, 16) -> cvt_f32_ubyte3 x
18248 // cvt_f32_ubyte0 (srl x, 8) -> cvt_f32_ubyte1 x
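// Illustrative arithmetic (not part of the upstream source): for
// cvt_f32_ubyte0 (srl x, 16) the code below computes
// ShiftOffset = 8*0 + 16 = 16, which is a multiple of 8 and less than 32,
// so the node is rewritten as cvt_f32_ubyte2 x.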
18249 if (auto *C = dyn_cast<ConstantSDNode>(Shift.getOperand(1))) {
18250 SDValue Shifted = DAG.getZExtOrTrunc(
18251 Shift.getOperand(0), SDLoc(Shift.getOperand(0)), MVT::i32);
18252
18253 unsigned ShiftOffset = 8 * Offset;
18254 if (Shift.getOpcode() == ISD::SHL)
18255 ShiftOffset -= C->getZExtValue();
18256 else
18257 ShiftOffset += C->getZExtValue();
18258
18259 if (ShiftOffset < 32 && (ShiftOffset % 8) == 0) {
18260 return DAG.getNode(AMDGPUISD::CVT_F32_UBYTE0 + ShiftOffset / 8, SL,
18261 MVT::f32, Shifted);
18262 }
18263 }
18264 }
18265
18266 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
18267 APInt DemandedBits = APInt::getBitsSet(32, 8 * Offset, 8 * Offset + 8);
18268 if (TLI.SimplifyDemandedBits(Src, DemandedBits, DCI)) {
18269 // We simplified Src. If this node is not dead, visit it again so it is
18270 // folded properly.
18271 if (N->getOpcode() != ISD::DELETED_NODE)
18272 DCI.AddToWorklist(N);
18273 return SDValue(N, 0);
18274 }
18275
18276 // Handle (or x, (srl y, 8)) pattern when known bits are zero.
18277 if (SDValue DemandedSrc =
18278 TLI.SimplifyMultipleUseDemandedBits(Src, DemandedBits, DAG))
18279 return DAG.getNode(N->getOpcode(), SL, MVT::f32, DemandedSrc);
18280
18281 return SDValue();
18282}
18283
18284SDValue SITargetLowering::performClampCombine(SDNode *N,
18285 DAGCombinerInfo &DCI) const {
18286 ConstantFPSDNode *CSrc = dyn_cast<ConstantFPSDNode>(N->getOperand(0));
18287 if (!CSrc)
18288 return SDValue();
18289
18290 const MachineFunction &MF = DCI.DAG.getMachineFunction();
18291 const APFloat &F = CSrc->getValueAPF();
18292 APFloat Zero = APFloat::getZero(F.getSemantics());
18293 if (F < Zero ||
18294 (F.isNaN() && MF.getInfo<SIMachineFunctionInfo>()->getMode().DX10Clamp)) {
18295 return DCI.DAG.getConstantFP(Zero, SDLoc(N), N->getValueType(0));
18296 }
18297
18298 APFloat One(F.getSemantics(), "1.0");
18299 if (F > One)
18300 return DCI.DAG.getConstantFP(One, SDLoc(N), N->getValueType(0));
18301
18302 return SDValue(CSrc, 0);
18303}
18304
18305SDValue SITargetLowering::performSelectCombine(SDNode *N,
18306 DAGCombinerInfo &DCI) const {
18307
18308 // Try to fold CMP + SELECT patterns with shared constants (both FP and
18309 // integer).
18310 // Detect when CMP and SELECT use the same constant and fold them to avoid
18311 // loading the constant twice. Specifically handles patterns like:
18312 // %cmp = icmp eq i32 %val, 4242
18313 // %sel = select i1 %cmp, i32 4242, i32 %other
18314 // It can be optimized to reuse %val instead of 4242 in select.
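// Illustrative result (not part of the upstream comment): the example above
// becomes
//   %cmp = icmp eq i32 %val, 4242
//   %sel = select i1 %cmp, i32 %val, i32 %other
// which is equivalent because %cmp being true implies %val == 4242, and it
// avoids encoding the non-inline literal 4242 a second time.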
18315 SDValue Cond = N->getOperand(0);
18316 SDValue TrueVal = N->getOperand(1);
18317 SDValue FalseVal = N->getOperand(2);
18318
18319 // Check if condition is a comparison.
18320 if (Cond.getOpcode() != ISD::SETCC)
18321 return SDValue();
18322
18323 SDValue LHS = Cond.getOperand(0);
18324 SDValue RHS = Cond.getOperand(1);
18325 ISD::CondCode CC = cast<CondCodeSDNode>(Cond.getOperand(2))->get();
18326
18327 bool isFloatingPoint = LHS.getValueType().isFloatingPoint();
18328 bool isInteger = LHS.getValueType().isInteger();
18329
18330 // Handle simple floating-point and integer types only.
18331 if (!isFloatingPoint && !isInteger)
18332 return SDValue();
18333
18334 bool isEquality = CC == (isFloatingPoint ? ISD::SETOEQ : ISD::SETEQ);
18335 bool isNonEquality = CC == (isFloatingPoint ? ISD::SETONE : ISD::SETNE);
18336 if (!isEquality && !isNonEquality)
18337 return SDValue();
18338
18339 SDValue ArgVal, ConstVal;
18340 if ((isFloatingPoint && isa<ConstantFPSDNode>(RHS)) ||
18341 (isInteger && isa<ConstantSDNode>(RHS))) {
18342 ConstVal = RHS;
18343 ArgVal = LHS;
18344 } else if ((isFloatingPoint && isa<ConstantFPSDNode>(LHS)) ||
18345 (isInteger && isa<ConstantSDNode>(LHS))) {
18346 ConstVal = LHS;
18347 ArgVal = RHS;
18348 } else {
18349 return SDValue();
18350 }
18351
18352 // Skip optimization for inlinable immediates.
18353 if (isFloatingPoint) {
18354 const APFloat &Val = cast<ConstantFPSDNode>(ConstVal)->getValueAPF();
18355 if (!Val.isNormal() || Subtarget->getInstrInfo()->isInlineConstant(Val))
18356 return SDValue();
18357 } else {
18358 const std::optional<int64_t> Val =
18359 cast<ConstantSDNode>(ConstVal)->getAPIntValue().trySExtValue();
18360 if (Val && AMDGPU::isInlinableIntLiteral(*Val))
18361 return SDValue();
18362 }
18363
18364 // For equality and non-equality comparisons, patterns:
18365 // select (setcc x, const), const, y -> select (setcc x, const), x, y
18366 // select (setccinv x, const), y, const -> select (setccinv x, const), y, x
18367 if (!(isEquality && TrueVal == ConstVal) &&
18368 !(isNonEquality && FalseVal == ConstVal))
18369 return SDValue();
18370
18371 SDValue SelectLHS = (isEquality && TrueVal == ConstVal) ? ArgVal : TrueVal;
18372 SDValue SelectRHS =
18373 (isNonEquality && FalseVal == ConstVal) ? ArgVal : FalseVal;
18374 return DCI.DAG.getNode(ISD::SELECT, SDLoc(N), N->getValueType(0), Cond,
18375 SelectLHS, SelectRHS);
18376}
18377
18378 SDValue SITargetLowering::PerformDAGCombine(SDNode *N,
18379 DAGCombinerInfo &DCI) const {
18380 switch (N->getOpcode()) {
18381 case ISD::ADD:
18382 case ISD::SUB:
18383 case ISD::SHL:
18384 case ISD::SRL:
18385 case ISD::SRA:
18386 case ISD::AND:
18387 case ISD::OR:
18388 case ISD::XOR:
18389 case ISD::MUL:
18390 case ISD::SETCC:
18391 case ISD::SELECT:
18392 case ISD::SMIN:
18393 case ISD::SMAX:
18394 case ISD::UMIN:
18395 case ISD::UMAX:
18396 if (auto Res = promoteUniformOpToI32(SDValue(N, 0), DCI))
18397 return Res;
18398 break;
18399 default:
18400 break;
18401 }
18402
18403 if (getTargetMachine().getOptLevel() == CodeGenOptLevel::None)
18404 return SDValue();
18405
18406 switch (N->getOpcode()) {
18407 case ISD::ADD:
18408 return performAddCombine(N, DCI);
18409 case ISD::PTRADD:
18410 return performPtrAddCombine(N, DCI);
18411 case ISD::SUB:
18412 return performSubCombine(N, DCI);
18413 case ISD::UADDO_CARRY:
18414 case ISD::USUBO_CARRY:
18415 return performAddCarrySubCarryCombine(N, DCI);
18416 case ISD::FADD:
18417 return performFAddCombine(N, DCI);
18418 case ISD::FSUB:
18419 return performFSubCombine(N, DCI);
18420 case ISD::FDIV:
18421 return performFDivCombine(N, DCI);
18422 case ISD::FMUL:
18423 return performFMulCombine(N, DCI);
18424 case ISD::SETCC:
18425 return performSetCCCombine(N, DCI);
18426 case ISD::SELECT:
18427 if (auto Res = performSelectCombine(N, DCI))
18428 return Res;
18429 break;
18430 case ISD::FMAXNUM:
18431 case ISD::FMINNUM:
18432 case ISD::FMAXNUM_IEEE:
18433 case ISD::FMINNUM_IEEE:
18434 case ISD::FMAXIMUM:
18435 case ISD::FMINIMUM:
18436 case ISD::FMAXIMUMNUM:
18437 case ISD::FMINIMUMNUM:
18438 case ISD::SMAX:
18439 case ISD::SMIN:
18440 case ISD::UMAX:
18441 case ISD::UMIN:
18442 case AMDGPUISD::FMIN_LEGACY:
18443 case AMDGPUISD::FMAX_LEGACY:
18444 return performMinMaxCombine(N, DCI);
18445 case ISD::FMA:
18446 return performFMACombine(N, DCI);
18447 case ISD::AND:
18448 return performAndCombine(N, DCI);
18449 case ISD::OR:
18450 return performOrCombine(N, DCI);
18451 case ISD::FSHR: {
18452 const SIInstrInfo *TII = getSubtarget()->getInstrInfo();
18453 if (N->getValueType(0) == MVT::i32 && N->isDivergent() &&
18454 TII->pseudoToMCOpcode(AMDGPU::V_PERM_B32_e64) != -1) {
18455 return matchPERM(N, DCI);
18456 }
18457 break;
18458 }
18459 case ISD::XOR:
18460 return performXorCombine(N, DCI);
18461 case ISD::ANY_EXTEND:
18462 case ISD::ZERO_EXTEND:
18463 return performZeroOrAnyExtendCombine(N, DCI);
18464 case ISD::SIGN_EXTEND_INREG:
18465 return performSignExtendInRegCombine(N, DCI);
18466 case AMDGPUISD::FP_CLASS:
18467 return performClassCombine(N, DCI);
18468 case ISD::FCANONICALIZE:
18469 return performFCanonicalizeCombine(N, DCI);
18470 case AMDGPUISD::RCP:
18471 return performRcpCombine(N, DCI);
18472 case ISD::FLDEXP:
18473 case AMDGPUISD::FRACT:
18474 case AMDGPUISD::RSQ:
18475 case AMDGPUISD::RCP_LEGACY:
18476 case AMDGPUISD::RCP_IFLAG:
18477 case AMDGPUISD::RSQ_CLAMP: {
18478 // FIXME: This is probably wrong. If src is an sNaN, it won't be quieted
18479 SDValue Src = N->getOperand(0);
18480 if (Src.isUndef())
18481 return Src;
18482 break;
18483 }
18484 case ISD::SINT_TO_FP:
18485 case ISD::UINT_TO_FP:
18486 return performUCharToFloatCombine(N, DCI);
18487 case ISD::FCOPYSIGN:
18488 return performFCopySignCombine(N, DCI);
18489 case AMDGPUISD::CVT_F32_UBYTE0:
18490 case AMDGPUISD::CVT_F32_UBYTE1:
18491 case AMDGPUISD::CVT_F32_UBYTE2:
18492 case AMDGPUISD::CVT_F32_UBYTE3:
18493 return performCvtF32UByteNCombine(N, DCI);
18494 case AMDGPUISD::FMED3:
18495 return performFMed3Combine(N, DCI);
18496 case AMDGPUISD::CVT_PKRTZ_F16_F32:
18497 return performCvtPkRTZCombine(N, DCI);
18498 case AMDGPUISD::CLAMP:
18499 return performClampCombine(N, DCI);
18500 case ISD::SCALAR_TO_VECTOR: {
18501 SelectionDAG &DAG = DCI.DAG;
18502 EVT VT = N->getValueType(0);
18503
18504 // v2i16 (scalar_to_vector i16:x) -> v2i16 (bitcast (any_extend i16:x))
18505 if (VT == MVT::v2i16 || VT == MVT::v2f16 || VT == MVT::v2bf16) {
18506 SDLoc SL(N);
18507 SDValue Src = N->getOperand(0);
18508 EVT EltVT = Src.getValueType();
18509 if (EltVT != MVT::i16)
18510 Src = DAG.getNode(ISD::BITCAST, SL, MVT::i16, Src);
18511
18512 SDValue Ext = DAG.getNode(ISD::ANY_EXTEND, SL, MVT::i32, Src);
18513 return DAG.getNode(ISD::BITCAST, SL, VT, Ext);
18514 }
18515
18516 break;
18517 }
18518 case ISD::EXTRACT_VECTOR_ELT:
18519 return performExtractVectorEltCombine(N, DCI);
18520 case ISD::INSERT_VECTOR_ELT:
18521 return performInsertVectorEltCombine(N, DCI);
18522 case ISD::FP_ROUND:
18523 return performFPRoundCombine(N, DCI);
18524 case ISD::LOAD: {
18525 if (SDValue Widened = widenLoad(cast<LoadSDNode>(N), DCI))
18526 return Widened;
18527 [[fallthrough]];
18528 }
18529 default: {
18530 if (!DCI.isBeforeLegalize()) {
18531 if (MemSDNode *MemNode = dyn_cast<MemSDNode>(N))
18532 return performMemSDNodeCombine(MemNode, DCI);
18533 }
18534
18535 break;
18536 }
18537 }
18538
18539 return AMDGPUTargetLowering::PerformDAGCombine(N, DCI);
18540 }
18541
18542/// Helper function for adjustWritemask
18543static unsigned SubIdx2Lane(unsigned Idx) {
18544 switch (Idx) {
18545 default:
18546 return ~0u;
18547 case AMDGPU::sub0:
18548 return 0;
18549 case AMDGPU::sub1:
18550 return 1;
18551 case AMDGPU::sub2:
18552 return 2;
18553 case AMDGPU::sub3:
18554 return 3;
18555 case AMDGPU::sub4:
18556 return 4; // Possible with TFE/LWE
18557 }
18558}
18559
18560/// Adjust the writemask of MIMG, VIMAGE or VSAMPLE instructions
18561SDNode *SITargetLowering::adjustWritemask(MachineSDNode *&Node,
18562 SelectionDAG &DAG) const {
18563 unsigned Opcode = Node->getMachineOpcode();
18564
18565 // Subtract 1 because the vdata output is not a MachineSDNode operand.
18566 int D16Idx = AMDGPU::getNamedOperandIdx(Opcode, AMDGPU::OpName::d16) - 1;
18567 if (D16Idx >= 0 && Node->getConstantOperandVal(D16Idx))
18568 return Node; // not implemented for D16
18569
18570 SDNode *Users[5] = {nullptr};
18571 unsigned Lane = 0;
18572 unsigned DmaskIdx =
18573 AMDGPU::getNamedOperandIdx(Opcode, AMDGPU::OpName::dmask) - 1;
18574 unsigned OldDmask = Node->getConstantOperandVal(DmaskIdx);
18575 unsigned NewDmask = 0;
18576 unsigned TFEIdx = AMDGPU::getNamedOperandIdx(Opcode, AMDGPU::OpName::tfe) - 1;
18577 unsigned LWEIdx = AMDGPU::getNamedOperandIdx(Opcode, AMDGPU::OpName::lwe) - 1;
18578 bool UsesTFC = (int(TFEIdx) >= 0 && Node->getConstantOperandVal(TFEIdx)) ||
18579 (int(LWEIdx) >= 0 && Node->getConstantOperandVal(LWEIdx));
18580 unsigned TFCLane = 0;
18581 bool HasChain = Node->getNumValues() > 1;
18582
18583 if (OldDmask == 0) {
18584 // These are folded out, but on the chance it happens don't assert.
18585 return Node;
18586 }
18587
18588 unsigned OldBitsSet = llvm::popcount(OldDmask);
18589 // Work out which is the TFE/LWE lane if that is enabled.
18590 if (UsesTFC) {
18591 TFCLane = OldBitsSet;
18592 }
18593
18594 // Try to figure out the used register components
18595 for (SDUse &Use : Node->uses()) {
18596
18597 // Don't look at users of the chain.
18598 if (Use.getResNo() != 0)
18599 continue;
18600
18601 SDNode *User = Use.getUser();
18602
18603 // Abort if we can't understand the usage
18604 if (!User->isMachineOpcode() ||
18605 User->getMachineOpcode() != TargetOpcode::EXTRACT_SUBREG)
18606 return Node;
18607
18608 // Lane means which subreg of %vgpra_vgprb_vgprc_vgprd is used.
18609 // Note that subregs are packed, i.e. Lane==0 is the first bit set
18610 // in OldDmask, so it can be any of X,Y,Z,W; Lane==1 is the second bit
18611 // set, etc.
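// Illustrative example (not part of the upstream comment): if
// OldDmask == 0b1010 only the Y and W components are live, so Lane == 0
// refers to Y (the first set bit) and Lane == 1 refers to W (the second set
// bit).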
18612 Lane = SubIdx2Lane(User->getConstantOperandVal(1));
18613 if (Lane == ~0u)
18614 return Node;
18615
18616 // Check if the use is for the TFE/LWE generated result at VGPRn+1.
18617 if (UsesTFC && Lane == TFCLane) {
18618 Users[Lane] = User;
18619 } else {
18620 // Set which texture component corresponds to the lane.
18621 unsigned Comp;
18622 for (unsigned i = 0, Dmask = OldDmask; (i <= Lane) && (Dmask != 0); i++) {
18623 Comp = llvm::countr_zero(Dmask);
18624 Dmask &= ~(1 << Comp);
18625 }
18626
18627 // Abort if we have more than one user per component.
18628 if (Users[Lane])
18629 return Node;
18630
18631 Users[Lane] = User;
18632 NewDmask |= 1 << Comp;
18633 }
18634 }
18635
18636 // Don't allow 0 dmask, as hardware assumes one channel enabled.
18637 bool NoChannels = !NewDmask;
18638 if (NoChannels) {
18639 if (!UsesTFC) {
18640 // No uses of the result and not using TFC. Then do nothing.
18641 return Node;
18642 }
18643 // If the original dmask has one channel - then nothing to do
18644 if (OldBitsSet == 1)
18645 return Node;
18646 // Use an arbitrary dmask - required for the instruction to work
18647 NewDmask = 1;
18648 }
18649 // Abort if there's no change
18650 if (NewDmask == OldDmask)
18651 return Node;
18652
18653 unsigned BitsSet = llvm::popcount(NewDmask);
18654
18655 // Check for TFE or LWE - increase the number of channels by one to account
18656 // for the extra return value
18657 // This will need adjustment for D16 if this is also included in
18658 // adjustWriteMask (this function) but at present D16 are excluded.
18659 unsigned NewChannels = BitsSet + UsesTFC;
18660
18661 int NewOpcode =
18662 AMDGPU::getMaskedMIMGOp(Node->getMachineOpcode(), NewChannels);
18663 assert(NewOpcode != -1 &&
18664 NewOpcode != static_cast<int>(Node->getMachineOpcode()) &&
18665 "failed to find equivalent MIMG op");
18666
18667 // Adjust the writemask in the node
18668 SmallVector<SDValue, 12> Ops;
18669 llvm::append_range(Ops, Node->ops().take_front(DmaskIdx));
18670 Ops.push_back(DAG.getTargetConstant(NewDmask, SDLoc(Node), MVT::i32));
18671 llvm::append_range(Ops, Node->ops().drop_front(DmaskIdx + 1));
18672
18673 MVT SVT = Node->getValueType(0).getVectorElementType().getSimpleVT();
18674
18675 MVT ResultVT = NewChannels == 1
18676 ? SVT
18677 : MVT::getVectorVT(SVT, NewChannels == 3 ? 4
18678 : NewChannels == 5 ? 8
18679 : NewChannels);
18680 SDVTList NewVTList =
18681 HasChain ? DAG.getVTList(ResultVT, MVT::Other) : DAG.getVTList(ResultVT);
18682
18683 MachineSDNode *NewNode =
18684 DAG.getMachineNode(NewOpcode, SDLoc(Node), NewVTList, Ops);
18685
18686 if (HasChain) {
18687 // Update chain.
18688 DAG.setNodeMemRefs(NewNode, Node->memoperands());
18689 DAG.ReplaceAllUsesOfValueWith(SDValue(Node, 1), SDValue(NewNode, 1));
18690 }
18691
18692 if (NewChannels == 1) {
18693 assert(Node->hasNUsesOfValue(1, 0));
18694 SDNode *Copy =
18695 DAG.getMachineNode(TargetOpcode::COPY, SDLoc(Node),
18696 Users[Lane]->getValueType(0), SDValue(NewNode, 0));
18697 DAG.ReplaceAllUsesWith(Users[Lane], Copy);
18698 return nullptr;
18699 }
18700
18701 // Update the users of the node with the new indices
18702 for (unsigned i = 0, Idx = AMDGPU::sub0; i < 5; ++i) {
18703 SDNode *User = Users[i];
18704 if (!User) {
18705 // Handle the special case of NoChannels. We set NewDmask to 1 above, but
18706 // Users[0] is still nullptr because channel 0 doesn't really have a use.
18707 if (i || !NoChannels)
18708 continue;
18709 } else {
18710 SDValue Op = DAG.getTargetConstant(Idx, SDLoc(User), MVT::i32);
18711 SDNode *NewUser = DAG.UpdateNodeOperands(User, SDValue(NewNode, 0), Op);
18712 if (NewUser != User) {
18713 DAG.ReplaceAllUsesWith(SDValue(User, 0), SDValue(NewUser, 0));
18714 DAG.RemoveDeadNode(User);
18715 }
18716 }
18717
18718 switch (Idx) {
18719 default:
18720 break;
18721 case AMDGPU::sub0:
18722 Idx = AMDGPU::sub1;
18723 break;
18724 case AMDGPU::sub1:
18725 Idx = AMDGPU::sub2;
18726 break;
18727 case AMDGPU::sub2:
18728 Idx = AMDGPU::sub3;
18729 break;
18730 case AMDGPU::sub3:
18731 Idx = AMDGPU::sub4;
18732 break;
18733 }
18734 }
18735
18736 DAG.RemoveDeadNode(Node);
18737 return nullptr;
18738}
18739
18740 static bool isFrameIndexOp(SDValue Op) {
18741 if (Op.getOpcode() == ISD::AssertZext)
18742 Op = Op.getOperand(0);
18743
18744 return isa<FrameIndexSDNode>(Op);
18745}
18746
18747/// Legalize target independent instructions (e.g. INSERT_SUBREG)
18748/// with frame index operands.
18749 /// LLVM assumes that inputs to these instructions are registers.
18750SDNode *
18751 SITargetLowering::legalizeTargetIndependentNode(SDNode *Node,
18752 SelectionDAG &DAG) const {
18753 if (Node->getOpcode() == ISD::CopyToReg) {
18754 RegisterSDNode *DestReg = cast<RegisterSDNode>(Node->getOperand(1));
18755 SDValue SrcVal = Node->getOperand(2);
18756
18757 // Insert a copy to a VReg_1 virtual register so LowerI1Copies doesn't have
18758 // to try understanding copies to physical registers.
18759 if (SrcVal.getValueType() == MVT::i1 && DestReg->getReg().isPhysical()) {
18760 SDLoc SL(Node);
18761 MachineRegisterInfo &MRI = DAG.getMachineFunction().getRegInfo();
18762 SDValue VReg = DAG.getRegister(
18763 MRI.createVirtualRegister(&AMDGPU::VReg_1RegClass), MVT::i1);
18764
18765 SDNode *Glued = Node->getGluedNode();
18766 SDValue ToVReg = DAG.getCopyToReg(
18767 Node->getOperand(0), SL, VReg, SrcVal,
18768 SDValue(Glued, Glued ? Glued->getNumValues() - 1 : 0));
18769 SDValue ToResultReg = DAG.getCopyToReg(ToVReg, SL, SDValue(DestReg, 0),
18770 VReg, ToVReg.getValue(1));
18771 DAG.ReplaceAllUsesWith(Node, ToResultReg.getNode());
18772 DAG.RemoveDeadNode(Node);
18773 return ToResultReg.getNode();
18774 }
18775 }
18776
18777 SmallVector<SDValue, 8> Ops;
18778 for (unsigned i = 0; i < Node->getNumOperands(); ++i) {
18779 if (!isFrameIndexOp(Node->getOperand(i))) {
18780 Ops.push_back(Node->getOperand(i));
18781 continue;
18782 }
18783
18784 SDLoc DL(Node);
18785 Ops.push_back(SDValue(DAG.getMachineNode(AMDGPU::S_MOV_B32, DL,
18786 Node->getOperand(i).getValueType(),
18787 Node->getOperand(i)),
18788 0));
18789 }
18790
18791 return DAG.UpdateNodeOperands(Node, Ops);
18792}
18793
18794/// Fold the instructions after selecting them.
18795/// Returns null if users were already updated.
18796 SDNode *SITargetLowering::PostISelFolding(MachineSDNode *Node,
18797 SelectionDAG &DAG) const {
18798 const SIInstrInfo *TII = getSubtarget()->getInstrInfo();
18799 unsigned Opcode = Node->getMachineOpcode();
18800
18801 if (TII->isImage(Opcode) && !TII->get(Opcode).mayStore() &&
18802 !TII->isGather4(Opcode) &&
18803 AMDGPU::hasNamedOperand(Opcode, AMDGPU::OpName::dmask)) {
18804 return adjustWritemask(Node, DAG);
18805 }
18806
18807 if (Opcode == AMDGPU::INSERT_SUBREG || Opcode == AMDGPU::REG_SEQUENCE) {
18808 legalizeTargetIndependentNode(Node, DAG);
18809 return Node;
18810 }
18811
18812 switch (Opcode) {
18813 case AMDGPU::V_DIV_SCALE_F32_e64:
18814 case AMDGPU::V_DIV_SCALE_F64_e64: {
18815 // Satisfy the operand register constraint when one of the inputs is
18816 // undefined. Ordinarily each undef value will have its own implicit_def of
18817 // a vreg, so force these to use a single register.
18818 SDValue Src0 = Node->getOperand(1);
18819 SDValue Src1 = Node->getOperand(3);
18820 SDValue Src2 = Node->getOperand(5);
18821
18822 if ((Src0.isMachineOpcode() &&
18823 Src0.getMachineOpcode() != AMDGPU::IMPLICIT_DEF) &&
18824 (Src0 == Src1 || Src0 == Src2))
18825 break;
18826
18827 MVT VT = Src0.getValueType().getSimpleVT();
18828 const TargetRegisterClass *RC =
18829 getRegClassFor(VT, Src0.getNode()->isDivergent());
18830
18831 MachineRegisterInfo &MRI = DAG.getMachineFunction().getRegInfo();
18832 SDValue UndefReg = DAG.getRegister(MRI.createVirtualRegister(RC), VT);
18833
18834 SDValue ImpDef = DAG.getCopyToReg(DAG.getEntryNode(), SDLoc(Node), UndefReg,
18835 Src0, SDValue());
18836
18837 // src0 must be the same register as src1 or src2, even if the value is
18838 // undefined, so make sure we don't violate this constraint.
18839 if (Src0.isMachineOpcode() &&
18840 Src0.getMachineOpcode() == AMDGPU::IMPLICIT_DEF) {
18841 if (Src1.isMachineOpcode() &&
18842 Src1.getMachineOpcode() != AMDGPU::IMPLICIT_DEF)
18843 Src0 = Src1;
18844 else if (Src2.isMachineOpcode() &&
18845 Src2.getMachineOpcode() != AMDGPU::IMPLICIT_DEF)
18846 Src0 = Src2;
18847 else {
18848 assert(Src1.getMachineOpcode() == AMDGPU::IMPLICIT_DEF);
18849 Src0 = UndefReg;
18850 Src1 = UndefReg;
18851 }
18852 } else
18853 break;
18855 SmallVector<SDValue, 9> Ops(Node->ops());
18856 Ops[1] = Src0;
18857 Ops[3] = Src1;
18858 Ops[5] = Src2;
18859 Ops.push_back(ImpDef.getValue(1));
18860 return DAG.getMachineNode(Opcode, SDLoc(Node), Node->getVTList(), Ops);
18861 }
18862 default:
18863 break;
18864 }
18865
18866 return Node;
18867}
18868
18869// Any MIMG instructions that use tfe or lwe require an initialization of the
18870// result register that will be written in the case of a memory access failure.
18871// The required code is also added to tie this init code to the result of the
18872// img instruction.
18875 const SIRegisterInfo &TRI = TII->getRegisterInfo();
18876 MachineRegisterInfo &MRI = MI.getMF()->getRegInfo();
18877 MachineBasicBlock &MBB = *MI.getParent();
18878
18879 int DstIdx =
18880 AMDGPU::getNamedOperandIdx(MI.getOpcode(), AMDGPU::OpName::vdata);
18881 unsigned InitIdx = 0;
18882
18883 if (TII->isImage(MI)) {
18884 MachineOperand *TFE = TII->getNamedOperand(MI, AMDGPU::OpName::tfe);
18885 MachineOperand *LWE = TII->getNamedOperand(MI, AMDGPU::OpName::lwe);
18886 MachineOperand *D16 = TII->getNamedOperand(MI, AMDGPU::OpName::d16);
18887
18888 if (!TFE && !LWE) // intersect_ray
18889 return;
18890
18891 unsigned TFEVal = TFE ? TFE->getImm() : 0;
18892 unsigned LWEVal = LWE ? LWE->getImm() : 0;
18893 unsigned D16Val = D16 ? D16->getImm() : 0;
18894
18895 if (!TFEVal && !LWEVal)
18896 return;
18897
18898 // At least one of TFE or LWE are non-zero
18899 // We have to insert a suitable initialization of the result value and
18900 // tie this to the dest of the image instruction.
18901
18902 // Calculate which dword we have to initialize to 0.
18903 MachineOperand *MO_Dmask = TII->getNamedOperand(MI, AMDGPU::OpName::dmask);
18904
18905 // check that dmask operand is found.
18906 assert(MO_Dmask && "Expected dmask operand in instruction");
18907
18908 unsigned dmask = MO_Dmask->getImm();
18909 // Determine the number of active lanes taking into account the
18910 // Gather4 special case
18911 unsigned ActiveLanes = TII->isGather4(MI) ? 4 : llvm::popcount(dmask);
18912
18913 bool Packed = !Subtarget->hasUnpackedD16VMem();
18914
18915 InitIdx = D16Val && Packed ? ((ActiveLanes + 1) >> 1) + 1 : ActiveLanes + 1;
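// Illustrative arithmetic (not part of the upstream source): with
// dmask = 0b0111 (three active lanes) and TFE set, InitIdx is 3 + 1 = 4
// dwords for unpacked data, or ((3 + 1) >> 1) + 1 = 3 dwords when D16 data
// is packed two components per dword.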
18916
18917 // Abandon attempt if the dst size isn't large enough
18918 // - this is in fact an error but this is picked up elsewhere and
18919 // reported correctly.
18920 const TargetRegisterClass *DstRC = TII->getRegClass(MI.getDesc(), DstIdx);
18921
18922 uint32_t DstSize = TRI.getRegSizeInBits(*DstRC) / 32;
18923 if (DstSize < InitIdx)
18924 return;
18925 } else if (TII->isMUBUF(MI) && AMDGPU::getMUBUFTfe(MI.getOpcode())) {
18926 const TargetRegisterClass *DstRC = TII->getRegClass(MI.getDesc(), DstIdx);
18927 InitIdx = TRI.getRegSizeInBits(*DstRC) / 32;
18928 } else {
18929 return;
18930 }
18931
18932 const DebugLoc &DL = MI.getDebugLoc();
18933
18934 // Create a register for the initialization value.
18935 Register PrevDst = MRI.cloneVirtualRegister(MI.getOperand(DstIdx).getReg());
18936 unsigned NewDst = 0; // Final initialized value will be in here
18937
18938 // If PRTStrictNull feature is enabled (the default) then initialize
18939 // all the result registers to 0, otherwise just the error indication
18940 // register (VGPRn+1)
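// Illustrative example (not part of the upstream comment): continuing the
// dmask = 0b0111 + TFE case with InitIdx = 4, PRTStrictNull zero-initializes
// all four dwords (SizeLeft = 4 starting at dword 0); without it only the TFE
// status dword is cleared (SizeLeft = 1 starting at dword 3).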
18941 unsigned SizeLeft = Subtarget->usePRTStrictNull() ? InitIdx : 1;
18942 unsigned CurrIdx = Subtarget->usePRTStrictNull() ? 0 : (InitIdx - 1);
18943
18944 BuildMI(MBB, MI, DL, TII->get(AMDGPU::IMPLICIT_DEF), PrevDst);
18945 for (; SizeLeft; SizeLeft--, CurrIdx++) {
18946 NewDst = MRI.createVirtualRegister(TII->getOpRegClass(MI, DstIdx));
18947 // Initialize dword
18948 Register SubReg = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
18949 // clang-format off
18950 BuildMI(MBB, MI, DL, TII->get(AMDGPU::V_MOV_B32_e32), SubReg)
18951 .addImm(0);
18952 // clang-format on
18953 // Insert into the super-reg
18954 BuildMI(MBB, MI, DL, TII->get(TargetOpcode::INSERT_SUBREG), NewDst)
18955 .addReg(PrevDst)
18956 .addReg(SubReg)
18957 .addImm(AMDGPU::sub0 + CurrIdx);
18958
18959 PrevDst = NewDst;
18960 }
18961
18962 // Add as an implicit operand
18963 MI.addOperand(MachineOperand::CreateReg(NewDst, false, true));
18964
18965 // Tie the just added implicit operand to the dst
18966 MI.tieOperands(DstIdx, MI.getNumOperands() - 1);
18967}
18968
18969/// Assign the register class depending on the number of
18970/// bits set in the writemask
18971 void SITargetLowering::AdjustInstrPostInstrSelection(MachineInstr &MI,
18972 SDNode *Node) const {
18973 const SIInstrInfo *TII = getSubtarget()->getInstrInfo();
18974
18975 MachineFunction *MF = MI.getMF();
18976 MachineRegisterInfo &MRI = MF->getRegInfo();
18977
18978 if (TII->isVOP3(MI.getOpcode())) {
18979 // Make sure constant bus requirements are respected.
18980 TII->legalizeOperandsVOP3(MRI, MI);
18981
18982 if (TII->isMAI(MI)) {
18983 // The ordinary src0, src1, src2 were legalized above.
18984 //
18985 // We have to also legalize the appended v_mfma_ld_scale_b32 operands,
18986 // as a separate instruction.
18987 int Src0Idx = AMDGPU::getNamedOperandIdx(MI.getOpcode(),
18988 AMDGPU::OpName::scale_src0);
18989 if (Src0Idx != -1) {
18990 int Src1Idx = AMDGPU::getNamedOperandIdx(MI.getOpcode(),
18991 AMDGPU::OpName::scale_src1);
18992 if (TII->usesConstantBus(MRI, MI, Src0Idx) &&
18993 TII->usesConstantBus(MRI, MI, Src1Idx))
18994 TII->legalizeOpWithMove(MI, Src1Idx);
18995 }
18996 }
18997
18998 return;
18999 }
19000
19001 if (TII->isImage(MI))
19002 TII->enforceOperandRCAlignment(MI, AMDGPU::OpName::vaddr);
19003}
19004
19005 static SDValue buildSMovImm32(SelectionDAG &DAG, const SDLoc &DL,
19006 uint64_t Val) {
19007 SDValue K = DAG.getTargetConstant(Val, DL, MVT::i32);
19008 return SDValue(DAG.getMachineNode(AMDGPU::S_MOV_B32, DL, MVT::i32, K), 0);
19009}
19010
19011 MachineSDNode *SITargetLowering::wrapAddr64Rsrc(SelectionDAG &DAG,
19012 const SDLoc &DL,
19013 SDValue Ptr) const {
19014 const SIInstrInfo *TII = getSubtarget()->getInstrInfo();
19015
19016 // Build the half of the subregister with the constants before building the
19017 // full 128-bit register. If we are building multiple resource descriptors,
19018 // this will allow CSEing of the 2-component register.
19019 const SDValue Ops0[] = {
19020 DAG.getTargetConstant(AMDGPU::SGPR_64RegClassID, DL, MVT::i32),
19021 buildSMovImm32(DAG, DL, 0),
19022 DAG.getTargetConstant(AMDGPU::sub0, DL, MVT::i32),
19023 buildSMovImm32(DAG, DL, TII->getDefaultRsrcDataFormat() >> 32),
19024 DAG.getTargetConstant(AMDGPU::sub1, DL, MVT::i32)};
19025
19026 SDValue SubRegHi = SDValue(
19027 DAG.getMachineNode(AMDGPU::REG_SEQUENCE, DL, MVT::v2i32, Ops0), 0);
19028
19029 // Combine the constants and the pointer.
19030 const SDValue Ops1[] = {
19031 DAG.getTargetConstant(AMDGPU::SGPR_128RegClassID, DL, MVT::i32), Ptr,
19032 DAG.getTargetConstant(AMDGPU::sub0_sub1, DL, MVT::i32), SubRegHi,
19033 DAG.getTargetConstant(AMDGPU::sub2_sub3, DL, MVT::i32)};
19034
19035 return DAG.getMachineNode(AMDGPU::REG_SEQUENCE, DL, MVT::v4i32, Ops1);
19036}
19037
19038/// Return a resource descriptor with the 'Add TID' bit enabled
19039/// The TID (Thread ID) is multiplied by the stride value (bits [61:48]
19040/// of the resource descriptor) to create an offset, which is added to
19041/// the resource pointer.
19042 MachineSDNode *SITargetLowering::buildRSRC(SelectionDAG &DAG, const SDLoc &DL,
19043 SDValue Ptr, uint32_t RsrcDword1,
19044 uint64_t RsrcDword2And3) const {
19045 SDValue PtrLo = DAG.getTargetExtractSubreg(AMDGPU::sub0, DL, MVT::i32, Ptr);
19046 SDValue PtrHi = DAG.getTargetExtractSubreg(AMDGPU::sub1, DL, MVT::i32, Ptr);
19047 if (RsrcDword1) {
19048 PtrHi =
19049 SDValue(DAG.getMachineNode(AMDGPU::S_OR_B32, DL, MVT::i32, PtrHi,
19050 DAG.getConstant(RsrcDword1, DL, MVT::i32)),
19051 0);
19052 }
19053
19054 SDValue DataLo =
19055 buildSMovImm32(DAG, DL, RsrcDword2And3 & UINT64_C(0xFFFFFFFF));
19056 SDValue DataHi = buildSMovImm32(DAG, DL, RsrcDword2And3 >> 32);
19057
19058 const SDValue Ops[] = {
19059 DAG.getTargetConstant(AMDGPU::SGPR_128RegClassID, DL, MVT::i32),
19060 PtrLo,
19061 DAG.getTargetConstant(AMDGPU::sub0, DL, MVT::i32),
19062 PtrHi,
19063 DAG.getTargetConstant(AMDGPU::sub1, DL, MVT::i32),
19064 DataLo,
19065 DAG.getTargetConstant(AMDGPU::sub2, DL, MVT::i32),
19066 DataHi,
19067 DAG.getTargetConstant(AMDGPU::sub3, DL, MVT::i32)};
19068
19069 return DAG.getMachineNode(AMDGPU::REG_SEQUENCE, DL, MVT::v4i32, Ops);
19070}
19071
19072//===----------------------------------------------------------------------===//
19073// SI Inline Assembly Support
19074//===----------------------------------------------------------------------===//
19075
19076std::pair<unsigned, const TargetRegisterClass *>
19077 SITargetLowering::getRegForInlineAsmConstraint(const TargetRegisterInfo *TRI_,
19078 StringRef Constraint,
19079 MVT VT) const {
19080 const SIRegisterInfo *TRI = static_cast<const SIRegisterInfo *>(TRI_);
19081
19082 const TargetRegisterClass *RC = nullptr;
19083 if (Constraint.size() == 1) {
19084 // Check if we cannot determine the bit size of the given value type. This
19085 // can happen, for example, in this situation where we have an empty struct
19086 // (size 0): `call void asm "", "v"({} poison)`.
19087 if (VT == MVT::Other)
19088 return TargetLowering::getRegForInlineAsmConstraint(TRI, Constraint, VT);
19089 const unsigned BitWidth = VT.getSizeInBits();
19090 switch (Constraint[0]) {
19091 default:
19092 return TargetLowering::getRegForInlineAsmConstraint(TRI, Constraint, VT);
19093 case 's':
19094 case 'r':
19095 switch (BitWidth) {
19096 case 16:
19097 RC = &AMDGPU::SReg_32RegClass;
19098 break;
19099 case 64:
19100 RC = &AMDGPU::SGPR_64RegClass;
19101 break;
19102 default:
19103 RC = TRI->getSGPRClassForBitWidth(BitWidth);
19104 if (!RC)
19105 return std::pair(0U, nullptr);
19106 break;
19107 }
19108 break;
19109 case 'v':
19110 switch (BitWidth) {
19111 case 1:
19112 return std::pair(0U, nullptr);
19113 case 16:
19114 RC = Subtarget->useRealTrue16Insts() ? &AMDGPU::VGPR_16RegClass
19115 : &AMDGPU::VGPR_32_Lo256RegClass;
19116 break;
19117 default:
19118 RC = Subtarget->has1024AddressableVGPRs()
19119 ? TRI->getAlignedLo256VGPRClassForBitWidth(BitWidth)
19120 : TRI->getVGPRClassForBitWidth(BitWidth);
19121 if (!RC)
19122 return std::pair(0U, nullptr);
19123 break;
19124 }
19125 break;
19126 case 'a':
19127 if (!Subtarget->hasMAIInsts())
19128 break;
19129 switch (BitWidth) {
19130 case 1:
19131 return std::pair(0U, nullptr);
19132 case 16:
19133 RC = &AMDGPU::AGPR_32RegClass;
19134 break;
19135 default:
19136 RC = TRI->getAGPRClassForBitWidth(BitWidth);
19137 if (!RC)
19138 return std::pair(0U, nullptr);
19139 break;
19140 }
19141 break;
19142 }
19143 } else if (Constraint == "VA" && Subtarget->hasGFX90AInsts()) {
19144 const unsigned BitWidth = VT.getSizeInBits();
19145 switch (BitWidth) {
19146 case 16:
19147 RC = &AMDGPU::AV_32RegClass;
19148 break;
19149 default:
19150 RC = TRI->getVectorSuperClassForBitWidth(BitWidth);
19151 if (!RC)
19152 return std::pair(0U, nullptr);
19153 break;
19154 }
19155 }
19156
19157 // We actually support i128, i16 and f16 as inline parameters
19158 // even if they are not reported as legal
19159 if (RC && (isTypeLegal(VT) || VT.SimpleTy == MVT::i128 ||
19160 VT.SimpleTy == MVT::i16 || VT.SimpleTy == MVT::f16))
19161 return std::pair(0U, RC);
19162
19163 auto [Kind, Idx, NumRegs] = AMDGPU::parseAsmConstraintPhysReg(Constraint);
19164 if (Kind != '\0') {
19165 if (Kind == 'v') {
19166 RC = &AMDGPU::VGPR_32_Lo256RegClass;
19167 } else if (Kind == 's') {
19168 RC = &AMDGPU::SGPR_32RegClass;
19169 } else if (Kind == 'a') {
19170 RC = &AMDGPU::AGPR_32RegClass;
19171 }
19172
19173 if (RC) {
19174 if (NumRegs > 1) {
19175 if (Idx >= RC->getNumRegs() || Idx + NumRegs - 1 >= RC->getNumRegs())
19176 return std::pair(0U, nullptr);
19177
19178 uint32_t Width = NumRegs * 32;
19179 // Prohibit constraints for register ranges with a width that does not
19180 // match the required type.
19181 if (VT.SimpleTy != MVT::Other && Width != VT.getSizeInBits())
19182 return std::pair(0U, nullptr);
19183
19184 MCRegister Reg = RC->getRegister(Idx);
19185 if (SIRegisterInfo::isVGPRClass(RC))
19186 RC = TRI->getVGPRClassForBitWidth(Width);
19187 else if (SIRegisterInfo::isSGPRClass(RC))
19188 RC = TRI->getSGPRClassForBitWidth(Width);
19189 else if (SIRegisterInfo::isAGPRClass(RC))
19190 RC = TRI->getAGPRClassForBitWidth(Width);
19191 if (RC) {
19192 Reg = TRI->getMatchingSuperReg(Reg, AMDGPU::sub0, RC);
19193 if (!Reg) {
19194 // The register class does not contain the requested register,
19195 // e.g., because it is an SGPR pair that would violate alignment
19196 // requirements.
19197 return std::pair(0U, nullptr);
19198 }
19199 return std::pair(Reg, RC);
19200 }
19201 }
19202
19203 // Check for lossy scalar/vector conversions.
19204 if (VT.isVector() && VT.getSizeInBits() != 32)
19205 return std::pair(0U, nullptr);
19206 if (RC && Idx < RC->getNumRegs())
19207 return std::pair(RC->getRegister(Idx), RC);
19208 return std::pair(0U, nullptr);
19209 }
19210 }
19211
19212 auto Ret = TargetLowering::getRegForInlineAsmConstraint(TRI, Constraint, VT);
19213 if (Ret.first)
19214 Ret.second = TRI->getPhysRegBaseClass(Ret.first);
19215
19216 return Ret;
19217}
19218
19219static bool isImmConstraint(StringRef Constraint) {
19220 if (Constraint.size() == 1) {
19221 switch (Constraint[0]) {
19222 default:
19223 break;
19224 case 'I':
19225 case 'J':
19226 case 'A':
19227 case 'B':
19228 case 'C':
19229 return true;
19230 }
19231 } else if (Constraint == "DA" || Constraint == "DB") {
19232 return true;
19233 }
19234 return false;
19235}
19236
19237 SITargetLowering::ConstraintType
19238 SITargetLowering::getConstraintType(StringRef Constraint) const {
19239 if (Constraint.size() == 1) {
19240 switch (Constraint[0]) {
19241 default:
19242 break;
19243 case 's':
19244 case 'v':
19245 case 'a':
19246 return C_RegisterClass;
19247 }
19248 } else if (Constraint.size() == 2) {
19249 if (Constraint == "VA")
19250 return C_RegisterClass;
19251 }
19252 if (isImmConstraint(Constraint)) {
19253 return C_Other;
19254 }
19255 return TargetLowering::getConstraintType(Constraint);
19256}
19257
19258static uint64_t clearUnusedBits(uint64_t Val, unsigned Size) {
19259 if (Size < 64) {
19260 Val = Val & maskTrailingOnes<uint64_t>(Size);
19261 }
19262 return Val;
19263}
19264
19265 void SITargetLowering::LowerAsmOperandForConstraint(SDValue Op,
19266 StringRef Constraint,
19267 std::vector<SDValue> &Ops,
19268 SelectionDAG &DAG) const {
19269 if (isImmConstraint(Constraint)) {
19270 uint64_t Val;
19271 if (getAsmOperandConstVal(Op, Val) &&
19272 checkAsmConstraintVal(Op, Constraint, Val)) {
19273 Val = clearUnusedBits(Val, Op.getScalarValueSizeInBits());
19274 Ops.push_back(DAG.getTargetConstant(Val, SDLoc(Op), MVT::i64));
19275 }
19276 } else {
19277 TargetLowering::LowerAsmOperandForConstraint(Op, Constraint, Ops, DAG);
19278 }
19279}
19280
19281 bool SITargetLowering::getAsmOperandConstVal(SDValue Op, uint64_t &Val) const {
19282 unsigned Size = Op.getScalarValueSizeInBits();
19283 if (Size > 64)
19284 return false;
19285
19286 if (Size == 16 && !Subtarget->has16BitInsts())
19287 return false;
19288
19289 if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(Op)) {
19290 Val = C->getSExtValue();
19291 return true;
19292 }
19293 if (ConstantFPSDNode *C = dyn_cast<ConstantFPSDNode>(Op)) {
19294 Val = C->getValueAPF().bitcastToAPInt().getSExtValue();
19295 return true;
19296 }
19297 if (BuildVectorSDNode *V = dyn_cast<BuildVectorSDNode>(Op)) {
19298 if (Size != 16 || Op.getNumOperands() != 2)
19299 return false;
19300 if (Op.getOperand(0).isUndef() || Op.getOperand(1).isUndef())
19301 return false;
19302 if (ConstantSDNode *C = V->getConstantSplatNode()) {
19303 Val = C->getSExtValue();
19304 return true;
19305 }
19306 if (ConstantFPSDNode *C = V->getConstantFPSplatNode()) {
19307 Val = C->getValueAPF().bitcastToAPInt().getSExtValue();
19308 return true;
19309 }
19310 }
19311
19312 return false;
19313}
19314
19315 bool SITargetLowering::checkAsmConstraintVal(SDValue Op, StringRef Constraint,
19316 uint64_t Val) const {
19317 if (Constraint.size() == 1) {
19318 switch (Constraint[0]) {
19319 case 'I':
19320 return AMDGPU::isInlinableIntLiteral(Val);
19321 case 'J':
19322 return isInt<16>(Val);
19323 case 'A':
19324 return checkAsmConstraintValA(Op, Val);
19325 case 'B':
19326 return isInt<32>(Val);
19327 case 'C':
19328 return isUInt<32>(clearUnusedBits(Val, Op.getScalarValueSizeInBits())) ||
19329 AMDGPU::isInlinableIntLiteral(Val);
19330 default:
19331 break;
19332 }
19333 } else if (Constraint.size() == 2) {
19334 if (Constraint == "DA") {
19335 int64_t HiBits = static_cast<int32_t>(Val >> 32);
19336 int64_t LoBits = static_cast<int32_t>(Val);
19337 return checkAsmConstraintValA(Op, HiBits, 32) &&
19338 checkAsmConstraintValA(Op, LoBits, 32);
19339 }
19340 if (Constraint == "DB") {
19341 return true;
19342 }
19343 }
19344 llvm_unreachable("Invalid asm constraint");
19345}
19346
19347 bool SITargetLowering::checkAsmConstraintValA(SDValue Op, uint64_t Val,
19348 unsigned MaxSize) const {
19349 unsigned Size = std::min<unsigned>(Op.getScalarValueSizeInBits(), MaxSize);
19350 bool HasInv2Pi = Subtarget->hasInv2PiInlineImm();
19351 if (Size == 16) {
19352 MVT VT = Op.getSimpleValueType();
19353 switch (VT.SimpleTy) {
19354 default:
19355 return false;
19356 case MVT::i16:
19357 return AMDGPU::isInlinableLiteralI16(Val, HasInv2Pi);
19358 case MVT::f16:
19359 return AMDGPU::isInlinableLiteralFP16(Val, HasInv2Pi);
19360 case MVT::bf16:
19361 return AMDGPU::isInlinableLiteralBF16(Val, HasInv2Pi);
19362 case MVT::v2i16:
19363 return AMDGPU::getInlineEncodingV2I16(Val).has_value();
19364 case MVT::v2f16:
19365 return AMDGPU::getInlineEncodingV2F16(Val).has_value();
19366 case MVT::v2bf16:
19367 return AMDGPU::getInlineEncodingV2BF16(Val).has_value();
19368 }
19369 }
19370 if ((Size == 32 && AMDGPU::isInlinableLiteral32(Val, HasInv2Pi)) ||
19371 (Size == 64 && AMDGPU::isInlinableLiteral64(Val, HasInv2Pi)))
19372 return true;
19373 return false;
19374}
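// --- Illustrative example (not part of SIISelLowering.cpp) ------------------
// checkAsmConstraintValA() above accepts exactly the hardware inline constants:
// for 32-bit operands that is the integers -16..64 and the FP values 0.0,
// +/-0.5, +/-1.0, +/-2.0, +/-4.0 (plus 1/(2*pi) when the subtarget has it).
// A minimal sketch of probing an f32 immediate the same way, using the
// AMDGPUBaseInfo helper already used above; the function name is made up for
// illustration.
static bool exampleIsInlineImmF32(float F, bool HasInv2Pi) {
  int32_t Bits = llvm::bit_cast<int32_t>(F);
  return llvm::AMDGPU::isInlinableLiteral32(Bits, HasInv2Pi);
}
// -----------------------------------------------------------------------------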
19375
19376static int getAlignedAGPRClassID(unsigned UnalignedClassID) {
19377 switch (UnalignedClassID) {
19378 case AMDGPU::VReg_64RegClassID:
19379 return AMDGPU::VReg_64_Align2RegClassID;
19380 case AMDGPU::VReg_96RegClassID:
19381 return AMDGPU::VReg_96_Align2RegClassID;
19382 case AMDGPU::VReg_128RegClassID:
19383 return AMDGPU::VReg_128_Align2RegClassID;
19384 case AMDGPU::VReg_160RegClassID:
19385 return AMDGPU::VReg_160_Align2RegClassID;
19386 case AMDGPU::VReg_192RegClassID:
19387 return AMDGPU::VReg_192_Align2RegClassID;
19388 case AMDGPU::VReg_224RegClassID:
19389 return AMDGPU::VReg_224_Align2RegClassID;
19390 case AMDGPU::VReg_256RegClassID:
19391 return AMDGPU::VReg_256_Align2RegClassID;
19392 case AMDGPU::VReg_288RegClassID:
19393 return AMDGPU::VReg_288_Align2RegClassID;
19394 case AMDGPU::VReg_320RegClassID:
19395 return AMDGPU::VReg_320_Align2RegClassID;
19396 case AMDGPU::VReg_352RegClassID:
19397 return AMDGPU::VReg_352_Align2RegClassID;
19398 case AMDGPU::VReg_384RegClassID:
19399 return AMDGPU::VReg_384_Align2RegClassID;
19400 case AMDGPU::VReg_512RegClassID:
19401 return AMDGPU::VReg_512_Align2RegClassID;
19402 case AMDGPU::VReg_1024RegClassID:
19403 return AMDGPU::VReg_1024_Align2RegClassID;
19404 case AMDGPU::AReg_64RegClassID:
19405 return AMDGPU::AReg_64_Align2RegClassID;
19406 case AMDGPU::AReg_96RegClassID:
19407 return AMDGPU::AReg_96_Align2RegClassID;
19408 case AMDGPU::AReg_128RegClassID:
19409 return AMDGPU::AReg_128_Align2RegClassID;
19410 case AMDGPU::AReg_160RegClassID:
19411 return AMDGPU::AReg_160_Align2RegClassID;
19412 case AMDGPU::AReg_192RegClassID:
19413 return AMDGPU::AReg_192_Align2RegClassID;
19414 case AMDGPU::AReg_256RegClassID:
19415 return AMDGPU::AReg_256_Align2RegClassID;
19416 case AMDGPU::AReg_512RegClassID:
19417 return AMDGPU::AReg_512_Align2RegClassID;
19418 case AMDGPU::AReg_1024RegClassID:
19419 return AMDGPU::AReg_1024_Align2RegClassID;
19420 default:
19421 return -1;
19422 }
19423}
19424
19425// Figure out which registers should be reserved for stack access. Only after
19426// the function is legalized do we know all of the non-spill stack objects or if
19427 // calls are present.
19428 void SITargetLowering::finalizeLowering(MachineFunction &MF) const {
19429 MachineRegisterInfo &MRI = MF.getRegInfo();
19430 SIMachineFunctionInfo *Info = MF.getInfo<SIMachineFunctionInfo>();
19431 const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>();
19432 const SIRegisterInfo *TRI = Subtarget->getRegisterInfo();
19433 const SIInstrInfo *TII = ST.getInstrInfo();
19434
19435 if (Info->isEntryFunction()) {
19436 // Callable functions have fixed registers used for stack access.
19437 reservePrivateMemoryRegs(getTargetMachine(), MF, *TRI, *Info);
19438 }
19439
19440 // TODO: Move this logic to getReservedRegs()
19441 // Reserve the SGPR(s) to save/restore EXEC for WWM spill/copy handling.
19442 unsigned MaxNumSGPRs = ST.getMaxNumSGPRs(MF);
19443 Register SReg = ST.isWave32()
19444 ? AMDGPU::SGPR_32RegClass.getRegister(MaxNumSGPRs - 1)
19445 : TRI->getAlignedHighSGPRForRC(MF, /*Align=*/2,
19446 &AMDGPU::SGPR_64RegClass);
19447 Info->setSGPRForEXECCopy(SReg);
19448
19449 assert(!TRI->isSubRegister(Info->getScratchRSrcReg(),
19450 Info->getStackPtrOffsetReg()));
19451 if (Info->getStackPtrOffsetReg() != AMDGPU::SP_REG)
19452 MRI.replaceRegWith(AMDGPU::SP_REG, Info->getStackPtrOffsetReg());
19453
19454 // We need to worry about replacing the default register with itself in case
19455 // of MIR testcases missing the MFI.
19456 if (Info->getScratchRSrcReg() != AMDGPU::PRIVATE_RSRC_REG)
19457 MRI.replaceRegWith(AMDGPU::PRIVATE_RSRC_REG, Info->getScratchRSrcReg());
19458
19459 if (Info->getFrameOffsetReg() != AMDGPU::FP_REG)
19460 MRI.replaceRegWith(AMDGPU::FP_REG, Info->getFrameOffsetReg());
19461
19462 Info->limitOccupancy(MF);
19463
19464 if (ST.isWave32() && !MF.empty()) {
19465 for (auto &MBB : MF) {
19466 for (auto &MI : MBB) {
19467 TII->fixImplicitOperands(MI);
19468 }
19469 }
19470 }
19471
19472 // FIXME: This is a hack to fixup AGPR classes to use the properly aligned
19473 // classes if required. Ideally the register class constraints would differ
19474 // per-subtarget, but there's no easy way to achieve that right now. This is
19475 // not a problem for VGPRs because the correctly aligned VGPR class is implied
19476 // from using them as the register class for legal types.
19477 if (ST.needsAlignedVGPRs()) {
19478 for (unsigned I = 0, E = MRI.getNumVirtRegs(); I != E; ++I) {
19479 const Register Reg = Register::index2VirtReg(I);
19480 const TargetRegisterClass *RC = MRI.getRegClassOrNull(Reg);
19481 if (!RC)
19482 continue;
19483 int NewClassID = getAlignedAGPRClassID(RC->getID());
19484 if (NewClassID != -1)
19485 MRI.setRegClass(Reg, TRI->getRegClass(NewClassID));
19486 }
19487 }
19488
19490}
19491
19492 void SITargetLowering::computeKnownBitsForTargetNode(const SDValue Op,
19493 KnownBits &Known,
19494 const APInt &DemandedElts,
19495 const SelectionDAG &DAG,
19496 unsigned Depth) const {
19497 Known.resetAll();
19498 unsigned Opc = Op.getOpcode();
19499 switch (Opc) {
19500 case ISD::INTRINSIC_WO_CHAIN: {
19501 unsigned IID = Op.getConstantOperandVal(0);
19502 switch (IID) {
19503 case Intrinsic::amdgcn_mbcnt_lo:
19504 case Intrinsic::amdgcn_mbcnt_hi: {
19505 const GCNSubtarget &ST =
19506 DAG.getMachineFunction().getSubtarget<GCNSubtarget>();
19507 // Wave64 mbcnt_lo returns at most 32 + src1. Otherwise these return at
19508 // most 31 + src1.
19509 Known.Zero.setBitsFrom(
19510 IID == Intrinsic::amdgcn_mbcnt_lo ? ST.getWavefrontSizeLog2() : 5);
19511 KnownBits Known2 = DAG.computeKnownBits(Op.getOperand(2), Depth + 1);
19512 Known = KnownBits::add(Known, Known2);
19513 return;
19514 }
19515 }
19516 break;
19517 }
19518 }
19519 AMDGPUTargetLowering::computeKnownBitsForTargetNode(
19520 Op, Known, DemandedElts, DAG, Depth);
19521}
19522
19523 void SITargetLowering::computeKnownBitsForFrameIndex(
19524 const int FI, KnownBits &Known, const MachineFunction &MF) const {
19525 TargetLowering::computeKnownBitsForFrameIndex(FI, Known, MF);
19526
19527 // Set the high bits to zero based on the maximum allowed scratch size per
19528 // wave. We can't use vaddr in MUBUF instructions if we don't know the address
19529 // calculation won't overflow, so assume the sign bit is never set.
19530 Known.Zero.setHighBits(getSubtarget()->getKnownHighZeroBitsForFrameIndex());
19531}
19532
19533 static void knownBitsForWorkitemID(const GCNSubtarget &ST,
19534 GISelValueTracking &VT, KnownBits &Known,
19535 unsigned Dim) {
19536 unsigned MaxValue =
19537 ST.getMaxWorkitemID(VT.getMachineFunction().getFunction(), Dim);
19538 Known.Zero.setHighBits(llvm::countl_zero(MaxValue));
19539}
19540
19541 static void knownBitsForSBFE(const MachineInstr &MI, GISelValueTracking &VT,
19542 KnownBits &Known, const APInt &DemandedElts,
19543 unsigned BFEWidth, bool SExt, unsigned Depth) {
19544 const MachineRegisterInfo &MRI = VT.getMachineFunction().getRegInfo();
19545 const MachineOperand &Src1 = MI.getOperand(2);
19546
19547 unsigned Src1Cst = 0;
19548 if (Src1.isImm()) {
19549 Src1Cst = Src1.getImm();
19550 } else if (Src1.isReg()) {
19551 auto Cst = getIConstantVRegValWithLookThrough(Src1.getReg(), MRI);
19552 if (!Cst)
19553 return;
19554 Src1Cst = Cst->Value.getZExtValue();
19555 } else {
19556 return;
19557 }
19558
19559 // Offset is at bits [4:0] for 32 bit, [5:0] for 64 bit.
19560 // Width is always [22:16].
19561 const unsigned Offset =
19562 Src1Cst & maskTrailingOnes<unsigned>((BFEWidth == 32) ? 5 : 6);
19563 const unsigned Width = (Src1Cst >> 16) & maskTrailingOnes<unsigned>(6);
19564
19565 if (Width >= BFEWidth) // Ill-formed.
19566 return;
19567
19568 VT.computeKnownBitsImpl(MI.getOperand(1).getReg(), Known, DemandedElts,
19569 Depth + 1);
19570
19571 Known = Known.extractBits(Width, Offset);
19572
19573 if (SExt)
19574 Known = Known.sext(BFEWidth);
19575 else
19576 Known = Known.zext(BFEWidth);
19577}
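// --- Illustrative example (not part of SIISelLowering.cpp) ------------------
// Worked example of the S_BFE src1 encoding decoded above: the offset sits in
// the low bits (5 of them for the 32-bit form) and the width starts at bit 16.
// A packed constant of 0x0008000C therefore selects width = 8 starting at bit
// 12. Minimal sketch mirroring the masks used in knownBitsForSBFE().
static std::pair<unsigned, unsigned> exampleDecodeSBFE32(unsigned Src1Cst) {
  unsigned Offset = Src1Cst & llvm::maskTrailingOnes<unsigned>(5);
  unsigned Width = (Src1Cst >> 16) & llvm::maskTrailingOnes<unsigned>(6);
  return {Offset, Width}; // 0x0008000C -> {12, 8}
}
// -----------------------------------------------------------------------------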
19578
19579 void SITargetLowering::computeKnownBitsForTargetInstr(
19580 GISelValueTracking &VT, Register R, KnownBits &Known,
19581 const APInt &DemandedElts, const MachineRegisterInfo &MRI,
19582 unsigned Depth) const {
19583 Known.resetAll();
19584 const MachineInstr *MI = MRI.getVRegDef(R);
19585 switch (MI->getOpcode()) {
19586 case AMDGPU::S_BFE_I32:
19587 return knownBitsForSBFE(*MI, VT, Known, DemandedElts, /*Width=*/32,
19588 /*SExt=*/true, Depth);
19589 case AMDGPU::S_BFE_U32:
19590 return knownBitsForSBFE(*MI, VT, Known, DemandedElts, /*Width=*/32,
19591 /*SExt=*/false, Depth);
19592 case AMDGPU::S_BFE_I64:
19593 return knownBitsForSBFE(*MI, VT, Known, DemandedElts, /*Width=*/64,
19594 /*SExt=*/true, Depth);
19595 case AMDGPU::S_BFE_U64:
19596 return knownBitsForSBFE(*MI, VT, Known, DemandedElts, /*Width=*/64,
19597 /*SExt=*/false, Depth);
19598 case AMDGPU::G_INTRINSIC:
19599 case AMDGPU::G_INTRINSIC_CONVERGENT: {
19600 Intrinsic::ID IID = cast<GIntrinsic>(MI)->getIntrinsicID();
19601 switch (IID) {
19602 case Intrinsic::amdgcn_workitem_id_x:
19603 knownBitsForWorkitemID(*getSubtarget(), VT, Known, 0);
19604 break;
19605 case Intrinsic::amdgcn_workitem_id_y:
19606 knownBitsForWorkitemID(*getSubtarget(), VT, Known, 1);
19607 break;
19608 case Intrinsic::amdgcn_workitem_id_z:
19609 knownBitsForWorkitemID(*getSubtarget(), VT, Known, 2);
19610 break;
19611 case Intrinsic::amdgcn_mbcnt_lo:
19612 case Intrinsic::amdgcn_mbcnt_hi: {
19613 // Wave64 mbcnt_lo returns at most 32 + src1. Otherwise these return at
19614 // most 31 + src1.
19615 Known.Zero.setBitsFrom(IID == Intrinsic::amdgcn_mbcnt_lo
19616 ? getSubtarget()->getWavefrontSizeLog2()
19617 : 5);
19618 KnownBits Known2;
19619 VT.computeKnownBitsImpl(MI->getOperand(3).getReg(), Known2, DemandedElts,
19620 Depth + 1);
19621 Known = KnownBits::add(Known, Known2);
19622 break;
19623 }
19624 case Intrinsic::amdgcn_groupstaticsize: {
19625 // We can report everything over the maximum size as 0. We can't report
19626 // based on the actual size because we don't know if it's accurate or not
19627 // at any given point.
19628 Known.Zero.setHighBits(
19629 llvm::countl_zero(getSubtarget()->getAddressableLocalMemorySize()));
19630 break;
19631 }
19632 }
19633 break;
19634 }
19635 case AMDGPU::G_AMDGPU_BUFFER_LOAD_UBYTE:
19636 Known.Zero.setHighBits(24);
19637 break;
19638 case AMDGPU::G_AMDGPU_BUFFER_LOAD_USHORT:
19639 Known.Zero.setHighBits(16);
19640 break;
19641 case AMDGPU::G_AMDGPU_COPY_SCC_VCC:
19642 // G_AMDGPU_COPY_SCC_VCC converts a uniform boolean in VCC to SGPR s32,
19643 // producing exactly 0 or 1.
19644 Known.Zero.setHighBits(Known.getBitWidth() - 1);
19645 break;
19646 case AMDGPU::G_AMDGPU_SMED3:
19647 case AMDGPU::G_AMDGPU_UMED3: {
19648 auto [Dst, Src0, Src1, Src2] = MI->getFirst4Regs();
19649
19650 KnownBits Known2;
19651 VT.computeKnownBitsImpl(Src2, Known2, DemandedElts, Depth + 1);
19652 if (Known2.isUnknown())
19653 break;
19654
19655 KnownBits Known1;
19656 VT.computeKnownBitsImpl(Src1, Known1, DemandedElts, Depth + 1);
19657 if (Known1.isUnknown())
19658 break;
19659
19660 KnownBits Known0;
19661 VT.computeKnownBitsImpl(Src0, Known0, DemandedElts, Depth + 1);
19662 if (Known0.isUnknown())
19663 break;
19664
19665 // TODO: Handle LeadZero/LeadOne from UMIN/UMAX handling.
19666 Known.Zero = Known0.Zero & Known1.Zero & Known2.Zero;
19667 Known.One = Known0.One & Known1.One & Known2.One;
19668 break;
19669 }
19670 }
19671}
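// --- Illustrative example (not part of SIISelLowering.cpp) ------------------
// The mbcnt bound used above comes from the common lane-index idiom: on a
// wave64 target, mbcnt.lo(~0, 0) is at most 32 (so bits >= 6 are known zero
// before src1 is added), and mbcnt.hi(~0, lo) is at most 63. A minimal device
// code sketch of the idiom, assuming Clang's amdgcn builtins.
static unsigned exampleWaveLaneIndex() {
  unsigned Lo = __builtin_amdgcn_mbcnt_lo(~0u, 0u);
  return __builtin_amdgcn_mbcnt_hi(~0u, Lo); // lane id within the wave
}
// -----------------------------------------------------------------------------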
19672
19673 Align SITargetLowering::computeKnownAlignForTargetInstr(
19674 GISelValueTracking &VT, Register R, const MachineRegisterInfo &MRI,
19675 unsigned Depth) const {
19676 const MachineInstr *MI = MRI.getVRegDef(R);
19677 if (auto *GI = dyn_cast<GIntrinsic>(MI)) {
19678 // FIXME: Can this move to generic code? What about the case where the call
19679 // site specifies a lower alignment?
19680 Intrinsic::ID IID = GI->getIntrinsicID();
19682 AttributeList Attrs =
19683 Intrinsic::getAttributes(Ctx, IID, Intrinsic::getType(Ctx, IID));
19684 if (MaybeAlign RetAlign = Attrs.getRetAlignment())
19685 return *RetAlign;
19686 }
19687 return Align(1);
19688}
19689
19692 const Align CacheLineAlign = Align(64);
19693
19694 // GFX950: Prevent an 8-byte instruction at loop header from being split by
19695 // the 32-byte instruction fetch window boundary. This avoids a significant
19696 // fetch delay after backward branch. We use 32-byte alignment with max
19697 // padding of 4 bytes (one s_nop), see getMaxPermittedBytesForAlignment().
19698 if (ML && !DisableLoopAlignment &&
19699 getSubtarget()->hasLoopHeadInstSplitSensitivity()) {
19700 const MachineBasicBlock *Header = ML->getHeader();
19701 // Respect user-specified or previously set alignment.
19702 if (Header->getAlignment() != PrefAlign)
19703 return Header->getAlignment();
19704 if (needsFetchWindowAlignment(*Header))
19705 return Align(32);
19706 }
19707
19708 // Pre-GFX10 targets did not benefit from loop alignment.
19709 if (!ML || DisableLoopAlignment || !getSubtarget()->hasInstPrefetch() ||
19710 getSubtarget()->hasInstFwdPrefetchBug())
19711 return PrefAlign;
19712
19713 // On GFX10 I$ is 4 x 64 bytes cache lines.
19714 // By default prefetcher keeps one cache line behind and reads two ahead.
19715 // We can modify it with S_INST_PREFETCH for larger loops to have two lines
19716 // behind and one ahead.
19717 // Therefore we can benefit from aligning loop headers if the loop fits in 192
19718 // bytes. If the loop fits in 64 bytes it always spans no more than two cache
19719 // lines and does not need alignment.
19720 // Otherwise, if the loop is at most 128 bytes we do not need to modify the
19721 // prefetch settings; if it is at most 192 bytes we need two lines behind.
19722
19723 const SIInstrInfo *TII = getSubtarget()->getInstrInfo();
19724 const MachineBasicBlock *Header = ML->getHeader();
19725 if (Header->getAlignment() != PrefAlign)
19726 return Header->getAlignment(); // Already processed.
19727
19728 unsigned LoopSize = 0;
19729 for (const MachineBasicBlock *MBB : ML->blocks()) {
19730 // If an inner loop block is aligned, assume that on average half of the
19731 // alignment size will be added as nops.
19732 if (MBB != Header)
19733 LoopSize += MBB->getAlignment().value() / 2;
19734
19735 for (const MachineInstr &MI : *MBB) {
19736 LoopSize += TII->getInstSizeInBytes(MI);
19737 if (LoopSize > 192)
19738 return PrefAlign;
19739 }
19740 }
19741
19742 if (LoopSize <= 64)
19743 return PrefAlign;
19744
19745 if (LoopSize <= 128)
19746 return CacheLineAlign;
19747
19748 // If any of the parent loops is surrounded by prefetch instructions, do not
19749 // insert a new one for the inner loop, as that would reset the parent's settings.
19750 for (MachineLoop *P = ML->getParentLoop(); P; P = P->getParentLoop()) {
19751 if (MachineBasicBlock *Exit = P->getExitBlock()) {
19752 auto I = Exit->getFirstNonDebugInstr();
19753 if (I != Exit->end() && I->getOpcode() == AMDGPU::S_INST_PREFETCH)
19754 return CacheLineAlign;
19755 }
19756 }
19757
19758 MachineBasicBlock *Pre = ML->getLoopPreheader();
19759 MachineBasicBlock *Exit = ML->getExitBlock();
19760
19761 if (Pre && Exit) {
19762 auto PreTerm = Pre->getFirstTerminator();
19763 if (PreTerm == Pre->begin() ||
19764 std::prev(PreTerm)->getOpcode() != AMDGPU::S_INST_PREFETCH)
19765 BuildMI(*Pre, PreTerm, DebugLoc(), TII->get(AMDGPU::S_INST_PREFETCH))
19766 .addImm(1); // prefetch 2 lines behind PC
19767
19768 auto ExitHead = Exit->getFirstNonDebugInstr();
19769 if (ExitHead == Exit->end() ||
19770 ExitHead->getOpcode() != AMDGPU::S_INST_PREFETCH)
19771 BuildMI(*Exit, ExitHead, DebugLoc(), TII->get(AMDGPU::S_INST_PREFETCH))
19772 .addImm(2); // prefetch 1 line behind PC
19773 }
19774
19775 return CacheLineAlign;
19776}
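// --- Illustrative example (not part of SIISelLowering.cpp) ------------------
// The GFX10 decision above, reduced to its thresholds: loops of at most 64
// bytes need no extra alignment, loops up to 128 bytes get cache-line (64-byte)
// alignment with the default prefetch, and loops up to 192 bytes additionally
// get S_INST_PREFETCH switched to keep two lines behind the PC. A minimal
// sketch of that size-to-alignment mapping only; the real code also inserts
// the prefetch instructions and respects pre-existing alignment.
static llvm::Align examplePrefLoopAlignGFX10(unsigned LoopSizeBytes,
                                             llvm::Align PrefAlign) {
  if (LoopSizeBytes <= 64 || LoopSizeBytes > 192)
    return PrefAlign;     // too small to matter, or too large to help
  return llvm::Align(64); // <= 128: keep default prefetch; <= 192: adjust it
}
// -----------------------------------------------------------------------------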
19777
19778 unsigned SITargetLowering::getMaxPermittedBytesForAlignment(
19779 MachineBasicBlock *MBB) const {
19780 // GFX950: Limit padding to 4 bytes (one s_nop) for blocks where an 8-byte
19781 // instruction could be split by the 32-byte fetch window boundary.
19782 // See getPrefLoopAlignment() for context.
19783 if (needsFetchWindowAlignment(*MBB))
19784 return 4;
19785 return TargetLowering::getMaxPermittedBytesForAlignment(MBB);
19786}
19787
19788bool SITargetLowering::needsFetchWindowAlignment(
19789 const MachineBasicBlock &MBB) const {
19790 if (!getSubtarget()->hasLoopHeadInstSplitSensitivity())
19791 return false;
19792 const SIInstrInfo *TII = getSubtarget()->getInstrInfo();
19793 for (const MachineInstr &MI : MBB) {
19794 if (MI.isMetaInstruction())
19795 continue;
19796 // Instructions larger than 4 bytes can be split by a 32-byte boundary.
19797 return TII->getInstSizeInBytes(MI) > 4;
19798 }
19799 return false;
19800}
19801
19802[[maybe_unused]]
19803static bool isCopyFromRegOfInlineAsm(const SDNode *N) {
19804 assert(N->getOpcode() == ISD::CopyFromReg);
19805 do {
19806 // Follow the chain until we find an INLINEASM node.
19807 N = N->getOperand(0).getNode();
19808 if (N->getOpcode() == ISD::INLINEASM || N->getOpcode() == ISD::INLINEASM_BR)
19809 return true;
19810 } while (N->getOpcode() == ISD::CopyFromReg);
19811 return false;
19812}
19813
19814 bool SITargetLowering::isSDNodeSourceOfDivergence(const SDNode *N,
19815 FunctionLoweringInfo *FLI,
19816 UniformityInfo *UA) const {
19817 switch (N->getOpcode()) {
19818 case ISD::CopyFromReg: {
19819 const RegisterSDNode *R = cast<RegisterSDNode>(N->getOperand(1));
19820 const MachineRegisterInfo &MRI = FLI->MF->getRegInfo();
19821 const SIRegisterInfo *TRI = Subtarget->getRegisterInfo();
19822 Register Reg = R->getReg();
19823
19824 // FIXME: Why does this need to consider isLiveIn?
19825 if (Reg.isPhysical() || MRI.isLiveIn(Reg))
19826 return !TRI->isSGPRReg(MRI, Reg);
19827
19828 if (const Value *V = FLI->getValueFromVirtualReg(R->getReg()))
19829 return UA->isDivergentAtDef(V);
19830
19832 return !TRI->isSGPRReg(MRI, Reg);
19833 }
19834 case ISD::LOAD: {
19835 const LoadSDNode *L = cast<LoadSDNode>(N);
19836 unsigned AS = L->getAddressSpace();
19837 // A flat load may access private memory.
19838 return AS == AMDGPUAS::PRIVATE_ADDRESS || AS == AMDGPUAS::FLAT_ADDRESS;
19839 }
19840 case ISD::CALLSEQ_END:
19841 return true;
19842 case ISD::INTRINSIC_WO_CHAIN:
19843 return AMDGPU::isIntrinsicSourceOfDivergence(N->getConstantOperandVal(0));
19844 case ISD::INTRINSIC_W_CHAIN:
19845 return AMDGPU::isIntrinsicSourceOfDivergence(N->getConstantOperandVal(1));
19846 case AMDGPUISD::ATOMIC_CMP_SWAP:
19847 case AMDGPUISD::BUFFER_ATOMIC_SWAP:
19848 case AMDGPUISD::BUFFER_ATOMIC_ADD:
19849 case AMDGPUISD::BUFFER_ATOMIC_SUB:
19850 case AMDGPUISD::BUFFER_ATOMIC_SMIN:
19851 case AMDGPUISD::BUFFER_ATOMIC_UMIN:
19852 case AMDGPUISD::BUFFER_ATOMIC_SMAX:
19853 case AMDGPUISD::BUFFER_ATOMIC_UMAX:
19854 case AMDGPUISD::BUFFER_ATOMIC_AND:
19855 case AMDGPUISD::BUFFER_ATOMIC_OR:
19856 case AMDGPUISD::BUFFER_ATOMIC_XOR:
19857 case AMDGPUISD::BUFFER_ATOMIC_INC:
19858 case AMDGPUISD::BUFFER_ATOMIC_DEC:
19859 case AMDGPUISD::BUFFER_ATOMIC_CMPSWAP:
19860 case AMDGPUISD::BUFFER_ATOMIC_FADD:
19861 case AMDGPUISD::BUFFER_ATOMIC_FMIN:
19862 case AMDGPUISD::BUFFER_ATOMIC_FMAX:
19863 // Target-specific read-modify-write atomics are sources of divergence.
19864 return true;
19865 default:
19866 if (auto *A = dyn_cast<AtomicSDNode>(N)) {
19867 // Generic read-modify-write atomics are sources of divergence.
19868 return A->readMem() && A->writeMem();
19869 }
19870 return false;
19871 }
19872}
19873
19874 bool SITargetLowering::denormalsEnabledForType(const SelectionDAG &DAG,
19875 EVT VT) const {
19876 switch (VT.getScalarType().getSimpleVT().SimpleTy) {
19877 case MVT::f32:
19878 return !denormalModeIsFlushAllF32(DAG.getMachineFunction());
19879 case MVT::f64:
19880 case MVT::f16:
19881 return !denormalModeIsFlushAllF64F16(DAG.getMachineFunction());
19882 default:
19883 return false;
19884 }
19885}
19886
19887 bool SITargetLowering::denormalsEnabledForType(
19888 LLT Ty, const MachineFunction &MF) const {
19889 switch (Ty.getScalarSizeInBits()) {
19890 case 32:
19891 return !denormalModeIsFlushAllF32(MF);
19892 case 64:
19893 case 16:
19894 return !denormalModeIsFlushAllF64F16(MF);
19895 default:
19896 return false;
19897 }
19898}
19899
19901 const APInt &DemandedElts,
19902 const SelectionDAG &DAG,
19903 bool SNaN,
19904 unsigned Depth) const {
19905 if (Op.getOpcode() == AMDGPUISD::CLAMP) {
19906 const MachineFunction &MF = DAG.getMachineFunction();
19907 const SIMachineFunctionInfo *Info = MF.getInfo<SIMachineFunctionInfo>();
19908
19909 if (Info->getMode().DX10Clamp)
19910 return true; // Clamped to 0.
19911 return DAG.isKnownNeverNaN(Op.getOperand(0), SNaN, Depth + 1);
19912 }
19913
19914 return AMDGPUTargetLowering::isKnownNeverNaNForTargetNode(Op, DemandedElts,
19915 DAG, SNaN, Depth);
19916}
19917
19918 // On older subtargets, global FP atomic instructions have a hardcoded FP mode
19919 // and do not support FP32 denormals; only v2f16/f64 denormals are supported.
19921 if (RMW->hasMetadata("amdgpu.ignore.denormal.mode"))
19922 return true;
19923
19924 const fltSemantics &Flt = RMW->getType()->getScalarType()->getFltSemantics();
19925 auto DenormMode = RMW->getFunction()->getDenormalMode(Flt);
19926 if (DenormMode == DenormalMode::getPreserveSign())
19927 return true;
19928
19929 // TODO: Remove this.
19930 return RMW->getFunction()
19931 ->getFnAttribute("amdgpu-unsafe-fp-atomics")
19932 .getValueAsBool();
19933}
19934
19935 static OptimizationRemark emitAtomicRMWLegalRemark(const AtomicRMWInst *RMW) {
19936 LLVMContext &Ctx = RMW->getContext();
19937 StringRef MemScope =
19938 Ctx.getSyncScopeName(RMW->getSyncScopeID()).value_or("system");
19939
19940 return OptimizationRemark(DEBUG_TYPE, "Passed", RMW)
19941 << "Hardware instruction generated for atomic "
19942 << RMW->getOperationName(RMW->getOperation())
19943 << " operation at memory scope " << MemScope;
19944}
19945
19946static bool isV2F16OrV2BF16(Type *Ty) {
19947 if (auto *VT = dyn_cast<FixedVectorType>(Ty)) {
19948 Type *EltTy = VT->getElementType();
19949 return VT->getNumElements() == 2 &&
19950 (EltTy->isHalfTy() || EltTy->isBFloatTy());
19951 }
19952
19953 return false;
19954}
19955
19956 static bool isV2F16(Type *Ty) {
19957 auto *VT = dyn_cast<FixedVectorType>(Ty);
19958 return VT && VT->getNumElements() == 2 && VT->getElementType()->isHalfTy();
19959}
19960
19961 static bool isV2BF16(Type *Ty) {
19962 auto *VT = dyn_cast<FixedVectorType>(Ty);
19963 return VT && VT->getNumElements() == 2 && VT->getElementType()->isBFloatTy();
19964}
19965
19966/// \return true if atomicrmw integer ops work for the type.
19967static bool isAtomicRMWLegalIntTy(Type *Ty) {
19968 if (auto *IT = dyn_cast<IntegerType>(Ty)) {
19969 unsigned BW = IT->getBitWidth();
19970 return BW == 32 || BW == 64;
19971 }
19972
19973 return false;
19974}
19975
19976/// \return true if this atomicrmw xchg type can be selected.
19977static bool isAtomicRMWLegalXChgTy(const AtomicRMWInst *RMW) {
19978 Type *Ty = RMW->getType();
19979 if (isAtomicRMWLegalIntTy(Ty))
19980 return true;
19981
19982 if (PointerType *PT = dyn_cast<PointerType>(Ty)) {
19983 const DataLayout &DL = RMW->getFunction()->getParent()->getDataLayout();
19984 unsigned BW = DL.getPointerSizeInBits(PT->getAddressSpace());
19985 return BW == 32 || BW == 64;
19986 }
19987
19988 if (Ty->isFloatTy() || Ty->isDoubleTy())
19989 return true;
19990
19991 if (auto *VT = dyn_cast<FixedVectorType>(Ty)) {
19992 return VT->getNumElements() == 2 &&
19993 VT->getElementType()->getPrimitiveSizeInBits() == 16;
19994 }
19995
19996 return false;
19997}
19998
19999/// \returns true if it's valid to emit a native instruction for \p RMW, based
20000/// on the properties of the target memory.
20001static bool globalMemoryFPAtomicIsLegal(const GCNSubtarget &Subtarget,
20002 const AtomicRMWInst *RMW,
20003 bool HasSystemScope) {
20004 // The remote/fine-grained access logic is different from the integer
20005 // atomics. Without AgentScopeFineGrainedRemoteMemoryAtomics support,
20006 // fine-grained access does not work, even for a device local allocation.
20007 //
20008 // With AgentScopeFineGrainedRemoteMemoryAtomics, system scoped device local
20009 // allocations work.
20010 if (HasSystemScope) {
20011 if (Subtarget.hasAgentScopeFineGrainedRemoteMemoryAtomics() &&
20012 RMW->hasMetadata("amdgpu.no.remote.memory"))
20013 return true;
20014 if (Subtarget.hasEmulatedSystemScopeAtomics())
20015 return true;
20016 } else if (Subtarget.hasAgentScopeFineGrainedRemoteMemoryAtomics())
20017 return true;
20018
20019 return RMW->hasMetadata("amdgpu.no.fine.grained.memory");
20020}
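// --- Illustrative example (not part of SIISelLowering.cpp) ------------------
// The checks above key off metadata that a front end or optimizer attaches to
// the atomic instruction. A minimal sketch of marking an atomicrmw as neither
// fine-grained nor remote so it qualifies for a native instruction; the
// function name is made up for illustration.
static void exampleMarkCoarseGrainedLocal(llvm::AtomicRMWInst *RMW) {
  llvm::MDNode *Empty = llvm::MDNode::get(RMW->getContext(), {});
  RMW->setMetadata("amdgpu.no.fine.grained.memory", Empty);
  RMW->setMetadata("amdgpu.no.remote.memory", Empty);
}
// -----------------------------------------------------------------------------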
20021
20022 /// \return Action to perform on AtomicRMWInsts for integer operations.
20023 static TargetLowering::AtomicExpansionKind
20024 atomicSupportedIfLegalIntType(const AtomicRMWInst *RMW) {
20025 return isAtomicRMWLegalIntTy(RMW->getType())
20026 ? TargetLowering::AtomicExpansionKind::None
20027 : TargetLowering::AtomicExpansionKind::CmpXChg;
20028 }
20029
20030 /// Return if a flat address space atomicrmw can access private memory.
20031 static bool flatInstrMayAccessPrivate(const Instruction *I) {
20032 const MDNode *MD = I->getMetadata(LLVMContext::MD_noalias_addrspace);
20033 return !MD ||
20035}
20036
20037 static TargetLowering::AtomicExpansionKind
20038 getPrivateAtomicExpansionKind(const GCNSubtarget &STI) {
20039 // For GAS, lower to flat atomic.
20040 return STI.hasGloballyAddressableScratch()
20041 ? TargetLowering::AtomicExpansionKind::CustomExpand
20042 : TargetLowering::AtomicExpansionKind::NotAtomic;
20043}
20044
20045 TargetLowering::AtomicExpansionKind
20046 SITargetLowering::shouldExpandAtomicRMWInIR(AtomicRMWInst *RMW) const {
20047 unsigned AS = RMW->getPointerAddressSpace();
20048 if (AS == AMDGPUAS::PRIVATE_ADDRESS)
20049 return getPrivateAtomicExpansionKind(*Subtarget);
20050
20051 // 64-bit flat atomics that dynamically reside in private memory will silently
20052 // be dropped.
20053 //
20054 // Note that we will emit a new copy of the original atomic in the expansion,
20055 // which will be incrementally relegalized.
20056 const DataLayout &DL = RMW->getFunction()->getDataLayout();
20057 if (AS == AMDGPUAS::FLAT_ADDRESS &&
20058 DL.getTypeSizeInBits(RMW->getType()) == 64 &&
20059 flatInstrMayAccessPrivate(RMW))
20060 return AtomicExpansionKind::CustomExpand;
20061
20062 auto ReportUnsafeHWInst = [=](TargetLowering::AtomicExpansionKind Kind) {
20063 OptimizationRemarkEmitter ORE(RMW->getFunction());
20064 ORE.emit([=]() {
20065 return emitAtomicRMWLegalRemark(RMW) << " due to an unsafe request.";
20066 });
20067 return Kind;
20068 };
20069
20070 auto SSID = RMW->getSyncScopeID();
20071 bool HasSystemScope =
20072 SSID == SyncScope::System ||
20073 SSID == RMW->getContext().getOrInsertSyncScopeID("one-as");
20074
20075 auto Op = RMW->getOperation();
20076 switch (Op) {
20077 case AtomicRMWInst::Xchg:
20078 // PCIe supports add and xchg for system atomics.
20079 return isAtomicRMWLegalXChgTy(RMW)
20080 ? TargetLowering::AtomicExpansionKind::None
20081 : TargetLowering::AtomicExpansionKind::CmpXChg;
20082 case AtomicRMWInst::Add:
20083 // PCIe supports add and xchg for system atomics.
20084 return atomicSupportedIfLegalIntType(RMW);
20085 case AtomicRMWInst::Sub:
20086 case AtomicRMWInst::And:
20087 case AtomicRMWInst::Or:
20088 case AtomicRMWInst::Xor:
20089 case AtomicRMWInst::Max:
20090 case AtomicRMWInst::Min:
20091 case AtomicRMWInst::UMax:
20092 case AtomicRMWInst::UMin:
20093 case AtomicRMWInst::UIncWrap:
20094 case AtomicRMWInst::UDecWrap:
20095 case AtomicRMWInst::USubCond:
20096 case AtomicRMWInst::USubSat: {
20097 if (Op == AtomicRMWInst::USubCond && !Subtarget->hasCondSubInsts())
20098 return AtomicExpansionKind::CmpXChg;
20099 if (Op == AtomicRMWInst::USubSat && !Subtarget->hasSubClampInsts())
20100 return AtomicExpansionKind::CmpXChg;
20102 auto *IT = dyn_cast<IntegerType>(RMW->getType());
20103 if (!IT || IT->getBitWidth() != 32)
20104 return AtomicExpansionKind::CmpXChg;
20105 }
20106
20109 if (Subtarget->hasEmulatedSystemScopeAtomics())
20111
20112 // On most subtargets, for atomicrmw operations other than add/xchg,
20113 // whether or not the instructions will behave correctly depends on where
20114 // the address physically resides and what interconnect is used in the
20115 // system configuration. On some targets the instruction will nop,
20116 // and in others synchronization will only occur at degraded device scope.
20117 //
20118 // If the allocation is known local to the device, the instructions should
20119 // work correctly.
20120 if (RMW->hasMetadata("amdgpu.no.remote.memory"))
20122
20123 // If fine-grained remote memory works at device scope, we don't need to
20124 // do anything.
20125 if (!HasSystemScope &&
20126 Subtarget->hasAgentScopeFineGrainedRemoteMemoryAtomics())
20127 return atomicSupportedIfLegalIntType(RMW);
20128
20129 // If we are targeting a remote allocated address, it depends what kind of
20130 // allocation the address belongs to.
20131 //
20132 // If the allocation is fine-grained (in host memory, or in PCIe peer
20133 // device memory), the operation will fail depending on the target.
20134 //
20135 // Note fine-grained host memory access does work on APUs or if XGMI is
20136 // used, but we do not know if we are targeting an APU or the system
20137 // configuration from the ISA version/target-cpu.
20138 if (RMW->hasMetadata("amdgpu.no.fine.grained.memory"))
20140
20143 // Atomic sub/or/xor do not work over PCI express, but atomic add
20144 // does. InstCombine transforms these with 0 to or, so undo that.
20145 if (const Constant *ConstVal = dyn_cast<Constant>(RMW->getValOperand());
20146 ConstVal && ConstVal->isNullValue())
20147 return AtomicExpansionKind::Expand;
20148 }
20149
20150 // If the allocation could be in remote, fine-grained memory, the rmw
20151 // instructions may fail. cmpxchg should work, so emit that. On some
20152 // system configurations, PCIe atomics aren't supported so cmpxchg won't
20153 // even work, so you're out of luck anyway.
20154
20155 // In summary:
20156 //
20157 // Cases that may fail:
20158 // - fine-grained pinned host memory
20159 // - fine-grained migratable host memory
20160 // - fine-grained PCIe peer device
20161 //
20162 // Cases that should work, but may be treated overly conservatively.
20163 // - fine-grained host memory on an APU
20164 // - fine-grained XGMI peer device
20165 return AtomicExpansionKind::CmpXChg;
20166 }
20167
20168 return atomicSupportedIfLegalIntType(RMW);
20169 }
20170 case AtomicRMWInst::FAdd: {
20171 Type *Ty = RMW->getType();
20172
20173 // TODO: Handle REGION_ADDRESS
20174 if (AS == AMDGPUAS::LOCAL_ADDRESS) {
20175 // DS F32 FP atomics do respect the denormal mode, but the rounding mode
20176 // is fixed to round-to-nearest-even.
20177 //
20178 // F64 / PK_F16 / PK_BF16 never flush and are also fixed to
20179 // round-to-nearest-even.
20180 //
20181 // We ignore the rounding mode problem, even in strictfp. The C++ standard
20182 // suggests it is OK if the floating-point mode may not match the calling
20183 // thread.
20184 if (Ty->isFloatTy()) {
20185 return Subtarget->hasLDSFPAtomicAddF32() ? AtomicExpansionKind::None
20186 : AtomicExpansionKind::CmpXChg;
20187 }
20188
20189 if (Ty->isDoubleTy()) {
20190 // Ignores denormal mode, but we don't consider flushing mandatory.
20191 return Subtarget->hasLDSFPAtomicAddF64() ? AtomicExpansionKind::None
20192 : AtomicExpansionKind::CmpXChg;
20193 }
20194
20195 if (Subtarget->hasAtomicDsPkAdd16Insts() && isV2F16OrV2BF16(Ty))
20196 return AtomicExpansionKind::None;
20197
20198 return AtomicExpansionKind::CmpXChg;
20199 }
20200
20201 // LDS atomics respect the denormal mode from the mode register.
20202 //
20203 // Traditionally f32 global/buffer memory atomics would unconditionally
20204 // flush denormals, but newer targets do not flush. f64/f16/bf16 cases never
20205 // flush.
20206 //
20207 // On targets with flat atomic fadd, denormals would flush depending on
20208 // whether the target address resides in LDS or global memory. We consider
20209 // this flat-maybe-flush as will-flush.
20210 if (Ty->isFloatTy() &&
20211 !Subtarget->hasMemoryAtomicFaddF32DenormalSupport() &&
20214
20215 // FIXME: These ReportUnsafeHWInsts are imprecise. Some of these cases are
20216 // safe. The message phrasing also should be better.
20217 if (globalMemoryFPAtomicIsLegal(*Subtarget, RMW, HasSystemScope)) {
20218 if (AS == AMDGPUAS::FLAT_ADDRESS) {
20219 // gfx942, gfx12
20220 if (Subtarget->hasAtomicFlatPkAdd16Insts() && isV2F16OrV2BF16(Ty))
20221 return ReportUnsafeHWInst(AtomicExpansionKind::None);
20222 } else if (AMDGPU::isExtendedGlobalAddrSpace(AS)) {
20223 // gfx90a, gfx942, gfx12
20224 if (Subtarget->hasAtomicBufferGlobalPkAddF16Insts() && isV2F16(Ty))
20225 return ReportUnsafeHWInst(AtomicExpansionKind::None);
20226
20227 // gfx942, gfx12
20228 if (Subtarget->hasAtomicGlobalPkAddBF16Inst() && isV2BF16(Ty))
20229 return ReportUnsafeHWInst(AtomicExpansionKind::None);
20230 } else if (AS == AMDGPUAS::BUFFER_FAT_POINTER) {
20231 // gfx90a, gfx942, gfx12
20232 if (Subtarget->hasAtomicBufferGlobalPkAddF16Insts() && isV2F16(Ty))
20233 return ReportUnsafeHWInst(AtomicExpansionKind::None);
20234
20235 // While gfx90a/gfx942 supports v2bf16 for global/flat, it does not for
20236 // buffer. gfx12 does have the buffer version.
20237 if (Subtarget->hasAtomicBufferPkAddBF16Inst() && isV2BF16(Ty))
20238 return ReportUnsafeHWInst(AtomicExpansionKind::None);
20239 }
20240
20241 // global and flat atomic fadd f64: gfx90a, gfx942.
20242 if (Subtarget->hasFlatBufferGlobalAtomicFaddF64Inst() && Ty->isDoubleTy())
20243 return ReportUnsafeHWInst(AtomicExpansionKind::None);
20244
20245 if (AS != AMDGPUAS::FLAT_ADDRESS) {
20246 if (Ty->isFloatTy()) {
20247 // global/buffer atomic fadd f32 no-rtn: gfx908, gfx90a, gfx942,
20248 // gfx11+.
20249 if (RMW->use_empty() && Subtarget->hasAtomicFaddNoRtnInsts())
20250 return ReportUnsafeHWInst(AtomicExpansionKind::None);
20251 // global/buffer atomic fadd f32 rtn: gfx90a, gfx942, gfx11+.
20252 if (!RMW->use_empty() && Subtarget->hasAtomicFaddRtnInsts())
20253 return ReportUnsafeHWInst(AtomicExpansionKind::None);
20254 } else {
20255 // gfx908
20256 if (RMW->use_empty() &&
20257 Subtarget->hasAtomicBufferGlobalPkAddF16NoRtnInsts() &&
20258 isV2F16(Ty))
20259 return ReportUnsafeHWInst(AtomicExpansionKind::None);
20260 }
20261 }
20262
20263 // flat atomic fadd f32: gfx942, gfx11+.
20264 if (AS == AMDGPUAS::FLAT_ADDRESS && Ty->isFloatTy()) {
20265 if (Subtarget->hasFlatAtomicFaddF32Inst())
20266 return ReportUnsafeHWInst(AtomicExpansionKind::None);
20267
20268 // If the address is in the flat address space and the type is float, we
20269 // try to expand it if the target supports both global and LDS atomic
20270 // fadd. The reason is that the expansion emits an address-space check:
20271 // if the address is in the global address space, we emit a global atomic
20272 // fadd; if it is in the shared address space, we emit an LDS atomic
20273 // fadd.
20274 if (Subtarget->hasLDSFPAtomicAddF32()) {
20275 if (RMW->use_empty() && Subtarget->hasAtomicFaddNoRtnInsts())
20276 return AtomicExpansionKind::Expand;
20277 if (!RMW->use_empty() && Subtarget->hasAtomicFaddRtnInsts())
20278 return AtomicExpansionKind::Expand;
20279 }
20280 }
20281 }
20282
20283 return AtomicExpansionKind::CmpXChg;
20284 }
20285 case AtomicRMWInst::FMin:
20286 case AtomicRMWInst::FMax: {
20287 Type *Ty = RMW->getType();
20288
20289 // LDS float and double fmin/fmax were always supported.
20290 if (AS == AMDGPUAS::LOCAL_ADDRESS) {
20291 return Ty->isFloatTy() || Ty->isDoubleTy() ? AtomicExpansionKind::None
20293 }
20294
20295 if (globalMemoryFPAtomicIsLegal(*Subtarget, RMW, HasSystemScope)) {
20296 // For flat and global cases:
20297 // float, double in gfx7. Manual claims denormal support.
20298 // Removed in gfx8.
20299 // float, double restored in gfx10.
20300 // double removed again in gfx11, so only f32 for gfx11/gfx12.
20301 //
20302 // For gfx9, gfx90a and gfx942 support f64 for global (same as fadd), but
20303 // no f32.
20304 if (AS == AMDGPUAS::FLAT_ADDRESS) {
20305 if (Subtarget->hasAtomicFMinFMaxF32FlatInsts() && Ty->isFloatTy())
20306 return ReportUnsafeHWInst(AtomicExpansionKind::None);
20307 if (Subtarget->hasAtomicFMinFMaxF64FlatInsts() && Ty->isDoubleTy())
20308 return ReportUnsafeHWInst(AtomicExpansionKind::None);
20309 } else if (AMDGPU::isExtendedGlobalAddrSpace(AS) ||
20311 if (Subtarget->hasAtomicFMinFMaxF32GlobalInsts() && Ty->isFloatTy())
20312 return ReportUnsafeHWInst(AtomicExpansionKind::None);
20313 if (Subtarget->hasAtomicFMinFMaxF64GlobalInsts() && Ty->isDoubleTy())
20314 return ReportUnsafeHWInst(AtomicExpansionKind::None);
20315 }
20316 }
20317
20318 return AtomicExpansionKind::CmpXChg;
20319 }
20320 case AtomicRMWInst::Nand:
20321 case AtomicRMWInst::FSub:
20322 default:
20323 return AtomicExpansionKind::CmpXChg;
20324 }
20325
20326 llvm_unreachable("covered atomicrmw op switch");
20327}
20328
20335
20342
20343 TargetLowering::AtomicExpansionKind
20344 SITargetLowering::shouldExpandAtomicCmpXchgInIR(
20345 const AtomicCmpXchgInst *CmpX) const {
20346 unsigned AddrSpace = CmpX->getPointerAddressSpace();
20347 if (AddrSpace == AMDGPUAS::PRIVATE_ADDRESS)
20348 return getPrivateAtomicExpansionKind(*Subtarget);
20349
20350 if (AddrSpace != AMDGPUAS::FLAT_ADDRESS || !flatInstrMayAccessPrivate(CmpX))
20351 return AtomicExpansionKind::None;
20352
20353 const DataLayout &DL = CmpX->getDataLayout();
20354
20355 Type *ValTy = CmpX->getNewValOperand()->getType();
20356
20357 // If a 64-bit flat atomic may alias private, we need to avoid using the
20358 // atomic in the private case.
20359 return DL.getTypeSizeInBits(ValTy) == 64 ? AtomicExpansionKind::CustomExpand
20361}
20362
20363const TargetRegisterClass *
20364 SITargetLowering::getRegClassFor(MVT VT, bool isDivergent) const {
20365 const TargetRegisterClass *RC = TargetLoweringBase::getRegClassFor(VT, false);
20366 const SIRegisterInfo *TRI = Subtarget->getRegisterInfo();
20367 if (RC == &AMDGPU::VReg_1RegClass && !isDivergent)
20368 return Subtarget->isWave64() ? &AMDGPU::SReg_64RegClass
20369 : &AMDGPU::SReg_32RegClass;
20370 if (!TRI->isSGPRClass(RC) && !isDivergent)
20371 return TRI->getEquivalentSGPRClass(RC);
20372 if (TRI->isSGPRClass(RC) && isDivergent) {
20373 if (Subtarget->hasGFX90AInsts())
20374 return TRI->getEquivalentAVClass(RC);
20375 return TRI->getEquivalentVGPRClass(RC);
20376 }
20377
20378 return RC;
20379}
20380
20381// FIXME: This is a workaround for DivergenceAnalysis not understanding always
20382// uniform values (as produced by the mask results of control flow intrinsics)
20383// used outside of divergent blocks. The phi users need to also be treated as
20384// always uniform.
20385//
20386// FIXME: DA is no longer in-use. Does this still apply to UniformityAnalysis?
20387static bool hasCFUser(const Value *V, SmallPtrSet<const Value *, 16> &Visited,
20388 unsigned WaveSize) {
20389 // FIXME: We assume we never cast the mask results of a control flow
20390 // intrinsic.
20391 // Early exit if the type won't be consistent as a compile time hack.
20392 IntegerType *IT = dyn_cast<IntegerType>(V->getType());
20393 if (!IT || IT->getBitWidth() != WaveSize)
20394 return false;
20395
20396 if (!isa<Instruction>(V))
20397 return false;
20398 if (!Visited.insert(V).second)
20399 return false;
20400 bool Result = false;
20401 for (const auto *U : V->users()) {
20402 if (const IntrinsicInst *Intrinsic = dyn_cast<IntrinsicInst>(U)) {
20403 if (V == U->getOperand(1)) {
20404 switch (Intrinsic->getIntrinsicID()) {
20405 default:
20406 Result = false;
20407 break;
20408 case Intrinsic::amdgcn_if_break:
20409 case Intrinsic::amdgcn_if:
20410 case Intrinsic::amdgcn_else:
20411 Result = true;
20412 break;
20413 }
20414 }
20415 if (V == U->getOperand(0)) {
20416 switch (Intrinsic->getIntrinsicID()) {
20417 default:
20418 Result = false;
20419 break;
20420 case Intrinsic::amdgcn_end_cf:
20421 case Intrinsic::amdgcn_loop:
20422 Result = true;
20423 break;
20424 }
20425 }
20426 } else {
20427 Result = hasCFUser(U, Visited, WaveSize);
20428 }
20429 if (Result)
20430 break;
20431 }
20432 return Result;
20433}
20434
20435 bool SITargetLowering::requiresUniformRegister(MachineFunction &MF,
20436 const Value *V) const {
20437 if (const CallInst *CI = dyn_cast<CallInst>(V)) {
20438 if (CI->isInlineAsm()) {
20439 // FIXME: This cannot give a correct answer. This should only trigger in
20440 // the case where inline asm returns mixed SGPR and VGPR results, used
20441 // outside the defining block. We don't have a specific result to
20442 // consider, so this assumes if any value is SGPR, the overall register
20443 // also needs to be SGPR.
20444 const SIRegisterInfo *SIRI = Subtarget->getRegisterInfo();
20445 TargetLowering::AsmOperandInfoVector TargetConstraints = ParseConstraints(
20446 MF.getDataLayout(), Subtarget->getRegisterInfo(), *CI);
20447 for (auto &TC : TargetConstraints) {
20448 if (TC.Type == InlineAsm::isOutput) {
20449 ComputeConstraintToUse(TC, SDValue());
20450 const TargetRegisterClass *RC =
20451 getRegForInlineAsmConstraint(SIRI, TC.ConstraintCode,
20452 TC.ConstraintVT)
20453 .second;
20454 if (RC && SIRI->isSGPRClass(RC))
20455 return true;
20456 }
20457 }
20458 }
20459 }
20460 SmallPtrSet<const Value *, 16> Visited;
20461 return hasCFUser(V, Visited, Subtarget->getWavefrontSize());
20462}
20463
20464 bool SITargetLowering::hasMemSDNodeUser(SDNode *N) const {
20465 for (SDUse &Use : N->uses()) {
20466 if (MemSDNode *M = dyn_cast<MemSDNode>(Use.getUser())) {
20467 if (getBasePtrIndex(M) == Use.getOperandNo())
20468 return true;
20469 }
20470 }
20471 return false;
20472}
20473
20474 bool SITargetLowering::isReassocProfitable(SelectionDAG &DAG, SDValue N0,
20475 SDValue N1) const {
20476 if (!N0.hasOneUse())
20477 return false;
20478 // Take care of the opportunity to keep N0 uniform
20479 if (N0->isDivergent() || !N1->isDivergent())
20480 return true;
20481 // Check if we have a good chance to form the memory access pattern with the
20482 // base and offset
20483 return (DAG.isBaseWithConstantOffset(N0) &&
20484 hasMemSDNodeUser(*N0->user_begin()));
20485}
20486
20487 bool SITargetLowering::isReassocProfitable(MachineRegisterInfo &MRI,
20488 Register N0, Register N1) const {
20489 return MRI.hasOneNonDBGUse(N0); // FIXME: handle regbanks
20490}
20491
20492 MachineMemOperand::Flags
20493 SITargetLowering::getTargetMMOFlags(const Instruction &I) const {
20494 // Propagate metadata set by AMDGPUAnnotateUniformValues to the MMO of a load.
20495 MachineMemOperand::Flags Flags = MachineMemOperand::MONone;
20496 if (I.getMetadata("amdgpu.noclobber"))
20497 Flags |= MONoClobber;
20498 if (I.getMetadata("amdgpu.last.use"))
20499 Flags |= MOLastUse;
20500 return Flags;
20501}
20502
20503 void SITargetLowering::emitExpandAtomicAddrSpacePredicate(
20504 Instruction *AI) const {
20505 // Given: atomicrmw fadd ptr %addr, float %val ordering
20506 //
20507 // With this expansion we produce the following code:
20508 // [...]
20509 // %is.shared = call i1 @llvm.amdgcn.is.shared(ptr %addr)
20510 // br i1 %is.shared, label %atomicrmw.shared, label %atomicrmw.check.private
20511 //
20512 // atomicrmw.shared:
20513 // %cast.shared = addrspacecast ptr %addr to ptr addrspace(3)
20514 // %loaded.shared = atomicrmw fadd ptr addrspace(3) %cast.shared,
20515 // float %val ordering
20516 // br label %atomicrmw.phi
20517 //
20518 // atomicrmw.check.private:
20519 // %is.private = call i1 @llvm.amdgcn.is.private(ptr %int8ptr)
20520 // br i1 %is.private, label %atomicrmw.private, label %atomicrmw.global
20521 //
20522 // atomicrmw.private:
20523 // %cast.private = addrspacecast ptr %addr to ptr addrspace(5)
20524 // %loaded.private = load float, ptr addrspace(5) %cast.private
20525 // %val.new = fadd float %loaded.private, %val
20526 // store float %val.new, ptr addrspace(5) %cast.private
20527 // br label %atomicrmw.phi
20528 //
20529 // atomicrmw.global:
20530 // %cast.global = addrspacecast ptr %addr to ptr addrspace(1)
20531 // %loaded.global = atomicrmw fadd ptr addrspace(1) %cast.global,
20532 // float %val ordering
20533 // br label %atomicrmw.phi
20534 //
20535 // atomicrmw.phi:
20536 // %loaded.phi = phi float [ %loaded.shared, %atomicrmw.shared ],
20537 // [ %loaded.private, %atomicrmw.private ],
20538 // [ %loaded.global, %atomicrmw.global ]
20539 // br label %atomicrmw.end
20540 //
20541 // atomicrmw.end:
20542 // [...]
20543 //
20544 //
20545 // For 64-bit atomics which may reside in private memory, we perform a simpler
20546 // version that only inserts the private check, and uses the flat operation.
20547
20548 IRBuilder<> Builder(AI);
20549 LLVMContext &Ctx = Builder.getContext();
20550
20551 auto *RMW = dyn_cast<AtomicRMWInst>(AI);
20552 const unsigned PtrOpIdx = RMW ? AtomicRMWInst::getPointerOperandIndex()
20554 Value *Addr = AI->getOperand(PtrOpIdx);
20555
20556 /// TODO: Only need to check private, then emit flat-known-not private (no
20557 /// need for shared block, or cast to global).
20558 auto *CX = dyn_cast<AtomicCmpXchgInst>(AI);
20559
20560 Align Alignment;
20561 if (RMW)
20562 Alignment = RMW->getAlign();
20563 else if (CX)
20564 Alignment = CX->getAlign();
20565 else
20566 llvm_unreachable("unhandled atomic operation");
20567
20568 // FullFlatEmulation is true if we need to issue the private, shared, and
20569 // global cases.
20570 //
20571 // If this is false, we are only dealing with the flat-targeting-private case,
20572 // where we only insert a check for private and still use the flat instruction
20573 // for global and shared.
20574
20575 bool FullFlatEmulation =
20576 RMW && RMW->getOperation() == AtomicRMWInst::FAdd &&
20577 ((Subtarget->hasAtomicFaddInsts() && RMW->getType()->isFloatTy()) ||
20578 (Subtarget->hasFlatBufferGlobalAtomicFaddF64Inst() &&
20579 RMW->getType()->isDoubleTy()));
20580
20581 // If the return value isn't used, do not introduce a false use in the phi.
20582 bool ReturnValueIsUsed = !AI->use_empty();
20583
20584 BasicBlock *BB = Builder.GetInsertBlock();
20585 Function *F = BB->getParent();
20586 BasicBlock *ExitBB =
20587 BB->splitBasicBlock(Builder.GetInsertPoint(), "atomicrmw.end");
20588 BasicBlock *SharedBB = nullptr;
20589
20590 BasicBlock *CheckPrivateBB = BB;
20591 if (FullFlatEmulation) {
20592 SharedBB = BasicBlock::Create(Ctx, "atomicrmw.shared", F, ExitBB);
20593 CheckPrivateBB =
20594 BasicBlock::Create(Ctx, "atomicrmw.check.private", F, ExitBB);
20595 }
20596
20597 BasicBlock *PrivateBB =
20598 BasicBlock::Create(Ctx, "atomicrmw.private", F, ExitBB);
20599 BasicBlock *GlobalBB = BasicBlock::Create(Ctx, "atomicrmw.global", F, ExitBB);
20600 BasicBlock *PhiBB = BasicBlock::Create(Ctx, "atomicrmw.phi", F, ExitBB);
20601
20602 std::prev(BB->end())->eraseFromParent();
20603 Builder.SetInsertPoint(BB);
20604
20605 Value *LoadedShared = nullptr;
20606 if (FullFlatEmulation) {
20607 CallInst *IsShared = Builder.CreateIntrinsic(Intrinsic::amdgcn_is_shared,
20608 {Addr}, nullptr, "is.shared");
20609 Builder.CreateCondBr(IsShared, SharedBB, CheckPrivateBB);
20610 Builder.SetInsertPoint(SharedBB);
20611 Value *CastToLocal = Builder.CreateAddrSpaceCast(
20612 Addr, PointerType::get(Ctx, AMDGPUAS::LOCAL_ADDRESS));
20613
20614 Instruction *Clone = AI->clone();
20615 Clone->insertInto(SharedBB, SharedBB->end());
20616 Clone->getOperandUse(PtrOpIdx).set(CastToLocal);
20617 LoadedShared = Clone;
20618
20619 Builder.CreateBr(PhiBB);
20620 Builder.SetInsertPoint(CheckPrivateBB);
20621 }
20622
20623 CallInst *IsPrivate = Builder.CreateIntrinsic(Intrinsic::amdgcn_is_private,
20624 {Addr}, nullptr, "is.private");
20625 Builder.CreateCondBr(IsPrivate, PrivateBB, GlobalBB);
20626
20627 Builder.SetInsertPoint(PrivateBB);
20628
20629 Value *CastToPrivate = Builder.CreateAddrSpaceCast(
20630 Addr, PointerType::get(Ctx, AMDGPUAS::PRIVATE_ADDRESS));
20631
20632 Value *LoadedPrivate;
20633 if (RMW) {
20634 LoadedPrivate = Builder.CreateAlignedLoad(
20635 RMW->getType(), CastToPrivate, RMW->getAlign(), "loaded.private");
20636
20637 Value *NewVal = buildAtomicRMWValue(RMW->getOperation(), Builder,
20638 LoadedPrivate, RMW->getValOperand());
20639
20640 Builder.CreateAlignedStore(NewVal, CastToPrivate, RMW->getAlign());
20641 } else {
20642 auto [ResultLoad, Equal] =
20643 buildCmpXchgValue(Builder, CastToPrivate, CX->getCompareOperand(),
20644 CX->getNewValOperand(), CX->getAlign());
20645
20646 Value *Insert = Builder.CreateInsertValue(PoisonValue::get(CX->getType()),
20647 ResultLoad, 0);
20648 LoadedPrivate = Builder.CreateInsertValue(Insert, Equal, 1);
20649 }
20650
20651 Builder.CreateBr(PhiBB);
20652
20653 Builder.SetInsertPoint(GlobalBB);
20654
20655 // Continue using a flat instruction if we only emitted the check for private.
20656 Instruction *LoadedGlobal = AI;
20657 if (FullFlatEmulation) {
20658 Value *CastToGlobal = Builder.CreateAddrSpaceCast(
20659 Addr, PointerType::get(Ctx, AMDGPUAS::GLOBAL_ADDRESS));
20660 AI->getOperandUse(PtrOpIdx).set(CastToGlobal);
20661 }
20662
20663 AI->removeFromParent();
20664 AI->insertInto(GlobalBB, GlobalBB->end());
20665
20666 // The new atomicrmw may go through another round of legalization later.
20667 if (!FullFlatEmulation) {
20668 // We inserted the runtime check already, make sure we do not try to
20669 // re-expand this.
20670 // TODO: Should union with any existing metadata.
20671 MDBuilder MDB(F->getContext());
20672 MDNode *RangeNotPrivate =
20675 LoadedGlobal->setMetadata(LLVMContext::MD_noalias_addrspace,
20676 RangeNotPrivate);
20677 }
20678
20679 Builder.CreateBr(PhiBB);
20680
20681 Builder.SetInsertPoint(PhiBB);
20682
20683 if (ReturnValueIsUsed) {
20684 PHINode *Loaded = Builder.CreatePHI(AI->getType(), 3);
20685 AI->replaceAllUsesWith(Loaded);
20686 if (FullFlatEmulation)
20687 Loaded->addIncoming(LoadedShared, SharedBB);
20688 Loaded->addIncoming(LoadedPrivate, PrivateBB);
20689 Loaded->addIncoming(LoadedGlobal, GlobalBB);
20690 Loaded->takeName(AI);
20691 }
20692
20693 Builder.CreateBr(ExitBB);
20694}
20695
20696 static void convertScratchAtomicToFlatAtomic(Instruction *I,
20697 unsigned PtrOpIdx) {
20698 Value *PtrOp = I->getOperand(PtrOpIdx);
20701
20702 Type *FlatPtr = PointerType::get(I->getContext(), AMDGPUAS::FLAT_ADDRESS);
20703 Value *ASCast = CastInst::CreatePointerCast(PtrOp, FlatPtr, "scratch.ascast",
20704 I->getIterator());
20705 I->setOperand(PtrOpIdx, ASCast);
20706}
20707
20710
20713
20716 if (const auto *ConstVal = dyn_cast<Constant>(AI->getValOperand());
20717 ConstVal && ConstVal->isNullValue()) {
20718 // atomicrmw or %ptr, 0 -> atomicrmw add %ptr, 0
20719 AI->setOperation(AtomicRMWInst::Add);
20720
20721 // We may still need the private-alias-flat handling below.
20722
20723 // TODO: Skip this for cases where we cannot access remote memory.
20724 }
20725 }
20726
20727 // The non-flat expansions should only perform the de-canonicalization of
20728 // identity values.
20730 return;
20731
20732 emitExpandAtomicAddrSpacePredicate(AI);
20733}
20734
20741
20742 void SITargetLowering::emitExpandAtomicLoad(LoadInst *LI) const {
20743 if (LI->getPointerAddressSpace() == AMDGPUAS::PRIVATE_ADDRESS)
20744 return convertScratchAtomicToFlatAtomic(LI, LI->getPointerOperandIndex());
20745
20746 llvm_unreachable(
20747 "Expand Atomic Load only handles SCRATCH -> FLAT conversion");
20748}
20749
20750 void SITargetLowering::emitExpandAtomicStore(StoreInst *SI) const {
20751 if (SI->getPointerAddressSpace() == AMDGPUAS::PRIVATE_ADDRESS)
20752 return convertScratchAtomicToFlatAtomic(SI, SI->getPointerOperandIndex());
20753
20755 "Expand Atomic Store only handles SCRATCH -> FLAT conversion");
20756}
20757
20758 LoadInst *
20759 SITargetLowering::lowerIdempotentRMWIntoFencedLoad(AtomicRMWInst *AI) const {
20760 IRBuilder<> Builder(AI);
20761 auto Order = AI->getOrdering();
20762
20763 // The optimization removes the store aspect of the atomicrmw. Therefore, the
20764 // cache must be flushed if the atomic ordering had release semantics. This
20765 // does not necessarily require a fence; a release fence just happens to
20766 // perform that flush. Avoid replacing an atomicrmw that has release semantics.
20767 if (isReleaseOrStronger(Order))
20768 return nullptr;
20769
20770 LoadInst *LI = Builder.CreateAlignedLoad(
20771 AI->getType(), AI->getPointerOperand(), AI->getAlign());
20772 LI->setAtomic(Order, AI->getSyncScopeID());
20773 LI->copyMetadata(*AI);
20774 LI->takeName(AI);
20775 AI->replaceAllUsesWith(LI);
20776 AI->eraseFromParent();
20777 return LI;
20778}
static bool vectorEltWillFoldAway(SDValue Op)
static SDValue getSPDenormModeValue(uint32_t SPDenormMode, SelectionDAG &DAG, const SIMachineFunctionInfo *Info, const GCNSubtarget *ST)
static uint32_t getConstantPermuteMask(uint32_t C)
static AtomicOrdering parseAtomicOrderingCABIArg(const CallBase &CI, unsigned ArgIdx)
static MachineBasicBlock * emitIndirectDst(MachineInstr &MI, MachineBasicBlock &MBB, const GCNSubtarget &ST)
static void setM0ToIndexFromSGPR(const SIInstrInfo *TII, MachineRegisterInfo &MRI, MachineInstr &MI, int Offset)
static ArgDescriptor allocateVGPR32Input(CCState &CCInfo, unsigned Mask=~0u, ArgDescriptor Arg=ArgDescriptor())
static std::pair< MachineBasicBlock *, MachineBasicBlock * > splitBlockForLoop(MachineInstr &MI, MachineBasicBlock &MBB, bool InstInLoop)
static unsigned getBasePtrIndex(const MemSDNode *N)
MemSDNode::getBasePtr() does not work for intrinsics, which needs to offset by the chain and intrinsi...
static void allocateFixedSGPRInputImpl(CCState &CCInfo, const TargetRegisterClass *RC, MCRegister Reg)
static SDValue constructRetValue(SelectionDAG &DAG, MachineSDNode *Result, ArrayRef< EVT > ResultTypes, bool IsTexFail, bool Unpacked, bool IsD16, int DMaskPop, int NumVDataDwords, bool IsAtomicPacked16Bit, const SDLoc &DL)
static std::optional< ByteProvider< SDValue > > handleMulOperand(const SDValue &MulOperand)
static ISD::CondCode tryReduceF64CompareToHiHalf(const ISD::CondCode CC, const SDValue LHS, const SDValue RHS, const SelectionDAG &DAG)
static SDValue lowerFCMPIntrinsic(const SITargetLowering &TLI, SDNode *N, SelectionDAG &DAG)
static Register getIndirectSGPRIdx(const SIInstrInfo *TII, MachineRegisterInfo &MRI, MachineInstr &MI, int Offset)
static SDValue emitNonHSAIntrinsicError(SelectionDAG &DAG, const SDLoc &DL, EVT VT)
static EVT memVTFromLoadIntrData(const SITargetLowering &TLI, const DataLayout &DL, Type *Ty, unsigned MaxNumLanes)
static unsigned minMaxOpcToMin3Max3Opc(unsigned Opc)
static unsigned getExtOpcodeForPromotedOp(SDValue Op)
static void expand64BitV_CNDMASK(MachineInstr &MI, MachineBasicBlock *BB)
static SDValue lowerBALLOTIntrinsic(const SITargetLowering &TLI, SDNode *N, SelectionDAG &DAG)
static SDValue buildSMovImm32(SelectionDAG &DAG, const SDLoc &DL, uint64_t Val)
static SDValue tryFoldMADwithSRL(SelectionDAG &DAG, const SDLoc &SL, SDValue MulLHS, SDValue MulRHS, SDValue AddRHS)
static unsigned getIntrMemWidth(unsigned IntrID)
static SDValue getBuildDwordsVector(SelectionDAG &DAG, SDLoc DL, ArrayRef< SDValue > Elts)
static SDNode * findUser(SDValue Value, unsigned Opcode)
Helper function for LowerBRCOND.
static unsigned addPermMasks(unsigned First, unsigned Second)
static uint64_t clearUnusedBits(uint64_t Val, unsigned Size)
static SDValue getFPTernOp(SelectionDAG &DAG, unsigned Opcode, const SDLoc &SL, EVT VT, SDValue A, SDValue B, SDValue C, SDValue GlueChain, SDNodeFlags Flags)
static bool isV2F16OrV2BF16(Type *Ty)
static bool atomicIgnoresDenormalModeOrFPModeIsFTZ(const AtomicRMWInst *RMW)
static SDValue emitRemovedIntrinsicError(SelectionDAG &DAG, const SDLoc &DL, EVT VT)
static SDValue getFPBinOp(SelectionDAG &DAG, unsigned Opcode, const SDLoc &SL, EVT VT, SDValue A, SDValue B, SDValue GlueChain, SDNodeFlags Flags)
static SDValue buildPCRelGlobalAddress(SelectionDAG &DAG, const GlobalValue *GV, const SDLoc &DL, int64_t Offset, EVT PtrVT, unsigned GAFlags=SIInstrInfo::MO_NONE)
static cl::opt< bool > UseDivergentRegisterIndexing("amdgpu-use-divergent-register-indexing", cl::Hidden, cl::desc("Use indirect register addressing for divergent indexes"), cl::init(false))
static const std::optional< ByteProvider< SDValue > > calculateSrcByte(const SDValue Op, uint64_t DestByte, uint64_t SrcIndex=0, unsigned Depth=0)
static bool isV2F16(Type *Ty)
static void allocateSGPR64Input(CCState &CCInfo, ArgDescriptor &Arg)
static uint64_t getIdentityValueForWaveReduction(unsigned Opc)
SI DAG Lowering interface definition.
Interface definition for SIRegisterInfo.
static bool contains(SmallPtrSetImpl< ConstantExpr * > &Cache, ConstantExpr *Expr, Constant *C)
Definition Value.cpp:483
This file defines the 'Statistic' class, which is designed to be an easy way to expose various metric...
#define STATISTIC(VARNAME, DESC)
Definition Statistic.h:171
#define LLVM_DEBUG(...)
Definition Debug.h:119
static TableGen::Emitter::Opt Y("gen-skeleton-entry", EmitSkeleton, "Generate example skeleton entry")
LLVM IR instance of the generic uniformity analysis.
static constexpr int Concat[]
Value * RHS
Value * LHS
The Input class is used to parse a yaml document into in-memory structs and vectors.
static std::optional< uint32_t > getLDSKernelIdMetadata(const Function &F)
void setDynLDSAlign(const Function &F, const GlobalVariable &GV)
static std::optional< uint32_t > getLDSAbsoluteAddress(const GlobalValue &GV)
unsigned getWavefrontSize() const
static unsigned numBitsSigned(SDValue Op, SelectionDAG &DAG)
SDValue SplitVectorLoad(SDValue Op, SelectionDAG &DAG) const
Split a vector load into 2 loads of half the vector.
void analyzeFormalArgumentsCompute(CCState &State, const SmallVectorImpl< ISD::InputArg > &Ins) const
The SelectionDAGBuilder will automatically promote function arguments with illegal types.
SDValue LowerF64ToF16Safe(SDValue Src, const SDLoc &DL, SelectionDAG &DAG) const
SDValue storeStackInputValue(SelectionDAG &DAG, const SDLoc &SL, SDValue Chain, SDValue ArgVal, int64_t Offset) const
void computeKnownBitsForTargetNode(const SDValue Op, KnownBits &Known, const APInt &DemandedElts, const SelectionDAG &DAG, unsigned Depth=0) const override
Determine which of the bits specified in Mask are known to be either zero or one and return them in t...
SDValue splitBinaryBitConstantOpImpl(DAGCombinerInfo &DCI, const SDLoc &SL, unsigned Opc, SDValue LHS, uint32_t ValLo, uint32_t ValHi) const
Split the 64-bit value LHS into two 32-bit components, and perform the binary operation Opc to it wit...
SDValue lowerUnhandledCall(CallLoweringInfo &CLI, SmallVectorImpl< SDValue > &InVals, StringRef Reason) const
virtual SDValue LowerGlobalAddress(AMDGPUMachineFunctionInfo *MFI, SDValue Op, SelectionDAG &DAG) const
SDValue LowerOperation(SDValue Op, SelectionDAG &DAG) const override
This callback is invoked for operations that are unsupported by the target, which are registered to u...
SDValue addTokenForArgument(SDValue Chain, SelectionDAG &DAG, MachineFrameInfo &MFI, int ClobberedFI) const
bool isKnownNeverNaNForTargetNode(SDValue Op, const APInt &DemandedElts, const SelectionDAG &DAG, bool SNaN=false, unsigned Depth=0) const override
If SNaN is false, returns true if Op is known to never be any NaN; if SNaN is true, returns true if Op is known to never be a signaling NaN.
static bool needsDenormHandlingF32(const SelectionDAG &DAG, SDValue Src, SDNodeFlags Flags)
uint32_t getImplicitParameterOffset(const MachineFunction &MF, const ImplicitParameter Param) const
Helper function that returns the byte offset of the given type of implicit parameter.
SDValue LowerFP_TO_INT(SDValue Op, SelectionDAG &DAG) const
SDValue loadInputValue(SelectionDAG &DAG, const TargetRegisterClass *RC, EVT VT, const SDLoc &SL, const ArgDescriptor &Arg) const
AMDGPUTargetLowering(const TargetMachine &TM, const TargetSubtargetInfo &STI, const AMDGPUSubtarget &AMDGPUSTI)
static EVT getEquivalentMemType(LLVMContext &Context, EVT VT)
SDValue CreateLiveInRegister(SelectionDAG &DAG, const TargetRegisterClass *RC, Register Reg, EVT VT, const SDLoc &SL, bool RawReg=false) const
Helper function that adds Reg to the LiveIn list of the DAG's MachineFunction.
SDValue SplitVectorStore(SDValue Op, SelectionDAG &DAG) const
Split a vector store into 2 stores of half the vector.
std::pair< SDValue, SDValue > split64BitValue(SDValue Op, SelectionDAG &DAG) const
Return 64-bit value Op as two 32-bit integers.
static CCAssignFn * CCAssignFnForReturn(CallingConv::ID CC, bool IsVarArg)
static CCAssignFn * CCAssignFnForCall(CallingConv::ID CC, bool IsVarArg)
Selects the correct CCAssignFn for a given CallingConvention value.
static unsigned numBitsUnsigned(SDValue Op, SelectionDAG &DAG)
static bool allowApproxFunc(const SelectionDAG &DAG, SDNodeFlags Flags)
SDValue LowerReturn(SDValue Chain, CallingConv::ID CallConv, bool isVarArg, const SmallVectorImpl< ISD::OutputArg > &Outs, const SmallVectorImpl< SDValue > &OutVals, const SDLoc &DL, SelectionDAG &DAG) const override
This hook must be implemented to lower outgoing return values, described by the Outs array,...
void ReplaceNodeResults(SDNode *N, SmallVectorImpl< SDValue > &Results, SelectionDAG &DAG) const override
This callback is invoked when a node result type is illegal for the target, and the operation was reg...
SDValue performRcpCombine(SDNode *N, DAGCombinerInfo &DCI) const
static bool shouldFoldFNegIntoSrc(SDNode *FNeg, SDValue FNegSrc)
SDValue PerformDAGCombine(SDNode *N, DAGCombinerInfo &DCI) const override
This method will be invoked for all target nodes and for any target-independent nodes that the target...
SDValue WidenOrSplitVectorLoad(SDValue Op, SelectionDAG &DAG) const
Widen a suitably aligned v3 load.
SDValue getHiHalf64(SDValue Op, SelectionDAG &DAG) const
bool isNoopAddrSpaceCast(unsigned SrcAS, unsigned DestAS) const override
Returns true if a cast between SrcAS and DestAS is a noop.
const std::array< unsigned, 3 > & getDims() const
static const LaneMaskConstants & get(const GCNSubtarget &ST)
static constexpr roundingMode rmNearestTiesToEven
Definition APFloat.h:344
static const fltSemantics & IEEEhalf()
Definition APFloat.h:294
static APFloat getQNaN(const fltSemantics &Sem, bool Negative=false, const APInt *payload=nullptr)
Factory for QNaN values.
Definition APFloat.h:1179
LLVM_ABI opStatus convert(const fltSemantics &ToSemantics, roundingMode RM, bool *losesInfo)
Definition APFloat.cpp:5899
LLVM_READONLY int getExactLog2Abs() const
Definition APFloat.h:1594
bool isNegative() const
Definition APFloat.h:1538
bool isNormal() const
Definition APFloat.h:1542
APInt bitcastToAPInt() const
Definition APFloat.h:1430
static APFloat getLargest(const fltSemantics &Sem, bool Negative=false)
Returns the largest finite number in the given semantics.
Definition APFloat.h:1197
static APFloat getInf(const fltSemantics &Sem, bool Negative=false)
Factory for Positive and Negative Infinity.
Definition APFloat.h:1157
static APFloat getZero(const fltSemantics &Sem, bool Negative=false)
Factory for Positive and Negative Zero.
Definition APFloat.h:1138
bool isInfinity() const
Definition APFloat.h:1535
Class for arbitrary precision integers.
Definition APInt.h:78
void setHighBits(unsigned hiBits)
Set the top hiBits bits.
Definition APInt.h:1414
void setBitsFrom(unsigned loBit)
Set the top bits starting from loBit.
Definition APInt.h:1408
static APInt getBitsSet(unsigned numBits, unsigned loBit, unsigned hiBit)
Get a value with a block of bits set.
Definition APInt.h:259
bool isZero() const
Determine if this value is zero, i.e. all bits are clear.
Definition APInt.h:381
bool isSignMask() const
Check if the APInt's value is returned by getSignMask.
Definition APInt.h:467
unsigned countr_zero() const
Count the number of trailing zero bits.
Definition APInt.h:1662
bool isOneBitSet(unsigned BitNo) const
Determine if this APInt Value only has the specified bit set.
Definition APInt.h:367
bool isSignBitSet() const
Determine if sign bit of this APInt is set.
Definition APInt.h:342
static APInt getHighBitsSet(unsigned numBits, unsigned hiBitsSet)
Constructs an APInt value that has the top hiBitsSet bits set.
Definition APInt.h:297
bool sge(const APInt &RHS) const
Signed greater or equal comparison.
Definition APInt.h:1244
bool uge(const APInt &RHS) const
Unsigned greater or equal comparison.
Definition APInt.h:1228
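A short illustration of the APInt mask helpers listed above (the widths and bit positions are arbitrary examples):
  #include "llvm/ADT/APInt.h"
  using namespace llvm;
  static void apIntSketch() {
    APInt Mid = APInt::getBitsSet(32, 8, 16);     // bits [8, 16) set: 0x0000FF00
    APInt Hi = APInt::getHighBitsSet(32, 4);      // top 4 bits set: 0xF0000000
    unsigned TZ = Mid.countr_zero();              // 8 trailing zero bits
    bool SignSet = Hi.isSignBitSet();             // true, bit 31 is set
    bool GE = Hi.uge(Mid);                        // unsigned compare: true
    (void)TZ; (void)SignSet; (void)GE;
  }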
This class represents an incoming formal argument to a Function.
Definition Argument.h:32
LLVM_ABI bool hasAttribute(Attribute::AttrKind Kind) const
Check if an argument has a given attribute.
Definition Function.cpp:338
const Function * getParent() const
Definition Argument.h:44
Represent a constant reference to an array (0 or more elements consecutively in memory),...
Definition ArrayRef.h:40
size_t size() const
Get the array size.
Definition ArrayRef.h:141
bool empty() const
Check if the array is empty.
Definition ArrayRef.h:136
An instruction that atomically checks whether a specified value is in a memory location,...
unsigned getPointerAddressSpace() const
Returns the address space of the pointer operand.
Align getAlign() const
Return the alignment of the memory that is being accessed by the instruction.
static unsigned getPointerOperandIndex()
an instruction that atomically reads a memory location, combines it with another value,...
Align getAlign() const
Return the alignment of the memory that is being accessed by the instruction.
static unsigned getPointerOperandIndex()
BinOp
This enumeration lists the possible modifications atomicrmw can make.
@ Add
*p = old + v
@ FAdd
*p = old + v
@ USubCond
Subtract only if no unsigned overflow.
@ Min
*p = old <signed v ? old : v
@ Sub
*p = old - v
@ And
*p = old & v
@ Xor
*p = old ^ v
@ USubSat
*p = usub.sat(old, v) usub.sat matches the behavior of llvm.usub.sat.
@ FSub
*p = old - v
@ UIncWrap
Increment one up to a maximum value.
@ Max
*p = old >signed v ? old : v
@ UMin
*p = old <unsigned v ? old : v
@ FMin
*p = minnum(old, v) minnum matches the behavior of llvm.minnum.
@ UMax
*p = old >unsigned v ? old : v
@ FMax
*p = maxnum(old, v) maxnum matches the behavior of llvm.maxnum.
@ UDecWrap
Decrement one until a minimum value or zero.
@ Nand
*p = ~(old & v)
Value * getPointerOperand()
void setOperation(BinOp Operation)
BinOp getOperation() const
SyncScope::ID getSyncScopeID() const
Returns the synchronization scope ID of this rmw instruction.
static LLVM_ABI StringRef getOperationName(BinOp Op)
AtomicOrdering getOrdering() const
Returns the ordering constraint of this rmw instruction.
unsigned getPointerAddressSpace() const
Returns the address space of the pointer operand.
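A hedged sketch of how an AtomicRMWInst is typically queried when choosing an expansion; address space 0 is the flat address space on AMDGPU, but the predicate itself is illustrative rather than the exact logic of this file:
  #include "llvm/IR/Instructions.h"
  using namespace llvm;
  // Return true for an FP add/sub/min/max atomicrmw on a flat (addrspace 0) pointer.
  static bool isFlatFPAtomicSketch(const AtomicRMWInst *RMW) {
    switch (RMW->getOperation()) {
    case AtomicRMWInst::FAdd:
    case AtomicRMWInst::FSub:
    case AtomicRMWInst::FMin:
    case AtomicRMWInst::FMax:
      return RMW->getPointerAddressSpace() == 0;
    default:
      return false;
    }
  }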
bool isCompareAndSwap() const
Returns true if this SDNode represents a cmpxchg atomic operation, false otherwise.
This class holds the attributes for a particular argument, parameter, function, or return value.
Definition Attributes.h:407
LLVM_ABI MemoryEffects getMemoryEffects() const
LLVM_ABI bool getValueAsBool() const
Return the attribute's value as a boolean.
LLVM Basic Block Representation.
Definition BasicBlock.h:62
iterator end()
Definition BasicBlock.h:474
LLVM_ABI BasicBlock * splitBasicBlock(iterator I, const Twine &BBName="")
Split the basic block into two basic blocks at the specified instruction.
const Function * getParent() const
Return the enclosing method, or null if none.
Definition BasicBlock.h:213
static BasicBlock * Create(LLVMContext &Context, const Twine &Name="", Function *Parent=nullptr, BasicBlock *InsertBefore=nullptr)
Creates a new BasicBlock.
Definition BasicBlock.h:206
A "pseudo-class" with methods for operating on BUILD_VECTORs.
Represents known origin of an individual byte in combine pattern.
static ByteProvider getConstantZero()
static ByteProvider getSrc(std::optional< ISelOp > Val, int64_t ByteOffset, int64_t VectorOffset)
std::optional< ISelOp > Src
CCState - This class holds information needed while lowering arguments and return values.
MachineFunction & getMachineFunction() const
unsigned getFirstUnallocated(ArrayRef< MCPhysReg > Regs) const
getFirstUnallocated - Return the index of the first unallocated register in the set,...
static LLVM_ABI bool resultsCompatible(CallingConv::ID CalleeCC, CallingConv::ID CallerCC, MachineFunction &MF, LLVMContext &C, const SmallVectorImpl< ISD::InputArg > &Ins, CCAssignFn CalleeFn, CCAssignFn CallerFn)
Returns true if the results of the two calling conventions are compatible.
LLVM_ABI void AnalyzeCallResult(const SmallVectorImpl< ISD::InputArg > &Ins, CCAssignFn Fn)
AnalyzeCallResult - Analyze the return values of a call, incorporating info about the passed values i...
MCRegister AllocateReg(MCPhysReg Reg)
AllocateReg - Attempt to allocate one register.
LLVM_ABI bool CheckReturn(const SmallVectorImpl< ISD::OutputArg > &Outs, CCAssignFn Fn)
CheckReturn - Analyze the return values of a function, returning true if the return can be performed ...
LLVM_ABI void AnalyzeReturn(const SmallVectorImpl< ISD::OutputArg > &Outs, CCAssignFn Fn)
AnalyzeReturn - Analyze the returned values of a return, incorporating info about the result values i...
int64_t AllocateStack(unsigned Size, Align Alignment)
AllocateStack - Allocate a chunk of stack space with the specified size and alignment.
LLVM_ABI void AnalyzeCallOperands(const SmallVectorImpl< ISD::OutputArg > &Outs, CCAssignFn Fn)
AnalyzeCallOperands - Analyze the outgoing arguments to a call, incorporating info about the passed v...
uint64_t getStackSize() const
Returns the size of the currently allocated portion of the stack.
bool isAllocated(MCRegister Reg) const
isAllocated - Return true if the specified register (or an alias) is allocated.
LLVM_ABI void AnalyzeFormalArguments(const SmallVectorImpl< ISD::InputArg > &Ins, CCAssignFn Fn)
AnalyzeFormalArguments - Analyze an array of argument values, incorporating info about the formals in...
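The CCState members above follow the usual argument-analysis pattern; a minimal sketch assuming a calling-convention function CC_Hypothetical (a stand-in for whatever CCAssignFnForCall returns):
  #include "llvm/ADT/SmallVector.h"
  #include "llvm/CodeGen/CallingConvLower.h"
  #include "llvm/CodeGen/MachineFunction.h"
  #include "llvm/IR/Function.h"
  using namespace llvm;
  static void analyzeArgsSketch(MachineFunction &MF, CallingConv::ID CC, bool IsVarArg,
                                const SmallVectorImpl<ISD::InputArg> &Ins,
                                CCAssignFn *CC_Hypothetical) {
    SmallVector<CCValAssign, 16> ArgLocs;
    CCState CCInfo(CC, IsVarArg, MF, ArgLocs, MF.getFunction().getContext());
    CCInfo.AnalyzeFormalArguments(Ins, CC_Hypothetical);
    for (const CCValAssign &VA : ArgLocs) {
      if (VA.isRegLoc())
        (void)VA.getLocReg();       // argument assigned to a register
      else
        (void)VA.getLocMemOffset(); // argument assigned to a stack slot
    }
  }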
CCValAssign - Represent assignment of one arg/retval to a location.
Register getLocReg() const
LocInfo getLocInfo() const
int64_t getLocMemOffset() const
Base class for all callable instructions (InvokeInst and CallInst) Holds everything related to callin...
bool hasFnAttr(Attribute::AttrKind Kind) const
Determine whether this call has the given attribute.
LLVM_ABI bool isMustTailCall() const
Tests if this call site must be tail call optimized.
Value * getArgOperand(unsigned i) const
unsigned arg_size() const
This class represents a function call, abstracting a target machine's calling convention.
bool isTailCall() const
static LLVM_ABI CastInst * CreatePointerCast(Value *S, Type *Ty, const Twine &Name="", InsertPosition InsertBefore=nullptr)
Create a BitCast, AddrSpaceCast or a PtrToInt cast instruction.
Predicate
This enumeration lists the possible predicates for CmpInst subclasses.
Definition InstrTypes.h:676
@ ICMP_NE
not equal
Definition InstrTypes.h:698
bool isSigned() const
Definition InstrTypes.h:930
static bool isFPPredicate(Predicate P)
Definition InstrTypes.h:770
static bool isIntPredicate(Predicate P)
Definition InstrTypes.h:776
const APFloat & getValueAPF() const
bool isExactlyValue(double V) const
We don't rely on operator== working on double values, as it returns true for things that are clearly ...
bool isNegative() const
Return true if the value is negative.
bool isInfinity() const
Return true if the value is an infinity.
This is the shared class of boolean and integer constants.
Definition Constants.h:87
bool isZero() const
This is just a convenience method to make client code smaller for a common code.
Definition Constants.h:219
uint64_t getZExtValue() const
const APInt & getAPIntValue() const
This is an important base class in LLVM.
Definition Constant.h:43
A parsed version of the target data layout string in and methods for querying it.
Definition DataLayout.h:64
LLVM_ABI Align getABITypeAlign(Type *Ty) const
Returns the minimum ABI-required alignment for the specified type.
bool isBigEndian() const
Definition DataLayout.h:218
A debug info location.
Definition DebugLoc.h:123
Diagnostic information for unsupported feature in backend.
static constexpr ElementCount getFixed(ScalarTy MinVal)
Definition TypeSize.h:309
Class to represent fixed width SIMD vectors.
unsigned getNumElements() const
FunctionLoweringInfo - This contains information that is global to a function that is used when lower...
Register DemoteRegister
DemoteRegister - if CanLowerReturn is false, DemoteRegister is a vreg allocated to hold a pointer to ...
const Value * getValueFromVirtualReg(Register Vreg)
This method is called from TargetLowering::isSDNodeSourceOfDivergence to get the Value correspondi...
Class to represent function types.
Type * getParamType(unsigned i) const
Parameter type accessors.
FunctionType * getFunctionType() const
Returns the FunctionType for me.
Definition Function.h:211
const DataLayout & getDataLayout() const
Get the data layout of the module this function belongs to.
Definition Function.cpp:362
iterator_range< arg_iterator > args()
Definition Function.h:892
Attribute getFnAttribute(Attribute::AttrKind Kind) const
Return the attribute for the given attribute kind.
Definition Function.cpp:763
CallingConv::ID getCallingConv() const
getCallingConv()/setCallingConv(CC) - These method get and set the calling convention of this functio...
Definition Function.h:272
LLVMContext & getContext() const
getContext - Return a reference to the LLVMContext associated with this function.
Definition Function.cpp:358
DenormalMode getDenormalMode(const fltSemantics &FPType) const
Returns the denormal handling type for the default rounding mode of the function.
Definition Function.cpp:804
Argument * getArg(unsigned i) const
Definition Function.h:886
const SIInstrInfo * getInstrInfo() const override
bool hasMadF16() const
unsigned getInstCacheLineSize() const
Instruction cache line size in bytes (64 for pre-GFX11, 128 for GFX11+).
const SIRegisterInfo * getRegisterInfo() const override
bool hasMin3Max3_16() const
unsigned getKnownHighZeroBitsForFrameIndex() const
Return the number of high bits known to be zero for a frame index.
bool supportsWaveWideBPermute() const
unsigned getMaxPrivateElementSize(bool ForBufferRSrc=false) const
bool isWave64() const
bool hasPrivateSegmentBuffer() const
const MachineFunction & getMachineFunction() const
void computeKnownBitsImpl(Register R, KnownBits &Known, const APInt &DemandedElts, unsigned Depth=0)
bool isDivergentAtDef(ConstValueRefT V) const
Whether V is divergent at its definition.
LLVM_ABI unsigned getAddressSpace() const
const GlobalValue * getGlobal() const
bool hasExternalLinkage() const
unsigned getAddressSpace() const
Module * getParent()
Get the module that this global value is contained inside of...
LLVM_ABI const DataLayout & getDataLayout() const
Get the data layout of the module this global belongs to.
Definition Globals.cpp:141
Type * getValueType() const
LLVM_ABI uint64_t getGlobalSize(const DataLayout &DL) const
Get the size of this global variable in bytes.
Definition Globals.cpp:569
This provides a uniform API for creating instructions and inserting them into a basic block: either a...
Definition IRBuilder.h:2858
LLVM_ABI Instruction * clone() const
Create a copy of 'this' instruction that is identical in all ways except the following:
LLVM_ABI void removeFromParent()
This method unlinks 'this' from the containing basic block, but does not delete it.
bool hasMetadata() const
Return true if this instruction has any metadata attached to it.
LLVM_ABI InstListType::iterator eraseFromParent()
This method unlinks 'this' from the containing basic block and deletes it.
LLVM_ABI const Function * getFunction() const
Return the function this instruction belongs to.
LLVM_ABI void setMetadata(unsigned KindID, MDNode *Node)
Set the metadata of the specified kind to the specified node.
LLVM_ABI void copyMetadata(const Instruction &SrcInst, ArrayRef< unsigned > WL=ArrayRef< unsigned >())
Copy metadata from SrcInst to this instruction.
LLVM_ABI const DataLayout & getDataLayout() const
Get the data layout of the module this instruction belongs to.
LLVM_ABI InstListType::iterator insertInto(BasicBlock *ParentBB, InstListType::iterator It)
Inserts an unlinked instruction into ParentBB at position It and returns the iterator of the inserted...
Class to represent integer types.
A wrapper class for inspecting calls to intrinsic functions.
constexpr unsigned getScalarSizeInBits() const
static constexpr LLT scalar(unsigned SizeInBits)
Get a low-level scalar or aggregate "bag of bits".
static constexpr LLT pointer(unsigned AddressSpace, unsigned SizeInBits)
Get a low-level pointer in the given address space.
constexpr TypeSize getSizeInBits() const
Returns the total size of the type. Must only be called on sized types.
LLT changeElementSize(unsigned NewEltSize) const
If this type is a vector, return a vector with the same number of elements but the new element size.
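A short example of the low-level type (LLT) helpers above; the bit widths and address space are arbitrary, and the header path follows recent LLVM trees:
  #include "llvm/CodeGenTypes/LowLevelType.h"
  using namespace llvm;
  static void lltSketch() {
    LLT S32 = LLT::scalar(32);                 // 32-bit scalar
    LLT P1 = LLT::pointer(1, 64);              // 64-bit pointer in address space 1
    unsigned Bits = S32.getScalarSizeInBits(); // 32
    LLT S16 = S32.changeElementSize(16);       // scalar case: new 16-bit element type
    (void)Bits; (void)P1; (void)S16;
  }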
This is an important class for using LLVM in a threaded context.
Definition LLVMContext.h:68
LLVM_ABI void emitError(const Instruction *I, const Twine &ErrorStr)
emitError - Emit an error message to the currently installed error handler with optional location inf...
LLVM_ABI void diagnose(const DiagnosticInfo &DI)
Report a message to the currently installed diagnostic handler.
LLVM_ABI SyncScope::ID getOrInsertSyncScopeID(StringRef SSN)
getOrInsertSyncScopeID - Maps synchronization scope name to synchronization scope ID.
An instruction for reading from memory.
unsigned getPointerAddressSpace() const
Returns the address space of the pointer operand.
void setAtomic(AtomicOrdering Ordering, SyncScope::ID SSID=SyncScope::System)
Sets the ordering constraint and the synchronization scope ID of this load instruction.
static unsigned getPointerOperandIndex()
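A hedged sketch combining getOrInsertSyncScopeID with LoadInst::setAtomic; the "agent" scope name is an AMDGPU example and the ordering is arbitrary:
  #include "llvm/IR/Instructions.h"
  #include "llvm/IR/LLVMContext.h"
  using namespace llvm;
  // Mark an existing load as a monotonic atomic load in the "agent" sync scope.
  static void makeAgentAtomicSketch(LoadInst *LI) {
    LLVMContext &Ctx = LI->getContext();
    SyncScope::ID AgentSSID = Ctx.getOrInsertSyncScopeID("agent");
    LI->setAtomic(AtomicOrdering::Monotonic, AgentSSID);
  }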
This class is used to represent ISD::LOAD nodes.
const SDValue & getBasePtr() const
const SDValue & getOffset() const
ISD::LoadExtType getExtensionType() const
Return whether this is a plain node, or one of the varieties of value-extending loads.
Describe properties that are true of each instruction in the target description file.
Wrapper class representing physical registers. Should be passed by value.
Definition MCRegister.h:41
LLVM_ABI MDNode * createRange(const APInt &Lo, const APInt &Hi)
Return metadata describing the range [Lo, Hi).
Definition MDBuilder.cpp:96
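A minimal sketch of attaching !range metadata with MDBuilder::createRange, as is commonly done for workitem-id style values; the [0, 1024) bounds are illustrative:
  #include "llvm/ADT/APInt.h"
  #include "llvm/IR/Instruction.h"
  #include "llvm/IR/LLVMContext.h"
  #include "llvm/IR/MDBuilder.h"
  using namespace llvm;
  static void attachRangeSketch(Instruction *I) {
    MDBuilder MDB(I->getContext());
    MDNode *Range = MDB.createRange(APInt(32, 0), APInt(32, 1024)); // value in [0, 1024)
    I->setMetadata(LLVMContext::MD_range, Range);
  }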
Metadata node.
Definition Metadata.h:1080
const MDOperand & getOperand(unsigned I) const
Definition Metadata.h:1444
Helper class for constructing bundles of MachineInstrs.
MachineBasicBlock::instr_iterator begin() const
Return an iterator to the first bundled instruction.
Machine Value Type.
SimpleValueType SimpleTy
uint64_t getScalarSizeInBits() const
bool bitsLE(MVT VT) const
Return true if this has no more bits than VT.
unsigned getVectorNumElements() const
bool isVector() const
Return true if this is a vector value type.
bool isScalableVector() const
Return true if this is a vector value type where the runtime length is machine dependent.
static LLVM_ABI MVT getVT(Type *Ty, bool HandleUnknown=false)
Return the value type corresponding to the specified type.
TypeSize getSizeInBits() const
Returns the size of the specified MVT in bits.
bool isPow2VectorType() const
Returns true if the given vector is a power of 2.
TypeSize getStoreSize() const
Return the number of bytes overwritten by a store of the specified value type.
static MVT getVectorVT(MVT VT, unsigned NumElements)
static MVT getIntegerVT(unsigned BitWidth)
MVT getScalarType() const
If this is a vector, return the element type, otherwise return this.
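A short example of the MVT helpers listed above (the concrete types are arbitrary; header path as in recent LLVM trees):
  #include "llvm/CodeGenTypes/MachineValueType.h"
  using namespace llvm;
  static void mvtSketch() {
    MVT V4I32 = MVT::getVectorVT(MVT::i32, 4);        // v4i32
    MVT I16 = MVT::getIntegerVT(16);                  // i16
    unsigned NumElts = V4I32.getVectorNumElements();  // 4
    MVT Elt = V4I32.getScalarType();                  // i32
    (void)I16; (void)NumElts; (void)Elt;
  }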
LLVM_ABI void transferSuccessorsAndUpdatePHIs(MachineBasicBlock *FromMBB)
Transfers all the successors, as in transferSuccessors, and update PHI operands in the successor bloc...
LLVM_ABI iterator getFirstTerminator()
Returns an iterator to the first terminator instruction of this basic block.
LLVM_ABI void addSuccessor(MachineBasicBlock *Succ, BranchProbability Prob=BranchProbability::getUnknown())
Add Succ as a successor of this MachineBasicBlock.
LLVM_ABI MachineBasicBlock * splitAt(MachineInstr &SplitInst, bool UpdateLiveIns=true, LiveIntervals *LIS=nullptr)
Split a basic block into 2 pieces at SplitPoint.
const MachineFunction * getParent() const
Return the MachineFunction containing this basic block.
void splice(iterator Where, MachineBasicBlock *Other, iterator From)
Take an instruction from MBB 'Other' at the position From, and insert it into this MBB right before '...
MachineInstrBundleIterator< MachineInstr > iterator
The MachineFrameInfo class represents an abstract stack frame until prolog/epilog code is inserted.
LLVM_ABI int CreateFixedObject(uint64_t Size, int64_t SPOffset, bool IsImmutable, bool isAliased=false)
Create a new object at a fixed location on the stack.
bool hasCalls() const
Return true if the current function has any function calls.
void setHasTailCall(bool V=true)
void setReturnAddressIsTaken(bool s)
bool hasStackObjects() const
Return true if there are any stack objects in this function.
PseudoSourceValueManager & getPSVManager() const
const TargetSubtargetInfo & getSubtarget() const
getSubtarget - Return the subtarget for which this machine code is being compiled.
MachineMemOperand * getMachineMemOperand(MachinePointerInfo PtrInfo, MachineMemOperand::Flags f, LLT MemTy, Align base_alignment, const AAMDNodes &AAInfo=AAMDNodes(), const MDNode *Ranges=nullptr, SyncScope::ID SSID=SyncScope::System, AtomicOrdering Ordering=AtomicOrdering::NotAtomic, AtomicOrdering FailureOrdering=AtomicOrdering::NotAtomic)
getMachineMemOperand - Allocate a new MachineMemOperand.
MachineFrameInfo & getFrameInfo()
getFrameInfo - Return the frame info object for the current function.
DenormalMode getDenormalMode(const fltSemantics &FPType) const
Returns the denormal handling type for the default rounding mode of the function.
void push_back(MachineBasicBlock *MBB)
MachineRegisterInfo & getRegInfo()
getRegInfo - Return information about the registers currently in use.
const DataLayout & getDataLayout() const
Return the DataLayout attached to the Module associated to this MF.
Function & getFunction()
Return the LLVM function that this machine code represents.
BasicBlockListType::iterator iterator
Ty * getInfo()
getInfo - Keep track of various per-function pieces of information for backends that would like to do...
Register addLiveIn(MCRegister PReg, const TargetRegisterClass *RC)
addLiveIn - Add the specified physical register as a live-in value and create a corresponding virtual...
MachineBasicBlock * CreateMachineBasicBlock(const BasicBlock *BB=nullptr, std::optional< UniqueBBID > BBID=std::nullopt)
CreateMachineBasicBlock - Allocate a new MachineBasicBlock.
void insert(iterator MBBI, MachineBasicBlock *MBB)
const TargetMachine & getTarget() const
getTarget - Return the target machine this machine code is compiled with
const MachineInstrBuilder & setOperandDead(unsigned OpIdx) const
const MachineInstrBuilder & addReg(Register RegNo, RegState Flags={}, unsigned SubReg=0) const
Add a new virtual register operand.
const MachineInstrBuilder & addImm(int64_t Val) const
Add a new immediate operand.
const MachineInstrBuilder & add(const MachineOperand &MO) const
const MachineInstrBuilder & addMBB(MachineBasicBlock *MBB, unsigned TargetFlags=0) const
const MachineInstrBuilder & cloneMemRefs(const MachineInstr &OtherMI) const
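These MachineInstrBuilder helpers are normally chained off BuildMI; a hedged sketch in which MoveOpc is a placeholder for a real move-immediate opcode (for example S_MOV_B32 on AMDGPU):
  #include "llvm/CodeGen/MachineInstrBuilder.h"
  #include "llvm/CodeGen/TargetInstrInfo.h"
  using namespace llvm;
  // Emit "DstReg = <MoveOpc> 0" before MI, reusing MI's debug location.
  static void emitMovImmSketch(MachineBasicBlock &MBB, MachineInstr &MI,
                               const TargetInstrInfo *TII, unsigned MoveOpc,
                               Register DstReg) {
    BuildMI(MBB, MI, MI.getDebugLoc(), TII->get(MoveOpc), DstReg)
        .addImm(0);
  }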
Representation of each machine instruction.
const MachineOperand & getOperand(unsigned i) const
A description of a memory reference used in the backend.
Flags
Flags values. These may be or'd together.
@ MOVolatile
The memory access is volatile.
@ MODereferenceable
The memory access is dereferenceable (i.e., doesn't trap).
@ MOLoad
The memory access reads data.
@ MONonTemporal
The memory access is non-temporal.
@ MOInvariant
The memory access always returns the same value (or traps).
@ MOStore
The memory access writes data.
Flags getFlags() const
Return the raw flags of the source value,.
MachineOperand class - Representation of each machine instruction operand.
unsigned getSubReg() const
int64_t getImm() const
bool isReg() const
isReg - Tests if this is a MO_Register operand.
LLVM_ABI void setReg(Register Reg)
Change the register this operand corresponds to.
bool isImm() const
isImm - Tests if this is a MO_Immediate operand.
static MachineOperand CreateImm(int64_t Val)
void setIsUndef(bool Val=true)
Register getReg() const
getReg - Returns the register number.
static MachineOperand CreateReg(Register Reg, bool isDef, bool isImp=false, bool isKill=false, bool isDead=false, bool isUndef=false, bool isEarlyClobber=false, unsigned SubReg=0, bool isDebug=false, bool isInternalRead=false, bool isRenamable=false)
MachineRegisterInfo - Keep track of information for virtual and physical registers,...
LLVM_ABI bool hasOneNonDBGUse(Register RegNo) const
hasOneNonDBGUse - Return true if there is exactly one non-Debug use of the specified register.
const TargetRegisterClass * getRegClass(Register Reg) const
Return the register class of the specified virtual register.
LLVM_ABI void clearKillFlags(Register Reg) const
clearKillFlags - Iterate over all the uses of the given register and clear the kill flag from the Mac...
LLVM_ABI MachineInstr * getVRegDef(Register Reg) const
getVRegDef - Return the machine instr that defines the specified virtual register or null if none is ...
LLVM_ABI Register createVirtualRegister(const TargetRegisterClass *RegClass, StringRef Name="")
createVirtualRegister - Create and return a new virtual register in the function with the specified r...
LLT getType(Register Reg) const
Get the low-level type of Reg or LLT{} if Reg is not a generic (target independent) virtual register.
LLVM_ABI bool isLiveIn(Register Reg) const
LLVM_ABI void setType(Register VReg, LLT Ty)
Set the low-level type of VReg to Ty.
LLVM_ABI void setRegClass(Register Reg, const TargetRegisterClass *RC)
setRegClass - Set the register class of the specified virtual register.
LLVM_ABI Register getLiveInVirtReg(MCRegister PReg) const
getLiveInVirtReg - If PReg is a live-in physical register, return the corresponding live-in virtual r...
const TargetRegisterClass * getRegClassOrNull(Register Reg) const
Return the register class of Reg, or null if Reg has not been assigned a register class yet.
void setSimpleHint(Register VReg, Register PrefReg)
Specify the preferred (target independent) register allocation hint for the specified virtual registe...
LLVM_ABI Register cloneVirtualRegister(Register VReg, StringRef Name="")
Create and return a new virtual register in the function with the same attributes as the given regist...
unsigned getNumVirtRegs() const
getNumVirtRegs - Return the number of virtual registers created.
LLVM_ABI void replaceRegWith(Register FromReg, Register ToReg)
replaceRegWith - Replace all instances of FromReg with ToReg in the machine function.
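A small sketch of the virtual-register workflow these MachineRegisterInfo members support; the register class argument is whatever class fits the value being rewritten:
  #include "llvm/CodeGen/MachineRegisterInfo.h"
  using namespace llvm;
  // Create a fresh virtual register of class RC and redirect all uses of OldReg to it.
  static Register replaceWithFreshVReg(MachineRegisterInfo &MRI, Register OldReg,
                                       const TargetRegisterClass *RC) {
    Register NewReg = MRI.createVirtualRegister(RC);
    MRI.replaceRegWith(OldReg, NewReg);
    return NewReg;
  }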
An SDNode that represents everything that will be needed to construct a MachineInstr.
This is an abstract virtual class for memory operations.
unsigned getAddressSpace() const
Return the address space for the associated pointer.
Align getAlign() const
AAMDNodes getAAInfo() const
Returns the AA info that describes the dereference.
MachineMemOperand * getMemOperand() const
Return the unique MachineMemOperand object describing the memory reference performed by operation.
const MachinePointerInfo & getPointerInfo() const
const SDValue & getChain() const
bool isInvariant() const
EVT getMemoryVT() const
Return the type of the in-memory value.
bool onlyWritesMemory() const
Whether this function only (at most) writes memory.
Definition ModRef.h:252
bool doesNotAccessMemory() const
Whether this function accesses no memory.
Definition ModRef.h:246
bool onlyReadsMemory() const
Whether this function only (at most) reads memory.
Definition ModRef.h:249
const DataLayout & getDataLayout() const
Get the data layout for the module's target platform.
Definition Module.h:280
The optimization diagnostic interface.
LLVM_ABI void emit(DiagnosticInfoOptimizationBase &OptDiag)
Output the remark via the diagnostic handler and to the optimization record file.
Diagnostic information for applied optimization remarks.
static LLVM_ABI PointerType * get(Type *ElementType, unsigned AddressSpace)
This constructs a pointer to an object of the specified type in a numbered address space.
static LLVM_ABI PoisonValue * get(Type *T)
Static factory methods - Return an 'poison' object of the specified type.
LLVM_ABI const PseudoSourceValue * getConstantPool()
Return a pseudo source value referencing the constant pool.
Wrapper class representing virtual and physical registers.
Definition Register.h:20
static Register index2VirtReg(unsigned Index)
Convert a 0-based index to a virtual register number.
Definition Register.h:72
constexpr bool isPhysical() const
Return true if the specified register number is in the physical register namespace.
Definition Register.h:83
Wrapper class for IR location info (IR ordering and DebugLoc) to be passed into SDNode creation funct...
Represents one node in the SelectionDAG.
unsigned getOpcode() const
Return the SelectionDAG opcode value for this node.
bool isDivergent() const
bool hasOneUse() const
Return true if there is exactly one use of this node.
value_iterator value_end() const
SDNodeFlags getFlags() const
uint64_t getAsZExtVal() const
Helper method returns the zero-extended integer value of a ConstantSDNode.
unsigned getNumValues() const
Return the number of values defined/returned by this operator.
const SDValue & getOperand(unsigned Num) const
uint64_t getConstantOperandVal(unsigned Num) const
Helper method returns the integer value of a ConstantSDNode operand.
EVT getValueType(unsigned ResNo) const
Return the type of a specified result.
user_iterator user_begin() const
Provide iteration support to walk over all users of an SDNode.
op_iterator op_end() const
bool isAnyAdd() const
Returns true if the node type is ADD or PTRADD.
value_iterator value_begin() const
op_iterator op_begin() const
Represents a use of a SDNode.
Unlike LLVM values, Selection DAG nodes may return multiple values as the result of a computation.
bool isUndef() const
SDNode * getNode() const
get the SDNode which holds the desired result
bool hasOneUse() const
Return true if there is exactly one node using value ResNo of Node.
SDValue getValue(unsigned R) const
EVT getValueType() const
Return the ValueType of the referenced return value.
bool isMachineOpcode() const
TypeSize getValueSizeInBits() const
Returns the size of the value in bits.
const SDValue & getOperand(unsigned i) const
MVT getSimpleValueType() const
Return the simple ValueType of the referenced return value.
unsigned getMachineOpcode() const
unsigned getOpcode() const
static unsigned getMaxMUBUFImmOffset(const GCNSubtarget &ST)
static unsigned getDSShaderTypeValue(const MachineFunction &MF)
This class keeps track of the SPI_SP_INPUT_ADDR config register, which tells the hardware which inter...
AMDGPU::ClusterDimsAttr getClusterDims() const
SIModeRegisterDefaults getMode() const
std::tuple< const ArgDescriptor *, const TargetRegisterClass *, LLT > getPreloadedValue(AMDGPUFunctionArgInfo::PreloadedValue Value) const
const AMDGPUGWSResourcePseudoSourceValue * getGWSPSV(const AMDGPUTargetMachine &TM)
static unsigned getSubRegFromChannel(unsigned Channel, unsigned NumRegs=1)
static LLVM_READONLY const TargetRegisterClass * getSGPRClassForBitWidth(unsigned BitWidth)
static bool isVGPRClass(const TargetRegisterClass *RC)
static bool isSGPRClass(const TargetRegisterClass *RC)
static bool isAGPRClass(const TargetRegisterClass *RC)
bool isOffsetFoldingLegal(const GlobalAddressSDNode *GA) const override
Return true if folding a constant offset with the given GlobalAddress is legal.
bool isTypeDesirableForOp(unsigned Op, EVT VT) const override
Return true if the target has native support for the specified value type and it is 'desirable' to us...
SDNode * PostISelFolding(MachineSDNode *N, SelectionDAG &DAG) const override
Fold the instructions after selecting them.
SDValue splitTernaryVectorOp(SDValue Op, SelectionDAG &DAG) const
MachineSDNode * wrapAddr64Rsrc(SelectionDAG &DAG, const SDLoc &DL, SDValue Ptr) const
bool isFMAFasterThanFMulAndFAdd(const MachineFunction &MF, EVT VT) const override
Return true if an FMA operation is faster than a pair of fmul and fadd instructions.
SDValue lowerGET_ROUNDING(SDValue Op, SelectionDAG &DAG) const
AtomicExpansionKind shouldExpandAtomicRMWInIR(const AtomicRMWInst *) const override
Returns how the IR-level AtomicExpand pass should expand the given AtomicRMW, if at all.
bool requiresUniformRegister(MachineFunction &MF, const Value *V) const override
Allows target to decide about the register class of the specific value that is live outside the defin...
bool isFMADLegal(const SelectionDAG &DAG, const SDNode *N) const override
Returns true if the given node can be combined with others to form an ISD::FMAD.
AtomicExpansionKind shouldExpandAtomicStoreInIR(StoreInst *SI) const override
Returns how the given (atomic) store should be expanded by the IR-level AtomicExpand pass into.
void bundleInstWithWaitcnt(MachineInstr &MI) const
Insert MI into a BUNDLE with an S_WAITCNT 0 immediately following it.
SDValue lowerROTR(SDValue Op, SelectionDAG &DAG) const
MVT getScalarShiftAmountTy(const DataLayout &, EVT) const override
Return the type to use for a scalar shift opcode, given the shifted amount type.
SDValue LowerCall(CallLoweringInfo &CLI, SmallVectorImpl< SDValue > &InVals) const override
This hook must be implemented to lower calls into the specified DAG.
MVT getPointerTy(const DataLayout &DL, unsigned AS) const override
Map address space 7 to MVT::amdgpuBufferFatPointer because that's its in-memory representation.
bool denormalsEnabledForType(const SelectionDAG &DAG, EVT VT) const
void insertCopiesSplitCSR(MachineBasicBlock *Entry, const SmallVectorImpl< MachineBasicBlock * > &Exits) const override
Insert explicit copies in entry and exit blocks.
EVT getSetCCResultType(const DataLayout &DL, LLVMContext &Context, EVT VT) const override
Return the ValueType of the result of SETCC operations.
SDNode * legalizeTargetIndependentNode(SDNode *Node, SelectionDAG &DAG) const
Legalize target independent instructions (e.g.
bool allowsMisalignedMemoryAccessesImpl(unsigned Size, unsigned AddrSpace, Align Alignment, MachineMemOperand::Flags Flags=MachineMemOperand::MONone, unsigned *IsFast=nullptr) const
TargetLoweringBase::LegalizeTypeAction getPreferredVectorAction(MVT VT) const override
Return the preferred vector type legalization action.
SDValue lowerFP_EXTEND(SDValue Op, SelectionDAG &DAG) const
const GCNSubtarget * getSubtarget() const
bool enableAggressiveFMAFusion(EVT VT) const override
Return true if target always benefits from combining into FMA for a given value type.
bool shouldEmitGOTReloc(const GlobalValue *GV) const
void CollectTargetIntrinsicOperands(const CallInst &I, SmallVectorImpl< SDValue > &Ops, SelectionDAG &DAG) const override
SDValue splitUnaryVectorOp(SDValue Op, SelectionDAG &DAG) const
SDValue lowerGET_FPENV(SDValue Op, SelectionDAG &DAG) const
bool isCanonicalized(SelectionDAG &DAG, SDValue Op, SDNodeFlags UserFlags={}, unsigned MaxDepth=5) const
void allocateSpecialInputSGPRs(CCState &CCInfo, MachineFunction &MF, const SIRegisterInfo &TRI, SIMachineFunctionInfo &Info) const
void allocateLDSKernelId(CCState &CCInfo, MachineFunction &MF, const SIRegisterInfo &TRI, SIMachineFunctionInfo &Info) const
SDValue LowerSTACKSAVE(SDValue Op, SelectionDAG &DAG) const
bool isReassocProfitable(SelectionDAG &DAG, SDValue N0, SDValue N1) const override
void allocateHSAUserSGPRs(CCState &CCInfo, MachineFunction &MF, const SIRegisterInfo &TRI, SIMachineFunctionInfo &Info) const
ArrayRef< MCPhysReg > getRoundingControlRegisters() const override
Returns a 0 terminated array of rounding control registers that can be attached into strict FP call.
ConstraintType getConstraintType(StringRef Constraint) const override
Given a constraint, return the type of constraint it is for this target.
SDValue LowerReturn(SDValue Chain, CallingConv::ID CallConv, bool IsVarArg, const SmallVectorImpl< ISD::OutputArg > &Outs, const SmallVectorImpl< SDValue > &OutVals, const SDLoc &DL, SelectionDAG &DAG) const override
This hook must be implemented to lower outgoing return values, described by the Outs array,...
const TargetRegisterClass * getRegClassFor(MVT VT, bool isDivergent) const override
Return the register class that should be used for the specified value type.
void AddMemOpInit(MachineInstr &MI) const
MachineMemOperand::Flags getTargetMMOFlags(const Instruction &I) const override
This callback is used to inspect load/store instructions and add target-specific MachineMemOperand fl...
bool isLegalGlobalAddressingMode(const AddrMode &AM) const
void computeKnownBitsForFrameIndex(int FrameIdx, KnownBits &Known, const MachineFunction &MF) const override
Determine which of the bits of FrameIndex FIOp are known to be 0.
bool shouldConvertConstantLoadToIntImm(const APInt &Imm, Type *Ty) const override
Return true if it is beneficial to convert a load of a constant to just the constant itself.
Align getPrefLoopAlignment(MachineLoop *ML) const override
Return the preferred loop alignment.
std::pair< unsigned, const TargetRegisterClass * > getRegForInlineAsmConstraint(const TargetRegisterInfo *TRI, StringRef Constraint, MVT VT) const override
Given a physical register constraint (e.g.
void emitExpandAtomicStore(StoreInst *SI) const override
Perform an atomic store in a target-specific way.
AtomicExpansionKind shouldExpandAtomicLoadInIR(LoadInst *LI) const override
Returns how the given (atomic) load should be expanded by the IR-level AtomicExpand pass.
Align computeKnownAlignForTargetInstr(GISelValueTracking &Analysis, Register R, const MachineRegisterInfo &MRI, unsigned Depth=0) const override
Determine the known alignment for the pointer value R.
bool getAsmOperandConstVal(SDValue Op, uint64_t &Val) const
bool isShuffleMaskLegal(ArrayRef< int >, EVT) const override
Targets can use this to indicate that they only support some VECTOR_SHUFFLE operations,...
void emitExpandAtomicLoad(LoadInst *LI) const override
Perform an atomic load in a target-specific way.
EVT getOptimalMemOpType(LLVMContext &Context, const MemOp &Op, const AttributeList &FuncAttributes) const override
Returns the target specific optimal type for load and store operations as a result of memset,...
void ReplaceNodeResults(SDNode *N, SmallVectorImpl< SDValue > &Results, SelectionDAG &DAG) const override
This callback is invoked when a node result type is illegal for the target, and the operation was reg...
Register getRegisterByName(const char *RegName, LLT VT, const MachineFunction &MF) const override
Return the register ID of the name passed in.
void LowerAsmOperandForConstraint(SDValue Op, StringRef Constraint, std::vector< SDValue > &Ops, SelectionDAG &DAG) const override
Lower the specified operand into the Ops vector.
LLT getPreferredShiftAmountTy(LLT Ty) const override
Return the preferred type to use for a shift opcode, given the shifted amount type is ShiftValueTy.
bool isLegalAddressingMode(const DataLayout &DL, const AddrMode &AM, Type *Ty, unsigned AS, Instruction *I=nullptr) const override
Return true if the addressing mode represented by AM is legal for this target, for a load/store of th...
SDValue lowerSET_FPENV(SDValue Op, SelectionDAG &DAG) const
bool shouldPreservePtrArith(const Function &F, EVT PtrVT) const override
True if target has some particular form of dealing with pointer arithmetic semantics for pointers wit...
void getTgtMemIntrinsic(SmallVectorImpl< IntrinsicInfo > &, const CallBase &, MachineFunction &MF, unsigned IntrinsicID) const override
Given an intrinsic, checks if on the target the intrinsic will need to map to a MemIntrinsicNode (tou...
void computeKnownBitsForTargetNode(const SDValue Op, KnownBits &Known, const APInt &DemandedElts, const SelectionDAG &DAG, unsigned Depth=0) const override
Determine which of the bits specified in Mask are known to be either zero or one and return them in t...
SDValue lowerSET_ROUNDING(SDValue Op, SelectionDAG &DAG) const
void allocateSpecialInputVGPRsFixed(CCState &CCInfo, MachineFunction &MF, const SIRegisterInfo &TRI, SIMachineFunctionInfo &Info) const
Allocate implicit function VGPR arguments in fixed registers.
LoadInst * lowerIdempotentRMWIntoFencedLoad(AtomicRMWInst *AI) const override
On some platforms, an AtomicRMW that never actually modifies the value (such as fetch_add of 0) can b...
MachineBasicBlock * emitGWSMemViolTestLoop(MachineInstr &MI, MachineBasicBlock *BB) const
bool getAddrModeArguments(const IntrinsicInst *I, SmallVectorImpl< Value * > &Ops, Type *&AccessTy) const override
CodeGenPrepare sinks address calculations into the same BB as Load/Store instructions reading the add...
bool checkAsmConstraintValA(SDValue Op, uint64_t Val, unsigned MaxSize=64) const
bool shouldEmitFixup(const GlobalValue *GV) const
MachineBasicBlock * splitKillBlock(MachineInstr &MI, MachineBasicBlock *BB) const
void emitExpandAtomicCmpXchg(AtomicCmpXchgInst *CI) const override
Perform a cmpxchg expansion using a target-specific method.
bool canTransformPtrArithOutOfBounds(const Function &F, EVT PtrVT) const override
True if the target allows transformations of in-bounds pointer arithmetic that cause out-of-bounds in...
bool hasMemSDNodeUser(SDNode *N) const
bool isSDNodeSourceOfDivergence(const SDNode *N, FunctionLoweringInfo *FLI, UniformityInfo *UA) const override
MachineBasicBlock * EmitInstrWithCustomInserter(MachineInstr &MI, MachineBasicBlock *BB) const override
This method should be implemented by targets that mark instructions with the 'usesCustomInserter' fla...
bool isEligibleForTailCallOptimization(SDValue Callee, CallingConv::ID CalleeCC, bool isVarArg, const SmallVectorImpl< ISD::OutputArg > &Outs, const SmallVectorImpl< SDValue > &OutVals, const SmallVectorImpl< ISD::InputArg > &Ins, SelectionDAG &DAG) const
bool isMemOpHasNoClobberedMemOperand(const SDNode *N) const
bool isLegalFlatAddressingMode(const AddrMode &AM, unsigned AddrSpace) const
SDValue LowerCallResult(SDValue Chain, SDValue InGlue, CallingConv::ID CallConv, bool isVarArg, const SmallVectorImpl< ISD::InputArg > &Ins, const SDLoc &DL, SelectionDAG &DAG, SmallVectorImpl< SDValue > &InVals, bool isThisReturn, SDValue ThisVal) const
SDValue LowerFormalArguments(SDValue Chain, CallingConv::ID CallConv, bool isVarArg, const SmallVectorImpl< ISD::InputArg > &Ins, const SDLoc &DL, SelectionDAG &DAG, SmallVectorImpl< SDValue > &InVals) const override
This hook must be implemented to lower the incoming (formal) arguments, described by the Ins array,...
SDValue PerformDAGCombine(SDNode *N, DAGCombinerInfo &DCI) const override
This method will be invoked for all target nodes and for any target-independent nodes that the target...
bool isFPExtFoldable(const SelectionDAG &DAG, unsigned Opcode, EVT DestVT, EVT SrcVT) const override
Return true if an fpext operation input to an Opcode operation is free (for instance,...
void AdjustInstrPostInstrSelection(MachineInstr &MI, SDNode *Node) const override
Assign the register class depending on the number of bits set in the writemask.
MVT getRegisterTypeForCallingConv(LLVMContext &Context, CallingConv::ID CC, EVT VT) const override
Certain combinations of ABIs, Targets and features require that types are legal for some operations a...
void allocateSpecialInputVGPRs(CCState &CCInfo, MachineFunction &MF, const SIRegisterInfo &TRI, SIMachineFunctionInfo &Info) const
Allocate implicit function VGPR arguments at the end of allocated user arguments.
void finalizeLowering(MachineFunction &MF) const override
Execute target specific actions to finalize target lowering.
static bool isNonGlobalAddrSpace(unsigned AS)
void emitExpandAtomicAddrSpacePredicate(Instruction *AI) const
MachineSDNode * buildRSRC(SelectionDAG &DAG, const SDLoc &DL, SDValue Ptr, uint32_t RsrcDword1, uint64_t RsrcDword2And3) const
Return a resource descriptor with the 'Add TID' bit enabled. The TID (Thread ID) is multiplied by the ...
unsigned getNumRegistersForCallingConv(LLVMContext &Context, CallingConv::ID CC, EVT VT) const override
Certain targets require unusual breakdowns of certain types.
bool mayBeEmittedAsTailCall(const CallInst *) const override
Return true if the target may be able to emit the call instruction as a tail call.
void passSpecialInputs(CallLoweringInfo &CLI, CCState &CCInfo, const SIMachineFunctionInfo &Info, SmallVectorImpl< std::pair< unsigned, SDValue > > &RegsToPass, SmallVectorImpl< SDValue > &MemOpChains, SDValue Chain) const
SDValue LowerOperation(SDValue Op, SelectionDAG &DAG) const override
This callback is invoked for operations that are unsupported by the target, which are registered to u...
bool checkAsmConstraintVal(SDValue Op, StringRef Constraint, uint64_t Val) const
bool isKnownNeverNaNForTargetNode(SDValue Op, const APInt &DemandedElts, const SelectionDAG &DAG, bool SNaN=false, unsigned Depth=0) const override
If SNaN is false, returns true if Op is known to never be any NaN; if SNaN is true, returns true if Op is known to never be a signaling NaN.
void emitExpandAtomicRMW(AtomicRMWInst *AI) const override
Perform an atomicrmw expansion in a target-specific way.
static bool shouldExpandVectorDynExt(unsigned EltSize, unsigned NumElem, bool IsDivergentIdx, const GCNSubtarget *Subtarget)
Check if EXTRACT_VECTOR_ELT/INSERT_VECTOR_ELT (<n x e>, var-idx) should be expanded into a set of cmp...
bool shouldUseLDSConstAddress(const GlobalValue *GV) const
bool supportSplitCSR(MachineFunction *MF) const override
Return true if the target supports that a subset of CSRs for the given machine function is handled ex...
bool isExtractVecEltCheap(EVT VT, unsigned Index) const override
Return true if extraction of a scalar element from the given vector type at the given index is cheap.
SDValue LowerDYNAMIC_STACKALLOC(SDValue Op, SelectionDAG &DAG) const
bool allowsMisalignedMemoryAccesses(LLT Ty, unsigned AddrSpace, Align Alignment, MachineMemOperand::Flags Flags=MachineMemOperand::MONone, unsigned *IsFast=nullptr) const override
LLT handling variant.
bool isExtractSubvectorCheap(EVT ResVT, EVT SrcVT, unsigned Index) const override
Return true if EXTRACT_SUBVECTOR is cheap for extracting this result type from this source type with ...
bool canMergeStoresTo(unsigned AS, EVT MemVT, const MachineFunction &MF) const override
Returns true if it is reasonable to merge stores to MemVT size.
SDValue lowerPREFETCH(SDValue Op, SelectionDAG &DAG) const
SITargetLowering(const TargetMachine &tm, const GCNSubtarget &STI)
void computeKnownBitsForTargetInstr(GISelValueTracking &Analysis, Register R, KnownBits &Known, const APInt &DemandedElts, const MachineRegisterInfo &MRI, unsigned Depth=0) const override
Determine which of the bits specified in Mask are known to be either zero or one and return them in t...
bool isFreeAddrSpaceCast(unsigned SrcAS, unsigned DestAS) const override
Returns true if a cast from SrcAS to DestAS is "cheap", such that e.g.
bool shouldEmitPCReloc(const GlobalValue *GV) const
AtomicExpansionKind shouldExpandAtomicCmpXchgInIR(const AtomicCmpXchgInst *AI) const override
Returns how the given atomic cmpxchg should be expanded by the IR-level AtomicExpand pass.
void initializeSplitCSR(MachineBasicBlock *Entry) const override
Perform necessary initialization to handle a subset of CSRs explicitly via copies.
void allocateSpecialEntryInputVGPRs(CCState &CCInfo, MachineFunction &MF, const SIRegisterInfo &TRI, SIMachineFunctionInfo &Info) const
void allocatePreloadKernArgSGPRs(CCState &CCInfo, SmallVectorImpl< CCValAssign > &ArgLocs, const SmallVectorImpl< ISD::InputArg > &Ins, MachineFunction &MF, const SIRegisterInfo &TRI, SIMachineFunctionInfo &Info) const
SDValue copyToM0(SelectionDAG &DAG, SDValue Chain, const SDLoc &DL, SDValue V) const
SDValue splitBinaryVectorOp(SDValue Op, SelectionDAG &DAG) const
MachinePointerInfo getKernargSegmentPtrInfo(MachineFunction &MF) const
unsigned getVectorTypeBreakdownForCallingConv(LLVMContext &Context, CallingConv::ID CC, EVT VT, EVT &IntermediateVT, unsigned &NumIntermediates, MVT &RegisterVT) const override
Certain targets such as MIPS require that some types such as vectors are always broken down into scal...
MVT getPointerMemTy(const DataLayout &DL, unsigned AS) const override
Similarly, the in-memory representation of a p7 is {p8, i32}, aka v8i32 when padding is added.
void allocateSystemSGPRs(CCState &CCInfo, MachineFunction &MF, SIMachineFunctionInfo &Info, CallingConv::ID CallConv, bool IsShader) const
bool CanLowerReturn(CallingConv::ID CallConv, MachineFunction &MF, bool isVarArg, const SmallVectorImpl< ISD::OutputArg > &Outs, LLVMContext &Context, const Type *RetTy) const override
This hook should be implemented to check whether the return values described by the Outs array can fi...
unsigned getMaxPermittedBytesForAlignment(MachineBasicBlock *MBB) const override
Return the maximum amount of bytes allowed to be emitted when padding for alignment.
This is used to represent a portion of an LLVM function in a low-level Data Dependence DAG representa...
LLVM_ABI SDValue getExtLoad(ISD::LoadExtType ExtType, const SDLoc &dl, EVT VT, SDValue Chain, SDValue Ptr, MachinePointerInfo PtrInfo, EVT MemVT, MaybeAlign Alignment=MaybeAlign(), MachineMemOperand::Flags MMOFlags=MachineMemOperand::MONone, const AAMDNodes &AAInfo=AAMDNodes())
SDValue getTargetGlobalAddress(const GlobalValue *GV, const SDLoc &DL, EVT VT, int64_t offset=0, unsigned TargetFlags=0)
SDValue getExtOrTrunc(SDValue Op, const SDLoc &DL, EVT VT, unsigned Opcode)
Convert Op, which must be of integer type, to the integer type VT, by either any/sign/zero-extending ...
SDValue getExtractVectorElt(const SDLoc &DL, EVT VT, SDValue Vec, unsigned Idx)
Extract element at Idx from Vec.
const SDValue & getRoot() const
Return the root tag of the SelectionDAG.
LLVM_ABI SDValue getAddrSpaceCast(const SDLoc &dl, EVT VT, SDValue Ptr, unsigned SrcAS, unsigned DestAS)
Return an AddrSpaceCastSDNode.
bool isKnownNeverSNaN(SDValue Op, const APInt &DemandedElts, unsigned Depth=0) const
const TargetSubtargetInfo & getSubtarget() const
SDValue getCopyToReg(SDValue Chain, const SDLoc &dl, Register Reg, SDValue N)
LLVM_ABI SDValue getMergeValues(ArrayRef< SDValue > Ops, const SDLoc &dl)
Create a MERGE_VALUES node from the given operands.
LLVM_ABI SDVTList getVTList(EVT VT)
Return an SDVTList that represents the list of values specified.
LLVM_ABI SDValue getShiftAmountConstant(uint64_t Val, EVT VT, const SDLoc &DL)
LLVM_ABI SDValue getAllOnesConstant(const SDLoc &DL, EVT VT, bool IsTarget=false, bool IsOpaque=false)
LLVM_ABI MachineSDNode * getMachineNode(unsigned Opcode, const SDLoc &dl, EVT VT)
These are used for target selectors to create a new node with specified return type(s),...
LLVM_ABI void ExtractVectorElements(SDValue Op, SmallVectorImpl< SDValue > &Args, unsigned Start=0, unsigned Count=0, EVT EltVT=EVT())
Append the extracted elements from Start to Count out of the vector Op in Args.
LLVM_ABI SDValue getAtomicLoad(ISD::LoadExtType ExtType, const SDLoc &dl, EVT MemVT, EVT VT, SDValue Chain, SDValue Ptr, MachineMemOperand *MMO)
LLVM_ABI SDValue getFreeze(SDValue V)
Return a freeze using the SDLoc of the value operand.
LLVM_ABI bool isConstantIntBuildVectorOrConstantInt(SDValue N, bool AllowOpaques=true) const
Test whether the given value is a constant int or similar node.
LLVM_ABI SDValue UnrollVectorOp(SDNode *N, unsigned ResNE=0)
Utility function used by legalize and lowering to "unroll" a vector operation by splitting out the sc...
LLVM_ABI SDValue getConstantFP(double Val, const SDLoc &DL, EVT VT, bool isTarget=false)
Create a ConstantFPSDNode wrapping a constant value.
LLVM_ABI bool haveNoCommonBitsSet(SDValue A, SDValue B) const
Return true if A and B have no common bits set.
LLVM_ABI SDValue getRegister(Register Reg, EVT VT)
LLVM_ABI SDValue getLoad(EVT VT, const SDLoc &dl, SDValue Chain, SDValue Ptr, MachinePointerInfo PtrInfo, MaybeAlign Alignment=MaybeAlign(), MachineMemOperand::Flags MMOFlags=MachineMemOperand::MONone, const AAMDNodes &AAInfo=AAMDNodes(), const MDNode *Ranges=nullptr)
Loads are not normal binary operators: their result type is not determined by their operands,...
LLVM_ABI bool SignBitIsZeroFP(SDValue Op, unsigned Depth=0) const
Return true if the sign bit of Op is known to be zero, for a floating-point value.
LLVM_ABI SDValue getMemIntrinsicNode(unsigned Opcode, const SDLoc &dl, SDVTList VTList, ArrayRef< SDValue > Ops, EVT MemVT, MachinePointerInfo PtrInfo, Align Alignment, MachineMemOperand::Flags Flags=MachineMemOperand::MOLoad|MachineMemOperand::MOStore, LocationSize Size=LocationSize::precise(0), const AAMDNodes &AAInfo=AAMDNodes())
Creates a MemIntrinsicNode that may produce a result and takes a list of operands.
SDValue getSetCC(const SDLoc &DL, EVT VT, SDValue LHS, SDValue RHS, ISD::CondCode Cond, SDValue Chain=SDValue(), bool IsSignaling=false, SDNodeFlags Flags={})
Helper function to make it easier to build SetCC's if you just have an ISD::CondCode instead of an SD...
LLVM_ABI SDValue getAtomic(unsigned Opcode, const SDLoc &dl, EVT MemVT, SDValue Chain, SDValue Ptr, SDValue Val, MachineMemOperand *MMO)
Gets a node for an atomic op, produces result (if relevant) and chain and takes 2 operands.
LLVM_ABI SDValue getMemcpy(SDValue Chain, const SDLoc &dl, SDValue Dst, SDValue Src, SDValue Size, Align Alignment, bool isVol, bool AlwaysInline, const CallInst *CI, std::optional< bool > OverrideTailCall, MachinePointerInfo DstPtrInfo, MachinePointerInfo SrcPtrInfo, const AAMDNodes &AAInfo=AAMDNodes(), BatchAAResults *BatchAA=nullptr)
std::pair< SDValue, SDValue > SplitVectorOperand(const SDNode *N, unsigned OpNo)
Split the node's operand with EXTRACT_SUBVECTOR and return the low/high part.
LLVM_ABI SDValue getNOT(const SDLoc &DL, SDValue Val, EVT VT)
Create a bitwise NOT operation as (XOR Val, -1).
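A minimal sketch of how a lowering or combine routine might use getNOT; the helper name buildBitwiseNot and its surrounding context are made up for illustration, not taken from this file.
#include "llvm/CodeGen/SelectionDAG.h"
using namespace llvm;
// Hypothetical helper: build ~V during custom lowering. getNOT simply
// emits (XOR V, AllOnesConstant) in V's value type.
static SDValue buildBitwiseNot(SelectionDAG &DAG, const SDLoc &DL, SDValue V) {
  return DAG.getNOT(DL, V, V.getValueType());
}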
const TargetLowering & getTargetLoweringInfo() const
LLVM_ABI std::pair< EVT, EVT > GetSplitDestVTs(const EVT &VT) const
Compute the VTs needed for the low/hi parts of a type which is split (or expanded) into two not neces...
SDValue getUNDEF(EVT VT)
Return an UNDEF node. UNDEF does not have a useful SDLoc.
SDValue getCALLSEQ_END(SDValue Chain, SDValue Op1, SDValue Op2, SDValue InGlue, const SDLoc &DL)
Return a new CALLSEQ_END node, which always must have a glue result (to ensure it's not CSE'd).
SDValue getBuildVector(EVT VT, const SDLoc &DL, ArrayRef< SDValue > Ops)
Return an ISD::BUILD_VECTOR node.
LLVM_ABI SDValue getBitcastedAnyExtOrTrunc(SDValue Op, const SDLoc &DL, EVT VT)
Convert Op, which must be of integer type, to the integer type VT, by first bitcasting (from potentia...
LLVM_ABI SDValue getBitcast(EVT VT, SDValue V)
Return a bitcast using the SDLoc of the value operand, and casting to the provided type.
SDValue getCopyFromReg(SDValue Chain, const SDLoc &dl, Register Reg, EVT VT)
SDValue getSelect(const SDLoc &DL, EVT VT, SDValue Cond, SDValue LHS, SDValue RHS, SDNodeFlags Flags=SDNodeFlags())
Helper function to make it easier to build Select's if you just have operands and don't want to check...
LLVM_ABI void setNodeMemRefs(MachineSDNode *N, ArrayRef< MachineMemOperand * > NewMemRefs)
Mutate the specified machine node's memory references to the provided list.
LLVM_ABI SDValue getZeroExtendInReg(SDValue Op, const SDLoc &DL, EVT VT)
Return the expression required to zero extend the Op value assuming it was the smaller SrcTy value.
const DataLayout & getDataLayout() const
LLVM_ABI SDValue getTokenFactor(const SDLoc &DL, SmallVectorImpl< SDValue > &Vals)
Creates a new TokenFactor containing Vals.
LLVM_ABI SDValue getConstant(uint64_t Val, const SDLoc &DL, EVT VT, bool isTarget=false, bool isOpaque=false)
Create a ConstantSDNode wrapping a constant value.
LLVM_ABI SDValue getMemBasePlusOffset(SDValue Base, TypeSize Offset, const SDLoc &DL, const SDNodeFlags Flags=SDNodeFlags())
Returns sum of the base pointer and offset.
SDValue getSignedTargetConstant(int64_t Val, const SDLoc &DL, EVT VT, bool isOpaque=false)
LLVM_ABI SDValue getTruncStore(SDValue Chain, const SDLoc &dl, SDValue Val, SDValue Ptr, MachinePointerInfo PtrInfo, EVT SVT, Align Alignment, MachineMemOperand::Flags MMOFlags=MachineMemOperand::MONone, const AAMDNodes &AAInfo=AAMDNodes())
LLVM_ABI void ReplaceAllUsesWith(SDValue From, SDValue To)
Modify anything using 'From' to use 'To' instead.
LLVM_ABI SDValue getStore(SDValue Chain, const SDLoc &dl, SDValue Val, SDValue Ptr, MachinePointerInfo PtrInfo, Align Alignment, MachineMemOperand::Flags MMOFlags=MachineMemOperand::MONone, const AAMDNodes &AAInfo=AAMDNodes())
Helper function to build ISD::STORE nodes.
LLVM_ABI SDValue getSignedConstant(int64_t Val, const SDLoc &DL, EVT VT, bool isTarget=false, bool isOpaque=false)
SDValue getCALLSEQ_START(SDValue Chain, uint64_t InSize, uint64_t OutSize, const SDLoc &DL)
Return a new CALLSEQ_START node, that starts new call frame, in which InSize bytes are set up inside ...
LLVM_ABI void RemoveDeadNode(SDNode *N)
Remove the specified node from the system.
LLVM_ABI SDValue getTargetExtractSubreg(int SRIdx, const SDLoc &DL, EVT VT, SDValue Operand)
A convenience function for creating TargetInstrInfo::EXTRACT_SUBREG nodes.
SDValue getSelectCC(const SDLoc &DL, SDValue LHS, SDValue RHS, SDValue True, SDValue False, ISD::CondCode Cond, SDNodeFlags Flags=SDNodeFlags())
Helper function to make it easier to build SelectCC's if you just have an ISD::CondCode instead of an...
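A minimal usage sketch for getSelectCC, assuming DAG, DL and two same-typed integer operands come from the caller; the helper name buildSMax is hypothetical.
#include "llvm/CodeGen/SelectionDAG.h"
using namespace llvm;
// Hypothetical helper: signed max(A, B) expressed as a single SELECT_CC node,
// without materializing a separate SETCC first.
static SDValue buildSMax(SelectionDAG &DAG, const SDLoc &DL, SDValue A, SDValue B) {
  return DAG.getSelectCC(DL, A, B, /*True=*/A, /*False=*/B, ISD::SETGT);
}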
LLVM_ABI SDValue getSExtOrTrunc(SDValue Op, const SDLoc &DL, EVT VT)
Convert Op, which must be of integer type, to the integer type VT, by either sign-extending or trunca...
const TargetMachine & getTarget() const
LLVM_ABI SDValue getAnyExtOrTrunc(SDValue Op, const SDLoc &DL, EVT VT)
Convert Op, which must be of integer type, to the integer type VT, by either any-extending or truncat...
LLVM_ABI SDValue getValueType(EVT)
LLVM_ABI SDValue getNode(unsigned Opcode, const SDLoc &DL, EVT VT, ArrayRef< SDUse > Ops)
Gets or creates the specified node.
LLVM_ABI bool isKnownNeverNaN(SDValue Op, const APInt &DemandedElts, bool SNaN=false, unsigned Depth=0) const
Test whether the given SDValue (or all elements of it, if it is a vector) is known to never be NaN in...
SDValue getTargetConstant(uint64_t Val, const SDLoc &DL, EVT VT, bool isOpaque=false)
LLVM_ABI unsigned ComputeNumSignBits(SDValue Op, unsigned Depth=0) const
Return the number of times the sign bit of the register is replicated into the other bits.
LLVM_ABI bool isBaseWithConstantOffset(SDValue Op) const
Return true if the specified operand is an ISD::ADD with a ConstantSDNode on the right-hand side,...
LLVM_ABI SDValue getVectorIdxConstant(uint64_t Val, const SDLoc &DL, bool isTarget=false)
LLVM_ABI void ReplaceAllUsesOfValueWith(SDValue From, SDValue To)
Replace any uses of From with To, leaving uses of other values produced by From.getNode() alone.
MachineFunction & getMachineFunction() const
SDValue getPOISON(EVT VT)
Return a POISON node. POISON does not have a useful SDLoc.
SDValue getSplatBuildVector(EVT VT, const SDLoc &DL, SDValue Op)
Return a splat ISD::BUILD_VECTOR node, consisting of Op splatted to all elements.
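A minimal sketch of splatting an immediate with getSplatBuildVector; the helper name buildSplatImm is hypothetical and assumes a fixed-width integer vector VT.
#include "llvm/CodeGen/SelectionDAG.h"
using namespace llvm;
// Hypothetical helper: splat Imm across all lanes of VT (e.g. v4i32) as a
// BUILD_VECTOR whose operands are all the same constant.
static SDValue buildSplatImm(SelectionDAG &DAG, const SDLoc &DL, EVT VT,
                             uint64_t Imm) {
  SDValue Elt = DAG.getConstant(Imm, DL, VT.getVectorElementType());
  return DAG.getSplatBuildVector(VT, DL, Elt);
}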
LLVM_ABI SDValue getFrameIndex(int FI, EVT VT, bool isTarget=false)
LLVM_ABI KnownBits computeKnownBits(SDValue Op, unsigned Depth=0) const
Determine which bits of Op are known to be either zero or one and return them in Known.
LLVM_ABI SDValue getRegisterMask(const uint32_t *RegMask)
LLVM_ABI SDValue getZExtOrTrunc(SDValue Op, const SDLoc &DL, EVT VT)
Convert Op, which must be of integer type, to the integer type VT, by either zero-extending or trunca...
LLVM_ABI SDValue getCondCode(ISD::CondCode Cond)
LLVM_ABI bool MaskedValueIsZero(SDValue Op, const APInt &Mask, unsigned Depth=0) const
Return true if 'Op & Mask' is known to be zero.
SDValue getObjectPtrOffset(const SDLoc &SL, SDValue Ptr, TypeSize Offset)
Create an add instruction with appropriate flags when used for addressing some offset of an object.
LLVMContext * getContext() const
const SDValue & setRoot(SDValue N)
Set the current root tag of the SelectionDAG.
LLVM_ABI SDNode * UpdateNodeOperands(SDNode *N, SDValue Op)
Mutate the specified node in-place to have the specified operands.
SDValue getEntryNode() const
Return the token chain corresponding to the entry of the function.
LLVM_ABI std::pair< SDValue, SDValue > SplitScalar(const SDValue &N, const SDLoc &DL, const EVT &LoVT, const EVT &HiVT)
Split the scalar node with EXTRACT_ELEMENT using the provided VTs and return the low/high part.
LLVM_ABI SDValue getVectorShuffle(EVT VT, const SDLoc &dl, SDValue N1, SDValue N2, ArrayRef< int > Mask)
Return an ISD::VECTOR_SHUFFLE node.
int getMaskElt(unsigned Idx) const
ArrayRef< int > getMask() const
std::pair< iterator, bool > insert(PtrType Ptr)
Inserts Ptr if and only if there is no element in the container equal to Ptr.
SmallPtrSet - This class implements a set which is optimized for holding SmallSize or less elements.
size_type count(const T &V) const
count - Return 1 if the element is in the set, 0 otherwise.
Definition SmallSet.h:176
bool empty() const
Definition SmallSet.h:169
std::pair< const_iterator, bool > insert(const T &V)
insert - Insert an element into the set if it isn't already there.
Definition SmallSet.h:184
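A minimal sketch of the common visited-set idiom built on SmallPtrSet::insert; the helper name markVisited is hypothetical.
#include "llvm/ADT/SmallPtrSet.h"
#include "llvm/CodeGen/SelectionDAGNodes.h"
using namespace llvm;
// Hypothetical helper: insert().second is true only on first insertion, which
// makes SmallPtrSet convenient for guarding against revisiting DAG nodes.
static bool markVisited(SmallPtrSet<const SDNode *, 16> &Visited,
                        const SDNode *N) {
  return Visited.insert(N).second; // true => N had not been seen before
}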
This class consists of common code factored out of the SmallVector class to reduce code duplication b...
void append(ItTy in_start, ItTy in_end)
Add the specified range to the end of the SmallVector.
void resize(size_type N)
void push_back(const T &Elt)
This is a 'vector' (really, a variable-sized array), optimized for the case when the array is small.
An instruction for storing to memory.
Represent a constant reference to a string, i.e.
Definition StringRef.h:56
constexpr bool empty() const
Check if the string is empty.
Definition StringRef.h:141
constexpr size_t size() const
Get the string size.
Definition StringRef.h:144
A switch()-like statement whose cases are string literals.
StringSwitch & Case(StringLiteral S, T Value)
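A minimal StringSwitch usage sketch; the function name and the string cases are made up for illustration.
#include "llvm/ADT/StringRef.h"
#include "llvm/ADT/StringSwitch.h"
using namespace llvm;
// Hypothetical helper: map a name to a small code, with 0 as the fallback.
static unsigned classifyName(StringRef S) {
  return StringSwitch<unsigned>(S)
      .Case("sgpr", 1)
      .Case("vgpr", 2)
      .Default(0);
}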
Information about stack frame layout on the target.
Align getStackAlign() const
getStackAlignment - This method returns the number of bytes to which the stack pointer must be aligne...
StackDirection getStackGrowthDirection() const
getStackGrowthDirection - Return the direction the stack grows.
TargetInstrInfo - Interface to description of machine instruction set.
Type * Ty
Same as OrigTy, or partially legalized for soft float libcalls.
void setBooleanVectorContents(BooleanContent Ty)
Specify how the target extends the result of a vector boolean value from a vector of i1 to a wider ty...
void setOperationAction(unsigned Op, MVT VT, LegalizeAction Action)
Indicate that the specified operation does not work with the specified type and indicate what to do a...
virtual void finalizeLowering(MachineFunction &MF) const
Execute target specific actions to finalize target lowering.
EVT getValueType(const DataLayout &DL, Type *Ty, bool AllowUnknown=false) const
Return the EVT corresponding to this LLVM type.
virtual const TargetRegisterClass * getRegClassFor(MVT VT, bool isDivergent=false) const
Return the register class that should be used for the specified value type.
virtual unsigned getMaxPermittedBytesForAlignment(MachineBasicBlock *MBB) const
Return the maximum amount of bytes allowed to be emitted when padding for alignment.
const TargetMachine & getTargetMachine() const
virtual unsigned getNumRegistersForCallingConv(LLVMContext &Context, CallingConv::ID CC, EVT VT) const
Certain targets require unusual breakdowns of certain types.
virtual MVT getRegisterTypeForCallingConv(LLVMContext &Context, CallingConv::ID CC, EVT VT) const
Certain combinations of ABIs, Targets and features require that types are legal for some operations a...
LegalizeTypeAction
This enum indicates whether a type is legal for a target, and if not, what action should be used to...
void setHasExtractBitsInsn(bool hasExtractInsn=true)
Tells the code generator that the target has BitExtract instructions.
virtual TargetLoweringBase::LegalizeTypeAction getPreferredVectorAction(MVT VT) const
Return the preferred vector type legalization action.
virtual unsigned getVectorTypeBreakdownForCallingConv(LLVMContext &Context, CallingConv::ID CC, EVT VT, EVT &IntermediateVT, unsigned &NumIntermediates, MVT &RegisterVT) const
Certain targets such as MIPS require that some types such as vectors are always broken down into scal...
Register getStackPointerRegisterToSaveRestore() const
If a physical register, this specifies the register that llvm.savestack/llvm.restorestack should save...
void setMinFunctionAlignment(Align Alignment)
Set the target's minimum function alignment.
void setBooleanContents(BooleanContent Ty)
Specify how the target extends the result of integer and floating point boolean values from i1 to a w...
virtual Align getPrefLoopAlignment(MachineLoop *ML=nullptr) const
Return the preferred loop alignment.
void computeRegisterProperties(const TargetRegisterInfo *TRI)
Once all of the register classes are added, this allows us to compute derived properties we expose.
void addRegisterClass(MVT VT, const TargetRegisterClass *RC)
Add the specified register class as an available regclass for the specified value type.
bool isTypeLegal(EVT VT) const
Return true if the target has native support for the specified value type.
virtual MVT getPointerTy(const DataLayout &DL, uint32_t AS=0) const
Return the pointer type for the given address space, defaults to the pointer type from the data layou...
void setPrefFunctionAlignment(Align Alignment)
Set the target's preferred function alignment.
bool isOperationLegal(unsigned Op, EVT VT) const
Return true if the specified operation is legal on this target.
void setTruncStoreAction(MVT ValVT, MVT MemVT, LegalizeAction Action)
Indicate that the specified truncating store does not work with the specified type and indicate what ...
bool isOperationLegalOrCustom(unsigned Op, EVT VT, bool LegalOnly=false) const
Return true if the specified operation is legal on this target or can be made legal with custom lower...
virtual bool isNarrowingProfitable(SDNode *N, EVT SrcVT, EVT DestVT) const
Return true if it's profitable to narrow operations of type SrcVT to DestVT.
void setStackPointerRegisterToSaveRestore(Register R)
If set to a physical register, this specifies the register that llvm.savestack/llvm....
void AddPromotedToType(unsigned Opc, MVT OrigVT, MVT DestVT)
If Opc/OrigVT is specified as being promoted, the promotion code defaults to trying a larger integer/...
AtomicExpansionKind
Enum that specifies what an atomic load/AtomicRMWInst is expanded to, if at all.
void setTargetDAGCombine(ArrayRef< ISD::NodeType > NTs)
Targets should invoke this method for each target independent node that they want to provide a custom...
bool allowsMemoryAccessForAlignment(LLVMContext &Context, const DataLayout &DL, EVT VT, unsigned AddrSpace=0, Align Alignment=Align(1), MachineMemOperand::Flags Flags=MachineMemOperand::MONone, unsigned *Fast=nullptr) const
This function returns true if the memory access is aligned or if the target allows this specific unal...
virtual MVT getPointerMemTy(const DataLayout &DL, uint32_t AS=0) const
Return the in-memory pointer type for the given address space, defaults to the pointer type from the ...
void setSchedulingPreference(Sched::Preference Pref)
Specify the target scheduling preference.
virtual void computeKnownBitsForFrameIndex(int FIOp, KnownBits &Known, const MachineFunction &MF) const
Determine which of the bits of FrameIndex FIOp are known to be 0.
SDValue scalarizeVectorStore(StoreSDNode *ST, SelectionDAG &DAG) const
std::vector< AsmOperandInfo > AsmOperandInfoVector
SDValue SimplifyMultipleUseDemandedBits(SDValue Op, const APInt &DemandedBits, const APInt &DemandedElts, SelectionDAG &DAG, unsigned Depth=0) const
More limited version of SimplifyDemandedBits that can be used to "lookthrough" ops that don't contrib...
SDValue expandUnalignedStore(StoreSDNode *ST, SelectionDAG &DAG) const
Expands an unaligned store to 2 half-size stores for integer values, and possibly more for vectors.
virtual ConstraintType getConstraintType(StringRef Constraint) const
Given a constraint, return the type of constraint it is for this target.
bool parametersInCSRMatch(const MachineRegisterInfo &MRI, const uint32_t *CallerPreservedMask, const SmallVectorImpl< CCValAssign > &ArgLocs, const SmallVectorImpl< SDValue > &OutVals) const
Check whether parameters to a call that are passed in callee saved registers are the same as from the...
std::pair< SDValue, SDValue > expandUnalignedLoad(LoadSDNode *LD, SelectionDAG &DAG) const
Expands an unaligned load to 2 half-size loads for an integer, and possibly more for vectors.
SDValue expandFMINIMUMNUM_FMAXIMUMNUM(SDNode *N, SelectionDAG &DAG) const
Expand fminimumnum/fmaximumnum into multiple comparisons with selects.
virtual bool isTypeDesirableForOp(unsigned, EVT VT) const
Return true if the target has native support for the specified value type and it is 'desirable' to us...
std::pair< SDValue, SDValue > scalarizeVectorLoad(LoadSDNode *LD, SelectionDAG &DAG) const
Turn load of vector type into a load of the individual elements.
virtual std::pair< unsigned, const TargetRegisterClass * > getRegForInlineAsmConstraint(const TargetRegisterInfo *TRI, StringRef Constraint, MVT VT) const
Given a physical register constraint (e.g.
bool SimplifyDemandedBits(SDValue Op, const APInt &DemandedBits, const APInt &DemandedElts, KnownBits &Known, TargetLoweringOpt &TLO, unsigned Depth=0, bool AssumeSingleUse=false) const
Look at Op.
TargetLowering(const TargetLowering &)=delete
virtual MachineBasicBlock * EmitInstrWithCustomInserter(MachineInstr &MI, MachineBasicBlock *MBB) const
This method should be implemented by targets that mark instructions with the 'usesCustomInserter' fla...
virtual AsmOperandInfoVector ParseConstraints(const DataLayout &DL, const TargetRegisterInfo *TRI, const CallBase &Call) const
Split up the constraint string from the inline assembly value into the specific constraints and their...
SDValue expandRoundInexactToOdd(EVT ResultVT, SDValue Op, const SDLoc &DL, SelectionDAG &DAG) const
Truncate Op to ResultVT.
virtual void ComputeConstraintToUse(AsmOperandInfo &OpInfo, SDValue Op, SelectionDAG *DAG=nullptr) const
Determines the constraint code and constraint type to use for the specific AsmOperandInfo,...
virtual void LowerAsmOperandForConstraint(SDValue Op, StringRef Constraint, std::vector< SDValue > &Ops, SelectionDAG &DAG) const
Lower the specified operand into the Ops vector.
SDValue expandFMINNUM_FMAXNUM(SDNode *N, SelectionDAG &DAG) const
Expand fminnum/fmaxnum into fminnum_ieee/fmaxnum_ieee with quieted inputs.
Primary interface to the complete machine description for the target machine.
CodeGenOptLevel getOptLevel() const
Returns the optimization level: None, Less, Default, or Aggressive.
const Triple & getTargetTriple() const
bool shouldAssumeDSOLocal(const GlobalValue *GV) const
TargetOptions Options
unsigned GuaranteedTailCallOpt
GuaranteedTailCallOpt - This flag is enabled when -tailcallopt is specified on the command line.
unsigned getNumRegs() const
Return the number of registers in this class.
unsigned getID() const
Return the register class ID number.
MCRegister getRegister(unsigned i) const
Return the specified register in the class.
iterator begin() const
begin/end - Return all of the registers in this class.
TargetRegisterInfo base class - We assume that the target defines a static array of TargetRegisterDes...
Target - Wrapper for Target specific information.
Triple - Helper class for working with autoconf configuration names.
Definition Triple.h:47
OSType getOS() const
Get the parsed operating system type of this triple.
Definition Triple.h:445
Twine - A lightweight data structure for efficiently representing the concatenation of temporary valu...
Definition Twine.h:82
static constexpr TypeSize getFixed(ScalarTy ExactSize)
Definition TypeSize.h:343
The instances of the Type class are immutable: once they are created, they are never changed.
Definition Type.h:46
static LLVM_ABI IntegerType * getInt32Ty(LLVMContext &C)
Definition Type.cpp:313
bool isBFloatTy() const
Return true if this is 'bfloat', a 16-bit bfloat type.
Definition Type.h:147
LLVM_ABI unsigned getPointerAddressSpace() const
Get the address space of this pointer or pointer vector type.
Type * getScalarType() const
If this is a vector type, return the element type, otherwise return 'this'.
Definition Type.h:370
bool isHalfTy() const
Return true if this is 'half', a 16-bit IEEE fp type.
Definition Type.h:144
bool isFunctionTy() const
True if this is an instance of FunctionType.
Definition Type.h:275
bool isIntegerTy() const
True if this is an instance of IntegerType.
Definition Type.h:257
LLVM_ABI const fltSemantics & getFltSemantics() const
Definition Type.cpp:110
bool isVoidTy() const
Return true if this is 'void'.
Definition Type.h:141
A Use represents the edge between a Value definition and its users.
Definition Use.h:35
LLVM_ABI unsigned getOperandNo() const
Return the operand # of this use in its User.
Definition Use.cpp:35
LLVM_ABI void set(Value *Val)
Definition Value.h:883
User * getUser() const
Returns the User that contains this Use.
Definition Use.h:61
const Use & getOperandUse(unsigned i) const
Definition User.h:220
Value * getOperand(unsigned i) const
Definition User.h:207
LLVM Value Representation.
Definition Value.h:75
Type * getType() const
All values are typed, get the type of this value.
Definition Value.h:255
bool hasOneUse() const
Return true if there is exactly one use of this value.
Definition Value.h:439
LLVM_ABI void replaceAllUsesWith(Value *V)
Change all uses of this to point to a new Value.
Definition Value.cpp:549
LLVMContext & getContext() const
All values hold a context through their type.
Definition Value.h:258
iterator_range< user_iterator > users()
Definition Value.h:426
bool use_empty() const
Definition Value.h:346
iterator_range< use_iterator > uses()
Definition Value.h:380
LLVM_ABI void takeName(Value *V)
Transfer the name from V to this value.
Definition Value.cpp:399
Type * getElementType() const
constexpr ScalarTy getFixedValue() const
Definition TypeSize.h:200
self_iterator getIterator()
Definition ilist_node.h:123
CallInst * Call
#define llvm_unreachable(msg)
Marks that the current location is not supposed to be reachable.
@ CONSTANT_ADDRESS_32BIT
Address space for 32-bit constant memory.
@ BUFFER_STRIDED_POINTER
Address space for 192-bit fat buffer pointers with an additional index.
@ REGION_ADDRESS
Address space for region memory. (GDS)
@ LOCAL_ADDRESS
Address space for local memory.
@ STREAMOUT_REGISTER
Internal address spaces. Can be freely renumbered.
@ CONSTANT_ADDRESS
Address space for constant memory (VTX2).
@ FLAT_ADDRESS
Address space for flat memory.
@ GLOBAL_ADDRESS
Address space for global memory (RAT0, VTX0).
@ BUFFER_FAT_POINTER
Address space for 160-bit buffer fat pointers.
@ PRIVATE_ADDRESS
Address space for private memory.
@ BUFFER_RESOURCE
Address space for 128-bit buffer resources.
constexpr char Align[]
Key for Kernel::Arg::Metadata::mAlign.
constexpr char NumVGPRs[]
Key for Kernel::CodeProps::Metadata::mNumVGPRs.
constexpr char Args[]
Key for Kernel::Metadata::mArgs.
constexpr char SymbolName[]
Key for Kernel::Metadata::mSymbolName.
bool isInlinableLiteralBF16(int16_t Literal, bool HasInv2Pi)
LLVM_READONLY const MIMGG16MappingInfo * getMIMGG16MappingInfo(unsigned G)
bool isInlinableLiteralFP16(int16_t Literal, bool HasInv2Pi)
int getMIMGOpcode(unsigned BaseOpcode, unsigned MIMGEncoding, unsigned VDataDwords, unsigned VAddrDwords)
LLVM_READNONE constexpr bool isShader(CallingConv::ID CC)
bool shouldEmitConstantsToTextSection(const Triple &TT)
bool isFlatGlobalAddrSpace(unsigned AS)
const uint64_t FltRoundToHWConversionTable
bool isGFX12Plus(const MCSubtargetInfo &STI)
unsigned getNSAMaxSize(const MCSubtargetInfo &STI, bool HasSampler)
constexpr int64_t getNullPointerValue(unsigned AS)
Get the null pointer value for the given address space.
bool isGFX11(const MCSubtargetInfo &STI)
bool isGFX13(const MCSubtargetInfo &STI)
bool hasValueInRangeLikeMetadata(const MDNode &MD, int64_t Val)
Checks if Val is inside MD, a !range-like metadata.
LLVM_READNONE bool isLegalDPALU_DPPControl(const MCSubtargetInfo &ST, unsigned DC)
LLVM_READNONE constexpr bool mayTailCallThisCC(CallingConv::ID CC)
Return true if we might ever do TCO for calls with this calling convention.
unsigned getAMDHSACodeObjectVersion(const Module &M)
LLVM_READONLY bool hasNamedOperand(uint64_t Opcode, OpName NamedIdx)
LLVM_READNONE constexpr bool isKernel(CallingConv::ID CC)
LLVM_READNONE constexpr bool isEntryFunctionCC(CallingConv::ID CC)
bool isInlinableLiteral32(int32_t Literal, bool HasInv2Pi)
LLVM_READNONE constexpr bool isCompute(CallingConv::ID CC)
bool isIntrinsicSourceOfDivergence(unsigned IntrID)
LLVM_READNONE bool isInlinableIntLiteral(int64_t Literal)
Is this literal inlinable, and not one of the values intended for floating point values.
bool getMUBUFTfe(unsigned Opc)
LLVM_READONLY int32_t getGlobalSaddrOp(uint32_t Opcode)
LLVM_READONLY int32_t getVOPe64(uint32_t Opcode)
bool isGFX11Plus(const MCSubtargetInfo &STI)
std::optional< unsigned > getInlineEncodingV2F16(uint32_t Literal)
std::tuple< char, unsigned, unsigned > parseAsmConstraintPhysReg(StringRef Constraint)
Returns a valid charcode or 0 in the first entry if this is a valid physical register constraint.
bool isGFX10Plus(const MCSubtargetInfo &STI)
bool isUniformMMO(const MachineMemOperand *MMO)
std::optional< unsigned > getInlineEncodingV2I16(uint32_t Literal)
uint32_t decodeFltRoundToHWConversionTable(uint32_t FltRounds)
Read the hardware rounding mode equivalent of an AMDGPUFltRounds value.
bool isExtendedGlobalAddrSpace(unsigned AS)
LLVM_READONLY const MIMGDimInfo * getMIMGDimInfo(unsigned DimEnum)
std::optional< unsigned > getInlineEncodingV2BF16(uint32_t Literal)
LLVM_READONLY const MIMGBaseOpcodeInfo * getMIMGBaseOpcodeInfo(unsigned BaseOpcode)
LLVM_READNONE constexpr bool isChainCC(CallingConv::ID CC)
int getMaskedMIMGOp(unsigned Opc, unsigned NewChannels)
const ImageDimIntrinsicInfo * getImageDimIntrinsicInfo(unsigned Intr)
bool isInlinableLiteralI16(int32_t Literal, bool HasInv2Pi)
LLVM_READNONE constexpr bool canGuaranteeTCO(CallingConv::ID CC)
LLVM_READNONE constexpr bool isGraphics(CallingConv::ID CC)
bool isInlinableLiteral64(int64_t Literal, bool HasInv2Pi)
Is this literal inlinable.
const RsrcIntrinsic * lookupRsrcIntrinsic(unsigned Intr)
const uint64_t FltRoundConversionTable
constexpr std::underlying_type_t< E > Mask()
Get a bitmask with 1s in all places up to the high-order bit of E's largest value.
unsigned ID
LLVM IR allows the use of arbitrary numbers as calling convention identifiers.
Definition CallingConv.h:24
@ AMDGPU_CS
Used for Mesa/AMDPAL compute shaders.
@ AMDGPU_KERNEL
Used for AMDGPU code object kernels.
@ MaxID
The highest possible ID. Must be some 2^k - 1.
@ AMDGPU_Gfx
Used for AMD graphics targets.
@ AMDGPU_CS_ChainPreserve
Used on AMDGPUs to give the middle-end more control over argument placement.
@ AMDGPU_CS_Chain
Used on AMDGPUs to give the middle-end more control over argument placement.
@ AMDGPU_PS
Used for Mesa/AMDPAL pixel shaders.
@ Fast
Attempts to make calls as fast as possible (e.g.
Definition CallingConv.h:41
@ C
The default llvm calling convention, compatible with C.
Definition CallingConv.h:34
@ SETCC
SetCC operator - This evaluates to a true value iff the condition is true.
Definition ISDOpcodes.h:823
@ MERGE_VALUES
MERGE_VALUES - This node takes multiple discrete operands and returns them all as its individual resu...
Definition ISDOpcodes.h:261
@ STACKSAVE
STACKSAVE - STACKSAVE has one operand, an input chain.
@ PTRADD
PTRADD represents pointer arithmetic semantics, for targets that opt in using shouldPreservePtrArith(...
@ DELETED_NODE
DELETED_NODE - This is an illegal value that is used to catch errors.
Definition ISDOpcodes.h:45
@ SET_FPENV
Sets the current floating-point environment.
@ SMUL_LOHI
SMUL_LOHI/UMUL_LOHI - Multiply two integers of type iN, producing a signed/unsigned value of type i[2...
Definition ISDOpcodes.h:275
@ INSERT_SUBVECTOR
INSERT_SUBVECTOR(VECTOR1, VECTOR2, IDX) - Returns a vector with VECTOR2 inserted into VECTOR1.
Definition ISDOpcodes.h:600
@ BSWAP
Byte Swap and Counting operators.
Definition ISDOpcodes.h:783
@ ATOMIC_STORE
OUTCHAIN = ATOMIC_STORE(INCHAIN, val, ptr) This corresponds to "store atomic" instruction.
@ FMAD
FMAD - Perform a * b + c, while getting the same result as the separately rounded operations.
Definition ISDOpcodes.h:522
@ ADD
Simple integer binary arithmetic operators.
Definition ISDOpcodes.h:264
@ LOAD
LOAD and STORE have token chains as their first operand, then the same operands as an LLVM load/store...
@ ANY_EXTEND
ANY_EXTEND - Used for integer types. The high bits are undefined.
Definition ISDOpcodes.h:857
@ ATOMIC_LOAD_USUB_COND
@ FMA
FMA - Perform a * b + c with no intermediate rounding step.
Definition ISDOpcodes.h:518
@ INTRINSIC_VOID
OUTCHAIN = INTRINSIC_VOID(INCHAIN, INTRINSICID, arg1, arg2, ...) This node represents a target intrin...
Definition ISDOpcodes.h:220
@ GlobalAddress
Definition ISDOpcodes.h:88
@ ATOMIC_CMP_SWAP_WITH_SUCCESS
Val, Success, OUTCHAIN = ATOMIC_CMP_SWAP_WITH_SUCCESS(INCHAIN, ptr, cmp, swap) N.b.
@ SINT_TO_FP
[SU]INT_TO_FP - These operators convert integers (whose interpreted sign depends on the first letter)...
Definition ISDOpcodes.h:884
@ CONCAT_VECTORS
CONCAT_VECTORS(VECTOR0, VECTOR1, ...) - Given a number of values of vector type with the same length ...
Definition ISDOpcodes.h:584
@ FADD
Simple binary floating point operators.
Definition ISDOpcodes.h:417
@ ABS
ABS - Determine the unsigned absolute value of a signed integer value of the same bitwidth.
Definition ISDOpcodes.h:747
@ FP16_TO_FP
FP16_TO_FP, FP_TO_FP16 - These operators are used to perform promotions and truncation for half-preci...
@ BITCAST
BITCAST - This operator converts between integer, vector and FP values, as if the value was stored to...
Definition ISDOpcodes.h:997
@ BUILD_PAIR
BUILD_PAIR - This is the opposite of EXTRACT_ELEMENT in some ways.
Definition ISDOpcodes.h:254
@ FLDEXP
FLDEXP - ldexp, inspired by libm (op0 * 2**op1).
@ BUILTIN_OP_END
BUILTIN_OP_END - This must be the last enum value in this list.
@ ATOMIC_LOAD_USUB_SAT
@ CTLZ_ZERO_POISON
Definition ISDOpcodes.h:792
@ SET_ROUNDING
Set rounding mode.
Definition ISDOpcodes.h:979
@ CONVERGENCECTRL_GLUE
This does not correspond to any convergence control intrinsic.
@ SIGN_EXTEND
Conversion operators.
Definition ISDOpcodes.h:848
@ SCALAR_TO_VECTOR
SCALAR_TO_VECTOR(VAL) - This represents the operation of loading a scalar value into element 0 of the...
Definition ISDOpcodes.h:665
@ READSTEADYCOUNTER
READSTEADYCOUNTER - This corresponds to the readfixedcounter intrinsic.
@ BR
Control flow instructions. These all have token chains.
@ PREFETCH
PREFETCH - This corresponds to a prefetch intrinsic.
@ FSINCOS
FSINCOS - Compute both fsin and fcos as a single operation.
@ FNEG
Perform various unary floating-point operations inspired by libm.
@ BR_CC
BR_CC - Conditional branch.
@ SSUBO
Same for subtraction.
Definition ISDOpcodes.h:352
@ FCANONICALIZE
Returns platform specific canonical encoding of a floating point number.
Definition ISDOpcodes.h:541
@ IS_FPCLASS
Performs a check of floating point class property, defined by IEEE-754.
Definition ISDOpcodes.h:548
@ SSUBSAT
RESULT = [US]SUBSAT(LHS, RHS) - Perform saturation subtraction on 2 integers with the same bit width ...
Definition ISDOpcodes.h:374
@ SELECT
Select(COND, TRUEVAL, FALSEVAL).
Definition ISDOpcodes.h:800
@ ATOMIC_LOAD
Val, OUTCHAIN = ATOMIC_LOAD(INCHAIN, ptr) This corresponds to "load atomic" instruction.
@ UNDEF
UNDEF - An undefined node.
Definition ISDOpcodes.h:233
@ EXTRACT_ELEMENT
EXTRACT_ELEMENT - This is used to get the lower or upper (determined by a Constant,...
Definition ISDOpcodes.h:247
@ CopyFromReg
CopyFromReg - This node indicates that the input value is a virtual or physical register that is defi...
Definition ISDOpcodes.h:230
@ SADDO
RESULT, BOOL = [SU]ADDO(LHS, RHS) - Overflow-aware nodes for addition.
Definition ISDOpcodes.h:348
@ CTLS
Count leading redundant sign bits.
Definition ISDOpcodes.h:796
@ GET_ROUNDING
Returns current rounding mode: -1 Undefined, 0 Round to 0, 1 Round to nearest (ties to even), 2 Round to ...
Definition ISDOpcodes.h:974
@ MULHU
MULHU/MULHS - Multiply high - Multiply two integers of type iN, producing an unsigned/signed value of...
Definition ISDOpcodes.h:704
@ GET_FPMODE
Reads the current dynamic floating-point control modes.
@ GET_FPENV
Gets the current floating-point environment.
@ SHL
Shift and rotation operations.
Definition ISDOpcodes.h:769
@ VECTOR_SHUFFLE
VECTOR_SHUFFLE(VEC1, VEC2) - Returns a vector, of the same type as VEC1/VEC2.
Definition ISDOpcodes.h:649
@ EXTRACT_SUBVECTOR
EXTRACT_SUBVECTOR(VECTOR, IDX) - Returns a subvector from VECTOR.
Definition ISDOpcodes.h:614
@ FMINNUM_IEEE
FMINNUM_IEEE/FMAXNUM_IEEE - Perform floating-point minimumNumber or maximumNumber on two values,...
@ EXTRACT_VECTOR_ELT
EXTRACT_VECTOR_ELT(VECTOR, IDX) - Returns a single element from VECTOR identified by the (potentially...
Definition ISDOpcodes.h:576
@ CopyToReg
CopyToReg - This node has three operands: a chain, a register number to set to this value,...
Definition ISDOpcodes.h:224
@ ZERO_EXTEND
ZERO_EXTEND - Used for integer types, zeroing the new bits.
Definition ISDOpcodes.h:854
@ DEBUGTRAP
DEBUGTRAP - Trap intended to get the attention of a debugger.
@ SELECT_CC
Select with condition operator - This selects between a true value and a false value (ops #2 and #3) ...
Definition ISDOpcodes.h:815
@ ATOMIC_CMP_SWAP
Val, OUTCHAIN = ATOMIC_CMP_SWAP(INCHAIN, ptr, cmp, swap) For double-word atomic operations: ValLo,...
@ FMINNUM
FMINNUM/FMAXNUM - Perform floating-point minimum maximum on two values, following IEEE-754 definition...
@ SMULO
Same for multiplication.
Definition ISDOpcodes.h:356
@ DYNAMIC_STACKALLOC
DYNAMIC_STACKALLOC - Allocate some number of bytes on the stack aligned to a specified boundary.
@ SIGN_EXTEND_INREG
SIGN_EXTEND_INREG - This operator atomically performs a SHL/SRA pair to sign extend a small value in ...
Definition ISDOpcodes.h:892
@ SMIN
[US]{MIN/MAX} - Binary minimum or maximum of signed or unsigned integers.
Definition ISDOpcodes.h:727
@ FP_EXTEND
X = FP_EXTEND(Y) - Extend a smaller FP type into a larger FP type.
Definition ISDOpcodes.h:982
@ VSELECT
Select with a vector condition (op #0) and two vector operands (ops #1 and #2), returning a vector re...
Definition ISDOpcodes.h:809
@ UADDO_CARRY
Carry-using nodes for multiple precision addition and subtraction.
Definition ISDOpcodes.h:328
@ INLINEASM_BR
INLINEASM_BR - Branching version of inline asm. Used by asm-goto.
@ BF16_TO_FP
BF16_TO_FP, FP_TO_BF16 - These operators are used to perform promotions and truncation for bfloat16.
@ ATOMIC_LOAD_UDEC_WRAP
@ STRICT_FP_ROUND
X = STRICT_FP_ROUND(Y, TRUNC) - Rounding 'Y' from a larger floating point type down to the precision ...
Definition ISDOpcodes.h:500
@ FMINIMUM
FMINIMUM/FMAXIMUM - NaN-propagating minimum/maximum that also treat -0.0 as less than 0....
@ FP_TO_SINT
FP_TO_[US]INT - Convert a floating point value to a signed or unsigned integer.
Definition ISDOpcodes.h:930
@ READCYCLECOUNTER
READCYCLECOUNTER - This corresponds to the readcyclecounter intrinsic.
@ STRICT_FP_EXTEND
X = STRICT_FP_EXTEND(Y) - Extend a smaller FP type into a larger FP type.
Definition ISDOpcodes.h:505
@ AND
Bitwise operators - logical and, logical or, logical xor.
Definition ISDOpcodes.h:739
@ TRAP
TRAP - Trapping instruction.
@ INTRINSIC_WO_CHAIN
RESULT = INTRINSIC_WO_CHAIN(INTRINSICID, arg1, arg2, ...) This node represents a target intrinsic fun...
Definition ISDOpcodes.h:205
@ INSERT_VECTOR_ELT
INSERT_VECTOR_ELT(VECTOR, VAL, IDX) - Returns VECTOR with the element at IDX replaced with VAL.
Definition ISDOpcodes.h:565
@ TokenFactor
TokenFactor - This node takes multiple tokens as input and produces a single token result.
Definition ISDOpcodes.h:53
@ ATOMIC_SWAP
Val, OUTCHAIN = ATOMIC_SWAP(INCHAIN, ptr, amt) Val, OUTCHAIN = ATOMIC_LOAD_[OpName](INCHAIN,...
@ CTTZ_ZERO_POISON
Bit counting operators with a poisoned result for zero inputs.
Definition ISDOpcodes.h:791
@ ExternalSymbol
Definition ISDOpcodes.h:93
@ FFREXP
FFREXP - frexp, extract fractional and exponent component of a floating-point value.
@ FP_ROUND
X = FP_ROUND(Y, TRUNC) - Rounding 'Y' from a larger floating point type down to the precision of the ...
Definition ISDOpcodes.h:963
@ SPONENTRY
SPONENTRY - Represents the llvm.sponentry intrinsic.
Definition ISDOpcodes.h:122
@ ADDRSPACECAST
ADDRSPACECAST - This operator converts between pointers of different address spaces.
@ INLINEASM
INLINEASM - Represents an inline asm block.
@ FP_TO_SINT_SAT
FP_TO_[US]INT_SAT - Convert floating point value in operand 0 to a signed or unsigned scalar integer ...
Definition ISDOpcodes.h:949
@ TRUNCATE
TRUNCATE - Completely drop the high bits.
Definition ISDOpcodes.h:860
@ BRCOND
BRCOND - Conditional branch.
@ SHL_PARTS
SHL_PARTS/SRA_PARTS/SRL_PARTS - These operators are used for expanded integer shift operations.
Definition ISDOpcodes.h:837
@ AssertSext
AssertSext, AssertZext - These nodes record if a register contains a value that has already been zero...
Definition ISDOpcodes.h:62
@ ATOMIC_LOAD_UINC_WRAP
@ FCOPYSIGN
FCOPYSIGN(X, Y) - Return the value of X with the sign of Y.
Definition ISDOpcodes.h:534
@ SADDSAT
RESULT = [US]ADDSAT(LHS, RHS) - Perform saturation addition on 2 integers with the same bit width (W)...
Definition ISDOpcodes.h:365
@ FMINIMUMNUM
FMINIMUMNUM/FMAXIMUMNUM - minimumnum/maximumnum that is same with FMINNUM_IEEE and FMAXNUM_IEEE besid...
@ INTRINSIC_W_CHAIN
RESULT,OUTCHAIN = INTRINSIC_W_CHAIN(INCHAIN, INTRINSICID, arg1, ...) This node represents a target in...
Definition ISDOpcodes.h:213
@ BUILD_VECTOR
BUILD_VECTOR(ELT0, ELT1, ELT2, ELT3,...) - Return a fixed-width vector with the specified,...
Definition ISDOpcodes.h:556
LLVM_ABI CondCode getSetCCSwappedOperands(CondCode Operation)
Return the operation corresponding to (Y op X) when given the operation for (X op Y).
bool isSignedIntSetCC(CondCode Code)
Return true if this is a setcc instruction that performs a signed comparison when used with integer o...
CondCode
ISD::CondCode enum - These are ordered carefully to make the bitfields below work out,...
LoadExtType
LoadExtType enum - This enum defines the three variants of LOADEXT (load with extension).
This namespace contains an enum with a value for every intrinsic/builtin function known by LLVM.
LLVM_ABI Function * getDeclarationIfExists(const Module *M, ID id)
Look up the Function declaration of the intrinsic id in the Module M and return it if it exists.
LLVM_ABI AttributeSet getFnAttributes(LLVMContext &C, ID id)
Return the function attributes for an intrinsic.
LLVM_ABI AttributeList getAttributes(LLVMContext &C, ID id, FunctionType *FT)
Return the attributes for an intrinsic.
LLVM_ABI FunctionType * getType(LLVMContext &Context, ID id, ArrayRef< Type * > OverloadTys={})
Return the function type for an intrinsic.
BinaryOp_match< SpecificConstantMatch, SrcTy, TargetOpcode::G_SUB > m_Neg(const SrcTy &&Src)
Matches a register negated by a G_SUB.
bool mi_match(Reg R, const MachineRegisterInfo &MRI, Pattern &&P)
GFCstOrSplatGFCstMatch m_GFCstOrSplat(std::optional< FPValueAndVReg > &FPValReg)
BinaryOp_match< LHS, RHS, Instruction::Add > m_Add(const LHS &L, const RHS &R)
specificval_ty m_Specific(const Value *V)
Match if we have a specific specified value.
cst_pred_ty< is_one > m_One()
Match an integer 1 or a vector with all elements equal to 1.
auto m_Value()
Match an arbitrary value and ignore it.
BinaryOp_match< LHS, RHS, Instruction::Shl > m_Shl(const LHS &L, const RHS &R)
BinaryOp_match< LHS, RHS, Instruction::Sub > m_Sub(const LHS &L, const RHS &R)
auto m_IntrinsicWOChain(const OpndPreds &...Opnds)
bool sd_match(SDNode *N, const SelectionDAG *DAG, Pattern &&P)
ConstantInt_match m_ConstInt()
Match any integer constants or splat of an integer constant.
Offsets
Offsets in bytes from the start of the input buffer.
@ System
Synchronized with respect to all concurrently executing threads.
Definition LLVMContext.h:58
initializer< Ty > init(const Ty &Val)
constexpr double inv_pi
@ User
could "use" a pointer
NodeAddr< UseNode * > Use
Definition RDFGraph.h:385
NodeAddr< NodeBase * > Node
Definition RDFGraph.h:381
friend class Instruction
Iterator for Instructions in a BasicBlock.
Definition BasicBlock.h:73
This is an optimization pass for GlobalISel generic memory operations.
GenericUniformityInfo< SSAContext > UniformityInfo
auto drop_begin(T &&RangeOrContainer, size_t N=1)
Return a range covering RangeOrContainer with the first N elements excluded.
Definition STLExtras.h:315
@ Offset
Definition DWP.cpp:557
FunctionAddr VTableAddr Value
Definition InstrProf.h:137
ISD::CondCode getICmpCondCode(ICmpInst::Predicate Pred)
getICmpCondCode - Return the ISD condition code corresponding to the given LLVM IR integer condition ...
Definition Analysis.cpp:237
LLVM_ABI void finalizeBundle(MachineBasicBlock &MBB, MachineBasicBlock::instr_iterator FirstMI, MachineBasicBlock::instr_iterator LastMI)
finalizeBundle - Finalize a machine instruction bundle which includes a sequence of instructions star...
bool all_of(R &&range, UnaryPredicate P)
Provide wrappers to std::all_of which take ranges instead of having to pass begin/end explicitly.
Definition STLExtras.h:1738
MachineInstrBuilder BuildMI(MachineFunction &MF, const MIMetadata &MIMD, const MCInstrDesc &MCID)
Builder interface. Specify how to create the initial instruction itself.
detail::zippy< detail::zip_first, T, U, Args... > zip_equal(T &&t, U &&u, Args &&...args)
zip iterator that assumes that all iteratees have the same length.
Definition STLExtras.h:840
constexpr bool isInt(int64_t x)
Checks if an integer fits into the given bit width.
Definition MathExtras.h:165
LLVM_ABI bool isNullConstant(SDValue V)
Returns true if V is a constant integer zero.
std::pair< Value *, Value * > buildCmpXchgValue(IRBuilderBase &Builder, Value *Ptr, Value *Cmp, Value *Val, Align Alignment)
Emit IR to implement the given cmpxchg operation on values in registers, returning the new value.
@ Implicit
Not emitted register (e.g. carry, or temporary result).
@ Dead
Unused definition.
@ Kill
The last use of a register.
@ Undef
Value of the register doesn't matter.
@ Define
Register definition.
LLVM_ABI SDValue peekThroughBitcasts(SDValue V)
Return the non-bitcasted source operand of V if it exists.
decltype(auto) dyn_cast(const From &Val)
dyn_cast<X> - Return the argument parameter cast to the specified type.
Definition Casting.h:643
@ Done
Definition Threading.h:60
bool CCAssignFn(unsigned ValNo, MVT ValVT, MVT LocVT, CCValAssign::LocInfo LocInfo, ISD::ArgFlagsTy ArgFlags, Type *OrigTy, CCState &State)
CCAssignFn - This function assigns a location for Val, updating State to reflect the change.
constexpr int64_t minIntN(int64_t N)
Gets the minimum value for a N-bit signed integer.
Definition MathExtras.h:223
int bit_width(T Value)
Returns the number of bits needed to represent Value if Value is nonzero.
Definition bit.h:325
void append_range(Container &C, Range &&R)
Wrapper function to append range R to container C.
Definition STLExtras.h:2207
constexpr T alignDown(U Value, V Align, W Skew=0)
Returns the largest unsigned integer that is less than or equal to Value and is Skew mod Align.
Definition MathExtras.h:546
MemoryEffectsBase< IRMemLocation > MemoryEffects
Summary of how a function affects memory in the program.
Definition ModRef.h:356
constexpr int popcount(T Value) noexcept
Count the number of set bits in a value.
Definition bit.h:156
LLVM_ABI ConstantFPSDNode * isConstOrConstSplatFP(SDValue N, bool AllowUndefs=false)
Returns the SDNode if it is a constant splat BuildVector or constant float.
uint64_t PowerOf2Ceil(uint64_t A)
Returns the power of two which is greater than or equal to the given value.
Definition MathExtras.h:385
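A minimal PowerOf2Ceil sketch; the helper name roundUpPow2 is hypothetical.
#include "llvm/Support/MathExtras.h"
#include <cstdint>
// Hypothetical helper: round a byte count up to the next power of two,
// e.g. PowerOf2Ceil(17) == 32 and PowerOf2Ceil(32) == 32.
static uint64_t roundUpPow2(uint64_t Bytes) { return llvm::PowerOf2Ceil(Bytes); }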
int countr_zero(T Val)
Count number of 0's from the least significant bit to the most stopping at the first 1.
Definition bit.h:204
constexpr bool isShiftedMask_64(uint64_t Value)
Return true if the argument contains a non-empty sequence of ones with the remainder zero (64 bit ver...
Definition MathExtras.h:273
bool isReleaseOrStronger(AtomicOrdering AO)
constexpr T MinAlign(U A, V B)
A and B are either alignments or offsets.
Definition MathExtras.h:357
static const MachineMemOperand::Flags MONoClobber
Mark the MMO of a uniform load if there are no potentially clobbering stores on any path from the sta...
Definition SIInstrInfo.h:44
bool any_of(R &&range, UnaryPredicate P)
Provide wrappers to std::any_of which take ranges instead of having to pass begin/end explicitly.
Definition STLExtras.h:1745
unsigned Log2_32(uint32_t Value)
Return the floor log base 2 of the specified value, -1 if the value is zero.
Definition MathExtras.h:331
AtomicOrderingCABI
Atomic ordering for C11 / C++11's memory models.
int countl_zero(T Val)
Count number of 0's from the most significant bit to the least stopping at the first 1.
Definition bit.h:263
bool isBoolSGPR(SDValue V)
MachineInstr * getImm(const MachineOperand &MO, const MachineRegisterInfo *MRI)
constexpr bool isPowerOf2_32(uint32_t Value)
Return true if the argument is a power of two > 0.
Definition MathExtras.h:279
constexpr uint32_t Hi_32(uint64_t Value)
Return the high 32 bits of a 64 bit value.
Definition MathExtras.h:150
LLVM_ABI raw_ostream & dbgs()
dbgs() - This returns a reference to a raw_ostream for debugging messages.
Definition Debug.cpp:209
LLVM_ABI void report_fatal_error(Error Err, bool gen_crash_diag=true)
Definition Error.cpp:163
constexpr uint64_t alignTo(uint64_t Size, Align A)
Returns a multiple of A needed to store Size bytes.
Definition Alignment.h:144
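A minimal alignTo sketch; the helper name padTo is hypothetical.
#include "llvm/Support/Alignment.h"
#include <cstdint>
// Hypothetical helper: pad a raw byte count out to an alignment boundary,
// e.g. alignTo(10, Align(8)) == 16 and alignTo(16, Align(8)) == 16.
static uint64_t padTo(uint64_t Size, llvm::Align A) { return llvm::alignTo(Size, A); }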
constexpr bool isUInt(uint64_t x)
Checks if an unsigned integer fits into the given bit width.
Definition MathExtras.h:189
ISD::CondCode getFCmpCondCode(FCmpInst::Predicate Pred)
getFCmpCondCode - Return the ISD condition code corresponding to the given LLVM IR floating-point con...
Definition Analysis.cpp:203
class LLVM_GSL_OWNER SmallVector
Forward declaration of SmallVector so that calculateSmallVectorDefaultInlinedElements can reference s...
constexpr uint32_t Lo_32(uint64_t Value)
Return the low 32 bits of a 64 bit value.
Definition MathExtras.h:155
bool isa(const From &Val)
isa<X> - Return true if the parameter to the template is an instance of one of the template type argu...
Definition Casting.h:547
static const MachineMemOperand::Flags MOCooperative
Mark the MMO of cooperative load/store atomics.
Definition SIInstrInfo.h:52
Value * buildAtomicRMWValue(AtomicRMWInst::BinOp Op, IRBuilderBase &Builder, Value *Loaded, Value *Val)
Emit IR to implement the given atomicrmw operation on values in registers, returning the new value.
AtomicOrdering
Atomic ordering for LLVM's memory model.
constexpr T divideCeil(U Numerator, V Denominator)
Returns the integer ceil(Numerator / Denominator).
Definition MathExtras.h:394
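A minimal divideCeil sketch; the helper name numDWords is hypothetical.
#include "llvm/Support/MathExtras.h"
// Hypothetical helper: number of 32-bit words needed to hold Bits bits,
// e.g. divideCeil(96u, 32u) == 3 and divideCeil(33u, 32u) == 2.
static unsigned numDWords(unsigned Bits) { return llvm::divideCeil(Bits, 32u); }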
@ First
Helpers to iterate all locations in the MemoryEffectsBase class.
Definition ModRef.h:74
FunctionAddr VTableAddr uintptr_t uintptr_t Data
Definition InstrProf.h:221
@ AfterLegalizeDAG
Definition DAGCombine.h:19
@ AfterLegalizeVectorOps
Definition DAGCombine.h:18
@ AfterLegalizeTypes
Definition DAGCombine.h:17
@ Or
Bitwise or logical OR of integers.
@ Mul
Product of integers.
@ Add
Sum of integers.
uint16_t MCPhysReg
An unsigned integer type large enough to represent all physical registers, but not necessarily virtua...
Definition MCRegister.h:21
FunctionAddr VTableAddr Next
Definition InstrProf.h:141
DWARFExpression::Operation Op
unsigned M0(unsigned Val)
Definition VE.h:376
ArrayRef(const T &OneElt) -> ArrayRef< T >
LLVM_ABI ConstantSDNode * isConstOrConstSplat(SDValue N, bool AllowUndefs=false, bool AllowTruncation=false)
Returns the SDNode if it is a constant splat BuildVector or constant int.
constexpr int64_t maxIntN(int64_t N)
Gets the maximum value for a N-bit signed integer.
Definition MathExtras.h:232
constexpr unsigned BitWidth
decltype(auto) cast(const From &Val)
cast<X> - Return the argument parameter cast to the specified type.
Definition Casting.h:559
LLVM_ABI std::optional< ValueAndVReg > getIConstantVRegValWithLookThrough(Register VReg, const MachineRegisterInfo &MRI, bool LookThroughInstrs=true)
If VReg is defined by a statically evaluable chain of instructions rooted on a G_CONSTANT returns its...
Definition Utils.cpp:433
auto find_if(R &&Range, UnaryPredicate P)
Provide wrappers to std::find_if which take ranges instead of having to pass begin/end explicitly.
Definition STLExtras.h:1771
LLVM_ABI bool isOneConstant(SDValue V)
Returns true if V is a constant integer one.
static const MachineMemOperand::Flags MOLastUse
Mark the MMO of a load as the last use.
Definition SIInstrInfo.h:48
bool is_contained(R &&Range, const E &Element)
Returns true if Element is found in Range.
Definition STLExtras.h:1946
Align commonAlignment(Align A, uint64_t Offset)
Returns the alignment that satisfies both alignments.
Definition Alignment.h:201
constexpr T maskTrailingOnes(unsigned N)
Create a bitmask with the N right-most bits set to 1, and all other bits set to 0.
Definition MathExtras.h:77
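A minimal maskTrailingOnes sketch; the helper name lowMask is hypothetical.
#include "llvm/Support/MathExtras.h"
#include <cstdint>
// Hypothetical helper: build a mask with the low Width bits set,
// e.g. maskTrailingOnes<uint32_t>(5) == 0x1F and maskTrailingOnes<uint32_t>(0) == 0.
static uint32_t lowMask(unsigned Width) { return llvm::maskTrailingOnes<uint32_t>(Width); }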
constexpr RegState getUndefRegState(bool B)
@ Custom
The result value requires a custom uniformity check.
Definition Uniformity.h:31
LLVM_ABI Printable printReg(Register Reg, const TargetRegisterInfo *TRI=nullptr, unsigned SubIdx=0, const MachineRegisterInfo *MRI=nullptr)
Prints virtual and physical registers with or without a TRI instance.
void swap(llvm::BitVector &LHS, llvm::BitVector &RHS)
Implement std::swap in terms of BitVector swap.
Definition BitVector.h:876
#define N
int64_t DWordOffset
int64_t PermMask
std::tuple< const ArgDescriptor *, const TargetRegisterClass *, LLT > getPreloadedValue(PreloadedValue Value) const
static const AMDGPUFunctionArgInfo FixedABIFunctionInfo
static constexpr uint64_t encode(Fields... Values)
static std::tuple< typename Fields::ValueType... > decode(uint64_t Encoded)
This struct is a compact representation of a valid (non-zero power of two) alignment.
Definition Alignment.h:39
constexpr uint64_t value() const
This is a hole in the type system and should not be abused.
Definition Alignment.h:77
static ArgDescriptor createStack(unsigned Offset, unsigned Mask=~0u)
MCRegister getRegister() const
static ArgDescriptor createArg(const ArgDescriptor &Arg, unsigned Mask)
static ArgDescriptor createRegister(Register Reg, unsigned Mask=~0u)
Helper struct shared between Function Specialization and SCCP Solver.
Definition SCCPSolver.h:42
Represent subnormal handling kind for floating point instruction inputs and outputs.
@ Dynamic
Denormals have unknown treatment.
static constexpr DenormalMode getPreserveSign()
static constexpr DenormalMode getIEEE()
Extended Value Type.
Definition ValueTypes.h:35
TypeSize getStoreSize() const
Return the number of bytes overwritten by a store of the specified value type.
Definition ValueTypes.h:403
bool isSimple() const
Test if the given EVT is simple (as opposed to being extended).
Definition ValueTypes.h:145
static EVT getVectorVT(LLVMContext &Context, EVT VT, unsigned NumElements, bool IsScalable=false)
Returns the EVT that represents a vector NumElements in length, where each element is of type VT.
Definition ValueTypes.h:70
EVT changeTypeToInteger() const
Return the type converted to an equivalently sized integer or vector with integer element type.
Definition ValueTypes.h:129
bool bitsLT(EVT VT) const
Return true if this has less bits than VT.
Definition ValueTypes.h:308
bool isFloatingPoint() const
Return true if this is a FP or a vector FP type.
Definition ValueTypes.h:155
TypeSize getSizeInBits() const
Return the size of the specified value type in bits.
Definition ValueTypes.h:381
bool isByteSized() const
Return true if the bit size is a multiple of 8.
Definition ValueTypes.h:251
uint64_t getScalarSizeInBits() const
Definition ValueTypes.h:393
bool isPow2VectorType() const
Returns true if the given vector is a power of 2.
Definition ValueTypes.h:486
TypeSize getStoreSizeInBits() const
Return the number of bits overwritten by a store of the specified value type.
Definition ValueTypes.h:420
MVT getSimpleVT() const
Return the SimpleValueType held in the specified simple EVT.
Definition ValueTypes.h:324
static EVT getIntegerVT(LLVMContext &Context, unsigned BitWidth)
Returns the EVT that represents an integer with the given number of bits.
Definition ValueTypes.h:61
bool isVector() const
Return true if this is a vector value type.
Definition ValueTypes.h:176
EVT getScalarType() const
If this is a vector type, return the element type, otherwise return this.
Definition ValueTypes.h:331
bool bitsEq(EVT VT) const
Return true if this has the same number of bits as VT.
Definition ValueTypes.h:264
LLVM_ABI Type * getTypeForEVT(LLVMContext &Context) const
This method returns an LLVM type corresponding to the specified EVT.
EVT getVectorElementType() const
Given a vector type, return the type of each element.
Definition ValueTypes.h:336
EVT changeElementType(LLVMContext &Context, EVT EltVT) const
Return a VT for a type whose attributes match ourselves with the exception of the element type that i...
Definition ValueTypes.h:121
bool isScalarInteger() const
Return true if this is an integer, but not a vector.
Definition ValueTypes.h:165
LLVM_ABI const fltSemantics & getFltSemantics() const
Returns an APFloat semantics tag appropriate for the value type.
unsigned getVectorNumElements() const
Given a vector type, return the number of elements it contains.
Definition ValueTypes.h:344
unsigned getPointerAddrSpace() const
unsigned getByValSize() const
InputArg - This struct carries flags and type information about a single incoming (formal) argument o...
MVT VT
Legalized type of this argument part.
unsigned getOrigArgIndex() const
OutputArg - This struct carries flags and a value for a single outgoing (actual) argument or outgoing...
static LLVM_ABI std::optional< bool > eq(const KnownBits &LHS, const KnownBits &RHS)
Determine if these known bits always give the same ICMP_EQ result.
bool isUnknown() const
Returns true if we don't know any bits.
Definition KnownBits.h:64
KnownBits trunc(unsigned BitWidth) const
Return known bits for a truncation of the value we're tracking.
Definition KnownBits.h:165
unsigned getBitWidth() const
Get the bit width of this value.
Definition KnownBits.h:44
KnownBits zext(unsigned BitWidth) const
Return known bits for a zero extension of the value we're tracking.
Definition KnownBits.h:176
void resetAll()
Resets the known state of all bits.
Definition KnownBits.h:72
static KnownBits add(const KnownBits &LHS, const KnownBits &RHS, bool NSW=false, bool NUW=false, bool SelfAdd=false)
Compute knownbits resulting from addition of LHS and RHS.
Definition KnownBits.h:361
bool isNonZero() const
Returns true if this value is known to be non-zero.
Definition KnownBits.h:109
KnownBits extractBits(unsigned NumBits, unsigned BitPosition) const
Return a subset of the known bits from [bitPosition,bitPosition+numBits).
Definition KnownBits.h:239
KnownBits sext(unsigned BitWidth) const
Return known bits for a sign extension of the value we're tracking.
Definition KnownBits.h:184
unsigned countMinLeadingZeros() const
Returns the minimum number of leading zero bits.
Definition KnownBits.h:262
static LLVM_ABI std::optional< bool > ule(const KnownBits &LHS, const KnownBits &RHS)
Determine if these known bits always give the same ICMP_ULE result.
static LLVM_ABI std::optional< bool > uge(const KnownBits &LHS, const KnownBits &RHS)
Determine if these known bits always give the same ICMP_UGE result.
bool isAllOnes() const
Returns true if value is all one bits.
Definition KnownBits.h:81
bool isKnownNeverNaN() const
Return true if it's known this can never be a nan.
static LLVM_ABI KnownFPClass bitcast(const fltSemantics &FltSemantics, const KnownBits &Bits)
Report known values for a bitcast into a float with provided semantics.
This class contains a discriminated union of information about pointers in memory operands,...
static LLVM_ABI MachinePointerInfo getStack(MachineFunction &MF, int64_t Offset, uint8_t ID=0)
Stack pointer relative access.
MachinePointerInfo getWithOffset(int64_t O) const
static LLVM_ABI MachinePointerInfo getGOT(MachineFunction &MF)
Return a MachinePointerInfo record that refers to a GOT entry.
static LLVM_ABI MachinePointerInfo getFixedStack(MachineFunction &MF, int FI, int64_t Offset=0)
Return a MachinePointerInfo record that refers to the specified FrameIndex.
This struct is a compact representation of a valid (power of two) or undefined (0) alignment.
Definition Alignment.h:106
These are IR-level optimization flags that may be propagated to SDNodes.
bool hasNoUnsignedWrap() const
bool hasAllowContract() const
bool hasNoSignedWrap() const
This represents a list of ValueType's that has been intern'd by a SelectionDAG.
unsigned int NumVTs
DenormalMode FP64FP16Denormals
If this is set, neither input or output denormals are flushed for both f64 and f16/v2f16 instructions...
DenormalMode FP32Denormals
If this is set, neither input or output denormals are flushed for most f32 instructions.
This represents an addressing mode of: BaseGV + BaseOffs + BaseReg + Scale*ScaleReg + ScalableOffset*...
std::optional< unsigned > fallbackAddressSpace
This structure contains all information that is necessary for lowering calls.
SmallVector< ISD::InputArg, 32 > Ins
SmallVector< ISD::OutputArg, 32 > Outs