#include "llvm/IR/IntrinsicsAMDGPU.h"
#include "llvm/IR/IntrinsicsR600.h"

#define DEBUG_TYPE "amdgpu-legalinfo"

    "amdgpu-global-isel-new-legality",
    cl::desc("Use GlobalISel desired legality, rather than try to use "
             "rules compatible with selection patterns"),
  unsigned Bits = Ty.getSizeInBits();
    const LLT Ty = Query.Types[TypeIdx];
    return Ty.getNumElements() % 2 != 0 &&
           EltSize > 1 && EltSize < 32 &&
           Ty.getSizeInBits() % 32 != 0;
    const LLT Ty = Query.Types[TypeIdx];
    const LLT Ty = Query.Types[TypeIdx];
    return EltTy.getSizeInBits() == 16 && Ty.getNumElements() > 2;
    const LLT Ty = Query.Types[TypeIdx];
    return std::pair(TypeIdx,
    const LLT Ty = Query.Types[TypeIdx];
    unsigned Size = Ty.getSizeInBits();
    unsigned Pieces = (Size + 63) / 64;
    unsigned NewNumElts = (Ty.getNumElements() + 1) / Pieces;
    const LLT Ty = Query.Types[TypeIdx];
    const int Size = Ty.getSizeInBits();
    const int NextMul32 = (Size + 31) / 32;
    const int NewNumElts = (32 * NextMul32 + EltSize - 1) / EltSize;
    unsigned MemSize = Query.MMODescrs[0].MemoryTy.getSizeInBits();
    return std::make_pair(TypeIdx, LLT::scalar(MemSize));
    const LLT Ty = Query.Types[TypeIdx];
    const unsigned EltSize = Ty.getElementType().getSizeInBits();
    assert(EltSize == 32 || EltSize == 64);
    for (NewNumElts = NumElts; NewNumElts < MaxNumElts; ++NewNumElts) {
    return std::pair(TypeIdx,
    const unsigned NumElems = Ty.getElementCount().getFixedValue();
    const unsigned Size = Ty.getSizeInBits();
    const LLT Ty = Query.Types[TypeIdx];
    const LLT Ty = Query.Types[TypeIdx];
    unsigned Size = Ty.getSizeInBits();
    const LLT QueryTy = Query.Types[TypeIdx];
    const LLT QueryTy = Query.Types[TypeIdx];
    const LLT QueryTy = Query.Types[TypeIdx];
  return ((ST.useRealTrue16Insts() && Size == 16) || Size % 32 == 0) &&
  return EltSize == 16 || EltSize % 32 == 0;
  const int EltSize = Ty.getElementType().getSizeInBits();
  return EltSize == 32 || EltSize == 64 ||
         (EltSize == 16 && Ty.getNumElements() % 2 == 0) ||
         EltSize == 128 || EltSize == 256;
    LLT Ty = Query.Types[TypeIdx];
    const LLT QueryTy = Query.Types[TypeIdx];
  if (Ty.isPointerOrPointerVector())
    Ty = Ty.changeElementType(LLT::scalar(Ty.getScalarSizeInBits()));
         (ST.useRealTrue16Insts() && Ty == S16) ||
    const LLT Ty = Query.Types[TypeIdx];
    return !Ty.isVector() && Ty.getSizeInBits() > 32 &&
           Query.MMODescrs[0].MemoryTy.getSizeInBits() < Ty.getSizeInBits();
    unsigned MemSize = Query.MMODescrs[0].MemoryTy.getSizeInBits();
                                 bool IsLoad, bool IsAtomic) {
    return ST.enableFlatScratch() ? 128 : 32;
    return ST.useDS128() ? 128 : 64;
    return IsLoad ? 512 : 128;
    return ST.hasMultiDwordFlatScratchAddressing() || IsAtomic ? 128 : 32;
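// Note (editorial summary of the returns above): the widest single load/store
// the legalizer keeps intact depends on the address space -- e.g. DS can use
// 128-bit accesses only when useDS128() is set, and private/scratch is limited
// to 32 bits unless flat-scratch or multi-dword scratch addressing is
// available.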
  const bool IsLoad = Query.Opcode != AMDGPU::G_STORE;
  unsigned RegSize = Ty.getSizeInBits();
  unsigned AS = Query.Types[1].getAddressSpace();
  if (Ty.isVector() && MemSize != RegSize)
  if (IsLoad && MemSize < Size)
    MemSize = std::max(MemSize, Align);
  if (!ST.hasDwordx3LoadStores())
  if (AlignBits < MemSize) {
                                      Align(AlignBits / 8)))
  const unsigned Size = Ty.getSizeInBits();
  if (Ty.isPointerVector())
  unsigned EltSize = Ty.getScalarSizeInBits();
  return EltSize != 32 && EltSize != 64;
  const unsigned Size = Ty.getSizeInBits();
  if (Size != MemSizeInBits)
    return Size <= 32 && Ty.isVector();
  return Ty.isVector() && (!MemTy.isVector() || MemTy == Ty) &&
                                 uint64_t AlignInBits, unsigned AddrSpace,
  if (SizeInBits == 96 && ST.hasDwordx3LoadStores())
  if (AlignInBits < RoundedSize)
      RoundedSize, AddrSpace, Align(AlignInBits / 8),
      Query.Types[1].getAddressSpace(), Opcode);
    const unsigned NumParts = PointerTy.getSizeInBits() / 32;
    Register VectorReg = MRI.createGenericVirtualRegister(VectorTy);
    std::array<Register, 4> VectorElems;
    B.setInsertPt(B.getMBB(), ++B.getInsertPt());
    for (unsigned I = 0; I < NumParts; ++I)
          B.buildExtractVectorElementConstant(S32, VectorReg, I).getReg(0);
    B.buildMergeValues(MO, VectorElems);
  Register BitcastReg = MRI.createGenericVirtualRegister(VectorTy);
  B.setInsertPt(B.getMBB(), ++B.getInsertPt());
  auto Scalar = B.buildBitcast(ScalarTy, BitcastReg);
  B.buildIntToPtr(MO, Scalar);
    const unsigned NumParts = PointerTy.getSizeInBits() / 32;
    auto Unmerged = B.buildUnmerge(LLT::scalar(32), Pointer);
    for (unsigned I = 0; I < NumParts; ++I)
    return B.buildBuildVector(VectorTy, PointerParts).getReg(0);
  Register Scalar = B.buildPtrToInt(ScalarTy, Pointer).getReg(0);
  return B.buildBitcast(VectorTy, Scalar).getReg(0);
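// Note: the two helpers above repack wide, non-32-bit-multiple pointer
// operands (such as buffer fat pointers) for memory legalization -- one
// rebuilds the pointer from 32-bit vector elements, the other decomposes a
// pointer into an <N x s32> vector, or falls back to a plain scalar
// ptrtoint/bitcast when no split is needed.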
  auto GetAddrSpacePtr = [&TM](unsigned AS) {
  const LLT BufferStridedPtr =
  const LLT CodePtr = FlatPtr;
  const std::initializer_list<LLT> AddrSpaces64 = {
      GlobalPtr, ConstantPtr, FlatPtr
  const std::initializer_list<LLT> AddrSpaces32 = {
      LocalPtr, PrivatePtr, Constant32Ptr, RegionPtr
  const std::initializer_list<LLT> AddrSpaces128 = {RsrcPtr};
  const std::initializer_list<LLT> FPTypesBase = {
  const std::initializer_list<LLT> FPTypes16 = {
  const std::initializer_list<LLT> FPTypesPK16 = {
  const LLT MinScalarFPTy = ST.has16BitInsts() ? S16 : S32;
  if (ST.hasVOP3PInsts() && ST.hasAddNoCarry() && ST.hasIntClamp()) {
    if (ST.hasScalarAddSub64()) {
          .clampMaxNumElementsStrict(0, S16, 2)
          .clampMaxNumElementsStrict(0, S16, 2)
    if (ST.hasScalarSMulU64()) {
          .clampMaxNumElementsStrict(0, S16, 2)
          .clampMaxNumElementsStrict(0, S16, 2)
        .minScalarOrElt(0, S16)
  } else if (ST.has16BitInsts()) {
        .widenScalarToNextMultipleOf(0, 32)
  if (ST.hasMad64_32())
  if (ST.hasIntClamp()) {
      {G_SDIV, G_UDIV, G_SREM, G_UREM, G_SDIVREM, G_UDIVREM})
  if (ST.hasVOP3PInsts()) {
        .clampMaxNumElements(0, S8, 2)
      {G_UADDO, G_USUBO, G_UADDE, G_SADDE, G_USUBE, G_SSUBE})
       LocalPtr, ConstantPtr, PrivatePtr, FlatPtr })
      .clampScalar(0, S16, S64);
      {G_FADD, G_FMUL, G_FMA, G_FCANONICALIZE,
       G_STRICT_FADD, G_STRICT_FMUL, G_STRICT_FMA})
  if (ST.has16BitInsts()) {
    if (ST.hasVOP3PInsts())
      FPOpActions.legalFor({S16});
    TrigActions.customFor({S16});
    FDIVActions.customFor({S16});
  if (ST.hasPackedFP32Ops()) {
    FPOpActions.legalFor({V2S32});
    FPOpActions.clampMaxNumElementsStrict(0, S32, 2);
  auto &MinNumMaxNumIeee =
  if (ST.hasVOP3PInsts()) {
    MinNumMaxNumIeee.legalFor(FPTypesPK16)
        .clampMaxNumElements(0, S16, 2)
  } else if (ST.has16BitInsts()) {
    MinNumMaxNumIeee.legalFor(FPTypes16).clampScalar(0, S16, S64).scalarize(0);
    MinNumMaxNumIeee.legalFor(FPTypesBase)
      {G_FMINNUM, G_FMAXNUM, G_FMINIMUMNUM, G_FMAXIMUMNUM});
  if (ST.hasVOP3PInsts()) {
    MinNumMaxNum.customFor(FPTypesPK16)
        .clampMaxNumElements(0, S16, 2)
        .clampScalar(0, S16, S64)
  } else if (ST.has16BitInsts()) {
    MinNumMaxNum.customFor(FPTypes16)
        .clampScalar(0, S16, S64)
    MinNumMaxNum.customFor(FPTypesBase)
        .clampScalar(0, S32, S64)
  if (ST.hasVOP3PInsts())
      .legalFor(FPTypesPK16)
  if (ST.has16BitInsts()) {
  if (ST.hasFractBug()) {
  if (ST.hasCvtPkF16F32Inst()) {
        .clampMaxNumElements(0, S16, 2);
  FPTruncActions.scalarize(0).lower();
  if (ST.has16BitInsts()) {
  if (ST.hasMadF16() && ST.hasMadMacF32Insts())
    FMad.customFor({S32, S16});
  else if (ST.hasMadMacF32Insts())
    FMad.customFor({S32});
  else if (ST.hasMadF16())
    FMad.customFor({S16});
  if (ST.has16BitInsts()) {
    FRem.minScalar(0, S32)
        .clampMaxNumElements(0, S16, 2)
  if (ST.has16BitInsts())
  if (ST.has16BitInsts())
      .clampScalar(0, S16, S64)
      .clampScalar(0, S16, S64)
  if (ST.has16BitInsts()) {
        {G_INTRINSIC_TRUNC, G_FCEIL, G_INTRINSIC_ROUNDEVEN})
        .clampScalar(0, S16, S64)
        {G_INTRINSIC_TRUNC, G_FCEIL, G_INTRINSIC_ROUNDEVEN})
        .clampScalar(0, S32, S64)
        {G_INTRINSIC_TRUNC, G_FCEIL, G_INTRINSIC_ROUNDEVEN})
        .clampScalar(0, S32, S64)
      .scalarSameSizeAs(1, 0)
          {S1}, {S32, S64, GlobalPtr, LocalPtr, ConstantPtr, PrivatePtr, FlatPtr})
      .legalForCartesianProduct(
          {S32}, {S32, S64, GlobalPtr, LocalPtr, ConstantPtr, PrivatePtr, FlatPtr});
  if (ST.has16BitInsts()) {
    CmpBuilder.legalFor({{S1, S16}});
      {S1}, ST.has16BitInsts() ? FPTypes16 : FPTypesBase);
  if (ST.hasSALUFloatInsts())
  if (ST.has16BitInsts())
    ExpOps.customFor({{S32}, {S16}});
    ExpOps.customFor({S32});
  ExpOps.clampScalar(0, MinScalarFPTy, S32)
  if (ST.has16BitInsts())
      .clampScalar(0, S32, S32)
  if (ST.has16BitInsts())
      .widenScalarToNextPow2(1)
      .lowerFor({S1, S16})
      .widenScalarToNextPow2(1)
      .clampScalar(0, S32, S32)
      .clampScalar(0, S32, S64)
  if (ST.has16BitInsts()) {
        .clampMaxNumElementsStrict(0, S16, 2)
  if (ST.hasVOP3PInsts()) {
        .clampMaxNumElements(0, S16, 2)
  if (ST.hasIntMinMax64()) {
        .clampMaxNumElements(0, S16, 2)
        .clampMaxNumElements(0, S16, 2)
      .widenScalarToNextPow2(0)
      .legalForCartesianProduct(AddrSpaces32, {S32})
      .legalForCartesianProduct(AddrSpaces32, {S32})
  const auto needToSplitMemOp = [=](const LegalityQuery &Query,
                                    bool IsLoad) -> bool {
    unsigned MemSize = Query.MMODescrs[0].MemoryTy.getSizeInBits();
    unsigned NumRegs = (MemSize + 31) / 32;
    if (!ST.hasDwordx3LoadStores())
  unsigned GlobalAlign32 = ST.hasUnalignedBufferAccessEnabled() ? 0 : 32;
  unsigned GlobalAlign16 = ST.hasUnalignedBufferAccessEnabled() ? 0 : 16;
  unsigned GlobalAlign8 = ST.hasUnalignedBufferAccessEnabled() ? 0 : 8;
  for (unsigned Op : {G_LOAD, G_STORE}) {
    const bool IsStore = Op == G_STORE;
    Actions.legalForTypesWithMemDesc({{S32, GlobalPtr, S32, GlobalAlign32},
                                      {S64, GlobalPtr, S64, GlobalAlign32},
                                      {S32, GlobalPtr, S8, GlobalAlign8},
                                      {S32, GlobalPtr, S16, GlobalAlign16},
                                      {S32, LocalPtr, S32, 32},
                                      {S64, LocalPtr, S64, 32},
                                      {S32, LocalPtr, S8, 8},
                                      {S32, LocalPtr, S16, 16},
                                      {S32, PrivatePtr, S32, 32},
                                      {S32, PrivatePtr, S8, 8},
                                      {S32, PrivatePtr, S16, 16},
                                      {S32, ConstantPtr, S32, GlobalAlign32},
                                      {S64, ConstantPtr, S64, GlobalAlign32},
                                      {V2S32, ConstantPtr, V2S32, GlobalAlign32}});
    Actions.unsupportedIf(
        typeInSet(1, {BufferFatPtr, BufferStridedPtr, RsrcPtr}));
    Actions.customIf(typeIs(1, Constant32Ptr));
          return !Query.Types[0].isVector() &&
                 needToSplitMemOp(Query, Op == G_LOAD);
        [=](const LegalityQuery &Query) -> std::pair<unsigned, LLT> {
          unsigned MemSize = Query.MMODescrs[0].MemoryTy.getSizeInBits();
          if (DstSize > MemSize)
          if (MemSize > MaxSize)
          return Query.Types[0].isVector() &&
                 needToSplitMemOp(Query, Op == G_LOAD);
        [=](const LegalityQuery &Query) -> std::pair<unsigned, LLT> {
          unsigned MemSize = Query.MMODescrs[0].MemoryTy.getSizeInBits();
          if (MemSize > MaxSize) {
            if (MaxSize % EltSize == 0) {
            unsigned NumPieces = MemSize / MaxSize;
            if (NumPieces == 1 || NumPieces >= NumElts ||
                NumElts % NumPieces != 0)
              return std::pair(0, EltTy);
            return std::pair(0, EltTy);
          return std::pair(0, EltTy);
      .widenScalarToNextPow2(0)
      .legalForTypesWithMemDesc({{S32, GlobalPtr, S8, 8},
                                 {S32, GlobalPtr, S16, 2 * 8},
                                 {S32, LocalPtr, S8, 8},
                                 {S32, LocalPtr, S16, 16},
                                 {S32, PrivatePtr, S8, 8},
                                 {S32, PrivatePtr, S16, 16},
                                 {S32, ConstantPtr, S8, 8},
                                 {S32, ConstantPtr, S16, 2 * 8}})
  if (ST.hasFlatAddressSpace()) {
    ExtLoads.legalForTypesWithMemDesc(
        {{S32, FlatPtr, S8, 8}, {S32, FlatPtr, S16, 16}});
      {G_ATOMICRMW_XCHG, G_ATOMICRMW_ADD, G_ATOMICRMW_SUB,
       G_ATOMICRMW_AND, G_ATOMICRMW_OR, G_ATOMICRMW_XOR,
       G_ATOMICRMW_MAX, G_ATOMICRMW_MIN, G_ATOMICRMW_UMAX,
       G_ATOMICRMW_UMIN, G_ATOMICRMW_UINC_WRAP, G_ATOMICRMW_UDEC_WRAP})
      .legalFor({{S32, GlobalPtr}, {S32, LocalPtr},
                 {S64, GlobalPtr}, {S64, LocalPtr},
                 {S32, RegionPtr}, {S64, RegionPtr}});
  if (ST.hasFlatAddressSpace()) {
    Atomics.legalFor({{S32, FlatPtr}, {S64, FlatPtr}});
  if (ST.hasLDSFPAtomicAddF32()) {
    Atomic.legalFor({{S32, LocalPtr}, {S32, RegionPtr}});
    if (ST.hasLdsAtomicAddF64())
      Atomic.legalFor({{S64, LocalPtr}});
    if (ST.hasAtomicDsPkAdd16Insts())
      Atomic.legalFor({{V2F16, LocalPtr}, {V2BF16, LocalPtr}});
  if (ST.hasAtomicFaddInsts())
    Atomic.legalFor({{S32, GlobalPtr}});
  if (ST.hasFlatAtomicFaddF32Inst())
    Atomic.legalFor({{S32, FlatPtr}});
  if (ST.hasGFX90AInsts() || ST.hasGFX1250Insts()) {
  if (ST.hasAtomicBufferGlobalPkAddF16NoRtnInsts() ||
      ST.hasAtomicBufferGlobalPkAddF16Insts())
    Atomic.legalFor({{V2F16, GlobalPtr}, {V2F16, BufferFatPtr}});
  if (ST.hasAtomicGlobalPkAddBF16Inst())
    Atomic.legalFor({{V2BF16, GlobalPtr}});
  if (ST.hasAtomicFlatPkAdd16Insts())
    Atomic.legalFor({{V2F16, FlatPtr}, {V2BF16, FlatPtr}});
  auto &AtomicFMinFMax =
          .legalFor({{F32, LocalPtr}, {F64, LocalPtr}});
  if (ST.hasAtomicFMinFMaxF32GlobalInsts())
  if (ST.hasAtomicFMinFMaxF64GlobalInsts())
    AtomicFMinFMax.legalFor({{F64, GlobalPtr}, {F64, BufferFatPtr}});
  if (ST.hasAtomicFMinFMaxF32FlatInsts())
  if (ST.hasAtomicFMinFMaxF64FlatInsts())
                 {S32, FlatPtr}, {S64, FlatPtr}})
      .legalFor({{S32, LocalPtr}, {S64, LocalPtr},
                 {S32, RegionPtr}, {S64, RegionPtr}});
       LocalPtr, FlatPtr, PrivatePtr,
      .clampScalar(0, S16, S64)
  if (ST.has16BitInsts()) {
    if (ST.hasVOP3PInsts()) {
          .clampMaxNumElements(0, S16, 2);
      Shifts.legalFor({{S16, S16}});
    Shifts.widenScalarIf(
          const LLT AmountTy = Query.Types[1];
          return ValTy.isScalar() && ValTy.getSizeInBits() <= 16 &&
    Shifts.clampScalar(1, S32, S32);
    Shifts.widenScalarToNextPow2(0, 16);
    Shifts.clampScalar(0, S16, S64);
    Shifts.clampScalar(1, S32, S32);
    Shifts.widenScalarToNextPow2(0, 32);
    Shifts.clampScalar(0, S32, S64);
  for (unsigned Op : {G_EXTRACT_VECTOR_ELT, G_INSERT_VECTOR_ELT}) {
    unsigned VecTypeIdx = Op == G_EXTRACT_VECTOR_ELT ? 1 : 0;
    unsigned EltTypeIdx = Op == G_EXTRACT_VECTOR_ELT ? 0 : 1;
    unsigned IdxTypeIdx = 2;
          const LLT EltTy = Query.Types[EltTypeIdx];
          const LLT VecTy = Query.Types[VecTypeIdx];
          const LLT IdxTy = Query.Types[IdxTypeIdx];
          const bool isLegalVecType =
          return (EltSize == 32 || EltSize == 64) &&
          const LLT EltTy = Query.Types[EltTypeIdx];
          const LLT VecTy = Query.Types[VecTypeIdx];
          const unsigned TargetEltSize = DstEltSize % 64 == 0 ? 64 : 32;
          return std::pair(VecTypeIdx,
        .clampScalar(EltTypeIdx, S32, S64)
        const LLT &EltTy = Query.Types[1].getElementType();
        return Query.Types[0] != EltTy;
  for (unsigned Op : {G_EXTRACT, G_INSERT}) {
    unsigned BigTyIdx = Op == G_EXTRACT ? 1 : 0;
    unsigned LitTyIdx = Op == G_EXTRACT ? 0 : 1;
          const LLT BigTy = Query.Types[BigTyIdx];
          const LLT BigTy = Query.Types[BigTyIdx];
          const LLT LitTy = Query.Types[LitTyIdx];
          const LLT BigTy = Query.Types[BigTyIdx];
          const LLT LitTy = Query.Types[LitTyIdx];
  if (ST.hasScalarPackInsts()) {
        .minScalarOrElt(0, S16)
    BuildVector.customFor({V2S16, S16});
    BuildVector.minScalarOrElt(0, S32);
  for (unsigned Op : {G_MERGE_VALUES, G_UNMERGE_VALUES}) {
    unsigned BigTyIdx = Op == G_MERGE_VALUES ? 0 : 1;
    unsigned LitTyIdx = Op == G_MERGE_VALUES ? 1 : 0;
    auto notValidElt = [=](const LegalityQuery &Query, unsigned TypeIdx) {
      const LLT Ty = Query.Types[TypeIdx];
      if (Ty.isVector()) {
          const LLT BigTy = Query.Types[BigTyIdx];
          return notValidElt(Query, LitTyIdx);
          return notValidElt(Query, BigTyIdx);
    if (Op == G_MERGE_VALUES) {
      Builder.widenScalarIf(
            const LLT Ty = Query.Types[LitTyIdx];
            return Ty.getSizeInBits() < 32;
          const LLT Ty = Query.Types[BigTyIdx];
          return Ty.getSizeInBits() % 16 != 0;
          const LLT &Ty = Query.Types[BigTyIdx];
          unsigned NewSizeInBits = 1 << Log2_32_Ceil(Ty.getSizeInBits() + 1);
          if (NewSizeInBits >= 256) {
            unsigned RoundedTo = alignTo<64>(Ty.getSizeInBits() + 1);
            if (RoundedTo < NewSizeInBits)
              NewSizeInBits = RoundedTo;
          return std::pair(BigTyIdx, LLT::scalar(NewSizeInBits));
      .clampScalar(0, S32, S64);
  if (ST.hasVOP3PInsts()) {
    SextInReg.lowerFor({{V2S16}})
        .clampMaxNumElementsStrict(0, S16, 2);
  } else if (ST.has16BitInsts()) {
    SextInReg.lowerFor({{S32}, {S64}, {S16}});
    SextInReg.lowerFor({{S32}, {S64}});
  FSHRActionDefs.legalFor({{S32, S32}})
      .clampMaxNumElementsStrict(0, S16, 2);
  if (ST.hasVOP3PInsts())
    FSHRActionDefs.scalarize(0).lower();
  if (ST.hasVOP3PInsts()) {
        .clampMaxNumElementsStrict(0, S16, 2)
      .clampScalar(1, S32, S32)
       G_ATOMIC_CMPXCHG_WITH_SUCCESS, G_ATOMICRMW_NAND, G_ATOMICRMW_FSUB,
       G_READ_REGISTER, G_WRITE_REGISTER,
  if (ST.hasIEEEMinimumMaximumInsts()) {
        .legalFor(FPTypesPK16)
  } else if (ST.hasVOP3PInsts()) {
        .clampMaxNumElementsStrict(0, S16, 2)
       G_INDEXED_LOAD, G_INDEXED_SEXTLOAD,
       G_INDEXED_ZEXTLOAD, G_INDEXED_STORE})
      {G_VECREDUCE_SMIN, G_VECREDUCE_SMAX, G_VECREDUCE_UMIN, G_VECREDUCE_UMAX,
       G_VECREDUCE_ADD, G_VECREDUCE_MUL, G_VECREDUCE_FMUL, G_VECREDUCE_FMIN,
       G_VECREDUCE_FMAX, G_VECREDUCE_FMINIMUM, G_VECREDUCE_FMAXIMUM,
       G_VECREDUCE_OR, G_VECREDUCE_AND, G_VECREDUCE_XOR})
  verify(*ST.getInstrInfo());
  switch (MI.getOpcode()) {
  case TargetOpcode::G_ADDRSPACE_CAST:
  case TargetOpcode::G_INTRINSIC_ROUNDEVEN:
  case TargetOpcode::G_FCEIL:
  case TargetOpcode::G_FREM:
  case TargetOpcode::G_INTRINSIC_TRUNC:
  case TargetOpcode::G_SITOFP:
  case TargetOpcode::G_UITOFP:
  case TargetOpcode::G_FPTOSI:
  case TargetOpcode::G_FPTOUI:
  case TargetOpcode::G_FMINNUM:
  case TargetOpcode::G_FMAXNUM:
  case TargetOpcode::G_FMINIMUMNUM:
  case TargetOpcode::G_FMAXIMUMNUM:
  case TargetOpcode::G_EXTRACT_VECTOR_ELT:
  case TargetOpcode::G_INSERT_VECTOR_ELT:
  case TargetOpcode::G_FSIN:
  case TargetOpcode::G_FCOS:
  case TargetOpcode::G_GLOBAL_VALUE:
  case TargetOpcode::G_LOAD:
  case TargetOpcode::G_SEXTLOAD:
  case TargetOpcode::G_ZEXTLOAD:
  case TargetOpcode::G_STORE:
  case TargetOpcode::G_FMAD:
  case TargetOpcode::G_FDIV:
  case TargetOpcode::G_FFREXP:
  case TargetOpcode::G_FSQRT:
  case TargetOpcode::G_UDIV:
  case TargetOpcode::G_UREM:
  case TargetOpcode::G_UDIVREM:
  case TargetOpcode::G_SDIV:
  case TargetOpcode::G_SREM:
  case TargetOpcode::G_SDIVREM:
  case TargetOpcode::G_ATOMIC_CMPXCHG:
  case TargetOpcode::G_FLOG2:
  case TargetOpcode::G_FLOG:
  case TargetOpcode::G_FLOG10:
  case TargetOpcode::G_FEXP2:
  case TargetOpcode::G_FEXP:
  case TargetOpcode::G_FEXP10:
  case TargetOpcode::G_FPOW:
  case TargetOpcode::G_FFLOOR:
  case TargetOpcode::G_BUILD_VECTOR:
  case TargetOpcode::G_BUILD_VECTOR_TRUNC:
  case TargetOpcode::G_MUL:
  case TargetOpcode::G_CTLZ:
  case TargetOpcode::G_CTTZ:
  case TargetOpcode::G_CTLZ_ZERO_UNDEF:
  case TargetOpcode::G_STACKSAVE:
  case TargetOpcode::G_GET_FPENV:
  case TargetOpcode::G_SET_FPENV:
  case TargetOpcode::G_TRAP:
  case TargetOpcode::G_DEBUGTRAP:
  if (ST.hasApertureRegs()) {
            ? AMDGPU::SRC_SHARED_BASE
            : AMDGPU::SRC_PRIVATE_BASE;
    assert((ApertureRegNo != AMDGPU::SRC_PRIVATE_BASE ||
            !ST.hasGloballyAddressableScratch()) &&
           "Cannot use src_private_base with globally addressable scratch!");
    MRI.setRegClass(Dst, &AMDGPU::SReg_64RegClass);
    B.buildCopy({Dst}, {Register(ApertureRegNo)});
    return B.buildUnmerge(S32, Dst).getReg(1);
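// Note: with aperture registers the shared/private aperture lives in the high
// 32 bits of SRC_SHARED_BASE / SRC_PRIVATE_BASE, which is why the 64-bit copy
// is unmerged to s32 halves and register index 1 (the high half) is returned.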
    Register LoadAddr = MRI.createGenericVirtualRegister(
        ST.getTargetLowering()->getImplicitParameterOffset(B.getMF(), Param);
    Register KernargPtrReg = MRI.createGenericVirtualRegister(
    B.buildObjectPtrOffset(LoadAddr, KernargPtrReg,
    return B.buildLoad(S32, LoadAddr, *MMO).getReg(0);
  Register QueuePtr = MRI.createGenericVirtualRegister(
  B.buildObjectPtrOffset(
      B.buildConstant(LLT::scalar(64), StructOffset).getReg(0));
  return B.buildLoad(S32, LoadAddr, *MMO).getReg(0);
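// Note: without aperture registers the aperture value is instead read from
// memory as a single s32 load -- either from the implicit kernarg segment (via
// getImplicitParameterOffset) or from a fixed StructOffset into the queue
// pointer, as built above.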
  switch (Def->getOpcode()) {
  case AMDGPU::G_FRAME_INDEX:
  case AMDGPU::G_GLOBAL_VALUE:
  case AMDGPU::G_BLOCK_ADDR:
  case AMDGPU::G_CONSTANT: {
    const ConstantInt *CI = Def->getOperand(1).getCImm();
    return CI->getSExtValue() != TM.getNullPointerValue(AddrSpace);
  assert(MI.getOpcode() == TargetOpcode::G_ADDRSPACE_CAST ||
             Intrinsic::amdgcn_addrspacecast_nonnull));
                     : MI.getOperand(1).getReg();
  LLT DstTy = MRI.getType(Dst);
  LLT SrcTy = MRI.getType(Src);
  unsigned SrcAS = SrcTy.getAddressSpace();
  if (TM.isNoopAddrSpaceCast(SrcAS, DestAS)) {
    MI.setDesc(B.getTII().get(TargetOpcode::G_BITCAST));
  auto castFlatToLocalOrPrivate = [&](const DstOp &Dst) -> Register {
        ST.hasGloballyAddressableScratch()) {
      Register SrcLo = B.buildExtract(S32, Src, 0).getReg(0);
          B.buildInstr(AMDGPU::S_MOV_B32, {S32},
                       {Register(AMDGPU::SRC_FLAT_SCRATCH_BASE_LO)})
      MRI.setRegClass(FlatScratchBaseLo, &AMDGPU::SReg_32RegClass);
      return B.buildIntToPtr(Dst, Sub).getReg(0);
    return B.buildExtract(Dst, Src, 0).getReg(0);
    castFlatToLocalOrPrivate(Dst);
    MI.eraseFromParent();
    unsigned NullVal = TM.getNullPointerValue(DestAS);
    auto SegmentNull = B.buildConstant(DstTy, NullVal);
    auto FlatNull = B.buildConstant(SrcTy, 0);
    auto PtrLo32 = castFlatToLocalOrPrivate(DstTy);
    B.buildSelect(Dst, CmpRes, PtrLo32, SegmentNull.getReg(0));
    MI.eraseFromParent();
  auto castLocalOrPrivateToFlat = [&](const DstOp &Dst) -> Register {
    Register SrcAsInt = B.buildPtrToInt(S32, Src).getReg(0);
        ST.hasGloballyAddressableScratch()) {
      ThreadID = B.buildIntrinsic(Intrinsic::amdgcn_mbcnt_lo, {S32})
      if (ST.isWave64()) {
        ThreadID = B.buildIntrinsic(Intrinsic::amdgcn_mbcnt_hi, {S32})
          B.buildConstant(S32, 57 - 32 - ST.getWavefrontSizeLog2()).getReg(0);
      Register SrcHi = B.buildShl(S32, ThreadID, ShAmt).getReg(0);
          B.buildMergeLikeInstr(DstTy, {SrcAsInt, SrcHi}).getReg(0);
          B.buildInstr(AMDGPU::S_MOV_B64, {S64},
                       {Register(AMDGPU::SRC_FLAT_SCRATCH_BASE)})
      MRI.setRegClass(FlatScratchBase, &AMDGPU::SReg_64RegClass);
      return B.buildPtrAdd(Dst, CvtPtr, FlatScratchBase).getReg(0);
    return B.buildMergeLikeInstr(Dst, {SrcAsInt, ApertureReg}).getReg(0);
    castLocalOrPrivateToFlat(Dst);
    MI.eraseFromParent();
    Register BuildPtr = castLocalOrPrivateToFlat(DstTy);
    auto SegmentNull = B.buildConstant(SrcTy, TM.getNullPointerValue(SrcAS));
    auto FlatNull = B.buildConstant(DstTy, TM.getNullPointerValue(DestAS));
                             SegmentNull.getReg(0));
    B.buildSelect(Dst, CmpRes, BuildPtr, FlatNull);
    MI.eraseFromParent();
      SrcTy.getSizeInBits() == 64) {
    B.buildExtract(Dst, Src, 0);
    MI.eraseFromParent();
    uint32_t AddrHiVal = Info->get32BitAddressHighBits();
    auto PtrLo = B.buildPtrToInt(S32, Src);
    auto HighAddr = B.buildConstant(S32, AddrHiVal);
    B.buildMergeLikeInstr(Dst, {PtrLo, HighAddr});
    MI.eraseFromParent();
  MI.eraseFromParent();
  LLT Ty = MRI.getType(Src);
  assert(Ty.isScalar() && Ty.getSizeInBits() == 64);
  auto C1 = B.buildFConstant(Ty, C1Val);
  auto CopySign = B.buildFCopysign(Ty, C1, Src);
  auto Tmp1 = B.buildFAdd(Ty, Src, CopySign);
  auto Tmp2 = B.buildFSub(Ty, Tmp1, CopySign);
  auto C2 = B.buildFConstant(Ty, C2Val);
  auto Fabs = B.buildFAbs(Ty, Src);
  B.buildSelect(MI.getOperand(0).getReg(), Cond, Src, Tmp2);
  MI.eraseFromParent();
  auto Trunc = B.buildIntrinsicTrunc(S64, Src);
  const auto Zero = B.buildFConstant(S64, 0.0);
  const auto One = B.buildFConstant(S64, 1.0);
  auto And = B.buildAnd(S1, Lt0, NeTrunc);
  auto Add = B.buildSelect(S64, And, One, Zero);
  B.buildFAdd(MI.getOperand(0).getReg(), Trunc, Add);
  MI.eraseFromParent();
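// Note: this G_FCEIL expansion computes Trunc = trunc(Src) and then adds 1.0
// exactly when Src < 0 is false-adjusted out, i.e. when Src is positive and
// Src != Trunc (the And of the two compares built above), so that positive
// values that were rounded down get bumped back up.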
  Register Src0Reg = MI.getOperand(1).getReg();
  Register Src1Reg = MI.getOperand(2).getReg();
  auto Flags = MI.getFlags();
  LLT Ty = MRI.getType(DstReg);
  auto Div = B.buildFDiv(Ty, Src0Reg, Src1Reg, Flags);
  auto Trunc = B.buildIntrinsicTrunc(Ty, Div, Flags);
  auto Neg = B.buildFNeg(Ty, Trunc, Flags);
  B.buildFMA(DstReg, Neg, Src1Reg, Src0Reg, Flags);
  MI.eraseFromParent();
  const unsigned FractBits = 52;
  const unsigned ExpBits = 11;
  auto Const0 = B.buildConstant(S32, FractBits - 32);
  auto Const1 = B.buildConstant(S32, ExpBits);
  auto ExpPart = B.buildIntrinsic(Intrinsic::amdgcn_ubfe, {S32})
                     .addUse(Const0.getReg(0))
                     .addUse(Const1.getReg(0));
  return B.buildSub(S32, ExpPart, B.buildConstant(S32, 1023));
  auto Unmerge = B.buildUnmerge({S32, S32}, Src);
  const unsigned FractBits = 52;
  const auto SignBitMask = B.buildConstant(S32, UINT32_C(1) << 31);
  auto SignBit = B.buildAnd(S32, Hi, SignBitMask);
  const auto FractMask = B.buildConstant(S64, (UINT64_C(1) << FractBits) - 1);
  const auto Zero32 = B.buildConstant(S32, 0);
  auto SignBit64 = B.buildMergeLikeInstr(S64, {Zero32, SignBit});
  auto Shr = B.buildAShr(S64, FractMask, Exp);
  auto Not = B.buildNot(S64, Shr);
  auto Tmp0 = B.buildAnd(S64, Src, Not);
  auto FiftyOne = B.buildConstant(S32, FractBits - 1);
  auto Tmp1 = B.buildSelect(S64, ExpLt0, SignBit64, Tmp0);
  B.buildSelect(MI.getOperand(0).getReg(), ExpGt51, Src, Tmp1);
  MI.eraseFromParent();
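// Note: the f64 G_INTRINSIC_TRUNC expansion works on the raw bit pattern: it
// extracts the unbiased exponent, masks away the fraction bits that fall below
// the binary point, returns just the sign bit (+/-0) when the exponent is
// negative, and returns the unmodified source once the exponent exceeds the 51
// representable fraction bits.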
  auto Unmerge = B.buildUnmerge({S32, S32}, Src);
  auto ThirtyTwo = B.buildConstant(S32, 32);
  if (MRI.getType(Dst) == S64) {
    auto CvtHi = Signed ? B.buildSITOFP(S64, Unmerge.getReg(1))
                        : B.buildUITOFP(S64, Unmerge.getReg(1));
    auto CvtLo = B.buildUITOFP(S64, Unmerge.getReg(0));
    auto LdExp = B.buildFLdexp(S64, CvtHi, ThirtyTwo);
    B.buildFAdd(Dst, LdExp, CvtLo);
    MI.eraseFromParent();
  auto One = B.buildConstant(S32, 1);
    auto ThirtyOne = B.buildConstant(S32, 31);
    auto X = B.buildXor(S32, Unmerge.getReg(0), Unmerge.getReg(1));
    auto OppositeSign = B.buildAShr(S32, X, ThirtyOne);
    auto MaxShAmt = B.buildAdd(S32, ThirtyTwo, OppositeSign);
    auto LS = B.buildIntrinsic(Intrinsic::amdgcn_sffbh, {S32})
                  .addUse(Unmerge.getReg(1));
    auto LS2 = B.buildSub(S32, LS, One);
    ShAmt = B.buildUMin(S32, LS2, MaxShAmt);
    ShAmt = B.buildCTLZ(S32, Unmerge.getReg(1));
  auto Norm = B.buildShl(S64, Src, ShAmt);
  auto Unmerge2 = B.buildUnmerge({S32, S32}, Norm);
  auto Adjust = B.buildUMin(S32, One, Unmerge2.getReg(0));
  auto Norm2 = B.buildOr(S32, Unmerge2.getReg(1), Adjust);
  auto FVal = Signed ? B.buildSITOFP(S32, Norm2) : B.buildUITOFP(S32, Norm2);
  auto Scale = B.buildSub(S32, ThirtyTwo, ShAmt);
  B.buildFLdexp(Dst, FVal, Scale);
  MI.eraseFromParent();
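// Note: 64-bit integer to f64 conversion splits the source into 32-bit halves,
// converts the high half, scales it by 2^32 with ldexp, and adds the converted
// low half. The f32 result path instead normalizes the source with a
// leading-bit shift so a single 32-bit convert followed by ldexp(FVal, 32 -
// ShAmt) yields a correctly rounded value.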
  const LLT SrcLT = MRI.getType(Src);
  unsigned Flags = MI.getFlags();
  auto Trunc = B.buildIntrinsicTrunc(SrcLT, Src, Flags);
    Sign = B.buildAShr(S32, Src, B.buildConstant(S32, 31));
    Trunc = B.buildFAbs(S32, Trunc, Flags);
    K0 = B.buildFConstant(
    K1 = B.buildFConstant(
    K0 = B.buildFConstant(
    K1 = B.buildFConstant(
  auto Mul = B.buildFMul(SrcLT, Trunc, K0, Flags);
  auto FloorMul = B.buildFFloor(SrcLT, Mul, Flags);
  auto Fma = B.buildFMA(SrcLT, FloorMul, K1, Trunc, Flags);
               : B.buildFPTOUI(S32, FloorMul);
  auto Lo = B.buildFPTOUI(S32, Fma);
    Sign = B.buildMergeLikeInstr(S64, {Sign, Sign});
    B.buildSub(Dst, B.buildXor(S64, B.buildMergeLikeInstr(S64, {Lo, Hi}), Sign),
    B.buildMergeLikeInstr(Dst, {Lo, Hi});
  MI.eraseFromParent();
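// Note: the FP-to-64-bit-integer path peels off the upper 32 result bits as
// Hi = fptoint(floor(Trunc * K0)) and reconstructs the remainder with
// Lo = fptoui(fma(floor(Trunc * K0), K1, Trunc)); the K0/K1 scale constants
// themselves are built on lines not present in this excerpt. For the signed
// case the merged {Lo, Hi} value is xor'ed with the replicated sign and the
// sign subtracted, i.e. a conditional negation.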
  LLT VecTy = MRI.getType(Vec);
    auto IntVec = B.buildPtrToInt(IntVecTy, Vec);
    auto IntElt = B.buildExtractVectorElement(IntTy, IntVec, MI.getOperand(2));
    B.buildIntToPtr(Dst, IntElt);
    MI.eraseFromParent();
  std::optional<ValueAndVReg> MaybeIdxVal =
  const uint64_t IdxVal = MaybeIdxVal->Value.getZExtValue();
    auto Unmerge = B.buildUnmerge(EltTy, Vec);
    B.buildCopy(Dst, Unmerge.getReg(IdxVal));
  MI.eraseFromParent();
  LLT VecTy = MRI.getType(Vec);
    auto IntVecSource = B.buildPtrToInt(IntVecTy, Vec);
    auto IntIns = B.buildPtrToInt(IntTy, Ins);
    auto IntVecDest = B.buildInsertVectorElement(IntVecTy, IntVecSource, IntIns,
    B.buildIntToPtr(Dst, IntVecDest);
    MI.eraseFromParent();
  std::optional<ValueAndVReg> MaybeIdxVal =
  const uint64_t IdxVal = MaybeIdxVal->Value.getZExtValue();
  if (IdxVal < NumElts) {
    for (unsigned i = 0; i < NumElts; ++i)
      SrcRegs.push_back(MRI.createGenericVirtualRegister(EltTy));
    B.buildUnmerge(SrcRegs, Vec);
    SrcRegs[IdxVal] = MI.getOperand(2).getReg();
    B.buildMergeLikeInstr(Dst, SrcRegs);
  MI.eraseFromParent();
  LLT Ty = MRI.getType(DstReg);
  unsigned Flags = MI.getFlags();
  if (ST.hasTrigReducedRange()) {
    auto MulVal = B.buildFMul(Ty, SrcReg, OneOver2Pi, Flags);
    TrigVal = B.buildIntrinsic(Intrinsic::amdgcn_fract, {Ty})
                  .addUse(MulVal.getReg(0))
    TrigVal = B.buildFMul(Ty, SrcReg, OneOver2Pi, Flags).getReg(0);
                            Intrinsic::amdgcn_sin : Intrinsic::amdgcn_cos;
  MI.eraseFromParent();
                                       unsigned GAFlags) const {
      B.getMRI()->createGenericVirtualRegister(ConstPtrTy);
  if (ST.has64BitLiterals()) {
    B.buildInstr(AMDGPU::SI_PC_ADD_REL_OFFSET64).addDef(PCReg);
    B.buildInstr(AMDGPU::SI_PC_ADD_REL_OFFSET).addDef(PCReg);
  if (!B.getMRI()->getRegClassOrNull(PCReg))
    B.getMRI()->setRegClass(PCReg, &AMDGPU::SReg_64RegClass);
    B.buildExtract(DstReg, PCReg, 0);
  if (RequiresHighHalf && ST.has64BitLiterals()) {
    if (!MRI.getRegClassOrNull(DstReg))
      MRI.setRegClass(DstReg, &AMDGPU::SReg_64RegClass);
    B.buildInstr(AMDGPU::S_MOV_B64)
  Register AddrLo = !RequiresHighHalf && !MRI.getRegClassOrNull(DstReg)
                        : MRI.createGenericVirtualRegister(S32);
  if (!MRI.getRegClassOrNull(AddrLo))
    MRI.setRegClass(AddrLo, &AMDGPU::SReg_32RegClass);
  B.buildInstr(AMDGPU::S_MOV_B32)
  if (RequiresHighHalf) {
           "Must provide a 64-bit pointer type!");
    MRI.setRegClass(AddrHi, &AMDGPU::SReg_32RegClass);
    B.buildInstr(AMDGPU::S_MOV_B32)
    if (!MRI.getRegClassOrNull(AddrDst))
      MRI.setRegClass(AddrDst, &AMDGPU::SReg_64RegClass);
    B.buildMergeValues(AddrDst, {AddrLo, AddrHi});
    if (AddrDst != DstReg)
      B.buildCast(DstReg, AddrDst);
  } else if (AddrLo != DstReg) {
    B.buildCast(DstReg, AddrLo);
  LLT Ty = MRI.getType(DstReg);
  unsigned AS = Ty.getAddressSpace();
        GV->getName() != "llvm.amdgcn.module.lds" &&
          Fn, "local memory global used by non-kernel function",
      B.buildUndef(DstReg);
      MI.eraseFromParent();
    if (B.getDataLayout().getTypeAllocSize(Ty).isZero()) {
      auto Sz = B.buildIntrinsic(Intrinsic::amdgcn_groupstaticsize, {S32});
      B.buildIntToPtr(DstReg, Sz);
      MI.eraseFromParent();
    MI.eraseFromParent();
  if (ST.isAmdPalOS() || ST.isMesa3DOS()) {
    MI.eraseFromParent();
    MI.eraseFromParent();
    MI.eraseFromParent();
  Register GOTAddr = MRI.createGenericVirtualRegister(PtrTy);
  if (Ty.getSizeInBits() == 32) {
    auto Load = B.buildLoad(PtrTy, GOTAddr, *GOTMMO);
    B.buildExtract(DstReg, Load, 0);
    B.buildLoad(DstReg, GOTAddr, *GOTMMO);
  MI.eraseFromParent();
  LLT PtrTy = MRI.getType(PtrReg);
    auto Cast = B.buildAddrSpaceCast(ConstPtr, PtrReg);
    MI.getOperand(1).setReg(Cast.getReg(0));
  if (MI.getOpcode() != AMDGPU::G_LOAD)
  LLT ValTy = MRI.getType(ValReg);
  const unsigned ValSize = ValTy.getSizeInBits();
  if (WideMemSize == ValSize) {
    MI.setMemRefs(MF, {WideMMO});
  if (ValSize > WideMemSize)
    WideLoad = B.buildLoadFromOffset(WideTy, PtrReg, *MMO, 0).getReg(0);
    B.buildTrunc(ValReg, WideLoad).getReg(0);
      WideLoad = B.buildLoadFromOffset(WideTy, PtrReg, *MMO, 0).getReg(0);
      B.buildExtract(ValReg, WideLoad, 0);
      WideLoad = B.buildLoadFromOffset(WideTy, PtrReg, *MMO, 0).getReg(0);
      B.buildDeleteTrailingVectorElements(ValReg, WideLoad);
  MI.eraseFromParent();
  Register DataReg = MI.getOperand(0).getReg();
  LLT DataTy = MRI.getType(DataReg);
  LLT Ty = MRI.getType(MI.getOperand(0).getReg());
         "this should not have been custom lowered");
  LLT ValTy = MRI.getType(CmpVal);
  Register PackedVal = B.buildBuildVector(VecTy, { NewVal, CmpVal }).getReg(0);
  B.buildInstr(AMDGPU::G_AMDGPU_ATOMIC_CMPXCHG)
      .setMemRefs(MI.memoperands());
  MI.eraseFromParent();
  switch (DefMI->getOpcode()) {
  case TargetOpcode::G_INTRINSIC: {
    case Intrinsic::amdgcn_frexp_mant:
  case TargetOpcode::G_FFREXP: {
    if (DefMI->getOperand(0).getReg() == Src)
  case TargetOpcode::G_FPEXT: {
std::pair<Register, Register>
                                            unsigned Flags) const {
  auto SmallestNormal = B.buildFConstant(
  auto IsLtSmallestNormal =
  auto Scale32 = B.buildFConstant(F32, 0x1.0p+32);
  auto One = B.buildFConstant(F32, 1.0);
      B.buildSelect(F32, IsLtSmallestNormal, Scale32, One, Flags);
  auto ScaledInput = B.buildFMul(F32, Src, ScaleFactor, Flags);
  return {ScaledInput.getReg(0), IsLtSmallestNormal.getReg(0)};
  LLT Ty = B.getMRI()->getType(Dst);
  unsigned Flags = MI.getFlags();
    auto Ext = B.buildFPExt(F32, Src, Flags);
    auto Log2 = B.buildIntrinsic(Intrinsic::amdgcn_log, {F32})
                    .addUse(Ext.getReg(0))
    B.buildFPTrunc(Dst, Log2, Flags);
    MI.eraseFromParent();
    B.buildIntrinsic(Intrinsic::amdgcn_log, {MI.getOperand(0)})
    MI.eraseFromParent();
  auto Log2 = B.buildIntrinsic(Intrinsic::amdgcn_log, {Ty})
                  .addUse(ScaledInput)
  auto ThirtyTwo = B.buildFConstant(Ty, 32.0);
  auto Zero = B.buildFConstant(Ty, 0.0);
      B.buildSelect(Ty, IsLtSmallestNormal, ThirtyTwo, Zero, Flags);
  B.buildFSub(Dst, Log2, ResultOffset, Flags);
  MI.eraseFromParent();
  auto FMul = B.buildFMul(Ty, X, Y, Flags);
  return B.buildFAdd(Ty, FMul, Z, Flags).getReg(0);
  const bool IsLog10 = MI.getOpcode() == TargetOpcode::G_FLOG10;
  assert(IsLog10 || MI.getOpcode() == TargetOpcode::G_FLOG);
  unsigned Flags = MI.getFlags();
  const LLT Ty = MRI.getType(X);
  if (Ty == F16 && !ST.has16BitInsts()) {
    auto PromoteSrc = B.buildFPExt(F32, X);
    B.buildFPTrunc(Dst, LogVal);
    MI.eraseFromParent();
      B.buildIntrinsic(Intrinsic::amdgcn_log, {Ty}).addUse(X).setMIFlags(Flags);
  if (ST.hasFastFMAF32()) {
    const float c_log10 = 0x1.344134p-2f;
    const float cc_log10 = 0x1.09f79ep-26f;
    const float c_log = 0x1.62e42ep-1f;
    const float cc_log = 0x1.efa39ep-25f;
    auto C = B.buildFConstant(Ty, IsLog10 ? c_log10 : c_log);
    auto CC = B.buildFConstant(Ty, IsLog10 ? cc_log10 : cc_log);
    R = B.buildFMul(Ty, Y, C, Flags).getReg(0);
    auto NegR = B.buildFNeg(Ty, R, Flags);
    auto FMA0 = B.buildFMA(Ty, Y, C, NegR, Flags);
    auto FMA1 = B.buildFMA(Ty, Y, CC, FMA0, Flags);
    R = B.buildFAdd(Ty, R, FMA1, Flags).getReg(0);
    const float ch_log10 = 0x1.344000p-2f;
    const float ct_log10 = 0x1.3509f6p-18f;
    const float ch_log = 0x1.62e000p-1f;
    const float ct_log = 0x1.0bfbe8p-15f;
    auto CH = B.buildFConstant(Ty, IsLog10 ? ch_log10 : ch_log);
    auto CT = B.buildFConstant(Ty, IsLog10 ? ct_log10 : ct_log);
    auto MaskConst = B.buildConstant(Ty, 0xfffff000);
    auto YH = B.buildAnd(Ty, Y, MaskConst);
    auto YT = B.buildFSub(Ty, Y, YH, Flags);
    auto YTCT = B.buildFMul(Ty, YT, CT, Flags);
        getMad(B, Ty, YH.getReg(0), CT.getReg(0), YTCT.getReg(0), Flags);
    R = getMad(B, Ty, YH.getReg(0), CH.getReg(0), Mad1, Flags);
  const bool IsFiniteOnly =
  if (!IsFiniteOnly) {
    auto Fabs = B.buildFAbs(Ty, Y);
    R = B.buildSelect(Ty, IsFinite, R, Y, Flags).getReg(0);
    auto Zero = B.buildFConstant(Ty, 0.0);
        B.buildFConstant(Ty, IsLog10 ? 0x1.344136p+3f : 0x1.62e430p+4f);
    auto Shift = B.buildSelect(Ty, IsScaled, ShiftK, Zero, Flags);
    B.buildFSub(Dst, R, Shift, Flags);
    B.buildCopy(Dst, R);
  MI.eraseFromParent();
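// Note: the G_FLOG/G_FLOG10 lowering first scales denormal inputs by 2^32
// (getScaledLogInput), takes amdgcn_log, and then multiplies by a split
// change-of-base constant -- a fast-FMA path using the (c, cc) head/correction
// pair, or a mul/mad path using the truncated (ch, ct) halves -- and finally
// subtracts the scaling bias (ShiftK) whenever the input had been scaled.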
                                         unsigned Flags) const {
  const double Log2BaseInverted =
  LLT Ty = B.getMRI()->getType(Dst);
    auto LogSrc = B.buildIntrinsic(Intrinsic::amdgcn_log, {Ty})
    auto ScaledResultOffset = B.buildFConstant(Ty, -32.0 * Log2BaseInverted);
    auto Zero = B.buildFConstant(Ty, 0.0);
        B.buildSelect(Ty, IsScaled, ScaledResultOffset, Zero, Flags);
    auto Log2Inv = B.buildFConstant(Ty, Log2BaseInverted);
    if (ST.hasFastFMAF32())
      B.buildFMA(Dst, LogSrc, Log2Inv, ResultOffset, Flags);
      auto Mul = B.buildFMul(Ty, LogSrc, Log2Inv, Flags);
      B.buildFAdd(Dst, Mul, ResultOffset, Flags);
          ? B.buildFLog2(Ty, Src, Flags)
          : B.buildIntrinsic(Intrinsic::amdgcn_log, {Ty})
  auto Log2BaseInvertedOperand = B.buildFConstant(Ty, Log2BaseInverted);
  B.buildFMul(Dst, Log2Operand, Log2BaseInvertedOperand, Flags);
  unsigned Flags = MI.getFlags();
  LLT Ty = B.getMRI()->getType(Dst);
    auto Ext = B.buildFPExt(F32, Src, Flags);
    auto Log2 = B.buildIntrinsic(Intrinsic::amdgcn_exp2, {F32})
                    .addUse(Ext.getReg(0))
    B.buildFPTrunc(Dst, Log2, Flags);
    MI.eraseFromParent();
    MI.eraseFromParent();
  auto RangeCheckConst = B.buildFConstant(Ty, -0x1.f80000p+6f);
                              RangeCheckConst, Flags);
  auto SixtyFour = B.buildFConstant(Ty, 0x1.0p+6f);
  auto Zero = B.buildFConstant(Ty, 0.0);
  auto AddOffset = B.buildSelect(F32, NeedsScaling, SixtyFour, Zero, Flags);
  auto AddInput = B.buildFAdd(F32, Src, AddOffset, Flags);
  auto Exp2 = B.buildIntrinsic(Intrinsic::amdgcn_exp2, {Ty})
                  .addUse(AddInput.getReg(0))
  auto TwoExpNeg64 = B.buildFConstant(Ty, 0x1.0p-64f);
  auto One = B.buildFConstant(Ty, 1.0);
  auto ResultScale = B.buildSelect(F32, NeedsScaling, TwoExpNeg64, One, Flags);
  B.buildFMul(Dst, Exp2, ResultScale, Flags);
  MI.eraseFromParent();
  LLT Ty = B.getMRI()->getType(Dst);
    auto Mul = B.buildFMul(Ty, X, Log2E, Flags);
          .addUse(Mul.getReg(0))
      B.buildFExp2(Dst, Mul.getReg(0), Flags);
  auto Threshold = B.buildFConstant(Ty, -0x1.5d58a0p+6f);
  auto ScaleOffset = B.buildFConstant(Ty, 0x1.0p+6f);
  auto ScaledX = B.buildFAdd(Ty, X, ScaleOffset, Flags);
  auto AdjustedX = B.buildSelect(Ty, NeedsScaling, ScaledX, X, Flags);
  auto ExpInput = B.buildFMul(Ty, AdjustedX, Log2E, Flags);
  auto Exp2 = B.buildIntrinsic(Intrinsic::amdgcn_exp2, {Ty})
                  .addUse(ExpInput.getReg(0))
  auto ResultScaleFactor = B.buildFConstant(Ty, 0x1.969d48p-93f);
  auto AdjustedResult = B.buildFMul(Ty, Exp2, ResultScaleFactor, Flags);
  B.buildSelect(Dst, NeedsScaling, AdjustedResult, Exp2, Flags);
  const unsigned Flags = MI.getFlags();
  LLT Ty = MRI.getType(Dst);
  const bool IsExp10 = MI.getOpcode() == TargetOpcode::G_FEXP10;
    MI.eraseFromParent();
    auto Ext = B.buildFPExt(F32, X, Flags);
    B.buildFPTrunc(Dst, Lowered, Flags);
    MI.eraseFromParent();
    MI.eraseFromParent();
  const unsigned FlagsNoContract = Flags & ~MachineInstr::FmContract;
  if (ST.hasFastFMAF32()) {
    const float cc_exp = 0x1.4ae0bep-26f;
    const float c_exp10 = 0x1.a934f0p+1f;
    const float cc_exp10 = 0x1.2f346ep-24f;
    auto C = B.buildFConstant(Ty, IsExp10 ? c_exp10 : c_exp);
    PH = B.buildFMul(Ty, X, C, Flags).getReg(0);
    auto NegPH = B.buildFNeg(Ty, PH, Flags);
    auto FMA0 = B.buildFMA(Ty, X, C, NegPH, Flags);
    auto CC = B.buildFConstant(Ty, IsExp10 ? cc_exp10 : cc_exp);
    PL = B.buildFMA(Ty, X, CC, FMA0, Flags).getReg(0);
    const float ch_exp = 0x1.714000p+0f;
    const float cl_exp = 0x1.47652ap-12f;
    const float ch_exp10 = 0x1.a92000p+1f;
    const float cl_exp10 = 0x1.4f0978p-11f;
    auto MaskConst = B.buildConstant(Ty, 0xfffff000);
    auto XH = B.buildAnd(Ty, X, MaskConst);
    auto XL = B.buildFSub(Ty, X, XH, Flags);
    auto CH = B.buildFConstant(Ty, IsExp10 ? ch_exp10 : ch_exp);
    PH = B.buildFMul(Ty, XH, CH, Flags).getReg(0);
    auto CL = B.buildFConstant(Ty, IsExp10 ? cl_exp10 : cl_exp);
    auto XLCL = B.buildFMul(Ty, XL, CL, Flags);
        getMad(B, Ty, XL.getReg(0), CH.getReg(0), XLCL.getReg(0), Flags);
    PL = getMad(B, Ty, XH.getReg(0), CL.getReg(0), Mad0, Flags);
  auto E = B.buildIntrinsicRoundeven(Ty, PH, Flags);
  auto PHSubE = B.buildFSub(Ty, PH, E, FlagsNoContract);
  auto A = B.buildFAdd(Ty, PHSubE, PL, Flags);
  auto Exp2 = B.buildIntrinsic(Intrinsic::amdgcn_exp2, {Ty})
                  .addUse(A.getReg(0))
  auto R = B.buildFLdexp(Ty, Exp2, IntE, Flags);
  auto UnderflowCheckConst =
      B.buildFConstant(Ty, IsExp10 ? -0x1.66d3e8p+5f : -0x1.9d1da0p+6f);
  auto Zero = B.buildFConstant(Ty, 0.0);
  R = B.buildSelect(Ty, Underflow, Zero, R);
  auto OverflowCheckConst =
      B.buildFConstant(Ty, IsExp10 ? 0x1.344136p+5f : 0x1.62e430p+6f);
  R = B.buildSelect(Ty, Overflow, Inf, R, Flags);
  B.buildCopy(Dst, R);
  MI.eraseFromParent();
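// Note: the f32 G_FEXP/G_FEXP10 expansion rewrites exp(x) as
// exp2(x * log2(e)) (or the base-10 equivalent), splitting the product into a
// high part PH and a low correction PL, evaluating exp2 on the residual
// PH - round(PH) + PL, rescaling with ldexp by round(PH), and finally clamping
// the result to 0 or +inf beyond the underflow/overflow constants above.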
  unsigned Flags = MI.getFlags();
  LLT Ty = B.getMRI()->getType(Dst);
    auto Log = B.buildFLog2(F32, Src0, Flags);
    auto Mul = B.buildIntrinsic(Intrinsic::amdgcn_fmul_legacy, {F32})
                   .addUse(Log.getReg(0))
    B.buildFExp2(Dst, Mul, Flags);
  } else if (Ty == F16) {
    auto Log = B.buildFLog2(F16, Src0, Flags);
    auto Ext0 = B.buildFPExt(F32, Log, Flags);
    auto Ext1 = B.buildFPExt(F32, Src1, Flags);
    auto Mul = B.buildIntrinsic(Intrinsic::amdgcn_fmul_legacy, {F32})
                   .addUse(Ext0.getReg(0))
                   .addUse(Ext1.getReg(0))
    B.buildFExp2(Dst, B.buildFPTrunc(F16, Mul), Flags);
  MI.eraseFromParent();
    ModSrc = SrcFNeg->getOperand(1).getReg();
    ModSrc = SrcFAbs->getOperand(1).getReg();
    ModSrc = SrcFAbs->getOperand(1).getReg();
  Register OrigSrc = MI.getOperand(1).getReg();
  unsigned Flags = MI.getFlags();
         "this should not have been custom lowered");
  auto Fract = B.buildIntrinsic(Intrinsic::amdgcn_fract, {F64})
    B.buildFMinNumIEEE(Min, Fract, Const, Flags);
    B.buildFMinNum(Min, Fract, Const, Flags);
    CorrectedFract = B.buildSelect(F64, IsNan, ModSrc, Min, Flags).getReg(0);
  auto NegFract = B.buildFNeg(F64, CorrectedFract, Flags);
  B.buildFAdd(Dst, OrigSrc, NegFract, Flags);
  MI.eraseFromParent();
  if (MI.getOpcode() == AMDGPU::G_BUILD_VECTOR_TRUNC) {
    Src0 = B.buildTrunc(S16, MI.getOperand(1).getReg()).getReg(0);
    Src1 = B.buildTrunc(S16, MI.getOperand(2).getReg()).getReg(0);
  auto Merge = B.buildMergeLikeInstr(S32, {Src0, Src1});
  B.buildBitcast(Dst, Merge);
  MI.eraseFromParent();
                                         bool UsePartialMad64_32,
                                         bool SeparateOddAlignedProducts) const {
  auto getZero32 = [&]() -> Register {
      Zero32 = B.buildConstant(S32, 0).getReg(0);
  auto getZero64 = [&]() -> Register {
      Zero64 = B.buildConstant(S64, 0).getReg(0);
  for (unsigned i = 0; i < Src0.size(); ++i) {
      if (CarryIn.empty())
      bool HaveCarryOut = true;
      if (CarryIn.size() == 1) {
          LocalAccum = B.buildZExt(S32, CarryIn[0]).getReg(0);
          CarryAccum = getZero32();
          CarryAccum = B.buildZExt(S32, CarryIn[0]).getReg(0);
          for (unsigned i = 1; i + 1 < CarryIn.size(); ++i) {
                B.buildUAdde(S32, S1, CarryAccum, getZero32(), CarryIn[i])
          LocalAccum = getZero32();
          HaveCarryOut = false;
          B.buildUAdde(S32, S1, CarryAccum, LocalAccum, CarryIn.back());
      LocalAccum = Add.getReg(0);
  auto buildMadChain =
        assert((DstIndex + 1 < Accum.size() && LocalAccum.size() == 2) ||
               (DstIndex + 1 >= Accum.size() && LocalAccum.size() == 1));
        if (LocalAccum.size() == 1 &&
            (!UsePartialMad64_32 || !CarryIn.empty())) {
            unsigned j1 = DstIndex - j0;
            if (Src0KnownZeros[j0] || Src1KnownZeros[j1]) {
            auto Mul = B.buildMul(S32, Src0[j0], Src1[j1]);
              LocalAccum[0] = Mul.getReg(0);
              if (CarryIn.empty()) {
                LocalAccum[0] = B.buildAdd(S32, LocalAccum[0], Mul).getReg(0);
                    B.buildUAdde(S32, S1, LocalAccum[0], Mul, CarryIn.back())
          } while (j0 <= DstIndex && (!UsePartialMad64_32 || !CarryIn.empty()));
        if (j0 <= DstIndex) {
          bool HaveSmallAccum = false;
          if (LocalAccum[0]) {
            if (LocalAccum.size() == 1) {
              Tmp = B.buildAnyExt(S64, LocalAccum[0]).getReg(0);
              HaveSmallAccum = true;
            } else if (LocalAccum[1]) {
              Tmp = B.buildMergeLikeInstr(S64, LocalAccum).getReg(0);
              HaveSmallAccum = false;
              Tmp = B.buildZExt(S64, LocalAccum[0]).getReg(0);
              HaveSmallAccum = true;
            assert(LocalAccum.size() == 1 || !LocalAccum[1]);
            HaveSmallAccum = true;
            unsigned j1 = DstIndex - j0;
            if (Src0KnownZeros[j0] || Src1KnownZeros[j1]) {
            auto Mad = B.buildInstr(AMDGPU::G_AMDGPU_MAD_U64_U32, {S64, S1},
                                    {Src0[j0], Src1[j1], Tmp});
            Tmp = Mad.getReg(0);
            if (!HaveSmallAccum)
              CarryOut.push_back(Mad.getReg(1));
            HaveSmallAccum = false;
          } while (j0 <= DstIndex);
          auto Unmerge = B.buildUnmerge(S32, Tmp);
          LocalAccum[0] = Unmerge.getReg(0);
          if (LocalAccum.size() > 1)
            LocalAccum[1] = Unmerge.getReg(1);
    for (unsigned i = 0; i <= Accum.size() / 2; ++i) {
      Carry OddCarryIn = std::move(OddCarry);
      Carry EvenCarryIn = std::move(EvenCarry);
      if (2 * i < Accum.size()) {
        auto LocalAccum = Accum.drop_front(2 * i).take_front(2);
        EvenCarry = buildMadChain(LocalAccum, 2 * i, EvenCarryIn);
        if (!SeparateOddAlignedProducts) {
          auto LocalAccum = Accum.drop_front(2 * i - 1).take_front(2);
          OddCarry = buildMadChain(LocalAccum, 2 * i - 1, OddCarryIn);
          bool IsHighest = 2 * i >= Accum.size();
                                     .take_front(IsHighest ? 1 : 2);
          OddCarry = buildMadChain(LocalAccum, 2 * i - 1, OddCarryIn);
            Lo = B.buildUAddo(S32, S1, Accum[2 * i - 1], SeparateOddOut[0]);
            Lo = B.buildAdd(S32, Accum[2 * i - 1], SeparateOddOut[0]);
            Lo = B.buildUAdde(S32, S1, Accum[2 * i - 1], SeparateOddOut[0],
          Accum[2 * i - 1] = Lo->getOperand(0).getReg();
            auto Hi = B.buildUAdde(S32, S1, Accum[2 * i], SeparateOddOut[1],
                                   Lo->getOperand(1).getReg());
            Accum[2 * i] = Hi.getReg(0);
            SeparateOddCarry = Hi.getReg(1);
      if (Register CarryOut = mergeCarry(Accum[2 * i - 1], OddCarryIn))
        EvenCarryIn.push_back(CarryOut);
      if (2 * i < Accum.size()) {
        if (Register CarryOut = mergeCarry(Accum[2 * i], EvenCarryIn))
          OddCarry.push_back(CarryOut);
  assert(ST.hasMad64_32());
  assert(MI.getOpcode() == TargetOpcode::G_MUL);
  LLT Ty = MRI.getType(DstReg);
  unsigned Size = Ty.getSizeInBits();
  if (ST.hasVectorMulU64() && Size == 64)
  unsigned NumParts = Size / 32;
  const bool SeparateOddAlignedProducts = ST.hasFullRate64Ops();
  for (unsigned i = 0; i < NumParts; ++i) {
  B.buildUnmerge(Src0Parts, Src0);
  B.buildUnmerge(Src1Parts, Src1);
  buildMultiply(Helper, AccumRegs, Src0Parts, Src1Parts, UsePartialMad64_32,
                SeparateOddAlignedProducts);
  B.buildMergeLikeInstr(DstReg, AccumRegs);
  MI.eraseFromParent();
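// Note: buildMultiply decomposes a wide G_MUL into 32-bit limbs and
// accumulates partial products with G_AMDGPU_MAD_U64_U32 chains, threading
// carries between even- and odd-aligned columns; legalizeMul above simply
// unmerges the operands into 32-bit parts, runs that helper, and re-merges the
// accumulator registers into the destination.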
  LLT DstTy = MRI.getType(Dst);
  LLT SrcTy = MRI.getType(Src);
  unsigned NewOpc = MI.getOpcode() == AMDGPU::G_CTLZ
                        ? AMDGPU::G_AMDGPU_FFBH_U32
                        : AMDGPU::G_AMDGPU_FFBL_B32;
  auto Tmp = B.buildInstr(NewOpc, {DstTy}, {Src});
  MI.eraseFromParent();
  LLT SrcTy = MRI.getType(Src);
  TypeSize NumBits = SrcTy.getSizeInBits();
  auto ShiftAmt = B.buildConstant(S32, 32u - NumBits);
  auto Extend = B.buildAnyExt(S32, {Src}).getReg(0u);
  auto Shift = B.buildShl(S32, Extend, ShiftAmt);
  auto Ctlz = B.buildInstr(AMDGPU::G_AMDGPU_FFBH_U32, {S32}, {Shift});
  B.buildTrunc(Dst, Ctlz);
  MI.eraseFromParent();
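// Note: the narrow CTLZ_ZERO_UNDEF case any-extends the source to s32, shifts
// it into the top (32 - NumBits) positions so the artificial leading zeros
// vanish, and then uses the 32-bit FFBH instruction directly before truncating
// the count back to the original width.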
  if (MI.getOpcode() != TargetOpcode::G_XOR)
  return ConstVal == -1;
  Register CondDef = MI.getOperand(0).getReg();
  if (!MRI.hasOneNonDBGUse(CondDef))
    if (!MRI.hasOneNonDBGUse(NegatedCond))
    UseMI = &*MRI.use_instr_nodbg_begin(NegatedCond);
    if (UseMI->getParent() != Parent || UseMI->getOpcode() != AMDGPU::G_BRCOND)
    UncondBrTarget = &*NextMBB;
    if (Next->getOpcode() != AMDGPU::G_BR)
                                       *ArgRC, B.getDebugLoc(), ArgTy);
    const unsigned Mask = Arg->getMask();
      auto ShiftAmt = B.buildConstant(S32, Shift);
      AndMaskSrc = B.buildLShr(S32, LiveIn, ShiftAmt).getReg(0);
    B.buildAnd(DstReg, AndMaskSrc, B.buildConstant(S32, Mask >> Shift));
    B.buildCopy(DstReg, LiveIn);
  if (!ST.hasClusters()) {
    MI.eraseFromParent();
  Register ClusterMaxIdXYZ = MRI.createGenericVirtualRegister(S32);
  Register ClusterWorkGroupIdXYZ = MRI.createGenericVirtualRegister(S32);
  auto One = B.buildConstant(S32, 1);
  auto ClusterSizeXYZ = B.buildAdd(S32, ClusterMaxIdXYZ, One);
  auto GlobalIdXYZ = B.buildAdd(S32, ClusterWorkGroupIdXYZ,
                                B.buildMul(S32, ClusterIdXYZ, ClusterSizeXYZ));
    B.buildCopy(DstReg, GlobalIdXYZ);
    MI.eraseFromParent();
    B.buildCopy(DstReg, ClusterIdXYZ);
    MI.eraseFromParent();
  unsigned ClusterIdField = HwregEncoding::encode(ID_IB_STS2, 6, 4);
  MRI.setRegClass(ClusterId, &AMDGPU::SReg_32RegClass);
  B.buildInstr(AMDGPU::S_GETREG_B32_const)
      .addImm(ClusterIdField);
  auto Zero = B.buildConstant(S32, 0);
  B.buildSelect(DstReg, NoClusters, ClusterIdXYZ, GlobalIdXYZ);
  MI.eraseFromParent();
  auto LoadConstant = [&](unsigned N) {
    B.buildConstant(DstReg, N);
  if (ST.hasArchitectedSGPRs() &&
    Arg = &WorkGroupIDX;
    ArgRC = &AMDGPU::SReg_32RegClass;
    Arg = &WorkGroupIDY;
    ArgRC = &AMDGPU::SReg_32RegClass;
    Arg = &WorkGroupIDZ;
    ArgRC = &AMDGPU::SReg_32RegClass;
    if (HasFixedDims && ClusterDims.getDims()[0] == 1)
      return LoadConstant(0);
    Arg = &ClusterWorkGroupIDX;
    ArgRC = &AMDGPU::SReg_32RegClass;
    if (HasFixedDims && ClusterDims.getDims()[1] == 1)
      return LoadConstant(0);
    Arg = &ClusterWorkGroupIDY;
    ArgRC = &AMDGPU::SReg_32RegClass;
    if (HasFixedDims && ClusterDims.getDims()[2] == 1)
      return LoadConstant(0);
    Arg = &ClusterWorkGroupIDZ;
    ArgRC = &AMDGPU::SReg_32RegClass;
      return LoadConstant(ClusterDims.getDims()[0] - 1);
    Arg = &ClusterWorkGroupMaxIDX;
    ArgRC = &AMDGPU::SReg_32RegClass;
      return LoadConstant(ClusterDims.getDims()[1] - 1);
    Arg = &ClusterWorkGroupMaxIDY;
    ArgRC = &AMDGPU::SReg_32RegClass;
      return LoadConstant(ClusterDims.getDims()[2] - 1);
    Arg = &ClusterWorkGroupMaxIDZ;
    ArgRC = &AMDGPU::SReg_32RegClass;
    Arg = &ClusterWorkGroupMaxFlatID;
    ArgRC = &AMDGPU::SReg_32RegClass;
      return LoadConstant(0);
    B.buildUndef(DstReg);
  if (!Arg->isRegister() || !Arg->getRegister().isValid())
  MI.eraseFromParent();
  B.buildConstant(MI.getOperand(0).getReg(), C);
  MI.eraseFromParent();
  unsigned MaxID = ST.getMaxWorkitemID(B.getMF().getFunction(), Dim);
    B.buildUndef(DstReg);
    MI.eraseFromParent();
  if (Arg->isMasked()) {
  MI.eraseFromParent();
  Register KernArgReg = B.getMRI()->createGenericVirtualRegister(PtrTy);
  return B.buildObjectPtrOffset(PtrTy, KernArgReg, COffset).getReg(0);
                                                    Align Alignment) const {
         "unexpected kernarg parameter type");
  B.buildLoad(DstReg, Ptr, PtrInfo, Align(4),
  MI.eraseFromParent();
  LLT DstTy = MRI.getType(Dst);
  auto FloatY = B.buildUITOFP(S32, Y);
  auto RcpIFlag = B.buildInstr(AMDGPU::G_AMDGPU_RCP_IFLAG, {S32}, {FloatY});
  auto ScaledY = B.buildFMul(S32, RcpIFlag, Scale);
  auto Z = B.buildFPTOUI(S32, ScaledY);
  auto NegY = B.buildSub(S32, B.buildConstant(S32, 0), Y);
  auto NegYZ = B.buildMul(S32, NegY, Z);
  Z = B.buildAdd(S32, Z, B.buildUMulH(S32, Z, NegYZ));
  auto Q = B.buildUMulH(S32, X, Z);
  auto R = B.buildSub(S32, X, B.buildMul(S32, Q, Y));
  auto One = B.buildConstant(S32, 1);
  Q = B.buildSelect(S32, Cond, B.buildAdd(S32, Q, One), Q);
    B.buildSelect(DstDivReg, Cond, B.buildAdd(S32, Q, One), Q);
    B.buildSelect(DstRemReg, Cond, B.buildSub(S32, R, Y), R);
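// Note: 32-bit unsigned div/rem uses the reciprocal trick visible above: an
// approximate reciprocal of Y is taken via the float RCP instruction and
// scaled, refined once with Z += umulh(Z, -Y*Z), the quotient estimated as
// Q = umulh(X, Z) with remainder R = X - Q*Y, and compare/select passes then
// correct the possible off-by-one in Q and R.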
  auto Unmerge = B.buildUnmerge(S32, Val);
  auto CvtLo = B.buildUITOFP(S32, Unmerge.getReg(0));
  auto CvtHi = B.buildUITOFP(S32, Unmerge.getReg(1));
  auto Mad = B.buildFMAD(
  auto Rcp = B.buildInstr(AMDGPU::G_AMDGPU_RCP_IFLAG, {S32}, {Mad});
  auto Mul1 = B.buildFMul(
  auto Mul2 = B.buildFMul(
  auto Trunc = B.buildIntrinsicTrunc(S32, Mul2);
  auto Mad2 = B.buildFMAD(
  auto ResultLo = B.buildFPTOUI(S32, Mad2);
  auto ResultHi = B.buildFPTOUI(S32, Trunc);
  return {ResultLo.getReg(0), ResultHi.getReg(0)};
  auto Rcp = B.buildMergeLikeInstr(S64, {RcpLo, RcpHi});
  auto Zero64 = B.buildConstant(S64, 0);
  auto NegDenom = B.buildSub(S64, Zero64, Denom);
  auto MulLo1 = B.buildMul(S64, NegDenom, Rcp);
  auto MulHi1 = B.buildUMulH(S64, Rcp, MulLo1);
  auto UnmergeMulHi1 = B.buildUnmerge(S32, MulHi1);
  Register MulHi1_Lo = UnmergeMulHi1.getReg(0);
  Register MulHi1_Hi = UnmergeMulHi1.getReg(1);
  auto Add1_Lo = B.buildUAddo(S32, S1, RcpLo, MulHi1_Lo);
  auto Add1_Hi = B.buildUAdde(S32, S1, RcpHi, MulHi1_Hi, Add1_Lo.getReg(1));
  auto Add1 = B.buildMergeLikeInstr(S64, {Add1_Lo, Add1_Hi});
  auto MulLo2 = B.buildMul(S64, NegDenom, Add1);
  auto MulHi2 = B.buildUMulH(S64, Add1, MulLo2);
  auto UnmergeMulHi2 = B.buildUnmerge(S32, MulHi2);
  Register MulHi2_Lo = UnmergeMulHi2.getReg(0);
  Register MulHi2_Hi = UnmergeMulHi2.getReg(1);
  auto Zero32 = B.buildConstant(S32, 0);
  auto Add2_Lo = B.buildUAddo(S32, S1, Add1_Lo, MulHi2_Lo);
  auto Add2_Hi = B.buildUAdde(S32, S1, Add1_Hi, MulHi2_Hi, Add2_Lo.getReg(1));
  auto Add2 = B.buildMergeLikeInstr(S64, {Add2_Lo, Add2_Hi});
  auto UnmergeNumer = B.buildUnmerge(S32, Numer);
  Register NumerLo = UnmergeNumer.getReg(0);
  Register NumerHi = UnmergeNumer.getReg(1);
  auto MulHi3 = B.buildUMulH(S64, Numer, Add2);
  auto Mul3 = B.buildMul(S64, Denom, MulHi3);
  auto UnmergeMul3 = B.buildUnmerge(S32, Mul3);
  Register Mul3_Lo = UnmergeMul3.getReg(0);
  Register Mul3_Hi = UnmergeMul3.getReg(1);
  auto Sub1_Lo = B.buildUSubo(S32, S1, NumerLo, Mul3_Lo);
  auto Sub1_Hi = B.buildUSube(S32, S1, NumerHi, Mul3_Hi, Sub1_Lo.getReg(1));
  auto Sub1_Mi = B.buildSub(S32, NumerHi, Mul3_Hi);
  auto Sub1 = B.buildMergeLikeInstr(S64, {Sub1_Lo, Sub1_Hi});
  auto UnmergeDenom = B.buildUnmerge(S32, Denom);
  Register DenomLo = UnmergeDenom.getReg(0);
  Register DenomHi = UnmergeDenom.getReg(1);
  auto C1 = B.buildSExt(S32, CmpHi);
  auto C2 = B.buildSExt(S32, CmpLo);
  auto C3 = B.buildSelect(S32, CmpEq, C2, C1);
  auto Sub2_Lo = B.buildUSubo(S32, S1, Sub1_Lo, DenomLo);
  auto Sub2_Mi = B.buildUSube(S32, S1, Sub1_Mi, DenomHi, Sub1_Lo.getReg(1));
  auto Sub2_Hi = B.buildUSube(S32, S1, Sub2_Mi, Zero32, Sub2_Lo.getReg(1));
  auto Sub2 = B.buildMergeLikeInstr(S64, {Sub2_Lo, Sub2_Hi});
  auto One64 = B.buildConstant(S64, 1);
  auto Add3 = B.buildAdd(S64, MulHi3, One64);
  auto C6 = B.buildSelect(
  auto Add4 = B.buildAdd(S64, Add3, One64);
  auto Sub3_Lo = B.buildUSubo(S32, S1, Sub2_Lo, DenomLo);
  auto Sub3_Mi = B.buildUSube(S32, S1, Sub2_Mi, DenomHi, Sub2_Lo.getReg(1));
  auto Sub3_Hi = B.buildUSube(S32, S1, Sub3_Mi, Zero32, Sub3_Lo.getReg(1));
  auto Sub3 = B.buildMergeLikeInstr(S64, {Sub3_Lo, Sub3_Hi});
  auto Sel1 = B.buildSelect(
  auto Sel2 = B.buildSelect(
  switch (MI.getOpcode()) {

  case AMDGPU::G_UDIV: {
    DstDivReg = MI.getOperand(0).getReg();

  case AMDGPU::G_UREM: {
    DstRemReg = MI.getOperand(0).getReg();

  case AMDGPU::G_UDIVREM: {
    DstDivReg = MI.getOperand(0).getReg();
    DstRemReg = MI.getOperand(1).getReg();

  const unsigned FirstSrcOpIdx = MI.getNumExplicitDefs();
  Register Num = MI.getOperand(FirstSrcOpIdx).getReg();
  Register Den = MI.getOperand(FirstSrcOpIdx + 1).getReg();
  LLT Ty = MRI.getType(MI.getOperand(0).getReg());

  MI.eraseFromParent();

  LLT Ty = MRI.getType(MI.getOperand(0).getReg());
  if (Ty != S32 && Ty != S64)

  const unsigned FirstSrcOpIdx = MI.getNumExplicitDefs();
  Register LHS = MI.getOperand(FirstSrcOpIdx).getReg();
  Register RHS = MI.getOperand(FirstSrcOpIdx + 1).getReg();

  auto SignBitOffset = B.buildConstant(S32, Ty.getSizeInBits() - 1);
  auto LHSign = B.buildAShr(Ty, LHS, SignBitOffset);
  auto RHSign = B.buildAShr(Ty, RHS, SignBitOffset);

  LHS = B.buildAdd(Ty, LHS, LHSign).getReg(0);
  RHS = B.buildAdd(Ty, RHS, RHSign).getReg(0);

  LHS = B.buildXor(Ty, LHS, LHSign).getReg(0);
  RHS = B.buildXor(Ty, RHS, RHSign).getReg(0);

  Register DstDivReg, DstRemReg, TmpDivReg, TmpRemReg;
  switch (MI.getOpcode()) {

  case AMDGPU::G_SDIV: {
    DstDivReg = MI.getOperand(0).getReg();
    TmpDivReg = MRI.createGenericVirtualRegister(Ty);

  case AMDGPU::G_SREM: {
    DstRemReg = MI.getOperand(0).getReg();
    TmpRemReg = MRI.createGenericVirtualRegister(Ty);

  case AMDGPU::G_SDIVREM: {
    DstDivReg = MI.getOperand(0).getReg();
    DstRemReg = MI.getOperand(1).getReg();
    TmpDivReg = MRI.createGenericVirtualRegister(Ty);
    TmpRemReg = MRI.createGenericVirtualRegister(Ty);

    auto Sign = B.buildXor(Ty, LHSign, RHSign).getReg(0);
    auto SignXor = B.buildXor(Ty, TmpDivReg, Sign).getReg(0);
    B.buildSub(DstDivReg, SignXor, Sign);

    auto Sign = LHSign.getReg(0);
    auto SignXor = B.buildXor(Ty, TmpRemReg, Sign).getReg(0);
    B.buildSub(DstRemReg, SignXor, Sign);

  MI.eraseFromParent();
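// Fast, inaccurate FDIV lowering: when an approximate reciprocal is allowed,
// 1.0/x and -1.0/x fold directly into amdgcn_rcp, and the general case
// multiplies the numerator by the reciprocal of the denominator.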
  LLT ResTy = MRI.getType(Res);

  if (!AllowInaccurateRcp && ResTy != LLT::scalar(16))

  if (CLHS->isExactlyValue(1.0)) {
    B.buildIntrinsic(Intrinsic::amdgcn_rcp, Res)

    MI.eraseFromParent();

  if (CLHS->isExactlyValue(-1.0)) {
    auto FNeg = B.buildFNeg(ResTy, RHS, Flags);
    B.buildIntrinsic(Intrinsic::amdgcn_rcp, Res)
        .addUse(FNeg.getReg(0))

    MI.eraseFromParent();

  if (!AllowInaccurateRcp && (ResTy != LLT::scalar(16) ||

  auto RCP = B.buildIntrinsic(Intrinsic::amdgcn_rcp, {ResTy})

  B.buildFMul(Res, LHS, RCP, Flags);

  MI.eraseFromParent();
  LLT ResTy = MRI.getType(Res);

  if (!AllowInaccurateRcp)

  auto NegY = B.buildFNeg(ResTy, Y);
  auto One = B.buildFConstant(ResTy, 1.0);

  auto R = B.buildIntrinsic(Intrinsic::amdgcn_rcp, {ResTy})

  auto Tmp0 = B.buildFMA(ResTy, NegY, R, One);
  R = B.buildFMA(ResTy, Tmp0, R, R);

  auto Tmp1 = B.buildFMA(ResTy, NegY, R, One);
  R = B.buildFMA(ResTy, Tmp1, R, R);

  auto Ret = B.buildFMul(ResTy, X, R);
  auto Tmp2 = B.buildFMA(ResTy, NegY, Ret, X);

  B.buildFMA(Res, Tmp2, R, Ret);
  MI.eraseFromParent();
  auto LHSExt = B.buildFPExt(S32, LHS, Flags);
  auto RHSExt = B.buildFPExt(S32, RHS, Flags);
  auto NegRHSExt = B.buildFNeg(S32, RHSExt);
  auto Rcp = B.buildIntrinsic(Intrinsic::amdgcn_rcp, {S32})
                 .addUse(RHSExt.getReg(0))

  auto Quot = B.buildFMul(S32, LHSExt, Rcp, Flags);

  if (ST.hasMadMacF32Insts()) {
    Err = B.buildFMAD(S32, NegRHSExt, Quot, LHSExt, Flags);
    Quot = B.buildFMAD(S32, Err, Rcp, Quot, Flags);
    Err = B.buildFMAD(S32, NegRHSExt, Quot, LHSExt, Flags);

    Err = B.buildFMA(S32, NegRHSExt, Quot, LHSExt, Flags);
    Quot = B.buildFMA(S32, Err, Rcp, Quot, Flags);
    Err = B.buildFMA(S32, NegRHSExt, Quot, LHSExt, Flags);

  auto Tmp = B.buildFMul(S32, Err, Rcp, Flags);
  Tmp = B.buildAnd(S32, Tmp, B.buildConstant(S32, 0xff800000));
  Quot = B.buildFAdd(S32, Tmp, Quot, Flags);
  auto RDst = B.buildFPTrunc(S16, Quot, Flags);
  B.buildIntrinsic(Intrinsic::amdgcn_div_fixup, Res)
      .addUse(RDst.getReg(0))

  MI.eraseFromParent();
  unsigned SPDenormMode =

  if (ST.hasDenormModeInst()) {

    uint32_t DPDenormModeDefault = Mode.fpDenormModeDPValue();

    uint32_t NewDenormModeValue = SPDenormMode | (DPDenormModeDefault << 2);
    B.buildInstr(AMDGPU::S_DENORM_MODE)
        .addImm(NewDenormModeValue);

    B.buildInstr(AMDGPU::S_SETREG_IMM32_B32)
        .addImm(SPDenormMode)
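// f32 FDIV lowering: scale numerator and denominator with amdgcn_div_scale,
// refine an amdgcn_rcp estimate through a chain of FMAs (toggling the f32
// denorm mode around the refinement when denormals are not preserved), then
// finish with amdgcn_div_fmas and amdgcn_div_fixup.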
  auto One = B.buildFConstant(S32, 1.0f);

  auto DenominatorScaled =
      B.buildIntrinsic(Intrinsic::amdgcn_div_scale, {S32, S1})

  auto NumeratorScaled =
      B.buildIntrinsic(Intrinsic::amdgcn_div_scale, {S32, S1})

  auto ApproxRcp = B.buildIntrinsic(Intrinsic::amdgcn_rcp, {S32})
                       .addUse(DenominatorScaled.getReg(0))

  auto NegDivScale0 = B.buildFNeg(S32, DenominatorScaled, Flags);

  const bool HasDynamicDenormals =

  if (!PreservesDenormals) {
    if (HasDynamicDenormals) {
      SavedSPDenormMode = MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass);
      B.buildInstr(AMDGPU::S_GETREG_B32)
          .addDef(SavedSPDenormMode)

  auto Fma0 = B.buildFMA(S32, NegDivScale0, ApproxRcp, One, Flags);
  auto Fma1 = B.buildFMA(S32, Fma0, ApproxRcp, ApproxRcp, Flags);
  auto Mul = B.buildFMul(S32, NumeratorScaled, Fma1, Flags);
  auto Fma2 = B.buildFMA(S32, NegDivScale0, Mul, NumeratorScaled, Flags);
  auto Fma3 = B.buildFMA(S32, Fma2, Fma1, Mul, Flags);
  auto Fma4 = B.buildFMA(S32, NegDivScale0, Fma3, NumeratorScaled, Flags);

  if (!PreservesDenormals) {
    if (HasDynamicDenormals) {
      assert(SavedSPDenormMode);
      B.buildInstr(AMDGPU::S_SETREG_B32)
          .addReg(SavedSPDenormMode)

  auto Fmas = B.buildIntrinsic(Intrinsic::amdgcn_div_fmas, {S32})
                  .addUse(Fma4.getReg(0))
                  .addUse(Fma1.getReg(0))
                  .addUse(Fma3.getReg(0))
                  .addUse(NumeratorScaled.getReg(1))

  B.buildIntrinsic(Intrinsic::amdgcn_div_fixup, Res)
      .addUse(Fmas.getReg(0))

  MI.eraseFromParent();
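// f64 FDIV follows the same div_scale / rcp / FMA-refinement structure, with
// a manual check of the div_scale condition output on subtargets where that
// output is not directly usable.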
  auto One = B.buildFConstant(S64, 1.0);

  auto DivScale0 = B.buildIntrinsic(Intrinsic::amdgcn_div_scale, {S64, S1})

  auto NegDivScale0 = B.buildFNeg(S64, DivScale0.getReg(0), Flags);

  auto Rcp = B.buildIntrinsic(Intrinsic::amdgcn_rcp, {S64})
                 .addUse(DivScale0.getReg(0))

  auto Fma0 = B.buildFMA(S64, NegDivScale0, Rcp, One, Flags);
  auto Fma1 = B.buildFMA(S64, Rcp, Fma0, Rcp, Flags);
  auto Fma2 = B.buildFMA(S64, NegDivScale0, Fma1, One, Flags);

  auto DivScale1 = B.buildIntrinsic(Intrinsic::amdgcn_div_scale, {S64, S1})

  auto Fma3 = B.buildFMA(S64, Fma1, Fma2, Fma1, Flags);
  auto Mul = B.buildFMul(S64, DivScale1.getReg(0), Fma3, Flags);
  auto Fma4 = B.buildFMA(S64, NegDivScale0, Mul, DivScale1.getReg(0), Flags);

  if (!ST.hasUsableDivScaleConditionOutput()) {

    auto NumUnmerge = B.buildUnmerge(S32, LHS);
    auto DenUnmerge = B.buildUnmerge(S32, RHS);
    auto Scale0Unmerge = B.buildUnmerge(S32, DivScale0);
    auto Scale1Unmerge = B.buildUnmerge(S32, DivScale1);

        Scale1Unmerge.getReg(1));

        Scale0Unmerge.getReg(1));

    Scale = B.buildXor(S1, CmpNum, CmpDen).getReg(0);

    Scale = DivScale1.getReg(1);

  auto Fmas = B.buildIntrinsic(Intrinsic::amdgcn_div_fmas, {S64})
                  .addUse(Fma4.getReg(0))
                  .addUse(Fma3.getReg(0))
                  .addUse(Mul.getReg(0))

  B.buildIntrinsic(Intrinsic::amdgcn_div_fixup, ArrayRef(Res))
      .addUse(Fmas.getReg(0))

  MI.eraseFromParent();
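// FFREXP is lowered to amdgcn_frexp_mant / amdgcn_frexp_exp; on subtargets
// with the fract bug, non-finite inputs fall back to the original value and a
// zero exponent via selects.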
  LLT Ty = MRI.getType(Res0);

  auto Mant = B.buildIntrinsic(Intrinsic::amdgcn_frexp_mant, {Ty})

  auto Exp = B.buildIntrinsic(Intrinsic::amdgcn_frexp_exp, {InstrExpTy})

  if (ST.hasFractBug()) {
    auto Fabs = B.buildFAbs(Ty, Val);

    auto Zero = B.buildConstant(InstrExpTy, 0);
    Exp = B.buildSelect(InstrExpTy, IsFinite, Exp, Zero);
    Mant = B.buildSelect(Ty, IsFinite, Mant, Val);

  B.buildCopy(Res0, Mant);
  B.buildSExtOrTrunc(Res1, Exp);

  MI.eraseFromParent();
  auto Abs = B.buildFAbs(S32, RHS, Flags);

  auto C0 = B.buildFConstant(S32, 0x1p+96f);
  auto C1 = B.buildFConstant(S32, 0x1p-32f);
  auto C2 = B.buildFConstant(S32, 1.0f);

  auto Sel = B.buildSelect(S32, CmpRes, C1, C2, Flags);

  auto Mul0 = B.buildFMul(S32, RHS, Sel, Flags);

  auto RCP = B.buildIntrinsic(Intrinsic::amdgcn_rcp, {S32})
                 .addUse(Mul0.getReg(0))

  auto Mul1 = B.buildFMul(S32, LHS, RCP, Flags);

  B.buildFMul(Res, Sel, Mul1, Flags);

  MI.eraseFromParent();
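// f16 square root is legalized by extending to f32, using the amdgcn_sqrt
// intrinsic, and truncating the result back to f16.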
  unsigned Flags = MI.getFlags();

  assert(!ST.has16BitInsts());

  auto Ext = B.buildFPExt(F32, MI.getOperand(1), Flags);
  auto Log2 = B.buildIntrinsic(Intrinsic::amdgcn_sqrt, {F32})
                  .addUse(Ext.getReg(0))

  B.buildFPTrunc(MI.getOperand(0), Log2, Flags);
  MI.eraseFromParent();
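// f32 square root: inputs below 2^-96 are scaled up by 2^32 first. The
// hardware estimate is then either nudged by one ULP in each direction using
// FMA residual checks or refined from an rsq seed, and the result is scaled
// back down by 2^-16 when scaling was applied.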
  const unsigned Flags = MI.getFlags();

  MI.eraseFromParent();

  auto ScaleThreshold = B.buildFConstant(F32, 0x1.0p-96f);

  auto ScaleUpFactor = B.buildFConstant(F32, 0x1.0p+32f);
  auto ScaledX = B.buildFMul(F32, X, ScaleUpFactor, Flags);
  auto SqrtX = B.buildSelect(F32, NeedScale, ScaledX, X, Flags);

      .addUse(SqrtX.getReg(0))

    auto NegOne = B.buildConstant(I32, -1);
    auto SqrtSNextDown = B.buildAdd(I32, SqrtS, NegOne);

    auto NegSqrtSNextDown = B.buildFNeg(F32, SqrtSNextDown, Flags);
    auto SqrtVP = B.buildFMA(F32, NegSqrtSNextDown, SqrtS, SqrtX, Flags);

    auto PosOne = B.buildConstant(I32, 1);
    auto SqrtSNextUp = B.buildAdd(I32, SqrtS, PosOne);

    auto NegSqrtSNextUp = B.buildFNeg(F32, SqrtSNextUp, Flags);
    auto SqrtVS = B.buildFMA(F32, NegSqrtSNextUp, SqrtS, SqrtX, Flags);

    auto Zero = B.buildFConstant(F32, 0.0f);

        B.buildSelect(F32, SqrtVPLE0, SqrtSNextDown, SqrtS, Flags).getReg(0);

        B.buildSelect(F32, SqrtVPVSGT0, SqrtSNextUp, SqrtS, Flags).getReg(0);

        B.buildIntrinsic(Intrinsic::amdgcn_rsq, {F32}).addReg(SqrtX.getReg(0));
    B.buildFMul(SqrtS, SqrtX, SqrtR, Flags);

    auto Half = B.buildFConstant(F32, 0.5f);
    auto SqrtH = B.buildFMul(F32, SqrtR, Half, Flags);
    auto NegSqrtH = B.buildFNeg(F32, SqrtH, Flags);
    auto SqrtE = B.buildFMA(F32, NegSqrtH, SqrtS, Half, Flags);
    SqrtH = B.buildFMA(F32, SqrtH, SqrtE, SqrtH, Flags);
    SqrtS = B.buildFMA(F32, SqrtS, SqrtE, SqrtS, Flags).getReg(0);
    auto NegSqrtS = B.buildFNeg(F32, SqrtS, Flags);
    auto SqrtD = B.buildFMA(F32, NegSqrtS, SqrtS, SqrtX, Flags);
    SqrtS = B.buildFMA(F32, SqrtD, SqrtH, SqrtS, Flags).getReg(0);

  auto ScaleDownFactor = B.buildFConstant(F32, 0x1.0p-16f);

  auto ScaledDown = B.buildFMul(F32, SqrtS, ScaleDownFactor, Flags);

  SqrtS = B.buildSelect(F32, NeedScale, ScaledDown, SqrtS, Flags).getReg(0);

  B.buildSelect(Dst, IsZeroOrInf, SqrtX, SqrtS, Flags);

  MI.eraseFromParent();
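// f64 square root: very small inputs are scaled by 2^256 via ldexp, the
// initial estimate comes from amdgcn_rsq, and a sequence of FMAs refines both
// the root and the half-reciprocal before the result is scaled back down.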
  assert(MRI.getType(Dst) == F64 && "only expect to lower f64 sqrt");

  unsigned Flags = MI.getFlags();

  auto ScaleConstant = B.buildFConstant(F64, 0x1.0p-767);

  auto ZeroInt = B.buildConstant(S32, 0);

  auto ScaleUpFactor = B.buildConstant(S32, 256);
  auto ScaleUp = B.buildSelect(S32, Scaling, ScaleUpFactor, ZeroInt);
  auto SqrtX = B.buildFLdexp(F64, X, ScaleUp, Flags);

      B.buildIntrinsic(Intrinsic::amdgcn_rsq, {F64}).addReg(SqrtX.getReg(0));

  auto Half = B.buildFConstant(F64, 0.5);
  auto SqrtH0 = B.buildFMul(F64, SqrtY, Half);
  auto SqrtS0 = B.buildFMul(F64, SqrtX, SqrtY);

  auto NegSqrtH0 = B.buildFNeg(F64, SqrtH0);
  auto SqrtR0 = B.buildFMA(F64, NegSqrtH0, SqrtS0, Half);

  auto SqrtS1 = B.buildFMA(F64, SqrtS0, SqrtR0, SqrtS0);
  auto SqrtH1 = B.buildFMA(F64, SqrtH0, SqrtR0, SqrtH0);

  auto NegSqrtS1 = B.buildFNeg(F64, SqrtS1);
  auto SqrtD0 = B.buildFMA(F64, NegSqrtS1, SqrtS1, SqrtX);

  auto SqrtS2 = B.buildFMA(F64, SqrtD0, SqrtH1, SqrtS1);

  auto NegSqrtS2 = B.buildFNeg(F64, SqrtS2);
  auto SqrtD1 = B.buildFMA(F64, NegSqrtS2, SqrtS2, SqrtX);

  auto SqrtRet = B.buildFMA(F64, SqrtD1, SqrtH1, SqrtS2);

  auto ScaleDownFactor = B.buildConstant(S32, -128);
  auto ScaleDown = B.buildSelect(S32, Scaling, ScaleDownFactor, ZeroInt);
  SqrtRet = B.buildFLdexp(F64, SqrtRet, ScaleDown, Flags);

  B.buildSelect(Dst, IsZeroOrInf, SqrtX, SqrtRet, Flags);

  MI.eraseFromParent();
  LLT Ty = MRI.getType(MI.getOperand(0).getReg());

  auto Flags = MI.getFlags();

  LLT Ty = MRI.getType(Dst);

  auto Rsq = B.buildIntrinsic(Intrinsic::amdgcn_rsq, {Ty})

  auto ClampMax = UseIEEE ? B.buildFMinNumIEEE(Ty, Rsq, MaxFlt, Flags)
                          : B.buildFMinNum(Ty, Rsq, MaxFlt, Flags);

    B.buildFMaxNumIEEE(Dst, ClampMax, MinFlt, Flags);

    B.buildFMaxNum(Dst, ClampMax, MinFlt, Flags);
  MI.eraseFromParent();
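// Lane-op intrinsics (readlane, writelane, permlane, set_inactive, DPP, ...)
// are legalized by extending small types to 32 bits or splitting wider values
// into 32-bit pieces (or 64-bit pieces where DPALU DPP allows) and rebuilding
// the result afterwards.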
  bool IsPermLane16 = IID == Intrinsic::amdgcn_permlane16 ||
                      IID == Intrinsic::amdgcn_permlanex16;
  bool IsSetInactive = IID == Intrinsic::amdgcn_set_inactive ||
                       IID == Intrinsic::amdgcn_set_inactive_chain_arg;

    auto LaneOp = B.buildIntrinsic(IID, {VT}).addUse(Src0);

    case Intrinsic::amdgcn_readfirstlane:
    case Intrinsic::amdgcn_permlane64:
      return LaneOp.getReg(0);
    case Intrinsic::amdgcn_readlane:
    case Intrinsic::amdgcn_set_inactive:
    case Intrinsic::amdgcn_set_inactive_chain_arg:
      return LaneOp.addUse(Src1).getReg(0);
    case Intrinsic::amdgcn_writelane:
      return LaneOp.addUse(Src1).addUse(Src2).getReg(0);
    case Intrinsic::amdgcn_permlane16:
    case Intrinsic::amdgcn_permlanex16: {

      int64_t Src4 = MI.getOperand(6).getImm();
      int64_t Src5 = MI.getOperand(7).getImm();
      return LaneOp.addUse(Src1)

    case Intrinsic::amdgcn_mov_dpp8:
      return LaneOp.addImm(MI.getOperand(3).getImm()).getReg(0);
    case Intrinsic::amdgcn_update_dpp:
      return LaneOp.addUse(Src1)
          .addImm(MI.getOperand(4).getImm())
          .addImm(MI.getOperand(5).getImm())
          .addImm(MI.getOperand(6).getImm())
          .addImm(MI.getOperand(7).getImm())

  if (IID == Intrinsic::amdgcn_readlane || IID == Intrinsic::amdgcn_writelane ||
      IID == Intrinsic::amdgcn_update_dpp || IsSetInactive || IsPermLane16) {
    Src1 = MI.getOperand(3).getReg();

    if (IID == Intrinsic::amdgcn_writelane || IsPermLane16) {
      Src2 = MI.getOperand(4).getReg();

  LLT Ty = MRI.getType(DstReg);
  unsigned Size = Ty.getSizeInBits();

  unsigned SplitSize = 32;
  if (IID == Intrinsic::amdgcn_update_dpp && (Size % 64 == 0) &&
      ST.hasDPALU_DPP() &&

  if (Size == SplitSize) {

    Src0 = B.buildAnyExt(S32, Src0).getReg(0);

    if (IID == Intrinsic::amdgcn_update_dpp || IsSetInactive || IsPermLane16)

    if (IID == Intrinsic::amdgcn_writelane)

    Register LaneOpDst = createLaneOp(Src0, Src1, Src2, S32);
    B.buildTrunc(DstReg, LaneOpDst);
    MI.eraseFromParent();

  if (Size % SplitSize != 0)

  bool NeedsBitcast = false;
  if (Ty.isVector()) {

    if (EltSize == SplitSize) {
      PartialResTy = EltTy;
    } else if (EltSize == 16 || EltSize == 32) {
      unsigned NElem = SplitSize / EltSize;

      NeedsBitcast = true;

  unsigned NumParts = Size / SplitSize;

  if (IID == Intrinsic::amdgcn_update_dpp || IsSetInactive || IsPermLane16)
    Src1Parts = B.buildUnmerge(PartialResTy, Src1);

  if (IID == Intrinsic::amdgcn_writelane)
    Src2Parts = B.buildUnmerge(PartialResTy, Src2);

  for (unsigned i = 0; i < NumParts; ++i) {
    Src0 = Src0Parts.getReg(i);

    if (IID == Intrinsic::amdgcn_update_dpp || IsSetInactive || IsPermLane16)
      Src1 = Src1Parts.getReg(i);

    if (IID == Intrinsic::amdgcn_writelane)
      Src2 = Src2Parts.getReg(i);

    PartialRes.push_back(createLaneOp(Src0, Src1, Src2, PartialResTy));

    B.buildBitcast(DstReg, B.buildMergeLikeInstr(

    B.buildMergeLikeInstr(DstReg, PartialRes);

  MI.eraseFromParent();
      ST.getTargetLowering()->getImplicitParameterOffset(

  LLT DstTy = MRI.getType(DstReg);

  Register KernargPtrReg = MRI.createGenericVirtualRegister(DstTy);

  B.buildObjectPtrOffset(DstReg, KernargPtrReg,
                         B.buildConstant(IdxTy, Offset).getReg(0));

  Register Pointer = MI.getOperand(2).getReg();

  Register NumRecords = MI.getOperand(4).getReg();

  B.setInsertPt(B.getMBB(), ++B.getInsertPt());

  auto ExtStride = B.buildAnyExt(S32, Stride);

  if (ST.has45BitNumRecordsBufferResource()) {

    auto PointerInt = B.buildPtrToInt(PtrIntTy, Pointer);
    auto ExtPointer = B.buildAnyExtOrTrunc(S64, PointerInt);
    auto NumRecordsLHS = B.buildShl(S64, NumRecords, B.buildConstant(S32, 57));
    Register LowHalf = B.buildOr(S64, ExtPointer, NumRecordsLHS).getReg(0);

    auto NumRecordsRHS = B.buildLShr(S64, NumRecords, B.buildConstant(S32, 7));
    auto ShiftedStride = B.buildShl(S32, ExtStride, B.buildConstant(S32, 12));
    auto ExtShiftedStride =
        B.buildMergeValues(S64, {Zero, ShiftedStride.getReg(0)});
    auto ShiftedFlags = B.buildShl(S32, Flags, B.buildConstant(S32, 28));
    auto ExtShiftedFlags =
        B.buildMergeValues(S64, {Zero, ShiftedFlags.getReg(0)});
    auto CombinedFields = B.buildOr(S64, NumRecordsRHS, ExtShiftedStride);

        B.buildOr(S64, CombinedFields, ExtShiftedFlags).getReg(0);
    B.buildMergeValues(Result, {LowHalf, HighHalf});

    NumRecords = B.buildTrunc(S32, NumRecords).getReg(0);
    auto Unmerge = B.buildUnmerge(S32, Pointer);
    auto LowHalf = Unmerge.getReg(0);
    auto HighHalf = Unmerge.getReg(1);

    auto AndMask = B.buildConstant(S32, 0x0000ffff);
    auto Masked = B.buildAnd(S32, HighHalf, AndMask);
    auto ShiftConst = B.buildConstant(S32, 16);
    auto ShiftedStride = B.buildShl(S32, ExtStride, ShiftConst);
    auto NewHighHalf = B.buildOr(S32, Masked, ShiftedStride);
    Register NewHighHalfReg = NewHighHalf.getReg(0);
    B.buildMergeValues(Result, {LowHalf, NewHighHalfReg, NumRecords, Flags});

  MI.eraseFromParent();

  MI.eraseFromParent();

  std::optional<uint32_t> KnownSize =

  if (KnownSize.has_value())
    B.buildConstant(DstReg, *KnownSize);

  MI.eraseFromParent();
                                           unsigned AddrSpace) const {

  auto Unmerge = B.buildUnmerge(S32, MI.getOperand(2).getReg());

      ST.hasGloballyAddressableScratch()) {

        B.buildInstr(AMDGPU::S_MOV_B32, {S32},
                     {Register(AMDGPU::SRC_FLAT_SCRATCH_BASE_HI)})

    MRI.setRegClass(FlatScratchBaseHi, &AMDGPU::SReg_32RegClass);

    Register XOR = B.buildXor(S32, Hi32, FlatScratchBaseHi).getReg(0);

                  B.buildConstant(S32, 1u << 26));

  MI.eraseFromParent();
std::pair<Register, unsigned>

      MRI, OrigOffset, nullptr, CheckNUW);

  if (MRI.getType(BaseReg).isPointer())
    BaseReg = B.buildPtrToInt(MRI.getType(OrigOffset), BaseReg).getReg(0);

    unsigned Overflow = ImmOffset & ~MaxImm;
    ImmOffset -= Overflow;
    if ((int32_t)Overflow < 0) {
      Overflow += ImmOffset;

    if (Overflow != 0) {

        BaseReg = B.buildConstant(S32, Overflow).getReg(0);

        auto OverflowVal = B.buildConstant(S32, Overflow);
        BaseReg = B.buildAdd(S32, BaseReg, OverflowVal).getReg(0);

    BaseReg = B.buildConstant(S32, 0).getReg(0);

  return std::pair(BaseReg, ImmOffset);
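// Buffer/image D16 data handling below: on subtargets with unpacked D16
// memory instructions the 16-bit elements are widened to 32 bits, otherwise
// they are packed, with undef padding where the image-store D16 bug requires
// a fixed register count.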
                                       bool ImageStore) const {

  LLT StoreVT = MRI.getType(Reg);

  if (ST.hasUnpackedD16VMem()) {
    auto Unmerge = B.buildUnmerge(S16, Reg);

    for (int I = 0, E = Unmerge->getNumOperands() - 1; I != E; ++I)
      WideRegs.push_back(B.buildAnyExt(S32, Unmerge.getReg(I)).getReg(0));

  if (ImageStore && ST.hasImageStoreD16Bug()) {

      Reg = B.buildBitcast(S32, Reg).getReg(0);

      PackedRegs.resize(2, B.buildUndef(S32).getReg(0));

      auto Unmerge = B.buildUnmerge(S16, Reg);
      for (int I = 0, E = Unmerge->getNumOperands() - 1; I != E; ++I)

      PackedRegs.resize(6, B.buildUndef(S16).getReg(0));

      auto Unmerge = B.buildUnmerge(S32, Reg);
      for (int I = 0, E = Unmerge->getNumOperands() - 1; I != E; ++I)

      PackedRegs.resize(4, B.buildUndef(S32).getReg(0));
                                             bool IsFormat) const {

  LLT Ty = MRI->getType(VData);

    VData = B.buildBitcast(Ty, VData).getReg(0);

  if (Ty.isVector()) {
    if (Ty.getElementType() == S16 && Ty.getNumElements() <= 4) {

                                             bool IsFormat) const {

  LLT Ty = MRI.getType(VData);

  const bool IsD16 = IsFormat && (EltTy.getSizeInBits() == 16);

  const unsigned NumVIndexOps = IsTyped ? 8 : 7;

  const bool HasVIndex = MI.getNumOperands() == NumVIndexOps;

    VIndex = MI.getOperand(3).getReg();

    VIndex = B.buildConstant(S32, 0).getReg(0);

  Register VOffset = MI.getOperand(3 + OpOffset).getReg();
  Register SOffset = MI.getOperand(4 + OpOffset).getReg();

    Format = MI.getOperand(5 + OpOffset).getImm();

  unsigned AuxiliaryData = MI.getOperand(5 + OpOffset).getImm();

    Opc = IsD16 ? AMDGPU::G_AMDGPU_TBUFFER_STORE_FORMAT_D16
                : AMDGPU::G_AMDGPU_TBUFFER_STORE_FORMAT;
  } else if (IsFormat) {
    Opc = IsD16 ? AMDGPU::G_AMDGPU_BUFFER_STORE_FORMAT_D16
                : AMDGPU::G_AMDGPU_BUFFER_STORE_FORMAT;

      Opc = AMDGPU::G_AMDGPU_BUFFER_STORE_BYTE;

      Opc = AMDGPU::G_AMDGPU_BUFFER_STORE_SHORT;

      Opc = AMDGPU::G_AMDGPU_BUFFER_STORE;

  auto MIB = B.buildInstr(Opc)

  MIB.addImm(AuxiliaryData)
      .addImm(HasVIndex ? -1 : 0)
      .addMemOperand(MMO);

  MI.eraseFromParent();

                             unsigned ImmOffset, unsigned Format,

  auto MIB = B.buildInstr(Opc)

  MIB.addImm(AuxiliaryData)
      .addImm(HasVIndex ? -1 : 0)
      .addMemOperand(MMO);
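// Buffer load legalization mirrors the store path: pick the typed/format/
// byte/short/dword load pseudo (with a TFE variant when a status def is
// present), then unmerge, truncate, or repack the raw dwords into the
// requested result type.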
                                            bool IsTyped) const {

  assert(MI.getNumExplicitDefs() == 1 || MI.getNumExplicitDefs() == 2);
  bool IsTFE = MI.getNumExplicitDefs() == 2;

    StatusDst = MI.getOperand(1).getReg();

  Register RSrc = MI.getOperand(2 + OpOffset).getReg();

  const unsigned NumVIndexOps = IsTyped ? 8 : 7;

  const bool HasVIndex = MI.getNumOperands() == NumVIndexOps + OpOffset;

    VIndex = MI.getOperand(3 + OpOffset).getReg();

    VIndex = B.buildConstant(S32, 0).getReg(0);

  Register VOffset = MI.getOperand(3 + OpOffset).getReg();
  Register SOffset = MI.getOperand(4 + OpOffset).getReg();

    Format = MI.getOperand(5 + OpOffset).getImm();

  unsigned AuxiliaryData = MI.getOperand(5 + OpOffset).getImm();

  LLT Ty = MRI.getType(Dst);

    Dst = MI.getOperand(0).getReg();
    B.setInsertPt(B.getMBB(), MI);

    Dst = MI.getOperand(0).getReg();
    B.setInsertPt(B.getMBB(), MI);

  const bool IsD16 = IsFormat && (EltTy.getSizeInBits() == 16);
  const bool Unpacked = ST.hasUnpackedD16VMem();

    Opc = IsD16 ? AMDGPU::G_AMDGPU_TBUFFER_LOAD_FORMAT_D16
                : AMDGPU::G_AMDGPU_TBUFFER_LOAD_FORMAT;
  } else if (IsFormat) {

      Opc = AMDGPU::G_AMDGPU_BUFFER_LOAD_FORMAT_D16;

      Opc = IsTFE ? AMDGPU::G_AMDGPU_BUFFER_LOAD_FORMAT_TFE
                  : AMDGPU::G_AMDGPU_BUFFER_LOAD_FORMAT;

      Opc = IsTFE ? AMDGPU::G_AMDGPU_BUFFER_LOAD_UBYTE_TFE
                  : AMDGPU::G_AMDGPU_BUFFER_LOAD_UBYTE;

      Opc = IsTFE ? AMDGPU::G_AMDGPU_BUFFER_LOAD_USHORT_TFE
                  : AMDGPU::G_AMDGPU_BUFFER_LOAD_USHORT;

      Opc = IsTFE ? AMDGPU::G_AMDGPU_BUFFER_LOAD_TFE
                  : AMDGPU::G_AMDGPU_BUFFER_LOAD;

    unsigned NumValueDWords = divideCeil(Ty.getSizeInBits(), 32);
    unsigned NumLoadDWords = NumValueDWords + 1;

    Register LoadDstReg = B.getMRI()->createGenericVirtualRegister(LoadTy);

                    Format, AuxiliaryData, MMO, IsTyped, HasVIndex, B);

      Register ExtDst = B.getMRI()->createGenericVirtualRegister(S32);
      B.buildUnmerge({ExtDst, StatusDst}, LoadDstReg);
      B.buildTrunc(Dst, ExtDst);
    } else if (NumValueDWords == 1) {
      B.buildUnmerge({Dst, StatusDst}, LoadDstReg);

      for (unsigned I = 0; I != NumValueDWords; ++I)
        LoadElts.push_back(B.getMRI()->createGenericVirtualRegister(S32));

      B.buildUnmerge(LoadElts, LoadDstReg);

      B.buildMergeLikeInstr(Dst, LoadElts);

             (IsD16 && !Ty.isVector())) {
    Register LoadDstReg = B.getMRI()->createGenericVirtualRegister(S32);

                    Format, AuxiliaryData, MMO, IsTyped, HasVIndex, B);
    B.setInsertPt(B.getMBB(), ++B.getInsertPt());
    B.buildTrunc(Dst, LoadDstReg);
  } else if (Unpacked && IsD16 && Ty.isVector()) {

    Register LoadDstReg = B.getMRI()->createGenericVirtualRegister(UnpackedTy);

                    Format, AuxiliaryData, MMO, IsTyped, HasVIndex, B);
    B.setInsertPt(B.getMBB(), ++B.getInsertPt());

    auto Unmerge = B.buildUnmerge(S32, LoadDstReg);

    for (unsigned I = 0, N = Unmerge->getNumOperands() - 1; I != N; ++I)
      Repack.push_back(B.buildTrunc(EltTy, Unmerge.getReg(I)).getReg(0));

    B.buildMergeLikeInstr(Dst, Repack);

                    AuxiliaryData, MMO, IsTyped, HasVIndex, B);

  MI.eraseFromParent();
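// Map the raw/struct (and ptr) buffer atomic intrinsics onto the generic
// G_AMDGPU_BUFFER_ATOMIC_* pseudo opcodes.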
  case Intrinsic::amdgcn_raw_buffer_atomic_swap:
  case Intrinsic::amdgcn_raw_ptr_buffer_atomic_swap:
  case Intrinsic::amdgcn_struct_buffer_atomic_swap:
  case Intrinsic::amdgcn_struct_ptr_buffer_atomic_swap:
    return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_SWAP;
  case Intrinsic::amdgcn_raw_buffer_atomic_add:
  case Intrinsic::amdgcn_raw_ptr_buffer_atomic_add:
  case Intrinsic::amdgcn_struct_buffer_atomic_add:
  case Intrinsic::amdgcn_struct_ptr_buffer_atomic_add:
    return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_ADD;
  case Intrinsic::amdgcn_raw_buffer_atomic_sub:
  case Intrinsic::amdgcn_raw_ptr_buffer_atomic_sub:
  case Intrinsic::amdgcn_struct_buffer_atomic_sub:
  case Intrinsic::amdgcn_struct_ptr_buffer_atomic_sub:
    return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_SUB;
  case Intrinsic::amdgcn_raw_buffer_atomic_smin:
  case Intrinsic::amdgcn_raw_ptr_buffer_atomic_smin:
  case Intrinsic::amdgcn_struct_buffer_atomic_smin:
  case Intrinsic::amdgcn_struct_ptr_buffer_atomic_smin:
    return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_SMIN;
  case Intrinsic::amdgcn_raw_buffer_atomic_umin:
  case Intrinsic::amdgcn_raw_ptr_buffer_atomic_umin:
  case Intrinsic::amdgcn_struct_buffer_atomic_umin:
  case Intrinsic::amdgcn_struct_ptr_buffer_atomic_umin:
    return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_UMIN;
  case Intrinsic::amdgcn_raw_buffer_atomic_smax:
  case Intrinsic::amdgcn_raw_ptr_buffer_atomic_smax:
  case Intrinsic::amdgcn_struct_buffer_atomic_smax:
  case Intrinsic::amdgcn_struct_ptr_buffer_atomic_smax:
    return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_SMAX;
  case Intrinsic::amdgcn_raw_buffer_atomic_umax:
  case Intrinsic::amdgcn_raw_ptr_buffer_atomic_umax:
  case Intrinsic::amdgcn_struct_buffer_atomic_umax:
  case Intrinsic::amdgcn_struct_ptr_buffer_atomic_umax:
    return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_UMAX;
  case Intrinsic::amdgcn_raw_buffer_atomic_and:
  case Intrinsic::amdgcn_raw_ptr_buffer_atomic_and:
  case Intrinsic::amdgcn_struct_buffer_atomic_and:
  case Intrinsic::amdgcn_struct_ptr_buffer_atomic_and:
    return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_AND;
  case Intrinsic::amdgcn_raw_buffer_atomic_or:
  case Intrinsic::amdgcn_raw_ptr_buffer_atomic_or:
  case Intrinsic::amdgcn_struct_buffer_atomic_or:
  case Intrinsic::amdgcn_struct_ptr_buffer_atomic_or:
    return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_OR;
  case Intrinsic::amdgcn_raw_buffer_atomic_xor:
  case Intrinsic::amdgcn_raw_ptr_buffer_atomic_xor:
  case Intrinsic::amdgcn_struct_buffer_atomic_xor:
  case Intrinsic::amdgcn_struct_ptr_buffer_atomic_xor:
    return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_XOR;
  case Intrinsic::amdgcn_raw_buffer_atomic_inc:
  case Intrinsic::amdgcn_raw_ptr_buffer_atomic_inc:
  case Intrinsic::amdgcn_struct_buffer_atomic_inc:
  case Intrinsic::amdgcn_struct_ptr_buffer_atomic_inc:
    return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_INC;
  case Intrinsic::amdgcn_raw_buffer_atomic_dec:
  case Intrinsic::amdgcn_raw_ptr_buffer_atomic_dec:
  case Intrinsic::amdgcn_struct_buffer_atomic_dec:
  case Intrinsic::amdgcn_struct_ptr_buffer_atomic_dec:
    return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_DEC;
  case Intrinsic::amdgcn_raw_buffer_atomic_cmpswap:
  case Intrinsic::amdgcn_raw_ptr_buffer_atomic_cmpswap:
  case Intrinsic::amdgcn_struct_buffer_atomic_cmpswap:
  case Intrinsic::amdgcn_struct_ptr_buffer_atomic_cmpswap:
    return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_CMPSWAP;
  case Intrinsic::amdgcn_raw_buffer_atomic_fadd:
  case Intrinsic::amdgcn_raw_ptr_buffer_atomic_fadd:
  case Intrinsic::amdgcn_struct_buffer_atomic_fadd:
  case Intrinsic::amdgcn_struct_ptr_buffer_atomic_fadd:
    return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_FADD;
  case Intrinsic::amdgcn_raw_buffer_atomic_fmin:
  case Intrinsic::amdgcn_raw_ptr_buffer_atomic_fmin:
  case Intrinsic::amdgcn_struct_buffer_atomic_fmin:
  case Intrinsic::amdgcn_struct_ptr_buffer_atomic_fmin:
    return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_FMIN;
  case Intrinsic::amdgcn_raw_buffer_atomic_fmax:
  case Intrinsic::amdgcn_raw_ptr_buffer_atomic_fmax:
  case Intrinsic::amdgcn_struct_buffer_atomic_fmax:
  case Intrinsic::amdgcn_struct_ptr_buffer_atomic_fmax:
    return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_FMAX;
  case Intrinsic::amdgcn_raw_buffer_atomic_cond_sub_u32:
  case Intrinsic::amdgcn_struct_buffer_atomic_cond_sub_u32:
    return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_COND_SUB_U32;
  const bool IsCmpSwap =
      IID == Intrinsic::amdgcn_raw_buffer_atomic_cmpswap ||
      IID == Intrinsic::amdgcn_struct_buffer_atomic_cmpswap ||
      IID == Intrinsic::amdgcn_raw_ptr_buffer_atomic_cmpswap ||
      IID == Intrinsic::amdgcn_struct_ptr_buffer_atomic_cmpswap;

    CmpVal = MI.getOperand(3).getReg();

  Register RSrc = MI.getOperand(3 + OpOffset).getReg();
  const unsigned NumVIndexOps = IsCmpSwap ? 9 : 8;

  const bool HasVIndex = MI.getNumOperands() == NumVIndexOps;

    VIndex = MI.getOperand(4 + OpOffset).getReg();

    VIndex = B.buildConstant(LLT::scalar(32), 0).getReg(0);

  Register VOffset = MI.getOperand(4 + OpOffset).getReg();
  Register SOffset = MI.getOperand(5 + OpOffset).getReg();
  unsigned AuxiliaryData = MI.getOperand(6 + OpOffset).getImm();

      .addImm(AuxiliaryData)
      .addImm(HasVIndex ? -1 : 0)
      .addMemOperand(MMO);

  MI.eraseFromParent();
                                         bool IsA16, bool IsG16) {

        (B.getMRI()->getType(AddrReg) == S16)) {

        B.buildBuildVector(V2S16, {AddrReg, B.buildUndef(S16).getReg(0)})

             "Bias needs to be converted to 16 bit in A16 mode");

      AddrReg = B.buildBitcast(V2S16, AddrReg).getReg(0);

      if (((I + 1) >= EndIdx) ||
          !MI.getOperand(ArgOffset + I + 1).isReg()) {

            B.buildBuildVector(V2S16, {AddrReg, B.buildUndef(S16).getReg(0)})

                V2S16, {AddrReg, MI.getOperand(ArgOffset + I + 1).getReg()})

                                      int DimIdx, int NumVAddrs) {

  for (int I = 0; I != NumVAddrs; ++I) {

    if (SrcOp.isReg()) {

  int NumAddrRegs = AddrRegs.size();
  if (NumAddrRegs != 1) {

    MI.getOperand(DimIdx).setReg(VAddr.getReg(0));

  for (int I = 1; I != NumVAddrs; ++I) {

      MI.getOperand(DimIdx + I).setReg(AMDGPU::NoRegister);
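// Image intrinsic legalization: rewrite to the generic image pseudos, pack
// 16-bit addresses/gradients into dwords, respect the NSA encoding limits,
// and widen or repack D16 and TFE results to register-sized types.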
  const unsigned NumDefs = MI.getNumExplicitDefs();
  const unsigned ArgOffset = NumDefs + 1;
  bool IsTFE = NumDefs == 2;

    VData = MI.getOperand(NumDefs == 0 ? 1 : 0).getReg();
    Ty = MRI->getType(VData);

  const bool IsAtomicPacked16Bit =
      (BaseOpcode->BaseOpcode == AMDGPU::IMAGE_ATOMIC_PK_ADD_F16 ||
       BaseOpcode->BaseOpcode == AMDGPU::IMAGE_ATOMIC_PK_ADD_BF16);

      ST.hasG16() ? (BaseOpcode->Gradients && GradTy == S16) : GradTy == S16;
  const bool IsA16 = AddrTy == S16;
  const bool IsD16 = !IsAtomicPacked16Bit && Ty.getScalarType() == S16;

  if (!BaseOpcode->Atomic) {
    DMask = MI.getOperand(ArgOffset + Intr->DMaskIndex).getImm();

    } else if (DMask != 0) {

    } else if (!IsTFE && !BaseOpcode->Store) {

      B.buildUndef(MI.getOperand(0));
      MI.eraseFromParent();

  const unsigned StoreOpcode = IsD16 ? AMDGPU::G_AMDGPU_INTRIN_IMAGE_STORE_D16
                                     : AMDGPU::G_AMDGPU_INTRIN_IMAGE_STORE;
  const unsigned LoadOpcode = IsD16 ? AMDGPU::G_AMDGPU_INTRIN_IMAGE_LOAD_D16
                                    : AMDGPU::G_AMDGPU_INTRIN_IMAGE_LOAD;
  unsigned NewOpcode = LoadOpcode;
  if (BaseOpcode->Store)
    NewOpcode = StoreOpcode;

    NewOpcode = AMDGPU::G_AMDGPU_INTRIN_IMAGE_LOAD_NORET;

  MI.setDesc(B.getTII().get(NewOpcode));

  if (IsTFE && DMask == 0) {

    MI.getOperand(ArgOffset + Intr->DMaskIndex).setImm(DMask);

  if (BaseOpcode->Atomic) {

    LLT Ty = MRI->getType(VData0);

    if (Ty.isVector() && !IsAtomicPacked16Bit)

      auto Concat = B.buildBuildVector(PackedTy, {VData0, VData1});
      MI.getOperand(2).setReg(Concat.getReg(0));
      MI.getOperand(3).setReg(AMDGPU::NoRegister);

  unsigned CorrectedNumVAddrs = Intr->NumVAddrs;

  if (BaseOpcode->Gradients && !ST.hasG16() && (IsA16 != IsG16)) {

  if (IsA16 && !ST.hasA16()) {

  const unsigned NSAMaxSize = ST.getNSAMaxSize(BaseOpcode->Sampler);
  const unsigned HasPartialNSA = ST.hasPartialNSAEncoding();

  if (IsA16 || IsG16) {

    const bool UseNSA = ST.hasNSAEncoding() &&
                        PackedRegs.size() >= ST.getNSAThreshold(MF) &&
                        (PackedRegs.size() <= NSAMaxSize || HasPartialNSA);
    const bool UsePartialNSA =
        UseNSA && HasPartialNSA && PackedRegs.size() > NSAMaxSize;

    if (UsePartialNSA) {

      auto Concat = B.buildConcatVectors(
          PackedAddrTy, ArrayRef(PackedRegs).slice(NSAMaxSize - 1));
      PackedRegs[NSAMaxSize - 1] = Concat.getReg(0);
      PackedRegs.resize(NSAMaxSize);
    } else if (!UseNSA && PackedRegs.size() > 1) {

      auto Concat = B.buildConcatVectors(PackedAddrTy, PackedRegs);
      PackedRegs[0] = Concat.getReg(0);

    const unsigned NumPacked = PackedRegs.size();

      if (!SrcOp.isReg()) {

        SrcOp.setReg(AMDGPU::NoRegister);

    const bool UseNSA = ST.hasNSAEncoding() &&
                        CorrectedNumVAddrs >= ST.getNSAThreshold(MF) &&
                        (CorrectedNumVAddrs <= NSAMaxSize || HasPartialNSA);
    const bool UsePartialNSA =
        UseNSA && HasPartialNSA && CorrectedNumVAddrs > NSAMaxSize;

    if (UsePartialNSA) {

                                  ArgOffset + Intr->VAddrStart + NSAMaxSize - 1,

    } else if (!UseNSA && Intr->NumVAddrs > 1) {

    if (!Ty.isVector() || !IsD16)

    if (RepackedReg != VData) {
      MI.getOperand(1).setReg(RepackedReg);

  const int NumElts = Ty.isVector() ? Ty.getNumElements() : 1;

  if (NumElts < DMaskLanes)

  if (NumElts > 4 || DMaskLanes > 4)

  const unsigned AdjustedNumElts = DMaskLanes == 0 ? 1 : DMaskLanes;
  const LLT AdjustedTy =

  if (IsD16 && ST.hasUnpackedD16VMem()) {

  unsigned RoundedElts = (AdjustedTy.getSizeInBits() + 31) / 32;
  unsigned RoundedSize = 32 * RoundedElts;

  RegTy = !IsTFE && EltSize == 16 ? V2S16 : S32;

  if (!IsTFE && (RoundedTy == Ty || !Ty.isVector()))

  B.setInsertPt(*MI.getParent(), ++MI.getIterator());

  const LLT LoadResultTy = IsTFE ? TFETy : RoundedTy;
  const int ResultNumRegs = LoadResultTy.getSizeInBits() / 32;

  Register NewResultReg = MRI->createGenericVirtualRegister(LoadResultTy);

  MI.getOperand(0).setReg(NewResultReg);

    Dst1Reg = MI.getOperand(1).getReg();
    if (MRI->getType(Dst1Reg) != S32)

    MI.removeOperand(1);

      B.buildUnmerge({DstReg, Dst1Reg}, NewResultReg);

  const int NumDataRegs = IsTFE ? ResultNumRegs - 1 : ResultNumRegs;

  if (ResultNumRegs == 1) {

    ResultRegs[0] = NewResultReg;

    for (int I = 0; I != NumDataRegs; ++I)
      ResultRegs[I] = MRI->createGenericVirtualRegister(RegTy);
    B.buildUnmerge(ResultRegs, NewResultReg);

    ResultRegs.resize(NumDataRegs);

  if (IsD16 && !Ty.isVector()) {
    B.buildTrunc(DstReg, ResultRegs[0]);

  if (Ty == V2S16 && NumDataRegs == 1 && !ST.hasUnpackedD16VMem()) {
    B.buildBitcast(DstReg, ResultRegs[0]);

    if (RegTy != V2S16 && !ST.hasUnpackedD16VMem()) {

      Reg = B.buildBitcast(V2S16, Reg).getReg(0);
    } else if (ST.hasUnpackedD16VMem()) {

      Reg = B.buildTrunc(S16, Reg).getReg(0);

  auto padWithUndef = [&](LLT Ty, int NumElts) {

    Register Undef = B.buildUndef(Ty).getReg(0);
    for (int I = 0; I != NumElts; ++I)

  LLT ResTy = MRI->getType(ResultRegs[0]);

    padWithUndef(ResTy, NumElts - ResultRegs.size());
    B.buildBuildVector(DstReg, ResultRegs);

  assert(!ST.hasUnpackedD16VMem() && ResTy == V2S16);
  const int RegsToCover = (Ty.getSizeInBits() + 31) / 32;

    if (ResultRegs.size() == 1) {
      NewResultReg = ResultRegs[0];
    } else if (ResultRegs.size() == 2) {

      NewResultReg = B.buildConcatVectors(V4S16, ResultRegs).getReg(0);

    if (MRI->getType(DstReg).getNumElements() <
        MRI->getType(NewResultReg).getNumElements()) {
      B.buildDeleteTrailingVectorElements(DstReg, NewResultReg);

      B.buildPadVectorWithUndefElements(DstReg, NewResultReg);

  padWithUndef(ResTy, RegsToCover - ResultRegs.size());
  B.buildConcatVectors(DstReg, ResultRegs);
  Register OrigDst = MI.getOperand(0).getReg();

  LLT Ty = B.getMRI()->getType(OrigDst);
  unsigned Size = Ty.getSizeInBits();

  if (Size < 32 && ST.hasScalarSubwordLoads()) {

    Opc = Size == 8 ? AMDGPU::G_AMDGPU_S_BUFFER_LOAD_UBYTE
                    : AMDGPU::G_AMDGPU_S_BUFFER_LOAD_USHORT;

    Dst = B.getMRI()->createGenericVirtualRegister(LLT::scalar(32));

    Opc = AMDGPU::G_AMDGPU_S_BUFFER_LOAD;

  B.setInsertPt(B.getMBB(), MI);

  B.setInsertPt(B.getMBB(), MI);

  MI.setDesc(B.getTII().get(Opc));
  MI.removeOperand(1);

  const unsigned MemSize = (Size + 7) / 8;
  const Align MemAlign = B.getDataLayout().getABITypeAlign(

  MI.addMemOperand(MF, MMO);
  if (Dst != OrigDst) {
    MI.getOperand(0).setReg(Dst);
    B.setInsertPt(B.getMBB(), ++B.getInsertPt());
    B.buildTrunc(OrigDst, Dst);

  MI.setDesc(B.getTII().get(AMDGPU::G_AMDGPU_S_BUFFER_PREFETCH));
  MI.removeOperand(0);
  if (!ST.isTrapHandlerEnabled() ||

  return ST.supportsGetDoorbellID() ?

  MI.eraseFromParent();

  BuildMI(*TrapBB, TrapBB->end(), DL, B.getTII().get(AMDGPU::S_ENDPGM))
  BuildMI(BB, &MI, DL, B.getTII().get(AMDGPU::S_CBRANCH_EXECNZ))

  MI.eraseFromParent();

  Register SGPR01(AMDGPU::SGPR0_SGPR1);

        ST.getTargetLowering()->getImplicitParameterOffset(B.getMF(), Param);

    Register KernargPtrReg = MRI.createGenericVirtualRegister(

    Register LoadAddr = MRI.createGenericVirtualRegister(

    B.buildObjectPtrOffset(LoadAddr, KernargPtrReg,

    Register Temp = B.buildLoad(S64, LoadAddr, *MMO).getReg(0);
    B.buildCopy(SGPR01, Temp);
    B.buildInstr(AMDGPU::S_TRAP)

    MI.eraseFromParent();

  B.buildCopy(SGPR01, LiveIn);
  B.buildInstr(AMDGPU::S_TRAP)

  MI.eraseFromParent();

  if (ST.hasPrivEnabledTrap2NopBug()) {
    ST.getInstrInfo()->insertSimulatedTrap(MRI, B.getMBB(), MI,

    MI.eraseFromParent();

  B.buildInstr(AMDGPU::S_TRAP)

  MI.eraseFromParent();

  if (!ST.isTrapHandlerEnabled() ||

        Fn, "debugtrap handler not supported", MI.getDebugLoc(), DS_Warning));

  B.buildInstr(AMDGPU::S_TRAP)

  MI.eraseFromParent();
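// amdgcn_image_bvh_intersect_ray: the node pointer, ray extent, origin and
// (inverse) direction are packed into the vaddr layout expected by the
// selected MIMG encoding, using NSA when available.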
  Register NodePtr = MI.getOperand(2).getReg();
  Register RayExtent = MI.getOperand(3).getReg();
  Register RayOrigin = MI.getOperand(4).getReg();

  Register RayInvDir = MI.getOperand(6).getReg();

  if (!ST.hasGFX10_AEncoding()) {

        Fn, "intrinsic not supported on subtarget", MI.getDebugLoc()));

  const bool IsA16 = MRI.getType(RayDir).getElementType().getSizeInBits() == 16;
  const bool Is64 = MRI.getType(NodePtr).getSizeInBits() == 64;
  const unsigned NumVDataDwords = 4;
  const unsigned NumVAddrDwords = IsA16 ? (Is64 ? 9 : 8) : (Is64 ? 12 : 11);
  const unsigned NumVAddrs = IsGFX11Plus ? (IsA16 ? 4 : 5) : NumVAddrDwords;

      IsGFX12Plus || (ST.hasNSAEncoding() && NumVAddrs <= ST.getNSAMaxSize());

  const unsigned BaseOpcodes[2][2] = {
      {AMDGPU::IMAGE_BVH_INTERSECT_RAY, AMDGPU::IMAGE_BVH_INTERSECT_RAY_a16},
      {AMDGPU::IMAGE_BVH64_INTERSECT_RAY,
       AMDGPU::IMAGE_BVH64_INTERSECT_RAY_a16}};

        IsGFX12Plus ? AMDGPU::MIMGEncGfx12
        : IsGFX11   ? AMDGPU::MIMGEncGfx11NSA
                    : AMDGPU::MIMGEncGfx10NSA,
        NumVDataDwords, NumVAddrDwords);

        IsGFX11 ? AMDGPU::MIMGEncGfx11Default
                : AMDGPU::MIMGEncGfx10Default,
        NumVDataDwords, NumVAddrDwords);

  if (UseNSA && IsGFX11Plus) {

      auto Unmerge = B.buildUnmerge({S32, S32, S32}, Src);
      auto Merged = B.buildMergeLikeInstr(
          V3S32, {Unmerge.getReg(0), Unmerge.getReg(1), Unmerge.getReg(2)});
      Ops.push_back(Merged.getReg(0));

    Ops.push_back(NodePtr);
    Ops.push_back(RayExtent);
    packLanes(RayOrigin);

      auto UnmergeRayDir = B.buildUnmerge({S16, S16, S16}, RayDir);
      auto UnmergeRayInvDir = B.buildUnmerge({S16, S16, S16}, RayInvDir);
      auto MergedDir = B.buildMergeLikeInstr(

              S32, B.buildMergeLikeInstr(V2S16, {UnmergeRayInvDir.getReg(0),
                                                 UnmergeRayDir.getReg(0)}))

              S32, B.buildMergeLikeInstr(V2S16, {UnmergeRayInvDir.getReg(1),
                                                 UnmergeRayDir.getReg(1)}))

              S32, B.buildMergeLikeInstr(V2S16, {UnmergeRayInvDir.getReg(2),
                                                 UnmergeRayDir.getReg(2)}))

      Ops.push_back(MergedDir.getReg(0));

      packLanes(RayInvDir);

      auto Unmerge = B.buildUnmerge({S32, S32}, NodePtr);
      Ops.push_back(Unmerge.getReg(0));
      Ops.push_back(Unmerge.getReg(1));

      Ops.push_back(NodePtr);

    Ops.push_back(RayExtent);

      auto Unmerge = B.buildUnmerge({S32, S32, S32}, Src);
      Ops.push_back(Unmerge.getReg(0));
      Ops.push_back(Unmerge.getReg(1));
      Ops.push_back(Unmerge.getReg(2));

    packLanes(RayOrigin);

      auto UnmergeRayDir = B.buildUnmerge({S16, S16, S16}, RayDir);
      auto UnmergeRayInvDir = B.buildUnmerge({S16, S16, S16}, RayInvDir);

      B.buildMergeLikeInstr(R1,
                            {UnmergeRayDir.getReg(0), UnmergeRayDir.getReg(1)});
      B.buildMergeLikeInstr(
          R2, {UnmergeRayDir.getReg(2), UnmergeRayInvDir.getReg(0)});
      B.buildMergeLikeInstr(
          R3, {UnmergeRayInvDir.getReg(1), UnmergeRayInvDir.getReg(2)});

      packLanes(RayInvDir);

    Register MergedOps = B.buildMergeLikeInstr(OpTy, Ops).getReg(0);

    Ops.push_back(MergedOps);

  auto MIB = B.buildInstr(AMDGPU::G_AMDGPU_BVH_INTERSECT_RAY)

                 .addImm(IsA16 ? 1 : 0)

  MI.eraseFromParent();
  Register DstOrigin = MI.getOperand(1).getReg();

  Register NodePtr = MI.getOperand(4).getReg();
  Register RayExtent = MI.getOperand(5).getReg();
  Register InstanceMask = MI.getOperand(6).getReg();
  Register RayOrigin = MI.getOperand(7).getReg();

  Register Offsets = MI.getOperand(9).getReg();
  Register TDescr = MI.getOperand(10).getReg();

  if (!ST.hasBVHDualAndBVH8Insts()) {

        Fn, "intrinsic not supported on subtarget", MI.getDebugLoc()));

                      Intrinsic::amdgcn_image_bvh8_intersect_ray;
  const unsigned NumVDataDwords = 10;
  const unsigned NumVAddrDwords = IsBVH8 ? 11 : 12;

      IsBVH8 ? AMDGPU::IMAGE_BVH8_INTERSECT_RAY
             : AMDGPU::IMAGE_BVH_DUAL_INTERSECT_RAY,
      AMDGPU::MIMGEncGfx12, NumVDataDwords, NumVAddrDwords);

  auto RayExtentInstanceMaskVec = B.buildMergeLikeInstr(
      V2S32, {RayExtent, B.buildAnyExt(S32, InstanceMask)});

  B.buildInstr(IsBVH8 ? AMDGPU::G_AMDGPU_BVH8_INTERSECT_RAY
                      : AMDGPU::G_AMDGPU_BVH_DUAL_INTERSECT_RAY)

      .addUse(RayExtentInstanceMaskVec.getReg(0))

  MI.eraseFromParent();
  B.buildInstr(AMDGPU::G_AMDGPU_WAVE_ADDRESS, {DstReg}, {StackPtr});
  MI.eraseFromParent();

  if (!ST.hasArchitectedSGPRs())

  auto TTMP8 = B.buildCopy(S32, Register(AMDGPU::TTMP8));
  auto LSB = B.buildConstant(S32, 25);
  auto Width = B.buildConstant(S32, 5);
  B.buildUbfx(DstReg, TTMP8, LSB, Width);
  MI.eraseFromParent();

                                            unsigned Width) const {

  if (!MRI.getRegClassOrNull(DstReg))
    MRI.setRegClass(DstReg, &AMDGPU::SReg_32RegClass);
  B.buildInstr(AMDGPU::S_GETREG_B32_const)

  MI.eraseFromParent();
  if (MRI.getType(Src) != S64)

      B.buildIntrinsic(Intrinsic::amdgcn_s_getreg, {S32},

      B.buildIntrinsic(Intrinsic::amdgcn_s_getreg, {S32},

  B.buildMergeLikeInstr(Src, {ModeReg, TrapReg});
  MI.eraseFromParent();

  if (MRI.getType(Src) != S64)

  auto Unmerge = B.buildUnmerge({S32, S32}, MI.getOperand(0));

      .addReg(Unmerge.getReg(0));

      .addReg(Unmerge.getReg(1));
  MI.eraseFromParent();
  case Intrinsic::amdgcn_if:
  case Intrinsic::amdgcn_else: {

    bool Negated = false;

      std::swap(CondBrTarget, UncondBrTarget);

      B.setInsertPt(B.getMBB(), BrCond->getIterator());
      if (IntrID == Intrinsic::amdgcn_if) {
        B.buildInstr(AMDGPU::SI_IF)
            .addMBB(UncondBrTarget);

        B.buildInstr(AMDGPU::SI_ELSE)
            .addMBB(UncondBrTarget);

        B.buildBr(*CondBrTarget);

      MRI.setRegClass(Def, TRI->getWaveMaskRegClass());
      MRI.setRegClass(Use, TRI->getWaveMaskRegClass());
      MI.eraseFromParent();
      BrCond->eraseFromParent();

  case Intrinsic::amdgcn_loop: {

    bool Negated = false;

      std::swap(CondBrTarget, UncondBrTarget);

      B.setInsertPt(B.getMBB(), BrCond->getIterator());
      B.buildInstr(AMDGPU::SI_LOOP)
          .addMBB(UncondBrTarget);

        B.buildBr(*CondBrTarget);

      MI.eraseFromParent();
      BrCond->eraseFromParent();
      MRI.setRegClass(Reg, TRI->getWaveMaskRegClass());
  case Intrinsic::amdgcn_addrspacecast_nonnull:

  case Intrinsic::amdgcn_make_buffer_rsrc:

  case Intrinsic::amdgcn_kernarg_segment_ptr:

      B.buildConstant(MI.getOperand(0).getReg(), 0);
      MI.eraseFromParent();

  case Intrinsic::amdgcn_implicitarg_ptr:

  case Intrinsic::amdgcn_workitem_id_x:

  case Intrinsic::amdgcn_workitem_id_y:

  case Intrinsic::amdgcn_workitem_id_z:

  case Intrinsic::amdgcn_workgroup_id_x:

  case Intrinsic::amdgcn_workgroup_id_y:

  case Intrinsic::amdgcn_workgroup_id_z:

  case Intrinsic::amdgcn_cluster_id_x:
    return ST.hasClusters() &&

  case Intrinsic::amdgcn_cluster_id_y:
    return ST.hasClusters() &&

  case Intrinsic::amdgcn_cluster_id_z:
    return ST.hasClusters() &&

  case Intrinsic::amdgcn_cluster_workgroup_id_x:
    return ST.hasClusters() &&

  case Intrinsic::amdgcn_cluster_workgroup_id_y:
    return ST.hasClusters() &&

  case Intrinsic::amdgcn_cluster_workgroup_id_z:
    return ST.hasClusters() &&

  case Intrinsic::amdgcn_cluster_workgroup_flat_id:
    return ST.hasClusters() &&

  case Intrinsic::amdgcn_cluster_workgroup_max_id_x:
    return ST.hasClusters() &&

  case Intrinsic::amdgcn_cluster_workgroup_max_id_y:
    return ST.hasClusters() &&

  case Intrinsic::amdgcn_cluster_workgroup_max_id_z:
    return ST.hasClusters() &&

  case Intrinsic::amdgcn_cluster_workgroup_max_flat_id:
    return ST.hasClusters() &&

  case Intrinsic::amdgcn_wave_id:

  case Intrinsic::amdgcn_lds_kernel_id:

  case Intrinsic::amdgcn_dispatch_ptr:

  case Intrinsic::amdgcn_queue_ptr:

  case Intrinsic::amdgcn_implicit_buffer_ptr:

  case Intrinsic::amdgcn_dispatch_id:

  case Intrinsic::r600_read_ngroups_x:

  case Intrinsic::r600_read_ngroups_y:

  case Intrinsic::r600_read_ngroups_z:

  case Intrinsic::r600_read_local_size_x:

  case Intrinsic::r600_read_local_size_y:

  case Intrinsic::r600_read_local_size_z:

  case Intrinsic::amdgcn_fdiv_fast:

  case Intrinsic::amdgcn_is_shared:

  case Intrinsic::amdgcn_is_private:

  case Intrinsic::amdgcn_wavefrontsize: {
    B.buildConstant(MI.getOperand(0), ST.getWavefrontSize());
    MI.eraseFromParent();
  case Intrinsic::amdgcn_s_buffer_load:

  case Intrinsic::amdgcn_raw_buffer_store:
  case Intrinsic::amdgcn_raw_ptr_buffer_store:
  case Intrinsic::amdgcn_struct_buffer_store:
  case Intrinsic::amdgcn_struct_ptr_buffer_store:

  case Intrinsic::amdgcn_raw_buffer_store_format:
  case Intrinsic::amdgcn_raw_ptr_buffer_store_format:
  case Intrinsic::amdgcn_struct_buffer_store_format:
  case Intrinsic::amdgcn_struct_ptr_buffer_store_format:

  case Intrinsic::amdgcn_raw_tbuffer_store:
  case Intrinsic::amdgcn_raw_ptr_tbuffer_store:
  case Intrinsic::amdgcn_struct_tbuffer_store:
  case Intrinsic::amdgcn_struct_ptr_tbuffer_store:

  case Intrinsic::amdgcn_raw_buffer_load:
  case Intrinsic::amdgcn_raw_ptr_buffer_load:
  case Intrinsic::amdgcn_raw_atomic_buffer_load:
  case Intrinsic::amdgcn_raw_ptr_atomic_buffer_load:
  case Intrinsic::amdgcn_struct_buffer_load:
  case Intrinsic::amdgcn_struct_ptr_buffer_load:
  case Intrinsic::amdgcn_struct_atomic_buffer_load:
  case Intrinsic::amdgcn_struct_ptr_atomic_buffer_load:

  case Intrinsic::amdgcn_raw_buffer_load_format:
  case Intrinsic::amdgcn_raw_ptr_buffer_load_format:
  case Intrinsic::amdgcn_struct_buffer_load_format:
  case Intrinsic::amdgcn_struct_ptr_buffer_load_format:

  case Intrinsic::amdgcn_raw_tbuffer_load:
  case Intrinsic::amdgcn_raw_ptr_tbuffer_load:
  case Intrinsic::amdgcn_struct_tbuffer_load:
  case Intrinsic::amdgcn_struct_ptr_tbuffer_load:

  case Intrinsic::amdgcn_raw_buffer_atomic_swap:
  case Intrinsic::amdgcn_raw_ptr_buffer_atomic_swap:
  case Intrinsic::amdgcn_struct_buffer_atomic_swap:
  case Intrinsic::amdgcn_struct_ptr_buffer_atomic_swap:
  case Intrinsic::amdgcn_raw_buffer_atomic_add:
  case Intrinsic::amdgcn_raw_ptr_buffer_atomic_add:
  case Intrinsic::amdgcn_struct_buffer_atomic_add:
  case Intrinsic::amdgcn_struct_ptr_buffer_atomic_add:
  case Intrinsic::amdgcn_raw_buffer_atomic_sub:
  case Intrinsic::amdgcn_raw_ptr_buffer_atomic_sub:
  case Intrinsic::amdgcn_struct_buffer_atomic_sub:
  case Intrinsic::amdgcn_struct_ptr_buffer_atomic_sub:
  case Intrinsic::amdgcn_raw_buffer_atomic_smin:
  case Intrinsic::amdgcn_raw_ptr_buffer_atomic_smin:
  case Intrinsic::amdgcn_struct_buffer_atomic_smin:
  case Intrinsic::amdgcn_struct_ptr_buffer_atomic_smin:
  case Intrinsic::amdgcn_raw_buffer_atomic_umin:
  case Intrinsic::amdgcn_raw_ptr_buffer_atomic_umin:
  case Intrinsic::amdgcn_struct_buffer_atomic_umin:
  case Intrinsic::amdgcn_struct_ptr_buffer_atomic_umin:
  case Intrinsic::amdgcn_raw_buffer_atomic_smax:
  case Intrinsic::amdgcn_raw_ptr_buffer_atomic_smax:
  case Intrinsic::amdgcn_struct_buffer_atomic_smax:
  case Intrinsic::amdgcn_struct_ptr_buffer_atomic_smax:
  case Intrinsic::amdgcn_raw_buffer_atomic_umax:
  case Intrinsic::amdgcn_raw_ptr_buffer_atomic_umax:
  case Intrinsic::amdgcn_struct_buffer_atomic_umax:
  case Intrinsic::amdgcn_struct_ptr_buffer_atomic_umax:
  case Intrinsic::amdgcn_raw_buffer_atomic_and:
  case Intrinsic::amdgcn_raw_ptr_buffer_atomic_and:
  case Intrinsic::amdgcn_struct_buffer_atomic_and:
  case Intrinsic::amdgcn_struct_ptr_buffer_atomic_and:
  case Intrinsic::amdgcn_raw_buffer_atomic_or:
  case Intrinsic::amdgcn_raw_ptr_buffer_atomic_or:
  case Intrinsic::amdgcn_struct_buffer_atomic_or:
  case Intrinsic::amdgcn_struct_ptr_buffer_atomic_or:
  case Intrinsic::amdgcn_raw_buffer_atomic_xor:
  case Intrinsic::amdgcn_raw_ptr_buffer_atomic_xor:
  case Intrinsic::amdgcn_struct_buffer_atomic_xor:
  case Intrinsic::amdgcn_struct_ptr_buffer_atomic_xor:
  case Intrinsic::amdgcn_raw_buffer_atomic_inc:
  case Intrinsic::amdgcn_raw_ptr_buffer_atomic_inc:
  case Intrinsic::amdgcn_struct_buffer_atomic_inc:
  case Intrinsic::amdgcn_struct_ptr_buffer_atomic_inc:
  case Intrinsic::amdgcn_raw_buffer_atomic_dec:
  case Intrinsic::amdgcn_raw_ptr_buffer_atomic_dec:
  case Intrinsic::amdgcn_struct_buffer_atomic_dec:
  case Intrinsic::amdgcn_struct_ptr_buffer_atomic_dec:
  case Intrinsic::amdgcn_raw_buffer_atomic_cmpswap:
  case Intrinsic::amdgcn_raw_ptr_buffer_atomic_cmpswap:
  case Intrinsic::amdgcn_struct_buffer_atomic_cmpswap:
  case Intrinsic::amdgcn_struct_ptr_buffer_atomic_cmpswap:
  case Intrinsic::amdgcn_raw_buffer_atomic_fmin:
  case Intrinsic::amdgcn_raw_ptr_buffer_atomic_fmin:
  case Intrinsic::amdgcn_struct_buffer_atomic_fmin:
  case Intrinsic::amdgcn_struct_ptr_buffer_atomic_fmin:
  case Intrinsic::amdgcn_raw_buffer_atomic_fmax:
  case Intrinsic::amdgcn_raw_ptr_buffer_atomic_fmax:
  case Intrinsic::amdgcn_struct_buffer_atomic_fmax:
  case Intrinsic::amdgcn_struct_ptr_buffer_atomic_fmax:
  case Intrinsic::amdgcn_raw_buffer_atomic_fadd:
  case Intrinsic::amdgcn_raw_ptr_buffer_atomic_fadd:
  case Intrinsic::amdgcn_struct_buffer_atomic_fadd:
  case Intrinsic::amdgcn_struct_ptr_buffer_atomic_fadd:

  case Intrinsic::amdgcn_rsq_clamp:

  case Intrinsic::amdgcn_image_bvh_intersect_ray:

  case Intrinsic::amdgcn_image_bvh_dual_intersect_ray:
  case Intrinsic::amdgcn_image_bvh8_intersect_ray:

  case Intrinsic::amdgcn_swmmac_f32_16x16x128_fp8_fp8:
  case Intrinsic::amdgcn_swmmac_f32_16x16x128_fp8_bf8:
  case Intrinsic::amdgcn_swmmac_f32_16x16x128_bf8_fp8:
  case Intrinsic::amdgcn_swmmac_f32_16x16x128_bf8_bf8:
  case Intrinsic::amdgcn_swmmac_f16_16x16x128_fp8_fp8:
  case Intrinsic::amdgcn_swmmac_f16_16x16x128_fp8_bf8:
  case Intrinsic::amdgcn_swmmac_f16_16x16x128_bf8_fp8:
  case Intrinsic::amdgcn_swmmac_f16_16x16x128_bf8_bf8: {

    if (MRI.getType(Index) != S64)
      MI.getOperand(5).setReg(B.buildAnyExt(S64, Index).getReg(0));

  case Intrinsic::amdgcn_swmmac_f16_16x16x32_f16:
  case Intrinsic::amdgcn_swmmac_bf16_16x16x32_bf16:
  case Intrinsic::amdgcn_swmmac_f32_16x16x32_bf16:
  case Intrinsic::amdgcn_swmmac_f32_16x16x32_f16:
  case Intrinsic::amdgcn_swmmac_f32_16x16x32_fp8_fp8:
  case Intrinsic::amdgcn_swmmac_f32_16x16x32_fp8_bf8:
  case Intrinsic::amdgcn_swmmac_f32_16x16x32_bf8_fp8:
  case Intrinsic::amdgcn_swmmac_f32_16x16x32_bf8_bf8: {

    if (MRI.getType(Index) != S32)
      MI.getOperand(5).setReg(B.buildAnyExt(S32, Index).getReg(0));

  case Intrinsic::amdgcn_swmmac_f16_16x16x64_f16:
  case Intrinsic::amdgcn_swmmac_bf16_16x16x64_bf16:
  case Intrinsic::amdgcn_swmmac_f32_16x16x64_bf16:
  case Intrinsic::amdgcn_swmmac_bf16f32_16x16x64_bf16:
  case Intrinsic::amdgcn_swmmac_f32_16x16x64_f16:
  case Intrinsic::amdgcn_swmmac_i32_16x16x128_iu8:
  case Intrinsic::amdgcn_swmmac_i32_16x16x32_iu4:
  case Intrinsic::amdgcn_swmmac_i32_16x16x32_iu8:
  case Intrinsic::amdgcn_swmmac_i32_16x16x64_iu4: {

    LLT IdxTy = IntrID == Intrinsic::amdgcn_swmmac_i32_16x16x128_iu8

    if (MRI.getType(Index) != IdxTy)
      MI.getOperand(7).setReg(B.buildAnyExt(IdxTy, Index).getReg(0));

  case Intrinsic::amdgcn_fmed3: {

    MI.setDesc(B.getTII().get(AMDGPU::G_AMDGPU_FMED3));
    MI.removeOperand(1);

  case Intrinsic::amdgcn_readlane:
  case Intrinsic::amdgcn_writelane:
  case Intrinsic::amdgcn_readfirstlane:
  case Intrinsic::amdgcn_permlane16:
  case Intrinsic::amdgcn_permlanex16:
  case Intrinsic::amdgcn_permlane64:
  case Intrinsic::amdgcn_set_inactive:
  case Intrinsic::amdgcn_set_inactive_chain_arg:
  case Intrinsic::amdgcn_mov_dpp8:
  case Intrinsic::amdgcn_update_dpp:

  case Intrinsic::amdgcn_s_buffer_prefetch_data:

  case Intrinsic::amdgcn_dead: {

    MI.eraseFromParent();
  case Intrinsic::amdgcn_cooperative_atomic_load_32x4B:
  case Intrinsic::amdgcn_cooperative_atomic_load_16x8B:
  case Intrinsic::amdgcn_cooperative_atomic_load_8x16B:
    assert(MI.hasOneMemOperand() && "Expected IRTranslator to set MemOp!");
    B.buildLoad(MI.getOperand(0), MI.getOperand(2), **MI.memoperands_begin());
    MI.eraseFromParent();

  case Intrinsic::amdgcn_cooperative_atomic_store_32x4B:
  case Intrinsic::amdgcn_cooperative_atomic_store_16x8B:
  case Intrinsic::amdgcn_cooperative_atomic_store_8x16B:
    assert(MI.hasOneMemOperand() && "Expected IRTranslator to set MemOp!");
    B.buildStore(MI.getOperand(2), MI.getOperand(1), **MI.memoperands_begin());
    MI.eraseFromParent();
static void buildBufferLoad(unsigned Opc, Register LoadDstReg, Register RSrc, Register VIndex, Register VOffset, Register SOffset, unsigned ImmOffset, unsigned Format, unsigned AuxiliaryData, MachineMemOperand *MMO, bool IsTyped, bool HasVIndex, MachineIRBuilder &B)
static LLT getPow2ScalarType(LLT Ty)
static LegalityPredicate elementTypeIsLegal(unsigned TypeIdx)
static bool isRegisterVectorType(LLT Ty)
static LegalityPredicate sizeIsMultipleOf32(unsigned TypeIdx)
static bool isRegisterType(const GCNSubtarget &ST, LLT Ty)
static bool isKnownNonNull(Register Val, MachineRegisterInfo &MRI, const AMDGPUTargetMachine &TM, unsigned AddrSpace)
Return true if the value is a known valid address, such that a null check is not necessary.
This file declares the targeting of the Machinelegalizer class for AMDGPU.
Provides AMDGPU specific target descriptions.
The AMDGPU TargetMachine interface definition for hw codegen targets.
MachineBasicBlock MachineBasicBlock::iterator DebugLoc DL
static GCRegistry::Add< ErlangGC > A("erlang", "erlang-compatible garbage collector")
static GCRegistry::Add< OcamlGC > B("ocaml", "ocaml 3.10-compatible GC")
Declares convenience wrapper classes for interpreting MachineInstr instances as specific generic oper...
const AbstractManglingParser< Derived, Alloc >::OperatorInfo AbstractManglingParser< Derived, Alloc >::Ops[]
Interface for Targets to specify which operations they can successfully select and how the others sho...
Contains matchers for matching SSA Machine Instructions.
This file declares the MachineIRBuilder class.
Register const TargetRegisterInfo * TRI
Promote Memory to Register
static unsigned getReg(const MCDisassembler *D, unsigned RC, unsigned RegNo)
const SmallVectorImpl< MachineOperand > & Cond
static cl::opt< RegAllocEvictionAdvisorAnalysisLegacy::AdvisorMode > Mode("regalloc-enable-advisor", cl::Hidden, cl::init(RegAllocEvictionAdvisorAnalysisLegacy::AdvisorMode::Default), cl::desc("Enable regalloc advisor mode"), cl::values(clEnumValN(RegAllocEvictionAdvisorAnalysisLegacy::AdvisorMode::Default, "default", "Default"), clEnumValN(RegAllocEvictionAdvisorAnalysisLegacy::AdvisorMode::Release, "release", "precompiled"), clEnumValN(RegAllocEvictionAdvisorAnalysisLegacy::AdvisorMode::Development, "development", "for training")))
#define FP_DENORM_FLUSH_NONE
Interface definition for SIInstrInfo.
Interface definition for SIRegisterInfo.
This file defines the make_scope_exit function, which executes user-defined cleanup logic at scope ex...
static TableGen::Emitter::Opt Y("gen-skeleton-entry", EmitSkeleton, "Generate example skeleton entry")
static TableGen::Emitter::OptClass< SkeletonEmitter > X("gen-skeleton-class", "Generate example skeleton class")
static constexpr int Concat[]
bool legalizeConstHwRegRead(MachineInstr &MI, MachineIRBuilder &B, AMDGPU::Hwreg::Id HwReg, unsigned LowBit, unsigned Width) const
void buildMultiply(LegalizerHelper &Helper, MutableArrayRef< Register > Accum, ArrayRef< Register > Src0, ArrayRef< Register > Src1, bool UsePartialMad64_32, bool SeparateOddAlignedProducts) const
bool legalizeGlobalValue(MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B) const
bool legalizeFSQRTF16(MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B) const
bool legalizeIntrinsicTrunc(MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B) const
std::pair< Register, unsigned > splitBufferOffsets(MachineIRBuilder &B, Register OrigOffset) const
bool legalizeBVHIntersectRayIntrinsic(MachineInstr &MI, MachineIRBuilder &B) const
bool legalizeIsAddrSpace(MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B, unsigned AddrSpace) const
bool legalizeUnsignedDIV_REM(MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B) const
bool legalizeFSQRTF32(MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B) const
bool legalizeAtomicCmpXChg(MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B) const
bool legalizeTrapHsa(MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B) const
bool legalizeBufferStore(MachineInstr &MI, LegalizerHelper &Helper, bool IsTyped, bool IsFormat) const
bool legalizeMul(LegalizerHelper &Helper, MachineInstr &MI) const
bool legalizeFFREXP(MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B) const
Register getSegmentAperture(unsigned AddrSpace, MachineRegisterInfo &MRI, MachineIRBuilder &B) const
bool legalizeFDIV64(MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B) const
bool legalizePointerAsRsrcIntrin(MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B) const
To create a buffer resource from a 64-bit pointer, mask off the upper 32 bits of the pointer and repl...
bool legalizeFlogCommon(MachineInstr &MI, MachineIRBuilder &B) const
bool getLDSKernelId(Register DstReg, MachineRegisterInfo &MRI, MachineIRBuilder &B) const
bool legalizeFExp2(MachineInstr &MI, MachineIRBuilder &B) const
bool legalizeTrap(MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B) const
bool legalizeBufferAtomic(MachineInstr &MI, MachineIRBuilder &B, Intrinsic::ID IID) const
void legalizeUnsignedDIV_REM32Impl(MachineIRBuilder &B, Register DstDivReg, Register DstRemReg, Register Num, Register Den) const
Register handleD16VData(MachineIRBuilder &B, MachineRegisterInfo &MRI, Register Reg, bool ImageStore=false) const
Handle register layout difference for f16 images for some subtargets.
bool legalizeCTLZ_CTTZ(MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B) const
bool legalizeBuildVector(MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B) const
bool legalizeFFloor(MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B) const
AMDGPULegalizerInfo(const GCNSubtarget &ST, const GCNTargetMachine &TM)
bool legalizeFDIV32(MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B) const
bool legalizeFMad(MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B) const
bool legalizeFDIV(MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B) const
bool legalizeSBufferPrefetch(LegalizerHelper &Helper, MachineInstr &MI) const
bool legalizeFExp(MachineInstr &MI, MachineIRBuilder &B) const
bool legalizeIntrinsic(LegalizerHelper &Helper, MachineInstr &MI) const override
bool legalizeFrem(MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B) const
bool legalizePreloadedArgIntrin(MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B, AMDGPUFunctionArgInfo::PreloadedValue ArgType) const
bool legalizeStore(LegalizerHelper &Helper, MachineInstr &MI) const
bool legalizeCustom(LegalizerHelper &Helper, MachineInstr &MI, LostDebugLocObserver &LocObserver) const override
Called for instructions with the Custom LegalizationAction.
bool buildPCRelGlobalAddress(Register DstReg, LLT PtrTy, MachineIRBuilder &B, const GlobalValue *GV, int64_t Offset, unsigned GAFlags=SIInstrInfo::MO_NONE) const
bool legalizeFDIV16(MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B) const
bool legalizeRsqClampIntrinsic(MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B) const
std::pair< Register, Register > getScaledLogInput(MachineIRBuilder &B, Register Src, unsigned Flags) const
bool legalizeFDIVFastIntrin(MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B) const
bool loadInputValue(Register DstReg, MachineIRBuilder &B, AMDGPUFunctionArgInfo::PreloadedValue ArgType) const
bool legalizeBVHDualOrBVH8IntersectRayIntrinsic(MachineInstr &MI, MachineIRBuilder &B) const
bool legalizeInsertVectorElt(MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B) const
bool legalizeFExpUnsafe(MachineIRBuilder &B, Register Dst, Register Src, unsigned Flags) const
bool legalizeAddrSpaceCast(MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B) const
bool legalizeBufferLoad(MachineInstr &MI, LegalizerHelper &Helper, bool IsFormat, bool IsTyped) const
bool legalizeImplicitArgPtr(MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B) const
bool legalizeMinNumMaxNum(LegalizerHelper &Helper, MachineInstr &MI) const
void legalizeUnsignedDIV_REM64Impl(MachineIRBuilder &B, Register DstDivReg, Register DstRemReg, Register Num, Register Den) const
bool legalizeDebugTrap(MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B) const
bool legalizeFastUnsafeFDIV(MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B) const
bool legalizeSinCos(MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B) const
bool legalizeWaveID(MachineInstr &MI, MachineIRBuilder &B) const
bool legalizeFroundeven(MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B) const
bool legalizeLDSKernelId(MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B) const
bool legalizeWorkGroupId(MachineInstr &MI, MachineIRBuilder &B, AMDGPUFunctionArgInfo::PreloadedValue ClusterIdPV, AMDGPUFunctionArgInfo::PreloadedValue ClusterMaxIdPV, AMDGPUFunctionArgInfo::PreloadedValue ClusterWorkGroupIdPV) const
bool legalizeSignedDIV_REM(MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B) const
bool legalizeITOFP(MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B, bool Signed) const
bool legalizeFPow(MachineInstr &MI, MachineIRBuilder &B) const
bool legalizeFastUnsafeFDIV64(MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B) const
bool legalizeFPTOI(MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B, bool Signed) const
bool legalizeStackSave(MachineInstr &MI, MachineIRBuilder &B) const
bool legalizeFlogUnsafe(MachineIRBuilder &B, Register Dst, Register Src, bool IsLog10, unsigned Flags) const
bool legalizeKernargMemParameter(MachineInstr &MI, MachineIRBuilder &B, uint64_t Offset, Align Alignment=Align(4)) const
Legalize a value that's loaded from kernel arguments.
bool legalizeImageIntrinsic(MachineInstr &MI, MachineIRBuilder &B, GISelChangeObserver &Observer, const AMDGPU::ImageDimIntrinsicInfo *ImageDimIntr) const
Rewrite image intrinsics to use register layouts expected by the subtarget.
void buildAbsGlobalAddress(Register DstReg, LLT PtrTy, MachineIRBuilder &B, const GlobalValue *GV, MachineRegisterInfo &MRI) const
bool legalizeGetFPEnv(MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B) const
bool getImplicitArgPtr(Register DstReg, MachineRegisterInfo &MRI, MachineIRBuilder &B) const
bool legalizeFSQRT(MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B) const
Register getKernargParameterPtr(MachineIRBuilder &B, int64_t Offset) const
bool legalizeSBufferLoad(LegalizerHelper &Helper, MachineInstr &MI) const
bool legalizeFceil(MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B) const
bool legalizeFSQRTF64(MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B) const
bool legalizeExtractVectorElt(MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B) const
bool legalizeLoad(LegalizerHelper &Helper, MachineInstr &MI) const
bool legalizeCTLZ_ZERO_UNDEF(MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B) const
Register fixStoreSourceType(MachineIRBuilder &B, Register VData, LLT MemTy, bool IsFormat) const
bool legalizeLaneOp(LegalizerHelper &Helper, MachineInstr &MI, Intrinsic::ID IID) const
bool legalizeSetFPEnv(MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B) const
bool legalizeWorkitemIDIntrinsic(MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B, unsigned Dim, AMDGPUFunctionArgInfo::PreloadedValue ArgType) const
void buildLoadInputValue(Register DstReg, MachineIRBuilder &B, const ArgDescriptor *Arg, const TargetRegisterClass *ArgRC, LLT ArgTy) const
bool legalizeTrapHsaQueuePtr(MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B) const
bool legalizeFlog2(MachineInstr &MI, MachineIRBuilder &B) const
bool legalizeTrapEndpgm(MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B) const
static std::optional< uint32_t > getLDSKernelIdMetadata(const Function &F)
void setDynLDSAlign(const Function &F, const GlobalVariable &GV)
unsigned allocateLDSGlobal(const DataLayout &DL, const GlobalVariable &GV)
bool isEntryFunction() const
bool isModuleEntryFunction() const
const std::array< unsigned, 3 > & getDims() const
static const fltSemantics & IEEEsingle()
static const fltSemantics & IEEEdouble()
static APFloat getSmallestNormalized(const fltSemantics &Sem, bool Negative=false)
Returns the smallest (by magnitude) normalized finite number in the given semantics.
static APFloat getLargest(const fltSemantics &Sem, bool Negative=false)
Returns the largest finite number in the given semantics.
static APFloat getInf(const fltSemantics &Sem, bool Negative=false)
Factory for Positive and Negative Infinity.
ArrayRef - Represent a constant reference to an array (0 or more elements consecutively in memory),...
size_t size() const
size - Get the array size.
@ ICMP_SLT
signed less than
@ FCMP_OLT
0 1 0 0 True if ordered and less than
@ FCMP_OGT
0 0 1 0 True if ordered and greater than
@ ICMP_UGE
unsigned greater or equal
@ ICMP_SGT
signed greater than
@ FCMP_ONE
0 1 1 0 True if ordered and operands are unequal
@ ICMP_ULT
unsigned less than
@ FCMP_OLE
0 1 0 1 True if ordered and less than or equal
@ FCMP_ORD
0 1 1 1 True if ordered (no nans)
This is the shared class of boolean and integer constants.
int64_t getSExtValue() const
Return the constant as a 64-bit integer value after it has been sign extended as appropriate for the ...
Diagnostic information for unsupported feature in backend.
static constexpr ElementCount getFixed(ScalarTy MinVal)
LLVMContext & getContext() const
getContext - Return a reference to the LLVMContext associated with this function.
Abstract class that contains various methods for clients to notify about changes.
virtual void changingInstr(MachineInstr &MI)=0
This instruction is about to be mutated in some way.
virtual void changedInstr(MachineInstr &MI)=0
This instruction was mutated in some way.
Simple wrapper observer that takes several observers, and calls each one for each event.
KnownBits getKnownBits(Register R)
bool hasExternalLinkage() const
Module * getParent()
Get the module that this global value is contained inside of...
Type * getValueType() const
static constexpr LLT float64()
Get a 64-bit IEEE double value.
constexpr unsigned getScalarSizeInBits() const
constexpr LLT changeElementType(LLT NewEltTy) const
If this type is a vector, return a vector with the same number of elements but the new element type.
static constexpr LLT vector(ElementCount EC, unsigned ScalarSizeInBits)
Get a low-level vector of some number of elements and element width.
static constexpr LLT scalar(unsigned SizeInBits)
Get a low-level scalar or aggregate "bag of bits".
constexpr uint16_t getNumElements() const
Returns the number of elements in a vector LLT.
constexpr bool isVector() const
static constexpr LLT pointer(unsigned AddressSpace, unsigned SizeInBits)
Get a low-level pointer in the given address space.
constexpr TypeSize getSizeInBits() const
Returns the total size of the type. Must only be called on sized types.
constexpr bool isPointer() const
constexpr LLT getElementType() const
Returns the vector's element type. Only valid for vector types.
constexpr LLT changeElementSize(unsigned NewEltSize) const
If this type is a vector, return a vector with the same number of elements but the new element size.
static constexpr LLT float16()
Get a 16-bit IEEE half value.
constexpr unsigned getAddressSpace() const
static constexpr LLT fixed_vector(unsigned NumElements, unsigned ScalarSizeInBits)
Get a low-level fixed-width vector of some number of elements and element width.
constexpr LLT changeElementCount(ElementCount EC) const
Return a vector or scalar with the same element type and the new element count.
constexpr LLT getScalarType() const
static constexpr LLT scalarOrVector(ElementCount EC, LLT ScalarTy)
static constexpr LLT float32()
Get a 32-bit IEEE float value.
LLVM_ABI void diagnose(const DiagnosticInfo &DI)
Report a message to the currently installed diagnostic handler.
LLVM_ABI void computeTables()
Compute any ancillary tables needed to quickly decide how an operation should be handled.
LegalizeRuleSet & minScalar(unsigned TypeIdx, const LLT Ty)
Ensure the scalar is at least as wide as Ty.
LegalizeRuleSet & legalFor(std::initializer_list< LLT > Types)
The instruction is legal when type index 0 is any type in the given list.
LegalizeRuleSet & unsupported()
The instruction is unsupported.
LegalizeRuleSet & scalarSameSizeAs(unsigned TypeIdx, unsigned SameSizeIdx)
Change the type TypeIdx to have the same scalar size as type SameSizeIdx.
LegalizeRuleSet & fewerElementsIf(LegalityPredicate Predicate, LegalizeMutation Mutation)
Remove elements to reach the type selected by the mutation if the predicate is true.
LegalizeRuleSet & clampScalarOrElt(unsigned TypeIdx, const LLT MinTy, const LLT MaxTy)
Limit the range of scalar sizes to MinTy and MaxTy.
LegalizeRuleSet & bitcastIf(LegalityPredicate Predicate, LegalizeMutation Mutation)
The specified type index is coerced if predicate is true.
LegalizeRuleSet & maxScalar(unsigned TypeIdx, const LLT Ty)
Ensure the scalar is at most as wide as Ty.
LegalizeRuleSet & minScalarOrElt(unsigned TypeIdx, const LLT Ty)
Ensure the scalar or element is at least as wide as Ty.
LegalizeRuleSet & clampMaxNumElements(unsigned TypeIdx, const LLT EltTy, unsigned MaxElements)
Limit the number of elements in EltTy vectors to at most MaxElements.
LegalizeRuleSet & unsupportedFor(std::initializer_list< LLT > Types)
LegalizeRuleSet & lower()
The instruction is lowered.
LegalizeRuleSet & moreElementsIf(LegalityPredicate Predicate, LegalizeMutation Mutation)
Add more elements to reach the type selected by the mutation if the predicate is true.
LegalizeRuleSet & lowerFor(std::initializer_list< LLT > Types)
The instruction is lowered when type index 0 is any type in the given list.
LegalizeRuleSet & lowerIf(LegalityPredicate Predicate)
The instruction is lowered if predicate is true.
LegalizeRuleSet & clampScalar(unsigned TypeIdx, const LLT MinTy, const LLT MaxTy)
Limit the range of scalar sizes to MinTy and MaxTy.
LegalizeRuleSet & custom()
Unconditionally custom lower.
LegalizeRuleSet & clampMaxNumElementsStrict(unsigned TypeIdx, const LLT EltTy, unsigned NumElts)
Express EltTy vectors strictly using vectors with NumElts elements (or scalars when NumElts equals 1)...
LegalizeRuleSet & unsupportedIf(LegalityPredicate Predicate)
LegalizeRuleSet & widenScalarIf(LegalityPredicate Predicate, LegalizeMutation Mutation)
Widen the scalar to the one selected by the mutation if the predicate is true.
LegalizeRuleSet & alwaysLegal()
LegalizeRuleSet & clampNumElements(unsigned TypeIdx, const LLT MinTy, const LLT MaxTy)
Limit the number of elements for the given vectors to at least MinTy's number of elements and at most...
LegalizeRuleSet & maxScalarIf(LegalityPredicate Predicate, unsigned TypeIdx, const LLT Ty)
Conditionally limit the maximum size of the scalar.
LegalizeRuleSet & customIf(LegalityPredicate Predicate)
LegalizeRuleSet & widenScalarToNextPow2(unsigned TypeIdx, unsigned MinSize=0)
Widen the scalar to the next power of two that is at least MinSize.
LegalizeRuleSet & scalarize(unsigned TypeIdx)
LegalizeRuleSet & legalForCartesianProduct(std::initializer_list< LLT > Types)
The instruction is legal when type indexes 0 and 1 are both in the given list.
LegalizeRuleSet & legalIf(LegalityPredicate Predicate)
The instruction is legal if predicate is true.
LegalizeRuleSet & customFor(std::initializer_list< LLT > Types)
LegalizeRuleSet & widenScalarToNextMultipleOf(unsigned TypeIdx, unsigned Size)
Widen the scalar to the next multiple of Size.
LLVM_ABI LegalizeResult lowerFMinNumMaxNum(MachineInstr &MI)
LLVM_ABI void moreElementsVectorDst(MachineInstr &MI, LLT MoreTy, unsigned OpIdx)
Legalize a single operand OpIdx of the machine instruction MI as a Def by performing it with addition...
GISelValueTracking * getValueTracking() const
@ Legalized
Instruction has been legalized and the MachineFunction changed.
GISelChangeObserver & Observer
To keep track of changes made by the LegalizerHelper.
LLVM_ABI void bitcastDst(MachineInstr &MI, LLT CastTy, unsigned OpIdx)
Legalize a single operand OpIdx of the machine instruction MI as a def by inserting a G_BITCAST from ...
LLVM_ABI LegalizeResult lowerFMad(MachineInstr &MI)
MachineIRBuilder & MIRBuilder
Expose MIRBuilder so clients can set their own RecordInsertInstruction functions.
LLVM_ABI void widenScalarDst(MachineInstr &MI, LLT WideTy, unsigned OpIdx=0, unsigned TruncOpcode=TargetOpcode::G_TRUNC)
Legalize a single operand OpIdx of the machine instruction MI as a Def by extending the operand's typ...
LegalizeRuleSet & getActionDefinitionsBuilder(unsigned Opcode)
Get the action definition builder for the given opcode.
const LegacyLegalizerInfo & getLegacyLegalizerInfo() const
TypeSize getValue() const
Wrapper class representing physical registers. Should be passed by value.
constexpr bool isPhysical() const
Return true if the specified register number is in the physical register namespace.
LLVM_ABI void addSuccessor(MachineBasicBlock *Succ, BranchProbability Prob=BranchProbability::getUnknown())
Add Succ as a successor of this MachineBasicBlock.
LLVM_ABI MachineBasicBlock * splitAt(MachineInstr &SplitInst, bool UpdateLiveIns=true, LiveIntervals *LIS=nullptr)
Split a basic block into 2 pieces at SplitPoint.
const MachineFunction * getParent() const
Return the MachineFunction containing this basic block.
MachineInstrBundleIterator< MachineInstr > iterator
const TargetSubtargetInfo & getSubtarget() const
getSubtarget - Return the subtarget for which this machine code is being compiled.
MachineMemOperand * getMachineMemOperand(MachinePointerInfo PtrInfo, MachineMemOperand::Flags f, LLT MemTy, Align base_alignment, const AAMDNodes &AAInfo=AAMDNodes(), const MDNode *Ranges=nullptr, SyncScope::ID SSID=SyncScope::System, AtomicOrdering Ordering=AtomicOrdering::NotAtomic, AtomicOrdering FailureOrdering=AtomicOrdering::NotAtomic)
getMachineMemOperand - Allocate a new MachineMemOperand.
DenormalMode getDenormalMode(const fltSemantics &FPType) const
Returns the denormal handling type for the default rounding mode of the function.
void push_back(MachineBasicBlock *MBB)
MachineRegisterInfo & getRegInfo()
getRegInfo - Return information about the registers currently in use.
Function & getFunction()
Return the LLVM function that this machine code represents.
BasicBlockListType::iterator iterator
Ty * getInfo()
getInfo - Keep track of various per-function pieces of information for backends that would like to do...
MachineBasicBlock * CreateMachineBasicBlock(const BasicBlock *BB=nullptr, std::optional< UniqueBBID > BBID=std::nullopt)
CreateMachineInstr - Allocate a new MachineInstr.
const TargetMachine & getTarget() const
getTarget - Return the target machine this machine code is compiled with
Helper class to build MachineInstr.
MachineFunction & getMF()
Getter for the function we currently build.
Register getReg(unsigned Idx) const
Get the register for the operand index.
const MachineInstrBuilder & addImm(int64_t Val) const
Add a new immediate operand.
const MachineInstrBuilder & addGlobalAddress(const GlobalValue *GV, int64_t Offset=0, unsigned TargetFlags=0) const
const MachineInstrBuilder & addMBB(MachineBasicBlock *MBB, unsigned TargetFlags=0) const
Representation of each machine instruction.
const MachineOperand & getOperand(unsigned i) const
A description of a memory reference used in the backend.
LocationSize getSize() const
Return the size in bytes of the memory reference.
LLT getMemoryType() const
Return the memory type of the memory reference.
@ MODereferenceable
The memory access is dereferenceable (i.e., doesn't trap).
@ MOLoad
The memory access reads data.
@ MOInvariant
The memory access always returns the same value (or traps).
LLVM_ABI Align getAlign() const
Return the minimum known alignment in bytes of the actual memory reference.
MachineOperand class - Representation of each machine instruction operand.
MachineBasicBlock * getMBB() const
LLVM_ABI void setReg(Register Reg)
Change the register this operand corresponds to.
void setMBB(MachineBasicBlock *MBB)
static MachineOperand CreateImm(int64_t Val)
Register getReg() const
getReg - Returns the register number.
MachineRegisterInfo - Keep track of information for virtual and physical registers,...
MutableArrayRef - Represent a mutable reference to an array (0 or more elements consecutively in memo...
MutableArrayRef< T > drop_front(size_t N=1) const
Drop the first N elements of the array.
Wrapper class representing virtual and physical registers.
constexpr bool isValid() const
constexpr bool isVirtual() const
Return true if the specified register number is in the virtual register namespace.
static unsigned getMaxMUBUFImmOffset(const GCNSubtarget &ST)
This class keeps track of the SPI_SP_INPUT_ADDR config register, which tells the hardware which inter...
bool hasWorkGroupIDZ() const
AMDGPU::ClusterDimsAttr getClusterDims() const
SIModeRegisterDefaults getMode() const
std::tuple< const ArgDescriptor *, const TargetRegisterClass *, LLT > getPreloadedValue(AMDGPUFunctionArgInfo::PreloadedValue Value) const
static LLVM_READONLY const TargetRegisterClass * getSGPRClassForBitWidth(unsigned BitWidth)
bool allowsMisalignedMemoryAccessesImpl(unsigned Size, unsigned AddrSpace, Align Alignment, MachineMemOperand::Flags Flags=MachineMemOperand::MONone, unsigned *IsFast=nullptr) const
bool shouldEmitFixup(const GlobalValue *GV) const
bool shouldUseLDSConstAddress(const GlobalValue *GV) const
bool shouldEmitPCReloc(const GlobalValue *GV) const
This class consists of common code factored out of the SmallVector class to reduce code duplication b...
void truncate(size_type N)
Like resize, but requires that N is less than size().
void push_back(const T &Elt)
This is a 'vector' (really, a variable-sized array), optimized for the case when the array is small.
Register getStackPointerRegisterToSaveRestore() const
If a physical register, this specifies the register that llvm.savestack/llvm.restorestack should save...
The instances of the Type class are immutable: once they are created, they are never changed.
A Use represents the edge between a Value definition and its users.
LLVM_ABI StringRef getName() const
Return a constant reference to the value's name.
self_iterator getIterator()
#define llvm_unreachable(msg)
Marks that the current location is not supposed to be reachable.
@ CONSTANT_ADDRESS_32BIT
Address space for 32-bit constant memory.
@ BUFFER_STRIDED_POINTER
Address space for 192-bit fat buffer pointers with an additional index.
@ REGION_ADDRESS
Address space for region memory. (GDS)
@ LOCAL_ADDRESS
Address space for local memory.
@ CONSTANT_ADDRESS
Address space for constant memory (VTX2).
@ FLAT_ADDRESS
Address space for flat memory.
@ GLOBAL_ADDRESS
Address space for global memory (RAT0, VTX0).
@ BUFFER_FAT_POINTER
Address space for 160-bit buffer fat pointers.
@ PRIVATE_ADDRESS
Address space for private memory.
@ BUFFER_RESOURCE
Address space for 128-bit buffer resources.
int getMIMGOpcode(unsigned BaseOpcode, unsigned MIMGEncoding, unsigned VDataDwords, unsigned VAddrDwords)
bool isFlatGlobalAddrSpace(unsigned AS)
bool isGFX12Plus(const MCSubtargetInfo &STI)
bool isGFX11(const MCSubtargetInfo &STI)
LLVM_READNONE bool isLegalDPALU_DPPControl(const MCSubtargetInfo &ST, unsigned DC)
unsigned getAMDHSACodeObjectVersion(const Module &M)
LLVM_READNONE constexpr bool isKernel(CallingConv::ID CC)
LLVM_READNONE constexpr bool isEntryFunctionCC(CallingConv::ID CC)
LLVM_READNONE constexpr bool isCompute(CallingConv::ID CC)
TargetExtType * isNamedBarrier(const GlobalVariable &GV)
bool isGFX11Plus(const MCSubtargetInfo &STI)
bool isGFX1250(const MCSubtargetInfo &STI)
LLVM_READONLY const MIMGBaseOpcodeInfo * getMIMGBaseOpcodeInfo(unsigned BaseOpcode)
std::pair< Register, unsigned > getBaseWithConstantOffset(MachineRegisterInfo &MRI, Register Reg, GISelValueTracking *ValueTracking=nullptr, bool CheckNUW=false)
Returns base register and constant offset.
const ImageDimIntrinsicInfo * getImageDimIntrinsicInfo(unsigned Intr)
unsigned ID
LLVM IR allows to use arbitrary numbers as calling convention identifiers.
@ MaxID
The highest possible ID. Must be some 2^k - 1.
@ AMDGPU_Gfx
Used for AMD graphics targets.
@ Fast
Attempts to make calls as fast as possible (e.g.
@ C
The default llvm calling convention, compatible with C.
LLVM_ABI LegalityPredicate scalarOrEltWiderThan(unsigned TypeIdx, unsigned Size)
True iff the specified type index is a scalar or a vector with an element type that's wider than the ...
LLVM_ABI LegalityPredicate isScalar(unsigned TypeIdx)
True iff the specified type index is a scalar.
LLVM_ABI LegalityPredicate isPointer(unsigned TypeIdx)
True iff the specified type index is a pointer (with any address space).
LLVM_ABI LegalityPredicate typeInSet(unsigned TypeIdx, std::initializer_list< LLT > TypesInit)
True iff the given type index is one of the specified types.
LLVM_ABI LegalityPredicate smallerThan(unsigned TypeIdx0, unsigned TypeIdx1)
True iff the first type index has a smaller total bit size than second type index.
LLVM_ABI LegalityPredicate largerThan(unsigned TypeIdx0, unsigned TypeIdx1)
True iff the first type index has a larger total bit size than second type index.
LLVM_ABI LegalityPredicate elementTypeIs(unsigned TypeIdx, LLT EltTy)
True if the type index is a vector with element type EltTy.
LLVM_ABI LegalityPredicate sameSize(unsigned TypeIdx0, unsigned TypeIdx1)
True iff the specified type indices are both the same bit size.
LLVM_ABI LegalityPredicate scalarOrEltNarrowerThan(unsigned TypeIdx, unsigned Size)
True iff the specified type index is a scalar or vector with an element type that's narrower than the...
LLVM_ABI LegalityPredicate sizeIs(unsigned TypeIdx, unsigned Size)
True if the total bitwidth of the specified type index is Size bits.
LegalityPredicate typeIsNot(unsigned TypeIdx, LLT Type)
True iff the given type index is not the specified type.
Predicate all(Predicate P0, Predicate P1)
True iff P0 and P1 are true.
LLVM_ABI LegalityPredicate typeIs(unsigned TypeIdx, LLT TypesInit)
True iff the given type index is the specified type.
LLVM_ABI LegalityPredicate scalarNarrowerThan(unsigned TypeIdx, unsigned Size)
True iff the specified type index is a scalar that's narrower than the given size.
LLVM_ABI LegalizeMutation scalarize(unsigned TypeIdx)
Break up the vector type for the given type index into the element type.
LLVM_ABI LegalizeMutation widenScalarOrEltToNextPow2(unsigned TypeIdx, unsigned Min=0)
Widen the scalar type or vector element type for the given type index to the next power of 2.
LLVM_ABI LegalizeMutation changeTo(unsigned TypeIdx, LLT Ty)
Select this specific type for the given type index.
@ Implicit
Not emitted register (e.g. carry, or temporary result).
Invariant opcodes: All instruction sets have these as their low opcodes.
initializer< Ty > init(const Ty &Val)
This is an optimization pass for GlobalISel generic memory operations.
LLVM_ABI Register getFunctionLiveInPhysReg(MachineFunction &MF, const TargetInstrInfo &TII, MCRegister PhysReg, const TargetRegisterClass &RC, const DebugLoc &DL, LLT RegTy=LLT())
Return a virtual register corresponding to the incoming argument register PhysReg.
unsigned Log2_32_Ceil(uint32_t Value)
Return the ceil log base 2 of the specified value, 32 if the value is zero.
LLVM_ABI Type * getTypeForLLT(LLT Ty, LLVMContext &C)
Get the type back from LLT.
LLVM_ABI MachineInstr * getOpcodeDef(unsigned Opcode, Register Reg, const MachineRegisterInfo &MRI)
See if Reg is defined by an single def instruction that is Opcode.
LLVM_ABI const ConstantFP * getConstantFPVRegVal(Register VReg, const MachineRegisterInfo &MRI)
MachineInstrBuilder BuildMI(MachineFunction &MF, const MIMetadata &MIMD, const MCInstrDesc &MCID)
Builder interface. Specify how to create the initial instruction itself.
constexpr bool isInt(int64_t x)
Checks if an integer fits into the given bit width.
detail::scope_exit< std::decay_t< Callable > > make_scope_exit(Callable &&F)
LLVM_ABI const llvm::fltSemantics & getFltSemanticForLLT(LLT Ty)
Get the appropriate floating point arithmetic semantic based on the bit size of the given scalar LLT.
std::function< std::pair< unsigned, LLT >(const LegalityQuery &)> LegalizeMutation
int bit_width(T Value)
Returns the number of bits needed to represent Value if Value is nonzero.
constexpr bool isPowerOf2_64(uint64_t Value)
Return true if the argument is a power of two > 0 (64 bit edition.)
constexpr int popcount(T Value) noexcept
Count the number of set bits in a value.
uint64_t PowerOf2Ceil(uint64_t A)
Returns the power of two which is greater than or equal to the given value.
LLVM_ABI std::optional< int64_t > getIConstantVRegSExtVal(Register VReg, const MachineRegisterInfo &MRI)
If VReg is defined by a G_CONSTANT fits in int64_t returns it.
int countr_zero(T Val)
Count number of 0's from the least significant bit to the most stopping at the first 1.
constexpr bool has_single_bit(T Value) noexcept
std::function< bool(const LegalityQuery &)> LegalityPredicate
constexpr bool isPowerOf2_32(uint32_t Value)
Return true if the argument is a power of two > 0.
bool isa(const From &Val)
isa<X> - Return true if the parameter to the template is an instance of one of the template type argu...
MutableArrayRef(T &OneElt) -> MutableArrayRef< T >
constexpr T divideCeil(U Numerator, V Denominator)
Returns the integer ceil(Numerator / Denominator).
To bit_cast(const From &from) noexcept
@ Mul
Product of integers.
@ Sub
Subtraction of integers.
uint64_t alignTo(uint64_t Size, Align A)
Returns a multiple of A needed to store Size bytes.
FunctionAddr VTableAddr Next
DWARFExpression::Operation Op
ArrayRef(const T &OneElt) -> ArrayRef< T >
LLVM_ABI void eraseInstr(MachineInstr &MI, MachineRegisterInfo &MRI, LostDebugLocObserver *LocObserver=nullptr)
decltype(auto) cast(const From &Val)
cast<X> - Return the argument parameter cast to the specified type.
LLVM_ABI std::optional< ValueAndVReg > getIConstantVRegValWithLookThrough(Register VReg, const MachineRegisterInfo &MRI, bool LookThroughInstrs=true)
If VReg is defined by a statically evaluable chain of instructions rooted on a G_CONSTANT returns its...
bool is_contained(R &&Range, const E &Element)
Returns true if Element is found in Range.
Align commonAlignment(Align A, uint64_t Offset)
Returns the alignment that satisfies both alignments.
unsigned Log2(Align A)
Returns the log2 of the alignment.
T bit_floor(T Value)
Returns the largest integral power of two no greater than Value if Value is nonzero.
constexpr uint64_t NextPowerOf2(uint64_t A)
Returns the next power of two (in 64-bits) that is strictly greater than A.
void swap(llvm::BitVector &LHS, llvm::BitVector &RHS)
Implement std::swap in terms of BitVector swap.
@ CLUSTER_WORKGROUP_MAX_ID_X
@ CLUSTER_WORKGROUP_MAX_ID_Z
@ CLUSTER_WORKGROUP_MAX_FLAT_ID
@ CLUSTER_WORKGROUP_MAX_ID_Y
static constexpr uint64_t encode(Fields... Values)
MIMGBaseOpcode BaseOpcode
This struct is a compact representation of a valid (non-zero power of two) alignment.
constexpr uint64_t value() const
This is a hole in the type system and should not be abused.
MCRegister getRegister() const
static ArgDescriptor createRegister(Register Reg, unsigned Mask=~0u)
DenormalModeKind Input
Denormal treatment kind for floating point instruction inputs in the default floating-point environme...
@ PreserveSign
The sign of a flushed-to-zero number is preserved in the sign of 0.
@ Dynamic
Denormals have unknown treatment.
static constexpr DenormalMode getPreserveSign()
static constexpr DenormalMode getIEEE()
bool isZero() const
Returns true if value is all zero.
The LegalityQuery object bundles together all the information that's needed to decide whether a given...
ArrayRef< MemDesc > MMODescrs
Operations which require memory can use this to place requirements on the memory type for each MMO.
This class contains a discriminated union of information about pointers in memory operands,...
static LLVM_ABI MachinePointerInfo getGOT(MachineFunction &MF)
Return a MachinePointerInfo record that refers to a GOT entry.
DenormalMode FP64FP16Denormals
If this is set, neither input or output denormals are flushed for both f64 and f16/v2f16 instructions...
bool IEEE
Floating point opcodes that support exception flag gathering quiet and propagate signaling NaN inputs...
DenormalMode FP32Denormals
If this is set, neither input or output denormals are flushed for most f32 instructions.