#include "llvm/IR/IntrinsicsAMDGPU.h"
#include "llvm/IR/IntrinsicsR600.h"

#define DEBUG_TYPE "amdgpu-legalinfo"

    "amdgpu-global-isel-new-legality",
    cl::desc("Use GlobalISel desired legality, rather than try to use "
             "rules compatible with selection patterns"),
  unsigned Bits = Ty.getSizeInBits();

  const LLT Ty = Query.Types[TypeIdx];

  return Ty.getNumElements() % 2 != 0 &&
         EltSize > 1 && EltSize < 32 &&
         Ty.getSizeInBits() % 32 != 0;

  const LLT Ty = Query.Types[TypeIdx];

  const LLT Ty = Query.Types[TypeIdx];
  return EltTy.getSizeInBits() == 16 && Ty.getNumElements() > 2;

  const LLT Ty = Query.Types[TypeIdx];
  return std::pair(TypeIdx,

  const LLT Ty = Query.Types[TypeIdx];
  unsigned Size = Ty.getSizeInBits();
  unsigned Pieces = (Size + 63) / 64;
  unsigned NewNumElts = (Ty.getNumElements() + 1) / Pieces;
  const LLT Ty = Query.Types[TypeIdx];
  const int Size = Ty.getSizeInBits();
  const int NextMul32 = (Size + 31) / 32;
  const int NewNumElts = (32 * NextMul32 + EltSize - 1) / EltSize;

  unsigned MemSize = Query.MMODescrs[0].MemoryTy.getSizeInBits();
  return std::make_pair(TypeIdx, LLT::scalar(MemSize));

  const LLT Ty = Query.Types[TypeIdx];
  const unsigned EltSize = Ty.getElementType().getSizeInBits();
  assert(EltSize == 32 || EltSize == 64);
  for (NewNumElts = NumElts; NewNumElts < MaxNumElts; ++NewNumElts) {
  return std::pair(TypeIdx,

  const unsigned NumElems = Ty.getElementCount().getFixedValue();

  const unsigned Size = Ty.getSizeInBits();
  const LLT Ty = Query.Types[TypeIdx];

  const LLT Ty = Query.Types[TypeIdx];
  unsigned Size = Ty.getSizeInBits();

  const LLT QueryTy = Query.Types[TypeIdx];

  const LLT QueryTy = Query.Types[TypeIdx];

  const LLT QueryTy = Query.Types[TypeIdx];

  return ((ST.useRealTrue16Insts() && Size == 16) || Size % 32 == 0) &&

  return EltSize == 16 || EltSize % 32 == 0;

  const int EltSize = Ty.getElementType().getSizeInBits();
  return EltSize == 32 || EltSize == 64 ||
         (EltSize == 16 && Ty.getNumElements() % 2 == 0) ||
         EltSize == 128 || EltSize == 256;

  LLT Ty = Query.Types[TypeIdx];

  const LLT QueryTy = Query.Types[TypeIdx];
  if (Ty.isPointerOrPointerVector())
    Ty = Ty.changeElementType(LLT::scalar(Ty.getScalarSizeInBits()));

  (ST.useRealTrue16Insts() && Ty == S16) ||

  const LLT Ty = Query.Types[TypeIdx];
  return !Ty.isVector() && Ty.getSizeInBits() > 32 &&
         Query.MMODescrs[0].MemoryTy.getSizeInBits() < Ty.getSizeInBits();

  unsigned MemSize = Query.MMODescrs[0].MemoryTy.getSizeInBits();
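// The returns below appear to come from a helper that picks the maximum
// memory access width, in bits, per address space: 128-bit DS access when
// useDS128(), wider loads than stores for buffer-like spaces, and 32-bit
// scratch unless multi-dword flat scratch addressing is available.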
                                 bool IsLoad, bool IsAtomic) {
    return ST.hasFlatScratchEnabled() ? 128 : 32;
    return ST.useDS128() ? 128 : 64;
    return IsLoad ? 512 : 128;
    return ST.hasMultiDwordFlatScratchAddressing() || IsAtomic ? 128 : 32;

  const bool IsLoad = Query.Opcode != AMDGPU::G_STORE;
  unsigned RegSize = Ty.getSizeInBits();
  unsigned AS = Query.Types[1].getAddressSpace();

  if (Ty.isVector() && MemSize != RegSize)

  if (IsLoad && MemSize < Size)
    MemSize = std::max(MemSize, Align);

  if (!ST.hasDwordx3LoadStores())

  if (AlignBits < MemSize) {
                                             Align(AlignBits / 8)))
  const unsigned Size = Ty.getSizeInBits();
  if (Ty.isPointerVector())

  unsigned EltSize = Ty.getScalarSizeInBits();
  return EltSize != 32 && EltSize != 64;

  const unsigned Size = Ty.getSizeInBits();
  if (Size != MemSizeInBits)
    return Size <= 32 && Ty.isVector();

  return Ty.isVector() && (!MemTy.isVector() || MemTy == Ty) &&

                            uint64_t AlignInBits, unsigned AddrSpace,
  if (SizeInBits == 96 && ST.hasDwordx3LoadStores())

  if (AlignInBits < RoundedSize)

      RoundedSize, AddrSpace, Align(AlignInBits / 8),

                        Query.Types[1].getAddressSpace(), Opcode);
  const unsigned NumParts = PointerTy.getSizeInBits() / 32;

  std::array<Register, 4> VectorElems;
  B.setInsertPt(B.getMBB(), ++B.getInsertPt());
  for (unsigned I = 0; I < NumParts; ++I)
    VectorElems[I] =
        B.buildExtractVectorElementConstant(S32, VectorReg, I).getReg(0);
  B.buildMergeValues(MO, VectorElems);

  B.setInsertPt(B.getMBB(), ++B.getInsertPt());
  auto Scalar = B.buildBitcast(ScalarTy, BitcastReg);
  B.buildIntToPtr(MO, Scalar);

  const unsigned NumParts = PointerTy.getSizeInBits() / 32;
  auto Unmerged = B.buildUnmerge(LLT::scalar(32), Pointer);
  for (unsigned I = 0; I < NumParts; ++I)
    PointerParts.push_back(Unmerged.getReg(I));
  return B.buildBuildVector(VectorTy, PointerParts).getReg(0);

  Register Scalar = B.buildPtrToInt(ScalarTy, Pointer).getReg(0);
  return B.buildBitcast(VectorTy, Scalar).getReg(0);
  auto GetAddrSpacePtr = [&TM](unsigned AS) {

  const LLT BufferStridedPtr =

  const LLT CodePtr = FlatPtr;

  const std::initializer_list<LLT> AddrSpaces64 = {
    GlobalPtr, ConstantPtr, FlatPtr
  };

  const std::initializer_list<LLT> AddrSpaces32 = {
    LocalPtr, PrivatePtr, Constant32Ptr, RegionPtr
  };

  const std::initializer_list<LLT> AddrSpaces128 = {RsrcPtr};

  const std::initializer_list<LLT> FPTypesBase = {

  const std::initializer_list<LLT> FPTypes16 = {

  const std::initializer_list<LLT> FPTypesPK16 = {

  const LLT MinScalarFPTy = ST.has16BitInsts() ? S16 : S32;
  if (ST.hasVOP3PInsts() && ST.hasAddNoCarryInsts() && ST.hasIntClamp()) {
    if (ST.hasScalarAddSub64()) {
          .clampMaxNumElementsStrict(0, S16, 2)
          .clampMaxNumElementsStrict(0, S16, 2)
    if (ST.hasScalarSMulU64()) {
          .clampMaxNumElementsStrict(0, S16, 2)
          .clampMaxNumElementsStrict(0, S16, 2)
        .minScalarOrElt(0, S16)
  } else if (ST.has16BitInsts()) {

        .widenScalarToNextMultipleOf(0, 32)

  if (ST.hasMad64_32())

  if (ST.hasIntClamp()) {

      {G_SDIV, G_UDIV, G_SREM, G_UREM, G_SDIVREM, G_UDIVREM})

  if (ST.hasVOP3PInsts()) {
        .clampMaxNumElements(0, S8, 2)

      {G_UADDO, G_USUBO, G_UADDE, G_SADDE, G_USUBE, G_SSUBE})

       LocalPtr, ConstantPtr, PrivatePtr, FlatPtr})

      .clampScalar(0, S16, S64);
      {G_FADD, G_FMUL, G_FMA, G_FCANONICALIZE,
       G_STRICT_FADD, G_STRICT_FMUL, G_STRICT_FMA})

  if (ST.has16BitInsts()) {
    if (ST.hasVOP3PInsts())
      FPOpActions.legalFor({S16});
    TrigActions.customFor({S16});
    FDIVActions.customFor({S16});

  if (ST.hasPackedFP32Ops()) {
    FPOpActions.legalFor({V2S32});
    FPOpActions.clampMaxNumElementsStrict(0, S32, 2);

  auto &MinNumMaxNumIeee =
  if (ST.hasVOP3PInsts()) {
    MinNumMaxNumIeee.legalFor(FPTypesPK16)
        .clampMaxNumElements(0, S16, 2)
  } else if (ST.has16BitInsts()) {
    MinNumMaxNumIeee.legalFor(FPTypes16).clampScalar(0, S16, S64).scalarize(0);
    MinNumMaxNumIeee.legalFor(FPTypesBase)

      {G_FMINNUM, G_FMAXNUM, G_FMINIMUMNUM, G_FMAXIMUMNUM});
  if (ST.hasVOP3PInsts()) {
    MinNumMaxNum.customFor(FPTypesPK16)
        .clampMaxNumElements(0, S16, 2)
        .clampScalar(0, S16, S64)
  } else if (ST.has16BitInsts()) {
    MinNumMaxNum.customFor(FPTypes16)
        .clampScalar(0, S16, S64)
    MinNumMaxNum.customFor(FPTypesBase)
        .clampScalar(0, S32, S64)

  if (ST.hasVOP3PInsts())

      .legalFor(ST.hasPackedFP32Ops(), {V2S32})
  if (ST.hasPackedFP32Ops())

  if (ST.has16BitInsts()) {

  if (ST.hasFractBug()) {

  if (ST.hasCvtPkF16F32Inst()) {
        .clampMaxNumElements(0, S16, 2);
    FPTruncActions.scalarize(0).lower();
  if (ST.has16BitInsts()) {

  if (ST.hasPackedFP32Ops())

  if (ST.hasMadF16() && ST.hasMadMacF32Insts())
    FMad.customFor({S32, S16});
  else if (ST.hasMadMacF32Insts())
    FMad.customFor({S32});
  else if (ST.hasMadF16())
    FMad.customFor({S16});

  if (ST.has16BitInsts()) {
    FRem.minScalar(0, S32)
        .clampMaxNumElements(0, S16, 2)

  if (ST.has16BitInsts())

  if (ST.has16BitInsts())

      .legalFor(ST.has16BitInsts(), {{S16, S16}})

  if (ST.has16BitInsts())
  getActionDefinitionsBuilder({G_LROUND, G_LLROUND})
      .clampScalar(0, S16, S64)

  getActionDefinitionsBuilder(G_INTRINSIC_FPTRUNC_ROUND)

  getActionDefinitionsBuilder({G_INTRINSIC_ROUND, G_FRINT, G_FNEARBYINT})

  getActionDefinitionsBuilder({G_INTRINSIC_LRINT, G_INTRINSIC_LLRINT})
      .clampScalar(0, S16, S64)

  if (ST.has16BitInsts()) {
    getActionDefinitionsBuilder(
        {G_INTRINSIC_TRUNC, G_FCEIL, G_INTRINSIC_ROUNDEVEN})
        .clampScalar(0, S16, S64)
    getActionDefinitionsBuilder(
        {G_INTRINSIC_TRUNC, G_FCEIL, G_INTRINSIC_ROUNDEVEN})
        .clampScalar(0, S32, S64)
    getActionDefinitionsBuilder(
        {G_INTRINSIC_TRUNC, G_FCEIL, G_INTRINSIC_ROUNDEVEN})
        .clampScalar(0, S32, S64)
  getActionDefinitionsBuilder(G_PTR_ADD)

  getActionDefinitionsBuilder(G_PTRMASK)
      .scalarSameSizeAs(1, 0)

  getActionDefinitionsBuilder(G_ICMP)
          .legalForCartesianProduct(
              {S1}, {S32, S64, GlobalPtr, LocalPtr, ConstantPtr, PrivatePtr,
                     FlatPtr})
          .legalForCartesianProduct(
              {S32}, {S32, S64, GlobalPtr, LocalPtr, ConstantPtr, PrivatePtr,
                      FlatPtr});
  if (ST.has16BitInsts()) {
    CmpBuilder.legalFor({{S1, S16}});

      {S1}, ST.has16BitInsts() ? FPTypes16 : FPTypesBase);

  if (ST.hasSALUFloatInsts())
  auto &ExpOps = getActionDefinitionsBuilder(G_FPOW);
  if (ST.has16BitInsts())
    ExpOps.customFor({{S32}, {S16}});
  else
    ExpOps.customFor({S32});
  ExpOps.clampScalar(0, MinScalarFPTy, S32)

  getActionDefinitionsBuilder(G_FPOWI)
      .clampScalar(0, MinScalarFPTy, S32)

  getActionDefinitionsBuilder(G_FLOG2)
      .legalFor(ST.has16BitInsts(), {S16})

  getActionDefinitionsBuilder(G_FEXP2)
      .legalFor(ST.has16BitInsts(), {S16})

  auto &LogOps =
      getActionDefinitionsBuilder({G_FLOG, G_FLOG10, G_FEXP, G_FEXP10});
  LogOps.clampScalar(0, MinScalarFPTy, S32)

  getActionDefinitionsBuilder(G_CTPOP)
      .clampScalar(0, S32, S32)
      .widenScalarToNextPow2(1, 32)
      .clampScalar(1, S32, S64)
      .widenScalarToNextPow2(0, 32);
  if (ST.has16BitInsts())
    getActionDefinitionsBuilder(G_IS_FPCLASS)
        .legalForCartesianProduct({S1}, FPTypes16)
        .widenScalarToNextPow2(1)
    getActionDefinitionsBuilder(G_IS_FPCLASS)
        .legalForCartesianProduct({S1}, FPTypesBase)
        .lowerFor({S1, S16})
        .widenScalarToNextPow2(1)

  getActionDefinitionsBuilder({G_CTLZ, G_CTTZ})
      .clampScalar(0, S32, S32)
      .clampScalar(1, S32, S64)
      .widenScalarToNextPow2(0, 32)
      .widenScalarToNextPow2(1, 32)

  getActionDefinitionsBuilder(G_CTLZ_ZERO_POISON)
      .clampScalar(0, S32, S32)
      .clampScalar(1, S32, S64)
      .widenScalarToNextPow2(0, 32)
      .widenScalarToNextPow2(1, 32);

  getActionDefinitionsBuilder(G_CTTZ_ZERO_POISON)
      .clampScalar(0, S32, S32)
      .clampScalar(1, S32, S64)
      .widenScalarToNextPow2(0, 32)
      .widenScalarToNextPow2(1, 32);

  getActionDefinitionsBuilder(G_CTLS)
      .clampScalar(0, S32, S32)
      .clampScalar(1, S32, S32);
  getActionDefinitionsBuilder(G_BITREVERSE)
      .clampScalar(0, S32, S64)
      .widenScalarToNextPow2(0);

  if (ST.has16BitInsts()) {
    getActionDefinitionsBuilder(G_BSWAP)
        .clampMaxNumElementsStrict(0, S16, 2)
        .widenScalarToNextPow2(0)
        .clampScalar(0, S16, S32)

    if (ST.hasVOP3PInsts()) {
      getActionDefinitionsBuilder(G_ABS)
          .clampMaxNumElements(0, S16, 2)
          .widenScalarToNextPow2(0)

      if (ST.hasIntMinMax64()) {
        getActionDefinitionsBuilder({G_SMIN, G_SMAX, G_UMIN, G_UMAX})
            .clampMaxNumElements(0, S16, 2)
            .widenScalarToNextPow2(0)
        getActionDefinitionsBuilder({G_SMIN, G_SMAX, G_UMIN, G_UMAX})
            .clampMaxNumElements(0, S16, 2)
            .widenScalarToNextPow2(0)

      getActionDefinitionsBuilder({G_SMIN, G_SMAX, G_UMIN, G_UMAX, G_ABS})
          .widenScalarToNextPow2(0)

    getActionDefinitionsBuilder(G_BSWAP)
        .widenScalarToNextPow2(0)
    getActionDefinitionsBuilder({G_SMIN, G_SMAX, G_UMIN, G_UMAX, G_ABS})
        .widenScalarToNextPow2(0)
  getActionDefinitionsBuilder(G_INTTOPTR)
      .legalForCartesianProduct(AddrSpaces64, {S64})
      .legalForCartesianProduct(AddrSpaces32, {S32})

  getActionDefinitionsBuilder(G_PTRTOINT)
      .legalForCartesianProduct(AddrSpaces64, {S64})
      .legalForCartesianProduct(AddrSpaces32, {S32})

  getActionDefinitionsBuilder(G_ADDRSPACE_CAST)
  const auto needToSplitMemOp = [=](const LegalityQuery &Query,
                                    bool IsLoad) -> bool {
    unsigned MemSize = Query.MMODescrs[0].MemoryTy.getSizeInBits();

    unsigned NumRegs = (MemSize + 31) / 32;

    if (!ST.hasDwordx3LoadStores())

  unsigned GlobalAlign32 = ST.hasUnalignedBufferAccessEnabled() ? 0 : 32;
  unsigned GlobalAlign16 = ST.hasUnalignedBufferAccessEnabled() ? 0 : 16;
  unsigned GlobalAlign8 = ST.hasUnalignedBufferAccessEnabled() ? 0 : 8;
  for (unsigned Op : {G_LOAD, G_STORE}) {
    const bool IsStore = Op == G_STORE;

    auto &Actions = getActionDefinitionsBuilder(Op);
    Actions.legalForTypesWithMemDesc({{S32, GlobalPtr, S32, GlobalAlign32},
                                      {S64, GlobalPtr, S64, GlobalAlign32},
                                      {S32, GlobalPtr, S8, GlobalAlign8},
                                      {S32, GlobalPtr, S16, GlobalAlign16},
                                      {S32, LocalPtr, S32, 32},
                                      {S64, LocalPtr, S64, 32},
                                      {S32, LocalPtr, S8, 8},
                                      {S32, LocalPtr, S16, 16},
                                      {S32, PrivatePtr, S32, 32},
                                      {S32, PrivatePtr, S8, 8},
                                      {S32, PrivatePtr, S16, 16},
                                      {S32, ConstantPtr, S32, GlobalAlign32},
                                      {S64, ConstantPtr, S64, GlobalAlign32},
                                      {V2S32, ConstantPtr, V2S32, GlobalAlign32}});

    Actions.legalForTypesWithMemDesc(ST.useRealTrue16Insts(),
                                     {{S16, GlobalPtr, S8, GlobalAlign8},
                                      {S16, GlobalPtr, S16, GlobalAlign16},
                                      {S16, LocalPtr, S8, 8},
                                      {S16, LocalPtr, S16, 16},
                                      {S16, PrivatePtr, S8, 8},
                                      {S16, PrivatePtr, S16, 16}});

    Actions.unsupportedIf(
        typeInSet(1, {BufferFatPtr, BufferStridedPtr, RsrcPtr}));

    Actions.customIf(typeIs(1, Constant32Ptr));

          return !Query.Types[0].isVector() &&
                 needToSplitMemOp(Query, Op == G_LOAD);
        [=](const LegalityQuery &Query) -> std::pair<unsigned, LLT> {
          unsigned MemSize = Query.MMODescrs[0].MemoryTy.getSizeInBits();

          if (DstSize > MemSize)

          if (MemSize > MaxSize)

          return Query.Types[0].isVector() &&
                 needToSplitMemOp(Query, Op == G_LOAD);
        [=](const LegalityQuery &Query) -> std::pair<unsigned, LLT> {
          unsigned MemSize = Query.MMODescrs[0].MemoryTy.getSizeInBits();
          if (MemSize > MaxSize) {
            if (MaxSize % EltSize == 0) {
              unsigned NumPieces = MemSize / MaxSize;

              if (NumPieces == 1 || NumPieces >= NumElts ||
                  NumElts % NumPieces != 0)
                return std::pair(0, EltTy);

            return std::pair(0, EltTy);

          return std::pair(0, EltTy);

        .widenScalarToNextPow2(0)
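// Extending loads always produce at least a 32-bit result, so only s32
// results with 8- and 16-bit memory types need to be enumerated below.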
  auto &ExtLoads = getActionDefinitionsBuilder({G_SEXTLOAD, G_ZEXTLOAD})
                       .legalForTypesWithMemDesc({{S32, GlobalPtr, S8, 8},
                                                  {S32, GlobalPtr, S16, 2 * 8},
                                                  {S32, LocalPtr, S8, 8},
                                                  {S32, LocalPtr, S16, 16},
                                                  {S32, PrivatePtr, S8, 8},
                                                  {S32, PrivatePtr, S16, 16},
                                                  {S32, ConstantPtr, S8, 8},
                                                  {S32, ConstantPtr, S16, 2 * 8}})

  if (ST.hasFlatAddressSpace()) {
    ExtLoads.legalForTypesWithMemDesc(
        {{S32, FlatPtr, S8, 8}, {S32, FlatPtr, S16, 16}});

  ExtLoads.customIf(typeIs(1, Constant32Ptr));

  ExtLoads.narrowScalarIf(

  ExtLoads.clampScalar(0, S32, S32)
      .widenScalarToNextPow2(0)
  auto &Atomics = getActionDefinitionsBuilder(
      {G_ATOMICRMW_XCHG, G_ATOMICRMW_ADD, G_ATOMICRMW_SUB,
       G_ATOMICRMW_AND, G_ATOMICRMW_OR, G_ATOMICRMW_XOR,
       G_ATOMICRMW_MAX, G_ATOMICRMW_MIN, G_ATOMICRMW_UMAX,
       G_ATOMICRMW_UMIN, G_ATOMICRMW_UINC_WRAP, G_ATOMICRMW_UDEC_WRAP})
      .legalFor({{S32, GlobalPtr}, {S32, LocalPtr},
                 {S64, GlobalPtr}, {S64, LocalPtr},
                 {S32, RegionPtr}, {S64, RegionPtr}});
  if (ST.hasFlatAddressSpace()) {
    Atomics.legalFor({{S32, FlatPtr}, {S64, FlatPtr}});

  auto &Atomics32 =
      getActionDefinitionsBuilder({G_ATOMICRMW_USUB_COND, G_ATOMICRMW_USUB_SAT})
          .legalFor({{S32, GlobalPtr}, {S32, LocalPtr}, {S32, RegionPtr}});
  if (ST.hasFlatAddressSpace()) {
    Atomics32.legalFor({{S32, FlatPtr}});

  auto &Atomic = getActionDefinitionsBuilder(G_ATOMICRMW_FADD);
  if (ST.hasLDSFPAtomicAddF32()) {
    Atomic.legalFor({{S32, LocalPtr}, {S32, RegionPtr}});
    if (ST.hasLdsAtomicAddF64())
      Atomic.legalFor({{S64, LocalPtr}});
    if (ST.hasAtomicDsPkAdd16Insts())
      Atomic.legalFor({{V2F16, LocalPtr}, {V2BF16, LocalPtr}});

  if (ST.hasAtomicFaddInsts())
    Atomic.legalFor({{S32, GlobalPtr}});
  if (ST.hasFlatAtomicFaddF32Inst())
    Atomic.legalFor({{S32, FlatPtr}});

  if (ST.hasGFX90AInsts() || ST.hasGFX1250Insts()) {

  if (ST.hasAtomicBufferGlobalPkAddF16NoRtnInsts() ||
      ST.hasAtomicBufferGlobalPkAddF16Insts())
    Atomic.legalFor({{V2F16, GlobalPtr}, {V2F16, BufferFatPtr}});
  if (ST.hasAtomicGlobalPkAddBF16Inst())
    Atomic.legalFor({{V2BF16, GlobalPtr}});
  if (ST.hasAtomicFlatPkAdd16Insts())
    Atomic.legalFor({{V2F16, FlatPtr}, {V2BF16, FlatPtr}});

  auto &AtomicFMinFMax =
      getActionDefinitionsBuilder({G_ATOMICRMW_FMIN, G_ATOMICRMW_FMAX})
          .legalFor({{F32, LocalPtr}, {F64, LocalPtr}});

  if (ST.hasAtomicFMinFMaxF32GlobalInsts())
    AtomicFMinFMax.legalFor({{F32, GlobalPtr}, {F32, BufferFatPtr}});
  if (ST.hasAtomicFMinFMaxF64GlobalInsts())
    AtomicFMinFMax.legalFor({{F64, GlobalPtr}, {F64, BufferFatPtr}});
  if (ST.hasAtomicFMinFMaxF32FlatInsts())
    AtomicFMinFMax.legalFor({F32, FlatPtr});
  if (ST.hasAtomicFMinFMaxF64FlatInsts())
    AtomicFMinFMax.legalFor({F64, FlatPtr});

  getActionDefinitionsBuilder(G_ATOMIC_CMPXCHG)
      .customFor({{S32, GlobalPtr}, {S64, GlobalPtr},
                  {S32, FlatPtr}, {S64, FlatPtr}})
      .legalFor({{S32, LocalPtr}, {S64, LocalPtr},
                 {S32, RegionPtr}, {S64, RegionPtr}});
  getActionDefinitionsBuilder(G_SELECT)
                 LocalPtr, FlatPtr, PrivatePtr,
      .clampScalar(0, S16, S64)
      .clampMaxNumElements(0, S32, 2)
      .clampMaxNumElements(0, LocalPtr, 2)
      .clampMaxNumElements(0, PrivatePtr, 2)
      .widenScalarToNextPow2(0)

  auto &Shifts = getActionDefinitionsBuilder({G_SHL, G_LSHR, G_ASHR})
  if (ST.has16BitInsts()) {
    if (ST.hasVOP3PInsts()) {
          .clampMaxNumElements(0, S16, 2);
      Shifts.legalFor({{S16, S16}});

    Shifts.widenScalarIf(
          const LLT AmountTy = Query.Types[1];
    Shifts.clampScalar(1, S32, S32);
    Shifts.widenScalarToNextPow2(0, 16);
    Shifts.clampScalar(0, S16, S64);

    getActionDefinitionsBuilder({G_SSHLSAT, G_USHLSAT})

    Shifts.clampScalar(1, S32, S32);
    Shifts.widenScalarToNextPow2(0, 32);
    Shifts.clampScalar(0, S32, S64);

    getActionDefinitionsBuilder({G_SSHLSAT, G_USHLSAT})
  Shifts.scalarize(0);
  for (unsigned Op : {G_EXTRACT_VECTOR_ELT, G_INSERT_VECTOR_ELT}) {
    unsigned VecTypeIdx = Op == G_EXTRACT_VECTOR_ELT ? 1 : 0;
    unsigned EltTypeIdx = Op == G_EXTRACT_VECTOR_ELT ? 0 : 1;
    unsigned IdxTypeIdx = 2;

    getActionDefinitionsBuilder(Op)
          const LLT EltTy = Query.Types[EltTypeIdx];
          const LLT VecTy = Query.Types[VecTypeIdx];
          const LLT IdxTy = Query.Types[IdxTypeIdx];
          const bool isLegalVecType =
          return (EltSize == 32 || EltSize == 64) &&

          const LLT EltTy = Query.Types[EltTypeIdx];
          const LLT VecTy = Query.Types[VecTypeIdx];
          const unsigned TargetEltSize = DstEltSize % 64 == 0 ? 64 : 32;
          return std::pair(VecTypeIdx,
        .clampScalar(EltTypeIdx, S32, S64)
        .clampScalar(VecTypeIdx, S32, S64)
        .clampScalar(IdxTypeIdx, S32, S32)
        .clampMaxNumElements(VecTypeIdx, S32, 32)

  getActionDefinitionsBuilder(G_EXTRACT_VECTOR_ELT)
        const LLT &EltTy = Query.Types[1].getElementType();
        return Query.Types[0] != EltTy;

  for (unsigned Op : {G_EXTRACT, G_INSERT}) {
    unsigned BigTyIdx = Op == G_EXTRACT ? 1 : 0;
    unsigned LitTyIdx = Op == G_EXTRACT ? 0 : 1;

    getActionDefinitionsBuilder(Op)
          const LLT BigTy = Query.Types[BigTyIdx];

          const LLT LitTy = Query.Types[LitTyIdx];
        .widenScalarToNextPow2(BigTyIdx, 32)

          const LLT BigTy = Query.Types[BigTyIdx];
          const LLT LitTy = Query.Types[LitTyIdx];
  getActionDefinitionsBuilder(G_BUILD_VECTOR)

  if (ST.hasScalarPackInsts()) {
        .minScalarOrElt(0, S16)
    getActionDefinitionsBuilder(G_BUILD_VECTOR_TRUNC)
    BuildVector.customFor({V2S16, S16});
    BuildVector.minScalarOrElt(0, S32);
    getActionDefinitionsBuilder(G_BUILD_VECTOR_TRUNC)

  getActionDefinitionsBuilder(G_CONCAT_VECTORS)
      .clampMaxNumElements(0, S32, 32)
      .clampMaxNumElements(1, S16, 2)
      .clampMaxNumElements(0, S16, 64);

  getActionDefinitionsBuilder(G_SHUFFLE_VECTOR).lower();
  for (unsigned Op : {G_MERGE_VALUES, G_UNMERGE_VALUES}) {
    unsigned BigTyIdx = Op == G_MERGE_VALUES ? 0 : 1;
    unsigned LitTyIdx = Op == G_MERGE_VALUES ? 1 : 0;

    auto notValidElt = [=](const LegalityQuery &Query, unsigned TypeIdx) {
      const LLT Ty = Query.Types[TypeIdx];

    getActionDefinitionsBuilder(Op)
          const LLT BigTy = Query.Types[BigTyIdx];
        .widenScalarToNextPow2(LitTyIdx, 16)
        .clampScalar(LitTyIdx, S32, S512)
        .widenScalarToNextPow2(LitTyIdx, 32)
          return notValidElt(Query, LitTyIdx);
          return notValidElt(Query, BigTyIdx);

    if (Op == G_MERGE_VALUES) {
      Builder.widenScalarIf(
            const LLT Ty = Query.Types[LitTyIdx];

    Builder.widenScalarIf(
          const LLT Ty = Query.Types[BigTyIdx];
          if (NewSizeInBits >= 256) {
            if (RoundedTo < NewSizeInBits)
              NewSizeInBits = RoundedTo;
          return std::pair(BigTyIdx, LLT::scalar(NewSizeInBits));
  auto &SextInReg = getActionDefinitionsBuilder(G_SEXT_INREG)
                        .legalFor({{S32}, {S64}})
                        .clampScalar(0, S32, S64);

  if (ST.hasVOP3PInsts()) {
    SextInReg.lowerFor({{V2S16}})
        .clampMaxNumElementsStrict(0, S16, 2);
  } else if (ST.has16BitInsts()) {
    SextInReg.lowerFor({{S32}, {S64}, {S16}});
    SextInReg.lowerFor({{S32}, {S64}});

      .clampScalar(0, S32, S64)

  getActionDefinitionsBuilder({G_ROTR, G_ROTL})

  auto &FSHRActionDefs = getActionDefinitionsBuilder(G_FSHR);
  FSHRActionDefs.legalFor({{S32, S32}})
      .clampMaxNumElementsStrict(0, S16, 2);
  if (ST.hasVOP3PInsts())
  FSHRActionDefs.scalarize(0).lower();

  if (ST.hasVOP3PInsts()) {
    getActionDefinitionsBuilder(G_FSHL)
        .clampMaxNumElementsStrict(0, S16, 2)
    getActionDefinitionsBuilder(G_FSHL)

  getActionDefinitionsBuilder(G_READCYCLECOUNTER)

  getActionDefinitionsBuilder(G_READSTEADYCOUNTER).legalFor({S64});
  getActionDefinitionsBuilder(G_FENCE)

  getActionDefinitionsBuilder({G_SMULO, G_UMULO})

  getActionDefinitionsBuilder({G_SBFX, G_UBFX})
      .clampScalar(1, S32, S32)
      .clampScalar(0, S32, S64)
      .widenScalarToNextPow2(0)

  getActionDefinitionsBuilder(
      G_ATOMIC_CMPXCHG_WITH_SUCCESS, G_ATOMICRMW_NAND, G_ATOMICRMW_FSUB,
      G_READ_REGISTER, G_WRITE_REGISTER,

  if (ST.hasIEEEMinimumMaximumInsts()) {
    getActionDefinitionsBuilder({G_FMINIMUM, G_FMAXIMUM})
        .legalFor(FPTypesPK16)
        .clampMaxNumElements(0, S16, 2)
  } else if (ST.hasVOP3PInsts()) {
    getActionDefinitionsBuilder({G_FMINIMUM, G_FMAXIMUM})
        .clampMaxNumElementsStrict(0, S16, 2)
    getActionDefinitionsBuilder({G_FMINIMUM, G_FMAXIMUM})
        .clampScalar(0, S32, S64)

  getActionDefinitionsBuilder({G_MEMCPY, G_MEMCPY_INLINE, G_MEMMOVE, G_MEMSET})

  getActionDefinitionsBuilder({G_TRAP, G_DEBUGTRAP}).custom();

  getActionDefinitionsBuilder({G_VASTART, G_VAARG, G_BRJT, G_JUMP_TABLE,
                               G_INDEXED_LOAD, G_INDEXED_SEXTLOAD,
                               G_INDEXED_ZEXTLOAD, G_INDEXED_STORE})

  getActionDefinitionsBuilder(G_PREFETCH).alwaysLegal();

  getActionDefinitionsBuilder(
      {G_VECREDUCE_SMIN, G_VECREDUCE_SMAX, G_VECREDUCE_UMIN, G_VECREDUCE_UMAX,
       G_VECREDUCE_ADD, G_VECREDUCE_MUL, G_VECREDUCE_FMUL, G_VECREDUCE_FMIN,
       G_VECREDUCE_FMAX, G_VECREDUCE_FMINIMUM, G_VECREDUCE_FMAXIMUM,
       G_VECREDUCE_OR, G_VECREDUCE_AND, G_VECREDUCE_XOR})

  getLegacyLegalizerInfo().computeTables();
  switch (MI.getOpcode()) {
  case TargetOpcode::G_ADDRSPACE_CAST:
  case TargetOpcode::G_INTRINSIC_ROUNDEVEN:
  case TargetOpcode::G_FCEIL:
  case TargetOpcode::G_FREM:
  case TargetOpcode::G_INTRINSIC_TRUNC:
  case TargetOpcode::G_SITOFP:
  case TargetOpcode::G_UITOFP:
  case TargetOpcode::G_FPTOSI:
  case TargetOpcode::G_FPTOUI:
  case TargetOpcode::G_FMINNUM:
  case TargetOpcode::G_FMAXNUM:
  case TargetOpcode::G_FMINIMUMNUM:
  case TargetOpcode::G_FMAXIMUMNUM:
  case TargetOpcode::G_EXTRACT:
  case TargetOpcode::G_INSERT:
  case TargetOpcode::G_EXTRACT_VECTOR_ELT:
  case TargetOpcode::G_INSERT_VECTOR_ELT:
  case TargetOpcode::G_FSIN:
  case TargetOpcode::G_FCOS:
  case TargetOpcode::G_GLOBAL_VALUE:
  case TargetOpcode::G_LOAD:
  case TargetOpcode::G_SEXTLOAD:
  case TargetOpcode::G_ZEXTLOAD:
  case TargetOpcode::G_STORE:
  case TargetOpcode::G_FMAD:
  case TargetOpcode::G_FDIV:
  case TargetOpcode::G_FFREXP:
  case TargetOpcode::G_FSQRT:
  case TargetOpcode::G_UDIV:
  case TargetOpcode::G_UREM:
  case TargetOpcode::G_UDIVREM:
  case TargetOpcode::G_SDIV:
  case TargetOpcode::G_SREM:
  case TargetOpcode::G_SDIVREM:
  case TargetOpcode::G_ATOMIC_CMPXCHG:
  case TargetOpcode::G_FLOG2:
  case TargetOpcode::G_FLOG:
  case TargetOpcode::G_FLOG10:
  case TargetOpcode::G_FEXP2:
  case TargetOpcode::G_FEXP:
  case TargetOpcode::G_FEXP10:
  case TargetOpcode::G_FPOW:
  case TargetOpcode::G_FFLOOR:
  case TargetOpcode::G_BUILD_VECTOR:
  case TargetOpcode::G_BUILD_VECTOR_TRUNC:
  case TargetOpcode::G_MUL:
  case TargetOpcode::G_CTLZ:
  case TargetOpcode::G_CTTZ:
  case TargetOpcode::G_CTLS:
  case TargetOpcode::G_CTLZ_ZERO_POISON:
  case TargetOpcode::G_STACKSAVE:
  case TargetOpcode::G_GET_FPENV:
  case TargetOpcode::G_SET_FPENV:
  case TargetOpcode::G_TRAP:
  case TargetOpcode::G_DEBUGTRAP:
  if (ST.hasApertureRegs()) {
                                     ? AMDGPU::SRC_SHARED_BASE
                                     : AMDGPU::SRC_PRIVATE_BASE;
    assert((ApertureRegNo != AMDGPU::SRC_PRIVATE_BASE ||
            !ST.hasGloballyAddressableScratch()) &&
           "Cannot use src_private_base with globally addressable scratch!");
    B.buildCopy({Dst}, {Register(ApertureRegNo)});
    return B.buildUnmerge(S32, Dst).getReg(1);

      ST.getTargetLowering()->getImplicitParameterOffset(B.getMF(), Param);

  B.buildObjectPtrOffset(LoadAddr, KernargPtrReg,
  return B.buildLoad(S32, LoadAddr, *MMO).getReg(0);

  B.buildObjectPtrOffset(
      B.buildConstant(LLT::scalar(64), StructOffset).getReg(0));
  return B.buildLoad(S32, LoadAddr, *MMO).getReg(0);
  switch (Def->getOpcode()) {
  case AMDGPU::G_FRAME_INDEX:
  case AMDGPU::G_GLOBAL_VALUE:
  case AMDGPU::G_BLOCK_ADDR:
  case AMDGPU::G_CONSTANT: {
    const ConstantInt *CI = Def->getOperand(1).getCImm();

  assert(MI.getOpcode() == TargetOpcode::G_ADDRSPACE_CAST ||
         Intrinsic::amdgcn_addrspacecast_nonnull));
                 : MI.getOperand(1).getReg();

  unsigned SrcAS = SrcTy.getAddressSpace();

    MI.setDesc(B.getTII().get(TargetOpcode::G_BITCAST));
  auto castFlatToLocalOrPrivate = [&](const DstOp &Dst) -> Register {
        ST.hasGloballyAddressableScratch()) {
      Register SrcLo = B.buildExtract(S32, Src, 0).getReg(0);
          B.buildInstr(AMDGPU::S_MOV_B32, {S32},
                       {Register(AMDGPU::SRC_FLAT_SCRATCH_BASE_LO)})
      MRI.setRegClass(FlatScratchBaseLo, &AMDGPU::SReg_32RegClass);
      return B.buildIntToPtr(Dst, Sub).getReg(0);

    return B.buildExtract(Dst, Src, 0).getReg(0);

    castFlatToLocalOrPrivate(Dst);
    MI.eraseFromParent();

    auto SegmentNull = B.buildConstant(DstTy, NullVal);
    auto FlatNull = B.buildConstant(SrcTy, 0);

    auto PtrLo32 = castFlatToLocalOrPrivate(DstTy);

    B.buildSelect(Dst, CmpRes, PtrLo32, SegmentNull.getReg(0));
    MI.eraseFromParent();
  auto castLocalOrPrivateToFlat = [&](const DstOp &Dst) -> Register {
    Register SrcAsInt = B.buildPtrToInt(S32, Src).getReg(0);

        ST.hasGloballyAddressableScratch()) {
      ThreadID = B.buildIntrinsic(Intrinsic::amdgcn_mbcnt_lo, {S32})
      if (ST.isWave64()) {
        ThreadID = B.buildIntrinsic(Intrinsic::amdgcn_mbcnt_hi, {S32})
          B.buildConstant(S32, 57 - 32 - ST.getWavefrontSizeLog2()).getReg(0);
      Register SrcHi = B.buildShl(S32, ThreadID, ShAmt).getReg(0);
          B.buildMergeLikeInstr(DstTy, {SrcAsInt, SrcHi}).getReg(0);
          B.buildInstr(AMDGPU::S_MOV_B64, {S64},
                       {Register(AMDGPU::SRC_FLAT_SCRATCH_BASE)})
      MRI.setRegClass(FlatScratchBase, &AMDGPU::SReg_64RegClass);
      return B.buildPtrAdd(Dst, CvtPtr, FlatScratchBase).getReg(0);

    return B.buildMergeLikeInstr(Dst, {SrcAsInt, ApertureReg}).getReg(0);

    castLocalOrPrivateToFlat(Dst);
    MI.eraseFromParent();

    Register BuildPtr = castLocalOrPrivateToFlat(DstTy);

        SegmentNull.getReg(0));
    B.buildSelect(Dst, CmpRes, BuildPtr, FlatNull);
    MI.eraseFromParent();

      SrcTy.getSizeInBits() == 64) {
    B.buildExtract(Dst, Src, 0);
    MI.eraseFromParent();

  uint32_t AddrHiVal = Info->get32BitAddressHighBits();
  auto PtrLo = B.buildPtrToInt(S32, Src);
  if (AddrHiVal == 0) {
    B.buildIntToPtr(Dst, Zext);

    auto HighAddr = B.buildConstant(S32, AddrHiVal);
    B.buildMergeLikeInstr(Dst, {PtrLo, HighAddr});

  MI.eraseFromParent();
  MI.eraseFromParent();

  assert(Ty.isScalar() && Ty.getSizeInBits() == 64);

  auto C1 = B.buildFConstant(Ty, C1Val);
  auto CopySign = B.buildFCopysign(Ty, C1, Src);

  auto Tmp1 = B.buildFAdd(Ty, Src, CopySign);
  auto Tmp2 = B.buildFSub(Ty, Tmp1, CopySign);

  auto C2 = B.buildFConstant(Ty, C2Val);
  auto Fabs = B.buildFAbs(Ty, Src);

  B.buildSelect(MI.getOperand(0).getReg(), Cond, Src, Tmp2);
  MI.eraseFromParent();
  auto Trunc = B.buildIntrinsicTrunc(S64, Src);

  const auto Zero = B.buildFConstant(S64, 0.0);
  const auto One = B.buildFConstant(S64, 1.0);

  auto And = B.buildAnd(S1, Lt0, NeTrunc);
  auto Add = B.buildSelect(S64, And, One, Zero);

  B.buildFAdd(MI.getOperand(0).getReg(), Trunc, Add);
  MI.eraseFromParent();

  Register Src0Reg = MI.getOperand(1).getReg();
  Register Src1Reg = MI.getOperand(2).getReg();
  auto Flags = MI.getFlags();

  auto Div = B.buildFDiv(Ty, Src0Reg, Src1Reg, Flags);
  auto Trunc = B.buildIntrinsicTrunc(Ty, Div, Flags);
  auto Neg = B.buildFNeg(Ty, Trunc, Flags);
  B.buildFMA(DstReg, Neg, Src1Reg, Src0Reg, Flags);
  MI.eraseFromParent();
  const unsigned FractBits = 52;
  const unsigned ExpBits = 11;

  auto Const0 = B.buildConstant(S32, FractBits - 32);
  auto Const1 = B.buildConstant(S32, ExpBits);

  auto ExpPart = B.buildIntrinsic(Intrinsic::amdgcn_ubfe, {S32})
                     .addUse(Const0.getReg(0))
                     .addUse(Const1.getReg(0));

  return B.buildSub(S32, ExpPart, B.buildConstant(S32, 1023));

  auto Unmerge = B.buildUnmerge({S32, S32}, Src);

  const unsigned FractBits = 52;

  const auto SignBitMask = B.buildConstant(S32, UINT32_C(1) << 31);
  auto SignBit = B.buildAnd(S32, Hi, SignBitMask);

  const auto FractMask = B.buildConstant(S64, (UINT64_C(1) << FractBits) - 1);

  const auto Zero32 = B.buildConstant(S32, 0);

  auto SignBit64 = B.buildMergeLikeInstr(S64, {Zero32, SignBit});

  auto Shr = B.buildAShr(S64, FractMask, Exp);
  auto Not = B.buildNot(S64, Shr);
  auto Tmp0 = B.buildAnd(S64, Src, Not);
  auto FiftyOne = B.buildConstant(S32, FractBits - 1);

  auto Tmp1 = B.buildSelect(S64, ExpLt0, SignBit64, Tmp0);
  B.buildSelect(MI.getOperand(0).getReg(), ExpGt51, Src, Tmp1);
  MI.eraseFromParent();
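// 64-bit integer to f64: convert each 32-bit half separately, scale the high
// half by 2^32 with ldexp, and add the low half.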
  auto Unmerge = B.buildUnmerge({S32, S32}, Src);
  auto ThirtyTwo = B.buildConstant(S32, 32);

  auto CvtHi = Signed ? B.buildSITOFP(S64, Unmerge.getReg(1))
                      : B.buildUITOFP(S64, Unmerge.getReg(1));

  auto CvtLo = B.buildUITOFP(S64, Unmerge.getReg(0));
  auto LdExp = B.buildFLdexp(S64, CvtHi, ThirtyTwo);

  B.buildFAdd(Dst, LdExp, CvtLo);
  MI.eraseFromParent();

  auto One = B.buildConstant(S32, 1);

  auto ThirtyOne = B.buildConstant(S32, 31);
  auto X = B.buildXor(S32, Unmerge.getReg(0), Unmerge.getReg(1));
  auto OppositeSign = B.buildAShr(S32, X, ThirtyOne);
  auto MaxShAmt = B.buildAdd(S32, ThirtyTwo, OppositeSign);
  auto LS = B.buildIntrinsic(Intrinsic::amdgcn_sffbh, {S32})
                .addUse(Unmerge.getReg(1));
  auto LS2 = B.buildSub(S32, LS, One);
  ShAmt = B.buildUMin(S32, LS2, MaxShAmt);
    ShAmt = B.buildCTLZ(S32, Unmerge.getReg(1));
  auto Norm = B.buildShl(S64, Src, ShAmt);
  auto Unmerge2 = B.buildUnmerge({S32, S32}, Norm);
  auto Adjust = B.buildUMin(S32, One, Unmerge2.getReg(0));
  auto Norm2 = B.buildOr(S32, Unmerge2.getReg(1), Adjust);
  auto FVal = Signed ? B.buildSITOFP(S32, Norm2) : B.buildUITOFP(S32, Norm2);
  auto Scale = B.buildSub(S32, ThirtyTwo, ShAmt);
  B.buildFLdexp(Dst, FVal, Scale);
  MI.eraseFromParent();
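// f64/f32 to 64-bit integer: split the truncated value into a high part
// scaled by 2^-32 and a low remainder recovered with floor and an FMA,
// convert both halves to u32, and re-apply the sign for the signed case.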
  unsigned Flags = MI.getFlags();

  auto Trunc = B.buildIntrinsicTrunc(SrcLT, Src, Flags);

    Sign = B.buildAShr(S32, Src, B.buildConstant(S32, 31));
    Trunc = B.buildFAbs(S32, Trunc, Flags);

    K0 = B.buildFConstant(
    K1 = B.buildFConstant(
    K0 = B.buildFConstant(
    K1 = B.buildFConstant(

  auto Mul = B.buildFMul(SrcLT, Trunc, K0, Flags);
  auto FloorMul = B.buildFFloor(SrcLT, Mul, Flags);
  auto Fma = B.buildFMA(SrcLT, FloorMul, K1, Trunc, Flags);

                : B.buildFPTOUI(S32, FloorMul);
  auto Lo = B.buildFPTOUI(S32, Fma);

    Sign = B.buildMergeLikeInstr(S64, {Sign, Sign});

    B.buildSub(Dst, B.buildXor(S64, B.buildMergeLikeInstr(S64, {Lo, Hi}), Sign),

    B.buildMergeLikeInstr(Dst, {Lo, Hi});
  MI.eraseFromParent();
  unsigned StartIdx = Offset / 32;

  auto Unmerge = B.buildUnmerge(LLT::scalar(32), SrcReg);

  if (DstCount == 1) {
    B.buildIntToPtr(DstReg, Unmerge.getReg(StartIdx));

    for (unsigned I = 0; I < DstCount; ++I)
      MergeVec.push_back(Unmerge.getReg(StartIdx + I));
    B.buildMergeLikeInstr(DstReg, MergeVec);

  MI.eraseFromParent();

  Register InsertSrc = MI.getOperand(2).getReg();

  if (Offset % 32 != 0 || DstSize % 32 != 0 || InsertSize % 32 != 0)

  unsigned DstCount = DstSize / 32;
  unsigned InsertCount = InsertSize / 32;
  unsigned StartIdx = Offset / 32;

  auto SrcUnmerge = B.buildUnmerge(S32, SrcReg);

  for (unsigned I = 0; I < StartIdx; ++I)

  if (InsertCount == 1) {
    InsertSrc = B.buildPtrToInt(S32, InsertSrc).getReg(0);

  auto InsertUnmerge = B.buildUnmerge(S32, InsertSrc);
  for (unsigned I = 0; I < InsertCount; ++I)

  for (unsigned I = StartIdx + InsertCount; I < DstCount; ++I)

  B.buildMergeLikeInstr(DstReg, MergeVec);

  MI.eraseFromParent();
    auto IntVec = B.buildPtrToInt(IntVecTy, Vec);
    auto IntElt = B.buildExtractVectorElement(IntTy, IntVec, MI.getOperand(2));
    B.buildIntToPtr(Dst, IntElt);

    MI.eraseFromParent();

  std::optional<ValueAndVReg> MaybeIdxVal =

  const uint64_t IdxVal = MaybeIdxVal->Value.getZExtValue();

    auto Unmerge = B.buildUnmerge(EltTy, Vec);
    B.buildCopy(Dst, Unmerge.getReg(IdxVal));

  MI.eraseFromParent();

    auto IntVecSource = B.buildPtrToInt(IntVecTy, Vec);
    auto IntIns = B.buildPtrToInt(IntTy, Ins);
    auto IntVecDest = B.buildInsertVectorElement(IntVecTy, IntVecSource, IntIns,
    B.buildIntToPtr(Dst, IntVecDest);
    MI.eraseFromParent();

  std::optional<ValueAndVReg> MaybeIdxVal =

  const uint64_t IdxVal = MaybeIdxVal->Value.getZExtValue();

  if (IdxVal < NumElts) {
    for (unsigned i = 0; i < NumElts; ++i)
    B.buildUnmerge(SrcRegs, Vec);

    SrcRegs[IdxVal] = MI.getOperand(2).getReg();
    B.buildMergeLikeInstr(Dst, SrcRegs);

  MI.eraseFromParent();
  unsigned Flags = MI.getFlags();

  if (ST.hasTrigReducedRange()) {
    auto MulVal = B.buildFMul(Ty, SrcReg, OneOver2Pi, Flags);
    TrigVal = B.buildIntrinsic(Intrinsic::amdgcn_fract, {Ty})
                  .addUse(MulVal.getReg(0))
    TrigVal = B.buildFMul(Ty, SrcReg, OneOver2Pi, Flags).getReg(0);

      Intrinsic::amdgcn_sin : Intrinsic::amdgcn_cos;

  MI.eraseFromParent();
                                       unsigned GAFlags) const {
      B.getMRI()->createGenericVirtualRegister(ConstPtrTy);

  if (ST.has64BitLiterals()) {
    B.buildInstr(AMDGPU::SI_PC_ADD_REL_OFFSET64).addDef(PCReg);
    B.buildInstr(AMDGPU::SI_PC_ADD_REL_OFFSET).addDef(PCReg);

  if (!B.getMRI()->getRegClassOrNull(PCReg))
    B.getMRI()->setRegClass(PCReg, &AMDGPU::SReg_64RegClass);

    B.buildExtract(DstReg, PCReg, 0);

  if (RequiresHighHalf && ST.has64BitLiterals()) {
    MRI.setRegClass(DstReg, &AMDGPU::SReg_64RegClass);
    B.buildInstr(AMDGPU::S_MOV_B64)

    MRI.setRegClass(AddrLo, &AMDGPU::SReg_32RegClass);
    B.buildInstr(AMDGPU::S_MOV_B32)

    if (RequiresHighHalf) {
             "Must provide a 64-bit pointer type!");
      MRI.setRegClass(AddrHi, &AMDGPU::SReg_32RegClass);
      B.buildInstr(AMDGPU::S_MOV_B32)

      MRI.setRegClass(AddrDst, &AMDGPU::SReg_64RegClass);
      B.buildMergeValues(AddrDst, {AddrLo, AddrHi});

      if (AddrDst != DstReg)
        B.buildCast(DstReg, AddrDst);
    } else if (AddrLo != DstReg) {
      B.buildCast(DstReg, AddrLo);
  unsigned AS = Ty.getAddressSpace();

      GV->getName() != "llvm.amdgcn.module.lds" &&
          Fn, "local memory global used by non-kernel function",

      B.buildUndef(DstReg);
      MI.eraseFromParent();

    auto Sz = B.buildIntrinsic(Intrinsic::amdgcn_groupstaticsize, {S32});
    B.buildIntToPtr(DstReg, Sz);
    MI.eraseFromParent();

    MI.eraseFromParent();

  if (ST.isAmdPalOS() || ST.isMesa3DOS()) {
    MI.eraseFromParent();

    MI.eraseFromParent();

    MI.eraseFromParent();

  if (Ty.getSizeInBits() == 32) {
    auto Load = B.buildLoad(PtrTy, GOTAddr, *GOTMMO);
    B.buildExtract(DstReg, Load, 0);
    B.buildLoad(DstReg, GOTAddr, *GOTMMO);
  MI.eraseFromParent();
  auto Cast = B.buildAddrSpaceCast(ConstPtr, PtrReg);
  MI.getOperand(1).setReg(Cast.getReg(0));

  if (MI.getOpcode() != AMDGPU::G_LOAD)

  if (WideMemSize == ValSize) {
    MI.setMemRefs(MF, {WideMMO});

  if (ValSize > WideMemSize)

    WideLoad = B.buildLoadFromOffset(WideTy, PtrReg, *MMO, 0).getReg(0);
    B.buildTrunc(ValReg, WideLoad).getReg(0);

      WideLoad = B.buildLoadFromOffset(WideTy, PtrReg, *MMO, 0).getReg(0);
      B.buildExtract(ValReg, WideLoad, 0);

      WideLoad = B.buildLoadFromOffset(WideTy, PtrReg, *MMO, 0).getReg(0);
      B.buildDeleteTrailingVectorElements(ValReg, WideLoad);

  MI.eraseFromParent();

  Register DataReg = MI.getOperand(0).getReg();

         "this should not have been custom lowered");

  Register PackedVal = B.buildBuildVector(VecTy, {NewVal, CmpVal}).getReg(0);

  B.buildInstr(AMDGPU::G_AMDGPU_ATOMIC_CMPXCHG)
      .setMemRefs(MI.memoperands());

  MI.eraseFromParent();
  switch (DefMI->getOpcode()) {
  case TargetOpcode::G_INTRINSIC: {
    case Intrinsic::amdgcn_frexp_mant:
    case Intrinsic::amdgcn_log:
    case Intrinsic::amdgcn_log_clamp:
    case Intrinsic::amdgcn_exp2:
    case Intrinsic::amdgcn_sqrt:
  case TargetOpcode::G_FSQRT:
  case TargetOpcode::G_FFREXP: {
    if (DefMI->getOperand(0).getReg() == Src)
  case TargetOpcode::G_FPEXT: {
std::pair<Register, Register>
                                     unsigned Flags) const {
  auto SmallestNormal = B.buildFConstant(
  auto IsLtSmallestNormal =

  auto Scale32 = B.buildFConstant(F32, 0x1.0p+32);
  auto One = B.buildFConstant(F32, 1.0);
  auto ScaleFactor =
      B.buildSelect(F32, IsLtSmallestNormal, Scale32, One, Flags);
  auto ScaledInput = B.buildFMul(F32, Src, ScaleFactor, Flags);

  return {ScaledInput.getReg(0), IsLtSmallestNormal.getReg(0)};

  LLT Ty = B.getMRI()->getType(Dst);
  unsigned Flags = MI.getFlags();

    auto Ext = B.buildFPExt(F32, Src, Flags);
    auto Log2 = B.buildIntrinsic(Intrinsic::amdgcn_log, {F32})
                    .addUse(Ext.getReg(0))
    B.buildFPTrunc(Dst, Log2, Flags);
    MI.eraseFromParent();

    B.buildIntrinsic(Intrinsic::amdgcn_log, {MI.getOperand(0)})
    MI.eraseFromParent();
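// f32 log/log10 expand to the hardware log2 times log(2) or log10(2); the
// constant is split into a head and a tail (c + cc with fast f32 FMA, or
// ch + ct without) so the multiply keeps extra precision.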
  auto Log2 = B.buildIntrinsic(Intrinsic::amdgcn_log, {Ty})
                  .addUse(ScaledInput)

  auto ThirtyTwo = B.buildFConstant(Ty, 32.0);
  auto Zero = B.buildFConstant(Ty, 0.0);
      B.buildSelect(Ty, IsLtSmallestNormal, ThirtyTwo, Zero, Flags);
  B.buildFSub(Dst, Log2, ResultOffset, Flags);

  MI.eraseFromParent();

  auto FMul = B.buildFMul(Ty, X, Y, Flags);
  return B.buildFAdd(Ty, FMul, Z, Flags).getReg(0);

  const bool IsLog10 = MI.getOpcode() == TargetOpcode::G_FLOG10;
  assert(IsLog10 || MI.getOpcode() == TargetOpcode::G_FLOG);

  unsigned Flags = MI.getFlags();

    auto PromoteSrc = B.buildFPExt(F32, X);
    B.buildFPTrunc(Dst, LogVal);

    MI.eraseFromParent();

      B.buildIntrinsic(Intrinsic::amdgcn_log, {Ty}).addUse(X).setMIFlags(Flags);

  if (ST.hasFastFMAF32()) {
    const float c_log10 = 0x1.344134p-2f;
    const float cc_log10 = 0x1.09f79ep-26f;

    const float c_log = 0x1.62e42ep-1f;
    const float cc_log = 0x1.efa39ep-25f;

    auto C = B.buildFConstant(Ty, IsLog10 ? c_log10 : c_log);
    auto CC = B.buildFConstant(Ty, IsLog10 ? cc_log10 : cc_log);

    R = B.buildFMul(Ty, Y, C, NewFlags).getReg(0);
    auto NegR = B.buildFNeg(Ty, R, NewFlags);
    auto FMA0 = B.buildFMA(Ty, Y, C, NegR, NewFlags);
    auto FMA1 = B.buildFMA(Ty, Y, CC, FMA0, NewFlags);
    R = B.buildFAdd(Ty, R, FMA1, NewFlags).getReg(0);

    const float ch_log10 = 0x1.344000p-2f;
    const float ct_log10 = 0x1.3509f6p-18f;

    const float ch_log = 0x1.62e000p-1f;
    const float ct_log = 0x1.0bfbe8p-15f;

    auto CH = B.buildFConstant(Ty, IsLog10 ? ch_log10 : ch_log);
    auto CT = B.buildFConstant(Ty, IsLog10 ? ct_log10 : ct_log);

    auto MaskConst = B.buildConstant(Ty, 0xfffff000);
    auto YH = B.buildAnd(Ty, Y, MaskConst);
    auto YT = B.buildFSub(Ty, Y, YH, Flags);

    auto YTCT = B.buildFMul(Ty, YT, CT, NewFlags);
    Register Mad1 =
        getMad(B, Ty, YH.getReg(0), CT.getReg(0), YTCT.getReg(0), NewFlags);
    R = getMad(B, Ty, YH.getReg(0), CH.getReg(0), Mad1, NewFlags);

  const bool IsFiniteOnly =

  if (!IsFiniteOnly) {
    auto Fabs = B.buildFAbs(Ty, Y);
    R = B.buildSelect(Ty, IsFinite, R, Y, Flags).getReg(0);

    auto Zero = B.buildFConstant(Ty, 0.0);
    auto ShiftK =
        B.buildFConstant(Ty, IsLog10 ? 0x1.344136p+3f : 0x1.62e430p+4f);
    auto Shift = B.buildSelect(Ty, IsScaled, ShiftK, Zero, Flags);
    B.buildFSub(Dst, R, Shift, Flags);

    B.buildCopy(Dst, R);

  MI.eraseFromParent();
                                         unsigned Flags) const {
  const double Log2BaseInverted =

  LLT Ty = B.getMRI()->getType(Dst);

    auto LogSrc = B.buildIntrinsic(Intrinsic::amdgcn_log, {Ty})
    auto ScaledResultOffset = B.buildFConstant(Ty, -32.0 * Log2BaseInverted);
    auto Zero = B.buildFConstant(Ty, 0.0);
    auto ResultOffset =
        B.buildSelect(Ty, IsScaled, ScaledResultOffset, Zero, Flags);
    auto Log2Inv = B.buildFConstant(Ty, Log2BaseInverted);

    if (ST.hasFastFMAF32())
      B.buildFMA(Dst, LogSrc, Log2Inv, ResultOffset, Flags);
    else {
      auto Mul = B.buildFMul(Ty, LogSrc, Log2Inv, Flags);
      B.buildFAdd(Dst, Mul, ResultOffset, Flags);

                         ? B.buildFLog2(Ty, Src, Flags)
                         : B.buildIntrinsic(Intrinsic::amdgcn_log, {Ty})

  auto Log2BaseInvertedOperand = B.buildFConstant(Ty, Log2BaseInverted);
  B.buildFMul(Dst, Log2Operand, Log2BaseInvertedOperand, Flags);
  unsigned Flags = MI.getFlags();
  LLT Ty = B.getMRI()->getType(Dst);

    auto Ext = B.buildFPExt(F32, Src, Flags);
    auto Log2 = B.buildIntrinsic(Intrinsic::amdgcn_exp2, {F32})
                    .addUse(Ext.getReg(0))
    B.buildFPTrunc(Dst, Log2, Flags);
    MI.eraseFromParent();

    MI.eraseFromParent();

  auto RangeCheckConst = B.buildFConstant(Ty, -0x1.f80000p+6f);
                                RangeCheckConst, Flags);

  auto SixtyFour = B.buildFConstant(Ty, 0x1.0p+6f);
  auto Zero = B.buildFConstant(Ty, 0.0);
  auto AddOffset = B.buildSelect(F32, NeedsScaling, SixtyFour, Zero, Flags);
  auto AddInput = B.buildFAdd(F32, Src, AddOffset, Flags);

  auto Exp2 = B.buildIntrinsic(Intrinsic::amdgcn_exp2, {Ty})
                  .addUse(AddInput.getReg(0))

  auto TwoExpNeg64 = B.buildFConstant(Ty, 0x1.0p-64f);
  auto One = B.buildFConstant(Ty, 1.0);
  auto ResultScale = B.buildSelect(F32, NeedsScaling, TwoExpNeg64, One, Flags);
  B.buildFMul(Dst, Exp2, ResultScale, Flags);
  MI.eraseFromParent();
                                    const SrcOp &Src, unsigned Flags) {
  LLT Ty = Dst.getLLTTy(*B.getMRI());

    return B.buildIntrinsic(Intrinsic::amdgcn_exp2, {Dst})
        .addUse(Src.getReg())

  return B.buildFExp2(Dst, Src, Flags);

                                        bool IsExp10) const {
  LLT Ty = B.getMRI()->getType(X);

    auto Const = B.buildFConstant(Ty, IsExp10 ? 0x1.a934f0p+1f : numbers::log2e);
    auto Mul = B.buildFMul(Ty, X, Const, Flags);

  LLT Ty = B.getMRI()->getType(Dst);

    auto Threshold = B.buildFConstant(Ty, -0x1.5d58a0p+6f);

    auto ScaleOffset = B.buildFConstant(Ty, 0x1.0p+6f);
    auto ScaledX = B.buildFAdd(Ty, X, ScaleOffset, Flags);
    auto AdjustedX = B.buildSelect(Ty, NeedsScaling, ScaledX, X, Flags);

    auto ExpInput = B.buildFMul(Ty, AdjustedX, Log2E, Flags);

    auto Exp2 = B.buildIntrinsic(Intrinsic::amdgcn_exp2, {Ty})
                    .addUse(ExpInput.getReg(0))

    auto ResultScaleFactor = B.buildFConstant(Ty, 0x1.969d48p-93f);
    auto AdjustedResult = B.buildFMul(Ty, Exp2, ResultScaleFactor, Flags);
    B.buildSelect(Dst, NeedsScaling, AdjustedResult, Exp2, Flags);

                                      unsigned Flags) const {
  LLT Ty = B.getMRI()->getType(Dst);

    auto K0 = B.buildFConstant(Ty, 0x1.a92000p+1f);
    auto K1 = B.buildFConstant(Ty, 0x1.4f0978p-11f);

    auto Mul1 = B.buildFMul(Ty, X, K1, Flags);
    auto Exp2_1 = buildExp(B, Ty, Mul1, Flags);
    auto Mul0 = B.buildFMul(Ty, X, K0, Flags);
    auto Exp2_0 = buildExp(B, Ty, Mul0, Flags);
    B.buildFMul(Dst, Exp2_0, Exp2_1, Flags);

  auto Threshold = B.buildFConstant(Ty, -0x1.2f7030p+5f);

  auto ScaleOffset = B.buildFConstant(Ty, 0x1.0p+5f);
  auto ScaledX = B.buildFAdd(Ty, X, ScaleOffset, Flags);
  auto AdjustedX = B.buildSelect(Ty, NeedsScaling, ScaledX, X);

  auto K0 = B.buildFConstant(Ty, 0x1.a92000p+1f);
  auto K1 = B.buildFConstant(Ty, 0x1.4f0978p-11f);

  auto Mul1 = B.buildFMul(Ty, AdjustedX, K1, Flags);
  auto Exp2_1 = buildExp(B, Ty, Mul1, Flags);
  auto Mul0 = B.buildFMul(Ty, AdjustedX, K0, Flags);
  auto Exp2_0 = buildExp(B, Ty, Mul0, Flags);

  auto MulExps = B.buildFMul(Ty, Exp2_0, Exp2_1, Flags);
  auto ResultScaleFactor = B.buildFConstant(Ty, 0x1.9f623ep-107f);
  auto AdjustedResult = B.buildFMul(Ty, MulExps, ResultScaleFactor, Flags);

  B.buildSelect(Dst, NeedsScaling, AdjustedResult, MulExps);
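// f64 exp2/exp/exp10: write x = dn + f with dn = rint(x * log2(base)),
// approximate 2^f with the FMA polynomial below, then scale by 2^dn via
// ldexp, clamping the overflow and underflow ranges at the end.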
  if (MI.getOpcode() == TargetOpcode::G_FEXP2) {
    Dn = B.buildFRint(S64, X, Flags).getReg(0);
    F = B.buildFSub(S64, X, Dn, Flags).getReg(0);

    auto C1 = B.buildFConstant(S64, APFloat(0x1.62e42fefa39efp-1));
    auto C2 = B.buildFConstant(S64, APFloat(0x1.abc9e3b39803fp-56));
    auto Mul2 = B.buildFMul(S64, F, C2, Flags).getReg(0);
    T = B.buildFMA(S64, F, C1, Mul2, Flags).getReg(0);
  } else if (MI.getOpcode() == TargetOpcode::G_FEXP10) {
    auto C1 = B.buildFConstant(S64, APFloat(0x1.a934f0979a371p+1));
    auto Mul = B.buildFMul(S64, X, C1, Flags).getReg(0);
    Dn = B.buildFRint(S64, Mul, Flags).getReg(0);

    auto NegDn = B.buildFNeg(S64, Dn, Flags).getReg(0);
    auto C2 = B.buildFConstant(S64, APFloat(-0x1.9dc1da994fd21p-59));
    auto C3 = B.buildFConstant(S64, APFloat(0x1.34413509f79ffp-2));
    auto Inner = B.buildFMA(S64, NegDn, C3, X, Flags).getReg(0);
    F = B.buildFMA(S64, NegDn, C2, Inner, Flags).getReg(0);

    auto C4 = B.buildFConstant(S64, APFloat(0x1.26bb1bbb55516p+1));
    auto C5 = B.buildFConstant(S64, APFloat(-0x1.f48ad494ea3e9p-53));
    auto MulF = B.buildFMul(S64, F, C5, Flags).getReg(0);
    T = B.buildFMA(S64, F, C4, MulF, Flags).getReg(0);
  } else {
    auto C1 = B.buildFConstant(S64, APFloat(0x1.71547652b82fep+0));
    auto Mul = B.buildFMul(S64, X, C1, Flags).getReg(0);
    Dn = B.buildFRint(S64, Mul, Flags).getReg(0);

    auto NegDn = B.buildFNeg(S64, Dn, Flags).getReg(0);
    auto C2 = B.buildFConstant(S64, APFloat(0x1.abc9e3b39803fp-56));
    auto C3 = B.buildFConstant(S64, APFloat(0x1.62e42fefa39efp-1));
    auto Inner = B.buildFMA(S64, NegDn, C3, X, Flags).getReg(0);
    T = B.buildFMA(S64, NegDn, C2, Inner, Flags).getReg(0);

  auto P = B.buildFConstant(S64, 0x1.ade156a5dcb37p-26);
  P = B.buildFMA(S64, T, P, B.buildFConstant(S64, 0x1.28af3fca7ab0cp-22),
                 Flags);
  P = B.buildFMA(S64, T, P, B.buildFConstant(S64, 0x1.71dee623fde64p-19),
                 Flags);
  P = B.buildFMA(S64, T, P, B.buildFConstant(S64, 0x1.a01997c89e6b0p-16),
                 Flags);
  P = B.buildFMA(S64, T, P, B.buildFConstant(S64, 0x1.a01a014761f6ep-13),
                 Flags);
  P = B.buildFMA(S64, T, P, B.buildFConstant(S64, 0x1.6c16c1852b7b0p-10),
                 Flags);
  P = B.buildFMA(S64, T, P, B.buildFConstant(S64, 0x1.1111111122322p-7), Flags);
  P = B.buildFMA(S64, T, P, B.buildFConstant(S64, 0x1.55555555502a1p-5), Flags);
  P = B.buildFMA(S64, T, P, B.buildFConstant(S64, 0x1.5555555555511p-3), Flags);
  P = B.buildFMA(S64, T, P, B.buildFConstant(S64, 0x1.000000000000bp-1), Flags);

  auto One = B.buildFConstant(S64, 1.0);
  P = B.buildFMA(S64, T, P, One, Flags);
  P = B.buildFMA(S64, T, P, One, Flags);

  auto DnInt = B.buildFPTOSI(S32, Dn);
  auto Z = B.buildFLdexp(S64, P, DnInt, Flags);

  Z = B.buildSelect(S64, CondHi, Z, PInf, Flags);
  B.buildSelect(MI.getOperand(0).getReg(), CondLo, Z, Zero, Flags);

  MI.eraseFromParent();
  const unsigned Flags = MI.getFlags();

  const bool IsExp10 = MI.getOpcode() == TargetOpcode::G_FEXP10;

    MI.eraseFromParent();

    auto Ext = B.buildFPExt(F32, X, Flags);
    B.buildFPTrunc(Dst, Lowered, Flags);
    MI.eraseFromParent();

    MI.eraseFromParent();
  const unsigned FlagsNoContract = Flags & ~MachineInstr::FmContract;

  if (ST.hasFastFMAF32()) {
    const float cc_exp = 0x1.4ae0bep-26f;
    const float c_exp10 = 0x1.a934f0p+1f;
    const float cc_exp10 = 0x1.2f346ep-24f;

    auto C = B.buildFConstant(Ty, IsExp10 ? c_exp10 : c_exp);
    PH = B.buildFMul(Ty, X, C, Flags).getReg(0);
    auto NegPH = B.buildFNeg(Ty, PH, Flags);
    auto FMA0 = B.buildFMA(Ty, X, C, NegPH, Flags);

    auto CC = B.buildFConstant(Ty, IsExp10 ? cc_exp10 : cc_exp);
    PL = B.buildFMA(Ty, X, CC, FMA0, Flags).getReg(0);

    const float ch_exp = 0x1.714000p+0f;
    const float cl_exp = 0x1.47652ap-12f;

    const float ch_exp10 = 0x1.a92000p+1f;
    const float cl_exp10 = 0x1.4f0978p-11f;

    auto MaskConst = B.buildConstant(Ty, 0xfffff000);
    auto XH = B.buildAnd(Ty, X, MaskConst);
    auto XL = B.buildFSub(Ty, X, XH, Flags);

    auto CH = B.buildFConstant(Ty, IsExp10 ? ch_exp10 : ch_exp);
    PH = B.buildFMul(Ty, XH, CH, Flags).getReg(0);

    auto CL = B.buildFConstant(Ty, IsExp10 ? cl_exp10 : cl_exp);
    auto XLCL = B.buildFMul(Ty, XL, CL, Flags);

        getMad(B, Ty, XL.getReg(0), CH.getReg(0), XLCL.getReg(0), Flags);
    PL = getMad(B, Ty, XH.getReg(0), CL.getReg(0), Mad0, Flags);

  auto E = B.buildIntrinsicRoundeven(Ty, PH, Flags);

  auto PHSubE = B.buildFSub(Ty, PH, E, FlagsNoContract);
  auto A = B.buildFAdd(Ty, PHSubE, PL, Flags);

  auto Exp2 = B.buildIntrinsic(Intrinsic::amdgcn_exp2, {Ty})
                  .addUse(A.getReg(0))
  auto R = B.buildFLdexp(Ty, Exp2, IntE, Flags);

  auto UnderflowCheckConst =
      B.buildFConstant(Ty, IsExp10 ? -0x1.66d3e8p+5f : -0x1.9d1da0p+6f);
  auto Zero = B.buildFConstant(Ty, 0.0);

  R = B.buildSelect(Ty, Underflow, Zero, R);

  auto OverflowCheckConst =
      B.buildFConstant(Ty, IsExp10 ? 0x1.344136p+5f : 0x1.62e430p+6f);

  R = B.buildSelect(Ty, Overflow, Inf, R, Flags);

  B.buildCopy(Dst, R);
  MI.eraseFromParent();
  unsigned Flags = MI.getFlags();
  LLT Ty = B.getMRI()->getType(Dst);

    auto Log = B.buildFLog2(F32, Src0, Flags);
    auto Mul = B.buildIntrinsic(Intrinsic::amdgcn_fmul_legacy, {F32})
                   .addUse(Log.getReg(0))
    B.buildFExp2(Dst, Mul, Flags);
  } else if (Ty == F16) {
    auto Log = B.buildFLog2(F16, Src0, Flags);
    auto Ext0 = B.buildFPExt(F32, Log, Flags);
    auto Ext1 = B.buildFPExt(F32, Src1, Flags);
    auto Mul = B.buildIntrinsic(Intrinsic::amdgcn_fmul_legacy, {F32})
                   .addUse(Ext0.getReg(0))
                   .addUse(Ext1.getReg(0))
    B.buildFExp2(Dst, B.buildFPTrunc(F16, Mul), Flags);

  MI.eraseFromParent();
    ModSrc = SrcFNeg->getOperand(1).getReg();
    ModSrc = SrcFAbs->getOperand(1).getReg();
    ModSrc = SrcFAbs->getOperand(1).getReg();

  Register OrigSrc = MI.getOperand(1).getReg();
  unsigned Flags = MI.getFlags();
         "this should not have been custom lowered");

  auto Fract = B.buildIntrinsic(Intrinsic::amdgcn_fract, {F64})

    B.buildFMinNumIEEE(Min, Fract, Const, Flags);
    B.buildFMinNum(Min, Fract, Const, Flags);

    CorrectedFract = B.buildSelect(F64, IsNan, ModSrc, Min, Flags).getReg(0);

  auto NegFract = B.buildFNeg(F64, CorrectedFract, Flags);
  B.buildFAdd(Dst, OrigSrc, NegFract, Flags);

  MI.eraseFromParent();
  if (MI.getOpcode() == AMDGPU::G_BUILD_VECTOR_TRUNC) {
    Src0 = B.buildTrunc(S16, MI.getOperand(1).getReg()).getReg(0);
    Src1 = B.buildTrunc(S16, MI.getOperand(2).getReg()).getReg(0);

  auto Merge = B.buildMergeLikeInstr(S32, {Src0, Src1});
  B.buildBitcast(Dst, Merge);

  MI.eraseFromParent();
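// Long multiplication: the operands are split into 32-bit parts and partial
// products are accumulated column by column with G_AMDGPU_MAD_U64_U32,
// threading carries through explicit per-column carry lists.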
                                        bool UsePartialMad64_32,
                                        bool SeparateOddAlignedProducts) const {

  auto getZero32 = [&]() -> Register {
      Zero32 = B.buildConstant(S32, 0).getReg(0);
  auto getZero64 = [&]() -> Register {
      Zero64 = B.buildConstant(S64, 0).getReg(0);

  for (unsigned i = 0; i < Src0.size(); ++i) {

    if (CarryIn.empty())

    bool HaveCarryOut = true;
    if (CarryIn.size() == 1) {
      LocalAccum = B.buildZExt(S32, CarryIn[0]).getReg(0);

      CarryAccum = getZero32();
      CarryAccum = B.buildZExt(S32, CarryIn[0]).getReg(0);
      for (unsigned i = 1; i + 1 < CarryIn.size(); ++i) {
            B.buildUAdde(S32, S1, CarryAccum, getZero32(), CarryIn[i])

      LocalAccum = getZero32();
      HaveCarryOut = false;

    auto Add =
        B.buildUAdde(S32, S1, CarryAccum, LocalAccum, CarryIn.back());
    LocalAccum = Add.getReg(0);

  auto buildMadChain =
    assert((DstIndex + 1 < Accum.size() && LocalAccum.size() == 2) ||
           (DstIndex + 1 >= Accum.size() && LocalAccum.size() == 1));

    if (LocalAccum.size() == 1 &&
        (!UsePartialMad64_32 || !CarryIn.empty())) {
        unsigned j1 = DstIndex - j0;
        if (Src0KnownZeros[j0] || Src1KnownZeros[j1]) {
        auto Mul = B.buildMul(S32, Src0[j0], Src1[j1]);
          LocalAccum[0] = Mul.getReg(0);
          if (CarryIn.empty()) {
            LocalAccum[0] = B.buildAdd(S32, LocalAccum[0], Mul).getReg(0);
                B.buildUAdde(S32, S1, LocalAccum[0], Mul, CarryIn.back())
      } while (j0 <= DstIndex && (!UsePartialMad64_32 || !CarryIn.empty()));

    if (j0 <= DstIndex) {
      bool HaveSmallAccum = false;
      if (LocalAccum[0]) {
        if (LocalAccum.size() == 1) {
          Tmp = B.buildAnyExt(S64, LocalAccum[0]).getReg(0);
          HaveSmallAccum = true;
        } else if (LocalAccum[1]) {
          Tmp = B.buildMergeLikeInstr(S64, LocalAccum).getReg(0);
          HaveSmallAccum = false;
          Tmp = B.buildZExt(S64, LocalAccum[0]).getReg(0);
          HaveSmallAccum = true;
        assert(LocalAccum.size() == 1 || !LocalAccum[1]);
        HaveSmallAccum = true;

        unsigned j1 = DstIndex - j0;
        if (Src0KnownZeros[j0] || Src1KnownZeros[j1]) {
        auto Mad = B.buildInstr(AMDGPU::G_AMDGPU_MAD_U64_U32, {S64, S1},
                                {Src0[j0], Src1[j1], Tmp});
        Tmp = Mad.getReg(0);
        if (!HaveSmallAccum)
          CarryOut.push_back(Mad.getReg(1));
        HaveSmallAccum = false;
      } while (j0 <= DstIndex);

      auto Unmerge = B.buildUnmerge(S32, Tmp);
      LocalAccum[0] = Unmerge.getReg(0);
      if (LocalAccum.size() > 1)
        LocalAccum[1] = Unmerge.getReg(1);
  for (unsigned i = 0; i <= Accum.size() / 2; ++i) {
    Carry OddCarryIn = std::move(OddCarry);
    Carry EvenCarryIn = std::move(EvenCarry);

    if (2 * i < Accum.size()) {
      auto LocalAccum = Accum.drop_front(2 * i).take_front(2);
      EvenCarry = buildMadChain(LocalAccum, 2 * i, EvenCarryIn);

      if (!SeparateOddAlignedProducts) {
        auto LocalAccum = Accum.drop_front(2 * i - 1).take_front(2);
        OddCarry = buildMadChain(LocalAccum, 2 * i - 1, OddCarryIn);

        bool IsHighest = 2 * i >= Accum.size();
                              .take_front(IsHighest ? 1 : 2);
        OddCarry = buildMadChain(LocalAccum, 2 * i - 1, OddCarryIn);

          Lo = B.buildUAddo(S32, S1, Accum[2 * i - 1], SeparateOddOut[0]);
          Lo = B.buildAdd(S32, Accum[2 * i - 1], SeparateOddOut[0]);
          Lo = B.buildUAdde(S32, S1, Accum[2 * i - 1], SeparateOddOut[0],
        Accum[2 * i - 1] = Lo->getOperand(0).getReg();

        auto Hi = B.buildUAdde(S32, S1, Accum[2 * i], SeparateOddOut[1],
                               Lo->getOperand(1).getReg());
        Accum[2 * i] = Hi.getReg(0);
        SeparateOddCarry = Hi.getReg(1);

    if (Register CarryOut = mergeCarry(Accum[2 * i - 1], OddCarryIn))
      EvenCarryIn.push_back(CarryOut);

    if (2 * i < Accum.size()) {
      if (Register CarryOut = mergeCarry(Accum[2 * i], EvenCarryIn))
        OddCarry.push_back(CarryOut);
  assert(ST.hasMad64_32());
  assert(MI.getOpcode() == TargetOpcode::G_MUL);

  unsigned Size = Ty.getSizeInBits();
  if (ST.hasVMulU64Inst() && Size == 64)
    return true;

  unsigned NumParts = Size / 32;

  const bool SeparateOddAlignedProducts = ST.hasFullRate64Ops();

  for (unsigned i = 0; i < NumParts; ++i) {
    Src0Parts.push_back(MRI.createGenericVirtualRegister(S32));
    Src1Parts.push_back(MRI.createGenericVirtualRegister(S32));
  }
  B.buildUnmerge(Src0Parts, Src0);
  B.buildUnmerge(Src1Parts, Src1);

  buildMultiply(Helper, AccumRegs, Src0Parts, Src1Parts, UsePartialMad64_32,
                SeparateOddAlignedProducts);

  B.buildMergeLikeInstr(DstReg, AccumRegs);
  MI.eraseFromParent();
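  // G_CTLZ/G_CTTZ lower directly to the AMDGPU FFBH_U32/FFBL_B32 pseudos.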
  unsigned NewOpc = MI.getOpcode() == AMDGPU::G_CTLZ
                        ? AMDGPU::G_AMDGPU_FFBH_U32
                        : AMDGPU::G_AMDGPU_FFBL_B32;
  auto Tmp = B.buildInstr(NewOpc, {DstTy}, {Src});

  MI.eraseFromParent();
  TypeSize NumBits = SrcTy.getSizeInBits();

  auto ShiftAmt = B.buildConstant(S32, 32u - NumBits);
  auto Extend = B.buildAnyExt(S32, {Src}).getReg(0u);
  auto Shift = B.buildShl(S32, Extend, ShiftAmt);
  auto Ctlz = B.buildInstr(AMDGPU::G_AMDGPU_FFBH_U32, {S32}, {Shift});
  B.buildTrunc(Dst, Ctlz);
  MI.eraseFromParent();
  assert(SrcTy == S32 && "legalizeCTLS only supports s32");
  unsigned BitWidth = SrcTy.getSizeInBits();

  auto Sffbh = B.buildIntrinsic(Intrinsic::amdgcn_sffbh, {S32}).addUse(Src);

  B.buildSub(Dst, Clamped, B.buildConstant(S32, 1));
  MI.eraseFromParent();
  if (MI.getOpcode() != TargetOpcode::G_XOR)
    return false;

  return ConstVal == -1;

  Register CondDef = MI.getOperand(0).getReg();

  if (UseMI->getParent() != Parent ||
      UseMI->getOpcode() != AMDGPU::G_BRCOND)

  UncondBrTarget = &*NextMBB;

  if (Next->getOpcode() != AMDGPU::G_BR)
                                   *ArgRC, B.getDebugLoc(), ArgTy);

  const unsigned Mask = Arg->getMask();

  auto ShiftAmt = B.buildConstant(S32, Shift);
  AndMaskSrc = B.buildLShr(S32, LiveIn, ShiftAmt).getReg(0);

  B.buildAnd(DstReg, AndMaskSrc, B.buildConstant(S32, Mask >> Shift));

  B.buildCopy(DstReg, LiveIn);
  if (!ST.hasClusters()) {

    MI.eraseFromParent();
  }

  auto One = B.buildConstant(S32, 1);
  auto ClusterSizeXYZ = B.buildAdd(S32, ClusterMaxIdXYZ, One);
  auto GlobalIdXYZ = B.buildAdd(S32, ClusterWorkGroupIdXYZ,
                                B.buildMul(S32, ClusterIdXYZ, ClusterSizeXYZ));

  B.buildCopy(DstReg, GlobalIdXYZ);
  MI.eraseFromParent();

  B.buildCopy(DstReg, ClusterIdXYZ);
  MI.eraseFromParent();

  unsigned ClusterIdField = HwregEncoding::encode(ID_IB_STS2, 6, 4);

  MRI.setRegClass(ClusterId, &AMDGPU::SReg_32RegClass);
  B.buildInstr(AMDGPU::S_GETREG_B32_const)
      .addDef(ClusterId)
      .addImm(ClusterIdField);
  auto Zero = B.buildConstant(S32, 0);

  B.buildSelect(DstReg, NoClusters, ClusterIdXYZ, GlobalIdXYZ);
  MI.eraseFromParent();
  auto LoadConstant = [&](unsigned N) {
    B.buildConstant(DstReg, N);
    MI.eraseFromParent();
    return true;
  };

  if (ST.hasArchitectedSGPRs() &&

    Arg = &WorkGroupIDX;
    ArgRC = &AMDGPU::SReg_32RegClass;

    Arg = &WorkGroupIDY;
    ArgRC = &AMDGPU::SReg_32RegClass;

    Arg = &WorkGroupIDZ;
    ArgRC = &AMDGPU::SReg_32RegClass;

    if (HasFixedDims && ClusterDims.getDims()[0] == 1)
      return LoadConstant(0);
    Arg = &ClusterWorkGroupIDX;
    ArgRC = &AMDGPU::SReg_32RegClass;

    if (HasFixedDims && ClusterDims.getDims()[1] == 1)
      return LoadConstant(0);
    Arg = &ClusterWorkGroupIDY;
    ArgRC = &AMDGPU::SReg_32RegClass;

    if (HasFixedDims && ClusterDims.getDims()[2] == 1)
      return LoadConstant(0);
    Arg = &ClusterWorkGroupIDZ;
    ArgRC = &AMDGPU::SReg_32RegClass;

      return LoadConstant(ClusterDims.getDims()[0] - 1);
    Arg = &ClusterWorkGroupMaxIDX;
    ArgRC = &AMDGPU::SReg_32RegClass;

      return LoadConstant(ClusterDims.getDims()[1] - 1);
    Arg = &ClusterWorkGroupMaxIDY;
    ArgRC = &AMDGPU::SReg_32RegClass;

      return LoadConstant(ClusterDims.getDims()[2] - 1);
    Arg = &ClusterWorkGroupMaxIDZ;
    ArgRC = &AMDGPU::SReg_32RegClass;

    Arg = &ClusterWorkGroupMaxFlatID;
    ArgRC = &AMDGPU::SReg_32RegClass;

    return LoadConstant(0);

  B.buildUndef(DstReg);

  if (!Arg->isRegister() || !Arg->getRegister().isValid())

  MI.eraseFromParent();
  B.buildConstant(MI.getOperand(0).getReg(), C);
  MI.eraseFromParent();

  unsigned MaxID = ST.getMaxWorkitemID(B.getMF().getFunction(), Dim);

  B.buildUndef(DstReg);
  MI.eraseFromParent();

  if (Arg->isMasked()) {

  MI.eraseFromParent();

  Register KernArgReg = B.getMRI()->createGenericVirtualRegister(PtrTy);

  return B.buildObjectPtrOffset(PtrTy, KernArgReg, COffset).getReg(0);

                                            Align Alignment) const {
         "unexpected kernarg parameter type");

  MI.eraseFromParent();
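  // 32-bit unsigned div/rem expansion: approximate 1/Y with the float
  // RCP_IFLAG pseudo, convert the scaled estimate back to an integer Z,
  // refine Z once via umulh against -Y*Z, then correct the quotient and
  // remainder with two conditional select rounds.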
  auto FloatY = B.buildUITOFP(S32, Y);
  auto RcpIFlag = B.buildInstr(AMDGPU::G_AMDGPU_RCP_IFLAG, {S32}, {FloatY});

  auto ScaledY = B.buildFMul(S32, RcpIFlag, Scale);
  auto Z = B.buildFPTOUI(S32, ScaledY);

  auto NegY = B.buildSub(S32, B.buildConstant(S32, 0), Y);
  auto NegYZ = B.buildMul(S32, NegY, Z);
  Z = B.buildAdd(S32, Z, B.buildUMulH(S32, Z, NegYZ));

  auto Q = B.buildUMulH(S32, X, Z);
  auto R = B.buildSub(S32, X, B.buildMul(S32, Q, Y));

  auto One = B.buildConstant(S32, 1);

  Q = B.buildSelect(S32, Cond, B.buildAdd(S32, Q, One), Q);

  B.buildSelect(DstDivReg, Cond, B.buildAdd(S32, Q, One), Q);

  B.buildSelect(DstRemReg, Cond, B.buildSub(S32, R, Y), R);
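  // 64-bit reciprocal seed: convert both 32-bit halves to float, combine
  // them as hi * 2^32 + lo, take the hardware reciprocal, scale it into
  // fixed point, and split the result back into low and high words.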
  auto Unmerge = B.buildUnmerge(S32, Val);

  auto CvtLo = B.buildUITOFP(S32, Unmerge.getReg(0));
  auto CvtHi = B.buildUITOFP(S32, Unmerge.getReg(1));

  auto Mad = B.buildFMAD(
      S32, CvtHi,
      B.buildFConstant(S32, llvm::bit_cast<float>(UINT32_C(0x4f800000))),
      CvtLo);

  auto Rcp = B.buildInstr(AMDGPU::G_AMDGPU_RCP_IFLAG, {S32}, {Mad});
  auto Mul1 = B.buildFMul(
      S32, Rcp,
      B.buildFConstant(S32, llvm::bit_cast<float>(UINT32_C(0x5f7ffffc))));

  auto Mul2 = B.buildFMul(
      S32, Mul1,
      B.buildFConstant(S32, llvm::bit_cast<float>(UINT32_C(0x2f800000))));
  auto Trunc = B.buildIntrinsicTrunc(S32, Mul2);

  auto Mad2 = B.buildFMAD(
      S32, Trunc,
      B.buildFConstant(S32, llvm::bit_cast<float>(UINT32_C(0xcf800000))),
      Mul1);

  auto ResultLo = B.buildFPTOUI(S32, Mad2);
  auto ResultHi = B.buildFPTOUI(S32, Trunc);

  return {ResultLo.getReg(0), ResultHi.getReg(0)};
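  // 64-bit unsigned division: two Newton-Raphson refinements of the
  // reciprocal using 64-bit mul/umulh with explicit 32-bit carry chains,
  // then a umulh-based quotient with up to two conditional correction
  // rounds for both quotient and remainder.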
  auto Rcp = B.buildMergeLikeInstr(S64, {RcpLo, RcpHi});

  auto Zero64 = B.buildConstant(S64, 0);
  auto NegDenom = B.buildSub(S64, Zero64, Denom);

  auto MulLo1 = B.buildMul(S64, NegDenom, Rcp);
  auto MulHi1 = B.buildUMulH(S64, Rcp, MulLo1);

  auto UnmergeMulHi1 = B.buildUnmerge(S32, MulHi1);
  Register MulHi1_Lo = UnmergeMulHi1.getReg(0);
  Register MulHi1_Hi = UnmergeMulHi1.getReg(1);

  auto Add1_Lo = B.buildUAddo(S32, S1, RcpLo, MulHi1_Lo);
  auto Add1_Hi = B.buildUAdde(S32, S1, RcpHi, MulHi1_Hi, Add1_Lo.getReg(1));
  auto Add1 = B.buildMergeLikeInstr(S64, {Add1_Lo, Add1_Hi});

  auto MulLo2 = B.buildMul(S64, NegDenom, Add1);
  auto MulHi2 = B.buildUMulH(S64, Add1, MulLo2);
  auto UnmergeMulHi2 = B.buildUnmerge(S32, MulHi2);
  Register MulHi2_Lo = UnmergeMulHi2.getReg(0);
  Register MulHi2_Hi = UnmergeMulHi2.getReg(1);

  auto Zero32 = B.buildConstant(S32, 0);
  auto Add2_Lo = B.buildUAddo(S32, S1, Add1_Lo, MulHi2_Lo);
  auto Add2_Hi = B.buildUAdde(S32, S1, Add1_Hi, MulHi2_Hi, Add2_Lo.getReg(1));
  auto Add2 = B.buildMergeLikeInstr(S64, {Add2_Lo, Add2_Hi});

  auto UnmergeNumer = B.buildUnmerge(S32, Numer);
  Register NumerLo = UnmergeNumer.getReg(0);
  Register NumerHi = UnmergeNumer.getReg(1);

  auto MulHi3 = B.buildUMulH(S64, Numer, Add2);
  auto Mul3 = B.buildMul(S64, Denom, MulHi3);
  auto UnmergeMul3 = B.buildUnmerge(S32, Mul3);
  Register Mul3_Lo = UnmergeMul3.getReg(0);
  Register Mul3_Hi = UnmergeMul3.getReg(1);
  auto Sub1_Lo = B.buildUSubo(S32, S1, NumerLo, Mul3_Lo);
  auto Sub1_Hi = B.buildUSube(S32, S1, NumerHi, Mul3_Hi, Sub1_Lo.getReg(1));
  auto Sub1_Mi = B.buildSub(S32, NumerHi, Mul3_Hi);
  auto Sub1 = B.buildMergeLikeInstr(S64, {Sub1_Lo, Sub1_Hi});

  auto UnmergeDenom = B.buildUnmerge(S32, Denom);
  Register DenomLo = UnmergeDenom.getReg(0);
  Register DenomHi = UnmergeDenom.getReg(1);

  auto C1 = B.buildSExt(S32, CmpHi);

  auto C2 = B.buildSExt(S32, CmpLo);

  auto C3 = B.buildSelect(S32, CmpEq, C2, C1);

  auto Sub2_Lo = B.buildUSubo(S32, S1, Sub1_Lo, DenomLo);
  auto Sub2_Mi = B.buildUSube(S32, S1, Sub1_Mi, DenomHi, Sub1_Lo.getReg(1));
  auto Sub2_Hi = B.buildUSube(S32, S1, Sub2_Mi, Zero32, Sub2_Lo.getReg(1));
  auto Sub2 = B.buildMergeLikeInstr(S64, {Sub2_Lo, Sub2_Hi});

  auto One64 = B.buildConstant(S64, 1);
  auto Add3 = B.buildAdd(S64, MulHi3, One64);

  auto C6 = B.buildSelect(

  auto Add4 = B.buildAdd(S64, Add3, One64);
  auto Sub3_Lo = B.buildUSubo(S32, S1, Sub2_Lo, DenomLo);

  auto Sub3_Mi = B.buildUSube(S32, S1, Sub2_Mi, DenomHi, Sub2_Lo.getReg(1));
  auto Sub3_Hi = B.buildUSube(S32, S1, Sub3_Mi, Zero32, Sub3_Lo.getReg(1));
  auto Sub3 = B.buildMergeLikeInstr(S64, {Sub3_Lo, Sub3_Hi});

  auto Sel1 = B.buildSelect(

  auto Sel2 = B.buildSelect(
  switch (MI.getOpcode()) {
  case AMDGPU::G_UDIV: {
    DstDivReg = MI.getOperand(0).getReg();
    break;
  }
  case AMDGPU::G_UREM: {
    DstRemReg = MI.getOperand(0).getReg();
    break;
  }
  case AMDGPU::G_UDIVREM: {
    DstDivReg = MI.getOperand(0).getReg();
    DstRemReg = MI.getOperand(1).getReg();
    break;
  }
  }

  const unsigned FirstSrcOpIdx = MI.getNumExplicitDefs();
  Register Num = MI.getOperand(FirstSrcOpIdx).getReg();
  Register Den = MI.getOperand(FirstSrcOpIdx + 1).getReg();

  MI.eraseFromParent();
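  // Signed div/rem reduces to the unsigned expansion: bias each operand by
  // its sign (ashr by bitwidth-1, add, xor) to take absolute values, divide
  // unsigned, then restore the result signs with xor/sub.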
  if (Ty != S32 && Ty != S64)
    return false;

  const unsigned FirstSrcOpIdx = MI.getNumExplicitDefs();
  Register LHS = MI.getOperand(FirstSrcOpIdx).getReg();
  Register RHS = MI.getOperand(FirstSrcOpIdx + 1).getReg();

  auto SignBitOffset = B.buildConstant(S32, Ty.getSizeInBits() - 1);
  auto LHSign = B.buildAShr(Ty, LHS, SignBitOffset);
  auto RHSign = B.buildAShr(Ty, RHS, SignBitOffset);

  LHS = B.buildAdd(Ty, LHS, LHSign).getReg(0);
  RHS = B.buildAdd(Ty, RHS, RHSign).getReg(0);

  LHS = B.buildXor(Ty, LHS, LHSign).getReg(0);
  RHS = B.buildXor(Ty, RHS, RHSign).getReg(0);

  Register DstDivReg, DstRemReg, TmpDivReg, TmpRemReg;
  switch (MI.getOpcode()) {
  case AMDGPU::G_SDIV: {
    DstDivReg = MI.getOperand(0).getReg();
    break;
  }
  case AMDGPU::G_SREM: {
    DstRemReg = MI.getOperand(0).getReg();
    break;
  }
  case AMDGPU::G_SDIVREM: {
    DstDivReg = MI.getOperand(0).getReg();
    DstRemReg = MI.getOperand(1).getReg();
    break;
  }
  }

  if (DstDivReg) {
    auto Sign = B.buildXor(Ty, LHSign, RHSign).getReg(0);
    auto SignXor = B.buildXor(Ty, TmpDivReg, Sign).getReg(0);
    B.buildSub(DstDivReg, SignXor, Sign);
  }

  if (DstRemReg) {
    auto Sign = LHSign.getReg(0);
    auto SignXor = B.buildXor(Ty, TmpRemReg, Sign).getReg(0);
    B.buildSub(DstRemReg, SignXor, Sign);
  }

  MI.eraseFromParent();
  if (!AllowInaccurateRcp && ResTy != LLT::scalar(16))
    return false;

  if (CLHS->isExactlyValue(1.0)) {
    B.buildIntrinsic(Intrinsic::amdgcn_rcp, Res)
        .addUse(RHS)
        .setMIFlags(Flags);
    MI.eraseFromParent();
    return true;
  }

  if (CLHS->isExactlyValue(-1.0)) {
    auto FNeg = B.buildFNeg(ResTy, RHS, Flags);
    B.buildIntrinsic(Intrinsic::amdgcn_rcp, Res)
        .addUse(FNeg.getReg(0))
        .setMIFlags(Flags);
    MI.eraseFromParent();
    return true;
  }

  if (!AllowInaccurateRcp && (ResTy != LLT::scalar(16) ||

  auto RCP = B.buildIntrinsic(Intrinsic::amdgcn_rcp, {ResTy})

  B.buildFMul(Res, LHS, RCP, Flags);

  MI.eraseFromParent();
  return true;

  if (!AllowInaccurateRcp)
    return false;

  X = B.buildFConstant(ResTy, 1.0).getReg(0);

  Register NegY = IsNegRcp ? Y : B.buildFNeg(ResTy, Y).getReg(0);
  auto One = B.buildFConstant(ResTy, 1.0);

  auto R = B.buildIntrinsic(Intrinsic::amdgcn_rcp, {ResTy})

  R = B.buildFNeg(ResTy, R);

  auto Tmp0 = B.buildFMA(ResTy, NegY, R, One);
  R = B.buildFMA(ResTy, Tmp0, R, R);

  auto Tmp1 = B.buildFMA(ResTy, NegY, R, One);
  R = B.buildFMA(ResTy, Tmp1, R, R);

  B.buildCopy(Res, R);
  MI.eraseFromParent();
  return true;

  auto Ret = B.buildFMul(ResTy, X, R);
  auto Tmp2 = B.buildFMA(ResTy, NegY, Ret, X);

  B.buildFMA(Res, Tmp2, R, Ret);
  MI.eraseFromParent();
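  // f16 division: extend to f32, form Quot = LHS * rcp(RHS), apply three
  // mad/fma error-correction steps, mask the residual error to its
  // sign/exponent bits (0xff800000), then finish with amdgcn_div_fixup on
  // the truncated result.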
  auto LHSExt = B.buildFPExt(S32, LHS, Flags);
  auto RHSExt = B.buildFPExt(S32, RHS, Flags);
  auto NegRHSExt = B.buildFNeg(S32, RHSExt);
  auto Rcp = B.buildIntrinsic(Intrinsic::amdgcn_rcp, {S32})
                 .addUse(RHSExt.getReg(0))

  auto Quot = B.buildFMul(S32, LHSExt, Rcp, Flags);
  MachineInstrBuilder Err;
  if (ST.hasMadMacF32Insts()) {
    Err = B.buildFMAD(S32, NegRHSExt, Quot, LHSExt, Flags);
    Quot = B.buildFMAD(S32, Err, Rcp, Quot, Flags);
    Err = B.buildFMAD(S32, NegRHSExt, Quot, LHSExt, Flags);
  } else {
    Err = B.buildFMA(S32, NegRHSExt, Quot, LHSExt, Flags);
    Quot = B.buildFMA(S32, Err, Rcp, Quot, Flags);
    Err = B.buildFMA(S32, NegRHSExt, Quot, LHSExt, Flags);
  }
  auto Tmp = B.buildFMul(S32, Err, Rcp, Flags);
  Tmp = B.buildAnd(S32, Tmp, B.buildConstant(S32, 0xff800000));
  Quot = B.buildFAdd(S32, Tmp, Quot, Flags);
  auto RDst = B.buildFPTrunc(S16, Quot, Flags);
  B.buildIntrinsic(Intrinsic::amdgcn_div_fixup, Res)
      .addUse(RDst.getReg(0))

  MI.eraseFromParent();
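  // The f32 division sequence needs single-precision denormals enabled
  // around its core; targets with S_DENORM_MODE switch them with one
  // immediate, older targets write the MODE register through S_SETREG.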
  unsigned SPDenormMode =

  if (ST.hasDenormModeInst()) {
    uint32_t DPDenormModeDefault = Mode.fpDenormModeDPValue();

    uint32_t NewDenormModeValue = SPDenormMode | (DPDenormModeDefault << 2);
    B.buildInstr(AMDGPU::S_DENORM_MODE)
        .addImm(NewDenormModeValue);
  } else {
    B.buildInstr(AMDGPU::S_SETREG_IMM32_B32)
        .addImm(SPDenormMode)
  auto One = B.buildFConstant(S32, 1.0f);

  auto DenominatorScaled =
      B.buildIntrinsic(Intrinsic::amdgcn_div_scale, {S32, S1})

  auto NumeratorScaled =
      B.buildIntrinsic(Intrinsic::amdgcn_div_scale, {S32, S1})

  auto ApproxRcp = B.buildIntrinsic(Intrinsic::amdgcn_rcp, {S32})
                       .addUse(DenominatorScaled.getReg(0))

  auto NegDivScale0 = B.buildFNeg(S32, DenominatorScaled, Flags);

  const bool HasDynamicDenormals =

  if (!PreservesDenormals) {
    if (HasDynamicDenormals) {
      B.buildInstr(AMDGPU::S_GETREG_B32)
          .addDef(SavedSPDenormMode)
    }
  }

  auto Fma0 = B.buildFMA(S32, NegDivScale0, ApproxRcp, One, Flags);
  auto Fma1 = B.buildFMA(S32, Fma0, ApproxRcp, ApproxRcp, Flags);
  auto Mul = B.buildFMul(S32, NumeratorScaled, Fma1, Flags);
  auto Fma2 = B.buildFMA(S32, NegDivScale0, Mul, NumeratorScaled, Flags);
  auto Fma3 = B.buildFMA(S32, Fma2, Fma1, Mul, Flags);
  auto Fma4 = B.buildFMA(S32, NegDivScale0, Fma3, NumeratorScaled, Flags);

  if (!PreservesDenormals) {
    if (HasDynamicDenormals) {
      assert(SavedSPDenormMode);
      B.buildInstr(AMDGPU::S_SETREG_B32)
          .addReg(SavedSPDenormMode)
    }
  }

  auto Fmas = B.buildIntrinsic(Intrinsic::amdgcn_div_fmas, {S32})
                  .addUse(Fma4.getReg(0))
                  .addUse(Fma1.getReg(0))
                  .addUse(Fma3.getReg(0))
                  .addUse(NumeratorScaled.getReg(1))

  B.buildIntrinsic(Intrinsic::amdgcn_div_fixup, Res)
      .addUse(Fmas.getReg(0))

  MI.eraseFromParent();
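  // f64 division follows the same div_scale / rcp / fma refinement /
  // div_fmas / div_fixup recipe; when the div_scale condition output is
  // unusable, the scale flag is recomputed by comparing the high words of
  // the scaled and unscaled operands.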
  auto One = B.buildFConstant(S64, 1.0);

  auto DivScale0 = B.buildIntrinsic(Intrinsic::amdgcn_div_scale, {S64, S1})

  auto NegDivScale0 = B.buildFNeg(S64, DivScale0.getReg(0), Flags);

  auto Rcp = B.buildIntrinsic(Intrinsic::amdgcn_rcp, {S64})
                 .addUse(DivScale0.getReg(0))

  auto Fma0 = B.buildFMA(S64, NegDivScale0, Rcp, One, Flags);
  auto Fma1 = B.buildFMA(S64, Rcp, Fma0, Rcp, Flags);
  auto Fma2 = B.buildFMA(S64, NegDivScale0, Fma1, One, Flags);

  auto DivScale1 = B.buildIntrinsic(Intrinsic::amdgcn_div_scale, {S64, S1})

  auto Fma3 = B.buildFMA(S64, Fma1, Fma2, Fma1, Flags);
  auto Mul = B.buildFMul(S64, DivScale1.getReg(0), Fma3, Flags);
  auto Fma4 = B.buildFMA(S64, NegDivScale0, Mul, DivScale1.getReg(0), Flags);

  Register Scale;
  if (!ST.hasUsableDivScaleConditionOutput()) {

    auto NumUnmerge = B.buildUnmerge(S32, LHS);
    auto DenUnmerge = B.buildUnmerge(S32, RHS);
    auto Scale0Unmerge = B.buildUnmerge(S32, DivScale0);
    auto Scale1Unmerge = B.buildUnmerge(S32, DivScale1);

    auto CmpNum = B.buildICmp(CmpInst::ICMP_EQ, S1, NumUnmerge.getReg(1),
                              Scale1Unmerge.getReg(1));
    auto CmpDen = B.buildICmp(CmpInst::ICMP_EQ, S1, DenUnmerge.getReg(1),
                              Scale0Unmerge.getReg(1));
    Scale = B.buildXor(S1, CmpNum, CmpDen).getReg(0);
  } else {
    Scale = DivScale1.getReg(1);
  }

  auto Fmas = B.buildIntrinsic(Intrinsic::amdgcn_div_fmas, {S64})
                  .addUse(Fma4.getReg(0))
                  .addUse(Fma3.getReg(0))
                  .addUse(Mul.getReg(0))

  B.buildIntrinsic(Intrinsic::amdgcn_div_fixup, ArrayRef(Res))
      .addUse(Fmas.getReg(0))

  MI.eraseFromParent();
  auto Mant = B.buildIntrinsic(Intrinsic::amdgcn_frexp_mant, {Ty})

  auto Exp = B.buildIntrinsic(Intrinsic::amdgcn_frexp_exp, {InstrExpTy})

  if (ST.hasFractBug()) {
    auto Fabs = B.buildFAbs(Ty, Val);

    auto Zero = B.buildConstant(InstrExpTy, 0);
    Exp = B.buildSelect(InstrExpTy, IsFinite, Exp, Zero);
    Mant = B.buildSelect(Ty, IsFinite, Mant, Val);
  }

  B.buildCopy(Res0, Mant);
  B.buildSExtOrTrunc(Res1, Exp);

  MI.eraseFromParent();
  auto Abs = B.buildFAbs(S32, RHS, Flags);

  auto C0 = B.buildFConstant(S32, 0x1p+96f);
  auto C1 = B.buildFConstant(S32, 0x1p-32f);
  auto C2 = B.buildFConstant(S32, 1.0f);

  auto Sel = B.buildSelect(S32, CmpRes, C1, C2, Flags);

  auto Mul0 = B.buildFMul(S32, RHS, Sel, Flags);

  auto RCP = B.buildIntrinsic(Intrinsic::amdgcn_rcp, {S32})
                 .addUse(Mul0.getReg(0))

  auto Mul1 = B.buildFMul(S32, LHS, RCP, Flags);

  B.buildFMul(Res, Sel, Mul1, Flags);

  MI.eraseFromParent();
  unsigned Flags = MI.getFlags();
  assert(!ST.has16BitInsts());

  auto Ext = B.buildFPExt(F32, MI.getOperand(1), Flags);
  auto Log2 = B.buildIntrinsic(Intrinsic::amdgcn_sqrt, {F32})
                  .addUse(Ext.getReg(0))

  B.buildFPTrunc(MI.getOperand(0), Log2, Flags);
  MI.eraseFromParent();
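  // f32 sqrt with denormal handling: inputs below 2^-96 are prescaled by
  // 2^32. One path nudges the hardware sqrt result by +/-1 ulp and keeps the
  // correctly rounded value; the other refines an rsq seed with a
  // Goldschmidt-style h/s iteration. The result is rescaled by 2^-16, and
  // zero/inf inputs pass through unchanged.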
  const unsigned Flags = MI.getFlags();

  MI.eraseFromParent();

  auto ScaleThreshold = B.buildFConstant(F32, 0x1.0p-96f);

  auto ScaleUpFactor = B.buildFConstant(F32, 0x1.0p+32f);
  auto ScaledX = B.buildFMul(F32, X, ScaleUpFactor, Flags);
  auto SqrtX = B.buildSelect(F32, NeedScale, ScaledX, X, Flags);

        .addUse(SqrtX.getReg(0))

    auto NegOne = B.buildConstant(I32, -1);
    auto SqrtSNextDown = B.buildAdd(I32, SqrtS, NegOne);

    auto NegSqrtSNextDown = B.buildFNeg(F32, SqrtSNextDown, Flags);
    auto SqrtVP = B.buildFMA(F32, NegSqrtSNextDown, SqrtS, SqrtX, Flags);

    auto PosOne = B.buildConstant(I32, 1);
    auto SqrtSNextUp = B.buildAdd(I32, SqrtS, PosOne);

    auto NegSqrtSNextUp = B.buildFNeg(F32, SqrtSNextUp, Flags);
    auto SqrtVS = B.buildFMA(F32, NegSqrtSNextUp, SqrtS, SqrtX, Flags);

    auto Zero = B.buildFConstant(F32, 0.0f);

    SqrtS =
        B.buildSelect(F32, SqrtVPLE0, SqrtSNextDown, SqrtS, Flags).getReg(0);

    SqrtS =
        B.buildSelect(F32, SqrtVPVSGT0, SqrtSNextUp, SqrtS, Flags).getReg(0);

    auto SqrtR =
        B.buildIntrinsic(Intrinsic::amdgcn_rsq, {F32}).addReg(SqrtX.getReg(0));
    B.buildFMul(SqrtS, SqrtX, SqrtR, Flags);

    auto Half = B.buildFConstant(F32, 0.5f);
    auto SqrtH = B.buildFMul(F32, SqrtR, Half, Flags);
    auto NegSqrtH = B.buildFNeg(F32, SqrtH, Flags);
    auto SqrtE = B.buildFMA(F32, NegSqrtH, SqrtS, Half, Flags);
    SqrtH = B.buildFMA(F32, SqrtH, SqrtE, SqrtH, Flags);
    SqrtS = B.buildFMA(F32, SqrtS, SqrtE, SqrtS, Flags).getReg(0);
    auto NegSqrtS = B.buildFNeg(F32, SqrtS, Flags);
    auto SqrtD = B.buildFMA(F32, NegSqrtS, SqrtS, SqrtX, Flags);
    SqrtS = B.buildFMA(F32, SqrtD, SqrtH, SqrtS, Flags).getReg(0);

  auto ScaleDownFactor = B.buildFConstant(F32, 0x1.0p-16f);

  auto ScaledDown = B.buildFMul(F32, SqrtS, ScaleDownFactor, Flags);

  SqrtS = B.buildSelect(F32, NeedScale, ScaledDown, SqrtS, Flags).getReg(0);

  B.buildSelect(Dst, IsZeroOrInf, SqrtX, SqrtS, Flags);

  MI.eraseFromParent();
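  // f64 sqrt: inputs below 2^-767 are scaled up by 2^256 via ldexp, an rsq
  // seed is refined with two Goldschmidt-style correction steps, and the
  // result is rescaled by 2^-128, with +/-0 and infinity passed through.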
  unsigned Flags = MI.getFlags();

  auto ScaleConstant = B.buildFConstant(F64, 0x1.0p-767);

  ZeroInt = B.buildConstant(S32, 0).getReg(0);

  auto ScaleUpFactor = B.buildConstant(S32, 256);
  auto ScaleUp = B.buildSelect(S32, Scaling, ScaleUpFactor, ZeroInt);
  SqrtX = B.buildFLdexp(F64, X, ScaleUp, Flags).getReg(0);

  auto SqrtY = B.buildIntrinsic(Intrinsic::amdgcn_rsq, {F64}).addReg(SqrtX);

  auto Half = B.buildFConstant(F64, 0.5);
  auto SqrtH0 = B.buildFMul(F64, SqrtY, Half);
  auto SqrtS0 = B.buildFMul(F64, SqrtX, SqrtY);

  auto NegSqrtH0 = B.buildFNeg(F64, SqrtH0);
  auto SqrtR0 = B.buildFMA(F64, NegSqrtH0, SqrtS0, Half);

  auto SqrtS1 = B.buildFMA(F64, SqrtS0, SqrtR0, SqrtS0);
  auto SqrtH1 = B.buildFMA(F64, SqrtH0, SqrtR0, SqrtH0);

  auto NegSqrtS1 = B.buildFNeg(F64, SqrtS1);
  auto SqrtD0 = B.buildFMA(F64, NegSqrtS1, SqrtS1, SqrtX);

  auto SqrtS2 = B.buildFMA(F64, SqrtD0, SqrtH1, SqrtS1);

  Register SqrtRet = SqrtS2.getReg(0);

  auto NegSqrtS2 = B.buildFNeg(F64, SqrtS2);
  auto SqrtD1 = B.buildFMA(F64, NegSqrtS2, SqrtS2, SqrtX);
  auto SqrtD2 = B.buildFMA(F64, SqrtD1, SqrtH1, SqrtS2);

  auto ScaleDownFactor = B.buildConstant(S32, -128);
  auto ScaleDown = B.buildSelect(S32, Scaling, ScaleDownFactor, ZeroInt);
  SqrtRet = B.buildFLdexp(F64, SqrtD2, ScaleDown, Flags).getReg(0);

  auto ZeroFP = B.buildFConstant(F64, 0.0);

  B.buildSelect(Dst, IsZeroOrInf, SqrtX, SqrtRet, Flags);

  MI.eraseFromParent();
  auto Flags = MI.getFlags();

  auto Rsq = B.buildIntrinsic(Intrinsic::amdgcn_rsq, {Ty})

  auto ClampMax = UseIEEE ? B.buildFMinNumIEEE(Ty, Rsq, MaxFlt, Flags)
                          : B.buildFMinNum(Ty, Rsq, MaxFlt, Flags);
  if (UseIEEE)
    B.buildFMaxNumIEEE(Dst, ClampMax, MinFlt, Flags);
  else
    B.buildFMaxNum(Dst, ClampMax, MinFlt, Flags);
  MI.eraseFromParent();
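  // Lane-op legalization: createLaneOp rebuilds each intrinsic with the
  // operand signature its variant expects; values wider than the 32-bit
  // (or 64-bit DPP) split size are unmerged, processed per part, and
  // remerged below.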
  bool IsPermLane16 = IID == Intrinsic::amdgcn_permlane16 ||
                      IID == Intrinsic::amdgcn_permlanex16;
  bool IsSetInactive = IID == Intrinsic::amdgcn_set_inactive ||
                       IID == Intrinsic::amdgcn_set_inactive_chain_arg;
  bool IsPermlaneShuffle = IID == Intrinsic::amdgcn_permlane_bcast ||
                           IID == Intrinsic::amdgcn_permlane_up ||
                           IID == Intrinsic::amdgcn_permlane_down ||
                           IID == Intrinsic::amdgcn_permlane_xor;

  auto LaneOp = B.buildIntrinsic(IID, {VT}).addUse(Src0);
  switch (IID) {
  case Intrinsic::amdgcn_readfirstlane:
  case Intrinsic::amdgcn_permlane64:
    return LaneOp.getReg(0);
  case Intrinsic::amdgcn_readlane:
  case Intrinsic::amdgcn_set_inactive:
  case Intrinsic::amdgcn_set_inactive_chain_arg:
    return LaneOp.addUse(Src1).getReg(0);
  case Intrinsic::amdgcn_writelane:
  case Intrinsic::amdgcn_permlane_bcast:
  case Intrinsic::amdgcn_permlane_up:
  case Intrinsic::amdgcn_permlane_down:
  case Intrinsic::amdgcn_permlane_xor:
    return LaneOp.addUse(Src1).addUse(Src2).getReg(0);
  case Intrinsic::amdgcn_permlane16:
  case Intrinsic::amdgcn_permlanex16: {

    int64_t Src4 = MI.getOperand(6).getImm();
    int64_t Src5 = MI.getOperand(7).getImm();
    return LaneOp.addUse(Src1)
  }
  case Intrinsic::amdgcn_mov_dpp8:
    return LaneOp.addImm(MI.getOperand(3).getImm()).getReg(0);
  case Intrinsic::amdgcn_update_dpp:
    return LaneOp.addUse(Src1)
        .addImm(MI.getOperand(4).getImm())
        .addImm(MI.getOperand(5).getImm())
        .addImm(MI.getOperand(6).getImm())
        .addImm(MI.getOperand(7).getImm())
  }
  if (IID == Intrinsic::amdgcn_readlane || IID == Intrinsic::amdgcn_writelane ||
      IID == Intrinsic::amdgcn_update_dpp || IsSetInactive || IsPermLane16 ||
      IsPermlaneShuffle) {
    Src1 = MI.getOperand(3).getReg();
    if (IID == Intrinsic::amdgcn_writelane || IsPermLane16 ||
        IsPermlaneShuffle) {
      Src2 = MI.getOperand(4).getReg();
    }
  }

  unsigned Size = Ty.getSizeInBits();

  unsigned SplitSize = 32;
  if (IID == Intrinsic::amdgcn_update_dpp && (Size % 64 == 0) &&
      ST.hasDPALU_DPP() &&

  if (Size == SplitSize) {

    Src0 = B.buildAnyExt(S32, Src0).getReg(0);

    if (IID == Intrinsic::amdgcn_update_dpp || IsSetInactive || IsPermLane16)

    if (IID == Intrinsic::amdgcn_writelane)

    Register LaneOpDst = createLaneOp(Src0, Src1, Src2, S32);
    B.buildTrunc(DstReg, LaneOpDst);
    MI.eraseFromParent();
    return true;
  }

  if (Size % SplitSize != 0)
    return false;

  bool NeedsBitcast = false;
  if (Ty.isVector()) {

    if (EltSize == SplitSize) {
      PartialResTy = EltTy;
    } else if (EltSize == 16 || EltSize == 32) {
      unsigned NElem = SplitSize / EltSize;

      NeedsBitcast = true;
    }
  }

  unsigned NumParts = Size / SplitSize;

  if (IID == Intrinsic::amdgcn_update_dpp || IsSetInactive || IsPermLane16)
    Src1Parts = B.buildUnmerge(PartialResTy, Src1);

  if (IID == Intrinsic::amdgcn_writelane)
    Src2Parts = B.buildUnmerge(PartialResTy, Src2);

  for (unsigned i = 0; i < NumParts; ++i) {
    Src0 = Src0Parts.getReg(i);

    if (IID == Intrinsic::amdgcn_update_dpp || IsSetInactive || IsPermLane16)
      Src1 = Src1Parts.getReg(i);

    if (IID == Intrinsic::amdgcn_writelane)
      Src2 = Src2Parts.getReg(i);

    PartialRes.push_back(createLaneOp(Src0, Src1, Src2, PartialResTy));
  }

    B.buildBitcast(DstReg, B.buildMergeLikeInstr(

    B.buildMergeLikeInstr(DstReg, PartialRes);

  MI.eraseFromParent();
      ST.getTargetLowering()->getImplicitParameterOffset(

  B.buildObjectPtrOffset(DstReg, KernargPtrReg,
                         B.buildConstant(IdxTy, Offset).getReg(0));
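  // amdgcn_make_buffer_rsrc: splice base pointer, stride, number of records
  // and flags into the 128-bit resource descriptor; the 45-bit num-records
  // layout distributes the fields across both 64-bit halves.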
  Register Pointer = MI.getOperand(2).getReg();

  Register NumRecords = MI.getOperand(4).getReg();

  B.setInsertPt(B.getMBB(), ++B.getInsertPt());

  auto ExtStride = B.buildAnyExt(S32, Stride);

  if (ST.has45BitNumRecordsBufferResource()) {

    auto PointerInt = B.buildPtrToInt(PtrIntTy, Pointer);
    auto ExtPointer = B.buildAnyExtOrTrunc(S64, PointerInt);
    auto NumRecordsLHS = B.buildShl(S64, NumRecords, B.buildConstant(S32, 57));
    Register LowHalf = B.buildOr(S64, ExtPointer, NumRecordsLHS).getReg(0);

    auto NumRecordsRHS = B.buildLShr(S64, NumRecords, B.buildConstant(S32, 7));
    auto ShiftedStride = B.buildShl(S32, ExtStride, B.buildConstant(S32, 12));
    auto ExtShiftedStride =
        B.buildMergeValues(S64, {Zero, ShiftedStride.getReg(0)});
    auto ShiftedFlags = B.buildShl(S32, Flags, B.buildConstant(S32, 28));
    auto ExtShiftedFlags =
        B.buildMergeValues(S64, {Zero, ShiftedFlags.getReg(0)});
    auto CombinedFields = B.buildOr(S64, NumRecordsRHS, ExtShiftedStride);
    Register HighHalf =
        B.buildOr(S64, CombinedFields, ExtShiftedFlags).getReg(0);
    B.buildMergeValues(Result, {LowHalf, HighHalf});
  } else {
    NumRecords = B.buildTrunc(S32, NumRecords).getReg(0);
    auto Unmerge = B.buildUnmerge(S32, Pointer);
    auto LowHalf = Unmerge.getReg(0);
    auto HighHalf = Unmerge.getReg(1);

    auto AndMask = B.buildConstant(S32, 0x0000ffff);
    auto Masked = B.buildAnd(S32, HighHalf, AndMask);
    auto ShiftConst = B.buildConstant(S32, 16);
    auto ShiftedStride = B.buildShl(S32, ExtStride, ShiftConst);
    auto NewHighHalf = B.buildOr(S32, Masked, ShiftedStride);
    Register NewHighHalfReg = NewHighHalf.getReg(0);
    B.buildMergeValues(Result, {LowHalf, NewHighHalfReg, NumRecords, Flags});
  }

  MI.eraseFromParent();
  MI.eraseFromParent();

  std::optional<uint32_t> KnownSize =

  if (KnownSize.has_value())
    B.buildConstant(DstReg, *KnownSize);

  MI.eraseFromParent();
                                          unsigned AddrSpace) const {

  auto Unmerge = B.buildUnmerge(S32, MI.getOperand(2).getReg());

      ST.hasGloballyAddressableScratch()) {
    Register FlatScratchBaseHi =
        B.buildInstr(AMDGPU::S_MOV_B32, {S32},
                     {Register(AMDGPU::SRC_FLAT_SCRATCH_BASE_HI)})
            .getReg(0);
    MRI.setRegClass(FlatScratchBaseHi, &AMDGPU::SReg_32RegClass);

    Register XOR = B.buildXor(S32, Hi32, FlatScratchBaseHi).getReg(0);

                 B.buildConstant(S32, 1u << 26));

  MI.eraseFromParent();
std::pair<Register, unsigned>

  bool CheckNUW = ST.hasGFX1250Insts();
                                                  MRI, OrigOffset,
                                                  nullptr, CheckNUW);

    BaseReg = B.buildPtrToInt(MRI.getType(OrigOffset), BaseReg).getReg(0);

  unsigned Overflow = ImmOffset & ~MaxImm;
  ImmOffset -= Overflow;
  if ((int32_t)Overflow < 0) {
    Overflow += ImmOffset;

  if (Overflow != 0) {
    if (!BaseReg) {
      BaseReg = B.buildConstant(S32, Overflow).getReg(0);
    } else {
      auto OverflowVal = B.buildConstant(S32, Overflow);
      BaseReg = B.buildAdd(S32, BaseReg, OverflowVal).getReg(0);
    }
  }

  if (!BaseReg)
    BaseReg = B.buildConstant(S32, 0).getReg(0);

  return std::pair(BaseReg, ImmOffset);
                                               bool ImageStore) const {

  if (ST.hasUnpackedD16VMem()) {
    auto Unmerge = B.buildUnmerge(S16, Reg);

    for (int I = 0, E = Unmerge->getNumOperands() - 1; I != E; ++I)
      WideRegs.push_back(B.buildAnyExt(S32, Unmerge.getReg(I)).getReg(0));

  if (ImageStore && ST.hasImageStoreD16Bug()) {

      Reg = B.buildBitcast(S32, Reg).getReg(0);

      PackedRegs.resize(2, B.buildUndef(S32).getReg(0));

      auto Unmerge = B.buildUnmerge(S16, Reg);
      for (int I = 0, E = Unmerge->getNumOperands() - 1; I != E; ++I)
        PackedRegs.push_back(Unmerge.getReg(I));
      PackedRegs.resize(6, B.buildUndef(S16).getReg(0));

      auto Unmerge = B.buildUnmerge(S32, Reg);
      for (int I = 0, E = Unmerge->getNumOperands() - 1; I != E; ++I)
        PackedRegs.push_back(Unmerge.getReg(I));
      PackedRegs.resize(4, B.buildUndef(S32).getReg(0));
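  // Buffer stores: the source is bitcast to a register-friendly type, the
  // raw/struct opcode is chosen from typed/format/D16-ness and memory size,
  // and the instruction is rebuilt with split voffset/soffset operands and
  // the memory operand attached.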
                                                  bool IsFormat) const {

  VData = B.buildBitcast(Ty, VData).getReg(0);

  if (Ty.isVector()) {
    if (Ty.getElementType() == S16 && Ty.getNumElements() <= 4) {

                                                 bool IsFormat) const {

  const bool IsD16 = IsFormat && (EltTy.getSizeInBits() == 16);

  const unsigned NumVIndexOps = IsTyped ? 8 : 7;

  const bool HasVIndex = MI.getNumOperands() == NumVIndexOps;
  if (HasVIndex) {
    VIndex = MI.getOperand(3).getReg();
  } else {
    VIndex = B.buildConstant(S32, 0).getReg(0);
  }

  Register VOffset = MI.getOperand(3 + OpOffset).getReg();
  Register SOffset = MI.getOperand(4 + OpOffset).getReg();

  if (IsTyped)
    Format = MI.getOperand(5 + OpOffset).getImm();

  unsigned AuxiliaryData = MI.getOperand(5 + OpOffset).getImm();

  if (IsTyped) {
    Opc = IsD16 ? AMDGPU::G_AMDGPU_TBUFFER_STORE_FORMAT_D16
                : AMDGPU::G_AMDGPU_TBUFFER_STORE_FORMAT;
  } else if (IsFormat) {
    Opc = IsD16 ? AMDGPU::G_AMDGPU_BUFFER_STORE_FORMAT_D16
                : AMDGPU::G_AMDGPU_BUFFER_STORE_FORMAT;

    Opc = AMDGPU::G_AMDGPU_BUFFER_STORE_BYTE;

    Opc = AMDGPU::G_AMDGPU_BUFFER_STORE_SHORT;

    Opc = AMDGPU::G_AMDGPU_BUFFER_STORE;

  auto MIB = B.buildInstr(Opc)

  MIB.addImm(AuxiliaryData)
      .addImm(HasVIndex ? -1 : 0)
      .addMemOperand(MMO);

  MI.eraseFromParent();
                             unsigned ImmOffset, unsigned Format,

  auto MIB = B.buildInstr(Opc)

  MIB.addImm(AuxiliaryData)
      .addImm(HasVIndex ? -1 : 0)
      .addMemOperand(MMO);
                                                bool IsTyped) const {

  assert(MI.getNumExplicitDefs() == 1 || MI.getNumExplicitDefs() == 2);
  bool IsTFE = MI.getNumExplicitDefs() == 2;
  if (IsTFE)
    StatusDst = MI.getOperand(1).getReg();

  Register RSrc = MI.getOperand(2 + OpOffset).getReg();

  const unsigned NumVIndexOps = IsTyped ? 8 : 7;

  const bool HasVIndex = MI.getNumOperands() == NumVIndexOps + OpOffset;
  if (HasVIndex) {
    VIndex = MI.getOperand(3 + OpOffset).getReg();
  } else {
    VIndex = B.buildConstant(S32, 0).getReg(0);
  }

  Register VOffset = MI.getOperand(3 + OpOffset).getReg();
  Register SOffset = MI.getOperand(4 + OpOffset).getReg();

  if (IsTyped)
    Format = MI.getOperand(5 + OpOffset).getImm();

  unsigned AuxiliaryData = MI.getOperand(5 + OpOffset).getImm();

    Dst = MI.getOperand(0).getReg();
    B.setInsertPt(B.getMBB(), MI);

    Dst = MI.getOperand(0).getReg();
    B.setInsertPt(B.getMBB(), MI);

  const bool IsD16 = IsFormat && (EltTy.getSizeInBits() == 16);
  const bool Unpacked = ST.hasUnpackedD16VMem();

  if (IsTyped) {
    Opc = IsD16 ? AMDGPU::G_AMDGPU_TBUFFER_LOAD_FORMAT_D16
                : AMDGPU::G_AMDGPU_TBUFFER_LOAD_FORMAT;
  } else if (IsFormat) {

      Opc = AMDGPU::G_AMDGPU_BUFFER_LOAD_FORMAT_D16;

      Opc = IsTFE ? AMDGPU::G_AMDGPU_BUFFER_LOAD_FORMAT_TFE
                  : AMDGPU::G_AMDGPU_BUFFER_LOAD_FORMAT;

      Opc = IsTFE ? AMDGPU::G_AMDGPU_BUFFER_LOAD_UBYTE_TFE
                  : AMDGPU::G_AMDGPU_BUFFER_LOAD_UBYTE;

      Opc = IsTFE ? AMDGPU::G_AMDGPU_BUFFER_LOAD_USHORT_TFE
                  : AMDGPU::G_AMDGPU_BUFFER_LOAD_USHORT;

      Opc = IsTFE ? AMDGPU::G_AMDGPU_BUFFER_LOAD_TFE
                  : AMDGPU::G_AMDGPU_BUFFER_LOAD;

  if (IsTFE) {
    unsigned NumValueDWords = divideCeil(Ty.getSizeInBits(), 32);
    unsigned NumLoadDWords = NumValueDWords + 1;

    Register LoadDstReg = B.getMRI()->createGenericVirtualRegister(LoadTy);
                    Format, AuxiliaryData, MMO, IsTyped, HasVIndex, B);

      Register ExtDst = B.getMRI()->createGenericVirtualRegister(S32);
      B.buildUnmerge({ExtDst, StatusDst}, LoadDstReg);
      B.buildTrunc(Dst, ExtDst);
    } else if (NumValueDWords == 1) {
      B.buildUnmerge({Dst, StatusDst}, LoadDstReg);
    } else {
      for (unsigned I = 0; I != NumValueDWords; ++I)
        LoadElts.push_back(B.getMRI()->createGenericVirtualRegister(S32));

      B.buildUnmerge(LoadElts, LoadDstReg);

      B.buildMergeLikeInstr(Dst, LoadElts);
    }
  } else if (
             (IsD16 && !Ty.isVector())) {
    Register LoadDstReg = B.getMRI()->createGenericVirtualRegister(S32);
                    Format, AuxiliaryData, MMO, IsTyped, HasVIndex, B);
    B.setInsertPt(B.getMBB(), ++B.getInsertPt());
    B.buildTrunc(Dst, LoadDstReg);
  } else if (Unpacked && IsD16 && Ty.isVector()) {

    Register LoadDstReg = B.getMRI()->createGenericVirtualRegister(UnpackedTy);
                    Format, AuxiliaryData, MMO, IsTyped, HasVIndex, B);
    B.setInsertPt(B.getMBB(), ++B.getInsertPt());

    auto Unmerge = B.buildUnmerge(S32, LoadDstReg);

    for (unsigned I = 0, N = Unmerge->getNumOperands() - 1; I != N; ++I)
      Repack.push_back(B.buildTrunc(EltTy, Unmerge.getReg(I)).getReg(0));
    B.buildMergeLikeInstr(Dst, Repack);
  } else {
                    AuxiliaryData, MMO, IsTyped, HasVIndex, B);
  }

  MI.eraseFromParent();
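  // Each raw/struct (ptr and non-ptr) buffer atomic intrinsic maps onto a
  // single shared G_AMDGPU_BUFFER_ATOMIC_* pseudo.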
  case Intrinsic::amdgcn_raw_buffer_atomic_swap:
  case Intrinsic::amdgcn_raw_ptr_buffer_atomic_swap:
  case Intrinsic::amdgcn_struct_buffer_atomic_swap:
  case Intrinsic::amdgcn_struct_ptr_buffer_atomic_swap:
    return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_SWAP;
  case Intrinsic::amdgcn_raw_buffer_atomic_add:
  case Intrinsic::amdgcn_raw_ptr_buffer_atomic_add:
  case Intrinsic::amdgcn_struct_buffer_atomic_add:
  case Intrinsic::amdgcn_struct_ptr_buffer_atomic_add:
    return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_ADD;
  case Intrinsic::amdgcn_raw_buffer_atomic_sub:
  case Intrinsic::amdgcn_raw_ptr_buffer_atomic_sub:
  case Intrinsic::amdgcn_struct_buffer_atomic_sub:
  case Intrinsic::amdgcn_struct_ptr_buffer_atomic_sub:
    return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_SUB;
  case Intrinsic::amdgcn_raw_buffer_atomic_smin:
  case Intrinsic::amdgcn_raw_ptr_buffer_atomic_smin:
  case Intrinsic::amdgcn_struct_buffer_atomic_smin:
  case Intrinsic::amdgcn_struct_ptr_buffer_atomic_smin:
    return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_SMIN;
  case Intrinsic::amdgcn_raw_buffer_atomic_umin:
  case Intrinsic::amdgcn_raw_ptr_buffer_atomic_umin:
  case Intrinsic::amdgcn_struct_buffer_atomic_umin:
  case Intrinsic::amdgcn_struct_ptr_buffer_atomic_umin:
    return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_UMIN;
  case Intrinsic::amdgcn_raw_buffer_atomic_smax:
  case Intrinsic::amdgcn_raw_ptr_buffer_atomic_smax:
  case Intrinsic::amdgcn_struct_buffer_atomic_smax:
  case Intrinsic::amdgcn_struct_ptr_buffer_atomic_smax:
    return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_SMAX;
  case Intrinsic::amdgcn_raw_buffer_atomic_umax:
  case Intrinsic::amdgcn_raw_ptr_buffer_atomic_umax:
  case Intrinsic::amdgcn_struct_buffer_atomic_umax:
  case Intrinsic::amdgcn_struct_ptr_buffer_atomic_umax:
    return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_UMAX;
  case Intrinsic::amdgcn_raw_buffer_atomic_and:
  case Intrinsic::amdgcn_raw_ptr_buffer_atomic_and:
  case Intrinsic::amdgcn_struct_buffer_atomic_and:
  case Intrinsic::amdgcn_struct_ptr_buffer_atomic_and:
    return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_AND;
  case Intrinsic::amdgcn_raw_buffer_atomic_or:
  case Intrinsic::amdgcn_raw_ptr_buffer_atomic_or:
  case Intrinsic::amdgcn_struct_buffer_atomic_or:
  case Intrinsic::amdgcn_struct_ptr_buffer_atomic_or:
    return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_OR;
  case Intrinsic::amdgcn_raw_buffer_atomic_xor:
  case Intrinsic::amdgcn_raw_ptr_buffer_atomic_xor:
  case Intrinsic::amdgcn_struct_buffer_atomic_xor:
  case Intrinsic::amdgcn_struct_ptr_buffer_atomic_xor:
    return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_XOR;
  case Intrinsic::amdgcn_raw_buffer_atomic_inc:
  case Intrinsic::amdgcn_raw_ptr_buffer_atomic_inc:
  case Intrinsic::amdgcn_struct_buffer_atomic_inc:
  case Intrinsic::amdgcn_struct_ptr_buffer_atomic_inc:
    return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_INC;
  case Intrinsic::amdgcn_raw_buffer_atomic_dec:
  case Intrinsic::amdgcn_raw_ptr_buffer_atomic_dec:
  case Intrinsic::amdgcn_struct_buffer_atomic_dec:
  case Intrinsic::amdgcn_struct_ptr_buffer_atomic_dec:
    return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_DEC;
  case Intrinsic::amdgcn_raw_buffer_atomic_cmpswap:
  case Intrinsic::amdgcn_raw_ptr_buffer_atomic_cmpswap:
  case Intrinsic::amdgcn_struct_buffer_atomic_cmpswap:
  case Intrinsic::amdgcn_struct_ptr_buffer_atomic_cmpswap:
    return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_CMPSWAP;
  case Intrinsic::amdgcn_raw_buffer_atomic_fadd:
  case Intrinsic::amdgcn_raw_ptr_buffer_atomic_fadd:
  case Intrinsic::amdgcn_struct_buffer_atomic_fadd:
  case Intrinsic::amdgcn_struct_ptr_buffer_atomic_fadd:
    return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_FADD;
  case Intrinsic::amdgcn_raw_buffer_atomic_fmin:
  case Intrinsic::amdgcn_raw_ptr_buffer_atomic_fmin:
  case Intrinsic::amdgcn_struct_buffer_atomic_fmin:
  case Intrinsic::amdgcn_struct_ptr_buffer_atomic_fmin:
    return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_FMIN;
  case Intrinsic::amdgcn_raw_buffer_atomic_fmax:
  case Intrinsic::amdgcn_raw_ptr_buffer_atomic_fmax:
  case Intrinsic::amdgcn_struct_buffer_atomic_fmax:
  case Intrinsic::amdgcn_struct_ptr_buffer_atomic_fmax:
    return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_FMAX;
  case Intrinsic::amdgcn_raw_buffer_atomic_sub_clamp_u32:
  case Intrinsic::amdgcn_raw_ptr_buffer_atomic_sub_clamp_u32:
  case Intrinsic::amdgcn_struct_buffer_atomic_sub_clamp_u32:
  case Intrinsic::amdgcn_struct_ptr_buffer_atomic_sub_clamp_u32:
    return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_SUB_CLAMP_U32;
  case Intrinsic::amdgcn_raw_buffer_atomic_cond_sub_u32:
  case Intrinsic::amdgcn_raw_ptr_buffer_atomic_cond_sub_u32:
  case Intrinsic::amdgcn_struct_buffer_atomic_cond_sub_u32:
  case Intrinsic::amdgcn_struct_ptr_buffer_atomic_cond_sub_u32:
    return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_COND_SUB_U32;
  const bool IsCmpSwap =
      IID == Intrinsic::amdgcn_raw_buffer_atomic_cmpswap ||
      IID == Intrinsic::amdgcn_struct_buffer_atomic_cmpswap ||
      IID == Intrinsic::amdgcn_raw_ptr_buffer_atomic_cmpswap ||
      IID == Intrinsic::amdgcn_struct_ptr_buffer_atomic_cmpswap;

    CmpVal = MI.getOperand(3).getReg();

  Register RSrc = MI.getOperand(3 + OpOffset).getReg();
  const unsigned NumVIndexOps = IsCmpSwap ? 9 : 8;

  const bool HasVIndex = MI.getNumOperands() == NumVIndexOps;
  if (HasVIndex) {
    VIndex = MI.getOperand(4 + OpOffset).getReg();
  } else {
    VIndex = B.buildConstant(LLT::scalar(32), 0).getReg(0);
  }

  Register VOffset = MI.getOperand(4 + OpOffset).getReg();
  Register SOffset = MI.getOperand(5 + OpOffset).getReg();
  unsigned AuxiliaryData = MI.getOperand(6 + OpOffset).getImm();

      .addImm(AuxiliaryData)
      .addImm(HasVIndex ? -1 : 0)
      .addMemOperand(MMO);

  MI.eraseFromParent();
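  // A16/G16 image addresses: 16-bit coordinates are packed pairwise into
  // v2s16 registers (odd tails padded with undef); targets without NSA then
  // concatenate all address registers into one vector operand.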
                                      bool IsA16, bool IsG16) {

      (B.getMRI()->getType(AddrReg) == S16)) {

      B.buildBuildVector(V2S16, {AddrReg, B.buildUndef(S16).getReg(0)})

          "Bias needs to be converted to 16 bit in A16 mode");

      AddrReg = B.buildBitcast(V2S16, AddrReg).getReg(0);

      if (((I + 1) >= EndIdx) ||

          !MI.getOperand(ArgOffset + I + 1).isReg()) {
        B.buildBuildVector(V2S16, {AddrReg, B.buildUndef(S16).getReg(0)})

            V2S16, {AddrReg, MI.getOperand(ArgOffset + I + 1).getReg()})

                                     int DimIdx, int NumVAddrs) {

  for (int I = 0; I != NumVAddrs; ++I) {

    if (SrcOp.isReg()) {

  int NumAddrRegs = AddrRegs.size();
  if (NumAddrRegs != 1) {

    MI.getOperand(DimIdx).setReg(VAddr.getReg(0));
  }

  for (int I = 1; I != NumVAddrs; ++I) {

    MI.getOperand(DimIdx + I).setReg(AMDGPU::NoRegister);
  }
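  // legalizeImageIntrinsic: normalize the dmask, pack gradients and
  // addresses, enforce NSA/partial-NSA register limits, and round the result
  // type up to whole dwords, unpacking D16 and TFE status data afterwards.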
  const unsigned NumDefs = MI.getNumExplicitDefs();
  const unsigned ArgOffset = NumDefs + 1;
  bool IsTFE = NumDefs == 2;

  VData = MI.getOperand(NumDefs == 0 ? 1 : 0).getReg();

  const bool IsAtomicPacked16Bit =
      (BaseOpcode->BaseOpcode == AMDGPU::IMAGE_ATOMIC_PK_ADD_F16 ||
       BaseOpcode->BaseOpcode == AMDGPU::IMAGE_ATOMIC_PK_ADD_BF16);

  const bool IsG16 =
      ST.hasG16() ? (BaseOpcode->Gradients && GradTy == S16) : GradTy == S16;
  const bool IsA16 = AddrTy == S16;
  const bool IsD16 = !IsAtomicPacked16Bit && Ty.getScalarType() == S16;

  if (!BaseOpcode->Atomic) {
    DMask = MI.getOperand(ArgOffset + Intr->DMaskIndex).getImm();
  } else if (DMask != 0) {
  } else if (!IsTFE && !BaseOpcode->Store) {
    B.buildUndef(MI.getOperand(0));
    MI.eraseFromParent();
    return true;
  }

  const unsigned StoreOpcode = IsD16 ? AMDGPU::G_AMDGPU_INTRIN_IMAGE_STORE_D16
                                     : AMDGPU::G_AMDGPU_INTRIN_IMAGE_STORE;
  const unsigned LoadOpcode = IsD16 ? AMDGPU::G_AMDGPU_INTRIN_IMAGE_LOAD_D16
                                    : AMDGPU::G_AMDGPU_INTRIN_IMAGE_LOAD;
  unsigned NewOpcode = LoadOpcode;
  if (BaseOpcode->Store)
    NewOpcode = StoreOpcode;

    NewOpcode = AMDGPU::G_AMDGPU_INTRIN_IMAGE_LOAD_NORET;

  MI.setDesc(B.getTII().get(NewOpcode));

  if (IsTFE && DMask == 0) {

    MI.getOperand(ArgOffset + Intr->DMaskIndex).setImm(DMask);
  }

  if (BaseOpcode->Atomic) {

    if (Ty.isVector() && !IsAtomicPacked16Bit)

      auto Concat = B.buildBuildVector(PackedTy, {VData0, VData1});
      MI.getOperand(2).setReg(Concat.getReg(0));
      MI.getOperand(3).setReg(AMDGPU::NoRegister);
  }

  unsigned CorrectedNumVAddrs = Intr->NumVAddrs;

  if (BaseOpcode->Gradients && !ST.hasG16() && (IsA16 != IsG16)) {

  if (IsA16 && !ST.hasA16()) {

  const unsigned NSAMaxSize = ST.getNSAMaxSize(BaseOpcode->Sampler);
  const unsigned HasPartialNSA = ST.hasPartialNSAEncoding();

  if (IsA16 || IsG16) {

    const bool UseNSA = ST.hasNSAEncoding() &&
                        PackedRegs.size() >= ST.getNSAThreshold(MF) &&
                        (PackedRegs.size() <= NSAMaxSize || HasPartialNSA);
    const bool UsePartialNSA =
        UseNSA && HasPartialNSA && PackedRegs.size() > NSAMaxSize;

    if (UsePartialNSA) {

      auto Concat = B.buildConcatVectors(
          PackedAddrTy, ArrayRef(PackedRegs).slice(NSAMaxSize - 1));
      PackedRegs[NSAMaxSize - 1] = Concat.getReg(0);
      PackedRegs.resize(NSAMaxSize);
    } else if (!UseNSA && PackedRegs.size() > 1) {

      auto Concat = B.buildConcatVectors(PackedAddrTy, PackedRegs);
      PackedRegs[0] = Concat.getReg(0);

    const unsigned NumPacked = PackedRegs.size();

      if (!SrcOp.isReg()) {

      SrcOp.setReg(AMDGPU::NoRegister);

    const bool UseNSA = ST.hasNSAEncoding() &&
                        CorrectedNumVAddrs >= ST.getNSAThreshold(MF) &&
                        (CorrectedNumVAddrs <= NSAMaxSize || HasPartialNSA);
    const bool UsePartialNSA =
        UseNSA && HasPartialNSA && CorrectedNumVAddrs > NSAMaxSize;

    if (UsePartialNSA) {
                               ArgOffset + Intr->VAddrStart + NSAMaxSize - 1,
    } else if (!UseNSA && Intr->NumVAddrs > 1) {

  if (!Ty.isVector() || !IsD16)

  if (RepackedReg != VData) {
    MI.getOperand(1).setReg(RepackedReg);
  }

  const int NumElts = Ty.isVector() ? Ty.getNumElements() : 1;

  if (NumElts < DMaskLanes)

  if (NumElts > 4 || DMaskLanes > 4)

  const unsigned AdjustedNumElts = DMaskLanes == 0 ? 1 : DMaskLanes;
  const LLT AdjustedTy =

  if (IsD16 && ST.hasUnpackedD16VMem()) {

  unsigned RoundedElts = (AdjustedTy.getSizeInBits() + 31) / 32;
  unsigned RoundedSize = 32 * RoundedElts;

    RegTy = !IsTFE && EltSize == 16 ? V2S16 : S32;

  if (!IsTFE && (RoundedTy == Ty || !Ty.isVector()))

  B.setInsertPt(*MI.getParent(), ++MI.getIterator());

  const LLT LoadResultTy = IsTFE ? TFETy : RoundedTy;
  const int ResultNumRegs = LoadResultTy.getSizeInBits() / 32;

  MI.getOperand(0).setReg(NewResultReg);

    Dst1Reg = MI.getOperand(1).getReg();

    MI.removeOperand(1);

    B.buildUnmerge({DstReg, Dst1Reg}, NewResultReg);

  const int NumDataRegs = IsTFE ? ResultNumRegs - 1 : ResultNumRegs;

  if (ResultNumRegs == 1) {

    ResultRegs[0] = NewResultReg;
  } else {

    for (int I = 0; I != NumDataRegs; ++I)

    B.buildUnmerge(ResultRegs, NewResultReg);

    ResultRegs.resize(NumDataRegs);
  }

  if (IsD16 && !Ty.isVector()) {
    B.buildTrunc(DstReg, ResultRegs[0]);
    return true;
  }

  if (Ty == V2S16 && NumDataRegs == 1 && !ST.hasUnpackedD16VMem()) {
    B.buildBitcast(DstReg, ResultRegs[0]);
    return true;
  }

    if (RegTy != V2S16 && !ST.hasUnpackedD16VMem()) {

      Reg = B.buildBitcast(V2S16, Reg).getReg(0);
    } else if (ST.hasUnpackedD16VMem()) {

      Reg = B.buildTrunc(S16, Reg).getReg(0);
    }

  auto padWithUndef = [&](LLT Ty, int NumElts) {

    for (int I = 0; I != NumElts; ++I)

  };

    padWithUndef(ResTy, NumElts - ResultRegs.size());
    B.buildBuildVector(DstReg, ResultRegs);
    return true;

  assert(!ST.hasUnpackedD16VMem() && ResTy == V2S16);
  const int RegsToCover = (Ty.getSizeInBits() + 31) / 32;

  if (ResultRegs.size() == 1) {
    NewResultReg = ResultRegs[0];
  } else if (ResultRegs.size() == 2) {

    NewResultReg = B.buildConcatVectors(V4S16, ResultRegs).getReg(0);
  }

    B.buildDeleteTrailingVectorElements(DstReg, NewResultReg);

    B.buildPadVectorWithUndefElements(DstReg, NewResultReg);

  padWithUndef(ResTy, RegsToCover - ResultRegs.size());
  B.buildConcatVectors(DstReg, ResultRegs);
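  // amdgcn_s_buffer_load: sub-dword results are widened on subtargets with
  // scalar subword loads and truncated back after the memory operand is
  // attached; the prefetch variant only needs its opcode swapped.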
  Register OrigDst = MI.getOperand(0).getReg();

  LLT Ty = B.getMRI()->getType(OrigDst);
  unsigned Size = Ty.getSizeInBits();

  if (Size < 32 && ST.hasScalarSubwordLoads()) {

    Opc = Size == 8 ? AMDGPU::G_AMDGPU_S_BUFFER_LOAD_UBYTE
                    : AMDGPU::G_AMDGPU_S_BUFFER_LOAD_USHORT;

    Dst = B.getMRI()->createGenericVirtualRegister(LLT::scalar(32));
  } else {
    Opc = AMDGPU::G_AMDGPU_S_BUFFER_LOAD;
  }

  B.setInsertPt(B.getMBB(), MI);

  B.setInsertPt(B.getMBB(), MI);

  MI.setDesc(B.getTII().get(Opc));
  MI.removeOperand(1);

  const unsigned MemSize = (Size + 7) / 8;
  const Align MemAlign = B.getDataLayout().getABITypeAlign(

  MI.addMemOperand(MF, MMO);
  if (Dst != OrigDst) {
    MI.getOperand(0).setReg(Dst);
    B.setInsertPt(B.getMBB(), ++B.getInsertPt());
    B.buildTrunc(OrigDst, Dst);
  }

  MI.setDesc(B.getTII().get(AMDGPU::G_AMDGPU_S_BUFFER_PREFETCH));
  MI.removeOperand(0);
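  // Trap lowering differs by ABI: an endpgm-based trap block, the HSA
  // queue-pointer handshake through SGPR0_SGPR1 before S_TRAP, or a bare
  // S_TRAP (with the simulated-trap workaround on affected subtargets).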
  if (!ST.hasTrapHandler() ||

  return ST.supportsGetDoorbellID() ?

  MI.eraseFromParent();

  BuildMI(*TrapBB, TrapBB->end(), DL, B.getTII().get(AMDGPU::S_ENDPGM))

  BuildMI(BB, &MI, DL, B.getTII().get(AMDGPU::S_CBRANCH_EXECNZ))

  MI.eraseFromParent();

  Register SGPR01(AMDGPU::SGPR0_SGPR1);

      ST.getTargetLowering()->getImplicitParameterOffset(B.getMF(), Param);

    B.buildObjectPtrOffset(LoadAddr, KernargPtrReg,

    Register Temp = B.buildLoad(S64, LoadAddr, *MMO).getReg(0);
    B.buildCopy(SGPR01, Temp);
    B.buildInstr(AMDGPU::S_TRAP)

    MI.eraseFromParent();

  B.buildCopy(SGPR01, LiveIn);
  B.buildInstr(AMDGPU::S_TRAP)

  MI.eraseFromParent();

  if (ST.hasPrivEnabledTrap2NopBug()) {
    ST.getInstrInfo()->insertSimulatedTrap(MRI, B.getMBB(), MI,

    MI.eraseFromParent();
  }

  B.buildInstr(AMDGPU::S_TRAP)

  MI.eraseFromParent();

  if (!ST.hasTrapHandler() ||

        Fn, "debugtrap handler not supported", MI.getDebugLoc(), DS_Warning));

  B.buildInstr(AMDGPU::S_TRAP)

  MI.eraseFromParent();
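  // amdgcn_image_bvh_intersect_ray: node pointer, extent, origin and
  // (inverse) direction are repacked into the gfx10/gfx11/gfx12 vaddr
  // layouts; A16 merges direction components into v2s16 words, and non-NSA
  // targets collapse all operands into a single merged register.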
  Register NodePtr = MI.getOperand(2).getReg();
  Register RayExtent = MI.getOperand(3).getReg();
  Register RayOrigin = MI.getOperand(4).getReg();

  Register RayInvDir = MI.getOperand(6).getReg();

  if (!ST.hasGFX10_AEncoding()) {

        Fn, "intrinsic not supported on subtarget", MI.getDebugLoc()));

  const unsigned NumVDataDwords = 4;
  const unsigned NumVAddrDwords = IsA16 ? (Is64 ? 9 : 8) : (Is64 ? 12 : 11);
  const unsigned NumVAddrs = IsGFX11Plus ? (IsA16 ? 4 : 5) : NumVAddrDwords;
  const bool UseNSA =
      IsGFX12Plus || (ST.hasNSAEncoding() && NumVAddrs <= ST.getNSAMaxSize());

  const unsigned BaseOpcodes[2][2] = {
      {AMDGPU::IMAGE_BVH_INTERSECT_RAY, AMDGPU::IMAGE_BVH_INTERSECT_RAY_a16},
      {AMDGPU::IMAGE_BVH64_INTERSECT_RAY,
       AMDGPU::IMAGE_BVH64_INTERSECT_RAY_a16}};

        IsGFX12Plus ? AMDGPU::MIMGEncGfx12
        : IsGFX11   ? AMDGPU::MIMGEncGfx11NSA
                    : AMDGPU::MIMGEncGfx10NSA,
        NumVDataDwords, NumVAddrDwords);

        IsGFX11 ? AMDGPU::MIMGEncGfx11Default
                : AMDGPU::MIMGEncGfx10Default,
        NumVDataDwords, NumVAddrDwords);

  if (UseNSA && IsGFX11Plus) {
    auto packLanes = [&](Register Src) {
      auto Unmerge = B.buildUnmerge({S32, S32, S32}, Src);
      auto Merged = B.buildMergeLikeInstr(
          V3S32, {Unmerge.getReg(0), Unmerge.getReg(1), Unmerge.getReg(2)});
      Ops.push_back(Merged.getReg(0));
    };

    Ops.push_back(NodePtr);
    Ops.push_back(RayExtent);
    packLanes(RayOrigin);

    if (IsA16) {
      auto UnmergeRayDir = B.buildUnmerge({S16, S16, S16}, RayDir);
      auto UnmergeRayInvDir = B.buildUnmerge({S16, S16, S16}, RayInvDir);
      auto MergedDir = B.buildMergeLikeInstr(
          V3S32,
          {B.buildBitcast(
                S32, B.buildMergeLikeInstr(V2S16, {UnmergeRayInvDir.getReg(0),
                                                   UnmergeRayDir.getReg(0)}))
               .getReg(0),
           B.buildBitcast(
                S32, B.buildMergeLikeInstr(V2S16, {UnmergeRayInvDir.getReg(1),
                                                   UnmergeRayDir.getReg(1)}))
               .getReg(0),
           B.buildBitcast(
                S32, B.buildMergeLikeInstr(V2S16, {UnmergeRayInvDir.getReg(2),
                                                   UnmergeRayDir.getReg(2)}))
               .getReg(0)});
      Ops.push_back(MergedDir.getReg(0));
    } else {
      packLanes(RayDir);
      packLanes(RayInvDir);
    }
  } else {
    if (Is64) {
      auto Unmerge = B.buildUnmerge({S32, S32}, NodePtr);
      Ops.push_back(Unmerge.getReg(0));
      Ops.push_back(Unmerge.getReg(1));
    } else {
      Ops.push_back(NodePtr);
    }
    Ops.push_back(RayExtent);

    auto packLanes = [&](Register Src) {
      auto Unmerge = B.buildUnmerge({S32, S32, S32}, Src);
      Ops.push_back(Unmerge.getReg(0));
      Ops.push_back(Unmerge.getReg(1));
      Ops.push_back(Unmerge.getReg(2));
    };

    packLanes(RayOrigin);

    if (IsA16) {
      auto UnmergeRayDir = B.buildUnmerge({S16, S16, S16}, RayDir);
      auto UnmergeRayInvDir = B.buildUnmerge({S16, S16, S16}, RayInvDir);

      B.buildMergeLikeInstr(R1,
                            {UnmergeRayDir.getReg(0), UnmergeRayDir.getReg(1)});
      B.buildMergeLikeInstr(
          R2, {UnmergeRayDir.getReg(2), UnmergeRayInvDir.getReg(0)});
      B.buildMergeLikeInstr(
          R3, {UnmergeRayInvDir.getReg(1), UnmergeRayInvDir.getReg(2)});
    } else {
      packLanes(RayDir);
      packLanes(RayInvDir);
    }
  }

  if (!UseNSA) {
    Register MergedOps = B.buildMergeLikeInstr(OpTy, Ops).getReg(0);
    Ops.clear();
    Ops.push_back(MergedOps);
  }

  auto MIB = B.buildInstr(AMDGPU::G_AMDGPU_BVH_INTERSECT_RAY)

      .addImm(IsA16 ? 1 : 0)

  MI.eraseFromParent();
  Register DstOrigin = MI.getOperand(1).getReg();

  Register NodePtr = MI.getOperand(4).getReg();
  Register RayExtent = MI.getOperand(5).getReg();
  Register InstanceMask = MI.getOperand(6).getReg();
  Register RayOrigin = MI.getOperand(7).getReg();

  Register Offsets = MI.getOperand(9).getReg();
  Register TDescr = MI.getOperand(10).getReg();

  if (!ST.hasBVHDualAndBVH8Insts()) {

        Fn, "intrinsic not supported on subtarget", MI.getDebugLoc()));

      Intrinsic::amdgcn_image_bvh8_intersect_ray;
  const unsigned NumVDataDwords = 10;
  const unsigned NumVAddrDwords = IsBVH8 ? 11 : 12;
      IsBVH8 ? AMDGPU::IMAGE_BVH8_INTERSECT_RAY
             : AMDGPU::IMAGE_BVH_DUAL_INTERSECT_RAY,
      AMDGPU::MIMGEncGfx12, NumVDataDwords, NumVAddrDwords);

  auto RayExtentInstanceMaskVec = B.buildMergeLikeInstr(
      V2S32, {RayExtent, B.buildAnyExt(S32, InstanceMask)});

  B.buildInstr(IsBVH8 ? AMDGPU::G_AMDGPU_BVH8_INTERSECT_RAY
                      : AMDGPU::G_AMDGPU_BVH_DUAL_INTERSECT_RAY)

      .addUse(RayExtentInstanceMaskVec.getReg(0))

  MI.eraseFromParent();
  B.buildInstr(AMDGPU::G_AMDGPU_WAVE_ADDRESS, {DstReg}, {StackPtr});
  MI.eraseFromParent();

  if (!ST.hasArchitectedSGPRs())
    return false;

  auto TTMP8 = B.buildCopy(S32, Register(AMDGPU::TTMP8));
  auto LSB = B.buildConstant(S32, 25);
  auto Width = B.buildConstant(S32, 5);
  B.buildUbfx(DstReg, TTMP8, LSB, Width);
  MI.eraseFromParent();

                                       unsigned Width) const {

  MRI.setRegClass(DstReg, &AMDGPU::SReg_32RegClass);
  B.buildInstr(AMDGPU::S_GETREG_B32_const)

  MI.eraseFromParent();

      B.buildIntrinsic(Intrinsic::amdgcn_s_getreg, {S32},

      B.buildIntrinsic(Intrinsic::amdgcn_s_getreg, {S32},

  B.buildMergeLikeInstr(Src, {ModeReg, TrapReg});
  MI.eraseFromParent();

  auto Unmerge = B.buildUnmerge({S32, S32}, MI.getOperand(0));

      .addReg(Unmerge.getReg(0));

      .addReg(Unmerge.getReg(1));
  MI.eraseFromParent();
  case Intrinsic::sponentry:

    B.buildInstr(AMDGPU::G_AMDGPU_SPONENTRY).addDef(TmpReg);

    B.buildIntToPtr(DstReg, TmpReg);
    MI.eraseFromParent();

    int FI = B.getMF().getFrameInfo().CreateFixedObject(

    B.buildFrameIndex(MI.getOperand(0), FI);
    MI.eraseFromParent();

  case Intrinsic::amdgcn_if:
  case Intrinsic::amdgcn_else: {

    bool Negated = false;

      std::swap(CondBrTarget, UncondBrTarget);

    B.setInsertPt(B.getMBB(), BrCond->getIterator());
    if (IntrID == Intrinsic::amdgcn_if) {
      B.buildInstr(AMDGPU::SI_IF)
          .addMBB(UncondBrTarget);
    } else {
      B.buildInstr(AMDGPU::SI_ELSE)
          .addMBB(UncondBrTarget);
    }

      B.buildBr(*CondBrTarget);

    MI.eraseFromParent();
    BrCond->eraseFromParent();

  case Intrinsic::amdgcn_loop: {

    bool Negated = false;

      std::swap(CondBrTarget, UncondBrTarget);

    B.setInsertPt(B.getMBB(), BrCond->getIterator());
    B.buildInstr(AMDGPU::SI_LOOP)
        .addMBB(UncondBrTarget);

      B.buildBr(*CondBrTarget);

    MI.eraseFromParent();
    BrCond->eraseFromParent();
  case Intrinsic::amdgcn_addrspacecast_nonnull:

  case Intrinsic::amdgcn_make_buffer_rsrc:

  case Intrinsic::amdgcn_kernarg_segment_ptr:

      B.buildConstant(MI.getOperand(0).getReg(), 0);
      MI.eraseFromParent();

  case Intrinsic::amdgcn_implicitarg_ptr:

  case Intrinsic::amdgcn_workitem_id_x:

  case Intrinsic::amdgcn_workitem_id_y:

  case Intrinsic::amdgcn_workitem_id_z:

  case Intrinsic::amdgcn_workgroup_id_x:

  case Intrinsic::amdgcn_workgroup_id_y:

  case Intrinsic::amdgcn_workgroup_id_z:

  case Intrinsic::amdgcn_cluster_id_x:
    return ST.hasClusters() &&

  case Intrinsic::amdgcn_cluster_id_y:
    return ST.hasClusters() &&

  case Intrinsic::amdgcn_cluster_id_z:
    return ST.hasClusters() &&

  case Intrinsic::amdgcn_cluster_workgroup_id_x:
    return ST.hasClusters() &&

  case Intrinsic::amdgcn_cluster_workgroup_id_y:
    return ST.hasClusters() &&

  case Intrinsic::amdgcn_cluster_workgroup_id_z:
    return ST.hasClusters() &&

  case Intrinsic::amdgcn_cluster_workgroup_flat_id:
    return ST.hasClusters() &&

  case Intrinsic::amdgcn_cluster_workgroup_max_id_x:
    return ST.hasClusters() &&

  case Intrinsic::amdgcn_cluster_workgroup_max_id_y:
    return ST.hasClusters() &&

  case Intrinsic::amdgcn_cluster_workgroup_max_id_z:
    return ST.hasClusters() &&

  case Intrinsic::amdgcn_cluster_workgroup_max_flat_id:
    return ST.hasClusters() &&

  case Intrinsic::amdgcn_wave_id:

  case Intrinsic::amdgcn_lds_kernel_id:

  case Intrinsic::amdgcn_dispatch_ptr:

  case Intrinsic::amdgcn_queue_ptr:

  case Intrinsic::amdgcn_implicit_buffer_ptr:

  case Intrinsic::amdgcn_dispatch_id:

  case Intrinsic::r600_read_ngroups_x:

  case Intrinsic::r600_read_ngroups_y:

  case Intrinsic::r600_read_ngroups_z:

  case Intrinsic::r600_read_local_size_x:

  case Intrinsic::r600_read_local_size_y:

  case Intrinsic::r600_read_local_size_z:

  case Intrinsic::amdgcn_fdiv_fast:

  case Intrinsic::amdgcn_is_shared:

  case Intrinsic::amdgcn_is_private:

  case Intrinsic::amdgcn_wavefrontsize: {
    B.buildConstant(MI.getOperand(0), ST.getWavefrontSize());
    MI.eraseFromParent();
    return true;
  }
  case Intrinsic::amdgcn_s_buffer_load:
    return legalizeSBufferLoad(Helper, MI);
  case Intrinsic::amdgcn_raw_buffer_store:
  case Intrinsic::amdgcn_raw_ptr_buffer_store:
  case Intrinsic::amdgcn_struct_buffer_store:
  case Intrinsic::amdgcn_struct_ptr_buffer_store:
    return legalizeBufferStore(MI, Helper, false, false);
  case Intrinsic::amdgcn_raw_buffer_store_format:
  case Intrinsic::amdgcn_raw_ptr_buffer_store_format:
  case Intrinsic::amdgcn_struct_buffer_store_format:
  case Intrinsic::amdgcn_struct_ptr_buffer_store_format:
    return legalizeBufferStore(MI, Helper, false, true);
  case Intrinsic::amdgcn_raw_tbuffer_store:
  case Intrinsic::amdgcn_raw_ptr_tbuffer_store:
  case Intrinsic::amdgcn_struct_tbuffer_store:
  case Intrinsic::amdgcn_struct_ptr_tbuffer_store:
    return legalizeBufferStore(MI, Helper, true, true);
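  // The two booleans passed to legalizeBufferStore are (IsTyped, IsFormat):
  // (false, false) for plain stores, (false, true) for the *_format variants,
  // and (true, true) for tbuffer stores, which carry an explicit data-format
  // immediate.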
  case Intrinsic::amdgcn_raw_buffer_load:
  case Intrinsic::amdgcn_raw_ptr_buffer_load:
  case Intrinsic::amdgcn_raw_atomic_buffer_load:
  case Intrinsic::amdgcn_raw_ptr_atomic_buffer_load:
  case Intrinsic::amdgcn_struct_buffer_load:
  case Intrinsic::amdgcn_struct_ptr_buffer_load:
  case Intrinsic::amdgcn_struct_atomic_buffer_load:
  case Intrinsic::amdgcn_struct_ptr_atomic_buffer_load:
    return legalizeBufferLoad(MI, Helper, false, false);
  case Intrinsic::amdgcn_raw_buffer_load_format:
  case Intrinsic::amdgcn_raw_ptr_buffer_load_format:
  case Intrinsic::amdgcn_struct_buffer_load_format:
  case Intrinsic::amdgcn_struct_ptr_buffer_load_format:
    return legalizeBufferLoad(MI, Helper, true, false);
  case Intrinsic::amdgcn_raw_tbuffer_load:
  case Intrinsic::amdgcn_raw_ptr_tbuffer_load:
  case Intrinsic::amdgcn_struct_tbuffer_load:
  case Intrinsic::amdgcn_struct_ptr_tbuffer_load:
    return legalizeBufferLoad(MI, Helper, true, true);
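  // Loads mirror the store dispatch, but note the helper's flipped parameter
  // order, legalizeBufferLoad(MI, Helper, IsFormat, IsTyped); the atomic
  // buffer-load variants intentionally share the plain-load path.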
  case Intrinsic::amdgcn_raw_buffer_atomic_swap:
  case Intrinsic::amdgcn_raw_ptr_buffer_atomic_swap:
  case Intrinsic::amdgcn_struct_buffer_atomic_swap:
  case Intrinsic::amdgcn_struct_ptr_buffer_atomic_swap:
  case Intrinsic::amdgcn_raw_buffer_atomic_add:
  case Intrinsic::amdgcn_raw_ptr_buffer_atomic_add:
  case Intrinsic::amdgcn_struct_buffer_atomic_add:
  case Intrinsic::amdgcn_struct_ptr_buffer_atomic_add:
  case Intrinsic::amdgcn_raw_buffer_atomic_sub:
  case Intrinsic::amdgcn_raw_ptr_buffer_atomic_sub:
  case Intrinsic::amdgcn_struct_buffer_atomic_sub:
  case Intrinsic::amdgcn_struct_ptr_buffer_atomic_sub:
  case Intrinsic::amdgcn_raw_buffer_atomic_smin:
  case Intrinsic::amdgcn_raw_ptr_buffer_atomic_smin:
  case Intrinsic::amdgcn_struct_buffer_atomic_smin:
  case Intrinsic::amdgcn_struct_ptr_buffer_atomic_smin:
  case Intrinsic::amdgcn_raw_buffer_atomic_umin:
  case Intrinsic::amdgcn_raw_ptr_buffer_atomic_umin:
  case Intrinsic::amdgcn_struct_buffer_atomic_umin:
  case Intrinsic::amdgcn_struct_ptr_buffer_atomic_umin:
  case Intrinsic::amdgcn_raw_buffer_atomic_smax:
  case Intrinsic::amdgcn_raw_ptr_buffer_atomic_smax:
  case Intrinsic::amdgcn_struct_buffer_atomic_smax:
  case Intrinsic::amdgcn_struct_ptr_buffer_atomic_smax:
  case Intrinsic::amdgcn_raw_buffer_atomic_umax:
  case Intrinsic::amdgcn_raw_ptr_buffer_atomic_umax:
  case Intrinsic::amdgcn_struct_buffer_atomic_umax:
  case Intrinsic::amdgcn_struct_ptr_buffer_atomic_umax:
  case Intrinsic::amdgcn_raw_buffer_atomic_and:
  case Intrinsic::amdgcn_raw_ptr_buffer_atomic_and:
  case Intrinsic::amdgcn_struct_buffer_atomic_and:
  case Intrinsic::amdgcn_struct_ptr_buffer_atomic_and:
  case Intrinsic::amdgcn_raw_buffer_atomic_or:
  case Intrinsic::amdgcn_raw_ptr_buffer_atomic_or:
  case Intrinsic::amdgcn_struct_buffer_atomic_or:
  case Intrinsic::amdgcn_struct_ptr_buffer_atomic_or:
  case Intrinsic::amdgcn_raw_buffer_atomic_xor:
  case Intrinsic::amdgcn_raw_ptr_buffer_atomic_xor:
  case Intrinsic::amdgcn_struct_buffer_atomic_xor:
  case Intrinsic::amdgcn_struct_ptr_buffer_atomic_xor:
  case Intrinsic::amdgcn_raw_buffer_atomic_inc:
  case Intrinsic::amdgcn_raw_ptr_buffer_atomic_inc:
  case Intrinsic::amdgcn_struct_buffer_atomic_inc:
  case Intrinsic::amdgcn_struct_ptr_buffer_atomic_inc:
  case Intrinsic::amdgcn_raw_buffer_atomic_dec:
  case Intrinsic::amdgcn_raw_ptr_buffer_atomic_dec:
  case Intrinsic::amdgcn_struct_buffer_atomic_dec:
  case Intrinsic::amdgcn_struct_ptr_buffer_atomic_dec:
  case Intrinsic::amdgcn_raw_buffer_atomic_cmpswap:
  case Intrinsic::amdgcn_raw_ptr_buffer_atomic_cmpswap:
  case Intrinsic::amdgcn_struct_buffer_atomic_cmpswap:
  case Intrinsic::amdgcn_struct_ptr_buffer_atomic_cmpswap:
  case Intrinsic::amdgcn_raw_buffer_atomic_fmin:
  case Intrinsic::amdgcn_raw_ptr_buffer_atomic_fmin:
  case Intrinsic::amdgcn_struct_buffer_atomic_fmin:
  case Intrinsic::amdgcn_struct_ptr_buffer_atomic_fmin:
  case Intrinsic::amdgcn_raw_buffer_atomic_fmax:
  case Intrinsic::amdgcn_raw_ptr_buffer_atomic_fmax:
  case Intrinsic::amdgcn_struct_buffer_atomic_fmax:
  case Intrinsic::amdgcn_struct_ptr_buffer_atomic_fmax:
  case Intrinsic::amdgcn_raw_buffer_atomic_sub_clamp_u32:
  case Intrinsic::amdgcn_raw_ptr_buffer_atomic_sub_clamp_u32:
  case Intrinsic::amdgcn_struct_buffer_atomic_sub_clamp_u32:
  case Intrinsic::amdgcn_struct_ptr_buffer_atomic_sub_clamp_u32:
  case Intrinsic::amdgcn_raw_buffer_atomic_cond_sub_u32:
  case Intrinsic::amdgcn_raw_ptr_buffer_atomic_cond_sub_u32:
  case Intrinsic::amdgcn_struct_buffer_atomic_cond_sub_u32:
  case Intrinsic::amdgcn_struct_ptr_buffer_atomic_cond_sub_u32:
  case Intrinsic::amdgcn_raw_buffer_atomic_fadd:
  case Intrinsic::amdgcn_raw_ptr_buffer_atomic_fadd:
  case Intrinsic::amdgcn_struct_buffer_atomic_fadd:
  case Intrinsic::amdgcn_struct_ptr_buffer_atomic_fadd:
    return legalizeBufferAtomic(MI, B, IntrID);
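  // Every buffer atomic flavor funnels into legalizeBufferAtomic, which keys
  // on IntrID to pick the matching G_AMDGPU_BUFFER_ATOMIC_* pseudo (see
  // getBufferAtomicPseudo), so the operand shuffling is written only once.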
  case Intrinsic::amdgcn_rsq_clamp:
    return legalizeRsqClampIntrinsic(MI, MRI, B);
  case Intrinsic::amdgcn_image_bvh_intersect_ray:
    return legalizeBVHIntersectRayIntrinsic(MI, B);
  case Intrinsic::amdgcn_image_bvh_dual_intersect_ray:
  case Intrinsic::amdgcn_image_bvh8_intersect_ray:
    return legalizeBVHDualOrBVH8IntersectRayIntrinsic(MI, B);
  case Intrinsic::amdgcn_swmmac_f32_16x16x128_fp8_fp8:
  case Intrinsic::amdgcn_swmmac_f32_16x16x128_fp8_bf8:
  case Intrinsic::amdgcn_swmmac_f32_16x16x128_bf8_fp8:
  case Intrinsic::amdgcn_swmmac_f32_16x16x128_bf8_bf8:
  case Intrinsic::amdgcn_swmmac_f16_16x16x128_fp8_fp8:
  case Intrinsic::amdgcn_swmmac_f16_16x16x128_fp8_bf8:
  case Intrinsic::amdgcn_swmmac_f16_16x16x128_bf8_fp8:
  case Intrinsic::amdgcn_swmmac_f16_16x16x128_bf8_bf8: {
    Register Index = MI.getOperand(5).getReg();
    LLT IndexArgTy = MRI.getType(Index);
    if (IndexArgTy != S64) {
      auto NewIndex = IndexArgTy.isVector() ? B.buildBitcast(S64, Index)
                                            : B.buildAnyExt(S64, Index);
      MI.getOperand(5).setReg(NewIndex.getReg(0));
    }
    return true;
  }
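  // Sketch of the index canonicalization (register names invented): the
  // sparsity-index operand must be a single s64, so
  //   %idx:_(<2 x s32>)  ->  %idx64:_(s64) = G_BITCAST %idx
  //   %idx:_(s32)        ->  %idx64:_(s64) = G_ANYEXT %idx
  // and operand 5 of the intrinsic is rewired to the canonical register.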
  case Intrinsic::amdgcn_swmmac_f16_16x16x32_f16:
  case Intrinsic::amdgcn_swmmac_bf16_16x16x32_bf16:
  case Intrinsic::amdgcn_swmmac_f32_16x16x32_bf16:
  case Intrinsic::amdgcn_swmmac_f32_16x16x32_f16:
  case Intrinsic::amdgcn_swmmac_f32_16x16x32_fp8_fp8:
  case Intrinsic::amdgcn_swmmac_f32_16x16x32_fp8_bf8:
  case Intrinsic::amdgcn_swmmac_f32_16x16x32_bf8_fp8:
  case Intrinsic::amdgcn_swmmac_f32_16x16x32_bf8_bf8: {
    Register Index = MI.getOperand(5).getReg();
    if (MRI.getType(Index) != S32)
      MI.getOperand(5).setReg(B.buildAnyExt(S32, Index).getReg(0));
    return true;
  }
  case Intrinsic::amdgcn_swmmac_f16_16x16x64_f16:
  case Intrinsic::amdgcn_swmmac_bf16_16x16x64_bf16:
  case Intrinsic::amdgcn_swmmac_f32_16x16x64_bf16:
  case Intrinsic::amdgcn_swmmac_bf16f32_16x16x64_bf16:
  case Intrinsic::amdgcn_swmmac_f32_16x16x64_f16:
  case Intrinsic::amdgcn_swmmac_i32_16x16x128_iu8:
  case Intrinsic::amdgcn_swmmac_i32_16x16x32_iu4:
  case Intrinsic::amdgcn_swmmac_i32_16x16x32_iu8:
  case Intrinsic::amdgcn_swmmac_i32_16x16x64_iu4: {
    Register Index = MI.getOperand(7).getReg();
    LLT IndexArgTy = MRI.getType(Index);
    LLT IdxTy =
        IntrID == Intrinsic::amdgcn_swmmac_i32_16x16x128_iu8 ? S64 : S32;
    if (IndexArgTy != IdxTy) {
      auto NewIndex = IndexArgTy.isVector() ? B.buildBitcast(IdxTy, Index)
                                            : B.buildAnyExt(IdxTy, Index);
      MI.getOperand(7).setReg(NewIndex.getReg(0));
    }
    return true;
  }
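  // The 16x16x128 iu8 form is the only one in this group needing an s64
  // index, presumably because its doubled K dimension carries twice the
  // sparsity selectors; everything else uses s32. The canonicalization is
  // the same as above, applied to operand 7.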
  case Intrinsic::amdgcn_fmed3: {
    GISelChangeObserver &Observer = Helper.Observer;
    Observer.changingInstr(MI);
    MI.setDesc(B.getTII().get(AMDGPU::G_AMDGPU_FMED3));
    MI.removeOperand(1);
    Observer.changedInstr(MI);
    return true;
  }
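  // amdgcn.fmed3 is mutated in place: the generic intrinsic is re-described
  // as the target pseudo G_AMDGPU_FMED3 and the now-redundant intrinsic-ID
  // operand is dropped, letting selection patterns match the pseudo directly.
  // The edit is bracketed by changingInstr/changedInstr so the legalizer's
  // observer sees the mutation.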
  case Intrinsic::amdgcn_readlane:
  case Intrinsic::amdgcn_writelane:
  case Intrinsic::amdgcn_readfirstlane:
  case Intrinsic::amdgcn_permlane16:
  case Intrinsic::amdgcn_permlanex16:
  case Intrinsic::amdgcn_permlane64:
  case Intrinsic::amdgcn_set_inactive:
  case Intrinsic::amdgcn_set_inactive_chain_arg:
  case Intrinsic::amdgcn_mov_dpp8:
  case Intrinsic::amdgcn_update_dpp:
  case Intrinsic::amdgcn_permlane_bcast:
  case Intrinsic::amdgcn_permlane_up:
  case Intrinsic::amdgcn_permlane_down:
  case Intrinsic::amdgcn_permlane_xor:
    return legalizeLaneOp(Helper, MI, IntrID);
  case Intrinsic::amdgcn_s_buffer_prefetch_data:
    return legalizeSBufferPrefetch(Helper, MI);
  case Intrinsic::amdgcn_dead: {
    // Replace the dead results with undef and drop the intrinsic.
    for (const MachineOperand &Def : MI.defs())
      B.buildUndef(Def);
    MI.eraseFromParent();
    return true;
  }
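  // All cross-lane operations share one path: legalizeLaneOp rewrites them
  // onto 32-bit pieces (splitting or bitcasting wider and smaller types as
  // needed) and reassembles the result afterwards.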
  case Intrinsic::amdgcn_cooperative_atomic_load_32x4B:
  case Intrinsic::amdgcn_cooperative_atomic_load_16x8B:
  case Intrinsic::amdgcn_cooperative_atomic_load_8x16B:
    assert(MI.hasOneMemOperand() && "Expected IRTranslator to set MemOp!");
    B.buildLoad(MI.getOperand(0), MI.getOperand(2), **MI.memoperands_begin());
    MI.eraseFromParent();
    return true;
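  // The cooperative atomic loads degrade to an ordinary G_LOAD; the
  // MachineMemOperand attached by the IRTranslator (asserted above) preserves
  // the access's size, alignment, and atomic ordering for later passes.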
  case Intrinsic::amdgcn_cooperative_atomic_store_32x4B:
  case Intrinsic::amdgcn_cooperative_atomic_store_16x8B:
  case Intrinsic::amdgcn_cooperative_atomic_store_8x16B:
    assert(MI.hasOneMemOperand() && "Expected IRTranslator to set MemOp!");
    B.buildStore(MI.getOperand(2), MI.getOperand(1), **MI.memoperands_begin());
    MI.eraseFromParent();
    return true;
  case Intrinsic::amdgcn_flat_load_monitor_b32:
  case Intrinsic::amdgcn_flat_load_monitor_b64:
  case Intrinsic::amdgcn_flat_load_monitor_b128:
    assert(MI.hasOneMemOperand() && "Expected IRTranslator to set MemOp!");
    B.buildInstr(AMDGPU::G_AMDGPU_FLAT_LOAD_MONITOR)
        .add(MI.getOperand(0))
        .add(MI.getOperand(2))
        .addMemOperand(*MI.memoperands_begin());
    MI.eraseFromParent();
    return true;
  case Intrinsic::amdgcn_global_load_monitor_b32:
  case Intrinsic::amdgcn_global_load_monitor_b64:
  case Intrinsic::amdgcn_global_load_monitor_b128:
    assert(MI.hasOneMemOperand() && "Expected IRTranslator to set MemOp!");
    B.buildInstr(AMDGPU::G_AMDGPU_GLOBAL_LOAD_MONITOR)
        .add(MI.getOperand(0))
        .add(MI.getOperand(2))
        .addMemOperand(*MI.memoperands_begin());
    MI.eraseFromParent();
    return true;
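  // Both monitor-load families keep their memory operand and merely swap the
  // generic intrinsic for a target pseudo (G_AMDGPU_FLAT_LOAD_MONITOR or
  // G_AMDGPU_GLOBAL_LOAD_MONITOR), deferring the real work to instruction
  // selection. The remaining cases of this switch, including the default
  // path that dispatches image intrinsics via getImageDimIntrinsicInfo to
  // legalizeImageIntrinsic, are elided in this excerpt.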