37#include "llvm/IR/IntrinsicsAMDGPU.h"
38#include "llvm/IR/IntrinsicsR600.h"
40#define DEBUG_TYPE "amdgpu-legalinfo"
50 "amdgpu-global-isel-new-legality",
// The description is written as two adjacent string literals, which the
// compiler concatenates with nothing in between. The first literal therefore
// ends in a space so the help text reads "... try to use rules ..." instead
// of "... try to userules ...".
51 cl::desc(
"Use GlobalISel desired legality, rather than try to use "
52 "rules compatible with selection patterns"),
67 unsigned Bits = Ty.getSizeInBits();
77 const LLT Ty = Query.Types[TypeIdx];
83 return Ty.getNumElements() % 2 != 0 &&
84 EltSize > 1 && EltSize < 32 &&
85 Ty.getSizeInBits() % 32 != 0;
91 const LLT Ty = Query.Types[TypeIdx];
98 const LLT Ty = Query.Types[TypeIdx];
100 return EltTy.
getSizeInBits() == 16 && Ty.getNumElements() > 2;
106 const LLT Ty = Query.Types[TypeIdx];
108 return std::pair(TypeIdx,
115 const LLT Ty = Query.Types[TypeIdx];
117 unsigned Size = Ty.getSizeInBits();
118 unsigned Pieces = (
Size + 63) / 64;
119 unsigned NewNumElts = (Ty.getNumElements() + 1) / Pieces;
129 const LLT Ty = Query.Types[TypeIdx];
132 const int Size = Ty.getSizeInBits();
134 const int NextMul32 = (
Size + 31) / 32;
138 const int NewNumElts = (32 * NextMul32 + EltSize - 1) / EltSize;
146 unsigned MemSize = Query.MMODescrs[0].MemoryTy.getSizeInBits();
147 return std::make_pair(TypeIdx,
LLT::scalar(MemSize));
154 const LLT Ty = Query.Types[TypeIdx];
156 const unsigned EltSize = Ty.getElementType().getSizeInBits();
159 assert(EltSize == 32 || EltSize == 64);
164 for (NewNumElts = NumElts; NewNumElts < MaxNumElts; ++NewNumElts) {
168 return std::pair(TypeIdx,
183 const unsigned NumElems = Ty.getElementCount().getFixedValue();
188 const unsigned Size = Ty.getSizeInBits();
201 const LLT Ty = Query.Types[TypeIdx];
208 const LLT Ty = Query.Types[TypeIdx];
209 unsigned Size = Ty.getSizeInBits();
218 const LLT QueryTy = Query.Types[TypeIdx];
225 const LLT QueryTy = Query.Types[TypeIdx];
232 const LLT QueryTy = Query.Types[TypeIdx];
238 return ((ST.useRealTrue16Insts() &&
Size == 16) ||
Size % 32 == 0) &&
244 return EltSize == 16 || EltSize % 32 == 0;
248 const int EltSize = Ty.getElementType().getSizeInBits();
249 return EltSize == 32 || EltSize == 64 ||
250 (EltSize == 16 && Ty.getNumElements() % 2 == 0) ||
251 EltSize == 128 || EltSize == 256;
280 LLT Ty = Query.Types[TypeIdx];
288 const LLT QueryTy = Query.Types[TypeIdx];
373 if (Ty.isPointerOrPointerVector())
374 Ty = Ty.changeElementType(
LLT::scalar(Ty.getScalarSizeInBits()));
378 (ST.useRealTrue16Insts() && Ty ==
S16) ||
393 const LLT Ty = Query.Types[TypeIdx];
394 return !Ty.
isVector() && Ty.getSizeInBits() > 32 &&
395 Query.MMODescrs[0].MemoryTy.getSizeInBits() < Ty.getSizeInBits();
403 unsigned MemSize = Query.MMODescrs[0].MemoryTy.getSizeInBits();
413 bool IsLoad,
bool IsAtomic) {
417 return ST.hasFlatScratchEnabled() ? 128 : 32;
419 return ST.useDS128() ? 128 : 64;
430 return IsLoad ? 512 : 128;
435 return ST.hasMultiDwordFlatScratchAddressing() || IsAtomic ? 128 : 32;
444 const bool IsLoad = Query.
Opcode != AMDGPU::G_STORE;
446 unsigned RegSize = Ty.getSizeInBits();
449 unsigned AS = Query.
Types[1].getAddressSpace();
456 if (Ty.isVector() && MemSize !=
RegSize)
463 if (IsLoad && MemSize <
Size)
464 MemSize = std::max(MemSize,
Align);
484 if (!ST.hasDwordx3LoadStores())
497 if (AlignBits < MemSize) {
500 Align(AlignBits / 8)))
530 const unsigned Size = Ty.getSizeInBits();
531 if (Ty.isPointerVector())
541 unsigned EltSize = Ty.getScalarSizeInBits();
542 return EltSize != 32 && EltSize != 64;
556 const unsigned Size = Ty.getSizeInBits();
557 if (
Size != MemSizeInBits)
558 return Size <= 32 && Ty.isVector();
564 return Ty.isVector() && (!MemTy.
isVector() || MemTy == Ty) &&
573 uint64_t AlignInBits,
unsigned AddrSpace,
583 if (SizeInBits == 96 && ST.hasDwordx3LoadStores())
594 if (AlignInBits < RoundedSize)
601 RoundedSize, AddrSpace,
Align(AlignInBits / 8),
613 Query.
Types[1].getAddressSpace(), Opcode);
633 const unsigned NumParts =
PointerTy.getSizeInBits() / 32;
637 std::array<Register, 4> VectorElems;
638 B.setInsertPt(
B.getMBB(), ++
B.getInsertPt());
639 for (
unsigned I = 0;
I < NumParts; ++
I)
641 B.buildExtractVectorElementConstant(
S32, VectorReg,
I).getReg(0);
642 B.buildMergeValues(MO, VectorElems);
647 B.setInsertPt(
B.getMBB(), ++
B.getInsertPt());
648 auto Scalar =
B.buildBitcast(ScalarTy, BitcastReg);
649 B.buildIntToPtr(MO, Scalar);
669 const unsigned NumParts =
PointerTy.getSizeInBits() / 32;
670 auto Unmerged =
B.buildUnmerge(
LLT::scalar(32), Pointer);
671 for (
unsigned I = 0;
I < NumParts; ++
I)
673 return B.buildBuildVector(VectorTy, PointerParts).getReg(0);
675 Register Scalar =
B.buildPtrToInt(ScalarTy, Pointer).getReg(0);
676 return B.buildBitcast(VectorTy, Scalar).getReg(0);
695 auto GetAddrSpacePtr = [&TM](
unsigned AS) {
708 const LLT BufferStridedPtr =
711 const LLT CodePtr = FlatPtr;
713 const std::initializer_list<LLT> AddrSpaces64 = {
714 GlobalPtr, ConstantPtr, FlatPtr
717 const std::initializer_list<LLT> AddrSpaces32 = {
718 LocalPtr, PrivatePtr, Constant32Ptr, RegionPtr
721 const std::initializer_list<LLT> AddrSpaces128 = {RsrcPtr};
723 const std::initializer_list<LLT> FPTypesBase = {
727 const std::initializer_list<LLT> FPTypes16 = {
731 const std::initializer_list<LLT> FPTypesPK16 = {
735 const std::initializer_list<LLT> FPTypesPK16_64 = {
S32,
S64,
S16,
V2S16,
738 const LLT MinScalarFPTy = ST.has16BitInsts() ?
S16 :
S32;
761 if (ST.hasVOP3PInsts() && ST.hasAddNoCarryInsts() && ST.hasIntClamp()) {
763 if (ST.hasPackedU64Ops()) {
766 .clampMaxNumElementsStrict(0,
S16, 2)
772 }
else if (ST.hasScalarAddSub64()) {
775 .clampMaxNumElementsStrict(0,
S16, 2)
783 .clampMaxNumElementsStrict(0,
S16, 2)
790 if (ST.hasScalarSMulU64()) {
793 .clampMaxNumElementsStrict(0,
S16, 2)
801 .clampMaxNumElementsStrict(0,
S16, 2)
811 .minScalarOrElt(0,
S16)
816 }
else if (ST.has16BitInsts()) {
850 .widenScalarToNextMultipleOf(0, 32)
860 if (ST.hasMad64_32())
865 if (ST.hasIntClamp()) {
888 {G_SDIV, G_UDIV, G_SREM, G_UREM, G_SDIVREM, G_UDIVREM})
898 if (ST.hasVOP3PInsts()) {
900 .clampMaxNumElements(0,
S8, 2)
921 {G_UADDO, G_USUBO, G_UADDE, G_SADDE, G_USUBE, G_SSUBE})
933 LocalPtr, ConstantPtr, PrivatePtr, FlatPtr })
940 .clampScalar(0,
S16,
S64);
975 { G_FADD, G_FMUL, G_FMA, G_FCANONICALIZE,
976 G_STRICT_FADD, G_STRICT_FMUL, G_STRICT_FMA})
983 if (ST.has16BitInsts()) {
984 if (ST.hasVOP3PInsts())
987 FPOpActions.legalFor({
S16});
989 TrigActions.customFor({
S16});
990 FDIVActions.customFor({
S16});
993 if (ST.hasPackedFP32Ops()) {
994 FPOpActions.legalFor({
V2S32});
995 FPOpActions.clampMaxNumElementsStrict(0,
S32, 2);
998 if (ST.hasPackedFP64Ops()) {
999 FPOpActions.legalFor({
V2S64});
1000 FPOpActions.clampMaxNumElementsStrict(0,
S64, 2);
// NOTE(review): this block repeats the ST.hasPackedFP64Ops() block directly
// above it: same predicate, same FPOpActions.legalFor({V2S64}) and same
// clampMaxNumElementsStrict(0, S64, 2). Confirm whether a different subtarget
// predicate or a different rule builder was intended here; if not, the
// duplicate can be dropped.
1003 if (ST.hasPackedFP64Ops()) {
1004 FPOpActions.legalFor({
V2S64});
1005 FPOpActions.clampMaxNumElementsStrict(0,
S64, 2);
1008 auto &MinNumMaxNumIeee =
1011 if (ST.hasVOP3PInsts()) {
1012 MinNumMaxNumIeee.legalFor(FPTypesPK16)
1014 .clampMaxNumElements(0,
S16, 2)
1015 .clampScalar(0,
S16,
S64)
1017 }
else if (ST.has16BitInsts()) {
1018 MinNumMaxNumIeee.legalFor(FPTypes16).clampScalar(0,
S16,
S64).scalarize(0);
1020 MinNumMaxNumIeee.legalFor(FPTypesBase)
1021 .clampScalar(0,
S32,
S64)
1026 {G_FMINNUM, G_FMAXNUM, G_FMINIMUMNUM, G_FMAXIMUMNUM});
1028 if (ST.hasPackedFP64Ops()) {
1029 MinNumMaxNum.customFor(FPTypesPK16_64)
1031 .clampMaxNumElements(0,
S16, 2)
1032 .clampMaxNumElements(0,
S64, 2)
1033 .clampScalar(0,
S16,
S64)
1035 }
else if (ST.hasVOP3PInsts()) {
1036 MinNumMaxNum.customFor(FPTypesPK16)
1038 .clampMaxNumElements(0,
S16, 2)
1039 .clampScalar(0,
S16,
S64)
1041 }
else if (ST.has16BitInsts()) {
1042 MinNumMaxNum.customFor(FPTypes16)
1043 .clampScalar(0,
S16,
S64)
1046 MinNumMaxNum.customFor(FPTypesBase)
1047 .clampScalar(0,
S32,
S64)
1051 if (ST.hasVOP3PInsts())
1068 .
legalFor(ST.hasPackedFP32Ops(), {V2S32})
1070 if (ST.hasPackedFP32Ops())
1074 if (ST.has16BitInsts()) {
1108 if (ST.hasFractBug()) {
1142 if (ST.hasCvtPkF16F32Inst()) {
1144 .clampMaxNumElements(0,
S16, 2);
1148 FPTruncActions.scalarize(0).lower();
1156 if (ST.has16BitInsts()) {
1170 if (ST.hasPackedFP32Ops())
1180 if (ST.hasMadF16() && ST.hasMadMacF32Insts())
1181 FMad.customFor({
S32,
S16});
1182 else if (ST.hasMadMacF32Insts())
1183 FMad.customFor({
S32});
1184 else if (ST.hasMadF16())
1185 FMad.customFor({
S16});
1190 if (ST.has16BitInsts()) {
1193 FRem.minScalar(0,
S32)
1202 .clampMaxNumElements(0,
S16, 2)
1222 if (ST.has16BitInsts())
1234 if (ST.has16BitInsts())
1247 .legalFor(ST.has16BitInsts(), {{S16, S16}})
1248 .legalFor(ST.hasVCvtPkIU16F32(), {{V2S16, V2S32}})
1252 if (
ST.has16BitInsts())
1255 if (
ST.hasVCvtPkIU16F32())
1265 getActionDefinitionsBuilder({G_LROUND, G_LLROUND})
1266 .clampScalar(0,
S16,
S64)
1270 getActionDefinitionsBuilder(G_INTRINSIC_FPTRUNC_ROUND)
1276 getActionDefinitionsBuilder({G_INTRINSIC_ROUND, G_FRINT, G_FNEARBYINT})
1280 getActionDefinitionsBuilder({G_INTRINSIC_LRINT, G_INTRINSIC_LLRINT})
1281 .clampScalar(0,
S16,
S64)
1285 if (
ST.has16BitInsts()) {
1286 getActionDefinitionsBuilder(
1287 {G_INTRINSIC_TRUNC, G_FCEIL, G_INTRINSIC_ROUNDEVEN})
1289 .clampScalar(0,
S16,
S64)
1292 getActionDefinitionsBuilder(
1293 {G_INTRINSIC_TRUNC, G_FCEIL, G_INTRINSIC_ROUNDEVEN})
1295 .clampScalar(0,
S32,
S64)
1298 getActionDefinitionsBuilder(
1299 {G_INTRINSIC_TRUNC, G_FCEIL, G_INTRINSIC_ROUNDEVEN})
1302 .clampScalar(0,
S32,
S64)
1306 getActionDefinitionsBuilder(G_PTR_ADD)
1312 getActionDefinitionsBuilder(G_PTRMASK)
1314 .scalarSameSizeAs(1, 0)
1318 getActionDefinitionsBuilder(G_ICMP)
1330 {
S1}, {
S32,
S64, GlobalPtr, LocalPtr, ConstantPtr, PrivatePtr, FlatPtr})
1331 .legalForCartesianProduct(
1332 {
S32}, {
S32,
S64, GlobalPtr, LocalPtr, ConstantPtr, PrivatePtr, FlatPtr});
1333 if (
ST.has16BitInsts()) {
1334 CmpBuilder.legalFor({{
S1,
S16}});
1345 {
S1},
ST.has16BitInsts() ? FPTypes16 : FPTypesBase);
1347 if (
ST.hasSALUFloatInsts())
1356 auto &ExpOps = getActionDefinitionsBuilder(G_FPOW);
1357 if (
ST.has16BitInsts())
1358 ExpOps.customFor({{
S32}, {
S16}});
1360 ExpOps.customFor({
S32});
1361 ExpOps.clampScalar(0, MinScalarFPTy,
S32)
1364 getActionDefinitionsBuilder(G_FPOWI)
1365 .clampScalar(0, MinScalarFPTy,
S32)
1368 getActionDefinitionsBuilder(G_FLOG2)
1369 .legalFor(
ST.has16BitInsts(), {S16})
1374 getActionDefinitionsBuilder(G_FEXP2)
1375 .legalFor(
ST.has16BitInsts(), {S16})
1381 getActionDefinitionsBuilder({G_FLOG, G_FLOG10, G_FEXP, G_FEXP10});
1383 LogOps.clampScalar(0, MinScalarFPTy,
S32)
1387 getActionDefinitionsBuilder(G_CTPOP)
1389 .clampScalar(0,
S32,
S32)
1390 .widenScalarToNextPow2(1, 32)
1391 .clampScalar(1,
S32,
S64)
1393 .widenScalarToNextPow2(0, 32);
1396 if (
ST.has16BitInsts())
1397 getActionDefinitionsBuilder(G_IS_FPCLASS)
1398 .legalForCartesianProduct({
S1}, FPTypes16)
1399 .widenScalarToNextPow2(1)
1403 getActionDefinitionsBuilder(G_IS_FPCLASS)
1404 .legalForCartesianProduct({
S1}, FPTypesBase)
1405 .lowerFor({
S1,
S16})
1406 .widenScalarToNextPow2(1)
1413 getActionDefinitionsBuilder({G_CTLZ, G_CTTZ})
1415 .clampScalar(0,
S32,
S32)
1416 .clampScalar(1,
S32,
S64)
1417 .widenScalarToNextPow2(0, 32)
1418 .widenScalarToNextPow2(1, 32)
1422 getActionDefinitionsBuilder(G_CTLZ_ZERO_POISON)
1425 .clampScalar(0,
S32,
S32)
1426 .clampScalar(1,
S32,
S64)
1428 .widenScalarToNextPow2(0, 32)
1429 .widenScalarToNextPow2(1, 32);
1431 getActionDefinitionsBuilder(G_CTTZ_ZERO_POISON)
1433 .clampScalar(0,
S32,
S32)
1434 .clampScalar(1,
S32,
S64)
1436 .widenScalarToNextPow2(0, 32)
1437 .widenScalarToNextPow2(1, 32);
1439 getActionDefinitionsBuilder(G_CTLS)
1442 .clampScalar(0,
S32,
S32)
1443 .clampScalar(1,
S32,
S32);
1447 getActionDefinitionsBuilder(G_BITREVERSE)
1449 .clampScalar(0,
S32,
S64)
1451 .widenScalarToNextPow2(0);
1453 if (
ST.has16BitInsts()) {
1454 getActionDefinitionsBuilder(G_BSWAP)
1456 .clampMaxNumElementsStrict(0,
S16, 2)
1459 .widenScalarToNextPow2(0)
1460 .clampScalar(0,
S16,
S32)
1463 if (
ST.hasVOP3PInsts()) {
1464 getActionDefinitionsBuilder(G_ABS)
1466 .clampMaxNumElements(0,
S16, 2)
1468 .widenScalarToNextPow2(0)
1471 if (
ST.hasMinMaxI64Insts()) {
1472 getActionDefinitionsBuilder({G_SMIN, G_SMAX, G_UMIN, G_UMAX})
1474 .clampMaxNumElements(0,
S16, 2)
1476 .widenScalarToNextPow2(0)
1480 getActionDefinitionsBuilder({G_SMIN, G_SMAX, G_UMIN, G_UMAX})
1482 .clampMaxNumElements(0,
S16, 2)
1484 .widenScalarToNextPow2(0)
1489 getActionDefinitionsBuilder({G_SMIN, G_SMAX, G_UMIN, G_UMAX, G_ABS})
1491 .widenScalarToNextPow2(0)
1498 getActionDefinitionsBuilder(G_BSWAP)
1503 .widenScalarToNextPow2(0)
1508 getActionDefinitionsBuilder({G_SMIN, G_SMAX, G_UMIN, G_UMAX, G_ABS})
1511 .widenScalarToNextPow2(0)
1516 getActionDefinitionsBuilder(G_INTTOPTR)
1518 .legalForCartesianProduct(AddrSpaces64, {
S64})
1519 .legalForCartesianProduct(AddrSpaces32, {
S32})
1532 getActionDefinitionsBuilder(G_PTRTOINT)
1534 .legalForCartesianProduct(AddrSpaces64, {
S64})
1535 .legalForCartesianProduct(AddrSpaces32, {
S32})
1548 getActionDefinitionsBuilder(G_ADDRSPACE_CAST)
1552 const auto needToSplitMemOp = [=](
const LegalityQuery &Query,
1553 bool IsLoad) ->
bool {
1557 unsigned MemSize = Query.
MMODescrs[0].MemoryTy.getSizeInBits();
1571 unsigned NumRegs = (MemSize + 31) / 32;
1573 if (!
ST.hasDwordx3LoadStores())
1584 unsigned GlobalAlign32 =
ST.hasUnalignedBufferAccessEnabled() ? 0 : 32;
1585 unsigned GlobalAlign16 =
ST.hasUnalignedBufferAccessEnabled() ? 0 : 16;
1586 unsigned GlobalAlign8 =
ST.hasUnalignedBufferAccessEnabled() ? 0 : 8;
1592 for (
unsigned Op : {G_LOAD, G_STORE}) {
1593 const bool IsStore =
Op == G_STORE;
1595 auto &Actions = getActionDefinitionsBuilder(
Op);
1598 Actions.legalForTypesWithMemDesc({{
S32, GlobalPtr,
S32, GlobalAlign32},
1601 {
S64, GlobalPtr,
S64, GlobalAlign32},
1604 {
S32, GlobalPtr,
S8, GlobalAlign8},
1605 {
S32, GlobalPtr,
S16, GlobalAlign16},
1607 {
S32, LocalPtr,
S32, 32},
1608 {
S64, LocalPtr,
S64, 32},
1610 {
S32, LocalPtr,
S8, 8},
1611 {
S32, LocalPtr,
S16, 16},
1614 {
S32, PrivatePtr,
S32, 32},
1615 {
S32, PrivatePtr,
S8, 8},
1616 {
S32, PrivatePtr,
S16, 16},
1619 {
S32, ConstantPtr,
S32, GlobalAlign32},
1622 {
S64, ConstantPtr,
S64, GlobalAlign32},
1623 {
V2S32, ConstantPtr,
V2S32, GlobalAlign32}});
1625 Actions.legalForTypesWithMemDesc(
ST.useRealTrue16Insts(),
1626 {{S16, GlobalPtr, S8, GlobalAlign8},
1627 {S16, GlobalPtr, S16, GlobalAlign16},
1628 {S16, LocalPtr, S8, 8},
1629 {S16, LocalPtr, S16, 16},
1630 {S16, PrivatePtr, S8, 8},
1631 {S16, PrivatePtr, S16, 16}});
1641 Actions.unsupportedIf(
1642 typeInSet(1, {BufferFatPtr, BufferStridedPtr, RsrcPtr}));
1656 Actions.customIf(
typeIs(1, Constant32Ptr));
1682 return !Query.
Types[0].isVector() &&
1683 needToSplitMemOp(Query,
Op == G_LOAD);
1685 [=](
const LegalityQuery &Query) -> std::pair<unsigned, LLT> {
1690 unsigned MemSize = Query.
MMODescrs[0].MemoryTy.getSizeInBits();
1693 if (DstSize > MemSize)
1699 if (MemSize > MaxSize)
1707 return Query.
Types[0].isVector() &&
1708 needToSplitMemOp(Query,
Op == G_LOAD);
1710 [=](
const LegalityQuery &Query) -> std::pair<unsigned, LLT> {
1724 unsigned MemSize = Query.
MMODescrs[0].MemoryTy.getSizeInBits();
1725 if (MemSize > MaxSize) {
1729 if (MaxSize % EltSize == 0) {
1735 unsigned NumPieces = MemSize / MaxSize;
1739 if (NumPieces == 1 || NumPieces >= NumElts ||
1740 NumElts % NumPieces != 0)
1741 return std::pair(0, EltTy);
1749 return std::pair(0, EltTy);
1764 return std::pair(0, EltTy);
1769 .widenScalarToNextPow2(0)
1776 getActionDefinitionsBuilder({G_SEXTLOAD, G_ZEXTLOAD})
1777 .legalForTypesWithMemDesc({{
S32, GlobalPtr,
S8, 8},
1778 {
S32, GlobalPtr,
S16, 2 * 8},
1779 {
S32, LocalPtr,
S8, 8},
1780 {
S32, LocalPtr,
S16, 16},
1781 {
S32, PrivatePtr,
S8, 8},
1782 {
S32, PrivatePtr,
S16, 16},
1783 {
S32, ConstantPtr,
S8, 8},
1784 {
S32, ConstantPtr,
S16, 2 * 8}})
1785 .legalForTypesWithMemDesc(
ST.useRealTrue16Insts(),
1786 {{S16, GlobalPtr, S8, GlobalAlign8},
1787 {S16, LocalPtr, S8, GlobalAlign8},
1788 {S16, PrivatePtr, S8, GlobalAlign8},
1789 {S16, ConstantPtr, S8, GlobalAlign8}})
1794 if (
ST.hasFlatAddressSpace()) {
1795 ExtLoads.legalForTypesWithMemDesc(
1796 {{
S32, FlatPtr,
S8, 8}, {
S32, FlatPtr,
S16, 16}});
1798 ExtLoads.legalForTypesWithMemDesc(
ST.useRealTrue16Insts(),
1799 {{S16, FlatPtr, S8, GlobalAlign8}});
1807 ExtLoads.customIf(
typeIs(1, Constant32Ptr));
1809 ExtLoads.narrowScalarIf(
1816 ExtLoads.clampScalar(0,
S32,
S32)
1817 .widenScalarToNextPow2(0)
1820 auto &Atomics = getActionDefinitionsBuilder(
1821 {G_ATOMICRMW_XCHG, G_ATOMICRMW_ADD, G_ATOMICRMW_SUB,
1822 G_ATOMICRMW_AND, G_ATOMICRMW_OR, G_ATOMICRMW_XOR,
1823 G_ATOMICRMW_MAX, G_ATOMICRMW_MIN, G_ATOMICRMW_UMAX,
1824 G_ATOMICRMW_UMIN, G_ATOMICRMW_UINC_WRAP, G_ATOMICRMW_UDEC_WRAP})
1825 .legalFor({{
S32, GlobalPtr}, {
S32, LocalPtr},
1826 {
S64, GlobalPtr}, {
S64, LocalPtr},
1827 {
S32, RegionPtr}, {
S64, RegionPtr}});
1828 if (
ST.hasFlatAddressSpace()) {
1829 Atomics.legalFor({{
S32, FlatPtr}, {
S64, FlatPtr}});
1833 getActionDefinitionsBuilder({G_ATOMICRMW_USUB_COND, G_ATOMICRMW_USUB_SAT})
1834 .legalFor({{
S32, GlobalPtr}, {
S32, LocalPtr}, {
S32, RegionPtr}});
1835 if (
ST.hasFlatAddressSpace()) {
1836 Atomics32.legalFor({{
S32, FlatPtr}});
1840 auto &Atomic = getActionDefinitionsBuilder(G_ATOMICRMW_FADD);
1841 if (
ST.hasLDSFPAtomicAddF32()) {
1842 Atomic.legalFor({{
S32, LocalPtr}, {
S32, RegionPtr}});
1843 if (
ST.hasLdsAtomicAddF64())
1844 Atomic.legalFor({{
S64, LocalPtr}});
1845 if (
ST.hasAtomicDsPkAdd16Insts())
1846 Atomic.legalFor({{
V2F16, LocalPtr}, {
V2BF16, LocalPtr}});
1848 if (
ST.hasAtomicFaddInsts())
1849 Atomic.legalFor({{
S32, GlobalPtr}});
1850 if (
ST.hasFlatAtomicFaddF32Inst())
1851 Atomic.legalFor({{
S32, FlatPtr}});
1853 if (
ST.hasGFX90AInsts() ||
ST.hasGFX1250Insts()) {
1864 if (
ST.hasAtomicBufferGlobalPkAddF16NoRtnInsts() ||
1865 ST.hasAtomicBufferGlobalPkAddF16Insts())
1866 Atomic.legalFor({{
V2F16, GlobalPtr}, {
V2F16, BufferFatPtr}});
1867 if (
ST.hasAtomicGlobalPkAddBF16Inst())
1868 Atomic.legalFor({{
V2BF16, GlobalPtr}});
1869 if (
ST.hasAtomicFlatPkAdd16Insts())
1870 Atomic.legalFor({{
V2F16, FlatPtr}, {
V2BF16, FlatPtr}});
1875 auto &AtomicFMinFMax =
1876 getActionDefinitionsBuilder({G_ATOMICRMW_FMIN, G_ATOMICRMW_FMAX})
1877 .legalFor({{
F32, LocalPtr}, {
F64, LocalPtr}});
1879 if (
ST.hasAtomicFMinFMaxF32GlobalInsts())
1880 AtomicFMinFMax.legalFor({{
F32, GlobalPtr},{
F32, BufferFatPtr}});
1881 if (
ST.hasAtomicFMinFMaxF64GlobalInsts())
1882 AtomicFMinFMax.legalFor({{
F64, GlobalPtr}, {
F64, BufferFatPtr}});
// Flat-address-space FP min/max atomics. Each entry must be a
// {value type, pointer type} pair, like the LocalPtr/GlobalPtr/BufferFatPtr
// entries registered on this builder above. With a single pair of braces the
// argument is a flat list of LLTs for type index 0 only, which would register
// FlatPtr itself as a legal value type and make F32/F64 legal regardless of
// the pointer's address space.
1883 if (
ST.hasAtomicFMinFMaxF32FlatInsts())
1884 AtomicFMinFMax.legalFor({{
F32, FlatPtr}});
1885 if (
ST.hasAtomicFMinFMaxF64FlatInsts())
1886 AtomicFMinFMax.legalFor({{
F64, FlatPtr}});
1890 getActionDefinitionsBuilder(G_ATOMIC_CMPXCHG)
1891 .customFor({{
S32, GlobalPtr}, {
S64, GlobalPtr},
1892 {
S32, FlatPtr}, {
S64, FlatPtr}})
1893 .legalFor({{
S32, LocalPtr}, {
S64, LocalPtr},
1894 {
S32, RegionPtr}, {
S64, RegionPtr}});
1898 getActionDefinitionsBuilder(G_SELECT)
1900 LocalPtr, FlatPtr, PrivatePtr,
1904 .clampScalar(0,
S16,
S64)
1908 .clampMaxNumElements(0,
S32, 2)
1909 .clampMaxNumElements(0, LocalPtr, 2)
1910 .clampMaxNumElements(0, PrivatePtr, 2)
1912 .widenScalarToNextPow2(0)
1917 auto &Shifts = getActionDefinitionsBuilder({G_SHL, G_LSHR, G_ASHR})
1919 if (
ST.has16BitInsts()) {
1920 if (
ST.hasVOP3PInsts()) {
1922 .clampMaxNumElements(0,
S16, 2);
1924 Shifts.legalFor({{
S16,
S16}});
1927 Shifts.widenScalarIf(
1932 const LLT AmountTy = Query.
Types[1];
1937 Shifts.clampScalar(1,
S32,
S32);
1938 Shifts.widenScalarToNextPow2(0, 16);
1939 Shifts.clampScalar(0,
S16,
S64);
1941 getActionDefinitionsBuilder({G_SSHLSAT, G_USHLSAT})
1949 Shifts.clampScalar(1,
S32,
S32);
1950 Shifts.widenScalarToNextPow2(0, 32);
1951 Shifts.clampScalar(0,
S32,
S64);
1953 getActionDefinitionsBuilder({G_SSHLSAT, G_USHLSAT})
1958 Shifts.scalarize(0);
1960 for (
unsigned Op : {G_EXTRACT_VECTOR_ELT, G_INSERT_VECTOR_ELT}) {
1961 unsigned VecTypeIdx =
Op == G_EXTRACT_VECTOR_ELT ? 1 : 0;
1962 unsigned EltTypeIdx =
Op == G_EXTRACT_VECTOR_ELT ? 0 : 1;
1963 unsigned IdxTypeIdx = 2;
1965 getActionDefinitionsBuilder(
Op)
1967 const LLT EltTy = Query.
Types[EltTypeIdx];
1968 const LLT VecTy = Query.
Types[VecTypeIdx];
1969 const LLT IdxTy = Query.
Types[IdxTypeIdx];
1971 const bool isLegalVecType =
1981 return (EltSize == 32 || EltSize == 64) &&
1997 const LLT EltTy = Query.
Types[EltTypeIdx];
1998 const LLT VecTy = Query.
Types[VecTypeIdx];
2002 const unsigned TargetEltSize =
2003 DstEltSize % 64 == 0 ? 64 : 32;
2004 return std::pair(VecTypeIdx,
2008 .clampScalar(EltTypeIdx,
S32,
S64)
2009 .clampScalar(VecTypeIdx,
S32,
S64)
2010 .clampScalar(IdxTypeIdx,
S32,
S32)
2011 .clampMaxNumElements(VecTypeIdx,
S32, 32)
2020 getActionDefinitionsBuilder(G_EXTRACT_VECTOR_ELT)
2022 const LLT &EltTy = Query.
Types[1].getElementType();
2023 return Query.
Types[0] != EltTy;
2026 for (
unsigned Op : {G_EXTRACT, G_INSERT}) {
2027 unsigned BigTyIdx =
Op == G_EXTRACT ? 1 : 0;
2028 unsigned LitTyIdx =
Op == G_EXTRACT ? 0 : 1;
2029 getActionDefinitionsBuilder(
Op)
2032 const LLT BigTy = Query.
Types[BigTyIdx];
2038 const LLT LitTy = Query.
Types[LitTyIdx];
2043 .widenScalarToNextPow2(BigTyIdx, 32)
2051 const LLT BigTy = Query.
Types[BigTyIdx];
2052 const LLT LitTy = Query.
Types[LitTyIdx];
2060 getActionDefinitionsBuilder(G_BUILD_VECTOR)
2069 if (
ST.hasScalarPackInsts()) {
2072 .minScalarOrElt(0,
S16)
2075 getActionDefinitionsBuilder(G_BUILD_VECTOR_TRUNC)
2079 BuildVector.customFor({
V2S16,
S16});
2080 BuildVector.minScalarOrElt(0,
S32);
2082 getActionDefinitionsBuilder(G_BUILD_VECTOR_TRUNC)
2090 getActionDefinitionsBuilder(G_CONCAT_VECTORS)
2092 .clampMaxNumElements(0,
S32, 32)
2093 .clampMaxNumElements(1,
S16, 2)
2094 .clampMaxNumElements(0,
S16, 64);
2096 getActionDefinitionsBuilder(G_SHUFFLE_VECTOR).lower();
2099 for (
unsigned Op : {G_MERGE_VALUES, G_UNMERGE_VALUES}) {
2100 unsigned BigTyIdx =
Op == G_MERGE_VALUES ? 0 : 1;
2101 unsigned LitTyIdx =
Op == G_MERGE_VALUES ? 1 : 0;
2103 auto notValidElt = [=](
const LegalityQuery &Query,
unsigned TypeIdx) {
2104 const LLT Ty = Query.
Types[TypeIdx];
2116 getActionDefinitionsBuilder(
Op)
2120 const LLT BigTy = Query.
Types[BigTyIdx];
2126 .widenScalarToNextPow2(LitTyIdx, 16)
2135 .clampScalar(LitTyIdx,
S32,
S512)
2136 .widenScalarToNextPow2(LitTyIdx, 32)
2140 return notValidElt(Query, LitTyIdx);
2145 return notValidElt(Query, BigTyIdx);
2150 if (
Op == G_MERGE_VALUES) {
2151 Builder.widenScalarIf(
2154 const LLT Ty = Query.
Types[LitTyIdx];
2160 Builder.widenScalarIf(
2162 const LLT Ty = Query.
Types[BigTyIdx];
2168 const LLT &Ty = Query.
Types[BigTyIdx];
2170 if (NewSizeInBits >= 256) {
2172 if (RoundedTo < NewSizeInBits)
2173 NewSizeInBits = RoundedTo;
2175 return std::pair(BigTyIdx,
LLT::scalar(NewSizeInBits));
2184 auto &SextInReg = getActionDefinitionsBuilder(G_SEXT_INREG)
2185 .legalFor({{
S32}, {
S64}})
2186 .clampScalar(0,
S32,
S64);
2188 if (
ST.hasVOP3PInsts()) {
2189 SextInReg.lowerFor({{
V2S16}})
2193 .clampMaxNumElementsStrict(0,
S16, 2);
2194 }
else if (
ST.has16BitInsts()) {
2195 SextInReg.lowerFor({{
S32}, {
S64}, {
S16}});
2199 SextInReg.lowerFor({{
S32}, {
S64}});
2204 .clampScalar(0,
S32,
S64)
2207 getActionDefinitionsBuilder({G_ROTR, G_ROTL})
2211 auto &FSHRActionDefs = getActionDefinitionsBuilder(G_FSHR);
2212 FSHRActionDefs.legalFor({{
S32,
S32}})
2213 .clampMaxNumElementsStrict(0,
S16, 2);
2214 if (
ST.hasVOP3PInsts())
2216 FSHRActionDefs.scalarize(0).lower();
2218 if (
ST.hasVOP3PInsts()) {
2219 getActionDefinitionsBuilder(G_FSHL)
2221 .clampMaxNumElementsStrict(0,
S16, 2)
2225 getActionDefinitionsBuilder(G_FSHL)
2230 getActionDefinitionsBuilder(G_READCYCLECOUNTER)
2233 getActionDefinitionsBuilder(G_READSTEADYCOUNTER).legalFor({
S64});
2235 getActionDefinitionsBuilder(G_FENCE)
2238 getActionDefinitionsBuilder({G_SMULO, G_UMULO})
2243 getActionDefinitionsBuilder({G_SBFX, G_UBFX})
2245 .clampScalar(1,
S32,
S32)
2246 .clampScalar(0,
S32,
S64)
2247 .widenScalarToNextPow2(0)
2250 getActionDefinitionsBuilder(
2254 G_ATOMIC_CMPXCHG_WITH_SUCCESS, G_ATOMICRMW_NAND, G_ATOMICRMW_FSUB,
2255 G_READ_REGISTER, G_WRITE_REGISTER,
2260 if (
ST.hasIEEEMinimumMaximumInsts()) {
2261 getActionDefinitionsBuilder({G_FMINIMUM, G_FMAXIMUM})
2262 .legalFor(FPTypesPK16)
2263 .clampMaxNumElements(0,
S16, 2)
2265 }
else if (
ST.hasVOP3PInsts()) {
2266 getActionDefinitionsBuilder({G_FMINIMUM, G_FMAXIMUM})
2268 .clampMaxNumElementsStrict(0,
S16, 2)
2272 getActionDefinitionsBuilder({G_FMINIMUM, G_FMAXIMUM})
2274 .clampScalar(0,
S32,
S64)
2278 getActionDefinitionsBuilder(
2279 {G_MEMCPY, G_MEMCPY_INLINE, G_MEMMOVE, G_MEMSET, G_MEMSET_INLINE})
2282 getActionDefinitionsBuilder({G_TRAP, G_DEBUGTRAP}).custom();
2284 getActionDefinitionsBuilder({G_VASTART, G_VAARG, G_BRJT, G_JUMP_TABLE,
2285 G_INDEXED_LOAD, G_INDEXED_SEXTLOAD,
2286 G_INDEXED_ZEXTLOAD, G_INDEXED_STORE})
2289 getActionDefinitionsBuilder(G_PREFETCH).alwaysLegal();
2291 getActionDefinitionsBuilder(
2292 {G_VECREDUCE_SMIN, G_VECREDUCE_SMAX, G_VECREDUCE_UMIN, G_VECREDUCE_UMAX,
2293 G_VECREDUCE_ADD, G_VECREDUCE_MUL, G_VECREDUCE_FMUL, G_VECREDUCE_FMIN,
2294 G_VECREDUCE_FMAX, G_VECREDUCE_FMINIMUM, G_VECREDUCE_FMAXIMUM,
2295 G_VECREDUCE_OR, G_VECREDUCE_AND, G_VECREDUCE_XOR})
2300 getActionDefinitionsBuilder({G_INTRINSIC, G_INTRINSIC_W_SIDE_EFFECTS,
2301 G_INTRINSIC_CONVERGENT,
2302 G_INTRINSIC_CONVERGENT_W_SIDE_EFFECTS})
2305 getLegacyLegalizerInfo().computeTables();
2315 switch (
MI.getOpcode()) {
2316 case TargetOpcode::G_ADDRSPACE_CAST:
2318 case TargetOpcode::G_INTRINSIC_ROUNDEVEN:
2320 case TargetOpcode::G_FCEIL:
2322 case TargetOpcode::G_FREM:
2324 case TargetOpcode::G_INTRINSIC_TRUNC:
2326 case TargetOpcode::G_SITOFP:
2328 case TargetOpcode::G_UITOFP:
2330 case TargetOpcode::G_FPTOSI:
2332 case TargetOpcode::G_FPTOUI:
2334 case TargetOpcode::G_FMINNUM:
2335 case TargetOpcode::G_FMAXNUM:
2336 case TargetOpcode::G_FMINIMUMNUM:
2337 case TargetOpcode::G_FMAXIMUMNUM:
2339 case TargetOpcode::G_EXTRACT:
2341 case TargetOpcode::G_INSERT:
2343 case TargetOpcode::G_EXTRACT_VECTOR_ELT:
2345 case TargetOpcode::G_INSERT_VECTOR_ELT:
2347 case TargetOpcode::G_FSIN:
2348 case TargetOpcode::G_FCOS:
2350 case TargetOpcode::G_GLOBAL_VALUE:
2352 case TargetOpcode::G_LOAD:
2353 case TargetOpcode::G_SEXTLOAD:
2354 case TargetOpcode::G_ZEXTLOAD:
2356 case TargetOpcode::G_STORE:
2358 case TargetOpcode::G_FMAD:
2360 case TargetOpcode::G_FDIV:
2362 case TargetOpcode::G_FFREXP:
2364 case TargetOpcode::G_FSQRT:
2366 case TargetOpcode::G_UDIV:
2367 case TargetOpcode::G_UREM:
2368 case TargetOpcode::G_UDIVREM:
2370 case TargetOpcode::G_SDIV:
2371 case TargetOpcode::G_SREM:
2372 case TargetOpcode::G_SDIVREM:
2374 case TargetOpcode::G_ATOMIC_CMPXCHG:
2376 case TargetOpcode::G_FLOG2:
2378 case TargetOpcode::G_FLOG:
2379 case TargetOpcode::G_FLOG10:
2381 case TargetOpcode::G_FEXP2:
2383 case TargetOpcode::G_FEXP:
2384 case TargetOpcode::G_FEXP10:
2386 case TargetOpcode::G_FPOW:
2388 case TargetOpcode::G_FFLOOR:
2390 case TargetOpcode::G_BUILD_VECTOR:
2391 case TargetOpcode::G_BUILD_VECTOR_TRUNC:
2393 case TargetOpcode::G_MUL:
2395 case TargetOpcode::G_CTLZ:
2396 case TargetOpcode::G_CTTZ:
2398 case TargetOpcode::G_CTLS:
2400 case TargetOpcode::G_CTLZ_ZERO_POISON:
2402 case TargetOpcode::G_STACKSAVE:
2404 case TargetOpcode::G_GET_FPENV:
2406 case TargetOpcode::G_SET_FPENV:
2408 case TargetOpcode::G_TRAP:
2410 case TargetOpcode::G_DEBUGTRAP:
2430 if (ST.hasApertureRegs()) {
2435 ? AMDGPU::SRC_SHARED_BASE
2436 : AMDGPU::SRC_PRIVATE_BASE;
2437 assert((ApertureRegNo != AMDGPU::SRC_PRIVATE_BASE ||
2438 !ST.hasGloballyAddressableScratch()) &&
2439 "Cannot use src_private_base with globally addressable scratch!");
2442 B.buildCopy({Dst}, {
Register(ApertureRegNo)});
2443 return B.buildUnmerge(
S32, Dst).getReg(1);
2458 ST.getTargetLowering()->getImplicitParameterOffset(
B.getMF(), Param);
2474 B.buildObjectPtrOffset(LoadAddr, KernargPtrReg,
2477 return B.buildLoad(
S32, LoadAddr, *MMO).getReg(0);
2499 B.buildObjectPtrOffset(
2501 B.buildConstant(
LLT::scalar(64), StructOffset).getReg(0));
2502 return B.buildLoad(
S32, LoadAddr, *MMO).getReg(0);
2510 switch (Def->getOpcode()) {
2511 case AMDGPU::G_FRAME_INDEX:
2512 case AMDGPU::G_GLOBAL_VALUE:
2513 case AMDGPU::G_BLOCK_ADDR:
2515 case AMDGPU::G_CONSTANT: {
2516 const ConstantInt *CI = Def->getOperand(1).getCImm();
2533 assert(
MI.getOpcode() == TargetOpcode::G_ADDRSPACE_CAST ||
2535 Intrinsic::amdgcn_addrspacecast_nonnull));
2540 :
MI.getOperand(1).getReg();
2544 unsigned SrcAS = SrcTy.getAddressSpace();
2554 MI.setDesc(
B.getTII().get(TargetOpcode::G_BITCAST));
2561 auto castFlatToLocalOrPrivate = [&](
const DstOp &Dst) ->
Register {
2563 ST.hasGloballyAddressableScratch()) {
2567 Register SrcLo =
B.buildExtract(
S32, Src, 0).getReg(0);
2569 B.buildInstr(AMDGPU::S_MOV_B32, {
S32},
2570 {
Register(AMDGPU::SRC_FLAT_SCRATCH_BASE_LO)})
2572 MRI.
setRegClass(FlatScratchBaseLo, &AMDGPU::SReg_32RegClass);
2574 return B.buildIntToPtr(Dst,
Sub).getReg(0);
2578 return B.buildExtract(Dst, Src, 0).getReg(0);
2584 castFlatToLocalOrPrivate(Dst);
2585 MI.eraseFromParent();
2591 auto SegmentNull =
B.buildConstant(DstTy, NullVal);
2592 auto FlatNull =
B.buildConstant(SrcTy, 0);
2595 auto PtrLo32 = castFlatToLocalOrPrivate(DstTy);
2599 B.buildSelect(Dst, CmpRes, PtrLo32, SegmentNull.getReg(0));
2601 MI.eraseFromParent();
2608 auto castLocalOrPrivateToFlat = [&](
const DstOp &Dst) ->
Register {
2611 Register SrcAsInt =
B.buildPtrToInt(
S32, Src).getReg(0);
2614 ST.hasGloballyAddressableScratch()) {
2619 ThreadID =
B.buildIntrinsic(Intrinsic::amdgcn_mbcnt_lo, {
S32})
2623 if (ST.isWave64()) {
2624 ThreadID =
B.buildIntrinsic(Intrinsic::amdgcn_mbcnt_hi, {
S32})
2630 B.buildConstant(
S32, 57 - 32 - ST.getWavefrontSizeLog2()).getReg(0);
2631 Register SrcHi =
B.buildShl(
S32, ThreadID, ShAmt).getReg(0);
2633 B.buildMergeLikeInstr(DstTy, {SrcAsInt, SrcHi}).
getReg(0);
2637 B.buildInstr(AMDGPU::S_MOV_B64, {
S64},
2638 {
Register(AMDGPU::SRC_FLAT_SCRATCH_BASE)})
2640 MRI.
setRegClass(FlatScratchBase, &AMDGPU::SReg_64RegClass);
2641 return B.buildPtrAdd(Dst, CvtPtr, FlatScratchBase).getReg(0);
2650 return B.buildMergeLikeInstr(Dst, {SrcAsInt, ApertureReg}).
getReg(0);
2656 castLocalOrPrivateToFlat(Dst);
2657 MI.eraseFromParent();
2661 Register BuildPtr = castLocalOrPrivateToFlat(DstTy);
2668 SegmentNull.getReg(0));
2670 B.buildSelect(Dst, CmpRes, BuildPtr, FlatNull);
2672 MI.eraseFromParent();
2677 SrcTy.getSizeInBits() == 64) {
2679 B.buildExtract(Dst, Src, 0);
2680 MI.eraseFromParent();
2687 uint32_t AddrHiVal = Info->get32BitAddressHighBits();
2688 auto PtrLo =
B.buildPtrToInt(
S32, Src);
2689 if (AddrHiVal == 0) {
2691 B.buildIntToPtr(Dst, Zext);
2693 auto HighAddr =
B.buildConstant(
S32, AddrHiVal);
2694 B.buildMergeLikeInstr(Dst, {PtrLo, HighAddr});
2697 MI.eraseFromParent();
2704 MI.eraseFromParent();
2713 assert(Ty.isScalar() && Ty.getSizeInBits() == 64);
2718 auto C1 =
B.buildFConstant(Ty, C1Val);
2719 auto CopySign =
B.buildFCopysign(Ty, C1, Src);
2722 auto Tmp1 =
B.buildFAdd(Ty, Src, CopySign);
2723 auto Tmp2 =
B.buildFSub(Ty, Tmp1, CopySign);
2725 auto C2 =
B.buildFConstant(Ty, C2Val);
2726 auto Fabs =
B.buildFAbs(Ty, Src);
2729 B.buildSelect(
MI.getOperand(0).getReg(),
Cond, Src, Tmp2);
2730 MI.eraseFromParent();
2748 auto Trunc =
B.buildIntrinsicTrunc(
S64, Src);
2750 const auto Zero =
B.buildFConstant(
S64, 0.0);
2751 const auto One =
B.buildFConstant(
S64, 1.0);
2754 auto And =
B.buildAnd(
S1, Lt0, NeTrunc);
2755 auto Add =
B.buildSelect(
S64,
And, One, Zero);
2758 B.buildFAdd(
MI.getOperand(0).getReg(), Trunc,
Add);
2759 MI.eraseFromParent();
2767 Register Src0Reg =
MI.getOperand(1).getReg();
2768 Register Src1Reg =
MI.getOperand(2).getReg();
2769 auto Flags =
MI.getFlags();
2772 auto Div =
B.buildFDiv(Ty, Src0Reg, Src1Reg, Flags);
2773 auto Trunc =
B.buildIntrinsicTrunc(Ty, Div, Flags);
2774 auto Neg =
B.buildFNeg(Ty, Trunc, Flags);
2775 B.buildFMA(DstReg, Neg, Src1Reg, Src0Reg, Flags);
2776 MI.eraseFromParent();
2782 const unsigned FractBits = 52;
2783 const unsigned ExpBits = 11;
2786 auto Const0 =
B.buildConstant(
S32, FractBits - 32);
2787 auto Const1 =
B.buildConstant(
S32, ExpBits);
2789 auto ExpPart =
B.buildIntrinsic(Intrinsic::amdgcn_ubfe, {
S32})
2791 .addUse(Const0.getReg(0))
2792 .addUse(Const1.getReg(0));
2794 return B.buildSub(
S32, ExpPart,
B.buildConstant(
S32, 1023));
2808 auto Unmerge =
B.buildUnmerge({
S32,
S32}, Src);
2815 const unsigned FractBits = 52;
2818 const auto SignBitMask =
B.buildConstant(
S32, UINT32_C(1) << 31);
2819 auto SignBit =
B.buildAnd(
S32,
Hi, SignBitMask);
2821 const auto FractMask =
B.buildConstant(
S64, (UINT64_C(1) << FractBits) - 1);
2823 const auto Zero32 =
B.buildConstant(
S32, 0);
2826 auto SignBit64 =
B.buildMergeLikeInstr(
S64, {Zero32, SignBit});
2828 auto Shr =
B.buildAShr(
S64, FractMask, Exp);
2829 auto Not =
B.buildNot(
S64, Shr);
2830 auto Tmp0 =
B.buildAnd(
S64, Src, Not);
2831 auto FiftyOne =
B.buildConstant(
S32, FractBits - 1);
2836 auto Tmp1 =
B.buildSelect(
S64, ExpLt0, SignBit64, Tmp0);
2837 B.buildSelect(
MI.getOperand(0).getReg(), ExpGt51, Src, Tmp1);
2838 MI.eraseFromParent();
2854 auto Unmerge =
B.buildUnmerge({
S32,
S32}, Src);
2855 auto ThirtyTwo =
B.buildConstant(
S32, 32);
2858 auto CvtHi =
Signed ?
B.buildSITOFP(
S64, Unmerge.getReg(1))
2859 :
B.buildUITOFP(
S64, Unmerge.getReg(1));
2861 auto CvtLo =
B.buildUITOFP(
S64, Unmerge.getReg(0));
2862 auto LdExp =
B.buildFLdexp(
S64, CvtHi, ThirtyTwo);
2865 B.buildFAdd(Dst, LdExp, CvtLo);
2866 MI.eraseFromParent();
2872 auto One =
B.buildConstant(
S32, 1);
2876 auto ThirtyOne =
B.buildConstant(
S32, 31);
2877 auto X =
B.buildXor(
S32, Unmerge.getReg(0), Unmerge.getReg(1));
2878 auto OppositeSign =
B.buildAShr(
S32,
X, ThirtyOne);
2879 auto MaxShAmt =
B.buildAdd(
S32, ThirtyTwo, OppositeSign);
2880 auto LS =
B.buildIntrinsic(Intrinsic::amdgcn_sffbh, {
S32})
2881 .addUse(Unmerge.getReg(1));
2882 auto LS2 =
B.buildSub(
S32, LS, One);
2883 ShAmt =
B.buildUMin(
S32, LS2, MaxShAmt);
2885 ShAmt =
B.buildCTLZ(
S32, Unmerge.getReg(1));
2886 auto Norm =
B.buildShl(
S64, Src, ShAmt);
2887 auto Unmerge2 =
B.buildUnmerge({
S32,
S32}, Norm);
2888 auto Adjust =
B.buildUMin(
S32, One, Unmerge2.getReg(0));
2889 auto Norm2 =
B.buildOr(
S32, Unmerge2.getReg(1), Adjust);
2890 auto FVal =
Signed ?
B.buildSITOFP(
S32, Norm2) :
B.buildUITOFP(
S32, Norm2);
2891 auto Scale =
B.buildSub(
S32, ThirtyTwo, ShAmt);
2892 B.buildFLdexp(Dst, FVal, Scale);
2893 MI.eraseFromParent();
2913 unsigned Flags =
MI.getFlags();
2924 auto Trunc =
B.buildIntrinsicTrunc(SrcLT, Src, Flags);
2932 Sign =
B.buildAShr(
S32, Src,
B.buildConstant(
S32, 31));
2933 Trunc =
B.buildFAbs(
S32, Trunc, Flags);
2937 K0 =
B.buildFConstant(
2939 K1 =
B.buildFConstant(
2942 K0 =
B.buildFConstant(
2944 K1 =
B.buildFConstant(
2948 auto Mul =
B.buildFMul(SrcLT, Trunc, K0, Flags);
2949 auto FloorMul =
B.buildFFloor(SrcLT,
Mul, Flags);
2950 auto Fma =
B.buildFMA(SrcLT, FloorMul, K1, Trunc, Flags);
2953 :
B.buildFPTOUI(
S32, FloorMul);
2954 auto Lo =
B.buildFPTOUI(
S32, Fma);
2958 Sign =
B.buildMergeLikeInstr(
S64, {Sign, Sign});
2960 B.buildSub(Dst,
B.buildXor(
S64,
B.buildMergeLikeInstr(
S64, {Lo, Hi}), Sign),
2963 B.buildMergeLikeInstr(Dst, {
Lo,
Hi});
2964 MI.eraseFromParent();
2996 unsigned StartIdx =
Offset / 32;
2998 auto Unmerge =
B.buildUnmerge(
LLT::scalar(32), SrcReg);
3000 if (DstCount == 1) {
3002 B.buildIntToPtr(DstReg, Unmerge.getReg(StartIdx));
3007 for (
unsigned I = 0;
I < DstCount; ++
I)
3008 MergeVec.
push_back(Unmerge.getReg(StartIdx +
I));
3009 B.buildMergeLikeInstr(DstReg, MergeVec);
3012 MI.eraseFromParent();
3022 Register InsertSrc =
MI.getOperand(2).getReg();
3031 if (
Offset % 32 != 0 || DstSize % 32 != 0 || InsertSize % 32 != 0)
3035 unsigned DstCount = DstSize / 32;
3036 unsigned InsertCount = InsertSize / 32;
3037 unsigned StartIdx =
Offset / 32;
3039 auto SrcUnmerge =
B.buildUnmerge(
S32, SrcReg);
3042 for (
unsigned I = 0;
I < StartIdx; ++
I)
3045 if (InsertCount == 1) {
3049 InsertSrc =
B.buildPtrToInt(
S32, InsertSrc).getReg(0);
3052 auto InsertUnmerge =
B.buildUnmerge(
S32, InsertSrc);
3053 for (
unsigned I = 0;
I < InsertCount; ++
I)
3057 for (
unsigned I = StartIdx + InsertCount;
I < DstCount; ++
I)
3060 B.buildMergeLikeInstr(DstReg, MergeVec);
3062 MI.eraseFromParent();
3089 auto IntVec =
B.buildPtrToInt(IntVecTy, Vec);
3090 auto IntElt =
B.buildExtractVectorElement(IntTy, IntVec,
MI.getOperand(2));
3091 B.buildIntToPtr(Dst, IntElt);
3093 MI.eraseFromParent();
3100 std::optional<ValueAndVReg> MaybeIdxVal =
3104 const uint64_t IdxVal = MaybeIdxVal->Value.getZExtValue();
3107 auto Unmerge =
B.buildUnmerge(EltTy, Vec);
3108 B.buildCopy(Dst, Unmerge.getReg(IdxVal));
3113 MI.eraseFromParent();
3142 auto IntVecSource =
B.buildPtrToInt(IntVecTy, Vec);
3143 auto IntIns =
B.buildPtrToInt(IntTy, Ins);
3144 auto IntVecDest =
B.buildInsertVectorElement(IntVecTy, IntVecSource, IntIns,
3146 B.buildIntToPtr(Dst, IntVecDest);
3147 MI.eraseFromParent();
3154 std::optional<ValueAndVReg> MaybeIdxVal =
3159 const uint64_t IdxVal = MaybeIdxVal->Value.getZExtValue();
3162 if (IdxVal < NumElts) {
3164 for (
unsigned i = 0; i < NumElts; ++i)
3166 B.buildUnmerge(SrcRegs, Vec);
3168 SrcRegs[IdxVal] =
MI.getOperand(2).getReg();
3169 B.buildMergeLikeInstr(Dst, SrcRegs);
3174 MI.eraseFromParent();
3185 unsigned Flags =
MI.getFlags();
3189 if (ST.hasTrigReducedRange()) {
3190 auto MulVal =
B.buildFMul(Ty, SrcReg, OneOver2Pi, Flags);
3191 TrigVal =
B.buildIntrinsic(Intrinsic::amdgcn_fract, {Ty})
3192 .addUse(MulVal.getReg(0))
3196 TrigVal =
B.buildFMul(Ty, SrcReg, OneOver2Pi, Flags).getReg(0);
3199 Intrinsic::amdgcn_sin : Intrinsic::amdgcn_cos;
3203 MI.eraseFromParent();
3211 unsigned GAFlags)
const {
3240 B.getMRI()->createGenericVirtualRegister(ConstPtrTy);
3242 if (ST.has64BitLiterals()) {
3246 B.buildInstr(AMDGPU::SI_PC_ADD_REL_OFFSET64).addDef(PCReg);
3250 B.buildInstr(AMDGPU::SI_PC_ADD_REL_OFFSET).addDef(PCReg);
3259 if (!
B.getMRI()->getRegClassOrNull(PCReg))
3260 B.getMRI()->setRegClass(PCReg, &AMDGPU::SReg_64RegClass);
3263 B.buildExtract(DstReg, PCReg, 0);
3273 if (RequiresHighHalf && ST.has64BitLiterals()) {
3275 MRI.
setRegClass(DstReg, &AMDGPU::SReg_64RegClass);
3276 B.buildInstr(AMDGPU::S_MOV_B64)
3291 MRI.
setRegClass(AddrLo, &AMDGPU::SReg_32RegClass);
3294 B.buildInstr(AMDGPU::S_MOV_B32)
3299 if (RequiresHighHalf) {
3301 "Must provide a 64-bit pointer type!");
3304 MRI.
setRegClass(AddrHi, &AMDGPU::SReg_32RegClass);
3306 B.buildInstr(AMDGPU::S_MOV_B32)
3317 MRI.
setRegClass(AddrDst, &AMDGPU::SReg_64RegClass);
3319 B.buildMergeValues(AddrDst, {AddrLo, AddrHi});
3323 if (AddrDst != DstReg)
3324 B.buildCast(DstReg, AddrDst);
3325 }
else if (AddrLo != DstReg) {
3328 B.buildCast(DstReg, AddrLo);
3337 unsigned AS = Ty.getAddressSpace();
3345 GV->
getName() !=
"llvm.amdgcn.module.lds" &&
3349 Fn,
"local memory global used by non-kernel function",
3358 B.buildUndef(DstReg);
3359 MI.eraseFromParent();
3383 auto Sz =
B.buildIntrinsic(Intrinsic::amdgcn_groupstaticsize, {
S32});
3384 B.buildIntToPtr(DstReg, Sz);
3385 MI.eraseFromParent();
3391 MI.eraseFromParent();
3395 if (ST.isAmdPalOS() || ST.isMesa3DOS()) {
3397 MI.eraseFromParent();
3405 MI.eraseFromParent();
3411 MI.eraseFromParent();
3427 if (Ty.getSizeInBits() == 32) {
3429 auto Load =
B.buildLoad(PtrTy, GOTAddr, *GOTMMO);
3430 B.buildExtract(DstReg, Load, 0);
3432 B.buildLoad(DstReg, GOTAddr, *GOTMMO);
3434 MI.eraseFromParent();
3457 auto Cast =
B.buildAddrSpaceCast(ConstPtr, PtrReg);
3459 MI.getOperand(1).setReg(Cast.getReg(0));
3464 if (
MI.getOpcode() != AMDGPU::G_LOAD)
3490 if (WideMemSize == ValSize) {
3496 MI.setMemRefs(MF, {WideMMO});
3502 if (ValSize > WideMemSize)
3509 WideLoad =
B.buildLoadFromOffset(WideTy, PtrReg, *MMO, 0).getReg(0);
3510 B.buildTrunc(ValReg, WideLoad).getReg(0);
3517 WideLoad =
B.buildLoadFromOffset(WideTy, PtrReg, *MMO, 0).getReg(0);
3518 B.buildExtract(ValReg, WideLoad, 0);
3522 WideLoad =
B.buildLoadFromOffset(WideTy, PtrReg, *MMO, 0).getReg(0);
3523 B.buildDeleteTrailingVectorElements(ValReg, WideLoad);
3527 MI.eraseFromParent();
3540 Register DataReg =
MI.getOperand(0).getReg();
3585 "this should not have been custom lowered");
3590 Register PackedVal =
B.buildBuildVector(VecTy, { NewVal, CmpVal }).
getReg(0);
3592 B.buildInstr(AMDGPU::G_AMDGPU_ATOMIC_CMPXCHG)
3596 .setMemRefs(
MI.memoperands());
3598 MI.eraseFromParent();
3606 switch (
DefMI->getOpcode()) {
3607 case TargetOpcode::G_INTRINSIC: {
3609 case Intrinsic::amdgcn_frexp_mant:
3610 case Intrinsic::amdgcn_log:
3611 case Intrinsic::amdgcn_log_clamp:
3612 case Intrinsic::amdgcn_exp2:
3613 case Intrinsic::amdgcn_sqrt:
3621 case TargetOpcode::G_FSQRT:
3623 case TargetOpcode::G_FFREXP: {
3624 if (
DefMI->getOperand(0).getReg() == Src)
3628 case TargetOpcode::G_FPEXT: {
3649std::pair<Register, Register>
3651 unsigned Flags)
const {
3656 auto SmallestNormal =
B.buildFConstant(
3658 auto IsLtSmallestNormal =
3661 auto Scale32 =
B.buildFConstant(
F32, 0x1.0p+32);
3662 auto One =
B.buildFConstant(
F32, 1.0);
3664 B.buildSelect(
F32, IsLtSmallestNormal, Scale32, One, Flags);
3665 auto ScaledInput =
B.buildFMul(
F32, Src, ScaleFactor, Flags);
3667 return {ScaledInput.getReg(0), IsLtSmallestNormal.getReg(0)};
3680 LLT Ty =
B.getMRI()->getType(Dst);
3681 unsigned Flags =
MI.getFlags();
3686 auto Ext =
B.buildFPExt(
F32, Src, Flags);
3687 auto Log2 =
B.buildIntrinsic(Intrinsic::amdgcn_log, {
F32})
3688 .addUse(Ext.getReg(0))
3690 B.buildFPTrunc(Dst,
Log2, Flags);
3691 MI.eraseFromParent();
3699 B.buildIntrinsic(Intrinsic::amdgcn_log, {
MI.getOperand(0)})
3702 MI.eraseFromParent();
3706 auto Log2 =
B.buildIntrinsic(Intrinsic::amdgcn_log, {Ty})
3707 .addUse(ScaledInput)
3710 auto ThirtyTwo =
B.buildFConstant(Ty, 32.0);
3711 auto Zero =
B.buildFConstant(Ty, 0.0);
3713 B.buildSelect(Ty, IsLtSmallestNormal, ThirtyTwo, Zero, Flags);
3714 B.buildFSub(Dst,
Log2, ResultOffset, Flags);
3716 MI.eraseFromParent();
3722 auto FMul =
B.buildFMul(Ty,
X,
Y, Flags);
3723 return B.buildFAdd(Ty,
FMul, Z, Flags).getReg(0);
3728 const bool IsLog10 =
MI.getOpcode() == TargetOpcode::G_FLOG10;
3729 assert(IsLog10 ||
MI.getOpcode() == TargetOpcode::G_FLOG);
3734 unsigned Flags =
MI.getFlags();
3747 auto PromoteSrc =
B.buildFPExt(
F32,
X);
3749 B.buildFPTrunc(Dst, LogVal);
3754 MI.eraseFromParent();
3763 B.buildIntrinsic(Intrinsic::amdgcn_log, {Ty}).addUse(
X).setMIFlags(Flags);
3766 if (ST.hasFastFMAF32()) {
3768 const float c_log10 = 0x1.344134p-2f;
3769 const float cc_log10 = 0x1.09f79ep-26f;
3772 const float c_log = 0x1.62e42ep-1f;
3773 const float cc_log = 0x1.efa39ep-25f;
3775 auto C =
B.buildFConstant(Ty, IsLog10 ? c_log10 : c_log);
3776 auto CC =
B.buildFConstant(Ty, IsLog10 ? cc_log10 : cc_log);
3780 R =
B.buildFMul(Ty,
Y,
C, NewFlags).getReg(0);
3781 auto NegR =
B.buildFNeg(Ty, R, NewFlags);
3782 auto FMA0 =
B.buildFMA(Ty,
Y,
C, NegR, NewFlags);
3783 auto FMA1 =
B.buildFMA(Ty,
Y, CC, FMA0, NewFlags);
3784 R =
B.buildFAdd(Ty, R, FMA1, NewFlags).getReg(0);
3787 const float ch_log10 = 0x1.344000p-2f;
3788 const float ct_log10 = 0x1.3509f6p-18f;
3791 const float ch_log = 0x1.62e000p-1f;
3792 const float ct_log = 0x1.0bfbe8p-15f;
3794 auto CH =
B.buildFConstant(Ty, IsLog10 ? ch_log10 : ch_log);
3795 auto CT =
B.buildFConstant(Ty, IsLog10 ? ct_log10 : ct_log);
3797 auto MaskConst =
B.buildConstant(Ty, 0xfffff000);
3798 auto YH =
B.buildAnd(Ty,
Y, MaskConst);
3799 auto YT =
B.buildFSub(Ty,
Y, YH, Flags);
3803 auto YTCT =
B.buildFMul(Ty, YT, CT, NewFlags);
3806 getMad(
B, Ty, YH.getReg(0), CT.getReg(0), YTCT.getReg(0), NewFlags);
3808 R =
getMad(
B, Ty, YH.getReg(0),
CH.getReg(0), Mad1, NewFlags);
3811 const bool IsFiniteOnly =
3814 if (!IsFiniteOnly) {
3817 auto Fabs =
B.buildFAbs(Ty,
Y);
3820 R =
B.buildSelect(Ty, IsFinite, R,
Y, Flags).getReg(0);
3824 auto Zero =
B.buildFConstant(Ty, 0.0);
3826 B.buildFConstant(Ty, IsLog10 ? 0x1.344136p+3f : 0x1.62e430p+4f);
3827 auto Shift =
B.buildSelect(Ty, IsScaled, ShiftK, Zero, Flags);
3828 B.buildFSub(Dst, R, Shift, Flags);
3830 B.buildCopy(Dst, R);
3833 MI.eraseFromParent();
3839 unsigned Flags)
const {
3840 const double Log2BaseInverted =
3843 LLT Ty =
B.getMRI()->getType(Dst);
3848 auto LogSrc =
B.buildIntrinsic(Intrinsic::amdgcn_log, {Ty})
3851 auto ScaledResultOffset =
B.buildFConstant(Ty, -32.0 * Log2BaseInverted);
3852 auto Zero =
B.buildFConstant(Ty, 0.0);
3854 B.buildSelect(Ty, IsScaled, ScaledResultOffset, Zero, Flags);
3855 auto Log2Inv =
B.buildFConstant(Ty, Log2BaseInverted);
3857 if (ST.hasFastFMAF32())
3858 B.buildFMA(Dst, LogSrc, Log2Inv, ResultOffset, Flags);
3860 auto Mul =
B.buildFMul(Ty, LogSrc, Log2Inv, Flags);
3861 B.buildFAdd(Dst,
Mul, ResultOffset, Flags);
3869 ?
B.buildFLog2(Ty, Src, Flags)
3870 :
B.buildIntrinsic(Intrinsic::amdgcn_log, {Ty})
3873 auto Log2BaseInvertedOperand =
B.buildFConstant(Ty, Log2BaseInverted);
3874 B.buildFMul(Dst, Log2Operand, Log2BaseInvertedOperand, Flags);
3885 unsigned Flags =
MI.getFlags();
3886 LLT Ty =
B.getMRI()->getType(Dst);
3896 auto Ext =
B.buildFPExt(
F32, Src, Flags);
3897 auto Log2 =
B.buildIntrinsic(Intrinsic::amdgcn_exp2, {
F32})
3898 .addUse(Ext.getReg(0))
3900 B.buildFPTrunc(Dst,
Log2, Flags);
3901 MI.eraseFromParent();
3911 MI.eraseFromParent();
3919 auto RangeCheckConst =
B.buildFConstant(Ty, -0x1.f80000p+6f);
3921 RangeCheckConst, Flags);
3923 auto SixtyFour =
B.buildFConstant(Ty, 0x1.0p+6f);
3924 auto Zero =
B.buildFConstant(Ty, 0.0);
3925 auto AddOffset =
B.buildSelect(
F32, NeedsScaling, SixtyFour, Zero, Flags);
3926 auto AddInput =
B.buildFAdd(
F32, Src, AddOffset, Flags);
3928 auto Exp2 =
B.buildIntrinsic(Intrinsic::amdgcn_exp2, {Ty})
3929 .addUse(AddInput.getReg(0))
3932 auto TwoExpNeg64 =
B.buildFConstant(Ty, 0x1.0p-64f);
3933 auto One =
B.buildFConstant(Ty, 1.0);
3934 auto ResultScale =
B.buildSelect(
F32, NeedsScaling, TwoExpNeg64, One, Flags);
3935 B.buildFMul(Dst, Exp2, ResultScale, Flags);
3936 MI.eraseFromParent();
3941 const SrcOp &Src,
unsigned Flags) {
3942 LLT Ty = Dst.getLLTTy(*
B.getMRI());
3945 return B.buildIntrinsic(Intrinsic::amdgcn_exp2, {Dst})
3946 .addUse(Src.getReg())
3949 return B.buildFExp2(Dst, Src, Flags);
3955 bool IsExp10)
const {
3956 LLT Ty =
B.getMRI()->getType(
X);
3960 auto Const =
B.buildFConstant(Ty, IsExp10 ? 0x1.a934f0p+1f :
numbers::log2e);
3961 auto Mul =
B.buildFMul(Ty,
X, Const, Flags);
3968 LLT Ty =
B.getMRI()->getType(Dst);
3975 auto Threshold =
B.buildFConstant(Ty, -0x1.5d58a0p+6f);
3978 auto ScaleOffset =
B.buildFConstant(Ty, 0x1.0p+6f);
3979 auto ScaledX =
B.buildFAdd(Ty,
X, ScaleOffset, Flags);
3980 auto AdjustedX =
B.buildSelect(Ty, NeedsScaling, ScaledX,
X, Flags);
3983 auto ExpInput =
B.buildFMul(Ty, AdjustedX, Log2E, Flags);
3985 auto Exp2 =
B.buildIntrinsic(Intrinsic::amdgcn_exp2, {Ty})
3986 .addUse(ExpInput.getReg(0))
3989 auto ResultScaleFactor =
B.buildFConstant(Ty, 0x1.969d48p-93f);
3990 auto AdjustedResult =
B.buildFMul(Ty, Exp2, ResultScaleFactor, Flags);
3991 B.buildSelect(Dst, NeedsScaling, AdjustedResult, Exp2, Flags);
3997 unsigned Flags)
const {
3998 LLT Ty =
B.getMRI()->getType(Dst);
4003 auto K0 =
B.buildFConstant(Ty, 0x1.a92000p+1f);
4004 auto K1 =
B.buildFConstant(Ty, 0x1.4f0978p-11f);
4006 auto Mul1 =
B.buildFMul(Ty,
X, K1, Flags);
4007 auto Exp2_1 =
buildExp(
B, Ty, Mul1, Flags);
4008 auto Mul0 =
B.buildFMul(Ty,
X, K0, Flags);
4009 auto Exp2_0 =
buildExp(
B, Ty, Mul0, Flags);
4010 B.buildFMul(Dst, Exp2_0, Exp2_1, Flags);
4020 auto Threshold =
B.buildFConstant(Ty, -0x1.2f7030p+5f);
4024 auto ScaleOffset =
B.buildFConstant(Ty, 0x1.0p+5f);
4025 auto ScaledX =
B.buildFAdd(Ty,
X, ScaleOffset, Flags);
4026 auto AdjustedX =
B.buildSelect(Ty, NeedsScaling, ScaledX,
X);
4028 auto K0 =
B.buildFConstant(Ty, 0x1.a92000p+1f);
4029 auto K1 =
B.buildFConstant(Ty, 0x1.4f0978p-11f);
4031 auto Mul1 =
B.buildFMul(Ty, AdjustedX, K1, Flags);
4032 auto Exp2_1 =
buildExp(
B, Ty, Mul1, Flags);
4033 auto Mul0 =
B.buildFMul(Ty, AdjustedX, K0, Flags);
4034 auto Exp2_0 =
buildExp(
B, Ty, Mul0, Flags);
4036 auto MulExps =
B.buildFMul(Ty, Exp2_0, Exp2_1, Flags);
4037 auto ResultScaleFactor =
B.buildFConstant(Ty, 0x1.9f623ep-107f);
4038 auto AdjustedResult =
B.buildFMul(Ty, MulExps, ResultScaleFactor, Flags);
4040 B.buildSelect(Dst, NeedsScaling, AdjustedResult, MulExps);
4059 if (
MI.getOpcode() == TargetOpcode::G_FEXP2) {
4061 Dn =
B.buildFRint(
S64,
X, Flags).getReg(0);
4063 F =
B.buildFSub(
S64,
X, Dn, Flags).getReg(0);
4065 auto C1 =
B.buildFConstant(
S64,
APFloat(0x1.62e42fefa39efp-1));
4066 auto C2 =
B.buildFConstant(
S64,
APFloat(0x1.abc9e3b39803fp-56));
4067 auto Mul2 =
B.buildFMul(
S64,
F, C2, Flags).getReg(0);
4068 T =
B.buildFMA(
S64,
F, C1, Mul2, Flags).getReg(0);
4070 }
else if (
MI.getOpcode() == TargetOpcode::G_FEXP10) {
4071 auto C1 =
B.buildFConstant(
S64,
APFloat(0x1.a934f0979a371p+1));
4072 auto Mul =
B.buildFMul(
S64,
X, C1, Flags).getReg(0);
4073 Dn =
B.buildFRint(
S64,
Mul, Flags).getReg(0);
4075 auto NegDn =
B.buildFNeg(
S64, Dn, Flags).getReg(0);
4076 auto C2 =
B.buildFConstant(
S64,
APFloat(-0x1.9dc1da994fd21p-59));
4077 auto C3 =
B.buildFConstant(
S64,
APFloat(0x1.34413509f79ffp-2));
4078 auto Inner =
B.buildFMA(
S64, NegDn, C3,
X, Flags).getReg(0);
4079 F =
B.buildFMA(
S64, NegDn, C2, Inner, Flags).getReg(0);
4081 auto C4 =
B.buildFConstant(
S64,
APFloat(0x1.26bb1bbb55516p+1));
4082 auto C5 =
B.buildFConstant(
S64,
APFloat(-0x1.f48ad494ea3e9p-53));
4083 auto MulF =
B.buildFMul(
S64,
F, C5, Flags).getReg(0);
4084 T =
B.buildFMA(
S64,
F, C4, MulF, Flags).getReg(0);
4087 auto C1 =
B.buildFConstant(
S64,
APFloat(0x1.71547652b82fep+0));
4088 auto Mul =
B.buildFMul(
S64,
X, C1, Flags).getReg(0);
4089 Dn =
B.buildFRint(
S64,
Mul, Flags).getReg(0);
4091 auto NegDn =
B.buildFNeg(
S64, Dn, Flags).getReg(0);
4092 auto C2 =
B.buildFConstant(
S64,
APFloat(0x1.abc9e3b39803fp-56));
4093 auto C3 =
B.buildFConstant(
S64,
APFloat(0x1.62e42fefa39efp-1));
4094 auto Inner =
B.buildFMA(
S64, NegDn, C3,
X, Flags).getReg(0);
4095 T =
B.buildFMA(
S64, NegDn, C2, Inner, Flags).getReg(0);
4099 auto P =
B.buildFConstant(
S64, 0x1.ade156a5dcb37p-26);
4100 P =
B.buildFMA(
S64,
T,
P,
B.buildFConstant(
S64, 0x1.28af3fca7ab0cp-22),
4102 P =
B.buildFMA(
S64,
T,
P,
B.buildFConstant(
S64, 0x1.71dee623fde64p-19),
4104 P =
B.buildFMA(
S64,
T,
P,
B.buildFConstant(
S64, 0x1.a01997c89e6b0p-16),
4106 P =
B.buildFMA(
S64,
T,
P,
B.buildFConstant(
S64, 0x1.a01a014761f6ep-13),
4108 P =
B.buildFMA(
S64,
T,
P,
B.buildFConstant(
S64, 0x1.6c16c1852b7b0p-10),
4110 P =
B.buildFMA(
S64,
T,
P,
B.buildFConstant(
S64, 0x1.1111111122322p-7), Flags);
4111 P =
B.buildFMA(
S64,
T,
P,
B.buildFConstant(
S64, 0x1.55555555502a1p-5), Flags);
4112 P =
B.buildFMA(
S64,
T,
P,
B.buildFConstant(
S64, 0x1.5555555555511p-3), Flags);
4113 P =
B.buildFMA(
S64,
T,
P,
B.buildFConstant(
S64, 0x1.000000000000bp-1), Flags);
4115 auto One =
B.buildFConstant(
S64, 1.0);
4116 P =
B.buildFMA(
S64,
T,
P, One, Flags);
4117 P =
B.buildFMA(
S64,
T,
P, One, Flags);
4120 auto DnInt =
B.buildFPTOSI(
S32, Dn);
4121 auto Z =
B.buildFLdexp(
S64,
P, DnInt, Flags);
4128 Z =
B.buildSelect(
S64, CondHi, Z, PInf, Flags);
4135 B.buildSelect(
MI.getOperand(0).getReg(), CondLo, Z, Zero, Flags);
4137 MI.eraseFromParent();
4145 const unsigned Flags =
MI.getFlags();
4157 const bool IsExp10 =
MI.getOpcode() == TargetOpcode::G_FEXP10;
4165 MI.eraseFromParent();
4176 auto Ext =
B.buildFPExt(
F32,
X, Flags);
4179 B.buildFPTrunc(Dst, Lowered, Flags);
4180 MI.eraseFromParent();
4191 MI.eraseFromParent();
4219 const unsigned FlagsNoContract = Flags &
~MachineInstr::FmContract;
4222 if (ST.hasFastFMAF32()) {
4224 const float cc_exp = 0x1.4ae0bep-26f;
4225 const float c_exp10 = 0x1.a934f0p+1f;
4226 const float cc_exp10 = 0x1.2f346ep-24f;
4228 auto C =
B.buildFConstant(Ty, IsExp10 ? c_exp10 : c_exp);
4229 PH =
B.buildFMul(Ty,
X,
C, Flags).getReg(0);
4230 auto NegPH =
B.buildFNeg(Ty, PH, Flags);
4231 auto FMA0 =
B.buildFMA(Ty,
X,
C, NegPH, Flags);
4233 auto CC =
B.buildFConstant(Ty, IsExp10 ? cc_exp10 : cc_exp);
4234 PL =
B.buildFMA(Ty,
X, CC, FMA0, Flags).getReg(0);
4236 const float ch_exp = 0x1.714000p+0f;
4237 const float cl_exp = 0x1.47652ap-12f;
4239 const float ch_exp10 = 0x1.a92000p+1f;
4240 const float cl_exp10 = 0x1.4f0978p-11f;
4242 auto MaskConst =
B.buildConstant(Ty, 0xfffff000);
4243 auto XH =
B.buildAnd(Ty,
X, MaskConst);
4244 auto XL =
B.buildFSub(Ty,
X, XH, Flags);
4246 auto CH =
B.buildFConstant(Ty, IsExp10 ? ch_exp10 : ch_exp);
4247 PH =
B.buildFMul(Ty, XH,
CH, Flags).getReg(0);
4249 auto CL =
B.buildFConstant(Ty, IsExp10 ? cl_exp10 : cl_exp);
4250 auto XLCL =
B.buildFMul(Ty, XL, CL, Flags);
4253 getMad(
B, Ty, XL.getReg(0),
CH.getReg(0), XLCL.getReg(0), Flags);
4254 PL =
getMad(
B, Ty, XH.getReg(0), CL.getReg(0), Mad0, Flags);
4257 auto E =
B.buildIntrinsicRoundeven(Ty, PH, Flags);
4260 auto PHSubE =
B.buildFSub(Ty, PH, E, FlagsNoContract);
4261 auto A =
B.buildFAdd(Ty, PHSubE, PL, Flags);
4264 auto Exp2 =
B.buildIntrinsic(Intrinsic::amdgcn_exp2, {Ty})
4265 .addUse(
A.getReg(0))
4267 auto R =
B.buildFLdexp(Ty, Exp2, IntE, Flags);
4269 auto UnderflowCheckConst =
4270 B.buildFConstant(Ty, IsExp10 ? -0x1.66d3e8p+5f : -0x1.9d1da0p+6f);
4271 auto Zero =
B.buildFConstant(Ty, 0.0);
4275 R =
B.buildSelect(Ty, Underflow, Zero, R);
4278 auto OverflowCheckConst =
4279 B.buildFConstant(Ty, IsExp10 ? 0x1.344136p+5f : 0x1.62e430p+6f);
4284 R =
B.buildSelect(Ty, Overflow, Inf, R, Flags);
4287 B.buildCopy(Dst, R);
4288 MI.eraseFromParent();
4297 unsigned Flags =
MI.getFlags();
4298 LLT Ty =
B.getMRI()->getType(Dst);
4303 auto Log =
B.buildFLog2(
F32, Src0, Flags);
4304 auto Mul =
B.buildIntrinsic(Intrinsic::amdgcn_fmul_legacy, {
F32})
4305 .addUse(Log.getReg(0))
4308 B.buildFExp2(Dst,
Mul, Flags);
4309 }
else if (Ty == F16) {
4311 auto Log =
B.buildFLog2(F16, Src0, Flags);
4312 auto Ext0 =
B.buildFPExt(
F32, Log, Flags);
4313 auto Ext1 =
B.buildFPExt(
F32, Src1, Flags);
4314 auto Mul =
B.buildIntrinsic(Intrinsic::amdgcn_fmul_legacy, {
F32})
4315 .addUse(Ext0.getReg(0))
4316 .addUse(Ext1.getReg(0))
4318 B.buildFExp2(Dst,
B.buildFPTrunc(F16,
Mul), Flags);
4322 MI.eraseFromParent();
4330 ModSrc = SrcFNeg->getOperand(1).getReg();
4332 ModSrc = SrcFAbs->getOperand(1).getReg();
4334 ModSrc = SrcFAbs->getOperand(1).getReg();
4345 Register OrigSrc =
MI.getOperand(1).getReg();
4346 unsigned Flags =
MI.getFlags();
4348 "this should not have been custom lowered");
4358 auto Fract =
B.buildIntrinsic(Intrinsic::amdgcn_fract, {
F64})
4378 B.buildFMinNumIEEE(Min, Fract, Const, Flags);
4380 B.buildFMinNum(Min, Fract, Const, Flags);
4385 CorrectedFract =
B.buildSelect(
F64, IsNan, ModSrc, Min, Flags).getReg(0);
4388 auto NegFract =
B.buildFNeg(
F64, CorrectedFract, Flags);
4389 B.buildFAdd(Dst, OrigSrc, NegFract, Flags);
4391 MI.eraseFromParent();
4407 if (
MI.getOpcode() == AMDGPU::G_BUILD_VECTOR_TRUNC) {
4409 Src0 =
B.buildTrunc(
S16,
MI.getOperand(1).getReg()).getReg(0);
4410 Src1 =
B.buildTrunc(
S16,
MI.getOperand(2).getReg()).getReg(0);
4413 auto Merge =
B.buildMergeLikeInstr(
S32, {Src0, Src1});
4414 B.buildBitcast(Dst,
Merge);
4416 MI.eraseFromParent();
4433 bool UsePartialMad64_32,
4434 bool SeparateOddAlignedProducts)
const {
4449 auto getZero32 = [&]() ->
Register {
4451 Zero32 =
B.buildConstant(
S32, 0).getReg(0);
4454 auto getZero64 = [&]() ->
Register {
4456 Zero64 =
B.buildConstant(
S64, 0).getReg(0);
4461 for (
unsigned i = 0; i < Src0.
size(); ++i) {
4472 if (CarryIn.empty())
4475 bool HaveCarryOut =
true;
4477 if (CarryIn.size() == 1) {
4479 LocalAccum =
B.buildZExt(
S32, CarryIn[0]).getReg(0);
4483 CarryAccum = getZero32();
4485 CarryAccum =
B.buildZExt(
S32, CarryIn[0]).getReg(0);
4486 for (
unsigned i = 1; i + 1 < CarryIn.size(); ++i) {
4488 B.buildUAdde(
S32,
S1, CarryAccum, getZero32(), CarryIn[i])
4493 LocalAccum = getZero32();
4494 HaveCarryOut =
false;
4499 B.buildUAdde(
S32,
S1, CarryAccum, LocalAccum, CarryIn.back());
4500 LocalAccum =
Add.getReg(0);
4514 auto buildMadChain =
4517 assert((DstIndex + 1 < Accum.
size() && LocalAccum.size() == 2) ||
4518 (DstIndex + 1 >= Accum.
size() && LocalAccum.size() == 1));
4525 if (LocalAccum.size() == 1 &&
4526 (!UsePartialMad64_32 || !CarryIn.empty())) {
4529 unsigned j1 = DstIndex - j0;
4530 if (Src0KnownZeros[j0] || Src1KnownZeros[j1]) {
4534 auto Mul =
B.buildMul(
S32, Src0[j0], Src1[j1]);
4536 LocalAccum[0] =
Mul.getReg(0);
4538 if (CarryIn.empty()) {
4539 LocalAccum[0] =
B.buildAdd(
S32, LocalAccum[0],
Mul).getReg(0);
4542 B.buildUAdde(
S32,
S1, LocalAccum[0],
Mul, CarryIn.back())
4548 }
while (j0 <= DstIndex && (!UsePartialMad64_32 || !CarryIn.empty()));
4552 if (j0 <= DstIndex) {
4553 bool HaveSmallAccum =
false;
4556 if (LocalAccum[0]) {
4557 if (LocalAccum.size() == 1) {
4558 Tmp =
B.buildAnyExt(
S64, LocalAccum[0]).getReg(0);
4559 HaveSmallAccum =
true;
4560 }
else if (LocalAccum[1]) {
4561 Tmp =
B.buildMergeLikeInstr(
S64, LocalAccum).getReg(0);
4562 HaveSmallAccum =
false;
4564 Tmp =
B.buildZExt(
S64, LocalAccum[0]).getReg(0);
4565 HaveSmallAccum =
true;
4568 assert(LocalAccum.size() == 1 || !LocalAccum[1]);
4570 HaveSmallAccum =
true;
4574 unsigned j1 = DstIndex - j0;
4575 if (Src0KnownZeros[j0] || Src1KnownZeros[j1]) {
4579 auto Mad =
B.buildInstr(AMDGPU::G_AMDGPU_MAD_U64_U32, {
S64,
S1},
4580 {Src0[j0], Src1[j1], Tmp});
4581 Tmp = Mad.getReg(0);
4582 if (!HaveSmallAccum)
4583 CarryOut.push_back(Mad.getReg(1));
4584 HaveSmallAccum =
false;
4587 }
while (j0 <= DstIndex);
4589 auto Unmerge =
B.buildUnmerge(
S32, Tmp);
4590 LocalAccum[0] = Unmerge.getReg(0);
4591 if (LocalAccum.size() > 1)
4592 LocalAccum[1] = Unmerge.getReg(1);
4619 for (
unsigned i = 0; i <= Accum.
size() / 2; ++i) {
4620 Carry OddCarryIn = std::move(OddCarry);
4621 Carry EvenCarryIn = std::move(EvenCarry);
4626 if (2 * i < Accum.
size()) {
4627 auto LocalAccum = Accum.
drop_front(2 * i).take_front(2);
4628 EvenCarry = buildMadChain(LocalAccum, 2 * i, EvenCarryIn);
4633 if (!SeparateOddAlignedProducts) {
4634 auto LocalAccum = Accum.
drop_front(2 * i - 1).take_front(2);
4635 OddCarry = buildMadChain(LocalAccum, 2 * i - 1, OddCarryIn);
4637 bool IsHighest = 2 * i >= Accum.
size();
4640 .take_front(IsHighest ? 1 : 2);
4641 OddCarry = buildMadChain(LocalAccum, 2 * i - 1, OddCarryIn);
4647 Lo =
B.buildUAddo(
S32,
S1, Accum[2 * i - 1], SeparateOddOut[0]);
4649 Lo =
B.buildAdd(
S32, Accum[2 * i - 1], SeparateOddOut[0]);
4651 Lo =
B.buildUAdde(
S32,
S1, Accum[2 * i - 1], SeparateOddOut[0],
4654 Accum[2 * i - 1] =
Lo->getOperand(0).getReg();
4657 auto Hi =
B.buildUAdde(
S32,
S1, Accum[2 * i], SeparateOddOut[1],
4658 Lo->getOperand(1).getReg());
4659 Accum[2 * i] =
Hi.getReg(0);
4660 SeparateOddCarry =
Hi.getReg(1);
4667 if (
Register CarryOut = mergeCarry(Accum[2 * i - 1], OddCarryIn))
4668 EvenCarryIn.push_back(CarryOut);
4670 if (2 * i < Accum.
size()) {
4671 if (
Register CarryOut = mergeCarry(Accum[2 * i], EvenCarryIn))
4672 OddCarry.push_back(CarryOut);
4684 assert(ST.hasMad64_32());
4685 assert(
MI.getOpcode() == TargetOpcode::G_MUL);
4697 unsigned Size = Ty.getSizeInBits();
4698 if (ST.hasVMulU64Inst() &&
Size == 64)
4701 unsigned NumParts =
Size / 32;
4713 const bool SeparateOddAlignedProducts = ST.hasFullRate64Ops();
4717 for (
unsigned i = 0; i < NumParts; ++i) {
4721 B.buildUnmerge(Src0Parts, Src0);
4722 B.buildUnmerge(Src1Parts, Src1);
4725 buildMultiply(Helper, AccumRegs, Src0Parts, Src1Parts, UsePartialMad64_32,
4726 SeparateOddAlignedProducts);
4728 B.buildMergeLikeInstr(DstReg, AccumRegs);
4729 MI.eraseFromParent();
4744 unsigned NewOpc =
MI.getOpcode() == AMDGPU::G_CTLZ
4745 ? AMDGPU::G_AMDGPU_FFBH_U32
4746 : AMDGPU::G_AMDGPU_FFBL_B32;
4747 auto Tmp =
B.buildInstr(NewOpc, {DstTy}, {Src});
4750 MI.eraseFromParent();
4760 TypeSize NumBits = SrcTy.getSizeInBits();
4764 auto ShiftAmt =
B.buildConstant(
S32, 32u - NumBits);
4765 auto Extend =
B.buildAnyExt(
S32, {Src}).
getReg(0u);
4766 auto Shift =
B.buildShl(
S32, Extend, ShiftAmt);
4767 auto Ctlz =
B.buildInstr(AMDGPU::G_AMDGPU_FFBH_U32, {
S32}, {Shift});
4768 B.buildTrunc(Dst, Ctlz);
4769 MI.eraseFromParent();
4780 assert(SrcTy ==
S32 &&
"legalizeCTLS only supports s32");
4781 unsigned BitWidth = SrcTy.getSizeInBits();
4783 auto Sffbh =
B.buildIntrinsic(Intrinsic::amdgcn_sffbh, {
S32}).addUse(Src);
4785 B.buildSub(Dst, Clamped,
B.buildConstant(
S32, 1));
4786 MI.eraseFromParent();
4792 if (
MI.getOpcode() != TargetOpcode::G_XOR)
4795 return ConstVal == -1;
4802 Register CondDef =
MI.getOperand(0).getReg();
4821 if (
UseMI->getParent() != Parent ||
UseMI->getOpcode() != AMDGPU::G_BRCOND)
4830 UncondBrTarget = &*NextMBB;
4832 if (
Next->getOpcode() != AMDGPU::G_BR)
4851 *ArgRC,
B.getDebugLoc(), ArgTy);
4855 const unsigned Mask = Arg->
getMask();
4863 auto ShiftAmt =
B.buildConstant(
S32, Shift);
4864 AndMaskSrc =
B.buildLShr(
S32, LiveIn, ShiftAmt).getReg(0);
4867 B.buildAnd(DstReg, AndMaskSrc,
B.buildConstant(
S32, Mask >> Shift));
4869 B.buildCopy(DstReg, LiveIn);
4879 if (!ST.hasClusters()) {
4882 MI.eraseFromParent();
4902 auto One =
B.buildConstant(
S32, 1);
4903 auto ClusterSizeXYZ =
B.buildAdd(
S32, ClusterMaxIdXYZ, One);
4904 auto GlobalIdXYZ =
B.buildAdd(
S32, ClusterWorkGroupIdXYZ,
4905 B.buildMul(
S32, ClusterIdXYZ, ClusterSizeXYZ));
4912 B.buildCopy(DstReg, GlobalIdXYZ);
4913 MI.eraseFromParent();
4917 B.buildCopy(DstReg, ClusterIdXYZ);
4918 MI.eraseFromParent();
4923 unsigned ClusterIdField = HwregEncoding::encode(ID_IB_STS2, 6, 4);
4925 MRI.
setRegClass(ClusterId, &AMDGPU::SReg_32RegClass);
4926 B.buildInstr(AMDGPU::S_GETREG_B32_const)
4928 .addImm(ClusterIdField);
4929 auto Zero =
B.buildConstant(
S32, 0);
4932 B.buildSelect(DstReg, NoClusters, ClusterIdXYZ, GlobalIdXYZ);
4933 MI.eraseFromParent();
4975 auto LoadConstant = [&](
unsigned N) {
4976 B.buildConstant(DstReg,
N);
4980 if (ST.hasArchitectedSGPRs() &&
4987 Arg = &WorkGroupIDX;
4988 ArgRC = &AMDGPU::SReg_32RegClass;
4992 Arg = &WorkGroupIDY;
4993 ArgRC = &AMDGPU::SReg_32RegClass;
4997 Arg = &WorkGroupIDZ;
4998 ArgRC = &AMDGPU::SReg_32RegClass;
5002 if (HasFixedDims && ClusterDims.
getDims()[0] == 1)
5003 return LoadConstant(0);
5004 Arg = &ClusterWorkGroupIDX;
5005 ArgRC = &AMDGPU::SReg_32RegClass;
5009 if (HasFixedDims && ClusterDims.
getDims()[1] == 1)
5010 return LoadConstant(0);
5011 Arg = &ClusterWorkGroupIDY;
5012 ArgRC = &AMDGPU::SReg_32RegClass;
5016 if (HasFixedDims && ClusterDims.
getDims()[2] == 1)
5017 return LoadConstant(0);
5018 Arg = &ClusterWorkGroupIDZ;
5019 ArgRC = &AMDGPU::SReg_32RegClass;
5024 return LoadConstant(ClusterDims.
getDims()[0] - 1);
5025 Arg = &ClusterWorkGroupMaxIDX;
5026 ArgRC = &AMDGPU::SReg_32RegClass;
5031 return LoadConstant(ClusterDims.
getDims()[1] - 1);
5032 Arg = &ClusterWorkGroupMaxIDY;
5033 ArgRC = &AMDGPU::SReg_32RegClass;
5038 return LoadConstant(ClusterDims.
getDims()[2] - 1);
5039 Arg = &ClusterWorkGroupMaxIDZ;
5040 ArgRC = &AMDGPU::SReg_32RegClass;
5044 Arg = &ClusterWorkGroupMaxFlatID;
5045 ArgRC = &AMDGPU::SReg_32RegClass;
5060 return LoadConstant(0);
5065 B.buildUndef(DstReg);
5069 if (!Arg->isRegister() || !Arg->getRegister().isValid())
5081 MI.eraseFromParent();
5087 B.buildConstant(
MI.getOperand(0).getReg(),
C);
5088 MI.eraseFromParent();
5095 unsigned MaxID = ST.getMaxWorkitemID(
B.getMF().getFunction(), Dim);
5109 B.buildUndef(DstReg);
5110 MI.eraseFromParent();
5114 if (Arg->isMasked()) {
5128 MI.eraseFromParent();
5143 Register KernArgReg =
B.getMRI()->createGenericVirtualRegister(PtrTy);
5152 return B.buildObjectPtrOffset(PtrTy, KernArgReg, COffset).getReg(0);
5160 Align Alignment)
const {
5164 "unexpected kernarg parameter type");
5171 MI.eraseFromParent();
5206 auto FloatY =
B.buildUITOFP(
S32,
Y);
5207 auto RcpIFlag =
B.buildInstr(AMDGPU::G_AMDGPU_RCP_IFLAG, {
S32}, {FloatY});
5209 auto ScaledY =
B.buildFMul(
S32, RcpIFlag, Scale);
5210 auto Z =
B.buildFPTOUI(
S32, ScaledY);
5213 auto NegY =
B.buildSub(
S32,
B.buildConstant(
S32, 0),
Y);
5214 auto NegYZ =
B.buildMul(
S32, NegY, Z);
5215 Z =
B.buildAdd(
S32, Z,
B.buildUMulH(
S32, Z, NegYZ));
5218 auto Q =
B.buildUMulH(
S32,
X, Z);
5219 auto R =
B.buildSub(
S32,
X,
B.buildMul(
S32, Q,
Y));
5222 auto One =
B.buildConstant(
S32, 1);
5225 Q =
B.buildSelect(
S32,
Cond,
B.buildAdd(
S32, Q, One), Q);
5231 B.buildSelect(DstDivReg,
Cond,
B.buildAdd(
S32, Q, One), Q);
5234 B.buildSelect(DstRemReg,
Cond,
B.buildSub(
S32, R,
Y), R);
5253 auto Unmerge =
B.buildUnmerge(
S32, Val);
5255 auto CvtLo =
B.buildUITOFP(
S32, Unmerge.getReg(0));
5256 auto CvtHi =
B.buildUITOFP(
S32, Unmerge.getReg(1));
5258 auto Mad =
B.buildFMAD(
5262 auto Rcp =
B.buildInstr(AMDGPU::G_AMDGPU_RCP_IFLAG, {
S32}, {Mad});
5263 auto Mul1 =
B.buildFMul(
5267 auto Mul2 =
B.buildFMul(
5269 auto Trunc =
B.buildIntrinsicTrunc(
S32, Mul2);
5272 auto Mad2 =
B.buildFMAD(
5276 auto ResultLo =
B.buildFPTOUI(
S32, Mad2);
5277 auto ResultHi =
B.buildFPTOUI(
S32, Trunc);
5279 return {ResultLo.getReg(0), ResultHi.getReg(0)};
5294 auto Rcp =
B.buildMergeLikeInstr(
S64, {RcpLo, RcpHi});
5296 auto Zero64 =
B.buildConstant(
S64, 0);
5297 auto NegDenom =
B.buildSub(
S64, Zero64, Denom);
5299 auto MulLo1 =
B.buildMul(
S64, NegDenom, Rcp);
5300 auto MulHi1 =
B.buildUMulH(
S64, Rcp, MulLo1);
5302 auto UnmergeMulHi1 =
B.buildUnmerge(
S32, MulHi1);
5303 Register MulHi1_Lo = UnmergeMulHi1.getReg(0);
5304 Register MulHi1_Hi = UnmergeMulHi1.getReg(1);
5306 auto Add1_Lo =
B.buildUAddo(
S32,
S1, RcpLo, MulHi1_Lo);
5307 auto Add1_Hi =
B.buildUAdde(
S32,
S1, RcpHi, MulHi1_Hi, Add1_Lo.getReg(1));
5308 auto Add1 =
B.buildMergeLikeInstr(
S64, {Add1_Lo, Add1_Hi});
5310 auto MulLo2 =
B.buildMul(
S64, NegDenom, Add1);
5311 auto MulHi2 =
B.buildUMulH(
S64, Add1, MulLo2);
5312 auto UnmergeMulHi2 =
B.buildUnmerge(
S32, MulHi2);
5313 Register MulHi2_Lo = UnmergeMulHi2.getReg(0);
5314 Register MulHi2_Hi = UnmergeMulHi2.getReg(1);
5316 auto Zero32 =
B.buildConstant(
S32, 0);
5317 auto Add2_Lo =
B.buildUAddo(
S32,
S1, Add1_Lo, MulHi2_Lo);
5318 auto Add2_Hi =
B.buildUAdde(
S32,
S1, Add1_Hi, MulHi2_Hi, Add2_Lo.getReg(1));
5319 auto Add2 =
B.buildMergeLikeInstr(
S64, {Add2_Lo, Add2_Hi});
5321 auto UnmergeNumer =
B.buildUnmerge(
S32, Numer);
5322 Register NumerLo = UnmergeNumer.getReg(0);
5323 Register NumerHi = UnmergeNumer.getReg(1);
5325 auto MulHi3 =
B.buildUMulH(
S64, Numer, Add2);
5326 auto Mul3 =
B.buildMul(
S64, Denom, MulHi3);
5327 auto UnmergeMul3 =
B.buildUnmerge(
S32, Mul3);
5328 Register Mul3_Lo = UnmergeMul3.getReg(0);
5329 Register Mul3_Hi = UnmergeMul3.getReg(1);
5330 auto Sub1_Lo =
B.buildUSubo(
S32,
S1, NumerLo, Mul3_Lo);
5331 auto Sub1_Hi =
B.buildUSube(
S32,
S1, NumerHi, Mul3_Hi, Sub1_Lo.getReg(1));
5332 auto Sub1_Mi =
B.buildSub(
S32, NumerHi, Mul3_Hi);
5333 auto Sub1 =
B.buildMergeLikeInstr(
S64, {Sub1_Lo, Sub1_Hi});
5335 auto UnmergeDenom =
B.buildUnmerge(
S32, Denom);
5336 Register DenomLo = UnmergeDenom.getReg(0);
5337 Register DenomHi = UnmergeDenom.getReg(1);
5340 auto C1 =
B.buildSExt(
S32, CmpHi);
5343 auto C2 =
B.buildSExt(
S32, CmpLo);
5346 auto C3 =
B.buildSelect(
S32, CmpEq, C2, C1);
5353 auto Sub2_Lo =
B.buildUSubo(
S32,
S1, Sub1_Lo, DenomLo);
5354 auto Sub2_Mi =
B.buildUSube(
S32,
S1, Sub1_Mi, DenomHi, Sub1_Lo.getReg(1));
5355 auto Sub2_Hi =
B.buildUSube(
S32,
S1, Sub2_Mi, Zero32, Sub2_Lo.getReg(1));
5356 auto Sub2 =
B.buildMergeLikeInstr(
S64, {Sub2_Lo, Sub2_Hi});
5358 auto One64 =
B.buildConstant(
S64, 1);
5359 auto Add3 =
B.buildAdd(
S64, MulHi3, One64);
5365 auto C6 =
B.buildSelect(
5369 auto Add4 =
B.buildAdd(
S64, Add3, One64);
5370 auto Sub3_Lo =
B.buildUSubo(
S32,
S1, Sub2_Lo, DenomLo);
5372 auto Sub3_Mi =
B.buildUSube(
S32,
S1, Sub2_Mi, DenomHi, Sub2_Lo.getReg(1));
5373 auto Sub3_Hi =
B.buildUSube(
S32,
S1, Sub3_Mi, Zero32, Sub3_Lo.getReg(1));
5374 auto Sub3 =
B.buildMergeLikeInstr(
S64, {Sub3_Lo, Sub3_Hi});
5380 auto Sel1 =
B.buildSelect(
5387 auto Sel2 =
B.buildSelect(
5398 switch (
MI.getOpcode()) {
5401 case AMDGPU::G_UDIV: {
5402 DstDivReg =
MI.getOperand(0).getReg();
5405 case AMDGPU::G_UREM: {
5406 DstRemReg =
MI.getOperand(0).getReg();
5409 case AMDGPU::G_UDIVREM: {
5410 DstDivReg =
MI.getOperand(0).getReg();
5411 DstRemReg =
MI.getOperand(1).getReg();
5418 const unsigned FirstSrcOpIdx =
MI.getNumExplicitDefs();
5419 Register Num =
MI.getOperand(FirstSrcOpIdx).getReg();
5420 Register Den =
MI.getOperand(FirstSrcOpIdx + 1).getReg();
5430 MI.eraseFromParent();
5441 if (Ty !=
S32 && Ty !=
S64)
5444 const unsigned FirstSrcOpIdx =
MI.getNumExplicitDefs();
5445 Register LHS =
MI.getOperand(FirstSrcOpIdx).getReg();
5446 Register RHS =
MI.getOperand(FirstSrcOpIdx + 1).getReg();
5448 auto SignBitOffset =
B.buildConstant(
S32, Ty.getSizeInBits() - 1);
5449 auto LHSign =
B.buildAShr(Ty, LHS, SignBitOffset);
5450 auto RHSign =
B.buildAShr(Ty, RHS, SignBitOffset);
5452 LHS =
B.buildAdd(Ty, LHS, LHSign).getReg(0);
5453 RHS =
B.buildAdd(Ty, RHS, RHSign).getReg(0);
5455 LHS =
B.buildXor(Ty, LHS, LHSign).getReg(0);
5456 RHS =
B.buildXor(Ty, RHS, RHSign).getReg(0);
5458 Register DstDivReg, DstRemReg, TmpDivReg, TmpRemReg;
5459 switch (
MI.getOpcode()) {
5462 case AMDGPU::G_SDIV: {
5463 DstDivReg =
MI.getOperand(0).getReg();
5467 case AMDGPU::G_SREM: {
5468 DstRemReg =
MI.getOperand(0).getReg();
5472 case AMDGPU::G_SDIVREM: {
5473 DstDivReg =
MI.getOperand(0).getReg();
5474 DstRemReg =
MI.getOperand(1).getReg();
5487 auto Sign =
B.buildXor(Ty, LHSign, RHSign).getReg(0);
5488 auto SignXor =
B.buildXor(Ty, TmpDivReg, Sign).getReg(0);
5489 B.buildSub(DstDivReg, SignXor, Sign);
5493 auto Sign = LHSign.getReg(0);
5494 auto SignXor =
B.buildXor(Ty, TmpRemReg, Sign).getReg(0);
5495 B.buildSub(DstRemReg, SignXor, Sign);
5498 MI.eraseFromParent();
5514 if (!AllowInaccurateRcp && ResTy !=
LLT::scalar(16))
5525 if (CLHS->isOne()) {
5526 B.buildIntrinsic(Intrinsic::amdgcn_rcp, Res)
5530 MI.eraseFromParent();
5535 if (CLHS->isMinusOne()) {
5536 auto FNeg =
B.buildFNeg(ResTy, RHS, Flags);
5537 B.buildIntrinsic(Intrinsic::amdgcn_rcp, Res)
5538 .addUse(FNeg.getReg(0))
5541 MI.eraseFromParent();
5548 if (!AllowInaccurateRcp && (ResTy !=
LLT::scalar(16) ||
5553 auto RCP =
B.buildIntrinsic(Intrinsic::amdgcn_rcp, {ResTy})
5556 B.buildFMul(Res, LHS, RCP, Flags);
5558 MI.eraseFromParent();
5573 if (!AllowInaccurateRcp)
5581 X =
B.buildFConstant(ResTy, 1.0).getReg(0);
5583 Register NegY = IsNegRcp ?
Y :
B.buildFNeg(ResTy,
Y).getReg(0);
5584 auto One =
B.buildFConstant(ResTy, 1.0);
5586 auto R =
B.buildIntrinsic(Intrinsic::amdgcn_rcp, {ResTy})
5590 R =
B.buildFNeg(ResTy, R);
5592 auto Tmp0 =
B.buildFMA(ResTy, NegY, R, One);
5593 R =
B.buildFMA(ResTy, Tmp0, R, R);
5595 auto Tmp1 =
B.buildFMA(ResTy, NegY, R, One);
5596 R =
B.buildFMA(ResTy, Tmp1, R, R);
5599 if (IsNegRcp || (CLHS && CLHS->
isOne())) {
5600 B.buildCopy(Res, R);
5601 MI.eraseFromParent();
5605 auto Ret =
B.buildFMul(ResTy,
X, R);
5606 auto Tmp2 =
B.buildFMA(ResTy, NegY, Ret,
X);
5608 B.buildFMA(Res, Tmp2, R, Ret);
5609 MI.eraseFromParent();
5641 auto LHSExt =
B.buildFPExt(
S32, LHS, Flags);
5642 auto RHSExt =
B.buildFPExt(
S32, RHS, Flags);
5643 auto NegRHSExt =
B.buildFNeg(
S32, RHSExt);
5644 auto Rcp =
B.buildIntrinsic(Intrinsic::amdgcn_rcp, {
S32})
5645 .addUse(RHSExt.getReg(0))
5647 auto Quot =
B.buildFMul(
S32, LHSExt, Rcp, Flags);
5649 if (ST.hasMadMacF32Insts()) {
5650 Err =
B.buildFMAD(
S32, NegRHSExt, Quot, LHSExt, Flags);
5651 Quot =
B.buildFMAD(
S32, Err, Rcp, Quot, Flags);
5652 Err =
B.buildFMAD(
S32, NegRHSExt, Quot, LHSExt, Flags);
5654 Err =
B.buildFMA(
S32, NegRHSExt, Quot, LHSExt, Flags);
5655 Quot =
B.buildFMA(
S32, Err, Rcp, Quot, Flags);
5656 Err =
B.buildFMA(
S32, NegRHSExt, Quot, LHSExt, Flags);
5658 auto Tmp =
B.buildFMul(
S32, Err, Rcp, Flags);
5659 Tmp =
B.buildAnd(
S32, Tmp,
B.buildConstant(
S32, 0xff800000));
5660 Quot =
B.buildFAdd(
S32, Tmp, Quot, Flags);
5661 auto RDst =
B.buildFPTrunc(
S16, Quot, Flags);
5662 B.buildIntrinsic(Intrinsic::amdgcn_div_fixup, Res)
5663 .addUse(RDst.getReg(0))
5668 MI.eraseFromParent();
5681 unsigned SPDenormMode =
5684 if (ST.hasDenormModeInst()) {
5686 uint32_t DPDenormModeDefault =
Mode.fpDenormModeDPValue();
5688 uint32_t NewDenormModeValue = SPDenormMode | (DPDenormModeDefault << 2);
5689 B.buildInstr(AMDGPU::S_DENORM_MODE)
5690 .addImm(NewDenormModeValue);
5693 B.buildInstr(AMDGPU::S_SETREG_IMM32_B32)
5694 .addImm(SPDenormMode)
5716 auto One =
B.buildFConstant(
S32, 1.0f);
5718 auto DenominatorScaled =
5719 B.buildIntrinsic(Intrinsic::amdgcn_div_scale, {
S32,
S1})
5724 auto NumeratorScaled =
5725 B.buildIntrinsic(Intrinsic::amdgcn_div_scale, {
S32,
S1})
5731 auto ApproxRcp =
B.buildIntrinsic(Intrinsic::amdgcn_rcp, {
S32})
5732 .addUse(DenominatorScaled.getReg(0))
5734 auto NegDivScale0 =
B.buildFNeg(
S32, DenominatorScaled, Flags);
5737 const bool HasDynamicDenormals =
5742 if (!PreservesDenormals) {
5743 if (HasDynamicDenormals) {
5745 B.buildInstr(AMDGPU::S_GETREG_B32)
5746 .addDef(SavedSPDenormMode)
5752 auto Fma0 =
B.buildFMA(
S32, NegDivScale0, ApproxRcp, One, Flags);
5753 auto Fma1 =
B.buildFMA(
S32, Fma0, ApproxRcp, ApproxRcp, Flags);
5754 auto Mul =
B.buildFMul(
S32, NumeratorScaled, Fma1, Flags);
5755 auto Fma2 =
B.buildFMA(
S32, NegDivScale0,
Mul, NumeratorScaled, Flags);
5756 auto Fma3 =
B.buildFMA(
S32, Fma2, Fma1,
Mul, Flags);
5757 auto Fma4 =
B.buildFMA(
S32, NegDivScale0, Fma3, NumeratorScaled, Flags);
5759 if (!PreservesDenormals) {
5760 if (HasDynamicDenormals) {
5761 assert(SavedSPDenormMode);
5762 B.buildInstr(AMDGPU::S_SETREG_B32)
5763 .addReg(SavedSPDenormMode)
5769 auto Fmas =
B.buildIntrinsic(Intrinsic::amdgcn_div_fmas, {
S32})
5770 .addUse(Fma4.getReg(0))
5771 .addUse(Fma1.getReg(0))
5772 .addUse(Fma3.getReg(0))
5773 .addUse(NumeratorScaled.getReg(1))
5776 B.buildIntrinsic(Intrinsic::amdgcn_div_fixup, Res)
5777 .addUse(Fmas.getReg(0))
5782 MI.eraseFromParent();
5801 auto One =
B.buildFConstant(
S64, 1.0);
5803 auto DivScale0 =
B.buildIntrinsic(Intrinsic::amdgcn_div_scale, {
S64,
S1})
5809 auto NegDivScale0 =
B.buildFNeg(
S64, DivScale0.getReg(0), Flags);
5811 auto Rcp =
B.buildIntrinsic(Intrinsic::amdgcn_rcp, {
S64})
5812 .addUse(DivScale0.getReg(0))
5815 auto Fma0 =
B.buildFMA(
S64, NegDivScale0, Rcp, One, Flags);
5816 auto Fma1 =
B.buildFMA(
S64, Rcp, Fma0, Rcp, Flags);
5817 auto Fma2 =
B.buildFMA(
S64, NegDivScale0, Fma1, One, Flags);
5819 auto DivScale1 =
B.buildIntrinsic(Intrinsic::amdgcn_div_scale, {
S64,
S1})
5825 auto Fma3 =
B.buildFMA(
S64, Fma1, Fma2, Fma1, Flags);
5826 auto Mul =
B.buildFMul(
S64, DivScale1.getReg(0), Fma3, Flags);
5827 auto Fma4 =
B.buildFMA(
S64, NegDivScale0,
Mul, DivScale1.getReg(0), Flags);
5830 if (!ST.hasUsableDivScaleConditionOutput()) {
5836 auto NumUnmerge =
B.buildUnmerge(
S32, LHS);
5837 auto DenUnmerge =
B.buildUnmerge(
S32, RHS);
5838 auto Scale0Unmerge =
B.buildUnmerge(
S32, DivScale0);
5839 auto Scale1Unmerge =
B.buildUnmerge(
S32, DivScale1);
5842 Scale1Unmerge.getReg(1));
5844 Scale0Unmerge.getReg(1));
5845 Scale =
B.buildXor(
S1, CmpNum, CmpDen).getReg(0);
5847 Scale = DivScale1.getReg(1);
5850 auto Fmas =
B.buildIntrinsic(Intrinsic::amdgcn_div_fmas, {
S64})
5851 .addUse(Fma4.getReg(0))
5852 .addUse(Fma3.getReg(0))
5853 .addUse(
Mul.getReg(0))
5857 B.buildIntrinsic(Intrinsic::amdgcn_div_fixup,
ArrayRef(Res))
5858 .addUse(Fmas.getReg(0))
5863 MI.eraseFromParent();
5878 auto Mant =
B.buildIntrinsic(Intrinsic::amdgcn_frexp_mant, {Ty})
5881 auto Exp =
B.buildIntrinsic(Intrinsic::amdgcn_frexp_exp, {InstrExpTy})
5885 if (ST.hasFractBug()) {
5886 auto Fabs =
B.buildFAbs(Ty, Val);
5890 auto Zero =
B.buildConstant(InstrExpTy, 0);
5891 Exp =
B.buildSelect(InstrExpTy, IsFinite, Exp, Zero);
5892 Mant =
B.buildSelect(Ty, IsFinite, Mant, Val);
5895 B.buildCopy(Res0, Mant);
5896 B.buildSExtOrTrunc(Res1, Exp);
5898 MI.eraseFromParent();
5913 auto Abs =
B.buildFAbs(
S32, RHS, Flags);
5916 auto C0 =
B.buildFConstant(
S32, 0x1p+96f);
5917 auto C1 =
B.buildFConstant(
S32, 0x1p-32f);
5918 auto C2 =
B.buildFConstant(
S32, 1.0f);
5921 auto Sel =
B.buildSelect(
S32, CmpRes, C1, C2, Flags);
5923 auto Mul0 =
B.buildFMul(
S32, RHS, Sel, Flags);
5925 auto RCP =
B.buildIntrinsic(Intrinsic::amdgcn_rcp, {
S32})
5926 .addUse(Mul0.getReg(0))
5929 auto Mul1 =
B.buildFMul(
S32, LHS, RCP, Flags);
5931 B.buildFMul(Res, Sel, Mul1, Flags);
5933 MI.eraseFromParent();
5942 unsigned Flags =
MI.getFlags();
5943 assert(!ST.has16BitInsts());
5945 auto Ext =
B.buildFPExt(
F32,
MI.getOperand(1), Flags);
5946 auto Log2 =
B.buildIntrinsic(Intrinsic::amdgcn_sqrt, {
F32})
5947 .addUse(Ext.getReg(0))
5949 B.buildFPTrunc(
MI.getOperand(0),
Log2, Flags);
5950 MI.eraseFromParent();
5960 const unsigned Flags =
MI.getFlags();
5969 MI.eraseFromParent();
5973 auto ScaleThreshold =
B.buildFConstant(
F32, 0x1.0p-96f);
5975 auto ScaleUpFactor =
B.buildFConstant(
F32, 0x1.0p+32f);
5976 auto ScaledX =
B.buildFMul(
F32,
X, ScaleUpFactor, Flags);
5977 auto SqrtX =
B.buildSelect(
F32, NeedScale, ScaledX,
X, Flags);
5982 .addUse(SqrtX.getReg(0))
5985 auto NegOne =
B.buildConstant(I32, -1);
5986 auto SqrtSNextDown =
B.buildAdd(I32, SqrtS, NegOne);
5988 auto NegSqrtSNextDown =
B.buildFNeg(
F32, SqrtSNextDown, Flags);
5989 auto SqrtVP =
B.buildFMA(
F32, NegSqrtSNextDown, SqrtS, SqrtX, Flags);
5991 auto PosOne =
B.buildConstant(I32, 1);
5992 auto SqrtSNextUp =
B.buildAdd(I32, SqrtS, PosOne);
5994 auto NegSqrtSNextUp =
B.buildFNeg(
F32, SqrtSNextUp, Flags);
5995 auto SqrtVS =
B.buildFMA(
F32, NegSqrtSNextUp, SqrtS, SqrtX, Flags);
5997 auto Zero =
B.buildFConstant(
F32, 0.0f);
6001 B.buildSelect(
F32, SqrtVPLE0, SqrtSNextDown, SqrtS, Flags).getReg(0);
6005 B.buildSelect(
F32, SqrtVPVSGT0, SqrtSNextUp, SqrtS, Flags).getReg(0);
6008 B.buildIntrinsic(Intrinsic::amdgcn_rsq, {
F32}).addReg(SqrtX.getReg(0));
6009 B.buildFMul(SqrtS, SqrtX, SqrtR, Flags);
6011 auto Half =
B.buildFConstant(
F32, 0.5f);
6012 auto SqrtH =
B.buildFMul(
F32, SqrtR, Half, Flags);
6013 auto NegSqrtH =
B.buildFNeg(
F32, SqrtH, Flags);
6014 auto SqrtE =
B.buildFMA(
F32, NegSqrtH, SqrtS, Half, Flags);
6015 SqrtH =
B.buildFMA(
F32, SqrtH, SqrtE, SqrtH, Flags);
6016 SqrtS =
B.buildFMA(
F32, SqrtS, SqrtE, SqrtS, Flags).getReg(0);
6017 auto NegSqrtS =
B.buildFNeg(
F32, SqrtS, Flags);
6018 auto SqrtD =
B.buildFMA(
F32, NegSqrtS, SqrtS, SqrtX, Flags);
6019 SqrtS =
B.buildFMA(
F32, SqrtD, SqrtH, SqrtS, Flags).getReg(0);
6022 auto ScaleDownFactor =
B.buildFConstant(
F32, 0x1.0p-16f);
6024 auto ScaledDown =
B.buildFMul(
F32, SqrtS, ScaleDownFactor, Flags);
6026 SqrtS =
B.buildSelect(
F32, NeedScale, ScaledDown, SqrtS, Flags).getReg(0);
6029 B.buildSelect(Dst, IsZeroOrInf, SqrtX, SqrtS, Flags);
6031 MI.eraseFromParent();
6066 unsigned Flags =
MI.getFlags();
6071 auto ScaleConstant =
B.buildFConstant(
F64, 0x1.0p-767);
6073 ZeroInt =
B.buildConstant(
S32, 0).getReg(0);
6077 auto ScaleUpFactor =
B.buildConstant(
S32, 256);
6078 auto ScaleUp =
B.buildSelect(
S32, Scaling, ScaleUpFactor, ZeroInt);
6079 SqrtX =
B.buildFLdexp(
F64,
X, ScaleUp, Flags).getReg(0);
6082 auto SqrtY =
B.buildIntrinsic(Intrinsic::amdgcn_rsq, {
F64}).addReg(SqrtX);
6084 auto Half =
B.buildFConstant(
F64, 0.5);
6085 auto SqrtH0 =
B.buildFMul(
F64, SqrtY, Half);
6086 auto SqrtS0 =
B.buildFMul(
F64, SqrtX, SqrtY);
6088 auto NegSqrtH0 =
B.buildFNeg(
F64, SqrtH0);
6089 auto SqrtR0 =
B.buildFMA(
F64, NegSqrtH0, SqrtS0, Half);
6091 auto SqrtS1 =
B.buildFMA(
F64, SqrtS0, SqrtR0, SqrtS0);
6092 auto SqrtH1 =
B.buildFMA(
F64, SqrtH0, SqrtR0, SqrtH0);
6094 auto NegSqrtS1 =
B.buildFNeg(
F64, SqrtS1);
6095 auto SqrtD0 =
B.buildFMA(
F64, NegSqrtS1, SqrtS1, SqrtX);
6097 auto SqrtS2 =
B.buildFMA(
F64, SqrtD0, SqrtH1, SqrtS1);
6099 Register SqrtRet = SqrtS2.getReg(0);
6101 auto NegSqrtS2 =
B.buildFNeg(
F64, SqrtS2);
6102 auto SqrtD1 =
B.buildFMA(
F64, NegSqrtS2, SqrtS2, SqrtX);
6103 auto SqrtD2 =
B.buildFMA(
F64, SqrtD1, SqrtH1, SqrtS2);
6106 auto ScaleDownFactor =
B.buildConstant(
S32, -128);
6107 auto ScaleDown =
B.buildSelect(
S32, Scaling, ScaleDownFactor, ZeroInt);
6108 SqrtRet =
B.buildFLdexp(
F64, SqrtD2, ScaleDown, Flags).getReg(0);
6113 auto ZeroFP =
B.buildFConstant(
F64, 0.0);
6122 B.buildSelect(Dst, IsZeroOrInf, SqrtX, SqrtRet, Flags);
6124 MI.eraseFromParent();
6155 auto Flags =
MI.getFlags();
6167 auto Rsq =
B.buildIntrinsic(Intrinsic::amdgcn_rsq, {Ty})
6177 auto ClampMax = UseIEEE ?
B.buildFMinNumIEEE(Ty, Rsq, MaxFlt, Flags) :
6178 B.buildFMinNum(Ty, Rsq, MaxFlt, Flags);
6183 B.buildFMaxNumIEEE(Dst, ClampMax, MinFlt, Flags);
6185 B.buildFMaxNum(Dst, ClampMax, MinFlt, Flags);
6186 MI.eraseFromParent();
6198 bool IsPermLane16 = IID == Intrinsic::amdgcn_permlane16 ||
6199 IID == Intrinsic::amdgcn_permlanex16;
6200 bool IsSetInactive = IID == Intrinsic::amdgcn_set_inactive ||
6201 IID == Intrinsic::amdgcn_set_inactive_chain_arg;
6202 bool IsPermlaneShuffle = IID == Intrinsic::amdgcn_permlane_bcast ||
6203 IID == Intrinsic::amdgcn_permlane_up ||
6204 IID == Intrinsic::amdgcn_permlane_down ||
6205 IID == Intrinsic::amdgcn_permlane_xor;
6209 auto LaneOp =
B.buildIntrinsic(IID, {VT}).addUse(Src0);
6211 case Intrinsic::amdgcn_readfirstlane:
6212 case Intrinsic::amdgcn_permlane64:
6213 return LaneOp.getReg(0);
6214 case Intrinsic::amdgcn_readlane:
6215 case Intrinsic::amdgcn_set_inactive:
6216 case Intrinsic::amdgcn_set_inactive_chain_arg:
6217 return LaneOp.addUse(Src1).getReg(0);
6218 case Intrinsic::amdgcn_writelane:
6219 case Intrinsic::amdgcn_permlane_bcast:
6220 case Intrinsic::amdgcn_permlane_up:
6221 case Intrinsic::amdgcn_permlane_down:
6222 case Intrinsic::amdgcn_permlane_xor:
6223 return LaneOp.addUse(Src1).addUse(Src2).getReg(0);
6224 case Intrinsic::amdgcn_permlane16:
6225 case Intrinsic::amdgcn_permlanex16: {
6227 int64_t Src4 =
MI.getOperand(6).getImm();
6228 int64_t Src5 =
MI.getOperand(7).getImm();
6229 return LaneOp.addUse(Src1)
6236 case Intrinsic::amdgcn_mov_dpp8:
6237 return LaneOp.addImm(
MI.getOperand(3).getImm()).getReg(0);
6238 case Intrinsic::amdgcn_update_dpp:
6239 return LaneOp.addUse(Src1)
6240 .addImm(
MI.getOperand(4).getImm())
6241 .addImm(
MI.getOperand(5).getImm())
6242 .addImm(
MI.getOperand(6).getImm())
6243 .addImm(
MI.getOperand(7).getImm())
6253 if (IID == Intrinsic::amdgcn_readlane || IID == Intrinsic::amdgcn_writelane ||
6254 IID == Intrinsic::amdgcn_update_dpp || IsSetInactive || IsPermLane16 ||
6255 IsPermlaneShuffle) {
6256 Src1 =
MI.getOperand(3).getReg();
6257 if (IID == Intrinsic::amdgcn_writelane || IsPermLane16 ||
6258 IsPermlaneShuffle) {
6259 Src2 =
MI.getOperand(4).getReg();
6264 unsigned Size = Ty.getSizeInBits();
6266 unsigned SplitSize = 32;
6267 if (IID == Intrinsic::amdgcn_update_dpp && (
Size % 64 == 0) &&
6268 ST.hasDPALU_DPP() &&
6272 if (
Size == SplitSize) {
6278 Src0 =
B.buildAnyExt(
S32, Src0).getReg(0);
6280 if (IID == Intrinsic::amdgcn_update_dpp || IsSetInactive || IsPermLane16)
6283 if (IID == Intrinsic::amdgcn_writelane)
6286 Register LaneOpDst = createLaneOp(Src0, Src1, Src2,
S32);
6287 B.buildTrunc(DstReg, LaneOpDst);
6288 MI.eraseFromParent();
6292 if (
Size % SplitSize != 0)
6296 bool NeedsBitcast =
false;
6297 if (Ty.isVector()) {
6300 if (EltSize == SplitSize) {
6301 PartialResTy = EltTy;
6302 }
else if (EltSize == 16 || EltSize == 32) {
6303 unsigned NElem = SplitSize / EltSize;
6307 NeedsBitcast =
true;
6312 unsigned NumParts =
Size / SplitSize;
6316 if (IID == Intrinsic::amdgcn_update_dpp || IsSetInactive || IsPermLane16)
6317 Src1Parts =
B.buildUnmerge(PartialResTy, Src1);
6319 if (IID == Intrinsic::amdgcn_writelane)
6320 Src2Parts =
B.buildUnmerge(PartialResTy, Src2);
6322 for (
unsigned i = 0; i < NumParts; ++i) {
6323 Src0 = Src0Parts.
getReg(i);
6325 if (IID == Intrinsic::amdgcn_update_dpp || IsSetInactive || IsPermLane16)
6326 Src1 = Src1Parts.
getReg(i);
6328 if (IID == Intrinsic::amdgcn_writelane)
6329 Src2 = Src2Parts.
getReg(i);
6331 PartialRes.
push_back(createLaneOp(Src0, Src1, Src2, PartialResTy));
6335 B.buildBitcast(DstReg,
B.buildMergeLikeInstr(
6338 B.buildMergeLikeInstr(DstReg, PartialRes);
6340 MI.eraseFromParent();
6348 ST.getTargetLowering()->getImplicitParameterOffset(
6358 B.buildObjectPtrOffset(DstReg, KernargPtrReg,
6359 B.buildConstant(IdxTy,
Offset).getReg(0));
6370 Register Pointer =
MI.getOperand(2).getReg();
6372 Register NumRecords =
MI.getOperand(4).getReg();
6378 B.setInsertPt(
B.getMBB(), ++
B.getInsertPt());
6380 auto ExtStride =
B.buildAnyExt(
S32, Stride);
6382 if (ST.has45BitNumRecordsBufferResource()) {
6387 auto PointerInt =
B.buildPtrToInt(PtrIntTy, Pointer);
6388 auto ExtPointer =
B.buildAnyExtOrTrunc(
S64, PointerInt);
6389 auto NumRecordsLHS =
B.buildShl(
S64, NumRecords,
B.buildConstant(
S32, 57));
6390 Register LowHalf =
B.buildOr(
S64, ExtPointer, NumRecordsLHS).getReg(0);
6394 auto NumRecordsRHS =
B.buildLShr(
S64, NumRecords,
B.buildConstant(
S32, 7));
6395 auto ShiftedStride =
B.buildShl(
S32, ExtStride,
B.buildConstant(
S32, 12));
6396 auto ExtShiftedStride =
6397 B.buildMergeValues(
S64, {Zero, ShiftedStride.getReg(0)});
6398 auto ShiftedFlags =
B.buildShl(
S32, Flags,
B.buildConstant(
S32, 28));
6399 auto ExtShiftedFlags =
6400 B.buildMergeValues(
S64, {Zero, ShiftedFlags.getReg(0)});
6401 auto CombinedFields =
B.buildOr(
S64, NumRecordsRHS, ExtShiftedStride);
6403 B.buildOr(
S64, CombinedFields, ExtShiftedFlags).getReg(0);
6404 B.buildMergeValues(Result, {LowHalf, HighHalf});
6406 NumRecords =
B.buildTrunc(
S32, NumRecords).getReg(0);
6407 auto Unmerge =
B.buildUnmerge(
S32, Pointer);
6408 auto LowHalf = Unmerge.getReg(0);
6409 auto HighHalf = Unmerge.getReg(1);
6411 auto AndMask =
B.buildConstant(
S32, 0x0000ffff);
6412 auto Masked =
B.buildAnd(
S32, HighHalf, AndMask);
6413 auto ShiftConst =
B.buildConstant(
S32, 16);
6414 auto ShiftedStride =
B.buildShl(
S32, ExtStride, ShiftConst);
6415 auto NewHighHalf =
B.buildOr(
S32,
Masked, ShiftedStride);
6416 Register NewHighHalfReg = NewHighHalf.getReg(0);
6417 B.buildMergeValues(Result, {LowHalf, NewHighHalfReg, NumRecords, Flags});
6420 MI.eraseFromParent();
6437 MI.eraseFromParent();
6445 std::optional<uint32_t> KnownSize =
6447 if (KnownSize.has_value())
6448 B.buildConstant(DstReg, *KnownSize);
6466 MI.eraseFromParent();
6473 unsigned AddrSpace)
const {
6475 auto Unmerge =
B.buildUnmerge(
S32,
MI.getOperand(2).getReg());
6479 ST.hasGloballyAddressableScratch()) {
6481 B.buildInstr(AMDGPU::S_MOV_B32, {
S32},
6482 {
Register(AMDGPU::SRC_FLAT_SCRATCH_BASE_HI)})
6484 MRI.
setRegClass(FlatScratchBaseHi, &AMDGPU::SReg_32RegClass);
6486 Register XOR =
B.buildXor(
S32, Hi32, FlatScratchBaseHi).getReg(0);
6488 B.buildConstant(
S32, 1u << 26));
6493 MI.eraseFromParent();
6503std::pair<Register, unsigned>
6515 bool CheckNUW = ST.hasGFX1250Insts();
6517 MRI, OrigOffset,
nullptr, CheckNUW);
6521 BaseReg =
B.buildPtrToInt(MRI.
getType(OrigOffset), BaseReg).getReg(0);
6531 unsigned Overflow = ImmOffset & ~MaxImm;
6532 ImmOffset -= Overflow;
6533 if ((int32_t)Overflow < 0) {
6534 Overflow += ImmOffset;
6538 if (Overflow != 0) {
6540 BaseReg =
B.buildConstant(
S32, Overflow).getReg(0);
6542 auto OverflowVal =
B.buildConstant(
S32, Overflow);
6543 BaseReg =
B.buildAdd(
S32, BaseReg, OverflowVal).getReg(0);
6548 BaseReg =
B.buildConstant(
S32, 0).getReg(0);
6550 return std::pair(BaseReg, ImmOffset);
6557 bool ImageStore)
const {
6563 if (ST.hasUnpackedD16VMem()) {
6564 auto Unmerge =
B.buildUnmerge(
S16, Reg);
6567 for (
int I = 0, E = Unmerge->getNumOperands() - 1;
I != E; ++
I)
6568 WideRegs.
push_back(
B.buildAnyExt(
S32, Unmerge.getReg(
I)).getReg(0));
6576 if (ImageStore && ST.hasImageStoreD16Bug()) {
6579 Reg =
B.buildBitcast(
S32, Reg).getReg(0);
6581 PackedRegs.
resize(2,
B.buildUndef(
S32).getReg(0));
6588 auto Unmerge =
B.buildUnmerge(
S16, Reg);
6589 for (
int I = 0, E = Unmerge->getNumOperands() - 1;
I != E; ++
I)
6591 PackedRegs.
resize(6,
B.buildUndef(
S16).getReg(0));
6599 auto Unmerge =
B.buildUnmerge(
S32, Reg);
6600 for (
int I = 0, E = Unmerge->getNumOperands() - 1;
I != E; ++
I)
6602 PackedRegs.
resize(4,
B.buildUndef(
S32).getReg(0));
6619 bool IsFormat)
const {
6631 VData =
B.buildBitcast(Ty, VData).getReg(0);
6639 if (Ty.isVector()) {
6640 if (Ty.getElementType() ==
S16 && Ty.getNumElements() <= 4) {
6652 bool IsFormat)
const {
6659 const bool IsD16 = IsFormat && (EltTy.
getSizeInBits() == 16);
6674 const unsigned NumVIndexOps = IsTyped ? 8 : 7;
6677 const bool HasVIndex =
MI.getNumOperands() == NumVIndexOps;
6681 VIndex =
MI.getOperand(3).getReg();
6684 VIndex =
B.buildConstant(
S32, 0).getReg(0);
6687 Register VOffset =
MI.getOperand(3 + OpOffset).getReg();
6688 Register SOffset =
MI.getOperand(4 + OpOffset).getReg();
6692 Format =
MI.getOperand(5 + OpOffset).getImm();
6696 unsigned AuxiliaryData =
MI.getOperand(5 + OpOffset).getImm();
6702 Opc = IsD16 ? AMDGPU::G_AMDGPU_TBUFFER_STORE_FORMAT_D16 :
6703 AMDGPU::G_AMDGPU_TBUFFER_STORE_FORMAT;
6704 }
else if (IsFormat) {
6705 Opc = IsD16 ? AMDGPU::G_AMDGPU_BUFFER_STORE_FORMAT_D16 :
6706 AMDGPU::G_AMDGPU_BUFFER_STORE_FORMAT;
6710 Opc = AMDGPU::G_AMDGPU_BUFFER_STORE_BYTE;
6713 Opc = AMDGPU::G_AMDGPU_BUFFER_STORE_SHORT;
6716 Opc = AMDGPU::G_AMDGPU_BUFFER_STORE;
6721 auto MIB =
B.buildInstr(
Opc)
6732 MIB.addImm(AuxiliaryData)
6733 .addImm(HasVIndex ? -1 : 0)
6734 .addMemOperand(MMO);
6736 MI.eraseFromParent();
6742 unsigned ImmOffset,
unsigned Format,
6745 auto MIB =
B.buildInstr(
Opc)
6756 MIB.addImm(AuxiliaryData)
6757 .addImm(HasVIndex ? -1 : 0)
6758 .addMemOperand(MMO);
6764 bool IsTyped)
const {
6778 assert(
MI.getNumExplicitDefs() == 1 ||
MI.getNumExplicitDefs() == 2);
6779 bool IsTFE =
MI.getNumExplicitDefs() == 2;
6781 StatusDst =
MI.getOperand(1).getReg();
6786 Register RSrc =
MI.getOperand(2 + OpOffset).getReg();
6789 const unsigned NumVIndexOps = IsTyped ? 8 : 7;
6792 const bool HasVIndex =
MI.getNumOperands() == NumVIndexOps + OpOffset;
6795 VIndex =
MI.getOperand(3 + OpOffset).getReg();
6798 VIndex =
B.buildConstant(
S32, 0).getReg(0);
6801 Register VOffset =
MI.getOperand(3 + OpOffset).getReg();
6802 Register SOffset =
MI.getOperand(4 + OpOffset).getReg();
6806 Format =
MI.getOperand(5 + OpOffset).getImm();
6810 unsigned AuxiliaryData =
MI.getOperand(5 + OpOffset).getImm();
6820 Dst =
MI.getOperand(0).getReg();
6821 B.setInsertPt(
B.getMBB(),
MI);
6828 Dst =
MI.getOperand(0).getReg();
6829 B.setInsertPt(
B.getMBB(),
MI);
6833 const bool IsD16 = IsFormat && (EltTy.
getSizeInBits() == 16);
6834 const bool Unpacked = ST.hasUnpackedD16VMem();
6844 Opc = IsD16 ? AMDGPU::G_AMDGPU_TBUFFER_LOAD_FORMAT_D16 :
6845 AMDGPU::G_AMDGPU_TBUFFER_LOAD_FORMAT;
6846 }
else if (IsFormat) {
6850 Opc = AMDGPU::G_AMDGPU_BUFFER_LOAD_FORMAT_D16;
6852 Opc = IsTFE ? AMDGPU::G_AMDGPU_BUFFER_LOAD_FORMAT_TFE
6853 : AMDGPU::G_AMDGPU_BUFFER_LOAD_FORMAT;
6858 Opc = IsTFE ? AMDGPU::G_AMDGPU_BUFFER_LOAD_UBYTE_TFE
6859 : AMDGPU::G_AMDGPU_BUFFER_LOAD_UBYTE;
6862 Opc = IsTFE ? AMDGPU::G_AMDGPU_BUFFER_LOAD_USHORT_TFE
6863 : AMDGPU::G_AMDGPU_BUFFER_LOAD_USHORT;
6866 Opc = IsTFE ? AMDGPU::G_AMDGPU_BUFFER_LOAD_TFE
6867 : AMDGPU::G_AMDGPU_BUFFER_LOAD;
6873 unsigned NumValueDWords =
divideCeil(Ty.getSizeInBits(), 32);
6874 unsigned NumLoadDWords = NumValueDWords + 1;
6876 Register LoadDstReg =
B.getMRI()->createGenericVirtualRegister(LoadTy);
6878 Format, AuxiliaryData, MMO, IsTyped, HasVIndex,
B);
6880 Register ExtDst =
B.getMRI()->createGenericVirtualRegister(
S32);
6881 B.buildUnmerge({ExtDst, StatusDst}, LoadDstReg);
6882 B.buildTrunc(Dst, ExtDst);
6883 }
else if (NumValueDWords == 1) {
6884 B.buildUnmerge({Dst, StatusDst}, LoadDstReg);
6887 for (
unsigned I = 0;
I != NumValueDWords; ++
I)
6888 LoadElts.
push_back(
B.getMRI()->createGenericVirtualRegister(
S32));
6890 B.buildUnmerge(LoadElts, LoadDstReg);
6892 B.buildMergeLikeInstr(Dst, LoadElts);
6895 (IsD16 && !Ty.isVector())) {
6896 Register LoadDstReg =
B.getMRI()->createGenericVirtualRegister(
S32);
6898 Format, AuxiliaryData, MMO, IsTyped, HasVIndex,
B);
6899 B.setInsertPt(
B.getMBB(), ++
B.getInsertPt());
6900 B.buildTrunc(Dst, LoadDstReg);
6901 }
else if (Unpacked && IsD16 && Ty.isVector()) {
6903 Register LoadDstReg =
B.getMRI()->createGenericVirtualRegister(UnpackedTy);
6905 Format, AuxiliaryData, MMO, IsTyped, HasVIndex,
B);
6906 B.setInsertPt(
B.getMBB(), ++
B.getInsertPt());
6908 auto Unmerge =
B.buildUnmerge(
S32, LoadDstReg);
6910 for (
unsigned I = 0,
N = Unmerge->getNumOperands() - 1;
I !=
N; ++
I)
6911 Repack.
push_back(
B.buildTrunc(EltTy, Unmerge.getReg(
I)).getReg(0));
6912 B.buildMergeLikeInstr(Dst, Repack);
6915 AuxiliaryData, MMO, IsTyped, HasVIndex,
B);
6918 MI.eraseFromParent();
6924 case Intrinsic::amdgcn_raw_buffer_atomic_swap:
6925 case Intrinsic::amdgcn_raw_ptr_buffer_atomic_swap:
6926 case Intrinsic::amdgcn_struct_buffer_atomic_swap:
6927 case Intrinsic::amdgcn_struct_ptr_buffer_atomic_swap:
6928 return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_SWAP;
6929 case Intrinsic::amdgcn_raw_buffer_atomic_add:
6930 case Intrinsic::amdgcn_raw_ptr_buffer_atomic_add:
6931 case Intrinsic::amdgcn_struct_buffer_atomic_add:
6932 case Intrinsic::amdgcn_struct_ptr_buffer_atomic_add:
6933 return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_ADD;
6934 case Intrinsic::amdgcn_raw_buffer_atomic_sub:
6935 case Intrinsic::amdgcn_raw_ptr_buffer_atomic_sub:
6936 case Intrinsic::amdgcn_struct_buffer_atomic_sub:
6937 case Intrinsic::amdgcn_struct_ptr_buffer_atomic_sub:
6938 return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_SUB;
6939 case Intrinsic::amdgcn_raw_buffer_atomic_smin:
6940 case Intrinsic::amdgcn_raw_ptr_buffer_atomic_smin:
6941 case Intrinsic::amdgcn_struct_buffer_atomic_smin:
6942 case Intrinsic::amdgcn_struct_ptr_buffer_atomic_smin:
6943 return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_SMIN;
6944 case Intrinsic::amdgcn_raw_buffer_atomic_umin:
6945 case Intrinsic::amdgcn_raw_ptr_buffer_atomic_umin:
6946 case Intrinsic::amdgcn_struct_buffer_atomic_umin:
6947 case Intrinsic::amdgcn_struct_ptr_buffer_atomic_umin:
6948 return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_UMIN;
6949 case Intrinsic::amdgcn_raw_buffer_atomic_smax:
6950 case Intrinsic::amdgcn_raw_ptr_buffer_atomic_smax:
6951 case Intrinsic::amdgcn_struct_buffer_atomic_smax:
6952 case Intrinsic::amdgcn_struct_ptr_buffer_atomic_smax:
6953 return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_SMAX;
6954 case Intrinsic::amdgcn_raw_buffer_atomic_umax:
6955 case Intrinsic::amdgcn_raw_ptr_buffer_atomic_umax:
6956 case Intrinsic::amdgcn_struct_buffer_atomic_umax:
6957 case Intrinsic::amdgcn_struct_ptr_buffer_atomic_umax:
6958 return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_UMAX;
6959 case Intrinsic::amdgcn_raw_buffer_atomic_and:
6960 case Intrinsic::amdgcn_raw_ptr_buffer_atomic_and:
6961 case Intrinsic::amdgcn_struct_buffer_atomic_and:
6962 case Intrinsic::amdgcn_struct_ptr_buffer_atomic_and:
6963 return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_AND;
6964 case Intrinsic::amdgcn_raw_buffer_atomic_or:
6965 case Intrinsic::amdgcn_raw_ptr_buffer_atomic_or:
6966 case Intrinsic::amdgcn_struct_buffer_atomic_or:
6967 case Intrinsic::amdgcn_struct_ptr_buffer_atomic_or:
6968 return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_OR;
6969 case Intrinsic::amdgcn_raw_buffer_atomic_xor:
6970 case Intrinsic::amdgcn_raw_ptr_buffer_atomic_xor:
6971 case Intrinsic::amdgcn_struct_buffer_atomic_xor:
6972 case Intrinsic::amdgcn_struct_ptr_buffer_atomic_xor:
6973 return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_XOR;
6974 case Intrinsic::amdgcn_raw_buffer_atomic_inc:
6975 case Intrinsic::amdgcn_raw_ptr_buffer_atomic_inc:
6976 case Intrinsic::amdgcn_struct_buffer_atomic_inc:
6977 case Intrinsic::amdgcn_struct_ptr_buffer_atomic_inc:
6978 return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_INC;
6979 case Intrinsic::amdgcn_raw_buffer_atomic_dec:
6980 case Intrinsic::amdgcn_raw_ptr_buffer_atomic_dec:
6981 case Intrinsic::amdgcn_struct_buffer_atomic_dec:
6982 case Intrinsic::amdgcn_struct_ptr_buffer_atomic_dec:
6983 return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_DEC;
6984 case Intrinsic::amdgcn_raw_buffer_atomic_cmpswap:
6985 case Intrinsic::amdgcn_raw_ptr_buffer_atomic_cmpswap:
6986 case Intrinsic::amdgcn_struct_buffer_atomic_cmpswap:
6987 case Intrinsic::amdgcn_struct_ptr_buffer_atomic_cmpswap:
6988 return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_CMPSWAP;
6989 case Intrinsic::amdgcn_raw_buffer_atomic_fadd:
6990 case Intrinsic::amdgcn_raw_ptr_buffer_atomic_fadd:
6991 case Intrinsic::amdgcn_struct_buffer_atomic_fadd:
6992 case Intrinsic::amdgcn_struct_ptr_buffer_atomic_fadd:
6993 return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_FADD;
6994 case Intrinsic::amdgcn_raw_buffer_atomic_fmin:
6995 case Intrinsic::amdgcn_raw_ptr_buffer_atomic_fmin:
6996 case Intrinsic::amdgcn_struct_buffer_atomic_fmin:
6997 case Intrinsic::amdgcn_struct_ptr_buffer_atomic_fmin:
6998 return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_FMIN;
6999 case Intrinsic::amdgcn_raw_buffer_atomic_fmax:
7000 case Intrinsic::amdgcn_raw_ptr_buffer_atomic_fmax:
7001 case Intrinsic::amdgcn_struct_buffer_atomic_fmax:
7002 case Intrinsic::amdgcn_struct_ptr_buffer_atomic_fmax:
7003 return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_FMAX;
7004 case Intrinsic::amdgcn_raw_buffer_atomic_sub_clamp_u32:
7005 case Intrinsic::amdgcn_raw_ptr_buffer_atomic_sub_clamp_u32:
7006 case Intrinsic::amdgcn_struct_buffer_atomic_sub_clamp_u32:
7007 case Intrinsic::amdgcn_struct_ptr_buffer_atomic_sub_clamp_u32:
7008 return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_SUB_CLAMP_U32;
7009 case Intrinsic::amdgcn_raw_buffer_atomic_cond_sub_u32:
7010 case Intrinsic::amdgcn_raw_ptr_buffer_atomic_cond_sub_u32:
7011 case Intrinsic::amdgcn_struct_buffer_atomic_cond_sub_u32:
7012 case Intrinsic::amdgcn_struct_ptr_buffer_atomic_cond_sub_u32:
7013 return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_COND_SUB_U32;
7022 const bool IsCmpSwap =
7023 IID == Intrinsic::amdgcn_raw_buffer_atomic_cmpswap ||
7024 IID == Intrinsic::amdgcn_struct_buffer_atomic_cmpswap ||
7025 IID == Intrinsic::amdgcn_raw_ptr_buffer_atomic_cmpswap ||
7026 IID == Intrinsic::amdgcn_struct_ptr_buffer_atomic_cmpswap;
7037 CmpVal =
MI.getOperand(3).getReg();
7042 Register RSrc =
MI.getOperand(3 + OpOffset).getReg();
7043 const unsigned NumVIndexOps = IsCmpSwap ? 9 : 8;
7046 const bool HasVIndex =
MI.getNumOperands() == NumVIndexOps;
7049 VIndex =
MI.getOperand(4 + OpOffset).getReg();
7052 VIndex =
B.buildConstant(
LLT::scalar(32), 0).getReg(0);
7055 Register VOffset =
MI.getOperand(4 + OpOffset).getReg();
7056 Register SOffset =
MI.getOperand(5 + OpOffset).getReg();
7057 unsigned AuxiliaryData =
MI.getOperand(6 + OpOffset).getImm();
7076 .addImm(AuxiliaryData)
7077 .addImm(HasVIndex ? -1 : 0)
7078 .addMemOperand(MMO);
7080 MI.eraseFromParent();
7090 bool IsA16,
bool IsG16) {
7106 (
B.getMRI()->getType(AddrReg) ==
S16)) {
7111 B.buildBuildVector(
V2S16, {AddrReg, B.buildUndef(S16).getReg(0)})
7115 "Bias needs to be converted to 16 bit in A16 mode");
7117 AddrReg =
B.buildBitcast(
V2S16, AddrReg).getReg(0);
7123 if (((
I + 1) >= EndIdx) ||
7130 !
MI.getOperand(ArgOffset +
I + 1).isReg()) {
7132 B.buildBuildVector(
V2S16, {AddrReg, B.buildUndef(S16).getReg(0)})
7137 V2S16, {AddrReg, MI.getOperand(ArgOffset + I + 1).getReg()})
7148 int DimIdx,
int NumVAddrs) {
7152 for (
int I = 0;
I != NumVAddrs; ++
I) {
7154 if (
SrcOp.isReg()) {
7160 int NumAddrRegs = AddrRegs.
size();
7161 if (NumAddrRegs != 1) {
7164 MI.getOperand(DimIdx).setReg(VAddr.getReg(0));
7167 for (
int I = 1;
I != NumVAddrs; ++
I) {
7170 MI.getOperand(DimIdx +
I).setReg(AMDGPU::NoRegister);
7192 const unsigned NumDefs =
MI.getNumExplicitDefs();
7193 const unsigned ArgOffset = NumDefs + 1;
7194 bool IsTFE = NumDefs == 2;
7212 VData =
MI.getOperand(NumDefs == 0 ? 1 : 0).getReg();
7216 const bool IsAtomicPacked16Bit =
7217 (BaseOpcode->
BaseOpcode == AMDGPU::IMAGE_ATOMIC_PK_ADD_F16 ||
7218 BaseOpcode->
BaseOpcode == AMDGPU::IMAGE_ATOMIC_PK_ADD_BF16);
7226 ST.hasG16() ? (BaseOpcode->
Gradients && GradTy ==
S16) : GradTy ==
S16;
7227 const bool IsA16 = AddrTy ==
S16;
7228 const bool IsD16 = !IsAtomicPacked16Bit && Ty.getScalarType() ==
S16;
7231 if (!BaseOpcode->
Atomic) {
7232 DMask =
MI.getOperand(ArgOffset + Intr->
DMaskIndex).getImm();
7235 }
else if (DMask != 0) {
7237 }
else if (!IsTFE && !BaseOpcode->
Store) {
7239 B.buildUndef(
MI.getOperand(0));
7240 MI.eraseFromParent();
7248 const unsigned StoreOpcode = IsD16 ? AMDGPU::G_AMDGPU_INTRIN_IMAGE_STORE_D16
7249 : AMDGPU::G_AMDGPU_INTRIN_IMAGE_STORE;
7250 const unsigned LoadOpcode = IsD16 ? AMDGPU::G_AMDGPU_INTRIN_IMAGE_LOAD_D16
7251 : AMDGPU::G_AMDGPU_INTRIN_IMAGE_LOAD;
7252 unsigned NewOpcode = LoadOpcode;
7253 if (BaseOpcode->
Store)
7254 NewOpcode = StoreOpcode;
7256 NewOpcode = AMDGPU::G_AMDGPU_INTRIN_IMAGE_LOAD_NORET;
7259 MI.setDesc(
B.getTII().get(NewOpcode));
7263 if (IsTFE && DMask == 0) {
7266 MI.getOperand(ArgOffset + Intr->
DMaskIndex).setImm(DMask);
7269 if (BaseOpcode->
Atomic) {
7274 if (Ty.isVector() && !IsAtomicPacked16Bit)
7281 auto Concat =
B.buildBuildVector(PackedTy, {VData0, VData1});
7282 MI.getOperand(2).setReg(
Concat.getReg(0));
7283 MI.getOperand(3).setReg(AMDGPU::NoRegister);
7287 unsigned CorrectedNumVAddrs = Intr->
NumVAddrs;
7290 if (BaseOpcode->
Gradients && !ST.hasG16() && (IsA16 != IsG16)) {
7296 if (IsA16 && !ST.hasA16()) {
7301 const unsigned NSAMaxSize = ST.getNSAMaxSize(BaseOpcode->
Sampler);
7302 const unsigned HasPartialNSA = ST.hasPartialNSAEncoding();
7304 if (IsA16 || IsG16) {
7312 const bool UseNSA = ST.hasNSAEncoding() &&
7313 PackedRegs.
size() >= ST.getNSAThreshold(MF) &&
7314 (PackedRegs.
size() <= NSAMaxSize || HasPartialNSA);
7315 const bool UsePartialNSA =
7316 UseNSA && HasPartialNSA && PackedRegs.
size() > NSAMaxSize;
7318 if (UsePartialNSA) {
7322 auto Concat =
B.buildConcatVectors(
7323 PackedAddrTy,
ArrayRef(PackedRegs).slice(NSAMaxSize - 1));
7324 PackedRegs[NSAMaxSize - 1] =
Concat.getReg(0);
7325 PackedRegs.
resize(NSAMaxSize);
7326 }
else if (!UseNSA && PackedRegs.
size() > 1) {
7328 auto Concat =
B.buildConcatVectors(PackedAddrTy, PackedRegs);
7329 PackedRegs[0] =
Concat.getReg(0);
7333 const unsigned NumPacked = PackedRegs.
size();
7336 if (!
SrcOp.isReg()) {
7346 SrcOp.setReg(AMDGPU::NoRegister);
7363 const bool UseNSA = ST.hasNSAEncoding() &&
7364 CorrectedNumVAddrs >= ST.getNSAThreshold(MF) &&
7365 (CorrectedNumVAddrs <= NSAMaxSize || HasPartialNSA);
7366 const bool UsePartialNSA =
7367 UseNSA && HasPartialNSA && CorrectedNumVAddrs > NSAMaxSize;
7369 if (UsePartialNSA) {
7371 ArgOffset + Intr->
VAddrStart + NSAMaxSize - 1,
7373 }
else if (!UseNSA && Intr->
NumVAddrs > 1) {
7388 if (!Ty.isVector() || !IsD16)
7392 if (RepackedReg != VData) {
7393 MI.getOperand(1).setReg(RepackedReg);
7401 const int NumElts = Ty.isVector() ? Ty.getNumElements() : 1;
7404 if (NumElts < DMaskLanes)
7407 if (NumElts > 4 || DMaskLanes > 4)
7417 const unsigned AdjustedNumElts = DMaskLanes == 0 ? 1 : DMaskLanes;
7418 const LLT AdjustedTy =
7434 if (IsD16 && ST.hasUnpackedD16VMem()) {
7441 unsigned RoundedElts = (AdjustedTy.
getSizeInBits() + 31) / 32;
7442 unsigned RoundedSize = 32 * RoundedElts;
7446 RegTy = !IsTFE && EltSize == 16 ?
V2S16 :
S32;
7451 if (!IsTFE && (RoundedTy == Ty || !Ty.
isVector()))
7457 B.setInsertPt(*
MI.getParent(), ++
MI.getIterator());
7461 const LLT LoadResultTy = IsTFE ? TFETy : RoundedTy;
7462 const int ResultNumRegs = LoadResultTy.
getSizeInBits() / 32;
7466 MI.getOperand(0).setReg(NewResultReg);
7474 Dst1Reg =
MI.getOperand(1).getReg();
7479 MI.removeOperand(1);
7483 B.buildUnmerge({DstReg, Dst1Reg}, NewResultReg);
7492 const int NumDataRegs = IsTFE ? ResultNumRegs - 1 : ResultNumRegs;
7494 if (ResultNumRegs == 1) {
7496 ResultRegs[0] = NewResultReg;
7499 for (
int I = 0;
I != NumDataRegs; ++
I)
7501 B.buildUnmerge(ResultRegs, NewResultReg);
7506 ResultRegs.
resize(NumDataRegs);
7511 if (IsD16 && !Ty.isVector()) {
7512 B.buildTrunc(DstReg, ResultRegs[0]);
7517 if (Ty ==
V2S16 && NumDataRegs == 1 && !ST.hasUnpackedD16VMem()) {
7518 B.buildBitcast(DstReg, ResultRegs[0]);
7530 if (RegTy !=
V2S16 && !ST.hasUnpackedD16VMem()) {
7532 Reg =
B.buildBitcast(
V2S16, Reg).getReg(0);
7533 }
else if (ST.hasUnpackedD16VMem()) {
7535 Reg =
B.buildTrunc(
S16, Reg).getReg(0);
7539 auto padWithUndef = [&](
LLT Ty,
int NumElts) {
7543 for (
int I = 0;
I != NumElts; ++
I)
7550 padWithUndef(ResTy, NumElts - ResultRegs.
size());
7551 B.buildBuildVector(DstReg, ResultRegs);
7555 assert(!ST.hasUnpackedD16VMem() && ResTy ==
V2S16);
7556 const int RegsToCover = (Ty.getSizeInBits() + 31) / 32;
7562 if (ResultRegs.
size() == 1) {
7563 NewResultReg = ResultRegs[0];
7564 }
else if (ResultRegs.
size() == 2) {
7566 NewResultReg =
B.buildConcatVectors(
V4S16, ResultRegs).getReg(0);
7574 B.buildDeleteTrailingVectorElements(DstReg, NewResultReg);
7576 B.buildPadVectorWithUndefElements(DstReg, NewResultReg);
7581 padWithUndef(ResTy, RegsToCover - ResultRegs.
size());
7582 B.buildConcatVectors(DstReg, ResultRegs);
7591 Register OrigDst =
MI.getOperand(0).getReg();
7593 LLT Ty =
B.getMRI()->getType(OrigDst);
7594 unsigned Size = Ty.getSizeInBits();
7597 if (
Size < 32 && ST.hasScalarSubwordLoads()) {
7599 Opc =
Size == 8 ? AMDGPU::G_AMDGPU_S_BUFFER_LOAD_UBYTE
7600 : AMDGPU::G_AMDGPU_S_BUFFER_LOAD_USHORT;
7603 Dst =
B.getMRI()->createGenericVirtualRegister(
LLT::scalar(32));
7605 Opc = AMDGPU::G_AMDGPU_S_BUFFER_LOAD;
7614 B.setInsertPt(
B.getMBB(),
MI);
7619 B.setInsertPt(
B.getMBB(),
MI);
7625 MI.setDesc(
B.getTII().get(
Opc));
7626 MI.removeOperand(1);
7629 const unsigned MemSize = (
Size + 7) / 8;
7630 const Align MemAlign =
B.getDataLayout().getABITypeAlign(
7637 MI.addMemOperand(MF, MMO);
7638 if (Dst != OrigDst) {
7639 MI.getOperand(0).setReg(Dst);
7640 B.setInsertPt(
B.getMBB(), ++
B.getInsertPt());
7641 B.buildTrunc(OrigDst, Dst);
7663 MI.setDesc(
B.getTII().get(AMDGPU::G_AMDGPU_S_BUFFER_PREFETCH));
7664 MI.removeOperand(0);
7674 if (!ST.hasTrapHandler() ||
7678 return ST.supportsGetDoorbellID() ?
7691 MI.eraseFromParent();
7701 BuildMI(*TrapBB, TrapBB->
end(),
DL,
B.getTII().get(AMDGPU::S_ENDPGM))
7703 BuildMI(BB, &
MI,
DL,
B.getTII().get(AMDGPU::S_CBRANCH_EXECNZ))
7707 MI.eraseFromParent();
7716 Register SGPR01(AMDGPU::SGPR0_SGPR1);
7723 ST.getTargetLowering()->getImplicitParameterOffset(
B.getMF(), Param);
7743 B.buildObjectPtrOffset(LoadAddr, KernargPtrReg,
7746 Register Temp =
B.buildLoad(
S64, LoadAddr, *MMO).getReg(0);
7747 B.buildCopy(SGPR01, Temp);
7748 B.buildInstr(AMDGPU::S_TRAP)
7751 MI.eraseFromParent();
7762 B.buildCopy(SGPR01, LiveIn);
7763 B.buildInstr(AMDGPU::S_TRAP)
7767 MI.eraseFromParent();
7776 if (ST.hasPrivEnabledTrap2NopBug()) {
7777 ST.getInstrInfo()->insertSimulatedTrap(MRI,
B.getMBB(),
MI,
7779 MI.eraseFromParent();
7783 B.buildInstr(AMDGPU::S_TRAP)
7785 MI.eraseFromParent();
7794 if (!ST.hasTrapHandler() ||
7798 Fn,
"debugtrap handler not supported",
MI.getDebugLoc(),
DS_Warning));
7801 B.buildInstr(AMDGPU::S_TRAP)
7805 MI.eraseFromParent();
7818 Register NodePtr =
MI.getOperand(2).getReg();
7819 Register RayExtent =
MI.getOperand(3).getReg();
7820 Register RayOrigin =
MI.getOperand(4).getReg();
7822 Register RayInvDir =
MI.getOperand(6).getReg();
7825 if (!ST.hasGFX10_AEncoding()) {
7828 Fn,
"intrinsic not supported on subtarget",
MI.getDebugLoc()));
7837 const unsigned NumVDataDwords = 4;
7838 const unsigned NumVAddrDwords = IsA16 ? (Is64 ? 9 : 8) : (Is64 ? 12 : 11);
7839 const unsigned NumVAddrs = IsGFX11Plus ? (IsA16 ? 4 : 5) : NumVAddrDwords;
7841 IsGFX12Plus || (ST.hasNSAEncoding() && NumVAddrs <= ST.getNSAMaxSize());
7843 const unsigned BaseOpcodes[2][2] = {
7844 {AMDGPU::IMAGE_BVH_INTERSECT_RAY, AMDGPU::IMAGE_BVH_INTERSECT_RAY_a16},
7845 {AMDGPU::IMAGE_BVH64_INTERSECT_RAY,
7846 AMDGPU::IMAGE_BVH64_INTERSECT_RAY_a16}};
7850 IsGFX12Plus ? AMDGPU::MIMGEncGfx12
7851 : IsGFX11 ? AMDGPU::MIMGEncGfx11NSA
7852 : AMDGPU::MIMGEncGfx10NSA,
7853 NumVDataDwords, NumVAddrDwords);
7857 IsGFX11 ? AMDGPU::MIMGEncGfx11Default
7858 : AMDGPU::MIMGEncGfx10Default,
7859 NumVDataDwords, NumVAddrDwords);
7864 if (UseNSA && IsGFX11Plus) {
7866 auto Unmerge =
B.buildUnmerge({
S32,
S32,
S32}, Src);
7867 auto Merged =
B.buildMergeLikeInstr(
7868 V3S32, {Unmerge.getReg(0), Unmerge.getReg(1), Unmerge.getReg(2)});
7869 Ops.push_back(Merged.getReg(0));
7872 Ops.push_back(NodePtr);
7873 Ops.push_back(RayExtent);
7874 packLanes(RayOrigin);
7877 auto UnmergeRayDir =
B.buildUnmerge({
S16,
S16,
S16}, RayDir);
7878 auto UnmergeRayInvDir =
B.buildUnmerge({
S16,
S16,
S16}, RayInvDir);
7879 auto MergedDir =
B.buildMergeLikeInstr(
7882 S32,
B.buildMergeLikeInstr(
V2S16, {UnmergeRayInvDir.getReg(0),
7883 UnmergeRayDir.getReg(0)}))
7886 S32,
B.buildMergeLikeInstr(
V2S16, {UnmergeRayInvDir.getReg(1),
7887 UnmergeRayDir.getReg(1)}))
7890 S32,
B.buildMergeLikeInstr(
V2S16, {UnmergeRayInvDir.getReg(2),
7891 UnmergeRayDir.getReg(2)}))
7893 Ops.push_back(MergedDir.getReg(0));
7896 packLanes(RayInvDir);
7900 auto Unmerge =
B.buildUnmerge({
S32,
S32}, NodePtr);
7901 Ops.push_back(Unmerge.getReg(0));
7902 Ops.push_back(Unmerge.getReg(1));
7904 Ops.push_back(NodePtr);
7906 Ops.push_back(RayExtent);
7909 auto Unmerge =
B.buildUnmerge({
S32,
S32,
S32}, Src);
7910 Ops.push_back(Unmerge.getReg(0));
7911 Ops.push_back(Unmerge.getReg(1));
7912 Ops.push_back(Unmerge.getReg(2));
7915 packLanes(RayOrigin);
7917 auto UnmergeRayDir =
B.buildUnmerge({
S16,
S16,
S16}, RayDir);
7918 auto UnmergeRayInvDir =
B.buildUnmerge({
S16,
S16,
S16}, RayInvDir);
7922 B.buildMergeLikeInstr(R1,
7923 {UnmergeRayDir.getReg(0), UnmergeRayDir.getReg(1)});
7924 B.buildMergeLikeInstr(
7925 R2, {UnmergeRayDir.getReg(2), UnmergeRayInvDir.getReg(0)});
7926 B.buildMergeLikeInstr(
7927 R3, {UnmergeRayInvDir.getReg(1), UnmergeRayInvDir.getReg(2)});
7933 packLanes(RayInvDir);
7940 Register MergedOps =
B.buildMergeLikeInstr(OpTy,
Ops).getReg(0);
7942 Ops.push_back(MergedOps);
7945 auto MIB =
B.buildInstr(AMDGPU::G_AMDGPU_BVH_INTERSECT_RAY)
7954 .addImm(IsA16 ? 1 : 0)
7957 MI.eraseFromParent();
7967 Register DstOrigin =
MI.getOperand(1).getReg();
7969 Register NodePtr =
MI.getOperand(4).getReg();
7970 Register RayExtent =
MI.getOperand(5).getReg();
7971 Register InstanceMask =
MI.getOperand(6).getReg();
7972 Register RayOrigin =
MI.getOperand(7).getReg();
7974 Register Offsets =
MI.getOperand(9).getReg();
7975 Register TDescr =
MI.getOperand(10).getReg();
7977 if (!ST.hasBVHDualAndBVH8Insts()) {
7980 Fn,
"intrinsic not supported on subtarget",
MI.getDebugLoc()));
7985 Intrinsic::amdgcn_image_bvh8_intersect_ray;
7986 const unsigned NumVDataDwords = 10;
7987 const unsigned NumVAddrDwords = IsBVH8 ? 11 : 12;
7989 IsBVH8 ? AMDGPU::IMAGE_BVH8_INTERSECT_RAY
7990 : AMDGPU::IMAGE_BVH_DUAL_INTERSECT_RAY,
7991 AMDGPU::MIMGEncGfx12, NumVDataDwords, NumVAddrDwords);
7994 auto RayExtentInstanceMaskVec =
B.buildMergeLikeInstr(
7995 V2S32, {RayExtent,
B.buildAnyExt(
S32, InstanceMask)});
7997 B.buildInstr(IsBVH8 ? AMDGPU::G_AMDGPU_BVH8_INTERSECT_RAY
7998 : AMDGPU::G_AMDGPU_BVH_DUAL_INTERSECT_RAY)
8004 .addUse(RayExtentInstanceMaskVec.getReg(0))
8011 MI.eraseFromParent();
8020 B.buildInstr(AMDGPU::G_AMDGPU_WAVE_ADDRESS, {DstReg}, {StackPtr});
8021 MI.eraseFromParent();
8028 if (!ST.hasArchitectedSGPRs())
8032 auto TTMP8 =
B.buildCopy(
S32,
Register(AMDGPU::TTMP8));
8033 auto LSB =
B.buildConstant(
S32, 25);
8034 auto Width =
B.buildConstant(
S32, 5);
8035 B.buildUbfx(DstReg, TTMP8, LSB, Width);
8036 MI.eraseFromParent();
8044 unsigned Width)
const {
8048 MRI.
setRegClass(DstReg, &AMDGPU::SReg_32RegClass);
8049 B.buildInstr(AMDGPU::S_GETREG_B32_const)
8052 MI.eraseFromParent();
8070 B.buildIntrinsic(Intrinsic::amdgcn_s_getreg, {
S32},
8074 B.buildIntrinsic(Intrinsic::amdgcn_s_getreg, {
S32},
8077 B.buildMergeLikeInstr(Src, {ModeReg, TrapReg});
8078 MI.eraseFromParent();
8089 auto Unmerge =
B.buildUnmerge({
S32,
S32},
MI.getOperand(0));
8093 .addReg(Unmerge.getReg(0));
8097 .addReg(Unmerge.getReg(1));
8098 MI.eraseFromParent();
8110 case Intrinsic::amdgcn_icmp: {
8121 if (!Src1Const || Src1Const->Value != 0)
8125 int64_t Pred =
MI.getOperand(4).getImm();
8131 B.buildIntrinsic(Intrinsic::amdgcn_ballot, Dst).addUse(Src0);
8132 MI.eraseFromParent();
8135 case Intrinsic::sponentry:
8141 B.buildInstr(AMDGPU::G_AMDGPU_SPONENTRY).addDef(TmpReg);
8144 B.buildIntToPtr(DstReg, TmpReg);
8145 MI.eraseFromParent();
8147 int FI =
B.getMF().getFrameInfo().CreateFixedObject(
8149 B.buildFrameIndex(
MI.getOperand(0), FI);
8150 MI.eraseFromParent();
8153 case Intrinsic::amdgcn_if:
8154 case Intrinsic::amdgcn_else: {
8157 bool Negated =
false;
8169 std::swap(CondBrTarget, UncondBrTarget);
8171 B.setInsertPt(
B.getMBB(), BrCond->getIterator());
8172 if (IntrID == Intrinsic::amdgcn_if) {
8173 B.buildInstr(AMDGPU::SI_IF)
8176 .addMBB(UncondBrTarget);
8178 B.buildInstr(AMDGPU::SI_ELSE)
8181 .addMBB(UncondBrTarget);
8190 B.buildBr(*CondBrTarget);
8195 MI.eraseFromParent();
8196 BrCond->eraseFromParent();
8202 case Intrinsic::amdgcn_loop: {
8205 bool Negated =
false;
8215 std::swap(CondBrTarget, UncondBrTarget);
8217 B.setInsertPt(
B.getMBB(), BrCond->getIterator());
8218 B.buildInstr(AMDGPU::SI_LOOP)
8220 .addMBB(UncondBrTarget);
8225 B.buildBr(*CondBrTarget);
8227 MI.eraseFromParent();
8228 BrCond->eraseFromParent();
8235 case Intrinsic::amdgcn_wave_reduce_min:
8236 case Intrinsic::amdgcn_wave_reduce_umin:
8237 case Intrinsic::amdgcn_wave_reduce_fmin:
8238 case Intrinsic::amdgcn_wave_reduce_max:
8239 case Intrinsic::amdgcn_wave_reduce_umax:
8240 case Intrinsic::amdgcn_wave_reduce_fmax:
8241 case Intrinsic::amdgcn_wave_reduce_add:
8242 case Intrinsic::amdgcn_wave_reduce_fadd:
8243 case Intrinsic::amdgcn_wave_reduce_sub:
8244 case Intrinsic::amdgcn_wave_reduce_fsub:
8245 case Intrinsic::amdgcn_wave_reduce_and:
8246 case Intrinsic::amdgcn_wave_reduce_or:
8247 case Intrinsic::amdgcn_wave_reduce_xor: {
8252 bool IsFPOp = IntrID == Intrinsic::amdgcn_wave_reduce_fmin ||
8253 IntrID == Intrinsic::amdgcn_wave_reduce_fmax ||
8254 IntrID == Intrinsic::amdgcn_wave_reduce_fadd ||
8255 IntrID == Intrinsic::amdgcn_wave_reduce_fsub;
8256 bool NeedsSignExt = IntrID == Intrinsic::amdgcn_wave_reduce_min ||
8257 IntrID == Intrinsic::amdgcn_wave_reduce_max ||
8258 IntrID == Intrinsic::amdgcn_wave_reduce_add ||
8259 IntrID == Intrinsic::amdgcn_wave_reduce_sub;
8260 auto Ext = IsFPOp ?
B.buildFPExt(
LLT::scalar(32), SrcReg)
8266 .addUse(Ext.getReg(0))
8267 .addImm(
MI.getOperand(3).getImm());
8269 B.buildFPTrunc(DstReg, NewDst);
8271 B.buildTrunc(DstReg, NewDst);
8272 MI.eraseFromParent();
8275 case Intrinsic::amdgcn_addrspacecast_nonnull:
8277 case Intrinsic::amdgcn_make_buffer_rsrc:
8279 case Intrinsic::amdgcn_kernarg_segment_ptr:
8282 B.buildConstant(
MI.getOperand(0).getReg(), 0);
8283 MI.eraseFromParent();
8289 case Intrinsic::amdgcn_implicitarg_ptr:
8291 case Intrinsic::amdgcn_workitem_id_x:
8294 case Intrinsic::amdgcn_workitem_id_y:
8297 case Intrinsic::amdgcn_workitem_id_z:
8300 case Intrinsic::amdgcn_workgroup_id_x:
8305 case Intrinsic::amdgcn_workgroup_id_y:
8310 case Intrinsic::amdgcn_workgroup_id_z:
8315 case Intrinsic::amdgcn_cluster_id_x:
8316 return ST.hasClusters() &&
8319 case Intrinsic::amdgcn_cluster_id_y:
8320 return ST.hasClusters() &&
8323 case Intrinsic::amdgcn_cluster_id_z:
8324 return ST.hasClusters() &&
8327 case Intrinsic::amdgcn_cluster_workgroup_id_x:
8328 return ST.hasClusters() &&
8331 case Intrinsic::amdgcn_cluster_workgroup_id_y:
8332 return ST.hasClusters() &&
8335 case Intrinsic::amdgcn_cluster_workgroup_id_z:
8336 return ST.hasClusters() &&
8339 case Intrinsic::amdgcn_cluster_workgroup_flat_id:
8340 return ST.hasClusters() &&
8342 case Intrinsic::amdgcn_cluster_workgroup_max_id_x:
8343 return ST.hasClusters() &&
8346 case Intrinsic::amdgcn_cluster_workgroup_max_id_y:
8347 return ST.hasClusters() &&
8350 case Intrinsic::amdgcn_cluster_workgroup_max_id_z:
8351 return ST.hasClusters() &&
8354 case Intrinsic::amdgcn_cluster_workgroup_max_flat_id:
8355 return ST.hasClusters() &&
8359 case Intrinsic::amdgcn_wave_id:
8361 case Intrinsic::amdgcn_lds_kernel_id:
8364 case Intrinsic::amdgcn_dispatch_ptr:
8367 case Intrinsic::amdgcn_queue_ptr:
8370 case Intrinsic::amdgcn_implicit_buffer_ptr:
8373 case Intrinsic::amdgcn_dispatch_id:
8376 case Intrinsic::r600_read_ngroups_x:
8380 case Intrinsic::r600_read_ngroups_y:
8383 case Intrinsic::r600_read_ngroups_z:
8386 case Intrinsic::r600_read_local_size_x:
8389 case Intrinsic::r600_read_local_size_y:
8393 case Intrinsic::r600_read_local_size_z:
8396 case Intrinsic::amdgcn_fdiv_fast:
8398 case Intrinsic::amdgcn_is_shared:
8400 case Intrinsic::amdgcn_is_private:
8402 case Intrinsic::amdgcn_wavefrontsize: {
8403 B.buildConstant(
MI.getOperand(0), ST.getWavefrontSize());
8404 MI.eraseFromParent();
8407 case Intrinsic::amdgcn_s_buffer_load:
8409 case Intrinsic::amdgcn_raw_buffer_store:
8410 case Intrinsic::amdgcn_raw_ptr_buffer_store:
8411 case Intrinsic::amdgcn_struct_buffer_store:
8412 case Intrinsic::amdgcn_struct_ptr_buffer_store:
8414 case Intrinsic::amdgcn_raw_buffer_store_format:
8415 case Intrinsic::amdgcn_raw_ptr_buffer_store_format:
8416 case Intrinsic::amdgcn_struct_buffer_store_format:
8417 case Intrinsic::amdgcn_struct_ptr_buffer_store_format:
8419 case Intrinsic::amdgcn_raw_tbuffer_store:
8420 case Intrinsic::amdgcn_raw_ptr_tbuffer_store:
8421 case Intrinsic::amdgcn_struct_tbuffer_store:
8422 case Intrinsic::amdgcn_struct_ptr_tbuffer_store:
8424 case Intrinsic::amdgcn_raw_buffer_load:
8425 case Intrinsic::amdgcn_raw_ptr_buffer_load:
8426 case Intrinsic::amdgcn_raw_atomic_buffer_load:
8427 case Intrinsic::amdgcn_raw_ptr_atomic_buffer_load:
8428 case Intrinsic::amdgcn_struct_buffer_load:
8429 case Intrinsic::amdgcn_struct_ptr_buffer_load:
8430 case Intrinsic::amdgcn_struct_atomic_buffer_load:
8431 case Intrinsic::amdgcn_struct_ptr_atomic_buffer_load:
8433 case Intrinsic::amdgcn_raw_buffer_load_format:
8434 case Intrinsic::amdgcn_raw_ptr_buffer_load_format:
8435 case Intrinsic::amdgcn_struct_buffer_load_format:
8436 case Intrinsic::amdgcn_struct_ptr_buffer_load_format:
8438 case Intrinsic::amdgcn_raw_tbuffer_load:
8439 case Intrinsic::amdgcn_raw_ptr_tbuffer_load:
8440 case Intrinsic::amdgcn_struct_tbuffer_load:
8441 case Intrinsic::amdgcn_struct_ptr_tbuffer_load:
8443 case Intrinsic::amdgcn_raw_buffer_atomic_swap:
8444 case Intrinsic::amdgcn_raw_ptr_buffer_atomic_swap:
8445 case Intrinsic::amdgcn_struct_buffer_atomic_swap:
8446 case Intrinsic::amdgcn_struct_ptr_buffer_atomic_swap:
8447 case Intrinsic::amdgcn_raw_buffer_atomic_add:
8448 case Intrinsic::amdgcn_raw_ptr_buffer_atomic_add:
8449 case Intrinsic::amdgcn_struct_buffer_atomic_add:
8450 case Intrinsic::amdgcn_struct_ptr_buffer_atomic_add:
8451 case Intrinsic::amdgcn_raw_buffer_atomic_sub:
8452 case Intrinsic::amdgcn_raw_ptr_buffer_atomic_sub:
8453 case Intrinsic::amdgcn_struct_buffer_atomic_sub:
8454 case Intrinsic::amdgcn_struct_ptr_buffer_atomic_sub:
8455 case Intrinsic::amdgcn_raw_buffer_atomic_smin:
8456 case Intrinsic::amdgcn_raw_ptr_buffer_atomic_smin:
8457 case Intrinsic::amdgcn_struct_buffer_atomic_smin:
8458 case Intrinsic::amdgcn_struct_ptr_buffer_atomic_smin:
8459 case Intrinsic::amdgcn_raw_buffer_atomic_umin:
8460 case Intrinsic::amdgcn_raw_ptr_buffer_atomic_umin:
8461 case Intrinsic::amdgcn_struct_buffer_atomic_umin:
8462 case Intrinsic::amdgcn_struct_ptr_buffer_atomic_umin:
8463 case Intrinsic::amdgcn_raw_buffer_atomic_smax:
8464 case Intrinsic::amdgcn_raw_ptr_buffer_atomic_smax:
8465 case Intrinsic::amdgcn_struct_buffer_atomic_smax:
8466 case Intrinsic::amdgcn_struct_ptr_buffer_atomic_smax:
8467 case Intrinsic::amdgcn_raw_buffer_atomic_umax:
8468 case Intrinsic::amdgcn_raw_ptr_buffer_atomic_umax:
8469 case Intrinsic::amdgcn_struct_buffer_atomic_umax:
8470 case Intrinsic::amdgcn_struct_ptr_buffer_atomic_umax:
8471 case Intrinsic::amdgcn_raw_buffer_atomic_and:
8472 case Intrinsic::amdgcn_raw_ptr_buffer_atomic_and:
8473 case Intrinsic::amdgcn_struct_buffer_atomic_and:
8474 case Intrinsic::amdgcn_struct_ptr_buffer_atomic_and:
8475 case Intrinsic::amdgcn_raw_buffer_atomic_or:
8476 case Intrinsic::amdgcn_raw_ptr_buffer_atomic_or:
8477 case Intrinsic::amdgcn_struct_buffer_atomic_or:
8478 case Intrinsic::amdgcn_struct_ptr_buffer_atomic_or:
8479 case Intrinsic::amdgcn_raw_buffer_atomic_xor:
8480 case Intrinsic::amdgcn_raw_ptr_buffer_atomic_xor:
8481 case Intrinsic::amdgcn_struct_buffer_atomic_xor:
8482 case Intrinsic::amdgcn_struct_ptr_buffer_atomic_xor:
8483 case Intrinsic::amdgcn_raw_buffer_atomic_inc:
8484 case Intrinsic::amdgcn_raw_ptr_buffer_atomic_inc:
8485 case Intrinsic::amdgcn_struct_buffer_atomic_inc:
8486 case Intrinsic::amdgcn_struct_ptr_buffer_atomic_inc:
8487 case Intrinsic::amdgcn_raw_buffer_atomic_dec:
8488 case Intrinsic::amdgcn_raw_ptr_buffer_atomic_dec:
8489 case Intrinsic::amdgcn_struct_buffer_atomic_dec:
8490 case Intrinsic::amdgcn_struct_ptr_buffer_atomic_dec:
8491 case Intrinsic::amdgcn_raw_buffer_atomic_cmpswap:
8492 case Intrinsic::amdgcn_raw_ptr_buffer_atomic_cmpswap:
8493 case Intrinsic::amdgcn_struct_buffer_atomic_cmpswap:
8494 case Intrinsic::amdgcn_struct_ptr_buffer_atomic_cmpswap:
8495 case Intrinsic::amdgcn_raw_buffer_atomic_fmin:
8496 case Intrinsic::amdgcn_raw_ptr_buffer_atomic_fmin:
8497 case Intrinsic::amdgcn_struct_buffer_atomic_fmin:
8498 case Intrinsic::amdgcn_struct_ptr_buffer_atomic_fmin:
8499 case Intrinsic::amdgcn_raw_buffer_atomic_fmax:
8500 case Intrinsic::amdgcn_raw_ptr_buffer_atomic_fmax:
8501 case Intrinsic::amdgcn_struct_buffer_atomic_fmax:
8502 case Intrinsic::amdgcn_struct_ptr_buffer_atomic_fmax:
8503 case Intrinsic::amdgcn_raw_buffer_atomic_sub_clamp_u32:
8504 case Intrinsic::amdgcn_raw_ptr_buffer_atomic_sub_clamp_u32:
8505 case Intrinsic::amdgcn_struct_buffer_atomic_sub_clamp_u32:
8506 case Intrinsic::amdgcn_struct_ptr_buffer_atomic_sub_clamp_u32:
8507 case Intrinsic::amdgcn_raw_buffer_atomic_cond_sub_u32:
8508 case Intrinsic::amdgcn_raw_ptr_buffer_atomic_cond_sub_u32:
8509 case Intrinsic::amdgcn_struct_buffer_atomic_cond_sub_u32:
8510 case Intrinsic::amdgcn_struct_ptr_buffer_atomic_cond_sub_u32:
8511 case Intrinsic::amdgcn_raw_buffer_atomic_fadd:
8512 case Intrinsic::amdgcn_raw_ptr_buffer_atomic_fadd:
8513 case Intrinsic::amdgcn_struct_buffer_atomic_fadd:
8514 case Intrinsic::amdgcn_struct_ptr_buffer_atomic_fadd:
8516 case Intrinsic::amdgcn_rsq_clamp:
8518 case Intrinsic::amdgcn_image_bvh_intersect_ray:
8520 case Intrinsic::amdgcn_image_bvh_dual_intersect_ray:
8521 case Intrinsic::amdgcn_image_bvh8_intersect_ray:
8523 case Intrinsic::amdgcn_swmmac_f32_16x16x128_fp8_fp8:
8524 case Intrinsic::amdgcn_swmmac_f32_16x16x128_fp8_bf8:
8525 case Intrinsic::amdgcn_swmmac_f32_16x16x128_bf8_fp8:
8526 case Intrinsic::amdgcn_swmmac_f32_16x16x128_bf8_bf8:
8527 case Intrinsic::amdgcn_swmmac_f16_16x16x128_fp8_fp8:
8528 case Intrinsic::amdgcn_swmmac_f16_16x16x128_fp8_bf8:
8529 case Intrinsic::amdgcn_swmmac_f16_16x16x128_bf8_fp8:
8530 case Intrinsic::amdgcn_swmmac_f16_16x16x128_bf8_bf8: {
8534 if (IndexArgTy !=
S64) {
8535 auto NewIndex = IndexArgTy.
isVector() ?
B.buildBitcast(
S64, Index)
8536 :
B.buildAnyExt(
S64, Index);
8537 MI.getOperand(5).setReg(NewIndex.getReg(0));
8541 case Intrinsic::amdgcn_swmmac_f16_16x16x32_f16:
8542 case Intrinsic::amdgcn_swmmac_bf16_16x16x32_bf16:
8543 case Intrinsic::amdgcn_swmmac_f32_16x16x32_bf16:
8544 case Intrinsic::amdgcn_swmmac_f32_16x16x32_f16:
8545 case Intrinsic::amdgcn_swmmac_f32_16x16x32_fp8_fp8:
8546 case Intrinsic::amdgcn_swmmac_f32_16x16x32_fp8_bf8:
8547 case Intrinsic::amdgcn_swmmac_f32_16x16x32_bf8_fp8:
8548 case Intrinsic::amdgcn_swmmac_f32_16x16x32_bf8_bf8: {
8552 MI.getOperand(5).setReg(
B.buildAnyExt(
S32, Index).getReg(0));
8555 case Intrinsic::amdgcn_swmmac_f16_16x16x64_f16:
8556 case Intrinsic::amdgcn_swmmac_bf16_16x16x64_bf16:
8557 case Intrinsic::amdgcn_swmmac_f32_16x16x64_bf16:
8558 case Intrinsic::amdgcn_swmmac_bf16f32_16x16x64_bf16:
8559 case Intrinsic::amdgcn_swmmac_f32_16x16x64_f16:
8560 case Intrinsic::amdgcn_swmmac_i32_16x16x128_iu8:
8561 case Intrinsic::amdgcn_swmmac_i32_16x16x32_iu4:
8562 case Intrinsic::amdgcn_swmmac_i32_16x16x32_iu8:
8563 case Intrinsic::amdgcn_swmmac_i32_16x16x64_iu4: {
8565 LLT IdxTy = IntrID == Intrinsic::amdgcn_swmmac_i32_16x16x128_iu8
8569 if (IndexArgTy != IdxTy) {
8570 auto NewIndex = IndexArgTy.
isVector() ?
B.buildBitcast(IdxTy, Index)
8571 :
B.buildAnyExt(IdxTy, Index);
8572 MI.getOperand(7).setReg(NewIndex.getReg(0));
8577 case Intrinsic::amdgcn_fmed3: {
8583 MI.setDesc(
B.getTII().get(AMDGPU::G_AMDGPU_FMED3));
8584 MI.removeOperand(1);
8588 case Intrinsic::amdgcn_readlane:
8589 case Intrinsic::amdgcn_writelane:
8590 case Intrinsic::amdgcn_readfirstlane:
8591 case Intrinsic::amdgcn_permlane16:
8592 case Intrinsic::amdgcn_permlanex16:
8593 case Intrinsic::amdgcn_permlane64:
8594 case Intrinsic::amdgcn_set_inactive:
8595 case Intrinsic::amdgcn_set_inactive_chain_arg:
8596 case Intrinsic::amdgcn_mov_dpp8:
8597 case Intrinsic::amdgcn_update_dpp:
8598 case Intrinsic::amdgcn_permlane_bcast:
8599 case Intrinsic::amdgcn_permlane_up:
8600 case Intrinsic::amdgcn_permlane_down:
8601 case Intrinsic::amdgcn_permlane_xor:
8603 case Intrinsic::amdgcn_s_buffer_prefetch_data:
8605 case Intrinsic::amdgcn_dead: {
8609 MI.eraseFromParent();
8612 case Intrinsic::amdgcn_cooperative_atomic_load_32x4B:
8613 case Intrinsic::amdgcn_cooperative_atomic_load_16x8B:
8614 case Intrinsic::amdgcn_cooperative_atomic_load_8x16B:
8615 assert(
MI.hasOneMemOperand() &&
"Expected IRTranslator to set MemOp!");
8616 B.buildLoad(
MI.getOperand(0),
MI.getOperand(2), **
MI.memoperands_begin());
8617 MI.eraseFromParent();
8619 case Intrinsic::amdgcn_cooperative_atomic_store_32x4B:
8620 case Intrinsic::amdgcn_cooperative_atomic_store_16x8B:
8621 case Intrinsic::amdgcn_cooperative_atomic_store_8x16B:
8622 assert(
MI.hasOneMemOperand() &&
"Expected IRTranslator to set MemOp!");
8623 B.buildStore(
MI.getOperand(2),
MI.getOperand(1), **
MI.memoperands_begin());
8624 MI.eraseFromParent();
8626 case Intrinsic::amdgcn_av_load_b128:
8627 case Intrinsic::amdgcn_av_store_b128: {
8629 if (!ST.hasFlatGlobalInsts()) {
8630 const char *Name = IntrID == Intrinsic::amdgcn_av_load_b128
8631 ?
"llvm.amdgcn.av.load.b128"
8632 :
"llvm.amdgcn.av.store.b128";
8635 Fn,
Twine(Name) +
" not supported on subtarget",
MI.getDebugLoc()));
8638 assert(
MI.hasOneMemOperand() &&
"Expected IRTranslator to set MemOp!");
8639 if (IntrID == Intrinsic::amdgcn_av_load_b128)
8640 B.buildLoad(
MI.getOperand(0),
MI.getOperand(2), **
MI.memoperands_begin());
8642 B.buildStore(
MI.getOperand(2),
MI.getOperand(1),
8643 **
MI.memoperands_begin());
8644 MI.eraseFromParent();
8647 case Intrinsic::amdgcn_flat_load_monitor_b32:
8648 case Intrinsic::amdgcn_flat_load_monitor_b64:
8649 case Intrinsic::amdgcn_flat_load_monitor_b128:
8650 assert(
MI.hasOneMemOperand() &&
"Expected IRTranslator to set MemOp!");
8651 B.buildInstr(AMDGPU::G_AMDGPU_FLAT_LOAD_MONITOR)
8652 .add(
MI.getOperand(0))
8653 .add(
MI.getOperand(2))
8654 .addMemOperand(*
MI.memoperands_begin());
8655 MI.eraseFromParent();
8657 case Intrinsic::amdgcn_global_load_monitor_b32:
8658 case Intrinsic::amdgcn_global_load_monitor_b64:
8659 case Intrinsic::amdgcn_global_load_monitor_b128:
8660 assert(
MI.hasOneMemOperand() &&
"Expected IRTranslator to set MemOp!");
8661 B.buildInstr(AMDGPU::G_AMDGPU_GLOBAL_LOAD_MONITOR)
8662 .add(
MI.getOperand(0))
8663 .add(
MI.getOperand(2))
8664 .addMemOperand(*
MI.memoperands_begin());
8665 MI.eraseFromParent();
MachineInstrBuilder & UseMI
MachineInstrBuilder MachineInstrBuilder & DefMI
static unsigned getIntrinsicID(const SDNode *N)
assert(UImm &&(UImm !=~static_cast< T >(0)) &&"Invalid immediate!")
static SDValue extractF64Exponent(SDValue Hi, const SDLoc &SL, SelectionDAG &DAG)
static SDValue getMad(SelectionDAG &DAG, const SDLoc &SL, EVT VT, SDValue X, SDValue Y, SDValue C, SDNodeFlags Flags=SDNodeFlags())
static bool valueIsKnownNeverF32Denorm(SDValue Src)
Return true if it's known that Src can never be an f32 denormal value.
Contains the definition of a TargetInstrInfo class that is common to all AMD GPUs.
static void packImage16bitOpsToDwords(MachineIRBuilder &B, MachineInstr &MI, SmallVectorImpl< Register > &PackedAddrs, unsigned ArgOffset, const AMDGPU::ImageDimIntrinsicInfo *Intr, bool IsA16, bool IsG16)
Turn a set of s16 typed registers in AddrRegs into a dword sized vector with s16 typed elements.
static unsigned getBufferAtomicPseudo(Intrinsic::ID IntrID)
static LLT getBufferRsrcScalarType(const LLT Ty)
static LegalityPredicate isIllegalRegisterType(const GCNSubtarget &ST, unsigned TypeIdx)
static cl::opt< bool > EnableNewLegality("amdgpu-global-isel-new-legality", cl::desc("Use GlobalISel desired legality, rather than try to use" "rules compatible with selection patterns"), cl::init(false), cl::ReallyHidden)
static MachineInstrBuilder buildExp(MachineIRBuilder &B, const DstOp &Dst, const SrcOp &Src, unsigned Flags)
static bool needsDenormHandlingF32(const MachineFunction &MF, Register Src, unsigned Flags)
constexpr std::initializer_list< LLT > AllVectors
static LegalizeMutation bitcastToVectorElement32(unsigned TypeIdx)
static LegalityPredicate isSmallOddVector(unsigned TypeIdx)
static LegalizeMutation oneMoreElement(unsigned TypeIdx)
static LegalityPredicate vectorSmallerThan(unsigned TypeIdx, unsigned Size)
static bool allowApproxFunc(const MachineFunction &MF, unsigned Flags)
static bool shouldBitcastLoadStoreType(const GCNSubtarget &ST, const LLT Ty, const LLT MemTy)
Return true if a load or store of the type should be lowered with a bitcast to a different type.
static constexpr unsigned FPEnvModeBitField
static LegalizeMutation getScalarTypeFromMemDesc(unsigned TypeIdx)
static LegalityPredicate vectorWiderThan(unsigned TypeIdx, unsigned Size)
static bool shouldWidenLoad(const GCNSubtarget &ST, LLT MemoryTy, uint64_t AlignInBits, unsigned AddrSpace, unsigned Opcode)
Return true if we should legalize a load by widening an odd sized memory access up to the alignment.
static bool isRegisterVectorElementType(LLT EltTy)
static LegalizeMutation fewerEltsToSize64Vector(unsigned TypeIdx)
static LegalityPredicate isWideVec16(unsigned TypeIdx)
constexpr std::initializer_list< LLT > AllScalarTypes
static LegalityPredicate isTruncStoreToSizePowerOf2(unsigned TypeIdx)
constexpr std::initializer_list< LLT > AllS32Vectors
static LegalizeMutation moreElementsToNextExistingRegClass(unsigned TypeIdx)
static Register castBufferRsrcToV4I32(Register Pointer, MachineIRBuilder &B)
Cast a buffer resource (an address space 8 pointer) into a 4xi32, which is the form in which the valu...
static bool isRegisterClassType(const GCNSubtarget &ST, LLT Ty)
static std::pair< Register, Register > emitReciprocalU64(MachineIRBuilder &B, Register Val)
static LLT getBitcastRegisterType(const LLT Ty)
static LLT getBufferRsrcRegisterType(const LLT Ty)
static LegalizeMutation bitcastToRegisterType(unsigned TypeIdx)
static Register stripAnySourceMods(Register OrigSrc, MachineRegisterInfo &MRI)
static LLT castBufferRsrcFromV4I32(MachineInstr &MI, MachineIRBuilder &B, MachineRegisterInfo &MRI, unsigned Idx)
Mutates IR (typically a load instruction) to use a <4 x s32> as the initial type of the operand idx an...
static bool replaceWithConstant(MachineIRBuilder &B, MachineInstr &MI, int64_t C)
static constexpr unsigned SPDenormModeBitField
static unsigned maxSizeForAddrSpace(const GCNSubtarget &ST, unsigned AS, bool IsLoad, bool IsAtomic)
static bool isLoadStoreSizeLegal(const GCNSubtarget &ST, const LegalityQuery &Query)
static MachineInstr * verifyCFIntrinsic(MachineInstr &MI, MachineRegisterInfo &MRI, MachineInstr *&Br, MachineBasicBlock *&UncondBrTarget, bool &Negated)
static LegalityPredicate numElementsNotEven(unsigned TypeIdx)
constexpr std::initializer_list< LLT > AllS64Vectors
static void castBufferRsrcArgToV4I32(MachineInstr &MI, MachineIRBuilder &B, unsigned Idx)
static constexpr unsigned FPEnvTrapBitField
static constexpr unsigned MaxRegisterSize
static bool isRegisterSize(const GCNSubtarget &ST, unsigned Size)
static LegalityPredicate isWideScalarExtLoadTruncStore(unsigned TypeIdx)
static bool hasBufferRsrcWorkaround(const LLT Ty)
static void toggleSPDenormMode(bool Enable, MachineIRBuilder &B, const GCNSubtarget &ST, SIModeRegisterDefaults Mode)
constexpr std::initializer_list< LLT > AllS16Vectors
static bool loadStoreBitcastWorkaround(const LLT Ty)
static LLT widenToNextPowerOf2(LLT Ty)
static bool isNot(const MachineRegisterInfo &MRI, const MachineInstr &MI)
static void convertImageAddrToPacked(MachineIRBuilder &B, MachineInstr &MI, int DimIdx, int NumVAddrs)
Convert from separate vaddr components to a single vector address register, and replace the remaining...
static bool isLoadStoreLegal(const GCNSubtarget &ST, const LegalityQuery &Query)
static LegalizeMutation moreEltsToNext32Bit(unsigned TypeIdx)
static LLT getPow2VectorType(LLT Ty)
static void buildBufferLoad(unsigned Opc, Register LoadDstReg, Register RSrc, Register VIndex, Register VOffset, Register SOffset, unsigned ImmOffset, unsigned Format, unsigned AuxiliaryData, MachineMemOperand *MMO, bool IsTyped, bool HasVIndex, MachineIRBuilder &B)
static LLT getPow2ScalarType(LLT Ty)
static LegalityPredicate elementTypeIsLegal(unsigned TypeIdx)
static bool isRegisterVectorType(LLT Ty)
static LegalityPredicate sizeIsMultipleOf32(unsigned TypeIdx)
static bool isRegisterType(const GCNSubtarget &ST, LLT Ty)
static bool isKnownNonNull(Register Val, MachineRegisterInfo &MRI, const AMDGPUTargetMachine &TM, unsigned AddrSpace)
Return true if the value is a known valid address, such that a null check is not necessary.
This file declares the targeting of the Machinelegalizer class for AMDGPU.
Provides AMDGPU specific target descriptions.
The AMDGPU TargetMachine interface definition for hw codegen targets.
MachineBasicBlock MachineBasicBlock::iterator DebugLoc DL
static GCRegistry::Add< ErlangGC > A("erlang", "erlang-compatible garbage collector")
static GCRegistry::Add< OcamlGC > B("ocaml", "ocaml 3.10-compatible GC")
Declares convenience wrapper classes for interpreting MachineInstr instances as specific generic oper...
const AbstractManglingParser< Derived, Alloc >::OperatorInfo AbstractManglingParser< Derived, Alloc >::Ops[]
Interface for Targets to specify which operations they can successfully select and how the others sho...
Contains matchers for matching SSA Machine Instructions.
This file declares the MachineIRBuilder class.
Register const TargetRegisterInfo * TRI
Promote Memory to Register
static MCRegister getReg(const MCDisassembler *D, unsigned RC, unsigned RegNo)
const SmallVectorImpl< MachineOperand > & Cond
static cl::opt< RegAllocEvictionAdvisorAnalysisLegacy::AdvisorMode > Mode("regalloc-enable-advisor", cl::Hidden, cl::init(RegAllocEvictionAdvisorAnalysisLegacy::AdvisorMode::Default), cl::desc("Enable regalloc advisor mode"), cl::values(clEnumValN(RegAllocEvictionAdvisorAnalysisLegacy::AdvisorMode::Default, "default", "Default"), clEnumValN(RegAllocEvictionAdvisorAnalysisLegacy::AdvisorMode::Release, "release", "precompiled"), clEnumValN(RegAllocEvictionAdvisorAnalysisLegacy::AdvisorMode::Development, "development", "for training")))
#define FP_DENORM_FLUSH_NONE
Interface definition for SIInstrInfo.
Interface definition for SIRegisterInfo.
This file defines the make_scope_exit function, which executes user-defined cleanup logic at scope ex...
static TableGen::Emitter::Opt Y("gen-skeleton-entry", EmitSkeleton, "Generate example skeleton entry")
static constexpr int Concat[]
bool legalizeConstHwRegRead(MachineInstr &MI, MachineIRBuilder &B, AMDGPU::Hwreg::Id HwReg, unsigned LowBit, unsigned Width) const
void buildMultiply(LegalizerHelper &Helper, MutableArrayRef< Register > Accum, ArrayRef< Register > Src0, ArrayRef< Register > Src1, bool UsePartialMad64_32, bool SeparateOddAlignedProducts) const
bool legalizeGlobalValue(MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B) const
bool legalizeFSQRTF16(MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B) const
bool legalizeIntrinsicTrunc(MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B) const
bool legalizeInsert(LegalizerHelper &Helper, MachineInstr &MI) const
std::pair< Register, unsigned > splitBufferOffsets(MachineIRBuilder &B, Register OrigOffset) const
bool legalizeBVHIntersectRayIntrinsic(MachineInstr &MI, MachineIRBuilder &B) const
bool legalizeIsAddrSpace(MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B, unsigned AddrSpace) const
bool legalizeUnsignedDIV_REM(MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B) const
bool legalizeFSQRTF32(MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B) const
bool legalizeCTLZ_ZERO_POISON(MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B) const
bool legalizeAtomicCmpXChg(MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B) const
bool legalizeTrapHsa(MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B) const
bool legalizeBufferStore(MachineInstr &MI, LegalizerHelper &Helper, bool IsTyped, bool IsFormat) const
bool legalizeMul(LegalizerHelper &Helper, MachineInstr &MI) const
bool legalizeFFREXP(MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B) const
Register getSegmentAperture(unsigned AddrSpace, MachineRegisterInfo &MRI, MachineIRBuilder &B) const
bool legalizeFDIV64(MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B) const
bool legalizePointerAsRsrcIntrin(MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B) const
To create a buffer resource from a 64-bit pointer, mask off the upper 32 bits of the pointer and repl...
bool legalizeFlogCommon(MachineInstr &MI, MachineIRBuilder &B) const
bool getLDSKernelId(Register DstReg, MachineRegisterInfo &MRI, MachineIRBuilder &B) const
bool legalizeFExp2(MachineInstr &MI, MachineIRBuilder &B) const
bool legalizeTrap(MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B) const
bool legalizeBufferAtomic(MachineInstr &MI, MachineIRBuilder &B, Intrinsic::ID IID) const
void legalizeUnsignedDIV_REM32Impl(MachineIRBuilder &B, Register DstDivReg, Register DstRemReg, Register Num, Register Den) const
Register handleD16VData(MachineIRBuilder &B, MachineRegisterInfo &MRI, Register Reg, bool ImageStore=false) const
Handle register layout difference for f16 images for some subtargets.
bool legalizeCTLZ_CTTZ(MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B) const
bool legalizeBuildVector(MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B) const
bool legalizeFFloor(MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B) const
AMDGPULegalizerInfo(const GCNSubtarget &ST, const GCNTargetMachine &TM)
bool legalizeFDIV32(MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B) const
bool legalizeFMad(MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B) const
bool legalizeFDIV(MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B) const
bool legalizeSBufferPrefetch(LegalizerHelper &Helper, MachineInstr &MI) const
bool legalizeFExp10Unsafe(MachineIRBuilder &B, Register Dst, Register Src, unsigned Flags) const
bool legalizeFExp(MachineInstr &MI, MachineIRBuilder &B) const
bool legalizeIntrinsic(LegalizerHelper &Helper, MachineInstr &MI) const override
bool legalizeFrem(MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B) const
bool legalizePreloadedArgIntrin(MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B, AMDGPUFunctionArgInfo::PreloadedValue ArgType) const
bool legalizeStore(LegalizerHelper &Helper, MachineInstr &MI) const
bool legalizeCustom(LegalizerHelper &Helper, MachineInstr &MI, LostDebugLocObserver &LocObserver) const override
Called for instructions with the Custom LegalizationAction.
bool buildPCRelGlobalAddress(Register DstReg, LLT PtrTy, MachineIRBuilder &B, const GlobalValue *GV, int64_t Offset, unsigned GAFlags=SIInstrInfo::MO_NONE) const
MachinePointerInfo getKernargSegmentPtrInfo(MachineFunction &MF) const
bool legalizeFDIV16(MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B) const
bool legalizeRsqClampIntrinsic(MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B) const
bool legalizeFExpUnsafeImpl(MachineIRBuilder &B, Register Dst, Register Src, unsigned Flags, bool IsExp10) const
std::pair< Register, Register > getScaledLogInput(MachineIRBuilder &B, Register Src, unsigned Flags) const
bool legalizeFDIVFastIntrin(MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B) const
bool loadInputValue(Register DstReg, MachineIRBuilder &B, AMDGPUFunctionArgInfo::PreloadedValue ArgType) const
bool legalizeBVHDualOrBVH8IntersectRayIntrinsic(MachineInstr &MI, MachineIRBuilder &B) const
bool legalizeInsertVectorElt(MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B) const
bool legalizeFExpUnsafe(MachineIRBuilder &B, Register Dst, Register Src, unsigned Flags) const
bool legalizeFEXPF64(MachineInstr &MI, MachineIRBuilder &B) const
bool legalizeAddrSpaceCast(MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B) const
bool legalizeExtract(LegalizerHelper &Helper, MachineInstr &MI) const
bool legalizeBufferLoad(MachineInstr &MI, LegalizerHelper &Helper, bool IsFormat, bool IsTyped) const
bool legalizeImplicitArgPtr(MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B) const
bool legalizeMinNumMaxNum(LegalizerHelper &Helper, MachineInstr &MI) const
void legalizeUnsignedDIV_REM64Impl(MachineIRBuilder &B, Register DstDivReg, Register DstRemReg, Register Num, Register Den) const
bool legalizeDebugTrap(MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B) const
bool legalizeFastUnsafeFDIV(MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B) const
bool legalizeSinCos(MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B) const
bool legalizeCTLS(MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B) const
bool legalizeWaveID(MachineInstr &MI, MachineIRBuilder &B) const
bool legalizeFroundeven(MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B) const
bool legalizeLDSKernelId(MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B) const
bool legalizeWorkGroupId(MachineInstr &MI, MachineIRBuilder &B, AMDGPUFunctionArgInfo::PreloadedValue ClusterIdPV, AMDGPUFunctionArgInfo::PreloadedValue ClusterMaxIdPV, AMDGPUFunctionArgInfo::PreloadedValue ClusterWorkGroupIdPV) const
bool legalizeSignedDIV_REM(MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B) const
bool legalizeITOFP(MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B, bool Signed) const
bool legalizeFPow(MachineInstr &MI, MachineIRBuilder &B) const
bool legalizeFastUnsafeFDIV64(MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B) const
bool legalizeFPTOI(MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B, bool Signed) const
bool legalizeStackSave(MachineInstr &MI, MachineIRBuilder &B) const
bool legalizeFlogUnsafe(MachineIRBuilder &B, Register Dst, Register Src, bool IsLog10, unsigned Flags) const
bool legalizeKernargMemParameter(MachineInstr &MI, MachineIRBuilder &B, uint64_t Offset, Align Alignment=Align(4)) const
Legalize a value that's loaded from kernel arguments.
bool legalizeImageIntrinsic(MachineInstr &MI, MachineIRBuilder &B, GISelChangeObserver &Observer, const AMDGPU::ImageDimIntrinsicInfo *ImageDimIntr) const
Rewrite image intrinsics to use register layouts expected by the subtarget.
void buildAbsGlobalAddress(Register DstReg, LLT PtrTy, MachineIRBuilder &B, const GlobalValue *GV, MachineRegisterInfo &MRI) const
bool legalizeGetFPEnv(MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B) const
bool getImplicitArgPtr(Register DstReg, MachineRegisterInfo &MRI, MachineIRBuilder &B) const
bool legalizeFSQRT(MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B) const
Register getKernargParameterPtr(MachineIRBuilder &B, int64_t Offset) const
bool legalizeSBufferLoad(LegalizerHelper &Helper, MachineInstr &MI) const
bool legalizeFceil(MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B) const
bool legalizeFSQRTF64(MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B) const
bool legalizeExtractVectorElt(MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B) const
bool legalizeLoad(LegalizerHelper &Helper, MachineInstr &MI) const
Register fixStoreSourceType(MachineIRBuilder &B, Register VData, LLT MemTy, bool IsFormat) const
bool legalizeLaneOp(LegalizerHelper &Helper, MachineInstr &MI, Intrinsic::ID IID) const
bool legalizeSetFPEnv(MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B) const
bool legalizeWorkitemIDIntrinsic(MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B, unsigned Dim, AMDGPUFunctionArgInfo::PreloadedValue ArgType) const
void buildLoadInputValue(Register DstReg, MachineIRBuilder &B, const ArgDescriptor *Arg, const TargetRegisterClass *ArgRC, LLT ArgTy) const
bool legalizeTrapHsaQueuePtr(MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B) const
bool legalizeFlog2(MachineInstr &MI, MachineIRBuilder &B) const
bool legalizeTrapEndpgm(MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B) const
static std::optional< uint32_t > getLDSKernelIdMetadata(const Function &F)
bool isModuleEntryFunction() const
void setDynLDSAlign(const Function &F, const GlobalVariable &GV)
unsigned allocateLDSGlobal(const DataLayout &DL, const GlobalVariable &GV)
bool isBottomOfStack() const
bool isEntryFunction() const
bool isNoopAddrSpaceCast(unsigned SrcAS, unsigned DestAS) const override
Returns true if a cast between SrcAS and DestAS is a noop.
const std::array< unsigned, 3 > & getDims() const
static const fltSemantics & IEEEsingle()
static const fltSemantics & IEEEdouble()
static APFloat getSmallestNormalized(const fltSemantics &Sem, bool Negative=false)
Returns the smallest (by magnitude) normalized finite number in the given semantics.
static APFloat getLargest(const fltSemantics &Sem, bool Negative=false)
Returns the largest finite number in the given semantics.
static APFloat getInf(const fltSemantics &Sem, bool Negative=false)
Factory for Positive and Negative Infinity.
Represent a constant reference to an array (0 or more elements consecutively in memory),...
size_t size() const
Get the array size.
@ FCMP_OEQ
0 0 0 1 True if ordered and equal
@ ICMP_SLT
signed less than
@ FCMP_OLT
0 1 0 0 True if ordered and less than
@ FCMP_ULE
1 1 0 1 True if unordered, less than, or equal
@ FCMP_OGT
0 0 1 0 True if ordered and greater than
@ ICMP_UGE
unsigned greater or equal
@ ICMP_SGT
signed greater than
@ FCMP_ONE
0 1 1 0 True if ordered and operands are unequal
@ ICMP_ULT
unsigned less than
@ FCMP_OLE
0 1 0 1 True if ordered and less than or equal
@ FCMP_ORD
0 1 1 1 True if ordered (no nans)
@ FCMP_UGE
1 0 1 1 True if unordered, greater than, or equal
ConstantFP - Floating Point Values [float, double].
bool isMinusOne() const
Returns true if this value is exactly -1.0.
bool isOne() const
Returns true if this value is exactly +1.0.
This is the shared class of boolean and integer constants.
int64_t getSExtValue() const
Return the constant as a 64-bit integer value after it has been sign extended as appropriate for the ...
Diagnostic information for unsupported feature in backend.
static constexpr ElementCount getFixed(ScalarTy MinVal)
LLVMContext & getContext() const
getContext - Return a reference to the LLVMContext associated with this function.
Abstract class that contains various methods for clients to notify about changes.
virtual void changingInstr(MachineInstr &MI)=0
This instruction is about to be mutated in some way.
virtual void changedInstr(MachineInstr &MI)=0
This instruction was mutated in some way.
Simple wrapper observer that takes several observers, and calls each one for each event.
KnownBits getKnownBits(Register R)
bool hasExternalLinkage() const
Module * getParent()
Get the module that this global value is contained inside of...
LLVM_ABI const DataLayout & getDataLayout() const
Get the data layout of the module this global belongs to.
LLVM_ABI uint64_t getGlobalSize(const DataLayout &DL) const
Get the size of this global variable in bytes.
LLT changeElementCount(ElementCount EC) const
Return a vector or scalar with the same element type and the new element count.
constexpr unsigned getScalarSizeInBits() const
constexpr bool isScalar() const
constexpr LLT changeElementType(LLT NewEltTy) const
If this type is a vector, return a vector with the same number of elements but the new element type.
static constexpr LLT vector(ElementCount EC, unsigned ScalarSizeInBits)
Get a low-level vector of some number of elements and element width.
LLT getScalarType() const
static constexpr LLT scalar(unsigned SizeInBits)
Get a low-level scalar or aggregate "bag of bits".
constexpr uint16_t getNumElements() const
Returns the number of elements in a vector LLT.
constexpr bool isVector() const
static constexpr LLT pointer(unsigned AddressSpace, unsigned SizeInBits)
Get a low-level pointer in the given address space.
constexpr TypeSize getSizeInBits() const
Returns the total size of the type. Must only be called on sized types.
constexpr bool isPointer() const
constexpr bool isAnyScalar() const
constexpr unsigned getAddressSpace() const
static constexpr LLT fixed_vector(unsigned NumElements, unsigned ScalarSizeInBits)
Get a low-level fixed-width vector of some number of elements and element width.
LLT getElementType() const
Returns the vector's element type. Only valid for vector types.
static constexpr LLT scalarOrVector(ElementCount EC, LLT ScalarTy)
LLT changeElementSize(unsigned NewEltSize) const
If this type is a vector, return a vector with the same number of elements but the new element size.
LLVM_ABI void diagnose(const DiagnosticInfo &DI)
Report a message to the currently installed diagnostic handler.
LegalizeRuleSet & minScalar(unsigned TypeIdx, const LLT Ty)
Ensure the scalar is at least as wide as Ty.
LegalizeRuleSet & legalFor(std::initializer_list< LLT > Types)
The instruction is legal when type index 0 is any type in the given list.
LegalizeRuleSet & unsupported()
The instruction is unsupported.
LegalizeRuleSet & scalarSameSizeAs(unsigned TypeIdx, unsigned SameSizeIdx)
Change the type TypeIdx to have the same scalar size as type SameSizeIdx.
LegalizeRuleSet & fewerElementsIf(LegalityPredicate Predicate, LegalizeMutation Mutation)
Remove elements to reach the type selected by the mutation if the predicate is true.
LegalizeRuleSet & clampScalarOrElt(unsigned TypeIdx, const LLT MinTy, const LLT MaxTy)
Limit the range of scalar sizes to MinTy and MaxTy.
LegalizeRuleSet & maxScalar(unsigned TypeIdx, const LLT Ty)
Ensure the scalar is at most as wide as Ty.
LegalizeRuleSet & minScalarOrElt(unsigned TypeIdx, const LLT Ty)
Ensure the scalar or element is at least as wide as Ty.
LegalizeRuleSet & clampMaxNumElements(unsigned TypeIdx, const LLT EltTy, unsigned MaxElements)
Limit the number of elements in EltTy vectors to at most MaxElements.
LegalizeRuleSet & unsupportedFor(std::initializer_list< LLT > Types)
LegalizeRuleSet & lower()
The instruction is lowered.
LegalizeRuleSet & moreElementsIf(LegalityPredicate Predicate, LegalizeMutation Mutation)
Add more elements to reach the type selected by the mutation if the predicate is true.
LegalizeRuleSet & lowerFor(std::initializer_list< LLT > Types)
The instruction is lowered when type index 0 is any type in the given list.
LegalizeRuleSet & clampScalar(unsigned TypeIdx, const LLT MinTy, const LLT MaxTy)
Limit the range of scalar sizes to MinTy and MaxTy.
LegalizeRuleSet & custom()
Unconditionally custom lower.
LegalizeRuleSet & clampMaxNumElementsStrict(unsigned TypeIdx, const LLT EltTy, unsigned NumElts)
Express EltTy vectors strictly using vectors with NumElts elements (or scalars when NumElts equals 1)...
LegalizeRuleSet & alwaysLegal()
LegalizeRuleSet & maxScalarIf(LegalityPredicate Predicate, unsigned TypeIdx, const LLT Ty)
Conditionally limit the maximum size of the scalar.
LegalizeRuleSet & customIf(LegalityPredicate Predicate)
LegalizeRuleSet & widenScalarToNextPow2(unsigned TypeIdx, unsigned MinSize=0)
Widen the scalar to the next power of two that is at least MinSize.
LegalizeRuleSet & scalarize(unsigned TypeIdx)
LegalizeRuleSet & legalForCartesianProduct(std::initializer_list< LLT > Types)
The instruction is legal when type indexes 0 and 1 are both in the given list.
LegalizeRuleSet & minScalarIf(LegalityPredicate Predicate, unsigned TypeIdx, const LLT Ty)
Ensure the scalar is at least as wide as Ty if condition is met.
LegalizeRuleSet & legalIf(LegalityPredicate Predicate)
The instruction is legal if predicate is true.
LegalizeRuleSet & customFor(std::initializer_list< LLT > Types)
LegalizeRuleSet & widenScalarToNextMultipleOf(unsigned TypeIdx, unsigned Size)
Widen the scalar to the next multiple of Size.
LLVM_ABI LegalizeResult lowerFMinNumMaxNum(MachineInstr &MI)
LLVM_ABI void moreElementsVectorDst(MachineInstr &MI, LLT MoreTy, unsigned OpIdx)
Legalize a single operand OpIdx of the machine instruction MI as a Def by performing it with addition...
LLVM_ABI LegalizeResult lowerInsert(MachineInstr &MI)
LLVM_ABI LegalizeResult lowerExtract(MachineInstr &MI)
GISelValueTracking * getValueTracking() const
@ Legalized
Instruction has been legalized and the MachineFunction changed.
GISelChangeObserver & Observer
To keep track of changes made by the LegalizerHelper.
LLVM_ABI void bitcastDst(MachineInstr &MI, LLT CastTy, unsigned OpIdx)
Legalize a single operand OpIdx of the machine instruction MI as a def by inserting a G_BITCAST from ...
LLVM_ABI LegalizeResult lowerFMad(MachineInstr &MI)
MachineIRBuilder & MIRBuilder
Expose MIRBuilder so clients can set their own RecordInsertInstruction functions.
LLVM_ABI void widenScalarDst(MachineInstr &MI, LLT WideTy, unsigned OpIdx=0, unsigned TruncOpcode=TargetOpcode::G_TRUNC)
Legalize a single operand OpIdx of the machine instruction MI as a Def by extending the operand's typ...
LegalizeRuleSet & getActionDefinitionsBuilder(unsigned Opcode)
Get the action definition builder for the given opcode.
TypeSize getValue() const
Wrapper class representing physical registers. Should be passed by value.
constexpr bool isPhysical() const
Return true if the specified register number is in the physical register namespace.
LLVM_ABI void addSuccessor(MachineBasicBlock *Succ, BranchProbability Prob=BranchProbability::getUnknown())
Add Succ as a successor of this MachineBasicBlock.
LLVM_ABI MachineBasicBlock * splitAt(MachineInstr &SplitInst, bool UpdateLiveIns=true, LiveIntervals *LIS=nullptr)
Split a basic block into 2 pieces at SplitPoint.
const MachineFunction * getParent() const
Return the MachineFunction containing this basic block.
MachineInstrBundleIterator< MachineInstr > iterator
PseudoSourceValueManager & getPSVManager() const
const TargetSubtargetInfo & getSubtarget() const
getSubtarget - Return the subtarget for which this machine code is being compiled.
MachineMemOperand * getMachineMemOperand(MachinePointerInfo PtrInfo, MachineMemOperand::Flags f, LLT MemTy, Align base_alignment, const AAMDNodes &AAInfo=AAMDNodes(), const MDNode *Ranges=nullptr, SyncScope::ID SSID=SyncScope::System, AtomicOrdering Ordering=AtomicOrdering::NotAtomic, AtomicOrdering FailureOrdering=AtomicOrdering::NotAtomic)
getMachineMemOperand - Allocate a new MachineMemOperand.
DenormalMode getDenormalMode(const fltSemantics &FPType) const
Returns the denormal handling type for the default rounding mode of the function.
void push_back(MachineBasicBlock *MBB)
MachineRegisterInfo & getRegInfo()
getRegInfo - Return information about the registers currently in use.
Function & getFunction()
Return the LLVM function that this machine code represents.
BasicBlockListType::iterator iterator
Ty * getInfo()
getInfo - Keep track of various per-function pieces of information for backends that would like to do...
MachineBasicBlock * CreateMachineBasicBlock(const BasicBlock *BB=nullptr, std::optional< UniqueBBID > BBID=std::nullopt)
CreateMachineBasicBlock - Allocate a new MachineBasicBlock.
const TargetMachine & getTarget() const
getTarget - Return the target machine this machine code is compiled with
Helper class to build MachineInstr.
MachineFunction & getMF()
Getter for the function we currently build.
Register getReg(unsigned Idx) const
Get the register for the operand index.
const MachineInstrBuilder & addImm(int64_t Val) const
Add a new immediate operand.
const MachineInstrBuilder & addGlobalAddress(const GlobalValue *GV, int64_t Offset=0, unsigned TargetFlags=0) const
const MachineInstrBuilder & addMBB(MachineBasicBlock *MBB, unsigned TargetFlags=0) const
Representation of each machine instruction.
const MachineOperand & getOperand(unsigned i) const
A description of a memory reference used in the backend.
LocationSize getSize() const
Return the size in bytes of the memory reference.
LLT getMemoryType() const
Return the memory type of the memory reference.
@ MODereferenceable
The memory access is dereferenceable (i.e., doesn't trap).
@ MOLoad
The memory access reads data.
@ MOInvariant
The memory access always returns the same value (or traps).
LLVM_ABI Align getAlign() const
Return the minimum known alignment in bytes of the actual memory reference.
MachineOperand class - Representation of each machine instruction operand.
MachineBasicBlock * getMBB() const
LLVM_ABI void setReg(Register Reg)
Change the register this operand corresponds to.
void setMBB(MachineBasicBlock *MBB)
static MachineOperand CreateImm(int64_t Val)
Register getReg() const
getReg - Returns the register number.
MachineRegisterInfo - Keep track of information for virtual and physical registers,...
LLVM_ABI bool hasOneNonDBGUse(Register RegNo) const
hasOneNonDBGUse - Return true if there is exactly one non-Debug use of the specified register.
LLVM_ABI MachineInstr * getVRegDef(Register Reg) const
getVRegDef - Return the machine instr that defines the specified virtual register or null if none is ...
LLVM_ABI Register createVirtualRegister(const TargetRegisterClass *RegClass, StringRef Name="")
createVirtualRegister - Create and return a new virtual register in the function with the specified r...
LLT getType(Register Reg) const
Get the low-level type of Reg or LLT{} if Reg is not a generic (target independent) virtual register.
use_instr_nodbg_iterator use_instr_nodbg_begin(Register RegNo) const
LLVM_ABI void setRegClass(Register Reg, const TargetRegisterClass *RC)
setRegClass - Set the register class of the specified virtual register.
LLVM_ABI Register createGenericVirtualRegister(LLT Ty, StringRef Name="")
Create and return a new generic virtual register with low-level type Ty.
const TargetRegisterClass * getRegClassOrNull(Register Reg) const
Return the register class of Reg, or null if Reg has not been assigned a register class yet.
const TargetRegisterInfo * getTargetRegisterInfo() const
LLVM_ABI void replaceRegWith(Register FromReg, Register ToReg)
replaceRegWith - Replace all instances of FromReg with ToReg in the machine function.
Represent a mutable reference to an array (0 or more elements consecutively in memory),...
MutableArrayRef< T > drop_front(size_t N=1) const
Drop the first N elements of the array.
LLVM_ABI const PseudoSourceValue * getConstantPool()
Return a pseudo source value referencing the constant pool.
Wrapper class representing virtual and physical registers.
constexpr bool isValid() const
constexpr bool isVirtual() const
Return true if the specified register number is in the virtual register namespace.
static unsigned getMaxMUBUFImmOffset(const GCNSubtarget &ST)
This class keeps track of the SPI_SP_INPUT_ADDR config register, which tells the hardware which inter...
bool hasWorkGroupIDZ() const
AMDGPU::ClusterDimsAttr getClusterDims() const
SIModeRegisterDefaults getMode() const
std::tuple< const ArgDescriptor *, const TargetRegisterClass *, LLT > getPreloadedValue(AMDGPUFunctionArgInfo::PreloadedValue Value) const
static LLVM_READONLY const TargetRegisterClass * getSGPRClassForBitWidth(unsigned BitWidth)
bool allowsMisalignedMemoryAccessesImpl(unsigned Size, unsigned AddrSpace, Align Alignment, MachineMemOperand::Flags Flags=MachineMemOperand::MONone, unsigned *IsFast=nullptr) const
bool shouldEmitFixup(const GlobalValue *GV) const
bool shouldUseLDSConstAddress(const GlobalValue *GV) const
bool shouldEmitPCReloc(const GlobalValue *GV) const
This class consists of common code factored out of the SmallVector class to reduce code duplication b...
void truncate(size_type N)
Like resize, but requires that N is less than size().
void push_back(const T &Elt)
This is a 'vector' (really, a variable-sized array), optimized for the case when the array is small.
Register getStackPointerRegisterToSaveRestore() const
If a physical register, this specifies the register that llvm.savestack/llvm.restorestack should save...
unsigned getPointerSizeInBits(unsigned AS) const
Twine - A lightweight data structure for efficiently representing the concatenation of temporary valu...
A Use represents the edge between a Value definition and its users.
LLVM_ABI StringRef getName() const
Return a constant reference to the value's name.
self_iterator getIterator()
#define llvm_unreachable(msg)
Marks that the current location is not supposed to be reachable.
@ CONSTANT_ADDRESS_32BIT
Address space for 32-bit constant memory.
@ BUFFER_STRIDED_POINTER
Address space for 192-bit fat buffer pointers with an additional index.
@ REGION_ADDRESS
Address space for region memory. (GDS)
@ LOCAL_ADDRESS
Address space for local memory.
@ CONSTANT_ADDRESS
Address space for constant memory (VTX2).
@ FLAT_ADDRESS
Address space for flat memory.
@ GLOBAL_ADDRESS
Address space for global memory (RAT0, VTX0).
@ BUFFER_FAT_POINTER
Address space for 160-bit buffer fat pointers.
@ PRIVATE_ADDRESS
Address space for private memory.
@ BUFFER_RESOURCE
Address space for 128-bit buffer resources.
int getMIMGOpcode(unsigned BaseOpcode, unsigned MIMGEncoding, unsigned VDataDwords, unsigned VAddrDwords)
bool isFlatGlobalAddrSpace(unsigned AS)
bool isGFX12Plus(const MCSubtargetInfo &STI)
constexpr int64_t getNullPointerValue(unsigned AS)
Get the null pointer value for the given address space.
bool isGFX11(const MCSubtargetInfo &STI)
LLVM_READNONE bool isLegalDPALU_DPPControl(const MCSubtargetInfo &ST, unsigned DC)
unsigned getAMDHSACodeObjectVersion(const Module &M)
LLVM_READNONE constexpr bool isKernel(CallingConv::ID CC)
LLVM_READNONE constexpr bool isEntryFunctionCC(CallingConv::ID CC)
LLVM_READNONE constexpr bool isCompute(CallingConv::ID CC)
TargetExtType * isNamedBarrier(const GlobalVariable &GV)
bool isGFX11Plus(const MCSubtargetInfo &STI)
LLVM_READONLY const MIMGBaseOpcodeInfo * getMIMGBaseOpcodeInfo(unsigned BaseOpcode)
std::pair< Register, unsigned > getBaseWithConstantOffset(MachineRegisterInfo &MRI, Register Reg, GISelValueTracking *ValueTracking=nullptr, bool CheckNUW=false)
Returns base register and constant offset.
const ImageDimIntrinsicInfo * getImageDimIntrinsicInfo(unsigned Intr)
unsigned ID
LLVM IR allows arbitrary numbers to be used as calling convention identifiers.
@ MaxID
The highest possible ID. Must be some 2^k - 1.
@ AMDGPU_Gfx
Used for AMD graphics targets.
@ Fast
Attempts to make calls as fast as possible (e.g. by passing things in registers).
@ C
The default llvm calling convention, compatible with C.
LLVM_ABI LegalityPredicate scalarOrEltWiderThan(unsigned TypeIdx, unsigned Size)
True iff the specified type index is a scalar or a vector with an element type that's wider than the ...
LLVM_ABI LegalityPredicate isScalar(unsigned TypeIdx)
True iff the specified type index is a scalar.
LLVM_ABI LegalityPredicate isPointer(unsigned TypeIdx)
True iff the specified type index is a pointer (with any address space).
LLVM_ABI LegalityPredicate typeInSet(unsigned TypeIdx, std::initializer_list< LLT > TypesInit)
True iff the given type index is one of the specified types.
LLVM_ABI LegalityPredicate smallerThan(unsigned TypeIdx0, unsigned TypeIdx1)
True iff the first type index has a smaller total bit size than second type index.
LLVM_ABI LegalityPredicate largerThan(unsigned TypeIdx0, unsigned TypeIdx1)
True iff the first type index has a larger total bit size than second type index.
LLVM_ABI LegalityPredicate elementTypeIs(unsigned TypeIdx, LLT EltTy)
True if the type index is a vector with element type EltTy.
LLVM_ABI LegalityPredicate sameSize(unsigned TypeIdx0, unsigned TypeIdx1)
True iff the specified type indices are both the same bit size.
LLVM_ABI LegalityPredicate scalarOrEltNarrowerThan(unsigned TypeIdx, unsigned Size)
True iff the specified type index is a scalar or vector with an element type that's narrower than the...
LegalityPredicate typeIsNot(unsigned TypeIdx, LLT Type)
True iff the given type index is not the specified type.
Predicate all(Predicate P0, Predicate P1)
True iff P0 and P1 are true.
LLVM_ABI LegalityPredicate typeIs(unsigned TypeIdx, LLT TypesInit)
True iff the given type index is the specified type.
LLVM_ABI LegalityPredicate scalarNarrowerThan(unsigned TypeIdx, unsigned Size)
True iff the specified type index is a scalar that's narrower than the given size.
LLVM_ABI LegalizeMutation scalarize(unsigned TypeIdx)
Break up the vector type for the given type index into the element type.
LLVM_ABI LegalizeMutation widenScalarOrEltToNextPow2(unsigned TypeIdx, unsigned Min=0)
Widen the scalar type or vector element type for the given type index to the next power of 2.
LLVM_ABI LegalizeMutation changeTo(unsigned TypeIdx, LLT Ty)
Select this specific type for the given type index.
Invariant opcodes: All instruction sets have these as their low opcodes.
initializer< Ty > init(const Ty &Val)
This is an optimization pass for GlobalISel generic memory operations.
LLVM_ABI Register getFunctionLiveInPhysReg(MachineFunction &MF, const TargetInstrInfo &TII, MCRegister PhysReg, const TargetRegisterClass &RC, const DebugLoc &DL, LLT RegTy=LLT())
Return a virtual register corresponding to the incoming argument register PhysReg.
unsigned Log2_32_Ceil(uint32_t Value)
Return the ceil log base 2 of the specified value, 32 if the value is zero.
LLVM_ABI Type * getTypeForLLT(LLT Ty, LLVMContext &C)
Get the type back from LLT.
LLVM_ABI MachineInstr * getOpcodeDef(unsigned Opcode, Register Reg, const MachineRegisterInfo &MRI)
See if Reg is defined by a single def instruction that is Opcode.
LLVM_ABI const ConstantFP * getConstantFPVRegVal(Register VReg, const MachineRegisterInfo &MRI)
MachineInstrBuilder BuildMI(MachineFunction &MF, const MIMetadata &MIMD, const MCInstrDesc &MCID)
Builder interface. Specify how to create the initial instruction itself.
constexpr bool isInt(int64_t x)
Checks if an integer fits into the given bit width.
@ Implicit
Not emitted register (e.g. carry, or temporary result).
@ Undef
Value of the register doesn't matter.
LLVM_ABI const llvm::fltSemantics & getFltSemanticForLLT(LLT Ty)
Get the appropriate floating point arithmetic semantic based on the bit size of the given scalar LLT.
std::function< std::pair< unsigned, LLT >(const LegalityQuery &)> LegalizeMutation
int bit_width(T Value)
Returns the number of bits needed to represent Value if Value is nonzero.
constexpr bool isPowerOf2_64(uint64_t Value)
Return true if the argument is a power of two > 0 (64 bit edition.)
constexpr int popcount(T Value) noexcept
Count the number of set bits in a value.
uint64_t PowerOf2Ceil(uint64_t A)
Returns the power of two which is greater than or equal to the given value.
LLVM_ABI std::optional< int64_t > getIConstantVRegSExtVal(Register VReg, const MachineRegisterInfo &MRI)
If VReg is defined by a G_CONSTANT whose value fits in int64_t, returns it.
int countr_zero(T Val)
Count the number of 0's from the least significant bit to the most significant, stopping at the first 1.
constexpr bool has_single_bit(T Value) noexcept
std::function< bool(const LegalityQuery &)> LegalityPredicate
constexpr bool isPowerOf2_32(uint32_t Value)
Return true if the argument is a power of two > 0.
constexpr uint64_t alignTo(uint64_t Size, Align A)
Returns a multiple of A needed to store Size bytes.
bool isa(const From &Val)
isa<X> - Return true if the parameter to the template is an instance of one of the template type argu...
MutableArrayRef(T &OneElt) -> MutableArrayRef< T >
constexpr T divideCeil(U Numerator, V Denominator)
Returns the integer ceil(Numerator / Denominator).
To bit_cast(const From &from) noexcept
@ Mul
Product of integers.
@ Sub
Subtraction of integers.
DWARFExpression::Operation Op
ArrayRef(const T &OneElt) -> ArrayRef< T >
constexpr unsigned BitWidth
LLVM_ABI void eraseInstr(MachineInstr &MI, MachineRegisterInfo &MRI, LostDebugLocObserver *LocObserver=nullptr)
decltype(auto) cast(const From &Val)
cast<X> - Return the argument parameter cast to the specified type.
LLVM_ABI std::optional< ValueAndVReg > getIConstantVRegValWithLookThrough(Register VReg, const MachineRegisterInfo &MRI, bool LookThroughInstrs=true)
If VReg is defined by a statically evaluable chain of instructions rooted on a G_CONSTANT returns its...
bool is_contained(R &&Range, const E &Element)
Returns true if Element is found in Range.
Align commonAlignment(Align A, uint64_t Offset)
Returns the alignment that satisfies both alignments.
RelativeUniformCounterPtr ValuesPtrExpr VTableAddr Next
unsigned Log2(Align A)
Returns the log2 of the alignment.
T bit_floor(T Value)
Returns the largest integral power of two no greater than Value if Value is nonzero.
constexpr uint64_t NextPowerOf2(uint64_t A)
Returns the next power of two (in 64-bits) that is strictly greater than A.
void swap(llvm::BitVector &LHS, llvm::BitVector &RHS)
Implement std::swap in terms of BitVector swap.
@ CLUSTER_WORKGROUP_MAX_ID_X
@ CLUSTER_WORKGROUP_MAX_ID_Z
@ CLUSTER_WORKGROUP_MAX_FLAT_ID
@ CLUSTER_WORKGROUP_MAX_ID_Y
static constexpr uint64_t encode(Fields... Values)
MIMGBaseOpcode BaseOpcode
This struct is a compact representation of a valid (non-zero power of two) alignment.
constexpr uint64_t value() const
This is a hole in the type system and should not be abused.
MCRegister getRegister() const
static ArgDescriptor createRegister(Register Reg, unsigned Mask=~0u)
DenormalModeKind Input
Denormal treatment kind for floating point instruction inputs in the default floating-point environme...
@ PreserveSign
The sign of a flushed-to-zero number is preserved in the sign of 0.
@ Dynamic
Denormals have unknown treatment.
static constexpr DenormalMode getPreserveSign()
static constexpr DenormalMode getIEEE()
bool isZero() const
Returns true if value is all zero.
The LegalityQuery object bundles together all the information that's needed to decide whether a given...
ArrayRef< MemDesc > MMODescrs
Operations which require memory can use this to place requirements on the memory type for each MMO.
This class contains a discriminated union of information about pointers in memory operands,...
MachinePointerInfo getWithOffset(int64_t O) const
static LLVM_ABI MachinePointerInfo getGOT(MachineFunction &MF)
Return a MachinePointerInfo record that refers to a GOT entry.
DenormalMode FP64FP16Denormals
If this is set, neither input nor output denormals are flushed for both f64 and f16/v2f16 instructions...
bool IEEE
Floating point opcodes that support exception flag gathering quiet and propagate signaling NaN inputs...
DenormalMode FP32Denormals
If this is set, neither input nor output denormals are flushed for most f32 instructions.