#include "llvm/IR/IntrinsicsAMDGPU.h"
#include "llvm/IR/IntrinsicsR600.h"

#define DEBUG_TYPE "amdgpu-legalinfo"
50 "amdgpu-global-isel-new-legality",
51 cl::desc(
"Use GlobalISel desired legality, rather than try to use"
52 "rules compatible with selection patterns"),
  unsigned Bits = Ty.getSizeInBits();
  const LLT Ty = Query.Types[TypeIdx];
  return Ty.getNumElements() % 2 != 0 &&
         EltSize > 1 && EltSize < 32 &&
         Ty.getSizeInBits() % 32 != 0;
  const LLT Ty = Query.Types[TypeIdx];
  const LLT Ty = Query.Types[TypeIdx];
  return EltTy.getSizeInBits() == 16 && Ty.getNumElements() > 2;
  const LLT Ty = Query.Types[TypeIdx];
  return std::pair(TypeIdx,
  const LLT Ty = Query.Types[TypeIdx];
  unsigned Size = Ty.getSizeInBits();
  unsigned Pieces = (Size + 63) / 64;
  unsigned NewNumElts = (Ty.getNumElements() + 1) / Pieces;
  const LLT Ty = Query.Types[TypeIdx];
  const int Size = Ty.getSizeInBits();
  const int NextMul32 = (Size + 31) / 32;
  const int NewNumElts = (32 * NextMul32 + EltSize - 1) / EltSize;
  unsigned MemSize = Query.MMODescrs[0].MemoryTy.getSizeInBits();
  return std::make_pair(TypeIdx, LLT::scalar(MemSize));
  const LLT Ty = Query.Types[TypeIdx];
  const unsigned EltSize = Ty.getElementType().getSizeInBits();
  assert(EltSize == 32 || EltSize == 64);
  for (NewNumElts = NumElts; NewNumElts < MaxNumElts; ++NewNumElts) {
  return std::pair(TypeIdx,
  const unsigned NumElems = Ty.getElementCount().getFixedValue();
  const unsigned Size = Ty.getSizeInBits();
  const LLT Ty = Query.Types[TypeIdx];
  const LLT Ty = Query.Types[TypeIdx];
  unsigned Size = Ty.getSizeInBits();
  const LLT QueryTy = Query.Types[TypeIdx];
  const LLT QueryTy = Query.Types[TypeIdx];
  const LLT QueryTy = Query.Types[TypeIdx];
  return ((ST.useRealTrue16Insts() && Size == 16) ||
          Size % 32 == 0) &&
  return EltSize == 16 || EltSize % 32 == 0;
  const int EltSize = Ty.getElementType().getSizeInBits();
  return EltSize == 32 || EltSize == 64 ||
         (EltSize == 16 && Ty.getNumElements() % 2 == 0) ||
         EltSize == 128 || EltSize == 256;
  LLT Ty = Query.Types[TypeIdx];
  const LLT QueryTy = Query.Types[TypeIdx];
  if (Ty.isPointerOrPointerVector())
    Ty = Ty.changeElementType(LLT::scalar(Ty.getScalarSizeInBits()));
  (ST.useRealTrue16Insts() && Ty == S16) ||
  const LLT Ty = Query.Types[TypeIdx];
  return !Ty.isVector() && Ty.getSizeInBits() > 32 &&
         Query.MMODescrs[0].MemoryTy.getSizeInBits() < Ty.getSizeInBits();
  unsigned MemSize = Query.MMODescrs[0].MemoryTy.getSizeInBits();
                                    bool IsLoad, bool IsAtomic) {
    return ST.hasFlatScratchEnabled() ? 128 : 32;
    return ST.useDS128() ? 128 : 64;
    return IsLoad ? 512 : 128;
    return ST.hasMultiDwordFlatScratchAddressing() || IsAtomic ? 128 : 32;
  const bool IsLoad = Query.Opcode != AMDGPU::G_STORE;
  unsigned RegSize = Ty.getSizeInBits();
  unsigned AS = Query.Types[1].getAddressSpace();
  if (Ty.isVector() && MemSize != RegSize)
  if (IsLoad && MemSize < Size)
    MemSize = std::max(MemSize, Align);
  if (!ST.hasDwordx3LoadStores())
  if (AlignBits < MemSize) {
          Align(AlignBits / 8)))
  const unsigned Size = Ty.getSizeInBits();
  if (Ty.isPointerVector())
  unsigned EltSize = Ty.getScalarSizeInBits();
  return EltSize != 32 && EltSize != 64;
  const unsigned Size = Ty.getSizeInBits();
  if (Size != MemSizeInBits)
    return Size <= 32 && Ty.isVector();
  return Ty.isVector() && (!MemTy.isVector() || MemTy == Ty) &&
                              uint64_t AlignInBits, unsigned AddrSpace,
  if (SizeInBits == 96 && ST.hasDwordx3LoadStores())
  if (AlignInBits < RoundedSize)
      RoundedSize, AddrSpace, Align(AlignInBits / 8),
      Query.Types[1].getAddressSpace(), Opcode);
  const unsigned NumParts = PointerTy.getSizeInBits() / 32;
  std::array<Register, 4> VectorElems;
  B.setInsertPt(B.getMBB(), ++B.getInsertPt());
  for (unsigned I = 0; I < NumParts; ++I)
        B.buildExtractVectorElementConstant(S32, VectorReg, I).getReg(0);
  B.buildMergeValues(MO, VectorElems);
  B.setInsertPt(B.getMBB(), ++B.getInsertPt());
  auto Scalar = B.buildBitcast(ScalarTy, BitcastReg);
  B.buildIntToPtr(MO, Scalar);
  const unsigned NumParts = PointerTy.getSizeInBits() / 32;
  auto Unmerged = B.buildUnmerge(LLT::scalar(32), Pointer);
  for (unsigned I = 0; I < NumParts; ++I)
  return B.buildBuildVector(VectorTy, PointerParts).getReg(0);
  Register Scalar = B.buildPtrToInt(ScalarTy, Pointer).getReg(0);
  return B.buildBitcast(VectorTy, Scalar).getReg(0);
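// Note: the two helpers above move a pointer operand between its pointer
// type and a 32-bit-element vector (or plain scalar) form; the first
// rebuilds the pointer from extracted 32-bit lanes after the instruction,
// the second decomposes it into a build_vector/bitcast before it.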
  auto GetAddrSpacePtr = [&TM](unsigned AS) {
  const LLT BufferStridedPtr =
  const LLT CodePtr = FlatPtr;
  const std::initializer_list<LLT> AddrSpaces64 = {
      GlobalPtr, ConstantPtr, FlatPtr
  const std::initializer_list<LLT> AddrSpaces32 = {
      LocalPtr, PrivatePtr, Constant32Ptr, RegionPtr
  const std::initializer_list<LLT> AddrSpaces128 = {RsrcPtr};
  const std::initializer_list<LLT> FPTypesBase = {
  const std::initializer_list<LLT> FPTypes16 = {
  const std::initializer_list<LLT> FPTypesPK16 = {
  const LLT MinScalarFPTy = ST.has16BitInsts() ? S16 : S32;
  if (ST.hasVOP3PInsts() && ST.hasAddNoCarryInsts() && ST.hasIntClamp()) {
    if (ST.hasScalarAddSub64()) {
        .clampMaxNumElementsStrict(0, S16, 2)
        .clampMaxNumElementsStrict(0, S16, 2)
    if (ST.hasScalarSMulU64()) {
        .clampMaxNumElementsStrict(0, S16, 2)
        .clampMaxNumElementsStrict(0, S16, 2)
        .minScalarOrElt(0, S16)
  } else if (ST.has16BitInsts()) {
      .widenScalarToNextMultipleOf(0, 32)
  if (ST.hasMad64_32())
  if (ST.hasIntClamp()) {
      {G_SDIV, G_UDIV, G_SREM, G_UREM, G_SDIVREM, G_UDIVREM})
  if (ST.hasVOP3PInsts()) {
      .clampMaxNumElements(0, S8, 2)
      {G_UADDO, G_USUBO, G_UADDE, G_SADDE, G_USUBE, G_SSUBE})
       LocalPtr, ConstantPtr, PrivatePtr, FlatPtr})
      .clampScalar(0, S16, S64);
      {G_FADD, G_FMUL, G_FMA, G_FCANONICALIZE,
       G_STRICT_FADD, G_STRICT_FMUL, G_STRICT_FMA})
  if (ST.has16BitInsts()) {
    if (ST.hasVOP3PInsts())
      FPOpActions.legalFor({S16});
      TrigActions.customFor({S16});
      FDIVActions.customFor({S16});
  if (ST.hasPackedFP32Ops()) {
    FPOpActions.legalFor({V2S32});
    FPOpActions.clampMaxNumElementsStrict(0, S32, 2);
  auto &MinNumMaxNumIeee =
  if (ST.hasVOP3PInsts()) {
    MinNumMaxNumIeee.legalFor(FPTypesPK16)
        .clampMaxNumElements(0, S16, 2)
  } else if (ST.has16BitInsts()) {
    MinNumMaxNumIeee.legalFor(FPTypes16).clampScalar(0, S16, S64).scalarize(0);
    MinNumMaxNumIeee.legalFor(FPTypesBase)
      {G_FMINNUM, G_FMAXNUM, G_FMINIMUMNUM, G_FMAXIMUMNUM});
  if (ST.hasVOP3PInsts()) {
    MinNumMaxNum.customFor(FPTypesPK16)
        .clampMaxNumElements(0, S16, 2)
        .clampScalar(0, S16, S64)
  } else if (ST.has16BitInsts()) {
    MinNumMaxNum.customFor(FPTypes16)
        .clampScalar(0, S16, S64)
    MinNumMaxNum.customFor(FPTypesBase)
        .clampScalar(0, S32, S64)
  if (ST.hasVOP3PInsts())
      .legalFor(FPTypesPK16)
  if (ST.has16BitInsts()) {
  if (ST.hasFractBug()) {
  if (ST.hasCvtPkF16F32Inst()) {
        .clampMaxNumElements(0, S16, 2);
    FPTruncActions.scalarize(0).lower();
  if (ST.has16BitInsts()) {
  if (ST.hasMadF16() && ST.hasMadMacF32Insts())
    FMad.customFor({S32, S16});
  else if (ST.hasMadMacF32Insts())
    FMad.customFor({S32});
  else if (ST.hasMadF16())
    FMad.customFor({S16});
  if (ST.has16BitInsts()) {
    FRem.minScalar(0, S32)
        .clampMaxNumElements(0, S16, 2)
  if (ST.has16BitInsts())
  if (ST.has16BitInsts())
  if (ST.has16BitInsts())
      .clampScalar(0, S16, S64)
      .clampScalar(0, S16, S64)
  if (ST.has16BitInsts()) {
        {G_INTRINSIC_TRUNC, G_FCEIL, G_INTRINSIC_ROUNDEVEN})
        .clampScalar(0, S16, S64)
        {G_INTRINSIC_TRUNC, G_FCEIL, G_INTRINSIC_ROUNDEVEN})
        .clampScalar(0, S32, S64)
        {G_INTRINSIC_TRUNC, G_FCEIL, G_INTRINSIC_ROUNDEVEN})
        .clampScalar(0, S32, S64)
      .scalarSameSizeAs(1, 0)
      {S1}, {S32, S64, GlobalPtr, LocalPtr, ConstantPtr, PrivatePtr, FlatPtr})
      .legalForCartesianProduct(
          {S32}, {S32, S64, GlobalPtr, LocalPtr, ConstantPtr, PrivatePtr, FlatPtr});
  if (ST.has16BitInsts()) {
    CmpBuilder.legalFor({{S1, S16}});
      {S1}, ST.has16BitInsts() ? FPTypes16 : FPTypesBase);
  if (ST.hasSALUFloatInsts())
  if (ST.has16BitInsts())
    ExpOps.customFor({{S32}, {S16}});
    ExpOps.customFor({S32});
  ExpOps.clampScalar(0, MinScalarFPTy, S32)
      .legalFor(ST.has16BitInsts(), {S16})
      .legalFor(ST.has16BitInsts(), {S16})
      .clampScalar(0, S32, S32)
  if (ST.has16BitInsts())
      .widenScalarToNextPow2(1)
      .lowerFor({S1, S16})
      .widenScalarToNextPow2(1)
      .clampScalar(0, S32, S32)
      .clampScalar(0, S32, S64)
  if (ST.has16BitInsts()) {
        .clampMaxNumElementsStrict(0, S16, 2)
  if (ST.hasVOP3PInsts()) {
        .clampMaxNumElements(0, S16, 2)
  if (ST.hasIntMinMax64()) {
        .clampMaxNumElements(0, S16, 2)
        .clampMaxNumElements(0, S16, 2)
      .widenScalarToNextPow2(0)
      .legalForCartesianProduct(AddrSpaces32, {S32})
      .legalForCartesianProduct(AddrSpaces32, {S32})
  const auto needToSplitMemOp = [=](const LegalityQuery &Query,
                                    bool IsLoad) -> bool {
    unsigned MemSize = Query.MMODescrs[0].MemoryTy.getSizeInBits();
    unsigned NumRegs = (MemSize + 31) / 32;
    if (!ST.hasDwordx3LoadStores())
  unsigned GlobalAlign32 = ST.hasUnalignedBufferAccessEnabled() ? 0 : 32;
  unsigned GlobalAlign16 = ST.hasUnalignedBufferAccessEnabled() ? 0 : 16;
  unsigned GlobalAlign8 = ST.hasUnalignedBufferAccessEnabled() ? 0 : 8;
  for (unsigned Op : {G_LOAD, G_STORE}) {
    const bool IsStore = Op == G_STORE;
    Actions.legalForTypesWithMemDesc({{S32, GlobalPtr, S32, GlobalAlign32},
                                      {S64, GlobalPtr, S64, GlobalAlign32},
                                      {S32, GlobalPtr, S8, GlobalAlign8},
                                      {S32, GlobalPtr, S16, GlobalAlign16},
                                      {S32, LocalPtr, S32, 32},
                                      {S64, LocalPtr, S64, 32},
                                      {S32, LocalPtr, S8, 8},
                                      {S32, LocalPtr, S16, 16},
                                      {S32, PrivatePtr, S32, 32},
                                      {S32, PrivatePtr, S8, 8},
                                      {S32, PrivatePtr, S16, 16},
                                      {S32, ConstantPtr, S32, GlobalAlign32},
                                      {S64, ConstantPtr, S64, GlobalAlign32},
                                      {V2S32, ConstantPtr, V2S32, GlobalAlign32}});
    Actions.unsupportedIf(
        typeInSet(1, {BufferFatPtr, BufferStridedPtr, RsrcPtr}));
    Actions.customIf(typeIs(1, Constant32Ptr));
          return !Query.Types[0].isVector() &&
                 needToSplitMemOp(Query, Op == G_LOAD);
        [=](const LegalityQuery &Query) -> std::pair<unsigned, LLT> {
          unsigned MemSize = Query.MMODescrs[0].MemoryTy.getSizeInBits();
          if (DstSize > MemSize)
          if (MemSize > MaxSize)
          return Query.Types[0].isVector() &&
                 needToSplitMemOp(Query, Op == G_LOAD);
        [=](const LegalityQuery &Query) -> std::pair<unsigned, LLT> {
          unsigned MemSize = Query.MMODescrs[0].MemoryTy.getSizeInBits();
          if (MemSize > MaxSize) {
            if (MaxSize % EltSize == 0) {
            unsigned NumPieces = MemSize / MaxSize;
            if (NumPieces == 1 || NumPieces >= NumElts ||
                NumElts % NumPieces != 0)
              return std::pair(0, EltTy);
            return std::pair(0, EltTy);
          return std::pair(0, EltTy);
        .widenScalarToNextPow2(0)
      .legalForTypesWithMemDesc({{S32, GlobalPtr, S8, 8},
                                 {S32, GlobalPtr, S16, 2 * 8},
                                 {S32, LocalPtr, S8, 8},
                                 {S32, LocalPtr, S16, 16},
                                 {S32, PrivatePtr, S8, 8},
                                 {S32, PrivatePtr, S16, 16},
                                 {S32, ConstantPtr, S8, 8},
                                 {S32, ConstantPtr, S16, 2 * 8}})
  if (ST.hasFlatAddressSpace()) {
    ExtLoads.legalForTypesWithMemDesc(
        {{S32, FlatPtr, S8, 8}, {S32, FlatPtr, S16, 16}});
      {G_ATOMICRMW_XCHG, G_ATOMICRMW_ADD, G_ATOMICRMW_SUB,
       G_ATOMICRMW_AND, G_ATOMICRMW_OR, G_ATOMICRMW_XOR,
       G_ATOMICRMW_MAX, G_ATOMICRMW_MIN, G_ATOMICRMW_UMAX,
       G_ATOMICRMW_UMIN, G_ATOMICRMW_UINC_WRAP, G_ATOMICRMW_UDEC_WRAP})
      .legalFor({{S32, GlobalPtr}, {S32, LocalPtr},
                 {S64, GlobalPtr}, {S64, LocalPtr},
                 {S32, RegionPtr}, {S64, RegionPtr}});
  if (ST.hasFlatAddressSpace()) {
    Atomics.legalFor({{S32, FlatPtr}, {S64, FlatPtr}});
      .legalFor({{S32, GlobalPtr}, {S32, LocalPtr}, {S32, RegionPtr}});
  if (ST.hasFlatAddressSpace()) {
    Atomics32.legalFor({{S32, FlatPtr}});
  if (ST.hasLDSFPAtomicAddF32()) {
    Atomic.legalFor({{S32, LocalPtr}, {S32, RegionPtr}});
    if (ST.hasLdsAtomicAddF64())
      Atomic.legalFor({{S64, LocalPtr}});
    if (ST.hasAtomicDsPkAdd16Insts())
      Atomic.legalFor({{V2F16, LocalPtr}, {V2BF16, LocalPtr}});
  if (ST.hasAtomicFaddInsts())
    Atomic.legalFor({{S32, GlobalPtr}});
  if (ST.hasFlatAtomicFaddF32Inst())
    Atomic.legalFor({{S32, FlatPtr}});
  if (ST.hasGFX90AInsts() || ST.hasGFX1250Insts()) {
  if (ST.hasAtomicBufferGlobalPkAddF16NoRtnInsts() ||
      ST.hasAtomicBufferGlobalPkAddF16Insts())
    Atomic.legalFor({{V2F16, GlobalPtr}, {V2F16, BufferFatPtr}});
  if (ST.hasAtomicGlobalPkAddBF16Inst())
    Atomic.legalFor({{V2BF16, GlobalPtr}});
  if (ST.hasAtomicFlatPkAdd16Insts())
    Atomic.legalFor({{V2F16, FlatPtr}, {V2BF16, FlatPtr}});
  auto &AtomicFMinFMax =
      .legalFor({{F32, LocalPtr}, {F64, LocalPtr}});
  if (ST.hasAtomicFMinFMaxF32GlobalInsts())
  if (ST.hasAtomicFMinFMaxF64GlobalInsts())
    AtomicFMinFMax.legalFor({{F64, GlobalPtr}, {F64, BufferFatPtr}});
  if (ST.hasAtomicFMinFMaxF32FlatInsts())
  if (ST.hasAtomicFMinFMaxF64FlatInsts())
                 {S32, FlatPtr}, {S64, FlatPtr}})
      .legalFor({{S32, LocalPtr}, {S64, LocalPtr},
                 {S32, RegionPtr}, {S64, RegionPtr}});
      LocalPtr, FlatPtr, PrivatePtr,
      .clampScalar(0, S16, S64)
  if (ST.has16BitInsts()) {
    if (ST.hasVOP3PInsts()) {
          .clampMaxNumElements(0, S16, 2);
      Shifts.legalFor({{S16, S16}});
    Shifts.widenScalarIf(
          const LLT AmountTy = Query.Types[1];
    Shifts.clampScalar(1, S32, S32);
    Shifts.widenScalarToNextPow2(0, 16);
    Shifts.clampScalar(0, S16, S64);
    Shifts.clampScalar(1, S32, S32);
    Shifts.widenScalarToNextPow2(0, 32);
    Shifts.clampScalar(0, S32, S64);
  for (unsigned Op : {G_EXTRACT_VECTOR_ELT, G_INSERT_VECTOR_ELT}) {
    unsigned VecTypeIdx = Op == G_EXTRACT_VECTOR_ELT ? 1 : 0;
    unsigned EltTypeIdx = Op == G_EXTRACT_VECTOR_ELT ? 0 : 1;
    unsigned IdxTypeIdx = 2;
          const LLT EltTy = Query.Types[EltTypeIdx];
          const LLT VecTy = Query.Types[VecTypeIdx];
          const LLT IdxTy = Query.Types[IdxTypeIdx];
          const bool isLegalVecType =
          return (EltSize == 32 || EltSize == 64) &&
          const LLT EltTy = Query.Types[EltTypeIdx];
          const LLT VecTy = Query.Types[VecTypeIdx];
          const unsigned TargetEltSize = DstEltSize % 64 == 0 ? 64 : 32;
          return std::pair(VecTypeIdx,
        .clampScalar(EltTypeIdx, S32, S64)
          const LLT &EltTy = Query.Types[1].getElementType();
          return Query.Types[0] != EltTy;
  for (unsigned Op : {G_EXTRACT, G_INSERT}) {
    unsigned BigTyIdx = Op == G_EXTRACT ? 1 : 0;
    unsigned LitTyIdx = Op == G_EXTRACT ? 0 : 1;
          const LLT BigTy = Query.Types[BigTyIdx];
          const LLT LitTy = Query.Types[LitTyIdx];
          const LLT BigTy = Query.Types[BigTyIdx];
          const LLT LitTy = Query.Types[LitTyIdx];
  if (ST.hasScalarPackInsts()) {
        .minScalarOrElt(0, S16)
    BuildVector.customFor({V2S16, S16});
    BuildVector.minScalarOrElt(0, S32);
  for (unsigned Op : {G_MERGE_VALUES, G_UNMERGE_VALUES}) {
    unsigned BigTyIdx = Op == G_MERGE_VALUES ? 0 : 1;
    unsigned LitTyIdx = Op == G_MERGE_VALUES ? 1 : 0;
    auto notValidElt = [=](const LegalityQuery &Query, unsigned TypeIdx) {
      const LLT Ty = Query.Types[TypeIdx];
      if (Ty.isVector()) {
          const LLT BigTy = Query.Types[BigTyIdx];
          return notValidElt(Query, LitTyIdx);
          return notValidElt(Query, BigTyIdx);
    if (Op == G_MERGE_VALUES) {
      Builder.widenScalarIf(
            const LLT Ty = Query.Types[LitTyIdx];
            return Ty.getSizeInBits() < 32;
          const LLT Ty = Query.Types[BigTyIdx];
          return Ty.getSizeInBits() % 16 != 0;
          const LLT &Ty = Query.Types[BigTyIdx];
          unsigned NewSizeInBits = 1 << Log2_32_Ceil(Ty.getSizeInBits() + 1);
          if (NewSizeInBits >= 256) {
            unsigned RoundedTo = alignTo<64>(Ty.getSizeInBits() + 1);
            if (RoundedTo < NewSizeInBits)
              NewSizeInBits = RoundedTo;
          return std::pair(BigTyIdx, LLT::scalar(NewSizeInBits));
      .clampScalar(0, S32, S64);
  if (ST.hasVOP3PInsts()) {
    SextInReg.lowerFor({{V2S16}})
        .clampMaxNumElementsStrict(0, S16, 2);
  } else if (ST.has16BitInsts()) {
    SextInReg.lowerFor({{S32}, {S64}, {S16}});
    SextInReg.lowerFor({{S32}, {S64}});
  FSHRActionDefs.legalFor({{S32, S32}})
      .clampMaxNumElementsStrict(0, S16, 2);
  if (ST.hasVOP3PInsts())
    FSHRActionDefs.scalarize(0).lower();
  if (ST.hasVOP3PInsts()) {
        .clampMaxNumElementsStrict(0, S16, 2)
      .clampScalar(1, S32, S32)
      G_ATOMIC_CMPXCHG_WITH_SUCCESS, G_ATOMICRMW_NAND, G_ATOMICRMW_FSUB,
      G_READ_REGISTER, G_WRITE_REGISTER,
  if (ST.hasIEEEMinimumMaximumInsts()) {
        .legalFor(FPTypesPK16)
  } else if (ST.hasVOP3PInsts()) {
        .clampMaxNumElementsStrict(0, S16, 2)
      G_INDEXED_LOAD, G_INDEXED_SEXTLOAD,
      G_INDEXED_ZEXTLOAD, G_INDEXED_STORE})
      {G_VECREDUCE_SMIN, G_VECREDUCE_SMAX, G_VECREDUCE_UMIN, G_VECREDUCE_UMAX,
       G_VECREDUCE_ADD, G_VECREDUCE_MUL, G_VECREDUCE_FMUL, G_VECREDUCE_FMIN,
       G_VECREDUCE_FMAX, G_VECREDUCE_FMINIMUM, G_VECREDUCE_FMAXIMUM,
       G_VECREDUCE_OR, G_VECREDUCE_AND, G_VECREDUCE_XOR})
  verify(*ST.getInstrInfo());
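// The verify() call above sanity-checks the legalization rules built in
// this constructor against the target's instruction definitions.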
  switch (MI.getOpcode()) {
  case TargetOpcode::G_ADDRSPACE_CAST:
  case TargetOpcode::G_INTRINSIC_ROUNDEVEN:
  case TargetOpcode::G_FCEIL:
  case TargetOpcode::G_FREM:
  case TargetOpcode::G_INTRINSIC_TRUNC:
  case TargetOpcode::G_SITOFP:
  case TargetOpcode::G_UITOFP:
  case TargetOpcode::G_FPTOSI:
  case TargetOpcode::G_FPTOUI:
  case TargetOpcode::G_FMINNUM:
  case TargetOpcode::G_FMAXNUM:
  case TargetOpcode::G_FMINIMUMNUM:
  case TargetOpcode::G_FMAXIMUMNUM:
  case TargetOpcode::G_EXTRACT:
  case TargetOpcode::G_INSERT:
  case TargetOpcode::G_EXTRACT_VECTOR_ELT:
  case TargetOpcode::G_INSERT_VECTOR_ELT:
  case TargetOpcode::G_FSIN:
  case TargetOpcode::G_FCOS:
  case TargetOpcode::G_GLOBAL_VALUE:
  case TargetOpcode::G_LOAD:
  case TargetOpcode::G_SEXTLOAD:
  case TargetOpcode::G_ZEXTLOAD:
  case TargetOpcode::G_STORE:
  case TargetOpcode::G_FMAD:
  case TargetOpcode::G_FDIV:
  case TargetOpcode::G_FFREXP:
  case TargetOpcode::G_FSQRT:
  case TargetOpcode::G_UDIV:
  case TargetOpcode::G_UREM:
  case TargetOpcode::G_UDIVREM:
  case TargetOpcode::G_SDIV:
  case TargetOpcode::G_SREM:
  case TargetOpcode::G_SDIVREM:
  case TargetOpcode::G_ATOMIC_CMPXCHG:
  case TargetOpcode::G_FLOG2:
  case TargetOpcode::G_FLOG:
  case TargetOpcode::G_FLOG10:
  case TargetOpcode::G_FEXP2:
  case TargetOpcode::G_FEXP:
  case TargetOpcode::G_FEXP10:
  case TargetOpcode::G_FPOW:
  case TargetOpcode::G_FFLOOR:
  case TargetOpcode::G_BUILD_VECTOR:
  case TargetOpcode::G_BUILD_VECTOR_TRUNC:
  case TargetOpcode::G_MUL:
  case TargetOpcode::G_CTLZ:
  case TargetOpcode::G_CTTZ:
  case TargetOpcode::G_CTLS:
  case TargetOpcode::G_CTLZ_ZERO_UNDEF:
  case TargetOpcode::G_STACKSAVE:
  case TargetOpcode::G_GET_FPENV:
  case TargetOpcode::G_SET_FPENV:
  case TargetOpcode::G_TRAP:
  case TargetOpcode::G_DEBUGTRAP:
  if (ST.hasApertureRegs()) {
                                 ? AMDGPU::SRC_SHARED_BASE
                                 : AMDGPU::SRC_PRIVATE_BASE;
    assert((ApertureRegNo != AMDGPU::SRC_PRIVATE_BASE ||
            !ST.hasGloballyAddressableScratch()) &&
           "Cannot use src_private_base with globally addressable scratch!");
    B.buildCopy({Dst}, {Register(ApertureRegNo)});
    return B.buildUnmerge(S32, Dst).getReg(1);
      ST.getTargetLowering()->getImplicitParameterOffset(B.getMF(), Param);
  B.buildObjectPtrOffset(LoadAddr, KernargPtrReg,
  return B.buildLoad(S32, LoadAddr, *MMO).getReg(0);
  B.buildObjectPtrOffset(
      B.buildConstant(LLT::scalar(64), StructOffset).getReg(0));
  return B.buildLoad(S32, LoadAddr, *MMO).getReg(0);
  switch (Def->getOpcode()) {
  case AMDGPU::G_FRAME_INDEX:
  case AMDGPU::G_GLOBAL_VALUE:
  case AMDGPU::G_BLOCK_ADDR:
  case AMDGPU::G_CONSTANT: {
    const ConstantInt *CI = Def->getOperand(1).getCImm();
  assert(MI.getOpcode() == TargetOpcode::G_ADDRSPACE_CAST ||
              Intrinsic::amdgcn_addrspacecast_nonnull));
                      : MI.getOperand(1).getReg();
  unsigned SrcAS = SrcTy.getAddressSpace();
    MI.setDesc(B.getTII().get(TargetOpcode::G_BITCAST));
  auto castFlatToLocalOrPrivate = [&](const DstOp &Dst) -> Register {
        ST.hasGloballyAddressableScratch()) {
      Register SrcLo = B.buildExtract(S32, Src, 0).getReg(0);
          B.buildInstr(AMDGPU::S_MOV_B32, {S32},
                       {Register(AMDGPU::SRC_FLAT_SCRATCH_BASE_LO)})
      MRI.setRegClass(FlatScratchBaseLo, &AMDGPU::SReg_32RegClass);
      return B.buildIntToPtr(Dst, Sub).getReg(0);
    return B.buildExtract(Dst, Src, 0).getReg(0);
    castFlatToLocalOrPrivate(Dst);
    MI.eraseFromParent();
  auto SegmentNull = B.buildConstant(DstTy, NullVal);
  auto FlatNull = B.buildConstant(SrcTy, 0);
  auto PtrLo32 = castFlatToLocalOrPrivate(DstTy);
  B.buildSelect(Dst, CmpRes, PtrLo32, SegmentNull.getReg(0));
  MI.eraseFromParent();
  auto castLocalOrPrivateToFlat = [&](const DstOp &Dst) -> Register {
    Register SrcAsInt = B.buildPtrToInt(S32, Src).getReg(0);
        ST.hasGloballyAddressableScratch()) {
      ThreadID = B.buildIntrinsic(Intrinsic::amdgcn_mbcnt_lo, {S32})
      if (ST.isWave64()) {
        ThreadID = B.buildIntrinsic(Intrinsic::amdgcn_mbcnt_hi, {S32})
          B.buildConstant(S32, 57 - 32 - ST.getWavefrontSizeLog2()).getReg(0);
      Register SrcHi = B.buildShl(S32, ThreadID, ShAmt).getReg(0);
          B.buildMergeLikeInstr(DstTy, {SrcAsInt, SrcHi}).getReg(0);
          B.buildInstr(AMDGPU::S_MOV_B64, {S64},
                       {Register(AMDGPU::SRC_FLAT_SCRATCH_BASE)})
      MRI.setRegClass(FlatScratchBase, &AMDGPU::SReg_64RegClass);
      return B.buildPtrAdd(Dst, CvtPtr, FlatScratchBase).getReg(0);
    return B.buildMergeLikeInstr(Dst, {SrcAsInt, ApertureReg}).getReg(0);
    castLocalOrPrivateToFlat(Dst);
    MI.eraseFromParent();
  Register BuildPtr = castLocalOrPrivateToFlat(DstTy);
                             SegmentNull.getReg(0));
  B.buildSelect(Dst, CmpRes, BuildPtr, FlatNull);
  MI.eraseFromParent();
      SrcTy.getSizeInBits() == 64) {
    B.buildExtract(Dst, Src, 0);
    MI.eraseFromParent();
  uint32_t AddrHiVal = Info->get32BitAddressHighBits();
  auto PtrLo = B.buildPtrToInt(S32, Src);
  if (AddrHiVal == 0) {
    B.buildIntToPtr(Dst, Zext);
    auto HighAddr = B.buildConstant(S32, AddrHiVal);
    B.buildMergeLikeInstr(Dst, {PtrLo, HighAddr});
  MI.eraseFromParent();
  MI.eraseFromParent();
  assert(Ty.isScalar() && Ty.getSizeInBits() == 64);
  auto C1 = B.buildFConstant(Ty, C1Val);
  auto CopySign = B.buildFCopysign(Ty, C1, Src);
  auto Tmp1 = B.buildFAdd(Ty, Src, CopySign);
  auto Tmp2 = B.buildFSub(Ty, Tmp1, CopySign);
  auto C2 = B.buildFConstant(Ty, C2Val);
  auto Fabs = B.buildFAbs(Ty, Src);
  B.buildSelect(MI.getOperand(0).getReg(), Cond, Src, Tmp2);
  MI.eraseFromParent();
  auto Trunc = B.buildIntrinsicTrunc(S64, Src);
  const auto Zero = B.buildFConstant(S64, 0.0);
  const auto One = B.buildFConstant(S64, 1.0);
  auto And = B.buildAnd(S1, Lt0, NeTrunc);
  auto Add = B.buildSelect(S64, And, One, Zero);
  B.buildFAdd(MI.getOperand(0).getReg(), Trunc, Add);
  MI.eraseFromParent();
  Register Src0Reg = MI.getOperand(1).getReg();
  Register Src1Reg = MI.getOperand(2).getReg();
  auto Flags = MI.getFlags();
  auto Div = B.buildFDiv(Ty, Src0Reg, Src1Reg, Flags);
  auto Trunc = B.buildIntrinsicTrunc(Ty, Div, Flags);
  auto Neg = B.buildFNeg(Ty, Trunc, Flags);
  B.buildFMA(DstReg, Neg, Src1Reg, Src0Reg, Flags);
  MI.eraseFromParent();
  const unsigned FractBits = 52;
  const unsigned ExpBits = 11;
  auto Const0 = B.buildConstant(S32, FractBits - 32);
  auto Const1 = B.buildConstant(S32, ExpBits);
  auto ExpPart = B.buildIntrinsic(Intrinsic::amdgcn_ubfe, {S32})
                     .addUse(Const0.getReg(0))
                     .addUse(Const1.getReg(0));
  return B.buildSub(S32, ExpPart, B.buildConstant(S32, 1023));
  auto Unmerge = B.buildUnmerge({S32, S32}, Src);
  const unsigned FractBits = 52;
  const auto SignBitMask = B.buildConstant(S32, UINT32_C(1) << 31);
  auto SignBit = B.buildAnd(S32, Hi, SignBitMask);
  const auto FractMask = B.buildConstant(S64, (UINT64_C(1) << FractBits) - 1);
  const auto Zero32 = B.buildConstant(S32, 0);
  auto SignBit64 = B.buildMergeLikeInstr(S64, {Zero32, SignBit});
  auto Shr = B.buildAShr(S64, FractMask, Exp);
  auto Not = B.buildNot(S64, Shr);
  auto Tmp0 = B.buildAnd(S64, Src, Not);
  auto FiftyOne = B.buildConstant(S32, FractBits - 1);
  auto Tmp1 = B.buildSelect(S64, ExpLt0, SignBit64, Tmp0);
  B.buildSelect(MI.getOperand(0).getReg(), ExpGt51, Src, Tmp1);
  MI.eraseFromParent();
  auto Unmerge = B.buildUnmerge({S32, S32}, Src);
  auto ThirtyTwo = B.buildConstant(S32, 32);
  auto CvtHi = Signed ? B.buildSITOFP(S64, Unmerge.getReg(1))
                      : B.buildUITOFP(S64, Unmerge.getReg(1));
  auto CvtLo = B.buildUITOFP(S64, Unmerge.getReg(0));
  auto LdExp = B.buildFLdexp(S64, CvtHi, ThirtyTwo);
  B.buildFAdd(Dst, LdExp, CvtLo);
  MI.eraseFromParent();
  auto One = B.buildConstant(S32, 1);
  auto ThirtyOne = B.buildConstant(S32, 31);
  auto X = B.buildXor(S32, Unmerge.getReg(0), Unmerge.getReg(1));
  auto OppositeSign = B.buildAShr(S32, X, ThirtyOne);
  auto MaxShAmt = B.buildAdd(S32, ThirtyTwo, OppositeSign);
  auto LS = B.buildIntrinsic(Intrinsic::amdgcn_sffbh, {S32})
                .addUse(Unmerge.getReg(1));
  auto LS2 = B.buildSub(S32, LS, One);
  ShAmt = B.buildUMin(S32, LS2, MaxShAmt);
  ShAmt = B.buildCTLZ(S32, Unmerge.getReg(1));
  auto Norm = B.buildShl(S64, Src, ShAmt);
  auto Unmerge2 = B.buildUnmerge({S32, S32}, Norm);
  auto Adjust = B.buildUMin(S32, One, Unmerge2.getReg(0));
  auto Norm2 = B.buildOr(S32, Unmerge2.getReg(1), Adjust);
  auto FVal = Signed ? B.buildSITOFP(S32, Norm2) : B.buildUITOFP(S32, Norm2);
  auto Scale = B.buildSub(S32, ThirtyTwo, ShAmt);
  B.buildFLdexp(Dst, FVal, Scale);
  MI.eraseFromParent();
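// For the 64-bit integer to f32 path above, the input is normalized by
// shifting out leading sign/zero bits (via sffbh/ctlz), converted as a
// 32-bit value, and then rescaled with ldexp by (32 - shift amount).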
  unsigned Flags = MI.getFlags();
  auto Trunc = B.buildIntrinsicTrunc(SrcLT, Src, Flags);
    Sign = B.buildAShr(S32, Src, B.buildConstant(S32, 31));
    Trunc = B.buildFAbs(S32, Trunc, Flags);
    K0 = B.buildFConstant(
    K1 = B.buildFConstant(
    K0 = B.buildFConstant(
    K1 = B.buildFConstant(
  auto Mul = B.buildFMul(SrcLT, Trunc, K0, Flags);
  auto FloorMul = B.buildFFloor(SrcLT, Mul, Flags);
  auto Fma = B.buildFMA(SrcLT, FloorMul, K1, Trunc, Flags);
                      : B.buildFPTOUI(S32, FloorMul);
  auto Lo = B.buildFPTOUI(S32, Fma);
    Sign = B.buildMergeLikeInstr(S64, {Sign, Sign});
    B.buildSub(Dst, B.buildXor(S64, B.buildMergeLikeInstr(S64, {Lo, Hi}), Sign),
    B.buildMergeLikeInstr(Dst, {Lo, Hi});
  MI.eraseFromParent();
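// fp-to-64-bit-int is lowered as two fp-to-u32 conversions: the scaled
// floor provides the high word, the fma-computed remainder the low word,
// and the signed case is fixed up with the replicated sign word.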
  unsigned StartIdx = Offset / 32;
  auto Unmerge = B.buildUnmerge(LLT::scalar(32), SrcReg);
  if (DstCount == 1) {
    B.buildIntToPtr(DstReg, Unmerge.getReg(StartIdx));
    for (unsigned I = 0; I < DstCount; ++I)
      MergeVec.push_back(Unmerge.getReg(StartIdx + I));
    B.buildMergeLikeInstr(DstReg, MergeVec);
  MI.eraseFromParent();
  Register InsertSrc = MI.getOperand(2).getReg();
  if (Offset % 32 != 0 || DstSize % 32 != 0 || InsertSize % 32 != 0)
  unsigned DstCount = DstSize / 32;
  unsigned InsertCount = InsertSize / 32;
  unsigned StartIdx = Offset / 32;
  auto SrcUnmerge = B.buildUnmerge(S32, SrcReg);
  for (unsigned I = 0; I < StartIdx; ++I)
  if (InsertCount == 1) {
      InsertSrc = B.buildPtrToInt(S32, InsertSrc).getReg(0);
    auto InsertUnmerge = B.buildUnmerge(S32, InsertSrc);
    for (unsigned I = 0; I < InsertCount; ++I)
  for (unsigned I = StartIdx + InsertCount; I < DstCount; ++I)
  B.buildMergeLikeInstr(DstReg, MergeVec);
  MI.eraseFromParent();
  auto IntVec = B.buildPtrToInt(IntVecTy, Vec);
  auto IntElt = B.buildExtractVectorElement(IntTy, IntVec, MI.getOperand(2));
  B.buildIntToPtr(Dst, IntElt);
  MI.eraseFromParent();
  std::optional<ValueAndVReg> MaybeIdxVal =
  const uint64_t IdxVal = MaybeIdxVal->Value.getZExtValue();
  auto Unmerge = B.buildUnmerge(EltTy, Vec);
  B.buildCopy(Dst, Unmerge.getReg(IdxVal));
  MI.eraseFromParent();
  auto IntVecSource = B.buildPtrToInt(IntVecTy, Vec);
  auto IntIns = B.buildPtrToInt(IntTy, Ins);
  auto IntVecDest = B.buildInsertVectorElement(IntVecTy, IntVecSource, IntIns,
  B.buildIntToPtr(Dst, IntVecDest);
  MI.eraseFromParent();
  std::optional<ValueAndVReg> MaybeIdxVal =
  const uint64_t IdxVal = MaybeIdxVal->Value.getZExtValue();
  if (IdxVal < NumElts) {
    for (unsigned i = 0; i < NumElts; ++i)
    B.buildUnmerge(SrcRegs, Vec);
    SrcRegs[IdxVal] = MI.getOperand(2).getReg();
    B.buildMergeLikeInstr(Dst, SrcRegs);
  MI.eraseFromParent();
  unsigned Flags = MI.getFlags();
  if (ST.hasTrigReducedRange()) {
    auto MulVal = B.buildFMul(Ty, SrcReg, OneOver2Pi, Flags);
    TrigVal = B.buildIntrinsic(Intrinsic::amdgcn_fract, {Ty})
                  .addUse(MulVal.getReg(0))
    TrigVal = B.buildFMul(Ty, SrcReg, OneOver2Pi, Flags).getReg(0);
      Intrinsic::amdgcn_sin : Intrinsic::amdgcn_cos;
  MI.eraseFromParent();
                                      unsigned GAFlags) const {
      B.getMRI()->createGenericVirtualRegister(ConstPtrTy);
  if (ST.has64BitLiterals()) {
    B.buildInstr(AMDGPU::SI_PC_ADD_REL_OFFSET64).addDef(PCReg);
    B.buildInstr(AMDGPU::SI_PC_ADD_REL_OFFSET).addDef(PCReg);
  if (!B.getMRI()->getRegClassOrNull(PCReg))
    B.getMRI()->setRegClass(PCReg, &AMDGPU::SReg_64RegClass);
    B.buildExtract(DstReg, PCReg, 0);
  if (RequiresHighHalf && ST.has64BitLiterals()) {
    MRI.setRegClass(DstReg, &AMDGPU::SReg_64RegClass);
    B.buildInstr(AMDGPU::S_MOV_B64)
  MRI.setRegClass(AddrLo, &AMDGPU::SReg_32RegClass);
  B.buildInstr(AMDGPU::S_MOV_B32)
  if (RequiresHighHalf) {
           "Must provide a 64-bit pointer type!");
    MRI.setRegClass(AddrHi, &AMDGPU::SReg_32RegClass);
    B.buildInstr(AMDGPU::S_MOV_B32)
    MRI.setRegClass(AddrDst, &AMDGPU::SReg_64RegClass);
    B.buildMergeValues(AddrDst, {AddrLo, AddrHi});
    if (AddrDst != DstReg)
      B.buildCast(DstReg, AddrDst);
  } else if (AddrLo != DstReg) {
    B.buildCast(DstReg, AddrLo);
      GV->getName() != "llvm.amdgcn.module.lds" &&
        Fn, "local memory global used by non-kernel function",
    B.buildUndef(DstReg);
    MI.eraseFromParent();
    auto Sz = B.buildIntrinsic(Intrinsic::amdgcn_groupstaticsize, {S32});
    B.buildIntToPtr(DstReg, Sz);
    MI.eraseFromParent();
    MI.eraseFromParent();
  if (ST.isAmdPalOS() || ST.isMesa3DOS()) {
    MI.eraseFromParent();
    MI.eraseFromParent();
    MI.eraseFromParent();
  if (Ty.getSizeInBits() == 32) {
    auto Load = B.buildLoad(PtrTy, GOTAddr, *GOTMMO);
    B.buildExtract(DstReg, Load, 0);
    B.buildLoad(DstReg, GOTAddr, *GOTMMO);
  MI.eraseFromParent();
  auto Cast = B.buildAddrSpaceCast(ConstPtr, PtrReg);
  MI.getOperand(1).setReg(Cast.getReg(0));
  if (MI.getOpcode() != AMDGPU::G_LOAD)
  if (WideMemSize == ValSize) {
    MI.setMemRefs(MF, {WideMMO});
  if (ValSize > WideMemSize)
    WideLoad = B.buildLoadFromOffset(WideTy, PtrReg, *MMO, 0).getReg(0);
    B.buildTrunc(ValReg, WideLoad).getReg(0);
    WideLoad = B.buildLoadFromOffset(WideTy, PtrReg, *MMO, 0).getReg(0);
    B.buildExtract(ValReg, WideLoad, 0);
    WideLoad = B.buildLoadFromOffset(WideTy, PtrReg, *MMO, 0).getReg(0);
    B.buildDeleteTrailingVectorElements(ValReg, WideLoad);
  MI.eraseFromParent();
  Register DataReg = MI.getOperand(0).getReg();
         "this should not have been custom lowered");
  Register PackedVal = B.buildBuildVector(VecTy, {NewVal, CmpVal}).getReg(0);
  B.buildInstr(AMDGPU::G_AMDGPU_ATOMIC_CMPXCHG)
      .setMemRefs(MI.memoperands());
  MI.eraseFromParent();
  switch (DefMI->getOpcode()) {
  case TargetOpcode::G_INTRINSIC: {
    case Intrinsic::amdgcn_frexp_mant:
    case Intrinsic::amdgcn_log:
    case Intrinsic::amdgcn_log_clamp:
    case Intrinsic::amdgcn_exp2:
    case Intrinsic::amdgcn_sqrt:
  case TargetOpcode::G_FSQRT:
  case TargetOpcode::G_FFREXP: {
    if (DefMI->getOperand(0).getReg() == Src)
  case TargetOpcode::G_FPEXT: {
std::pair<Register, Register>
                                           unsigned Flags) const {
  auto SmallestNormal = B.buildFConstant(
  auto IsLtSmallestNormal =
  auto Scale32 = B.buildFConstant(F32, 0x1.0p+32);
  auto One = B.buildFConstant(F32, 1.0);
      B.buildSelect(F32, IsLtSmallestNormal, Scale32, One, Flags);
  auto ScaledInput = B.buildFMul(F32, Src, ScaleFactor, Flags);
  return {ScaledInput.getReg(0), IsLtSmallestNormal.getReg(0)};
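// Inputs below the smallest normal f32 are pre-multiplied by 2^32 so the
// hardware log approximation sees a normal value; the flag returned with
// the scaled input lets callers subtract the scaling back out afterwards.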
  LLT Ty = B.getMRI()->getType(Dst);
  unsigned Flags = MI.getFlags();
    auto Ext = B.buildFPExt(F32, Src, Flags);
    auto Log2 = B.buildIntrinsic(Intrinsic::amdgcn_log, {F32})
                    .addUse(Ext.getReg(0))
    B.buildFPTrunc(Dst, Log2, Flags);
    MI.eraseFromParent();
    B.buildIntrinsic(Intrinsic::amdgcn_log, {MI.getOperand(0)})
    MI.eraseFromParent();
  auto Log2 = B.buildIntrinsic(Intrinsic::amdgcn_log, {Ty})
                  .addUse(ScaledInput)
  auto ThirtyTwo = B.buildFConstant(Ty, 32.0);
  auto Zero = B.buildFConstant(Ty, 0.0);
      B.buildSelect(Ty, IsLtSmallestNormal, ThirtyTwo, Zero, Flags);
  B.buildFSub(Dst, Log2, ResultOffset, Flags);
  MI.eraseFromParent();
  auto FMul = B.buildFMul(Ty, X, Y, Flags);
  return B.buildFAdd(Ty, FMul, Z, Flags).getReg(0);
  const bool IsLog10 = MI.getOpcode() == TargetOpcode::G_FLOG10;
  assert(IsLog10 || MI.getOpcode() == TargetOpcode::G_FLOG);
  unsigned Flags = MI.getFlags();
    auto PromoteSrc = B.buildFPExt(F32, X);
    B.buildFPTrunc(Dst, LogVal);
    MI.eraseFromParent();
      B.buildIntrinsic(Intrinsic::amdgcn_log, {Ty}).addUse(X).setMIFlags(Flags);
  if (ST.hasFastFMAF32()) {
    const float c_log10 = 0x1.344134p-2f;
    const float cc_log10 = 0x1.09f79ep-26f;
    const float c_log = 0x1.62e42ep-1f;
    const float cc_log = 0x1.efa39ep-25f;
    auto C = B.buildFConstant(Ty, IsLog10 ? c_log10 : c_log);
    auto CC = B.buildFConstant(Ty, IsLog10 ? cc_log10 : cc_log);
    R = B.buildFMul(Ty, Y, C, NewFlags).getReg(0);
    auto NegR = B.buildFNeg(Ty, R, NewFlags);
    auto FMA0 = B.buildFMA(Ty, Y, C, NegR, NewFlags);
    auto FMA1 = B.buildFMA(Ty, Y, CC, FMA0, NewFlags);
    R = B.buildFAdd(Ty, R, FMA1, NewFlags).getReg(0);
    const float ch_log10 = 0x1.344000p-2f;
    const float ct_log10 = 0x1.3509f6p-18f;
    const float ch_log = 0x1.62e000p-1f;
    const float ct_log = 0x1.0bfbe8p-15f;
    auto CH = B.buildFConstant(Ty, IsLog10 ? ch_log10 : ch_log);
    auto CT = B.buildFConstant(Ty, IsLog10 ? ct_log10 : ct_log);
    auto MaskConst = B.buildConstant(Ty, 0xfffff000);
    auto YH = B.buildAnd(Ty, Y, MaskConst);
    auto YT = B.buildFSub(Ty, Y, YH, Flags);
    auto YTCT = B.buildFMul(Ty, YT, CT, NewFlags);
        getMad(B, Ty, YH.getReg(0), CT.getReg(0), YTCT.getReg(0), NewFlags);
    R = getMad(B, Ty, YH.getReg(0), CH.getReg(0), Mad1, NewFlags);
  const bool IsFiniteOnly =
  if (!IsFiniteOnly) {
    auto Fabs = B.buildFAbs(Ty, Y);
    R = B.buildSelect(Ty, IsFinite, R, Y, Flags).getReg(0);
    auto Zero = B.buildFConstant(Ty, 0.0);
        B.buildFConstant(Ty, IsLog10 ? 0x1.344136p+3f : 0x1.62e430p+4f);
    auto Shift = B.buildSelect(Ty, IsScaled, ShiftK, Zero, Flags);
    B.buildFSub(Dst, R, Shift, Flags);
    B.buildCopy(Dst, R);
  MI.eraseFromParent();
                                        unsigned Flags) const {
  const double Log2BaseInverted =
  LLT Ty = B.getMRI()->getType(Dst);
    auto LogSrc = B.buildIntrinsic(Intrinsic::amdgcn_log, {Ty})
    auto ScaledResultOffset = B.buildFConstant(Ty, -32.0 * Log2BaseInverted);
    auto Zero = B.buildFConstant(Ty, 0.0);
        B.buildSelect(Ty, IsScaled, ScaledResultOffset, Zero, Flags);
    auto Log2Inv = B.buildFConstant(Ty, Log2BaseInverted);
    if (ST.hasFastFMAF32())
      B.buildFMA(Dst, LogSrc, Log2Inv, ResultOffset, Flags);
      auto Mul = B.buildFMul(Ty, LogSrc, Log2Inv, Flags);
      B.buildFAdd(Dst, Mul, ResultOffset, Flags);
          ? B.buildFLog2(Ty, Src, Flags)
          : B.buildIntrinsic(Intrinsic::amdgcn_log, {Ty})
  auto Log2BaseInvertedOperand = B.buildFConstant(Ty, Log2BaseInverted);
  B.buildFMul(Dst, Log2Operand, Log2BaseInvertedOperand, Flags);
  unsigned Flags = MI.getFlags();
  LLT Ty = B.getMRI()->getType(Dst);
    auto Ext = B.buildFPExt(F32, Src, Flags);
    auto Log2 = B.buildIntrinsic(Intrinsic::amdgcn_exp2, {F32})
                    .addUse(Ext.getReg(0))
    B.buildFPTrunc(Dst, Log2, Flags);
    MI.eraseFromParent();
    MI.eraseFromParent();
  auto RangeCheckConst = B.buildFConstant(Ty, -0x1.f80000p+6f);
                                  RangeCheckConst, Flags);
  auto SixtyFour = B.buildFConstant(Ty, 0x1.0p+6f);
  auto Zero = B.buildFConstant(Ty, 0.0);
  auto AddOffset = B.buildSelect(F32, NeedsScaling, SixtyFour, Zero, Flags);
  auto AddInput = B.buildFAdd(F32, Src, AddOffset, Flags);
  auto Exp2 = B.buildIntrinsic(Intrinsic::amdgcn_exp2, {Ty})
                  .addUse(AddInput.getReg(0))
  auto TwoExpNeg64 = B.buildFConstant(Ty, 0x1.0p-64f);
  auto One = B.buildFConstant(Ty, 1.0);
  auto ResultScale = B.buildSelect(F32, NeedsScaling, TwoExpNeg64, One, Flags);
  B.buildFMul(Dst, Exp2, ResultScale, Flags);
  MI.eraseFromParent();
                         const SrcOp &Src, unsigned Flags) {
  LLT Ty = Dst.getLLTTy(*B.getMRI());
    return B.buildIntrinsic(Intrinsic::amdgcn_exp2, {Dst})
        .addUse(Src.getReg())
  return B.buildFExp2(Dst, Src, Flags);
                                       bool IsExp10) const {
  LLT Ty = B.getMRI()->getType(X);
  auto Const = B.buildFConstant(Ty, IsExp10 ? 0x1.a934f0p+1f : numbers::log2e);
  auto Mul = B.buildFMul(Ty, X, Const, Flags);
  LLT Ty = B.getMRI()->getType(Dst);
  auto Threshold = B.buildFConstant(Ty, -0x1.5d58a0p+6f);
  auto ScaleOffset = B.buildFConstant(Ty, 0x1.0p+6f);
  auto ScaledX = B.buildFAdd(Ty, X, ScaleOffset, Flags);
  auto AdjustedX = B.buildSelect(Ty, NeedsScaling, ScaledX, X, Flags);
  auto ExpInput = B.buildFMul(Ty, AdjustedX, Log2E, Flags);
  auto Exp2 = B.buildIntrinsic(Intrinsic::amdgcn_exp2, {Ty})
                  .addUse(ExpInput.getReg(0))
  auto ResultScaleFactor = B.buildFConstant(Ty, 0x1.969d48p-93f);
  auto AdjustedResult = B.buildFMul(Ty, Exp2, ResultScaleFactor, Flags);
  B.buildSelect(Dst, NeedsScaling, AdjustedResult, Exp2, Flags);
                                          unsigned Flags) const {
  LLT Ty = B.getMRI()->getType(Dst);
  auto K0 = B.buildFConstant(Ty, 0x1.a92000p+1f);
  auto K1 = B.buildFConstant(Ty, 0x1.4f0978p-11f);
  auto Mul1 = B.buildFMul(Ty, X, K1, Flags);
  auto Exp2_1 = buildExp(B, Ty, Mul1, Flags);
  auto Mul0 = B.buildFMul(Ty, X, K0, Flags);
  auto Exp2_0 = buildExp(B, Ty, Mul0, Flags);
  B.buildFMul(Dst, Exp2_0, Exp2_1, Flags);
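// log2(10) is split into a coarse part K0 and a small correction K1;
// 10^x is then formed as exp2(x*K0) * exp2(x*K1), so multiplying the two
// exp2 results restores the precision a single product would lose.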
  auto Threshold = B.buildFConstant(Ty, -0x1.2f7030p+5f);
  auto ScaleOffset = B.buildFConstant(Ty, 0x1.0p+5f);
  auto ScaledX = B.buildFAdd(Ty, X, ScaleOffset, Flags);
  auto AdjustedX = B.buildSelect(Ty, NeedsScaling, ScaledX, X);
  auto K0 = B.buildFConstant(Ty, 0x1.a92000p+1f);
  auto K1 = B.buildFConstant(Ty, 0x1.4f0978p-11f);
  auto Mul1 = B.buildFMul(Ty, AdjustedX, K1, Flags);
  auto Exp2_1 = buildExp(B, Ty, Mul1, Flags);
  auto Mul0 = B.buildFMul(Ty, AdjustedX, K0, Flags);
  auto Exp2_0 = buildExp(B, Ty, Mul0, Flags);
  auto MulExps = B.buildFMul(Ty, Exp2_0, Exp2_1, Flags);
  auto ResultScaleFactor = B.buildFConstant(Ty, 0x1.9f623ep-107f);
  auto AdjustedResult = B.buildFMul(Ty, MulExps, ResultScaleFactor, Flags);
  B.buildSelect(Dst, NeedsScaling, AdjustedResult, MulExps);
  if (MI.getOpcode() == TargetOpcode::G_FEXP2) {
    Dn = B.buildFRint(S64, X, Flags).getReg(0);
    F = B.buildFSub(S64, X, Dn, Flags).getReg(0);
    auto C1 = B.buildFConstant(S64, APFloat(0x1.62e42fefa39efp-1));
    auto C2 = B.buildFConstant(S64, APFloat(0x1.abc9e3b39803fp-56));
    auto Mul2 = B.buildFMul(S64, F, C2, Flags).getReg(0);
    T = B.buildFMA(S64, F, C1, Mul2, Flags).getReg(0);
  } else if (MI.getOpcode() == TargetOpcode::G_FEXP10) {
    auto C1 = B.buildFConstant(S64, APFloat(0x1.a934f0979a371p+1));
    auto Mul = B.buildFMul(S64, X, C1, Flags).getReg(0);
    Dn = B.buildFRint(S64, Mul, Flags).getReg(0);
    auto NegDn = B.buildFNeg(S64, Dn, Flags).getReg(0);
    auto C2 = B.buildFConstant(S64, APFloat(-0x1.9dc1da994fd21p-59));
    auto C3 = B.buildFConstant(S64, APFloat(0x1.34413509f79ffp-2));
    auto Inner = B.buildFMA(S64, NegDn, C3, X, Flags).getReg(0);
    F = B.buildFMA(S64, NegDn, C2, Inner, Flags).getReg(0);
    auto C4 = B.buildFConstant(S64, APFloat(0x1.26bb1bbb55516p+1));
    auto C5 = B.buildFConstant(S64, APFloat(-0x1.f48ad494ea3e9p-53));
    auto MulF = B.buildFMul(S64, F, C5, Flags).getReg(0);
    T = B.buildFMA(S64, F, C4, MulF, Flags).getReg(0);
    auto C1 = B.buildFConstant(S64, APFloat(0x1.71547652b82fep+0));
    auto Mul = B.buildFMul(S64, X, C1, Flags).getReg(0);
    Dn = B.buildFRint(S64, Mul, Flags).getReg(0);
    auto NegDn = B.buildFNeg(S64, Dn, Flags).getReg(0);
    auto C2 = B.buildFConstant(S64, APFloat(0x1.abc9e3b39803fp-56));
    auto C3 = B.buildFConstant(S64, APFloat(0x1.62e42fefa39efp-1));
    auto Inner = B.buildFMA(S64, NegDn, C3, X, Flags).getReg(0);
    T = B.buildFMA(S64, NegDn, C2, Inner, Flags).getReg(0);
  auto P = B.buildFConstant(S64, 0x1.ade156a5dcb37p-26);
  P = B.buildFMA(S64, T, P, B.buildFConstant(S64, 0x1.28af3fca7ab0cp-22),
  P = B.buildFMA(S64, T, P, B.buildFConstant(S64, 0x1.71dee623fde64p-19),
  P = B.buildFMA(S64, T, P, B.buildFConstant(S64, 0x1.a01997c89e6b0p-16),
  P = B.buildFMA(S64, T, P, B.buildFConstant(S64, 0x1.a01a014761f6ep-13),
  P = B.buildFMA(S64, T, P, B.buildFConstant(S64, 0x1.6c16c1852b7b0p-10),
  P = B.buildFMA(S64, T, P, B.buildFConstant(S64, 0x1.1111111122322p-7), Flags);
  P = B.buildFMA(S64, T, P, B.buildFConstant(S64, 0x1.55555555502a1p-5), Flags);
  P = B.buildFMA(S64, T, P, B.buildFConstant(S64, 0x1.5555555555511p-3), Flags);
  P = B.buildFMA(S64, T, P, B.buildFConstant(S64, 0x1.000000000000bp-1), Flags);
  auto One = B.buildFConstant(S64, 1.0);
  P = B.buildFMA(S64, T, P, One, Flags);
  P = B.buildFMA(S64, T, P, One, Flags);
  auto DnInt = B.buildFPTOSI(S32, Dn);
  auto Z = B.buildFLdexp(S64, P, DnInt, Flags);
  Z = B.buildSelect(S64, CondHi, Z, PInf, Flags);
  B.buildSelect(MI.getOperand(0).getReg(), CondLo, Z, Zero, Flags);
  MI.eraseFromParent();
  const unsigned Flags = MI.getFlags();
  const bool IsExp10 = MI.getOpcode() == TargetOpcode::G_FEXP10;
    MI.eraseFromParent();
    auto Ext = B.buildFPExt(F32, X, Flags);
    B.buildFPTrunc(Dst, Lowered, Flags);
    MI.eraseFromParent();
    MI.eraseFromParent();
  const unsigned FlagsNoContract = Flags & ~MachineInstr::FmContract;
  if (ST.hasFastFMAF32()) {
    const float cc_exp = 0x1.4ae0bep-26f;
    const float c_exp10 = 0x1.a934f0p+1f;
    const float cc_exp10 = 0x1.2f346ep-24f;
    auto C = B.buildFConstant(Ty, IsExp10 ? c_exp10 : c_exp);
    PH = B.buildFMul(Ty, X, C, Flags).getReg(0);
    auto NegPH = B.buildFNeg(Ty, PH, Flags);
    auto FMA0 = B.buildFMA(Ty, X, C, NegPH, Flags);
    auto CC = B.buildFConstant(Ty, IsExp10 ? cc_exp10 : cc_exp);
    PL = B.buildFMA(Ty, X, CC, FMA0, Flags).getReg(0);
    const float ch_exp = 0x1.714000p+0f;
    const float cl_exp = 0x1.47652ap-12f;
    const float ch_exp10 = 0x1.a92000p+1f;
    const float cl_exp10 = 0x1.4f0978p-11f;
    auto MaskConst = B.buildConstant(Ty, 0xfffff000);
    auto XH = B.buildAnd(Ty, X, MaskConst);
    auto XL = B.buildFSub(Ty, X, XH, Flags);
    auto CH = B.buildFConstant(Ty, IsExp10 ? ch_exp10 : ch_exp);
    PH = B.buildFMul(Ty, XH, CH, Flags).getReg(0);
    auto CL = B.buildFConstant(Ty, IsExp10 ? cl_exp10 : cl_exp);
    auto XLCL = B.buildFMul(Ty, XL, CL, Flags);
        getMad(B, Ty, XL.getReg(0), CH.getReg(0), XLCL.getReg(0), Flags);
    PL = getMad(B, Ty, XH.getReg(0), CL.getReg(0), Mad0, Flags);
  auto E = B.buildIntrinsicRoundeven(Ty, PH, Flags);
  auto PHSubE = B.buildFSub(Ty, PH, E, FlagsNoContract);
  auto A = B.buildFAdd(Ty, PHSubE, PL, Flags);
  auto Exp2 = B.buildIntrinsic(Intrinsic::amdgcn_exp2, {Ty})
                  .addUse(A.getReg(0))
  auto R = B.buildFLdexp(Ty, Exp2, IntE, Flags);
  auto UnderflowCheckConst =
      B.buildFConstant(Ty, IsExp10 ? -0x1.66d3e8p+5f : -0x1.9d1da0p+6f);
  auto Zero = B.buildFConstant(Ty, 0.0);
  R = B.buildSelect(Ty, Underflow, Zero, R);
  auto OverflowCheckConst =
      B.buildFConstant(Ty, IsExp10 ? 0x1.344136p+5f : 0x1.62e430p+6f);
  R = B.buildSelect(Ty, Overflow, Inf, R, Flags);
  B.buildCopy(Dst, R);
  MI.eraseFromParent();
  unsigned Flags = MI.getFlags();
  LLT Ty = B.getMRI()->getType(Dst);
    auto Log = B.buildFLog2(F32, Src0, Flags);
    auto Mul = B.buildIntrinsic(Intrinsic::amdgcn_fmul_legacy, {F32})
                   .addUse(Log.getReg(0))
    B.buildFExp2(Dst, Mul, Flags);
  } else if (Ty == F16) {
    auto Log = B.buildFLog2(F16, Src0, Flags);
    auto Ext0 = B.buildFPExt(F32, Log, Flags);
    auto Ext1 = B.buildFPExt(F32, Src1, Flags);
    auto Mul = B.buildIntrinsic(Intrinsic::amdgcn_fmul_legacy, {F32})
                   .addUse(Ext0.getReg(0))
                   .addUse(Ext1.getReg(0))
    B.buildFExp2(Dst, B.buildFPTrunc(F16, Mul), Flags);
  MI.eraseFromParent();
    ModSrc = SrcFNeg->getOperand(1).getReg();
      ModSrc = SrcFAbs->getOperand(1).getReg();
    ModSrc = SrcFAbs->getOperand(1).getReg();
  Register OrigSrc = MI.getOperand(1).getReg();
  unsigned Flags = MI.getFlags();
         "this should not have been custom lowered");
  auto Fract = B.buildIntrinsic(Intrinsic::amdgcn_fract, {F64})
    B.buildFMinNumIEEE(Min, Fract, Const, Flags);
    B.buildFMinNum(Min, Fract, Const, Flags);
    CorrectedFract = B.buildSelect(F64, IsNan, ModSrc, Min, Flags).getReg(0);
  auto NegFract = B.buildFNeg(F64, CorrectedFract, Flags);
  B.buildFAdd(Dst, OrigSrc, NegFract, Flags);
  MI.eraseFromParent();
  if (MI.getOpcode() == AMDGPU::G_BUILD_VECTOR_TRUNC) {
    Src0 = B.buildTrunc(S16, MI.getOperand(1).getReg()).getReg(0);
    Src1 = B.buildTrunc(S16, MI.getOperand(2).getReg()).getReg(0);
  auto Merge = B.buildMergeLikeInstr(S32, {Src0, Src1});
  B.buildBitcast(Dst, Merge);
  MI.eraseFromParent();
                                        bool UsePartialMad64_32,
                                        bool SeparateOddAlignedProducts) const {
  auto getZero32 = [&]() -> Register {
      Zero32 = B.buildConstant(S32, 0).getReg(0);
  auto getZero64 = [&]() -> Register {
      Zero64 = B.buildConstant(S64, 0).getReg(0);
  for (unsigned i = 0; i < Src0.size(); ++i) {
    if (CarryIn.empty())
    bool HaveCarryOut = true;
    if (CarryIn.size() == 1) {
      LocalAccum = B.buildZExt(S32, CarryIn[0]).getReg(0);
      CarryAccum = getZero32();
      CarryAccum = B.buildZExt(S32, CarryIn[0]).getReg(0);
      for (unsigned i = 1; i + 1 < CarryIn.size(); ++i) {
            B.buildUAdde(S32, S1, CarryAccum, getZero32(), CarryIn[i])
      LocalAccum = getZero32();
      HaveCarryOut = false;
        B.buildUAdde(S32, S1, CarryAccum, LocalAccum, CarryIn.back());
    LocalAccum = Add.getReg(0);
  auto buildMadChain =
    assert((DstIndex + 1 < Accum.size() && LocalAccum.size() == 2) ||
           (DstIndex + 1 >= Accum.size() && LocalAccum.size() == 1));
    if (LocalAccum.size() == 1 &&
        (!UsePartialMad64_32 || !CarryIn.empty())) {
        unsigned j1 = DstIndex - j0;
        if (Src0KnownZeros[j0] || Src1KnownZeros[j1]) {
        auto Mul = B.buildMul(S32, Src0[j0], Src1[j1]);
          LocalAccum[0] = Mul.getReg(0);
          if (CarryIn.empty()) {
            LocalAccum[0] = B.buildAdd(S32, LocalAccum[0], Mul).getReg(0);
                B.buildUAdde(S32, S1, LocalAccum[0], Mul, CarryIn.back())
      } while (j0 <= DstIndex && (!UsePartialMad64_32 || !CarryIn.empty()));
    if (j0 <= DstIndex) {
      bool HaveSmallAccum = false;
      if (LocalAccum[0]) {
        if (LocalAccum.size() == 1) {
          Tmp = B.buildAnyExt(S64, LocalAccum[0]).getReg(0);
          HaveSmallAccum = true;
        } else if (LocalAccum[1]) {
          Tmp = B.buildMergeLikeInstr(S64, LocalAccum).getReg(0);
          HaveSmallAccum = false;
          Tmp = B.buildZExt(S64, LocalAccum[0]).getReg(0);
          HaveSmallAccum = true;
        assert(LocalAccum.size() == 1 || !LocalAccum[1]);
        HaveSmallAccum = true;
        unsigned j1 = DstIndex - j0;
        if (Src0KnownZeros[j0] || Src1KnownZeros[j1]) {
        auto Mad = B.buildInstr(AMDGPU::G_AMDGPU_MAD_U64_U32, {S64, S1},
                                {Src0[j0], Src1[j1], Tmp});
        Tmp = Mad.getReg(0);
        if (!HaveSmallAccum)
          CarryOut.push_back(Mad.getReg(1));
        HaveSmallAccum = false;
      } while (j0 <= DstIndex);
      auto Unmerge = B.buildUnmerge(S32, Tmp);
      LocalAccum[0] = Unmerge.getReg(0);
      if (LocalAccum.size() > 1)
        LocalAccum[1] = Unmerge.getReg(1);
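// buildMadChain accumulates the partial products contributing to one or two
// adjacent 32-bit result words, preferring G_AMDGPU_MAD_U64_U32 and falling
// back to 32-bit mul/add chains; known-zero source words are skipped.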
  for (unsigned i = 0; i <= Accum.size() / 2; ++i) {
    Carry OddCarryIn = std::move(OddCarry);
    Carry EvenCarryIn = std::move(EvenCarry);
    if (2 * i < Accum.size()) {
      auto LocalAccum = Accum.drop_front(2 * i).take_front(2);
      EvenCarry = buildMadChain(LocalAccum, 2 * i, EvenCarryIn);
      if (!SeparateOddAlignedProducts) {
        auto LocalAccum = Accum.drop_front(2 * i - 1).take_front(2);
        OddCarry = buildMadChain(LocalAccum, 2 * i - 1, OddCarryIn);
        bool IsHighest = 2 * i >= Accum.size();
                               .take_front(IsHighest ? 1 : 2);
        OddCarry = buildMadChain(LocalAccum, 2 * i - 1, OddCarryIn);
          Lo = B.buildUAddo(S32, S1, Accum[2 * i - 1], SeparateOddOut[0]);
          Lo = B.buildAdd(S32, Accum[2 * i - 1], SeparateOddOut[0]);
          Lo = B.buildUAdde(S32, S1, Accum[2 * i - 1], SeparateOddOut[0],
        Accum[2 * i - 1] = Lo->getOperand(0).getReg();
          auto Hi = B.buildUAdde(S32, S1, Accum[2 * i], SeparateOddOut[1],
                                 Lo->getOperand(1).getReg());
          Accum[2 * i] = Hi.getReg(0);
          SeparateOddCarry = Hi.getReg(1);
    if (Register CarryOut = mergeCarry(Accum[2 * i - 1], OddCarryIn))
      EvenCarryIn.push_back(CarryOut);
    if (2 * i < Accum.size()) {
      if (Register CarryOut = mergeCarry(Accum[2 * i], EvenCarryIn))
        OddCarry.push_back(CarryOut);
  assert(ST.hasMad64_32());
  assert(MI.getOpcode() == TargetOpcode::G_MUL);
  unsigned Size = Ty.getSizeInBits();
  if (ST.hasVectorMulU64() && Size == 64)
  unsigned NumParts = Size / 32;
  const bool SeparateOddAlignedProducts = ST.hasFullRate64Ops();
  for (unsigned i = 0; i < NumParts; ++i) {
  B.buildUnmerge(Src0Parts, Src0);
  B.buildUnmerge(Src1Parts, Src1);
  buildMultiply(Helper, AccumRegs, Src0Parts, Src1Parts, UsePartialMad64_32,
                SeparateOddAlignedProducts);
  B.buildMergeLikeInstr(DstReg, AccumRegs);
  MI.eraseFromParent();
  unsigned NewOpc = MI.getOpcode() == AMDGPU::G_CTLZ
                        ? AMDGPU::G_AMDGPU_FFBH_U32
                        : AMDGPU::G_AMDGPU_FFBL_B32;
  auto Tmp = B.buildInstr(NewOpc, {DstTy}, {Src});
  MI.eraseFromParent();
  TypeSize NumBits = SrcTy.getSizeInBits();
  auto ShiftAmt = B.buildConstant(S32, 32u - NumBits);
  auto Extend = B.buildAnyExt(S32, {Src}).getReg(0u);
  auto Shift = B.buildShl(S32, Extend, ShiftAmt);
  auto Ctlz = B.buildInstr(AMDGPU::G_AMDGPU_FFBH_U32, {S32}, {Shift});
  B.buildTrunc(Dst, Ctlz);
  MI.eraseFromParent();
  assert(SrcTy == S32 && "legalizeCTLS only supports s32");
  unsigned BitWidth = SrcTy.getSizeInBits();
  auto Sffbh = B.buildIntrinsic(Intrinsic::amdgcn_sffbh, {S32}).addUse(Src);
  B.buildSub(Dst, Clamped, B.buildConstant(S32, 1));
  MI.eraseFromParent();
  if (MI.getOpcode() != TargetOpcode::G_XOR)
  return ConstVal == -1;
  Register CondDef = MI.getOperand(0).getReg();
    if (UseMI->getParent() != Parent ||
        UseMI->getOpcode() != AMDGPU::G_BRCOND)
    UncondBrTarget = &*NextMBB;
    if (Next->getOpcode() != AMDGPU::G_BR)
                                     *ArgRC, B.getDebugLoc(), ArgTy);
    const unsigned Mask = Arg->getMask();
    auto ShiftAmt = B.buildConstant(S32, Shift);
    AndMaskSrc = B.buildLShr(S32, LiveIn, ShiftAmt).getReg(0);
    B.buildAnd(DstReg, AndMaskSrc, B.buildConstant(S32, Mask >> Shift));
    B.buildCopy(DstReg, LiveIn);
  if (!ST.hasClusters()) {
    MI.eraseFromParent();
    auto One = B.buildConstant(S32, 1);
    auto ClusterSizeXYZ = B.buildAdd(S32, ClusterMaxIdXYZ, One);
    auto GlobalIdXYZ = B.buildAdd(S32, ClusterWorkGroupIdXYZ,
                                  B.buildMul(S32, ClusterIdXYZ, ClusterSizeXYZ));
    B.buildCopy(DstReg, GlobalIdXYZ);
    MI.eraseFromParent();
    B.buildCopy(DstReg, ClusterIdXYZ);
    MI.eraseFromParent();
  unsigned ClusterIdField = HwregEncoding::encode(ID_IB_STS2, 6, 4);
  MRI.setRegClass(ClusterId, &AMDGPU::SReg_32RegClass);
  B.buildInstr(AMDGPU::S_GETREG_B32_const)
      .addImm(ClusterIdField);
  auto Zero = B.buildConstant(S32, 0);
  B.buildSelect(DstReg, NoClusters, ClusterIdXYZ, GlobalIdXYZ);
  MI.eraseFromParent();
  auto LoadConstant = [&](unsigned N) {
    B.buildConstant(DstReg, N);
  if (ST.hasArchitectedSGPRs() &&
    Arg = &WorkGroupIDX;
    ArgRC = &AMDGPU::SReg_32RegClass;
    Arg = &WorkGroupIDY;
    ArgRC = &AMDGPU::SReg_32RegClass;
    Arg = &WorkGroupIDZ;
    ArgRC = &AMDGPU::SReg_32RegClass;
    if (HasFixedDims && ClusterDims.getDims()[0] == 1)
      return LoadConstant(0);
    Arg = &ClusterWorkGroupIDX;
    ArgRC = &AMDGPU::SReg_32RegClass;
    if (HasFixedDims && ClusterDims.getDims()[1] == 1)
      return LoadConstant(0);
    Arg = &ClusterWorkGroupIDY;
    ArgRC = &AMDGPU::SReg_32RegClass;
    if (HasFixedDims && ClusterDims.getDims()[2] == 1)
      return LoadConstant(0);
    Arg = &ClusterWorkGroupIDZ;
    ArgRC = &AMDGPU::SReg_32RegClass;
      return LoadConstant(ClusterDims.getDims()[0] - 1);
    Arg = &ClusterWorkGroupMaxIDX;
    ArgRC = &AMDGPU::SReg_32RegClass;
      return LoadConstant(ClusterDims.getDims()[1] - 1);
    Arg = &ClusterWorkGroupMaxIDY;
    ArgRC = &AMDGPU::SReg_32RegClass;
      return LoadConstant(ClusterDims.getDims()[2] - 1);
    Arg = &ClusterWorkGroupMaxIDZ;
    ArgRC = &AMDGPU::SReg_32RegClass;
    Arg = &ClusterWorkGroupMaxFlatID;
    ArgRC = &AMDGPU::SReg_32RegClass;
    return LoadConstant(0);
    B.buildUndef(DstReg);
  if (!Arg->isRegister() || !Arg->getRegister().isValid())
  MI.eraseFromParent();
  B.buildConstant(MI.getOperand(0).getReg(), C);
  MI.eraseFromParent();
  unsigned MaxID = ST.getMaxWorkitemID(B.getMF().getFunction(), Dim);
    B.buildUndef(DstReg);
    MI.eraseFromParent();
  if (Arg->isMasked()) {
  MI.eraseFromParent();
  Register KernArgReg = B.getMRI()->createGenericVirtualRegister(PtrTy);
  return B.buildObjectPtrOffset(PtrTy, KernArgReg, COffset).getReg(0);
                                           Align Alignment) const {
         "unexpected kernarg parameter type");
  MI.eraseFromParent();
5126 auto FloatY =
B.buildUITOFP(
S32,
Y);
5127 auto RcpIFlag =
B.buildInstr(AMDGPU::G_AMDGPU_RCP_IFLAG, {
S32}, {FloatY});
5129 auto ScaledY =
B.buildFMul(
S32, RcpIFlag, Scale);
5130 auto Z =
B.buildFPTOUI(
S32, ScaledY);
5133 auto NegY =
B.buildSub(
S32,
B.buildConstant(
S32, 0),
Y);
5134 auto NegYZ =
B.buildMul(
S32, NegY, Z);
5135 Z =
B.buildAdd(
S32, Z,
B.buildUMulH(
S32, Z, NegYZ));
5138 auto Q =
B.buildUMulH(
S32,
X, Z);
5139 auto R =
B.buildSub(
S32,
X,
B.buildMul(
S32, Q,
Y));
5142 auto One =
B.buildConstant(
S32, 1);
5145 Q =
B.buildSelect(
S32,
Cond,
B.buildAdd(
S32, Q, One), Q);
5151 B.buildSelect(DstDivReg,
Cond,
B.buildAdd(
S32, Q, One), Q);
5154 B.buildSelect(DstRemReg,
Cond,
B.buildSub(
S32, R,
Y), R);
auto Unmerge = B.buildUnmerge(S32, Val);

auto CvtLo = B.buildUITOFP(S32, Unmerge.getReg(0));
auto CvtHi = B.buildUITOFP(S32, Unmerge.getReg(1));

auto Mad = B.buildFMAD(
auto Rcp = B.buildInstr(AMDGPU::G_AMDGPU_RCP_IFLAG, {S32}, {Mad});
auto Mul1 = B.buildFMul(
auto Mul2 = B.buildFMul(
auto Trunc = B.buildIntrinsicTrunc(S32, Mul2);
auto Mad2 = B.buildFMAD(
auto ResultLo = B.buildFPTOUI(S32, Mad2);
auto ResultHi = B.buildFPTOUI(S32, Trunc);

return {ResultLo.getReg(0), ResultHi.getReg(0)};
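// Overview of the 64-bit unsigned division/remainder expansion below:
// starting from the 64-bit reciprocal estimate built above (RcpLo/RcpHi),
// two rounds of multiply and carry-propagating add refine the reciprocal,
// a 64-bit umulh of the numerator gives the quotient estimate (MulHi3),
// and the remainder is corrected with up to two conditional subtractions
// of the denominator driven by the compare/select chain at the end.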
auto Rcp = B.buildMergeLikeInstr(S64, {RcpLo, RcpHi});

auto Zero64 = B.buildConstant(S64, 0);
auto NegDenom = B.buildSub(S64, Zero64, Denom);

auto MulLo1 = B.buildMul(S64, NegDenom, Rcp);
auto MulHi1 = B.buildUMulH(S64, Rcp, MulLo1);

auto UnmergeMulHi1 = B.buildUnmerge(S32, MulHi1);
Register MulHi1_Lo = UnmergeMulHi1.getReg(0);
Register MulHi1_Hi = UnmergeMulHi1.getReg(1);

auto Add1_Lo = B.buildUAddo(S32, S1, RcpLo, MulHi1_Lo);
auto Add1_Hi = B.buildUAdde(S32, S1, RcpHi, MulHi1_Hi, Add1_Lo.getReg(1));
auto Add1 = B.buildMergeLikeInstr(S64, {Add1_Lo, Add1_Hi});

auto MulLo2 = B.buildMul(S64, NegDenom, Add1);
auto MulHi2 = B.buildUMulH(S64, Add1, MulLo2);
auto UnmergeMulHi2 = B.buildUnmerge(S32, MulHi2);
Register MulHi2_Lo = UnmergeMulHi2.getReg(0);
Register MulHi2_Hi = UnmergeMulHi2.getReg(1);

auto Zero32 = B.buildConstant(S32, 0);
auto Add2_Lo = B.buildUAddo(S32, S1, Add1_Lo, MulHi2_Lo);
auto Add2_Hi = B.buildUAdde(S32, S1, Add1_Hi, MulHi2_Hi, Add2_Lo.getReg(1));
auto Add2 = B.buildMergeLikeInstr(S64, {Add2_Lo, Add2_Hi});

auto UnmergeNumer = B.buildUnmerge(S32, Numer);
Register NumerLo = UnmergeNumer.getReg(0);
Register NumerHi = UnmergeNumer.getReg(1);

auto MulHi3 = B.buildUMulH(S64, Numer, Add2);
auto Mul3 = B.buildMul(S64, Denom, MulHi3);
auto UnmergeMul3 = B.buildUnmerge(S32, Mul3);
Register Mul3_Lo = UnmergeMul3.getReg(0);
Register Mul3_Hi = UnmergeMul3.getReg(1);
auto Sub1_Lo = B.buildUSubo(S32, S1, NumerLo, Mul3_Lo);
auto Sub1_Hi = B.buildUSube(S32, S1, NumerHi, Mul3_Hi, Sub1_Lo.getReg(1));
auto Sub1_Mi = B.buildSub(S32, NumerHi, Mul3_Hi);
auto Sub1 = B.buildMergeLikeInstr(S64, {Sub1_Lo, Sub1_Hi});

auto UnmergeDenom = B.buildUnmerge(S32, Denom);
Register DenomLo = UnmergeDenom.getReg(0);
Register DenomHi = UnmergeDenom.getReg(1);

auto C1 = B.buildSExt(S32, CmpHi);

auto C2 = B.buildSExt(S32, CmpLo);

auto C3 = B.buildSelect(S32, CmpEq, C2, C1);

auto Sub2_Lo = B.buildUSubo(S32, S1, Sub1_Lo, DenomLo);
auto Sub2_Mi = B.buildUSube(S32, S1, Sub1_Mi, DenomHi, Sub1_Lo.getReg(1));
auto Sub2_Hi = B.buildUSube(S32, S1, Sub2_Mi, Zero32, Sub2_Lo.getReg(1));
auto Sub2 = B.buildMergeLikeInstr(S64, {Sub2_Lo, Sub2_Hi});

auto One64 = B.buildConstant(S64, 1);
auto Add3 = B.buildAdd(S64, MulHi3, One64);

auto C6 = B.buildSelect(

auto Add4 = B.buildAdd(S64, Add3, One64);
auto Sub3_Lo = B.buildUSubo(S32, S1, Sub2_Lo, DenomLo);

auto Sub3_Mi = B.buildUSube(S32, S1, Sub2_Mi, DenomHi, Sub2_Lo.getReg(1));
auto Sub3_Hi = B.buildUSube(S32, S1, Sub3_Mi, Zero32, Sub3_Lo.getReg(1));
auto Sub3 = B.buildMergeLikeInstr(S64, {Sub3_Lo, Sub3_Hi});

auto Sel1 = B.buildSelect(

auto Sel2 = B.buildSelect(
switch (MI.getOpcode()) {
case AMDGPU::G_UDIV: {
  DstDivReg = MI.getOperand(0).getReg();
case AMDGPU::G_UREM: {
  DstRemReg = MI.getOperand(0).getReg();
case AMDGPU::G_UDIVREM: {
  DstDivReg = MI.getOperand(0).getReg();
  DstRemReg = MI.getOperand(1).getReg();

const unsigned FirstSrcOpIdx = MI.getNumExplicitDefs();
Register Num = MI.getOperand(FirstSrcOpIdx).getReg();
Register Den = MI.getOperand(FirstSrcOpIdx + 1).getReg();

MI.eraseFromParent();
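// The signed division/remainder expansion below reuses the unsigned path:
// take absolute values via the arithmetic-shift sign mask
// (abs(x) = (x + sign) ^ sign, with sign = x >> (bits - 1)), divide
// unsigned, then restore the result sign with the same xor/subtract trick.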
if (Ty != S32 && Ty != S64)

const unsigned FirstSrcOpIdx = MI.getNumExplicitDefs();
Register LHS = MI.getOperand(FirstSrcOpIdx).getReg();
Register RHS = MI.getOperand(FirstSrcOpIdx + 1).getReg();

auto SignBitOffset = B.buildConstant(S32, Ty.getSizeInBits() - 1);
auto LHSign = B.buildAShr(Ty, LHS, SignBitOffset);
auto RHSign = B.buildAShr(Ty, RHS, SignBitOffset);

LHS = B.buildAdd(Ty, LHS, LHSign).getReg(0);
RHS = B.buildAdd(Ty, RHS, RHSign).getReg(0);

LHS = B.buildXor(Ty, LHS, LHSign).getReg(0);
RHS = B.buildXor(Ty, RHS, RHSign).getReg(0);

Register DstDivReg, DstRemReg, TmpDivReg, TmpRemReg;
switch (MI.getOpcode()) {
case AMDGPU::G_SDIV: {
  DstDivReg = MI.getOperand(0).getReg();
case AMDGPU::G_SREM: {
  DstRemReg = MI.getOperand(0).getReg();
case AMDGPU::G_SDIVREM: {
  DstDivReg = MI.getOperand(0).getReg();
  DstRemReg = MI.getOperand(1).getReg();

auto Sign = B.buildXor(Ty, LHSign, RHSign).getReg(0);
auto SignXor = B.buildXor(Ty, TmpDivReg, Sign).getReg(0);
B.buildSub(DstDivReg, SignXor, Sign);

auto Sign = LHSign.getReg(0);
auto SignXor = B.buildXor(Ty, TmpRemReg, Sign).getReg(0);
B.buildSub(DstRemReg, SignXor, Sign);

MI.eraseFromParent();
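// Fast, inaccurate fdiv paths below: fold x / 1.0 and x / -1.0 directly
// into amdgcn_rcp of the (possibly negated) denominator, and otherwise
// emit rcp(y) * x. The wider expansion that follows refines rcp(Y) with
// two FMA-based Newton-Raphson iterations before the final multiply and
// FMA correction.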
if (!AllowInaccurateRcp && ResTy != LLT::scalar(16))

if (CLHS->isExactlyValue(1.0)) {
  B.buildIntrinsic(Intrinsic::amdgcn_rcp, Res)

  MI.eraseFromParent();

if (CLHS->isExactlyValue(-1.0)) {
  auto FNeg = B.buildFNeg(ResTy, RHS, Flags);
  B.buildIntrinsic(Intrinsic::amdgcn_rcp, Res)
      .addUse(FNeg.getReg(0))

  MI.eraseFromParent();

if (!AllowInaccurateRcp && (ResTy != LLT::scalar(16) ||

auto RCP = B.buildIntrinsic(Intrinsic::amdgcn_rcp, {ResTy})

B.buildFMul(Res, LHS, RCP, Flags);

MI.eraseFromParent();

if (!AllowInaccurateRcp)

auto NegY = B.buildFNeg(ResTy, Y);
auto One = B.buildFConstant(ResTy, 1.0);

auto R = B.buildIntrinsic(Intrinsic::amdgcn_rcp, {ResTy})

auto Tmp0 = B.buildFMA(ResTy, NegY, R, One);
R = B.buildFMA(ResTy, Tmp0, R, R);

auto Tmp1 = B.buildFMA(ResTy, NegY, R, One);
R = B.buildFMA(ResTy, Tmp1, R, R);

auto Ret = B.buildFMul(ResTy, X, R);
auto Tmp2 = B.buildFMA(ResTy, NegY, Ret, X);

B.buildFMA(Res, Tmp2, R, Ret);
MI.eraseFromParent();
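// f16 division expansion below: extend both operands to f32, compute
// rcp(rhs) and a quotient estimate, run error/correction steps (FMAD when
// MAD/MAC f32 instructions are available, FMA otherwise), mask the final
// correction to its sign/exponent bits (0xff800000), add it back in, and
// feed the truncated f16 result through amdgcn_div_fixup.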
auto LHSExt = B.buildFPExt(S32, LHS, Flags);
auto RHSExt = B.buildFPExt(S32, RHS, Flags);
auto NegRHSExt = B.buildFNeg(S32, RHSExt);
auto Rcp = B.buildIntrinsic(Intrinsic::amdgcn_rcp, {S32})
               .addUse(RHSExt.getReg(0))
auto Quot = B.buildFMul(S32, LHSExt, Rcp, Flags);
if (ST.hasMadMacF32Insts()) {
  Err = B.buildFMAD(S32, NegRHSExt, Quot, LHSExt, Flags);
  Quot = B.buildFMAD(S32, Err, Rcp, Quot, Flags);
  Err = B.buildFMAD(S32, NegRHSExt, Quot, LHSExt, Flags);

  Err = B.buildFMA(S32, NegRHSExt, Quot, LHSExt, Flags);
  Quot = B.buildFMA(S32, Err, Rcp, Quot, Flags);
  Err = B.buildFMA(S32, NegRHSExt, Quot, LHSExt, Flags);

auto Tmp = B.buildFMul(S32, Err, Rcp, Flags);
Tmp = B.buildAnd(S32, Tmp, B.buildConstant(S32, 0xff800000));
Quot = B.buildFAdd(S32, Tmp, Quot, Flags);
auto RDst = B.buildFPTrunc(S16, Quot, Flags);
B.buildIntrinsic(Intrinsic::amdgcn_div_fixup, Res)
    .addUse(RDst.getReg(0))

MI.eraseFromParent();
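// The helper below switches the FP32 denormal mode around the f32 division
// expansion: with S_DENORM_MODE when the subtarget has it, otherwise by
// writing the mode register via S_SETREG. The f32 expansion that follows
// brackets its div_scale / rcp / FMA chain with these toggles whenever the
// function does not already preserve denormals, saving and restoring a
// dynamically set denormal mode when necessary.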
unsigned SPDenormMode =

if (ST.hasDenormModeInst()) {
  uint32_t DPDenormModeDefault = Mode.fpDenormModeDPValue();

  uint32_t NewDenormModeValue = SPDenormMode | (DPDenormModeDefault << 2);
  B.buildInstr(AMDGPU::S_DENORM_MODE)
      .addImm(NewDenormModeValue);

  B.buildInstr(AMDGPU::S_SETREG_IMM32_B32)
      .addImm(SPDenormMode)
auto One = B.buildFConstant(S32, 1.0f);

auto DenominatorScaled =
    B.buildIntrinsic(Intrinsic::amdgcn_div_scale, {S32, S1})

auto NumeratorScaled =
    B.buildIntrinsic(Intrinsic::amdgcn_div_scale, {S32, S1})

auto ApproxRcp = B.buildIntrinsic(Intrinsic::amdgcn_rcp, {S32})
                     .addUse(DenominatorScaled.getReg(0))
auto NegDivScale0 = B.buildFNeg(S32, DenominatorScaled, Flags);

const bool HasDynamicDenormals =

if (!PreservesDenormals) {
  if (HasDynamicDenormals) {
    B.buildInstr(AMDGPU::S_GETREG_B32)
        .addDef(SavedSPDenormMode)

auto Fma0 = B.buildFMA(S32, NegDivScale0, ApproxRcp, One, Flags);
auto Fma1 = B.buildFMA(S32, Fma0, ApproxRcp, ApproxRcp, Flags);
auto Mul = B.buildFMul(S32, NumeratorScaled, Fma1, Flags);
auto Fma2 = B.buildFMA(S32, NegDivScale0, Mul, NumeratorScaled, Flags);
auto Fma3 = B.buildFMA(S32, Fma2, Fma1, Mul, Flags);
auto Fma4 = B.buildFMA(S32, NegDivScale0, Fma3, NumeratorScaled, Flags);

if (!PreservesDenormals) {
  if (HasDynamicDenormals) {
    assert(SavedSPDenormMode);
    B.buildInstr(AMDGPU::S_SETREG_B32)
        .addReg(SavedSPDenormMode)

auto Fmas = B.buildIntrinsic(Intrinsic::amdgcn_div_fmas, {S32})
                .addUse(Fma4.getReg(0))
                .addUse(Fma1.getReg(0))
                .addUse(Fma3.getReg(0))
                .addUse(NumeratorScaled.getReg(1))

B.buildIntrinsic(Intrinsic::amdgcn_div_fixup, Res)
    .addUse(Fmas.getReg(0))

MI.eraseFromParent();
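// f64 division below follows the same div_scale / rcp / FMA-refinement /
// div_fmas / div_fixup recipe as f32, with additional FMA iterations for
// the wider type. On subtargets where div_scale's condition output is not
// usable, the scale predicate is recomputed by comparing the unmerged high
// halves of the operands against the scaled values and xor-ing the results.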
auto One = B.buildFConstant(S64, 1.0);

auto DivScale0 = B.buildIntrinsic(Intrinsic::amdgcn_div_scale, {S64, S1})

auto NegDivScale0 = B.buildFNeg(S64, DivScale0.getReg(0), Flags);

auto Rcp = B.buildIntrinsic(Intrinsic::amdgcn_rcp, {S64})
               .addUse(DivScale0.getReg(0))

auto Fma0 = B.buildFMA(S64, NegDivScale0, Rcp, One, Flags);
auto Fma1 = B.buildFMA(S64, Rcp, Fma0, Rcp, Flags);
auto Fma2 = B.buildFMA(S64, NegDivScale0, Fma1, One, Flags);

auto DivScale1 = B.buildIntrinsic(Intrinsic::amdgcn_div_scale, {S64, S1})

auto Fma3 = B.buildFMA(S64, Fma1, Fma2, Fma1, Flags);
auto Mul = B.buildFMul(S64, DivScale1.getReg(0), Fma3, Flags);
auto Fma4 = B.buildFMA(S64, NegDivScale0, Mul, DivScale1.getReg(0), Flags);

if (!ST.hasUsableDivScaleConditionOutput()) {

  auto NumUnmerge = B.buildUnmerge(S32, LHS);
  auto DenUnmerge = B.buildUnmerge(S32, RHS);
  auto Scale0Unmerge = B.buildUnmerge(S32, DivScale0);
  auto Scale1Unmerge = B.buildUnmerge(S32, DivScale1);

                              Scale1Unmerge.getReg(1));
                              Scale0Unmerge.getReg(1));
  Scale = B.buildXor(S1, CmpNum, CmpDen).getReg(0);

  Scale = DivScale1.getReg(1);

auto Fmas = B.buildIntrinsic(Intrinsic::amdgcn_div_fmas, {S64})
                .addUse(Fma4.getReg(0))
                .addUse(Fma3.getReg(0))
                .addUse(Mul.getReg(0))

B.buildIntrinsic(Intrinsic::amdgcn_div_fixup, ArrayRef(Res))
    .addUse(Fmas.getReg(0))

MI.eraseFromParent();
auto Mant = B.buildIntrinsic(Intrinsic::amdgcn_frexp_mant, {Ty})

auto Exp = B.buildIntrinsic(Intrinsic::amdgcn_frexp_exp, {InstrExpTy})

if (ST.hasFractBug()) {
  auto Fabs = B.buildFAbs(Ty, Val);

  auto Zero = B.buildConstant(InstrExpTy, 0);
  Exp = B.buildSelect(InstrExpTy, IsFinite, Exp, Zero);
  Mant = B.buildSelect(Ty, IsFinite, Mant, Val);

B.buildCopy(Res0, Mant);
B.buildSExtOrTrunc(Res1, Exp);

MI.eraseFromParent();
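// amdgcn_fdiv_fast lowering below: pre-scale the denominator when |rhs| is
// large (select between 2^-32 and 1.0 based on the comparison against
// 2^96), take rcp of the scaled denominator, multiply by the numerator,
// and re-apply the same scale factor to the result.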
auto Abs = B.buildFAbs(S32, RHS, Flags);

auto C0 = B.buildFConstant(S32, 0x1p+96f);
auto C1 = B.buildFConstant(S32, 0x1p-32f);
auto C2 = B.buildFConstant(S32, 1.0f);

auto Sel = B.buildSelect(S32, CmpRes, C1, C2, Flags);

auto Mul0 = B.buildFMul(S32, RHS, Sel, Flags);

auto RCP = B.buildIntrinsic(Intrinsic::amdgcn_rcp, {S32})
               .addUse(Mul0.getReg(0))

auto Mul1 = B.buildFMul(S32, LHS, RCP, Flags);

B.buildFMul(Res, Sel, Mul1, Flags);

MI.eraseFromParent();
unsigned Flags = MI.getFlags();
assert(!ST.has16BitInsts());

auto Ext = B.buildFPExt(F32, MI.getOperand(1), Flags);
auto Log2 = B.buildIntrinsic(Intrinsic::amdgcn_sqrt, {F32})
                .addUse(Ext.getReg(0))
B.buildFPTrunc(MI.getOperand(0), Log2, Flags);
MI.eraseFromParent();
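// f32 sqrt expansion below: inputs under a small threshold are scaled up by
// 2^32 first, and the result is scaled back down by 2^-16 at the end. One
// path refines the hardware sqrt estimate by nudging it to the neighbouring
// representable values (one step down and up) and checking the FMA
// residuals; the other starts from amdgcn_rsq and applies Newton-style FMA
// refinement. Zero and infinity pass through unchanged via the final select.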
const unsigned Flags = MI.getFlags();

MI.eraseFromParent();

auto ScaleThreshold = B.buildFConstant(F32, 0x1.0p-96f);

auto ScaleUpFactor = B.buildFConstant(F32, 0x1.0p+32f);
auto ScaledX = B.buildFMul(F32, X, ScaleUpFactor, Flags);
auto SqrtX = B.buildSelect(F32, NeedScale, ScaledX, X, Flags);

    .addUse(SqrtX.getReg(0))

auto NegOne = B.buildConstant(I32, -1);
auto SqrtSNextDown = B.buildAdd(I32, SqrtS, NegOne);

auto NegSqrtSNextDown = B.buildFNeg(F32, SqrtSNextDown, Flags);
auto SqrtVP = B.buildFMA(F32, NegSqrtSNextDown, SqrtS, SqrtX, Flags);

auto PosOne = B.buildConstant(I32, 1);
auto SqrtSNextUp = B.buildAdd(I32, SqrtS, PosOne);

auto NegSqrtSNextUp = B.buildFNeg(F32, SqrtSNextUp, Flags);
auto SqrtVS = B.buildFMA(F32, NegSqrtSNextUp, SqrtS, SqrtX, Flags);

auto Zero = B.buildFConstant(F32, 0.0f);

B.buildSelect(F32, SqrtVPLE0, SqrtSNextDown, SqrtS, Flags).getReg(0);

B.buildSelect(F32, SqrtVPVSGT0, SqrtSNextUp, SqrtS, Flags).getReg(0);

B.buildIntrinsic(Intrinsic::amdgcn_rsq, {F32}).addReg(SqrtX.getReg(0));
B.buildFMul(SqrtS, SqrtX, SqrtR, Flags);

auto Half = B.buildFConstant(F32, 0.5f);
auto SqrtH = B.buildFMul(F32, SqrtR, Half, Flags);
auto NegSqrtH = B.buildFNeg(F32, SqrtH, Flags);
auto SqrtE = B.buildFMA(F32, NegSqrtH, SqrtS, Half, Flags);
SqrtH = B.buildFMA(F32, SqrtH, SqrtE, SqrtH, Flags);
SqrtS = B.buildFMA(F32, SqrtS, SqrtE, SqrtS, Flags).getReg(0);
auto NegSqrtS = B.buildFNeg(F32, SqrtS, Flags);
auto SqrtD = B.buildFMA(F32, NegSqrtS, SqrtS, SqrtX, Flags);
SqrtS = B.buildFMA(F32, SqrtD, SqrtH, SqrtS, Flags).getReg(0);

auto ScaleDownFactor = B.buildFConstant(F32, 0x1.0p-16f);

auto ScaledDown = B.buildFMul(F32, SqrtS, ScaleDownFactor, Flags);

SqrtS = B.buildSelect(F32, NeedScale, ScaledDown, SqrtS, Flags).getReg(0);

B.buildSelect(Dst, IsZeroOrInf, SqrtX, SqrtS, Flags);

MI.eraseFromParent();
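// f64 sqrt expansion below: small inputs are scaled by 2^256 via ldexp, the
// initial estimate comes from amdgcn_rsq, and several FMA refinement steps
// sharpen the result, which is scaled back by 2^-128 and selected against
// the original value for zero/infinity inputs.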
unsigned Flags = MI.getFlags();

auto ScaleConstant = B.buildFConstant(F64, 0x1.0p-767);

ZeroInt = B.buildConstant(S32, 0).getReg(0);

auto ScaleUpFactor = B.buildConstant(S32, 256);
auto ScaleUp = B.buildSelect(S32, Scaling, ScaleUpFactor, ZeroInt);
SqrtX = B.buildFLdexp(F64, X, ScaleUp, Flags).getReg(0);

auto SqrtY = B.buildIntrinsic(Intrinsic::amdgcn_rsq, {F64}).addReg(SqrtX);

auto Half = B.buildFConstant(F64, 0.5);
auto SqrtH0 = B.buildFMul(F64, SqrtY, Half);
auto SqrtS0 = B.buildFMul(F64, SqrtX, SqrtY);

auto NegSqrtH0 = B.buildFNeg(F64, SqrtH0);
auto SqrtR0 = B.buildFMA(F64, NegSqrtH0, SqrtS0, Half);

auto SqrtS1 = B.buildFMA(F64, SqrtS0, SqrtR0, SqrtS0);
auto SqrtH1 = B.buildFMA(F64, SqrtH0, SqrtR0, SqrtH0);

auto NegSqrtS1 = B.buildFNeg(F64, SqrtS1);
auto SqrtD0 = B.buildFMA(F64, NegSqrtS1, SqrtS1, SqrtX);

auto SqrtS2 = B.buildFMA(F64, SqrtD0, SqrtH1, SqrtS1);

Register SqrtRet = SqrtS2.getReg(0);

auto NegSqrtS2 = B.buildFNeg(F64, SqrtS2);
auto SqrtD1 = B.buildFMA(F64, NegSqrtS2, SqrtS2, SqrtX);
auto SqrtD2 = B.buildFMA(F64, SqrtD1, SqrtH1, SqrtS2);

auto ScaleDownFactor = B.buildConstant(S32, -128);
auto ScaleDown = B.buildSelect(S32, Scaling, ScaleDownFactor, ZeroInt);
SqrtRet = B.buildFLdexp(F64, SqrtD2, ScaleDown, Flags).getReg(0);

auto ZeroFP = B.buildFConstant(F64, 0.0);

B.buildSelect(Dst, IsZeroOrInf, SqrtX, SqrtRet, Flags);

MI.eraseFromParent();
auto Flags = MI.getFlags();

auto Rsq = B.buildIntrinsic(Intrinsic::amdgcn_rsq, {Ty})

auto ClampMax = UseIEEE ? B.buildFMinNumIEEE(Ty, Rsq, MaxFlt, Flags)
                        : B.buildFMinNum(Ty, Rsq, MaxFlt, Flags);

  B.buildFMaxNumIEEE(Dst, ClampMax, MinFlt, Flags);

  B.buildFMaxNum(Dst, ClampMax, MinFlt, Flags);
MI.eraseFromParent();
6102 bool IsPermLane16 = IID == Intrinsic::amdgcn_permlane16 ||
6103 IID == Intrinsic::amdgcn_permlanex16;
6104 bool IsSetInactive = IID == Intrinsic::amdgcn_set_inactive ||
6105 IID == Intrinsic::amdgcn_set_inactive_chain_arg;
6109 auto LaneOp =
B.buildIntrinsic(IID, {VT}).addUse(Src0);
6111 case Intrinsic::amdgcn_readfirstlane:
6112 case Intrinsic::amdgcn_permlane64:
6113 return LaneOp.getReg(0);
6114 case Intrinsic::amdgcn_readlane:
6115 case Intrinsic::amdgcn_set_inactive:
6116 case Intrinsic::amdgcn_set_inactive_chain_arg:
6117 return LaneOp.addUse(Src1).getReg(0);
6118 case Intrinsic::amdgcn_writelane:
6119 return LaneOp.addUse(Src1).addUse(Src2).getReg(0);
6120 case Intrinsic::amdgcn_permlane16:
6121 case Intrinsic::amdgcn_permlanex16: {
6123 int64_t Src4 =
MI.getOperand(6).getImm();
6124 int64_t Src5 =
MI.getOperand(7).getImm();
6125 return LaneOp.addUse(Src1)
6132 case Intrinsic::amdgcn_mov_dpp8:
6133 return LaneOp.addImm(
MI.getOperand(3).getImm()).getReg(0);
6134 case Intrinsic::amdgcn_update_dpp:
6135 return LaneOp.addUse(Src1)
6136 .addImm(
MI.getOperand(4).getImm())
6137 .addImm(
MI.getOperand(5).getImm())
6138 .addImm(
MI.getOperand(6).getImm())
6139 .addImm(
MI.getOperand(7).getImm())
6149 if (IID == Intrinsic::amdgcn_readlane || IID == Intrinsic::amdgcn_writelane ||
6150 IID == Intrinsic::amdgcn_update_dpp || IsSetInactive || IsPermLane16) {
6151 Src1 =
MI.getOperand(3).getReg();
6152 if (IID == Intrinsic::amdgcn_writelane || IsPermLane16) {
6153 Src2 =
MI.getOperand(4).getReg();
6158 unsigned Size = Ty.getSizeInBits();
6160 unsigned SplitSize = 32;
6161 if (IID == Intrinsic::amdgcn_update_dpp && (
Size % 64 == 0) &&
6162 ST.hasDPALU_DPP() &&
6166 if (
Size == SplitSize) {
6172 Src0 =
B.buildAnyExt(
S32, Src0).getReg(0);
6174 if (IID == Intrinsic::amdgcn_update_dpp || IsSetInactive || IsPermLane16)
6177 if (IID == Intrinsic::amdgcn_writelane)
6180 Register LaneOpDst = createLaneOp(Src0, Src1, Src2,
S32);
6181 B.buildTrunc(DstReg, LaneOpDst);
6182 MI.eraseFromParent();
6186 if (
Size % SplitSize != 0)
6190 bool NeedsBitcast =
false;
6191 if (Ty.isVector()) {
6194 if (EltSize == SplitSize) {
6195 PartialResTy = EltTy;
6196 }
else if (EltSize == 16 || EltSize == 32) {
6197 unsigned NElem = SplitSize / EltSize;
6201 NeedsBitcast =
true;
6206 unsigned NumParts =
Size / SplitSize;
6210 if (IID == Intrinsic::amdgcn_update_dpp || IsSetInactive || IsPermLane16)
6211 Src1Parts =
B.buildUnmerge(PartialResTy, Src1);
6213 if (IID == Intrinsic::amdgcn_writelane)
6214 Src2Parts =
B.buildUnmerge(PartialResTy, Src2);
6216 for (
unsigned i = 0; i < NumParts; ++i) {
6217 Src0 = Src0Parts.
getReg(i);
6219 if (IID == Intrinsic::amdgcn_update_dpp || IsSetInactive || IsPermLane16)
6220 Src1 = Src1Parts.
getReg(i);
6222 if (IID == Intrinsic::amdgcn_writelane)
6223 Src2 = Src2Parts.
getReg(i);
6225 PartialRes.
push_back(createLaneOp(Src0, Src1, Src2, PartialResTy));
6229 B.buildBitcast(DstReg,
B.buildMergeLikeInstr(
6232 B.buildMergeLikeInstr(DstReg, PartialRes);
6234 MI.eraseFromParent();
6242 ST.getTargetLowering()->getImplicitParameterOffset(
6252 B.buildObjectPtrOffset(DstReg, KernargPtrReg,
6253 B.buildConstant(IdxTy,
Offset).getReg(0));
6264 Register Pointer =
MI.getOperand(2).getReg();
6266 Register NumRecords =
MI.getOperand(4).getReg();
6272 B.setInsertPt(
B.getMBB(), ++
B.getInsertPt());
6274 auto ExtStride =
B.buildAnyExt(
S32, Stride);
6276 if (ST.has45BitNumRecordsBufferResource()) {
6281 auto PointerInt =
B.buildPtrToInt(PtrIntTy, Pointer);
6282 auto ExtPointer =
B.buildAnyExtOrTrunc(
S64, PointerInt);
6283 auto NumRecordsLHS =
B.buildShl(
S64, NumRecords,
B.buildConstant(
S32, 57));
6284 Register LowHalf =
B.buildOr(
S64, ExtPointer, NumRecordsLHS).getReg(0);
6288 auto NumRecordsRHS =
B.buildLShr(
S64, NumRecords,
B.buildConstant(
S32, 7));
6289 auto ShiftedStride =
B.buildShl(
S32, ExtStride,
B.buildConstant(
S32, 12));
6290 auto ExtShiftedStride =
6291 B.buildMergeValues(
S64, {Zero, ShiftedStride.getReg(0)});
6292 auto ShiftedFlags =
B.buildShl(
S32, Flags,
B.buildConstant(
S32, 28));
6293 auto ExtShiftedFlags =
6294 B.buildMergeValues(
S64, {Zero, ShiftedFlags.getReg(0)});
6295 auto CombinedFields =
B.buildOr(
S64, NumRecordsRHS, ExtShiftedStride);
6297 B.buildOr(
S64, CombinedFields, ExtShiftedFlags).getReg(0);
6298 B.buildMergeValues(Result, {LowHalf, HighHalf});
6300 NumRecords =
B.buildTrunc(
S32, NumRecords).getReg(0);
6301 auto Unmerge =
B.buildUnmerge(
S32, Pointer);
6302 auto LowHalf = Unmerge.getReg(0);
6303 auto HighHalf = Unmerge.getReg(1);
6305 auto AndMask =
B.buildConstant(
S32, 0x0000ffff);
6306 auto Masked =
B.buildAnd(
S32, HighHalf, AndMask);
6307 auto ShiftConst =
B.buildConstant(
S32, 16);
6308 auto ShiftedStride =
B.buildShl(
S32, ExtStride, ShiftConst);
6309 auto NewHighHalf =
B.buildOr(
S32,
Masked, ShiftedStride);
6310 Register NewHighHalfReg = NewHighHalf.getReg(0);
6311 B.buildMergeValues(Result, {LowHalf, NewHighHalfReg, NumRecords, Flags});
6314 MI.eraseFromParent();
6331 MI.eraseFromParent();
6339 std::optional<uint32_t> KnownSize =
6341 if (KnownSize.has_value())
6342 B.buildConstant(DstReg, *KnownSize);
6360 MI.eraseFromParent();
6367 unsigned AddrSpace)
const {
6369 auto Unmerge =
B.buildUnmerge(
S32,
MI.getOperand(2).getReg());
6373 ST.hasGloballyAddressableScratch()) {
6375 B.buildInstr(AMDGPU::S_MOV_B32, {
S32},
6376 {
Register(AMDGPU::SRC_FLAT_SCRATCH_BASE_HI)})
6378 MRI.
setRegClass(FlatScratchBaseHi, &AMDGPU::SReg_32RegClass);
6380 Register XOR =
B.buildXor(
S32, Hi32, FlatScratchBaseHi).getReg(0);
6382 B.buildConstant(
S32, 1u << 26));
6387 MI.eraseFromParent();
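// Buffer offset splitting below: a buffer address is decomposed into a
// 32-bit base register plus an immediate offset. Any part of the immediate
// that does not fit the encodable range ("Overflow") is folded back into
// the base register, either as a fresh constant or by adding it to the
// existing base.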
std::pair<Register, unsigned>

bool CheckNUW = ST.hasGFX1250Insts();
    MRI, OrigOffset, nullptr, CheckNUW);

BaseReg = B.buildPtrToInt(MRI.getType(OrigOffset), BaseReg).getReg(0);

unsigned Overflow = ImmOffset & ~MaxImm;
ImmOffset -= Overflow;
if ((int32_t)Overflow < 0) {
  Overflow += ImmOffset;

if (Overflow != 0) {
  BaseReg = B.buildConstant(S32, Overflow).getReg(0);

  auto OverflowVal = B.buildConstant(S32, Overflow);
  BaseReg = B.buildAdd(S32, BaseReg, OverflowVal).getReg(0);

BaseReg = B.buildConstant(S32, 0).getReg(0);

return std::pair(BaseReg, ImmOffset);
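// D16 data handling for stores below: on subtargets with unpacked D16
// memory instructions each 16-bit element is widened to its own dword;
// subtargets with the image-store D16 bug instead repack the value
// (bitcast or unmerge) and pad the register list with undefs to the width
// the instruction expects.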
                                    bool ImageStore) const {

if (ST.hasUnpackedD16VMem()) {
  auto Unmerge = B.buildUnmerge(S16, Reg);
  for (int I = 0, E = Unmerge->getNumOperands() - 1; I != E; ++I)
    WideRegs.push_back(B.buildAnyExt(S32, Unmerge.getReg(I)).getReg(0));

if (ImageStore && ST.hasImageStoreD16Bug()) {

    Reg = B.buildBitcast(S32, Reg).getReg(0);

    PackedRegs.resize(2, B.buildUndef(S32).getReg(0));

    auto Unmerge = B.buildUnmerge(S16, Reg);
    for (int I = 0, E = Unmerge->getNumOperands() - 1; I != E; ++I)

    PackedRegs.resize(6, B.buildUndef(S16).getReg(0));

    auto Unmerge = B.buildUnmerge(S32, Reg);
    for (int I = 0, E = Unmerge->getNumOperands() - 1; I != E; ++I)

    PackedRegs.resize(4, B.buildUndef(S32).getReg(0));
6513 bool IsFormat)
const {
6525 VData =
B.buildBitcast(Ty, VData).getReg(0);
6533 if (Ty.isVector()) {
6534 if (Ty.getElementType() ==
S16 && Ty.getNumElements() <= 4) {
6546 bool IsFormat)
const {
6553 const bool IsD16 = IsFormat && (EltTy.
getSizeInBits() == 16);
6568 const unsigned NumVIndexOps = IsTyped ? 8 : 7;
6571 const bool HasVIndex =
MI.getNumOperands() == NumVIndexOps;
6575 VIndex =
MI.getOperand(3).getReg();
6578 VIndex =
B.buildConstant(
S32, 0).getReg(0);
6581 Register VOffset =
MI.getOperand(3 + OpOffset).getReg();
6582 Register SOffset =
MI.getOperand(4 + OpOffset).getReg();
6586 Format =
MI.getOperand(5 + OpOffset).getImm();
6590 unsigned AuxiliaryData =
MI.getOperand(5 + OpOffset).getImm();
6596 Opc = IsD16 ? AMDGPU::G_AMDGPU_TBUFFER_STORE_FORMAT_D16 :
6597 AMDGPU::G_AMDGPU_TBUFFER_STORE_FORMAT;
6598 }
else if (IsFormat) {
6599 Opc = IsD16 ? AMDGPU::G_AMDGPU_BUFFER_STORE_FORMAT_D16 :
6600 AMDGPU::G_AMDGPU_BUFFER_STORE_FORMAT;
6604 Opc = AMDGPU::G_AMDGPU_BUFFER_STORE_BYTE;
6607 Opc = AMDGPU::G_AMDGPU_BUFFER_STORE_SHORT;
6610 Opc = AMDGPU::G_AMDGPU_BUFFER_STORE;
6615 auto MIB =
B.buildInstr(
Opc)
6626 MIB.addImm(AuxiliaryData)
6627 .addImm(HasVIndex ? -1 : 0)
6628 .addMemOperand(MMO);
6630 MI.eraseFromParent();
6636 unsigned ImmOffset,
unsigned Format,
6639 auto MIB =
B.buildInstr(
Opc)
6650 MIB.addImm(AuxiliaryData)
6651 .addImm(HasVIndex ? -1 : 0)
6652 .addMemOperand(MMO);
6658 bool IsTyped)
const {
6672 assert(
MI.getNumExplicitDefs() == 1 ||
MI.getNumExplicitDefs() == 2);
6673 bool IsTFE =
MI.getNumExplicitDefs() == 2;
6675 StatusDst =
MI.getOperand(1).getReg();
6680 Register RSrc =
MI.getOperand(2 + OpOffset).getReg();
6683 const unsigned NumVIndexOps = IsTyped ? 8 : 7;
6686 const bool HasVIndex =
MI.getNumOperands() == NumVIndexOps + OpOffset;
6689 VIndex =
MI.getOperand(3 + OpOffset).getReg();
6692 VIndex =
B.buildConstant(
S32, 0).getReg(0);
6695 Register VOffset =
MI.getOperand(3 + OpOffset).getReg();
6696 Register SOffset =
MI.getOperand(4 + OpOffset).getReg();
6700 Format =
MI.getOperand(5 + OpOffset).getImm();
6704 unsigned AuxiliaryData =
MI.getOperand(5 + OpOffset).getImm();
6714 Dst =
MI.getOperand(0).getReg();
6715 B.setInsertPt(
B.getMBB(),
MI);
6722 Dst =
MI.getOperand(0).getReg();
6723 B.setInsertPt(
B.getMBB(),
MI);
6727 const bool IsD16 = IsFormat && (EltTy.
getSizeInBits() == 16);
6728 const bool Unpacked = ST.hasUnpackedD16VMem();
6738 Opc = IsD16 ? AMDGPU::G_AMDGPU_TBUFFER_LOAD_FORMAT_D16 :
6739 AMDGPU::G_AMDGPU_TBUFFER_LOAD_FORMAT;
6740 }
else if (IsFormat) {
6744 Opc = AMDGPU::G_AMDGPU_BUFFER_LOAD_FORMAT_D16;
6746 Opc = IsTFE ? AMDGPU::G_AMDGPU_BUFFER_LOAD_FORMAT_TFE
6747 : AMDGPU::G_AMDGPU_BUFFER_LOAD_FORMAT;
6752 Opc = IsTFE ? AMDGPU::G_AMDGPU_BUFFER_LOAD_UBYTE_TFE
6753 : AMDGPU::G_AMDGPU_BUFFER_LOAD_UBYTE;
6756 Opc = IsTFE ? AMDGPU::G_AMDGPU_BUFFER_LOAD_USHORT_TFE
6757 : AMDGPU::G_AMDGPU_BUFFER_LOAD_USHORT;
6760 Opc = IsTFE ? AMDGPU::G_AMDGPU_BUFFER_LOAD_TFE
6761 : AMDGPU::G_AMDGPU_BUFFER_LOAD;
6767 unsigned NumValueDWords =
divideCeil(Ty.getSizeInBits(), 32);
6768 unsigned NumLoadDWords = NumValueDWords + 1;
6770 Register LoadDstReg =
B.getMRI()->createGenericVirtualRegister(LoadTy);
6772 Format, AuxiliaryData, MMO, IsTyped, HasVIndex,
B);
6774 Register ExtDst =
B.getMRI()->createGenericVirtualRegister(
S32);
6775 B.buildUnmerge({ExtDst, StatusDst}, LoadDstReg);
6776 B.buildTrunc(Dst, ExtDst);
6777 }
else if (NumValueDWords == 1) {
6778 B.buildUnmerge({Dst, StatusDst}, LoadDstReg);
6781 for (
unsigned I = 0;
I != NumValueDWords; ++
I)
6782 LoadElts.
push_back(
B.getMRI()->createGenericVirtualRegister(
S32));
6784 B.buildUnmerge(LoadElts, LoadDstReg);
6786 B.buildMergeLikeInstr(Dst, LoadElts);
6789 (IsD16 && !Ty.isVector())) {
6790 Register LoadDstReg =
B.getMRI()->createGenericVirtualRegister(
S32);
6792 Format, AuxiliaryData, MMO, IsTyped, HasVIndex,
B);
6793 B.setInsertPt(
B.getMBB(), ++
B.getInsertPt());
6794 B.buildTrunc(Dst, LoadDstReg);
6795 }
else if (Unpacked && IsD16 && Ty.isVector()) {
6797 Register LoadDstReg =
B.getMRI()->createGenericVirtualRegister(UnpackedTy);
6799 Format, AuxiliaryData, MMO, IsTyped, HasVIndex,
B);
6800 B.setInsertPt(
B.getMBB(), ++
B.getInsertPt());
6802 auto Unmerge =
B.buildUnmerge(
S32, LoadDstReg);
6804 for (
unsigned I = 0,
N = Unmerge->getNumOperands() - 1;
I !=
N; ++
I)
6805 Repack.
push_back(
B.buildTrunc(EltTy, Unmerge.getReg(
I)).getReg(0));
6806 B.buildMergeLikeInstr(Dst, Repack);
6809 AuxiliaryData, MMO, IsTyped, HasVIndex,
B);
6812 MI.eraseFromParent();
6818 case Intrinsic::amdgcn_raw_buffer_atomic_swap:
6819 case Intrinsic::amdgcn_raw_ptr_buffer_atomic_swap:
6820 case Intrinsic::amdgcn_struct_buffer_atomic_swap:
6821 case Intrinsic::amdgcn_struct_ptr_buffer_atomic_swap:
6822 return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_SWAP;
6823 case Intrinsic::amdgcn_raw_buffer_atomic_add:
6824 case Intrinsic::amdgcn_raw_ptr_buffer_atomic_add:
6825 case Intrinsic::amdgcn_struct_buffer_atomic_add:
6826 case Intrinsic::amdgcn_struct_ptr_buffer_atomic_add:
6827 return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_ADD;
6828 case Intrinsic::amdgcn_raw_buffer_atomic_sub:
6829 case Intrinsic::amdgcn_raw_ptr_buffer_atomic_sub:
6830 case Intrinsic::amdgcn_struct_buffer_atomic_sub:
6831 case Intrinsic::amdgcn_struct_ptr_buffer_atomic_sub:
6832 return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_SUB;
6833 case Intrinsic::amdgcn_raw_buffer_atomic_smin:
6834 case Intrinsic::amdgcn_raw_ptr_buffer_atomic_smin:
6835 case Intrinsic::amdgcn_struct_buffer_atomic_smin:
6836 case Intrinsic::amdgcn_struct_ptr_buffer_atomic_smin:
6837 return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_SMIN;
6838 case Intrinsic::amdgcn_raw_buffer_atomic_umin:
6839 case Intrinsic::amdgcn_raw_ptr_buffer_atomic_umin:
6840 case Intrinsic::amdgcn_struct_buffer_atomic_umin:
6841 case Intrinsic::amdgcn_struct_ptr_buffer_atomic_umin:
6842 return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_UMIN;
6843 case Intrinsic::amdgcn_raw_buffer_atomic_smax:
6844 case Intrinsic::amdgcn_raw_ptr_buffer_atomic_smax:
6845 case Intrinsic::amdgcn_struct_buffer_atomic_smax:
6846 case Intrinsic::amdgcn_struct_ptr_buffer_atomic_smax:
6847 return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_SMAX;
6848 case Intrinsic::amdgcn_raw_buffer_atomic_umax:
6849 case Intrinsic::amdgcn_raw_ptr_buffer_atomic_umax:
6850 case Intrinsic::amdgcn_struct_buffer_atomic_umax:
6851 case Intrinsic::amdgcn_struct_ptr_buffer_atomic_umax:
6852 return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_UMAX;
6853 case Intrinsic::amdgcn_raw_buffer_atomic_and:
6854 case Intrinsic::amdgcn_raw_ptr_buffer_atomic_and:
6855 case Intrinsic::amdgcn_struct_buffer_atomic_and:
6856 case Intrinsic::amdgcn_struct_ptr_buffer_atomic_and:
6857 return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_AND;
6858 case Intrinsic::amdgcn_raw_buffer_atomic_or:
6859 case Intrinsic::amdgcn_raw_ptr_buffer_atomic_or:
6860 case Intrinsic::amdgcn_struct_buffer_atomic_or:
6861 case Intrinsic::amdgcn_struct_ptr_buffer_atomic_or:
6862 return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_OR;
6863 case Intrinsic::amdgcn_raw_buffer_atomic_xor:
6864 case Intrinsic::amdgcn_raw_ptr_buffer_atomic_xor:
6865 case Intrinsic::amdgcn_struct_buffer_atomic_xor:
6866 case Intrinsic::amdgcn_struct_ptr_buffer_atomic_xor:
6867 return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_XOR;
6868 case Intrinsic::amdgcn_raw_buffer_atomic_inc:
6869 case Intrinsic::amdgcn_raw_ptr_buffer_atomic_inc:
6870 case Intrinsic::amdgcn_struct_buffer_atomic_inc:
6871 case Intrinsic::amdgcn_struct_ptr_buffer_atomic_inc:
6872 return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_INC;
6873 case Intrinsic::amdgcn_raw_buffer_atomic_dec:
6874 case Intrinsic::amdgcn_raw_ptr_buffer_atomic_dec:
6875 case Intrinsic::amdgcn_struct_buffer_atomic_dec:
6876 case Intrinsic::amdgcn_struct_ptr_buffer_atomic_dec:
6877 return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_DEC;
6878 case Intrinsic::amdgcn_raw_buffer_atomic_cmpswap:
6879 case Intrinsic::amdgcn_raw_ptr_buffer_atomic_cmpswap:
6880 case Intrinsic::amdgcn_struct_buffer_atomic_cmpswap:
6881 case Intrinsic::amdgcn_struct_ptr_buffer_atomic_cmpswap:
6882 return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_CMPSWAP;
6883 case Intrinsic::amdgcn_raw_buffer_atomic_fadd:
6884 case Intrinsic::amdgcn_raw_ptr_buffer_atomic_fadd:
6885 case Intrinsic::amdgcn_struct_buffer_atomic_fadd:
6886 case Intrinsic::amdgcn_struct_ptr_buffer_atomic_fadd:
6887 return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_FADD;
6888 case Intrinsic::amdgcn_raw_buffer_atomic_fmin:
6889 case Intrinsic::amdgcn_raw_ptr_buffer_atomic_fmin:
6890 case Intrinsic::amdgcn_struct_buffer_atomic_fmin:
6891 case Intrinsic::amdgcn_struct_ptr_buffer_atomic_fmin:
6892 return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_FMIN;
6893 case Intrinsic::amdgcn_raw_buffer_atomic_fmax:
6894 case Intrinsic::amdgcn_raw_ptr_buffer_atomic_fmax:
6895 case Intrinsic::amdgcn_struct_buffer_atomic_fmax:
6896 case Intrinsic::amdgcn_struct_ptr_buffer_atomic_fmax:
6897 return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_FMAX;
6898 case Intrinsic::amdgcn_raw_buffer_atomic_sub_clamp_u32:
6899 case Intrinsic::amdgcn_raw_ptr_buffer_atomic_sub_clamp_u32:
6900 case Intrinsic::amdgcn_struct_buffer_atomic_sub_clamp_u32:
6901 case Intrinsic::amdgcn_struct_ptr_buffer_atomic_sub_clamp_u32:
6902 return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_SUB_CLAMP_U32;
6903 case Intrinsic::amdgcn_raw_buffer_atomic_cond_sub_u32:
6904 case Intrinsic::amdgcn_raw_ptr_buffer_atomic_cond_sub_u32:
6905 case Intrinsic::amdgcn_struct_buffer_atomic_cond_sub_u32:
6906 case Intrinsic::amdgcn_struct_ptr_buffer_atomic_cond_sub_u32:
6907 return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_COND_SUB_U32;
6916 const bool IsCmpSwap =
6917 IID == Intrinsic::amdgcn_raw_buffer_atomic_cmpswap ||
6918 IID == Intrinsic::amdgcn_struct_buffer_atomic_cmpswap ||
6919 IID == Intrinsic::amdgcn_raw_ptr_buffer_atomic_cmpswap ||
6920 IID == Intrinsic::amdgcn_struct_ptr_buffer_atomic_cmpswap;
6931 CmpVal =
MI.getOperand(3).getReg();
6936 Register RSrc =
MI.getOperand(3 + OpOffset).getReg();
6937 const unsigned NumVIndexOps = IsCmpSwap ? 9 : 8;
6940 const bool HasVIndex =
MI.getNumOperands() == NumVIndexOps;
6943 VIndex =
MI.getOperand(4 + OpOffset).getReg();
6946 VIndex =
B.buildConstant(
LLT::scalar(32), 0).getReg(0);
6949 Register VOffset =
MI.getOperand(4 + OpOffset).getReg();
6950 Register SOffset =
MI.getOperand(5 + OpOffset).getReg();
6951 unsigned AuxiliaryData =
MI.getOperand(6 + OpOffset).getImm();
6970 .addImm(AuxiliaryData)
6971 .addImm(HasVIndex ? -1 : 0)
6972 .addMemOperand(MMO);
6974 MI.eraseFromParent();
6984 bool IsA16,
bool IsG16) {
7000 (
B.getMRI()->getType(AddrReg) ==
S16)) {
7005 B.buildBuildVector(
V2S16, {AddrReg, B.buildUndef(S16).getReg(0)})
7009 "Bias needs to be converted to 16 bit in A16 mode");
7011 AddrReg =
B.buildBitcast(
V2S16, AddrReg).getReg(0);
7017 if (((
I + 1) >= EndIdx) ||
7024 !
MI.getOperand(ArgOffset +
I + 1).isReg()) {
7026 B.buildBuildVector(
V2S16, {AddrReg, B.buildUndef(S16).getReg(0)})
7031 V2S16, {AddrReg, MI.getOperand(ArgOffset + I + 1).getReg()})
7042 int DimIdx,
int NumVAddrs) {
7046 for (
int I = 0;
I != NumVAddrs; ++
I) {
7048 if (
SrcOp.isReg()) {
7054 int NumAddrRegs = AddrRegs.
size();
7055 if (NumAddrRegs != 1) {
7058 MI.getOperand(DimIdx).setReg(VAddr.getReg(0));
7061 for (
int I = 1;
I != NumVAddrs; ++
I) {
7064 MI.getOperand(DimIdx +
I).setReg(AMDGPU::NoRegister);
7086 const unsigned NumDefs =
MI.getNumExplicitDefs();
7087 const unsigned ArgOffset = NumDefs + 1;
7088 bool IsTFE = NumDefs == 2;
7106 VData =
MI.getOperand(NumDefs == 0 ? 1 : 0).getReg();
7110 const bool IsAtomicPacked16Bit =
7111 (BaseOpcode->
BaseOpcode == AMDGPU::IMAGE_ATOMIC_PK_ADD_F16 ||
7112 BaseOpcode->
BaseOpcode == AMDGPU::IMAGE_ATOMIC_PK_ADD_BF16);
7120 ST.hasG16() ? (BaseOpcode->
Gradients && GradTy ==
S16) : GradTy ==
S16;
7121 const bool IsA16 = AddrTy ==
S16;
7122 const bool IsD16 = !IsAtomicPacked16Bit && Ty.getScalarType() ==
S16;
7125 if (!BaseOpcode->
Atomic) {
7126 DMask =
MI.getOperand(ArgOffset + Intr->
DMaskIndex).getImm();
7129 }
else if (DMask != 0) {
7131 }
else if (!IsTFE && !BaseOpcode->
Store) {
7133 B.buildUndef(
MI.getOperand(0));
7134 MI.eraseFromParent();
7142 const unsigned StoreOpcode = IsD16 ? AMDGPU::G_AMDGPU_INTRIN_IMAGE_STORE_D16
7143 : AMDGPU::G_AMDGPU_INTRIN_IMAGE_STORE;
7144 const unsigned LoadOpcode = IsD16 ? AMDGPU::G_AMDGPU_INTRIN_IMAGE_LOAD_D16
7145 : AMDGPU::G_AMDGPU_INTRIN_IMAGE_LOAD;
7146 unsigned NewOpcode = LoadOpcode;
7147 if (BaseOpcode->
Store)
7148 NewOpcode = StoreOpcode;
7150 NewOpcode = AMDGPU::G_AMDGPU_INTRIN_IMAGE_LOAD_NORET;
7153 MI.setDesc(
B.getTII().get(NewOpcode));
7157 if (IsTFE && DMask == 0) {
7160 MI.getOperand(ArgOffset + Intr->
DMaskIndex).setImm(DMask);
7163 if (BaseOpcode->
Atomic) {
7168 if (Ty.isVector() && !IsAtomicPacked16Bit)
7175 auto Concat =
B.buildBuildVector(PackedTy, {VData0, VData1});
7176 MI.getOperand(2).setReg(
Concat.getReg(0));
7177 MI.getOperand(3).setReg(AMDGPU::NoRegister);
7181 unsigned CorrectedNumVAddrs = Intr->
NumVAddrs;
7184 if (BaseOpcode->
Gradients && !ST.hasG16() && (IsA16 != IsG16)) {
7190 if (IsA16 && !ST.hasA16()) {
7195 const unsigned NSAMaxSize = ST.getNSAMaxSize(BaseOpcode->
Sampler);
7196 const unsigned HasPartialNSA = ST.hasPartialNSAEncoding();
7198 if (IsA16 || IsG16) {
7206 const bool UseNSA = ST.hasNSAEncoding() &&
7207 PackedRegs.
size() >= ST.getNSAThreshold(MF) &&
7208 (PackedRegs.
size() <= NSAMaxSize || HasPartialNSA);
7209 const bool UsePartialNSA =
7210 UseNSA && HasPartialNSA && PackedRegs.
size() > NSAMaxSize;
7212 if (UsePartialNSA) {
7216 auto Concat =
B.buildConcatVectors(
7217 PackedAddrTy,
ArrayRef(PackedRegs).slice(NSAMaxSize - 1));
7218 PackedRegs[NSAMaxSize - 1] =
Concat.getReg(0);
7219 PackedRegs.
resize(NSAMaxSize);
7220 }
else if (!UseNSA && PackedRegs.
size() > 1) {
7222 auto Concat =
B.buildConcatVectors(PackedAddrTy, PackedRegs);
7223 PackedRegs[0] =
Concat.getReg(0);
7227 const unsigned NumPacked = PackedRegs.
size();
7230 if (!
SrcOp.isReg()) {
7240 SrcOp.setReg(AMDGPU::NoRegister);
7257 const bool UseNSA = ST.hasNSAEncoding() &&
7258 CorrectedNumVAddrs >= ST.getNSAThreshold(MF) &&
7259 (CorrectedNumVAddrs <= NSAMaxSize || HasPartialNSA);
7260 const bool UsePartialNSA =
7261 UseNSA && HasPartialNSA && CorrectedNumVAddrs > NSAMaxSize;
7263 if (UsePartialNSA) {
7265 ArgOffset + Intr->
VAddrStart + NSAMaxSize - 1,
7267 }
else if (!UseNSA && Intr->
NumVAddrs > 1) {
7282 if (!Ty.isVector() || !IsD16)
7286 if (RepackedReg != VData) {
7287 MI.getOperand(1).setReg(RepackedReg);
7295 const int NumElts = Ty.
isVector() ? Ty.getNumElements() : 1;
7298 if (NumElts < DMaskLanes)
7301 if (NumElts > 4 || DMaskLanes > 4)
7311 const unsigned AdjustedNumElts = DMaskLanes == 0 ? 1 : DMaskLanes;
7312 const LLT AdjustedTy =
7328 if (IsD16 && ST.hasUnpackedD16VMem()) {
7335 unsigned RoundedElts = (AdjustedTy.
getSizeInBits() + 31) / 32;
7336 unsigned RoundedSize = 32 * RoundedElts;
7340 RegTy = !IsTFE && EltSize == 16 ?
V2S16 :
S32;
7345 if (!IsTFE && (RoundedTy == Ty || !Ty.
isVector()))
7351 B.setInsertPt(*
MI.getParent(), ++
MI.getIterator());
7355 const LLT LoadResultTy = IsTFE ? TFETy : RoundedTy;
7356 const int ResultNumRegs = LoadResultTy.
getSizeInBits() / 32;
7360 MI.getOperand(0).setReg(NewResultReg);
7368 Dst1Reg =
MI.getOperand(1).getReg();
7373 MI.removeOperand(1);
7377 B.buildUnmerge({DstReg, Dst1Reg}, NewResultReg);
7386 const int NumDataRegs = IsTFE ? ResultNumRegs - 1 : ResultNumRegs;
7388 if (ResultNumRegs == 1) {
7390 ResultRegs[0] = NewResultReg;
7393 for (
int I = 0;
I != NumDataRegs; ++
I)
7395 B.buildUnmerge(ResultRegs, NewResultReg);
7400 ResultRegs.
resize(NumDataRegs);
7405 if (IsD16 && !Ty.isVector()) {
7406 B.buildTrunc(DstReg, ResultRegs[0]);
7411 if (Ty ==
V2S16 && NumDataRegs == 1 && !ST.hasUnpackedD16VMem()) {
7412 B.buildBitcast(DstReg, ResultRegs[0]);
7424 if (RegTy !=
V2S16 && !ST.hasUnpackedD16VMem()) {
7426 Reg =
B.buildBitcast(
V2S16, Reg).getReg(0);
7427 }
else if (ST.hasUnpackedD16VMem()) {
7429 Reg =
B.buildTrunc(
S16, Reg).getReg(0);
7433 auto padWithUndef = [&](
LLT Ty,
int NumElts) {
7437 for (
int I = 0;
I != NumElts; ++
I)
7444 padWithUndef(ResTy, NumElts - ResultRegs.
size());
7445 B.buildBuildVector(DstReg, ResultRegs);
7449 assert(!ST.hasUnpackedD16VMem() && ResTy ==
V2S16);
7450 const int RegsToCover = (Ty.getSizeInBits() + 31) / 32;
7456 if (ResultRegs.
size() == 1) {
7457 NewResultReg = ResultRegs[0];
7458 }
else if (ResultRegs.
size() == 2) {
7460 NewResultReg =
B.buildConcatVectors(
V4S16, ResultRegs).getReg(0);
7468 B.buildDeleteTrailingVectorElements(DstReg, NewResultReg);
7470 B.buildPadVectorWithUndefElements(DstReg, NewResultReg);
7475 padWithUndef(ResTy, RegsToCover - ResultRegs.
size());
7476 B.buildConcatVectors(DstReg, ResultRegs);
7485 Register OrigDst =
MI.getOperand(0).getReg();
7487 LLT Ty =
B.getMRI()->getType(OrigDst);
7488 unsigned Size = Ty.getSizeInBits();
7491 if (
Size < 32 && ST.hasScalarSubwordLoads()) {
7493 Opc =
Size == 8 ? AMDGPU::G_AMDGPU_S_BUFFER_LOAD_UBYTE
7494 : AMDGPU::G_AMDGPU_S_BUFFER_LOAD_USHORT;
7497 Dst =
B.getMRI()->createGenericVirtualRegister(
LLT::scalar(32));
7499 Opc = AMDGPU::G_AMDGPU_S_BUFFER_LOAD;
7508 B.setInsertPt(
B.getMBB(),
MI);
7513 B.setInsertPt(
B.getMBB(),
MI);
7519 MI.setDesc(
B.getTII().get(
Opc));
7520 MI.removeOperand(1);
7523 const unsigned MemSize = (
Size + 7) / 8;
7524 const Align MemAlign =
B.getDataLayout().getABITypeAlign(
7531 MI.addMemOperand(MF, MMO);
7532 if (Dst != OrigDst) {
7533 MI.getOperand(0).setReg(Dst);
7534 B.setInsertPt(
B.getMBB(), ++
B.getInsertPt());
7535 B.buildTrunc(OrigDst, Dst);
7557 MI.setDesc(
B.getTII().get(AMDGPU::G_AMDGPU_S_BUFFER_PREFETCH));
7558 MI.removeOperand(0);
7568 if (!ST.hasTrapHandler() ||
7572 return ST.supportsGetDoorbellID() ?
7585 MI.eraseFromParent();
7595 BuildMI(*TrapBB, TrapBB->
end(),
DL,
B.getTII().get(AMDGPU::S_ENDPGM))
7597 BuildMI(BB, &
MI,
DL,
B.getTII().get(AMDGPU::S_CBRANCH_EXECNZ))
7601 MI.eraseFromParent();
7610 Register SGPR01(AMDGPU::SGPR0_SGPR1);
7617 ST.getTargetLowering()->getImplicitParameterOffset(
B.getMF(), Param);
7637 B.buildObjectPtrOffset(LoadAddr, KernargPtrReg,
7640 Register Temp =
B.buildLoad(
S64, LoadAddr, *MMO).getReg(0);
7641 B.buildCopy(SGPR01, Temp);
7642 B.buildInstr(AMDGPU::S_TRAP)
7645 MI.eraseFromParent();
7656 B.buildCopy(SGPR01, LiveIn);
7657 B.buildInstr(AMDGPU::S_TRAP)
7661 MI.eraseFromParent();
7670 if (ST.hasPrivEnabledTrap2NopBug()) {
7671 ST.getInstrInfo()->insertSimulatedTrap(MRI,
B.getMBB(),
MI,
7673 MI.eraseFromParent();
7677 B.buildInstr(AMDGPU::S_TRAP)
7679 MI.eraseFromParent();
7688 if (!ST.hasTrapHandler() ||
7692 Fn,
"debugtrap handler not supported",
MI.getDebugLoc(),
DS_Warning));
7695 B.buildInstr(AMDGPU::S_TRAP)
7699 MI.eraseFromParent();
7712 Register NodePtr =
MI.getOperand(2).getReg();
7713 Register RayExtent =
MI.getOperand(3).getReg();
7714 Register RayOrigin =
MI.getOperand(4).getReg();
7716 Register RayInvDir =
MI.getOperand(6).getReg();
7719 if (!ST.hasGFX10_AEncoding()) {
7722 Fn,
"intrinsic not supported on subtarget",
MI.getDebugLoc()));
7731 const unsigned NumVDataDwords = 4;
7732 const unsigned NumVAddrDwords = IsA16 ? (Is64 ? 9 : 8) : (Is64 ? 12 : 11);
7733 const unsigned NumVAddrs = IsGFX11Plus ? (IsA16 ? 4 : 5) : NumVAddrDwords;
7735 IsGFX12Plus || (ST.hasNSAEncoding() && NumVAddrs <= ST.getNSAMaxSize());
7737 const unsigned BaseOpcodes[2][2] = {
7738 {AMDGPU::IMAGE_BVH_INTERSECT_RAY, AMDGPU::IMAGE_BVH_INTERSECT_RAY_a16},
7739 {AMDGPU::IMAGE_BVH64_INTERSECT_RAY,
7740 AMDGPU::IMAGE_BVH64_INTERSECT_RAY_a16}};
7744 IsGFX12Plus ? AMDGPU::MIMGEncGfx12
7745 : IsGFX11 ? AMDGPU::MIMGEncGfx11NSA
7746 : AMDGPU::MIMGEncGfx10NSA,
7747 NumVDataDwords, NumVAddrDwords);
7751 IsGFX11 ? AMDGPU::MIMGEncGfx11Default
7752 : AMDGPU::MIMGEncGfx10Default,
7753 NumVDataDwords, NumVAddrDwords);
7758 if (UseNSA && IsGFX11Plus) {
7760 auto Unmerge =
B.buildUnmerge({
S32,
S32,
S32}, Src);
7761 auto Merged =
B.buildMergeLikeInstr(
7762 V3S32, {Unmerge.getReg(0), Unmerge.getReg(1), Unmerge.getReg(2)});
7763 Ops.push_back(Merged.getReg(0));
7766 Ops.push_back(NodePtr);
7767 Ops.push_back(RayExtent);
7768 packLanes(RayOrigin);
7771 auto UnmergeRayDir =
B.buildUnmerge({
S16,
S16,
S16}, RayDir);
7772 auto UnmergeRayInvDir =
B.buildUnmerge({
S16,
S16,
S16}, RayInvDir);
7773 auto MergedDir =
B.buildMergeLikeInstr(
7776 S32,
B.buildMergeLikeInstr(
V2S16, {UnmergeRayInvDir.getReg(0),
7777 UnmergeRayDir.getReg(0)}))
7780 S32,
B.buildMergeLikeInstr(
V2S16, {UnmergeRayInvDir.getReg(1),
7781 UnmergeRayDir.getReg(1)}))
7784 S32,
B.buildMergeLikeInstr(
V2S16, {UnmergeRayInvDir.getReg(2),
7785 UnmergeRayDir.getReg(2)}))
7787 Ops.push_back(MergedDir.getReg(0));
7790 packLanes(RayInvDir);
7794 auto Unmerge =
B.buildUnmerge({
S32,
S32}, NodePtr);
7795 Ops.push_back(Unmerge.getReg(0));
7796 Ops.push_back(Unmerge.getReg(1));
7798 Ops.push_back(NodePtr);
7800 Ops.push_back(RayExtent);
7803 auto Unmerge =
B.buildUnmerge({
S32,
S32,
S32}, Src);
7804 Ops.push_back(Unmerge.getReg(0));
7805 Ops.push_back(Unmerge.getReg(1));
7806 Ops.push_back(Unmerge.getReg(2));
7809 packLanes(RayOrigin);
7811 auto UnmergeRayDir =
B.buildUnmerge({
S16,
S16,
S16}, RayDir);
7812 auto UnmergeRayInvDir =
B.buildUnmerge({
S16,
S16,
S16}, RayInvDir);
7816 B.buildMergeLikeInstr(R1,
7817 {UnmergeRayDir.getReg(0), UnmergeRayDir.getReg(1)});
7818 B.buildMergeLikeInstr(
7819 R2, {UnmergeRayDir.getReg(2), UnmergeRayInvDir.getReg(0)});
7820 B.buildMergeLikeInstr(
7821 R3, {UnmergeRayInvDir.getReg(1), UnmergeRayInvDir.getReg(2)});
7827 packLanes(RayInvDir);
7834 Register MergedOps =
B.buildMergeLikeInstr(OpTy,
Ops).getReg(0);
7836 Ops.push_back(MergedOps);
7839 auto MIB =
B.buildInstr(AMDGPU::G_AMDGPU_BVH_INTERSECT_RAY)
7848 .addImm(IsA16 ? 1 : 0)
7851 MI.eraseFromParent();
7861 Register DstOrigin =
MI.getOperand(1).getReg();
7863 Register NodePtr =
MI.getOperand(4).getReg();
7864 Register RayExtent =
MI.getOperand(5).getReg();
7865 Register InstanceMask =
MI.getOperand(6).getReg();
7866 Register RayOrigin =
MI.getOperand(7).getReg();
7868 Register Offsets =
MI.getOperand(9).getReg();
7869 Register TDescr =
MI.getOperand(10).getReg();
7871 if (!ST.hasBVHDualAndBVH8Insts()) {
7874 Fn,
"intrinsic not supported on subtarget",
MI.getDebugLoc()));
7879 Intrinsic::amdgcn_image_bvh8_intersect_ray;
7880 const unsigned NumVDataDwords = 10;
7881 const unsigned NumVAddrDwords = IsBVH8 ? 11 : 12;
7883 IsBVH8 ? AMDGPU::IMAGE_BVH8_INTERSECT_RAY
7884 : AMDGPU::IMAGE_BVH_DUAL_INTERSECT_RAY,
7885 AMDGPU::MIMGEncGfx12, NumVDataDwords, NumVAddrDwords);
7888 auto RayExtentInstanceMaskVec =
B.buildMergeLikeInstr(
7889 V2S32, {RayExtent,
B.buildAnyExt(
S32, InstanceMask)});
7891 B.buildInstr(IsBVH8 ? AMDGPU::G_AMDGPU_BVH8_INTERSECT_RAY
7892 : AMDGPU::G_AMDGPU_BVH_DUAL_INTERSECT_RAY)
7898 .addUse(RayExtentInstanceMaskVec.getReg(0))
7905 MI.eraseFromParent();
7914 B.buildInstr(AMDGPU::G_AMDGPU_WAVE_ADDRESS, {DstReg}, {StackPtr});
7915 MI.eraseFromParent();
7922 if (!ST.hasArchitectedSGPRs())
7926 auto TTMP8 =
B.buildCopy(
S32,
Register(AMDGPU::TTMP8));
7927 auto LSB =
B.buildConstant(
S32, 25);
7928 auto Width =
B.buildConstant(
S32, 5);
7929 B.buildUbfx(DstReg, TTMP8, LSB, Width);
7930 MI.eraseFromParent();
7938 unsigned Width)
const {
7942 MRI.
setRegClass(DstReg, &AMDGPU::SReg_32RegClass);
7943 B.buildInstr(AMDGPU::S_GETREG_B32_const)
7946 MI.eraseFromParent();
7964 B.buildIntrinsic(Intrinsic::amdgcn_s_getreg, {
S32},
7968 B.buildIntrinsic(Intrinsic::amdgcn_s_getreg, {
S32},
7971 B.buildMergeLikeInstr(Src, {ModeReg, TrapReg});
7972 MI.eraseFromParent();
7983 auto Unmerge =
B.buildUnmerge({
S32,
S32},
MI.getOperand(0));
7987 .addReg(Unmerge.getReg(0));
7991 .addReg(Unmerge.getReg(1));
7992 MI.eraseFromParent();
8004 case Intrinsic::sponentry:
8010 B.buildInstr(AMDGPU::G_AMDGPU_SPONENTRY).addDef(TmpReg);
8013 B.buildIntToPtr(DstReg, TmpReg);
8014 MI.eraseFromParent();
8016 int FI =
B.getMF().getFrameInfo().CreateFixedObject(
8018 B.buildFrameIndex(
MI.getOperand(0), FI);
8019 MI.eraseFromParent();
8022 case Intrinsic::amdgcn_if:
8023 case Intrinsic::amdgcn_else: {
8026 bool Negated =
false;
8038 std::swap(CondBrTarget, UncondBrTarget);
8040 B.setInsertPt(
B.getMBB(), BrCond->getIterator());
8041 if (IntrID == Intrinsic::amdgcn_if) {
8042 B.buildInstr(AMDGPU::SI_IF)
8045 .addMBB(UncondBrTarget);
8047 B.buildInstr(AMDGPU::SI_ELSE)
8050 .addMBB(UncondBrTarget);
8059 B.buildBr(*CondBrTarget);
8064 MI.eraseFromParent();
8065 BrCond->eraseFromParent();
8071 case Intrinsic::amdgcn_loop: {
8074 bool Negated =
false;
8084 std::swap(CondBrTarget, UncondBrTarget);
8086 B.setInsertPt(
B.getMBB(), BrCond->getIterator());
8087 B.buildInstr(AMDGPU::SI_LOOP)
8089 .addMBB(UncondBrTarget);
8094 B.buildBr(*CondBrTarget);
8096 MI.eraseFromParent();
8097 BrCond->eraseFromParent();
8104 case Intrinsic::amdgcn_addrspacecast_nonnull:
8106 case Intrinsic::amdgcn_make_buffer_rsrc:
8108 case Intrinsic::amdgcn_kernarg_segment_ptr:
8111 B.buildConstant(
MI.getOperand(0).getReg(), 0);
8112 MI.eraseFromParent();
8118 case Intrinsic::amdgcn_implicitarg_ptr:
8120 case Intrinsic::amdgcn_workitem_id_x:
8123 case Intrinsic::amdgcn_workitem_id_y:
8126 case Intrinsic::amdgcn_workitem_id_z:
8129 case Intrinsic::amdgcn_workgroup_id_x:
8134 case Intrinsic::amdgcn_workgroup_id_y:
8139 case Intrinsic::amdgcn_workgroup_id_z:
8144 case Intrinsic::amdgcn_cluster_id_x:
8145 return ST.hasClusters() &&
8148 case Intrinsic::amdgcn_cluster_id_y:
8149 return ST.hasClusters() &&
8152 case Intrinsic::amdgcn_cluster_id_z:
8153 return ST.hasClusters() &&
8156 case Intrinsic::amdgcn_cluster_workgroup_id_x:
8157 return ST.hasClusters() &&
8160 case Intrinsic::amdgcn_cluster_workgroup_id_y:
8161 return ST.hasClusters() &&
8164 case Intrinsic::amdgcn_cluster_workgroup_id_z:
8165 return ST.hasClusters() &&
8168 case Intrinsic::amdgcn_cluster_workgroup_flat_id:
8169 return ST.hasClusters() &&
8171 case Intrinsic::amdgcn_cluster_workgroup_max_id_x:
8172 return ST.hasClusters() &&
8175 case Intrinsic::amdgcn_cluster_workgroup_max_id_y:
8176 return ST.hasClusters() &&
8179 case Intrinsic::amdgcn_cluster_workgroup_max_id_z:
8180 return ST.hasClusters() &&
8183 case Intrinsic::amdgcn_cluster_workgroup_max_flat_id:
8184 return ST.hasClusters() &&
8188 case Intrinsic::amdgcn_wave_id:
8190 case Intrinsic::amdgcn_lds_kernel_id:
8193 case Intrinsic::amdgcn_dispatch_ptr:
8196 case Intrinsic::amdgcn_queue_ptr:
8199 case Intrinsic::amdgcn_implicit_buffer_ptr:
8202 case Intrinsic::amdgcn_dispatch_id:
8205 case Intrinsic::r600_read_ngroups_x:
8209 case Intrinsic::r600_read_ngroups_y:
8212 case Intrinsic::r600_read_ngroups_z:
8215 case Intrinsic::r600_read_local_size_x:
8218 case Intrinsic::r600_read_local_size_y:
8222 case Intrinsic::r600_read_local_size_z:
8225 case Intrinsic::amdgcn_fdiv_fast:
8227 case Intrinsic::amdgcn_is_shared:
8229 case Intrinsic::amdgcn_is_private:
8231 case Intrinsic::amdgcn_wavefrontsize: {
8232 B.buildConstant(
MI.getOperand(0), ST.getWavefrontSize());
8233 MI.eraseFromParent();
8236 case Intrinsic::amdgcn_s_buffer_load:
8238 case Intrinsic::amdgcn_raw_buffer_store:
8239 case Intrinsic::amdgcn_raw_ptr_buffer_store:
8240 case Intrinsic::amdgcn_struct_buffer_store:
8241 case Intrinsic::amdgcn_struct_ptr_buffer_store:
8243 case Intrinsic::amdgcn_raw_buffer_store_format:
8244 case Intrinsic::amdgcn_raw_ptr_buffer_store_format:
8245 case Intrinsic::amdgcn_struct_buffer_store_format:
8246 case Intrinsic::amdgcn_struct_ptr_buffer_store_format:
8248 case Intrinsic::amdgcn_raw_tbuffer_store:
8249 case Intrinsic::amdgcn_raw_ptr_tbuffer_store:
8250 case Intrinsic::amdgcn_struct_tbuffer_store:
8251 case Intrinsic::amdgcn_struct_ptr_tbuffer_store:
8253 case Intrinsic::amdgcn_raw_buffer_load:
8254 case Intrinsic::amdgcn_raw_ptr_buffer_load:
8255 case Intrinsic::amdgcn_raw_atomic_buffer_load:
8256 case Intrinsic::amdgcn_raw_ptr_atomic_buffer_load:
8257 case Intrinsic::amdgcn_struct_buffer_load:
8258 case Intrinsic::amdgcn_struct_ptr_buffer_load:
8259 case Intrinsic::amdgcn_struct_atomic_buffer_load:
8260 case Intrinsic::amdgcn_struct_ptr_atomic_buffer_load:
8262 case Intrinsic::amdgcn_raw_buffer_load_format:
8263 case Intrinsic::amdgcn_raw_ptr_buffer_load_format:
8264 case Intrinsic::amdgcn_struct_buffer_load_format:
8265 case Intrinsic::amdgcn_struct_ptr_buffer_load_format:
8267 case Intrinsic::amdgcn_raw_tbuffer_load:
8268 case Intrinsic::amdgcn_raw_ptr_tbuffer_load:
8269 case Intrinsic::amdgcn_struct_tbuffer_load:
8270 case Intrinsic::amdgcn_struct_ptr_tbuffer_load:
8272 case Intrinsic::amdgcn_raw_buffer_atomic_swap:
8273 case Intrinsic::amdgcn_raw_ptr_buffer_atomic_swap:
8274 case Intrinsic::amdgcn_struct_buffer_atomic_swap:
8275 case Intrinsic::amdgcn_struct_ptr_buffer_atomic_swap:
8276 case Intrinsic::amdgcn_raw_buffer_atomic_add:
8277 case Intrinsic::amdgcn_raw_ptr_buffer_atomic_add:
8278 case Intrinsic::amdgcn_struct_buffer_atomic_add:
8279 case Intrinsic::amdgcn_struct_ptr_buffer_atomic_add:
8280 case Intrinsic::amdgcn_raw_buffer_atomic_sub:
8281 case Intrinsic::amdgcn_raw_ptr_buffer_atomic_sub:
8282 case Intrinsic::amdgcn_struct_buffer_atomic_sub:
8283 case Intrinsic::amdgcn_struct_ptr_buffer_atomic_sub:
8284 case Intrinsic::amdgcn_raw_buffer_atomic_smin:
8285 case Intrinsic::amdgcn_raw_ptr_buffer_atomic_smin:
8286 case Intrinsic::amdgcn_struct_buffer_atomic_smin:
8287 case Intrinsic::amdgcn_struct_ptr_buffer_atomic_smin:
8288 case Intrinsic::amdgcn_raw_buffer_atomic_umin:
8289 case Intrinsic::amdgcn_raw_ptr_buffer_atomic_umin:
8290 case Intrinsic::amdgcn_struct_buffer_atomic_umin:
8291 case Intrinsic::amdgcn_struct_ptr_buffer_atomic_umin:
8292 case Intrinsic::amdgcn_raw_buffer_atomic_smax:
8293 case Intrinsic::amdgcn_raw_ptr_buffer_atomic_smax:
8294 case Intrinsic::amdgcn_struct_buffer_atomic_smax:
8295 case Intrinsic::amdgcn_struct_ptr_buffer_atomic_smax:
8296 case Intrinsic::amdgcn_raw_buffer_atomic_umax:
8297 case Intrinsic::amdgcn_raw_ptr_buffer_atomic_umax:
8298 case Intrinsic::amdgcn_struct_buffer_atomic_umax:
8299 case Intrinsic::amdgcn_struct_ptr_buffer_atomic_umax:
8300 case Intrinsic::amdgcn_raw_buffer_atomic_and:
8301 case Intrinsic::amdgcn_raw_ptr_buffer_atomic_and:
8302 case Intrinsic::amdgcn_struct_buffer_atomic_and:
8303 case Intrinsic::amdgcn_struct_ptr_buffer_atomic_and:
8304 case Intrinsic::amdgcn_raw_buffer_atomic_or:
8305 case Intrinsic::amdgcn_raw_ptr_buffer_atomic_or:
8306 case Intrinsic::amdgcn_struct_buffer_atomic_or:
8307 case Intrinsic::amdgcn_struct_ptr_buffer_atomic_or:
8308 case Intrinsic::amdgcn_raw_buffer_atomic_xor:
8309 case Intrinsic::amdgcn_raw_ptr_buffer_atomic_xor:
8310 case Intrinsic::amdgcn_struct_buffer_atomic_xor:
8311 case Intrinsic::amdgcn_struct_ptr_buffer_atomic_xor:
8312 case Intrinsic::amdgcn_raw_buffer_atomic_inc:
8313 case Intrinsic::amdgcn_raw_ptr_buffer_atomic_inc:
8314 case Intrinsic::amdgcn_struct_buffer_atomic_inc:
8315 case Intrinsic::amdgcn_struct_ptr_buffer_atomic_inc:
8316 case Intrinsic::amdgcn_raw_buffer_atomic_dec:
8317 case Intrinsic::amdgcn_raw_ptr_buffer_atomic_dec:
8318 case Intrinsic::amdgcn_struct_buffer_atomic_dec:
8319 case Intrinsic::amdgcn_struct_ptr_buffer_atomic_dec:
8320 case Intrinsic::amdgcn_raw_buffer_atomic_cmpswap:
8321 case Intrinsic::amdgcn_raw_ptr_buffer_atomic_cmpswap:
8322 case Intrinsic::amdgcn_struct_buffer_atomic_cmpswap:
8323 case Intrinsic::amdgcn_struct_ptr_buffer_atomic_cmpswap:
8324 case Intrinsic::amdgcn_raw_buffer_atomic_fmin:
8325 case Intrinsic::amdgcn_raw_ptr_buffer_atomic_fmin:
8326 case Intrinsic::amdgcn_struct_buffer_atomic_fmin:
8327 case Intrinsic::amdgcn_struct_ptr_buffer_atomic_fmin:
8328 case Intrinsic::amdgcn_raw_buffer_atomic_fmax:
8329 case Intrinsic::amdgcn_raw_ptr_buffer_atomic_fmax:
8330 case Intrinsic::amdgcn_struct_buffer_atomic_fmax:
8331 case Intrinsic::amdgcn_struct_ptr_buffer_atomic_fmax:
8332 case Intrinsic::amdgcn_raw_buffer_atomic_sub_clamp_u32:
8333 case Intrinsic::amdgcn_raw_ptr_buffer_atomic_sub_clamp_u32:
8334 case Intrinsic::amdgcn_struct_buffer_atomic_sub_clamp_u32:
8335 case Intrinsic::amdgcn_struct_ptr_buffer_atomic_sub_clamp_u32:
8336 case Intrinsic::amdgcn_raw_buffer_atomic_cond_sub_u32:
8337 case Intrinsic::amdgcn_raw_ptr_buffer_atomic_cond_sub_u32:
8338 case Intrinsic::amdgcn_struct_buffer_atomic_cond_sub_u32:
8339 case Intrinsic::amdgcn_struct_ptr_buffer_atomic_cond_sub_u32:
8340 case Intrinsic::amdgcn_raw_buffer_atomic_fadd:
8341 case Intrinsic::amdgcn_raw_ptr_buffer_atomic_fadd:
8342 case Intrinsic::amdgcn_struct_buffer_atomic_fadd:
8343 case Intrinsic::amdgcn_struct_ptr_buffer_atomic_fadd:
8345 case Intrinsic::amdgcn_rsq_clamp:
8347 case Intrinsic::amdgcn_image_bvh_intersect_ray:
8349 case Intrinsic::amdgcn_image_bvh_dual_intersect_ray:
8350 case Intrinsic::amdgcn_image_bvh8_intersect_ray:
8352 case Intrinsic::amdgcn_swmmac_f32_16x16x128_fp8_fp8:
8353 case Intrinsic::amdgcn_swmmac_f32_16x16x128_fp8_bf8:
8354 case Intrinsic::amdgcn_swmmac_f32_16x16x128_bf8_fp8:
8355 case Intrinsic::amdgcn_swmmac_f32_16x16x128_bf8_bf8:
8356 case Intrinsic::amdgcn_swmmac_f16_16x16x128_fp8_fp8:
8357 case Intrinsic::amdgcn_swmmac_f16_16x16x128_fp8_bf8:
8358 case Intrinsic::amdgcn_swmmac_f16_16x16x128_bf8_fp8:
8359 case Intrinsic::amdgcn_swmmac_f16_16x16x128_bf8_bf8: {
8363 if (IndexArgTy !=
S64) {
8364 auto NewIndex = IndexArgTy.
isVector() ?
B.buildBitcast(
S64, Index)
8365 :
B.buildAnyExt(
S64, Index);
8366 MI.getOperand(5).setReg(NewIndex.getReg(0));
8370 case Intrinsic::amdgcn_swmmac_f16_16x16x32_f16:
8371 case Intrinsic::amdgcn_swmmac_bf16_16x16x32_bf16:
8372 case Intrinsic::amdgcn_swmmac_f32_16x16x32_bf16:
8373 case Intrinsic::amdgcn_swmmac_f32_16x16x32_f16:
8374 case Intrinsic::amdgcn_swmmac_f32_16x16x32_fp8_fp8:
8375 case Intrinsic::amdgcn_swmmac_f32_16x16x32_fp8_bf8:
8376 case Intrinsic::amdgcn_swmmac_f32_16x16x32_bf8_fp8:
8377 case Intrinsic::amdgcn_swmmac_f32_16x16x32_bf8_bf8: {
8381 MI.getOperand(5).setReg(
B.buildAnyExt(
S32, Index).getReg(0));
8384 case Intrinsic::amdgcn_swmmac_f16_16x16x64_f16:
8385 case Intrinsic::amdgcn_swmmac_bf16_16x16x64_bf16:
8386 case Intrinsic::amdgcn_swmmac_f32_16x16x64_bf16:
8387 case Intrinsic::amdgcn_swmmac_bf16f32_16x16x64_bf16:
8388 case Intrinsic::amdgcn_swmmac_f32_16x16x64_f16:
8389 case Intrinsic::amdgcn_swmmac_i32_16x16x128_iu8:
8390 case Intrinsic::amdgcn_swmmac_i32_16x16x32_iu4:
8391 case Intrinsic::amdgcn_swmmac_i32_16x16x32_iu8:
8392 case Intrinsic::amdgcn_swmmac_i32_16x16x64_iu4: {
8394 LLT IdxTy = IntrID == Intrinsic::amdgcn_swmmac_i32_16x16x128_iu8
8398 if (IndexArgTy != IdxTy) {
8399 auto NewIndex = IndexArgTy.isVector() ? B.buildBitcast(IdxTy, Index)
8400 : B.buildAnyExt(IdxTy, Index);
8401 MI.getOperand(7).setReg(NewIndex.getReg(0));
8406 case Intrinsic::amdgcn_fmed3: {
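// Rewrite in place to the target pseudo G_AMDGPU_FMED3 and drop the
// intrinsic-ID operand.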
8412 MI.setDesc(B.getTII().get(AMDGPU::G_AMDGPU_FMED3));
8413 MI.removeOperand(1);
8417 case Intrinsic::amdgcn_readlane:
8418 case Intrinsic::amdgcn_writelane:
8419 case Intrinsic::amdgcn_readfirstlane:
8420 case Intrinsic::amdgcn_permlane16:
8421 case Intrinsic::amdgcn_permlanex16:
8422 case Intrinsic::amdgcn_permlane64:
8423 case Intrinsic::amdgcn_set_inactive:
8424 case Intrinsic::amdgcn_set_inactive_chain_arg:
8425 case Intrinsic::amdgcn_mov_dpp8:
8426 case Intrinsic::amdgcn_update_dpp:
8428 case Intrinsic::amdgcn_s_buffer_prefetch_data:
8430 case Intrinsic::amdgcn_dead: {
8434 MI.eraseFromParent();
8437 case Intrinsic::amdgcn_cooperative_atomic_load_32x4B:
8438 case Intrinsic::amdgcn_cooperative_atomic_load_16x8B:
8439 case Intrinsic::amdgcn_cooperative_atomic_load_8x16B:
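// Lower to an ordinary G_LOAD, reusing the memory operand that the
// IRTranslator attached to the intrinsic.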
8440 assert(MI.hasOneMemOperand() && "Expected IRTranslator to set MemOp!");
8441 B.buildLoad(MI.getOperand(0), MI.getOperand(2), **MI.memoperands_begin());
8442 MI.eraseFromParent();
8444 case Intrinsic::amdgcn_cooperative_atomic_store_32x4B:
8445 case Intrinsic::amdgcn_cooperative_atomic_store_16x8B:
8446 case Intrinsic::amdgcn_cooperative_atomic_store_8x16B:
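// Likewise for stores: lower to a plain G_STORE carrying the original
// memory operand.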
8447 assert(MI.hasOneMemOperand() && "Expected IRTranslator to set MemOp!");
8448 B.buildStore(MI.getOperand(2), MI.getOperand(1), **MI.memoperands_begin());
8449 MI.eraseFromParent();
8451 case Intrinsic::amdgcn_flat_load_monitor_b32:
8452 case Intrinsic::amdgcn_flat_load_monitor_b64:
8453 case Intrinsic::amdgcn_flat_load_monitor_b128:
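// Lower to the target pseudo G_AMDGPU_FLAT_LOAD_MONITOR, forwarding the
// result, the pointer argument, and the memory operand.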
8454 assert(MI.hasOneMemOperand() && "Expected IRTranslator to set MemOp!");
8455 B.buildInstr(AMDGPU::G_AMDGPU_FLAT_LOAD_MONITOR)
8456 .add(MI.getOperand(0))
8457 .add(MI.getOperand(2))
8458 .addMemOperand(*MI.memoperands_begin());
8459 MI.eraseFromParent();
8461 case Intrinsic::amdgcn_global_load_monitor_b32:
8462 case Intrinsic::amdgcn_global_load_monitor_b64:
8463 case Intrinsic::amdgcn_global_load_monitor_b128:
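// Same lowering for the global-address-space variant.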
8464 assert(MI.hasOneMemOperand() && "Expected IRTranslator to set MemOp!");
8465 B.buildInstr(AMDGPU::G_AMDGPU_GLOBAL_LOAD_MONITOR)
8466 .add(MI.getOperand(0))
8467 .add(MI.getOperand(2))
8468 .addMemOperand(*MI.memoperands_begin());
8469 MI.eraseFromParent();
MachineInstrBuilder & UseMI
MachineInstrBuilder & DefMI
static unsigned getIntrinsicID(const SDNode *N)
assert(UImm && (UImm != ~static_cast<T>(0)) && "Invalid immediate!")
static SDValue extractF64Exponent(SDValue Hi, const SDLoc &SL, SelectionDAG &DAG)
static SDValue getMad(SelectionDAG &DAG, const SDLoc &SL, EVT VT, SDValue X, SDValue Y, SDValue C, SDNodeFlags Flags=SDNodeFlags())
static bool valueIsKnownNeverF32Denorm(SDValue Src)
Return true if it's known that Src can never be an f32 denormal value.
Contains the definition of a TargetInstrInfo class that is common to all AMD GPUs.
static void packImage16bitOpsToDwords(MachineIRBuilder &B, MachineInstr &MI, SmallVectorImpl< Register > &PackedAddrs, unsigned ArgOffset, const AMDGPU::ImageDimIntrinsicInfo *Intr, bool IsA16, bool IsG16)
Turn a set of s16 typed registers in AddrRegs into a dword sized vector with s16 typed elements.
static unsigned getBufferAtomicPseudo(Intrinsic::ID IntrID)
static LLT getBufferRsrcScalarType(const LLT Ty)
static LegalityPredicate isIllegalRegisterType(const GCNSubtarget &ST, unsigned TypeIdx)
static cl::opt< bool > EnableNewLegality("amdgpu-global-isel-new-legality", cl::desc("Use GlobalISel desired legality, rather than try to use" "rules compatible with selection patterns"), cl::init(false), cl::ReallyHidden)
static MachineInstrBuilder buildExp(MachineIRBuilder &B, const DstOp &Dst, const SrcOp &Src, unsigned Flags)
static bool needsDenormHandlingF32(const MachineFunction &MF, Register Src, unsigned Flags)
constexpr std::initializer_list< LLT > AllVectors
static LegalizeMutation bitcastToVectorElement32(unsigned TypeIdx)
static LegalityPredicate isSmallOddVector(unsigned TypeIdx)
static LegalizeMutation oneMoreElement(unsigned TypeIdx)
static LegalityPredicate vectorSmallerThan(unsigned TypeIdx, unsigned Size)
static bool allowApproxFunc(const MachineFunction &MF, unsigned Flags)
static bool shouldBitcastLoadStoreType(const GCNSubtarget &ST, const LLT Ty, const LLT MemTy)
Return true if a load or store of the type should be lowered with a bitcast to a different type.
static constexpr unsigned FPEnvModeBitField
static LegalizeMutation getScalarTypeFromMemDesc(unsigned TypeIdx)
static LegalityPredicate vectorWiderThan(unsigned TypeIdx, unsigned Size)
static bool shouldWidenLoad(const GCNSubtarget &ST, LLT MemoryTy, uint64_t AlignInBits, unsigned AddrSpace, unsigned Opcode)
Return true if we should legalize a load by widening an odd sized memory access up to the alignment.
static bool isRegisterVectorElementType(LLT EltTy)
static LegalizeMutation fewerEltsToSize64Vector(unsigned TypeIdx)
static LegalityPredicate isWideVec16(unsigned TypeIdx)
constexpr std::initializer_list< LLT > AllScalarTypes
static LegalityPredicate isTruncStoreToSizePowerOf2(unsigned TypeIdx)
constexpr std::initializer_list< LLT > AllS32Vectors
static LegalizeMutation moreElementsToNextExistingRegClass(unsigned TypeIdx)
static Register castBufferRsrcToV4I32(Register Pointer, MachineIRBuilder &B)
Cast a buffer resource (an address space 8 pointer) into a 4xi32, which is the form in which the valu...
static bool isRegisterClassType(const GCNSubtarget &ST, LLT Ty)
static std::pair< Register, Register > emitReciprocalU64(MachineIRBuilder &B, Register Val)
static LLT getBitcastRegisterType(const LLT Ty)
static LLT getBufferRsrcRegisterType(const LLT Ty)
static LegalizeMutation bitcastToRegisterType(unsigned TypeIdx)
static Register stripAnySourceMods(Register OrigSrc, MachineRegisterInfo &MRI)
static LLT castBufferRsrcFromV4I32(MachineInstr &MI, MachineIRBuilder &B, MachineRegisterInfo &MRI, unsigned Idx)
Mutates IR (typically a load instruction) to use a <4 x s32> as the initial type of the operand idx an...
static bool replaceWithConstant(MachineIRBuilder &B, MachineInstr &MI, int64_t C)
static constexpr unsigned SPDenormModeBitField
static unsigned maxSizeForAddrSpace(const GCNSubtarget &ST, unsigned AS, bool IsLoad, bool IsAtomic)
static bool isLoadStoreSizeLegal(const GCNSubtarget &ST, const LegalityQuery &Query)
static MachineInstr * verifyCFIntrinsic(MachineInstr &MI, MachineRegisterInfo &MRI, MachineInstr *&Br, MachineBasicBlock *&UncondBrTarget, bool &Negated)
static LegalityPredicate numElementsNotEven(unsigned TypeIdx)
constexpr std::initializer_list< LLT > AllS64Vectors
static void castBufferRsrcArgToV4I32(MachineInstr &MI, MachineIRBuilder &B, unsigned Idx)
static constexpr unsigned FPEnvTrapBitField
static constexpr unsigned MaxRegisterSize
static bool isRegisterSize(const GCNSubtarget &ST, unsigned Size)
static LegalityPredicate isWideScalarExtLoadTruncStore(unsigned TypeIdx)
static bool hasBufferRsrcWorkaround(const LLT Ty)
static void toggleSPDenormMode(bool Enable, MachineIRBuilder &B, const GCNSubtarget &ST, SIModeRegisterDefaults Mode)
constexpr std::initializer_list< LLT > AllS16Vectors
static bool loadStoreBitcastWorkaround(const LLT Ty)
static LLT widenToNextPowerOf2(LLT Ty)
static bool isNot(const MachineRegisterInfo &MRI, const MachineInstr &MI)
static void convertImageAddrToPacked(MachineIRBuilder &B, MachineInstr &MI, int DimIdx, int NumVAddrs)
Convert from separate vaddr components to a single vector address register, and replace the remaining...
static bool isLoadStoreLegal(const GCNSubtarget &ST, const LegalityQuery &Query)
static LegalizeMutation moreEltsToNext32Bit(unsigned TypeIdx)
static LLT getPow2VectorType(LLT Ty)
static void buildBufferLoad(unsigned Opc, Register LoadDstReg, Register RSrc, Register VIndex, Register VOffset, Register SOffset, unsigned ImmOffset, unsigned Format, unsigned AuxiliaryData, MachineMemOperand *MMO, bool IsTyped, bool HasVIndex, MachineIRBuilder &B)
static LLT getPow2ScalarType(LLT Ty)
static LegalityPredicate elementTypeIsLegal(unsigned TypeIdx)
static bool isRegisterVectorType(LLT Ty)
static LegalityPredicate sizeIsMultipleOf32(unsigned TypeIdx)
static bool isRegisterType(const GCNSubtarget &ST, LLT Ty)
static bool isKnownNonNull(Register Val, MachineRegisterInfo &MRI, const AMDGPUTargetMachine &TM, unsigned AddrSpace)
Return true if the value is a known valid address, such that a null check is not necessary.
This file declares the targeting of the Machinelegalizer class for AMDGPU.
Provides AMDGPU specific target descriptions.
The AMDGPU TargetMachine interface definition for hw codegen targets.
MachineBasicBlock MachineBasicBlock::iterator DebugLoc DL
Declares convenience wrapper classes for interpreting MachineInstr instances as specific generic oper...
Interface for Targets to specify which operations they can successfully select and how the others sho...
Contains matchers for matching SSA Machine Instructions.
This file declares the MachineIRBuilder class.
Register const TargetRegisterInfo * TRI
const SmallVectorImpl< MachineOperand > & Cond
#define FP_DENORM_FLUSH_NONE
Interface definition for SIInstrInfo.
Interface definition for SIRegisterInfo.
This file defines the make_scope_exit function, which executes user-defined cleanup logic at scope ex...
static constexpr int Concat[]
bool legalizeConstHwRegRead(MachineInstr &MI, MachineIRBuilder &B, AMDGPU::Hwreg::Id HwReg, unsigned LowBit, unsigned Width) const
void buildMultiply(LegalizerHelper &Helper, MutableArrayRef< Register > Accum, ArrayRef< Register > Src0, ArrayRef< Register > Src1, bool UsePartialMad64_32, bool SeparateOddAlignedProducts) const
bool legalizeGlobalValue(MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B) const
bool legalizeFSQRTF16(MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B) const
bool legalizeIntrinsicTrunc(MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B) const
bool legalizeInsert(LegalizerHelper &Helper, MachineInstr &MI) const
std::pair< Register, unsigned > splitBufferOffsets(MachineIRBuilder &B, Register OrigOffset) const
bool legalizeBVHIntersectRayIntrinsic(MachineInstr &MI, MachineIRBuilder &B) const
bool legalizeIsAddrSpace(MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B, unsigned AddrSpace) const
bool legalizeUnsignedDIV_REM(MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B) const
bool legalizeFSQRTF32(MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B) const
bool legalizeAtomicCmpXChg(MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B) const
bool legalizeTrapHsa(MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B) const
bool legalizeBufferStore(MachineInstr &MI, LegalizerHelper &Helper, bool IsTyped, bool IsFormat) const
bool legalizeMul(LegalizerHelper &Helper, MachineInstr &MI) const
bool legalizeFFREXP(MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B) const
Register getSegmentAperture(unsigned AddrSpace, MachineRegisterInfo &MRI, MachineIRBuilder &B) const
bool legalizeFDIV64(MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B) const
bool legalizePointerAsRsrcIntrin(MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B) const
To create a buffer resource from a 64-bit pointer, mask off the upper 32 bits of the pointer and repl...
bool legalizeFlogCommon(MachineInstr &MI, MachineIRBuilder &B) const
bool getLDSKernelId(Register DstReg, MachineRegisterInfo &MRI, MachineIRBuilder &B) const
bool legalizeFExp2(MachineInstr &MI, MachineIRBuilder &B) const
bool legalizeTrap(MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B) const
bool legalizeBufferAtomic(MachineInstr &MI, MachineIRBuilder &B, Intrinsic::ID IID) const
void legalizeUnsignedDIV_REM32Impl(MachineIRBuilder &B, Register DstDivReg, Register DstRemReg, Register Num, Register Den) const
Register handleD16VData(MachineIRBuilder &B, MachineRegisterInfo &MRI, Register Reg, bool ImageStore=false) const
Handle register layout difference for f16 images for some subtargets.
bool legalizeCTLZ_CTTZ(MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B) const
bool legalizeBuildVector(MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B) const
bool legalizeFFloor(MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B) const
AMDGPULegalizerInfo(const GCNSubtarget &ST, const GCNTargetMachine &TM)
bool legalizeFDIV32(MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B) const
bool legalizeFMad(MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B) const
bool legalizeFDIV(MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B) const
bool legalizeSBufferPrefetch(LegalizerHelper &Helper, MachineInstr &MI) const
bool legalizeFExp10Unsafe(MachineIRBuilder &B, Register Dst, Register Src, unsigned Flags) const
bool legalizeFExp(MachineInstr &MI, MachineIRBuilder &B) const
bool legalizeIntrinsic(LegalizerHelper &Helper, MachineInstr &MI) const override
bool legalizeFrem(MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B) const
bool legalizePreloadedArgIntrin(MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B, AMDGPUFunctionArgInfo::PreloadedValue ArgType) const
bool legalizeStore(LegalizerHelper &Helper, MachineInstr &MI) const
bool legalizeCustom(LegalizerHelper &Helper, MachineInstr &MI, LostDebugLocObserver &LocObserver) const override
Called for instructions with the Custom LegalizationAction.
bool buildPCRelGlobalAddress(Register DstReg, LLT PtrTy, MachineIRBuilder &B, const GlobalValue *GV, int64_t Offset, unsigned GAFlags=SIInstrInfo::MO_NONE) const
MachinePointerInfo getKernargSegmentPtrInfo(MachineFunction &MF) const
bool legalizeFDIV16(MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B) const
bool legalizeRsqClampIntrinsic(MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B) const
bool legalizeFExpUnsafeImpl(MachineIRBuilder &B, Register Dst, Register Src, unsigned Flags, bool IsExp10) const
std::pair< Register, Register > getScaledLogInput(MachineIRBuilder &B, Register Src, unsigned Flags) const
bool legalizeFDIVFastIntrin(MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B) const
bool loadInputValue(Register DstReg, MachineIRBuilder &B, AMDGPUFunctionArgInfo::PreloadedValue ArgType) const
bool legalizeBVHDualOrBVH8IntersectRayIntrinsic(MachineInstr &MI, MachineIRBuilder &B) const
bool legalizeInsertVectorElt(MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B) const
bool legalizeFExpUnsafe(MachineIRBuilder &B, Register Dst, Register Src, unsigned Flags) const
bool legalizeFEXPF64(MachineInstr &MI, MachineIRBuilder &B) const
bool legalizeAddrSpaceCast(MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B) const
bool legalizeExtract(LegalizerHelper &Helper, MachineInstr &MI) const
bool legalizeBufferLoad(MachineInstr &MI, LegalizerHelper &Helper, bool IsFormat, bool IsTyped) const
bool legalizeImplicitArgPtr(MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B) const
bool legalizeMinNumMaxNum(LegalizerHelper &Helper, MachineInstr &MI) const
void legalizeUnsignedDIV_REM64Impl(MachineIRBuilder &B, Register DstDivReg, Register DstRemReg, Register Num, Register Den) const
bool legalizeDebugTrap(MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B) const
bool legalizeFastUnsafeFDIV(MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B) const
bool legalizeSinCos(MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B) const
bool legalizeCTLS(MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B) const
bool legalizeWaveID(MachineInstr &MI, MachineIRBuilder &B) const
bool legalizeFroundeven(MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B) const
bool legalizeLDSKernelId(MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B) const
bool legalizeWorkGroupId(MachineInstr &MI, MachineIRBuilder &B, AMDGPUFunctionArgInfo::PreloadedValue ClusterIdPV, AMDGPUFunctionArgInfo::PreloadedValue ClusterMaxIdPV, AMDGPUFunctionArgInfo::PreloadedValue ClusterWorkGroupIdPV) const
bool legalizeSignedDIV_REM(MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B) const
bool legalizeITOFP(MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B, bool Signed) const
bool legalizeFPow(MachineInstr &MI, MachineIRBuilder &B) const
bool legalizeFastUnsafeFDIV64(MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B) const
bool legalizeFPTOI(MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B, bool Signed) const
bool legalizeStackSave(MachineInstr &MI, MachineIRBuilder &B) const
bool legalizeFlogUnsafe(MachineIRBuilder &B, Register Dst, Register Src, bool IsLog10, unsigned Flags) const
bool legalizeKernargMemParameter(MachineInstr &MI, MachineIRBuilder &B, uint64_t Offset, Align Alignment=Align(4)) const
Legalize a value that's loaded from kernel arguments.
bool legalizeImageIntrinsic(MachineInstr &MI, MachineIRBuilder &B, GISelChangeObserver &Observer, const AMDGPU::ImageDimIntrinsicInfo *ImageDimIntr) const
Rewrite image intrinsics to use register layouts expected by the subtarget.
void buildAbsGlobalAddress(Register DstReg, LLT PtrTy, MachineIRBuilder &B, const GlobalValue *GV, MachineRegisterInfo &MRI) const
bool legalizeGetFPEnv(MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B) const
bool getImplicitArgPtr(Register DstReg, MachineRegisterInfo &MRI, MachineIRBuilder &B) const
bool legalizeFSQRT(MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B) const
Register getKernargParameterPtr(MachineIRBuilder &B, int64_t Offset) const
bool legalizeSBufferLoad(LegalizerHelper &Helper, MachineInstr &MI) const
bool legalizeFceil(MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B) const
bool legalizeFSQRTF64(MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B) const
bool legalizeExtractVectorElt(MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B) const
bool legalizeLoad(LegalizerHelper &Helper, MachineInstr &MI) const
bool legalizeCTLZ_ZERO_UNDEF(MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B) const
Register fixStoreSourceType(MachineIRBuilder &B, Register VData, LLT MemTy, bool IsFormat) const
bool legalizeLaneOp(LegalizerHelper &Helper, MachineInstr &MI, Intrinsic::ID IID) const
bool legalizeSetFPEnv(MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B) const
bool legalizeWorkitemIDIntrinsic(MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B, unsigned Dim, AMDGPUFunctionArgInfo::PreloadedValue ArgType) const
void buildLoadInputValue(Register DstReg, MachineIRBuilder &B, const ArgDescriptor *Arg, const TargetRegisterClass *ArgRC, LLT ArgTy) const
bool legalizeTrapHsaQueuePtr(MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B) const
bool legalizeFlog2(MachineInstr &MI, MachineIRBuilder &B) const
bool legalizeTrapEndpgm(MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B) const
static std::optional< uint32_t > getLDSKernelIdMetadata(const Function &F)
bool isModuleEntryFunction() const
void setDynLDSAlign(const Function &F, const GlobalVariable &GV)
unsigned allocateLDSGlobal(const DataLayout &DL, const GlobalVariable &GV)
bool isBottomOfStack() const
bool isEntryFunction() const
bool isNoopAddrSpaceCast(unsigned SrcAS, unsigned DestAS) const override
Returns true if a cast between SrcAS and DestAS is a noop.
const std::array< unsigned, 3 > & getDims() const
static const fltSemantics & IEEEsingle()
static const fltSemantics & IEEEdouble()
static APFloat getSmallestNormalized(const fltSemantics &Sem, bool Negative=false)
Returns the smallest (by magnitude) normalized finite number in the given semantics.
static APFloat getLargest(const fltSemantics &Sem, bool Negative=false)
Returns the largest finite number in the given semantics.
static APFloat getInf(const fltSemantics &Sem, bool Negative=false)
Factory for Positive and Negative Infinity.
ArrayRef - Represent a constant reference to an array (0 or more elements consecutively in memory),...
size_t size() const
size - Get the array size.
@ FCMP_OEQ
0 0 0 1 True if ordered and equal
@ ICMP_SLT
signed less than
@ FCMP_OLT
0 1 0 0 True if ordered and less than
@ FCMP_ULE
1 1 0 1 True if unordered, less than, or equal
@ FCMP_OGT
0 0 1 0 True if ordered and greater than
@ ICMP_UGE
unsigned greater or equal
@ ICMP_SGT
signed greater than
@ FCMP_ONE
0 1 1 0 True if ordered and operands are unequal
@ ICMP_ULT
unsigned less than
@ FCMP_OLE
0 1 0 1 True if ordered and less than or equal
@ FCMP_ORD
0 1 1 1 True if ordered (no nans)
@ FCMP_UGE
1 0 1 1 True if unordered, greater than, or equal
This is the shared class of boolean and integer constants.
int64_t getSExtValue() const
Return the constant as a 64-bit integer value after it has been sign extended as appropriate for the ...
Diagnostic information for unsupported feature in backend.
static constexpr ElementCount getFixed(ScalarTy MinVal)
LLVMContext & getContext() const
getContext - Return a reference to the LLVMContext associated with this function.
Abstract class that contains various methods for clients to notify about changes.
virtual void changingInstr(MachineInstr &MI)=0
This instruction is about to be mutated in some way.
virtual void changedInstr(MachineInstr &MI)=0
This instruction was mutated in some way.
Simple wrapper observer that takes several observers, and calls each one for each event.
KnownBits getKnownBits(Register R)
bool hasExternalLinkage() const
Module * getParent()
Get the module that this global value is contained inside of...
LLVM_ABI const DataLayout & getDataLayout() const
Get the data layout of the module this global belongs to.
LLVM_ABI uint64_t getGlobalSize(const DataLayout &DL) const
Get the size of this global variable in bytes.
static constexpr LLT float64()
Get a 64-bit IEEE double value.
constexpr unsigned getScalarSizeInBits() const
constexpr bool isScalar() const
constexpr LLT changeElementType(LLT NewEltTy) const
If this type is a vector, return a vector with the same number of elements but the new element type.
static constexpr LLT vector(ElementCount EC, unsigned ScalarSizeInBits)
Get a low-level vector of some number of elements and element width.
static constexpr LLT scalar(unsigned SizeInBits)
Get a low-level scalar or aggregate "bag of bits".
constexpr uint16_t getNumElements() const
Returns the number of elements in a vector LLT.
constexpr bool isVector() const
static constexpr LLT pointer(unsigned AddressSpace, unsigned SizeInBits)
Get a low-level pointer in the given address space.
constexpr TypeSize getSizeInBits() const
Returns the total size of the type. Must only be called on sized types.
constexpr bool isPointer() const
constexpr LLT getElementType() const
Returns the vector's element type. Only valid for vector types.
constexpr LLT changeElementSize(unsigned NewEltSize) const
If this type is a vector, return a vector with the same number of elements but the new element size.
static constexpr LLT float16()
Get a 16-bit IEEE half value.
constexpr unsigned getAddressSpace() const
static constexpr LLT fixed_vector(unsigned NumElements, unsigned ScalarSizeInBits)
Get a low-level fixed-width vector of some number of elements and element width.
constexpr LLT changeElementCount(ElementCount EC) const
Return a vector or scalar with the same element type and the new element count.
constexpr LLT getScalarType() const
static constexpr LLT scalarOrVector(ElementCount EC, LLT ScalarTy)
static constexpr LLT float32()
Get a 32-bit IEEE float value.
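As a quick orientation, the LLT constructors and queries listed above compose as in this minimal sketch (not taken from this file; the concrete sizes are arbitrary and the usual LLVM headers are assumed):
using namespace llvm;
// Illustrative only: build and query a couple of low-level types.
static void lltSketch() {
  const LLT S16 = LLT::scalar(16);
  const LLT V2S16 = LLT::fixed_vector(2, 16);              // <2 x s16>
  assert(V2S16.getSizeInBits() == 32 && V2S16.getElementType() == S16);
  const LLT V4S16 = V2S16.changeElementCount(ElementCount::getFixed(4));
  const LLT V2S32 = V2S16.changeElementSize(32);           // <2 x s32>
  assert(V4S16.getNumElements() == 4 && V2S32.getScalarSizeInBits() == 32);
  (void)V4S16;
  (void)V2S32;
}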
LLVM_ABI void diagnose(const DiagnosticInfo &DI)
Report a message to the currently installed diagnostic handler.
LLVM_ABI void computeTables()
Compute any ancillary tables needed to quickly decide how an operation should be handled.
LegalizeRuleSet & minScalar(unsigned TypeIdx, const LLT Ty)
Ensure the scalar is at least as wide as Ty.
LegalizeRuleSet & legalFor(std::initializer_list< LLT > Types)
The instruction is legal when type index 0 is any type in the given list.
LegalizeRuleSet & unsupported()
The instruction is unsupported.
LegalizeRuleSet & scalarSameSizeAs(unsigned TypeIdx, unsigned SameSizeIdx)
Change the type TypeIdx to have the same scalar size as type SameSizeIdx.
LegalizeRuleSet & fewerElementsIf(LegalityPredicate Predicate, LegalizeMutation Mutation)
Remove elements to reach the type selected by the mutation if the predicate is true.
LegalizeRuleSet & clampScalarOrElt(unsigned TypeIdx, const LLT MinTy, const LLT MaxTy)
Limit the range of scalar sizes to MinTy and MaxTy.
LegalizeRuleSet & bitcastIf(LegalityPredicate Predicate, LegalizeMutation Mutation)
The specified type index is coerced if predicate is true.
LegalizeRuleSet & maxScalar(unsigned TypeIdx, const LLT Ty)
Ensure the scalar is at most as wide as Ty.
LegalizeRuleSet & minScalarOrElt(unsigned TypeIdx, const LLT Ty)
Ensure the scalar or element is at least as wide as Ty.
LegalizeRuleSet & clampMaxNumElements(unsigned TypeIdx, const LLT EltTy, unsigned MaxElements)
Limit the number of elements in EltTy vectors to at most MaxElements.
LegalizeRuleSet & unsupportedFor(std::initializer_list< LLT > Types)
LegalizeRuleSet & lower()
The instruction is lowered.
LegalizeRuleSet & moreElementsIf(LegalityPredicate Predicate, LegalizeMutation Mutation)
Add more elements to reach the type selected by the mutation if the predicate is true.
LegalizeRuleSet & lowerFor(std::initializer_list< LLT > Types)
The instruction is lowered when type index 0 is any type in the given list.
LegalizeRuleSet & clampScalar(unsigned TypeIdx, const LLT MinTy, const LLT MaxTy)
Limit the range of scalar sizes to MinTy and MaxTy.
LegalizeRuleSet & custom()
Unconditionally custom lower.
LegalizeRuleSet & clampMaxNumElementsStrict(unsigned TypeIdx, const LLT EltTy, unsigned NumElts)
Express EltTy vectors strictly using vectors with NumElts elements (or scalars when NumElts equals 1)...
LegalizeRuleSet & unsupportedIf(LegalityPredicate Predicate)
LegalizeRuleSet & widenScalarIf(LegalityPredicate Predicate, LegalizeMutation Mutation)
Widen the scalar to the one selected by the mutation if the predicate is true.
LegalizeRuleSet & alwaysLegal()
LegalizeRuleSet & clampNumElements(unsigned TypeIdx, const LLT MinTy, const LLT MaxTy)
Limit the number of elements for the given vectors to at least MinTy's number of elements and at most...
LegalizeRuleSet & maxScalarIf(LegalityPredicate Predicate, unsigned TypeIdx, const LLT Ty)
Conditionally limit the maximum size of the scalar.
LegalizeRuleSet & customIf(LegalityPredicate Predicate)
LegalizeRuleSet & widenScalarToNextPow2(unsigned TypeIdx, unsigned MinSize=0)
Widen the scalar to the next power of two that is at least MinSize.
LegalizeRuleSet & scalarize(unsigned TypeIdx)
LegalizeRuleSet & legalForCartesianProduct(std::initializer_list< LLT > Types)
The instruction is legal when type indexes 0 and 1 are both in the given list.
LegalizeRuleSet & legalIf(LegalityPredicate Predicate)
The instruction is legal if predicate is true.
LegalizeRuleSet & customFor(std::initializer_list< LLT > Types)
LegalizeRuleSet & widenScalarToNextMultipleOf(unsigned TypeIdx, unsigned Size)
Widen the scalar to the next multiple of Size.
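For context, a minimal sketch (not this file's actual rules) of how the LegalizeRuleSet builder methods above chain; the rule set, type indices, and types below are illustrative only:
using namespace llvm;
// Illustrative only: the builder methods return the rule set by reference,
// so constraints compose left to right.
static void exampleRules(LegalizeRuleSet &Builder) {
  const LLT S16 = LLT::scalar(16);
  const LLT S32 = LLT::scalar(32);
  const LLT S64 = LLT::scalar(64);
  const LLT V2S16 = LLT::fixed_vector(2, 16);
  Builder.legalFor({S32, S64, V2S16})   // directly selectable types
      .clampScalar(0, S16, S64)         // keep scalars within [s16, s64]
      .widenScalarToNextPow2(0, 32)     // then round up to a power of two
      .clampMaxNumElements(0, S16, 2)   // at most <2 x s16>
      .scalarize(1);                    // break up the second type index
}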
LLVM_ABI LegalizeResult lowerFMinNumMaxNum(MachineInstr &MI)
LLVM_ABI void moreElementsVectorDst(MachineInstr &MI, LLT MoreTy, unsigned OpIdx)
Legalize a single operand OpIdx of the machine instruction MI as a Def by performing it with addition...
LLVM_ABI LegalizeResult lowerInsert(MachineInstr &MI)
LLVM_ABI LegalizeResult lowerExtract(MachineInstr &MI)
GISelValueTracking * getValueTracking() const
@ Legalized
Instruction has been legalized and the MachineFunction changed.
GISelChangeObserver & Observer
To keep track of changes made by the LegalizerHelper.
LLVM_ABI void bitcastDst(MachineInstr &MI, LLT CastTy, unsigned OpIdx)
Legalize a single operand OpIdx of the machine instruction MI as a def by inserting a G_BITCAST from ...
LLVM_ABI LegalizeResult lowerFMad(MachineInstr &MI)
MachineIRBuilder & MIRBuilder
Expose MIRBuilder so clients can set their own RecordInsertInstruction functions.
LLVM_ABI void widenScalarDst(MachineInstr &MI, LLT WideTy, unsigned OpIdx=0, unsigned TruncOpcode=TargetOpcode::G_TRUNC)
Legalize a single operand OpIdx of the machine instruction MI as a Def by extending the operand's typ...
LegalizeRuleSet & getActionDefinitionsBuilder(unsigned Opcode)
Get the action definition builder for the given opcode.
const LegacyLegalizerInfo & getLegacyLegalizerInfo() const
TypeSize getValue() const
Wrapper class representing physical registers. Should be passed by value.
constexpr bool isPhysical() const
Return true if the specified register number is in the physical register namespace.
LLVM_ABI void addSuccessor(MachineBasicBlock *Succ, BranchProbability Prob=BranchProbability::getUnknown())
Add Succ as a successor of this MachineBasicBlock.
LLVM_ABI MachineBasicBlock * splitAt(MachineInstr &SplitInst, bool UpdateLiveIns=true, LiveIntervals *LIS=nullptr)
Split a basic block into 2 pieces at SplitPoint.
const MachineFunction * getParent() const
Return the MachineFunction containing this basic block.
MachineInstrBundleIterator< MachineInstr > iterator
PseudoSourceValueManager & getPSVManager() const
const TargetSubtargetInfo & getSubtarget() const
getSubtarget - Return the subtarget for which this machine code is being compiled.
MachineMemOperand * getMachineMemOperand(MachinePointerInfo PtrInfo, MachineMemOperand::Flags f, LLT MemTy, Align base_alignment, const AAMDNodes &AAInfo=AAMDNodes(), const MDNode *Ranges=nullptr, SyncScope::ID SSID=SyncScope::System, AtomicOrdering Ordering=AtomicOrdering::NotAtomic, AtomicOrdering FailureOrdering=AtomicOrdering::NotAtomic)
getMachineMemOperand - Allocate a new MachineMemOperand.
DenormalMode getDenormalMode(const fltSemantics &FPType) const
Returns the denormal handling type for the default rounding mode of the function.
void push_back(MachineBasicBlock *MBB)
MachineRegisterInfo & getRegInfo()
getRegInfo - Return information about the registers currently in use.
Function & getFunction()
Return the LLVM function that this machine code represents.
BasicBlockListType::iterator iterator
Ty * getInfo()
getInfo - Keep track of various per-function pieces of information for backends that would like to do...
MachineBasicBlock * CreateMachineBasicBlock(const BasicBlock *BB=nullptr, std::optional< UniqueBBID > BBID=std::nullopt)
CreateMachineBasicBlock - Allocate a new MachineBasicBlock.
const TargetMachine & getTarget() const
getTarget - Return the target machine this machine code is compiled with
Helper class to build MachineInstr.
MachineFunction & getMF()
Getter for the function we currently build.
Register getReg(unsigned Idx) const
Get the register for the operand index.
const MachineInstrBuilder & addImm(int64_t Val) const
Add a new immediate operand.
const MachineInstrBuilder & addGlobalAddress(const GlobalValue *GV, int64_t Offset=0, unsigned TargetFlags=0) const
const MachineInstrBuilder & addMBB(MachineBasicBlock *MBB, unsigned TargetFlags=0) const
Representation of each machine instruction.
const MachineOperand & getOperand(unsigned i) const
A description of a memory reference used in the backend.
LocationSize getSize() const
Return the size in bytes of the memory reference.
LLT getMemoryType() const
Return the memory type of the memory reference.
@ MODereferenceable
The memory access is dereferenceable (i.e., doesn't trap).
@ MOLoad
The memory access reads data.
@ MOInvariant
The memory access always returns the same value (or traps).
LLVM_ABI Align getAlign() const
Return the minimum known alignment in bytes of the actual memory reference.
MachineOperand class - Representation of each machine instruction operand.
MachineBasicBlock * getMBB() const
LLVM_ABI void setReg(Register Reg)
Change the register this operand corresponds to.
void setMBB(MachineBasicBlock *MBB)
static MachineOperand CreateImm(int64_t Val)
Register getReg() const
getReg - Returns the register number.
MachineRegisterInfo - Keep track of information for virtual and physical registers,...
LLVM_ABI bool hasOneNonDBGUse(Register RegNo) const
hasOneNonDBGUse - Return true if there is exactly one non-Debug use of the specified register.
LLVM_ABI MachineInstr * getVRegDef(Register Reg) const
getVRegDef - Return the machine instr that defines the specified virtual register or null if none is ...
LLVM_ABI Register createVirtualRegister(const TargetRegisterClass *RegClass, StringRef Name="")
createVirtualRegister - Create and return a new virtual register in the function with the specified r...
LLT getType(Register Reg) const
Get the low-level type of Reg or LLT{} if Reg is not a generic (target independent) virtual register.
use_instr_nodbg_iterator use_instr_nodbg_begin(Register RegNo) const
LLVM_ABI void setRegClass(Register Reg, const TargetRegisterClass *RC)
setRegClass - Set the register class of the specified virtual register.
LLVM_ABI Register createGenericVirtualRegister(LLT Ty, StringRef Name="")
Create and return a new generic virtual register with low-level type Ty.
const TargetRegisterClass * getRegClassOrNull(Register Reg) const
Return the register class of Reg, or null if Reg has not been assigned a register class yet.
const TargetRegisterInfo * getTargetRegisterInfo() const
LLVM_ABI void replaceRegWith(Register FromReg, Register ToReg)
replaceRegWith - Replace all instances of FromReg with ToReg in the machine function.
MutableArrayRef - Represent a mutable reference to an array (0 or more elements consecutively in memo...
MutableArrayRef< T > drop_front(size_t N=1) const
Drop the first N elements of the array.
LLVM_ABI const PseudoSourceValue * getConstantPool()
Return a pseudo source value referencing the constant pool.
Wrapper class representing virtual and physical registers.
constexpr bool isValid() const
constexpr bool isVirtual() const
Return true if the specified register number is in the virtual register namespace.
static unsigned getMaxMUBUFImmOffset(const GCNSubtarget &ST)
This class keeps track of the SPI_SP_INPUT_ADDR config register, which tells the hardware which inter...
bool hasWorkGroupIDZ() const
AMDGPU::ClusterDimsAttr getClusterDims() const
SIModeRegisterDefaults getMode() const
std::tuple< const ArgDescriptor *, const TargetRegisterClass *, LLT > getPreloadedValue(AMDGPUFunctionArgInfo::PreloadedValue Value) const
static LLVM_READONLY const TargetRegisterClass * getSGPRClassForBitWidth(unsigned BitWidth)
bool allowsMisalignedMemoryAccessesImpl(unsigned Size, unsigned AddrSpace, Align Alignment, MachineMemOperand::Flags Flags=MachineMemOperand::MONone, unsigned *IsFast=nullptr) const
bool shouldEmitFixup(const GlobalValue *GV) const
bool shouldUseLDSConstAddress(const GlobalValue *GV) const
bool shouldEmitPCReloc(const GlobalValue *GV) const
This class consists of common code factored out of the SmallVector class to reduce code duplication b...
void truncate(size_type N)
Like resize, but requires that N is less than size().
void push_back(const T &Elt)
This is a 'vector' (really, a variable-sized array), optimized for the case when the array is small.
Register getStackPointerRegisterToSaveRestore() const
If a physical register, this specifies the register that llvm.savestack/llvm.restorestack should save...
unsigned getPointerSizeInBits(unsigned AS) const
A Use represents the edge between a Value definition and its users.
LLVM_ABI StringRef getName() const
Return a constant reference to the value's name.
self_iterator getIterator()
#define llvm_unreachable(msg)
Marks that the current location is not supposed to be reachable.
@ CONSTANT_ADDRESS_32BIT
Address space for 32-bit constant memory.
@ BUFFER_STRIDED_POINTER
Address space for 192-bit fat buffer pointers with an additional index.
@ REGION_ADDRESS
Address space for region memory. (GDS)
@ LOCAL_ADDRESS
Address space for local memory.
@ CONSTANT_ADDRESS
Address space for constant memory (VTX2).
@ FLAT_ADDRESS
Address space for flat memory.
@ GLOBAL_ADDRESS
Address space for global memory (RAT0, VTX0).
@ BUFFER_FAT_POINTER
Address space for 160-bit buffer fat pointers.
@ PRIVATE_ADDRESS
Address space for private memory.
@ BUFFER_RESOURCE
Address space for 128-bit buffer resources.
int getMIMGOpcode(unsigned BaseOpcode, unsigned MIMGEncoding, unsigned VDataDwords, unsigned VAddrDwords)
bool isFlatGlobalAddrSpace(unsigned AS)
bool isGFX12Plus(const MCSubtargetInfo &STI)
constexpr int64_t getNullPointerValue(unsigned AS)
Get the null pointer value for the given address space.
bool isGFX11(const MCSubtargetInfo &STI)
LLVM_READNONE bool isLegalDPALU_DPPControl(const MCSubtargetInfo &ST, unsigned DC)
unsigned getAMDHSACodeObjectVersion(const Module &M)
LLVM_READNONE constexpr bool isKernel(CallingConv::ID CC)
LLVM_READNONE constexpr bool isEntryFunctionCC(CallingConv::ID CC)
LLVM_READNONE constexpr bool isCompute(CallingConv::ID CC)
TargetExtType * isNamedBarrier(const GlobalVariable &GV)
bool isGFX11Plus(const MCSubtargetInfo &STI)
LLVM_READONLY const MIMGBaseOpcodeInfo * getMIMGBaseOpcodeInfo(unsigned BaseOpcode)
std::pair< Register, unsigned > getBaseWithConstantOffset(MachineRegisterInfo &MRI, Register Reg, GISelValueTracking *ValueTracking=nullptr, bool CheckNUW=false)
Returns base register and constant offset.
const ImageDimIntrinsicInfo * getImageDimIntrinsicInfo(unsigned Intr)
unsigned ID
LLVM IR allows arbitrary numbers to be used as calling convention identifiers.
@ MaxID
The highest possible ID. Must be some 2^k - 1.
@ AMDGPU_Gfx
Used for AMD graphics targets.
@ Fast
Attempts to make calls as fast as possible (e.g.
@ C
The default llvm calling convention, compatible with C.
LLVM_ABI LegalityPredicate scalarOrEltWiderThan(unsigned TypeIdx, unsigned Size)
True iff the specified type index is a scalar or a vector with an element type that's wider than the ...
LLVM_ABI LegalityPredicate isScalar(unsigned TypeIdx)
True iff the specified type index is a scalar.
LLVM_ABI LegalityPredicate isPointer(unsigned TypeIdx)
True iff the specified type index is a pointer (with any address space).
LLVM_ABI LegalityPredicate typeInSet(unsigned TypeIdx, std::initializer_list< LLT > TypesInit)
True iff the given type index is one of the specified types.
LLVM_ABI LegalityPredicate smallerThan(unsigned TypeIdx0, unsigned TypeIdx1)
True iff the first type index has a smaller total bit size than second type index.
LLVM_ABI LegalityPredicate largerThan(unsigned TypeIdx0, unsigned TypeIdx1)
True iff the first type index has a larger total bit size than second type index.
LLVM_ABI LegalityPredicate elementTypeIs(unsigned TypeIdx, LLT EltTy)
True if the type index is a vector with element type EltTy.
LLVM_ABI LegalityPredicate sameSize(unsigned TypeIdx0, unsigned TypeIdx1)
True iff the specified type indices are both the same bit size.
LLVM_ABI LegalityPredicate scalarOrEltNarrowerThan(unsigned TypeIdx, unsigned Size)
True iff the specified type index is a scalar or vector with an element type that's narrower than the...
LegalityPredicate typeIsNot(unsigned TypeIdx, LLT Type)
True iff the given type index is not the specified type.
Predicate all(Predicate P0, Predicate P1)
True iff P0 and P1 are true.
LLVM_ABI LegalityPredicate typeIs(unsigned TypeIdx, LLT TypesInit)
True iff the given type index is the specified type.
LLVM_ABI LegalityPredicate scalarNarrowerThan(unsigned TypeIdx, unsigned Size)
True iff the specified type index is a scalar that's narrower than the given size.
LLVM_ABI LegalizeMutation scalarize(unsigned TypeIdx)
Break up the vector type for the given type index into the element type.
LLVM_ABI LegalizeMutation widenScalarOrEltToNextPow2(unsigned TypeIdx, unsigned Min=0)
Widen the scalar type or vector element type for the given type index to the next power of 2.
LLVM_ABI LegalizeMutation changeTo(unsigned TypeIdx, LLT Ty)
Select this specific type for the given type index.
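And a sketch of pairing the predicate and mutation helpers above with conditional rules (again illustrative, not this file's actual rule set):
using namespace llvm;
// Illustrative only: a conditional widen plus an unsupported-pointer rule.
static void examplePredicateRules(LegalizeRuleSet &Builder) {
  Builder
      .widenScalarIf(LegalityPredicates::scalarNarrowerThan(0, 32),
                     LegalizeMutations::changeTo(0, LLT::scalar(32)))
      .unsupportedIf(LegalityPredicates::isPointer(0));
}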
Invariant opcodes: All instruction sets have these as their low opcodes.
initializer< Ty > init(const Ty &Val)
This is an optimization pass for GlobalISel generic memory operations.
LLVM_ABI Register getFunctionLiveInPhysReg(MachineFunction &MF, const TargetInstrInfo &TII, MCRegister PhysReg, const TargetRegisterClass &RC, const DebugLoc &DL, LLT RegTy=LLT())
Return a virtual register corresponding to the incoming argument register PhysReg.
unsigned Log2_32_Ceil(uint32_t Value)
Return the ceil log base 2 of the specified value, 32 if the value is zero.
LLVM_ABI Type * getTypeForLLT(LLT Ty, LLVMContext &C)
Get the type back from LLT.
LLVM_ABI MachineInstr * getOpcodeDef(unsigned Opcode, Register Reg, const MachineRegisterInfo &MRI)
See if Reg is defined by an single def instruction that is Opcode.
LLVM_ABI const ConstantFP * getConstantFPVRegVal(Register VReg, const MachineRegisterInfo &MRI)
MachineInstrBuilder BuildMI(MachineFunction &MF, const MIMetadata &MIMD, const MCInstrDesc &MCID)
Builder interface. Specify how to create the initial instruction itself.
constexpr bool isInt(int64_t x)
Checks if an integer fits into the given bit width.
@ Implicit
Not emitted register (e.g. carry, or temporary result).
@ Undef
Value of the register doesn't matter.
LLVM_ABI const llvm::fltSemantics & getFltSemanticForLLT(LLT Ty)
Get the appropriate floating point arithmetic semantic based on the bit size of the given scalar LLT.
std::function< std::pair< unsigned, LLT >(const LegalityQuery &)> LegalizeMutation
int bit_width(T Value)
Returns the number of bits needed to represent Value if Value is nonzero.
constexpr bool isPowerOf2_64(uint64_t Value)
Return true if the argument is a power of two > 0 (64 bit edition.)
constexpr int popcount(T Value) noexcept
Count the number of set bits in a value.
uint64_t PowerOf2Ceil(uint64_t A)
Returns the power of two which is greater than or equal to the given value.
LLVM_ABI std::optional< int64_t > getIConstantVRegSExtVal(Register VReg, const MachineRegisterInfo &MRI)
If VReg is defined by a G_CONSTANT fits in int64_t returns it.
int countr_zero(T Val)
Count number of 0's from the least significant bit to the most stopping at the first 1.
constexpr bool has_single_bit(T Value) noexcept
std::function< bool(const LegalityQuery &)> LegalityPredicate
constexpr bool isPowerOf2_32(uint32_t Value)
Return true if the argument is a power of two > 0.
constexpr uint64_t alignTo(uint64_t Size, Align A)
Returns a multiple of A needed to store Size bytes.
bool isa(const From &Val)
isa<X> - Return true if the parameter to the template is an instance of one of the template type argu...
MutableArrayRef(T &OneElt) -> MutableArrayRef< T >
constexpr T divideCeil(U Numerator, V Denominator)
Returns the integer ceil(Numerator / Denominator).
To bit_cast(const From &from) noexcept
@ Mul
Product of integers.
@ Sub
Subtraction of integers.
FunctionAddr VTableAddr Next
DWARFExpression::Operation Op
ArrayRef(const T &OneElt) -> ArrayRef< T >
constexpr unsigned BitWidth
LLVM_ABI void eraseInstr(MachineInstr &MI, MachineRegisterInfo &MRI, LostDebugLocObserver *LocObserver=nullptr)
decltype(auto) cast(const From &Val)
cast<X> - Return the argument parameter cast to the specified type.
LLVM_ABI std::optional< ValueAndVReg > getIConstantVRegValWithLookThrough(Register VReg, const MachineRegisterInfo &MRI, bool LookThroughInstrs=true)
If VReg is defined by a statically evaluable chain of instructions rooted on a G_CONSTANT returns its...
bool is_contained(R &&Range, const E &Element)
Returns true if Element is found in Range.
Align commonAlignment(Align A, uint64_t Offset)
Returns the alignment that satisfies both alignments.
unsigned Log2(Align A)
Returns the log2 of the alignment.
T bit_floor(T Value)
Returns the largest integral power of two no greater than Value if Value is nonzero.
constexpr uint64_t NextPowerOf2(uint64_t A)
Returns the next power of two (in 64-bits) that is strictly greater than A.
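A few worked values for the bit-math helpers above, chosen only as illustrations:
// divideCeil(70, 32)     == 3    (70/32 rounded up)
// PowerOf2Ceil(48)       == 64   (smallest power of two >= 48)
// NextPowerOf2(64)       == 128  (strictly greater than 64)
// isPowerOf2_32(96)      == false
// Log2_32_Ceil(96)       == 7    (2^6 = 64 < 96 <= 128 = 2^7)
// alignTo(70, Align(32)) == 96   (next multiple of the alignment)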
void swap(llvm::BitVector &LHS, llvm::BitVector &RHS)
Implement std::swap in terms of BitVector swap.
@ CLUSTER_WORKGROUP_MAX_ID_X
@ CLUSTER_WORKGROUP_MAX_ID_Z
@ CLUSTER_WORKGROUP_MAX_FLAT_ID
@ CLUSTER_WORKGROUP_MAX_ID_Y
static constexpr uint64_t encode(Fields... Values)
MIMGBaseOpcode BaseOpcode
This struct is a compact representation of a valid (non-zero power of two) alignment.
constexpr uint64_t value() const
This is a hole in the type system and should not be abused.
MCRegister getRegister() const
static ArgDescriptor createRegister(Register Reg, unsigned Mask=~0u)
DenormalModeKind Input
Denormal treatment kind for floating point instruction inputs in the default floating-point environme...
@ PreserveSign
The sign of a flushed-to-zero number is preserved in the sign of 0.
@ Dynamic
Denormals have unknown treatment.
static constexpr DenormalMode getPreserveSign()
static constexpr DenormalMode getIEEE()
bool isZero() const
Returns true if value is all zero.
The LegalityQuery object bundles together all the information that's needed to decide whether a given...
ArrayRef< MemDesc > MMODescrs
Operations which require memory can use this to place requirements on the memory type for each MMO.
This class contains a discriminated union of information about pointers in memory operands,...
MachinePointerInfo getWithOffset(int64_t O) const
static LLVM_ABI MachinePointerInfo getGOT(MachineFunction &MF)
Return a MachinePointerInfo record that refers to a GOT entry.
DenormalMode FP64FP16Denormals
If this is set, neither input nor output denormals are flushed for both f64 and f16/v2f16 instructions...
bool IEEE
Floating point opcodes that support exception flag gathering quiet and propagate signaling NaN inputs...
DenormalMode FP32Denormals
If this is set, neither input nor output denormals are flushed for most f32 instructions.